Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
 help / color / mirror / Atom feed
* [FFmpeg-devel] [PATCH v4 1/2] checkasm: add hevc_deblock chroma test
@ 2023-03-29 14:13 J. Dekker
  2023-03-29 14:13 ` [FFmpeg-devel] [PATCH v4 2/2] lavc/aarch64: add hevc deblock chroma 8-12bit J. Dekker
  2023-03-29 20:13 ` [FFmpeg-devel] [PATCH v4 1/2] checkasm: add hevc_deblock chroma test Martin Storsjö
  0 siblings, 2 replies; 6+ messages in thread
From: J. Dekker @ 2023-03-29 14:13 UTC (permalink / raw)
  To: ffmpeg-devel

Signed-off-by: J. Dekker <jdek@itanimul.li>
---

 Will support other variants in the second version of these tests.

 tests/checkasm/Makefile       |   2 +-
 tests/checkasm/checkasm.c     |   1 +
 tests/checkasm/checkasm.h     |   1 +
 tests/checkasm/hevc_deblock.c | 100 ++++++++++++++++++++++++++++++++++
 tests/fate/checkasm.mak       |   1 +
 5 files changed, 104 insertions(+), 1 deletion(-)
 create mode 100644 tests/checkasm/hevc_deblock.c

diff --git a/tests/checkasm/Makefile b/tests/checkasm/Makefile
index a6f06c7007..3e62a22bd6 100644
--- a/tests/checkasm/Makefile
+++ b/tests/checkasm/Makefile
@@ -28,7 +28,7 @@ AVCODECOBJS-$(CONFIG_HUFFYUV_DECODER)   += huffyuvdsp.o
 AVCODECOBJS-$(CONFIG_JPEG2000_DECODER)  += jpeg2000dsp.o
 AVCODECOBJS-$(CONFIG_OPUS_DECODER)      += opusdsp.o
 AVCODECOBJS-$(CONFIG_PIXBLOCKDSP)       += pixblockdsp.o
-AVCODECOBJS-$(CONFIG_HEVC_DECODER)      += hevc_add_res.o hevc_idct.o hevc_sao.o hevc_pel.o
+AVCODECOBJS-$(CONFIG_HEVC_DECODER)      += hevc_add_res.o hevc_deblock.o hevc_idct.o hevc_sao.o hevc_pel.o
 AVCODECOBJS-$(CONFIG_UTVIDEO_DECODER)   += utvideodsp.o
 AVCODECOBJS-$(CONFIG_V210_DECODER)      += v210dec.o
 AVCODECOBJS-$(CONFIG_V210_ENCODER)      += v210enc.o
diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c
index e96d84a7da..c2184d260d 100644
--- a/tests/checkasm/checkasm.c
+++ b/tests/checkasm/checkasm.c
@@ -116,6 +116,7 @@ static const struct {
     #endif
     #if CONFIG_HEVC_DECODER
         { "hevc_add_res", checkasm_check_hevc_add_res },
+        { "hevc_deblock", checkasm_check_hevc_deblock },
         { "hevc_idct", checkasm_check_hevc_idct },
         { "hevc_pel", checkasm_check_hevc_pel },
         { "hevc_sao", checkasm_check_hevc_sao },
diff --git a/tests/checkasm/checkasm.h b/tests/checkasm/checkasm.h
index 8744a81218..89c643e6a0 100644
--- a/tests/checkasm/checkasm.h
+++ b/tests/checkasm/checkasm.h
@@ -60,6 +60,7 @@ void checkasm_check_h264dsp(void);
 void checkasm_check_h264pred(void);
 void checkasm_check_h264qpel(void);
 void checkasm_check_hevc_add_res(void);
+void checkasm_check_hevc_deblock(void);
 void checkasm_check_hevc_idct(void);
 void checkasm_check_hevc_pel(void);
 void checkasm_check_hevc_sao(void);
diff --git a/tests/checkasm/hevc_deblock.c b/tests/checkasm/hevc_deblock.c
new file mode 100644
index 0000000000..f73e68e8a6
--- /dev/null
+++ b/tests/checkasm/hevc_deblock.c
@@ -0,0 +1,100 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include <string.h>
+
+#include "libavutil/intreadwrite.h"
+#include "libavutil/mem_internal.h"
+
+#include "libavcodec/avcodec.h"
+#include "libavcodec/hevcdsp.h"
+
+#include "checkasm.h"
+
+static const uint32_t pixel_mask[3] = { 0xffffffff, 0x03ff03ff, 0x0fff0fff };
+
+#define SIZEOF_PIXEL ((bit_depth + 7) / 8)
+#define BUF_STRIDE (8 * 2)
+#define BUF_LINES (8)
+#define BUF_OFFSET (BUF_STRIDE * BUF_LINES)
+#define BUF_SIZE (BUF_STRIDE * BUF_LINES + BUF_OFFSET * 2)
+
+#define randomize_buffers(buf0, buf1, size)                 \
+    do {                                                    \
+        uint32_t mask = pixel_mask[(bit_depth - 8) >> 1];   \
+        int k;                                              \
+        for (k = 0; k < size; k += 4) {                     \
+            uint32_t r = rnd() & mask;                      \
+            AV_WN32A(buf0 + k, r);                          \
+            AV_WN32A(buf1 + k, r);                          \
+        }                                                   \
+    } while (0)
+
+static void check_deblock_chroma(HEVCDSPContext h, int bit_depth)
+{
+    int32_t tc[2] = { 0, 0 };
+    // no_p, no_q can only be { 0,0 } for assembly functions, see deblocking_filter_CTB() in hevc_filter.c
+    uint8_t no_p[2] = { 0, 0 };
+    uint8_t no_q[2] = { 0, 0 };
+    LOCAL_ALIGNED_32(uint8_t, buf0, [BUF_SIZE]);
+    LOCAL_ALIGNED_32(uint8_t, buf1, [BUF_SIZE]);
+
+    declare_func_emms(AV_CPU_FLAG_MMX, void, uint8_t *pix, ptrdiff_t stride, int32_t *tc, uint8_t *no_p, uint8_t *no_q);
+
+    if (check_func(h.hevc_h_loop_filter_chroma, "hevc_h_loop_filter_chroma%d", bit_depth)) {
+        randomize_buffers(buf0, buf1, BUF_SIZE);
+        for (int i = 0; i < 4; i++) {
+            // see betatable[] in hevc_filter.c
+            tc[0] = (rnd() & 63) + (rnd() & 1);
+            tc[1] = (rnd() & 63) + (rnd() & 1);
+
+            call_ref(buf0 + BUF_OFFSET, BUF_STRIDE, tc, no_p, no_q);
+            call_new(buf1 + BUF_OFFSET, BUF_STRIDE, tc, no_p, no_q);
+            if (memcmp(buf0, buf1, BUF_SIZE))
+                fail();
+        }
+        bench_new(buf1 + BUF_OFFSET, BUF_STRIDE, tc, no_p, no_q);
+    }
+
+    if (check_func(h.hevc_v_loop_filter_chroma, "hevc_v_loop_filter_chroma%d", bit_depth)) {
+        randomize_buffers(buf0, buf1, BUF_SIZE);
+        for (int i = 0; i < 4; i++) {
+            // see betatable[] in hevc_filter.c
+            tc[0] = (rnd() & 63) + (rnd() & 1);
+            tc[1] = (rnd() & 63) + (rnd() & 1);
+
+            call_ref(buf0 + BUF_OFFSET, BUF_STRIDE, tc, no_p, no_q);
+            call_new(buf1 + BUF_OFFSET, BUF_STRIDE, tc, no_p, no_q);
+            if (memcmp(buf0, buf1, BUF_SIZE))
+                fail();
+        }
+        bench_new(buf1 + BUF_OFFSET, BUF_STRIDE, tc, no_p, no_q);
+    }
+}
+
+void checkasm_check_hevc_deblock(void)
+{
+    int bit_depth;
+
+    for (bit_depth = 8; bit_depth <= 12; bit_depth += 2) {
+        HEVCDSPContext h;
+        ff_hevc_dsp_init(&h, bit_depth);
+        check_deblock_chroma(h, bit_depth);
+    }
+    report("chroma");
+}
diff --git a/tests/fate/checkasm.mak b/tests/fate/checkasm.mak
index a4e95541f5..faac764388 100644
--- a/tests/fate/checkasm.mak
+++ b/tests/fate/checkasm.mak
@@ -15,6 +15,7 @@ FATE_CHECKASM = fate-checkasm-aacpsdsp                                  \
                 fate-checkasm-h264pred                                  \
                 fate-checkasm-h264qpel                                  \
                 fate-checkasm-hevc_add_res                              \
+                fate-checkasm-hevc_deblock                              \
                 fate-checkasm-hevc_idct                                 \
                 fate-checkasm-hevc_pel                                  \
                 fate-checkasm-hevc_sao                                  \
-- 
2.37.1 (Apple Git-137.1)

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 6+ messages in thread

* [FFmpeg-devel] [PATCH v4 2/2] lavc/aarch64: add hevc deblock chroma 8-12bit
  2023-03-29 14:13 [FFmpeg-devel] [PATCH v4 1/2] checkasm: add hevc_deblock chroma test J. Dekker
@ 2023-03-29 14:13 ` J. Dekker
  2023-03-29 20:29   ` Martin Storsjö
  2023-03-29 20:13 ` [FFmpeg-devel] [PATCH v4 1/2] checkasm: add hevc_deblock chroma test Martin Storsjö
  1 sibling, 1 reply; 6+ messages in thread
From: J. Dekker @ 2023-03-29 14:13 UTC (permalink / raw)
  To: ffmpeg-devel

Benched on Ampere Altra:

hevc_h_loop_filter_chroma8_c: 367.7
hevc_h_loop_filter_chroma8_neon: 31.0
hevc_h_loop_filter_chroma10_c: 396.7
hevc_h_loop_filter_chroma10_neon: 27.5
hevc_h_loop_filter_chroma12_c: 377.0
hevc_h_loop_filter_chroma12_neon: 31.7
hevc_v_loop_filter_chroma8_c: 369.0
hevc_v_loop_filter_chroma8_neon: 55.0
hevc_v_loop_filter_chroma10_c: 389.0
hevc_v_loop_filter_chroma10_neon: 54.0
hevc_v_loop_filter_chroma12_c: 389.5
hevc_v_loop_filter_chroma12_neon: 53.0

Signed-off-by: J. Dekker <jdek@itanimul.li>
---

 Included Martin's comments, decent speedup on vertical filter (~50%).

 libavcodec/aarch64/Makefile               |   3 +-
 libavcodec/aarch64/hevcdsp_deblock_neon.S | 180 ++++++++++++++++++++++
 libavcodec/aarch64/hevcdsp_init_aarch64.c |  18 +++
 3 files changed, 200 insertions(+), 1 deletion(-)
 create mode 100644 libavcodec/aarch64/hevcdsp_deblock_neon.S

diff --git a/libavcodec/aarch64/Makefile b/libavcodec/aarch64/Makefile
index 02fb51c3ab..216191640c 100644
--- a/libavcodec/aarch64/Makefile
+++ b/libavcodec/aarch64/Makefile
@@ -65,7 +65,8 @@ NEON-OBJS-$(CONFIG_VP9_DECODER)         += aarch64/vp9itxfm_16bpp_neon.o       \
                                            aarch64/vp9lpf_neon.o               \
                                            aarch64/vp9mc_16bpp_neon.o          \
                                            aarch64/vp9mc_neon.o
-NEON-OBJS-$(CONFIG_HEVC_DECODER)        += aarch64/hevcdsp_idct_neon.o         \
+NEON-OBJS-$(CONFIG_HEVC_DECODER)        += aarch64/hevcdsp_deblock_neon.o      \
+                                           aarch64/hevcdsp_idct_neon.o         \
                                            aarch64/hevcdsp_init_aarch64.o      \
                                            aarch64/hevcdsp_qpel_neon.o         \
                                            aarch64/hevcdsp_sao_neon.o
diff --git a/libavcodec/aarch64/hevcdsp_deblock_neon.S b/libavcodec/aarch64/hevcdsp_deblock_neon.S
new file mode 100644
index 0000000000..ed342e5ded
--- /dev/null
+++ b/libavcodec/aarch64/hevcdsp_deblock_neon.S
@@ -0,0 +1,180 @@
+/* -*-arm64-*-
+ * vim: syntax=arm64asm
+ *
+ * Copyright (c) 2014 Seppo Tomperi <seppo.tomperi@vtt.fi>
+ * Copyright (c) 2023 J. Dekker <jdek@itanimul.li>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+
+#include "libavutil/aarch64/asm.S"
+#include "neon.S"
+
+.macro hevc_loop_filter_chroma_start bitdepth
+        mov             x4, x30
+        ldr             w14, [x2]
+        ldr             w15, [x2, #4]
+.if \bitdepth > 8
+        lsl             w14, w14, #(\bitdepth - 8)
+        lsl             w15, w15, #(\bitdepth - 8)
+.endif
+        adds            w2, w14, w15
+        b.eq            1f
+        dup             v16.4h, w14
+        dup             v17.4h, w15
+        trn1            v16.2d, v16.2d, v17.2d
+.if \bitdepth > 8
+        mvni            v19.8h, #((0xff << (\bitdepth - 8)) & 0xff), lsl #8
+        movi            v18.8h, #0
+.endif
+        neg             v17.8h, v16.8h
+.endm
+
+.macro hevc_loop_filter_chroma_body bitdepth
+.if \bitdepth <= 8
+        uxtl            v20.8h, v0.8b // p1
+        uxtl            v1.8h,  v1.8b // p0
+        uxtl            v2.8h,  v2.8b // q0
+        uxtl            v23.8h, v3.8b // q1
+        va              .req v20
+        vb              .req v23
+.else   // required to specify both cases as we are unable to do: v0 .req v20
+        va              .req v0
+        vb              .req v3
+.endif
+        sub             v5.8h, v2.8h, v1.8h // q0 - p0
+        sub             v6.8h, va.8h, vb.8h // p1 - q1
+        shl             v5.8h, v5.8h, #2
+        add             v5.8h, v6.8h, v5.8h
+        srshr           v5.8h, v5.8h, #3
+        clip            v17.8h, v16.8h, v5.8h
+        sqadd           v1.8h, v1.8h, v5.8h // p0 + delta
+        sqsub           v2.8h, v2.8h, v5.8h // q0 - delta
+.if \bitdepth <= 8
+        sqxtun          v1.8b, v1.8h
+        sqxtun          v2.8b, v2.8h
+.unreq  va
+.unreq  vb
+.else
+        clip            v18.8h, v19.8h, v1.8h, v2.8h
+.endif
+.endm
+
+function hevc_loop_filter_chroma_body_8_neon, export=0
+        hevc_loop_filter_chroma_body 8
+        ret
+endfunc
+
+function hevc_loop_filter_chroma_body_10_neon, export=0
+hevc_loop_filter_chroma_body_12_neon:
+        hevc_loop_filter_chroma_body 10
+        ret
+endfunc
+
+// void ff_hevc_h_loop_filter_chroma_8_neon(uint8_t *_pix, ptrdiff_t _stride, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
+
+.macro hevc_h_loop_filter_chroma bitdepth
+function ff_hevc_h_loop_filter_chroma_\bitdepth\()_neon, export=1
+        hevc_loop_filter_chroma_start \bitdepth
+        sub             x0, x0, x1, lsl #1
+.if \bitdepth > 8
+        ld1             {v0.8h}, [x0], x1
+        ld1             {v1.8h}, [x0], x1
+        ld1             {v2.8h}, [x0], x1
+        ld1             {v3.8h}, [x0]
+.else
+        ld1             {v0.8b}, [x0], x1
+        ld1             {v1.8b}, [x0], x1
+        ld1             {v2.8b}, [x0], x1
+        ld1             {v3.8b}, [x0]
+.endif
+        sub             x0, x0, x1, lsl #1
+        bl              hevc_loop_filter_chroma_body_\bitdepth\()_neon
+.if \bitdepth > 8
+        st1             {v1.8h}, [x0], x1
+        st1             {v2.8h}, [x0]
+.else
+        st1             {v1.8b}, [x0], x1
+        st1             {v2.8b}, [x0]
+.endif
+1:      ret             x4
+endfunc
+.endm
+
+.macro hevc_v_loop_filter_chroma bitdepth
+function ff_hevc_v_loop_filter_chroma_\bitdepth\()_neon, export=1
+        hevc_loop_filter_chroma_start \bitdepth
+        sub             x0, x0, #((0x5200 >> \bitdepth) & 0x6) // high -> 4, low -> 2
+        add             x3, x0, x1
+        lsl             x1, x1, #1
+.if \bitdepth > 8
+        ld1             {v0.d}[0], [x0], x1
+        ld1             {v1.d}[0], [x3], x1
+        ld1             {v2.d}[0], [x0], x1
+        ld1             {v3.d}[0], [x3], x1
+        ld1             {v0.d}[1], [x0], x1
+        ld1             {v1.d}[1], [x3], x1
+        ld1             {v2.d}[1], [x0], x1
+        ld1             {v3.d}[1], [x3], x1
+        transpose_4x8H  v0, v1, v2, v3, v28, v29, v30, v31
+.else
+        ld1             {v0.s}[0], [x0], x1
+        ld1             {v1.s}[0], [x3], x1
+        ld1             {v2.s}[0], [x0], x1
+        ld1             {v3.s}[0], [x3], x1
+        ld1             {v0.s}[1], [x0], x1
+        ld1             {v1.s}[1], [x3], x1
+        ld1             {v2.s}[1], [x0], x1
+        ld1             {v3.s}[1], [x3], x1
+        transpose_4x8B  v0, v1, v2, v3, v28, v29, v30, v31
+.endif
+        sub             x0, x0, x1, lsl #2
+        sub             x3, x3, x1, lsl #2
+        bl              hevc_loop_filter_chroma_body_\bitdepth\()_neon
+.if \bitdepth > 8
+        transpose_4x8H  v0, v1, v2, v3, v28, v29, v30, v31
+        st1             {v0.d}[0], [x0], x1
+        st1             {v1.d}[0], [x3], x1
+        st1             {v2.d}[0], [x0], x1
+        st1             {v3.d}[0], [x3], x1
+        st1             {v0.d}[1], [x0], x1
+        st1             {v1.d}[1], [x3], x1
+        st1             {v2.d}[1], [x0], x1
+        st1             {v3.d}[1], [x3]
+.else
+        transpose_4x8B  v0, v1, v2, v3, v28, v29, v30, v31
+        st1             {v0.s}[0], [x0], x1
+        st1             {v1.s}[0], [x3], x1
+        st1             {v2.s}[0], [x0], x1
+        st1             {v3.s}[0], [x3], x1
+        st1             {v0.s}[1], [x0], x1
+        st1             {v1.s}[1], [x3], x1
+        st1             {v2.s}[1], [x0], x1
+        st1             {v3.s}[1], [x3]
+.endif
+1:      ret             x4
+endfunc
+.endm
+
+hevc_h_loop_filter_chroma 8
+hevc_h_loop_filter_chroma 10
+hevc_h_loop_filter_chroma 12
+
+hevc_v_loop_filter_chroma 8
+hevc_v_loop_filter_chroma 10
+hevc_v_loop_filter_chroma 12
diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c
index 1deefca0a2..a923bae35c 100644
--- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -25,6 +25,18 @@
 #include "libavutil/aarch64/cpu.h"
 #include "libavcodec/hevcdsp.h"
 
+void ff_hevc_v_loop_filter_chroma_8_neon(uint8_t *_pix, ptrdiff_t _stride,
+                                         const int *_tc, const uint8_t *_no_p, const uint8_t *_no_q);
+void ff_hevc_v_loop_filter_chroma_10_neon(uint8_t *_pix, ptrdiff_t _stride,
+                                          const int *_tc, const uint8_t *_no_p, const uint8_t *_no_q);
+void ff_hevc_v_loop_filter_chroma_12_neon(uint8_t *_pix, ptrdiff_t _stride,
+                                          const int *_tc, const uint8_t *_no_p, const uint8_t *_no_q);
+void ff_hevc_h_loop_filter_chroma_8_neon(uint8_t *_pix, ptrdiff_t _stride,
+                                         const int *_tc, const uint8_t *_no_p, const uint8_t *_no_q);
+void ff_hevc_h_loop_filter_chroma_10_neon(uint8_t *_pix, ptrdiff_t _stride,
+                                          const int *_tc, const uint8_t *_no_p, const uint8_t *_no_q);
+void ff_hevc_h_loop_filter_chroma_12_neon(uint8_t *_pix, ptrdiff_t _stride,
+                                          const int *_tc, const uint8_t *_no_p, const uint8_t *_no_q);
 void ff_hevc_add_residual_4x4_8_neon(uint8_t *_dst, const int16_t *coeffs,
                                      ptrdiff_t stride);
 void ff_hevc_add_residual_4x4_10_neon(uint8_t *_dst, const int16_t *coeffs,
@@ -117,6 +129,8 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
     if (!have_neon(av_get_cpu_flags())) return;
 
     if (bit_depth == 8) {
+        c->hevc_h_loop_filter_chroma   = ff_hevc_h_loop_filter_chroma_8_neon;
+        c->hevc_v_loop_filter_chroma   = ff_hevc_v_loop_filter_chroma_8_neon;
         c->add_residual[0]             = ff_hevc_add_residual_4x4_8_neon;
         c->add_residual[1]             = ff_hevc_add_residual_8x8_8_neon;
         c->add_residual[2]             = ff_hevc_add_residual_16x16_8_neon;
@@ -167,6 +181,8 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
         c->put_hevc_qpel_bi[9][0][1]   = ff_hevc_put_hevc_qpel_bi_h16_8_neon;
     }
     if (bit_depth == 10) {
+        c->hevc_h_loop_filter_chroma   = ff_hevc_h_loop_filter_chroma_10_neon;
+        c->hevc_v_loop_filter_chroma   = ff_hevc_v_loop_filter_chroma_10_neon;
         c->add_residual[0]             = ff_hevc_add_residual_4x4_10_neon;
         c->add_residual[1]             = ff_hevc_add_residual_8x8_10_neon;
         c->add_residual[2]             = ff_hevc_add_residual_16x16_10_neon;
@@ -180,6 +196,8 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
         c->idct_dc[3]                  = ff_hevc_idct_32x32_dc_10_neon;
     }
     if (bit_depth == 12) {
+        c->hevc_h_loop_filter_chroma   = ff_hevc_h_loop_filter_chroma_12_neon;
+        c->hevc_v_loop_filter_chroma   = ff_hevc_v_loop_filter_chroma_12_neon;
         c->add_residual[0]             = ff_hevc_add_residual_4x4_12_neon;
         c->add_residual[1]             = ff_hevc_add_residual_8x8_12_neon;
         c->add_residual[2]             = ff_hevc_add_residual_16x16_12_neon;
-- 
2.37.1 (Apple Git-137.1)

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [FFmpeg-devel] [PATCH v4 1/2] checkasm: add hevc_deblock chroma test
  2023-03-29 14:13 [FFmpeg-devel] [PATCH v4 1/2] checkasm: add hevc_deblock chroma test J. Dekker
  2023-03-29 14:13 ` [FFmpeg-devel] [PATCH v4 2/2] lavc/aarch64: add hevc deblock chroma 8-12bit J. Dekker
@ 2023-03-29 20:13 ` Martin Storsjö
  2023-04-06  5:02   ` J. Dekker
  1 sibling, 1 reply; 6+ messages in thread
From: Martin Storsjö @ 2023-03-29 20:13 UTC (permalink / raw)
  To: FFmpeg development discussions and patches

On Wed, 29 Mar 2023, J. Dekker wrote:

> Signed-off-by: J. Dekker <jdek@itanimul.li>
> ---
>
> Will support other variants in the second version of these tests.
>
> tests/checkasm/Makefile       |   2 +-
> tests/checkasm/checkasm.c     |   1 +
> tests/checkasm/checkasm.h     |   1 +
> tests/checkasm/hevc_deblock.c | 100 ++++++++++++++++++++++++++++++++++
> tests/fate/checkasm.mak       |   1 +
> 5 files changed, 104 insertions(+), 1 deletion(-)
> create mode 100644 tests/checkasm/hevc_deblock.c
>
> diff --git a/tests/checkasm/Makefile b/tests/checkasm/Makefile
> index a6f06c7007..3e62a22bd6 100644
> --- a/tests/checkasm/Makefile
> +++ b/tests/checkasm/Makefile
> @@ -28,7 +28,7 @@ AVCODECOBJS-$(CONFIG_HUFFYUV_DECODER)   += huffyuvdsp.o
> AVCODECOBJS-$(CONFIG_JPEG2000_DECODER)  += jpeg2000dsp.o
> AVCODECOBJS-$(CONFIG_OPUS_DECODER)      += opusdsp.o
> AVCODECOBJS-$(CONFIG_PIXBLOCKDSP)       += pixblockdsp.o
> -AVCODECOBJS-$(CONFIG_HEVC_DECODER)      += hevc_add_res.o hevc_idct.o hevc_sao.o hevc_pel.o
> +AVCODECOBJS-$(CONFIG_HEVC_DECODER)      += hevc_add_res.o hevc_deblock.o hevc_idct.o hevc_sao.o hevc_pel.o
> AVCODECOBJS-$(CONFIG_UTVIDEO_DECODER)   += utvideodsp.o
> AVCODECOBJS-$(CONFIG_V210_DECODER)      += v210dec.o
> AVCODECOBJS-$(CONFIG_V210_ENCODER)      += v210enc.o
> diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c
> index e96d84a7da..c2184d260d 100644
> --- a/tests/checkasm/checkasm.c
> +++ b/tests/checkasm/checkasm.c
> @@ -116,6 +116,7 @@ static const struct {
>     #endif
>     #if CONFIG_HEVC_DECODER
>         { "hevc_add_res", checkasm_check_hevc_add_res },
> +        { "hevc_deblock", checkasm_check_hevc_deblock },
>         { "hevc_idct", checkasm_check_hevc_idct },
>         { "hevc_pel", checkasm_check_hevc_pel },
>         { "hevc_sao", checkasm_check_hevc_sao },
> diff --git a/tests/checkasm/checkasm.h b/tests/checkasm/checkasm.h
> index 8744a81218..89c643e6a0 100644
> --- a/tests/checkasm/checkasm.h
> +++ b/tests/checkasm/checkasm.h
> @@ -60,6 +60,7 @@ void checkasm_check_h264dsp(void);
> void checkasm_check_h264pred(void);
> void checkasm_check_h264qpel(void);
> void checkasm_check_hevc_add_res(void);
> +void checkasm_check_hevc_deblock(void);
> void checkasm_check_hevc_idct(void);
> void checkasm_check_hevc_pel(void);
> void checkasm_check_hevc_sao(void);
> diff --git a/tests/checkasm/hevc_deblock.c b/tests/checkasm/hevc_deblock.c
> new file mode 100644
> index 0000000000..f73e68e8a6
> --- /dev/null
> +++ b/tests/checkasm/hevc_deblock.c
> @@ -0,0 +1,100 @@
> +/*
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License along
> + * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
> + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
> + */
> +
> +#include <string.h>
> +
> +#include "libavutil/intreadwrite.h"
> +#include "libavutil/mem_internal.h"
> +
> +#include "libavcodec/avcodec.h"
> +#include "libavcodec/hevcdsp.h"
> +
> +#include "checkasm.h"
> +
> +static const uint32_t pixel_mask[3] = { 0xffffffff, 0x03ff03ff, 0x0fff0fff };
> +
> +#define SIZEOF_PIXEL ((bit_depth + 7) / 8)
> +#define BUF_STRIDE (8 * 2)
> +#define BUF_LINES (8)
> +#define BUF_OFFSET (BUF_STRIDE * BUF_LINES)
> +#define BUF_SIZE (BUF_STRIDE * BUF_LINES + BUF_OFFSET * 2)
> +
> +#define randomize_buffers(buf0, buf1, size)                 \
> +    do {                                                    \
> +        uint32_t mask = pixel_mask[(bit_depth - 8) >> 1];   \
> +        int k;                                              \
> +        for (k = 0; k < size; k += 4) {                     \
> +            uint32_t r = rnd() & mask;                      \
> +            AV_WN32A(buf0 + k, r);                          \
> +            AV_WN32A(buf1 + k, r);                          \
> +        }                                                   \
> +    } while (0)
> +
> +static void check_deblock_chroma(HEVCDSPContext h, int bit_depth)
> +{
> +    int32_t tc[2] = { 0, 0 };
> +    // no_p, no_q can only be { 0,0 } for assembly functions, see deblocking_filter_CTB() in hevc_filter.c

It's not strictly about assembly functions; there are just two separate 
function pointers, hevc_v_loop_filter_chroma and 
hevc_v_loop_filter_chroma_c - you can implement both in assembly. I guess 
the intent of the _c variant originally was that it wasn't the assembly 
version for whatever assembly implementation there was at the time, but we 
can support both in assembly too.

(As a later step, we can probably template this test somehow to test both 
of them.)

> +    uint8_t no_p[2] = { 0, 0 };
> +    uint8_t no_q[2] = { 0, 0 };
> +    LOCAL_ALIGNED_32(uint8_t, buf0, [BUF_SIZE]);
> +    LOCAL_ALIGNED_32(uint8_t, buf1, [BUF_SIZE]);
> +
> +    declare_func_emms(AV_CPU_FLAG_MMX, void, uint8_t *pix, ptrdiff_t stride, int32_t *tc, uint8_t *no_p, uint8_t *no_q);
> +
> +    if (check_func(h.hevc_h_loop_filter_chroma, "hevc_h_loop_filter_chroma%d", bit_depth)) {
> +        randomize_buffers(buf0, buf1, BUF_SIZE);
> +        for (int i = 0; i < 4; i++) {
> +            // see betatable[] in hevc_filter.c
> +            tc[0] = (rnd() & 63) + (rnd() & 1);
> +            tc[1] = (rnd() & 63) + (rnd() & 1);
> +
> +            call_ref(buf0 + BUF_OFFSET, BUF_STRIDE, tc, no_p, no_q);
> +            call_new(buf1 + BUF_OFFSET, BUF_STRIDE, tc, no_p, no_q);
> +            if (memcmp(buf0, buf1, BUF_SIZE))
> +                fail();
> +        }
> +        bench_new(buf1 + BUF_OFFSET, BUF_STRIDE, tc, no_p, no_q);
> +    }
> +
> +    if (check_func(h.hevc_v_loop_filter_chroma, "hevc_v_loop_filter_chroma%d", bit_depth)) {
> +        randomize_buffers(buf0, buf1, BUF_SIZE);
> +        for (int i = 0; i < 4; i++) {
> +            // see betatable[] in hevc_filter.c
> +            tc[0] = (rnd() & 63) + (rnd() & 1);
> +            tc[1] = (rnd() & 63) + (rnd() & 1);

I wonder if it'd be better to test with a freshly randomized buffer 
instead of iterating over the same data multiple times? Worst case, the 
deblocked area converges to such small differences that the last few 
iterations don't make any difference.

I.e., I'd suggest moving the randomize_buffers call into the loop here.


Other than that, this patch LGTM.

// Martin

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [FFmpeg-devel] [PATCH v4 2/2] lavc/aarch64: add hevc deblock chroma 8-12bit
  2023-03-29 14:13 ` [FFmpeg-devel] [PATCH v4 2/2] lavc/aarch64: add hevc deblock chroma 8-12bit J. Dekker
@ 2023-03-29 20:29   ` Martin Storsjö
  2023-04-06  5:00     ` J. Dekker
  0 siblings, 1 reply; 6+ messages in thread
From: Martin Storsjö @ 2023-03-29 20:29 UTC (permalink / raw)
  To: FFmpeg development discussions and patches

On Wed, 29 Mar 2023, J. Dekker wrote:

> Benched on Ampere Altra:
>
> hevc_h_loop_filter_chroma8_c: 367.7
> hevc_h_loop_filter_chroma8_neon: 31.0
> hevc_h_loop_filter_chroma10_c: 396.7
> hevc_h_loop_filter_chroma10_neon: 27.5
> hevc_h_loop_filter_chroma12_c: 377.0
> hevc_h_loop_filter_chroma12_neon: 31.7
> hevc_v_loop_filter_chroma8_c: 369.0
> hevc_v_loop_filter_chroma8_neon: 55.0
> hevc_v_loop_filter_chroma10_c: 389.0
> hevc_v_loop_filter_chroma10_neon: 54.0
> hevc_v_loop_filter_chroma12_c: 389.5
> hevc_v_loop_filter_chroma12_neon: 53.0
>
> Signed-off-by: J. Dekker <jdek@itanimul.li>
> ---
>
> Included Martin's comments, decent speedup on vertical filter (~50%).
>
> libavcodec/aarch64/Makefile               |   3 +-
> libavcodec/aarch64/hevcdsp_deblock_neon.S | 180 ++++++++++++++++++++++
> libavcodec/aarch64/hevcdsp_init_aarch64.c |  18 +++
> 3 files changed, 200 insertions(+), 1 deletion(-)
> create mode 100644 libavcodec/aarch64/hevcdsp_deblock_neon.S
>
> diff --git a/libavcodec/aarch64/Makefile b/libavcodec/aarch64/Makefile
> index 02fb51c3ab..216191640c 100644
> --- a/libavcodec/aarch64/Makefile
> +++ b/libavcodec/aarch64/Makefile
> @@ -65,7 +65,8 @@ NEON-OBJS-$(CONFIG_VP9_DECODER)         += aarch64/vp9itxfm_16bpp_neon.o       \
>                                            aarch64/vp9lpf_neon.o               \
>                                            aarch64/vp9mc_16bpp_neon.o          \
>                                            aarch64/vp9mc_neon.o
> -NEON-OBJS-$(CONFIG_HEVC_DECODER)        += aarch64/hevcdsp_idct_neon.o         \
> +NEON-OBJS-$(CONFIG_HEVC_DECODER)        += aarch64/hevcdsp_deblock_neon.o      \
> +                                           aarch64/hevcdsp_idct_neon.o         \
>                                            aarch64/hevcdsp_init_aarch64.o      \
>                                            aarch64/hevcdsp_qpel_neon.o         \
>                                            aarch64/hevcdsp_sao_neon.o
> diff --git a/libavcodec/aarch64/hevcdsp_deblock_neon.S b/libavcodec/aarch64/hevcdsp_deblock_neon.S
> new file mode 100644
> index 0000000000..ed342e5ded
> --- /dev/null
> +++ b/libavcodec/aarch64/hevcdsp_deblock_neon.S
> @@ -0,0 +1,180 @@
> +/* -*-arm64-*-
> + * vim: syntax=arm64asm
> + *
> + * Copyright (c) 2014 Seppo Tomperi <seppo.tomperi@vtt.fi>
> + * Copyright (c) 2023 J. Dekker <jdek@itanimul.li>
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +
> +#include "libavutil/aarch64/asm.S"
> +#include "neon.S"
> +
> +.macro hevc_loop_filter_chroma_start bitdepth
> +        mov             x4, x30
> +        ldr             w14, [x2]
> +        ldr             w15, [x2, #4]
> +.if \bitdepth > 8
> +        lsl             w14, w14, #(\bitdepth - 8)
> +        lsl             w15, w15, #(\bitdepth - 8)
> +.endif
> +        adds            w2, w14, w15
> +        b.eq            1f
> +        dup             v16.4h, w14
> +        dup             v17.4h, w15
> +        trn1            v16.2d, v16.2d, v17.2d
> +.if \bitdepth > 8
> +        mvni            v19.8h, #((0xff << (\bitdepth - 8)) & 0xff), lsl #8
> +        movi            v18.8h, #0
> +.endif
> +        neg             v17.8h, v16.8h
> +.endm
> +
> +.macro hevc_loop_filter_chroma_body bitdepth
> +.if \bitdepth <= 8
> +        uxtl            v20.8h, v0.8b // p1
> +        uxtl            v1.8h,  v1.8b // p0
> +        uxtl            v2.8h,  v2.8b // q0
> +        uxtl            v23.8h, v3.8b // q1
> +        va              .req v20
> +        vb              .req v23
> +.else   // required to specify both cases as we are unable to do: v0 .req v20
> +        va              .req v0
> +        vb              .req v3
> +.endif
> +        sub             v5.8h, v2.8h, v1.8h // q0 - p0
> +        sub             v6.8h, va.8h, vb.8h // p1 - q1
> +        shl             v5.8h, v5.8h, #2
> +        add             v5.8h, v6.8h, v5.8h
> +        srshr           v5.8h, v5.8h, #3
> +        clip            v17.8h, v16.8h, v5.8h
> +        sqadd           v1.8h, v1.8h, v5.8h // p0 + delta
> +        sqsub           v2.8h, v2.8h, v5.8h // q0 - delta
> +.if \bitdepth <= 8
> +        sqxtun          v1.8b, v1.8h
> +        sqxtun          v2.8b, v2.8h
> +.unreq  va
> +.unreq  vb

Shouldn't the .unreq be outside of the .if/.else?

> +.else
> +        clip            v18.8h, v19.8h, v1.8h, v2.8h
> +.endif
> +.endm
> +
> +function hevc_loop_filter_chroma_body_8_neon, export=0
> +        hevc_loop_filter_chroma_body 8
> +        ret
> +endfunc
> +
> +function hevc_loop_filter_chroma_body_10_neon, export=0
> +hevc_loop_filter_chroma_body_12_neon:
> +        hevc_loop_filter_chroma_body 10
> +        ret
> +endfunc
> +
> +// void ff_hevc_h_loop_filter_chroma_8_neon(uint8_t *_pix, ptrdiff_t _stride, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
> +
> +.macro hevc_h_loop_filter_chroma bitdepth
> +function ff_hevc_h_loop_filter_chroma_\bitdepth\()_neon, export=1
> +        hevc_loop_filter_chroma_start \bitdepth
> +        sub             x0, x0, x1, lsl #1
> +.if \bitdepth > 8
> +        ld1             {v0.8h}, [x0], x1
> +        ld1             {v1.8h}, [x0], x1
> +        ld1             {v2.8h}, [x0], x1
> +        ld1             {v3.8h}, [x0]
> +.else
> +        ld1             {v0.8b}, [x0], x1
> +        ld1             {v1.8b}, [x0], x1
> +        ld1             {v2.8b}, [x0], x1
> +        ld1             {v3.8b}, [x0]
> +.endif
> +        sub             x0, x0, x1, lsl #1
> +        bl              hevc_loop_filter_chroma_body_\bitdepth\()_neon
> +.if \bitdepth > 8
> +        st1             {v1.8h}, [x0], x1
> +        st1             {v2.8h}, [x0]
> +.else
> +        st1             {v1.8b}, [x0], x1
> +        st1             {v2.8b}, [x0]
> +.endif
> +1:      ret             x4
> +endfunc
> +.endm
> +
> +.macro hevc_v_loop_filter_chroma bitdepth
> +function ff_hevc_v_loop_filter_chroma_\bitdepth\()_neon, export=1
> +        hevc_loop_filter_chroma_start \bitdepth
> +        sub             x0, x0, #((0x5200 >> \bitdepth) & 0x6) // high -> 4, low -> 2

TBH, I think this is rather obfuscated - I'd prefer to just move the sub
(and the two instructions in between) back inside of the .if/.else, to
have the sub instruction say more explicitly exactly what it does.

Other than that, this patch LGTM now.

// Martin

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [FFmpeg-devel] [PATCH v4 2/2] lavc/aarch64: add hevc deblock chroma 8-12bit
  2023-03-29 20:29   ` Martin Storsjö
@ 2023-04-06  5:00     ` J. Dekker
  0 siblings, 0 replies; 6+ messages in thread
From: J. Dekker @ 2023-04-06  5:00 UTC (permalink / raw)
  To: FFmpeg development discussions and patches

On Wed, Mar 29, 2023 at 11:29:09PM +0300, Martin Storsjö wrote:
> On Wed, 29 Mar 2023, J. Dekker wrote:
> 
> > Benched on Ampere Altra:
> > 
> > hevc_h_loop_filter_chroma8_c: 367.7
> > hevc_h_loop_filter_chroma8_neon: 31.0
> > hevc_h_loop_filter_chroma10_c: 396.7
> > hevc_h_loop_filter_chroma10_neon: 27.5
> > hevc_h_loop_filter_chroma12_c: 377.0
> > hevc_h_loop_filter_chroma12_neon: 31.7
> > hevc_v_loop_filter_chroma8_c: 369.0
> > hevc_v_loop_filter_chroma8_neon: 55.0
> > hevc_v_loop_filter_chroma10_c: 389.0
> > hevc_v_loop_filter_chroma10_neon: 54.0
> > hevc_v_loop_filter_chroma12_c: 389.5
> > hevc_v_loop_filter_chroma12_neon: 53.0
> > 
> > Signed-off-by: J. Dekker <jdek@itanimul.li>
> > ---
> > 
> > Included Martin's comments, decent speedup on vertical filter (~50%).
> > 
> > libavcodec/aarch64/Makefile               |   3 +-
> > libavcodec/aarch64/hevcdsp_deblock_neon.S | 180 ++++++++++++++++++++++
> > libavcodec/aarch64/hevcdsp_init_aarch64.c |  18 +++
> > 3 files changed, 200 insertions(+), 1 deletion(-)
> > create mode 100644 libavcodec/aarch64/hevcdsp_deblock_neon.S
> > 
> > diff --git a/libavcodec/aarch64/Makefile b/libavcodec/aarch64/Makefile
> > index 02fb51c3ab..216191640c 100644
> > --- a/libavcodec/aarch64/Makefile
> > +++ b/libavcodec/aarch64/Makefile
> > @@ -65,7 +65,8 @@ NEON-OBJS-$(CONFIG_VP9_DECODER)         += aarch64/vp9itxfm_16bpp_neon.o       \
> >                                            aarch64/vp9lpf_neon.o               \
> >                                            aarch64/vp9mc_16bpp_neon.o          \
> >                                            aarch64/vp9mc_neon.o
> > -NEON-OBJS-$(CONFIG_HEVC_DECODER)        += aarch64/hevcdsp_idct_neon.o         \
> > +NEON-OBJS-$(CONFIG_HEVC_DECODER)        += aarch64/hevcdsp_deblock_neon.o      \
> > +                                           aarch64/hevcdsp_idct_neon.o         \
> >                                            aarch64/hevcdsp_init_aarch64.o      \
> >                                            aarch64/hevcdsp_qpel_neon.o         \
> >                                            aarch64/hevcdsp_sao_neon.o
> > diff --git a/libavcodec/aarch64/hevcdsp_deblock_neon.S b/libavcodec/aarch64/hevcdsp_deblock_neon.S
> > new file mode 100644
> > index 0000000000..ed342e5ded
> > --- /dev/null
> > +++ b/libavcodec/aarch64/hevcdsp_deblock_neon.S
> > @@ -0,0 +1,180 @@
> > +/* -*-arm64-*-
> > + * vim: syntax=arm64asm
> > + *
> > + * Copyright (c) 2014 Seppo Tomperi <seppo.tomperi@vtt.fi>
> > + * Copyright (c) 2023 J. Dekker <jdek@itanimul.li>
> > + *
> > + * This file is part of FFmpeg.
> > + *
> > + * FFmpeg is free software; you can redistribute it and/or
> > + * modify it under the terms of the GNU Lesser General Public
> > + * License as published by the Free Software Foundation; either
> > + * version 2.1 of the License, or (at your option) any later version.
> > + *
> > + * FFmpeg is distributed in the hope that it will be useful,
> > + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > + * Lesser General Public License for more details.
> > + *
> > + * You should have received a copy of the GNU Lesser General Public
> > + * License along with FFmpeg; if not, write to the Free Software
> > + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> > + */
> > +
> > +
> > +#include "libavutil/aarch64/asm.S"
> > +#include "neon.S"
> > +
> > +.macro hevc_loop_filter_chroma_start bitdepth
> > +        mov             x4, x30
> > +        ldr             w14, [x2]
> > +        ldr             w15, [x2, #4]
> > +.if \bitdepth > 8
> > +        lsl             w14, w14, #(\bitdepth - 8)
> > +        lsl             w15, w15, #(\bitdepth - 8)
> > +.endif
> > +        adds            w2, w14, w15
> > +        b.eq            1f
> > +        dup             v16.4h, w14
> > +        dup             v17.4h, w15
> > +        trn1            v16.2d, v16.2d, v17.2d
> > +.if \bitdepth > 8
> > +        mvni            v19.8h, #((0xff << (\bitdepth - 8)) & 0xff), lsl #8
> > +        movi            v18.8h, #0
> > +.endif
> > +        neg             v17.8h, v16.8h
> > +.endm
> > +
> > +.macro hevc_loop_filter_chroma_body bitdepth
> > +.if \bitdepth <= 8
> > +        uxtl            v20.8h, v0.8b // p1
> > +        uxtl            v1.8h,  v1.8b // p0
> > +        uxtl            v2.8h,  v2.8b // q0
> > +        uxtl            v23.8h, v3.8b // q1
> > +        va              .req v20
> > +        vb              .req v23
> > +.else   // required to specify both cases as we are unable to do: v0 .req v20
> > +        va              .req v0
> > +        vb              .req v3
> > +.endif
> > +        sub             v5.8h, v2.8h, v1.8h // q0 - p0
> > +        sub             v6.8h, va.8h, vb.8h // p1 - q1
> > +        shl             v5.8h, v5.8h, #2
> > +        add             v5.8h, v6.8h, v5.8h
> > +        srshr           v5.8h, v5.8h, #3
> > +        clip            v17.8h, v16.8h, v5.8h
> > +        sqadd           v1.8h, v1.8h, v5.8h // p0 + delta
> > +        sqsub           v2.8h, v2.8h, v5.8h // q0 - delta
> > +.if \bitdepth <= 8
> > +        sqxtun          v1.8b, v1.8h
> > +        sqxtun          v2.8b, v2.8h
> > +.unreq  va
> > +.unreq  vb
> 
> Shouldn't the .unreq be outside of the .if/.else?
> 
> > +.else
> > +        clip            v18.8h, v19.8h, v1.8h, v2.8h
> > +.endif
> > +.endm
> > +
> > +function hevc_loop_filter_chroma_body_8_neon, export=0
> > +        hevc_loop_filter_chroma_body 8
> > +        ret
> > +endfunc
> > +
> > +function hevc_loop_filter_chroma_body_10_neon, export=0
> > +hevc_loop_filter_chroma_body_12_neon:
> > +        hevc_loop_filter_chroma_body 10
> > +        ret
> > +endfunc
> > +
> > +// void ff_hevc_h_loop_filter_chroma_8_neon(uint8_t *_pix, ptrdiff_t _stride, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
> > +
> > +.macro hevc_h_loop_filter_chroma bitdepth
> > +function ff_hevc_h_loop_filter_chroma_\bitdepth\()_neon, export=1
> > +        hevc_loop_filter_chroma_start \bitdepth
> > +        sub             x0, x0, x1, lsl #1
> > +.if \bitdepth > 8
> > +        ld1             {v0.8h}, [x0], x1
> > +        ld1             {v1.8h}, [x0], x1
> > +        ld1             {v2.8h}, [x0], x1
> > +        ld1             {v3.8h}, [x0]
> > +.else
> > +        ld1             {v0.8b}, [x0], x1
> > +        ld1             {v1.8b}, [x0], x1
> > +        ld1             {v2.8b}, [x0], x1
> > +        ld1             {v3.8b}, [x0]
> > +.endif
> > +        sub             x0, x0, x1, lsl #1
> > +        bl              hevc_loop_filter_chroma_body_\bitdepth\()_neon
> > +.if \bitdepth > 8
> > +        st1             {v1.8h}, [x0], x1
> > +        st1             {v2.8h}, [x0]
> > +.else
> > +        st1             {v1.8b}, [x0], x1
> > +        st1             {v2.8b}, [x0]
> > +.endif
> > +1:      ret             x4
> > +endfunc
> > +.endm
> > +
> > +.macro hevc_v_loop_filter_chroma bitdepth
> > +function ff_hevc_v_loop_filter_chroma_\bitdepth\()_neon, export=1
> > +        hevc_loop_filter_chroma_start \bitdepth
> > +        sub             x0, x0, #((0x5200 >> \bitdepth) & 0x6) // high -> 4, low -> 2
> 
> TBH, I think this is rather obfuscated - I'd prefer to just move the sub
> (and the two instructions in between) back inside of the .if/.else, to
> have the sub instruction say more explicitly exactly what it does.
> 
> Other than that, this patch LGTM now.
> 

Thanks, pushed with changes.

-- 
jd
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [FFmpeg-devel] [PATCH v4 1/2] checkasm: add hevc_deblock chroma test
  2023-03-29 20:13 ` [FFmpeg-devel] [PATCH v4 1/2] checkasm: add hevc_deblock chroma test Martin Storsjö
@ 2023-04-06  5:02   ` J. Dekker
  0 siblings, 0 replies; 6+ messages in thread
From: J. Dekker @ 2023-04-06  5:02 UTC (permalink / raw)
  To: FFmpeg development discussions and patches

On Wed, Mar 29, 2023 at 11:13:15PM +0300, Martin Storsjö wrote:
> On Wed, 29 Mar 2023, J. Dekker wrote:
> 
> > Signed-off-by: J. Dekker <jdek@itanimul.li>
> > ---
> > 
> > Will support other variants in the second version of these tests.
> > 
> > tests/checkasm/Makefile       |   2 +-
> > tests/checkasm/checkasm.c     |   1 +
> > tests/checkasm/checkasm.h     |   1 +
> > tests/checkasm/hevc_deblock.c | 100 ++++++++++++++++++++++++++++++++++
> > tests/fate/checkasm.mak       |   1 +
> > 5 files changed, 104 insertions(+), 1 deletion(-)
> > create mode 100644 tests/checkasm/hevc_deblock.c
> > 
> [...]
> 
> Other than that, this patch LGTM.
> 
> // Martin

Pushed with changes, thanks for reviews

-- 
jd
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2023-04-06  5:02 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-03-29 14:13 [FFmpeg-devel] [PATCH v4 1/2] checkasm: add hevc_deblock chroma test J. Dekker
2023-03-29 14:13 ` [FFmpeg-devel] [PATCH v4 2/2] lavc/aarch64: add hevc deblock chroma 8-12bit J. Dekker
2023-03-29 20:29   ` Martin Storsjö
2023-04-06  5:00     ` J. Dekker
2023-03-29 20:13 ` [FFmpeg-devel] [PATCH v4 1/2] checkasm: add hevc_deblock chroma test Martin Storsjö
2023-04-06  5:02   ` J. Dekker

Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

This inbox may be cloned and mirrored by anyone:

	git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
		ffmpegdev@gitmailbox.com
	public-inbox-index ffmpegdev

Example config snippet for mirrors.


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git