* [FFmpeg-devel] [PATCH 1/2] checkasm: add hevc_deblock chroma test
@ 2023-03-21 18:35 J. Dekker
2023-03-21 18:35 ` [FFmpeg-devel] [PATCH v2 2/2] lavc/aarch64: add hevc deblock chroma 8-12bit J. Dekker
0 siblings, 1 reply; 9+ messages in thread
From: J. Dekker @ 2023-03-21 18:35 UTC (permalink / raw)
To: ffmpeg-devel
Signed-off-by: J. Dekker <jdek@itanimul.li>
---
This covers only the chroma part; I am writing the luma part alongside the NEON asm.
tests/checkasm/Makefile | 2 +-
tests/checkasm/checkasm.c | 1 +
tests/checkasm/checkasm.h | 1 +
tests/checkasm/hevc_deblock.c | 87 +++++++++++++++++++++++++++++++++++
4 files changed, 90 insertions(+), 1 deletion(-)
create mode 100644 tests/checkasm/hevc_deblock.c
diff --git a/tests/checkasm/Makefile b/tests/checkasm/Makefile
index a6f06c7007..3e62a22bd6 100644
--- a/tests/checkasm/Makefile
+++ b/tests/checkasm/Makefile
@@ -28,7 +28,7 @@ AVCODECOBJS-$(CONFIG_HUFFYUV_DECODER) += huffyuvdsp.o
AVCODECOBJS-$(CONFIG_JPEG2000_DECODER) += jpeg2000dsp.o
AVCODECOBJS-$(CONFIG_OPUS_DECODER) += opusdsp.o
AVCODECOBJS-$(CONFIG_PIXBLOCKDSP) += pixblockdsp.o
-AVCODECOBJS-$(CONFIG_HEVC_DECODER) += hevc_add_res.o hevc_idct.o hevc_sao.o hevc_pel.o
+AVCODECOBJS-$(CONFIG_HEVC_DECODER) += hevc_add_res.o hevc_deblock.o hevc_idct.o hevc_sao.o hevc_pel.o
AVCODECOBJS-$(CONFIG_UTVIDEO_DECODER) += utvideodsp.o
AVCODECOBJS-$(CONFIG_V210_DECODER) += v210dec.o
AVCODECOBJS-$(CONFIG_V210_ENCODER) += v210enc.o
diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c
index e96d84a7da..c2184d260d 100644
--- a/tests/checkasm/checkasm.c
+++ b/tests/checkasm/checkasm.c
@@ -116,6 +116,7 @@ static const struct {
#endif
#if CONFIG_HEVC_DECODER
{ "hevc_add_res", checkasm_check_hevc_add_res },
+ { "hevc_deblock", checkasm_check_hevc_deblock },
{ "hevc_idct", checkasm_check_hevc_idct },
{ "hevc_pel", checkasm_check_hevc_pel },
{ "hevc_sao", checkasm_check_hevc_sao },
diff --git a/tests/checkasm/checkasm.h b/tests/checkasm/checkasm.h
index 8744a81218..89c643e6a0 100644
--- a/tests/checkasm/checkasm.h
+++ b/tests/checkasm/checkasm.h
@@ -60,6 +60,7 @@ void checkasm_check_h264dsp(void);
void checkasm_check_h264pred(void);
void checkasm_check_h264qpel(void);
void checkasm_check_hevc_add_res(void);
+void checkasm_check_hevc_deblock(void);
void checkasm_check_hevc_idct(void);
void checkasm_check_hevc_pel(void);
void checkasm_check_hevc_sao(void);
diff --git a/tests/checkasm/hevc_deblock.c b/tests/checkasm/hevc_deblock.c
new file mode 100644
index 0000000000..06c9c1969e
--- /dev/null
+++ b/tests/checkasm/hevc_deblock.c
@@ -0,0 +1,87 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include <string.h>
+
+#include "libavutil/intreadwrite.h"
+#include "libavutil/mem_internal.h"
+
+#include "libavcodec/avcodec.h"
+#include "libavcodec/hevcdsp.h"
+
+#include "checkasm.h"
+
+static const uint32_t pixel_mask[3] = { 0xffffffff, 0x03ff03ff, 0x0fff0fff }; // indexed by (bit_depth - 8) >> 1; entries 1 and 2 clear the top 6 resp. 4 bits of each 16-bit half
+
+#define SIZEOF_PIXEL ((bit_depth + 7) / 8) // NOTE(review): not referenced anywhere else in this file
+#define BUF_STRIDE (8 * 2) // 16 bytes per row
+#define BUF_LINES (8)
+#define BUF_OFFSET (BUF_STRIDE * BUF_LINES) // 128: offset added to the buffer start to get the pix pointer passed to the filters
+#define BUF_SIZE (BUF_STRIDE * BUF_LINES + BUF_OFFSET * 2) // 384 bytes in total
+
+#define randomize_buffers(buf0, buf1, size) \
+ do { \
+ uint32_t mask = pixel_mask[(bit_depth - 8) >> 1]; /* relies on a bit_depth variable in the calling scope */ \
+ int k; \
+ for (k = 0; k < size; k += 4) { /* one 32-bit word per step */ \
+ uint32_t r = rnd() & mask; \
+ AV_WN32A(buf0 + k, r); /* the same word goes to both buffers */ \
+ AV_WN32A(buf1 + k, r); \
+ } \
+ } while (0)
+
+static void check_deblock_chroma(HEVCDSPContext h, int bit_depth)
+{
+ int32_t tc[2] = { 1, 1 }; // NOTE(review): tc is constant, so only a single clamp value is ever tested
+ uint8_t no_p[2] = { 0, 0 }; // NOTE(review): no_p/no_q are always zero, so non-zero flags are never tested
+ uint8_t no_q[2] = { 0, 0 };
+ LOCAL_ALIGNED_32(uint8_t, buf0, [BUF_SIZE]); // buffer handed to call_ref
+ LOCAL_ALIGNED_32(uint8_t, buf1, [BUF_SIZE]); // buffer handed to call_new and bench_new
+
+ declare_func_emms(AV_CPU_FLAG_MMX, void, uint8_t *pix, ptrdiff_t stride, int32_t *tc, uint8_t *no_p, uint8_t *no_q);
+
+ randomize_buffers(buf0, buf1, BUF_SIZE); // both buffers start with identical random contents
+ if (check_func(h.hevc_h_loop_filter_chroma, "hevc_h_loop_filter_chroma_%d", bit_depth)) {
+ call_ref(buf0 + BUF_OFFSET, BUF_STRIDE, tc, no_p, no_q);
+ call_new(buf1 + BUF_OFFSET, BUF_STRIDE, tc, no_p, no_q);
+ if (memcmp(buf0, buf1, BUF_SIZE)) // whole buffer is compared, including the bytes before and after the filtered rows
+ fail();
+ bench_new(buf1 + BUF_OFFSET, BUF_STRIDE, tc, no_p, no_q);
+ }
+
+ randomize_buffers(buf0, buf1, BUF_SIZE); // fresh identical contents for the vertical-edge variant
+ if (check_func(h.hevc_v_loop_filter_chroma, "hevc_v_loop_filter_chroma_%d", bit_depth)) {
+ call_ref(buf0 + BUF_OFFSET, BUF_STRIDE, tc, no_p, no_q);
+ call_new(buf1 + BUF_OFFSET, BUF_STRIDE, tc, no_p, no_q);
+ if (memcmp(buf0, buf1, BUF_SIZE)) // any byte that differs between the two buffers fails the test
+ fail();
+ bench_new(buf1 + BUF_OFFSET, BUF_STRIDE, tc, no_p, no_q);
+ }
+}
+
+void checkasm_check_hevc_deblock(void)
+{
+ int bit_depth;
+
+ for (bit_depth = 8; bit_depth <= 12; bit_depth += 2) { // runs for 8, 10 and 12 bit
+ HEVCDSPContext h;
+ ff_hevc_dsp_init(&h, bit_depth); // set up the DSP context for this depth
+ check_deblock_chroma(h, bit_depth); // the context is passed by value
+ }
+ report("chroma"); // single report issued after all three depths have run
+}
--
2.37.1 (Apple Git-137.1)
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 9+ messages in thread
* [FFmpeg-devel] [PATCH v2 2/2] lavc/aarch64: add hevc deblock chroma 8-12bit
2023-03-21 18:35 [FFmpeg-devel] [PATCH 1/2] checkasm: add hevc_deblock chroma test J. Dekker
@ 2023-03-21 18:35 ` J. Dekker
2023-03-21 20:30 ` Martin Storsjö
0 siblings, 1 reply; 9+ messages in thread
From: J. Dekker @ 2023-03-21 18:35 UTC (permalink / raw)
To: ffmpeg-devel
Signed-off-by: J. Dekker <jdek@itanimul.li>
---
Implements comments from Martin, split body into its own function,
interleaved stores, however `.elif' is not valid on macOS so left
that out.
libavcodec/aarch64/Makefile | 3 +-
libavcodec/aarch64/hevcdsp_deblock_neon.S | 187 ++++++++++++++++++++++
libavcodec/aarch64/hevcdsp_init_aarch64.c | 18 +++
3 files changed, 207 insertions(+), 1 deletion(-)
create mode 100644 libavcodec/aarch64/hevcdsp_deblock_neon.S
diff --git a/libavcodec/aarch64/Makefile b/libavcodec/aarch64/Makefile
index 02fb51c3ab..216191640c 100644
--- a/libavcodec/aarch64/Makefile
+++ b/libavcodec/aarch64/Makefile
@@ -65,7 +65,8 @@ NEON-OBJS-$(CONFIG_VP9_DECODER) += aarch64/vp9itxfm_16bpp_neon.o \
aarch64/vp9lpf_neon.o \
aarch64/vp9mc_16bpp_neon.o \
aarch64/vp9mc_neon.o
-NEON-OBJS-$(CONFIG_HEVC_DECODER) += aarch64/hevcdsp_idct_neon.o \
+NEON-OBJS-$(CONFIG_HEVC_DECODER) += aarch64/hevcdsp_deblock_neon.o \
+ aarch64/hevcdsp_idct_neon.o \
aarch64/hevcdsp_init_aarch64.o \
aarch64/hevcdsp_qpel_neon.o \
aarch64/hevcdsp_sao_neon.o
diff --git a/libavcodec/aarch64/hevcdsp_deblock_neon.S b/libavcodec/aarch64/hevcdsp_deblock_neon.S
new file mode 100644
index 0000000000..0f0e829fb5
--- /dev/null
+++ b/libavcodec/aarch64/hevcdsp_deblock_neon.S
@@ -0,0 +1,187 @@
+/* -*-arm64-*-
+ * vim: syntax=arm64asm
+ *
+ * Copyright (c) 2014 Seppo Tomperi <seppo.tomperi@vtt.fi>
+ * Copyright (c) 2023 J. Dekker <jdek@itanimul.li>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+
+#include "libavutil/aarch64/asm.S"
+#include "neon.S"
+
+.macro hevc_loop_filter_chroma_start bitdepth
+ mov x4, x30 // keep the return address in x4, the entry points return with ret x4. NOTE(review): x4 is the 5th argument (_no_q) per the prototype comment in this file and is overwritten unread - confirm
+ ldr w14, [x2] // tc[0]
+ ldr w15, [x2, #4] // tc[1]
+.if \bitdepth > 8
+ lsl w14, w14, #(\bitdepth - 8) // scale tc by 1 << (bitdepth - 8)
+ lsl w15, w15, #(\bitdepth - 8)
+.endif
+ adds w2, w14, w15
+ b.eq 1f // tc[0] + tc[1] == 0: branch to the 1: label that the including function must provide
+ dup v16.4h, w14
+ dup v17.4h, w15
+ trn1 v16.2d, v16.2d, v17.2d // v16 = tc[0] in lanes 0-3, tc[1] in lanes 4-7
+.if \bitdepth == 10
+ mvni v19.8h, #0xFC, lsl #8 // movi #0x03FF, upper clamp bound for 10 bit
+.endif
+.if \bitdepth == 12
+ mvni v19.8h, #0xF0, lsl #8 // movi #0x0FFF, upper clamp bound for 12 bit
+.endif
+.if \bitdepth > 8
+ movi v18.8h, #0 // lower clamp bound
+.endif
+ neg v17.8h, v16.8h // v17 = -tc
+.endm
+
+.macro hevc_loop_filter_chroma_body bitdepth
+.if \bitdepth <= 8
+ uxtl v0.8h, v0.8b // p1, zero-extended to 16 bit
+ uxtl v1.8h, v1.8b // p0
+ uxtl v2.8h, v2.8b // q0
+ uxtl v3.8h, v3.8b // q1
+.endif
+ sub v5.8h, v2.8h, v1.8h // q0 - p0
+ sub v6.8h, v0.8h, v3.8h // p1 - q1
+ shl v5.8h, v5.8h, #2 // 4 * (q0 - p0)
+ add v5.8h, v6.8h, v5.8h // 4 * (q0 - p0) + p1 - q1
+ srshr v5.8h, v5.8h, #3 // rounding shift right by 3
+ smin v5.8h, v5.8h, v16.8h // delta = min(delta, tc)
+ smax v5.8h, v5.8h, v17.8h // delta = max(delta, -tc)
+ sqadd v1.8h, v1.8h, v5.8h // p0 + delta
+ sqsub v2.8h, v2.8h, v5.8h // q0 - delta
+.if \bitdepth <= 8
+ sqxtun v1.8b, v1.8h // saturating narrow to unsigned 8 bit; v0 and v3 stay widened
+ sqxtun v2.8b, v2.8h
+.else
+ smin v1.8h, v1.8h, v19.8h // cap at v19 (set by the start macro)
+ smin v2.8h, v2.8h, v19.8h
+ smax v1.8h, v1.8h, v18.8h // floor at v18 (zero)
+ smax v2.8h, v2.8h, v18.8h
+.endif
+.endm
+
+function hevc_loop_filter_chroma_body_8_neon, export=0
+ hevc_loop_filter_chroma_body 8 // 8-bit filter core, reached via bl from the h and v entry points
+ ret
+endfunc
+
+function hevc_loop_filter_chroma_body_10_neon, export=0
+hevc_loop_filter_chroma_body_12_neon: // 12-bit alias of the same code; the depth-specific tc shift and v19 bound come from the start macro
+ hevc_loop_filter_chroma_body 10 // every bitdepth > 8 expands to the same instructions
+ ret
+endfunc
+
+// void ff_hevc_h_loop_filter_chroma_8_neon(uint8_t *_pix, ptrdiff_t _stride, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
+
+.macro hevc_h_loop_filter_chroma bitdepth
+function ff_hevc_h_loop_filter_chroma_\bitdepth\()_neon, export=1
+ hevc_loop_filter_chroma_start \bitdepth
+ sub x0, x0, x1, lsl #1 // x0 = pix - 2 * stride
+.if \bitdepth > 8
+ ld1 {v0.8h}, [x0], x1 // row at pix - 2 * stride (p1 in the body)
+ ld1 {v1.8h}, [x0], x1 // p0
+ ld1 {v2.8h}, [x0], x1 // q0
+ ld1 {v3.8h}, [x0] // q1
+.else
+ ld1 {v0.8b}, [x0], x1 // same four rows, 8 bytes each
+ ld1 {v1.8b}, [x0], x1
+ ld1 {v2.8b}, [x0], x1
+ ld1 {v3.8b}, [x0]
+.endif
+ sub x0, x0, x1, lsl #1 // x0 is at pix + stride after the loads; step back to pix - stride
+ bl hevc_loop_filter_chroma_body_\bitdepth\()_neon // bl overwrites x30, hence the copy kept in x4
+.if \bitdepth > 8
+ st1 {v1.8h}, [x0], x1 // store filtered p0 row
+ st1 {v2.8h}, [x0] // store filtered q0 row
+.else
+ st1 {v1.8b}, [x0], x1 // store filtered p0 row
+ st1 {v2.8b}, [x0] // store filtered q0 row
+.endif
+1: ret x4 // also the early-exit target of the start macro
+endfunc
+.endm
+
+.macro hevc_v_loop_filter_chroma bitdepth
+function ff_hevc_v_loop_filter_chroma_\bitdepth\()_neon, export=1
+ hevc_loop_filter_chroma_start \bitdepth
+.if \bitdepth > 8
+ sub x0, x0, #8 // start 8 bytes (four 16-bit pixels) before pix
+ add x3, x0, x1 // x3 = next row. NOTE(review): x3 is the 4th argument (_no_p) per the prototype comment and is overwritten unread - confirm
+ lsl x1, x1, #1 // x1 = 2 * stride; x0 walks even rows, x3 odd rows
+ ld1 {v20.8h}, [x0], x1 // row 0
+ ld1 {v21.8h}, [x3], x1 // row 1
+ ld1 {v0.8h}, [x0], x1 // row 2
+ ld1 {v1.8h}, [x3], x1 // row 3
+ ld1 {v2.8h}, [x0], x1 // row 4
+ ld1 {v3.8h}, [x3], x1 // row 5
+ ld1 {v22.8h}, [x0], x1 // row 6
+ ld1 {v23.8h}, [x3], x1 // row 7
+ transpose_8x8H v20, v21, v0, v1, v2, v3, v22, v23, v24, v25 // afterwards v0..v3 are what the body treats as p1, p0, q0, q1
+.else
+ sub x0, x0, #4 // start 4 bytes (four 8-bit pixels) before pix
+ add x3, x0, x1 // x3 = next row (same overwrite of the _no_p argument as above)
+ lsl x1, x1, #1 // x1 = 2 * stride
+ ld1 {v20.8b}, [x0], x1 // rows 0..7, alternating between x0 and x3
+ ld1 {v21.8b}, [x3], x1
+ ld1 {v0.8b}, [x0], x1
+ ld1 {v1.8b}, [x3], x1
+ ld1 {v2.8b}, [x0], x1
+ ld1 {v3.8b}, [x3], x1
+ ld1 {v22.8b}, [x0], x1
+ ld1 {v23.8b}, [x3], x1
+ transpose_8x8B v20, v21, v0, v1, v2, v3, v22, v23, v24, v25 // v24 and v25 are passed as scratch registers
+.endif
+ sub x0, x0, x1, lsl #2 // each pointer advanced by 4 * x1 during the loads; rewind to rows 0 and 1
+ sub x3, x3, x1, lsl #2
+ bl hevc_loop_filter_chroma_body_\bitdepth\()_neon // bl overwrites x30, hence the copy kept in x4
+.if \bitdepth > 8
+ transpose_8x8H v20, v21, v0, v1, v2, v3, v22, v23, v24, v25 // transpose back to row order
+ st1 {v20.8h}, [x0], x1 // write all 8 rows back in load order
+ st1 {v21.8h}, [x3], x1
+ st1 {v0.8h}, [x0], x1
+ st1 {v1.8h}, [x3], x1
+ st1 {v2.8h}, [x0], x1
+ st1 {v3.8h}, [x3], x1
+ st1 {v22.8h}, [x0], x1
+ st1 {v23.8h}, [x3]
+.else
+ xtn v0.8b, v0.8h // restore: the body widened v0 and v3 but only narrowed v1 and v2
+ xtn v3.8b, v3.8h
+ transpose_8x8B v20, v21, v0, v1, v2, v3, v22, v23, v24, v25 // transpose back to row order
+ st1 {v20.8b}, [x0], x1 // write all 8 rows back in load order
+ st1 {v21.8b}, [x3], x1
+ st1 {v0.8b}, [x0], x1
+ st1 {v1.8b}, [x3], x1
+ st1 {v2.8b}, [x0], x1
+ st1 {v3.8b}, [x3], x1
+ st1 {v22.8b}, [x0], x1
+ st1 {v23.8b}, [x3]
+.endif
+1: ret x4 // also the early-exit target of the start macro
+endfunc
+.endm
+
+hevc_h_loop_filter_chroma 8
+hevc_h_loop_filter_chroma 10
+hevc_h_loop_filter_chroma 12
+
+hevc_v_loop_filter_chroma 8
+hevc_v_loop_filter_chroma 10
+hevc_v_loop_filter_chroma 12
diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c
index 1deefca0a2..a923bae35c 100644
--- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -25,6 +25,18 @@
#include "libavutil/aarch64/cpu.h"
#include "libavcodec/hevcdsp.h"
+void ff_hevc_v_loop_filter_chroma_8_neon(uint8_t *_pix, ptrdiff_t _stride,
+ const int *_tc, const uint8_t *_no_p, const uint8_t *_no_q);
+void ff_hevc_v_loop_filter_chroma_10_neon(uint8_t *_pix, ptrdiff_t _stride,
+ const int *_tc, const uint8_t *_no_p, const uint8_t *_no_q);
+void ff_hevc_v_loop_filter_chroma_12_neon(uint8_t *_pix, ptrdiff_t _stride,
+ const int *_tc, const uint8_t *_no_p, const uint8_t *_no_q);
+void ff_hevc_h_loop_filter_chroma_8_neon(uint8_t *_pix, ptrdiff_t _stride,
+ const int *_tc, const uint8_t *_no_p, const uint8_t *_no_q);
+void ff_hevc_h_loop_filter_chroma_10_neon(uint8_t *_pix, ptrdiff_t _stride,
+ const int *_tc, const uint8_t *_no_p, const uint8_t *_no_q);
+void ff_hevc_h_loop_filter_chroma_12_neon(uint8_t *_pix, ptrdiff_t _stride,
+ const int *_tc, const uint8_t *_no_p, const uint8_t *_no_q);
void ff_hevc_add_residual_4x4_8_neon(uint8_t *_dst, const int16_t *coeffs,
ptrdiff_t stride);
void ff_hevc_add_residual_4x4_10_neon(uint8_t *_dst, const int16_t *coeffs,
@@ -117,6 +129,8 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
if (!have_neon(av_get_cpu_flags())) return;
if (bit_depth == 8) {
+ c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_8_neon;
+ c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_8_neon;
c->add_residual[0] = ff_hevc_add_residual_4x4_8_neon;
c->add_residual[1] = ff_hevc_add_residual_8x8_8_neon;
c->add_residual[2] = ff_hevc_add_residual_16x16_8_neon;
@@ -167,6 +181,8 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
c->put_hevc_qpel_bi[9][0][1] = ff_hevc_put_hevc_qpel_bi_h16_8_neon;
}
if (bit_depth == 10) {
+ c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_10_neon;
+ c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_10_neon;
c->add_residual[0] = ff_hevc_add_residual_4x4_10_neon;
c->add_residual[1] = ff_hevc_add_residual_8x8_10_neon;
c->add_residual[2] = ff_hevc_add_residual_16x16_10_neon;
@@ -180,6 +196,8 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
c->idct_dc[3] = ff_hevc_idct_32x32_dc_10_neon;
}
if (bit_depth == 12) {
+ c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_12_neon;
+ c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_12_neon;
c->add_residual[0] = ff_hevc_add_residual_4x4_12_neon;
c->add_residual[1] = ff_hevc_add_residual_8x8_12_neon;
c->add_residual[2] = ff_hevc_add_residual_16x16_12_neon;
--
2.37.1 (Apple Git-137.1)
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [FFmpeg-devel] [PATCH v2 2/2] lavc/aarch64: add hevc deblock chroma 8-12bit
2023-03-21 18:35 ` [FFmpeg-devel] [PATCH v2 2/2] lavc/aarch64: add hevc deblock chroma 8-12bit J. Dekker
@ 2023-03-21 20:30 ` Martin Storsjö
2023-03-22 0:07 ` [FFmpeg-devel] [PATCH 1/3] lavc/aarch64: add clip N macro J. Dekker
0 siblings, 1 reply; 9+ messages in thread
From: Martin Storsjö @ 2023-03-21 20:30 UTC (permalink / raw)
To: FFmpeg development discussions and patches
On Tue, 21 Mar 2023, J. Dekker wrote:
> Signed-off-by: J. Dekker <jdek@itanimul.li>
> ---
>
> Implements comments from Martin, split body into its own function,
> interleaved stores, however `.elif' is not valid on macOS so left
> that out.
I haven't reviewed the patches themselves yet, but I noticed this
comment - there's no ".elif"; the directive is ".elseif".
// Martin
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 9+ messages in thread
* [FFmpeg-devel] [PATCH 1/3] lavc/aarch64: add clip N macro
2023-03-21 20:30 ` Martin Storsjö
@ 2023-03-22 0:07 ` J. Dekker
2023-03-22 0:07 ` [FFmpeg-devel] [PATCH v3 2/3] checkasm: add hevc_deblock chroma test J. Dekker
` (2 more replies)
0 siblings, 3 replies; 9+ messages in thread
From: J. Dekker @ 2023-03-22 0:07 UTC (permalink / raw)
To: ffmpeg-devel
Signed-off-by: J. Dekker <jdek@itanimul.li>
---
libavcodec/aarch64/hevcdsp_idct_neon.S | 19 +++++--------------
libavcodec/aarch64/neon.S | 11 +++++++++++
2 files changed, 16 insertions(+), 14 deletions(-)
diff --git a/libavcodec/aarch64/hevcdsp_idct_neon.S b/libavcodec/aarch64/hevcdsp_idct_neon.S
index 467cb0f48a..3e59dd20bb 100644
--- a/libavcodec/aarch64/hevcdsp_idct_neon.S
+++ b/libavcodec/aarch64/hevcdsp_idct_neon.S
@@ -5,7 +5,7 @@
*
* Ported from arm/hevcdsp_idct_neon.S by
* Copyright (c) 2020 Reimar Döffinger
- * Copyright (c) 2020 J. Dekker
+ * Copyright (c) 2023 J. Dekker <jdek@itanimul.li>
*
* This file is part of FFmpeg.
*
@@ -38,13 +38,6 @@ const trans, align=4
.short 31, 22, 13, 4
endconst
-.macro clip2 in1, in2, min, max
- smax \in1, \in1, \min
- smax \in2, \in2, \min
- smin \in1, \in1, \max
- smin \in2, \in2, \max
-.endm
-
function ff_hevc_add_residual_4x4_8_neon, export=1
ld1 {v0.8h-v1.8h}, [x1]
ld1 {v2.s}[0], [x0], x2
@@ -182,7 +175,7 @@ function hevc_add_residual_4x4_16_neon, export=0
ld1 {v3.d}[1], [x12], x2
movi v4.8h, #0
sqadd v1.8h, v1.8h, v3.8h
- clip2 v0.8h, v1.8h, v4.8h, v21.8h
+ clip v4.8h, v21.8h, v0.8h, v1.8h
st1 {v0.d}[0], [x0], x2
st1 {v0.d}[1], [x0], x2
st1 {v1.d}[0], [x0], x2
@@ -201,7 +194,7 @@ function hevc_add_residual_8x8_16_neon, export=0
sqadd v0.8h, v0.8h, v2.8h
ld1 {v3.8h}, [x12]
sqadd v1.8h, v1.8h, v3.8h
- clip2 v0.8h, v1.8h, v4.8h, v21.8h
+ clip v4.8h, v21.8h, v0.8h, v1.8h
st1 {v0.8h}, [x0], x2
st1 {v1.8h}, [x12], x2
bne 1b
@@ -221,8 +214,7 @@ function hevc_add_residual_16x16_16_neon, export=0
sqadd v1.8h, v1.8h, v17.8h
sqadd v2.8h, v2.8h, v18.8h
sqadd v3.8h, v3.8h, v19.8h
- clip2 v0.8h, v1.8h, v20.8h, v21.8h
- clip2 v2.8h, v3.8h, v20.8h, v21.8h
+ clip v20.8h, v21.8h, v0.8h, v1.8h, v2.8h, v3.8h
st1 {v0.8h-v1.8h}, [x0], x2
st1 {v2.8h-v3.8h}, [x12], x2
bne 1b
@@ -239,8 +231,7 @@ function hevc_add_residual_32x32_16_neon, export=0
sqadd v1.8h, v1.8h, v17.8h
sqadd v2.8h, v2.8h, v18.8h
sqadd v3.8h, v3.8h, v19.8h
- clip2 v0.8h, v1.8h, v20.8h, v21.8h
- clip2 v2.8h, v3.8h, v20.8h, v21.8h
+ clip v20.8h, v21.8h, v0.8h, v1.8h, v2.8h, v3.8h
st1 {v0.8h-v3.8h}, [x0], x2
bne 1b
ret
diff --git a/libavcodec/aarch64/neon.S b/libavcodec/aarch64/neon.S
index 1ad32c359d..bc105e4861 100644
--- a/libavcodec/aarch64/neon.S
+++ b/libavcodec/aarch64/neon.S
@@ -1,6 +1,8 @@
/*
* This file is part of FFmpeg.
*
+ * Copyright (c) 2023 J. Dekker <jdek@itanimul.li>
+ *
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
@@ -16,6 +18,15 @@
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
+.macro clip min, max, regs:vararg
+.irp x, \regs
+ smax \x, \x, \min // first pass: raise every listed register to at least \min
+.endr
+.irp x, \regs
+ smin \x, \x, \max // second pass: cap every listed register at \max
+.endr
+.endm
+
.macro transpose_8x8B r0, r1, r2, r3, r4, r5, r6, r7, r8, r9
trn1 \r8\().8B, \r0\().8B, \r1\().8B
trn2 \r9\().8B, \r0\().8B, \r1\().8B
--
2.39.2
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 9+ messages in thread
* [FFmpeg-devel] [PATCH v3 2/3] checkasm: add hevc_deblock chroma test
2023-03-22 0:07 ` [FFmpeg-devel] [PATCH 1/3] lavc/aarch64: add clip N macro J. Dekker
@ 2023-03-22 0:07 ` J. Dekker
2023-03-22 9:04 ` Martin Storsjö
2023-03-22 0:07 ` [FFmpeg-devel] [PATCH v3 3/3] lavc/aarch64: add hevc deblock chroma 8-12bit J. Dekker
2023-03-22 8:38 ` [FFmpeg-devel] [PATCH 1/3] lavc/aarch64: add clip N macro Martin Storsjö
2 siblings, 1 reply; 9+ messages in thread
From: J. Dekker @ 2023-03-22 0:07 UTC (permalink / raw)
To: ffmpeg-devel
Signed-off-by: J. Dekker <jdek@itanimul.li>
---
Added missing FATE target.
tests/checkasm/Makefile | 2 +-
tests/checkasm/checkasm.c | 1 +
tests/checkasm/checkasm.h | 1 +
tests/checkasm/hevc_deblock.c | 87 +++++++++++++++++++++++++++++++++++
tests/fate/checkasm.mak | 1 +
5 files changed, 91 insertions(+), 1 deletion(-)
create mode 100644 tests/checkasm/hevc_deblock.c
diff --git a/tests/checkasm/Makefile b/tests/checkasm/Makefile
index a6f06c7007..3e62a22bd6 100644
--- a/tests/checkasm/Makefile
+++ b/tests/checkasm/Makefile
@@ -28,7 +28,7 @@ AVCODECOBJS-$(CONFIG_HUFFYUV_DECODER) += huffyuvdsp.o
AVCODECOBJS-$(CONFIG_JPEG2000_DECODER) += jpeg2000dsp.o
AVCODECOBJS-$(CONFIG_OPUS_DECODER) += opusdsp.o
AVCODECOBJS-$(CONFIG_PIXBLOCKDSP) += pixblockdsp.o
-AVCODECOBJS-$(CONFIG_HEVC_DECODER) += hevc_add_res.o hevc_idct.o hevc_sao.o hevc_pel.o
+AVCODECOBJS-$(CONFIG_HEVC_DECODER) += hevc_add_res.o hevc_deblock.o hevc_idct.o hevc_sao.o hevc_pel.o
AVCODECOBJS-$(CONFIG_UTVIDEO_DECODER) += utvideodsp.o
AVCODECOBJS-$(CONFIG_V210_DECODER) += v210dec.o
AVCODECOBJS-$(CONFIG_V210_ENCODER) += v210enc.o
diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c
index e96d84a7da..c2184d260d 100644
--- a/tests/checkasm/checkasm.c
+++ b/tests/checkasm/checkasm.c
@@ -116,6 +116,7 @@ static const struct {
#endif
#if CONFIG_HEVC_DECODER
{ "hevc_add_res", checkasm_check_hevc_add_res },
+ { "hevc_deblock", checkasm_check_hevc_deblock },
{ "hevc_idct", checkasm_check_hevc_idct },
{ "hevc_pel", checkasm_check_hevc_pel },
{ "hevc_sao", checkasm_check_hevc_sao },
diff --git a/tests/checkasm/checkasm.h b/tests/checkasm/checkasm.h
index 8744a81218..89c643e6a0 100644
--- a/tests/checkasm/checkasm.h
+++ b/tests/checkasm/checkasm.h
@@ -60,6 +60,7 @@ void checkasm_check_h264dsp(void);
void checkasm_check_h264pred(void);
void checkasm_check_h264qpel(void);
void checkasm_check_hevc_add_res(void);
+void checkasm_check_hevc_deblock(void);
void checkasm_check_hevc_idct(void);
void checkasm_check_hevc_pel(void);
void checkasm_check_hevc_sao(void);
diff --git a/tests/checkasm/hevc_deblock.c b/tests/checkasm/hevc_deblock.c
new file mode 100644
index 0000000000..06c9c1969e
--- /dev/null
+++ b/tests/checkasm/hevc_deblock.c
@@ -0,0 +1,87 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include <string.h>
+
+#include "libavutil/intreadwrite.h"
+#include "libavutil/mem_internal.h"
+
+#include "libavcodec/avcodec.h"
+#include "libavcodec/hevcdsp.h"
+
+#include "checkasm.h"
+
+static const uint32_t pixel_mask[3] = { 0xffffffff, 0x03ff03ff, 0x0fff0fff }; // indexed by (bit_depth - 8) >> 1; entries 1 and 2 clear the top 6 resp. 4 bits of each 16-bit half
+
+#define SIZEOF_PIXEL ((bit_depth + 7) / 8) // NOTE(review): not referenced anywhere else in this file
+#define BUF_STRIDE (8 * 2) // 16 bytes per row
+#define BUF_LINES (8)
+#define BUF_OFFSET (BUF_STRIDE * BUF_LINES) // 128: offset added to the buffer start to get the pix pointer passed to the filters
+#define BUF_SIZE (BUF_STRIDE * BUF_LINES + BUF_OFFSET * 2) // 384 bytes in total
+
+#define randomize_buffers(buf0, buf1, size) \
+ do { \
+ uint32_t mask = pixel_mask[(bit_depth - 8) >> 1]; /* relies on a bit_depth variable in the calling scope */ \
+ int k; \
+ for (k = 0; k < size; k += 4) { /* one 32-bit word per step */ \
+ uint32_t r = rnd() & mask; \
+ AV_WN32A(buf0 + k, r); /* the same word goes to both buffers */ \
+ AV_WN32A(buf1 + k, r); \
+ } \
+ } while (0)
+
+static void check_deblock_chroma(HEVCDSPContext h, int bit_depth)
+{
+ int32_t tc[2] = { 1, 1 }; // NOTE(review): tc is constant, so only a single clamp value is ever tested
+ uint8_t no_p[2] = { 0, 0 }; // NOTE(review): no_p/no_q are always zero, so non-zero flags are never tested
+ uint8_t no_q[2] = { 0, 0 };
+ LOCAL_ALIGNED_32(uint8_t, buf0, [BUF_SIZE]); // buffer handed to call_ref
+ LOCAL_ALIGNED_32(uint8_t, buf1, [BUF_SIZE]); // buffer handed to call_new and bench_new
+
+ declare_func_emms(AV_CPU_FLAG_MMX, void, uint8_t *pix, ptrdiff_t stride, int32_t *tc, uint8_t *no_p, uint8_t *no_q);
+
+ randomize_buffers(buf0, buf1, BUF_SIZE); // both buffers start with identical random contents
+ if (check_func(h.hevc_h_loop_filter_chroma, "hevc_h_loop_filter_chroma_%d", bit_depth)) {
+ call_ref(buf0 + BUF_OFFSET, BUF_STRIDE, tc, no_p, no_q);
+ call_new(buf1 + BUF_OFFSET, BUF_STRIDE, tc, no_p, no_q);
+ if (memcmp(buf0, buf1, BUF_SIZE)) // whole buffer is compared, including the bytes before and after the filtered rows
+ fail();
+ bench_new(buf1 + BUF_OFFSET, BUF_STRIDE, tc, no_p, no_q);
+ }
+
+ randomize_buffers(buf0, buf1, BUF_SIZE); // fresh identical contents for the vertical-edge variant
+ if (check_func(h.hevc_v_loop_filter_chroma, "hevc_v_loop_filter_chroma_%d", bit_depth)) {
+ call_ref(buf0 + BUF_OFFSET, BUF_STRIDE, tc, no_p, no_q);
+ call_new(buf1 + BUF_OFFSET, BUF_STRIDE, tc, no_p, no_q);
+ if (memcmp(buf0, buf1, BUF_SIZE)) // any byte that differs between the two buffers fails the test
+ fail();
+ bench_new(buf1 + BUF_OFFSET, BUF_STRIDE, tc, no_p, no_q);
+ }
+}
+
+void checkasm_check_hevc_deblock(void)
+{
+ int bit_depth;
+
+ for (bit_depth = 8; bit_depth <= 12; bit_depth += 2) { // runs for 8, 10 and 12 bit
+ HEVCDSPContext h;
+ ff_hevc_dsp_init(&h, bit_depth); // set up the DSP context for this depth
+ check_deblock_chroma(h, bit_depth); // the context is passed by value
+ }
+ report("chroma"); // single report issued after all three depths have run
+}
diff --git a/tests/fate/checkasm.mak b/tests/fate/checkasm.mak
index a4e95541f5..faac764388 100644
--- a/tests/fate/checkasm.mak
+++ b/tests/fate/checkasm.mak
@@ -15,6 +15,7 @@ FATE_CHECKASM = fate-checkasm-aacpsdsp \
fate-checkasm-h264pred \
fate-checkasm-h264qpel \
fate-checkasm-hevc_add_res \
+ fate-checkasm-hevc_deblock \
fate-checkasm-hevc_idct \
fate-checkasm-hevc_pel \
fate-checkasm-hevc_sao \
--
2.39.2
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 9+ messages in thread
* [FFmpeg-devel] [PATCH v3 3/3] lavc/aarch64: add hevc deblock chroma 8-12bit
2023-03-22 0:07 ` [FFmpeg-devel] [PATCH 1/3] lavc/aarch64: add clip N macro J. Dekker
2023-03-22 0:07 ` [FFmpeg-devel] [PATCH v3 2/3] checkasm: add hevc_deblock chroma test J. Dekker
@ 2023-03-22 0:07 ` J. Dekker
2023-03-22 9:26 ` Martin Storsjö
2023-03-22 8:38 ` [FFmpeg-devel] [PATCH 1/3] lavc/aarch64: add clip N macro Martin Storsjö
2 siblings, 1 reply; 9+ messages in thread
From: J. Dekker @ 2023-03-22 0:07 UTC (permalink / raw)
To: ffmpeg-devel
Signed-off-by: J. Dekker <jdek@itanimul.li>
---
- Using clip macro
- Avoided need for .elseif at all, just used better mvni
libavcodec/aarch64/Makefile | 3 +-
libavcodec/aarch64/hevcdsp_deblock_neon.S | 180 ++++++++++++++++++++++
libavcodec/aarch64/hevcdsp_init_aarch64.c | 18 +++
3 files changed, 200 insertions(+), 1 deletion(-)
create mode 100644 libavcodec/aarch64/hevcdsp_deblock_neon.S
diff --git a/libavcodec/aarch64/Makefile b/libavcodec/aarch64/Makefile
index 02fb51c3ab..216191640c 100644
--- a/libavcodec/aarch64/Makefile
+++ b/libavcodec/aarch64/Makefile
@@ -65,7 +65,8 @@ NEON-OBJS-$(CONFIG_VP9_DECODER) += aarch64/vp9itxfm_16bpp_neon.o \
aarch64/vp9lpf_neon.o \
aarch64/vp9mc_16bpp_neon.o \
aarch64/vp9mc_neon.o
-NEON-OBJS-$(CONFIG_HEVC_DECODER) += aarch64/hevcdsp_idct_neon.o \
+NEON-OBJS-$(CONFIG_HEVC_DECODER) += aarch64/hevcdsp_deblock_neon.o \
+ aarch64/hevcdsp_idct_neon.o \
aarch64/hevcdsp_init_aarch64.o \
aarch64/hevcdsp_qpel_neon.o \
aarch64/hevcdsp_sao_neon.o
diff --git a/libavcodec/aarch64/hevcdsp_deblock_neon.S b/libavcodec/aarch64/hevcdsp_deblock_neon.S
new file mode 100644
index 0000000000..49b40f21c8
--- /dev/null
+++ b/libavcodec/aarch64/hevcdsp_deblock_neon.S
@@ -0,0 +1,180 @@
+/* -*-arm64-*-
+ * vim: syntax=arm64asm
+ *
+ * Copyright (c) 2014 Seppo Tomperi <seppo.tomperi@vtt.fi>
+ * Copyright (c) 2023 J. Dekker <jdek@itanimul.li>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+
+#include "libavutil/aarch64/asm.S"
+#include "neon.S"
+
+.macro hevc_loop_filter_chroma_start bitdepth
+ mov x4, x30 // keep the return address in x4, the entry points return with ret x4. NOTE(review): x4 is the 5th argument (_no_q) per the prototype comment in this file and is overwritten unread - confirm
+ ldr w14, [x2] // tc[0]
+ ldr w15, [x2, #4] // tc[1]
+.if \bitdepth > 8
+ lsl w14, w14, #(\bitdepth - 8) // scale tc by 1 << (bitdepth - 8)
+ lsl w15, w15, #(\bitdepth - 8)
+.endif
+ adds w2, w14, w15
+ b.eq 1f // tc[0] + tc[1] == 0: branch to the 1: label that the including function must provide
+ dup v16.4h, w14
+ dup v17.4h, w15
+ trn1 v16.2d, v16.2d, v17.2d // v16 = tc[0] in lanes 0-3, tc[1] in lanes 4-7
+.if \bitdepth > 8
+ mvni v19.8h, #((0xff << (\bitdepth - 8)) & 0xff), lsl #8 // upper clamp bound: 0x03ff for 10 bit, 0x0fff for 12 bit
+ movi v18.8h, #0 // lower clamp bound
+.endif
+ neg v17.8h, v16.8h // v17 = -tc
+.endm
+
+.macro hevc_loop_filter_chroma_body bitdepth
+.if \bitdepth <= 8
+ uxtl v0.8h, v0.8b // p1, zero-extended to 16 bit
+ uxtl v1.8h, v1.8b // p0
+ uxtl v2.8h, v2.8b // q0
+ uxtl v3.8h, v3.8b // q1
+.endif
+ sub v5.8h, v2.8h, v1.8h // q0 - p0
+ sub v6.8h, v0.8h, v3.8h // p1 - q1
+ shl v5.8h, v5.8h, #2 // 4 * (q0 - p0)
+ add v5.8h, v6.8h, v5.8h // 4 * (q0 - p0) + p1 - q1
+ srshr v5.8h, v5.8h, #3 // rounding shift right by 3
+ clip v17.8h, v16.8h, v5.8h // clamp delta to [-tc, tc]
+ sqadd v1.8h, v1.8h, v5.8h // p0 + delta
+ sqsub v2.8h, v2.8h, v5.8h // q0 - delta
+.if \bitdepth <= 8
+ sqxtun v1.8b, v1.8h // saturating narrow to unsigned 8 bit; v0 and v3 stay widened
+ sqxtun v2.8b, v2.8h
+.else
+ smin v1.8h, v1.8h, v19.8h // NOTE(review): this smin pair looks redundant, the clip below already applies smin against v19
+ smin v2.8h, v2.8h, v19.8h
+ clip v18.8h, v19.8h, v1.8h, v2.8h // clamp p0 and q0 to [v18, v19] as set by the start macro
+.endif
+.endm
+
+function hevc_loop_filter_chroma_body_8_neon, export=0
+ hevc_loop_filter_chroma_body 8 // 8-bit filter core, reached via bl from the h and v entry points
+ ret
+endfunc
+
+function hevc_loop_filter_chroma_body_10_neon, export=0
+hevc_loop_filter_chroma_body_12_neon: // 12-bit alias of the same code; the depth-specific tc shift and v19 bound come from the start macro
+ hevc_loop_filter_chroma_body 10 // every bitdepth > 8 expands to the same instructions
+ ret
+endfunc
+
+// void ff_hevc_h_loop_filter_chroma_8_neon(uint8_t *_pix, ptrdiff_t _stride, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
+
+.macro hevc_h_loop_filter_chroma bitdepth
+function ff_hevc_h_loop_filter_chroma_\bitdepth\()_neon, export=1
+ hevc_loop_filter_chroma_start \bitdepth
+ sub x0, x0, x1, lsl #1 // x0 = pix - 2 * stride
+.if \bitdepth > 8
+ ld1 {v0.8h}, [x0], x1 // row at pix - 2 * stride (p1 in the body)
+ ld1 {v1.8h}, [x0], x1 // p0
+ ld1 {v2.8h}, [x0], x1 // q0
+ ld1 {v3.8h}, [x0] // q1
+.else
+ ld1 {v0.8b}, [x0], x1 // same four rows, 8 bytes each
+ ld1 {v1.8b}, [x0], x1
+ ld1 {v2.8b}, [x0], x1
+ ld1 {v3.8b}, [x0]
+.endif
+ sub x0, x0, x1, lsl #1 // x0 is at pix + stride after the loads; step back to pix - stride
+ bl hevc_loop_filter_chroma_body_\bitdepth\()_neon // bl overwrites x30, hence the copy kept in x4
+.if \bitdepth > 8
+ st1 {v1.8h}, [x0], x1 // store filtered p0 row
+ st1 {v2.8h}, [x0] // store filtered q0 row
+.else
+ st1 {v1.8b}, [x0], x1 // store filtered p0 row
+ st1 {v2.8b}, [x0] // store filtered q0 row
+.endif
+1: ret x4 // also the early-exit target of the start macro
+endfunc
+.endm
+
+.macro hevc_v_loop_filter_chroma bitdepth
+function ff_hevc_v_loop_filter_chroma_\bitdepth\()_neon, export=1
+ hevc_loop_filter_chroma_start \bitdepth
+.if \bitdepth > 8
+ sub x0, x0, #8 // start 8 bytes (four 16-bit pixels) before pix
+ add x3, x0, x1 // x3 = next row. NOTE(review): x3 is the 4th argument (_no_p) per the prototype comment and is overwritten unread - confirm
+ lsl x1, x1, #1 // x1 = 2 * stride; x0 walks even rows, x3 odd rows
+ ld1 {v20.8h}, [x0], x1 // row 0
+ ld1 {v21.8h}, [x3], x1 // row 1
+ ld1 {v0.8h}, [x0], x1 // row 2
+ ld1 {v1.8h}, [x3], x1 // row 3
+ ld1 {v2.8h}, [x0], x1 // row 4
+ ld1 {v3.8h}, [x3], x1 // row 5
+ ld1 {v22.8h}, [x0], x1 // row 6
+ ld1 {v23.8h}, [x3], x1 // row 7
+ transpose_8x8H v20, v21, v0, v1, v2, v3, v22, v23, v24, v25 // afterwards v0..v3 are what the body treats as p1, p0, q0, q1
+.else
+ sub x0, x0, #4 // start 4 bytes (four 8-bit pixels) before pix
+ add x3, x0, x1 // x3 = next row (same overwrite of the _no_p argument as above)
+ lsl x1, x1, #1 // x1 = 2 * stride
+ ld1 {v20.8b}, [x0], x1 // rows 0..7, alternating between x0 and x3
+ ld1 {v21.8b}, [x3], x1
+ ld1 {v0.8b}, [x0], x1
+ ld1 {v1.8b}, [x3], x1
+ ld1 {v2.8b}, [x0], x1
+ ld1 {v3.8b}, [x3], x1
+ ld1 {v22.8b}, [x0], x1
+ ld1 {v23.8b}, [x3], x1
+ transpose_8x8B v20, v21, v0, v1, v2, v3, v22, v23, v24, v25 // v24 and v25 are passed as scratch registers
+.endif
+ sub x0, x0, x1, lsl #2 // each pointer advanced by 4 * x1 during the loads; rewind to rows 0 and 1
+ sub x3, x3, x1, lsl #2
+ bl hevc_loop_filter_chroma_body_\bitdepth\()_neon // bl overwrites x30, hence the copy kept in x4
+.if \bitdepth > 8
+ transpose_8x8H v20, v21, v0, v1, v2, v3, v22, v23, v24, v25 // transpose back to row order
+ st1 {v20.8h}, [x0], x1 // write all 8 rows back in load order
+ st1 {v21.8h}, [x3], x1
+ st1 {v0.8h}, [x0], x1
+ st1 {v1.8h}, [x3], x1
+ st1 {v2.8h}, [x0], x1
+ st1 {v3.8h}, [x3], x1
+ st1 {v22.8h}, [x0], x1
+ st1 {v23.8h}, [x3]
+.else
+ xtn v0.8b, v0.8h // restore: the body widened v0 and v3 but only narrowed v1 and v2
+ xtn v3.8b, v3.8h
+ transpose_8x8B v20, v21, v0, v1, v2, v3, v22, v23, v24, v25 // transpose back to row order
+ st1 {v20.8b}, [x0], x1 // write all 8 rows back in load order
+ st1 {v21.8b}, [x3], x1
+ st1 {v0.8b}, [x0], x1
+ st1 {v1.8b}, [x3], x1
+ st1 {v2.8b}, [x0], x1
+ st1 {v3.8b}, [x3], x1
+ st1 {v22.8b}, [x0], x1
+ st1 {v23.8b}, [x3]
+.endif
+1: ret x4 // also the early-exit target of the start macro
+endfunc
+.endm
+
+hevc_h_loop_filter_chroma 8
+hevc_h_loop_filter_chroma 10
+hevc_h_loop_filter_chroma 12
+
+hevc_v_loop_filter_chroma 8
+hevc_v_loop_filter_chroma 10
+hevc_v_loop_filter_chroma 12
diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c
index 1deefca0a2..a923bae35c 100644
--- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -25,6 +25,18 @@
#include "libavutil/aarch64/cpu.h"
#include "libavcodec/hevcdsp.h"
+void ff_hevc_v_loop_filter_chroma_8_neon(uint8_t *_pix, ptrdiff_t _stride,
+ const int *_tc, const uint8_t *_no_p, const uint8_t *_no_q);
+void ff_hevc_v_loop_filter_chroma_10_neon(uint8_t *_pix, ptrdiff_t _stride,
+ const int *_tc, const uint8_t *_no_p, const uint8_t *_no_q);
+void ff_hevc_v_loop_filter_chroma_12_neon(uint8_t *_pix, ptrdiff_t _stride,
+ const int *_tc, const uint8_t *_no_p, const uint8_t *_no_q);
+void ff_hevc_h_loop_filter_chroma_8_neon(uint8_t *_pix, ptrdiff_t _stride,
+ const int *_tc, const uint8_t *_no_p, const uint8_t *_no_q);
+void ff_hevc_h_loop_filter_chroma_10_neon(uint8_t *_pix, ptrdiff_t _stride,
+ const int *_tc, const uint8_t *_no_p, const uint8_t *_no_q);
+void ff_hevc_h_loop_filter_chroma_12_neon(uint8_t *_pix, ptrdiff_t _stride,
+ const int *_tc, const uint8_t *_no_p, const uint8_t *_no_q);
void ff_hevc_add_residual_4x4_8_neon(uint8_t *_dst, const int16_t *coeffs,
ptrdiff_t stride);
void ff_hevc_add_residual_4x4_10_neon(uint8_t *_dst, const int16_t *coeffs,
@@ -117,6 +129,8 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
if (!have_neon(av_get_cpu_flags())) return;
if (bit_depth == 8) {
+ c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_8_neon;
+ c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_8_neon;
c->add_residual[0] = ff_hevc_add_residual_4x4_8_neon;
c->add_residual[1] = ff_hevc_add_residual_8x8_8_neon;
c->add_residual[2] = ff_hevc_add_residual_16x16_8_neon;
@@ -167,6 +181,8 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
c->put_hevc_qpel_bi[9][0][1] = ff_hevc_put_hevc_qpel_bi_h16_8_neon;
}
if (bit_depth == 10) {
+ c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_10_neon;
+ c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_10_neon;
c->add_residual[0] = ff_hevc_add_residual_4x4_10_neon;
c->add_residual[1] = ff_hevc_add_residual_8x8_10_neon;
c->add_residual[2] = ff_hevc_add_residual_16x16_10_neon;
@@ -180,6 +196,8 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
c->idct_dc[3] = ff_hevc_idct_32x32_dc_10_neon;
}
if (bit_depth == 12) {
+ c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_12_neon;
+ c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_12_neon;
c->add_residual[0] = ff_hevc_add_residual_4x4_12_neon;
c->add_residual[1] = ff_hevc_add_residual_8x8_12_neon;
c->add_residual[2] = ff_hevc_add_residual_16x16_12_neon;
--
2.39.2
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [FFmpeg-devel] [PATCH 1/3] lavc/aarch64: add clip N macro
2023-03-22 0:07 ` [FFmpeg-devel] [PATCH 1/3] lavc/aarch64: add clip N macro J. Dekker
2023-03-22 0:07 ` [FFmpeg-devel] [PATCH v3 2/3] checkasm: add hevc_deblock chroma test J. Dekker
2023-03-22 0:07 ` [FFmpeg-devel] [PATCH v3 3/3] lavc/aarch64: add hevc deblock chroma 8-12bit J. Dekker
@ 2023-03-22 8:38 ` Martin Storsjö
2 siblings, 0 replies; 9+ messages in thread
From: Martin Storsjö @ 2023-03-22 8:38 UTC (permalink / raw)
To: FFmpeg development discussions and patches
On Wed, 22 Mar 2023, J. Dekker wrote:
> Signed-off-by: J. Dekker <jdek@itanimul.li>
> ---
> libavcodec/aarch64/hevcdsp_idct_neon.S | 19 +++++--------------
> libavcodec/aarch64/neon.S | 11 +++++++++++
> 2 files changed, 16 insertions(+), 14 deletions(-)
>
> diff --git a/libavcodec/aarch64/hevcdsp_idct_neon.S b/libavcodec/aarch64/hevcdsp_idct_neon.S
> index 467cb0f48a..3e59dd20bb 100644
> --- a/libavcodec/aarch64/hevcdsp_idct_neon.S
> +++ b/libavcodec/aarch64/hevcdsp_idct_neon.S
> @@ -5,7 +5,7 @@
> *
> * Ported from arm/hevcdsp_idct_neon.S by
> * Copyright (c) 2020 Reimar Döffinger
> - * Copyright (c) 2020 J. Dekker
> + * Copyright (c) 2023 J. Dekker <jdek@itanimul.li>
> *
> * This file is part of FFmpeg.
> *
> @@ -38,13 +38,6 @@ const trans, align=4
> .short 31, 22, 13, 4
> endconst
>
> -.macro clip2 in1, in2, min, max
> - smax \in1, \in1, \min
> - smax \in2, \in2, \min
> - smin \in1, \in1, \max
> - smin \in2, \in2, \max
> -.endm
> -
> function ff_hevc_add_residual_4x4_8_neon, export=1
> ld1 {v0.8h-v1.8h}, [x1]
> ld1 {v2.s}[0], [x0], x2
> @@ -182,7 +175,7 @@ function hevc_add_residual_4x4_16_neon, export=0
> ld1 {v3.d}[1], [x12], x2
> movi v4.8h, #0
> sqadd v1.8h, v1.8h, v3.8h
> - clip2 v0.8h, v1.8h, v4.8h, v21.8h
> + clip v4.8h, v21.8h, v0.8h, v1.8h
> st1 {v0.d}[0], [x0], x2
> st1 {v0.d}[1], [x0], x2
> st1 {v1.d}[0], [x0], x2
> @@ -201,7 +194,7 @@ function hevc_add_residual_8x8_16_neon, export=0
> sqadd v0.8h, v0.8h, v2.8h
> ld1 {v3.8h}, [x12]
> sqadd v1.8h, v1.8h, v3.8h
> - clip2 v0.8h, v1.8h, v4.8h, v21.8h
> + clip v4.8h, v21.8h, v0.8h, v1.8h
> st1 {v0.8h}, [x0], x2
> st1 {v1.8h}, [x12], x2
> bne 1b
> @@ -221,8 +214,7 @@ function hevc_add_residual_16x16_16_neon, export=0
> sqadd v1.8h, v1.8h, v17.8h
> sqadd v2.8h, v2.8h, v18.8h
> sqadd v3.8h, v3.8h, v19.8h
> - clip2 v0.8h, v1.8h, v20.8h, v21.8h
> - clip2 v2.8h, v3.8h, v20.8h, v21.8h
> + clip v20.8h, v21.8h, v0.8h, v1.8h, v2.8h, v3.8h
> st1 {v0.8h-v1.8h}, [x0], x2
> st1 {v2.8h-v3.8h}, [x12], x2
> bne 1b
> @@ -239,8 +231,7 @@ function hevc_add_residual_32x32_16_neon, export=0
> sqadd v1.8h, v1.8h, v17.8h
> sqadd v2.8h, v2.8h, v18.8h
> sqadd v3.8h, v3.8h, v19.8h
> - clip2 v0.8h, v1.8h, v20.8h, v21.8h
> - clip2 v2.8h, v3.8h, v20.8h, v21.8h
> + clip v20.8h, v21.8h, v0.8h, v1.8h, v2.8h, v3.8h
> st1 {v0.8h-v3.8h}, [x0], x2
> bne 1b
> ret
> diff --git a/libavcodec/aarch64/neon.S b/libavcodec/aarch64/neon.S
> index 1ad32c359d..bc105e4861 100644
> --- a/libavcodec/aarch64/neon.S
> +++ b/libavcodec/aarch64/neon.S
> @@ -1,6 +1,8 @@
> /*
> * This file is part of FFmpeg.
> *
> + * Copyright (c) 2023 J. Dekker <jdek@itanimul.li>
> + *
> * FFmpeg is free software; you can redistribute it and/or
> * modify it under the terms of the GNU Lesser General Public
> * License as published by the Free Software Foundation; either
> @@ -16,6 +18,15 @@
> * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> */
>
> +.macro clip min, max, regs:vararg
> +.irp x, \regs
> + smax \x, \x, \min
> +.endr
> +.irp x, \regs
> + smin \x, \x, \max
> +.endr
> +.endm
> +
LGTM, the vararg argument handling looks neat here.
// Martin
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [FFmpeg-devel] [PATCH v3 2/3] checkasm: add hevc_deblock chroma test
2023-03-22 0:07 ` [FFmpeg-devel] [PATCH v3 2/3] checkasm: add hevc_deblock chroma test J. Dekker
@ 2023-03-22 9:04 ` Martin Storsjö
0 siblings, 0 replies; 9+ messages in thread
From: Martin Storsjö @ 2023-03-22 9:04 UTC (permalink / raw)
To: FFmpeg development discussions and patches
On Wed, 22 Mar 2023, J. Dekker wrote:
> Signed-off-by: J. Dekker <jdek@itanimul.li>
> ---
>
> Added missing FATE target.
>
> tests/checkasm/Makefile | 2 +-
> tests/checkasm/checkasm.c | 1 +
> tests/checkasm/checkasm.h | 1 +
> tests/checkasm/hevc_deblock.c | 87 +++++++++++++++++++++++++++++++++++
> tests/fate/checkasm.mak | 1 +
> 5 files changed, 91 insertions(+), 1 deletion(-)
> create mode 100644 tests/checkasm/hevc_deblock.c
>
> diff --git a/tests/checkasm/Makefile b/tests/checkasm/Makefile
> index a6f06c7007..3e62a22bd6 100644
> --- a/tests/checkasm/Makefile
> +++ b/tests/checkasm/Makefile
> @@ -28,7 +28,7 @@ AVCODECOBJS-$(CONFIG_HUFFYUV_DECODER) += huffyuvdsp.o
> AVCODECOBJS-$(CONFIG_JPEG2000_DECODER) += jpeg2000dsp.o
> AVCODECOBJS-$(CONFIG_OPUS_DECODER) += opusdsp.o
> AVCODECOBJS-$(CONFIG_PIXBLOCKDSP) += pixblockdsp.o
> -AVCODECOBJS-$(CONFIG_HEVC_DECODER) += hevc_add_res.o hevc_idct.o hevc_sao.o hevc_pel.o
> +AVCODECOBJS-$(CONFIG_HEVC_DECODER) += hevc_add_res.o hevc_deblock.o hevc_idct.o hevc_sao.o hevc_pel.o
> AVCODECOBJS-$(CONFIG_UTVIDEO_DECODER) += utvideodsp.o
> AVCODECOBJS-$(CONFIG_V210_DECODER) += v210dec.o
> AVCODECOBJS-$(CONFIG_V210_ENCODER) += v210enc.o
> diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c
> index e96d84a7da..c2184d260d 100644
> --- a/tests/checkasm/checkasm.c
> +++ b/tests/checkasm/checkasm.c
> @@ -116,6 +116,7 @@ static const struct {
> #endif
> #if CONFIG_HEVC_DECODER
> { "hevc_add_res", checkasm_check_hevc_add_res },
> + { "hevc_deblock", checkasm_check_hevc_deblock },
> { "hevc_idct", checkasm_check_hevc_idct },
> { "hevc_pel", checkasm_check_hevc_pel },
> { "hevc_sao", checkasm_check_hevc_sao },
> diff --git a/tests/checkasm/checkasm.h b/tests/checkasm/checkasm.h
> index 8744a81218..89c643e6a0 100644
> --- a/tests/checkasm/checkasm.h
> +++ b/tests/checkasm/checkasm.h
> @@ -60,6 +60,7 @@ void checkasm_check_h264dsp(void);
> void checkasm_check_h264pred(void);
> void checkasm_check_h264qpel(void);
> void checkasm_check_hevc_add_res(void);
> +void checkasm_check_hevc_deblock(void);
> void checkasm_check_hevc_idct(void);
> void checkasm_check_hevc_pel(void);
> void checkasm_check_hevc_sao(void);
> diff --git a/tests/checkasm/hevc_deblock.c b/tests/checkasm/hevc_deblock.c
> new file mode 100644
> index 0000000000..06c9c1969e
> --- /dev/null
> +++ b/tests/checkasm/hevc_deblock.c
> @@ -0,0 +1,87 @@
> +/*
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License along
> + * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
> + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
> + */
> +
> +#include <string.h>
> +
> +#include "libavutil/intreadwrite.h"
> +#include "libavutil/mem_internal.h"
> +
> +#include "libavcodec/avcodec.h"
> +#include "libavcodec/hevcdsp.h"
> +
> +#include "checkasm.h"
> +
> +static const uint32_t pixel_mask[3] = { 0xffffffff, 0x03ff03ff, 0x0fff0fff };
> +
> +#define SIZEOF_PIXEL ((bit_depth + 7) / 8)
> +#define BUF_STRIDE (8 * 2)
> +#define BUF_LINES (8)
> +#define BUF_OFFSET (BUF_STRIDE * BUF_LINES)
> +#define BUF_SIZE (BUF_STRIDE * BUF_LINES + BUF_OFFSET * 2)
This looks like it's at least plenty of space, although it leaves me a bit
confused about the intended layout of the test buffer. The way I think
about it is that we'd have our buffer to be e.g. a 8x8 square (or 16x16);
then when we're deblocking vertically, we'd have BUF_OFFSET be
(height/2)*stride, i.e. starting at the middle left side of the square.
When deblocking horizontally, the start position can be (width/2), i.e. at
the center of the top edge of the square.
The current version is acceptable, but I don't quite get the
intended layout/sizing here, other than "big enough".
> +
> +#define randomize_buffers(buf0, buf1, size) \
> + do { \
> + uint32_t mask = pixel_mask[(bit_depth - 8) >> 1]; \
> + int k; \
> + for (k = 0; k < size; k += 4) { \
> + uint32_t r = rnd() & mask; \
> + AV_WN32A(buf0 + k, r); \
> + AV_WN32A(buf1 + k, r); \
> + } \
> + } while (0)
> +
> +static void check_deblock_chroma(HEVCDSPContext h, int bit_depth)
> +{
> + int32_t tc[2] = { 1, 1 };
> + uint8_t no_p[2] = { 0, 0 };
> + uint8_t no_q[2] = { 0, 0 };
I think it might be good to check that we honor these flags too; either an
exhaustive loop testing all combinations of 0/1 for the no_p/no_q flags,
plus possibly some different values for tc - or at least maybe a couple
tests with these flags/coefficients set to random combinations?
> + LOCAL_ALIGNED_32(uint8_t, buf0, [BUF_SIZE]);
> + LOCAL_ALIGNED_32(uint8_t, buf1, [BUF_SIZE]);
> +
> + declare_func_emms(AV_CPU_FLAG_MMX, void, uint8_t *pix, ptrdiff_t stride, int32_t *tc, uint8_t *no_p, uint8_t *no_q);
> +
> + randomize_buffers(buf0, buf1, BUF_SIZE);
> + if (check_func(h.hevc_h_loop_filter_chroma, "hevc_h_loop_filter_chroma_%d", bit_depth)) {
> + call_ref(buf0 + BUF_OFFSET, BUF_STRIDE, tc, no_p, no_q);
> + call_new(buf1 + BUF_OFFSET, BUF_STRIDE, tc, no_p, no_q);
> + if (memcmp(buf0, buf1, BUF_SIZE))
> + fail();
> + bench_new(buf1 + BUF_OFFSET, BUF_STRIDE, tc, no_p, no_q);
> + }
> +
> + randomize_buffers(buf0, buf1, BUF_SIZE);
> + if (check_func(h.hevc_v_loop_filter_chroma, "hevc_v_loop_filter_chroma_%d", bit_depth)) {
> + call_ref(buf0 + BUF_OFFSET, BUF_STRIDE, tc, no_p, no_q);
> + call_new(buf1 + BUF_OFFSET, BUF_STRIDE, tc, no_p, no_q);
> + if (memcmp(buf0, buf1, BUF_SIZE))
> + fail();
> + bench_new(buf1 + BUF_OFFSET, BUF_STRIDE, tc, no_p, no_q);
> + }
> +}
I mentioned last time that pure random data isn't a very good input for
deblocking tests, but I see that the hevc chroma deblocker doesn't compare
pixel differences for deciding whether to deblock or not, so with that in
mind, I guess this is sufficient.
The current test is at least good enough for making sure we run through
the code (for the non-functional test aspects, register clobbering etc)
and makes sure we do some work so that we can do proper benchmarking.
So I guess this is acceptable in this form, but with a suggestion to check
more combinations of tc/no_p/no_q.
// Martin
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [FFmpeg-devel] [PATCH v3 3/3] lavc/aarch64: add hevc deblock chroma 8-12bit
2023-03-22 0:07 ` [FFmpeg-devel] [PATCH v3 3/3] lavc/aarch64: add hevc deblock chroma 8-12bit J. Dekker
@ 2023-03-22 9:26 ` Martin Storsjö
0 siblings, 0 replies; 9+ messages in thread
From: Martin Storsjö @ 2023-03-22 9:26 UTC (permalink / raw)
To: FFmpeg development discussions and patches
On Wed, 22 Mar 2023, J. Dekker wrote:
> Signed-off-by: J. Dekker <jdek@itanimul.li>
> ---
>
> - Using clip macro
> - Avoided need for .elseif at all, just used better mvni
Can you provide some benchmark numbers for it in the commit message, to
get the ballpark figures?
> libavcodec/aarch64/Makefile | 3 +-
> libavcodec/aarch64/hevcdsp_deblock_neon.S | 180 ++++++++++++++++++++++
> libavcodec/aarch64/hevcdsp_init_aarch64.c | 18 +++
> 3 files changed, 200 insertions(+), 1 deletion(-)
> create mode 100644 libavcodec/aarch64/hevcdsp_deblock_neon.S
>
> diff --git a/libavcodec/aarch64/Makefile b/libavcodec/aarch64/Makefile
> index 02fb51c3ab..216191640c 100644
> --- a/libavcodec/aarch64/Makefile
> +++ b/libavcodec/aarch64/Makefile
> @@ -65,7 +65,8 @@ NEON-OBJS-$(CONFIG_VP9_DECODER) += aarch64/vp9itxfm_16bpp_neon.o \
> aarch64/vp9lpf_neon.o \
> aarch64/vp9mc_16bpp_neon.o \
> aarch64/vp9mc_neon.o
> -NEON-OBJS-$(CONFIG_HEVC_DECODER) += aarch64/hevcdsp_idct_neon.o \
> +NEON-OBJS-$(CONFIG_HEVC_DECODER) += aarch64/hevcdsp_deblock_neon.o \
> + aarch64/hevcdsp_idct_neon.o \
> aarch64/hevcdsp_init_aarch64.o \
> aarch64/hevcdsp_qpel_neon.o \
> aarch64/hevcdsp_sao_neon.o
> diff --git a/libavcodec/aarch64/hevcdsp_deblock_neon.S b/libavcodec/aarch64/hevcdsp_deblock_neon.S
> new file mode 100644
> index 0000000000..49b40f21c8
> --- /dev/null
> +++ b/libavcodec/aarch64/hevcdsp_deblock_neon.S
> @@ -0,0 +1,180 @@
> +/* -*-arm64-*-
> + * vim: syntax=arm64asm
> + *
> + * Copyright (c) 2014 Seppo Tomperi <seppo.tomperi@vtt.fi>
> + * Copyright (c) 2023 J. Dekker <jdek@itanimul.li>
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +
> +#include "libavutil/aarch64/asm.S"
> +#include "neon.S"
> +
> +.macro hevc_loop_filter_chroma_start bitdepth
> + mov x4, x30
> + ldr w14, [x2]
> + ldr w15, [x2, #4]
> +.if \bitdepth > 8
> + lsl w14, w14, #(\bitdepth - 8)
> + lsl w15, w15, #(\bitdepth - 8)
> +.endif
> + adds w2, w14, w15
> + b.eq 1f
> + dup v16.4h, w14
> + dup v17.4h, w15
> + trn1 v16.2d, v16.2d, v17.2d
> +.if \bitdepth > 8
> + mvni v19.8h, #((0xff << (\bitdepth - 8)) & 0xff), lsl #8
> + movi v18.8h, #0
> +.endif
> + neg v17.8h, v16.8h
> +.endm
> +
> +.macro hevc_loop_filter_chroma_body bitdepth
> +.if \bitdepth <= 8
> + uxtl v0.8h, v0.8b // p1
> + uxtl v1.8h, v1.8b // p0
> + uxtl v2.8h, v2.8b // q0
> + uxtl v3.8h, v3.8b // q1
> +.endif
> + sub v5.8h, v2.8h, v1.8h // q0 - p0
> + sub v6.8h, v0.8h, v3.8h // p1 - q1
> + shl v5.8h, v5.8h, #2
> + add v5.8h, v6.8h, v5.8h
> + srshr v5.8h, v5.8h, #3
> + clip v17.8h, v16.8h, v5.8h
> + sqadd v1.8h, v1.8h, v5.8h // p0 + delta
> + sqsub v2.8h, v2.8h, v5.8h // q0 - delta
For the bitdepth==8 case, I'm wondering if it'd be more straightforward to
do like the arm code does, i.e. don't do uxtl at the start, but widen with
usubl at the "q0-p0" stage, and then add with uaddw at the end. That makes
the function less templateable though. Would you mind giving that a try
just to get benchmark numbers for it? If there's not a big difference I
guess this more templated form is fine too.
If you try this, just comment out the more complex function that does
transposes. (We can simplify that one a little if we don't need to reverse
the uxtl.)
> +.if \bitdepth <= 8
> + sqxtun v1.8b, v1.8h
> + sqxtun v2.8b, v2.8h
> +.else
> + smin v1.8h, v1.8h, v19.8h
> + smin v2.8h, v2.8h, v19.8h
> + clip v18.8h, v19.8h, v1.8h, v2.8h
Don't you do duplicate clipping here - the smin instructions are redundant
with the clip macro?
> +.endif
> +.endm
> +
> +function hevc_loop_filter_chroma_body_8_neon, export=0
> + hevc_loop_filter_chroma_body 8
> + ret
> +endfunc
> +
> +function hevc_loop_filter_chroma_body_10_neon, export=0
> +hevc_loop_filter_chroma_body_12_neon:
> + hevc_loop_filter_chroma_body 10
> + ret
> +endfunc
> +
> +// void ff_hevc_h_loop_filter_chroma_8_neon(uint8_t *_pix, ptrdiff_t _stride, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
> +
> +.macro hevc_h_loop_filter_chroma bitdepth
> +function ff_hevc_h_loop_filter_chroma_\bitdepth\()_neon, export=1
> + hevc_loop_filter_chroma_start \bitdepth
> + sub x0, x0, x1, lsl #1
> +.if \bitdepth > 8
> + ld1 {v0.8h}, [x0], x1
> + ld1 {v1.8h}, [x0], x1
> + ld1 {v2.8h}, [x0], x1
> + ld1 {v3.8h}, [x0]
> +.else
> + ld1 {v0.8b}, [x0], x1
> + ld1 {v1.8b}, [x0], x1
> + ld1 {v2.8b}, [x0], x1
> + ld1 {v3.8b}, [x0]
> +.endif
> + sub x0, x0, x1, lsl #1
> + bl hevc_loop_filter_chroma_body_\bitdepth\()_neon
> +.if \bitdepth > 8
> + st1 {v1.8h}, [x0], x1
> + st1 {v2.8h}, [x0]
> +.else
> + st1 {v1.8b}, [x0], x1
> + st1 {v2.8b}, [x0]
> +.endif
> +1: ret x4
> +endfunc
> +.endm
> +
> +.macro hevc_v_loop_filter_chroma bitdepth
> +function ff_hevc_v_loop_filter_chroma_\bitdepth\()_neon, export=1
> + hevc_loop_filter_chroma_start \bitdepth
> +.if \bitdepth > 8
> + sub x0, x0, #8
> + add x3, x0, x1
> + lsl x1, x1, #1
> + ld1 {v20.8h}, [x0], x1
> + ld1 {v21.8h}, [x3], x1
> + ld1 {v0.8h}, [x0], x1
> + ld1 {v1.8h}, [x3], x1
> + ld1 {v2.8h}, [x0], x1
> + ld1 {v3.8h}, [x3], x1
> + ld1 {v22.8h}, [x0], x1
> + ld1 {v23.8h}, [x3], x1
> + transpose_8x8H v20, v21, v0, v1, v2, v3, v22, v23, v24, v25
Reading, transposing and writing back all of 8x8 pixels here, when we
really just want a 4x8 slice of pixels, is a bit excessive.
I see that the existing 32 bit assembly does it like that though, so I
guess it's acceptable - but I'd like to leave a remark that it can be done
more efficiently.
I believe this should be doable by loading the first 4 rows into v0-v3
.d[0] for 10/12 bpp and .s[0] for 8bpp, the last 4 rows into .d[1] or
.s[1], then do a transpose_4x8H or 4x8B on it, then the same in reverse at
the end.
// Martin
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 9+ messages in thread
end of thread, other threads:[~2023-03-22 9:27 UTC | newest]
Thread overview: 9+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-03-21 18:35 [FFmpeg-devel] [PATCH 1/2] checkasm: add hevc_deblock chroma test J. Dekker
2023-03-21 18:35 ` [FFmpeg-devel] [PATCH v2 2/2] lavc/aarch64: add hevc deblock chroma 8-12bit J. Dekker
2023-03-21 20:30 ` Martin Storsjö
2023-03-22 0:07 ` [FFmpeg-devel] [PATCH 1/3] lavc/aarch64: add clip N macro J. Dekker
2023-03-22 0:07 ` [FFmpeg-devel] [PATCH v3 2/3] checkasm: add hevc_deblock chroma test J. Dekker
2023-03-22 9:04 ` Martin Storsjö
2023-03-22 0:07 ` [FFmpeg-devel] [PATCH v3 3/3] lavc/aarch64: add hevc deblock chroma 8-12bit J. Dekker
2023-03-22 9:26 ` Martin Storsjö
2023-03-22 8:38 ` [FFmpeg-devel] [PATCH 1/3] lavc/aarch64: add clip N macro Martin Storsjö
Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
This inbox may be cloned and mirrored by anyone:
git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git
# If you have public-inbox 1.1+ installed, you may
# initialize and index your mirror using the following commands:
public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
ffmpegdev@gitmailbox.com
public-inbox-index ffmpegdev
Example config snippet for mirrors.
AGPL code for this site: git clone https://public-inbox.org/public-inbox.git