* [FFmpeg-devel] [PATCH] lavc/aarch64: add hevc horizontal qpel/uni/bi
From: J. Dekker @ 2022-05-24 11:38 UTC (permalink / raw)
To: ffmpeg-devel
checkasm --benchmark on Ampere Altra (Neoverse N1):
put_hevc_qpel_bi_h4_8_c: 173.7
put_hevc_qpel_bi_h4_8_neon: 77.0
put_hevc_qpel_bi_h6_8_c: 385.7
put_hevc_qpel_bi_h6_8_neon: 125.7
put_hevc_qpel_bi_h8_8_c: 680.7
put_hevc_qpel_bi_h8_8_neon: 137.5
put_hevc_qpel_bi_h12_8_c: 1480.0
put_hevc_qpel_bi_h12_8_neon: 438.5
put_hevc_qpel_bi_h16_8_c: 2663.2
put_hevc_qpel_bi_h16_8_neon: 561.5
put_hevc_qpel_bi_h24_8_c: 6039.0
put_hevc_qpel_bi_h24_8_neon: 1717.5
put_hevc_qpel_bi_h32_8_c: 11104.2
put_hevc_qpel_bi_h32_8_neon: 2222.0
put_hevc_qpel_bi_h48_8_c: 25175.2
put_hevc_qpel_bi_h48_8_neon: 4983.7
put_hevc_qpel_bi_h64_8_c: 42806.5
put_hevc_qpel_bi_h64_8_neon: 8848.5
put_hevc_qpel_h4_8_c: 149.7
put_hevc_qpel_h4_8_neon: 68.2
put_hevc_qpel_h6_8_c: 318.5
put_hevc_qpel_h6_8_neon: 105.2
put_hevc_qpel_h8_8_c: 577.0
put_hevc_qpel_h8_8_neon: 133.2
put_hevc_qpel_h12_8_c: 1276.0
put_hevc_qpel_h12_8_neon: 394.5
put_hevc_qpel_h16_8_c: 2278.2
put_hevc_qpel_h16_8_neon: 517.5
put_hevc_qpel_h24_8_c: 5081.7
put_hevc_qpel_h24_8_neon: 1546.5
put_hevc_qpel_h32_8_c: 9081.0
put_hevc_qpel_h32_8_neon: 2054.0
put_hevc_qpel_h48_8_c: 20280.7
put_hevc_qpel_h48_8_neon: 4615.5
put_hevc_qpel_h64_8_c: 36042.0
put_hevc_qpel_h64_8_neon: 8197.5
put_hevc_qpel_uni_h4_8_c: 165.5
put_hevc_qpel_uni_h4_8_neon: 73.5
put_hevc_qpel_uni_h6_8_c: 366.5
put_hevc_qpel_uni_h6_8_neon: 118.5
put_hevc_qpel_uni_h8_8_c: 661.7
put_hevc_qpel_uni_h8_8_neon: 138.2
put_hevc_qpel_uni_h12_8_c: 1440.5
put_hevc_qpel_uni_h12_8_neon: 399.5
put_hevc_qpel_uni_h16_8_c: 2489.0
put_hevc_qpel_uni_h16_8_neon: 532.2
put_hevc_qpel_uni_h24_8_c: 5896.5
put_hevc_qpel_uni_h24_8_neon: 1558.5
put_hevc_qpel_uni_h32_8_c: 10675.5
put_hevc_qpel_uni_h32_8_neon: 2092.2
put_hevc_qpel_uni_h48_8_c: 24103.0
put_hevc_qpel_uni_h48_8_neon: 4680.2
put_hevc_qpel_uni_h64_8_c: 42789.2
put_hevc_qpel_uni_h64_8_neon: 8330.0
Signed-off-by: J. Dekker <jdek@itanimul.li>
---
libavcodec/aarch64/Makefile | 1 +
libavcodec/aarch64/hevcdsp_init_aarch64.c | 43 +-
libavcodec/aarch64/hevcdsp_qpel_neon.S | 520 ++++++++++++++++++++++
3 files changed, 563 insertions(+), 1 deletion(-)
create mode 100644 libavcodec/aarch64/hevcdsp_qpel_neon.S
diff --git a/libavcodec/aarch64/Makefile b/libavcodec/aarch64/Makefile
index c8935f205e..2f95649c66 100644
--- a/libavcodec/aarch64/Makefile
+++ b/libavcodec/aarch64/Makefile
@@ -65,4 +65,5 @@ NEON-OBJS-$(CONFIG_VP9_DECODER) += aarch64/vp9itxfm_16bpp_neon.o \
aarch64/vp9mc_neon.o
NEON-OBJS-$(CONFIG_HEVC_DECODER) += aarch64/hevcdsp_idct_neon.o \
aarch64/hevcdsp_init_aarch64.o \
+ aarch64/hevcdsp_qpel_neon.o \
aarch64/hevcdsp_sao_neon.o
diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c
index 1e40be740c..ca2cb7cf97 100644
--- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -58,7 +58,21 @@ void ff_hevc_sao_band_filter_8x8_8_neon(uint8_t *_dst, uint8_t *_src,
int16_t *sao_offset_val, int sao_left_class,
int width, int height);
-
+void ff_hevc_put_hevc_qpel_h4_8_neon(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_hevc_qpel_h6_8_neon(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_hevc_qpel_h8_8_neon(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_hevc_qpel_h12_8_neon(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_hevc_qpel_h16_8_neon(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_hevc_qpel_uni_h4_8_neon(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_hevc_qpel_uni_h6_8_neon(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_hevc_qpel_uni_h8_8_neon(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_hevc_qpel_uni_h12_8_neon(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_hevc_qpel_uni_h16_8_neon(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_hevc_qpel_bi_h4_8_neon(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_hevc_qpel_bi_h6_8_neon(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_hevc_qpel_bi_h8_8_neon(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_hevc_qpel_bi_h12_8_neon(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_hevc_qpel_bi_h16_8_neon(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
{
@@ -80,6 +94,33 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
// for the current size, but if enabled for bigger sizes, the cases
// of non-multiple of 8 seem to arise.
// c->sao_band_filter[0] = ff_hevc_sao_band_filter_8x8_8_neon;
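+ // put_hevc_qpel[size][my][mx]: size indices 1..9 correspond to
+ // widths 4, 6, 8, 12, 16, 24, 32, 48 and 64; the h12/h16 functions
+ // below also cover the larger widths that are multiples of 12 or 16.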
+ c->put_hevc_qpel[1][0][1] = ff_hevc_put_hevc_qpel_h4_8_neon;
+ c->put_hevc_qpel[2][0][1] = ff_hevc_put_hevc_qpel_h6_8_neon;
+ c->put_hevc_qpel[3][0][1] = ff_hevc_put_hevc_qpel_h8_8_neon;
+ c->put_hevc_qpel[4][0][1] =
+ c->put_hevc_qpel[6][0][1] = ff_hevc_put_hevc_qpel_h12_8_neon;
+ c->put_hevc_qpel[5][0][1] =
+ c->put_hevc_qpel[7][0][1] =
+ c->put_hevc_qpel[8][0][1] =
+ c->put_hevc_qpel[9][0][1] = ff_hevc_put_hevc_qpel_h16_8_neon;
+ c->put_hevc_qpel_uni[1][0][1] = ff_hevc_put_hevc_qpel_uni_h4_8_neon;
+ c->put_hevc_qpel_uni[2][0][1] = ff_hevc_put_hevc_qpel_uni_h6_8_neon;
+ c->put_hevc_qpel_uni[3][0][1] = ff_hevc_put_hevc_qpel_uni_h8_8_neon;
+ c->put_hevc_qpel_uni[4][0][1] =
+ c->put_hevc_qpel_uni[6][0][1] = ff_hevc_put_hevc_qpel_uni_h12_8_neon;
+ c->put_hevc_qpel_uni[5][0][1] =
+ c->put_hevc_qpel_uni[7][0][1] =
+ c->put_hevc_qpel_uni[8][0][1] =
+ c->put_hevc_qpel_uni[9][0][1] = ff_hevc_put_hevc_qpel_uni_h16_8_neon;
+ c->put_hevc_qpel_bi[1][0][1] = ff_hevc_put_hevc_qpel_bi_h4_8_neon;
+ c->put_hevc_qpel_bi[2][0][1] = ff_hevc_put_hevc_qpel_bi_h6_8_neon;
+ c->put_hevc_qpel_bi[3][0][1] = ff_hevc_put_hevc_qpel_bi_h8_8_neon;
+ c->put_hevc_qpel_bi[4][0][1] =
+ c->put_hevc_qpel_bi[6][0][1] = ff_hevc_put_hevc_qpel_bi_h12_8_neon;
+ c->put_hevc_qpel_bi[5][0][1] =
+ c->put_hevc_qpel_bi[7][0][1] =
+ c->put_hevc_qpel_bi[8][0][1] =
+ c->put_hevc_qpel_bi[9][0][1] = ff_hevc_put_hevc_qpel_bi_h16_8_neon;
}
if (bit_depth == 10) {
c->add_residual[0] = ff_hevc_add_residual_4x4_10_neon;
diff --git a/libavcodec/aarch64/hevcdsp_qpel_neon.S b/libavcodec/aarch64/hevcdsp_qpel_neon.S
new file mode 100644
index 0000000000..bbaa32a9d9
--- /dev/null
+++ b/libavcodec/aarch64/hevcdsp_qpel_neon.S
@@ -0,0 +1,520 @@
+/* -*-arm64-*-
+ * vim: syntax=arm64asm
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+#define MAX_PB_SIZE 64
+
+const qpel_filters, align=4
+ .byte 0, 0, 0, 0, 0, 0, 0, 0
+ .byte -1, 4,-10, 58, 17, -5, 1, 0
+ .byte -1, 4,-11, 40, 40,-11, 4, -1
+ .byte 0, 1, -5, 17, 58,-10, 4, -1
+endconst
+
+.macro load_qpel_filter m
+ movrel x15, qpel_filters
+ add x15, x15, \m, lsl #3
+ ld1 {v0.8b}, [x15]
+ sxtl v0.8h, v0.8b
+.endm
+
+// void put_hevc_qpel_h(int16_t *dst,
+// uint8_t *_src, ptrdiff_t _srcstride,
+// int height, intptr_t mx, intptr_t my, int width)
+
+// void put_hevc_qpel_uni_h(uint8_t *_dst, ptrdiff_t _dststride,
+// uint8_t *_src, ptrdiff_t _srcstride,
+// int height, intptr_t mx, intptr_t my, int width)
+
+// void put_hevc_qpel_bi_h(uint8_t *_dst, ptrdiff_t _dststride,
+// uint8_t *_src, ptrdiff_t _srcstride,
+// int16_t *src2, int height, intptr_t mx,
+// intptr_t my, int width)
+
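+// All three variants share the same 8-tap horizontal core: the qpel
+// variant writes raw 16-bit intermediates at a MAX_PB_SIZE stride, uni
+// narrows to pixels with sqrshrun #6, and bi adds the int16_t src2
+// plane with sqadd before narrowing with sqrshrun #7.
+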
+.macro put_hevc type
+function ff_hevc_put_hevc_\type\()_h4_8_neon, export=1
+.ifc \type, qpel
+ load_qpel_filter x4
+ lsl x10, x2, #1 // src stride * 2
+ sub x13, x1, #3 // src1 = src - 3
+ mov x15, #(MAX_PB_SIZE << 2) // dst stride
+ add x14, x13, x2 // src2 = src1 + src stride
+ add x17, x0, #(MAX_PB_SIZE << 1) // dst2 = dst1 + 64 * 2
+.else
+.ifc \type, qpel_bi
+ load_qpel_filter x6
+ mov x6, #(MAX_PB_SIZE << 2) // rsrc stride << 1
+ add x7, x4, #(MAX_PB_SIZE << 1) // rsrc2
+.else
+ load_qpel_filter x5
+.endif
+ lsl x10, x3, #1 // src stride * 2
+ sub x13, x2, #3 // src1 = src - 3
+ lsl x15, x1, #1 // dst stride * 2
+ add x14, x13, x3 // src2 = src1 + src stride
+ add x17, x0, x1 // dst2 = dst1 + dst stride
+.endif
+0: ld1 {v16.8b, v17.8b}, [x13], x10
+ ld1 {v18.8b, v19.8b}, [x14], x10
+.ifc \type, qpel_bi
+ ld1 {v25.8h}, [x4], x6
+ ld1 {v26.8h}, [x7], x6
+.endif
+ uxtl v16.8h, v16.8b
+ uxtl v17.8h, v17.8b
+ uxtl v18.8h, v18.8b
+ uxtl v19.8h, v19.8b
+
+ mul v23.8h, v16.8h, v0.h[0]
+ mul v24.8h, v18.8h, v0.h[0]
+
+.irpc i, 1234567
+ ext v20.16b, v16.16b, v17.16b, #(2*\i)
+ ext v21.16b, v18.16b, v19.16b, #(2*\i)
+ mla v23.8h, v20.8h, v0.h[\i]
+ mla v24.8h, v21.8h, v0.h[\i]
+.endr
+
+.ifc \type, qpel
+ subs w3, w3, #2
+ st1 {v23.4h}, [ x0], x15
+ st1 {v24.4h}, [x17], x15
+.else
+.ifc \type, qpel_bi
+ subs w5, w5, #2
+ sqadd v23.8h, v23.8h, v25.8h
+ sqadd v24.8h, v24.8h, v26.8h
+ sqrshrun v23.8b, v23.8h, #7
+ sqrshrun v24.8b, v24.8h, #7
+.else
+ subs w4, w4, #2
+ sqrshrun v23.8b, v23.8h, #6
+ sqrshrun v24.8b, v24.8h, #6
+.endif
+ st1 {v23.s}[0], [ x0], x15
+ st1 {v24.s}[0], [x17], x15
+.endif
+ b.gt 0b // double line
+ ret
+endfunc
+
+function ff_hevc_put_hevc_\type\()_h6_8_neon, export=1
+.ifc \type, qpel
+ load_qpel_filter x4
+ lsl x10, x2, #1 // src stride * 2
+ sub x13, x1, #3 // src1 = src - 3
+ mov x15, #(MAX_PB_SIZE * 4 - 8) // dst stride
+ add x14, x13, x2 // src2 = src1 + src stride
+ add x17, x0, #(MAX_PB_SIZE << 1) // dst2 = dst1 + 64 * 2
+.else
+.ifc \type, qpel_bi
+ load_qpel_filter x6
+ mov x6, #(MAX_PB_SIZE << 2) // rsrc stride << 1
+ add x7, x4, #(MAX_PB_SIZE << 1) // rsrc2
+.else
+ load_qpel_filter x5
+.endif
+ lsl x10, x3, #1 // src stride * 2
+ sub x13, x2, #3 // src1 = src - 3
+ lsl x15, x1, #1 // dst stride * 2
+ subs x15, x15, #4
+ add x14, x13, x3 // src2 = src1 + src stride
+ add x17, x0, x1 // dst2 = dst1 + dst stride
+.endif
+0: ld1 {v16.8b, v17.8b}, [x13], x10
+ ld1 {v18.8b, v19.8b}, [x14], x10
+.ifc \type, qpel_bi
+ ld1 {v25.8h}, [x4], x6
+ ld1 {v26.8h}, [x7], x6
+.endif
+
+ uxtl v16.8h, v16.8b
+ uxtl v17.8h, v17.8b
+ uxtl v18.8h, v18.8b
+ uxtl v19.8h, v19.8b
+
+ mul v23.8h, v16.8h, v0.h[0]
+ mul v24.8h, v18.8h, v0.h[0]
+
+.irpc i, 1234567
+ ext v20.16b, v16.16b, v17.16b, #(2*\i)
+ ext v21.16b, v18.16b, v19.16b, #(2*\i)
+ mla v23.8h, v20.8h, v0.h[\i]
+ mla v24.8h, v21.8h, v0.h[\i]
+.endr
+
+.ifc \type, qpel
+ subs w3, w3, #2
+ st1 {v23.4h}, [ x0], #8
+ st1 {v23.s}[2], [ x0], x15
+ st1 {v24.4h}, [x17], #8
+ st1 {v24.s}[2], [x17], x15
+.else
+.ifc \type, qpel_bi
+ subs w5, w5, #2
+ sqadd v23.8h, v23.8h, v25.8h
+ sqadd v24.8h, v24.8h, v26.8h
+ sqrshrun v23.8b, v23.8h, #7
+ sqrshrun v24.8b, v24.8h, #7
+.else
+ subs w4, w4, #2
+ sqrshrun v23.8b, v23.8h, #6
+ sqrshrun v24.8b, v24.8h, #6
+.endif
+ st1 {v23.s}[0], [ x0], #4
+ st1 {v23.h}[2], [ x0], x15
+ st1 {v24.s}[0], [x17], #4
+ st1 {v24.h}[2], [x17], x15
+.endif
+ b.gt 0b // double line
+ ret
+endfunc
+
+function ff_hevc_put_hevc_\type\()_h8_8_neon, export=1
+.ifc \type, qpel
+ load_qpel_filter x4
+ lsl x10, x2, #1 // src stride * 2
+ sub x13, x1, #3 // src1 = src - 3
+ mov x15, #(MAX_PB_SIZE << 2) // dst stride
+ add x14, x13, x2 // src2 = src1 + src stride
+ add x17, x0, #(MAX_PB_SIZE << 1) // dst2 = dst1 + 64 * 2
+.else
+.ifc \type, qpel_bi
+ load_qpel_filter x6
+ mov x6, #(MAX_PB_SIZE << 2) // rsrc stride << 1
+ add x7, x4, #(MAX_PB_SIZE << 1) // rsrc2
+.else
+ load_qpel_filter x5
+.endif
+ lsl x10, x3, #1 // src stride * 2
+ sub x13, x2, #3 // src1 = src - 3
+ lsl x15, x1, #1 // dst stride * 2
+ add x14, x13, x3 // src2 = src1 + src stride
+ add x17, x0, x1 // dst2 = dst1 + dst stride
+.endif
+0: ld1 {v16.8b, v17.8b}, [x13], x10
+ ld1 {v18.8b, v19.8b}, [x14], x10
+.ifc \type, qpel_bi
+ ld1 {v25.8h}, [x4], x6
+ ld1 {v26.8h}, [x7], x6
+.endif
+
+ uxtl v16.8h, v16.8b
+ uxtl v17.8h, v17.8b
+ uxtl v18.8h, v18.8b
+ uxtl v19.8h, v19.8b
+
+ mul v23.8h, v16.8h, v0.h[0]
+ mul v24.8h, v18.8h, v0.h[0]
+
+.irpc i, 1234567
+ ext v20.16b, v16.16b, v17.16b, #(2*\i)
+ ext v21.16b, v18.16b, v19.16b, #(2*\i)
+ mla v23.8h, v20.8h, v0.h[\i]
+ mla v24.8h, v21.8h, v0.h[\i]
+.endr
+
+.ifc \type, qpel
+ subs w3, w3, #2
+ st1 {v23.8h}, [ x0], x15
+ st1 {v24.8h}, [x17], x15
+.else
+.ifc \type, qpel_bi
+ subs w5, w5, #2
+ sqadd v23.8h, v23.8h, v25.8h
+ sqadd v24.8h, v24.8h, v26.8h
+ sqrshrun v23.8b, v23.8h, #7
+ sqrshrun v24.8b, v24.8h, #7
+.else
+ subs w4, w4, #2
+ sqrshrun v23.8b, v23.8h, #6
+ sqrshrun v24.8b, v24.8h, #6
+.endif
+ st1 {v23.8b}, [ x0], x15
+ st1 {v24.8b}, [x17], x15
+.endif
+ b.gt 0b // double line
+ ret
+endfunc
+
+function ff_hevc_put_hevc_\type\()_h12_8_neon, export=1
+.ifc \type, qpel
+ load_qpel_filter x4
+ // blocks
+ mov w8, #0xAAAB
+ movk w8, #0x2AAA, lsl #16
+ smull x15, w8, w6
+ asr x15, x15, #33
+ sub w6, w15, w6, asr #31
+ // fast divide by 12, thank gcc for this one...
+
+ // src constants
+ lsl x10, x2, #1 // src stride * 2
+ sub x1, x1, #3 // src = src - 3
+
+ // dst constants
+ mov x15, #(MAX_PB_SIZE * 4 - 16) // dst stride
+
+ // loop
+ mov x8, xzr // hblock
+0: mov w7, w3
+
+ // 12 * hblock
+ lsl x12, x8, #3
+ add x12, x12, x8, lsl #2
+
+ add x13, x1, x12 // src1 = src0 + 12 * hblock
+ add x14, x13, x2 // src2 = src1 + src stride
+
+ add x16, x0, x12, lsl #1 // dst1 = dst0 + 12 * hblock * 2
+ add x17, x16, #(MAX_PB_SIZE << 1) // dst2 = dst1 + dst stride
+.else
+ // blocks
+.ifc \type, qpel_bi
+ ldrh w7, [sp]
+ load_qpel_filter x6
+.else
+ load_qpel_filter x5
+.endif
+ mov w9, #0xAAAB
+ movk w9, #0x2AAA, lsl #16
+ smull x15, w9, w7
+ asr x15, x15, #33
+ sub w6, w15, w7, asr #31
+
+ // src constants
+ lsl x10, x3, #1 // src stride * 2
+ sub x2, x2, #3 // src = src - 3
+
+ // dst constants
+ lsl x15, x1, #1 // dst stride * 2
+.ifc \type, qpel_bi
+ mov x9, #(MAX_PB_SIZE << 2)
+.endif
+ sub x15, x15, #8
+ // loop
+ mov x8, xzr // hblock
+0:
+.ifc \type, qpel_bi // height
+ mov w7, w5
+.else
+ mov w7, w4
+.endif
+ // 12 * hblock
+ lsl x12, x8, #3
+ add x12, x12, x8, lsl #2
+
+ add x13, x2, x12 // src1 = src0 + 12 * hblock
+ add x14, x13, x3 // src2 = src1 + src stride
+
+ add x16, x0, x12 // dst1 = dst0 + 12 * hblock
+ add x17, x16, x1 // dst2 = dst1 + dst stride
+.ifc \type, qpel_bi
+ add x11, x4, x12, lsl #1 // rsrc1 = rsrc0 + 12 * hblock * 2
+ add x12, x11, #(MAX_PB_SIZE << 1) // rsrc2 = rsrc1 + rsrc stride
+.endif
+.endif
+1: ld1 {v16.8b-v18.8b}, [x13], x10
+ ld1 {v19.8b-v21.8b}, [x14], x10
+
+ uxtl v16.8h, v16.8b
+ uxtl v17.8h, v17.8b
+ uxtl v18.8h, v18.8b
+
+ uxtl v19.8h, v19.8b
+ uxtl v20.8h, v20.8b
+ uxtl v21.8h, v21.8b
+
+ mul v26.8h, v16.8h, v0.h[0]
+ mul v27.8h, v17.8h, v0.h[0]
+ mul v28.8h, v19.8h, v0.h[0]
+ mul v29.8h, v20.8h, v0.h[0]
+
+.irpc i, 1234567
+ ext v22.16b, v16.16b, v17.16b, #(2*\i)
+ ext v23.16b, v17.16b, v18.16b, #(2*\i)
+
+ ext v24.16b, v19.16b, v20.16b, #(2*\i)
+ ext v25.16b, v20.16b, v21.16b, #(2*\i)
+
+ mla v26.8h, v22.8h, v0.h[\i]
+ mla v27.8h, v23.8h, v0.h[\i]
+
+ mla v28.8h, v24.8h, v0.h[\i]
+ mla v29.8h, v25.8h, v0.h[\i]
+.endr
+ subs w7, w7, #2
+.ifc \type, qpel
+ st1 {v26.8h}, [x16], #16
+ st1 {v27.4h}, [x16], x15
+ st1 {v28.8h}, [x17], #16
+ st1 {v29.4h}, [x17], x15
+.else
+.ifc \type, qpel_bi
+ ld1 {v16.8h, v17.8h}, [x11], x9
+ ld1 {v18.8h, v19.8h}, [x12], x9
+ sqadd v26.8h, v26.8h, v16.8h
+ sqadd v27.8h, v27.8h, v17.8h
+ sqadd v28.8h, v28.8h, v18.8h
+ sqadd v29.8h, v29.8h, v19.8h
+ sqrshrun v26.8b, v26.8h, #7
+ sqrshrun v27.8b, v27.8h, #7
+ sqrshrun v28.8b, v28.8h, #7
+ sqrshrun v29.8b, v29.8h, #7
+.else
+ sqrshrun v26.8b, v26.8h, #6
+ sqrshrun v27.8b, v27.8h, #6
+ sqrshrun v28.8b, v28.8h, #6
+ sqrshrun v29.8b, v29.8h, #6
+.endif
+ st1 {v26.8b}, [x16], #8
+ st1 {v27.s}[0], [x16], x15
+ st1 {v28.8b}, [x17], #8
+ st1 {v29.s}[0], [x17], x15
+.endif
+ b.gt 1b // double line
+ add x8, x8, #1
+ cmp x8, x6
+ b.lt 0b // line of blocks
+ ret
+endfunc
+
+function ff_hevc_put_hevc_\type\()_h16_8_neon, export=1
+ mov x8, xzr // hblock
+.ifc \type, qpel
+ load_qpel_filter x4
+ // blocks
+ lsr w6, w6, #4 // horizontal block count
+ // src constants
+ lsl x10, x2, #1 // src stride * 2
+ sub x1, x1, #3 // src = src - 3
+ // dst constants
+ mov x15, #(MAX_PB_SIZE * 4 - 16) // dst stride
+ // loop
+0: mov w7, w3 // reset height
+
+ add x13, x1, x8, lsl #4 // src1 = src0 + hblock * 16
+ add x14, x13, x2 // src2 = src1 + src stride
+
+ add x16, x0, x8, lsl #5 // dst1 = dst0 + hblock * 16 * 2
+ add x17, x16, #(MAX_PB_SIZE << 1) // dst2 = dst1 + 64 * 2
+.else
+.ifc \type, qpel_bi
+ mov x9, #(MAX_PB_SIZE << 2)
+ ldrh w7, [sp]
+ load_qpel_filter x6
+.else
+ load_qpel_filter x5
+.endif
+ // blocks
+ lsr w6, w7, #4 // horizontal block count
+ // src constants
+ lsl x10, x3, #1 // src stride * 2
+ sub x2, x2, #3 // src = src - 3
+ // dst constants
+ lsl x15, x1, #1 // dst stride * 2
+ sub x15, x15, #8
+ // loop
+0:
+.ifc \type, qpel_bi // height
+ mov w7, w5
+.else
+ mov w7, w4
+.endif
+
+ add x13, x2, x8, lsl #4 // src1 = src0 + hblock * 16
+ add x14, x13, x3 // src2 = src1 + src stride
+
+ add x16, x0, x8, lsl #4 // dst1 = dst0 + hblock * 16
+ add x17, x16, x1 // dst2 = dst1 + dst stride
+.ifc \type, qpel_bi
+ add x11, x4, x8, lsl #5 // rsrc1 = rsrc0 + 16 * hblock * 2
+ add x12, x11, #(MAX_PB_SIZE << 1) // rsrc2 = rsrc1 + rsrc stride
+.endif
+.endif
+1: ld1 {v16.8b-v18.8b}, [x13], x10
+ ld1 {v19.8b-v21.8b}, [x14], x10
+
+ uxtl v16.8h, v16.8b
+ uxtl v17.8h, v17.8b
+ uxtl v18.8h, v18.8b
+
+ uxtl v19.8h, v19.8b
+ uxtl v20.8h, v20.8b
+ uxtl v21.8h, v21.8b
+
+ mul v26.8h, v16.8h, v0.h[0]
+ mul v27.8h, v17.8h, v0.h[0]
+ mul v28.8h, v19.8h, v0.h[0]
+ mul v29.8h, v20.8h, v0.h[0]
+
+.irpc i, 1234567
+ ext v22.16b, v16.16b, v17.16b, #(2*\i)
+ ext v23.16b, v17.16b, v18.16b, #(2*\i)
+
+ ext v24.16b, v19.16b, v20.16b, #(2*\i)
+ ext v25.16b, v20.16b, v21.16b, #(2*\i)
+
+ mla v26.8h, v22.8h, v0.h[\i]
+ mla v27.8h, v23.8h, v0.h[\i]
+
+ mla v28.8h, v24.8h, v0.h[\i]
+ mla v29.8h, v25.8h, v0.h[\i]
+.endr
+ subs w7, w7, #2
+.ifc \type, qpel
+ st1 {v26.8h}, [x16], #16
+ st1 {v27.8h}, [x16], x15
+ st1 {v28.8h}, [x17], #16
+ st1 {v29.8h}, [x17], x15
+.else
+.ifc \type, qpel_bi
+ ld1 {v16.8h, v17.8h}, [x11], x9
+ ld1 {v18.8h, v19.8h}, [x12], x9
+ sqadd v26.8h, v26.8h, v16.8h
+ sqadd v27.8h, v27.8h, v17.8h
+ sqadd v28.8h, v28.8h, v18.8h
+ sqadd v29.8h, v29.8h, v19.8h
+ sqrshrun v26.8b, v26.8h, #7
+ sqrshrun v27.8b, v27.8h, #7
+ sqrshrun v28.8b, v28.8h, #7
+ sqrshrun v29.8b, v29.8h, #7
+.else
+ sqrshrun v26.8b, v26.8h, #6
+ sqrshrun v27.8b, v27.8h, #6
+ sqrshrun v28.8b, v28.8h, #6
+ sqrshrun v29.8b, v29.8h, #6
+.endif
+ st1 {v26.8b}, [x16], #8
+ st1 {v27.8b}, [x16], x15
+ st1 {v28.8b}, [x17], #8
+ st1 {v29.8b}, [x17], x15
+.endif
+ b.gt 1b // double line
+ add x8, x8, #1
+ cmp x8, x6
+ b.lt 0b // horizontal tiling
+ ret
+endfunc
+.endm
+
+put_hevc qpel
+put_hevc qpel_uni
+put_hevc qpel_bi
--
2.32.0 (Apple Git-132)
* Re: [FFmpeg-devel] [PATCH] lavc/aarch64: add hevc horizontal qpel/uni/bi
From: Martin Storsjö @ 2022-05-25 10:17 UTC (permalink / raw)
To: FFmpeg development discussions and patches
On Tue, 24 May 2022, J. Dekker wrote:
> libavcodec/aarch64/Makefile | 1 +
> libavcodec/aarch64/hevcdsp_init_aarch64.c | 43 +-
> libavcodec/aarch64/hevcdsp_qpel_neon.S | 520 ++++++++++++++++++++++
> 3 files changed, 563 insertions(+), 1 deletion(-)
> create mode 100644 libavcodec/aarch64/hevcdsp_qpel_neon.S
Overall comment: this now looks much more straightforward than before,
which is good! Some inline comments below.
> diff --git a/libavcodec/aarch64/hevcdsp_qpel_neon.S b/libavcodec/aarch64/hevcdsp_qpel_neon.S
> new file mode 100644
> index 0000000000..bbaa32a9d9
> --- /dev/null
> +++ b/libavcodec/aarch64/hevcdsp_qpel_neon.S
> @@ -0,0 +1,520 @@
> +/* -*-arm64-*-
> + * vim: syntax=arm64asm
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#include "libavutil/aarch64/asm.S"
> +#define MAX_PB_SIZE 64
> +
> +const qpel_filters, align=4
> + .byte 0, 0, 0, 0, 0, 0, 0, 0
> + .byte -1, 4,-10, 58, 17, -5, 1, 0
> + .byte -1, 4,-11, 40, 40,-11, 4, -1
> + .byte 0, 1, -5, 17, 58,-10, 4, -1
> +endconst
> +
> +.macro load_qpel_filter m
> + movrel x15, qpel_filters
> + add x15, x15, \m, lsl #3
> + ld1 {v0.8b}, [x15]
> + sxtl v0.8h, v0.8b
> +.endm
> +
> +// void put_hevc_qpel_h(int16_t *dst,
> +// uint8_t *_src, ptrdiff_t _srcstride,
> +// int height, intptr_t mx, intptr_t my, int width)
> +
> +// void put_hevc_qpel_uni_h(uint8_t *_dst, ptrdiff_t _dststride,
> +// uint8_t *_src, ptrdiff_t _srcstride,
> +// int height, intptr_t mx, intptr_t my, int width)
> +
> +// void put_hevc_qpel_bi_h(uint8_t *_dst, ptrdiff_t _dststride,
> +// uint8_t *_src, ptrdiff_t _srcstride,
> +// int16_t *src2, int height, intptr_t mx,
> +// intptr_t my, int width)
> +
> +.macro put_hevc type
> +function ff_hevc_put_hevc_\type\()_h4_8_neon, export=1
> +.ifc \type, qpel
> + load_qpel_filter x4
> + lsl x10, x2, #1 // src stride * 2
> + sub x13, x1, #3 // src1 = src - 3
> + mov x15, #(MAX_PB_SIZE << 2) // dst stride
> + add x14, x13, x2 // src2 = src1 + src stride
> + add x17, x0, #(MAX_PB_SIZE << 1) // dst2 = dst1 + 64 * 2
> +.else
> +.ifc \type, qpel_bi
> + load_qpel_filter x6
> + mov x6, #(MAX_PB_SIZE << 2) // rsrc stride << 1
> + add x7, x4, #(MAX_PB_SIZE << 1) // rsrc2
> +.else
> + load_qpel_filter x5
> +.endif
> + lsl x10, x3, #1 // src stride * 2
> + sub x13, x2, #3 // src1 = src - 3
> + lsl x15, x1, #1 // dst stride * 2
> + add x14, x13, x3 // src2 = src1 + src stride
> + add x17, x0, x1 // dst2 = dst1 + dst stride
> +.endif
> +0: ld1 {v16.8b, v17.8b}, [x13], x10
> + ld1 {v18.8b, v19.8b}, [x14], x10
> +.ifc \type, qpel_bi
> + ld1 {v25.8h}, [x4], x6
> + ld1 {v26.8h}, [x7], x6
> +.endif
> + uxtl v16.8h, v16.8b
> + uxtl v17.8h, v17.8b
> + uxtl v18.8h, v18.8b
> + uxtl v19.8h, v19.8b
> +
> + mul v23.8h, v16.8h, v0.h[0]
> + mul v24.8h, v18.8h, v0.h[0]
> +
> +.irpc i, 1234567
> + ext v20.16b, v16.16b, v17.16b, #(2*\i)
> + ext v21.16b, v18.16b, v19.16b, #(2*\i)
> + mla v23.8h, v20.8h, v0.h[\i]
> + mla v24.8h, v21.8h, v0.h[\i]
> +.endr
As we're only interested in .4h output here, we can do all the mul/mla
here with .4h too, which should give a bit of extra speedup here.
(Theoretically, one could consider packing two .4h halves into one
register and making do with only one mul/mla .8h, but I think two separate
.4h operations are quicker than the extra gymnastics it would require to
shuffle the inputs for that.)
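For instance (untested, keeping the same register assignment):

    mul     v23.4h, v16.4h, v0.h[0]
    mul     v24.4h, v18.4h, v0.h[0]
.irpc i, 1234567
    ext     v20.16b, v16.16b, v17.16b, #(2*\i)
    ext     v21.16b, v18.16b, v19.16b, #(2*\i)
    mla     v23.4h, v20.4h, v0.h[\i]
    mla     v24.4h, v21.4h, v0.h[\i]
.endr

(The ext still has to be .16b, since the outer taps reach into v17/v19.)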
> +
> +.ifc \type, qpel
> + subs w3, w3, #2
> + st1 {v23.4h}, [ x0], x15
> + st1 {v24.4h}, [x17], x15
> +.else
> +.ifc \type, qpel_bi
> + subs w5, w5, #2
> + sqadd v23.8h, v23.8h, v25.8h
> + sqadd v24.8h, v24.8h, v26.8h
These could also be plain .4h then.
> + sqrshrun v23.8b, v23.8h, #7
> + sqrshrun v24.8b, v24.8h, #7
> +.else
> + subs w4, w4, #2
> + sqrshrun v23.8b, v23.8h, #6
> + sqrshrun v24.8b, v24.8h, #6
> +.endif
> + st1 {v23.s}[0], [ x0], x15
> + st1 {v24.s}[0], [x17], x15
> +.endif
> + b.gt 0b // double line
> + ret
> +endfunc
> +
> +function ff_hevc_put_hevc_\type\()_h6_8_neon, export=1
> +.ifc \type, qpel
> + load_qpel_filter x4
> + lsl x10, x2, #1 // src stride * 2
> + sub x13, x1, #3 // src1 = src - 3
> + mov x15, #(MAX_PB_SIZE * 4 - 8) // dst stride
> + add x14, x13, x2 // src2 = src1 + src stride
> + add x17, x0, #(MAX_PB_SIZE << 1) // dst2 = dst1 + 64 * 2
> +.else
> +.ifc \type, qpel_bi
> + load_qpel_filter x6
> + mov x6, #(MAX_PB_SIZE << 2) // rsrc stride << 1
> + add x7, x4, #(MAX_PB_SIZE << 1) // rsrc2
> +.else
> + load_qpel_filter x5
> +.endif
> + lsl x10, x3, #1 // src stride * 2
> + sub x13, x2, #3 // src1 = src - 3
> + lsl x15, x1, #1 // dst stride * 2
> + subs x15, x15, #4
> + add x14, x13, x3 // src2 = src1 + src stride
> + add x17, x0, x1 // dst2 = dst1 + dst stride
> +.endif
> +0: ld1 {v16.8b, v17.8b}, [x13], x10
> + ld1 {v18.8b, v19.8b}, [x14], x10
> +.ifc \type, qpel_bi
> + ld1 {v25.8h}, [x4], x6
> + ld1 {v26.8h}, [x7], x6
> +.endif
> +
> + uxtl v16.8h, v16.8b
> + uxtl v17.8h, v17.8b
> + uxtl v18.8h, v18.8b
> + uxtl v19.8h, v19.8b
> +
> + mul v23.8h, v16.8h, v0.h[0]
> + mul v24.8h, v18.8h, v0.h[0]
> +
> +.irpc i, 1234567
> + ext v20.16b, v16.16b, v17.16b, #(2*\i)
> + ext v21.16b, v18.16b, v19.16b, #(2*\i)
> + mla v23.8h, v20.8h, v0.h[\i]
> + mla v24.8h, v21.8h, v0.h[\i]
> +.endr
> +
> +.ifc \type, qpel
> + subs w3, w3, #2
> + st1 {v23.4h}, [ x0], #8
> + st1 {v23.s}[2], [ x0], x15
> + st1 {v24.4h}, [x17], #8
> + st1 {v24.s}[2], [x17], x15
As the first st1 updates x0, there's some latency before the next
instruction can start, so here it's better to interleave the stores as x0,
x17, x0, x17. Same thing below, and in the h12 function.
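I.e. something like:

    st1     {v23.4h},   [x0],  #8
    st1     {v24.4h},   [x17], #8
    st1     {v23.s}[2], [x0],  x15
    st1     {v24.s}[2], [x17], x15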
> +.else
> +.ifc \type, qpel_bi
> + subs w5, w5, #2
> + sqadd v23.8h, v23.8h, v25.8h
> + sqadd v24.8h, v24.8h, v26.8h
> + sqrshrun v23.8b, v23.8h, #7
> + sqrshrun v24.8b, v24.8h, #7
> +.else
> + subs w4, w4, #2
> + sqrshrun v23.8b, v23.8h, #6
> + sqrshrun v24.8b, v24.8h, #6
> +.endif
> + st1 {v23.s}[0], [ x0], #4
> + st1 {v23.h}[2], [ x0], x15
> + st1 {v24.s}[0], [x17], #4
> + st1 {v24.h}[2], [x17], x15
> +.endif
> + b.gt 0b // double line
> + ret
> +endfunc
> +
> +function ff_hevc_put_hevc_\type\()_h8_8_neon, export=1
> +.ifc \type, qpel
> + load_qpel_filter x4
> + lsl x10, x2, #1 // src stride * 2
> + sub x13, x1, #3 // src1 = src - 3
> + mov x15, #(MAX_PB_SIZE << 2) // dst stride
> + add x14, x13, x2 // src2 = src1 + src stride
> + add x17, x0, #(MAX_PB_SIZE << 1) // dst2 = dst1 + 64 * 2
> +.else
> +.ifc \type, qpel_bi
> + load_qpel_filter x6
> + mov x6, #(MAX_PB_SIZE << 2) // rsrc stride << 1
> + add x7, x4, #(MAX_PB_SIZE << 1) // rsrc2
> +.else
> + load_qpel_filter x5
> +.endif
> + lsl x10, x3, #1 // src stride * 2
> + sub x13, x2, #3 // src1 = src - 3
> + lsl x15, x1, #1 // dst stride * 2
> + add x14, x13, x3 // src2 = src1 + src stride
> + add x17, x0, x1 // dst2 = dst1 + dst stride
> +.endif
> +0: ld1 {v16.8b, v17.8b}, [x13], x10
> + ld1 {v18.8b, v19.8b}, [x14], x10
> +.ifc \type, qpel_bi
> + ld1 {v25.8h}, [x4], x6
> + ld1 {v26.8h}, [x7], x6
> +.endif
> +
> + uxtl v16.8h, v16.8b
> + uxtl v17.8h, v17.8b
> + uxtl v18.8h, v18.8b
> + uxtl v19.8h, v19.8b
> +
> + mul v23.8h, v16.8h, v0.h[0]
> + mul v24.8h, v18.8h, v0.h[0]
> +
> +.irpc i, 1234567
> + ext v20.16b, v16.16b, v17.16b, #(2*\i)
> + ext v21.16b, v18.16b, v19.16b, #(2*\i)
> + mla v23.8h, v20.8h, v0.h[\i]
> + mla v24.8h, v21.8h, v0.h[\i]
> +.endr
> +
> +.ifc \type, qpel
> + subs w3, w3, #2
> + st1 {v23.8h}, [ x0], x15
> + st1 {v24.8h}, [x17], x15
> +.else
> +.ifc \type, qpel_bi
> + subs w5, w5, #2
> + sqadd v23.8h, v23.8h, v25.8h
> + sqadd v24.8h, v24.8h, v26.8h
> + sqrshrun v23.8b, v23.8h, #7
> + sqrshrun v24.8b, v24.8h, #7
> +.else
> + subs w4, w4, #2
> + sqrshrun v23.8b, v23.8h, #6
> + sqrshrun v24.8b, v24.8h, #6
> +.endif
> + st1 {v23.8b}, [ x0], x15
> + st1 {v24.8b}, [x17], x15
> +.endif
> + b.gt 0b // double line
> + ret
> +endfunc
> +
> +function ff_hevc_put_hevc_\type\()_h12_8_neon, export=1
> +.ifc \type, qpel
> + load_qpel_filter x4
> + // blocks
> + mov w8, #0xAAAB
> + movk w8, #0x2AAA, lsl #16
> + smull x15, w8, w6
> + asr x15, x15, #33
> + sub w6, w15, w6, asr #31
> + // fast divide by 12, thank gcc for this one...
> +
> + // src constants
> + lsl x10, x2, #1 // src stride * 2
> + sub x1, x1, #3 // src = src - 3
> +
> + // dst constants
> + mov x15, #(MAX_PB_SIZE * 4 - 16) // dst stride
> +
> + // loop
> + mov x8, xzr // hblock
> +0: mov w7, w3
> +
> + // 12 * hblock
> + lsl x12, x8, #3
> + add x12, x12, x8, lsl #2
> +
> + add x13, x1, x12 // src1 = src0 + 12 * hblock
> + add x14, x13, x2 // src2 = src1 + src stride
> +
> + add x16, x0, x12, lsl #1 // dst1 = dst0 + 12 * hblock * 2
> + add x17, x16, #(MAX_PB_SIZE << 1) // dst2 = dst1 + dst stride
> +.else
> + // blocks
> +.ifc \type, qpel_bi
> + ldrh w7, [sp]
> + load_qpel_filter x6
> +.else
> + load_qpel_filter x5
> +.endif
> + mov w9, #0xAAAB
> + movk w9, #0x2AAA, lsl #16
> + smull x15, w9, w7
> + asr x15, x15, #33
> + sub w6, w15, w7, asr #31
> +
> + // src constants
> + lsl x10, x3, #1 // src stride * 2
> + sub x2, x2, #3 // src = src - 3
> +
> + // dst constants
> + lsl x15, x1, #1 // dst stride * 2
> +.ifc \type, qpel_bi
> + mov x9, #(MAX_PB_SIZE << 2)
> +.endif
> + sub x15, x15, #8
> + // loop
> + mov x8, xzr // hblock
> +0:
> +.ifc \type, qpel_bi // height
> + mov w7, w5
> +.else
> + mov w7, w4
> +.endif
> + // 12 * hblock
> + lsl x12, x8, #3
> + add x12, x12, x8, lsl #2
> +
> + add x13, x2, x12 // src1 = src0 + 12 * hblock
> + add x14, x13, x3 // src2 = src1 + src stride
> +
> + add x16, x0, x12 // dst1 = dst0 + 12 * hblock
> + add x17, x16, x1 // dst2 = dst1 + dst stride
> +.ifc \type, qpel_bi
> + add x11, x4, x12, lsl #1 // rsrc1 = rsrc0 + 12 * hblock * 2
> + add x12, x11, #(MAX_PB_SIZE << 1) // rsrc2 = rsrc1 + rsrc stride
> +.endif
> +.endif
> +1: ld1 {v16.8b-v18.8b}, [x13], x10
> + ld1 {v19.8b-v21.8b}, [x14], x10
> +
> + uxtl v16.8h, v16.8b
> + uxtl v17.8h, v17.8b
> + uxtl v18.8h, v18.8b
> +
> + uxtl v19.8h, v19.8b
> + uxtl v20.8h, v20.8b
> + uxtl v21.8h, v21.8b
> +
> + mul v26.8h, v16.8h, v0.h[0]
> + mul v27.8h, v17.8h, v0.h[0]
> + mul v28.8h, v19.8h, v0.h[0]
> + mul v29.8h, v20.8h, v0.h[0]
> +
> +.irpc i, 1234567
> + ext v22.16b, v16.16b, v17.16b, #(2*\i)
> + ext v23.16b, v17.16b, v18.16b, #(2*\i)
> +
> + ext v24.16b, v19.16b, v20.16b, #(2*\i)
> + ext v25.16b, v20.16b, v21.16b, #(2*\i)
> +
> + mla v26.8h, v22.8h, v0.h[\i]
> + mla v27.8h, v23.8h, v0.h[\i]
> +
> + mla v28.8h, v24.8h, v0.h[\i]
> + mla v29.8h, v25.8h, v0.h[\i]
> +.endr
> + subs w7, w7, #2
> +.ifc \type, qpel
> + st1 {v26.8h}, [x16], #16
> + st1 {v27.4h}, [x16], x15
> + st1 {v28.8h}, [x17], #16
> + st1 {v29.4h}, [x17], x15
> +.else
> +.ifc \type, qpel_bi
> + ld1 {v16.8h, v17.8h}, [x11], x9
> + ld1 {v18.8h, v19.8h}, [x12], x9
> + sqadd v26.8h, v26.8h, v16.8h
> + sqadd v27.8h, v27.8h, v17.8h
> + sqadd v28.8h, v28.8h, v18.8h
> + sqadd v29.8h, v29.8h, v19.8h
> + sqrshrun v26.8b, v26.8h, #7
> + sqrshrun v27.8b, v27.8h, #7
> + sqrshrun v28.8b, v28.8h, #7
> + sqrshrun v29.8b, v29.8h, #7
> +.else
> + sqrshrun v26.8b, v26.8h, #6
> + sqrshrun v27.8b, v27.8h, #6
> + sqrshrun v28.8b, v28.8h, #6
> + sqrshrun v29.8b, v29.8h, #6
> +.endif
> + st1 {v26.8b}, [x16], #8
> + st1 {v27.s}[0], [x16], x15
> + st1 {v28.8b}, [x17], #8
> + st1 {v29.s}[0], [x17], x15
> +.endif
> + b.gt 1b // double line
> + add x8, x8, #1
> + cmp x8, x6
> + b.lt 0b // line of blocks
> + ret
> +endfunc
> +
> +function ff_hevc_put_hevc_\type\()_h16_8_neon, export=1
> + mov x8, xzr // hblock
> +.ifc \type, qpel
> + load_qpel_filter x4
> + // blocks
> + lsr w6, w6, #4 // horizontal block count
> + // src constants
> + lsl x10, x2, #1 // src stride * 2
> + sub x1, x1, #3 // src = src - 3
> + // dst constants
> + mov x15, #(MAX_PB_SIZE * 4 - 16) // dst stride
> + // loop
> +0: mov w7, w3 // reset height
> +
> + add x13, x1, x8, lsl #4 // src1 = src0 + hblock * 16
> + add x14, x13, x2 // src2 = src1 + src stride
> +
> + add x16, x0, x8, lsl #5 // dst1 = dst0 + hblock * 16 * 2
> + add x17, x16, #(MAX_PB_SIZE << 1) // dst2 = dst1 + 64 * 2
Alternatively, instead of doing "src = src_base + hblock*16" at the start
of each loop here, you could consider doing "src -= height*stride; src +=
16" at the end of each loop iteration. Overall I think it amounts to
essentially the same number of instructions (although that would end up
needing an msub), so it doesn't make any difference in that aspect.
But it would free up the x8 register, so that instead of counting x8 from
0 up to x6, you could just count down x6 and quit the loop when it reaches
zero.
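Roughly, at the end of the outer loop (untested; x12 is a hypothetical
spare register holding the zero-extended height, not something the patch
sets up):

    msub    x13, x2, x12, x13   // src -= height * srcstride
    add     x13, x13, #16       // src += 16: next block column
    subs    x6, x6, #1          // count the block counter down
    b.gt    0b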
> +.else
> +.ifc \type, qpel_bi
> + mov x9, #(MAX_PB_SIZE << 2)
> + ldrh w7, [sp]
> + load_qpel_filter x6
> +.else
> + load_qpel_filter x5
> +.endif
> + // blocks
> + lsr w6, w7, #4 // horizontal block count
> + // src constants
> + lsl x10, x3, #1 // src stride * 2
> + sub x2, x2, #3 // src = src - 3
> + // dst constants
> + lsl x15, x1, #1 // dst stride * 2
> + sub x15, x15, #8
> + // loop
> +0:
> +.ifc \type, qpel_bi // height
> + mov w7, w5
> +.else
> + mov w7, w4
> +.endif
> +
> + add x13, x2, x8, lsl #4 // src1 = src0 + hblock * 16
> + add x14, x13, x3 // src2 = src1 + src stride
> +
> + add x16, x0, x8, lsl #4 // dst1 = dst0 + hblock * 16
> + add x17, x16, x1 // dst2 = dst1 + dst stride
> +.ifc \type, qpel_bi
> + add x11, x4, x8, lsl #5 // rsrc1 = rsrc0 + 16 * hblock * 2
> + add x12, x11, #(MAX_PB_SIZE << 1) // rsrc2 = rsrc1 + rsrc stride
> +.endif
> +.endif
> +1: ld1 {v16.8b-v18.8b}, [x13], x10
> + ld1 {v19.8b-v21.8b}, [x14], x10
> +
> + uxtl v16.8h, v16.8b
> + uxtl v17.8h, v17.8b
> + uxtl v18.8h, v18.8b
> +
> + uxtl v19.8h, v19.8b
> + uxtl v20.8h, v20.8b
> + uxtl v21.8h, v21.8b
> +
> + mul v26.8h, v16.8h, v0.h[0]
> + mul v27.8h, v17.8h, v0.h[0]
> + mul v28.8h, v19.8h, v0.h[0]
> + mul v29.8h, v20.8h, v0.h[0]
> +
> +.irpc i, 1234567
> + ext v22.16b, v16.16b, v17.16b, #(2*\i)
> + ext v23.16b, v17.16b, v18.16b, #(2*\i)
> +
> + ext v24.16b, v19.16b, v20.16b, #(2*\i)
> + ext v25.16b, v20.16b, v21.16b, #(2*\i)
> +
> + mla v26.8h, v22.8h, v0.h[\i]
> + mla v27.8h, v23.8h, v0.h[\i]
> +
> + mla v28.8h, v24.8h, v0.h[\i]
> + mla v29.8h, v25.8h, v0.h[\i]
> +.endr
> + subs w7, w7, #2
> +.ifc \type, qpel
> + st1 {v26.8h}, [x16], #16
> + st1 {v27.8h}, [x16], x15
> + st1 {v28.8h}, [x17], #16
> + st1 {v29.8h}, [x17], x15
> +.else
> +.ifc \type, qpel_bi
> + ld1 {v16.8h, v17.8h}, [x11], x9
> + ld1 {v18.8h, v19.8h}, [x12], x9
> + sqadd v26.8h, v26.8h, v16.8h
> + sqadd v27.8h, v27.8h, v17.8h
> + sqadd v28.8h, v28.8h, v18.8h
> + sqadd v29.8h, v29.8h, v19.8h
> + sqrshrun v26.8b, v26.8h, #7
> + sqrshrun v27.8b, v27.8h, #7
> + sqrshrun v28.8b, v28.8h, #7
> + sqrshrun v29.8b, v29.8h, #7
> +.else
> + sqrshrun v26.8b, v26.8h, #6
> + sqrshrun v27.8b, v27.8h, #6
> + sqrshrun v28.8b, v28.8h, #6
> + sqrshrun v29.8b, v29.8h, #6
> +.endif
> + st1 {v26.8b}, [x16], #8
> + st1 {v27.8b}, [x16], x15
> + st1 {v28.8b}, [x17], #8
> + st1 {v29.8b}, [x17], x15
> +.endif
> + b.gt 1b // double line
> + add x8, x8, #1
> + cmp x8, x6
> + b.lt 0b // horizontal tiling
If you restructure the loop counting, you could do "subs x8, x8, #1" first
here, then do the resetting/incrementing of the src pointers, then a "b.gt
0b", hiding the latency between the subs and the branch (because here,
there's a tight dependency chain between add, cmp and b.lt).
I think this might show a little difference if benchmarked on an in-order
core, with large widths. (But this is not a big deal.)
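Schematically, continuing the countdown variant sketched above:

    subs    x6, x6, #1          // update the counter first...
    msub    x13, x2, x12, x13   // ...so the pointer bookkeeping hides
    add     x13, x13, #16       // the subs-to-branch latency
    b.gt    0b                  // flags are long since settled here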
// Martin