[FFmpeg-devel] [PATCH 1/2] lavc/aarch64: add hevc qpel assembly
From: J. Dekker @ 2022-01-20 8:10 UTC
To: ffmpeg-devel
Based on patch by: Rafal Dabrowa <fatwildcat@gmail.com>
---
libavcodec/aarch64/Makefile | 1 +
libavcodec/aarch64/hevcdsp_init_aarch64.c | 69 +
libavcodec/aarch64/hevcdsp_qpel_neon.S | 2799 +++++++++++++++++++++
3 files changed, 2869 insertions(+)
Some changes since it was last submitted: the code has been split up and
macro'd, with some scheduling and other improvements.
create mode 100644 libavcodec/aarch64/hevcdsp_qpel_neon.S
diff --git a/libavcodec/aarch64/Makefile b/libavcodec/aarch64/Makefile
index 954461f81d..8592692479 100644
--- a/libavcodec/aarch64/Makefile
+++ b/libavcodec/aarch64/Makefile
@@ -63,4 +63,5 @@ NEON-OBJS-$(CONFIG_VP9_DECODER) += aarch64/vp9itxfm_16bpp_neon.o \
aarch64/vp9mc_neon.o
NEON-OBJS-$(CONFIG_HEVC_DECODER) += aarch64/hevcdsp_idct_neon.o \
aarch64/hevcdsp_init_aarch64.o \
+ aarch64/hevcdsp_qpel_neon.o \
aarch64/hevcdsp_sao_neon.o
diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c
index 1e40be740c..69f0d9bc6f 100644
--- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -58,7 +58,63 @@ void ff_hevc_sao_band_filter_8x8_8_neon(uint8_t *_dst, uint8_t *_src,
int16_t *sao_offset_val, int sao_left_class,
int width, int height);
+#define NEON8_FNPROTO(fn, args) \
+ void ff_hevc_put_hevc_##fn##4_8_neon args; \
+ void ff_hevc_put_hevc_##fn##6_8_neon args; \
+ void ff_hevc_put_hevc_##fn##8_8_neon args; \
+ void ff_hevc_put_hevc_##fn##12_8_neon args; \
+ void ff_hevc_put_hevc_##fn##16_8_neon args; \
+ void ff_hevc_put_hevc_##fn##24_8_neon args; \
+ void ff_hevc_put_hevc_##fn##32_8_neon args; \
+ void ff_hevc_put_hevc_##fn##48_8_neon args; \
+    void ff_hevc_put_hevc_##fn##64_8_neon args; \
+
+NEON8_FNPROTO(qpel_h, (int16_t *dst,
+ uint8_t *src, ptrdiff_t srcstride,
+ int height, intptr_t mx, intptr_t my, int width));
+
+NEON8_FNPROTO(qpel_v, (int16_t *dst,
+ uint8_t *src, ptrdiff_t srcstride,
+ int height, intptr_t mx, intptr_t my, int width));
+
+NEON8_FNPROTO(qpel_hv, (int16_t *dst,
+ uint8_t *src, ptrdiff_t srcstride,
+ int height, intptr_t mx, intptr_t my, int width));
+
+NEON8_FNPROTO(qpel_uni_h, (uint8_t *dst, ptrdiff_t dststride,
+ uint8_t *src, ptrdiff_t srcstride,
+ int height, intptr_t mx, intptr_t my, int width));
+
+NEON8_FNPROTO(qpel_uni_v, (uint8_t *dst, ptrdiff_t dststride,
+ uint8_t *src, ptrdiff_t srcstride,
+ int height, intptr_t mx, intptr_t my, int width));
+
+NEON8_FNPROTO(qpel_uni_hv, (uint8_t *dst, ptrdiff_t dststride,
+ uint8_t *src, ptrdiff_t srcstride,
+ int height, intptr_t mx, intptr_t my, int width));
+
+NEON8_FNPROTO(qpel_bi_h, (uint8_t *dst, ptrdiff_t dststride,
+ uint8_t *src, ptrdiff_t srcstride, int16_t *src2,
+ int height, intptr_t mx, intptr_t my, int width));
+
+NEON8_FNPROTO(qpel_bi_v, (uint8_t *dst, ptrdiff_t dststride,
+ uint8_t *src, ptrdiff_t srcstride, int16_t *src2,
+ int height, intptr_t mx, intptr_t my, int width));
+
+NEON8_FNPROTO(qpel_bi_hv, (uint8_t *dst, ptrdiff_t dststride,
+ uint8_t *src, ptrdiff_t srcstride, int16_t *src2,
+ int height, intptr_t mx, intptr_t my, int width));
+
+#define NEON8_FNASSIGN(member, v, h, fn) \
+ member[1][v][h] = ff_hevc_put_hevc_##fn##4_8_neon; \
+ member[2][v][h] = ff_hevc_put_hevc_##fn##6_8_neon; \
+ member[3][v][h] = ff_hevc_put_hevc_##fn##8_8_neon; \
+ member[4][v][h] = ff_hevc_put_hevc_##fn##12_8_neon; \
+ member[5][v][h] = ff_hevc_put_hevc_##fn##16_8_neon; \
+ member[6][v][h] = ff_hevc_put_hevc_##fn##24_8_neon; \
+ member[7][v][h] = ff_hevc_put_hevc_##fn##32_8_neon; \
+ member[8][v][h] = ff_hevc_put_hevc_##fn##48_8_neon; \
+ member[9][v][h] = ff_hevc_put_hevc_##fn##64_8_neon;
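+
+/* As an illustration, NEON8_FNASSIGN(c->put_hevc_qpel, 0, 1, qpel_h)
+ * expands to nine assignments, one per block width, from
+ *   c->put_hevc_qpel[1][0][1] = ff_hevc_put_hevc_qpel_h4_8_neon;
+ * up to
+ *   c->put_hevc_qpel[9][0][1] = ff_hevc_put_hevc_qpel_h64_8_neon; */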
av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
{
@@ -80,6 +136,19 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
// for the current size, but if enabled for bigger sizes, the cases
// of non-multiple of 8 seem to arise.
// c->sao_band_filter[0] = ff_hevc_sao_band_filter_8x8_8_neon;
+
+ NEON8_FNASSIGN(c->put_hevc_qpel, 0, 0, pel_pixels);
+ NEON8_FNASSIGN(c->put_hevc_qpel, 0, 1, qpel_h);
+ NEON8_FNASSIGN(c->put_hevc_qpel, 1, 0, qpel_v);
+ NEON8_FNASSIGN(c->put_hevc_qpel, 1, 1, qpel_hv);
+ NEON8_FNASSIGN(c->put_hevc_qpel_uni, 0, 1, qpel_uni_h);
+ NEON8_FNASSIGN(c->put_hevc_qpel_uni, 1, 0, qpel_uni_v);
+ NEON8_FNASSIGN(c->put_hevc_qpel_uni, 1, 1, qpel_uni_hv);
+ NEON8_FNASSIGN(c->put_hevc_qpel_bi, 0, 0, pel_bi_pixels);
+ NEON8_FNASSIGN(c->put_hevc_qpel_bi, 0, 1, qpel_bi_h);
+ NEON8_FNASSIGN(c->put_hevc_qpel_bi, 1, 0, qpel_bi_v);
+ NEON8_FNASSIGN(c->put_hevc_qpel_bi, 1, 1, qpel_bi_hv);
+
}
if (bit_depth == 10) {
c->add_residual[0] = ff_hevc_add_residual_4x4_10_neon;
diff --git a/libavcodec/aarch64/hevcdsp_qpel_neon.S b/libavcodec/aarch64/hevcdsp_qpel_neon.S
new file mode 100644
index 0000000000..e8cc6f5f25
--- /dev/null
+++ b/libavcodec/aarch64/hevcdsp_qpel_neon.S
@@ -0,0 +1,2799 @@
+/* -*-arm64-*-
+ * vim: syntax=arm64asm
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+#define MAX_PB_SIZE 64
+
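+// HEVC luma 8-tap quarter-sample interpolation filters, one row of
+// eight taps per fractional position 0..3; load_qpel_filterb/h below
+// index this table with the mx/my fraction (fraction << 3 bytes).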
+.Lqpel_filters:
+ .byte 0, 0, 0, 0, 0, 0, 0, 0
+ .byte -1, 4,-10, 58, 17, -5, 1, 0
+ .byte -1, 4,-11, 40, 40,-11, 4, -1
+ .byte 0, 1, -5, 17, 58,-10, 4, -1
+
+.macro load_qpel_filterb freg, xreg
+ adr \xreg, .Lqpel_filters
+ add \xreg, \xreg, \freg, lsl #3
+ ld4r {v0.16b, v1.16b, v2.16b, v3.16b}, [\xreg], #4
+ ld4r {v4.16b, v5.16b, v6.16b, v7.16b}, [\xreg]
+ neg v0.16b, v0.16b
+ neg v2.16b, v2.16b
+ neg v5.16b, v5.16b
+ neg v7.16b, v7.16b
+.endm
+
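+// The non-positive taps (positions 0, 2, 5 and 7 of every filter row)
+// were negated in load_qpel_filterb, so the unsigned umlal/umlsl below
+// add or subtract them as needed. calc_qpelb filters the low 8 source
+// bytes, calc_qpelb2 the high 8, into 16-bit lanes of \dst.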
+.macro calc_qpelb dst, src0, src1, src2, src3, src4, src5, src6, src7
+ umlsl \dst\().8h, \src0\().8b, v0.8b
+ umlal \dst\().8h, \src1\().8b, v1.8b
+ umlsl \dst\().8h, \src2\().8b, v2.8b
+ umlal \dst\().8h, \src3\().8b, v3.8b
+ umlal \dst\().8h, \src4\().8b, v4.8b
+ umlsl \dst\().8h, \src5\().8b, v5.8b
+ umlal \dst\().8h, \src6\().8b, v6.8b
+ umlsl \dst\().8h, \src7\().8b, v7.8b
+.endm
+
+.macro calc_qpelb2 dst, src0, src1, src2, src3, src4, src5, src6, src7
+ umlsl2 \dst\().8h, \src0\().16b, v0.16b
+ umlal2 \dst\().8h, \src1\().16b, v1.16b
+ umlsl2 \dst\().8h, \src2\().16b, v2.16b
+ umlal2 \dst\().8h, \src3\().16b, v3.16b
+ umlal2 \dst\().8h, \src4\().16b, v4.16b
+ umlsl2 \dst\().8h, \src5\().16b, v5.16b
+ umlal2 \dst\().8h, \src6\().16b, v6.16b
+ umlsl2 \dst\().8h, \src7\().16b, v7.16b
+.endm
+
+.macro load_qpel_filterh freg, xreg
+ adr \xreg, .Lqpel_filters
+ add \xreg, \xreg, \freg, lsl #3
+ ld1 {v0.8b}, [\xreg]
+ sxtl v0.8h, v0.8b
+.endm
+
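+// Second-pass (vertical) filter for the hv functions: 16-bit
+// intermediates in, 32-bit accumulation, then the narrowing or shift
+// passed in \op (sqshrn #6 on the plain hv path, sqrshrn #12 for
+// uni_hv). calc_qpelh handles the low 4 lanes, calc_qpelh2 the high 4.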
+.macro calc_qpelh dst, src0, src1, src2, src3, src4, src5, src6, src7, op, shift=6
+ smull \dst\().4s, \src0\().4h, v0.h[0]
+ smlal \dst\().4s, \src1\().4h, v0.h[1]
+ smlal \dst\().4s, \src2\().4h, v0.h[2]
+ smlal \dst\().4s, \src3\().4h, v0.h[3]
+ smlal \dst\().4s, \src4\().4h, v0.h[4]
+ smlal \dst\().4s, \src5\().4h, v0.h[5]
+ smlal \dst\().4s, \src6\().4h, v0.h[6]
+ smlal \dst\().4s, \src7\().4h, v0.h[7]
+.ifc \op, sshr
+ sshr \dst\().4s, \dst\().4s, \shift
+.else
+ \op \dst\().4h, \dst\().4s, \shift
+.endif
+.endm
+
+.macro calc_qpelh2 dst, dstt, src0, src1, src2, src3, src4, src5, src6, src7, op, shift=6
+ smull2 \dstt\().4s, \src0\().8h, v0.h[0]
+ smlal2 \dstt\().4s, \src1\().8h, v0.h[1]
+ smlal2 \dstt\().4s, \src2\().8h, v0.h[2]
+ smlal2 \dstt\().4s, \src3\().8h, v0.h[3]
+ smlal2 \dstt\().4s, \src4\().8h, v0.h[4]
+ smlal2 \dstt\().4s, \src5\().8h, v0.h[5]
+ smlal2 \dstt\().4s, \src6\().8h, v0.h[6]
+ smlal2 \dstt\().4s, \src7\().8h, v0.h[7]
+.ifc \op, sshr
+ sshr \dst\().4s, \dstt\().4s, \shift
+.else
+ \op \dst\().8h, \dstt\().4s, \shift
+.endif
+.endm
+
+function ff_hevc_put_hevc_qpel_h4_8_neon, export=1
+ load_qpel_filterb x4, x5
+ sub x1, x1, #3
+ sub x2, x2, #8
+ mov x14, #(MAX_PB_SIZE * 2)
+1: ld1 {v16.8b}, [x1], #8
+ ld1 {v17.s}[0], [x1], x2
+.macro calc src0, src1, idx
+ ushr \src0\().2d, \src1\().2d, #8
+ mov \src0\().b[7], v17.b[\idx]
+.endm
+ calc v18, v16, 0
+ calc v19, v18, 1
+ calc v20, v19, 2
+ ushr v21.2d, v20.2d, #8
+ ushr v22.2d, v21.2d, #8
+ ushr v23.2d, v22.2d, #8
+ ushr v24.2d, v23.2d, #8
+ movi v28.8h, #0
+ calc_qpelb v28, v16, v18, v19, v20, v21, v22, v23, v24
+ subs w3, w3, #1
+ st1 {v28.4h}, [x0], x14
+ b.ne 1b
+ ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_h6_8_neon, export=1
+ load_qpel_filterb x4, x5
+ sub x1, x1, #3
+ mov x14, #(MAX_PB_SIZE * 2 - 8)
+1: ld1 {v16.8b, v17.8b}, [x1], x2
+ // same macro
+ calc v18, v16, 0
+ calc v19, v18, 1
+ calc v20, v19, 2
+ calc v21, v20, 3
+ calc v22, v21, 4
+ ushr v23.2d, v22.2d, #8
+ ushr v24.2d, v23.2d, #8
+ movi v28.8h, #0
+ calc_qpelb v28, v16, v18, v19, v20, v21, v22, v23, v24
+ st1 {v28.4h}, [x0], #8
+ subs w3, w3, #1
+ st1 {v28.s}[2], [x0], x14
+ b.ne 1b
+ ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_h8_8_neon, export=1
+ sxtw x4, w4
+ sxtw x7, w7
+ load_qpel_filterb x4, x5
+ sub x1, x1, #3
+ mov x14, #(MAX_PB_SIZE * 2)
+1: ld1 {v16.8b, v17.8b}, [x1], x2
+   // reuses the calc macro defined in qpel_h4 above
+ calc v18, v16, 0
+ calc v19, v18, 1
+ calc v20, v19, 2
+ calc v21, v20, 3
+ calc v22, v21, 4
+ calc v23, v22, 5
+ calc v24, v23, 6
+.purgem calc
+ movi v28.8h, #0
+ calc_qpelb v28, v16, v18, v19, v20, v21, v22, v23, v24
+ subs w3, w3, #1
+ st1 {v28.8h}, [x0], x14
+ b.ne 1b
+ ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_h12_8_neon, export=1
+ load_qpel_filterb x4, x5
+ sub x1, x1, #3
+ sub x2, x2, #16
+ mov x14, #(MAX_PB_SIZE * 2 - 16)
+1: ld2 {v16.8b, v17.8b}, [x1], #16
+ ld1 {v27.s}[0], [x1], x2
+ ushr v18.2d, v16.2d, #8
+ ushr v19.2d, v17.2d, #8
+ mov v18.b[7], v27.b[0]
+ mov v19.b[7], v27.b[1]
+ ushr v20.2d, v18.2d, #8
+ ushr v21.2d, v19.2d, #8
+ mov v20.b[7], v27.b[2]
+ mov v21.b[7], v27.b[3]
+ ushr v22.2d, v20.2d, #8
+ ushr v23.2d, v21.2d, #8
+ ushr v24.2d, v22.2d, #8
+ movi v28.8h, #0
+ movi v29.8h, #0
+ calc_qpelb v28, v16, v17, v18, v19, v20, v21, v22, v23
+ calc_qpelb v29, v17, v18, v19, v20, v21, v22, v23, v24
+ zip1 v16.8h, v28.8h, v29.8h
+ zip2 v17.8h, v28.8h, v29.8h
+ st1 {v16.8h}, [x0], #16
+ subs w3, w3, #1
+ st1 {v17.4h}, [x0], x14
+ b.ne 1b
+ ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_h16_8_neon, export=1
+ load_qpel_filterb x4, x5
+ sub x1, x1, #3
+ sub x2, x2, #16
+ mov x14, #(MAX_PB_SIZE * 2)
+1: ld2 {v16.8b, v17.8b}, [x1], #16
+ ld1 {v27.8b}, [x1], x2
+ ushr v18.2d, v16.2d, #8
+ ushr v19.2d, v17.2d, #8
+ mov v18.b[7], v27.b[0]
+ mov v19.b[7], v27.b[1]
+ ushr v20.2d, v18.2d, #8
+ ushr v21.2d, v19.2d, #8
+ mov v20.b[7], v27.b[2]
+ mov v21.b[7], v27.b[3]
+ ushr v22.2d, v20.2d, #8
+ ushr v23.2d, v21.2d, #8
+ mov v22.b[7], v27.b[4]
+ mov v23.b[7], v27.b[5]
+ ushr v24.2d, v22.2d, #8
+ mov v24.b[7], v27.b[6]
+ movi v28.8h, #0
+ movi v29.8h, #0
+ calc_qpelb v28, v16, v17, v18, v19, v20, v21, v22, v23
+ calc_qpelb v29, v17, v18, v19, v20, v21, v22, v23, v24
+ subs w3, w3, #1
+ st2 {v28.8h, v29.8h}, [x0], x14
+ b.ne 1b
+ ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_h24_8_neon, export=1
+ load_qpel_filterb x4, x5
+ sub x1, x1, #3
+ sub x2, x2, #24
+ mov x14, #(MAX_PB_SIZE * 2)
+1: ld3 {v16.8b, v17.8b, v18.8b}, [x1], #24
+ ld1 {v27.8b}, [x1], x2
+ ushr v19.2d, v16.2d, #8
+ ushr v20.2d, v17.2d, #8
+ ushr v21.2d, v18.2d, #8
+ mov v19.b[7], v27.b[0]
+ mov v20.b[7], v27.b[1]
+ mov v21.b[7], v27.b[2]
+ ushr v22.2d, v19.2d, #8
+ ushr v23.2d, v20.2d, #8
+ ushr v24.2d, v21.2d, #8
+ mov v22.b[7], v27.b[3]
+ mov v23.b[7], v27.b[4]
+ mov v24.b[7], v27.b[5]
+ ushr v25.2d, v22.2d, #8
+ mov v25.b[7], v27.b[6]
+ movi v28.8h, #0
+ movi v29.8h, #0
+ movi v30.8h, #0
+ calc_qpelb v28, v16, v17, v18, v19, v20, v21, v22, v23
+ calc_qpelb v29, v17, v18, v19, v20, v21, v22, v23, v24
+ calc_qpelb v30, v18, v19, v20, v21, v22, v23, v24, v25
+ subs w3, w3, #1
+ st3 {v28.8h, v29.8h, v30.8h}, [x0], x14
+ b.ne 1b
+ ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_h32_8_neon, export=1
+ load_qpel_filterb x4, x5
+ sub x1, x1, #3
+ sub x2, x2, #32
+ mov x14, #(MAX_PB_SIZE * 2)
+1: ld4 {v16.8b, v17.8b, v18.8b, v19.8b}, [x1], #32
+ movi v28.8h, #0
+ movi v29.8h, #0
+ ld1 {v27.8b}, [x1], x2
+ movi v30.8h, #0
+ movi v31.8h, #0
+ ushr v20.2d, v16.2d, #8
+ ushr v21.2d, v17.2d, #8
+ ushr v22.2d, v18.2d, #8
+ ushr v23.2d, v19.2d, #8
+ mov v20.b[7], v27.b[0]
+ mov v21.b[7], v27.b[1]
+ mov v22.b[7], v27.b[2]
+ mov v23.b[7], v27.b[3]
+ ushr v24.2d, v20.2d, #8
+ ushr v25.2d, v21.2d, #8
+ ushr v26.2d, v22.2d, #8
+ mov v24.b[7], v27.b[4]
+ mov v25.b[7], v27.b[5]
+ mov v26.b[7], v27.b[6]
+ calc_qpelb v28, v16, v17, v18, v19, v20, v21, v22, v23
+ calc_qpelb v29, v17, v18, v19, v20, v21, v22, v23, v24
+ calc_qpelb v30, v18, v19, v20, v21, v22, v23, v24, v25
+ calc_qpelb v31, v19, v20, v21, v22, v23, v24, v25, v26
+ st4 {v28.8h, v29.8h, v30.8h, v31.8h}, [x0], x14
+ subs w3, w3, #1
+ b.ne 1b
+ ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_h48_8_neon, export=1
+ load_qpel_filterb x4, x5
+ sub x1, x1, #3
+ sub x2, x2, #48
+ mov x7, #24
+ mov x14, #80
+1: ld3 {v16.16b, v17.16b, v18.16b}, [x1], x7
+ movi v28.8h, #0
+ ld1 {v26.8b}, [x1], x7
+ movi v29.8h, #0
+ ld1 {v27.8b}, [x1], x2
+ movi v30.8h, #0
+ ushr v19.2d, v16.2d, #8
+ ushr v20.2d, v17.2d, #8
+ ushr v21.2d, v18.2d, #8
+ mov v19.b[7], v26.b[0]
+ mov v19.b[15], v27.b[0]
+ mov v20.b[7], v26.b[1]
+ mov v20.b[15], v27.b[1]
+ mov v21.b[7], v26.b[2]
+ mov v21.b[15], v27.b[2]
+ ushr v22.2d, v19.2d, #8
+ ushr v23.2d, v20.2d, #8
+ ushr v24.2d, v21.2d, #8
+ mov v22.b[7], v26.b[3]
+ mov v22.b[15], v27.b[3]
+ mov v23.b[7], v26.b[4]
+ mov v23.b[15], v27.b[4]
+ mov v24.b[7], v26.b[5]
+ mov v24.b[15], v27.b[5]
+ ushr v25.2d, v22.2d, #8
+ mov v25.b[7], v26.b[6]
+ mov v25.b[15], v27.b[6]
+ calc_qpelb v28, v16, v17, v18, v19, v20, v21, v22, v23
+ calc_qpelb v29, v17, v18, v19, v20, v21, v22, v23, v24
+ calc_qpelb v30, v18, v19, v20, v21, v22, v23, v24, v25
+ st3 {v28.8h, v29.8h, v30.8h}, [x0], #48
+ movi v28.8h, #0
+ movi v29.8h, #0
+ movi v30.8h, #0
+ calc_qpelb2 v28, v16, v17, v18, v19, v20, v21, v22, v23
+ calc_qpelb2 v29, v17, v18, v19, v20, v21, v22, v23, v24
+ calc_qpelb2 v30, v18, v19, v20, v21, v22, v23, v24, v25
+ st3 {v28.8h, v29.8h, v30.8h}, [x0], x14
+ subs w3, w3, #1
+ b.ne 1b
+ ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_h64_8_neon, export=1
+ load_qpel_filterb x4, x5
+ sub x1, x1, #3
+ sub x2, x2, #64
+ mov x7, #32
+1: ld4 {v16.16b, v17.16b, v18.16b, v19.16b}, [x1], x7
+ ld1 {v27.8b}, [x1], x7
+ ld1 {v28.8b}, [x1], x2
+ ushr v20.2d, v16.2d, #8
+ ushr v21.2d, v17.2d, #8
+ ushr v22.2d, v18.2d, #8
+ ushr v23.2d, v19.2d, #8
+ mov v20.b[7], v27.b[0]
+ mov v21.b[7], v27.b[1]
+ mov v22.b[7], v27.b[2]
+ mov v23.b[7], v27.b[3]
+ mov v20.b[15], v28.b[0]
+ mov v21.b[15], v28.b[1]
+ mov v22.b[15], v28.b[2]
+ mov v23.b[15], v28.b[3]
+ ushr v24.2d, v20.2d, #8
+ ushr v25.2d, v21.2d, #8
+ ushr v26.2d, v22.2d, #8
+ mov v24.b[7], v27.b[4]
+ mov v25.b[7], v27.b[5]
+ mov v26.b[7], v27.b[6]
+ mov v24.b[15], v28.b[4]
+ mov v25.b[15], v28.b[5]
+ mov v26.b[15], v28.b[6]
+.macro calc fn
+ movi v28.8h, #0
+ movi v29.8h, #0
+ movi v30.8h, #0
+ movi v31.8h, #0
+ \fn v28, v16, v17, v18, v19, v20, v21, v22, v23
+ \fn v29, v17, v18, v19, v20, v21, v22, v23, v24
+ \fn v30, v18, v19, v20, v21, v22, v23, v24, v25
+ \fn v31, v19, v20, v21, v22, v23, v24, v25, v26
+ st4 {v28.8h, v29.8h, v30.8h, v31.8h}, [x0], #64
+.endm
+ calc calc_qpelb
+ calc calc_qpelb2
+.purgem calc
+ subs w3, w3, #1
+ b.ne 1b
+ ret
+endfunc
+
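+// calc_all/calc_all2 unroll the row loop eight times: each step loads
+// the next row into the register whose data has just left the 8-tap
+// window (rotating through v16-v23, or the v16-v31 pairs in
+// calc_all2), stores one output row via the per-function "calc" macro,
+// and exits to label 2 once the height counter reaches zero.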
+.macro calc_all
+ calc v23, v16, v17, v18, v19, v20, v21, v22, v23
+ b.eq 2f
+ calc v16, v17, v18, v19, v20, v21, v22, v23, v16
+ b.eq 2f
+ calc v17, v18, v19, v20, v21, v22, v23, v16, v17
+ b.eq 2f
+ calc v18, v19, v20, v21, v22, v23, v16, v17, v18
+ b.eq 2f
+ calc v19, v20, v21, v22, v23, v16, v17, v18, v19
+ b.eq 2f
+ calc v20, v21, v22, v23, v16, v17, v18, v19, v20
+ b.eq 2f
+ calc v21, v22, v23, v16, v17, v18, v19, v20, v21
+ b.eq 2f
+ calc v22, v23, v16, v17, v18, v19, v20, v21, v22
+ b.hi 1b
+.endm
+
+.macro calc_all2
+ calc v30, v31, v16, v18, v20, v22, v24, v26, v28, v30, v17, v19, v21, v23, v25, v27, v29, v31
+ b.eq 2f
+ calc v16, v17, v18, v20, v22, v24, v26, v28, v30, v16, v19, v21, v23, v25, v27, v29, v31, v17
+ b.eq 2f
+ calc v18, v19, v20, v22, v24, v26, v28, v30, v16, v18, v21, v23, v25, v27, v29, v31, v17, v19
+ b.eq 2f
+ calc v20, v21, v22, v24, v26, v28, v30, v16, v18, v20, v23, v25, v27, v29, v31, v17, v19, v21
+ b.eq 2f
+ calc v22, v23, v24, v26, v28, v30, v16, v18, v20, v22, v25, v27, v29, v31, v17, v19, v21, v23
+ b.eq 2f
+ calc v24, v25, v26, v28, v30, v16, v18, v20, v22, v24, v27, v29, v31, v17, v19, v21, v23, v25
+ b.eq 2f
+ calc v26, v27, v28, v30, v16, v18, v20, v22, v24, v26, v29, v31, v17, v19, v21, v23, v25, v27
+ b.eq 2f
+ calc v28, v29, v30, v16, v18, v20, v22, v24, v26, v28, v31, v17, v19, v21, v23, v25, v27, v29
+ b.hi 1b
+.endm
+
+function ff_hevc_put_hevc_qpel_v4_8_neon, export=1
+ load_qpel_filterb x5, x4
+ sub x1, x1, x2, lsl #1
+ mov x9, #(MAX_PB_SIZE * 2)
+ sub x1, x1, x2
+ ld1 {v16.s}[0], [x1], x2
+ ld1 {v17.s}[0], [x1], x2
+ ld1 {v18.s}[0], [x1], x2
+ ld1 {v19.s}[0], [x1], x2
+ ld1 {v20.s}[0], [x1], x2
+ ld1 {v21.s}[0], [x1], x2
+ ld1 {v22.s}[0], [x1], x2
+.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
+ ld1 {\tmp\().s}[0], [x1], x2
+ movi v24.8h, #0
+ calc_qpelb v24, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+ st1 {v24.4h}, [x0], x9
+ subs w3, w3, #1
+ b.eq 2f
+.endm
+1: calc_all
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_v6_8_neon, export=1
+ load_qpel_filterb x5, x4
+ sub x1, x1, x2, lsl #1
+ mov x9, #(MAX_PB_SIZE * 2 - 8)
+ sub x1, x1, x2
+ ld1 {v16.8b}, [x1], x2
+ ld1 {v17.8b}, [x1], x2
+ ld1 {v18.8b}, [x1], x2
+ ld1 {v19.8b}, [x1], x2
+ ld1 {v20.8b}, [x1], x2
+ ld1 {v21.8b}, [x1], x2
+ ld1 {v22.8b}, [x1], x2
+.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
+ ld1 {\tmp\().8b}, [x1], x2
+ movi v24.8h, #0
+ calc_qpelb v24, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+ st1 {v24.4h}, [x0], #8
+ st1 {v24.s}[2], [x0], x9
+ subs w3, w3, #1
+.endm
+1: calc_all
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_v8_8_neon, export=1
+ load_qpel_filterb x5, x4
+ sub x1, x1, x2, lsl #1
+ mov x9, #(MAX_PB_SIZE * 2)
+ sub x1, x1, x2
+ ld1 {v16.8b}, [x1], x2
+ ld1 {v17.8b}, [x1], x2
+ ld1 {v18.8b}, [x1], x2
+ ld1 {v19.8b}, [x1], x2
+ ld1 {v20.8b}, [x1], x2
+ ld1 {v21.8b}, [x1], x2
+ ld1 {v22.8b}, [x1], x2
+.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
+ ld1 {\tmp\().8b}, [x1], x2
+ movi v24.8h, #0
+ calc_qpelb v24, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+ st1 {v24.8h}, [x0], x9
+ subs w3, w3, #1
+.endm
+1: calc_all
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_v12_8_neon, export=1
+ load_qpel_filterb x5, x4
+ sub x1, x1, x2, lsl #1
+ mov x9, #(MAX_PB_SIZE * 2 - 16)
+ sub x1, x1, x2
+ ld1 {v16.16b}, [x1], x2
+ ld1 {v17.16b}, [x1], x2
+ ld1 {v18.16b}, [x1], x2
+ ld1 {v19.16b}, [x1], x2
+ ld1 {v20.16b}, [x1], x2
+ ld1 {v21.16b}, [x1], x2
+ ld1 {v22.16b}, [x1], x2
+.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
+ ld1 {\tmp\().16b}, [x1], x2
+ movi v24.8h, #0
+ movi v25.8h, #0
+ calc_qpelb v24, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+ calc_qpelb2 v25, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+ st1 {v24.8h}, [x0], #16
+ subs w3, w3, #1
+ st1 {v25.4h}, [x0], x9
+.endm
+1: calc_all
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_v16_8_neon, export=1
+ load_qpel_filterb x5, x4
+ sub x1, x1, x2, lsl #1
+ mov x9, #(MAX_PB_SIZE * 2)
+ sub x1, x1, x2
+ ld1 {v16.16b}, [x1], x2
+ ld1 {v17.16b}, [x1], x2
+ ld1 {v18.16b}, [x1], x2
+ ld1 {v19.16b}, [x1], x2
+ ld1 {v20.16b}, [x1], x2
+ ld1 {v21.16b}, [x1], x2
+ ld1 {v22.16b}, [x1], x2
+.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
+ ld1 {\tmp\().16b}, [x1], x2
+ movi v24.8h, #0
+ movi v25.8h, #0
+ calc_qpelb v24, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+ calc_qpelb2 v25, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+ subs w3, w3, #1
+ st1 {v24.8h, v25.8h}, [x0], x9
+.endm
+1: calc_all
+.purgem calc
+2: ret
+endfunc
+
+// TODO: loads 32 bytes per row although only 24 are used
+function ff_hevc_put_hevc_qpel_v24_8_neon, export=1
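+        // v8-v10 hold callee-saved d8-d10 (AAPCS64); spill them before
+        // use and restore them before returning.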
+ sub sp, sp, #48
+ st1 {v8.16b, v9.16b, v10.16b}, [sp]
+ load_qpel_filterb x5, x4
+ sub x1, x1, x2, lsl #1
+ sub x1, x1, x2
+ mov x9, #(MAX_PB_SIZE * 2)
+ ld1 {v16.16b, v17.16b}, [x1], x2
+ ld1 {v18.16b, v19.16b}, [x1], x2
+ ld1 {v20.16b, v21.16b}, [x1], x2
+ ld1 {v22.16b, v23.16b}, [x1], x2
+ ld1 {v24.16b, v25.16b}, [x1], x2
+ ld1 {v26.16b, v27.16b}, [x1], x2
+ ld1 {v28.16b, v29.16b}, [x1], x2
+.macro calc tmp0, tmp1, src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11, src12, src13, src14, src15
+ ld1 {\tmp0\().16b, \tmp1\().16b}, [x1], x2
+ movi v8.8h, #0
+ movi v9.8h, #0
+ movi v10.8h, #0
+ calc_qpelb v8, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+ calc_qpelb2 v9, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+ calc_qpelb v10, \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15
+ subs w3, w3, #1
+ st1 {v8.8h, v9.8h, v10.8h}, [x0], x9
+.endm
+1: calc_all2
+.purgem calc
+2: ld1 {v8.16b, v9.16b, v10.16b}, [sp], #48
+ ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_v32_8_neon, export=1
+ sub sp, sp, #64
+ st1 {v8.16b-v11.16b}, [sp]
+ load_qpel_filterb x5, x4
+ sub x1, x1, x2, lsl #1
+ mov x9, #(MAX_PB_SIZE * 2)
+ sub x1, x1, x2
+ ld1 {v16.16b, v17.16b}, [x1], x2
+ ld1 {v18.16b, v19.16b}, [x1], x2
+ ld1 {v20.16b, v21.16b}, [x1], x2
+ ld1 {v22.16b, v23.16b}, [x1], x2
+ ld1 {v24.16b, v25.16b}, [x1], x2
+ ld1 {v26.16b, v27.16b}, [x1], x2
+ ld1 {v28.16b, v29.16b}, [x1], x2
+.macro calc tmp0, tmp1, src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11, src12, src13, src14, src15
+ ld1 {\tmp0\().16b, \tmp1\().16b}, [x1], x2
+ movi v8.8h, #0
+ movi v9.8h, #0
+ movi v10.8h, #0
+ movi v11.8h, #0
+ calc_qpelb v8, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+ calc_qpelb2 v9, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+ calc_qpelb v10, \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15
+ calc_qpelb2 v11, \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15
+ subs w3, w3, #1
+ st1 {v8.8h-v11.8h}, [x0], x9
+.endm
+1: calc_all2
+.purgem calc
+2: ld1 {v8.16b-v11.16b}, [sp], #64
+ ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_v48_8_neon, export=1
+ stp x5, x30, [sp, #-16]!
+ stp x0, x1, [sp, #-16]!
+ stp x2, x3, [sp, #-16]!
+ bl X(ff_hevc_put_hevc_qpel_v24_8_neon)
+ ldp x2, x3, [sp], #16
+ ldp x0, x1, [sp], #16
+ ldr x5, [sp]
+ add x0, x0, #48
+ add x1, x1, #24
+ bl X(ff_hevc_put_hevc_qpel_v24_8_neon)
+ ldp xzr, x30, [sp], #16
+ ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_v64_8_neon, export=1
+ sub sp, sp, #64
+ st1 {v8.16b-v11.16b}, [sp]
+ load_qpel_filterb x5, x4
+ sub x1, x1, x2, lsl #1
+ sub x1, x1, x2
+ mov x9, #(MAX_PB_SIZE * 2)
+0: mov x8, x1 // src
+ ld1 {v16.16b, v17.16b}, [x8], x2
+ mov w11, w3 // height
+ ld1 {v18.16b, v19.16b}, [x8], x2
+ mov x10, x0 // dst
+ ld1 {v20.16b, v21.16b}, [x8], x2
+ ld1 {v22.16b, v23.16b}, [x8], x2
+ ld1 {v24.16b, v25.16b}, [x8], x2
+ ld1 {v26.16b, v27.16b}, [x8], x2
+ ld1 {v28.16b, v29.16b}, [x8], x2
+.macro calc tmp0, tmp1, src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11, src12, src13, src14, src15
+ ld1 {\tmp0\().16b, \tmp1\().16b}, [x8], x2
+ movi v8.8h, #0
+ movi v9.8h, #0
+ movi v10.8h, #0
+ movi v11.8h, #0
+ calc_qpelb v8, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+ calc_qpelb2 v9, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+ calc_qpelb v10, \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15
+ calc_qpelb2 v11, \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15
+ subs x11, x11, #1
+ st1 {v8.8h-v11.8h}, [x10], x9
+.endm
+1: calc_all2
+.purgem calc
+2: add x0, x0, #64
+ add x1, x1, #32
+ subs w6, w6, #32
+ b.hi 0b
+ ld1 {v8.16b-v11.16b}, [sp], #64
+ ret
+endfunc
+
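+// The hv functions first run the matching horizontal filter into a
+// temporary buffer on the stack ((height + 7) rows of MAX_PB_SIZE * 2
+// bytes), then filter that buffer vertically with
+// calc_qpelh/calc_qpelh2.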
+function ff_hevc_put_hevc_qpel_hv4_8_neon, export=1
+ add w10, w3, #7
+ mov x7, #128
+ lsl x10, x10, #7
+ sub sp, sp, x10 // tmp_array
+ stp x0, x3, [sp, #-16]!
+ stp x5, x30, [sp, #-16]!
+ add x0, sp, #32
+ sub x1, x1, x2, lsl #1
+ add x3, x3, #7
+ sub x1, x1, x2
+ bl X(ff_hevc_put_hevc_qpel_h4_8_neon)
+ ldp x5, x30, [sp], #16
+ ldp x0, x3, [sp], #16
+ load_qpel_filterh x5, x4
+ ld1 {v16.4h}, [sp], x7
+ ld1 {v17.4h}, [sp], x7
+ ld1 {v18.4h}, [sp], x7
+ ld1 {v19.4h}, [sp], x7
+ ld1 {v20.4h}, [sp], x7
+ ld1 {v21.4h}, [sp], x7
+ ld1 {v22.4h}, [sp], x7
+.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
+ ld1 {\tmp\().4h}, [sp], x7
+ calc_qpelh v1, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqshrn
+ subs w3, w3, #1
+ st1 {v1.4h}, [x0], x7
+.endm
+1: calc_all
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_hv6_8_neon, export=1
+ add w10, w3, #7
+ mov x7, #128
+ lsl x10, x10, #7
+ sub sp, sp, x10 // tmp_array
+ stp x0, x3, [sp, #-16]!
+ stp x5, x30, [sp, #-16]!
+ add x0, sp, #32
+ sub x1, x1, x2, lsl #1
+ add x3, x3, #7
+ sub x1, x1, x2
+ bl X(ff_hevc_put_hevc_qpel_h6_8_neon)
+ ldp x5, x30, [sp], #16
+ mov x8, #120
+ ldp x0, x3, [sp], #16
+ load_qpel_filterh x5, x4
+ ld1 {v16.8h}, [sp], x7
+ ld1 {v17.8h}, [sp], x7
+ ld1 {v18.8h}, [sp], x7
+ ld1 {v19.8h}, [sp], x7
+ ld1 {v20.8h}, [sp], x7
+ ld1 {v21.8h}, [sp], x7
+ ld1 {v22.8h}, [sp], x7
+.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
+ ld1 {\tmp\().8h}, [sp], x7
+ calc_qpelh v1, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqshrn
+ calc_qpelh2 v1, v2, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqshrn2
+ st1 {v1.4h}, [x0], #8
+ subs w3, w3, #1
+ st1 {v1.s}[2], [x0], x8
+.endm
+1: calc_all
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_hv8_8_neon, export=1
+ add w10, w3, #7
+ lsl x10, x10, #7
+ sub x1, x1, x2, lsl #1
+ sub sp, sp, x10 // tmp_array
+ stp x0, x3, [sp, #-16]!
+ stp x5, x30, [sp, #-16]!
+ add x0, sp, #32
+ add x3, x3, #7
+ sub x1, x1, x2
+ bl X(ff_hevc_put_hevc_qpel_h8_8_neon)
+ ldp x5, x30, [sp], #16
+ mov x7, #128
+ ldp x0, x3, [sp], #16
+ load_qpel_filterh x5, x4
+ ld1 {v16.8h}, [sp], x7
+ ld1 {v17.8h}, [sp], x7
+ ld1 {v18.8h}, [sp], x7
+ ld1 {v19.8h}, [sp], x7
+ ld1 {v20.8h}, [sp], x7
+ ld1 {v21.8h}, [sp], x7
+ ld1 {v22.8h}, [sp], x7
+.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
+ ld1 {\tmp\().8h}, [sp], x7
+ calc_qpelh v1, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqshrn
+ calc_qpelh2 v1, v2, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqshrn2
+ subs w3, w3, #1
+ st1 {v1.8h}, [x0], x7
+.endm
+1: calc_all
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_hv12_8_neon, export=1
+ add w10, w3, #7
+ lsl x10, x10, #7
+ sub x1, x1, x2, lsl #1
+ sub sp, sp, x10 // tmp_array
+ stp x0, x3, [sp, #-16]!
+ stp x5, x30, [sp, #-16]!
+ add x0, sp, #32
+ add x3, x3, #7
+ sub x1, x1, x2
+ bl X(ff_hevc_put_hevc_qpel_h12_8_neon)
+ ldp x5, x30, [sp], #16
+ mov x7, #128
+ ldp x0, x3, [sp], #16
+ load_qpel_filterh x5, x4
+ mov x8, #112
+ ld1 {v16.8h, v17.8h}, [sp], x7
+ ld1 {v18.8h, v19.8h}, [sp], x7
+ ld1 {v20.8h, v21.8h}, [sp], x7
+ ld1 {v22.8h, v23.8h}, [sp], x7
+ ld1 {v24.8h, v25.8h}, [sp], x7
+ ld1 {v26.8h, v27.8h}, [sp], x7
+ ld1 {v28.8h, v29.8h}, [sp], x7
+.macro calc tmp0, tmp1, src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11, src12, src13, src14, src15
+ ld1 {\tmp0\().8h, \tmp1\().8h}, [sp], x7
+ calc_qpelh v1, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqshrn
+ calc_qpelh2 v1, v2, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqshrn2
+ calc_qpelh v2, \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15, sqshrn
+ st1 {v1.8h}, [x0], #16
+ subs w3, w3, #1
+ st1 {v2.4h}, [x0], x8
+.endm
+1: calc_all2
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_hv16_8_neon, export=1
+ add w10, w3, #7
+ lsl x10, x10, #7
+ sub x1, x1, x2, lsl #1
+ sub sp, sp, x10 // tmp_array
+ stp x0, x3, [sp, #-16]!
+ stp x5, x30, [sp, #-16]!
+ add x3, x3, #7
+ add x0, sp, #32
+ sub x1, x1, x2
+ bl X(ff_hevc_put_hevc_qpel_h16_8_neon)
+ ldp x5, x30, [sp], #16
+ mov x7, #128
+ ldp x0, x3, [sp], #16
+ load_qpel_filterh x5, x4
+ ld1 {v16.8h, v17.8h}, [sp], x7
+ ld1 {v18.8h, v19.8h}, [sp], x7
+ ld1 {v20.8h, v21.8h}, [sp], x7
+ ld1 {v22.8h, v23.8h}, [sp], x7
+ ld1 {v24.8h, v25.8h}, [sp], x7
+ ld1 {v26.8h, v27.8h}, [sp], x7
+ ld1 {v28.8h, v29.8h}, [sp], x7
+.macro calc tmp0, tmp1, src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11, src12, src13, src14, src15
+ ld1 {\tmp0\().8h, \tmp1\().8h}, [sp], x7
+ calc_qpelh v1, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqshrn
+ calc_qpelh2 v1, v2, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqshrn2
+ calc_qpelh v2, \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15, sqshrn
+ calc_qpelh2 v2, v3, \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15, sqshrn2
+ subs w3, w3, #1
+ st1 {v1.8h, v2.8h}, [x0], x7
+.endm
+1: calc_all2
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_hv24_8_neon, export=1
+ sub sp, sp, #64
+ st1 {v8.16b-v11.16b}, [sp]
+ sub x1, x1, x2, lsl #1
+ sub sp, sp, #64
+ add w10, w3, #7
+ st1 {v12.16b-v15.16b}, [sp]
+ lsl x10, x10, #7
+ sub sp, sp, x10 // tmp_array
+ stp x0, x3, [sp, #-16]!
+ stp x5, x30, [sp, #-16]!
+ add x0, sp, #32
+ add x3, x3, #7
+ sub x1, x1, x2
+ bl X(ff_hevc_put_hevc_qpel_h24_8_neon)
+ ldp x5, x30, [sp], #16
+ mov x7, #128
+ ldp x0, x3, [sp], #16
+ load_qpel_filterh x5, x4
+ ld1 {v8.8h-v10.8h}, [sp], x7
+ ld1 {v11.8h-v13.8h}, [sp], x7
+ ld1 {v14.8h-v16.8h}, [sp], x7
+ ld1 {v17.8h-v19.8h}, [sp], x7
+ ld1 {v20.8h-v22.8h}, [sp], x7
+ ld1 {v23.8h-v25.8h}, [sp], x7
+ ld1 {v26.8h-v28.8h}, [sp], x7
+1: ld1 {v29.8h-v31.8h}, [sp], x7
+ calc_qpelh v1, v8, v11, v14, v17, v20, v23, v26, v29, sqshrn
+ calc_qpelh2 v1, v2, v8, v11, v14, v17, v20, v23, v26, v29, sqshrn2
+ calc_qpelh v2, v9, v12, v15, v18, v21, v24, v27, v30, sqshrn
+ calc_qpelh2 v2, v3, v9, v12, v15, v18, v21, v24, v27, v30, sqshrn2
+ calc_qpelh v3, v10, v13, v16, v19, v22, v25, v28, v31, sqshrn
+ calc_qpelh2 v3, v4, v10, v13, v16, v19, v22, v25, v28, v31, sqshrn2
+ subs w3, w3, #1
+ st1 {v1.8h-v3.8h}, [x0], x7
+ b.eq 2f
+
+ ld1 {v8.8h-v10.8h}, [sp], x7
+ calc_qpelh v1, v11, v14, v17, v20, v23, v26, v29, v8, sqshrn
+ calc_qpelh2 v1, v2, v11, v14, v17, v20, v23, v26, v29, v8, sqshrn2
+ calc_qpelh v2, v12, v15, v18, v21, v24, v27, v30, v9, sqshrn
+ calc_qpelh2 v2, v3, v12, v15, v18, v21, v24, v27, v30, v9, sqshrn2
+ calc_qpelh v3, v13, v16, v19, v22, v25, v28, v31, v10, sqshrn
+ calc_qpelh2 v3, v4, v13, v16, v19, v22, v25, v28, v31, v10, sqshrn2
+ subs w3, w3, #1
+ st1 {v1.8h-v3.8h}, [x0], x7
+ b.eq 2f
+
+ ld1 {v11.8h-v13.8h}, [sp], x7
+ calc_qpelh v1, v14, v17, v20, v23, v26, v29, v8, v11, sqshrn
+ calc_qpelh2 v1, v2, v14, v17, v20, v23, v26, v29, v8, v11, sqshrn2
+ calc_qpelh v2, v15, v18, v21, v24, v27, v30, v9, v12, sqshrn
+ calc_qpelh2 v2, v3, v15, v18, v21, v24, v27, v30, v9, v12, sqshrn2
+ calc_qpelh v3, v16, v19, v22, v25, v28, v31, v10, v13, sqshrn
+ calc_qpelh2 v3, v4, v16, v19, v22, v25, v28, v31, v10, v13, sqshrn2
+ subs w3, w3, #1
+ st1 {v1.8h-v3.8h}, [x0], x7
+ b.eq 2f
+
+ ld1 {v14.8h-v16.8h}, [sp], x7
+ calc_qpelh v1, v17, v20, v23, v26, v29, v8, v11, v14, sqshrn
+ calc_qpelh2 v1, v2, v17, v20, v23, v26, v29, v8, v11, v14, sqshrn2
+ calc_qpelh v2, v18, v21, v24, v27, v30, v9, v12, v15, sqshrn
+ calc_qpelh2 v2, v3, v18, v21, v24, v27, v30, v9, v12, v15, sqshrn2
+ calc_qpelh v3, v19, v22, v25, v28, v31, v10, v13, v16, sqshrn
+ calc_qpelh2 v3, v4, v19, v22, v25, v28, v31, v10, v13, v16, sqshrn2
+ subs w3, w3, #1
+ st1 {v1.8h-v3.8h}, [x0], x7
+ b.eq 2f
+
+ ld1 {v17.8h-v19.8h}, [sp], x7
+ calc_qpelh v1, v20, v23, v26, v29, v8, v11, v14, v17, sqshrn
+ calc_qpelh2 v1, v2, v20, v23, v26, v29, v8, v11, v14, v17, sqshrn2
+ calc_qpelh v2, v21, v24, v27, v30, v9, v12, v15, v18, sqshrn
+ calc_qpelh2 v2, v3, v21, v24, v27, v30, v9, v12, v15, v18, sqshrn2
+ calc_qpelh v3, v22, v25, v28, v31, v10, v13, v16, v19, sqshrn
+ calc_qpelh2 v3, v4, v22, v25, v28, v31, v10, v13, v16, v19, sqshrn2
+ subs w3, w3, #1
+ st1 {v1.8h-v3.8h}, [x0], x7
+ b.eq 2f
+
+ ld1 {v20.8h-v22.8h}, [sp], x7
+ calc_qpelh v1, v23, v26, v29, v8, v11, v14, v17, v20, sqshrn
+ calc_qpelh2 v1, v2, v23, v26, v29, v8, v11, v14, v17, v20, sqshrn2
+ calc_qpelh v2, v24, v27, v30, v9, v12, v15, v18, v21, sqshrn
+ calc_qpelh2 v2, v3, v24, v27, v30, v9, v12, v15, v18, v21, sqshrn2
+ calc_qpelh v3, v25, v28, v31, v10, v13, v16, v19, v22, sqshrn
+ calc_qpelh2 v3, v4, v25, v28, v31, v10, v13, v16, v19, v22, sqshrn2
+ subs w3, w3, #1
+ st1 {v1.8h-v3.8h}, [x0], x7
+ b.eq 2f
+
+ ld1 {v23.8h-v25.8h}, [sp], x7
+ calc_qpelh v1, v26, v29, v8, v11, v14, v17, v20, v23, sqshrn
+ calc_qpelh2 v1, v2, v26, v29, v8, v11, v14, v17, v20, v23, sqshrn2
+ calc_qpelh v2, v27, v30, v9, v12, v15, v18, v21, v24, sqshrn
+ calc_qpelh2 v2, v3, v27, v30, v9, v12, v15, v18, v21, v24, sqshrn2
+ calc_qpelh v3, v28, v31, v10, v13, v16, v19, v22, v25, sqshrn
+ calc_qpelh2 v3, v4, v28, v31, v10, v13, v16, v19, v22, v25, sqshrn2
+ subs w3, w3, #1
+ st1 {v1.8h-v3.8h}, [x0], x7
+ b.eq 2f
+
+ ld1 {v26.8h-v28.8h}, [sp], x7
+ calc_qpelh v1, v29, v8, v11, v14, v17, v20, v23, v26, sqshrn
+ calc_qpelh2 v1, v2, v29, v8, v11, v14, v17, v20, v23, v26, sqshrn2
+ calc_qpelh v2, v30, v9, v12, v15, v18, v21, v24, v27, sqshrn
+ calc_qpelh2 v2, v3, v30, v9, v12, v15, v18, v21, v24, v27, sqshrn2
+ calc_qpelh v3, v31, v10, v13, v16, v19, v22, v25, v28, sqshrn
+ calc_qpelh2 v3, v4, v31, v10, v13, v16, v19, v22, v25, v28, sqshrn2
+ subs w3, w3, #1
+ st1 {v1.8h-v3.8h}, [x0], x7
+ b.hi 1b
+2: ld1 {v12.16b-v15.16b}, [sp], #64
+ ld1 {v8.16b-v11.16b}, [sp], #64
+ ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_hv32_8_neon, export=1
+ add w10, w3, #7
+ sub x1, x1, x2, lsl #1
+ lsl x10, x10, #7
+ sub x1, x1, x2
+ sub sp, sp, x10 // tmp_array
+ stp x0, x3, [sp, #-16]!
+ add x3, x3, #7
+ stp x5, x30, [sp, #-16]!
+ add x0, sp, #32
+ bl X(ff_hevc_put_hevc_qpel_h32_8_neon)
+ ldp x5, x30, [sp], #16
+ mov x7, #128
+ ldp x0, x3, [sp], #16
+ load_qpel_filterh x5, x4
+0: mov x8, sp // src
+ ld1 {v16.8h, v17.8h}, [x8], x7
+ mov w9, w3 // height
+ ld1 {v18.8h, v19.8h}, [x8], x7
+ mov x5, x0 // dst
+ ld1 {v20.8h, v21.8h}, [x8], x7
+ ld1 {v22.8h, v23.8h}, [x8], x7
+ ld1 {v24.8h, v25.8h}, [x8], x7
+ ld1 {v26.8h, v27.8h}, [x8], x7
+ ld1 {v28.8h, v29.8h}, [x8], x7
+.macro calc tmp0, tmp1, src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11, src12, src13, src14, src15
+ ld1 {\tmp0\().8h, \tmp1\().8h}, [x8], x7
+ calc_qpelh v1, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqshrn
+ calc_qpelh2 v1, v2, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqshrn2
+ calc_qpelh v2, \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15, sqshrn
+ calc_qpelh2 v2, v3, \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15, sqshrn2
+ subs x9, x9, #1
+ st1 {v1.8h, v2.8h}, [x5], x7
+.endm
+1: calc_all2
+.purgem calc
+2: add x0, x0, #32
+ add sp, sp, #32
+ subs w6, w6, #16
+ b.hi 0b
+ add w10, w3, #6
+ add sp, sp, #64 // discard rest of first line
+ lsl x10, x10, #7
+ add sp, sp, x10 // tmp_array without first line
+ ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_hv48_8_neon, export=1
+ stp xzr, x30, [sp, #-16]!
+ stp x0, x1, [sp, #-16]!
+ stp x2, x3, [sp, #-16]!
+ stp x4, x5, [sp, #-16]!
+ bl X(ff_hevc_put_hevc_qpel_hv24_8_neon)
+ ldp x4, x5, [sp], #16
+ ldp x2, x3, [sp], #16
+ ldp x0, x1, [sp], #16
+ add x1, x1, #24
+ add x0, x0, #48
+ bl X(ff_hevc_put_hevc_qpel_hv24_8_neon)
+ ldp xzr, x30, [sp], #16
+ ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_hv64_8_neon, export=1
+ stp xzr, x30, [sp, #-16]!
+ stp x0, x1, [sp, #-16]!
+ stp x2, x3, [sp, #-16]!
+ stp x4, x5, [sp, #-16]!
+ mov x6, #32
+ bl X(ff_hevc_put_hevc_qpel_hv32_8_neon)
+ ldp x4, x5, [sp], #16
+ ldp x2, x3, [sp], #16
+ ldp x0, x1, [sp], #16
+ add x1, x1, #32
+ add x0, x0, #64
+ mov x6, #32
+ bl X(ff_hevc_put_hevc_qpel_hv32_8_neon)
+ ldp xzr, x30, [sp], #16
+ ret
+endfunc
+
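+// The uni variants write 8-bit output directly: the same 8-tap filter,
+// followed by a rounding narrow (sqrshrun #6 here, or sqrshrn #12 plus
+// sqxtun after the second pass in uni_hv) and a store with the
+// destination stride in x1.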
+function ff_hevc_put_hevc_qpel_uni_h4_8_neon, export=1
+ load_qpel_filterb x5, x6
+ sub x2, x2, #3
+1: ld1 {v16.8b, v17.8b}, [x2], x3
+ movi v20.8h, #0
+.macro calc op, src
+ \op v20.8h, v16.8b, v\src\().8b
+ ushr v16.2d, v16.2d, #8
+ mov v16.b[7], v17.b[\src]
+.endm
+ calc umlsl, 0
+ calc umlal, 1
+ calc umlsl, 2
+ calc umlal, 3
+ calc umlal, 4
+ calc umlsl, 5
+ calc umlal, 6
+// calc is not purged here; qpel_uni_h6/h8 below reuse it
+ umlsl v20.8h, v16.8b, v7.8b
+ sqrshrun v20.8b, v20.8h, #6
+ subs w4, w4, #1
+ st1 {v20.s}[0], [x0], x1
+ b.ne 1b
+ ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_h6_8_neon, export=1
+ load_qpel_filterb x5, x6
+ sub x2, x2, #3
+ sub x1, x1, #4
+1: ld1 {v16.8b, v17.8b}, [x2], x3
+ movi v20.8h, #0
+// reuses the calc macro defined in qpel_uni_h4 above
+ calc umlsl, 0
+ calc umlal, 1
+ calc umlsl, 2
+ calc umlal, 3
+ calc umlal, 4
+ calc umlsl, 5
+ calc umlal, 6
+ umlsl v20.8h, v16.8b, v7.8b
+ sqrshrun v20.8b, v20.8h, #6
+ st1 {v20.s}[0], [x0], #4
+ subs w4, w4, #1
+ st1 {v20.h}[2], [x0], x1
+ b.ne 1b
+ ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_h8_8_neon, export=1
+ load_qpel_filterb x5, x6
+ sub x2, x2, #3
+1: ld1 {v16.8b, v17.8b}, [x2], x3
+ movi v20.8h, #0
+// reuses the calc macro defined in qpel_uni_h4 above
+ calc umlsl, 0
+ calc umlal, 1
+ calc umlsl, 2
+ calc umlal, 3
+ calc umlal, 4
+ calc umlsl, 5
+ calc umlal, 6
+.purgem calc
+ umlsl v20.8h, v16.8b, v7.8b
+ sqrshrun v20.8b, v20.8h, #6
+ subs w4, w4, #1
+ st1 {v20.8b}, [x0], x1
+ b.ne 1b
+ ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_h12_8_neon, export=1
+ load_qpel_filterb x5, x6
+ sub x2, x2, #3
+ sub x1, x1, #8
+1: ld2 {v16.8b, v17.8b}, [x2]
+ movi v20.8h, #0
+ ldr w12, [x2, #16]
+ movi v21.8h, #0
+.macro calc op1, op2, r0, r1, r2, src0, src1, tail=0
+ \op1 \r0\().8h, \r1\().8b, \src0\().8b
+ \op2 \r0\().8h, \r2\().8b, \src1\().8b
+.if \tail-1
+ ushr \r1\().2d, \r1\().2d, #8
+.endif
+.endm
+ calc umlsl, umlal, v20, v16, v17, v0, v1
+ mov v16.b[7], w12
+ lsr x12, x12, #8
+ calc umlsl, umlal, v21, v17, v16, v0, v1
+ mov v17.b[7], w12
+ lsr x12, x12, #8
+ calc umlsl, umlal, v20, v16, v17, v2, v3
+ mov v16.b[7], w12
+ calc umlsl, umlal, v21, v17, v16, v2, v3
+ calc umlal, umlsl, v20, v16, v17, v4, v5
+ calc umlal, umlsl, v21, v17, v16, v4, v5
+ calc umlal, umlsl, v20, v16, v17, v6, v7
+ calc umlal, umlsl, v21, v17, v16, v6, v7, 1
+.purgem calc
+ zip1 v16.8h, v20.8h, v21.8h
+ zip2 v17.8h, v20.8h, v21.8h
+ sqrshrun v20.8b, v16.8h, #6
+ sqrshrun2 v20.16b, v17.8h, #6
+ st1 {v20.8b}, [x0], #8
+ add x2, x2, x3
+ st1 {v20.s}[2], [x0], x1
+ subs w4, w4, #1
+ b.ne 1b
+ ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_h16_8_neon, export=1
+ load_qpel_filterb x5, x6
+ sub x2, x2, #3
+1: ld2 {v16.8b, v17.8b}, [x2]
+ ldr x12, [x2, #16]
+ movi v20.8h, #0
+ movi v21.8h, #0
+.macro calc op1, op2, dst, r0, r1, src0, src1, tail=0
+ \op1 \dst\().8h, \r0\().8b, \src0\().8b
+ \op2 \dst\().8h, \r1\().8b, \src1\().8b
+ ushr \r0\().2d, \r0\().2d, #8
+ mov \r0\().b[7], w12
+.if \tail-1
+ lsr x12, x12, #8
+.endif
+.endm
+ calc umlsl, umlal, v20, v16, v17, v0, v1
+ calc umlsl, umlal, v21, v17, v16, v0, v1
+ calc umlsl, umlal, v20, v16, v17, v2, v3
+ calc umlsl, umlal, v21, v17, v16, v2, v3
+ calc umlal, umlsl, v20, v16, v17, v4, v5
+ calc umlal, umlsl, v21, v17, v16, v4, v5
+ calc umlal, umlsl, v20, v16, v17, v6, v7, 1
+.purgem calc
+ umlal v21.8h, v17.8b, v6.8b
+ umlsl v21.8h, v16.8b, v7.8b
+ sqrshrun v20.8b, v20.8h, #6
+ sqrshrun v21.8b, v21.8h, #6
+ st2 {v20.8b, v21.8b}, [x0], x1
+ add x2, x2, x3
+ subs w4, w4, #1
+ b.ne 1b
+ ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_h24_8_neon, export=1
+ load_qpel_filterb x5, x6
+ sub x2, x2, #3
+1: ld3 {v16.8b-v18.8b}, [x2]
+ ldr x12, [x2, #24]
+ movi v20.8h, #0
+ movi v21.8h, #0
+ movi v22.8h, #0
+.macro calc op1, op2, dst, r0, r1, r2, src0, src1, src2
+ \op1 \dst\().8h, \r0\().8b, \src0\().8b
+ \op2 \dst\().8h, \r1\().8b, \src1\().8b
+ umlsl \dst\().8h, \r2\().8b, \src2\().8b
+ ushr \r0\().2d, \r0\().2d, #8
+ mov \r0\().b[7], w12
+ lsr x12, x12, #8
+.endm
+ calc umlsl, umlal, v20, v16, v17, v18, v0, v1, v2
+ calc umlsl, umlal, v21, v17, v18, v16, v0, v1, v2
+ calc umlsl, umlal, v22, v18, v16, v17, v0, v1, v2
+ calc umlal, umlal, v20, v16, v17, v18, v3, v4, v5
+ calc umlal, umlal, v21, v17, v18, v16, v3, v4, v5
+ calc umlal, umlal, v22, v18, v16, v17, v3, v4, v5
+.purgem calc
+ umlal v20.8h, v16.8b, v6.8b
+ umlsl v20.8h, v17.8b, v7.8b
+ ushr v16.2d, v16.2d, #8
+ mov v16.b[7], w12
+ umlal v21.8h, v17.8b, v6.8b
+ umlsl v21.8h, v18.8b, v7.8b
+ umlal v22.8h, v18.8b, v6.8b
+ umlsl v22.8h, v16.8b, v7.8b
+ sqrshrun v20.8b, v20.8h, #6
+ sqrshrun v22.8b, v22.8h, #6
+ sqrshrun v21.8b, v21.8h, #6
+ st3 {v20.8b-v22.8b}, [x0], x1
+ add x2, x2, x3
+ subs w4, w4, #1
+ b.ne 1b
+ ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_h32_8_neon, export=1
+ load_qpel_filterb x5, x6
+ sub x2, x2, #3
+1: ld4 {v16.8b-v19.8b}, [x2]
+ ldr x12, [x2, #32]
+ movi v20.8h, #0
+ movi v21.8h, #0
+ movi v22.8h, #0
+ movi v23.8h, #0
+.macro calc op1, op2, dst, r0, r1, r2, r3, src0, src1, src2, src3, tail=0
+ \op1 \dst\().8h, \r0\().8b, \src0\().8b
+ \op2 \dst\().8h, \r1\().8b, \src1\().8b
+ \op1 \dst\().8h, \r2\().8b, \src2\().8b
+ \op2 \dst\().8h, \r3\().8b, \src3\().8b
+ ushr \r0\().2d, \r0\().2d, #8
+ mov \r0\().b[7], w12
+.if \tail-1
+ lsr x12, x12, #8
+.endif
+.endm
+ calc umlsl, umlal, v20, v16, v17, v18, v19, v0, v1, v2, v3
+ calc umlsl, umlal, v21, v17, v18, v19, v16, v0, v1, v2, v3
+ calc umlsl, umlal, v22, v18, v19, v16, v17, v0, v1, v2, v3
+ calc umlsl, umlal, v23, v19, v16, v17, v18, v0, v1, v2, v3
+ calc umlal, umlsl, v20, v16, v17, v18, v19, v4, v5, v6, v7
+ calc umlal, umlsl, v21, v17, v18, v19, v16, v4, v5, v6, v7
+ calc umlal, umlsl, v22, v18, v19, v16, v17, v4, v5, v6, v7, 1
+.purgem calc
+ umlal v23.8h, v19.8b, v4.8b
+ sqrshrun v20.8b, v20.8h, #6
+ umlsl v23.8h, v16.8b, v5.8b
+ sqrshrun v21.8b, v21.8h, #6
+ umlal v23.8h, v17.8b, v6.8b
+ sqrshrun v22.8b, v22.8h, #6
+ umlsl v23.8h, v18.8b, v7.8b
+ sqrshrun v23.8b, v23.8h, #6
+ st4 {v20.8b-v23.8b}, [x0], x1
+ add x2, x2, x3
+ subs w4, w4, #1
+ b.ne 1b
+ ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_h48_8_neon, export=1
+ load_qpel_filterb x5, x6
+ sub x2, x2, #3
+1: ld3 {v16.16b-v18.16b}, [x2]
+ movi v20.8h, #0
+ movi v21.8h, #0
+ movi v22.8h, #0
+ ldr x12, [x2, #24]
+ movi v23.8h, #0
+ movi v24.8h, #0
+ movi v25.8h, #0
+ ldr x13, [x2, #48]
+.macro calc op1, op2, dst0, dst1, r0, r1, r2, src0, src1, src2
+ \op1 \dst0\().8h, \r0\().8b, \src0\().8b
+ \op2 \dst0\().8h, \r1\().8b, \src1\().8b
+ umlsl \dst0\().8h, \r2\().8b, \src2\().8b
+ \op1\()2 \dst1\().8h, \r0\().16b, \src0\().16b
+ \op2\()2 \dst1\().8h, \r1\().16b, \src1\().16b
+ umlsl2 \dst1\().8h, \r2\().16b, \src2\().16b
+ ushr \r0\().2d, \r0\().2d, #8
+ mov \r0\().b[7], w12
+ mov \r0\().b[15], w13
+ lsr x12, x12, #8
+ lsr x13, x13, #8
+.endm
+ calc umlsl, umlal, v20, v23, v16, v17, v18, v0, v1, v2
+ calc umlsl, umlal, v21, v24, v17, v18, v16, v0, v1, v2
+ calc umlsl, umlal, v22, v25, v18, v16, v17, v0, v1, v2
+ calc umlal, umlal, v20, v23, v16, v17, v18, v3, v4, v5
+ calc umlal, umlal, v21, v24, v17, v18, v16, v3, v4, v5
+ calc umlal, umlal, v22, v25, v18, v16, v17, v3, v4, v5
+.purgem calc
+.macro calc r0, r1, r2, r3
+ umlal \r0\().8h, \r2\().8b, v6.8b
+ umlsl \r0\().8h, \r3\().8b, v7.8b
+ umlal2 \r1\().8h, \r2\().16b, v6.16b
+ umlsl2 \r1\().8h, \r3\().16b, v7.16b
+.endm
+ calc v20, v23, v16, v17
+ ushr v16.2d, v16.2d, #8
+ mov v16.b[7], w12
+ mov v16.b[15], w13
+ calc v21, v24, v17, v18
+ calc v22, v25, v18, v16
+.purgem calc
+ sqrshrun v20.8b, v20.8h, #6
+ sqrshrun v21.8b, v21.8h, #6
+ sqrshrun v22.8b, v22.8h, #6
+ sqrshrun2 v20.16b, v23.8h, #6
+ sqrshrun2 v21.16b, v24.8h, #6
+ sqrshrun2 v22.16b, v25.8h, #6
+ st3 {v20.16b-v22.16b}, [x0], x1
+ add x2, x2, x3
+ subs w4, w4, #1
+ b.ne 1b
+ ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_h64_8_neon, export=1
+ load_qpel_filterb x5, x6
+ sub x2, x2, #3
+1: ld4 {v16.16b-v19.16b}, [x2]
+ ldr x12, [x2, #32]
+ ldr x13, [x2, #64]
+ movi v20.8h, #0
+ movi v21.8h, #0
+ movi v22.8h, #0
+ movi v23.8h, #0
+ movi v24.8h, #0
+ movi v25.8h, #0
+ movi v26.8h, #0
+ movi v27.8h, #0
+.macro calc op1, op2, dst0, dst1, r0, r1, r2, r3, src0, src1, src2, src3, tail=0
+ \op1 \dst0\().8h, \r0\().8b, \src0\().8b
+ \op2 \dst0\().8h, \r1\().8b, \src1\().8b
+ \op1 \dst0\().8h, \r2\().8b, \src2\().8b
+ \op2 \dst0\().8h, \r3\().8b, \src3\().8b
+ \op1\()2 \dst1\().8h, \r0\().16b, \src0\().16b
+ \op2\()2 \dst1\().8h, \r1\().16b, \src1\().16b
+ \op1\()2 \dst1\().8h, \r2\().16b, \src2\().16b
+ \op2\()2 \dst1\().8h, \r3\().16b, \src3\().16b
+.if \tail-1
+ ushr \r0\().2d, \r0\().2d, #8
+ mov \r0\().b[7], w12
+ mov \r0\().b[15], w13
+ lsr x12, x12, #8
+ lsr x13, x13, #8
+.endif
+.endm
+ calc umlsl, umlal, v20, v24, v16, v17, v18, v19, v0, v1, v2, v3
+ calc umlsl, umlal, v21, v25, v17, v18, v19, v16, v0, v1, v2, v3
+ calc umlsl, umlal, v22, v26, v18, v19, v16, v17, v0, v1, v2, v3
+ calc umlsl, umlal, v23, v27, v19, v16, v17, v18, v0, v1, v2, v3
+ calc umlal, umlsl, v20, v24, v16, v17, v18, v19, v4, v5, v6, v7
+ calc umlal, umlsl, v21, v25, v17, v18, v19, v16, v4, v5, v6, v7
+ calc umlal, umlsl, v22, v26, v18, v19, v16, v17, v4, v5, v6, v7
+ calc umlal, umlsl, v23, v27, v19, v16, v17, v18, v4, v5, v6, v7, 1
+.purgem calc
+ sqrshrun v20.8b, v20.8h, #6
+ sqrshrun v21.8b, v21.8h, #6
+ sqrshrun v22.8b, v22.8h, #6
+ sqrshrun v23.8b, v23.8h, #6
+ sqrshrun2 v20.16b, v24.8h, #6
+ sqrshrun2 v21.16b, v25.8h, #6
+ sqrshrun2 v22.16b, v26.8h, #6
+ sqrshrun2 v23.16b, v27.8h, #6
+ st4 {v20.16b-v23.16b}, [x0], x1
+ add x2, x2, x3
+ subs w4, w4, #1
+ b.ne 1b
+ ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_v4_8_neon, export=1
+ load_qpel_filterb x6, x5
+ sub x2, x2, x3, lsl #1
+ sub x2, x2, x3
+ ld1 {v16.s}[0], [x2], x3
+ ld1 {v17.s}[0], [x2], x3
+ ld1 {v18.s}[0], [x2], x3
+ ld1 {v19.s}[0], [x2], x3
+ ld1 {v20.s}[0], [x2], x3
+ ld1 {v21.s}[0], [x2], x3
+ ld1 {v22.s}[0], [x2], x3
+.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
+ ld1 {\tmp\().s}[0], [x2], x3
+ movi v24.8h, #0
+ calc_qpelb v24, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+ sqrshrun v24.8b, v24.8h, #6
+ subs w4, w4, #1
+ st1 {v24.s}[0], [x0], x1
+.endm
+1: calc_all
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_v6_8_neon, export=1
+ load_qpel_filterb x6, x5
+ sub x2, x2, x3, lsl #1
+ sub x1, x1, #4
+ sub x2, x2, x3
+ ld1 {v16.8b}, [x2], x3
+ ld1 {v17.8b}, [x2], x3
+ ld1 {v18.8b}, [x2], x3
+ ld1 {v19.8b}, [x2], x3
+ ld1 {v20.8b}, [x2], x3
+ ld1 {v21.8b}, [x2], x3
+ ld1 {v22.8b}, [x2], x3
+.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
+ ld1 {\tmp\().8b}, [x2], x3
+ movi v24.8h, #0
+ calc_qpelb v24, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+ sqrshrun v24.8b, v24.8h, #6
+ st1 {v24.s}[0], [x0], #4
+ subs w4, w4, #1
+ st1 {v24.h}[2], [x0], x1
+.endm
+1: calc_all
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_v8_8_neon, export=1
+ load_qpel_filterb x6, x5
+ sub x2, x2, x3, lsl #1
+ sub x2, x2, x3
+ ld1 {v16.8b}, [x2], x3
+ ld1 {v17.8b}, [x2], x3
+ ld1 {v18.8b}, [x2], x3
+ ld1 {v19.8b}, [x2], x3
+ ld1 {v20.8b}, [x2], x3
+ ld1 {v21.8b}, [x2], x3
+ ld1 {v22.8b}, [x2], x3
+.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
+ ld1 {\tmp\().8b}, [x2], x3
+ movi v24.8h, #0
+ calc_qpelb v24, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+ sqrshrun v24.8b, v24.8h, #6
+ subs w4, w4, #1
+ st1 {v24.8b}, [x0], x1
+.endm
+1: calc_all
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_v12_8_neon, export=1
+ load_qpel_filterb x6, x5
+ sub x2, x2, x3, lsl #1
+ sub x1, x1, #8
+ sub x2, x2, x3
+0: mov x8, x2 // src
+ ld1 {v16.16b}, [x8], x3
+ mov w11, w4 // height
+ ld1 {v17.16b}, [x8], x3
+ mov x10, x0 // dst
+ ld1 {v18.16b}, [x8], x3
+ ld1 {v19.16b}, [x8], x3
+ ld1 {v20.16b}, [x8], x3
+ ld1 {v21.16b}, [x8], x3
+ ld1 {v22.16b}, [x8], x3
+.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
+ ld1 {\tmp\().16b}, [x8], x3
+ movi v24.8h, #0
+ movi v25.8h, #0
+ calc_qpelb v24, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+ calc_qpelb2 v25, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+ sqrshrun v24.8b, v24.8h, #6
+ sqrshrun2 v24.16b, v25.8h, #6
+ st1 {v24.8b}, [x10], #8
+ subs x11, x11, #1
+ st1 {v24.s}[2], [x10], x1
+.endm
+1: calc_all
+.purgem calc
+2: add x0, x0, #12
+ add x2, x2, #12
+ subs w7, w7, #12
+ b.ne 0b
+ ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_v16_8_neon, export=1
+ load_qpel_filterb x6, x5
+ sub x2, x2, x3, lsl #1
+ sub x2, x2, x3
+0: mov x8, x2 // src
+ ld1 {v16.16b}, [x8], x3
+ mov w11, w4 // height
+ ld1 {v17.16b}, [x8], x3
+ mov x10, x0 // dst
+ ld1 {v18.16b}, [x8], x3
+ ld1 {v19.16b}, [x8], x3
+ ld1 {v20.16b}, [x8], x3
+ ld1 {v21.16b}, [x8], x3
+ ld1 {v22.16b}, [x8], x3
+.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
+ ld1 {\tmp\().16b}, [x8], x3
+ movi v24.8h, #0
+ movi v25.8h, #0
+ calc_qpelb v24, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+ calc_qpelb2 v25, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+ sqrshrun v24.8b, v24.8h, #6
+ sqrshrun2 v24.16b, v25.8h, #6
+ subs x11, x11, #1
+ st1 {v24.16b}, [x10], x1
+.endm
+1: calc_all
+.purgem calc
+2: add x0, x0, #16
+ add x2, x2, #16
+ subs w7, w7, #16
+ b.ne 0b
+ ret
+endfunc
+
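+// The remaining uni_v widths tail-call uni_v12/uni_v16, which already
+// loop over the width in w7 in steps of 12 or 16 columns.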
+function ff_hevc_put_hevc_qpel_uni_v24_8_neon, export=1
+ b X(ff_hevc_put_hevc_qpel_uni_v12_8_neon)
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_v32_8_neon, export=1
+ b X(ff_hevc_put_hevc_qpel_uni_v16_8_neon)
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_v48_8_neon, export=1
+ b X(ff_hevc_put_hevc_qpel_uni_v16_8_neon)
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_v64_8_neon, export=1
+ b X(ff_hevc_put_hevc_qpel_uni_v16_8_neon)
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_hv4_8_neon, export=1
+ add w10, w4, #7
+ lsl x10, x10, #7
+ sub sp, sp, x10 // tmp_array
+ stp x0, x1, [sp, #-16]!
+ stp x4, x6, [sp, #-16]!
+ sub x1, x2, x3, lsl #1
+ stp x30, xzr, [sp, #-16]!
+ sub x1, x1, x3
+ add x0, sp, #48
+ mov x2, x3
+ add x3, x4, #7
+ mov x4, x5
+ bl X(ff_hevc_put_hevc_qpel_h4_8_neon)
+ ldp x30, xzr, [sp], #16
+ ldp x4, x6, [sp], #16
+ mov x9, #(MAX_PB_SIZE * 2)
+ ldp x0, x1, [sp], #16
+ load_qpel_filterh x6, x5
+ ld1 {v16.4h}, [sp], x9
+ ld1 {v17.4h}, [sp], x9
+ ld1 {v18.4h}, [sp], x9
+ ld1 {v19.4h}, [sp], x9
+ ld1 {v20.4h}, [sp], x9
+ ld1 {v21.4h}, [sp], x9
+ ld1 {v22.4h}, [sp], x9
+.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
+ ld1 {\tmp\().4h}, [sp], x9
+ calc_qpelh v1, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqrshrn, #12
+ sqxtun v1.8b, v1.8h
+ subs w4, w4, #1
+ st1 {v1.s}[0], [x0], x1
+.endm
+1: calc_all
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_hv6_8_neon, export=1
+ add w10, w4, #7
+ lsl x10, x10, #7
+ sub sp, sp, x10 // tmp_array
+ stp x0, x1, [sp, #-16]!
+ stp x4, x6, [sp, #-16]!
+ sub x1, x2, x3, lsl #1
+ stp x30, xzr, [sp, #-16]!
+ sub x1, x1, x3
+ add x0, sp, #48
+ mov x2, x3
+ add w3, w4, #7
+ mov x4, x5
+ bl X(ff_hevc_put_hevc_qpel_h6_8_neon)
+ ldp x30, xzr, [sp], #16
+ ldp x4, x6, [sp], #16
+ mov x9, #(MAX_PB_SIZE * 2)
+ ldp x0, x1, [sp], #16
+ load_qpel_filterh x6, x5
+ sub x1, x1, #4
+ ld1 {v16.8h}, [sp], x9
+ ld1 {v17.8h}, [sp], x9
+ ld1 {v18.8h}, [sp], x9
+ ld1 {v19.8h}, [sp], x9
+ ld1 {v20.8h}, [sp], x9
+ ld1 {v21.8h}, [sp], x9
+ ld1 {v22.8h}, [sp], x9
+.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
+ ld1 {\tmp\().8h}, [sp], x9
+ calc_qpelh v1, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqrshrn, #12
+ calc_qpelh2 v1, v2, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqrshrn2, #12
+ sqxtun v1.8b, v1.8h
+ st1 {v1.s}[0], [x0], #4
+ subs w4, w4, #1
+ st1 {v1.h}[2], [x0], x1
+.endm
+1: calc_all
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_hv8_8_neon, export=1
+ add w10, w4, #7
+ lsl x10, x10, #7
+ sub sp, sp, x10 // tmp_array
+ stp x0, x1, [sp, #-16]!
+ stp x4, x6, [sp, #-16]!
+ sub x1, x2, x3, lsl #1
+ stp x30, xzr, [sp, #-16]!
+ sub x1, x1, x3
+ add x0, sp, #48
+ mov x2, x3
+ add w3, w4, #7
+ mov x4, x5
+ bl X(ff_hevc_put_hevc_qpel_h8_8_neon)
+ ldp x30, xzr, [sp], #16
+ ldp x4, x6, [sp], #16
+ mov x9, #(MAX_PB_SIZE * 2)
+ ldp x0, x1, [sp], #16
+ load_qpel_filterh x6, x5
+ ld1 {v16.8h}, [sp], x9
+ ld1 {v17.8h}, [sp], x9
+ ld1 {v18.8h}, [sp], x9
+ ld1 {v19.8h}, [sp], x9
+ ld1 {v20.8h}, [sp], x9
+ ld1 {v21.8h}, [sp], x9
+ ld1 {v22.8h}, [sp], x9
+.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
+ ld1 {\tmp\().8h}, [sp], x9
+ calc_qpelh v1, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqrshrn, #12
+ calc_qpelh2 v1, v2, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqrshrn2, #12
+ sqxtun v1.8b, v1.8h
+ subs w4, w4, #1
+ st1 {v1.8b}, [x0], x1
+.endm
+1: calc_all
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_hv12_8_neon, export=1
+ add w10, w4, #7
+ lsl x10, x10, #7
+ sub sp, sp, x10 // tmp_array
+ stp x0, x1, [sp, #-16]!
+ stp x4, x6, [sp, #-16]!
+ sub x1, x2, x3, lsl #1
+ stp x7, x30, [sp, #-16]!
+ sub x1, x1, x3
+ mov x2, x3
+ add x0, sp, #48
+ add w3, w4, #7
+ mov x4, x5
+ bl X(ff_hevc_put_hevc_qpel_h12_8_neon)
+ ldp x7, x30, [sp], #16
+ ldp x4, x6, [sp], #16
+ mov x9, #(MAX_PB_SIZE * 2)
+ ldp x0, x1, [sp], #16
+ load_qpel_filterh x6, x5
+ sub x1, x1, #8
+ ld1 {v16.8h, v17.8h}, [sp], x9
+ ld1 {v18.8h, v19.8h}, [sp], x9
+ ld1 {v20.8h, v21.8h}, [sp], x9
+ ld1 {v22.8h, v23.8h}, [sp], x9
+ ld1 {v24.8h, v25.8h}, [sp], x9
+ ld1 {v26.8h, v27.8h}, [sp], x9
+ ld1 {v28.8h, v29.8h}, [sp], x9
+.macro calc tmp0, tmp1, src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11, src12, src13, src14, src15
+ ld1 {\tmp0\().8h, \tmp1\().8h}, [sp], x9
+ calc_qpelh v1, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqrshrn, #12
+ calc_qpelh2 v1, v2, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqrshrn2, #12
+ calc_qpelh v2, \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15, sqrshrn, #12
+ sqxtun v1.8b, v1.8h
+ sqxtun2 v1.16b, v2.8h
+ st1 {v1.8b}, [x0], #8
+ subs w4, w4, #1
+ st1 {v1.s}[2], [x0], x1
+.endm
+1: calc_all2
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_hv16_8_neon, export=1
+ add w10, w4, #7
+ lsl x10, x10, #7
+ sub sp, sp, x10 // tmp_array
+ stp x0, x1, [sp, #-16]!
+ stp x4, x6, [sp, #-16]!
+ stp x7, x30, [sp, #-16]!
+ add x0, sp, #48
+ sub x1, x2, x3, lsl #1
+ sub x1, x1, x3
+ mov x2, x3
+ add w3, w4, #7
+ mov x4, x5
+ bl X(ff_hevc_put_hevc_qpel_h16_8_neon)
+ ldp x7, x30, [sp], #16
+ ldp x4, x6, [sp], #16
+ ldp x0, x1, [sp], #16
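+// Shared vertical-pass loop: qpel_uni_hv32/48/64 branch here after
+// running their horizontal pass into the stack buffer.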
+.Lqpel_uni_hv16_loop:
+ mov x9, #(MAX_PB_SIZE * 2)
+ load_qpel_filterh x6, x5
+ sub w12, w9, w7, lsl #1
+0: mov x8, sp // src
+ ld1 {v16.8h, v17.8h}, [x8], x9
+ mov w11, w4 // height
+ ld1 {v18.8h, v19.8h}, [x8], x9
+ mov x10, x0 // dst
+ ld1 {v20.8h, v21.8h}, [x8], x9
+ ld1 {v22.8h, v23.8h}, [x8], x9
+ ld1 {v24.8h, v25.8h}, [x8], x9
+ ld1 {v26.8h, v27.8h}, [x8], x9
+ ld1 {v28.8h, v29.8h}, [x8], x9
+.macro calc tmp0, tmp1, src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11, src12, src13, src14, src15
+ ld1 {\tmp0\().8h, \tmp1\().8h}, [x8], x9
+ calc_qpelh v1, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqrshrn, #12
+ calc_qpelh2 v1, v2, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqrshrn2, #12
+ calc_qpelh v2, \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15, sqrshrn, #12
+ calc_qpelh2 v2, v3, \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15, sqrshrn2, #12
+ sqxtun v1.8b, v1.8h
+ subs x11, x11, #1
+ sqxtun2 v1.16b, v2.8h
+ st1 {v1.16b}, [x10], x1
+.endm
+1: calc_all2
+.purgem calc
+2: add x0, x0, #16
+ add sp, sp, #32
+ subs w7, w7, #16
+ b.ne 0b
+ add w10, w4, #6
+ add sp, sp, x12 // discard rest of first line
+ lsl x10, x10, #7
+ add sp, sp, x10 // tmp_array without first line
+ ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_hv24_8_neon, export=1
+ stp x6, x30, [sp, #-16]!
+ mov x7, #16
+ stp x0, x1, [sp, #-16]!
+ stp x2, x3, [sp, #-16]!
+ stp x4, x5, [sp, #-16]!
+ bl X(ff_hevc_put_hevc_qpel_uni_hv16_8_neon)
+ ldp x4, x5, [sp], #16
+ ldp x2, x3, [sp], #16
+ add x2, x2, #16
+ ldp x0, x1, [sp], #16
+ mov x7, #8
+ add x0, x0, #16
+ ldr x6, [sp]
+ bl X(ff_hevc_put_hevc_qpel_uni_hv8_8_neon)
+ ldp xzr, x30, [sp], #16
+ ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_hv32_8_neon, export=1
+ add w10, w4, #7
+ lsl x10, x10, #7
+ sub sp, sp, x10 // tmp_array
+ stp x0, x1, [sp, #-16]!
+ stp x4, x6, [sp, #-16]!
+ stp x7, x30, [sp, #-16]!
+ sub x1, x2, x3, lsl #1
+ add x0, sp, #48
+ sub x1, x1, x3
+ mov x2, x3
+ add w3, w4, #7
+ mov x4, x5
+ bl X(ff_hevc_put_hevc_qpel_h32_8_neon)
+ ldp x7, x30, [sp], #16
+ ldp x4, x6, [sp], #16
+ ldp x0, x1, [sp], #16
+ b .Lqpel_uni_hv16_loop
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_hv48_8_neon, export=1
+ add w10, w4, #7
+ lsl x10, x10, #7
+ sub sp, sp, x10 // tmp_array
+ stp x0, x1, [sp, #-16]!
+ stp x4, x6, [sp, #-16]!
+ sub x1, x2, x3, lsl #1
+ stp x7, x30, [sp, #-16]!
+ sub x1, x1, x3
+ mov x2, x3
+ add x0, sp, #48
+ add w3, w4, #7
+ mov x4, x5
+ bl X(ff_hevc_put_hevc_qpel_h48_8_neon)
+ ldp x7, x30, [sp], #16
+ ldp x4, x6, [sp], #16
+ ldp x0, x1, [sp], #16
+ b .Lqpel_uni_hv16_loop
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_hv64_8_neon, export=1
+ add w10, w4, #7
+ lsl x10, x10, #7
+ sub sp, sp, x10 // tmp_array
+ stp x0, x1, [sp, #-16]!
+ stp x4, x6, [sp, #-16]!
+ stp x7, x30, [sp, #-16]!
+ add x0, sp, #48
+ sub x1, x2, x3, lsl #1
+ mov x2, x3
+ sub x1, x1, x3
+ add w3, w4, #7
+ mov x4, x5
+ bl X(ff_hevc_put_hevc_qpel_h64_8_neon)
+ ldp x7, x30, [sp], #16
+ ldp x4, x6, [sp], #16
+ ldp x0, x1, [sp], #16
+ b .Lqpel_uni_hv16_loop
+endfunc
+
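+// bi_h: 8-tap horizontal filter on the 8-bit source, saturating add of the
+// 16-bit src2 block (stride MAX_PB_SIZE * 2), then sqrshrun #7 down to 8 bit.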
+function ff_hevc_put_hevc_qpel_bi_h4_8_neon, export=1
+ load_qpel_filterb x6, x7
+ sub x2, x2, #3
+ mov x10, #(MAX_PB_SIZE * 2)
+1: ld1 {v16.8b, v17.8b}, [x2], x3
+ movi v20.8h, #0
+.macro calc op, idx
+ \op v20.8h, v16.8b, v\idx\().8b
+ ushr v16.2d, v16.2d, #8
+ mov v16.b[7], v17.b[\idx]
+.endm
+ calc umlsl, 0
+ calc umlal, 1
+ calc umlsl, 2
+ calc umlal, 3
+ calc umlal, 4
+ calc umlsl, 5
+ calc umlal, 6
+ umlsl v20.8h, v16.8b, v7.8b
+ ld1 {v24.8h}, [x4], x10
+ sqadd v16.8h, v20.8h, v24.8h
+ sqrshrun v16.8b, v16.8h, #7
+ subs w5, w5, #1
+ st1 {v16.s}[0], [x0], x1
+ b.ne 1b
+ ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_bi_h6_8_neon, export=1
+ load_qpel_filterb x6, x7
+ sub x2, x2, #3
+ sub x1, x1, #4
+ mov x10, #(MAX_PB_SIZE * 2)
+1: ld1 {v16.8b, v17.8b}, [x2], x3
+ movi v20.8h, #0
+ // same macro
+ calc umlsl, 0
+ calc umlal, 1
+ calc umlsl, 2
+ calc umlal, 3
+ calc umlal, 4
+ calc umlsl, 5
+ calc umlal, 6
+ umlsl v20.8h, v16.8b, v7.8b
+ ld1 {v24.8h}, [x4], x10
+ sqadd v16.8h, v20.8h, v24.8h
+ sqrshrun v16.8b, v16.8h, #7
+ st1 {v16.s}[0], [x0], #4
+ subs w5, w5, #1
+ st1 {v16.h}[2], [x0], x1
+ b.ne 1b
+ ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_bi_h8_8_neon, export=1
+ load_qpel_filterb x6, x7
+ sub x2, x2, #3
+ mov x10, #(MAX_PB_SIZE * 2)
+1: ld1 {v16.8b, v17.8b}, [x2], x3
+ movi v20.8h, #0
+ // same macro
+ calc umlsl, 0
+ calc umlal, 1
+ calc umlsl, 2
+ calc umlal, 3
+ calc umlal, 4
+ calc umlsl, 5
+ calc umlal, 6
+ umlsl v20.8h, v16.8b, v7.8b
+.purgem calc
+ ld1 {v24.8h}, [x4], x10
+ sqadd v16.8h, v20.8h, v24.8h
+ sqrshrun v16.8b, v16.8h, #7
+ subs w5, w5, #1
+ st1 {v16.8b}, [x0], x1
+ b.ne 1b
+ ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_bi_h12_8_neon, export=1
+ load_qpel_filterb x6, x7
+ sub x2, x2, #3
+ sub x1, x1, #8
+ mov x10, #(MAX_PB_SIZE * 2)
+1: ld2 {v16.8b, v17.8b}, [x2]
+ movi v20.8h, #0
+ ldr w12, [x2, #16]
+ movi v21.8h, #0
+.macro calc op1, op2, r0, r1, r2, src0, src1, tail=0
+ \op1 \r0\().8h, \r1\().8b, \src0\().8b
+ \op2 \r0\().8h, \r2\().8b, \src1\().8b
+.if \tail-1
+ ushr \r1\().2d, \r1\().2d, #8
+.endif
+.endm
+ calc umlsl, umlal, v20, v16, v17, v0, v1
+ mov v16.b[7], w12
+ lsr x12, x12, #8
+ calc umlsl, umlal, v21, v17, v16, v0, v1
+ mov v17.b[7], w12
+ lsr x12, x12, #8
+ calc umlsl, umlal, v20, v16, v17, v2, v3
+ mov v16.b[7], w12
+ calc umlsl, umlal, v21, v17, v16, v2, v3
+ calc umlal, umlsl, v20, v16, v17, v4, v5
+ calc umlal, umlsl, v21, v17, v16, v4, v5
+ calc umlal, umlsl, v20, v16, v17, v6, v7
+ calc umlal, umlsl, v21, v17, v16, v6, v7, 1
+.purgem calc
+ ld2 {v24.8h, v25.8h}, [x4], x10
+ sqadd v16.8h, v20.8h, v24.8h
+ sqadd v17.8h, v21.8h, v25.8h
+ sqrshrun v16.8b, v16.8h, #7
+ sqrshrun v17.8b, v17.8h, #7
+ zip1 v16.16b, v16.16b, v17.16b
+ st1 {v16.8b}, [x0], #8
+ subs w5, w5, #1
+ st1 {v16.s}[2], [x0], x1
+ add x2, x2, x3
+ b.ne 1b
+ ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_bi_h16_8_neon, export=1
+ load_qpel_filterb x6, x7
+ sub x2, x2, #3
+ mov x10, #(MAX_PB_SIZE * 2)
+1: ld2 {v16.8b, v17.8b}, [x2]
+ movi v20.8h, #0
+ ldr x12, [x2, #16]
+ movi v21.8h, #0
+.macro calc op1, op2, r0, r1, r2, src0, src1, tail=0
+ \op1 \r0\().8h, \r1\().8b, \src0\().8b
+ \op2 \r0\().8h, \r2\().8b, \src1\().8b
+ ushr \r1\().2d, \r1\().2d, #8
+ mov \r1\().b[7], w12
+.if \tail-1
+ lsr x12, x12, #8
+.endif
+.endm
+ calc umlsl, umlal, v20, v16, v17, v0, v1
+ calc umlsl, umlal, v21, v17, v16, v0, v1
+ calc umlsl, umlal, v20, v16, v17, v2, v3
+ calc umlsl, umlal, v21, v17, v16, v2, v3
+ calc umlal, umlsl, v20, v16, v17, v4, v5
+ calc umlal, umlsl, v21, v17, v16, v4, v5
+ calc umlal, umlsl, v20, v16, v17, v6, v7, 1
+ umlal v21.8h, v17.8b, v6.8b
+ umlsl v21.8h, v16.8b, v7.8b
+.purgem calc
+ ld2 {v24.8h, v25.8h}, [x4], x10
+ sqadd v16.8h, v20.8h, v24.8h
+ sqadd v17.8h, v21.8h, v25.8h
+ sqrshrun v16.8b, v16.8h, #7
+ sqrshrun v17.8b, v17.8h, #7
+ subs w5, w5, #1
+ st2 {v16.8b, v17.8b}, [x0], x1
+ add x2, x2, x3
+ b.ne 1b
+ ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_bi_h24_8_neon, export=1
+ load_qpel_filterb x6, x7
+ sub x2, x2, #3
+ mov x10, #(MAX_PB_SIZE * 2)
+ mov x11, x7 // height
+1: ld3 {v16.8b-v18.8b}, [x2]
+ movi v20.8h, #0
+ movi v21.8h, #0
+ ldr x12, [x2, #24]
+ movi v22.8h, #0
+.macro calc op1, op2, r0, r1, r2, r3, src0, src1, src2, tail=0
+ \op1 \r0\().8h, \r1\().8b, \src0\().8b
+ \op2 \r0\().8h, \r2\().8b, \src1\().8b
+ umlsl \r0\().8h, \r3\().8b, \src2\().8b
+ ushr \r1\().2d, \r1\().2d, #8
+ mov \r1\().b[7], w12
+ lsr x12, x12, #8
+.endm
+ calc umlsl, umlal, v20, v16, v17, v18, v0, v1, v2
+ calc umlsl, umlal, v21, v17, v18, v16, v0, v1, v2
+ calc umlsl, umlal, v22, v18, v16, v17, v0, v1, v2
+ calc umlal, umlal, v20, v16, v17, v18, v3, v4, v5
+ calc umlal, umlal, v21, v17, v18, v16, v3, v4, v5
+ calc umlal, umlal, v22, v18, v16, v17, v3, v4, v5
+.purgem calc
+ umlal v20.8h, v16.8b, v6.8b
+ umlsl v20.8h, v17.8b, v7.8b
+ ushr v16.2d, v16.2d, #8
+ mov v16.b[7], w12
+ umlal v21.8h, v17.8b, v6.8b
+ umlsl v21.8h, v18.8b, v7.8b
+ umlal v22.8h, v18.8b, v6.8b
+ umlsl v22.8h, v16.8b, v7.8b
+ ld3 {v23.8h, v24.8h, v25.8h}, [x4], x10
+ sqadd v16.8h, v20.8h, v23.8h
+ sqadd v17.8h, v21.8h, v24.8h
+ sqadd v18.8h, v22.8h, v25.8h
+ sqrshrun v16.8b, v16.8h, #7
+ sqrshrun v17.8b, v17.8h, #7
+ sqrshrun v18.8b, v18.8h, #7
+ subs w5, w5, #1
+ st3 {v16.8b, v17.8b, v18.8b}, [x0], x1
+ add x2, x2, x3
+ b.ne 1b
+ ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_bi_h32_8_neon, export=1
+ load_qpel_filterb x6, x7
+ sub x2, x2, #3
+ mov x10, #(MAX_PB_SIZE * 2)
+ mov x11, x7 // height
+1: ld4 {v16.8b-v19.8b}, [x2]
+ movi v20.8h, #0
+ movi v21.8h, #0
+ movi v22.8h, #0
+ ldr x12, [x2, #32]
+ movi v23.8h, #0
+.macro calc op1, op2, dst, r0, r1, r2, r3, src0, src1, src2, src3, tail=0
+ \op1 \dst\().8h, \r0\().8b, \src0\().8b
+ \op2 \dst\().8h, \r1\().8b, \src1\().8b
+ \op1 \dst\().8h, \r2\().8b, \src2\().8b
+ \op2 \dst\().8h, \r3\().8b, \src3\().8b
+ ushr \r0\().2d, \r0\().2d, #8
+ mov \r0\().b[7], w12
+.if \tail-1
+ lsr x12, x12, #8
+.endif
+.endm
+ calc umlsl, umlal, v20, v16, v17, v18, v19, v0, v1, v2, v3
+ calc umlsl, umlal, v21, v17, v18, v19, v16, v0, v1, v2, v3
+ calc umlsl, umlal, v22, v18, v19, v16, v17, v0, v1, v2, v3
+ calc umlsl, umlal, v23, v19, v16, v17, v18, v0, v1, v2, v3
+ calc umlal, umlsl, v20, v16, v17, v18, v19, v4, v5, v6, v7
+ calc umlal, umlsl, v21, v17, v18, v19, v16, v4, v5, v6, v7
+ calc umlal, umlsl, v22, v18, v19, v16, v17, v4, v5, v6, v7, 1
+.purgem calc
+ umlal v23.8h, v19.8b, v4.8b
+ umlsl v23.8h, v16.8b, v5.8b
+ umlal v23.8h, v17.8b, v6.8b
+ umlsl v23.8h, v18.8b, v7.8b
+ ld4 {v24.8h, v25.8h, v26.8h, v27.8h}, [x4], x10
+ sqadd v16.8h, v20.8h, v24.8h
+ sqadd v17.8h, v21.8h, v25.8h
+ sqadd v18.8h, v22.8h, v26.8h
+ sqadd v19.8h, v23.8h, v27.8h
+ sqrshrun v16.8b, v16.8h, #7
+ sqrshrun v17.8b, v17.8h, #7
+ sqrshrun v18.8b, v18.8h, #7
+ sqrshrun v19.8b, v19.8h, #7
+ st4 {v16.8b-v19.8b}, [x0], x1
+ add x2, x2, x3
+ subs w5, w5, #1
+ b.ne 1b
+ ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_bi_h48_8_neon, export=1
+ load_qpel_filterb x6, x7
+ sub x2, x2, #3
+ mov x10, #80
+ mov x11, x7 // height
+1: ld3 {v16.16b-v18.16b}, [x2]
+ ldr x12, [x2, #24]
+ ldr x13, [x2, #48]
+ movi v20.8h, #0
+ movi v21.8h, #0
+ movi v22.8h, #0
+ movi v23.8h, #0
+ movi v24.8h, #0
+ movi v25.8h, #0
+.macro calc op1, op2, dst0, dst1, r0, r1, r2, src0, src1, src2, tail=0
+ \op1 \dst0\().8h, \r0\().8b, \src0\().8b
+ \op2 \dst0\().8h, \r1\().8b, \src1\().8b
+ umlsl \dst0\().8h, \r2\().8b, \src2\().8b
+ \op1\()2 \dst1\().8h, \r0\().16b, \src0\().16b
+ \op2\()2 \dst1\().8h, \r1\().16b, \src1\().16b
+ umlsl2 \dst1\().8h, \r2\().16b, \src2\().16b
+ ushr \r0\().2d, \r0\().2d, #8
+ mov \r0\().b[7], w12
+ mov \r0\().b[15], w13
+ lsr x12, x12, #8
+ lsr x13, x13, #8
+.endm
+ calc umlsl, umlal, v20, v23, v16, v17, v18, v0, v1, v2
+ calc umlsl, umlal, v21, v24, v17, v18, v16, v0, v1, v2
+ calc umlsl, umlal, v22, v25, v18, v16, v17, v0, v1, v2
+ calc umlal, umlal, v20, v23, v16, v17, v18, v3, v4, v5
+ calc umlal, umlal, v21, v24, v17, v18, v16, v3, v4, v5
+ calc umlal, umlal, v22, v25, v18, v16, v17, v3, v4, v5
+.purgem calc
+ umlal v20.8h, v16.8b, v6.8b
+ umlsl v20.8h, v17.8b, v7.8b
+ umlal2 v23.8h, v16.16b, v6.16b
+ umlsl2 v23.8h, v17.16b, v7.16b
+ ushr v16.2d, v16.2d, #8
+ mov v16.b[7], w12
+ mov v16.b[15], w13
+ umlal v21.8h, v17.8b, v6.8b
+ umlsl v21.8h, v18.8b, v7.8b
+ umlal2 v24.8h, v17.16b, v6.16b
+ umlsl2 v24.8h, v18.16b, v7.16b
+ umlal v22.8h, v18.8b, v6.8b
+ umlsl v22.8h, v16.8b, v7.8b
+ umlal2 v25.8h, v18.16b, v6.16b
+ umlsl2 v25.8h, v16.16b, v7.16b
+ ld3 {v26.8h, v27.8h, v28.8h}, [x4], #48
+ sqadd v16.8h, v20.8h, v26.8h
+ sqadd v17.8h, v21.8h, v27.8h
+ sqadd v18.8h, v22.8h, v28.8h
+ ld3 {v26.8h, v27.8h, v28.8h}, [x4], x10
+ sqadd v19.8h, v23.8h, v26.8h
+ sqadd v20.8h, v24.8h, v27.8h
+ sqadd v21.8h, v25.8h, v28.8h
+ sqrshrun v16.8b, v16.8h, #7
+ sqrshrun v17.8b, v17.8h, #7
+ sqrshrun v18.8b, v18.8h, #7
+ sqrshrun2 v16.16b, v19.8h, #7
+ sqrshrun2 v17.16b, v20.8h, #7
+ sqrshrun2 v18.16b, v21.8h, #7
+ subs w5, w5, #1
+ st3 {v16.16b-v18.16b}, [x0], x1
+ add x2, x2, x3
+ b.ne 1b
+ ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_bi_h64_8_neon, export=1
+ load_qpel_filterb x6, x7
+ sub x2, x2, #3
+1: ld4 {v16.16b-v19.16b}, [x2]
+ movi v20.8h, #0
+ movi v21.8h, #0
+ movi v22.8h, #0
+ movi v23.8h, #0
+ ldr x12, [x2, #32]
+ movi v24.8h, #0
+ movi v25.8h, #0
+ ldr x13, [x2, #64]
+ movi v26.8h, #0
+ movi v27.8h, #0
+.macro calc op1, op2, dst0, dst1, r0, r1, r2, r3, src0, src1, src2, src3, tail=0
+ \op1 \dst0\().8h, \r0\().8b, \src0\().8b
+ \op2 \dst0\().8h, \r1\().8b, \src1\().8b
+ \op1 \dst0\().8h, \r2\().8b, \src2\().8b
+ \op2 \dst0\().8h, \r3\().8b, \src3\().8b
+ \op1\()2 \dst1\().8h, \r0\().16b, \src0\().16b
+ \op2\()2 \dst1\().8h, \r1\().16b, \src1\().16b
+ \op1\()2 \dst1\().8h, \r2\().16b, \src2\().16b
+ \op2\()2 \dst1\().8h, \r3\().16b, \src3\().16b
+.if \tail-1
+ ushr \r0\().2d, \r0\().2d, #8
+ mov \r0\().b[7], w12
+ lsr x12, x12, #8
+ mov \r0\().b[15], w13
+ lsr x13, x13, #8
+.endif
+.endm
+ calc umlsl, umlal, v20, v24, v16, v17, v18, v19, v0, v1, v2, v3
+ calc umlsl, umlal, v21, v25, v17, v18, v19, v16, v0, v1, v2, v3
+ calc umlsl, umlal, v22, v26, v18, v19, v16, v17, v0, v1, v2, v3
+ calc umlsl, umlal, v23, v27, v19, v16, v17, v18, v0, v1, v2, v3
+ calc umlal, umlsl, v20, v24, v16, v17, v18, v19, v4, v5, v6, v7
+ calc umlal, umlsl, v21, v25, v17, v18, v19, v16, v4, v5, v6, v7
+ calc umlal, umlsl, v22, v26, v18, v19, v16, v17, v4, v5, v6, v7
+ calc umlal, umlsl, v23, v27, v19, v16, v17, v18, v4, v5, v6, v7, 1
+.purgem calc
+ ld4 {v28.8h-v31.8h}, [x4], #64
+ sqadd v20.8h, v20.8h, v28.8h
+ sqadd v21.8h, v21.8h, v29.8h
+ sqadd v22.8h, v22.8h, v30.8h
+ sqadd v23.8h, v23.8h, v31.8h
+ ld4 {v28.8h-v31.8h}, [x4], #64
+ sqadd v24.8h, v24.8h, v28.8h
+ sqadd v25.8h, v25.8h, v29.8h
+ sqadd v26.8h, v26.8h, v30.8h
+ sqadd v27.8h, v27.8h, v31.8h
+ sqrshrun v16.8b, v20.8h, #7
+ sqrshrun v17.8b, v21.8h, #7
+ sqrshrun v18.8b, v22.8h, #7
+ sqrshrun v19.8b, v23.8h, #7
+ sqrshrun2 v16.16b, v24.8h, #7
+ sqrshrun2 v17.16b, v25.8h, #7
+ sqrshrun2 v18.16b, v26.8h, #7
+ sqrshrun2 v19.16b, v27.8h, #7
+ subs w5, w5, #1
+ st4 {v16.16b-v19.16b}, [x0], x1
+ add x2, x2, x3
+ b.ne 1b
+ ret
+endfunc
+
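+// bi_v: as bi_h, but filtering vertically; seven rows are preloaded and
+// calc_all rotates the registers so each iteration loads only one new row.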
+function ff_hevc_put_hevc_qpel_bi_v4_8_neon, export=1
+ load_qpel_filterb x7, x6
+ sub x2, x2, x3, lsl #1
+ sub x2, x2, x3
+ mov x12, #(MAX_PB_SIZE * 2)
+ ld1 {v16.s}[0], [x2], x3
+ ld1 {v17.s}[0], [x2], x3
+ ld1 {v18.s}[0], [x2], x3
+ ld1 {v19.s}[0], [x2], x3
+ ld1 {v20.s}[0], [x2], x3
+ ld1 {v21.s}[0], [x2], x3
+ ld1 {v22.s}[0], [x2], x3
+.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
+ ld1 {\tmp\().s}[0], [x2], x3
+ movi v24.8h, #0
+ calc_qpelb v24, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+ ld1 {v25.4h}, [x4], x12 // src2
+ sqadd v24.8h, v24.8h, v25.8h
+ sqrshrun v25.8b, v24.8h, #7
+ subs w5, w5, #1
+ st1 {v25.s}[0], [x0], x1
+.endm
+1: calc_all
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_bi_v6_8_neon, export=1
+ load_qpel_filterb x7, x6
+ sub x2, x2, x3, lsl #1
+ sub x2, x2, x3
+ ld1 {v16.8b}, [x2], x3
+ sub x1, x1, #4
+ ld1 {v17.8b}, [x2], x3
+ mov x12, #(MAX_PB_SIZE * 2)
+ ld1 {v18.8b}, [x2], x3
+ ld1 {v19.8b}, [x2], x3
+ ld1 {v20.8b}, [x2], x3
+ ld1 {v21.8b}, [x2], x3
+ ld1 {v22.8b}, [x2], x3
+.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
+ ld1 {\tmp\().8b}, [x2], x3
+ movi v24.8h, #0
+ calc_qpelb v24, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+ ld1 {v25.8h}, [x4], x12 // src2
+ sqadd v24.8h, v24.8h, v25.8h
+ sqrshrun v25.8b, v24.8h, #7
+ st1 {v25.s}[0], [x0], #4
+ subs w5, w5, #1
+ st1 {v25.h}[2], [x0], x1
+.endm
+1: calc_all
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_bi_v8_8_neon, export=1
+ load_qpel_filterb x7, x6
+ sub x2, x2, x3, lsl #1
+ sub x2, x2, x3
+ mov x12, #(MAX_PB_SIZE * 2)
+ ld1 {v16.8b}, [x2], x3
+ ld1 {v17.8b}, [x2], x3
+ ld1 {v18.8b}, [x2], x3
+ ld1 {v19.8b}, [x2], x3
+ ld1 {v20.8b}, [x2], x3
+ ld1 {v21.8b}, [x2], x3
+ ld1 {v22.8b}, [x2], x3
+ .macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
+ ld1 {\tmp\().8b}, [x2], x3
+ movi v24.8h, #0
+ calc_qpelb v24, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+ ld1 {v25.8h}, [x4], x12 // src2
+ sqadd v24.8h, v24.8h, v25.8h
+ sqrshrun v25.8b, v24.8h, #7
+ subs w5, w5, #1
+ st1 {v25.8b}, [x0], x1
+.endm
+1: calc_all
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_bi_v12_8_neon, export=1
+ load_qpel_filterb x7, x6
+ sub x2, x2, x3, lsl #1
+ sub x2, x2, x3
+ sub x1, x1, #8
+ ld1 {v16.16b}, [x2], x3
+ mov x12, #(MAX_PB_SIZE * 2)
+ ld1 {v17.16b}, [x2], x3
+ ld1 {v18.16b}, [x2], x3
+ ld1 {v19.16b}, [x2], x3
+ ld1 {v20.16b}, [x2], x3
+ ld1 {v21.16b}, [x2], x3
+ ld1 {v22.16b}, [x2], x3
+.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
+ ld1 {\tmp\().16b}, [x2], x3
+ movi v24.8h, #0
+ movi v25.8h, #0
+ calc_qpelb v24, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+ calc_qpelb2 v25, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+ ld1 {v26.8h, v27.8h}, [x4], x12 // src2
+ sqadd v24.8h, v24.8h, v26.8h
+ sqadd v25.8h, v25.8h, v27.8h
+ sqrshrun v26.8b, v24.8h, #7
+ sqrshrun2 v26.16b, v25.8h, #7
+ st1 {v26.8b}, [x0], #8
+ subs w5, w5, #1
+ st1 {v26.s}[2], [x0], x1
+.endm
+1: calc_all
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_bi_v16_8_neon, export=1
+ load_qpel_filterb x7, x6
+ sub x2, x2, x3, lsl #1
+ sub x2, x2, x3
+ mov x12, #(MAX_PB_SIZE * 2)
+ ld1 {v16.16b}, [x2], x3
+ ld1 {v17.16b}, [x2], x3
+ ld1 {v18.16b}, [x2], x3
+ ld1 {v19.16b}, [x2], x3
+ ld1 {v20.16b}, [x2], x3
+ ld1 {v21.16b}, [x2], x3
+ ld1 {v22.16b}, [x2], x3
+.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
+ ld1 {\tmp\().16b}, [x2], x3
+ movi v24.8h, #0
+ movi v25.8h, #0
+ calc_qpelb v24, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+ calc_qpelb2 v25, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+ ld1 {v26.8h, v27.8h}, [x4], x12 // src2
+ sqadd v24.8h, v24.8h, v26.8h
+ sqadd v25.8h, v25.8h, v27.8h
+ sqrshrun v26.8b, v24.8h, #7
+ subs w5, w5, #1
+ sqrshrun2 v26.16b, v25.8h, #7
+ st1 {v26.16b}, [x0], x1
+.endm
+1: calc_all
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_bi_v24_8_neon, export=1
+ stp x7, x30, [sp, #-16]!
+ stp x0, x1, [sp, #-16]!
+ stp x2, x3, [sp, #-16]!
+ stp x4, x5, [sp, #-16]!
+ bl X(ff_hevc_put_hevc_qpel_bi_v16_8_neon)
+ ldp x4, x5, [sp], #16
+ ldp x2, x3, [sp], #16
+ ldp x0, x1, [sp], #16
+ ldr x7, [sp]
+ add x0, x0, #16
+ add x2, x2, #16
+ add x4, x4, #32
+ bl X(ff_hevc_put_hevc_qpel_bi_v8_8_neon)
+ ldp xzr, x30, [sp], #16
+ ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_bi_v32_8_neon, export=1
+ sub sp, sp, #64
+ st1 {v12.16b-v15.16b}, [sp]
+ sub x2, x2, x3, lsl #1
+ sub sp, sp, #64
+ st1 {v8.16b-v11.16b}, [sp]
+ sub x2, x2, x3
+ load_qpel_filterb x7, x6
+ ldr w6, [sp, #128]
+ mov x12, #(MAX_PB_SIZE * 2)
+0: mov x8, x2 // src
+ ld1 {v16.16b, v17.16b}, [x8], x3
+ mov w11, w5 // height
+ ld1 {v18.16b, v19.16b}, [x8], x3
+ mov x10, x0 // dst
+ ld1 {v20.16b, v21.16b}, [x8], x3
+ mov x9, x4 // src2
+ ld1 {v22.16b, v23.16b}, [x8], x3
+ ld1 {v24.16b, v25.16b}, [x8], x3
+ ld1 {v26.16b, v27.16b}, [x8], x3
+ ld1 {v28.16b, v29.16b}, [x8], x3
+.macro calc tmp0, tmp1, src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11, src12, src13, src14, src15
+ ld1 {\tmp0\().8h, \tmp1\().8h}, [x8], x3
+ movi v8.8h, #0
+ movi v9.8h, #0
+ movi v10.8h, #0
+ movi v11.8h, #0
+ calc_qpelb v8, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+ calc_qpelb2 v9, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+ calc_qpelb v10, \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15
+ calc_qpelb2 v11, \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15
+ ld1 {v12.8h, v13.8h, v14.8h, v15.8h}, [x9], x12 // src2
+ sqadd v8.8h, v8.8h, v12.8h
+ sqadd v9.8h, v9.8h, v13.8h
+ sqadd v10.8h, v10.8h, v14.8h
+ sqadd v11.8h, v11.8h, v15.8h
+ sqrshrun v12.8b, v8.8h, #7
+ sqrshrun2 v12.16b, v9.8h, #7
+ sqrshrun v13.8b, v10.8h, #7
+ sqrshrun2 v13.16b, v11.8h, #7
+ subs x11, x11, #1
+ st1 {v12.16b, v13.16b}, [x10], x1
+.endm
+1: calc_all2
+.purgem calc
+2: add x0, x0, #32 // dst
+ add x2, x2, #32 // src
+ add x4, x4, #64 // src2
+ subs w6, w6, #32
+ b.ne 0b
+ ld1 {v8.16b-v11.16b}, [sp], #64
+ ld1 {v12.16b-v15.16b}, [sp], #64
+ ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_bi_v48_8_neon, export=1
+ stp x7, x30, [sp, #-16]!
+ stp x0, x1, [sp, #-16]!
+ stp x2, x3, [sp, #-16]!
+ stp x4, x5, [sp, #-16]!
+ mov x8, #32
+ stp x8, x8, [sp, #-16]!
+ bl X(ff_hevc_put_hevc_qpel_bi_v32_8_neon)
+ ldp x8, xzr, [sp], #16
+ ldp x4, x5, [sp], #16
+ ldp x2, x3, [sp], #16
+ ldp x0, x1, [sp], #16
+ ldr x7, [sp]
+ add x0, x0, #32
+ add x2, x2, #32
+ add x4, x4, #64
+ bl X(ff_hevc_put_hevc_qpel_bi_v16_8_neon)
+ ldp xzr, x30, [sp], #16
+ ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_bi_v64_8_neon, export=1
+ b X(ff_hevc_put_hevc_qpel_bi_v32_8_neon)
+endfunc
+
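+// bi_hv: horizontal pass into a temporary buffer on the stack, then a widening
+// vertical pass (32 bit, >> 6), add of src2 and rshrn #7 before narrowing to
+// 8 bit.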
+function ff_hevc_put_hevc_qpel_bi_hv4_8_neon, export=1
+ add w10, w5, #7
+ lsl x10, x10, #7
+ sub sp, sp, x10 // tmp_array
+ stp x0, x1, [sp, #-16]!
+ stp x4, x5, [sp, #-16]!
+ sub x1, x2, x3, lsl #1
+ stp x7, x30, [sp, #-16]!
+ sub x1, x1, x3
+ add x0, sp, #48
+ mov x2, x3
+ add w3, w5, #7
+ mov x4, x6
+ bl X(ff_hevc_put_hevc_qpel_h4_8_neon)
+ ldp x7, x30, [sp], #16
+ ldp x4, x5, [sp], #16
+ mov x9, #(MAX_PB_SIZE * 2)
+ ldp x0, x1, [sp], #16
+ load_qpel_filterh x7, x6
+ ld1 {v16.4h}, [sp], x9
+ ld1 {v17.4h}, [sp], x9
+ ld1 {v18.4h}, [sp], x9
+ ld1 {v19.4h}, [sp], x9
+ ld1 {v20.4h}, [sp], x9
+ ld1 {v21.4h}, [sp], x9
+ ld1 {v22.4h}, [sp], x9
+.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
+ ld1 {\tmp\().4h}, [sp], x9
+ calc_qpelh v1, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sshr
+ ld1 {v5.4h}, [x4], x9 // src2
+ saddw v1.4s, v1.4s, v5.4h
+ rshrn v1.4h, v1.4s, #7
+ sqxtun v1.8b, v1.8h
+ subs w5, w5, #1
+ st1 {v1.s}[0], [x0], x1
+.endm
+1: calc_all
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_bi_hv6_8_neon, export=1
+ add w10, w5, #7
+ lsl x10, x10, #7
+ sub sp, sp, x10 // tmp_array
+ stp x0, x1, [sp, #-16]!
+ stp x4, x5, [sp, #-16]!
+ sub x1, x2, x3, lsl #1
+ stp x7, x30, [sp, #-16]!
+ sub x1, x1, x3
+ add x0, sp, #48
+ mov x2, x3
+ add x3, x5, #7
+ mov x4, x6
+ bl X(ff_hevc_put_hevc_qpel_h6_8_neon)
+ ldp x7, x30, [sp], #16
+ ldp x4, x5, [sp], #16
+ mov x9, #(MAX_PB_SIZE * 2)
+ ldp x0, x1, [sp], #16
+ load_qpel_filterh x7, x6
+ sub x1, x1, #4
+ ld1 {v16.8h}, [sp], x9
+ ld1 {v17.8h}, [sp], x9
+ ld1 {v18.8h}, [sp], x9
+ ld1 {v19.8h}, [sp], x9
+ ld1 {v20.8h}, [sp], x9
+ ld1 {v21.8h}, [sp], x9
+ ld1 {v22.8h}, [sp], x9
+.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
+ ld1 {\tmp\().8h}, [sp], x9
+ calc_qpelh v1, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sshr
+ calc_qpelh2 v2, v2, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sshr
+ ld1 {v5.8h}, [x4], x9 // src2
+ saddw v1.4s, v1.4s, v5.4h
+ saddw2 v2.4s, v2.4s, v5.8h
+ rshrn v1.4h, v1.4s, #7
+ rshrn2 v1.8h, v2.4s, #7
+ sqxtun v1.8b, v1.8h
+ st1 {v1.s}[0], [x0], #4
+ subs w5, w5, #1
+ st1 {v1.h}[2], [x0], x1
+.endm
+1: calc_all
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_bi_hv8_8_neon, export=1
+ add w10, w5, #7
+ lsl x10, x10, #7
+ sub sp, sp, x10 // tmp_array
+ stp x0, x1, [sp, #-16]!
+ stp x4, x5, [sp, #-16]!
+ sub x1, x2, x3, lsl #1
+ stp x7, x30, [sp, #-16]!
+ sub x1, x1, x3
+ add x0, sp, #48
+ mov x2, x3
+ add x3, x5, #7
+ mov x4, x6
+ bl X(ff_hevc_put_hevc_qpel_h8_8_neon)
+ ldp x7, x30, [sp], #16
+ ldp x4, x5, [sp], #16
+ mov x9, #(MAX_PB_SIZE * 2)
+ ldp x0, x1, [sp], #16
+ load_qpel_filterh x7, x6
+ ld1 {v16.8h}, [sp], x9
+ ld1 {v17.8h}, [sp], x9
+ ld1 {v18.8h}, [sp], x9
+ ld1 {v19.8h}, [sp], x9
+ ld1 {v20.8h}, [sp], x9
+ ld1 {v21.8h}, [sp], x9
+ ld1 {v22.8h}, [sp], x9
+.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
+ ld1 {\tmp\().8h}, [sp], x9
+ calc_qpelh v1, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sshr
+ calc_qpelh2 v2, v2, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sshr
+ ld1 {v5.8h}, [x4], x9 // src2
+ saddw v1.4s, v1.4s, v5.4h
+ saddw2 v2.4s, v2.4s, v5.8h
+ rshrn v1.4h, v1.4s, #7
+ rshrn2 v1.8h, v2.4s, #7
+ sqxtun v1.8b, v1.8h
+ subs w5, w5, #1
+ st1 {v1.8b}, [x0], x1
+.endm
+1: calc_all
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_bi_hv12_8_neon, export=1
+ stp xzr, x30, [sp, #-16]!
+ stp x0, x1, [sp, #-16]!
+ stp x2, x3, [sp, #-16]!
+ stp x4, x5, [sp, #-16]!
+ stp x6, x7, [sp, #-16]!
+ bl X(ff_hevc_put_hevc_qpel_bi_hv8_8_neon)
+ ldp x6, x7, [sp], #16
+ ldp x4, x5, [sp], #16
+ ldp x2, x3, [sp], #16
+ add x4, x4, #16
+ ldp x0, x1, [sp], #16
+ add x2, x2, #8
+ add x0, x0, #8
+ bl X(ff_hevc_put_hevc_qpel_bi_hv4_8_neon)
+ ldp xzr, x30, [sp], #16
+ ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_bi_hv16_8_neon, export=1
+ add w10, w5, #7
+ lsl x10, x10, #7
+ sub sp, sp, x10 // tmp_array
+ stp x0, x1, [sp, #-16]!
+ stp x4, x5, [sp, #-16]!
+ stp x7, x30, [sp, #-16]!
+ add x0, sp, #48
+ sub x1, x2, x3, lsl #1
+ sub x1, x1, x3
+ mov x2, x3
+ add w3, w5, #7
+ mov x4, x6
+ bl X(ff_hevc_put_hevc_qpel_h16_8_neon)
+ ldp x7, x30, [sp], #16
+ ldp x4, x5, [sp], #16
+ ldp x0, x1, [sp], #16
+ mov x6, #16 // width
+.Lqpel_bi_hv16_loop:
+ load_qpel_filterh x7, x8
+ mov x9, #(MAX_PB_SIZE * 2)
+ mov x10, x6
+
+0: mov x8, sp // src
+ ld1 {v16.8h, v17.8h}, [x8], x9
+ mov w11, w5 // height
+ ld1 {v18.8h, v19.8h}, [x8], x9
+ mov x12, x4 // src2
+ ld1 {v20.8h, v21.8h}, [x8], x9
+ mov x7, x0 // dst
+ ld1 {v22.8h, v23.8h}, [x8], x9
+ ld1 {v24.8h, v25.8h}, [x8], x9
+ ld1 {v26.8h, v27.8h}, [x8], x9
+ ld1 {v28.8h, v29.8h}, [x8], x9
+.macro calc tmp0, tmp1, src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11, src12, src13, src14, src15
+ ld1 {\tmp0\().8h, \tmp1\().8h}, [x8], x9
+ calc_qpelh v1, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sshr
+ calc_qpelh2 v2, v2, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sshr
+ calc_qpelh v3, \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15, sshr
+ calc_qpelh2 v4, v4, \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15, sshr
+ ld1 {v5.8h, v6.8h}, [x12], x9 // src2
+ saddw v1.4s, v1.4s, v5.4h
+ saddw2 v2.4s, v2.4s, v5.8h
+ saddw v3.4s, v3.4s, v6.4h
+ saddw2 v4.4s, v4.4s, v6.8h
+ rshrn v1.4h, v1.4s, #7
+ rshrn2 v1.8h, v2.4s, #7
+ rshrn v2.4h, v3.4s, #7
+ rshrn2 v2.8h, v4.4s, #7
+ sqxtun v1.8b, v1.8h
+ sqxtun2 v1.16b, v2.8h
+ subs x11, x11, #1
+ st1 {v1.16b}, [x7], x1
+.endm
+1: calc_all2
+.purgem calc
+2: add x0, x0, #16
+ add sp, sp, #32
+ subs x10, x10, #16
+ add x4, x4, #32
+ b.ne 0b
+ add w10, w5, #7
+ lsl x10, x10, #7
+ sub x10, x10, x6, lsl #1 // part of first line
+ add sp, sp, x10 // tmp_array without first line
+ ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_bi_hv24_8_neon, export=1
+ stp xzr, x30, [sp, #-16]!
+ stp x0, x1, [sp, #-16]!
+ stp x2, x3, [sp, #-16]!
+ stp x4, x5, [sp, #-16]!
+ stp x6, x7, [sp, #-16]!
+ bl X(ff_hevc_put_hevc_qpel_bi_hv16_8_neon)
+ ldp x6, x7, [sp], #16
+ ldp x4, x5, [sp], #16
+ ldp x2, x3, [sp], #16
+ ldp x0, x1, [sp], #16
+ add x4, x4, #32
+ add x2, x2, #16
+ add x0, x0, #16
+ bl X(ff_hevc_put_hevc_qpel_bi_hv8_8_neon)
+ ldp xzr, x30, [sp], #16
+ ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_bi_hv32_8_neon, export=1
+ add w10, w5, #7
+ lsl x10, x10, #7
+ sub sp, sp, x10 // tmp_array
+ stp x0, x1, [sp, #-16]!
+ stp x4, x5, [sp, #-16]!
+ stp x7, x30, [sp, #-16]!
+ add x0, sp, #48
+ sub x1, x2, x3, lsl #1
+ mov x2, x3
+ sub x1, x1, x3
+ add w3, w5, #7
+ mov x4, x6
+ bl X(ff_hevc_put_hevc_qpel_h32_8_neon)
+ ldp x7, x30, [sp], #16
+ ldp x4, x5, [sp], #16
+ ldp x0, x1, [sp], #16
+ mov x6, #32 // width
+ b .Lqpel_bi_hv16_loop
+endfunc
+
+function ff_hevc_put_hevc_qpel_bi_hv48_8_neon, export=1
+ add w10, w5, #7
+ lsl x10, x10, #7
+ sub sp, sp, x10 // tmp_array
+ stp x0, x1, [sp, #-16]!
+ stp x4, x5, [sp, #-16]!
+ stp x7, x30, [sp, #-16]!
+ add x0, sp, #48
+ sub x1, x2, x3, lsl #1
+ mov x2, x3
+ sub x1, x1, x3
+ add w3, w5, #7
+ mov x4, x6
+ bl X(ff_hevc_put_hevc_qpel_h48_8_neon)
+ ldp x7, x30, [sp], #16
+ ldp x4, x5, [sp], #16
+ ldp x0, x1, [sp], #16
+ mov x6, #48 // width
+ b .Lqpel_bi_hv16_loop
+endfunc
+
+function ff_hevc_put_hevc_qpel_bi_hv64_8_neon, export=1
+ add w10, w5, #7
+ lsl x10, x10, #7
+ sub sp, sp, x10 // tmp_array
+ stp x0, x1, [sp, #-16]!
+ stp x4, x5, [sp, #-16]!
+ stp x7, x30, [sp, #-16]!
+ add x0, sp, #48
+ sub x1, x2, x3, lsl #1
+ mov x2, x3
+ sub x1, x1, x3
+ add w3, w5, #7
+ mov x4, x6
+ bl X(ff_hevc_put_hevc_qpel_h64_8_neon)
+ ldp x7, x30, [sp], #16
+ ldp x4, x5, [sp], #16
+ ldp x0, x1, [sp], #16
+ mov x6, #64 // width
+ b .Lqpel_bi_hv16_loop
+endfunc
--
2.32.0 (Apple Git-132)
* Re: [FFmpeg-devel] [PATCH 1/2] lavc/aarch64: add hevc qpel assembly
2022-01-20 8:10 [FFmpeg-devel] [PATCH 1/2] lavc/aarch64: add hevc qpel assembly J. Dekker
@ 2022-01-20 8:38 ` Martin Storsjö
2022-02-03 13:51 ` [FFmpeg-devel] [PATCH v2 " J. Dekker
0 siblings, 1 reply; 6+ messages in thread
From: Martin Storsjö @ 2022-01-20 8:38 UTC (permalink / raw)
To: FFmpeg development discussions and patches
On Thu, 20 Jan 2022, J. Dekker wrote:
> Based on patch by: Rafal Dabrowa <fatwildcat@gmail.com>
> ---
> libavcodec/aarch64/Makefile | 1 +
> libavcodec/aarch64/hevcdsp_init_aarch64.c | 69 +
> libavcodec/aarch64/hevcdsp_qpel_neon.S | 2799 +++++++++++++++++++++
> 3 files changed, 2869 insertions(+)
>
> Some changes since last time it was submitted, namely: split, macro'd
> and some scheduling and other improvements.
This fails to compile with every toolchain I tested: Apple Clang 12,
LLVM.org Clang (nightly), GCC 7, GCC 9 and MSVC.
It fails with various incarnations of this issue:
src/libavcodec/aarch64/hevcdsp_init_aarch64.c:140:9: error: use of undeclared identifier 'ff_hevc_put_hevc_pel_pixels4_8_neon'
    NEON8_FNASSIGN(c->put_hevc_qpel, 0, 0, pel_pixels);
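That is, the init code assigns pel_pixels function pointers that this patch
never declares. Just as an illustration (a sketch, assuming pel_pixels takes
the same arguments as the plain qpel_h/v/hv variants), the assignment would
need a matching declaration along the lines of:

    /* hypothetical, not part of this patch */
    NEON8_FNPROTO(pel_pixels, (int16_t *dst,
            uint8_t *src, ptrdiff_t srcstride,
            int height, intptr_t mx, intptr_t my, int width));

together with the corresponding implementations, or the assignment has to be
dropped from this patch.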
Not spending time on reviewing it in this form.
// Martin
* [FFmpeg-devel] [PATCH v2 1/2] lavc/aarch64: add hevc qpel assembly
2022-01-20 8:38 ` Martin Storsjö
@ 2022-02-03 13:51 ` J. Dekker
2022-02-03 13:51 ` [FFmpeg-devel] [PATCH v2 2/2] lavc/aarch64: add hevc epel assembly J. Dekker
2022-02-07 22:11 ` [FFmpeg-devel] [PATCH v2 1/2] lavc/aarch64: add hevc qpel assembly Martin Storsjö
0 siblings, 2 replies; 6+ messages in thread
From: J. Dekker @ 2022-02-03 13:51 UTC (permalink / raw)
To: ffmpeg-devel
Thanks: Rafal Dabrowa <fatwildcat@gmail.com>
---
libavcodec/aarch64/Makefile | 1 +
libavcodec/aarch64/hevcdsp_init_aarch64.c | 67 +
libavcodec/aarch64/hevcdsp_qpel_neon.S | 2799 +++++++++++++++++++++
3 files changed, 2867 insertions(+)
create mode 100644 libavcodec/aarch64/hevcdsp_qpel_neon.S
Had trouble testing on a Linux machine as well, but I have a workflow set up
for that now, so it should be easier in the future. Passes FATE on both macOS
and Linux.
diff --git a/libavcodec/aarch64/Makefile b/libavcodec/aarch64/Makefile
index 954461f81d..8592692479 100644
--- a/libavcodec/aarch64/Makefile
+++ b/libavcodec/aarch64/Makefile
@@ -63,4 +63,5 @@ NEON-OBJS-$(CONFIG_VP9_DECODER) += aarch64/vp9itxfm_16bpp_neon.o \
aarch64/vp9mc_neon.o
NEON-OBJS-$(CONFIG_HEVC_DECODER) += aarch64/hevcdsp_idct_neon.o \
aarch64/hevcdsp_init_aarch64.o \
+ aarch64/hevcdsp_qpel_neon.o \
aarch64/hevcdsp_sao_neon.o
diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c
index 1e40be740c..3e5d85247e 100644
--- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -58,7 +58,63 @@ void ff_hevc_sao_band_filter_8x8_8_neon(uint8_t *_dst, uint8_t *_src,
int16_t *sao_offset_val, int sao_left_class,
int width, int height);
+#define NEON8_FNPROTO(fn, args) \
+ void ff_hevc_put_hevc_##fn##4_8_neon args; \
+ void ff_hevc_put_hevc_##fn##6_8_neon args; \
+ void ff_hevc_put_hevc_##fn##8_8_neon args; \
+ void ff_hevc_put_hevc_##fn##12_8_neon args; \
+ void ff_hevc_put_hevc_##fn##16_8_neon args; \
+ void ff_hevc_put_hevc_##fn##24_8_neon args; \
+ void ff_hevc_put_hevc_##fn##32_8_neon args; \
+ void ff_hevc_put_hevc_##fn##48_8_neon args; \
+ void ff_hevc_put_hevc_##fn##64_8_neon args; \
+NEON8_FNPROTO(qpel_h, (int16_t *dst,
+ uint8_t *src, ptrdiff_t srcstride,
+ int height, intptr_t mx, intptr_t my, int width));
+
+NEON8_FNPROTO(qpel_v, (int16_t *dst,
+ uint8_t *src, ptrdiff_t srcstride,
+ int height, intptr_t mx, intptr_t my, int width));
+
+NEON8_FNPROTO(qpel_hv, (int16_t *dst,
+ uint8_t *src, ptrdiff_t srcstride,
+ int height, intptr_t mx, intptr_t my, int width));
+
+NEON8_FNPROTO(qpel_uni_h, (uint8_t *dst, ptrdiff_t dststride,
+ uint8_t *src, ptrdiff_t srcstride,
+ int height, intptr_t mx, intptr_t my, int width));
+
+NEON8_FNPROTO(qpel_uni_v, (uint8_t *dst, ptrdiff_t dststride,
+ uint8_t *src, ptrdiff_t srcstride,
+ int height, intptr_t mx, intptr_t my, int width));
+
+NEON8_FNPROTO(qpel_uni_hv, (uint8_t *dst, ptrdiff_t dststride,
+ uint8_t *src, ptrdiff_t srcstride,
+ int height, intptr_t mx, intptr_t my, int width));
+
+NEON8_FNPROTO(qpel_bi_h, (uint8_t *dst, ptrdiff_t dststride,
+ uint8_t *src, ptrdiff_t srcstride, int16_t *src2,
+ int height, intptr_t mx, intptr_t my, int width));
+
+NEON8_FNPROTO(qpel_bi_v, (uint8_t *dst, ptrdiff_t dststride,
+ uint8_t *src, ptrdiff_t srcstride, int16_t *src2,
+ int height, intptr_t mx, intptr_t my, int width));
+
+NEON8_FNPROTO(qpel_bi_hv, (uint8_t *dst, ptrdiff_t dststride,
+ uint8_t *src, ptrdiff_t srcstride, int16_t *src2,
+ int height, intptr_t mx, intptr_t my, int width));
+
+#define NEON8_FNASSIGN(member, v, h, fn) \
+ member[1][v][h] = ff_hevc_put_hevc_##fn##4_8_neon; \
+ member[2][v][h] = ff_hevc_put_hevc_##fn##6_8_neon; \
+ member[3][v][h] = ff_hevc_put_hevc_##fn##8_8_neon; \
+ member[4][v][h] = ff_hevc_put_hevc_##fn##12_8_neon; \
+ member[5][v][h] = ff_hevc_put_hevc_##fn##16_8_neon; \
+ member[6][v][h] = ff_hevc_put_hevc_##fn##24_8_neon; \
+ member[7][v][h] = ff_hevc_put_hevc_##fn##32_8_neon; \
+ member[8][v][h] = ff_hevc_put_hevc_##fn##48_8_neon; \
+ member[9][v][h] = ff_hevc_put_hevc_##fn##64_8_neon;
av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
{
@@ -80,6 +136,17 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
// for the current size, but if enabled for bigger sizes, the cases
// of non-multiple of 8 seem to arise.
// c->sao_band_filter[0] = ff_hevc_sao_band_filter_8x8_8_neon;
+
+ NEON8_FNASSIGN(c->put_hevc_qpel, 0, 1, qpel_h);
+ NEON8_FNASSIGN(c->put_hevc_qpel, 1, 0, qpel_v);
+ NEON8_FNASSIGN(c->put_hevc_qpel, 1, 1, qpel_hv);
+ NEON8_FNASSIGN(c->put_hevc_qpel_uni, 0, 1, qpel_uni_h);
+ NEON8_FNASSIGN(c->put_hevc_qpel_uni, 1, 0, qpel_uni_v);
+ NEON8_FNASSIGN(c->put_hevc_qpel_uni, 1, 1, qpel_uni_hv);
+ NEON8_FNASSIGN(c->put_hevc_qpel_bi, 0, 1, qpel_bi_h);
+ NEON8_FNASSIGN(c->put_hevc_qpel_bi, 1, 0, qpel_bi_v);
+ NEON8_FNASSIGN(c->put_hevc_qpel_bi, 1, 1, qpel_bi_hv);
+
}
if (bit_depth == 10) {
c->add_residual[0] = ff_hevc_add_residual_4x4_10_neon;
diff --git a/libavcodec/aarch64/hevcdsp_qpel_neon.S b/libavcodec/aarch64/hevcdsp_qpel_neon.S
new file mode 100644
index 0000000000..e8cc6f5f25
--- /dev/null
+++ b/libavcodec/aarch64/hevcdsp_qpel_neon.S
@@ -0,0 +1,2799 @@
+/* -*-arm64-*-
+ * vim: syntax=arm64asm
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+#define MAX_PB_SIZE 64
+
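+// HEVC 8-tap luma qpel filter coefficients, one row per fractional sample
+// position (0-3).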
+.Lqpel_filters:
+ .byte 0, 0, 0, 0, 0, 0, 0, 0
+ .byte -1, 4,-10, 58, 17, -5, 1, 0
+ .byte -1, 4,-11, 40, 40,-11, 4, -1
+ .byte 0, 1, -5, 17, 58,-10, 4, -1
+
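+// Splat each of the 8 taps across a full vector (v0-v7); taps 0, 2, 5 and 7
+// are negated so calc_qpelb/calc_qpelb2 can accumulate with unsigned
+// umlal/umlsl only.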
+.macro load_qpel_filterb freg, xreg
+ adr \xreg, .Lqpel_filters
+ add \xreg, \xreg, \freg, lsl #3
+ ld4r {v0.16b, v1.16b, v2.16b, v3.16b}, [\xreg], #4
+ ld4r {v4.16b, v5.16b, v6.16b, v7.16b}, [\xreg]
+ neg v0.16b, v0.16b
+ neg v2.16b, v2.16b
+ neg v5.16b, v5.16b
+ neg v7.16b, v7.16b
+.endm
+
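+// First pass: 8-tap multiply-accumulate of the low 8 bytes of the eight
+// source registers into a 16-bit accumulator; calc_qpelb2 handles the high
+// halves.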
+.macro calc_qpelb dst, src0, src1, src2, src3, src4, src5, src6, src7
+ umlsl \dst\().8h, \src0\().8b, v0.8b
+ umlal \dst\().8h, \src1\().8b, v1.8b
+ umlsl \dst\().8h, \src2\().8b, v2.8b
+ umlal \dst\().8h, \src3\().8b, v3.8b
+ umlal \dst\().8h, \src4\().8b, v4.8b
+ umlsl \dst\().8h, \src5\().8b, v5.8b
+ umlal \dst\().8h, \src6\().8b, v6.8b
+ umlsl \dst\().8h, \src7\().8b, v7.8b
+.endm
+
+.macro calc_qpelb2 dst, src0, src1, src2, src3, src4, src5, src6, src7
+ umlsl2 \dst\().8h, \src0\().16b, v0.16b
+ umlal2 \dst\().8h, \src1\().16b, v1.16b
+ umlsl2 \dst\().8h, \src2\().16b, v2.16b
+ umlal2 \dst\().8h, \src3\().16b, v3.16b
+ umlal2 \dst\().8h, \src4\().16b, v4.16b
+ umlsl2 \dst\().8h, \src5\().16b, v5.16b
+ umlal2 \dst\().8h, \src6\().16b, v6.16b
+ umlsl2 \dst\().8h, \src7\().16b, v7.16b
+.endm
+
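+// Second pass: load the taps as signed 16-bit values into v0 for filtering
+// the 16-bit intermediate rows.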
+.macro load_qpel_filterh freg, xreg
+ adr \xreg, .Lqpel_filters
+ add \xreg, \xreg, \freg, lsl #3
+ ld1 {v0.8b}, [\xreg]
+ sxtl v0.8h, v0.8b
+.endm
+
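+// 8-tap filter on 16-bit intermediates, widening to 32 bit; \op (sshr or one
+// of the sqshrn/sqrshrn variants) applies the final shift/narrowing.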
+.macro calc_qpelh dst, src0, src1, src2, src3, src4, src5, src6, src7, op, shift=6
+ smull \dst\().4s, \src0\().4h, v0.h[0]
+ smlal \dst\().4s, \src1\().4h, v0.h[1]
+ smlal \dst\().4s, \src2\().4h, v0.h[2]
+ smlal \dst\().4s, \src3\().4h, v0.h[3]
+ smlal \dst\().4s, \src4\().4h, v0.h[4]
+ smlal \dst\().4s, \src5\().4h, v0.h[5]
+ smlal \dst\().4s, \src6\().4h, v0.h[6]
+ smlal \dst\().4s, \src7\().4h, v0.h[7]
+.ifc \op, sshr
+ sshr \dst\().4s, \dst\().4s, \shift
+.else
+ \op \dst\().4h, \dst\().4s, \shift
+.endif
+.endm
+
+.macro calc_qpelh2 dst, dstt, src0, src1, src2, src3, src4, src5, src6, src7, op, shift=6
+ smull2 \dstt\().4s, \src0\().8h, v0.h[0]
+ smlal2 \dstt\().4s, \src1\().8h, v0.h[1]
+ smlal2 \dstt\().4s, \src2\().8h, v0.h[2]
+ smlal2 \dstt\().4s, \src3\().8h, v0.h[3]
+ smlal2 \dstt\().4s, \src4\().8h, v0.h[4]
+ smlal2 \dstt\().4s, \src5\().8h, v0.h[5]
+ smlal2 \dstt\().4s, \src6\().8h, v0.h[6]
+ smlal2 \dstt\().4s, \src7\().8h, v0.h[7]
+.ifc \op, sshr
+ sshr \dst\().4s, \dstt\().4s, \shift
+.else
+ \op \dst\().8h, \dstt\().4s, \shift
+.endif
+.endm
+
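+// qpel_h*: horizontal filter writing raw 16-bit results with a stride of
+// MAX_PB_SIZE * 2; the x+1..x+7 shifted copies of the source are built with
+// ushr #8 plus a byte insert instead of extra unaligned loads.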
+function ff_hevc_put_hevc_qpel_h4_8_neon, export=1
+ load_qpel_filterb x4, x5
+ sub x1, x1, #3
+ sub x2, x2, #8
+ mov x14, #(MAX_PB_SIZE * 2)
+1: ld1 {v16.8b}, [x1], #8
+ ld1 {v17.s}[0], [x1], x2
+.macro calc src0, src1, idx
+ ushr \src0\().2d, \src1\().2d, #8
+ mov \src0\().b[7], v17.b[\idx]
+.endm
+ calc v18, v16, 0
+ calc v19, v18, 1
+ calc v20, v19, 2
+ ushr v21.2d, v20.2d, #8
+ ushr v22.2d, v21.2d, #8
+ ushr v23.2d, v22.2d, #8
+ ushr v24.2d, v23.2d, #8
+ movi v28.8h, #0
+ calc_qpelb v28, v16, v18, v19, v20, v21, v22, v23, v24
+ subs w3, w3, #1
+ st1 {v28.4h}, [x0], x14
+ b.ne 1b
+ ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_h6_8_neon, export=1
+ load_qpel_filterb x4, x5
+ sub x1, x1, #3
+ mov x14, #(MAX_PB_SIZE * 2 - 8)
+1: ld1 {v16.8b, v17.8b}, [x1], x2
+ // same macro
+ calc v18, v16, 0
+ calc v19, v18, 1
+ calc v20, v19, 2
+ calc v21, v20, 3
+ calc v22, v21, 4
+ ushr v23.2d, v22.2d, #8
+ ushr v24.2d, v23.2d, #8
+ movi v28.8h, #0
+ calc_qpelb v28, v16, v18, v19, v20, v21, v22, v23, v24
+ st1 {v28.4h}, [x0], #8
+ subs w3, w3, #1
+ st1 {v28.s}[2], [x0], x14
+ b.ne 1b
+ ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_h8_8_neon, export=1
+ sxtw x4, w4
+ sxtw x7, w7
+ load_qpel_filterb x4, x5
+ sub x1, x1, #3
+ mov x14, #(MAX_PB_SIZE * 2)
+1: ld1 {v16.8b, v17.8b}, [x1], x2
+ // same macro
+ calc v18, v16, 0
+ calc v19, v18, 1
+ calc v20, v19, 2
+ calc v21, v20, 3
+ calc v22, v21, 4
+ calc v23, v22, 5
+ calc v24, v23, 6
+.purgem calc
+ movi v28.8h, #0
+ calc_qpelb v28, v16, v18, v19, v20, v21, v22, v23, v24
+ subs w3, w3, #1
+ st1 {v28.8h}, [x0], x14
+ b.ne 1b
+ ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_h12_8_neon, export=1
+ load_qpel_filterb x4, x5
+ sub x1, x1, #3
+ sub x2, x2, #16
+ mov x14, #(MAX_PB_SIZE * 2 - 16)
+1: ld2 {v16.8b, v17.8b}, [x1], #16
+ ld1 {v27.s}[0], [x1], x2
+ ushr v18.2d, v16.2d, #8
+ ushr v19.2d, v17.2d, #8
+ mov v18.b[7], v27.b[0]
+ mov v19.b[7], v27.b[1]
+ ushr v20.2d, v18.2d, #8
+ ushr v21.2d, v19.2d, #8
+ mov v20.b[7], v27.b[2]
+ mov v21.b[7], v27.b[3]
+ ushr v22.2d, v20.2d, #8
+ ushr v23.2d, v21.2d, #8
+ ushr v24.2d, v22.2d, #8
+ movi v28.8h, #0
+ movi v29.8h, #0
+ calc_qpelb v28, v16, v17, v18, v19, v20, v21, v22, v23
+ calc_qpelb v29, v17, v18, v19, v20, v21, v22, v23, v24
+ zip1 v16.8h, v28.8h, v29.8h
+ zip2 v17.8h, v28.8h, v29.8h
+ st1 {v16.8h}, [x0], #16
+ subs w3, w3, #1
+ st1 {v17.4h}, [x0], x14
+ b.ne 1b
+ ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_h16_8_neon, export=1
+ load_qpel_filterb x4, x5
+ sub x1, x1, #3
+ sub x2, x2, #16
+ mov x14, #(MAX_PB_SIZE * 2)
+1: ld2 {v16.8b, v17.8b}, [x1], #16
+ ld1 {v27.8b}, [x1], x2
+ ushr v18.2d, v16.2d, #8
+ ushr v19.2d, v17.2d, #8
+ mov v18.b[7], v27.b[0]
+ mov v19.b[7], v27.b[1]
+ ushr v20.2d, v18.2d, #8
+ ushr v21.2d, v19.2d, #8
+ mov v20.b[7], v27.b[2]
+ mov v21.b[7], v27.b[3]
+ ushr v22.2d, v20.2d, #8
+ ushr v23.2d, v21.2d, #8
+ mov v22.b[7], v27.b[4]
+ mov v23.b[7], v27.b[5]
+ ushr v24.2d, v22.2d, #8
+ mov v24.b[7], v27.b[6]
+ movi v28.8h, #0
+ movi v29.8h, #0
+ calc_qpelb v28, v16, v17, v18, v19, v20, v21, v22, v23
+ calc_qpelb v29, v17, v18, v19, v20, v21, v22, v23, v24
+ subs w3, w3, #1
+ st2 {v28.8h, v29.8h}, [x0], x14
+ b.ne 1b
+ ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_h24_8_neon, export=1
+ load_qpel_filterb x4, x5
+ sub x1, x1, #3
+ sub x2, x2, #24
+ mov x14, #(MAX_PB_SIZE * 2)
+1: ld3 {v16.8b, v17.8b, v18.8b}, [x1], #24
+ ld1 {v27.8b}, [x1], x2
+ ushr v19.2d, v16.2d, #8
+ ushr v20.2d, v17.2d, #8
+ ushr v21.2d, v18.2d, #8
+ mov v19.b[7], v27.b[0]
+ mov v20.b[7], v27.b[1]
+ mov v21.b[7], v27.b[2]
+ ushr v22.2d, v19.2d, #8
+ ushr v23.2d, v20.2d, #8
+ ushr v24.2d, v21.2d, #8
+ mov v22.b[7], v27.b[3]
+ mov v23.b[7], v27.b[4]
+ mov v24.b[7], v27.b[5]
+ ushr v25.2d, v22.2d, #8
+ mov v25.b[7], v27.b[6]
+ movi v28.8h, #0
+ movi v29.8h, #0
+ movi v30.8h, #0
+ calc_qpelb v28, v16, v17, v18, v19, v20, v21, v22, v23
+ calc_qpelb v29, v17, v18, v19, v20, v21, v22, v23, v24
+ calc_qpelb v30, v18, v19, v20, v21, v22, v23, v24, v25
+ subs w3, w3, #1
+ st3 {v28.8h, v29.8h, v30.8h}, [x0], x14
+ b.ne 1b
+ ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_h32_8_neon, export=1
+ load_qpel_filterb x4, x5
+ sub x1, x1, #3
+ sub x2, x2, #32
+ mov x14, #(MAX_PB_SIZE * 2)
+1: ld4 {v16.8b, v17.8b, v18.8b, v19.8b}, [x1], #32
+ movi v28.8h, #0
+ movi v29.8h, #0
+ ld1 {v27.8b}, [x1], x2
+ movi v30.8h, #0
+ movi v31.8h, #0
+ ushr v20.2d, v16.2d, #8
+ ushr v21.2d, v17.2d, #8
+ ushr v22.2d, v18.2d, #8
+ ushr v23.2d, v19.2d, #8
+ mov v20.b[7], v27.b[0]
+ mov v21.b[7], v27.b[1]
+ mov v22.b[7], v27.b[2]
+ mov v23.b[7], v27.b[3]
+ ushr v24.2d, v20.2d, #8
+ ushr v25.2d, v21.2d, #8
+ ushr v26.2d, v22.2d, #8
+ mov v24.b[7], v27.b[4]
+ mov v25.b[7], v27.b[5]
+ mov v26.b[7], v27.b[6]
+ calc_qpelb v28, v16, v17, v18, v19, v20, v21, v22, v23
+ calc_qpelb v29, v17, v18, v19, v20, v21, v22, v23, v24
+ calc_qpelb v30, v18, v19, v20, v21, v22, v23, v24, v25
+ calc_qpelb v31, v19, v20, v21, v22, v23, v24, v25, v26
+ st4 {v28.8h, v29.8h, v30.8h, v31.8h}, [x0], x14
+ subs w3, w3, #1
+ b.ne 1b
+ ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_h48_8_neon, export=1
+ load_qpel_filterb x4, x5
+ sub x1, x1, #3
+ sub x2, x2, #48
+ mov x7, #24
+ mov x14, #80
+1: ld3 {v16.16b, v17.16b, v18.16b}, [x1], x7
+ movi v28.8h, #0
+ ld1 {v26.8b}, [x1], x7
+ movi v29.8h, #0
+ ld1 {v27.8b}, [x1], x2
+ movi v30.8h, #0
+ ushr v19.2d, v16.2d, #8
+ ushr v20.2d, v17.2d, #8
+ ushr v21.2d, v18.2d, #8
+ mov v19.b[7], v26.b[0]
+ mov v19.b[15], v27.b[0]
+ mov v20.b[7], v26.b[1]
+ mov v20.b[15], v27.b[1]
+ mov v21.b[7], v26.b[2]
+ mov v21.b[15], v27.b[2]
+ ushr v22.2d, v19.2d, #8
+ ushr v23.2d, v20.2d, #8
+ ushr v24.2d, v21.2d, #8
+ mov v22.b[7], v26.b[3]
+ mov v22.b[15], v27.b[3]
+ mov v23.b[7], v26.b[4]
+ mov v23.b[15], v27.b[4]
+ mov v24.b[7], v26.b[5]
+ mov v24.b[15], v27.b[5]
+ ushr v25.2d, v22.2d, #8
+ mov v25.b[7], v26.b[6]
+ mov v25.b[15], v27.b[6]
+ calc_qpelb v28, v16, v17, v18, v19, v20, v21, v22, v23
+ calc_qpelb v29, v17, v18, v19, v20, v21, v22, v23, v24
+ calc_qpelb v30, v18, v19, v20, v21, v22, v23, v24, v25
+ st3 {v28.8h, v29.8h, v30.8h}, [x0], #48
+ movi v28.8h, #0
+ movi v29.8h, #0
+ movi v30.8h, #0
+ calc_qpelb2 v28, v16, v17, v18, v19, v20, v21, v22, v23
+ calc_qpelb2 v29, v17, v18, v19, v20, v21, v22, v23, v24
+ calc_qpelb2 v30, v18, v19, v20, v21, v22, v23, v24, v25
+ st3 {v28.8h, v29.8h, v30.8h}, [x0], x14
+ subs w3, w3, #1
+ b.ne 1b
+ ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_h64_8_neon, export=1
+ load_qpel_filterb x4, x5
+ sub x1, x1, #3
+ sub x2, x2, #64
+ mov x7, #32
+1: ld4 {v16.16b, v17.16b, v18.16b, v19.16b}, [x1], x7
+ ld1 {v27.8b}, [x1], x7
+ ld1 {v28.8b}, [x1], x2
+ ushr v20.2d, v16.2d, #8
+ ushr v21.2d, v17.2d, #8
+ ushr v22.2d, v18.2d, #8
+ ushr v23.2d, v19.2d, #8
+ mov v20.b[7], v27.b[0]
+ mov v21.b[7], v27.b[1]
+ mov v22.b[7], v27.b[2]
+ mov v23.b[7], v27.b[3]
+ mov v20.b[15], v28.b[0]
+ mov v21.b[15], v28.b[1]
+ mov v22.b[15], v28.b[2]
+ mov v23.b[15], v28.b[3]
+ ushr v24.2d, v20.2d, #8
+ ushr v25.2d, v21.2d, #8
+ ushr v26.2d, v22.2d, #8
+ mov v24.b[7], v27.b[4]
+ mov v25.b[7], v27.b[5]
+ mov v26.b[7], v27.b[6]
+ mov v24.b[15], v28.b[4]
+ mov v25.b[15], v28.b[5]
+ mov v26.b[15], v28.b[6]
+.macro calc fn
+ movi v28.8h, #0
+ movi v29.8h, #0
+ movi v30.8h, #0
+ movi v31.8h, #0
+ \fn v28, v16, v17, v18, v19, v20, v21, v22, v23
+ \fn v29, v17, v18, v19, v20, v21, v22, v23, v24
+ \fn v30, v18, v19, v20, v21, v22, v23, v24, v25
+ \fn v31, v19, v20, v21, v22, v23, v24, v25, v26
+ st4 {v28.8h, v29.8h, v30.8h, v31.8h}, [x0], #64
+.endm
+ calc calc_qpelb
+ calc calc_qpelb2
+.purgem calc
+ subs w3, w3, #1
+ b.ne 1b
+ ret
+endfunc
+
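+// calc_all/calc_all2 expand the per-size 'calc' macro eight times, rotating
+// the window of source registers so only one new row is loaded per output
+// row; the caller defines 'calc' and purges it afterwards.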
+.macro calc_all
+ calc v23, v16, v17, v18, v19, v20, v21, v22, v23
+ b.eq 2f
+ calc v16, v17, v18, v19, v20, v21, v22, v23, v16
+ b.eq 2f
+ calc v17, v18, v19, v20, v21, v22, v23, v16, v17
+ b.eq 2f
+ calc v18, v19, v20, v21, v22, v23, v16, v17, v18
+ b.eq 2f
+ calc v19, v20, v21, v22, v23, v16, v17, v18, v19
+ b.eq 2f
+ calc v20, v21, v22, v23, v16, v17, v18, v19, v20
+ b.eq 2f
+ calc v21, v22, v23, v16, v17, v18, v19, v20, v21
+ b.eq 2f
+ calc v22, v23, v16, v17, v18, v19, v20, v21, v22
+ b.hi 1b
+.endm
+
+.macro calc_all2
+ calc v30, v31, v16, v18, v20, v22, v24, v26, v28, v30, v17, v19, v21, v23, v25, v27, v29, v31
+ b.eq 2f
+ calc v16, v17, v18, v20, v22, v24, v26, v28, v30, v16, v19, v21, v23, v25, v27, v29, v31, v17
+ b.eq 2f
+ calc v18, v19, v20, v22, v24, v26, v28, v30, v16, v18, v21, v23, v25, v27, v29, v31, v17, v19
+ b.eq 2f
+ calc v20, v21, v22, v24, v26, v28, v30, v16, v18, v20, v23, v25, v27, v29, v31, v17, v19, v21
+ b.eq 2f
+ calc v22, v23, v24, v26, v28, v30, v16, v18, v20, v22, v25, v27, v29, v31, v17, v19, v21, v23
+ b.eq 2f
+ calc v24, v25, v26, v28, v30, v16, v18, v20, v22, v24, v27, v29, v31, v17, v19, v21, v23, v25
+ b.eq 2f
+ calc v26, v27, v28, v30, v16, v18, v20, v22, v24, v26, v29, v31, v17, v19, v21, v23, v25, v27
+ b.eq 2f
+ calc v28, v29, v30, v16, v18, v20, v22, v24, v26, v28, v31, v17, v19, v21, v23, v25, v27, v29
+ b.hi 1b
+.endm
+
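+// qpel_v*: vertical filter; seven rows are preloaded, calc_all streams in the
+// remaining rows one at a time.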
+function ff_hevc_put_hevc_qpel_v4_8_neon, export=1
+ load_qpel_filterb x5, x4
+ sub x1, x1, x2, lsl #1
+ mov x9, #(MAX_PB_SIZE * 2)
+ sub x1, x1, x2
+ ld1 {v16.s}[0], [x1], x2
+ ld1 {v17.s}[0], [x1], x2
+ ld1 {v18.s}[0], [x1], x2
+ ld1 {v19.s}[0], [x1], x2
+ ld1 {v20.s}[0], [x1], x2
+ ld1 {v21.s}[0], [x1], x2
+ ld1 {v22.s}[0], [x1], x2
+.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
+ ld1 {\tmp\().s}[0], [x1], x2
+ movi v24.8h, #0
+ calc_qpelb v24, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+ st1 {v24.4h}, [x0], x9
+ subs w3, w3, #1
+ b.eq 2f
+.endm
+1: calc_all
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_v6_8_neon, export=1
+ load_qpel_filterb x5, x4
+ sub x1, x1, x2, lsl #1
+ mov x9, #(MAX_PB_SIZE * 2 - 8)
+ sub x1, x1, x2
+ ld1 {v16.8b}, [x1], x2
+ ld1 {v17.8b}, [x1], x2
+ ld1 {v18.8b}, [x1], x2
+ ld1 {v19.8b}, [x1], x2
+ ld1 {v20.8b}, [x1], x2
+ ld1 {v21.8b}, [x1], x2
+ ld1 {v22.8b}, [x1], x2
+.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
+ ld1 {\tmp\().8b}, [x1], x2
+ movi v24.8h, #0
+ calc_qpelb v24, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+ st1 {v24.4h}, [x0], #8
+ st1 {v24.s}[2], [x0], x9
+ subs w3, w3, #1
+.endm
+1: calc_all
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_v8_8_neon, export=1
+ load_qpel_filterb x5, x4
+ sub x1, x1, x2, lsl #1
+ mov x9, #(MAX_PB_SIZE * 2)
+ sub x1, x1, x2
+ ld1 {v16.8b}, [x1], x2
+ ld1 {v17.8b}, [x1], x2
+ ld1 {v18.8b}, [x1], x2
+ ld1 {v19.8b}, [x1], x2
+ ld1 {v20.8b}, [x1], x2
+ ld1 {v21.8b}, [x1], x2
+ ld1 {v22.8b}, [x1], x2
+.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
+ ld1 {\tmp\().8b}, [x1], x2
+ movi v24.8h, #0
+ calc_qpelb v24, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+ st1 {v24.8h}, [x0], x9
+ subs w3, w3, #1
+.endm
+1: calc_all
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_v12_8_neon, export=1
+ load_qpel_filterb x5, x4
+ sub x1, x1, x2, lsl #1
+ mov x9, #(MAX_PB_SIZE * 2 - 16)
+ sub x1, x1, x2
+ ld1 {v16.16b}, [x1], x2
+ ld1 {v17.16b}, [x1], x2
+ ld1 {v18.16b}, [x1], x2
+ ld1 {v19.16b}, [x1], x2
+ ld1 {v20.16b}, [x1], x2
+ ld1 {v21.16b}, [x1], x2
+ ld1 {v22.16b}, [x1], x2
+.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
+ ld1 {\tmp\().16b}, [x1], x2
+ movi v24.8h, #0
+ movi v25.8h, #0
+ calc_qpelb v24, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+ calc_qpelb2 v25, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+ st1 {v24.8h}, [x0], #16
+ subs w3, w3, #1
+ st1 {v25.4h}, [x0], x9
+.endm
+1: calc_all
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_v16_8_neon, export=1
+ load_qpel_filterb x5, x4
+ sub x1, x1, x2, lsl #1
+ mov x9, #(MAX_PB_SIZE * 2)
+ sub x1, x1, x2
+ ld1 {v16.16b}, [x1], x2
+ ld1 {v17.16b}, [x1], x2
+ ld1 {v18.16b}, [x1], x2
+ ld1 {v19.16b}, [x1], x2
+ ld1 {v20.16b}, [x1], x2
+ ld1 {v21.16b}, [x1], x2
+ ld1 {v22.16b}, [x1], x2
+.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
+ ld1 {\tmp\().16b}, [x1], x2
+ movi v24.8h, #0
+ movi v25.8h, #0
+ calc_qpelb v24, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+ calc_qpelb2 v25, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+ subs w3, w3, #1
+ st1 {v24.8h, v25.8h}, [x0], x9
+.endm
+1: calc_all
+.purgem calc
+2: ret
+endfunc
+
+// todo: reads #32 bytes
+function ff_hevc_put_hevc_qpel_v24_8_neon, export=1
+ sub sp, sp, #48
+ st1 {v8.16b, v9.16b, v10.16b}, [sp]
+ load_qpel_filterb x5, x4
+ sub x1, x1, x2, lsl #1
+ sub x1, x1, x2
+ mov x9, #(MAX_PB_SIZE * 2)
+ ld1 {v16.16b, v17.16b}, [x1], x2
+ ld1 {v18.16b, v19.16b}, [x1], x2
+ ld1 {v20.16b, v21.16b}, [x1], x2
+ ld1 {v22.16b, v23.16b}, [x1], x2
+ ld1 {v24.16b, v25.16b}, [x1], x2
+ ld1 {v26.16b, v27.16b}, [x1], x2
+ ld1 {v28.16b, v29.16b}, [x1], x2
+.macro calc tmp0, tmp1, src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11, src12, src13, src14, src15
+ ld1 {\tmp0\().16b, \tmp1\().16b}, [x1], x2
+ movi v8.8h, #0
+ movi v9.8h, #0
+ movi v10.8h, #0
+ calc_qpelb v8, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+ calc_qpelb2 v9, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+ calc_qpelb v10, \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15
+ subs w3, w3, #1
+ st1 {v8.8h, v9.8h, v10.8h}, [x0], x9
+.endm
+1: calc_all2
+.purgem calc
+2: ld1 {v8.16b, v9.16b, v10.16b}, [sp], #48
+ ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_v32_8_neon, export=1
+ sub sp, sp, #64
+ st1 {v8.16b-v11.16b}, [sp]
+ load_qpel_filterb x5, x4
+ sub x1, x1, x2, lsl #1
+ mov x9, #(MAX_PB_SIZE * 2)
+ sub x1, x1, x2
+ ld1 {v16.16b, v17.16b}, [x1], x2
+ ld1 {v18.16b, v19.16b}, [x1], x2
+ ld1 {v20.16b, v21.16b}, [x1], x2
+ ld1 {v22.16b, v23.16b}, [x1], x2
+ ld1 {v24.16b, v25.16b}, [x1], x2
+ ld1 {v26.16b, v27.16b}, [x1], x2
+ ld1 {v28.16b, v29.16b}, [x1], x2
+.macro calc tmp0, tmp1, src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11, src12, src13, src14, src15
+ ld1 {\tmp0\().16b, \tmp1\().16b}, [x1], x2
+ movi v8.8h, #0
+ movi v9.8h, #0
+ movi v10.8h, #0
+ movi v11.8h, #0
+ calc_qpelb v8, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+ calc_qpelb2 v9, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+ calc_qpelb v10, \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15
+ calc_qpelb2 v11, \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15
+ subs w3, w3, #1
+ st1 {v8.8h-v11.8h}, [x0], x9
+.endm
+1: calc_all2
+.purgem calc
+2: ld1 {v8.16b-v11.16b}, [sp], #64
+ ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_v48_8_neon, export=1
+ stp x5, x30, [sp, #-16]!
+ stp x0, x1, [sp, #-16]!
+ stp x2, x3, [sp, #-16]!
+ bl X(ff_hevc_put_hevc_qpel_v24_8_neon)
+ ldp x2, x3, [sp], #16
+ ldp x0, x1, [sp], #16
+ ldr x5, [sp]
+ add x0, x0, #48
+ add x1, x1, #24
+ bl X(ff_hevc_put_hevc_qpel_v24_8_neon)
+ ldp xzr, x30, [sp], #16
+ ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_v64_8_neon, export=1
+ sub sp, sp, #64
+ st1 {v8.16b-v11.16b}, [sp]
+ load_qpel_filterb x5, x4
+ sub x1, x1, x2, lsl #1
+ sub x1, x1, x2
+ mov x9, #(MAX_PB_SIZE * 2)
+0: mov x8, x1 // src
+ ld1 {v16.16b, v17.16b}, [x8], x2
+ mov w11, w3 // height
+ ld1 {v18.16b, v19.16b}, [x8], x2
+ mov x10, x0 // dst
+ ld1 {v20.16b, v21.16b}, [x8], x2
+ ld1 {v22.16b, v23.16b}, [x8], x2
+ ld1 {v24.16b, v25.16b}, [x8], x2
+ ld1 {v26.16b, v27.16b}, [x8], x2
+ ld1 {v28.16b, v29.16b}, [x8], x2
+.macro calc tmp0, tmp1, src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11, src12, src13, src14, src15
+ ld1 {\tmp0\().16b, \tmp1\().16b}, [x8], x2
+ movi v8.8h, #0
+ movi v9.8h, #0
+ movi v10.8h, #0
+ movi v11.8h, #0
+ calc_qpelb v8, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+ calc_qpelb2 v9, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+ calc_qpelb v10, \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15
+ calc_qpelb2 v11, \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15
+ subs x11, x11, #1
+ st1 {v8.8h-v11.8h}, [x10], x9
+.endm
+1: calc_all2
+.purgem calc
+2: add x0, x0, #64
+ add x1, x1, #32
+ subs w6, w6, #32
+ b.hi 0b
+ ld1 {v8.16b-v11.16b}, [sp], #64
+ ret
+endfunc
+
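+// qpel_hv*: horizontal pass into a temporary buffer on the stack ((height + 7)
+// rows, stride 128 bytes), then the vertical pass narrows with sqshrn #6.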
+function ff_hevc_put_hevc_qpel_hv4_8_neon, export=1
+ add w10, w3, #7
+ mov x7, #128
+ lsl x10, x10, #7
+ sub sp, sp, x10 // tmp_array
+ stp x0, x3, [sp, #-16]!
+ stp x5, x30, [sp, #-16]!
+ add x0, sp, #32
+ sub x1, x1, x2, lsl #1
+ add x3, x3, #7
+ sub x1, x1, x2
+ bl X(ff_hevc_put_hevc_qpel_h4_8_neon)
+ ldp x5, x30, [sp], #16
+ ldp x0, x3, [sp], #16
+ load_qpel_filterh x5, x4
+ ld1 {v16.4h}, [sp], x7
+ ld1 {v17.4h}, [sp], x7
+ ld1 {v18.4h}, [sp], x7
+ ld1 {v19.4h}, [sp], x7
+ ld1 {v20.4h}, [sp], x7
+ ld1 {v21.4h}, [sp], x7
+ ld1 {v22.4h}, [sp], x7
+.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
+ ld1 {\tmp\().4h}, [sp], x7
+ calc_qpelh v1, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqshrn
+ subs w3, w3, #1
+ st1 {v1.4h}, [x0], x7
+.endm
+1: calc_all
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_hv6_8_neon, export=1
+ add w10, w3, #7
+ mov x7, #128
+ lsl x10, x10, #7
+ sub sp, sp, x10 // tmp_array
+ stp x0, x3, [sp, #-16]!
+ stp x5, x30, [sp, #-16]!
+ add x0, sp, #32
+ sub x1, x1, x2, lsl #1
+ add x3, x3, #7
+ sub x1, x1, x2
+ bl X(ff_hevc_put_hevc_qpel_h6_8_neon)
+ ldp x5, x30, [sp], #16
+ mov x8, #120
+ ldp x0, x3, [sp], #16
+ load_qpel_filterh x5, x4
+ ld1 {v16.8h}, [sp], x7
+ ld1 {v17.8h}, [sp], x7
+ ld1 {v18.8h}, [sp], x7
+ ld1 {v19.8h}, [sp], x7
+ ld1 {v20.8h}, [sp], x7
+ ld1 {v21.8h}, [sp], x7
+ ld1 {v22.8h}, [sp], x7
+.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
+ ld1 {\tmp\().8h}, [sp], x7
+ calc_qpelh v1, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqshrn
+ calc_qpelh2 v1, v2, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqshrn2
+ st1 {v1.4h}, [x0], #8
+ subs w3, w3, #1
+ st1 {v1.s}[2], [x0], x8
+.endm
+1: calc_all
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_hv8_8_neon, export=1
+ add w10, w3, #7
+ lsl x10, x10, #7
+ sub x1, x1, x2, lsl #1
+ sub sp, sp, x10 // tmp_array
+ stp x0, x3, [sp, #-16]!
+ stp x5, x30, [sp, #-16]!
+ add x0, sp, #32
+ add x3, x3, #7
+ sub x1, x1, x2
+ bl X(ff_hevc_put_hevc_qpel_h8_8_neon)
+ ldp x5, x30, [sp], #16
+ mov x7, #128
+ ldp x0, x3, [sp], #16
+ load_qpel_filterh x5, x4
+ ld1 {v16.8h}, [sp], x7
+ ld1 {v17.8h}, [sp], x7
+ ld1 {v18.8h}, [sp], x7
+ ld1 {v19.8h}, [sp], x7
+ ld1 {v20.8h}, [sp], x7
+ ld1 {v21.8h}, [sp], x7
+ ld1 {v22.8h}, [sp], x7
+.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
+ ld1 {\tmp\().8h}, [sp], x7
+ calc_qpelh v1, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqshrn
+ calc_qpelh2 v1, v2, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqshrn2
+ subs w3, w3, #1
+ st1 {v1.8h}, [x0], x7
+.endm
+1: calc_all
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_hv12_8_neon, export=1
+ add w10, w3, #7
+ lsl x10, x10, #7
+ sub x1, x1, x2, lsl #1
+ sub sp, sp, x10 // tmp_array
+ stp x0, x3, [sp, #-16]!
+ stp x5, x30, [sp, #-16]!
+ add x0, sp, #32
+ add x3, x3, #7
+ sub x1, x1, x2
+ bl X(ff_hevc_put_hevc_qpel_h12_8_neon)
+ ldp x5, x30, [sp], #16
+ mov x7, #128
+ ldp x0, x3, [sp], #16
+ load_qpel_filterh x5, x4
+ mov x8, #112
+ ld1 {v16.8h, v17.8h}, [sp], x7
+ ld1 {v18.8h, v19.8h}, [sp], x7
+ ld1 {v20.8h, v21.8h}, [sp], x7
+ ld1 {v22.8h, v23.8h}, [sp], x7
+ ld1 {v24.8h, v25.8h}, [sp], x7
+ ld1 {v26.8h, v27.8h}, [sp], x7
+ ld1 {v28.8h, v29.8h}, [sp], x7
+.macro calc tmp0, tmp1, src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11, src12, src13, src14, src15
+ ld1 {\tmp0\().8h, \tmp1\().8h}, [sp], x7
+ calc_qpelh v1, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqshrn
+ calc_qpelh2 v1, v2, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqshrn2
+ calc_qpelh v2, \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15, sqshrn
+ st1 {v1.8h}, [x0], #16
+ subs w3, w3, #1
+ st1 {v2.4h}, [x0], x8
+.endm
+1: calc_all2
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_hv16_8_neon, export=1
+ add w10, w3, #7
+ lsl x10, x10, #7
+ sub x1, x1, x2, lsl #1
+ sub sp, sp, x10 // tmp_array
+ stp x0, x3, [sp, #-16]!
+ stp x5, x30, [sp, #-16]!
+ add x3, x3, #7
+ add x0, sp, #32
+ sub x1, x1, x2
+ bl X(ff_hevc_put_hevc_qpel_h16_8_neon)
+ ldp x5, x30, [sp], #16
+ mov x7, #128
+ ldp x0, x3, [sp], #16
+ load_qpel_filterh x5, x4
+ ld1 {v16.8h, v17.8h}, [sp], x7
+ ld1 {v18.8h, v19.8h}, [sp], x7
+ ld1 {v20.8h, v21.8h}, [sp], x7
+ ld1 {v22.8h, v23.8h}, [sp], x7
+ ld1 {v24.8h, v25.8h}, [sp], x7
+ ld1 {v26.8h, v27.8h}, [sp], x7
+ ld1 {v28.8h, v29.8h}, [sp], x7
+.macro calc tmp0, tmp1, src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11, src12, src13, src14, src15
+ ld1 {\tmp0\().8h, \tmp1\().8h}, [sp], x7
+ calc_qpelh v1, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqshrn
+ calc_qpelh2 v1, v2, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqshrn2
+ calc_qpelh v2, \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15, sqshrn
+ calc_qpelh2 v2, v3, \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15, sqshrn2
+ subs w3, w3, #1
+ st1 {v1.8h, v2.8h}, [x0], x7
+.endm
+1: calc_all2
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_hv24_8_neon, export=1
+ sub sp, sp, #64
+ st1 {v8.16b-v11.16b}, [sp]
+ sub x1, x1, x2, lsl #1
+ sub sp, sp, #64
+ add w10, w3, #7
+ st1 {v12.16b-v15.16b}, [sp]
+ lsl x10, x10, #7
+ sub sp, sp, x10 // tmp_array
+ stp x0, x3, [sp, #-16]!
+ stp x5, x30, [sp, #-16]!
+ add x0, sp, #32
+ add x3, x3, #7
+ sub x1, x1, x2
+ bl X(ff_hevc_put_hevc_qpel_h24_8_neon)
+ ldp x5, x30, [sp], #16
+ mov x7, #128
+ ldp x0, x3, [sp], #16
+ load_qpel_filterh x5, x4
+ ld1 {v8.8h-v10.8h}, [sp], x7
+ ld1 {v11.8h-v13.8h}, [sp], x7
+ ld1 {v14.8h-v16.8h}, [sp], x7
+ ld1 {v17.8h-v19.8h}, [sp], x7
+ ld1 {v20.8h-v22.8h}, [sp], x7
+ ld1 {v23.8h-v25.8h}, [sp], x7
+ ld1 {v26.8h-v28.8h}, [sp], x7
+1: ld1 {v29.8h-v31.8h}, [sp], x7
+ calc_qpelh v1, v8, v11, v14, v17, v20, v23, v26, v29, sqshrn
+ calc_qpelh2 v1, v2, v8, v11, v14, v17, v20, v23, v26, v29, sqshrn2
+ calc_qpelh v2, v9, v12, v15, v18, v21, v24, v27, v30, sqshrn
+ calc_qpelh2 v2, v3, v9, v12, v15, v18, v21, v24, v27, v30, sqshrn2
+ calc_qpelh v3, v10, v13, v16, v19, v22, v25, v28, v31, sqshrn
+ calc_qpelh2 v3, v4, v10, v13, v16, v19, v22, v25, v28, v31, sqshrn2
+ subs w3, w3, #1
+ st1 {v1.8h-v3.8h}, [x0], x7
+ b.eq 2f
+
+ ld1 {v8.8h-v10.8h}, [sp], x7
+ calc_qpelh v1, v11, v14, v17, v20, v23, v26, v29, v8, sqshrn
+ calc_qpelh2 v1, v2, v11, v14, v17, v20, v23, v26, v29, v8, sqshrn2
+ calc_qpelh v2, v12, v15, v18, v21, v24, v27, v30, v9, sqshrn
+ calc_qpelh2 v2, v3, v12, v15, v18, v21, v24, v27, v30, v9, sqshrn2
+ calc_qpelh v3, v13, v16, v19, v22, v25, v28, v31, v10, sqshrn
+ calc_qpelh2 v3, v4, v13, v16, v19, v22, v25, v28, v31, v10, sqshrn2
+ subs w3, w3, #1
+ st1 {v1.8h-v3.8h}, [x0], x7
+ b.eq 2f
+
+ ld1 {v11.8h-v13.8h}, [sp], x7
+ calc_qpelh v1, v14, v17, v20, v23, v26, v29, v8, v11, sqshrn
+ calc_qpelh2 v1, v2, v14, v17, v20, v23, v26, v29, v8, v11, sqshrn2
+ calc_qpelh v2, v15, v18, v21, v24, v27, v30, v9, v12, sqshrn
+ calc_qpelh2 v2, v3, v15, v18, v21, v24, v27, v30, v9, v12, sqshrn2
+ calc_qpelh v3, v16, v19, v22, v25, v28, v31, v10, v13, sqshrn
+ calc_qpelh2 v3, v4, v16, v19, v22, v25, v28, v31, v10, v13, sqshrn2
+ subs w3, w3, #1
+ st1 {v1.8h-v3.8h}, [x0], x7
+ b.eq 2f
+
+ ld1 {v14.8h-v16.8h}, [sp], x7
+ calc_qpelh v1, v17, v20, v23, v26, v29, v8, v11, v14, sqshrn
+ calc_qpelh2 v1, v2, v17, v20, v23, v26, v29, v8, v11, v14, sqshrn2
+ calc_qpelh v2, v18, v21, v24, v27, v30, v9, v12, v15, sqshrn
+ calc_qpelh2 v2, v3, v18, v21, v24, v27, v30, v9, v12, v15, sqshrn2
+ calc_qpelh v3, v19, v22, v25, v28, v31, v10, v13, v16, sqshrn
+ calc_qpelh2 v3, v4, v19, v22, v25, v28, v31, v10, v13, v16, sqshrn2
+ subs w3, w3, #1
+ st1 {v1.8h-v3.8h}, [x0], x7
+ b.eq 2f
+
+ ld1 {v17.8h-v19.8h}, [sp], x7
+ calc_qpelh v1, v20, v23, v26, v29, v8, v11, v14, v17, sqshrn
+ calc_qpelh2 v1, v2, v20, v23, v26, v29, v8, v11, v14, v17, sqshrn2
+ calc_qpelh v2, v21, v24, v27, v30, v9, v12, v15, v18, sqshrn
+ calc_qpelh2 v2, v3, v21, v24, v27, v30, v9, v12, v15, v18, sqshrn2
+ calc_qpelh v3, v22, v25, v28, v31, v10, v13, v16, v19, sqshrn
+ calc_qpelh2 v3, v4, v22, v25, v28, v31, v10, v13, v16, v19, sqshrn2
+ subs w3, w3, #1
+ st1 {v1.8h-v3.8h}, [x0], x7
+ b.eq 2f
+
+ ld1 {v20.8h-v22.8h}, [sp], x7
+ calc_qpelh v1, v23, v26, v29, v8, v11, v14, v17, v20, sqshrn
+ calc_qpelh2 v1, v2, v23, v26, v29, v8, v11, v14, v17, v20, sqshrn2
+ calc_qpelh v2, v24, v27, v30, v9, v12, v15, v18, v21, sqshrn
+ calc_qpelh2 v2, v3, v24, v27, v30, v9, v12, v15, v18, v21, sqshrn2
+ calc_qpelh v3, v25, v28, v31, v10, v13, v16, v19, v22, sqshrn
+ calc_qpelh2 v3, v4, v25, v28, v31, v10, v13, v16, v19, v22, sqshrn2
+ subs w3, w3, #1
+ st1 {v1.8h-v3.8h}, [x0], x7
+ b.eq 2f
+
+ ld1 {v23.8h-v25.8h}, [sp], x7
+ calc_qpelh v1, v26, v29, v8, v11, v14, v17, v20, v23, sqshrn
+ calc_qpelh2 v1, v2, v26, v29, v8, v11, v14, v17, v20, v23, sqshrn2
+ calc_qpelh v2, v27, v30, v9, v12, v15, v18, v21, v24, sqshrn
+ calc_qpelh2 v2, v3, v27, v30, v9, v12, v15, v18, v21, v24, sqshrn2
+ calc_qpelh v3, v28, v31, v10, v13, v16, v19, v22, v25, sqshrn
+ calc_qpelh2 v3, v4, v28, v31, v10, v13, v16, v19, v22, v25, sqshrn2
+ subs w3, w3, #1
+ st1 {v1.8h-v3.8h}, [x0], x7
+ b.eq 2f
+
+ ld1 {v26.8h-v28.8h}, [sp], x7
+ calc_qpelh v1, v29, v8, v11, v14, v17, v20, v23, v26, sqshrn
+ calc_qpelh2 v1, v2, v29, v8, v11, v14, v17, v20, v23, v26, sqshrn2
+ calc_qpelh v2, v30, v9, v12, v15, v18, v21, v24, v27, sqshrn
+ calc_qpelh2 v2, v3, v30, v9, v12, v15, v18, v21, v24, v27, sqshrn2
+ calc_qpelh v3, v31, v10, v13, v16, v19, v22, v25, v28, sqshrn
+ calc_qpelh2 v3, v4, v31, v10, v13, v16, v19, v22, v25, v28, sqshrn2
+ subs w3, w3, #1
+ st1 {v1.8h-v3.8h}, [x0], x7
+ b.hi 1b
+2: ld1 {v12.16b-v15.16b}, [sp], #64
+ ld1 {v8.16b-v11.16b}, [sp], #64
+ ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_hv32_8_neon, export=1
+ add w10, w3, #7
+ sub x1, x1, x2, lsl #1
+ lsl x10, x10, #7
+ sub x1, x1, x2
+ sub sp, sp, x10 // tmp_array
+ stp x0, x3, [sp, #-16]!
+ add x3, x3, #7
+ stp x5, x30, [sp, #-16]!
+ add x0, sp, #32
+ bl X(ff_hevc_put_hevc_qpel_h32_8_neon)
+ ldp x5, x30, [sp], #16
+ mov x7, #128
+ ldp x0, x3, [sp], #16
+ load_qpel_filterh x5, x4
+0: mov x8, sp // src
+ ld1 {v16.8h, v17.8h}, [x8], x7
+ mov w9, w3 // height
+ ld1 {v18.8h, v19.8h}, [x8], x7
+ mov x5, x0 // dst
+ ld1 {v20.8h, v21.8h}, [x8], x7
+ ld1 {v22.8h, v23.8h}, [x8], x7
+ ld1 {v24.8h, v25.8h}, [x8], x7
+ ld1 {v26.8h, v27.8h}, [x8], x7
+ ld1 {v28.8h, v29.8h}, [x8], x7
+.macro calc tmp0, tmp1, src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11, src12, src13, src14, src15
+ ld1 {\tmp0\().8h, \tmp1\().8h}, [x8], x7
+ calc_qpelh v1, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqshrn
+ calc_qpelh2 v1, v2, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqshrn2
+ calc_qpelh v2, \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15, sqshrn
+ calc_qpelh2 v2, v3, \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15, sqshrn2
+ subs x9, x9, #1
+ st1 {v1.8h, v2.8h}, [x5], x7
+.endm
+1: calc_all2
+.purgem calc
+2: add x0, x0, #32
+ add sp, sp, #32
+ subs w6, w6, #16
+ b.hi 0b
+ add w10, w3, #6
+ add sp, sp, #64 // discard rest of first line
+ lsl x10, x10, #7
+ add sp, sp, x10 // tmp_array without first line
+ ret
+endfunc
+
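+// The 48 and 64 pixel wide hv cases are two side-by-side calls to the 24/32
+// wide versions; dst is an int16_t pointer, so its byte offset (#48/#64) is
+// twice the pixel offset applied to src (#24/#32).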
+function ff_hevc_put_hevc_qpel_hv48_8_neon, export=1
+ stp xzr, x30, [sp, #-16]!
+ stp x0, x1, [sp, #-16]!
+ stp x2, x3, [sp, #-16]!
+ stp x4, x5, [sp, #-16]!
+ bl X(ff_hevc_put_hevc_qpel_hv24_8_neon)
+ ldp x4, x5, [sp], #16
+ ldp x2, x3, [sp], #16
+ ldp x0, x1, [sp], #16
+ add x1, x1, #24
+ add x0, x0, #48
+ bl X(ff_hevc_put_hevc_qpel_hv24_8_neon)
+ ldp xzr, x30, [sp], #16
+ ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_hv64_8_neon, export=1
+ stp xzr, x30, [sp, #-16]!
+ stp x0, x1, [sp, #-16]!
+ stp x2, x3, [sp, #-16]!
+ stp x4, x5, [sp, #-16]!
+ mov x6, #32
+ bl X(ff_hevc_put_hevc_qpel_hv32_8_neon)
+ ldp x4, x5, [sp], #16
+ ldp x2, x3, [sp], #16
+ ldp x0, x1, [sp], #16
+ add x1, x1, #32
+ add x0, x0, #64
+ mov x6, #32
+ bl X(ff_hevc_put_hevc_qpel_hv32_8_neon)
+ ldp xzr, x30, [sp], #16
+ ret
+endfunc
+
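+// qpel_uni_h* (AAPCS64): x0 = uint8_t *dst, x1 = dststride, x2 = uint8_t *src,
+// x3 = srcstride, w4 = height, x5 = mx, x6 = my, w7 = width.
+// load_qpel_filterb (defined earlier in this file) loads the eight taps
+// selected by mx into v0-v7 as magnitudes; the tap signs are applied through
+// the umlal/umlsl pattern below. Each calc step accumulates one tap, then
+// slides the source window one byte to the right (ushr #8 plus inserting the
+// next input byte); the sum is normalized to 8 bit with sqrshrun #6.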
+function ff_hevc_put_hevc_qpel_uni_h4_8_neon, export=1
+ load_qpel_filterb x5, x6
+ sub x2, x2, #3
+1: ld1 {v16.8b, v17.8b}, [x2], x3
+ movi v20.8h, #0
+.macro calc op, src
+ \op v20.8h, v16.8b, v\src\().8b
+ ushr v16.2d, v16.2d, #8
+ mov v16.b[7], v17.b[\src]
+.endm
+ calc umlsl, 0
+ calc umlal, 1
+ calc umlsl, 2
+ calc umlal, 3
+ calc umlal, 4
+ calc umlsl, 5
+ calc umlal, 6
+// calc is deliberately not purged here: the 6 and 8 pixel wide variants below reuse it
+ umlsl v20.8h, v16.8b, v7.8b
+ sqrshrun v20.8b, v20.8h, #6
+ subs w4, w4, #1
+ st1 {v20.s}[0], [x0], x1
+ b.ne 1b
+ ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_h6_8_neon, export=1
+ load_qpel_filterb x5, x6
+ sub x2, x2, #3
+ sub x1, x1, #4
+1: ld1 {v16.8b, v17.8b}, [x2], x3
+ movi v20.8h, #0
+// reuses the calc macro defined in qpel_uni_h4 above
+ calc umlsl, 0
+ calc umlal, 1
+ calc umlsl, 2
+ calc umlal, 3
+ calc umlal, 4
+ calc umlsl, 5
+ calc umlal, 6
+ umlsl v20.8h, v16.8b, v7.8b
+ sqrshrun v20.8b, v20.8h, #6
+ st1 {v20.s}[0], [x0], #4
+ subs w4, w4, #1
+ st1 {v20.h}[2], [x0], x1
+ b.ne 1b
+ ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_h8_8_neon, export=1
+ load_qpel_filterb x5, x6
+ sub x2, x2, #3
+1: ld1 {v16.8b, v17.8b}, [x2], x3
+ movi v20.8h, #0
+// reuses the calc macro defined in qpel_uni_h4 above
+ calc umlsl, 0
+ calc umlal, 1
+ calc umlsl, 2
+ calc umlal, 3
+ calc umlal, 4
+ calc umlsl, 5
+ calc umlal, 6
+.purgem calc
+ umlsl v20.8h, v16.8b, v7.8b
+ sqrshrun v20.8b, v20.8h, #6
+ subs w4, w4, #1
+ st1 {v20.8b}, [x0], x1
+ b.ne 1b
+ ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_h12_8_neon, export=1
+ load_qpel_filterb x5, x6
+ sub x2, x2, #3
+ sub x1, x1, #8
+1: ld2 {v16.8b, v17.8b}, [x2]
+ movi v20.8h, #0
+ ldr w12, [x2, #16]
+ movi v21.8h, #0
+.macro calc op1, op2, r0, r1, r2, src0, src1, tail=0
+ \op1 \r0\().8h, \r1\().8b, \src0\().8b
+ \op2 \r0\().8h, \r2\().8b, \src1\().8b
+.if \tail-1
+ ushr \r1\().2d, \r1\().2d, #8
+.endif
+.endm
+ calc umlsl, umlal, v20, v16, v17, v0, v1
+ mov v16.b[7], w12
+ lsr x12, x12, #8
+ calc umlsl, umlal, v21, v17, v16, v0, v1
+ mov v17.b[7], w12
+ lsr x12, x12, #8
+ calc umlsl, umlal, v20, v16, v17, v2, v3
+ mov v16.b[7], w12
+ calc umlsl, umlal, v21, v17, v16, v2, v3
+ calc umlal, umlsl, v20, v16, v17, v4, v5
+ calc umlal, umlsl, v21, v17, v16, v4, v5
+ calc umlal, umlsl, v20, v16, v17, v6, v7
+ calc umlal, umlsl, v21, v17, v16, v6, v7, 1
+.purgem calc
+ zip1 v16.8h, v20.8h, v21.8h
+ zip2 v17.8h, v20.8h, v21.8h
+ sqrshrun v20.8b, v16.8h, #6
+ sqrshrun2 v20.16b, v17.8h, #6
+ st1 {v20.8b}, [x0], #8
+ add x2, x2, x3
+ st1 {v20.s}[2], [x0], x1
+ subs w4, w4, #1
+ b.ne 1b
+ ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_h16_8_neon, export=1
+ load_qpel_filterb x5, x6
+ sub x2, x2, #3
+1: ld2 {v16.8b, v17.8b}, [x2]
+ ldr x12, [x2, #16]
+ movi v20.8h, #0
+ movi v21.8h, #0
+.macro calc op1, op2, dst, r0, r1, src0, src1, tail=0
+ \op1 \dst\().8h, \r0\().8b, \src0\().8b
+ \op2 \dst\().8h, \r1\().8b, \src1\().8b
+ ushr \r0\().2d, \r0\().2d, #8
+ mov \r0\().b[7], w12
+.if \tail-1
+ lsr x12, x12, #8
+.endif
+.endm
+ calc umlsl, umlal, v20, v16, v17, v0, v1
+ calc umlsl, umlal, v21, v17, v16, v0, v1
+ calc umlsl, umlal, v20, v16, v17, v2, v3
+ calc umlsl, umlal, v21, v17, v16, v2, v3
+ calc umlal, umlsl, v20, v16, v17, v4, v5
+ calc umlal, umlsl, v21, v17, v16, v4, v5
+ calc umlal, umlsl, v20, v16, v17, v6, v7, 1
+.purgem calc
+ umlal v21.8h, v17.8b, v6.8b
+ umlsl v21.8h, v16.8b, v7.8b
+ sqrshrun v20.8b, v20.8h, #6
+ sqrshrun v21.8b, v21.8h, #6
+ st2 {v20.8b, v21.8b}, [x0], x1
+ add x2, x2, x3
+ subs w4, w4, #1
+ b.ne 1b
+ ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_h24_8_neon, export=1
+ load_qpel_filterb x5, x6
+ sub x2, x2, #3
+1: ld3 {v16.8b-v18.8b}, [x2]
+ ldr x12, [x2, #24]
+ movi v20.8h, #0
+ movi v21.8h, #0
+ movi v22.8h, #0
+.macro calc op1, op2, dst, r0, r1, r2, src0, src1, src2
+ \op1 \dst\().8h, \r0\().8b, \src0\().8b
+ \op2 \dst\().8h, \r1\().8b, \src1\().8b
+ umlsl \dst\().8h, \r2\().8b, \src2\().8b
+ ushr \r0\().2d, \r0\().2d, #8
+ mov \r0\().b[7], w12
+ lsr x12, x12, #8
+.endm
+ calc umlsl, umlal, v20, v16, v17, v18, v0, v1, v2
+ calc umlsl, umlal, v21, v17, v18, v16, v0, v1, v2
+ calc umlsl, umlal, v22, v18, v16, v17, v0, v1, v2
+ calc umlal, umlal, v20, v16, v17, v18, v3, v4, v5
+ calc umlal, umlal, v21, v17, v18, v16, v3, v4, v5
+ calc umlal, umlal, v22, v18, v16, v17, v3, v4, v5
+.purgem calc
+ umlal v20.8h, v16.8b, v6.8b
+ umlsl v20.8h, v17.8b, v7.8b
+ ushr v16.2d, v16.2d, #8
+ mov v16.b[7], w12
+ umlal v21.8h, v17.8b, v6.8b
+ umlsl v21.8h, v18.8b, v7.8b
+ umlal v22.8h, v18.8b, v6.8b
+ umlsl v22.8h, v16.8b, v7.8b
+ sqrshrun v20.8b, v20.8h, #6
+ sqrshrun v22.8b, v22.8h, #6
+ sqrshrun v21.8b, v21.8h, #6
+ st3 {v20.8b-v22.8b}, [x0], x1
+ add x2, x2, x3
+ subs w4, w4, #1
+ b.ne 1b
+ ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_h32_8_neon, export=1
+ load_qpel_filterb x5, x6
+ sub x2, x2, #3
+1: ld4 {v16.8b-v19.8b}, [x2]
+ ldr x12, [x2, #32]
+ movi v20.8h, #0
+ movi v21.8h, #0
+ movi v22.8h, #0
+ movi v23.8h, #0
+.macro calc op1, op2, dst, r0, r1, r2, r3, src0, src1, src2, src3, tail=0
+ \op1 \dst\().8h, \r0\().8b, \src0\().8b
+ \op2 \dst\().8h, \r1\().8b, \src1\().8b
+ \op1 \dst\().8h, \r2\().8b, \src2\().8b
+ \op2 \dst\().8h, \r3\().8b, \src3\().8b
+ ushr \r0\().2d, \r0\().2d, #8
+ mov \r0\().b[7], w12
+.if \tail-1
+ lsr x12, x12, #8
+.endif
+.endm
+ calc umlsl, umlal, v20, v16, v17, v18, v19, v0, v1, v2, v3
+ calc umlsl, umlal, v21, v17, v18, v19, v16, v0, v1, v2, v3
+ calc umlsl, umlal, v22, v18, v19, v16, v17, v0, v1, v2, v3
+ calc umlsl, umlal, v23, v19, v16, v17, v18, v0, v1, v2, v3
+ calc umlal, umlsl, v20, v16, v17, v18, v19, v4, v5, v6, v7
+ calc umlal, umlsl, v21, v17, v18, v19, v16, v4, v5, v6, v7
+ calc umlal, umlsl, v22, v18, v19, v16, v17, v4, v5, v6, v7, 1
+.purgem calc
+ umlal v23.8h, v19.8b, v4.8b
+ sqrshrun v20.8b, v20.8h, #6
+ umlsl v23.8h, v16.8b, v5.8b
+ sqrshrun v21.8b, v21.8h, #6
+ umlal v23.8h, v17.8b, v6.8b
+ sqrshrun v22.8b, v22.8h, #6
+ umlsl v23.8h, v18.8b, v7.8b
+ sqrshrun v23.8b, v23.8h, #6
+ st4 {v20.8b-v23.8b}, [x0], x1
+ add x2, x2, x3
+ subs w4, w4, #1
+ b.ne 1b
+ ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_h48_8_neon, export=1
+ load_qpel_filterb x5, x6
+ sub x2, x2, #3
+1: ld3 {v16.16b-v18.16b}, [x2]
+ movi v20.8h, #0
+ movi v21.8h, #0
+ movi v22.8h, #0
+ ldr x12, [x2, #24]
+ movi v23.8h, #0
+ movi v24.8h, #0
+ movi v25.8h, #0
+ ldr x13, [x2, #48]
+.macro calc op1, op2, dst0, dst1, r0, r1, r2, src0, src1, src2
+ \op1 \dst0\().8h, \r0\().8b, \src0\().8b
+ \op2 \dst0\().8h, \r1\().8b, \src1\().8b
+ umlsl \dst0\().8h, \r2\().8b, \src2\().8b
+ \op1\()2 \dst1\().8h, \r0\().16b, \src0\().16b
+ \op2\()2 \dst1\().8h, \r1\().16b, \src1\().16b
+ umlsl2 \dst1\().8h, \r2\().16b, \src2\().16b
+ ushr \r0\().2d, \r0\().2d, #8
+ mov \r0\().b[7], w12
+ mov \r0\().b[15], w13
+ lsr x12, x12, #8
+ lsr x13, x13, #8
+.endm
+ calc umlsl, umlal, v20, v23, v16, v17, v18, v0, v1, v2
+ calc umlsl, umlal, v21, v24, v17, v18, v16, v0, v1, v2
+ calc umlsl, umlal, v22, v25, v18, v16, v17, v0, v1, v2
+ calc umlal, umlal, v20, v23, v16, v17, v18, v3, v4, v5
+ calc umlal, umlal, v21, v24, v17, v18, v16, v3, v4, v5
+ calc umlal, umlal, v22, v25, v18, v16, v17, v3, v4, v5
+.purgem calc
+.macro calc r0, r1, r2, r3
+ umlal \r0\().8h, \r2\().8b, v6.8b
+ umlsl \r0\().8h, \r3\().8b, v7.8b
+ umlal2 \r1\().8h, \r2\().16b, v6.16b
+ umlsl2 \r1\().8h, \r3\().16b, v7.16b
+.endm
+ calc v20, v23, v16, v17
+ ushr v16.2d, v16.2d, #8
+ mov v16.b[7], w12
+ mov v16.b[15], w13
+ calc v21, v24, v17, v18
+ calc v22, v25, v18, v16
+.purgem calc
+ sqrshrun v20.8b, v20.8h, #6
+ sqrshrun v21.8b, v21.8h, #6
+ sqrshrun v22.8b, v22.8h, #6
+ sqrshrun2 v20.16b, v23.8h, #6
+ sqrshrun2 v21.16b, v24.8h, #6
+ sqrshrun2 v22.16b, v25.8h, #6
+ st3 {v20.16b-v22.16b}, [x0], x1
+ add x2, x2, x3
+ subs w4, w4, #1
+ b.ne 1b
+ ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_h64_8_neon, export=1
+ load_qpel_filterb x5, x6
+ sub x2, x2, #3
+1: ld4 {v16.16b-v19.16b}, [x2]
+ ldr x12, [x2, #32]
+ ldr x13, [x2, #64]
+ movi v20.8h, #0
+ movi v21.8h, #0
+ movi v22.8h, #0
+ movi v23.8h, #0
+ movi v24.8h, #0
+ movi v25.8h, #0
+ movi v26.8h, #0
+ movi v27.8h, #0
+.macro calc op1, op2, dst0, dst1, r0, r1, r2, r3, src0, src1, src2, src3, tail=0
+ \op1 \dst0\().8h, \r0\().8b, \src0\().8b
+ \op2 \dst0\().8h, \r1\().8b, \src1\().8b
+ \op1 \dst0\().8h, \r2\().8b, \src2\().8b
+ \op2 \dst0\().8h, \r3\().8b, \src3\().8b
+ \op1\()2 \dst1\().8h, \r0\().16b, \src0\().16b
+ \op2\()2 \dst1\().8h, \r1\().16b, \src1\().16b
+ \op1\()2 \dst1\().8h, \r2\().16b, \src2\().16b
+ \op2\()2 \dst1\().8h, \r3\().16b, \src3\().16b
+.if \tail-1
+ ushr \r0\().2d, \r0\().2d, #8
+ mov \r0\().b[7], w12
+ mov \r0\().b[15], w13
+ lsr x12, x12, #8
+ lsr x13, x13, #8
+.endif
+.endm
+ calc umlsl, umlal, v20, v24, v16, v17, v18, v19, v0, v1, v2, v3
+ calc umlsl, umlal, v21, v25, v17, v18, v19, v16, v0, v1, v2, v3
+ calc umlsl, umlal, v22, v26, v18, v19, v16, v17, v0, v1, v2, v3
+ calc umlsl, umlal, v23, v27, v19, v16, v17, v18, v0, v1, v2, v3
+ calc umlal, umlsl, v20, v24, v16, v17, v18, v19, v4, v5, v6, v7
+ calc umlal, umlsl, v21, v25, v17, v18, v19, v16, v4, v5, v6, v7
+ calc umlal, umlsl, v22, v26, v18, v19, v16, v17, v4, v5, v6, v7
+ calc umlal, umlsl, v23, v27, v19, v16, v17, v18, v4, v5, v6, v7, 1
+.purgem calc
+ sqrshrun v20.8b, v20.8h, #6
+ sqrshrun v21.8b, v21.8h, #6
+ sqrshrun v22.8b, v22.8h, #6
+ sqrshrun v23.8b, v23.8h, #6
+ sqrshrun2 v20.16b, v24.8h, #6
+ sqrshrun2 v21.16b, v25.8h, #6
+ sqrshrun2 v22.16b, v26.8h, #6
+ sqrshrun2 v23.16b, v27.8h, #6
+ st4 {v20.16b-v23.16b}, [x0], x1
+ add x2, x2, x3
+ subs w4, w4, #1
+ b.ne 1b
+ ret
+endfunc
+
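+// qpel_uni_v*: same argument layout as qpel_uni_h*, but the filter is selected
+// by my (x6). src is rewound by 3 rows, seven rows are preloaded into v16-v22,
+// and the calc_all helper (defined earlier in this file) then emits one output
+// row per iteration, loading a single new source row and rotating the eight
+// row window through the cached registers.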
+function ff_hevc_put_hevc_qpel_uni_v4_8_neon, export=1
+ load_qpel_filterb x6, x5
+ sub x2, x2, x3, lsl #1
+ sub x2, x2, x3
+ ld1 {v16.s}[0], [x2], x3
+ ld1 {v17.s}[0], [x2], x3
+ ld1 {v18.s}[0], [x2], x3
+ ld1 {v19.s}[0], [x2], x3
+ ld1 {v20.s}[0], [x2], x3
+ ld1 {v21.s}[0], [x2], x3
+ ld1 {v22.s}[0], [x2], x3
+.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
+ ld1 {\tmp\().s}[0], [x2], x3
+ movi v24.8h, #0
+ calc_qpelb v24, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+ sqrshrun v24.8b, v24.8h, #6
+ subs w4, w4, #1
+ st1 {v24.s}[0], [x0], x1
+.endm
+1: calc_all
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_v6_8_neon, export=1
+ load_qpel_filterb x6, x5
+ sub x2, x2, x3, lsl #1
+ sub x1, x1, #4
+ sub x2, x2, x3
+ ld1 {v16.8b}, [x2], x3
+ ld1 {v17.8b}, [x2], x3
+ ld1 {v18.8b}, [x2], x3
+ ld1 {v19.8b}, [x2], x3
+ ld1 {v20.8b}, [x2], x3
+ ld1 {v21.8b}, [x2], x3
+ ld1 {v22.8b}, [x2], x3
+.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
+ ld1 {\tmp\().8b}, [x2], x3
+ movi v24.8h, #0
+ calc_qpelb v24, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+ sqrshrun v24.8b, v24.8h, #6
+ st1 {v24.s}[0], [x0], #4
+ subs w4, w4, #1
+ st1 {v24.h}[2], [x0], x1
+.endm
+1: calc_all
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_v8_8_neon, export=1
+ load_qpel_filterb x6, x5
+ sub x2, x2, x3, lsl #1
+ sub x2, x2, x3
+ ld1 {v16.8b}, [x2], x3
+ ld1 {v17.8b}, [x2], x3
+ ld1 {v18.8b}, [x2], x3
+ ld1 {v19.8b}, [x2], x3
+ ld1 {v20.8b}, [x2], x3
+ ld1 {v21.8b}, [x2], x3
+ ld1 {v22.8b}, [x2], x3
+.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
+ ld1 {\tmp\().8b}, [x2], x3
+ movi v24.8h, #0
+ calc_qpelb v24, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+ sqrshrun v24.8b, v24.8h, #6
+ subs w4, w4, #1
+ st1 {v24.8b}, [x0], x1
+.endm
+1: calc_all
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_v12_8_neon, export=1
+ load_qpel_filterb x6, x5
+ sub x2, x2, x3, lsl #1
+ sub x1, x1, #8
+ sub x2, x2, x3
+0: mov x8, x2 // src
+ ld1 {v16.16b}, [x8], x3
+ mov w11, w4 // height
+ ld1 {v17.16b}, [x8], x3
+ mov x10, x0 // dst
+ ld1 {v18.16b}, [x8], x3
+ ld1 {v19.16b}, [x8], x3
+ ld1 {v20.16b}, [x8], x3
+ ld1 {v21.16b}, [x8], x3
+ ld1 {v22.16b}, [x8], x3
+.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
+ ld1 {\tmp\().16b}, [x8], x3
+ movi v24.8h, #0
+ movi v25.8h, #0
+ calc_qpelb v24, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+ calc_qpelb2 v25, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+ sqrshrun v24.8b, v24.8h, #6
+ sqrshrun2 v24.16b, v25.8h, #6
+ st1 {v24.8b}, [x10], #8
+ subs x11, x11, #1
+ st1 {v24.s}[2], [x10], x1
+.endm
+1: calc_all
+.purgem calc
+2: add x0, x0, #12
+ add x2, x2, #12
+ subs w7, w7, #12
+ b.ne 0b
+ ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_v16_8_neon, export=1
+ load_qpel_filterb x6, x5
+ sub x2, x2, x3, lsl #1
+ sub x2, x2, x3
+0: mov x8, x2 // src
+ ld1 {v16.16b}, [x8], x3
+ mov w11, w4 // height
+ ld1 {v17.16b}, [x8], x3
+ mov x10, x0 // dst
+ ld1 {v18.16b}, [x8], x3
+ ld1 {v19.16b}, [x8], x3
+ ld1 {v20.16b}, [x8], x3
+ ld1 {v21.16b}, [x8], x3
+ ld1 {v22.16b}, [x8], x3
+.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
+ ld1 {\tmp\().16b}, [x8], x3
+ movi v24.8h, #0
+ movi v25.8h, #0
+ calc_qpelb v24, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+ calc_qpelb2 v25, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+ sqrshrun v24.8b, v24.8h, #6
+ sqrshrun2 v24.16b, v25.8h, #6
+ subs x11, x11, #1
+ st1 {v24.16b}, [x10], x1
+.endm
+1: calc_all
+.purgem calc
+2: add x0, x0, #16
+ add x2, x2, #16
+ subs w7, w7, #16
+ b.ne 0b
+ ret
+endfunc
+
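+// The wider vertical blocks reuse the narrower loops: the 12 and 16 wide
+// functions above iterate over the width passed in w7, so 24 maps onto the
+// 12 wide version and 32/48/64 onto the 16 wide one.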
+function ff_hevc_put_hevc_qpel_uni_v24_8_neon, export=1
+ b X(ff_hevc_put_hevc_qpel_uni_v12_8_neon)
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_v32_8_neon, export=1
+ b X(ff_hevc_put_hevc_qpel_uni_v16_8_neon)
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_v48_8_neon, export=1
+ b X(ff_hevc_put_hevc_qpel_uni_v16_8_neon)
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_v64_8_neon, export=1
+ b X(ff_hevc_put_hevc_qpel_uni_v16_8_neon)
+endfunc
+
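+// qpel_uni_hv* (AAPCS64): x0 = uint8_t *dst, x1 = dststride, x2 = uint8_t *src,
+// x3 = srcstride, w4 = height, x5 = mx, x6 = my, w7 = width.
+// As in qpel_hv*, the horizontal pass goes through the plain qpel_h* functions
+// into a stack scratch buffer; the vertical pass then applies the combined
+// normalization of both passes as a single rounding shift (sqrshrn #12) before
+// the final saturating narrow to 8 bit (sqxtun).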
+function ff_hevc_put_hevc_qpel_uni_hv4_8_neon, export=1
+ add w10, w4, #7
+ lsl x10, x10, #7
+ sub sp, sp, x10 // tmp_array
+ stp x0, x1, [sp, #-16]!
+ stp x4, x6, [sp, #-16]!
+ sub x1, x2, x3, lsl #1
+ stp x30, xzr, [sp, #-16]!
+ sub x1, x1, x3
+ add x0, sp, #48
+ mov x2, x3
+ add x3, x4, #7
+ mov x4, x5
+ bl X(ff_hevc_put_hevc_qpel_h4_8_neon)
+ ldp x30, xzr, [sp], #16
+ ldp x4, x6, [sp], #16
+ mov x9, #(MAX_PB_SIZE * 2)
+ ldp x0, x1, [sp], #16
+ load_qpel_filterh x6, x5
+ ld1 {v16.4h}, [sp], x9
+ ld1 {v17.4h}, [sp], x9
+ ld1 {v18.4h}, [sp], x9
+ ld1 {v19.4h}, [sp], x9
+ ld1 {v20.4h}, [sp], x9
+ ld1 {v21.4h}, [sp], x9
+ ld1 {v22.4h}, [sp], x9
+.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
+ ld1 {\tmp\().4h}, [sp], x9
+ calc_qpelh v1, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqrshrn, #12
+ sqxtun v1.8b, v1.8h
+ subs w4, w4, #1
+ st1 {v1.s}[0], [x0], x1
+.endm
+1: calc_all
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_hv6_8_neon, export=1
+ add w10, w4, #7
+ lsl x10, x10, #7
+ sub sp, sp, x10 // tmp_array
+ stp x0, x1, [sp, #-16]!
+ stp x4, x6, [sp, #-16]!
+ sub x1, x2, x3, lsl #1
+ stp x30, xzr, [sp, #-16]!
+ sub x1, x1, x3
+ add x0, sp, #48
+ mov x2, x3
+ add w3, w4, #7
+ mov x4, x5
+ bl X(ff_hevc_put_hevc_qpel_h6_8_neon)
+ ldp x30, xzr, [sp], #16
+ ldp x4, x6, [sp], #16
+ mov x9, #(MAX_PB_SIZE * 2)
+ ldp x0, x1, [sp], #16
+ load_qpel_filterh x6, x5
+ sub x1, x1, #4
+ ld1 {v16.8h}, [sp], x9
+ ld1 {v17.8h}, [sp], x9
+ ld1 {v18.8h}, [sp], x9
+ ld1 {v19.8h}, [sp], x9
+ ld1 {v20.8h}, [sp], x9
+ ld1 {v21.8h}, [sp], x9
+ ld1 {v22.8h}, [sp], x9
+.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
+ ld1 {\tmp\().8h}, [sp], x9
+ calc_qpelh v1, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqrshrn, #12
+ calc_qpelh2 v1, v2, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqrshrn2, #12
+ sqxtun v1.8b, v1.8h
+ st1 {v1.s}[0], [x0], #4
+ subs w4, w4, #1
+ st1 {v1.h}[2], [x0], x1
+.endm
+1: calc_all
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_hv8_8_neon, export=1
+ add w10, w4, #7
+ lsl x10, x10, #7
+ sub sp, sp, x10 // tmp_array
+ stp x0, x1, [sp, #-16]!
+ stp x4, x6, [sp, #-16]!
+ sub x1, x2, x3, lsl #1
+ stp x30, xzr, [sp, #-16]!
+ sub x1, x1, x3
+ add x0, sp, #48
+ mov x2, x3
+ add w3, w4, #7
+ mov x4, x5
+ bl X(ff_hevc_put_hevc_qpel_h8_8_neon)
+ ldp x30, xzr, [sp], #16
+ ldp x4, x6, [sp], #16
+ mov x9, #(MAX_PB_SIZE * 2)
+ ldp x0, x1, [sp], #16
+ load_qpel_filterh x6, x5
+ ld1 {v16.8h}, [sp], x9
+ ld1 {v17.8h}, [sp], x9
+ ld1 {v18.8h}, [sp], x9
+ ld1 {v19.8h}, [sp], x9
+ ld1 {v20.8h}, [sp], x9
+ ld1 {v21.8h}, [sp], x9
+ ld1 {v22.8h}, [sp], x9
+.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
+ ld1 {\tmp\().8h}, [sp], x9
+ calc_qpelh v1, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqrshrn, #12
+ calc_qpelh2 v1, v2, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqrshrn2, #12
+ sqxtun v1.8b, v1.8h
+ subs w4, w4, #1
+ st1 {v1.8b}, [x0], x1
+.endm
+1: calc_all
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_hv12_8_neon, export=1
+ add w10, w4, #7
+ lsl x10, x10, #7
+ sub sp, sp, x10 // tmp_array
+ stp x0, x1, [sp, #-16]!
+ stp x4, x6, [sp, #-16]!
+ sub x1, x2, x3, lsl #1
+ stp x7, x30, [sp, #-16]!
+ sub x1, x1, x3
+ mov x2, x3
+ add x0, sp, #48
+ add w3, w4, #7
+ mov x4, x5
+ bl X(ff_hevc_put_hevc_qpel_h12_8_neon)
+ ldp x7, x30, [sp], #16
+ ldp x4, x6, [sp], #16
+ mov x9, #(MAX_PB_SIZE * 2)
+ ldp x0, x1, [sp], #16
+ load_qpel_filterh x6, x5
+ sub x1, x1, #8
+ ld1 {v16.8h, v17.8h}, [sp], x9
+ ld1 {v18.8h, v19.8h}, [sp], x9
+ ld1 {v20.8h, v21.8h}, [sp], x9
+ ld1 {v22.8h, v23.8h}, [sp], x9
+ ld1 {v24.8h, v25.8h}, [sp], x9
+ ld1 {v26.8h, v27.8h}, [sp], x9
+ ld1 {v28.8h, v29.8h}, [sp], x9
+.macro calc tmp0, tmp1, src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11, src12, src13, src14, src15
+ ld1 {\tmp0\().8h, \tmp1\().8h}, [sp], x9
+ calc_qpelh v1, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqrshrn, #12
+ calc_qpelh2 v1, v2, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqrshrn2, #12
+ calc_qpelh v2, \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15, sqrshrn, #12
+ sqxtun v1.8b, v1.8h
+ sqxtun2 v1.16b, v2.8h
+ st1 {v1.8b}, [x0], #8
+ subs w4, w4, #1
+ st1 {v1.s}[2], [x0], x1
+.endm
+1: calc_all2
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_hv16_8_neon, export=1
+ add w10, w4, #7
+ lsl x10, x10, #7
+ sub sp, sp, x10 // tmp_array
+ stp x0, x1, [sp, #-16]!
+ stp x4, x6, [sp, #-16]!
+ stp x7, x30, [sp, #-16]!
+ add x0, sp, #48
+ sub x1, x2, x3, lsl #1
+ sub x1, x1, x3
+ mov x2, x3
+ add w3, w4, #7
+ mov x4, x5
+ bl X(ff_hevc_put_hevc_qpel_h16_8_neon)
+ ldp x7, x30, [sp], #16
+ ldp x4, x6, [sp], #16
+ ldp x0, x1, [sp], #16
+.Lqpel_uni_hv16_loop:
+ mov x9, #(MAX_PB_SIZE * 2)
+ load_qpel_filterh x6, x5
+ sub w12, w9, w7, lsl #1
+0: mov x8, sp // src
+ ld1 {v16.8h, v17.8h}, [x8], x9
+ mov w11, w4 // height
+ ld1 {v18.8h, v19.8h}, [x8], x9
+ mov x10, x0 // dst
+ ld1 {v20.8h, v21.8h}, [x8], x9
+ ld1 {v22.8h, v23.8h}, [x8], x9
+ ld1 {v24.8h, v25.8h}, [x8], x9
+ ld1 {v26.8h, v27.8h}, [x8], x9
+ ld1 {v28.8h, v29.8h}, [x8], x9
+.macro calc tmp0, tmp1, src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11, src12, src13, src14, src15
+ ld1 {\tmp0\().8h, \tmp1\().8h}, [x8], x9
+ calc_qpelh v1, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqrshrn, #12
+ calc_qpelh2 v1, v2, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqrshrn2, #12
+ calc_qpelh v2, \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15, sqrshrn, #12
+ calc_qpelh2 v2, v3, \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15, sqrshrn2, #12
+ sqxtun v1.8b, v1.8h
+ subs x11, x11, #1
+ sqxtun2 v1.16b, v2.8h
+ st1 {v1.16b}, [x10], x1
+.endm
+1: calc_all2
+.purgem calc
+2: add x0, x0, #16
+ add sp, sp, #32
+ subs w7, w7, #16
+ b.ne 0b
+ add w10, w4, #6
+ add sp, sp, x12 // discard rest of first line
+ lsl x10, x10, #7
+ add sp, sp, x10 // tmp_array without first line
+ ret
+endfunc
+
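+// qpel_uni_hv24 is split into a 16 and an 8 wide call; the 32/48/64 wide
+// variants below run their own horizontal pass and then branch into the shared
+// vertical loop at .Lqpel_uni_hv16_loop, which walks the width in w7 in 16
+// column strips.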
+function ff_hevc_put_hevc_qpel_uni_hv24_8_neon, export=1
+ stp x6, x30, [sp, #-16]!
+ mov x7, #16
+ stp x0, x1, [sp, #-16]!
+ stp x2, x3, [sp, #-16]!
+ stp x4, x5, [sp, #-16]!
+ bl X(ff_hevc_put_hevc_qpel_uni_hv16_8_neon)
+ ldp x4, x5, [sp], #16
+ ldp x2, x3, [sp], #16
+ add x2, x2, #16
+ ldp x0, x1, [sp], #16
+ mov x7, #8
+ add x0, x0, #16
+ ldr x6, [sp]
+ bl X(ff_hevc_put_hevc_qpel_uni_hv8_8_neon)
+ ldp xzr, x30, [sp], #16
+ ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_hv32_8_neon, export=1
+ add w10, w4, #7
+ lsl x10, x10, #7
+ sub sp, sp, x10 // tmp_array
+ stp x0, x1, [sp, #-16]!
+ stp x4, x6, [sp, #-16]!
+ stp x7, x30, [sp, #-16]!
+ sub x1, x2, x3, lsl #1
+ add x0, sp, #48
+ sub x1, x1, x3
+ mov x2, x3
+ add w3, w4, #7
+ mov x4, x5
+ bl X(ff_hevc_put_hevc_qpel_h32_8_neon)
+ ldp x7, x30, [sp], #16
+ ldp x4, x6, [sp], #16
+ ldp x0, x1, [sp], #16
+ b .Lqpel_uni_hv16_loop
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_hv48_8_neon, export=1
+ add w10, w4, #7
+ lsl x10, x10, #7
+ sub sp, sp, x10 // tmp_array
+ stp x0, x1, [sp, #-16]!
+ stp x4, x6, [sp, #-16]!
+ sub x1, x2, x3, lsl #1
+ stp x7, x30, [sp, #-16]!
+ sub x1, x1, x3
+ mov x2, x3
+ add x0, sp, #48
+ add w3, w4, #7
+ mov x4, x5
+ bl X(ff_hevc_put_hevc_qpel_h48_8_neon)
+ ldp x7, x30, [sp], #16
+ ldp x4, x6, [sp], #16
+ ldp x0, x1, [sp], #16
+ b .Lqpel_uni_hv16_loop
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_hv64_8_neon, export=1
+ add w10, w4, #7
+ lsl x10, x10, #7
+ sub sp, sp, x10 // tmp_array
+ stp x0, x1, [sp, #-16]!
+ stp x4, x6, [sp, #-16]!
+ stp x7, x30, [sp, #-16]!
+ add x0, sp, #48
+ sub x1, x2, x3, lsl #1
+ mov x2, x3
+ sub x1, x1, x3
+ add w3, w4, #7
+ mov x4, x5
+ bl X(ff_hevc_put_hevc_qpel_h64_8_neon)
+ ldp x7, x30, [sp], #16
+ ldp x4, x6, [sp], #16
+ ldp x0, x1, [sp], #16
+ b .Lqpel_uni_hv16_loop
+endfunc
+
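+// qpel_bi_h* (AAPCS64): x0 = uint8_t *dst, x1 = dststride, x2 = uint8_t *src,
+// x3 = srcstride, x4 = int16_t *src2 (first prediction, MAX_PB_SIZE elements
+// per row), w5 = height, x6 = mx, x7 = my; width is passed on the stack.
+// The horizontal filter result is combined with src2 using a saturating add
+// and narrowed with the bi-prediction rounding shift sqrshrun #7.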
+function ff_hevc_put_hevc_qpel_bi_h4_8_neon, export=1
+ load_qpel_filterb x6, x7
+ sub x2, x2, #3
+ mov x10, #(MAX_PB_SIZE * 2)
+1: ld1 {v16.8b, v17.8b}, [x2], x3
+ movi v20.8h, #0
+.macro calc op, idx
+ \op v20.8h, v16.8b, v\idx\().8b
+ ushr v16.2d, v16.2d, #8
+ mov v16.b[7], v17.b[\idx]
+.endm
+ calc umlsl, 0
+ calc umlal, 1
+ calc umlsl, 2
+ calc umlal, 3
+ calc umlal, 4
+ calc umlsl, 5
+ calc umlal, 6
+ umlsl v20.8h, v16.8b, v7.8b
+ ld1 {v24.8h}, [x4], x10
+ sqadd v16.8h, v20.8h, v24.8h
+ sqrshrun v16.8b, v16.8h, #7
+ subs w5, w5, #1
+ st1 {v16.s}[0], [x0], x1
+ b.ne 1b
+ ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_bi_h6_8_neon, export=1
+ load_qpel_filterb x6, x7
+ sub x2, x2, #3
+ sub x1, x1, #4
+ mov x10, #(MAX_PB_SIZE * 2)
+1: ld1 {v16.8b, v17.8b}, [x2], x3
+ movi v20.8h, #0
+ // reuses the calc macro defined in qpel_bi_h4 above
+ calc umlsl, 0
+ calc umlal, 1
+ calc umlsl, 2
+ calc umlal, 3
+ calc umlal, 4
+ calc umlsl, 5
+ calc umlal, 6
+ umlsl v20.8h, v16.8b, v7.8b
+ ld1 {v24.8h}, [x4], x10
+ sqadd v16.8h, v20.8h, v24.8h
+ sqrshrun v16.8b, v16.8h, #7
+ st1 {v16.s}[0], [x0], #4
+ subs w5, w5, #1
+ st1 {v16.h}[2], [x0], x1
+ b.ne 1b
+ ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_bi_h8_8_neon, export=1
+ load_qpel_filterb x6, x7
+ sub x2, x2, #3
+ mov x10, #(MAX_PB_SIZE * 2)
+1: ld1 {v16.8b, v17.8b}, [x2], x3
+ movi v20.8h, #0
+ // reuses the calc macro defined in qpel_bi_h4 above
+ calc umlsl, 0
+ calc umlal, 1
+ calc umlsl, 2
+ calc umlal, 3
+ calc umlal, 4
+ calc umlsl, 5
+ calc umlal, 6
+ umlsl v20.8h, v16.8b, v7.8b
+.purgem calc
+ ld1 {v24.8h}, [x4], x10
+ sqadd v16.8h, v20.8h, v24.8h
+ sqrshrun v16.8b, v16.8h, #7
+ subs w5, w5, #1
+ st1 {v16.8b}, [x0], x1
+ b.ne 1b
+ ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_bi_h12_8_neon, export=1
+ load_qpel_filterb x6, x7
+ sub x2, x2, #3
+ sub x1, x1, #8
+ mov x10, #(MAX_PB_SIZE * 2)
+1: ld2 {v16.8b, v17.8b}, [x2]
+ movi v20.8h, #0
+ ldr w12, [x2, #16]
+ movi v21.8h, #0
+.macro calc op1, op2, r0, r1, r2, src0, src1, tail=0
+ \op1 \r0\().8h, \r1\().8b, \src0\().8b
+ \op2 \r0\().8h, \r2\().8b, \src1\().8b
+.if \tail-1
+ ushr \r1\().2d, \r1\().2d, #8
+.endif
+.endm
+ calc umlsl, umlal, v20, v16, v17, v0, v1
+ mov v16.b[7], w12
+ lsr x12, x12, #8
+ calc umlsl, umlal, v21, v17, v16, v0, v1
+ mov v17.b[7], w12
+ lsr x12, x12, #8
+ calc umlsl, umlal, v20, v16, v17, v2, v3
+ mov v16.b[7], w12
+ calc umlsl, umlal, v21, v17, v16, v2, v3
+ calc umlal, umlsl, v20, v16, v17, v4, v5
+ calc umlal, umlsl, v21, v17, v16, v4, v5
+ calc umlal, umlsl, v20, v16, v17, v6, v7
+ calc umlal, umlsl, v21, v17, v16, v6, v7, 1
+.purgem calc
+ ld2 {v24.8h, v25.8h}, [x4], x10
+ sqadd v16.8h, v20.8h, v24.8h
+ sqadd v17.8h, v21.8h, v25.8h
+ sqrshrun v16.8b, v16.8h, #7
+ sqrshrun v17.8b, v17.8h, #7
+ zip1 v16.16b, v16.16b, v17.16b
+ st1 {v16.8b}, [x0], #8
+ subs w5, w5, #1
+ st1 {v16.s}[2], [x0], x1
+ add x2, x2, x3
+ b.ne 1b
+ ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_bi_h16_8_neon, export=1
+ load_qpel_filterb x6, x7
+ sub x2, x2, #3
+ mov x10, #(MAX_PB_SIZE * 2)
+1: ld2 {v16.8b, v17.8b}, [x2]
+ movi v20.8h, #0
+ ldr x12, [x2, #16]
+ movi v21.8h, #0
+.macro calc op1, op2, r0, r1, r2, src0, src1, tail=0
+ \op1 \r0\().8h, \r1\().8b, \src0\().8b
+ \op2 \r0\().8h, \r2\().8b, \src1\().8b
+ ushr \r1\().2d, \r1\().2d, #8
+ mov \r1\().b[7], w12
+.if \tail-1
+ lsr x12, x12, #8
+.endif
+.endm
+ calc umlsl, umlal, v20, v16, v17, v0, v1
+ calc umlsl, umlal, v21, v17, v16, v0, v1
+ calc umlsl, umlal, v20, v16, v17, v2, v3
+ calc umlsl, umlal, v21, v17, v16, v2, v3
+ calc umlal, umlsl, v20, v16, v17, v4, v5
+ calc umlal, umlsl, v21, v17, v16, v4, v5
+ calc umlal, umlsl, v20, v16, v17, v6, v7, 1
+ umlal v21.8h, v17.8b, v6.8b
+ umlsl v21.8h, v16.8b, v7.8b
+.purgem calc
+ ld2 {v24.8h, v25.8h}, [x4], x10
+ sqadd v16.8h, v20.8h, v24.8h
+ sqadd v17.8h, v21.8h, v25.8h
+ sqrshrun v16.8b, v16.8h, #7
+ sqrshrun v17.8b, v17.8h, #7
+ subs w5, w5, #1
+ st2 {v16.8b, v17.8b}, [x0], x1
+ add x2, x2, x3
+ b.ne 1b
+ ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_bi_h24_8_neon, export=1
+ load_qpel_filterb x6, x7
+ sub x2, x2, #3
+ mov x10, #(MAX_PB_SIZE * 2)
+ mov x11, x7 // my (unused)
+1: ld3 {v16.8b-v18.8b}, [x2]
+ movi v20.8h, #0
+ movi v21.8h, #0
+ ldr x12, [x2, #24]
+ movi v22.8h, #0
+.macro calc op1, op2, r0, r1, r2, r3, src0, src1, src2, tail=0
+ \op1 \r0\().8h, \r1\().8b, \src0\().8b
+ \op2 \r0\().8h, \r2\().8b, \src1\().8b
+ umlsl \r0\().8h, \r3\().8b, \src2\().8b
+ ushr \r1\().2d, \r1\().2d, #8
+ mov \r1\().b[7], w12
+ lsr x12, x12, #8
+.endm
+ calc umlsl, umlal, v20, v16, v17, v18, v0, v1, v2
+ calc umlsl, umlal, v21, v17, v18, v16, v0, v1, v2
+ calc umlsl, umlal, v22, v18, v16, v17, v0, v1, v2
+ calc umlal, umlal, v20, v16, v17, v18, v3, v4, v5
+ calc umlal, umlal, v21, v17, v18, v16, v3, v4, v5
+ calc umlal, umlal, v22, v18, v16, v17, v3, v4, v5
+.purgem calc
+ umlal v20.8h, v16.8b, v6.8b
+ umlsl v20.8h, v17.8b, v7.8b
+ ushr v16.2d, v16.2d, #8
+ mov v16.b[7], w12
+ umlal v21.8h, v17.8b, v6.8b
+ umlsl v21.8h, v18.8b, v7.8b
+ umlal v22.8h, v18.8b, v6.8b
+ umlsl v22.8h, v16.8b, v7.8b
+ ld3 {v23.8h, v24.8h, v25.8h}, [x4], x10
+ sqadd v16.8h, v20.8h, v23.8h
+ sqadd v17.8h, v21.8h, v24.8h
+ sqadd v18.8h, v22.8h, v25.8h
+ sqrshrun v16.8b, v16.8h, #7
+ sqrshrun v17.8b, v17.8h, #7
+ sqrshrun v18.8b, v18.8h, #7
+ subs w5, w5, #1
+ st3 {v16.8b, v17.8b, v18.8b}, [x0], x1
+ add x2, x2, x3
+ b.ne 1b
+ ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_bi_h32_8_neon, export=1
+ load_qpel_filterb x6, x7
+ sub x2, x2, #3
+ mov x10, #(MAX_PB_SIZE * 2)
+ mov x11, x7 // my (unused)
+1: ld4 {v16.8b-v19.8b}, [x2]
+ movi v20.8h, #0
+ movi v21.8h, #0
+ movi v22.8h, #0
+ ldr x12, [x2, #32]
+ movi v23.8h, #0
+.macro calc op1, op2, dst, r0, r1, r2, r3, src0, src1, src2, src3, tail=0
+ \op1 \dst\().8h, \r0\().8b, \src0\().8b
+ \op2 \dst\().8h, \r1\().8b, \src1\().8b
+ \op1 \dst\().8h, \r2\().8b, \src2\().8b
+ \op2 \dst\().8h, \r3\().8b, \src3\().8b
+ ushr \r0\().2d, \r0\().2d, #8
+ mov \r0\().b[7], w12
+.if \tail-1
+ lsr x12, x12, #8
+.endif
+.endm
+ calc umlsl, umlal, v20, v16, v17, v18, v19, v0, v1, v2, v3
+ calc umlsl, umlal, v21, v17, v18, v19, v16, v0, v1, v2, v3
+ calc umlsl, umlal, v22, v18, v19, v16, v17, v0, v1, v2, v3
+ calc umlsl, umlal, v23, v19, v16, v17, v18, v0, v1, v2, v3
+ calc umlal, umlsl, v20, v16, v17, v18, v19, v4, v5, v6, v7
+ calc umlal, umlsl, v21, v17, v18, v19, v16, v4, v5, v6, v7
+ calc umlal, umlsl, v22, v18, v19, v16, v17, v4, v5, v6, v7, 1
+.purgem calc
+ umlal v23.8h, v19.8b, v4.8b
+ umlsl v23.8h, v16.8b, v5.8b
+ umlal v23.8h, v17.8b, v6.8b
+ umlsl v23.8h, v18.8b, v7.8b
+ ld4 {v24.8h, v25.8h, v26.8h, v27.8h}, [x4], x10
+ sqadd v16.8h, v20.8h, v24.8h
+ sqadd v17.8h, v21.8h, v25.8h
+ sqadd v18.8h, v22.8h, v26.8h
+ sqadd v19.8h, v23.8h, v27.8h
+ sqrshrun v16.8b, v16.8h, #7
+ sqrshrun v17.8b, v17.8h, #7
+ sqrshrun v18.8b, v18.8h, #7
+ sqrshrun v19.8b, v19.8h, #7
+ st4 {v16.8b-v19.8b}, [x0], x1
+ add x2, x2, x3
+ subs w5, w5, #1
+ b.ne 1b
+ ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_bi_h48_8_neon, export=1
+ load_qpel_filterb x6, x7
+ sub x2, x2, #3
+ mov x10, #80 // MAX_PB_SIZE * 2 - 48: step over the rest of the src2 row
+ mov x11, x7 // my (unused)
+1: ld3 {v16.16b-v18.16b}, [x2]
+ ldr x12, [x2, #24]
+ ldr x13, [x2, #48]
+ movi v20.8h, #0
+ movi v21.8h, #0
+ movi v22.8h, #0
+ movi v23.8h, #0
+ movi v24.8h, #0
+ movi v25.8h, #0
+.macro calc op1, op2, dst0, dst1, r0, r1, r2, src0, src1, src2, tail=0
+ \op1 \dst0\().8h, \r0\().8b, \src0\().8b
+ \op2 \dst0\().8h, \r1\().8b, \src1\().8b
+ umlsl \dst0\().8h, \r2\().8b, \src2\().8b
+ \op1\()2 \dst1\().8h, \r0\().16b, \src0\().16b
+ \op2\()2 \dst1\().8h, \r1\().16b, \src1\().16b
+ umlsl2 \dst1\().8h, \r2\().16b, \src2\().16b
+ ushr \r0\().2d, \r0\().2d, #8
+ mov \r0\().b[7], w12
+ mov \r0\().b[15], w13
+ lsr x12, x12, #8
+ lsr x13, x13, #8
+.endm
+ calc umlsl, umlal, v20, v23, v16, v17, v18, v0, v1, v2
+ calc umlsl, umlal, v21, v24, v17, v18, v16, v0, v1, v2
+ calc umlsl, umlal, v22, v25, v18, v16, v17, v0, v1, v2
+ calc umlal, umlal, v20, v23, v16, v17, v18, v3, v4, v5
+ calc umlal, umlal, v21, v24, v17, v18, v16, v3, v4, v5
+ calc umlal, umlal, v22, v25, v18, v16, v17, v3, v4, v5
+.purgem calc
+ umlal v20.8h, v16.8b, v6.8b
+ umlsl v20.8h, v17.8b, v7.8b
+ umlal2 v23.8h, v16.16b, v6.16b
+ umlsl2 v23.8h, v17.16b, v7.16b
+ ushr v16.2d, v16.2d, #8
+ mov v16.b[7], w12
+ mov v16.b[15], w13
+ umlal v21.8h, v17.8b, v6.8b
+ umlsl v21.8h, v18.8b, v7.8b
+ umlal2 v24.8h, v17.16b, v6.16b
+ umlsl2 v24.8h, v18.16b, v7.16b
+ umlal v22.8h, v18.8b, v6.8b
+ umlsl v22.8h, v16.8b, v7.8b
+ umlal2 v25.8h, v18.16b, v6.16b
+ umlsl2 v25.8h, v16.16b, v7.16b
+ ld3 {v26.8h, v27.8h, v28.8h}, [x4], #48
+ sqadd v16.8h, v20.8h, v26.8h
+ sqadd v17.8h, v21.8h, v27.8h
+ sqadd v18.8h, v22.8h, v28.8h
+ ld3 {v26.8h, v27.8h, v28.8h}, [x4], x10
+ sqadd v19.8h, v23.8h, v26.8h
+ sqadd v20.8h, v24.8h, v27.8h
+ sqadd v21.8h, v25.8h, v28.8h
+ sqrshrun v16.8b, v16.8h, #7
+ sqrshrun v17.8b, v17.8h, #7
+ sqrshrun v18.8b, v18.8h, #7
+ sqrshrun2 v16.16b, v19.8h, #7
+ sqrshrun2 v17.16b, v20.8h, #7
+ sqrshrun2 v18.16b, v21.8h, #7
+ subs w5, w5, #1
+ st3 {v16.16b-v18.16b}, [x0], x1
+ add x2, x2, x3
+ b.ne 1b
+ ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_bi_h64_8_neon, export=1
+ load_qpel_filterb x6, x7
+ sub x2, x2, #3
+1: ld4 {v16.16b-v19.16b}, [x2]
+ movi v20.8h, #0
+ movi v21.8h, #0
+ movi v22.8h, #0
+ movi v23.8h, #0
+ ldr x12, [x2, #32]
+ movi v24.8h, #0
+ movi v25.8h, #0
+ ldr x13, [x2, #64]
+ movi v26.8h, #0
+ movi v27.8h, #0
+.macro calc op1, op2, dst0, dst1, r0, r1, r2, r3, src0, src1, src2, src3, tail=0
+ \op1 \dst0\().8h, \r0\().8b, \src0\().8b
+ \op2 \dst0\().8h, \r1\().8b, \src1\().8b
+ \op1 \dst0\().8h, \r2\().8b, \src2\().8b
+ \op2 \dst0\().8h, \r3\().8b, \src3\().8b
+ \op1\()2 \dst1\().8h, \r0\().16b, \src0\().16b
+ \op2\()2 \dst1\().8h, \r1\().16b, \src1\().16b
+ \op1\()2 \dst1\().8h, \r2\().16b, \src2\().16b
+ \op2\()2 \dst1\().8h, \r3\().16b, \src3\().16b
+.if \tail-1
+ ushr \r0\().2d, \r0\().2d, #8
+ mov \r0\().b[7], w12
+ lsr x12, x12, #8
+ mov \r0\().b[15], w13
+ lsr x13, x13, #8
+.endif
+.endm
+ calc umlsl, umlal, v20, v24, v16, v17, v18, v19, v0, v1, v2, v3
+ calc umlsl, umlal, v21, v25, v17, v18, v19, v16, v0, v1, v2, v3
+ calc umlsl, umlal, v22, v26, v18, v19, v16, v17, v0, v1, v2, v3
+ calc umlsl, umlal, v23, v27, v19, v16, v17, v18, v0, v1, v2, v3
+ calc umlal, umlsl, v20, v24, v16, v17, v18, v19, v4, v5, v6, v7
+ calc umlal, umlsl, v21, v25, v17, v18, v19, v16, v4, v5, v6, v7
+ calc umlal, umlsl, v22, v26, v18, v19, v16, v17, v4, v5, v6, v7
+ calc umlal, umlsl, v23, v27, v19, v16, v17, v18, v4, v5, v6, v7, 1
+.purgem calc
+ ld4 {v28.8h-v31.8h}, [x4], #64
+ sqadd v20.8h, v20.8h, v28.8h
+ sqadd v21.8h, v21.8h, v29.8h
+ sqadd v22.8h, v22.8h, v30.8h
+ sqadd v23.8h, v23.8h, v31.8h
+ ld4 {v28.8h-v31.8h}, [x4], #64
+ sqadd v24.8h, v24.8h, v28.8h
+ sqadd v25.8h, v25.8h, v29.8h
+ sqadd v26.8h, v26.8h, v30.8h
+ sqadd v27.8h, v27.8h, v31.8h
+ sqrshrun v16.8b, v20.8h, #7
+ sqrshrun v17.8b, v21.8h, #7
+ sqrshrun v18.8b, v22.8h, #7
+ sqrshrun v19.8b, v23.8h, #7
+ sqrshrun2 v16.16b, v24.8h, #7
+ sqrshrun2 v17.16b, v25.8h, #7
+ sqrshrun2 v18.16b, v26.8h, #7
+ sqrshrun2 v19.16b, v27.8h, #7
+ subs w5, w5, #1
+ st4 {v16.16b-v19.16b}, [x0], x1
+ add x2, x2, x3
+ b.ne 1b
+ ret
+endfunc
+
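+// qpel_bi_v*: same argument layout as qpel_bi_h*, with the filter selected by
+// my (x7). As in the uni vertical path, seven source rows are cached in
+// registers and one new row is loaded per output row; src2 is consumed with a
+// MAX_PB_SIZE * 2 byte stride and added with sqadd before sqrshrun #7.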
+function ff_hevc_put_hevc_qpel_bi_v4_8_neon, export=1
+ load_qpel_filterb x7, x6
+ sub x2, x2, x3, lsl #1
+ sub x2, x2, x3
+ mov x12, #(MAX_PB_SIZE * 2)
+ ld1 {v16.s}[0], [x2], x3
+ ld1 {v17.s}[0], [x2], x3
+ ld1 {v18.s}[0], [x2], x3
+ ld1 {v19.s}[0], [x2], x3
+ ld1 {v20.s}[0], [x2], x3
+ ld1 {v21.s}[0], [x2], x3
+ ld1 {v22.s}[0], [x2], x3
+.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
+ ld1 {\tmp\().s}[0], [x2], x3
+ movi v24.8h, #0
+ calc_qpelb v24, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+ ld1 {v25.4h}, [x4], x12 // src2
+ sqadd v24.8h, v24.8h, v25.8h
+ sqrshrun v25.8b, v24.8h, #7
+ subs w5, w5, #1
+ st1 {v25.s}[0], [x0], x1
+.endm
+1: calc_all
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_bi_v6_8_neon, export=1
+ load_qpel_filterb x7, x6
+ sub x2, x2, x3, lsl #1
+ sub x2, x2, x3
+ ld1 {v16.8b}, [x2], x3
+ sub x1, x1, #4
+ ld1 {v17.8b}, [x2], x3
+ mov x12, #(MAX_PB_SIZE * 2)
+ ld1 {v18.8b}, [x2], x3
+ ld1 {v19.8b}, [x2], x3
+ ld1 {v20.8b}, [x2], x3
+ ld1 {v21.8b}, [x2], x3
+ ld1 {v22.8b}, [x2], x3
+.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
+ ld1 {\tmp\().8b}, [x2], x3
+ movi v24.8h, #0
+ calc_qpelb v24, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+ ld1 {v25.8h}, [x4], x12 // src2
+ sqadd v24.8h, v24.8h, v25.8h
+ sqrshrun v25.8b, v24.8h, #7
+ st1 {v25.s}[0], [x0], #4
+ subs w5, w5, #1
+ st1 {v25.h}[2], [x0], x1
+.endm
+1: calc_all
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_bi_v8_8_neon, export=1
+ load_qpel_filterb x7, x6
+ sub x2, x2, x3, lsl #1
+ sub x2, x2, x3
+ mov x12, #(MAX_PB_SIZE * 2)
+ ld1 {v16.8b}, [x2], x3
+ ld1 {v17.8b}, [x2], x3
+ ld1 {v18.8b}, [x2], x3
+ ld1 {v19.8b}, [x2], x3
+ ld1 {v20.8b}, [x2], x3
+ ld1 {v21.8b}, [x2], x3
+ ld1 {v22.8b}, [x2], x3
+.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
+ ld1 {\tmp\().8b}, [x2], x3
+ movi v24.8h, #0
+ calc_qpelb v24, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+ ld1 {v25.8h}, [x4], x12 // src2
+ sqadd v24.8h, v24.8h, v25.8h
+ sqrshrun v25.8b, v24.8h, #7
+ subs w5, w5, #1
+ st1 {v25.8b}, [x0], x1
+.endm
+1: calc_all
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_bi_v12_8_neon, export=1
+ load_qpel_filterb x7, x6
+ sub x2, x2, x3, lsl #1
+ sub x2, x2, x3
+ sub x1, x1, #8
+ ld1 {v16.16b}, [x2], x3
+ mov x12, #(MAX_PB_SIZE * 2)
+ ld1 {v17.16b}, [x2], x3
+ ld1 {v18.16b}, [x2], x3
+ ld1 {v19.16b}, [x2], x3
+ ld1 {v20.16b}, [x2], x3
+ ld1 {v21.16b}, [x2], x3
+ ld1 {v22.16b}, [x2], x3
+.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
+ ld1 {\tmp\().16b}, [x2], x3
+ movi v24.8h, #0
+ movi v25.8h, #0
+ calc_qpelb v24, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+ calc_qpelb2 v25, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+ ld1 {v26.8h, v27.8h}, [x4], x12 // src2
+ sqadd v24.8h, v24.8h, v26.8h
+ sqadd v25.8h, v25.8h, v27.8h
+ sqrshrun v26.8b, v24.8h, #7
+ sqrshrun2 v26.16b, v25.8h, #7
+ st1 {v26.8b}, [x0], #8
+ subs w5, w5, #1
+ st1 {v26.s}[2], [x0], x1
+.endm
+1: calc_all
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_bi_v16_8_neon, export=1
+ load_qpel_filterb x7, x6
+ sub x2, x2, x3, lsl #1
+ sub x2, x2, x3
+ mov x12, #(MAX_PB_SIZE * 2)
+ ld1 {v16.16b}, [x2], x3
+ ld1 {v17.16b}, [x2], x3
+ ld1 {v18.16b}, [x2], x3
+ ld1 {v19.16b}, [x2], x3
+ ld1 {v20.16b}, [x2], x3
+ ld1 {v21.16b}, [x2], x3
+ ld1 {v22.16b}, [x2], x3
+.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
+ ld1 {\tmp\().16b}, [x2], x3
+ movi v24.8h, #0
+ movi v25.8h, #0
+ calc_qpelb v24, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+ calc_qpelb2 v25, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+ ld1 {v26.8h, v27.8h}, [x4], x12 // src2
+ sqadd v24.8h, v24.8h, v26.8h
+ sqadd v25.8h, v25.8h, v27.8h
+ sqrshrun v26.8b, v24.8h, #7
+ subs w5, w5, #1
+ sqrshrun2 v26.16b, v25.8h, #7
+ st1 {v26.16b}, [x0], x1
+.endm
+1: calc_all
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_bi_v24_8_neon, export=1
+ stp x7, x30, [sp, #-16]!
+ stp x0, x1, [sp, #-16]!
+ stp x2, x3, [sp, #-16]!
+ stp x4, x5, [sp, #-16]!
+ bl X(ff_hevc_put_hevc_qpel_bi_v16_8_neon)
+ ldp x4, x5, [sp], #16
+ ldp x2, x3, [sp], #16
+ ldp x0, x1, [sp], #16
+ ldr x7, [sp]
+ add x0, x0, #16
+ add x2, x2, #16
+ add x4, x4, #32
+ bl X(ff_hevc_put_hevc_qpel_bi_v8_8_neon)
+ ldp xzr, x30, [sp], #16
+ ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_bi_v32_8_neon, export=1
+ sub sp, sp, #64
+ st1 {v12.16b-v15.16b}, [sp]
+ sub x2, x2, x3, lsl #1
+ sub sp, sp, #64
+ st1 {v8.16b-v11.16b}, [sp]
+ sub x2, x2, x3
+ load_qpel_filterb x7, x6
+ ldr w6, [sp, #128]
+ mov x12, #(MAX_PB_SIZE * 2)
+0: mov x8, x2 // src
+ ld1 {v16.16b, v17.16b}, [x8], x3
+ mov w11, w5 // height
+ ld1 {v18.16b, v19.16b}, [x8], x3
+ mov x10, x0 // dst
+ ld1 {v20.16b, v21.16b}, [x8], x3
+ mov x9, x4 // src2
+ ld1 {v22.16b, v23.16b}, [x8], x3
+ ld1 {v24.16b, v25.16b}, [x8], x3
+ ld1 {v26.16b, v27.16b}, [x8], x3
+ ld1 {v28.16b, v29.16b}, [x8], x3
+.macro calc tmp0, tmp1, src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11, src12, src13, src14, src15
+ ld1 {\tmp0\().8h, \tmp1\().8h}, [x8], x3
+ movi v8.8h, #0
+ movi v9.8h, #0
+ movi v10.8h, #0
+ movi v11.8h, #0
+ calc_qpelb v8, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+ calc_qpelb2 v9, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+ calc_qpelb v10, \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15
+ calc_qpelb2 v11, \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15
+ ld1 {v12.8h, v13.8h, v14.8h, v15.8h}, [x9], x12 // src2
+ sqadd v8.8h, v8.8h, v12.8h
+ sqadd v9.8h, v9.8h, v13.8h
+ sqadd v10.8h, v10.8h, v14.8h
+ sqadd v11.8h, v11.8h, v15.8h
+ sqrshrun v12.8b, v8.8h, #7
+ sqrshrun2 v12.16b, v9.8h, #7
+ sqrshrun v13.8b, v10.8h, #7
+ sqrshrun2 v13.16b, v11.8h, #7
+ subs x11, x11, #1
+ st1 {v12.16b, v13.16b}, [x10], x1
+.endm
+1: calc_all2
+.purgem calc
+2: add x0, x0, #32 // dst
+ add x2, x2, #32 // src
+ add x4, x4, #64 // src2
+ subs w6, w6, #32
+ b.ne 0b
+ ld1 {v8.16b-v11.16b}, [sp], #64
+ ld1 {v12.16b-v15.16b}, [sp], #64
+ ret
+endfunc
+
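+// qpel_bi_v48 handles a 32 wide strip plus a 16 wide strip: it pushes 32 as
+// the stack width argument that qpel_bi_v32 reads back above (ldr w6,
+// [sp, #128]), then finishes the remaining 16 columns with qpel_bi_v16.
+// qpel_bi_v64 tail calls qpel_bi_v32, which loops on the stack width argument.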
+function ff_hevc_put_hevc_qpel_bi_v48_8_neon, export=1
+ stp x7, x30, [sp, #-16]!
+ stp x0, x1, [sp, #-16]!
+ stp x2, x3, [sp, #-16]!
+ stp x4, x5, [sp, #-16]!
+ mov x8, #32
+ stp x8, x8, [sp, #-16]!
+ bl X(ff_hevc_put_hevc_qpel_bi_v32_8_neon)
+ ldp x8, xzr, [sp], #16
+ ldp x4, x5, [sp], #16
+ ldp x2, x3, [sp], #16
+ ldp x0, x1, [sp], #16
+ ldr x7, [sp]
+ add x0, x0, #32
+ add x2, x2, #32
+ add x4, x4, #64
+ bl X(ff_hevc_put_hevc_qpel_bi_v16_8_neon)
+ ldp xzr, x30, [sp], #16
+ ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_bi_v64_8_neon, export=1
+ b X(ff_hevc_put_hevc_qpel_bi_v32_8_neon)
+endfunc
+
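+// qpel_bi_hv* (AAPCS64): x0 = uint8_t *dst, x1 = dststride, x2 = uint8_t *src,
+// x3 = srcstride, x4 = int16_t *src2, w5 = height, x6 = mx, x7 = my.
+// The horizontal pass again goes through qpel_h* into a stack scratch buffer.
+// The vertical filter result is kept in 32 bit lanes (the sshr variant of
+// calc_qpelh), src2 is added with a widening add, and the sum is narrowed with
+// a rounding shift by 7 before the final saturating narrow to 8 bit.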
+function ff_hevc_put_hevc_qpel_bi_hv4_8_neon, export=1
+ add w10, w5, #7
+ lsl x10, x10, #7
+ sub sp, sp, x10 // tmp_array
+ stp x0, x1, [sp, #-16]!
+ stp x4, x5, [sp, #-16]!
+ sub x1, x2, x3, lsl #1
+ stp x7, x30, [sp, #-16]!
+ sub x1, x1, x3
+ add x0, sp, #48
+ mov x2, x3
+ add w3, w5, #7
+ mov x4, x6
+ bl X(ff_hevc_put_hevc_qpel_h4_8_neon)
+ ldp x7, x30, [sp], #16
+ ldp x4, x5, [sp], #16
+ mov x9, #(MAX_PB_SIZE * 2)
+ ldp x0, x1, [sp], #16
+ load_qpel_filterh x7, x6
+ ld1 {v16.4h}, [sp], x9
+ ld1 {v17.4h}, [sp], x9
+ ld1 {v18.4h}, [sp], x9
+ ld1 {v19.4h}, [sp], x9
+ ld1 {v20.4h}, [sp], x9
+ ld1 {v21.4h}, [sp], x9
+ ld1 {v22.4h}, [sp], x9
+.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
+ ld1 {\tmp\().4h}, [sp], x9
+ calc_qpelh v1, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sshr
+ ld1 {v5.4h}, [x4], x9 // src2
+ saddw v1.4s, v1.4s, v5.4h
+ rshrn v1.4h, v1.4s, #7
+ sqxtun v1.8b, v1.8h
+ subs w5, w5, #1
+ st1 {v1.s}[0], [x0], x1
+.endm
+1: calc_all
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_bi_hv6_8_neon, export=1
+ add w10, w5, #7
+ lsl x10, x10, #7
+ sub sp, sp, x10 // tmp_array
+ stp x0, x1, [sp, #-16]!
+ stp x4, x5, [sp, #-16]!
+ sub x1, x2, x3, lsl #1
+ stp x7, x30, [sp, #-16]!
+ sub x1, x1, x3
+ add x0, sp, #48
+ mov x2, x3
+ add x3, x5, #7
+ mov x4, x6
+ bl X(ff_hevc_put_hevc_qpel_h6_8_neon)
+ ldp x7, x30, [sp], #16
+ ldp x4, x5, [sp], #16
+ mov x9, #(MAX_PB_SIZE * 2)
+ ldp x0, x1, [sp], #16
+ load_qpel_filterh x7, x6
+ sub x1, x1, #4
+ ld1 {v16.8h}, [sp], x9
+ ld1 {v17.8h}, [sp], x9
+ ld1 {v18.8h}, [sp], x9
+ ld1 {v19.8h}, [sp], x9
+ ld1 {v20.8h}, [sp], x9
+ ld1 {v21.8h}, [sp], x9
+ ld1 {v22.8h}, [sp], x9
+.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
+ ld1 {\tmp\().8h}, [sp], x9
+ calc_qpelh v1, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sshr
+ calc_qpelh2 v2, v2, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sshr
+ ld1 {v5.8h}, [x4], x9 // src2
+ saddw v1.4s, v1.4s, v5.4h
+ saddw2 v2.4s, v2.4s, v5.8h
+ rshrn v1.4h, v1.4s, #7
+ rshrn2 v1.8h, v2.4s, #7
+ sqxtun v1.8b, v1.8h
+ st1 {v1.s}[0], [x0], #4
+ subs w5, w5, #1
+ st1 {v1.h}[2], [x0], x1
+.endm
+1: calc_all
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_bi_hv8_8_neon, export=1
+ add w10, w5, #7
+ lsl x10, x10, #7
+ sub sp, sp, x10 // tmp_array
+ stp x0, x1, [sp, #-16]!
+ stp x4, x5, [sp, #-16]!
+ sub x1, x2, x3, lsl #1
+ stp x7, x30, [sp, #-16]!
+ sub x1, x1, x3
+ add x0, sp, #48
+ mov x2, x3
+ add x3, x5, #7
+ mov x4, x6
+ bl X(ff_hevc_put_hevc_qpel_h8_8_neon)
+ ldp x7, x30, [sp], #16
+ ldp x4, x5, [sp], #16
+ mov x9, #(MAX_PB_SIZE * 2)
+ ldp x0, x1, [sp], #16
+ load_qpel_filterh x7, x6
+ ld1 {v16.8h}, [sp], x9
+ ld1 {v17.8h}, [sp], x9
+ ld1 {v18.8h}, [sp], x9
+ ld1 {v19.8h}, [sp], x9
+ ld1 {v20.8h}, [sp], x9
+ ld1 {v21.8h}, [sp], x9
+ ld1 {v22.8h}, [sp], x9
+.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
+ ld1 {\tmp\().8h}, [sp], x9
+ calc_qpelh v1, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sshr
+ calc_qpelh2 v2, v2, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sshr
+ ld1 {v5.8h}, [x4], x9 // src2
+ saddw v1.4s, v1.4s, v5.4h
+ saddw2 v2.4s, v2.4s, v5.8h
+ rshrn v1.4h, v1.4s, #7
+ rshrn2 v1.8h, v2.4s, #7
+ sqxtun v1.8b, v1.8h
+ subs w5, w5, #1
+ st1 {v1.8b}, [x0], x1
+.endm
+1: calc_all
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_bi_hv12_8_neon, export=1
+ stp xzr, x30, [sp, #-16]!
+ stp x0, x1, [sp, #-16]!
+ stp x2, x3, [sp, #-16]!
+ stp x4, x5, [sp, #-16]!
+ stp x6, x7, [sp, #-16]!
+ bl X(ff_hevc_put_hevc_qpel_bi_hv8_8_neon)
+ ldp x6, x7, [sp], #16
+ ldp x4, x5, [sp], #16
+ ldp x2, x3, [sp], #16
+ add x4, x4, #16
+ ldp x0, x1, [sp], #16
+ add x2, x2, #8
+ add x0, x0, #8
+ bl X(ff_hevc_put_hevc_qpel_bi_hv4_8_neon)
+ ldp xzr, x30, [sp], #16
+ ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_bi_hv16_8_neon, export=1
+ add w10, w5, #7
+ lsl x10, x10, #7
+ sub sp, sp, x10 // tmp_array
+ stp x0, x1, [sp, #-16]!
+ stp x4, x5, [sp, #-16]!
+ stp x7, x30, [sp, #-16]!
+ add x0, sp, #48
+ sub x1, x2, x3, lsl #1
+ sub x1, x1, x3
+ mov x2, x3
+ add w3, w5, #7
+ mov x4, x6
+ bl X(ff_hevc_put_hevc_qpel_h16_8_neon)
+ ldp x7, x30, [sp], #16
+ ldp x4, x5, [sp], #16
+ ldp x0, x1, [sp], #16
+ mov x6, #16 // width
+.Lqpel_bi_hv16_loop:
+ load_qpel_filterh x7, x8
+ mov x9, #(MAX_PB_SIZE * 2)
+ mov x10, x6
+
+0: mov x8, sp // src
+ ld1 {v16.8h, v17.8h}, [x8], x9
+ mov w11, w5 // height
+ ld1 {v18.8h, v19.8h}, [x8], x9
+ mov x12, x4 // src2
+ ld1 {v20.8h, v21.8h}, [x8], x9
+ mov x7, x0 // dst
+ ld1 {v22.8h, v23.8h}, [x8], x9
+ ld1 {v24.8h, v25.8h}, [x8], x9
+ ld1 {v26.8h, v27.8h}, [x8], x9
+ ld1 {v28.8h, v29.8h}, [x8], x9
+.macro calc tmp0, tmp1, src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11, src12, src13, src14, src15
+ ld1 {\tmp0\().8h, \tmp1\().8h}, [x8], x9
+ calc_qpelh v1, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sshr
+ calc_qpelh2 v2, v2, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sshr
+ calc_qpelh v3, \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15, sshr
+ calc_qpelh2 v4, v4, \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15, sshr
+ ld1 {v5.8h, v6.8h}, [x12], x9 // src2
+ saddw v1.4s, v1.4s, v5.4h
+ saddw2 v2.4s, v2.4s, v5.8h
+ saddw v3.4s, v3.4s, v6.4h
+ saddw2 v4.4s, v4.4s, v6.8h
+ rshrn v1.4h, v1.4s, #7
+ rshrn2 v1.8h, v2.4s, #7
+ rshrn v2.4h, v3.4s, #7
+ rshrn2 v2.8h, v4.4s, #7
+ sqxtun v1.8b, v1.8h
+ sqxtun2 v1.16b, v2.8h
+ subs x11, x11, #1
+ st1 {v1.16b}, [x7], x1
+.endm
+1: calc_all2
+.purgem calc
+2: add x0, x0, #16
+ add sp, sp, #32
+ subs x10, x10, #16
+ add x4, x4, #32
+ b.ne 0b
+ add w10, w5, #7
+ lsl x10, x10, #7
+ sub x10, x10, x6, lsl #1 // part of first line
+ add sp, sp, x10 // tmp_array without first line
+ ret
+endfunc
+
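+// As in the uni_hv case, qpel_bi_hv24 is split into a 16 and an 8 wide call,
+// while the 32/48/64 wide variants run their own horizontal pass and branch
+// into the shared loop at .Lqpel_bi_hv16_loop with the width in x6.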
+function ff_hevc_put_hevc_qpel_bi_hv24_8_neon, export=1
+ stp xzr, x30, [sp, #-16]!
+ stp x0, x1, [sp, #-16]!
+ stp x2, x3, [sp, #-16]!
+ stp x4, x5, [sp, #-16]!
+ stp x6, x7, [sp, #-16]!
+ bl X(ff_hevc_put_hevc_qpel_bi_hv16_8_neon)
+ ldp x6, x7, [sp], #16
+ ldp x4, x5, [sp], #16
+ ldp x2, x3, [sp], #16
+ ldp x0, x1, [sp], #16
+ add x4, x4, #32
+ add x2, x2, #16
+ add x0, x0, #16
+ bl X(ff_hevc_put_hevc_qpel_bi_hv8_8_neon)
+ ldp xzr, x30, [sp], #16
+ ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_bi_hv32_8_neon, export=1
+ add w10, w5, #7
+ lsl x10, x10, #7
+ sub sp, sp, x10 // tmp_array
+ stp x0, x1, [sp, #-16]!
+ stp x4, x5, [sp, #-16]!
+ stp x7, x30, [sp, #-16]!
+ add x0, sp, #48
+ sub x1, x2, x3, lsl #1
+ mov x2, x3
+ sub x1, x1, x3
+ add w3, w5, #7
+ mov x4, x6
+ bl X(ff_hevc_put_hevc_qpel_h32_8_neon)
+ ldp x7, x30, [sp], #16
+ ldp x4, x5, [sp], #16
+ ldp x0, x1, [sp], #16
+ mov x6, #32 // width
+ b .Lqpel_bi_hv16_loop
+endfunc
+
+function ff_hevc_put_hevc_qpel_bi_hv48_8_neon, export=1
+ add w10, w5, #7
+ lsl x10, x10, #7
+ sub sp, sp, x10 // tmp_array
+ stp x0, x1, [sp, #-16]!
+ stp x4, x5, [sp, #-16]!
+ stp x7, x30, [sp, #-16]!
+ add x0, sp, #48
+ sub x1, x2, x3, lsl #1
+ mov x2, x3
+ sub x1, x1, x3
+ add w3, w5, #7
+ mov x4, x6
+ bl X(ff_hevc_put_hevc_qpel_h48_8_neon)
+ ldp x7, x30, [sp], #16
+ ldp x4, x5, [sp], #16
+ ldp x0, x1, [sp], #16
+ mov x6, #48 // width
+ b .Lqpel_bi_hv16_loop
+endfunc
+
+function ff_hevc_put_hevc_qpel_bi_hv64_8_neon, export=1
+ add w10, w5, #7
+ lsl x10, x10, #7
+ sub sp, sp, x10 // tmp_array
+ stp x0, x1, [sp, #-16]!
+ stp x4, x5, [sp, #-16]!
+ stp x7, x30, [sp, #-16]!
+ add x0, sp, #48
+ sub x1, x2, x3, lsl #1
+ mov x2, x3
+ sub x1, x1, x3
+ add w3, w5, #7
+ mov x4, x6
+ bl X(ff_hevc_put_hevc_qpel_h64_8_neon)
+ ldp x7, x30, [sp], #16
+ ldp x4, x5, [sp], #16
+ ldp x0, x1, [sp], #16
+ mov x6, #64 // width
+ b .Lqpel_bi_hv16_loop
+endfunc
--
2.32.0 (Apple Git-132)
* [FFmpeg-devel] [PATCH v2 2/2] lavc/aarch64: add hevc epel assembly
2022-02-03 13:51 ` [FFmpeg-devel] [PATCH v2 " J. Dekker
@ 2022-02-03 13:51 ` J. Dekker
2022-02-07 22:13 ` Martin Storsjö
2022-02-07 22:11 ` [FFmpeg-devel] [PATCH v2 1/2] lavc/aarch64: add hevc qpel assembly Martin Storsjö
1 sibling, 1 reply; 6+ messages in thread
From: J. Dekker @ 2022-02-03 13:51 UTC (permalink / raw)
To: ffmpeg-devel
Thanks: Rafal Dabrowa <fatwildcat@gmail.com>
---
libavcodec/aarch64/Makefile | 3 +-
libavcodec/aarch64/hevcdsp_epel_neon.S | 2501 +++++++++++++++++++++
libavcodec/aarch64/hevcdsp_init_aarch64.c | 52 +
3 files changed, 2555 insertions(+), 1 deletion(-)
create mode 100644 libavcodec/aarch64/hevcdsp_epel_neon.S
diff --git a/libavcodec/aarch64/Makefile b/libavcodec/aarch64/Makefile
index 8592692479..ebedc03bfa 100644
--- a/libavcodec/aarch64/Makefile
+++ b/libavcodec/aarch64/Makefile
@@ -61,7 +61,8 @@ NEON-OBJS-$(CONFIG_VP9_DECODER) += aarch64/vp9itxfm_16bpp_neon.o \
aarch64/vp9lpf_neon.o \
aarch64/vp9mc_16bpp_neon.o \
aarch64/vp9mc_neon.o
-NEON-OBJS-$(CONFIG_HEVC_DECODER) += aarch64/hevcdsp_idct_neon.o \
+NEON-OBJS-$(CONFIG_HEVC_DECODER) += aarch64/hevcdsp_epel_neon.o \
+ aarch64/hevcdsp_idct_neon.o \
aarch64/hevcdsp_init_aarch64.o \
aarch64/hevcdsp_qpel_neon.o \
aarch64/hevcdsp_sao_neon.o
diff --git a/libavcodec/aarch64/hevcdsp_epel_neon.S b/libavcodec/aarch64/hevcdsp_epel_neon.S
new file mode 100644
index 0000000000..bbf93c3d6a
--- /dev/null
+++ b/libavcodec/aarch64/hevcdsp_epel_neon.S
@@ -0,0 +1,2501 @@
+/* -*-arm64-*-
+ * vim: syntax=arm64asm
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+#define MAX_PB_SIZE 64
+
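+// pel_pixels: full-pel copy into the 16-bit intermediate buffer. 8-bit
+// samples are widened and scaled by 64 (<< 6); rows are stored with the
+// fixed intermediate stride of MAX_PB_SIZE * 2 bytes. The bi variants add
+// the int16_t src2 block with saturation and round back to 8 bits (>> 7).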
+function ff_hevc_put_hevc_pel_pixels4_8_neon, export=1
+ mov x7, #(MAX_PB_SIZE * 2)
+1: ld1 {v0.s}[0], [x1], x2
+ ushll v4.8h, v0.8b, #6
+ subs w3, w3, #1
+ st1 {v4.d}[0], [x0], x7
+ b.ne 1b
+ ret
+endfunc
+
+function ff_hevc_put_hevc_pel_pixels6_8_neon, export=1
+ mov x7, #(MAX_PB_SIZE * 2 - 8)
+1: ld1 {v0.8b}, [x1], x2
+ ushll v4.8h, v0.8b, #6
+ st1 {v4.d}[0], [x0], #8
+ subs w3, w3, #1
+ st1 {v4.s}[2], [x0], x7
+ b.ne 1b
+ ret
+endfunc
+
+function ff_hevc_put_hevc_pel_pixels8_8_neon, export=1
+ mov x7, #(MAX_PB_SIZE * 2)
+1: ld1 {v0.8b}, [x1], x2
+ ushll v4.8h, v0.8b, #6
+ subs w3, w3, #1
+ st1 {v4.8h}, [x0], x7
+ b.ne 1b
+ ret
+endfunc
+
+function ff_hevc_put_hevc_pel_pixels12_8_neon, export=1
+ mov x7, #(MAX_PB_SIZE * 2 - 16)
+1: ld1 {v0.8b, v1.8b}, [x1], x2
+ ushll v4.8h, v0.8b, #6
+ st1 {v4.8h}, [x0], #16
+ ushll v5.8h, v1.8b, #6
+ subs w3, w3, #1
+ st1 {v5.d}[0], [x0], x7
+ b.ne 1b
+ ret
+endfunc
+
+function ff_hevc_put_hevc_pel_pixels16_8_neon, export=1
+ mov x7, #(MAX_PB_SIZE * 2)
+1: ld1 {v0.8b, v1.8b}, [x1], x2
+ ushll v4.8h, v0.8b, #6
+ ushll v5.8h, v1.8b, #6
+ subs w3, w3, #1
+ st1 {v4.8h, v5.8h}, [x0], x7
+ b.ne 1b
+ ret
+endfunc
+
+function ff_hevc_put_hevc_pel_pixels24_8_neon, export=1
+ mov x7, #(MAX_PB_SIZE * 2)
+1: ld1 {v0.8b-v2.8b}, [x1], x2
+ ushll v4.8h, v0.8b, #6
+ ushll v5.8h, v1.8b, #6
+ ushll v6.8h, v2.8b, #6
+ subs w3, w3, #1
+ st1 {v4.8h-v6.8h}, [x0], x7
+ b.ne 1b
+ ret
+endfunc
+
+function ff_hevc_put_hevc_pel_pixels32_8_neon, export=1
+ mov x7, #(MAX_PB_SIZE * 2)
+1: ld1 {v0.8b-v3.8b}, [x1], x2
+ ushll v4.8h, v0.8b, #6
+ ushll v5.8h, v1.8b, #6
+ ushll v6.8h, v2.8b, #6
+ ushll v7.8h, v3.8b, #6
+ subs w3, w3, #1
+ st1 {v4.8h-v7.8h}, [x0], x7
+ b.ne 1b
+ ret
+endfunc
+
+function ff_hevc_put_hevc_pel_pixels48_8_neon, export=1
+ mov x7, #(MAX_PB_SIZE)
+1: ld1 {v0.16b-v2.16b}, [x1], x2
+ ushll v4.8h, v0.8b, #6
+ ushll2 v5.8h, v0.16b, #6
+ ushll v6.8h, v1.8b, #6
+ ushll2 v7.8h, v1.16b, #6
+ st1 {v4.8h-v7.8h}, [x0], #64
+ ushll v16.8h, v2.8b, #6
+ ushll2 v17.8h, v2.16b, #6
+ subs w3, w3, #1
+ st1 {v16.8h-v17.8h}, [x0], x7
+ b.ne 1b
+ ret
+endfunc
+
+function ff_hevc_put_hevc_pel_pixels64_8_neon, export=1
+1: ld1 {v0.16b-v3.16b}, [x1], x2
+ ushll v4.8h, v0.8b, #6
+ ushll2 v5.8h, v0.16b, #6
+ ushll v6.8h, v1.8b, #6
+ ushll2 v7.8h, v1.16b, #6
+ st1 {v4.8h-v7.8h}, [x0], #(MAX_PB_SIZE)
+ ushll v16.8h, v2.8b, #6
+ ushll2 v17.8h, v2.16b, #6
+ ushll v18.8h, v3.8b, #6
+ ushll2 v19.8h, v3.16b, #6
+ subs w3, w3, #1
+ st1 {v16.8h-v19.8h}, [x0], #(MAX_PB_SIZE)
+ b.ne 1b
+ ret
+endfunc
+
+function ff_hevc_put_hevc_pel_bi_pixels4_8_neon, export=1
+ mov x10, #(MAX_PB_SIZE * 2)
+1: ld1 {v0.s}[0], [x2], x3 // src
+ ushll v16.8h, v0.8b, #6
+ ld1 {v20.4h}, [x4], x10 // src2
+ sqadd v16.8h, v16.8h, v20.8h
+ sqrshrun v0.8b, v16.8h, #7
+ st1 {v0.s}[0], [x0], x1
+ subs w5, w5, #1
+ b.ne 1b
+ ret
+endfunc
+
+function ff_hevc_put_hevc_pel_bi_pixels6_8_neon, export=1
+ mov x10, #(MAX_PB_SIZE * 2 - 8)
+ sub x1, x1, #4
+1: ld1 {v0.8b}, [x2], x3
+ ushll v16.8h, v0.8b, #6
+ ld1 {v20.4h}, [x4], #8
+ ld1 {v20.s}[2], [x4], x10
+ sqadd v16.8h, v16.8h, v20.8h
+ sqrshrun v0.8b, v16.8h, #7
+ st1 {v0.s}[0], [x0], #4
+ st1 {v0.h}[2], [x0], x1
+ subs w5, w5, #1
+ b.ne 1b
+ ret
+endfunc
+
+function ff_hevc_put_hevc_pel_bi_pixels8_8_neon, export=1
+ mov x10, #(MAX_PB_SIZE * 2)
+1: ld1 {v0.8b}, [x2], x3 // src
+ ushll v16.8h, v0.8b, #6
+ ld1 {v20.8h}, [x4], x10 // src2
+ sqadd v16.8h, v16.8h, v20.8h
+ sqrshrun v0.8b, v16.8h, #7
+ subs w5, w5, #1
+ st1 {v0.8b}, [x0], x1
+ b.ne 1b
+ ret
+endfunc
+
+function ff_hevc_put_hevc_pel_bi_pixels12_8_neon, export=1
+ mov x10, #(MAX_PB_SIZE * 2 - 16)
+ sub x1, x1, #8
+1: ld1 {v0.16b}, [x2], x3
+ ushll v16.8h, v0.8b, #6
+ ushll2 v17.8h, v0.16b, #6
+ ld1 {v20.8h}, [x4], #16
+ ld1 {v21.4h}, [x4], x10
+ sqadd v16.8h, v16.8h, v20.8h
+ sqadd v17.8h, v17.8h, v21.8h
+ sqrshrun v0.8b, v16.8h, #7
+ sqrshrun2 v0.16b, v17.8h, #7
+ st1 {v0.8b}, [x0], #8
+ subs w5, w5, #1
+ st1 {v0.s}[2], [x0], x1
+ b.ne 1b
+ ret
+endfunc
+
+function ff_hevc_put_hevc_pel_bi_pixels16_8_neon, export=1
+ mov x10, #(MAX_PB_SIZE * 2)
+1: ld1 {v0.16b}, [x2], x3 // src
+ ushll v16.8h, v0.8b, #6
+ ushll2 v17.8h, v0.16b, #6
+ ld1 {v20.8h, v21.8h}, [x4], x10 // src2
+ sqadd v16.8h, v16.8h, v20.8h
+ sqadd v17.8h, v17.8h, v21.8h
+ sqrshrun v0.8b, v16.8h, #7
+ sqrshrun2 v0.16b, v17.8h, #7
+ subs w5, w5, #1
+ st1 {v0.16b}, [x0], x1
+ b.ne 1b
+ ret
+endfunc
+
+function ff_hevc_put_hevc_pel_bi_pixels24_8_neon, export=1
+ mov x10, #(MAX_PB_SIZE * 2)
+1: ld1 {v0.8b-v2.8b}, [x2], x3 // src
+ ushll v16.8h, v0.8b, #6
+ ushll v17.8h, v1.8b, #6
+ ushll v18.8h, v2.8b, #6
+ ld1 {v20.8h-v22.8h}, [x4], x10 // src2
+ sqadd v16.8h, v16.8h, v20.8h
+ sqadd v17.8h, v17.8h, v21.8h
+ sqadd v18.8h, v18.8h, v22.8h
+ sqrshrun v0.8b, v16.8h, #7
+ sqrshrun v1.8b, v17.8h, #7
+ sqrshrun v2.8b, v18.8h, #7
+ subs w5, w5, #1
+ st1 {v0.8b-v2.8b}, [x0], x1
+ b.ne 1b
+ ret
+endfunc
+
+function ff_hevc_put_hevc_pel_bi_pixels32_8_neon, export=1
+ mov x10, #(MAX_PB_SIZE * 2)
+1: ld1 {v0.16b-v1.16b}, [x2], x3 // src
+ ushll v16.8h, v0.8b, #6
+ ushll2 v17.8h, v0.16b, #6
+ ushll v18.8h, v1.8b, #6
+ ushll2 v19.8h, v1.16b, #6
+ ld1 {v20.8h-v23.8h}, [x4], x10 // src2
+ sqadd v16.8h, v16.8h, v20.8h
+ sqadd v17.8h, v17.8h, v21.8h
+ sqadd v18.8h, v18.8h, v22.8h
+ sqadd v19.8h, v19.8h, v23.8h
+ sqrshrun v0.8b, v16.8h, #7
+ sqrshrun2 v0.16b, v17.8h, #7
+ sqrshrun v1.8b, v18.8h, #7
+ sqrshrun2 v1.16b, v19.8h, #7
+ st1 {v0.16b-v1.16b}, [x0], x1
+ subs w5, w5, #1
+ b.ne 1b
+ ret
+endfunc
+
+function ff_hevc_put_hevc_pel_bi_pixels48_8_neon, export=1
+ mov x10, #(MAX_PB_SIZE)
+1: ld1 {v0.16b-v2.16b}, [x2], x3 // src
+ ushll v16.8h, v0.8b, #6
+ ushll2 v17.8h, v0.16b, #6
+ ushll v18.8h, v1.8b, #6
+ ushll2 v19.8h, v1.16b, #6
+ ushll v20.8h, v2.8b, #6
+ ushll2 v21.8h, v2.16b, #6
+ ld1 {v24.8h-v27.8h}, [x4], #(MAX_PB_SIZE) // src2
+ sqadd v16.8h, v16.8h, v24.8h
+ sqadd v17.8h, v17.8h, v25.8h
+ sqadd v18.8h, v18.8h, v26.8h
+ sqadd v19.8h, v19.8h, v27.8h
+ ld1 {v24.8h-v25.8h}, [x4], x10
+ sqadd v20.8h, v20.8h, v24.8h
+ sqadd v21.8h, v21.8h, v25.8h
+ sqrshrun v0.8b, v16.8h, #7
+ sqrshrun2 v0.16b, v17.8h, #7
+ sqrshrun v1.8b, v18.8h, #7
+ sqrshrun2 v1.16b, v19.8h, #7
+ sqrshrun v2.8b, v20.8h, #7
+ sqrshrun2 v2.16b, v21.8h, #7
+ subs w5, w5, #1
+ st1 {v0.16b-v2.16b}, [x0], x1
+ b.ne 1b
+ ret
+endfunc
+
+function ff_hevc_put_hevc_pel_bi_pixels64_8_neon, export=1
+1: ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x2], x3 // src
+ ushll v16.8h, v0.8b, #6
+ ushll2 v17.8h, v0.16b, #6
+ ushll v18.8h, v1.8b, #6
+ ushll2 v19.8h, v1.16b, #6
+ ushll v20.8h, v2.8b, #6
+ ushll2 v21.8h, v2.16b, #6
+ ushll v22.8h, v3.8b, #6
+ ushll2 v23.8h, v3.16b, #6
+ ld1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x4], #(MAX_PB_SIZE) // src2
+ sqadd v16.8h, v16.8h, v24.8h
+ sqadd v17.8h, v17.8h, v25.8h
+ sqadd v18.8h, v18.8h, v26.8h
+ sqadd v19.8h, v19.8h, v27.8h
+ ld1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x4], #(MAX_PB_SIZE)
+ sqadd v20.8h, v20.8h, v24.8h
+ sqadd v21.8h, v21.8h, v25.8h
+ sqadd v22.8h, v22.8h, v26.8h
+ sqadd v23.8h, v23.8h, v27.8h
+ sqrshrun v0.8b, v16.8h, #7
+ sqrshrun2 v0.16b, v17.8h, #7
+ sqrshrun v1.8b, v18.8h, #7
+ sqrshrun2 v1.16b, v19.8h, #7
+ sqrshrun v2.8b, v20.8h, #7
+ sqrshrun2 v2.16b, v21.8h, #7
+ sqrshrun v3.8b, v22.8h, #7
+ sqrshrun2 v3.16b, v23.8h, #7
+ st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
+ subs w5, w5, #1
+ b.ne 1b
+ ret
+endfunc
+
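+// HEVC 4-tap chroma (epel) interpolation filter coefficients, indexed
+// directly by the 1/8-pel fraction (1..7); the all-zero first row only
+// pads the table so the fraction can be used as the index unchanged.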
+.Lepel_filters:
+ .byte 0, 0, 0, 0
+ .byte -2, 58, 10, -2
+ .byte -4, 54, 16, -2
+ .byte -6, 46, 28, -4
+ .byte -4, 36, 36, -4
+ .byte -4, 28, 46, -6
+ .byte -2, 16, 54, -4
+ .byte -2, 10, 58, -2
+
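+// load_epel_filterb broadcasts each of the four byte taps into v0-v3 and
+// negates the two outer (negative) taps, so calc_epelb/calc_epelb2 can
+// accumulate with unsigned widening multiplies: umlsl for the negated
+// outer taps, umlal for the inner ones, giving a 16-bit result per lane.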
+.macro load_epel_filterb freg, xreg
+ adr \xreg, .Lepel_filters
+ add \xreg, \xreg, \freg, lsl #2
+ ld4r {v0.16b, v1.16b, v2.16b, v3.16b}, [\xreg] // filter
+ neg v0.16b, v0.16b
+ neg v3.16b, v3.16b
+.endm
+
+.macro calc_epelb dst, src0, src1, src2, src3
+ umlsl \dst\().8h, \src0\().8b, v0.8b
+ umlal \dst\().8h, \src1\().8b, v1.8b
+ umlal \dst\().8h, \src2\().8b, v2.8b
+ umlsl \dst\().8h, \src3\().8b, v3.8b
+.endm
+
+.macro calc_epelb2 dst, src0, src1, src2, src3
+ umlsl2 \dst\().8h, \src0\().16b, v0.16b
+ umlal2 \dst\().8h, \src1\().16b, v1.16b
+ umlal2 \dst\().8h, \src2\().16b, v2.16b
+ umlsl2 \dst\().8h, \src3\().16b, v3.16b
+.endm
+
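+// The *h variants below are used for the second pass of the hv functions:
+// the taps are sign-extended to 16 bits and applied to the 16-bit
+// intermediate rows with widening 32-bit accumulation, then narrowed back
+// with a saturating shift right by 6.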
+.macro load_epel_filterh freg, xreg
+ adr \xreg, .Lepel_filters
+ add \xreg, \xreg, \freg, lsl #2
+ ld1 {v0.8b}, [\xreg]
+ sxtl v0.8h, v0.8b
+.endm
+
+.macro calc_epelh dst, src0, src1, src2, src3
+ smull \dst\().4s, \src0\().4h, v0.h[0]
+ smlal \dst\().4s, \src1\().4h, v0.h[1]
+ smlal \dst\().4s, \src2\().4h, v0.h[2]
+ smlal \dst\().4s, \src3\().4h, v0.h[3]
+ sqshrn \dst\().4h, \dst\().4s, #6
+.endm
+
+.macro calc_epelh2 dst, tmp, src0, src1, src2, src3
+ smull2 \tmp\().4s, \src0\().8h, v0.h[0]
+ smlal2 \tmp\().4s, \src1\().8h, v0.h[1]
+ smlal2 \tmp\().4s, \src2\().8h, v0.h[2]
+ smlal2 \tmp\().4s, \src3\().8h, v0.h[3]
+ sqshrn2 \dst\().8h, \tmp\().4s, #6
+.endm
+
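+// Horizontal filters: each row is loaded once and the shifted copies needed
+// for the taps at x+1, x+2 and x+3 are built with 64-bit lane shifts
+// (ushr ... #8); the wider sizes de-interleave the row with ld2/ld3/ld4 and
+// patch the trailing bytes in with ins/mov before accumulating with calc_epelb.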
+function ff_hevc_put_hevc_epel_h4_8_neon, export=1
+ load_epel_filterb x4, x5
+ sub x1, x1, #1
+ mov x10, #(MAX_PB_SIZE * 2)
+1: ld1 {v4.8b}, [x1], x2
+ ushr v5.2d, v4.2d, #8
+ ushr v6.2d, v5.2d, #8
+ ushr v7.2d, v6.2d, #8
+ movi v16.8h, #0
+ calc_epelb v16, v4, v5, v6, v7
+ st1 {v16.4h}, [x0], x10
+ subs w3, w3, #1 // height
+ b.ne 1b
+ ret
+endfunc
+
+function ff_hevc_put_hevc_epel_h6_8_neon, export=1
+ load_epel_filterb x4, x5
+ sub x1, x1, #1
+ sub x2, x2, #8
+ mov x10, #(MAX_PB_SIZE * 2 - 8)
+1: ld1 {v24.8b}, [x1], #8
+ ushr v26.2d, v24.2d, #8
+ ushr v27.2d, v26.2d, #8
+ ushr v28.2d, v27.2d, #8
+ movi v16.8h, #0
+ ld1 {v28.b}[5], [x1], x2
+ calc_epelb v16, v24, v26, v27, v28
+ st1 {v16.4h}, [x0], #8
+ st1 {v16.s}[2], [x0], x10
+ subs w3, w3, #1 // height
+ b.ne 1b
+ ret
+endfunc
+
+function ff_hevc_put_hevc_epel_h8_8_neon, export=1
+ load_epel_filterb x4, x5
+ sub x1, x1, #1
+ mov x10, #(MAX_PB_SIZE * 2)
+1: ld2 {v24.8b, v25.8b}, [x1], x2
+ ushr v26.2d, v24.2d, #8
+ ushr v27.2d, v25.2d, #8
+ ushr v28.2d, v26.2d, #8
+ movi v16.8h, #0
+ movi v17.8h, #0
+ calc_epelb v16, v24, v25, v26, v27
+ calc_epelb v17, v25, v26, v27, v28
+ st2 {v16.4h, v17.4h}, [x0], x10
+ subs w3, w3, #1 // height
+ b.ne 1b
+ ret
+endfunc
+
+function ff_hevc_put_hevc_epel_h12_8_neon, export=1
+ load_epel_filterb x4, x5
+ sub x1, x1, #1
+ mov x10, #(MAX_PB_SIZE * 2 - 16)
+1: ld2 {v24.8b, v25.8b}, [x1], x2
+ ushr v26.2d, v24.2d, #8
+ ushr v27.2d, v25.2d, #8
+ ushr v28.2d, v26.2d, #8
+ movi v16.8h, #0
+ movi v17.8h, #0
+ calc_epelb v16, v24, v25, v26, v27
+ calc_epelb v17, v25, v26, v27, v28
+ zip1 v18.8h, v16.8h, v17.8h
+ zip2 v19.8h, v16.8h, v17.8h
+ st1 {v18.8h}, [x0], #16
+ st1 {v19.d}[0], [x0], x10
+ subs w3, w3, #1 // height
+ b.ne 1b
+ ret
+endfunc
+
+function ff_hevc_put_hevc_epel_h16_8_neon, export=1
+ load_epel_filterb x4, x5
+ sub x1, x1, #1
+ sub x2, x2, #16
+ mov x10, #(MAX_PB_SIZE * 2)
+1: ld2 {v24.8b, v25.8b}, [x1], #16
+ ld1 {v20.s}[0], [x1], x2
+ ushr v26.2d, v24.2d, #8
+ ushr v27.2d, v25.2d, #8
+ mov v26.b[7], v20.b[0]
+ mov v27.b[7], v20.b[1]
+ ushr v28.2d, v26.2d, #8
+ mov v28.b[7], v20.b[2]
+ movi v16.8h, #0
+ movi v17.8h, #0
+ calc_epelb v16, v24, v25, v26, v27
+ calc_epelb v17, v25, v26, v27, v28
+ st2 {v16.8h, v17.8h}, [x0], x10
+ subs w3, w3, #1 // height
+ b.ne 1b
+ ret
+endfunc
+
+function ff_hevc_put_hevc_epel_h24_8_neon, export=1
+ load_epel_filterb x4, x5
+ sub x1, x1, #1
+ sub x2, x2, #24
+ mov x10, #(MAX_PB_SIZE * 2)
+1: ld3 {v24.8b, v25.8b, v26.8b}, [x1], #24
+ ld1 {v20.s}[0], [x1], x2
+ ushr v27.2d, v24.2d, #8
+ ushr v28.2d, v25.2d, #8
+ ushr v29.2d, v26.2d, #8
+ mov v27.b[7], v20.b[0]
+ mov v28.b[7], v20.b[1]
+ mov v29.b[7], v20.b[2]
+ movi v16.8h, #0
+ movi v17.8h, #0
+ movi v18.8h, #0
+ calc_epelb v16, v24, v25, v26, v27
+ calc_epelb v17, v25, v26, v27, v28
+ calc_epelb v18, v26, v27, v28, v29
+ st3 {v16.8h, v17.8h, v18.8h}, [x0], x10
+ subs w3, w3, #1 // height
+ b.ne 1b
+ ret
+endfunc
+
+function ff_hevc_put_hevc_epel_h32_8_neon, export=1
+ load_epel_filterb x4, x5
+ sub x1, x1, #1
+ sub x2, x2, #32
+ mov x10, #(MAX_PB_SIZE * 2)
+1: ld4 {v24.8b, v25.8b, v26.8b, v27.8b}, [x1], #32
+ ld1 {v20.s}[0], [x1], x2
+ ushr v28.2d, v24.2d, #8
+ ushr v29.2d, v25.2d, #8
+ ushr v30.2d, v26.2d, #8
+ ins v28.b[7], v20.b[0]
+ ins v29.b[7], v20.b[1]
+ ins v30.b[7], v20.b[2]
+ movi v16.8h, #0
+ movi v17.8h, #0
+ movi v18.8h, #0
+ movi v19.8h, #0
+ calc_epelb v16, v24, v25, v26, v27
+ calc_epelb v17, v25, v26, v27, v28
+ calc_epelb v18, v26, v27, v28, v29
+ calc_epelb v19, v27, v28, v29, v30
+ st4 {v16.8h, v17.8h, v18.8h, v19.8h}, [x0], x10
+ subs w3, w3, #1
+ b.ne 1b
+ ret
+endfunc
+
+function ff_hevc_put_hevc_epel_h48_8_neon, export=1
+ load_epel_filterb x4, x5
+ sub x1, x1, #1
+ sub x2, x2, #48
+ mov x5, #24
+ mov x10, #(128 - 48)
+1: ld3 {v26.16b, v27.16b, v28.16b}, [x1], x5
+ ushr v29.2d, v26.2d, #8
+ ushr v30.2d, v27.2d, #8
+ ushr v31.2d, v28.2d, #8
+ ld1 {v24.s}[0], [x1], x5
+ ld1 {v25.s}[0], [x1], x2
+ mov v29.b[7], v24.b[0]
+ mov v30.b[7], v24.b[1]
+ mov v31.b[7], v24.b[2]
+ mov v29.b[15], v25.b[0]
+ mov v30.b[15], v25.b[1]
+ mov v31.b[15], v25.b[2]
+ movi v16.8h, #0
+ movi v17.8h, #0
+ movi v18.8h, #0
+ movi v20.8h, #0
+ movi v21.8h, #0
+ movi v22.8h, #0
+ calc_epelb v16, v26, v27, v28, v29
+ calc_epelb2 v20, v26, v27, v28, v29
+ calc_epelb v17, v27, v28, v29, v30
+ calc_epelb2 v21, v27, v28, v29, v30
+ calc_epelb v18, v28, v29, v30, v31
+ calc_epelb2 v22, v28, v29, v30, v31
+ st3 {v16.8h, v17.8h, v18.8h}, [x0], #48
+ st3 {v20.8h, v21.8h, v22.8h}, [x0], x10
+ subs w3, w3, #1 // height
+ b.ne 1b
+ ret
+endfunc
+
+function ff_hevc_put_hevc_epel_h64_8_neon, export=1
+ load_epel_filterb x4, x5
+ sub x1, x1, #1
+ sub x2, x2, #64
+ mov x7, #32
+1: ld4 {v24.16b, v25.16b, v26.16b, v27.16b}, [x1], x7
+ ushr v28.2d, v24.2d, #8
+ ushr v29.2d, v25.2d, #8
+ ushr v30.2d, v26.2d, #8
+ ld1 {v4.s}[0], [x1], x7
+ ld1 {v5.s}[0], [x1], x2
+ ins v28.b[7], v4.b[0]
+ ins v28.b[15], v5.b[0]
+ ins v29.b[7], v4.b[1]
+ ins v29.b[15], v5.b[1]
+ ins v30.b[7], v4.b[2]
+ ins v30.b[15], v5.b[2]
+ movi v16.8h, #0
+ movi v17.8h, #0
+ movi v18.8h, #0
+ movi v19.8h, #0
+ movi v20.8h, #0
+ movi v21.8h, #0
+ movi v22.8h, #0
+ movi v23.8h, #0
+ calc_epelb v16, v24, v25, v26, v27
+ calc_epelb2 v20, v24, v25, v26, v27
+ calc_epelb v17, v25, v26, v27, v28
+ calc_epelb2 v21, v25, v26, v27, v28
+ calc_epelb v18, v26, v27, v28, v29
+ calc_epelb2 v22, v26, v27, v28, v29
+ calc_epelb v19, v27, v28, v29, v30
+ calc_epelb2 v23, v27, v28, v29, v30
+ st4 {v16.8h, v17.8h, v18.8h, v19.8h}, [x0], #64
+ st4 {v20.8h, v21.8h, v22.8h, v23.8h}, [x0], #64
+ subs w3, w3, #1
+ b.ne 1b
+ ret
+endfunc
+
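+// calc_all4/8/12/16 unroll the vertical loops four times, rotating which
+// registers hold the sliding window of rows so only one new row is loaded
+// per iteration. Each caller defines a local `calc` macro beforehand (it
+// must set the flags with subs) and purges it again after the loop.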
+.macro calc_all4
+ calc v16, v17, v18, v19
+ b.eq 2f
+ calc v17, v18, v19, v16
+ b.eq 2f
+ calc v18, v19, v16, v17
+ b.eq 2f
+ calc v19, v16, v17, v18
+ b.ne 1b
+.endm
+
+.macro calc_all8
+ calc v16, v17, v18, v19, v20, v21, v22, v23
+ b.eq 2f
+ calc v18, v19, v20, v21, v22, v23, v16, v17
+ b.eq 2f
+ calc v20, v21, v22, v23, v16, v17, v18, v19
+ b.eq 2f
+ calc v22, v23, v16, v17, v18, v19, v20, v21
+ b.ne 1b
+.endm
+
+.macro calc_all12
+ calc v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27
+ b.eq 2f
+ calc v19, v20, v21, v22, v23, v24, v25, v26, v27, v16, v17, v18
+ b.eq 2f
+ calc v22, v23, v24, v25, v26, v27, v16, v17, v18, v19, v20, v21
+ b.eq 2f
+ calc v25, v26, v27, v16, v17, v18, v19, v20, v21, v22, v23, v24
+ b.ne 1b
+.endm
+
+.macro calc_all16
+ calc v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31
+ b.eq 2f
+ calc v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v16, v17, v18, v19
+ b.eq 2f
+ calc v24, v25, v26, v27, v28, v29, v30, v31, v16, v17, v18, v19, v20, v21, v22, v23
+ b.eq 2f
+ calc v28, v29, v30, v31, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27
+ b.ne 1b
+.endm
+
+function ff_hevc_put_hevc_epel_v4_8_neon, export=1
+ load_epel_filterb x5, x4
+ sub x1, x1, x2
+ mov x10, #(MAX_PB_SIZE * 2)
+ ld1 {v16.s}[0], [x1], x2
+ ld1 {v17.s}[0], [x1], x2
+ ld1 {v18.s}[0], [x1], x2
+.macro calc src0, src1, src2, src3
+ ld1 {\src3\().s}[0], [x1], x2
+ movi v4.8h, #0
+ calc_epelb v4, \src0, \src1, \src2, \src3
+ subs w3, w3, #1
+ st1 {v4.4h}, [x0], x10
+.endm
+1: calc_all4
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_epel_v6_8_neon, export=1
+ load_epel_filterb x5, x4
+ sub x1, x1, x2
+ mov x10, #(MAX_PB_SIZE * 2 - 8)
+ ld1 {v16.8b}, [x1], x2
+ ld1 {v17.8b}, [x1], x2
+ ld1 {v18.8b}, [x1], x2
+.macro calc src0, src1, src2, src3
+ ld1 {\src3\().8b}, [x1], x2
+ movi v4.8h, #0
+ calc_epelb v4, \src0, \src1, \src2, \src3
+ st1 {v4.d}[0], [x0], #8
+ subs w3, w3, #1
+ st1 {v4.s}[2], [x0], x10
+.endm
+1: calc_all4
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_epel_v8_8_neon, export=1
+ load_epel_filterb x5, x4
+ sub x1, x1, x2
+ mov x10, #(MAX_PB_SIZE * 2)
+ ld1 {v16.8b}, [x1], x2
+ ld1 {v17.8b}, [x1], x2
+ ld1 {v18.8b}, [x1], x2
+.macro calc src0, src1, src2, src3
+ ld1 {\src3\().8b}, [x1], x2
+ movi v4.8h, #0
+ calc_epelb v4, \src0, \src1, \src2, \src3
+ subs w3, w3, #1
+ st1 {v4.8h}, [x0], x10
+.endm
+1: calc_all4
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_epel_v12_8_neon, export=1
+ load_epel_filterb x5, x4
+ sub x1, x1, x2
+ mov x10, #(MAX_PB_SIZE * 2 - 16)
+ ld1 {v16.16b}, [x1], x2
+ ld1 {v17.16b}, [x1], x2
+ ld1 {v18.16b}, [x1], x2
+.macro calc src0, src1, src2, src3
+ ld1 {\src3\().16b}, [x1], x2
+ movi v4.8h, #0
+ movi v5.8h, #0
+ calc_epelb v4, \src0, \src1, \src2, \src3
+ calc_epelb2 v5, \src0, \src1, \src2, \src3
+ st1 {v4.8h}, [x0], #16
+ subs w3, w3, #1
+ st1 {v5.d}[0], [x0], x10
+.endm
+1: calc_all4
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_epel_v16_8_neon, export=1
+ load_epel_filterb x5, x4
+ sub x1, x1, x2
+ mov x10, #(MAX_PB_SIZE * 2)
+ ld1 {v16.16b}, [x1], x2
+ ld1 {v17.16b}, [x1], x2
+ ld1 {v18.16b}, [x1], x2
+.macro calc src0, src1, src2, src3
+ ld1 {\src3\().16b}, [x1], x2
+ movi v4.8h, #0
+ movi v5.8h, #0
+ calc_epelb v4, \src0, \src1, \src2, \src3
+ calc_epelb2 v5, \src0, \src1, \src2, \src3
+ subs w3, w3, #1
+ st1 {v4.8h, v5.8h}, [x0], x10
+.endm
+1: calc_all4
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_epel_v24_8_neon, export=1
+ load_epel_filterb x5, x4
+ sub x1, x1, x2
+ mov x10, #(MAX_PB_SIZE * 2)
+ ld1 {v16.8b, v17.8b, v18.8b}, [x1], x2
+ ld1 {v19.8b, v20.8b, v21.8b}, [x1], x2
+ ld1 {v22.8b, v23.8b, v24.8b}, [x1], x2
+.macro calc src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11
+ ld1 {\src9\().8b, \src10\().8b, \src11\().8b}, [x1], x2
+ movi v4.8h, #0
+ movi v5.8h, #0
+ movi v6.8h, #0
+ calc_epelb v4, \src0, \src3, \src6, \src9
+ calc_epelb v5, \src1, \src4, \src7, \src10
+ calc_epelb v6, \src2, \src5, \src8, \src11
+ subs w3, w3, #1
+ st1 {v4.8h-v6.8h}, [x0], x10
+.endm
+1: calc_all12
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_epel_v32_8_neon, export=1
+ load_epel_filterb x5, x4
+ sub x1, x1, x2
+ mov x10, #(MAX_PB_SIZE * 2)
+ ld1 {v16.16b, v17.16b}, [x1], x2
+ ld1 {v18.16b, v19.16b}, [x1], x2
+ ld1 {v20.16b, v21.16b}, [x1], x2
+.macro calc src0, src1, src2, src3, src4, src5, src6, src7
+ ld1 {\src6\().16b, \src7\().16b}, [x1], x2
+ movi v4.8h, #0
+ movi v5.8h, #0
+ movi v6.8h, #0
+ movi v7.8h, #0
+ calc_epelb v4, \src0, \src2, \src4, \src6
+ calc_epelb2 v5, \src0, \src2, \src4, \src6
+ calc_epelb v6, \src1, \src3, \src5, \src7
+ calc_epelb2 v7, \src1, \src3, \src5, \src7
+ subs w3, w3, #1
+ st1 {v4.8h-v7.8h}, [x0], x10
+.endm
+1: calc_all8
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_epel_v48_8_neon, export=1
+ load_epel_filterb x5, x4
+ sub x1, x1, x2
+ mov x10, #64
+ ld1 {v16.16b, v17.16b, v18.16b}, [x1], x2
+ ld1 {v19.16b, v20.16b, v21.16b}, [x1], x2
+ ld1 {v22.16b, v23.16b, v24.16b}, [x1], x2
+.macro calc src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11
+ ld1 {\src9\().16b, \src10\().16b, \src11\().16b}, [x1], x2
+ movi v4.8h, #0
+ movi v5.8h, #0
+ movi v6.8h, #0
+ movi v7.8h, #0
+ movi v28.8h, #0
+ movi v29.8h, #0
+ calc_epelb v4, \src0, \src3, \src6, \src9
+ calc_epelb2 v5, \src0, \src3, \src6, \src9
+ calc_epelb v6, \src1, \src4, \src7, \src10
+ calc_epelb2 v7, \src1, \src4, \src7, \src10
+ calc_epelb v28, \src2, \src5, \src8, \src11
+ calc_epelb2 v29, \src2, \src5, \src8, \src11
+ st1 { v4.8h-v7.8h}, [x0], #64
+ subs w3, w3, #1
+ st1 {v28.8h-v29.8h}, [x0], x10
+.endm
+1: calc_all12
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_epel_v64_8_neon, export=1
+ load_epel_filterb x5, x4
+ sub sp, sp, #32
+ st1 {v8.8b-v11.8b}, [sp]
+ sub x1, x1, x2
+ ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x1], x2
+ ld1 {v20.16b, v21.16b, v22.16b, v23.16b}, [x1], x2
+ ld1 {v24.16b, v25.16b, v26.16b, v27.16b}, [x1], x2
+.macro calc src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11, src12, src13, src14, src15
+ ld1 {\src12\().16b-\src15\().16b}, [x1], x2
+ movi v4.8h, #0
+ movi v5.8h, #0
+ movi v6.8h, #0
+ movi v7.8h, #0
+ movi v8.8h, #0
+ movi v9.8h, #0
+ movi v10.8h, #0
+ movi v11.8h, #0
+ calc_epelb v4, \src0, \src4, \src8, \src12
+ calc_epelb2 v5, \src0, \src4, \src8, \src12
+ calc_epelb v6, \src1, \src5, \src9, \src13
+ calc_epelb2 v7, \src1, \src5, \src9, \src13
+ calc_epelb v8, \src2, \src6, \src10, \src14
+ calc_epelb2 v9, \src2, \src6, \src10, \src14
+ calc_epelb v10, \src3, \src7, \src11, \src15
+ calc_epelb2 v11, \src3, \src7, \src11, \src15
+ st1 {v4.8h-v7.8h}, [x0], #64
+ subs w3, w3, #1
+ st1 {v8.8h-v11.8h}, [x0], #64
+.endm
+1: calc_all16
+.purgem calc
+2: ld1 {v8.8b-v11.8b}, [sp]
+ add sp, sp, #32
+ ret
+endfunc
+
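+// hv: reserve a (height + 3) row temporary on the stack, run the matching
+// horizontal filter into it starting one row above the block, then apply
+// the vertical taps with calc_epelh/calc_epelh2 and store the 16-bit
+// result with the usual MAX_PB_SIZE * 2 stride.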
+function ff_hevc_put_hevc_epel_hv4_8_neon, export=1
+ add w10, w3, #3
+ lsl x10, x10, #7
+ sub sp, sp, x10 // tmp_array
+ stp x0, x3, [sp, #-16]!
+ stp x5, x30, [sp, #-16]!
+ add x0, sp, #32
+ sub x1, x1, x2
+ add w3, w3, #3
+ bl X(ff_hevc_put_hevc_epel_h4_8_neon)
+ ldp x5, x30, [sp], #16
+ ldp x0, x3, [sp], #16
+ load_epel_filterh x5, x4
+ mov x10, #(MAX_PB_SIZE * 2)
+ ld1 {v16.4h}, [sp], x10
+ ld1 {v17.4h}, [sp], x10
+ ld1 {v18.4h}, [sp], x10
+.macro calc src0, src1, src2, src3
+ ld1 {\src3\().4h}, [sp], x10
+ calc_epelh v4, \src0, \src1, \src2, \src3
+ subs w3, w3, #1
+ st1 {v4.4h}, [x0], x10
+.endm
+1: calc_all4
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_epel_hv6_8_neon, export=1
+ add w10, w3, #3
+ lsl x10, x10, #7
+ sub sp, sp, x10 // tmp_array
+ stp x0, x3, [sp, #-16]!
+ stp x5, x30, [sp, #-16]!
+ add x0, sp, #32
+ sub x1, x1, x2
+ add w3, w3, #3
+ bl X(ff_hevc_put_hevc_epel_h6_8_neon)
+ ldp x5, x30, [sp], #16
+ ldp x0, x3, [sp], #16
+ load_epel_filterh x5, x4
+ mov x5, #120
+ mov x10, #(MAX_PB_SIZE * 2)
+ ld1 {v16.8h}, [sp], x10
+ ld1 {v17.8h}, [sp], x10
+ ld1 {v18.8h}, [sp], x10
+.macro calc src0, src1, src2, src3
+ ld1 {\src3\().8h}, [sp], x10
+ calc_epelh v4, \src0, \src1, \src2, \src3
+ calc_epelh2 v4, v5, \src0, \src1, \src2, \src3
+ st1 {v4.d}[0], [x0], #8
+ subs w3, w3, #1
+ st1 {v4.s}[2], [x0], x5
+.endm
+1: calc_all4
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_epel_hv8_8_neon, export=1
+ add w10, w3, #3
+ lsl x10, x10, #7
+ sub sp, sp, x10 // tmp_array
+ stp x0, x3, [sp, #-16]!
+ stp x5, x30, [sp, #-16]!
+ add x0, sp, #32
+ sub x1, x1, x2
+ add w3, w3, #3
+ bl X(ff_hevc_put_hevc_epel_h8_8_neon)
+ ldp x5, x30, [sp], #16
+ ldp x0, x3, [sp], #16
+ load_epel_filterh x5, x4
+ mov x10, #(MAX_PB_SIZE * 2)
+ ld1 {v16.8h}, [sp], x10
+ ld1 {v17.8h}, [sp], x10
+ ld1 {v18.8h}, [sp], x10
+.macro calc src0, src1, src2, src3
+ ld1 {\src3\().8h}, [sp], x10
+ calc_epelh v4, \src0, \src1, \src2, \src3
+ calc_epelh2 v4, v5, \src0, \src1, \src2, \src3
+ subs w3, w3, #1
+ st1 {v4.8h}, [x0], x10
+.endm
+1: calc_all4
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_epel_hv12_8_neon, export=1
+ add w10, w3, #3
+ lsl x10, x10, #7
+ sub sp, sp, x10 // tmp_array
+ stp x0, x3, [sp, #-16]!
+ stp x5, x30, [sp, #-16]!
+ add x0, sp, #32
+ sub x1, x1, x2
+ add w3, w3, #3
+ bl X(ff_hevc_put_hevc_epel_h12_8_neon)
+ ldp x5, x30, [sp], #16
+ ldp x0, x3, [sp], #16
+ load_epel_filterh x5, x4
+ mov x5, #112
+ mov x10, #(MAX_PB_SIZE * 2)
+ ld1 {v16.8h, v17.8h}, [sp], x10
+ ld1 {v18.8h, v19.8h}, [sp], x10
+ ld1 {v20.8h, v21.8h}, [sp], x10
+.macro calc src0, src1, src2, src3, src4, src5, src6, src7
+ ld1 {\src6\().8h, \src7\().8h}, [sp], x10
+ calc_epelh v4, \src0, \src2, \src4, \src6
+ calc_epelh2 v4, v5, \src0, \src2, \src4, \src6
+ calc_epelh v5, \src1, \src3, \src5, \src7
+ st1 {v4.8h}, [x0], #16
+ subs w3, w3, #1
+ st1 {v5.4h}, [x0], x5
+.endm
+1: calc_all8
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_epel_hv16_8_neon, export=1
+ add w10, w3, #3
+ lsl x10, x10, #7
+ sub sp, sp, x10 // tmp_array
+ stp x0, x3, [sp, #-16]!
+ stp x5, x30, [sp, #-16]!
+ add x0, sp, #32
+ sub x1, x1, x2
+ add w3, w3, #3
+ bl X(ff_hevc_put_hevc_epel_h16_8_neon)
+ ldp x5, x30, [sp], #16
+ ldp x0, x3, [sp], #16
+ load_epel_filterh x5, x4
+ mov x10, #(MAX_PB_SIZE * 2)
+ ld1 {v16.8h, v17.8h}, [sp], x10
+ ld1 {v18.8h, v19.8h}, [sp], x10
+ ld1 {v20.8h, v21.8h}, [sp], x10
+.macro calc src0, src1, src2, src3, src4, src5, src6, src7
+ ld1 {\src6\().8h, \src7\().8h}, [sp], x10
+ calc_epelh v4, \src0, \src2, \src4, \src6
+ calc_epelh2 v4, v5, \src0, \src2, \src4, \src6
+ calc_epelh v5, \src1, \src3, \src5, \src7
+ calc_epelh2 v5, v6, \src1, \src3, \src5, \src7
+ subs w3, w3, #1
+ st1 {v4.8h, v5.8h}, [x0], x10
+.endm
+1: calc_all8
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_epel_hv24_8_neon, export=1
+ add w10, w3, #3
+ lsl x10, x10, #7
+ sub sp, sp, x10 // tmp_array
+ stp x0, x3, [sp, #-16]!
+ stp x5, x30, [sp, #-16]!
+ add x0, sp, #32
+ sub x1, x1, x2
+ add w3, w3, #3
+ bl X(ff_hevc_put_hevc_epel_h24_8_neon)
+ ldp x5, x30, [sp], #16
+ ldp x0, x3, [sp], #16
+ load_epel_filterh x5, x4
+ mov x10, #(MAX_PB_SIZE * 2)
+ ld1 {v16.8h, v17.8h, v18.8h}, [sp], x10
+ ld1 {v19.8h, v20.8h, v21.8h}, [sp], x10
+ ld1 {v22.8h, v23.8h, v24.8h}, [sp], x10
+.macro calc src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11
+ ld1 {\src9\().8h-\src11\().8h}, [sp], x10
+ calc_epelh v4, \src0, \src3, \src6, \src9
+ calc_epelh2 v4, v5, \src0, \src3, \src6, \src9
+ calc_epelh v5, \src1, \src4, \src7, \src10
+ calc_epelh2 v5, v6, \src1, \src4, \src7, \src10
+ calc_epelh v6, \src2, \src5, \src8, \src11
+ calc_epelh2 v6, v7, \src2, \src5, \src8, \src11
+ subs w3, w3, #1
+ st1 {v4.8h-v6.8h}, [x0], x10
+.endm
+1: calc_all12
+.purgem calc
+2: ret
+endfunc
+
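+// The wider hv sizes are assembled from repeated 16- or 24-wide calls on
+// adjacent column strips, reloading the saved arguments and offsetting dst
+// (in 16-bit samples) and src (in pixels) between the calls.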
+function ff_hevc_put_hevc_epel_hv32_8_neon, export=1
+ stp xzr, x30, [sp, #-16]!
+ stp x0, x1, [sp, #-16]!
+ stp x2, x3, [sp, #-16]!
+ stp x4, x5, [sp, #-16]!
+ mov x6, #16
+ bl X(ff_hevc_put_hevc_epel_hv16_8_neon)
+ ldp x4, x5, [sp], #16
+ ldp x2, x3, [sp], #16
+ ldp x0, x1, [sp], #16
+ add x0, x0, #32
+ add x1, x1, #16
+ mov x6, #16
+ bl X(ff_hevc_put_hevc_epel_hv16_8_neon)
+ ldp xzr, x30, [sp], #16
+ ret
+endfunc
+
+function ff_hevc_put_hevc_epel_hv48_8_neon, export=1
+ stp xzr, x30, [sp, #-16]!
+ stp x0, x1, [sp, #-16]!
+ stp x2, x3, [sp, #-16]!
+ stp x4, x5, [sp, #-16]!
+ mov x6, #24
+ bl X(ff_hevc_put_hevc_epel_hv24_8_neon)
+ ldp x4, x5, [sp], #16
+ ldp x2, x3, [sp], #16
+ ldp x0, x1, [sp], #16
+ add x0, x0, #48
+ add x1, x1, #24
+ mov x6, #24
+ bl X(ff_hevc_put_hevc_epel_hv24_8_neon)
+ ldp xzr, x30, [sp], #16
+ ret
+endfunc
+
+function ff_hevc_put_hevc_epel_hv64_8_neon, export=1
+ stp xzr, x30, [sp, #-16]!
+ stp x0, x1, [sp, #-16]!
+ stp x2, x3, [sp, #-16]!
+ stp x4, x5, [sp, #-16]!
+ mov x6, #16
+ bl X(ff_hevc_put_hevc_epel_hv16_8_neon)
+ ldp x4, x5, [sp]
+ ldp x2, x3, [sp, #16]
+ ldp x0, x1, [sp, #32]
+ add x0, x0, #32
+ add x1, x1, #16
+ mov x6, #16
+ bl X(ff_hevc_put_hevc_epel_hv16_8_neon)
+ ldp x4, x5, [sp]
+ ldp x2, x3, [sp, #16]
+ ldp x0, x1, [sp, #32]
+ add x0, x0, #64
+ add x1, x1, #32
+ mov x6, #16
+ bl X(ff_hevc_put_hevc_epel_hv16_8_neon)
+ ldp x4, x5, [sp], #16
+ ldp x2, x3, [sp], #16
+ ldp x0, x1, [sp], #16
+ add x0, x0, #96
+ add x1, x1, #48
+ mov x6, #16
+ bl X(ff_hevc_put_hevc_epel_hv16_8_neon)
+ ldp xzr, x30, [sp], #16
+ ret
+endfunc
+
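+// uni: same filtering as the put versions, but the result is narrowed
+// straight to 8-bit output with a saturating rounding shift right by 6
+// and stored with the destination stride instead of going through the
+// intermediate buffer.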
+function ff_hevc_put_hevc_epel_uni_v4_8_neon, export=1
+ load_epel_filterb x6, x5
+ sxtw x3, w3
+ sxtw x1, w1
+ sub x2, x2, x3
+ ld1 {v16.s}[0], [x2], x3
+ ld1 {v17.s}[0], [x2], x3
+ ld1 {v18.s}[0], [x2], x3
+.macro calc src0, src1, src2, src3
+ ld1 {\src3\().s}[0], [x2], x3
+ movi v4.8h, #0
+ calc_epelb v4, \src0, \src1, \src2, \src3
+ sqrshrun v4.8b, v4.8h, #6
+ subs w4, w4, #1
+ st1 {v4.s}[0], [x0], x1
+.endm
+1: calc_all4
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_v6_8_neon, export=1
+ load_epel_filterb x6, x5
+ sxtw x3, w3
+ sxtw x1, w1
+ sub x2, x2, x3
+ sub x1, x1, #4
+ ld1 {v16.8b}, [x2], x3
+ ld1 {v17.8b}, [x2], x3
+ ld1 {v18.8b}, [x2], x3
+.macro calc src0, src1, src2, src3
+ ld1 {\src3\().8b}, [x2], x3
+ movi v4.8h, #0
+ calc_epelb v4, \src0, \src1, \src2, \src3
+ sqrshrun v4.8b, v4.8h, #6
+ st1 {v4.s}[0], [x0], #4
+ subs w4, w4, #1
+ st1 {v4.h}[2], [x0], x1
+.endm
+1: calc_all4
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_v8_8_neon, export=1
+ load_epel_filterb x6, x5
+ sxtw x3, w3
+ sxtw x1, w1
+ sub x2, x2, x3
+ ld1 {v16.8b}, [x2], x3
+ ld1 {v17.8b}, [x2], x3
+ ld1 {v18.8b}, [x2], x3
+.macro calc src0, src1, src2, src3
+ ld1 {\src3\().8b}, [x2], x3
+ movi v4.8h, #0
+ calc_epelb v4, \src0, \src1, \src2, \src3
+ sqrshrun v4.8b, v4.8h, #6
+ subs w4, w4, #1
+ st1 {v4.8b}, [x0], x1
+.endm
+1: calc_all4
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_v12_8_neon, export=1
+ load_epel_filterb x6, x5
+ sxtw x3, w3
+ sxtw x1, w1
+ sub x2, x2, x3
+ sub x1, x1, #8
+ ld1 {v16.16b}, [x2], x3
+ ld1 {v17.16b}, [x2], x3
+ ld1 {v18.16b}, [x2], x3
+.macro calc src0, src1, src2, src3
+ ld1 {\src3\().16b}, [x2], x3
+ movi v4.8h, #0
+ movi v5.8h, #0
+ calc_epelb v4, \src0, \src1, \src2, \src3
+ calc_epelb2 v5, \src0, \src1, \src2, \src3
+ sqrshrun v4.8b, v4.8h, #6
+ sqrshrun2 v4.16b, v5.8h, #6
+ subs w4, w4, #1
+ st1 {v4.8b}, [x0], #8
+ st1 {v4.s}[2], [x0], x1
+.endm
+1: calc_all4
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_v16_8_neon, export=1
+ load_epel_filterb x6, x5
+ sxtw x3, w3
+ sxtw x1, w1
+ sub x2, x2, x3
+ ld1 {v16.16b}, [x2], x3
+ ld1 {v17.16b}, [x2], x3
+ ld1 {v18.16b}, [x2], x3
+.macro calc src0, src1, src2, src3
+ ld1 {\src3\().16b}, [x2], x3
+ movi v4.8h, #0
+ movi v5.8h, #0
+ calc_epelb v4, \src0, \src1, \src2, \src3
+ calc_epelb2 v5, \src0, \src1, \src2, \src3
+ sqrshrun v4.8b, v4.8h, #6
+ sqrshrun2 v4.16b, v5.8h, #6
+ subs w4, w4, #1
+ st1 {v4.16b}, [x0], x1
+.endm
+1: calc_all4
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_v24_8_neon, export=1
+ load_epel_filterb x6, x5
+ sxtw x3, w3
+ sxtw x1, w1
+ sub x2, x2, x3
+ ld1 {v16.8b, v17.8b, v18.8b}, [x2], x3
+ ld1 {v19.8b, v20.8b, v21.8b}, [x2], x3
+ ld1 {v22.8b, v23.8b, v24.8b}, [x2], x3
+.macro calc src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11
+ ld1 {\src9\().8b, \src10\().8b, \src11\().8b}, [x2], x3
+ movi v4.8h, #0
+ movi v5.8h, #0
+ movi v6.8h, #0
+ calc_epelb v4, \src0, \src3, \src6, \src9
+ calc_epelb v5, \src1, \src4, \src7, \src10
+ calc_epelb v6, \src2, \src5, \src8, \src11
+ sqrshrun v4.8b, v4.8h, #6
+ sqrshrun v5.8b, v5.8h, #6
+ sqrshrun v6.8b, v6.8h, #6
+ subs w4, w4, #1
+ st1 {v4.8b-v6.8b}, [x0], x1
+.endm
+1: calc_all12
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_v32_8_neon, export=1
+ load_epel_filterb x6, x5
+ sxtw x3, w3
+ sxtw x1, w1
+ sub x2, x2, x3
+ ld1 {v16.16b, v17.16b}, [x2], x3
+ ld1 {v18.16b, v19.16b}, [x2], x3
+ ld1 {v20.16b, v21.16b}, [x2], x3
+.macro calc src0, src1, src2, src3, src4, src5, src6, src7
+ ld1 {\src6\().16b, \src7\().16b}, [x2], x3
+ movi v4.8h, #0
+ movi v5.8h, #0
+ movi v6.8h, #0
+ movi v7.8h, #0
+ calc_epelb v4, \src0, \src2, \src4, \src6
+ calc_epelb2 v5, \src0, \src2, \src4, \src6
+ calc_epelb v6, \src1, \src3, \src5, \src7
+ calc_epelb2 v7, \src1, \src3, \src5, \src7
+ sqrshrun v4.8b, v4.8h, #6
+ sqrshrun2 v4.16b, v5.8h, #6
+ sqrshrun v5.8b, v6.8h, #6
+ sqrshrun2 v5.16b, v7.8h, #6
+ subs w4, w4, #1
+ st1 {v4.16b, v5.16b}, [x0], x1
+.endm
+1: calc_all8
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_v48_8_neon, export=1
+ load_epel_filterb x6, x5
+ sxtw x3, w3
+ sxtw x1, w1
+ sub x2, x2, x3
+ ld1 {v16.16b, v17.16b, v18.16b}, [x2], x3
+ ld1 {v19.16b, v20.16b, v21.16b}, [x2], x3
+ ld1 {v22.16b, v23.16b, v24.16b}, [x2], x3
+.macro calc src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11
+ ld1 {\src9\().16b, \src10\().16b, \src11\().16b}, [x2], x3
+ movi v4.8h, #0
+ movi v5.8h, #0
+ movi v6.8h, #0
+ movi v7.8h, #0
+ movi v28.8h, #0
+ movi v29.8h, #0
+ calc_epelb v4, \src0, \src3, \src6, \src9
+ calc_epelb2 v5, \src0, \src3, \src6, \src9
+ calc_epelb v6, \src1, \src4, \src7, \src10
+ calc_epelb2 v7, \src1, \src4, \src7, \src10
+ calc_epelb v28, \src2, \src5, \src8, \src11
+ calc_epelb2 v29, \src2, \src5, \src8, \src11
+ sqrshrun v4.8b, v4.8h, #6
+ sqrshrun2 v4.16b, v5.8h, #6
+ sqrshrun v5.8b, v6.8h, #6
+ sqrshrun2 v5.16b, v7.8h, #6
+ sqrshrun v6.8b, v28.8h, #6
+ sqrshrun2 v6.16b, v29.8h, #6
+ subs w4, w4, #1
+ st1 {v4.16b, v5.16b, v6.16b}, [x0], x1
+.endm
+1: calc_all12
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_v64_8_neon, export=1
+ load_epel_filterb x6, x5
+ sub sp, sp, #32
+ sxtw x3, w3
+ sxtw x1, w1
+ st1 {v8.8b-v11.8b}, [sp]
+ sub x2, x2, x3
+ ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x2], x3
+ ld1 {v20.16b, v21.16b, v22.16b, v23.16b}, [x2], x3
+ ld1 {v24.16b, v25.16b, v26.16b, v27.16b}, [x2], x3
+.macro calc src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11, src12, src13, src14, src15
+ ld1 {\src12\().16b, \src13\().16b, \src14\().16b, \src15\().16b}, [x2], x3
+ movi v4.8h, #0
+ movi v5.8h, #0
+ movi v6.8h, #0
+ movi v7.8h, #0
+ movi v8.8h, #0
+ movi v9.8h, #0
+ movi v10.8h, #0
+ movi v11.8h, #0
+ calc_epelb v10, \src3, \src7, \src11, \src15
+ calc_epelb2 v11, \src3, \src7, \src11, \src15
+ calc_epelb v4, \src0, \src4, \src8, \src12
+ calc_epelb2 v5, \src0, \src4, \src8, \src12
+ calc_epelb v6, \src1, \src5, \src9, \src13
+ calc_epelb2 v7, \src1, \src5, \src9, \src13
+ calc_epelb v8, \src2, \src6, \src10, \src14
+ calc_epelb2 v9, \src2, \src6, \src10, \src14
+ sqrshrun v4.8b, v4.8h, #6
+ sqrshrun2 v4.16b, v5.8h, #6
+ sqrshrun v5.8b, v6.8h, #6
+ sqrshrun2 v5.16b, v7.8h, #6
+ sqrshrun v6.8b, v8.8h, #6
+ sqrshrun2 v6.16b, v9.8h, #6
+ sqrshrun v7.8b, v10.8h, #6
+ sqrshrun2 v7.16b, v11.8h, #6
+ subs w4, w4, #1
+ st1 {v4.16b, v5.16b, v6.16b, v7.16b}, [x0], x1
+.endm
+1: calc_all16
+.purgem calc
+2: ld1 {v8.8b-v11.8b}, [sp]
+ add sp, sp, #32
+ ret
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_hv4_8_neon, export=1
+ add w10, w4, #3
+ lsl x10, x10, #7
+ sub sp, sp, x10 // tmp_array
+ stp x0, x1, [sp, #-16]!
+ stp x4, x6, [sp, #-16]!
+ stp xzr, x30, [sp, #-16]!
+ add x0, sp, #48
+ sub x1, x2, x3
+ mov x2, x3
+ add w3, w4, #3
+ mov x4, x5
+ bl X(ff_hevc_put_hevc_epel_h4_8_neon)
+ ldp xzr, x30, [sp], #16
+ ldp x4, x6, [sp], #16
+ ldp x0, x1, [sp], #16
+ load_epel_filterh x6, x5
+ mov x10, #(MAX_PB_SIZE * 2)
+ ld1 {v16.4h}, [sp], x10
+ ld1 {v17.4h}, [sp], x10
+ ld1 {v18.4h}, [sp], x10
+.macro calc src0, src1, src2, src3
+ ld1 {\src3\().4h}, [sp], x10
+ calc_epelh v4, \src0, \src1, \src2, \src3
+ sqrshrun v4.8b, v4.8h, #6
+ subs w4, w4, #1
+ st1 {v4.s}[0], [x0], x1
+.endm
+1: calc_all4
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_hv6_8_neon, export=1
+ add w10, w4, #3
+ lsl x10, x10, #7
+ sub sp, sp, x10 // tmp_array
+ stp x0, x1, [sp, #-16]!
+ stp x4, x6, [sp, #-16]!
+ stp xzr, x30, [sp, #-16]!
+ add x0, sp, #48
+ sub x1, x2, x3
+ mov x2, x3
+ add w3, w4, #3
+ mov x4, x5
+ bl X(ff_hevc_put_hevc_epel_h6_8_neon)
+ ldp xzr, x30, [sp], #16
+ ldp x4, x6, [sp], #16
+ ldp x0, x1, [sp], #16
+ load_epel_filterh x6, x5
+ sub x1, x1, #4
+ mov x10, #(MAX_PB_SIZE * 2)
+ ld1 {v16.8h}, [sp], x10
+ ld1 {v17.8h}, [sp], x10
+ ld1 {v18.8h}, [sp], x10
+.macro calc src0, src1, src2, src3
+ ld1 {\src3\().8h}, [sp], x10
+ calc_epelh v4, \src0, \src1, \src2, \src3
+ calc_epelh2 v4, v5, \src0, \src1, \src2, \src3
+ sqrshrun v4.8b, v4.8h, #6
+ st1 {v4.s}[0], [x0], #4
+ subs w4, w4, #1
+ st1 {v4.h}[2], [x0], x1
+.endm
+1: calc_all4
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_hv8_8_neon, export=1
+ add w10, w4, #3
+ lsl x10, x10, #7
+ sub sp, sp, x10 // tmp_array
+ stp x0, x1, [sp, #-16]!
+ stp x4, x6, [sp, #-16]!
+ stp xzr, x30, [sp, #-16]!
+ add x0, sp, #48
+ sub x1, x2, x3
+ mov x2, x3
+ add w3, w4, #3
+ mov x4, x5
+ bl X(ff_hevc_put_hevc_epel_h8_8_neon)
+ ldp xzr, x30, [sp], #16
+ ldp x4, x6, [sp], #16
+ ldp x0, x1, [sp], #16
+ load_epel_filterh x6, x5
+ mov x10, #(MAX_PB_SIZE * 2)
+ ld1 {v16.8h}, [sp], x10
+ ld1 {v17.8h}, [sp], x10
+ ld1 {v18.8h}, [sp], x10
+.macro calc src0, src1, src2, src3
+ ld1 {\src3\().8h}, [sp], x10
+ calc_epelh v4, \src0, \src1, \src2, \src3
+ calc_epelh2 v4, v5, \src0, \src1, \src2, \src3
+ sqrshrun v4.8b, v4.8h, #6
+ subs w4, w4, #1
+ st1 {v4.8b}, [x0], x1
+.endm
+1: calc_all4
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_hv12_8_neon, export=1
+ add w10, w4, #3
+ lsl x10, x10, #7
+ sub sp, sp, x10 // tmp_array
+ stp x0, x1, [sp, #-16]!
+ stp x4, x6, [sp, #-16]!
+ stp xzr, x30, [sp, #-16]!
+ add x0, sp, #48
+ sub x1, x2, x3
+ mov x2, x3
+ add w3, w4, #3
+ mov x4, x5
+ bl X(ff_hevc_put_hevc_epel_h12_8_neon)
+ ldp xzr, x30, [sp], #16
+ ldp x4, x6, [sp], #16
+ ldp x0, x1, [sp], #16
+ load_epel_filterh x6, x5
+ sub x1, x1, #8
+ mov x10, #(MAX_PB_SIZE * 2)
+ ld1 {v16.8h, v17.8h}, [sp], x10
+ ld1 {v18.8h, v19.8h}, [sp], x10
+ ld1 {v20.8h, v21.8h}, [sp], x10
+.macro calc src0, src1, src2, src3, src4, src5, src6, src7
+ ld1 {\src6\().8h, \src7\().8h}, [sp], x10
+ calc_epelh v4, \src0, \src2, \src4, \src6
+ calc_epelh2 v4, v5, \src0, \src2, \src4, \src6
+ calc_epelh v5, \src1, \src3, \src5, \src7
+ sqrshrun v4.8b, v4.8h, #6
+ sqrshrun2 v4.16b, v5.8h, #6
+ st1 {v4.8b}, [x0], #8
+ st1 {v4.s}[2], [x0], x1
+ subs w4, w4, #1
+.endm
+1: calc_all8
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_hv16_8_neon, export=1
+ add w10, w4, #3
+ lsl x10, x10, #7
+ sub sp, sp, x10 // tmp_array
+ stp x0, x1, [sp, #-16]!
+ stp x4, x6, [sp, #-16]!
+ stp xzr, x30, [sp, #-16]!
+ add x0, sp, #48
+ sub x1, x2, x3
+ mov x2, x3
+ add w3, w4, #3
+ mov x4, x5
+ bl X(ff_hevc_put_hevc_epel_h16_8_neon)
+ ldp xzr, x30, [sp], #16
+ ldp x4, x6, [sp], #16
+ ldp x0, x1, [sp], #16
+ load_epel_filterh x6, x5
+ mov x10, #(MAX_PB_SIZE * 2)
+ ld1 {v16.8h, v17.8h}, [sp], x10
+ ld1 {v18.8h, v19.8h}, [sp], x10
+ ld1 {v20.8h, v21.8h}, [sp], x10
+.macro calc src0, src1, src2, src3, src4, src5, src6, src7
+ ld1 {\src6\().8h, \src7\().8h}, [sp], x10
+ calc_epelh v4, \src0, \src2, \src4, \src6
+ calc_epelh2 v4, v5, \src0, \src2, \src4, \src6
+ calc_epelh v5, \src1, \src3, \src5, \src7
+ calc_epelh2 v5, v6, \src1, \src3, \src5, \src7
+ sqrshrun v4.8b, v4.8h, #6
+ sqrshrun2 v4.16b, v5.8h, #6
+ subs w4, w4, #1
+ st1 {v4.16b}, [x0], x1
+.endm
+1: calc_all8
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_hv24_8_neon, export=1
+ add w10, w4, #3
+ lsl x10, x10, #7
+ sub sp, sp, x10 // tmp_array
+ stp x0, x1, [sp, #-16]!
+ stp x4, x6, [sp, #-16]!
+ stp xzr, x30, [sp, #-16]!
+ add x0, sp, #48
+ sub x1, x2, x3
+ mov x2, x3
+ add w3, w4, #3
+ mov x4, x5
+ bl X(ff_hevc_put_hevc_epel_h24_8_neon)
+ ldp xzr, x30, [sp], #16
+ ldp x4, x6, [sp], #16
+ ldp x0, x1, [sp], #16
+ load_epel_filterh x6, x5
+ mov x10, #(MAX_PB_SIZE * 2)
+ ld1 {v16.8h, v17.8h, v18.8h}, [sp], x10
+ ld1 {v19.8h, v20.8h, v21.8h}, [sp], x10
+ ld1 {v22.8h, v23.8h, v24.8h}, [sp], x10
+.macro calc src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11
+ ld1 {\src9\().8h, \src10\().8h, \src11\().8h}, [sp], x10
+ calc_epelh v4, \src0, \src3, \src6, \src9
+ calc_epelh2 v4, v5, \src0, \src3, \src6, \src9
+ calc_epelh v5, \src1, \src4, \src7, \src10
+ calc_epelh2 v5, v6, \src1, \src4, \src7, \src10
+ calc_epelh v6, \src2, \src5, \src8, \src11
+ calc_epelh2 v6, v7, \src2, \src5, \src8, \src11
+ sqrshrun v4.8b, v4.8h, #6
+ sqrshrun v5.8b, v5.8h, #6
+ sqrshrun v6.8b, v6.8h, #6
+ subs w4, w4, #1
+ st1 {v4.8b, v5.8b, v6.8b}, [x0], x1
+.endm
+1: calc_all12
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_hv32_8_neon, export=1
+ stp x0, x30, [sp, #-16]!
+ stp x1, x2, [sp, #-16]!
+ stp x3, x4, [sp, #-16]!
+ stp x5, x6, [sp, #-16]!
+ mov x7, #16
+ bl X(ff_hevc_put_hevc_epel_uni_hv16_8_neon)
+ ldp x5, x6, [sp], #16
+ ldp x3, x4, [sp], #16
+ ldp x1, x2, [sp], #16
+ ldr x0, [sp]
+ add x0, x0, #16
+ add x2, x2, #16
+ mov x7, #16
+ bl X(ff_hevc_put_hevc_epel_uni_hv16_8_neon)
+ ldp xzr, x30, [sp], #16
+ ret
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_hv48_8_neon, export=1
+ stp x0, x30, [sp, #-16]!
+ stp x1, x2, [sp, #-16]!
+ stp x3, x4, [sp, #-16]!
+ stp x5, x6, [sp, #-16]!
+ mov x7, #24
+ bl X(ff_hevc_put_hevc_epel_uni_hv24_8_neon)
+ ldp x5, x6, [sp], #16
+ ldp x3, x4, [sp], #16
+ ldp x1, x2, [sp], #16
+ ldr x0, [sp]
+ add x0, x0, #24
+ add x2, x2, #24
+ mov x7, #24
+ bl X(ff_hevc_put_hevc_epel_uni_hv24_8_neon)
+ ldp xzr, x30, [sp], #16
+ ret
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_hv64_8_neon, export=1
+ stp x0, x30, [sp, #-16]!
+ stp x1, x2, [sp, #-16]!
+ stp x3, x4, [sp, #-16]!
+ stp x5, x6, [sp, #-16]!
+ mov x7, #16
+ bl X(ff_hevc_put_hevc_epel_uni_hv16_8_neon)
+ ldp x5, x6, [sp]
+ ldp x3, x4, [sp, #16]
+ ldp x1, x2, [sp, #32]
+ ldr x0, [sp, #48]
+ add x0, x0, #16
+ add x2, x2, #16
+ mov x7, #16
+ bl X(ff_hevc_put_hevc_epel_uni_hv16_8_neon)
+ ldp x5, x6, [sp]
+ ldp x3, x4, [sp, #16]
+ ldp x1, x2, [sp, #32]
+ ldr x0, [sp, #48]
+ add x0, x0, #32
+ add x2, x2, #32
+ mov x7, #16
+ bl X(ff_hevc_put_hevc_epel_uni_hv16_8_neon)
+ ldp x5, x6, [sp], #16
+ ldp x3, x4, [sp], #16
+ ldp x1, x2, [sp], #16
+ ldr x0, [sp]
+ add x0, x0, #48
+ add x2, x2, #48
+ mov x7, #16
+ bl X(ff_hevc_put_hevc_epel_uni_hv16_8_neon)
+ ldp xzr, x30, [sp], #16
+ ret
+endfunc
+
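+// bi: the filtered value is summed with the matching row of the int16_t
+// src2 block (stride MAX_PB_SIZE * 2) using saturating adds, then narrowed
+// to 8 bits with a saturating rounding shift right by 7.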
+function ff_hevc_put_hevc_epel_bi_h4_8_neon, export=1
+ load_epel_filterb x6, x7
+ sub x2, x2, #1
+ mov x10, #(MAX_PB_SIZE * 2)
+1: ld1 {v4.8b}, [x2], x3
+ ushr v5.2d, v4.2d, #8
+ ushr v6.2d, v5.2d, #8
+ ushr v7.2d, v6.2d, #8
+ movi v16.8h, #0
+ calc_epelb v16, v4, v5, v6, v7
+ ld1 {v20.4h}, [x4], x10
+ sqadd v16.8h, v16.8h, v20.8h
+ sqrshrun v4.8b, v16.8h, #7
+ st1 {v4.s}[0], [x0], x1
+ subs w5, w5, #1 // height
+ b.ne 1b
+ ret
+endfunc
+
+function ff_hevc_put_hevc_epel_bi_h6_8_neon, export=1
+ load_epel_filterb x6, x7
+ sub w1, w1, #4
+ sub x2, x2, #1
+ sub x3, x3, #8
+ mov x10, #(MAX_PB_SIZE * 2)
+1: ld1 {v24.8b}, [x2], #8
+ ushr v26.2d, v24.2d, #8
+ ushr v27.2d, v26.2d, #8
+ ushr v28.2d, v27.2d, #8
+ movi v16.8h, #0
+ ld1 {v28.b}[5], [x2], x3
+ calc_epelb v16, v24, v26, v27, v28
+ ld1 {v20.8h}, [x4], x10
+ sqadd v16.8h, v16.8h, v20.8h
+ sqrshrun v16.8b, v16.8h, #7
+ st1 {v16.s}[0], [x0], #4
+ st1 {v16.h}[2], [x0], x1
+ subs w5, w5, #1 // height
+ b.ne 1b
+ ret
+endfunc
+
+function ff_hevc_put_hevc_epel_bi_h8_8_neon, export=1
+ load_epel_filterb x6, x7
+ sub x2, x2, #1
+ mov x10, #(MAX_PB_SIZE * 2)
+1: ld2 {v24.8b, v25.8b}, [x2], x3
+ ushr v26.2d, v24.2d, #8
+ ushr v27.2d, v25.2d, #8
+ ushr v28.2d, v26.2d, #8
+ movi v16.8h, #0
+ movi v17.8h, #0
+ calc_epelb v16, v24, v25, v26, v27
+ calc_epelb v17, v25, v26, v27, v28
+ zip1 v16.8h, v16.8h, v17.8h
+ ld1 {v20.8h}, [x4], x10
+ sqadd v16.8h, v16.8h, v20.8h
+ sqrshrun v16.8b, v16.8h, #7
+ st1 {v16.8b}, [x0], x1
+ subs w5, w5, #1 // height
+ b.ne 1b
+ ret
+endfunc
+
+function ff_hevc_put_hevc_epel_bi_h12_8_neon, export=1
+ load_epel_filterb x6, x7
+ sub x1, x1, #8
+ sub x2, x2, #1
+ mov x10, #(MAX_PB_SIZE * 2)
+1: ld2 {v24.8b, v25.8b}, [x2], x3
+ ushr v26.2d, v24.2d, #8
+ ushr v27.2d, v25.2d, #8
+ ushr v28.2d, v26.2d, #8
+ movi v16.8h, #0
+ movi v17.8h, #0
+ calc_epelb v16, v24, v25, v26, v27
+ calc_epelb v17, v25, v26, v27, v28
+ zip1 v18.8h, v16.8h, v17.8h
+ zip2 v19.8h, v16.8h, v17.8h
+ ld1 {v20.8h, v21.8h}, [x4], x10
+ sqadd v18.8h, v18.8h, v20.8h
+ sqadd v19.8h, v19.8h, v21.8h
+ sqrshrun v20.8b, v18.8h, #7
+ sqrshrun v21.8b, v19.8h, #7
+ st1 {v20.8b}, [x0], #8
+ st1 {v21.s}[0], [x0], x1
+ subs w5, w5, #1 // height
+ b.ne 1b
+ ret
+endfunc
+
+function ff_hevc_put_hevc_epel_bi_h16_8_neon, export=1
+ load_epel_filterb x6, x7
+ sub x2, x2, #1
+ sub w3, w3, #16
+ mov x10, #(MAX_PB_SIZE * 2)
+1: ld2 {v24.8b, v25.8b}, [x2], #16
+ ld1 {v20.s}[0], [x2], x3
+ ushr v26.2d, v24.2d, #8
+ ushr v27.2d, v25.2d, #8
+ mov v26.b[7], v20.b[0]
+ mov v27.b[7], v20.b[1]
+ ushr v28.2d, v26.2d, #8
+ mov v28.b[7], v20.b[2]
+ movi v16.8h, #0
+ movi v17.8h, #0
+ calc_epelb v16, v24, v25, v26, v27
+ calc_epelb v17, v25, v26, v27, v28
+ zip1 v18.8h, v16.8h, v17.8h
+ zip2 v19.8h, v16.8h, v17.8h
+ ld2 {v24.8h, v25.8h}, [x4], x10
+ sqadd v16.8h, v16.8h, v24.8h
+ sqadd v17.8h, v17.8h, v25.8h
+ sqrshrun v4.8b, v16.8h, #7
+ sqrshrun v5.8b, v17.8h, #7
+ st2 {v4.8b, v5.8b}, [x0], x1
+ subs w5, w5, #1 // height
+ b.ne 1b
+ ret
+endfunc
+
+function ff_hevc_put_hevc_epel_bi_h24_8_neon, export=1
+ load_epel_filterb x6, x7
+ sub x2, x2, #1
+ sub w3, w3, #24
+ mov x10, #(MAX_PB_SIZE * 2)
+1: ld3 {v24.8b, v25.8b, v26.8b}, [x2], #24
+ ld1 {v20.s}[0], [x2], x3
+ ushr v27.2d, v24.2d, #8
+ ushr v28.2d, v25.2d, #8
+ ushr v29.2d, v26.2d, #8
+ mov v27.b[7], v20.b[0]
+ mov v28.b[7], v20.b[1]
+ mov v29.b[7], v20.b[2]
+ movi v16.8h, #0
+ movi v17.8h, #0
+ movi v18.8h, #0
+ calc_epelb v16, v24, v25, v26, v27
+ calc_epelb v17, v25, v26, v27, v28
+ calc_epelb v18, v26, v27, v28, v29
+ ld3 {v24.8h, v25.8h, v26.8h}, [x4], x10
+ sqadd v16.8h, v16.8h, v24.8h
+ sqadd v17.8h, v17.8h, v25.8h
+ sqadd v18.8h, v18.8h, v26.8h
+ sqrshrun v4.8b, v16.8h, #7
+ sqrshrun v5.8b, v17.8h, #7
+ sqrshrun v6.8b, v18.8h, #7
+ st3 {v4.8b, v5.8b, v6.8b}, [x0], x1
+ subs w5, w5, #1 // height
+ b.ne 1b
+ ret
+endfunc
+
+function ff_hevc_put_hevc_epel_bi_h32_8_neon, export=1
+ load_epel_filterb x6, x7
+ sub x2, x2, #1
+ sub w3, w3, #32
+ mov x10, #(MAX_PB_SIZE * 2)
+1: ld4 {v24.8b, v25.8b, v26.8b, v27.8b}, [x2], #32
+ ld1 {v20.s}[0], [x2], x3
+ ushr v28.2d, v24.2d, #8
+ ushr v29.2d, v25.2d, #8
+ ushr v30.2d, v26.2d, #8
+ ins v28.b[7], v20.b[0]
+ ins v29.b[7], v20.b[1]
+ ins v30.b[7], v20.b[2]
+ movi v16.8h, #0
+ movi v17.8h, #0
+ movi v18.8h, #0
+ movi v19.8h, #0
+ calc_epelb v16, v24, v25, v26, v27
+ calc_epelb v17, v25, v26, v27, v28
+ calc_epelb v18, v26, v27, v28, v29
+ calc_epelb v19, v27, v28, v29, v30
+ ld4 {v24.8h, v25.8h, v26.8h, v27.8h}, [x4], x10
+ sqadd v16.8h, v16.8h, v24.8h
+ sqadd v17.8h, v17.8h, v25.8h
+ sqadd v18.8h, v18.8h, v26.8h
+ sqadd v19.8h, v19.8h, v27.8h
+ sqrshrun v4.8b, v16.8h, #7
+ sqrshrun v5.8b, v17.8h, #7
+ sqrshrun v6.8b, v18.8h, #7
+ sqrshrun v7.8b, v19.8h, #7
+ st4 {v4.8b, v5.8b, v6.8b, v7.8b}, [x0], x1
+ subs w5, w5, #1
+ b.ne 1b
+ ret
+endfunc
+
+function ff_hevc_put_hevc_epel_bi_h48_8_neon, export=1
+ load_epel_filterb x6, x7
+ sub x2, x2, #1
+ sub w3, w3, #48
+ mov x7, #24
+ mov x10, #(128 - 48)
+1: ld3 {v26.16b, v27.16b, v28.16b}, [x2], x7
+ ushr v29.2d, v26.2d, #8
+ ushr v30.2d, v27.2d, #8
+ ushr v31.2d, v28.2d, #8
+ ld1 {v24.s}[0], [x2], x7
+ ld1 {v25.s}[0], [x2], x3
+ mov v29.b[7], v24.b[0]
+ mov v30.b[7], v24.b[1]
+ mov v31.b[7], v24.b[2]
+ mov v29.b[15], v25.b[0]
+ mov v30.b[15], v25.b[1]
+ mov v31.b[15], v25.b[2]
+ movi v16.8h, #0
+ movi v17.8h, #0
+ movi v18.8h, #0
+ movi v20.8h, #0
+ movi v21.8h, #0
+ movi v22.8h, #0
+ calc_epelb v16, v26, v27, v28, v29
+ calc_epelb2 v20, v26, v27, v28, v29
+ calc_epelb v17, v27, v28, v29, v30
+ calc_epelb2 v21, v27, v28, v29, v30
+ calc_epelb v18, v28, v29, v30, v31
+ calc_epelb2 v22, v28, v29, v30, v31
+ ld3 {v24.8h, v25.8h, v26.8h}, [x4], #48
+ sqadd v16.8h, v16.8h, v24.8h
+ sqadd v17.8h, v17.8h, v25.8h
+ sqadd v18.8h, v18.8h, v26.8h
+ ld3 {v27.8h, v28.8h, v29.8h}, [x4], x10
+ sqadd v20.8h, v20.8h, v27.8h
+ sqadd v21.8h, v21.8h, v28.8h
+ sqadd v22.8h, v22.8h, v29.8h
+ sqrshrun v4.8b, v16.8h, #7
+ sqrshrun v5.8b, v17.8h, #7
+ sqrshrun v6.8b, v18.8h, #7
+ sqrshrun2 v4.16b, v20.8h, #7
+ sqrshrun2 v5.16b, v21.8h, #7
+ sqrshrun2 v6.16b, v22.8h, #7
+ st3 {v4.16b, v5.16b, v6.16b}, [x0], x1
+ subs w5, w5, #1 // height
+ b.ne 1b
+ ret
+endfunc
+
+function ff_hevc_put_hevc_epel_bi_h64_8_neon, export=1
+ load_epel_filterb x6, x7
+ sub x2, x2, #1
+ sub w3, w3, #64
+ mov x7, #32
+1: ld4 {v24.16b, v25.16b, v26.16b, v27.16b}, [x2], x7
+ ushr v28.2d, v24.2d, #8
+ ushr v29.2d, v25.2d, #8
+ ushr v30.2d, v26.2d, #8
+ ld1 {v4.s}[0], [x2], x7
+ ld1 {v5.s}[0], [x2], x3
+ ins v28.b[7], v4.b[0]
+ ins v28.b[15], v5.b[0]
+ ins v29.b[7], v4.b[1]
+ ins v29.b[15], v5.b[1]
+ ins v30.b[7], v4.b[2]
+ ins v30.b[15], v5.b[2]
+ movi v16.8h, #0
+ movi v17.8h, #0
+ movi v18.8h, #0
+ movi v19.8h, #0
+ movi v20.8h, #0
+ movi v21.8h, #0
+ movi v22.8h, #0
+ movi v23.8h, #0
+ calc_epelb v16, v24, v25, v26, v27
+ calc_epelb2 v20, v24, v25, v26, v27
+ calc_epelb v17, v25, v26, v27, v28
+ calc_epelb2 v21, v25, v26, v27, v28
+ calc_epelb v18, v26, v27, v28, v29
+ calc_epelb2 v22, v26, v27, v28, v29
+ calc_epelb v19, v27, v28, v29, v30
+ calc_epelb2 v23, v27, v28, v29, v30
+ ld4 {v24.8h, v25.8h, v26.8h, v27.8h}, [x4], #64
+ sqadd v16.8h, v16.8h, v24.8h
+ sqadd v17.8h, v17.8h, v25.8h
+ sqadd v18.8h, v18.8h, v26.8h
+ sqadd v19.8h, v19.8h, v27.8h
+ ld4 {v28.8h, v29.8h, v30.8h, v31.8h}, [x4], #64
+ sqadd v20.8h, v20.8h, v28.8h
+ sqadd v21.8h, v21.8h, v29.8h
+ sqadd v22.8h, v22.8h, v30.8h
+ sqadd v23.8h, v23.8h, v31.8h
+ sqrshrun v4.8b, v16.8h, #7
+ sqrshrun v5.8b, v17.8h, #7
+ sqrshrun v6.8b, v18.8h, #7
+ sqrshrun v7.8b, v19.8h, #7
+ sqrshrun2 v4.16b, v20.8h, #7
+ sqrshrun2 v5.16b, v21.8h, #7
+ sqrshrun2 v6.16b, v22.8h, #7
+ sqrshrun2 v7.16b, v23.8h, #7
+ st4 {v4.16b, v5.16b, v6.16b, v7.16b}, [x0], x1
+ subs w5, w5, #1
+ b.ne 1b
+ ret
+endfunc
+
+function ff_hevc_put_hevc_epel_bi_v4_8_neon, export=1
+ load_epel_filterb x7, x6
+ sub x2, x2, x3
+ mov x10, #(MAX_PB_SIZE * 2)
+ ld1 {v16.s}[0], [x2], x3
+ ld1 {v17.s}[0], [x2], x3
+ ld1 {v18.s}[0], [x2], x3
+.macro calc src0, src1, src2, src3
+ ld1 {\src3\().s}[0], [x2], x3
+ movi v4.8h, #0
+ calc_epelb v4, \src0, \src1, \src2, \src3
+ ld1 {v24.4h}, [x4], x10
+ sqadd v4.8h, v4.8h, v24.8h
+ sqrshrun v4.8b, v4.8h, #7
+ subs w5, w5, #1
+ st1 {v4.s}[0], [x0], x1
+.endm
+1: calc_all4
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_epel_bi_v6_8_neon, export=1
+ load_epel_filterb x7, x6
+ sub x2, x2, x3
+ sub x1, x1, #4
+ mov x10, #(MAX_PB_SIZE * 2)
+ ld1 {v16.8b}, [x2], x3
+ ld1 {v17.8b}, [x2], x3
+ ld1 {v18.8b}, [x2], x3
+.macro calc src0, src1, src2, src3
+ ld1 {\src3\().8b}, [x2], x3
+ movi v4.8h, #0
+ calc_epelb v4, \src0, \src1, \src2, \src3
+ ld1 {v24.8h}, [x4], x10
+ sqadd v4.8h, v4.8h, v24.8h
+ sqrshrun v4.8b, v4.8h, #7
+ st1 {v4.s}[0], [x0], #4
+ subs w5, w5, #1
+ st1 {v4.h}[2], [x0], x1
+.endm
+1: calc_all4
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_epel_bi_v8_8_neon, export=1
+ load_epel_filterb x7, x6
+ sub x2, x2, x3
+ mov x10, #(MAX_PB_SIZE * 2)
+ ld1 {v16.8b}, [x2], x3
+ ld1 {v17.8b}, [x2], x3
+ ld1 {v18.8b}, [x2], x3
+.macro calc src0, src1, src2, src3
+ ld1 {\src3\().8b}, [x2], x3
+ movi v4.8h, #0
+ calc_epelb v4, \src0, \src1, \src2, \src3
+ ld1 {v24.8h}, [x4], x10
+ sqadd v4.8h, v4.8h, v24.8h
+ sqrshrun v4.8b, v4.8h, #7
+ subs w5, w5, #1
+ st1 {v4.8b}, [x0], x1
+.endm
+1: calc_all4
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_epel_bi_v12_8_neon, export=1
+ load_epel_filterb x7, x6
+ sub x1, x1, #8
+ sub x2, x2, x3
+ mov x10, #(MAX_PB_SIZE * 2)
+ ld1 {v16.16b}, [x2], x3
+ ld1 {v17.16b}, [x2], x3
+ ld1 {v18.16b}, [x2], x3
+.macro calc src0, src1, src2, src3
+ ld1 {\src3\().16b}, [x2], x3
+ movi v4.8h, #0
+ movi v5.8h, #0
+ calc_epelb v4, \src0, \src1, \src2, \src3
+ calc_epelb2 v5, \src0, \src1, \src2, \src3
+ ld1 {v24.8h, v25.8h}, [x4], x10
+ sqadd v4.8h, v4.8h, v24.8h
+ sqadd v5.8h, v5.8h, v25.8h
+ sqrshrun v4.8b, v4.8h, #7
+ sqrshrun2 v4.16b, v5.8h, #7
+ st1 {v4.8b}, [x0], #8
+ subs w5, w5, #1
+ st1 {v4.s}[2], [x0], x1
+.endm
+1: calc_all4
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_epel_bi_v16_8_neon, export=1
+ load_epel_filterb x7, x6
+ sub x2, x2, x3
+ mov x10, #(MAX_PB_SIZE * 2)
+ ld1 {v16.16b}, [x2], x3
+ ld1 {v17.16b}, [x2], x3
+ ld1 {v18.16b}, [x2], x3
+.macro calc src0, src1, src2, src3
+ ld1 {\src3\().16b}, [x2], x3
+ movi v4.8h, #0
+ movi v5.8h, #0
+ calc_epelb v4, \src0, \src1, \src2, \src3
+ calc_epelb2 v5, \src0, \src1, \src2, \src3
+ ld1 {v24.8h, v25.8h}, [x4], x10
+ sqadd v4.8h, v4.8h, v24.8h
+ sqadd v5.8h, v5.8h, v25.8h
+ sqrshrun v4.8b, v4.8h, #7
+ sqrshrun2 v4.16b, v5.8h, #7
+ st1 {v4.16b}, [x0], x1
+ subs w5, w5, #1
+.endm
+1: calc_all4
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_epel_bi_v24_8_neon, export=1
+ load_epel_filterb x7, x6
+ sub x2, x2, x3
+ mov x10, #(MAX_PB_SIZE * 2)
+ ld1 {v16.8b, v17.8b, v18.8b}, [x2], x3
+ ld1 {v19.8b, v20.8b, v21.8b}, [x2], x3
+ ld1 {v22.8b, v23.8b, v24.8b}, [x2], x3
+.macro calc src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11
+ ld1 {\src9\().8b, \src10\().8b, \src11\().8b}, [x2], x3
+ movi v4.8h, #0
+ movi v5.8h, #0
+ movi v6.8h, #0
+ calc_epelb v4, \src0, \src3, \src6, \src9
+ calc_epelb v5, \src1, \src4, \src7, \src10
+ calc_epelb v6, \src2, \src5, \src8, \src11
+ ld1 {v28.8h, v29.8h, v30.8h}, [x4], x10
+ sqadd v4.8h, v4.8h, v28.8h
+ sqadd v5.8h, v5.8h, v29.8h
+ sqadd v6.8h, v6.8h, v30.8h
+ sqrshrun v4.8b, v4.8h, #7
+ sqrshrun v5.8b, v5.8h, #7
+ sqrshrun v6.8b, v6.8h, #7
+ subs w5, w5, #1
+ st1 {v4.8b, v5.8b, v6.8b}, [x0], x1
+.endm
+1: calc_all12
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_epel_bi_v32_8_neon, export=1
+ load_epel_filterb x7, x6
+ sub x2, x2, x3
+ mov x10, #(MAX_PB_SIZE * 2)
+ ld1 {v16.16b, v17.16b}, [x2], x3
+ ld1 {v18.16b, v19.16b}, [x2], x3
+ ld1 {v20.16b, v21.16b}, [x2], x3
+.macro calc src0, src1, src2, src3, src4, src5, src6, src7
+ ld1 {\src6\().16b, \src7\().16b}, [x2], x3
+ movi v4.8h, #0
+ movi v5.8h, #0
+ movi v6.8h, #0
+ movi v7.8h, #0
+ calc_epelb v4, \src0, \src2, \src4, \src6
+ calc_epelb2 v5, \src0, \src2, \src4, \src6
+ calc_epelb v6, \src1, \src3, \src5, \src7
+ calc_epelb2 v7, \src1, \src3, \src5, \src7
+ ld1 {v24.8h-v27.8h}, [x4], x10
+ sqadd v4.8h, v4.8h, v24.8h
+ sqadd v5.8h, v5.8h, v25.8h
+ sqadd v6.8h, v6.8h, v26.8h
+ sqadd v7.8h, v7.8h, v27.8h
+ sqrshrun v4.8b, v4.8h, #7
+ sqrshrun2 v4.16b, v5.8h, #7
+ sqrshrun v5.8b, v6.8h, #7
+ sqrshrun2 v5.16b, v7.8h, #7
+ st1 {v4.16b, v5.16b}, [x0], x1
+ subs w5, w5, #1
+.endm
+1: calc_all8
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_epel_bi_v48_8_neon, export=1
+ stp x7, x30, [sp, #-16]!
+ stp x0, x1, [sp, #-16]!
+ stp x2, x3, [sp, #-16]!
+ stp x4, x5, [sp, #-16]!
+ bl X(ff_hevc_put_hevc_epel_bi_v24_8_neon)
+ ldp x4, x5, [sp], #16
+ ldp x2, x3, [sp], #16
+ ldp x0, x1, [sp], #16
+ add x0, x0, #24
+ add x2, x2, #24
+ add x4, x4, #48
+ ldr x7, [sp]
+ bl X(ff_hevc_put_hevc_epel_bi_v24_8_neon)
+ ldp xzr, x30, [sp], #16
+ ret
+endfunc
+
+function ff_hevc_put_hevc_epel_bi_v64_8_neon, export=1
+ stp x7, x30, [sp, #-16]!
+ stp x0, x1, [sp, #-16]!
+ stp x2, x3, [sp, #-16]!
+ stp x4, x5, [sp, #-16]!
+ bl X(ff_hevc_put_hevc_epel_bi_v32_8_neon)
+ ldp x4, x5, [sp], #16
+ ldp x2, x3, [sp], #16
+ ldp x0, x1, [sp], #16
+ add x0, x0, #32
+ add x2, x2, #32
+ add x4, x4, #64
+ ldr x7, [sp]
+ bl X(ff_hevc_put_hevc_epel_bi_v32_8_neon)
+ ldp xzr, x30, [sp], #16
+ ret
+endfunc
+
+function ff_hevc_put_hevc_epel_bi_hv4_8_neon, export=1
+ add w10, w5, #3
+ lsl x10, x10, #7
+ sub sp, sp, x10 // tmp_array
+ stp x0, x1, [sp, #-16]!
+ stp x4, x5, [sp, #-16]!
+ stp x7, x30, [sp, #-16]!
+ add x0, sp, #48
+ sub x1, x2, x3
+ mov x2, x3
+ add w3, w5, #3
+ mov x4, x6
+ mov x5, x7
+ bl X(ff_hevc_put_hevc_epel_h4_8_neon)
+ ldp x7, x30, [sp], #16
+ ldp x4, x5, [sp], #16
+ ldp x0, x1, [sp], #16
+ load_epel_filterh x7, x6
+ mov x10, #(MAX_PB_SIZE * 2)
+ ld1 {v16.4h}, [sp], x10
+ ld1 {v17.4h}, [sp], x10
+ ld1 {v18.4h}, [sp], x10
+.macro calc src0, src1, src2, src3
+ ld1 {\src3\().4h}, [sp], x10
+ calc_epelh v4, \src0, \src1, \src2, \src3
+ ld1 {v6.4h}, [x4], x10
+ sqadd v4.4h, v4.4h, v6.4h
+ sqrshrun v4.8b, v4.8h, #7
+ subs w5, w5, #1
+ st1 {v4.s}[0], [x0], x1
+.endm
+1: calc_all4
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_epel_bi_hv6_8_neon, export=1
+ add w10, w5, #3
+ lsl x10, x10, #7
+ sub sp, sp, x10 // tmp_array
+ stp x0, x1, [sp, #-16]!
+ stp x4, x5, [sp, #-16]!
+ stp x7, x30, [sp, #-16]!
+ add x0, sp, #48
+ sub x1, x2, x3
+ mov x2, x3
+ add w3, w5, #3
+ mov x4, x6
+ mov x5, x7
+ bl X(ff_hevc_put_hevc_epel_h6_8_neon)
+ ldp x7, x30, [sp], #16
+ ldp x4, x5, [sp], #16
+ ldp x0, x1, [sp], #16
+ load_epel_filterh x7, x6
+ sub x1, x1, #4
+ mov x10, #(MAX_PB_SIZE * 2)
+ ld1 {v16.8h}, [sp], x10
+ ld1 {v17.8h}, [sp], x10
+ ld1 {v18.8h}, [sp], x10
+.macro calc src0, src1, src2, src3
+ ld1 {\src3\().8h}, [sp], x10
+ calc_epelh v4, \src0, \src1, \src2, \src3
+ calc_epelh2 v4, v5, \src0, \src1, \src2, \src3
+ ld1 {v6.8h}, [x4], x10
+ sqadd v4.8h, v4.8h, v6.8h
+ sqrshrun v4.8b, v4.8h, #7
+ st1 {v4.s}[0], [x0], #4
+ subs w5, w5, #1
+ st1 {v4.h}[2], [x0], x1
+.endm
+1: calc_all4
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_epel_bi_hv8_8_neon, export=1
+ add w10, w5, #3
+ lsl x10, x10, #7
+ sub sp, sp, x10 // tmp_array
+ stp x0, x1, [sp, #-16]!
+ stp x4, x5, [sp, #-16]!
+ stp x7, x30, [sp, #-16]!
+ add x0, sp, #48
+ sub x1, x2, x3
+ mov x2, x3
+ add w3, w5, #3
+ mov x4, x6
+ mov x5, x7
+ bl X(ff_hevc_put_hevc_epel_h8_8_neon)
+ ldp x7, x30, [sp], #16
+ ldp x4, x5, [sp], #16
+ ldp x0, x1, [sp], #16
+ load_epel_filterh x7, x6
+ mov x10, #(MAX_PB_SIZE * 2)
+ ld1 {v16.8h}, [sp], x10
+ ld1 {v17.8h}, [sp], x10
+ ld1 {v18.8h}, [sp], x10
+.macro calc src0, src1, src2, src3
+ ld1 {\src3\().8h}, [sp], x10
+ calc_epelh v4, \src0, \src1, \src2, \src3
+ calc_epelh2 v4, v5, \src0, \src1, \src2, \src3
+ ld1 {v6.8h}, [x4], x10
+ sqadd v4.8h, v4.8h, v6.8h
+ sqrshrun v4.8b, v4.8h, #7
+ subs w5, w5, #1
+ st1 {v4.8b}, [x0], x1
+.endm
+1: calc_all4
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_epel_bi_hv12_8_neon, export=1
+ add w10, w5, #3
+ lsl x10, x10, #7
+ sub sp, sp, x10 // tmp_array
+ stp x0, x1, [sp, #-16]!
+ stp x4, x5, [sp, #-16]!
+ stp x7, x30, [sp, #-16]!
+ add x0, sp, #48
+ sub x1, x2, x3
+ mov x2, x3
+ add w3, w5, #3
+ mov x4, x6
+ mov x5, x7
+ bl X(ff_hevc_put_hevc_epel_h12_8_neon)
+ ldp x7, x30, [sp], #16
+ ldp x4, x5, [sp], #16
+ ldp x0, x1, [sp], #16
+ load_epel_filterh x7, x6
+ sub x1, x1, #8
+ mov x10, #(MAX_PB_SIZE * 2)
+ ld1 {v16.8h, v17.8h}, [sp], x10
+ ld1 {v18.8h, v19.8h}, [sp], x10
+ ld1 {v20.8h, v21.8h}, [sp], x10
+.macro calc src0, src1, src2, src3, src4, src5, src6, src7
+ ld1 {\src6\().8h, \src7\().8h}, [sp], x10
+ calc_epelh v4, \src0, \src2, \src4, \src6
+ calc_epelh2 v4, v5, \src0, \src2, \src4, \src6
+ calc_epelh v5, \src1, \src3, \src5, \src7
+ ld1 {v6.8h, v7.8h}, [x4], x10
+ sqadd v4.8h, v4.8h, v6.8h
+ sqadd v5.8h, v5.8h, v7.8h
+ sqrshrun v4.8b, v4.8h, #7
+ sqrshrun2 v4.16b, v5.8h, #7
+ st1 {v4.8b}, [x0], #8
+ subs w5, w5, #1
+ st1 {v4.s}[2], [x0], x1
+.endm
+1: calc_all8
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_epel_bi_hv16_8_neon, export=1
+ add w10, w5, #3
+ lsl x10, x10, #7
+ sub sp, sp, x10 // tmp_array
+ stp x0, x1, [sp, #-16]!
+ stp x4, x5, [sp, #-16]!
+ stp x7, x30, [sp, #-16]!
+ add x0, sp, #48
+ sub x1, x2, x3
+ mov x2, x3
+ add w3, w5, #3
+ mov x4, x6
+ mov x5, x7
+ bl X(ff_hevc_put_hevc_epel_h16_8_neon)
+ ldp x7, x30, [sp], #16
+ ldp x4, x5, [sp], #16
+ ldp x0, x1, [sp], #16
+ load_epel_filterh x7, x6
+ mov x10, #(MAX_PB_SIZE * 2)
+ ld1 {v16.8h, v17.8h}, [sp], x10
+ ld1 {v18.8h, v19.8h}, [sp], x10
+ ld1 {v20.8h, v21.8h}, [sp], x10
+.macro calc src0, src1, src2, src3, src4, src5, src6, src7
+ ld1 {\src6\().8h, \src7\().8h}, [sp], x10
+ calc_epelh v4, \src0, \src2, \src4, \src6
+ calc_epelh2 v4, v5, \src0, \src2, \src4, \src6
+ calc_epelh v5, \src1, \src3, \src5, \src7
+ calc_epelh2 v5, v6, \src1, \src3, \src5, \src7
+ ld1 {v6.8h, v7.8h}, [x4], x10
+ sqadd v4.8h, v4.8h, v6.8h
+ sqadd v5.8h, v5.8h, v7.8h
+ sqrshrun v4.8b, v4.8h, #7
+ sqrshrun2 v4.16b, v5.8h, #7
+ st1 {v4.16b}, [x0], x1
+ subs w5, w5, #1
+.endm
+1: calc_all8
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_epel_bi_hv24_8_neon, export=1
+ add w10, w5, #3
+ lsl x10, x10, #7
+ sub sp, sp, x10 // tmp_array
+ stp x0, x1, [sp, #-16]!
+ stp x4, x5, [sp, #-16]!
+ stp x7, x30, [sp, #-16]!
+ add x0, sp, #48
+ sub x1, x2, x3
+ mov x2, x3
+ add w3, w5, #3
+ mov x4, x6
+ mov x5, x7
+ bl X(ff_hevc_put_hevc_epel_h24_8_neon)
+ ldp x7, x30, [sp], #16
+ ldp x4, x5, [sp], #16
+ ldp x0, x1, [sp], #16
+ load_epel_filterh x7, x6
+ mov x10, #(MAX_PB_SIZE * 2)
+ ld1 {v16.8h, v17.8h, v18.8h}, [sp], x10
+ ld1 {v19.8h, v20.8h, v21.8h}, [sp], x10
+ ld1 {v22.8h, v23.8h, v24.8h}, [sp], x10
+.macro calc src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11
+ ld1 {\src9\().8h, \src10\().8h, \src11\().8h}, [sp], x10
+ calc_epelh v1, \src0, \src3, \src6, \src9
+ calc_epelh2 v1, v2, \src0, \src3, \src6, \src9
+ calc_epelh v2, \src1, \src4, \src7, \src10
+ calc_epelh2 v2, v3, \src1, \src4, \src7, \src10
+ calc_epelh v3, \src2, \src5, \src8, \src11
+ calc_epelh2 v3, v4, \src2, \src5, \src8, \src11
+ ld1 {v4.8h, v5.8h, v6.8h}, [x4], x10
+ sqadd v1.8h, v1.8h, v4.8h
+ sqadd v2.8h, v2.8h, v5.8h
+ sqadd v3.8h, v3.8h, v6.8h
+ sqrshrun v1.8b, v1.8h, #7
+ sqrshrun v2.8b, v2.8h, #7
+ sqrshrun v3.8b, v3.8h, #7
+ subs w5, w5, #1
+ st1 {v1.8b, v2.8b, v3.8b}, [x0], x1
+.endm
+1: calc_all12
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_epel_bi_hv32_8_neon, export=1
+ sub sp, sp, #16
+ st1 {v8.16b}, [sp]
+ add w10, w5, #3
+ lsl x10, x10, #7
+ sub sp, sp, x10 // tmp_array
+ stp x0, x1, [sp, #-16]!
+ stp x4, x5, [sp, #-16]!
+ stp x7, x30, [sp, #-16]!
+ add x0, sp, #48
+ sub x1, x2, x3
+ mov x2, x3
+ add w3, w5, #3
+ mov x4, x6
+ mov x5, x7
+ bl X(ff_hevc_put_hevc_epel_h32_8_neon)
+ ldp x7, x30, [sp], #16
+ ldp x4, x5, [sp], #16
+ ldp x0, x1, [sp], #16
+ load_epel_filterh x7, x6
+ mov x10, #(MAX_PB_SIZE * 2)
+ ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [sp], x10
+ ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [sp], x10
+ ld1 {v24.8h, v25.8h, v26.8h, v27.8h}, [sp], x10
+.macro calc src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11, src12, src13, src14, src15
+ ld1 {\src12\().8h, \src13\().8h, \src14\().8h, \src15\().8h}, [sp], x10
+ calc_epelh v1, \src0, \src4, \src8, \src12
+ calc_epelh2 v1, v2, \src0, \src4, \src8, \src12
+ calc_epelh v2, \src1, \src5, \src9, \src13
+ calc_epelh2 v2, v3, \src1, \src5, \src9, \src13
+ calc_epelh v3, \src2, \src6, \src10, \src14
+ calc_epelh2 v3, v4, \src2, \src6, \src10, \src14
+ calc_epelh v4, \src3, \src7, \src11, \src15
+ calc_epelh2 v4, v5, \src3, \src7, \src11, \src15
+ ld1 {v5.8h, v6.8h, v7.8h, v8.8h}, [x4], x10
+ sqadd v1.8h, v1.8h, v5.8h
+ sqadd v2.8h, v2.8h, v6.8h
+ sqadd v3.8h, v3.8h, v7.8h
+ sqadd v4.8h, v4.8h, v8.8h
+ sqrshrun v1.8b, v1.8h, #7
+ sqrshrun v2.8b, v2.8h, #7
+ sqrshrun v3.8b, v3.8h, #7
+ sqrshrun v4.8b, v4.8h, #7
+ st1 {v1.8b, v2.8b, v3.8b, v4.8b}, [x0], x1
+ subs w5, w5, #1
+.endm
+1: calc_all16
+.purgem calc
+2: ld1 {v8.16b}, [sp], #16
+ ret
+endfunc
+
+function ff_hevc_put_hevc_epel_bi_hv48_8_neon, export=1
+ stp xzr, x30, [sp, #-16]!
+ stp x0, x1, [sp, #-16]!
+ stp x2, x3, [sp, #-16]!
+ stp x4, x5, [sp, #-16]!
+ stp x6, x7, [sp, #-16]!
+ bl X(ff_hevc_put_hevc_epel_bi_hv24_8_neon)
+ ldp x6, x7, [sp], #16
+ ldp x4, x5, [sp], #16
+ ldp x2, x3, [sp], #16
+ ldp x0, x1, [sp], #16
+ add x0, x0, #24
+ add x2, x2, #24
+ add x4, x4, #48
+ bl X(ff_hevc_put_hevc_epel_bi_hv24_8_neon)
+ ldp xzr, x30, [sp], #16
+ ret
+endfunc
+
+function ff_hevc_put_hevc_epel_bi_hv64_8_neon, export=1
+ stp xzr, x30, [sp, #-16]!
+ stp x0, x1, [sp, #-16]!
+ stp x2, x3, [sp, #-16]!
+ stp x4, x5, [sp, #-16]!
+ stp x6, x7, [sp, #-16]!
+ bl X(ff_hevc_put_hevc_epel_bi_hv32_8_neon)
+ ldp x6, x7, [sp], #16
+ ldp x4, x5, [sp], #16
+ ldp x2, x3, [sp], #16
+ ldp x0, x1, [sp], #16
+ add x0, x0, #32
+ add x2, x2, #32
+ add x4, x4, #64
+ bl X(ff_hevc_put_hevc_epel_bi_hv32_8_neon)
+ ldp xzr, x30, [sp], #16
+ ret
+endfunc
diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c
index 3e5d85247e..a4c078683b 100644
--- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -69,6 +69,46 @@ void ff_hevc_sao_band_filter_8x8_8_neon(uint8_t *_dst, uint8_t *_src,
void ff_hevc_put_hevc_##fn##48_8_neon args; \
void ff_hevc_put_hevc_##fn##64_8_neon args; \
+NEON8_FNPROTO(pel_pixels, (int16_t *dst,
+ uint8_t *src, ptrdiff_t srcstride,
+ int height, intptr_t mx, intptr_t my, int width));
+
+NEON8_FNPROTO(pel_bi_pixels, (uint8_t *dst, ptrdiff_t dststride,
+ uint8_t *src, ptrdiff_t srcstride, int16_t *src2,
+ int height, intptr_t mx, intptr_t my, int width));
+
+NEON8_FNPROTO(epel_h, (int16_t *dst,
+ uint8_t *src, ptrdiff_t srcstride,
+ int height, intptr_t mx, intptr_t my, int width));
+
+NEON8_FNPROTO(epel_v, (int16_t *dst,
+ uint8_t *src, ptrdiff_t srcstride,
+ int height, intptr_t mx, intptr_t my, int width));
+
+NEON8_FNPROTO(epel_hv, (int16_t *dst,
+ uint8_t *src, ptrdiff_t srcstride,
+ int height, intptr_t mx, intptr_t my, int width));
+
+NEON8_FNPROTO(epel_uni_v, (uint8_t *dst, ptrdiff_t dststride,
+ uint8_t *src, ptrdiff_t srcstride,
+ int height, intptr_t mx, intptr_t my, int width));
+
+NEON8_FNPROTO(epel_uni_hv, (uint8_t *dst, ptrdiff_t _dststride,
+ uint8_t *src, ptrdiff_t srcstride,
+ int height, intptr_t mx, intptr_t my, int width));
+
+NEON8_FNPROTO(epel_bi_h, (uint8_t *dst, ptrdiff_t dststride,
+ uint8_t *src, ptrdiff_t srcstride, int16_t *src2,
+ int height, intptr_t mx, intptr_t my, int width));
+
+NEON8_FNPROTO(epel_bi_v, (uint8_t *dst, ptrdiff_t dststride,
+ uint8_t *src, ptrdiff_t srcstride, int16_t *src2,
+ int height, intptr_t mx, intptr_t my, int width));
+
+NEON8_FNPROTO(epel_bi_hv, (uint8_t *dst, ptrdiff_t dststride,
+ uint8_t *src, ptrdiff_t srcstride, int16_t *src2,
+ int height, intptr_t mx, intptr_t my, int width));
+
NEON8_FNPROTO(qpel_h, (int16_t *dst,
uint8_t *src, ptrdiff_t srcstride,
int height, intptr_t mx, intptr_t my, int width));
@@ -137,12 +177,24 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
// of non-multiple of 8 seem to arise.
// c->sao_band_filter[0] = ff_hevc_sao_band_filter_8x8_8_neon;
+ NEON8_FNASSIGN(c->put_hevc_epel, 0, 0, pel_pixels);
+ NEON8_FNASSIGN(c->put_hevc_epel, 0, 1, epel_h);
+ NEON8_FNASSIGN(c->put_hevc_epel, 1, 0, epel_v);
+ NEON8_FNASSIGN(c->put_hevc_epel, 1, 1, epel_hv);
+ NEON8_FNASSIGN(c->put_hevc_epel_uni, 1, 0, epel_uni_v);
+ NEON8_FNASSIGN(c->put_hevc_epel_uni, 1, 1, epel_uni_hv);
+ NEON8_FNASSIGN(c->put_hevc_epel_bi, 0, 0, pel_bi_pixels);
+ NEON8_FNASSIGN(c->put_hevc_epel_bi, 0, 1, epel_bi_h);
+ NEON8_FNASSIGN(c->put_hevc_epel_bi, 1, 0, epel_bi_v);
+ NEON8_FNASSIGN(c->put_hevc_epel_bi, 1, 1, epel_bi_hv);
+ NEON8_FNASSIGN(c->put_hevc_qpel, 0, 0, pel_pixels);
NEON8_FNASSIGN(c->put_hevc_qpel, 0, 1, qpel_h);
NEON8_FNASSIGN(c->put_hevc_qpel, 1, 0, qpel_v);
NEON8_FNASSIGN(c->put_hevc_qpel, 1, 1, qpel_hv);
NEON8_FNASSIGN(c->put_hevc_qpel_uni, 0, 1, qpel_uni_h);
NEON8_FNASSIGN(c->put_hevc_qpel_uni, 1, 0, qpel_uni_v);
NEON8_FNASSIGN(c->put_hevc_qpel_uni, 1, 1, qpel_uni_hv);
+ NEON8_FNASSIGN(c->put_hevc_qpel_bi, 0, 0, pel_bi_pixels);
NEON8_FNASSIGN(c->put_hevc_qpel_bi, 0, 1, qpel_bi_h);
NEON8_FNASSIGN(c->put_hevc_qpel_bi, 1, 0, qpel_bi_v);
NEON8_FNASSIGN(c->put_hevc_qpel_bi, 1, 1, qpel_bi_hv);
--
2.32.0 (Apple Git-132)
* Re: [FFmpeg-devel] [PATCH v2 1/2] lavc/aarch64: add hevc qpel assembly
2022-02-03 13:51 ` [FFmpeg-devel] [PATCH v2 " J. Dekker
2022-02-03 13:51 ` [FFmpeg-devel] [PATCH v2 2/2] lavc/aarch64: add hevc epel assembly J. Dekker
@ 2022-02-07 22:11 ` Martin Storsjö
1 sibling, 0 replies; 6+ messages in thread
From: Martin Storsjö @ 2022-02-07 22:11 UTC (permalink / raw)
To: FFmpeg development discussions and patches
On Thu, 3 Feb 2022, J. Dekker wrote:
> Thanks: Rafal Dabrowa <fatwildcat@gmail.com>
> ---
> libavcodec/aarch64/Makefile | 1 +
> libavcodec/aarch64/hevcdsp_init_aarch64.c | 67 +
> libavcodec/aarch64/hevcdsp_qpel_neon.S | 2799 +++++++++++++++++++++
> 3 files changed, 2867 insertions(+)
> create mode 100644 libavcodec/aarch64/hevcdsp_qpel_neon.S
>
> Had trouble testing on a Linux machine as well, but have a workflow
> setup for that now so should be easier in the future. Passes FATE on
> both macOS and Linux.
> +NEON8_FNPROTO(qpel_h, (int16_t *dst,
> + uint8_t *src, ptrdiff_t srcstride,
> + int height, intptr_t mx, intptr_t my, int width));
Passing a whole parenthesized expression like this, via one macro
parameter, feels quite unorthodox to me, but it does seem to work now with
all compilers I have to test with, so I guess it's tolerable that way.
> +
> +#include "libavutil/aarch64/asm.S"
> +#define MAX_PB_SIZE 64
> +
> +.Lqpel_filters:
> + .byte 0, 0, 0, 0, 0, 0, 0, 0
This assembles incorrectly with gas-preprocessor targeting MSVC armasm64.
Normally we enclose all such constants in const/endconst, which sets up
the appropriate section and all that. But if put into the const data
section, it's probably too far away for an 'adr' instruction, so then
you'd need to use the movrel macro (expanding to 'adrp' + 'add').
A less elegant workaround for armasm/gas-preprocessor is to just add a
'.text' above this.
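For reference, that could look roughly like this (untested sketch, using
the const/endconst and movrel helpers from libavutil/aarch64/asm.S; the
label then stops being a local .L one):

const qpel_filters, align=3
        .byte  0, 0,  0,  0,  0,  0, 0, 0
        .byte -1, 4,-10, 58, 17, -5, 1, 0
        .byte -1, 4,-11, 40, 40,-11, 4, -1
        .byte  0, 1, -5, 17, 58,-10, 4, -1
endconst

and in the load macros:

        movrel  \xreg, qpel_filters
        add     \xreg, \xreg, \freg, lsl #3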
> + .byte -1, 4,-10, 58, 17, -5, 1, 0
> + .byte -1, 4,-11, 40, 40,-11, 4, -1
> + .byte 0, 1, -5, 17, 58,-10, 4, -1
> +
> +.macro load_qpel_filterb freg, xreg
> + adr \xreg, .Lqpel_filters
> + add \xreg, \xreg, \freg, lsl #3
> + ld4r {v0.16b, v1.16b, v2.16b, v3.16b}, [\xreg], #4
> + ld4r {v4.16b, v5.16b, v6.16b, v7.16b}, [\xreg]
Please follow the normal coding style (align the starting '{' just like
other characters at the start of the operand column, don't leave it
outside). This goes for the whole file.
> + neg v0.16b, v0.16b
> + neg v2.16b, v2.16b
> + neg v5.16b, v5.16b
> + neg v7.16b, v7.16b
Why these negations? Can't you just change the corresponding umlsl/umlal
instructions matchingly?
Also, can't those umlsl/umlal use the elementwise form, e.g. v0.b[0], so
you wouldn't need to waste 8 full registers on the coefficients? (If
you've got enough registers so you don't need to clobber v8-v15, there's
probably no benefit in squeezing things tighter though. But if there's
code that could be made more efficient if you'd have more spare registers,
that could help.)
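To illustrate the sign point: if the table held the magnitudes directly
(hypothetical .Lqpel_filters_abs below; the -,+,-,+,+,-,+,- sign pattern
is already encoded in the umlsl/umlal sequence of calc_qpelb), the four
negs could simply be dropped - untested:

.Lqpel_filters_abs:
        .byte  0, 0,  0,  0,  0,  0, 0, 0
        .byte  1, 4, 10, 58, 17,  5, 1, 0
        .byte  1, 4, 11, 40, 40, 11, 4, 1
        .byte  0, 1,  5, 17, 58, 10, 4, 1

.macro load_qpel_filterb freg, xreg
        adr     \xreg, .Lqpel_filters_abs
        add     \xreg, \xreg, \freg, lsl #3
        ld4r    {v0.16b, v1.16b, v2.16b, v3.16b}, [\xreg], #4
        ld4r    {v4.16b, v5.16b, v6.16b, v7.16b}, [\xreg]
.endm

with calc_qpelb/calc_qpelb2 kept as they are.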
> +.endm
> +
> +.macro calc_qpelb dst, src0, src1, src2, src3, src4, src5, src6, src7
> + umlsl \dst\().8h, \src0\().8b, v0.8b
Could this first one be plain 'umull' (if you wouldn't negate the
coefficient), avoiding an extra 'movi v28.8h, #0'?
> + umlal \dst\().8h, \src1\().8b, v1.8b
> + umlsl \dst\().8h, \src2\().8b, v2.8b
> + umlal \dst\().8h, \src3\().8b, v3.8b
> + umlal \dst\().8h, \src4\().8b, v4.8b
> + umlsl \dst\().8h, \src5\().8b, v5.8b
> + umlal \dst\().8h, \src6\().8b, v6.8b
> + umlsl \dst\().8h, \src7\().8b, v7.8b
> +.endm
> +
> +.macro calc_qpelb2 dst, src0, src1, src2, src3, src4, src5, src6, src7
> + umlsl2 \dst\().8h, \src0\().16b, v0.16b
> + umlal2 \dst\().8h, \src1\().16b, v1.16b
> + umlsl2 \dst\().8h, \src2\().16b, v2.16b
> + umlal2 \dst\().8h, \src3\().16b, v3.16b
> + umlal2 \dst\().8h, \src4\().16b, v4.16b
> + umlsl2 \dst\().8h, \src5\().16b, v5.16b
> + umlal2 \dst\().8h, \src6\().16b, v6.16b
> + umlsl2 \dst\().8h, \src7\().16b, v7.16b
> +.endm
> +
> +.macro load_qpel_filterh freg, xreg
> + adr \xreg, .Lqpel_filters
> + add \xreg, \xreg, \freg, lsl #3
> + ld1 {v0.8b}, [\xreg]
> + sxtl v0.8h, v0.8b
> +.endm
> +
> +.macro calc_qpelh dst, src0, src1, src2, src3, src4, src5, src6, src7, op, shift=6
> + smull \dst\().4s, \src0\().4h, v0.h[0]
> + smlal \dst\().4s, \src1\().4h, v0.h[1]
> + smlal \dst\().4s, \src2\().4h, v0.h[2]
> + smlal \dst\().4s, \src3\().4h, v0.h[3]
> + smlal \dst\().4s, \src4\().4h, v0.h[4]
> + smlal \dst\().4s, \src5\().4h, v0.h[5]
> + smlal \dst\().4s, \src6\().4h, v0.h[6]
> + smlal \dst\().4s, \src7\().4h, v0.h[7]
> +.ifc \op, sshr
> + sshr \dst\().4s, \dst\().4s, \shift
> +.else
> + \op \dst\().4h, \dst\().4s, \shift
> +.endif
> +.endm
> +
> +.macro calc_qpelh2 dst, dstt, src0, src1, src2, src3, src4, src5, src6, src7, op, shift=6
> + smull2 \dstt\().4s, \src0\().8h, v0.h[0]
> + smlal2 \dstt\().4s, \src1\().8h, v0.h[1]
> + smlal2 \dstt\().4s, \src2\().8h, v0.h[2]
> + smlal2 \dstt\().4s, \src3\().8h, v0.h[3]
> + smlal2 \dstt\().4s, \src4\().8h, v0.h[4]
> + smlal2 \dstt\().4s, \src5\().8h, v0.h[5]
> + smlal2 \dstt\().4s, \src6\().8h, v0.h[6]
> + smlal2 \dstt\().4s, \src7\().8h, v0.h[7]
> +.ifc \op, sshr
> + sshr \dst\().4s, \dstt\().4s, \shift
> +.else
> + \op \dst\().8h, \dstt\().4s, \shift
> +.endif
> +.endm
> +
> +function ff_hevc_put_hevc_qpel_h4_8_neon, export=1
> + load_qpel_filterb x4, x5
> + sub x1, x1, #3
> + sub x2, x2, #8
> + mov x14, #(MAX_PB_SIZE * 2)
> +1: ld1 {v16.8b}, [x1], #8
> + ld1 {v17.s}[0], [x1], x2
> +.macro calc src0, src1, idx
> + ushr \src0\().2d, \src1\().2d, #8
> + mov \src0\().b[7], v17.b[\idx]
> +.endm
> + calc v18, v16, 0
> + calc v19, v18, 1
> + calc v20, v19, 2
This operation looks weird. Isn't this equivalent to "ext v18.8b, v16.8b,
v17.8b, #1; ext v19.8b, v16.8b, v17.8b, #2" etc?
> + ushr v21.2d, v20.2d, #8
> + ushr v22.2d, v21.2d, #8
> + ushr v23.2d, v22.2d, #8
> + ushr v24.2d, v23.2d, #8
... and here, more 'ext'. Also, this whole sequence of 10 instructions
above is completely serial, where every single instruction depends on the
result of the previous one. That's pretty bad for pipelining.
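I.e. something along these lines (untested), where every ext depends only
on the two loads, so they can all issue independently (the upper lanes of
the later ext results pick up unloaded bytes of v17, but those lanes
aren't stored anyway):

        ext     v18.8b, v16.8b, v17.8b, #1
        ext     v19.8b, v16.8b, v17.8b, #2
        ext     v20.8b, v16.8b, v17.8b, #3
        ext     v21.8b, v16.8b, v17.8b, #4
        ext     v22.8b, v16.8b, v17.8b, #5
        ext     v23.8b, v16.8b, v17.8b, #6
        ext     v24.8b, v16.8b, v17.8b, #7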
> + movi v28.8h, #0
This instruction could be avoided if the first instruction in the macro
were a 'umull'.
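(Alternatively, keeping the current signs, the accumulation could start
from one of the always-positive taps instead of tap 0, so the macro's
first instruction writes rather than accumulates - an untested variant:)

.macro calc_qpelb dst, src0, src1, src2, src3, src4, src5, src6, src7
        umull   \dst\().8h, \src3\().8b, v3.8b  // writes \dst, no movi needed
        umlsl   \dst\().8h, \src0\().8b, v0.8b
        umlal   \dst\().8h, \src1\().8b, v1.8b
        umlsl   \dst\().8h, \src2\().8b, v2.8b
        umlal   \dst\().8h, \src4\().8b, v4.8b
        umlsl   \dst\().8h, \src5\().8b, v5.8b
        umlal   \dst\().8h, \src6\().8b, v6.8b
        umlsl   \dst\().8h, \src7\().8b, v7.8b
.endm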
> + calc_qpelb v28, v16, v18, v19, v20, v21, v22, v23, v24
> + subs w3, w3, #1
> + st1 {v28.4h}, [x0], x14
> + b.ne 1b
> + ret
> +endfunc
> +
> +function ff_hevc_put_hevc_qpel_h6_8_neon, export=1
> + load_qpel_filterb x4, x5
> + sub x1, x1, #3
> + mov x14, #(MAX_PB_SIZE * 2 - 8)
> +1: ld1 {v16.8b, v17.8b}, [x1], x2
> + // same macro
> + calc v18, v16, 0
> + calc v19, v18, 1
> + calc v20, v19, 2
> + calc v21, v20, 3
> + calc v22, v21, 4
> + ushr v23.2d, v22.2d, #8
> + ushr v24.2d, v23.2d, #8
> + movi v28.8h, #0
> + calc_qpelb v28, v16, v18, v19, v20, v21, v22, v23, v24
> + st1 {v28.4h}, [x0], #8
> + subs w3, w3, #1
> + st1 {v28.s}[2], [x0], x14
> + b.ne 1b
> + ret
> +endfunc
> +
> +function ff_hevc_put_hevc_qpel_h8_8_neon, export=1
> + sxtw x4, w4
> + sxtw x7, w7
Indentation is off
> + load_qpel_filterb x4, x5
> + sub x1, x1, #3
> + mov x14, #(MAX_PB_SIZE * 2)
> +1: ld1 {v16.8b, v17.8b}, [x1], x2
> + // same macro
> + calc v18, v16, 0
> + calc v19, v18, 1
> + calc v20, v19, 2
> + calc v21, v20, 3
> + calc v22, v21, 4
> + calc v23, v22, 5
> + calc v24, v23, 6
> +.purgem calc
> + movi v28.8h, #0
> + calc_qpelb v28, v16, v18, v19, v20, v21, v22, v23, v24
> + subs w3, w3, #1
> + st1 {v28.8h}, [x0], x14
> + b.ne 1b
> + ret
> +endfunc
> +
> +function ff_hevc_put_hevc_qpel_h12_8_neon, export=1
> + load_qpel_filterb x4, x5
> + sub x1, x1, #3
> + sub x2, x2, #16
> + mov x14, #(MAX_PB_SIZE * 2 - 16)
> +1: ld2 {v16.8b, v17.8b}, [x1], #16
> + ld1 {v27.s}[0], [x1], x2
> + ushr v18.2d, v16.2d, #8
> + ushr v19.2d, v17.2d, #8
> + mov v18.b[7], v27.b[0]
> + mov v19.b[7], v27.b[1]
Please look into using 'ext' here too
> + ushr v20.2d, v18.2d, #8
> + ushr v21.2d, v19.2d, #8
> + mov v20.b[7], v27.b[2]
> + mov v21.b[7], v27.b[3]
> + ushr v22.2d, v20.2d, #8
> + ushr v23.2d, v21.2d, #8
> + ushr v24.2d, v22.2d, #8
> + movi v28.8h, #0
> + movi v29.8h, #0
> + calc_qpelb v28, v16, v17, v18, v19, v20, v21, v22, v23
> + calc_qpelb v29, v17, v18, v19, v20, v21, v22, v23, v24
> + zip1 v16.8h, v28.8h, v29.8h
> + zip2 v17.8h, v28.8h, v29.8h
I'm not sure why this function deinterleaves things and reinterleaves
them afterwards, but maybe it's necessary for what this does.
> + st1 {v16.8h}, [x0], #16
> + subs w3, w3, #1
> + st1 {v17.4h}, [x0], x14
> + b.ne 1b
> + ret
> +endfunc
> +
> +function ff_hevc_put_hevc_qpel_h16_8_neon, export=1
> + load_qpel_filterb x4, x5
> + sub x1, x1, #3
> + sub x2, x2, #16
> + mov x14, #(MAX_PB_SIZE * 2)
> +1: ld2 {v16.8b, v17.8b}, [x1], #16
> + ld1 {v27.8b}, [x1], x2
> + ushr v18.2d, v16.2d, #8
> + ushr v19.2d, v17.2d, #8
> + mov v18.b[7], v27.b[0]
> + mov v19.b[7], v27.b[1]
> + ushr v20.2d, v18.2d, #8
> + ushr v21.2d, v19.2d, #8
> + mov v20.b[7], v27.b[2]
> + mov v21.b[7], v27.b[3]
> + ushr v22.2d, v20.2d, #8
> + ushr v23.2d, v21.2d, #8
> + mov v22.b[7], v27.b[4]
> + mov v23.b[7], v27.b[5]
> + ushr v24.2d, v22.2d, #8
> + mov v24.b[7], v27.b[6]
Same thing about 'ext' for shifting (and in all other functions below)
> + movi v28.8h, #0
> + movi v29.8h, #0
> + calc_qpelb v28, v16, v17, v18, v19, v20, v21, v22, v23
> + calc_qpelb v29, v17, v18, v19, v20, v21, v22, v23, v24
> + subs w3, w3, #1
> + st2 {v28.8h, v29.8h}, [x0], x14
> + b.ne 1b
> + ret
> +endfunc
> +
> +function ff_hevc_put_hevc_qpel_h24_8_neon, export=1
> + load_qpel_filterb x4, x5
> + sub x1, x1, #3
> + sub x2, x2, #24
> + mov x14, #(MAX_PB_SIZE * 2)
> +1: ld3 {v16.8b, v17.8b, v18.8b}, [x1], #24
> + ld1 {v27.8b}, [x1], x2
> + ushr v19.2d, v16.2d, #8
> + ushr v20.2d, v17.2d, #8
> + ushr v21.2d, v18.2d, #8
> + mov v19.b[7], v27.b[0]
> + mov v20.b[7], v27.b[1]
> + mov v21.b[7], v27.b[2]
> + ushr v22.2d, v19.2d, #8
> + ushr v23.2d, v20.2d, #8
> + ushr v24.2d, v21.2d, #8
> + mov v22.b[7], v27.b[3]
> + mov v23.b[7], v27.b[4]
> + mov v24.b[7], v27.b[5]
> + ushr v25.2d, v22.2d, #8
> + mov v25.b[7], v27.b[6]
> + movi v28.8h, #0
> + movi v29.8h, #0
> + movi v30.8h, #0
> + calc_qpelb v28, v16, v17, v18, v19, v20, v21, v22, v23
> + calc_qpelb v29, v17, v18, v19, v20, v21, v22, v23, v24
> + calc_qpelb v30, v18, v19, v20, v21, v22, v23, v24, v25
> + subs w3, w3, #1
> + st3 {v28.8h, v29.8h, v30.8h}, [x0], x14
> + b.ne 1b
> + ret
> +endfunc
> +
> +function ff_hevc_put_hevc_qpel_h32_8_neon, export=1
> + load_qpel_filterb x4, x5
> + sub x1, x1, #3
> + sub x2, x2, #32
> + mov x14, #(MAX_PB_SIZE * 2)
> +1: ld4 {v16.8b, v17.8b, v18.8b, v19.8b}, [x1], #32
> + movi v28.8h, #0
> + movi v29.8h, #0
> + ld1 {v27.8b}, [x1], x2
> + movi v30.8h, #0
> + movi v31.8h, #0
> + ushr v20.2d, v16.2d, #8
> + ushr v21.2d, v17.2d, #8
> + ushr v22.2d, v18.2d, #8
> + ushr v23.2d, v19.2d, #8
> + mov v20.b[7], v27.b[0]
> + mov v21.b[7], v27.b[1]
> + mov v22.b[7], v27.b[2]
> + mov v23.b[7], v27.b[3]
> + ushr v24.2d, v20.2d, #8
> + ushr v25.2d, v21.2d, #8
> + ushr v26.2d, v22.2d, #8
> + mov v24.b[7], v27.b[4]
> + mov v25.b[7], v27.b[5]
> + mov v26.b[7], v27.b[6]
> + calc_qpelb v28, v16, v17, v18, v19, v20, v21, v22, v23
> + calc_qpelb v29, v17, v18, v19, v20, v21, v22, v23, v24
> + calc_qpelb v30, v18, v19, v20, v21, v22, v23, v24, v25
> + calc_qpelb v31, v19, v20, v21, v22, v23, v24, v25, v26
> + st4 {v28.8h, v29.8h, v30.8h, v31.8h}, [x0], x14
> + subs w3, w3, #1
> + b.ne 1b
> + ret
> +endfunc
> +
> +function ff_hevc_put_hevc_qpel_h48_8_neon, export=1
> + load_qpel_filterb x4, x5
> + sub x1, x1, #3
> + sub x2, x2, #48
> + mov x7, #24
> + mov x14, #80
> +1: ld3 {v16.16b, v17.16b, v18.16b}, [x1], x7
> + movi v28.8h, #0
> + ld1 {v26.8b}, [x1], x7
> + movi v29.8h, #0
> + ld1 {v27.8b}, [x1], x2
> + movi v30.8h, #0
> + ushr v19.2d, v16.2d, #8
> + ushr v20.2d, v17.2d, #8
> + ushr v21.2d, v18.2d, #8
> + mov v19.b[7], v26.b[0]
> + mov v19.b[15], v27.b[0]
> + mov v20.b[7], v26.b[1]
> + mov v20.b[15], v27.b[1]
> + mov v21.b[7], v26.b[2]
> + mov v21.b[15], v27.b[2]
> + ushr v22.2d, v19.2d, #8
> + ushr v23.2d, v20.2d, #8
> + ushr v24.2d, v21.2d, #8
> + mov v22.b[7], v26.b[3]
> + mov v22.b[15], v27.b[3]
> + mov v23.b[7], v26.b[4]
> + mov v23.b[15], v27.b[4]
> + mov v24.b[7], v26.b[5]
> + mov v24.b[15], v27.b[5]
> + ushr v25.2d, v22.2d, #8
> + mov v25.b[7], v26.b[6]
> + mov v25.b[15], v27.b[6]
> + calc_qpelb v28, v16, v17, v18, v19, v20, v21, v22, v23
> + calc_qpelb v29, v17, v18, v19, v20, v21, v22, v23, v24
> + calc_qpelb v30, v18, v19, v20, v21, v22, v23, v24, v25
> + st3 {v28.8h, v29.8h, v30.8h}, [x0], #48
> + movi v28.8h, #0
> + movi v29.8h, #0
> + movi v30.8h, #0
> + calc_qpelb2 v28, v16, v17, v18, v19, v20, v21, v22, v23
> + calc_qpelb2 v29, v17, v18, v19, v20, v21, v22, v23, v24
> + calc_qpelb2 v30, v18, v19, v20, v21, v22, v23, v24, v25
> + st3 {v28.8h, v29.8h, v30.8h}, [x0], x14
> + subs w3, w3, #1
> + b.ne 1b
> + ret
> +endfunc
> +
> +function ff_hevc_put_hevc_qpel_h64_8_neon, export=1
> + load_qpel_filterb x4, x5
> + sub x1, x1, #3
> + sub x2, x2, #64
> + mov x7, #32
> +1: ld4 {v16.16b, v17.16b, v18.16b, v19.16b}, [x1], x7
> + ld1 {v27.8b}, [x1], x7
> + ld1 {v28.8b}, [x1], x2
> + ushr v20.2d, v16.2d, #8
> + ushr v21.2d, v17.2d, #8
> + ushr v22.2d, v18.2d, #8
> + ushr v23.2d, v19.2d, #8
> + mov v20.b[7], v27.b[0]
> + mov v21.b[7], v27.b[1]
> + mov v22.b[7], v27.b[2]
> + mov v23.b[7], v27.b[3]
> + mov v20.b[15], v28.b[0]
> + mov v21.b[15], v28.b[1]
> + mov v22.b[15], v28.b[2]
> + mov v23.b[15], v28.b[3]
> + ushr v24.2d, v20.2d, #8
> + ushr v25.2d, v21.2d, #8
> + ushr v26.2d, v22.2d, #8
> + mov v24.b[7], v27.b[4]
> + mov v25.b[7], v27.b[5]
> + mov v26.b[7], v27.b[6]
> + mov v24.b[15], v28.b[4]
> + mov v25.b[15], v28.b[5]
> + mov v26.b[15], v28.b[6]
> +.macro calc fn
> + movi v28.8h, #0
> + movi v29.8h, #0
> + movi v30.8h, #0
> + movi v31.8h, #0
> + \fn v28, v16, v17, v18, v19, v20, v21, v22, v23
> + \fn v29, v17, v18, v19, v20, v21, v22, v23, v24
> + \fn v30, v18, v19, v20, v21, v22, v23, v24, v25
> + \fn v31, v19, v20, v21, v22, v23, v24, v25, v26
> + st4 {v28.8h, v29.8h, v30.8h, v31.8h}, [x0], #64
> +.endm
> + calc calc_qpelb
> + calc calc_qpelb2
> +.purgem calc
> + subs w3, w3, #1
> + b.ne 1b
> + ret
> +endfunc
> +
> +.macro calc_all
> + calc v23, v16, v17, v18, v19, v20, v21, v22, v23
> + b.eq 2f
> + calc v16, v17, v18, v19, v20, v21, v22, v23, v16
> + b.eq 2f
> + calc v17, v18, v19, v20, v21, v22, v23, v16, v17
> + b.eq 2f
> + calc v18, v19, v20, v21, v22, v23, v16, v17, v18
> + b.eq 2f
> + calc v19, v20, v21, v22, v23, v16, v17, v18, v19
> + b.eq 2f
> + calc v20, v21, v22, v23, v16, v17, v18, v19, v20
> + b.eq 2f
> + calc v21, v22, v23, v16, v17, v18, v19, v20, v21
> + b.eq 2f
> + calc v22, v23, v16, v17, v18, v19, v20, v21, v22
> + b.hi 1b
> +.endm
> +
> +.macro calc_all2
> + calc v30, v31, v16, v18, v20, v22, v24, v26, v28, v30, v17, v19, v21, v23, v25, v27, v29, v31
> + b.eq 2f
> + calc v16, v17, v18, v20, v22, v24, v26, v28, v30, v16, v19, v21, v23, v25, v27, v29, v31, v17
> + b.eq 2f
> + calc v18, v19, v20, v22, v24, v26, v28, v30, v16, v18, v21, v23, v25, v27, v29, v31, v17, v19
> + b.eq 2f
> + calc v20, v21, v22, v24, v26, v28, v30, v16, v18, v20, v23, v25, v27, v29, v31, v17, v19, v21
> + b.eq 2f
> + calc v22, v23, v24, v26, v28, v30, v16, v18, v20, v22, v25, v27, v29, v31, v17, v19, v21, v23
> + b.eq 2f
> + calc v24, v25, v26, v28, v30, v16, v18, v20, v22, v24, v27, v29, v31, v17, v19, v21, v23, v25
> + b.eq 2f
> + calc v26, v27, v28, v30, v16, v18, v20, v22, v24, v26, v29, v31, v17, v19, v21, v23, v25, v27
> + b.eq 2f
> + calc v28, v29, v30, v16, v18, v20, v22, v24, v26, v28, v31, v17, v19, v21, v23, v25, v27, v29
> + b.hi 1b
> +.endm
> +
> +function ff_hevc_put_hevc_qpel_v4_8_neon, export=1
> + load_qpel_filterb x5, x4
> + sub x1, x1, x2, lsl #1
> + mov x9, #(MAX_PB_SIZE * 2)
> + sub x1, x1, x2
> + ld1 {v16.s}[0], [x1], x2
> + ld1 {v17.s}[0], [x1], x2
> + ld1 {v18.s}[0], [x1], x2
> + ld1 {v19.s}[0], [x1], x2
> + ld1 {v20.s}[0], [x1], x2
> + ld1 {v21.s}[0], [x1], x2
> + ld1 {v22.s}[0], [x1], x2
> +.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
> + ld1 {\tmp\().s}[0], [x1], x2
> + movi v24.8h, #0
> + calc_qpelb v24, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
> + st1 {v24.4h}, [x0], x9
> + subs w3, w3, #1
> + b.eq 2f
> +.endm
> +1: calc_all
> +.purgem calc
> +2: ret
> +endfunc
The calc_all macro expands to _a lot_ of code. It would be good if you'd
just expand it once (or at most twice) and have all other cases jump into
that instantiation of it. To be clear, this object file right now adds 41
KB of executable code. (The corresponding vp9mc_neon.o contains 10 KB of
executable code, and vp9mc_16bpp_neon.o another 9 KB.)
But it seems like each of them expands into different code. In that case,
I'd suggest unrolling less: instead, just do a series of "mov v16, v17;
mov v17, v18" etc. to shift the registers and skip the full unrolling. It
costs a little extra, but avoids wasting instruction cache.
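A rough, untested sketch of that non-unrolled shape for qpel_v4:

1:      ld1     {v23.s}[0], [x1], x2
        movi    v24.8h, #0
        calc_qpelb v24, v16, v17, v18, v19, v20, v21, v22, v23
        mov     v16.8b, v17.8b
        mov     v17.8b, v18.8b
        mov     v18.8b, v19.8b
        mov     v19.8b, v20.8b
        mov     v20.8b, v21.8b
        mov     v21.8b, v22.8b
        mov     v22.8b, v23.8b
        subs    w3, w3, #1
        st1     {v24.4h}, [x0], x9
        b.ne    1b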
> +
> +function ff_hevc_put_hevc_qpel_v6_8_neon, export=1
> + load_qpel_filterb x5, x4
> + sub x1, x1, x2, lsl #1
> + mov x9, #(MAX_PB_SIZE * 2 - 8)
> + sub x1, x1, x2
> + ld1 {v16.8b}, [x1], x2
> + ld1 {v17.8b}, [x1], x2
> + ld1 {v18.8b}, [x1], x2
> + ld1 {v19.8b}, [x1], x2
> + ld1 {v20.8b}, [x1], x2
> + ld1 {v21.8b}, [x1], x2
> + ld1 {v22.8b}, [x1], x2
> +.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
> + ld1 {\tmp\().8b}, [x1], x2
> + movi v24.8h, #0
> + calc_qpelb v24, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
> + st1 {v24.4h}, [x0], #8
> + st1 {v24.s}[2], [x0], x9
> + subs w3, w3, #1
> +.endm
> +1: calc_all
> +.purgem calc
> +2: ret
> +endfunc
> +
> +function ff_hevc_put_hevc_qpel_v8_8_neon, export=1
> + load_qpel_filterb x5, x4
> + sub x1, x1, x2, lsl #1
> + mov x9, #(MAX_PB_SIZE * 2)
> + sub x1, x1, x2
> + ld1 {v16.8b}, [x1], x2
> + ld1 {v17.8b}, [x1], x2
> + ld1 {v18.8b}, [x1], x2
> + ld1 {v19.8b}, [x1], x2
> + ld1 {v20.8b}, [x1], x2
> + ld1 {v21.8b}, [x1], x2
> + ld1 {v22.8b}, [x1], x2
> +.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
> + ld1 {\tmp\().8b}, [x1], x2
> + movi v24.8h, #0
> + calc_qpelb v24, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
> + st1 {v24.8h}, [x0], x9
> + subs w3, w3, #1
> +.endm
> +1: calc_all
> +.purgem calc
> +2: ret
> +endfunc
> +
> +function ff_hevc_put_hevc_qpel_v12_8_neon, export=1
> + load_qpel_filterb x5, x4
> + sub x1, x1, x2, lsl #1
> + mov x9, #(MAX_PB_SIZE * 2 - 16)
> + sub x1, x1, x2
> + ld1 {v16.16b}, [x1], x2
> + ld1 {v17.16b}, [x1], x2
> + ld1 {v18.16b}, [x1], x2
> + ld1 {v19.16b}, [x1], x2
> + ld1 {v20.16b}, [x1], x2
> + ld1 {v21.16b}, [x1], x2
> + ld1 {v22.16b}, [x1], x2
> +.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
> + ld1 {\tmp\().16b}, [x1], x2
> + movi v24.8h, #0
> + movi v25.8h, #0
> + calc_qpelb v24, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
> + calc_qpelb2 v25, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
> + st1 {v24.8h}, [x0], #16
> + subs w3, w3, #1
> + st1 {v25.4h}, [x0], x9
> +.endm
> +1: calc_all
> +.purgem calc
> +2: ret
> +endfunc
> +
> +function ff_hevc_put_hevc_qpel_v16_8_neon, export=1
> + load_qpel_filterb x5, x4
> + sub x1, x1, x2, lsl #1
> + mov x9, #(MAX_PB_SIZE * 2)
> + sub x1, x1, x2
> + ld1 {v16.16b}, [x1], x2
> + ld1 {v17.16b}, [x1], x2
> + ld1 {v18.16b}, [x1], x2
> + ld1 {v19.16b}, [x1], x2
> + ld1 {v20.16b}, [x1], x2
> + ld1 {v21.16b}, [x1], x2
> + ld1 {v22.16b}, [x1], x2
> +.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
> + ld1 {\tmp\().16b}, [x1], x2
> + movi v24.8h, #0
> + movi v25.8h, #0
> + calc_qpelb v24, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
> + calc_qpelb2 v25, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
> + subs w3, w3, #1
> + st1 {v24.8h, v25.8h}, [x0], x9
> +.endm
> +1: calc_all
> +.purgem calc
> +2: ret
> +endfunc
> +
> +// todo: reads #32 bytes
> +function ff_hevc_put_hevc_qpel_v24_8_neon, export=1
> + sub sp, sp, #48
> + st1 {v8.16b, v9.16b, v10.16b}, [sp]
> + load_qpel_filterb x5, x4
> + sub x1, x1, x2, lsl #1
> + sub x1, x1, x2
> + mov x9, #(MAX_PB_SIZE * 2)
> + ld1 {v16.16b, v17.16b}, [x1], x2
> + ld1 {v18.16b, v19.16b}, [x1], x2
> + ld1 {v20.16b, v21.16b}, [x1], x2
> + ld1 {v22.16b, v23.16b}, [x1], x2
> + ld1 {v24.16b, v25.16b}, [x1], x2
> + ld1 {v26.16b, v27.16b}, [x1], x2
> + ld1 {v28.16b, v29.16b}, [x1], x2
> +.macro calc tmp0, tmp1, src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11, src12, src13, src14, src15
> + ld1 {\tmp0\().16b, \tmp1\().16b}, [x1], x2
> + movi v8.8h, #0
> + movi v9.8h, #0
> + movi v10.8h, #0
> + calc_qpelb v8, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
> + calc_qpelb2 v9, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
> + calc_qpelb v10, \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15
> + subs w3, w3, #1
> + st1 {v8.8h, v9.8h, v10.8h}, [x0], x9
> +.endm
> +1: calc_all2
> +.purgem calc
> +2: ld1 {v8.16b, v9.16b, v10.16b}, [sp], #48
> + ret
> +endfunc
> +
> +function ff_hevc_put_hevc_qpel_v32_8_neon, export=1
> + sub sp, sp, #64
> + st1 {v8.16b-v11.16b}, [sp]
> + load_qpel_filterb x5, x4
> + sub x1, x1, x2, lsl #1
> + mov x9, #(MAX_PB_SIZE * 2)
> + sub x1, x1, x2
> + ld1 {v16.16b, v17.16b}, [x1], x2
> + ld1 {v18.16b, v19.16b}, [x1], x2
> + ld1 {v20.16b, v21.16b}, [x1], x2
> + ld1 {v22.16b, v23.16b}, [x1], x2
> + ld1 {v24.16b, v25.16b}, [x1], x2
> + ld1 {v26.16b, v27.16b}, [x1], x2
> + ld1 {v28.16b, v29.16b}, [x1], x2
> +.macro calc tmp0, tmp1, src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11, src12, src13, src14, src15
> + ld1 {\tmp0\().16b, \tmp1\().16b}, [x1], x2
> + movi v8.8h, #0
> + movi v9.8h, #0
> + movi v10.8h, #0
> + movi v11.8h, #0
> + calc_qpelb v8, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
> + calc_qpelb2 v9, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
> + calc_qpelb v10, \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15
> + calc_qpelb2 v11, \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15
> + subs w3, w3, #1
> + st1 {v8.8h-v11.8h}, [x0], x9
> +.endm
> +1: calc_all2
> +.purgem calc
> +2: ld1 {v8.16b-v11.16b}, [sp], #64
> + ret
> +endfunc
> +
> +function ff_hevc_put_hevc_qpel_v48_8_neon, export=1
> + stp x5, x30, [sp, #-16]!
> + stp x0, x1, [sp, #-16]!
> + stp x2, x3, [sp, #-16]!
> + bl X(ff_hevc_put_hevc_qpel_v24_8_neon)
> + ldp x2, x3, [sp], #16
> + ldp x0, x1, [sp], #16
> + ldr x5, [sp]
> + add x0, x0, #48
> + add x1, x1, #24
> + bl X(ff_hevc_put_hevc_qpel_v24_8_neon)
> + ldp xzr, x30, [sp], #16
> + ret
> +endfunc
> +
> +function ff_hevc_put_hevc_qpel_v64_8_neon, export=1
> + sub sp, sp, #64
> + st1 {v8.16b-v11.16b}, [sp]
> + load_qpel_filterb x5, x4
> + sub x1, x1, x2, lsl #1
> + sub x1, x1, x2
> + mov x9, #(MAX_PB_SIZE * 2)
> +0: mov x8, x1 // src
> + ld1 {v16.16b, v17.16b}, [x8], x2
> + mov w11, w3 // height
> + ld1 {v18.16b, v19.16b}, [x8], x2
> + mov x10, x0 // dst
> + ld1 {v20.16b, v21.16b}, [x8], x2
> + ld1 {v22.16b, v23.16b}, [x8], x2
> + ld1 {v24.16b, v25.16b}, [x8], x2
> + ld1 {v26.16b, v27.16b}, [x8], x2
> + ld1 {v28.16b, v29.16b}, [x8], x2
> +.macro calc tmp0, tmp1, src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11, src12, src13, src14, src15
> + ld1 {\tmp0\().16b, \tmp1\().16b}, [x8], x2
> + movi v8.8h, #0
> + movi v9.8h, #0
> + movi v10.8h, #0
> + movi v11.8h, #0
> + calc_qpelb v8, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
> + calc_qpelb2 v9, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
> + calc_qpelb v10, \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15
> + calc_qpelb2 v11, \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15
> + subs x11, x11, #1
> + st1 {v8.8h-v11.8h}, [x10], x9
> +.endm
> +1: calc_all2
> +.purgem calc
> +2: add x0, x0, #64
> + add x1, x1, #32
> + subs w6, w6, #32
> + b.hi 0b
> + ld1 {v8.16b-v11.16b}, [sp], #64
> + ret
> +endfunc
> +
> +function ff_hevc_put_hevc_qpel_hv4_8_neon, export=1
> + add w10, w3, #7
> + mov x7, #128
> + lsl x10, x10, #7
> + sub sp, sp, x10 // tmp_array
> + stp x0, x3, [sp, #-16]!
> + stp x5, x30, [sp, #-16]!
> + add x0, sp, #32
> + sub x1, x1, x2, lsl #1
> + add x3, x3, #7
> + sub x1, x1, x2
> + bl X(ff_hevc_put_hevc_qpel_h4_8_neon)
> + ldp x5, x30, [sp], #16
> + ldp x0, x3, [sp], #16
> + load_qpel_filterh x5, x4
> + ld1 {v16.4h}, [sp], x7
> + ld1 {v17.4h}, [sp], x7
> + ld1 {v18.4h}, [sp], x7
> + ld1 {v19.4h}, [sp], x7
> + ld1 {v20.4h}, [sp], x7
> + ld1 {v21.4h}, [sp], x7
> + ld1 {v22.4h}, [sp], x7
> +.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
> + ld1 {\tmp\().4h}, [sp], x7
> + calc_qpelh v1, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqshrn
> + subs w3, w3, #1
> + st1 {v1.4h}, [x0], x7
> +.endm
> +1: calc_all
> +.purgem calc
> +2: ret
> +endfunc
> +
> +function ff_hevc_put_hevc_qpel_hv6_8_neon, export=1
> + add w10, w3, #7
> + mov x7, #128
> + lsl x10, x10, #7
> + sub sp, sp, x10 // tmp_array
> + stp x0, x3, [sp, #-16]!
> + stp x5, x30, [sp, #-16]!
> + add x0, sp, #32
> + sub x1, x1, x2, lsl #1
> + add x3, x3, #7
> + sub x1, x1, x2
> + bl X(ff_hevc_put_hevc_qpel_h6_8_neon)
> + ldp x5, x30, [sp], #16
> + mov x8, #120
> + ldp x0, x3, [sp], #16
> + load_qpel_filterh x5, x4
> + ld1 {v16.8h}, [sp], x7
> + ld1 {v17.8h}, [sp], x7
> + ld1 {v18.8h}, [sp], x7
> + ld1 {v19.8h}, [sp], x7
> + ld1 {v20.8h}, [sp], x7
> + ld1 {v21.8h}, [sp], x7
> + ld1 {v22.8h}, [sp], x7
> +.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
> + ld1 {\tmp\().8h}, [sp], x7
> + calc_qpelh v1, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqshrn
> + calc_qpelh2 v1, v2, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqshrn2
> + st1 {v1.4h}, [x0], #8
> + subs w3, w3, #1
> + st1 {v1.s}[2], [x0], x8
> +.endm
> +1: calc_all
> +.purgem calc
> +2: ret
> +endfunc
> +
> +function ff_hevc_put_hevc_qpel_hv8_8_neon, export=1
> + add w10, w3, #7
> + lsl x10, x10, #7
> + sub x1, x1, x2, lsl #1
> + sub sp, sp, x10 // tmp_array
> + stp x0, x3, [sp, #-16]!
> + stp x5, x30, [sp, #-16]!
> + add x0, sp, #32
> + add x3, x3, #7
> + sub x1, x1, x2
> + bl X(ff_hevc_put_hevc_qpel_h8_8_neon)
> + ldp x5, x30, [sp], #16
> + mov x7, #128
> + ldp x0, x3, [sp], #16
> + load_qpel_filterh x5, x4
> + ld1 {v16.8h}, [sp], x7
> + ld1 {v17.8h}, [sp], x7
> + ld1 {v18.8h}, [sp], x7
> + ld1 {v19.8h}, [sp], x7
> + ld1 {v20.8h}, [sp], x7
> + ld1 {v21.8h}, [sp], x7
> + ld1 {v22.8h}, [sp], x7
> +.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
> + ld1 {\tmp\().8h}, [sp], x7
> + calc_qpelh v1, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqshrn
> + calc_qpelh2 v1, v2, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqshrn2
> + subs w3, w3, #1
> + st1 {v1.8h}, [x0], x7
> +.endm
> +1: calc_all
> +.purgem calc
> +2: ret
> +endfunc
> +
> +function ff_hevc_put_hevc_qpel_hv12_8_neon, export=1
> + add w10, w3, #7
> + lsl x10, x10, #7
> + sub x1, x1, x2, lsl #1
> + sub sp, sp, x10 // tmp_array
> + stp x0, x3, [sp, #-16]!
> + stp x5, x30, [sp, #-16]!
> + add x0, sp, #32
> + add x3, x3, #7
> + sub x1, x1, x2
> + bl X(ff_hevc_put_hevc_qpel_h12_8_neon)
> + ldp x5, x30, [sp], #16
> + mov x7, #128
> + ldp x0, x3, [sp], #16
> + load_qpel_filterh x5, x4
> + mov x8, #112
> + ld1 {v16.8h, v17.8h}, [sp], x7
> + ld1 {v18.8h, v19.8h}, [sp], x7
> + ld1 {v20.8h, v21.8h}, [sp], x7
> + ld1 {v22.8h, v23.8h}, [sp], x7
> + ld1 {v24.8h, v25.8h}, [sp], x7
> + ld1 {v26.8h, v27.8h}, [sp], x7
> + ld1 {v28.8h, v29.8h}, [sp], x7
> +.macro calc tmp0, tmp1, src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11, src12, src13, src14, src15
> + ld1 {\tmp0\().8h, \tmp1\().8h}, [sp], x7
> + calc_qpelh v1, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqshrn
> + calc_qpelh2 v1, v2, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqshrn2
> + calc_qpelh v2, \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15, sqshrn
> + st1 {v1.8h}, [x0], #16
> + subs w3, w3, #1
> + st1 {v2.4h}, [x0], x8
> +.endm
> +1: calc_all2
> +.purgem calc
> +2: ret
> +endfunc
> +
> +function ff_hevc_put_hevc_qpel_hv16_8_neon, export=1
> + add w10, w3, #7
> + lsl x10, x10, #7
> + sub x1, x1, x2, lsl #1
> + sub sp, sp, x10 // tmp_array
> + stp x0, x3, [sp, #-16]!
> + stp x5, x30, [sp, #-16]!
> + add x3, x3, #7
> + add x0, sp, #32
> + sub x1, x1, x2
> + bl X(ff_hevc_put_hevc_qpel_h16_8_neon)
> + ldp x5, x30, [sp], #16
> + mov x7, #128
> + ldp x0, x3, [sp], #16
> + load_qpel_filterh x5, x4
> + ld1 {v16.8h, v17.8h}, [sp], x7
> + ld1 {v18.8h, v19.8h}, [sp], x7
> + ld1 {v20.8h, v21.8h}, [sp], x7
> + ld1 {v22.8h, v23.8h}, [sp], x7
> + ld1 {v24.8h, v25.8h}, [sp], x7
> + ld1 {v26.8h, v27.8h}, [sp], x7
> + ld1 {v28.8h, v29.8h}, [sp], x7
> +.macro calc tmp0, tmp1, src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11, src12, src13, src14, src15
> + ld1 {\tmp0\().8h, \tmp1\().8h}, [sp], x7
> + calc_qpelh v1, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqshrn
> + calc_qpelh2 v1, v2, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqshrn2
> + calc_qpelh v2, \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15, sqshrn
> + calc_qpelh2 v2, v3, \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15, sqshrn2
> + subs w3, w3, #1
> + st1 {v1.8h, v2.8h}, [x0], x7
> +.endm
> +1: calc_all2
> +.purgem calc
> +2: ret
> +endfunc
> +
> +function ff_hevc_put_hevc_qpel_hv24_8_neon, export=1
> + sub sp, sp, #64
> + st1 {v8.16b-v11.16b}, [sp]
> + sub x1, x1, x2, lsl #1
> + sub sp, sp, #64
> + add w10, w3, #7
> + st1 {v12.16b-v15.16b}, [sp]
> + lsl x10, x10, #7
> + sub sp, sp, x10 // tmp_array
> + stp x0, x3, [sp, #-16]!
> + stp x5, x30, [sp, #-16]!
> + add x0, sp, #32
> + add x3, x3, #7
> + sub x1, x1, x2
> + bl X(ff_hevc_put_hevc_qpel_h24_8_neon)
> + ldp x5, x30, [sp], #16
> + mov x7, #128
> + ldp x0, x3, [sp], #16
> + load_qpel_filterh x5, x4
> + ld1 {v8.8h-v10.8h}, [sp], x7
> + ld1 {v11.8h-v13.8h}, [sp], x7
> + ld1 {v14.8h-v16.8h}, [sp], x7
> + ld1 {v17.8h-v19.8h}, [sp], x7
> + ld1 {v20.8h-v22.8h}, [sp], x7
> + ld1 {v23.8h-v25.8h}, [sp], x7
> + ld1 {v26.8h-v28.8h}, [sp], x7
> +1: ld1 {v29.8h-v31.8h}, [sp], x7
> + calc_qpelh v1, v8, v11, v14, v17, v20, v23, v26, v29, sqshrn
> + calc_qpelh2 v1, v2, v8, v11, v14, v17, v20, v23, v26, v29, sqshrn2
> + calc_qpelh v2, v9, v12, v15, v18, v21, v24, v27, v30, sqshrn
> + calc_qpelh2 v2, v3, v9, v12, v15, v18, v21, v24, v27, v30, sqshrn2
> + calc_qpelh v3, v10, v13, v16, v19, v22, v25, v28, v31, sqshrn
> + calc_qpelh2 v3, v4, v10, v13, v16, v19, v22, v25, v28, v31, sqshrn2
For code like this, please try to align register columns vertically, if
possible...
Also, please unroll less here.
> + subs w3, w3, #1
> + st1 {v1.8h-v3.8h}, [x0], x7
> + b.eq 2f
> +
> + ld1 {v8.8h-v10.8h}, [sp], x7
> + calc_qpelh v1, v11, v14, v17, v20, v23, v26, v29, v8, sqshrn
> + calc_qpelh2 v1, v2, v11, v14, v17, v20, v23, v26, v29, v8, sqshrn2
> + calc_qpelh v2, v12, v15, v18, v21, v24, v27, v30, v9, sqshrn
> + calc_qpelh2 v2, v3, v12, v15, v18, v21, v24, v27, v30, v9, sqshrn2
> + calc_qpelh v3, v13, v16, v19, v22, v25, v28, v31, v10, sqshrn
> + calc_qpelh2 v3, v4, v13, v16, v19, v22, v25, v28, v31, v10, sqshrn2
> + subs w3, w3, #1
> + st1 {v1.8h-v3.8h}, [x0], x7
> + b.eq 2f
> +
> + ld1 {v11.8h-v13.8h}, [sp], x7
> + calc_qpelh v1, v14, v17, v20, v23, v26, v29, v8, v11, sqshrn
> + calc_qpelh2 v1, v2, v14, v17, v20, v23, v26, v29, v8, v11, sqshrn2
> + calc_qpelh v2, v15, v18, v21, v24, v27, v30, v9, v12, sqshrn
> + calc_qpelh2 v2, v3, v15, v18, v21, v24, v27, v30, v9, v12, sqshrn2
> + calc_qpelh v3, v16, v19, v22, v25, v28, v31, v10, v13, sqshrn
> + calc_qpelh2 v3, v4, v16, v19, v22, v25, v28, v31, v10, v13, sqshrn2
> + subs w3, w3, #1
> + st1 {v1.8h-v3.8h}, [x0], x7
> + b.eq 2f
> +
> + ld1 {v14.8h-v16.8h}, [sp], x7
> + calc_qpelh v1, v17, v20, v23, v26, v29, v8, v11, v14, sqshrn
> + calc_qpelh2 v1, v2, v17, v20, v23, v26, v29, v8, v11, v14, sqshrn2
> + calc_qpelh v2, v18, v21, v24, v27, v30, v9, v12, v15, sqshrn
> + calc_qpelh2 v2, v3, v18, v21, v24, v27, v30, v9, v12, v15, sqshrn2
> + calc_qpelh v3, v19, v22, v25, v28, v31, v10, v13, v16, sqshrn
> + calc_qpelh2 v3, v4, v19, v22, v25, v28, v31, v10, v13, v16, sqshrn2
> + subs w3, w3, #1
> + st1 {v1.8h-v3.8h}, [x0], x7
> + b.eq 2f
> +
> + ld1 {v17.8h-v19.8h}, [sp], x7
> + calc_qpelh v1, v20, v23, v26, v29, v8, v11, v14, v17, sqshrn
> + calc_qpelh2 v1, v2, v20, v23, v26, v29, v8, v11, v14, v17, sqshrn2
> + calc_qpelh v2, v21, v24, v27, v30, v9, v12, v15, v18, sqshrn
> + calc_qpelh2 v2, v3, v21, v24, v27, v30, v9, v12, v15, v18, sqshrn2
> + calc_qpelh v3, v22, v25, v28, v31, v10, v13, v16, v19, sqshrn
> + calc_qpelh2 v3, v4, v22, v25, v28, v31, v10, v13, v16, v19, sqshrn2
> + subs w3, w3, #1
> + st1 {v1.8h-v3.8h}, [x0], x7
> + b.eq 2f
> +
> + ld1 {v20.8h-v22.8h}, [sp], x7
> + calc_qpelh v1, v23, v26, v29, v8, v11, v14, v17, v20, sqshrn
> + calc_qpelh2 v1, v2, v23, v26, v29, v8, v11, v14, v17, v20, sqshrn2
> + calc_qpelh v2, v24, v27, v30, v9, v12, v15, v18, v21, sqshrn
> + calc_qpelh2 v2, v3, v24, v27, v30, v9, v12, v15, v18, v21, sqshrn2
> + calc_qpelh v3, v25, v28, v31, v10, v13, v16, v19, v22, sqshrn
> + calc_qpelh2 v3, v4, v25, v28, v31, v10, v13, v16, v19, v22, sqshrn2
> + subs w3, w3, #1
> + st1 {v1.8h-v3.8h}, [x0], x7
> + b.eq 2f
> +
> + ld1 {v23.8h-v25.8h}, [sp], x7
> + calc_qpelh v1, v26, v29, v8, v11, v14, v17, v20, v23, sqshrn
> + calc_qpelh2 v1, v2, v26, v29, v8, v11, v14, v17, v20, v23, sqshrn2
> + calc_qpelh v2, v27, v30, v9, v12, v15, v18, v21, v24, sqshrn
> + calc_qpelh2 v2, v3, v27, v30, v9, v12, v15, v18, v21, v24, sqshrn2
> + calc_qpelh v3, v28, v31, v10, v13, v16, v19, v22, v25, sqshrn
> + calc_qpelh2 v3, v4, v28, v31, v10, v13, v16, v19, v22, v25, sqshrn2
> + subs w3, w3, #1
> + st1 {v1.8h-v3.8h}, [x0], x7
> + b.eq 2f
> +
> + ld1 {v26.8h-v28.8h}, [sp], x7
> + calc_qpelh v1, v29, v8, v11, v14, v17, v20, v23, v26, sqshrn
> + calc_qpelh2 v1, v2, v29, v8, v11, v14, v17, v20, v23, v26, sqshrn2
> + calc_qpelh v2, v30, v9, v12, v15, v18, v21, v24, v27, sqshrn
> + calc_qpelh2 v2, v3, v30, v9, v12, v15, v18, v21, v24, v27, sqshrn2
> + calc_qpelh v3, v31, v10, v13, v16, v19, v22, v25, v28, sqshrn
> + calc_qpelh2 v3, v4, v31, v10, v13, v16, v19, v22, v25, v28, sqshrn2
> + subs w3, w3, #1
> + st1 {v1.8h-v3.8h}, [x0], x7
> + b.hi 1b
> +2: ld1 {v12.16b-v15.16b}, [sp], #64
> + ld1 {v8.16b-v11.16b}, [sp], #64
> + ret
> +endfunc
> +
> +function ff_hevc_put_hevc_qpel_hv32_8_neon, export=1
> + add w10, w3, #7
> + sub x1, x1, x2, lsl #1
> + lsl x10, x10, #7
> + sub x1, x1, x2
> + sub sp, sp, x10 // tmp_array
> + stp x0, x3, [sp, #-16]!
> + add x3, x3, #7
> + stp x5, x30, [sp, #-16]!
> + add x0, sp, #32
> + bl X(ff_hevc_put_hevc_qpel_h32_8_neon)
> + ldp x5, x30, [sp], #16
> + mov x7, #128
> + ldp x0, x3, [sp], #16
> + load_qpel_filterh x5, x4
> +0: mov x8, sp // src
> + ld1 {v16.8h, v17.8h}, [x8], x7
> + mov w9, w3 // height
> + ld1 {v18.8h, v19.8h}, [x8], x7
> + mov x5, x0 // dst
> + ld1 {v20.8h, v21.8h}, [x8], x7
> + ld1 {v22.8h, v23.8h}, [x8], x7
> + ld1 {v24.8h, v25.8h}, [x8], x7
> + ld1 {v26.8h, v27.8h}, [x8], x7
> + ld1 {v28.8h, v29.8h}, [x8], x7
> +.macro calc tmp0, tmp1, src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11, src12, src13, src14, src15
> + ld1 {\tmp0\().8h, \tmp1\().8h}, [x8], x7
> + calc_qpelh v1, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqshrn
> + calc_qpelh2 v1, v2, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqshrn2
> + calc_qpelh v2, \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15, sqshrn
> + calc_qpelh2 v2, v3, \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15, sqshrn2
> + subs x9, x9, #1
> + st1 {v1.8h, v2.8h}, [x5], x7
> +.endm
> +1: calc_all2
> +.purgem calc
> +2: add x0, x0, #32
> + add sp, sp, #32
> + subs w6, w6, #16
> + b.hi 0b
> + add w10, w3, #6
> + add sp, sp, #64 // discard rest of first line
> + lsl x10, x10, #7
> + add sp, sp, x10 // tmp_array without first line
> + ret
> +endfunc
> +
> +function ff_hevc_put_hevc_qpel_hv48_8_neon, export=1
> + stp xzr, x30, [sp, #-16]!
> + stp x0, x1, [sp, #-16]!
> + stp x2, x3, [sp, #-16]!
> + stp x4, x5, [sp, #-16]!
Storing more than one register like this is usually done with a series of
instructions like this:
stp x0, x1, [sp, #-64]!
stp x2, x3, [sp, #16]
stp x4, x5, [sp, #32]
...
That way, sp only has to be updated once.
Also, you don't need to store xzr. If you don't have a pair of registers
to store, back it up with 'str' instead of 'stp'. (But the stack must be
kept 16-byte aligned.)
Same thing in reverse when restoring registers:
ldp x4, x5, [sp, #32]
ldp x2, x3, [sp, #16]
ldp x0, x1, [sp], #64
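And for the lone x30 here, a plain str/ldr pair keeps the stack 16-byte
aligned without storing a dummy xzr:

        str     x30, [sp, #-16]!
        ...
        ldr     x30, [sp], #16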
> + bl X(ff_hevc_put_hevc_qpel_hv24_8_neon)
> + ldp x4, x5, [sp], #16
> + ldp x2, x3, [sp], #16
> + ldp x0, x1, [sp], #16
> + add x1, x1, #24
> + add x0, x0, #48
> + bl X(ff_hevc_put_hevc_qpel_hv24_8_neon)
> + ldp xzr, x30, [sp], #16
> + ret
> +endfunc
> +
> +function ff_hevc_put_hevc_qpel_hv64_8_neon, export=1
> + stp xzr, x30, [sp, #-16]!
> + stp x0, x1, [sp, #-16]!
> + stp x2, x3, [sp, #-16]!
> + stp x4, x5, [sp, #-16]!
> + mov x6, #32
> + bl X(ff_hevc_put_hevc_qpel_hv32_8_neon)
> + ldp x4, x5, [sp], #16
> + ldp x2, x3, [sp], #16
> + ldp x0, x1, [sp], #16
> + add x1, x1, #32
> + add x0, x0, #64
> + mov x6, #32
> + bl X(ff_hevc_put_hevc_qpel_hv32_8_neon)
> + ldp xzr, x30, [sp], #16
> + ret
> +endfunc
> +
> +function ff_hevc_put_hevc_qpel_uni_h4_8_neon, export=1
> + load_qpel_filterb x5, x6
> + sub x2, x2, #3
> +1: ld1 {v16.8b, v17.8b}, [x2], x3
> + movi v20.8h, #0
> +.macro calc op, src
> + \op v20.8h, v16.8b, v\src\().8b
> + ushr v16.2d, v16.2d, #8
> + mov v16.b[7], v17.b[\src]
> +.endm
> + calc umlsl, 0
> + calc umlal, 1
> + calc umlsl, 2
> + calc umlal, 3
> + calc umlal, 4
> + calc umlsl, 5
> + calc umlal, 6
Align the macro parameters as instruction operands, like other instances
of 'calc' above.
> +// no purgem
> + umlsl v20.8h, v16.8b, v7.8b
> + sqrshrun v20.8b, v20.8h, #6
> + subs w4, w4, #1
> + st1 {v20.s}[0], [x0], x1
> + b.ne 1b
> + ret
> +endfunc
> +
> +function ff_hevc_put_hevc_qpel_uni_h6_8_neon, export=1
> + load_qpel_filterb x5, x6
> + sub x2, x2, #3
> + sub x1, x1, #4
> +1: ld1 {v16.8b, v17.8b}, [x2], x3
> + movi v20.8h, #0
> +// same macro as above
> + calc umlsl, 0
> + calc umlal, 1
> + calc umlsl, 2
> + calc umlal, 3
> + calc umlal, 4
> + calc umlsl, 5
> + calc umlal, 6
> + umlsl v20.8h, v16.8b, v7.8b
> + sqrshrun v20.8b, v20.8h, #6
> + st1 {v20.s}[0], [x0], #4
> + subs w4, w4, #1
> + st1 {v20.h}[2], [x0], x1
> + b.ne 1b
> + ret
> +endfunc
> +
> +function ff_hevc_put_hevc_qpel_uni_h8_8_neon, export=1
> + load_qpel_filterb x5, x6
> + sub x2, x2, #3
> +1: ld1 {v16.8b, v17.8b}, [x2], x3
> + movi v20.8h, #0
> +// same macro as above
> + calc umlsl, 0
> + calc umlal, 1
> + calc umlsl, 2
> + calc umlal, 3
> + calc umlal, 4
> + calc umlsl, 5
> + calc umlal, 6
> +.purgem calc
> + umlsl v20.8h, v16.8b, v7.8b
> + sqrshrun v20.8b, v20.8h, #6
> + subs w4, w4, #1
> + st1 {v20.8b}, [x0], x1
> + b.ne 1b
> + ret
> +endfunc
> +
> +function ff_hevc_put_hevc_qpel_uni_h12_8_neon, export=1
> + load_qpel_filterb x5, x6
> + sub x2, x2, #3
> + sub x1, x1, #8
> +1: ld2 {v16.8b, v17.8b}, [x2]
> + movi v20.8h, #0
> + ldr w12, [x2, #16]
> + movi v21.8h, #0
> +.macro calc op1, op2, r0, r1, r2, src0, src1, tail=0
> + \op1 \r0\().8h, \r1\().8b, \src0\().8b
> + \op2 \r0\().8h, \r2\().8b, \src1\().8b
> +.if \tail-1
> + ushr \r1\().2d, \r1\().2d, #8
> +.endif
> +.endm
> + calc umlsl, umlal, v20, v16, v17, v0, v1
> + mov v16.b[7], w12
> + lsr x12, x12, #8
> + calc umlsl, umlal, v21, v17, v16, v0, v1
> + mov v17.b[7], w12
> + lsr x12, x12, #8
> + calc umlsl, umlal, v20, v16, v17, v2, v3
Same here about aligning macro parameters. Also, if we didn't switch the
sign of some of the coefficients, this could all use umlal consistently?
> + mov v16.b[7], w12
> + calc umlsl, umlal, v21, v17, v16, v2, v3
> + calc umlal, umlsl, v20, v16, v17, v4, v5
> + calc umlal, umlsl, v21, v17, v16, v4, v5
> + calc umlal, umlsl, v20, v16, v17, v6, v7
> + calc umlal, umlsl, v21, v17, v16, v6, v7, 1
> +.purgem calc
> + zip1 v16.8h, v20.8h, v21.8h
> + zip2 v17.8h, v20.8h, v21.8h
> + sqrshrun v20.8b, v16.8h, #6
> + sqrshrun2 v20.16b, v17.8h, #6
> + st1 {v20.8b}, [x0], #8
> + add x2, x2, x3
> + st1 {v20.s}[2], [x0], x1
> + subs w4, w4, #1
> + b.ne 1b
> + ret
> +endfunc
> +
> +function ff_hevc_put_hevc_qpel_uni_h16_8_neon, export=1
> + load_qpel_filterb x5, x6
> + sub x2, x2, #3
> +1: ld2 {v16.8b, v17.8b}, [x2]
> + ldr x12, [x2, #16]
> + movi v20.8h, #0
> + movi v21.8h, #0
> +.macro calc op1, op2, dst, r0, r1, src0, src1, tail=0
> + \op1 \dst\().8h, \r0\().8b, \src0\().8b
> + \op2 \dst\().8h, \r1\().8b, \src1\().8b
> + ushr \r0\().2d, \r0\().2d, #8
> + mov \r0\().b[7], w12
> +.if \tail-1
> + lsr x12, x12, #8
> +.endif
> +.endm
> + calc umlsl, umlal, v20, v16, v17, v0, v1
> + calc umlsl, umlal, v21, v17, v16, v0, v1
> + calc umlsl, umlal, v20, v16, v17, v2, v3
> + calc umlsl, umlal, v21, v17, v16, v2, v3
> + calc umlal, umlsl, v20, v16, v17, v4, v5
> + calc umlal, umlsl, v21, v17, v16, v4, v5
> + calc umlal, umlsl, v20, v16, v17, v6, v7, 1
> +.purgem calc
> + umlal v21.8h, v17.8b, v6.8b
> + umlsl v21.8h, v16.8b, v7.8b
> + sqrshrun v20.8b, v20.8h, #6
> + sqrshrun v21.8b, v21.8h, #6
> + st2 {v20.8b, v21.8b}, [x0], x1
> + add x2, x2, x3
> + subs w4, w4, #1
> + b.ne 1b
> + ret
> +endfunc
> +
> +function ff_hevc_put_hevc_qpel_uni_h24_8_neon, export=1
> + load_qpel_filterb x5, x6
> + sub x2, x2, #3
> +1: ld3 {v16.8b-v18.8b}, [x2]
> + ldr x12, [x2, #24]
> + movi v20.8h, #0
> + movi v21.8h, #0
> + movi v22.8h, #0
> +.macro calc op1, op2, dst, r0, r1, r2, src0, src1, src2
> + \op1 \dst\().8h, \r0\().8b, \src0\().8b
> + \op2 \dst\().8h, \r1\().8b, \src1\().8b
> + umlsl \dst\().8h, \r2\().8b, \src2\().8b
> + ushr \r0\().2d, \r0\().2d, #8
> + mov \r0\().b[7], w12
> + lsr x12, x12, #8
> +.endm
> + calc umlsl, umlal, v20, v16, v17, v18, v0, v1, v2
> + calc umlsl, umlal, v21, v17, v18, v16, v0, v1, v2
> + calc umlsl, umlal, v22, v18, v16, v17, v0, v1, v2
> + calc umlal, umlal, v20, v16, v17, v18, v3, v4, v5
> + calc umlal, umlal, v21, v17, v18, v16, v3, v4, v5
> + calc umlal, umlal, v22, v18, v16, v17, v3, v4, v5
> +.purgem calc
> + umlal v20.8h, v16.8b, v6.8b
> + umlsl v20.8h, v17.8b, v7.8b
> + ushr v16.2d, v16.2d, #8
> + mov v16.b[7], w12
> + umlal v21.8h, v17.8b, v6.8b
> + umlsl v21.8h, v18.8b, v7.8b
> + umlal v22.8h, v18.8b, v6.8b
> + umlsl v22.8h, v16.8b, v7.8b
> + sqrshrun v20.8b, v20.8h, #6
> + sqrshrun v22.8b, v22.8h, #6
> + sqrshrun v21.8b, v21.8h, #6
> + st3 {v20.8b-v22.8b}, [x0], x1
> + add x2, x2, x3
> + subs w4, w4, #1
> + b.ne 1b
> + ret
> +endfunc
> +
> +function ff_hevc_put_hevc_qpel_uni_h32_8_neon, export=1
> + load_qpel_filterb x5, x6
> + sub x2, x2, #3
> +1: ld4 {v16.8b-v19.8b}, [x2]
> + ldr x12, [x2, #32]
> + movi v20.8h, #0
> + movi v21.8h, #0
> + movi v22.8h, #0
> + movi v23.8h, #0
> +.macro calc op1, op2, dst, r0, r1, r2, r3, src0, src1, src2, src3, tail=0
> + \op1 \dst\().8h, \r0\().8b, \src0\().8b
> + \op2 \dst\().8h, \r1\().8b, \src1\().8b
> + \op1 \dst\().8h, \r2\().8b, \src2\().8b
> + \op2 \dst\().8h, \r3\().8b, \src3\().8b
> + ushr \r0\().2d, \r0\().2d, #8
> + mov \r0\().b[7], w12
> +.if \tail-1
> + lsr x12, x12, #8
> +.endif
> +.endm
> + calc umlsl, umlal, v20, v16, v17, v18, v19, v0, v1, v2, v3
> + calc umlsl, umlal, v21, v17, v18, v19, v16, v0, v1, v2, v3
> + calc umlsl, umlal, v22, v18, v19, v16, v17, v0, v1, v2, v3
> + calc umlsl, umlal, v23, v19, v16, v17, v18, v0, v1, v2, v3
> + calc umlal, umlsl, v20, v16, v17, v18, v19, v4, v5, v6, v7
> + calc umlal, umlsl, v21, v17, v18, v19, v16, v4, v5, v6, v7
> + calc umlal, umlsl, v22, v18, v19, v16, v17, v4, v5, v6, v7, 1
> +.purgem calc
> + umlal v23.8h, v19.8b, v4.8b
> + sqrshrun v20.8b, v20.8h, #6
> + umlsl v23.8h, v16.8b, v5.8b
> + sqrshrun v21.8b, v21.8h, #6
> + umlal v23.8h, v17.8b, v6.8b
> + sqrshrun v22.8b, v22.8h, #6
> + umlsl v23.8h, v18.8b, v7.8b
> + sqrshrun v23.8b, v23.8h, #6
> + st4 {v20.8b-v23.8b}, [x0], x1
> + add x2, x2, x3
> + subs w4, w4, #1
> + b.ne 1b
> + ret
> +endfunc
> +
> +function ff_hevc_put_hevc_qpel_uni_h48_8_neon, export=1
> + load_qpel_filterb x5, x6
> + sub x2, x2, #3
> +1: ld3 {v16.16b-v18.16b}, [x2]
> + movi v20.8h, #0
> + movi v21.8h, #0
> + movi v22.8h, #0
> + ldr x12, [x2, #24]
> + movi v23.8h, #0
> + movi v24.8h, #0
> + movi v25.8h, #0
> + ldr x13, [x2, #48]
> +.macro calc op1, op2, dst0, dst1, r0, r1, r2, src0, src1, src2
> + \op1 \dst0\().8h, \r0\().8b, \src0\().8b
> + \op2 \dst0\().8h, \r1\().8b, \src1\().8b
> + umlsl \dst0\().8h, \r2\().8b, \src2\().8b
> + \op1\()2 \dst1\().8h, \r0\().16b, \src0\().16b
> + \op2\()2 \dst1\().8h, \r1\().16b, \src1\().16b
> + umlsl2 \dst1\().8h, \r2\().16b, \src2\().16b
> + ushr \r0\().2d, \r0\().2d, #8
> + mov \r0\().b[7], w12
> + mov \r0\().b[15], w13
> + lsr x12, x12, #8
> + lsr x13, x13, #8
> +.endm
> + calc umlsl, umlal, v20, v23, v16, v17, v18, v0, v1, v2
> + calc umlsl, umlal, v21, v24, v17, v18, v16, v0, v1, v2
> + calc umlsl, umlal, v22, v25, v18, v16, v17, v0, v1, v2
> + calc umlal, umlal, v20, v23, v16, v17, v18, v3, v4, v5
> + calc umlal, umlal, v21, v24, v17, v18, v16, v3, v4, v5
> + calc umlal, umlal, v22, v25, v18, v16, v17, v3, v4, v5
> +.purgem calc
> +.macro calc r0, r1, r2, r3
> + umlal \r0\().8h, \r2\().8b, v6.8b
> + umlsl \r0\().8h, \r3\().8b, v7.8b
> + umlal2 \r1\().8h, \r2\().16b, v6.16b
> + umlsl2 \r1\().8h, \r3\().16b, v7.16b
> +.endm
> + calc v20, v23, v16, v17
> + ushr v16.2d, v16.2d, #8
> + mov v16.b[7], w12
> + mov v16.b[15], w13
> + calc v21, v24, v17, v18
> + calc v22, v25, v18, v16
> +.purgem calc
> + sqrshrun v20.8b, v20.8h, #6
> + sqrshrun v21.8b, v21.8h, #6
> + sqrshrun v22.8b, v22.8h, #6
> + sqrshrun2 v20.16b, v23.8h, #6
> + sqrshrun2 v21.16b, v24.8h, #6
> + sqrshrun2 v22.16b, v25.8h, #6
> + st3 {v20.16b-v22.16b}, [x0], x1
> + add x2, x2, x3
> + subs w4, w4, #1
> + b.ne 1b
> + ret
> +endfunc
> +
> +function ff_hevc_put_hevc_qpel_uni_h64_8_neon, export=1
> + load_qpel_filterb x5, x6
> + sub x2, x2, #3
> +1: ld4 {v16.16b-v19.16b}, [x2]
> + ldr x12, [x2, #32]
> + ldr x13, [x2, #64]
> + movi v20.8h, #0
> + movi v21.8h, #0
> + movi v22.8h, #0
> + movi v23.8h, #0
> + movi v24.8h, #0
> + movi v25.8h, #0
> + movi v26.8h, #0
> + movi v27.8h, #0
> +.macro calc op1, op2, dst0, dst1, r0, r1, r2, r3, src0, src1, src2, src3, tail=0
> + \op1 \dst0\().8h, \r0\().8b, \src0\().8b
> + \op2 \dst0\().8h, \r1\().8b, \src1\().8b
> + \op1 \dst0\().8h, \r2\().8b, \src2\().8b
> + \op2 \dst0\().8h, \r3\().8b, \src3\().8b
> + \op1\()2 \dst1\().8h, \r0\().16b, \src0\().16b
> + \op2\()2 \dst1\().8h, \r1\().16b, \src1\().16b
> + \op1\()2 \dst1\().8h, \r2\().16b, \src2\().16b
> + \op2\()2 \dst1\().8h, \r3\().16b, \src3\().16b
> +.if \tail-1
> + ushr \r0\().2d, \r0\().2d, #8
> + mov \r0\().b[7], w12
> + mov \r0\().b[15], w13
> + lsr x12, x12, #8
> + lsr x13, x13, #8
> +.endif
> +.endm
> + calc umlsl, umlal, v20, v24, v16, v17, v18, v19, v0, v1, v2, v3
> + calc umlsl, umlal, v21, v25, v17, v18, v19, v16, v0, v1, v2, v3
> + calc umlsl, umlal, v22, v26, v18, v19, v16, v17, v0, v1, v2, v3
> + calc umlsl, umlal, v23, v27, v19, v16, v17, v18, v0, v1, v2, v3
> + calc umlal, umlsl, v20, v24, v16, v17, v18, v19, v4, v5, v6, v7
> + calc umlal, umlsl, v21, v25, v17, v18, v19, v16, v4, v5, v6, v7
> + calc umlal, umlsl, v22, v26, v18, v19, v16, v17, v4, v5, v6, v7
> + calc umlal, umlsl, v23, v27, v19, v16, v17, v18, v4, v5, v6, v7, 1
> +.purgem calc
> + sqrshrun v20.8b, v20.8h, #6
> + sqrshrun v21.8b, v21.8h, #6
> + sqrshrun v22.8b, v22.8h, #6
> + sqrshrun v23.8b, v23.8h, #6
> + sqrshrun2 v20.16b, v24.8h, #6
> + sqrshrun2 v21.16b, v25.8h, #6
> + sqrshrun2 v22.16b, v26.8h, #6
> + sqrshrun2 v23.16b, v27.8h, #6
> + st4 {v20.16b-v23.16b}, [x0], x1
> + add x2, x2, x3
> + subs w4, w4, #1
> + b.ne 1b
> + ret
> +endfunc
> +
> +function ff_hevc_put_hevc_qpel_uni_v4_8_neon, export=1
> + load_qpel_filterb x6, x5
> + sub x2, x2, x3, lsl #1
> + sub x2, x2, x3
> + ld1 {v16.s}[0], [x2], x3
> + ld1 {v17.s}[0], [x2], x3
> + ld1 {v18.s}[0], [x2], x3
> + ld1 {v19.s}[0], [x2], x3
> + ld1 {v20.s}[0], [x2], x3
> + ld1 {v21.s}[0], [x2], x3
> + ld1 {v22.s}[0], [x2], x3
> +.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
> + ld1 {\tmp\().s}[0], [x2], x3
> + movi v24.8h, #0
> + calc_qpelb v24, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
> + sqrshrun v24.8b, v24.8h, #6
> + subs w4, w4, #1
> + st1 {v24.s}[0], [x0], x1
> +.endm
> +1: calc_all
> +.purgem calc
> +2: ret
> +endfunc
> +
> +function ff_hevc_put_hevc_qpel_uni_v6_8_neon, export=1
> + load_qpel_filterb x6, x5
> + sub x2, x2, x3, lsl #1
> + sub x1, x1, #4
> + sub x2, x2, x3
> + ld1 {v16.8b}, [x2], x3
> + ld1 {v17.8b}, [x2], x3
> + ld1 {v18.8b}, [x2], x3
> + ld1 {v19.8b}, [x2], x3
> + ld1 {v20.8b}, [x2], x3
> + ld1 {v21.8b}, [x2], x3
> + ld1 {v22.8b}, [x2], x3
> +.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
> + ld1 {\tmp\().8b}, [x2], x3
> + movi v24.8h, #0
> + calc_qpelb v24, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
> + sqrshrun v24.8b, v24.8h, #6
> + st1 {v24.s}[0], [x0], #4
> + subs w4, w4, #1
> + st1 {v24.h}[2], [x0], x1
> +.endm
> +1: calc_all
> +.purgem calc
> +2: ret
> +endfunc
> +
> +function ff_hevc_put_hevc_qpel_uni_v8_8_neon, export=1
> + load_qpel_filterb x6, x5
> + sub x2, x2, x3, lsl #1
> + sub x2, x2, x3
> + ld1 {v16.8b}, [x2], x3
> + ld1 {v17.8b}, [x2], x3
> + ld1 {v18.8b}, [x2], x3
> + ld1 {v19.8b}, [x2], x3
> + ld1 {v20.8b}, [x2], x3
> + ld1 {v21.8b}, [x2], x3
> + ld1 {v22.8b}, [x2], x3
> +.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
> + ld1 {\tmp\().8b}, [x2], x3
> + movi v24.8h, #0
> + calc_qpelb v24, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
> + sqrshrun v24.8b, v24.8h, #6
> + subs w4, w4, #1
> + st1 {v24.8b}, [x0], x1
> +.endm
> +1: calc_all
> +.purgem calc
> +2: ret
> +endfunc
> +
> +function ff_hevc_put_hevc_qpel_uni_v12_8_neon, export=1
> + load_qpel_filterb x6, x5
> + sub x2, x2, x3, lsl #1
> + sub x1, x1, #8
> + sub x2, x2, x3
> +0: mov x8, x2 // src
> + ld1 {v16.16b}, [x8], x3
> + mov w11, w4 // height
> + ld1 {v17.16b}, [x8], x3
> + mov x10, x0 // dst
> + ld1 {v18.16b}, [x8], x3
> + ld1 {v19.16b}, [x8], x3
> + ld1 {v20.16b}, [x8], x3
> + ld1 {v21.16b}, [x8], x3
> + ld1 {v22.16b}, [x8], x3
> +.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
> + ld1 {\tmp\().16b}, [x8], x3
> + movi v24.8h, #0
> + movi v25.8h, #0
> + calc_qpelb v24, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
> + calc_qpelb2 v25, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
> + sqrshrun v24.8b, v24.8h, #6
> + sqrshrun2 v24.16b, v25.8h, #6
> + st1 {v24.8b}, [x10], #8
> + subs x11, x11, #1
> + st1 {v24.s}[2], [x10], x1
> +.endm
> +1: calc_all
> +.purgem calc
> +2: add x0, x0, #12
> + add x2, x2, #12
> + subs w7, w7, #12
> + b.ne 0b
> + ret
> +endfunc
> +
> +function ff_hevc_put_hevc_qpel_uni_v16_8_neon, export=1
> + load_qpel_filterb x6, x5
> + sub x2, x2, x3, lsl #1
> + sub x2, x2, x3
> +0: mov x8, x2 // src
> + ld1 {v16.16b}, [x8], x3
> + mov w11, w4 // height
> + ld1 {v17.16b}, [x8], x3
> + mov x10, x0 // dst
> + ld1 {v18.16b}, [x8], x3
> + ld1 {v19.16b}, [x8], x3
> + ld1 {v20.16b}, [x8], x3
> + ld1 {v21.16b}, [x8], x3
> + ld1 {v22.16b}, [x8], x3
> +.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
> + ld1 {\tmp\().16b}, [x8], x3
> + movi v24.8h, #0
> + movi v25.8h, #0
> + calc_qpelb v24, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
> + calc_qpelb2 v25, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
> + sqrshrun v24.8b, v24.8h, #6
> + sqrshrun2 v24.16b, v25.8h, #6
> + subs x11, x11, #1
> + st1 {v24.16b}, [x10], x1
> +.endm
> +1: calc_all
> +.purgem calc
> +2: add x0, x0, #16
> + add x2, x2, #16
> + subs w7, w7, #16
> + b.ne 0b
> + ret
> +endfunc
> +
> +function ff_hevc_put_hevc_qpel_uni_v24_8_neon, export=1
> + b X(ff_hevc_put_hevc_qpel_uni_v12_8_neon)
Align the operand at the same column as in the rest of the file.
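I.e. something like this (exact column picked to match the surrounding code,
adjust as needed):

        b               X(ff_hevc_put_hevc_qpel_uni_v12_8_neon)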
> +endfunc
> +
> +function ff_hevc_put_hevc_qpel_uni_v32_8_neon, export=1
> + b X(ff_hevc_put_hevc_qpel_uni_v16_8_neon)
> +endfunc
> +
> +function ff_hevc_put_hevc_qpel_uni_v48_8_neon, export=1
> + b X(ff_hevc_put_hevc_qpel_uni_v16_8_neon)
> +endfunc
> +
> +function ff_hevc_put_hevc_qpel_uni_v64_8_neon, export=1
> + b X(ff_hevc_put_hevc_qpel_uni_v16_8_neon)
> +endfunc
> +
> +function ff_hevc_put_hevc_qpel_uni_hv4_8_neon, export=1
> + add w10, w4, #7
> + lsl x10, x10, #7
> + sub sp, sp, x10 // tmp_array
> + stp x0, x1, [sp, #-16]!
> + stp x4, x6, [sp, #-16]!
> + sub x1, x2, x3, lsl #1
> + stp x30, xzr, [sp, #-16]!
> + sub x1, x1, x3
> + add x0, sp, #48
> + mov x2, x3
> + add x3, x4, #7
> + mov x4, x5
> + bl X(ff_hevc_put_hevc_qpel_h4_8_neon)
> + ldp x30, xzr, [sp], #16
> + ldp x4, x6, [sp], #16
> + mov x9, #(MAX_PB_SIZE * 2)
> + ldp x0, x1, [sp], #16
> + load_qpel_filterh x6, x5
> + ld1 {v16.4h}, [sp], x9
> + ld1 {v17.4h}, [sp], x9
> + ld1 {v18.4h}, [sp], x9
> + ld1 {v19.4h}, [sp], x9
> + ld1 {v20.4h}, [sp], x9
> + ld1 {v21.4h}, [sp], x9
> + ld1 {v22.4h}, [sp], x9
> +.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
> + ld1 {\tmp\().4h}, [sp], x9
> + calc_qpelh v1, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqrshrn, #12
> + sqxtun v1.8b, v1.8h
> + subs w4, w4, #1
> + st1 {v1.s}[0], [x0], x1
> +.endm
> +1: calc_all
> +.purgem calc
> +2: ret
> +endfunc
> +
> +function ff_hevc_put_hevc_qpel_uni_hv6_8_neon, export=1
> + add w10, w4, #7
> + lsl x10, x10, #7
> + sub sp, sp, x10 // tmp_array
> + stp x0, x1, [sp, #-16]!
> + stp x4, x6, [sp, #-16]!
> + sub x1, x2, x3, lsl #1
> + stp x30, xzr, [sp, #-16]!
> + sub x1, x1, x3
> + add x0, sp, #48
> + mov x2, x3
> + add w3, w4, #7
> + mov x4, x5
> + bl X(ff_hevc_put_hevc_qpel_h6_8_neon)
> + ldp x30, xzr, [sp], #16
> + ldp x4, x6, [sp], #16
> + mov x9, #(MAX_PB_SIZE * 2)
> + ldp x0, x1, [sp], #16
> + load_qpel_filterh x6, x5
> + sub x1, x1, #4
> + ld1 {v16.8h}, [sp], x9
> + ld1 {v17.8h}, [sp], x9
> + ld1 {v18.8h}, [sp], x9
> + ld1 {v19.8h}, [sp], x9
> + ld1 {v20.8h}, [sp], x9
> + ld1 {v21.8h}, [sp], x9
> + ld1 {v22.8h}, [sp], x9
> +.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
> + ld1 {\tmp\().8h}, [sp], x9
> + calc_qpelh v1, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqrshrn, #12
> + calc_qpelh2 v1, v2, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqrshrn2, #12
> + sqxtun v1.8b, v1.8h
> + st1 {v1.s}[0], [x0], #4
> + subs w4, w4, #1
> + st1 {v1.h}[2], [x0], x1
> +.endm
> +1: calc_all
> +.purgem calc
> +2: ret
> +endfunc
> +
> +function ff_hevc_put_hevc_qpel_uni_hv8_8_neon, export=1
> + add w10, w4, #7
> + lsl x10, x10, #7
> + sub sp, sp, x10 // tmp_array
> + stp x0, x1, [sp, #-16]!
> + stp x4, x6, [sp, #-16]!
> + sub x1, x2, x3, lsl #1
> + stp x30, xzr, [sp, #-16]!
> + sub x1, x1, x3
> + add x0, sp, #48
> + mov x2, x3
> + add w3, w4, #7
> + mov x4, x5
> + bl X(ff_hevc_put_hevc_qpel_h8_8_neon)
> + ldp x30, xzr, [sp], #16
> + ldp x4, x6, [sp], #16
> + mov x9, #(MAX_PB_SIZE * 2)
> + ldp x0, x1, [sp], #16
> + load_qpel_filterh x6, x5
> + ld1 {v16.8h}, [sp], x9
> + ld1 {v17.8h}, [sp], x9
> + ld1 {v18.8h}, [sp], x9
> + ld1 {v19.8h}, [sp], x9
> + ld1 {v20.8h}, [sp], x9
> + ld1 {v21.8h}, [sp], x9
> + ld1 {v22.8h}, [sp], x9
> +.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
> + ld1 {\tmp\().8h}, [sp], x9
> + calc_qpelh v1, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqrshrn, #12
> + calc_qpelh2 v1, v2, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqrshrn2, #12
> + sqxtun v1.8b, v1.8h
> + subs w4, w4, #1
> + st1 {v1.8b}, [x0], x1
> +.endm
> +1: calc_all
> +.purgem calc
> +2: ret
> +endfunc
> +
> +function ff_hevc_put_hevc_qpel_uni_hv12_8_neon, export=1
> + add w10, w4, #7
> + lsl x10, x10, #7
> + sub sp, sp, x10 // tmp_array
> + stp x0, x1, [sp, #-16]!
> + stp x4, x6, [sp, #-16]!
> + sub x1, x2, x3, lsl #1
> + stp x7, x30, [sp, #-16]!
> + sub x1, x1, x3
> + mov x2, x3
> + add x0, sp, #48
> + add w3, w4, #7
> + mov x4, x5
> + bl X(ff_hevc_put_hevc_qpel_h12_8_neon)
> + ldp x7, x30, [sp], #16
> + ldp x4, x6, [sp], #16
> + mov x9, #(MAX_PB_SIZE * 2)
> + ldp x0, x1, [sp], #16
> + load_qpel_filterh x6, x5
> + sub x1, x1, #8
> + ld1 {v16.8h, v17.8h}, [sp], x9
> + ld1 {v18.8h, v19.8h}, [sp], x9
> + ld1 {v20.8h, v21.8h}, [sp], x9
> + ld1 {v22.8h, v23.8h}, [sp], x9
> + ld1 {v24.8h, v25.8h}, [sp], x9
> + ld1 {v26.8h, v27.8h}, [sp], x9
> + ld1 {v28.8h, v29.8h}, [sp], x9
> +.macro calc tmp0, tmp1, src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11, src12, src13, src14, src15
> + ld1 {\tmp0\().8h, \tmp1\().8h}, [sp], x9
> + calc_qpelh v1, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqrshrn, #12
> + calc_qpelh2 v1, v2, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqrshrn2, #12
> + calc_qpelh v2, \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15, sqrshrn, #12
> + sqxtun v1.8b, v1.8h
> + sqxtun2 v1.16b, v2.8h
> + st1 {v1.8b}, [x0], #8
> + subs w4, w4, #1
> + st1 {v1.s}[2], [x0], x1
> +.endm
> +1: calc_all2
> +.purgem calc
> +2: ret
> +endfunc
> +
> +function ff_hevc_put_hevc_qpel_uni_hv16_8_neon, export=1
> + add w10, w4, #7
> + lsl x10, x10, #7
> + sub sp, sp, x10 // tmp_array
> + stp x0, x1, [sp, #-16]!
> + stp x4, x6, [sp, #-16]!
> + stp x7, x30, [sp, #-16]!
> + add x0, sp, #48
> + sub x1, x2, x3, lsl #1
> + sub x1, x1, x3
> + mov x2, x3
> + add w3, w4, #7
> + mov x4, x5
> + bl X(ff_hevc_put_hevc_qpel_h16_8_neon)
> + ldp x7, x30, [sp], #16
> + ldp x4, x6, [sp], #16
> + ldp x0, x1, [sp], #16
> +.Lqpel_uni_hv16_loop:
> + mov x9, #(MAX_PB_SIZE * 2)
> + load_qpel_filterh x6, x5
> + sub w12, w9, w7, lsl #1
> +0: mov x8, sp // src
> + ld1 {v16.8h, v17.8h}, [x8], x9
> + mov w11, w4 // height
> + ld1 {v18.8h, v19.8h}, [x8], x9
> + mov x10, x0 // dst
> + ld1 {v20.8h, v21.8h}, [x8], x9
> + ld1 {v22.8h, v23.8h}, [x8], x9
> + ld1 {v24.8h, v25.8h}, [x8], x9
> + ld1 {v26.8h, v27.8h}, [x8], x9
> + ld1 {v28.8h, v29.8h}, [x8], x9
> +.macro calc tmp0, tmp1, src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11, src12, src13, src14, src15
> + ld1 {\tmp0\().8h, \tmp1\().8h}, [x8], x9
> + calc_qpelh v1, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqrshrn, #12
> + calc_qpelh2 v1, v2, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqrshrn2, #12
> + calc_qpelh v2, \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15, sqrshrn, #12
> + calc_qpelh2 v2, v3, \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15, sqrshrn2, #12
> + sqxtun v1.8b, v1.8h
> + subs x11, x11, #1
> + sqxtun2 v1.16b, v2.8h
> + st1 {v1.16b}, [x10], x1
> +.endm
> +1: calc_all2
> +.purgem calc
> +2: add x0, x0, #16
> + add sp, sp, #32
> + subs w7, w7, #16
> + b.ne 0b
> + add w10, w4, #6
> + add sp, sp, x12 // discard rest of first line
> + lsl x10, x10, #7
> + add sp, sp, x10 // tmp_array without first line
> + ret
> +endfunc
> +
> +function ff_hevc_put_hevc_qpel_uni_hv24_8_neon, export=1
> + stp x6, x30, [sp, #-16]!
> + mov x7, #16
> + stp x0, x1, [sp, #-16]!
> + stp x2, x3, [sp, #-16]!
> + stp x4, x5, [sp, #-16]!
> + bl X(ff_hevc_put_hevc_qpel_uni_hv16_8_neon)
> + ldp x4, x5, [sp], #16
> + ldp x2, x3, [sp], #16
> + add x2, x2, #16
> + ldp x0, x1, [sp], #16
> + mov x7, #8
> + add x0, x0, #16
> + ldr x6, [sp]
> + bl X(ff_hevc_put_hevc_qpel_uni_hv8_8_neon)
> + ldp xzr, x30, [sp], #16
> + ret
> +endfunc
> +
> +function ff_hevc_put_hevc_qpel_uni_hv32_8_neon, export=1
> + add w10, w4, #7
> + lsl x10, x10, #7
> + sub sp, sp, x10 // tmp_array
> + stp x0, x1, [sp, #-16]!
> + stp x4, x6, [sp, #-16]!
> + stp x7, x30, [sp, #-16]!
> + sub x1, x2, x3, lsl #1
> + add x0, sp, #48
> + sub x1, x1, x3
> + mov x2, x3
> + add w3, w4, #7
> + mov x4, x5
> + bl X(ff_hevc_put_hevc_qpel_h32_8_neon)
> + ldp x7, x30, [sp], #16
> + ldp x4, x6, [sp], #16
> + ldp x0, x1, [sp], #16
> + b .Lqpel_uni_hv16_loop
Align the operand here too.
> +endfunc
> +
> +function ff_hevc_put_hevc_qpel_uni_hv48_8_neon, export=1
> + add w10, w4, #7
> + lsl x10, x10, #7
> + sub sp, sp, x10 // tmp_array
> + stp x0, x1, [sp, #-16]!
> + stp x4, x6, [sp, #-16]!
> + sub x1, x2, x3, lsl #1
> + stp x7, x30, [sp, #-16]!
> + sub x1, x1, x3
> + mov x2, x3
> + add x0, sp, #48
> + add w3, w4, #7
> + mov x4, x5
> + bl X(ff_hevc_put_hevc_qpel_h48_8_neon)
> + ldp x7, x30, [sp], #16
> + ldp x4, x6, [sp], #16
> + ldp x0, x1, [sp], #16
> + b .Lqpel_uni_hv16_loop
> +endfunc
> +
> +function ff_hevc_put_hevc_qpel_uni_hv64_8_neon, export=1
> + add w10, w4, #7
> + lsl x10, x10, #7
> + sub sp, sp, x10 // tmp_array
> + stp x0, x1, [sp, #-16]!
> + stp x4, x6, [sp, #-16]!
> + stp x7, x30, [sp, #-16]!
> + add x0, sp, #48
> + sub x1, x2, x3, lsl #1
> + mov x2, x3
> + sub x1, x1, x3
> + add w3, w4, #7
> + mov x4, x5
> + bl X(ff_hevc_put_hevc_qpel_h64_8_neon)
> + ldp x7, x30, [sp], #16
> + ldp x4, x6, [sp], #16
> + ldp x0, x1, [sp], #16
> + b .Lqpel_uni_hv16_loop
> +endfunc
> +
> +function ff_hevc_put_hevc_qpel_bi_h4_8_neon, export=1
> + load_qpel_filterb x6, x7
> + sub x2, x2, #3
> + mov x10, #(MAX_PB_SIZE * 2)
> +1: ld1 {v16.8b, v17.8b}, [x2], x3
> + movi v20.8h, #0
> +.macro calc op, idx
> + \op v20.8h, v16.8b, v\idx\().8b
> + ushr v16.2d, v16.2d, #8
> + mov v16.b[7], v17.b[\idx]
> +.endm
> + calc umlsl, 0
> + calc umlal, 1
> + calc umlsl, 2
> + calc umlal, 3
> + calc umlal, 4
> + calc umlsl, 5
> + calc umlal, 6
> + umlsl v20.8h, v16.8b, v7.8b
> + ld1 {v24.8h}, [x4], x10
> + sqadd v16.8h, v20.8h, v24.8h
> + sqrshrun v16.8b, v16.8h, #7
> + subs w5, w5, #1
> + st1 {v16.s}[0], [x0], x1
> + b.ne 1b
> + ret
> +endfunc
> +
> +function ff_hevc_put_hevc_qpel_bi_h6_8_neon, export=1
> + load_qpel_filterb x6, x7
> + sub x2, x2, #3
> + sub x1, x1, #4
> + mov x10, #(MAX_PB_SIZE * 2)
> +1: ld1 {v16.8b, v17.8b}, [x2], x3
> + movi v20.8h, #0
> + // same macro
> + calc umlsl, 0
> + calc umlal, 1
> + calc umlsl, 2
> + calc umlal, 3
> + calc umlal, 4
> + calc umlsl, 5
> + calc umlal, 6
> + umlsl v20.8h, v16.8b, v7.8b
> + ld1 {v24.8h}, [x4], x10
> + sqadd v16.8h, v20.8h, v24.8h
> + sqrshrun v16.8b, v16.8h, #7
> + st1 {v16.s}[0], [x0], #4
> + subs w5, w5, #1
> + st1 {v16.h}[2], [x0], x1
> + b.ne 1b
> + ret
> +endfunc
> +
> +function ff_hevc_put_hevc_qpel_bi_h8_8_neon, export=1
> + load_qpel_filterb x6, x7
> + sub x2, x2, #3
> + mov x10, #(MAX_PB_SIZE * 2)
> +1: ld1 {v16.8b, v17.8b}, [x2], x3
> + movi v20.8h, #0
> + // same macro
> + calc umlsl, 0
> + calc umlal, 1
> + calc umlsl, 2
> + calc umlal, 3
> + calc umlal, 4
> + calc umlsl, 5
> + calc umlal, 6
> + umlsl v20.8h, v16.8b, v7.8b
> +.purgem calc
> + ld1 {v24.8h}, [x4], x10
> + sqadd v16.8h, v20.8h, v24.8h
> + sqrshrun v16.8b, v16.8h, #7
> + subs w5, w5, #1
> + st1 {v16.8b}, [x0], x1
> + b.ne 1b
> + ret
> +endfunc
> +
> +function ff_hevc_put_hevc_qpel_bi_h12_8_neon, export=1
> + load_qpel_filterb x6, x7
> + sub x2, x2, #3
> + sub x1, x1, #8
> + mov x10, #(MAX_PB_SIZE * 2)
> +1: ld2 {v16.8b, v17.8b}, [x2]
> + movi v20.8h, #0
> + ldr w12, [x2, #16]
> + movi v21.8h, #0
> +.macro calc op1, op2, r0, r1, r2, src0, src1, tail=0
> + \op1 \r0\().8h, \r1\().8b, \src0\().8b
> + \op2 \r0\().8h, \r2\().8b, \src1\().8b
> +.if \tail-1
Wouldn't '.if \tail == 0' be less obscure?
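I.e. something along these lines - equivalent as long as \tail is only ever
0 or 1, just spelled out:

.if \tail == 0
        ushr            \r1\().2d, \r1\().2d, #8
.endif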
> + ushr \r1\().2d, \r1\().2d, #8
> +.endif
> +.endm
> + calc umlsl, umlal, v20, v16, v17, v0, v1
> + mov v16.b[7], w12
Moving data via GPRs like this is quite inelegant and slow. Can't this be
done with proper vector instructions (ext)?
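For an 8-wide horizontal filter, the shifted source views can be built with
ext from a single 16-byte load instead of bouncing bytes through w12. A rough,
untested sketch of the idea (on a plain ld1 load rather than the ld2 used
here; register numbers are placeholders, v0-v3 are the duplicated filter taps
from load_qpel_filterb, and only the low 8 lanes of each ext result are used):

        ld1             {v16.16b}, [x2], x3              // 16 bytes cover 8 outputs plus 7 bytes of history
        movi            v20.8h, #0
        ext             v17.16b, v16.16b, v16.16b, #1    // src + 1
        ext             v18.16b, v16.16b, v16.16b, #2    // src + 2
        ext             v19.16b, v16.16b, v16.16b, #3    // src + 3
        umlsl           v20.8h, v16.8b, v0.8b
        umlal           v20.8h, v17.8b, v1.8b
        umlsl           v20.8h, v18.8b, v2.8b
        umlal           v20.8h, v19.8b, v3.8b            // taps 4-7 continue with ext #4..#7 and v4-v7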
> + lsr x12, x12, #8
> + calc umlsl, umlal, v21, v17, v16, v0, v1
> + mov v17.b[7], w12
> + lsr x12, x12, #8
> + calc umlsl, umlal, v20, v16, v17, v2, v3
> + mov v16.b[7], w12
> + calc umlsl, umlal, v21, v17, v16, v2, v3
> + calc umlal, umlsl, v20, v16, v17, v4, v5
> + calc umlal, umlsl, v21, v17, v16, v4, v5
> + calc umlal, umlsl, v20, v16, v17, v6, v7
> + calc umlal, umlsl, v21, v17, v16, v6, v7, 1
> +.purgem calc
> + ld2 {v24.8h, v25.8h}, [x4], x10
> + sqadd v16.8h, v20.8h, v24.8h
> + sqadd v17.8h, v21.8h, v25.8h
> + sqrshrun v16.8b, v16.8h, #7
> + sqrshrun v17.8b, v17.8h, #7
> + zip1 v16.16b, v16.16b, v17.16b
> + st1 {v16.8b}, [x0], #8
> + subs w5, w5, #1
> + st1 {v16.s}[2], [x0], x1
> + add x2, x2, x3
> + b.ne 1b
> + ret
> +endfunc
> +
> +function ff_hevc_put_hevc_qpel_bi_h16_8_neon, export=1
> + load_qpel_filterb x6, x7
> + sub x2, x2, #3
> + mov x10, #(MAX_PB_SIZE * 2)
> +1: ld2 {v16.8b, v17.8b}, [x2]
> + movi v20.8h, #0
> + ldr x12, [x2, #16]
> + movi v21.8h, #0
> +.macro calc op1, op2, r0, r1, r2, src0, src1, tail=0
> + \op1 \r0\().8h, \r1\().8b, \src0\().8b
> + \op2 \r0\().8h, \r2\().8b, \src1\().8b
> + ushr \r1\().2d, \r1\().2d, #8
> + mov \r1\().b[7], w12
Same here: please use proper vector shifting via ext; don't mix data through
GPRs when doing SIMD.

The same comments apply to most of the remaining code too.

And overall, please unroll less. In particular, try to get rid of every case
of calc_all. At most, unroll the calculation once or twice, but then shift
the registers in between.
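For the plain vertical 8-wide case that could look roughly like this
(untested sketch, reusing the patch's own calc_qpelb and the same v16-v22
prologue as qpel_uni_v8; the body could also be duplicated once or twice
before the register shuffle):

1:      ld1             {v23.8b}, [x2], x3
        movi            v24.8h, #0
        calc_qpelb      v24, v16, v17, v18, v19, v20, v21, v22, v23
        sqrshrun        v24.8b, v24.8h, #6
        st1             {v24.8b}, [x0], x1
        // slide the 8-row window down by one instead of unrolling 8x
        mov             v16.16b, v17.16b
        mov             v17.16b, v18.16b
        mov             v18.16b, v19.16b
        mov             v19.16b, v20.16b
        mov             v20.16b, v21.16b
        mov             v21.16b, v22.16b
        mov             v22.16b, v23.16b
        subs            w4, w4, #1
        b.ne            1b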
// Martin