From e01cd973488aa4d65e09a85b53ea0639477fc76e Mon Sep 17 00:00:00 2001
From: Logan Lyu
Date: Fri, 5 May 2023 22:06:22 +0800
Subject: [PATCH 2/3] lavc/aarch64: new optimization for 8-bit hevc_qpel_uni_w_h

---
 libavcodec/aarch64/hevcdsp_init_aarch64.c |  15 +-
 libavcodec/aarch64/hevcdsp_qpel_neon.S    | 434 ++++++++++++++++++++++
 2 files changed, 448 insertions(+), 1 deletion(-)

diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c
index 6b5341dd45..a7e62c7d15 100644
--- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -145,6 +145,7 @@ void ff_hevc_put_hevc_qpel_bi_h16_8_neon(uint8_t *_dst, ptrdiff_t _dststride, co
     void ff_hevc_put_hevc_##fn##16_8_neon##ext args; \
     void ff_hevc_put_hevc_##fn##64_8_neon##ext args; \
 
+
 NEON8_FNPROTO(pel_uni_w_pixels, (uint8_t *_dst, ptrdiff_t _dststride,
         const uint8_t *_src, ptrdiff_t _srcstride,
         int height, int denom, int wx, int ox,
@@ -155,6 +156,12 @@ NEON8_FNPROTO_PARTIAL_4(qpel_uni_w_v, (uint8_t *_dst, ptrdiff_t _dststride,
         int height, int denom, int wx, int ox,
         intptr_t mx, intptr_t my, int width),);
 
+NEON8_FNPROTO(qpel_uni_w_h, (uint8_t *_dst, ptrdiff_t _dststride,
+        const uint8_t *_src, ptrdiff_t _srcstride,
+        int height, int denom, int wx, int ox,
+        intptr_t mx, intptr_t my, int width), _i8mm); // declares the ff_hevc_put_hevc_qpel_uni_w_h<N>_8_neon_i8mm kernels
+
+
 #define NEON8_FNASSIGN(member, v, h, fn, ext) \
     member[1][v][h] = ff_hevc_put_hevc_##fn##4_8_neon##ext; \
     member[2][v][h] = ff_hevc_put_hevc_##fn##6_8_neon##ext; \
@@ -174,9 +181,11 @@ NEON8_FNPROTO_PARTIAL_4(qpel_uni_w_v, (uint8_t *_dst, ptrdiff_t _dststride,
     member[8][v][h] = ff_hevc_put_hevc_##fn##64_8_neon##ext; \
     member[9][v][h] = ff_hevc_put_hevc_##fn##64_8_neon##ext;
 
+
 av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
 {
-    if (!have_neon(av_get_cpu_flags())) return;
+    int cpu_flags = av_get_cpu_flags(); // queried once; reused by the NEON check here and the i8mm check below
+    if (!have_neon(cpu_flags)) return;
 
     if (bit_depth == 8) {
         c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_8_neon;
@@ -236,6 +245,10 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
         NEON8_FNASSIGN(c->put_hevc_qpel_uni_w, 0, 0, pel_uni_w_pixels,);
         NEON8_FNASSIGN_PARTIAL_4(c->put_hevc_qpel_uni_w, 1, 0, qpel_uni_w_v,);
 
+        if (have_i8mm(cpu_flags)) { // the usdot-based kernels are installed only when the i8mm check passes
+            NEON8_FNASSIGN(c->put_hevc_qpel_uni_w, 0, 1, qpel_uni_w_h, _i8mm); // fills the [v=0][h=1] slots
+        }
+
     }
     if (bit_depth == 10) {
         c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_10_neon;
diff --git a/libavcodec/aarch64/hevcdsp_qpel_neon.S b/libavcodec/aarch64/hevcdsp_qpel_neon.S
index 51df52e1ea..8e8b88c9ea 100644
--- a/libavcodec/aarch64/hevcdsp_qpel_neon.S
+++ b/libavcodec/aarch64/hevcdsp_qpel_neon.S
@@ -1192,3 +1192,437 @@ function ff_hevc_put_hevc_qpel_uni_w_v64_8_neon, export=1
         b.hi            3b
         ret
 endfunc
+
+#if HAVE_I8MM
+.macro QPEL_UNI_W_H_HEADER
+        ldr             x12, [sp]                   // common prologue; x12 = first stack-passed argument (mx in the C prototype, assuming AAPCS64)
+        sub             x2, x2, #3                  // move the source pointer 3 bytes to the left
+        movrel          x9, qpel_filters
+        add             x9, x9, x12, lsl #3         // x9 = qpel_filters + x12 * 8
+        ldr             x11, [x9]                   // 8 bytes of filter coefficients
+        dup             v28.2d, x11                 // same 8 coefficients in both 64-bit halves of v28
+        mov             w10, #-6
+        sub             w10, w10, w5                // w10 = -6 - w5 (w5 is denom in the C prototype)
+        dup             v30.4s, w6                  // wx
+        dup             v31.4s, w10                 // shift (negative count: sqrshl then shifts right with rounding)
+        dup             v29.4s, w7                  // ox
+.endm
+
+function ff_hevc_put_hevc_qpel_uni_w_h4_8_neon_i8mm, export=1
+        QPEL_UNI_W_H_HEADER                         // 4 output pixels per row; sets up v28-v31
+1:
+        ld1             {v0.16b}, [x2], x3          // 16 source bytes, then advance by the source stride (x3)
+        ext             v1.16b, v0.16b, v0.16b, #1  // source window shifted by 1 byte
+        ext             v2.16b, v0.16b, v0.16b, #2  // ... by 2 bytes
+        ext             v3.16b, v0.16b, v0.16b, #3  // ... by 3 bytes
+        zip1            v0.2d, v0.2d, v1.2d         // low 8 bytes of windows 0 and 1
+        zip1            v2.2d, v2.2d, v3.2d         // low 8 bytes of windows 2 and 3
+        movi            v16.2d, #0                  // usdot accumulates, so clear the destinations first
+        movi            v17.2d, #0
+        usdot           v16.4s, v0.16b, v28.16b     // two 4-byte partial dot products per window
+        usdot           v17.4s, v2.16b, v28.16b
+        addp            v16.4s, v16.4s, v17.4s      // add the partial pairs: one 32-bit sum per pixel, order 0,1,2,3
+        mul             v16.4s, v16.4s, v30.4s      // * wx
+        sqrshl          v16.4s, v16.4s, v31.4s      // saturating rounding shift by v31
+        sqadd           v16.4s, v16.4s, v29.4s      // + ox (saturating)
+        sqxtn           v16.4h, v16.4s              // saturating narrow to 16 bits
+        sqxtun          v16.8b, v16.8h              // saturating narrow to unsigned 8 bits
+        str             s16, [x0]                   // store 4 output bytes
+        add             x0, x0, x1                  // advance dst by the destination stride (x1)
+        subs            w4, w4, #1                  // one row done (w4 is height in the C prototype)
+        b.hi            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_w_h6_8_neon_i8mm, export=1
+        QPEL_UNI_W_H_HEADER                         // 6 output pixels per row
+        sub             x1, x1, #4                  // the first store below post-increments x0 by 4
+1:
+        ld1             {v0.16b}, [x2], x3
+        ext             v1.16b, v0.16b, v0.16b, #1
+        ext             v2.16b, v0.16b, v0.16b, #2
+        ext             v3.16b, v0.16b, v0.16b, #3
+        ext             v4.16b, v0.16b, v0.16b, #4
+        ext             v5.16b, v0.16b, v0.16b, #5
+        zip1            v0.2d, v0.2d, v1.2d         // windows 0 and 1
+        zip1            v2.2d, v2.2d, v3.2d         // windows 2 and 3
+        zip1            v4.2d, v4.2d, v5.2d         // windows 4 and 5
+        movi            v16.2d, #0
+        movi            v17.2d, #0
+        movi            v18.2d, #0
+        usdot           v16.4s, v0.16b, v28.16b
+        usdot           v17.4s, v2.16b, v28.16b
+        usdot           v18.4s, v4.16b, v28.16b
+        addp            v16.4s, v16.4s, v17.4s      // sums for pixels 0-3
+        addp            v18.4s, v18.4s, v18.4s      // sums for pixels 4-5 in the low two lanes
+        mul             v16.4s, v16.4s, v30.4s
+        mul             v18.2s, v18.2s, v30.2s
+        sqrshl          v16.4s, v16.4s, v31.4s
+        sqrshl          v18.2s, v18.2s, v31.2s
+        sqadd           v16.4s, v16.4s, v29.4s
+        sqadd           v18.2s, v18.2s, v29.2s
+        sqxtn           v16.4h, v16.4s
+        sqxtn2          v16.8h, v18.4s              // pixels 4-5 land in halfword lanes 4-5
+        sqxtun          v16.8b, v16.8h
+        str             s16, [x0], #4               // bytes 0-3
+        st1             {v16.h}[2], [x0], x1        // bytes 4-5, then step to the next row
+        subs            w4, w4, #1
+        b.hi            1b
+        ret
+endfunc
+
+
+.macro QPEL_UNI_W_H_CALC s0, s1, s2, s3, d0, d1, d2, d3
+        movi            \d0\().2d, #0               // filter + weight four source vectors; results end up in d0 and d2
+        movi            \d1\().2d, #0
+        movi            \d2\().2d, #0
+        movi            \d3\().2d, #0
+        usdot           \d0\().4s, \s0\().16b, v28.16b
+        usdot           \d1\().4s, \s1\().16b, v28.16b
+        usdot           \d2\().4s, \s2\().16b, v28.16b
+        usdot           \d3\().4s, \s3\().16b, v28.16b
+        addp            \d0\().4s, \d0\().4s, \d1\().4s // d0 = pairwise sums of d0 then d1 (d1 is scratch)
+        addp            \d2\().4s, \d2\().4s, \d3\().4s // d2 = pairwise sums of d2 then d3 (d3 is scratch)
+        mul             \d0\().4s, \d0\().4s, v30.4s    // * wx
+        mul             \d2\().4s, \d2\().4s, v30.4s
+        sqrshl          \d0\().4s, \d0\().4s, v31.4s    // saturating rounding shift by v31
+        sqrshl          \d2\().4s, \d2\().4s, v31.4s
+        sqadd           \d0\().4s, \d0\().4s, v29.4s    // + ox
+        sqadd           \d2\().4s, \d2\().4s, v29.4s
+.endm
+
+.macro QPEL_UNI_W_H_CALC_HALF s0, s1, d0, d1
+        movi            \d0\().2d, #0               // two-source variant of QPEL_UNI_W_H_CALC; result in d0, d1 is scratch
+        movi            \d1\().2d, #0
+        usdot           \d0\().4s, \s0\().16b, v28.16b
+        usdot           \d1\().4s, \s1\().16b, v28.16b
+        addp            \d0\().4s, \d0\().4s, \d1\().4s
+        mul             \d0\().4s, \d0\().4s, v30.4s
+        sqrshl          \d0\().4s, \d0\().4s, v31.4s
+        sqadd           \d0\().4s, \d0\().4s, v29.4s
+.endm
+
+
+function ff_hevc_put_hevc_qpel_uni_w_h8_8_neon_i8mm, export=1
+        QPEL_UNI_W_H_HEADER                         // 8 output pixels per row
+1:
+        ld1             {v16.16b, v17.16b}, [x2], x3 // 32 source bytes, then advance by the source stride
+        ext             v1.16b, v16.16b, v17.16b, #1
+        ext             v2.16b, v16.16b, v17.16b, #2
+        ext             v3.16b, v16.16b, v17.16b, #3
+        ext             v4.16b, v16.16b, v17.16b, #4
+        ext             v5.16b, v16.16b, v17.16b, #5
+        ext             v6.16b, v16.16b, v17.16b, #6
+        ext             v7.16b, v16.16b, v17.16b, #7
+        zip1            v0.2d, v16.2d, v1.2d        // windows 0 and 1
+        zip1            v2.2d, v2.2d, v3.2d         // windows 2 and 3
+        zip1            v4.2d, v4.2d, v5.2d         // windows 4 and 5
+        zip1            v6.2d, v6.2d, v7.2d         // windows 6 and 7
+        QPEL_UNI_W_H_CALC v0, v2, v4, v6, v18, v19, v20, v21 // v18: pixels 0-3, v20: pixels 4-7
+        sqxtn           v18.4h, v18.4s
+        sqxtn2          v18.8h, v20.4s
+        sqxtun          v18.8b, v18.8h
+        str             d18, [x0]                   // store 8 output bytes
+        add             x0, x0, x1
+        subs            w4, w4, #1
+        b.hi            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_w_h12_8_neon_i8mm, export=1
+        QPEL_UNI_W_H_HEADER                         // 12 output pixels per row
+        add             x13, x0, #8                 // second store pointer: dst + 8, used for pixels 8-11
+1:
+        ld1             {v16.16b, v17.16b}, [x2], x3
+        ext             v1.16b, v16.16b, v17.16b, #1
+        ext             v2.16b, v16.16b, v17.16b, #2
+        ext             v3.16b, v16.16b, v17.16b, #3
+        ext             v4.16b, v16.16b, v17.16b, #4
+        ext             v5.16b, v16.16b, v17.16b, #5
+        ext             v6.16b, v16.16b, v17.16b, #6
+        ext             v7.16b, v16.16b, v17.16b, #7
+        zip1            v18.2d, v16.2d, v1.2d       // windows 0 and 1
+        zip1            v19.2d, v2.2d, v3.2d        // windows 2 and 3
+        zip1            v20.2d, v4.2d, v5.2d        // windows 4 and 5
+        zip1            v21.2d, v6.2d, v7.2d        // windows 6 and 7
+        zip2            v22.2d, v16.2d, v1.2d       // high halves: windows 8 and 9
+        zip2            v23.2d, v2.2d, v3.2d        // high halves: windows 10 and 11
+        QPEL_UNI_W_H_CALC v18, v19, v20, v21, v0, v2, v4, v6 // v0: pixels 0-3, v4: pixels 4-7
+        QPEL_UNI_W_H_CALC_HALF v22, v23, v24, v25   // v24: pixels 8-11
+        sqxtn           v0.4h, v0.4s
+        sqxtn2          v0.8h, v4.4s
+        sqxtn           v1.4h, v24.4s
+        sqxtun          v0.8b, v0.8h
+        sqxtun          v1.8b, v1.8h
+
+        str             d0, [x0]                    // bytes 0-7
+        str             s1, [x13]                   // bytes 8-11
+        add             x0, x0, x1
+        add             x13, x13, x1
+        subs            w4, w4, #1
+        b.hi            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_w_h16_8_neon_i8mm, export=1
+        QPEL_UNI_W_H_HEADER                         // 16 output pixels per row
+1:
+        ld1             {v16.16b, v17.16b}, [x2], x3
+        ext             v1.16b, v16.16b, v17.16b, #1
+        ext             v2.16b, v16.16b, v17.16b, #2
+        ext             v3.16b, v16.16b, v17.16b, #3
+        ext             v4.16b, v16.16b, v17.16b, #4
+        ext             v5.16b, v16.16b, v17.16b, #5
+        ext             v6.16b, v16.16b, v17.16b, #6
+        ext             v7.16b, v16.16b, v17.16b, #7
+        QPEL_UNI_W_H_CALC v16, v2, v1, v3, v18, v19, v20, v21 // pixel order: v18 = 0, 8, 2, 10; v20 = 1, 9, 3, 11
+        QPEL_UNI_W_H_CALC v4, v6, v5, v7, v22, v23, v24, v25 // pixel order: v22 = 4, 12, 6, 14; v24 = 5, 13, 7, 15
+        sqxtn           v0.4h, v18.4s
+        sqxtn2          v0.8h, v22.4s               // v0: pixels 0, 8, 2, 10, 4, 12, 6, 14
+        sqxtn           v1.4h, v20.4s
+        sqxtn2          v1.8h, v24.4s               // v1: pixels 1, 9, 3, 11, 5, 13, 7, 15
+        trn1            v2.8h, v0.8h, v1.8h         // even lanes interleaved: pixels 0-7 in order
+        trn2            v3.8h, v0.8h, v1.8h         // odd lanes interleaved: pixels 8-15 in order
+        sqxtun          v0.8b, v2.8h
+        sqxtun2         v0.16b, v3.8h
+        st1             {v0.16b}, [x0], x1          // store 16 bytes, then advance by the destination stride
+        subs            w4, w4, #1
+        b.hi            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_w_h24_8_neon_i8mm, export=1
+        QPEL_UNI_W_H_HEADER                         // 24 output pixels per row
+        sub             x1, x1, #16                 // the first store below post-increments x0 by 16
+1:
+        ld1             {v16.16b, v17.16b}, [x2], x3
+        ext             v1.16b, v16.16b, v17.16b, #1
+        ext             v2.16b, v16.16b, v17.16b, #2
+        ext             v3.16b, v16.16b, v17.16b, #3
+        ext             v4.16b, v16.16b, v17.16b, #4
+        ext             v5.16b, v16.16b, v17.16b, #5
+        ext             v6.16b, v16.16b, v17.16b, #6
+        ext             v7.16b, v16.16b, v17.16b, #7
+        QPEL_UNI_W_H_CALC v16, v2, v1, v3, v18, v19, v20, v21
+        QPEL_UNI_W_H_CALC v4, v6, v5, v7, v22, v23, v24, v25
+        sqxtn           v18.4h, v18.4s
+        sqxtn2          v18.8h, v22.4s
+        sqxtn           v19.4h, v20.4s
+        sqxtn2          v19.8h, v24.4s
+        trn1            v20.8h, v18.8h, v19.8h      // restore pixel order 0-7
+        trn2            v21.8h, v18.8h, v19.8h      // restore pixel order 8-15
+        sqxtun          v26.8b, v20.8h
+        sqxtun2         v26.16b, v21.8h             // 0-15
+        ext             v1.16b, v17.16b, v17.16b, #1 // pixels 16-23 use only v17; just the low 8 bytes of each window are kept below
+        ext             v2.16b, v17.16b, v17.16b, #2
+        ext             v3.16b, v17.16b, v17.16b, #3
+        ext             v4.16b, v17.16b, v17.16b, #4
+        ext             v5.16b, v17.16b, v17.16b, #5
+        ext             v6.16b, v17.16b, v17.16b, #6
+        ext             v7.16b, v17.16b, v17.16b, #7
+        zip1            v0.2d, v17.2d, v1.2d
+        zip1            v2.2d, v2.2d, v3.2d
+        zip1            v4.2d, v4.2d, v5.2d
+        zip1            v6.2d, v6.2d, v7.2d
+        QPEL_UNI_W_H_CALC v0, v2, v4, v6, v18, v19, v20, v21 // v18: pixels 16-19, v20: pixels 20-23
+        sqxtn           v18.4h, v18.4s
+        sqxtn2          v18.8h, v20.4s
+        sqxtun          v27.8b, v18.8h              // 16-23
+
+        st1             {v26.16b}, [x0], #16        // bytes 0-15
+        st1             {v27.8b}, [x0], x1          // bytes 16-23, then step to the next row
+        subs            w4, w4, #1
+        b.hi            1b
+        ret
+endfunc
+
+
+function ff_hevc_put_hevc_qpel_uni_w_h32_8_neon_i8mm, export=1
+        QPEL_UNI_W_H_HEADER                         // 32 output pixels per row
+1:
+        ld1             {v16.16b, v17.16b, v18.16b}, [x2], x3 // 48 source bytes, then advance by the source stride
+        ext             v1.16b, v16.16b, v17.16b, #1
+        ext             v2.16b, v16.16b, v17.16b, #2
+        ext             v3.16b, v16.16b, v17.16b, #3
+        ext             v4.16b, v16.16b, v17.16b, #4
+        ext             v5.16b, v16.16b, v17.16b, #5
+        ext             v6.16b, v16.16b, v17.16b, #6
+        ext             v7.16b, v16.16b, v17.16b, #7
+        QPEL_UNI_W_H_CALC v16, v2, v1, v3, v0, v19, v20, v21
+        QPEL_UNI_W_H_CALC v4, v6, v5, v7, v22, v23, v24, v25
+        sqxtn           v0.4h, v0.4s
+        sqxtn2          v0.8h, v22.4s
+        sqxtn           v19.4h, v20.4s
+        sqxtn2          v19.8h, v24.4s
+        trn1            v20.8h, v0.8h, v19.8h
+        trn2            v21.8h, v0.8h, v19.8h
+        sqxtun          v26.8b, v20.8h
+        sqxtun2         v26.16b, v21.8h             // 0-15
+        ext             v1.16b, v17.16b, v18.16b, #1
+        ext             v2.16b, v17.16b, v18.16b, #2
+        ext             v3.16b, v17.16b, v18.16b, #3
+        ext             v4.16b, v17.16b, v18.16b, #4
+        ext             v5.16b, v17.16b, v18.16b, #5
+        ext             v6.16b, v17.16b, v18.16b, #6
+        ext             v7.16b, v17.16b, v18.16b, #7
+        QPEL_UNI_W_H_CALC v17, v2, v1, v3, v0, v19, v20, v21
+        QPEL_UNI_W_H_CALC v4, v6, v5, v7, v22, v23, v24, v25
+        sqxtn           v0.4h, v0.4s
+        sqxtn2          v0.8h, v22.4s
+        sqxtn           v19.4h, v20.4s
+        sqxtn2          v19.8h, v24.4s
+        trn1            v20.8h, v0.8h, v19.8h
+        trn2            v21.8h, v0.8h, v19.8h
+        sqxtun          v27.8b, v20.8h
+        sqxtun2         v27.16b, v21.8h             // 16-31
+        st1             {v26.16b, v27.16b}, [x0], x1 // store 32 bytes, then advance by the destination stride
+        subs            w4, w4, #1
+        b.hi            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_w_h48_8_neon_i8mm, export=1
+        QPEL_UNI_W_H_HEADER                         // 48 output pixels per row
+1:
+        ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x2], x3 // 64 source bytes, then advance by the source stride
+        ext             v1.16b, v16.16b, v17.16b, #1
+        ext             v2.16b, v16.16b, v17.16b, #2
+        ext             v3.16b, v16.16b, v17.16b, #3
+        ext             v4.16b, v16.16b, v17.16b, #4
+        ext             v5.16b, v16.16b, v17.16b, #5
+        ext             v6.16b, v16.16b, v17.16b, #6
+        ext             v7.16b, v16.16b, v17.16b, #7
+        QPEL_UNI_W_H_CALC v16, v2, v1, v3, v20, v24, v21, v0 // results in v20/v21; v24 and v0 are scratch
+        QPEL_UNI_W_H_CALC v4, v6, v5, v7, v22, v24, v23, v0 // results in v22/v23
+        sqxtn           v20.4h, v20.4s
+        sqxtn2          v20.8h, v22.4s
+        sqxtn           v21.4h, v21.4s
+        sqxtn2          v21.8h, v23.4s
+        trn1            v22.8h, v20.8h, v21.8h
+        trn2            v23.8h, v20.8h, v21.8h
+        sqxtun          v25.8b, v22.8h
+        sqxtun2         v25.16b, v23.8h             // 0-15
+        ext             v1.16b, v17.16b, v18.16b, #1
+        ext             v2.16b, v17.16b, v18.16b, #2
+        ext             v3.16b, v17.16b, v18.16b, #3
+        ext             v4.16b, v17.16b, v18.16b, #4
+        ext             v5.16b, v17.16b, v18.16b, #5
+        ext             v6.16b, v17.16b, v18.16b, #6
+        ext             v7.16b, v17.16b, v18.16b, #7
+        QPEL_UNI_W_H_CALC v17, v2, v1, v3, v20, v24, v21, v0
+        QPEL_UNI_W_H_CALC v4, v6, v5, v7, v22, v24, v23, v0
+        sqxtn           v20.4h, v20.4s
+        sqxtn2          v20.8h, v22.4s
+        sqxtn           v21.4h, v21.4s
+        sqxtn2          v21.8h, v23.4s
+        trn1            v22.8h, v20.8h, v21.8h
+        trn2            v23.8h, v20.8h, v21.8h
+        sqxtun          v26.8b, v22.8h
+        sqxtun2         v26.16b, v23.8h             // 16-31
+        ext             v1.16b, v18.16b, v19.16b, #1
+        ext             v2.16b, v18.16b, v19.16b, #2
+        ext             v3.16b, v18.16b, v19.16b, #3
+        ext             v4.16b, v18.16b, v19.16b, #4
+        ext             v5.16b, v18.16b, v19.16b, #5
+        ext             v6.16b, v18.16b, v19.16b, #6
+        ext             v7.16b, v18.16b, v19.16b, #7
+        QPEL_UNI_W_H_CALC v18, v2, v1, v3, v20, v24, v21, v0
+        QPEL_UNI_W_H_CALC v4, v6, v5, v7, v22, v24, v23, v0
+        sqxtn           v20.4h, v20.4s
+        sqxtn2          v20.8h, v22.4s
+        sqxtn           v21.4h, v21.4s
+        sqxtn2          v21.8h, v23.4s
+        trn1            v22.8h, v20.8h, v21.8h
+        trn2            v23.8h, v20.8h, v21.8h
+        sqxtun          v27.8b, v22.8h
+        sqxtun2         v27.16b, v23.8h             // 32-47
+        st1             {v25.16b, v26.16b, v27.16b}, [x0], x1 // store 48 bytes, then advance by the destination stride
+        subs            w4, w4, #1
+        b.hi            1b
+        ret
+endfunc
+
+
+
+function ff_hevc_put_hevc_qpel_uni_w_h64_8_neon_i8mm, export=1
+        QPEL_UNI_W_H_HEADER                         // 64 output pixels per row
+        sub             x3, x3, #64                 // the main load below post-increments x2 by 64
+1:
+        ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x2], #64 // source bytes 0-63 of this row
+        ext             v1.16b, v16.16b, v17.16b, #1
+        ext             v2.16b, v16.16b, v17.16b, #2
+        ext             v3.16b, v16.16b, v17.16b, #3
+        ext             v4.16b, v16.16b, v17.16b, #4
+        ext             v5.16b, v16.16b, v17.16b, #5
+        ext             v6.16b, v16.16b, v17.16b, #6
+        ext             v7.16b, v16.16b, v17.16b, #7
+        QPEL_UNI_W_H_CALC v16, v2, v1, v3, v20, v24, v21, v0 // results in v20/v21; v24 and v0 are scratch
+        QPEL_UNI_W_H_CALC v4, v6, v5, v7, v22, v24, v23, v0 // results in v22/v23
+        sqxtn           v20.4h, v20.4s
+        sqxtn2          v20.8h, v22.4s
+        sqxtn           v21.4h, v21.4s
+        sqxtn2          v21.8h, v23.4s
+        trn1            v22.8h, v20.8h, v21.8h
+        trn2            v23.8h, v20.8h, v21.8h
+        sqxtun          v16.8b, v22.8h              // v16 is not read as source again this row, so it is reused for output
+        sqxtun2         v16.16b, v23.8h             // 0-15
+        ext             v1.16b, v17.16b, v18.16b, #1
+        ext             v2.16b, v17.16b, v18.16b, #2
+        ext             v3.16b, v17.16b, v18.16b, #3
+        ext             v4.16b, v17.16b, v18.16b, #4
+        ext             v5.16b, v17.16b, v18.16b, #5
+        ext             v6.16b, v17.16b, v18.16b, #6
+        ext             v7.16b, v17.16b, v18.16b, #7
+        QPEL_UNI_W_H_CALC v17, v2, v1, v3, v20, v24, v21, v0
+        QPEL_UNI_W_H_CALC v4, v6, v5, v7, v22, v24, v23, v0
+        sqxtn           v20.4h, v20.4s
+        sqxtn2          v20.8h, v22.4s
+        sqxtn           v21.4h, v21.4s
+        sqxtn2          v21.8h, v23.4s
+        trn1            v22.8h, v20.8h, v21.8h
+        trn2            v23.8h, v20.8h, v21.8h
+        sqxtun          v17.8b, v22.8h
+        sqxtun2         v17.16b, v23.8h             // 16-31
+        ext             v1.16b, v18.16b, v19.16b, #1
+        ext             v2.16b, v18.16b, v19.16b, #2
+        ext             v3.16b, v18.16b, v19.16b, #3
+        ext             v4.16b, v18.16b, v19.16b, #4
+        ext             v5.16b, v18.16b, v19.16b, #5
+        ext             v6.16b, v18.16b, v19.16b, #6
+        ext             v7.16b, v18.16b, v19.16b, #7
+        QPEL_UNI_W_H_CALC v18, v2, v1, v3, v20, v24, v21, v0
+        QPEL_UNI_W_H_CALC v4, v6, v5, v7, v22, v24, v23, v0
+        ld1             {v0.16b}, [x2], x3          // source bytes 64-79 (tail for pixels 48-63); x2 then points at the next row
+        sqxtn           v20.4h, v20.4s
+        sqxtn2          v20.8h, v22.4s
+        sqxtn           v21.4h, v21.4s
+        sqxtn2          v21.8h, v23.4s
+        trn1            v22.8h, v20.8h, v21.8h
+        trn2            v23.8h, v20.8h, v21.8h
+        sqxtun          v18.8b, v22.8h
+        sqxtun2         v18.16b, v23.8h             // 32-47
+        ext             v1.16b, v19.16b, v0.16b, #1
+        ext             v2.16b, v19.16b, v0.16b, #2
+        ext             v3.16b, v19.16b, v0.16b, #3
+        ext             v4.16b, v19.16b, v0.16b, #4
+        ext             v5.16b, v19.16b, v0.16b, #5
+        ext             v6.16b, v19.16b, v0.16b, #6
+        ext             v7.16b, v19.16b, v0.16b, #7
+        QPEL_UNI_W_H_CALC v19, v2, v1, v3, v20, v24, v21, v0 // v0 is free again: all windows were extracted above
+        QPEL_UNI_W_H_CALC v4, v6, v5, v7, v22, v24, v23, v0
+        sqxtn           v20.4h, v20.4s
+        sqxtn2          v20.8h, v22.4s
+        sqxtn           v21.4h, v21.4s
+        sqxtn2          v21.8h, v23.4s
+        trn1            v22.8h, v20.8h, v21.8h
+        trn2            v23.8h, v20.8h, v21.8h
+        sqxtun          v19.8b, v22.8h
+        sqxtun2         v19.16b, v23.8h             // 48-63
+
+        st1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x0], x1 // store 64 bytes, then advance by the destination stride
+        subs            w4, w4, #1
+        b.hi            1b
+        ret
+endfunc
+
+#endif // HAVE_I8MM
-- 
2.38.0.windows.1