* [FFmpeg-devel] [PATCH 2/2] lavc/aarch64: add 8-bit hevc 32x32 idct
From: J. Dekker @ 2022-06-23 12:23 UTC
To: ffmpeg-devel
hevc_idct_32x32_8_c: 40128.5
hevc_idct_32x32_8_neon: 7102.0
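Roughly a 5.6x speedup over the C version (checkasm --bench timings, lower is better).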
Co-developed-by: Rafal Dabrowa <fatwildcat@gmail.com>
Signed-off-by: J. Dekker <jdek@itanimul.li>
---
libavcodec/aarch64/hevcdsp_idct_neon.S | 1265 +++++++++++++++++++++
libavcodec/aarch64/hevcdsp_init_aarch64.c | 2 +
2 files changed, 1267 insertions(+)
Written by the same author as the 16x16 idct in the previous patch;
the same concern noted there applies here as well.
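
For reviewers, a rough C sketch of the even/odd decomposition that one
column pass of this assembly follows (modelled from memory on the TR_32
macro in the C reference; tr_16, clip16 and the 'end' handling are
illustrative stand-ins, not FFmpeg's actual interfaces):

#include <stddef.h>
#include <stdint.h>

extern const int8_t transform[32][32]; /* HEVC 32x32 transform matrix */
void tr_16(int32_t dst[16], const int16_t *src, ptrdiff_t sstep, int end);

static inline int16_t clip16(int32_t v)
{
    return v < -32768 ? -32768 : v > 32767 ? 32767 : (int16_t)v;
}

/* 'end' plays the role of limit2 below: rows >= end are known zero. */
static void tr_32_col(int16_t *coeffs, ptrdiff_t stride, int end, int shift)
{
    int32_t e_32[16], o_32[16] = { 0 };
    const int32_t add = 1 << (shift - 1);

    /* odd part: sums over the odd input rows only */
    for (int i = 0; i < 16; i++)
        for (int j = 1; j < end; j += 2)
            o_32[i] += transform[j][i] * coeffs[j * stride];

    /* even part recurses into the 16-point transform on the even rows */
    tr_16(e_32, coeffs, 2 * stride, end / 2);

    /* butterfly: outputs i and 31 - i share one e/o pair */
    for (int i = 0; i < 16; i++) {
        coeffs[i * stride]        = clip16((e_32[i] + o_32[i] + add) >> shift);
        coeffs[(31 - i) * stride] = clip16((e_32[i] - o_32[i] + add) >> shift);
    }
}

For 8-bit the first (vertical) pass uses shift == 7 and the second
(horizontal) pass shift == 12, matching the sqrshrn #7 / #12 pairs in
the assembly below.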
diff --git a/libavcodec/aarch64/hevcdsp_idct_neon.S b/libavcodec/aarch64/hevcdsp_idct_neon.S
index 784bae33b3..3b6e95153f 100644
--- a/libavcodec/aarch64/hevcdsp_idct_neon.S
+++ b/libavcodec/aarch64/hevcdsp_idct_neon.S
@@ -644,6 +644,40 @@ idct_dc 32, 10
.Lo16transform5: .hword 43, -90, 57, 25, -87, 70, 9, -80 // transform[22][0-7]
.Lo16transform6: .hword 25, -70, 90, -80, 43, 9, -57, 87 // transform[26][0-7]
.Lo16transform7: .hword 9, -25, 43, -57, 70, -80, 87, -90 // transform[30][0-7]
+.Lo32transform:
+ .hword 90, 90, 88, 85, 82, 78, 73, 67 // transform[1,3,5,7..15][1]
+ .hword 61, 54, 46, 38, 31, 22, 13, 4 // transform[17,19,21..31][1]
+ .hword 90, 82, 67, 46, 22, -4, -31, -54 // transform[1,3,5,7..15][3]
+ .hword -73, -85, -90, -88, -78, -61, -38, -13 // transform[17,19,21..31][3]
+ .hword 88, 67, 31, -13, -54, -82, -90, -78 // ..
+ .hword -46, -4, 38, 73, 90, 85, 61, 22
+ .hword 85, 46, -13, -67, -90, -73, -22, 38
+ .hword 82, 88, 54, -4, -61, -90, -78, -31
+.Lo32transform9_31:
+ .hword 82, 22, -54, -90, -61, 13, 78, 85
+ .hword 31, -46, -90, -67, 4, 73, 88, 38
+ .hword 78, -4, -82, -73, 13, 85, 67, -22
+ .hword -88, -61, 31, 90, 54, -38, -90, -46
+ .hword 73, -31, -90, -22, 78, 67, -38, -90
+ .hword -13, 82, 61, -46, -88, -4, 85, 54
+ .hword 67, -54, -78, 38, 85, -22, -90, 4
+ .hword 90, 13, -88, -31, 82, 46, -73, -61
+ .hword 61, -73, -46, 82, 31, -88, -13, 90
+ .hword -4, -90, 22, 85, -38, -78, 54, 67
+ .hword 54, -85, -4, 88, -46, -61, 82, 13
+ .hword -90, 38, 67, -78, -22, 90, -31, -73
+ .hword 46, -90, 38, 54, -90, 31, 61, -88
+ .hword 22, 67, -85, 13, 73, -82, 4, 78
+ .hword 38, -88, 73, -4, -67, 90, -46, -31
+ .hword 85, -78, 13, 61, -90, 54, 22, -82
+ .hword 31, -78, 90, -61, 4, 54, -88, 82
+ .hword -38, -22, 73, -90, 67, -13, -46, 85
+ .hword 22, -61, 85, -90, 73, -38, -4, 46
+ .hword -78, 90, -82, 54, -13, -31, 67, -88
+ .hword 13, -38, 61, -78, 88, -90, 85, -73
+ .hword 54, -31, 4, 22, -46, 67, -82, 90
+ .hword 4, -13, 22, -31, 38, -46, 54, -61 // transform[1,3,5,7..15][31]
+ .hword 67, -73, 78, -82, 85, -88, 90, -90 // transform[17,19,21..31][31]
// void ff_hevc_idct_16x16_8_neon(int16_t *coeffs, int col_limit)
function ff_hevc_idct_16x16_8_neon_new, export=1
@@ -1284,3 +1318,1234 @@ function ff_hevc_idct_16x16_8_neon_new, export=1
ld1 {v8.16b, v9.16b, v10.16b, v11.16b}, [sp], 64
ret
endfunc
+
+function ff_hevc_idct_32x32_8_neon, export=1
+ sub sp, sp, 64
+ st1 {v8.16b, v9.16b, v10.16b, v11.16b}, [sp]
+ sub sp, sp, 64
+ st1 {v12.16b, v13.16b, v14.16b, v15.16b}, [sp]
+ sub sp, sp, 16 * 32 * 4 // room for o_32: 16 * 32 values
+ mov x3, 0 // loop counter
+ mov x2, x0
+ mov x7, 83
+ add x7, x7, 36 * 65536 // o0, o1 coeff. factors
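+ // First (vertical) pass: four iterations, one 8-column slice each.
+ // The odd rows are folded into o_32 on the stack first, then
+ // combined with the even part via the e +/- o butterflies.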
+1: mov x9, 128
+ // loading odd lines
+ add x4, x2, 64 // odd lines
+ ld1 {v16.8h}, [x4], x9 // line 1
+ ld1 {v17.8h}, [x4], x9 // line 3
+ ld1 {v18.8h}, [x4], x9 // line 5
+ ld1 {v19.8h}, [x4], x9 // line 7
+ ld1 {v20.8h}, [x4], x9 // line 9
+ ld1 {v21.8h}, [x4], x9 // line 11
+ ld1 {v22.8h}, [x4], x9 // line 13
+ ld1 {v23.8h}, [x4], x9 // line 15
+ ld1 {v24.8h}, [x4], x9 // line 17
+ ld1 {v25.8h}, [x4], x9 // line 19
+ ld1 {v26.8h}, [x4], x9 // line 21
+ ld1 {v27.8h}, [x4], x9 // line 23
+ ld1 {v28.8h}, [x4], x9 // line 25
+ ld1 {v29.8h}, [x4], x9 // line 27
+ ld1 {v30.8h}, [x4], x9 // line 29
+ ld1 {v31.8h}, [x4], x9 // line 31
+
+ cmp x1, 28
+ b.hs 5f
+ // limit2 below 32
+ bic x4, x1, 1
+ adr x5, .LimitMask
+ cbnz x3, 3f
+ // columns 0 .. 7 - cleanup of indexes 5 .. 7
+ ld1 {v0.8h}, [x5]
+ adr x5, 2f
+ add x5, x5, x4, lsl 2
+ add x5, x5, x4, lsl 1
+ br x5
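+ // Computed goto: each 3-instruction group below masks two more
+ // coefficient rows, so the table is entered at 6 bytes per
+ // col_limit step (12 bytes per group, two limit2 values each).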
+2: and v16.16b, v16.16b, v0.16b // col_limit 0..1 -> limit2 == 4..5
+ and v17.16b, v17.16b, v0.16b
+ b 5f
+ and v17.16b, v17.16b, v0.16b // col_limit 2..3 -> limit2 == 6..7
+ and v18.16b, v18.16b, v0.16b
+ b 5f
+ and v18.16b, v18.16b, v0.16b // col_limit 4..5 -> limit2 == 8..9
+ and v19.16b, v19.16b, v0.16b
+ b 5f
+ and v19.16b, v19.16b, v0.16b // col_limit 6..7 -> limit2 == 10..11
+ and v20.16b, v20.16b, v0.16b
+ b 5f
+ and v20.16b, v20.16b, v0.16b // col_limit 8..9 -> limit2 == 12..13
+ and v21.16b, v21.16b, v0.16b
+ b 5f
+ and v21.16b, v21.16b, v0.16b // col_limit 10..11 -> limit2 == 14..15
+ and v22.16b, v22.16b, v0.16b
+ b 5f
+ and v22.16b, v22.16b, v0.16b // col_limit 12..13 -> limit2 == 16..17
+ and v23.16b, v23.16b, v0.16b
+ b 5f
+ and v23.16b, v23.16b, v0.16b // col_limit 14..15 -> limit2 == 18..19
+ and v24.16b, v24.16b, v0.16b
+ b 5f
+ and v24.16b, v24.16b, v0.16b // col_limit 16..17 -> limit2 == 20..21
+ and v25.16b, v25.16b, v0.16b
+ b 5f
+ and v25.16b, v25.16b, v0.16b // col_limit 18..19 -> limit2 == 22..23
+ and v26.16b, v26.16b, v0.16b
+ b 5f
+ and v26.16b, v26.16b, v0.16b // col_limit 20..21 -> limit2 == 24..25
+ and v27.16b, v27.16b, v0.16b
+ b 5f
+ and v27.16b, v27.16b, v0.16b // col_limit 22..23 -> limit2 == 26..27
+ and v28.16b, v28.16b, v0.16b
+ b 5f
+ and v28.16b, v28.16b, v0.16b // col_limit 24..25 -> limit2 == 28..29
+ and v29.16b, v29.16b, v0.16b
+ b 5f
+ and v29.16b, v29.16b, v0.16b // col_limit 26..27 -> limit2 == 30..31
+ and v30.16b, v30.16b, v0.16b
+ b 5f
+ // columns 8 .. 31
+3: add x4, x4, 6
+ subs x4, x4, x3, lsl 3
+ b.lo 5f
+ ld1 {v0.8h, v1.8h}, [x5]
+ adr x5, 4f
+ add x5, x5, x4, lsl 3
+ add x5, x5, x4, lsl 1
+ br x5
+4: and v16.16b, v16.16b, v1.16b // limit2 == 2..3
+ b 5f
+ nop
+ nop
+ nop
+ and v16.16b, v16.16b, v1.16b // limit2 == 4..5
+ and v17.16b, v17.16b, v1.16b
+ b 5f
+ nop
+ nop
+ and v16.16b, v16.16b, v0.16b // limit2 == 6..7
+ and v17.16b, v17.16b, v1.16b
+ and v18.16b, v18.16b, v1.16b
+ b 5f
+ nop
+ and v16.16b, v16.16b, v0.16b // limit2 == 8..9
+ and v17.16b, v17.16b, v0.16b
+ and v18.16b, v18.16b, v1.16b
+ and v19.16b, v19.16b, v1.16b
+ b 5f
+ and v17.16b, v17.16b, v0.16b // limit2 == 10..11
+ and v18.16b, v18.16b, v0.16b
+ and v19.16b, v19.16b, v1.16b
+ and v20.16b, v20.16b, v1.16b
+ b 5f
+ and v18.16b, v18.16b, v0.16b // limit2 == 12..13
+ and v19.16b, v19.16b, v0.16b
+ and v20.16b, v20.16b, v1.16b
+ and v21.16b, v21.16b, v1.16b
+ b 5f
+ and v19.16b, v19.16b, v0.16b // limit2 == 14..15
+ and v20.16b, v20.16b, v0.16b
+ and v21.16b, v21.16b, v1.16b
+ and v22.16b, v22.16b, v1.16b
+ b 5f
+ and v20.16b, v20.16b, v0.16b // limit2 == 16..17
+ and v21.16b, v21.16b, v0.16b
+ and v22.16b, v22.16b, v1.16b
+ and v23.16b, v23.16b, v1.16b
+ b 5f
+ and v21.16b, v21.16b, v0.16b // limit2 == 18..19
+ and v22.16b, v22.16b, v0.16b
+ and v23.16b, v23.16b, v1.16b
+ and v24.16b, v24.16b, v1.16b
+ b 5f
+ and v22.16b, v22.16b, v0.16b // limit2 == 20..21
+ and v23.16b, v23.16b, v0.16b
+ and v24.16b, v24.16b, v1.16b
+ and v25.16b, v25.16b, v1.16b
+ b 5f
+ and v23.16b, v23.16b, v0.16b // limit2 == 22..23
+ and v24.16b, v24.16b, v0.16b
+ and v25.16b, v25.16b, v1.16b
+ and v26.16b, v26.16b, v1.16b
+ b 5f
+ and v24.16b, v24.16b, v0.16b // limit2 == 24..25
+ and v25.16b, v25.16b, v0.16b
+ and v26.16b, v26.16b, v1.16b
+ and v27.16b, v27.16b, v1.16b
+ b 5f
+ and v25.16b, v25.16b, v0.16b // limit2 == 26..27
+ and v26.16b, v26.16b, v0.16b
+ and v27.16b, v27.16b, v1.16b
+ and v28.16b, v28.16b, v1.16b
+ b 5f
+
+
+ // o_32
+5: mov x5, 32
+ cmp x1, 28
+ b.hs 6f
+ add x5, x1, 4
+ bic x5, x5, 1
+ cbz x3, 6f
+ add x5, x1, 6
+ orr x5, x5, 1
+ subs x5, x5, x3, lsl 3
+ csel x5, x5, xzr, hs
+6: mov x4, 128
+ sub x4, x4, x5, lsl 2
+ adr x5, 8f
+ add x5, x5, x4
+ adr x4, .Lo32transform
+ mov x8, sp
+ mov x6, 16
+7: ld1 {v2.8h, v3.8h}, [x4], 32
+ movi v8.4s, 0
+ movi v9.4s, 0
+ br x5
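+ // Enter the unrolled smlal chain partway: rows at or above limit2
+ // are known zero, so their multiply-accumulate pairs are skipped.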
+8: smlal2 v9.4s, v31.8h, v3.h[7]
+ smlal v8.4s, v31.4h, v3.h[7]
+ smlal2 v9.4s, v30.8h, v3.h[6]
+ smlal v8.4s, v30.4h, v3.h[6]
+ smlal2 v9.4s, v29.8h, v3.h[5]
+ smlal v8.4s, v29.4h, v3.h[5]
+ smlal2 v9.4s, v28.8h, v3.h[4]
+ smlal v8.4s, v28.4h, v3.h[4]
+ smlal2 v9.4s, v27.8h, v3.h[3]
+ smlal v8.4s, v27.4h, v3.h[3]
+ smlal2 v9.4s, v26.8h, v3.h[2]
+ smlal v8.4s, v26.4h, v3.h[2]
+ smlal2 v9.4s, v25.8h, v3.h[1]
+ smlal v8.4s, v25.4h, v3.h[1]
+ smlal2 v9.4s, v24.8h, v3.h[0]
+ smlal v8.4s, v24.4h, v3.h[0]
+ smlal2 v9.4s, v23.8h, v2.h[7]
+ smlal v8.4s, v23.4h, v2.h[7]
+ smlal2 v9.4s, v22.8h, v2.h[6]
+ smlal v8.4s, v22.4h, v2.h[6]
+ smlal2 v9.4s, v21.8h, v2.h[5]
+ smlal v8.4s, v21.4h, v2.h[5]
+ smlal2 v9.4s, v20.8h, v2.h[4]
+ smlal v8.4s, v20.4h, v2.h[4]
+ smlal2 v9.4s, v19.8h, v2.h[3]
+ smlal v8.4s, v19.4h, v2.h[3]
+ smlal2 v9.4s, v18.8h, v2.h[2]
+ smlal v8.4s, v18.4h, v2.h[2]
+ smlal2 v9.4s, v17.8h, v2.h[1]
+ smlal v8.4s, v17.4h, v2.h[1]
+ smlal2 v9.4s, v16.8h, v2.h[0]
+ smlal v8.4s, v16.4h, v2.h[0]
+ st1 {v8.4s, v9.4s}, [x8], 32
+ subs x6, x6, 1
+ b.ne 7b
+
+ mov x4, x2
+ ld1 {v16.8h}, [x4], x9 // line 0
+ ld1 {v17.8h}, [x4], x9 // line 2
+ ld1 {v18.8h}, [x4], x9 // line 4
+ ld1 {v19.8h}, [x4], x9 // line 6
+ ld1 {v20.8h}, [x4], x9 // line 8
+ ld1 {v21.8h}, [x4], x9 // line 10
+ ld1 {v22.8h}, [x4], x9 // line 12
+ ld1 {v23.8h}, [x4], x9 // line 14
+ ld1 {v24.8h}, [x4], x9 // line 16
+ ld1 {v25.8h}, [x4], x9 // line 18
+ ld1 {v26.8h}, [x4], x9 // line 20
+ ld1 {v27.8h}, [x4], x9 // line 22
+ ld1 {v28.8h}, [x4], x9 // line 24
+ ld1 {v29.8h}, [x4], x9 // line 26
+ ld1 {v30.8h}, [x4], x9 // line 28
+ ld1 {v31.8h}, [x4], x9 // line 30
+ cmp x1, 28
+ b.hs 12f
+ // limit2 below 32
+ bic x4, x1, 3
+ cbnz x3, 10f
+ // columns 0 .. 7 - cleanup of indexes 5 .. 7
+ adr x5, 9f
+ add x5, x5, x4, lsl 1
+ br x5
+9: and v17.16b, v17.16b, v0.16b // col_limit 0..3 -> limit2/2 == 2..3
+ b 12f
+ and v19.16b, v19.16b, v0.16b // col_limit 4..7 -> limit2/2 == 4..5
+ b 12f
+ and v21.16b, v21.16b, v0.16b // col_limit 8..11 -> limit2/2 == 6..7
+ b 12f
+ and v23.16b, v23.16b, v0.16b // col_limit 12..15 -> limit2/2 == 8..9
+ b 12f
+ and v25.16b, v25.16b, v0.16b // col_limit 16..19 -> limit2/2 == 10..11
+ b 12f
+ and v27.16b, v27.16b, v0.16b // col_limit 20..23 -> limit2/2 == 12..13
+ b 12f
+ and v29.16b, v29.16b, v0.16b // col_limit 24..27 -> limit2/2 == 14..15
+ b 12f
+ // columns 8 .. 31
+10: add x4, x4, 4
+ subs x4, x4, x3, lsl 3 // x4 = (limit2 & ~3) - 4, adjusted for this slice's first column (8 * x3)
+ b.lo 12f
+ adr x5, 11f
+ add x5, x5, x4, lsl 1
+ add x5, x5, x4
+ br x5
+11: and v17.16b, v17.16b, v1.16b // limit2 == 4..7
+ b 12f
+ nop
+ and v17.16b, v17.16b, v0.16b // limit2 == 8..11
+ and v19.16b, v19.16b, v1.16b
+ b 12f
+ and v19.16b, v19.16b, v0.16b // limit2 == 12..15
+ and v21.16b, v21.16b, v1.16b
+ b 12f
+ and v21.16b, v21.16b, v0.16b // limit2 == 16..19
+ and v23.16b, v23.16b, v1.16b
+ b 12f
+ and v23.16b, v23.16b, v0.16b // limit2 == 20..23
+ and v25.16b, v25.16b, v1.16b
+ b 12f
+ and v25.16b, v25.16b, v0.16b // limit2 == 24..27
+ and v27.16b, v27.16b, v1.16b
+ b 12f
+
+ // v0,v1 = e0
+12: sshll v0.4s, v16.4h, 6
+ sshll v1.4s, v24.4h, 6
+ add v0.4s, v0.4s, v1.4s
+ sshll2 v1.4s, v16.8h, 6
+ sshll2 v2.4s, v24.8h, 6
+ add v1.4s, v1.4s, v2.4s
+
+ // v2,v3 = o0
+ mov v14.s[0], w7
+ smull v2.4s, v20.4h, v14.h[0]
+ smlal v2.4s, v28.4h, v14.h[1]
+ smull2 v3.4s, v20.8h, v14.h[0]
+ smlal2 v3.4s, v28.8h, v14.h[1]
+
+ // v4,v5 = e_8[0]
+ add v4.4s, v0.4s, v2.4s
+ add v5.4s, v1.4s, v3.4s
+
+ // v6,v7 = e_8[3]
+ sub v6.4s, v0.4s, v2.4s
+ sub v7.4s, v1.4s, v3.4s
+
+
+ // v0,v1 = o_8[0]
+ adr x4, .Lo8transform0
+ ld1 {v15.4h}, [x4]
+ smull v0.4s, v18.4h, v15.h[0]
+ smlal v0.4s, v22.4h, v15.h[1]
+ smlal v0.4s, v26.4h, v15.h[2]
+ smlal v0.4s, v30.4h, v15.h[3]
+ smull2 v1.4s, v18.8h, v15.h[0]
+ smlal2 v1.4s, v22.8h, v15.h[1]
+ smlal2 v1.4s, v26.8h, v15.h[2]
+ smlal2 v1.4s, v30.8h, v15.h[3]
+
+ // v2,v3 = e_16[0]
+ add v2.4s, v4.4s, v0.4s
+ add v3.4s, v5.4s, v1.4s
+
+ // v8,v9 = o_16[0]
+ adr x4, .Lo16transform0
+ ld1 {v15.8h}, [x4]
+ mov x5, 32
+ cmp x1, 28
+ b.hs 13f
+ add x5, x1, 4
+ bic x5, x5, 3
+ cbz x3, 13f
+ orr x5, x5, 2
+ subs x5, x5, x3, lsl 3
+ csel x5, x5, xzr, hs
+13: mov x4, 64
+ sub x6, x4, x5, lsl 1
+ adr x5, 14f
+ add x5, x5, x6
+ movi v8.4s, 0
+ movi v9.4s, 0
+ br x5
+14: smlal2 v9.4s, v31.8h, v15.h[7]
+ smlal v8.4s, v31.4h, v15.h[7]
+ smlal2 v9.4s, v29.8h, v15.h[6]
+ smlal v8.4s, v29.4h, v15.h[6]
+ smlal2 v9.4s, v27.8h, v15.h[5]
+ smlal v8.4s, v27.4h, v15.h[5]
+ smlal2 v9.4s, v25.8h, v15.h[4]
+ smlal v8.4s, v25.4h, v15.h[4]
+ smlal2 v9.4s, v23.8h, v15.h[3]
+ smlal v8.4s, v23.4h, v15.h[3]
+ smlal2 v9.4s, v21.8h, v15.h[2]
+ smlal v8.4s, v21.4h, v15.h[2]
+ smlal2 v9.4s, v19.8h, v15.h[1]
+ smlal v8.4s, v19.4h, v15.h[1]
+ smlal2 v9.4s, v17.8h, v15.h[0]
+ smlal v8.4s, v17.4h, v15.h[0]
+
+ // v12,v13 = e_32[0]
+ add v12.4s, v2.4s, v8.4s
+ add v13.4s, v3.4s, v9.4s
+
+ // v14,v15 = o_32[0]
+ ld1 {v14.4s, v15.4s}, [sp]
+
+ // tmp[0 * 32]
+ add v10.4s, v12.4s, v14.4s
+ add v11.4s, v13.4s, v15.4s
+ sqrshrn v10.4h, v10.4s, 7
+ sqrshrn2 v10.8h, v11.4s, 7
+ st1 {v10.8h}, [x2]
+
+ // tmp[31 * 32]
+ sub v10.4s, v12.4s, v14.4s
+ sub v11.4s, v13.4s, v15.4s
+ sqrshrn v10.4h, v10.4s, 7
+ sqrshrn2 v10.8h, v11.4s, 7
+ add x4, x2, 31 * 64
+ st1 {v10.8h}, [x4]
+
+ // v12,v13 = e_32[15]
+ sub v12.4s, v2.4s, v8.4s
+ sub v13.4s, v3.4s, v9.4s
+
+ // v14,v15 = o_32[15]
+ add x4, sp, 15 * 32
+ ld1 {v14.4s, v15.4s}, [x4]
+
+ // tmp[15 * 32]
+ add v10.4s, v12.4s, v14.4s
+ add v11.4s, v13.4s, v15.4s
+ sqrshrn v10.4h, v10.4s, 7
+ sqrshrn2 v10.8h, v11.4s, 7
+ add x4, x2, 15 * 64
+ st1 {v10.8h}, [x4]
+
+ // tmp[16 * 32]
+ sub v10.4s, v12.4s, v14.4s
+ sub v11.4s, v13.4s, v15.4s
+ sqrshrn v10.4h, v10.4s, 7
+ sqrshrn2 v10.8h, v11.4s, 7
+ add x4, x2, 16 * 64
+ st1 {v10.8h}, [x4]
+
+ // v2,v3 = e_16[7]
+ sub v2.4s, v4.4s, v0.4s
+ sub v3.4s, v5.4s, v1.4s
+
+ // v8,v9 = o_16[7]
+ adr x4, .Lo16transform7
+ ld1 {v15.8h}, [x4]
+ adr x5, 15f
+ add x5, x5, x6
+ movi v8.4s, 0
+ movi v9.4s, 0
+ br x5
+15: smlal2 v9.4s, v31.8h, v15.h[7]
+ smlal v8.4s, v31.4h, v15.h[7]
+ smlal2 v9.4s, v29.8h, v15.h[6]
+ smlal v8.4s, v29.4h, v15.h[6]
+ smlal2 v9.4s, v27.8h, v15.h[5]
+ smlal v8.4s, v27.4h, v15.h[5]
+ smlal2 v9.4s, v25.8h, v15.h[4]
+ smlal v8.4s, v25.4h, v15.h[4]
+ smlal2 v9.4s, v23.8h, v15.h[3]
+ smlal v8.4s, v23.4h, v15.h[3]
+ smlal2 v9.4s, v21.8h, v15.h[2]
+ smlal v8.4s, v21.4h, v15.h[2]
+ smlal2 v9.4s, v19.8h, v15.h[1]
+ smlal v8.4s, v19.4h, v15.h[1]
+ smlal2 v9.4s, v17.8h, v15.h[0]
+ smlal v8.4s, v17.4h, v15.h[0]
+
+ // v12,v13 = e_32[7]
+ add v12.4s, v2.4s, v8.4s
+ add v13.4s, v3.4s, v9.4s
+
+ // v14,v15 = o_32[7]
+ add x4, sp, 7 * 32
+ ld1 {v14.4s, v15.4s}, [x4]
+
+ // tmp[7 * 32]
+ add v10.4s, v12.4s, v14.4s
+ add v11.4s, v13.4s, v15.4s
+ sqrshrn v10.4h, v10.4s, 7
+ sqrshrn2 v10.8h, v11.4s, 7
+ add x4, x2, 7 * 64
+ st1 {v10.8h}, [x4]
+
+ // tmp[24 * 32]
+ sub v10.4s, v12.4s, v14.4s
+ sub v11.4s, v13.4s, v15.4s
+ sqrshrn v10.4h, v10.4s, 7
+ sqrshrn2 v10.8h, v11.4s, 7
+ add x4, x2, 24 * 64
+ st1 {v10.8h}, [x4]
+
+ // v12,v13 = e_32[8]
+ sub v12.4s, v2.4s, v8.4s
+ sub v13.4s, v3.4s, v9.4s
+
+ // v14,v15 = o_32[8]
+ add x4, sp, 8 * 32
+ ld1 {v14.4s, v15.4s}, [x4]
+
+ // tmp[8 * 32]
+ add v10.4s, v12.4s, v14.4s
+ add v11.4s, v13.4s, v15.4s
+ sqrshrn v10.4h, v10.4s, 7
+ sqrshrn2 v10.8h, v11.4s, 7
+ add x4, x2, 8 * 64
+ st1 {v10.8h}, [x4]
+
+ // tmp[23 * 32]
+ sub v10.4s, v12.4s, v14.4s
+ sub v11.4s, v13.4s, v15.4s
+ sqrshrn v10.4h, v10.4s, 7
+ sqrshrn2 v10.8h, v11.4s, 7
+ add x4, x2, 23 * 64
+ st1 {v10.8h}, [x4]
+
+ // v0,v1 = o_8[3]
+ adr x4, .Lo8transform3
+ ld1 {v15.4h}, [x4]
+ smull v0.4s, v18.4h, v15.h[0]
+ smlal v0.4s, v22.4h, v15.h[1]
+ smlal v0.4s, v26.4h, v15.h[2]
+ smlal v0.4s, v30.4h, v15.h[3]
+ smull2 v1.4s, v18.8h, v15.h[0]
+ smlal2 v1.4s, v22.8h, v15.h[1]
+ smlal2 v1.4s, v26.8h, v15.h[2]
+ smlal2 v1.4s, v30.8h, v15.h[3]
+
+ // v2,v3 = e_16[3]
+ add v2.4s, v6.4s, v0.4s
+ add v3.4s, v7.4s, v1.4s
+
+ // v8,v9 = o_16[3]
+ adr x4, .Lo16transform3
+ ld1 {v15.8h}, [x4]
+ adr x5, 16f
+ add x5, x5, x6
+ movi v8.4s, 0
+ movi v9.4s, 0
+ br x5
+16: smlal2 v9.4s, v31.8h, v15.h[7]
+ smlal v8.4s, v31.4h, v15.h[7]
+ smlal2 v9.4s, v29.8h, v15.h[6]
+ smlal v8.4s, v29.4h, v15.h[6]
+ smlal2 v9.4s, v27.8h, v15.h[5]
+ smlal v8.4s, v27.4h, v15.h[5]
+ smlal2 v9.4s, v25.8h, v15.h[4]
+ smlal v8.4s, v25.4h, v15.h[4]
+ smlal2 v9.4s, v23.8h, v15.h[3]
+ smlal v8.4s, v23.4h, v15.h[3]
+ smlal2 v9.4s, v21.8h, v15.h[2]
+ smlal v8.4s, v21.4h, v15.h[2]
+ smlal2 v9.4s, v19.8h, v15.h[1]
+ smlal v8.4s, v19.4h, v15.h[1]
+ smlal2 v9.4s, v17.8h, v15.h[0]
+ smlal v8.4s, v17.4h, v15.h[0]
+
+ // v12,v13 = e_32[3]
+ add v12.4s, v2.4s, v8.4s
+ add v13.4s, v3.4s, v9.4s
+
+ // v14,v15 = o_32[3]
+ add x4, sp, 3 * 32
+ ld1 {v14.4s, v15.4s}, [x4]
+
+ // tmp[3 * 32]
+ add v10.4s, v12.4s, v14.4s
+ add v11.4s, v13.4s, v15.4s
+ sqrshrn v10.4h, v10.4s, 7
+ sqrshrn2 v10.8h, v11.4s, 7
+ add x4, x2, 3 * 64
+ st1 {v10.8h}, [x4]
+
+ // tmp[28 * 32]
+ sub v10.4s, v12.4s, v14.4s
+ sub v11.4s, v13.4s, v15.4s
+ sqrshrn v10.4h, v10.4s, 7
+ sqrshrn2 v10.8h, v11.4s, 7
+ add x4, x2, 28 * 64
+ st1 {v10.8h}, [x4]
+
+ // v12,v13 = e_32[12]
+ sub v12.4s, v2.4s, v8.4s
+ sub v13.4s, v3.4s, v9.4s
+
+ // v14,v15 = o_32[12]
+ add x4, sp, 12 * 32
+ ld1 {v14.4s, v15.4s}, [x4]
+
+ // tmp[12 * 32]
+ add v10.4s, v12.4s, v14.4s
+ add v11.4s, v13.4s, v15.4s
+ sqrshrn v10.4h, v10.4s, 7
+ sqrshrn2 v10.8h, v11.4s, 7
+ add x4, x2, 12 * 64
+ st1 {v10.8h}, [x4]
+
+ // tmp[19 * 32]
+ sub v10.4s, v12.4s, v14.4s
+ sub v11.4s, v13.4s, v15.4s
+ sqrshrn v10.4h, v10.4s, 7
+ sqrshrn2 v10.8h, v11.4s, 7
+ add x4, x2, 19 * 64
+ st1 {v10.8h}, [x4]
+
+ // v2,v3 = e_16[4]
+ sub v2.4s, v6.4s, v0.4s
+ sub v3.4s, v7.4s, v1.4s
+
+ // v8,v9 = o_16[4]
+ adr x4, .Lo16transform4
+ ld1 {v15.8h}, [x4]
+ adr x5, 17f
+ add x5, x5, x6
+ movi v8.4s, 0
+ movi v9.4s, 0
+ br x5
+17: smlal2 v9.4s, v31.8h, v15.h[7]
+ smlal v8.4s, v31.4h, v15.h[7]
+ smlal2 v9.4s, v29.8h, v15.h[6]
+ smlal v8.4s, v29.4h, v15.h[6]
+ smlal2 v9.4s, v27.8h, v15.h[5]
+ smlal v8.4s, v27.4h, v15.h[5]
+ smlal2 v9.4s, v25.8h, v15.h[4]
+ smlal v8.4s, v25.4h, v15.h[4]
+ smlal2 v9.4s, v23.8h, v15.h[3]
+ smlal v8.4s, v23.4h, v15.h[3]
+ smlal2 v9.4s, v21.8h, v15.h[2]
+ smlal v8.4s, v21.4h, v15.h[2]
+ smlal2 v9.4s, v19.8h, v15.h[1]
+ smlal v8.4s, v19.4h, v15.h[1]
+ smlal2 v9.4s, v17.8h, v15.h[0]
+ smlal v8.4s, v17.4h, v15.h[0]
+
+ // v12,v13 = e_32[4]
+ add v12.4s, v2.4s, v8.4s
+ add v13.4s, v3.4s, v9.4s
+
+ // v14,v15 = o_32[4]
+ add x4, sp, 4 * 32
+ ld1 {v14.4s, v15.4s}, [x4]
+
+ // tmp[4 * 32]
+ add v10.4s, v12.4s, v14.4s
+ add v11.4s, v13.4s, v15.4s
+ sqrshrn v10.4h, v10.4s, 7
+ sqrshrn2 v10.8h, v11.4s, 7
+ add x4, x2, 4 * 64
+ st1 {v10.8h}, [x4]
+
+ // tmp[27 * 32]
+ sub v10.4s, v12.4s, v14.4s
+ sub v11.4s, v13.4s, v15.4s
+ sqrshrn v10.4h, v10.4s, 7
+ sqrshrn2 v10.8h, v11.4s, 7
+ add x4, x2, 27 * 64
+ st1 {v10.8h}, [x4]
+
+ // v12,v13 = e_32[11]
+ sub v12.4s, v2.4s, v8.4s
+ sub v13.4s, v3.4s, v9.4s
+
+ // v14,v15 = o_32[11]
+ add x4, sp, 11 * 32
+ ld1 {v14.4s, v15.4s}, [x4]
+
+ // tmp[11 * 32]
+ add v10.4s, v12.4s, v14.4s
+ add v11.4s, v13.4s, v15.4s
+ sqrshrn v10.4h, v10.4s, 7
+ sqrshrn2 v10.8h, v11.4s, 7
+ add x4, x2, 11 * 64
+ st1 {v10.8h}, [x4]
+
+ // tmp[20 * 32]
+ sub v10.4s, v12.4s, v14.4s
+ sub v11.4s, v13.4s, v15.4s
+ sqrshrn v10.4h, v10.4s, 7
+ sqrshrn2 v10.8h, v11.4s, 7
+ add x4, x2, 20 * 64
+ st1 {v10.8h}, [x4]
+
+ // v0,v1 = e1
+ sshll v0.4s, v16.4h, 6
+ sshll v1.4s, v24.4h, 6
+ sub v0.4s, v0.4s, v1.4s
+ sshll2 v1.4s, v16.8h, 6
+ sshll2 v2.4s, v24.8h, 6
+ sub v1.4s, v1.4s, v2.4s
+
+ // v2,v3 = o1
+ mov v14.s[0], w7
+ smull v2.4s, v20.4h, v14.h[1]
+ smlsl v2.4s, v28.4h, v14.h[0]
+ smull2 v3.4s, v20.8h, v14.h[1]
+ smlsl2 v3.4s, v28.8h, v14.h[0]
+
+ // v4,v5 = e_8[1]
+ add v4.4s, v0.4s, v2.4s
+ add v5.4s, v1.4s, v3.4s
+
+ // v6,v7 = e_8[2]
+ sub v6.4s, v0.4s, v2.4s
+ sub v7.4s, v1.4s, v3.4s
+
+ // v0,v1 = o_8[1]
+ adr x4, .Lo8transform1
+ ld1 {v15.4h}, [x4]
+ smull v0.4s, v18.4h, v15.h[0]
+ smlal v0.4s, v22.4h, v15.h[1]
+ smlal v0.4s, v26.4h, v15.h[2]
+ smlal v0.4s, v30.4h, v15.h[3]
+ smull2 v1.4s, v18.8h, v15.h[0]
+ smlal2 v1.4s, v22.8h, v15.h[1]
+ smlal2 v1.4s, v26.8h, v15.h[2]
+ smlal2 v1.4s, v30.8h, v15.h[3]
+
+ // v2,v3 = e_16[1]
+ add v2.4s, v4.4s, v0.4s
+ add v3.4s, v5.4s, v1.4s
+
+ // v8,v9 = o_16[1]
+ adr x4, .Lo16transform1
+ ld1 {v15.8h}, [x4]
+ adr x5, 18f
+ add x5, x5, x6
+ movi v8.4s, 0
+ movi v9.4s, 0
+ br x5
+18: smlal2 v9.4s, v31.8h, v15.h[7]
+ smlal v8.4s, v31.4h, v15.h[7]
+ smlal2 v9.4s, v29.8h, v15.h[6]
+ smlal v8.4s, v29.4h, v15.h[6]
+ smlal2 v9.4s, v27.8h, v15.h[5]
+ smlal v8.4s, v27.4h, v15.h[5]
+ smlal2 v9.4s, v25.8h, v15.h[4]
+ smlal v8.4s, v25.4h, v15.h[4]
+ smlal2 v9.4s, v23.8h, v15.h[3]
+ smlal v8.4s, v23.4h, v15.h[3]
+ smlal2 v9.4s, v21.8h, v15.h[2]
+ smlal v8.4s, v21.4h, v15.h[2]
+ smlal2 v9.4s, v19.8h, v15.h[1]
+ smlal v8.4s, v19.4h, v15.h[1]
+ smlal2 v9.4s, v17.8h, v15.h[0]
+ smlal v8.4s, v17.4h, v15.h[0]
+
+ // v12,v13 = e_32[1]
+ add v12.4s, v2.4s, v8.4s
+ add v13.4s, v3.4s, v9.4s
+
+ // v14,v15 = o_32[1]
+ add x4, sp, 1 * 32
+ ld1 {v14.4s, v15.4s}, [x4]
+
+ // tmp[1 * 32]
+ add v10.4s, v12.4s, v14.4s
+ add v11.4s, v13.4s, v15.4s
+ sqrshrn v10.4h, v10.4s, 7
+ sqrshrn2 v10.8h, v11.4s, 7
+ add x4, x2, 1 * 64
+ st1 {v10.8h}, [x4]
+
+ // tmp[30 * 32]
+ sub v10.4s, v12.4s, v14.4s
+ sub v11.4s, v13.4s, v15.4s
+ sqrshrn v10.4h, v10.4s, 7
+ sqrshrn2 v10.8h, v11.4s, 7
+ add x4, x2, 30 * 64
+ st1 {v10.8h}, [x4]
+
+ // v12,v13 = e_32[14]
+ sub v12.4s, v2.4s, v8.4s
+ sub v13.4s, v3.4s, v9.4s
+
+ // v14,v15 = o_32[14]
+ add x4, sp, 14 * 32
+ ld1 {v14.4s, v15.4s}, [x4]
+
+ // tmp[14 * 32]
+ add v10.4s, v12.4s, v14.4s
+ add v11.4s, v13.4s, v15.4s
+ sqrshrn v10.4h, v10.4s, 7
+ sqrshrn2 v10.8h, v11.4s, 7
+ add x4, x2, 14 * 64
+ st1 {v10.8h}, [x4]
+
+ // tmp[17 * 32]
+ sub v10.4s, v12.4s, v14.4s
+ sub v11.4s, v13.4s, v15.4s
+ sqrshrn v10.4h, v10.4s, 7
+ sqrshrn2 v10.8h, v11.4s, 7
+ add x4, x2, 17 * 64
+ st1 {v10.8h}, [x4]
+
+ // v2,v3 = e_16[6]
+ sub v2.4s, v4.4s, v0.4s
+ sub v3.4s, v5.4s, v1.4s
+
+ // v8,v9 = o_16[6]
+ adr x4, .Lo16transform6
+ ld1 {v15.8h}, [x4]
+ adr x5, 19f
+ add x5, x5, x6
+ movi v8.4s, 0
+ movi v9.4s, 0
+ br x5
+19: smlal2 v9.4s, v31.8h, v15.h[7]
+ smlal v8.4s, v31.4h, v15.h[7]
+ smlal2 v9.4s, v29.8h, v15.h[6]
+ smlal v8.4s, v29.4h, v15.h[6]
+ smlal2 v9.4s, v27.8h, v15.h[5]
+ smlal v8.4s, v27.4h, v15.h[5]
+ smlal2 v9.4s, v25.8h, v15.h[4]
+ smlal v8.4s, v25.4h, v15.h[4]
+ smlal2 v9.4s, v23.8h, v15.h[3]
+ smlal v8.4s, v23.4h, v15.h[3]
+ smlal2 v9.4s, v21.8h, v15.h[2]
+ smlal v8.4s, v21.4h, v15.h[2]
+ smlal2 v9.4s, v19.8h, v15.h[1]
+ smlal v8.4s, v19.4h, v15.h[1]
+ smlal2 v9.4s, v17.8h, v15.h[0]
+ smlal v8.4s, v17.4h, v15.h[0]
+
+ // v12,v13 = e_32[6]
+ add v12.4s, v2.4s, v8.4s
+ add v13.4s, v3.4s, v9.4s
+
+ // v14,v15 = o_32[6]
+ add x4, sp, 6 * 32
+ ld1 {v14.4s, v15.4s}, [x4]
+
+ // tmp[6 * 32]
+ add v10.4s, v12.4s, v14.4s
+ add v11.4s, v13.4s, v15.4s
+ sqrshrn v10.4h, v10.4s, 7
+ sqrshrn2 v10.8h, v11.4s, 7
+ add x4, x2, 6 * 64
+ st1 {v10.8h}, [x4]
+
+ // tmp[25 * 32]
+ sub v10.4s, v12.4s, v14.4s
+ sub v11.4s, v13.4s, v15.4s
+ sqrshrn v10.4h, v10.4s, 7
+ sqrshrn2 v10.8h, v11.4s, 7
+ add x4, x2, 25 * 64
+ st1 {v10.8h}, [x4]
+
+ // v12,v13 = e_32[9]
+ sub v12.4s, v2.4s, v8.4s
+ sub v13.4s, v3.4s, v9.4s
+
+ // v14,v15 = o_32[9]
+ add x4, sp, 9 * 32
+ ld1 {v14.4s, v15.4s}, [x4]
+
+ // tmp[9 * 32]
+ add v10.4s, v12.4s, v14.4s
+ add v11.4s, v13.4s, v15.4s
+ sqrshrn v10.4h, v10.4s, 7
+ sqrshrn2 v10.8h, v11.4s, 7
+ add x4, x2, 9 * 64
+ st1 {v10.8h}, [x4]
+
+ // tmp[22 * 32]
+ sub v10.4s, v12.4s, v14.4s
+ sub v11.4s, v13.4s, v15.4s
+ sqrshrn v10.4h, v10.4s, 7
+ sqrshrn2 v10.8h, v11.4s, 7
+ add x4, x2, 22 * 64
+ st1 {v10.8h}, [x4]
+
+ // v0,v1 = o_8[2]
+ adr x4, .Lo8transform2
+ ld1 {v15.4h}, [x4]
+ smull v0.4s, v18.4h, v15.h[0]
+ smlal v0.4s, v22.4h, v15.h[1]
+ smlal v0.4s, v26.4h, v15.h[2]
+ smlal v0.4s, v30.4h, v15.h[3]
+ smull2 v1.4s, v18.8h, v15.h[0]
+ smlal2 v1.4s, v22.8h, v15.h[1]
+ smlal2 v1.4s, v26.8h, v15.h[2]
+ smlal2 v1.4s, v30.8h, v15.h[3]
+
+ // v2,v3 = e_16[2]
+ add v2.4s, v6.4s, v0.4s
+ add v3.4s, v7.4s, v1.4s
+
+ // v8,v9 = o_16[2]
+ adr x4, .Lo16transform2
+ ld1 {v15.8h}, [x4]
+ adr x5, 20f
+ add x5, x5, x6
+ movi v8.4s, 0
+ movi v9.4s, 0
+ br x5
+20: smlal2 v9.4s, v31.8h, v15.h[7]
+ smlal v8.4s, v31.4h, v15.h[7]
+ smlal2 v9.4s, v29.8h, v15.h[6]
+ smlal v8.4s, v29.4h, v15.h[6]
+ smlal2 v9.4s, v27.8h, v15.h[5]
+ smlal v8.4s, v27.4h, v15.h[5]
+ smlal2 v9.4s, v25.8h, v15.h[4]
+ smlal v8.4s, v25.4h, v15.h[4]
+ smlal2 v9.4s, v23.8h, v15.h[3]
+ smlal v8.4s, v23.4h, v15.h[3]
+ smlal2 v9.4s, v21.8h, v15.h[2]
+ smlal v8.4s, v21.4h, v15.h[2]
+ smlal2 v9.4s, v19.8h, v15.h[1]
+ smlal v8.4s, v19.4h, v15.h[1]
+ smlal2 v9.4s, v17.8h, v15.h[0]
+ smlal v8.4s, v17.4h, v15.h[0]
+
+ // v12,v13 = e_32[2]
+ add v12.4s, v2.4s, v8.4s
+ add v13.4s, v3.4s, v9.4s
+
+ // v14,v15 = o_32[2]
+ add x4, sp, 2 * 32
+ ld1 {v14.4s, v15.4s}, [x4]
+
+ // tmp[2 * 32]
+ add v10.4s, v12.4s, v14.4s
+ add v11.4s, v13.4s, v15.4s
+ sqrshrn v10.4h, v10.4s, 7
+ sqrshrn2 v10.8h, v11.4s, 7
+ add x4, x2, 2 * 64
+ st1 {v10.8h}, [x4]
+
+ // tmp[29 * 32]
+ sub v10.4s, v12.4s, v14.4s
+ sub v11.4s, v13.4s, v15.4s
+ sqrshrn v10.4h, v10.4s, 7
+ sqrshrn2 v10.8h, v11.4s, 7
+ add x4, x2, 29 * 64
+ st1 {v10.8h}, [x4]
+
+ // v12,v13 = e_32[13]
+ sub v12.4s, v2.4s, v8.4s
+ sub v13.4s, v3.4s, v9.4s
+
+ // v14,v15 = o_32[13]
+ add x4, sp, 13 * 32
+ ld1 {v14.4s, v15.4s}, [x4]
+
+ // tmp[13 * 32]
+ add v10.4s, v12.4s, v14.4s
+ add v11.4s, v13.4s, v15.4s
+ sqrshrn v10.4h, v10.4s, 7
+ sqrshrn2 v10.8h, v11.4s, 7
+ add x4, x2, 13 * 64
+ st1 {v10.8h}, [x4]
+
+ // tmp[18 * 32]
+ sub v10.4s, v12.4s, v14.4s
+ sub v11.4s, v13.4s, v15.4s
+ sqrshrn v10.4h, v10.4s, 7
+ sqrshrn2 v10.8h, v11.4s, 7
+ add x4, x2, 18 * 64
+ st1 {v10.8h}, [x4]
+
+ // v2,v3 = e_16[5]
+ sub v2.4s, v6.4s, v0.4s
+ sub v3.4s, v7.4s, v1.4s
+
+ // v8,v9 = o_16[5]
+ adr x4, .Lo16transform5
+ ld1 {v15.8h}, [x4]
+ adr x5, 21f
+ add x5, x5, x6
+ movi v8.4s, 0
+ movi v9.4s, 0
+ br x5
+21: smlal2 v9.4s, v31.8h, v15.h[7]
+ smlal v8.4s, v31.4h, v15.h[7]
+ smlal2 v9.4s, v29.8h, v15.h[6]
+ smlal v8.4s, v29.4h, v15.h[6]
+ smlal2 v9.4s, v27.8h, v15.h[5]
+ smlal v8.4s, v27.4h, v15.h[5]
+ smlal2 v9.4s, v25.8h, v15.h[4]
+ smlal v8.4s, v25.4h, v15.h[4]
+ smlal2 v9.4s, v23.8h, v15.h[3]
+ smlal v8.4s, v23.4h, v15.h[3]
+ smlal2 v9.4s, v21.8h, v15.h[2]
+ smlal v8.4s, v21.4h, v15.h[2]
+ smlal2 v9.4s, v19.8h, v15.h[1]
+ smlal v8.4s, v19.4h, v15.h[1]
+ smlal2 v9.4s, v17.8h, v15.h[0]
+ smlal v8.4s, v17.4h, v15.h[0]
+
+ // v12,v13 = e_32[5]
+ add v12.4s, v2.4s, v8.4s
+ add v13.4s, v3.4s, v9.4s
+
+ // v14,v15 = o_32[5]
+ add x4, sp, 5 * 32
+ ld1 {v14.4s, v15.4s}, [x4]
+
+ // tmp[5 * 32]
+ add v10.4s, v12.4s, v14.4s
+ add v11.4s, v13.4s, v15.4s
+ sqrshrn v10.4h, v10.4s, 7
+ sqrshrn2 v10.8h, v11.4s, 7
+ add x4, x2, 5 * 64
+ st1 {v10.8h}, [x4]
+
+ // tmp[26 * 32]
+ sub v10.4s, v12.4s, v14.4s
+ sub v11.4s, v13.4s, v15.4s
+ sqrshrn v10.4h, v10.4s, 7
+ sqrshrn2 v10.8h, v11.4s, 7
+ add x4, x2, 26 * 64
+ st1 {v10.8h}, [x4]
+
+ // v12,v13 = e_32[10]
+ sub v12.4s, v2.4s, v8.4s
+ sub v13.4s, v3.4s, v9.4s
+
+ // v14,v15 = o_32[10]
+ add x4, sp, 10 * 32
+ ld1 {v14.4s, v15.4s}, [x4]
+
+ // tmp[10 * 32]
+ add v10.4s, v12.4s, v14.4s
+ add v11.4s, v13.4s, v15.4s
+ sqrshrn v10.4h, v10.4s, 7
+ sqrshrn2 v10.8h, v11.4s, 7
+ add x4, x2, 10 * 64
+ st1 {v10.8h}, [x4]
+
+ // tmp[21 * 32]
+ sub v10.4s, v12.4s, v14.4s
+ sub v11.4s, v13.4s, v15.4s
+ sqrshrn v10.4h, v10.4s, 7
+ sqrshrn2 v10.8h, v11.4s, 7
+ add x4, x2, 21 * 64
+ st1 {v10.8h}, [x4]
+
+
+ add x2, x2, 16
+ add x3, x3, 1
+ cmp x3, 4
+ b.ne 1b
+
+ // horizontal transform
+ cmp x1, 9
+ b.ls 24f
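+ // For col_limit <= 9 only the first few odd columns can be nonzero,
+ // so the precomputation of the o_32 tail is skipped entirely.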
+ // o_32 partially (last 12 sum components)
+ adr x4, .Lo32transform9_31
+ ld1 {v8.8h, v9.8h, v10.8h, v11.8h}, [x4], 64
+ ld1 {v12.8h, v13.8h, v14.8h, v15.8h}, [x4], 64
+ ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x4], 64
+ ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x4], 64
+ ld1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x4], 64
+ ld1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x4], 64
+ bic x5, x1, 1
+ subs x5, x5, 8
+ csel x5, x5, xzr, hs
+ mov x4, 24
+ subs x4, x4, x5
+ csel x5, x4, xzr, hs
+ adr x4, 23f
+ add x5, x4, x5, lsl 3
+ add x2, x0, 16
+ mov x8, sp
+ mov x3, 64
+ mov x6, 32
+22: ld1 {v0.8h, v1.8h, v2.8h}, [x2], x3
+ movi v4.4s, 0
+ movi v5.4s, 0
+ movi v6.4s, 0
+ movi v7.4s, 0
+ br x5
+23: smlal v4.4s, v30.4h, v2.h[7]
+ smlal2 v5.4s, v30.8h, v2.h[7]
+ smlal v6.4s, v31.4h, v2.h[7]
+ smlal2 v7.4s, v31.8h, v2.h[7]
+ smlal v4.4s, v28.4h, v2.h[5]
+ smlal2 v5.4s, v28.8h, v2.h[5]
+ smlal v6.4s, v29.4h, v2.h[5]
+ smlal2 v7.4s, v29.8h, v2.h[5]
+ smlal v4.4s, v26.4h, v2.h[3]
+ smlal2 v5.4s, v26.8h, v2.h[3]
+ smlal v6.4s, v27.4h, v2.h[3]
+ smlal2 v7.4s, v27.8h, v2.h[3]
+ smlal v4.4s, v24.4h, v2.h[1]
+ smlal2 v5.4s, v24.8h, v2.h[1]
+ smlal v6.4s, v25.4h, v2.h[1]
+ smlal2 v7.4s, v25.8h, v2.h[1]
+ smlal v4.4s, v22.4h, v1.h[7]
+ smlal2 v5.4s, v22.8h, v1.h[7]
+ smlal v6.4s, v23.4h, v1.h[7]
+ smlal2 v7.4s, v23.8h, v1.h[7]
+ smlal v4.4s, v20.4h, v1.h[5]
+ smlal2 v5.4s, v20.8h, v1.h[5]
+ smlal v6.4s, v21.4h, v1.h[5]
+ smlal2 v7.4s, v21.8h, v1.h[5]
+ smlal v4.4s, v18.4h, v1.h[3]
+ smlal2 v5.4s, v18.8h, v1.h[3]
+ smlal v6.4s, v19.4h, v1.h[3]
+ smlal2 v7.4s, v19.8h, v1.h[3]
+ smlal v4.4s, v16.4h, v1.h[1]
+ smlal2 v5.4s, v16.8h, v1.h[1]
+ smlal v6.4s, v17.4h, v1.h[1]
+ smlal2 v7.4s, v17.8h, v1.h[1]
+ smlal v4.4s, v14.4h, v0.h[7]
+ smlal2 v5.4s, v14.8h, v0.h[7]
+ smlal v6.4s, v15.4h, v0.h[7]
+ smlal2 v7.4s, v15.8h, v0.h[7]
+ smlal v4.4s, v12.4h, v0.h[5]
+ smlal2 v5.4s, v12.8h, v0.h[5]
+ smlal v6.4s, v13.4h, v0.h[5]
+ smlal2 v7.4s, v13.8h, v0.h[5]
+ smlal v4.4s, v10.4h, v0.h[3]
+ smlal2 v5.4s, v10.8h, v0.h[3]
+ smlal v6.4s, v11.4h, v0.h[3]
+ smlal2 v7.4s, v11.8h, v0.h[3]
+ smlal v4.4s, v8.4h, v0.h[1]
+ smlal2 v5.4s, v8.8h, v0.h[1]
+ smlal v6.4s, v9.4h, v0.h[1]
+ smlal2 v7.4s, v9.8h, v0.h[1]
+ st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [x8], 64
+ subs x6, x6, 1
+ b.ne 22b
+
+
+24: adr x4, .Leo_coeff
+ ld1 {v12.8h, v13.8h, v14.8h, v15.8h}, [x4], 64
+ ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x4], 64
+ ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x4], 64
+ adr x4, .Lo32transform
+ ld1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x4], 64
+ ld1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x4], 64
+ // o_16 jump address
+ mov x4, 64
+ bic x5, x1, 3
+ subs x4, x4, x5, lsl 1
+ csel x4, x4, xzr, hs
+ adr x5, 26f
+ add x5, x5, x4
+ // o_32 jump address
+ bic x6, x1, 1
+ mov x4, 8
+ subs x4, x4, x6
+ csel x6, x4, xzr, hs
+ adr x4, 29f
+ add x6, x4, x6, lsl 3
+
+ mov x8, sp
+ mov x3, 32
+25: ld1 {v8.8h, v9.8h, v10.8h, v11.8h}, [x0]
+
+ // v2 = e_8
+ smull v2.4s, v12.4h, v8.h[0]
+ smlal2 v2.4s, v12.8h, v9.h[0]
+ smlal v2.4s, v13.4h, v10.h[0]
+ smlal2 v2.4s, v13.8h, v11.h[0]
+
+ // v3 = o_8
+ smull v3.4s, v14.4h, v8.h[4]
+ smlal2 v3.4s, v14.8h, v9.h[4]
+ smlal v3.4s, v15.4h, v10.h[4]
+ smlal2 v3.4s, v15.8h, v11.h[4]
+
+ // v0,v1 = e_16
+ add v0.4s, v2.4s, v3.4s
+ sub v2.4s, v2.4s, v3.4s
+ mov v1.d[0], v2.d[1]
+ mov v1.d[1], v2.d[0]
+ rev64 v1.4s, v1.4s
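+ // d-half swap plus rev64 reverses all four 32-bit lanes, putting
+ // e_16[4..7] in ascending order from the descending differences.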
+
+ // v2,v3 = o_16
+ movi v2.4s, 0
+ movi v3.4s, 0
+ br x5
+26: smlal v2.4s, v23.4h, v11.h[6]
+ smlal2 v3.4s, v23.8h, v11.h[6]
+ smlal v2.4s, v22.4h, v11.h[2]
+ smlal2 v3.4s, v22.8h, v11.h[2]
+ smlal v2.4s, v21.4h, v10.h[6]
+ smlal2 v3.4s, v21.8h, v10.h[6]
+ smlal v2.4s, v20.4h, v10.h[2]
+ smlal2 v3.4s, v20.8h, v10.h[2]
+ smlal v2.4s, v19.4h, v9.h[6]
+ smlal2 v3.4s, v19.8h, v9.h[6]
+ smlal v2.4s, v18.4h, v9.h[2]
+ smlal2 v3.4s, v18.8h, v9.h[2]
+ smlal v2.4s, v17.4h, v8.h[6]
+ smlal2 v3.4s, v17.8h, v8.h[6]
+ smlal v2.4s, v16.4h, v8.h[2]
+ smlal2 v3.4s, v16.8h, v8.h[2]
+
+ // v4,v5,v6,v7 = e_32
+ add v4.4s, v0.4s, v2.4s
+ add v5.4s, v1.4s, v3.4s
+ sub v11.4s, v0.4s, v2.4s
+ mov v7.d[0], v11.d[1]
+ mov v7.d[1], v11.d[0]
+ rev64 v7.4s, v7.4s
+ sub v11.4s, v1.4s, v3.4s
+ mov v6.d[0], v11.d[1]
+ mov v6.d[1], v11.d[0]
+ rev64 v6.4s, v6.4s
+
+ // v0,v1,v2,v3 = o_32
+ cmp x1, 9
+ b.hi 28f
+ movi v0.4s, 0
+ movi v1.4s, 0
+ movi v2.4s, 0
+ movi v3.4s, 0
+ br x6
+28: ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x8], 64
+ br x6
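+ // The o_32 tail (odd columns 9..31) was precomputed on the stack
+ // above; x6 enters the chain below so only the terms for odd
+ // columns 1..7 still within col_limit are accumulated on top.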
+29: smlal v0.4s, v30.4h, v8.h[7]
+ smlal2 v1.4s, v30.8h, v8.h[7]
+ smlal v2.4s, v31.4h, v8.h[7]
+ smlal2 v3.4s, v31.8h, v8.h[7]
+ smlal v0.4s, v28.4h, v8.h[5]
+ smlal2 v1.4s, v28.8h, v8.h[5]
+ smlal v2.4s, v29.4h, v8.h[5]
+ smlal2 v3.4s, v29.8h, v8.h[5]
+ smlal v0.4s, v26.4h, v8.h[3]
+ smlal2 v1.4s, v26.8h, v8.h[3]
+ smlal v2.4s, v27.4h, v8.h[3]
+ smlal2 v3.4s, v27.8h, v8.h[3]
+ smlal v0.4s, v24.4h, v8.h[1]
+ smlal2 v1.4s, v24.8h, v8.h[1]
+ smlal v2.4s, v25.4h, v8.h[1]
+ smlal2 v3.4s, v25.8h, v8.h[1]
+
+ // coeff
+ add v8.4s, v4.4s, v0.4s
+ add v9.4s, v5.4s, v1.4s
+ add v10.4s, v6.4s, v2.4s
+ add v11.4s, v7.4s, v3.4s
+ sub v4.4s, v4.4s, v0.4s
+ sub v5.4s, v5.4s, v1.4s
+ sub v6.4s, v6.4s, v2.4s
+ sub v7.4s, v7.4s, v3.4s
+ sqrshrn v8.4h, v8.4s, 12
+ sqrshrn2 v8.8h, v9.4s, 12
+ sqrshrn v9.4h, v10.4s, 12
+ sqrshrn2 v9.8h, v11.4s, 12
+ sqrshrn v4.4h, v4.4s, 12
+ sqrshrn2 v4.8h, v5.4s, 12
+ sqrshrn v5.4h, v6.4s, 12
+ sqrshrn2 v5.8h, v7.4s, 12
+ mov v10.d[0], v5.d[1]
+ mov v10.d[1], v5.d[0]
+ rev64 v10.8h, v10.8h
+ mov v11.d[0], v4.d[1]
+ mov v11.d[1], v4.d[0]
+ rev64 v11.8h, v11.8h
+ st1 {v8.8h, v9.8h, v10.8h, v11.8h}, [x0], 64
+ subs x3, x3, 1
+ b.ne 25b
+
+ add sp, sp, 16 * 32 * 4
+ ld1 {v12.16b, v13.16b, v14.16b, v15.16b}, [sp], 64
+ ld1 {v8.16b, v9.16b, v10.16b, v11.16b}, [sp], 64
+ ret
+endfunc
diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c
index 612ebb9541..bb2a6b2502 100644
--- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -46,6 +46,7 @@ void ff_hevc_idct_8x8_10_neon(int16_t *coeffs, int col_limit);
void ff_hevc_idct_16x16_8_neon(int16_t *coeffs, int col_limit);
void ff_hevc_idct_16x16_10_neon(int16_t *coeffs, int col_limit);
void ff_hevc_idct_16x16_8_neon_new(int16_t *coeffs, int col_limit);
+void ff_hevc_idct_32x32_8_neon(int16_t *coeffs, int col_limit);
void ff_hevc_idct_4x4_dc_8_neon(int16_t *coeffs);
void ff_hevc_idct_8x8_dc_8_neon(int16_t *coeffs);
void ff_hevc_idct_16x16_dc_8_neon(int16_t *coeffs);
@@ -74,6 +75,7 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
c->add_residual[3] = ff_hevc_add_residual_32x32_8_neon;
c->idct[1] = ff_hevc_idct_8x8_8_neon;
c->idct[2] = ff_hevc_idct_16x16_8_neon_new;
+ c->idct[3] = ff_hevc_idct_32x32_8_neon;
c->idct_dc[0] = ff_hevc_idct_4x4_dc_8_neon;
c->idct_dc[1] = ff_hevc_idct_8x8_dc_8_neon;
c->idct_dc[2] = ff_hevc_idct_16x16_dc_8_neon;
--
2.32.0 (Apple Git-132)