* [FFmpeg-devel] [PATCH 1/2] lavc/aarch64: new 8-bit hevc 16x16 idct
@ 2022-06-23 12:23 J. Dekker
2022-06-23 12:23 ` [FFmpeg-devel] [PATCH 2/2] lavc/aarch64: add 8-bit hevc 32x32 idct J. Dekker
2022-08-09 12:15 ` [FFmpeg-devel] [PATCH 1/2] lavc/aarch64: new 8-bit hevc 16x16 idct Martin Storsjö
0 siblings, 2 replies; 3+ messages in thread
From: J. Dekker @ 2022-06-23 12:23 UTC (permalink / raw)
To: ffmpeg-devel
old:
hevc_idct_16x16_8_c: 5366.2
hevc_idct_16x16_8_neon: 1493.2
new:
hevc_idct_16x16_8_c: 5363.2
hevc_idct_16x16_8_neon: 943.5
Co-developed-by: Rafal Dabrowa <fatwildcat@gmail.com>
Signed-off-by: J. Dekker <jdek@itanimul.li>
---
libavcodec/aarch64/hevcdsp_idct_neon.S | 666 ++++++++++++++++++++++
libavcodec/aarch64/hevcdsp_init_aarch64.c | 3 +-
2 files changed, 668 insertions(+), 1 deletion(-)
This idct is significantly faster than the one we currently have, I
suspect it's for a couple of reasons: 1) it's only written for 8-bit 2) it's
unrolled significantly more. It comes at a hefty cost of roughly 2.25x
the object size. I'm wondering if this idct is salvageable, or the one
we have should just be improved instead.
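
For orientation, here is a rough C sketch of what the assembly computes.
This is my paraphrase of the two-pass structure, not the lavc C template;
the transform[][] matrix and its indexing are assumptions (transform[k][n]
= k-th basis vector at position n), and the real code only keeps the rows
it needs in the .Lo8transform*/.Lo16transform* tables:

#include <stdint.h>

/* placeholder basis matrix, values in -90..90 */
extern const int8_t transform[16][16];

/* same saturating narrow as sqrshrn */
static int16_t clip16(int64_t v)
{
    return v < INT16_MIN ? INT16_MIN : v > INT16_MAX ? INT16_MAX : (int16_t)v;
}

/* naive two-pass 16x16 inverse transform for 8-bit content, in place:
 * column pass rounded by 7 bits, row pass rounded by 12 bits */
static void idct_16x16_8_sketch(int16_t coeffs[16 * 16])
{
    int16_t tmp[16 * 16];
    for (int x = 0; x < 16; x++)
        for (int i = 0; i < 16; i++) {
            int64_t acc = 0;
            for (int k = 0; k < 16; k++)
                acc += (int64_t)transform[k][i] * coeffs[k * 16 + x];
            tmp[i * 16 + x] = clip16((acc + 64) >> 7);
        }
    for (int i = 0; i < 16; i++)
        for (int x = 0; x < 16; x++) {
            int64_t acc = 0;
            for (int k = 0; k < 16; k++)
                acc += (int64_t)transform[k][x] * tmp[i * 16 + k];
            coeffs[i * 16 + x] = clip16((acc + 2048) >> 12);
        }
}

The e_8/o_8 and e_16/o_16 values in the assembly are, as far as I can
tell, a factored (even/odd butterfly) form of these sums, and the
computed br jumps skip the smlal pairs for rows that col_limit
guarantees to be zero.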
diff --git a/libavcodec/aarch64/hevcdsp_idct_neon.S b/libavcodec/aarch64/hevcdsp_idct_neon.S
index 0869431294..784bae33b3 100644
--- a/libavcodec/aarch64/hevcdsp_idct_neon.S
+++ b/libavcodec/aarch64/hevcdsp_idct_neon.S
@@ -618,3 +618,669 @@ idct_dc 16, 10
idct_dc 32, 8
idct_dc 32, 10
+
+// WIP
+
+.Lo0_coeff: .hword 83, 36, 0, 0, 0, 0, 0, 0
+.Lo8transform0: .hword 89, 75, 50, 18 // transform[4,12,20,28][0]
+.Lo8transform1: .hword 75, -18, -89, -50
+.Lo8transform2: .hword 50, -89, 18, 75
+.Lo8transform3: .hword 18, -50, 75, -89
+
+.LimitMask:
+ .hword 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0
+ .hword 0xffff, 0, 0, 0, 0, 0, 0, 0
+
+.Leo_coeff:
+ .hword 64, 64, 64, 64, 83, 36, -36, -83
+ .hword 64, -64, -64, 64, 36, -83, 83, -36
+ .hword 89, 75, 50, 18, 75, -18, -89, -50 // transform[4,12][0-3]
+ .hword 50, -89, 18, 75, 18, -50, 75, -89 // transform[20,28][0-3]
+.Lo16transform0: .hword 90, 87, 80, 70, 57, 43, 25, 9 // transform[2][0-7], also transform[2,6,10..][0]
+.Lo16transform1: .hword 87, 57, 9, -43, -80, -90, -70, -25 // transform[6][0-7]
+.Lo16transform2: .hword 80, 9, -70, -87, -25, 57, 90, 43 // transform[10][0-7]
+.Lo16transform3: .hword 70, -43, -87, 9, 90, 25, -80, -57 // transform[14][0-7]
+.Lo16transform4: .hword 57, -80, -25, 90, -9, -87, 43, 70 // transform[18][0-7]
+.Lo16transform5: .hword 43, -90, 57, 25, -87, 70, 9, -80 // transform[22][0-7]
+.Lo16transform6: .hword 25, -70, 90, -80, 43, 9, -57, 87 // transform[26][0-7]
+.Lo16transform7: .hword 9, -25, 43, -57, 70, -80, 87, -90 // transform[30][0-7]
+
+// void ff_hevc_idct_16x16_8_neon(int16_t *coeffs, int col_limit)
+function ff_hevc_idct_16x16_8_neon_new, export=1
+ sub sp, sp, 64
+ st1 {v8.16b, v9.16b, v10.16b, v11.16b}, [sp]
+ sub sp, sp, 32
+ st1 {v14.16b, v15.16b}, [sp]
+ mov x3, 0
+ mov x2, x0
+1: mov x4, x2
+ mov x5, 32
+ ld1 {v16.8h}, [x4], x5
+ ld1 {v17.8h}, [x4], x5
+ ld1 {v18.8h}, [x4], x5
+ ld1 {v19.8h}, [x4], x5
+ ld1 {v20.8h}, [x4], x5
+ ld1 {v21.8h}, [x4], x5
+ ld1 {v22.8h}, [x4], x5
+ ld1 {v23.8h}, [x4], x5
+ ld1 {v24.8h}, [x4], x5
+ ld1 {v25.8h}, [x4], x5
+ ld1 {v26.8h}, [x4], x5
+ ld1 {v27.8h}, [x4], x5
+ ld1 {v28.8h}, [x4], x5
+ ld1 {v29.8h}, [x4], x5
+ ld1 {v30.8h}, [x4], x5
+ ld1 {v31.8h}, [x4], x5
+ cmp x1, 12
+ b.hs 5f
+ // limit2 below 16
+ bic x4, x1, 1
+ adr x5, .LimitMask
+ cbnz x3, 3f
+ // columns 0 .. 7 - cleanup of indexes 5 .. 7
+ ld1 {v0.8h}, [x5]
+ adr x5, 2f
+ add x5, x5, x4, lsl 2
+ add x5, x5, x4, lsl 1
+ br x5
+2: and v17.16b, v17.16b, v0.16b // col_limit 0..1 -> limit2 == 4..5
+ and v19.16b, v19.16b, v0.16b
+ b 5f
+ and v19.16b, v19.16b, v0.16b // col_limit 2..3 -> limit2 == 6..7
+ and v21.16b, v21.16b, v0.16b
+ b 5f
+ and v21.16b, v21.16b, v0.16b // col_limit 4..5 -> limit2 == 8..9
+ and v23.16b, v23.16b, v0.16b
+ b 5f
+ and v23.16b, v23.16b, v0.16b // col_limit 6..7 -> limit2 == 10..11
+ and v25.16b, v25.16b, v0.16b
+ b 5f
+ and v25.16b, v25.16b, v0.16b // col_limit 8..9 -> limit2 == 12..13
+ and v27.16b, v27.16b, v0.16b
+ b 5f
+ and v27.16b, v27.16b, v0.16b // col_limit 10..11 -> limit2 == 14..15
+ and v29.16b, v29.16b, v0.16b
+ b 5f
+ // columns 8 .. 15
+3: subs x4, x4, 2
+ b.lo 5f
+ ld1 {v0.8h, v1.8h}, [x5]
+ adr x5, 4f
+ add x5, x5, x4, lsl 3
+ add x5, x5, x4, lsl 1
+ br x5
+4: and v17.16b, v17.16b, v1.16b // col_limit 2..3 -> limit2 == 2..3
+ b 5f
+ nop
+ nop
+ nop
+ and v17.16b, v17.16b, v1.16b // col_limit 4..5 -> limit2 == 4..5
+ and v19.16b, v19.16b, v1.16b
+ b 5f
+ nop
+ nop
+ and v17.16b, v17.16b, v0.16b // col_limit 6..7 -> limit2 == 6..7
+ and v19.16b, v19.16b, v1.16b
+ and v21.16b, v21.16b, v1.16b
+ b 5f
+ nop
+ and v17.16b, v17.16b, v0.16b // col_limit 8..9 -> limit2 == 8..9
+ and v19.16b, v19.16b, v0.16b
+ and v21.16b, v21.16b, v1.16b
+ and v23.16b, v23.16b, v1.16b
+ b 5f
+ and v19.16b, v19.16b, v0.16b // col_limit 10..11 -> limit2 == 10..11
+ and v21.16b, v21.16b, v0.16b
+ and v23.16b, v23.16b, v1.16b
+ and v25.16b, v25.16b, v1.16b
+ b 5f
+5: adr x4, .Lo0_coeff
+ ld1 {v14.8h}, [x4]
+
+ // v0,v1 = e0
+ sshll v0.4s, v16.4h, 6
+ sshll v1.4s, v24.4h, 6
+ add v0.4s, v0.4s, v1.4s
+ sshll2 v1.4s, v16.8h, 6
+ sshll2 v2.4s, v24.8h, 6
+ add v1.4s, v1.4s, v2.4s
+
+ // v2,v3 = o0
+ smull v2.4s, v20.4h, v14.h[0]
+ smlal v2.4s, v28.4h, v14.h[1]
+ smull2 v3.4s, v20.8h, v14.h[0]
+ smlal2 v3.4s, v28.8h, v14.h[1]
+
+ // v4,v5 = e_8[0]
+ add v4.4s, v0.4s, v2.4s
+ add v5.4s, v1.4s, v3.4s
+
+ // v6,v7 = e_8[3]
+ sub v6.4s, v0.4s, v2.4s
+ sub v7.4s, v1.4s, v3.4s
+
+
+ // v0,v1 = o_8[0]
+ adr x4, .Lo8transform0
+ ld1 {v15.4h}, [x4]
+ smull v0.4s, v18.4h, v15.h[0]
+ smlal v0.4s, v22.4h, v15.h[1]
+ smlal v0.4s, v26.4h, v15.h[2]
+ smlal v0.4s, v30.4h, v15.h[3]
+ smull2 v1.4s, v18.8h, v15.h[0]
+ smlal2 v1.4s, v22.8h, v15.h[1]
+ smlal2 v1.4s, v26.8h, v15.h[2]
+ smlal2 v1.4s, v30.8h, v15.h[3]
+
+ // v2,v3 = e_16[0]
+ add v2.4s, v4.4s, v0.4s
+ add v3.4s, v5.4s, v1.4s
+
+ // v8,v9 = o_16[0]
+ adr x4, .Lo16transform0
+ ld1 {v15.8h}, [x4]
+
+ mov x5, 16
+ cmp x1, 12
+ b.hs 6f
+ add x5, x1, 4
+ bic x5, x5, 1
+ cbz x3, 6f
+ orr x5, x1, 1
+ subs x5, x5, 2
+ csel x5, x5, xzr, hs
+6: mov x4, 64
+ sub x6, x4, x5, lsl 2
+ adr x5, 7f
+ add x5, x5, x6
+ movi v8.4s, 0
+ movi v9.4s, 0
+ br x5
+7: smlal2 v9.4s, v31.8h, v15.h[7]
+ smlal v8.4s, v31.4h, v15.h[7]
+ smlal2 v9.4s, v29.8h, v15.h[6]
+ smlal v8.4s, v29.4h, v15.h[6]
+ smlal2 v9.4s, v27.8h, v15.h[5]
+ smlal v8.4s, v27.4h, v15.h[5]
+ smlal2 v9.4s, v25.8h, v15.h[4]
+ smlal v8.4s, v25.4h, v15.h[4]
+ smlal2 v9.4s, v23.8h, v15.h[3]
+ smlal v8.4s, v23.4h, v15.h[3]
+ smlal2 v9.4s, v21.8h, v15.h[2]
+ smlal v8.4s, v21.4h, v15.h[2]
+ smlal2 v9.4s, v19.8h, v15.h[1]
+ smlal v8.4s, v19.4h, v15.h[1]
+ smlal2 v9.4s, v17.8h, v15.h[0]
+ smlal v8.4s, v17.4h, v15.h[0]
+
+ // tmp[0 * 16]
+ add v10.4s, v2.4s, v8.4s
+ add v11.4s, v3.4s, v9.4s
+ sqrshrn v10.4h, v10.4s, 7
+ sqrshrn2 v10.8h, v11.4s, 7
+ st1 {v10.8h}, [x2]
+
+ // tmp[15 * 16]
+ sub v10.4s, v2.4s, v8.4s
+ sub v11.4s, v3.4s, v9.4s
+ sqrshrn v10.4h, v10.4s, 7
+ sqrshrn2 v10.8h, v11.4s, 7
+ add x4, x2, 15 * 32
+ st1 {v10.8h}, [x4]
+
+ // v2,v3 = e_16[7]
+ sub v2.4s, v4.4s, v0.4s
+ sub v3.4s, v5.4s, v1.4s
+
+ // v8,v9 = o_16[7]
+ adr x4, .Lo16transform7
+ ld1 {v15.8h}, [x4]
+ adr x5, 8f
+ add x5, x5, x6
+ movi v8.4s, 0
+ movi v9.4s, 0
+ br x5
+8: smlal2 v9.4s, v31.8h, v15.h[7]
+ smlal v8.4s, v31.4h, v15.h[7]
+ smlal2 v9.4s, v29.8h, v15.h[6]
+ smlal v8.4s, v29.4h, v15.h[6]
+ smlal2 v9.4s, v27.8h, v15.h[5]
+ smlal v8.4s, v27.4h, v15.h[5]
+ smlal2 v9.4s, v25.8h, v15.h[4]
+ smlal v8.4s, v25.4h, v15.h[4]
+ smlal2 v9.4s, v23.8h, v15.h[3]
+ smlal v8.4s, v23.4h, v15.h[3]
+ smlal2 v9.4s, v21.8h, v15.h[2]
+ smlal v8.4s, v21.4h, v15.h[2]
+ smlal2 v9.4s, v19.8h, v15.h[1]
+ smlal v8.4s, v19.4h, v15.h[1]
+ smlal2 v9.4s, v17.8h, v15.h[0]
+ smlal v8.4s, v17.4h, v15.h[0]
+
+ // tmp[7 * 16]
+ add v10.4s, v2.4s, v8.4s
+ add v11.4s, v3.4s, v9.4s
+ sqrshrn v10.4h, v10.4s, 7
+ sqrshrn2 v10.8h, v11.4s, 7
+ add x4, x2, 7 * 32
+ st1 {v10.8h}, [x4]
+
+ // tmp[8 * 16]
+ sub v10.4s, v2.4s, v8.4s
+ sub v11.4s, v3.4s, v9.4s
+ sqrshrn v10.4h, v10.4s, 7
+ sqrshrn2 v10.8h, v11.4s, 7
+ add x4, x2, 8 * 32
+ st1 {v10.8h}, [x4]
+
+ // v0,v1 = o_8[3]
+ adr x4, .Lo8transform3
+ ld1 {v15.4h}, [x4]
+ smull v0.4s, v18.4h, v15.h[0]
+ smlal v0.4s, v22.4h, v15.h[1]
+ smlal v0.4s, v26.4h, v15.h[2]
+ smlal v0.4s, v30.4h, v15.h[3]
+ smull2 v1.4s, v18.8h, v15.h[0]
+ smlal2 v1.4s, v22.8h, v15.h[1]
+ smlal2 v1.4s, v26.8h, v15.h[2]
+ smlal2 v1.4s, v30.8h, v15.h[3]
+
+ // v2,v3 = e_16[3]
+ add v2.4s, v6.4s, v0.4s
+ add v3.4s, v7.4s, v1.4s
+
+ // v8,v9 = o_16[3]
+ adr x4, .Lo16transform3
+ ld1 {v15.8h}, [x4]
+ adr x5, 9f
+ add x5, x5, x6
+ movi v8.4s, 0
+ movi v9.4s, 0
+ br x5
+9: smlal2 v9.4s, v31.8h, v15.h[7]
+ smlal v8.4s, v31.4h, v15.h[7]
+ smlal2 v9.4s, v29.8h, v15.h[6]
+ smlal v8.4s, v29.4h, v15.h[6] // 13
+ smlal2 v9.4s, v27.8h, v15.h[5]
+ smlal v8.4s, v27.4h, v15.h[5] // 11
+ smlal2 v9.4s, v25.8h, v15.h[4]
+ smlal v8.4s, v25.4h, v15.h[4] // 9
+ smlal2 v9.4s, v23.8h, v15.h[3]
+ smlal v8.4s, v23.4h, v15.h[3] // 7
+ smlal2 v9.4s, v21.8h, v15.h[2]
+ smlal v8.4s, v21.4h, v15.h[2] // 5
+ smlal2 v9.4s, v19.8h, v15.h[1]
+ smlal v8.4s, v19.4h, v15.h[1] // 3
+ smlal2 v9.4s, v17.8h, v15.h[0]
+ smlal v8.4s, v17.4h, v15.h[0] // 1
+
+ // tmp[3 * 16]
+ add v10.4s, v2.4s, v8.4s
+ add v11.4s, v3.4s, v9.4s
+ sqrshrn v10.4h, v10.4s, 7
+ sqrshrn2 v10.8h, v11.4s, 7
+ add x4, x2, 3 * 32
+ st1 {v10.8h}, [x4]
+
+ // tmp[12 * 16]
+ sub v10.4s, v2.4s, v8.4s
+ sub v11.4s, v3.4s, v9.4s
+ sqrshrn v10.4h, v10.4s, 7
+ sqrshrn2 v10.8h, v11.4s, 7
+ add x4, x2, 12 * 32
+ st1 {v10.8h}, [x4]
+
+ // v2,v3 = e_16[4]
+ sub v2.4s, v6.4s, v0.4s
+ sub v3.4s, v7.4s, v1.4s
+
+ // v8,v9 = o_16[4]
+ adr x4, .Lo16transform4
+ ld1 {v15.8h}, [x4]
+ adr x5, 10f
+ add x5, x5, x6
+ movi v8.4s, 0
+ movi v9.4s, 0
+ br x5
+10: smlal2 v9.4s, v31.8h, v15.h[7]
+ smlal v8.4s, v31.4h, v15.h[7]
+ smlal2 v9.4s, v29.8h, v15.h[6]
+ smlal v8.4s, v29.4h, v15.h[6]
+ smlal2 v9.4s, v27.8h, v15.h[5]
+ smlal v8.4s, v27.4h, v15.h[5]
+ smlal2 v9.4s, v25.8h, v15.h[4]
+ smlal v8.4s, v25.4h, v15.h[4]
+ smlal2 v9.4s, v23.8h, v15.h[3]
+ smlal v8.4s, v23.4h, v15.h[3]
+ smlal2 v9.4s, v21.8h, v15.h[2]
+ smlal v8.4s, v21.4h, v15.h[2]
+ smlal2 v9.4s, v19.8h, v15.h[1]
+ smlal v8.4s, v19.4h, v15.h[1]
+ smlal2 v9.4s, v17.8h, v15.h[0]
+ smlal v8.4s, v17.4h, v15.h[0]
+
+ // tmp[4 * 16]
+ add v10.4s, v2.4s, v8.4s
+ add v11.4s, v3.4s, v9.4s
+ sqrshrn v10.4h, v10.4s, 7
+ sqrshrn2 v10.8h, v11.4s, 7
+ add x4, x2, 4 * 32
+ st1 {v10.8h}, [x4]
+
+ // tmp[11 * 16]
+ sub v10.4s, v2.4s, v8.4s
+ sub v11.4s, v3.4s, v9.4s
+ sqrshrn v10.4h, v10.4s, 7
+ sqrshrn2 v10.8h, v11.4s, 7
+ add x4, x2, 11 * 32
+ st1 {v10.8h}, [x4]
+
+
+ // v0,v1 = e1
+ sshll v0.4s, v16.4h, 6
+ sshll v1.4s, v24.4h, 6
+ sub v0.4s, v0.4s, v1.4s
+ sshll2 v1.4s, v16.8h, 6
+ sshll2 v2.4s, v24.8h, 6
+ sub v1.4s, v1.4s, v2.4s
+
+ // v2,v3 = o1
+ smull v2.4s, v20.4h, v14.h[1]
+ smlsl v2.4s, v28.4h, v14.h[0]
+ smull2 v3.4s, v20.8h, v14.h[1]
+ smlsl2 v3.4s, v28.8h, v14.h[0]
+
+ // v4,v5 = e_8[1]
+ add v4.4s, v0.4s, v2.4s
+ add v5.4s, v1.4s, v3.4s
+
+ // v6,v7 = e_8[2]
+ sub v6.4s, v0.4s, v2.4s
+ sub v7.4s, v1.4s, v3.4s
+
+ // v0,v1 = o_8[1]
+ adr x4, .Lo8transform1
+ ld1 {v15.4h}, [x4]
+ smull v0.4s, v18.4h, v15.h[0]
+ smlal v0.4s, v22.4h, v15.h[1]
+ smlal v0.4s, v26.4h, v15.h[2]
+ smlal v0.4s, v30.4h, v15.h[3]
+ smull2 v1.4s, v18.8h, v15.h[0]
+ smlal2 v1.4s, v22.8h, v15.h[1]
+ smlal2 v1.4s, v26.8h, v15.h[2]
+ smlal2 v1.4s, v30.8h, v15.h[3]
+
+ // v2,v3 = e_16[1]
+ add v2.4s, v4.4s, v0.4s
+ add v3.4s, v5.4s, v1.4s
+
+ // v8,v9 = o_16[1]
+ adr x4, .Lo16transform1
+ ld1 {v15.8h}, [x4]
+ adr x5, 11f
+ add x5, x5, x6
+ movi v8.4s, 0
+ movi v9.4s, 0
+ br x5
+11: smlal2 v9.4s, v31.8h, v15.h[7]
+ smlal v8.4s, v31.4h, v15.h[7]
+ smlal2 v9.4s, v29.8h, v15.h[6]
+ smlal v8.4s, v29.4h, v15.h[6]
+ smlal2 v9.4s, v27.8h, v15.h[5]
+ smlal v8.4s, v27.4h, v15.h[5]
+ smlal2 v9.4s, v25.8h, v15.h[4]
+ smlal v8.4s, v25.4h, v15.h[4]
+ smlal2 v9.4s, v23.8h, v15.h[3]
+ smlal v8.4s, v23.4h, v15.h[3]
+ smlal2 v9.4s, v21.8h, v15.h[2]
+ smlal v8.4s, v21.4h, v15.h[2]
+ smlal2 v9.4s, v19.8h, v15.h[1]
+ smlal v8.4s, v19.4h, v15.h[1]
+ smlal2 v9.4s, v17.8h, v15.h[0]
+ smlal v8.4s, v17.4h, v15.h[0]
+
+ // tmp[1 * 16]
+ add v10.4s, v2.4s, v8.4s
+ add v11.4s, v3.4s, v9.4s
+ sqrshrn v10.4h, v10.4s, 7
+ sqrshrn2 v10.8h, v11.4s, 7
+ add x4, x2, 1 * 32
+ st1 {v10.8h}, [x4]
+
+ // tmp[14 * 16]
+ sub v10.4s, v2.4s, v8.4s
+ sub v11.4s, v3.4s, v9.4s
+ sqrshrn v10.4h, v10.4s, 7
+ sqrshrn2 v10.8h, v11.4s, 7
+ add x4, x2, 14 * 32
+ st1 {v10.8h}, [x4]
+
+ // v2,v3 = e_16[6]
+ sub v2.4s, v4.4s, v0.4s
+ sub v3.4s, v5.4s, v1.4s
+
+ // v8,v9 = o_16[6]
+ adr x4, .Lo16transform6
+ ld1 {v15.8h}, [x4]
+ adr x5, 12f
+ add x5, x5, x6
+ movi v8.4s, 0
+ movi v9.4s, 0
+ br x5
+12: smlal2 v9.4s, v31.8h, v15.h[7]
+ smlal v8.4s, v31.4h, v15.h[7]
+ smlal2 v9.4s, v29.8h, v15.h[6]
+ smlal v8.4s, v29.4h, v15.h[6]
+ smlal2 v9.4s, v27.8h, v15.h[5]
+ smlal v8.4s, v27.4h, v15.h[5]
+ smlal2 v9.4s, v25.8h, v15.h[4]
+ smlal v8.4s, v25.4h, v15.h[4]
+ smlal2 v9.4s, v23.8h, v15.h[3]
+ smlal v8.4s, v23.4h, v15.h[3]
+ smlal2 v9.4s, v21.8h, v15.h[2]
+ smlal v8.4s, v21.4h, v15.h[2]
+ smlal2 v9.4s, v19.8h, v15.h[1]
+ smlal v8.4s, v19.4h, v15.h[1]
+ smlal2 v9.4s, v17.8h, v15.h[0]
+ smlal v8.4s, v17.4h, v15.h[0]
+
+ // tmp[6 * 16]
+ add v10.4s, v2.4s, v8.4s
+ add v11.4s, v3.4s, v9.4s
+ sqrshrn v10.4h, v10.4s, 7
+ sqrshrn2 v10.8h, v11.4s, 7
+ add x4, x2, 6 * 32
+ st1 {v10.8h}, [x4]
+
+ // tmp[9 * 16]
+ sub v10.4s, v2.4s, v8.4s
+ sub v11.4s, v3.4s, v9.4s
+ sqrshrn v10.4h, v10.4s, 7
+ sqrshrn2 v10.8h, v11.4s, 7
+ add x4, x2, 9 * 32
+ st1 {v10.8h}, [x4]
+
+ // v0,v1 = o_8[2]
+ adr x4, .Lo8transform2
+ ld1 {v15.4h}, [x4]
+ smull v0.4s, v18.4h, v15.h[0]
+ smlal v0.4s, v22.4h, v15.h[1]
+ smlal v0.4s, v26.4h, v15.h[2]
+ smlal v0.4s, v30.4h, v15.h[3]
+ smull2 v1.4s, v18.8h, v15.h[0]
+ smlal2 v1.4s, v22.8h, v15.h[1]
+ smlal2 v1.4s, v26.8h, v15.h[2]
+ smlal2 v1.4s, v30.8h, v15.h[3]
+
+ // v2,v3 = e_16[2]
+ add v2.4s, v6.4s, v0.4s
+ add v3.4s, v7.4s, v1.4s
+
+ // v8,v9 = o_16[2]
+ adr x4, .Lo16transform2
+ ld1 {v15.8h}, [x4]
+ adr x5, 13f
+ add x5, x5, x6
+ movi v8.4s, 0
+ movi v9.4s, 0
+ br x5
+13: smlal2 v9.4s, v31.8h, v15.h[7]
+ smlal v8.4s, v31.4h, v15.h[7]
+ smlal2 v9.4s, v29.8h, v15.h[6]
+ smlal v8.4s, v29.4h, v15.h[6]
+ smlal2 v9.4s, v27.8h, v15.h[5]
+ smlal v8.4s, v27.4h, v15.h[5]
+ smlal2 v9.4s, v25.8h, v15.h[4]
+ smlal v8.4s, v25.4h, v15.h[4]
+ smlal2 v9.4s, v23.8h, v15.h[3]
+ smlal v8.4s, v23.4h, v15.h[3]
+ smlal2 v9.4s, v21.8h, v15.h[2]
+ smlal v8.4s, v21.4h, v15.h[2]
+ smlal2 v9.4s, v19.8h, v15.h[1]
+ smlal v8.4s, v19.4h, v15.h[1]
+ smlal2 v9.4s, v17.8h, v15.h[0]
+ smlal v8.4s, v17.4h, v15.h[0]
+
+ // tmp[2 * 16]
+ add v10.4s, v2.4s, v8.4s
+ add v11.4s, v3.4s, v9.4s
+ sqrshrn v10.4h, v10.4s, 7
+ sqrshrn2 v10.8h, v11.4s, 7
+ add x4, x2, 2 * 32
+ st1 {v10.8h}, [x4]
+
+ // tmp[13 * 16]
+ sub v10.4s, v2.4s, v8.4s
+ sub v11.4s, v3.4s, v9.4s
+ sqrshrn v10.4h, v10.4s, 7
+ sqrshrn2 v10.8h, v11.4s, 7
+ add x4, x2, 13 * 32
+ st1 {v10.8h}, [x4]
+
+ // v2,v3 = e_16[5]
+ sub v2.4s, v6.4s, v0.4s
+ sub v3.4s, v7.4s, v1.4s
+
+ // v8,v9 = o_16[5]
+ adr x4, .Lo16transform5
+ ld1 {v15.8h}, [x4]
+ adr x5, 14f
+ add x5, x5, x6
+ movi v8.4s, 0
+ movi v9.4s, 0
+ br x5
+14: smlal2 v9.4s, v31.8h, v15.h[7]
+ smlal v8.4s, v31.4h, v15.h[7]
+ smlal2 v9.4s, v29.8h, v15.h[6]
+ smlal v8.4s, v29.4h, v15.h[6]
+ smlal2 v9.4s, v27.8h, v15.h[5]
+ smlal v8.4s, v27.4h, v15.h[5]
+ smlal2 v9.4s, v25.8h, v15.h[4]
+ smlal v8.4s, v25.4h, v15.h[4]
+ smlal2 v9.4s, v23.8h, v15.h[3]
+ smlal v8.4s, v23.4h, v15.h[3]
+ smlal2 v9.4s, v21.8h, v15.h[2]
+ smlal v8.4s, v21.4h, v15.h[2]
+ smlal2 v9.4s, v19.8h, v15.h[1]
+ smlal v8.4s, v19.4h, v15.h[1]
+ smlal2 v9.4s, v17.8h, v15.h[0]
+ smlal v8.4s, v17.4h, v15.h[0]
+
+ // tmp[5 * 16]
+ add v10.4s, v2.4s, v8.4s
+ add v11.4s, v3.4s, v9.4s
+ sqrshrn v10.4h, v10.4s, 7
+ sqrshrn2 v10.8h, v11.4s, 7
+ add x4, x2, 5 * 32
+ st1 {v10.8h}, [x4]
+
+ // tmp[10 * 16]
+ sub v10.4s, v2.4s, v8.4s
+ sub v11.4s, v3.4s, v9.4s
+ sqrshrn v10.4h, v10.4s, 7
+ sqrshrn2 v10.8h, v11.4s, 7
+ add x4, x2, 10 * 32
+ st1 {v10.8h}, [x4]
+
+ add x2, x2, 16
+ add x3, x3, 1
+ cmp x3, 2
+ b.lo 1b
+
+
+ // horizontal transform
+ adr x4, .Leo_coeff
+ ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x4], 64
+ ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x4], 64
+ ld1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x4], 64
+ // o_16 jump address
+ mov x4, 64
+ bic x5, x1, 1
+ subs x4, x4, x5, lsl 2
+ csel x4, x4, xzr, hs
+ adr x5, 15f
+ add x5, x5, x4
+
+ mov x3, 16
+14: ld1 {v6.8h, v7.8h}, [x0]
+
+ // v2 = e_8
+ smull v2.4s, v16.4h, v6.h[0]
+ smlal2 v2.4s, v16.8h, v6.h[4]
+ smlal v2.4s, v17.4h, v7.h[0]
+ smlal2 v2.4s, v17.8h, v7.h[4]
+
+ // v3 = o_8
+ smull v3.4s, v18.4h, v6.h[2]
+ smlal2 v3.4s, v18.8h, v6.h[6]
+ smlal v3.4s, v19.4h, v7.h[2]
+ smlal2 v3.4s, v19.8h, v7.h[6]
+
+ // v0,v1 = e_16
+ add v0.4s, v2.4s, v3.4s
+ sub v2.4s, v2.4s, v3.4s
+ mov v1.d[0], v2.d[1]
+ mov v1.d[1], v2.d[0]
+ rev64 v1.4s, v1.4s
+
+ // v2,v3 = o_16
+ movi v2.4s, 0
+ movi v3.4s, 0
+ br x5
+15: smlal v2.4s, v27.4h, v7.h[7]
+ smlal2 v3.4s, v27.8h, v7.h[7]
+ smlal v2.4s, v26.4h, v7.h[5]
+ smlal2 v3.4s, v26.8h, v7.h[5]
+ smlal v2.4s, v25.4h, v7.h[3]
+ smlal2 v3.4s, v25.8h, v7.h[3]
+ smlal v2.4s, v24.4h, v7.h[1]
+ smlal2 v3.4s, v24.8h, v7.h[1]
+ smlal v2.4s, v23.4h, v6.h[7]
+ smlal2 v3.4s, v23.8h, v6.h[7]
+ smlal v2.4s, v22.4h, v6.h[5]
+ smlal2 v3.4s, v22.8h, v6.h[5]
+ smlal v2.4s, v21.4h, v6.h[3]
+ smlal2 v3.4s, v21.8h, v6.h[3]
+ smlal v2.4s, v20.4h, v6.h[1]
+ smlal2 v3.4s, v20.8h, v6.h[1]
+
+ // coeff
+ add v4.4s, v0.4s, v2.4s
+ add v5.4s, v1.4s, v3.4s
+ sub v6.4s, v0.4s, v2.4s
+ sub v7.4s, v1.4s, v3.4s
+ sqrshrn v4.4h, v4.4s, 12
+ sqrshrn2 v4.8h, v5.4s, 12
+ sqrshrn v6.4h, v6.4s, 12
+ sqrshrn2 v6.8h, v7.4s, 12
+ mov v5.d[0], v6.d[1]
+ mov v5.d[1], v6.d[0]
+ rev64 v5.8h, v5.8h
+ st1 {v4.8h, v5.8h}, [x0], 32
+ subs x3, x3, 1
+ b.ne 14b
+
+ ld1 {v14.16b, v15.16b}, [sp], 32
+ ld1 {v8.16b, v9.16b, v10.16b, v11.16b}, [sp], 64
+ ret
+endfunc
diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c
index 2002530266..612ebb9541 100644
--- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -45,6 +45,7 @@ void ff_hevc_idct_8x8_8_neon(int16_t *coeffs, int col_limit);
void ff_hevc_idct_8x8_10_neon(int16_t *coeffs, int col_limit);
void ff_hevc_idct_16x16_8_neon(int16_t *coeffs, int col_limit);
void ff_hevc_idct_16x16_10_neon(int16_t *coeffs, int col_limit);
+void ff_hevc_idct_16x16_8_neon_new(int16_t *coeffs, int col_limit);
void ff_hevc_idct_4x4_dc_8_neon(int16_t *coeffs);
void ff_hevc_idct_8x8_dc_8_neon(int16_t *coeffs);
void ff_hevc_idct_16x16_dc_8_neon(int16_t *coeffs);
@@ -72,7 +73,7 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
c->add_residual[2] = ff_hevc_add_residual_16x16_8_neon;
c->add_residual[3] = ff_hevc_add_residual_32x32_8_neon;
c->idct[1] = ff_hevc_idct_8x8_8_neon;
- c->idct[2] = ff_hevc_idct_16x16_8_neon;
+ c->idct[2] = ff_hevc_idct_16x16_8_neon_new;
c->idct_dc[0] = ff_hevc_idct_4x4_dc_8_neon;
c->idct_dc[1] = ff_hevc_idct_8x8_dc_8_neon;
c->idct_dc[2] = ff_hevc_idct_16x16_dc_8_neon;
--
2.32.0 (Apple Git-132)
* [FFmpeg-devel] [PATCH 2/2] lavc/aarch64: add 8-bit hevc 32x32 idct
2022-06-23 12:23 [FFmpeg-devel] [PATCH 1/2] lavc/aarch64: new 8-bit hevc 16x16 idct J. Dekker
@ 2022-06-23 12:23 ` J. Dekker
2022-08-09 12:15 ` [FFmpeg-devel] [PATCH 1/2] lavc/aarch64: new 8-bit hevc 16x16 idct Martin Storsjö
1 sibling, 0 replies; 3+ messages in thread
From: J. Dekker @ 2022-06-23 12:23 UTC (permalink / raw)
To: ffmpeg-devel
hevc_idct_32x32_8_c: 40128.5
hevc_idct_32x32_8_neon: 7102.0
Co-developed-by: Rafal Dabrowa <fatwildcat@gmail.com>
Signed-off-by: J. Dekker <jdek@itanimul.li>
---
libavcodec/aarch64/hevcdsp_idct_neon.S | 1265 +++++++++++++++++++++
libavcodec/aarch64/hevcdsp_init_aarch64.c | 2 +
2 files changed, 1267 insertions(+)
Written by the same author as the 16x16 idct in the previous patch; the
same concern applies here.
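
As with the 16x16 patch, here is a rough C sketch of one column of the
32-point pass, for orientation only. It is my paraphrase, with
placeholders: o32_coeff[][] stands for the odd basis rows kept in
.Lo32transform, idct16_col_sketch() for the 16-point transform of the
even input rows, and clip16() for the sqrshrn-style saturation; none of
these are lavc names:

#include <stdint.h>

extern const int8_t o32_coeff[16][16];                 /* odd basis rows */
int64_t idct16_col_sketch(const int16_t *col, int sstep, int k);

static int16_t clip16(int64_t v)
{
    return v < INT16_MIN ? INT16_MIN : v > INT16_MAX ? INT16_MAX : (int16_t)v;
}

/* one 32-point column (stride 32): even/odd butterfly, rounded by 7 bits */
static void idct32_col_sketch(int16_t *col)
{
    int32_t o32[16];                /* the o_32 stack buffer in the asm */
    for (int k = 0; k < 16; k++) {
        int64_t o = 0;
        for (int j = 0; j < 16; j++)            /* odd rows 1,3,...,31 */
            o += (int64_t)o32_coeff[k][j] * col[(2 * j + 1) * 32];
        o32[k] = (int32_t)o;
    }
    for (int k = 0; k < 16; k++) {
        int64_t e = idct16_col_sketch(col, 2 * 32, k);  /* even rows */
        col[k * 32]        = clip16((e + o32[k] + 64) >> 7);
        col[(31 - k) * 32] = clip16((e - o32[k] + 64) >> 7);
    }
}

The assembly computes the whole o_32 half first and parks it in the
stack buffer allocated at the top of the function, then reloads each
entry as it finishes the matching e_32 value.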
diff --git a/libavcodec/aarch64/hevcdsp_idct_neon.S b/libavcodec/aarch64/hevcdsp_idct_neon.S
index 784bae33b3..3b6e95153f 100644
--- a/libavcodec/aarch64/hevcdsp_idct_neon.S
+++ b/libavcodec/aarch64/hevcdsp_idct_neon.S
@@ -644,6 +644,40 @@ idct_dc 32, 10
.Lo16transform5: .hword 43, -90, 57, 25, -87, 70, 9, -80 // transform[22][0-7]
.Lo16transform6: .hword 25, -70, 90, -80, 43, 9, -57, 87 // transform[26][0-7]
.Lo16transform7: .hword 9, -25, 43, -57, 70, -80, 87, -90 // transform[30][0-7]
+.Lo32transform:
+ .hword 90, 90, 88, 85, 82, 78, 73, 67 // transform[1,3,5,7..15][1]
+ .hword 61, 54, 46, 38, 31, 22, 13, 4 // transform[17,19,21..31][1]
+ .hword 90, 82, 67, 46, 22, -4, -31, -54 // transform[1,3,5,7..15][3]
+ .hword -73, -85, -90, -88, -78, -61, -38, -13 // transform[17,19,21..31][3]
+ .hword 88, 67, 31, -13, -54, -82, -90, -78 // ..
+ .hword -46, -4, 38, 73, 90, 85, 61, 22
+ .hword 85, 46, -13, -67, -90, -73, -22, 38
+ .hword 82, 88, 54, -4, -61, -90, -78, -31
+.Lo32transform9_31:
+ .hword 82, 22, -54, -90, -61, 13, 78, 85
+ .hword 31, -46, -90, -67, 4, 73, 88, 38
+ .hword 78, -4, -82, -73, 13, 85, 67, -22
+ .hword -88, -61, 31, 90, 54, -38, -90, -46
+ .hword 73, -31, -90, -22, 78, 67, -38, -90
+ .hword -13, 82, 61, -46, -88, -4, 85, 54
+ .hword 67, -54, -78, 38, 85, -22, -90, 4
+ .hword 90, 13, -88, -31, 82, 46, -73, -61
+ .hword 61, -73, -46, 82, 31, -88, -13, 90
+ .hword -4, -90, 22, 85, -38, -78, 54, 67
+ .hword 54, -85, -4, 88, -46, -61, 82, 13
+ .hword -90, 38, 67, -78, -22, 90, -31, -73
+ .hword 46, -90, 38, 54, -90, 31, 61, -88
+ .hword 22, 67, -85, 13, 73, -82, 4, 78
+ .hword 38, -88, 73, -4, -67, 90, -46, -31
+ .hword 85, -78, 13, 61, -90, 54, 22, -82
+ .hword 31, -78, 90, -61, 4, 54, -88, 82
+ .hword -38, -22, 73, -90, 67, -13, -46, 85
+ .hword 22, -61, 85, -90, 73, -38, -4, 46
+ .hword -78, 90, -82, 54, -13, -31, 67, -88
+ .hword 13, -38, 61, -78, 88, -90, 85, -73
+ .hword 54, -31, 4, 22, -46, 67, -82, 90
+ .hword 4, -13, 22, -31, 38, -46, 54, -61 // transform[1,3,5,7..15][31]
+ .hword 67, -73, 78, -82, 85, -88, 90, -90 // transform[17,19,21..31][31]
// void ff_hevc_idct_16x16_8_neon(int16_t *coeffs, int col_limit)
function ff_hevc_idct_16x16_8_neon_new, export=1
@@ -1284,3 +1318,1234 @@ function ff_hevc_idct_16x16_8_neon_new, export=1
ld1 {v8.16b, v9.16b, v10.16b, v11.16b}, [sp], 64
ret
endfunc
+
+function ff_hevc_idct_32x32_8_neon, export=1
+ sub sp, sp, 64
+ st1 {v8.16b, v9.16b, v10.16b, v11.16b}, [sp]
+ sub sp, sp, 64
+ st1 {v12.16b, v13.16b, v14.16b, v15.16b}, [sp]
+ sub sp, sp, 16 * 32 * 4 // room for o_32: 16 * 32 values
+ mov x3, 0 // loop counter
+ mov x2, x0
+ mov x7, 83
+ add x7, x7, 36 * 65536 // o0, o1 coeff. factors
+1: mov x9, 128
+ // loading odd lines
+ add x4, x2, 64 // odd lines
+ ld1 {v16.8h}, [x4], x9 // line 1
+ ld1 {v17.8h}, [x4], x9 // line 3
+ ld1 {v18.8h}, [x4], x9 // line 5
+ ld1 {v19.8h}, [x4], x9 // line 7
+ ld1 {v20.8h}, [x4], x9 // line 9
+ ld1 {v21.8h}, [x4], x9 // line 11
+ ld1 {v22.8h}, [x4], x9 // line 13
+ ld1 {v23.8h}, [x4], x9 // line 15
+ ld1 {v24.8h}, [x4], x9 // line 17
+ ld1 {v25.8h}, [x4], x9 // line 19
+ ld1 {v26.8h}, [x4], x9 // line 21
+ ld1 {v27.8h}, [x4], x9 // line 23
+ ld1 {v28.8h}, [x4], x9 // line 25
+ ld1 {v29.8h}, [x4], x9 // line 27
+ ld1 {v30.8h}, [x4], x9 // line 29
+ ld1 {v31.8h}, [x4], x9 // line 31
+
+ cmp x1, 28
+ b.hs 5f
+ // limit2 below 32
+ bic x4, x1, 1
+ adr x5, .LimitMask
+ cbnz x3, 3f
+ // columns 0 .. 7 - cleanup of indexes 5 .. 7
+ ld1 {v0.8h}, [x5]
+ adr x5, 2f
+ add x5, x5, x4, lsl 2
+ add x5, x5, x4, lsl 1
+ br x5
+2: and v16.16b, v16.16b, v0.16b // col_limit 0..1 -> limit2 == 4..5
+ and v17.16b, v17.16b, v0.16b
+ b 5f
+ and v17.16b, v17.16b, v0.16b // col_limit 2..3 -> limit2 == 6..7
+ and v18.16b, v18.16b, v0.16b
+ b 5f
+ and v18.16b, v18.16b, v0.16b // col_limit 4..5 -> limit2 == 8..9
+ and v19.16b, v19.16b, v0.16b
+ b 5f
+ and v19.16b, v19.16b, v0.16b // col_limit 6..7 -> limit2 == 10..11
+ and v20.16b, v20.16b, v0.16b
+ b 5f
+ and v20.16b, v20.16b, v0.16b // col_limit 8..9 -> limit2 == 12..13
+ and v21.16b, v21.16b, v0.16b
+ b 5f
+ and v21.16b, v21.16b, v0.16b // col_limit 10..11 -> limit2 == 14..15
+ and v22.16b, v22.16b, v0.16b
+ b 5f
+ and v22.16b, v22.16b, v0.16b // col_limit 12..13 -> limit2 == 16..17
+ and v23.16b, v23.16b, v0.16b
+ b 5f
+ and v23.16b, v23.16b, v0.16b // col_limit 14..15 -> limit2 == 18..19
+ and v24.16b, v24.16b, v0.16b
+ b 5f
+ and v24.16b, v24.16b, v0.16b // col_limit 16..17 -> limit2 == 20..21
+ and v25.16b, v25.16b, v0.16b
+ b 5f
+ and v25.16b, v25.16b, v0.16b // col_limit 18..19 -> limit2 == 22..23
+ and v26.16b, v26.16b, v0.16b
+ b 5f
+ and v26.16b, v26.16b, v0.16b // col_limit 20..21 -> limit2 == 24..25
+ and v27.16b, v27.16b, v0.16b
+ b 5f
+ and v27.16b, v27.16b, v0.16b // col_limit 22..23 -> limit2 == 26..27
+ and v28.16b, v28.16b, v0.16b
+ b 5f
+ and v28.16b, v28.16b, v0.16b // col_limit 24..25 -> limit2 == 28..29
+ and v29.16b, v29.16b, v0.16b
+ b 5f
+ and v29.16b, v29.16b, v0.16b // col_limit 26..27 -> limit2 == 30..31
+ and v30.16b, v30.16b, v0.16b
+ b 5f
+ // columns 8 .. 31
+3: add x4, x4, 6
+ subs x4, x4, x3, lsl 3
+ b.lo 5f
+ ld1 {v0.8h, v1.8h}, [x5]
+ adr x5, 4f
+ add x5, x5, x4, lsl 3
+ add x5, x5, x4, lsl 1
+ br x5
+4: and v16.16b, v16.16b, v1.16b // limit2 == 2..3
+ b 5f
+ nop
+ nop
+ nop
+ and v16.16b, v16.16b, v1.16b // limit2 == 4..5
+ and v17.16b, v17.16b, v1.16b
+ b 5f
+ nop
+ nop
+ and v16.16b, v16.16b, v0.16b // limit2 == 6..7
+ and v17.16b, v17.16b, v1.16b
+ and v18.16b, v18.16b, v1.16b
+ b 5f
+ nop
+ and v16.16b, v16.16b, v0.16b // limit2 == 8..9
+ and v17.16b, v17.16b, v0.16b
+ and v18.16b, v18.16b, v1.16b
+ and v19.16b, v19.16b, v1.16b
+ b 5f
+ and v17.16b, v17.16b, v0.16b // limit2 == 10..11
+ and v18.16b, v18.16b, v0.16b
+ and v19.16b, v19.16b, v1.16b
+ and v20.16b, v20.16b, v1.16b
+ b 5f
+ and v18.16b, v18.16b, v0.16b // limit2 == 12..13
+ and v19.16b, v19.16b, v0.16b
+ and v20.16b, v20.16b, v1.16b
+ and v21.16b, v21.16b, v1.16b
+ b 5f
+ and v19.16b, v19.16b, v0.16b // limit2 == 14..15
+ and v20.16b, v20.16b, v0.16b
+ and v21.16b, v21.16b, v1.16b
+ and v22.16b, v22.16b, v1.16b
+ b 5f
+ and v20.16b, v20.16b, v0.16b // limit2 == 16..17
+ and v21.16b, v21.16b, v0.16b
+ and v22.16b, v22.16b, v1.16b
+ and v23.16b, v23.16b, v1.16b
+ b 5f
+ and v21.16b, v21.16b, v0.16b // limit2 == 18..19
+ and v22.16b, v22.16b, v0.16b
+ and v23.16b, v23.16b, v1.16b
+ and v24.16b, v24.16b, v1.16b
+ b 5f
+ and v22.16b, v22.16b, v0.16b // limit2 == 20..21
+ and v23.16b, v23.16b, v0.16b
+ and v24.16b, v24.16b, v1.16b
+ and v25.16b, v25.16b, v1.16b
+ b 5f
+ and v23.16b, v23.16b, v0.16b // limit2 == 22..23
+ and v24.16b, v24.16b, v0.16b
+ and v25.16b, v25.16b, v1.16b
+ and v26.16b, v26.16b, v1.16b
+ b 5f
+ and v24.16b, v24.16b, v0.16b // limit2 == 24..25
+ and v25.16b, v25.16b, v0.16b
+ and v26.16b, v26.16b, v1.16b
+ and v27.16b, v27.16b, v1.16b
+ b 5f
+ and v25.16b, v25.16b, v0.16b // limit2 == 26..27
+ and v26.16b, v26.16b, v0.16b
+ and v27.16b, v27.16b, v1.16b
+ and v28.16b, v28.16b, v1.16b
+ b 5f
+
+
+ // o_32
+5: mov x5, 32
+ cmp x1, 28
+ b.hs 6f
+ add x5, x1, 4
+ bic x5, x5, 1
+ cbz x3, 6f
+ add x5, x1, 6
+ orr x5, x5, 1
+ subs x5, x5, x3, lsl 3
+ csel x5, x5, xzr, hs
+6: mov x4, 128
+ sub x4, x4, x5, lsl 2
+ adr x5, 8f
+ add x5, x5, x4
+ adr x4, .Lo32transform
+ mov x8, sp
+ mov x6, 16
+7: ld1 {v2.8h, v3.8h}, [x4], 32
+ movi v8.4s, 0
+ movi v9.4s, 0
+ br x5
+8: smlal2 v9.4s, v31.8h, v3.h[7]
+ smlal v8.4s, v31.4h, v3.h[7]
+ smlal2 v9.4s, v30.8h, v3.h[6]
+ smlal v8.4s, v30.4h, v3.h[6]
+ smlal2 v9.4s, v29.8h, v3.h[5]
+ smlal v8.4s, v29.4h, v3.h[5]
+ smlal2 v9.4s, v28.8h, v3.h[4]
+ smlal v8.4s, v28.4h, v3.h[4]
+ smlal2 v9.4s, v27.8h, v3.h[3]
+ smlal v8.4s, v27.4h, v3.h[3]
+ smlal2 v9.4s, v26.8h, v3.h[2]
+ smlal v8.4s, v26.4h, v3.h[2]
+ smlal2 v9.4s, v25.8h, v3.h[1]
+ smlal v8.4s, v25.4h, v3.h[1]
+ smlal2 v9.4s, v24.8h, v3.h[0]
+ smlal v8.4s, v24.4h, v3.h[0]
+ smlal2 v9.4s, v23.8h, v2.h[7]
+ smlal v8.4s, v23.4h, v2.h[7]
+ smlal2 v9.4s, v22.8h, v2.h[6]
+ smlal v8.4s, v22.4h, v2.h[6]
+ smlal2 v9.4s, v21.8h, v2.h[5]
+ smlal v8.4s, v21.4h, v2.h[5]
+ smlal2 v9.4s, v20.8h, v2.h[4]
+ smlal v8.4s, v20.4h, v2.h[4]
+ smlal2 v9.4s, v19.8h, v2.h[3]
+ smlal v8.4s, v19.4h, v2.h[3]
+ smlal2 v9.4s, v18.8h, v2.h[2]
+ smlal v8.4s, v18.4h, v2.h[2]
+ smlal2 v9.4s, v17.8h, v2.h[1]
+ smlal v8.4s, v17.4h, v2.h[1]
+ smlal2 v9.4s, v16.8h, v2.h[0]
+ smlal v8.4s, v16.4h, v2.h[0]
+ st1 {v8.4s, v9.4s}, [x8], 32
+ subs x6, x6, 1
+ b.ne 7b
+
+ mov x4, x2
+ ld1 {v16.8h}, [x4], x9 // line 0
+ ld1 {v17.8h}, [x4], x9 // line 2
+ ld1 {v18.8h}, [x4], x9 // line 4
+ ld1 {v19.8h}, [x4], x9 // line 6
+ ld1 {v20.8h}, [x4], x9 // line 8
+ ld1 {v21.8h}, [x4], x9 // line 10
+ ld1 {v22.8h}, [x4], x9 // line 12
+ ld1 {v23.8h}, [x4], x9 // line 14
+ ld1 {v24.8h}, [x4], x9 // line 16
+ ld1 {v25.8h}, [x4], x9 // line 18
+ ld1 {v26.8h}, [x4], x9 // line 20
+ ld1 {v27.8h}, [x4], x9 // line 22
+ ld1 {v28.8h}, [x4], x9 // line 24
+ ld1 {v29.8h}, [x4], x9 // line 26
+ ld1 {v30.8h}, [x4], x9 // line 28
+ ld1 {v31.8h}, [x4], x9 // line 30
+ cmp x1, 28
+ b.hs 12f
+ // limit2 below 32
+ bic x4, x1, 3
+ cbnz x3, 10f
+ // columns 0 .. 7 - cleanup of indexes 5 .. 7
+ adr x5, 9f
+ add x5, x5, x4, lsl 1
+ br x5
+9: and v17.16b, v17.16b, v0.16b // col_limit 0..3 -> limit2/2 == 2..3
+ b 12f
+ and v19.16b, v19.16b, v0.16b // col_limit 4..7 -> limit2/2 == 4..5
+ b 12f
+ and v21.16b, v21.16b, v0.16b // col_limit 8..11 -> limit2/2 == 6..7
+ b 12f
+ and v23.16b, v23.16b, v0.16b // col_limit 12..15 -> limit2/2 == 8..9
+ b 12f
+ and v25.16b, v25.16b, v0.16b // col_limit 16..19 -> limit2/2 == 10..11
+ b 12f
+ and v27.16b, v27.16b, v0.16b // col_limit 20..23 -> limit2/2 == 12..13
+ b 12f
+ and v29.16b, v29.16b, v0.16b // col_limit 24..27 -> limit2/2 == 14..15
+ b 12f
+ // columns 8 .. 31
+10: add x4, x4, 4
+ subs x4, x4, x3, lsl 3 // x4 = (limit2 & ~3)-4 for column 8 * x3
+ b.lo 12f
+ adr x5, 11f
+ add x5, x5, x4, lsl 1
+ add x5, x5, x4
+ br x5
+11: and v17.16b, v17.16b, v1.16b // limit2 == 4..7
+ b 12f
+ nop
+ and v17.16b, v17.16b, v0.16b // limit2 == 8..11
+ and v19.16b, v19.16b, v1.16b
+ b 12f
+ and v19.16b, v19.16b, v0.16b // limit2 == 12..15
+ and v21.16b, v21.16b, v1.16b
+ b 12f
+ and v21.16b, v21.16b, v0.16b // limit2 == 16..19
+ and v23.16b, v23.16b, v1.16b
+ b 12f
+ and v23.16b, v23.16b, v0.16b // limit2 == 20..23
+ and v25.16b, v25.16b, v1.16b
+ b 12f
+ and v25.16b, v25.16b, v0.16b // limit2 == 24..27
+ and v27.16b, v27.16b, v1.16b
+ b 12f
+
+ // v0,v1 = e0
+12: sshll v0.4s, v16.4h, 6
+ sshll v1.4s, v24.4h, 6
+ add v0.4s, v0.4s, v1.4s
+ sshll2 v1.4s, v16.8h, 6
+ sshll2 v2.4s, v24.8h, 6
+ add v1.4s, v1.4s, v2.4s
+
+ // v2,v3 = o0
+ mov v14.s[0], w7
+ smull v2.4s, v20.4h, v14.h[0]
+ smlal v2.4s, v28.4h, v14.h[1]
+ smull2 v3.4s, v20.8h, v14.h[0]
+ smlal2 v3.4s, v28.8h, v14.h[1]
+
+ // v4,v5 = e_8[0]
+ add v4.4s, v0.4s, v2.4s
+ add v5.4s, v1.4s, v3.4s
+
+ // v6,v7 = e_8[3]
+ sub v6.4s, v0.4s, v2.4s
+ sub v7.4s, v1.4s, v3.4s
+
+
+ // v0,v1 = o_8[0]
+ adr x4, .Lo8transform0
+ ld1 {v15.4h}, [x4]
+ smull v0.4s, v18.4h, v15.h[0]
+ smlal v0.4s, v22.4h, v15.h[1]
+ smlal v0.4s, v26.4h, v15.h[2]
+ smlal v0.4s, v30.4h, v15.h[3]
+ smull2 v1.4s, v18.8h, v15.h[0]
+ smlal2 v1.4s, v22.8h, v15.h[1]
+ smlal2 v1.4s, v26.8h, v15.h[2]
+ smlal2 v1.4s, v30.8h, v15.h[3]
+
+ // v2,v3 = e_16[0]
+ add v2.4s, v4.4s, v0.4s
+ add v3.4s, v5.4s, v1.4s
+
+ // v8,v9 = o_16[0]
+ adr x4, .Lo16transform0
+ ld1 {v15.8h}, [x4]
+ mov x5, 32
+ cmp x1, 28
+ b.hs 13f
+ add x5, x1, 4
+ bic x5, x5, 3
+ cbz x3, 13f
+ orr x5, x5, 2
+ subs x5, x5, x3, lsl 3
+ csel x5, x5, xzr, hs
+13: mov x4, 64
+ sub x6, x4, x5, lsl 1
+ adr x5, 14f
+ add x5, x5, x6
+ movi v8.4s, 0
+ movi v9.4s, 0
+ br x5
+14: smlal2 v9.4s, v31.8h, v15.h[7]
+ smlal v8.4s, v31.4h, v15.h[7]
+ smlal2 v9.4s, v29.8h, v15.h[6]
+ smlal v8.4s, v29.4h, v15.h[6]
+ smlal2 v9.4s, v27.8h, v15.h[5]
+ smlal v8.4s, v27.4h, v15.h[5]
+ smlal2 v9.4s, v25.8h, v15.h[4]
+ smlal v8.4s, v25.4h, v15.h[4]
+ smlal2 v9.4s, v23.8h, v15.h[3]
+ smlal v8.4s, v23.4h, v15.h[3]
+ smlal2 v9.4s, v21.8h, v15.h[2]
+ smlal v8.4s, v21.4h, v15.h[2]
+ smlal2 v9.4s, v19.8h, v15.h[1]
+ smlal v8.4s, v19.4h, v15.h[1]
+ smlal2 v9.4s, v17.8h, v15.h[0]
+ smlal v8.4s, v17.4h, v15.h[0]
+
+ // v12,v13 = e_32[0]
+ add v12.4s, v2.4s, v8.4s
+ add v13.4s, v3.4s, v9.4s
+
+ // v14,v15 = o_32[0]
+ ld1 {v14.4s, v15.4s}, [sp]
+
+ // tmp[0 * 32]
+ add v10.4s, v12.4s, v14.4s
+ add v11.4s, v13.4s, v15.4s
+ sqrshrn v10.4h, v10.4s, 7
+ sqrshrn2 v10.8h, v11.4s, 7
+ st1 {v10.8h}, [x2]
+
+ // tmp[31 * 32]
+ sub v10.4s, v12.4s, v14.4s
+ sub v11.4s, v13.4s, v15.4s
+ sqrshrn v10.4h, v10.4s, 7
+ sqrshrn2 v10.8h, v11.4s, 7
+ add x4, x2, 31 * 64
+ st1 {v10.8h}, [x4]
+
+ // v12,v13 = e_32[15]
+ sub v12.4s, v2.4s, v8.4s
+ sub v13.4s, v3.4s, v9.4s
+
+ // v14,v15 = o_32[15]
+ add x4, sp, 15 * 32
+ ld1 {v14.4s, v15.4s}, [x4]
+
+ // tmp[15 * 32]
+ add v10.4s, v12.4s, v14.4s
+ add v11.4s, v13.4s, v15.4s
+ sqrshrn v10.4h, v10.4s, 7
+ sqrshrn2 v10.8h, v11.4s, 7
+ add x4, x2, 15 * 64
+ st1 {v10.8h}, [x4]
+
+ // tmp[16 * 32]
+ sub v10.4s, v12.4s, v14.4s
+ sub v11.4s, v13.4s, v15.4s
+ sqrshrn v10.4h, v10.4s, 7
+ sqrshrn2 v10.8h, v11.4s, 7
+ add x4, x2, 16 * 64
+ st1 {v10.8h}, [x4]
+
+ // v2,v3 = e_16[7]
+ sub v2.4s, v4.4s, v0.4s
+ sub v3.4s, v5.4s, v1.4s
+
+ // v8,v9 = o_16[7]
+ adr x4, .Lo16transform7
+ ld1 {v15.8h}, [x4]
+ adr x5, 15f
+ add x5, x5, x6
+ movi v8.4s, 0
+ movi v9.4s, 0
+ br x5
+15: smlal2 v9.4s, v31.8h, v15.h[7]
+ smlal v8.4s, v31.4h, v15.h[7]
+ smlal2 v9.4s, v29.8h, v15.h[6]
+ smlal v8.4s, v29.4h, v15.h[6]
+ smlal2 v9.4s, v27.8h, v15.h[5]
+ smlal v8.4s, v27.4h, v15.h[5]
+ smlal2 v9.4s, v25.8h, v15.h[4]
+ smlal v8.4s, v25.4h, v15.h[4]
+ smlal2 v9.4s, v23.8h, v15.h[3]
+ smlal v8.4s, v23.4h, v15.h[3]
+ smlal2 v9.4s, v21.8h, v15.h[2]
+ smlal v8.4s, v21.4h, v15.h[2]
+ smlal2 v9.4s, v19.8h, v15.h[1]
+ smlal v8.4s, v19.4h, v15.h[1]
+ smlal2 v9.4s, v17.8h, v15.h[0]
+ smlal v8.4s, v17.4h, v15.h[0]
+
+ // v12,v13 = e_32[7]
+ add v12.4s, v2.4s, v8.4s
+ add v13.4s, v3.4s, v9.4s
+
+ // v14,v15 = o_32[7]
+ add x4, sp, 7 * 32
+ ld1 {v14.4s, v15.4s}, [x4]
+
+ // tmp[7 * 32]
+ add v10.4s, v12.4s, v14.4s
+ add v11.4s, v13.4s, v15.4s
+ sqrshrn v10.4h, v10.4s, 7
+ sqrshrn2 v10.8h, v11.4s, 7
+ add x4, x2, 7 * 64
+ st1 {v10.8h}, [x4]
+
+ // tmp[24 * 32]
+ sub v10.4s, v12.4s, v14.4s
+ sub v11.4s, v13.4s, v15.4s
+ sqrshrn v10.4h, v10.4s, 7
+ sqrshrn2 v10.8h, v11.4s, 7
+ add x4, x2, 24 * 64
+ st1 {v10.8h}, [x4]
+
+ // v12,v13 = e_32[8]
+ sub v12.4s, v2.4s, v8.4s
+ sub v13.4s, v3.4s, v9.4s
+
+ // v14,v15 = o_32[8]
+ add x4, sp, 8 * 32
+ ld1 {v14.4s, v15.4s}, [x4]
+
+ // tmp[8 * 32]
+ add v10.4s, v12.4s, v14.4s
+ add v11.4s, v13.4s, v15.4s
+ sqrshrn v10.4h, v10.4s, 7
+ sqrshrn2 v10.8h, v11.4s, 7
+ add x4, x2, 8 * 64
+ st1 {v10.8h}, [x4]
+
+ // tmp[23 * 32]
+ sub v10.4s, v12.4s, v14.4s
+ sub v11.4s, v13.4s, v15.4s
+ sqrshrn v10.4h, v10.4s, 7
+ sqrshrn2 v10.8h, v11.4s, 7
+ add x4, x2, 23 * 64
+ st1 {v10.8h}, [x4]
+
+ // v0,v1 = o_8[3]
+ adr x4, .Lo8transform3
+ ld1 {v15.4h}, [x4]
+ smull v0.4s, v18.4h, v15.h[0]
+ smlal v0.4s, v22.4h, v15.h[1]
+ smlal v0.4s, v26.4h, v15.h[2]
+ smlal v0.4s, v30.4h, v15.h[3]
+ smull2 v1.4s, v18.8h, v15.h[0]
+ smlal2 v1.4s, v22.8h, v15.h[1]
+ smlal2 v1.4s, v26.8h, v15.h[2]
+ smlal2 v1.4s, v30.8h, v15.h[3]
+
+ // v2,v3 = e_16[3]
+ add v2.4s, v6.4s, v0.4s
+ add v3.4s, v7.4s, v1.4s
+
+ // v8,v9 = o_16[3]
+ adr x4, .Lo16transform3
+ ld1 {v15.8h}, [x4]
+ adr x5, 16f
+ add x5, x5, x6
+ movi v8.4s, 0
+ movi v9.4s, 0
+ br x5
+16: smlal2 v9.4s, v31.8h, v15.h[7]
+ smlal v8.4s, v31.4h, v15.h[7]
+ smlal2 v9.4s, v29.8h, v15.h[6]
+ smlal v8.4s, v29.4h, v15.h[6]
+ smlal2 v9.4s, v27.8h, v15.h[5]
+ smlal v8.4s, v27.4h, v15.h[5]
+ smlal2 v9.4s, v25.8h, v15.h[4]
+ smlal v8.4s, v25.4h, v15.h[4]
+ smlal2 v9.4s, v23.8h, v15.h[3]
+ smlal v8.4s, v23.4h, v15.h[3]
+ smlal2 v9.4s, v21.8h, v15.h[2]
+ smlal v8.4s, v21.4h, v15.h[2]
+ smlal2 v9.4s, v19.8h, v15.h[1]
+ smlal v8.4s, v19.4h, v15.h[1]
+ smlal2 v9.4s, v17.8h, v15.h[0]
+ smlal v8.4s, v17.4h, v15.h[0]
+
+ // v12,v13 = e_32[3]
+ add v12.4s, v2.4s, v8.4s
+ add v13.4s, v3.4s, v9.4s
+
+ // v14,v15 = o_32[3]
+ add x4, sp, 3 * 32
+ ld1 {v14.4s, v15.4s}, [x4]
+
+ // tmp[3 * 32]
+ add v10.4s, v12.4s, v14.4s
+ add v11.4s, v13.4s, v15.4s
+ sqrshrn v10.4h, v10.4s, 7
+ sqrshrn2 v10.8h, v11.4s, 7
+ add x4, x2, 3 * 64
+ st1 {v10.8h}, [x4]
+
+ // tmp[28 * 32]
+ sub v10.4s, v12.4s, v14.4s
+ sub v11.4s, v13.4s, v15.4s
+ sqrshrn v10.4h, v10.4s, 7
+ sqrshrn2 v10.8h, v11.4s, 7
+ add x4, x2, 28 * 64
+ st1 {v10.8h}, [x4]
+
+ // v12,v13 = e_32[12]
+ sub v12.4s, v2.4s, v8.4s
+ sub v13.4s, v3.4s, v9.4s
+
+ // v14,v15 = o_32[12]
+ add x4, sp, 12 * 32
+ ld1 {v14.4s, v15.4s}, [x4]
+
+ // tmp[12 * 32]
+ add v10.4s, v12.4s, v14.4s
+ add v11.4s, v13.4s, v15.4s
+ sqrshrn v10.4h, v10.4s, 7
+ sqrshrn2 v10.8h, v11.4s, 7
+ add x4, x2, 12 * 64
+ st1 {v10.8h}, [x4]
+
+ // tmp[19 * 32]
+ sub v10.4s, v12.4s, v14.4s
+ sub v11.4s, v13.4s, v15.4s
+ sqrshrn v10.4h, v10.4s, 7
+ sqrshrn2 v10.8h, v11.4s, 7
+ add x4, x2, 19 * 64
+ st1 {v10.8h}, [x4]
+
+ // v2,v3 = e_16[4]
+ sub v2.4s, v6.4s, v0.4s
+ sub v3.4s, v7.4s, v1.4s
+
+ // v8,v9 = o_16[4]
+ adr x4, .Lo16transform4
+ ld1 {v15.8h}, [x4]
+ adr x5, 17f
+ add x5, x5, x6
+ movi v8.4s, 0
+ movi v9.4s, 0
+ br x5
+17: smlal2 v9.4s, v31.8h, v15.h[7]
+ smlal v8.4s, v31.4h, v15.h[7]
+ smlal2 v9.4s, v29.8h, v15.h[6]
+ smlal v8.4s, v29.4h, v15.h[6]
+ smlal2 v9.4s, v27.8h, v15.h[5]
+ smlal v8.4s, v27.4h, v15.h[5]
+ smlal2 v9.4s, v25.8h, v15.h[4]
+ smlal v8.4s, v25.4h, v15.h[4]
+ smlal2 v9.4s, v23.8h, v15.h[3]
+ smlal v8.4s, v23.4h, v15.h[3]
+ smlal2 v9.4s, v21.8h, v15.h[2]
+ smlal v8.4s, v21.4h, v15.h[2]
+ smlal2 v9.4s, v19.8h, v15.h[1]
+ smlal v8.4s, v19.4h, v15.h[1]
+ smlal2 v9.4s, v17.8h, v15.h[0]
+ smlal v8.4s, v17.4h, v15.h[0]
+
+ // v12,v13 = e_32[4]
+ add v12.4s, v2.4s, v8.4s
+ add v13.4s, v3.4s, v9.4s
+
+ // v14,v15 = o_32[4]
+ add x4, sp, 4 * 32
+ ld1 {v14.4s, v15.4s}, [x4]
+
+ // tmp[4 * 32]
+ add v10.4s, v12.4s, v14.4s
+ add v11.4s, v13.4s, v15.4s
+ sqrshrn v10.4h, v10.4s, 7
+ sqrshrn2 v10.8h, v11.4s, 7
+ add x4, x2, 4 * 64
+ st1 {v10.8h}, [x4]
+
+ // tmp[27 * 32]
+ sub v10.4s, v12.4s, v14.4s
+ sub v11.4s, v13.4s, v15.4s
+ sqrshrn v10.4h, v10.4s, 7
+ sqrshrn2 v10.8h, v11.4s, 7
+ add x4, x2, 27 * 64
+ st1 {v10.8h}, [x4]
+
+ // v12,v13 = e_32[11]
+ sub v12.4s, v2.4s, v8.4s
+ sub v13.4s, v3.4s, v9.4s
+
+ // v14,v15 = o_32[11]
+ add x4, sp, 11 * 32
+ ld1 {v14.4s, v15.4s}, [x4]
+
+ // tmp[11 * 32]
+ add v10.4s, v12.4s, v14.4s
+ add v11.4s, v13.4s, v15.4s
+ sqrshrn v10.4h, v10.4s, 7
+ sqrshrn2 v10.8h, v11.4s, 7
+ add x4, x2, 11 * 64
+ st1 {v10.8h}, [x4]
+
+ // tmp[20 * 32]
+ sub v10.4s, v12.4s, v14.4s
+ sub v11.4s, v13.4s, v15.4s
+ sqrshrn v10.4h, v10.4s, 7
+ sqrshrn2 v10.8h, v11.4s, 7
+ add x4, x2, 20 * 64
+ st1 {v10.8h}, [x4]
+
+ // v0,v1 = e1
+ sshll v0.4s, v16.4h, 6
+ sshll v1.4s, v24.4h, 6
+ sub v0.4s, v0.4s, v1.4s
+ sshll2 v1.4s, v16.8h, 6
+ sshll2 v2.4s, v24.8h, 6
+ sub v1.4s, v1.4s, v2.4s
+
+ // v2,v3 = o1
+ mov v14.s[0], w7
+ smull v2.4s, v20.4h, v14.h[1]
+ smlsl v2.4s, v28.4h, v14.h[0]
+ smull2 v3.4s, v20.8h, v14.h[1]
+ smlsl2 v3.4s, v28.8h, v14.h[0]
+
+ // v4,v5 = e_8[1]
+ add v4.4s, v0.4s, v2.4s
+ add v5.4s, v1.4s, v3.4s
+
+ // v6,v7 = e_8[2]
+ sub v6.4s, v0.4s, v2.4s
+ sub v7.4s, v1.4s, v3.4s
+
+ // v0,v1 = o_8[1]
+ adr x4, .Lo8transform1
+ ld1 {v15.4h}, [x4]
+ smull v0.4s, v18.4h, v15.h[0]
+ smlal v0.4s, v22.4h, v15.h[1]
+ smlal v0.4s, v26.4h, v15.h[2]
+ smlal v0.4s, v30.4h, v15.h[3]
+ smull2 v1.4s, v18.8h, v15.h[0]
+ smlal2 v1.4s, v22.8h, v15.h[1]
+ smlal2 v1.4s, v26.8h, v15.h[2]
+ smlal2 v1.4s, v30.8h, v15.h[3]
+
+ // v2,v3 = e_16[1]
+ add v2.4s, v4.4s, v0.4s
+ add v3.4s, v5.4s, v1.4s
+
+ // v8,v9 = o_16[1]
+ adr x4, .Lo16transform1
+ ld1 {v15.8h}, [x4]
+ adr x5, 18f
+ add x5, x5, x6
+ movi v8.4s, 0
+ movi v9.4s, 0
+ br x5
+18: smlal2 v9.4s, v31.8h, v15.h[7]
+ smlal v8.4s, v31.4h, v15.h[7]
+ smlal2 v9.4s, v29.8h, v15.h[6]
+ smlal v8.4s, v29.4h, v15.h[6]
+ smlal2 v9.4s, v27.8h, v15.h[5]
+ smlal v8.4s, v27.4h, v15.h[5]
+ smlal2 v9.4s, v25.8h, v15.h[4]
+ smlal v8.4s, v25.4h, v15.h[4]
+ smlal2 v9.4s, v23.8h, v15.h[3]
+ smlal v8.4s, v23.4h, v15.h[3]
+ smlal2 v9.4s, v21.8h, v15.h[2]
+ smlal v8.4s, v21.4h, v15.h[2]
+ smlal2 v9.4s, v19.8h, v15.h[1]
+ smlal v8.4s, v19.4h, v15.h[1]
+ smlal2 v9.4s, v17.8h, v15.h[0]
+ smlal v8.4s, v17.4h, v15.h[0]
+
+ // v12,v13 = e_32[1]
+ add v12.4s, v2.4s, v8.4s
+ add v13.4s, v3.4s, v9.4s
+
+ // v14,v15 = o_32[1]
+ add x4, sp, 1 * 32
+ ld1 {v14.4s, v15.4s}, [x4]
+
+ // tmp[1 * 32]
+ add v10.4s, v12.4s, v14.4s
+ add v11.4s, v13.4s, v15.4s
+ sqrshrn v10.4h, v10.4s, 7
+ sqrshrn2 v10.8h, v11.4s, 7
+ add x4, x2, 1 * 64
+ st1 {v10.8h}, [x4]
+
+ // tmp[30 * 32]
+ sub v10.4s, v12.4s, v14.4s
+ sub v11.4s, v13.4s, v15.4s
+ sqrshrn v10.4h, v10.4s, 7
+ sqrshrn2 v10.8h, v11.4s, 7
+ add x4, x2, 30 * 64
+ st1 {v10.8h}, [x4]
+
+ // v12,v13 = e_32[14]
+ sub v12.4s, v2.4s, v8.4s
+ sub v13.4s, v3.4s, v9.4s
+
+ // v14,v15 = o_32[14]
+ add x4, sp, 14 * 32
+ ld1 {v14.4s, v15.4s}, [x4]
+
+ // tmp[14 * 32]
+ add v10.4s, v12.4s, v14.4s
+ add v11.4s, v13.4s, v15.4s
+ sqrshrn v10.4h, v10.4s, 7
+ sqrshrn2 v10.8h, v11.4s, 7
+ add x4, x2, 14 * 64
+ st1 {v10.8h}, [x4]
+
+ // tmp[17 * 32]
+ sub v10.4s, v12.4s, v14.4s
+ sub v11.4s, v13.4s, v15.4s
+ sqrshrn v10.4h, v10.4s, 7
+ sqrshrn2 v10.8h, v11.4s, 7
+ add x4, x2, 17 * 64
+ st1 {v10.8h}, [x4]
+
+ // v2,v3 = e_16[6]
+ sub v2.4s, v4.4s, v0.4s
+ sub v3.4s, v5.4s, v1.4s
+
+ // v8,v9 = o_16[6]
+ adr x4, .Lo16transform6
+ ld1 {v15.8h}, [x4]
+ adr x5, 19f
+ add x5, x5, x6
+ movi v8.4s, 0
+ movi v9.4s, 0
+ br x5
+19: smlal2 v9.4s, v31.8h, v15.h[7]
+ smlal v8.4s, v31.4h, v15.h[7]
+ smlal2 v9.4s, v29.8h, v15.h[6]
+ smlal v8.4s, v29.4h, v15.h[6]
+ smlal2 v9.4s, v27.8h, v15.h[5]
+ smlal v8.4s, v27.4h, v15.h[5]
+ smlal2 v9.4s, v25.8h, v15.h[4]
+ smlal v8.4s, v25.4h, v15.h[4]
+ smlal2 v9.4s, v23.8h, v15.h[3]
+ smlal v8.4s, v23.4h, v15.h[3]
+ smlal2 v9.4s, v21.8h, v15.h[2]
+ smlal v8.4s, v21.4h, v15.h[2]
+ smlal2 v9.4s, v19.8h, v15.h[1]
+ smlal v8.4s, v19.4h, v15.h[1]
+ smlal2 v9.4s, v17.8h, v15.h[0]
+ smlal v8.4s, v17.4h, v15.h[0]
+
+ // v12,v13 = e_32[6]
+ add v12.4s, v2.4s, v8.4s
+ add v13.4s, v3.4s, v9.4s
+
+ // v14,v15 = o_32[6]
+ add x4, sp, 6 * 32
+ ld1 {v14.4s, v15.4s}, [x4]
+
+ // tmp[6 * 32]
+ add v10.4s, v12.4s, v14.4s
+ add v11.4s, v13.4s, v15.4s
+ sqrshrn v10.4h, v10.4s, 7
+ sqrshrn2 v10.8h, v11.4s, 7
+ add x4, x2, 6 * 64
+ st1 {v10.8h}, [x4]
+
+ // tmp[25 * 32]
+ sub v10.4s, v12.4s, v14.4s
+ sub v11.4s, v13.4s, v15.4s
+ sqrshrn v10.4h, v10.4s, 7
+ sqrshrn2 v10.8h, v11.4s, 7
+ add x4, x2, 25 * 64
+ st1 {v10.8h}, [x4]
+
+ // v12,v13 = e_32[9]
+ sub v12.4s, v2.4s, v8.4s
+ sub v13.4s, v3.4s, v9.4s
+
+ // v14,v15 = o_32[9]
+ add x4, sp, 9 * 32
+ ld1 {v14.4s, v15.4s}, [x4]
+
+ // tmp[9 * 32]
+ add v10.4s, v12.4s, v14.4s
+ add v11.4s, v13.4s, v15.4s
+ sqrshrn v10.4h, v10.4s, 7
+ sqrshrn2 v10.8h, v11.4s, 7
+ add x4, x2, 9 * 64
+ st1 {v10.8h}, [x4]
+
+ // tmp[22 * 32]
+ sub v10.4s, v12.4s, v14.4s
+ sub v11.4s, v13.4s, v15.4s
+ sqrshrn v10.4h, v10.4s, 7
+ sqrshrn2 v10.8h, v11.4s, 7
+ add x4, x2, 22 * 64
+ st1 {v10.8h}, [x4]
+
+ // v0,v1 = o_8[2]
+ adr x4, .Lo8transform2
+ ld1 {v15.4h}, [x4]
+ smull v0.4s, v18.4h, v15.h[0]
+ smlal v0.4s, v22.4h, v15.h[1]
+ smlal v0.4s, v26.4h, v15.h[2]
+ smlal v0.4s, v30.4h, v15.h[3]
+ smull2 v1.4s, v18.8h, v15.h[0]
+ smlal2 v1.4s, v22.8h, v15.h[1]
+ smlal2 v1.4s, v26.8h, v15.h[2]
+ smlal2 v1.4s, v30.8h, v15.h[3]
+
+ // v2,v3 = e_16[2]
+ add v2.4s, v6.4s, v0.4s
+ add v3.4s, v7.4s, v1.4s
+
+ // v8,v9 = o_16[2]
+ adr x4, .Lo16transform2
+ ld1 {v15.8h}, [x4]
+ adr x5, 20f
+ add x5, x5, x6
+ movi v8.4s, 0
+ movi v9.4s, 0
+ br x5
+20: smlal2 v9.4s, v31.8h, v15.h[7]
+ smlal v8.4s, v31.4h, v15.h[7]
+ smlal2 v9.4s, v29.8h, v15.h[6]
+ smlal v8.4s, v29.4h, v15.h[6]
+ smlal2 v9.4s, v27.8h, v15.h[5]
+ smlal v8.4s, v27.4h, v15.h[5]
+ smlal2 v9.4s, v25.8h, v15.h[4]
+ smlal v8.4s, v25.4h, v15.h[4]
+ smlal2 v9.4s, v23.8h, v15.h[3]
+ smlal v8.4s, v23.4h, v15.h[3]
+ smlal2 v9.4s, v21.8h, v15.h[2]
+ smlal v8.4s, v21.4h, v15.h[2]
+ smlal2 v9.4s, v19.8h, v15.h[1]
+ smlal v8.4s, v19.4h, v15.h[1]
+ smlal2 v9.4s, v17.8h, v15.h[0]
+ smlal v8.4s, v17.4h, v15.h[0]
+
+ // v12,v13 = e_32[2]
+ add v12.4s, v2.4s, v8.4s
+ add v13.4s, v3.4s, v9.4s
+
+ // v14,v15 = o_32[2]
+ add x4, sp, 2 * 32
+ ld1 {v14.4s, v15.4s}, [x4]
+
+ // tmp[2 * 32]
+ add v10.4s, v12.4s, v14.4s
+ add v11.4s, v13.4s, v15.4s
+ sqrshrn v10.4h, v10.4s, 7
+ sqrshrn2 v10.8h, v11.4s, 7
+ add x4, x2, 2 * 64
+ st1 {v10.8h}, [x4]
+
+ // tmp[29 * 32]
+ sub v10.4s, v12.4s, v14.4s
+ sub v11.4s, v13.4s, v15.4s
+ sqrshrn v10.4h, v10.4s, 7
+ sqrshrn2 v10.8h, v11.4s, 7
+ add x4, x2, 29 * 64
+ st1 {v10.8h}, [x4]
+
+ // v12,v13 = e_32[13]
+ sub v12.4s, v2.4s, v8.4s
+ sub v13.4s, v3.4s, v9.4s
+
+ // v14,v15 = o_32[13]
+ add x4, sp, 13 * 32
+ ld1 {v14.4s, v15.4s}, [x4]
+
+ // tmp[13 * 32]
+ add v10.4s, v12.4s, v14.4s
+ add v11.4s, v13.4s, v15.4s
+ sqrshrn v10.4h, v10.4s, 7
+ sqrshrn2 v10.8h, v11.4s, 7
+ add x4, x2, 13 * 64
+ st1 {v10.8h}, [x4]
+
+ // tmp[18 * 32]
+ sub v10.4s, v12.4s, v14.4s
+ sub v11.4s, v13.4s, v15.4s
+ sqrshrn v10.4h, v10.4s, 7
+ sqrshrn2 v10.8h, v11.4s, 7
+ add x4, x2, 18 * 64
+ st1 {v10.8h}, [x4]
+
+ // v2,v3 = e_16[5]
+ sub v2.4s, v6.4s, v0.4s
+ sub v3.4s, v7.4s, v1.4s
+
+ // v8,v9 = o_16[5]
+ adr x4, .Lo16transform5
+ ld1 {v15.8h}, [x4]
+ adr x5, 21f
+ add x5, x5, x6
+ movi v8.4s, 0
+ movi v9.4s, 0
+ br x5
+21: smlal2 v9.4s, v31.8h, v15.h[7]
+ smlal v8.4s, v31.4h, v15.h[7]
+ smlal2 v9.4s, v29.8h, v15.h[6]
+ smlal v8.4s, v29.4h, v15.h[6]
+ smlal2 v9.4s, v27.8h, v15.h[5]
+ smlal v8.4s, v27.4h, v15.h[5]
+ smlal2 v9.4s, v25.8h, v15.h[4]
+ smlal v8.4s, v25.4h, v15.h[4]
+ smlal2 v9.4s, v23.8h, v15.h[3]
+ smlal v8.4s, v23.4h, v15.h[3]
+ smlal2 v9.4s, v21.8h, v15.h[2]
+ smlal v8.4s, v21.4h, v15.h[2]
+ smlal2 v9.4s, v19.8h, v15.h[1]
+ smlal v8.4s, v19.4h, v15.h[1]
+ smlal2 v9.4s, v17.8h, v15.h[0]
+ smlal v8.4s, v17.4h, v15.h[0]
+
+ // v12,v13 = e_32[5]
+ add v12.4s, v2.4s, v8.4s
+ add v13.4s, v3.4s, v9.4s
+
+ // v14,v15 = o_32[5]
+ add x4, sp, 5 * 32
+ ld1 {v14.4s, v15.4s}, [x4]
+
+ // tmp[5 * 32]
+ add v10.4s, v12.4s, v14.4s
+ add v11.4s, v13.4s, v15.4s
+ sqrshrn v10.4h, v10.4s, 7
+ sqrshrn2 v10.8h, v11.4s, 7
+ add x4, x2, 5 * 64
+ st1 {v10.8h}, [x4]
+
+ // tmp[26 * 32]
+ sub v10.4s, v12.4s, v14.4s
+ sub v11.4s, v13.4s, v15.4s
+ sqrshrn v10.4h, v10.4s, 7
+ sqrshrn2 v10.8h, v11.4s, 7
+ add x4, x2, 26 * 64
+ st1 {v10.8h}, [x4]
+
+ // v12,v13 = e_32[10]
+ sub v12.4s, v2.4s, v8.4s
+ sub v13.4s, v3.4s, v9.4s
+
+ // v14,v15 = o_32[10]
+ add x4, sp, 10 * 32
+ ld1 {v14.4s, v15.4s}, [x4]
+
+ // tmp[10 * 32]
+ add v10.4s, v12.4s, v14.4s
+ add v11.4s, v13.4s, v15.4s
+ sqrshrn v10.4h, v10.4s, 7
+ sqrshrn2 v10.8h, v11.4s, 7
+ add x4, x2, 10 * 64
+ st1 {v10.8h}, [x4]
+
+ // tmp[21 * 32]
+ sub v10.4s, v12.4s, v14.4s
+ sub v11.4s, v13.4s, v15.4s
+ sqrshrn v10.4h, v10.4s, 7
+ sqrshrn2 v10.8h, v11.4s, 7
+ add x4, x2, 21 * 64
+ st1 {v10.8h}, [x4]
+
+
+ add x2, x2, 16
+ add x3, x3, 1
+ cmp x3, 4
+ b.ne 1b
+
+ // horizontal transform
+ cmp x1, 9
+ b.ls 24f
+ // o_32 partially (last 12 sum components)
+ adr x4, .Lo32transform9_31
+ ld1 {v8.8h, v9.8h, v10.8h, v11.8h}, [x4], 64
+ ld1 {v12.8h, v13.8h, v14.8h, v15.8h}, [x4], 64
+ ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x4], 64
+ ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x4], 64
+ ld1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x4], 64
+ ld1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x4], 64
+ bic x5, x1, 1
+ subs x5, x5, 8
+ csel x5, x5, xzr, hs
+ mov x4, 24
+ subs x4, x4, x5
+ csel x5, x4, xzr, hs
+ adr x4, 23f
+ add x5, x4, x5, lsl 3
+ add x2, x0, 16
+ mov x8, sp
+ mov x3, 64
+ mov x6, 32
+22: ld1 {v0.8h, v1.8h, v2.8h}, [x2], x3
+ movi v4.4s, 0
+ movi v5.4s, 0
+ movi v6.4s, 0
+ movi v7.4s, 0
+ br x5
+23: smlal v4.4s, v30.4h, v2.h[7]
+ smlal2 v5.4s, v30.8h, v2.h[7]
+ smlal v6.4s, v31.4h, v2.h[7]
+ smlal2 v7.4s, v31.8h, v2.h[7]
+ smlal v4.4s, v28.4h, v2.h[5]
+ smlal2 v5.4s, v28.8h, v2.h[5]
+ smlal v6.4s, v29.4h, v2.h[5]
+ smlal2 v7.4s, v29.8h, v2.h[5]
+ smlal v4.4s, v26.4h, v2.h[3]
+ smlal2 v5.4s, v26.8h, v2.h[3]
+ smlal v6.4s, v27.4h, v2.h[3]
+ smlal2 v7.4s, v27.8h, v2.h[3]
+ smlal v4.4s, v24.4h, v2.h[1]
+ smlal2 v5.4s, v24.8h, v2.h[1]
+ smlal v6.4s, v25.4h, v2.h[1]
+ smlal2 v7.4s, v25.8h, v2.h[1]
+ smlal v4.4s, v22.4h, v1.h[7]
+ smlal2 v5.4s, v22.8h, v1.h[7]
+ smlal v6.4s, v23.4h, v1.h[7]
+ smlal2 v7.4s, v23.8h, v1.h[7]
+ smlal v4.4s, v20.4h, v1.h[5]
+ smlal2 v5.4s, v20.8h, v1.h[5]
+ smlal v6.4s, v21.4h, v1.h[5]
+ smlal2 v7.4s, v21.8h, v1.h[5]
+ smlal v4.4s, v18.4h, v1.h[3]
+ smlal2 v5.4s, v18.8h, v1.h[3]
+ smlal v6.4s, v19.4h, v1.h[3]
+ smlal2 v7.4s, v19.8h, v1.h[3]
+ smlal v4.4s, v16.4h, v1.h[1]
+ smlal2 v5.4s, v16.8h, v1.h[1]
+ smlal v6.4s, v17.4h, v1.h[1]
+ smlal2 v7.4s, v17.8h, v1.h[1]
+ smlal v4.4s, v14.4h, v0.h[7]
+ smlal2 v5.4s, v14.8h, v0.h[7]
+ smlal v6.4s, v15.4h, v0.h[7]
+ smlal2 v7.4s, v15.8h, v0.h[7]
+ smlal v4.4s, v12.4h, v0.h[5]
+ smlal2 v5.4s, v12.8h, v0.h[5]
+ smlal v6.4s, v13.4h, v0.h[5]
+ smlal2 v7.4s, v13.8h, v0.h[5]
+ smlal v4.4s, v10.4h, v0.h[3]
+ smlal2 v5.4s, v10.8h, v0.h[3]
+ smlal v6.4s, v11.4h, v0.h[3]
+ smlal2 v7.4s, v11.8h, v0.h[3]
+ smlal v4.4s, v8.4h, v0.h[1]
+ smlal2 v5.4s, v8.8h, v0.h[1]
+ smlal v6.4s, v9.4h, v0.h[1]
+ smlal2 v7.4s, v9.8h, v0.h[1]
+ st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [x8], 64
+ subs x6, x6, 1
+ b.ne 22b
+
+
+24: adr x4, .Leo_coeff
+ ld1 {v12.8h, v13.8h, v14.8h, v15.8h}, [x4], 64
+ ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x4], 64
+ ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x4], 64
+ adr x4, .Lo32transform
+ ld1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x4], 64
+ ld1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x4], 64
+ // o_16 jump address
+ mov x4, 64
+ bic x5, x1, 3
+ subs x4, x4, x5, lsl 1
+ csel x4, x4, xzr, hs
+ adr x5, 26f
+ add x5, x5, x4
+ // o_32 jump address
+ bic x6, x1, 1
+ mov x4, 8
+ subs x4, x4, x6
+ csel x6, x4, xzr, hs
+ adr x4, 29f
+ add x6, x4, x6, lsl 3
+
+ mov x8, sp
+ mov x3, 32
+25: ld1 {v8.8h, v9.8h, v10.8h, v11.8h}, [x0]
+
+ // v2 = e_8
+ smull v2.4s, v12.4h, v8.h[0]
+ smlal2 v2.4s, v12.8h, v9.h[0]
+ smlal v2.4s, v13.4h, v10.h[0]
+ smlal2 v2.4s, v13.8h, v11.h[0]
+
+ // v3 = o_8
+ smull v3.4s, v14.4h, v8.h[4]
+ smlal2 v3.4s, v14.8h, v9.h[4]
+ smlal v3.4s, v15.4h, v10.h[4]
+ smlal2 v3.4s, v15.8h, v11.h[4]
+
+ // v0,v1 = e_16
+ add v0.4s, v2.4s, v3.4s
+ sub v2.4s, v2.4s, v3.4s
+ mov v1.d[0], v2.d[1]
+ mov v1.d[1], v2.d[0]
+ rev64 v1.4s, v1.4s
+
+ // v2,v3 = o_16
+ movi v2.4s, 0
+ movi v3.4s, 0
+ br x5
+26: smlal v2.4s, v23.4h, v11.h[6]
+ smlal2 v3.4s, v23.8h, v11.h[6]
+ smlal v2.4s, v22.4h, v11.h[2]
+ smlal2 v3.4s, v22.8h, v11.h[2]
+ smlal v2.4s, v21.4h, v10.h[6]
+ smlal2 v3.4s, v21.8h, v10.h[6]
+ smlal v2.4s, v20.4h, v10.h[2]
+ smlal2 v3.4s, v20.8h, v10.h[2]
+ smlal v2.4s, v19.4h, v9.h[6]
+ smlal2 v3.4s, v19.8h, v9.h[6]
+ smlal v2.4s, v18.4h, v9.h[2]
+ smlal2 v3.4s, v18.8h, v9.h[2]
+ smlal v2.4s, v17.4h, v8.h[6]
+ smlal2 v3.4s, v17.8h, v8.h[6]
+ smlal v2.4s, v16.4h, v8.h[2]
+ smlal2 v3.4s, v16.8h, v8.h[2]
+
+ // v4,v5,v6,v7 = e_32
+ add v4.4s, v0.4s, v2.4s
+ add v5.4s, v1.4s, v3.4s
+ sub v11.4s, v0.4s, v2.4s
+ mov v7.d[0], v11.d[1]
+ mov v7.d[1], v11.d[0]
+ rev64 v7.4s, v7.4s
+ sub v11.4s, v1.4s, v3.4s
+ mov v6.d[0], v11.d[1]
+ mov v6.d[1], v11.d[0]
+ rev64 v6.4s, v6.4s
+
+ // v0,v1,v2,v3 = o_32
+ cmp x1, 9
+ b.hi 28f
+ movi v0.4s, 0
+ movi v1.4s, 0
+ movi v2.4s, 0
+ movi v3.4s, 0
+ br x6
+28: ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x8], 64
+ br x6
+29: smlal v0.4s, v30.4h, v8.h[7]
+ smlal2 v1.4s, v30.8h, v8.h[7]
+ smlal v2.4s, v31.4h, v8.h[7]
+ smlal2 v3.4s, v31.8h, v8.h[7]
+ smlal v0.4s, v28.4h, v8.h[5]
+ smlal2 v1.4s, v28.8h, v8.h[5]
+ smlal v2.4s, v29.4h, v8.h[5]
+ smlal2 v3.4s, v29.8h, v8.h[5]
+ smlal v0.4s, v26.4h, v8.h[3]
+ smlal2 v1.4s, v26.8h, v8.h[3]
+ smlal v2.4s, v27.4h, v8.h[3]
+ smlal2 v3.4s, v27.8h, v8.h[3]
+ smlal v0.4s, v24.4h, v8.h[1]
+ smlal2 v1.4s, v24.8h, v8.h[1]
+ smlal v2.4s, v25.4h, v8.h[1]
+ smlal2 v3.4s, v25.8h, v8.h[1]
+
+ // coeff
+ add v8.4s, v4.4s, v0.4s
+ add v9.4s, v5.4s, v1.4s
+ add v10.4s, v6.4s, v2.4s
+ add v11.4s, v7.4s, v3.4s
+ sub v4.4s, v4.4s, v0.4s
+ sub v5.4s, v5.4s, v1.4s
+ sub v6.4s, v6.4s, v2.4s
+ sub v7.4s, v7.4s, v3.4s
+ sqrshrn v8.4h, v8.4s, 12
+ sqrshrn2 v8.8h, v9.4s, 12
+ sqrshrn v9.4h, v10.4s, 12
+ sqrshrn2 v9.8h, v11.4s, 12
+ sqrshrn v4.4h, v4.4s, 12
+ sqrshrn2 v4.8h, v5.4s, 12
+ sqrshrn v5.4h, v6.4s, 12
+ sqrshrn2 v5.8h, v7.4s, 12
+ mov v10.d[0], v5.d[1]
+ mov v10.d[1], v5.d[0]
+ rev64 v10.8h, v10.8h
+ mov v11.d[0], v4.d[1]
+ mov v11.d[1], v4.d[0]
+ rev64 v11.8h, v11.8h
+ st1 {v8.8h, v9.8h, v10.8h, v11.8h}, [x0], 64
+ subs x3, x3, 1
+ b.ne 25b
+
+ add sp, sp, 16 * 32 * 4
+ ld1 {v12.16b, v13.16b, v14.16b, v15.16b}, [sp], 64
+ ld1 {v8.16b, v9.16b, v10.16b, v11.16b}, [sp], 64
+ ret
+endfunc
diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c
index 612ebb9541..bb2a6b2502 100644
--- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -46,6 +46,7 @@ void ff_hevc_idct_8x8_10_neon(int16_t *coeffs, int col_limit);
void ff_hevc_idct_16x16_8_neon(int16_t *coeffs, int col_limit);
void ff_hevc_idct_16x16_10_neon(int16_t *coeffs, int col_limit);
void ff_hevc_idct_16x16_8_neon_new(int16_t *coeffs, int col_limit);
+void ff_hevc_idct_32x32_8_neon(int16_t *coeffs, int col_limit);
void ff_hevc_idct_4x4_dc_8_neon(int16_t *coeffs);
void ff_hevc_idct_8x8_dc_8_neon(int16_t *coeffs);
void ff_hevc_idct_16x16_dc_8_neon(int16_t *coeffs);
@@ -74,6 +75,7 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
c->add_residual[3] = ff_hevc_add_residual_32x32_8_neon;
c->idct[1] = ff_hevc_idct_8x8_8_neon;
c->idct[2] = ff_hevc_idct_16x16_8_neon_new;
+ c->idct[3] = ff_hevc_idct_32x32_8_neon;
c->idct_dc[0] = ff_hevc_idct_4x4_dc_8_neon;
c->idct_dc[1] = ff_hevc_idct_8x8_dc_8_neon;
c->idct_dc[2] = ff_hevc_idct_16x16_dc_8_neon;
--
2.32.0 (Apple Git-132)
* Re: [FFmpeg-devel] [PATCH 1/2] lavc/aarch64: new 8-bit hevc 16x16 idct
2022-06-23 12:23 [FFmpeg-devel] [PATCH 1/2] lavc/aarch64: new 8-bit hevc 16x16 idct J. Dekker
2022-06-23 12:23 ` [FFmpeg-devel] [PATCH 2/2] lavc/aarch64: add 8-bit hevc 32x32 idct J. Dekker
@ 2022-08-09 12:15 ` Martin Storsjö
1 sibling, 0 replies; 3+ messages in thread
From: Martin Storsjö @ 2022-08-09 12:15 UTC (permalink / raw)
To: FFmpeg development discussions and patches
On Thu, 23 Jun 2022, J. Dekker wrote:
> old:
> hevc_idct_16x16_8_c: 5366.2
> hevc_idct_16x16_8_neon: 1493.2
>
> new:
> hevc_idct_16x16_8_c: 5363.2
> hevc_idct_16x16_8_neon: 943.5
>
> Co-developed-by: Rafal Dabrowa <fatwildcat@gmail.com>
> Signed-off-by: J. Dekker <jdek@itanimul.li>
> ---
> libavcodec/aarch64/hevcdsp_idct_neon.S | 666 ++++++++++++++++++++++
> libavcodec/aarch64/hevcdsp_init_aarch64.c | 3 +-
> 2 files changed, 668 insertions(+), 1 deletion(-)
Throughout the new code, you have e.g. "add x5, x5, x4, lsl 2", where the
"lsl 2" breaks assembling with MS armasm64 - it's missing the '#' on the
constant 2.
Also, for loads/stores, it seems to be missing the same '#' for
postincrement, e.g. "ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x4], 64". Also
"mov x4, 64". Apparently armasm64 doesn't have a problem with that, but it
would still be good to have it consistent with the rest.
> This idct is significantly faster than the one we currently have, I
> suspect it's for a couple of reasons: 1) it's only written for 8-bit
I don't see how that would change anything? Isn't the only thing that
differs between 8 and 10/12 bit in the existing implementation how
much to scale down at the end? All other intermediate values are the same
size?
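
For what it's worth, the only depth-dependent step I can see is the
second-stage rounding, roughly this (a sketch based on the 20 - BitDepth
shift, not verbatim lavc code):

#include <stdint.h>

/* second-pass rounding: shift is 20 - bit_depth, i.e. 12 for 8-bit and
 * 10 for 10-bit; everything before this point uses the same widths */
static int16_t second_pass_round(int64_t acc, int bit_depth)
{
    const int shift = 20 - bit_depth;
    const int add   = 1 << (shift - 1);
    acc = (acc + add) >> shift;
    if (acc < INT16_MIN) acc = INT16_MIN;
    if (acc > INT16_MAX) acc = INT16_MAX;
    return (int16_t)acc;
}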
> 2) it's unrolled significantly more. It comes at a hefty cost of roughly
> 2.25x the object size.
If by that you mean that the existing code works on 4 elements at a time
(i.e. mostly operating on .4h vectors), while this one operates on .8h
vectors, then yes, that's most probably the biggest source of the speedup
(even if a lot of the intermediate stuff happens in .4s vectors). The
existing code was ported from the 32 bit arm version (which probably had
to stick to 4 elements at a time due to register availability there),
while it probably could have been made double width when it was ported to
64 bit.
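
Expressed with intrinsics, the difference I mean is roughly this (just an
illustration, not code from either implementation):

#include <arm_neon.h>

/* 4 elements per step, what the existing code does: one widening
 * multiply-accumulate on a .4h vector (smlal) */
static inline int32x4_t mla_4(int32x4_t acc, int16x4_t row, int16_t c)
{
    return vmlal_n_s16(acc, row, c);
}

/* 8 elements per step, what the new code does: consume a whole .8h row
 * per coefficient by also accumulating the high half (the smlal/smlal2
 * pairs in the patch), keeping two 32-bit accumulators */
static inline void mla_8(int32x4_t *lo, int32x4_t *hi,
                         int16x8_t row, int16_t c)
{
    *lo = vmlal_n_s16(*lo, vget_low_s16(row), c);
    *hi = vmlal_n_s16(*hi, vget_high_s16(row), c);
}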
> I'm wondering if this idct is salvageable, or the one we have should just
> be improved instead.
Well, my honest opinion is:
- I don't quite understand the current code (I've worked on the
vp8/vp9/av1 IDCTs a fair amount, but the HEVC one seems to be different
enough that I don't recognize all the concepts here).
- The current implementation would need to be reformatted if kept
- The current implementation does have some rather clear high level
structure though, e.g. when looking at the idct_16x16 macro.
- The new implementation seems to be just one huuuuge function. If you
know it by heart, it's probably good, but it's really hard to get an
overview of if you're not familiar with the HEVC IDCTs.
As for steps forward:
- Is it possible to widen the existing implementation to operate on 8
elements instead of 4? I think that would bring it up to par with this
one.
- Can you get some high level structure to the new implementation so that
it becomes understandable? Either lots more comments explaining what's
happening and why, or splitting it up into smaller macros.
Some more comments on the code itself below:
> +// void ff_hevc_idct_16x16_8_neon(int16_t *coeffs, int col_limit)
> +function ff_hevc_idct_16x16_8_neon_new, export=1
> + sub sp, sp, 64
> + st1 {v8.16b, v9.16b, v10.16b, v11.16b}, [sp]
> + sub sp, sp, 32
> + st1 {v14.16b, v15.16b}, [sp]
> + mov x3, 0
> + mov x2, x0
> +1: mov x4, x2
> + mov x5, 32
> + ld1 {v16.8h}, [x4], x5
> + ld1 {v17.8h}, [x4], x5
> + ld1 {v18.8h}, [x4], x5
> + ld1 {v19.8h}, [x4], x5
> + ld1 {v20.8h}, [x4], x5
> + ld1 {v21.8h}, [x4], x5
> + ld1 {v22.8h}, [x4], x5
> + ld1 {v23.8h}, [x4], x5
> + ld1 {v24.8h}, [x4], x5
> + ld1 {v25.8h}, [x4], x5
> + ld1 {v26.8h}, [x4], x5
> + ld1 {v27.8h}, [x4], x5
> + ld1 {v28.8h}, [x4], x5
> + ld1 {v29.8h}, [x4], x5
> + ld1 {v30.8h}, [x4], x5
> + ld1 {v31.8h}, [x4], x5
> + cmp x1, 12
> + b.hs 5f
> + // limit2 below 16
> + bic x4, x1, 1
> + adr x5, .LimitMask
> + cbnz x3, 3f
> + // columns 0 .. 7 - cleanup of indexes 5 .. 7
> + ld1 {v0.8h}, [x5]
> + adr x5, 2f
> + add x5, x5, x4, lsl 2
> + add x5, x5, x4, lsl 1
> + br x5
> +2: and v17.16b, v17.16b, v0.16b // col_limit 0..1 -> limit2 == 4..5
> + and v19.16b, v19.16b, v0.16b
> + b 5f
I don't really know what these jump tables do and how they correspond to
things in the existing implementation - but I guess that can be one part
of what makes things faster too.
The existing implementation does a 16x16 transform by first doing 4x
transforms for an 4x16 piece of data, transpose that, then do another 4x
4x16 for the second pass. How does the new implementation do it?
If I understand correctly, the old implementation didn't take col_limit
into account at all. Can that be one part of what makes things faster - or
is that only something that makes a difference in real use but not in
checkasm benchmarks?
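
My reading of col_limit, which is an assumption I haven't verified
against the decoder: it bounds the index of the last row that can still
hold nonzero coefficients, so the column pass can cut its inner sums
short, roughly:

#include <stdint.h>

/* column-pass sketch with col_limit: rows at or beyond "last" are known
 * to be all zero, so their multiply-accumulates are skipped; transform
 * is the same placeholder basis matrix as in the 16x16 sketch above */
static void col_pass_limited(int32_t *dst, const int16_t *src,
                             const int8_t transform[16][16], int col_limit)
{
    const int last = col_limit < 16 ? col_limit : 16;
    for (int x = 0; x < 16; x++)
        for (int i = 0; i < 16; i++) {
            int64_t acc = 0;
            for (int k = 0; k < last; k++)
                acc += (int64_t)transform[k][i] * src[k * 16 + x];
            dst[i * 16 + x] = (int32_t)((acc + 64) >> 7);
        }
}

The computed branches in the new code appear to do the same thing by
jumping past some of the smlal pairs.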
// Martin