From: "J. Dekker" <jdek@itanimul.li> To: ffmpeg-devel@ffmpeg.org Subject: [FFmpeg-devel] [PATCH 1/2] lavc/aarch64: new 8-bit hevc 16x16 idct Date: Thu, 23 Jun 2022 14:23:10 +0200 Message-ID: <20220623122311.20097-1-jdek@itanimul.li> (raw) old: hevc_idct_16x16_8_c: 5366.2 hevc_idct_16x16_8_neon: 1493.2 new: hevc_idct_16x16_8_c: 5363.2 hevc_idct_16x16_8_neon: 943.5 Co-developed-by: Rafal Dabrowa <fatwildcat@gmail.com> Signed-off-by: J. Dekker <jdek@itanimul.li> --- libavcodec/aarch64/hevcdsp_idct_neon.S | 666 ++++++++++++++++++++++ libavcodec/aarch64/hevcdsp_init_aarch64.c | 3 +- 2 files changed, 668 insertions(+), 1 deletion(-) This idct is significantly faster than the one we currently have, I suspect its for a couple reasons: 1) it's only written for 8bit 2) it's unrolled signficantly more. It comes at a hefty cost of roughly 2.25x the object size. I'm wondering if this idct is salvagable, or the one we have should just be improved instead. diff --git a/libavcodec/aarch64/hevcdsp_idct_neon.S b/libavcodec/aarch64/hevcdsp_idct_neon.S index 0869431294..784bae33b3 100644 --- a/libavcodec/aarch64/hevcdsp_idct_neon.S +++ b/libavcodec/aarch64/hevcdsp_idct_neon.S @@ -618,3 +618,669 @@ idct_dc 16, 10 idct_dc 32, 8 idct_dc 32, 10 + +// WIP + +.Lo0_coeff: .hword 83, 36, 0, 0, 0, 0, 0, 0 +.Lo8transform0: .hword 89, 75, 50, 18 // transform[4,12,20,28][0] +.Lo8transform1: .hword 75, -18, -89, -50 +.Lo8transform2: .hword 50, -89, 18, 75 +.Lo8transform3: .hword 18, -50, 75, -89 + +.LimitMask: + .hword 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0 + .hword 0xffff, 0, 0, 0, 0, 0, 0, 0 + +.Leo_coeff: + .hword 64, 64, 64, 64, 83, 36, -36, -83 + .hword 64, -64, -64, 64, 36, -83, 83, -36 + .hword 89, 75, 50, 18, 75, -18, -89, -50 // transform[4,12][0-3] + .hword 50, -89, 18, 75, 18, -50, 75, -89 // transform[20,28][0-3] +.Lo16transform0: .hword 90, 87, 80, 70, 57, 43, 25, 9 // transform[2][0-7], also transform[2,6,10..][0] +.Lo16transform1: .hword 87, 57, 9, -43, -80, -90, -70, -25 // transform[6][0-7] +.Lo16transform2: .hword 80, 9, -70, -87, -25, 57, 90, 43 // transform[10][0-7] +.Lo16transform3: .hword 70, -43, -87, 9, 90, 25, -80, -57 // transform[14][0-7] +.Lo16transform4: .hword 57, -80, -25, 90, -9, -87, 43, 70 // transform[18][0-7] +.Lo16transform5: .hword 43, -90, 57, 25, -87, 70, 9, -80 // transform[22][0-7] +.Lo16transform6: .hword 25, -70, 90, -80, 43, 9, -57, 87 // transform[26][0-7] +.Lo16transform7: .hword 9, -25, 43, -57, 70, -80, 87, -90 // transform[30][0-7] + +// void ff_hevc_idct_16x16_8_neon(int16_t *coeffs, int col_limit) +function ff_hevc_idct_16x16_8_neon_new, export=1 + sub sp, sp, 64 + st1 {v8.16b, v9.16b, v10.16b, v11.16b}, [sp] + sub sp, sp, 32 + st1 {v14.16b, v15.16b}, [sp] + mov x3, 0 + mov x2, x0 +1: mov x4, x2 + mov x5, 32 + ld1 {v16.8h}, [x4], x5 + ld1 {v17.8h}, [x4], x5 + ld1 {v18.8h}, [x4], x5 + ld1 {v19.8h}, [x4], x5 + ld1 {v20.8h}, [x4], x5 + ld1 {v21.8h}, [x4], x5 + ld1 {v22.8h}, [x4], x5 + ld1 {v23.8h}, [x4], x5 + ld1 {v24.8h}, [x4], x5 + ld1 {v25.8h}, [x4], x5 + ld1 {v26.8h}, [x4], x5 + ld1 {v27.8h}, [x4], x5 + ld1 {v28.8h}, [x4], x5 + ld1 {v29.8h}, [x4], x5 + ld1 {v30.8h}, [x4], x5 + ld1 {v31.8h}, [x4], x5 + cmp x1, 12 + b.hs 5f + // limit2 below 16 + bic x4, x1, 1 + adr x5, .LimitMask + cbnz x3, 3f + // columns 0 .. 7 - cleanup of indexes 5 .. 
diff --git a/libavcodec/aarch64/hevcdsp_idct_neon.S b/libavcodec/aarch64/hevcdsp_idct_neon.S
index 0869431294..784bae33b3 100644
--- a/libavcodec/aarch64/hevcdsp_idct_neon.S
+++ b/libavcodec/aarch64/hevcdsp_idct_neon.S
@@ -618,3 +618,669 @@ idct_dc 16, 10
 idct_dc 32, 8
 idct_dc 32, 10
 
+
+// WIP
+
+.Lo0_coeff:      .hword 83, 36, 0, 0, 0, 0, 0, 0
+.Lo8transform0:  .hword 89, 75, 50, 18                      // transform[4,12,20,28][0]
+.Lo8transform1:  .hword 75, -18, -89, -50
+.Lo8transform2:  .hword 50, -89, 18, 75
+.Lo8transform3:  .hword 18, -50, 75, -89
+
+.LimitMask:
+        .hword 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0
+        .hword 0xffff, 0, 0, 0, 0, 0, 0, 0
+
+.Leo_coeff:
+        .hword 64, 64, 64, 64, 83, 36, -36, -83
+        .hword 64, -64, -64, 64, 36, -83, 83, -36
+        .hword 89, 75, 50, 18, 75, -18, -89, -50            // transform[4,12][0-3]
+        .hword 50, -89, 18, 75, 18, -50, 75, -89            // transform[20,28][0-3]
+.Lo16transform0: .hword 90, 87, 80, 70, 57, 43, 25, 9       // transform[2][0-7], also transform[2,6,10..][0]
+.Lo16transform1: .hword 87, 57, 9, -43, -80, -90, -70, -25  // transform[6][0-7]
+.Lo16transform2: .hword 80, 9, -70, -87, -25, 57, 90, 43    // transform[10][0-7]
+.Lo16transform3: .hword 70, -43, -87, 9, 90, 25, -80, -57   // transform[14][0-7]
+.Lo16transform4: .hword 57, -80, -25, 90, -9, -87, 43, 70   // transform[18][0-7]
+.Lo16transform5: .hword 43, -90, 57, 25, -87, 70, 9, -80    // transform[22][0-7]
+.Lo16transform6: .hword 25, -70, 90, -80, 43, 9, -57, 87    // transform[26][0-7]
+.Lo16transform7: .hword 9, -25, 43, -57, 70, -80, 87, -90   // transform[30][0-7]
+
+// void ff_hevc_idct_16x16_8_neon(int16_t *coeffs, int col_limit)
+function ff_hevc_idct_16x16_8_neon_new, export=1
+        sub             sp, sp, 64
+        st1             {v8.16b, v9.16b, v10.16b, v11.16b}, [sp]
+        sub             sp, sp, 32
+        st1             {v14.16b, v15.16b}, [sp]
+        mov             x3, 0
+        mov             x2, x0
+1:      mov             x4, x2
+        mov             x5, 32
+        ld1             {v16.8h}, [x4], x5
+        ld1             {v17.8h}, [x4], x5
+        ld1             {v18.8h}, [x4], x5
+        ld1             {v19.8h}, [x4], x5
+        ld1             {v20.8h}, [x4], x5
+        ld1             {v21.8h}, [x4], x5
+        ld1             {v22.8h}, [x4], x5
+        ld1             {v23.8h}, [x4], x5
+        ld1             {v24.8h}, [x4], x5
+        ld1             {v25.8h}, [x4], x5
+        ld1             {v26.8h}, [x4], x5
+        ld1             {v27.8h}, [x4], x5
+        ld1             {v28.8h}, [x4], x5
+        ld1             {v29.8h}, [x4], x5
+        ld1             {v30.8h}, [x4], x5
+        ld1             {v31.8h}, [x4], x5
+        cmp             x1, 12
+        b.hs            5f
+        // limit2 below 16
+        bic             x4, x1, 1
+        adr             x5, .LimitMask
+        cbnz            x3, 3f
+        // columns 0 .. 7 - cleanup of indexes 5 .. 7
+        ld1             {v0.8h}, [x5]
+        adr             x5, 2f
+        add             x5, x5, x4, lsl 2
+        add             x5, x5, x4, lsl 1
+        br              x5
+2:      and             v17.16b, v17.16b, v0.16b    // col_limit 0..1 -> limit2 == 4..5
+        and             v19.16b, v19.16b, v0.16b
+        b               5f
+        and             v19.16b, v19.16b, v0.16b    // col_limit 2..3 -> limit2 == 6..7
+        and             v21.16b, v21.16b, v0.16b
+        b               5f
+        and             v21.16b, v21.16b, v0.16b    // col_limit 4..5 -> limit2 == 8..9
+        and             v23.16b, v23.16b, v0.16b
+        b               5f
+        and             v23.16b, v23.16b, v0.16b    // col_limit 6..7 -> limit2 == 10..11
+        and             v25.16b, v25.16b, v0.16b
+        b               5f
+        and             v25.16b, v25.16b, v0.16b    // col_limit 8..9 -> limit2 == 12..13
+        and             v27.16b, v27.16b, v0.16b
+        b               5f
+        and             v27.16b, v27.16b, v0.16b    // col_limit 10..11 -> limit2 == 14..15
+        and             v29.16b, v29.16b, v0.16b
+        b               5f
+        // columns 8 .. 15
+3:      subs            x4, x4, 2
+        b.lo            5f
+        ld1             {v0.8h, v1.8h}, [x5]
+        adr             x5, 4f
+        add             x5, x5, x4, lsl 3
+        add             x5, x5, x4, lsl 1
+        br              x5
+4:      and             v17.16b, v17.16b, v1.16b    // col_limit 2..3 -> limit2 == 2..3
+        b               5f
+        nop
+        nop
+        nop
+        and             v17.16b, v17.16b, v1.16b    // col_limit 4..5 -> limit2 == 4..5
+        and             v19.16b, v19.16b, v1.16b
+        b               5f
+        nop
+        nop
+        and             v17.16b, v17.16b, v0.16b    // col_limit 6..7 -> limit2 == 6..7
+        and             v19.16b, v19.16b, v1.16b
+        and             v21.16b, v21.16b, v1.16b
+        b               5f
+        nop
+        and             v17.16b, v17.16b, v0.16b    // col_limit 8..9 -> limit2 == 8..9
+        and             v19.16b, v19.16b, v0.16b
+        and             v21.16b, v21.16b, v1.16b
+        and             v23.16b, v23.16b, v1.16b
+        b               5f
+        and             v19.16b, v19.16b, v0.16b    // col_limit 10..11 -> limit2 == 10..11
+        and             v21.16b, v21.16b, v0.16b
+        and             v23.16b, v23.16b, v1.16b
+        and             v25.16b, v25.16b, v1.16b
+        b               5f
+5:      adr             x4, .Lo0_coeff
+        ld1             {v14.8h}, [x4]
+
+        // v0,v1 = e0
+        sshll           v0.4s, v16.4h, 6
+        sshll           v1.4s, v24.4h, 6
+        add             v0.4s, v0.4s, v1.4s
+        sshll2          v1.4s, v16.8h, 6
+        sshll2          v2.4s, v24.8h, 6
+        add             v1.4s, v1.4s, v2.4s
+
+        // v2,v3 = o0
+        smull           v2.4s, v20.4h, v14.h[0]
+        smlal           v2.4s, v28.4h, v14.h[1]
+        smull2          v3.4s, v20.8h, v14.h[0]
+        smlal2          v3.4s, v28.8h, v14.h[1]
+
+        // v4,v5 = e_8[0]
+        add             v4.4s, v0.4s, v2.4s
+        add             v5.4s, v1.4s, v3.4s
+
+        // v6,v7 = e_8[3]
+        sub             v6.4s, v0.4s, v2.4s
+        sub             v7.4s, v1.4s, v3.4s
+
+
+        // v0,v1 = o_8[0]
+        adr             x4, .Lo8transform0
+        ld1             {v15.4h}, [x4]
+        smull           v0.4s, v18.4h, v15.h[0]
+        smlal           v0.4s, v22.4h, v15.h[1]
+        smlal           v0.4s, v26.4h, v15.h[2]
+        smlal           v0.4s, v30.4h, v15.h[3]
+        smull2          v1.4s, v18.8h, v15.h[0]
+        smlal2          v1.4s, v22.8h, v15.h[1]
+        smlal2          v1.4s, v26.8h, v15.h[2]
+        smlal2          v1.4s, v30.8h, v15.h[3]
+
+        // v2,v3 = e_16[0]
+        add             v2.4s, v4.4s, v0.4s
+        add             v3.4s, v5.4s, v1.4s
+
+        // v8,v9 = o_16[0]
+        adr             x4, .Lo16transform0
+        ld1             {v15.8h}, [x4]
+
+        mov             x5, 16
+        cmp             x1, 12
+        b.hs            6f
+        add             x5, x1, 4
+        bic             x5, x5, 1
+        cbz             x3, 6f
+        orr             x5, x1, 1
+        subs            x5, x5, 2
+        csel            x5, x5, xzr, hs
+6:      mov             x4, 64
+        sub             x6, x4, x5, lsl 2
+        adr             x5, 7f
+        add             x5, x5, x6
+        movi            v8.4s, 0
+        movi            v9.4s, 0
+        br              x5
+7:      smlal2          v9.4s, v31.8h, v15.h[7]
+        smlal           v8.4s, v31.4h, v15.h[7]
+        smlal2          v9.4s, v29.8h, v15.h[6]
+        smlal           v8.4s, v29.4h, v15.h[6]
+        smlal2          v9.4s, v27.8h, v15.h[5]
+        smlal           v8.4s, v27.4h, v15.h[5]
+        smlal2          v9.4s, v25.8h, v15.h[4]
+        smlal           v8.4s, v25.4h, v15.h[4]
+        smlal2          v9.4s, v23.8h, v15.h[3]
+        smlal           v8.4s, v23.4h, v15.h[3]
+        smlal2          v9.4s, v21.8h, v15.h[2]
+        smlal           v8.4s, v21.4h, v15.h[2]
+        smlal2          v9.4s, v19.8h, v15.h[1]
+        smlal           v8.4s, v19.4h, v15.h[1]
+        smlal2          v9.4s, v17.8h, v15.h[0]
+        smlal           v8.4s, v17.4h, v15.h[0]
+
+        // tmp[0 * 16]
+        add             v10.4s, v2.4s, v8.4s
+        add             v11.4s, v3.4s, v9.4s
+        sqrshrn         v10.4h, v10.4s, 7
+        sqrshrn2        v10.8h, v11.4s, 7
+        st1             {v10.8h}, [x2]
+
+        // tmp[15 * 16]
+        sub             v10.4s, v2.4s, v8.4s
+        sub             v11.4s, v3.4s, v9.4s
+        sqrshrn         v10.4h, v10.4s, 7
+        sqrshrn2        v10.8h, v11.4s, 7
+        add             x4, x2, 15 * 32
+        st1             {v10.8h}, [x4]
+
+        // v2,v3 = e_16[7]
+        sub             v2.4s, v4.4s, v0.4s
+        sub             v3.4s, v5.4s, v1.4s
+
+        // v8,v9 = o_16[7]
+        adr             x4, .Lo16transform7
+        ld1             {v15.8h}, [x4]
+        adr             x5, 8f
+        add             x5, x5, x6
+        movi            v8.4s, 0
+        movi            v9.4s, 0
+        br              x5
+8:      smlal2          v9.4s, v31.8h, v15.h[7]
+        smlal           v8.4s, v31.4h, v15.h[7]
+        smlal2          v9.4s, v29.8h, v15.h[6]
+        smlal           v8.4s, v29.4h, v15.h[6]
+        smlal2          v9.4s, v27.8h, v15.h[5]
+        smlal           v8.4s, v27.4h, v15.h[5]
+        smlal2          v9.4s, v25.8h, v15.h[4]
+        smlal           v8.4s, v25.4h, v15.h[4]
+        smlal2          v9.4s, v23.8h, v15.h[3]
+        smlal           v8.4s, v23.4h, v15.h[3]
+        smlal2          v9.4s, v21.8h, v15.h[2]
+        smlal           v8.4s, v21.4h, v15.h[2]
+        smlal2          v9.4s, v19.8h, v15.h[1]
+        smlal           v8.4s, v19.4h, v15.h[1]
+        smlal2          v9.4s, v17.8h, v15.h[0]
+        smlal           v8.4s, v17.4h, v15.h[0]
+
+        // tmp[7 * 16]
+        add             v10.4s, v2.4s, v8.4s
+        add             v11.4s, v3.4s, v9.4s
+        sqrshrn         v10.4h, v10.4s, 7
+        sqrshrn2        v10.8h, v11.4s, 7
+        add             x4, x2, 7 * 32
+        st1             {v10.8h}, [x4]
+
+        // tmp[8 * 16]
+        sub             v10.4s, v2.4s, v8.4s
+        sub             v11.4s, v3.4s, v9.4s
+        sqrshrn         v10.4h, v10.4s, 7
+        sqrshrn2        v10.8h, v11.4s, 7
+        add             x4, x2, 8 * 32
+        st1             {v10.8h}, [x4]
+
+        // v0,v1 = o_8[3]
+        adr             x4, .Lo8transform3
+        ld1             {v15.4h}, [x4]
+        smull           v0.4s, v18.4h, v15.h[0]
+        smlal           v0.4s, v22.4h, v15.h[1]
+        smlal           v0.4s, v26.4h, v15.h[2]
+        smlal           v0.4s, v30.4h, v15.h[3]
+        smull2          v1.4s, v18.8h, v15.h[0]
+        smlal2          v1.4s, v22.8h, v15.h[1]
+        smlal2          v1.4s, v26.8h, v15.h[2]
+        smlal2          v1.4s, v30.8h, v15.h[3]
+
+        // v2,v3 = e_16[3]
+        add             v2.4s, v6.4s, v0.4s
+        add             v3.4s, v7.4s, v1.4s
+
+        // v8,v9 = o_16[3]
+        adr             x4, .Lo16transform3
+        ld1             {v15.8h}, [x4]
+        adr             x5, 9f
+        add             x5, x5, x6
+        movi            v8.4s, 0
+        movi            v9.4s, 0
+        br              x5
+9:      smlal2          v9.4s, v31.8h, v15.h[7]
+        smlal           v8.4s, v31.4h, v15.h[7]
+        smlal2          v9.4s, v29.8h, v15.h[6]
+        smlal           v8.4s, v29.4h, v15.h[6]    // 13
+        smlal2          v9.4s, v27.8h, v15.h[5]
+        smlal           v8.4s, v27.4h, v15.h[5]    // 11
+        smlal2          v9.4s, v25.8h, v15.h[4]
+        smlal           v8.4s, v25.4h, v15.h[4]    // 9
+        smlal2          v9.4s, v23.8h, v15.h[3]
+        smlal           v8.4s, v23.4h, v15.h[3]    // 7
+        smlal2          v9.4s, v21.8h, v15.h[2]
+        smlal           v8.4s, v21.4h, v15.h[2]    // 5
+        smlal2          v9.4s, v19.8h, v15.h[1]
+        smlal           v8.4s, v19.4h, v15.h[1]    // 3
+        smlal2          v9.4s, v17.8h, v15.h[0]
+        smlal           v8.4s, v17.4h, v15.h[0]    // 1
+
+        // tmp[3 * 16]
+        add             v10.4s, v2.4s, v8.4s
+        add             v11.4s, v3.4s, v9.4s
+        sqrshrn         v10.4h, v10.4s, 7
+        sqrshrn2        v10.8h, v11.4s, 7
+        add             x4, x2, 3 * 32
+        st1             {v10.8h}, [x4]
+
+        // tmp[12 * 16]
+        sub             v10.4s, v2.4s, v8.4s
+        sub             v11.4s, v3.4s, v9.4s
+        sqrshrn         v10.4h, v10.4s, 7
+        sqrshrn2        v10.8h, v11.4s, 7
+        add             x4, x2, 12 * 32
+        st1             {v10.8h}, [x4]
+
+        // v2,v3 = e_16[4]
+        sub             v2.4s, v6.4s, v0.4s
+        sub             v3.4s, v7.4s, v1.4s
+
+        // v8,v9 = o_16[4]
+        adr             x4, .Lo16transform4
+        ld1             {v15.8h}, [x4]
+        adr             x5, 10f
+        add             x5, x5, x6
+        movi            v8.4s, 0
+        movi            v9.4s, 0
+        br              x5
+10:     smlal2          v9.4s, v31.8h, v15.h[7]
+        smlal           v8.4s, v31.4h, v15.h[7]
+        smlal2          v9.4s, v29.8h, v15.h[6]
+        smlal           v8.4s, v29.4h, v15.h[6]
+        smlal2          v9.4s, v27.8h, v15.h[5]
+        smlal           v8.4s, v27.4h, v15.h[5]
+        smlal2          v9.4s, v25.8h, v15.h[4]
+        smlal           v8.4s, v25.4h, v15.h[4]
+        smlal2          v9.4s, v23.8h, v15.h[3]
+        smlal           v8.4s, v23.4h, v15.h[3]
+        smlal2          v9.4s, v21.8h, v15.h[2]
+        smlal           v8.4s, v21.4h, v15.h[2]
+        smlal2          v9.4s, v19.8h, v15.h[1]
+        smlal           v8.4s, v19.4h, v15.h[1]
+        smlal2          v9.4s, v17.8h, v15.h[0]
+        smlal           v8.4s, v17.4h, v15.h[0]
+
+        // tmp[4 * 16]
+        add             v10.4s, v2.4s, v8.4s
+        add             v11.4s, v3.4s, v9.4s
+        sqrshrn         v10.4h, v10.4s, 7
+        sqrshrn2        v10.8h, v11.4s, 7
+        add             x4, x2, 4 * 32
+        st1             {v10.8h}, [x4]
+
+        // tmp[11 * 16]
+        sub             v10.4s, v2.4s, v8.4s
+        sub             v11.4s, v3.4s, v9.4s
+        sqrshrn         v10.4h, v10.4s, 7
+        sqrshrn2        v10.8h, v11.4s, 7
+        add             x4, x2, 11 * 32
+        st1             {v10.8h}, [x4]
+
+
+        // v0,v1 = e1
+        sshll           v0.4s, v16.4h, 6
+        sshll           v1.4s, v24.4h, 6
+        sub             v0.4s, v0.4s, v1.4s
+        sshll2          v1.4s, v16.8h, 6
+        sshll2          v2.4s, v24.8h, 6
+        sub             v1.4s, v1.4s, v2.4s
+
+        // v2,v3 = o1
+        smull           v2.4s, v20.4h, v14.h[1]
+        smlsl           v2.4s, v28.4h, v14.h[0]
+        smull2          v3.4s, v20.8h, v14.h[1]
+        smlsl2          v3.4s, v28.8h, v14.h[0]
+
+        // v4,v5 = e_8[1]
+        add             v4.4s, v0.4s, v2.4s
+        add             v5.4s, v1.4s, v3.4s
+
+        // v6,v7 = e_8[2]
+        sub             v6.4s, v0.4s, v2.4s
+        sub             v7.4s, v1.4s, v3.4s
+
+        // v0,v1 = o_8[1]
+        adr             x4, .Lo8transform1
+        ld1             {v15.4h}, [x4]
+        smull           v0.4s, v18.4h, v15.h[0]
+        smlal           v0.4s, v22.4h, v15.h[1]
+        smlal           v0.4s, v26.4h, v15.h[2]
+        smlal           v0.4s, v30.4h, v15.h[3]
+        smull2          v1.4s, v18.8h, v15.h[0]
+        smlal2          v1.4s, v22.8h, v15.h[1]
+        smlal2          v1.4s, v26.8h, v15.h[2]
+        smlal2          v1.4s, v30.8h, v15.h[3]
+
+        // v2,v3 = e_16[1]
+        add             v2.4s, v4.4s, v0.4s
+        add             v3.4s, v5.4s, v1.4s
+
+        // v8,v9 = o_16[1]
+        adr             x4, .Lo16transform1
+        ld1             {v15.8h}, [x4]
+        adr             x5, 11f
+        add             x5, x5, x6
+        movi            v8.4s, 0
+        movi            v9.4s, 0
+        br              x5
+11:     smlal2          v9.4s, v31.8h, v15.h[7]
+        smlal           v8.4s, v31.4h, v15.h[7]
+        smlal2          v9.4s, v29.8h, v15.h[6]
+        smlal           v8.4s, v29.4h, v15.h[6]
+        smlal2          v9.4s, v27.8h, v15.h[5]
+        smlal           v8.4s, v27.4h, v15.h[5]
+        smlal2          v9.4s, v25.8h, v15.h[4]
+        smlal           v8.4s, v25.4h, v15.h[4]
+        smlal2          v9.4s, v23.8h, v15.h[3]
+        smlal           v8.4s, v23.4h, v15.h[3]
+        smlal2          v9.4s, v21.8h, v15.h[2]
+        smlal           v8.4s, v21.4h, v15.h[2]
+        smlal2          v9.4s, v19.8h, v15.h[1]
+        smlal           v8.4s, v19.4h, v15.h[1]
+        smlal2          v9.4s, v17.8h, v15.h[0]
+        smlal           v8.4s, v17.4h, v15.h[0]
+
+        // tmp[1 * 16]
+        add             v10.4s, v2.4s, v8.4s
+        add             v11.4s, v3.4s, v9.4s
+        sqrshrn         v10.4h, v10.4s, 7
+        sqrshrn2        v10.8h, v11.4s, 7
+        add             x4, x2, 1 * 32
+        st1             {v10.8h}, [x4]
+
+        // tmp[14 * 16]
+        sub             v10.4s, v2.4s, v8.4s
+        sub             v11.4s, v3.4s, v9.4s
+        sqrshrn         v10.4h, v10.4s, 7
+        sqrshrn2        v10.8h, v11.4s, 7
+        add             x4, x2, 14 * 32
+        st1             {v10.8h}, [x4]
+
+        // v2,v3 = e_16[6]
+        sub             v2.4s, v4.4s, v0.4s
+        sub             v3.4s, v5.4s, v1.4s
+
+        // v8,v9 = o_16[6]
+        adr             x4, .Lo16transform6
+        ld1             {v15.8h}, [x4]
+        adr             x5, 12f
+        add             x5, x5, x6
+        movi            v8.4s, 0
+        movi            v9.4s, 0
+        br              x5
+12:     smlal2          v9.4s, v31.8h, v15.h[7]
+        smlal           v8.4s, v31.4h, v15.h[7]
+        smlal2          v9.4s, v29.8h, v15.h[6]
+        smlal           v8.4s, v29.4h, v15.h[6]
+        smlal2          v9.4s, v27.8h, v15.h[5]
+        smlal           v8.4s, v27.4h, v15.h[5]
+        smlal2          v9.4s, v25.8h, v15.h[4]
+        smlal           v8.4s, v25.4h, v15.h[4]
+        smlal2          v9.4s, v23.8h, v15.h[3]
+        smlal           v8.4s, v23.4h, v15.h[3]
+        smlal2          v9.4s, v21.8h, v15.h[2]
+        smlal           v8.4s, v21.4h, v15.h[2]
+        smlal2          v9.4s, v19.8h, v15.h[1]
+        smlal           v8.4s, v19.4h, v15.h[1]
+        smlal2          v9.4s, v17.8h, v15.h[0]
+        smlal           v8.4s, v17.4h, v15.h[0]
+
+        // tmp[6 * 16]
+        add             v10.4s, v2.4s, v8.4s
+        add             v11.4s, v3.4s, v9.4s
+        sqrshrn         v10.4h, v10.4s, 7
+        sqrshrn2        v10.8h, v11.4s, 7
+        add             x4, x2, 6 * 32
+        st1             {v10.8h}, [x4]
+
+        // tmp[9 * 16]
+        sub             v10.4s, v2.4s, v8.4s
+        sub             v11.4s, v3.4s, v9.4s
+        sqrshrn         v10.4h, v10.4s, 7
+        sqrshrn2        v10.8h, v11.4s, 7
+        add             x4, x2, 9 * 32
+        st1             {v10.8h}, [x4]
+
+        // v0,v1 = o_8[2]
+        adr             x4, .Lo8transform2
+        ld1             {v15.4h}, [x4]
+        smull           v0.4s, v18.4h, v15.h[0]
+        smlal           v0.4s, v22.4h, v15.h[1]
+        smlal           v0.4s, v26.4h, v15.h[2]
+        smlal           v0.4s, v30.4h, v15.h[3]
+        smull2          v1.4s, v18.8h, v15.h[0]
+        smlal2          v1.4s, v22.8h, v15.h[1]
+        smlal2          v1.4s, v26.8h, v15.h[2]
+        smlal2          v1.4s, v30.8h, v15.h[3]
+
+        // v2,v3 = e_16[2]
+        add             v2.4s, v6.4s, v0.4s
+        add             v3.4s, v7.4s, v1.4s
+
+        // v8,v9 = o_16[2]
+        adr             x4, .Lo16transform2
+        ld1             {v15.8h}, [x4]
+        adr             x5, 13f
+        add             x5, x5, x6
+        movi            v8.4s, 0
+        movi            v9.4s, 0
+        br              x5
+13:     smlal2          v9.4s, v31.8h, v15.h[7]
+        smlal           v8.4s, v31.4h, v15.h[7]
+        smlal2          v9.4s, v29.8h, v15.h[6]
+        smlal           v8.4s, v29.4h, v15.h[6]
+        smlal2          v9.4s, v27.8h, v15.h[5]
+        smlal           v8.4s, v27.4h, v15.h[5]
+        smlal2          v9.4s, v25.8h, v15.h[4]
+        smlal           v8.4s, v25.4h, v15.h[4]
+        smlal2          v9.4s, v23.8h, v15.h[3]
+        smlal           v8.4s, v23.4h, v15.h[3]
+        smlal2          v9.4s, v21.8h, v15.h[2]
+        smlal           v8.4s, v21.4h, v15.h[2]
+        smlal2          v9.4s, v19.8h, v15.h[1]
+        smlal           v8.4s, v19.4h, v15.h[1]
+        smlal2          v9.4s, v17.8h, v15.h[0]
+        smlal           v8.4s, v17.4h, v15.h[0]
+
+        // tmp[2 * 16]
+        add             v10.4s, v2.4s, v8.4s
+        add             v11.4s, v3.4s, v9.4s
+        sqrshrn         v10.4h, v10.4s, 7
+        sqrshrn2        v10.8h, v11.4s, 7
+        add             x4, x2, 2 * 32
+        st1             {v10.8h}, [x4]
+
+        // tmp[13 * 16]
+        sub             v10.4s, v2.4s, v8.4s
+        sub             v11.4s, v3.4s, v9.4s
+        sqrshrn         v10.4h, v10.4s, 7
+        sqrshrn2        v10.8h, v11.4s, 7
+        add             x4, x2, 13 * 32
+        st1             {v10.8h}, [x4]
+
+        // v2,v3 = e_16[5]
+        sub             v2.4s, v6.4s, v0.4s
+        sub             v3.4s, v7.4s, v1.4s
+
+        // v8,v9 = o_16[5]
+        adr             x4, .Lo16transform5
+        ld1             {v15.8h}, [x4]
+        adr             x5, 14f
+        add             x5, x5, x6
+        movi            v8.4s, 0
+        movi            v9.4s, 0
+        br              x5
+14:     smlal2          v9.4s, v31.8h, v15.h[7]
+        smlal           v8.4s, v31.4h, v15.h[7]
+        smlal2          v9.4s, v29.8h, v15.h[6]
+        smlal           v8.4s, v29.4h, v15.h[6]
+        smlal2          v9.4s, v27.8h, v15.h[5]
+        smlal           v8.4s, v27.4h, v15.h[5]
+        smlal2          v9.4s, v25.8h, v15.h[4]
+        smlal           v8.4s, v25.4h, v15.h[4]
+        smlal2          v9.4s, v23.8h, v15.h[3]
+        smlal           v8.4s, v23.4h, v15.h[3]
+        smlal2          v9.4s, v21.8h, v15.h[2]
+        smlal           v8.4s, v21.4h, v15.h[2]
+        smlal2          v9.4s, v19.8h, v15.h[1]
+        smlal           v8.4s, v19.4h, v15.h[1]
+        smlal2          v9.4s, v17.8h, v15.h[0]
+        smlal           v8.4s, v17.4h, v15.h[0]
+
+        // tmp[5 * 16]
+        add             v10.4s, v2.4s, v8.4s
+        add             v11.4s, v3.4s, v9.4s
+        sqrshrn         v10.4h, v10.4s, 7
+        sqrshrn2        v10.8h, v11.4s, 7
+        add             x4, x2, 5 * 32
+        st1             {v10.8h}, [x4]
+
+        // tmp[10 * 16]
+        sub             v10.4s, v2.4s, v8.4s
+        sub             v11.4s, v3.4s, v9.4s
+        sqrshrn         v10.4h, v10.4s, 7
+        sqrshrn2        v10.8h, v11.4s, 7
+        add             x4, x2, 10 * 32
+        st1             {v10.8h}, [x4]
+
+        add             x2, x2, 16
+        add             x3, x3, 1
+        cmp             x3, 2
+        b.lo            1b
+
+
+        // horizontal transform
+        adr             x4, .Leo_coeff
+        ld1             {v16.8h, v17.8h, v18.8h, v19.8h}, [x4], 64
+        ld1             {v20.8h, v21.8h, v22.8h, v23.8h}, [x4], 64
+        ld1             {v24.8h, v25.8h, v26.8h, v27.8h}, [x4], 64
+        // o_16 jump address
+        mov             x4, 64
+        bic             x5, x1, 1
+        subs            x4, x4, x5, lsl 2
+        csel            x4, x4, xzr, hs
+        adr             x5, 15f
+        add             x5, x5, x4
+
+        mov             x3, 16
+14:     ld1             {v6.8h, v7.8h}, [x0]
+
+        // v2 = e_8
+        smull           v2.4s, v16.4h, v6.h[0]
+        smlal2          v2.4s, v16.8h, v6.h[4]
+        smlal           v2.4s, v17.4h, v7.h[0]
+        smlal2          v2.4s, v17.8h, v7.h[4]
+
+        // v3 = o_8
+        smull           v3.4s, v18.4h, v6.h[2]
+        smlal2          v3.4s, v18.8h, v6.h[6]
+        smlal           v3.4s, v19.4h, v7.h[2]
+        smlal2          v3.4s, v19.8h, v7.h[6]
+
+        // v0,v1 = e_16
+        add             v0.4s, v2.4s, v3.4s
+        sub             v2.4s, v2.4s, v3.4s
+        mov             v1.d[0], v2.d[1]
+        mov             v1.d[1], v2.d[0]
+        rev64           v1.4s, v1.4s
+
+        // v2,v3 = o_16
+        movi            v2.4s, 0
+        movi            v3.4s, 0
+        br              x5
+15:     smlal           v2.4s, v27.4h, v7.h[7]
+        smlal2          v3.4s, v27.8h, v7.h[7]
+        smlal           v2.4s, v26.4h, v7.h[5]
+        smlal2          v3.4s, v26.8h, v7.h[5]
+        smlal           v2.4s, v25.4h, v7.h[3]
+        smlal2          v3.4s, v25.8h, v7.h[3]
+        smlal           v2.4s, v24.4h, v7.h[1]
+        smlal2          v3.4s, v24.8h, v7.h[1]
+        smlal           v2.4s, v23.4h, v6.h[7]
+        smlal2          v3.4s, v23.8h, v6.h[7]
+        smlal           v2.4s, v22.4h, v6.h[5]
+        smlal2          v3.4s, v22.8h, v6.h[5]
+        smlal           v2.4s, v21.4h, v6.h[3]
+        smlal2          v3.4s, v21.8h, v6.h[3]
+        smlal           v2.4s, v20.4h, v6.h[1]
+        smlal2          v3.4s, v20.8h, v6.h[1]
+
+        // coeff
+        add             v4.4s, v0.4s, v2.4s
+        add             v5.4s, v1.4s, v3.4s
+        sub             v6.4s, v0.4s, v2.4s
+        sub             v7.4s, v1.4s, v3.4s
+        sqrshrn         v4.4h, v4.4s, 12
+        sqrshrn2        v4.8h, v5.4s, 12
+        sqrshrn         v6.4h, v6.4s, 12
+        sqrshrn2        v6.8h, v7.4s, 12
+        mov             v5.d[0], v6.d[1]
+        mov             v5.d[1], v6.d[0]
+        rev64           v5.8h, v5.8h
+        st1             {v4.8h, v5.8h}, [x0], 32
+        subs            x3, x3, 1
+        b.ne            14b
+
+        ld1             {v14.16b, v15.16b}, [sp], 32
+        ld1             {v8.16b, v9.16b, v10.16b, v11.16b}, [sp], 64
+        ret
+endfunc
diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c
index 2002530266..612ebb9541 100644
--- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -45,6 +45,7 @@ void ff_hevc_idct_8x8_8_neon(int16_t *coeffs, int col_limit);
 void ff_hevc_idct_8x8_10_neon(int16_t *coeffs, int col_limit);
 void ff_hevc_idct_16x16_8_neon(int16_t *coeffs, int col_limit);
 void ff_hevc_idct_16x16_10_neon(int16_t *coeffs, int col_limit);
+void ff_hevc_idct_16x16_8_neon_new(int16_t *coeffs, int col_limit);
 void ff_hevc_idct_4x4_dc_8_neon(int16_t *coeffs);
 void ff_hevc_idct_8x8_dc_8_neon(int16_t *coeffs);
 void ff_hevc_idct_16x16_dc_8_neon(int16_t *coeffs);
@@ -72,7 +73,7 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
         c->add_residual[2] = ff_hevc_add_residual_16x16_8_neon;
         c->add_residual[3] = ff_hevc_add_residual_32x32_8_neon;
         c->idct[1] = ff_hevc_idct_8x8_8_neon;
-        c->idct[2] = ff_hevc_idct_16x16_8_neon;
+        c->idct[2] = ff_hevc_idct_16x16_8_neon_new;
         c->idct_dc[0] = ff_hevc_idct_4x4_dc_8_neon;
         c->idct_dc[1] = ff_hevc_idct_8x8_dc_8_neon;
         c->idct_dc[2] = ff_hevc_idct_16x16_dc_8_neon;
-- 
2.32.0 (Apple Git-132)

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".