Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
* [FFmpeg-devel] [PATCH 1/2] lavc/aarch64: new 8-bit hevc 16x16 idct
@ 2022-06-23 12:23 J. Dekker
  2022-06-23 12:23 ` [FFmpeg-devel] [PATCH 2/2] lavc/aarch64: add 8-bit hevc 32x32 idct J. Dekker
  2022-08-09 12:15 ` [FFmpeg-devel] [PATCH 1/2] lavc/aarch64: new 8-bit hevc 16x16 idct Martin Storsjö
  0 siblings, 2 replies; 3+ messages in thread
From: J. Dekker @ 2022-06-23 12:23 UTC (permalink / raw)
  To: ffmpeg-devel

old:
hevc_idct_16x16_8_c: 5366.2
hevc_idct_16x16_8_neon: 1493.2

new:
hevc_idct_16x16_8_c: 5363.2
hevc_idct_16x16_8_neon: 943.5

Co-developed-by: Rafal Dabrowa <fatwildcat@gmail.com>
Signed-off-by: J. Dekker <jdek@itanimul.li>
---
 libavcodec/aarch64/hevcdsp_idct_neon.S    | 666 ++++++++++++++++++++++
 libavcodec/aarch64/hevcdsp_init_aarch64.c |   3 +-
 2 files changed, 668 insertions(+), 1 deletion(-)

 This idct is significantly faster than the one we currently have; I
 suspect it's for a couple of reasons: 1) it's only written for 8-bit,
 2) it's unrolled significantly more. It comes at a hefty cost of
 roughly 2.25x the object size. I'm wondering if this idct is
 salvageable, or whether the one we have should just be improved
 instead.
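
 For anyone reading the assembly below: the e_8/o_8/e_16/o_16 names in
 the comments follow the usual even/odd (partial butterfly) split of the
 HEVC inverse transform, i.e. the same structure as the generic C
 version. As a rough, untested reference for one column of the 16-point
 pass (names like idct16_col/o8tab/o16tab are just for illustration;
 the 16-bit clipping and the col_limit zero-skipping are left out):

    #include <stdint.h>

    /* Odd-part coefficient rows, copied from .Lo8transform0-3 and
     * .Lo16transform0-7 below. */
    static const int o8tab[4][4] = {
        { 89,  75,  50,  18 }, { 75, -18, -89, -50 },
        { 50, -89,  18,  75 }, { 18, -50,  75, -89 },
    };
    static const int o16tab[8][8] = {
        { 90,  87,  80,  70,  57,  43,  25,   9 },
        { 87,  57,   9, -43, -80, -90, -70, -25 },
        { 80,   9, -70, -87, -25,  57,  90,  43 },
        { 70, -43, -87,   9,  90,  25, -80, -57 },
        { 57, -80, -25,  90,  -9, -87,  43,  70 },
        { 43, -90,  57,  25, -87,  70,   9, -80 },
        { 25, -70,  90, -80,  43,   9, -57,  87 },
        {  9, -25,  43, -57,  70, -80,  87, -90 },
    };

    /* One 16-point column; stride is 16 for the vertical pass on a
     * 16x16 block, shift is 7 for the first pass and 12 for the second
     * at 8-bit depth (matching the sqrshrn #7 / #12 in the assembly). */
    static void idct16_col(int16_t *coeffs, int stride, int shift)
    {
        int e8[4], o8[4], e16[8], o16[8];
        const int rnd = 1 << (shift - 1);

        /* 4-point even/odd part from rows 0, 4, 8, 12 ("e0/o0" etc.). */
        int e0 = 64 * coeffs[0 * stride] + 64 * coeffs[8  * stride];
        int e1 = 64 * coeffs[0 * stride] - 64 * coeffs[8  * stride];
        int o0 = 83 * coeffs[4 * stride] + 36 * coeffs[12 * stride];
        int o1 = 36 * coeffs[4 * stride] - 83 * coeffs[12 * stride];
        e8[0] = e0 + o0; e8[1] = e1 + o1;
        e8[2] = e1 - o1; e8[3] = e0 - o0;

        /* o_8 from rows 2, 6, 10, 14; e_16 by butterflying e_8/o_8. */
        for (int i = 0; i < 4; i++) {
            o8[i] = 0;
            for (int j = 0; j < 4; j++)
                o8[i] += o8tab[i][j] * coeffs[(2 + 4 * j) * stride];
        }
        for (int i = 0; i < 4; i++) {
            e16[i]     = e8[i] + o8[i];
            e16[7 - i] = e8[i] - o8[i];
        }

        /* o_16 from the odd rows 1, 3, ..., 15. */
        for (int i = 0; i < 8; i++) {
            o16[i] = 0;
            for (int j = 0; j < 8; j++)
                o16[i] += o16tab[i][j] * coeffs[(1 + 2 * j) * stride];
        }

        /* Final butterfly and rounding, as the tmp[n * 16] stores do. */
        for (int i = 0; i < 8; i++) {
            coeffs[i * stride]        = (e16[i] + o16[i] + rnd) >> shift;
            coeffs[(15 - i) * stride] = (e16[i] - o16[i] + rnd) >> shift;
        }
    }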

diff --git a/libavcodec/aarch64/hevcdsp_idct_neon.S b/libavcodec/aarch64/hevcdsp_idct_neon.S
index 0869431294..784bae33b3 100644
--- a/libavcodec/aarch64/hevcdsp_idct_neon.S
+++ b/libavcodec/aarch64/hevcdsp_idct_neon.S
@@ -618,3 +618,669 @@ idct_dc 16, 10
 
 idct_dc 32, 8
 idct_dc 32, 10
+
+// WIP
+
+.Lo0_coeff:     .hword  83, 36, 0, 0, 0, 0, 0, 0
+.Lo8transform0: .hword  89,  75,  50,  18               // transform[4,12,20,28][0]
+.Lo8transform1: .hword  75, -18, -89, -50
+.Lo8transform2: .hword  50, -89,  18,  75
+.Lo8transform3: .hword  18, -50,  75, -89
+
+.LimitMask:
+        .hword          0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0
+        .hword          0xffff,      0,      0,      0,      0, 0, 0, 0
+
+.Leo_coeff:
+        .hword          64,  64,  64,  64,  83,  36, -36, -83
+        .hword          64, -64, -64,  64,  36, -83,  83, -36
+        .hword          89,  75,  50,  18,  75, -18, -89, -50   // transform[4,12][0-3]
+        .hword          50, -89,  18,  75,  18, -50,  75, -89   // transform[20,28][0-3]
+.Lo16transform0: .hword 90,  87,  80,  70,  57,  43,  25,   9   // transform[2][0-7], also transform[2,6,10..][0]
+.Lo16transform1: .hword 87,  57,   9, -43, -80, -90, -70, -25   // transform[6][0-7]
+.Lo16transform2: .hword 80,   9, -70, -87, -25,  57,  90,  43   // transform[10][0-7]
+.Lo16transform3: .hword 70, -43, -87,   9,  90,  25, -80, -57   // transform[14][0-7]
+.Lo16transform4: .hword 57, -80, -25,  90,  -9, -87,  43,  70   // transform[18][0-7]
+.Lo16transform5: .hword 43, -90,  57,  25, -87,  70,   9, -80   // transform[22][0-7]
+.Lo16transform6: .hword 25, -70,  90, -80,  43,   9, -57,  87   // transform[26][0-7]
+.Lo16transform7: .hword  9, -25,  43, -57,  70, -80,  87, -90   // transform[30][0-7]
+
+// void ff_hevc_idct_16x16_8_neon(int16_t *coeffs, int col_limit)
+function ff_hevc_idct_16x16_8_neon_new, export=1
+        sub             sp, sp, 64
+        st1             {v8.16b, v9.16b, v10.16b, v11.16b}, [sp]
+        sub             sp, sp, 32
+        st1             {v14.16b, v15.16b}, [sp]
+        mov             x3, 0
+        mov             x2, x0
+1:      mov             x4, x2
+        mov             x5, 32
+        ld1             {v16.8h}, [x4], x5
+        ld1             {v17.8h}, [x4], x5
+        ld1             {v18.8h}, [x4], x5
+        ld1             {v19.8h}, [x4], x5
+        ld1             {v20.8h}, [x4], x5
+        ld1             {v21.8h}, [x4], x5
+        ld1             {v22.8h}, [x4], x5
+        ld1             {v23.8h}, [x4], x5
+        ld1             {v24.8h}, [x4], x5
+        ld1             {v25.8h}, [x4], x5
+        ld1             {v26.8h}, [x4], x5
+        ld1             {v27.8h}, [x4], x5
+        ld1             {v28.8h}, [x4], x5
+        ld1             {v29.8h}, [x4], x5
+        ld1             {v30.8h}, [x4], x5
+        ld1             {v31.8h}, [x4], x5
+        cmp             x1, 12
+        b.hs            5f
+        // limit2 below 16
+        bic             x4, x1, 1
+        adr             x5, .LimitMask
+        cbnz            x3, 3f
+        // columns 0 .. 7 - cleanup of indexes 5 .. 7
+        ld1             {v0.8h}, [x5]
+        adr             x5, 2f
+        add             x5, x5, x4, lsl 2
+        add             x5, x5, x4, lsl 1
+        br              x5
+2:      and             v17.16b, v17.16b, v0.16b    // col_limit 0..1 -> limit2 == 4..5
+        and             v19.16b, v19.16b, v0.16b
+        b               5f
+        and             v19.16b, v19.16b, v0.16b    // col_limit 2..3 -> limit2 == 6..7
+        and             v21.16b, v21.16b, v0.16b
+        b               5f
+        and             v21.16b, v21.16b, v0.16b    // col_limit 4..5 -> limit2 == 8..9
+        and             v23.16b, v23.16b, v0.16b
+        b               5f
+        and             v23.16b, v23.16b, v0.16b    // col_limit 6..7 -> limit2 == 10..11
+        and             v25.16b, v25.16b, v0.16b
+        b               5f
+        and             v25.16b, v25.16b, v0.16b    // col_limit 8..9 -> limit2 == 12..13
+        and             v27.16b, v27.16b, v0.16b
+        b               5f
+        and             v27.16b, v27.16b, v0.16b    // col_limit 10..11 -> limit2 == 14..15
+        and             v29.16b, v29.16b, v0.16b
+        b               5f
+        // columns 8 .. 15
+3:      subs            x4, x4, 2
+        b.lo            5f
+        ld1             {v0.8h, v1.8h}, [x5]
+        adr             x5, 4f
+        add             x5, x5, x4, lsl 3
+        add             x5, x5, x4, lsl 1
+        br              x5
+4:      and             v17.16b, v17.16b, v1.16b    // col_limit 2..3 -> limit2 == 2..3
+        b               5f
+        nop
+        nop
+        nop
+        and             v17.16b, v17.16b, v1.16b    // col_limit 4..5 -> limit2 == 4..5
+        and             v19.16b, v19.16b, v1.16b
+        b               5f
+        nop
+        nop
+        and             v17.16b, v17.16b, v0.16b    // col_limit 6..7 -> limit2 == 6..7
+        and             v19.16b, v19.16b, v1.16b
+        and             v21.16b, v21.16b, v1.16b
+        b               5f
+        nop
+        and             v17.16b, v17.16b, v0.16b    // col_limit 8..9 -> limit2 == 8..9
+        and             v19.16b, v19.16b, v0.16b
+        and             v21.16b, v21.16b, v1.16b
+        and             v23.16b, v23.16b, v1.16b
+        b               5f
+        and             v19.16b, v19.16b, v0.16b    // col_limit 10..11 -> limit2 == 10..11
+        and             v21.16b, v21.16b, v0.16b
+        and             v23.16b, v23.16b, v1.16b
+        and             v25.16b, v25.16b, v1.16b
+        b               5f
+5:      adr             x4, .Lo0_coeff
+        ld1             {v14.8h}, [x4]
+
+        // v0,v1 = e0
+        sshll           v0.4s, v16.4h, 6
+        sshll           v1.4s, v24.4h, 6
+        add             v0.4s, v0.4s, v1.4s
+        sshll2          v1.4s, v16.8h, 6
+        sshll2          v2.4s, v24.8h, 6
+        add             v1.4s, v1.4s, v2.4s
+
+        // v2,v3 = o0
+        smull           v2.4s, v20.4h, v14.h[0]
+        smlal           v2.4s, v28.4h, v14.h[1]
+        smull2          v3.4s, v20.8h, v14.h[0]
+        smlal2          v3.4s, v28.8h, v14.h[1]
+
+        // v4,v5 = e_8[0]
+        add             v4.4s, v0.4s, v2.4s
+        add             v5.4s, v1.4s, v3.4s
+
+        // v6,v7 = e_8[3]
+        sub             v6.4s, v0.4s, v2.4s
+        sub             v7.4s, v1.4s, v3.4s
+
+
+        // v0,v1 = o_8[0]
+        adr             x4, .Lo8transform0
+        ld1             {v15.4h}, [x4]
+        smull           v0.4s, v18.4h, v15.h[0]
+        smlal           v0.4s, v22.4h, v15.h[1]
+        smlal           v0.4s, v26.4h, v15.h[2]
+        smlal           v0.4s, v30.4h, v15.h[3]
+        smull2          v1.4s, v18.8h, v15.h[0]
+        smlal2          v1.4s, v22.8h, v15.h[1]
+        smlal2          v1.4s, v26.8h, v15.h[2]
+        smlal2          v1.4s, v30.8h, v15.h[3]
+
+        // v2,v3 = e_16[0]
+        add             v2.4s, v4.4s, v0.4s
+        add             v3.4s, v5.4s, v1.4s
+
+        // v8,v9 = o_16[0]
+        adr             x4, .Lo16transform0
+        ld1             {v15.8h}, [x4]
+
+        mov             x5, 16
+        cmp             x1, 12
+        b.hs            6f
+        add             x5, x1, 4
+        bic             x5, x5, 1
+        cbz             x3, 6f
+        orr             x5, x1, 1
+        subs            x5, x5, 2
+        csel            x5, x5, xzr, hs
+6:      mov             x4, 64
+        sub             x6, x4, x5, lsl 2
+        adr             x5, 7f
+        add             x5, x5, x6
+        movi            v8.4s, 0
+        movi            v9.4s, 0
+        br              x5
+7:      smlal2          v9.4s, v31.8h, v15.h[7]
+        smlal           v8.4s, v31.4h, v15.h[7]
+        smlal2          v9.4s, v29.8h, v15.h[6]
+        smlal           v8.4s, v29.4h, v15.h[6]
+        smlal2          v9.4s, v27.8h, v15.h[5]
+        smlal           v8.4s, v27.4h, v15.h[5]
+        smlal2          v9.4s, v25.8h, v15.h[4]
+        smlal           v8.4s, v25.4h, v15.h[4]
+        smlal2          v9.4s, v23.8h, v15.h[3]
+        smlal           v8.4s, v23.4h, v15.h[3]
+        smlal2          v9.4s, v21.8h, v15.h[2]
+        smlal           v8.4s, v21.4h, v15.h[2]
+        smlal2          v9.4s, v19.8h, v15.h[1]
+        smlal           v8.4s, v19.4h, v15.h[1]
+        smlal2          v9.4s, v17.8h, v15.h[0]
+        smlal           v8.4s, v17.4h, v15.h[0]
+
+        // tmp[0 * 16]
+        add             v10.4s, v2.4s, v8.4s
+        add             v11.4s, v3.4s, v9.4s
+        sqrshrn         v10.4h, v10.4s, 7
+        sqrshrn2        v10.8h, v11.4s, 7
+        st1             {v10.8h}, [x2]
+
+        // tmp[15 * 16]
+        sub             v10.4s, v2.4s, v8.4s
+        sub             v11.4s, v3.4s, v9.4s
+        sqrshrn         v10.4h, v10.4s, 7
+        sqrshrn2        v10.8h, v11.4s, 7
+        add             x4, x2, 15 * 32
+        st1             {v10.8h}, [x4]
+
+        // v2,v3 = e_16[7]
+        sub             v2.4s, v4.4s, v0.4s
+        sub             v3.4s, v5.4s, v1.4s
+
+        // v8,v9 = o_16[7]
+        adr             x4, .Lo16transform7
+        ld1             {v15.8h}, [x4]
+        adr             x5, 8f
+        add             x5, x5, x6
+        movi            v8.4s, 0
+        movi            v9.4s, 0
+        br              x5
+8:      smlal2          v9.4s, v31.8h, v15.h[7]
+        smlal           v8.4s, v31.4h, v15.h[7]
+        smlal2          v9.4s, v29.8h, v15.h[6]
+        smlal           v8.4s, v29.4h, v15.h[6]
+        smlal2          v9.4s, v27.8h, v15.h[5]
+        smlal           v8.4s, v27.4h, v15.h[5]
+        smlal2          v9.4s, v25.8h, v15.h[4]
+        smlal           v8.4s, v25.4h, v15.h[4]
+        smlal2          v9.4s, v23.8h, v15.h[3]
+        smlal           v8.4s, v23.4h, v15.h[3]
+        smlal2          v9.4s, v21.8h, v15.h[2]
+        smlal           v8.4s, v21.4h, v15.h[2]
+        smlal2          v9.4s, v19.8h, v15.h[1]
+        smlal           v8.4s, v19.4h, v15.h[1]
+        smlal2          v9.4s, v17.8h, v15.h[0]
+        smlal           v8.4s, v17.4h, v15.h[0]
+
+        // tmp[7 * 16]
+        add             v10.4s, v2.4s, v8.4s
+        add             v11.4s, v3.4s, v9.4s
+        sqrshrn         v10.4h, v10.4s, 7
+        sqrshrn2        v10.8h, v11.4s, 7
+        add             x4, x2, 7 * 32
+        st1             {v10.8h}, [x4]
+
+        // tmp[8 * 16]
+        sub             v10.4s, v2.4s, v8.4s
+        sub             v11.4s, v3.4s, v9.4s
+        sqrshrn         v10.4h, v10.4s, 7
+        sqrshrn2        v10.8h, v11.4s, 7
+        add             x4, x2, 8 * 32
+        st1             {v10.8h}, [x4]
+
+        // v0,v1 = o_8[3]
+        adr             x4, .Lo8transform3
+        ld1             {v15.4h}, [x4]
+        smull           v0.4s, v18.4h, v15.h[0]
+        smlal           v0.4s, v22.4h, v15.h[1]
+        smlal           v0.4s, v26.4h, v15.h[2]
+        smlal           v0.4s, v30.4h, v15.h[3]
+        smull2          v1.4s, v18.8h, v15.h[0]
+        smlal2          v1.4s, v22.8h, v15.h[1]
+        smlal2          v1.4s, v26.8h, v15.h[2]
+        smlal2          v1.4s, v30.8h, v15.h[3]
+
+        // v2,v3 = e_16[3]
+        add             v2.4s, v6.4s, v0.4s
+        add             v3.4s, v7.4s, v1.4s
+
+        // v8,v9 = o_16[3]
+        adr             x4, .Lo16transform3
+        ld1             {v15.8h}, [x4]
+        adr             x5, 9f
+        add             x5, x5, x6
+        movi            v8.4s, 0
+        movi            v9.4s, 0
+        br              x5
+9:      smlal2          v9.4s, v31.8h, v15.h[7]
+        smlal           v8.4s, v31.4h, v15.h[7]
+        smlal2          v9.4s, v29.8h, v15.h[6]
+        smlal           v8.4s, v29.4h, v15.h[6] // 13
+        smlal2          v9.4s, v27.8h, v15.h[5]
+        smlal           v8.4s, v27.4h, v15.h[5] // 11
+        smlal2          v9.4s, v25.8h, v15.h[4]
+        smlal           v8.4s, v25.4h, v15.h[4] // 9
+        smlal2          v9.4s, v23.8h, v15.h[3]
+        smlal           v8.4s, v23.4h, v15.h[3] // 7
+        smlal2          v9.4s, v21.8h, v15.h[2]
+        smlal           v8.4s, v21.4h, v15.h[2] // 5
+        smlal2          v9.4s, v19.8h, v15.h[1]
+        smlal           v8.4s, v19.4h, v15.h[1] // 3
+        smlal2          v9.4s, v17.8h, v15.h[0]
+        smlal           v8.4s, v17.4h, v15.h[0] // 1
+
+        // tmp[3 * 16]
+        add             v10.4s, v2.4s, v8.4s
+        add             v11.4s, v3.4s, v9.4s
+        sqrshrn         v10.4h, v10.4s, 7
+        sqrshrn2        v10.8h, v11.4s, 7
+        add             x4, x2, 3 * 32
+        st1             {v10.8h}, [x4]
+
+        // tmp[12 * 16]
+        sub             v10.4s, v2.4s, v8.4s
+        sub             v11.4s, v3.4s, v9.4s
+        sqrshrn         v10.4h, v10.4s, 7
+        sqrshrn2        v10.8h, v11.4s, 7
+        add             x4, x2, 12 * 32
+        st1             {v10.8h}, [x4]
+
+        // v2,v3 = e_16[4]
+        sub             v2.4s, v6.4s, v0.4s
+        sub             v3.4s, v7.4s, v1.4s
+
+        // v8,v9 = o_16[4]
+        adr             x4, .Lo16transform4
+        ld1             {v15.8h}, [x4]
+        adr             x5, 10f
+        add             x5, x5, x6
+        movi            v8.4s, 0
+        movi            v9.4s, 0
+        br              x5
+10:     smlal2          v9.4s, v31.8h, v15.h[7]
+        smlal           v8.4s, v31.4h, v15.h[7]
+        smlal2          v9.4s, v29.8h, v15.h[6]
+        smlal           v8.4s, v29.4h, v15.h[6]
+        smlal2          v9.4s, v27.8h, v15.h[5]
+        smlal           v8.4s, v27.4h, v15.h[5]
+        smlal2          v9.4s, v25.8h, v15.h[4]
+        smlal           v8.4s, v25.4h, v15.h[4]
+        smlal2          v9.4s, v23.8h, v15.h[3]
+        smlal           v8.4s, v23.4h, v15.h[3]
+        smlal2          v9.4s, v21.8h, v15.h[2]
+        smlal           v8.4s, v21.4h, v15.h[2]
+        smlal2          v9.4s, v19.8h, v15.h[1]
+        smlal           v8.4s, v19.4h, v15.h[1]
+        smlal2          v9.4s, v17.8h, v15.h[0]
+        smlal           v8.4s, v17.4h, v15.h[0]
+
+        // tmp[4 * 16]
+        add             v10.4s, v2.4s, v8.4s
+        add             v11.4s, v3.4s, v9.4s
+        sqrshrn         v10.4h, v10.4s, 7
+        sqrshrn2        v10.8h, v11.4s, 7
+        add             x4, x2, 4 * 32
+        st1             {v10.8h}, [x4]
+
+        // tmp[11 * 16]
+        sub             v10.4s, v2.4s, v8.4s
+        sub             v11.4s, v3.4s, v9.4s
+        sqrshrn         v10.4h, v10.4s, 7
+        sqrshrn2        v10.8h, v11.4s, 7
+        add             x4, x2, 11 * 32
+        st1             {v10.8h}, [x4]
+
+
+        // v0,v1 = e1
+        sshll           v0.4s, v16.4h, 6
+        sshll           v1.4s, v24.4h, 6
+        sub             v0.4s, v0.4s, v1.4s
+        sshll2          v1.4s, v16.8h, 6
+        sshll2          v2.4s, v24.8h, 6
+        sub             v1.4s, v1.4s, v2.4s
+
+        // v2,v3 = o1
+        smull           v2.4s, v20.4h, v14.h[1]
+        smlsl           v2.4s, v28.4h, v14.h[0]
+        smull2          v3.4s, v20.8h, v14.h[1]
+        smlsl2          v3.4s, v28.8h, v14.h[0]
+
+        // v4,v5 = e_8[1]
+        add             v4.4s, v0.4s, v2.4s
+        add             v5.4s, v1.4s, v3.4s
+
+        // v6,v7 = e_8[2]
+        sub             v6.4s, v0.4s, v2.4s
+        sub             v7.4s, v1.4s, v3.4s
+
+        // v0,v1 = o_8[1]
+        adr             x4, .Lo8transform1
+        ld1             {v15.4h}, [x4]
+        smull           v0.4s, v18.4h, v15.h[0]
+        smlal           v0.4s, v22.4h, v15.h[1]
+        smlal           v0.4s, v26.4h, v15.h[2]
+        smlal           v0.4s, v30.4h, v15.h[3]
+        smull2          v1.4s, v18.8h, v15.h[0]
+        smlal2          v1.4s, v22.8h, v15.h[1]
+        smlal2          v1.4s, v26.8h, v15.h[2]
+        smlal2          v1.4s, v30.8h, v15.h[3]
+
+        // v2,v3 = e_16[1]
+        add             v2.4s, v4.4s, v0.4s
+        add             v3.4s, v5.4s, v1.4s
+
+        // v8,v9 = o_16[1]
+        adr             x4, .Lo16transform1
+        ld1             {v15.8h}, [x4]
+        adr             x5, 11f
+        add             x5, x5, x6
+        movi            v8.4s, 0
+        movi            v9.4s, 0
+        br              x5
+11:     smlal2          v9.4s, v31.8h, v15.h[7]
+        smlal           v8.4s, v31.4h, v15.h[7]
+        smlal2          v9.4s, v29.8h, v15.h[6]
+        smlal           v8.4s, v29.4h, v15.h[6]
+        smlal2          v9.4s, v27.8h, v15.h[5]
+        smlal           v8.4s, v27.4h, v15.h[5]
+        smlal2          v9.4s, v25.8h, v15.h[4]
+        smlal           v8.4s, v25.4h, v15.h[4]
+        smlal2          v9.4s, v23.8h, v15.h[3]
+        smlal           v8.4s, v23.4h, v15.h[3]
+        smlal2          v9.4s, v21.8h, v15.h[2]
+        smlal           v8.4s, v21.4h, v15.h[2]
+        smlal2          v9.4s, v19.8h, v15.h[1]
+        smlal           v8.4s, v19.4h, v15.h[1]
+        smlal2          v9.4s, v17.8h, v15.h[0]
+        smlal           v8.4s, v17.4h, v15.h[0]
+
+        // tmp[1 * 16]
+        add             v10.4s, v2.4s, v8.4s
+        add             v11.4s, v3.4s, v9.4s
+        sqrshrn         v10.4h, v10.4s, 7
+        sqrshrn2        v10.8h, v11.4s, 7
+        add             x4, x2, 1 * 32
+        st1             {v10.8h}, [x4]
+
+        // tmp[14 * 16]
+        sub             v10.4s, v2.4s, v8.4s
+        sub             v11.4s, v3.4s, v9.4s
+        sqrshrn         v10.4h, v10.4s, 7
+        sqrshrn2        v10.8h, v11.4s, 7
+        add             x4, x2, 14 * 32
+        st1             {v10.8h}, [x4]
+
+        // v2,v3 = e_16[6]
+        sub             v2.4s, v4.4s, v0.4s
+        sub             v3.4s, v5.4s, v1.4s
+
+        // v8,v9 = o_16[6]
+        adr             x4, .Lo16transform6
+        ld1             {v15.8h}, [x4]
+        adr             x5, 12f
+        add             x5, x5, x6
+        movi            v8.4s, 0
+        movi            v9.4s, 0
+        br              x5
+12:     smlal2          v9.4s, v31.8h, v15.h[7]
+        smlal           v8.4s, v31.4h, v15.h[7]
+        smlal2          v9.4s, v29.8h, v15.h[6]
+        smlal           v8.4s, v29.4h, v15.h[6]
+        smlal2          v9.4s, v27.8h, v15.h[5]
+        smlal           v8.4s, v27.4h, v15.h[5]
+        smlal2          v9.4s, v25.8h, v15.h[4]
+        smlal           v8.4s, v25.4h, v15.h[4]
+        smlal2          v9.4s, v23.8h, v15.h[3]
+        smlal           v8.4s, v23.4h, v15.h[3]
+        smlal2          v9.4s, v21.8h, v15.h[2]
+        smlal           v8.4s, v21.4h, v15.h[2]
+        smlal2          v9.4s, v19.8h, v15.h[1]
+        smlal           v8.4s, v19.4h, v15.h[1]
+        smlal2          v9.4s, v17.8h, v15.h[0]
+        smlal           v8.4s, v17.4h, v15.h[0]
+
+        // tmp[6 * 16]
+        add             v10.4s, v2.4s, v8.4s
+        add             v11.4s, v3.4s, v9.4s
+        sqrshrn         v10.4h, v10.4s, 7
+        sqrshrn2        v10.8h, v11.4s, 7
+        add             x4, x2, 6 * 32
+        st1             {v10.8h}, [x4]
+
+        // tmp[9 * 16]
+        sub             v10.4s, v2.4s, v8.4s
+        sub             v11.4s, v3.4s, v9.4s
+        sqrshrn         v10.4h, v10.4s, 7
+        sqrshrn2        v10.8h, v11.4s, 7
+        add             x4, x2, 9 * 32
+        st1             {v10.8h}, [x4]
+
+        // v0,v1 = o_8[2]
+        adr             x4, .Lo8transform2
+        ld1             {v15.4h}, [x4]
+        smull           v0.4s, v18.4h, v15.h[0]
+        smlal           v0.4s, v22.4h, v15.h[1]
+        smlal           v0.4s, v26.4h, v15.h[2]
+        smlal           v0.4s, v30.4h, v15.h[3]
+        smull2          v1.4s, v18.8h, v15.h[0]
+        smlal2          v1.4s, v22.8h, v15.h[1]
+        smlal2          v1.4s, v26.8h, v15.h[2]
+        smlal2          v1.4s, v30.8h, v15.h[3]
+
+        // v2,v3 = e_16[2]
+        add             v2.4s, v6.4s, v0.4s
+        add             v3.4s, v7.4s, v1.4s
+
+        // v8,v9 = o_16[2]
+        adr             x4, .Lo16transform2
+        ld1             {v15.8h}, [x4]
+        adr             x5, 13f
+        add             x5, x5, x6
+        movi            v8.4s, 0
+        movi            v9.4s, 0
+        br              x5
+13:     smlal2          v9.4s, v31.8h, v15.h[7]
+        smlal           v8.4s, v31.4h, v15.h[7]
+        smlal2          v9.4s, v29.8h, v15.h[6]
+        smlal           v8.4s, v29.4h, v15.h[6]
+        smlal2          v9.4s, v27.8h, v15.h[5]
+        smlal           v8.4s, v27.4h, v15.h[5]
+        smlal2          v9.4s, v25.8h, v15.h[4]
+        smlal           v8.4s, v25.4h, v15.h[4]
+        smlal2          v9.4s, v23.8h, v15.h[3]
+        smlal           v8.4s, v23.4h, v15.h[3]
+        smlal2          v9.4s, v21.8h, v15.h[2]
+        smlal           v8.4s, v21.4h, v15.h[2]
+        smlal2          v9.4s, v19.8h, v15.h[1]
+        smlal           v8.4s, v19.4h, v15.h[1]
+        smlal2          v9.4s, v17.8h, v15.h[0]
+        smlal           v8.4s, v17.4h, v15.h[0]
+
+        // tmp[2 * 16]
+        add             v10.4s, v2.4s, v8.4s
+        add             v11.4s, v3.4s, v9.4s
+        sqrshrn         v10.4h, v10.4s, 7
+        sqrshrn2        v10.8h, v11.4s, 7
+        add             x4, x2, 2 * 32
+        st1             {v10.8h}, [x4]
+
+        // tmp[13 * 16]
+        sub             v10.4s, v2.4s, v8.4s
+        sub             v11.4s, v3.4s, v9.4s
+        sqrshrn         v10.4h, v10.4s, 7
+        sqrshrn2        v10.8h, v11.4s, 7
+        add             x4, x2, 13 * 32
+        st1             {v10.8h}, [x4]
+
+        // v2,v3 = e_16[5]
+        sub             v2.4s, v6.4s, v0.4s
+        sub             v3.4s, v7.4s, v1.4s
+
+        // v8,v9 = o_16[5]
+        adr             x4, .Lo16transform5
+        ld1             {v15.8h}, [x4]
+        adr             x5, 14f
+        add             x5, x5, x6
+        movi            v8.4s, 0
+        movi            v9.4s, 0
+        br              x5
+14:     smlal2          v9.4s, v31.8h, v15.h[7]
+        smlal           v8.4s, v31.4h, v15.h[7]
+        smlal2          v9.4s, v29.8h, v15.h[6]
+        smlal           v8.4s, v29.4h, v15.h[6]
+        smlal2          v9.4s, v27.8h, v15.h[5]
+        smlal           v8.4s, v27.4h, v15.h[5]
+        smlal2          v9.4s, v25.8h, v15.h[4]
+        smlal           v8.4s, v25.4h, v15.h[4]
+        smlal2          v9.4s, v23.8h, v15.h[3]
+        smlal           v8.4s, v23.4h, v15.h[3]
+        smlal2          v9.4s, v21.8h, v15.h[2]
+        smlal           v8.4s, v21.4h, v15.h[2]
+        smlal2          v9.4s, v19.8h, v15.h[1]
+        smlal           v8.4s, v19.4h, v15.h[1]
+        smlal2          v9.4s, v17.8h, v15.h[0]
+        smlal           v8.4s, v17.4h, v15.h[0]
+
+        // tmp[5 * 16]
+        add             v10.4s, v2.4s, v8.4s
+        add             v11.4s, v3.4s, v9.4s
+        sqrshrn         v10.4h, v10.4s, 7
+        sqrshrn2        v10.8h, v11.4s, 7
+        add             x4, x2, 5 * 32
+        st1             {v10.8h}, [x4]
+
+        // tmp[10 * 16]
+        sub             v10.4s, v2.4s, v8.4s
+        sub             v11.4s, v3.4s, v9.4s
+        sqrshrn         v10.4h, v10.4s, 7
+        sqrshrn2        v10.8h, v11.4s, 7
+        add             x4, x2, 10 * 32
+        st1             {v10.8h}, [x4]
+
+        add             x2, x2, 16
+        add             x3, x3, 1
+        cmp             x3, 2
+        b.lo            1b
+
+
+        // horizontal transform
+        adr             x4, .Leo_coeff
+        ld1             {v16.8h, v17.8h, v18.8h, v19.8h}, [x4], 64
+        ld1             {v20.8h, v21.8h, v22.8h, v23.8h}, [x4], 64
+        ld1             {v24.8h, v25.8h, v26.8h, v27.8h}, [x4], 64
+        // o_16 jump address
+        mov             x4, 64
+        bic             x5, x1, 1
+        subs            x4, x4, x5, lsl 2
+        csel            x4, x4, xzr, hs
+        adr             x5, 15f
+        add             x5, x5, x4
+
+        mov             x3, 16
+14:     ld1             {v6.8h, v7.8h}, [x0]
+
+        // v2 = e_8
+        smull           v2.4s, v16.4h, v6.h[0]
+        smlal2          v2.4s, v16.8h, v6.h[4]
+        smlal           v2.4s, v17.4h, v7.h[0]
+        smlal2          v2.4s, v17.8h, v7.h[4]
+
+        // v3 = o_8
+        smull           v3.4s, v18.4h, v6.h[2]
+        smlal2          v3.4s, v18.8h, v6.h[6]
+        smlal           v3.4s, v19.4h, v7.h[2]
+        smlal2          v3.4s, v19.8h, v7.h[6]
+
+        // v0,v1 = e_16
+        add             v0.4s, v2.4s, v3.4s
+        sub             v2.4s, v2.4s, v3.4s
+        mov             v1.d[0], v2.d[1]
+        mov             v1.d[1], v2.d[0]
+        rev64           v1.4s, v1.4s
+
+        // v2,v3 = o_16
+        movi            v2.4s, 0
+        movi            v3.4s, 0
+        br              x5
+15:     smlal           v2.4s, v27.4h, v7.h[7]
+        smlal2          v3.4s, v27.8h, v7.h[7]
+        smlal           v2.4s, v26.4h, v7.h[5]
+        smlal2          v3.4s, v26.8h, v7.h[5]
+        smlal           v2.4s, v25.4h, v7.h[3]
+        smlal2          v3.4s, v25.8h, v7.h[3]
+        smlal           v2.4s, v24.4h, v7.h[1]
+        smlal2          v3.4s, v24.8h, v7.h[1]
+        smlal           v2.4s, v23.4h, v6.h[7]
+        smlal2          v3.4s, v23.8h, v6.h[7]
+        smlal           v2.4s, v22.4h, v6.h[5]
+        smlal2          v3.4s, v22.8h, v6.h[5]
+        smlal           v2.4s, v21.4h, v6.h[3]
+        smlal2          v3.4s, v21.8h, v6.h[3]
+        smlal           v2.4s, v20.4h, v6.h[1]
+        smlal2          v3.4s, v20.8h, v6.h[1]
+
+        // coeff
+        add             v4.4s, v0.4s, v2.4s
+        add             v5.4s, v1.4s, v3.4s
+        sub             v6.4s, v0.4s, v2.4s
+        sub             v7.4s, v1.4s, v3.4s
+        sqrshrn         v4.4h, v4.4s, 12
+        sqrshrn2        v4.8h, v5.4s, 12
+        sqrshrn         v6.4h, v6.4s, 12
+        sqrshrn2        v6.8h, v7.4s, 12
+        mov             v5.d[0], v6.d[1]
+        mov             v5.d[1], v6.d[0]
+        rev64           v5.8h, v5.8h
+        st1             {v4.8h, v5.8h}, [x0], 32
+        subs            x3, x3, 1
+        b.ne            14b
+
+        ld1             {v14.16b, v15.16b}, [sp], 32
+        ld1             {v8.16b, v9.16b, v10.16b, v11.16b}, [sp], 64
+        ret
+endfunc
diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c
index 2002530266..612ebb9541 100644
--- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -45,6 +45,7 @@ void ff_hevc_idct_8x8_8_neon(int16_t *coeffs, int col_limit);
 void ff_hevc_idct_8x8_10_neon(int16_t *coeffs, int col_limit);
 void ff_hevc_idct_16x16_8_neon(int16_t *coeffs, int col_limit);
 void ff_hevc_idct_16x16_10_neon(int16_t *coeffs, int col_limit);
+void ff_hevc_idct_16x16_8_neon_new(int16_t *coeffs, int col_limit);
 void ff_hevc_idct_4x4_dc_8_neon(int16_t *coeffs);
 void ff_hevc_idct_8x8_dc_8_neon(int16_t *coeffs);
 void ff_hevc_idct_16x16_dc_8_neon(int16_t *coeffs);
@@ -72,7 +73,7 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
         c->add_residual[2]             = ff_hevc_add_residual_16x16_8_neon;
         c->add_residual[3]             = ff_hevc_add_residual_32x32_8_neon;
         c->idct[1]                     = ff_hevc_idct_8x8_8_neon;
-        c->idct[2]                     = ff_hevc_idct_16x16_8_neon;
+        c->idct[2]                     = ff_hevc_idct_16x16_8_neon_new;
         c->idct_dc[0]                  = ff_hevc_idct_4x4_dc_8_neon;
         c->idct_dc[1]                  = ff_hevc_idct_8x8_dc_8_neon;
         c->idct_dc[2]                  = ff_hevc_idct_16x16_dc_8_neon;
-- 
2.32.0 (Apple Git-132)

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".


* [FFmpeg-devel] [PATCH 2/2] lavc/aarch64: add 8-bit hevc 32x32 idct
  2022-06-23 12:23 [FFmpeg-devel] [PATCH 1/2] lavc/aarch64: new 8-bit hevc 16x16 idct J. Dekker
@ 2022-06-23 12:23 ` J. Dekker
  2022-08-09 12:15 ` [FFmpeg-devel] [PATCH 1/2] lavc/aarch64: new 8-bit hevc 16x16 idct Martin Storsjö
  1 sibling, 0 replies; 3+ messages in thread
From: J. Dekker @ 2022-06-23 12:23 UTC (permalink / raw)
  To: ffmpeg-devel

hevc_idct_32x32_8_c: 40128.5
hevc_idct_32x32_8_neon: 7102.0

Co-developed-by: Rafal Dabrowa <fatwildcat@gmail.com>
Signed-off-by: J. Dekker <jdek@itanimul.li>
---
 libavcodec/aarch64/hevcdsp_idct_neon.S    | 1265 +++++++++++++++++++++
 libavcodec/aarch64/hevcdsp_init_aarch64.c |    2 +
 2 files changed, 1267 insertions(+)

 Written by the same author as the new 16x16 idct in the previous
 patch. The same concern about object size applies here.
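
 Structurally it is the 16x16 routine extended one level: per column,
 the 16 odd rows (1, 3, ..., 31) are folded into o_32[0..15] using the
 .Lo32transform rows (pre-computed into a stack buffer), the even rows
 go through the 16-point machinery to give e_32[0..15], and the results
 are butterflied into tmp[i * 32] / tmp[(31 - i) * 32]. A rough,
 untested sketch of that last step (names are illustrative only;
 clipping and col_limit handling are omitted, and e_32 is assumed to
 have been computed from the even rows already):

    #include <stdint.h>

    /* o32tab[i][j] would hold row i of .Lo32transform, i.e. the factor
     * applied to odd input row 1 + 2*j. */
    static void idct32_col_butterfly(int16_t *coeffs, const int e_32[16],
                                     const int16_t o32tab[16][16],
                                     int stride, int shift)
    {
        const int rnd = 1 << (shift - 1);
        int o_32[16];

        /* Matches the stack buffer the assembly fills at labels 7/8. */
        for (int i = 0; i < 16; i++) {
            o_32[i] = 0;
            for (int j = 0; j < 16; j++)
                o_32[i] += o32tab[i][j] * coeffs[(1 + 2 * j) * stride];
        }
        /* The tmp[i * 32] / tmp[(31 - i) * 32] stores. */
        for (int i = 0; i < 16; i++) {
            coeffs[i * stride]        = (e_32[i] + o_32[i] + rnd) >> shift;
            coeffs[(31 - i) * stride] = (e_32[i] - o_32[i] + rnd) >> shift;
        }
    }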

diff --git a/libavcodec/aarch64/hevcdsp_idct_neon.S b/libavcodec/aarch64/hevcdsp_idct_neon.S
index 784bae33b3..3b6e95153f 100644
--- a/libavcodec/aarch64/hevcdsp_idct_neon.S
+++ b/libavcodec/aarch64/hevcdsp_idct_neon.S
@@ -644,6 +644,40 @@ idct_dc 32, 10
 .Lo16transform5: .hword 43, -90,  57,  25, -87,  70,   9, -80   // transform[22][0-7]
 .Lo16transform6: .hword 25, -70,  90, -80,  43,   9, -57,  87   // transform[26][0-7]
 .Lo16transform7: .hword  9, -25,  43, -57,  70, -80,  87, -90   // transform[30][0-7]
+.Lo32transform:
+        .hword          90,  90,  88,  85,  82,  78,  73,  67   // transform[1,3,5,7..15][1]
+        .hword          61,  54,  46,  38,  31,  22,  13,   4   // transform[17,19,21..31][1]
+        .hword          90,  82,  67,  46,  22,  -4, -31, -54   // transform[1,3,5,7..15][3]
+        .hword         -73, -85, -90, -88, -78, -61, -38, -13   // transform[17,19,21..31][3]
+        .hword          88,  67,  31, -13, -54, -82, -90, -78   // ..
+        .hword         -46, -4,   38,  73,  90,  85,  61,  22
+        .hword          85,  46, -13, -67, -90, -73, -22,  38
+        .hword          82,  88,  54,  -4, -61, -90, -78, -31
+.Lo32transform9_31:
+        .hword          82,  22, -54, -90, -61,  13,  78,  85
+        .hword          31, -46, -90, -67,   4,  73,  88,  38
+        .hword          78,  -4, -82, -73,  13,  85,  67, -22
+        .hword         -88, -61,  31,  90,  54, -38, -90, -46
+        .hword          73, -31, -90, -22,  78,  67, -38, -90
+        .hword         -13,  82,  61, -46, -88,  -4,  85,  54
+        .hword          67, -54, -78,  38,  85, -22, -90,   4
+        .hword          90,  13, -88, -31,  82,  46, -73, -61
+        .hword          61, -73, -46,  82,  31, -88, -13,  90
+        .hword          -4, -90,  22,  85, -38, -78,  54,  67
+        .hword          54, -85,  -4,  88, -46, -61,  82,  13
+        .hword         -90,  38,  67, -78, -22,  90, -31, -73
+        .hword          46, -90,  38,  54, -90,  31,  61, -88
+        .hword          22,  67, -85,  13,  73, -82,   4,  78
+        .hword          38, -88,  73,  -4, -67,  90, -46, -31
+        .hword          85, -78,  13,  61, -90,  54,  22, -82
+        .hword          31, -78,  90, -61,   4,  54, -88,  82
+        .hword         -38, -22,  73, -90,  67, -13, -46,  85
+        .hword          22, -61,  85, -90,  73, -38,  -4,  46
+        .hword         -78,  90, -82,  54, -13, -31,  67, -88
+        .hword          13, -38,  61, -78,  88, -90,  85, -73
+        .hword          54, -31,   4,  22, -46,  67, -82,  90
+        .hword           4, -13,  22, -31,  38, -46,  54, -61   // transform[1,3,5,7..15][31]
+        .hword          67, -73,  78, -82,  85, -88,  90, -90   // transform[17,19,21..31][31]
 
 // void ff_hevc_idct_16x16_8_neon(int16_t *coeffs, int col_limit)
 function ff_hevc_idct_16x16_8_neon_new, export=1
@@ -1284,3 +1318,1234 @@ function ff_hevc_idct_16x16_8_neon_new, export=1
         ld1             {v8.16b, v9.16b, v10.16b, v11.16b}, [sp], 64
         ret
 endfunc
+
+function ff_hevc_idct_32x32_8_neon, export=1
+        sub             sp, sp, 64
+        st1             {v8.16b, v9.16b, v10.16b, v11.16b}, [sp]
+        sub             sp, sp, 64
+        st1             {v12.16b, v13.16b, v14.16b, v15.16b}, [sp]
+        sub             sp, sp, 16 * 32 * 4     // room for o_32: 16 * 32 values
+        mov             x3, 0                   // loop counter
+        mov             x2, x0
+        mov             x7, 83
+        add             x7, x7, 36 * 65536      // o0, o1 coeff. factors
+1:      mov             x9, 128
+        // loading odd lines
+        add             x4, x2, 64              // odd lines
+        ld1             {v16.8h}, [x4], x9    // line 1
+        ld1             {v17.8h}, [x4], x9    // line 3
+        ld1             {v18.8h}, [x4], x9    // line 5
+        ld1             {v19.8h}, [x4], x9    // line 7
+        ld1             {v20.8h}, [x4], x9    // line 9
+        ld1             {v21.8h}, [x4], x9    // line 11
+        ld1             {v22.8h}, [x4], x9    // line 13
+        ld1             {v23.8h}, [x4], x9    // line 15
+        ld1             {v24.8h}, [x4], x9    // line 17
+        ld1             {v25.8h}, [x4], x9    // line 19
+        ld1             {v26.8h}, [x4], x9    // line 21
+        ld1             {v27.8h}, [x4], x9    // line 23
+        ld1             {v28.8h}, [x4], x9    // line 25
+        ld1             {v29.8h}, [x4], x9    // line 27
+        ld1             {v30.8h}, [x4], x9    // line 29
+        ld1             {v31.8h}, [x4], x9    // line 31
+
+        cmp             x1, 28
+        b.hs            5f
+        // limit2 below 32
+        bic             x4, x1, 1
+        adr             x5, .LimitMask
+        cbnz            x3, 3f
+        // columns 0 .. 7 - cleanup of indexes 5 .. 7
+        ld1             {v0.8h}, [x5]
+        adr             x5, 2f
+        add             x5, x5, x4, lsl 2
+        add             x5, x5, x4, lsl 1
+        br              x5
+2:      and             v16.16b, v16.16b, v0.16b    // col_limit 0..1 -> limit2 == 4..5
+        and             v17.16b, v17.16b, v0.16b
+        b               5f
+        and             v17.16b, v17.16b, v0.16b    // col_limit 2..3 -> limit2 == 6..7
+        and             v18.16b, v18.16b, v0.16b
+        b               5f
+        and             v18.16b, v18.16b, v0.16b    // col_limit 4..5 -> limit2 == 8..9
+        and             v19.16b, v19.16b, v0.16b
+        b               5f
+        and             v19.16b, v19.16b, v0.16b    // col_limit 6..7 -> limit2 == 10..11
+        and             v20.16b, v20.16b, v0.16b
+        b               5f
+        and             v20.16b, v20.16b, v0.16b    // col_limit 8..9 -> limit2 == 12..13
+        and             v21.16b, v21.16b, v0.16b
+        b               5f
+        and             v21.16b, v21.16b, v0.16b    // col_limit 10..11 -> limit2 == 14..15
+        and             v22.16b, v22.16b, v0.16b
+        b               5f
+        and             v22.16b, v22.16b, v0.16b    // col_limit 12..13 -> limit2 == 16..17
+        and             v23.16b, v23.16b, v0.16b
+        b               5f
+        and             v23.16b, v23.16b, v0.16b    // col_limit 14..15 -> limit2 == 18..19
+        and             v24.16b, v24.16b, v0.16b
+        b               5f
+        and             v24.16b, v24.16b, v0.16b    // col_limit 16..17 -> limit2 == 20..21
+        and             v25.16b, v25.16b, v0.16b
+        b               5f
+        and             v25.16b, v25.16b, v0.16b    // col_limit 18..19 -> limit2 == 22..23
+        and             v26.16b, v26.16b, v0.16b
+        b               5f
+        and             v26.16b, v26.16b, v0.16b    // col_limit 20..21 -> limit2 == 24..25
+        and             v27.16b, v27.16b, v0.16b
+        b               5f
+        and             v27.16b, v27.16b, v0.16b    // col_limit 22..23 -> limit2 == 26..27
+        and             v28.16b, v28.16b, v0.16b
+        b               5f
+        and             v28.16b, v28.16b, v0.16b    // col_limit 24..25 -> limit2 == 28..29
+        and             v29.16b, v29.16b, v0.16b
+        b               5f
+        and             v29.16b, v29.16b, v0.16b    // col_limit 26..27 -> limit2 == 30..31
+        and             v30.16b, v30.16b, v0.16b
+        b               5f
+        // columns 8 .. 31
+3:      add             x4, x4, 6
+        subs            x4, x4, x3, lsl 3
+        b.lo            5f
+        ld1             {v0.8h, v1.8h}, [x5]
+        adr             x5, 4f
+        add             x5, x5, x4, lsl 3
+        add             x5, x5, x4, lsl 1
+        br              x5
+4:      and             v16.16b, v16.16b, v1.16b    // limit2 == 2..3
+        b               5f
+        nop
+        nop
+        nop
+        and             v16.16b, v16.16b, v1.16b    // limit2 == 4..5
+        and             v17.16b, v17.16b, v1.16b
+        b               5f
+        nop
+        nop
+        and             v16.16b, v16.16b, v0.16b    // limit2 == 6..7
+        and             v17.16b, v17.16b, v1.16b
+        and             v18.16b, v18.16b, v1.16b
+        b               5f
+        nop
+        and             v16.16b, v16.16b, v0.16b    // limit2 == 8..9
+        and             v17.16b, v17.16b, v0.16b
+        and             v18.16b, v18.16b, v1.16b
+        and             v19.16b, v19.16b, v1.16b
+        b               5f
+        and             v17.16b, v17.16b, v0.16b    // limit2 == 10..11
+        and             v18.16b, v18.16b, v0.16b
+        and             v19.16b, v19.16b, v1.16b
+        and             v20.16b, v20.16b, v1.16b
+        b               5f
+        and             v18.16b, v18.16b, v0.16b    // limit2 == 12..13
+        and             v19.16b, v19.16b, v0.16b
+        and             v20.16b, v20.16b, v1.16b
+        and             v21.16b, v21.16b, v1.16b
+        b               5f
+        and             v19.16b, v19.16b, v0.16b    // limit2 == 14..15
+        and             v20.16b, v20.16b, v0.16b
+        and             v21.16b, v21.16b, v1.16b
+        and             v22.16b, v22.16b, v1.16b
+        b               5f
+        and             v20.16b, v20.16b, v0.16b    // limit2 == 16..17
+        and             v21.16b, v21.16b, v0.16b
+        and             v22.16b, v22.16b, v1.16b
+        and             v23.16b, v23.16b, v1.16b
+        b               5f
+        and             v21.16b, v21.16b, v0.16b    // limit2 == 18..19
+        and             v22.16b, v22.16b, v0.16b
+        and             v23.16b, v23.16b, v1.16b
+        and             v24.16b, v24.16b, v1.16b
+        b               5f
+        and             v22.16b, v22.16b, v0.16b    // limit2 == 20..21
+        and             v23.16b, v23.16b, v0.16b
+        and             v24.16b, v24.16b, v1.16b
+        and             v25.16b, v25.16b, v1.16b
+        b               5f
+        and             v23.16b, v23.16b, v0.16b    // limit2 == 22..23
+        and             v24.16b, v24.16b, v0.16b
+        and             v25.16b, v25.16b, v1.16b
+        and             v26.16b, v26.16b, v1.16b
+        b               5f
+        and             v24.16b, v24.16b, v0.16b    // limit2 == 24..25
+        and             v25.16b, v25.16b, v0.16b
+        and             v26.16b, v26.16b, v1.16b
+        and             v27.16b, v27.16b, v1.16b
+        b               5f
+        and             v25.16b, v25.16b, v0.16b    // limit2 == 26..27
+        and             v26.16b, v26.16b, v0.16b
+        and             v27.16b, v27.16b, v1.16b
+        and             v28.16b, v28.16b, v1.16b
+        b               5f
+
+
+        // o_32
+5:      mov             x5, 32
+        cmp             x1, 28
+        b.hs            6f
+        add             x5, x1, 4
+        bic             x5, x5, 1
+        cbz             x3, 6f
+        add             x5, x1, 6
+        orr             x5, x5, 1
+        subs            x5, x5, x3, lsl 3
+        csel            x5, x5, xzr, hs
+6:      mov             x4, 128
+        sub             x4, x4, x5, lsl 2
+        adr             x5, 8f
+        add             x5, x5, x4
+        adr             x4, .Lo32transform
+        mov             x8, sp
+        mov             x6, 16
+7:      ld1             {v2.8h, v3.8h}, [x4], 32
+        movi            v8.4s, 0
+        movi            v9.4s, 0
+        br              x5
+8:      smlal2          v9.4s, v31.8h, v3.h[7]
+        smlal           v8.4s, v31.4h, v3.h[7]
+        smlal2          v9.4s, v30.8h, v3.h[6]
+        smlal           v8.4s, v30.4h, v3.h[6]
+        smlal2          v9.4s, v29.8h, v3.h[5]
+        smlal           v8.4s, v29.4h, v3.h[5]
+        smlal2          v9.4s, v28.8h, v3.h[4]
+        smlal           v8.4s, v28.4h, v3.h[4]
+        smlal2          v9.4s, v27.8h, v3.h[3]
+        smlal           v8.4s, v27.4h, v3.h[3]
+        smlal2          v9.4s, v26.8h, v3.h[2]
+        smlal           v8.4s, v26.4h, v3.h[2]
+        smlal2          v9.4s, v25.8h, v3.h[1]
+        smlal           v8.4s, v25.4h, v3.h[1]
+        smlal2          v9.4s, v24.8h, v3.h[0]
+        smlal           v8.4s, v24.4h, v3.h[0]
+        smlal2          v9.4s, v23.8h, v2.h[7]
+        smlal           v8.4s, v23.4h, v2.h[7]
+        smlal2          v9.4s, v22.8h, v2.h[6]
+        smlal           v8.4s, v22.4h, v2.h[6]
+        smlal2          v9.4s, v21.8h, v2.h[5]
+        smlal           v8.4s, v21.4h, v2.h[5]
+        smlal2          v9.4s, v20.8h, v2.h[4]
+        smlal           v8.4s, v20.4h, v2.h[4]
+        smlal2          v9.4s, v19.8h, v2.h[3]
+        smlal           v8.4s, v19.4h, v2.h[3]
+        smlal2          v9.4s, v18.8h, v2.h[2]
+        smlal           v8.4s, v18.4h, v2.h[2]
+        smlal2          v9.4s, v17.8h, v2.h[1]
+        smlal           v8.4s, v17.4h, v2.h[1]
+        smlal2          v9.4s, v16.8h, v2.h[0]
+        smlal           v8.4s, v16.4h, v2.h[0]
+        st1             {v8.4s, v9.4s}, [x8], 32
+        subs            x6, x6, 1
+        b.ne            7b
+
+        mov             x4, x2
+        ld1             {v16.8h}, [x4], x9    // line 0
+        ld1             {v17.8h}, [x4], x9    // line 2
+        ld1             {v18.8h}, [x4], x9    // line 4
+        ld1             {v19.8h}, [x4], x9    // line 6
+        ld1             {v20.8h}, [x4], x9    // line 8
+        ld1             {v21.8h}, [x4], x9    // line 10
+        ld1             {v22.8h}, [x4], x9    // line 12
+        ld1             {v23.8h}, [x4], x9    // line 14
+        ld1             {v24.8h}, [x4], x9    // line 16
+        ld1             {v25.8h}, [x4], x9    // line 18
+        ld1             {v26.8h}, [x4], x9    // line 20
+        ld1             {v27.8h}, [x4], x9    // line 22
+        ld1             {v28.8h}, [x4], x9    // line 24
+        ld1             {v29.8h}, [x4], x9    // line 26
+        ld1             {v30.8h}, [x4], x9    // line 28
+        ld1             {v31.8h}, [x4], x9    // line 30
+        cmp             x1, 28
+        b.hs            12f
+        // limit2 below 32
+        bic             x4, x1, 3
+        cbnz            x3, 10f
+        // columns 0 .. 7 - cleanup of indexes 5 .. 7
+        adr             x5, 9f
+        add             x5, x5, x4, lsl 1
+        br              x5
+9:      and             v17.16b, v17.16b, v0.16b    // col_limit 0..3 -> limit2/2 == 2..3
+        b               12f
+        and             v19.16b, v19.16b, v0.16b    // col_limit 4..7 -> limit2/2 == 4..5
+        b               12f
+        and             v21.16b, v21.16b, v0.16b    // col_limit 8..11 -> limit2/2 == 6..7
+        b               12f
+        and             v23.16b, v23.16b, v0.16b    // col_limit 12..15 -> limit2/2 == 8..9
+        b               12f
+        and             v25.16b, v25.16b, v0.16b    // col_limit 16..19 -> limit2/2 == 10..11
+        b               12f
+        and             v27.16b, v27.16b, v0.16b    // col_limit 20..23 -> limit2/2 == 12..13
+        b               12f
+        and             v29.16b, v29.16b, v0.16b    // col_limit 24..27 -> limit2/2 == 14..15
+        b               12f
+        // columns 8 .. 31
+10:     add             x4, x4, 4
+        subs            x4, x4, x3, lsl 3           // x4 = (limit2 & ~3)-4  for column 8 * x3
+        b.lo            12f
+        adr             x5, 11f
+        add             x5, x5, x4, lsl 1
+        add             x5, x5, x4
+        br              x5
+11:     and             v17.16b, v17.16b, v1.16b    // limit2 == 4..7
+        b               12f
+        nop
+        and             v17.16b, v17.16b, v0.16b    // limit2 == 8..11
+        and             v19.16b, v19.16b, v1.16b
+        b               12f
+        and             v19.16b, v19.16b, v0.16b    // limit2 == 12..15
+        and             v21.16b, v21.16b, v1.16b
+        b               12f
+        and             v21.16b, v21.16b, v0.16b    // limit2 == 16..19
+        and             v23.16b, v23.16b, v1.16b
+        b               12f
+        and             v23.16b, v23.16b, v0.16b    // limit2 == 20..23
+        and             v25.16b, v25.16b, v1.16b
+        b               12f
+        and             v25.16b, v25.16b, v0.16b    // limit2 == 24..27
+        and             v27.16b, v27.16b, v1.16b
+        b               12f
+
+        // v0,v1 = e0
+12:     sshll           v0.4s, v16.4h, 6
+        sshll           v1.4s, v24.4h, 6
+        add             v0.4s, v0.4s, v1.4s
+        sshll2          v1.4s, v16.8h, 6
+        sshll2          v2.4s, v24.8h, 6
+        add             v1.4s, v1.4s, v2.4s
+
+        // v2,v3 = o0
+        mov             v14.s[0], w7
+        smull           v2.4s, v20.4h, v14.h[0]
+        smlal           v2.4s, v28.4h, v14.h[1]
+        smull2          v3.4s, v20.8h, v14.h[0]
+        smlal2          v3.4s, v28.8h, v14.h[1]
+
+        // v4,v5 = e_8[0]
+        add             v4.4s, v0.4s, v2.4s
+        add             v5.4s, v1.4s, v3.4s
+
+        // v6,v7 = e_8[3]
+        sub             v6.4s, v0.4s, v2.4s
+        sub             v7.4s, v1.4s, v3.4s
+
+
+        // v0,v1 = o_8[0]
+        adr             x4, .Lo8transform0
+        ld1             {v15.4h}, [x4]
+        smull           v0.4s, v18.4h, v15.h[0]
+        smlal           v0.4s, v22.4h, v15.h[1]
+        smlal           v0.4s, v26.4h, v15.h[2]
+        smlal           v0.4s, v30.4h, v15.h[3]
+        smull2          v1.4s, v18.8h, v15.h[0]
+        smlal2          v1.4s, v22.8h, v15.h[1]
+        smlal2          v1.4s, v26.8h, v15.h[2]
+        smlal2          v1.4s, v30.8h, v15.h[3]
+
+        // v2,v3 = e_16[0]
+        add             v2.4s, v4.4s, v0.4s
+        add             v3.4s, v5.4s, v1.4s
+
+        // v8,v9 = o_16[0]
+        adr             x4, .Lo16transform0
+        ld1             {v15.8h}, [x4]
+        mov             x5, 32
+        cmp             x1, 28
+        b.hs            13f
+        add             x5, x1, 4
+        bic             x5, x5, 3
+        cbz             x3, 13f
+        orr             x5, x5, 2
+        subs            x5, x5, x3, lsl 3
+        csel            x5, x5, xzr, hs
+13:     mov             x4, 64
+        sub             x6, x4, x5, lsl 1
+        adr             x5, 14f
+        add             x5, x5, x6
+        movi            v8.4s, 0
+        movi            v9.4s, 0
+        br              x5
+14:     smlal2          v9.4s, v31.8h, v15.h[7]
+        smlal           v8.4s, v31.4h, v15.h[7]
+        smlal2          v9.4s, v29.8h, v15.h[6]
+        smlal           v8.4s, v29.4h, v15.h[6]
+        smlal2          v9.4s, v27.8h, v15.h[5]
+        smlal           v8.4s, v27.4h, v15.h[5]
+        smlal2          v9.4s, v25.8h, v15.h[4]
+        smlal           v8.4s, v25.4h, v15.h[4]
+        smlal2          v9.4s, v23.8h, v15.h[3]
+        smlal           v8.4s, v23.4h, v15.h[3]
+        smlal2          v9.4s, v21.8h, v15.h[2]
+        smlal           v8.4s, v21.4h, v15.h[2]
+        smlal2          v9.4s, v19.8h, v15.h[1]
+        smlal           v8.4s, v19.4h, v15.h[1]
+        smlal2          v9.4s, v17.8h, v15.h[0]
+        smlal           v8.4s, v17.4h, v15.h[0]
+
+        // v12,v13 = e_32[0]
+        add             v12.4s, v2.4s, v8.4s
+        add             v13.4s, v3.4s, v9.4s
+
+        // v14,v15 = o_32[0]
+        ld1             {v14.4s, v15.4s}, [sp]
+
+        // tmp[0 * 32]
+        add             v10.4s, v12.4s, v14.4s
+        add             v11.4s, v13.4s, v15.4s
+        sqrshrn         v10.4h, v10.4s, 7
+        sqrshrn2        v10.8h, v11.4s, 7
+        st1             {v10.8h}, [x2]
+
+        // tmp[31 * 32]
+        sub             v10.4s, v12.4s, v14.4s
+        sub             v11.4s, v13.4s, v15.4s
+        sqrshrn         v10.4h, v10.4s, 7
+        sqrshrn2        v10.8h, v11.4s, 7
+        add             x4, x2, 31 * 64
+        st1             {v10.8h}, [x4]
+
+        // v12,v13 = e_32[15]
+        sub             v12.4s, v2.4s, v8.4s
+        sub             v13.4s, v3.4s, v9.4s
+
+        // v14,v15 = o_32[15]
+        add             x4, sp, 15 * 32
+        ld1             {v14.4s, v15.4s}, [x4]
+
+        // tmp[15 * 32]
+        add             v10.4s, v12.4s, v14.4s
+        add             v11.4s, v13.4s, v15.4s
+        sqrshrn         v10.4h, v10.4s, 7
+        sqrshrn2        v10.8h, v11.4s, 7
+        add             x4, x2, 15 * 64
+        st1             {v10.8h}, [x4]
+
+        // tmp[16 * 32]
+        sub             v10.4s, v12.4s, v14.4s
+        sub             v11.4s, v13.4s, v15.4s
+        sqrshrn         v10.4h, v10.4s, 7
+        sqrshrn2        v10.8h, v11.4s, 7
+        add             x4, x2, 16 * 64
+        st1             {v10.8h}, [x4]
+
+        // v2,v3 = e_16[7]
+        sub             v2.4s, v4.4s, v0.4s
+        sub             v3.4s, v5.4s, v1.4s
+
+        // v8,v9 = o_16[7]
+        adr             x4, .Lo16transform7
+        ld1             {v15.8h}, [x4]
+        adr             x5, 15f
+        add             x5, x5, x6
+        movi            v8.4s, 0
+        movi            v9.4s, 0
+        br              x5
+15:     smlal2          v9.4s, v31.8h, v15.h[7]
+        smlal           v8.4s, v31.4h, v15.h[7]
+        smlal2          v9.4s, v29.8h, v15.h[6]
+        smlal           v8.4s, v29.4h, v15.h[6]
+        smlal2          v9.4s, v27.8h, v15.h[5]
+        smlal           v8.4s, v27.4h, v15.h[5]
+        smlal2          v9.4s, v25.8h, v15.h[4]
+        smlal           v8.4s, v25.4h, v15.h[4]
+        smlal2          v9.4s, v23.8h, v15.h[3]
+        smlal           v8.4s, v23.4h, v15.h[3]
+        smlal2          v9.4s, v21.8h, v15.h[2]
+        smlal           v8.4s, v21.4h, v15.h[2]
+        smlal2          v9.4s, v19.8h, v15.h[1]
+        smlal           v8.4s, v19.4h, v15.h[1]
+        smlal2          v9.4s, v17.8h, v15.h[0]
+        smlal           v8.4s, v17.4h, v15.h[0]
+
+        // v12,v13 = e_32[7]
+        add             v12.4s, v2.4s, v8.4s
+        add             v13.4s, v3.4s, v9.4s
+
+        // v14,v15 = o_32[7]
+        add             x4, sp, 7 * 32
+        ld1             {v14.4s, v15.4s}, [x4]
+
+        // tmp[7 * 32]
+        add             v10.4s, v12.4s, v14.4s
+        add             v11.4s, v13.4s, v15.4s
+        sqrshrn         v10.4h, v10.4s, 7
+        sqrshrn2        v10.8h, v11.4s, 7
+        add             x4, x2, 7 * 64
+        st1             {v10.8h}, [x4]
+
+        // tmp[24 * 32]
+        sub             v10.4s, v12.4s, v14.4s
+        sub             v11.4s, v13.4s, v15.4s
+        sqrshrn         v10.4h, v10.4s, 7
+        sqrshrn2        v10.8h, v11.4s, 7
+        add             x4, x2, 24 * 64
+        st1             {v10.8h}, [x4]
+
+        // v12,v13 = e_32[8]
+        sub             v12.4s, v2.4s, v8.4s
+        sub             v13.4s, v3.4s, v9.4s
+
+        // v14,v15 = o_32[8]
+        add             x4, sp, 8 * 32
+        ld1             {v14.4s, v15.4s}, [x4]
+
+        // tmp[8 * 32]
+        add             v10.4s, v12.4s, v14.4s
+        add             v11.4s, v13.4s, v15.4s
+        sqrshrn         v10.4h, v10.4s, 7
+        sqrshrn2        v10.8h, v11.4s, 7
+        add             x4, x2, 8 * 64
+        st1             {v10.8h}, [x4]
+
+        // tmp[23 * 32]
+        sub             v10.4s, v12.4s, v14.4s
+        sub             v11.4s, v13.4s, v15.4s
+        sqrshrn         v10.4h, v10.4s, 7
+        sqrshrn2        v10.8h, v11.4s, 7
+        add             x4, x2, 23 * 64
+        st1             {v10.8h}, [x4]
+
+        // v0,v1 = o_8[3]
+        adr             x4, .Lo8transform3
+        ld1             {v15.4h}, [x4]
+        smull           v0.4s, v18.4h, v15.h[0]
+        smlal           v0.4s, v22.4h, v15.h[1]
+        smlal           v0.4s, v26.4h, v15.h[2]
+        smlal           v0.4s, v30.4h, v15.h[3]
+        smull2          v1.4s, v18.8h, v15.h[0]
+        smlal2          v1.4s, v22.8h, v15.h[1]
+        smlal2          v1.4s, v26.8h, v15.h[2]
+        smlal2          v1.4s, v30.8h, v15.h[3]
+
+        // v2,v3 = e_16[3]
+        add             v2.4s, v6.4s, v0.4s
+        add             v3.4s, v7.4s, v1.4s
+
+        // v8,v9 = o_16[3]
+        adr             x4, .Lo16transform3
+        ld1             {v15.8h}, [x4]
+        adr             x5, 16f
+        add             x5, x5, x6
+        movi            v8.4s, 0
+        movi            v9.4s, 0
+        br              x5
+16:     smlal2          v9.4s, v31.8h, v15.h[7]
+        smlal           v8.4s, v31.4h, v15.h[7]
+        smlal2          v9.4s, v29.8h, v15.h[6]
+        smlal           v8.4s, v29.4h, v15.h[6]
+        smlal2          v9.4s, v27.8h, v15.h[5]
+        smlal           v8.4s, v27.4h, v15.h[5]
+        smlal2          v9.4s, v25.8h, v15.h[4]
+        smlal           v8.4s, v25.4h, v15.h[4]
+        smlal2          v9.4s, v23.8h, v15.h[3]
+        smlal           v8.4s, v23.4h, v15.h[3]
+        smlal2          v9.4s, v21.8h, v15.h[2]
+        smlal           v8.4s, v21.4h, v15.h[2]
+        smlal2          v9.4s, v19.8h, v15.h[1]
+        smlal           v8.4s, v19.4h, v15.h[1]
+        smlal2          v9.4s, v17.8h, v15.h[0]
+        smlal           v8.4s, v17.4h, v15.h[0]
+
+        // v12,v13 = e_32[3]
+        add             v12.4s, v2.4s, v8.4s
+        add             v13.4s, v3.4s, v9.4s
+
+        // v14,v15 = o_32[3]
+        add             x4, sp, 3 * 32
+        ld1             {v14.4s, v15.4s}, [x4]
+
+        // tmp[3 * 32]
+        add             v10.4s, v12.4s, v14.4s
+        add             v11.4s, v13.4s, v15.4s
+        sqrshrn         v10.4h, v10.4s, 7
+        sqrshrn2        v10.8h, v11.4s, 7
+        add             x4, x2, 3 * 64
+        st1             {v10.8h}, [x4]
+
+        // tmp[28 * 32]
+        sub             v10.4s, v12.4s, v14.4s
+        sub             v11.4s, v13.4s, v15.4s
+        sqrshrn         v10.4h, v10.4s, 7
+        sqrshrn2        v10.8h, v11.4s, 7
+        add             x4, x2, 28 * 64
+        st1             {v10.8h}, [x4]
+
+        // v12,v13 = e_32[12]
+        sub             v12.4s, v2.4s, v8.4s
+        sub             v13.4s, v3.4s, v9.4s
+
+        // v14,v15 = o_32[12]
+        add             x4, sp, 12 * 32
+        ld1             {v14.4s, v15.4s}, [x4]
+
+        // tmp[12 * 32]
+        add             v10.4s, v12.4s, v14.4s
+        add             v11.4s, v13.4s, v15.4s
+        sqrshrn         v10.4h, v10.4s, 7
+        sqrshrn2        v10.8h, v11.4s, 7
+        add             x4, x2, 12 * 64
+        st1             {v10.8h}, [x4]
+
+        // tmp[19 * 32]
+        sub             v10.4s, v12.4s, v14.4s
+        sub             v11.4s, v13.4s, v15.4s
+        sqrshrn         v10.4h, v10.4s, 7
+        sqrshrn2        v10.8h, v11.4s, 7
+        add             x4, x2, 19 * 64
+        st1             {v10.8h}, [x4]
+
+        // v2,v3 = e_16[4]
+        sub             v2.4s, v6.4s, v0.4s
+        sub             v3.4s, v7.4s, v1.4s
+
+        // v8,v9 = o_16[4]
+        adr             x4, .Lo16transform4
+        ld1             {v15.8h}, [x4]
+        adr             x5, 17f
+        add             x5, x5, x6
+        movi            v8.4s, 0
+        movi            v9.4s, 0
+        br              x5
+17:     smlal2          v9.4s, v31.8h, v15.h[7]
+        smlal           v8.4s, v31.4h, v15.h[7]
+        smlal2          v9.4s, v29.8h, v15.h[6]
+        smlal           v8.4s, v29.4h, v15.h[6]
+        smlal2          v9.4s, v27.8h, v15.h[5]
+        smlal           v8.4s, v27.4h, v15.h[5]
+        smlal2          v9.4s, v25.8h, v15.h[4]
+        smlal           v8.4s, v25.4h, v15.h[4]
+        smlal2          v9.4s, v23.8h, v15.h[3]
+        smlal           v8.4s, v23.4h, v15.h[3]
+        smlal2          v9.4s, v21.8h, v15.h[2]
+        smlal           v8.4s, v21.4h, v15.h[2]
+        smlal2          v9.4s, v19.8h, v15.h[1]
+        smlal           v8.4s, v19.4h, v15.h[1]
+        smlal2          v9.4s, v17.8h, v15.h[0]
+        smlal           v8.4s, v17.4h, v15.h[0]
+
+        // v12,v13 = e_32[4]
+        add             v12.4s, v2.4s, v8.4s
+        add             v13.4s, v3.4s, v9.4s
+
+        // v14,v15 = o_32[4]
+        add             x4, sp, 4 * 32
+        ld1             {v14.4s, v15.4s}, [x4]
+
+        // tmp[4 * 32]
+        add             v10.4s, v12.4s, v14.4s
+        add             v11.4s, v13.4s, v15.4s
+        sqrshrn         v10.4h, v10.4s, 7
+        sqrshrn2        v10.8h, v11.4s, 7
+        add             x4, x2, 4 * 64
+        st1             {v10.8h}, [x4]
+
+        // tmp[27 * 32]
+        sub             v10.4s, v12.4s, v14.4s
+        sub             v11.4s, v13.4s, v15.4s
+        sqrshrn         v10.4h, v10.4s, 7
+        sqrshrn2        v10.8h, v11.4s, 7
+        add             x4, x2, 27 * 64
+        st1             {v10.8h}, [x4]
+
+        // v12,v13 = e_32[11]
+        sub             v12.4s, v2.4s, v8.4s
+        sub             v13.4s, v3.4s, v9.4s
+
+        // v14,v15 = o_32[11]
+        add             x4, sp, 11 * 32
+        ld1             {v14.4s, v15.4s}, [x4]
+
+        // tmp[11 * 32]
+        add             v10.4s, v12.4s, v14.4s
+        add             v11.4s, v13.4s, v15.4s
+        sqrshrn         v10.4h, v10.4s, 7
+        sqrshrn2        v10.8h, v11.4s, 7
+        add             x4, x2, 11 * 64
+        st1             {v10.8h}, [x4]
+
+        // tmp[20 * 32]
+        sub             v10.4s, v12.4s, v14.4s
+        sub             v11.4s, v13.4s, v15.4s
+        sqrshrn         v10.4h, v10.4s, 7
+        sqrshrn2        v10.8h, v11.4s, 7
+        add             x4, x2, 20 * 64
+        st1             {v10.8h}, [x4]
+
+        // v0,v1 = e1
+        sshll           v0.4s, v16.4h, 6
+        sshll           v1.4s, v24.4h, 6
+        sub             v0.4s, v0.4s, v1.4s
+        sshll2          v1.4s, v16.8h, 6
+        sshll2          v2.4s, v24.8h, 6
+        sub             v1.4s, v1.4s, v2.4s
+
+        // v2,v3 = o1
+        mov             v14.s[0], w7
+        smull           v2.4s, v20.4h, v14.h[1]
+        smlsl           v2.4s, v28.4h, v14.h[0]
+        smull2          v3.4s, v20.8h, v14.h[1]
+        smlsl2          v3.4s, v28.8h, v14.h[0]
+
+        // v4,v5 = e_8[1]
+        add             v4.4s, v0.4s, v2.4s
+        add             v5.4s, v1.4s, v3.4s
+
+        // v6,v7 = e_8[2]
+        sub             v6.4s, v0.4s, v2.4s
+        sub             v7.4s, v1.4s, v3.4s
+
+        // v0,v1 = o_8[1]
+        adr             x4, .Lo8transform1
+        ld1             {v15.4h}, [x4]
+        smull           v0.4s, v18.4h, v15.h[0]
+        smlal           v0.4s, v22.4h, v15.h[1]
+        smlal           v0.4s, v26.4h, v15.h[2]
+        smlal           v0.4s, v30.4h, v15.h[3]
+        smull2          v1.4s, v18.8h, v15.h[0]
+        smlal2          v1.4s, v22.8h, v15.h[1]
+        smlal2          v1.4s, v26.8h, v15.h[2]
+        smlal2          v1.4s, v30.8h, v15.h[3]
+
+        // v2,v3 = e_16[1]
+        add             v2.4s, v4.4s, v0.4s
+        add             v3.4s, v5.4s, v1.4s
+
+        // v8,v9 = o_16[1]
+        adr             x4, .Lo16transform1
+        ld1             {v15.8h}, [x4]
+        adr             x5, 18f
+        add             x5, x5, x6
+        movi            v8.4s, 0
+        movi            v9.4s, 0
+        br              x5
+18:     smlal2          v9.4s, v31.8h, v15.h[7]
+        smlal           v8.4s, v31.4h, v15.h[7]
+        smlal2          v9.4s, v29.8h, v15.h[6]
+        smlal           v8.4s, v29.4h, v15.h[6]
+        smlal2          v9.4s, v27.8h, v15.h[5]
+        smlal           v8.4s, v27.4h, v15.h[5]
+        smlal2          v9.4s, v25.8h, v15.h[4]
+        smlal           v8.4s, v25.4h, v15.h[4]
+        smlal2          v9.4s, v23.8h, v15.h[3]
+        smlal           v8.4s, v23.4h, v15.h[3]
+        smlal2          v9.4s, v21.8h, v15.h[2]
+        smlal           v8.4s, v21.4h, v15.h[2]
+        smlal2          v9.4s, v19.8h, v15.h[1]
+        smlal           v8.4s, v19.4h, v15.h[1]
+        smlal2          v9.4s, v17.8h, v15.h[0]
+        smlal           v8.4s, v17.4h, v15.h[0]
+
+        // v12,v13 = e_32[1]
+        add             v12.4s, v2.4s, v8.4s
+        add             v13.4s, v3.4s, v9.4s
+
+        // v14,v15 = o_32[1]
+        add             x4, sp, 1 * 32
+        ld1             {v14.4s, v15.4s}, [x4]
+
+        // tmp[1 * 32]
+        add             v10.4s, v12.4s, v14.4s
+        add             v11.4s, v13.4s, v15.4s
+        sqrshrn         v10.4h, v10.4s, 7
+        sqrshrn2        v10.8h, v11.4s, 7
+        add             x4, x2, 1 * 64
+        st1             {v10.8h}, [x4]
+
+        // tmp[30 * 32]
+        sub             v10.4s, v12.4s, v14.4s
+        sub             v11.4s, v13.4s, v15.4s
+        sqrshrn         v10.4h, v10.4s, 7
+        sqrshrn2        v10.8h, v11.4s, 7
+        add             x4, x2, 30 * 64
+        st1             {v10.8h}, [x4]
+
+        // v12,v13 = e_32[14]
+        sub             v12.4s, v2.4s, v8.4s
+        sub             v13.4s, v3.4s, v9.4s
+
+        // v14,v15 = o_32[14]
+        add             x4, sp, 14 * 32
+        ld1             {v14.4s, v15.4s}, [x4]
+
+        // tmp[14 * 32]
+        add             v10.4s, v12.4s, v14.4s
+        add             v11.4s, v13.4s, v15.4s
+        sqrshrn         v10.4h, v10.4s, 7
+        sqrshrn2        v10.8h, v11.4s, 7
+        add             x4, x2, 14 * 64
+        st1             {v10.8h}, [x4]
+
+        // tmp[17 * 32]
+        sub             v10.4s, v12.4s, v14.4s
+        sub             v11.4s, v13.4s, v15.4s
+        sqrshrn         v10.4h, v10.4s, 7
+        sqrshrn2        v10.8h, v11.4s, 7
+        add             x4, x2, 17 * 64
+        st1             {v10.8h}, [x4]
+
+        // v2,v3 = e_16[6]
+        sub             v2.4s, v4.4s, v0.4s
+        sub             v3.4s, v5.4s, v1.4s
+
+        // v8,v9 = o_16[6]
+        adr             x4, .Lo16transform6
+        ld1             {v15.8h}, [x4]
+        adr             x5, 19f
+        add             x5, x5, x6
+        movi            v8.4s, 0
+        movi            v9.4s, 0
+        br              x5
+19:     smlal2          v9.4s, v31.8h, v15.h[7]
+        smlal           v8.4s, v31.4h, v15.h[7]
+        smlal2          v9.4s, v29.8h, v15.h[6]
+        smlal           v8.4s, v29.4h, v15.h[6]
+        smlal2          v9.4s, v27.8h, v15.h[5]
+        smlal           v8.4s, v27.4h, v15.h[5]
+        smlal2          v9.4s, v25.8h, v15.h[4]
+        smlal           v8.4s, v25.4h, v15.h[4]
+        smlal2          v9.4s, v23.8h, v15.h[3]
+        smlal           v8.4s, v23.4h, v15.h[3]
+        smlal2          v9.4s, v21.8h, v15.h[2]
+        smlal           v8.4s, v21.4h, v15.h[2]
+        smlal2          v9.4s, v19.8h, v15.h[1]
+        smlal           v8.4s, v19.4h, v15.h[1]
+        smlal2          v9.4s, v17.8h, v15.h[0]
+        smlal           v8.4s, v17.4h, v15.h[0]
+
+        // v12,v13 = e_32[6]
+        add             v12.4s, v2.4s, v8.4s
+        add             v13.4s, v3.4s, v9.4s
+
+        // v14,v15 = o_32[6]
+        add             x4, sp, 6 * 32
+        ld1             {v14.4s, v15.4s}, [x4]
+
+        // tmp[6 * 32]
+        add             v10.4s, v12.4s, v14.4s
+        add             v11.4s, v13.4s, v15.4s
+        sqrshrn         v10.4h, v10.4s, 7
+        sqrshrn2        v10.8h, v11.4s, 7
+        add             x4, x2, 6 * 64
+        st1             {v10.8h}, [x4]
+
+        // tmp[25 * 32]
+        sub             v10.4s, v12.4s, v14.4s
+        sub             v11.4s, v13.4s, v15.4s
+        sqrshrn         v10.4h, v10.4s, 7
+        sqrshrn2        v10.8h, v11.4s, 7
+        add             x4, x2, 25 * 64
+        st1             {v10.8h}, [x4]
+
+        // v12,v13 = e_32[9]
+        sub             v12.4s, v2.4s, v8.4s
+        sub             v13.4s, v3.4s, v9.4s
+
+        // v14,v15 = o_32[9]
+        add             x4, sp, 9 * 32
+        ld1             {v14.4s, v15.4s}, [x4]
+
+        // tmp[9 * 32]
+        add             v10.4s, v12.4s, v14.4s
+        add             v11.4s, v13.4s, v15.4s
+        sqrshrn         v10.4h, v10.4s, 7
+        sqrshrn2        v10.8h, v11.4s, 7
+        add             x4, x2, 9 * 64
+        st1             {v10.8h}, [x4]
+
+        // tmp[22 * 32]
+        sub             v10.4s, v12.4s, v14.4s
+        sub             v11.4s, v13.4s, v15.4s
+        sqrshrn         v10.4h, v10.4s, 7
+        sqrshrn2        v10.8h, v11.4s, 7
+        add             x4, x2, 22 * 64
+        st1             {v10.8h}, [x4]
+
+        // v0,v1 = o_8[2]
+        adr             x4, .Lo8transform2
+        ld1             {v15.4h}, [x4]
+        smull           v0.4s, v18.4h, v15.h[0]
+        smlal           v0.4s, v22.4h, v15.h[1]
+        smlal           v0.4s, v26.4h, v15.h[2]
+        smlal           v0.4s, v30.4h, v15.h[3]
+        smull2          v1.4s, v18.8h, v15.h[0]
+        smlal2          v1.4s, v22.8h, v15.h[1]
+        smlal2          v1.4s, v26.8h, v15.h[2]
+        smlal2          v1.4s, v30.8h, v15.h[3]
+
+        // v2,v3 = e_16[2]
+        add             v2.4s, v6.4s, v0.4s
+        add             v3.4s, v7.4s, v1.4s
+
+        // v8,v9 = o_16[2]
+        adr             x4, .Lo16transform2
+        ld1             {v15.8h}, [x4]
+        adr             x5, 20f
+        add             x5, x5, x6
+        movi            v8.4s, 0
+        movi            v9.4s, 0
+        br              x5
+20:     smlal2          v9.4s, v31.8h, v15.h[7]
+        smlal           v8.4s, v31.4h, v15.h[7]
+        smlal2          v9.4s, v29.8h, v15.h[6]
+        smlal           v8.4s, v29.4h, v15.h[6]
+        smlal2          v9.4s, v27.8h, v15.h[5]
+        smlal           v8.4s, v27.4h, v15.h[5]
+        smlal2          v9.4s, v25.8h, v15.h[4]
+        smlal           v8.4s, v25.4h, v15.h[4]
+        smlal2          v9.4s, v23.8h, v15.h[3]
+        smlal           v8.4s, v23.4h, v15.h[3]
+        smlal2          v9.4s, v21.8h, v15.h[2]
+        smlal           v8.4s, v21.4h, v15.h[2]
+        smlal2          v9.4s, v19.8h, v15.h[1]
+        smlal           v8.4s, v19.4h, v15.h[1]
+        smlal2          v9.4s, v17.8h, v15.h[0]
+        smlal           v8.4s, v17.4h, v15.h[0]
+
+        // v12,v13 = e_32[2]
+        add             v12.4s, v2.4s, v8.4s
+        add             v13.4s, v3.4s, v9.4s
+
+        // v14,v15 = o_32[2]
+        add             x4, sp, 2 * 32
+        ld1             {v14.4s, v15.4s}, [x4]
+
+        // tmp[2 * 32]
+        add             v10.4s, v12.4s, v14.4s
+        add             v11.4s, v13.4s, v15.4s
+        sqrshrn         v10.4h, v10.4s, 7
+        sqrshrn2        v10.8h, v11.4s, 7
+        add             x4, x2, 2 * 64
+        st1             {v10.8h}, [x4]
+
+        // tmp[29 * 32]
+        sub             v10.4s, v12.4s, v14.4s
+        sub             v11.4s, v13.4s, v15.4s
+        sqrshrn         v10.4h, v10.4s, 7
+        sqrshrn2        v10.8h, v11.4s, 7
+        add             x4, x2, 29 * 64
+        st1             {v10.8h}, [x4]
+
+        // v12,v13 = e_32[13]
+        sub             v12.4s, v2.4s, v8.4s
+        sub             v13.4s, v3.4s, v9.4s
+
+        // v14,v15 = o_32[13]
+        add             x4, sp, 13 * 32
+        ld1             {v14.4s, v15.4s}, [x4]
+
+        // tmp[13 * 32]
+        add             v10.4s, v12.4s, v14.4s
+        add             v11.4s, v13.4s, v15.4s
+        sqrshrn         v10.4h, v10.4s, 7
+        sqrshrn2        v10.8h, v11.4s, 7
+        add             x4, x2, 13 * 64
+        st1             {v10.8h}, [x4]
+
+        // tmp[18 * 32]
+        sub             v10.4s, v12.4s, v14.4s
+        sub             v11.4s, v13.4s, v15.4s
+        sqrshrn         v10.4h, v10.4s, 7
+        sqrshrn2        v10.8h, v11.4s, 7
+        add             x4, x2, 18 * 64
+        st1             {v10.8h}, [x4]
+
+        // v2,v3 = e_16[5]
+        sub             v2.4s, v6.4s, v0.4s
+        sub             v3.4s, v7.4s, v1.4s
+
+        // v8,v9 = o_16[5]
+        adr             x4, .Lo16transform5
+        ld1             {v15.8h}, [x4]
+        adr             x5, 21f
+        add             x5, x5, x6
+        movi            v8.4s, 0
+        movi            v9.4s, 0
+        br              x5
+21:     smlal2          v9.4s, v31.8h, v15.h[7]
+        smlal           v8.4s, v31.4h, v15.h[7]
+        smlal2          v9.4s, v29.8h, v15.h[6]
+        smlal           v8.4s, v29.4h, v15.h[6]
+        smlal2          v9.4s, v27.8h, v15.h[5]
+        smlal           v8.4s, v27.4h, v15.h[5]
+        smlal2          v9.4s, v25.8h, v15.h[4]
+        smlal           v8.4s, v25.4h, v15.h[4]
+        smlal2          v9.4s, v23.8h, v15.h[3]
+        smlal           v8.4s, v23.4h, v15.h[3]
+        smlal2          v9.4s, v21.8h, v15.h[2]
+        smlal           v8.4s, v21.4h, v15.h[2]
+        smlal2          v9.4s, v19.8h, v15.h[1]
+        smlal           v8.4s, v19.4h, v15.h[1]
+        smlal2          v9.4s, v17.8h, v15.h[0]
+        smlal           v8.4s, v17.4h, v15.h[0]
+
+        // v12,v13 = e_32[5]
+        add             v12.4s, v2.4s, v8.4s
+        add             v13.4s, v3.4s, v9.4s
+
+        // v14,v15 = o_32[5]
+        add             x4, sp, 5 * 32
+        ld1             {v14.4s, v15.4s}, [x4]
+
+        // tmp[5 * 32]
+        add             v10.4s, v12.4s, v14.4s
+        add             v11.4s, v13.4s, v15.4s
+        sqrshrn         v10.4h, v10.4s, 7
+        sqrshrn2        v10.8h, v11.4s, 7
+        add             x4, x2, 5 * 64
+        st1             {v10.8h}, [x4]
+
+        // tmp[26 * 32]
+        sub             v10.4s, v12.4s, v14.4s
+        sub             v11.4s, v13.4s, v15.4s
+        sqrshrn         v10.4h, v10.4s, 7
+        sqrshrn2        v10.8h, v11.4s, 7
+        add             x4, x2, 26 * 64
+        st1             {v10.8h}, [x4]
+
+        // v12,v13 = e_32[10]
+        sub             v12.4s, v2.4s, v8.4s
+        sub             v13.4s, v3.4s, v9.4s
+
+        // v14,v15 = o_32[10]
+        add             x4, sp, 10 * 32
+        ld1             {v14.4s, v15.4s}, [x4]
+
+        // tmp[10 * 32]
+        add             v10.4s, v12.4s, v14.4s
+        add             v11.4s, v13.4s, v15.4s
+        sqrshrn         v10.4h, v10.4s, 7
+        sqrshrn2        v10.8h, v11.4s, 7
+        add             x4, x2, 10 * 64
+        st1             {v10.8h}, [x4]
+
+        // tmp[21 * 32]
+        sub             v10.4s, v12.4s, v14.4s
+        sub             v11.4s, v13.4s, v15.4s
+        sqrshrn         v10.4h, v10.4s, 7
+        sqrshrn2        v10.8h, v11.4s, 7
+        add             x4, x2, 21 * 64
+        st1             {v10.8h}, [x4]
+
+
+        add             x2, x2, 16
+        add             x3, x3, 1
+        cmp             x3, 4
+        b.ne            1b
+
+        // horizontal transform
+        cmp             x1, 9
+        b.ls            24f
+        // o_32 partially (last 12 sum components)
+        adr             x4, .Lo32transform9_31
+        ld1             {v8.8h, v9.8h, v10.8h, v11.8h}, [x4], 64
+        ld1             {v12.8h, v13.8h, v14.8h, v15.8h}, [x4], 64
+        ld1             {v16.8h, v17.8h, v18.8h, v19.8h}, [x4], 64
+        ld1             {v20.8h, v21.8h, v22.8h, v23.8h}, [x4], 64
+        ld1             {v24.8h, v25.8h, v26.8h, v27.8h}, [x4], 64
+        ld1             {v28.8h, v29.8h, v30.8h, v31.8h}, [x4], 64
+        bic             x5, x1, 1
+        subs            x5, x5, 8
+        csel            x5, x5, xzr, hs
+        mov             x4, 24
+        subs            x4, x4, x5
+        csel            x5, x4, xzr, hs
+        adr             x4, 23f
+        add             x5, x4, x5, lsl 3
+        add             x2, x0, 16
+        mov             x8, sp
+        mov             x3, 64
+        mov             x6, 32
+22:     ld1             {v0.8h, v1.8h, v2.8h}, [x2], x3
+        movi            v4.4s, 0
+        movi            v5.4s, 0
+        movi            v6.4s, 0
+        movi            v7.4s, 0
+        br              x5
+23:     smlal           v4.4s, v30.4h, v2.h[7]
+        smlal2          v5.4s, v30.8h, v2.h[7]
+        smlal           v6.4s, v31.4h, v2.h[7]
+        smlal2          v7.4s, v31.8h, v2.h[7]
+        smlal           v4.4s, v28.4h, v2.h[5]
+        smlal2          v5.4s, v28.8h, v2.h[5]
+        smlal           v6.4s, v29.4h, v2.h[5]
+        smlal2          v7.4s, v29.8h, v2.h[5]
+        smlal           v4.4s, v26.4h, v2.h[3]
+        smlal2          v5.4s, v26.8h, v2.h[3]
+        smlal           v6.4s, v27.4h, v2.h[3]
+        smlal2          v7.4s, v27.8h, v2.h[3]
+        smlal           v4.4s, v24.4h, v2.h[1]
+        smlal2          v5.4s, v24.8h, v2.h[1]
+        smlal           v6.4s, v25.4h, v2.h[1]
+        smlal2          v7.4s, v25.8h, v2.h[1]
+        smlal           v4.4s, v22.4h, v1.h[7]
+        smlal2          v5.4s, v22.8h, v1.h[7]
+        smlal           v6.4s, v23.4h, v1.h[7]
+        smlal2          v7.4s, v23.8h, v1.h[7]
+        smlal           v4.4s, v20.4h, v1.h[5]
+        smlal2          v5.4s, v20.8h, v1.h[5]
+        smlal           v6.4s, v21.4h, v1.h[5]
+        smlal2          v7.4s, v21.8h, v1.h[5]
+        smlal           v4.4s, v18.4h, v1.h[3]
+        smlal2          v5.4s, v18.8h, v1.h[3]
+        smlal           v6.4s, v19.4h, v1.h[3]
+        smlal2          v7.4s, v19.8h, v1.h[3]
+        smlal           v4.4s, v16.4h, v1.h[1]
+        smlal2          v5.4s, v16.8h, v1.h[1]
+        smlal           v6.4s, v17.4h, v1.h[1]
+        smlal2          v7.4s, v17.8h, v1.h[1]
+        smlal           v4.4s, v14.4h, v0.h[7]
+        smlal2          v5.4s, v14.8h, v0.h[7]
+        smlal           v6.4s, v15.4h, v0.h[7]
+        smlal2          v7.4s, v15.8h, v0.h[7]
+        smlal           v4.4s, v12.4h, v0.h[5]
+        smlal2          v5.4s, v12.8h, v0.h[5]
+        smlal           v6.4s, v13.4h, v0.h[5]
+        smlal2          v7.4s, v13.8h, v0.h[5]
+        smlal           v4.4s, v10.4h, v0.h[3]
+        smlal2          v5.4s, v10.8h, v0.h[3]
+        smlal           v6.4s, v11.4h, v0.h[3]
+        smlal2          v7.4s, v11.8h, v0.h[3]
+        smlal           v4.4s, v8.4h, v0.h[1]
+        smlal2          v5.4s, v8.8h, v0.h[1]
+        smlal           v6.4s, v9.4h, v0.h[1]
+        smlal2          v7.4s, v9.8h, v0.h[1]
+        st1             {v4.4s, v5.4s, v6.4s, v7.4s}, [x8], 64
+        subs            x6, x6, 1
+        b.ne            22b
+
+
+24:     adr             x4, .Leo_coeff
+        ld1             {v12.8h, v13.8h, v14.8h, v15.8h}, [x4], 64
+        ld1             {v16.8h, v17.8h, v18.8h, v19.8h}, [x4], 64
+        ld1             {v20.8h, v21.8h, v22.8h, v23.8h}, [x4], 64
+        adr             x4, .Lo32transform
+        ld1             {v24.8h, v25.8h, v26.8h, v27.8h}, [x4], 64
+        ld1             {v28.8h, v29.8h, v30.8h, v31.8h}, [x4], 64
+        // o_16 jump address
+        mov             x4, 64
+        bic             x5, x1, 3
+        subs            x4, x4, x5, lsl 1
+        csel            x4, x4, xzr, hs
+        adr             x5, 26f
+        add             x5, x5, x4
+        // o_32 jump address
+        bic             x6, x1, 1
+        mov             x4, 8
+        subs            x4, x4, x6
+        csel            x6, x4, xzr, hs
+        adr             x4, 29f
+        add             x6, x4, x6, lsl 3
+
+        mov             x8, sp
+        mov             x3, 32
+25:     ld1             {v8.8h, v9.8h, v10.8h, v11.8h}, [x0]
+
+        // v2 = e_8
+        smull           v2.4s, v12.4h, v8.h[0]
+        smlal2          v2.4s, v12.8h, v9.h[0]
+        smlal           v2.4s, v13.4h, v10.h[0]
+        smlal2          v2.4s, v13.8h, v11.h[0]
+
+        // v3 = o_8
+        smull           v3.4s, v14.4h, v8.h[4]
+        smlal2          v3.4s, v14.8h, v9.h[4]
+        smlal           v3.4s, v15.4h, v10.h[4]
+        smlal2          v3.4s, v15.8h, v11.h[4]
+
+        // v0,v1 = e_16
+        add             v0.4s, v2.4s, v3.4s
+        sub             v2.4s, v2.4s, v3.4s
+        mov             v1.d[0], v2.d[1]
+        mov             v1.d[1], v2.d[0]
+        rev64           v1.4s, v1.4s
+
+        // v2,v3 = o_16
+        movi            v2.4s, 0
+        movi            v3.4s, 0
+        br              x5
+26:     smlal           v2.4s, v23.4h, v11.h[6]
+        smlal2          v3.4s, v23.8h, v11.h[6]
+        smlal           v2.4s, v22.4h, v11.h[2]
+        smlal2          v3.4s, v22.8h, v11.h[2]
+        smlal           v2.4s, v21.4h, v10.h[6]
+        smlal2          v3.4s, v21.8h, v10.h[6]
+        smlal           v2.4s, v20.4h, v10.h[2]
+        smlal2          v3.4s, v20.8h, v10.h[2]
+        smlal           v2.4s, v19.4h, v9.h[6]
+        smlal2          v3.4s, v19.8h, v9.h[6]
+        smlal           v2.4s, v18.4h, v9.h[2]
+        smlal2          v3.4s, v18.8h, v9.h[2]
+        smlal           v2.4s, v17.4h, v8.h[6]
+        smlal2          v3.4s, v17.8h, v8.h[6]
+        smlal           v2.4s, v16.4h, v8.h[2]
+        smlal2          v3.4s, v16.8h, v8.h[2]
+
+        // v4,v5,v6,v7 = e_32
+        add             v4.4s, v0.4s, v2.4s
+        add             v5.4s, v1.4s, v3.4s
+        sub             v11.4s, v0.4s, v2.4s
+        mov             v7.d[0], v11.d[1]
+        mov             v7.d[1], v11.d[0]
+        rev64           v7.4s, v7.4s
+        sub             v11.4s, v1.4s, v3.4s
+        mov             v6.d[0], v11.d[1]
+        mov             v6.d[1], v11.d[0]
+        rev64           v6.4s, v6.4s
+
+        // v0,v1,v2,v3 = o_32
+        cmp             x1, 9
+        b.hi            28f
+        movi            v0.4s, 0
+        movi            v1.4s, 0
+        movi            v2.4s, 0
+        movi            v3.4s, 0
+        br              x6
+28:     ld1             {v0.4s, v1.4s, v2.4s, v3.4s}, [x8], 64
+        br              x6
+29:     smlal           v0.4s, v30.4h, v8.h[7]
+        smlal2          v1.4s, v30.8h, v8.h[7]
+        smlal           v2.4s, v31.4h, v8.h[7]
+        smlal2          v3.4s, v31.8h, v8.h[7]
+        smlal           v0.4s, v28.4h, v8.h[5]
+        smlal2          v1.4s, v28.8h, v8.h[5]
+        smlal           v2.4s, v29.4h, v8.h[5]
+        smlal2          v3.4s, v29.8h, v8.h[5]
+        smlal           v0.4s, v26.4h, v8.h[3]
+        smlal2          v1.4s, v26.8h, v8.h[3]
+        smlal           v2.4s, v27.4h, v8.h[3]
+        smlal2          v3.4s, v27.8h, v8.h[3]
+        smlal           v0.4s, v24.4h, v8.h[1]
+        smlal2          v1.4s, v24.8h, v8.h[1]
+        smlal           v2.4s, v25.4h, v8.h[1]
+        smlal2          v3.4s, v25.8h, v8.h[1]
+
+        // coeff
+        add             v8.4s, v4.4s, v0.4s
+        add             v9.4s, v5.4s, v1.4s
+        add             v10.4s, v6.4s, v2.4s
+        add             v11.4s, v7.4s, v3.4s
+        sub             v4.4s, v4.4s, v0.4s
+        sub             v5.4s, v5.4s, v1.4s
+        sub             v6.4s, v6.4s, v2.4s
+        sub             v7.4s, v7.4s, v3.4s
+        sqrshrn         v8.4h, v8.4s, 12
+        sqrshrn2        v8.8h, v9.4s, 12
+        sqrshrn         v9.4h, v10.4s, 12
+        sqrshrn2        v9.8h, v11.4s, 12
+        sqrshrn         v4.4h, v4.4s, 12
+        sqrshrn2        v4.8h, v5.4s, 12
+        sqrshrn         v5.4h, v6.4s, 12
+        sqrshrn2        v5.8h, v7.4s, 12
+        mov             v10.d[0], v5.d[1]
+        mov             v10.d[1], v5.d[0]
+        rev64           v10.8h, v10.8h
+        mov             v11.d[0], v4.d[1]
+        mov             v11.d[1], v4.d[0]
+        rev64           v11.8h, v11.8h
+        st1             {v8.8h, v9.8h, v10.8h, v11.8h}, [x0], 64
+        subs            x3, x3, 1
+        b.ne            25b
+
+        add             sp, sp, 16 * 32 * 4
+        ld1             {v12.16b, v13.16b, v14.16b, v15.16b}, [sp], 64
+        ld1             {v8.16b, v9.16b, v10.16b, v11.16b}, [sp], 64
+        ret
+endfunc
diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c
index 612ebb9541..bb2a6b2502 100644
--- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -46,6 +46,7 @@ void ff_hevc_idct_8x8_10_neon(int16_t *coeffs, int col_limit);
 void ff_hevc_idct_16x16_8_neon(int16_t *coeffs, int col_limit);
 void ff_hevc_idct_16x16_10_neon(int16_t *coeffs, int col_limit);
 void ff_hevc_idct_16x16_8_neon_new(int16_t *coeffs, int col_limit);
+void ff_hevc_idct_32x32_8_neon(int16_t *coeffs, int col_limit);
 void ff_hevc_idct_4x4_dc_8_neon(int16_t *coeffs);
 void ff_hevc_idct_8x8_dc_8_neon(int16_t *coeffs);
 void ff_hevc_idct_16x16_dc_8_neon(int16_t *coeffs);
@@ -74,6 +75,7 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
         c->add_residual[3]             = ff_hevc_add_residual_32x32_8_neon;
         c->idct[1]                     = ff_hevc_idct_8x8_8_neon;
         c->idct[2]                     = ff_hevc_idct_16x16_8_neon_new;
+        c->idct[3]                     = ff_hevc_idct_32x32_8_neon;
         c->idct_dc[0]                  = ff_hevc_idct_4x4_dc_8_neon;
         c->idct_dc[1]                  = ff_hevc_idct_8x8_dc_8_neon;
         c->idct_dc[2]                  = ff_hevc_idct_16x16_dc_8_neon;
-- 
2.32.0 (Apple Git-132)


* Re: [FFmpeg-devel] [PATCH 1/2] lavc/aarch64: new 8-bit hevc 16x16 idct
  2022-06-23 12:23 [FFmpeg-devel] [PATCH 1/2] lavc/aarch64: new 8-bit hevc 16x16 idct J. Dekker
  2022-06-23 12:23 ` [FFmpeg-devel] [PATCH 2/2] lavc/aarch64: add 8-bit hevc 32x32 idct J. Dekker
@ 2022-08-09 12:15 ` Martin Storsjö
  1 sibling, 0 replies; 3+ messages in thread
From: Martin Storsjö @ 2022-08-09 12:15 UTC (permalink / raw)
  To: FFmpeg development discussions and patches

On Thu, 23 Jun 2022, J. Dekker wrote:

> old:
> hevc_idct_16x16_8_c: 5366.2
> hevc_idct_16x16_8_neon: 1493.2
>
> new:
> hevc_idct_16x16_8_c: 5363.2
> hevc_idct_16x16_8_neon: 943.5
>
> Co-developed-by: Rafal Dabrowa <fatwildcat@gmail.com>
> Signed-off-by: J. Dekker <jdek@itanimul.li>
> ---
> libavcodec/aarch64/hevcdsp_idct_neon.S    | 666 ++++++++++++++++++++++
> libavcodec/aarch64/hevcdsp_init_aarch64.c |   3 +-
> 2 files changed, 668 insertions(+), 1 deletion(-)

Throughout the new code, you have e.g. "add x5, x5, x4, lsl 2", where the 
"lsl 2" breaks assembling with MS armasm64 - it's missing the '#' on the 
constant 2.

Also, for loads/stores, it seems to be missing the same '#' for 
postincrement, e.g. "ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x4], 64". Also 
"mov x4, 64". Apparently armasm64 doesn't have a problem with that, but it 
would still be good to have it consistent with the rest.
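
For illustration, the same lines with the immediates spelled out (just 
showing the syntax I mean, not a full pass over the patch):

        add             x5, x5, x4, lsl #2
        ld1             {v16.8h, v17.8h, v18.8h, v19.8h}, [x4], #64
        mov             x4, #64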

> This idct is significantly faster than the one we currently have, I
> suspect its for a couple reasons: 1) it's only written for 8bit

I don't see how that would change anything? Isn't the only thing that 
differs between 8 and 10/12 bit in the existing implementation about how 
much to scale down at the end? All other intermediate values are the same 
size?
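
To be explicit about what I mean (a rough sketch; the shift amounts come 
from the spec, not from this patch): the first pass always narrows with

        sqrshrn         v10.4h, v10.4s, #7       // pass 1, all bit depths

and only the second pass shift depends on the bit depth (20 - bitdepth):

        sqrshrn         v8.4h, v8.4s, #12        // pass 2, 8 bit
        sqrshrn         v8.4h, v8.4s, #10        // pass 2, 10 bit

so the intermediate code itself should be reusable across bit depths.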

> 2) it's unrolled signficantly more. It comes at a hefty cost of roughly 
> 2.25x the object size.

If by that, you mean that the existing code works on 4 elements at a time 
(i.e. mostly operating on .4h vectors), while this one operates on .8h 
vectors, then yes, that's most probably the biggest source of the speedup 
(even if a lot of the intermediate stuff happens in .4s vectors). The 
existing code was ported from the 32 bit arm version (which probably had 
to stick to 4 elements at a time due to register availability there), 
while it probably could have been made double width when it was ported to 
64 bit.
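
In other words, where the current code does one widening multiply per 
step, roughly

        smull           v0.4s, v18.4h, v15.h[0]  // 4 lanes at a time

the new code pairs it with the high-half variant and covers a full 
8-element row in one go:

        smull           v0.4s, v18.4h, v15.h[0]  // low 4 lanes
        smull2          v1.4s, v18.8h, v15.h[0]  // high 4 lanes

(register names taken from the new code; the single .4h line is only a 
simplified sketch of what the existing implementation does).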

> I'm wondering if this idct is salvagable, or the one we have should just 
> be improved instead.

Well, my honest opinion is:

- I don't quite understand the current code (I've worked on the 
vp8/vp9/av1 IDCTs a fair amount, but the HEVC one seems to be different 
enough that I don't recognize all the concepts here).

- The current implementation would need to be reformatted if kept

- The current implementation does have some rather clear high level 
structure though, e.g. when looking at the idct_16x16 macro.

- The new implementation seems to be just one huuuuge function. If you 
know it by heart, it's probably good, but it's really hard to get an 
overview of if you're not familiar with the HEVC IDCTs.

As for steps forward:
- Is it possible to widen the existing implementation to operate on 8 
elements instead of 4? I think that would bring it up to par with this 
one.
- Can you get some high level structure to the new implementation so that 
it becomes understandable? Either lots more comments explaining what's 
happening and why, or splitting it up into smaller macros (see the 
sketch below for one idea).
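
For instance, the o_16 accumulation is repeated over and over and only 
differs in which .Lo16transformN row it loads, so (leaving aside the 
computed br into the middle of the run, which is what the col_limit 
handling relies on) it could be hidden behind a macro, roughly like 
this - the macro name here is made up:

// v8,v9 = o_16 for the coefficient row given by \coeffs
.macro o16_acc coeffs
        adr             x4, \coeffs
        ld1             {v15.8h}, [x4]
        movi            v8.4s, #0
        movi            v9.4s, #0
        smlal           v8.4s, v17.4h, v15.h[0]
        smlal2          v9.4s, v17.8h, v15.h[0]
        smlal           v8.4s, v19.4h, v15.h[1]
        smlal2          v9.4s, v19.8h, v15.h[1]
        smlal           v8.4s, v21.4h, v15.h[2]
        smlal2          v9.4s, v21.8h, v15.h[2]
        smlal           v8.4s, v23.4h, v15.h[3]
        smlal2          v9.4s, v23.8h, v15.h[3]
        smlal           v8.4s, v25.4h, v15.h[4]
        smlal2          v9.4s, v25.8h, v15.h[4]
        smlal           v8.4s, v27.4h, v15.h[5]
        smlal2          v9.4s, v27.8h, v15.h[5]
        smlal           v8.4s, v29.4h, v15.h[6]
        smlal2          v9.4s, v29.8h, v15.h[6]
        smlal           v8.4s, v31.4h, v15.h[7]
        smlal2          v9.4s, v31.8h, v15.h[7]
.endm

so each of those blocks in the function body collapses to e.g.

        o16_acc         .Lo16transform1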

Some more comments on the code itself below:

> +// void ff_hevc_idct_16x16_8_neon(int16_t *coeffs, int col_limit)
> +function ff_hevc_idct_16x16_8_neon_new, export=1
> +        sub             sp, sp, 64
> +        st1             {v8.16b, v9.16b, v10.16b, v11.16b}, [sp]
> +        sub             sp, sp, 32
> +        st1             {v14.16b, v15.16b}, [sp]
> +        mov             x3, 0
> +        mov             x2, x0
> +1:      mov             x4, x2
> +        mov             x5, 32
> +        ld1             {v16.8h}, [x4], x5
> +        ld1             {v17.8h}, [x4], x5
> +        ld1             {v18.8h}, [x4], x5
> +        ld1             {v19.8h}, [x4], x5
> +        ld1             {v20.8h}, [x4], x5
> +        ld1             {v21.8h}, [x4], x5
> +        ld1             {v22.8h}, [x4], x5
> +        ld1             {v23.8h}, [x4], x5
> +        ld1             {v24.8h}, [x4], x5
> +        ld1             {v25.8h}, [x4], x5
> +        ld1             {v26.8h}, [x4], x5
> +        ld1             {v27.8h}, [x4], x5
> +        ld1             {v28.8h}, [x4], x5
> +        ld1             {v29.8h}, [x4], x5
> +        ld1             {v30.8h}, [x4], x5
> +        ld1             {v31.8h}, [x4], x5
> +        cmp             x1, 12
> +        b.hs            5f
> +        // limit2 below 16
> +        bic             x4, x1, 1
> +        adr             x5, .LimitMask
> +        cbnz            x3, 3f
> +        // columns 0 .. 7 - cleanup of indexes 5 .. 7
> +        ld1             {v0.8h}, [x5]
> +        adr             x5, 2f
> +        add             x5, x5, x4, lsl 2
> +        add             x5, x5, x4, lsl 1
> +        br              x5
> +2:      and             v17.16b, v17.16b, v0.16b    // col_limit 0..1 -> limit2 == 4..5
> +        and             v19.16b, v19.16b, v0.16b
> +        b               5f

I don't really know what these jump tables do and how they correspond to 
things in the existing implementation - but I guess that can be one part 
of what makes things faster too.

The existing implementation does a 16x16 transform by first doing 4x 
transforms on a 4x16 piece of data, transposing that, then doing another 
4x 4x16 for the second pass. How does the new implementation do it?

If I understand correctly, the old implementation didn't take col_limit 
into account at all. Can that be one part of what makes things faster - or 
is that only something that makes a difference in real use but not in 
checkasm benchmarks?

// Martin
