From 76eb1f213a72cdfd04a62c773442336cd56e0858 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: Sat, 3 Sep 2022 00:45:55 +0300 Subject: [PATCH] Improve vsad16_neon --- libavcodec/aarch64/me_cmp_neon.S | 71 ++++++++++++++------------------ 1 file changed, 31 insertions(+), 40 deletions(-) diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S index ecc9c793d6..7ab8744e0d 100644 --- a/libavcodec/aarch64/me_cmp_neon.S +++ b/libavcodec/aarch64/me_cmp_neon.S @@ -592,65 +592,56 @@ function vsad16_neon, export=1 // x3 ptrdiff_t stride // w4 int h + ld1 {v0.16b}, [x1], x3 // Load pix1[0], first iteration + ld1 {v1.16b}, [x2], x3 // Load pix2[0], first iteration + sub w4, w4, #1 // we need to make h-1 iterations movi v16.8h, #0 cmp w4, #3 // check if we can make 3 iterations at once - add x5, x1, x3 // pix1 + stride - add x6, x2, x3 // pix2 + stride - b.le 2f + usubl v31.8h, v0.8b, v1.8b // Signed difference pix1[0] - pix2[0], first iteration + usubl2 v30.8h, v0.16b, v1.16b // Signed difference pix1[0] - pix2[0], first iteration + + b.lt 2f 1: // abs(pix1[0] - pix2[0] - pix1[0 + stride] + pix2[0 + stride]) // abs(x) = (x < 0 ? (-x) : (x)) - ld1 {v0.16b}, [x1], x3 // Load pix1[0], first iteration - ld1 {v1.16b}, [x2], x3 // Load pix2[0], first iteration - ld1 {v2.16b}, [x5], x3 // Load pix1[0 + stride], first iteration - usubl v31.8h, v0.8b, v1.8b // Signed difference pix1[0] - pix2[0], first iteration - ld1 {v3.16b}, [x6], x3 // Load pix2[0 + stride], first iteration - usubl2 v30.8h, v0.16b, v1.16b // Signed difference pix1[0] - pix2[0], first iteration - usubl v29.8h, v2.8b, v3.8b // Signed difference pix1[0 + stride] - pix2[0 + stride], first iteration - ld1 {v4.16b}, [x1], x3 // Load pix1[0], second iteration - usubl2 v28.8h, v2.16b, v3.16b // Signed difference pix1[0 + stride] - pix2[0 + stride], first iteration - ld1 {v5.16b}, [x2], x3 // Load pix2[0], second iteration + ld1 {v0.16b}, [x1], x3 // Load pix1[0 + stride], first iteration + ld1 {v1.16b}, [x2], x3 // Load pix2[0 + stride], first iteration + ld1 {v2.16b}, [x1], x3 // Load pix1[0 + stride], second iteration + ld1 {v3.16b}, [x2], x3 // Load pix2[0 + stride], second iteration + usubl v29.8h, v0.8b, v1.8b // Signed difference pix1[0 + stride] - pix2[0 + stride], first iteration + usubl2 v28.8h, v0.16b, v1.16b // Signed difference pix1[0 + stride] - pix2[0 + stride], first iteration + ld1 {v4.16b}, [x1], x3 // Load pix1[0 + stride], third iteration + ld1 {v5.16b}, [x2], x3 // Load pix2[0 + stride], third iteration + usubl v27.8h, v2.8b, v3.8b // Signed difference pix1[0 + stride] - pix2[0 + stride], second iteration saba v16.8h, v31.8h, v29.8h // Signed absolute difference and accumulate the result. first iteration - ld1 {v6.16b}, [x5], x3 // Load pix1[0 + stride], second iteration + usubl2 v26.8h, v2.16b, v3.16b // Signed difference pix1[0 + stride] - pix2[0 + stride], second iteration saba v16.8h, v30.8h, v28.8h // Signed absolute difference and accumulate the result. first iteration - usubl v27.8h, v4.8b, v5.8b // Signed difference pix1[0] - pix2[0], second iteration - ld1 {v7.16b}, [x6], x3 // Load pix2[0 + stride], second iteration - usubl2 v26.8h, v4.16b, v5.16b // Signed difference pix1[0] - pix2[0], second iteration - usubl v25.8h, v6.8b, v7.8b // Signed difference pix1[0 + stride] - pix2[0 + stride], second iteration - ld1 {v17.16b}, [x1], x3 // Load pix1[0], third iteration - usubl2 v24.8h, v6.16b, v7.16b // Signed difference pix1[0 + stride] - pix2[0 + stride], second iteration - ld1 {v18.16b}, [x2], x3 // Load pix2[0], second iteration - saba v16.8h, v27.8h, v25.8h // Signed absolute difference and accumulate the result. second iteration - ld1 {v19.16b}, [x5], x3 // Load pix1[0 + stride], third iteration - saba v16.8h, v26.8h, v24.8h // Signed absolute difference and accumulate the result. second iteration - usubl v23.8h, v17.8b, v18.8b // Signed difference pix1[0] - pix2[0], third iteration - ld1 {v20.16b}, [x6], x3 // Load pix2[0 + stride], third iteration - usubl2 v22.8h, v17.16b, v18.16b // Signed difference pix1[0] - pix2[0], third iteration - usubl v21.8h, v19.8b, v20.8b // Signed difference pix1[0 + stride] - pix2[0 + stride], third iteration + usubl v25.8h, v4.8b, v5.8b // Signed difference pix1[0 + stride] - pix2[0 + stride], third iteration + usubl2 v24.8h, v4.16b, v5.16b // Signed difference pix1[0 + stride] - pix2[0 + stride], third iteration + saba v16.8h, v29.8h, v27.8h // Signed absolute difference and accumulate the result. second iteration + mov v31.16b, v25.16b + saba v16.8h, v28.8h, v26.8h // Signed absolute difference and accumulate the result. second iteration sub w4, w4, #3 // h -= 3 - saba v16.8h, v23.8h, v21.8h // Signed absolute difference and accumulate the result. third iteration - usubl2 v31.8h, v19.16b, v20.16b // Signed difference pix1[0 + stride] - pix2[0 + stride], third iteration + mov v30.16b, v24.16b + saba v16.8h, v27.8h, v25.8h // Signed absolute difference and accumulate the result. third iteration cmp w4, #3 - saba v16.8h, v22.8h, v31.8h // Signed absolute difference and accumulate the result. third iteration + saba v16.8h, v26.8h, v24.8h // Signed absolute difference and accumulate the result. third iteration b.ge 1b cbz w4, 3f 2: - ld1 {v0.16b}, [x1], x3 ld1 {v1.16b}, [x2], x3 - ld1 {v2.16b}, [x5], x3 - usubl v30.8h, v0.8b, v1.8b - ld1 {v3.16b}, [x6], x3 - usubl2 v29.8h, v0.16b, v1.16b - usubl v28.8h, v2.8b, v3.8b - usubl2 v27.8h, v2.16b, v3.16b - saba v16.8h, v30.8h, v28.8h subs w4, w4, #1 - saba v16.8h, v29.8h, v27.8h + usubl v29.8h, v0.8b, v1.8b + usubl2 v28.8h, v0.16b, v1.16b + saba v16.8h, v31.8h, v29.8h + mov v31.16b, v29.16b + saba v16.8h, v30.8h, v28.8h + mov v30.16b, v28.16b b.ne 2b 3: -- 2.25.1