From 081aff967d4fdc3d475c777033223625db3bb532 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: Wed, 7 Sep 2022 11:50:29 +0300 Subject: [PATCH] squash: nsse16: Tune scheduling Before: Cortex A53 A72 A73 nsse_0_neon: 401.0 198.0 194.5 After: nsse_0_neon: 377.0 198.7 196.5 (The differences on A72 and A73 are within the measurement noise; those numbers vary more than that from one run to another.) --- libavcodec/aarch64/me_cmp_neon.S | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S index bd21122a21..2a2af7a788 100644 --- a/libavcodec/aarch64/me_cmp_neon.S +++ b/libavcodec/aarch64/me_cmp_neon.S @@ -874,8 +874,8 @@ function nsse16_neon, export=1 ld1 {v0.16b}, [x1], x3 subs w4, w4, #1 // we need to make h-1 iterations - ext v1.16b, v0.16b, v0.16b, #1 // x1 + 1 ld1 {v2.16b}, [x2], x3 + ext v1.16b, v0.16b, v0.16b, #1 // x1 + 1 cmp w4, #2 ext v3.16b, v2.16b, v2.16b, #1 // x2 + 1 @@ -884,12 +884,12 @@ function nsse16_neon, export=1 // make 2 iterations at once 1: ld1 {v4.16b}, [x1], x3 + ld1 {v6.16b}, [x2], x3 ld1 {v20.16b}, [x1], x3 ext v5.16b, v4.16b, v4.16b, #1 // x1 + stride + 1 - ext v21.16b, v20.16b, v20.16b, #1 - ld1 {v6.16b}, [x2], x3 ld1 {v22.16b}, [x2], x3 ext v7.16b, v6.16b, v6.16b, #1 // x2 + stride + 1 + ext v21.16b, v20.16b, v20.16b, #1 ext v23.16b, v22.16b, v22.16b, #1 usubl v31.8h, v0.8b, v4.8b @@ -933,8 +933,8 @@ function nsse16_neon, export=1 2: ld1 {v4.16b}, [x1], x3 subs w4, w4, #1 - ext v5.16b, v4.16b, v4.16b, #1 // x1 + stride + 1 ld1 {v6.16b}, [x2], x3 + ext v5.16b, v4.16b, v4.16b, #1 // x1 + stride + 1 usubl v31.8h, v0.8b, v4.8b ext v7.16b, v6.16b, v6.16b, #1 // x2 + stride + 1 -- 2.25.1