From d0345bceaf013bea2023b1a02b372f2a64c6efaf Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Martin=20Storsj=C3=B6?= <martin@martin.st>
Date: Wed, 28 Sep 2022 11:53:55 +0300
Subject: [PATCH] aarch64: me_cmp: Improve scheduling in vsse_intra8

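Interleave the independent uabd/umull/uadalp chains in the loop so that
each result is consumed further away from the instruction producing it,
and move the loop counter setup (sub/cmp) up so that it is interleaved
with the initial load and the clearing of the accumulator.

Before:  Cortex A53    A72    A73
vsse_5_neon:   74.7   31.5   26.0
After:
vsse_5_neon:   62.7   32.5   25.7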
---
 libavcodec/aarch64/me_cmp_neon.S | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S
index 4037953488..dc0b1e5f43 100644
--- a/libavcodec/aarch64/me_cmp_neon.S
+++ b/libavcodec/aarch64/me_cmp_neon.S
@@ -1113,11 +1113,11 @@ function vsse_intra8_neon, export=1
         // x3           ptrdiff_t stride
         // w4           int h
 
+        sub             w4, w4, #1 // we need to make h-1 iterations
         ld1             {v0.8b}, [x1], x3
+        cmp             w4, #3
         movi            v16.4s, #0
 
-        sub             w4, w4, #1 // we need to make h-1 iterations
-        cmp             w4, #3
         b.lt            2f
 
 1:
@@ -1127,13 +1127,13 @@ function vsse_intra8_neon, export=1
         ld1             {v2.8b}, [x1], x3
         uabd            v30.8b, v0.8b, v1.8b
         ld1             {v3.8b}, [x1], x3
-        umull           v29.8h, v30.8b, v30.8b
         uabd            v27.8b, v1.8b, v2.8b
-        uadalp          v16.4s, v29.8h
-        umull           v26.8h, v27.8b, v27.8b
+        umull           v29.8h, v30.8b, v30.8b
         uabd            v25.8b, v2.8b, v3.8b
-        uadalp          v16.4s, v26.8h
+        umull           v26.8h, v27.8b, v27.8b
+        uadalp          v16.4s, v29.8h
         umull           v24.8h, v25.8b, v25.8b
+        uadalp          v16.4s, v26.8h
         sub             w4, w4, #3
         uadalp          v16.4s, v24.8h
         cmp             w4, #3
-- 
2.25.1