[FFmpeg-devel] [PATCH 02/21] aarch64: hevc: Don't iterate with sp in ff_hevc_put_hevc_qpel_uni_w_hv32/64_8_neon_i8mm

From: "Martin Storsjö" <martin@martin.st>
To: ffmpeg-devel@ffmpeg.org
Cc: Logan Lyu <Logan.Lyu@myais.com.cn>, "J . Dekker" <jdek@itanimul.li>
Subject: [FFmpeg-devel] [PATCH 02/21] aarch64: hevc: Don't iterate with sp in ff_hevc_put_hevc_qpel_uni_w_hv32/64_8_neon_i8mm
Date: Mon, 25 Mar 2024 17:02:24 +0200
Message-ID: <20240325150243.59058-3-martin@martin.st> (raw)
In-Reply-To: <20240325150243.59058-1-martin@martin.st>

Many of the routines within hevcdsp_epel_neon and hevcdsp_qpel_neon
store temporary buffers on the stack. When consuming that data,
many of these functions use the stack pointer itself as the
incrementing read pointer (instead of copying it to another register),
which is rather unusual.

Technically, this is fine as long as the pointer remains properly
aligned.

However, in the case of ff_hevc_put_hevc_qpel_uni_w_hv64_8_neon_i8mm,
after incrementing sp while reading data (within each 16 pixel
wide stripe), it would then reset the stack pointer back to a lower
value for reading the next 16 pixel wide stripe, expecting the
data to remain untouched.

This can't be assumed; data on the stack below the stack pointer
can be clobbered (e.g. by a signal handler). Some OS ABIs
allow for a little margin that won't be touched, aka a red zone,
but not all do. Those that do guarantee only 16 or 128 bytes, not
9 KB.

Convert this function, and ff_hevc_put_hevc_qpel_uni_w_hv32_8_neon_i8mm
which iterates the same way, to use a separate pointer register to
iterate through the data, retaining the stack pointer to point
at the bottom of the data we require to remain untouched.
---
 libavcodec/aarch64/hevcdsp_qpel_neon.S | 130 +++++++++++++------------
 1 file changed, 66 insertions(+), 64 deletions(-)

diff --git a/libavcodec/aarch64/hevcdsp_qpel_neon.S b/libavcodec/aarch64/hevcdsp_qpel_neon.S
index 9be29cafe2..815d897094 100644
--- a/libavcodec/aarch64/hevcdsp_qpel_neon.S
+++ b/libavcodec/aarch64/hevcdsp_qpel_neon.S
@@ -3981,24 +3981,25 @@ function ff_hevc_put_hevc_qpel_uni_w_hv32_8_neon_i8mm, export=1
         mov             x11, sp
         mov             w12, w22
         mov             x13, x20
+        mov             x14, sp
 3:
-        ldp             q16, q1, [sp]
-        add             sp, sp, x10
-        ldp             q17, q2, [sp]
-        add             sp, sp, x10
-        ldp             q18, q3, [sp]
-        add             sp, sp, x10
-        ldp             q19, q4, [sp]
-        add             sp, sp, x10
-        ldp             q20, q5, [sp]
-        add             sp, sp, x10
-        ldp             q21, q6, [sp]
-        add             sp, sp, x10
-        ldp             q22, q7, [sp]
-        add             sp, sp, x10
+        ldp             q16, q1, [x11]
+        add             x11, x11, x10
+        ldp             q17, q2, [x11]
+        add             x11, x11, x10
+        ldp             q18, q3, [x11]
+        add             x11, x11, x10
+        ldp             q19, q4, [x11]
+        add             x11, x11, x10
+        ldp             q20, q5, [x11]
+        add             x11, x11, x10
+        ldp             q21, q6, [x11]
+        add             x11, x11, x10
+        ldp             q22, q7, [x11]
+        add             x11, x11, x10
 1:
-        ldp             q23, q31, [sp]
-        add             sp, sp, x10
+        ldp             q23, q31, [x11]
+        add             x11, x11, x10
         QPEL_FILTER_H   v24, v16, v17, v18, v19, v20, v21, v22, v23
         QPEL_FILTER_H2  v25, v16, v17, v18, v19, v20, v21, v22, v23
         QPEL_FILTER_H   v26,  v1,  v2,  v3,  v4,  v5,  v6,  v7, v31
@@ -4007,8 +4008,8 @@ function ff_hevc_put_hevc_qpel_uni_w_hv32_8_neon_i8mm, export=1
         subs            w22, w22, #1
         b.eq            2f
 
-        ldp             q16, q1, [sp]
-        add             sp, sp, x10
+        ldp             q16, q1, [x11]
+        add             x11, x11, x10
         QPEL_FILTER_H   v24, v17, v18, v19, v20, v21, v22, v23, v16
         QPEL_FILTER_H2  v25, v17, v18, v19, v20, v21, v22, v23, v16
         QPEL_FILTER_H   v26,  v2,  v3,  v4,  v5,  v6,  v7, v31,  v1
@@ -4017,8 +4018,8 @@ function ff_hevc_put_hevc_qpel_uni_w_hv32_8_neon_i8mm, export=1
         subs            w22, w22, #1
         b.eq            2f
 
-        ldp             q17, q2, [sp]
-        add             sp, sp, x10
+        ldp             q17, q2, [x11]
+        add             x11, x11, x10
         QPEL_FILTER_H   v24, v18, v19, v20, v21, v22, v23, v16, v17
         QPEL_FILTER_H2  v25, v18, v19, v20, v21, v22, v23, v16, v17
         QPEL_FILTER_H   v26,  v3,  v4,  v5,  v6,  v7, v31,  v1,  v2
@@ -4027,8 +4028,8 @@ function ff_hevc_put_hevc_qpel_uni_w_hv32_8_neon_i8mm, export=1
         subs            w22, w22, #1
         b.eq            2f
 
-        ldp             q18, q3, [sp]
-        add             sp, sp, x10
+        ldp             q18, q3, [x11]
+        add             x11, x11, x10
         QPEL_FILTER_H   v24, v19, v20, v21, v22, v23, v16, v17, v18
         QPEL_FILTER_H2  v25, v19, v20, v21, v22, v23, v16, v17, v18
         QPEL_FILTER_H   v26,  v4,  v5,  v6,  v7, v31,  v1,  v2,  v3
@@ -4037,8 +4038,8 @@ function ff_hevc_put_hevc_qpel_uni_w_hv32_8_neon_i8mm, export=1
         subs            w22, w22, #1
         b.eq            2f
 
-        ldp             q19, q4, [sp]
-        add             sp, sp, x10
+        ldp             q19, q4, [x11]
+        add             x11, x11, x10
         QPEL_FILTER_H   v24, v20, v21, v22, v23, v16, v17, v18, v19
         QPEL_FILTER_H2  v25, v20, v21, v22, v23, v16, v17, v18, v19
         QPEL_FILTER_H   v26,  v5,  v6,  v7, v31,  v1,  v2,  v3,  v4
@@ -4047,8 +4048,8 @@ function ff_hevc_put_hevc_qpel_uni_w_hv32_8_neon_i8mm, export=1
         subs            w22, w22, #1
         b.eq            2f
 
-        ldp             q20, q5, [sp]
-        add             sp, sp, x10
+        ldp             q20, q5, [x11]
+        add             x11, x11, x10
         QPEL_FILTER_H   v24, v21, v22, v23, v16, v17, v18, v19, v20
         QPEL_FILTER_H2  v25, v21, v22, v23, v16, v17, v18, v19, v20
         QPEL_FILTER_H   v26,  v6,  v7, v31,  v1,  v2,  v3,  v4,  v5
@@ -4057,8 +4058,8 @@ function ff_hevc_put_hevc_qpel_uni_w_hv32_8_neon_i8mm, export=1
         subs            w22, w22, #1
         b.eq            2f
 
-        ldp             q21, q6, [sp]
-        add             sp, sp, x10
+        ldp             q21, q6, [x11]
+        add             x11, x11, x10
         QPEL_FILTER_H   v24, v22, v23, v16, v17, v18, v19, v20, v21
         QPEL_FILTER_H2  v25, v22, v23, v16, v17, v18, v19, v20, v21
         QPEL_FILTER_H   v26,  v7, v31,  v1,  v2,  v3,  v4,  v5,  v6
@@ -4067,8 +4068,8 @@ function ff_hevc_put_hevc_qpel_uni_w_hv32_8_neon_i8mm, export=1
         subs            w22, w22, #1
         b.eq            2f
 
-        ldp             q22, q7, [sp]
-        add             sp, sp, x10
+        ldp             q22, q7, [x11]
+        add             x11, x11, x10
         QPEL_FILTER_H   v24, v23, v16, v17, v18, v19, v20, v21, v22
         QPEL_FILTER_H2  v25, v23, v16, v17, v18, v19, v20, v21, v22
         QPEL_FILTER_H   v26, v31,  v1,  v2,  v3,  v4,  v5,  v6,  v7
@@ -4078,10 +4079,10 @@ function ff_hevc_put_hevc_qpel_uni_w_hv32_8_neon_i8mm, export=1
         b.hi            1b
 2:
         subs            w27, w27, #16
-        add             sp, x11, #32
+        add             x11, x14, #32
         add             x20, x13, #16
         mov             w22, w12
-        mov             x11, sp
+        mov             x14, x11
         mov             x13, x20
         b.hi            3b
         QPEL_UNI_W_HV_END
@@ -4093,24 +4094,25 @@ function ff_hevc_put_hevc_qpel_uni_w_hv64_8_neon_i8mm, export=1
         mov             x11, sp
         mov             w12, w22
         mov             x13, x20
+        mov             x14, sp
 3:
-        ldp             q16, q1, [sp]
-        add             sp, sp, x10
-        ldp             q17, q2, [sp]
-        add             sp, sp, x10
-        ldp             q18, q3, [sp]
-        add             sp, sp, x10
-        ldp             q19, q4, [sp]
-        add             sp, sp, x10
-        ldp             q20, q5, [sp]
-        add             sp, sp, x10
-        ldp             q21, q6, [sp]
-        add             sp, sp, x10
-        ldp             q22, q7, [sp]
-        add             sp, sp, x10
+        ldp             q16, q1, [x11]
+        add             x11, x11, x10
+        ldp             q17, q2, [x11]
+        add             x11, x11, x10
+        ldp             q18, q3, [x11]
+        add             x11, x11, x10
+        ldp             q19, q4, [x11]
+        add             x11, x11, x10
+        ldp             q20, q5, [x11]
+        add             x11, x11, x10
+        ldp             q21, q6, [x11]
+        add             x11, x11, x10
+        ldp             q22, q7, [x11]
+        add             x11, x11, x10
 1:
-        ldp             q23, q31, [sp]
-        add             sp, sp, x10
+        ldp             q23, q31, [x11]
+        add             x11, x11, x10
         QPEL_FILTER_H   v24, v16, v17, v18, v19, v20, v21, v22, v23
         QPEL_FILTER_H2  v25, v16, v17, v18, v19, v20, v21, v22, v23
         QPEL_FILTER_H   v26,  v1,  v2,  v3,  v4,  v5,  v6,  v7, v31
@@ -4119,8 +4121,8 @@ function ff_hevc_put_hevc_qpel_uni_w_hv64_8_neon_i8mm, export=1
         subs            w22, w22, #1
         b.eq            2f
 
-        ldp             q16, q1, [sp]
-        add             sp, sp, x10
+        ldp             q16, q1, [x11]
+        add             x11, x11, x10
         QPEL_FILTER_H   v24, v17, v18, v19, v20, v21, v22, v23, v16
         QPEL_FILTER_H2  v25, v17, v18, v19, v20, v21, v22, v23, v16
         QPEL_FILTER_H   v26,  v2,  v3,  v4,  v5,  v6,  v7, v31,  v1
@@ -4129,8 +4131,8 @@ function ff_hevc_put_hevc_qpel_uni_w_hv64_8_neon_i8mm, export=1
         subs            w22, w22, #1
         b.eq            2f
 
-        ldp             q17, q2, [sp]
-        add             sp, sp, x10
+        ldp             q17, q2, [x11]
+        add             x11, x11, x10
         QPEL_FILTER_H   v24, v18, v19, v20, v21, v22, v23, v16, v17
         QPEL_FILTER_H2  v25, v18, v19, v20, v21, v22, v23, v16, v17
         QPEL_FILTER_H   v26,  v3,  v4,  v5,  v6,  v7, v31,  v1,  v2
@@ -4139,8 +4141,8 @@ function ff_hevc_put_hevc_qpel_uni_w_hv64_8_neon_i8mm, export=1
         subs            w22, w22, #1
         b.eq            2f
 
-        ldp             q18, q3, [sp]
-        add             sp, sp, x10
+        ldp             q18, q3, [x11]
+        add             x11, x11, x10
         QPEL_FILTER_H   v24, v19, v20, v21, v22, v23, v16, v17, v18
         QPEL_FILTER_H2  v25, v19, v20, v21, v22, v23, v16, v17, v18
         QPEL_FILTER_H   v26,  v4,  v5,  v6,  v7, v31,  v1,  v2,  v3
@@ -4149,8 +4151,8 @@ function ff_hevc_put_hevc_qpel_uni_w_hv64_8_neon_i8mm, export=1
         subs            w22, w22, #1
         b.eq            2f
 
-        ldp             q19, q4, [sp]
-        add             sp, sp, x10
+        ldp             q19, q4, [x11]
+        add             x11, x11, x10
         QPEL_FILTER_H   v24, v20, v21, v22, v23, v16, v17, v18, v19
         QPEL_FILTER_H2  v25, v20, v21, v22, v23, v16, v17, v18, v19
         QPEL_FILTER_H   v26,  v5,  v6,  v7, v31,  v1,  v2,  v3,  v4
@@ -4159,8 +4161,8 @@ function ff_hevc_put_hevc_qpel_uni_w_hv64_8_neon_i8mm, export=1
         subs            w22, w22, #1
         b.eq            2f
 
-        ldp             q20, q5, [sp]
-        add             sp, sp, x10
+        ldp             q20, q5, [x11]
+        add             x11, x11, x10
         QPEL_FILTER_H   v24, v21, v22, v23, v16, v17, v18, v19, v20
         QPEL_FILTER_H2  v25, v21, v22, v23, v16, v17, v18, v19, v20
         QPEL_FILTER_H   v26,  v6,  v7, v31,  v1,  v2,  v3,  v4,  v5
@@ -4169,8 +4171,8 @@ function ff_hevc_put_hevc_qpel_uni_w_hv64_8_neon_i8mm, export=1
         subs            w22, w22, #1
         b.eq            2f
 
-        ldp             q21, q6, [sp]
-        add             sp, sp, x10
+        ldp             q21, q6, [x11]
+        add             x11, x11, x10
         QPEL_FILTER_H   v24, v22, v23, v16, v17, v18, v19, v20, v21
         QPEL_FILTER_H2  v25, v22, v23, v16, v17, v18, v19, v20, v21
         QPEL_FILTER_H   v26,  v7, v31,  v1,  v2,  v3,  v4,  v5,  v6
@@ -4179,8 +4181,8 @@ function ff_hevc_put_hevc_qpel_uni_w_hv64_8_neon_i8mm, export=1
         subs            w22, w22, #1
         b.eq            2f
 
-        ldp             q22, q7, [sp]
-        add             sp, sp, x10
+        ldp             q22, q7, [x11]
+        add             x11, x11, x10
         QPEL_FILTER_H   v24, v23, v16, v17, v18, v19, v20, v21, v22
         QPEL_FILTER_H2  v25, v23, v16, v17, v18, v19, v20, v21, v22
         QPEL_FILTER_H   v26, v31,  v1,  v2,  v3,  v4,  v5,  v6,  v7
@@ -4190,10 +4192,10 @@ function ff_hevc_put_hevc_qpel_uni_w_hv64_8_neon_i8mm, export=1
         b.hi            1b
 2:
         subs            w27, w27, #16
-        add             sp, x11, #32
+        add             x11, x14, #32
         add             x20, x13, #16
         mov             w22, w12
-        mov             x11, sp
+        mov             x14, x11
         mov             x13, x20
         b.hi            3b
         QPEL_UNI_W_HV_END
-- 
2.39.3 (Apple Git-146)

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".