From: "Martin Storsjö" <martin@martin.st>
To: ffmpeg-devel@ffmpeg.org
Cc: Logan Lyu <Logan.Lyu@myais.com.cn>, "J . Dekker" <jdek@itanimul.li>
Subject: [FFmpeg-devel] [PATCH 02/21] aarch64: hevc: Don't iterate with sp in ff_hevc_put_hevc_qpel_uni_w_hv32/64_8_neon_i8mm
Date: Mon, 25 Mar 2024 17:02:24 +0200
Message-ID: <20240325150243.59058-3-martin@martin.st> (raw)
In-Reply-To: <20240325150243.59058-1-martin@martin.st>
Many of the routines within hevcdsp_epel_neon and hevcdsp_qpel_neon
store temporary buffers on the stack. When consuming these buffers,
many of these functions use the stack pointer itself as the incrementing
pointer for reading the data (instead of copying it to another register),
which is rather unusual.
Technically, this is fine as long as the pointer remains properly
aligned.
However, in the case of ff_hevc_put_hevc_qpel_uni_w_hv64_8_neon_i8mm,
after incrementing sp while reading data (within each 16 pixel
wide stripe), it would then reset the stack pointer back to a lower
value for reading the next 16 pixel wide stripe, expecting the
data to remain untouched.
This can't be assumed; data on the stack below the stack pointer
can be clobbered (e.g. by a signal handler). Some OS ABIs
allow for a small margin that won't be touched, known as a red zone,
but not all do. Those that do guarantee only 16 or 128 bytes, not
9 KB.
Convert these functions (the hv32 and hv64 variants) to use a separate
pointer register to iterate through the data, keeping the stack pointer
at the bottom of the data that must remain untouched.
---
libavcodec/aarch64/hevcdsp_qpel_neon.S | 130 +++++++++++++------------
1 file changed, 66 insertions(+), 64 deletions(-)
diff --git a/libavcodec/aarch64/hevcdsp_qpel_neon.S b/libavcodec/aarch64/hevcdsp_qpel_neon.S
index 9be29cafe2..815d897094 100644
--- a/libavcodec/aarch64/hevcdsp_qpel_neon.S
+++ b/libavcodec/aarch64/hevcdsp_qpel_neon.S
@@ -3981,24 +3981,25 @@ function ff_hevc_put_hevc_qpel_uni_w_hv32_8_neon_i8mm, export=1
mov x11, sp
mov w12, w22
mov x13, x20
+ mov x14, sp
3:
- ldp q16, q1, [sp]
- add sp, sp, x10
- ldp q17, q2, [sp]
- add sp, sp, x10
- ldp q18, q3, [sp]
- add sp, sp, x10
- ldp q19, q4, [sp]
- add sp, sp, x10
- ldp q20, q5, [sp]
- add sp, sp, x10
- ldp q21, q6, [sp]
- add sp, sp, x10
- ldp q22, q7, [sp]
- add sp, sp, x10
+ ldp q16, q1, [x11]
+ add x11, x11, x10
+ ldp q17, q2, [x11]
+ add x11, x11, x10
+ ldp q18, q3, [x11]
+ add x11, x11, x10
+ ldp q19, q4, [x11]
+ add x11, x11, x10
+ ldp q20, q5, [x11]
+ add x11, x11, x10
+ ldp q21, q6, [x11]
+ add x11, x11, x10
+ ldp q22, q7, [x11]
+ add x11, x11, x10
1:
- ldp q23, q31, [sp]
- add sp, sp, x10
+ ldp q23, q31, [x11]
+ add x11, x11, x10
QPEL_FILTER_H v24, v16, v17, v18, v19, v20, v21, v22, v23
QPEL_FILTER_H2 v25, v16, v17, v18, v19, v20, v21, v22, v23
QPEL_FILTER_H v26, v1, v2, v3, v4, v5, v6, v7, v31
@@ -4007,8 +4008,8 @@ function ff_hevc_put_hevc_qpel_uni_w_hv32_8_neon_i8mm, export=1
subs w22, w22, #1
b.eq 2f
- ldp q16, q1, [sp]
- add sp, sp, x10
+ ldp q16, q1, [x11]
+ add x11, x11, x10
QPEL_FILTER_H v24, v17, v18, v19, v20, v21, v22, v23, v16
QPEL_FILTER_H2 v25, v17, v18, v19, v20, v21, v22, v23, v16
QPEL_FILTER_H v26, v2, v3, v4, v5, v6, v7, v31, v1
@@ -4017,8 +4018,8 @@ function ff_hevc_put_hevc_qpel_uni_w_hv32_8_neon_i8mm, export=1
subs w22, w22, #1
b.eq 2f
- ldp q17, q2, [sp]
- add sp, sp, x10
+ ldp q17, q2, [x11]
+ add x11, x11, x10
QPEL_FILTER_H v24, v18, v19, v20, v21, v22, v23, v16, v17
QPEL_FILTER_H2 v25, v18, v19, v20, v21, v22, v23, v16, v17
QPEL_FILTER_H v26, v3, v4, v5, v6, v7, v31, v1, v2
@@ -4027,8 +4028,8 @@ function ff_hevc_put_hevc_qpel_uni_w_hv32_8_neon_i8mm, export=1
subs w22, w22, #1
b.eq 2f
- ldp q18, q3, [sp]
- add sp, sp, x10
+ ldp q18, q3, [x11]
+ add x11, x11, x10
QPEL_FILTER_H v24, v19, v20, v21, v22, v23, v16, v17, v18
QPEL_FILTER_H2 v25, v19, v20, v21, v22, v23, v16, v17, v18
QPEL_FILTER_H v26, v4, v5, v6, v7, v31, v1, v2, v3
@@ -4037,8 +4038,8 @@ function ff_hevc_put_hevc_qpel_uni_w_hv32_8_neon_i8mm, export=1
subs w22, w22, #1
b.eq 2f
- ldp q19, q4, [sp]
- add sp, sp, x10
+ ldp q19, q4, [x11]
+ add x11, x11, x10
QPEL_FILTER_H v24, v20, v21, v22, v23, v16, v17, v18, v19
QPEL_FILTER_H2 v25, v20, v21, v22, v23, v16, v17, v18, v19
QPEL_FILTER_H v26, v5, v6, v7, v31, v1, v2, v3, v4
@@ -4047,8 +4048,8 @@ function ff_hevc_put_hevc_qpel_uni_w_hv32_8_neon_i8mm, export=1
subs w22, w22, #1
b.eq 2f
- ldp q20, q5, [sp]
- add sp, sp, x10
+ ldp q20, q5, [x11]
+ add x11, x11, x10
QPEL_FILTER_H v24, v21, v22, v23, v16, v17, v18, v19, v20
QPEL_FILTER_H2 v25, v21, v22, v23, v16, v17, v18, v19, v20
QPEL_FILTER_H v26, v6, v7, v31, v1, v2, v3, v4, v5
@@ -4057,8 +4058,8 @@ function ff_hevc_put_hevc_qpel_uni_w_hv32_8_neon_i8mm, export=1
subs w22, w22, #1
b.eq 2f
- ldp q21, q6, [sp]
- add sp, sp, x10
+ ldp q21, q6, [x11]
+ add x11, x11, x10
QPEL_FILTER_H v24, v22, v23, v16, v17, v18, v19, v20, v21
QPEL_FILTER_H2 v25, v22, v23, v16, v17, v18, v19, v20, v21
QPEL_FILTER_H v26, v7, v31, v1, v2, v3, v4, v5, v6
@@ -4067,8 +4068,8 @@ function ff_hevc_put_hevc_qpel_uni_w_hv32_8_neon_i8mm, export=1
subs w22, w22, #1
b.eq 2f
- ldp q22, q7, [sp]
- add sp, sp, x10
+ ldp q22, q7, [x11]
+ add x11, x11, x10
QPEL_FILTER_H v24, v23, v16, v17, v18, v19, v20, v21, v22
QPEL_FILTER_H2 v25, v23, v16, v17, v18, v19, v20, v21, v22
QPEL_FILTER_H v26, v31, v1, v2, v3, v4, v5, v6, v7
@@ -4078,10 +4079,10 @@ function ff_hevc_put_hevc_qpel_uni_w_hv32_8_neon_i8mm, export=1
b.hi 1b
2:
subs w27, w27, #16
- add sp, x11, #32
+ add x11, x14, #32
add x20, x13, #16
mov w22, w12
- mov x11, sp
+ mov x14, x11
mov x13, x20
b.hi 3b
QPEL_UNI_W_HV_END
@@ -4093,24 +4094,25 @@ function ff_hevc_put_hevc_qpel_uni_w_hv64_8_neon_i8mm, export=1
mov x11, sp
mov w12, w22
mov x13, x20
+ mov x14, sp
3:
- ldp q16, q1, [sp]
- add sp, sp, x10
- ldp q17, q2, [sp]
- add sp, sp, x10
- ldp q18, q3, [sp]
- add sp, sp, x10
- ldp q19, q4, [sp]
- add sp, sp, x10
- ldp q20, q5, [sp]
- add sp, sp, x10
- ldp q21, q6, [sp]
- add sp, sp, x10
- ldp q22, q7, [sp]
- add sp, sp, x10
+ ldp q16, q1, [x11]
+ add x11, x11, x10
+ ldp q17, q2, [x11]
+ add x11, x11, x10
+ ldp q18, q3, [x11]
+ add x11, x11, x10
+ ldp q19, q4, [x11]
+ add x11, x11, x10
+ ldp q20, q5, [x11]
+ add x11, x11, x10
+ ldp q21, q6, [x11]
+ add x11, x11, x10
+ ldp q22, q7, [x11]
+ add x11, x11, x10
1:
- ldp q23, q31, [sp]
- add sp, sp, x10
+ ldp q23, q31, [x11]
+ add x11, x11, x10
QPEL_FILTER_H v24, v16, v17, v18, v19, v20, v21, v22, v23
QPEL_FILTER_H2 v25, v16, v17, v18, v19, v20, v21, v22, v23
QPEL_FILTER_H v26, v1, v2, v3, v4, v5, v6, v7, v31
@@ -4119,8 +4121,8 @@ function ff_hevc_put_hevc_qpel_uni_w_hv64_8_neon_i8mm, export=1
subs w22, w22, #1
b.eq 2f
- ldp q16, q1, [sp]
- add sp, sp, x10
+ ldp q16, q1, [x11]
+ add x11, x11, x10
QPEL_FILTER_H v24, v17, v18, v19, v20, v21, v22, v23, v16
QPEL_FILTER_H2 v25, v17, v18, v19, v20, v21, v22, v23, v16
QPEL_FILTER_H v26, v2, v3, v4, v5, v6, v7, v31, v1
@@ -4129,8 +4131,8 @@ function ff_hevc_put_hevc_qpel_uni_w_hv64_8_neon_i8mm, export=1
subs w22, w22, #1
b.eq 2f
- ldp q17, q2, [sp]
- add sp, sp, x10
+ ldp q17, q2, [x11]
+ add x11, x11, x10
QPEL_FILTER_H v24, v18, v19, v20, v21, v22, v23, v16, v17
QPEL_FILTER_H2 v25, v18, v19, v20, v21, v22, v23, v16, v17
QPEL_FILTER_H v26, v3, v4, v5, v6, v7, v31, v1, v2
@@ -4139,8 +4141,8 @@ function ff_hevc_put_hevc_qpel_uni_w_hv64_8_neon_i8mm, export=1
subs w22, w22, #1
b.eq 2f
- ldp q18, q3, [sp]
- add sp, sp, x10
+ ldp q18, q3, [x11]
+ add x11, x11, x10
QPEL_FILTER_H v24, v19, v20, v21, v22, v23, v16, v17, v18
QPEL_FILTER_H2 v25, v19, v20, v21, v22, v23, v16, v17, v18
QPEL_FILTER_H v26, v4, v5, v6, v7, v31, v1, v2, v3
@@ -4149,8 +4151,8 @@ function ff_hevc_put_hevc_qpel_uni_w_hv64_8_neon_i8mm, export=1
subs w22, w22, #1
b.eq 2f
- ldp q19, q4, [sp]
- add sp, sp, x10
+ ldp q19, q4, [x11]
+ add x11, x11, x10
QPEL_FILTER_H v24, v20, v21, v22, v23, v16, v17, v18, v19
QPEL_FILTER_H2 v25, v20, v21, v22, v23, v16, v17, v18, v19
QPEL_FILTER_H v26, v5, v6, v7, v31, v1, v2, v3, v4
@@ -4159,8 +4161,8 @@ function ff_hevc_put_hevc_qpel_uni_w_hv64_8_neon_i8mm, export=1
subs w22, w22, #1
b.eq 2f
- ldp q20, q5, [sp]
- add sp, sp, x10
+ ldp q20, q5, [x11]
+ add x11, x11, x10
QPEL_FILTER_H v24, v21, v22, v23, v16, v17, v18, v19, v20
QPEL_FILTER_H2 v25, v21, v22, v23, v16, v17, v18, v19, v20
QPEL_FILTER_H v26, v6, v7, v31, v1, v2, v3, v4, v5
@@ -4169,8 +4171,8 @@ function ff_hevc_put_hevc_qpel_uni_w_hv64_8_neon_i8mm, export=1
subs w22, w22, #1
b.eq 2f
- ldp q21, q6, [sp]
- add sp, sp, x10
+ ldp q21, q6, [x11]
+ add x11, x11, x10
QPEL_FILTER_H v24, v22, v23, v16, v17, v18, v19, v20, v21
QPEL_FILTER_H2 v25, v22, v23, v16, v17, v18, v19, v20, v21
QPEL_FILTER_H v26, v7, v31, v1, v2, v3, v4, v5, v6
@@ -4179,8 +4181,8 @@ function ff_hevc_put_hevc_qpel_uni_w_hv64_8_neon_i8mm, export=1
subs w22, w22, #1
b.eq 2f
- ldp q22, q7, [sp]
- add sp, sp, x10
+ ldp q22, q7, [x11]
+ add x11, x11, x10
QPEL_FILTER_H v24, v23, v16, v17, v18, v19, v20, v21, v22
QPEL_FILTER_H2 v25, v23, v16, v17, v18, v19, v20, v21, v22
QPEL_FILTER_H v26, v31, v1, v2, v3, v4, v5, v6, v7
@@ -4190,10 +4192,10 @@ function ff_hevc_put_hevc_qpel_uni_w_hv64_8_neon_i8mm, export=1
b.hi 1b
2:
subs w27, w27, #16
- add sp, x11, #32
+ add x11, x14, #32
add x20, x13, #16
mov w22, w12
- mov x11, sp
+ mov x14, x11
mov x13, x20
b.hi 3b
QPEL_UNI_W_HV_END
--
2.39.3 (Apple Git-146)
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
next prev parent reply other threads:[~2024-03-25 15:03 UTC|newest]
Thread overview: 26+ messages / expand[flat|nested] mbox.gz Atom feed top
2024-03-25 15:02 [FFmpeg-devel] [PATCH 00/21] aarch64: hevc: Add missing hevc_pel NEON functions Martin Storsjö
2024-03-25 15:02 ` [FFmpeg-devel] [PATCH 01/21] aarch64: hevc: Reorder a misplaced function init line Martin Storsjö
2024-03-25 15:02 ` Martin Storsjö [this message]
2024-03-25 15:02 ` [FFmpeg-devel] [PATCH 03/21] aarch64: hevc: Merge consecutive stores in put_hevc_\type\()_h16_8_neon Martin Storsjö
2024-03-25 15:02 ` [FFmpeg-devel] [PATCH 04/21] aarch64: hevc: Specialize put_hevc_\type\()_h*_8_neon for horizontal looping Martin Storsjö
2024-03-25 15:02 ` [FFmpeg-devel] [PATCH 05/21] aarch64: hevc: Use ld1r instead of ldr+dup in hevc_qpel_uni_w_h Martin Storsjö
2024-03-25 15:02 ` [FFmpeg-devel] [PATCH 06/21] aarch64: hevc: Implement a neon version of put_hevc_epel_h*_8 Martin Storsjö
2024-03-25 15:02 ` [FFmpeg-devel] [PATCH 07/21] aarch64: hevc: Implement a neon version of hevc_epel_uni_w_h*_8 Martin Storsjö
2024-03-25 15:02 ` [FFmpeg-devel] [PATCH 08/21] aarch64: hevc: Split the epel_*_hv functions into two parts Martin Storsjö
2024-03-25 15:02 ` [FFmpeg-devel] [PATCH 09/21] aarch64: hevc: Reorder epel_hv functions to prepare for templating Martin Storsjö
2024-03-25 15:02 ` [FFmpeg-devel] [PATCH 10/21] aarch64: hevc: Produce epel_hv functions for both plain neon and i8mm Martin Storsjö
2024-03-25 15:02 ` [FFmpeg-devel] [PATCH 11/21] aarch64: hevc: Produce epel_uni_hv functions for both " Martin Storsjö
2024-03-25 15:02 ` [FFmpeg-devel] [PATCH 12/21] aarch64: hevc: Produce epel_uni_w_hv " Martin Storsjö
2024-03-25 15:02 ` [FFmpeg-devel] [PATCH 13/21] aarch64: hevc: Produce epel_bi_hv " Martin Storsjö
2024-03-25 15:02 ` [FFmpeg-devel] [PATCH 14/21] aarch64: hevc: Implement a neon version of hevc_qpel_uni_w_h*_8 Martin Storsjö
2024-03-25 15:02 ` [FFmpeg-devel] [PATCH 15/21] aarch64: hevc: Split the qpel_*_hv functions into two parts Martin Storsjö
2024-03-25 15:02 ` [FFmpeg-devel] [PATCH 16/21] aarch64: hevc: Deduplicate the hevc_put_hevc_qpel_uni_w_hv*_8_end_neon functions Martin Storsjö
2024-03-25 15:02 ` [FFmpeg-devel] [PATCH 17/21] aarch64: hevc: Reorder qpel_hv functions to prepare for templating Martin Storsjö
2024-03-25 15:02 ` [FFmpeg-devel] [PATCH 18/21] aarch64: hevc: Produce plain neon versions of qpel_hv Martin Storsjö
2024-03-25 15:02 ` [FFmpeg-devel] [PATCH 19/21] aarch64: hevc: Produce plain neon versions of qpel_uni_hv Martin Storsjö
2024-03-25 15:02 ` [FFmpeg-devel] [PATCH 20/21] aarch64: hevc: Produce plain neon versions of qpel_uni_w_hv Martin Storsjö
2024-03-25 15:02 ` [FFmpeg-devel] [PATCH 21/21] aarch64: hevc: Produce plain neon versions of qpel_bi_hv Martin Storsjö
2024-03-25 21:15 ` [FFmpeg-devel] [PATCH 00/21] aarch64: hevc: Add missing hevc_pel NEON functions Martin Storsjö
2024-03-25 21:56 ` J. Dekker
2024-03-26 6:01 ` Jean-Baptiste Kempf
2024-03-26 7:09 ` Martin Storsjö
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20240325150243.59058-3-martin@martin.st \
--to=martin@martin.st \
--cc=Logan.Lyu@myais.com.cn \
--cc=ffmpeg-devel@ffmpeg.org \
--cc=jdek@itanimul.li \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
This inbox may be cloned and mirrored by anyone:
git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git
# If you have public-inbox 1.1+ installed, you may
# initialize and index your mirror using the following commands:
public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
ffmpegdev@gitmailbox.com
public-inbox-index ffmpegdev
Example config snippet for mirrors.
AGPL code for this site: git clone https://public-inbox.org/public-inbox.git