* [FFmpeg-devel] [PATCH] aarch64: Implement stack spilling in a consistent way.
@ 2022-10-09 19:47 Reimar.Doeffinger
2022-10-10 21:29 ` Martin Storsjö
0 siblings, 1 reply; 3+ messages in thread
From: Reimar.Doeffinger @ 2022-10-09 19:47 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: Reimar Döffinger
From: Reimar Döffinger <Reimar.Doeffinger@gmx.de>
Currently it is done in several different ways, which
might cause needless dependencies or, in the case of
tx_float_neon.S, is incorrect.
Signed-off-by: Reimar Döffinger <Reimar.Doeffinger@gmx.de>
---
libavcodec/aarch64/fft_neon.S | 3 +-
libavcodec/aarch64/h264idct_neon.S | 6 +-
libavcodec/aarch64/hevcdsp_sao_neon.S | 3 +-
libavcodec/aarch64/mdct_neon.S | 18 ++----
libavcodec/aarch64/me_cmp_neon.S | 6 +-
libavcodec/aarch64/synth_filter_neon.S | 3 +-
libavcodec/aarch64/vp9itxfm_neon.S | 28 ++++-----
libavcodec/aarch64/vp9lpf_16bpp_neon.S | 32 +++++------
libavcodec/aarch64/vp9lpf_neon.S | 80 +++++++++++++-------------
libavutil/aarch64/tx_float_neon.S | 52 ++++++++---------
10 files changed, 109 insertions(+), 122 deletions(-)
diff --git a/libavcodec/aarch64/fft_neon.S b/libavcodec/aarch64/fft_neon.S
index 9ff3f9c526..d7225511dd 100644
--- a/libavcodec/aarch64/fft_neon.S
+++ b/libavcodec/aarch64/fft_neon.S
@@ -342,8 +342,7 @@ endfunc
function fft\n\()_neon, align=6
AARCH64_VALID_JUMP_TARGET
AARCH64_SIGN_LINK_REGISTER
- sub sp, sp, #16
- stp x28, x30, [sp]
+ stp x28, x30, [sp, #-16]!
add x28, x0, #\n4*2*8
bl fft\n2\()_neon
mov x0, x28
diff --git a/libavcodec/aarch64/h264idct_neon.S b/libavcodec/aarch64/h264idct_neon.S
index 7d2879b0ce..375da31d65 100644
--- a/libavcodec/aarch64/h264idct_neon.S
+++ b/libavcodec/aarch64/h264idct_neon.S
@@ -157,8 +157,7 @@ function ff_h264_idct_add16intra_neon, export=1
endfunc
function ff_h264_idct_add8_neon, export=1
- sub sp, sp, #0x40
- stp x19, x20, [sp]
+ stp x19, x20, [sp, #-0x40]!
mov x12, x30
ldp x6, x15, [x0] // dest[0], dest[1]
add x5, x1, #16*4 // block_offset
@@ -187,8 +186,7 @@ function ff_h264_idct_add8_neon, export=1
csel x6, x15, x6, eq
cmp x10, #20
b.lt 1b
- ldp x19, x20, [sp]
- add sp, sp, #0x40
+ ldp x19, x20, [sp], #0x40
ret x12
endfunc
diff --git a/libavcodec/aarch64/hevcdsp_sao_neon.S b/libavcodec/aarch64/hevcdsp_sao_neon.S
index d4decfde3b..30e83dda5d 100644
--- a/libavcodec/aarch64/hevcdsp_sao_neon.S
+++ b/libavcodec/aarch64/hevcdsp_sao_neon.S
@@ -33,8 +33,7 @@
// int16_t *sao_offset_val, int sao_left_class,
// int width, int height)
function ff_hevc_sao_band_filter_8x8_8_neon, export=1
- sub sp, sp, #64
- stp xzr, xzr, [sp]
+ stp xzr, xzr, [sp, #-64]!
stp xzr, xzr, [sp, #16]
stp xzr, xzr, [sp, #32]
stp xzr, xzr, [sp, #48]
diff --git a/libavcodec/aarch64/mdct_neon.S b/libavcodec/aarch64/mdct_neon.S
index 6091e72022..98b09bf1ab 100644
--- a/libavcodec/aarch64/mdct_neon.S
+++ b/libavcodec/aarch64/mdct_neon.S
@@ -23,8 +23,7 @@
#include "libavutil/aarch64/asm.S"
function ff_imdct_half_neon, export=1
- sub sp, sp, #32
- stp x19, x20, [sp]
+ stp x19, x20, [sp, #-32]!
AARCH64_SIGN_LINK_REGISTER
str x30, [sp, #16]
mov x12, #1
@@ -120,17 +119,15 @@ function ff_imdct_half_neon, export=1
st2 {v4.2s,v5.2s}, [x0]
st2 {v6.2s,v7.2s}, [x8]
- ldp x19, x20, [sp]
ldr x30, [sp, #16]
AARCH64_VALIDATE_LINK_REGISTER
- add sp, sp, #32
+ ldp x19, x20, [sp], #32
ret
endfunc
function ff_imdct_calc_neon, export=1
- sub sp, sp, #32
- stp x19, x20, [sp]
+ stp x19, x20, [sp, #-32]!
AARCH64_SIGN_LINK_REGISTER
str x30, [sp, #16]
ldr w3, [x0, #28] // mdct_bits
@@ -163,18 +160,16 @@ function ff_imdct_calc_neon, export=1
subs x19, x19, #16
b.gt 1b
- ldp x19, x20, [sp]
ldr x30, [sp, #16]
AARCH64_VALIDATE_LINK_REGISTER
- add sp, sp, #32
+ ldp x19, x20, [sp], #32
ret
endfunc
function ff_mdct_calc_neon, export=1
- sub sp, sp, #32
- stp x19, x20, [sp]
+ stp x19, x20, [sp, #-32]!
AARCH64_SIGN_LINK_REGISTER
str x30, [sp, #16]
@@ -323,10 +318,9 @@ function ff_mdct_calc_neon, export=1
st2 {v4.2s,v5.2s}, [x0]
st2 {v6.2s,v7.2s}, [x8]
- ldp x19, x20, [sp]
ldr x30, [sp, #16]
AARCH64_VALIDATE_LINK_REGISTER
- add sp, sp, #32
+ ldp x19, x20, [sp], #32
ret
endfunc
diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S
index d8a18cd4b8..cf86e5081d 100644
--- a/libavcodec/aarch64/me_cmp_neon.S
+++ b/libavcodec/aarch64/me_cmp_neon.S
@@ -1173,10 +1173,9 @@ function nsse16_neon, export=1
bl X(sse16_neon)
ldr x30, [sp, #0x30]
mov w9, w0 // here we store score1
- ldr x5, [sp]
ldp x1, x2, [sp, #0x10]
ldp x3, x4, [sp, #0x20]
- add sp, sp, #0x40
+ ldr x5, [sp], #0x40
movi v16.8h, #0
movi v17.8h, #0
@@ -1295,10 +1294,9 @@ function nsse8_neon, export=1
bl X(sse8_neon)
ldr x30, [sp, #0x30]
mov w9, w0 // here we store score1
- ldr x5, [sp]
ldp x1, x2, [sp, #0x10]
ldp x3, x4, [sp, #0x20]
- add sp, sp, #0x40
+ ldr x5, [sp], #0x40
movi v16.8h, #0
movi v17.8h, #0
diff --git a/libavcodec/aarch64/synth_filter_neon.S b/libavcodec/aarch64/synth_filter_neon.S
index ba79ba9686..4f8494ce30 100644
--- a/libavcodec/aarch64/synth_filter_neon.S
+++ b/libavcodec/aarch64/synth_filter_neon.S
@@ -61,11 +61,11 @@ function ff_synth_filter_float_neon, export=1
blr x9
- ldp x2, x4, [sp] // synct_buf_2, window
ldp x13, x9, [sp, #16] // out, synth_buf
ldp x0, x30, [sp, #32] // *synth_buf_offset
AARCH64_VALIDATE_LINK_REGISTER
ldr s0, [sp, #48]
+ ldp x2, x4, [sp], #64 // synct_buf_2, window
add x3, x2, #16*4 // synct_buf_2 + 16
add x14, x13, #16*4 // out + 16
@@ -116,6 +116,5 @@ function ff_synth_filter_float_neon, export=1
b 1b
10:
- add sp, sp, #64
ret
endfunc
diff --git a/libavcodec/aarch64/vp9itxfm_neon.S b/libavcodec/aarch64/vp9itxfm_neon.S
index 03272eae82..a27f7b8ae5 100644
--- a/libavcodec/aarch64/vp9itxfm_neon.S
+++ b/libavcodec/aarch64/vp9itxfm_neon.S
@@ -850,10 +850,10 @@ function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_neon, export=1
mov x15, x30
// iadst16 requires clobbering v8-v15, but idct16 doesn't need to.
.ifnc \txfm1\()_\txfm2,idct_idct
- stp d14, d15, [sp, #-0x10]!
- stp d12, d13, [sp, #-0x10]!
- stp d10, d11, [sp, #-0x10]!
- stp d8, d9, [sp, #-0x10]!
+ stp d8, d9, [sp, #-0x40]!
+ stp d14, d15, [sp, #0x30]
+ stp d12, d13, [sp, #0x20]
+ stp d10, d11, [sp, #0x10]
.endif
sub sp, sp, #512
@@ -920,10 +920,10 @@ function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_neon, export=1
add sp, sp, #512
.ifnc \txfm1\()_\txfm2,idct_idct
- ldp d8, d9, [sp], 0x10
- ldp d10, d11, [sp], 0x10
- ldp d12, d13, [sp], 0x10
- ldp d14, d15, [sp], 0x10
+ ldp d10, d11, [sp, #0x10]
+ ldp d12, d13, [sp, #0x20]
+ ldp d14, d15, [sp, #0x30]
+ ldp d8, d9, [sp], #0x40
.endif
ret x15
endfunc
@@ -1486,8 +1486,8 @@ function ff_vp9_idct_idct_32x32_add_neon, export=1
mov x15, x30
- stp d10, d11, [sp, #-0x10]!
- stp d8, d9, [sp, #-0x10]!
+ stp d8, d9, [sp, #-0x20]!
+ stp d10, d11, [sp, #0x10]
sub sp, sp, #2048
@@ -1544,8 +1544,8 @@ function ff_vp9_idct_idct_32x32_add_neon, export=1
add sp, sp, #2048
- ldp d8, d9, [sp], 0x10
- ldp d10, d11, [sp], 0x10
+ ldp d10, d11, [sp, #0x10]
+ ldp d8, d9, [sp], #0x20
ret x15
endfunc
@@ -1569,8 +1569,8 @@ function idct32x32_\size\()_add_neon
add sp, sp, #2048
- ldp d8, d9, [sp], 0x10
- ldp d10, d11, [sp], 0x10
+ ldp d10, d11, [sp, #0x10]
+ ldp d8, d9, [sp], #0x20
ret x15
endfunc
diff --git a/libavcodec/aarch64/vp9lpf_16bpp_neon.S b/libavcodec/aarch64/vp9lpf_16bpp_neon.S
index a092617b92..e3e70491c6 100644
--- a/libavcodec/aarch64/vp9lpf_16bpp_neon.S
+++ b/libavcodec/aarch64/vp9lpf_16bpp_neon.S
@@ -417,10 +417,10 @@ endfunc
function ff_\func\()_\bpp\()_neon, export=1
.if \push
mov x16, x30
- stp d14, d15, [sp, #-0x10]!
- stp d12, d13, [sp, #-0x10]!
- stp d10, d11, [sp, #-0x10]!
- stp d8, d9, [sp, #-0x10]!
+ stp d8, d9, [sp, #-0x40]!
+ stp d14, d15, [sp, #0x30]
+ stp d12, d13, [sp, #0x20]
+ stp d10, d11, [sp, #0x10]
.endif
lsl w2, w2, #\bpp - 8
lsl w3, w3, #\bpp - 8
@@ -430,10 +430,10 @@ function ff_\func\()_\bpp\()_neon, export=1
mov x7, #((1 << \bpp) - 1)
.if \push
bl \func\()_16_neon
- ldp d8, d9, [sp], 0x10
- ldp d10, d11, [sp], 0x10
- ldp d12, d13, [sp], 0x10
- ldp d14, d15, [sp], 0x10
+ ldp d10, d11, [sp, #0x10]
+ ldp d12, d13, [sp, #0x20]
+ ldp d14, d15, [sp, #0x30]
+ ldp d8, d9, [sp], #0x40
ret x16
.else
b \func\()_16_neon
@@ -450,10 +450,10 @@ endfunc
function ff_\func\()_\suffix\()_\bpp\()_neon, export=1
mov x16, x30
.if \push
- stp d14, d15, [sp, #-0x10]!
- stp d12, d13, [sp, #-0x10]!
- stp d10, d11, [sp, #-0x10]!
- stp d8, d9, [sp, #-0x10]!
+ stp d8, d9, [sp, #-0x40]!
+ stp d14, d15, [sp, #0x30]
+ stp d12, d13, [sp, #0x20]
+ stp d10, d11, [sp, #0x10]
.endif
lsl w2, w2, #\bpp - 8
lsl w3, w3, #\bpp - 8
@@ -469,10 +469,10 @@ function ff_\func\()_\suffix\()_\bpp\()_neon, export=1
.endif
bl \func\()_\int_suffix\()_16_neon
.if \push
- ldp d8, d9, [sp], 0x10
- ldp d10, d11, [sp], 0x10
- ldp d12, d13, [sp], 0x10
- ldp d14, d15, [sp], 0x10
+ ldp d10, d11, [sp, #0x10]
+ ldp d12, d13, [sp, #0x20]
+ ldp d14, d15, [sp, #0x30]
+ ldp d8, d9, [sp], 0x40
.endif
ret x16
endfunc
diff --git a/libavcodec/aarch64/vp9lpf_neon.S b/libavcodec/aarch64/vp9lpf_neon.S
index 694ff8956f..9a79f48df3 100644
--- a/libavcodec/aarch64/vp9lpf_neon.S
+++ b/libavcodec/aarch64/vp9lpf_neon.S
@@ -564,10 +564,10 @@ function vp9_loop_filter_16
loop_filter 16, .8b, 0, v8, v9, v10, v11, v12, v13, v14, v15
ret
9:
- ldp d8, d9, [sp], 0x10
- ldp d10, d11, [sp], 0x10
- ldp d12, d13, [sp], 0x10
- ldp d14, d15, [sp], 0x10
+ ldp d10, d11, [sp, #0x10]
+ ldp d12, d13, [sp, #0x20]
+ ldp d14, d15, [sp, #0x30]
+ ldp d8, d9, [sp], #0x40
ret x10
endfunc
@@ -575,10 +575,10 @@ function vp9_loop_filter_16_16b
loop_filter 16, .16b, 0, v8, v9, v10, v11, v12, v13, v14, v15
ret
9:
- ldp d8, d9, [sp], 0x10
- ldp d10, d11, [sp], 0x10
- ldp d12, d13, [sp], 0x10
- ldp d14, d15, [sp], 0x10
+ ldp d10, d11, [sp, #0x10]
+ ldp d12, d13, [sp, #0x20]
+ ldp d14, d15, [sp, #0x30]
+ ldp d8, d9, [sp], #0x40
ret x10
endfunc
@@ -973,10 +973,10 @@ mix_h_16 88
function ff_vp9_loop_filter_v_16_8_neon, export=1
mov x10, x30
- stp d14, d15, [sp, #-0x10]!
- stp d12, d13, [sp, #-0x10]!
- stp d10, d11, [sp, #-0x10]!
- stp d8, d9, [sp, #-0x10]!
+ stp d8, d9, [sp, #-0x40]!
+ stp d14, d15, [sp, #0x30]
+ stp d12, d13, [sp, #0x20]
+ stp d10, d11, [sp, #0x10]
sub x9, x0, x1, lsl #3
ld1 {v16.8b}, [x9], x1 // p7
ld1 {v24.8b}, [x0], x1 // q0
@@ -1018,10 +1018,10 @@ function ff_vp9_loop_filter_v_16_8_neon, export=1
st1 {v9.8b}, [x9], x1
st1 {v17.8b}, [x0], x1
9:
- ldp d8, d9, [sp], 0x10
- ldp d10, d11, [sp], 0x10
- ldp d12, d13, [sp], 0x10
- ldp d14, d15, [sp], 0x10
+ ldp d10, d11, [sp, #0x10]
+ ldp d12, d13, [sp, #0x20]
+ ldp d14, d15, [sp, #0x30]
+ ldp d8, d9, [sp], #0x40
ret x10
8:
add x9, x9, x1, lsl #2
@@ -1045,10 +1045,10 @@ endfunc
function ff_vp9_loop_filter_v_16_16_neon, export=1
mov x10, x30
- stp d14, d15, [sp, #-0x10]!
- stp d12, d13, [sp, #-0x10]!
- stp d10, d11, [sp, #-0x10]!
- stp d8, d9, [sp, #-0x10]!
+ stp d8, d9, [sp, #-0x40]!
+ stp d14, d15, [sp, #0x30]
+ stp d12, d13, [sp, #0x20]
+ stp d10, d11, [sp, #0x10]
sub x9, x0, x1, lsl #3
ld1 {v16.16b}, [x9], x1 // p7
ld1 {v24.16b}, [x0], x1 // q0
@@ -1087,10 +1087,10 @@ function ff_vp9_loop_filter_v_16_16_neon, export=1
st1 {v9.16b}, [x9], x1
st1 {v17.16b}, [x0], x1
9:
- ldp d8, d9, [sp], 0x10
- ldp d10, d11, [sp], 0x10
- ldp d12, d13, [sp], 0x10
- ldp d14, d15, [sp], 0x10
+ ldp d10, d11, [sp, #0x10]
+ ldp d12, d13, [sp, #0x20]
+ ldp d14, d15, [sp, #0x30]
+ ldp d8, d9, [sp], #0x40
ret x10
8:
add x9, x9, x1, lsl #2
@@ -1112,10 +1112,10 @@ endfunc
function ff_vp9_loop_filter_h_16_8_neon, export=1
mov x10, x30
- stp d14, d15, [sp, #-0x10]!
- stp d12, d13, [sp, #-0x10]!
- stp d10, d11, [sp, #-0x10]!
- stp d8, d9, [sp, #-0x10]!
+ stp d8, d9, [sp, #-0x40]!
+ stp d14, d15, [sp, #0x30]
+ stp d12, d13, [sp, #0x20]
+ stp d10, d11, [sp, #0x10]
sub x9, x0, #8
ld1 {v16.8b}, [x9], x1
ld1 {v24.8b}, [x0], x1
@@ -1164,10 +1164,10 @@ function ff_vp9_loop_filter_h_16_8_neon, export=1
st1 {v9.8b}, [x9], x1
st1 {v31.8b}, [x0], x1
9:
- ldp d8, d9, [sp], 0x10
- ldp d10, d11, [sp], 0x10
- ldp d12, d13, [sp], 0x10
- ldp d14, d15, [sp], 0x10
+ ldp d10, d11, [sp, #0x10]
+ ldp d12, d13, [sp, #0x20]
+ ldp d14, d15, [sp, #0x30]
+ ldp d8, d9, [sp], #0x40
ret x10
8:
// The same writeback as in loop_filter_h_8_8
@@ -1202,10 +1202,10 @@ endfunc
function ff_vp9_loop_filter_h_16_16_neon, export=1
mov x10, x30
- stp d14, d15, [sp, #-0x10]!
- stp d12, d13, [sp, #-0x10]!
- stp d10, d11, [sp, #-0x10]!
- stp d8, d9, [sp, #-0x10]!
+ stp d8, d9, [sp, #-0x40]!
+ stp d14, d15, [sp, #0x30]
+ stp d12, d13, [sp, #0x20]
+ stp d10, d11, [sp, #0x10]
sub x9, x0, #8
ld1 {v16.8b}, [x9], x1
ld1 {v24.8b}, [x0], x1
@@ -1283,10 +1283,10 @@ function ff_vp9_loop_filter_h_16_16_neon, export=1
st1 {v9.d}[1], [x9], x1
st1 {v31.d}[1], [x0], x1
9:
- ldp d8, d9, [sp], 0x10
- ldp d10, d11, [sp], 0x10
- ldp d12, d13, [sp], 0x10
- ldp d14, d15, [sp], 0x10
+ ldp d10, d11, [sp, #0x10]
+ ldp d12, d13, [sp, #0x20]
+ ldp d14, d15, [sp, #0x30]
+ ldp d8, d9, [sp], #0x40
ret x10
8:
sub x9, x0, #4
diff --git a/libavutil/aarch64/tx_float_neon.S b/libavutil/aarch64/tx_float_neon.S
index 4126c3b812..e5531dcc7c 100644
--- a/libavutil/aarch64/tx_float_neon.S
+++ b/libavutil/aarch64/tx_float_neon.S
@@ -866,10 +866,10 @@ FFT16_FN ns_float, 1
.macro FFT32_FN name, no_perm
function ff_tx_fft32_\name\()_neon, export=1
- stp d8, d9, [sp, #-16]
- stp d10, d11, [sp, #-32]
- stp d12, d13, [sp, #-48]
- stp d14, d15, [sp, #-64]
+ stp d14, d15, [sp, #-16*4]!
+ stp d8, d9, [sp, #16*3]
+ stp d10, d11, [sp, #16*2]
+ stp d12, d13, [sp, #16]
LOAD_SUBADD
SETUP_SR_RECOMB 32, x7, x8, x9
@@ -911,10 +911,10 @@ function ff_tx_fft32_\name\()_neon, export=1
zip2 v31.2d, v11.2d, v15.2d
st1 { v28.4s, v29.4s, v30.4s, v31.4s }, [x1]
- ldp d14, d15, [sp, #-64]
- ldp d12, d13, [sp, #-48]
- ldp d10, d11, [sp, #-32]
- ldp d8, d9, [sp, #-16]
+ ldp d12, d13, [sp, #16]
+ ldp d10, d11, [sp, #16*2]
+ ldp d8, d9, [sp, #16*3]
+ ldp d14, d15, [sp], #16*4
ret
endfunc
@@ -966,12 +966,12 @@ FFT32_FN ns_float, 1
.macro FFT_SPLIT_RADIX_FN name, no_perm
function ff_tx_fft_sr_\name\()_neon, export=1
- stp d8, d9, [sp, #-16]!
- stp d10, d11, [sp, #-16]!
- stp d12, d13, [sp, #-16]!
- stp d14, d15, [sp, #-16]!
- stp x19, x20, [sp, #-16]!
- stp x21, x22, [sp, #-16]!
+ stp x21, x22, [sp, #-16*6]!
+ stp d8, d9, [sp, #16*5]
+ stp d10, d11, [sp, #16*4]
+ stp d12, d13, [sp, #16*3]
+ stp d14, d15, [sp, #16*2]
+ stp x19, x20, [sp, #16]
ldr w19, [x0, #0] // global target
mov w20, w19 // local length
@@ -1185,12 +1185,12 @@ SR_TRANSFORM_DEF 131072
subs w19, w19, #32*4
b.gt 0b
- ldp x21, x22, [sp], #16
- ldp x19, x20, [sp], #16
- ldp d14, d15, [sp], #16
- ldp d12, d13, [sp], #16
- ldp d10, d11, [sp], #16
- ldp d8, d9, [sp], #16
+ ldp x19, x20, [sp, #16]
+ ldp d14, d15, [sp, #16*2]
+ ldp d12, d13, [sp, #16*3]
+ ldp d10, d11, [sp, #16*4]
+ ldp d8, d9, [sp, #16*5]
+ ldp x21, x22, [sp], #16*6
ret
@@ -1279,12 +1279,12 @@ SR_TRANSFORM_DEF 131072
zip2 v7.2d, v15.2d, v23.2d
st1 { v4.4s, v5.4s, v6.4s, v7.4s }, [x15]
- ldp x21, x22, [sp], #16
- ldp x19, x20, [sp], #16
- ldp d14, d15, [sp], #16
- ldp d12, d13, [sp], #16
- ldp d10, d11, [sp], #16
- ldp d8, d9, [sp], #16
+ ldp x19, x20, [sp, #16]
+ ldp d14, d15, [sp, #16*2]
+ ldp d12, d13, [sp, #16*3]
+ ldp d10, d11, [sp, #16*4]
+ ldp d8, d9, [sp, #16*5]
+ ldp x21, x22, [sp], #16*6
ret
endfunc
--
2.37.0 (Apple Git-136)
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 3+ messages in thread
* Re: [FFmpeg-devel] [PATCH] aarch64: Implement stack spilling in a consistent way.
2022-10-09 19:47 [FFmpeg-devel] [PATCH] aarch64: Implement stack spilling in a consistent way Reimar.Doeffinger
@ 2022-10-10 21:29 ` Martin Storsjö
2022-10-11 7:14 ` Reimar Döffinger
0 siblings, 1 reply; 3+ messages in thread
From: Martin Storsjö @ 2022-10-10 21:29 UTC (permalink / raw)
To: FFmpeg development discussions and patches; +Cc: Reimar Döffinger
On Sun, 9 Oct 2022, Reimar.Doeffinger@gmx.de wrote:
> From: Reimar Döffinger <Reimar.Doeffinger@gmx.de>
>
> Currently it is done in several different ways, which
> might cause needless dependencies or, in the case of
> tx_float_neon.S, is incorrect.
>
> Signed-off-by: Reimar Döffinger <Reimar.Doeffinger@gmx.de>
> ---
> libavcodec/aarch64/fft_neon.S | 3 +-
> libavcodec/aarch64/h264idct_neon.S | 6 +-
> libavcodec/aarch64/hevcdsp_sao_neon.S | 3 +-
> libavcodec/aarch64/mdct_neon.S | 18 ++----
> libavcodec/aarch64/me_cmp_neon.S | 6 +-
> libavcodec/aarch64/synth_filter_neon.S | 3 +-
> libavcodec/aarch64/vp9itxfm_neon.S | 28 ++++-----
> libavcodec/aarch64/vp9lpf_16bpp_neon.S | 32 +++++------
> libavcodec/aarch64/vp9lpf_neon.S | 80 +++++++++++++-------------
> libavutil/aarch64/tx_float_neon.S | 52 ++++++++---------
> 10 files changed, 109 insertions(+), 122 deletions(-)
This looks reasonable to me, assuming that it passes fate. Do you want to
push it yourself, or do you want me to do it?
// Martin
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 3+ messages in thread
* Re: [FFmpeg-devel] [PATCH] aarch64: Implement stack spilling in a consistent way.
2022-10-10 21:29 ` Martin Storsjö
@ 2022-10-11 7:14 ` Reimar Döffinger
0 siblings, 0 replies; 3+ messages in thread
From: Reimar Döffinger @ 2022-10-11 7:14 UTC (permalink / raw)
To: FFmpeg development discussions and patches
Hi Martin,
> On 10 Oct 2022, at 23:29, Martin Storsjö <martin@martin.st> wrote:
>
> On Sun, 9 Oct 2022, Reimar.Doeffinger@gmx.de wrote:
>
>> From: Reimar Döffinger <Reimar.Doeffinger@gmx.de>
>>
>> Currently it is done in several different ways, which
>> might cause needless dependencies or, in the case of
>> tx_float_neon.S, is incorrect.
>
> This looks reasonable to me, assuming that it passes fate. Do you want to push it yourself, or do you want me to do it?
Thanks, I pushed it.
I had only run checkasm on it (I had made a couple of mistakes first), but it also passes fate.
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 3+ messages in thread
end of thread, other threads:[~2022-10-11 7:15 UTC | newest]
Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-10-09 19:47 [FFmpeg-devel] [PATCH] aarch64: Implement stack spilling in a consistent way Reimar.Doeffinger
2022-10-10 21:29 ` Martin Storsjö
2022-10-11 7:14 ` Reimar Döffinger
Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
This inbox may be cloned and mirrored by anyone:
git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git
# If you have public-inbox 1.1+ installed, you may
# initialize and index your mirror using the following commands:
public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
ffmpegdev@gitmailbox.com
public-inbox-index ffmpegdev
Example config snippet for mirrors.
AGPL code for this site: git clone https://public-inbox.org/public-inbox.git