* [FFmpeg-devel] [PATCH] lavc/hevcdsp_qpel_neon: using movi.16b instead of movi.2d
@ 2023-11-15 3:53 xufuji456 via ffmpeg-devel
2023-11-28 13:57 ` Martin Storsjö
0 siblings, 1 reply; 3+ messages in thread
From: xufuji456 via ffmpeg-devel @ 2023-11-15 3:53 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: xufuji456
When building for the iOS platform on arm64, the compiler emits a warning: "instruction movi.2d with immediate #0 may not function correctly on this CPU, converting to movi.16b"
Signed-off-by: xufuji456 <839789740@qq.com>
---
libavcodec/aarch64/hevcdsp_epel_neon.S | 136 ++++++++++++-------------
libavcodec/aarch64/hevcdsp_qpel_neon.S | 52 +++++-----
libavcodec/aarch64/mpegaudiodsp_neon.S | 8 +-
libswscale/aarch64/hscale.S | 72 ++++++-------
4 files changed, 134 insertions(+), 134 deletions(-)
diff --git a/libavcodec/aarch64/hevcdsp_epel_neon.S b/libavcodec/aarch64/hevcdsp_epel_neon.S
index a2a051210f..c077c204cc 100644
--- a/libavcodec/aarch64/hevcdsp_epel_neon.S
+++ b/libavcodec/aarch64/hevcdsp_epel_neon.S
@@ -694,7 +694,7 @@ function ff_hevc_put_hevc_epel_h4_8_neon_i8mm, export=1
trn1 v4.2s, v4.2s, v5.2s
trn1 v6.2s, v6.2s, v7.2s
trn1 v4.2d, v4.2d, v6.2d
- movi v16.2d, #0
+ movi v16.16b, #0
usdot v16.4s, v4.16b, v30.16b
xtn v16.4h, v16.4s
st1 {v16.4h}, [x0], x10
@@ -714,8 +714,8 @@ function ff_hevc_put_hevc_epel_h6_8_neon_i8mm, export=1
trn2 v17.2s, v4.2s, v5.2s
trn1 v6.2s, v6.2s, v7.2s
trn1 v16.2d, v16.2d, v6.2d
- movi v18.2d, #0
- movi v19.2d, #0
+ movi v18.16b, #0
+ movi v19.16b, #0
usdot v18.4s, v16.16b, v30.16b
usdot v19.2s, v17.8b, v30.8b
xtn v18.4h, v18.4s
@@ -736,8 +736,8 @@ function ff_hevc_put_hevc_epel_h8_8_neon_i8mm, export=1
ext v7.16b, v4.16b, v4.16b, #3
zip1 v20.4s, v4.4s, v6.4s
zip1 v21.4s, v5.4s, v7.4s
- movi v16.2d, #0
- movi v17.2d, #0
+ movi v16.16b, #0
+ movi v17.16b, #0
usdot v16.4s, v20.16b, v30.16b
usdot v17.4s, v21.16b, v30.16b
xtn v16.4h, v16.4s
@@ -761,9 +761,9 @@ function ff_hevc_put_hevc_epel_h12_8_neon_i8mm, export=1
trn1 v4.4s, v20.4s, v21.4s
trn2 v5.4s, v20.4s, v21.4s
trn1 v6.4s, v22.4s, v23.4s
- movi v16.2d, #0
- movi v17.2d, #0
- movi v18.2d, #0
+ movi v16.16b, #0
+ movi v17.16b, #0
+ movi v18.16b, #0
usdot v16.4s, v4.16b, v30.16b
usdot v17.4s, v5.16b, v30.16b
usdot v18.4s, v6.16b, v30.16b
@@ -788,10 +788,10 @@ function ff_hevc_put_hevc_epel_h16_8_neon_i8mm, export=1
zip2 v22.4s, v0.4s, v6.4s
zip1 v21.4s, v5.4s, v7.4s
zip2 v23.4s, v5.4s, v7.4s
- movi v16.2d, #0
- movi v17.2d, #0
- movi v18.2d, #0
- movi v19.2d, #0
+ movi v16.16b, #0
+ movi v17.16b, #0
+ movi v18.16b, #0
+ movi v19.16b, #0
usdot v16.4s, v20.16b, v30.16b
usdot v17.4s, v21.16b, v30.16b
usdot v18.4s, v22.16b, v30.16b
@@ -815,14 +815,14 @@ function ff_hevc_put_hevc_epel_h24_8_neon_i8mm, export=1
ext v26.16b, v1.16b, v1.16b, #1
ext v27.16b, v1.16b, v1.16b, #2
ext v28.16b, v1.16b, v1.16b, #3
- movi v16.2d, #0
- movi v17.2d, #0
- movi v18.2d, #0
- movi v19.2d, #0
- movi v20.2d, #0
- movi v21.2d, #0
- movi v22.2d, #0
- movi v23.2d, #0
+ movi v16.16b, #0
+ movi v17.16b, #0
+ movi v18.16b, #0
+ movi v19.16b, #0
+ movi v20.16b, #0
+ movi v21.16b, #0
+ movi v22.16b, #0
+ movi v23.16b, #0
usdot v16.4s, v0.16b, v30.16b
usdot v17.4s, v5.16b, v30.16b
usdot v18.4s, v6.16b, v30.16b
@@ -861,14 +861,14 @@ function ff_hevc_put_hevc_epel_h32_8_neon_i8mm, export=1
ext v26.16b, v1.16b, v2.16b, #1
ext v27.16b, v1.16b, v2.16b, #2
ext v28.16b, v1.16b, v2.16b, #3
- movi v16.2d, #0
- movi v17.2d, #0
- movi v18.2d, #0
- movi v19.2d, #0
- movi v20.2d, #0
- movi v21.2d, #0
- movi v22.2d, #0
- movi v23.2d, #0
+ movi v16.16b, #0
+ movi v17.16b, #0
+ movi v18.16b, #0
+ movi v19.16b, #0
+ movi v20.16b, #0
+ movi v21.16b, #0
+ movi v22.16b, #0
+ movi v23.16b, #0
usdot v16.4s, v0.16b, v30.16b
usdot v17.4s, v5.16b, v30.16b
usdot v18.4s, v6.16b, v30.16b
@@ -900,18 +900,18 @@ function ff_hevc_put_hevc_epel_h48_8_neon_i8mm, export=1
ext v16.16b, v1.16b, v2.16b, #1
ext v17.16b, v1.16b, v2.16b, #2
ext v18.16b, v1.16b, v2.16b, #3
- movi v20.2d, #0
- movi v21.2d, #0
- movi v22.2d, #0
- movi v23.2d, #0
+ movi v20.16b, #0
+ movi v21.16b, #0
+ movi v22.16b, #0
+ movi v23.16b, #0
usdot v20.4s, v0.16b, v30.16b
usdot v21.4s, v4.16b, v30.16b
usdot v22.4s, v5.16b, v30.16b
usdot v23.4s, v6.16b, v30.16b
- movi v24.2d, #0
- movi v25.2d, #0
- movi v26.2d, #0
- movi v27.2d, #0
+ movi v24.16b, #0
+ movi v25.16b, #0
+ movi v26.16b, #0
+ movi v27.16b, #0
usdot v24.4s, v1.16b, v30.16b
usdot v25.4s, v16.16b, v30.16b
usdot v26.4s, v17.16b, v30.16b
@@ -928,10 +928,10 @@ function ff_hevc_put_hevc_epel_h48_8_neon_i8mm, export=1
ext v4.16b, v2.16b, v3.16b, #1
ext v5.16b, v2.16b, v3.16b, #2
ext v6.16b, v2.16b, v3.16b, #3
- movi v20.2d, #0
- movi v21.2d, #0
- movi v22.2d, #0
- movi v23.2d, #0
+ movi v20.16b, #0
+ movi v21.16b, #0
+ movi v22.16b, #0
+ movi v23.16b, #0
usdot v20.4s, v2.16b, v30.16b
usdot v21.4s, v4.16b, v30.16b
usdot v22.4s, v5.16b, v30.16b
@@ -957,18 +957,18 @@ function ff_hevc_put_hevc_epel_h64_8_neon_i8mm, export=1
ext v16.16b, v1.16b, v2.16b, #1
ext v17.16b, v1.16b, v2.16b, #2
ext v18.16b, v1.16b, v2.16b, #3
- movi v20.2d, #0
- movi v21.2d, #0
- movi v22.2d, #0
- movi v23.2d, #0
+ movi v20.16b, #0
+ movi v21.16b, #0
+ movi v22.16b, #0
+ movi v23.16b, #0
usdot v20.4s, v0.16b, v30.16b
usdot v21.4s, v4.16b, v30.16b
usdot v22.4s, v5.16b, v30.16b
usdot v23.4s, v6.16b, v30.16b
- movi v24.2d, #0
- movi v25.2d, #0
- movi v26.2d, #0
- movi v27.2d, #0
+ movi v24.16b, #0
+ movi v25.16b, #0
+ movi v26.16b, #0
+ movi v27.16b, #0
usdot v24.4s, v1.16b, v30.16b
usdot v25.4s, v16.16b, v30.16b
usdot v26.4s, v17.16b, v30.16b
@@ -989,18 +989,18 @@ function ff_hevc_put_hevc_epel_h64_8_neon_i8mm, export=1
ext v16.16b, v3.16b, v7.16b, #1
ext v17.16b, v3.16b, v7.16b, #2
ext v18.16b, v3.16b, v7.16b, #3
- movi v20.2d, #0
- movi v21.2d, #0
- movi v22.2d, #0
- movi v23.2d, #0
+ movi v20.16b, #0
+ movi v21.16b, #0
+ movi v22.16b, #0
+ movi v23.16b, #0
usdot v20.4s, v2.16b, v30.16b
usdot v21.4s, v4.16b, v30.16b
usdot v22.4s, v5.16b, v30.16b
usdot v23.4s, v6.16b, v30.16b
- movi v24.2d, #0
- movi v25.2d, #0
- movi v26.2d, #0
- movi v27.2d, #0
+ movi v24.16b, #0
+ movi v25.16b, #0
+ movi v26.16b, #0
+ movi v27.16b, #0
usdot v24.4s, v3.16b, v30.16b
usdot v25.4s, v16.16b, v30.16b
usdot v26.4s, v17.16b, v30.16b
@@ -1593,7 +1593,7 @@ function ff_hevc_put_hevc_epel_uni_w_h4_8_neon_i8mm, export=1
trn1 v0.2s, v0.2s, v2.2s
trn1 v1.2s, v1.2s, v3.2s
zip1 v0.4s, v0.4s, v1.4s
- movi v16.2d, #0
+ movi v16.16b, #0
usdot v16.4s, v0.16b, v28.16b
mul v16.4s, v16.4s, v30.4s
sqrshl v16.4s, v16.4s, v31.4s
@@ -1620,8 +1620,8 @@ function ff_hevc_put_hevc_epel_uni_w_h6_8_neon_i8mm, export=1
trn2 v6.2s, v0.2s, v1.2s
trn1 v5.2s, v2.2s, v3.2s
zip1 v4.2d, v4.2d, v5.2d
- movi v16.2d, #0
- movi v17.2d, #0
+ movi v16.16b, #0
+ movi v17.16b, #0
usdot v16.4s, v4.16b, v28.16b
usdot v17.2s, v6.8b, v28.8b
mul v16.4s, v16.4s, v30.4s
@@ -1640,8 +1640,8 @@ function ff_hevc_put_hevc_epel_uni_w_h6_8_neon_i8mm, export=1
endfunc
.macro EPEL_UNI_W_H_CALC s0, s1, d0, d1
- movi \d0\().2d, #0
- movi \d1\().2d, #0
+ movi \d0\().16b, #0
+ movi \d1\().16b, #0
usdot \d0\().4s, \s0\().16b, v28.16b
usdot \d1\().4s, \s1\().16b, v28.16b
mul \d0\().4s, \d0\().4s, v30.4s
@@ -1687,7 +1687,7 @@ function ff_hevc_put_hevc_epel_uni_w_h12_8_neon_i8mm, export=1
zip2 v7.4s, v1.4s, v3.4s
zip1 v6.4s, v6.4s, v7.4s
EPEL_UNI_W_H_CALC v4, v5, v16, v17
- movi v18.2d, #0
+ movi v18.16b, #0
usdot v18.4s, v6.16b, v28.16b
mul v18.4s, v18.4s, v30.4s
sqrshl v18.4s, v18.4s, v31.4s
@@ -2575,7 +2575,7 @@ DISABLE_I8MM
.endm
.macro EPEL_UNI_W_V4_CALC d0, s0, s1, s2, s3
- movi \d0\().2d, #0
+ movi \d0\().16b, #0
umlsl \d0\().8h, \s0\().8b, v0.8b
umlal \d0\().8h, \s1\().8b, v1.8b
umlal \d0\().8h, \s2\().8b, v2.8b
@@ -2626,7 +2626,7 @@ function ff_hevc_put_hevc_epel_uni_w_v4_8_neon, export=1
endfunc
.macro EPEL_UNI_W_V8_CALC d0, s0, s1, s2, s3, t0, t1
- movi \d0\().2d, #0
+ movi \d0\().16b, #0
umlsl \d0\().8h, \s0\().8b, v0.8b
umlal \d0\().8h, \s1\().8b, v1.8b
umlal \d0\().8h, \s2\().8b, v2.8b
@@ -2720,8 +2720,8 @@ function ff_hevc_put_hevc_epel_uni_w_v8_8_neon, export=1
endfunc
.macro EPEL_UNI_W_V12_CALC d0, d1, s0, s1, s2, s3, t0, t1, t2, t3
- movi \d0\().2d, #0
- movi \d1\().2d, #0
+ movi \d0\().16b, #0
+ movi \d1\().16b, #0
umlsl \d0\().8h, \s0\().8b, v0.8b
umlsl2 \d1\().8h, \s0\().16b, v0.16b
umlal \d0\().8h, \s1\().8b, v1.8b
@@ -2793,8 +2793,8 @@ function ff_hevc_put_hevc_epel_uni_w_v12_8_neon, export=1
endfunc
.macro EPEL_UNI_W_V16_CALC d0, d1, s0, s1, s2, s3, t0, t1, t2, t3
- movi \d0\().2d, #0
- movi \d1\().2d, #0
+ movi \d0\().16b, #0
+ movi \d1\().16b, #0
umlsl \d0\().8h, \s0\().8b, v0.8b
umlsl2 \d1\().8h, \s0\().16b, v0.16b
umlal \d0\().8h, \s1\().8b, v1.8b
diff --git a/libavcodec/aarch64/hevcdsp_qpel_neon.S b/libavcodec/aarch64/hevcdsp_qpel_neon.S
index 8adfa38ccf..bcee627cba 100644
--- a/libavcodec/aarch64/hevcdsp_qpel_neon.S
+++ b/libavcodec/aarch64/hevcdsp_qpel_neon.S
@@ -2180,8 +2180,8 @@ function ff_hevc_put_hevc_qpel_uni_w_h4_8_neon_i8mm, export=1
ext v3.16b, v0.16b, v0.16b, #3
zip1 v0.2d, v0.2d, v1.2d
zip1 v2.2d, v2.2d, v3.2d
- movi v16.2d, #0
- movi v17.2d, #0
+ movi v16.16b, #0
+ movi v17.16b, #0
usdot v16.4s, v0.16b, v28.16b
usdot v17.4s, v2.16b, v28.16b
addp v16.4s, v16.4s, v17.4s
@@ -2210,9 +2210,9 @@ function ff_hevc_put_hevc_qpel_uni_w_h6_8_neon_i8mm, export=1
zip1 v0.2d, v0.2d, v1.2d
zip1 v2.2d, v2.2d, v3.2d
zip1 v4.2d, v4.2d, v5.2d
- movi v16.2d, #0
- movi v17.2d, #0
- movi v18.2d, #0
+ movi v16.16b, #0
+ movi v17.16b, #0
+ movi v18.16b, #0
usdot v16.4s, v0.16b, v28.16b
usdot v17.4s, v2.16b, v28.16b
usdot v18.4s, v4.16b, v28.16b
@@ -2236,10 +2236,10 @@ endfunc
.macro QPEL_UNI_W_H_CALC s0, s1, s2, s3, d0, d1, d2, d3
- movi \d0\().2d, #0
- movi \d1\().2d, #0
- movi \d2\().2d, #0
- movi \d3\().2d, #0
+ movi \d0\().16b, #0
+ movi \d1\().16b, #0
+ movi \d2\().16b, #0
+ movi \d3\().16b, #0
usdot \d0\().4s, \s0\().16b, v28.16b
usdot \d1\().4s, \s1\().16b, v28.16b
usdot \d2\().4s, \s2\().16b, v28.16b
@@ -2255,8 +2255,8 @@ endfunc
.endm
.macro QPEL_UNI_W_H_CALC_HALF s0, s1, d0, d1
- movi \d0\().2d, #0
- movi \d1\().2d, #0
+ movi \d0\().16b, #0
+ movi \d1\().16b, #0
usdot \d0\().4s, \s0\().16b, v28.16b
usdot \d1\().4s, \s1\().16b, v28.16b
addp \d0\().4s, \d0\().4s, \d1\().4s
@@ -2606,8 +2606,8 @@ function ff_hevc_put_hevc_qpel_h4_8_neon_i8mm, export=1
ext v3.16b, v0.16b, v0.16b, #3
zip1 v0.2d, v0.2d, v1.2d
zip1 v2.2d, v2.2d, v3.2d
- movi v16.2d, #0
- movi v17.2d, #0
+ movi v16.16b, #0
+ movi v17.16b, #0
usdot v16.4s, v0.16b, v31.16b
usdot v17.4s, v2.16b, v31.16b
addp v16.4s, v16.4s, v17.4s
@@ -2633,9 +2633,9 @@ function ff_hevc_put_hevc_qpel_h6_8_neon_i8mm, export=1
zip1 v0.2d, v0.2d, v1.2d
zip1 v2.2d, v2.2d, v3.2d
zip1 v4.2d, v4.2d, v5.2d
- movi v16.2d, #0
- movi v17.2d, #0
- movi v18.2d, #0
+ movi v16.16b, #0
+ movi v17.16b, #0
+ movi v18.16b, #0
usdot v16.4s, v0.16b, v31.16b
usdot v17.4s, v2.16b, v31.16b
usdot v18.4s, v4.16b, v31.16b
@@ -2668,10 +2668,10 @@ function ff_hevc_put_hevc_qpel_h8_8_neon_i8mm, export=1
zip1 v2.2d, v2.2d, v3.2d
zip1 v4.2d, v4.2d, v5.2d
zip1 v6.2d, v6.2d, v7.2d
- movi v16.2d, #0
- movi v17.2d, #0
- movi v18.2d, #0
- movi v19.2d, #0
+ movi v16.16b, #0
+ movi v17.16b, #0
+ movi v18.16b, #0
+ movi v19.16b, #0
usdot v16.4s, v0.16b, v31.16b
usdot v17.4s, v2.16b, v31.16b
usdot v18.4s, v4.16b, v31.16b
@@ -2688,10 +2688,10 @@ function ff_hevc_put_hevc_qpel_h8_8_neon_i8mm, export=1
endfunc
.macro QPEL_H_CALC s0, s1, s2, s3, d0, d1, d2, d3
- movi \d0\().2d, #0
- movi \d1\().2d, #0
- movi \d2\().2d, #0
- movi \d3\().2d, #0
+ movi \d0\().16b, #0
+ movi \d1\().16b, #0
+ movi \d2\().16b, #0
+ movi \d3\().16b, #0
usdot \d0\().4s, \s0\().16b, v31.16b
usdot \d1\().4s, \s1\().16b, v31.16b
usdot \d2\().4s, \s2\().16b, v31.16b
@@ -2716,8 +2716,8 @@ function ff_hevc_put_hevc_qpel_h12_8_neon_i8mm, export=1
QPEL_H_CALC v16, v1, v2, v3, v20, v21, v22, v23
addp v20.4s, v20.4s, v22.4s
addp v21.4s, v21.4s, v23.4s
- movi v24.2d, #0
- movi v25.2d, #0
+ movi v24.16b, #0
+ movi v25.16b, #0
usdot v24.4s, v18.16b, v31.16b
usdot v25.4s, v19.16b, v31.16b
addp v24.4s, v24.4s, v25.4s
diff --git a/libavcodec/aarch64/mpegaudiodsp_neon.S b/libavcodec/aarch64/mpegaudiodsp_neon.S
index b6ef131228..9799c271d0 100644
--- a/libavcodec/aarch64/mpegaudiodsp_neon.S
+++ b/libavcodec/aarch64/mpegaudiodsp_neon.S
@@ -56,7 +56,7 @@ function ff_mpadsp_apply_window_\type\()_neon, export=1
.ifc \type, fixed
ld1r {v16.2s}, [x2] // dither_state
sxtl v16.2d, v16.2s
- movi v29.2d, #0
+ movi v29.16b, #0
movi v30.2d, #(1<<OUT_SHIFT)-1
trn1 v31.2d, v29.2d, v30.2d
trn2 v30.2d, v30.2d, v29.2d
@@ -74,9 +74,9 @@ function ff_mpadsp_apply_window_\type\()_neon, export=1
add x11, x6, #(32)<<2 // w + 32
add x12, x7, #(32)<<2 // w2 + 32
mov x15, #8
- movi v17.2d, #0
- movi v18.2d, #0
- movi v19.2d, #0
+ movi v17.16b, #0
+ movi v18.16b, #0
+ movi v19.16b, #0
2:
subs x15, x15, #1
ld1 {v0.4s}, [x8], x9
diff --git a/libswscale/aarch64/hscale.S b/libswscale/aarch64/hscale.S
index b49443c964..b3873fc4b7 100644
--- a/libswscale/aarch64/hscale.S
+++ b/libswscale/aarch64/hscale.S
@@ -50,10 +50,10 @@ function ff_hscale8to15_X8_neon, export=1
add x12, x16, x7 // filter1 = filter0 + filterSize*2
add x13, x12, x7 // filter2 = filter1 + filterSize*2
add x4, x13, x7 // filter3 = filter2 + filterSize*2
- movi v0.2d, #0 // val sum part 1 (for dst[0])
- movi v1.2d, #0 // val sum part 2 (for dst[1])
- movi v2.2d, #0 // val sum part 3 (for dst[2])
- movi v3.2d, #0 // val sum part 4 (for dst[3])
+ movi v0.16b, #0 // val sum part 1 (for dst[0])
+ movi v1.16b, #0 // val sum part 2 (for dst[1])
+ movi v2.16b, #0 // val sum part 3 (for dst[2])
+ movi v3.16b, #0 // val sum part 4 (for dst[3])
add x17, x3, w8, uxtw // srcp + filterPos[0]
add x8, x3, w0, uxtw // srcp + filterPos[1]
add x0, x3, w11, uxtw // srcp + filterPos[2]
@@ -108,10 +108,10 @@ function ff_hscale8to15_X4_neon, export=1
ldp w8, w9, [x5] // filterPos[idx + 0], [idx + 1]
ldp w10, w11, [x5, #8] // filterPos[idx + 2], [idx + 3]
- movi v16.2d, #0 // initialize accumulator for idx + 0
- movi v17.2d, #0 // initialize accumulator for idx + 1
- movi v18.2d, #0 // initialize accumulator for idx + 2
- movi v19.2d, #0 // initialize accumulator for idx + 3
+ movi v16.16b, #0 // initialize accumulator for idx + 0
+ movi v17.16b, #0 // initialize accumulator for idx + 1
+ movi v18.16b, #0 // initialize accumulator for idx + 2
+ movi v19.16b, #0 // initialize accumulator for idx + 3
mov x12, x4 // filter pointer for idx + 0
add x13, x4, x7 // filter pointer for idx + 1
@@ -253,8 +253,8 @@ function ff_hscale8to15_4_neon, export=1
ldp w12, w13, [x5, #16] // filterPos[idx + 4][0..3], [idx + 5][0..3], next iteration
ldp w14, w15, [x5, #24] // filterPos[idx + 6][0..3], [idx + 7][0..3], next iteration
- movi v0.2d, #0 // Clear madd accumulator for idx 0..3
- movi v5.2d, #0 // Clear madd accumulator for idx 4..7
+ movi v0.16b, #0 // Clear madd accumulator for idx 0..3
+ movi v5.16b, #0 // Clear madd accumulator for idx 4..7
ld4 {v1.8h, v2.8h, v3.8h, v4.8h}, [x4], #64 // load filter idx + 0..7
@@ -299,8 +299,8 @@ function ff_hscale8to15_4_neon, export=1
ld4 {v16.8b, v17.8b, v18.8b, v19.8b}, [sp]
ld4 {v1.8h, v2.8h, v3.8h, v4.8h}, [x4], #64 // load filter idx + 0..7
- movi v0.2d, #0 // Clear madd accumulator for idx 0..3
- movi v5.2d, #0 // Clear madd accumulator for idx 4..7
+ movi v0.16b, #0 // Clear madd accumulator for idx 0..3
+ movi v5.16b, #0 // Clear madd accumulator for idx 4..7
uxtl v16.8h, v16.8b // unsigned extend long, covert src data to 16-bit
uxtl v17.8h, v17.8b // unsigned extend long, covert src data to 16-bit
@@ -499,10 +499,10 @@ function ff_hscale8to19_X8_neon, export=1
ldr w11, [x5], #4 // filterPos[idx + 2]
add x4, x13, x7 // filter3 = filter2 + filterSize*2
ldr w9, [x5], #4 // filterPos[idx + 3]
- movi v0.2d, #0 // val sum part 1 (for dst[0])
- movi v1.2d, #0 // val sum part 2 (for dst[1])
- movi v2.2d, #0 // val sum part 3 (for dst[2])
- movi v3.2d, #0 // val sum part 4 (for dst[3])
+ movi v0.16b, #0 // val sum part 1 (for dst[0])
+ movi v1.16b, #0 // val sum part 2 (for dst[1])
+ movi v2.16b, #0 // val sum part 3 (for dst[2])
+ movi v3.16b, #0 // val sum part 4 (for dst[3])
add x17, x3, w8, uxtw // srcp + filterPos[0]
add x8, x3, w0, uxtw // srcp + filterPos[1]
add x0, x3, w11, uxtw // srcp + filterPos[2]
@@ -560,10 +560,10 @@ function ff_hscale8to19_X4_neon, export=1
ldp w8, w9, [x5]
ldp w10, w11, [x5, #8]
- movi v16.2d, #0 // initialize accumulator for idx + 0
- movi v17.2d, #0 // initialize accumulator for idx + 1
- movi v18.2d, #0 // initialize accumulator for idx + 2
- movi v19.2d, #0 // initialize accumulator for idx + 3
+ movi v16.16b, #0 // initialize accumulator for idx + 0
+ movi v17.16b, #0 // initialize accumulator for idx + 1
+ movi v18.16b, #0 // initialize accumulator for idx + 2
+ movi v19.16b, #0 // initialize accumulator for idx + 3
mov x12, x4 // filter + 0
add x13, x4, x7 // filter + 1
@@ -865,10 +865,10 @@ function ff_hscale16to15_X8_neon_asm, export=1
add x12, x16, x7 // filter1 = filter0 + filterSize*2
add x13, x12, x7 // filter2 = filter1 + filterSize*2
add x4, x13, x7 // filter3 = filter2 + filterSize*2
- movi v0.2d, #0 // val sum part 1 (for dst[0])
- movi v1.2d, #0 // val sum part 2 (for dst[1])
- movi v2.2d, #0 // val sum part 3 (for dst[2])
- movi v3.2d, #0 // val sum part 4 (for dst[3])
+ movi v0.16b, #0 // val sum part 1 (for dst[0])
+ movi v1.16b, #0 // val sum part 2 (for dst[1])
+ movi v2.16b, #0 // val sum part 3 (for dst[2])
+ movi v3.16b, #0 // val sum part 4 (for dst[3])
add x17, x3, w8, uxtw // srcp + filterPos[0]
add x8, x3, w10, uxtw // srcp + filterPos[1]
add x10, x3, w11, uxtw // srcp + filterPos[2]
@@ -945,10 +945,10 @@ function ff_hscale16to15_X4_neon_asm, export=1
ldp w8, w9, [x5]
ldp w10, w11, [x5, #8]
- movi v16.2d, #0 // initialize accumulator for idx + 0
- movi v17.2d, #0 // initialize accumulator for idx + 1
- movi v18.2d, #0 // initialize accumulator for idx + 2
- movi v19.2d, #0 // initialize accumulator for idx + 3
+ movi v16.16b, #0 // initialize accumulator for idx + 0
+ movi v17.16b, #0 // initialize accumulator for idx + 1
+ movi v18.16b, #0 // initialize accumulator for idx + 2
+ movi v19.16b, #0 // initialize accumulator for idx + 3
mov x12, x4 // filter + 0
add x13, x4, x7 // filter + 1
@@ -1270,10 +1270,10 @@ function ff_hscale16to19_X8_neon_asm, export=1
add x13, x12, x7 // filter2 = filter1 + filterSize*2
lsl w10, w10, #1
add x4, x13, x7 // filter3 = filter2 + filterSize*2
- movi v0.2d, #0 // val sum part 1 (for dst[0])
- movi v1.2d, #0 // val sum part 2 (for dst[1])
- movi v2.2d, #0 // val sum part 3 (for dst[2])
- movi v3.2d, #0 // val sum part 4 (for dst[3])
+ movi v0.16b, #0 // val sum part 1 (for dst[0])
+ movi v1.16b, #0 // val sum part 2 (for dst[1])
+ movi v2.16b, #0 // val sum part 3 (for dst[2])
+ movi v3.16b, #0 // val sum part 4 (for dst[3])
add x17, x3, w8, uxtw // srcp + filterPos[0]
add x8, x3, w10, uxtw // srcp + filterPos[1]
add x10, x3, w11, uxtw // srcp + filterPos[2]
@@ -1348,10 +1348,10 @@ function ff_hscale16to19_X4_neon_asm, export=1
ldp w8, w9, [x5]
ldp w10, w11, [x5, #8]
- movi v16.2d, #0 // initialize accumulator for idx + 0
- movi v17.2d, #0 // initialize accumulator for idx + 1
- movi v18.2d, #0 // initialize accumulator for idx + 2
- movi v19.2d, #0 // initialize accumulator for idx + 3
+ movi v16.16b, #0 // initialize accumulator for idx + 0
+ movi v17.16b, #0 // initialize accumulator for idx + 1
+ movi v18.16b, #0 // initialize accumulator for idx + 2
+ movi v19.16b, #0 // initialize accumulator for idx + 3
mov x12, x4 // filter + 0
add x13, x4, x7 // filter + 1
--
2.32.0 (Apple Git-132)
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 3+ messages in thread
* Re: [FFmpeg-devel] [PATCH] lavc/hevcdsp_qpel_neon: using movi.16b instead of movi.2d
2023-11-15 3:53 [FFmpeg-devel] [PATCH] lavc/hevcdsp_qpel_neon: using movi.16b instead of movi.2d xufuji456 via ffmpeg-devel
@ 2023-11-28 13:57 ` Martin Storsjö
2023-11-29 10:01 ` 徐福隆 via ffmpeg-devel
0 siblings, 1 reply; 3+ messages in thread
From: Martin Storsjö @ 2023-11-28 13:57 UTC (permalink / raw)
To: xufuji456 via ffmpeg-devel; +Cc: xufuji456
Hi,
On Wed, 15 Nov 2023, xufuji456 via ffmpeg-devel wrote:
> Building iOS platform with arm64, the compiler has a warning:
> "instruction movi.2d with immediate #0 may not function correctly on
> this CPU, converting to movi.16b"
The change is correct and something we should do. However since
fb1b88af77cd39034cef4b6d08af79496cd75ed8, our asm.S starts with an ".arch"
directive, and that silences this warning, so I wonder how you've run into
it or tested it. (I presume you've run into it with an older version, then
tried to find all cases of this instruction in the codebase and changed
those?)
(This warning is something that only is triggered when building for iOS,
not macOS, due to a bug in early iPhone CPUs - see
https://github.com/llvm/llvm-project/commit/9097a07e4e83b9f394598d53557247277b064b09.)
Anyway, the change looks good, so I'll push it.
// Martin
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 3+ messages in thread
* Re: [FFmpeg-devel] [PATCH] lavc/hevcdsp_qpel_neon: using movi.16b instead of movi.2d
2023-11-28 13:57 ` Martin Storsjö
@ 2023-11-29 10:01 ` 徐福隆 via ffmpeg-devel
0 siblings, 0 replies; 3+ messages in thread
From: 徐福隆 via ffmpeg-devel @ 2023-11-29 10:01 UTC (permalink / raw)
To: FFmpeg development discussions and patches
Cc: 徐福隆, martin
[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #1: Type: text/plain; charset="gb18030", Size: 2244 bytes --]
Thank you for reading the patch and explaining so carefully, Martin.
Yes, you are right. I built for iOS 13.0 with FFmpeg 6.0.
It seems that there is no ".arch" directive in asm.S in that version.
That is to say, it isn't the newest version.
------------------ Original ------------------
From: "FFmpeg development discussions and patches" <martin@martin.st>;
Date: Tue, Nov 28, 2023 09:57 PM
To: "xufuji456 via ffmpeg-devel"<ffmpeg-devel@ffmpeg.org>;
Cc: "徐福隆"<839789740@qq.com>;
Subject: Re: [FFmpeg-devel] [PATCH] lavc/hevcdsp_qpel_neon: using movi.16b instead of movi.2d
Hi,
On Wed, 15 Nov 2023, xufuji456 via ffmpeg-devel wrote:
> Building iOS platform with arm64, the compiler has a warning:
> "instruction movi.2d with immediate #0 may not function correctly on
> this CPU, converting to movi.16b"
The change is correct and something we should do. However since
fb1b88af77cd39034cef4b6d08af79496cd75ed8, our asm.S starts with an ".arch"
directive, and that silences this warning, so I wonder how you've run into
it or tested it. (I presume you've run into it with an older version, then
tried to find all cases of this instruction in the codebase and changed
those?)
(This warning is something that only is triggered when building for iOS,
not macOS, due to a bug in early iPhone CPUs - see
https://github.com/llvm/llvm-project/commit/9097a07e4e83b9f394598d53557247277b064b09.)
Anyway, the change looks good, so I'll push it.
// Martin
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 3+ messages in thread
end of thread, other threads:[~2023-11-29 10:02 UTC | newest]
Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-11-15 3:53 [FFmpeg-devel] [PATCH] lavc/hevcdsp_qpel_neon: using movi.16b instead of movi.2d xufuji456 via ffmpeg-devel
2023-11-28 13:57 ` Martin Storsjö
2023-11-29 10:01 ` 徐福隆 via ffmpeg-devel
Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
This inbox may be cloned and mirrored by anyone:
git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git
# If you have public-inbox 1.1+ installed, you may
# initialize and index your mirror using the following commands:
public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
ffmpegdev@gitmailbox.com
public-inbox-index ffmpegdev
Example config snippet for mirrors.
AGPL code for this site: git clone https://public-inbox.org/public-inbox.git