* [FFmpeg-devel] [PATCH 1/5] libavcodec: aarch64: Don't clobber v8 in the h%4 case in ff_pix_abs16_xy2_neon
@ 2022-07-13 20:48 Martin Storsjö
2022-07-13 20:48 ` [FFmpeg-devel] [PATCH 2/5] checkasm: motion: Make the benchmarks more stable Martin Storsjö
` (4 more replies)
0 siblings, 5 replies; 20+ messages in thread
From: Martin Storsjö @ 2022-07-13 20:48 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: Jonathan Swinney
Checkasm doesn't currently test this codepath.
---
libavcodec/aarch64/me_cmp_neon.S | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S
index e49d049fc2..31db3793d9 100644
--- a/libavcodec/aarch64/me_cmp_neon.S
+++ b/libavcodec/aarch64/me_cmp_neon.S
@@ -189,11 +189,11 @@ function ff_pix_abs16_xy2_neon, export=1
urshr v16.8h, v16.8h, #2 // shift right by 2 0..7 (rounding shift right)
urshr v17.8h, v17.8h, #2 // shift right by 2 8..15
- uxtl2 v8.8h, v1.16b // 8->16 bits pix1 8..15
+ uxtl2 v7.8h, v1.16b // 8->16 bits pix1 8..15
uxtl v1.8h, v1.8b // 8->16 bits pix1 0..7
uabd v6.8h, v1.8h, v16.8h // absolute difference 0..7
- uaba v6.8h, v8.8h, v17.8h // absolute difference accumulate 8..15
+ uaba v6.8h, v7.8h, v17.8h // absolute difference accumulate 8..15
mov v2.16b, v18.16b // pix3 -> pix2
mov v3.16b, v19.16b // pix3+1 -> pix2+1
uaddlv s6, v6.8h // add up accumulator in v6
--
2.25.1
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 20+ messages in thread
* [FFmpeg-devel] [PATCH 2/5] checkasm: motion: Make the benchmarks more stable
2022-07-13 20:48 [FFmpeg-devel] [PATCH 1/5] libavcodec: aarch64: Don't clobber v8 in the h%4 case in ff_pix_abs16_xy2_neon Martin Storsjö
@ 2022-07-13 20:48 ` Martin Storsjö
2022-07-15 19:35 ` Swinney, Jonathan
2022-07-13 20:48 ` [FFmpeg-devel] [PATCH 3/5] aarch64: me_cmp: Interleave some of the loads in ff_pix_abs16_xy2_neon Martin Storsjö
` (3 subsequent siblings)
4 siblings, 1 reply; 20+ messages in thread
From: Martin Storsjö @ 2022-07-13 20:48 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: Jonathan Swinney
Don't use the last random offset, but a static one.
---
tests/checkasm/motion.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/tests/checkasm/motion.c b/tests/checkasm/motion.c
index 79e4358941..87b20d1c10 100644
--- a/tests/checkasm/motion.c
+++ b/tests/checkasm/motion.c
@@ -81,7 +81,8 @@ static void test_motion(const char *name, me_cmp_func test_func)
break;
}
}
- // benchmark with the final value of ptr
+ // Test with a fixed offset, for benchmark stability
+ ptr = img2 + 3 * WIDTH + 3;
bench_new(NULL, img1, ptr, WIDTH, 8);
}
}
--
2.25.1
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 20+ messages in thread
* [FFmpeg-devel] [PATCH 3/5] aarch64: me_cmp: Interleave some of the loads in ff_pix_abs16_xy2_neon
2022-07-13 20:48 [FFmpeg-devel] [PATCH 1/5] libavcodec: aarch64: Don't clobber v8 in the h%4 case in ff_pix_abs16_xy2_neon Martin Storsjö
2022-07-13 20:48 ` [FFmpeg-devel] [PATCH 2/5] checkasm: motion: Make the benchmarks more stable Martin Storsjö
@ 2022-07-13 20:48 ` Martin Storsjö
2022-07-15 19:34 ` Swinney, Jonathan
2022-07-13 20:48 ` [FFmpeg-devel] [PATCH 4/5] aarch64: me_cmp: Switch from uabd to uabal " Martin Storsjö
` (2 subsequent siblings)
4 siblings, 1 reply; 20+ messages in thread
From: Martin Storsjö @ 2022-07-13 20:48 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: Jonathan Swinney
Before: Cortex A53 A72 A73 Graviton 3
pix_abs_0_3_neon: 183.7 112.7 97.5 41.2
After:
pix_abs_0_3_neon: 175.7 109.2 92.0 41.2
---
libavcodec/aarch64/me_cmp_neon.S | 22 ++++++++++++++--------
1 file changed, 14 insertions(+), 8 deletions(-)
diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S
index 31db3793d9..0ae23d8922 100644
--- a/libavcodec/aarch64/me_cmp_neon.S
+++ b/libavcodec/aarch64/me_cmp_neon.S
@@ -101,26 +101,32 @@ function ff_pix_abs16_xy2_neon, export=1
ld1 {v6.16b}, [x5], x3 // load pix3
ld1 {v16.16b}, [x1], x3 // load pix1
- ldur q19, [x5, #1] // load pix3+1
- ld1 {v18.16b}, [x5], x3 // load pix3
- ld1 {v17.16b}, [x1], x3 // load pix1
-
- ldur q22, [x5, #1] // load pix3+1
- ld1 {v21.16b}, [x5], x3 // load pix3
- ld1 {v20.16b}, [x1], x3 // load pix1
-
// These blocks compute the average: avg(pix2[n], pix2[n+1], pix3[n], pix3[n+1])
uaddl v30.8h, v4.8b, v5.8b // pix3 + pix3+1 0..7
uaddl2 v31.8h, v4.16b, v5.16b // pix3 + pix3+1 8..15
+
+ ldur q19, [x5, #1] // load pix3+1
+
add v23.8h, v2.8h, v30.8h // add up 0..7, using pix2 + pix2+1 values from previous iteration
add v24.8h, v3.8h, v31.8h // add up 8..15, using pix2 + pix2+1 values from previous iteration
+
+ ld1 {v18.16b}, [x5], x3 // load pix3
+ ld1 {v17.16b}, [x1], x3 // load pix1
+
rshrn v23.8b, v23.8h, #2 // shift right 2 0..7 (rounding shift right)
rshrn2 v23.16b, v24.8h, #2 // shift right 2 8..15
uaddl v2.8h, v6.8b, v7.8b // pix3 + pix3+1 0..7
uaddl2 v3.8h, v6.16b, v7.16b // pix3 + pix3+1 8..15
+
+ ldur q22, [x5, #1] // load pix3+1
+
add v26.8h, v30.8h, v2.8h // add up 0..7, using pix2 + pix2+1 values from pix3 above
add v27.8h, v31.8h, v3.8h // add up 8..15, using pix2 + pix2+1 values from pix3 above
+
+ ld1 {v21.16b}, [x5], x3 // load pix3
+ ld1 {v20.16b}, [x1], x3 // load pix1
+
rshrn v26.8b, v26.8h, #2 // shift right 2 0..7 (rounding shift right)
rshrn2 v26.16b, v27.8h, #2 // shift right 2 8..15
--
2.25.1
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 20+ messages in thread
* [FFmpeg-devel] [PATCH 4/5] aarch64: me_cmp: Switch from uabd to uabal in ff_pix_abs16_xy2_neon
2022-07-13 20:48 [FFmpeg-devel] [PATCH 1/5] libavcodec: aarch64: Don't clobber v8 in the h%4 case in ff_pix_abs16_xy2_neon Martin Storsjö
2022-07-13 20:48 ` [FFmpeg-devel] [PATCH 2/5] checkasm: motion: Make the benchmarks more stable Martin Storsjö
2022-07-13 20:48 ` [FFmpeg-devel] [PATCH 3/5] aarch64: me_cmp: Interleave some of the loads in ff_pix_abs16_xy2_neon Martin Storsjö
@ 2022-07-13 20:48 ` Martin Storsjö
2022-07-15 19:38 ` Swinney, Jonathan
2022-07-13 20:48 ` [FFmpeg-devel] [PATCH 5/5] aarch64: me_cmp: Don't do uaddlv once per iteration Martin Storsjö
2022-07-15 19:35 ` [FFmpeg-devel] [PATCH 1/5] libavcodec: aarch64: Don't clobber v8 in the h%4 case in ff_pix_abs16_xy2_neon Swinney, Jonathan
4 siblings, 1 reply; 20+ messages in thread
From: Martin Storsjö @ 2022-07-13 20:48 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: Jonathan Swinney
Using absolute-difference-accumulate does use twice the amount of
absolute-difference instructions, but avoids the need for the
uaddl and add instructions, reducing the total number of instructions
by 3.
These can be interleaved in the rest of the calculation, to avoid
tight dependencies at the end. Unfortunately, this is marginally
slower on Cortex A53, but faster on A72 and A73.
Before: Cortex A53 A72 A73 Graviton 3
pix_abs_0_3_neon: 175.7 109.2 92.0 41.2
After:
pix_abs_0_3_neon: 179.7 96.7 87.5 41.2
---
libavcodec/aarch64/me_cmp_neon.S | 32 +++++++++++---------------------
1 file changed, 11 insertions(+), 21 deletions(-)
diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S
index 0ae23d8922..89546869fb 100644
--- a/libavcodec/aarch64/me_cmp_neon.S
+++ b/libavcodec/aarch64/me_cmp_neon.S
@@ -124,6 +124,9 @@ function ff_pix_abs16_xy2_neon, export=1
add v26.8h, v30.8h, v2.8h // add up 0..7, using pix2 + pix2+1 values from pix3 above
add v27.8h, v31.8h, v3.8h // add up 8..15, using pix2 + pix2+1 values from pix3 above
+ uabdl v24.8h, v1.8b, v23.8b // absolute difference 0..7, i=0
+ uabdl2 v23.8h, v1.16b, v23.16b // absolute difference 8..15, i=0
+
ld1 {v21.16b}, [x5], x3 // load pix3
ld1 {v20.16b}, [x1], x3 // load pix1
@@ -137,6 +140,9 @@ function ff_pix_abs16_xy2_neon, export=1
rshrn v28.8b, v28.8h, #2 // shift right 2 0..7 (rounding shift right)
rshrn2 v28.16b, v29.8h, #2 // shift right 2 8..15
+ uabal v24.8h, v16.8b, v26.8b // absolute difference 0..7, i=1
+ uabal2 v23.8h, v16.16b, v26.16b // absolute difference 8..15, i=1
+
uaddl v2.8h, v21.8b, v22.8b // pix3 + pix3+1 0..7
uaddl2 v3.8h, v21.16b, v22.16b // pix3 + pix3+1 8..15
add v30.8h, v4.8h, v2.8h // add up 0..7, using pix2 + pix2+1 values from pix3 above
@@ -144,33 +150,17 @@ function ff_pix_abs16_xy2_neon, export=1
rshrn v30.8b, v30.8h, #2 // shift right 2 0..7 (rounding shift right)
rshrn2 v30.16b, v31.8h, #2 // shift right 2 8..15
- // Averages are now stored in these registers:
- // v23, v16, v28, v30
- // pix1 values in these registers:
- // v1, v16, v17, v20
- // available:
- // v4, v5, v7, v18, v19, v24, v25, v27, v29, v31
+ uabal v24.8h, v17.8b, v28.8b // absolute difference 0..7, i=2
+ uabal2 v23.8h, v17.16b, v28.16b // absolute difference 8..15, i=2
sub w4, w4, #4 // h -= 4
- // Using absolute-difference instructions instead of absolute-difference-accumulate allows
- // us to keep the results in 16b vectors instead of widening values with twice the instructions.
- // This approach also has fewer data dependencies, allowing better instruction level parallelism.
- uabd v4.16b, v1.16b, v23.16b // absolute difference 0..15, i=0
- uabd v5.16b, v16.16b, v26.16b // absolute difference 0..15, i=1
- uabd v6.16b, v17.16b, v28.16b // absolute difference 0..15, i=2
- uabd v7.16b, v20.16b, v30.16b // absolute difference 0..15, i=3
+ uabal v24.8h, v20.8b, v30.8b // absolute difference 0..7, i=3
+ uabal2 v23.8h, v20.16b, v30.16b // absolute difference 8..15, i=3
cmp w4, #4 // loop if h >= 4
- // Now add up all the values in each vector, v4-v7 with widening adds
- uaddl v19.8h, v4.8b, v5.8b
- uaddl2 v18.8h, v4.16b, v5.16b
- uaddl v4.8h, v6.8b, v7.8b
- uaddl2 v5.8h, v6.16b, v7.16b
- add v4.8h, v4.8h, v5.8h
- add v4.8h, v4.8h, v18.8h
- add v4.8h, v4.8h, v19.8h
+ add v4.8h, v23.8h, v24.8h
uaddlv s4, v4.8h // finish adding up accumulated values
add d0, d0, d4 // add the value to the top level accumulator
--
2.25.1
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 20+ messages in thread
* [FFmpeg-devel] [PATCH 5/5] aarch64: me_cmp: Don't do uaddlv once per iteration
2022-07-13 20:48 [FFmpeg-devel] [PATCH 1/5] libavcodec: aarch64: Don't clobber v8 in the h%4 case in ff_pix_abs16_xy2_neon Martin Storsjö
` (2 preceding siblings ...)
2022-07-13 20:48 ` [FFmpeg-devel] [PATCH 4/5] aarch64: me_cmp: Switch from uabd to uabal " Martin Storsjö
@ 2022-07-13 20:48 ` Martin Storsjö
2022-07-15 19:32 ` Swinney, Jonathan
2022-07-15 19:35 ` [FFmpeg-devel] [PATCH 1/5] libavcodec: aarch64: Don't clobber v8 in the h%4 case in ff_pix_abs16_xy2_neon Swinney, Jonathan
4 siblings, 1 reply; 20+ messages in thread
From: Martin Storsjö @ 2022-07-13 20:48 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: Jonathan Swinney
The max height is 16; the max difference per pixel is 255, and
a .8h element can easily contain 16*255, thus keep accumulating
in two .8h vectors, and just do the final accumulation at the
end.
This requires a minor register renumbering in ff_pix_abs16_xy2_neon.
Before: Cortex A53 A72 A73 Graviton 3
pix_abs_0_0_neon: 97.7 47.0 37.5 22.7
pix_abs_0_1_neon: 154.0 59.0 52.0 25.0
pix_abs_0_3_neon: 179.7 96.7 87.5 41.2
After:
pix_abs_0_0_neon: 96.0 39.2 31.2 22.0
pix_abs_0_1_neon: 150.7 59.7 46.2 23.7
pix_abs_0_3_neon: 175.7 83.7 81.7 38.2
---
I remember suggesting this before, and having it dismissed for
some reason I don't remember - maybe that the element size wasn't
big enough to hold the intermediate results? At least as far as I
can see, it can hold the results just fine, and this passes all
tests.
---
libavcodec/aarch64/me_cmp_neon.S | 102 +++++++++++++++----------------
1 file changed, 49 insertions(+), 53 deletions(-)
diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S
index 89546869fb..cda7ce0408 100644
--- a/libavcodec/aarch64/me_cmp_neon.S
+++ b/libavcodec/aarch64/me_cmp_neon.S
@@ -27,15 +27,16 @@ function ff_pix_abs16_neon, export=1
// x3 ptrdiff_t stride
// w4 int h
cmp w4, #4 // if h < 4, jump to completion section
- movi v18.4S, #0 // clear result accumulator
+ movi v16.8h, #0 // clear result accumulator
+ movi v17.8h, #0 // clear result accumulator
b.lt 2f
1:
ld1 {v0.16b}, [x1], x3 // load pix1
ld1 {v4.16b}, [x2], x3 // load pix2
ld1 {v1.16b}, [x1], x3 // load pix1
ld1 {v5.16b}, [x2], x3 // load pix2
- uabdl v16.8h, v0.8b, v4.8b // absolute difference accumulate
- uabdl2 v17.8h, v0.16b, v4.16b
+ uabal v16.8h, v0.8b, v4.8b // absolute difference accumulate
+ uabal2 v17.8h, v0.16b, v4.16b
ld1 {v2.16b}, [x1], x3 // load pix1
ld1 {v6.16b}, [x2], x3 // load pix2
uabal v16.8h, v1.8b, v5.8b // absolute difference accumulate
@@ -48,27 +49,26 @@ function ff_pix_abs16_neon, export=1
uabal v16.8h, v3.8b, v7.8b
uabal2 v17.8h, v3.16b, v7.16b
cmp w4, #4 // if h >= 4, loop
- add v16.8h, v16.8h, v17.8h
- uaddlv s16, v16.8h // add up everything in v16 accumulator
- add d18, d16, d18 // add to the end result register
b.ge 1b
cbnz w4, 2f // if iterations remain, jump to completion section
- fmov w0, s18 // copy result to general purpose register
+ add v16.8h, v16.8h, v17.8h
+ uaddlv s16, v16.8h // add up everything in v16 accumulator
+ fmov w0, s16 // copy result to general purpose register
ret
2:
ld1 {v0.16b}, [x1], x3 // load pix1
ld1 {v4.16b}, [x2], x3 // load pix2
- uabdl v16.8h, v0.8b, v4.8b // absolute difference accumulate
- uabal2 v16.8h, v0.16b, v4.16b
subs w4, w4, #1 // h -= 1
- addv h16, v16.8h // add up v16
- add d18, d16, d18 // add to result
+ uabal v16.8h, v0.8b, v4.8b // absolute difference accumulate
+ uabal2 v17.8h, v0.16b, v4.16b
b.ne 2b
- fmov w0, s18 // copy result to general purpose register
+ add v16.8h, v16.8h, v17.8h
+ uaddlv s16, v16.8h // add up everything in v16 accumulator
+ fmov w0, s16 // copy result to general purpose register
ret
endfunc
@@ -80,7 +80,8 @@ function ff_pix_abs16_xy2_neon, export=1
// w4 int h
add x5, x2, x3 // use x5 to hold uint8_t *pix3
- movi v0.2d, #0 // initialize the result register
+ movi v21.8h, #0 // initialize the result register
+ movi v22.8h, #0 // initialize the result register
// Load initial pix2 values for either the unrolled version or completion version.
ldur q4, [x2, #1] // load pix2+1
@@ -119,15 +120,15 @@ function ff_pix_abs16_xy2_neon, export=1
uaddl v2.8h, v6.8b, v7.8b // pix3 + pix3+1 0..7
uaddl2 v3.8h, v6.16b, v7.16b // pix3 + pix3+1 8..15
- ldur q22, [x5, #1] // load pix3+1
+ ldur q7, [x5, #1] // load pix3+1
add v26.8h, v30.8h, v2.8h // add up 0..7, using pix2 + pix2+1 values from pix3 above
add v27.8h, v31.8h, v3.8h // add up 8..15, using pix2 + pix2+1 values from pix3 above
- uabdl v24.8h, v1.8b, v23.8b // absolute difference 0..7, i=0
- uabdl2 v23.8h, v1.16b, v23.16b // absolute difference 8..15, i=0
+ uabal v21.8h, v1.8b, v23.8b // absolute difference 0..7, i=0
+ uabal2 v22.8h, v1.16b, v23.16b // absolute difference 8..15, i=0
- ld1 {v21.16b}, [x5], x3 // load pix3
+ ld1 {v6.16b}, [x5], x3 // load pix3
ld1 {v20.16b}, [x1], x3 // load pix1
rshrn v26.8b, v26.8h, #2 // shift right 2 0..7 (rounding shift right)
@@ -140,33 +141,33 @@ function ff_pix_abs16_xy2_neon, export=1
rshrn v28.8b, v28.8h, #2 // shift right 2 0..7 (rounding shift right)
rshrn2 v28.16b, v29.8h, #2 // shift right 2 8..15
- uabal v24.8h, v16.8b, v26.8b // absolute difference 0..7, i=1
- uabal2 v23.8h, v16.16b, v26.16b // absolute difference 8..15, i=1
+ uabal v21.8h, v16.8b, v26.8b // absolute difference 0..7, i=1
+ uabal2 v22.8h, v16.16b, v26.16b // absolute difference 8..15, i=1
- uaddl v2.8h, v21.8b, v22.8b // pix3 + pix3+1 0..7
- uaddl2 v3.8h, v21.16b, v22.16b // pix3 + pix3+1 8..15
+ uaddl v2.8h, v6.8b, v7.8b // pix3 + pix3+1 0..7
+ uaddl2 v3.8h, v6.16b, v7.16b // pix3 + pix3+1 8..15
add v30.8h, v4.8h, v2.8h // add up 0..7, using pix2 + pix2+1 values from pix3 above
add v31.8h, v5.8h, v3.8h // add up 8..15, using pix2 + pix2+1 values from pix3 above
rshrn v30.8b, v30.8h, #2 // shift right 2 0..7 (rounding shift right)
rshrn2 v30.16b, v31.8h, #2 // shift right 2 8..15
- uabal v24.8h, v17.8b, v28.8b // absolute difference 0..7, i=2
- uabal2 v23.8h, v17.16b, v28.16b // absolute difference 8..15, i=2
-
sub w4, w4, #4 // h -= 4
- uabal v24.8h, v20.8b, v30.8b // absolute difference 0..7, i=3
- uabal2 v23.8h, v20.16b, v30.16b // absolute difference 8..15, i=3
+ uabal v21.8h, v17.8b, v28.8b // absolute difference 0..7, i=2
+ uabal2 v22.8h, v17.16b, v28.16b // absolute difference 8..15, i=2
cmp w4, #4 // loop if h >= 4
- add v4.8h, v23.8h, v24.8h
- uaddlv s4, v4.8h // finish adding up accumulated values
- add d0, d0, d4 // add the value to the top level accumulator
+
+ uabal v21.8h, v20.8b, v30.8b // absolute difference 0..7, i=3
+ uabal2 v22.8h, v20.16b, v30.16b // absolute difference 8..15, i=3
b.ge 1b
cbnz w4, 2f // if iterations remain jump to completion section
+ add v4.8h, v21.8h, v22.8h
+ uaddlv s0, v4.8h // finish adding up accumulated values
+
fmov w0, s0 // copy result to general purpose register
ret
2:
@@ -182,20 +183,18 @@ function ff_pix_abs16_xy2_neon, export=1
add v16.8h, v2.8h, v18.8h // add up 0..7, using pix2 + pix2+1 values from previous iteration
add v17.8h, v3.8h, v19.8h // add up 8..15, using pix2 + pix2+1 values from previous iteration
// divide by 4 to compute the average of values summed above
- urshr v16.8h, v16.8h, #2 // shift right by 2 0..7 (rounding shift right)
- urshr v17.8h, v17.8h, #2 // shift right by 2 8..15
-
- uxtl2 v7.8h, v1.16b // 8->16 bits pix1 8..15
- uxtl v1.8h, v1.8b // 8->16 bits pix1 0..7
+ rshrn v16.8b, v16.8h, #2 // shift right by 2 0..7 (rounding shift right)
+ rshrn2 v16.16b, v17.8h, #2 // shift right by 2 8..15
- uabd v6.8h, v1.8h, v16.8h // absolute difference 0..7
- uaba v6.8h, v7.8h, v17.8h // absolute difference accumulate 8..15
+ uabal v21.8h, v1.8b, v16.8b // absolute difference 0..7
+ uabal2 v22.8h, v1.16b, v16.16b // absolute difference accumulate 8..15
mov v2.16b, v18.16b // pix3 -> pix2
mov v3.16b, v19.16b // pix3+1 -> pix2+1
- uaddlv s6, v6.8h // add up accumulator in v6
- add d0, d0, d6 // add to the final result
b.ne 2b // loop if h > 0
+
+ add v4.8h, v21.8h, v22.8h
+ uaddlv s0, v4.8h // finish adding up accumulated values
fmov w0, s0 // copy result to general purpose register
ret
endfunc
@@ -209,7 +208,8 @@ function ff_pix_abs16_x2_neon, export=1
cmp w4, #4
// initialize buffers
- movi d20, #0
+ movi v16.8h, #0
+ movi v17.8h, #0
add x5, x2, #1 // pix2 + 1
b.lt 2f
@@ -224,9 +224,9 @@ function ff_pix_abs16_x2_neon, export=1
ld1 {v2.16b}, [x5], x3
urhadd v30.16b, v1.16b, v2.16b
ld1 {v0.16b}, [x1], x3
- uabdl v16.8h, v0.8b, v30.8b
+ uabal v16.8h, v0.8b, v30.8b
ld1 {v4.16b}, [x2], x3
- uabdl2 v17.8h, v0.16b, v30.16b
+ uabal2 v17.8h, v0.16b, v30.16b
ld1 {v5.16b}, [x5], x3
urhadd v29.16b, v4.16b, v5.16b
ld1 {v3.16b}, [x1], x3
@@ -238,20 +238,15 @@ function ff_pix_abs16_x2_neon, export=1
ld1 {v6.16b}, [x1], x3
uabal v16.8h, v6.8b, v28.8b
ld1 {v24.16b}, [x2], x3
+ sub w4, w4, #4
uabal2 v17.8h, v6.16b, v28.16b
ld1 {v25.16b}, [x5], x3
urhadd v27.16b, v24.16b, v25.16b
ld1 {v23.16b}, [x1], x3
+ cmp w4, #4
uabal v16.8h, v23.8b, v27.8b
uabal2 v17.8h, v23.16b, v27.16b
- sub w4, w4, #4
-
- add v16.8h, v16.8h, v17.8h
- uaddlv s16, v16.8h
- cmp w4, #4
- add d20, d20, d16
-
b.ge 1b
cbz w4, 3f
@@ -259,18 +254,19 @@ function ff_pix_abs16_x2_neon, export=1
2:
ld1 {v1.16b}, [x2], x3
ld1 {v2.16b}, [x5], x3
+ subs w4, w4, #1
urhadd v29.16b, v1.16b, v2.16b
ld1 {v0.16b}, [x1], x3
- uabd v28.16b, v0.16b, v29.16b
+ uabal v16.8h, v0.8b, v29.8b
+ uabal2 v17.8h, v0.16b, v29.16b
- uaddlv h28, v28.16b
- subs w4, w4, #1
- add d20, d20, d28
b.ne 2b
3:
- fmov w0, s20
+ add v16.8h, v16.8h, v17.8h
+ uaddlv s16, v16.8h
+ fmov w0, s16
ret
endfunc
--
2.25.1
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [FFmpeg-devel] [PATCH 5/5] aarch64: me_cmp: Don't do uaddlv once per iteration
2022-07-13 20:48 ` [FFmpeg-devel] [PATCH 5/5] aarch64: me_cmp: Don't do uaddlv once per iteration Martin Storsjö
@ 2022-07-15 19:32 ` Swinney, Jonathan
2022-07-15 19:56 ` Martin Storsjö
0 siblings, 1 reply; 20+ messages in thread
From: Swinney, Jonathan @ 2022-07-15 19:32 UTC (permalink / raw)
To: Martin Storsjö, ffmpeg-devel
If the max height is just 16, then this should be fine. I assumed that h could have a much higher value (>1024), but if that is not the case, then this is a useful optimization.
Thanks!
--
Jonathan Swinney
On 7/13/22, 3:49 PM, "Martin Storsjö" <martin@martin.st> wrote:
CAUTION: This email originated from outside of the organization. Do not click links or open attachments unless you can confirm the sender and know the content is safe.
The max height is 16; the max difference per pixel is 255, and
a .8h element can easily contain 16*255, thus keep accumulating
in two .8h vectors, and just do the final accumulation at the
end.
This requires a minor register renumbering in ff_pix_abs16_xy2_neon.
Before: Cortex A53 A72 A73 Graviton 3
pix_abs_0_0_neon: 97.7 47.0 37.5 22.7
pix_abs_0_1_neon: 154.0 59.0 52.0 25.0
pix_abs_0_3_neon: 179.7 96.7 87.5 41.2
After:
pix_abs_0_0_neon: 96.0 39.2 31.2 22.0
pix_abs_0_1_neon: 150.7 59.7 46.2 23.7
pix_abs_0_3_neon: 175.7 83.7 81.7 38.2
---
I remember suggesting this before, and having it dismissed for
some reason I don't remember - maybe that the element size wasn't
big enough to hold the intermediate results? At least as far as I
can see, it can hold the results just fine, and this passes all
tests.
---
libavcodec/aarch64/me_cmp_neon.S | 102 +++++++++++++++----------------
1 file changed, 49 insertions(+), 53 deletions(-)
diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S
index 89546869fb..cda7ce0408 100644
--- a/libavcodec/aarch64/me_cmp_neon.S
+++ b/libavcodec/aarch64/me_cmp_neon.S
@@ -27,15 +27,16 @@ function ff_pix_abs16_neon, export=1
// x3 ptrdiff_t stride
// w4 int h
cmp w4, #4 // if h < 4, jump to completion section
- movi v18.4S, #0 // clear result accumulator
+ movi v16.8h, #0 // clear result accumulator
+ movi v17.8h, #0 // clear result accumulator
b.lt 2f
1:
ld1 {v0.16b}, [x1], x3 // load pix1
ld1 {v4.16b}, [x2], x3 // load pix2
ld1 {v1.16b}, [x1], x3 // load pix1
ld1 {v5.16b}, [x2], x3 // load pix2
- uabdl v16.8h, v0.8b, v4.8b // absolute difference accumulate
- uabdl2 v17.8h, v0.16b, v4.16b
+ uabal v16.8h, v0.8b, v4.8b // absolute difference accumulate
+ uabal2 v17.8h, v0.16b, v4.16b
ld1 {v2.16b}, [x1], x3 // load pix1
ld1 {v6.16b}, [x2], x3 // load pix2
uabal v16.8h, v1.8b, v5.8b // absolute difference accumulate
@@ -48,27 +49,26 @@ function ff_pix_abs16_neon, export=1
uabal v16.8h, v3.8b, v7.8b
uabal2 v17.8h, v3.16b, v7.16b
cmp w4, #4 // if h >= 4, loop
- add v16.8h, v16.8h, v17.8h
- uaddlv s16, v16.8h // add up everything in v16 accumulator
- add d18, d16, d18 // add to the end result register
b.ge 1b
cbnz w4, 2f // if iterations remain, jump to completion section
- fmov w0, s18 // copy result to general purpose register
+ add v16.8h, v16.8h, v17.8h
+ uaddlv s16, v16.8h // add up everything in v16 accumulator
+ fmov w0, s16 // copy result to general purpose register
ret
2:
ld1 {v0.16b}, [x1], x3 // load pix1
ld1 {v4.16b}, [x2], x3 // load pix2
- uabdl v16.8h, v0.8b, v4.8b // absolute difference accumulate
- uabal2 v16.8h, v0.16b, v4.16b
subs w4, w4, #1 // h -= 1
- addv h16, v16.8h // add up v16
- add d18, d16, d18 // add to result
+ uabal v16.8h, v0.8b, v4.8b // absolute difference accumulate
+ uabal2 v17.8h, v0.16b, v4.16b
b.ne 2b
- fmov w0, s18 // copy result to general purpose register
+ add v16.8h, v16.8h, v17.8h
+ uaddlv s16, v16.8h // add up everything in v16 accumulator
+ fmov w0, s16 // copy result to general purpose register
ret
endfunc
@@ -80,7 +80,8 @@ function ff_pix_abs16_xy2_neon, export=1
// w4 int h
add x5, x2, x3 // use x5 to hold uint8_t *pix3
- movi v0.2d, #0 // initialize the result register
+ movi v21.8h, #0 // initialize the result register
+ movi v22.8h, #0 // initialize the result register
// Load initial pix2 values for either the unrolled version or completion version.
ldur q4, [x2, #1] // load pix2+1
@@ -119,15 +120,15 @@ function ff_pix_abs16_xy2_neon, export=1
uaddl v2.8h, v6.8b, v7.8b // pix3 + pix3+1 0..7
uaddl2 v3.8h, v6.16b, v7.16b // pix3 + pix3+1 8..15
- ldur q22, [x5, #1] // load pix3+1
+ ldur q7, [x5, #1] // load pix3+1
add v26.8h, v30.8h, v2.8h // add up 0..7, using pix2 + pix2+1 values from pix3 above
add v27.8h, v31.8h, v3.8h // add up 8..15, using pix2 + pix2+1 values from pix3 above
- uabdl v24.8h, v1.8b, v23.8b // absolute difference 0..7, i=0
- uabdl2 v23.8h, v1.16b, v23.16b // absolute difference 8..15, i=0
+ uabal v21.8h, v1.8b, v23.8b // absolute difference 0..7, i=0
+ uabal2 v22.8h, v1.16b, v23.16b // absolute difference 8..15, i=0
- ld1 {v21.16b}, [x5], x3 // load pix3
+ ld1 {v6.16b}, [x5], x3 // load pix3
ld1 {v20.16b}, [x1], x3 // load pix1
rshrn v26.8b, v26.8h, #2 // shift right 2 0..7 (rounding shift right)
@@ -140,33 +141,33 @@ function ff_pix_abs16_xy2_neon, export=1
rshrn v28.8b, v28.8h, #2 // shift right 2 0..7 (rounding shift right)
rshrn2 v28.16b, v29.8h, #2 // shift right 2 8..15
- uabal v24.8h, v16.8b, v26.8b // absolute difference 0..7, i=1
- uabal2 v23.8h, v16.16b, v26.16b // absolute difference 8..15, i=1
+ uabal v21.8h, v16.8b, v26.8b // absolute difference 0..7, i=1
+ uabal2 v22.8h, v16.16b, v26.16b // absolute difference 8..15, i=1
- uaddl v2.8h, v21.8b, v22.8b // pix3 + pix3+1 0..7
- uaddl2 v3.8h, v21.16b, v22.16b // pix3 + pix3+1 8..15
+ uaddl v2.8h, v6.8b, v7.8b // pix3 + pix3+1 0..7
+ uaddl2 v3.8h, v6.16b, v7.16b // pix3 + pix3+1 8..15
add v30.8h, v4.8h, v2.8h // add up 0..7, using pix2 + pix2+1 values from pix3 above
add v31.8h, v5.8h, v3.8h // add up 8..15, using pix2 + pix2+1 values from pix3 above
rshrn v30.8b, v30.8h, #2 // shift right 2 0..7 (rounding shift right)
rshrn2 v30.16b, v31.8h, #2 // shift right 2 8..15
- uabal v24.8h, v17.8b, v28.8b // absolute difference 0..7, i=2
- uabal2 v23.8h, v17.16b, v28.16b // absolute difference 8..15, i=2
-
sub w4, w4, #4 // h -= 4
- uabal v24.8h, v20.8b, v30.8b // absolute difference 0..7, i=3
- uabal2 v23.8h, v20.16b, v30.16b // absolute difference 8..15, i=3
+ uabal v21.8h, v17.8b, v28.8b // absolute difference 0..7, i=2
+ uabal2 v22.8h, v17.16b, v28.16b // absolute difference 8..15, i=2
cmp w4, #4 // loop if h >= 4
- add v4.8h, v23.8h, v24.8h
- uaddlv s4, v4.8h // finish adding up accumulated values
- add d0, d0, d4 // add the value to the top level accumulator
+
+ uabal v21.8h, v20.8b, v30.8b // absolute difference 0..7, i=3
+ uabal2 v22.8h, v20.16b, v30.16b // absolute difference 8..15, i=3
b.ge 1b
cbnz w4, 2f // if iterations remain jump to completion section
+ add v4.8h, v21.8h, v22.8h
+ uaddlv s0, v4.8h // finish adding up accumulated values
+
fmov w0, s0 // copy result to general purpose register
ret
2:
@@ -182,20 +183,18 @@ function ff_pix_abs16_xy2_neon, export=1
add v16.8h, v2.8h, v18.8h // add up 0..7, using pix2 + pix2+1 values from previous iteration
add v17.8h, v3.8h, v19.8h // add up 8..15, using pix2 + pix2+1 values from previous iteration
// divide by 4 to compute the average of values summed above
- urshr v16.8h, v16.8h, #2 // shift right by 2 0..7 (rounding shift right)
- urshr v17.8h, v17.8h, #2 // shift right by 2 8..15
-
- uxtl2 v7.8h, v1.16b // 8->16 bits pix1 8..15
- uxtl v1.8h, v1.8b // 8->16 bits pix1 0..7
+ rshrn v16.8b, v16.8h, #2 // shift right by 2 0..7 (rounding shift right)
+ rshrn2 v16.16b, v17.8h, #2 // shift right by 2 8..15
- uabd v6.8h, v1.8h, v16.8h // absolute difference 0..7
- uaba v6.8h, v7.8h, v17.8h // absolute difference accumulate 8..15
+ uabal v21.8h, v1.8b, v16.8b // absolute difference 0..7
+ uabal2 v22.8h, v1.16b, v16.16b // absolute difference accumulate 8..15
mov v2.16b, v18.16b // pix3 -> pix2
mov v3.16b, v19.16b // pix3+1 -> pix2+1
- uaddlv s6, v6.8h // add up accumulator in v6
- add d0, d0, d6 // add to the final result
b.ne 2b // loop if h > 0
+
+ add v4.8h, v21.8h, v22.8h
+ uaddlv s0, v4.8h // finish adding up accumulated values
fmov w0, s0 // copy result to general purpose register
ret
endfunc
@@ -209,7 +208,8 @@ function ff_pix_abs16_x2_neon, export=1
cmp w4, #4
// initialize buffers
- movi d20, #0
+ movi v16.8h, #0
+ movi v17.8h, #0
add x5, x2, #1 // pix2 + 1
b.lt 2f
@@ -224,9 +224,9 @@ function ff_pix_abs16_x2_neon, export=1
ld1 {v2.16b}, [x5], x3
urhadd v30.16b, v1.16b, v2.16b
ld1 {v0.16b}, [x1], x3
- uabdl v16.8h, v0.8b, v30.8b
+ uabal v16.8h, v0.8b, v30.8b
ld1 {v4.16b}, [x2], x3
- uabdl2 v17.8h, v0.16b, v30.16b
+ uabal2 v17.8h, v0.16b, v30.16b
ld1 {v5.16b}, [x5], x3
urhadd v29.16b, v4.16b, v5.16b
ld1 {v3.16b}, [x1], x3
@@ -238,20 +238,15 @@ function ff_pix_abs16_x2_neon, export=1
ld1 {v6.16b}, [x1], x3
uabal v16.8h, v6.8b, v28.8b
ld1 {v24.16b}, [x2], x3
+ sub w4, w4, #4
uabal2 v17.8h, v6.16b, v28.16b
ld1 {v25.16b}, [x5], x3
urhadd v27.16b, v24.16b, v25.16b
ld1 {v23.16b}, [x1], x3
+ cmp w4, #4
uabal v16.8h, v23.8b, v27.8b
uabal2 v17.8h, v23.16b, v27.16b
- sub w4, w4, #4
-
- add v16.8h, v16.8h, v17.8h
- uaddlv s16, v16.8h
- cmp w4, #4
- add d20, d20, d16
-
b.ge 1b
cbz w4, 3f
@@ -259,18 +254,19 @@ function ff_pix_abs16_x2_neon, export=1
2:
ld1 {v1.16b}, [x2], x3
ld1 {v2.16b}, [x5], x3
+ subs w4, w4, #1
urhadd v29.16b, v1.16b, v2.16b
ld1 {v0.16b}, [x1], x3
- uabd v28.16b, v0.16b, v29.16b
+ uabal v16.8h, v0.8b, v29.8b
+ uabal2 v17.8h, v0.16b, v29.16b
- uaddlv h28, v28.16b
- subs w4, w4, #1
- add d20, d20, d28
b.ne 2b
3:
- fmov w0, s20
+ add v16.8h, v16.8h, v17.8h
+ uaddlv s16, v16.8h
+ fmov w0, s16
ret
endfunc
--
2.25.1
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [FFmpeg-devel] [PATCH 3/5] aarch64: me_cmp: Interleave some of the loads in ff_pix_abs16_xy2_neon
2022-07-13 20:48 ` [FFmpeg-devel] [PATCH 3/5] aarch64: me_cmp: Interleave some of the loads in ff_pix_abs16_xy2_neon Martin Storsjö
@ 2022-07-15 19:34 ` Swinney, Jonathan
0 siblings, 0 replies; 20+ messages in thread
From: Swinney, Jonathan @ 2022-07-15 19:34 UTC (permalink / raw)
To: FFmpeg development discussions and patches; +Cc: Martin Storsjö
This looks good to me. As you can see it makes very little difference on Graviton 3, which is why I had not already done it. :)
--
Jonathan Swinney
On 7/13/22, 3:49 PM, "ffmpeg-devel on behalf of Martin Storsjö" <ffmpeg-devel-bounces@ffmpeg.org on behalf of martin@martin.st> wrote:
CAUTION: This email originated from outside of the organization. Do not click links or open attachments unless you can confirm the sender and know the content is safe.
Before: Cortex A53 A72 A73 Graviton 3
pix_abs_0_3_neon: 183.7 112.7 97.5 41.2
After:
pix_abs_0_3_neon: 175.7 109.2 92.0 41.2
---
libavcodec/aarch64/me_cmp_neon.S | 22 ++++++++++++++--------
1 file changed, 14 insertions(+), 8 deletions(-)
diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S
index 31db3793d9..0ae23d8922 100644
--- a/libavcodec/aarch64/me_cmp_neon.S
+++ b/libavcodec/aarch64/me_cmp_neon.S
@@ -101,26 +101,32 @@ function ff_pix_abs16_xy2_neon, export=1
ld1 {v6.16b}, [x5], x3 // load pix3
ld1 {v16.16b}, [x1], x3 // load pix1
- ldur q19, [x5, #1] // load pix3+1
- ld1 {v18.16b}, [x5], x3 // load pix3
- ld1 {v17.16b}, [x1], x3 // load pix1
-
- ldur q22, [x5, #1] // load pix3+1
- ld1 {v21.16b}, [x5], x3 // load pix3
- ld1 {v20.16b}, [x1], x3 // load pix1
-
// These blocks compute the average: avg(pix2[n], pix2[n+1], pix3[n], pix3[n+1])
uaddl v30.8h, v4.8b, v5.8b // pix3 + pix3+1 0..7
uaddl2 v31.8h, v4.16b, v5.16b // pix3 + pix3+1 8..15
+
+ ldur q19, [x5, #1] // load pix3+1
+
add v23.8h, v2.8h, v30.8h // add up 0..7, using pix2 + pix2+1 values from previous iteration
add v24.8h, v3.8h, v31.8h // add up 8..15, using pix2 + pix2+1 values from previous iteration
+
+ ld1 {v18.16b}, [x5], x3 // load pix3
+ ld1 {v17.16b}, [x1], x3 // load pix1
+
rshrn v23.8b, v23.8h, #2 // shift right 2 0..7 (rounding shift right)
rshrn2 v23.16b, v24.8h, #2 // shift right 2 8..15
uaddl v2.8h, v6.8b, v7.8b // pix3 + pix3+1 0..7
uaddl2 v3.8h, v6.16b, v7.16b // pix3 + pix3+1 8..15
+
+ ldur q22, [x5, #1] // load pix3+1
+
add v26.8h, v30.8h, v2.8h // add up 0..7, using pix2 + pix2+1 values from pix3 above
add v27.8h, v31.8h, v3.8h // add up 8..15, using pix2 + pix2+1 values from pix3 above
+
+ ld1 {v21.16b}, [x5], x3 // load pix3
+ ld1 {v20.16b}, [x1], x3 // load pix1
+
rshrn v26.8b, v26.8h, #2 // shift right 2 0..7 (rounding shift right)
rshrn2 v26.16b, v27.8h, #2 // shift right 2 8..15
--
2.25.1
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [FFmpeg-devel] [PATCH 1/5] libavcodec: aarch64: Don't clobber v8 in the h%4 case in ff_pix_abs16_xy2_neon
2022-07-13 20:48 [FFmpeg-devel] [PATCH 1/5] libavcodec: aarch64: Don't clobber v8 in the h%4 case in ff_pix_abs16_xy2_neon Martin Storsjö
` (3 preceding siblings ...)
2022-07-13 20:48 ` [FFmpeg-devel] [PATCH 5/5] aarch64: me_cmp: Don't do uaddlv once per iteration Martin Storsjö
@ 2022-07-15 19:35 ` Swinney, Jonathan
4 siblings, 0 replies; 20+ messages in thread
From: Swinney, Jonathan @ 2022-07-15 19:35 UTC (permalink / raw)
To: Martin Storsjö, ffmpeg-devel
LGTM.
--
Jonathan Swinney
On 7/13/22, 3:49 PM, "Martin Storsjö" <martin@martin.st> wrote:
CAUTION: This email originated from outside of the organization. Do not click links or open attachments unless you can confirm the sender and know the content is safe.
Checkasm doesn't currently test this codepath.
---
libavcodec/aarch64/me_cmp_neon.S | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S
index e49d049fc2..31db3793d9 100644
--- a/libavcodec/aarch64/me_cmp_neon.S
+++ b/libavcodec/aarch64/me_cmp_neon.S
@@ -189,11 +189,11 @@ function ff_pix_abs16_xy2_neon, export=1
urshr v16.8h, v16.8h, #2 // shift right by 2 0..7 (rounding shift right)
urshr v17.8h, v17.8h, #2 // shift right by 2 8..15
- uxtl2 v8.8h, v1.16b // 8->16 bits pix1 8..15
+ uxtl2 v7.8h, v1.16b // 8->16 bits pix1 8..15
uxtl v1.8h, v1.8b // 8->16 bits pix1 0..7
uabd v6.8h, v1.8h, v16.8h // absolute difference 0..7
- uaba v6.8h, v8.8h, v17.8h // absolute difference accumulate 8..15
+ uaba v6.8h, v7.8h, v17.8h // absolute difference accumulate 8..15
mov v2.16b, v18.16b // pix3 -> pix2
mov v3.16b, v19.16b // pix3+1 -> pix2+1
uaddlv s6, v6.8h // add up accumulator in v6
--
2.25.1
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [FFmpeg-devel] [PATCH 2/5] checkasm: motion: Make the benchmarks more stable
2022-07-13 20:48 ` [FFmpeg-devel] [PATCH 2/5] checkasm: motion: Make the benchmarks more stable Martin Storsjö
@ 2022-07-15 19:35 ` Swinney, Jonathan
0 siblings, 0 replies; 20+ messages in thread
From: Swinney, Jonathan @ 2022-07-15 19:35 UTC (permalink / raw)
To: Martin Storsjö, ffmpeg-devel
LGTM.
--
Jonathan Swinney
On 7/13/22, 3:49 PM, "Martin Storsjö" <martin@martin.st> wrote:
CAUTION: This email originated from outside of the organization. Do not click links or open attachments unless you can confirm the sender and know the content is safe.
Don't use the last random offset, but a static one.
---
tests/checkasm/motion.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/tests/checkasm/motion.c b/tests/checkasm/motion.c
index 79e4358941..87b20d1c10 100644
--- a/tests/checkasm/motion.c
+++ b/tests/checkasm/motion.c
@@ -81,7 +81,8 @@ static void test_motion(const char *name, me_cmp_func test_func)
break;
}
}
- // benchmark with the final value of ptr
+ // Test with a fixed offset, for benchmark stability
+ ptr = img2 + 3 * WIDTH + 3;
bench_new(NULL, img1, ptr, WIDTH, 8);
}
}
--
2.25.1
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [FFmpeg-devel] [PATCH 4/5] aarch64: me_cmp: Switch from uabd to uabal in ff_pix_abs16_xy2_neon
2022-07-13 20:48 ` [FFmpeg-devel] [PATCH 4/5] aarch64: me_cmp: Switch from uabd to uabal " Martin Storsjö
@ 2022-07-15 19:38 ` Swinney, Jonathan
0 siblings, 0 replies; 20+ messages in thread
From: Swinney, Jonathan @ 2022-07-15 19:38 UTC (permalink / raw)
To: FFmpeg development discussions and patches, Martin Storsjö
LGTM.
--
Jonathan Swinney
On 7/13/22, 3:49 PM, "ffmpeg-devel on behalf of Martin Storsjö" <ffmpeg-devel-bounces@ffmpeg.org on behalf of martin@martin.st> wrote:
CAUTION: This email originated from outside of the organization. Do not click links or open attachments unless you can confirm the sender and know the content is safe.
Using absolute-difference-accumulate does use twice the amount of
absolute-difference instructions, but avoids the need for the
uaddl and add instructions, reducing the total number of instructions
by 3.
These can be interleaved in the rest of the calculation, to avoid
tight dependencies at the end. Unfortunately, this is marginally
slower on Cortex A53, but faster on A72 and A73.
Before: Cortex A53 A72 A73 Graviton 3
pix_abs_0_3_neon: 175.7 109.2 92.0 41.2
After:
pix_abs_0_3_neon: 179.7 96.7 87.5 41.2
---
libavcodec/aarch64/me_cmp_neon.S | 32 +++++++++++---------------------
1 file changed, 11 insertions(+), 21 deletions(-)
diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S
index 0ae23d8922..89546869fb 100644
--- a/libavcodec/aarch64/me_cmp_neon.S
+++ b/libavcodec/aarch64/me_cmp_neon.S
@@ -124,6 +124,9 @@ function ff_pix_abs16_xy2_neon, export=1
add v26.8h, v30.8h, v2.8h // add up 0..7, using pix2 + pix2+1 values from pix3 above
add v27.8h, v31.8h, v3.8h // add up 8..15, using pix2 + pix2+1 values from pix3 above
+ uabdl v24.8h, v1.8b, v23.8b // absolute difference 0..7, i=0
+ uabdl2 v23.8h, v1.16b, v23.16b // absolute difference 8..15, i=0
+
ld1 {v21.16b}, [x5], x3 // load pix3
ld1 {v20.16b}, [x1], x3 // load pix1
@@ -137,6 +140,9 @@ function ff_pix_abs16_xy2_neon, export=1
rshrn v28.8b, v28.8h, #2 // shift right 2 0..7 (rounding shift right)
rshrn2 v28.16b, v29.8h, #2 // shift right 2 8..15
+ uabal v24.8h, v16.8b, v26.8b // absolute difference 0..7, i=1
+ uabal2 v23.8h, v16.16b, v26.16b // absolute difference 8..15, i=1
+
uaddl v2.8h, v21.8b, v22.8b // pix3 + pix3+1 0..7
uaddl2 v3.8h, v21.16b, v22.16b // pix3 + pix3+1 8..15
add v30.8h, v4.8h, v2.8h // add up 0..7, using pix2 + pix2+1 values from pix3 above
@@ -144,33 +150,17 @@ function ff_pix_abs16_xy2_neon, export=1
rshrn v30.8b, v30.8h, #2 // shift right 2 0..7 (rounding shift right)
rshrn2 v30.16b, v31.8h, #2 // shift right 2 8..15
- // Averages are now stored in these registers:
- // v23, v16, v28, v30
- // pix1 values in these registers:
- // v1, v16, v17, v20
- // available:
- // v4, v5, v7, v18, v19, v24, v25, v27, v29, v31
+ uabal v24.8h, v17.8b, v28.8b // absolute difference 0..7, i=2
+ uabal2 v23.8h, v17.16b, v28.16b // absolute difference 8..15, i=2
sub w4, w4, #4 // h -= 4
- // Using absolute-difference instructions instead of absolute-difference-accumulate allows
- // us to keep the results in 16b vectors instead of widening values with twice the instructions.
- // This approach also has fewer data dependencies, allowing better instruction level parallelism.
- uabd v4.16b, v1.16b, v23.16b // absolute difference 0..15, i=0
- uabd v5.16b, v16.16b, v26.16b // absolute difference 0..15, i=1
- uabd v6.16b, v17.16b, v28.16b // absolute difference 0..15, i=2
- uabd v7.16b, v20.16b, v30.16b // absolute difference 0..15, i=3
+ uabal v24.8h, v20.8b, v30.8b // absolute difference 0..7, i=3
+ uabal2 v23.8h, v20.16b, v30.16b // absolute difference 8..15, i=3
cmp w4, #4 // loop if h >= 4
- // Now add up all the values in each vector, v4-v7 with widening adds
- uaddl v19.8h, v4.8b, v5.8b
- uaddl2 v18.8h, v4.16b, v5.16b
- uaddl v4.8h, v6.8b, v7.8b
- uaddl2 v5.8h, v6.16b, v7.16b
- add v4.8h, v4.8h, v5.8h
- add v4.8h, v4.8h, v18.8h
- add v4.8h, v4.8h, v19.8h
+ add v4.8h, v23.8h, v24.8h
uaddlv s4, v4.8h // finish adding up accumulated values
add d0, d0, d4 // add the value to the top level accumulator
--
2.25.1
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [FFmpeg-devel] [PATCH 5/5] aarch64: me_cmp: Don't do uaddlv once per iteration
2022-07-15 19:32 ` Swinney, Jonathan
@ 2022-07-15 19:56 ` Martin Storsjö
2022-07-15 21:19 ` Michael Niedermayer
0 siblings, 1 reply; 20+ messages in thread
From: Martin Storsjö @ 2022-07-15 19:56 UTC (permalink / raw)
To: Swinney, Jonathan; +Cc: ffmpeg-devel
On Fri, 15 Jul 2022, Swinney, Jonathan wrote:
> If the max height is just 16, then this should be fine. I assumed that h
> could have a much higher value (>1024), but if that is not the case,
> then this is a useful optimization.
At least according to the me_cmp.h header, which says:
/* Motion estimation:
* h is limited to { width / 2, width, 2 * width },
* but never larger than 16 and never smaller than 2.
* Although currently h < 4 is not used as functions with
* width < 8 are neither used nor implemented. */
So with that in mind, I think this should be safe to do.
// Martin
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [FFmpeg-devel] [PATCH 5/5] aarch64: me_cmp: Don't do uaddlv once per iteration
2022-07-15 19:56 ` Martin Storsjö
@ 2022-07-15 21:19 ` Michael Niedermayer
2022-07-15 21:25 ` Martin Storsjö
2022-07-16 9:18 ` Martin Storsjö
0 siblings, 2 replies; 20+ messages in thread
From: Michael Niedermayer @ 2022-07-15 21:19 UTC (permalink / raw)
To: FFmpeg development discussions and patches
[-- Attachment #1.1: Type: text/plain, Size: 1035 bytes --]
On Fri, Jul 15, 2022 at 10:56:03PM +0300, Martin Storsjö wrote:
> On Fri, 15 Jul 2022, Swinney, Jonathan wrote:
>
> > If the max height is just 16, then this should be fine. I assumed that h
> > could have a much higher value (>1024), but if that is not the case,
> > then this is a useful optimization.
>
> At least according to the me_cmp.h header, which says:
>
> /* Motion estimation:
> * h is limited to { width / 2, width, 2 * width },
> * but never larger than 16 and never smaller than 2.
> * Although currently h < 4 is not used as functions with
> * width < 8 are neither used nor implemented. */
These rules where written with support for encoding of all
standard formats in mind at the time that was written.
today it may make sense to extend these rules to cover the
things which where created since then
thx
[...]
--
Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB
Republics decline into democracies and democracies degenerate into
despotisms. -- Aristotle
[-- Attachment #1.2: signature.asc --]
[-- Type: application/pgp-signature, Size: 195 bytes --]
[-- Attachment #2: Type: text/plain, Size: 251 bytes --]
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [FFmpeg-devel] [PATCH 5/5] aarch64: me_cmp: Don't do uaddlv once per iteration
2022-07-15 21:19 ` Michael Niedermayer
@ 2022-07-15 21:25 ` Martin Storsjö
2022-07-16 11:23 ` Michael Niedermayer
2022-07-16 9:18 ` Martin Storsjö
1 sibling, 1 reply; 20+ messages in thread
From: Martin Storsjö @ 2022-07-15 21:25 UTC (permalink / raw)
To: FFmpeg development discussions and patches
On Fri, 15 Jul 2022, Michael Niedermayer wrote:
> On Fri, Jul 15, 2022 at 10:56:03PM +0300, Martin Storsjö wrote:
>> On Fri, 15 Jul 2022, Swinney, Jonathan wrote:
>>
>>> If the max height is just 16, then this should be fine. I assumed that h
>>> could have a much higher value (>1024), but if that is not the case,
>>> then this is a useful optimization.
>>
>> At least according to the me_cmp.h header, which says:
>>
>> /* Motion estimation:
>> * h is limited to { width / 2, width, 2 * width },
>> * but never larger than 16 and never smaller than 2.
>> * Although currently h < 4 is not used as functions with
>> * width < 8 are neither used nor implemented. */
>
> These rules where written with support for encoding of all
> standard formats in mind at the time that was written.
> today it may make sense to extend these rules to cover the
> things which where created since then
Right, but if that suddenly changes, such a change also must expect that
it might need updates to all assembly implementations that implement that
interface currently. Right now, both the defacto case (any callers in the
codebase) and the explicit documentation says that it can't be called with
parameters outside of that range.
Even if it's raised from the current <= 16, this particular optimization
should be fine as long as h <= 256 - which should be fine for at least all
current-gen mainstream codecs since, I think?
// Martin
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [FFmpeg-devel] [PATCH 5/5] aarch64: me_cmp: Don't do uaddlv once per iteration
2022-07-15 21:19 ` Michael Niedermayer
2022-07-15 21:25 ` Martin Storsjö
@ 2022-07-16 9:18 ` Martin Storsjö
1 sibling, 0 replies; 20+ messages in thread
From: Martin Storsjö @ 2022-07-16 9:18 UTC (permalink / raw)
To: FFmpeg development discussions and patches
On Fri, 15 Jul 2022, Michael Niedermayer wrote:
> On Fri, Jul 15, 2022 at 10:56:03PM +0300, Martin Storsjö wrote:
>> On Fri, 15 Jul 2022, Swinney, Jonathan wrote:
>>
>>> If the max height is just 16, then this should be fine. I assumed that h
>>> could have a much higher value (>1024), but if that is not the case,
>>> then this is a useful optimization.
>>
>> At least according to the me_cmp.h header, which says:
>>
>> /* Motion estimation:
>> * h is limited to { width / 2, width, 2 * width },
>> * but never larger than 16 and never smaller than 2.
>> * Although currently h < 4 is not used as functions with
>> * width < 8 are neither used nor implemented. */
>
> These rules where written with support for encoding of all
> standard formats in mind at the time that was written.
> today it may make sense to extend these rules to cover the
> things which where created since then
Also - if extending this, I would expect that you want other widths too.
Right now, most of the functions seem to be arranged such as [0] is w=16
and [0] is w=8. For those, for w=8, it seems to be mostly hardcoded to
only assume h=8, while the w=16 functions actually honor the h parameter.
If it ever would be relevant with h>256, that wouldn't be for the existing
w=8 or w=16 functions, but for newer functions with a larger width too.
So I think this patch is safe (which works for h up to 256), and if
someone wants to extend the interface later, that can be done.
// Martin
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [FFmpeg-devel] [PATCH 5/5] aarch64: me_cmp: Don't do uaddlv once per iteration
2022-07-15 21:25 ` Martin Storsjö
@ 2022-07-16 11:23 ` Michael Niedermayer
2022-07-16 12:30 ` Martin Storsjö
2022-07-16 12:50 ` Ronald S. Bultje
0 siblings, 2 replies; 20+ messages in thread
From: Michael Niedermayer @ 2022-07-16 11:23 UTC (permalink / raw)
To: FFmpeg development discussions and patches
[-- Attachment #1.1: Type: text/plain, Size: 2138 bytes --]
On Sat, Jul 16, 2022 at 12:25:37AM +0300, Martin Storsjö wrote:
> On Fri, 15 Jul 2022, Michael Niedermayer wrote:
>
> > On Fri, Jul 15, 2022 at 10:56:03PM +0300, Martin Storsjö wrote:
> > > On Fri, 15 Jul 2022, Swinney, Jonathan wrote:
> > >
> > > > If the max height is just 16, then this should be fine. I assumed that h
> > > > could have a much higher value (>1024), but if that is not the case,
> > > > then this is a useful optimization.
> > >
> > > At least according to the me_cmp.h header, which says:
> > >
> > > /* Motion estimation:
> > > * h is limited to { width / 2, width, 2 * width },
> > > * but never larger than 16 and never smaller than 2.
> > > * Although currently h < 4 is not used as functions with
> > > * width < 8 are neither used nor implemented. */
> >
> > These rules where written with support for encoding of all
> > standard formats in mind at the time that was written.
> > today it may make sense to extend these rules to cover the
> > things which where created since then
>
> Right, but if that suddenly changes, such a change also must expect that it
> might need updates to all assembly implementations that implement that
> interface currently. Right now, both the defacto case (any callers in the
> codebase) and the explicit documentation says that it can't be called with
> parameters outside of that range.
What i meant was that newly added functions should be more flexible than
these old rules. That is 2 sets of rules
1. What a caller ATM can do and expect to work (thats whats written there)
2. What an implementor of new functions should make sure is supported
>
> Even if it's raised from the current <= 16, this particular optimization
> should be fine as long as h <= 256 - which should be fine for at least all
> current-gen mainstream codecs since, I think?
[...]
--
Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB
Many things microsoft did are stupid, but not doing something just because
microsoft did it is even more stupid. If everything ms did were stupid they
would be bankrupt already.
[-- Attachment #1.2: signature.asc --]
[-- Type: application/pgp-signature, Size: 195 bytes --]
[-- Attachment #2: Type: text/plain, Size: 251 bytes --]
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [FFmpeg-devel] [PATCH 5/5] aarch64: me_cmp: Don't do uaddlv once per iteration
2022-07-16 11:23 ` Michael Niedermayer
@ 2022-07-16 12:30 ` Martin Storsjö
2022-07-16 13:20 ` Michael Niedermayer
2022-07-16 12:50 ` Ronald S. Bultje
1 sibling, 1 reply; 20+ messages in thread
From: Martin Storsjö @ 2022-07-16 12:30 UTC (permalink / raw)
To: FFmpeg development discussions and patches
On Sat, 16 Jul 2022, Michael Niedermayer wrote:
> On Sat, Jul 16, 2022 at 12:25:37AM +0300, Martin Storsjö wrote:
>> On Fri, 15 Jul 2022, Michael Niedermayer wrote:
>>
>>> On Fri, Jul 15, 2022 at 10:56:03PM +0300, Martin Storsjö wrote:
>>>> On Fri, 15 Jul 2022, Swinney, Jonathan wrote:
>>>>
>>>>> If the max height is just 16, then this should be fine. I assumed that h
>>>>> could have a much higher value (>1024), but if that is not the case,
>>>>> then this is a useful optimization.
>>>>
>>>> At least according to the me_cmp.h header, which says:
>>>>
>>>> /* Motion estimation:
>>>> * h is limited to { width / 2, width, 2 * width },
>>>> * but never larger than 16 and never smaller than 2.
>>>> * Although currently h < 4 is not used as functions with
>>>> * width < 8 are neither used nor implemented. */
>>>
>>> These rules where written with support for encoding of all
>>> standard formats in mind at the time that was written.
>>> today it may make sense to extend these rules to cover the
>>> things which where created since then
>>
>> Right, but if that suddenly changes, such a change also must expect that it
>> might need updates to all assembly implementations that implement that
>> interface currently. Right now, both the defacto case (any callers in the
>> codebase) and the explicit documentation says that it can't be called with
>> parameters outside of that range.
>
> What i meant was that newly added functions should be more flexible than
> these old rules. That is 2 sets of rules
> 1. What a caller ATM can do and expect to work (thats whats written there)
> 2. What an implementor of new functions should make sure is supported
With 2., do you mean if adding a new function into the same struct, or if
implementing the existing pix_abs[0][..] functions?
If you mean new implementations of the existing function interface, you
say they "should be more flexible". How flexible must they be? Is it ok to
assume h<=256 for the w=16 functions?
Gradually increasing the requirements for existing function interfaces
like you suggest is really problematic.
If we want to require more of the functions, we should document it, and
extend the checkasm test to test that new requirement - which also extends
the requirement to the existing functions. If we don't have a checkasm
test for the required behaviour, we can pretty much assume it's broken,
even in new implementations.
// Martin
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [FFmpeg-devel] [PATCH 5/5] aarch64: me_cmp: Don't do uaddlv once per iteration
2022-07-16 11:23 ` Michael Niedermayer
2022-07-16 12:30 ` Martin Storsjö
@ 2022-07-16 12:50 ` Ronald S. Bultje
2022-07-16 13:06 ` Michael Niedermayer
1 sibling, 1 reply; 20+ messages in thread
From: Ronald S. Bultje @ 2022-07-16 12:50 UTC (permalink / raw)
To: FFmpeg development discussions and patches
Hi,
On Sat, Jul 16, 2022 at 7:23 PM Michael Niedermayer <michael@niedermayer.cc>
wrote:
> What i meant was that newly added functions should be more flexible than
> these old rules. That is 2 sets of rules
> 1. What a caller ATM can do and expect to work (thats whats written there)
> 2. What an implementor of new functions should make sure is supported
>
What's the use case exactly that you'd like to support that we currently
don't?
Ronald
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [FFmpeg-devel] [PATCH 5/5] aarch64: me_cmp: Don't do uaddlv once per iteration
2022-07-16 12:50 ` Ronald S. Bultje
@ 2022-07-16 13:06 ` Michael Niedermayer
0 siblings, 0 replies; 20+ messages in thread
From: Michael Niedermayer @ 2022-07-16 13:06 UTC (permalink / raw)
To: FFmpeg development discussions and patches
[-- Attachment #1.1: Type: text/plain, Size: 1074 bytes --]
On Sat, Jul 16, 2022 at 08:50:24PM +0800, Ronald S. Bultje wrote:
> Hi,
>
> On Sat, Jul 16, 2022 at 7:23 PM Michael Niedermayer <michael@niedermayer.cc>
> wrote:
>
> > What i meant was that newly added functions should be more flexible than
> > these old rules. That is 2 sets of rules
> > 1. What a caller ATM can do and expect to work (thats whats written there)
> > 2. What an implementor of new functions should make sure is supported
> >
>
> What's the use case exactly that you'd like to support that we currently
> don't?
The idea is to have general purpose functions which can be used if someone
wants to write a new encoder.
(this is kind of not a different goal, its rather that the original
limitations where based on mpeg2/mpeg4/h264 which was the most flexible at
the time)
thx
[...]
--
Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB
Take away the freedom of one citizen and you will be jailed, take away
the freedom of all citizens and you will be congratulated by your peers
in Parliament.
[-- Attachment #1.2: signature.asc --]
[-- Type: application/pgp-signature, Size: 195 bytes --]
[-- Attachment #2: Type: text/plain, Size: 251 bytes --]
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [FFmpeg-devel] [PATCH 5/5] aarch64: me_cmp: Don't do uaddlv once per iteration
2022-07-16 12:30 ` Martin Storsjö
@ 2022-07-16 13:20 ` Michael Niedermayer
2022-07-16 14:23 ` Martin Storsjö
0 siblings, 1 reply; 20+ messages in thread
From: Michael Niedermayer @ 2022-07-16 13:20 UTC (permalink / raw)
To: FFmpeg development discussions and patches
[-- Attachment #1.1: Type: text/plain, Size: 3401 bytes --]
On Sat, Jul 16, 2022 at 03:30:23PM +0300, Martin Storsjö wrote:
> On Sat, 16 Jul 2022, Michael Niedermayer wrote:
>
> > On Sat, Jul 16, 2022 at 12:25:37AM +0300, Martin Storsjö wrote:
> > > On Fri, 15 Jul 2022, Michael Niedermayer wrote:
> > >
> > > > On Fri, Jul 15, 2022 at 10:56:03PM +0300, Martin Storsjö wrote:
> > > > > On Fri, 15 Jul 2022, Swinney, Jonathan wrote:
> > > > >
> > > > > > If the max height is just 16, then this should be fine. I assumed that h
> > > > > > could have a much higher value (>1024), but if that is not the case,
> > > > > > then this is a useful optimization.
> > > > >
> > > > > At least according to the me_cmp.h header, which says:
> > > > >
> > > > > /* Motion estimation:
> > > > > * h is limited to { width / 2, width, 2 * width },
> > > > > * but never larger than 16 and never smaller than 2.
> > > > > * Although currently h < 4 is not used as functions with
> > > > > * width < 8 are neither used nor implemented. */
> > > >
> > > > These rules where written with support for encoding of all
> > > > standard formats in mind at the time that was written.
> > > > today it may make sense to extend these rules to cover the
> > > > things which where created since then
> > >
> > > Right, but if that suddenly changes, such a change also must expect that it
> > > might need updates to all assembly implementations that implement that
> > > interface currently. Right now, both the defacto case (any callers in the
> > > codebase) and the explicit documentation says that it can't be called with
> > > parameters outside of that range.
> >
> > What i meant was that newly added functions should be more flexible than
> > these old rules. That is 2 sets of rules
> > 1. What a caller ATM can do and expect to work (thats whats written there)
> > 2. What an implementor of new functions should make sure is supported
>
> With 2., do you mean if adding a new function into the same struct, or if
> implementing the existing pix_abs[0][..] functions?
i would say both
>
> If you mean new implementations of the existing function interface, you say
> they "should be more flexible". How flexible must they be? Is it ok to
> assume h<=256 for the w=16 functions?
i think thats fine
>
> Gradually increasing the requirements for existing function interfaces like
> you suggest is really problematic.
why ?
iam really just saying
"when you add new code, dont base it on old limitations"
And i suggest this because its much easier to do when a function is added than
when later one wants to use it
When i originally wrote this list of restrictions it didnt list what our
code actually used but tried to look ahead to what we would need when we
write encoders for all standard formats at that time. We never wrote a
h264 encoder but this API was designed to support it.
>
> If we want to require more of the functions, we should document it, and
> extend the checkasm test to test that new requirement - which also extends
> the requirement to the existing functions. If we don't have a checkasm test
> for the required behaviour, we can pretty much assume it's broken, even in
> new implementations.
yes
[...]
--
Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB
No snowflake in an avalanche ever feels responsible. -- Voltaire
[-- Attachment #1.2: signature.asc --]
[-- Type: application/pgp-signature, Size: 195 bytes --]
[-- Attachment #2: Type: text/plain, Size: 251 bytes --]
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [FFmpeg-devel] [PATCH 5/5] aarch64: me_cmp: Don't do uaddlv once per iteration
2022-07-16 13:20 ` Michael Niedermayer
@ 2022-07-16 14:23 ` Martin Storsjö
0 siblings, 0 replies; 20+ messages in thread
From: Martin Storsjö @ 2022-07-16 14:23 UTC (permalink / raw)
To: FFmpeg development discussions and patches
On Sat, 16 Jul 2022, Michael Niedermayer wrote:
> On Sat, Jul 16, 2022 at 03:30:23PM +0300, Martin Storsjö wrote:
>> On Sat, 16 Jul 2022, Michael Niedermayer wrote:
>>
>>> On Sat, Jul 16, 2022 at 12:25:37AM +0300, Martin Storsjö wrote:
>>>> On Fri, 15 Jul 2022, Michael Niedermayer wrote:
>>>>
>>>>> On Fri, Jul 15, 2022 at 10:56:03PM +0300, Martin Storsjö wrote:
>>>>>> On Fri, 15 Jul 2022, Swinney, Jonathan wrote:
>>>>>>
>>>>>>> If the max height is just 16, then this should be fine. I assumed that h
>>>>>>> could have a much higher value (>1024), but if that is not the case,
>>>>>>> then this is a useful optimization.
>>>>>>
>>>>>> At least according to the me_cmp.h header, which says:
>>>>>>
>>>>>> /* Motion estimation:
>>>>>> * h is limited to { width / 2, width, 2 * width },
>>>>>> * but never larger than 16 and never smaller than 2.
>>>>>> * Although currently h < 4 is not used as functions with
>>>>>> * width < 8 are neither used nor implemented. */
>>>>>
>>>>> These rules where written with support for encoding of all
>>>>> standard formats in mind at the time that was written.
>>>>> today it may make sense to extend these rules to cover the
>>>>> things which where created since then
>>>>
>>>> Right, but if that suddenly changes, such a change also must expect that it
>>>> might need updates to all assembly implementations that implement that
>>>> interface currently. Right now, both the defacto case (any callers in the
>>>> codebase) and the explicit documentation says that it can't be called with
>>>> parameters outside of that range.
>>>
>>> What i meant was that newly added functions should be more flexible than
>>> these old rules. That is 2 sets of rules
>>> 1. What a caller ATM can do and expect to work (thats whats written there)
>>> 2. What an implementor of new functions should make sure is supported
>>
>> With 2., do you mean if adding a new function into the same struct, or if
>> implementing the existing pix_abs[0][..] functions?
>
> i would say both
>
>
>>
>> If you mean new implementations of the existing function interface, you say
>> they "should be more flexible". How flexible must they be? Is it ok to
>> assume h<=256 for the w=16 functions?
>
> i think thats fine
Ok, I'll go ahead and push this then.
>>
>> Gradually increasing the requirements for existing function interfaces like
>> you suggest is really problematic.
>
> why ?
> iam really just saying
> "when you add new code, dont base it on old limitations"
For this case, I just quoted what the header said, which seemed
authoritative to me - but it's fine for me with a wider spec too, up to
h=256. But saying arbitrarily "any height" really inhibits what you can do
in asm.
Also if I shouldn't reference that limitation in the header, please
update/reword it, as it's actively misleading for anyone working on
this right now.
And in general, we can design for an intended use case and calculate
whether it should work or not - but as long as it's not tested, those
cases will often have hidden bugs - see e.g. patch 1/5 in this series.
// Martin
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 20+ messages in thread
end of thread, other threads:[~2022-07-16 14:24 UTC | newest]
Thread overview: 20+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-07-13 20:48 [FFmpeg-devel] [PATCH 1/5] libavcodec: aarch64: Don't clobber v8 in the h%4 case in ff_pix_abs16_xy2_neon Martin Storsjö
2022-07-13 20:48 ` [FFmpeg-devel] [PATCH 2/5] checkasm: motion: Make the benchmarks more stable Martin Storsjö
2022-07-15 19:35 ` Swinney, Jonathan
2022-07-13 20:48 ` [FFmpeg-devel] [PATCH 3/5] aarch64: me_cmp: Interleave some of the loads in ff_pix_abs16_xy2_neon Martin Storsjö
2022-07-15 19:34 ` Swinney, Jonathan
2022-07-13 20:48 ` [FFmpeg-devel] [PATCH 4/5] aarch64: me_cmp: Switch from uabd to uabal " Martin Storsjö
2022-07-15 19:38 ` Swinney, Jonathan
2022-07-13 20:48 ` [FFmpeg-devel] [PATCH 5/5] aarch64: me_cmp: Don't do uaddlv once per iteration Martin Storsjö
2022-07-15 19:32 ` Swinney, Jonathan
2022-07-15 19:56 ` Martin Storsjö
2022-07-15 21:19 ` Michael Niedermayer
2022-07-15 21:25 ` Martin Storsjö
2022-07-16 11:23 ` Michael Niedermayer
2022-07-16 12:30 ` Martin Storsjö
2022-07-16 13:20 ` Michael Niedermayer
2022-07-16 14:23 ` Martin Storsjö
2022-07-16 12:50 ` Ronald S. Bultje
2022-07-16 13:06 ` Michael Niedermayer
2022-07-16 9:18 ` Martin Storsjö
2022-07-15 19:35 ` [FFmpeg-devel] [PATCH 1/5] libavcodec: aarch64: Don't clobber v8 in the h%4 case in ff_pix_abs16_xy2_neon Swinney, Jonathan
Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
This inbox may be cloned and mirrored by anyone:
git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git
# If you have public-inbox 1.1+ installed, you may
# initialize and index your mirror using the following commands:
public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
ffmpegdev@gitmailbox.com
public-inbox-index ffmpegdev
Example config snippet for mirrors.
AGPL code for this site: git clone https://public-inbox.org/public-inbox.git