* [FFmpeg-devel] [PATCH 2/4] lavc/aarch64: Provide neon implementation of nsse8
2022-09-26 9:15 [FFmpeg-devel] [PATCH 1/4] lavc/aarch64: Add neon implementation for pix_abs8 functions Grzegorz Bernacki
@ 2022-09-26 9:15 ` Grzegorz Bernacki
2022-09-28 9:08 ` Martin Storsjö
2022-09-26 9:15 ` [FFmpeg-devel] [PATCH 3/4] lavc/aarch64: Provide optimized implementation of vsse8 for arm64 Grzegorz Bernacki
` (2 subsequent siblings)
3 siblings, 1 reply; 9+ messages in thread
From: Grzegorz Bernacki @ 2022-09-26 9:15 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: gjb, upstream, jswinney, hum, martin, mw, spop
Add vectorized implementation of nsse8 function.
Performance comparison tests are shown below.
- nsse_1_c: 256.0
- nsse_1_neon: 82.7
Benchmarks and tests run with checkasm tool on AWS Graviton 3.
Signed-off-by: Grzegorz Bernacki <gjb@semihalf.com>
---
libavcodec/aarch64/me_cmp_init_aarch64.c | 15 ++++
libavcodec/aarch64/me_cmp_neon.S | 99 ++++++++++++++++++++++++
2 files changed, 114 insertions(+)
diff --git a/libavcodec/aarch64/me_cmp_init_aarch64.c b/libavcodec/aarch64/me_cmp_init_aarch64.c
index 3459403ee5..2c61cfcf63 100644
--- a/libavcodec/aarch64/me_cmp_init_aarch64.c
+++ b/libavcodec/aarch64/me_cmp_init_aarch64.c
@@ -66,6 +66,11 @@ int ff_pix_abs8_y2_neon(MpegEncContext *v, const uint8_t *pix1, const uint8_t *p
int ff_pix_abs8_xy2_neon(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
ptrdiff_t stride, int h);
+int nsse8_neon(int multiplier, const uint8_t *s, const uint8_t *s2,
+ ptrdiff_t stride, int h);
+int nsse8_neon_wrapper(MpegEncContext *c, const uint8_t *s1, const uint8_t *s2,
+ ptrdiff_t stride, int h);
+
av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
{
int cpu_flags = av_get_cpu_flags();
@@ -94,6 +99,7 @@ av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
c->vsse[4] = vsse_intra16_neon;
c->nsse[0] = nsse16_neon_wrapper;
+ c->nsse[1] = nsse8_neon_wrapper;
c->median_sad[0] = pix_median_abs16_neon;
c->median_sad[1] = pix_median_abs8_neon;
@@ -108,3 +114,12 @@ int nsse16_neon_wrapper(MpegEncContext *c, const uint8_t *s1, const uint8_t *s2,
else
return nsse16_neon(8, s1, s2, stride, h);
}
+
+int nsse8_neon_wrapper(MpegEncContext *c, const uint8_t *s1, const uint8_t *s2,
+ ptrdiff_t stride, int h)
+{
+ if (c)
+ return nsse8_neon(c->avctx->nsse_weight, s1, s2, stride, h);
+ else
+ return nsse8_neon(8, s1, s2, stride, h);
+}
diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S
index e03c0c26cd..6f7c7c1690 100644
--- a/libavcodec/aarch64/me_cmp_neon.S
+++ b/libavcodec/aarch64/me_cmp_neon.S
@@ -1163,6 +1163,105 @@ function nsse16_neon, export=1
ret
endfunc
+function nsse8_neon, export=1
+ // x0 multiplier
+ // x1 uint8_t *pix1
+ // x2 uint8_t *pix2
+ // x3 ptrdiff_t stride
+ // w4 int h
+
+ str x0, [sp, #-0x40]!
+ stp x1, x2, [sp, #0x10]
+ stp x3, x4, [sp, #0x20]
+ str x30, [sp, #0x30]
+ bl X(sse8_neon)
+ ldr x30, [sp, #0x30]
+ mov w9, w0 // here we store score1
+ ldr x5, [sp]
+ ldp x1, x2, [sp, #0x10]
+ ldp x3, x4, [sp, #0x20]
+ add sp, sp, #0x40
+
+ movi v16.8h, #0
+ movi v17.8h, #0
+ movi v18.8h, #0
+ movi v19.8h, #0
+
+ ld1 {v0.8b}, [x1], x3
+ subs w4, w4, #1 // we need to make h-1 iterations
+ ext v1.8b, v0.8b, v0.8b, #1 // x1 + 1
+ ld1 {v2.8b}, [x2], x3
+ cmp w4, #2
+ ext v3.8b, v2.8b, v2.8b, #1 // x2 + 1
+
+ b.lt 2f
+
+// make 2 iterations at once
+1:
+ ld1 {v4.8b}, [x1], x3
+ ld1 {v20.8b}, [x1], x3
+ ld1 {v6.8b}, [x2], x3
+ ext v5.8b, v4.8b, v4.8b, #1 // x1 + stride + 1
+ ext v21.8b, v20.8b, v20.8b, #1
+ ld1 {v22.8b}, [x2], x3
+ ext v7.8b, v6.8b, v6.8b, #1 // x2 + stride + 1
+ usubl v31.8h, v0.8b, v4.8b
+ ext v23.8b, v22.8b, v22.8b, #1
+ usubl v29.8h, v1.8b, v5.8b
+ usubl v27.8h, v2.8b, v6.8b
+ usubl v25.8h, v3.8b, v7.8b
+ saba v16.8h, v31.8h, v29.8h
+ usubl v31.8h, v4.8b, v20.8b
+ saba v18.8h, v27.8h, v25.8h
+ sub w4, w4, #2
+ usubl v29.8h, v5.8b, v21.8b
+ mov v0.16b, v20.16b
+ mov v1.16b, v21.16b
+ saba v16.8h, v31.8h, v29.8h
+ usubl v27.8h, v6.8b, v22.8b
+ usubl v25.8h, v7.8b, v23.8b
+ mov v2.16b, v22.16b
+ mov v3.16b, v23.16b
+ cmp w4, #2
+ saba v18.8h, v27.8h, v25.8h
+ b.ge 1b
+ cbz w4, 3f
+
+// iterate by one
+2:
+ ld1 {v4.8b}, [x1], x3
+ subs w4, w4, #1
+ ext v5.8b, v4.8b, v4.8b, #1 // x1 + stride + 1
+ ld1 {v6.8b}, [x2], x3
+ usubl v31.8h, v0.8b, v4.8b
+ ext v7.8b, v6.8b, v6.8b, #1 // x2 + stride + 1
+
+ usubl v29.8h, v1.8b, v5.8b
+ saba v16.8h, v31.8h, v29.8h
+ usubl v27.8h, v2.8b, v6.8b
+ usubl v25.8h, v3.8b, v7.8b
+ saba v18.8h, v27.8h, v25.8h
+
+ mov v0.16b, v4.16b
+ mov v1.16b, v5.16b
+ mov v2.16b, v6.16b
+ mov v3.16b, v7.16b
+
+ cbnz w4, 2b
+
+3:
+ sqsub v16.8h, v16.8h, v18.8h
+ ins v16.h[7], wzr
+ saddlv s16, v16.8h
+ sqabs s16, s16
+ fmov w0, s16
+
+ mul w0, w0, w5
+ add w0, w0, w9
+
+ ret
+endfunc
+
function pix_median_abs16_neon, export=1
// x0 unused
// x1 uint8_t *pix1
--
2.25.1
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [FFmpeg-devel] [PATCH 2/4] lavc/aarch64: Provide neon implementation of nsse8
2022-09-26 9:15 ` [FFmpeg-devel] [PATCH 2/4] lavc/aarch64: Provide neon implementation of nsse8 Grzegorz Bernacki
@ 2022-09-28 9:08 ` Martin Storsjö
0 siblings, 0 replies; 9+ messages in thread
From: Martin Storsjö @ 2022-09-28 9:08 UTC (permalink / raw)
To: Grzegorz Bernacki; +Cc: upstream, jswinney, hum, ffmpeg-devel, mw, spop
On Mon, 26 Sep 2022, Grzegorz Bernacki wrote:
> Add vectorized implementation of nsse8 function.
>
> Performance comparison tests are shown below.
> - nsse_1_c: 256.0
> - nsse_1_neon: 82.7
>
> Benchmarks and tests run with checkasm tool on AWS Graviton 3.
>
> Signed-off-by: Grzegorz Bernacki <gjb@semihalf.com>
> ---
> libavcodec/aarch64/me_cmp_init_aarch64.c | 15 ++++
> libavcodec/aarch64/me_cmp_neon.S | 99 ++++++++++++++++++++++++
> 2 files changed, 114 insertions(+)
Looks reasonable to me, but do check to make sure there's no tabs.
// Martin
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 9+ messages in thread
* [FFmpeg-devel] [PATCH 3/4] lavc/aarch64: Provide optimized implementation of vsse8 for arm64.
2022-09-26 9:15 [FFmpeg-devel] [PATCH 1/4] lavc/aarch64: Add neon implementation for pix_abs8 functions Grzegorz Bernacki
2022-09-26 9:15 ` [FFmpeg-devel] [PATCH 2/4] lavc/aarch64: Provide neon implementation of nsse8 Grzegorz Bernacki
@ 2022-09-26 9:15 ` Grzegorz Bernacki
2022-09-28 9:09 ` Martin Storsjö
2022-09-26 9:15 ` [FFmpeg-devel] [PATCH 4/4] lavc/aarch64: Add neon implementation for vsse_intra8 Grzegorz Bernacki
2022-09-28 9:06 ` [FFmpeg-devel] [PATCH 1/4] lavc/aarch64: Add neon implementation for pix_abs8 functions Martin Storsjö
3 siblings, 1 reply; 9+ messages in thread
From: Grzegorz Bernacki @ 2022-09-26 9:15 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: gjb, upstream, jswinney, hum, martin, mw, spop
Provide optimized implementation of vsse8 for arm64.
Performance comparison tests are shown below.
- vsse_1_c: 141.5
- vsse_1_neon: 32.5
Benchmarks and tests are run with checkasm tool on AWS Graviton 3.
Signed-off-by: Grzegorz Bernacki <gjb@semihalf.com>
---
libavcodec/aarch64/me_cmp_init_aarch64.c | 5 ++
libavcodec/aarch64/me_cmp_neon.S | 70 ++++++++++++++++++++++++
2 files changed, 75 insertions(+)
diff --git a/libavcodec/aarch64/me_cmp_init_aarch64.c b/libavcodec/aarch64/me_cmp_init_aarch64.c
index 2c61cfcf63..f247372c94 100644
--- a/libavcodec/aarch64/me_cmp_init_aarch64.c
+++ b/libavcodec/aarch64/me_cmp_init_aarch64.c
@@ -71,6 +71,9 @@ int nsse8_neon(int multiplier, const uint8_t *s, const uint8_t *s2,
int nsse8_neon_wrapper(MpegEncContext *c, const uint8_t *s1, const uint8_t *s2,
ptrdiff_t stride, int h);
+int vsse8_neon(MpegEncContext *c, const uint8_t *s1, const uint8_t *s2,
+ ptrdiff_t stride, int h);
+
av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
{
int cpu_flags = av_get_cpu_flags();
@@ -96,6 +99,8 @@ av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
c->vsad[5] = vsad_intra8_neon;
c->vsse[0] = vsse16_neon;
+ c->vsse[1] = vsse8_neon;
+
c->vsse[4] = vsse_intra16_neon;
c->nsse[0] = nsse16_neon_wrapper;
diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S
index 6f7c7c1690..386d2de0c5 100644
--- a/libavcodec/aarch64/me_cmp_neon.S
+++ b/libavcodec/aarch64/me_cmp_neon.S
@@ -843,6 +843,76 @@ function vsad16_neon, export=1
ret
endfunc
+function vsse8_neon, export=1
+ // x0 unused
+ // x1 uint8_t *pix1
+ // x2 uint8_t *pix2
+ // x3 ptrdiff_t stride
+ // w4 int h
+
+ ld1 {v0.8b}, [x1], x3 // Load pix1[0], first iteration
+ ld1 {v1.8b}, [x2], x3 // Load pix2[0], first iteration
+
+ sub w4, w4, #1 // we need to make h-1 iterations
+ movi v16.4s, #0
+ movi v17.4s, #0
+
+ cmp w4, #3 // check if we can make 3 iterations at once
+ usubl v31.8h, v0.8b, v1.8b // Signed difference of pix1[0] - pix2[0], first iteration
+ b.le 2f
+
+
+1:
+ // x = abs(pix1[0] - pix2[0] - pix1[0 + stride] + pix2[0 + stride])
+ // res = (x) * (x)
+ ld1 {v0.8b}, [x1], x3 // Load pix1[0 + stride], first iteration
+ ld1 {v1.8b}, [x2], x3 // Load pix2[0 + stride], first iteration
+ ld1 {v2.8b}, [x1], x3 // Load pix1[0 + stride], second iteration
+ ld1 {v3.8b}, [x2], x3 // Load pix2[0 + stride], second iteration
+ usubl v29.8h, v0.8b, v1.8b
+ usubl2 v28.8h, v0.16b, v1.16b
+ ld1 {v4.8b}, [x1], x3 // Load pix1[0 + stride], third iteration
+ ld1 {v5.8b}, [x2], x3 // Load pix1[0 + stride], third iteration
+ sabd v31.8h, v31.8h, v29.8h
+ usubl v27.8h, v2.8b, v3.8b
+ usubl v25.8h, v4.8b, v5.8b
+ sabd v29.8h, v29.8h, v27.8h
+ sabd v27.8h, v27.8h, v25.8h
+ umlal v16.4s, v31.4h, v31.4h
+ umlal2 v17.4s, v31.8h, v31.8h
+ mov v31.16b, v25.16b
+ umlal v16.4s, v29.4h, v29.4h
+ umlal2 v17.4s, v29.8h, v29.8h
+ sub w4, w4, #3
+ umlal v16.4s, v27.4h, v27.4h
+ umlal2 v17.4s, v27.8h, v27.8h
+ cmp w4, #3
+
+ b.ge 1b
+
+ cbz w4, 3f
+
+// iterate by once
+2:
+ ld1 {v0.8b}, [x1], x3
+ ld1 {v1.8b}, [x2], x3
+ subs w4, w4, #1
+ usubl v29.8h, v0.8b, v1.8b
+ sabd v31.8h, v31.8h, v29.8h
+ umlal v16.4s, v31.4h, v31.4h
+ umlal2 v17.4s, v31.8h, v31.8h
+ mov v31.16b, v29.16b
+ b.ne 2b
+
+3:
+ add v16.4s, v16.4s, v17.4s
+ uaddlv d17, v16.4s
+ fmov w0, s17
+
+ ret
+endfunc
+
+
function vsse16_neon, export=1
// x0 unused
// x1 uint8_t *pix1
--
2.25.1
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [FFmpeg-devel] [PATCH 3/4] lavc/aarch64: Provide optimized implementation of vsse8 for arm64.
2022-09-26 9:15 ` [FFmpeg-devel] [PATCH 3/4] lavc/aarch64: Provide optimized implementation of vsse8 for arm64 Grzegorz Bernacki
@ 2022-09-28 9:09 ` Martin Storsjö
0 siblings, 0 replies; 9+ messages in thread
From: Martin Storsjö @ 2022-09-28 9:09 UTC (permalink / raw)
To: Grzegorz Bernacki; +Cc: upstream, jswinney, hum, ffmpeg-devel, mw, spop
On Mon, 26 Sep 2022, Grzegorz Bernacki wrote:
> Provide optimized implementation of vsse8 for arm64.
>
> Performance comparison tests are shown below.
> - vsse_1_c: 141.5
> - vsse_1_neon: 32.5
>
> Benchmarks and tests are run with checkasm tool on AWS Graviton 3.
>
> Signed-off-by: Grzegorz Bernacki <gjb@semihalf.com>
> ---
> libavcodec/aarch64/me_cmp_init_aarch64.c | 5 ++
> libavcodec/aarch64/me_cmp_neon.S | 70 ++++++++++++++++++++++++
> 2 files changed, 75 insertions(+)
>
> diff --git a/libavcodec/aarch64/me_cmp_init_aarch64.c b/libavcodec/aarch64/me_cmp_init_aarch64.c
> index 2c61cfcf63..f247372c94 100644
> --- a/libavcodec/aarch64/me_cmp_init_aarch64.c
> +++ b/libavcodec/aarch64/me_cmp_init_aarch64.c
> @@ -71,6 +71,9 @@ int nsse8_neon(int multiplier, const uint8_t *s, const uint8_t *s2,
> int nsse8_neon_wrapper(MpegEncContext *c, const uint8_t *s1, const uint8_t *s2,
> ptrdiff_t stride, int h);
>
> +int vsse8_neon(MpegEncContext *c, const uint8_t *s1, const uint8_t *s2,
> + ptrdiff_t stride, int h);
> +
> av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
> {
> int cpu_flags = av_get_cpu_flags();
> @@ -96,6 +99,8 @@ av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
> c->vsad[5] = vsad_intra8_neon;
>
> c->vsse[0] = vsse16_neon;
> + c->vsse[1] = vsse8_neon;
> +
> c->vsse[4] = vsse_intra16_neon;
>
> c->nsse[0] = nsse16_neon_wrapper;
> diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S
> index 6f7c7c1690..386d2de0c5 100644
> --- a/libavcodec/aarch64/me_cmp_neon.S
> +++ b/libavcodec/aarch64/me_cmp_neon.S
> @@ -843,6 +843,76 @@ function vsad16_neon, export=1
> ret
> endfunc
>
> +function vsse8_neon, export=1
> + // x0 unused
> + // x1 uint8_t *pix1
> + // x2 uint8_t *pix2
> + // x3 ptrdiff_t stride
> + // w4 int h
> +
> + ld1 {v0.8b}, [x1], x3 // Load pix1[0], first iteration
> + ld1 {v1.8b}, [x2], x3 // Load pix2[0], first iteration
> +
> + sub w4, w4, #1 // we need to make h-1 iterations
> + movi v16.4s, #0
> + movi v17.4s, #0
> +
> + cmp w4, #3 // check if we can make 3 iterations at once
> + usubl v31.8h, v0.8b, v1.8b // Signed difference of pix1[0] - pix2[0], first iteration
> + b.le 2f
Why the b.le here, shouldn't it be enough with b.lt? If we're run with
h=<unroll amount>, it should be enough to run one round in the unrolled
loop.
I see that we've got the same issue in a couple preexisting functions;
I'll send a patch to fix that.
Other than that, this looks reasonable to me.
// Martin
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 9+ messages in thread
* [FFmpeg-devel] [PATCH 4/4] lavc/aarch64: Add neon implementation for vsse_intra8
2022-09-26 9:15 [FFmpeg-devel] [PATCH 1/4] lavc/aarch64: Add neon implementation for pix_abs8 functions Grzegorz Bernacki
2022-09-26 9:15 ` [FFmpeg-devel] [PATCH 2/4] lavc/aarch64: Provide neon implementation of nsse8 Grzegorz Bernacki
2022-09-26 9:15 ` [FFmpeg-devel] [PATCH 3/4] lavc/aarch64: Provide optimized implementation of vsse8 for arm64 Grzegorz Bernacki
@ 2022-09-26 9:15 ` Grzegorz Bernacki
2022-09-28 9:11 ` Martin Storsjö
2022-09-28 9:06 ` [FFmpeg-devel] [PATCH 1/4] lavc/aarch64: Add neon implementation for pix_abs8 functions Martin Storsjö
3 siblings, 1 reply; 9+ messages in thread
From: Grzegorz Bernacki @ 2022-09-26 9:15 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: gjb, upstream, jswinney, hum, martin, mw, spop
Provide optimized implementation for vsse_intra8 for arm64.
Performance tests are shown below.
- vsse_5_c: 87.7
- vsse_5_neon: 26.2
Benchmarks and tests are run with checkasm tool on AWS Graviton 3.
---
libavcodec/aarch64/me_cmp_init_aarch64.c | 4 ++
libavcodec/aarch64/me_cmp_neon.S | 53 ++++++++++++++++++++++++
2 files changed, 57 insertions(+)
diff --git a/libavcodec/aarch64/me_cmp_init_aarch64.c b/libavcodec/aarch64/me_cmp_init_aarch64.c
index f247372c94..defec37478 100644
--- a/libavcodec/aarch64/me_cmp_init_aarch64.c
+++ b/libavcodec/aarch64/me_cmp_init_aarch64.c
@@ -74,6 +74,9 @@ int nsse8_neon_wrapper(MpegEncContext *c, const uint8_t *s1, const uint8_t *s2,
int vsse8_neon(MpegEncContext *c, const uint8_t *s1, const uint8_t *s2,
ptrdiff_t stride, int h);
+int vsse_intra8_neon(MpegEncContext *c, const uint8_t *s, const uint8_t *dummy,
+ ptrdiff_t stride, int h);
+
av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
{
int cpu_flags = av_get_cpu_flags();
@@ -102,6 +105,7 @@ av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
c->vsse[1] = vsse8_neon;
c->vsse[4] = vsse_intra16_neon;
+ c->vsse[5] = vsse_intra8_neon;
c->nsse[0] = nsse16_neon_wrapper;
c->nsse[1] = nsse8_neon_wrapper;
diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S
index 386d2de0c5..82ff05d3f0 100644
--- a/libavcodec/aarch64/me_cmp_neon.S
+++ b/libavcodec/aarch64/me_cmp_neon.S
@@ -1111,6 +1111,59 @@ function vsse_intra16_neon, export=1
ret
endfunc
+function vsse_intra8_neon, export=1
+ // x0 unused
+ // x1 uint8_t *pix1
+ // x2 uint8_t *dummy
+ // x3 ptrdiff_t stride
+ // w4 int h
+
+ ld1 {v0.8b}, [x1], x3
+ movi v16.4s, #0
+
+ sub w4, w4, #1 // we need to make h-1 iterations
+ cmp w4, #3
+ b.lt 2f
+
+1:
+ // v = abs( pix1[0] - pix1[0 + stride] )
+ // score = sum( v * v )
+ ld1 {v1.8b}, [x1], x3
+ ld1 {v2.8b}, [x1], x3
+ uabd v30.8b, v0.8b, v1.8b
+ ld1 {v3.8b}, [x1], x3
+ umull v29.8h, v30.8b, v30.8b
+ uabd v27.8b, v1.8b, v2.8b
+ uadalp v16.4s, v29.8h
+ umull v26.8h, v27.8b, v27.8b
+ uabd v25.8b, v2.8b, v3.8b
+ uadalp v16.4s, v26.8h
+ umull v24.8h, v25.8b, v25.8b
+ sub w4, w4, #3
+ uadalp v16.4s, v24.8h
+ cmp w4, #3
+ mov v0.8b, v3.8b
+
+ b.ge 1b
+ cbz w4, 3f
+
+// iterate by one
+2:
+ ld1 {v1.8b}, [x1], x3
+ subs w4, w4, #1
+ uabd v30.8b, v0.8b, v1.8b
+ mov v0.8b, v1.8b
+ umull v29.8h, v30.8b, v30.8b
+ uadalp v16.4s, v29.8h
+ cbnz w4, 2b
+
+3:
+ uaddlv d17, v16.4s
+ fmov w0, s17
+
+ ret
+endfunc
+
function nsse16_neon, export=1
// x0 multiplier
// x1 uint8_t *pix1
--
2.25.1
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [FFmpeg-devel] [PATCH 4/4] lavc/aarch64: Add neon implementation for vsse_intra8
2022-09-26 9:15 ` [FFmpeg-devel] [PATCH 4/4] lavc/aarch64: Add neon implementation for vsse_intra8 Grzegorz Bernacki
@ 2022-09-28 9:11 ` Martin Storsjö
0 siblings, 0 replies; 9+ messages in thread
From: Martin Storsjö @ 2022-09-28 9:11 UTC (permalink / raw)
To: Grzegorz Bernacki; +Cc: upstream, jswinney, hum, ffmpeg-devel, mw, spop
[-- Attachment #1: Type: text/plain, Size: 2747 bytes --]
On Mon, 26 Sep 2022, Grzegorz Bernacki wrote:
> Provide optimized implementation for vsse_intra8 for arm64.
>
> Performance tests are shown below.
> - vsse_5_c: 87.7
> - vsse_5_neon: 26.2
>
> Benchmarks and tests are run with checkasm tool on AWS Graviton 3.
> ---
> libavcodec/aarch64/me_cmp_init_aarch64.c | 4 ++
> libavcodec/aarch64/me_cmp_neon.S | 53 ++++++++++++++++++++++++
> 2 files changed, 57 insertions(+)
>
> diff --git a/libavcodec/aarch64/me_cmp_init_aarch64.c b/libavcodec/aarch64/me_cmp_init_aarch64.c
> index f247372c94..defec37478 100644
> --- a/libavcodec/aarch64/me_cmp_init_aarch64.c
> +++ b/libavcodec/aarch64/me_cmp_init_aarch64.c
> @@ -74,6 +74,9 @@ int nsse8_neon_wrapper(MpegEncContext *c, const uint8_t *s1, const uint8_t *s2,
> int vsse8_neon(MpegEncContext *c, const uint8_t *s1, const uint8_t *s2,
> ptrdiff_t stride, int h);
>
> +int vsse_intra8_neon(MpegEncContext *c, const uint8_t *s, const uint8_t *dummy,
> + ptrdiff_t stride, int h);
> +
> av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
> {
> int cpu_flags = av_get_cpu_flags();
> @@ -102,6 +105,7 @@ av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
> c->vsse[1] = vsse8_neon;
>
> c->vsse[4] = vsse_intra16_neon;
> + c->vsse[5] = vsse_intra8_neon;
>
> c->nsse[0] = nsse16_neon_wrapper;
> c->nsse[1] = nsse8_neon_wrapper;
> diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S
> index 386d2de0c5..82ff05d3f0 100644
> --- a/libavcodec/aarch64/me_cmp_neon.S
> +++ b/libavcodec/aarch64/me_cmp_neon.S
> @@ -1111,6 +1111,59 @@ function vsse_intra16_neon, export=1
> ret
> endfunc
>
> +function vsse_intra8_neon, export=1
> + // x0 unused
> + // x1 uint8_t *pix1
> + // x2 uint8_t *dummy
> + // x3 ptrdiff_t stride
> + // w4 int h
> +
> + ld1 {v0.8b}, [x1], x3
> + movi v16.4s, #0
> +
> + sub w4, w4, #1 // we need to make h-1 iterations
> + cmp w4, #3
> + b.lt 2f
> +
> +1:
> + // v = abs( pix1[0] - pix1[0 + stride] )
> + // score = sum( v * v )
> + ld1 {v1.8b}, [x1], x3
> + ld1 {v2.8b}, [x1], x3
> + uabd v30.8b, v0.8b, v1.8b
> + ld1 {v3.8b}, [x1], x3
> + umull v29.8h, v30.8b, v30.8b
> + uabd v27.8b, v1.8b, v2.8b
> + uadalp v16.4s, v29.8h
The scheduling here can be improved, please see the attached patch.
Other than that, it does look reasonable.
// Martin
[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #2: Type: text/x-diff; name=0001-aarch64-me_cmp-Improve-scheduling-in-vsse_intra8.patch, Size: 1957 bytes --]
From d0345bceaf013bea2023b1a02b372f2a64c6efaf Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Martin=20Storsj=C3=B6?= <martin@martin.st>
Date: Wed, 28 Sep 2022 11:53:55 +0300
Subject: [PATCH] aarch64: me_cmp: Improve scheduling in vsse_intra8
Before: Cortex A53 A72 A73
vsse_5_neon: 74.7 31.5 26.0
After:
vsse_5_neon: 62.7 32.5 25.7
---
libavcodec/aarch64/me_cmp_neon.S | 12 ++++++------
1 file changed, 6 insertions(+), 6 deletions(-)
diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S
index 4037953488..dc0b1e5f43 100644
--- a/libavcodec/aarch64/me_cmp_neon.S
+++ b/libavcodec/aarch64/me_cmp_neon.S
@@ -1113,11 +1113,11 @@ function vsse_intra8_neon, export=1
// x3 ptrdiff_t stride
// w4 int h
+ sub w4, w4, #1 // we need to make h-1 iterations
ld1 {v0.8b}, [x1], x3
+ cmp w4, #3
movi v16.4s, #0
- sub w4, w4, #1 // we need to make h-1 iterations
- cmp w4, #3
b.lt 2f
1:
@@ -1127,13 +1127,13 @@ function vsse_intra8_neon, export=1
ld1 {v2.8b}, [x1], x3
uabd v30.8b, v0.8b, v1.8b
ld1 {v3.8b}, [x1], x3
- umull v29.8h, v30.8b, v30.8b
uabd v27.8b, v1.8b, v2.8b
- uadalp v16.4s, v29.8h
- umull v26.8h, v27.8b, v27.8b
+ umull v29.8h, v30.8b, v30.8b
uabd v25.8b, v2.8b, v3.8b
- uadalp v16.4s, v26.8h
+ umull v26.8h, v27.8b, v27.8b
+ uadalp v16.4s, v29.8h
umull v24.8h, v25.8b, v25.8b
+ uadalp v16.4s, v26.8h
sub w4, w4, #3
uadalp v16.4s, v24.8h
cmp w4, #3
--
2.25.1
[-- Attachment #3: Type: text/plain, Size: 251 bytes --]
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [FFmpeg-devel] [PATCH 1/4] lavc/aarch64: Add neon implementation for pix_abs8 functions.
2022-09-26 9:15 [FFmpeg-devel] [PATCH 1/4] lavc/aarch64: Add neon implementation for pix_abs8 functions Grzegorz Bernacki
` (2 preceding siblings ...)
2022-09-26 9:15 ` [FFmpeg-devel] [PATCH 4/4] lavc/aarch64: Add neon implementation for vsse_intra8 Grzegorz Bernacki
@ 2022-09-28 9:06 ` Martin Storsjö
2022-09-28 12:22 ` Grzegorz Bernacki
3 siblings, 1 reply; 9+ messages in thread
From: Martin Storsjö @ 2022-09-28 9:06 UTC (permalink / raw)
To: Grzegorz Bernacki; +Cc: upstream, jswinney, hum, ffmpeg-devel, mw, spop
[-- Attachment #1: Type: text/plain, Size: 8077 bytes --]
On Mon, 26 Sep 2022, Grzegorz Bernacki wrote:
> Provide optimized implementation of pix_abs8 function for arm64.
>
> Performance comparison tests are shown below:
> pix_abs_1_1_c: 162.5
> pix_abs_1_1_neon: 27.0
> pix_abs_1_2_c: 174.0
> pix_abs_1_2_neon: 23.5
> pix_abs_1_3_c: 203.2
> pix_abs_1_3_neon: 34.7
>
> Benchmarks and tests are run with checkasm tool on AWS Graviton 3.
>
> Signed-off-by: Grzegorz Bernacki <gjb@semihalf.com>
> ---
> libavcodec/aarch64/me_cmp_init_aarch64.c | 9 ++
> libavcodec/aarch64/me_cmp_neon.S | 193 +++++++++++++++++++++++
> 2 files changed, 202 insertions(+)
I don't see any changes compared to the version you sent last week? If
reposting a patchset, please do mention what has changed - or ping the old
one. (I had it on my radar to review, but reviews of larger amounts of
code doesn't happen immediately, unfortunately.)
Overall, you seem to have mixed in tabs among regular spaces. Please do
fix that. (I would have kinda expected us to have a fate test that checks
this, but apparently we don't.)
> diff --git a/libavcodec/aarch64/me_cmp_init_aarch64.c b/libavcodec/aarch64/me_cmp_init_aarch64.c
> index e143f0816e..3459403ee5 100644
> --- a/libavcodec/aarch64/me_cmp_init_aarch64.c
> +++ b/libavcodec/aarch64/me_cmp_init_aarch64.c
> @@ -59,6 +59,12 @@ int pix_median_abs16_neon(MpegEncContext *v, const uint8_t *pix1, const uint8_t
> ptrdiff_t stride, int h);
> int pix_median_abs8_neon(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
> ptrdiff_t stride, int h);
> +int ff_pix_abs8_x2_neon(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
> + ptrdiff_t stride, int h);
> +int ff_pix_abs8_y2_neon(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
> + ptrdiff_t stride, int h);
> +int ff_pix_abs8_xy2_neon(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
> + ptrdiff_t stride, int h);
>
> av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
> {
> @@ -70,6 +76,9 @@ av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
> c->pix_abs[0][2] = ff_pix_abs16_y2_neon;
> c->pix_abs[0][3] = ff_pix_abs16_xy2_neon;
> c->pix_abs[1][0] = ff_pix_abs8_neon;
> + c->pix_abs[1][1] = ff_pix_abs8_x2_neon;
> + c->pix_abs[1][2] = ff_pix_abs8_y2_neon;
> + c->pix_abs[1][3] = ff_pix_abs8_xy2_neon;
In most mediums, the diff here shows that there's something off - tabs.
>
> c->sad[0] = ff_pix_abs16_neon;
> c->sad[1] = ff_pix_abs8_neon;
> diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S
> index 11af4849f9..e03c0c26cd 100644
> --- a/libavcodec/aarch64/me_cmp_neon.S
> +++ b/libavcodec/aarch64/me_cmp_neon.S
> @@ -119,6 +119,199 @@ function ff_pix_abs8_neon, export=1
> ret
> endfunc
>
> +function ff_pix_abs8_x2_neon, export=1
> + // x0 unused
> + // x1 uint8_t *pix1
> + // x2 uint8_t *pix2
> + // x3 ptrdiff_t stride
> + // w4 int h
> +
> + cmp w4, #4
> + movi v26.8h, #0
> + add x5, x2, #1 // pix2 + 1
> + b.lt 2f
> +
> +// make 4 iterations at once
> +1:
> + ld1 {v1.8b}, [x2], x3
> + ld1 {v2.8b}, [x5], x3
> + ld1 {v0.8b}, [x1], x3
> + ld1 {v4.8b}, [x2], x3
> + urhadd v30.8b, v1.8b, v2.8b
> + ld1 {v5.8b}, [x5], x3
> + uabal v26.8h, v0.8b, v30.8b
> + ld1 {v6.8b}, [x1], x3
> + urhadd v29.8b, v4.8b, v5.8b
> + ld1 {v7.8b}, [x2], x3
> + ld1 {v20.8b}, [x5], x3
> + uabal v26.8h, v6.8b, v29.8b
> + ld1 {v21.8b}, [x1], x3
> + urhadd v28.8b, v7.8b, v20.8b
> + ld1 {v22.8b}, [x2], x3
> + ld1 {v23.8b}, [x5], x3
> + uabal v26.8h, v21.8b, v28.8b
> + sub w4, w4, #4
> + ld1 {v24.8b}, [x1], x3
> + urhadd v27.8b, v22.8b, v23.8b
> + cmp w4, #4
> + uabal v26.8h, v24.8b, v27.8b
> +
> + b.ge 1b
> + cbz w4, 3f
> +
> +// iterate by one
> +2:
> + ld1 {v1.8b}, [x2], x3
> + ld1 {v2.8b}, [x5], x3
> + ld1 {v0.8b}, [x1], x3
> + urhadd v30.8b, v1.8b, v2.8b
> + subs w4, w4, #1
> + uabal v26.8h, v0.8b, v30.8b
> +
> + b.ne 2b
> +3:
> + uaddlv s20, v26.8h
> + fmov w0, s20
> +
> + ret
> +
> +endfunc
> +
> +function ff_pix_abs8_y2_neon, export=1
> + // x0 unused
> + // x1 uint8_t *pix1
> + // x2 uint8_t *pix2
> + // x3 ptrdiff_t stride
> + // w4 int h
> +
> + cmp w4, #4
> + movi v26.8h, #0
> + ld1 {v1.8b}, [x2], x3
> + b.lt 2f
> +
> +// make 4 iterations at once
> +1:
> + ld1 {v2.8b}, [x2], x3
> + ld1 {v0.8b}, [x1], x3
> + ld1 {v6.8b}, [x1], x3
> + urhadd v30.8b, v1.8b, v2.8b
Good thing that you wrote this better than the current
ff_pix_abs16_y2_neon, reusing the data from the previous round. But the
scheduling could be much improved here (why load v6 directly after v0 - x1
will delay it due to being updated in the previous instruction, and v6
won't be needed for a long time here yet).
By fixing the scheduling of this function (and getting rid of the
unnecessary mov instruction at the end of the unrolled loop), I got it
down from 74 cycles to 62 cycles on a Cortex A53. I'm attaching the fixup
I did, so you can apply it on top instead of having to describe it
verbally.
> + ld1 {v5.8b}, [x2], x3
> + ld1 {v21.8b}, [x1], x3
> + uabal v26.8h, v0.8b, v30.8b
> + urhadd v29.8b, v2.8b, v5.8b
> + ld1 {v20.8b}, [x2], x3
> + ld1 {v24.8b}, [x1], x3
> + uabal v26.8h, v6.8b, v29.8b
> + urhadd v28.8b, v5.8b, v20.8b
> + uabal v26.8h, v21.8b, v28.8b
> + ld1 {v23.8b}, [x2], x3
> + mov v1.8b, v23.8b
> + sub w4, w4, #4
> + urhadd v27.8b, v20.8b, v23.8b
> + cmp w4, #4
> + uabal v26.8h, v24.8b, v27.8b
> +
> + b.ge 1b
> + cbz w4, 3f
> +
> +// iterate by one
> +2:
> + ld1 {v0.8b}, [x1], x3
> + ld1 {v2.8b}, [x2], x3
> + urhadd v30.8b, v1.8b, v2.8b
> + subs w4, w4, #1
> + uabal v26.8h, v0.8b, v30.8b
> + mov v1.8b, v2.8b
> +
> + b.ne 2b
> +3:
> + uaddlv s20, v26.8h
> + fmov w0, s20
> +
> + ret
> +
> +endfunc
> +
> +function ff_pix_abs8_xy2_neon, export=1
> + // x0 unused
> + // x1 uint8_t *pix1
> + // x2 uint8_t *pix2
> + // x3 ptrdiff_t stride
> + // w4 int h
> +
> + movi v31.8h, #0
> + add x0, x2, 1 // pix2 + 1
> +
> + add x5, x2, x3 // pix2 + stride = pix3
> + cmp w4, #4
> + add x6, x5, 1 // pix3 + stride + 1
> +
> + b.lt 2f
> +
> + ld1 {v0.8b}, [x2], x3
> + ld1 {v1.8b}, [x0], x3
> + uaddl v2.8h, v0.8b, v1.8b
If we'd start out with h<4, we'd jump to the 2f label above, missing the
setup of v0-v2 here.
Other than that, the patch looks reasonable to me.
// Martin
[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #2: Type: text/x-diff; name=0001-aarch64-me_cmp-Improve-scheduling-in-ff_pix_abs8_y2_.patch, Size: 2017 bytes --]
From 7909432b3cffdcb31c0953279d862f820a738ec4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Martin=20Storsj=C3=B6?= <martin@martin.st>
Date: Wed, 28 Sep 2022 11:29:57 +0300
Subject: [PATCH 1/2] aarch64: me_cmp: Improve scheduling in
ff_pix_abs8_y2_neon
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Before: Cortex A53 A72 A73
pix_abs_1_2_neon: 73.7 31.0 25.7
After:
pix_abs_1_2_neon: 61.7 30.2 24.7
Signed-off-by: Martin Storsjö <martin@martin.st>
---
libavcodec/aarch64/me_cmp_neon.S | 13 ++++++-------
1 file changed, 6 insertions(+), 7 deletions(-)
diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S
index a900ae2190..7f2b7c754f 100644
--- a/libavcodec/aarch64/me_cmp_neon.S
+++ b/libavcodec/aarch64/me_cmp_neon.S
@@ -193,21 +193,20 @@ function ff_pix_abs8_y2_neon, export=1
1:
ld1 {v2.8b}, [x2], x3
ld1 {v0.8b}, [x1], x3
- ld1 {v6.8b}, [x1], x3
urhadd v30.8b, v1.8b, v2.8b
ld1 {v5.8b}, [x2], x3
- ld1 {v21.8b}, [x1], x3
+ ld1 {v6.8b}, [x1], x3
uabal v26.8h, v0.8b, v30.8b
urhadd v29.8b, v2.8b, v5.8b
ld1 {v20.8b}, [x2], x3
- ld1 {v24.8b}, [x1], x3
+ ld1 {v21.8b}, [x1], x3
uabal v26.8h, v6.8b, v29.8b
urhadd v28.8b, v5.8b, v20.8b
- uabal v26.8h, v21.8b, v28.8b
- ld1 {v23.8b}, [x2], x3
- mov v1.8b, v23.8b
+ ld1 {v1.8b}, [x2], x3
+ ld1 {v24.8b}, [x1], x3
+ urhadd v27.8b, v20.8b, v1.8b
sub w4, w4, #4
- urhadd v27.8b, v20.8b, v23.8b
+ uabal v26.8h, v21.8b, v28.8b
cmp w4, #4
uabal v26.8h, v24.8b, v27.8b
--
2.25.1
[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #3: Type: text/x-diff; name=0002-aarch64-me_cmp-Fix-up-the-prologue-of-ff_pix_abs8_xy.patch, Size: 1083 bytes --]
From dfc91453aa3608b6b6b7c0e832292668150fd0d5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Martin=20Storsj=C3=B6?= <martin@martin.st>
Date: Wed, 28 Sep 2022 11:43:02 +0300
Subject: [PATCH 2/2] aarch64: me_cmp: Fix up the prologue of
ff_pix_abs8_xy2_neon
This initializes things properly if this were to be called with
h < 4.
---
libavcodec/aarch64/me_cmp_neon.S | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S
index 7f2b7c754f..4037953488 100644
--- a/libavcodec/aarch64/me_cmp_neon.S
+++ b/libavcodec/aarch64/me_cmp_neon.S
@@ -245,12 +245,12 @@ function ff_pix_abs8_xy2_neon, export=1
cmp w4, #4
add x6, x5, 1 // pix3 + stride + 1
- b.lt 2f
-
ld1 {v0.8b}, [x2], x3
ld1 {v1.8b}, [x0], x3
uaddl v2.8h, v0.8b, v1.8b
+ b.lt 2f
+
// make 4 iterations at once
1:
ld1 {v4.8b}, [x5], x3
--
2.25.1
[-- Attachment #4: Type: text/plain, Size: 251 bytes --]
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [FFmpeg-devel] [PATCH 1/4] lavc/aarch64: Add neon implementation for pix_abs8 functions.
2022-09-28 9:06 ` [FFmpeg-devel] [PATCH 1/4] lavc/aarch64: Add neon implementation for pix_abs8 functions Martin Storsjö
@ 2022-09-28 12:22 ` Grzegorz Bernacki
0 siblings, 0 replies; 9+ messages in thread
From: Grzegorz Bernacki @ 2022-09-28 12:22 UTC (permalink / raw)
To: Martin Storsjö; +Cc: upstream, jswinney, hum, ffmpeg-devel, mw, spop
Hi Martin,
I resent the patchset, because the first try did not reach ffmpeg-devel
maillist. I apologize, I should have mentioned about that in cover letter.
Thanks a lot for your review, I will apply the changes and send v2 soon.
thanks,
grzegorz
śr., 28 wrz 2022 o 11:07 Martin Storsjö <martin@martin.st> napisał(a):
> On Mon, 26 Sep 2022, Grzegorz Bernacki wrote:
>
> > Provide optimized implementation of pix_abs8 function for arm64.
> >
> > Performance comparison tests are shown below:
> > pix_abs_1_1_c: 162.5
> > pix_abs_1_1_neon: 27.0
> > pix_abs_1_2_c: 174.0
> > pix_abs_1_2_neon: 23.5
> > pix_abs_1_3_c: 203.2
> > pix_abs_1_3_neon: 34.7
> >
> > Benchmarks and tests are run with checkasm tool on AWS Graviton 3.
> >
> > Signed-off-by: Grzegorz Bernacki <gjb@semihalf.com>
> > ---
> > libavcodec/aarch64/me_cmp_init_aarch64.c | 9 ++
> > libavcodec/aarch64/me_cmp_neon.S | 193 +++++++++++++++++++++++
> > 2 files changed, 202 insertions(+)
>
> I don't see any changes compared to the version you sent last week? If
> reposting a patchset, please do mention what has changed - or ping the old
> one. (I had it on my radar to review, but reviews of larger amounts of
> code doesn't happen immediately, unfortunately.)
>
> Overall, you seem to have mixed in tabs among regular spaces. Please do
> fix that. (I would have kinda expected us to have a fate test that checks
> this, but apparently we don't.)
>
> > diff --git a/libavcodec/aarch64/me_cmp_init_aarch64.c
> b/libavcodec/aarch64/me_cmp_init_aarch64.c
> > index e143f0816e..3459403ee5 100644
> > --- a/libavcodec/aarch64/me_cmp_init_aarch64.c
> > +++ b/libavcodec/aarch64/me_cmp_init_aarch64.c
> > @@ -59,6 +59,12 @@ int pix_median_abs16_neon(MpegEncContext *v, const
> uint8_t *pix1, const uint8_t
> > ptrdiff_t stride, int h);
> > int pix_median_abs8_neon(MpegEncContext *v, const uint8_t *pix1, const
> uint8_t *pix2,
> > ptrdiff_t stride, int h);
> > +int ff_pix_abs8_x2_neon(MpegEncContext *v, const uint8_t *pix1, const
> uint8_t *pix2,
> > + ptrdiff_t stride, int h);
> > +int ff_pix_abs8_y2_neon(MpegEncContext *v, const uint8_t *pix1, const
> uint8_t *pix2,
> > + ptrdiff_t stride, int h);
> > +int ff_pix_abs8_xy2_neon(MpegEncContext *v, const uint8_t *pix1, const
> uint8_t *pix2,
> > + ptrdiff_t stride, int h);
> >
> > av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext
> *avctx)
> > {
> > @@ -70,6 +76,9 @@ av_cold void ff_me_cmp_init_aarch64(MECmpContext *c,
> AVCodecContext *avctx)
> > c->pix_abs[0][2] = ff_pix_abs16_y2_neon;
> > c->pix_abs[0][3] = ff_pix_abs16_xy2_neon;
> > c->pix_abs[1][0] = ff_pix_abs8_neon;
> > + c->pix_abs[1][1] = ff_pix_abs8_x2_neon;
> > + c->pix_abs[1][2] = ff_pix_abs8_y2_neon;
> > + c->pix_abs[1][3] = ff_pix_abs8_xy2_neon;
>
> In most mediums, the diff here shows that there's something off - tabs.
>
> >
> > c->sad[0] = ff_pix_abs16_neon;
> > c->sad[1] = ff_pix_abs8_neon;
> > diff --git a/libavcodec/aarch64/me_cmp_neon.S
> b/libavcodec/aarch64/me_cmp_neon.S
> > index 11af4849f9..e03c0c26cd 100644
> > --- a/libavcodec/aarch64/me_cmp_neon.S
> > +++ b/libavcodec/aarch64/me_cmp_neon.S
> > @@ -119,6 +119,199 @@ function ff_pix_abs8_neon, export=1
> > ret
> > endfunc
> >
> > +function ff_pix_abs8_x2_neon, export=1
> > + // x0 unused
> > + // x1 uint8_t *pix1
> > + // x2 uint8_t *pix2
> > + // x3 ptrdiff_t stride
> > + // w4 int h
> > +
> > + cmp w4, #4
> > + movi v26.8h, #0
> > + add x5, x2, #1 // pix2 + 1
> > + b.lt 2f
> > +
> > +// make 4 iterations at once
> > +1:
> > + ld1 {v1.8b}, [x2], x3
> > + ld1 {v2.8b}, [x5], x3
> > + ld1 {v0.8b}, [x1], x3
> > + ld1 {v4.8b}, [x2], x3
> > + urhadd v30.8b, v1.8b, v2.8b
> > + ld1 {v5.8b}, [x5], x3
> > + uabal v26.8h, v0.8b, v30.8b
> > + ld1 {v6.8b}, [x1], x3
> > + urhadd v29.8b, v4.8b, v5.8b
> > + ld1 {v7.8b}, [x2], x3
> > + ld1 {v20.8b}, [x5], x3
> > + uabal v26.8h, v6.8b, v29.8b
> > + ld1 {v21.8b}, [x1], x3
> > + urhadd v28.8b, v7.8b, v20.8b
> > + ld1 {v22.8b}, [x2], x3
> > + ld1 {v23.8b}, [x5], x3
> > + uabal v26.8h, v21.8b, v28.8b
> > + sub w4, w4, #4
> > + ld1 {v24.8b}, [x1], x3
> > + urhadd v27.8b, v22.8b, v23.8b
> > + cmp w4, #4
> > + uabal v26.8h, v24.8b, v27.8b
> > +
> > + b.ge 1b
> > + cbz w4, 3f
> > +
> > +// iterate by one
> > +2:
> > + ld1 {v1.8b}, [x2], x3
> > + ld1 {v2.8b}, [x5], x3
> > + ld1 {v0.8b}, [x1], x3
> > + urhadd v30.8b, v1.8b, v2.8b
> > + subs w4, w4, #1
> > + uabal v26.8h, v0.8b, v30.8b
> > +
> > + b.ne 2b
> > +3:
> > + uaddlv s20, v26.8h
> > + fmov w0, s20
> > +
> > + ret
> > +
> > +endfunc
> > +
> > +function ff_pix_abs8_y2_neon, export=1
> > + // x0 unused
> > + // x1 uint8_t *pix1
> > + // x2 uint8_t *pix2
> > + // x3 ptrdiff_t stride
> > + // w4 int h
> > +
> > + cmp w4, #4
> > + movi v26.8h, #0
> > + ld1 {v1.8b}, [x2], x3
> > + b.lt 2f
> > +
> > +// make 4 iterations at once
> > +1:
> > + ld1 {v2.8b}, [x2], x3
> > + ld1 {v0.8b}, [x1], x3
> > + ld1 {v6.8b}, [x1], x3
> > + urhadd v30.8b, v1.8b, v2.8b
>
> Good thing that you wrote this better than the current
> ff_pix_abs16_y2_neon, reusing the data from the previous round. But the
> scheduling could be much improved here (why load v6 directly after v0 - x1
> will delay it due to being updated in the previous instruction, and v6
> won't be needed for a long time here yet).
>
> By fixing the scheduling of this function (and getting rid of the
> unnecessary mov instruction at the end of the unrolled loop), I got it
> down from 74 cycles to 62 cycles on a Cortex A53. I'm attaching the fixup
> I did, so you can apply it on top instead of having to describe it
> verbally.
>
> > + ld1 {v5.8b}, [x2], x3
> > + ld1 {v21.8b}, [x1], x3
> > + uabal v26.8h, v0.8b, v30.8b
> > + urhadd v29.8b, v2.8b, v5.8b
> > + ld1 {v20.8b}, [x2], x3
> > + ld1 {v24.8b}, [x1], x3
> > + uabal v26.8h, v6.8b, v29.8b
> > + urhadd v28.8b, v5.8b, v20.8b
> > + uabal v26.8h, v21.8b, v28.8b
> > + ld1 {v23.8b}, [x2], x3
> > + mov v1.8b, v23.8b
> > + sub w4, w4, #4
> > + urhadd v27.8b, v20.8b, v23.8b
> > + cmp w4, #4
> > + uabal v26.8h, v24.8b, v27.8b
> > +
> > + b.ge 1b
> > + cbz w4, 3f
> > +
> > +// iterate by one
> > +2:
> > + ld1 {v0.8b}, [x1], x3
> > + ld1 {v2.8b}, [x2], x3
> > + urhadd v30.8b, v1.8b, v2.8b
> > + subs w4, w4, #1
> > + uabal v26.8h, v0.8b, v30.8b
> > + mov v1.8b, v2.8b
> > +
> > + b.ne 2b
> > +3:
> > + uaddlv s20, v26.8h
> > + fmov w0, s20
> > +
> > + ret
> > +
> > +endfunc
> > +
> > +function ff_pix_abs8_xy2_neon, export=1
> > + // x0 unused
> > + // x1 uint8_t *pix1
> > + // x2 uint8_t *pix2
> > + // x3 ptrdiff_t stride
> > + // w4 int h
> > +
> > + movi v31.8h, #0
> > + add x0, x2, 1 // pix2 + 1
> > +
> > + add x5, x2, x3 // pix2 + stride = pix3
> > + cmp w4, #4
> > + add x6, x5, 1 // pix3 + stride + 1
> > +
> > + b.lt 2f
> > +
> > + ld1 {v0.8b}, [x2], x3
> > + ld1 {v1.8b}, [x0], x3
> > + uaddl v2.8h, v0.8b, v1.8b
>
> If we'd start out with h<4, we'd jump to the 2f label above, missing the
> setup of v0-v2 here.
>
> Other than that, the patch looks reasonable to me.
>
> // Martin
>
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 9+ messages in thread