* [FFmpeg-devel] [PATCH 0/5] Add neon implementation for me_cmp functions
@ 2022-07-15 8:02 Hubert Mazur
2022-07-15 8:02 ` [FFmpeg-devel] [PATCH 1/5] lavc/aarch64: Add neon implementation for sse16 Hubert Mazur
` (4 more replies)
0 siblings, 5 replies; 25+ messages in thread
From: Hubert Mazur @ 2022-07-15 8:02 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: gjb, upstream, jswinney, Hubert Mazur, martin, mw, spop
Add arm64 neon implementation for the following functions from
motion estimation. All functions were tested and benchmarked on
AWS Graviton 3 instances.
Hubert Mazur (5):
lavc/aarch64: Add neon implementation for sse16
lavc/aarch64: Add neon implementation for sse4
lavc/aarch64: Add neon implementation for pix_abs16_y2
lavc/aarch64: Add neon implementation for sse8
lavc/aarch64: Add neon implementation for pix_abs8
libavcodec/aarch64/me_cmp_init_aarch64.c | 17 ++
libavcodec/aarch64/me_cmp_neon.S | 346 +++++++++++++++++++++++
2 files changed, 363 insertions(+)
--
2.34.1
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 25+ messages in thread
* [FFmpeg-devel] [PATCH 1/5] lavc/aarch64: Add neon implementation for sse16
2022-07-15 8:02 [FFmpeg-devel] [PATCH 0/5] Add neon implementation for me_cmp functions Hubert Mazur
@ 2022-07-15 8:02 ` Hubert Mazur
2022-07-25 11:12 ` [FFmpeg-devel] [PATCH] " Hubert Mazur
2022-07-15 8:02 ` [FFmpeg-devel] [PATCH 2/5] lavc/aarch64: Add neon implementation for sse4 Hubert Mazur
` (3 subsequent siblings)
4 siblings, 1 reply; 25+ messages in thread
From: Hubert Mazur @ 2022-07-15 8:02 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: gjb, upstream, jswinney, Hubert Mazur, martin, mw, spop
Provide neon implementation for sse16 function.
Performance comparison tests are shown below.
- sse_0_c: 273.0
- sse_0_neon: 48.2
Benchmarks and tests run with checkasm tool on AWS Graviton 3.
Signed-off-by: Hubert Mazur <hum@semihalf.com>
---
libavcodec/aarch64/me_cmp_init_aarch64.c | 4 ++
libavcodec/aarch64/me_cmp_neon.S | 82 ++++++++++++++++++++++++
2 files changed, 86 insertions(+)
diff --git a/libavcodec/aarch64/me_cmp_init_aarch64.c b/libavcodec/aarch64/me_cmp_init_aarch64.c
index 136b008eb7..3ff5767bd0 100644
--- a/libavcodec/aarch64/me_cmp_init_aarch64.c
+++ b/libavcodec/aarch64/me_cmp_init_aarch64.c
@@ -30,6 +30,9 @@ int ff_pix_abs16_xy2_neon(MpegEncContext *s, uint8_t *blk1, uint8_t *blk2,
int ff_pix_abs16_x2_neon(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
ptrdiff_t stride, int h);
+int sse16_neon(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+ ptrdiff_t stride, int h);
+
av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
{
int cpu_flags = av_get_cpu_flags();
@@ -40,5 +43,6 @@ av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
c->pix_abs[0][3] = ff_pix_abs16_xy2_neon;
c->sad[0] = ff_pix_abs16_neon;
+ c->sse[0] = sse16_neon;
}
}
diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S
index e49d049fc2..88cd335443 100644
--- a/libavcodec/aarch64/me_cmp_neon.S
+++ b/libavcodec/aarch64/me_cmp_neon.S
@@ -278,3 +278,85 @@ function ff_pix_abs16_x2_neon, export=1
ret
endfunc
+
+function sse16_neon, export=1
+ // x0 - unused
+ // x1 - pix1
+ // x2 - pix2
+ // x3 - stride
+ // w4 - h
+
+ cmp w4, #4
+ movi d18, #0
+ b.lt 2f
+
+// make 4 iterations at once
+1:
+
+ // res = abs(pix1[0] - pix2[0])
+ // res * res
+
+ ld1 {v0.16b}, [x1], x3
+ ld1 {v1.16b}, [x2], x3
+ uabd v30.16b, v0.16b, v1.16b
+ ld1 {v2.16b}, [x1], x3
+ umull v29.8h, v30.8b, v30.8b
+ ld1 {v3.16b}, [x2], x3
+ umull2 v28.8h, v30.16b, v30.16b
+ uabd v27.16b, v2.16b, v3.16b
+ uaddlp v17.4s, v29.8h
+ umull v26.8h, v27.8b, v27.8b
+ umull2 v25.8h, v27.16b, v27.16b
+ ld1 {v4.16b}, [x1], x3
+ uadalp v17.4s, v26.8h
+ ld1 {v5.16b}, [x2], x3
+ uadalp v17.4s, v25.8h
+ uabd v24.16b, v4.16b, v5.16b
+ ld1 {v6.16b}, [x1], x3
+ umull v23.8h, v24.8b, v24.8b
+ uadalp v17.4s, v23.8h
+ umull2 v22.8h, v24.16b, v24.16b
+ uadalp v17.4s, v22.8h
+ ld1 {v7.16b}, [x2], x3
+ uadalp v17.4s, v28.8h
+ uabd v21.16b, v6.16b, v7.16b
+ umull v20.8h, v21.8b, v21.8b
+ uadalp v17.4s, v20.8h
+ umull2 v19.8h, v21.16b, v21.16b
+ uadalp v17.4s, v19.8h
+
+ sub w4, w4, #4
+ uaddlv d16, v17.4s
+ cmp w4, #4
+ add d18, d18, d16
+
+ b.ge 1b
+
+ cbnz w4, 2f
+ fmov w0, s18
+
+ ret
+
+// iterate by one
+2:
+
+ ld1 {v0.16b}, [x1], x3
+ ld1 {v1.16b}, [x2], x3
+
+ uabd v30.16b, v0.16b, v1.16b
+ umull v29.8h, v0.8b, v1.8b
+ umull2 v28.8h, v0.16b, v1.16b
+ uaddlp v17.4s, v29.8h
+ uadalp v17.4s, v28.8h
+
+
+ subs w4, w4, #1
+ uaddlv d16, v17.4s
+ add d18, d18, d16
+
+ b.ne 2b
+ fmov w0, s18
+
+ ret
+
+endfunc
--
2.34.1
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 25+ messages in thread
* [FFmpeg-devel] [PATCH 2/5] lavc/aarch64: Add neon implementation for sse4
2022-07-15 8:02 [FFmpeg-devel] [PATCH 0/5] Add neon implementation for me_cmp functions Hubert Mazur
2022-07-15 8:02 ` [FFmpeg-devel] [PATCH 1/5] lavc/aarch64: Add neon implementation for sse16 Hubert Mazur
@ 2022-07-15 8:02 ` Hubert Mazur
2022-07-21 21:43 ` Martin Storsjö
` (2 more replies)
2022-07-15 8:02 ` [FFmpeg-devel] [PATCH 3/5] lavc/aarch64: Add neon implementation for pix_abs16_y2 Hubert Mazur
` (2 subsequent siblings)
4 siblings, 3 replies; 25+ messages in thread
From: Hubert Mazur @ 2022-07-15 8:02 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: gjb, upstream, jswinney, Hubert Mazur, martin, mw, spop
Provide neon implementation for sse4 function.
Performance comparison tests are shown below.
- sse_2_c: 74.0
- sse_2_neon: 24.0
Benchmarks and tests are run with checkasm tool on AWS Graviton 3.
Signed-off-by: Hubert Mazur <hum@semihalf.com>
---
libavcodec/aarch64/me_cmp_init_aarch64.c | 3 ++
libavcodec/aarch64/me_cmp_neon.S | 65 ++++++++++++++++++++++++
2 files changed, 68 insertions(+)
diff --git a/libavcodec/aarch64/me_cmp_init_aarch64.c b/libavcodec/aarch64/me_cmp_init_aarch64.c
index 3ff5767bd0..72a2062e7e 100644
--- a/libavcodec/aarch64/me_cmp_init_aarch64.c
+++ b/libavcodec/aarch64/me_cmp_init_aarch64.c
@@ -32,6 +32,8 @@ int ff_pix_abs16_x2_neon(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
int sse16_neon(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
ptrdiff_t stride, int h);
+int sse4_neon(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+ ptrdiff_t stride, int h);
av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
{
@@ -44,5 +46,6 @@ av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
c->sad[0] = ff_pix_abs16_neon;
c->sse[0] = sse16_neon;
+ c->sse[2] = sse4_neon;
}
}
diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S
index 88cd335443..bacf151314 100644
--- a/libavcodec/aarch64/me_cmp_neon.S
+++ b/libavcodec/aarch64/me_cmp_neon.S
@@ -360,3 +360,68 @@ function sse16_neon, export=1
ret
endfunc
+
+function sse4_neon, export=1
+ // x0 - unused
+ // x1 - pix1
+ // x2 - pix2
+ // x3 - stride
+ // w4 - h
+
+ movi d18, #0
+ movi d17, #0
+ cmp w4, #4
+ b.le 2f
+
+// make 4 iterations at once
+1:
+
+ // res = abs(pix1[0] - pix2[0])
+ // res * res
+
+ ld1 {v0.4b}, [x1], x3
+ ld1 {v1.4b}, [x2], x3
+ uabdl v30.8h, v0.4b, v1.4b
+ ld1 {v2.4b}, [x1], x3
+ ld1 {v3.4b}, [x2], x3
+ umull v16.4s, v30.4h, v30.4h
+ uabdl v29.8h, v2.4b, v3.4b
+ ld1 {v4.4b}, [x1], x3
+ ld1 {v5.4b}, [x2], x3
+ umlal v16.4s, v29.4h, v29.4h
+ uabdl v28.8h, v4.4b, v5.4b
+ ld1 {v6.4b}, [x1], x3
+ ld1 {v7.4b}, [x2], x3
+ umlal v16.4s, v28.4h, v28.4h
+ uabdl v27.8h, v6.4b, v7.4b
+ umlal v16.4s, v27.4h, v27.4h
+
+ uaddlv d17, v16.4s
+ add d18, d18, d17
+
+ sub w4, w4, #4
+ cmp w4, #4
+ b.ge 1b
+
+ cbnz w4, 2f
+ fmov w0, s18
+
+ ret
+
+// iterate by one
+2:
+ ld1 {v0.4b}, [x1], x3
+ ld1 {v1.4b}, [x2], x3
+ uabdl v30.8h, v0.4b, v1.4b
+ umull v16.4s, v30.4h, v30.4h
+
+ uaddlv d17, v16.4s
+ add d18, d18, d17
+
+ subs w4, w4, #1
+ b.ne 2b
+ fmov w0, s18
+
+ ret
+
+endfunc
--
2.34.1
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 25+ messages in thread
* [FFmpeg-devel] [PATCH 3/5] lavc/aarch64: Add neon implementation for pix_abs16_y2
2022-07-15 8:02 [FFmpeg-devel] [PATCH 0/5] Add neon implementation for me_cmp functions Hubert Mazur
2022-07-15 8:02 ` [FFmpeg-devel] [PATCH 1/5] lavc/aarch64: Add neon implementation for sse16 Hubert Mazur
2022-07-15 8:02 ` [FFmpeg-devel] [PATCH 2/5] lavc/aarch64: Add neon implementation for sse4 Hubert Mazur
@ 2022-07-15 8:02 ` Hubert Mazur
2022-07-25 11:17 ` [FFmpeg-devel] [PATCH] " Hubert Mazur
2022-07-15 8:02 ` [FFmpeg-devel] [PATCH 4/5] lavc/aarch64: Add neon implementation for sse8 Hubert Mazur
2022-07-15 8:02 ` [FFmpeg-devel] [PATCH 5/5] lavc/aarch64: Add neon implementation for pix_abs8 Hubert Mazur
4 siblings, 1 reply; 25+ messages in thread
From: Hubert Mazur @ 2022-07-15 8:02 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: gjb, upstream, jswinney, Hubert Mazur, martin, mw, spop
Provide optimized implementation of pix_abs16_y2 function for arm64.
Performance comparison tests are shown below.
pix_abs_0_2_c: 308.5
pix_abs_0_2_neon: 39.2
Benchmarks and tests run with checkasm tool on AWS Graviton 3.
Signed-off-by: Hubert Mazur <hum@semihalf.com>
---
libavcodec/aarch64/me_cmp_init_aarch64.c | 3 +
libavcodec/aarch64/me_cmp_neon.S | 74 ++++++++++++++++++++++++
2 files changed, 77 insertions(+)
diff --git a/libavcodec/aarch64/me_cmp_init_aarch64.c b/libavcodec/aarch64/me_cmp_init_aarch64.c
index 72a2062e7e..07d62cc1e5 100644
--- a/libavcodec/aarch64/me_cmp_init_aarch64.c
+++ b/libavcodec/aarch64/me_cmp_init_aarch64.c
@@ -29,6 +29,8 @@ int ff_pix_abs16_xy2_neon(MpegEncContext *s, uint8_t *blk1, uint8_t *blk2,
ptrdiff_t stride, int h);
int ff_pix_abs16_x2_neon(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
ptrdiff_t stride, int h);
+int ff_pix_abs16_y2_neon(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+ ptrdiff_t stride, int h);
int sse16_neon(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
ptrdiff_t stride, int h);
@@ -42,6 +44,7 @@ av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
if (have_neon(cpu_flags)) {
c->pix_abs[0][0] = ff_pix_abs16_neon;
c->pix_abs[0][1] = ff_pix_abs16_x2_neon;
+ c->pix_abs[0][2] = ff_pix_abs16_y2_neon;
c->pix_abs[0][3] = ff_pix_abs16_xy2_neon;
c->sad[0] = ff_pix_abs16_neon;
diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S
index bacf151314..858833b0ae 100644
--- a/libavcodec/aarch64/me_cmp_neon.S
+++ b/libavcodec/aarch64/me_cmp_neon.S
@@ -425,3 +425,77 @@ function sse4_neon, export=1
ret
endfunc
+
+function ff_pix_abs16_y2_neon, export=1
+ // x0 unused
+ // x1 uint8_t *pix1
+ // x2 uint8_t *pix2
+ // x3 ptrdiff_t stride
+ // x4 int h
+ // x5 uint8_t *pix2 + stride
+
+ // initialize buffers
+ movi d18, #0
+ add x5, x2, x3
+ cmp w4, #4
+ b.lt 2f
+
+// make 4 iterations at once
+1:
+
+ // abs(pix1[0], avg2(pix2[0], pix2[0 + stride]))
+ // avg2(a, b) = (((a) + (b) + 1) >> 1)
+ // abs(x) = (x < 0 ? (-x) : (x))
+
+ ld1 {v1.16b}, [x2], x3
+ ld1 {v2.16b}, [x5], x3
+ urhadd v30.16b, v1.16b, v2.16b
+ ld1 {v0.16b}, [x1], x3
+ uabdl v29.8h, v0.8b, v30.8b
+ ld1 {v4.16b}, [x2], x3
+ uabdl2 v28.8h, v0.16b, v30.16b
+ ld1 {v5.16b}, [x5], x3
+ urhadd v27.16b, v4.16b, v5.16b
+ ld1 {v3.16b}, [x1], x3
+ uabal v29.8h, v3.8b, v27.8b
+ ld1 {v7.16b}, [x2], x3
+ uabal2 v28.8h, v3.16b, v27.16b
+ ld1 {v20.16b}, [x5], x3
+ urhadd v26.16b, v7.16b, v20.16b
+ ld1 {v6.16b}, [x1], x3
+ uabal v29.8h, v6.8b, v26.8b
+ ld1 {v22.16b}, [x2], x3
+ uabal2 v28.8h, v6.16b, v26.16b
+ ld1 {v23.16b}, [x5], x3
+ urhadd v25.16b, v22.16b, v23.16b
+ ld1 {v21.16b}, [x1], x3
+ uabal v29.8h, v21.8b, v25.8b
+ uabal2 v28.8h, v21.16b, v25.16b
+
+ add v29.8h, v29.8h, v28.8h
+ sub w4, w4, #4
+ uaddlv s16, v29.8h
+ cmp w4, #4
+ add d18, d18, d16
+ b.ge 1b
+ cbz w4, 3f
+
+// iterate by one
+2:
+
+ ld1 {v0.16b}, [x1], x3
+ ld1 {v1.16b}, [x2], x3
+ urhadd v30.16b, v1.16b, v2.16b
+ ld1 {v2.16b}, [x5], x3
+ uabd v30.16b, v30.16b, v30.16b
+
+ uaddlv h17, v30.16b
+ subs w4, w4, #1
+ add d18, d18, d17
+ b.ne 2b
+
+3:
+ fmov w0, s18
+
+ ret
+endfunc
--
2.34.1
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 25+ messages in thread
* [FFmpeg-devel] [PATCH 4/5] lavc/aarch64: Add neon implementation for sse8
2022-07-15 8:02 [FFmpeg-devel] [PATCH 0/5] Add neon implementation for me_cmp functions Hubert Mazur
` (2 preceding siblings ...)
2022-07-15 8:02 ` [FFmpeg-devel] [PATCH 3/5] lavc/aarch64: Add neon implementation for pix_abs16_y2 Hubert Mazur
@ 2022-07-15 8:02 ` Hubert Mazur
2022-07-25 11:18 ` [FFmpeg-devel] [PATCH] " Hubert Mazur
2022-07-15 8:02 ` [FFmpeg-devel] [PATCH 5/5] lavc/aarch64: Add neon implementation for pix_abs8 Hubert Mazur
4 siblings, 1 reply; 25+ messages in thread
From: Hubert Mazur @ 2022-07-15 8:02 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: gjb, upstream, jswinney, Hubert Mazur, martin, mw, spop
Provide optimized implementation of sse8 function for arm64.
Performance comparison tests are shown below.
- sse_1_c: 133.0
- sse_1_neon: 36.7
Benchmarks and tests run with checkasm tool on AWS Graviton 3.
Signed-off-by: Hubert Mazur <hum@semihalf.com>
---
libavcodec/aarch64/me_cmp_init_aarch64.c | 3 +
libavcodec/aarch64/me_cmp_neon.S | 72 ++++++++++++++++++++++++
2 files changed, 75 insertions(+)
diff --git a/libavcodec/aarch64/me_cmp_init_aarch64.c b/libavcodec/aarch64/me_cmp_init_aarch64.c
index 07d62cc1e5..89c817990c 100644
--- a/libavcodec/aarch64/me_cmp_init_aarch64.c
+++ b/libavcodec/aarch64/me_cmp_init_aarch64.c
@@ -34,6 +34,8 @@ int ff_pix_abs16_y2_neon(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
int sse16_neon(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
ptrdiff_t stride, int h);
+int sse8_neon(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+ ptrdiff_t stride, int h);
int sse4_neon(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
ptrdiff_t stride, int h);
@@ -49,6 +51,7 @@ av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
c->sad[0] = ff_pix_abs16_neon;
c->sse[0] = sse16_neon;
+ c->sse[1] = sse8_neon;
c->sse[2] = sse4_neon;
}
}
diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S
index 858833b0ae..c78e26df4b 100644
--- a/libavcodec/aarch64/me_cmp_neon.S
+++ b/libavcodec/aarch64/me_cmp_neon.S
@@ -361,6 +361,78 @@ function sse16_neon, export=1
endfunc
+function sse8_neon, export=1
+ // x0 - unused
+ // x1 - pix1
+ // x2 - pix2
+ // x3 - stride
+ // w4 - h
+
+ movi d18, #0
+ cmp w4, #4
+ b.le 2f
+
+// make 4 iterations at once
+1:
+
+ // res = abs(pix1[0] - pix2[0])
+ // res * res
+
+ ld1 {v0.8b}, [x1], x3
+ ld1 {v1.8b}, [x2], x3
+ uabdl v30.8h, v0.8b, v1.8b
+ umull v21.4s, v30.4h, v30.4h
+ ld1 {v2.8b}, [x1], x3
+ umull2 v20.4s, v30.8h, v30.8h
+ ld1 {v3.8b}, [x2], x3
+ uabdl v29.8h, v2.8b, v3.8b
+ ld1 {v4.8b}, [x1], x3
+ umlal v21.4s, v29.4h, v29.4h
+ ld1 {v5.8b}, [x2], x3
+ umlal2 v20.4s, v29.8h, v29.8h
+ uabdl v28.8h, v4.8b, v5.8b
+ ld1 {v6.8b}, [x1], x3
+ umlal v21.4s, v28.4h, v28.4h
+ ld1 {v7.8b}, [x2], x3
+ umlal2 v20.4s, v28.8h, v28.8h
+ uabdl v27.8h, v6.8b, v7.8b
+ umlal v21.4s, v27.4h, v27.4h
+ umlal2 v20.4s, v27.8h, v27.8h
+
+ add v21.4s, v21.4s, v20.4s
+ sub w4, w4, #4
+ uaddlv d17, v21.4s
+ add d18, d18, d17
+ cmp w4, #4
+ b.ge 1b
+
+ cbnz w4, 2f
+ fmov w0, s18
+
+ ret
+
+// iterate by one
+2:
+ ld1 {v0.8b}, [x1], x3
+ ld1 {v1.8b}, [x2], x3
+
+ uabdl v30.8h, v0.8b, v1.8b
+ umull v21.4s, v30.4h, v30.4h
+ umull2 v20.4s, v30.8h, v30.8h
+
+ subs w4, w4, #1
+
+ uaddlv d17, v21.4s
+ add d18, d18, d17
+ uaddlv d17, v20.4S
+ add d18, d18, d17
+
+ b.ne 2b
+ fmov w0, s18
+
+ ret
+endfunc
+
function sse4_neon, export=1
// x0 - unused
// x1 - pix1
--
2.34.1
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 25+ messages in thread
* [FFmpeg-devel] [PATCH 5/5] lavc/aarch64: Add neon implementation for pix_abs8
2022-07-15 8:02 [FFmpeg-devel] [PATCH 0/5] Add neon implementation for me_cmp functions Hubert Mazur
` (3 preceding siblings ...)
2022-07-15 8:02 ` [FFmpeg-devel] [PATCH 4/5] lavc/aarch64: Add neon implementation for sse8 Hubert Mazur
@ 2022-07-15 8:02 ` Hubert Mazur
2022-07-25 11:21 ` [FFmpeg-devel] [PATCH] " Hubert Mazur
4 siblings, 1 reply; 25+ messages in thread
From: Hubert Mazur @ 2022-07-15 8:02 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: gjb, upstream, jswinney, Hubert Mazur, martin, mw, spop
Provide optimized implementation of pix_abs8 function for arm64.
Performance comparison tests are shown below.
- pix_abs_1_0_c: 105.2
- pix_abs_1_0_neon: 21.4
- sad_1_c: 107.2
- sad_1_neon: 20.9
Benchmarks and tests are run with checkasm tool on AWS Graviton 3.
---
libavcodec/aarch64/me_cmp_init_aarch64.c | 4 ++
libavcodec/aarch64/me_cmp_neon.S | 53 ++++++++++++++++++++++++
2 files changed, 57 insertions(+)
diff --git a/libavcodec/aarch64/me_cmp_init_aarch64.c b/libavcodec/aarch64/me_cmp_init_aarch64.c
index 89c817990c..7d7dc38754 100644
--- a/libavcodec/aarch64/me_cmp_init_aarch64.c
+++ b/libavcodec/aarch64/me_cmp_init_aarch64.c
@@ -31,6 +31,8 @@ int ff_pix_abs16_x2_neon(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
ptrdiff_t stride, int h);
int ff_pix_abs16_y2_neon(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
ptrdiff_t stride, int h);
+int ff_pix_abs8_neon(MpegEncContext *s, uint8_t *blk1, uint8_t *blk2,
+ ptrdiff_t stride, int h);
int sse16_neon(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
ptrdiff_t stride, int h);
@@ -48,8 +50,10 @@ av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
c->pix_abs[0][1] = ff_pix_abs16_x2_neon;
c->pix_abs[0][2] = ff_pix_abs16_y2_neon;
c->pix_abs[0][3] = ff_pix_abs16_xy2_neon;
+ c->pix_abs[1][0] = ff_pix_abs8_neon;
c->sad[0] = ff_pix_abs16_neon;
+ c->sad[1] = ff_pix_abs8_neon;
c->sse[0] = sse16_neon;
c->sse[1] = sse8_neon;
c->sse[2] = sse4_neon;
diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S
index c78e26df4b..383459d209 100644
--- a/libavcodec/aarch64/me_cmp_neon.S
+++ b/libavcodec/aarch64/me_cmp_neon.S
@@ -72,6 +72,59 @@ function ff_pix_abs16_neon, export=1
ret
endfunc
+function ff_pix_abs8_neon, export=1
+ // x0 unused
+ // x1 uint8_t *pix1
+ // x2 uint8_t *pix2
+ // x3 ptrdiff_t stride
+ // x4 int h
+
+ movi d18, #0
+ cmp w4, #4
+ b.lt 2f
+
+// make 4 iterations at once
+1:
+ ld1 {v0.8b}, [x1], x3
+ ld1 {v1.8b}, [x2], x3
+ uabdl v30.8h, v0.8b, v1.8b
+ ld1 {v2.8b}, [x1], x3
+ ld1 {v3.8b}, [x2], x3
+ uabal v30.8h, v2.8b, v3.8b
+ ld1 {v4.8b}, [x1], x3
+ ld1 {v5.8b}, [x2], x3
+ uabal v30.8h, v4.8b, v5.8b
+ ld1 {v6.8b}, [x1], x3
+ ld1 {v7.8b}, [x2], x3
+ uabal v30.8h, v6.8b, v7.8b
+
+ sub w4, w4, #4
+ uaddlv s20, v30.8h
+ cmp w4, #4
+ add d18, d18, d20
+ b.ge 1b
+ cbnz w4, 2f
+ fmov w0, s18
+
+ ret
+
+// iterate by one
+2:
+ ld1 {v0.8b}, [x1], x3
+ ld1 {v1.8b}, [x2], x3
+
+ uabdl v16.8h, v0.8b, v1.8b
+
+ uaddlv s17, v16.8h
+ add d18, d18, d17
+ subs w4, w4, #1
+ b.ne 2b
+ fmov w0, s18
+
+ ret
+
+endfunc
+
function ff_pix_abs16_xy2_neon, export=1
// x0 unused
// x1 uint8_t *pix1
--
2.34.1
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 25+ messages in thread
* Re: [FFmpeg-devel] [PATCH 2/5] lavc/aarch64: Add neon implementation for sse4
2022-07-15 8:02 ` [FFmpeg-devel] [PATCH 2/5] lavc/aarch64: Add neon implementation for sse4 Hubert Mazur
@ 2022-07-21 21:43 ` Martin Storsjö
2022-07-22 21:30 ` Swinney, Jonathan
2022-07-25 11:15 ` [FFmpeg-devel] [PATCH] " Hubert Mazur
2 siblings, 0 replies; 25+ messages in thread
From: Martin Storsjö @ 2022-07-21 21:43 UTC (permalink / raw)
To: Hubert Mazur; +Cc: gjb, upstream, jswinney, ffmpeg-devel, mw, spop
On Fri, 15 Jul 2022, Hubert Mazur wrote:
> Provide neon implementation for sse4 function.
>
> Performance comparison tests are shown below.
> - sse_2_c: 74.0
> - sse_2_neon: 24.0
>
> Benchmarks and tests are run with checkasm tool on AWS Graviton 3.
>
> Signed-off-by: Hubert Mazur <hum@semihalf.com>
> ---
> libavcodec/aarch64/me_cmp_init_aarch64.c | 3 ++
> libavcodec/aarch64/me_cmp_neon.S | 65 ++++++++++++++++++++++++
> 2 files changed, 68 insertions(+)
>
> diff --git a/libavcodec/aarch64/me_cmp_init_aarch64.c b/libavcodec/aarch64/me_cmp_init_aarch64.c
> index 3ff5767bd0..72a2062e7e 100644
> --- a/libavcodec/aarch64/me_cmp_init_aarch64.c
> +++ b/libavcodec/aarch64/me_cmp_init_aarch64.c
> @@ -32,6 +32,8 @@ int ff_pix_abs16_x2_neon(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
>
> int sse16_neon(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
> ptrdiff_t stride, int h);
> +int sse4_neon(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
> + ptrdiff_t stride, int h);
>
> av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
> {
> @@ -44,5 +46,6 @@ av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
>
> c->sad[0] = ff_pix_abs16_neon;
> c->sse[0] = sse16_neon;
> + c->sse[2] = sse4_neon;
> }
> }
> diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S
> index 88cd335443..bacf151314 100644
> --- a/libavcodec/aarch64/me_cmp_neon.S
> +++ b/libavcodec/aarch64/me_cmp_neon.S
> @@ -360,3 +360,68 @@ function sse16_neon, export=1
> ret
>
> endfunc
> +
> +function sse4_neon, export=1
> + // x0 - unused
> + // x1 - pix1
> + // x2 - pix2
> + // x3 - stride
> + // w4 - h
> +
> + movi d18, #0
> + movi d17, #0
> + cmp w4, #4
> + b.le 2f
> +
> +// make 4 iterations at once
> +1:
> +
> + // res = abs(pix1[0] - pix2[0])
> + // res * res
> +
> + ld1 {v0.4b}, [x1], x3
This fails to assemble for me with essentially all tools I have (old
binutils, moderately recent binutils, current llvm, MS armasm64.exe):
src/libavcodec/aarch64/me_cmp_neon.S:374: Error: operand mismatch -- `ld1
{v0.4b},[x1],x3'
src/libavcodec/aarch64/me_cmp_neon.S:374: Info: did you mean this?
src/libavcodec/aarch64/me_cmp_neon.S:374: Info: ld1 {v0.8b}, [x1], x3
src/libavcodec/aarch64/me_cmp_neon.S:374: Info: other valid variant(s):
src/libavcodec/aarch64/me_cmp_neon.S:374: Info: ld1 {v0.16b}, [x1], x3
src/libavcodec/aarch64/me_cmp_neon.S:374: Info: ld1 {v0.4h}, [x1], x3
src/libavcodec/aarch64/me_cmp_neon.S:374: Info: ld1 {v0.8h}, [x1], x3
src/libavcodec/aarch64/me_cmp_neon.S:374: Info: ld1 {v0.2s}, [x1], x3
src/libavcodec/aarch64/me_cmp_neon.S:374: Info: ld1 {v0.4s}, [x1], x3
src/libavcodec/aarch64/me_cmp_neon.S:374: Info: ld1 {v0.1d}, [x1], x3
src/libavcodec/aarch64/me_cmp_neon.S:374: Info: ld1 {v0.2d}, [x1], x3
I'll follow up with an actual review of the patches later. I'm sorry I
have a bit longer review latency than usual at the moment, as I'm on
vacation.
// Martin
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 25+ messages in thread
* Re: [FFmpeg-devel] [PATCH 2/5] lavc/aarch64: Add neon implementation for sse4
2022-07-15 8:02 ` [FFmpeg-devel] [PATCH 2/5] lavc/aarch64: Add neon implementation for sse4 Hubert Mazur
2022-07-21 21:43 ` Martin Storsjö
@ 2022-07-22 21:30 ` Swinney, Jonathan
2022-07-25 11:15 ` [FFmpeg-devel] [PATCH] " Hubert Mazur
2 siblings, 0 replies; 25+ messages in thread
From: Swinney, Jonathan @ 2022-07-22 21:30 UTC (permalink / raw)
To: Hubert Mazur, ffmpeg-devel; +Cc: martin, mw, upstream, Pop, Sebastian, gjb
As Martin noted, this patch doesn't build. But other than, that, it would be nice if there were comments on each line at least making some note about which of the 4 iterations each instruction calculates. That would make it a little bit easier to read, in my opinion, since the instructions are manually reordered.
Thanks,
--
Jonathan Swinney
On 7/15/22, 3:03 AM, "Hubert Mazur" <hum@semihalf.com> wrote:
CAUTION: This email originated from outside of the organization. Do not click links or open attachments unless you can confirm the sender and know the content is safe.
Provide neon implementation for sse4 function.
Performance comparison tests are shown below.
- sse_2_c: 74.0
- sse_2_neon: 24.0
Benchmarks and tests are run with checkasm tool on AWS Graviton 3.
Signed-off-by: Hubert Mazur <hum@semihalf.com>
---
libavcodec/aarch64/me_cmp_init_aarch64.c | 3 ++
libavcodec/aarch64/me_cmp_neon.S | 65 ++++++++++++++++++++++++
2 files changed, 68 insertions(+)
diff --git a/libavcodec/aarch64/me_cmp_init_aarch64.c b/libavcodec/aarch64/me_cmp_init_aarch64.c
index 3ff5767bd0..72a2062e7e 100644
--- a/libavcodec/aarch64/me_cmp_init_aarch64.c
+++ b/libavcodec/aarch64/me_cmp_init_aarch64.c
@@ -32,6 +32,8 @@ int ff_pix_abs16_x2_neon(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
int sse16_neon(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
ptrdiff_t stride, int h);
+int sse4_neon(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+ ptrdiff_t stride, int h);
av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
{
@@ -44,5 +46,6 @@ av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
c->sad[0] = ff_pix_abs16_neon;
c->sse[0] = sse16_neon;
+ c->sse[2] = sse4_neon;
}
}
diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S
index 88cd335443..bacf151314 100644
--- a/libavcodec/aarch64/me_cmp_neon.S
+++ b/libavcodec/aarch64/me_cmp_neon.S
@@ -360,3 +360,68 @@ function sse16_neon, export=1
ret
endfunc
+
+function sse4_neon, export=1
+ // x0 - unused
+ // x1 - pix1
+ // x2 - pix2
+ // x3 - stride
+ // w4 - h
+
+ movi d18, #0
+ movi d17, #0
+ cmp w4, #4
+ b.le 2f
+
+// make 4 iterations at once
+1:
+
+ // res = abs(pix1[0] - pix2[0])
+ // res * res
+
+ ld1 {v0.4b}, [x1], x3
+ ld1 {v1.4b}, [x2], x3
+ uabdl v30.8h, v0.4b, v1.4b
+ ld1 {v2.4b}, [x1], x3
+ ld1 {v3.4b}, [x2], x3
+ umull v16.4s, v30.4h, v30.4h
+ uabdl v29.8h, v2.4b, v3.4b
+ ld1 {v4.4b}, [x1], x3
+ ld1 {v5.4b}, [x2], x3
+ umlal v16.4s, v29.4h, v29.4h
+ uabdl v28.8h, v4.4b, v5.4b
+ ld1 {v6.4b}, [x1], x3
+ ld1 {v7.4b}, [x2], x3
+ umlal v16.4s, v28.4h, v28.4h
+ uabdl v27.8h, v6.4b, v7.4b
+ umlal v16.4s, v27.4h, v27.4h
+
+ uaddlv d17, v16.4s
+ add d18, d18, d17
+
+ sub w4, w4, #4
+ cmp w4, #4
+ b.ge 1b
+
+ cbnz w4, 2f
+ fmov w0, s18
+
+ ret
+
+// iterate by one
+2:
+ ld1 {v0.4b}, [x1], x3
+ ld1 {v1.4b}, [x2], x3
+ uabdl v30.8h, v0.4b, v1.4b
+ umull v16.4s, v30.4h, v30.4h
+
+ uaddlv d17, v16.4s
+ add d18, d18, d17
+
+ subs w4, w4, #1
+ b.ne 2b
+ fmov w0, s18
+
+ ret
+
+endfunc
--
2.34.1
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 25+ messages in thread
* [FFmpeg-devel] [PATCH] lavc/aarch64: Add neon implementation for sse16
2022-07-15 8:02 ` [FFmpeg-devel] [PATCH 1/5] lavc/aarch64: Add neon implementation for sse16 Hubert Mazur
@ 2022-07-25 11:12 ` Hubert Mazur
2022-08-03 13:22 ` Martin Storsjö
2022-08-04 7:46 ` Martin Storsjö
0 siblings, 2 replies; 25+ messages in thread
From: Hubert Mazur @ 2022-07-25 11:12 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: gjb, upstream, jswinney, Hubert Mazur, martin, mw, spop
Provide neon implementation for sse16 function.
Performance comparison tests are shown below.
- sse_0_c: 273.0
- sse_0_neon: 48.2
Benchmarks and tests run with checkasm tool on AWS Graviton 3.
Signed-off-by: Hubert Mazur <hum@semihalf.com>
---
libavcodec/aarch64/me_cmp_init_aarch64.c | 4 ++
libavcodec/aarch64/me_cmp_neon.S | 82 ++++++++++++++++++++++++
2 files changed, 86 insertions(+)
diff --git a/libavcodec/aarch64/me_cmp_init_aarch64.c b/libavcodec/aarch64/me_cmp_init_aarch64.c
index 136b008eb7..3ff5767bd0 100644
--- a/libavcodec/aarch64/me_cmp_init_aarch64.c
+++ b/libavcodec/aarch64/me_cmp_init_aarch64.c
@@ -30,6 +30,9 @@ int ff_pix_abs16_xy2_neon(MpegEncContext *s, uint8_t *blk1, uint8_t *blk2,
int ff_pix_abs16_x2_neon(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
ptrdiff_t stride, int h);
+int sse16_neon(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+ ptrdiff_t stride, int h);
+
av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
{
int cpu_flags = av_get_cpu_flags();
@@ -40,5 +43,6 @@ av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
c->pix_abs[0][3] = ff_pix_abs16_xy2_neon;
c->sad[0] = ff_pix_abs16_neon;
+ c->sse[0] = sse16_neon;
}
}
diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S
index cda7ce0408..98c912b608 100644
--- a/libavcodec/aarch64/me_cmp_neon.S
+++ b/libavcodec/aarch64/me_cmp_neon.S
@@ -270,3 +270,85 @@ function ff_pix_abs16_x2_neon, export=1
ret
endfunc
+
+function sse16_neon, export=1
+ // x0 - unused
+ // x1 - pix1
+ // x2 - pix2
+ // x3 - stride
+ // w4 - h
+
+ cmp w4, #4
+ movi d18, #0
+ b.lt 2f
+
+// Make 4 iterations at once
+1:
+
+ // res = abs(pix1[0] - pix2[0])
+ // res * res
+
+ ld1 {v0.16b}, [x1], x3 // Load pix1 vector for first iteration
+ ld1 {v1.16b}, [x2], x3 // Load pix2 vector for first iteration
+ uabd v30.16b, v0.16b, v1.16b // Absolute difference, first iteration
+ ld1 {v2.16b}, [x1], x3 // Load pix1 vector for second iteration
+ umull v29.8h, v30.8b, v30.8b // Multiply lower half of vectors, first iteration
+ ld1 {v3.16b}, [x2], x3 // Load pix2 vector for second iteration
+ umull2 v28.8h, v30.16b, v30.16b // Multiply upper half of vectors, first iteration
+ uabd v27.16b, v2.16b, v3.16b // Absolute difference, second iteration
+ uaddlp v17.4s, v29.8h // Pairwise add, first iteration
+ umull v26.8h, v27.8b, v27.8b // Mulitply lower half, second iteration
+ umull2 v25.8h, v27.16b, v27.16b // Multiply upper half, second iteration
+ ld1 {v4.16b}, [x1], x3 // Load pix1 for third iteration
+ uadalp v17.4s, v26.8h // Pairwise add and accumulate, second iteration
+ ld1 {v5.16b}, [x2], x3 // Load pix2 for third iteration
+ uadalp v17.4s, v25.8h // Pairwise add andd accumulate, second iteration
+ uabd v24.16b, v4.16b, v5.16b // Absolute difference, third iteration
+ ld1 {v6.16b}, [x1], x3 // Load pix1 for fourth iteration
+ umull v23.8h, v24.8b, v24.8b // Multiply lower half, third iteration
+ umull2 v22.8h, v24.16b, v24.16b // Multiply upper half, third iteration
+ uadalp v17.4s, v23.8h // Pairwise add and accumulate, third iteration
+ uadalp v17.4s, v22.8h // Pairwise add and accumulate, third iteration
+ ld1 {v7.16b}, [x2], x3 // Load pix2 for fouth iteration
+ uadalp v17.4s, v28.8h // Pairwise add and accumulate, first iteration
+ uabd v21.16b, v6.16b, v7.16b // Absolute difference, fourth iteration
+ umull v20.8h, v21.8b, v21.8b // Multiply lower half, fourth iteration
+ uadalp v17.4s, v20.8h // Pairwise add and accumulate, fourth iteration
+ umull2 v19.8h, v21.16b, v21.16b // Multiply upper half, fourth iteration
+ uadalp v17.4s, v19.8h // Pairwise add and accumulate, fourth iteration
+
+ sub w4, w4, #4 // h -= 4
+ uaddlv d16, v17.4s // add up accumulator vector
+ cmp w4, #4
+ add d18, d18, d16
+
+ b.ge 1b
+
+ cbnz w4, 2f
+ fmov w0, s18
+
+ ret
+
+// iterate by one
+2:
+
+ ld1 {v0.16b}, [x1], x3 // Load pix1
+ ld1 {v1.16b}, [x2], x3 // Load pix2
+
+ uabd v30.16b, v0.16b, v1.16b
+ umull v29.8h, v0.8b, v1.8b
+ umull2 v28.8h, v0.16b, v1.16b
+ uaddlp v17.4s, v29.8h
+ uadalp v17.4s, v28.8h
+
+
+ subs w4, w4, #1
+ uaddlv d16, v17.4s
+ add d18, d18, d16
+
+ b.ne 2b
+ fmov w0, s18
+
+ ret
+
+endfunc
--
2.34.1
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 25+ messages in thread
* [FFmpeg-devel] [PATCH] lavc/aarch64: Add neon implementation for sse4
2022-07-15 8:02 ` [FFmpeg-devel] [PATCH 2/5] lavc/aarch64: Add neon implementation for sse4 Hubert Mazur
2022-07-21 21:43 ` Martin Storsjö
2022-07-22 21:30 ` Swinney, Jonathan
@ 2022-07-25 11:15 ` Hubert Mazur
2022-07-28 18:50 ` Swinney, Jonathan
2022-08-04 8:00 ` Martin Storsjö
2 siblings, 2 replies; 25+ messages in thread
From: Hubert Mazur @ 2022-07-25 11:15 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: gjb, upstream, jswinney, Hubert Mazur, martin, mw, spop
Provide neon implementation for sse4 function.
Performance comparison tests are shown below.
- sse_2_c: 74.0
- sse_2_neon: 24.0
Benchmarks and tests are run with checkasm tool on AWS Graviton 3.
Signed-off-by: Hubert Mazur <hum@semihalf.com>
---
libavcodec/aarch64/me_cmp_init_aarch64.c | 3 ++
libavcodec/aarch64/me_cmp_neon.S | 65 ++++++++++++++++++++++++
2 files changed, 68 insertions(+)
diff --git a/libavcodec/aarch64/me_cmp_init_aarch64.c b/libavcodec/aarch64/me_cmp_init_aarch64.c
index 3ff5767bd0..72a2062e7e 100644
--- a/libavcodec/aarch64/me_cmp_init_aarch64.c
+++ b/libavcodec/aarch64/me_cmp_init_aarch64.c
@@ -32,6 +32,8 @@ int ff_pix_abs16_x2_neon(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
int sse16_neon(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
ptrdiff_t stride, int h);
+int sse4_neon(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+ ptrdiff_t stride, int h);
av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
{
@@ -44,5 +46,6 @@ av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
c->sad[0] = ff_pix_abs16_neon;
c->sse[0] = sse16_neon;
+ c->sse[2] = sse4_neon;
}
}
diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S
index 98c912b608..3336d88848 100644
--- a/libavcodec/aarch64/me_cmp_neon.S
+++ b/libavcodec/aarch64/me_cmp_neon.S
@@ -352,3 +352,68 @@ function sse16_neon, export=1
ret
endfunc
+
+function sse4_neon, export=1
+ // x0 - unused
+ // x1 - pix1
+ // x2 - pix2
+ // x3 - stride
+ // w4 - h
+
+ movi d18, #0
+ movi d17, #0
+ cmp w4, #4
+ b.le 2f
+
+// make 4 iterations at once
+1:
+
+ // res = abs(pix1[0] - pix2[0])
+ // res * res
+
+ ld1 {v0.s}[0], [x1], x3 // Load pix1, first iteration
+ ld1 {v1.s}[0], [x2], x3 // Load pix2, first iteration
+ uabdl v30.8h, v0.8b, v1.8b // Absolute difference, first iteration
+ ld1 {v2.s}[0], [x1], x3 // Load pix1, second iteration
+ ld1 {v3.s}[0], [x2], x3 // Load pix2, second iteration
+ umull v16.4s, v30.4h, v30.4h // Multiply vectors, first iteration
+ uabdl v29.8h, v2.8b, v3.8b // Absolute difference, second iteration
+ ld1 {v4.s}[0], [x1], x3 // Load pix1, third iteration
+ ld1 {v5.s}[0], [x2], x3 // Load pix2, third iteration
+ umlal v16.4s, v29.4h, v29.4h // Multiply and accumulate, second iteration
+ uabdl v28.8h, v4.8b, v5.8b // Absolute difference, third iteration
+ ld1 {v6.s}[0], [x1], x3 // Load pix1, fourth iteration
+ ld1 {v7.s}[0], [x2], x3 // Load pix2, fourth iteration
+ umlal v16.4s, v28.4h, v28.4h // Multiply and accumulate, third iteration
+ uabdl v27.8h, v6.8b, v7.8b // Absolue difference, fourth iteration
+ umlal v16.4s, v27.4h, v27.4h // Multiply and accumulate, fourth iteration
+
+ uaddlv d17, v16.4s // Add vector
+ add d18, d18, d17
+
+ sub w4, w4, #4
+ cmp w4, #4
+ b.ge 1b
+
+ cbnz w4, 2f
+ fmov w0, s18
+
+ ret
+
+// iterate by one
+2:
+ ld1 {v0.s}[0], [x1], x3 // Load pix1
+ ld1 {v1.s}[0], [x2], x3 // Load pix2
+ uabdl v30.8h, v0.8b, v1.8b
+ umull v16.4s, v30.4h, v30.4h
+
+ uaddlv d17, v16.4s
+ add d18, d18, d17
+
+ subs w4, w4, #1
+ b.ne 2b
+ fmov w0, s18
+
+ ret
+
+endfunc
--
2.34.1
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 25+ messages in thread
* [FFmpeg-devel] [PATCH] lavc/aarch64: Add neon implementation for pix_abs16_y2
2022-07-15 8:02 ` [FFmpeg-devel] [PATCH 3/5] lavc/aarch64: Add neon implementation for pix_abs16_y2 Hubert Mazur
@ 2022-07-25 11:17 ` Hubert Mazur
2022-08-04 8:08 ` Martin Storsjö
2022-08-04 8:12 ` Martin Storsjö
0 siblings, 2 replies; 25+ messages in thread
From: Hubert Mazur @ 2022-07-25 11:17 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: gjb, upstream, jswinney, Hubert Mazur, martin, mw, spop
Provide optimized implementation of pix_abs16_y2 function for arm64.
Performance comparison tests are shown below.
pix_abs_0_2_c: 308.5
pix_abs_0_2_neon: 39.2
Benchmarks and tests run with checkasm tool on AWS Graviton 3.
Signed-off-by: Hubert Mazur <hum@semihalf.com>
---
libavcodec/aarch64/me_cmp_init_aarch64.c | 3 +
libavcodec/aarch64/me_cmp_neon.S | 73 ++++++++++++++++++++++++
2 files changed, 76 insertions(+)
diff --git a/libavcodec/aarch64/me_cmp_init_aarch64.c b/libavcodec/aarch64/me_cmp_init_aarch64.c
index 72a2062e7e..07d62cc1e5 100644
--- a/libavcodec/aarch64/me_cmp_init_aarch64.c
+++ b/libavcodec/aarch64/me_cmp_init_aarch64.c
@@ -29,6 +29,8 @@ int ff_pix_abs16_xy2_neon(MpegEncContext *s, uint8_t *blk1, uint8_t *blk2,
ptrdiff_t stride, int h);
int ff_pix_abs16_x2_neon(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
ptrdiff_t stride, int h);
+int ff_pix_abs16_y2_neon(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+ ptrdiff_t stride, int h);
int sse16_neon(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
ptrdiff_t stride, int h);
@@ -42,6 +44,7 @@ av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
if (have_neon(cpu_flags)) {
c->pix_abs[0][0] = ff_pix_abs16_neon;
c->pix_abs[0][1] = ff_pix_abs16_x2_neon;
+ c->pix_abs[0][2] = ff_pix_abs16_y2_neon;
c->pix_abs[0][3] = ff_pix_abs16_xy2_neon;
c->sad[0] = ff_pix_abs16_neon;
diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S
index 3336d88848..6e392e9066 100644
--- a/libavcodec/aarch64/me_cmp_neon.S
+++ b/libavcodec/aarch64/me_cmp_neon.S
@@ -417,3 +417,76 @@ function sse4_neon, export=1
ret
endfunc
+
+function ff_pix_abs16_y2_neon, export=1
+ // x0 unused
+ // x1 uint8_t *pix1
+ // x2 uint8_t *pix2
+ // x3 ptrdiff_t stride
+ // x4 int h
+
+ // initialize buffers
+ movi d18, #0
+ add x5, x2, x3 // pix2 + stride
+ cmp w4, #4
+ b.lt 2f
+
+// make 4 iterations at once
+1:
+
+ // abs(pix1[0], avg2(pix2[0], pix2[0 + stride]))
+ // avg2(a, b) = (((a) + (b) + 1) >> 1)
+ // abs(x) = (x < 0 ? (-x) : (x))
+
+ ld1 {v1.16b}, [x2], x3 // Load pix2 for first iteration
+ ld1 {v2.16b}, [x5], x3 // Load pix3 for first iteration
+ urhadd v30.16b, v1.16b, v2.16b // Rounding halving add, first iteration
+ ld1 {v0.16b}, [x1], x3 // Load pix1 for first iteration
+ uabdl v29.8h, v0.8b, v30.8b // Absolute difference of lower half, first iteration
+ ld1 {v4.16b}, [x2], x3 // Load pix2 for second iteration
+ uabdl2 v28.8h, v0.16b, v30.16b // Absolute difference of upper half, first iteration
+ ld1 {v5.16b}, [x5], x3 // Load pix3 for second iteartion
+ ld1 {v3.16b}, [x1], x3 // Load pix1 for second iteration
+ urhadd v27.16b, v4.16b, v5.16b // Rounding halving add, second iteration
+ uabal v29.8h, v3.8b, v27.8b // Absolute difference of lower half for second iteration
+ ld1 {v7.16b}, [x2], x3 // Load pix2 for third iteration
+ uabal2 v28.8h, v3.16b, v27.16b // Absolute difference of upper half for second iteration
+ ld1 {v20.16b}, [x5], x3 // Load pix3 for third iteration
+ urhadd v26.16b, v7.16b, v20.16b // Rounding halving add, third iteration
+ ld1 {v6.16b}, [x1], x3 // Load pix1 for third iteration
+ uabal v29.8h, v6.8b, v26.8b // Absolute difference of lower half for third iteration
+ ld1 {v22.16b}, [x2], x3 // Load pix2 for fourth iteration
+ uabal2 v28.8h, v6.16b, v26.16b // Absolute difference of upper half for third iteration
+ ld1 {v23.16b}, [x5], x3 // Load pix3 for fourth iteration
+ urhadd v25.16b, v22.16b, v23.16b // Rounding halving add
+ ld1 {v21.16b}, [x1], x3 // Load pix1 for fourth iteration
+ uabal v29.8h, v21.8b, v25.8b // Absolute difference of lower half for fourth iteration
+ uabal2 v28.8h, v21.16b, v25.16b // Absolute difference of upper half for fourth iteration
+
+ add v29.8h, v29.8h, v28.8h // Add vectors together
+ sub w4, w4, #4 // h-= 4
+ uaddlv s16, v29.8h // Add up vector values
+ cmp w4, #4
+ add d18, d18, d16
+ b.ge 1b
+ cbz w4, 3f
+
+// iterate by one
+2:
+
+ ld1 {v1.16b}, [x2], x3 // Load pix2
+ ld1 {v2.16b}, [x5], x3 // Load pix3
+ urhadd v30.16b, v1.16b, v2.16b // Rounding halving add
+ ld1 {v0.16b}, [x1], x3 // Load pix1
+ uabd v30.16b, v30.16b, v30.16b
+
+ uaddlv h17, v30.16b
+ subs w4, w4, #1
+ add d18, d18, d17
+ b.ne 2b
+
+3:
+ fmov w0, s18
+
+ ret
+endfunc
--
2.34.1
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 25+ messages in thread
* [FFmpeg-devel] [PATCH] lavc/aarch64: Add neon implementation for sse8
2022-07-15 8:02 ` [FFmpeg-devel] [PATCH 4/5] lavc/aarch64: Add neon implementation for sse8 Hubert Mazur
@ 2022-07-25 11:18 ` Hubert Mazur
2022-08-04 8:04 ` Martin Storsjö
0 siblings, 1 reply; 25+ messages in thread
From: Hubert Mazur @ 2022-07-25 11:18 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: gjb, upstream, jswinney, Hubert Mazur, martin, mw, spop
Provide optimized implementation of sse8 function for arm64.
Performance comparison tests are shown below.
- sse_1_c: 133.0
- sse_1_neon: 36.7
Benchmarks and tests run with checkasm tool on AWS Graviton 3.
Signed-off-by: Hubert Mazur <hum@semihalf.com>
---
libavcodec/aarch64/me_cmp_init_aarch64.c | 3 +
libavcodec/aarch64/me_cmp_neon.S | 72 ++++++++++++++++++++++++
2 files changed, 75 insertions(+)
diff --git a/libavcodec/aarch64/me_cmp_init_aarch64.c b/libavcodec/aarch64/me_cmp_init_aarch64.c
index 07d62cc1e5..89c817990c 100644
--- a/libavcodec/aarch64/me_cmp_init_aarch64.c
+++ b/libavcodec/aarch64/me_cmp_init_aarch64.c
@@ -34,6 +34,8 @@ int ff_pix_abs16_y2_neon(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
int sse16_neon(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
ptrdiff_t stride, int h);
+int sse8_neon(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+ ptrdiff_t stride, int h);
int sse4_neon(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
ptrdiff_t stride, int h);
@@ -49,6 +51,7 @@ av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
c->sad[0] = ff_pix_abs16_neon;
c->sse[0] = sse16_neon;
+ c->sse[1] = sse8_neon;
c->sse[2] = sse4_neon;
}
}
diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S
index 6e392e9066..dcaffc9b73 100644
--- a/libavcodec/aarch64/me_cmp_neon.S
+++ b/libavcodec/aarch64/me_cmp_neon.S
@@ -353,6 +353,78 @@ function sse16_neon, export=1
endfunc
+function sse8_neon, export=1
+ // x0 - unused
+ // x1 - pix1
+ // x2 - pix2
+ // x3 - stride
+ // w4 - h
+
+ movi d18, #0
+ cmp w4, #4
+ b.le 2f
+
+// make 4 iterations at once
+1:
+
+ // res = abs(pix1[0] - pix2[0])
+ // res * res
+
+ ld1 {v0.8b}, [x1], x3 // Load pix1 for first iteration
+ ld1 {v1.8b}, [x2], x3 // Load pix2 for second iteration
+ uabdl v30.8h, v0.8b, v1.8b // Absolute difference, first iteration
+ umull v21.4s, v30.4h, v30.4h // Multiply lower half, first iteration
+ ld1 {v2.8b}, [x1], x3 // Load pix1 for second iteration
+ umull2 v20.4s, v30.8h, v30.8h // Multiply upper half, second iteration
+ ld1 {v3.8b}, [x2], x3 // Load pix2 for second iteration
+ uabdl v29.8h, v2.8b, v3.8b // Absolute difference, second iteration
+ ld1 {v4.8b}, [x1], x3 // Load pix1 for third iteration
+ umlal v21.4s, v29.4h, v29.4h // Multiply lower half, second iteration
+ ld1 {v5.8b}, [x2], x3 // Load pix2 for third iteration
+ umlal2 v20.4s, v29.8h, v29.8h // Multiply upper half, second iteration
+ uabdl v28.8h, v4.8b, v5.8b // Absolute difference, third iteration
+ ld1 {v6.8b}, [x1], x3 // Load pix1 for fourth iteration
+ umlal v21.4s, v28.4h, v28.4h // Multiply lower half, third iteration
+ ld1 {v7.8b}, [x2], x3 // Load pix2 for fourth iteration
+ umlal2 v20.4s, v28.8h, v28.8h // Multiply upper half, third iteration
+ uabdl v27.8h, v6.8b, v7.8b // Absolute difference, fourth iteration
+ umlal v21.4s, v27.4h, v27.4h // Multiply lower half, fourth iteration
+ umlal2 v20.4s, v27.8h, v27.8h // Multiply upper ha;f, fourth iteration
+
+ add v21.4s, v21.4s, v20.4s // Add accumulator vectors together
+ sub w4, w4, #4 // h -= 4
+ uaddlv d17, v21.4s // Add up vector
+ add d18, d18, d17
+ cmp w4, #4
+ b.ge 1b
+
+ cbnz w4, 2f
+ fmov w0, s18
+
+ ret
+
+// iterate by one
+2:
+ ld1 {v0.8b}, [x1], x3 // Load pix1
+ ld1 {v1.8b}, [x2], x3 // Load pix2
+
+ uabdl v30.8h, v0.8b, v1.8b
+ umull v21.4s, v30.4h, v30.4h
+ umull2 v20.4s, v30.8h, v30.8h
+
+ subs w4, w4, #1
+
+ uaddlv d17, v21.4s
+ add d18, d18, d17
+ uaddlv d17, v20.4S
+ add d18, d18, d17
+
+ b.ne 2b
+ fmov w0, s18
+
+ ret
+endfunc
+
function sse4_neon, export=1
// x0 - unused
// x1 - pix1
--
2.34.1
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 25+ messages in thread
* [FFmpeg-devel] [PATCH] lavc/aarch64: Add neon implementation for pix_abs8
2022-07-15 8:02 ` [FFmpeg-devel] [PATCH 5/5] lavc/aarch64: Add neon implementation for pix_abs8 Hubert Mazur
@ 2022-07-25 11:21 ` Hubert Mazur
2022-08-04 8:10 ` Martin Storsjö
0 siblings, 1 reply; 25+ messages in thread
From: Hubert Mazur @ 2022-07-25 11:21 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: gjb, upstream, jswinney, Hubert Mazur, martin, mw, spop
Provide optimized implementation of pix_abs8 function for arm64.
Performance comparison tests are shown below.
- pix_abs_1_0_c: 105.2
- pix_abs_1_0_neon: 21.4
- sad_1_c: 107.2
- sad_1_neon: 20.9
Benchmarks and tests are run with checkasm tool on AWS Graviton 3.
---
libavcodec/aarch64/me_cmp_init_aarch64.c | 4 ++
libavcodec/aarch64/me_cmp_neon.S | 53 ++++++++++++++++++++++++
2 files changed, 57 insertions(+)
diff --git a/libavcodec/aarch64/me_cmp_init_aarch64.c b/libavcodec/aarch64/me_cmp_init_aarch64.c
index 89c817990c..7d7dc38754 100644
--- a/libavcodec/aarch64/me_cmp_init_aarch64.c
+++ b/libavcodec/aarch64/me_cmp_init_aarch64.c
@@ -31,6 +31,8 @@ int ff_pix_abs16_x2_neon(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
ptrdiff_t stride, int h);
int ff_pix_abs16_y2_neon(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
ptrdiff_t stride, int h);
+int ff_pix_abs8_neon(MpegEncContext *s, uint8_t *blk1, uint8_t *blk2,
+ ptrdiff_t stride, int h);
int sse16_neon(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
ptrdiff_t stride, int h);
@@ -48,8 +50,10 @@ av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
c->pix_abs[0][1] = ff_pix_abs16_x2_neon;
c->pix_abs[0][2] = ff_pix_abs16_y2_neon;
c->pix_abs[0][3] = ff_pix_abs16_xy2_neon;
+ c->pix_abs[1][0] = ff_pix_abs8_neon;
c->sad[0] = ff_pix_abs16_neon;
+ c->sad[1] = ff_pix_abs8_neon;
c->sse[0] = sse16_neon;
c->sse[1] = sse8_neon;
c->sse[2] = sse4_neon;
diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S
index dcaffc9b73..f2dd63ced1 100644
--- a/libavcodec/aarch64/me_cmp_neon.S
+++ b/libavcodec/aarch64/me_cmp_neon.S
@@ -72,6 +72,59 @@ function ff_pix_abs16_neon, export=1
ret
endfunc
+function ff_pix_abs8_neon, export=1
+ // x0 unused
+ // x1 uint8_t *pix1
+ // x2 uint8_t *pix2
+ // x3 ptrdiff_t stride
+ // x4 int h
+
+ movi d18, #0
+ cmp w4, #4
+ b.lt 2f
+
+// make 4 iterations at once
+1:
+ ld1 {v0.8b}, [x1], x3 // Load pix1 for first iteration
+ ld1 {v1.8b}, [x2], x3 // Load pix2 for first iteration
+ uabdl v30.8h, v0.8b, v1.8b // Absolute difference, first iteration
+ ld1 {v2.8b}, [x1], x3 // Load pix1 for second iteration
+ ld1 {v3.8b}, [x2], x3 // Load pix2 for second iteration
+ uabal v30.8h, v2.8b, v3.8b // Absolute difference, second iteration
+ ld1 {v4.8b}, [x1], x3 // Load pix1 for third iteration
+ ld1 {v5.8b}, [x2], x3 // Load pix2 for third iteration
+ uabal v30.8h, v4.8b, v5.8b // Absolute difference, third iteration
+ ld1 {v6.8b}, [x1], x3 // Load pix1 for foruth iteration
+ ld1 {v7.8b}, [x2], x3 // Load pix2 for fourth iteration
+ uabal v30.8h, v6.8b, v7.8b // Absolute difference, foruth iteration
+
+ sub w4, w4, #4 // h -= 4
+ uaddlv s20, v30.8h // Add up vector
+ cmp w4, #4
+ add d18, d18, d20
+ b.ge 1b
+ cbnz w4, 2f
+ fmov w0, s18
+
+ ret
+
+// iterate by one
+2:
+ ld1 {v0.8b}, [x1], x3 // Load pix1
+ ld1 {v1.8b}, [x2], x3 // Load pix2
+
+ uabdl v16.8h, v0.8b, v1.8b
+
+ uaddlv s17, v16.8h
+ add d18, d18, d17
+ subs w4, w4, #1
+ b.ne 2b
+ fmov w0, s18
+
+ ret
+
+endfunc
+
function ff_pix_abs16_xy2_neon, export=1
// x0 unused
// x1 uint8_t *pix1
--
2.34.1
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 25+ messages in thread
* Re: [FFmpeg-devel] [PATCH] lavc/aarch64: Add neon implementation for sse4
2022-07-25 11:15 ` [FFmpeg-devel] [PATCH] " Hubert Mazur
@ 2022-07-28 18:50 ` Swinney, Jonathan
2022-07-28 18:51 ` Swinney, Jonathan
2022-08-04 8:00 ` Martin Storsjö
1 sibling, 1 reply; 25+ messages in thread
From: Swinney, Jonathan @ 2022-07-28 18:50 UTC (permalink / raw)
To: Hubert Mazur, ffmpeg-devel; +Cc: martin, mw, upstream, Pop, Sebastian, gjb
Your latest set of patches didn’t get interpreted correctly by the patchwork tool. I suspect it took them in the wrong order.
https://patchwork.ffmpeg.org/project/ffmpeg/list/?series=&submitter=1479&state=&q=&archive=&delegate=
There is one more place to move the sub, cmp and branch instructions apart in sse16_neon. It doesn't seem to make any difference to Neoverse N1 and V1 and it may help A53.
I didn't see anything else.
Thanks!
--
Jonathan Swinney
On 7/25/22, 6:16 AM, "Hubert Mazur" <hum@semihalf.com> wrote:
CAUTION: This email originated from outside of the organization. Do not click links or open attachments unless you can confirm the sender and know the content is safe.
Provide neon implementation for sse4 function.
Performance comparison tests are shown below.
- sse_2_c: 74.0
- sse_2_neon: 24.0
Benchmarks and tests are run with checkasm tool on AWS Graviton 3.
Signed-off-by: Hubert Mazur <hum@semihalf.com>
---
libavcodec/aarch64/me_cmp_init_aarch64.c | 3 ++
libavcodec/aarch64/me_cmp_neon.S | 65 ++++++++++++++++++++++++
2 files changed, 68 insertions(+)
diff --git a/libavcodec/aarch64/me_cmp_init_aarch64.c b/libavcodec/aarch64/me_cmp_init_aarch64.c
index 3ff5767bd0..72a2062e7e 100644
--- a/libavcodec/aarch64/me_cmp_init_aarch64.c
+++ b/libavcodec/aarch64/me_cmp_init_aarch64.c
@@ -32,6 +32,8 @@ int ff_pix_abs16_x2_neon(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
int sse16_neon(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
ptrdiff_t stride, int h);
+int sse4_neon(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+ ptrdiff_t stride, int h);
av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
{
@@ -44,5 +46,6 @@ av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
c->sad[0] = ff_pix_abs16_neon;
c->sse[0] = sse16_neon;
+ c->sse[2] = sse4_neon;
}
}
diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S
index 98c912b608..3336d88848 100644
--- a/libavcodec/aarch64/me_cmp_neon.S
+++ b/libavcodec/aarch64/me_cmp_neon.S
@@ -352,3 +352,68 @@ function sse16_neon, export=1
ret
endfunc
+
+function sse4_neon, export=1
+ // x0 - unused
+ // x1 - pix1
+ // x2 - pix2
+ // x3 - stride
+ // w4 - h
+
+ movi d18, #0
+ movi d17, #0
+ cmp w4, #4
+ b.le 2f
+
+// make 4 iterations at once
+1:
+
+ // res = abs(pix1[0] - pix2[0])
+ // res * res
+
+ ld1 {v0.s}[0], [x1], x3 // Load pix1, first iteration
+ ld1 {v1.s}[0], [x2], x3 // Load pix2, first iteration
+ uabdl v30.8h, v0.8b, v1.8b // Absolute difference, first iteration
+ ld1 {v2.s}[0], [x1], x3 // Load pix1, second iteration
+ ld1 {v3.s}[0], [x2], x3 // Load pix2, second iteration
+ umull v16.4s, v30.4h, v30.4h // Multiply vectors, first iteration
+ uabdl v29.8h, v2.8b, v3.8b // Absolute difference, second iteration
+ ld1 {v4.s}[0], [x1], x3 // Load pix1, third iteration
+ ld1 {v5.s}[0], [x2], x3 // Load pix2, third iteration
+ umlal v16.4s, v29.4h, v29.4h // Multiply and accumulate, second iteration
+ uabdl v28.8h, v4.8b, v5.8b // Absolute difference, third iteration
+ ld1 {v6.s}[0], [x1], x3 // Load pix1, fourth iteration
+ ld1 {v7.s}[0], [x2], x3 // Load pix2, fourth iteration
+ umlal v16.4s, v28.4h, v28.4h // Multiply and accumulate, third iteration
+ uabdl v27.8h, v6.8b, v7.8b // Absolue difference, fourth iteration
+ umlal v16.4s, v27.4h, v27.4h // Multiply and accumulate, fourth iteration
+
+ uaddlv d17, v16.4s // Add vector
+ add d18, d18, d17
+
+ sub w4, w4, #4
+ cmp w4, #4
+ b.ge 1b
+
+ cbnz w4, 2f
+ fmov w0, s18
+
+ ret
+
+// iterate by one
+2:
+ ld1 {v0.s}[0], [x1], x3 // Load pix1
+ ld1 {v1.s}[0], [x2], x3 // Load pix2
+ uabdl v30.8h, v0.8b, v1.8b
+ umull v16.4s, v30.4h, v30.4h
+
+ uaddlv d17, v16.4s
+ add d18, d18, d17
+
+ subs w4, w4, #1
+ b.ne 2b
+ fmov w0, s18
+
+ ret
+
+endfunc
--
2.34.1
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 25+ messages in thread
* Re: [FFmpeg-devel] [PATCH] lavc/aarch64: Add neon implementation for sse4
2022-07-28 18:50 ` Swinney, Jonathan
@ 2022-07-28 18:51 ` Swinney, Jonathan
2022-07-29 7:26 ` Hubert Mazur
0 siblings, 1 reply; 25+ messages in thread
From: Swinney, Jonathan @ 2022-07-28 18:51 UTC (permalink / raw)
To: Hubert Mazur, ffmpeg-devel; +Cc: martin, mw, upstream, Pop, Sebastian, gjb
> There is one more place to move the sub, cmp and branch instructions apart in sse16_neon. It doesn't seem to make any difference to Neoverse N1 and V1 and it may help A53.
Sorry-- I meant sse4_neon.
--
Jonathan Swinney
On 7/28/22, 1:50 PM, "Swinney, Jonathan" <jswinney@amazon.com> wrote:
Your latest set of patches didn’t get interpreted correctly by the patchwork tool. I suspect it took them in the wrong order.
https://patchwork.ffmpeg.org/project/ffmpeg/list/?series=&submitter=1479&state=&q=&archive=&delegate=
There is one more place to move the sub, cmp and branch instructions apart in sse16_neon. It doesn't seem to make any difference to Neoverse N1 and V1 and it may help A53.
I didn't see anything else.
Thanks!
--
Jonathan Swinney
On 7/25/22, 6:16 AM, "Hubert Mazur" <hum@semihalf.com> wrote:
CAUTION: This email originated from outside of the organization. Do not click links or open attachments unless you can confirm the sender and know the content is safe.
Provide neon implementation for sse4 function.
Performance comparison tests are shown below.
- sse_2_c: 74.0
- sse_2_neon: 24.0
Benchmarks and tests are run with checkasm tool on AWS Graviton 3.
Signed-off-by: Hubert Mazur <hum@semihalf.com>
---
libavcodec/aarch64/me_cmp_init_aarch64.c | 3 ++
libavcodec/aarch64/me_cmp_neon.S | 65 ++++++++++++++++++++++++
2 files changed, 68 insertions(+)
diff --git a/libavcodec/aarch64/me_cmp_init_aarch64.c b/libavcodec/aarch64/me_cmp_init_aarch64.c
index 3ff5767bd0..72a2062e7e 100644
--- a/libavcodec/aarch64/me_cmp_init_aarch64.c
+++ b/libavcodec/aarch64/me_cmp_init_aarch64.c
@@ -32,6 +32,8 @@ int ff_pix_abs16_x2_neon(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
int sse16_neon(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
ptrdiff_t stride, int h);
+int sse4_neon(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+ ptrdiff_t stride, int h);
av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
{
@@ -44,5 +46,6 @@ av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
c->sad[0] = ff_pix_abs16_neon;
c->sse[0] = sse16_neon;
+ c->sse[2] = sse4_neon;
}
}
diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S
index 98c912b608..3336d88848 100644
--- a/libavcodec/aarch64/me_cmp_neon.S
+++ b/libavcodec/aarch64/me_cmp_neon.S
@@ -352,3 +352,68 @@ function sse16_neon, export=1
ret
endfunc
+
+function sse4_neon, export=1
+ // x0 - unused
+ // x1 - pix1
+ // x2 - pix2
+ // x3 - stride
+ // w4 - h
+
+ movi d18, #0
+ movi d17, #0
+ cmp w4, #4
+ b.le 2f
+
+// make 4 iterations at once
+1:
+
+ // res = abs(pix1[0] - pix2[0])
+ // res * res
+
+ ld1 {v0.s}[0], [x1], x3 // Load pix1, first iteration
+ ld1 {v1.s}[0], [x2], x3 // Load pix2, first iteration
+ uabdl v30.8h, v0.8b, v1.8b // Absolute difference, first iteration
+ ld1 {v2.s}[0], [x1], x3 // Load pix1, second iteration
+ ld1 {v3.s}[0], [x2], x3 // Load pix2, second iteration
+ umull v16.4s, v30.4h, v30.4h // Multiply vectors, first iteration
+ uabdl v29.8h, v2.8b, v3.8b // Absolute difference, second iteration
+ ld1 {v4.s}[0], [x1], x3 // Load pix1, third iteration
+ ld1 {v5.s}[0], [x2], x3 // Load pix2, third iteration
+ umlal v16.4s, v29.4h, v29.4h // Multiply and accumulate, second iteration
+ uabdl v28.8h, v4.8b, v5.8b // Absolute difference, third iteration
+ ld1 {v6.s}[0], [x1], x3 // Load pix1, fourth iteration
+ ld1 {v7.s}[0], [x2], x3 // Load pix2, fourth iteration
+ umlal v16.4s, v28.4h, v28.4h // Multiply and accumulate, third iteration
+ uabdl v27.8h, v6.8b, v7.8b // Absolue difference, fourth iteration
+ umlal v16.4s, v27.4h, v27.4h // Multiply and accumulate, fourth iteration
+
+ uaddlv d17, v16.4s // Add vector
+ add d18, d18, d17
+
+ sub w4, w4, #4
+ cmp w4, #4
+ b.ge 1b
+
+ cbnz w4, 2f
+ fmov w0, s18
+
+ ret
+
+// iterate by one
+2:
+ ld1 {v0.s}[0], [x1], x3 // Load pix1
+ ld1 {v1.s}[0], [x2], x3 // Load pix2
+ uabdl v30.8h, v0.8b, v1.8b
+ umull v16.4s, v30.4h, v30.4h
+
+ uaddlv d17, v16.4s
+ add d18, d18, d17
+
+ subs w4, w4, #1
+ b.ne 2b
+ fmov w0, s18
+
+ ret
+
+endfunc
--
2.34.1
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 25+ messages in thread
* Re: [FFmpeg-devel] [PATCH] lavc/aarch64: Add neon implementation for sse4
2022-07-28 18:51 ` Swinney, Jonathan
@ 2022-07-29 7:26 ` Hubert Mazur
0 siblings, 0 replies; 25+ messages in thread
From: Hubert Mazur @ 2022-07-29 7:26 UTC (permalink / raw)
To: Swinney, Jonathan; +Cc: gjb, upstream, martin, ffmpeg-devel, mw, Pop, Sebastian
Yes, it seems that they are misplaced or each is treated as a new series
and thus can't be applied.
I will send the whole batch again after the first review, so some issues
could be fixed.
Thanks for the feedback!
On Thu, Jul 28, 2022 at 8:51 PM Swinney, Jonathan <jswinney@amazon.com>
wrote:
> > There is one more place to move the sub, cmp and branch instructions
> apart in sse16_neon. It doesn't seem to make any difference to Neoverse N1
> and V1 and it may help A53.
>
> Sorry-- I meant sse4_neon.
>
> --
>
> Jonathan Swinney
>
> On 7/28/22, 1:50 PM, "Swinney, Jonathan" <jswinney@amazon.com> wrote:
>
> Your latest set of patches didn’t get interpreted correctly by the
> patchwork tool. I suspect it took them in the wrong order.
>
>
> https://patchwork.ffmpeg.org/project/ffmpeg/list/?series=&submitter=1479&state=&q=&archive=&delegate=
>
> There is one more place to move the sub, cmp and branch instructions
> apart in sse16_neon. It doesn't seem to make any difference to Neoverse N1
> and V1 and it may help A53.
>
> I didn't see anything else.
>
> Thanks!
> --
>
> Jonathan Swinney
>
> On 7/25/22, 6:16 AM, "Hubert Mazur" <hum@semihalf.com> wrote:
>
> CAUTION: This email originated from outside of the organization.
> Do not click links or open attachments unless you can confirm the sender
> and know the content is safe.
>
>
>
> Provide neon implementation for sse4 function.
>
> Performance comparison tests are shown below.
> - sse_2_c: 74.0
> - sse_2_neon: 24.0
>
> Benchmarks and tests are run with checkasm tool on AWS Graviton 3.
>
> Signed-off-by: Hubert Mazur <hum@semihalf.com>
> ---
> libavcodec/aarch64/me_cmp_init_aarch64.c | 3 ++
> libavcodec/aarch64/me_cmp_neon.S | 65
> ++++++++++++++++++++++++
> 2 files changed, 68 insertions(+)
>
> diff --git a/libavcodec/aarch64/me_cmp_init_aarch64.c
> b/libavcodec/aarch64/me_cmp_init_aarch64.c
> index 3ff5767bd0..72a2062e7e 100644
> --- a/libavcodec/aarch64/me_cmp_init_aarch64.c
> +++ b/libavcodec/aarch64/me_cmp_init_aarch64.c
> @@ -32,6 +32,8 @@ int ff_pix_abs16_x2_neon(MpegEncContext *v,
> uint8_t *pix1, uint8_t *pix2,
>
> int sse16_neon(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
> ptrdiff_t stride, int h);
> +int sse4_neon(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
> + ptrdiff_t stride, int h);
>
> av_cold void ff_me_cmp_init_aarch64(MECmpContext *c,
> AVCodecContext *avctx)
> {
> @@ -44,5 +46,6 @@ av_cold void ff_me_cmp_init_aarch64(MECmpContext
> *c, AVCodecContext *avctx)
>
> c->sad[0] = ff_pix_abs16_neon;
> c->sse[0] = sse16_neon;
> + c->sse[2] = sse4_neon;
> }
> }
> diff --git a/libavcodec/aarch64/me_cmp_neon.S
> b/libavcodec/aarch64/me_cmp_neon.S
> index 98c912b608..3336d88848 100644
> --- a/libavcodec/aarch64/me_cmp_neon.S
> +++ b/libavcodec/aarch64/me_cmp_neon.S
> @@ -352,3 +352,68 @@ function sse16_neon, export=1
> ret
>
> endfunc
> +
> +function sse4_neon, export=1
> + // x0 - unused
> + // x1 - pix1
> + // x2 - pix2
> + // x3 - stride
> + // w4 - h
> +
> + movi d18, #0
> + movi d17, #0
> + cmp w4, #4
> + b.le 2f
> +
> +// make 4 iterations at once
> +1:
> +
> + // res = abs(pix1[0] - pix2[0])
> + // res * res
> +
> + ld1 {v0.s}[0], [x1], x3 // Load
> pix1, first iteration
> + ld1 {v1.s}[0], [x2], x3 // Load
> pix2, first iteration
> + uabdl v30.8h, v0.8b, v1.8b //
> Absolute difference, first iteration
> + ld1 {v2.s}[0], [x1], x3 // Load
> pix1, second iteration
> + ld1 {v3.s}[0], [x2], x3 // Load
> pix2, second iteration
> + umull v16.4s, v30.4h, v30.4h //
> Multiply vectors, first iteration
> + uabdl v29.8h, v2.8b, v3.8b //
> Absolute difference, second iteration
> + ld1 {v4.s}[0], [x1], x3 // Load
> pix1, third iteration
> + ld1 {v5.s}[0], [x2], x3 // Load
> pix2, third iteration
> + umlal v16.4s, v29.4h, v29.4h //
> Multiply and accumulate, second iteration
> + uabdl v28.8h, v4.8b, v5.8b //
> Absolute difference, third iteration
> + ld1 {v6.s}[0], [x1], x3 // Load
> pix1, fourth iteration
> + ld1 {v7.s}[0], [x2], x3 // Load
> pix2, fourth iteration
> + umlal v16.4s, v28.4h, v28.4h //
> Multiply and accumulate, third iteration
> + uabdl v27.8h, v6.8b, v7.8b //
> Absolue difference, fourth iteration
> + umlal v16.4s, v27.4h, v27.4h //
> Multiply and accumulate, fourth iteration
> +
> + uaddlv d17, v16.4s // Add
> vector
> + add d18, d18, d17
> +
> + sub w4, w4, #4
> + cmp w4, #4
> + b.ge 1b
> +
> + cbnz w4, 2f
> + fmov w0, s18
> +
> + ret
> +
> +// iterate by one
> +2:
> + ld1 {v0.s}[0], [x1], x3 // Load
> pix1
> + ld1 {v1.s}[0], [x2], x3 // Load
> pix2
> + uabdl v30.8h, v0.8b, v1.8b
> + umull v16.4s, v30.4h, v30.4h
> +
> + uaddlv d17, v16.4s
> + add d18, d18, d17
> +
> + subs w4, w4, #1
> + b.ne 2b
> + fmov w0, s18
> +
> + ret
> +
> +endfunc
> --
> 2.34.1
>
>
>
>
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 25+ messages in thread
* Re: [FFmpeg-devel] [PATCH] lavc/aarch64: Add neon implementation for sse16
2022-07-25 11:12 ` [FFmpeg-devel] [PATCH] " Hubert Mazur
@ 2022-08-03 13:22 ` Martin Storsjö
2022-08-04 7:46 ` Martin Storsjö
1 sibling, 0 replies; 25+ messages in thread
From: Martin Storsjö @ 2022-08-03 13:22 UTC (permalink / raw)
To: Hubert Mazur; +Cc: gjb, upstream, jswinney, ffmpeg-devel, mw, spop
On Mon, 25 Jul 2022, Hubert Mazur wrote:
> Provide neon implementation for sse16 function.
>
> Performance comparison tests are shown below.
> - sse_0_c: 273.0
> - sse_0_neon: 48.2
>
> Benchmarks and tests run with checkasm tool on AWS Graviton 3.
>
> Signed-off-by: Hubert Mazur <hum@semihalf.com>
> ---
> libavcodec/aarch64/me_cmp_init_aarch64.c | 4 ++
> libavcodec/aarch64/me_cmp_neon.S | 82 ++++++++++++++++++++++++
> 2 files changed, 86 insertions(+)
>
> diff --git a/libavcodec/aarch64/me_cmp_init_aarch64.c b/libavcodec/aarch64/me_cmp_init_aarch64.c
> index 136b008eb7..3ff5767bd0 100644
> --- a/libavcodec/aarch64/me_cmp_init_aarch64.c
> +++ b/libavcodec/aarch64/me_cmp_init_aarch64.c
> @@ -30,6 +30,9 @@ int ff_pix_abs16_xy2_neon(MpegEncContext *s, uint8_t *blk1, uint8_t *blk2,
> int ff_pix_abs16_x2_neon(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
> ptrdiff_t stride, int h);
>
> +int sse16_neon(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
> + ptrdiff_t stride, int h);
The signature of these functions has been changed now (right after these
patches were submitted); the pix1/pix2 parameters are now const.
Also, nitpick; please align the following line ("ptrdiff_t stride, ...")
correctly with the parenthese on the line above.
> +
> av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
> {
> int cpu_flags = av_get_cpu_flags();
> @@ -40,5 +43,6 @@ av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
> c->pix_abs[0][3] = ff_pix_abs16_xy2_neon;
>
> c->sad[0] = ff_pix_abs16_neon;
> + c->sse[0] = sse16_neon;
> }
> }
> diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S
> index cda7ce0408..98c912b608 100644
> --- a/libavcodec/aarch64/me_cmp_neon.S
> +++ b/libavcodec/aarch64/me_cmp_neon.S
> @@ -270,3 +270,85 @@ function ff_pix_abs16_x2_neon, export=1
>
> ret
> endfunc
> +
> +function sse16_neon, export=1
> + // x0 - unused
> + // x1 - pix1
> + // x2 - pix2
> + // x3 - stride
> + // w4 - h
> +
> + cmp w4, #4
> + movi d18, #0
> + b.lt 2f
> +
> +// Make 4 iterations at once
> +1:
> +
> + // res = abs(pix1[0] - pix2[0])
> + // res * res
> +
> + ld1 {v0.16b}, [x1], x3 // Load pix1 vector for first iteration
> + ld1 {v1.16b}, [x2], x3 // Load pix2 vector for first iteration
> + uabd v30.16b, v0.16b, v1.16b // Absolute difference, first iteration
Try to improve the interleaving of this function; I did a quick test on
Cortex A53, A72 and A73, and got these numbers:
Before:
sse_0_neon: 147.7 64.5 64.7
After:
sse_0_neon: 133.7 60.7 59.2
Overall, try to avoid having consecutive instructions operating on the
same iteration (except for when doing the same operation on different
halves of the same iteration), i.e. not "absolute difference third
iteration; multiply lower half third iteration, multiply upper half third
iteration, pairwise add third iteration", but bundle it up so you have
e.g. "absolute difference third iteration; pairwise add first iteration;
multiply {upper,lower} half third iteration; pairwise add second
iteration; pairwise add third iteration", or something like that.
Then secondly, in general, don't serialize the summation down to a single
element in each iteration! You can keep the accumulated sum as a vX.4s
vector (or maybe even better, two .4s vectors!) throughout the whole
algorithm, and then only add them up horizontally (with an uaddv) at the
end.
For adding vectors, I would instinctively prefer doing "uaddl v0.4s,
v2.4h, v3.4h; uaddl2 v1.4s, v2.8h, v3.8h" instead of "uaddlp v0.4s,
v1.4h; uadalp v0.4s, v1.8h" etc.
I didn't try out this modification, but please do, I'm pretty sure it will
be a fair bit faster, and if not, at least more idiomatic SIMD.
I didn't check the other patches yet, but if the other sse* functions are
implemented similarly, I would expect the same feedback to apply to them
too.
Let's iterate on the sse16 patch first now at least and get that one
great, and then update sse4/sse8 similarly once we have that one settled.
I'll try to have a look at the other patches in the set later
today/tomorrow.
// Martin
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 25+ messages in thread
* Re: [FFmpeg-devel] [PATCH] lavc/aarch64: Add neon implementation for sse16
2022-07-25 11:12 ` [FFmpeg-devel] [PATCH] " Hubert Mazur
2022-08-03 13:22 ` Martin Storsjö
@ 2022-08-04 7:46 ` Martin Storsjö
1 sibling, 0 replies; 25+ messages in thread
From: Martin Storsjö @ 2022-08-04 7:46 UTC (permalink / raw)
To: Hubert Mazur; +Cc: gjb, upstream, jswinney, ffmpeg-devel, mw, spop
On Mon, 25 Jul 2022, Hubert Mazur wrote:
> Provide neon implementation for sse16 function.
>
> Performance comparison tests are shown below.
> - sse_0_c: 273.0
> - sse_0_neon: 48.2
>
> Benchmarks and tests run with checkasm tool on AWS Graviton 3.
>
> Signed-off-by: Hubert Mazur <hum@semihalf.com>
> ---
> libavcodec/aarch64/me_cmp_init_aarch64.c | 4 ++
> libavcodec/aarch64/me_cmp_neon.S | 82 ++++++++++++++++++++++++
> 2 files changed, 86 insertions(+)
> +// iterate by one
> +2:
> +
> + ld1 {v0.16b}, [x1], x3 // Load pix1
> + ld1 {v1.16b}, [x2], x3 // Load pix2
> +
> + uabd v30.16b, v0.16b, v1.16b
> + umull v29.8h, v0.8b, v1.8b
> + umull2 v28.8h, v0.16b, v1.16b
This should probably be using v30 instead of v0/v1 in the umull here.
The whole codepath for non-modulo-4 heights is untested in practice. You
can apply the patches from
https://patchwork.ffmpeg.org/project/ffmpeg/list/?series=7028 to make
checkasm test it, so please make sure that the uncommon codepaths in the
patches do work too.
// Martin
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 25+ messages in thread
* Re: [FFmpeg-devel] [PATCH] lavc/aarch64: Add neon implementation for sse4
2022-07-25 11:15 ` [FFmpeg-devel] [PATCH] " Hubert Mazur
2022-07-28 18:50 ` Swinney, Jonathan
@ 2022-08-04 8:00 ` Martin Storsjö
1 sibling, 0 replies; 25+ messages in thread
From: Martin Storsjö @ 2022-08-04 8:00 UTC (permalink / raw)
To: Hubert Mazur; +Cc: gjb, upstream, jswinney, ffmpeg-devel, mw, spop
On Mon, 25 Jul 2022, Hubert Mazur wrote:
> Provide neon implementation for sse4 function.
>
> Performance comparison tests are shown below.
> - sse_2_c: 74.0
> - sse_2_neon: 24.0
>
> Benchmarks and tests are run with checkasm tool on AWS Graviton 3.
>
> Signed-off-by: Hubert Mazur <hum@semihalf.com>
> ---
> libavcodec/aarch64/me_cmp_init_aarch64.c | 3 ++
> libavcodec/aarch64/me_cmp_neon.S | 65 ++++++++++++++++++++++++
> 2 files changed, 68 insertions(+)
>
> diff --git a/libavcodec/aarch64/me_cmp_init_aarch64.c b/libavcodec/aarch64/me_cmp_init_aarch64.c
> index 3ff5767bd0..72a2062e7e 100644
> --- a/libavcodec/aarch64/me_cmp_init_aarch64.c
> +++ b/libavcodec/aarch64/me_cmp_init_aarch64.c
> @@ -32,6 +32,8 @@ int ff_pix_abs16_x2_neon(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
>
> int sse16_neon(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
> ptrdiff_t stride, int h);
> +int sse4_neon(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
> + ptrdiff_t stride, int h);
>
> av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
> {
> @@ -44,5 +46,6 @@ av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
>
> c->sad[0] = ff_pix_abs16_neon;
> c->sse[0] = sse16_neon;
> + c->sse[2] = sse4_neon;
> }
> }
> diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S
> index 98c912b608..3336d88848 100644
> --- a/libavcodec/aarch64/me_cmp_neon.S
> +++ b/libavcodec/aarch64/me_cmp_neon.S
> @@ -352,3 +352,68 @@ function sse16_neon, export=1
> ret
>
> endfunc
> +
> +function sse4_neon, export=1
> + // x0 - unused
> + // x1 - pix1
> + // x2 - pix2
> + // x3 - stride
> + // w4 - h
> +
> + movi d18, #0
> + movi d17, #0
In the current implementation, it doesn't seem like d17 needs to be
initialized here
> + cmp w4, #4
> + b.le 2f
> +
> +// make 4 iterations at once
> +1:
> +
> + // res = abs(pix1[0] - pix2[0])
> + // res * res
> +
> + ld1 {v0.s}[0], [x1], x3 // Load pix1, first iteration
> + ld1 {v1.s}[0], [x2], x3 // Load pix2, first iteration
> + uabdl v30.8h, v0.8b, v1.8b // Absolute difference, first iteration
Right now, half of the values calculated by uabdl are unused; you could
try loading two iterations into v0.s[0] and v0.s[1] so that the full
.8b register gets used. Doing that would reduce the number of uabdl
instructions from 4 to 2 - but it might make it harder to interleave
instructions efficiently. So after all, maybe it's not worth if, it we
can make the loads more efficiently interleaved this way?
Again, also here, it'd be good to interleave things more efficiently, e.g.
like this:
ld1 first
ld1 first
ld1 second
ld1 second
uabdl first
ld1 third
ld1 third
uabdl second
umull first
ld1 fourth
ld1 fourth
uabdl third
umlal second
uabdl fourth
umlal third
umlal fourth
> + ld1 {v2.s}[0], [x1], x3 // Load pix1, second iteration
> + ld1 {v3.s}[0], [x2], x3 // Load pix2, second iteration
> + umull v16.4s, v30.4h, v30.4h // Multiply vectors, first iteration
> + uabdl v29.8h, v2.8b, v3.8b // Absolute difference, second iteration
> + ld1 {v4.s}[0], [x1], x3 // Load pix1, third iteration
> + ld1 {v5.s}[0], [x2], x3 // Load pix2, third iteration
> + umlal v16.4s, v29.4h, v29.4h // Multiply and accumulate, second iteration
> + uabdl v28.8h, v4.8b, v5.8b // Absolute difference, third iteration
> + ld1 {v6.s}[0], [x1], x3 // Load pix1, fourth iteration
> + ld1 {v7.s}[0], [x2], x3 // Load pix2, fourth iteration
> + umlal v16.4s, v28.4h, v28.4h // Multiply and accumulate, third iteration
> + uabdl v27.8h, v6.8b, v7.8b // Absolue difference, fourth iteration
> + umlal v16.4s, v27.4h, v27.4h // Multiply and accumulate, fourth iteration
> +
> + uaddlv d17, v16.4s // Add vector
> + add d18, d18, d17
As usual, don't do any *add*v within the loop, defer it as far as
possible. Here you're accumulating in 32 bit elements, so it will surely
fit the results from the whole algorithm.
Also, if you get rid of the uaddlv here, you can also accumulate into two
separate .4s registers that you only add at the end; that allows two umlal
instructions to possibly execute in parallel without waiting for each
other (provided that the cpu has enough execution units for that).
// Martin
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 25+ messages in thread
* Re: [FFmpeg-devel] [PATCH] lavc/aarch64: Add neon implementation for sse8
2022-07-25 11:18 ` [FFmpeg-devel] [PATCH] " Hubert Mazur
@ 2022-08-04 8:04 ` Martin Storsjö
0 siblings, 0 replies; 25+ messages in thread
From: Martin Storsjö @ 2022-08-04 8:04 UTC (permalink / raw)
To: Hubert Mazur; +Cc: gjb, upstream, jswinney, ffmpeg-devel, mw, spop
On Mon, 25 Jul 2022, Hubert Mazur wrote:
> Provide optimized implementation of sse8 function for arm64.
>
> Performance comparison tests are shown below.
> - sse_1_c: 133.0
> - sse_1_neon: 36.7
>
> Benchmarks and tests run with checkasm tool on AWS Graviton 3.
>
> Signed-off-by: Hubert Mazur <hum@semihalf.com>
> ---
> libavcodec/aarch64/me_cmp_init_aarch64.c | 3 +
> libavcodec/aarch64/me_cmp_neon.S | 72 ++++++++++++++++++++++++
> 2 files changed, 75 insertions(+)
The same comments as for sse16 and sse4 apply here too.
// Martin
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 25+ messages in thread
* Re: [FFmpeg-devel] [PATCH] lavc/aarch64: Add neon implementation for pix_abs16_y2
2022-07-25 11:17 ` [FFmpeg-devel] [PATCH] " Hubert Mazur
@ 2022-08-04 8:08 ` Martin Storsjö
2022-08-04 8:12 ` Martin Storsjö
1 sibling, 0 replies; 25+ messages in thread
From: Martin Storsjö @ 2022-08-04 8:08 UTC (permalink / raw)
To: Hubert Mazur; +Cc: gjb, upstream, jswinney, ffmpeg-devel, mw, spop
On Mon, 25 Jul 2022, Hubert Mazur wrote:
> Provide optimized implementation of pix_abs16_y2 function for arm64.
>
> Performance comparison tests are shown below.
> pix_abs_0_2_c: 308.5
> pix_abs_0_2_neon: 39.2
>
> Benchmarks and tests run with checkasm tool on AWS Graviton 3.
>
> Signed-off-by: Hubert Mazur <hum@semihalf.com>
> ---
> libavcodec/aarch64/me_cmp_init_aarch64.c | 3 +
> libavcodec/aarch64/me_cmp_neon.S | 73 ++++++++++++++++++++++++
> 2 files changed, 76 insertions(+)
Please do the same optimizations as done for pix_abs_xy2 in
b46de9aba436dea0cff76f3ed0f7c98448367fd0,
68a03f64240dcbe408c3fd43d1071a105508a588 and
4136405c86162063e45d40d55c9985f348d4ea0a for this function too
("aarch64: me_cmp: Interleave some of the loads in ff_pix_abs16_xy2_neon",
"aarch64: me_cmp: Switch from uabd to uabal in ff_pix_abs16_xy2_neon" and
"aarch64: me_cmp: Don't do uaddlv once per iteration").
// Martin
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 25+ messages in thread
* Re: [FFmpeg-devel] [PATCH] lavc/aarch64: Add neon implementation for pix_abs8
2022-07-25 11:21 ` [FFmpeg-devel] [PATCH] " Hubert Mazur
@ 2022-08-04 8:10 ` Martin Storsjö
0 siblings, 0 replies; 25+ messages in thread
From: Martin Storsjö @ 2022-08-04 8:10 UTC (permalink / raw)
To: Hubert Mazur; +Cc: gjb, upstream, jswinney, ffmpeg-devel, mw, spop
On Mon, 25 Jul 2022, Hubert Mazur wrote:
> Provide optimized implementation of pix_abs8 function for arm64.
>
> Performance comparison tests are shown below.
> - pix_abs_1_0_c: 105.2
> - pix_abs_1_0_neon: 21.4
> - sad_1_c: 107.2
> - sad_1_neon: 20.9
>
> Benchmarks and tests are run with checkasm tool on AWS Graviton 3.
> ---
> libavcodec/aarch64/me_cmp_init_aarch64.c | 4 ++
> libavcodec/aarch64/me_cmp_neon.S | 53 ++++++++++++++++++++++++
> 2 files changed, 57 insertions(+)
>
> diff --git a/libavcodec/aarch64/me_cmp_init_aarch64.c b/libavcodec/aarch64/me_cmp_init_aarch64.c
> index 89c817990c..7d7dc38754 100644
> --- a/libavcodec/aarch64/me_cmp_init_aarch64.c
> +++ b/libavcodec/aarch64/me_cmp_init_aarch64.c
> @@ -31,6 +31,8 @@ int ff_pix_abs16_x2_neon(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
> ptrdiff_t stride, int h);
> int ff_pix_abs16_y2_neon(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
> ptrdiff_t stride, int h);
> +int ff_pix_abs8_neon(MpegEncContext *s, uint8_t *blk1, uint8_t *blk2,
> + ptrdiff_t stride, int h);
>
> int sse16_neon(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
> ptrdiff_t stride, int h);
> @@ -48,8 +50,10 @@ av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
> c->pix_abs[0][1] = ff_pix_abs16_x2_neon;
> c->pix_abs[0][2] = ff_pix_abs16_y2_neon;
> c->pix_abs[0][3] = ff_pix_abs16_xy2_neon;
> + c->pix_abs[1][0] = ff_pix_abs8_neon;
>
> c->sad[0] = ff_pix_abs16_neon;
> + c->sad[1] = ff_pix_abs8_neon;
> c->sse[0] = sse16_neon;
> c->sse[1] = sse8_neon;
> c->sse[2] = sse4_neon;
> diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S
> index dcaffc9b73..f2dd63ced1 100644
> --- a/libavcodec/aarch64/me_cmp_neon.S
> +++ b/libavcodec/aarch64/me_cmp_neon.S
> @@ -72,6 +72,59 @@ function ff_pix_abs16_neon, export=1
> ret
> endfunc
>
> +function ff_pix_abs8_neon, export=1
> + // x0 unused
> + // x1 uint8_t *pix1
> + // x2 uint8_t *pix2
> + // x3 ptrdiff_t stride
> + // x4 int h
> +
> + movi d18, #0
> + cmp w4, #4
> + b.lt 2f
> +
> +// make 4 iterations at once
> +1:
> + ld1 {v0.8b}, [x1], x3 // Load pix1 for first iteration
> + ld1 {v1.8b}, [x2], x3 // Load pix2 for first iteration
> + uabdl v30.8h, v0.8b, v1.8b // Absolute difference, first iteration
> + ld1 {v2.8b}, [x1], x3 // Load pix1 for second iteration
> + ld1 {v3.8b}, [x2], x3 // Load pix2 for second iteration
> + uabal v30.8h, v2.8b, v3.8b // Absolute difference, second iteration
> + ld1 {v4.8b}, [x1], x3 // Load pix1 for third iteration
> + ld1 {v5.8b}, [x2], x3 // Load pix2 for third iteration
> + uabal v30.8h, v4.8b, v5.8b // Absolute difference, third iteration
> + ld1 {v6.8b}, [x1], x3 // Load pix1 for foruth iteration
> + ld1 {v7.8b}, [x2], x3 // Load pix2 for fourth iteration
> + uabal v30.8h, v6.8b, v7.8b // Absolute difference, foruth iteration
This is maybe the simplest example so far, where the unrolled version here
just is 4 identical serial copies of the same set of 3 instructions; this
maybe helps a bit on some CPUs, but it doesn't help nearly as much as it
can on others, if it would be better unrolled.
I.e., same comments as for the other patches; improve interleaving, don't
do uaddlv once per iteration.
// Martin
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 25+ messages in thread
* Re: [FFmpeg-devel] [PATCH] lavc/aarch64: Add neon implementation for pix_abs16_y2
2022-07-25 11:17 ` [FFmpeg-devel] [PATCH] " Hubert Mazur
2022-08-04 8:08 ` Martin Storsjö
@ 2022-08-04 8:12 ` Martin Storsjö
1 sibling, 0 replies; 25+ messages in thread
From: Martin Storsjö @ 2022-08-04 8:12 UTC (permalink / raw)
To: Hubert Mazur; +Cc: gjb, upstream, jswinney, ffmpeg-devel, mw, spop
On Mon, 25 Jul 2022, Hubert Mazur wrote:
> Provide optimized implementation of pix_abs16_y2 function for arm64.
>
> Performance comparison tests are shown below.
> pix_abs_0_2_c: 308.5
> pix_abs_0_2_neon: 39.2
>
> Benchmarks and tests run with checkasm tool on AWS Graviton 3.
>
> Signed-off-by: Hubert Mazur <hum@semihalf.com>
> ---
> libavcodec/aarch64/me_cmp_init_aarch64.c | 3 +
> libavcodec/aarch64/me_cmp_neon.S | 73 ++++++++++++++++++++++++
> 2 files changed, 76 insertions(+)
> +// iterate by one
> +2:
> +
> + ld1 {v1.16b}, [x2], x3 // Load pix2
> + ld1 {v2.16b}, [x5], x3 // Load pix3
> + urhadd v30.16b, v1.16b, v2.16b // Rounding halving add
> + ld1 {v0.16b}, [x1], x3 // Load pix1
> + uabd v30.16b, v30.16b, v30.16b
This should be "uabd v30, v30, v0" here too - please check the uncommon
codepaths too (until we can make checkasm test them by default).
// Martin
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 25+ messages in thread
* Re: [FFmpeg-devel] [PATCH 5/5] lavc/aarch64: Add neon implementation for pix_abs8
2022-08-16 12:20 ` [FFmpeg-devel] [PATCH 5/5] lavc/aarch64: Add neon implementation for pix_abs8 Hubert Mazur
@ 2022-08-18 9:22 ` Martin Storsjö
0 siblings, 0 replies; 25+ messages in thread
From: Martin Storsjö @ 2022-08-18 9:22 UTC (permalink / raw)
To: Hubert Mazur; +Cc: gjb, upstream, jswinney, ffmpeg-devel, mw, spop
On Tue, 16 Aug 2022, Hubert Mazur wrote:
> Provide optimized implementation of pix_abs8 function for arm64.
>
> Performance comparison tests are shown below.
> - pix_abs_1_0_c: 101.2
> - pix_abs_1_0_neon: 22.5
> - sad_1_c: 101.2
> - sad_1_neon: 22.5
>
> Benchmarks and tests are run with checkasm tool on AWS Graviton 3.
> ---
> libavcodec/aarch64/me_cmp_init_aarch64.c | 4 ++
> libavcodec/aarch64/me_cmp_neon.S | 49 ++++++++++++++++++++++++
> 2 files changed, 53 insertions(+)
>
> diff --git a/libavcodec/aarch64/me_cmp_init_aarch64.c b/libavcodec/aarch64/me_cmp_init_aarch64.c
> index 2f51f0497e..e7dbd4cbc5 100644
> --- a/libavcodec/aarch64/me_cmp_init_aarch64.c
> +++ b/libavcodec/aarch64/me_cmp_init_aarch64.c
> @@ -31,6 +31,8 @@ int ff_pix_abs16_x2_neon(MpegEncContext *v, const uint8_t *pix1, const uint8_t *
> ptrdiff_t stride, int h);
> int ff_pix_abs16_y2_neon(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
> ptrdiff_t stride, int h);
> +int ff_pix_abs8_neon(MpegEncContext *s, const uint8_t *blk1, const uint8_t *blk2,
> + ptrdiff_t stride, int h);
Alignment
> diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S
> index 3f4266d4d5..8c396cad21 100644
> --- a/libavcodec/aarch64/me_cmp_neon.S
> +++ b/libavcodec/aarch64/me_cmp_neon.S
> @@ -72,6 +72,55 @@ function ff_pix_abs16_neon, export=1
> ret
> endfunc
>
> +function ff_pix_abs8_neon, export=1
> + // x0 unused
> + // x1 uint8_t *pix1
> + // x2 uint8_t *pix2
> + // x3 ptrdiff_t stride
> + // x4 int h
w4, not x4
> +
> + movi d18, #0
Unused d18
> + movi v30.8h, #0
> + cmp w4, #4
> + b.lt 2f
> +
> +// make 4 iterations at once
> +1:
> + ld1 {v0.8b}, [x1], x3 // Load pix1 for first iteration
> + ld1 {v1.8b}, [x2], x3 // Load pix2 for first iteration
> + ld1 {v2.8b}, [x1], x3 // Load pix1 for second iteration
> + uabal v30.8h, v0.8b, v1.8b // Absolute difference, first iteration
> + ld1 {v3.8b}, [x2], x3 // Load pix2 for second iteration
> + ld1 {v4.8b}, [x1], x3 // Load pix1 for third iteration
> + uabal v30.8h, v2.8b, v3.8b // Absolute difference, second iteration
> + ld1 {v5.8b}, [x2], x3 // Load pix2 for third iteration
> + sub w4, w4, #4 // h -= 4
> + uabal v30.8h, v4.8b, v5.8b // Absolute difference, third iteration
> + ld1 {v6.8b}, [x1], x3 // Load pix1 for foruth iteration
> + ld1 {v7.8b}, [x2], x3 // Load pix2 for fourth iteration
> + cmp w4, #4
> + uabal v30.8h, v6.8b, v7.8b // Absolute difference, foruth iteration
The interleaving here looks mostly quite good, but the last uabal comes
almost directly after the two loads; I moved the second-last uabal from
before the two ld1s to between ld1 and cmp, and got a rather notable
speedup.
Before: Cortex A53 A72 A73
pix_abs_1_0_neon: 65.7 33.7 21.5
After:
pix_abs_1_0_neon: 57.7 33.5 21.5
So this is a 13% speedup on Cortex A53, just by moving one single
instruction. This is why paying attention to scheduling matters, sometimes
a lot.
> + uaddlv s20, v30.8h // Add up vector
> + add d18, d18, d20
> + fmov w0, s18
And finally, by removing the unnecessary add of d18 here, I got this
further reduced to the following runtimes:
Cortex A53 A72 A73
pix_abs_1_0_neon: 54.7 30.7 20.2
// Martin
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 25+ messages in thread
* [FFmpeg-devel] [PATCH 5/5] lavc/aarch64: Add neon implementation for pix_abs8
2022-08-16 12:20 [FFmpeg-devel] [PATCH 0/5] Provide neon implementation for me_cmp functions Hubert Mazur
@ 2022-08-16 12:20 ` Hubert Mazur
2022-08-18 9:22 ` Martin Storsjö
0 siblings, 1 reply; 25+ messages in thread
From: Hubert Mazur @ 2022-08-16 12:20 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: gjb, upstream, jswinney, Hubert Mazur, martin, mw, spop
Provide optimized implementation of pix_abs8 function for arm64.
Performance comparison tests are shown below.
- pix_abs_1_0_c: 101.2
- pix_abs_1_0_neon: 22.5
- sad_1_c: 101.2
- sad_1_neon: 22.5
Benchmarks and tests are run with checkasm tool on AWS Graviton 3.
---
libavcodec/aarch64/me_cmp_init_aarch64.c | 4 ++
libavcodec/aarch64/me_cmp_neon.S | 49 ++++++++++++++++++++++++
2 files changed, 53 insertions(+)
diff --git a/libavcodec/aarch64/me_cmp_init_aarch64.c b/libavcodec/aarch64/me_cmp_init_aarch64.c
index 2f51f0497e..e7dbd4cbc5 100644
--- a/libavcodec/aarch64/me_cmp_init_aarch64.c
+++ b/libavcodec/aarch64/me_cmp_init_aarch64.c
@@ -31,6 +31,8 @@ int ff_pix_abs16_x2_neon(MpegEncContext *v, const uint8_t *pix1, const uint8_t *
ptrdiff_t stride, int h);
int ff_pix_abs16_y2_neon(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
ptrdiff_t stride, int h);
+int ff_pix_abs8_neon(MpegEncContext *s, const uint8_t *blk1, const uint8_t *blk2,
+ ptrdiff_t stride, int h);
int sse16_neon(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
ptrdiff_t stride, int h);
@@ -49,8 +51,10 @@ av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
c->pix_abs[0][1] = ff_pix_abs16_x2_neon;
c->pix_abs[0][2] = ff_pix_abs16_y2_neon;
c->pix_abs[0][3] = ff_pix_abs16_xy2_neon;
+ c->pix_abs[1][0] = ff_pix_abs8_neon;
c->sad[0] = ff_pix_abs16_neon;
+ c->sad[1] = ff_pix_abs8_neon;
c->sse[0] = sse16_neon;
c->sse[1] = sse8_neon;
c->sse[2] = sse4_neon;
diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S
index 3f4266d4d5..8c396cad21 100644
--- a/libavcodec/aarch64/me_cmp_neon.S
+++ b/libavcodec/aarch64/me_cmp_neon.S
@@ -72,6 +72,55 @@ function ff_pix_abs16_neon, export=1
ret
endfunc
+function ff_pix_abs8_neon, export=1
+ // x0 unused
+ // x1 uint8_t *pix1
+ // x2 uint8_t *pix2
+ // x3 ptrdiff_t stride
+ // x4 int h
+
+ movi d18, #0
+ movi v30.8h, #0
+ cmp w4, #4
+ b.lt 2f
+
+// make 4 iterations at once
+1:
+ ld1 {v0.8b}, [x1], x3 // Load pix1 for first iteration
+ ld1 {v1.8b}, [x2], x3 // Load pix2 for first iteration
+ ld1 {v2.8b}, [x1], x3 // Load pix1 for second iteration
+ uabal v30.8h, v0.8b, v1.8b // Absolute difference, first iteration
+ ld1 {v3.8b}, [x2], x3 // Load pix2 for second iteration
+ ld1 {v4.8b}, [x1], x3 // Load pix1 for third iteration
+ uabal v30.8h, v2.8b, v3.8b // Absolute difference, second iteration
+ ld1 {v5.8b}, [x2], x3 // Load pix2 for third iteration
+ sub w4, w4, #4 // h -= 4
+ uabal v30.8h, v4.8b, v5.8b // Absolute difference, third iteration
+ ld1 {v6.8b}, [x1], x3 // Load pix1 for foruth iteration
+ ld1 {v7.8b}, [x2], x3 // Load pix2 for fourth iteration
+ cmp w4, #4
+ uabal v30.8h, v6.8b, v7.8b // Absolute difference, foruth iteration
+ b.ge 1b
+
+ cbz w4, 3f
+
+// iterate by one
+2:
+ ld1 {v0.8b}, [x1], x3 // Load pix1
+ ld1 {v1.8b}, [x2], x3 // Load pix2
+
+ subs w4, w4, #1
+ uabal v30.8h, v0.8b, v1.8b
+ b.ne 2b
+
+3:
+ uaddlv s20, v30.8h // Add up vector
+ add d18, d18, d20
+ fmov w0, s18
+
+ ret
+endfunc
+
function ff_pix_abs16_xy2_neon, export=1
// x0 unused
// x1 uint8_t *pix1
--
2.34.1
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 25+ messages in thread
end of thread, other threads:[~2022-08-18 9:23 UTC | newest]
Thread overview: 25+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-07-15 8:02 [FFmpeg-devel] [PATCH 0/5] Add neon implementation for me_cmp functions Hubert Mazur
2022-07-15 8:02 ` [FFmpeg-devel] [PATCH 1/5] lavc/aarch64: Add neon implementation for sse16 Hubert Mazur
2022-07-25 11:12 ` [FFmpeg-devel] [PATCH] " Hubert Mazur
2022-08-03 13:22 ` Martin Storsjö
2022-08-04 7:46 ` Martin Storsjö
2022-07-15 8:02 ` [FFmpeg-devel] [PATCH 2/5] lavc/aarch64: Add neon implementation for sse4 Hubert Mazur
2022-07-21 21:43 ` Martin Storsjö
2022-07-22 21:30 ` Swinney, Jonathan
2022-07-25 11:15 ` [FFmpeg-devel] [PATCH] " Hubert Mazur
2022-07-28 18:50 ` Swinney, Jonathan
2022-07-28 18:51 ` Swinney, Jonathan
2022-07-29 7:26 ` Hubert Mazur
2022-08-04 8:00 ` Martin Storsjö
2022-07-15 8:02 ` [FFmpeg-devel] [PATCH 3/5] lavc/aarch64: Add neon implementation for pix_abs16_y2 Hubert Mazur
2022-07-25 11:17 ` [FFmpeg-devel] [PATCH] " Hubert Mazur
2022-08-04 8:08 ` Martin Storsjö
2022-08-04 8:12 ` Martin Storsjö
2022-07-15 8:02 ` [FFmpeg-devel] [PATCH 4/5] lavc/aarch64: Add neon implementation for sse8 Hubert Mazur
2022-07-25 11:18 ` [FFmpeg-devel] [PATCH] " Hubert Mazur
2022-08-04 8:04 ` Martin Storsjö
2022-07-15 8:02 ` [FFmpeg-devel] [PATCH 5/5] lavc/aarch64: Add neon implementation for pix_abs8 Hubert Mazur
2022-07-25 11:21 ` [FFmpeg-devel] [PATCH] " Hubert Mazur
2022-08-04 8:10 ` Martin Storsjö
2022-08-16 12:20 [FFmpeg-devel] [PATCH 0/5] Provide neon implementation for me_cmp functions Hubert Mazur
2022-08-16 12:20 ` [FFmpeg-devel] [PATCH 5/5] lavc/aarch64: Add neon implementation for pix_abs8 Hubert Mazur
2022-08-18 9:22 ` Martin Storsjö
Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
This inbox may be cloned and mirrored by anyone:
git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git
# If you have public-inbox 1.1+ installed, you may
# initialize and index your mirror using the following commands:
public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
ffmpegdev@gitmailbox.com
public-inbox-index ffmpegdev
Example config snippet for mirrors.
AGPL code for this site: git clone https://public-inbox.org/public-inbox.git