* [FFmpeg-devel] [PATCH 1/5] lavc/aarch64: Add neon implementation for vsad16
2022-09-08 9:25 [FFmpeg-devel] [PATCH 0/5] Provide optimized neon implementation Hubert Mazur
@ 2022-09-08 9:25 ` Hubert Mazur
2022-09-08 9:25 ` [FFmpeg-devel] [PATCH 2/5] lavc/aarch64: Add neon implementation of vsse16 Hubert Mazur
` (4 subsequent siblings)
5 siblings, 0 replies; 11+ messages in thread
From: Hubert Mazur @ 2022-09-08 9:25 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: gjb, upstream, jswinney, Hubert Mazur, martin, mw, spop
Provide optimized implementation of vsad16 function for arm64.
Performance comparison tests are shown below.
- vsad_0_c: 285.2
- vsad_0_neon: 39.5
Benchmarks and tests are run with checkasm tool on AWS Graviton 3.
Co-authored-by: Martin Storsjö <martin@martin.st>
Signed-off-by: Hubert Mazur <hum@semihalf.com>
---
libavcodec/aarch64/me_cmp_init_aarch64.c | 5 ++
libavcodec/aarch64/me_cmp_neon.S | 65 ++++++++++++++++++++++++
2 files changed, 70 insertions(+)
diff --git a/libavcodec/aarch64/me_cmp_init_aarch64.c b/libavcodec/aarch64/me_cmp_init_aarch64.c
index fb7c3f5059..ddc5d05611 100644
--- a/libavcodec/aarch64/me_cmp_init_aarch64.c
+++ b/libavcodec/aarch64/me_cmp_init_aarch64.c
@@ -41,6 +41,9 @@ int sse8_neon(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
int sse4_neon(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
ptrdiff_t stride, int h);
+int vsad16_neon(MpegEncContext *c, const uint8_t *s1, const uint8_t *s2,
+ ptrdiff_t stride, int h);
+
av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
{
int cpu_flags = av_get_cpu_flags();
@@ -57,5 +60,7 @@ av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
c->sse[0] = sse16_neon;
c->sse[1] = sse8_neon;
c->sse[2] = sse4_neon;
+
+ c->vsad[0] = vsad16_neon;
}
}
diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S
index 4198985c6c..1d0b166d69 100644
--- a/libavcodec/aarch64/me_cmp_neon.S
+++ b/libavcodec/aarch64/me_cmp_neon.S
@@ -584,3 +584,68 @@ function sse4_neon, export=1
ret
endfunc
+
+function vsad16_neon, export=1
+ // x0 unused
+ // x1 uint8_t *pix1
+ // x2 uint8_t *pix2
+ // x3 ptrdiff_t stride
+ // w4 int h
+
+ ld1 {v0.16b}, [x1], x3 // Load pix1[0], first iteration
+ ld1 {v1.16b}, [x2], x3 // Load pix2[0], first iteration
+
+ sub w4, w4, #1 // we need to make h-1 iterations
+ movi v16.8h, #0
+
+ cmp w4, #3 // check if we can make 3 iterations at once
+ usubl v31.8h, v0.8b, v1.8b // Signed difference pix1[0] - pix2[0], first iteration
+ usubl2 v30.8h, v0.16b, v1.16b // Signed difference pix1[0] - pix2[0], first iteration
+
+ b.lt 2f
+
+1:
+ // abs(pix1[0] - pix2[0] - pix1[0 + stride] + pix2[0 + stride])
+ ld1 {v0.16b}, [x1], x3 // Load pix1[0 + stride], first iteration
+ ld1 {v1.16b}, [x2], x3 // Load pix2[0 + stride], first iteration
+ ld1 {v2.16b}, [x1], x3 // Load pix1[0 + stride], second iteration
+ ld1 {v3.16b}, [x2], x3 // Load pix2[0 + stride], second iteration
+ usubl v29.8h, v0.8b, v1.8b // Signed difference pix1[0 + stride] - pix2[0 + stride], first iteration
+ usubl2 v28.8h, v0.16b, v1.16b // Signed difference pix1[0 + stride] - pix2[0 + stride], first iteration
+ ld1 {v4.16b}, [x1], x3 // Load pix1[0 + stride], third iteration
+ ld1 {v5.16b}, [x2], x3 // Load pix2[0 + stride], third iteration
+ usubl v27.8h, v2.8b, v3.8b // Signed difference pix1[0 + stride] - pix2[0 + stride], second iteration
+ saba v16.8h, v31.8h, v29.8h // Signed absolute difference and accumulate the result. first iteration
+ usubl2 v26.8h, v2.16b, v3.16b // Signed difference pix1[0 + stride] - pix2[0 + stride], second iteration
+ saba v16.8h, v30.8h, v28.8h // Signed absolute difference and accumulate the result. first iteration
+ usubl v25.8h, v4.8b, v5.8b // Signed difference pix1[0 + stride] - pix2[0 + stride], third iteration
+ usubl2 v24.8h, v4.16b, v5.16b // Signed difference pix1[0 + stride] - pix2[0 + stride], third iteration
+ saba v16.8h, v29.8h, v27.8h // Signed absolute difference and accumulate the result. second iteration
+ mov v31.16b, v25.16b
+ saba v16.8h, v28.8h, v26.8h // Signed absolute difference and accumulate the result. second iteration
+ sub w4, w4, #3 // h -= 3
+ mov v30.16b, v24.16b
+ saba v16.8h, v27.8h, v25.8h // Signed absolute difference and accumulate the result. third iteration
+ cmp w4, #3
+ saba v16.8h, v26.8h, v24.8h // Signed absolute difference and accumulate the result. third iteration
+
+ b.ge 1b
+ cbz w4, 3f
+2:
+ ld1 {v0.16b}, [x1], x3
+ ld1 {v1.16b}, [x2], x3
+ subs w4, w4, #1
+ usubl v29.8h, v0.8b, v1.8b
+ usubl2 v28.8h, v0.16b, v1.16b
+ saba v16.8h, v31.8h, v29.8h
+ mov v31.16b, v29.16b
+ saba v16.8h, v30.8h, v28.8h
+ mov v30.16b, v28.16b
+
+ b.ne 2b
+3:
+ uaddlv s17, v16.8h
+ fmov w0, s17
+
+ ret
+endfunc
--
2.34.1
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 11+ messages in thread
* [FFmpeg-devel] [PATCH 2/5] lavc/aarch64: Add neon implementation of vsse16
2022-09-08 9:25 [FFmpeg-devel] [PATCH 0/5] Provide optimized neon implementation Hubert Mazur
2022-09-08 9:25 ` [FFmpeg-devel] [PATCH 1/5] lavc/aarch64: Add neon implementation for vsad16 Hubert Mazur
@ 2022-09-08 9:25 ` Hubert Mazur
2022-09-08 9:25 ` [FFmpeg-devel] [PATCH 3/5] lavc/aarch64: Add neon implementation for vsad_intra16 Hubert Mazur
` (3 subsequent siblings)
5 siblings, 0 replies; 11+ messages in thread
From: Hubert Mazur @ 2022-09-08 9:25 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: gjb, upstream, jswinney, Hubert Mazur, martin, mw, spop
Provide optimized implementation of vsse16 for arm64.
Performance comparison tests are shown below.
- vsse_0_c: 257.7
- vsse_0_neon: 59.2
Benchmarks and tests are run with checkasm tool on AWS Graviton 3.
Signed-off-by: Hubert Mazur <hum@semihalf.com>
---
libavcodec/aarch64/me_cmp_init_aarch64.c | 4 ++
libavcodec/aarch64/me_cmp_neon.S | 87 ++++++++++++++++++++++++
2 files changed, 91 insertions(+)
diff --git a/libavcodec/aarch64/me_cmp_init_aarch64.c b/libavcodec/aarch64/me_cmp_init_aarch64.c
index ddc5d05611..7b81e48d16 100644
--- a/libavcodec/aarch64/me_cmp_init_aarch64.c
+++ b/libavcodec/aarch64/me_cmp_init_aarch64.c
@@ -43,6 +43,8 @@ int sse4_neon(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
int vsad16_neon(MpegEncContext *c, const uint8_t *s1, const uint8_t *s2,
ptrdiff_t stride, int h);
+int vsse16_neon(MpegEncContext *c, const uint8_t *s1, const uint8_t *s2,
+ ptrdiff_t stride, int h);
av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
{
@@ -62,5 +64,7 @@ av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
c->sse[2] = sse4_neon;
c->vsad[0] = vsad16_neon;
+
+ c->vsse[0] = vsse16_neon;
}
}
diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S
index 1d0b166d69..b3f376aa60 100644
--- a/libavcodec/aarch64/me_cmp_neon.S
+++ b/libavcodec/aarch64/me_cmp_neon.S
@@ -649,3 +649,90 @@ function vsad16_neon, export=1
ret
endfunc
+
+function vsse16_neon, export=1
+ // x0 unused
+ // x1 uint8_t *pix1
+ // x2 uint8_t *pix2
+ // x3 ptrdiff_t stride
+ // w4 int h
+
+ ld1 {v0.16b}, [x1], x3 // Load pix1[0], first iteration
+ ld1 {v1.16b}, [x2], x3 // Load pix2[0], first iteration
+
+ sub w4, w4, #1 // we need to make h-1 iterations
+ movi v16.4s, #0
+ movi v17.4s, #0
+
+ cmp w4, #3 // check if we can make 3 iterations at once
+ usubl v31.8h, v0.8b, v1.8b // Signed difference of pix1[0] - pix2[0], first iteration
+ usubl2 v30.8h, v0.16b, v1.16b // Signed difference of pix1[0] - pix2[0], first iteration
+ b.le 2f
+
+
+1:
+ // x = abs(pix1[0] - pix2[0] - pix1[0 + stride] + pix2[0 + stride])
+ // res = (x) * (x)
+ ld1 {v0.16b}, [x1], x3 // Load pix1[0 + stride], first iteration
+ ld1 {v1.16b}, [x2], x3 // Load pix2[0 + stride], first iteration
+ ld1 {v2.16b}, [x1], x3 // Load pix1[0 + stride], second iteration
+ ld1 {v3.16b}, [x2], x3 // Load pix2[0 + stride], second iteration
+ usubl v29.8h, v0.8b, v1.8b
+ usubl2 v28.8h, v0.16b, v1.16b
+ ld1 {v4.16b}, [x1], x3 // Load pix1[0 + stride], third iteration
+ ld1 {v5.16b}, [x2], x3 // Load pix1[0 + stride], third iteration
+ sabd v31.8h, v31.8h, v29.8h
+ sabd v30.8h, v30.8h, v28.8h
+ usubl v27.8h, v2.8b, v3.8b
+ usubl2 v26.8h, v2.16b, v3.16b
+ usubl v25.8h, v4.8b, v5.8b
+ usubl2 v24.8h, v4.16b, v5.16b
+ sabd v29.8h, v29.8h, v27.8h
+ sabd v27.8h, v27.8h, v25.8h
+ umlal v16.4s, v31.4h, v31.4h
+ umlal2 v17.4s, v31.8h, v31.8h
+ sabd v28.8h, v28.8h, v26.8h
+ sabd v26.8h, v26.8h, v24.8h
+ umlal v16.4s, v30.4h, v30.4h
+ umlal2 v17.4s, v30.8h, v30.8h
+ mov v31.16b, v25.16b
+ umlal v16.4s, v29.4h, v29.4h
+ umlal2 v17.4s, v29.8h, v29.8h
+ mov v30.16b, v24.16b
+ umlal v16.4s, v28.4h, v28.4h
+ umlal2 v17.4s, v28.8h, v28.8h
+ sub w4, w4, #3
+ umlal v16.4s, v27.4h, v27.4h
+ umlal2 v17.4s, v27.8h, v27.8h
+ cmp w4, #3
+ umlal v16.4s, v26.4h, v26.4h
+ umlal2 v17.4s, v26.8h, v26.8h
+
+ b.ge 1b
+
+ cbz w4, 3f
+
+// iterate by once
+2:
+ ld1 {v0.16b}, [x1], x3
+ ld1 {v1.16b}, [x2], x3
+ subs w4, w4, #1
+ usubl v29.8h, v0.8b, v1.8b
+ usubl2 v28.8h, v0.16b, v1.16b
+ sabd v31.8h, v31.8h, v29.8h
+ sabd v30.8h, v30.8h, v28.8h
+ umlal v16.4s, v31.4h, v31.4h
+ umlal2 v17.4s, v31.8h, v31.8h
+ mov v31.16b, v29.16b
+ umlal v16.4s, v30.4h, v30.4h
+ umlal2 v17.4s, v30.8h, v30.8h
+ mov v30.16b, v28.16b
+ b.ne 2b
+
+3:
+ add v16.4s, v16.4s, v17.4s
+ uaddlv d17, v16.4s
+ fmov w0, s17
+
+ ret
+endfunc
--
2.34.1
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 11+ messages in thread
* [FFmpeg-devel] [PATCH 3/5] lavc/aarch64: Add neon implementation for vsad_intra16
2022-09-08 9:25 [FFmpeg-devel] [PATCH 0/5] Provide optimized neon implementation Hubert Mazur
2022-09-08 9:25 ` [FFmpeg-devel] [PATCH 1/5] lavc/aarch64: Add neon implementation for vsad16 Hubert Mazur
2022-09-08 9:25 ` [FFmpeg-devel] [PATCH 2/5] lavc/aarch64: Add neon implementation of vsse16 Hubert Mazur
@ 2022-09-08 9:25 ` Hubert Mazur
2022-09-08 9:25 ` [FFmpeg-devel] [PATCH 4/5] lavc/aarch64: Add neon implementation for vsse_intra16 Hubert Mazur
` (2 subsequent siblings)
5 siblings, 0 replies; 11+ messages in thread
From: Hubert Mazur @ 2022-09-08 9:25 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: gjb, upstream, jswinney, Hubert Mazur, martin, mw, spop
Provide optimized implementation for vsad_intra16 function for arm64.
Performance comparison tests are shown below.
- vsad_4_c: 177.5
- vsad_4_neon: 23.5
Benchmarks and tests are run with checkasm tool on AWS Graviton 3.
Signed-off-by: Hubert Mazur <hum@semihalf.com>
---
libavcodec/aarch64/me_cmp_init_aarch64.c | 3 ++
libavcodec/aarch64/me_cmp_neon.S | 48 ++++++++++++++++++++++++
2 files changed, 51 insertions(+)
diff --git a/libavcodec/aarch64/me_cmp_init_aarch64.c b/libavcodec/aarch64/me_cmp_init_aarch64.c
index 7b81e48d16..af83f7ed1e 100644
--- a/libavcodec/aarch64/me_cmp_init_aarch64.c
+++ b/libavcodec/aarch64/me_cmp_init_aarch64.c
@@ -43,6 +43,8 @@ int sse4_neon(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
int vsad16_neon(MpegEncContext *c, const uint8_t *s1, const uint8_t *s2,
ptrdiff_t stride, int h);
+int vsad_intra16_neon(MpegEncContext *c, const uint8_t *s, const uint8_t *dummy,
+ ptrdiff_t stride, int h) ;
int vsse16_neon(MpegEncContext *c, const uint8_t *s1, const uint8_t *s2,
ptrdiff_t stride, int h);
@@ -64,6 +66,7 @@ av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
c->sse[2] = sse4_neon;
c->vsad[0] = vsad16_neon;
+ c->vsad[4] = vsad_intra16_neon;
c->vsse[0] = vsse16_neon;
}
diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S
index b3f376aa60..ce198ea227 100644
--- a/libavcodec/aarch64/me_cmp_neon.S
+++ b/libavcodec/aarch64/me_cmp_neon.S
@@ -736,3 +736,51 @@ function vsse16_neon, export=1
ret
endfunc
+
+function vsad_intra16_neon, export=1
+ // x0 unused
+ // x1 uint8_t *pix1
+ // x2 uint8_t *dummy
+ // x3 ptrdiff_t stride
+ // w4 int h
+
+ ld1 {v0.16b}, [x1], x3
+ sub w4, w4, #1 // we need to make h-1 iterations
+ cmp w4, #3
+ movi v16.8h, #0
+ b.lt 2f
+
+// make 4 iterations at once
+1:
+ // v = abs( pix1[0] - pix1[0 + stride] )
+ // score = sum(v)
+ ld1 {v1.16b}, [x1], x3
+ ld1 {v2.16b}, [x1], x3
+ uabal v16.8h, v0.8b, v1.8b
+ ld1 {v3.16b}, [x1], x3
+ uabal2 v16.8h, v0.16b, v1.16b
+ sub w4, w4, #3
+ uabal v16.8h, v1.8b, v2.8b
+ cmp w4, #3
+ uabal2 v16.8h, v1.16b, v2.16b
+ mov v0.16b, v3.16b
+ uabal v16.8h, v2.8b, v3.8b
+ uabal2 v16.8h, v2.16b, v3.16b
+ b.ge 1b
+ cbz w4, 3f
+
+// iterate by one
+2:
+ ld1 {v1.16b}, [x1], x3
+ subs w4, w4, #1
+ uabal v16.8h, v0.8b, v1.8b
+ uabal2 v16.8h, v0.16b, v1.16b
+ mov v0.16b, v1.16b
+ cbnz w4, 2b
+
+3:
+ uaddlv s17, v16.8h
+ fmov w0, s17
+
+ ret
+endfunc
--
2.34.1
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 11+ messages in thread
* [FFmpeg-devel] [PATCH 4/5] lavc/aarch64: Add neon implementation for vsse_intra16
2022-09-08 9:25 [FFmpeg-devel] [PATCH 0/5] Provide optimized neon implementation Hubert Mazur
` (2 preceding siblings ...)
2022-09-08 9:25 ` [FFmpeg-devel] [PATCH 3/5] lavc/aarch64: Add neon implementation for vsad_intra16 Hubert Mazur
@ 2022-09-08 9:25 ` Hubert Mazur
2022-09-08 9:25 ` [FFmpeg-devel] [PATCH 5/5] lavc/aarch64: Provide neon implementation of nsse16 Hubert Mazur
2022-09-09 7:32 ` [FFmpeg-devel] [PATCH 0/5] Provide optimized neon implementation Martin Storsjö
5 siblings, 0 replies; 11+ messages in thread
From: Hubert Mazur @ 2022-09-08 9:25 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: gjb, upstream, jswinney, Hubert Mazur, martin, mw, spop
Provide optimized implementation for vsse_intra16 for arm64.
Performance tests are shown below.
- vsse_4_c: 155.2
- vsse_4_neon: 36.2
Benchmarks and tests are run with checkasm tool on AWS Graviton 3.
Signed-off-by: Hubert Mazur <hum@semihalf.com>
---
libavcodec/aarch64/me_cmp_init_aarch64.c | 3 ++
libavcodec/aarch64/me_cmp_neon.S | 63 ++++++++++++++++++++++++
2 files changed, 66 insertions(+)
diff --git a/libavcodec/aarch64/me_cmp_init_aarch64.c b/libavcodec/aarch64/me_cmp_init_aarch64.c
index af83f7ed1e..8c295d5457 100644
--- a/libavcodec/aarch64/me_cmp_init_aarch64.c
+++ b/libavcodec/aarch64/me_cmp_init_aarch64.c
@@ -47,6 +47,8 @@ int vsad_intra16_neon(MpegEncContext *c, const uint8_t *s, const uint8_t *dummy,
ptrdiff_t stride, int h) ;
int vsse16_neon(MpegEncContext *c, const uint8_t *s1, const uint8_t *s2,
ptrdiff_t stride, int h);
+int vsse_intra16_neon(MpegEncContext *c, const uint8_t *s, const uint8_t *dummy,
+ ptrdiff_t stride, int h);
av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
{
@@ -69,5 +71,6 @@ av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
c->vsad[4] = vsad_intra16_neon;
c->vsse[0] = vsse16_neon;
+ c->vsse[4] = vsse_intra16_neon;
}
}
diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S
index ce198ea227..cf2b8da425 100644
--- a/libavcodec/aarch64/me_cmp_neon.S
+++ b/libavcodec/aarch64/me_cmp_neon.S
@@ -784,3 +784,66 @@ function vsad_intra16_neon, export=1
ret
endfunc
+
+function vsse_intra16_neon, export=1
+ // x0 unused
+ // x1 uint8_t *pix1
+ // x2 uint8_t *dummy
+ // x3 ptrdiff_t stride
+ // w4 int h
+
+ ld1 {v0.16b}, [x1], x3
+ movi v16.4s, #0
+ movi v17.4s, #0
+
+ sub w4, w4, #1 // we need to make h-1 iterations
+ cmp w4, #3
+ b.lt 2f
+
+1:
+ // v = abs( pix1[0] - pix1[0 + stride] )
+ // score = sum( v * v )
+ ld1 {v1.16b}, [x1], x3
+ ld1 {v2.16b}, [x1], x3
+ uabd v30.16b, v0.16b, v1.16b
+ ld1 {v3.16b}, [x1], x3
+ umull v29.8h, v30.8b, v30.8b
+ umull2 v28.8h, v30.16b, v30.16b
+ uabd v27.16b, v1.16b, v2.16b
+ uadalp v16.4s, v29.8h
+ umull v26.8h, v27.8b, v27.8b
+ umull2 v27.8h, v27.16b, v27.16b
+ uadalp v17.4s, v28.8h
+ uabd v25.16b, v2.16b, v3.16b
+ uadalp v16.4s, v26.8h
+ umull v24.8h, v25.8b, v25.8b
+ umull2 v25.8h, v25.16b, v25.16b
+ uadalp v17.4s, v27.8h
+ sub w4, w4, #3
+ uadalp v16.4s, v24.8h
+ cmp w4, #3
+ uadalp v17.4s, v25.8h
+ mov v0.16b, v3.16b
+
+ b.ge 1b
+ cbz w4, 3f
+
+// iterate by one
+2:
+ ld1 {v1.16b}, [x1], x3
+ subs w4, w4, #1
+ uabd v30.16b, v0.16b, v1.16b
+ mov v0.16b, v1.16b
+ umull v29.8h, v30.8b, v30.8b
+ umull2 v30.8h, v30.16b, v30.16b
+ uadalp v16.4s, v29.8h
+ uadalp v17.4s, v30.8h
+ cbnz w4, 2b
+
+3:
+ add v16.4s, v16.4s, v17.4S
+ uaddlv d17, v16.4s
+ fmov w0, s17
+
+ ret
+endfunc
--
2.34.1
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 11+ messages in thread
* [FFmpeg-devel] [PATCH 5/5] lavc/aarch64: Provide neon implementation of nsse16
2022-09-08 9:25 [FFmpeg-devel] [PATCH 0/5] Provide optimized neon implementation Hubert Mazur
` (3 preceding siblings ...)
2022-09-08 9:25 ` [FFmpeg-devel] [PATCH 4/5] lavc/aarch64: Add neon implementation for vsse_intra16 Hubert Mazur
@ 2022-09-08 9:25 ` Hubert Mazur
2022-09-09 7:32 ` [FFmpeg-devel] [PATCH 0/5] Provide optimized neon implementation Martin Storsjö
5 siblings, 0 replies; 11+ messages in thread
From: Hubert Mazur @ 2022-09-08 9:25 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: gjb, upstream, jswinney, Hubert Mazur, martin, mw, spop
Add vectorized implementation of nsse16 function.
Performance comparison tests are shown below.
- nsse_0_c: 682.2
- nsse_0_neon: 116.5
Benchmarks and tests are run with checkasm tool on AWS Graviton 3.
Co-authored-by: Martin Storsjö <martin@martin.st>
Signed-off-by: Hubert Mazur <hum@semihalf.com>
---
libavcodec/aarch64/me_cmp_init_aarch64.c | 15 +++
libavcodec/aarch64/me_cmp_neon.S | 122 +++++++++++++++++++++++
2 files changed, 137 insertions(+)
diff --git a/libavcodec/aarch64/me_cmp_init_aarch64.c b/libavcodec/aarch64/me_cmp_init_aarch64.c
index 8c295d5457..ade3e9a4c1 100644
--- a/libavcodec/aarch64/me_cmp_init_aarch64.c
+++ b/libavcodec/aarch64/me_cmp_init_aarch64.c
@@ -49,6 +49,10 @@ int vsse16_neon(MpegEncContext *c, const uint8_t *s1, const uint8_t *s2,
ptrdiff_t stride, int h);
int vsse_intra16_neon(MpegEncContext *c, const uint8_t *s, const uint8_t *dummy,
ptrdiff_t stride, int h);
+int nsse16_neon(int multiplier, const uint8_t *s, const uint8_t *s2,
+ ptrdiff_t stride, int h);
+int nsse16_neon_wrapper(MpegEncContext *c, const uint8_t *s1, const uint8_t *s2,
+ ptrdiff_t stride, int h);
av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
{
@@ -72,5 +76,16 @@ av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
c->vsse[0] = vsse16_neon;
c->vsse[4] = vsse_intra16_neon;
+
+ c->nsse[0] = nsse16_neon_wrapper;
}
}
+
+int nsse16_neon_wrapper(MpegEncContext *c, const uint8_t *s1, const uint8_t *s2,
+ ptrdiff_t stride, int h)
+{
+ if (c)
+ return nsse16_neon(c->avctx->nsse_weight, s1, s2, stride, h);
+ else
+ return nsse16_neon(8, s1, s2, stride, h);
+}
diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S
index cf2b8da425..f8998749a5 100644
--- a/libavcodec/aarch64/me_cmp_neon.S
+++ b/libavcodec/aarch64/me_cmp_neon.S
@@ -847,3 +847,125 @@ function vsse_intra16_neon, export=1
ret
endfunc
+
+function nsse16_neon, export=1
+ // x0 multiplier
+ // x1 uint8_t *pix1
+ // x2 uint8_t *pix2
+ // x3 ptrdiff_t stride
+ // w4 int h
+
+ str x0, [sp, #-0x40]!
+ stp x1, x2, [sp, #0x10]
+ stp x3, x4, [sp, #0x20]
+ str x30, [sp, #0x30]
+ bl X(sse16_neon)
+ ldr x30, [sp, #0x30]
+ mov w9, w0 // here we store score1
+ ldr x5, [sp]
+ ldp x1, x2, [sp, #0x10]
+ ldp x3, x4, [sp, #0x20]
+ add sp, sp, #0x40
+
+ movi v16.8h, #0
+ movi v17.8h, #0
+ movi v18.8h, #0
+ movi v19.8h, #0
+
+ ld1 {v0.16b}, [x1], x3
+ subs w4, w4, #1 // we need to make h-1 iterations
+ ld1 {v2.16b}, [x2], x3
+ ext v1.16b, v0.16b, v0.16b, #1 // x1 + 1
+ cmp w4, #2
+ ext v3.16b, v2.16b, v2.16b, #1 // x2 + 1
+
+ b.lt 2f
+
+// make 2 iterations at once
+1:
+ ld1 {v4.16b}, [x1], x3
+ ld1 {v6.16b}, [x2], x3
+ ld1 {v20.16b}, [x1], x3
+ ext v5.16b, v4.16b, v4.16b, #1 // x1 + stride + 1
+ usubl v31.8h, v0.8b, v4.8b
+ usubl2 v30.8h, v0.16b, v4.16b
+ ld1 {v22.16b}, [x2], x3
+ usubl v29.8h, v1.8b, v5.8b
+ usubl2 v28.8h, v1.16b, v5.16b
+ ext v7.16b, v6.16b, v6.16b, #1 // x2 + stride + 1
+ saba v16.8h, v31.8h, v29.8h
+ ext v21.16b, v20.16b, v20.16b, #1
+ saba v17.8h, v30.8h, v28.8h
+ usubl v27.8h, v2.8b, v6.8b
+ usubl2 v26.8h, v2.16b, v6.16b
+ ext v23.16b, v22.16b, v22.16b, #1
+ usubl v25.8h, v3.8b, v7.8b
+ usubl2 v24.8h, v3.16b, v7.16b
+ saba v18.8h, v27.8h, v25.8h
+ saba v19.8h, v26.8h, v24.8h
+
+ usubl v31.8h, v4.8b, v20.8b
+ usubl2 v30.8h, v4.16b, v20.16b
+ usubl v29.8h, v5.8b, v21.8b
+ usubl2 v28.8h, v5.16b, v21.16b
+ saba v16.8h, v31.8h, v29.8h
+ saba v17.8h, v30.8h, v28.8h
+ usubl v27.8h, v6.8b, v22.8b
+ usubl2 v26.8h, v6.16b, v22.16b
+ usubl v25.8h, v7.8b, v23.8b
+ usubl2 v24.8h, v7.16b, v23.16b
+ saba v18.8h, v27.8h, v25.8h
+ saba v19.8h, v26.8h, v24.8h
+ sub w4, w4, #2
+
+ mov v0.16b, v20.16b
+ mov v1.16b, v21.16b
+ cmp w4, #2
+ mov v2.16b, v22.16b
+ mov v3.16b, v23.16b
+
+ b.ge 1b
+ cbz w4, 3f
+
+// iterate by one
+2:
+ ld1 {v4.16b}, [x1], x3
+ subs w4, w4, #1
+ ld1 {v6.16b}, [x2], x3
+ ext v5.16b, v4.16b, v4.16b, #1 // x1 + stride + 1
+ usubl v31.8h, v0.8b, v4.8b
+ ext v7.16b, v6.16b, v6.16b, #1 // x2 + stride + 1
+
+ usubl2 v30.8h, v0.16b, v4.16b
+ usubl v29.8h, v1.8b, v5.8b
+ usubl2 v28.8h, v1.16b, v5.16b
+ saba v16.8h, v31.8h, v29.8h
+ saba v17.8h, v30.8h, v28.8h
+ usubl v27.8h, v2.8b, v6.8b
+ usubl2 v26.8h, v2.16b, v6.16b
+ usubl v25.8h, v3.8b, v7.8b
+ usubl2 v24.8h, v3.16b, v7.16b
+ saba v18.8h, v27.8h, v25.8h
+ saba v19.8h, v26.8h, v24.8h
+
+ mov v0.16b, v4.16b
+ mov v1.16b, v5.16b
+ mov v2.16b, v6.16b
+ mov v3.16b, v7.16b
+
+ cbnz w4, 2b
+
+3:
+ sqsub v17.8h, v17.8h, v19.8h
+ sqsub v16.8h, v16.8h, v18.8h
+ ins v17.h[7], wzr
+ sqadd v16.8h, v16.8h, v17.8h
+ saddlv s16, v16.8h
+ sqabs s16, s16
+ fmov w0, s16
+
+ mul w0, w0, w5
+ add w0, w0, w9
+
+ ret
+endfunc
--
2.34.1
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [FFmpeg-devel] [PATCH 0/5] Provide optimized neon implementation
2022-09-08 9:25 [FFmpeg-devel] [PATCH 0/5] Provide optimized neon implementation Hubert Mazur
` (4 preceding siblings ...)
2022-09-08 9:25 ` [FFmpeg-devel] [PATCH 5/5] lavc/aarch64: Provide neon implementation of nsse16 Hubert Mazur
@ 2022-09-09 7:32 ` Martin Storsjö
5 siblings, 0 replies; 11+ messages in thread
From: Martin Storsjö @ 2022-09-09 7:32 UTC (permalink / raw)
To: Hubert Mazur; +Cc: gjb, upstream, jswinney, ffmpeg-devel, mw, spop
On Thu, 8 Sep 2022, Hubert Mazur wrote:
> Fix minor issues in the patches.
> Regarding vsse16 I didn't change saba & umlal to sub & smlal.
> It doesn't affect the performance, so left it as it was.
> The majority of changes refer to nsse16:
> - fixed indentation (thanks for pointing out),
> - applied the patch from Martin which fixes the balance
> within instructions,
> - interleaved instructions - apparently this helped a little
> to achieve better benchmarks.
Thanks! I measured a small further improvement on A53 with this change;
from 377 to 370 cycles.
> I have also updated the benchmark results for each function -
> not a huge performance improvement, but worth the effort.
> For nsse and vsse are shown below (these are the biggest changes).
> - vsse16 asm from 64.7 to 59.2,
> - nsse16 asm from 120.0 to 116.5.
It's kinda surprising that the difference is so small, since we reduced
the amount of work done in the functions quite significantly (IIRC on A53,
the speedup was something like 1.5x compared with the original), but I
guess it's understandable if the Graviton 3 is so powerful, that there's
enough spare execution units so that a bunch of redundant instructions
doesn't really matter.
Anyway, this revision of the patchset looked good to me, so I pushed it
now. Thanks!
// Martin
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 11+ messages in thread
* [FFmpeg-devel] [PATCH 2/5] lavc/aarch64: Add neon implementation of vsse16
2022-09-06 10:27 Hubert Mazur
@ 2022-09-06 10:27 ` Hubert Mazur
2022-09-07 8:57 ` Martin Storsjö
0 siblings, 1 reply; 11+ messages in thread
From: Hubert Mazur @ 2022-09-06 10:27 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: gjb, upstream, jswinney, Hubert Mazur, martin, mw, spop
Provide optimized implementation of vsse16 for arm64.
Performance comparison tests are shown below.
- vsse_0_c: 254.4
- vsse_0_neon: 64.7
Benchmarks and tests are run with checkasm tool on AWS Graviton 3.
Signed-off-by: Hubert Mazur <hum@semihalf.com>
---
libavcodec/aarch64/me_cmp_init_aarch64.c | 4 ++
libavcodec/aarch64/me_cmp_neon.S | 87 ++++++++++++++++++++++++
2 files changed, 91 insertions(+)
diff --git a/libavcodec/aarch64/me_cmp_init_aarch64.c b/libavcodec/aarch64/me_cmp_init_aarch64.c
index ddc5d05611..7b81e48d16 100644
--- a/libavcodec/aarch64/me_cmp_init_aarch64.c
+++ b/libavcodec/aarch64/me_cmp_init_aarch64.c
@@ -43,6 +43,8 @@ int sse4_neon(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
int vsad16_neon(MpegEncContext *c, const uint8_t *s1, const uint8_t *s2,
ptrdiff_t stride, int h);
+int vsse16_neon(MpegEncContext *c, const uint8_t *s1, const uint8_t *s2,
+ ptrdiff_t stride, int h);
av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
{
@@ -62,5 +64,7 @@ av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
c->sse[2] = sse4_neon;
c->vsad[0] = vsad16_neon;
+
+ c->vsse[0] = vsse16_neon;
}
}
diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S
index 1d0b166d69..b3f376aa60 100644
--- a/libavcodec/aarch64/me_cmp_neon.S
+++ b/libavcodec/aarch64/me_cmp_neon.S
@@ -649,3 +649,90 @@ function vsad16_neon, export=1
ret
endfunc
+
+function vsse16_neon, export=1
+ // x0 unused
+ // x1 uint8_t *pix1
+ // x2 uint8_t *pix2
+ // x3 ptrdiff_t stride
+ // w4 int h
+
+ ld1 {v0.16b}, [x1], x3 // Load pix1[0], first iteration
+ ld1 {v1.16b}, [x2], x3 // Load pix2[0], first iteration
+
+ sub w4, w4, #1 // we need to make h-1 iterations
+ movi v16.4s, #0
+ movi v17.4s, #0
+
+ cmp w4, #3 // check if we can make 3 iterations at once
+ usubl v31.8h, v0.8b, v1.8b // Signed difference of pix1[0] - pix2[0], first iteration
+ usubl2 v30.8h, v0.16b, v1.16b // Signed difference of pix1[0] - pix2[0], first iteration
+ b.le 2f
+
+
+1:
+ // x = abs(pix1[0] - pix2[0] - pix1[0 + stride] + pix2[0 + stride])
+ // res = (x) * (x)
+ ld1 {v0.16b}, [x1], x3 // Load pix1[0 + stride], first iteration
+ ld1 {v1.16b}, [x2], x3 // Load pix2[0 + stride], first iteration
+ ld1 {v2.16b}, [x1], x3 // Load pix1[0 + stride], second iteration
+ ld1 {v3.16b}, [x2], x3 // Load pix2[0 + stride], second iteration
+ usubl v29.8h, v0.8b, v1.8b
+ usubl2 v28.8h, v0.16b, v1.16b
+ ld1 {v4.16b}, [x1], x3 // Load pix1[0 + stride], third iteration
+ ld1 {v5.16b}, [x2], x3 // Load pix1[0 + stride], third iteration
+ sabd v31.8h, v31.8h, v29.8h
+ sabd v30.8h, v30.8h, v28.8h
+ usubl v27.8h, v2.8b, v3.8b
+ usubl2 v26.8h, v2.16b, v3.16b
+ usubl v25.8h, v4.8b, v5.8b
+ usubl2 v24.8h, v4.16b, v5.16b
+ sabd v29.8h, v29.8h, v27.8h
+ sabd v27.8h, v27.8h, v25.8h
+ umlal v16.4s, v31.4h, v31.4h
+ umlal2 v17.4s, v31.8h, v31.8h
+ sabd v28.8h, v28.8h, v26.8h
+ sabd v26.8h, v26.8h, v24.8h
+ umlal v16.4s, v30.4h, v30.4h
+ umlal2 v17.4s, v30.8h, v30.8h
+ mov v31.16b, v25.16b
+ umlal v16.4s, v29.4h, v29.4h
+ umlal2 v17.4s, v29.8h, v29.8h
+ mov v30.16b, v24.16b
+ umlal v16.4s, v28.4h, v28.4h
+ umlal2 v17.4s, v28.8h, v28.8h
+ sub w4, w4, #3
+ umlal v16.4s, v27.4h, v27.4h
+ umlal2 v17.4s, v27.8h, v27.8h
+ cmp w4, #3
+ umlal v16.4s, v26.4h, v26.4h
+ umlal2 v17.4s, v26.8h, v26.8h
+
+ b.ge 1b
+
+ cbz w4, 3f
+
+// iterate by once
+2:
+ ld1 {v0.16b}, [x1], x3
+ ld1 {v1.16b}, [x2], x3
+ subs w4, w4, #1
+ usubl v29.8h, v0.8b, v1.8b
+ usubl2 v28.8h, v0.16b, v1.16b
+ sabd v31.8h, v31.8h, v29.8h
+ sabd v30.8h, v30.8h, v28.8h
+ umlal v16.4s, v31.4h, v31.4h
+ umlal2 v17.4s, v31.8h, v31.8h
+ mov v31.16b, v29.16b
+ umlal v16.4s, v30.4h, v30.4h
+ umlal2 v17.4s, v30.8h, v30.8h
+ mov v30.16b, v28.16b
+ b.ne 2b
+
+3:
+ add v16.4s, v16.4s, v17.4s
+ uaddlv d17, v16.4s
+ fmov w0, s17
+
+ ret
+endfunc
--
2.34.1
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [FFmpeg-devel] [PATCH 2/5] lavc/aarch64: Add neon implementation of vsse16
2022-09-06 10:27 ` [FFmpeg-devel] [PATCH 2/5] lavc/aarch64: Add neon implementation of vsse16 Hubert Mazur
@ 2022-09-07 8:57 ` Martin Storsjö
0 siblings, 0 replies; 11+ messages in thread
From: Martin Storsjö @ 2022-09-07 8:57 UTC (permalink / raw)
To: Hubert Mazur; +Cc: gjb, upstream, jswinney, ffmpeg-devel, mw, spop
On Tue, 6 Sep 2022, Hubert Mazur wrote:
> Provide optimized implementation of vsse16 for arm64.
>
> Performance comparison tests are shown below.
> - vsse_0_c: 254.4
> - vsse_0_neon: 64.7
>
> Benchmarks and tests are run with checkasm tool on AWS Graviton 3.
>
> Signed-off-by: Hubert Mazur <hum@semihalf.com>
> ---
> libavcodec/aarch64/me_cmp_init_aarch64.c | 4 ++
> libavcodec/aarch64/me_cmp_neon.S | 87 ++++++++++++++++++++++++
> 2 files changed, 91 insertions(+)
>
> diff --git a/libavcodec/aarch64/me_cmp_init_aarch64.c b/libavcodec/aarch64/me_cmp_init_aarch64.c
> index ddc5d05611..7b81e48d16 100644
> --- a/libavcodec/aarch64/me_cmp_init_aarch64.c
> +++ b/libavcodec/aarch64/me_cmp_init_aarch64.c
> @@ -43,6 +43,8 @@ int sse4_neon(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
>
> int vsad16_neon(MpegEncContext *c, const uint8_t *s1, const uint8_t *s2,
> ptrdiff_t stride, int h);
> +int vsse16_neon(MpegEncContext *c, const uint8_t *s1, const uint8_t *s2,
> + ptrdiff_t stride, int h);
>
> av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
> {
> @@ -62,5 +64,7 @@ av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
> c->sse[2] = sse4_neon;
>
> c->vsad[0] = vsad16_neon;
> +
> + c->vsse[0] = vsse16_neon;
> }
> }
> diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S
> index 1d0b166d69..b3f376aa60 100644
> --- a/libavcodec/aarch64/me_cmp_neon.S
> +++ b/libavcodec/aarch64/me_cmp_neon.S
> @@ -649,3 +649,90 @@ function vsad16_neon, export=1
>
> ret
> endfunc
> +
> +function vsse16_neon, export=1
> + // x0 unused
> + // x1 uint8_t *pix1
> + // x2 uint8_t *pix2
> + // x3 ptrdiff_t stride
> + // w4 int h
> +
> + ld1 {v0.16b}, [x1], x3 // Load pix1[0], first iteration
> + ld1 {v1.16b}, [x2], x3 // Load pix2[0], first iteration
> +
> + sub w4, w4, #1 // we need to make h-1 iterations
> + movi v16.4s, #0
> + movi v17.4s, #0
> +
> + cmp w4, #3 // check if we can make 3 iterations at once
> + usubl v31.8h, v0.8b, v1.8b // Signed difference of pix1[0] - pix2[0], first iteration
> + usubl2 v30.8h, v0.16b, v1.16b // Signed difference of pix1[0] - pix2[0], first iteration
> + b.le 2f
> +
> +
> +1:
> + // x = abs(pix1[0] - pix2[0] - pix1[0 + stride] + pix2[0 + stride])
> + // res = (x) * (x)
Technically, there's no need for abs() here, we can just as well just do a
plain subtraction. I tested this by replacing sabd with sub here (and
changing umlal into smlal). It doesn't make any difference for the
performance on the cores I tested on though - apparently there's no
difference in performance between sabd and sub. So in practice, both
should be fine. And I don't think that either of them is better for
handling overflows/edge cases here either (which shouldn't be happening
anyway).
// Martin
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 11+ messages in thread
* [FFmpeg-devel] [PATCH 2/5] lavc/aarch64: Add neon implementation of vsse16
2022-08-22 15:26 [FFmpeg-devel] [PATCH 0/5] me_cmp: Provide arm64 neon implementations Hubert Mazur
@ 2022-08-22 15:26 ` Hubert Mazur
2022-09-04 20:53 ` Martin Storsjö
0 siblings, 1 reply; 11+ messages in thread
From: Hubert Mazur @ 2022-08-22 15:26 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: gjb, upstream, jswinney, Hubert Mazur, martin, mw, spop
Provide optimized implementation of vsse16 for arm64.
Performance comparison tests are shown below.
- vsse_0_c: 254.4
- vsse_0_neon: 64.7
Benchmarks and tests are run with checkasm tool on AWS Graviton 3.
Signed-off-by: Hubert Mazur <hum@semihalf.com>
---
libavcodec/aarch64/me_cmp_init_aarch64.c | 4 +
libavcodec/aarch64/me_cmp_neon.S | 97 ++++++++++++++++++++++++
2 files changed, 101 insertions(+)
diff --git a/libavcodec/aarch64/me_cmp_init_aarch64.c b/libavcodec/aarch64/me_cmp_init_aarch64.c
index ddc5d05611..7b81e48d16 100644
--- a/libavcodec/aarch64/me_cmp_init_aarch64.c
+++ b/libavcodec/aarch64/me_cmp_init_aarch64.c
@@ -43,6 +43,8 @@ int sse4_neon(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
int vsad16_neon(MpegEncContext *c, const uint8_t *s1, const uint8_t *s2,
ptrdiff_t stride, int h);
+int vsse16_neon(MpegEncContext *c, const uint8_t *s1, const uint8_t *s2,
+ ptrdiff_t stride, int h);
av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
{
@@ -62,5 +64,7 @@ av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
c->sse[2] = sse4_neon;
c->vsad[0] = vsad16_neon;
+
+ c->vsse[0] = vsse16_neon;
}
}
diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S
index d4c0099854..279bae7cb5 100644
--- a/libavcodec/aarch64/me_cmp_neon.S
+++ b/libavcodec/aarch64/me_cmp_neon.S
@@ -659,3 +659,100 @@ function vsad16_neon, export=1
ret
endfunc
+
+function vsse16_neon, export=1
+ // x0 unused
+ // x1 uint8_t *pix1
+ // x2 uint8_t *pix2
+ // x3 ptrdiff_t stride
+ // w4 int h
+
+ movi v30.4s, #0
+ movi v29.4s, #0
+
+ add x5, x1, x3 // pix1 + stride
+ add x6, x2, x3 // pix2 + stride
+ sub w4, w4, #1 // we need to make h-1 iterations
> + cmp w4, #3 // check if we can make 3 iterations at once
+ b.le 2f
+
> +// make 3 iterations at once
> +1:
> + // x = pix1[0] - pix2[0] - pix1[0 + stride] + pix2[0 + stride]
> + // res = (x) * (x)
+ ld1 {v0.16b}, [x1], x3 // Load pix1[0], first iteration
+ ld1 {v1.16b}, [x2], x3 // Load pix2[0], first iteration
+ ld1 {v2.16b}, [x5], x3 // Load pix1[0 + stride], first iteration
+ usubl v28.8h, v0.8b, v1.8b // Signed difference of pix1[0] - pix2[0], first iteration
+ ld1 {v3.16b}, [x6], x3 // Load pix2[0 + stride], first iteration
+ usubl2 v27.8h, v0.16b, v1.16b // Signed difference of pix1[0] - pix2[0], first iteration
+ usubl v26.8h, v3.8b, v2.8b // Signed difference of pix1[0 + stride] - pix2[0 + stride], first iteration
+ usubl2 v25.8h, v3.16b, v2.16b // Signed difference of pix1[0 + stride] - pix2[0 + stride], first iteration
+ ld1 {v4.16b}, [x1], x3 // Load pix1[0], second iteration
+ sqadd v28.8h, v28.8h, v26.8h // Add first iteration
+ ld1 {v6.16b}, [x5], x3 // Load pix1[0 + stride], second iteration
+ sqadd v27.8h, v27.8h, v25.8h // Add first iteration
+ ld1 {v5.16b}, [x2], x3 // Load pix2[0], second iteration
+ smlal v30.4s, v28.4h, v28.4h // Multiply-accumulate first iteration
+ ld1 {v7.16b}, [x6], x3 // Load pix2[0 + stride], second iteration
+ usubl v26.8h, v4.8b, v5.8b // Signed difference of pix1[0] - pix2[0], second iteration
+ smlal2 v29.4s, v28.8h, v28.8h // Multiply-accumulate first iteration
+ usubl2 v25.8h, v4.16b, v5.16b // Signed difference of pix1[0] - pix2[0], second iteration
> + usubl v24.8h, v7.8b, v6.8b // Signed difference of pix1[0 + stride] - pix2[0 + stride], second iteration
+ smlal v30.4s, v27.4h, v27.4h // Multiply-accumulate first iteration
> + usubl2 v23.8h, v7.16b, v6.16b // Signed difference of pix1[0 + stride] - pix2[0 + stride], second iteration
+ sqadd v24.8h, v26.8h, v24.8h // Add second iteration
+ smlal2 v29.4s, v27.8h, v27.8h // Multiply-accumulate first iteration
+ sqadd v23.8h, v25.8h, v23.8h // Add second iteration
+ ld1 {v18.16b}, [x1], x3 // Load pix1[0], third iteration
+ smlal v30.4s, v24.4h, v24.4h // Multiply-accumulate second iteration
+ ld1 {v31.16b}, [x2], x3 // Load pix2[0], third iteration
+ ld1 {v17.16b}, [x5], x3 // Load pix1[0 + stride], third iteration
+ smlal2 v29.4s, v24.8h, v24.8h // Multiply-accumulate second iteration
+ ld1 {v16.16b}, [x6], x3 // Load pix2[0 + stride], third iteration
+ usubl v22.8h, v18.8b, v31.8b // Signed difference of pix1[0] - pix2[0], third iteration
+ smlal v30.4s, v23.4h, v23.4h // Multiply-accumulate second iteration
+ usubl2 v21.8h, v18.16b, v31.16b // Signed difference of pix1[0] - pix2[0], third iteration
> + usubl v20.8h, v16.8b, v17.8b // Signed difference of pix1[0 + stride] - pix2[0 + stride], third iteration
+ smlal2 v29.4s, v23.8h, v23.8h // Multiply-accumulate second iteration
+ sqadd v20.8h, v22.8h, v20.8h // Add third iteration
> + usubl2 v19.8h, v16.16b, v17.16b // Signed difference of pix1[0 + stride] - pix2[0 + stride], third iteration
+ smlal v30.4s, v20.4h, v20.4h // Multiply-accumulate third iteration
+ sqadd v19.8h, v21.8h, v19.8h // Add third iteration
+ smlal2 v29.4s, v20.8h, v20.8h // Multiply-accumulate third iteration
+ sub w4, w4, #3
+ smlal v30.4s, v19.4h, v19.4h // Multiply-accumulate third iteration
+ cmp w4, #3
+ smlal2 v29.4s, v19.8h, v19.8h // Multiply-accumulate third iteration
+
+ b.ge 1b
+
+ cbz w4, 3f
+
+// iterate by once
+2:
+ ld1 {v0.16b}, [x1], x3
+ ld1 {v1.16b}, [x2], x3
+ ld1 {v2.16b}, [x5], x3
+ usubl v28.8h, v0.8b, v1.8b
+ ld1 {v3.16b}, [x6], x3
+ usubl2 v27.8h, v0.16b, v1.16b
+ usubl v26.8h, v3.8b, v2.8b
+ usubl2 v25.8h, v3.16b, v2.16b
+ sqadd v28.8h, v28.8h, v26.8h
+ sqadd v27.8h, v27.8h, v25.8h
+ smlal v30.4s, v28.4h, v28.4h
+ smlal2 v29.4s, v28.8h, v28.8h
+ subs w4, w4, #1
+ smlal v30.4s, v27.4h, v27.4h
+ smlal2 v29.4s, v27.8h, v27.8h
+
+ b.ne 2b
+
+3:
+ add v30.4s, v30.4s, v29.4s
+ saddlv d17, v30.4s
+ fmov w0, s17
+
+ ret
+endfunc
--
2.34.1
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [FFmpeg-devel] [PATCH 2/5] lavc/aarch64: Add neon implementation of vsse16
2022-08-22 15:26 ` [FFmpeg-devel] [PATCH 2/5] lavc/aarch64: Add neon implementation of vsse16 Hubert Mazur
@ 2022-09-04 20:53 ` Martin Storsjö
0 siblings, 0 replies; 11+ messages in thread
From: Martin Storsjö @ 2022-09-04 20:53 UTC (permalink / raw)
To: Hubert Mazur; +Cc: gjb, upstream, jswinney, ffmpeg-devel, mw, spop
On Mon, 22 Aug 2022, Hubert Mazur wrote:
> Provide optimized implementation of vsse16 for arm64.
>
> Performance comparison tests are shown below.
> - vsse_0_c: 254.4
> - vsse_0_neon: 64.7
>
> Benchmarks and tests are run with checkasm tool on AWS Graviton 3.
>
> Signed-off-by: Hubert Mazur <hum@semihalf.com>
> ---
> libavcodec/aarch64/me_cmp_init_aarch64.c | 4 +
> libavcodec/aarch64/me_cmp_neon.S | 97 ++++++++++++++++++++++++
> 2 files changed, 101 insertions(+)
>
> diff --git a/libavcodec/aarch64/me_cmp_init_aarch64.c b/libavcodec/aarch64/me_cmp_init_aarch64.c
> index ddc5d05611..7b81e48d16 100644
> --- a/libavcodec/aarch64/me_cmp_init_aarch64.c
> +++ b/libavcodec/aarch64/me_cmp_init_aarch64.c
> @@ -43,6 +43,8 @@ int sse4_neon(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
>
> int vsad16_neon(MpegEncContext *c, const uint8_t *s1, const uint8_t *s2,
> ptrdiff_t stride, int h);
> +int vsse16_neon(MpegEncContext *c, const uint8_t *s1, const uint8_t *s2,
> + ptrdiff_t stride, int h);
>
> av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
> {
> @@ -62,5 +64,7 @@ av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
> c->sse[2] = sse4_neon;
>
> c->vsad[0] = vsad16_neon;
> +
> + c->vsse[0] = vsse16_neon;
> }
> }
> diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S
> index d4c0099854..279bae7cb5 100644
> --- a/libavcodec/aarch64/me_cmp_neon.S
> +++ b/libavcodec/aarch64/me_cmp_neon.S
> @@ -659,3 +659,100 @@ function vsad16_neon, export=1
>
> ret
> endfunc
> +
> +function vsse16_neon, export=1
> + // x0 unused
> + // x1 uint8_t *pix1
> + // x2 uint8_t *pix2
> + // x3 ptrdiff_t stride
> + // w4 int h
> +
> + movi v30.4s, #0
> + movi v29.4s, #0
> +
> + add x5, x1, x3 // pix1 + stride
> + add x6, x2, x3 // pix2 + stride
> + sub w4, w4, #1 // we need to make h-1 iterations
> + cmp w4, #3 // check if we can make 4 iterations at once
> + b.le 2f
> +
> +// make 4 iterations at once
The comments seem to talk about 4 iterations at once while the code
actually only does 3.
> +1:
> + // x = abs(pix1[0] - pix2[0] - pix1[0 + stride] + pix2[0 + stride]) =
The comment seems a bit un-updated here, since there's no abs() involved
here
> + // res = (x) * (x)
> + ld1 {v0.16b}, [x1], x3 // Load pix1[0], first iteration
> + ld1 {v1.16b}, [x2], x3 // Load pix2[0], first iteration
> + ld1 {v2.16b}, [x5], x3 // Load pix1[0 + stride], first iteration
> + usubl v28.8h, v0.8b, v1.8b // Signed difference of pix1[0] - pix2[0], first iteration
> + ld1 {v3.16b}, [x6], x3 // Load pix2[0 + stride], first iteration
> + usubl2 v27.8h, v0.16b, v1.16b // Signed difference of pix1[0] - pix2[0], first iteration
> + usubl v26.8h, v3.8b, v2.8b // Signed difference of pix1[0 + stride] - pix2[0 + stride], first iteration
> + usubl2 v25.8h, v3.16b, v2.16b // Signed difference of pix1[0 + stride] - pix2[0 + stride], first iteration
Same thing about reusing data from the previous row, as for the previous
patch.
// Martin
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 11+ messages in thread