* [FFmpeg-devel] [PATCH 0/3] Provide neon implementations
@ 2022-09-20 11:01 Hubert Mazur
2022-09-20 11:01 ` [FFmpeg-devel] [PATCH 1/3] lavc/aarch64: Add neon implementation for pix_median_abs16 Hubert Mazur
` (3 more replies)
0 siblings, 4 replies; 5+ messages in thread
From: Hubert Mazur @ 2022-09-20 11:01 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: gjb, upstream, jswinney, Hubert Mazur, martin, mw, spop
This fixes issues addressed in previous patchset:
- move sub instruction in vsad8_intra,
- remove unnecessary mov instructions,
- remove single lane extraction in loop and place it at the end.
Removing mov instructions from pix_median_abs functions significantly
increased performance for both.
Hubert Mazur (3):
lavc/aarch64: Add neon implementation for pix_median_abs16
lavc/aarch64: Add neon implementation for vsad8_intra
lavc/aarch64: Add neon implementation for pix_median_abs8
libavcodec/aarch64/me_cmp_init_aarch64.c | 10 ++
libavcodec/aarch64/me_cmp_neon.S | 182 +++++++++++++++++++++++
libavcodec/me_cmp.c | 5 +-
3 files changed, 195 insertions(+), 2 deletions(-)
--
2.34.1
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 5+ messages in thread
* [FFmpeg-devel] [PATCH 1/3] lavc/aarch64: Add neon implementation for pix_median_abs16
2022-09-20 11:01 [FFmpeg-devel] [PATCH 0/3] Provide neon implementations Hubert Mazur
@ 2022-09-20 11:01 ` Hubert Mazur
2022-09-20 11:01 ` [FFmpeg-devel] [PATCH 2/3] lavc/aarch64: Add neon implementation for vsad8_intra Hubert Mazur
` (2 subsequent siblings)
3 siblings, 0 replies; 5+ messages in thread
From: Hubert Mazur @ 2022-09-20 11:01 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: gjb, upstream, jswinney, Hubert Mazur, martin, mw, spop
Provide optimized implementation for pix_median_abs16 function.
Performance comparison tests are shown below.
- median_sad_0_c: 720.5
- median_sad_0_neon: 127.2
Benchmarks and tests run with checkasm tool on AWS Graviton 3.
Signed-off-by: Hubert Mazur <hum@semihalf.com>
---
libavcodec/aarch64/me_cmp_init_aarch64.c | 4 ++
libavcodec/aarch64/me_cmp_neon.S | 78 ++++++++++++++++++++++++
libavcodec/me_cmp.c | 5 +-
3 files changed, 85 insertions(+), 2 deletions(-)
diff --git a/libavcodec/aarch64/me_cmp_init_aarch64.c b/libavcodec/aarch64/me_cmp_init_aarch64.c
index ade3e9a4c1..fb51a833be 100644
--- a/libavcodec/aarch64/me_cmp_init_aarch64.c
+++ b/libavcodec/aarch64/me_cmp_init_aarch64.c
@@ -53,6 +53,8 @@ int nsse16_neon(int multiplier, const uint8_t *s, const uint8_t *s2,
ptrdiff_t stride, int h);
int nsse16_neon_wrapper(MpegEncContext *c, const uint8_t *s1, const uint8_t *s2,
ptrdiff_t stride, int h);
+int pix_median_abs16_neon(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
+ ptrdiff_t stride, int h);
av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
{
@@ -78,6 +80,8 @@ av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
c->vsse[4] = vsse_intra16_neon;
c->nsse[0] = nsse16_neon_wrapper;
+
+ c->median_sad[0] = pix_median_abs16_neon;
}
}
diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S
index f8998749a5..1e08d04869 100644
--- a/libavcodec/aarch64/me_cmp_neon.S
+++ b/libavcodec/aarch64/me_cmp_neon.S
@@ -969,3 +969,81 @@ function nsse16_neon, export=1
ret
endfunc
+
+function pix_median_abs16_neon, export=1
+ // x0 unused
+ // x1 uint8_t *pix1
+ // x2 uint8_t *pix2
+ // x3 ptrdiff_t stride
+ // w4 int h
+ // Result (w0): sum of |V - prediction| over the 16-wide block, with V = pix1 - pix2
+ ld1 {v2.16b}, [x1], x3 // first row of pix1, then advance x1 by stride
+ ld1 {v3.16b}, [x2], x3 // first row of pix2, then advance x2 by stride
+ movi v31.8h, #0 // accumulator for lanes 0-7
+ movi v16.8h, #0 // accumulator for lanes 8-15
+ ext v0.16b, v2.16b, v2.16b, #1 // pix1 row rotated by one byte (last lane wraps around to byte 0)
+ ext v1.16b, v3.16b, v3.16b, #1 // pix2 row rotated by one byte (last lane wraps around to byte 0)
+ usubl v28.8h, v2.8b, v3.8b // V(j-1) of the first row, low half
+ usubl2 v27.8h, v2.16b, v3.16b // V(j-1) of the first row, high half
+ usubl v26.8h, v0.8b, v1.8b // V(j) of the first row, low half
+ usubl2 v25.8h, v0.16b, v1.16b // V(j) of the first row, high half
+ sub w4, w4, #1 // we need to make h-1 iterations
+ saba v31.8h, v26.8h, v28.8h // first row: accumulate |V(j) - V(j-1)|, low half
+ saba v16.8h, v25.8h, v27.8h // first row: accumulate |V(j) - V(j-1)|, high half
+ mov h18, v28.h[0] // V of the very first pixel
+ cmp w4, #1 // flags are consumed by the b.lt below
+ sqabs h18, h18 // |V| of the very first pixel
+ movi v0.8h, #0 // accumulator for row-to-row differences; only lane 0 is read after the loop
+
+ b.lt 2f // fewer than one remaining row: skip the loop
+1:
+
+ ld1 {v6.16b}, [x1], x3 // pix1 vector for V(j-1)
+ ld1 {v7.16b}, [x2], x3 // pix2 vector for V(j-1)
+ subs w4, w4, #1 // flags are consumed by the b.ne at the end of the loop
+ ext v4.16b, v6.16b, v6.16b, #1 // pix1 vector for V(j)
+ ext v5.16b, v7.16b, v7.16b, #1 // pix2 vector for V(j)
+
+ // protected registers: v30, v29, v28, v27, v26, v25, v24, v23
+ // scratch registers: v22, v21, v20, v19, v17
+
+ // Prediction = median of a = previous row's V(j), b = V(j-1) and c = a + b - previous row's V(j-1),
+ // computed as max(min(a, b), min(max(a, b), c)).
+ usubl v30.8h, v6.8b, v7.8b // V(j-1)
+ usubl2 v29.8h, v6.16b, v7.16b // V(j-1)
+ usubl v24.8h, v4.8b, v5.8b // V(j)
+ usubl2 v23.8h, v4.16b, v5.16b // V(j)
+ saba v0.8h, v30.8h, v28.8h // accumulate |V(j-1) - previous row's V(j-1)|
+ add v22.8h, v26.8h, v30.8h // a + b, low half
+ smin v20.8h, v26.8h, v30.8h // min(a, b), low half
+ add v21.8h, v25.8h, v29.8h // a + b, high half
+ smax v19.8h, v26.8h, v30.8h // max(a, b), low half
+ sub v22.8h, v22.8h, v28.8h // c, low half
+ sub v21.8h, v21.8h, v27.8h // c, high half
+ smin v17.8h, v19.8h, v22.8h // min(max(a, b), c), low half
+ smin v22.8h, v25.8h, v29.8h // min(a, b), high half (v22 is reused; c low half was consumed above)
+ mov v28.16b, v30.16b // keep this row's V(j-1), low half, for the next iteration
+ smax v20.8h, v20.8h, v17.8h // median values lower half
+ smax v19.8h, v25.8h, v29.8h // max(a, b), high half
+ saba v31.8h, v24.8h, v20.8h // accumulate |V(j) - median|, low half
+ mov v27.16b, v29.16b // keep this row's V(j-1), high half, for the next iteration
+ smin v19.8h, v19.8h, v21.8h // min(max(a, b), c), high half
+ mov v26.16b, v24.16b // keep this row's V(j), low half, for the next iteration
+ smax v17.8h, v22.8h, v19.8h // median values upper half
+ mov v25.16b, v23.16b // keep this row's V(j), high half, for the next iteration
+ saba v16.8h, v23.8h, v17.8h // accumulate |V(j) - median|, high half
+
+ b.ne 1b
+
+2:
+ mov h17, v0.h[0] // lane 0 of the row-to-row accumulator (first column)
+ ins v16.h[7], wzr // clear the last lane, which was built from the wrapped-around byte
+ add d18, d18, d17 // first-pixel term + first-column term
+ add v31.8h, v31.8h, v16.8h // merge low-half and high-half accumulators
+ uaddlv s17, v31.8h // horizontal sum of the eight lanes
+ add d18, d18, d17 // add the horizontal sum to the total
+ fmov w0, s18 // return value
+
+ ret
+
+endfunc
diff --git a/libavcodec/me_cmp.c b/libavcodec/me_cmp.c
index 4242fbc6e4..e2f9f84b05 100644
--- a/libavcodec/me_cmp.c
+++ b/libavcodec/me_cmp.c
@@ -1048,6 +1048,9 @@ av_cold void ff_me_cmp_init(MECmpContext *c, AVCodecContext *avctx)
ff_dsputil_init_dwt(c);
#endif
+ c->median_sad[0] = pix_median_abs16_c;
+ c->median_sad[1] = pix_median_abs8_c;
+
#if ARCH_AARCH64
ff_me_cmp_init_aarch64(c, avctx);
#elif ARCH_ALPHA
@@ -1062,6 +1065,4 @@ av_cold void ff_me_cmp_init(MECmpContext *c, AVCodecContext *avctx)
ff_me_cmp_init_mips(c, avctx);
#endif
- c->median_sad[0] = pix_median_abs16_c;
- c->median_sad[1] = pix_median_abs8_c;
}
--
2.34.1
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 5+ messages in thread
* [FFmpeg-devel] [PATCH 2/3] lavc/aarch64: Add neon implementation for vsad8_intra
2022-09-20 11:01 [FFmpeg-devel] [PATCH 0/3] Provide neon implementations Hubert Mazur
2022-09-20 11:01 ` [FFmpeg-devel] [PATCH 1/3] lavc/aarch64: Add neon implementation for pix_median_abs16 Hubert Mazur
@ 2022-09-20 11:01 ` Hubert Mazur
2022-09-20 11:01 ` [FFmpeg-devel] [PATCH 3/3] lavc/aarch64: Add neon implementation for pix_median_abs8 Hubert Mazur
2022-09-21 10:11 ` [FFmpeg-devel] [PATCH 0/3] Provide neon implementations Martin Storsjö
3 siblings, 0 replies; 5+ messages in thread
From: Hubert Mazur @ 2022-09-20 11:01 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: gjb, upstream, jswinney, Hubert Mazur, martin, mw, spop
Provide optimized implementation for vsad8_intra function.
Performance comparison tests are shown below.
- vsad_5_c: 94.7
- vsad_5_neon: 20.7
Benchmarks and tests run with checkasm tool on AWS Graviton 3.
Signed-off-by: Hubert Mazur <hum@semihalf.com>
---
libavcodec/aarch64/me_cmp_init_aarch64.c | 3 ++
libavcodec/aarch64/me_cmp_neon.S | 42 ++++++++++++++++++++++++
2 files changed, 45 insertions(+)
diff --git a/libavcodec/aarch64/me_cmp_init_aarch64.c b/libavcodec/aarch64/me_cmp_init_aarch64.c
index fb51a833be..d3fa047a86 100644
--- a/libavcodec/aarch64/me_cmp_init_aarch64.c
+++ b/libavcodec/aarch64/me_cmp_init_aarch64.c
@@ -45,6 +45,8 @@ int vsad16_neon(MpegEncContext *c, const uint8_t *s1, const uint8_t *s2,
ptrdiff_t stride, int h);
int vsad_intra16_neon(MpegEncContext *c, const uint8_t *s, const uint8_t *dummy,
ptrdiff_t stride, int h) ;
+int vsad_intra8_neon(MpegEncContext *c, const uint8_t *s, const uint8_t *dummy,
+ ptrdiff_t stride, int h) ;
int vsse16_neon(MpegEncContext *c, const uint8_t *s1, const uint8_t *s2,
ptrdiff_t stride, int h);
int vsse_intra16_neon(MpegEncContext *c, const uint8_t *s, const uint8_t *dummy,
@@ -75,6 +77,7 @@ av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
c->vsad[0] = vsad16_neon;
c->vsad[4] = vsad_intra16_neon;
+ c->vsad[5] = vsad_intra8_neon;
c->vsse[0] = vsse16_neon;
c->vsse[4] = vsse_intra16_neon;
diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S
index 1e08d04869..1587f2ea43 100644
--- a/libavcodec/aarch64/me_cmp_neon.S
+++ b/libavcodec/aarch64/me_cmp_neon.S
@@ -1047,3 +1047,45 @@ function pix_median_abs16_neon, export=1
ret
endfunc
+
+function vsad_intra8_neon, export=1
+ // x0 unused
+ // x1 uint8_t *pix1
+ // x2 uint8_t *dummy
+ // x3 ptrdiff_t stride
+ // w4 int h
+ // Result (w0): sum over 8 columns of |row - next row| for the h-1 consecutive row pairs
+ ld1 {v0.8b}, [x1], x3 // first row, then advance x1 by stride
+ sub w4, w4, #1 // we need to make h-1 iterations
+ cmp w4, #3 // flags are consumed by the b.lt below
+ movi v16.8h, #0 // accumulator
+ b.lt 2f // fewer than 3 row pairs left; NOTE(review): with h == 1, w4 is 0 here and loop 2 decrements before testing - presumably callers pass h >= 2, confirm
+
+1:
+ // v = abs( pix1[0] - pix1[0 + stride] )
+ // score = sum(v)
+ ld1 {v1.8b}, [x1], x3 // three rows are consumed per iteration of this loop
+ sub w4, w4, #3
+ ld1 {v2.8b}, [x1], x3
+ uabal v16.8h, v0.8b, v1.8b // accumulate |previous row - row 1|
+ ld1 {v3.8b}, [x1], x3
+ uabal v16.8h, v1.8b, v2.8b // accumulate |row 1 - row 2|
+ cmp w4, #3 // flags are consumed by the b.ge below
+ mov v0.8b, v3.8b // row 3 becomes the previous row for the next iteration
+ uabal v16.8h, v2.8b, v3.8b // accumulate |row 2 - row 3|
+ b.ge 1b
+ cbz w4, 3f // nothing left for the single-row loop
+
+2:
+ ld1 {v1.8b}, [x1], x3 // one row per iteration for the remaining row pairs
+ subs w4, w4, #1
+ uabal v16.8h, v0.8b, v1.8b // accumulate |previous row - current row|
+ mov v0.8b, v1.8b // current row becomes the previous row
+ cbnz w4, 2b
+
+3:
+ uaddlv s17, v16.8h // horizontal sum of the eight lanes
+ fmov w0, s17 // return value
+
+ ret
+endfunc
--
2.34.1
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 5+ messages in thread
* [FFmpeg-devel] [PATCH 3/3] lavc/aarch64: Add neon implementation for pix_median_abs8
2022-09-20 11:01 [FFmpeg-devel] [PATCH 0/3] Provide neon implementations Hubert Mazur
2022-09-20 11:01 ` [FFmpeg-devel] [PATCH 1/3] lavc/aarch64: Add neon implementation for pix_median_abs16 Hubert Mazur
2022-09-20 11:01 ` [FFmpeg-devel] [PATCH 2/3] lavc/aarch64: Add neon implementation for vsad8_intra Hubert Mazur
@ 2022-09-20 11:01 ` Hubert Mazur
2022-09-21 10:11 ` [FFmpeg-devel] [PATCH 0/3] Provide neon implementations Martin Storsjö
3 siblings, 0 replies; 5+ messages in thread
From: Hubert Mazur @ 2022-09-20 11:01 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: gjb, upstream, jswinney, Hubert Mazur, martin, mw, spop
Provide optimized implementation for pix_median_abs8 function.
Performance comparison tests are shown below.
- median_sad_1_c: 277.0
- median_sad_1_neon: 82.0
Benchmarks and tests run with checkasm tool on AWS Graviton 3.
Signed-off-by: Hubert Mazur <hum@semihalf.com>
---
libavcodec/aarch64/me_cmp_init_aarch64.c | 3 ++
libavcodec/aarch64/me_cmp_neon.S | 62 ++++++++++++++++++++++++
2 files changed, 65 insertions(+)
diff --git a/libavcodec/aarch64/me_cmp_init_aarch64.c b/libavcodec/aarch64/me_cmp_init_aarch64.c
index d3fa047a86..e143f0816e 100644
--- a/libavcodec/aarch64/me_cmp_init_aarch64.c
+++ b/libavcodec/aarch64/me_cmp_init_aarch64.c
@@ -57,6 +57,8 @@ int nsse16_neon_wrapper(MpegEncContext *c, const uint8_t *s1, const uint8_t *s2,
ptrdiff_t stride, int h);
int pix_median_abs16_neon(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
ptrdiff_t stride, int h);
+int pix_median_abs8_neon(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
+ ptrdiff_t stride, int h);
av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
{
@@ -85,6 +87,7 @@ av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
c->nsse[0] = nsse16_neon_wrapper;
c->median_sad[0] = pix_median_abs16_neon;
+ c->median_sad[1] = pix_median_abs8_neon;
}
}
diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S
index 1587f2ea43..11af4849f9 100644
--- a/libavcodec/aarch64/me_cmp_neon.S
+++ b/libavcodec/aarch64/me_cmp_neon.S
@@ -1089,3 +1089,65 @@ function vsad_intra8_neon, export=1
ret
endfunc
+
+function pix_median_abs8_neon, export=1
+ // x0 unused
+ // x1 uint8_t *pix1
+ // x2 uint8_t *pix2
+ // x3 ptrdiff_t stride
+ // w4 int h
+ // Result (w0): sum of |V - prediction| over the 8-wide block, with V = pix1 - pix2
+ ld1 {v2.8b}, [x1], x3 // first row of pix1, then advance x1 by stride
+ ld1 {v3.8b}, [x2], x3 // first row of pix2, then advance x2 by stride
+ movi v31.8h, #0 // accumulator
+ ext v0.8b, v2.8b, v2.8b, #1 // pix1 row rotated by one byte (last lane wraps around to byte 0)
+ ext v1.8b, v3.8b, v3.8b, #1 // pix2 row rotated by one byte (last lane wraps around to byte 0)
+ usubl v28.8h, v2.8b, v3.8b // V(j-1) of the first row
+ usubl v26.8h, v0.8b, v1.8b // V(j) of the first row
+ sub w4, w4, #1 // we need to make h-1 iterations
+ saba v31.8h, v26.8h, v28.8h // first row: accumulate |V(j) - V(j-1)|
+ mov h18, v28.h[0] // V of the very first pixel
+ cmp w4, #1 // flags are consumed by the b.lt below
+ sqabs h18, h18 // |V| of the very first pixel
+ movi v0.8h, #0 // accumulator for row-to-row differences; only lane 0 is read after the loop
+
+ b.lt 2f // fewer than one remaining row: skip the loop
+1:
+ ld1 {v6.8b}, [x1], x3 // pix1 vector for V(j-1)
+ ld1 {v7.8b}, [x2], x3 // pix2 vector for V(j-1)
+ subs w4, w4, #1 // flags are consumed by the b.ne at the end of the loop
+ ext v4.8b, v6.8b, v6.8b, #1 // pix1 vector for V(j)
+ ext v5.8b, v7.8b, v7.8b, #1 // pix2 vector for V(j)
+
+ // protected registers: v30, v28, v26, v24 (v29, v27, v25, v23 are not written in this function)
+ // scratch registers: v22, v20, v19, v17
+
+ // Prediction = median of a = previous row's V(j), b = V(j-1) and c = a + b - previous row's V(j-1),
+ // computed as max(min(a, b), min(max(a, b), c)).
+ usubl v30.8h, v6.8b, v7.8b // V(j-1)
+ usubl v24.8h, v4.8b, v5.8b // V(j)
+ saba v0.8h, v30.8h, v28.8h // accumulate |V(j-1) - previous row's V(j-1)|
+ add v22.8h, v26.8h, v30.8h // a + b
+ smin v20.8h, v26.8h, v30.8h // min(a, b)
+ smax v19.8h, v26.8h, v30.8h // max(a, b)
+ sub v22.8h, v22.8h, v28.8h // c
+ smin v17.8h, v19.8h, v22.8h // min(max(a, b), c)
+ mov v28.16b, v30.16b // keep this row's V(j-1) for the next iteration
+ smax v20.8h, v20.8h, v17.8h // median values lower half
+ smax v19.8h, v25.8h, v29.8h // NOTE(review): v25/v29 are never written in this function; v19 is only read by the smax two lines below
+ saba v31.8h, v24.8h, v20.8h // accumulate |V(j) - median|
+ mov v26.16b, v24.16b // keep this row's V(j) for the next iteration
+ smax v17.8h, v22.8h, v19.8h // NOTE(review): result is never read (v17 is overwritten before its next use); looks like a leftover from the 16-pixel version
+
+ b.ne 1b
+2:
+ mov h17, v0.h[0] // lane 0 of the row-to-row accumulator (first column)
+ ins v31.h[7], wzr // clear the last lane, which was built from the wrapped-around byte
+ add d18, d18, d17 // first-pixel term + first-column term
+ uaddlv s17, v31.8h // horizontal sum of the eight lanes
+ add d18, d18, d17 // add the horizontal sum to the total
+ fmov w0, s18 // return value
+
+ ret
+
+endfunc
--
2.34.1
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 5+ messages in thread
* Re: [FFmpeg-devel] [PATCH 0/3] Provide neon implementations
2022-09-20 11:01 [FFmpeg-devel] [PATCH 0/3] Provide neon implementations Hubert Mazur
` (2 preceding siblings ...)
2022-09-20 11:01 ` [FFmpeg-devel] [PATCH 3/3] lavc/aarch64: Add neon implementation for pix_median_abs8 Hubert Mazur
@ 2022-09-21 10:11 ` Martin Storsjö
3 siblings, 0 replies; 5+ messages in thread
From: Martin Storsjö @ 2022-09-21 10:11 UTC (permalink / raw)
To: Hubert Mazur; +Cc: gjb, upstream, jswinney, ffmpeg-devel, mw, spop
On Tue, 20 Sep 2022, Hubert Mazur wrote:
> This fixes issues addressed in previous patchset:
> - move sub instruction in vsad8_intra,
> - remove unnecessary mov instructions,
> - remove single lane extraction in loop and place it at the end.
>
> Removing mov instructions from pix_median_abs functions significantly
> increased peformance for both.
I'm quite sure that it wasn't the removed mov instructions that improved
performance (those instructions should be essentially free, they're just
misleading), but the fact that you got rid of the extra single-element
handling within the loop.
Anyway, the patches seem fine to me now, so I'll push them. Thanks!
// Martin
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 5+ messages in thread
end of thread, other threads:[~2022-09-21 10:12 UTC | newest]
Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-09-20 11:01 [FFmpeg-devel] [PATCH 0/3] Provide neon implementations Hubert Mazur
2022-09-20 11:01 ` [FFmpeg-devel] [PATCH 1/3] lavc/aarch64: Add neon implementation for pix_median_abs16 Hubert Mazur
2022-09-20 11:01 ` [FFmpeg-devel] [PATCH 2/3] lavc/aarch64: Add neon implementation for vsad8_intra Hubert Mazur
2022-09-20 11:01 ` [FFmpeg-devel] [PATCH 3/3] lavc/aarch64: Add neon implementation for pix_median_abs8 Hubert Mazur
2022-09-21 10:11 ` [FFmpeg-devel] [PATCH 0/3] Provide neon implementations Martin Storsjö
Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
This inbox may be cloned and mirrored by anyone:
git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git
# If you have public-inbox 1.1+ installed, you may
# initialize and index your mirror using the following commands:
public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
ffmpegdev@gitmailbox.com
public-inbox-index ffmpegdev
Example config snippet for mirrors.
AGPL code for this site: git clone https://public-inbox.org/public-inbox.git