From: "Swinney, Jonathan" <jswinney@amazon.com>
To: Hubert Mazur <hum@semihalf.com>,
"ffmpeg-devel@ffmpeg.org" <ffmpeg-devel@ffmpeg.org>
Cc: "martin@martin.st" <martin@martin.st>,
"mw@semihalf.com" <mw@semihalf.com>,
"upstream@semihalf.com" <upstream@semihalf.com>,
"Pop, Sebastian" <spop@amazon.com>,
"gjb@semihalf.com" <gjb@semihalf.com>
Subject: Re: [FFmpeg-devel] [PATCH] lavc/aarch64: Add neon implementation for sse4
Date: Thu, 28 Jul 2022 18:51:35 +0000
Message-ID: <A5902B05-2979-4306-9444-445AE2385F90@amazon.com> (raw)
In-Reply-To: <F44E1864-A982-4C7F-B617-B15601715F13@amazon.com>
> There is one more place to move the sub, cmp and branch instructions apart in sse16_neon. It doesn't seem to make any difference to Neoverse N1 and V1 and it may help A53.
Sorry -- I meant sse4_neon, not sse16_neon.
--
Jonathan Swinney
On 7/28/22, 1:50 PM, "Swinney, Jonathan" <jswinney@amazon.com> wrote:
Your latest set of patches didn’t get interpreted correctly by the patchwork tool. I suspect it took them in the wrong order.
https://patchwork.ffmpeg.org/project/ffmpeg/list/?series=&submitter=1479&state=&q=&archive=&delegate=
There is one more place to move the sub, cmp and branch instructions apart in sse16_neon. It doesn't seem to make any difference to Neoverse N1 and V1 and it may help A53.
I didn't see anything else.
Thanks!
--
Jonathan Swinney
On 7/25/22, 6:16 AM, "Hubert Mazur" <hum@semihalf.com> wrote:
CAUTION: This email originated from outside of the organization. Do not click links or open attachments unless you can confirm the sender and know the content is safe.
Provide neon implementation for sse4 function.
Performance comparison tests are shown below.
- sse_2_c: 74.0
- sse_2_neon: 24.0
Benchmarks and tests were run with the checkasm tool on AWS Graviton 3.
Signed-off-by: Hubert Mazur <hum@semihalf.com>
---
libavcodec/aarch64/me_cmp_init_aarch64.c | 3 ++
libavcodec/aarch64/me_cmp_neon.S | 65 ++++++++++++++++++++++++
2 files changed, 68 insertions(+)
diff --git a/libavcodec/aarch64/me_cmp_init_aarch64.c b/libavcodec/aarch64/me_cmp_init_aarch64.c
index 3ff5767bd0..72a2062e7e 100644
--- a/libavcodec/aarch64/me_cmp_init_aarch64.c
+++ b/libavcodec/aarch64/me_cmp_init_aarch64.c
@@ -32,6 +32,8 @@ int ff_pix_abs16_x2_neon(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
int sse16_neon(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
ptrdiff_t stride, int h);
+int sse4_neon(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+ ptrdiff_t stride, int h);
av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
{
@@ -44,5 +46,6 @@ av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
c->sad[0] = ff_pix_abs16_neon;
c->sse[0] = sse16_neon;
+ c->sse[2] = sse4_neon;
}
}
diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S
index 98c912b608..3336d88848 100644
--- a/libavcodec/aarch64/me_cmp_neon.S
+++ b/libavcodec/aarch64/me_cmp_neon.S
@@ -352,3 +352,68 @@ function sse16_neon, export=1
ret
endfunc
+
+function sse4_neon, export=1
+        // Sum of squared differences of a 4 pixel wide, h rows high block.
+        // x0 - unused
+        // x1 - pix1
+        // x2 - pix2
+        // x3 - stride
+        // w4 - h (assumed >= 0; h == 0 returns 0)
+        // Returns the sum in w0.
+
+        movi            d18, #0                         // d18 = running sum
+        cmp             w4, #4
+        b.lt            2f                              // h < 4: only the row-by-row tail
+
+// make 4 iterations at once
+1:
+        // res = abs(pix1[0] - pix2[0])
+        // res * res
+        // Each ld1 fills only lane s[0], so just the low 4 halfwords of the
+        // uabdl results are meaningful; umull/umlal read exactly those.
+
+        ld1             {v0.s}[0], [x1], x3             // Load pix1, first iteration
+        ld1             {v1.s}[0], [x2], x3             // Load pix2, first iteration
+        uabdl           v30.8h, v0.8b, v1.8b            // Absolute difference, first iteration
+        ld1             {v2.s}[0], [x1], x3             // Load pix1, second iteration
+        ld1             {v3.s}[0], [x2], x3             // Load pix2, second iteration
+        umull           v16.4s, v30.4h, v30.4h          // Multiply vectors, first iteration
+        uabdl           v29.8h, v2.8b, v3.8b            // Absolute difference, second iteration
+        ld1             {v4.s}[0], [x1], x3             // Load pix1, third iteration
+        ld1             {v5.s}[0], [x2], x3             // Load pix2, third iteration
+        sub             w4, w4, #4                      // Update h early, away from cmp and branch
+        umlal           v16.4s, v29.4h, v29.4h          // Multiply and accumulate, second iteration
+        uabdl           v28.8h, v4.8b, v5.8b            // Absolute difference, third iteration
+        ld1             {v6.s}[0], [x1], x3             // Load pix1, fourth iteration
+        ld1             {v7.s}[0], [x2], x3             // Load pix2, fourth iteration
+        umlal           v16.4s, v28.4h, v28.4h          // Multiply and accumulate, third iteration
+        uabdl           v27.8h, v6.8b, v7.8b            // Absolute difference, fourth iteration
+        cmp             w4, #4                          // Flags survive the NEON ops below
+        umlal           v16.4s, v27.4h, v27.4h          // Multiply and accumulate, fourth iteration
+
+        uaddlv          d17, v16.4s                     // Add vector
+        add             d18, d18, d17                   // Fold the 4 rows into the running sum
+
+        b.ge            1b                              // At least 4 rows left
+
+// iterate by one over the 0-3 remaining rows
+2:
+        cbz             w4, 4f                          // Nothing left (also covers h == 0)
+3:
+        ld1             {v0.s}[0], [x1], x3             // Load pix1
+        ld1             {v1.s}[0], [x2], x3             // Load pix2
+        uabdl           v30.8h, v0.8b, v1.8b            // Absolute difference
+        umull           v16.4s, v30.4h, v30.4h          // Square the differences
+
+        uaddlv          d17, v16.4s                     // Add vector
+        add             d18, d18, d17                   // Fold the row into the running sum
+
+        subs            w4, w4, #1
+        b.ne            3b
+
+4:
+        fmov            w0, s18                         // Low 32 bits of the sum are the result
+        ret
+
+endfunc
--
2.34.1
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
next prev parent reply other threads:[~2022-07-28 18:51 UTC|newest]
Thread overview: 23+ messages / expand[flat|nested] mbox.gz Atom feed top
2022-07-15 8:02 [FFmpeg-devel] [PATCH 0/5] Add neon implementation for me_cmp functions Hubert Mazur
2022-07-15 8:02 ` [FFmpeg-devel] [PATCH 1/5] lavc/aarch64: Add neon implementation for sse16 Hubert Mazur
2022-07-25 11:12 ` [FFmpeg-devel] [PATCH] " Hubert Mazur
2022-08-03 13:22 ` Martin Storsjö
2022-08-04 7:46 ` Martin Storsjö
2022-07-15 8:02 ` [FFmpeg-devel] [PATCH 2/5] lavc/aarch64: Add neon implementation for sse4 Hubert Mazur
2022-07-21 21:43 ` Martin Storsjö
2022-07-22 21:30 ` Swinney, Jonathan
2022-07-25 11:15 ` [FFmpeg-devel] [PATCH] " Hubert Mazur
2022-07-28 18:50 ` Swinney, Jonathan
2022-07-28 18:51 ` Swinney, Jonathan [this message]
2022-07-29 7:26 ` Hubert Mazur
2022-08-04 8:00 ` Martin Storsjö
2022-07-15 8:02 ` [FFmpeg-devel] [PATCH 3/5] lavc/aarch64: Add neon implementation for pix_abs16_y2 Hubert Mazur
2022-07-25 11:17 ` [FFmpeg-devel] [PATCH] " Hubert Mazur
2022-08-04 8:08 ` Martin Storsjö
2022-08-04 8:12 ` Martin Storsjö
2022-07-15 8:02 ` [FFmpeg-devel] [PATCH 4/5] lavc/aarch64: Add neon implementation for sse8 Hubert Mazur
2022-07-25 11:18 ` [FFmpeg-devel] [PATCH] " Hubert Mazur
2022-08-04 8:04 ` Martin Storsjö
2022-07-15 8:02 ` [FFmpeg-devel] [PATCH 5/5] lavc/aarch64: Add neon implementation for pix_abs8 Hubert Mazur
2022-07-25 11:21 ` [FFmpeg-devel] [PATCH] " Hubert Mazur
2022-08-04 8:10 ` Martin Storsjö
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=A5902B05-2979-4306-9444-445AE2385F90@amazon.com \
--to=jswinney@amazon.com \
--cc=ffmpeg-devel@ffmpeg.org \
--cc=gjb@semihalf.com \
--cc=hum@semihalf.com \
--cc=martin@martin.st \
--cc=mw@semihalf.com \
--cc=spop@amazon.com \
--cc=upstream@semihalf.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
This inbox may be cloned and mirrored by anyone:
git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git
# If you have public-inbox 1.1+ installed, you may
# initialize and index your mirror using the following commands:
public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
ffmpegdev@gitmailbox.com
public-inbox-index ffmpegdev
Example config snippet for mirrors.
AGPL code for this site: git clone https://public-inbox.org/public-inbox.git