* [FFmpeg-devel] [PATCH] lavc/aarch64: add some neon pix_abs functions
@ 2022-03-04 22:52 Swinney, Jonathan
From: Swinney, Jonathan @ 2022-03-04 22:52 UTC
To: ffmpeg-devel; +Cc: Martin Storsjö, Clément Bœsch, Pop, Sebastian
- ff_pix_abs16_neon
- ff_pix_abs16_xy2_neon
In direct microbenchmarks of these ff functions versus their C implementations,
these functions performed as follows on AWS Graviton 2:
ff_pix_abs16_neon:
c: benchmark ran 100000 iterations in 0.955383 seconds
ff: benchmark ran 100000 iterations in 0.097669 seconds
ff_pix_abs16_xy2_neon:
c: benchmark ran 100000 iterations in 1.916759 seconds
ff: benchmark ran 100000 iterations in 0.414291 seconds
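For reference, the two routines compute the following (a condensed C sketch of the
scalar paths in libavcodec/me_cmp.c; the real functions are fully unrolled across
the 16 columns and take an MpegEncContext pointer, which they ignore; the _ref
names here are illustrative):

#include <stdint.h>
#include <stdlib.h>
#include <stddef.h>

/* pix_abs16: sum of absolute differences over a 16-wide block of height h. */
static int pix_abs16_ref(const uint8_t *pix1, const uint8_t *pix2,
                         ptrdiff_t stride, int h)
{
    int s = 0;
    for (int i = 0; i < h; i++) {
        for (int j = 0; j < 16; j++)
            s += abs(pix1[j] - pix2[j]);
        pix1 += stride;
        pix2 += stride;
    }
    return s;
}

/* pix_abs16_xy2: SAD against the rounded average of the four half-pel
 * neighbours of pix2 (pix2[j], pix2[j+1], and the same pair one row down). */
static int pix_abs16_xy2_ref(const uint8_t *pix1, const uint8_t *pix2,
                             ptrdiff_t stride, int h)
{
    const uint8_t *pix3 = pix2 + stride;
    int s = 0;
    for (int i = 0; i < h; i++) {
        for (int j = 0; j < 16; j++)
            s += abs(pix1[j] - ((pix2[j] + pix2[j + 1] +
                                 pix3[j] + pix3[j + 1] + 2) >> 2));
        pix1 += stride;
        pix2 += stride;
        pix3 += stride;
    }
    return s;
}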
Signed-off-by: Jonathan Swinney <jswinney@amazon.com>
---
libavcodec/aarch64/Makefile | 2 +
libavcodec/aarch64/me_cmp_init_aarch64.c | 39 ++++
libavcodec/aarch64/me_cmp_neon.S | 230 +++++++++++++++++++++++
libavcodec/me_cmp.c | 2 +
libavcodec/me_cmp.h | 1 +
5 files changed, 274 insertions(+)
create mode 100644 libavcodec/aarch64/me_cmp_init_aarch64.c
create mode 100644 libavcodec/aarch64/me_cmp_neon.S
diff --git a/libavcodec/aarch64/Makefile b/libavcodec/aarch64/Makefile
index 954461f81d..18869da1b4 100644
--- a/libavcodec/aarch64/Makefile
+++ b/libavcodec/aarch64/Makefile
@@ -7,6 +7,7 @@ OBJS-$(CONFIG_H264PRED) += aarch64/h264pred_init.o
OBJS-$(CONFIG_H264QPEL) += aarch64/h264qpel_init_aarch64.o
OBJS-$(CONFIG_HPELDSP) += aarch64/hpeldsp_init_aarch64.o
OBJS-$(CONFIG_IDCTDSP) += aarch64/idctdsp_init_aarch64.o
+OBJS-$(CONFIG_ME_CMP) += aarch64/me_cmp_init_aarch64.o
OBJS-$(CONFIG_MPEGAUDIODSP) += aarch64/mpegaudiodsp_init.o
OBJS-$(CONFIG_NEON_CLOBBER_TEST) += aarch64/neontest.o
OBJS-$(CONFIG_PIXBLOCKDSP) += aarch64/pixblockdsp_init_aarch64.o
@@ -46,6 +47,7 @@ NEON-OBJS-$(CONFIG_H264QPEL) += aarch64/h264qpel_neon.o \
NEON-OBJS-$(CONFIG_HPELDSP) += aarch64/hpeldsp_neon.o
NEON-OBJS-$(CONFIG_IDCTDSP) += aarch64/simple_idct_neon.o
NEON-OBJS-$(CONFIG_MDCT) += aarch64/mdct_neon.o
+NEON-OBJS-$(CONFIG_ME_CMP) += aarch64/me_cmp_neon.o
NEON-OBJS-$(CONFIG_MPEGAUDIODSP) += aarch64/mpegaudiodsp_neon.o
NEON-OBJS-$(CONFIG_PIXBLOCKDSP) += aarch64/pixblockdsp_neon.o
NEON-OBJS-$(CONFIG_VP8DSP) += aarch64/vp8dsp_neon.o
diff --git a/libavcodec/aarch64/me_cmp_init_aarch64.c b/libavcodec/aarch64/me_cmp_init_aarch64.c
new file mode 100644
index 0000000000..fb827daaf5
--- /dev/null
+++ b/libavcodec/aarch64/me_cmp_init_aarch64.c
@@ -0,0 +1,39 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "config.h"
+#include "libavutil/attributes.h"
+#include "libavutil/aarch64/cpu.h"
+#include "libavcodec/mpegvideo.h"
+
+int ff_pix_abs16_neon(MpegEncContext *s, uint8_t *blk1, uint8_t *blk2,
+ ptrdiff_t stride, int h);
+int ff_pix_abs16_xy2_neon(MpegEncContext *s, uint8_t *blk1, uint8_t *blk2,
+ ptrdiff_t stride, int h);
+
+av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
+{
+ int cpu_flags = av_get_cpu_flags();
+
+ if (have_neon(cpu_flags)) {
+ c->pix_abs[0][0] = ff_pix_abs16_neon;
+ c->pix_abs[0][3] = ff_pix_abs16_xy2_neon;
+ }
+}
diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S
new file mode 100644
index 0000000000..5422061ab1
--- /dev/null
+++ b/libavcodec/aarch64/me_cmp_neon.S
@@ -0,0 +1,230 @@
+/*
+ * Copyright (c) 2022 Jonathan Swinney <jswinney@amazon.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+function ff_pix_abs16_neon, export=1
+ // x0 unused
+ // x1 uint8_t *pix1
+ // x2 uint8_t *pix2
+ // x3 ptrdiff_t stride
+ // w4 int h
+ // x5 uint8_t *pix3
+ cmp w4, #4 // if h < 4, jump to completion section
+ b.lt 2f
+ movi v18.4S, #0 // clear result accumulator
+1:
+ movi v16.8H, #0 // clear uabal accumulator
+ ld1 {v0.16B}, [x1], x3 // load pix1
+ ld1 {v4.16B}, [x2], x3 // load pix2
+ ld1 {v1.16B}, [x1], x3 // load pix1
+ ld1 {v5.16B}, [x2], x3 // load pix2
+ uabal v16.8H, v0.8B, v4.8B // absolute difference accumulate
+ uabal2 v16.8H, v0.16B, v4.16B
+ ld1 {v2.16B}, [x1], x3 // load pix1
+ ld1 {v6.16B}, [x2], x3 // load pix2
+ uabal v16.8H, v1.8B, v5.8B // absolute difference accumulate
+ uabal2 v16.8H, v1.16B, v5.16B
+ ld1 {v3.16B}, [x1], x3
+ ld1 {v7.16B}, [x2], x3
+ uabal v16.8H, v2.8B, v6.8B
+ uabal2 v16.8H, v2.16B, v6.16B
+ sub w4, w4, #4 // h -= 4
+ uabal v16.8H, v3.8B, v7.8B
+ uabal2 v16.8H, v3.16B, v7.16B
+ cmp w4, #4 // if h >= 4, loop
+ addv h17, v16.8H // add up everything in v16 accumulator
+ add d18, d17, d18 // add to the end result register
+
+ b.ge 1b
+ cbnz w4, 2f // if iterations remain, jump to completion section
+
+ mov w0, v18.S[0] // copy result to general purpose register
+ ret
+
+2:
+ movi v16.8H, #0 // clear the uabal accumulator
+ ld1 {v0.16B}, [x1] // load pix1
+ ld1 {v4.16B}, [x2] // load pix2
+ add x1, x1, x3 // increment pointers
+ add x2, x2, x3
+ uabal v16.8H, v0.8B, v4.8B // absolute difference accumulate
+ uabal2 v16.8H, v0.16B, v4.16B
+ addv h17, v16.8H // add up v16
+ add d18, d17, d18 // add to result
+ subs w4, w4, #1 // h -= 1
+ b.ne 2b
+
+ mov w0, v18.S[0] // copy result to general purpose register
+ ret
+endfunc
+
+function ff_pix_abs16_xy2_neon, export=1
+ // x0 unused
+ // x1 uint8_t *pix1
+ // x2 uint8_t *pix2
+ // x3 ptrdiff_t stride
+ // w4 int h
+ // x5 uint8_t *pix3
+ add x5, x2, x3 // create a pointer for pix3
+ movi v0.2D, #0 // initialize the result register
+
+ // I also tested these instructions to get pix2+1 from pix2, but it wasn't faster
+ // than just doing another full (unaligned) load.
+ // ldr b21, [x5, #16]
+ // ushr v4.2D, v2.2D, #8
+ // mov v4.16B[15], v21.16B[0]
+ // mov v4.16B[7], v2.16B[8]
+
+ // Load initial pix2 values for either the unrolled version or the completion version.
+ ldr q4, [x2, #1] // load pix2+1
+ ldr q2, [x2] // load pix2
+ cmp w4, #4 // if h < 4 jump to the completion version
+ b.lt 2f
+1:
+ // This is an unrolled implementation. It completes 4 iterations of the C loop for each branch.
+ // In each iteration, pix2[i+1] == pix3[i]. This means we need only three loads per iteration,
+ // plus two at the beginning to start.
+ ldr q5, [x5, #1] // load pix3+1
+ ld1 {v3.16B}, [x5], x3 // load pix3
+ ld1 {v1.16B}, [x1], x3 // load pix1
+
+ ldr q16, [x5, #1] // load pix3+1
+ ld1 {v7.16B}, [x5], x3 // load pix3
+ ld1 {v6.16B}, [x1], x3 // load pix1
+
+ ldr q19, [x5, #1] // load pix3+1
+ ld1 {v18.16B}, [x5], x3 // load pix3
+ ld1 {v17.16B}, [x1], x3 // load pix1
+
+ ldr q22, [x5, #1] // load pix3+1
+ ld1 {v21.16B}, [x5], x3 // load pix3
+ ld1 {v20.16B}, [x1], x3 // load pix1
+
+ // These blocks compute the average: avg(pix2[n], pix2[n+1], pix3[n], pix3[n+1])
+ uaddl v23.8H, v2.8B, v4.8B // pix2 + pix2+1 0..7
+ uaddl2 v24.8H, v2.16B, v4.16B // pix2 + pix2+1 8..15
+ uaddl v30.8H, v3.8B, v5.8B // pix3 + pix3+1 0..7
+ uaddl2 v31.8H, v3.16B, v5.16B // pix3 + pix3+1 8..15
+ add v23.8H, v23.8H, v30.8H // add up 0..7
+ add v24.8H, v24.8H, v31.8H // add up 8..15
+ urshr v23.8H, v23.8H, #2 // shift right 2 0..7 (rounding shift right)
+ urshr v24.8H, v24.8H, #2 // shift right 2 8..15
+
+ uaddl v26.8H, v3.8B, v5.8B // pix2 + pix2+1 0..7
+ uaddl2 v27.8H, v3.16B, v5.16B // pix2 + pix2+1 8..15
+ uaddl v2.8H, v7.8B, v16.8B // pix3 + pix3+1 0..7
+ uaddl2 v4.8H, v7.16B, v16.16B // pix3 + pix3+1 8..15
+ add v26.8H, v26.8H, v2.8H // add up 0..7
+ add v27.8H, v27.8H, v4.8H // add up 8..15
+ urshr v26.8H, v26.8H, #2 // shift right 2 0..7 (rounding shift right)
+ urshr v27.8H, v27.8H, #2 // shift right 2 8..15
+
+ uaddl v28.8H, v7.8B, v16.8B // pix2 + pix2+1 0..7
+ uaddl2 v29.8H, v7.16B, v16.16B // pix2 + pix2+1 8..15
+ uaddl v3.8H, v18.8B, v19.8B // pix3 + pix3+1 0..7
+ uaddl2 v5.8H, v18.16B, v19.16B // pix3 + pix3+1 8..15
+ add v28.8H, v28.8H, v3.8H // add up 0..7
+ add v29.8H, v29.8H, v5.8H // add up 8..15
+ urshr v28.8H, v28.8H, #2 // shift right 2 0..7 (rounding shift right)
+ urshr v29.8H, v29.8H, #2 // shift right 2 8..15
+
+ uaddl v30.8H, v18.8B, v19.8B // pix2 + pix2+1 0..7
+ uaddl2 v31.8H, v18.16B, v19.16B // pix2 + pix2+1 8..15
+ uaddl v2.8H, v21.8B, v22.8B // pix3 + pix3+1 0..7
+ uaddl2 v4.8H, v21.16B, v22.16B // pix3 + pix3+1 8..15
+ add v30.8H, v30.8H, v2.8H // add up 0..7
+ add v31.8H, v31.8H, v4.8H // add up 8..15
+ urshr v30.8H, v30.8H, #2 // shift right 2 0..7 (rounding shift right)
+ urshr v31.8H, v31.8H, #2 // shift right 2 8..15
+
+ // Averages are now stored in these registers:
+ // v23, v24
+ // v26, v27
+ // v28, v29
+ // v30, v31
+ // pix1 values in these registers:
+ // v1, v6, v17, v20
+ // available
+ // v2, v3, v4, v5, v7, v16, v18, v19, v25
+
+ uxtl2 v2.8H, v1.16B // 8->16 bits pix1 8..15
+ uxtl v1.8H, v1.8B // 8->16 bits pix1 0..7
+ uxtl2 v7.8H, v6.16B // 8->16 bits pix1 8..15
+ uxtl v6.8H, v6.8B // 8->16 bits pix1 0..7
+ uxtl2 v18.8H, v17.16B // 8->16 bits pix1 8..15
+ uxtl v17.8H, v17.8B // 8->16 bits pix1 0..7
+ uxtl2 v25.8H, v20.16B // 8->16 bits pix1 8..15
+ uxtl v20.8H, v20.8B // 8->16 bits pix1 0..7
+
+ uabd v5.8H, v1.8H, v23.8H // absolute difference 0..7
+ uaba v5.8H, v2.8H, v24.8H // absolute difference accumulate 8..15
+ uaba v5.8H, v6.8H, v26.8H // absolute difference accumulate 0..7
+ uaba v5.8H, v7.8H, v27.8H // absolute difference accumulate 8..15
+ uaba v5.8H, v17.8H, v28.8H // absolute difference accumulate 0..7
+ uaba v5.8H, v18.8H, v29.8H // absolute difference accumulate 8..15
+ uaba v5.8H, v20.8H, v30.8H // absolute difference accumulate 0..7
+ uaba v5.8H, v25.8H, v31.8H // absolute difference accumulate 8..15
+
+ uaddlv s5, v5.8H // add up accumulated values
+ add d0, d0, d5 // add to final result
+
+ mov v2.16B, v21.16B // pix3 -> pix2
+ mov v4.16B, v22.16B // pix3+1 -> pix2+1
+
+ sub w4, w4, #4 // h -= 4
+ cmp w4, #4 // loop if h >= 4
+ b.ge 1b
+ cbnz w4, 2f // if iterations remain jump to completion section
+
+ mov w0, v0.s[0] // copy result to general purpose register
+ ret
+2:
+ // q2 and q4 are set either at the end of this loop or from the unrolled version
+ // which branches here to complete iterations when h % 4 != 0.
+ ldr q5, [x5, #1] // load pix3+1
+ ld1 {v3.16B}, [x5], x3 // load pix3
+ ld1 {v1.16B}, [x1], x3 // load pix1
+ subs w4, w4, #1 // decrement h and set flags for branch below
+
+ uaddl v16.8H, v2.8B, v4.8B // pix2 + pix2+1 0..7
+ uaddl2 v17.8H, v2.16B, v4.16B // pix2 + pix2+1 8..15
+ uaddl v18.8H, v3.8B, v5.8B // pix3 + pix3+1 0..7
+ uaddl2 v19.8H, v3.16B, v5.16B // pix3 + pix3+1 8..15
+ add v16.8H, v16.8H, v18.8H // add up 0..7
+ add v17.8H, v17.8H, v19.8H // add up 8..15
+ // divide by 4 to compute the average of values summed above
+ urshr v16.8H, v16.8H, #2 // shift right by 2 0..7 (rounding shift right)
+ urshr v17.8H, v17.8H, #2 // shift right by 2 8..15
+
+ uxtl2 v8.8H, v1.16B // 8->16 bits pix1 8..15
+ uxtl v1.8H, v1.8B // 8->16 bits pix1 0..7
+
+ uabd v6.8H, v1.8H, v16.8H // absolute difference 0..7
+ uaba v6.8H, v8.8H, v17.8H // absolute difference accumulate 8..15
+ addv h6, v6.8H // add up accumulator in v6
+ add d0, d0, d6
+
+ mov v2.16B, v3.16B // pix3 -> pix2
+ mov v4.16B, v5.16B // pix3+1 -> pix2+1
+
+ b.ne 2b // branch based on subs instruction above
+ mov w0, v0.s[0] // copy result to general purpose register
+ ret
+endfunc
diff --git a/libavcodec/me_cmp.c b/libavcodec/me_cmp.c
index b2f87d2e1b..60053a1b92 100644
--- a/libavcodec/me_cmp.c
+++ b/libavcodec/me_cmp.c
@@ -1064,6 +1064,8 @@ av_cold void ff_me_cmp_init(MECmpContext *c, AVCodecContext *avctx)
ff_me_cmp_init_alpha(c, avctx);
if (ARCH_ARM)
ff_me_cmp_init_arm(c, avctx);
+ if (ARCH_AARCH64)
+ ff_me_cmp_init_aarch64(c, avctx);
if (ARCH_PPC)
ff_me_cmp_init_ppc(c, avctx);
if (ARCH_X86)
diff --git a/libavcodec/me_cmp.h b/libavcodec/me_cmp.h
index e9b5161c9a..4dd059223d 100644
--- a/libavcodec/me_cmp.h
+++ b/libavcodec/me_cmp.h
@@ -82,6 +82,7 @@ typedef struct MECmpContext {
void ff_me_cmp_init(MECmpContext *c, AVCodecContext *avctx);
void ff_me_cmp_init_alpha(MECmpContext *c, AVCodecContext *avctx);
void ff_me_cmp_init_arm(MECmpContext *c, AVCodecContext *avctx);
+void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx);
void ff_me_cmp_init_ppc(MECmpContext *c, AVCodecContext *avctx);
void ff_me_cmp_init_x86(MECmpContext *c, AVCodecContext *avctx);
void ff_me_cmp_init_mips(MECmpContext *c, AVCodecContext *avctx);
--
2.32.0
* Re: [FFmpeg-devel] [PATCH] lavc/aarch64: add some neon pix_abs functions
From: Martin Storsjö @ 2022-03-14 22:39 UTC
To: Pop, Sebastian; +Cc: Swinney, Jonathan, Clément Bœsch, ffmpeg-devel
On Mon, 7 Mar 2022, Pop, Sebastian wrote:
> Here are a few suggestions:
>
>> + add d18, d17, d18 // add to the end result register
>> [...]
>> + mov w0, v18.S[0] // copy result to general purpose register
>
> I think you can use 32-bit register s18 instead of d18.
> The mov with indexed vector is more expensive than fmov.
Oh, I hadn't considered that. In a tight loop, I can indeed measure quite a
significant difference between those.
> add s18, s18, s17
> fmov w0, s18
>
>> + subs w4, w4, #1 // decrement h and set flags for branch below
>> [...]
>> + b.ne 2b // branch based on subs instruction above
>
> Please avoid the flags register to branch.
> Instead you could do:
>
> sub w4, w4, #1
> cbnz w4, 2b
If there are other instructions between the sub and the b.ne, does this
make any difference? (In most cases one can move the decrement into a
suitable gap early in the loop anyway.) I.e. if the flags register has
already been set long before, naively I'd expect that b.ne would be faster (or at
least not slower) than cbnz.
Some benchmarking on Cortex A53, A72 and A73 seems to agree with my
expectations too. (It'd be good if we had this patch hooked up in checkasm,
so that we could measure and compare exactly the functions at hand.)
// Martin
* Re: [FFmpeg-devel] [PATCH] lavc/aarch64: add some neon pix_abs functions
From: Martin Storsjö @ 2022-03-14 22:28 UTC
To: Swinney, Jonathan; +Cc: Clément Bœsch, Pop, Sebastian, ffmpeg-devel
On Mon, 7 Mar 2022, Swinney, Jonathan wrote:
> - ff_pix_abs16_neon
> - ff_pix_abs16_xy2_neon
>
> In direct microbenchmarks of these ff functions versus their C implementations,
> these functions performed as follows on AWS Graviton 2:
>
> ff_pix_abs16_neon:
> c: benchmark ran 100000 iterations in 0.955383 seconds
> ff: benchmark ran 100000 iterations in 0.097669 seconds
>
> ff_pix_abs16_xy2_neon:
> c: benchmark ran 100000 iterations in 1.916759 seconds
> ff: benchmark ran 100000 iterations in 0.370729 seconds
I see that there's no checkasm tests for these functions - would you mind
adding one? (There's something kind of like a checkasm test in
libavcodec/tests/motion.c, but that one doesn't seem to be updated for
contemporary SIMD instruction sets.)
Adding a checkasm test is important as it tests for a bunch of aspects
that otherwise can seem to work by accident (like missing zeroing/sign
extension of the upper half of registers, clobbering callee saved
registers, etc). For functions of this size, it's not hard to verify such
aspects of course, but I pretty much want to have checkasm coverage for
all newly added assembly. (Plus that checkasm gives built-in benchmarking
support for the functions.)
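As a rough sketch (assuming the usual checkasm macros — check_func, declare_func,
call_ref/call_new, bench_new — and with buffer sizes and wiring that are purely
illustrative), such a test could look something like:

#include <stdint.h>
#include "checkasm.h"
#include "libavcodec/me_cmp.h"
#include "libavutil/mem_internal.h"

#define WIDTH  64
#define HEIGHT 64

void checkasm_check_motion(void)
{
    LOCAL_ALIGNED_32(uint8_t, pix1, [WIDTH * HEIGHT]);
    LOCAL_ALIGNED_32(uint8_t, pix2, [WIDTH * HEIGHT]);
    AVCodecContext avctx = { 0 };
    MECmpContext c = { 0 };

    ff_me_cmp_init(&c, &avctx);

    for (int i = 0; i < WIDTH * HEIGHT; i++) {
        pix1[i] = rnd();
        pix2[i] = rnd();
    }

    if (check_func(c.pix_abs[0][0], "pix_abs16")) {
        declare_func(int, MpegEncContext *, uint8_t *, uint8_t *, ptrdiff_t, int);
        // the context argument is unused, so NULL is fine; pix2 + 1 keeps the
        // second pointer unaligned, and h < 4 exercises the completion path
        for (int h = 1; h <= 16; h++) {
            int ref = call_ref(NULL, pix1, pix2 + 1, WIDTH, h);
            int new = call_new(NULL, pix1, pix2 + 1, WIDTH, h);
            if (ref != new)
                fail();
        }
        bench_new(NULL, pix1, pix2 + 1, WIDTH, 16);
    }
    report("pix_abs16");
}

The same block, repeated for pix_abs[0][3] / "pix_abs16_xy2", would cover the
second function, and bench_new would give per-function timings directly.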
> diff --git a/libavcodec/aarch64/me_cmp_init_aarch64.c b/libavcodec/aarch64/me_cmp_init_aarch64.c
> new file mode 100644
> index 0000000000..fb827daaf5
> --- /dev/null
> +++ b/libavcodec/aarch64/me_cmp_init_aarch64.c
> @@ -0,0 +1,39 @@
> +/*
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#include <stdint.h>
> +
> +#include "config.h"
> +#include "libavutil/attributes.h"
> +#include "libavutil/aarch64/cpu.h"
> +#include "libavcodec/mpegvideo.h"
> +
> +int ff_pix_abs16_neon(MpegEncContext *s, uint8_t *blk1, uint8_t *blk2,
> + ptrdiff_t stride, int h);
> +int ff_pix_abs16_xy2_neon(MpegEncContext *s, uint8_t *blk1, uint8_t *blk2,
> + ptrdiff_t stride, int h);
The second line seems misindented for both functions
> diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S
> new file mode 100644
> index 0000000000..85b0e4bd9e
> --- /dev/null
> +++ b/libavcodec/aarch64/me_cmp_neon.S
> @@ -0,0 +1,219 @@
> +/*
> + * Copyright (c) 2022 Jonathan Swinney <jswinney@amazon.com>
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#include "libavutil/aarch64/asm.S"
> +function ff_pix_abs16_neon, export=1
Nit: Empty line between the include and the function
> + // x0 unused
> + // x1 uint8_t *pix1
> + // x2 uint8_t *pix2
> + // x3 ptrdiff_t stride
> + // w4 int h
> + // x5 uint8_t *pix3
> + cmp w4, #4 // if h < 4, jump to completion section
Please indent the assembly according to the existing code; 8 spaces before
the instruction column, 24 chars before the first operand.
> + b.lt 2f
> + movi v18.4S, #0 // clear result accumulator
Nit: I prefer lower case for the element specifier (.4s)
> +1:
> + movi v16.8H, #0 // clear uabal accumulator
> + ld1 {v0.16B}, [x1], x3 // load pix1
> + ld1 {v4.16B}, [x2], x3 // load pix2
> + ld1 {v1.16B}, [x1], x3 // load pix1
> + ld1 {v5.16B}, [x2], x3 // load pix2
> + uabal v16.8H, v0.8B, v4.8B // absolute difference accumulate
> + uabal2 v16.8H, v0.16B, v4.16B
> + ld1 {v2.16B}, [x1], x3 // load pix1
> + ld1 {v6.16B}, [x2], x3 // load pix2
> + uabal v16.8H, v1.8B, v5.8B // absolute difference accumulate
> + uabal2 v16.8H, v1.16B, v5.16B
> + ld1 {v3.16B}, [x1], x3
> + ld1 {v7.16B}, [x2], x3
> + uabal v16.8H, v2.8B, v6.8B
> + uabal2 v16.8H, v2.16B, v6.16B
> + sub w4, w4, #4 // h -= 4
> + uabal v16.8H, v3.8B, v7.8B
> + uabal2 v16.8H, v3.16B, v7.16B
> + cmp w4, #4 // if h >= 4, loop
> + addv h17, v16.8H // add up everything in v16 accumulator
> + add d18, d17, d18 // add to the end result register
Would it be beneficial to not do the addv here on each iteration, but
accumulate in v18.8h, and then just do one single addv at the end?
> +
> + b.ge 1b
> + cbnz w4, 2f // if iterations remain, jump to completion section
> +
> + mov w0, v18.S[0] // copy result to general purpose register
> + ret
> +
> +2:
> + movi v16.8H, #0 // clear the uabal accumulator
> + ld1 {v0.16B}, [x1] // load pix1
> + ld1 {v4.16B}, [x2] // load pix2
> + add x1, x1, x3 // increment pointers
> + add x2, x2, x3
Why not use the regular post-increment on the ld1, like above?
> + uabal v16.8H, v0.8B, v4.8B // absolute difference accumulate
> + uabal2 v16.8H, v0.16B, v4.16B
> + addv h17, v16.8H // add up v16
> + add d18, d17, d18 // add to result
If we got here via the "b.lt 2f" at the start of the function, d18 is
uninitialized here. (A proper checkasm test would test heights < 4.)
> + subs w4, w4, #1 // h -= 1
> + b.ne 2b
> +
> + mov w0, v18.S[0] // copy result to general purpose register
> + ret
> +endfunc
> +
> +function ff_pix_abs16_xy2_neon, export=1
> + // x0 unused
> + // x1 uint8_t *pix1
> + // x2 uint8_t *pix2
> + // x3 ptrdiff_t stride
> + // w4 int h
> + // x5 uint8_t *pix3
> + add x5, x2, x3 // create a pointer for pix3
> + movi v0.2D, #0 // initialize the result register
> +
> + // I also tested these instructions to get pix2+1 from pix2, but it wasn't faster
> + // than just doing another full (unaligned) load.
> + // ldr b21, [x5, #16]
> + // ushr v4.2D, v2.2D, #8
> + // mov v4.16B[15], v21.16B[0]
> + // mov v4.16B[7], v2.16B[8]
Indeed, this version would most certainly be slower. If we could be ok with
doing a bit of overread, the simplest version might be to load e.g. "ld1
{v4.16b, v5.16b}, [x2]" followed by "ext v5.16b, v4.16b, v5.16b, #1" to
shift it. But doing an overlapping unaligned load probably is fine too.
> +
> + // Load initial pix2 values for either the unrolled version or the completion version.
> + ldr q4, [x2, #1] // load pix2+1
> + ldr q3, [x2] // load pix2
> + uaddl v2.8H, v3.8B, v4.8B // pix2 + pix2+1 0..7
> + uaddl2 v4.8H, v3.16B, v4.16B // pix2 + pix2+1 8..15
> + cmp w4, #4 // if h < 4 jump to the completion version
> + b.lt 2f
> +1:
> + // This is an unrolled implementation. It completes 4 iterations of the C loop for each branch.
> + // In each iteration, pix2[i+1] == pix3[i]. This means we need only three loads per iteration,
> + // plus two at the beginning to start.
> + ldr q5, [x5, #1] // load pix3+1
> + ld1 {v3.16B}, [x5], x3 // load pix3
> + ld1 {v1.16B}, [x1], x3 // load pix1
> +
> + ldr q16, [x5, #1] // load pix3+1
> + ld1 {v7.16B}, [x5], x3 // load pix3
> + ld1 {v6.16B}, [x1], x3 // load pix1
> +
> + ldr q19, [x5, #1] // load pix3+1
> + ld1 {v18.16B}, [x5], x3 // load pix3
> + ld1 {v17.16B}, [x1], x3 // load pix1
> +
> + ldr q22, [x5, #1] // load pix3+1
> + ld1 {v21.16B}, [x5], x3 // load pix3
> + ld1 {v20.16B}, [x1], x3 // load pix1
> +
> + // These blocks compute the average: avg(pix2[n], pix2[n+1], pix3[n], pix3[n+1])
> + uaddl v30.8H, v3.8B, v5.8B // pix3 + pix3+1 0..7
> + uaddl2 v31.8H, v3.16B, v5.16B // pix3 + pix3+1 8..15
> + add v23.8H, v2.8H, v30.8H // add up 0..7, using pix2 + pix2+1 values from previous iteration
> + add v24.8H, v4.8H, v31.8H // add up 8..15, using pix2 + pix2+1 values from previous iteration
> + urshr v23.8H, v23.8H, #2 // shift right 2 0..7 (rounding shift right)
> + urshr v24.8H, v24.8H, #2 // shift right 2 8..15
> +
> + uaddl v2.8H, v7.8B, v16.8B // pix3 + pix3+1 0..7
> + uaddl2 v4.8H, v7.16B, v16.16B // pix3 + pix3+1 8..15
> + add v26.8H, v30.8H, v2.8H // add up 0..7, using pix2 + pix2+1 values from pix3 above
> + add v27.8H, v31.8H, v4.8H // add up 8..15, using pix2 + pix2+1 values from pix3 above
> + urshr v26.8H, v26.8H, #2 // shift right 2 0..7 (rounding shift right)
> + urshr v27.8H, v27.8H, #2 // shift right 2 8..15
> +
> + uaddl v3.8H, v18.8B, v19.8B // pix3 + pix3+1 0..7
> + uaddl2 v5.8H, v18.16B, v19.16B // pix3 + pix3+1 8..15
> + add v28.8H, v2.8H, v3.8H // add up 0..7, using pix2 + pix2+1 values from pix3 above
> + add v29.8H, v4.8H, v5.8H // add up 8..15, using pix2 + pix2+1 values from pix3 above
> + urshr v28.8H, v28.8H, #2 // shift right 2 0..7 (rounding shift right)
> + urshr v29.8H, v29.8H, #2 // shift right 2 8..15
> +
> + uaddl v2.8H, v21.8B, v22.8B // pix3 + pix3+1 0..7
> + uaddl2 v4.8H, v21.16B, v22.16B // pix3 + pix3+1 8..15
> + add v30.8H, v3.8H, v2.8H // add up 0..7, using pix2 + pix2+1 values from pix3 above
> + add v31.8H, v5.8H, v4.8H // add up 8..15, using pix2 + pix2+1 values from pix3 above
> + urshr v30.8H, v30.8H, #2 // shift right 2 0..7 (rounding shift right)
> + urshr v31.8H, v31.8H, #2 // shift right 2 8..15
> +
> + // Averages are now stored in these registers:
> + // v23, v24
> + // v26, v27
> + // v28, v29
> + // v30, v31
> + // pix1 values in these registers:
> + // v1, v6, v17, v20
> + // available
> + // v3, v5, v7, v16, v18, v19, v25
> +
> + uxtl2 v3.8H, v1.16B // 8->16 bits pix1 8..15
> + uxtl v1.8H, v1.8B // 8->16 bits pix1 0..7
> + uxtl2 v7.8H, v6.16B // 8->16 bits pix1 8..15
> + uxtl v6.8H, v6.8B // 8->16 bits pix1 0..7
> + uxtl2 v18.8H, v17.16B // 8->16 bits pix1 8..15
> + uxtl v17.8H, v17.8B // 8->16 bits pix1 0..7
> + uxtl2 v25.8H, v20.16B // 8->16 bits pix1 8..15
> + uxtl v20.8H, v20.8B // 8->16 bits pix1 0..7
> +
> + uabd v5.8H, v1.8H, v23.8H // absolute difference 0..7
> + uaba v5.8H, v3.8H, v24.8H // absolute difference accumulate 8..15
> + uaba v5.8H, v6.8H, v26.8H // absolute difference accumulate 0..7
> + uaba v5.8H, v7.8H, v27.8H // absolute difference accumulate 8..15
> + uaba v5.8H, v17.8H, v28.8H // absolute difference accumulate 0..7
> + uaba v5.8H, v18.8H, v29.8H // absolute difference accumulate 8..15
> + uaba v5.8H, v20.8H, v30.8H // absolute difference accumulate 0..7
> + uaba v5.8H, v25.8H, v31.8H // absolute difference accumulate 8..15
> +
> + uaddlv s5, v5.8H // add up accumulated values
> + add d0, d0, d5 // add to final result
Same thing as above; try whether deferring the horizontal addv until the
end is helpful. Also, for in-order cores, it'd be good to avoid direct
chaining of dependencies like this - you could move e.g. the "sub" from
below between the uaddlv/add pair. But if you avoid uaddlv altogether,
it's more straightforward (then it's probably best to place the sub before
the accumulation of the results).
> +
> + sub w4, w4, #4 // h -= 4
> + cmp w4, #4 // loop if h >= 4
To avoid stalls on in-order cores, consider moving both the sub and the
cmp further away from each other, and the cmp further away from the b.ge.
> + b.ge 1b
> + cbnz w4, 2f // if iterations remain jump to completion section
> +
> + fmov w0, s0 // copy result to general purpose register
If applying Sebastian's suggestions, please apply them similarly to
ff_pix_abs16_neon above too.
> + ret
> +2:
> + // v2 and v4 are set either at the end of this loop or from the unrolled version
> + // which branches here to complete iterations when h % 4 != 0.
> + ldr q5, [x5, #1] // load pix3+1
> + ld1 {v3.16B}, [x5], x3 // load pix3
> + ld1 {v1.16B}, [x1], x3 // load pix1
> + sub w4, w4, #1 // decrement h
> +
> + uaddl v18.8H, v3.8B, v5.8B // pix3 + pix3+1 0..7
> + uaddl2 v19.8H, v3.16B, v5.16B // pix3 + pix3+1 8..15
> + add v16.8H, v2.8H, v18.8H // add up 0..7, using pix2 + pix2+1 values from previous iteration
> + add v17.8H, v4.8H, v19.8H // add up 8..15, using pix2 + pix2+1 values from previous iteration
> + // divide by 4 to compute the average of values summed above
> + urshr v16.8H, v16.8H, #2 // shift right by 2 0..7 (rounding shift right)
> + urshr v17.8H, v17.8H, #2 // shift right by 2 8..15
> +
> + uxtl2 v8.8H, v1.16B // 8->16 bits pix1 8..15
> + uxtl v1.8H, v1.8B // 8->16 bits pix1 0..7
> +
> + uabd v6.8H, v1.8H, v16.8H // absolute difference 0..7
> + uaba v6.8H, v8.8H, v17.8H // absolute difference accumulate 8..15
> + addv h6, v6.8H // add up accumulator in v6
> + add d0, d0, d6
> +
> + mov v2.16B, v18.16B // pix3 -> pix2
> + mov v4.16B, v19.16B // pix3+1 -> pix2+1
It's probably better for in-order cores if you'd place these mov
instructions somewhere between the uaba and the following adds.
> +
> + cbnz w4, 2b // loop if h > 0
> + fmov w0, s0 // copy result to general purpose register
> + ret
> +endfunc
> diff --git a/libavcodec/me_cmp.c b/libavcodec/me_cmp.c
> index b2f87d2e1b..60053a1b92 100644
> --- a/libavcodec/me_cmp.c
> +++ b/libavcodec/me_cmp.c
> @@ -1064,6 +1064,8 @@ av_cold void ff_me_cmp_init(MECmpContext *c, AVCodecContext *avctx)
> ff_me_cmp_init_alpha(c, avctx);
> if (ARCH_ARM)
> ff_me_cmp_init_arm(c, avctx);
> + if (ARCH_AARCH64)
> + ff_me_cmp_init_aarch64(c, avctx);
Alphabetical order please; AARCH64 comes before ARM.
> if (ARCH_PPC)
> ff_me_cmp_init_ppc(c, avctx);
> if (ARCH_X86)
> diff --git a/libavcodec/me_cmp.h b/libavcodec/me_cmp.h
> index e9b5161c9a..4dd059223d 100644
> --- a/libavcodec/me_cmp.h
> +++ b/libavcodec/me_cmp.h
> @@ -82,6 +82,7 @@ typedef struct MECmpContext {
> void ff_me_cmp_init(MECmpContext *c, AVCodecContext *avctx);
> void ff_me_cmp_init_alpha(MECmpContext *c, AVCodecContext *avctx);
> void ff_me_cmp_init_arm(MECmpContext *c, AVCodecContext *avctx);
> +void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx);
Alphabetical order please.
// Martin
* [FFmpeg-devel] [PATCH] lavc/aarch64: add some neon pix_abs functions
@ 2022-03-07 20:40 Swinney, Jonathan
From: Swinney, Jonathan @ 2022-03-07 20:40 UTC
To: ffmpeg-devel; +Cc: Martin Storsjö, Clément Bœsch, Pop, Sebastian
- ff_pix_abs16_neon
- ff_pix_abs16_xy2_neon
In direct microbenchmarks of these ff functions versus their C implementations,
these functions performed as follows on AWS Graviton 2:
ff_pix_abs16_neon:
c: benchmark ran 100000 iterations in 0.955383 seconds
ff: benchmark ran 100000 iterations in 0.097669 seconds
ff_pix_abs16_xy2_neon:
c: benchmark ran 100000 iterations in 1.916759 seconds
ff: benchmark ran 100000 iterations in 0.370729 seconds
v2: remove redundant adds
Signed-off-by: Jonathan Swinney <jswinney@amazon.com>
---
libavcodec/aarch64/Makefile | 2 +
libavcodec/aarch64/me_cmp_init_aarch64.c | 39 ++++
libavcodec/aarch64/me_cmp_neon.S | 219 +++++++++++++++++++++++
libavcodec/me_cmp.c | 2 +
libavcodec/me_cmp.h | 1 +
5 files changed, 263 insertions(+)
create mode 100644 libavcodec/aarch64/me_cmp_init_aarch64.c
create mode 100644 libavcodec/aarch64/me_cmp_neon.S
diff --git a/libavcodec/aarch64/Makefile b/libavcodec/aarch64/Makefile
index 954461f81d..18869da1b4 100644
--- a/libavcodec/aarch64/Makefile
+++ b/libavcodec/aarch64/Makefile
@@ -7,6 +7,7 @@ OBJS-$(CONFIG_H264PRED) += aarch64/h264pred_init.o
OBJS-$(CONFIG_H264QPEL) += aarch64/h264qpel_init_aarch64.o
OBJS-$(CONFIG_HPELDSP) += aarch64/hpeldsp_init_aarch64.o
OBJS-$(CONFIG_IDCTDSP) += aarch64/idctdsp_init_aarch64.o
+OBJS-$(CONFIG_ME_CMP) += aarch64/me_cmp_init_aarch64.o
OBJS-$(CONFIG_MPEGAUDIODSP) += aarch64/mpegaudiodsp_init.o
OBJS-$(CONFIG_NEON_CLOBBER_TEST) += aarch64/neontest.o
OBJS-$(CONFIG_PIXBLOCKDSP) += aarch64/pixblockdsp_init_aarch64.o
@@ -46,6 +47,7 @@ NEON-OBJS-$(CONFIG_H264QPEL) += aarch64/h264qpel_neon.o \
NEON-OBJS-$(CONFIG_HPELDSP) += aarch64/hpeldsp_neon.o
NEON-OBJS-$(CONFIG_IDCTDSP) += aarch64/simple_idct_neon.o
NEON-OBJS-$(CONFIG_MDCT) += aarch64/mdct_neon.o
+NEON-OBJS-$(CONFIG_ME_CMP) += aarch64/me_cmp_neon.o
NEON-OBJS-$(CONFIG_MPEGAUDIODSP) += aarch64/mpegaudiodsp_neon.o
NEON-OBJS-$(CONFIG_PIXBLOCKDSP) += aarch64/pixblockdsp_neon.o
NEON-OBJS-$(CONFIG_VP8DSP) += aarch64/vp8dsp_neon.o
diff --git a/libavcodec/aarch64/me_cmp_init_aarch64.c b/libavcodec/aarch64/me_cmp_init_aarch64.c
new file mode 100644
index 0000000000..fb827daaf5
--- /dev/null
+++ b/libavcodec/aarch64/me_cmp_init_aarch64.c
@@ -0,0 +1,39 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "config.h"
+#include "libavutil/attributes.h"
+#include "libavutil/aarch64/cpu.h"
+#include "libavcodec/mpegvideo.h"
+
+int ff_pix_abs16_neon(MpegEncContext *s, uint8_t *blk1, uint8_t *blk2,
+ ptrdiff_t stride, int h);
+int ff_pix_abs16_xy2_neon(MpegEncContext *s, uint8_t *blk1, uint8_t *blk2,
+ ptrdiff_t stride, int h);
+
+av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
+{
+ int cpu_flags = av_get_cpu_flags();
+
+ if (have_neon(cpu_flags)) {
+ c->pix_abs[0][0] = ff_pix_abs16_neon;
+ c->pix_abs[0][3] = ff_pix_abs16_xy2_neon;
+ }
+}
diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S
new file mode 100644
index 0000000000..85b0e4bd9e
--- /dev/null
+++ b/libavcodec/aarch64/me_cmp_neon.S
@@ -0,0 +1,219 @@
+/*
+ * Copyright (c) 2022 Jonathan Swinney <jswinney@amazon.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+function ff_pix_abs16_neon, export=1
+ // x0 unused
+ // x1 uint8_t *pix1
+ // x2 uint8_t *pix2
+ // x3 ptrdiff_t stride
+ // w4 int h
+ // x5 uint8_t *pix3
+ cmp w4, #4 // if h < 4, jump to completion section
+ b.lt 2f
+ movi v18.4S, #0 // clear result accumulator
+1:
+ movi v16.8H, #0 // clear uabal accumulator
+ ld1 {v0.16B}, [x1], x3 // load pix1
+ ld1 {v4.16B}, [x2], x3 // load pix2
+ ld1 {v1.16B}, [x1], x3 // load pix1
+ ld1 {v5.16B}, [x2], x3 // load pix2
+ uabal v16.8H, v0.8B, v4.8B // absolute difference accumulate
+ uabal2 v16.8H, v0.16B, v4.16B
+ ld1 {v2.16B}, [x1], x3 // load pix1
+ ld1 {v6.16B}, [x2], x3 // load pix2
+ uabal v16.8H, v1.8B, v5.8B // absolute difference accumulate
+ uabal2 v16.8H, v1.16B, v5.16B
+ ld1 {v3.16B}, [x1], x3
+ ld1 {v7.16B}, [x2], x3
+ uabal v16.8H, v2.8B, v6.8B
+ uabal2 v16.8H, v2.16B, v6.16B
+ sub w4, w4, #4 // h -= 4
+ uabal v16.8H, v3.8B, v7.8B
+ uabal2 v16.8H, v3.16B, v7.16B
+ cmp w4, #4 // if h >= 4, loop
+ addv h17, v16.8H // add up everything in v16 accumulator
+ add d18, d17, d18 // add to the end result register
+
+ b.ge 1b
+ cbnz w4, 2f // if iterations remain, jump to completion section
+
+ mov w0, v18.S[0] // copy result to general purpose register
+ ret
+
+2:
+ movi v16.8H, #0 // clear the uabal accumulator
+ ld1 {v0.16B}, [x1] // load pix1
+ ld1 {v4.16B}, [x2] // load pix2
+ add x1, x1, x3 // increment pointers
+ add x2, x2, x3
+ uabal v16.8H, v0.8B, v4.8B // absolute difference accumulate
+ uabal2 v16.8H, v0.16B, v4.16B
+ addv h17, v16.8H // add up v16
+ add d18, d17, d18 // add to result
+ subs w4, w4, #1 // h -= 1
+ b.ne 2b
+
+ mov w0, v18.S[0] // copy result to general purpose register
+ ret
+endfunc
+
+function ff_pix_abs16_xy2_neon, export=1
+ // x0 unused
+ // x1 uint8_t *pix1
+ // x2 uint8_t *pix2
+ // x3 ptrdiff_t stride
+ // w4 int h
+ // x5 uint8_t *pix3
+ add x5, x2, x3 // create a pointer for pix3
+ movi v0.2D, #0 // initialize the result register
+
+ // I also tested these instructions to get pix2+1 from pix2, but it wasn't faster
+ // than just doing another full (unaligned) load.
+ // ldr b21, [x5, #16]
+ // ushr v4.2D, v2.2D, #8
+ // mov v4.16B[15], v21.16B[0]
+ // mov v4.16B[7], v2.16B[8]
+
+ // Load initial pix2 values for either the unrolled version or the completion version.
+ ldr q4, [x2, #1] // load pix2+1
+ ldr q3, [x2] // load pix2
+ uaddl v2.8H, v3.8B, v4.8B // pix2 + pix2+1 0..7
+ uaddl2 v4.8H, v3.16B, v4.16B // pix2 + pix2+1 8..15
+ cmp w4, #4 // if h < 4 jump to the completion version
+ b.lt 2f
+1:
+ // This is an unrolled implementation. It completes 4 iterations of the C loop for each branch.
+ // In each iteration, pix2[i+1] == pix3[i]. This means we need only three loads per iteration,
+ // plus two at the beginning to start.
+ ldr q5, [x5, #1] // load pix3+1
+ ld1 {v3.16B}, [x5], x3 // load pix3
+ ld1 {v1.16B}, [x1], x3 // load pix1
+
+ ldr q16, [x5, #1] // load pix3+1
+ ld1 {v7.16B}, [x5], x3 // load pix3
+ ld1 {v6.16B}, [x1], x3 // load pix1
+
+ ldr q19, [x5, #1] // load pix3+1
+ ld1 {v18.16B}, [x5], x3 // load pix3
+ ld1 {v17.16B}, [x1], x3 // load pix1
+
+ ldr q22, [x5, #1] // load pix3+1
+ ld1 {v21.16B}, [x5], x3 // load pix3
+ ld1 {v20.16B}, [x1], x3 // load pix1
+
+ // These blocks compute the average: avg(pix2[n], pix2[n+1], pix3[n], pix3[n+1])
+ uaddl v30.8H, v3.8B, v5.8B // pix3 + pix3+1 0..7
+ uaddl2 v31.8H, v3.16B, v5.16B // pix3 + pix3+1 8..15
+ add v23.8H, v2.8H, v30.8H // add up 0..7, using pix2 + pix2+1 values from previous iteration
+ add v24.8H, v4.8H, v31.8H // add up 8..15, using pix2 + pix2+1 values from previous iteration
+ urshr v23.8H, v23.8H, #2 // shift right 2 0..7 (rounding shift right)
+ urshr v24.8H, v24.8H, #2 // shift right 2 8..15
+
+ uaddl v2.8H, v7.8B, v16.8B // pix3 + pix3+1 0..7
+ uaddl2 v4.8H, v7.16B, v16.16B // pix3 + pix3+1 8..15
+ add v26.8H, v30.8H, v2.8H // add up 0..7, using pix2 + pix2+1 values from pix3 above
+ add v27.8H, v31.8H, v4.8H // add up 8..15, using pix2 + pix2+1 values from pix3 above
+ urshr v26.8H, v26.8H, #2 // shift right 2 0..7 (rounding shift right)
+ urshr v27.8H, v27.8H, #2 // shift right 2 8..15
+
+ uaddl v3.8H, v18.8B, v19.8B // pix3 + pix3+1 0..7
+ uaddl2 v5.8H, v18.16B, v19.16B // pix3 + pix3+1 8..15
+ add v28.8H, v2.8H, v3.8H // add up 0..7, using pix2 + pix2+1 values from pix3 above
+ add v29.8H, v4.8H, v5.8H // add up 8..15, using pix2 + pix2+1 values from pix3 above
+ urshr v28.8H, v28.8H, #2 // shift right 2 0..7 (rounding shift right)
+ urshr v29.8H, v29.8H, #2 // shift right 2 8..15
+
+ uaddl v2.8H, v21.8B, v22.8B // pix3 + pix3+1 0..7
+ uaddl2 v4.8H, v21.16B, v22.16B // pix3 + pix3+1 8..15
+ add v30.8H, v3.8H, v2.8H // add up 0..7, using pix2 + pix2+1 values from pix3 above
+ add v31.8H, v5.8H, v4.8H // add up 8..15, using pix2 + pix2+1 values from pix3 above
+ urshr v30.8H, v30.8H, #2 // shift right 2 0..7 (rounding shift right)
+ urshr v31.8H, v31.8H, #2 // shift right 2 8..15
+
+ // Averages are now stored in these registers:
+ // v23, v24
+ // v26, v27
+ // v28, v29
+ // v30, v31
+ // pix1 values in these registers:
+ // v1, v6, v17, v20
+ // available
+ // v3, v5, v7, v16, v18, v19, v25
+
+ uxtl2 v3.8H, v1.16B // 8->16 bits pix1 8..15
+ uxtl v1.8H, v1.8B // 8->16 bits pix1 0..7
+ uxtl2 v7.8H, v6.16B // 8->16 bits pix1 8..15
+ uxtl v6.8H, v6.8B // 8->16 bits pix1 0..7
+ uxtl2 v18.8H, v17.16B // 8->16 bits pix1 8..15
+ uxtl v17.8H, v17.8B // 8->16 bits pix1 0..7
+ uxtl2 v25.8H, v20.16B // 8->16 bits pix1 8..15
+ uxtl v20.8H, v20.8B // 8->16 bits pix1 0..7
+
+ uabd v5.8H, v1.8H, v23.8H // absolute difference 0..7
+ uaba v5.8H, v3.8H, v24.8H // absolute difference accumulate 8..15
+ uaba v5.8H, v6.8H, v26.8H // absolute difference accumulate 0..7
+ uaba v5.8H, v7.8H, v27.8H // absolute difference accumulate 8..15
+ uaba v5.8H, v17.8H, v28.8H // absolute difference accumulate 0..7
+ uaba v5.8H, v18.8H, v29.8H // absolute difference accumulate 8..15
+ uaba v5.8H, v20.8H, v30.8H // absolute difference accumulate 0..7
+ uaba v5.8H, v25.8H, v31.8H // absolute difference accumulate 8..15
+
+ uaddlv s5, v5.8H // add up accumulated values
+ add d0, d0, d5 // add to final result
+
+ sub w4, w4, #4 // h -= 4
+ cmp w4, #4 // loop if h >= 4
+ b.ge 1b
+ cbnz w4, 2f // if iterations remain jump to completion section
+
+ fmov w0, s0 // copy result to general purpose register
+ ret
+2:
+ // v2 and v4 are set either at the end of this loop or from the unrolled version
+ // which branches here to complete iterations when h % 4 != 0.
+ ldr q5, [x5, #1] // load pix3+1
+ ld1 {v3.16B}, [x5], x3 // load pix3
+ ld1 {v1.16B}, [x1], x3 // load pix1
+ sub w4, w4, #1 // decrement h
+
+ uaddl v18.8H, v3.8B, v5.8B // pix3 + pix3+1 0..7
+ uaddl2 v19.8H, v3.16B, v5.16B // pix3 + pix3+1 8..15
+ add v16.8H, v2.8H, v18.8H // add up 0..7, using pix2 + pix2+1 values from previous iteration
+ add v17.8H, v4.8H, v19.8H // add up 8..15, using pix2 + pix2+1 values from previous iteration
+ // divide by 4 to compute the average of values summed above
+ urshr v16.8H, v16.8H, #2 // shift right by 2 0..7 (rounding shift right)
+ urshr v17.8H, v17.8H, #2 // shift right by 2 8..15
+
+ uxtl2 v8.8H, v1.16B // 8->16 bits pix1 8..15
+ uxtl v1.8H, v1.8B // 8->16 bits pix1 0..7
+
+ uabd v6.8H, v1.8H, v16.8H // absolute difference 0..7
+ uaba v6.8H, v8.8H, v17.8H // absolute difference accumulate 8..15
+ addv h6, v6.8H // add up accumulator in v6
+ add d0, d0, d6
+
+ mov v2.16B, v18.16B // pix3 -> pix2
+ mov v4.16B, v19.16B // pix3+1 -> pix2+1
+
+ cbnz w4, 2b // loop if h > 0
+ fmov w0, s0 // copy result to general purpose register
+ ret
+endfunc
diff --git a/libavcodec/me_cmp.c b/libavcodec/me_cmp.c
index b2f87d2e1b..60053a1b92 100644
--- a/libavcodec/me_cmp.c
+++ b/libavcodec/me_cmp.c
@@ -1064,6 +1064,8 @@ av_cold void ff_me_cmp_init(MECmpContext *c, AVCodecContext *avctx)
ff_me_cmp_init_alpha(c, avctx);
if (ARCH_ARM)
ff_me_cmp_init_arm(c, avctx);
+ if (ARCH_AARCH64)
+ ff_me_cmp_init_aarch64(c, avctx);
if (ARCH_PPC)
ff_me_cmp_init_ppc(c, avctx);
if (ARCH_X86)
diff --git a/libavcodec/me_cmp.h b/libavcodec/me_cmp.h
index e9b5161c9a..4dd059223d 100644
--- a/libavcodec/me_cmp.h
+++ b/libavcodec/me_cmp.h
@@ -82,6 +82,7 @@ typedef struct MECmpContext {
void ff_me_cmp_init(MECmpContext *c, AVCodecContext *avctx);
void ff_me_cmp_init_alpha(MECmpContext *c, AVCodecContext *avctx);
void ff_me_cmp_init_arm(MECmpContext *c, AVCodecContext *avctx);
+void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx);
void ff_me_cmp_init_ppc(MECmpContext *c, AVCodecContext *avctx);
void ff_me_cmp_init_x86(MECmpContext *c, AVCodecContext *avctx);
void ff_me_cmp_init_mips(MECmpContext *c, AVCodecContext *avctx);
--
2.32.0