Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
* [FFmpeg-devel] [PATCH] lavc/aarch64: add some neon pix_abs functions
@ 2022-03-07 20:40 Swinney, Jonathan
  2022-03-14 22:28 ` Martin Storsjö
  0 siblings, 1 reply; 4+ messages in thread
From: Swinney, Jonathan @ 2022-03-07 20:40 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: Martin Storsjö, Clément Bœsch, Pop, Sebastian

 - ff_pix_abs16_neon
 - ff_pix_abs16_xy2_neon

In direct microbenchmarks of these functions versus their C implementations,
they performed as follows on AWS Graviton 2:

ff_pix_abs16_neon:
c:  benchmark ran 100000 iterations in 0.955383 seconds
ff: benchmark ran 100000 iterations in 0.097669 seconds

ff_pix_abs16_xy2_neon:
c:  benchmark ran 100000 iterations in 1.916759 seconds
ff: benchmark ran 100000 iterations in 0.370729 seconds

Changes since the previous version of this patch: removed redundant adds.

Signed-off-by: Jonathan Swinney <jswinney@amazon.com>
---
 libavcodec/aarch64/Makefile              |   2 +
 libavcodec/aarch64/me_cmp_init_aarch64.c |  39 ++++
 libavcodec/aarch64/me_cmp_neon.S         | 219 +++++++++++++++++++++++
 libavcodec/me_cmp.c                      |   2 +
 libavcodec/me_cmp.h                      |   1 +
 5 files changed, 263 insertions(+)
 create mode 100644 libavcodec/aarch64/me_cmp_init_aarch64.c
 create mode 100644 libavcodec/aarch64/me_cmp_neon.S

diff --git a/libavcodec/aarch64/Makefile b/libavcodec/aarch64/Makefile
index 954461f81d..18869da1b4 100644
--- a/libavcodec/aarch64/Makefile
+++ b/libavcodec/aarch64/Makefile
@@ -7,6 +7,7 @@ OBJS-$(CONFIG_H264PRED)                 += aarch64/h264pred_init.o
 OBJS-$(CONFIG_H264QPEL)                 += aarch64/h264qpel_init_aarch64.o
 OBJS-$(CONFIG_HPELDSP)                  += aarch64/hpeldsp_init_aarch64.o
 OBJS-$(CONFIG_IDCTDSP)                  += aarch64/idctdsp_init_aarch64.o
+OBJS-$(CONFIG_ME_CMP)                   += aarch64/me_cmp_init_aarch64.o
 OBJS-$(CONFIG_MPEGAUDIODSP)             += aarch64/mpegaudiodsp_init.o
 OBJS-$(CONFIG_NEON_CLOBBER_TEST)        += aarch64/neontest.o
 OBJS-$(CONFIG_PIXBLOCKDSP)              += aarch64/pixblockdsp_init_aarch64.o
@@ -46,6 +47,7 @@ NEON-OBJS-$(CONFIG_H264QPEL)            += aarch64/h264qpel_neon.o             \
 NEON-OBJS-$(CONFIG_HPELDSP)             += aarch64/hpeldsp_neon.o
 NEON-OBJS-$(CONFIG_IDCTDSP)             += aarch64/simple_idct_neon.o
 NEON-OBJS-$(CONFIG_MDCT)                += aarch64/mdct_neon.o
+NEON-OBJS-$(CONFIG_ME_CMP)              += aarch64/me_cmp_neon.o
 NEON-OBJS-$(CONFIG_MPEGAUDIODSP)        += aarch64/mpegaudiodsp_neon.o
 NEON-OBJS-$(CONFIG_PIXBLOCKDSP)         += aarch64/pixblockdsp_neon.o
 NEON-OBJS-$(CONFIG_VP8DSP)              += aarch64/vp8dsp_neon.o
diff --git a/libavcodec/aarch64/me_cmp_init_aarch64.c b/libavcodec/aarch64/me_cmp_init_aarch64.c
new file mode 100644
index 0000000000..fb827daaf5
--- /dev/null
+++ b/libavcodec/aarch64/me_cmp_init_aarch64.c
@@ -0,0 +1,39 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "config.h"
+#include "libavutil/attributes.h"
+#include "libavutil/aarch64/cpu.h"
+#include "libavcodec/mpegvideo.h"
+
+int ff_pix_abs16_neon(MpegEncContext *s, uint8_t *blk1, uint8_t *blk2,
+                       ptrdiff_t stride, int h);
+int ff_pix_abs16_xy2_neon(MpegEncContext *s, uint8_t *blk1, uint8_t *blk2,
+                       ptrdiff_t stride, int h);
+
+av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
+{
+    int cpu_flags = av_get_cpu_flags();
+
+    if (have_neon(cpu_flags)) {
+        c->pix_abs[0][0] = ff_pix_abs16_neon;
+        c->pix_abs[0][3] = ff_pix_abs16_xy2_neon;
+    }
+}
diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S
new file mode 100644
index 0000000000..85b0e4bd9e
--- /dev/null
+++ b/libavcodec/aarch64/me_cmp_neon.S
@@ -0,0 +1,219 @@
+/*
+ * Copyright (c) 2022 Jonathan Swinney <jswinney@amazon.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+function ff_pix_abs16_neon, export=1
+        // x0   unused
+        // x1   uint8_t *pix1
+        // x2   uint8_t *pix2
+        // x3   ptrdiff_t stride
+        // w4   int h
+        // x5   uint8_t *pix3
+        cmp     w4, #4                      // if h < 4, jump to completion section
+        b.lt    2f
+        movi    v18.4S, #0                  // clear result accumulator
+1:
+        movi    v16.8H, #0                  // clear uabal accumulator
+        ld1     {v0.16B}, [x1], x3          // load pix1
+        ld1     {v4.16B}, [x2], x3          // load pix2
+        ld1     {v1.16B}, [x1], x3          // load pix1
+        ld1     {v5.16B}, [x2], x3          // load pix2
+        uabal   v16.8H, v0.8B, v4.8B        // absolute difference accumulate
+        uabal2  v16.8H, v0.16B, v4.16B
+        ld1     {v2.16B}, [x1], x3          // load pix1
+        ld1     {v6.16B}, [x2], x3          // load pix2
+        uabal   v16.8H, v1.8B, v5.8B        // absolute difference accumulate
+        uabal2  v16.8H, v1.16B, v5.16B
+        ld1     {v3.16B}, [x1], x3
+        ld1     {v7.16B}, [x2], x3
+        uabal   v16.8H, v2.8B, v6.8B
+        uabal2  v16.8H, v2.16B, v6.16B
+        sub     w4, w4, #4                  // h -= 4
+        uabal   v16.8H, v3.8B, v7.8B
+        uabal2  v16.8H, v3.16B, v7.16B
+        cmp     w4, #4                      // if h >= 4, loop
+        addv    h17, v16.8H                 // add up everything in v16 accumulator
+        add     d18, d17, d18               // add to the end result register
+
+        b.ge    1b
+        cbnz    w4, 2f                      // if iterations remain, jump to completion section
+
+        mov     w0, v18.S[0]                // copy result to general purpose register
+        ret
+
+2:
+        movi    v16.8H, #0                  // clear the uabal accumulator
+        ld1     {v0.16B}, [x1]              // load pix1
+        ld1     {v4.16B}, [x2]              // load pix2
+        add     x1, x1, x3                  // increment pointers
+        add     x2, x2, x3
+        uabal   v16.8H, v0.8B, v4.8B        // absolute difference accumulate
+        uabal2  v16.8H, v0.16B, v4.16B
+        addv    h17, v16.8H                 // add up v16
+        add     d18, d17, d18               // add to result
+        subs    w4, w4, #1                  // h -= 1
+        b.ne    2b
+
+        mov     w0, v18.S[0]                // copy result to general purpose register
+        ret
+endfunc
+
+function ff_pix_abs16_xy2_neon, export=1
+        // x0   unused
+        // x1   uint8_t *pix1
+        // x2   uint8_t *pix2
+        // x3   ptrdiff_t stride
+        // w4   int h
+        // x5   uint8_t *pix3
+        add     x5, x2, x3                  // create a pointer for pix3
+        movi    v0.2D, #0                   // initialize the result register
+
+        // I also tested these instructions to get pix2+1 from pix2, but it wasn't faster
+        // than just doing another full (unaligned) load.
+        // ldr     b21, [x5, #16]
+        // ushr    v4.2D, v2.2D, #8
+        // mov     v4.16B[15], v21.16B[0]
+        // mov     v4.16B[7], v2.16B[8]
+
+        // Load initial pix2 values for either the unrolled version or the completion version.
+        ldr     q4, [x2, #1]                // load pix2+1
+        ldr     q3, [x2]                    // load pix2
+        uaddl   v2.8H, v3.8B, v4.8B         // pix2 + pix2+1 0..7
+        uaddl2  v4.8H, v3.16B, v4.16B       // pix2 + pix2+1 8..15
+        cmp     w4, #4                      // if h < 4 jump to the completion version
+        b.lt    2f
+1:
+        // This is an unrolled implementation. It completes 4 iterations of the C loop for each branch.
+        // In each iteration, pix2[i+1] == pix3[i]. This means we need only three loads per iteration,
+        // plus two at the beginning to start.
+        ldr     q5, [x5, #1]                // load pix3+1
+        ld1     {v3.16B}, [x5], x3          // load pix3
+        ld1     {v1.16B}, [x1], x3          // load pix1
+
+        ldr     q16, [x5, #1]               // load pix3+1
+        ld1     {v7.16B}, [x5], x3          // load pix3
+        ld1     {v6.16B}, [x1], x3          // load pix1
+
+        ldr     q19, [x5, #1]               // load pix3+1
+        ld1     {v18.16B}, [x5], x3         // load pix3
+        ld1     {v17.16B}, [x1], x3         // load pix1
+
+        ldr     q22, [x5, #1]               // load pix3+1
+        ld1     {v21.16B}, [x5], x3         // load pix3
+        ld1     {v20.16B}, [x1], x3         // load pix1
+
+        // These blocks compute the average: avg(pix2[n], pix2[n+1], pix3[n], pix3[n+1])
+        uaddl   v30.8H, v3.8B, v5.8B        // pix3 + pix3+1 0..7
+        uaddl2  v31.8H, v3.16B, v5.16B      // pix3 + pix3+1 8..15
+        add     v23.8H, v2.8H, v30.8H       // add up 0..7, using pix2 + pix2+1 values from previous iteration
+        add     v24.8H, v4.8H, v31.8H       // add up 8..15, using pix2 + pix2+1 values from previous iteration
+        urshr   v23.8H, v23.8H, #2          // shift right 2 0..7 (rounding shift right)
+        urshr   v24.8H, v24.8H, #2          // shift right 2 8..15
+
+        uaddl   v2.8H, v7.8B, v16.8B        // pix3 + pix3+1 0..7
+        uaddl2  v4.8H, v7.16B, v16.16B      // pix3 + pix3+1 8..15
+        add     v26.8H, v30.8H, v2.8H       // add up 0..7, using pix2 + pix2+1 values from pix3 above
+        add     v27.8H, v31.8H, v4.8H       // add up 8..15, using pix2 + pix2+1 values from pix3 above
+        urshr   v26.8H, v26.8H, #2          // shift right 2 0..7 (rounding shift right)
+        urshr   v27.8H, v27.8H, #2          // shift right 2 8..15
+
+        uaddl   v3.8H, v18.8B, v19.8B       // pix3 + pix3+1 0..7
+        uaddl2  v5.8H, v18.16B, v19.16B     // pix3 + pix3+1 8..15
+        add     v28.8H, v2.8H, v3.8H        // add up 0..7, using pix2 + pix2+1 values from pix3 above
+        add     v29.8H, v4.8H, v5.8H        // add up 8..15, using pix2 + pix2+1 values from pix3 above
+        urshr   v28.8H, v28.8H, #2          // shift right 2 0..7 (rounding shift right)
+        urshr   v29.8H, v29.8H, #2          // shift right 2 8..15
+
+        uaddl   v2.8H, v21.8B, v22.8B       // pix3 + pix3+1 0..7
+        uaddl2  v4.8H, v21.16B, v22.16B     // pix3 + pix3+1 8..15
+        add     v30.8H, v3.8H, v2.8H        // add up 0..7, using pix2 + pix2+1 values from pix3 above
+        add     v31.8H, v5.8H, v4.8H        // add up 8..15, using pix2 + pix2+1 values from pix3 above
+        urshr   v30.8H, v30.8H, #2          // shift right 2 0..7 (rounding shift right)
+        urshr   v31.8H, v31.8H, #2          // shift right 2 8..15
+
+        // Averages are now stored in these registers:
+        // v23, v24
+        // v26, v27
+        // v28, v29
+        // v30, v31
+        // pix1 values in these registers:
+        // v1, v6, v17, v20
+        // available
+        // v3, v5, v7, v16, v18, v19, v25
+
+        uxtl2   v3.8H, v1.16B               // 8->16 bits pix1 8..15
+        uxtl    v1.8H, v1.8B                // 8->16 bits pix1 0..7
+        uxtl2   v7.8H, v6.16B               // 8->16 bits pix1 8..15
+        uxtl    v6.8H, v6.8B                // 8->16 bits pix1 0..7
+        uxtl2   v18.8H, v17.16B             // 8->16 bits pix1 8..15
+        uxtl    v17.8H, v17.8B              // 8->16 bits pix1 0..7
+        uxtl2   v25.8H, v20.16B             // 8->16 bits pix1 8..15
+        uxtl    v20.8H, v20.8B              // 8->16 bits pix1 0..7
+
+        uabd    v5.8H, v1.8H, v23.8H        // absolute difference 0..7
+        uaba    v5.8H, v3.8H, v24.8H        // absolute difference accumulate 8..15
+        uaba    v5.8H, v6.8H, v26.8H        // absolute difference accumulate 0..7
+        uaba    v5.8H, v7.8H, v27.8H        // absolute difference accumulate 8..15
+        uaba    v5.8H, v17.8H, v28.8H       // absolute difference accumulate 0..7
+        uaba    v5.8H, v18.8H, v29.8H       // absolute difference accumulate 8..15
+        uaba    v5.8H, v20.8H, v30.8H       // absolute difference accumulate 0..7
+        uaba    v5.8H, v25.8H, v31.8H       // absolute difference accumulate 8..15
+
+        uaddlv  s5, v5.8H                   // add up accumulated values
+        add     d0, d0, d5                  // add to final result
+
+        sub     w4, w4, #4                  // h -= 4
+        cmp     w4, #4                      // loop if h >= 4
+        b.ge    1b
+        cbnz    w4, 2f                      // if iterations remain jump to completion section
+
+        fmov    w0, s0                      // copy result to general purpose register
+        ret
+2:
+        // v2 and v4 are set either at the end of this loop or from the unrolled version
+        // which branches here to complete iterations when h % 4 != 0.
+        ldr     q5, [x5, #1]                // load pix3+1
+        ld1     {v3.16B}, [x5], x3          // load pix3
+        ld1     {v1.16B}, [x1], x3          // load pix1
+        sub     w4, w4, #1                  // decrement h
+
+        uaddl   v18.8H, v3.8B, v5.8B        // pix3 + pix3+1 0..7
+        uaddl2  v19.8H, v3.16B, v5.16B      // pix3 + pix3+1 8..15
+        add     v16.8H, v2.8H, v18.8H       // add up 0..7, using pix2 + pix2+1 values from previous iteration
+        add     v17.8H, v4.8H, v19.8H       // add up 8..15, using pix2 + pix2+1 values from previous iteration
+        // divide by 4 to compute the average of values summed above
+        urshr   v16.8H, v16.8H, #2          // shift right by 2 0..7 (rounding shift right)
+        urshr   v17.8H, v17.8H, #2          // shift right by 2 8..15
+
+        uxtl2   v8.8H, v1.16B               // 8->16 bits pix1 8..15
+        uxtl    v1.8H, v1.8B                // 8->16 bits pix1 0..7
+
+        uabd    v6.8H, v1.8H, v16.8H        // absolute difference 0..7
+        uaba    v6.8H, v8.8H, v17.8H        // absolute difference accumulate 8..15
+        addv    h6, v6.8H                   // add up accumulator in v6
+        add     d0, d0, d6
+
+        mov     v2.16B, v18.16B             // pix3 -> pix2
+        mov     v4.16B, v19.16B             // pix3+1 -> pix2+1
+
+        cbnz    w4, 2b                      // loop if h > 0
+        fmov    w0, s0                      // copy result to general purpose register
+        ret
+endfunc
diff --git a/libavcodec/me_cmp.c b/libavcodec/me_cmp.c
index b2f87d2e1b..60053a1b92 100644
--- a/libavcodec/me_cmp.c
+++ b/libavcodec/me_cmp.c
@@ -1064,6 +1064,8 @@ av_cold void ff_me_cmp_init(MECmpContext *c, AVCodecContext *avctx)
         ff_me_cmp_init_alpha(c, avctx);
     if (ARCH_ARM)
         ff_me_cmp_init_arm(c, avctx);
+    if (ARCH_AARCH64)
+        ff_me_cmp_init_aarch64(c, avctx);
     if (ARCH_PPC)
         ff_me_cmp_init_ppc(c, avctx);
     if (ARCH_X86)
diff --git a/libavcodec/me_cmp.h b/libavcodec/me_cmp.h
index e9b5161c9a..4dd059223d 100644
--- a/libavcodec/me_cmp.h
+++ b/libavcodec/me_cmp.h
@@ -82,6 +82,7 @@ typedef struct MECmpContext {
 void ff_me_cmp_init(MECmpContext *c, AVCodecContext *avctx);
 void ff_me_cmp_init_alpha(MECmpContext *c, AVCodecContext *avctx);
 void ff_me_cmp_init_arm(MECmpContext *c, AVCodecContext *avctx);
+void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx);
 void ff_me_cmp_init_ppc(MECmpContext *c, AVCodecContext *avctx);
 void ff_me_cmp_init_x86(MECmpContext *c, AVCodecContext *avctx);
 void ff_me_cmp_init_mips(MECmpContext *c, AVCodecContext *avctx);
-- 
2.32.0


* Re: [FFmpeg-devel] [PATCH] lavc/aarch64: add some neon pix_abs functions
  2022-03-07 20:40 [FFmpeg-devel] [PATCH] lavc/aarch64: add some neon pix_abs functions Swinney, Jonathan
@ 2022-03-14 22:28 ` Martin Storsjö
  0 siblings, 0 replies; 4+ messages in thread
From: Martin Storsjö @ 2022-03-14 22:28 UTC (permalink / raw)
  To: Swinney, Jonathan; +Cc: Clément Bœsch, Pop, Sebastian, ffmpeg-devel

On Mon, 7 Mar 2022, Swinney, Jonathan wrote:

> - ff_pix_abs16_neon
> - ff_pix_abs16_xy2_neon
>
> In direct microbenchmarks of these functions versus their C implementations,
> they performed as follows on AWS Graviton 2:
>
> ff_pix_abs16_neon:
> c:  benchmark ran 100000 iterations in 0.955383 seconds
> ff: benchmark ran 100000 iterations in 0.097669 seconds
>
> ff_pix_abs16_xy2_neon:
> c:  benchmark ran 100000 iterations in 1.916759 seconds
> ff: benchmark ran 100000 iterations in 0.370729 seconds

I see that there are no checkasm tests for these functions - would you mind 
adding one? (There's something kind of like a checkasm test in 
libavcodec/tests/motion.c, but that one doesn't seem to be updated for 
contemporary SIMD instruction sets.)

Adding a checkasm test is important as it tests for a bunch of aspects 
that otherwise can seem to work by accident (like missing zeroing/sign 
extension of the upper half of registers, clobbering callee saved 
registers, etc). For functions of this size, it's not hard to verify such 
aspects of course, but I pretty much want to have checkasm coverage for 
all newly added assembly. (Plus, checkasm gives built-in benchmarking 
support for the functions.)

> diff --git a/libavcodec/aarch64/me_cmp_init_aarch64.c b/libavcodec/aarch64/me_cmp_init_aarch64.c
> new file mode 100644
> index 0000000000..fb827daaf5
> --- /dev/null
> +++ b/libavcodec/aarch64/me_cmp_init_aarch64.c
> @@ -0,0 +1,39 @@
> +/*
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#include <stdint.h>
> +
> +#include "config.h"
> +#include "libavutil/attributes.h"
> +#include "libavutil/aarch64/cpu.h"
> +#include "libavcodec/mpegvideo.h"
> +
> +int ff_pix_abs16_neon(MpegEncContext *s, uint8_t *blk1, uint8_t *blk2,
> +                       ptrdiff_t stride, int h);
> +int ff_pix_abs16_xy2_neon(MpegEncContext *s, uint8_t *blk1, uint8_t *blk2,
> +                       ptrdiff_t stride, int h);

The second line seems misindented for both functions

> diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S
> new file mode 100644
> index 0000000000..85b0e4bd9e
> --- /dev/null
> +++ b/libavcodec/aarch64/me_cmp_neon.S
> @@ -0,0 +1,219 @@
> +/*
> + * Copyright (c) 2022 Jonathan Swinney <jswinney@amazon.com>
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#include "libavutil/aarch64/asm.S"
> +function ff_pix_abs16_neon, export=1

Nit: Empty line between the include and the function

> +        // x0   unused
> +        // x1   uint8_t *pix1
> +        // x2   uint8_t *pix2
> +        // x3   ptrdiff_t stride
> +        // w4   int h
> +        // x5   uint8_t *pix3
> +        cmp     w4, #4                      // if h < 4, jump to completion section

Please indent the assembly according to the existing code; 8 spaces before 
the instruction column, 24 chars before the first operand.

> +        b.lt    2f
> +        movi    v18.4S, #0                  // clear result accumulator

Nit: I prefer lower case for the element specifier (.4s)

> +1:
> +        movi    v16.8H, #0                  // clear uabal accumulator
> +        ld1     {v0.16B}, [x1], x3          // load pix1
> +        ld1     {v4.16B}, [x2], x3          // load pix2
> +        ld1     {v1.16B}, [x1], x3          // load pix1
> +        ld1     {v5.16B}, [x2], x3          // load pix2
> +        uabal   v16.8H, v0.8B, v4.8B        // absolute difference accumulate
> +        uabal2  v16.8H, v0.16B, v4.16B
> +        ld1     {v2.16B}, [x1], x3          // load pix1
> +        ld1     {v6.16B}, [x2], x3          // load pix2
> +        uabal   v16.8H, v1.8B, v5.8B        // absolute difference accumulate
> +        uabal2  v16.8H, v1.16B, v5.16B
> +        ld1     {v3.16B}, [x1], x3
> +        ld1     {v7.16B}, [x2], x3
> +        uabal   v16.8H, v2.8B, v6.8B
> +        uabal2  v16.8H, v2.16B, v6.16B
> +        sub     w4, w4, #4                  // h -= 4
> +        uabal   v16.8H, v3.8B, v7.8B
> +        uabal2  v16.8H, v3.16B, v7.16B
> +        cmp     w4, #4                      // if h >= 4, loop
> +        addv    h17, v16.8H                 // add up everything in v16 accumulator
> +        add     d18, d17, d18               // add to the end result register

Would it be beneficial to not do the addv here on each iteration, but 
accumulate in v18.8h, and then just do one single addv at the end?
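
Something like this (untested) - clear v18 as .8h before the loop, accumulate
vector-wise in the loop, and reduce only once after it (using the widening
uaddlv to be safe against overflow in the final sum):

        // in the loop, replacing the addv/add pair:
        add     v18.8h, v18.8h, v16.8h      // accumulate vector-wise
        // once, after the loop:
        uaddlv  s18, v18.8h                 // single horizontal add
        fmov    w0, s18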

> +
> +        b.ge    1b
> +        cbnz    w4, 2f                      // if iterations remain, jump to completion section
> +
> +        mov     w0, v18.S[0]                // copy result to general purpose register
> +        ret
> +
> +2:
> +        movi    v16.8H, #0                  // clear the uabal accumulator
> +        ld1     {v0.16B}, [x1]              // load pix1
> +        ld1     {v4.16B}, [x2]              // load pix2
> +        add     x1, x1, x3                  // increment pointers
> +        add     x2, x2, x3

Why not use the regular post-increment on the ld1, like above?
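
I.e. like in the main loop above:

        ld1     {v0.16b}, [x1], x3          // load pix1 and advance by stride
        ld1     {v4.16b}, [x2], x3          // load pix2 and advance by stride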

> +        uabal   v16.8H, v0.8B, v4.8B        // absolute difference accumulate
> +        uabal2  v16.8H, v0.16B, v4.16B
> +        addv    h17, v16.8H                 // add up v16
> +        add     d18, d17, d18               // add to result

If we got here via the "b.lt 2f" at the start of the function, d18 is 
uninitialized here. (A proper checkasm test would test heights < 4.)
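
The simplest fix is probably to clear it before the early branch, i.e.:

        movi    v18.4s, #0                  // clear result accumulator first
        cmp     w4, #4                      // if h < 4, jump to completion section
        b.lt    2f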

> +        subs    w4, w4, #1                  // h -= 1
> +        b.ne    2b
> +
> +        mov     w0, v18.S[0]                // copy result to general purpose register
> +        ret
> +endfunc
> +
> +function ff_pix_abs16_xy2_neon, export=1
> +        // x0   unused
> +        // x1   uint8_t *pix1
> +        // x2   uint8_t *pix2
> +        // x3   ptrdiff_t stride
> +        // w4   int h
> +        // x5   uint8_t *pix3
> +        add     x5, x2, x3                  // create a pointer for pix3
> +        movi    v0.2D, #0                   // initialize the result register
> +
> +        // I also tested these instructions to get pix2+1 from pix2, but it wasn't faster
> +        // than just doing another full (unaligned) load.
> +        // ldr     b21, [x5, #16]
> +        // ushr    v4.2D, v2.2D, #8
> +        // mov     v4.16B[15], v21.16B[0]
> +        // mov     v4.16B[7], v2.16B[8]

This version most certainly would be slower indeed. If we could be ok with 
doing a bit of overread, the simplest version might be to load e.g. "ld1 
{v4.16b, v5.16b}, [x2]" followed by "ext v5.16b, v4.16b, v5.16b, #1" to 
shift it. But doing an overlapping unaligned load probably is fine too.
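
I.e. the overread variant would be something like (untested):

        ld1     {v3.16b, v4.16b}, [x2]      // load pix2 plus 16 bytes of overread
        ext     v4.16b, v3.16b, v4.16b, #1  // v4 = pix2+1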

> +
> +        // Load initial pix2 values for either the unrolled version or the completion version.
> +        ldr     q4, [x2, #1]                // load pix2+1
> +        ldr     q3, [x2]                    // load pix2
> +        uaddl   v2.8H, v3.8B, v4.8B         // pix2 + pix2+1 0..7
> +        uaddl2  v4.8H, v3.16B, v4.16B       // pix2 + pix2+1 8..15
> +        cmp     w4, #4                      // if h < 4 jump to the completion version
> +        b.lt    2f
> +1:
> +        // This is an unrolled implementation. It completes 4 iterations of the C loop for each branch.
> +        // In each iteration, pix2[i+1] == pix3[i]. This means we need only three loads per iteration,
> +        // plus two at the beginning to start.
> +        ldr     q5, [x5, #1]                // load pix3+1
> +        ld1     {v3.16B}, [x5], x3          // load pix3
> +        ld1     {v1.16B}, [x1], x3          // load pix1
> +
> +        ldr     q16, [x5, #1]               // load pix3+1
> +        ld1     {v7.16B}, [x5], x3          // load pix3
> +        ld1     {v6.16B}, [x1], x3          // load pix1
> +
> +        ldr     q19, [x5, #1]               // load pix3+1
> +        ld1     {v18.16B}, [x5], x3         // load pix3
> +        ld1     {v17.16B}, [x1], x3         // load pix1
> +
> +        ldr     q22, [x5, #1]               // load pix3+1
> +        ld1     {v21.16B}, [x5], x3         // load pix3
> +        ld1     {v20.16B}, [x1], x3         // load pix1
> +
> +        // These blocks compute the average: avg(pix2[n], pix2[n+1], pix3[n], pix3[n+1])
> +        uaddl   v30.8H, v3.8B, v5.8B        // pix3 + pix3+1 0..7
> +        uaddl2  v31.8H, v3.16B, v5.16B      // pix3 + pix3+1 8..15
> +        add     v23.8H, v2.8H, v30.8H       // add up 0..7, using pix2 + pix2+1 values from previous iteration
> +        add     v24.8H, v4.8H, v31.8H       // add up 8..15, using pix2 + pix2+1 values from previous iteration
> +        urshr   v23.8H, v23.8H, #2          // shift right 2 0..7 (rounding shift right)
> +        urshr   v24.8H, v24.8H, #2          // shift right 2 8..15
> +
> +        uaddl   v2.8H, v7.8B, v16.8B        // pix3 + pix3+1 0..7
> +        uaddl2  v4.8H, v7.16B, v16.16B      // pix3 + pix3+1 8..15
> +        add     v26.8H, v30.8H, v2.8H       // add up 0..7, using pix2 + pix2+1 values from pix3 above
> +        add     v27.8H, v31.8H, v4.8H       // add up 8..15, using pix2 + pix2+1 values from pix3 above
> +        urshr   v26.8H, v26.8H, #2          // shift right 2 0..7 (rounding shift right)
> +        urshr   v27.8H, v27.8H, #2          // shift right 2 8..15
> +
> +        uaddl   v3.8H, v18.8B, v19.8B       // pix3 + pix3+1 0..7
> +        uaddl2  v5.8H, v18.16B, v19.16B     // pix3 + pix3+1 8..15
> +        add     v28.8H, v2.8H, v3.8H        // add up 0..7, using pix2 + pix2+1 values from pix3 above
> +        add     v29.8H, v4.8H, v5.8H        // add up 8..15, using pix2 + pix2+1 values from pix3 above
> +        urshr   v28.8H, v28.8H, #2          // shift right 2 0..7 (rounding shift right)
> +        urshr   v29.8H, v29.8H, #2          // shift right 2 8..15
> +
> +        uaddl   v2.8H, v21.8B, v22.8B       // pix3 + pix3+1 0..7
> +        uaddl2  v4.8H, v21.16B, v22.16B     // pix3 + pix3+1 8..15
> +        add     v30.8H, v3.8H, v2.8H        // add up 0..7, using pix2 + pix2+1 values from pix3 above
> +        add     v31.8H, v5.8H, v4.8H        // add up 8..15, using pix2 + pix2+1 values from pix3 above
> +        urshr   v30.8H, v30.8H, #2          // shift right 2 0..7 (rounding shift right)
> +        urshr   v31.8H, v31.8H, #2          // shift right 2 8..15
> +
> +        // Averages are now stored in these registers:
> +        // v23, v24
> +        // v26, v27
> +        // v28, v29
> +        // v30, v31
> +        // pix1 values in these registers:
> +        // v1, v6, v17, v20
> +        // available
> +        // v3, v5, v7, v16, v18, v19, v25
> +
> +        uxtl2   v3.8H, v1.16B               // 8->16 bits pix1 8..15
> +        uxtl    v1.8H, v1.8B                // 8->16 bits pix1 0..7
> +        uxtl2   v7.8H, v6.16B               // 8->16 bits pix1 8..15
> +        uxtl    v6.8H, v6.8B                // 8->16 bits pix1 0..7
> +        uxtl2   v18.8H, v17.16B             // 8->16 bits pix1 8..15
> +        uxtl    v17.8H, v17.8B              // 8->16 bits pix1 0..7
> +        uxtl2   v25.8H, v20.16B             // 8->16 bits pix1 8..15
> +        uxtl    v20.8H, v20.8B              // 8->16 bits pix1 0..7
> +
> +        uabd    v5.8H, v1.8H, v23.8H        // absolute difference 0..7
> +        uaba    v5.8H, v3.8H, v24.8H        // absolute difference accumulate 8..15
> +        uaba    v5.8H, v6.8H, v26.8H        // absolute difference accumulate 0..7
> +        uaba    v5.8H, v7.8H, v27.8H        // absolute difference accumulate 8..15
> +        uaba    v5.8H, v17.8H, v28.8H       // absolute difference accumulate 0..7
> +        uaba    v5.8H, v18.8H, v29.8H       // absolute difference accumulate 8..15
> +        uaba    v5.8H, v20.8H, v30.8H       // absolute difference accumulate 0..7
> +        uaba    v5.8H, v25.8H, v31.8H       // absolute difference accumulate 8..15
> +
> +        uaddlv  s5, v5.8H                   // add up accumulated values
> +        add     d0, d0, d5                  // add to final result

Same thing as above; try whether deferring the horizontal addv until the 
end is helpful. Also, for in-order cores, it'd be good to avoid direct 
chaining of dependencies like this - you could move e.g. the "sub" from 
below between the uaddlv/add pair. But if you avoid uaddlv altogether, 
it's more straightforward (then it's probably best to place the sub before 
the accumulation of the results).
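
E.g. something like this for the uaddlv case:

        uaddlv  s5, v5.8h                   // add up accumulated values
        sub     w4, w4, #4                  // h -= 4, fills the uaddlv latency gap
        add     d0, d0, d5                  // add to final result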

> +
> +        sub     w4, w4, #4                  // h -= 4
> +        cmp     w4, #4                      // loop if h >= 4

To avoid stalls on in-order cores, consider moving both the sub and the 
cmp further away from each other, and the cmp further away from the b.ge.

> +        b.ge    1b
> +        cbnz    w4, 2f                      // if iterations remain jump to completion section
> +
> +        fmov    w0, s0                      // copy result to general purpose register

If applying Sebastian's suggestions, please apply them similarly to 
ff_pix_abs16_neon above too.

> +        ret
> +2:
> +        // v2 and v4 are set either at the end of this loop or from the unrolled version
> +        // which branches here to complete iterations when h % 4 != 0.
> +        ldr     q5, [x5, #1]                // load pix3+1
> +        ld1     {v3.16B}, [x5], x3          // load pix3
> +        ld1     {v1.16B}, [x1], x3          // load pix1
> +        sub     w4, w4, #1                  // decrement h
> +
> +        uaddl   v18.8H, v3.8B, v5.8B        // pix3 + pix3+1 0..7
> +        uaddl2  v19.8H, v3.16B, v5.16B      // pix3 + pix3+1 8..15
> +        add     v16.8H, v2.8H, v18.8H       // add up 0..7, using pix2 + pix2+1 values from previous iteration
> +        add     v17.8H, v4.8H, v19.8H       // add up 8..15, using pix2 + pix2+1 values from previous iteration
> +        // divide by 4 to compute the average of values summed above
> +        urshr   v16.8H, v16.8H, #2          // shift right by 2 0..7 (rounding shift right)
> +        urshr   v17.8H, v17.8H, #2          // shift right by 2 8..15
> +
> +        uxtl2   v8.8H, v1.16B               // 8->16 bits pix1 8..15
> +        uxtl    v1.8H, v1.8B                // 8->16 bits pix1 0..7
> +
> +        uabd    v6.8H, v1.8H, v16.8H        // absolute difference 0..7
> +        uaba    v6.8H, v8.8H, v17.8H        // absolute difference accumulate 8..15
> +        addv    h6, v6.8H                   // add up accumulator in v6
> +        add     d0, d0, d6
> +
> +        mov     v2.16B, v18.16B             // pix3 -> pix2
> +        mov     v4.16B, v19.16B             // pix3+1 -> pix2+1

It's probably better for in-order cores if you'd place these mov 
instructions somewhere between the uaba and the following adds.
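
I.e. something like:

        uabd    v6.8h, v1.8h, v16.8h        // absolute difference 0..7
        uaba    v6.8h, v8.8h, v17.8h        // absolute difference accumulate 8..15
        mov     v2.16b, v18.16b             // pix3 -> pix2, moved into the gap
        mov     v4.16b, v19.16b             // pix3+1 -> pix2+1
        addv    h6, v6.8h                   // add up accumulator in v6
        add     d0, d0, d6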

> +
> +        cbnz    w4, 2b                      // loop if h > 0
> +        fmov    w0, s0                      // copy result to general purpose register
> +        ret
> +endfunc
> diff --git a/libavcodec/me_cmp.c b/libavcodec/me_cmp.c
> index b2f87d2e1b..60053a1b92 100644
> --- a/libavcodec/me_cmp.c
> +++ b/libavcodec/me_cmp.c
> @@ -1064,6 +1064,8 @@ av_cold void ff_me_cmp_init(MECmpContext *c, AVCodecContext *avctx)
>         ff_me_cmp_init_alpha(c, avctx);
>     if (ARCH_ARM)
>         ff_me_cmp_init_arm(c, avctx);
> +    if (ARCH_AARCH64)
> +        ff_me_cmp_init_aarch64(c, avctx);

Alphabetical order please; AARCH64 comes before ARM.

>     if (ARCH_PPC)
>         ff_me_cmp_init_ppc(c, avctx);
>     if (ARCH_X86)
> diff --git a/libavcodec/me_cmp.h b/libavcodec/me_cmp.h
> index e9b5161c9a..4dd059223d 100644
> --- a/libavcodec/me_cmp.h
> +++ b/libavcodec/me_cmp.h
> @@ -82,6 +82,7 @@ typedef struct MECmpContext {
> void ff_me_cmp_init(MECmpContext *c, AVCodecContext *avctx);
> void ff_me_cmp_init_alpha(MECmpContext *c, AVCodecContext *avctx);
> void ff_me_cmp_init_arm(MECmpContext *c, AVCodecContext *avctx);
> +void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx);

Alphabetical order please.

// Martin


* Re: [FFmpeg-devel] [PATCH] lavc/aarch64: add some neon pix_abs functions
       [not found] ` <1646667914236.71807@amazon.com>
@ 2022-03-14 22:39   ` Martin Storsjö
  0 siblings, 0 replies; 4+ messages in thread
From: Martin Storsjö @ 2022-03-14 22:39 UTC (permalink / raw)
  To: Pop, Sebastian; +Cc: Swinney, Jonathan, Clément Bœsch, ffmpeg-devel

On Mon, 7 Mar 2022, Pop, Sebastian wrote:

> Here are a few suggestions:
>
>> +        add     d18, d17, d18               // add to the end result register
>> [...]
>> +        mov     w0, v18.S[0]                // copy result to general purpose register
>
> I think you can use 32-bit register s18 instead of d18.
> The mov with indexed vector is more expensive than fmov.

Oh, I hadn't considered that. In a tight loop, I can indeed measure quite a 
significant difference between those.

> add    s18, s18, s17
> fmov  w0, s18
>
>> +        subs    w4, w4, #1                  // decrement h and set flags for branch below
>> [...]
>> +        b.ne    2b                          // branch based on subs instruction above
>
> Please avoid the flags register to branch.
> Instead you could do:
>
> sub   w4, w4, #1
> cbnz w4, 2b

If there are other instructions between the sub and the b.ne, does this 
make any difference? (In most cases one can move the decrement into a 
suitable gap early in the loop anyway.) I.e. if the flags register already 
is set since long ago, naively I'd expect that b.ne would be faster (or at 
least not slower) than cbnz.

Some benchmarking on Cortex A53, A72 and A73 seems to agree with my 
expectations too. (It'd be good to have the patch at hand hooked up 
in checkasm, so that we could measure and compare exactly the function 
in question.)

// Martin


* [FFmpeg-devel] [PATCH] lavc/aarch64: add some neon pix_abs functions
@ 2022-03-04 22:52 Swinney, Jonathan
       [not found] ` <1646667914236.71807@amazon.com>
  0 siblings, 1 reply; 4+ messages in thread
From: Swinney, Jonathan @ 2022-03-04 22:52 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: Martin Storsjö, Clément Bœsch, Pop, Sebastian

 - ff_pix_abs16_neon
 - ff_pix_abs16_xy2_neon

In direct microbenchmarks of these functions versus their C implementations,
they performed as follows on AWS Graviton 2:

ff_pix_abs16_neon:
c:  benchmark ran 100000 iterations in 0.955383 seconds
ff: benchmark ran 100000 iterations in 0.097669 seconds

ff_pix_abs16_xy2_neon:
c:  benchmark ran 100000 iterations in 1.916759 seconds
ff: benchmark ran 100000 iterations in 0.414291 seconds

Signed-off-by: Jonathan Swinney <jswinney@amazon.com>
---
 libavcodec/aarch64/Makefile              |   2 +
 libavcodec/aarch64/me_cmp_init_aarch64.c |  39 ++++
 libavcodec/aarch64/me_cmp_neon.S         | 230 +++++++++++++++++++++++
 libavcodec/me_cmp.c                      |   2 +
 libavcodec/me_cmp.h                      |   1 +
 5 files changed, 274 insertions(+)
 create mode 100644 libavcodec/aarch64/me_cmp_init_aarch64.c
 create mode 100644 libavcodec/aarch64/me_cmp_neon.S

diff --git a/libavcodec/aarch64/Makefile b/libavcodec/aarch64/Makefile
index 954461f81d..18869da1b4 100644
--- a/libavcodec/aarch64/Makefile
+++ b/libavcodec/aarch64/Makefile
@@ -7,6 +7,7 @@ OBJS-$(CONFIG_H264PRED)                 += aarch64/h264pred_init.o
 OBJS-$(CONFIG_H264QPEL)                 += aarch64/h264qpel_init_aarch64.o
 OBJS-$(CONFIG_HPELDSP)                  += aarch64/hpeldsp_init_aarch64.o
 OBJS-$(CONFIG_IDCTDSP)                  += aarch64/idctdsp_init_aarch64.o
+OBJS-$(CONFIG_ME_CMP)                   += aarch64/me_cmp_init_aarch64.o
 OBJS-$(CONFIG_MPEGAUDIODSP)             += aarch64/mpegaudiodsp_init.o
 OBJS-$(CONFIG_NEON_CLOBBER_TEST)        += aarch64/neontest.o
 OBJS-$(CONFIG_PIXBLOCKDSP)              += aarch64/pixblockdsp_init_aarch64.o
@@ -46,6 +47,7 @@ NEON-OBJS-$(CONFIG_H264QPEL)            += aarch64/h264qpel_neon.o             \
 NEON-OBJS-$(CONFIG_HPELDSP)             += aarch64/hpeldsp_neon.o
 NEON-OBJS-$(CONFIG_IDCTDSP)             += aarch64/simple_idct_neon.o
 NEON-OBJS-$(CONFIG_MDCT)                += aarch64/mdct_neon.o
+NEON-OBJS-$(CONFIG_ME_CMP)              += aarch64/me_cmp_neon.o
 NEON-OBJS-$(CONFIG_MPEGAUDIODSP)        += aarch64/mpegaudiodsp_neon.o
 NEON-OBJS-$(CONFIG_PIXBLOCKDSP)         += aarch64/pixblockdsp_neon.o
 NEON-OBJS-$(CONFIG_VP8DSP)              += aarch64/vp8dsp_neon.o
diff --git a/libavcodec/aarch64/me_cmp_init_aarch64.c b/libavcodec/aarch64/me_cmp_init_aarch64.c
new file mode 100644
index 0000000000..fb827daaf5
--- /dev/null
+++ b/libavcodec/aarch64/me_cmp_init_aarch64.c
@@ -0,0 +1,39 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "config.h"
+#include "libavutil/attributes.h"
+#include "libavutil/aarch64/cpu.h"
+#include "libavcodec/mpegvideo.h"
+
+int ff_pix_abs16_neon(MpegEncContext *s, uint8_t *blk1, uint8_t *blk2,
+                       ptrdiff_t stride, int h);
+int ff_pix_abs16_xy2_neon(MpegEncContext *s, uint8_t *blk1, uint8_t *blk2,
+                       ptrdiff_t stride, int h);
+
+av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
+{
+    int cpu_flags = av_get_cpu_flags();
+
+    if (have_neon(cpu_flags)) {
+        c->pix_abs[0][0] = ff_pix_abs16_neon;
+        c->pix_abs[0][3] = ff_pix_abs16_xy2_neon;
+    }
+}
diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S
new file mode 100644
index 0000000000..5422061ab1
--- /dev/null
+++ b/libavcodec/aarch64/me_cmp_neon.S
@@ -0,0 +1,230 @@
+/*
+ * Copyright (c) 2022 Jonathan Swinney <jswinney@amazon.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+function ff_pix_abs16_neon, export=1
+        // x0   unused
+        // x1   uint8_t *pix1
+        // x2   uint8_t *pix2
+        // x3   ptrdiff_t stride
+        // w4   int h
+        // x5   uint8_t *pix3
+        cmp     w4, #4                      // if h < 4, jump to completion section
+        b.lt    2f
+        movi    v18.4S, #0                  // clear result accumulator
+1:
+        movi    v16.8H, #0                  // clear uabal accumulator
+        ld1     {v0.16B}, [x1], x3          // load pix1
+        ld1     {v4.16B}, [x2], x3          // load pix2
+        ld1     {v1.16B}, [x1], x3          // load pix1
+        ld1     {v5.16B}, [x2], x3          // load pix2
+        uabal   v16.8H, v0.8B, v4.8B        // absolute difference accumulate
+        uabal2  v16.8H, v0.16B, v4.16B
+        ld1     {v2.16B}, [x1], x3          // load pix1
+        ld1     {v6.16B}, [x2], x3          // load pix2
+        uabal   v16.8H, v1.8B, v5.8B        // absolute difference accumulate
+        uabal2  v16.8H, v1.16B, v5.16B
+        ld1     {v3.16B}, [x1], x3
+        ld1     {v7.16B}, [x2], x3
+        uabal   v16.8H, v2.8B, v6.8B
+        uabal2  v16.8H, v2.16B, v6.16B
+        sub     w4, w4, #4                  // h -= 4
+        uabal   v16.8H, v3.8B, v7.8B
+        uabal2  v16.8H, v3.16B, v7.16B
+        cmp     w4, #4                      // if h >= 4, loop
+        addv    h17, v16.8H                 // add up everything in v16 accumulator
+        add     d18, d17, d18               // add to the end result register
+
+        b.ge    1b
+        cbnz    w4, 2f                      // if iterations remain, jump to completion section
+
+        mov     w0, v18.S[0]                // copy result to general purpose register
+        ret
+
+2:
+        movi    v16.8H, #0                  // clear the uabal accumulator
+        ld1     {v0.16B}, [x1]              // load pix1
+        ld1     {v4.16B}, [x2]              // load pix2
+        add     x1, x1, x3                  // increment pointers
+        add     x2, x2, x3
+        uabal   v16.8H, v0.8B, v4.8B        // absolute difference accumulate
+        uabal2  v16.8H, v0.16B, v4.16B
+        addv    h17, v16.8H                 // add up v16
+        add     d18, d17, d18               // add to result
+        subs    w4, w4, #1                  // h -= 1
+        b.ne    2b
+
+        mov     w0, v18.S[0]                // copy result to general purpose register
+        ret
+endfunc
+
+function ff_pix_abs16_xy2_neon, export=1
+        // x0   unused
+        // x1   uint8_t *pix1
+        // x2   uint8_t *pix2
+        // x3   ptrdiff_t stride
+        // w4   int h
+        // x5   uint8_t *pix3
+        add     x5, x2, x3                  // create a pointer for pix3
+        movi    v0.2D, #0                   // initialize the result register
+
+        // I also tested these instructions to get pix2+1 from pix2, but it wasn't faster
+        // than just doing another full (unaligned) load.
+        // ldr     b21, [x5, #16]
+        // ushr    v4.2D, v2.2D, #8
+        // mov     v4.16B[15], v21.16B[0]
+        // mov     v4.16B[7], v2.16B[8]
+
+        // Load initial pix2 values for either the unrolled version or the completion version.
+        ldr     q4, [x2, #1]                // load pix2+1
+        ldr     q2, [x2]                    // load pix2
+        cmp     w4, #4                      // if h < 4 jump to the completion version
+        b.lt    2f
+1:
+        // This is an unrolled implementation. It completes 4 iterations of the C loop for each branch.
+        // In each iteration, pix2[i+1] == pix3[i]. This means we need only three loads per iteration,
+        // plus two at the beginning to start.
+        ldr     q5, [x5, #1]                // load pix3+1
+        ld1     {v3.16B}, [x5], x3          // load pix3
+        ld1     {v1.16B}, [x1], x3          // load pix1
+
+        ldr     q16, [x5, #1]               // load pix3+1
+        ld1     {v7.16B}, [x5], x3          // load pix3
+        ld1     {v6.16B}, [x1], x3          // load pix1
+
+        ldr     q19, [x5, #1]               // load pix3+1
+        ld1     {v18.16B}, [x5], x3         // load pix3
+        ld1     {v17.16B}, [x1], x3         // load pix1
+
+        ldr     q22, [x5, #1]               // load pix3+1
+        ld1     {v21.16B}, [x5], x3         // load pix3
+        ld1     {v20.16B}, [x1], x3         // load pix1
+
+        // These blocks compute the average: avg(pix2[n], pix2[n+1], pix3[n], pix3[n+1])
+        uaddl   v23.8H, v2.8B, v4.8B        // pix2 + pix2+1 0..7
+        uaddl2  v24.8H, v2.16B, v4.16B      // pix2 + pix2+1 8..15
+        uaddl   v30.8H, v3.8B, v5.8B        // pix3 + pix3+1 0..7
+        uaddl2  v31.8H, v3.16B, v5.16B      // pix3 + pix3+1 8..15
+        add     v23.8H, v23.8H, v30.8H      // add up 0..7
+        add     v24.8H, v24.8H, v31.8H      // add up 8..15
+        urshr   v23.8H, v23.8H, #2          // shift right 2 0..7 (rounding shift right)
+        urshr   v24.8H, v24.8H, #2          // shift right 2 8..15
+
+        uaddl   v26.8H, v3.8B, v5.8B        // pix2 + pix2+1 0..7
+        uaddl2  v27.8H, v3.16B, v5.16B      // pix2 + pix2+1 8..15
+        uaddl   v2.8H, v7.8B, v16.8B        // pix3 + pix3+1 0..7
+        uaddl2  v4.8H, v7.16B, v16.16B      // pix3 + pix3+1 8..15
+        add     v26.8H, v26.8H, v2.8H       // add up 0..7
+        add     v27.8H, v27.8H, v4.8H       // add up 8..15
+        urshr   v26.8H, v26.8H, #2          // shift right 2 0..7 (rounding shift right)
+        urshr   v27.8H, v27.8H, #2          // shift right 2 8..15
+
+        uaddl   v28.8H, v7.8B, v16.8B       // pix2 + pix2+1 0..7
+        uaddl2  v29.8H, v7.16B, v16.16B     // pix2 + pix2+1 8..15
+        uaddl   v3.8H, v18.8B, v19.8B       // pix3 + pix3+1 0..7
+        uaddl2  v5.8H, v18.16B, v19.16B     // pix3 + pix3+1 8..15
+        add     v28.8H, v28.8H, v3.8H       // add up 0..7
+        add     v29.8H, v29.8H, v5.8H       // add up 8..15
+        urshr   v28.8H, v28.8H, #2          // shift right 2 0..7 (rounding shift right)
+        urshr   v29.8H, v29.8H, #2          // shift right 2 8..15
+
+        uaddl   v30.8H, v18.8B, v19.8B      // pix2 + pix2+1 0..7
+        uaddl2  v31.8H, v18.16B, v19.16B    // pix2 + pix2+1 8..15
+        uaddl   v2.8H, v21.8B, v22.8B       // pix3 + pix3+1 0..7
+        uaddl2  v4.8H, v21.16B, v22.16B     // pix3 + pix3+1 8..15
+        add     v30.8H, v30.8H, v2.8H       // add up 0..7
+        add     v31.8H, v31.8H, v4.8H       // add up 8..15
+        urshr   v30.8H, v30.8H, #2          // shift right 2 0..7 (rounding shift right)
+        urshr   v31.8H, v31.8H, #2          // shift right 2 8..15
+
+        // Averages are now stored in these registers:
+        // v23, v24
+        // v26, v27
+        // v28, v29
+        // v30, v31
+        // pix1 values in these registers:
+        // v1, v6, v17, v20
+        // available
+        // v2, v3, v4, v5, v7, v16, v18, v19, v25
+
+        uxtl2   v2.8H, v1.16B               // 8->16 bits pix1 8..15
+        uxtl    v1.8H, v1.8B                // 8->16 bits pix1 0..7
+        uxtl2   v7.8H, v6.16B               // 8->16 bits pix1 8..15
+        uxtl    v6.8H, v6.8B                // 8->16 bits pix1 0..7
+        uxtl2   v18.8H, v17.16B             // 8->16 bits pix1 8..15
+        uxtl    v17.8H, v17.8B              // 8->16 bits pix1 0..7
+        uxtl2   v25.8H, v20.16B             // 8->16 bits pix1 8..15
+        uxtl    v20.8H, v20.8B              // 8->16 bits pix1 0..7
+
+        uabd    v5.8H, v1.8H, v23.8H        // absolute difference 0..7
+        uaba    v5.8H, v2.8H, v24.8H        // absolute difference accumulate 8..15
+        uaba    v5.8H, v6.8H, v26.8H        // absolute difference accumulate 0..7
+        uaba    v5.8H, v7.8H, v27.8H        // absolute difference accumulate 8..15
+        uaba    v5.8H, v17.8H, v28.8H       // absolute difference accumulate 0..7
+        uaba    v5.8H, v18.8H, v29.8H       // absolute difference accumulate 8..15
+        uaba    v5.8H, v20.8H, v30.8H       // absolute difference accumulate 0..7
+        uaba    v5.8H, v25.8H, v31.8H       // absolute difference accumulate 8..15
+
+        uaddlv  s5, v5.8H                   // add up accumulated values
+        add     d0, d0, d5                  // add to final result
+
+        mov     v2.16B, v21.16B             // pix3 -> pix2
+        mov     v4.16B, v22.16B             // pix3+1 -> pix2+1
+
+        sub     w4, w4, #4                  // h -= 4
+        cmp     w4, #4                      // loop if h >= 4
+        b.ge    1b
+        cbnz    w4, 2f                      // if iterations remain jump to completion section
+
+        mov     w0, v0.s[0]                 // copy result to general purpose register
+        ret
+2:
+        // q2 and q4 are set either at the end of this loop or from the unrolled version
+        // which branches here to complete iterations when h % 4 != 0.
+        ldr     q5, [x5, #1]                // load pix3+1
+        ld1     {v3.16B}, [x5], x3          // load pix3
+        ld1     {v1.16B}, [x1], x3          // load pix1
+        subs    w4, w4, #1                  // decrement h and set flags for branch below
+
+        uaddl   v16.8H, v2.8B, v4.8B        // pix2 + pix2+1 0..7
+        uaddl2  v17.8H, v2.16B, v4.16B      // pix2 + pix2+1 8..15
+        uaddl   v18.8H, v3.8B, v5.8B        // pix3 + pix3+1 0..7
+        uaddl2  v19.8H, v3.16B, v5.16B      // pix3 + pix3+1 8..15
+        add     v16.8H, v16.8H, v18.8H      // add up 0..7
+        add     v17.8H, v17.8H, v19.8H      // add up 8..15
+        // divide by 4 to compute the average of values summed above
+        urshr   v16.8H, v16.8H, #2          // shift right by 2 0..7 (rounding shift right)
+        urshr   v17.8H, v17.8H, #2          // shift right by 2 8..15
+
+        uxtl2   v8.8H, v1.16B               // 8->16 bits pix1 8..15
+        uxtl    v1.8H, v1.8B                // 8->16 bits pix1 0..7
+
+        uabd    v6.8H, v1.8H, v16.8H        // absolute difference 0..7
+        uaba    v6.8H, v8.8H, v17.8H        // absolute difference accumulate 8..15
+        addv    h6, v6.8H                   // add up accumulator in v6
+        add     d0, d0, d6
+
+        mov     v2.16B, v3.16B              // pix3 -> pix2
+        mov     v4.16B, v5.16B              // pix3+1 -> pix2+1
+
+        b.ne    2b                          // branch based on subs instruction above
+        mov     w0, v0.s[0]                 // copy result to general purpose register
+        ret
+endfunc
diff --git a/libavcodec/me_cmp.c b/libavcodec/me_cmp.c
index b2f87d2e1b..60053a1b92 100644
--- a/libavcodec/me_cmp.c
+++ b/libavcodec/me_cmp.c
@@ -1064,6 +1064,8 @@ av_cold void ff_me_cmp_init(MECmpContext *c, AVCodecContext *avctx)
         ff_me_cmp_init_alpha(c, avctx);
     if (ARCH_ARM)
         ff_me_cmp_init_arm(c, avctx);
+    if (ARCH_AARCH64)
+        ff_me_cmp_init_aarch64(c, avctx);
     if (ARCH_PPC)
         ff_me_cmp_init_ppc(c, avctx);
     if (ARCH_X86)
diff --git a/libavcodec/me_cmp.h b/libavcodec/me_cmp.h
index e9b5161c9a..4dd059223d 100644
--- a/libavcodec/me_cmp.h
+++ b/libavcodec/me_cmp.h
@@ -82,6 +82,7 @@ typedef struct MECmpContext {
 void ff_me_cmp_init(MECmpContext *c, AVCodecContext *avctx);
 void ff_me_cmp_init_alpha(MECmpContext *c, AVCodecContext *avctx);
 void ff_me_cmp_init_arm(MECmpContext *c, AVCodecContext *avctx);
+void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx);
 void ff_me_cmp_init_ppc(MECmpContext *c, AVCodecContext *avctx);
 void ff_me_cmp_init_x86(MECmpContext *c, AVCodecContext *avctx);
 void ff_me_cmp_init_mips(MECmpContext *c, AVCodecContext *avctx);
-- 
2.32.0

