[FFmpeg-devel] [PATCH 0/3] Provide arm64 neon implementations

Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
 help / color / mirror / Atom feed

* [FFmpeg-devel] [PATCH 0/3] Provide arm64 neon implementations
@ 2022-09-13 11:58 Hubert Mazur
  2022-09-13 11:58 ` [FFmpeg-devel] [PATCH 1/3] lavc/aarch64: Add neon implementation for pix_median_abs16 Hubert Mazur
                   ` (2 more replies)
  0 siblings, 3 replies; 8+ messages in thread
From: Hubert Mazur @ 2022-09-13 11:58 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: gjb, upstream, jswinney, Hubert Mazur, martin, mw, spop

Add neon optimized implementations for functions from motion
estimation family. All functions tested and benchmarked
on AWS Graviton instances.

Hubert Mazur (3):
  lavc/aarch64: Add neon implementation for pix_median_abs16
  lavc/aarch64: Add neon implementation for vsad8_intra
  lavc/aarch64: Add neon implementation for pix_median_abs8

 libavcodec/aarch64/me_cmp_init_aarch64.c |  10 ++
 libavcodec/aarch64/me_cmp_neon.S         | 188 +++++++++++++++++++++++
 libavcodec/me_cmp.c                      |   5 +-
 3 files changed, 201 insertions(+), 2 deletions(-)

-- 
2.34.1

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 8+ messages in thread

* [FFmpeg-devel] [PATCH 1/3] lavc/aarch64: Add neon implementation for pix_median_abs16
  2022-09-13 11:58 [FFmpeg-devel] [PATCH 0/3] Provide arm64 neon implementations Hubert Mazur
@ 2022-09-13 11:58 ` Hubert Mazur
  2022-09-16 21:08   ` Martin Storsjö
  2022-09-13 11:58 ` [FFmpeg-devel] [PATCH 2/3] lavc/aarch64: Add neon implementation for vsad8_intra Hubert Mazur
  2022-09-13 11:58 ` [FFmpeg-devel] [PATCH 3/3] lavc/aarch64: Add neon implementation for pix_median_abs8 Hubert Mazur
  2 siblings, 1 reply; 8+ messages in thread
From: Hubert Mazur @ 2022-09-13 11:58 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: gjb, upstream, jswinney, Hubert Mazur, martin, mw, spop

Provide optimized implementation for pix_median_abs16 function.

Performance comparison tests are shown below.
 - median_sad_0_c: 722.0
 - median_sad_0_neon: 144.7

Benchmarks and tests run with checkasm tool on AWS Graviton 3.

Signed-off-by: Hubert Mazur <hum@semihalf.com>
---
 libavcodec/aarch64/me_cmp_init_aarch64.c |  4 ++
 libavcodec/aarch64/me_cmp_neon.S         | 81 ++++++++++++++++++++++++
 libavcodec/me_cmp.c                      |  5 +-
 3 files changed, 88 insertions(+), 2 deletions(-)

diff --git a/libavcodec/aarch64/me_cmp_init_aarch64.c b/libavcodec/aarch64/me_cmp_init_aarch64.c
index ade3e9a4c1..fb51a833be 100644
--- a/libavcodec/aarch64/me_cmp_init_aarch64.c
+++ b/libavcodec/aarch64/me_cmp_init_aarch64.c
@@ -53,6 +53,8 @@ int nsse16_neon(int multiplier, const uint8_t *s, const uint8_t *s2,
                 ptrdiff_t stride, int h);
 int nsse16_neon_wrapper(MpegEncContext *c, const uint8_t *s1, const uint8_t *s2,
                         ptrdiff_t stride, int h);
+int pix_median_abs16_neon(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
+                          ptrdiff_t stride, int h);
 
 av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
 {
@@ -78,6 +80,8 @@ av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
         c->vsse[4] = vsse_intra16_neon;
 
         c->nsse[0] = nsse16_neon_wrapper;
+
+        c->median_sad[0] = pix_median_abs16_neon;
     }
 }
 
diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S
index f8998749a5..a4a4344f42 100644
--- a/libavcodec/aarch64/me_cmp_neon.S
+++ b/libavcodec/aarch64/me_cmp_neon.S
@@ -969,3 +969,84 @@ function nsse16_neon, export=1
 
         ret
 endfunc
+
+function pix_median_abs16_neon, export=1
+        // x0           unused (MpegEncContext *v in the C prototype)
+        // x1           uint8_t *pix1
+        // x2           uint8_t *pix2
+        // x3           ptrdiff_t stride
+        // w4           int h
+        // w0 = sum of |V - prediction| over 16 columns and h rows, with V(x) = pix1[x] - pix2[x]
+        ld1             {v2.16b}, [x1], x3                      // first row of pix1, x1 += stride
+        ld1             {v3.16b}, [x2], x3                      // first row of pix2, x2 += stride
+        movi            v31.8h, #0                              // accumulator, low half
+        movi            v16.8h, #0                              // accumulator, high half
+        ext             v0.16b, v2.16b, v2.16b, #1              // pix1 row rotated by one byte (last lane wraps to byte 0)
+        ext             v1.16b, v3.16b, v3.16b, #1              // pix2 row rotated the same way
+        usubl           v28.8h, v2.8b, v3.8b                    // V(0..7)
+        usubl2          v27.8h, v2.16b, v3.16b                  // V(8..15)
+        usubl           v26.8h, v0.8b, v1.8b                    // V(1..8)
+        usubl2          v25.8h, v0.16b, v1.16b                  // V(9..15), top lane holds the wrapped V(0)
+        sub             w4, w4, #1                              // we need to make h-1 iterations
+        saba            v31.8h, v26.8h, v28.8h                  // += |V(j+1) - V(j)|, j = 0..7
+        saba            v16.8h, v25.8h, v27.8h                  // += |V(j+1) - V(j)|, j = 8..14; top lane is cleared at 2:
+        mov             h18, v28.h[0]                           // V(0) of the first row
+        cmp             w4, #1
+        sqabs           h18, h18                                // v18 starts as |V(0)|
+
+        b.lt            2f                                      // h - 1 < 1: no further rows to process
+1:
+
+        ld1             {v6.16b}, [x1], x3                      // pix1 vector for V(j-1)
+        ld1             {v7.16b}, [x2], x3                      // pix2 vector for V(j-1)
+        subs            w4, w4, #1                              // sets the flags tested by b.ne at the loop end
+        mov             v2.16b, v6.16b                          // NOTE(review): v2 is not read again in this function
+        mov             v3.16b, v7.16b                          // NOTE(review): v3 is not read again in this function
+        ext             v4.16b, v6.16b, v6.16b, #1              // pix1 vector for V(j)
+        ext             v5.16b, v7.16b, v7.16b, #1              // pix2 vector for V(j)
+
+        // protected registers: v30, v29, v28, v27, v26, v25, v24, v23
+        // scratch registers: v22, v21, v20, v19, v17
+
+        // The prediction for V(j) is the median of: the value above, the value to the left,
+        // and (above + left - above-left). The median is built from the smin/smax pairs below.
+        usubl           v30.8h, v6.8b, v7.8b                    // V(j-1)
+        usubl2          v29.8h, v6.16b, v7.16b                  // V(j-1)
+        usubl           v24.8h, v4.8b, v5.8b                    // V(j)
+        usubl2          v23.8h, v4.16b, v5.16b                  // V(j)
+        mov             v0.16b, v4.16b                          // NOTE(review): v0 is not read again in this function
+        mov             v1.16b, v5.16b                          // NOTE(review): v1 is not read again in this function
+        sabd            v20.8h, v30.8h, v28.8h                  // |current row - previous row| per column
+        mov             h17, v20.h[0]                           // keep column 0 only
+        add             d18, d18, d17                           // v18 += |V(0) - V(0) of the previous row|
+        add             v22.8h, v26.8h, v30.8h                  // above + left (low half)
+        smin            v20.8h, v26.8h, v30.8h                  // min(above, left)
+        add             v21.8h, v25.8h, v29.8h                  // above + left (high half)
+        smax            v19.8h, v26.8h, v30.8h                  // max(above, left)
+        sub             v22.8h, v22.8h, v28.8h                  // above + left - above-left (low half)
+        sub             v21.8h, v21.8h, v27.8h                  // above + left - above-left (high half)
+        smin            v17.8h, v19.8h, v22.8h                  // min(max(above, left), third candidate)
+        smin            v22.8h, v25.8h, v29.8h                  // high half: min(above, left)
+        mov             v28.16b, v30.16b                        // current row becomes the previous row
+        smax            v20.8h, v20.8h, v17.8h                  // median values lower half
+        smax            v19.8h, v25.8h, v29.8h                  // high half: max(above, left)
+        saba            v31.8h, v24.8h, v20.8h                  // += |V(j) - median|, low half
+        mov             v27.16b, v29.16b                        // carry the row over (high half)
+        smin            v19.8h, v19.8h, v21.8h                  // high half: min(max(above, left), third candidate)
+        mov             v26.16b, v24.16b                        // carry the shifted row over (low half)
+        smax            v17.8h, v22.8h, v19.8h                  // median values upper half
+        mov             v25.16b, v23.16b                        // carry the shifted row over (high half)
+        saba            v16.8h, v23.8h, v17.8h                  // += |V(j) - median|, high half
+
+        b.ne            1b                                      // repeat until the row counter reaches zero
+
+2:
+        ins             v16.h[7], wzr                           // drop the lane built from the wrapped-around byte
+        add             v31.8h, v31.8h, v16.8h                  // merge both halves
+        uaddlv          s17, v31.8h                             // horizontal sum of the eight lanes
+        add             d18, d18, d17                           // add the column-0 total kept in v18
+        fmov            w0, s18                                 // return value
+
+        ret
+
+endfunc
diff --git a/libavcodec/me_cmp.c b/libavcodec/me_cmp.c
index 4242fbc6e4..230e7ea54a 100644
--- a/libavcodec/me_cmp.c
+++ b/libavcodec/me_cmp.c
@@ -1048,6 +1048,9 @@ av_cold void ff_me_cmp_init(MECmpContext *c, AVCodecContext *avctx)
     ff_dsputil_init_dwt(c);
 #endif
 
+c->median_sad[0] = pix_median_abs16_c;
+c->median_sad[1] = pix_median_abs8_c;
+
 #if ARCH_AARCH64
     ff_me_cmp_init_aarch64(c, avctx);
 #elif ARCH_ALPHA
@@ -1062,6 +1065,4 @@ av_cold void ff_me_cmp_init(MECmpContext *c, AVCodecContext *avctx)
     ff_me_cmp_init_mips(c, avctx);
 #endif
 
-    c->median_sad[0] = pix_median_abs16_c;
-    c->median_sad[1] = pix_median_abs8_c;
 }
-- 
2.34.1

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [FFmpeg-devel] [PATCH 1/3] lavc/aarch64: Add neon implementation for pix_median_abs16
  2022-09-13 11:58 ` [FFmpeg-devel] [PATCH 1/3] lavc/aarch64: Add neon implementation for pix_median_abs16 Hubert Mazur
@ 2022-09-16 21:08   ` Martin Storsjö
  0 siblings, 0 replies; 8+ messages in thread
From: Martin Storsjö @ 2022-09-16 21:08 UTC (permalink / raw)
  To: Hubert Mazur; +Cc: gjb, upstream, jswinney, ffmpeg-devel, mw, spop

On Tue, 13 Sep 2022, Hubert Mazur wrote:

> Provide optimized implementation for pix_median_abs16 function.
>
> Performance comparison tests are shown below.
> - median_sad_0_c: 722.0
> - median_sad_0_neon: 144.7
>
> Benchmarks and tests run with checkasm tool on AWS Graviton 3.
>
> Signed-off-by: Hubert Mazur <hum@semihalf.com>
> ---
> libavcodec/aarch64/me_cmp_init_aarch64.c |  4 ++
> libavcodec/aarch64/me_cmp_neon.S         | 81 ++++++++++++++++++++++++
> libavcodec/me_cmp.c                      |  5 +-
> 3 files changed, 88 insertions(+), 2 deletions(-)
>
> diff --git a/libavcodec/aarch64/me_cmp_init_aarch64.c b/libavcodec/aarch64/me_cmp_init_aarch64.c
> index ade3e9a4c1..fb51a833be 100644
> --- a/libavcodec/aarch64/me_cmp_init_aarch64.c
> +++ b/libavcodec/aarch64/me_cmp_init_aarch64.c
> @@ -53,6 +53,8 @@ int nsse16_neon(int multiplier, const uint8_t *s, const uint8_t *s2,
>                 ptrdiff_t stride, int h);
> int nsse16_neon_wrapper(MpegEncContext *c, const uint8_t *s1, const uint8_t *s2,
>                         ptrdiff_t stride, int h);
> +int pix_median_abs16_neon(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
> +                          ptrdiff_t stride, int h);
> 
> av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
> {
> @@ -78,6 +80,8 @@ av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
>         c->vsse[4] = vsse_intra16_neon;
>
>         c->nsse[0] = nsse16_neon_wrapper;
> +
> +        c->median_sad[0] = pix_median_abs16_neon;
>     }
> }
> 
> diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S
> index f8998749a5..a4a4344f42 100644
> --- a/libavcodec/aarch64/me_cmp_neon.S
> +++ b/libavcodec/aarch64/me_cmp_neon.S
> @@ -969,3 +969,84 @@ function nsse16_neon, export=1
>
>         ret
> endfunc
> +
> +function pix_median_abs16_neon, export=1
> +        // x0           unused
> +        // x1           uint8_t *pix1
> +        // x2           uint8_t *pix2
> +        // x3           ptrdiff_t stride
> +        // w4           int h
> +
> +        ld1             {v2.16b}, [x1], x3
> +        ld1             {v3.16b}, [x2], x3
> +        movi            v31.8h, #0
> +        movi            v16.8h, #0
> +        ext             v0.16b, v2.16b, v2.16b, #1
> +        ext             v1.16b, v3.16b, v3.16b, #1
> +        usubl           v28.8h, v2.8b, v3.8b
> +        usubl2          v27.8h, v2.16b, v3.16b
> +        usubl           v26.8h, v0.8b, v1.8b
> +        usubl2          v25.8h, v0.16b, v1.16b
> +        sub             w4, w4, #1                              // we need to make h-1 iterations
> +        saba            v31.8h, v26.8h, v28.8h
> +        saba            v16.8h, v25.8h, v27.8h
> +        mov             h18, v28.h[0]

> +        cmp             w4, #1
> +        sqabs           h18, h18
> +
> +        b.lt            2f
> +1:
> +
> +        ld1             {v6.16b}, [x1], x3                      // pix1 vector for V(j-1)
> +        ld1             {v7.16b}, [x2], x3                      // pix2 vector for V(j-1)
> +        subs            w4, w4, #1
> +        mov             v2.16b, v6.16b
> +        mov             v3.16b, v7.16b

These two mov instructions seem unnecessary?

> +        ext             v4.16b, v6.16b, v6.16b, #1              // pix1 vector for V(j)
> +        ext             v5.16b, v7.16b, v7.16b, #1              // pix2 vector for V(j)
> +
> +        // protected registers: v30, v29, v28, v27, v26, v25, v24, v23
> +        // scratch registers: v22, v21, v20, v19, v17
> +
> +        // To find median of three values, calculate sum of them
> +        // and subtract max and min value from it.
> +        usubl           v30.8h, v6.8b, v7.8b                    // V(j-1)
> +        usubl2          v29.8h, v6.16b, v7.16b                  // V(j-1)
> +        usubl           v24.8h, v4.8b, v5.8b                    // V(j)
> +        usubl2          v23.8h, v4.16b, v5.16b                  // V(j)
> +        mov             v0.16b, v4.16b
> +        mov             v1.16b, v5.16b

These two movs are unused, too, right?

> +        sabd            v20.8h, v30.8h, v28.8h
> +        mov             h17, v20.h[0]
> +        add             d18, d18, d17

These are quite suboptimally scheduled here. However, we shouldn't need 
them.

In general, try to avoid these single-element calculations if not strictly 
necessary. You can just keep using both the input (here, v20) and the 
accumulator (v18) as a .4h vector, where you only care about the first 
element. Then at the very end you can extract the individual first 
element from it, instead of doing it every round in the loop. Then you can 
potentially change sabd into saba too, unless the non-accumulated result 
is needed too.

> +        add             v22.8h, v26.8h, v30.8h
> +        smin            v20.8h, v26.8h, v30.8h
> +        add             v21.8h, v25.8h, v29.8h
> +        smax            v19.8h, v26.8h, v30.8h
> +        sub             v22.8h, v22.8h, v28.8h
> +        sub             v21.8h, v21.8h, v27.8h
> +        smin            v17.8h, v19.8h, v22.8h
> +        smin            v22.8h, v25.8h, v29.8h
> +        mov             v28.16b, v30.16b
> +        smax            v20.8h, v20.8h, v17.8h                  // median values lower half
> +        smax            v19.8h, v25.8h, v29.8h
> +        saba            v31.8h, v24.8h, v20.8h
> +        mov             v27.16b, v29.16b
> +        smin            v19.8h, v19.8h, v21.8h
> +        mov             v26.16b, v24.16b
> +        smax            v17.8h, v22.8h, v19.8h                  // median values upper half
> +        mov             v25.16b, v23.16b
> +        saba            v16.8h, v23.8h, v17.8h
> +
> +        b.ne            1b
> +
> +2:
> +        ins             v16.h[7], wzr
> +        add             v31.8h, v31.8h, v16.8h
> +        uaddlv          s17, v31.8h
> +        add             d18, d18, d17
> +        fmov            w0, s18
> +
> +        ret
> +
> +endfunc
> diff --git a/libavcodec/me_cmp.c b/libavcodec/me_cmp.c
> index 4242fbc6e4..230e7ea54a 100644
> --- a/libavcodec/me_cmp.c
> +++ b/libavcodec/me_cmp.c
> @@ -1048,6 +1048,9 @@ av_cold void ff_me_cmp_init(MECmpContext *c, AVCodecContext *avctx)
>     ff_dsputil_init_dwt(c);
> #endif
> 
> +c->median_sad[0] = pix_median_abs16_c;
> +c->median_sad[1] = pix_median_abs8_c;
> +

These are incorrectly indented.

Other than that, this seems reasonable I think.


// Martin

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 8+ messages in thread

* [FFmpeg-devel] [PATCH 2/3] lavc/aarch64: Add neon implementation for vsad8_intra
  2022-09-13 11:58 [FFmpeg-devel] [PATCH 0/3] Provide arm64 neon implementations Hubert Mazur
  2022-09-13 11:58 ` [FFmpeg-devel] [PATCH 1/3] lavc/aarch64: Add neon implementation for pix_median_abs16 Hubert Mazur
@ 2022-09-13 11:58 ` Hubert Mazur
  2022-09-16 21:15   ` Martin Storsjö
  2022-09-13 11:58 ` [FFmpeg-devel] [PATCH 3/3] lavc/aarch64: Add neon implementation for pix_median_abs8 Hubert Mazur
  2 siblings, 1 reply; 8+ messages in thread
From: Hubert Mazur @ 2022-09-13 11:58 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: gjb, upstream, jswinney, Hubert Mazur, martin, mw, spop

Provide optimized implementation for pix_median_abs16 function.

Performance comparison tests are shown below.
- vsad_5_c: 94.7
- vsad_5_neon: 20.7

Benchmarks and tests run with checkasm tool on AWS Graviton 3.

Signed-off-by: Hubert Mazur <hum@semihalf.com>
---
 libavcodec/aarch64/me_cmp_init_aarch64.c |  3 ++
 libavcodec/aarch64/me_cmp_neon.S         | 42 ++++++++++++++++++++++++
 2 files changed, 45 insertions(+)

diff --git a/libavcodec/aarch64/me_cmp_init_aarch64.c b/libavcodec/aarch64/me_cmp_init_aarch64.c
index fb51a833be..d3fa047a86 100644
--- a/libavcodec/aarch64/me_cmp_init_aarch64.c
+++ b/libavcodec/aarch64/me_cmp_init_aarch64.c
@@ -45,6 +45,8 @@ int vsad16_neon(MpegEncContext *c, const uint8_t *s1, const uint8_t *s2,
                 ptrdiff_t stride, int h);
 int vsad_intra16_neon(MpegEncContext *c, const uint8_t *s, const uint8_t *dummy,
                       ptrdiff_t stride, int h) ;
+int vsad_intra8_neon(MpegEncContext *c, const uint8_t *s, const uint8_t *dummy,
+                     ptrdiff_t stride, int h) ;
 int vsse16_neon(MpegEncContext *c, const uint8_t *s1, const uint8_t *s2,
                 ptrdiff_t stride, int h);
 int vsse_intra16_neon(MpegEncContext *c, const uint8_t *s, const uint8_t *dummy,
@@ -75,6 +77,7 @@ av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
 
         c->vsad[0] = vsad16_neon;
         c->vsad[4] = vsad_intra16_neon;
+        c->vsad[5] = vsad_intra8_neon;
 
         c->vsse[0] = vsse16_neon;
         c->vsse[4] = vsse_intra16_neon;
diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S
index a4a4344f42..73701bd353 100644
--- a/libavcodec/aarch64/me_cmp_neon.S
+++ b/libavcodec/aarch64/me_cmp_neon.S
@@ -1050,3 +1050,45 @@ function pix_median_abs16_neon, export=1
         ret
 
 endfunc
+
+function vsad_intra8_neon, export=1
+        // x0           unused
+        // x1           uint8_t *pix1
+        // x2           uint8_t *dummy
+        // x3           ptrdiff_t stride
+        // w4           int h
+        // w0 = sum over 8 columns of |row(i) - row(i+1)| for consecutive rows of pix1
+        ld1             {v0.8b}, [x1], x3                       // first row, x1 += stride
+        sub             w4, w4, #1 // we need to make h-1 iterations
+        cmp             w4, #3
+        movi            v16.8h, #0                              // eight 16-bit accumulators
+        b.lt            2f                                      // fewer than 3 row pairs: use the one-row loop only
+
+1:
+        // v = abs( pix1[0] - pix1[0 + stride] )
+        // score = sum(v)
+        ld1             {v1.8b}, [x1], x3                       // next row
+        ld1             {v2.8b}, [x1], x3                       // row after that
+        uabal           v16.8h, v0.8b, v1.8b                    // accumulate first pair
+        ld1             {v3.8b}, [x1], x3                       // third new row
+        sub             w4, w4, #3                              // three row pairs handled per pass
+        uabal           v16.8h, v1.8b, v2.8b                    // accumulate second pair
+        cmp             w4, #3
+        mov             v0.8b, v3.8b                            // last loaded row starts the next pair
+        uabal           v16.8h, v2.8b, v3.8b                    // accumulate third pair
+        b.ge            1b                                      // at least 3 pairs left
+        cbz             w4, 3f                                  // nothing left over
+        // NOTE(review): loop 2 loads before testing w4, so h == 1 (w4 == 0 on entry) is not handled; presumably h >= 2 - confirm
+2:
+        ld1             {v1.8b}, [x1], x3                       // one more row
+        subs            w4, w4, #1
+        uabal           v16.8h, v0.8b, v1.8b                    // accumulate this pair
+        mov             v0.8b, v1.8b                            // becomes the previous row
+        cbnz            w4, 2b
+
+3:
+        uaddlv          s17, v16.8h                             // horizontal sum of the eight lanes
+        fmov            w0, s17                                 // return value
+
+        ret
+endfunc
-- 
2.34.1

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [FFmpeg-devel] [PATCH 2/3] lavc/aarch64: Add neon implementation for vsad8_intra
  2022-09-13 11:58 ` [FFmpeg-devel] [PATCH 2/3] lavc/aarch64: Add neon implementation for vsad8_intra Hubert Mazur
@ 2022-09-16 21:15   ` Martin Storsjö
  0 siblings, 0 replies; 8+ messages in thread
From: Martin Storsjö @ 2022-09-16 21:15 UTC (permalink / raw)
  To: Hubert Mazur; +Cc: gjb, upstream, jswinney, ffmpeg-devel, mw, spop

On Tue, 13 Sep 2022, Hubert Mazur wrote:

> Provide optimized implementation for pix_median_abs16 function.

You forgot to update this part of the commit message.

> Performance comparison tests are shown below.
> - vsad_5_c: 94.7
> - vsad_5_neon: 20.7
>
> Benchmarks and tests run with checkasm tool on AWS Graviton 3.
>
> Signed-off-by: Hubert Mazur <hum@semihalf.com>
> ---
> libavcodec/aarch64/me_cmp_init_aarch64.c |  3 ++
> libavcodec/aarch64/me_cmp_neon.S         | 42 ++++++++++++++++++++++++
> 2 files changed, 45 insertions(+)
>
> diff --git a/libavcodec/aarch64/me_cmp_init_aarch64.c b/libavcodec/aarch64/me_cmp_init_aarch64.c
> index fb51a833be..d3fa047a86 100644
> --- a/libavcodec/aarch64/me_cmp_init_aarch64.c
> +++ b/libavcodec/aarch64/me_cmp_init_aarch64.c
> @@ -45,6 +45,8 @@ int vsad16_neon(MpegEncContext *c, const uint8_t *s1, const uint8_t *s2,
>                 ptrdiff_t stride, int h);
> int vsad_intra16_neon(MpegEncContext *c, const uint8_t *s, const uint8_t *dummy,
>                       ptrdiff_t stride, int h) ;
> +int vsad_intra8_neon(MpegEncContext *c, const uint8_t *s, const uint8_t *dummy,
> +                     ptrdiff_t stride, int h) ;
> int vsse16_neon(MpegEncContext *c, const uint8_t *s1, const uint8_t *s2,
>                 ptrdiff_t stride, int h);
> int vsse_intra16_neon(MpegEncContext *c, const uint8_t *s, const uint8_t *dummy,
> @@ -75,6 +77,7 @@ av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
>
>         c->vsad[0] = vsad16_neon;
>         c->vsad[4] = vsad_intra16_neon;
> +        c->vsad[5] = vsad_intra8_neon;
>
>         c->vsse[0] = vsse16_neon;
>         c->vsse[4] = vsse_intra16_neon;
> diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S
> index a4a4344f42..73701bd353 100644
> --- a/libavcodec/aarch64/me_cmp_neon.S
> +++ b/libavcodec/aarch64/me_cmp_neon.S
> @@ -1050,3 +1050,45 @@ function pix_median_abs16_neon, export=1
>         ret
> 
> endfunc
> +
> +function vsad_intra8_neon, export=1
> +        // x0           unused
> +        // x1           uint8_t *pix1
> +        // x2           uint8_t *dummy
> +        // x3           ptrdiff_t stride
> +        // w4           int h
> +
> +        ld1             {v0.8b}, [x1], x3
> +        sub             w4, w4, #1 // we need to make h-1 iterations
> +        cmp             w4, #3
> +        movi            v16.8h, #0
> +        b.lt            2f
> +
> +1:
> +        // v = abs( pix1[0] - pix1[0 + stride] )
> +        // score = sum(v)
> +        ld1             {v1.8b}, [x1], x3
> +        ld1             {v2.8b}, [x1], x3
> +        uabal           v16.8h, v0.8b, v1.8b
> +        ld1             {v3.8b}, [x1], x3
> +        sub             w4, w4, #3

Instinctively, I'd prefer to move the sub instruction up to between the 
first two ld1 instructions here. However I don't see any change in 
benchmarks on Cortex A53 due to that, so it's not strictly necessary, but 
I'd prefer it that way.

Other than that, this looks very reasonable and straightforward.

// Martin

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 8+ messages in thread

* [FFmpeg-devel] [PATCH 3/3] lavc/aarch64: Add neon implementation for pix_median_abs8
  2022-09-13 11:58 [FFmpeg-devel] [PATCH 0/3] Provide arm64 neon implementations Hubert Mazur
  2022-09-13 11:58 ` [FFmpeg-devel] [PATCH 1/3] lavc/aarch64: Add neon implementation for pix_median_abs16 Hubert Mazur
  2022-09-13 11:58 ` [FFmpeg-devel] [PATCH 2/3] lavc/aarch64: Add neon implementation for vsad8_intra Hubert Mazur
@ 2022-09-13 11:58 ` Hubert Mazur
  2022-09-16 21:17   ` Martin Storsjö
  2 siblings, 1 reply; 8+ messages in thread
From: Hubert Mazur @ 2022-09-13 11:58 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: gjb, upstream, jswinney, Hubert Mazur, martin, mw, spop

Provide optimized implementation for pix_median_abs16 function.

Performance comparison tests are shown below.
- median_sad_1_c: 273.7
- median_sad_1_neon: 98.2

Benchmarks and tests run with checkasm tool on AWS Graviton 3.

Signed-off-by: Hubert Mazur <hum@semihalf.com>
---
 libavcodec/aarch64/me_cmp_init_aarch64.c |  3 ++
 libavcodec/aarch64/me_cmp_neon.S         | 65 ++++++++++++++++++++++++
 2 files changed, 68 insertions(+)

diff --git a/libavcodec/aarch64/me_cmp_init_aarch64.c b/libavcodec/aarch64/me_cmp_init_aarch64.c
index d3fa047a86..e143f0816e 100644
--- a/libavcodec/aarch64/me_cmp_init_aarch64.c
+++ b/libavcodec/aarch64/me_cmp_init_aarch64.c
@@ -57,6 +57,8 @@ int nsse16_neon_wrapper(MpegEncContext *c, const uint8_t *s1, const uint8_t *s2,
                         ptrdiff_t stride, int h);
 int pix_median_abs16_neon(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
                           ptrdiff_t stride, int h);
+int pix_median_abs8_neon(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
+                         ptrdiff_t stride, int h);
 
 av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
 {
@@ -85,6 +87,7 @@ av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
         c->nsse[0] = nsse16_neon_wrapper;
 
         c->median_sad[0] = pix_median_abs16_neon;
+        c->median_sad[1] = pix_median_abs8_neon;
     }
 }
 
diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S
index 73701bd353..2017d0ce63 100644
--- a/libavcodec/aarch64/me_cmp_neon.S
+++ b/libavcodec/aarch64/me_cmp_neon.S
@@ -1092,3 +1092,68 @@ function vsad_intra8_neon, export=1
 
         ret
 endfunc
+
+function pix_median_abs8_neon, export=1
+        // x0           unused
+        // x1           uint8_t *pix1
+        // x2           uint8_t *pix2
+        // x3           ptrdiff_t stride
+        // w4           int h
+        // w0 = sum of |V - prediction| over 8 columns and h rows, with V(x) = pix1[x] - pix2[x]
+        ld1             {v2.8b}, [x1], x3                       // first row of pix1, x1 += stride
+        ld1             {v3.8b}, [x2], x3                       // first row of pix2, x2 += stride
+        movi            v31.8h, #0                              // accumulator
+        ext             v0.8b, v2.8b, v2.8b, #1                 // pix1 row rotated by one byte (last lane wraps to byte 0)
+        ext             v1.8b, v3.8b, v3.8b, #1                 // pix2 row rotated the same way
+        usubl           v28.8h, v2.8b, v3.8b                    // V(0..7)
+        usubl           v26.8h, v0.8b, v1.8b                    // V(1..7), top lane holds the wrapped V(0)
+        sub             w4, w4, #1                              // we need to make h-1 iterations
+        saba            v31.8h, v26.8h, v28.8h                  // += |V(j+1) - V(j)|; top lane is cleared at 2:
+        mov             h18, v28.h[0]                           // V(0) of the first row
+        cmp             w4, #1
+        sqabs           h18, h18                                // v18 starts as |V(0)|
+
+        b.lt            2f                                      // h - 1 < 1: no further rows to process
+1:
+        ld1             {v6.8b}, [x1], x3                      // pix1 vector for V(j-1)
+        ld1             {v7.8b}, [x2], x3                      // pix2 vector for V(j-1)
+        subs            w4, w4, #1                              // sets the flags tested by b.ne at the loop end
+        mov             v2.8b, v6.8b                            // NOTE(review): v2 is not read again in this function
+        mov             v3.8b, v7.8b                            // NOTE(review): v3 is not read again in this function
+        ext             v4.8b, v6.8b, v6.8b, #1              // pix1 vector for V(j)
+        ext             v5.8b, v7.8b, v7.8b, #1              // pix2 vector for V(j)
+
+        // protected registers: v30, v29, v28, v27, v26, v25, v24, v23
+        // scratch registers: v22, v21, v20, v19, v17
+
+        // The prediction for V(j) is the median of: the value above, the value to the left,
+        // and (above + left - above-left). The median is built from the smin/smax pairs below.
+        usubl           v30.8h, v6.8b, v7.8b                    // V(j-1)
+        usubl           v24.8h, v4.8b, v5.8b                    // V(j)
+        mov             v0.16b, v4.16b                          // NOTE(review): v0 is not read again in this function
+        mov             v1.16b, v5.16b                          // NOTE(review): v1 is not read again in this function
+        sabd            v20.8h, v30.8h, v28.8h                  // |current row - previous row| per column
+        mov             h17, v20.h[0]                           // keep column 0 only
+        add             d18, d18, d17                           // v18 += |V(0) - V(0) of the previous row|
+        add             v22.8h, v26.8h, v30.8h                  // above + left
+        smin            v20.8h, v26.8h, v30.8h                  // min(above, left)
+        smax            v19.8h, v26.8h, v30.8h                  // max(above, left)
+        sub             v22.8h, v22.8h, v28.8h                  // above + left - above-left
+        smin            v17.8h, v19.8h, v22.8h                  // min(max(above, left), third candidate)
+        mov             v28.16b, v30.16b                        // current row becomes the previous row
+        smax            v20.8h, v20.8h, v17.8h                  // median values lower half
+        smax            v19.8h, v25.8h, v29.8h                  // NOTE(review): v25/v29 are never written in this function
+        saba            v31.8h, v24.8h, v20.8h                  // += |V(j) - median|
+        mov             v26.16b, v24.16b                        // carry the shifted row over
+        smax            v17.8h, v22.8h, v19.8h                  // NOTE(review): this v17 is overwritten before any read
+
+        b.ne            1b                                      // repeat until the row counter reaches zero
+2:
+        ins             v31.h[7], wzr                           // drop the lane built from the wrapped-around byte
+        uaddlv          s17, v31.8h                             // horizontal sum of the eight lanes
+        add             d18, d18, d17                           // add the column-0 total kept in v18
+        fmov            w0, s18                                 // return value
+
+        ret
+
+endfunc
-- 
2.34.1

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [FFmpeg-devel] [PATCH 3/3] lavc/aarch64: Add neon implementation for pix_median_abs8
  2022-09-13 11:58 ` [FFmpeg-devel] [PATCH 3/3] lavc/aarch64: Add neon implementation for pix_median_abs8 Hubert Mazur
@ 2022-09-16 21:17   ` Martin Storsjö
  0 siblings, 0 replies; 8+ messages in thread
From: Martin Storsjö @ 2022-09-16 21:17 UTC (permalink / raw)
  To: Hubert Mazur; +Cc: gjb, upstream, jswinney, ffmpeg-devel, mw, spop

On Tue, 13 Sep 2022, Hubert Mazur wrote:

> Provide optimized implementation for pix_median_abs16 function.

Forgot to update this part of the commit message here too.

> Performance comparison tests are shown below.
> - median_sad_1_c: 273.7
> - median_sad_1_neon: 98.2
>
> Benchmarks and tests run with checkasm tool on AWS Graviton 3.
>
> Signed-off-by: Hubert Mazur <hum@semihalf.com>
> ---
> libavcodec/aarch64/me_cmp_init_aarch64.c |  3 ++
> libavcodec/aarch64/me_cmp_neon.S         | 65 ++++++++++++++++++++++++
> 2 files changed, 68 insertions(+)

The same comments as for patch 1/3 apply: it looks reasonable, but there are
a bunch of leftover mov instructions that I don't see a need for. Please
also avoid the extra single-lane handling; just do plain vector operations
and extract the single lane at the end.

// Martin

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 8+ messages in thread

* [FFmpeg-devel] [PATCH 0/3] Provide neon implementations
@ 2022-09-20 11:01 Hubert Mazur
  2022-09-20 11:01 ` [FFmpeg-devel] [PATCH 2/3] lavc/aarch64: Add neon implementation for vsad8_intra Hubert Mazur
  0 siblings, 1 reply; 8+ messages in thread
From: Hubert Mazur @ 2022-09-20 11:01 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: gjb, upstream, jswinney, Hubert Mazur, martin, mw, spop

This fixes the issues raised in review of the previous patchset:
 - move sub instruction in vsad8_intra,
 - remove unnecessary mov instructions,
 - remove single lane extraction in loop and place it at the end.

Removing the mov instructions from the pix_median_abs functions significantly
increased performance for both.

Hubert Mazur (3):
  lavc/aarch64: Add neon implementation for pix_median_abs16
  lavc/aarch64: Add neon implementation for vsad8_intra
  lavc/aarch64: Add neon implementation for pix_median_abs8

 libavcodec/aarch64/me_cmp_init_aarch64.c |  10 ++
 libavcodec/aarch64/me_cmp_neon.S         | 182 +++++++++++++++++++++++
 libavcodec/me_cmp.c                      |   5 +-
 3 files changed, 195 insertions(+), 2 deletions(-)

-- 
2.34.1

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 8+ messages in thread

* [FFmpeg-devel] [PATCH 2/3] lavc/aarch64: Add neon implementation for vsad8_intra
  2022-09-20 11:01 [FFmpeg-devel] [PATCH 0/3] Provide neon implementations Hubert Mazur
@ 2022-09-20 11:01 ` Hubert Mazur
  0 siblings, 0 replies; 8+ messages in thread
From: Hubert Mazur @ 2022-09-20 11:01 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: gjb, upstream, jswinney, Hubert Mazur, martin, mw, spop

Provide an optimized NEON implementation of the vsad_intra8 function.

Performance comparison tests are shown below.
- vsad_5_c: 94.7
- vsad_5_neon: 20.7

Benchmarks and tests were run with the checkasm tool on AWS Graviton 3.

Signed-off-by: Hubert Mazur <hum@semihalf.com>
---
 libavcodec/aarch64/me_cmp_init_aarch64.c |  3 ++
 libavcodec/aarch64/me_cmp_neon.S         | 42 ++++++++++++++++++++++++
 2 files changed, 45 insertions(+)

diff --git a/libavcodec/aarch64/me_cmp_init_aarch64.c b/libavcodec/aarch64/me_cmp_init_aarch64.c
index fb51a833be..d3fa047a86 100644
--- a/libavcodec/aarch64/me_cmp_init_aarch64.c
+++ b/libavcodec/aarch64/me_cmp_init_aarch64.c
@@ -45,6 +45,8 @@ int vsad16_neon(MpegEncContext *c, const uint8_t *s1, const uint8_t *s2,
                 ptrdiff_t stride, int h);
 int vsad_intra16_neon(MpegEncContext *c, const uint8_t *s, const uint8_t *dummy,
                       ptrdiff_t stride, int h) ;
+int vsad_intra8_neon(MpegEncContext *c, const uint8_t *s, const uint8_t *dummy,
+                     ptrdiff_t stride, int h) ;
 int vsse16_neon(MpegEncContext *c, const uint8_t *s1, const uint8_t *s2,
                 ptrdiff_t stride, int h);
 int vsse_intra16_neon(MpegEncContext *c, const uint8_t *s, const uint8_t *dummy,
@@ -75,6 +77,7 @@ av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
 
         c->vsad[0] = vsad16_neon;
         c->vsad[4] = vsad_intra16_neon;
+        c->vsad[5] = vsad_intra8_neon;
 
         c->vsse[0] = vsse16_neon;
         c->vsse[4] = vsse_intra16_neon;
diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S
index 1e08d04869..1587f2ea43 100644
--- a/libavcodec/aarch64/me_cmp_neon.S
+++ b/libavcodec/aarch64/me_cmp_neon.S
@@ -1047,3 +1047,45 @@ function pix_median_abs16_neon, export=1
         ret
 
 endfunc
+
+function vsad_intra8_neon, export=1
+        // x0           MpegEncContext *c (unused, never read here)
+        // x1           uint8_t *pix1, advanced by stride after every row load
+        // x2           uint8_t *dummy (unused, never read here)
+        // x3           ptrdiff_t stride, byte offset between consecutive rows
+        // w4           int h, number of rows; returns the 32-bit score in w0
+
+        ld1             {v0.8b}, [x1], x3 // v0 = first row (8 bytes); x1 += stride
+        sub             w4, w4, #1 // h rows give h-1 vertical row pairs, so h-1 iterations
+        cmp             w4, #3 // signed compare: are at least 3 row pairs left?
+        movi            v16.8h, #0 // v16 = eight 16-bit per-column accumulators, cleared
+        b.lt            2f // fewer than 3 pairs: tail loop only. NOTE(review): h == 1 enters 2: with w4 == 0 and wraps - confirm callers pass h >= 2
+
+1: // main loop: three row pairs per iteration, v0 holds the previous row on entry
+        // per column: v = abs( pix1[x] - pix1[x + stride] ), widened to 16 bits
+        // score = sum(v) over all columns and row pairs, kept per column in v16
+        ld1             {v1.8b}, [x1], x3 // v1 = next row
+        sub             w4, w4, #3 // three pairs are consumed by this iteration
+        ld1             {v2.8b}, [x1], x3 // v2 = row after v1
+        uabal           v16.8h, v0.8b, v1.8b // v16 += |v0 - v1|
+        ld1             {v3.8b}, [x1], x3 // v3 = row after v2
+        uabal           v16.8h, v1.8b, v2.8b // v16 += |v1 - v2|
+        cmp             w4, #3 // sets the flags consumed by b.ge below
+        mov             v0.8b, v3.8b // last loaded row becomes the previous row
+        uabal           v16.8h, v2.8b, v3.8b // v16 += |v2 - v3|
+        b.ge            1b // repeat while at least 3 pairs remain
+        cbz             w4, 3f // no pairs left: skip the tail loop
+
+2: // tail loop: one row pair per iteration
+        ld1             {v1.8b}, [x1], x3 // v1 = next row
+        subs            w4, w4, #1 // one pair consumed; flags set here are not read, cbnz tests w4
+        uabal           v16.8h, v0.8b, v1.8b // v16 += |v0 - v1|
+        mov             v0.8b, v1.8b // current row becomes the previous row
+        cbnz            w4, 2b // repeat until no pairs remain
+
+3: // reduce the per-column sums and return
+        uaddlv          s17, v16.8h // s17 = 32-bit sum of the eight 16-bit lanes
+        fmov            w0, s17 // move the score into the integer return register
+
+        ret
+endfunc
-- 
2.34.1

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 8+ messages in thread

end of thread, other threads:[~2022-09-20 11:02 UTC | newest]

Thread overview: 8+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-09-13 11:58 [FFmpeg-devel] [PATCH 0/3] Provide arm64 neon implementations Hubert Mazur
2022-09-13 11:58 ` [FFmpeg-devel] [PATCH 1/3] lavc/aarch64: Add neon implementation for pix_median_abs16 Hubert Mazur
2022-09-16 21:08   ` Martin Storsjö
2022-09-13 11:58 ` [FFmpeg-devel] [PATCH 2/3] lavc/aarch64: Add neon implementation for vsad8_intra Hubert Mazur
2022-09-16 21:15   ` Martin Storsjö
2022-09-13 11:58 ` [FFmpeg-devel] [PATCH 3/3] lavc/aarch64: Add neon implementation for pix_median_abs8 Hubert Mazur
2022-09-16 21:17   ` Martin Storsjö
2022-09-20 11:01 [FFmpeg-devel] [PATCH 0/3] Provide neon implementations Hubert Mazur
2022-09-20 11:01 ` [FFmpeg-devel] [PATCH 2/3] lavc/aarch64: Add neon implementation for vsad8_intra Hubert Mazur

Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

This inbox may be cloned and mirrored by anyone:

	git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
		ffmpegdev@gitmailbox.com
	public-inbox-index ffmpegdev

Example config snippet for mirrors.


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git