Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
 help / color / mirror / Atom feed
* [FFmpeg-devel] [PATCH 0/5] Add neon implementation for me_cmp functions
@ 2022-07-15  8:02 Hubert Mazur
  2022-07-15  8:02 ` [FFmpeg-devel] [PATCH 1/5] lavc/aarch64: Add neon implementation for sse16 Hubert Mazur
                   ` (4 more replies)
  0 siblings, 5 replies; 25+ messages in thread
From: Hubert Mazur @ 2022-07-15  8:02 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: gjb, upstream, jswinney, Hubert Mazur, martin, mw, spop

Add arm64 neon implementation for the following functions from
motion estimation. All functions were tested and benchmarked on
AWS Graviton 3 instances.

Hubert Mazur (5):
  lavc/aarch64: Add neon implementation for sse16
  lavc/aarch64: Add neon implementation for sse4
  lavc/aarch64: Add neon implementation for pix_abs16_y2
  lavc/aarch64: Add neon implementation for sse8
  lavc/aarch64: Add neon implementation for pix_abs8

 libavcodec/aarch64/me_cmp_init_aarch64.c |  17 ++
 libavcodec/aarch64/me_cmp_neon.S         | 346 +++++++++++++++++++++++
 2 files changed, 363 insertions(+)

-- 
2.34.1

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 25+ messages in thread

* [FFmpeg-devel] [PATCH 1/5] lavc/aarch64: Add neon implementation for sse16
  2022-07-15  8:02 [FFmpeg-devel] [PATCH 0/5] Add neon implementation for me_cmp functions Hubert Mazur
@ 2022-07-15  8:02 ` Hubert Mazur
  2022-07-25 11:12   ` [FFmpeg-devel] [PATCH] " Hubert Mazur
  2022-07-15  8:02 ` [FFmpeg-devel] [PATCH 2/5] lavc/aarch64: Add neon implementation for sse4 Hubert Mazur
                   ` (3 subsequent siblings)
  4 siblings, 1 reply; 25+ messages in thread
From: Hubert Mazur @ 2022-07-15  8:02 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: gjb, upstream, jswinney, Hubert Mazur, martin, mw, spop

Provide neon implementation for sse16 function.

Performance comparison tests are shown below.
- sse_0_c: 273.0
- sse_0_neon: 48.2

Benchmarks and tests run with checkasm tool on AWS Graviton 3.

Signed-off-by: Hubert Mazur <hum@semihalf.com>
---
 libavcodec/aarch64/me_cmp_init_aarch64.c |  4 ++
 libavcodec/aarch64/me_cmp_neon.S         | 82 ++++++++++++++++++++++++
 2 files changed, 86 insertions(+)

diff --git a/libavcodec/aarch64/me_cmp_init_aarch64.c b/libavcodec/aarch64/me_cmp_init_aarch64.c
index 136b008eb7..3ff5767bd0 100644
--- a/libavcodec/aarch64/me_cmp_init_aarch64.c
+++ b/libavcodec/aarch64/me_cmp_init_aarch64.c
@@ -30,6 +30,9 @@ int ff_pix_abs16_xy2_neon(MpegEncContext *s, uint8_t *blk1, uint8_t *blk2,
 int ff_pix_abs16_x2_neon(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                       ptrdiff_t stride, int h);
 
+int sse16_neon(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                  ptrdiff_t stride, int h);
+
 av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
 {
     int cpu_flags = av_get_cpu_flags();
@@ -40,5 +43,6 @@ av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
         c->pix_abs[0][3] = ff_pix_abs16_xy2_neon;
 
         c->sad[0] = ff_pix_abs16_neon;
+        c->sse[0] = sse16_neon;
     }
 }
diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S
index e49d049fc2..88cd335443 100644
--- a/libavcodec/aarch64/me_cmp_neon.S
+++ b/libavcodec/aarch64/me_cmp_neon.S
@@ -278,3 +278,85 @@ function ff_pix_abs16_x2_neon, export=1
 
         ret
 endfunc
+
+function sse16_neon, export=1
+        // x0 - unused
+        // x1 - pix1
+        // x2 - pix2
+        // x3 - stride
+        // w4 - h
+
+        cmp             w4, #4
+        movi            d18, #0
+        b.lt            2f
+
+// make 4 iterations at once
+1:
+
+        // res = abs(pix1[0] - pix2[0])
+        // res * res
+
+        ld1             {v0.16b}, [x1], x3
+        ld1             {v1.16b}, [x2], x3
+        uabd            v30.16b, v0.16b, v1.16b
+        ld1             {v2.16b}, [x1], x3
+        umull           v29.8h, v30.8b, v30.8b
+        ld1             {v3.16b}, [x2], x3
+        umull2          v28.8h, v30.16b, v30.16b
+        uabd            v27.16b, v2.16b, v3.16b
+        uaddlp          v17.4s, v29.8h
+        umull           v26.8h, v27.8b, v27.8b
+        umull2          v25.8h, v27.16b, v27.16b
+        ld1             {v4.16b}, [x1], x3
+        uadalp          v17.4s, v26.8h
+        ld1             {v5.16b}, [x2], x3
+        uadalp          v17.4s, v25.8h
+        uabd            v24.16b, v4.16b, v5.16b
+        ld1             {v6.16b}, [x1], x3
+        umull           v23.8h, v24.8b, v24.8b
+        uadalp          v17.4s, v23.8h
+        umull2          v22.8h, v24.16b, v24.16b
+        uadalp          v17.4s, v22.8h
+        ld1             {v7.16b}, [x2], x3
+        uadalp          v17.4s, v28.8h
+        uabd            v21.16b, v6.16b, v7.16b
+        umull           v20.8h, v21.8b, v21.8b
+        uadalp          v17.4s, v20.8h
+        umull2          v19.8h, v21.16b, v21.16b
+        uadalp          v17.4s, v19.8h
+
+        sub             w4, w4, #4
+        uaddlv          d16, v17.4s
+        cmp             w4, #4
+        add             d18, d18, d16
+
+        b.ge            1b
+
+        cbnz            w4, 2f
+        fmov            w0, s18
+
+        ret
+
+// iterate by one
+2:
+
+        ld1             {v0.16b}, [x1], x3
+        ld1             {v1.16b}, [x2], x3
+
+        uabd            v30.16b, v0.16b, v1.16b
+        umull           v29.8h, v0.8b, v1.8b
+        umull2          v28.8h, v0.16b, v1.16b
+        uaddlp          v17.4s, v29.8h
+        uadalp          v17.4s, v28.8h
+
+
+        subs            w4, w4, #1
+        uaddlv          d16, v17.4s
+        add             d18, d18, d16
+
+        b.ne            2b
+        fmov            w0, s18
+
+        ret
+
+endfunc
-- 
2.34.1

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 25+ messages in thread

* [FFmpeg-devel] [PATCH 2/5] lavc/aarch64: Add neon implementation for sse4
  2022-07-15  8:02 [FFmpeg-devel] [PATCH 0/5] Add neon implementation for me_cmp functions Hubert Mazur
  2022-07-15  8:02 ` [FFmpeg-devel] [PATCH 1/5] lavc/aarch64: Add neon implementation for sse16 Hubert Mazur
@ 2022-07-15  8:02 ` Hubert Mazur
  2022-07-21 21:43   ` Martin Storsjö
                     ` (2 more replies)
  2022-07-15  8:02 ` [FFmpeg-devel] [PATCH 3/5] lavc/aarch64: Add neon implementation for pix_abs16_y2 Hubert Mazur
                   ` (2 subsequent siblings)
  4 siblings, 3 replies; 25+ messages in thread
From: Hubert Mazur @ 2022-07-15  8:02 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: gjb, upstream, jswinney, Hubert Mazur, martin, mw, spop

Provide neon implementation for sse4 function.

Performance comparison tests are shown below.
- sse_2_c: 74.0
- sse_2_neon: 24.0

Benchmarks and tests are run with checkasm tool on AWS Graviton 3.

Signed-off-by: Hubert Mazur <hum@semihalf.com>
---
 libavcodec/aarch64/me_cmp_init_aarch64.c |  3 ++
 libavcodec/aarch64/me_cmp_neon.S         | 65 ++++++++++++++++++++++++
 2 files changed, 68 insertions(+)

diff --git a/libavcodec/aarch64/me_cmp_init_aarch64.c b/libavcodec/aarch64/me_cmp_init_aarch64.c
index 3ff5767bd0..72a2062e7e 100644
--- a/libavcodec/aarch64/me_cmp_init_aarch64.c
+++ b/libavcodec/aarch64/me_cmp_init_aarch64.c
@@ -32,6 +32,8 @@ int ff_pix_abs16_x2_neon(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
 
 int sse16_neon(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                   ptrdiff_t stride, int h);
+int sse4_neon(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                  ptrdiff_t stride, int h);
 
 av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
 {
@@ -44,5 +46,6 @@ av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
 
         c->sad[0] = ff_pix_abs16_neon;
         c->sse[0] = sse16_neon;
+        c->sse[2] = sse4_neon;
     }
 }
diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S
index 88cd335443..bacf151314 100644
--- a/libavcodec/aarch64/me_cmp_neon.S
+++ b/libavcodec/aarch64/me_cmp_neon.S
@@ -360,3 +360,68 @@ function sse16_neon, export=1
         ret
 
 endfunc
+
+function sse4_neon, export=1
+        // x0 - unused
+        // x1 - pix1
+        // x2 - pix2
+        // x3 - stride
+        // w4 - h
+
+        movi            d18, #0
+        movi            d17, #0
+        cmp             w4, #4
+        b.le            2f
+
+// make 4 iterations at once
+1:
+
+        // res = abs(pix1[0] - pix2[0])
+        // res * res
+
+        ld1             {v0.4b}, [x1], x3
+        ld1             {v1.4b}, [x2], x3
+        uabdl           v30.8h, v0.4b, v1.4b
+        ld1             {v2.4b}, [x1], x3
+        ld1             {v3.4b}, [x2], x3
+        umull           v16.4s, v30.4h, v30.4h
+        uabdl           v29.8h, v2.4b, v3.4b
+        ld1             {v4.4b}, [x1], x3
+        ld1             {v5.4b}, [x2], x3
+        umlal           v16.4s, v29.4h, v29.4h
+        uabdl           v28.8h, v4.4b, v5.4b
+        ld1             {v6.4b}, [x1], x3
+        ld1             {v7.4b}, [x2], x3
+        umlal           v16.4s, v28.4h, v28.4h
+        uabdl           v27.8h, v6.4b, v7.4b
+        umlal           v16.4s, v27.4h, v27.4h
+
+        uaddlv          d17, v16.4s
+        add             d18, d18, d17
+
+        sub             w4, w4, #4
+        cmp             w4, #4
+        b.ge            1b
+
+        cbnz            w4, 2f
+        fmov            w0, s18
+
+        ret
+
+// iterate by one
+2:
+        ld1             {v0.4b}, [x1], x3
+        ld1             {v1.4b}, [x2], x3
+        uabdl           v30.8h, v0.4b, v1.4b
+        umull           v16.4s, v30.4h, v30.4h
+
+        uaddlv          d17, v16.4s
+        add             d18, d18, d17
+
+        subs            w4, w4, #1
+        b.ne            2b
+        fmov            w0, s18
+
+        ret
+
+endfunc
-- 
2.34.1

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 25+ messages in thread

* [FFmpeg-devel] [PATCH 3/5] lavc/aarch64: Add neon implementation for pix_abs16_y2
  2022-07-15  8:02 [FFmpeg-devel] [PATCH 0/5] Add neon implementation for me_cmp functions Hubert Mazur
  2022-07-15  8:02 ` [FFmpeg-devel] [PATCH 1/5] lavc/aarch64: Add neon implementation for sse16 Hubert Mazur
  2022-07-15  8:02 ` [FFmpeg-devel] [PATCH 2/5] lavc/aarch64: Add neon implementation for sse4 Hubert Mazur
@ 2022-07-15  8:02 ` Hubert Mazur
  2022-07-25 11:17   ` [FFmpeg-devel] [PATCH] " Hubert Mazur
  2022-07-15  8:02 ` [FFmpeg-devel] [PATCH 4/5] lavc/aarch64: Add neon implementation for sse8 Hubert Mazur
  2022-07-15  8:02 ` [FFmpeg-devel] [PATCH 5/5] lavc/aarch64: Add neon implementation for pix_abs8 Hubert Mazur
  4 siblings, 1 reply; 25+ messages in thread
From: Hubert Mazur @ 2022-07-15  8:02 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: gjb, upstream, jswinney, Hubert Mazur, martin, mw, spop

Provide optimized implementation of pix_abs16_y2 function for arm64.

Performance comparison tests are shown below.
pix_abs_0_2_c: 308.5
pix_abs_0_2_neon: 39.2

Benchmarks and tests run with checkasm tool on AWS Graviton 3.

Signed-off-by: Hubert Mazur <hum@semihalf.com>
---
 libavcodec/aarch64/me_cmp_init_aarch64.c |  3 +
 libavcodec/aarch64/me_cmp_neon.S         | 74 ++++++++++++++++++++++++
 2 files changed, 77 insertions(+)

diff --git a/libavcodec/aarch64/me_cmp_init_aarch64.c b/libavcodec/aarch64/me_cmp_init_aarch64.c
index 72a2062e7e..07d62cc1e5 100644
--- a/libavcodec/aarch64/me_cmp_init_aarch64.c
+++ b/libavcodec/aarch64/me_cmp_init_aarch64.c
@@ -29,6 +29,8 @@ int ff_pix_abs16_xy2_neon(MpegEncContext *s, uint8_t *blk1, uint8_t *blk2,
                       ptrdiff_t stride, int h);
 int ff_pix_abs16_x2_neon(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                       ptrdiff_t stride, int h);
+int ff_pix_abs16_y2_neon(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                      ptrdiff_t stride, int h);
 
 int sse16_neon(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                   ptrdiff_t stride, int h);
@@ -42,6 +44,7 @@ av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
     if (have_neon(cpu_flags)) {
         c->pix_abs[0][0] = ff_pix_abs16_neon;
         c->pix_abs[0][1] = ff_pix_abs16_x2_neon;
+        c->pix_abs[0][2] = ff_pix_abs16_y2_neon;
         c->pix_abs[0][3] = ff_pix_abs16_xy2_neon;
 
         c->sad[0] = ff_pix_abs16_neon;
diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S
index bacf151314..858833b0ae 100644
--- a/libavcodec/aarch64/me_cmp_neon.S
+++ b/libavcodec/aarch64/me_cmp_neon.S
@@ -425,3 +425,77 @@ function sse4_neon, export=1
         ret
 
 endfunc
+
+function ff_pix_abs16_y2_neon, export=1
+        // x0           unused
+        // x1           uint8_t *pix1
+        // x2           uint8_t *pix2
+        // x3           ptrdiff_t stride
+        // x4           int h
+        // x5           uint8_t *pix2 + stride
+
+        // initialize buffers
+        movi            d18, #0
+        add             x5, x2, x3
+        cmp             w4, #4
+        b.lt            2f
+
+// make 4 iterations at once
+1:
+
+        // abs(pix1[0], avg2(pix2[0], pix2[0 + stride]))
+        // avg2(a, b) = (((a) + (b) + 1) >> 1)
+        // abs(x) = (x < 0 ? (-x) : (x))
+
+        ld1             {v1.16b}, [x2], x3
+        ld1             {v2.16b}, [x5], x3
+        urhadd          v30.16b, v1.16b, v2.16b
+        ld1             {v0.16b}, [x1], x3
+        uabdl           v29.8h, v0.8b, v30.8b
+        ld1             {v4.16b}, [x2], x3
+        uabdl2          v28.8h, v0.16b, v30.16b
+        ld1             {v5.16b}, [x5], x3
+        urhadd          v27.16b, v4.16b, v5.16b
+        ld1             {v3.16b}, [x1], x3
+        uabal           v29.8h, v3.8b, v27.8b
+        ld1             {v7.16b}, [x2], x3
+        uabal2          v28.8h, v3.16b, v27.16b
+        ld1             {v20.16b}, [x5], x3
+        urhadd          v26.16b, v7.16b, v20.16b
+        ld1             {v6.16b}, [x1], x3
+        uabal           v29.8h, v6.8b, v26.8b
+        ld1             {v22.16b}, [x2], x3
+        uabal2          v28.8h, v6.16b, v26.16b
+        ld1             {v23.16b}, [x5], x3
+        urhadd          v25.16b, v22.16b, v23.16b
+        ld1             {v21.16b}, [x1], x3
+        uabal           v29.8h, v21.8b, v25.8b
+        uabal2          v28.8h, v21.16b, v25.16b
+
+        add             v29.8h, v29.8h, v28.8h
+        sub             w4, w4, #4
+        uaddlv          s16, v29.8h
+        cmp             w4, #4
+        add             d18, d18, d16
+        b.ge            1b
+        cbz             w4, 3f
+
+// iterate by one
+2:
+
+        ld1             {v0.16b}, [x1], x3
+        ld1             {v1.16b}, [x2], x3
+        urhadd          v30.16b, v1.16b, v2.16b
+        ld1             {v2.16b}, [x5], x3
+        uabd            v30.16b, v30.16b, v30.16b
+
+        uaddlv          h17, v30.16b
+        subs            w4, w4, #1
+        add             d18, d18, d17
+        b.ne            2b
+
+3:
+        fmov            w0, s18
+
+        ret
+endfunc
-- 
2.34.1

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 25+ messages in thread

* [FFmpeg-devel] [PATCH 4/5] lavc/aarch64: Add neon implementation for sse8
  2022-07-15  8:02 [FFmpeg-devel] [PATCH 0/5] Add neon implementation for me_cmp functions Hubert Mazur
                   ` (2 preceding siblings ...)
  2022-07-15  8:02 ` [FFmpeg-devel] [PATCH 3/5] lavc/aarch64: Add neon implementation for pix_abs16_y2 Hubert Mazur
@ 2022-07-15  8:02 ` Hubert Mazur
  2022-07-25 11:18   ` [FFmpeg-devel] [PATCH] " Hubert Mazur
  2022-07-15  8:02 ` [FFmpeg-devel] [PATCH 5/5] lavc/aarch64: Add neon implementation for pix_abs8 Hubert Mazur
  4 siblings, 1 reply; 25+ messages in thread
From: Hubert Mazur @ 2022-07-15  8:02 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: gjb, upstream, jswinney, Hubert Mazur, martin, mw, spop

Provide optimized implementation of sse8 function for arm64.

Performance comparison tests are shown below.
- sse_1_c: 133.0
- sse_1_neon: 36.7

Benchmarks and tests run with checkasm tool on AWS Graviton 3.

Signed-off-by: Hubert Mazur <hum@semihalf.com>
---
 libavcodec/aarch64/me_cmp_init_aarch64.c |  3 +
 libavcodec/aarch64/me_cmp_neon.S         | 72 ++++++++++++++++++++++++
 2 files changed, 75 insertions(+)

diff --git a/libavcodec/aarch64/me_cmp_init_aarch64.c b/libavcodec/aarch64/me_cmp_init_aarch64.c
index 07d62cc1e5..89c817990c 100644
--- a/libavcodec/aarch64/me_cmp_init_aarch64.c
+++ b/libavcodec/aarch64/me_cmp_init_aarch64.c
@@ -34,6 +34,8 @@ int ff_pix_abs16_y2_neon(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
 
 int sse16_neon(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                   ptrdiff_t stride, int h);
+int sse8_neon(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                  ptrdiff_t stride, int h);
 int sse4_neon(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                   ptrdiff_t stride, int h);
 
@@ -49,6 +51,7 @@ av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
 
         c->sad[0] = ff_pix_abs16_neon;
         c->sse[0] = sse16_neon;
+        c->sse[1] = sse8_neon;
         c->sse[2] = sse4_neon;
     }
 }
diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S
index 858833b0ae..c78e26df4b 100644
--- a/libavcodec/aarch64/me_cmp_neon.S
+++ b/libavcodec/aarch64/me_cmp_neon.S
@@ -361,6 +361,78 @@ function sse16_neon, export=1
 
 endfunc
 
+function sse8_neon, export=1
+        // x0 - unused
+        // x1 - pix1
+        // x2 - pix2
+        // x3 - stride
+        // w4 - h
+
+        movi            d18, #0
+        cmp             w4, #4
+        b.le            2f
+
+// make 4 iterations at once
+1:
+
+        // res = abs(pix1[0] - pix2[0])
+        // res * res
+
+        ld1             {v0.8b}, [x1], x3
+        ld1             {v1.8b}, [x2], x3
+        uabdl           v30.8h, v0.8b, v1.8b
+        umull           v21.4s, v30.4h, v30.4h
+        ld1             {v2.8b}, [x1], x3
+        umull2          v20.4s, v30.8h, v30.8h
+        ld1             {v3.8b}, [x2], x3
+        uabdl           v29.8h, v2.8b, v3.8b
+        ld1             {v4.8b}, [x1], x3
+        umlal           v21.4s, v29.4h, v29.4h
+        ld1             {v5.8b}, [x2], x3
+        umlal2          v20.4s, v29.8h, v29.8h
+        uabdl           v28.8h, v4.8b, v5.8b
+        ld1             {v6.8b}, [x1], x3
+        umlal           v21.4s, v28.4h, v28.4h
+        ld1             {v7.8b}, [x2], x3
+        umlal2          v20.4s, v28.8h, v28.8h
+        uabdl           v27.8h, v6.8b, v7.8b
+        umlal           v21.4s, v27.4h, v27.4h
+        umlal2          v20.4s, v27.8h, v27.8h
+
+        add             v21.4s, v21.4s, v20.4s
+        sub             w4, w4, #4
+        uaddlv          d17, v21.4s
+        add             d18, d18, d17
+        cmp             w4, #4
+        b.ge            1b
+
+        cbnz            w4, 2f
+        fmov            w0, s18
+
+        ret
+
+// iterate by one
+2:
+        ld1             {v0.8b}, [x1], x3
+        ld1             {v1.8b}, [x2], x3
+
+        uabdl           v30.8h, v0.8b, v1.8b
+        umull           v21.4s, v30.4h, v30.4h
+        umull2          v20.4s, v30.8h, v30.8h
+
+        subs            w4, w4, #1
+
+        uaddlv          d17, v21.4s
+        add             d18, d18, d17
+        uaddlv          d17, v20.4S
+        add             d18, d18, d17
+
+        b.ne            2b
+        fmov            w0, s18
+
+        ret
+endfunc
+
 function sse4_neon, export=1
         // x0 - unused
         // x1 - pix1
-- 
2.34.1

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 25+ messages in thread

* [FFmpeg-devel] [PATCH 5/5] lavc/aarch64: Add neon implementation for pix_abs8
  2022-07-15  8:02 [FFmpeg-devel] [PATCH 0/5] Add neon implementation for me_cmp functions Hubert Mazur
                   ` (3 preceding siblings ...)
  2022-07-15  8:02 ` [FFmpeg-devel] [PATCH 4/5] lavc/aarch64: Add neon implementation for sse8 Hubert Mazur
@ 2022-07-15  8:02 ` Hubert Mazur
  2022-07-25 11:21   ` [FFmpeg-devel] [PATCH] " Hubert Mazur
  4 siblings, 1 reply; 25+ messages in thread
From: Hubert Mazur @ 2022-07-15  8:02 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: gjb, upstream, jswinney, Hubert Mazur, martin, mw, spop

Provide optimized implementation of pix_abs8 function for arm64.

Performance comparison tests are shown below.
- pix_abs_1_0_c: 105.2
- pix_abs_1_0_neon: 21.4
- sad_1_c: 107.2
- sad_1_neon: 20.9

Benchmarks and tests are run with checkasm tool on AWS Graviton 3.
---
 libavcodec/aarch64/me_cmp_init_aarch64.c |  4 ++
 libavcodec/aarch64/me_cmp_neon.S         | 53 ++++++++++++++++++++++++
 2 files changed, 57 insertions(+)

diff --git a/libavcodec/aarch64/me_cmp_init_aarch64.c b/libavcodec/aarch64/me_cmp_init_aarch64.c
index 89c817990c..7d7dc38754 100644
--- a/libavcodec/aarch64/me_cmp_init_aarch64.c
+++ b/libavcodec/aarch64/me_cmp_init_aarch64.c
@@ -31,6 +31,8 @@ int ff_pix_abs16_x2_neon(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                       ptrdiff_t stride, int h);
 int ff_pix_abs16_y2_neon(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                       ptrdiff_t stride, int h);
+int ff_pix_abs8_neon(MpegEncContext *s, uint8_t *blk1, uint8_t *blk2,
+                      ptrdiff_t stride, int h);
 
 int sse16_neon(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                   ptrdiff_t stride, int h);
@@ -48,8 +50,10 @@ av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
         c->pix_abs[0][1] = ff_pix_abs16_x2_neon;
         c->pix_abs[0][2] = ff_pix_abs16_y2_neon;
         c->pix_abs[0][3] = ff_pix_abs16_xy2_neon;
+        c->pix_abs[1][0] = ff_pix_abs8_neon;
 
         c->sad[0] = ff_pix_abs16_neon;
+        c->sad[1] = ff_pix_abs8_neon;
         c->sse[0] = sse16_neon;
         c->sse[1] = sse8_neon;
         c->sse[2] = sse4_neon;
diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S
index c78e26df4b..383459d209 100644
--- a/libavcodec/aarch64/me_cmp_neon.S
+++ b/libavcodec/aarch64/me_cmp_neon.S
@@ -72,6 +72,59 @@ function ff_pix_abs16_neon, export=1
         ret
 endfunc
 
+function ff_pix_abs8_neon, export=1
+        // x0           unused
+        // x1           uint8_t *pix1
+        // x2           uint8_t *pix2
+        // x3           ptrdiff_t stride
+        // x4           int h
+
+        movi            d18, #0
+        cmp             w4, #4
+        b.lt            2f
+
+// make 4 iterations at once
+1:
+        ld1             {v0.8b}, [x1], x3
+        ld1             {v1.8b}, [x2], x3
+        uabdl           v30.8h, v0.8b, v1.8b
+        ld1             {v2.8b}, [x1], x3
+        ld1             {v3.8b}, [x2], x3
+        uabal           v30.8h, v2.8b, v3.8b
+        ld1             {v4.8b}, [x1], x3
+        ld1             {v5.8b}, [x2], x3
+        uabal           v30.8h, v4.8b, v5.8b
+        ld1             {v6.8b}, [x1], x3
+        ld1             {v7.8b}, [x2], x3
+        uabal           v30.8h, v6.8b, v7.8b
+
+        sub             w4, w4, #4
+        uaddlv          s20, v30.8h
+        cmp             w4, #4
+        add             d18, d18, d20
+        b.ge            1b
+        cbnz            w4, 2f
+        fmov            w0, s18
+
+        ret
+
+// iterate by one
+2:
+        ld1             {v0.8b}, [x1], x3
+        ld1             {v1.8b}, [x2], x3
+
+        uabdl           v16.8h, v0.8b, v1.8b
+
+        uaddlv          s17, v16.8h
+        add             d18, d18, d17
+        subs            w4, w4, #1
+        b.ne            2b
+        fmov            w0, s18
+
+        ret
+
+endfunc
+
 function ff_pix_abs16_xy2_neon, export=1
         // x0           unused
         // x1           uint8_t *pix1
-- 
2.34.1

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [FFmpeg-devel] [PATCH 2/5] lavc/aarch64: Add neon implementation for sse4
  2022-07-15  8:02 ` [FFmpeg-devel] [PATCH 2/5] lavc/aarch64: Add neon implementation for sse4 Hubert Mazur
@ 2022-07-21 21:43   ` Martin Storsjö
  2022-07-22 21:30   ` Swinney, Jonathan
  2022-07-25 11:15   ` [FFmpeg-devel] [PATCH] " Hubert Mazur
  2 siblings, 0 replies; 25+ messages in thread
From: Martin Storsjö @ 2022-07-21 21:43 UTC (permalink / raw)
  To: Hubert Mazur; +Cc: gjb, upstream, jswinney, ffmpeg-devel, mw, spop

On Fri, 15 Jul 2022, Hubert Mazur wrote:

> Provide neon implementation for sse4 function.
>
> Performance comparison tests are shown below.
> - sse_2_c: 74.0
> - sse_2_neon: 24.0
>
> Benchmarks and tests are run with checkasm tool on AWS Graviton 3.
>
> Signed-off-by: Hubert Mazur <hum@semihalf.com>
> ---
> libavcodec/aarch64/me_cmp_init_aarch64.c |  3 ++
> libavcodec/aarch64/me_cmp_neon.S         | 65 ++++++++++++++++++++++++
> 2 files changed, 68 insertions(+)
>
> diff --git a/libavcodec/aarch64/me_cmp_init_aarch64.c b/libavcodec/aarch64/me_cmp_init_aarch64.c
> index 3ff5767bd0..72a2062e7e 100644
> --- a/libavcodec/aarch64/me_cmp_init_aarch64.c
> +++ b/libavcodec/aarch64/me_cmp_init_aarch64.c
> @@ -32,6 +32,8 @@ int ff_pix_abs16_x2_neon(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
> 
> int sse16_neon(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
>                   ptrdiff_t stride, int h);
> +int sse4_neon(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
> +                  ptrdiff_t stride, int h);
> 
> av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
> {
> @@ -44,5 +46,6 @@ av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
>
>         c->sad[0] = ff_pix_abs16_neon;
>         c->sse[0] = sse16_neon;
> +        c->sse[2] = sse4_neon;
>     }
> }
> diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S
> index 88cd335443..bacf151314 100644
> --- a/libavcodec/aarch64/me_cmp_neon.S
> +++ b/libavcodec/aarch64/me_cmp_neon.S
> @@ -360,3 +360,68 @@ function sse16_neon, export=1
>         ret
> 
> endfunc
> +
> +function sse4_neon, export=1
> +        // x0 - unused
> +        // x1 - pix1
> +        // x2 - pix2
> +        // x3 - stride
> +        // w4 - h
> +
> +        movi            d18, #0
> +        movi            d17, #0
> +        cmp             w4, #4
> +        b.le            2f
> +
> +// make 4 iterations at once
> +1:
> +
> +        // res = abs(pix1[0] - pix2[0])
> +        // res * res
> +
> +        ld1             {v0.4b}, [x1], x3

This fails to assemble for me with essentially all tools I have (old 
binutils, moderately recent binutils, current llvm, MS armasm64.exe):

src/libavcodec/aarch64/me_cmp_neon.S:374: Error: operand mismatch -- `ld1 
{v0.4b},[x1],x3'
src/libavcodec/aarch64/me_cmp_neon.S:374: Info:    did you mean this?
src/libavcodec/aarch64/me_cmp_neon.S:374: Info:         ld1 {v0.8b}, [x1], x3
src/libavcodec/aarch64/me_cmp_neon.S:374: Info:    other valid variant(s):
src/libavcodec/aarch64/me_cmp_neon.S:374: Info:         ld1 {v0.16b}, [x1], x3
src/libavcodec/aarch64/me_cmp_neon.S:374: Info:         ld1 {v0.4h}, [x1], x3
src/libavcodec/aarch64/me_cmp_neon.S:374: Info:         ld1 {v0.8h}, [x1], x3
src/libavcodec/aarch64/me_cmp_neon.S:374: Info:         ld1 {v0.2s}, [x1], x3
src/libavcodec/aarch64/me_cmp_neon.S:374: Info:         ld1 {v0.4s}, [x1], x3
src/libavcodec/aarch64/me_cmp_neon.S:374: Info:         ld1 {v0.1d}, [x1], x3
src/libavcodec/aarch64/me_cmp_neon.S:374: Info:         ld1 {v0.2d}, [x1], x3


I'll follow up with an actual review of the patches later. I'm sorry I 
have a bit longer review latency than usual at the moment, as I'm on 
vacation.

// Martin

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [FFmpeg-devel] [PATCH 2/5] lavc/aarch64: Add neon implementation for sse4
  2022-07-15  8:02 ` [FFmpeg-devel] [PATCH 2/5] lavc/aarch64: Add neon implementation for sse4 Hubert Mazur
  2022-07-21 21:43   ` Martin Storsjö
@ 2022-07-22 21:30   ` Swinney, Jonathan
  2022-07-25 11:15   ` [FFmpeg-devel] [PATCH] " Hubert Mazur
  2 siblings, 0 replies; 25+ messages in thread
From: Swinney, Jonathan @ 2022-07-22 21:30 UTC (permalink / raw)
  To: Hubert Mazur, ffmpeg-devel; +Cc: martin, mw, upstream, Pop, Sebastian, gjb

As Martin noted, this patch doesn't build. But other than, that, it would be nice if there were comments on each line at least making some note about which of the 4 iterations each instruction calculates. That would make it a little bit easier to read, in my opinion, since the instructions are manually reordered.

Thanks,
-- 

Jonathan Swinney

On 7/15/22, 3:03 AM, "Hubert Mazur" <hum@semihalf.com> wrote:

    CAUTION: This email originated from outside of the organization. Do not click links or open attachments unless you can confirm the sender and know the content is safe.



    Provide neon implementation for sse4 function.

    Performance comparison tests are shown below.
    - sse_2_c: 74.0
    - sse_2_neon: 24.0

    Benchmarks and tests are run with checkasm tool on AWS Graviton 3.

    Signed-off-by: Hubert Mazur <hum@semihalf.com>
    ---
     libavcodec/aarch64/me_cmp_init_aarch64.c |  3 ++
     libavcodec/aarch64/me_cmp_neon.S         | 65 ++++++++++++++++++++++++
     2 files changed, 68 insertions(+)

    diff --git a/libavcodec/aarch64/me_cmp_init_aarch64.c b/libavcodec/aarch64/me_cmp_init_aarch64.c
    index 3ff5767bd0..72a2062e7e 100644
    --- a/libavcodec/aarch64/me_cmp_init_aarch64.c
    +++ b/libavcodec/aarch64/me_cmp_init_aarch64.c
    @@ -32,6 +32,8 @@ int ff_pix_abs16_x2_neon(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,

     int sse16_neon(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                       ptrdiff_t stride, int h);
    +int sse4_neon(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
    +                  ptrdiff_t stride, int h);

     av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
     {
    @@ -44,5 +46,6 @@ av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)

             c->sad[0] = ff_pix_abs16_neon;
             c->sse[0] = sse16_neon;
    +        c->sse[2] = sse4_neon;
         }
     }
    diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S
    index 88cd335443..bacf151314 100644
    --- a/libavcodec/aarch64/me_cmp_neon.S
    +++ b/libavcodec/aarch64/me_cmp_neon.S
    @@ -360,3 +360,68 @@ function sse16_neon, export=1
             ret

     endfunc
    +
    +function sse4_neon, export=1
    +        // x0 - unused
    +        // x1 - pix1
    +        // x2 - pix2
    +        // x3 - stride
    +        // w4 - h
    +
    +        movi            d18, #0
    +        movi            d17, #0
    +        cmp             w4, #4
    +        b.le            2f
    +
    +// make 4 iterations at once
    +1:
    +
    +        // res = abs(pix1[0] - pix2[0])
    +        // res * res
    +
    +        ld1             {v0.4b}, [x1], x3
    +        ld1             {v1.4b}, [x2], x3
    +        uabdl           v30.8h, v0.4b, v1.4b
    +        ld1             {v2.4b}, [x1], x3
    +        ld1             {v3.4b}, [x2], x3
    +        umull           v16.4s, v30.4h, v30.4h
    +        uabdl           v29.8h, v2.4b, v3.4b
    +        ld1             {v4.4b}, [x1], x3
    +        ld1             {v5.4b}, [x2], x3
    +        umlal           v16.4s, v29.4h, v29.4h
    +        uabdl           v28.8h, v4.4b, v5.4b
    +        ld1             {v6.4b}, [x1], x3
    +        ld1             {v7.4b}, [x2], x3
    +        umlal           v16.4s, v28.4h, v28.4h
    +        uabdl           v27.8h, v6.4b, v7.4b
    +        umlal           v16.4s, v27.4h, v27.4h
    +
    +        uaddlv          d17, v16.4s
    +        add             d18, d18, d17
    +
    +        sub             w4, w4, #4
    +        cmp             w4, #4
    +        b.ge            1b
    +
    +        cbnz            w4, 2f
    +        fmov            w0, s18
    +
    +        ret
    +
    +// iterate by one
    +2:
    +        ld1             {v0.4b}, [x1], x3
    +        ld1             {v1.4b}, [x2], x3
    +        uabdl           v30.8h, v0.4b, v1.4b
    +        umull           v16.4s, v30.4h, v30.4h
    +
    +        uaddlv          d17, v16.4s
    +        add             d18, d18, d17
    +
    +        subs            w4, w4, #1
    +        b.ne            2b
    +        fmov            w0, s18
    +
    +        ret
    +
    +endfunc
    --
    2.34.1


_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 25+ messages in thread

* [FFmpeg-devel] [PATCH] lavc/aarch64: Add neon implementation for sse16
  2022-07-15  8:02 ` [FFmpeg-devel] [PATCH 1/5] lavc/aarch64: Add neon implementation for sse16 Hubert Mazur
@ 2022-07-25 11:12   ` Hubert Mazur
  2022-08-03 13:22     ` Martin Storsjö
  2022-08-04  7:46     ` Martin Storsjö
  0 siblings, 2 replies; 25+ messages in thread
From: Hubert Mazur @ 2022-07-25 11:12 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: gjb, upstream, jswinney, Hubert Mazur, martin, mw, spop

Provide neon implementation for sse16 function.

Performance comparison tests are shown below.
- sse_0_c: 273.0
- sse_0_neon: 48.2

Benchmarks and tests run with checkasm tool on AWS Graviton 3.

Signed-off-by: Hubert Mazur <hum@semihalf.com>
---
 libavcodec/aarch64/me_cmp_init_aarch64.c |  4 ++
 libavcodec/aarch64/me_cmp_neon.S         | 82 ++++++++++++++++++++++++
 2 files changed, 86 insertions(+)

diff --git a/libavcodec/aarch64/me_cmp_init_aarch64.c b/libavcodec/aarch64/me_cmp_init_aarch64.c
index 136b008eb7..3ff5767bd0 100644
--- a/libavcodec/aarch64/me_cmp_init_aarch64.c
+++ b/libavcodec/aarch64/me_cmp_init_aarch64.c
@@ -30,6 +30,9 @@ int ff_pix_abs16_xy2_neon(MpegEncContext *s, uint8_t *blk1, uint8_t *blk2,
 int ff_pix_abs16_x2_neon(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                       ptrdiff_t stride, int h);
 
+int sse16_neon(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                  ptrdiff_t stride, int h);
+
 av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
 {
     int cpu_flags = av_get_cpu_flags();
@@ -40,5 +43,6 @@ av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
         c->pix_abs[0][3] = ff_pix_abs16_xy2_neon;
 
         c->sad[0] = ff_pix_abs16_neon;
+        c->sse[0] = sse16_neon;
     }
 }
diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S
index cda7ce0408..98c912b608 100644
--- a/libavcodec/aarch64/me_cmp_neon.S
+++ b/libavcodec/aarch64/me_cmp_neon.S
@@ -270,3 +270,85 @@ function ff_pix_abs16_x2_neon, export=1
 
         ret
 endfunc
+
+function sse16_neon, export=1
+        // x0 - unused
+        // x1 - pix1
+        // x2 - pix2
+        // x3 - stride
+        // w4 - h
+
+        cmp             w4, #4
+        movi            d18, #0
+        b.lt            2f
+
+// Make 4 iterations at once
+1:
+
+        // res = abs(pix1[0] - pix2[0])
+        // res * res
+
+        ld1             {v0.16b}, [x1], x3              // Load pix1 vector for first iteration
+        ld1             {v1.16b}, [x2], x3              // Load pix2 vector for first iteration
+        uabd            v30.16b, v0.16b, v1.16b         // Absolute difference, first iteration
+        ld1             {v2.16b}, [x1], x3              // Load pix1 vector for second iteration
+        umull           v29.8h, v30.8b, v30.8b          // Multiply lower half of vectors, first iteration
+        ld1             {v3.16b}, [x2], x3              // Load pix2 vector for second iteration
+        umull2          v28.8h, v30.16b, v30.16b        // Multiply upper half of vectors, first iteration
+        uabd            v27.16b, v2.16b, v3.16b         // Absolute difference, second iteration
+        uaddlp          v17.4s, v29.8h                  // Pairwise add, first iteration
+        umull           v26.8h, v27.8b, v27.8b          // Mulitply lower half, second iteration
+        umull2          v25.8h, v27.16b, v27.16b        // Multiply upper half, second iteration
+        ld1             {v4.16b}, [x1], x3              // Load pix1 for third iteration
+        uadalp          v17.4s, v26.8h                  // Pairwise add and accumulate, second iteration
+        ld1             {v5.16b}, [x2], x3              // Load pix2 for third iteration
+        uadalp          v17.4s, v25.8h                  // Pairwise add andd accumulate, second iteration
+        uabd            v24.16b, v4.16b, v5.16b         // Absolute difference, third iteration
+        ld1             {v6.16b}, [x1], x3              // Load pix1 for fourth iteration
+        umull           v23.8h, v24.8b, v24.8b          // Multiply lower half, third iteration
+        umull2          v22.8h, v24.16b, v24.16b        // Multiply upper half, third iteration
+        uadalp          v17.4s, v23.8h                  // Pairwise add and accumulate, third iteration
+        uadalp          v17.4s, v22.8h                  // Pairwise add and accumulate, third iteration
+        ld1             {v7.16b}, [x2], x3              // Load pix2 for fouth iteration
+        uadalp          v17.4s, v28.8h                  // Pairwise add and accumulate, first iteration
+        uabd            v21.16b, v6.16b, v7.16b         // Absolute difference, fourth iteration
+        umull           v20.8h, v21.8b, v21.8b          // Multiply lower half, fourth iteration
+        uadalp          v17.4s, v20.8h                  // Pairwise add and accumulate, fourth iteration
+        umull2          v19.8h, v21.16b, v21.16b        // Multiply upper half, fourth iteration
+        uadalp          v17.4s, v19.8h                  // Pairwise add and accumulate, fourth iteration
+
+        sub             w4, w4, #4                      // h -= 4
+        uaddlv          d16, v17.4s                     // add up accumulator vector
+        cmp             w4, #4
+        add             d18, d18, d16
+
+        b.ge            1b
+
+        cbnz            w4, 2f
+        fmov            w0, s18
+
+        ret
+
+// iterate by one
+2:
+
+        ld1             {v0.16b}, [x1], x3              // Load pix1
+        ld1             {v1.16b}, [x2], x3              // Load pix2
+
+        uabd            v30.16b, v0.16b, v1.16b
+        umull           v29.8h, v0.8b, v1.8b
+        umull2          v28.8h, v0.16b, v1.16b
+        uaddlp          v17.4s, v29.8h
+        uadalp          v17.4s, v28.8h
+
+
+        subs            w4, w4, #1
+        uaddlv          d16, v17.4s
+        add             d18, d18, d16
+
+        b.ne            2b
+        fmov            w0, s18
+
+        ret
+
+endfunc
-- 
2.34.1

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 25+ messages in thread

* [FFmpeg-devel] [PATCH] lavc/aarch64: Add neon implementation for sse4
  2022-07-15  8:02 ` [FFmpeg-devel] [PATCH 2/5] lavc/aarch64: Add neon implementation for sse4 Hubert Mazur
  2022-07-21 21:43   ` Martin Storsjö
  2022-07-22 21:30   ` Swinney, Jonathan
@ 2022-07-25 11:15   ` Hubert Mazur
  2022-07-28 18:50     ` Swinney, Jonathan
  2022-08-04  8:00     ` Martin Storsjö
  2 siblings, 2 replies; 25+ messages in thread
From: Hubert Mazur @ 2022-07-25 11:15 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: gjb, upstream, jswinney, Hubert Mazur, martin, mw, spop

Provide neon implementation for sse4 function.

Performance comparison tests are shown below.
- sse_2_c: 74.0
- sse_2_neon: 24.0

Benchmarks and tests are run with checkasm tool on AWS Graviton 3.

Signed-off-by: Hubert Mazur <hum@semihalf.com>
---
 libavcodec/aarch64/me_cmp_init_aarch64.c |  3 ++
 libavcodec/aarch64/me_cmp_neon.S         | 65 ++++++++++++++++++++++++
 2 files changed, 68 insertions(+)

diff --git a/libavcodec/aarch64/me_cmp_init_aarch64.c b/libavcodec/aarch64/me_cmp_init_aarch64.c
index 3ff5767bd0..72a2062e7e 100644
--- a/libavcodec/aarch64/me_cmp_init_aarch64.c
+++ b/libavcodec/aarch64/me_cmp_init_aarch64.c
@@ -32,6 +32,8 @@ int ff_pix_abs16_x2_neon(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
 
 int sse16_neon(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                   ptrdiff_t stride, int h);
+int sse4_neon(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                  ptrdiff_t stride, int h);
 
 av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
 {
@@ -44,5 +46,6 @@ av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
 
         c->sad[0] = ff_pix_abs16_neon;
         c->sse[0] = sse16_neon;
+        c->sse[2] = sse4_neon;
     }
 }
diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S
index 98c912b608..3336d88848 100644
--- a/libavcodec/aarch64/me_cmp_neon.S
+++ b/libavcodec/aarch64/me_cmp_neon.S
@@ -352,3 +352,68 @@ function sse16_neon, export=1
         ret
 
 endfunc
+
+function sse4_neon, export=1
+        // x0 - unused
+        // x1 - pix1
+        // x2 - pix2
+        // x3 - stride
+        // w4 - h
+
+        movi            d18, #0
+        movi            d17, #0
+        cmp             w4, #4
+        b.le            2f
+
+// make 4 iterations at once
+1:
+
+        // res = abs(pix1[0] - pix2[0])
+        // res * res
+
+        ld1             {v0.s}[0], [x1], x3             // Load pix1, first iteration
+        ld1             {v1.s}[0], [x2], x3             // Load pix2, first iteration
+        uabdl           v30.8h, v0.8b, v1.8b            // Absolute difference, first iteration
+        ld1             {v2.s}[0], [x1], x3             // Load pix1, second iteration
+        ld1             {v3.s}[0], [x2], x3             // Load pix2, second iteration
+        umull           v16.4s, v30.4h, v30.4h          // Multiply vectors, first iteration
+        uabdl           v29.8h, v2.8b, v3.8b            // Absolute difference, second iteration
+        ld1             {v4.s}[0], [x1], x3             // Load pix1, third iteration
+        ld1             {v5.s}[0], [x2], x3             // Load pix2, third iteration
+        umlal           v16.4s, v29.4h, v29.4h          // Multiply and accumulate, second iteration
+        uabdl           v28.8h, v4.8b, v5.8b            // Absolute difference, third iteration
+        ld1             {v6.s}[0], [x1], x3             // Load pix1, fourth iteration
+        ld1             {v7.s}[0], [x2], x3             // Load pix2, fourth iteration
+        umlal           v16.4s, v28.4h, v28.4h          // Multiply and accumulate, third iteration
+        uabdl           v27.8h, v6.8b, v7.8b            // Absolue difference, fourth iteration
+        umlal           v16.4s, v27.4h, v27.4h          // Multiply and accumulate, fourth iteration
+
+        uaddlv          d17, v16.4s                     // Add vector
+        add             d18, d18, d17
+
+        sub             w4, w4, #4
+        cmp             w4, #4
+        b.ge            1b
+
+        cbnz            w4, 2f
+        fmov            w0, s18
+
+        ret
+
+// iterate by one
+2:
+        ld1             {v0.s}[0], [x1], x3               // Load pix1
+        ld1             {v1.s}[0], [x2], x3               // Load pix2
+        uabdl           v30.8h, v0.8b, v1.8b
+        umull           v16.4s, v30.4h, v30.4h
+
+        uaddlv          d17, v16.4s
+        add             d18, d18, d17
+
+        subs            w4, w4, #1
+        b.ne            2b
+        fmov            w0, s18
+
+        ret
+
+endfunc
-- 
2.34.1

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 25+ messages in thread

* [FFmpeg-devel] [PATCH] lavc/aarch64: Add neon implementation for pix_abs16_y2
  2022-07-15  8:02 ` [FFmpeg-devel] [PATCH 3/5] lavc/aarch64: Add neon implementation for pix_abs16_y2 Hubert Mazur
@ 2022-07-25 11:17   ` Hubert Mazur
  2022-08-04  8:08     ` Martin Storsjö
  2022-08-04  8:12     ` Martin Storsjö
  0 siblings, 2 replies; 25+ messages in thread
From: Hubert Mazur @ 2022-07-25 11:17 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: gjb, upstream, jswinney, Hubert Mazur, martin, mw, spop

Provide optimized implementation of pix_abs16_y2 function for arm64.

Performance comparison tests are shown below.
pix_abs_0_2_c: 308.5
pix_abs_0_2_neon: 39.2

Benchmarks and tests run with checkasm tool on AWS Graviton 3.

Signed-off-by: Hubert Mazur <hum@semihalf.com>
---
 libavcodec/aarch64/me_cmp_init_aarch64.c |  3 +
 libavcodec/aarch64/me_cmp_neon.S         | 73 ++++++++++++++++++++++++
 2 files changed, 76 insertions(+)

diff --git a/libavcodec/aarch64/me_cmp_init_aarch64.c b/libavcodec/aarch64/me_cmp_init_aarch64.c
index 72a2062e7e..07d62cc1e5 100644
--- a/libavcodec/aarch64/me_cmp_init_aarch64.c
+++ b/libavcodec/aarch64/me_cmp_init_aarch64.c
@@ -29,6 +29,8 @@ int ff_pix_abs16_xy2_neon(MpegEncContext *s, uint8_t *blk1, uint8_t *blk2,
                       ptrdiff_t stride, int h);
 int ff_pix_abs16_x2_neon(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                       ptrdiff_t stride, int h);
+int ff_pix_abs16_y2_neon(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                      ptrdiff_t stride, int h);
 
 int sse16_neon(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                   ptrdiff_t stride, int h);
@@ -42,6 +44,7 @@ av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
     if (have_neon(cpu_flags)) {
         c->pix_abs[0][0] = ff_pix_abs16_neon;
         c->pix_abs[0][1] = ff_pix_abs16_x2_neon;
+        c->pix_abs[0][2] = ff_pix_abs16_y2_neon;
         c->pix_abs[0][3] = ff_pix_abs16_xy2_neon;
 
         c->sad[0] = ff_pix_abs16_neon;
diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S
index 3336d88848..6e392e9066 100644
--- a/libavcodec/aarch64/me_cmp_neon.S
+++ b/libavcodec/aarch64/me_cmp_neon.S
@@ -417,3 +417,76 @@ function sse4_neon, export=1
         ret
 
 endfunc
+
+function ff_pix_abs16_y2_neon, export=1
+        // x0           unused
+        // x1           uint8_t *pix1
+        // x2           uint8_t *pix2
+        // x3           ptrdiff_t stride
+        // x4           int h
+
+        // initialize buffers
+        movi            d18, #0
+        add             x5, x2, x3                      // pix2 + stride
+        cmp             w4, #4
+        b.lt            2f
+
+// make 4 iterations at once
+1:
+
+        // abs(pix1[0], avg2(pix2[0], pix2[0 + stride]))
+        // avg2(a, b) = (((a) + (b) + 1) >> 1)
+        // abs(x) = (x < 0 ? (-x) : (x))
+
+        ld1             {v1.16b}, [x2], x3              // Load pix2 for first iteration
+        ld1             {v2.16b}, [x5], x3              // Load pix3 for first iteration
+        urhadd          v30.16b, v1.16b, v2.16b         // Rounding halving add, first iteration
+        ld1             {v0.16b}, [x1], x3              // Load pix1 for first iteration
+        uabdl           v29.8h, v0.8b, v30.8b           // Absolute difference of lower half, first iteration
+        ld1             {v4.16b}, [x2], x3              // Load pix2 for second iteration
+        uabdl2          v28.8h, v0.16b, v30.16b         // Absolute difference of upper half, first iteration
+        ld1             {v5.16b}, [x5], x3              // Load pix3 for second iteartion
+        ld1             {v3.16b}, [x1], x3              // Load pix1 for second iteration
+        urhadd          v27.16b, v4.16b, v5.16b         // Rounding halving add, second iteration
+        uabal           v29.8h, v3.8b, v27.8b           // Absolute difference of lower half for second iteration
+        ld1             {v7.16b}, [x2], x3              // Load pix2 for third iteration
+        uabal2          v28.8h, v3.16b, v27.16b         // Absolute difference of upper half for second iteration
+        ld1             {v20.16b}, [x5], x3             // Load pix3 for third iteration
+        urhadd          v26.16b, v7.16b, v20.16b        // Rounding halving add, third iteration
+        ld1             {v6.16b}, [x1], x3              // Load pix1 for third iteration
+        uabal           v29.8h, v6.8b, v26.8b           // Absolute difference of lower half for third iteration
+        ld1             {v22.16b}, [x2], x3             // Load pix2 for fourth iteration
+        uabal2          v28.8h, v6.16b, v26.16b         // Absolute difference of upper half for third iteration
+        ld1             {v23.16b}, [x5], x3             // Load pix3 for fourth iteration
+        urhadd          v25.16b, v22.16b, v23.16b       // Rounding halving add
+        ld1             {v21.16b}, [x1], x3             // Load pix1 for fourth iteration
+        uabal           v29.8h, v21.8b, v25.8b          // Absolute difference of lower half for fourth iteration
+        uabal2          v28.8h, v21.16b, v25.16b        // Absolute difference of upper half for fourth iteration
+
+        add             v29.8h, v29.8h, v28.8h          // Add vectors together
+        sub             w4, w4, #4                      // h-= 4
+        uaddlv          s16, v29.8h                     // Add up vector values
+        cmp             w4, #4
+        add             d18, d18, d16
+        b.ge            1b
+        cbz             w4, 3f
+
+// iterate by one
+2:
+
+        ld1             {v1.16b}, [x2], x3              // Load pix2
+        ld1             {v2.16b}, [x5], x3              // Load pix3
+        urhadd          v30.16b, v1.16b, v2.16b         // Rounding halving add
+        ld1             {v0.16b}, [x1], x3              // Load pix1
+        uabd            v30.16b, v30.16b, v30.16b
+
+        uaddlv          h17, v30.16b
+        subs            w4, w4, #1
+        add             d18, d18, d17
+        b.ne            2b
+
+3:
+        fmov            w0, s18
+
+        ret
+endfunc
-- 
2.34.1

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 25+ messages in thread

* [FFmpeg-devel] [PATCH] lavc/aarch64: Add neon implementation for sse8
  2022-07-15  8:02 ` [FFmpeg-devel] [PATCH 4/5] lavc/aarch64: Add neon implementation for sse8 Hubert Mazur
@ 2022-07-25 11:18   ` Hubert Mazur
  2022-08-04  8:04     ` Martin Storsjö
  0 siblings, 1 reply; 25+ messages in thread
From: Hubert Mazur @ 2022-07-25 11:18 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: gjb, upstream, jswinney, Hubert Mazur, martin, mw, spop

Provide optimized implementation of sse8 function for arm64.

Performance comparison tests are shown below.
- sse_1_c: 133.0
- sse_1_neon: 36.7

Benchmarks and tests run with checkasm tool on AWS Graviton 3.

Signed-off-by: Hubert Mazur <hum@semihalf.com>
---
 libavcodec/aarch64/me_cmp_init_aarch64.c |  3 +
 libavcodec/aarch64/me_cmp_neon.S         | 72 ++++++++++++++++++++++++
 2 files changed, 75 insertions(+)

diff --git a/libavcodec/aarch64/me_cmp_init_aarch64.c b/libavcodec/aarch64/me_cmp_init_aarch64.c
index 07d62cc1e5..89c817990c 100644
--- a/libavcodec/aarch64/me_cmp_init_aarch64.c
+++ b/libavcodec/aarch64/me_cmp_init_aarch64.c
@@ -34,6 +34,8 @@ int ff_pix_abs16_y2_neon(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
 
 int sse16_neon(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                   ptrdiff_t stride, int h);
+int sse8_neon(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                  ptrdiff_t stride, int h);
 int sse4_neon(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                   ptrdiff_t stride, int h);
 
@@ -49,6 +51,7 @@ av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
 
         c->sad[0] = ff_pix_abs16_neon;
         c->sse[0] = sse16_neon;
+        c->sse[1] = sse8_neon;
         c->sse[2] = sse4_neon;
     }
 }
diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S
index 6e392e9066..dcaffc9b73 100644
--- a/libavcodec/aarch64/me_cmp_neon.S
+++ b/libavcodec/aarch64/me_cmp_neon.S
@@ -353,6 +353,78 @@ function sse16_neon, export=1
 
 endfunc
 
+function sse8_neon, export=1
+        // x0 - unused
+        // x1 - pix1
+        // x2 - pix2
+        // x3 - stride
+        // w4 - h
+
+        movi            d18, #0
+        cmp             w4, #4
+        b.le            2f
+
+// make 4 iterations at once
+1:
+
+        // res = abs(pix1[0] - pix2[0])
+        // res * res
+
+        ld1             {v0.8b}, [x1], x3               // Load pix1 for first iteration
+        ld1             {v1.8b}, [x2], x3               // Load pix2 for second iteration
+        uabdl           v30.8h, v0.8b, v1.8b            // Absolute difference, first iteration
+        umull           v21.4s, v30.4h, v30.4h          // Multiply lower half, first iteration
+        ld1             {v2.8b}, [x1], x3               // Load pix1 for second iteration
+        umull2          v20.4s, v30.8h, v30.8h          // Multiply upper half, second iteration
+        ld1             {v3.8b}, [x2], x3               // Load pix2 for second iteration
+        uabdl           v29.8h, v2.8b, v3.8b            // Absolute difference, second iteration
+        ld1             {v4.8b}, [x1], x3               // Load pix1 for third iteration
+        umlal           v21.4s, v29.4h, v29.4h          // Multiply lower half, second iteration
+        ld1             {v5.8b}, [x2], x3               // Load pix2 for third iteration
+        umlal2          v20.4s, v29.8h, v29.8h          // Multiply upper half, second iteration
+        uabdl           v28.8h, v4.8b, v5.8b            // Absolute difference, third iteration
+        ld1             {v6.8b}, [x1], x3               // Load pix1 for fourth iteration
+        umlal           v21.4s, v28.4h, v28.4h          // Multiply lower half, third iteration
+        ld1             {v7.8b}, [x2], x3               // Load pix2 for fourth iteration
+        umlal2          v20.4s, v28.8h, v28.8h          // Multiply upper half, third iteration
+        uabdl           v27.8h, v6.8b, v7.8b            // Absolute difference, fourth iteration
+        umlal           v21.4s, v27.4h, v27.4h          // Multiply lower half, fourth iteration
+        umlal2          v20.4s, v27.8h, v27.8h          // Multiply upper ha;f, fourth iteration
+
+        add             v21.4s, v21.4s, v20.4s          // Add accumulator vectors together
+        sub             w4, w4, #4                      // h -= 4
+        uaddlv          d17, v21.4s                     // Add up vector
+        add             d18, d18, d17
+        cmp             w4, #4
+        b.ge            1b
+
+        cbnz            w4, 2f
+        fmov            w0, s18
+
+        ret
+
+// iterate by one
+2:
+        ld1             {v0.8b}, [x1], x3               // Load pix1
+        ld1             {v1.8b}, [x2], x3               // Load pix2
+
+        uabdl           v30.8h, v0.8b, v1.8b
+        umull           v21.4s, v30.4h, v30.4h
+        umull2          v20.4s, v30.8h, v30.8h
+
+        subs            w4, w4, #1
+
+        uaddlv          d17, v21.4s
+        add             d18, d18, d17
+        uaddlv          d17, v20.4S
+        add             d18, d18, d17
+
+        b.ne            2b
+        fmov            w0, s18
+
+        ret
+endfunc
+
 function sse4_neon, export=1
         // x0 - unused
         // x1 - pix1
-- 
2.34.1

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 25+ messages in thread

* [FFmpeg-devel] [PATCH] lavc/aarch64: Add neon implementation for pix_abs8
  2022-07-15  8:02 ` [FFmpeg-devel] [PATCH 5/5] lavc/aarch64: Add neon implementation for pix_abs8 Hubert Mazur
@ 2022-07-25 11:21   ` Hubert Mazur
  2022-08-04  8:10     ` Martin Storsjö
  0 siblings, 1 reply; 25+ messages in thread
From: Hubert Mazur @ 2022-07-25 11:21 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: gjb, upstream, jswinney, Hubert Mazur, martin, mw, spop

Provide optimized implementation of pix_abs8 function for arm64.

Performance comparison tests are shown below.
- pix_abs_1_0_c: 105.2
- pix_abs_1_0_neon: 21.4
- sad_1_c: 107.2
- sad_1_neon: 20.9

Benchmarks and tests are run with checkasm tool on AWS Graviton 3.
---
 libavcodec/aarch64/me_cmp_init_aarch64.c |  4 ++
 libavcodec/aarch64/me_cmp_neon.S         | 53 ++++++++++++++++++++++++
 2 files changed, 57 insertions(+)

diff --git a/libavcodec/aarch64/me_cmp_init_aarch64.c b/libavcodec/aarch64/me_cmp_init_aarch64.c
index 89c817990c..7d7dc38754 100644
--- a/libavcodec/aarch64/me_cmp_init_aarch64.c
+++ b/libavcodec/aarch64/me_cmp_init_aarch64.c
@@ -31,6 +31,8 @@ int ff_pix_abs16_x2_neon(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                       ptrdiff_t stride, int h);
 int ff_pix_abs16_y2_neon(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                       ptrdiff_t stride, int h);
+int ff_pix_abs8_neon(MpegEncContext *s, uint8_t *blk1, uint8_t *blk2,
+                      ptrdiff_t stride, int h);
 
 int sse16_neon(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                   ptrdiff_t stride, int h);
@@ -48,8 +50,10 @@ av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
         c->pix_abs[0][1] = ff_pix_abs16_x2_neon;
         c->pix_abs[0][2] = ff_pix_abs16_y2_neon;
         c->pix_abs[0][3] = ff_pix_abs16_xy2_neon;
+        c->pix_abs[1][0] = ff_pix_abs8_neon;
 
         c->sad[0] = ff_pix_abs16_neon;
+        c->sad[1] = ff_pix_abs8_neon;
         c->sse[0] = sse16_neon;
         c->sse[1] = sse8_neon;
         c->sse[2] = sse4_neon;
diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S
index dcaffc9b73..f2dd63ced1 100644
--- a/libavcodec/aarch64/me_cmp_neon.S
+++ b/libavcodec/aarch64/me_cmp_neon.S
@@ -72,6 +72,59 @@ function ff_pix_abs16_neon, export=1
         ret
 endfunc
 
+function ff_pix_abs8_neon, export=1
+        // x0           unused
+        // x1           uint8_t *pix1
+        // x2           uint8_t *pix2
+        // x3           ptrdiff_t stride
+        // x4           int h
+
+        movi            d18, #0
+        cmp             w4, #4
+        b.lt            2f
+
+// make 4 iterations at once
+1:
+        ld1             {v0.8b}, [x1], x3               // Load pix1 for first iteration
+        ld1             {v1.8b}, [x2], x3               // Load pix2 for first iteration
+        uabdl           v30.8h, v0.8b, v1.8b            // Absolute difference, first iteration
+        ld1             {v2.8b}, [x1], x3               // Load pix1 for second iteration
+        ld1             {v3.8b}, [x2], x3               // Load pix2 for second iteration
+        uabal           v30.8h, v2.8b, v3.8b            // Absolute difference, second iteration
+        ld1             {v4.8b}, [x1], x3               // Load pix1 for third iteration
+        ld1             {v5.8b}, [x2], x3               // Load pix2 for third iteration
+        uabal           v30.8h, v4.8b, v5.8b            // Absolute difference, third iteration
+        ld1             {v6.8b}, [x1], x3               // Load pix1 for foruth iteration
+        ld1             {v7.8b}, [x2], x3               // Load pix2 for fourth iteration
+        uabal           v30.8h, v6.8b, v7.8b            // Absolute difference, foruth iteration
+
+        sub             w4, w4, #4                      // h -= 4
+        uaddlv          s20, v30.8h                     // Add up vector
+        cmp             w4, #4
+        add             d18, d18, d20
+        b.ge            1b
+        cbnz            w4, 2f
+        fmov            w0, s18
+
+        ret
+
+// iterate by one
+2:
+        ld1             {v0.8b}, [x1], x3               // Load pix1
+        ld1             {v1.8b}, [x2], x3               // Load pix2
+
+        uabdl           v16.8h, v0.8b, v1.8b
+
+        uaddlv          s17, v16.8h
+        add             d18, d18, d17
+        subs            w4, w4, #1
+        b.ne            2b
+        fmov            w0, s18
+
+        ret
+
+endfunc
+
 function ff_pix_abs16_xy2_neon, export=1
         // x0           unused
         // x1           uint8_t *pix1
-- 
2.34.1

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [FFmpeg-devel] [PATCH] lavc/aarch64: Add neon implementation for sse4
  2022-07-25 11:15   ` [FFmpeg-devel] [PATCH] " Hubert Mazur
@ 2022-07-28 18:50     ` Swinney, Jonathan
  2022-07-28 18:51       ` Swinney, Jonathan
  2022-08-04  8:00     ` Martin Storsjö
  1 sibling, 1 reply; 25+ messages in thread
From: Swinney, Jonathan @ 2022-07-28 18:50 UTC (permalink / raw)
  To: Hubert Mazur, ffmpeg-devel; +Cc: martin, mw, upstream, Pop, Sebastian, gjb

Your latest set of patches didn’t get interpreted correctly by the patchwork tool. I suspect it took them in the wrong order. 

https://patchwork.ffmpeg.org/project/ffmpeg/list/?series=&submitter=1479&state=&q=&archive=&delegate=

There is one more place to move the sub, cmp and branch instructions apart in sse16_neon. It doesn't seem to make any difference to Neoverse N1 and V1 and it may help A53.

I didn't see anything else.

Thanks!
-- 

Jonathan Swinney

On 7/25/22, 6:16 AM, "Hubert Mazur" <hum@semihalf.com> wrote:

    CAUTION: This email originated from outside of the organization. Do not click links or open attachments unless you can confirm the sender and know the content is safe.



    Provide neon implementation for sse4 function.

    Performance comparison tests are shown below.
    - sse_2_c: 74.0
    - sse_2_neon: 24.0

    Benchmarks and tests are run with checkasm tool on AWS Graviton 3.

    Signed-off-by: Hubert Mazur <hum@semihalf.com>
    ---
     libavcodec/aarch64/me_cmp_init_aarch64.c |  3 ++
     libavcodec/aarch64/me_cmp_neon.S         | 65 ++++++++++++++++++++++++
     2 files changed, 68 insertions(+)

    diff --git a/libavcodec/aarch64/me_cmp_init_aarch64.c b/libavcodec/aarch64/me_cmp_init_aarch64.c
    index 3ff5767bd0..72a2062e7e 100644
    --- a/libavcodec/aarch64/me_cmp_init_aarch64.c
    +++ b/libavcodec/aarch64/me_cmp_init_aarch64.c
    @@ -32,6 +32,8 @@ int ff_pix_abs16_x2_neon(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,

     int sse16_neon(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                       ptrdiff_t stride, int h);
    +int sse4_neon(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
    +                  ptrdiff_t stride, int h);

     av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
     {
    @@ -44,5 +46,6 @@ av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)

             c->sad[0] = ff_pix_abs16_neon;
             c->sse[0] = sse16_neon;
    +        c->sse[2] = sse4_neon;
         }
     }
    diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S
    index 98c912b608..3336d88848 100644
    --- a/libavcodec/aarch64/me_cmp_neon.S
    +++ b/libavcodec/aarch64/me_cmp_neon.S
    @@ -352,3 +352,68 @@ function sse16_neon, export=1
             ret

     endfunc
    +
    +function sse4_neon, export=1
    +        // x0 - unused
    +        // x1 - pix1
    +        // x2 - pix2
    +        // x3 - stride
    +        // w4 - h
    +
    +        movi            d18, #0
    +        movi            d17, #0
    +        cmp             w4, #4
    +        b.le            2f
    +
    +// make 4 iterations at once
    +1:
    +
    +        // res = abs(pix1[0] - pix2[0])
    +        // res * res
    +
    +        ld1             {v0.s}[0], [x1], x3             // Load pix1, first iteration
    +        ld1             {v1.s}[0], [x2], x3             // Load pix2, first iteration
    +        uabdl           v30.8h, v0.8b, v1.8b            // Absolute difference, first iteration
    +        ld1             {v2.s}[0], [x1], x3             // Load pix1, second iteration
    +        ld1             {v3.s}[0], [x2], x3             // Load pix2, second iteration
    +        umull           v16.4s, v30.4h, v30.4h          // Multiply vectors, first iteration
    +        uabdl           v29.8h, v2.8b, v3.8b            // Absolute difference, second iteration
    +        ld1             {v4.s}[0], [x1], x3             // Load pix1, third iteration
    +        ld1             {v5.s}[0], [x2], x3             // Load pix2, third iteration
    +        umlal           v16.4s, v29.4h, v29.4h          // Multiply and accumulate, second iteration
    +        uabdl           v28.8h, v4.8b, v5.8b            // Absolute difference, third iteration
    +        ld1             {v6.s}[0], [x1], x3             // Load pix1, fourth iteration
    +        ld1             {v7.s}[0], [x2], x3             // Load pix2, fourth iteration
    +        umlal           v16.4s, v28.4h, v28.4h          // Multiply and accumulate, third iteration
    +        uabdl           v27.8h, v6.8b, v7.8b            // Absolue difference, fourth iteration
    +        umlal           v16.4s, v27.4h, v27.4h          // Multiply and accumulate, fourth iteration
    +
    +        uaddlv          d17, v16.4s                     // Add vector
    +        add             d18, d18, d17
    +
    +        sub             w4, w4, #4
    +        cmp             w4, #4
    +        b.ge            1b
    +
    +        cbnz            w4, 2f
    +        fmov            w0, s18
    +
    +        ret
    +
    +// iterate by one
    +2:
    +        ld1             {v0.s}[0], [x1], x3               // Load pix1
    +        ld1             {v1.s}[0], [x2], x3               // Load pix2
    +        uabdl           v30.8h, v0.8b, v1.8b
    +        umull           v16.4s, v30.4h, v30.4h
    +
    +        uaddlv          d17, v16.4s
    +        add             d18, d18, d17
    +
    +        subs            w4, w4, #1
    +        b.ne            2b
    +        fmov            w0, s18
    +
    +        ret
    +
    +endfunc
    --
    2.34.1


_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [FFmpeg-devel] [PATCH] lavc/aarch64: Add neon implementation for sse4
  2022-07-28 18:50     ` Swinney, Jonathan
@ 2022-07-28 18:51       ` Swinney, Jonathan
  2022-07-29  7:26         ` Hubert Mazur
  0 siblings, 1 reply; 25+ messages in thread
From: Swinney, Jonathan @ 2022-07-28 18:51 UTC (permalink / raw)
  To: Hubert Mazur, ffmpeg-devel; +Cc: martin, mw, upstream, Pop, Sebastian, gjb

> There is one more place to move the sub, cmp and branch instructions apart in sse16_neon. It doesn't seem to make any difference to Neoverse N1 and V1 and it may help A53.

Sorry-- I meant sse4_neon.

-- 

Jonathan Swinney

On 7/28/22, 1:50 PM, "Swinney, Jonathan" <jswinney@amazon.com> wrote:

    Your latest set of patches didn’t get interpreted correctly by the patchwork tool. I suspect it took them in the wrong order. 

    https://patchwork.ffmpeg.org/project/ffmpeg/list/?series=&submitter=1479&state=&q=&archive=&delegate=

    There is one more place to move the sub, cmp and branch instructions apart in sse16_neon. It doesn't seem to make any difference to Neoverse N1 and V1 and it may help A53.

    I didn't see anything else.

    Thanks!
    -- 

    Jonathan Swinney

    On 7/25/22, 6:16 AM, "Hubert Mazur" <hum@semihalf.com> wrote:

        CAUTION: This email originated from outside of the organization. Do not click links or open attachments unless you can confirm the sender and know the content is safe.



        Provide neon implementation for sse4 function.

        Performance comparison tests are shown below.
        - sse_2_c: 74.0
        - sse_2_neon: 24.0

        Benchmarks and tests are run with checkasm tool on AWS Graviton 3.

        Signed-off-by: Hubert Mazur <hum@semihalf.com>
        ---
         libavcodec/aarch64/me_cmp_init_aarch64.c |  3 ++
         libavcodec/aarch64/me_cmp_neon.S         | 65 ++++++++++++++++++++++++
         2 files changed, 68 insertions(+)

        diff --git a/libavcodec/aarch64/me_cmp_init_aarch64.c b/libavcodec/aarch64/me_cmp_init_aarch64.c
        index 3ff5767bd0..72a2062e7e 100644
        --- a/libavcodec/aarch64/me_cmp_init_aarch64.c
        +++ b/libavcodec/aarch64/me_cmp_init_aarch64.c
        @@ -32,6 +32,8 @@ int ff_pix_abs16_x2_neon(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,

         int sse16_neon(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                           ptrdiff_t stride, int h);
        +int sse4_neon(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
        +                  ptrdiff_t stride, int h);

         av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
         {
        @@ -44,5 +46,6 @@ av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)

                 c->sad[0] = ff_pix_abs16_neon;
                 c->sse[0] = sse16_neon;
        +        c->sse[2] = sse4_neon;
             }
         }
        diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S
        index 98c912b608..3336d88848 100644
        --- a/libavcodec/aarch64/me_cmp_neon.S
        +++ b/libavcodec/aarch64/me_cmp_neon.S
        @@ -352,3 +352,68 @@ function sse16_neon, export=1
                 ret

         endfunc
        +
        +function sse4_neon, export=1
        +        // x0 - unused
        +        // x1 - pix1
        +        // x2 - pix2
        +        // x3 - stride
        +        // w4 - h
        +
        +        movi            d18, #0
        +        movi            d17, #0
        +        cmp             w4, #4
        +        b.le            2f
        +
        +// make 4 iterations at once
        +1:
        +
        +        // res = abs(pix1[0] - pix2[0])
        +        // res * res
        +
        +        ld1             {v0.s}[0], [x1], x3             // Load pix1, first iteration
        +        ld1             {v1.s}[0], [x2], x3             // Load pix2, first iteration
        +        uabdl           v30.8h, v0.8b, v1.8b            // Absolute difference, first iteration
        +        ld1             {v2.s}[0], [x1], x3             // Load pix1, second iteration
        +        ld1             {v3.s}[0], [x2], x3             // Load pix2, second iteration
        +        umull           v16.4s, v30.4h, v30.4h          // Multiply vectors, first iteration
        +        uabdl           v29.8h, v2.8b, v3.8b            // Absolute difference, second iteration
        +        ld1             {v4.s}[0], [x1], x3             // Load pix1, third iteration
        +        ld1             {v5.s}[0], [x2], x3             // Load pix2, third iteration
        +        umlal           v16.4s, v29.4h, v29.4h          // Multiply and accumulate, second iteration
        +        uabdl           v28.8h, v4.8b, v5.8b            // Absolute difference, third iteration
        +        ld1             {v6.s}[0], [x1], x3             // Load pix1, fourth iteration
        +        ld1             {v7.s}[0], [x2], x3             // Load pix2, fourth iteration
        +        umlal           v16.4s, v28.4h, v28.4h          // Multiply and accumulate, third iteration
        +        uabdl           v27.8h, v6.8b, v7.8b            // Absolue difference, fourth iteration
        +        umlal           v16.4s, v27.4h, v27.4h          // Multiply and accumulate, fourth iteration
        +
        +        uaddlv          d17, v16.4s                     // Add vector
        +        add             d18, d18, d17
        +
        +        sub             w4, w4, #4
        +        cmp             w4, #4
        +        b.ge            1b
        +
        +        cbnz            w4, 2f
        +        fmov            w0, s18
        +
        +        ret
        +
        +// iterate by one
        +2:
        +        ld1             {v0.s}[0], [x1], x3               // Load pix1
        +        ld1             {v1.s}[0], [x2], x3               // Load pix2
        +        uabdl           v30.8h, v0.8b, v1.8b
        +        umull           v16.4s, v30.4h, v30.4h
        +
        +        uaddlv          d17, v16.4s
        +        add             d18, d18, d17
        +
        +        subs            w4, w4, #1
        +        b.ne            2b
        +        fmov            w0, s18
        +
        +        ret
        +
        +endfunc
        --
        2.34.1



_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [FFmpeg-devel] [PATCH] lavc/aarch64: Add neon implementation for sse4
  2022-07-28 18:51       ` Swinney, Jonathan
@ 2022-07-29  7:26         ` Hubert Mazur
  0 siblings, 0 replies; 25+ messages in thread
From: Hubert Mazur @ 2022-07-29  7:26 UTC (permalink / raw)
  To: Swinney, Jonathan; +Cc: gjb, upstream, martin, ffmpeg-devel, mw, Pop, Sebastian

Yes, it seems that they are misplaced or each is treated as a new series
and thus can't be applied.
I will send the whole batch again after the first review, so some issues
could be fixed.
Thanks for the feedback!

On Thu, Jul 28, 2022 at 8:51 PM Swinney, Jonathan <jswinney@amazon.com>
wrote:

> > There is one more place to move the sub, cmp and branch instructions
> apart in sse16_neon. It doesn't seem to make any difference to Neoverse N1
> and V1 and it may help A53.
>
> Sorry-- I meant sse4_neon.
>
> --
>
> Jonathan Swinney
>
> On 7/28/22, 1:50 PM, "Swinney, Jonathan" <jswinney@amazon.com> wrote:
>
>     Your latest set of patches didn’t get interpreted correctly by the
> patchwork tool. I suspect it took them in the wrong order.
>
>
> https://patchwork.ffmpeg.org/project/ffmpeg/list/?series=&submitter=1479&state=&q=&archive=&delegate=
>
>     There is one more place to move the sub, cmp and branch instructions
> apart in sse16_neon. It doesn't seem to make any difference to Neoverse N1
> and V1 and it may help A53.
>
>     I didn't see anything else.
>
>     Thanks!
>     --
>
>     Jonathan Swinney
>
>     On 7/25/22, 6:16 AM, "Hubert Mazur" <hum@semihalf.com> wrote:
>
>         CAUTION: This email originated from outside of the organization.
> Do not click links or open attachments unless you can confirm the sender
> and know the content is safe.
>
>
>
>         Provide neon implementation for sse4 function.
>
>         Performance comparison tests are shown below.
>         - sse_2_c: 74.0
>         - sse_2_neon: 24.0
>
>         Benchmarks and tests are run with checkasm tool on AWS Graviton 3.
>
>         Signed-off-by: Hubert Mazur <hum@semihalf.com>
>         ---
>          libavcodec/aarch64/me_cmp_init_aarch64.c |  3 ++
>          libavcodec/aarch64/me_cmp_neon.S         | 65
> ++++++++++++++++++++++++
>          2 files changed, 68 insertions(+)
>
>         diff --git a/libavcodec/aarch64/me_cmp_init_aarch64.c
> b/libavcodec/aarch64/me_cmp_init_aarch64.c
>         index 3ff5767bd0..72a2062e7e 100644
>         --- a/libavcodec/aarch64/me_cmp_init_aarch64.c
>         +++ b/libavcodec/aarch64/me_cmp_init_aarch64.c
>         @@ -32,6 +32,8 @@ int ff_pix_abs16_x2_neon(MpegEncContext *v,
> uint8_t *pix1, uint8_t *pix2,
>
>          int sse16_neon(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
>                            ptrdiff_t stride, int h);
>         +int sse4_neon(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
>         +                  ptrdiff_t stride, int h);
>
>          av_cold void ff_me_cmp_init_aarch64(MECmpContext *c,
> AVCodecContext *avctx)
>          {
>         @@ -44,5 +46,6 @@ av_cold void ff_me_cmp_init_aarch64(MECmpContext
> *c, AVCodecContext *avctx)
>
>                  c->sad[0] = ff_pix_abs16_neon;
>                  c->sse[0] = sse16_neon;
>         +        c->sse[2] = sse4_neon;
>              }
>          }
>         diff --git a/libavcodec/aarch64/me_cmp_neon.S
> b/libavcodec/aarch64/me_cmp_neon.S
>         index 98c912b608..3336d88848 100644
>         --- a/libavcodec/aarch64/me_cmp_neon.S
>         +++ b/libavcodec/aarch64/me_cmp_neon.S
>         @@ -352,3 +352,68 @@ function sse16_neon, export=1
>                  ret
>
>          endfunc
>         +
>         +function sse4_neon, export=1
>         +        // x0 - unused
>         +        // x1 - pix1
>         +        // x2 - pix2
>         +        // x3 - stride
>         +        // w4 - h
>         +
>         +        movi            d18, #0
>         +        movi            d17, #0
>         +        cmp             w4, #4
>         +        b.le            2f
>         +
>         +// make 4 iterations at once
>         +1:
>         +
>         +        // res = abs(pix1[0] - pix2[0])
>         +        // res * res
>         +
>         +        ld1             {v0.s}[0], [x1], x3             // Load
> pix1, first iteration
>         +        ld1             {v1.s}[0], [x2], x3             // Load
> pix2, first iteration
>         +        uabdl           v30.8h, v0.8b, v1.8b            //
> Absolute difference, first iteration
>         +        ld1             {v2.s}[0], [x1], x3             // Load
> pix1, second iteration
>         +        ld1             {v3.s}[0], [x2], x3             // Load
> pix2, second iteration
>         +        umull           v16.4s, v30.4h, v30.4h          //
> Multiply vectors, first iteration
>         +        uabdl           v29.8h, v2.8b, v3.8b            //
> Absolute difference, second iteration
>         +        ld1             {v4.s}[0], [x1], x3             // Load
> pix1, third iteration
>         +        ld1             {v5.s}[0], [x2], x3             // Load
> pix2, third iteration
>         +        umlal           v16.4s, v29.4h, v29.4h          //
> Multiply and accumulate, second iteration
>         +        uabdl           v28.8h, v4.8b, v5.8b            //
> Absolute difference, third iteration
>         +        ld1             {v6.s}[0], [x1], x3             // Load
> pix1, fourth iteration
>         +        ld1             {v7.s}[0], [x2], x3             // Load
> pix2, fourth iteration
>         +        umlal           v16.4s, v28.4h, v28.4h          //
> Multiply and accumulate, third iteration
>         +        uabdl           v27.8h, v6.8b, v7.8b            //
> Absolue difference, fourth iteration
>         +        umlal           v16.4s, v27.4h, v27.4h          //
> Multiply and accumulate, fourth iteration
>         +
>         +        uaddlv          d17, v16.4s                     // Add
> vector
>         +        add             d18, d18, d17
>         +
>         +        sub             w4, w4, #4
>         +        cmp             w4, #4
>         +        b.ge            1b
>         +
>         +        cbnz            w4, 2f
>         +        fmov            w0, s18
>         +
>         +        ret
>         +
>         +// iterate by one
>         +2:
>         +        ld1             {v0.s}[0], [x1], x3               // Load
> pix1
>         +        ld1             {v1.s}[0], [x2], x3               // Load
> pix2
>         +        uabdl           v30.8h, v0.8b, v1.8b
>         +        umull           v16.4s, v30.4h, v30.4h
>         +
>         +        uaddlv          d17, v16.4s
>         +        add             d18, d18, d17
>         +
>         +        subs            w4, w4, #1
>         +        b.ne            2b
>         +        fmov            w0, s18
>         +
>         +        ret
>         +
>         +endfunc
>         --
>         2.34.1
>
>
>
>
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [FFmpeg-devel] [PATCH] lavc/aarch64: Add neon implementation for sse16
  2022-07-25 11:12   ` [FFmpeg-devel] [PATCH] " Hubert Mazur
@ 2022-08-03 13:22     ` Martin Storsjö
  2022-08-04  7:46     ` Martin Storsjö
  1 sibling, 0 replies; 25+ messages in thread
From: Martin Storsjö @ 2022-08-03 13:22 UTC (permalink / raw)
  To: Hubert Mazur; +Cc: gjb, upstream, jswinney, ffmpeg-devel, mw, spop

On Mon, 25 Jul 2022, Hubert Mazur wrote:

> Provide neon implementation for sse16 function.
>
> Performance comparison tests are shown below.
> - sse_0_c: 273.0
> - sse_0_neon: 48.2
>
> Benchmarks and tests run with checkasm tool on AWS Graviton 3.
>
> Signed-off-by: Hubert Mazur <hum@semihalf.com>
> ---
> libavcodec/aarch64/me_cmp_init_aarch64.c |  4 ++
> libavcodec/aarch64/me_cmp_neon.S         | 82 ++++++++++++++++++++++++
> 2 files changed, 86 insertions(+)
>
> diff --git a/libavcodec/aarch64/me_cmp_init_aarch64.c b/libavcodec/aarch64/me_cmp_init_aarch64.c
> index 136b008eb7..3ff5767bd0 100644
> --- a/libavcodec/aarch64/me_cmp_init_aarch64.c
> +++ b/libavcodec/aarch64/me_cmp_init_aarch64.c
> @@ -30,6 +30,9 @@ int ff_pix_abs16_xy2_neon(MpegEncContext *s, uint8_t *blk1, uint8_t *blk2,
> int ff_pix_abs16_x2_neon(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
>                       ptrdiff_t stride, int h);
> 
> +int sse16_neon(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
> +                  ptrdiff_t stride, int h);

The signature of these functions has been changed now (right after these 
patches were submitted); the pix1/pix2 parameters are now const.

Also, nitpick; please align the following line ("ptrdiff_t stride, ...") 
correctly with the parenthese on the line above.

> +
> av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
> {
>     int cpu_flags = av_get_cpu_flags();
> @@ -40,5 +43,6 @@ av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
>         c->pix_abs[0][3] = ff_pix_abs16_xy2_neon;
>
>         c->sad[0] = ff_pix_abs16_neon;
> +        c->sse[0] = sse16_neon;
>     }
> }
> diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S
> index cda7ce0408..98c912b608 100644
> --- a/libavcodec/aarch64/me_cmp_neon.S
> +++ b/libavcodec/aarch64/me_cmp_neon.S
> @@ -270,3 +270,85 @@ function ff_pix_abs16_x2_neon, export=1
>
>         ret
> endfunc
> +
> +function sse16_neon, export=1
> +        // x0 - unused
> +        // x1 - pix1
> +        // x2 - pix2
> +        // x3 - stride
> +        // w4 - h
> +
> +        cmp             w4, #4
> +        movi            d18, #0
> +        b.lt            2f
> +
> +// Make 4 iterations at once
> +1:
> +
> +        // res = abs(pix1[0] - pix2[0])
> +        // res * res
> +
> +        ld1             {v0.16b}, [x1], x3              // Load pix1 vector for first iteration
> +        ld1             {v1.16b}, [x2], x3              // Load pix2 vector for first iteration
> +        uabd            v30.16b, v0.16b, v1.16b         // Absolute difference, first iteration

Try to improve the interleaving of this function; I did a quick test on 
Cortex A53, A72 and A73, and got these numbers:

Before:
sse_0_neon:  147.7   64.5   64.7
After:
sse_0_neon:  133.7   60.7   59.2

Overall, try to avoid having consecutive instructions operating on the 
same iteration (except for when doing the same operation on different 
halves of the same iteration), i.e. not "absolute difference third 
iteration; multiply lower half third iteration, multiply upper half third 
iteration, pairwise add third iteration", but bundle it up so you have 
e.g. "absolute difference third iteration; pairwise add first iteration; 
multiply {upper,lower} half third iteration; pairwise add second 
iteration; pairwise add third iteration", or something like that.

Then secondly, in general, don't serialize the summation down to a single 
element in each iteration! You can keep the accumulated sum as a vX.4s 
vector (or maybe even better, two .4s vectors!) throughout the whole 
algorithm, and then only add them up horizontally (with an uaddv) at the 
end.

For adding vectors, I would instinctively prefer doing "uaddl v0.4s, 
v2.4h, v3.4h; uaddl2 v1.4s, v2.8h, v3.8h" instead of "uaddlp v0.4s, 
v1.4h; uadalp v0.4s, v1.8h" etc.

I didn't try out this modification, but please do, I'm pretty sure it will 
be a fair bit faster, and if not, at least more idiomatic SIMD.

I didn't check the other patches yet, but if the other sse* functions are 
implemented similarly, I would expect the same feedback to apply to them 
too.

Let's iterate on the sse16 patch first now at least and get that one 
great, and then update sse4/sse8 similarly once we have that one settled.

I'll try to have a look at the other patches in the set later 
today/tomorrow.

// Martin

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [FFmpeg-devel] [PATCH] lavc/aarch64: Add neon implementation for sse16
  2022-07-25 11:12   ` [FFmpeg-devel] [PATCH] " Hubert Mazur
  2022-08-03 13:22     ` Martin Storsjö
@ 2022-08-04  7:46     ` Martin Storsjö
  1 sibling, 0 replies; 25+ messages in thread
From: Martin Storsjö @ 2022-08-04  7:46 UTC (permalink / raw)
  To: Hubert Mazur; +Cc: gjb, upstream, jswinney, ffmpeg-devel, mw, spop

On Mon, 25 Jul 2022, Hubert Mazur wrote:

> Provide neon implementation for sse16 function.
>
> Performance comparison tests are shown below.
> - sse_0_c: 273.0
> - sse_0_neon: 48.2
>
> Benchmarks and tests run with checkasm tool on AWS Graviton 3.
>
> Signed-off-by: Hubert Mazur <hum@semihalf.com>
> ---
> libavcodec/aarch64/me_cmp_init_aarch64.c |  4 ++
> libavcodec/aarch64/me_cmp_neon.S         | 82 ++++++++++++++++++++++++
> 2 files changed, 86 insertions(+)

> +// iterate by one
> +2:
> +
> +        ld1             {v0.16b}, [x1], x3              // Load pix1
> +        ld1             {v1.16b}, [x2], x3              // Load pix2
> +
> +        uabd            v30.16b, v0.16b, v1.16b
> +        umull           v29.8h, v0.8b, v1.8b
> +        umull2          v28.8h, v0.16b, v1.16b

This should probably be using v30 instead of v0/v1 in the umull here.

The whole codepath for non-modulo-4 heights is untested in practice. You 
can apply the patches from 
https://patchwork.ffmpeg.org/project/ffmpeg/list/?series=7028 to make 
checkasm test it, so please make sure that the uncommon codepaths in the 
patches do work too.

// Martin

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [FFmpeg-devel] [PATCH] lavc/aarch64: Add neon implementation for sse4
  2022-07-25 11:15   ` [FFmpeg-devel] [PATCH] " Hubert Mazur
  2022-07-28 18:50     ` Swinney, Jonathan
@ 2022-08-04  8:00     ` Martin Storsjö
  1 sibling, 0 replies; 25+ messages in thread
From: Martin Storsjö @ 2022-08-04  8:00 UTC (permalink / raw)
  To: Hubert Mazur; +Cc: gjb, upstream, jswinney, ffmpeg-devel, mw, spop

On Mon, 25 Jul 2022, Hubert Mazur wrote:

> Provide neon implementation for sse4 function.
>
> Performance comparison tests are shown below.
> - sse_2_c: 74.0
> - sse_2_neon: 24.0
>
> Benchmarks and tests are run with checkasm tool on AWS Graviton 3.
>
> Signed-off-by: Hubert Mazur <hum@semihalf.com>
> ---
> libavcodec/aarch64/me_cmp_init_aarch64.c |  3 ++
> libavcodec/aarch64/me_cmp_neon.S         | 65 ++++++++++++++++++++++++
> 2 files changed, 68 insertions(+)
>
> diff --git a/libavcodec/aarch64/me_cmp_init_aarch64.c b/libavcodec/aarch64/me_cmp_init_aarch64.c
> index 3ff5767bd0..72a2062e7e 100644
> --- a/libavcodec/aarch64/me_cmp_init_aarch64.c
> +++ b/libavcodec/aarch64/me_cmp_init_aarch64.c
> @@ -32,6 +32,8 @@ int ff_pix_abs16_x2_neon(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
> 
> int sse16_neon(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
>                   ptrdiff_t stride, int h);
> +int sse4_neon(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
> +                  ptrdiff_t stride, int h);
> 
> av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
> {
> @@ -44,5 +46,6 @@ av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
>
>         c->sad[0] = ff_pix_abs16_neon;
>         c->sse[0] = sse16_neon;
> +        c->sse[2] = sse4_neon;
>     }
> }
> diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S
> index 98c912b608..3336d88848 100644
> --- a/libavcodec/aarch64/me_cmp_neon.S
> +++ b/libavcodec/aarch64/me_cmp_neon.S
> @@ -352,3 +352,68 @@ function sse16_neon, export=1
>         ret
> 
> endfunc
> +
> +function sse4_neon, export=1
> +        // x0 - unused
> +        // x1 - pix1
> +        // x2 - pix2
> +        // x3 - stride
> +        // w4 - h
> +
> +        movi            d18, #0
> +        movi            d17, #0

In the current implementation, it doesn't seem like d17 needs to be 
initialized here

> +        cmp             w4, #4
> +        b.le            2f
> +
> +// make 4 iterations at once
> +1:
> +
> +        // res = abs(pix1[0] - pix2[0])
> +        // res * res
> +
> +        ld1             {v0.s}[0], [x1], x3             // Load pix1, first iteration
> +        ld1             {v1.s}[0], [x2], x3             // Load pix2, first iteration
> +        uabdl           v30.8h, v0.8b, v1.8b            // Absolute difference, first iteration

Right now, half of the values calculated by uabdl are unused; you could 
try loading two iterations into v0.s[0] and v0.s[1] so that the full 
.8b register gets used. Doing that would reduce the number of uabdl 
instructions from 4 to 2 - but it might make it harder to interleave 
instructions efficiently. So after all, maybe it's not worth if, it we 
can make the loads more efficiently interleaved this way?

Again, also here, it'd be good to interleave things more efficiently, e.g. 
like this:

    ld1 first
    ld1 first
    ld1 second
    ld1 second
    uabdl first
    ld1 third
    ld1 third
    uabdl second
    umull first
    ld1 fourth
    ld1 fourth
    uabdl third
    umlal second
    uabdl fourth
    umlal third
    umlal fourth

> +        ld1             {v2.s}[0], [x1], x3             // Load pix1, second iteration
> +        ld1             {v3.s}[0], [x2], x3             // Load pix2, second iteration
> +        umull           v16.4s, v30.4h, v30.4h          // Multiply vectors, first iteration
> +        uabdl           v29.8h, v2.8b, v3.8b            // Absolute difference, second iteration
> +        ld1             {v4.s}[0], [x1], x3             // Load pix1, third iteration
> +        ld1             {v5.s}[0], [x2], x3             // Load pix2, third iteration
> +        umlal           v16.4s, v29.4h, v29.4h          // Multiply and accumulate, second iteration
> +        uabdl           v28.8h, v4.8b, v5.8b            // Absolute difference, third iteration
> +        ld1             {v6.s}[0], [x1], x3             // Load pix1, fourth iteration
> +        ld1             {v7.s}[0], [x2], x3             // Load pix2, fourth iteration
> +        umlal           v16.4s, v28.4h, v28.4h          // Multiply and accumulate, third iteration
> +        uabdl           v27.8h, v6.8b, v7.8b            // Absolue difference, fourth iteration
> +        umlal           v16.4s, v27.4h, v27.4h          // Multiply and accumulate, fourth iteration
> +
> +        uaddlv          d17, v16.4s                     // Add vector
> +        add             d18, d18, d17

As usual, don't do any *add*v within the loop, defer it as far as 
possible. Here you're accumulating in 32 bit elements, so it will surely 
fit the results from the whole algorithm.

Also, if you get rid of the uaddlv here, you can also accumulate into two 
separate .4s registers that you only add at the end; that allows two umlal 
instructions to possibly execute in parallel without waiting for each 
other (provided that the cpu has enough execution units for that).

// Martin

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [FFmpeg-devel] [PATCH] lavc/aarch64: Add neon implementation for sse8
  2022-07-25 11:18   ` [FFmpeg-devel] [PATCH] " Hubert Mazur
@ 2022-08-04  8:04     ` Martin Storsjö
  0 siblings, 0 replies; 25+ messages in thread
From: Martin Storsjö @ 2022-08-04  8:04 UTC (permalink / raw)
  To: Hubert Mazur; +Cc: gjb, upstream, jswinney, ffmpeg-devel, mw, spop

On Mon, 25 Jul 2022, Hubert Mazur wrote:

> Provide optimized implementation of sse8 function for arm64.
>
> Performance comparison tests are shown below.
> - sse_1_c: 133.0
> - sse_1_neon: 36.7
>
> Benchmarks and tests run with checkasm tool on AWS Graviton 3.
>
> Signed-off-by: Hubert Mazur <hum@semihalf.com>
> ---
> libavcodec/aarch64/me_cmp_init_aarch64.c |  3 +
> libavcodec/aarch64/me_cmp_neon.S         | 72 ++++++++++++++++++++++++
> 2 files changed, 75 insertions(+)

The same comments as for sse16 and sse4 apply here too.

// Martin

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [FFmpeg-devel] [PATCH] lavc/aarch64: Add neon implementation for pix_abs16_y2
  2022-07-25 11:17   ` [FFmpeg-devel] [PATCH] " Hubert Mazur
@ 2022-08-04  8:08     ` Martin Storsjö
  2022-08-04  8:12     ` Martin Storsjö
  1 sibling, 0 replies; 25+ messages in thread
From: Martin Storsjö @ 2022-08-04  8:08 UTC (permalink / raw)
  To: Hubert Mazur; +Cc: gjb, upstream, jswinney, ffmpeg-devel, mw, spop

On Mon, 25 Jul 2022, Hubert Mazur wrote:

> Provide optimized implementation of pix_abs16_y2 function for arm64.
>
> Performance comparison tests are shown below.
> pix_abs_0_2_c: 308.5
> pix_abs_0_2_neon: 39.2
>
> Benchmarks and tests run with checkasm tool on AWS Graviton 3.
>
> Signed-off-by: Hubert Mazur <hum@semihalf.com>
> ---
> libavcodec/aarch64/me_cmp_init_aarch64.c |  3 +
> libavcodec/aarch64/me_cmp_neon.S         | 73 ++++++++++++++++++++++++
> 2 files changed, 76 insertions(+)

Please do the same optimizations as done for pix_abs_xy2 in 
b46de9aba436dea0cff76f3ed0f7c98448367fd0, 
68a03f64240dcbe408c3fd43d1071a105508a588 and 
4136405c86162063e45d40d55c9985f348d4ea0a for this function too
("aarch64: me_cmp: Interleave some of the loads in ff_pix_abs16_xy2_neon", 
"aarch64: me_cmp: Switch from uabd to uabal in ff_pix_abs16_xy2_neon" and 
"aarch64: me_cmp: Don't do uaddlv once per iteration").

// Martin

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [FFmpeg-devel] [PATCH] lavc/aarch64: Add neon implementation for pix_abs8
  2022-07-25 11:21   ` [FFmpeg-devel] [PATCH] " Hubert Mazur
@ 2022-08-04  8:10     ` Martin Storsjö
  0 siblings, 0 replies; 25+ messages in thread
From: Martin Storsjö @ 2022-08-04  8:10 UTC (permalink / raw)
  To: Hubert Mazur; +Cc: gjb, upstream, jswinney, ffmpeg-devel, mw, spop

On Mon, 25 Jul 2022, Hubert Mazur wrote:

> Provide optimized implementation of pix_abs8 function for arm64.
>
> Performance comparison tests are shown below.
> - pix_abs_1_0_c: 105.2
> - pix_abs_1_0_neon: 21.4
> - sad_1_c: 107.2
> - sad_1_neon: 20.9
>
> Benchmarks and tests are run with checkasm tool on AWS Graviton 3.
> ---
> libavcodec/aarch64/me_cmp_init_aarch64.c |  4 ++
> libavcodec/aarch64/me_cmp_neon.S         | 53 ++++++++++++++++++++++++
> 2 files changed, 57 insertions(+)
>
> diff --git a/libavcodec/aarch64/me_cmp_init_aarch64.c b/libavcodec/aarch64/me_cmp_init_aarch64.c
> index 89c817990c..7d7dc38754 100644
> --- a/libavcodec/aarch64/me_cmp_init_aarch64.c
> +++ b/libavcodec/aarch64/me_cmp_init_aarch64.c
> @@ -31,6 +31,8 @@ int ff_pix_abs16_x2_neon(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
>                       ptrdiff_t stride, int h);
> int ff_pix_abs16_y2_neon(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
>                       ptrdiff_t stride, int h);
> +int ff_pix_abs8_neon(MpegEncContext *s, uint8_t *blk1, uint8_t *blk2,
> +                      ptrdiff_t stride, int h);
> 
> int sse16_neon(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
>                   ptrdiff_t stride, int h);
> @@ -48,8 +50,10 @@ av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
>         c->pix_abs[0][1] = ff_pix_abs16_x2_neon;
>         c->pix_abs[0][2] = ff_pix_abs16_y2_neon;
>         c->pix_abs[0][3] = ff_pix_abs16_xy2_neon;
> +        c->pix_abs[1][0] = ff_pix_abs8_neon;
>
>         c->sad[0] = ff_pix_abs16_neon;
> +        c->sad[1] = ff_pix_abs8_neon;
>         c->sse[0] = sse16_neon;
>         c->sse[1] = sse8_neon;
>         c->sse[2] = sse4_neon;
> diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S
> index dcaffc9b73..f2dd63ced1 100644
> --- a/libavcodec/aarch64/me_cmp_neon.S
> +++ b/libavcodec/aarch64/me_cmp_neon.S
> @@ -72,6 +72,59 @@ function ff_pix_abs16_neon, export=1
>         ret
> endfunc
> 
> +function ff_pix_abs8_neon, export=1
> +        // x0           unused
> +        // x1           uint8_t *pix1
> +        // x2           uint8_t *pix2
> +        // x3           ptrdiff_t stride
> +        // x4           int h
> +
> +        movi            d18, #0
> +        cmp             w4, #4
> +        b.lt            2f
> +
> +// make 4 iterations at once
> +1:
> +        ld1             {v0.8b}, [x1], x3               // Load pix1 for first iteration
> +        ld1             {v1.8b}, [x2], x3               // Load pix2 for first iteration
> +        uabdl           v30.8h, v0.8b, v1.8b            // Absolute difference, first iteration
> +        ld1             {v2.8b}, [x1], x3               // Load pix1 for second iteration
> +        ld1             {v3.8b}, [x2], x3               // Load pix2 for second iteration
> +        uabal           v30.8h, v2.8b, v3.8b            // Absolute difference, second iteration
> +        ld1             {v4.8b}, [x1], x3               // Load pix1 for third iteration
> +        ld1             {v5.8b}, [x2], x3               // Load pix2 for third iteration
> +        uabal           v30.8h, v4.8b, v5.8b            // Absolute difference, third iteration
> +        ld1             {v6.8b}, [x1], x3               // Load pix1 for foruth iteration
> +        ld1             {v7.8b}, [x2], x3               // Load pix2 for fourth iteration
> +        uabal           v30.8h, v6.8b, v7.8b            // Absolute difference, foruth iteration

This is maybe the simplest example so far, where the unrolled version here 
just is 4 identical serial copies of the same set of 3 instructions; this 
maybe helps a bit on some CPUs, but it doesn't help nearly as much as it 
can on others, if it would be better unrolled.

I.e., same comments as for the other patches; improve interleaving, don't 
do uaddlv once per iteration.

// Martin

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [FFmpeg-devel] [PATCH] lavc/aarch64: Add neon implementation for pix_abs16_y2
  2022-07-25 11:17   ` [FFmpeg-devel] [PATCH] " Hubert Mazur
  2022-08-04  8:08     ` Martin Storsjö
@ 2022-08-04  8:12     ` Martin Storsjö
  1 sibling, 0 replies; 25+ messages in thread
From: Martin Storsjö @ 2022-08-04  8:12 UTC (permalink / raw)
  To: Hubert Mazur; +Cc: gjb, upstream, jswinney, ffmpeg-devel, mw, spop

On Mon, 25 Jul 2022, Hubert Mazur wrote:

> Provide optimized implementation of pix_abs16_y2 function for arm64.
>
> Performance comparison tests are shown below.
> pix_abs_0_2_c: 308.5
> pix_abs_0_2_neon: 39.2
>
> Benchmarks and tests run with checkasm tool on AWS Graviton 3.
>
> Signed-off-by: Hubert Mazur <hum@semihalf.com>
> ---
> libavcodec/aarch64/me_cmp_init_aarch64.c |  3 +
> libavcodec/aarch64/me_cmp_neon.S         | 73 ++++++++++++++++++++++++
> 2 files changed, 76 insertions(+)

> +// iterate by one
> +2:
> +
> +        ld1             {v1.16b}, [x2], x3              // Load pix2
> +        ld1             {v2.16b}, [x5], x3              // Load pix3
> +        urhadd          v30.16b, v1.16b, v2.16b         // Rounding halving add
> +        ld1             {v0.16b}, [x1], x3              // Load pix1
> +        uabd            v30.16b, v30.16b, v30.16b

This should be "uabd v30, v30, v0" here too - please check the uncommon 
codepaths too (until we can make checkasm test them by default).

// Martin

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [FFmpeg-devel] [PATCH 4/5] lavc/aarch64: Add neon implementation for sse8
  2022-08-16 12:20 ` [FFmpeg-devel] [PATCH 4/5] lavc/aarch64: Add neon implementation for sse8 Hubert Mazur
@ 2022-08-18  9:18   ` Martin Storsjö
  0 siblings, 0 replies; 25+ messages in thread
From: Martin Storsjö @ 2022-08-18  9:18 UTC (permalink / raw)
  To: Hubert Mazur; +Cc: gjb, upstream, jswinney, ffmpeg-devel, mw, spop

On Tue, 16 Aug 2022, Hubert Mazur wrote:

> Provide optimized implementation of sse8 function for arm64.
>
> Performance comparison tests are shown below.
> - sse_1_c: 130.7
> - sse_1_neon: 29.7
>
> Benchmarks and tests run with checkasm tool on AWS Graviton 3.
>
> Signed-off-by: Hubert Mazur <hum@semihalf.com>
> ---
> libavcodec/aarch64/me_cmp_init_aarch64.c |  4 ++
> libavcodec/aarch64/me_cmp_neon.S         | 66 ++++++++++++++++++++++++
> 2 files changed, 70 insertions(+)
>
> diff --git a/libavcodec/aarch64/me_cmp_init_aarch64.c b/libavcodec/aarch64/me_cmp_init_aarch64.c
> index 1c36d3d7cb..2f51f0497e 100644
> --- a/libavcodec/aarch64/me_cmp_init_aarch64.c
> +++ b/libavcodec/aarch64/me_cmp_init_aarch64.c
> @@ -34,9 +34,12 @@ int ff_pix_abs16_y2_neon(MpegEncContext *v, const uint8_t *pix1, const uint8_t *
> 
> int sse16_neon(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
>                       ptrdiff_t stride, int h);
> +int sse8_neon(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
> +                      ptrdiff_t stride, int h);
> int sse4_neon(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
>                       ptrdiff_t stride, int h);

Same as the others about function declaration indentation

> diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S
> index 0ec9c0465b..3f4266d4d5 100644
> --- a/libavcodec/aarch64/me_cmp_neon.S
> +++ b/libavcodec/aarch64/me_cmp_neon.S
> @@ -347,6 +347,72 @@ function sse16_neon, export=1
>         ret
> endfunc
> 
> +function sse8_neon, export=1
> +        // x0 - unused
> +        // x1 - pix1
> +        // x2 - pix2
> +        // x3 - stride
> +        // w4 - h
> +
> +        movi            d18, #0

Same as the others about d18

> +        movi            v21.4s, #0
> +        movi            v20.4s, #0
> +        cmp             w4, #4
> +        b.le            2f
> +
> +// make 4 iterations at once
> +1:
> +
> +        // res = abs(pix1[0] - pix2[0])
> +        // res * res
> +
> +        ld1             {v0.8b}, [x1], x3               // Load pix1 for first iteration
> +        ld1             {v1.8b}, [x2], x3               // Load pix2 for second iteration
> +        ld1             {v2.8b}, [x1], x3               // Load pix1 for second iteration
> +        ld1             {v3.8b}, [x2], x3               // Load pix2 for second iteration
> +        uabdl           v30.8h, v0.8b, v1.8b            // Absolute difference, first iteration
> +        ld1             {v4.8b}, [x1], x3               // Load pix1 for third iteration
> +        ld1             {v5.8b}, [x2], x3               // Load pix2 for third iteration
> +        uabdl           v29.8h, v2.8b, v3.8b            // Absolute difference, second iteration
> +        umlal           v21.4s, v30.4h, v30.4h          // Multiply lower half, first iteration
> +        ld1             {v6.8b}, [x1], x3               // Load pix1 for fourth iteration
> +        ld1             {v7.8b}, [x2], x3               // Load pix2 for fourth iteration
> +        uabdl           v28.8h, v4.8b, v5.8b            // Absolute difference, third iteration
> +        umlal           v21.4s, v29.4h, v29.4h          // Multiply lower half, second iteration
> +        umlal2          v20.4s, v30.8h, v30.8h          // Multiply upper half, second iteration

The comment was wrong here, this is about the first iteration, not the 
second one.

> +        uabdl           v27.8h, v6.8b, v7.8b            // Absolute difference, fourth iteration
> +        umlal           v21.4s, v28.4h, v28.4h          // Multiply lower half, third iteration
> +        umlal2          v20.4s, v29.8h, v29.8h          // Multiply upper half, second iteration
> +        sub             w4, w4, #4                      // h -= 4
> +        umlal2          v20.4s, v28.8h, v28.8h          // Multiply upper half, third iteration
> +        umlal           v21.4s, v27.4h, v27.4h          // Multiply lower half, fourth iteration
> +        cmp             w4, #4
> +        umlal2          v20.4s, v27.8h, v27.8h          // Multiply upper half, fourth iteration
> +        b.ge            1b
> +
> +        cbz             w4, 3f
> +
> +// iterate by one
> +2:
> +        ld1             {v0.8b}, [x1], x3               // Load pix1
> +        ld1             {v1.8b}, [x2], x3               // Load pix2
> +        subs            w4, w4, #1
> +        uabdl           v30.8h, v0.8b, v1.8b
> +        umlal           v21.4s, v30.4h, v30.4h
> +        umlal2          v20.4s, v30.8h, v30.8h
> +
> +        b.ne            2b
> +
> +3:
> +        add             v21.4s, v21.4s, v20.4s          // Add accumulator vectors together
> +        uaddlv          d17, v21.4s                     // Add up vector
> +        add             d18, d18, d17
> +

Unnecesssary d18.

// Martin

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 25+ messages in thread

* [FFmpeg-devel] [PATCH 4/5] lavc/aarch64: Add neon implementation for sse8
  2022-08-16 12:20 [FFmpeg-devel] [PATCH 0/5] Provide neon implementation for me_cmp functions Hubert Mazur
@ 2022-08-16 12:20 ` Hubert Mazur
  2022-08-18  9:18   ` Martin Storsjö
  0 siblings, 1 reply; 25+ messages in thread
From: Hubert Mazur @ 2022-08-16 12:20 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: gjb, upstream, jswinney, Hubert Mazur, martin, mw, spop

Provide optimized implementation of sse8 function for arm64.

Performance comparison tests are shown below.
- sse_1_c: 130.7
- sse_1_neon: 29.7

Benchmarks and tests run with checkasm tool on AWS Graviton 3.

Signed-off-by: Hubert Mazur <hum@semihalf.com>
---
 libavcodec/aarch64/me_cmp_init_aarch64.c |  4 ++
 libavcodec/aarch64/me_cmp_neon.S         | 66 ++++++++++++++++++++++++
 2 files changed, 70 insertions(+)

diff --git a/libavcodec/aarch64/me_cmp_init_aarch64.c b/libavcodec/aarch64/me_cmp_init_aarch64.c
index 1c36d3d7cb..2f51f0497e 100644
--- a/libavcodec/aarch64/me_cmp_init_aarch64.c
+++ b/libavcodec/aarch64/me_cmp_init_aarch64.c
@@ -34,9 +34,12 @@ int ff_pix_abs16_y2_neon(MpegEncContext *v, const uint8_t *pix1, const uint8_t *
 
 int sse16_neon(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
                       ptrdiff_t stride, int h);
+int sse8_neon(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
+                      ptrdiff_t stride, int h);
 int sse4_neon(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
                       ptrdiff_t stride, int h);
 
+
 av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
 {
     int cpu_flags = av_get_cpu_flags();
@@ -49,6 +52,7 @@ av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
 
         c->sad[0] = ff_pix_abs16_neon;
         c->sse[0] = sse16_neon;
+        c->sse[1] = sse8_neon;
         c->sse[2] = sse4_neon;
     }
 }
diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S
index 0ec9c0465b..3f4266d4d5 100644
--- a/libavcodec/aarch64/me_cmp_neon.S
+++ b/libavcodec/aarch64/me_cmp_neon.S
@@ -347,6 +347,72 @@ function sse16_neon, export=1
         ret
 endfunc
 
+function sse8_neon, export=1
+        // x0 - unused
+        // x1 - pix1
+        // x2 - pix2
+        // x3 - stride
+        // w4 - h
+
+        movi            d18, #0
+        movi            v21.4s, #0
+        movi            v20.4s, #0
+        cmp             w4, #4
+        b.le            2f
+
+// make 4 iterations at once
+1:
+
+        // res = abs(pix1[0] - pix2[0])
+        // res * res
+
+        ld1             {v0.8b}, [x1], x3               // Load pix1 for first iteration
+        ld1             {v1.8b}, [x2], x3               // Load pix2 for second iteration
+        ld1             {v2.8b}, [x1], x3               // Load pix1 for second iteration
+        ld1             {v3.8b}, [x2], x3               // Load pix2 for second iteration
+        uabdl           v30.8h, v0.8b, v1.8b            // Absolute difference, first iteration
+        ld1             {v4.8b}, [x1], x3               // Load pix1 for third iteration
+        ld1             {v5.8b}, [x2], x3               // Load pix2 for third iteration
+        uabdl           v29.8h, v2.8b, v3.8b            // Absolute difference, second iteration
+        umlal           v21.4s, v30.4h, v30.4h          // Multiply lower half, first iteration
+        ld1             {v6.8b}, [x1], x3               // Load pix1 for fourth iteration
+        ld1             {v7.8b}, [x2], x3               // Load pix2 for fourth iteration
+        uabdl           v28.8h, v4.8b, v5.8b            // Absolute difference, third iteration
+        umlal           v21.4s, v29.4h, v29.4h          // Multiply lower half, second iteration
+        umlal2          v20.4s, v30.8h, v30.8h          // Multiply upper half, second iteration
+        uabdl           v27.8h, v6.8b, v7.8b            // Absolute difference, fourth iteration
+        umlal           v21.4s, v28.4h, v28.4h          // Multiply lower half, third iteration
+        umlal2          v20.4s, v29.8h, v29.8h          // Multiply upper half, second iteration
+        sub             w4, w4, #4                      // h -= 4
+        umlal2          v20.4s, v28.8h, v28.8h          // Multiply upper half, third iteration
+        umlal           v21.4s, v27.4h, v27.4h          // Multiply lower half, fourth iteration
+        cmp             w4, #4
+        umlal2          v20.4s, v27.8h, v27.8h          // Multiply upper half, fourth iteration
+        b.ge            1b
+
+        cbz             w4, 3f
+
+// iterate by one
+2:
+        ld1             {v0.8b}, [x1], x3               // Load pix1
+        ld1             {v1.8b}, [x2], x3               // Load pix2
+        subs            w4, w4, #1
+        uabdl           v30.8h, v0.8b, v1.8b
+        umlal           v21.4s, v30.4h, v30.4h
+        umlal2          v20.4s, v30.8h, v30.8h
+
+        b.ne            2b
+
+3:
+        add             v21.4s, v21.4s, v20.4s          // Add accumulator vectors together
+        uaddlv          d17, v21.4s                     // Add up vector
+        add             d18, d18, d17
+
+        fmov            w0, s18
+        ret
+
+endfunc
+
 function sse4_neon, export=1
         // x0 - unused
         // x1 - pix1
-- 
2.34.1

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 25+ messages in thread

end of thread, other threads:[~2022-08-18  9:18 UTC | newest]

Thread overview: 25+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-07-15  8:02 [FFmpeg-devel] [PATCH 0/5] Add neon implementation for me_cmp functions Hubert Mazur
2022-07-15  8:02 ` [FFmpeg-devel] [PATCH 1/5] lavc/aarch64: Add neon implementation for sse16 Hubert Mazur
2022-07-25 11:12   ` [FFmpeg-devel] [PATCH] " Hubert Mazur
2022-08-03 13:22     ` Martin Storsjö
2022-08-04  7:46     ` Martin Storsjö
2022-07-15  8:02 ` [FFmpeg-devel] [PATCH 2/5] lavc/aarch64: Add neon implementation for sse4 Hubert Mazur
2022-07-21 21:43   ` Martin Storsjö
2022-07-22 21:30   ` Swinney, Jonathan
2022-07-25 11:15   ` [FFmpeg-devel] [PATCH] " Hubert Mazur
2022-07-28 18:50     ` Swinney, Jonathan
2022-07-28 18:51       ` Swinney, Jonathan
2022-07-29  7:26         ` Hubert Mazur
2022-08-04  8:00     ` Martin Storsjö
2022-07-15  8:02 ` [FFmpeg-devel] [PATCH 3/5] lavc/aarch64: Add neon implementation for pix_abs16_y2 Hubert Mazur
2022-07-25 11:17   ` [FFmpeg-devel] [PATCH] " Hubert Mazur
2022-08-04  8:08     ` Martin Storsjö
2022-08-04  8:12     ` Martin Storsjö
2022-07-15  8:02 ` [FFmpeg-devel] [PATCH 4/5] lavc/aarch64: Add neon implementation for sse8 Hubert Mazur
2022-07-25 11:18   ` [FFmpeg-devel] [PATCH] " Hubert Mazur
2022-08-04  8:04     ` Martin Storsjö
2022-07-15  8:02 ` [FFmpeg-devel] [PATCH 5/5] lavc/aarch64: Add neon implementation for pix_abs8 Hubert Mazur
2022-07-25 11:21   ` [FFmpeg-devel] [PATCH] " Hubert Mazur
2022-08-04  8:10     ` Martin Storsjö
2022-08-16 12:20 [FFmpeg-devel] [PATCH 0/5] Provide neon implementation for me_cmp functions Hubert Mazur
2022-08-16 12:20 ` [FFmpeg-devel] [PATCH 4/5] lavc/aarch64: Add neon implementation for sse8 Hubert Mazur
2022-08-18  9:18   ` Martin Storsjö

Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

This inbox may be cloned and mirrored by anyone:

	git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
		ffmpegdev@gitmailbox.com
	public-inbox-index ffmpegdev

Example config snippet for mirrors.


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git