Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
 help / color / mirror / Atom feed
* [FFmpeg-devel] [PATCH 1/4] lavc/aarch64: Add neon implementation for pix_abs8 functions.
@ 2022-09-26  9:15 Grzegorz Bernacki
  2022-09-26  9:15 ` [FFmpeg-devel] [PATCH 2/4] lavc/aarch64: Provide neon implementation of nsse8 Grzegorz Bernacki
                   ` (3 more replies)
  0 siblings, 4 replies; 9+ messages in thread
From: Grzegorz Bernacki @ 2022-09-26  9:15 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: gjb, upstream, jswinney, hum, martin, mw, spop

Provide optimized implementation of pix_abs8 function for arm64.

Performance comparison tests are shown below:
pix_abs_1_1_c: 162.5
pix_abs_1_1_neon: 27.0
pix_abs_1_2_c: 174.0
pix_abs_1_2_neon: 23.5
pix_abs_1_3_c: 203.2
pix_abs_1_3_neon: 34.7

Benchmarks and tests are run with checkasm tool on AWS Graviton 3.

Signed-off-by: Grzegorz Bernacki <gjb@semihalf.com>
---
 libavcodec/aarch64/me_cmp_init_aarch64.c |   9 ++
 libavcodec/aarch64/me_cmp_neon.S         | 193 +++++++++++++++++++++++
 2 files changed, 202 insertions(+)

diff --git a/libavcodec/aarch64/me_cmp_init_aarch64.c b/libavcodec/aarch64/me_cmp_init_aarch64.c
index e143f0816e..3459403ee5 100644
--- a/libavcodec/aarch64/me_cmp_init_aarch64.c
+++ b/libavcodec/aarch64/me_cmp_init_aarch64.c
@@ -59,6 +59,12 @@ int pix_median_abs16_neon(MpegEncContext *v, const uint8_t *pix1, const uint8_t
                           ptrdiff_t stride, int h);
 int pix_median_abs8_neon(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
                          ptrdiff_t stride, int h);
+int ff_pix_abs8_x2_neon(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
+                        ptrdiff_t stride, int h);
+int ff_pix_abs8_y2_neon(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
+                        ptrdiff_t stride, int h);
+int ff_pix_abs8_xy2_neon(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
+                         ptrdiff_t stride, int h);
 
 av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
 {
@@ -70,6 +76,9 @@ av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
         c->pix_abs[0][2] = ff_pix_abs16_y2_neon;
         c->pix_abs[0][3] = ff_pix_abs16_xy2_neon;
         c->pix_abs[1][0] = ff_pix_abs8_neon;
+	    c->pix_abs[1][1] = ff_pix_abs8_x2_neon;
+	    c->pix_abs[1][2] = ff_pix_abs8_y2_neon;
+	    c->pix_abs[1][3] = ff_pix_abs8_xy2_neon;
 
         c->sad[0] = ff_pix_abs16_neon;
         c->sad[1] = ff_pix_abs8_neon;
diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S
index 11af4849f9..e03c0c26cd 100644
--- a/libavcodec/aarch64/me_cmp_neon.S
+++ b/libavcodec/aarch64/me_cmp_neon.S
@@ -119,6 +119,199 @@ function ff_pix_abs8_neon, export=1
         ret
 endfunc
 
+function ff_pix_abs8_x2_neon, export=1
+        // x0           unused
+        // x1           uint8_t *pix1
+        // x2           uint8_t *pix2
+        // x3           ptrdiff_t stride
+        // w4           int h
+
+        cmp             w4, #4
+        movi            v26.8h, #0
+        add             x5, x2, #1 // pix2 + 1
+        b.lt            2f
+
+// make 4 iterations at once
+1:
+        ld1             {v1.8b}, [x2], x3
+        ld1             {v2.8b}, [x5], x3
+        ld1             {v0.8b}, [x1], x3
+        ld1             {v4.8b}, [x2], x3
+	urhadd          v30.8b, v1.8b, v2.8b
+        ld1             {v5.8b}, [x5], x3
+        uabal           v26.8h, v0.8b, v30.8b
+        ld1             {v6.8b}, [x1], x3
+	urhadd          v29.8b, v4.8b, v5.8b
+        ld1             {v7.8b}, [x2], x3
+        ld1             {v20.8b}, [x5], x3
+        uabal           v26.8h, v6.8b, v29.8b
+        ld1             {v21.8b}, [x1], x3
+	urhadd          v28.8b, v7.8b, v20.8b
+        ld1             {v22.8b}, [x2], x3
+        ld1             {v23.8b}, [x5], x3
+        uabal           v26.8h, v21.8b, v28.8b
+        sub             w4, w4, #4
+        ld1             {v24.8b}, [x1], x3
+	urhadd          v27.8b, v22.8b, v23.8b
+        cmp             w4, #4
+        uabal           v26.8h, v24.8b, v27.8b
+
+        b.ge            1b
+        cbz             w4, 3f
+
+// iterate by one
+2:
+        ld1             {v1.8b}, [x2], x3
+        ld1             {v2.8b}, [x5], x3
+        ld1             {v0.8b}, [x1], x3
+	urhadd          v30.8b, v1.8b, v2.8b
+        subs            w4, w4, #1
+        uabal           v26.8h, v0.8b, v30.8b
+
+        b.ne            2b
+3:
+        uaddlv          s20, v26.8h
+        fmov            w0, s20
+
+        ret
+
+endfunc
+
+function ff_pix_abs8_y2_neon, export=1
+        // x0           unused
+        // x1           uint8_t *pix1
+        // x2           uint8_t *pix2
+        // x3           ptrdiff_t stride
+        // w4           int h
+
+        cmp             w4, #4
+        movi            v26.8h, #0
+        ld1             {v1.8b}, [x2], x3
+        b.lt            2f
+
+// make 4 iterations at once
+1:
+        ld1             {v2.8b}, [x2], x3
+        ld1             {v0.8b}, [x1], x3
+        ld1             {v6.8b}, [x1], x3
+	urhadd          v30.8b, v1.8b, v2.8b
+        ld1             {v5.8b}, [x2], x3
+        ld1             {v21.8b}, [x1], x3
+        uabal           v26.8h, v0.8b, v30.8b
+	urhadd          v29.8b, v2.8b, v5.8b
+        ld1             {v20.8b}, [x2], x3
+        ld1             {v24.8b}, [x1], x3
+        uabal           v26.8h, v6.8b, v29.8b
+	urhadd          v28.8b, v5.8b, v20.8b
+        uabal           v26.8h, v21.8b, v28.8b
+        ld1             {v23.8b}, [x2], x3
+        mov             v1.8b, v23.8b
+        sub             w4, w4, #4
+	urhadd          v27.8b, v20.8b, v23.8b
+        cmp             w4, #4
+        uabal           v26.8h, v24.8b, v27.8b
+
+        b.ge            1b
+        cbz             w4, 3f
+
+// iterate by one
+2:
+        ld1             {v0.8b}, [x1], x3
+        ld1             {v2.8b}, [x2], x3
+	urhadd          v30.8b, v1.8b, v2.8b
+        subs            w4, w4, #1
+        uabal           v26.8h, v0.8b, v30.8b
+        mov             v1.8b, v2.8b
+
+        b.ne            2b
+3:
+        uaddlv          s20, v26.8h
+        fmov            w0, s20
+
+        ret
+
+endfunc
+
+function ff_pix_abs8_xy2_neon, export=1
+        // x0           unused
+        // x1           uint8_t *pix1
+        // x2           uint8_t *pix2
+        // x3           ptrdiff_t stride
+        // w4           int h
+
+        movi            v31.8h, #0
+        add             x0, x2, 1  // pix2 + 1
+
+        add             x5, x2, x3 // pix2 + stride = pix3
+        cmp             w4, #4
+        add             x6, x5, 1 // pix3 + stride + 1
+
+        b.lt            2f
+
+        ld1             {v0.8b}, [x2], x3
+        ld1             {v1.8b}, [x0], x3
+        uaddl           v2.8h, v0.8b, v1.8b
+
+// make 4 iterations at once
+1:
+        ld1             {v4.8b}, [x5], x3
+        ld1             {v5.8b}, [x6], x3
+        ld1             {v7.8b}, [x5], x3
+        uaddl           v0.8h, v4.8b, v5.8b
+        ld1             {v16.8b}, [x6], x3
+        add             v4.8h, v0.8h, v2.8h
+        ld1             {v5.8b}, [x1], x3
+        rshrn           v4.8b, v4.8h, #2
+        uaddl           v7.8h, v7.8b, v16.8b
+        uabal           v31.8h, v5.8b, v4.8b
+        add             v2.8h, v0.8h, v7.8h
+        ld1             {v17.8b}, [x1], x3
+        rshrn           v2.8b, v2.8h, #2
+        ld1             {v20.8b}, [x5], x3
+        uabal           v31.8h, v17.8b, v2.8b
+        ld1             {v21.8b}, [x6], x3
+        ld1             {v25.8b}, [x5], x3
+        uaddl           v20.8h, v20.8b, v21.8b
+        ld1             {v26.8b}, [x6], x3
+        add             v7.8h, v7.8h, v20.8h
+        uaddl           v25.8h, v25.8b, v26.8b
+        rshrn           v7.8b, v7.8h, #2
+        ld1             {v22.8b}, [x1], x3
+        mov             v2.16b, v25.16b
+        uabal           v31.8h, v22.8b, v7.8b
+        add             v20.8h, v20.8h, v25.8h
+        ld1             {v27.8b}, [x1], x3
+        sub             w4, w4, #4
+        rshrn           v20.8b, v20.8h, #2
+        cmp             w4, #4
+        uabal           v31.8h, v27.8b, v20.8b
+
+        b.ge            1b
+
+        cbz             w4, 3f
+
+// iterate by one
+2:
+        ld1             {v0.8b}, [x5], x3
+        ld1             {v1.8b}, [x6], x3
+        ld1             {v4.8b}, [x1], x3
+        uaddl           v21.8h, v0.8b, v1.8b
+        subs            w4, w4, #1
+        add             v3.8h, v2.8h, v21.8h
+        mov             v2.16b, v21.16b
+        rshrn           v3.8b, v3.8h, #2
+        uabal           v31.8h, v4.8b, v3.8b
+        b.ne            2b
+
+3:
+        uaddlv          s18, v31.8h
+        fmov            w0, s18
+
+        ret
+
+endfunc
+
+
 function ff_pix_abs16_xy2_neon, export=1
         // x0           unused
         // x1           uint8_t *pix1
-- 
2.25.1

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 9+ messages in thread

* [FFmpeg-devel] [PATCH 2/4] lavc/aarch64: Provide neon implementation of nsse8
  2022-09-26  9:15 [FFmpeg-devel] [PATCH 1/4] lavc/aarch64: Add neon implementation for pix_abs8 functions Grzegorz Bernacki
@ 2022-09-26  9:15 ` Grzegorz Bernacki
  2022-09-28  9:08   ` Martin Storsjö
  2022-09-26  9:15 ` [FFmpeg-devel] [PATCH 3/4] lavc/aarch64: Provide optimized implementation of vsse8 for arm64 Grzegorz Bernacki
                   ` (2 subsequent siblings)
  3 siblings, 1 reply; 9+ messages in thread
From: Grzegorz Bernacki @ 2022-09-26  9:15 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: gjb, upstream, jswinney, hum, martin, mw, spop

Add vectorized implementation of nsse8 function.

Performance comparison tests are shown below.
- nsse_1_c: 256.0
- nsse_1_neon: 82.7

Benchmarks and tests run with checkasm tool on AWS Graviton 3.

Signed-off-by: Grzegorz Bernacki <gjb@semihalf.com>
---
 libavcodec/aarch64/me_cmp_init_aarch64.c | 15 ++++
 libavcodec/aarch64/me_cmp_neon.S         | 99 ++++++++++++++++++++++++
 2 files changed, 114 insertions(+)

diff --git a/libavcodec/aarch64/me_cmp_init_aarch64.c b/libavcodec/aarch64/me_cmp_init_aarch64.c
index 3459403ee5..2c61cfcf63 100644
--- a/libavcodec/aarch64/me_cmp_init_aarch64.c
+++ b/libavcodec/aarch64/me_cmp_init_aarch64.c
@@ -66,6 +66,11 @@ int ff_pix_abs8_y2_neon(MpegEncContext *v, const uint8_t *pix1, const uint8_t *p
 int ff_pix_abs8_xy2_neon(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
                          ptrdiff_t stride, int h);
 
+int nsse8_neon(int multiplier, const uint8_t *s, const uint8_t *s2,
+               ptrdiff_t stride, int h);
+int nsse8_neon_wrapper(MpegEncContext *c, const uint8_t *s1, const uint8_t *s2,
+                       ptrdiff_t stride, int h);
+
 av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
 {
     int cpu_flags = av_get_cpu_flags();
@@ -94,6 +99,7 @@ av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
         c->vsse[4] = vsse_intra16_neon;
 
         c->nsse[0] = nsse16_neon_wrapper;
+        c->nsse[1] = nsse8_neon_wrapper;
 
         c->median_sad[0] = pix_median_abs16_neon;
         c->median_sad[1] = pix_median_abs8_neon;
@@ -108,3 +114,12 @@ int nsse16_neon_wrapper(MpegEncContext *c, const uint8_t *s1, const uint8_t *s2,
     else
         return nsse16_neon(8, s1, s2, stride, h);
 }
+
+int nsse8_neon_wrapper(MpegEncContext *c, const uint8_t *s1, const uint8_t *s2,
+                       ptrdiff_t stride, int h)
+{
+    if (c)
+        return nsse8_neon(c->avctx->nsse_weight, s1, s2, stride, h);
+    else
+        return nsse8_neon(8, s1, s2, stride, h);
+}
diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S
index e03c0c26cd..6f7c7c1690 100644
--- a/libavcodec/aarch64/me_cmp_neon.S
+++ b/libavcodec/aarch64/me_cmp_neon.S
@@ -1163,6 +1163,105 @@ function nsse16_neon, export=1
         ret
 endfunc
 
+function nsse8_neon, export=1
+        // x0           multiplier
+        // x1           uint8_t *pix1
+        // x2           uint8_t *pix2
+        // x3           ptrdiff_t stride
+        // w4           int h
+
+        str             x0, [sp, #-0x40]!
+        stp             x1, x2, [sp, #0x10]
+        stp             x3, x4, [sp, #0x20]
+        str             x30, [sp, #0x30]
+        bl              X(sse8_neon)
+        ldr             x30, [sp, #0x30]
+        mov             w9, w0                                  // here we store score1
+        ldr             x5, [sp]
+        ldp             x1, x2, [sp, #0x10]
+        ldp             x3, x4, [sp, #0x20]
+        add             sp, sp, #0x40
+
+        movi            v16.8h, #0
+        movi            v17.8h, #0
+        movi            v18.8h, #0
+        movi            v19.8h, #0
+
+        ld1             {v0.8b}, [x1], x3
+        subs            w4, w4, #1                              // we need to make h-1 iterations
+        ext             v1.8b, v0.8b, v0.8b, #1                 // x1 + 1
+        ld1             {v2.8b}, [x2], x3
+        cmp             w4, #2
+        ext             v3.8b, v2.8b, v2.8b, #1                 // x2 + 1
+
+        b.lt            2f
+
+// make 2 iterations at once
+1:
+        ld1             {v4.8b}, [x1], x3
+        ld1             {v20.8b}, [x1], x3
+        ld1             {v6.8b}, [x2], x3
+        ext             v5.8b, v4.8b, v4.8b, #1                 // x1 + stride + 1
+        ext             v21.8b, v20.8b, v20.8b, #1
+        ld1             {v22.8b}, [x2], x3
+        ext             v7.8b, v6.8b, v6.8b, #1                 // x2 + stride + 1
+        usubl           v31.8h, v0.8b, v4.8b
+        ext             v23.8b, v22.8b, v22.8b, #1
+        usubl           v29.8h, v1.8b, v5.8b
+        usubl           v27.8h, v2.8b, v6.8b
+        usubl           v25.8h, v3.8b, v7.8b
+        saba            v16.8h, v31.8h, v29.8h
+        usubl           v31.8h, v4.8b, v20.8b
+        saba            v18.8h, v27.8h, v25.8h
+        sub             w4, w4, #2
+        usubl           v29.8h, v5.8b, v21.8b
+        mov             v0.16b, v20.16b
+        mov             v1.16b, v21.16b
+        saba            v16.8h, v31.8h, v29.8h
+        usubl           v27.8h, v6.8b, v22.8b
+        usubl           v25.8h, v7.8b, v23.8b
+        mov             v2.16b, v22.16b
+        mov             v3.16b, v23.16b
+        cmp             w4, #2
+        saba            v18.8h, v27.8h, v25.8h
+        b.ge            1b
+        cbz             w4, 3f
+
+// iterate by one
+2:
+        ld1             {v4.8b}, [x1], x3
+        subs            w4, w4, #1
+        ext             v5.8b, v4.8b, v4.8b, #1                 // x1 + stride + 1
+        ld1             {v6.8b}, [x2], x3
+        usubl           v31.8h, v0.8b, v4.8b
+        ext             v7.8b, v6.8b, v6.8b, #1                 // x2 + stride + 1
+
+        usubl           v29.8h, v1.8b, v5.8b
+        saba            v16.8h, v31.8h, v29.8h
+        usubl           v27.8h, v2.8b, v6.8b
+        usubl           v25.8h, v3.8b, v7.8b
+        saba            v18.8h, v27.8h, v25.8h
+
+        mov             v0.16b, v4.16b
+        mov             v1.16b, v5.16b
+        mov             v2.16b, v6.16b
+        mov             v3.16b, v7.16b
+
+        cbnz            w4, 2b
+
+3:
+        sqsub           v16.8h, v16.8h, v18.8h
+        ins             v16.h[7], wzr
+        saddlv          s16, v16.8h
+        sqabs           s16, s16
+        fmov            w0, s16
+
+        mul             w0, w0, w5
+        add             w0, w0, w9
+
+        ret
+endfunc
+
 function pix_median_abs16_neon, export=1
         // x0           unused
         // x1           uint8_t *pix1
-- 
2.25.1

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 9+ messages in thread

* [FFmpeg-devel] [PATCH 3/4] lavc/aarch64: Provide optimized implementation of vsse8 for arm64.
  2022-09-26  9:15 [FFmpeg-devel] [PATCH 1/4] lavc/aarch64: Add neon implementation for pix_abs8 functions Grzegorz Bernacki
  2022-09-26  9:15 ` [FFmpeg-devel] [PATCH 2/4] lavc/aarch64: Provide neon implementation of nsse8 Grzegorz Bernacki
@ 2022-09-26  9:15 ` Grzegorz Bernacki
  2022-09-28  9:09   ` Martin Storsjö
  2022-09-26  9:15 ` [FFmpeg-devel] [PATCH 4/4] lavc/aarch64: Add neon implementation for vsse_intra8 Grzegorz Bernacki
  2022-09-28  9:06 ` [FFmpeg-devel] [PATCH 1/4] lavc/aarch64: Add neon implementation for pix_abs8 functions Martin Storsjö
  3 siblings, 1 reply; 9+ messages in thread
From: Grzegorz Bernacki @ 2022-09-26  9:15 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: gjb, upstream, jswinney, hum, martin, mw, spop

Provide optimized implementation of vsse8 for arm64.

Performance comparison tests are shown below.
- vsse_1_c: 141.5
- vsse_1_neon: 32.5

Benchmarks and tests are run with checkasm tool on AWS Graviton 3.

Signed-off-by: Grzegorz Bernacki <gjb@semihalf.com>
---
 libavcodec/aarch64/me_cmp_init_aarch64.c |  5 ++
 libavcodec/aarch64/me_cmp_neon.S         | 70 ++++++++++++++++++++++++
 2 files changed, 75 insertions(+)

diff --git a/libavcodec/aarch64/me_cmp_init_aarch64.c b/libavcodec/aarch64/me_cmp_init_aarch64.c
index 2c61cfcf63..f247372c94 100644
--- a/libavcodec/aarch64/me_cmp_init_aarch64.c
+++ b/libavcodec/aarch64/me_cmp_init_aarch64.c
@@ -71,6 +71,9 @@ int nsse8_neon(int multiplier, const uint8_t *s, const uint8_t *s2,
 int nsse8_neon_wrapper(MpegEncContext *c, const uint8_t *s1, const uint8_t *s2,
                        ptrdiff_t stride, int h);
 
+int vsse8_neon(MpegEncContext *c, const uint8_t *s1, const uint8_t *s2,
+               ptrdiff_t stride, int h);
+
 av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
 {
     int cpu_flags = av_get_cpu_flags();
@@ -96,6 +99,8 @@ av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
         c->vsad[5] = vsad_intra8_neon;
 
         c->vsse[0] = vsse16_neon;
+        c->vsse[1] = vsse8_neon;
+
         c->vsse[4] = vsse_intra16_neon;
 
         c->nsse[0] = nsse16_neon_wrapper;
diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S
index 6f7c7c1690..386d2de0c5 100644
--- a/libavcodec/aarch64/me_cmp_neon.S
+++ b/libavcodec/aarch64/me_cmp_neon.S
@@ -843,6 +843,76 @@ function vsad16_neon, export=1
         ret
 endfunc
 
+function vsse8_neon, export=1
+        // x0           unused
+        // x1           uint8_t *pix1
+        // x2           uint8_t *pix2
+        // x3           ptrdiff_t stride
+        // w4           int h
+
+        ld1             {v0.8b}, [x1], x3              // Load pix1[0], first iteration
+        ld1             {v1.8b}, [x2], x3              // Load pix2[0], first iteration
+
+        sub             w4, w4, #1                      // we need to make h-1 iterations
+        movi            v16.4s, #0
+        movi            v17.4s, #0
+
+        cmp             w4, #3                          // check if we can make 3 iterations at once
+        usubl           v31.8h, v0.8b, v1.8b            // Signed difference of pix1[0] - pix2[0], first iteration
+        b.le            2f
+
+
+1:
+        // x = abs(pix1[0] - pix2[0] - pix1[0 + stride] + pix2[0 + stride])
+        // res = (x) * (x)
+        ld1             {v0.8b}, [x1], x3              // Load pix1[0 + stride], first iteration
+        ld1             {v1.8b}, [x2], x3              // Load pix2[0 + stride], first iteration
+        ld1             {v2.8b}, [x1], x3              // Load pix1[0 + stride], second iteration
+        ld1             {v3.8b}, [x2], x3              // Load pix2[0 + stride], second iteration
+        usubl           v29.8h, v0.8b, v1.8b
+        usubl2          v28.8h, v0.16b, v1.16b
+        ld1             {v4.8b}, [x1], x3              // Load pix1[0 + stride], third iteration
+        ld1             {v5.8b}, [x2], x3              // Load pix1[0 + stride], third iteration
+        sabd            v31.8h, v31.8h, v29.8h
+        usubl           v27.8h, v2.8b, v3.8b
+        usubl           v25.8h, v4.8b, v5.8b
+        sabd            v29.8h, v29.8h, v27.8h
+        sabd            v27.8h, v27.8h, v25.8h
+        umlal           v16.4s, v31.4h, v31.4h
+        umlal2          v17.4s, v31.8h, v31.8h
+        mov             v31.16b, v25.16b
+        umlal           v16.4s, v29.4h, v29.4h
+        umlal2          v17.4s, v29.8h, v29.8h
+        sub             w4, w4, #3
+        umlal           v16.4s, v27.4h, v27.4h
+        umlal2          v17.4s, v27.8h, v27.8h
+        cmp             w4, #3
+
+        b.ge            1b
+
+        cbz             w4, 3f
+
+// iterate by once
+2:
+        ld1             {v0.8b}, [x1], x3
+        ld1             {v1.8b}, [x2], x3
+        subs            w4, w4, #1
+        usubl           v29.8h, v0.8b, v1.8b
+        sabd            v31.8h, v31.8h, v29.8h
+        umlal           v16.4s, v31.4h, v31.4h
+        umlal2          v17.4s, v31.8h, v31.8h
+        mov             v31.16b, v29.16b
+        b.ne            2b
+
+3:
+        add             v16.4s, v16.4s, v17.4s
+        uaddlv          d17, v16.4s
+        fmov            w0, s17
+
+        ret
+endfunc
+
+
 function vsse16_neon, export=1
         // x0           unused
         // x1           uint8_t *pix1
-- 
2.25.1

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 9+ messages in thread

* [FFmpeg-devel] [PATCH 4/4] lavc/aarch64: Add neon implementation for vsse_intra8
  2022-09-26  9:15 [FFmpeg-devel] [PATCH 1/4] lavc/aarch64: Add neon implementation for pix_abs8 functions Grzegorz Bernacki
  2022-09-26  9:15 ` [FFmpeg-devel] [PATCH 2/4] lavc/aarch64: Provide neon implementation of nsse8 Grzegorz Bernacki
  2022-09-26  9:15 ` [FFmpeg-devel] [PATCH 3/4] lavc/aarch64: Provide optimized implementation of vsse8 for arm64 Grzegorz Bernacki
@ 2022-09-26  9:15 ` Grzegorz Bernacki
  2022-09-28  9:11   ` Martin Storsjö
  2022-09-28  9:06 ` [FFmpeg-devel] [PATCH 1/4] lavc/aarch64: Add neon implementation for pix_abs8 functions Martin Storsjö
  3 siblings, 1 reply; 9+ messages in thread
From: Grzegorz Bernacki @ 2022-09-26  9:15 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: gjb, upstream, jswinney, hum, martin, mw, spop

Provide optimized implementation for vsse_intra8 for arm64.

Performance tests are shown below.
- vsse_5_c: 87.7
- vsse_5_neon: 26.2

Benchmarks and tests are run with checkasm tool on AWS Graviton 3.
---
 libavcodec/aarch64/me_cmp_init_aarch64.c |  4 ++
 libavcodec/aarch64/me_cmp_neon.S         | 53 ++++++++++++++++++++++++
 2 files changed, 57 insertions(+)

diff --git a/libavcodec/aarch64/me_cmp_init_aarch64.c b/libavcodec/aarch64/me_cmp_init_aarch64.c
index f247372c94..defec37478 100644
--- a/libavcodec/aarch64/me_cmp_init_aarch64.c
+++ b/libavcodec/aarch64/me_cmp_init_aarch64.c
@@ -74,6 +74,9 @@ int nsse8_neon_wrapper(MpegEncContext *c, const uint8_t *s1, const uint8_t *s2,
 int vsse8_neon(MpegEncContext *c, const uint8_t *s1, const uint8_t *s2,
                ptrdiff_t stride, int h);
 
+int vsse_intra8_neon(MpegEncContext *c, const uint8_t *s, const uint8_t *dummy,
+                     ptrdiff_t stride, int h);
+
 av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
 {
     int cpu_flags = av_get_cpu_flags();
@@ -102,6 +105,7 @@ av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
         c->vsse[1] = vsse8_neon;
 
         c->vsse[4] = vsse_intra16_neon;
+        c->vsse[5] = vsse_intra8_neon;
 
         c->nsse[0] = nsse16_neon_wrapper;
         c->nsse[1] = nsse8_neon_wrapper;
diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S
index 386d2de0c5..82ff05d3f0 100644
--- a/libavcodec/aarch64/me_cmp_neon.S
+++ b/libavcodec/aarch64/me_cmp_neon.S
@@ -1111,6 +1111,59 @@ function vsse_intra16_neon, export=1
         ret
 endfunc
 
+function vsse_intra8_neon, export=1
+        // x0           unused
+        // x1           uint8_t *pix1
+        // x2           uint8_t *dummy
+        // x3           ptrdiff_t stride
+        // w4           int h
+
+        ld1             {v0.8b}, [x1], x3
+        movi            v16.4s, #0
+
+        sub             w4, w4, #1 // we need to make h-1 iterations
+        cmp             w4, #3
+        b.lt            2f
+
+1:
+        // v = abs( pix1[0] - pix1[0 + stride] )
+        // score = sum( v * v )
+        ld1             {v1.8b}, [x1], x3
+        ld1             {v2.8b}, [x1], x3
+        uabd            v30.8b, v0.8b, v1.8b
+        ld1             {v3.8b}, [x1], x3
+        umull           v29.8h, v30.8b, v30.8b
+        uabd            v27.8b, v1.8b, v2.8b
+        uadalp          v16.4s, v29.8h
+        umull           v26.8h, v27.8b, v27.8b
+        uabd            v25.8b, v2.8b, v3.8b
+        uadalp          v16.4s, v26.8h
+        umull           v24.8h, v25.8b, v25.8b
+        sub             w4, w4, #3
+        uadalp          v16.4s, v24.8h
+        cmp             w4, #3
+        mov             v0.8b, v3.8b
+
+        b.ge            1b
+        cbz             w4, 3f
+
+// iterate by one
+2:
+        ld1             {v1.8b}, [x1], x3
+        subs            w4, w4, #1
+        uabd            v30.8b, v0.8b, v1.8b
+        mov             v0.8b, v1.8b
+        umull           v29.8h, v30.8b, v30.8b
+        uadalp          v16.4s, v29.8h
+        cbnz            w4, 2b
+
+3:
+        uaddlv          d17, v16.4s
+        fmov            w0, s17
+
+        ret
+endfunc
+
 function nsse16_neon, export=1
         // x0           multiplier
         // x1           uint8_t *pix1
-- 
2.25.1

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [FFmpeg-devel] [PATCH 1/4] lavc/aarch64: Add neon implementation for pix_abs8 functions.
  2022-09-26  9:15 [FFmpeg-devel] [PATCH 1/4] lavc/aarch64: Add neon implementation for pix_abs8 functions Grzegorz Bernacki
                   ` (2 preceding siblings ...)
  2022-09-26  9:15 ` [FFmpeg-devel] [PATCH 4/4] lavc/aarch64: Add neon implementation for vsse_intra8 Grzegorz Bernacki
@ 2022-09-28  9:06 ` Martin Storsjö
  2022-09-28 12:22   ` Grzegorz Bernacki
  3 siblings, 1 reply; 9+ messages in thread
From: Martin Storsjö @ 2022-09-28  9:06 UTC (permalink / raw)
  To: Grzegorz Bernacki; +Cc: upstream, jswinney, hum, ffmpeg-devel, mw, spop

[-- Attachment #1: Type: text/plain, Size: 8077 bytes --]

On Mon, 26 Sep 2022, Grzegorz Bernacki wrote:

> Provide optimized implementation of pix_abs8 function for arm64.
>
> Performance comparison tests are shown below:
> pix_abs_1_1_c: 162.5
> pix_abs_1_1_neon: 27.0
> pix_abs_1_2_c: 174.0
> pix_abs_1_2_neon: 23.5
> pix_abs_1_3_c: 203.2
> pix_abs_1_3_neon: 34.7
>
> Benchmarks and tests are run with checkasm tool on AWS Graviton 3.
>
> Signed-off-by: Grzegorz Bernacki <gjb@semihalf.com>
> ---
> libavcodec/aarch64/me_cmp_init_aarch64.c |   9 ++
> libavcodec/aarch64/me_cmp_neon.S         | 193 +++++++++++++++++++++++
> 2 files changed, 202 insertions(+)

I don't see any changes compared to the version you sent last week? If 
reposting a patchset, please do mention what has changed - or ping the old 
one. (I had it on my radar to review, but reviews of larger amounts of 
code doesn't happen immediately, unfortunately.)

Overall, you seem to have mixed in tabs among regular spaces. Please do 
fix that. (I would have kinda expected us to have a fate test that checks 
this, but apparently we don't.)

> diff --git a/libavcodec/aarch64/me_cmp_init_aarch64.c b/libavcodec/aarch64/me_cmp_init_aarch64.c
> index e143f0816e..3459403ee5 100644
> --- a/libavcodec/aarch64/me_cmp_init_aarch64.c
> +++ b/libavcodec/aarch64/me_cmp_init_aarch64.c
> @@ -59,6 +59,12 @@ int pix_median_abs16_neon(MpegEncContext *v, const uint8_t *pix1, const uint8_t
>                           ptrdiff_t stride, int h);
> int pix_median_abs8_neon(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
>                          ptrdiff_t stride, int h);
> +int ff_pix_abs8_x2_neon(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
> +                        ptrdiff_t stride, int h);
> +int ff_pix_abs8_y2_neon(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
> +                        ptrdiff_t stride, int h);
> +int ff_pix_abs8_xy2_neon(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
> +                         ptrdiff_t stride, int h);
> 
> av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
> {
> @@ -70,6 +76,9 @@ av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
>         c->pix_abs[0][2] = ff_pix_abs16_y2_neon;
>         c->pix_abs[0][3] = ff_pix_abs16_xy2_neon;
>         c->pix_abs[1][0] = ff_pix_abs8_neon;
> +	    c->pix_abs[1][1] = ff_pix_abs8_x2_neon;
> +	    c->pix_abs[1][2] = ff_pix_abs8_y2_neon;
> +	    c->pix_abs[1][3] = ff_pix_abs8_xy2_neon;

In most mediums, the diff here shows that there's something off - tabs.

>
>         c->sad[0] = ff_pix_abs16_neon;
>         c->sad[1] = ff_pix_abs8_neon;
> diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S
> index 11af4849f9..e03c0c26cd 100644
> --- a/libavcodec/aarch64/me_cmp_neon.S
> +++ b/libavcodec/aarch64/me_cmp_neon.S
> @@ -119,6 +119,199 @@ function ff_pix_abs8_neon, export=1
>         ret
> endfunc
> 
> +function ff_pix_abs8_x2_neon, export=1
> +        // x0           unused
> +        // x1           uint8_t *pix1
> +        // x2           uint8_t *pix2
> +        // x3           ptrdiff_t stride
> +        // w4           int h
> +
> +        cmp             w4, #4
> +        movi            v26.8h, #0
> +        add             x5, x2, #1 // pix2 + 1
> +        b.lt            2f
> +
> +// make 4 iterations at once
> +1:
> +        ld1             {v1.8b}, [x2], x3
> +        ld1             {v2.8b}, [x5], x3
> +        ld1             {v0.8b}, [x1], x3
> +        ld1             {v4.8b}, [x2], x3
> +	urhadd          v30.8b, v1.8b, v2.8b
> +        ld1             {v5.8b}, [x5], x3
> +        uabal           v26.8h, v0.8b, v30.8b
> +        ld1             {v6.8b}, [x1], x3
> +	urhadd          v29.8b, v4.8b, v5.8b
> +        ld1             {v7.8b}, [x2], x3
> +        ld1             {v20.8b}, [x5], x3
> +        uabal           v26.8h, v6.8b, v29.8b
> +        ld1             {v21.8b}, [x1], x3
> +	urhadd          v28.8b, v7.8b, v20.8b
> +        ld1             {v22.8b}, [x2], x3
> +        ld1             {v23.8b}, [x5], x3
> +        uabal           v26.8h, v21.8b, v28.8b
> +        sub             w4, w4, #4
> +        ld1             {v24.8b}, [x1], x3
> +	urhadd          v27.8b, v22.8b, v23.8b
> +        cmp             w4, #4
> +        uabal           v26.8h, v24.8b, v27.8b
> +
> +        b.ge            1b
> +        cbz             w4, 3f
> +
> +// iterate by one
> +2:
> +        ld1             {v1.8b}, [x2], x3
> +        ld1             {v2.8b}, [x5], x3
> +        ld1             {v0.8b}, [x1], x3
> +	urhadd          v30.8b, v1.8b, v2.8b
> +        subs            w4, w4, #1
> +        uabal           v26.8h, v0.8b, v30.8b
> +
> +        b.ne            2b
> +3:
> +        uaddlv          s20, v26.8h
> +        fmov            w0, s20
> +
> +        ret
> +
> +endfunc
> +
> +function ff_pix_abs8_y2_neon, export=1
> +        // x0           unused
> +        // x1           uint8_t *pix1
> +        // x2           uint8_t *pix2
> +        // x3           ptrdiff_t stride
> +        // w4           int h
> +
> +        cmp             w4, #4
> +        movi            v26.8h, #0
> +        ld1             {v1.8b}, [x2], x3
> +        b.lt            2f
> +
> +// make 4 iterations at once
> +1:
> +        ld1             {v2.8b}, [x2], x3
> +        ld1             {v0.8b}, [x1], x3
> +        ld1             {v6.8b}, [x1], x3
> +	urhadd          v30.8b, v1.8b, v2.8b

Good thing that you wrote this better than the current 
ff_pix_abs16_y2_neon, reusing the data from the previous round. But the 
scheduling could be much improved here (why load v6 directly after v0 - x1 
will delay it due to being updated in the previous instruction, and v6 
won't be needed for a long time here yet).

By fixing the scheduling of this function (and getting rid of the 
unnecessary mov instruction at the end of the unrolled loop), I got it 
down from 74 cycles to 62 cycles on a Cortex A53. I'm attaching the fixup 
I did, so you can apply it on top instead of having to describe it 
verbally.

> +        ld1             {v5.8b}, [x2], x3
> +        ld1             {v21.8b}, [x1], x3
> +        uabal           v26.8h, v0.8b, v30.8b
> +	urhadd          v29.8b, v2.8b, v5.8b
> +        ld1             {v20.8b}, [x2], x3
> +        ld1             {v24.8b}, [x1], x3
> +        uabal           v26.8h, v6.8b, v29.8b
> +	urhadd          v28.8b, v5.8b, v20.8b
> +        uabal           v26.8h, v21.8b, v28.8b
> +        ld1             {v23.8b}, [x2], x3
> +        mov             v1.8b, v23.8b
> +        sub             w4, w4, #4
> +	urhadd          v27.8b, v20.8b, v23.8b
> +        cmp             w4, #4
> +        uabal           v26.8h, v24.8b, v27.8b
> +
> +        b.ge            1b
> +        cbz             w4, 3f
> +
> +// iterate by one
> +2:
> +        ld1             {v0.8b}, [x1], x3
> +        ld1             {v2.8b}, [x2], x3
> +	urhadd          v30.8b, v1.8b, v2.8b
> +        subs            w4, w4, #1
> +        uabal           v26.8h, v0.8b, v30.8b
> +        mov             v1.8b, v2.8b
> +
> +        b.ne            2b
> +3:
> +        uaddlv          s20, v26.8h
> +        fmov            w0, s20
> +
> +        ret
> +
> +endfunc
> +
> +function ff_pix_abs8_xy2_neon, export=1
> +        // x0           unused
> +        // x1           uint8_t *pix1
> +        // x2           uint8_t *pix2
> +        // x3           ptrdiff_t stride
> +        // w4           int h
> +
> +        movi            v31.8h, #0
> +        add             x0, x2, 1  // pix2 + 1
> +
> +        add             x5, x2, x3 // pix2 + stride = pix3
> +        cmp             w4, #4
> +        add             x6, x5, 1 // pix3 + stride + 1
> +
> +        b.lt            2f
> +
> +        ld1             {v0.8b}, [x2], x3
> +        ld1             {v1.8b}, [x0], x3
> +        uaddl           v2.8h, v0.8b, v1.8b

If we'd start out with h<4, we'd jump to the 2f label above, missing the 
setup of v0-v2 here.

Other than that, the patch looks reasonable to me.

// Martin

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #2: Type: text/x-diff; name=0001-aarch64-me_cmp-Improve-scheduling-in-ff_pix_abs8_y2_.patch, Size: 2017 bytes --]

From 7909432b3cffdcb31c0953279d862f820a738ec4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Martin=20Storsj=C3=B6?= <martin@martin.st>
Date: Wed, 28 Sep 2022 11:29:57 +0300
Subject: [PATCH 1/2] aarch64: me_cmp: Improve scheduling in
 ff_pix_abs8_y2_neon
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Before:       Cortex A53    A72    A73
pix_abs_1_2_neon:   73.7   31.0   25.7
After:
pix_abs_1_2_neon:   61.7   30.2   24.7

Signed-off-by: Martin Storsjö <martin@martin.st>
---
 libavcodec/aarch64/me_cmp_neon.S | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S
index a900ae2190..7f2b7c754f 100644
--- a/libavcodec/aarch64/me_cmp_neon.S
+++ b/libavcodec/aarch64/me_cmp_neon.S
@@ -193,21 +193,20 @@ function ff_pix_abs8_y2_neon, export=1
 1:
         ld1             {v2.8b}, [x2], x3
         ld1             {v0.8b}, [x1], x3
-        ld1             {v6.8b}, [x1], x3
 	urhadd          v30.8b, v1.8b, v2.8b
         ld1             {v5.8b}, [x2], x3
-        ld1             {v21.8b}, [x1], x3
+        ld1             {v6.8b}, [x1], x3
         uabal           v26.8h, v0.8b, v30.8b
 	urhadd          v29.8b, v2.8b, v5.8b
         ld1             {v20.8b}, [x2], x3
-        ld1             {v24.8b}, [x1], x3
+        ld1             {v21.8b}, [x1], x3
         uabal           v26.8h, v6.8b, v29.8b
 	urhadd          v28.8b, v5.8b, v20.8b
-        uabal           v26.8h, v21.8b, v28.8b
-        ld1             {v23.8b}, [x2], x3
-        mov             v1.8b, v23.8b
+        ld1             {v1.8b},  [x2], x3
+        ld1             {v24.8b}, [x1], x3
+	urhadd          v27.8b, v20.8b, v1.8b
         sub             w4, w4, #4
-	urhadd          v27.8b, v20.8b, v23.8b
+        uabal           v26.8h, v21.8b, v28.8b
         cmp             w4, #4
         uabal           v26.8h, v24.8b, v27.8b
 
-- 
2.25.1


[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #3: Type: text/x-diff; name=0002-aarch64-me_cmp-Fix-up-the-prologue-of-ff_pix_abs8_xy.patch, Size: 1083 bytes --]

From dfc91453aa3608b6b6b7c0e832292668150fd0d5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Martin=20Storsj=C3=B6?= <martin@martin.st>
Date: Wed, 28 Sep 2022 11:43:02 +0300
Subject: [PATCH 2/2] aarch64: me_cmp: Fix up the prologue of
 ff_pix_abs8_xy2_neon

This initializes things properly if this were to be called with
h < 4.
---
 libavcodec/aarch64/me_cmp_neon.S | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S
index 7f2b7c754f..4037953488 100644
--- a/libavcodec/aarch64/me_cmp_neon.S
+++ b/libavcodec/aarch64/me_cmp_neon.S
@@ -245,12 +245,12 @@ function ff_pix_abs8_xy2_neon, export=1
         cmp             w4, #4
         add             x6, x5, 1 // pix3 + stride + 1
 
-        b.lt            2f
-
         ld1             {v0.8b}, [x2], x3
         ld1             {v1.8b}, [x0], x3
         uaddl           v2.8h, v0.8b, v1.8b
 
+        b.lt            2f
+
 // make 4 iterations at once
 1:
         ld1             {v4.8b}, [x5], x3
-- 
2.25.1


[-- Attachment #4: Type: text/plain, Size: 251 bytes --]

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [FFmpeg-devel] [PATCH 2/4] lavc/aarch64: Provide neon implementation of nsse8
  2022-09-26  9:15 ` [FFmpeg-devel] [PATCH 2/4] lavc/aarch64: Provide neon implementation of nsse8 Grzegorz Bernacki
@ 2022-09-28  9:08   ` Martin Storsjö
  0 siblings, 0 replies; 9+ messages in thread
From: Martin Storsjö @ 2022-09-28  9:08 UTC (permalink / raw)
  To: Grzegorz Bernacki; +Cc: upstream, jswinney, hum, ffmpeg-devel, mw, spop

On Mon, 26 Sep 2022, Grzegorz Bernacki wrote:

> Add vectorized implementation of nsse8 function.
>
> Performance comparison tests are shown below.
> - nsse_1_c: 256.0
> - nsse_1_neon: 82.7
>
> Benchmarks and tests run with checkasm tool on AWS Graviton 3.
>
> Signed-off-by: Grzegorz Bernacki <gjb@semihalf.com>
> ---
> libavcodec/aarch64/me_cmp_init_aarch64.c | 15 ++++
> libavcodec/aarch64/me_cmp_neon.S         | 99 ++++++++++++++++++++++++
> 2 files changed, 114 insertions(+)

Looks reasonable to me, but do check to make sure there's no tabs.

// Martin

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [FFmpeg-devel] [PATCH 3/4] lavc/aarch64: Provide optimized implementation of vsse8 for arm64.
  2022-09-26  9:15 ` [FFmpeg-devel] [PATCH 3/4] lavc/aarch64: Provide optimized implementation of vsse8 for arm64 Grzegorz Bernacki
@ 2022-09-28  9:09   ` Martin Storsjö
  0 siblings, 0 replies; 9+ messages in thread
From: Martin Storsjö @ 2022-09-28  9:09 UTC (permalink / raw)
  To: Grzegorz Bernacki; +Cc: upstream, jswinney, hum, ffmpeg-devel, mw, spop

On Mon, 26 Sep 2022, Grzegorz Bernacki wrote:

> Provide optimized implementation of vsse8 for arm64.
>
> Performance comparison tests are shown below.
> - vsse_1_c: 141.5
> - vsse_1_neon: 32.5
>
> Benchmarks and tests are run with checkasm tool on AWS Graviton 3.
>
> Signed-off-by: Grzegorz Bernacki <gjb@semihalf.com>
> ---
> libavcodec/aarch64/me_cmp_init_aarch64.c |  5 ++
> libavcodec/aarch64/me_cmp_neon.S         | 70 ++++++++++++++++++++++++
> 2 files changed, 75 insertions(+)
>
> diff --git a/libavcodec/aarch64/me_cmp_init_aarch64.c b/libavcodec/aarch64/me_cmp_init_aarch64.c
> index 2c61cfcf63..f247372c94 100644
> --- a/libavcodec/aarch64/me_cmp_init_aarch64.c
> +++ b/libavcodec/aarch64/me_cmp_init_aarch64.c
> @@ -71,6 +71,9 @@ int nsse8_neon(int multiplier, const uint8_t *s, const uint8_t *s2,
> int nsse8_neon_wrapper(MpegEncContext *c, const uint8_t *s1, const uint8_t *s2,
>                        ptrdiff_t stride, int h);
> 
> +int vsse8_neon(MpegEncContext *c, const uint8_t *s1, const uint8_t *s2,
> +               ptrdiff_t stride, int h);
> +
> av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
> {
>     int cpu_flags = av_get_cpu_flags();
> @@ -96,6 +99,8 @@ av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
>         c->vsad[5] = vsad_intra8_neon;
>
>         c->vsse[0] = vsse16_neon;
> +        c->vsse[1] = vsse8_neon;
> +
>         c->vsse[4] = vsse_intra16_neon;
>
>         c->nsse[0] = nsse16_neon_wrapper;
> diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S
> index 6f7c7c1690..386d2de0c5 100644
> --- a/libavcodec/aarch64/me_cmp_neon.S
> +++ b/libavcodec/aarch64/me_cmp_neon.S
> @@ -843,6 +843,76 @@ function vsad16_neon, export=1
>         ret
> endfunc
> 
> +function vsse8_neon, export=1
> +        // x0           unused
> +        // x1           uint8_t *pix1
> +        // x2           uint8_t *pix2
> +        // x3           ptrdiff_t stride
> +        // w4           int h
> +
> +        ld1             {v0.8b}, [x1], x3              // Load pix1[0], first iteration
> +        ld1             {v1.8b}, [x2], x3              // Load pix2[0], first iteration
> +
> +        sub             w4, w4, #1                      // we need to make h-1 iterations
> +        movi            v16.4s, #0
> +        movi            v17.4s, #0
> +
> +        cmp             w4, #3                          // check if we can make 3 iterations at once
> +        usubl           v31.8h, v0.8b, v1.8b            // Signed difference of pix1[0] - pix2[0], first iteration
> +        b.le            2f

Why the b.le here, shouldn't it be enough with b.lt? If we're run with 
h=<unroll amount>, it should be enough to run one round in the unrolled 
loop.

I see that we've got the same issue in a couple preexisting functions; 
I'll send a patch to fix that.

Other than that, this looks reasonable to me.

// Martin

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [FFmpeg-devel] [PATCH 4/4] lavc/aarch64: Add neon implementation for vsse_intra8
  2022-09-26  9:15 ` [FFmpeg-devel] [PATCH 4/4] lavc/aarch64: Add neon implementation for vsse_intra8 Grzegorz Bernacki
@ 2022-09-28  9:11   ` Martin Storsjö
  0 siblings, 0 replies; 9+ messages in thread
From: Martin Storsjö @ 2022-09-28  9:11 UTC (permalink / raw)
  To: Grzegorz Bernacki; +Cc: upstream, jswinney, hum, ffmpeg-devel, mw, spop

[-- Attachment #1: Type: text/plain, Size: 2747 bytes --]

On Mon, 26 Sep 2022, Grzegorz Bernacki wrote:

> Provide optimized implementation for vsse_intra8 for arm64.
>
> Performance tests are shown below.
> - vsse_5_c: 87.7
> - vsse_5_neon: 26.2
>
> Benchmarks and tests are run with checkasm tool on AWS Graviton 3.
> ---
> libavcodec/aarch64/me_cmp_init_aarch64.c |  4 ++
> libavcodec/aarch64/me_cmp_neon.S         | 53 ++++++++++++++++++++++++
> 2 files changed, 57 insertions(+)
>
> diff --git a/libavcodec/aarch64/me_cmp_init_aarch64.c b/libavcodec/aarch64/me_cmp_init_aarch64.c
> index f247372c94..defec37478 100644
> --- a/libavcodec/aarch64/me_cmp_init_aarch64.c
> +++ b/libavcodec/aarch64/me_cmp_init_aarch64.c
> @@ -74,6 +74,9 @@ int nsse8_neon_wrapper(MpegEncContext *c, const uint8_t *s1, const uint8_t *s2,
> int vsse8_neon(MpegEncContext *c, const uint8_t *s1, const uint8_t *s2,
>                ptrdiff_t stride, int h);
> 
> +int vsse_intra8_neon(MpegEncContext *c, const uint8_t *s, const uint8_t *dummy,
> +                     ptrdiff_t stride, int h);
> +
> av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
> {
>     int cpu_flags = av_get_cpu_flags();
> @@ -102,6 +105,7 @@ av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
>         c->vsse[1] = vsse8_neon;
>
>         c->vsse[4] = vsse_intra16_neon;
> +        c->vsse[5] = vsse_intra8_neon;
>
>         c->nsse[0] = nsse16_neon_wrapper;
>         c->nsse[1] = nsse8_neon_wrapper;
> diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S
> index 386d2de0c5..82ff05d3f0 100644
> --- a/libavcodec/aarch64/me_cmp_neon.S
> +++ b/libavcodec/aarch64/me_cmp_neon.S
> @@ -1111,6 +1111,59 @@ function vsse_intra16_neon, export=1
>         ret
> endfunc
> 
> +function vsse_intra8_neon, export=1
> +        // x0           unused
> +        // x1           uint8_t *pix1
> +        // x2           uint8_t *dummy
> +        // x3           ptrdiff_t stride
> +        // w4           int h
> +
> +        ld1             {v0.8b}, [x1], x3
> +        movi            v16.4s, #0
> +
> +        sub             w4, w4, #1 // we need to make h-1 iterations
> +        cmp             w4, #3
> +        b.lt            2f
> +
> +1:
> +        // v = abs( pix1[0] - pix1[0 + stride] )
> +        // score = sum( v * v )
> +        ld1             {v1.8b}, [x1], x3
> +        ld1             {v2.8b}, [x1], x3
> +        uabd            v30.8b, v0.8b, v1.8b
> +        ld1             {v3.8b}, [x1], x3
> +        umull           v29.8h, v30.8b, v30.8b
> +        uabd            v27.8b, v1.8b, v2.8b
> +        uadalp          v16.4s, v29.8h

The scheduling here can be improved, please see the attached patch.

Other than that, it does look reasonable.

// Martin

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #2: Type: text/x-diff; name=0001-aarch64-me_cmp-Improve-scheduling-in-vsse_intra8.patch, Size: 1957 bytes --]

From d0345bceaf013bea2023b1a02b372f2a64c6efaf Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Martin=20Storsj=C3=B6?= <martin@martin.st>
Date: Wed, 28 Sep 2022 11:53:55 +0300
Subject: [PATCH] aarch64: me_cmp: Improve scheduling in vsse_intra8

Before:  Cortex A53    A72    A73
vsse_5_neon:   74.7   31.5   26.0
After:
vsse_5_neon:   62.7   32.5   25.7
---
 libavcodec/aarch64/me_cmp_neon.S | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S
index 4037953488..dc0b1e5f43 100644
--- a/libavcodec/aarch64/me_cmp_neon.S
+++ b/libavcodec/aarch64/me_cmp_neon.S
@@ -1113,11 +1113,11 @@ function vsse_intra8_neon, export=1
         // x3           ptrdiff_t stride
         // w4           int h
 
+        sub             w4, w4, #1 // we need to make h-1 iterations
         ld1             {v0.8b}, [x1], x3
+        cmp             w4, #3
         movi            v16.4s, #0
 
-        sub             w4, w4, #1 // we need to make h-1 iterations
-        cmp             w4, #3
         b.lt            2f
 
 1:
@@ -1127,13 +1127,13 @@ function vsse_intra8_neon, export=1
         ld1             {v2.8b}, [x1], x3
         uabd            v30.8b, v0.8b, v1.8b
         ld1             {v3.8b}, [x1], x3
-        umull           v29.8h, v30.8b, v30.8b
         uabd            v27.8b, v1.8b, v2.8b
-        uadalp          v16.4s, v29.8h
-        umull           v26.8h, v27.8b, v27.8b
+        umull           v29.8h, v30.8b, v30.8b
         uabd            v25.8b, v2.8b, v3.8b
-        uadalp          v16.4s, v26.8h
+        umull           v26.8h, v27.8b, v27.8b
+        uadalp          v16.4s, v29.8h
         umull           v24.8h, v25.8b, v25.8b
+        uadalp          v16.4s, v26.8h
         sub             w4, w4, #3
         uadalp          v16.4s, v24.8h
         cmp             w4, #3
-- 
2.25.1


[-- Attachment #3: Type: text/plain, Size: 251 bytes --]

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [FFmpeg-devel] [PATCH 1/4] lavc/aarch64: Add neon implementation for pix_abs8 functions.
  2022-09-28  9:06 ` [FFmpeg-devel] [PATCH 1/4] lavc/aarch64: Add neon implementation for pix_abs8 functions Martin Storsjö
@ 2022-09-28 12:22   ` Grzegorz Bernacki
  0 siblings, 0 replies; 9+ messages in thread
From: Grzegorz Bernacki @ 2022-09-28 12:22 UTC (permalink / raw)
  To: Martin Storsjö; +Cc: upstream, jswinney, hum, ffmpeg-devel, mw, spop

Hi Martin,

I resent the patchset, because the first try did not reach ffmpeg-devel
maillist. I apologize, I should have mentioned about that in cover letter.
Thanks a lot for your review, I will apply the changes and send v2 soon.

thanks,
grzegorz

śr., 28 wrz 2022 o 11:07 Martin Storsjö <martin@martin.st> napisał(a):

> On Mon, 26 Sep 2022, Grzegorz Bernacki wrote:
>
> > Provide optimized implementation of pix_abs8 function for arm64.
> >
> > Performance comparison tests are shown below:
> > pix_abs_1_1_c: 162.5
> > pix_abs_1_1_neon: 27.0
> > pix_abs_1_2_c: 174.0
> > pix_abs_1_2_neon: 23.5
> > pix_abs_1_3_c: 203.2
> > pix_abs_1_3_neon: 34.7
> >
> > Benchmarks and tests are run with checkasm tool on AWS Graviton 3.
> >
> > Signed-off-by: Grzegorz Bernacki <gjb@semihalf.com>
> > ---
> > libavcodec/aarch64/me_cmp_init_aarch64.c |   9 ++
> > libavcodec/aarch64/me_cmp_neon.S         | 193 +++++++++++++++++++++++
> > 2 files changed, 202 insertions(+)
>
> I don't see any changes compared to the version you sent last week? If
> reposting a patchset, please do mention what has changed - or ping the old
> one. (I had it on my radar to review, but reviews of larger amounts of
> code doesn't happen immediately, unfortunately.)
>
> Overall, you seem to have mixed in tabs among regular spaces. Please do
> fix that. (I would have kinda expected us to have a fate test that checks
> this, but apparently we don't.)
>
> > diff --git a/libavcodec/aarch64/me_cmp_init_aarch64.c
> b/libavcodec/aarch64/me_cmp_init_aarch64.c
> > index e143f0816e..3459403ee5 100644
> > --- a/libavcodec/aarch64/me_cmp_init_aarch64.c
> > +++ b/libavcodec/aarch64/me_cmp_init_aarch64.c
> > @@ -59,6 +59,12 @@ int pix_median_abs16_neon(MpegEncContext *v, const
> uint8_t *pix1, const uint8_t
> >                           ptrdiff_t stride, int h);
> > int pix_median_abs8_neon(MpegEncContext *v, const uint8_t *pix1, const
> uint8_t *pix2,
> >                          ptrdiff_t stride, int h);
> > +int ff_pix_abs8_x2_neon(MpegEncContext *v, const uint8_t *pix1, const
> uint8_t *pix2,
> > +                        ptrdiff_t stride, int h);
> > +int ff_pix_abs8_y2_neon(MpegEncContext *v, const uint8_t *pix1, const
> uint8_t *pix2,
> > +                        ptrdiff_t stride, int h);
> > +int ff_pix_abs8_xy2_neon(MpegEncContext *v, const uint8_t *pix1, const
> uint8_t *pix2,
> > +                         ptrdiff_t stride, int h);
> >
> > av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext
> *avctx)
> > {
> > @@ -70,6 +76,9 @@ av_cold void ff_me_cmp_init_aarch64(MECmpContext *c,
> AVCodecContext *avctx)
> >         c->pix_abs[0][2] = ff_pix_abs16_y2_neon;
> >         c->pix_abs[0][3] = ff_pix_abs16_xy2_neon;
> >         c->pix_abs[1][0] = ff_pix_abs8_neon;
> > +         c->pix_abs[1][1] = ff_pix_abs8_x2_neon;
> > +         c->pix_abs[1][2] = ff_pix_abs8_y2_neon;
> > +         c->pix_abs[1][3] = ff_pix_abs8_xy2_neon;
>
> In most mediums, the diff here shows that there's something off - tabs.
>
> >
> >         c->sad[0] = ff_pix_abs16_neon;
> >         c->sad[1] = ff_pix_abs8_neon;
> > diff --git a/libavcodec/aarch64/me_cmp_neon.S
> b/libavcodec/aarch64/me_cmp_neon.S
> > index 11af4849f9..e03c0c26cd 100644
> > --- a/libavcodec/aarch64/me_cmp_neon.S
> > +++ b/libavcodec/aarch64/me_cmp_neon.S
> > @@ -119,6 +119,199 @@ function ff_pix_abs8_neon, export=1
> >         ret
> > endfunc
> >
> > +function ff_pix_abs8_x2_neon, export=1
> > +        // x0           unused
> > +        // x1           uint8_t *pix1
> > +        // x2           uint8_t *pix2
> > +        // x3           ptrdiff_t stride
> > +        // w4           int h
> > +
> > +        cmp             w4, #4
> > +        movi            v26.8h, #0
> > +        add             x5, x2, #1 // pix2 + 1
> > +        b.lt            2f
> > +
> > +// make 4 iterations at once
> > +1:
> > +        ld1             {v1.8b}, [x2], x3
> > +        ld1             {v2.8b}, [x5], x3
> > +        ld1             {v0.8b}, [x1], x3
> > +        ld1             {v4.8b}, [x2], x3
> > +     urhadd          v30.8b, v1.8b, v2.8b
> > +        ld1             {v5.8b}, [x5], x3
> > +        uabal           v26.8h, v0.8b, v30.8b
> > +        ld1             {v6.8b}, [x1], x3
> > +     urhadd          v29.8b, v4.8b, v5.8b
> > +        ld1             {v7.8b}, [x2], x3
> > +        ld1             {v20.8b}, [x5], x3
> > +        uabal           v26.8h, v6.8b, v29.8b
> > +        ld1             {v21.8b}, [x1], x3
> > +     urhadd          v28.8b, v7.8b, v20.8b
> > +        ld1             {v22.8b}, [x2], x3
> > +        ld1             {v23.8b}, [x5], x3
> > +        uabal           v26.8h, v21.8b, v28.8b
> > +        sub             w4, w4, #4
> > +        ld1             {v24.8b}, [x1], x3
> > +     urhadd          v27.8b, v22.8b, v23.8b
> > +        cmp             w4, #4
> > +        uabal           v26.8h, v24.8b, v27.8b
> > +
> > +        b.ge            1b
> > +        cbz             w4, 3f
> > +
> > +// iterate by one
> > +2:
> > +        ld1             {v1.8b}, [x2], x3
> > +        ld1             {v2.8b}, [x5], x3
> > +        ld1             {v0.8b}, [x1], x3
> > +     urhadd          v30.8b, v1.8b, v2.8b
> > +        subs            w4, w4, #1
> > +        uabal           v26.8h, v0.8b, v30.8b
> > +
> > +        b.ne            2b
> > +3:
> > +        uaddlv          s20, v26.8h
> > +        fmov            w0, s20
> > +
> > +        ret
> > +
> > +endfunc
> > +
> > +function ff_pix_abs8_y2_neon, export=1
> > +        // x0           unused
> > +        // x1           uint8_t *pix1
> > +        // x2           uint8_t *pix2
> > +        // x3           ptrdiff_t stride
> > +        // w4           int h
> > +
> > +        cmp             w4, #4
> > +        movi            v26.8h, #0
> > +        ld1             {v1.8b}, [x2], x3
> > +        b.lt            2f
> > +
> > +// make 4 iterations at once
> > +1:
> > +        ld1             {v2.8b}, [x2], x3
> > +        ld1             {v0.8b}, [x1], x3
> > +        ld1             {v6.8b}, [x1], x3
> > +     urhadd          v30.8b, v1.8b, v2.8b
>
> Good thing that you wrote this better than the current
> ff_pix_abs16_y2_neon, reusing the data from the previous round. But the
> scheduling could be much improved here (why load v6 directly after v0 - x1
> will delay it due to being updated in the previous instruction, and v6
> won't be needed for a long time here yet).
>
> By fixing the scheduling of this function (and getting rid of the
> unnecessary mov instruction at the end of the unrolled loop), I got it
> down from 74 cycles to 62 cycles on a Cortex A53. I'm attaching the fixup
> I did, so you can apply it on top instead of having to describe it
> verbally.
>
> > +        ld1             {v5.8b}, [x2], x3
> > +        ld1             {v21.8b}, [x1], x3
> > +        uabal           v26.8h, v0.8b, v30.8b
> > +     urhadd          v29.8b, v2.8b, v5.8b
> > +        ld1             {v20.8b}, [x2], x3
> > +        ld1             {v24.8b}, [x1], x3
> > +        uabal           v26.8h, v6.8b, v29.8b
> > +     urhadd          v28.8b, v5.8b, v20.8b
> > +        uabal           v26.8h, v21.8b, v28.8b
> > +        ld1             {v23.8b}, [x2], x3
> > +        mov             v1.8b, v23.8b
> > +        sub             w4, w4, #4
> > +     urhadd          v27.8b, v20.8b, v23.8b
> > +        cmp             w4, #4
> > +        uabal           v26.8h, v24.8b, v27.8b
> > +
> > +        b.ge            1b
> > +        cbz             w4, 3f
> > +
> > +// iterate by one
> > +2:
> > +        ld1             {v0.8b}, [x1], x3
> > +        ld1             {v2.8b}, [x2], x3
> > +     urhadd          v30.8b, v1.8b, v2.8b
> > +        subs            w4, w4, #1
> > +        uabal           v26.8h, v0.8b, v30.8b
> > +        mov             v1.8b, v2.8b
> > +
> > +        b.ne            2b
> > +3:
> > +        uaddlv          s20, v26.8h
> > +        fmov            w0, s20
> > +
> > +        ret
> > +
> > +endfunc
> > +
> > +function ff_pix_abs8_xy2_neon, export=1
> > +        // x0           unused
> > +        // x1           uint8_t *pix1
> > +        // x2           uint8_t *pix2
> > +        // x3           ptrdiff_t stride
> > +        // w4           int h
> > +
> > +        movi            v31.8h, #0
> > +        add             x0, x2, 1  // pix2 + 1
> > +
> > +        add             x5, x2, x3 // pix2 + stride = pix3
> > +        cmp             w4, #4
> > +        add             x6, x5, 1 // pix3 + stride + 1
> > +
> > +        b.lt            2f
> > +
> > +        ld1             {v0.8b}, [x2], x3
> > +        ld1             {v1.8b}, [x0], x3
> > +        uaddl           v2.8h, v0.8b, v1.8b
>
> If we'd start out with h<4, we'd jump to the 2f label above, missing the
> setup of v0-v2 here.
>
> Other than that, the patch looks reasonable to me.
>
> // Martin
>
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2022-09-28 12:22 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-09-26  9:15 [FFmpeg-devel] [PATCH 1/4] lavc/aarch64: Add neon implementation for pix_abs8 functions Grzegorz Bernacki
2022-09-26  9:15 ` [FFmpeg-devel] [PATCH 2/4] lavc/aarch64: Provide neon implementation of nsse8 Grzegorz Bernacki
2022-09-28  9:08   ` Martin Storsjö
2022-09-26  9:15 ` [FFmpeg-devel] [PATCH 3/4] lavc/aarch64: Provide optimized implementation of vsse8 for arm64 Grzegorz Bernacki
2022-09-28  9:09   ` Martin Storsjö
2022-09-26  9:15 ` [FFmpeg-devel] [PATCH 4/4] lavc/aarch64: Add neon implementation for vsse_intra8 Grzegorz Bernacki
2022-09-28  9:11   ` Martin Storsjö
2022-09-28  9:06 ` [FFmpeg-devel] [PATCH 1/4] lavc/aarch64: Add neon implementation for pix_abs8 functions Martin Storsjö
2022-09-28 12:22   ` Grzegorz Bernacki

Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

This inbox may be cloned and mirrored by anyone:

	git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
		ffmpegdev@gitmailbox.com
	public-inbox-index ffmpegdev

Example config snippet for mirrors.


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git