* [FFmpeg-devel] [PATCH v2 0/7] arm64 neon implementation for 8bits functions
@ 2022-10-03 14:10 Grzegorz Bernacki
2022-10-03 14:10 ` [FFmpeg-devel] [PATCH v2 1/7] lavc/aarch64: Add neon implementation for pix_abs8 functions Grzegorz Bernacki
` (7 more replies)
0 siblings, 8 replies; 10+ messages in thread
From: Grzegorz Bernacki @ 2022-10-03 14:10 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: gjb, upstream, jswinney, hum, martin, mw, spop
Changes since v1:
- changed tabs to spaces
- modified branch instruction in vsse8
- apply Martin's patches with improved instructions scheduling
Grzegorz Bernacki (4):
lavc/aarch64: Add neon implementation for pix_abs8 functions.
lavc/aarch64: Provide neon implementation of nsse8
lavc/aarch64: Provide optimized implementation of vsse8 for arm64.
lavc/aarch64: Add neon implementation for vsse_intra8
Martin Storsjö (3):
aarch64: me_cmp: Improve scheduling in ff_pix_abs8_y2_neon
aarch64: me_cmp: Fix up the prologue of ff_pix_abs8_xy2_neon
aarch64: me_cmp: Improve scheduling in vsse_intra8
libavcodec/aarch64/me_cmp_init_aarch64.c | 33 ++
libavcodec/aarch64/me_cmp_neon.S | 414 +++++++++++++++++++++++
2 files changed, 447 insertions(+)
--
2.37.1
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 10+ messages in thread
* [FFmpeg-devel] [PATCH v2 1/7] lavc/aarch64: Add neon implementation for pix_abs8 functions.
2022-10-03 14:10 [FFmpeg-devel] [PATCH v2 0/7] arm64 neon implementation for 8bits functions Grzegorz Bernacki
@ 2022-10-03 14:10 ` Grzegorz Bernacki
2022-10-03 14:10 ` [FFmpeg-devel] [PATCH v2 2/7] aarch64: me_cmp: Improve scheduling in ff_pix_abs8_y2_neon Grzegorz Bernacki
` (6 subsequent siblings)
7 siblings, 0 replies; 10+ messages in thread
From: Grzegorz Bernacki @ 2022-10-03 14:10 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: gjb, upstream, jswinney, hum, martin, mw, spop
Provide optimized implementation of pix_abs8 function for arm64.
Performance comparison tests are shown below:
pix_abs_1_1_c: 162.5
pix_abs_1_1_neon: 27.0
pix_abs_1_2_c: 174.0
pix_abs_1_2_neon: 23.5
pix_abs_1_3_c: 203.2
pix_abs_1_3_neon: 34.7
Benchmarks and tests are run with checkasm tool on AWS Graviton 3.
Signed-off-by: Grzegorz Bernacki <gjb@semihalf.com>
---
libavcodec/aarch64/me_cmp_init_aarch64.c | 9 ++
libavcodec/aarch64/me_cmp_neon.S | 193 +++++++++++++++++++++++
2 files changed, 202 insertions(+)
diff --git a/libavcodec/aarch64/me_cmp_init_aarch64.c b/libavcodec/aarch64/me_cmp_init_aarch64.c
index e143f0816e..695ed35fc1 100644
--- a/libavcodec/aarch64/me_cmp_init_aarch64.c
+++ b/libavcodec/aarch64/me_cmp_init_aarch64.c
@@ -59,6 +59,12 @@ int pix_median_abs16_neon(MpegEncContext *v, const uint8_t *pix1, const uint8_t
ptrdiff_t stride, int h);
int pix_median_abs8_neon(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
ptrdiff_t stride, int h);
+int ff_pix_abs8_x2_neon(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
+ ptrdiff_t stride, int h);
+int ff_pix_abs8_y2_neon(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
+ ptrdiff_t stride, int h);
+int ff_pix_abs8_xy2_neon(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
+ ptrdiff_t stride, int h);
av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
{
@@ -70,6 +76,9 @@ av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
c->pix_abs[0][2] = ff_pix_abs16_y2_neon;
c->pix_abs[0][3] = ff_pix_abs16_xy2_neon;
c->pix_abs[1][0] = ff_pix_abs8_neon;
+ c->pix_abs[1][1] = ff_pix_abs8_x2_neon;
+ c->pix_abs[1][2] = ff_pix_abs8_y2_neon;
+ c->pix_abs[1][3] = ff_pix_abs8_xy2_neon;
c->sad[0] = ff_pix_abs16_neon;
c->sad[1] = ff_pix_abs8_neon;
diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S
index c710358ab7..43e068bb7f 100644
--- a/libavcodec/aarch64/me_cmp_neon.S
+++ b/libavcodec/aarch64/me_cmp_neon.S
@@ -119,6 +119,199 @@ function ff_pix_abs8_neon, export=1
ret
endfunc
+function ff_pix_abs8_x2_neon, export=1
+ // x0 unused
+ // x1 uint8_t *pix1
+ // x2 uint8_t *pix2
+ // x3 ptrdiff_t stride
+ // w4 int h
+ // Returns in w0 the sum over h rows of |pix1[x] - avg(pix2[x], pix2[x + 1])| for x = 0..7, avg = (a + b + 1) >> 1.
+ cmp w4, #4 // flags are consumed by the b.lt below
+ movi v26.8h, #0 // v26: eight 16-bit partial sums
+ add x5, x2, #1 // pix2 + 1
+ b.lt 2f // fewer than 4 rows: go straight to the single-row loop
+
+// make 4 iterations at once
+1:
+ ld1 {v1.8b}, [x2], x3 // row 0: pix2[0..7], pointer then advances by stride
+ ld1 {v2.8b}, [x5], x3 // row 0: pix2[1..8]
+ ld1 {v0.8b}, [x1], x3 // row 0: pix1[0..7]
+ ld1 {v4.8b}, [x2], x3 // row 1: pix2[0..7]
+ urhadd v30.8b, v1.8b, v2.8b // row 0: rounding average of the two pix2 vectors
+ ld1 {v5.8b}, [x5], x3 // row 1: pix2[1..8]
+ uabal v26.8h, v0.8b, v30.8b // row 0: accumulate |pix1 - avg|, widened to 16 bits
+ ld1 {v6.8b}, [x1], x3 // row 1: pix1[0..7]
+ urhadd v29.8b, v4.8b, v5.8b // row 1: rounding average
+ ld1 {v7.8b}, [x2], x3 // row 2: pix2[0..7]
+ ld1 {v20.8b}, [x5], x3 // row 2: pix2[1..8]
+ uabal v26.8h, v6.8b, v29.8b // row 1: accumulate
+ ld1 {v21.8b}, [x1], x3 // row 2: pix1[0..7]
+ urhadd v28.8b, v7.8b, v20.8b // row 2: rounding average
+ ld1 {v22.8b}, [x2], x3 // row 3: pix2[0..7]
+ ld1 {v23.8b}, [x5], x3 // row 3: pix2[1..8]
+ uabal v26.8h, v21.8b, v28.8b // row 2: accumulate
+ sub w4, w4, #4 // four rows consumed
+ ld1 {v24.8b}, [x1], x3 // row 3: pix1[0..7]
+ urhadd v27.8b, v22.8b, v23.8b // row 3: rounding average
+ cmp w4, #4 // at least four rows left?
+ uabal v26.8h, v24.8b, v27.8b // row 3: accumulate (does not alter the flags)
+
+ b.ge 1b
+ cbz w4, 3f // no leftover rows
+
+// iterate by one
+2:
+ ld1 {v1.8b}, [x2], x3 // pix2[0..7]
+ ld1 {v2.8b}, [x5], x3 // pix2[1..8]
+ ld1 {v0.8b}, [x1], x3 // pix1[0..7]
+ urhadd v30.8b, v1.8b, v2.8b // rounding average
+ subs w4, w4, #1 // sets the flags tested by b.ne
+ uabal v26.8h, v0.8b, v30.8b // accumulate |pix1 - avg|
+
+ b.ne 2b
+3:
+ uaddlv s20, v26.8h // add the eight partial sums into one 32-bit value
+ fmov w0, s20 // return value
+
+ ret
+
+endfunc
+
+function ff_pix_abs8_y2_neon, export=1
+ // x0 unused
+ // x1 uint8_t *pix1
+ // x2 uint8_t *pix2
+ // x3 ptrdiff_t stride
+ // w4 int h
+ // Returns in w0 the sum over h rows of |pix1[x] - avg(pix2[x], pix2[x + stride])| for x = 0..7, avg = (a + b + 1) >> 1.
+ cmp w4, #4 // flags are consumed by the b.lt below
+ movi v26.8h, #0 // v26: eight 16-bit partial sums
+ ld1 {v1.8b}, [x2], x3 // v1 = first pix2 row; it is always the upper row of the pair being averaged
+ b.lt 2f // fewer than 4 rows: go straight to the single-row loop
+
+// make 4 iterations at once
+1:
+ ld1 {v2.8b}, [x2], x3
+ ld1 {v0.8b}, [x1], x3
+ ld1 {v6.8b}, [x1], x3
+ urhadd v30.8b, v1.8b, v2.8b
+ ld1 {v5.8b}, [x2], x3
+ ld1 {v21.8b}, [x1], x3
+ uabal v26.8h, v0.8b, v30.8b
+ urhadd v29.8b, v2.8b, v5.8b
+ ld1 {v20.8b}, [x2], x3
+ ld1 {v24.8b}, [x1], x3
+ uabal v26.8h, v6.8b, v29.8b
+ urhadd v28.8b, v5.8b, v20.8b
+ uabal v26.8h, v21.8b, v28.8b
+ ld1 {v23.8b}, [x2], x3
+ mov v1.8b, v23.8b
+ sub w4, w4, #4
+ urhadd v27.8b, v20.8b, v23.8b
+ cmp w4, #4
+ uabal v26.8h, v24.8b, v27.8b
+
+ b.ge 1b
+ cbz w4, 3f // no leftover rows
+
+// iterate by one
+2:
+ ld1 {v0.8b}, [x1], x3 // pix1 row
+ ld1 {v2.8b}, [x2], x3 // pix2 row below the one held in v1
+ urhadd v30.8b, v1.8b, v2.8b // rounding average of the two pix2 rows
+ subs w4, w4, #1 // sets the flags tested by b.ne
+ uabal v26.8h, v0.8b, v30.8b // accumulate |pix1 - avg|, widened to 16 bits
+ mov v1.8b, v2.8b // lower row becomes the upper row of the next pair
+
+ b.ne 2b
+3:
+ uaddlv s20, v26.8h // add the eight partial sums into one 32-bit value
+ fmov w0, s20 // return value
+
+ ret
+
+endfunc
+
+function ff_pix_abs8_xy2_neon, export=1
+ // x0 unused
+ // x1 uint8_t *pix1
+ // x2 uint8_t *pix2
+ // x3 ptrdiff_t stride
+ // w4 int h
+ // Returns in w0 the sum over h rows of |pix1[x] - ((a + b + c + d + 2) >> 2)|, a/b = pix2[x], pix2[x + 1], c/d = same columns one row down.
+ movi v31.8h, #0 // v31: eight 16-bit partial sums
+ add x0, x2, 1 // pix2 + 1
+ // x5 and x6 set up below are pix3 = pix2 + stride and pix3 + 1
+ add x5, x2, x3 // pix2 + stride = pix3
+ cmp w4, #4
+ add x6, x5, 1 // pix3 + stride + 1
+
+ b.lt 2f
+
+ ld1 {v0.8b}, [x2], x3
+ ld1 {v1.8b}, [x0], x3
+ uaddl v2.8h, v0.8b, v1.8b
+
+// make 4 iterations at once
+1:
+ ld1 {v4.8b}, [x5], x3
+ ld1 {v5.8b}, [x6], x3 // row 1: pix2[1..8]
+ ld1 {v7.8b}, [x5], x3 // row 2: pix2[0..7]
+ uaddl v0.8h, v4.8b, v5.8b // row 1 horizontal pair sums (16 bit)
+ ld1 {v16.8b}, [x6], x3 // row 2: pix2[1..8]
+ add v4.8h, v0.8h, v2.8h // rows 0 + 1: four-pixel sums
+ ld1 {v5.8b}, [x1], x3 // pix1 row 0
+ rshrn v4.8b, v4.8h, #2 // (sum + 2) >> 2, narrowed back to bytes
+ uaddl v7.8h, v7.8b, v16.8b // row 2 horizontal pair sums
+ uabal v31.8h, v5.8b, v4.8b // accumulate |pix1 - avg| for row 0
+ add v2.8h, v0.8h, v7.8h // rows 1 + 2: four-pixel sums
+ ld1 {v17.8b}, [x1], x3 // pix1 row 1
+ rshrn v2.8b, v2.8h, #2 // rounded average for row 1
+ ld1 {v20.8b}, [x5], x3 // row 3: pix2[0..7]
+ uabal v31.8h, v17.8b, v2.8b // accumulate row 1
+ ld1 {v21.8b}, [x6], x3 // row 3: pix2[1..8]
+ ld1 {v25.8b}, [x5], x3 // row 4: pix2[0..7]
+ uaddl v20.8h, v20.8b, v21.8b // row 3 horizontal pair sums
+ ld1 {v26.8b}, [x6], x3 // row 4: pix2[1..8]
+ add v7.8h, v7.8h, v20.8h // rows 2 + 3: four-pixel sums
+ uaddl v25.8h, v25.8b, v26.8b // row 4 horizontal pair sums
+ rshrn v7.8b, v7.8h, #2 // rounded average for row 2
+ ld1 {v22.8b}, [x1], x3 // pix1 row 2
+ mov v2.16b, v25.16b // carry row 4 pair sums into the next pass as its "row 0"
+ uabal v31.8h, v22.8b, v7.8b // accumulate row 2
+ add v20.8h, v20.8h, v25.8h // rows 3 + 4: four-pixel sums
+ ld1 {v27.8b}, [x1], x3 // pix1 row 3
+ sub w4, w4, #4 // four rows consumed
+ rshrn v20.8b, v20.8h, #2 // rounded average for row 3
+ cmp w4, #4 // at least four rows left?
+ uabal v31.8h, v27.8b, v20.8b // accumulate row 3 (does not alter the flags)
+
+ b.ge 1b
+
+ cbz w4, 3f // no leftover rows
+
+// iterate by one
+2:
+ ld1 {v0.8b}, [x5], x3 // lower row: pix2[0..7]
+ ld1 {v1.8b}, [x6], x3 // lower row: pix2[1..8]
+ ld1 {v4.8b}, [x1], x3 // pix1 row
+ uaddl v21.8h, v0.8b, v1.8b // lower row horizontal pair sums
+ subs w4, w4, #1 // sets the flags tested by b.ne
+ add v3.8h, v2.8h, v21.8h // v2 = upper row pair sums; NOTE(review): v2 is not written before this point when entered via the b.lt above
+ mov v2.16b, v21.16b // lower row becomes the upper row of the next pair
+ rshrn v3.8b, v3.8h, #2 // (sum + 2) >> 2, narrowed back to bytes
+ uabal v31.8h, v4.8b, v3.8b // accumulate |pix1 - avg|
+ b.ne 2b
+
+3:
+ uaddlv s18, v31.8h // add the eight partial sums into one 32-bit value
+ fmov w0, s18 // return value
+
+ ret
+
+endfunc
+
+
function ff_pix_abs16_xy2_neon, export=1
// x0 unused
// x1 uint8_t *pix1
--
2.37.1
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 10+ messages in thread
* [FFmpeg-devel] [PATCH v2 2/7] aarch64: me_cmp: Improve scheduling in ff_pix_abs8_y2_neon
2022-10-03 14:10 [FFmpeg-devel] [PATCH v2 0/7] arm64 neon implementation for 8bits functions Grzegorz Bernacki
2022-10-03 14:10 ` [FFmpeg-devel] [PATCH v2 1/7] lavc/aarch64: Add neon implementation for pix_abs8 functions Grzegorz Bernacki
@ 2022-10-03 14:10 ` Grzegorz Bernacki
2022-10-03 14:10 ` [FFmpeg-devel] [PATCH v2 3/7] aarch64: me_cmp: Fix up the prologue of ff_pix_abs8_xy2_neon Grzegorz Bernacki
` (5 subsequent siblings)
7 siblings, 0 replies; 10+ messages in thread
From: Grzegorz Bernacki @ 2022-10-03 14:10 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: gjb, upstream, jswinney, hum, martin, mw, spop
From: Martin Storsjö <martin@martin.st>
Before: Cortex A53 A72 A73
pix_abs_1_2_neon: 73.7 31.0 25.7
After:
pix_abs_1_2_neon: 61.7 30.2 24.7
Signed-off-by: Martin Storsjö <martin@martin.st>
---
libavcodec/aarch64/me_cmp_neon.S | 13 ++++++-------
1 file changed, 6 insertions(+), 7 deletions(-)
diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S
index 43e068bb7f..3662419edf 100644
--- a/libavcodec/aarch64/me_cmp_neon.S
+++ b/libavcodec/aarch64/me_cmp_neon.S
@@ -193,21 +193,20 @@ function ff_pix_abs8_y2_neon, export=1
1:
ld1 {v2.8b}, [x2], x3
ld1 {v0.8b}, [x1], x3
- ld1 {v6.8b}, [x1], x3
urhadd v30.8b, v1.8b, v2.8b
ld1 {v5.8b}, [x2], x3
- ld1 {v21.8b}, [x1], x3
+ ld1 {v6.8b}, [x1], x3
uabal v26.8h, v0.8b, v30.8b
urhadd v29.8b, v2.8b, v5.8b
ld1 {v20.8b}, [x2], x3
- ld1 {v24.8b}, [x1], x3
+ ld1 {v21.8b}, [x1], x3
uabal v26.8h, v6.8b, v29.8b
urhadd v28.8b, v5.8b, v20.8b
- uabal v26.8h, v21.8b, v28.8b
- ld1 {v23.8b}, [x2], x3
- mov v1.8b, v23.8b
+ ld1 {v1.8b}, [x2], x3
+ ld1 {v24.8b}, [x1], x3
+ urhadd v27.8b, v20.8b, v1.8b
sub w4, w4, #4
- urhadd v27.8b, v20.8b, v23.8b
+ uabal v26.8h, v21.8b, v28.8b
cmp w4, #4
uabal v26.8h, v24.8b, v27.8b
--
2.37.1
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 10+ messages in thread
* [FFmpeg-devel] [PATCH v2 3/7] aarch64: me_cmp: Fix up the prologue of ff_pix_abs8_xy2_neon
2022-10-03 14:10 [FFmpeg-devel] [PATCH v2 0/7] arm64 neon implementation for 8bits functions Grzegorz Bernacki
2022-10-03 14:10 ` [FFmpeg-devel] [PATCH v2 1/7] lavc/aarch64: Add neon implementation for pix_abs8 functions Grzegorz Bernacki
2022-10-03 14:10 ` [FFmpeg-devel] [PATCH v2 2/7] aarch64: me_cmp: Improve scheduling in ff_pix_abs8_y2_neon Grzegorz Bernacki
@ 2022-10-03 14:10 ` Grzegorz Bernacki
2022-10-03 14:10 ` [FFmpeg-devel] [PATCH v2 4/7] lavc/aarch64: Provide neon implementation of nsse8 Grzegorz Bernacki
` (4 subsequent siblings)
7 siblings, 0 replies; 10+ messages in thread
From: Grzegorz Bernacki @ 2022-10-03 14:10 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: gjb, upstream, jswinney, hum, martin, mw, spop
From: Martin Storsjö <martin@martin.st>
This initializes things properly if this were to be called with
h < 4.
---
libavcodec/aarch64/me_cmp_neon.S | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S
index 3662419edf..cfba3eb33a 100644
--- a/libavcodec/aarch64/me_cmp_neon.S
+++ b/libavcodec/aarch64/me_cmp_neon.S
@@ -245,12 +245,12 @@ function ff_pix_abs8_xy2_neon, export=1
cmp w4, #4
add x6, x5, 1 // pix3 + stride + 1
- b.lt 2f
-
ld1 {v0.8b}, [x2], x3
ld1 {v1.8b}, [x0], x3
uaddl v2.8h, v0.8b, v1.8b
+ b.lt 2f
+
// make 4 iterations at once
1:
ld1 {v4.8b}, [x5], x3
--
2.37.1
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 10+ messages in thread
* [FFmpeg-devel] [PATCH v2 4/7] lavc/aarch64: Provide neon implementation of nsse8
2022-10-03 14:10 [FFmpeg-devel] [PATCH v2 0/7] arm64 neon implementation for 8bits functions Grzegorz Bernacki
` (2 preceding siblings ...)
2022-10-03 14:10 ` [FFmpeg-devel] [PATCH v2 3/7] aarch64: me_cmp: Fix up the prologue of ff_pix_abs8_xy2_neon Grzegorz Bernacki
@ 2022-10-03 14:10 ` Grzegorz Bernacki
2022-10-03 14:10 ` [FFmpeg-devel] [PATCH v2 5/7] lavc/aarch64: Provide optimized implementation of vsse8 for arm64 Grzegorz Bernacki
` (3 subsequent siblings)
7 siblings, 0 replies; 10+ messages in thread
From: Grzegorz Bernacki @ 2022-10-03 14:10 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: gjb, upstream, jswinney, hum, martin, mw, spop
Add vectorized implementation of nsse8 function.
Performance comparison tests are shown below.
- nsse_1_c: 256.0
- nsse_1_neon: 82.7
Benchmarks and tests run with checkasm tool on AWS Graviton 3.
Signed-off-by: Grzegorz Bernacki <gjb@semihalf.com>
---
libavcodec/aarch64/me_cmp_init_aarch64.c | 15 ++++
libavcodec/aarch64/me_cmp_neon.S | 99 ++++++++++++++++++++++++
2 files changed, 114 insertions(+)
diff --git a/libavcodec/aarch64/me_cmp_init_aarch64.c b/libavcodec/aarch64/me_cmp_init_aarch64.c
index 695ed35fc1..05156627fa 100644
--- a/libavcodec/aarch64/me_cmp_init_aarch64.c
+++ b/libavcodec/aarch64/me_cmp_init_aarch64.c
@@ -66,6 +66,11 @@ int ff_pix_abs8_y2_neon(MpegEncContext *v, const uint8_t *pix1, const uint8_t *p
int ff_pix_abs8_xy2_neon(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
ptrdiff_t stride, int h);
+int nsse8_neon(int multiplier, const uint8_t *s, const uint8_t *s2,
+ ptrdiff_t stride, int h);
+int nsse8_neon_wrapper(MpegEncContext *c, const uint8_t *s1, const uint8_t *s2,
+ ptrdiff_t stride, int h);
+
av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
{
int cpu_flags = av_get_cpu_flags();
@@ -94,6 +99,7 @@ av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
c->vsse[4] = vsse_intra16_neon;
c->nsse[0] = nsse16_neon_wrapper;
+ c->nsse[1] = nsse8_neon_wrapper;
c->median_sad[0] = pix_median_abs16_neon;
c->median_sad[1] = pix_median_abs8_neon;
@@ -108,3 +114,12 @@ int nsse16_neon_wrapper(MpegEncContext *c, const uint8_t *s1, const uint8_t *s2,
else
return nsse16_neon(8, s1, s2, stride, h);
}
+
+int nsse8_neon_wrapper(MpegEncContext *c, const uint8_t *s1, const uint8_t *s2,
+ ptrdiff_t stride, int h)
+{ // forwards to nsse8_neon, choosing the value passed as its first (multiplier) argument
+ if (c) // a context was supplied: take the weight from its codec context
+ return nsse8_neon(c->avctx->nsse_weight, s1, s2, stride, h);
+ else // no context: use a fixed weight of 8
+ return nsse8_neon(8, s1, s2, stride, h);
+}
diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S
index cfba3eb33a..24be05dd18 100644
--- a/libavcodec/aarch64/me_cmp_neon.S
+++ b/libavcodec/aarch64/me_cmp_neon.S
@@ -1158,6 +1158,105 @@ function nsse16_neon, export=1
ret
endfunc
+function nsse8_neon, export=1
+ // x0 multiplier
+ // x1 uint8_t *pix1
+ // x2 uint8_t *pix2
+ // x3 ptrdiff_t stride
+ // w4 int h
+ // Returns in w0: score1 + multiplier * |score2|, score1 from sse8_neon, score2 from the gradient sums accumulated below.
+ str x0, [sp, #-0x40]! // reserve 0x40 bytes of stack and save the multiplier at [sp]
+ stp x1, x2, [sp, #0x10] // save pix1/pix2 across the call
+ stp x3, x4, [sp, #0x20] // save stride/h across the call
+ str x30, [sp, #0x30] // save the return address, bl overwrites x30
+ bl X(sse8_neon) // called with x0-x4 exactly as received
+ ldr x30, [sp, #0x30]
+ mov w9, w0 // here we store score1
+ ldr x5, [sp] // x5 = saved multiplier
+ ldp x1, x2, [sp, #0x10]
+ ldp x3, x4, [sp, #0x20]
+ add sp, sp, #0x40 // release the stack area
+
+ movi v16.8h, #0 // v16: per-lane gradient sums for pix1
+ movi v17.8h, #0 // NOTE(review): v17 is not referenced again in this function
+ movi v18.8h, #0 // v18: per-lane gradient sums for pix2
+ movi v19.8h, #0 // NOTE(review): v19 is not referenced again in this function
+
+ ld1 {v0.8b}, [x1], x3 // first pix1 row
+ subs w4, w4, #1 // we need to make h-1 iterations
+ ext v1.8b, v0.8b, v0.8b, #1 // x1 + 1
+ ld1 {v2.8b}, [x2], x3 // first pix2 row
+ cmp w4, #2 // flags are consumed by the b.lt below
+ ext v3.8b, v2.8b, v2.8b, #1 // x2 + 1
+
+ b.lt 2f // NOTE(review): with h == 1 the count is already 0 here and the loop at 2: still executes; presumably callers pass h >= 2 - confirm
+
+// make 2 iterations at once
+1:
+ ld1 {v4.8b}, [x1], x3 // pix1, next row
+ ld1 {v20.8b}, [x1], x3 // pix1, row after that
+ ld1 {v6.8b}, [x2], x3 // pix2, next row
+ ext v5.8b, v4.8b, v4.8b, #1 // x1 + stride + 1
+ ext v21.8b, v20.8b, v20.8b, #1 // same shift by one byte for the second pix1 row
+ ld1 {v22.8b}, [x2], x3 // pix2, row after that
+ ext v7.8b, v6.8b, v6.8b, #1 // x2 + stride + 1
+ usubl v31.8h, v0.8b, v4.8b // pix1: vertical difference at x
+ ext v23.8b, v22.8b, v22.8b, #1 // same shift by one byte for the second pix2 row
+ usubl v29.8h, v1.8b, v5.8b // pix1: vertical difference at x + 1
+ usubl v27.8h, v2.8b, v6.8b // pix2: vertical difference at x
+ usubl v25.8h, v3.8b, v7.8b // pix2: vertical difference at x + 1
+ saba v16.8h, v31.8h, v29.8h // pix1: accumulate |diff(x) - diff(x + 1)|
+ usubl v31.8h, v4.8b, v20.8b // second row pair, pix1 at x
+ saba v18.8h, v27.8h, v25.8h // pix2: accumulate |diff(x) - diff(x + 1)|
+ sub w4, w4, #2 // two row pairs consumed
+ usubl v29.8h, v5.8b, v21.8b // second row pair, pix1 at x + 1
+ mov v0.16b, v20.16b // last pix1 row becomes the upper row of the next pass
+ mov v1.16b, v21.16b
+ saba v16.8h, v31.8h, v29.8h
+ usubl v27.8h, v6.8b, v22.8b // second row pair, pix2 at x
+ usubl v25.8h, v7.8b, v23.8b // second row pair, pix2 at x + 1
+ mov v2.16b, v22.16b // last pix2 row becomes the upper row of the next pass
+ mov v3.16b, v23.16b
+ cmp w4, #2 // at least two row pairs left?
+ saba v18.8h, v27.8h, v25.8h
+ b.ge 1b
+ cbz w4, 3f // no leftover row pair
+
+// iterate by one
+2:
+ ld1 {v4.8b}, [x1], x3 // pix1, next row
+ subs w4, w4, #1
+ ext v5.8b, v4.8b, v4.8b, #1 // x1 + stride + 1
+ ld1 {v6.8b}, [x2], x3 // pix2, next row
+ usubl v31.8h, v0.8b, v4.8b // pix1: vertical difference at x
+ ext v7.8b, v6.8b, v6.8b, #1 // x2 + stride + 1
+
+ usubl v29.8h, v1.8b, v5.8b // pix1: vertical difference at x + 1
+ saba v16.8h, v31.8h, v29.8h // pix1: accumulate |diff(x) - diff(x + 1)|
+ usubl v27.8h, v2.8b, v6.8b // pix2: vertical difference at x
+ usubl v25.8h, v3.8b, v7.8b // pix2: vertical difference at x + 1
+ saba v18.8h, v27.8h, v25.8h // pix2: accumulate |diff(x) - diff(x + 1)|
+
+ mov v0.16b, v4.16b // current rows become the upper rows of the next pair
+ mov v1.16b, v5.16b
+ mov v2.16b, v6.16b
+ mov v3.16b, v7.16b
+
+ cbnz w4, 2b
+
+3:
+ sqsub v16.8h, v16.8h, v18.8h // per lane: pix1 sum - pix2 sum, saturating
+ ins v16.h[7], wzr // drop lane 7: its "x + 1" byte is byte 0 wrapped around by ext
+ saddlv s16, v16.8h // signed sum of all lanes
+ sqabs s16, s16 // absolute value, saturating
+ fmov w0, s16
+
+ mul w0, w0, w5 // scale by the multiplier
+ add w0, w0, w9 // add score1
+
+ ret
+endfunc
+
function pix_median_abs16_neon, export=1
// x0 unused
// x1 uint8_t *pix1
--
2.37.1
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 10+ messages in thread
* [FFmpeg-devel] [PATCH v2 5/7] lavc/aarch64: Provide optimized implementation of vsse8 for arm64.
2022-10-03 14:10 [FFmpeg-devel] [PATCH v2 0/7] arm64 neon implementation for 8bits functions Grzegorz Bernacki
` (3 preceding siblings ...)
2022-10-03 14:10 ` [FFmpeg-devel] [PATCH v2 4/7] lavc/aarch64: Provide neon implementation of nsse8 Grzegorz Bernacki
@ 2022-10-03 14:10 ` Grzegorz Bernacki
2022-10-03 14:10 ` [FFmpeg-devel] [PATCH v2 6/7] lavc/aarch64: Add neon implementation for vsse_intra8 Grzegorz Bernacki
` (2 subsequent siblings)
7 siblings, 0 replies; 10+ messages in thread
From: Grzegorz Bernacki @ 2022-10-03 14:10 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: gjb, upstream, jswinney, hum, martin, mw, spop
Provide optimized implementation of vsse8 for arm64.
Performance comparison tests are shown below.
- vsse_1_c: 141.5
- vsse_1_neon: 32.5
Benchmarks and tests are run with checkasm tool on AWS Graviton 3.
Signed-off-by: Grzegorz Bernacki <gjb@semihalf.com>
---
libavcodec/aarch64/me_cmp_init_aarch64.c | 5 ++
libavcodec/aarch64/me_cmp_neon.S | 70 ++++++++++++++++++++++++
2 files changed, 75 insertions(+)
diff --git a/libavcodec/aarch64/me_cmp_init_aarch64.c b/libavcodec/aarch64/me_cmp_init_aarch64.c
index 05156627fa..1a0c3e90bb 100644
--- a/libavcodec/aarch64/me_cmp_init_aarch64.c
+++ b/libavcodec/aarch64/me_cmp_init_aarch64.c
@@ -71,6 +71,9 @@ int nsse8_neon(int multiplier, const uint8_t *s, const uint8_t *s2,
int nsse8_neon_wrapper(MpegEncContext *c, const uint8_t *s1, const uint8_t *s2,
ptrdiff_t stride, int h);
+int vsse8_neon(MpegEncContext *c, const uint8_t *s1, const uint8_t *s2,
+ ptrdiff_t stride, int h);
+
av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
{
int cpu_flags = av_get_cpu_flags();
@@ -96,6 +99,8 @@ av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
c->vsad[5] = vsad_intra8_neon;
c->vsse[0] = vsse16_neon;
+ c->vsse[1] = vsse8_neon;
+
c->vsse[4] = vsse_intra16_neon;
c->nsse[0] = nsse16_neon_wrapper;
diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S
index 24be05dd18..104e02f495 100644
--- a/libavcodec/aarch64/me_cmp_neon.S
+++ b/libavcodec/aarch64/me_cmp_neon.S
@@ -838,6 +838,76 @@ function vsad16_neon, export=1
ret
endfunc
+function vsse8_neon, export=1
+ // x0 unused
+ // x1 uint8_t *pix1
+ // x2 uint8_t *pix2
+ // x3 ptrdiff_t stride
+ // w4 int h
+ // Returns in w0 the sum over h-1 row pairs and 8 columns of ((pix1[x] - pix2[x]) - (pix1[x + stride] - pix2[x + stride]))^2.
+ ld1 {v0.8b}, [x1], x3 // Load pix1[0], first iteration
+ ld1 {v1.8b}, [x2], x3 // Load pix2[0], first iteration
+
+ sub w4, w4, #1 // we need to make h-1 iterations
+ movi v16.4s, #0 // v16: 32-bit sums for lanes 0..3
+ movi v17.4s, #0 // v17: 32-bit sums for lanes 4..7
+
+ cmp w4, #3 // check if we can make 3 iterations at once
+ usubl v31.8h, v0.8b, v1.8b // Signed difference of pix1[0] - pix2[0], first iteration
+ b.lt 2f // NOTE(review): with h == 1 the count is already 0 here and the loop at 2: still executes; presumably callers pass h >= 2 - confirm
+
+
+1:
+ // x = abs(pix1[0] - pix2[0] - pix1[0 + stride] + pix2[0 + stride])
+ // res = (x) * (x)
+ ld1 {v0.8b}, [x1], x3 // Load pix1[0 + stride], first iteration
+ ld1 {v1.8b}, [x2], x3 // Load pix2[0 + stride], first iteration
+ ld1 {v2.8b}, [x1], x3 // Load pix1[0 + stride], second iteration
+ ld1 {v3.8b}, [x2], x3 // Load pix2[0 + stride], second iteration
+ usubl v29.8h, v0.8b, v1.8b // pix1 - pix2 for the first new row
+ usubl2 v28.8h, v0.16b, v1.16b // NOTE(review): v28 is not read anywhere in this function - looks redundant; confirm
+ ld1 {v4.8b}, [x1], x3 // Load pix1[0 + stride], third iteration
+ ld1 {v5.8b}, [x2], x3 // Load pix2[0 + stride], third iteration
+ sabd v31.8h, v31.8h, v29.8h // x for row pair 1: |previous diff - new diff|
+ usubl v27.8h, v2.8b, v3.8b // pix1 - pix2 for the second new row
+ usubl v25.8h, v4.8b, v5.8b // pix1 - pix2 for the third new row
+ sabd v29.8h, v29.8h, v27.8h // x for row pair 2 (uses v27 before it is overwritten below)
+ sabd v27.8h, v27.8h, v25.8h // x for row pair 3
+ umlal v16.4s, v31.4h, v31.4h // add x * x, lanes 0..3
+ umlal2 v17.4s, v31.8h, v31.8h // add x * x, lanes 4..7
+ mov v31.16b, v25.16b // last row's difference is the "previous diff" of the next pass
+ umlal v16.4s, v29.4h, v29.4h
+ umlal2 v17.4s, v29.8h, v29.8h
+ sub w4, w4, #3 // three row pairs consumed
+ umlal v16.4s, v27.4h, v27.4h
+ umlal2 v17.4s, v27.8h, v27.8h
+ cmp w4, #3 // at least three row pairs left?
+
+ b.ge 1b
+
+ cbz w4, 3f // no leftover row pairs
+
+// iterate by one
+2:
+ ld1 {v0.8b}, [x1], x3 // next pix1 row
+ ld1 {v1.8b}, [x2], x3 // next pix2 row
+ subs w4, w4, #1 // sets the flags tested by b.ne
+ usubl v29.8h, v0.8b, v1.8b // pix1 - pix2 for the new row
+ sabd v31.8h, v31.8h, v29.8h // x = |previous diff - new diff|
+ umlal v16.4s, v31.4h, v31.4h // add x * x, lanes 0..3
+ umlal2 v17.4s, v31.8h, v31.8h // add x * x, lanes 4..7
+ mov v31.16b, v29.16b // new diff becomes the previous diff
+ b.ne 2b
+
+3:
+ add v16.4s, v16.4s, v17.4s // merge the two accumulators
+ uaddlv d17, v16.4s // add the four 32-bit lanes
+ fmov w0, s17 // return the low 32 bits of the total
+
+ ret
+endfunc
+
+
function vsse16_neon, export=1
// x0 unused
// x1 uint8_t *pix1
--
2.37.1
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 10+ messages in thread
* [FFmpeg-devel] [PATCH v2 6/7] lavc/aarch64: Add neon implementation for vsse_intra8
2022-10-03 14:10 [FFmpeg-devel] [PATCH v2 0/7] arm64 neon implementation for 8bits functions Grzegorz Bernacki
` (4 preceding siblings ...)
2022-10-03 14:10 ` [FFmpeg-devel] [PATCH v2 5/7] lavc/aarch64: Provide optimized implementation of vsse8 for arm64 Grzegorz Bernacki
@ 2022-10-03 14:10 ` Grzegorz Bernacki
2022-10-03 14:10 ` [FFmpeg-devel] [PATCH v2 7/7] aarch64: me_cmp: Improve scheduling in vsse_intra8 Grzegorz Bernacki
2022-10-04 10:56 ` [FFmpeg-devel] [PATCH v2 0/7] arm64 neon implementation for 8bits functions Martin Storsjö
7 siblings, 0 replies; 10+ messages in thread
From: Grzegorz Bernacki @ 2022-10-03 14:10 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: gjb, upstream, jswinney, hum, martin, mw, spop
Provide optimized implementation for vsse_intra8 for arm64.
Performance tests are shown below.
- vsse_5_c: 87.7
- vsse_5_neon: 26.2
Benchmarks and tests are run with checkasm tool on AWS Graviton 3.
---
libavcodec/aarch64/me_cmp_init_aarch64.c | 4 ++
libavcodec/aarch64/me_cmp_neon.S | 53 ++++++++++++++++++++++++
2 files changed, 57 insertions(+)
diff --git a/libavcodec/aarch64/me_cmp_init_aarch64.c b/libavcodec/aarch64/me_cmp_init_aarch64.c
index 1a0c3e90bb..1e0f1cf4f1 100644
--- a/libavcodec/aarch64/me_cmp_init_aarch64.c
+++ b/libavcodec/aarch64/me_cmp_init_aarch64.c
@@ -74,6 +74,9 @@ int nsse8_neon_wrapper(MpegEncContext *c, const uint8_t *s1, const uint8_t *s2,
int vsse8_neon(MpegEncContext *c, const uint8_t *s1, const uint8_t *s2,
ptrdiff_t stride, int h);
+int vsse_intra8_neon(MpegEncContext *c, const uint8_t *s, const uint8_t *dummy,
+ ptrdiff_t stride, int h);
+
av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
{
int cpu_flags = av_get_cpu_flags();
@@ -102,6 +105,7 @@ av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
c->vsse[1] = vsse8_neon;
c->vsse[4] = vsse_intra16_neon;
+ c->vsse[5] = vsse_intra8_neon;
c->nsse[0] = nsse16_neon_wrapper;
c->nsse[1] = nsse8_neon_wrapper;
diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S
index 104e02f495..61e4f68335 100644
--- a/libavcodec/aarch64/me_cmp_neon.S
+++ b/libavcodec/aarch64/me_cmp_neon.S
@@ -1106,6 +1106,59 @@ function vsse_intra16_neon, export=1
ret
endfunc
+function vsse_intra8_neon, export=1
+ // x0 unused
+ // x1 uint8_t *pix1
+ // x2 uint8_t *dummy (not read by this function)
+ // x3 ptrdiff_t stride
+ // w4 int h
+
+ ld1 {v0.8b}, [x1], x3
+ movi v16.4s, #0
+
+ sub w4, w4, #1 // we need to make h-1 iterations
+ cmp w4, #3
+ b.lt 2f
+
+1:
+ // v = abs( pix1[0] - pix1[0 + stride] )
+ // score = sum( v * v ), returned in w0
+ ld1 {v1.8b}, [x1], x3 // row below the one held in v0
+ ld1 {v2.8b}, [x1], x3
+ uabd v30.8b, v0.8b, v1.8b
+ ld1 {v3.8b}, [x1], x3
+ umull v29.8h, v30.8b, v30.8b
+ uabd v27.8b, v1.8b, v2.8b
+ uadalp v16.4s, v29.8h
+ umull v26.8h, v27.8b, v27.8b
+ uabd v25.8b, v2.8b, v3.8b
+ uadalp v16.4s, v26.8h
+ umull v24.8h, v25.8b, v25.8b
+ sub w4, w4, #3
+ uadalp v16.4s, v24.8h
+ cmp w4, #3
+ mov v0.8b, v3.8b // last row loaded becomes the upper row of the next pass
+
+ b.ge 1b
+ cbz w4, 3f // no leftover row pairs
+
+// iterate by one
+2:
+ ld1 {v1.8b}, [x1], x3 // next row
+ subs w4, w4, #1
+ uabd v30.8b, v0.8b, v1.8b // v = |upper row - next row| per byte
+ mov v0.8b, v1.8b // next row becomes the upper row
+ umull v29.8h, v30.8b, v30.8b // v * v, widened to 16 bits
+ uadalp v16.4s, v29.8h // pairwise add into the 32-bit accumulators
+ cbnz w4, 2b
+
+3:
+ uaddlv d17, v16.4s // add the four 32-bit lanes
+ fmov w0, s17 // return the low 32 bits of the total
+
+ ret
+endfunc
+
function nsse16_neon, export=1
// x0 multiplier
// x1 uint8_t *pix1
--
2.37.1
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 10+ messages in thread
* [FFmpeg-devel] [PATCH v2 7/7] aarch64: me_cmp: Improve scheduling in vsse_intra8
2022-10-03 14:10 [FFmpeg-devel] [PATCH v2 0/7] arm64 neon implementation for 8bits functions Grzegorz Bernacki
` (5 preceding siblings ...)
2022-10-03 14:10 ` [FFmpeg-devel] [PATCH v2 6/7] lavc/aarch64: Add neon implementation for vsse_intra8 Grzegorz Bernacki
@ 2022-10-03 14:10 ` Grzegorz Bernacki
2022-10-04 10:56 ` [FFmpeg-devel] [PATCH v2 0/7] arm64 neon implementation for 8bits functions Martin Storsjö
7 siblings, 0 replies; 10+ messages in thread
From: Grzegorz Bernacki @ 2022-10-03 14:10 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: gjb, upstream, jswinney, hum, martin, mw, spop
From: Martin Storsjö <martin@martin.st>
Before: Cortex A53 A72 A73
vsse_5_neon: 74.7 31.5 26.0
After:
vsse_5_neon: 62.7 32.5 25.7
---
libavcodec/aarch64/me_cmp_neon.S | 12 ++++++------
1 file changed, 6 insertions(+), 6 deletions(-)
diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S
index 61e4f68335..d8a18cd4b8 100644
--- a/libavcodec/aarch64/me_cmp_neon.S
+++ b/libavcodec/aarch64/me_cmp_neon.S
@@ -1113,11 +1113,11 @@ function vsse_intra8_neon, export=1
// x3 ptrdiff_t stride
// w4 int h
+ sub w4, w4, #1 // we need to make h-1 iterations
ld1 {v0.8b}, [x1], x3
+ cmp w4, #3
movi v16.4s, #0
- sub w4, w4, #1 // we need to make h-1 iterations
- cmp w4, #3
b.lt 2f
1:
@@ -1127,13 +1127,13 @@ function vsse_intra8_neon, export=1
ld1 {v2.8b}, [x1], x3
uabd v30.8b, v0.8b, v1.8b
ld1 {v3.8b}, [x1], x3
- umull v29.8h, v30.8b, v30.8b
uabd v27.8b, v1.8b, v2.8b
- uadalp v16.4s, v29.8h
- umull v26.8h, v27.8b, v27.8b
+ umull v29.8h, v30.8b, v30.8b
uabd v25.8b, v2.8b, v3.8b
- uadalp v16.4s, v26.8h
+ umull v26.8h, v27.8b, v27.8b
+ uadalp v16.4s, v29.8h
umull v24.8h, v25.8b, v25.8b
+ uadalp v16.4s, v26.8h
sub w4, w4, #3
uadalp v16.4s, v24.8h
cmp w4, #3
--
2.37.1
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [FFmpeg-devel] [PATCH v2 0/7] arm64 neon implementation for 8bits functions
2022-10-03 14:10 [FFmpeg-devel] [PATCH v2 0/7] arm64 neon implementation for 8bits functions Grzegorz Bernacki
` (6 preceding siblings ...)
2022-10-03 14:10 ` [FFmpeg-devel] [PATCH v2 7/7] aarch64: me_cmp: Improve scheduling in vsse_intra8 Grzegorz Bernacki
@ 2022-10-04 10:56 ` Martin Storsjö
2022-10-04 11:34 ` Grzegorz Bernacki
7 siblings, 1 reply; 10+ messages in thread
From: Martin Storsjö @ 2022-10-04 10:56 UTC (permalink / raw)
To: Grzegorz Bernacki; +Cc: upstream, jswinney, hum, ffmpeg-devel, mw, spop
On Mon, 3 Oct 2022, Grzegorz Bernacki wrote:
> Changes since v1:
>
> - changed tabs to spaces
> - modified branch instruction in vsse8
> - apply Martin's patches with improved instructions scheduling
>
> Grzegorz Bernacki (4):
> lavc/aarch64: Add neon implementation for pix_abs8 functions.
> lavc/aarch64: Provide neon implementation of nsse8
> lavc/aarch64: Provide optimized implementation of vsse8 for arm64.
> lavc/aarch64: Add neon implementation for vsse_intra8
>
> Martin Storsjö (3):
> aarch64: me_cmp: Improve scheduling in ff_pix_abs8_y2_neon
> aarch64: me_cmp: Fix up the prologue of ff_pix_abs8_xy2_neon
> aarch64: me_cmp: Improve scheduling in vsse_intra8
>
> libavcodec/aarch64/me_cmp_init_aarch64.c | 33 ++
> libavcodec/aarch64/me_cmp_neon.S | 414 +++++++++++++++++++++++
> 2 files changed, 447 insertions(+)
Thanks! This mostly looked good to me.
I had actually meant that you would squash my fixes into your patches,
instead of keeping them as separate ones.
After squashing such changes, it might have been interesting to get
updated benchmarks in those commit messages (the ones that you have from
Graviton 3). However in this case, these changes didn't really make much
difference on out-of-order cores, only on in-order cores, so I guess
there's not that much value in getting updated benchmarks from Graviton 3
in this case.
So I went ahead and squashed those patches (and added co-authored-by lines
where relevant), and pushed them now. Thanks for your contribution!
// Martin
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [FFmpeg-devel] [PATCH v2 0/7] arm64 neon implementation for 8bits functions
2022-10-04 10:56 ` [FFmpeg-devel] [PATCH v2 0/7] arm64 neon implementation for 8bits functions Martin Storsjö
@ 2022-10-04 11:34 ` Grzegorz Bernacki
0 siblings, 0 replies; 10+ messages in thread
From: Grzegorz Bernacki @ 2022-10-04 11:34 UTC (permalink / raw)
To: Martin Storsjö; +Cc: upstream, jswinney, hum, ffmpeg-devel, mw, spop
Great!! Thanks a lot for your help and your review.
thanks,
greg
wt., 4 paź 2022 o 12:57 Martin Storsjö <martin@martin.st> napisał(a):
> On Mon, 3 Oct 2022, Grzegorz Bernacki wrote:
>
> > Changes since v1:
> >
> > - changed tabs to spaces
> > - modified branch instruction in vsse8
> > - apply Martin's patches with improved instructions scheduling
> >
> > Grzegorz Bernacki (4):
> > lavc/aarch64: Add neon implementation for pix_abs8 functions.
> > lavc/aarch64: Provide neon implementation of nsse8
> > lavc/aarch64: Provide optimized implementation of vsse8 for arm64.
> > lavc/aarch64: Add neon implementation for vsse_intra8
> >
> > Martin Storsjö (3):
> > aarch64: me_cmp: Improve scheduling in ff_pix_abs8_y2_neon
> > aarch64: me_cmp: Fix up the prologue of ff_pix_abs8_xy2_neon
> > aarch64: me_cmp: Improve scheduling in vsse_intra8
> >
> > libavcodec/aarch64/me_cmp_init_aarch64.c | 33 ++
> > libavcodec/aarch64/me_cmp_neon.S | 414 +++++++++++++++++++++++
> > 2 files changed, 447 insertions(+)
>
> Thanks! This mostly looked good to me.
>
> I had actually meant that you would squash my fixes into your patches,
> instead of keeping them as separate ones.
>
> After squashing such changes, it might have been interesting to get
> updated benchmarks in those commit messages (the ones that you have from
> Graviton 3). However in this case, these changes didn't really make much
> difference on out-of-order cores, only on in-order cores, so I guess
> there's not that much value in getting updated benchmarks from Graviton 3
> in this case.
>
> So I went ahead and squashed those patches (and added co-authored-by lines
> where relevant), and pushed them now. Thanks for your contribution!
>
> // Martin
>
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 10+ messages in thread
end of thread, other threads:[~2022-10-04 11:34 UTC | newest]
Thread overview: 10+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-10-03 14:10 [FFmpeg-devel] [PATCH v2 0/7] arm64 neon implementation for 8bits functions Grzegorz Bernacki
2022-10-03 14:10 ` [FFmpeg-devel] [PATCH v2 1/7] lavc/aarch64: Add neon implementation for pix_abs8 functions Grzegorz Bernacki
2022-10-03 14:10 ` [FFmpeg-devel] [PATCH v2 2/7] aarch64: me_cmp: Improve scheduling in ff_pix_abs8_y2_neon Grzegorz Bernacki
2022-10-03 14:10 ` [FFmpeg-devel] [PATCH v2 3/7] aarch64: me_cmp: Fix up the prologue of ff_pix_abs8_xy2_neon Grzegorz Bernacki
2022-10-03 14:10 ` [FFmpeg-devel] [PATCH v2 4/7] lavc/aarch64: Provide neon implementation of nsse8 Grzegorz Bernacki
2022-10-03 14:10 ` [FFmpeg-devel] [PATCH v2 5/7] lavc/aarch64: Provide optimized implementation of vsse8 for arm64 Grzegorz Bernacki
2022-10-03 14:10 ` [FFmpeg-devel] [PATCH v2 6/7] lavc/aarch64: Add neon implementation for vsse_intra8 Grzegorz Bernacki
2022-10-03 14:10 ` [FFmpeg-devel] [PATCH v2 7/7] aarch64: me_cmp: Improve scheduling in vsse_intra8 Grzegorz Bernacki
2022-10-04 10:56 ` [FFmpeg-devel] [PATCH v2 0/7] arm64 neon implementation for 8bits functions Martin Storsjö
2022-10-04 11:34 ` Grzegorz Bernacki
Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
This inbox may be cloned and mirrored by anyone:
git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git
# If you have public-inbox 1.1+ installed, you may
# initialize and index your mirror using the following commands:
public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
ffmpegdev@gitmailbox.com
public-inbox-index ffmpegdev
Example config snippet for mirrors.
AGPL code for this site: git clone https://public-inbox.org/public-inbox.git