* [FFmpeg-devel] [PATCH 2/5] swscale/aarch64: Add rgb24 to yuv implementation
[not found] <20240604135504.83169-1-quinkblack@foxmail.com>
@ 2024-06-04 13:55 ` Zhao Zhili
2024-06-05 6:29 ` Rémi Denis-Courmont
2024-06-05 8:16 ` Martin Storsjö
2024-06-04 13:55 ` [FFmpeg-devel] [PATCH 3/5] avutil/aarch64: Skip define AV_READ_TIME for apple Zhao Zhili
` (2 subsequent siblings)
3 siblings, 2 replies; 10+ messages in thread
From: Zhao Zhili @ 2024-06-04 13:55 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: Zhao Zhili
From: Zhao Zhili <zhilizhao@tencent.com>
Test on Apple M1:
rgb24_to_uv_1080_c: 7.2
rgb24_to_uv_1080_neon: 5.5
rgb24_to_uv_1280_c: 8.2
rgb24_to_uv_1280_neon: 6.2
rgb24_to_uv_1920_c: 12.5
rgb24_to_uv_1920_neon: 9.5
rgb24_to_uv_half_540_c: 6.5
rgb24_to_uv_half_540_neon: 3.0
rgb24_to_uv_half_640_c: 7.5
rgb24_to_uv_half_640_neon: 3.2
rgb24_to_uv_half_960_c: 12.5
rgb24_to_uv_half_960_neon: 6.0
rgb24_to_y_1080_c: 4.5
rgb24_to_y_1080_neon: 3.5
rgb24_to_y_1280_c: 5.2
rgb24_to_y_1280_neon: 4.2
rgb24_to_y_1920_c: 8.0
rgb24_to_y_1920_neon: 6.0
Signed-off-by: Zhao Zhili <zhilizhao@tencent.com>
---
libswscale/aarch64/Makefile | 1 +
libswscale/aarch64/input.S | 229 +++++++++++++++++++++++++++++++++++
libswscale/aarch64/swscale.c | 25 ++++
3 files changed, 255 insertions(+)
create mode 100644 libswscale/aarch64/input.S
diff --git a/libswscale/aarch64/Makefile b/libswscale/aarch64/Makefile
index da1d909561..adfd90a1b6 100644
--- a/libswscale/aarch64/Makefile
+++ b/libswscale/aarch64/Makefile
@@ -3,6 +3,7 @@ OBJS += aarch64/rgb2rgb.o \
aarch64/swscale_unscaled.o \
NEON-OBJS += aarch64/hscale.o \
+ aarch64/input.o \
aarch64/output.o \
aarch64/rgb2rgb_neon.o \
aarch64/yuv2rgb_neon.o \
diff --git a/libswscale/aarch64/input.S b/libswscale/aarch64/input.S
new file mode 100644
index 0000000000..ee0d223c6e
--- /dev/null
+++ b/libswscale/aarch64/input.S
@@ -0,0 +1,229 @@
+/*
+ * Copyright (c) 2024 Zhao Zhili <quinkblack@foxmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+
+.macro rgb24_to_yuv_load_rgb, src
+ ld3 { v16.16b, v17.16b, v18.16b }, [\src]
+ ushll v19.8h, v16.8b, #0 // v19: r
+ ushll v20.8h, v17.8b, #0 // v20: g
+ ushll v21.8h, v18.8b, #0 // v21: b
+ ushll2 v22.8h, v16.16b, #0 // v22: r
+ ushll2 v23.8h, v17.16b, #0 // v23: g
+ ushll2 v24.8h, v18.16b, #0 // v24: b
+.endm
+
+.macro rgb24_to_yuv_product, r, g, b, dst1, dst2, dst, coef0, coef1, coef2, right_shift
+ mov \dst1\().16b, v6.16b // dst1 = const_offset
+ mov \dst2\().16b, v6.16b // dst2 = const_offset
+ smlal \dst1\().4s, \coef0\().4h, \r\().4h // dst1 += rx * r
+ smlal2 \dst2\().4s, \coef0\().8h, \r\().8h // dst2 += rx * r
+ smlal \dst1\().4s, \coef1\().4h, \g\().4h // dst1 += gx * g
+ smlal2 \dst2\().4s, \coef1\().8h, \g\().8h // dst2 += gx * g
+ smlal \dst1\().4s, \coef2\().4h, \b\().4h // dst1 += bx * b
+ smlal2 \dst2\().4s, \coef2\().8h, \b\().8h // dst2 += bx * b
+ sqshrn \dst\().4h, \dst1\().4s, \right_shift // dst_lower_half = dst1 >> right_shift
+ sqshrn2 \dst\().8h, \dst2\().4s, \right_shift // dst_higher_half = dst2 >> right_shift
+.endm
+
+function ff_rgb24ToY_neon, export=1
+ cmp w4, #0 // check width > 0
+ b.le 4f
+
+ ldp w10, w11, [x5], #8 // w10: ry, w11: gy
+ dup v0.8h, w10
+ dup v1.8h, w11
+ ldr w12, [x5] // w12: by
+ dup v2.8h, w12
+
+ mov w9, #256 // w9 = 1 << (RGB2YUV_SHIFT - 7)
+ movk w9, #8, lsl #16 // w9 += 32 << (RGB2YUV_SHIFT - 1)
+ dup v6.4s, w9 // w9: const_offset
+
+ mov x2, #0 // w2: i
+ and w3, w4, #0xFFFFFFF0 // w3 = width / 16 * 16
+ cbz w3, 3f
+1:
+ rgb24_to_yuv_load_rgb x1
+ rgb24_to_yuv_product v19, v20, v21, v25, v26, v16, v0, v1, v2, #9
+ rgb24_to_yuv_product v22, v23, v24, v27, v28, v17, v0, v1, v2, #9
+ stp q16, q17, [x0], #32 // store to dst
+
+ add w2, w2, #16 // i += 16
+ add x1, x1, #48 // src += 48
+ cmp w2, w3 // i < (width / 16 * 16)
+ b.lt 1b
+ b 3f
+2:
+ ldrb w13, [x1] // w13: r
+ ldrb w14, [x1, #1] // w14: g
+ ldrb w15, [x1, #2] // w15: b
+
+ smaddl x13, w13, w10, x9 // x13 = ry * r + const_offset
+ smaddl x13, w14, w11, x13 // x13 += gy * g
+ smaddl x13, w15, w12, x13 // x13 += by * b
+ asr w13, w13, #9 // x13 >>= 9
+ strh w13, [x0], #2 // store to dst
+
+ add w2, w2, #1 // i++
+ add x1, x1, #3 // src += 3
+3:
+ cmp w2, w4 // i < width
+ b.lt 2b
+4:
+ ret
+endfunc
+
+.macro rgb24_load_uv_coeff half
+ add x6, x6, #12
+
+ ldp w10, w11, [x6], #8 // w10: ru, w11: gu
+ dup v0.8h, w10
+ dup v1.8h, w11
+
+ ldp w12, w13, [x6], #8 // w12: bu, w13: rv
+ dup v2.8h, w12
+ dup v3.8h, w13
+
+ ldp w14, w15, [x6], #8 // w14: gv, w15: bv
+ dup v4.8h, w14
+ dup v5.8h, w15
+
+ .if \half
+ mov w9, #512
+ movk w9, #128, lsl #16 // w9: const_offset
+ .else
+ mov w9, #256
+ movk w9, #64, lsl #16 // w9: const_offset
+ .endif
+ dup v6.4s, w9
+.endm
+
+function ff_rgb24ToUV_half_neon, export=1
+ cmp w5, #0 // check width > 0
+ b.le 4f
+
+ rgb24_load_uv_coeff half=1
+
+ mov x9, #0 // x9: i
+ and w7, w5, #0xFFFFFFF8 // w7 = width / 8 * 8
+ cbz w7, 3f
+1:
+ ld3 { v16.16b, v17.16b, v18.16b }, [x3]
+ uaddlp v19.8h, v16.16b // v19: r
+ uaddlp v20.8h, v17.16b // v20: g
+ uaddlp v21.8h, v18.16b // v21: b
+
+ rgb24_to_yuv_product v19, v20, v21, v22, v23, v16, v0, v1, v2, #10
+ str q16, [x0], #16 // store dst_u
+ rgb24_to_yuv_product v19, v20, v21, v24, v25, v17, v3, v4, v5, #10
+ str q17, [x1], #16 // store dst_v
+
+ add w9, w9, #8 // i += 8
+ add x3, x3, #48 // src += 48
+ cmp w9, w7 // i < (width * 8 / 8)
+ b.lt 1b
+ b 3f
+2:
+ ldrb w2, [x3] // w2: r1
+ ldrb w4, [x3, #3] // w4: r2
+ add w2, w2, w4 // w2 = r1 + r2
+
+ ldrb w4, [x3, #1] // w4: g1
+ ldrb w7, [x3, #4] // w7: g2
+ add w4, w4, w7 // w4 = g1 + g2
+
+ ldrb w7, [x3, #2] // w7: b1
+ ldrb w8, [x3, #5] // w8: b2
+ add w7, w7, w8 // w7 = b1 + b2
+
+ umov w8, v6.s[0] // dst_u = const_offset
+ smaddl x8, w2, w10, x8 // dst_u += ru * r
+ smaddl x8, w4, w11, x8 // dst_u += gu * g
+ smaddl x8, w7, w12, x8 // dst_u += bu * b
+ asr x8, x8, #10 // dst_u >>= 10
+ strh w8, [x0], #2 // store dst_u
+
+ umov w8, v6.s[0] // dst_v = const_offset
+ smaddl x8, w2, w13, x8 // dst_v += rv * r
+ smaddl x8, w4, w14, x8 // dst_v += gv * g
+ smaddl x8, w7, w15, x8 // dst_v += bv * b
+ asr x8, x8, #10 // dst_v >>= 10
+ strh w8, [x1], #2 // store dst_v
+
+ add w9, w9, #1 // i++
+ add x3, x3, #6 // src += 6
+3:
+ cmp w9, w5
+ b.lt 2b
+4:
+ ret
+endfunc
+
+function ff_rgb24ToUV_neon, export=1
+ cmp w5, #0 // check width > 0
+ b.le 4f
+
+ rgb24_load_uv_coeff half=0
+
+ mov x2, #0 // w2: i
+ and w4, w5, #0xFFFFFFF0 // w4: width / 16 * 16
+ cbz w4, 3f
+1:
+ rgb24_to_yuv_load_rgb x3
+ rgb24_to_yuv_product v19, v20, v21, v25, v26, v16, v0, v1, v2, #9
+ rgb24_to_yuv_product v22, v23, v24, v27, v28, v17, v0, v1, v2, #9
+ stp q16, q17, [x0], #32 // store to dst_u
+ rgb24_to_yuv_product v19, v20, v21, v25, v26, v16, v3, v4, v5, #9
+ rgb24_to_yuv_product v22, v23, v24, v27, v28, v17, v3, v4, v5, #9
+ stp q16, q17, [x1], #32 // store to dst_v
+
+ add w2, w2, #16 // i += 16
+ add x3, x3, #48 // src += 48
+ cmp w2, w4 // i < (width / 16 * 16)
+ b.lt 1b
+ b 3f
+2:
+ ldrb w16, [x3] // w16: r
+ ldrb w17, [x3, #1] // w17: g
+ ldrb w4, [x3, #2] // w4: b
+
+ umov w7, v6.s[0] // w7 = const_offset
+
+ smaddl x8, w16, w10, x7 // x8 = ru * r + const_offset
+ smaddl x8, w17, w11, x8 // x8 += gu * g
+ smaddl x8, w4, w12, x8 // x8 += bu * b
+ asr w8, w8, #9 // x8 >>= 9
+ strh w8, [x0], #2 // store to dst_u
+
+ smaddl x8, w16, w13, x7 // x8 = rv * r + const_offset
+ smaddl x8, w17, w14, x8 // x8 += gv * g
+ smaddl x8, w4, w15, x8 // x8 += bv * b
+ asr w8, w8, #9 // x8 >>= 9
+ strh w8, [x1], #2 // store to dst_v
+
+ add w2, w2, #1 // i++
+ add x3, x3, #3 // src += 3
+3:
+ cmp w2, w5 // i < width
+ b.lt 2b
+4:
+ ret
+endfunc
diff --git a/libswscale/aarch64/swscale.c b/libswscale/aarch64/swscale.c
index bbd9719a44..4c4ea39dc1 100644
--- a/libswscale/aarch64/swscale.c
+++ b/libswscale/aarch64/swscale.c
@@ -201,6 +201,20 @@ void ff_yuv2plane1_8_neon(
default: break; \
}
+void ff_rgb24ToY_neon(uint8_t *_dst, const uint8_t *src, const uint8_t *unused1,
+ const uint8_t *unused2, int width,
+ uint32_t *rgb2yuv, void *opq);
+
+void ff_rgb24ToUV_neon(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *unused0,
+ const uint8_t *src1,
+ const uint8_t *src2, int width, uint32_t *rgb2yuv,
+ void *opq);
+
+void ff_rgb24ToUV_half_neon(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *unused0,
+ const uint8_t *src1,
+ const uint8_t *src2, int width, uint32_t *rgb2yuv,
+ void *opq);
+
av_cold void ff_sws_init_swscale_aarch64(SwsContext *c)
{
int cpu_flags = av_get_cpu_flags();
@@ -212,5 +226,16 @@ av_cold void ff_sws_init_swscale_aarch64(SwsContext *c)
if (c->dstBpc == 8) {
c->yuv2planeX = ff_yuv2planeX_8_neon;
}
+ switch (c->srcFormat) {
+ case AV_PIX_FMT_RGB24:
+ c->lumToYV12 = ff_rgb24ToY_neon;
+ if (c->chrSrcHSubSample)
+ c->chrToYV12 = ff_rgb24ToUV_half_neon;
+ else
+ c->chrToYV12 = ff_rgb24ToUV_neon;
+ break;
+ default:
+ break;
+ }
}
}
--
2.42.0
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 10+ messages in thread
* [FFmpeg-devel] [PATCH 3/5] avutil/aarch64: Skip define AV_READ_TIME for apple
[not found] <20240604135504.83169-1-quinkblack@foxmail.com>
2024-06-04 13:55 ` [FFmpeg-devel] [PATCH 2/5] swscale/aarch64: Add rgb24 to yuv implementation Zhao Zhili
@ 2024-06-04 13:55 ` Zhao Zhili
2024-06-04 13:55 ` [FFmpeg-devel] [PATCH 4/5] avutil/timer: Add clock_gettime as a fallback of AV_READ_TIME Zhao Zhili
2024-06-04 13:55 ` [FFmpeg-devel] [PATCH 5/5] avutil/aarch64: Fallback to clock_gettime as timer on Android Zhao Zhili
3 siblings, 0 replies; 10+ messages in thread
From: Zhao Zhili @ 2024-06-04 13:55 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: Zhao Zhili
From: Zhao Zhili <zhilizhao@tencent.com>
It will fall back to mach_absolute_time inside libavutil/timer.h
---
libavutil/aarch64/timer.h | 8 +-------
1 file changed, 1 insertion(+), 7 deletions(-)
diff --git a/libavutil/aarch64/timer.h b/libavutil/aarch64/timer.h
index 8b28fd354c..fadc9568f8 100644
--- a/libavutil/aarch64/timer.h
+++ b/libavutil/aarch64/timer.h
@@ -24,13 +24,7 @@
#include <stdint.h>
#include "config.h"
-#if defined(__APPLE__)
-
-#include <mach/mach_time.h>
-
-#define AV_READ_TIME mach_absolute_time
-
-#elif HAVE_INLINE_ASM
+#if HAVE_INLINE_ASM && !defined(__APPLE__)
#define AV_READ_TIME read_time
--
2.42.0
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 10+ messages in thread
* [FFmpeg-devel] [PATCH 4/5] avutil/timer: Add clock_gettime as a fallback of AV_READ_TIME
[not found] <20240604135504.83169-1-quinkblack@foxmail.com>
2024-06-04 13:55 ` [FFmpeg-devel] [PATCH 2/5] swscale/aarch64: Add rgb24 to yuv implementation Zhao Zhili
2024-06-04 13:55 ` [FFmpeg-devel] [PATCH 3/5] avutil/aarch64: Skip define AV_READ_TIME for apple Zhao Zhili
@ 2024-06-04 13:55 ` Zhao Zhili
2024-06-04 13:55 ` [FFmpeg-devel] [PATCH 5/5] avutil/aarch64: Fallback to clock_gettime as timer on Android Zhao Zhili
3 siblings, 0 replies; 10+ messages in thread
From: Zhao Zhili @ 2024-06-04 13:55 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: Zhao Zhili
From: Zhao Zhili <zhilizhao@tencent.com>
---
libavutil/timer.h | 5 +++++
1 file changed, 5 insertions(+)
diff --git a/libavutil/timer.h b/libavutil/timer.h
index 2cd299eca3..74c4d84e69 100644
--- a/libavutil/timer.h
+++ b/libavutil/timer.h
@@ -46,6 +46,8 @@
#include "macos_kperf.h"
#elif HAVE_MACH_ABSOLUTE_TIME
#include <mach/mach_time.h>
+#elif HAVE_CLOCK_GETTIME
+#include <time.h>
#endif
#include "common.h"
@@ -70,6 +72,9 @@
# define AV_READ_TIME gethrtime
# elif HAVE_MACH_ABSOLUTE_TIME
# define AV_READ_TIME mach_absolute_time
+# elif HAVE_CLOCK_GETTIME && defined(CLOCK_MONOTONIC)
+# include "libavutil/time.h"
+# define AV_READ_TIME av_gettime_relative
# endif
#endif
--
2.42.0
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 10+ messages in thread
* [FFmpeg-devel] [PATCH 5/5] avutil/aarch64: Fallback to clock_gettime as timer on Android
[not found] <20240604135504.83169-1-quinkblack@foxmail.com>
` (2 preceding siblings ...)
2024-06-04 13:55 ` [FFmpeg-devel] [PATCH 4/5] avutil/timer: Add clock_gettime as a fallback of AV_READ_TIME Zhao Zhili
@ 2024-06-04 13:55 ` Zhao Zhili
2024-06-04 20:25 ` Martin Storsjö
3 siblings, 1 reply; 10+ messages in thread
From: Zhao Zhili @ 2024-06-04 13:55 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: Zhao Zhili
From: Zhao Zhili <zhilizhao@tencent.com>
The inline asm doesn't work on Android.
---
libavutil/aarch64/timer.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/libavutil/aarch64/timer.h b/libavutil/aarch64/timer.h
index fadc9568f8..13a58b48e4 100644
--- a/libavutil/aarch64/timer.h
+++ b/libavutil/aarch64/timer.h
@@ -24,7 +24,7 @@
#include <stdint.h>
#include "config.h"
-#if HAVE_INLINE_ASM && !defined(__APPLE__)
+#if HAVE_INLINE_ASM && !defined(__APPLE__) && !defined(__ANDROID__)
#define AV_READ_TIME read_time
--
2.42.0
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [FFmpeg-devel] [PATCH 5/5] avutil/aarch64: Fallback to clock_gettime as timer on Android
2024-06-04 13:55 ` [FFmpeg-devel] [PATCH 5/5] avutil/aarch64: Fallback to clock_gettime as timer on Android Zhao Zhili
@ 2024-06-04 20:25 ` Martin Storsjö
0 siblings, 0 replies; 10+ messages in thread
From: Martin Storsjö @ 2024-06-04 20:25 UTC (permalink / raw)
To: FFmpeg development discussions and patches; +Cc: Zhao Zhili
On Tue, 4 Jun 2024, Zhao Zhili wrote:
> From: Zhao Zhili <zhilizhao@tencent.com>
>
> The inline asm doesn't work on Android.
Using pmccntr_el0 doesn't work, no, but instead of falling back to
clock_gettime, you may want to use cntvct_el0 instead of pmccntr_el0. IIRC
that works on Android, at least it worked a number of years ago. It has
less precision than pmccntr_el0, but maybe is better than clock_gettime?
I.e., use similar inline assembly as before, but with cntvct_el0 instead
of pmccntr_el0.
// Martin
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [FFmpeg-devel] [PATCH 2/5] swscale/aarch64: Add rgb24 to yuv implementation
2024-06-04 13:55 ` [FFmpeg-devel] [PATCH 2/5] swscale/aarch64: Add rgb24 to yuv implementation Zhao Zhili
@ 2024-06-05 6:29 ` Rémi Denis-Courmont
2024-06-05 6:53 ` Zhao Zhili
2024-06-05 8:16 ` Martin Storsjö
1 sibling, 1 reply; 10+ messages in thread
From: Rémi Denis-Courmont @ 2024-06-05 6:29 UTC (permalink / raw)
To: FFmpeg development discussions and patches
Le 4 juin 2024 16:55:01 GMT+03:00, Zhao Zhili <quinkblack@foxmail.com> a écrit :
>From: Zhao Zhili <zhilizhao@tencent.com>
>
>Test on Apple M1:
>
>rgb24_to_uv_1080_c: 7.2
>rgb24_to_uv_1080_neon: 5.5
>rgb24_to_uv_1280_c: 8.2
>rgb24_to_uv_1280_neon: 6.2
>rgb24_to_uv_1920_c: 12.5
>rgb24_to_uv_1920_neon: 9.5
>
>rgb24_to_uv_half_540_c: 6.5
>rgb24_to_uv_half_540_neon: 3.0
>rgb24_to_uv_half_640_c: 7.5
>rgb24_to_uv_half_640_neon: 3.2
>rgb24_to_uv_half_960_c: 12.5
>rgb24_to_uv_half_960_neon: 6.0
>
>rgb24_to_y_1080_c: 4.5
>rgb24_to_y_1080_neon: 3.5
>rgb24_to_y_1280_c: 5.2
>rgb24_to_y_1280_neon: 4.2
>rgb24_to_y_1920_c: 8.0
>rgb24_to_y_1920_neon: 6.0
>
>Signed-off-by: Zhao Zhili <zhilizhao@tencent.com>
>---
> libswscale/aarch64/Makefile | 1 +
> libswscale/aarch64/input.S | 229 +++++++++++++++++++++++++++++++++++
> libswscale/aarch64/swscale.c | 25 ++++
> 3 files changed, 255 insertions(+)
> create mode 100644 libswscale/aarch64/input.S
>
>diff --git a/libswscale/aarch64/Makefile b/libswscale/aarch64/Makefile
>index da1d909561..adfd90a1b6 100644
>--- a/libswscale/aarch64/Makefile
>+++ b/libswscale/aarch64/Makefile
>@@ -3,6 +3,7 @@ OBJS += aarch64/rgb2rgb.o \
> aarch64/swscale_unscaled.o \
>
> NEON-OBJS += aarch64/hscale.o \
>+ aarch64/input.o \
> aarch64/output.o \
> aarch64/rgb2rgb_neon.o \
> aarch64/yuv2rgb_neon.o \
>diff --git a/libswscale/aarch64/input.S b/libswscale/aarch64/input.S
>new file mode 100644
>index 0000000000..ee0d223c6e
>--- /dev/null
>+++ b/libswscale/aarch64/input.S
>@@ -0,0 +1,229 @@
>+/*
>+ * Copyright (c) 2024 Zhao Zhili <quinkblack@foxmail.com>
>+ *
>+ * This file is part of FFmpeg.
>+ *
>+ * FFmpeg is free software; you can redistribute it and/or
>+ * modify it under the terms of the GNU Lesser General Public
>+ * License as published by the Free Software Foundation; either
>+ * version 2.1 of the License, or (at your option) any later version.
>+ *
>+ * FFmpeg is distributed in the hope that it will be useful,
>+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
>+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
>+ * Lesser General Public License for more details.
>+ *
>+ * You should have received a copy of the GNU Lesser General Public
>+ * License along with FFmpeg; if not, write to the Free Software
>+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
>+ */
>+
>+#include "libavutil/aarch64/asm.S"
>+
>+.macro rgb24_to_yuv_load_rgb, src
>+ ld3 { v16.16b, v17.16b, v18.16b }, [\src]
>+ ushll v19.8h, v16.8b, #0 // v19: r
>+ ushll v20.8h, v17.8b, #0 // v20: g
>+ ushll v21.8h, v18.8b, #0 // v21: b
>+ ushll2 v22.8h, v16.16b, #0 // v22: r
>+ ushll2 v23.8h, v17.16b, #0 // v23: g
>+ ushll2 v24.8h, v18.16b, #0 // v24: b
>+.endm
>+
>+.macro rgb24_to_yuv_product, r, g, b, dst1, dst2, dst, coef0, coef1, coef2, right_shift
>+ mov \dst1\().16b, v6.16b // dst1 = const_offset
>+ mov \dst2\().16b, v6.16b // dst2 = const_offset
>+ smlal \dst1\().4s, \coef0\().4h, \r\().4h // dst1 += rx * r
>+ smlal2 \dst2\().4s, \coef0\().8h, \r\().8h // dst2 += rx * r
>+ smlal \dst1\().4s, \coef1\().4h, \g\().4h // dst1 += gx * g
>+ smlal2 \dst2\().4s, \coef1\().8h, \g\().8h // dst2 += gx * g
>+ smlal \dst1\().4s, \coef2\().4h, \b\().4h // dst1 += bx * b
>+ smlal2 \dst2\().4s, \coef2\().8h, \b\().8h // dst2 += bx * b
>+ sqshrn \dst\().4h, \dst1\().4s, \right_shift // dst_lower_half = dst1 >> right_shift
>+ sqshrn2 \dst\().8h, \dst2\().4s, \right_shift // dst_higher_half = dst2 >> right_shift
>+.endm
>+
>+function ff_rgb24ToY_neon, export=1
>+ cmp w4, #0 // check width > 0
>+ b.le 4f
>+
>+ ldp w10, w11, [x5], #8 // w10: ry, w11: gy
I don't think it affects anything on your OoO execution hardware, but you're using the result of this load right off the bat in the next instruction. Ditto below. This may hurt perfs on not-so-fancy CPUs.
>+ dup v0.8h, w10
>+ dup v1.8h, w11
>+ ldr w12, [x5] // w12: by
>+ dup v2.8h, w12
>+
>+ mov w9, #256 // w9 = 1 << (RGB2YUV_SHIFT - 7)
>+ movk w9, #8, lsl #16 // w9 += 32 << (RGB2YUV_SHIFT - 1)
>+ dup v6.4s, w9 // w9: const_offset
>+
>+ mov x2, #0 // w2: i
>+ and w3, w4, #0xFFFFFFF0 // w3 = width / 16 * 16
>+ cbz w3, 3f
>+1:
>+ rgb24_to_yuv_load_rgb x1
>+ rgb24_to_yuv_product v19, v20, v21, v25, v26, v16, v0, v1, v2, #9
>+ rgb24_to_yuv_product v22, v23, v24, v27, v28, v17, v0, v1, v2, #9
>+ stp q16, q17, [x0], #32 // store to dst
>+
>+ add w2, w2, #16 // i += 16
>+ add x1, x1, #48 // src += 48
>+ cmp w2, w3 // i < (width / 16 * 16)
>+ b.lt 1b
>+ b 3f
>+2:
>+ ldrb w13, [x1] // w13: r
>+ ldrb w14, [x1, #1] // w14: g
>+ ldrb w15, [x1, #2] // w15: b
You can reorder instructions a little to use post-index and eliminate the ADD, though that won't make much difference.
I don't get why the perf gain is so low, or is this an artefact of Apple CPUs?
>+
>+ smaddl x13, w13, w10, x9 // x13 = ry * r + const_offset
>+ smaddl x13, w14, w11, x13 // x13 += gy * g
>+ smaddl x13, w15, w12, x13 // x13 += by * b
>+ asr w13, w13, #9 // x13 >>= 9
>+ strh w13, [x0], #2 // store to dst
>+
>+ add w2, w2, #1 // i++
>+ add x1, x1, #3 // src += 3
>+3:
>+ cmp w2, w4 // i < width
>+ b.lt 2b
>+4:
>+ ret
>+endfunc
>+
>+.macro rgb24_load_uv_coeff half
>+ add x6, x6, #12
>+
>+ ldp w10, w11, [x6], #8 // w10: ru, w11: gu
>+ dup v0.8h, w10
>+ dup v1.8h, w11
>+
>+ ldp w12, w13, [x6], #8 // w12: bu, w13: rv
>+ dup v2.8h, w12
>+ dup v3.8h, w13
>+
>+ ldp w14, w15, [x6], #8 // w14: gv, w15: bv
>+ dup v4.8h, w14
>+ dup v5.8h, w15
>+
>+ .if \half
>+ mov w9, #512
>+ movk w9, #128, lsl #16 // w9: const_offset
>+ .else
>+ mov w9, #256
>+ movk w9, #64, lsl #16 // w9: const_offset
>+ .endif
>+ dup v6.4s, w9
>+.endm
>+
>+function ff_rgb24ToUV_half_neon, export=1
>+ cmp w5, #0 // check width > 0
>+ b.le 4f
>+
>+ rgb24_load_uv_coeff half=1
>+
>+ mov x9, #0 // x9: i
>+ and w7, w5, #0xFFFFFFF8 // w7 = width / 8 * 8
>+ cbz w7, 3f
>+1:
>+ ld3 { v16.16b, v17.16b, v18.16b }, [x3]
>+ uaddlp v19.8h, v16.16b // v19: r
>+ uaddlp v20.8h, v17.16b // v20: g
>+ uaddlp v21.8h, v18.16b // v21: b
>+
>+ rgb24_to_yuv_product v19, v20, v21, v22, v23, v16, v0, v1, v2, #10
>+ str q16, [x0], #16 // store dst_u
>+ rgb24_to_yuv_product v19, v20, v21, v24, v25, v17, v3, v4, v5, #10
>+ str q17, [x1], #16 // store dst_v
>+
>+ add w9, w9, #8 // i += 8
>+ add x3, x3, #48 // src += 48
>+ cmp w9, w7 // i < (width * 8 / 8)
>+ b.lt 1b
>+ b 3f
>+2:
>+ ldrb w2, [x3] // w2: r1
>+ ldrb w4, [x3, #3] // w4: r2
>+ add w2, w2, w4 // w2 = r1 + r2
>+
>+ ldrb w4, [x3, #1] // w4: g1
>+ ldrb w7, [x3, #4] // w7: g2
>+ add w4, w4, w7 // w4 = g1 + g2
>+
>+ ldrb w7, [x3, #2] // w7: b1
>+ ldrb w8, [x3, #5] // w8: b2
>+ add w7, w7, w8 // w7 = b1 + b2
>+
>+ umov w8, v6.s[0] // dst_u = const_offset
>+ smaddl x8, w2, w10, x8 // dst_u += ru * r
>+ smaddl x8, w4, w11, x8 // dst_u += gu * g
>+ smaddl x8, w7, w12, x8 // dst_u += bu * b
>+ asr x8, x8, #10 // dst_u >>= 10
>+ strh w8, [x0], #2 // store dst_u
>+
>+ umov w8, v6.s[0] // dst_v = const_offset
>+ smaddl x8, w2, w13, x8 // dst_v += rv * r
>+ smaddl x8, w4, w14, x8 // dst_v += gv * g
>+ smaddl x8, w7, w15, x8 // dst_v += bv * b
>+ asr x8, x8, #10 // dst_v >>= 10
>+ strh w8, [x1], #2 // store dst_v
>+
>+ add w9, w9, #1 // i++
>+ add x3, x3, #6 // src += 6
>+3:
>+ cmp w9, w5
>+ b.lt 2b
>+4:
>+ ret
>+endfunc
>+
>+function ff_rgb24ToUV_neon, export=1
>+ cmp w5, #0 // check width > 0
>+ b.le 4f
>+
>+ rgb24_load_uv_coeff half=0
>+
>+ mov x2, #0 // w2: i
>+ and w4, w5, #0xFFFFFFF0 // w4: width / 16 * 16
>+ cbz w4, 3f
>+1:
>+ rgb24_to_yuv_load_rgb x3
>+ rgb24_to_yuv_product v19, v20, v21, v25, v26, v16, v0, v1, v2, #9
>+ rgb24_to_yuv_product v22, v23, v24, v27, v28, v17, v0, v1, v2, #9
>+ stp q16, q17, [x0], #32 // store to dst_u
>+ rgb24_to_yuv_product v19, v20, v21, v25, v26, v16, v3, v4, v5, #9
>+ rgb24_to_yuv_product v22, v23, v24, v27, v28, v17, v3, v4, v5, #9
>+ stp q16, q17, [x1], #32 // store to dst_v
>+
>+ add w2, w2, #16 // i += 16
>+ add x3, x3, #48 // src += 48
>+ cmp w2, w4 // i < (width / 16 * 16)
>+ b.lt 1b
>+ b 3f
>+2:
>+ ldrb w16, [x3] // w16: r
>+ ldrb w17, [x3, #1] // w17: g
>+ ldrb w4, [x3, #2] // w4: b
>+
>+ umov w7, v6.s[0] // w7 = const_offset
>+
>+ smaddl x8, w16, w10, x7 // x8 = ru * r + const_offset
>+ smaddl x8, w17, w11, x8 // x8 += gu * g
>+ smaddl x8, w4, w12, x8 // x8 += bu * b
>+ asr w8, w8, #9 // x8 >>= 9
>+ strh w8, [x0], #2 // store to dst_u
>+
>+ smaddl x8, w16, w13, x7 // x8 = rv * r + const_offset
>+ smaddl x8, w17, w14, x8 // x8 += gv * g
>+ smaddl x8, w4, w15, x8 // x8 += bv * b
>+ asr w8, w8, #9 // x8 >>= 9
>+ strh w8, [x1], #2 // store to dst_v
>+
>+ add w2, w2, #1 // i++
>+ add x3, x3, #3 // src += 3
>+3:
>+ cmp w2, w5 // i < width
>+ b.lt 2b
>+4:
>+ ret
>+endfunc
>diff --git a/libswscale/aarch64/swscale.c b/libswscale/aarch64/swscale.c
>index bbd9719a44..4c4ea39dc1 100644
>--- a/libswscale/aarch64/swscale.c
>+++ b/libswscale/aarch64/swscale.c
>@@ -201,6 +201,20 @@ void ff_yuv2plane1_8_neon(
> default: break; \
> }
>
>+void ff_rgb24ToY_neon(uint8_t *_dst, const uint8_t *src, const uint8_t *unused1,
>+ const uint8_t *unused2, int width,
>+ uint32_t *rgb2yuv, void *opq);
>+
>+void ff_rgb24ToUV_neon(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *unused0,
>+ const uint8_t *src1,
>+ const uint8_t *src2, int width, uint32_t *rgb2yuv,
>+ void *opq);
>+
>+void ff_rgb24ToUV_half_neon(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *unused0,
>+ const uint8_t *src1,
>+ const uint8_t *src2, int width, uint32_t *rgb2yuv,
>+ void *opq);
>+
> av_cold void ff_sws_init_swscale_aarch64(SwsContext *c)
> {
> int cpu_flags = av_get_cpu_flags();
>@@ -212,5 +226,16 @@ av_cold void ff_sws_init_swscale_aarch64(SwsContext *c)
> if (c->dstBpc == 8) {
> c->yuv2planeX = ff_yuv2planeX_8_neon;
> }
>+ switch (c->srcFormat) {
>+ case AV_PIX_FMT_RGB24:
>+ c->lumToYV12 = ff_rgb24ToY_neon;
>+ if (c->chrSrcHSubSample)
>+ c->chrToYV12 = ff_rgb24ToUV_half_neon;
>+ else
>+ c->chrToYV12 = ff_rgb24ToUV_neon;
>+ break;
>+ default:
>+ break;
>+ }
> }
> }
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [FFmpeg-devel] [PATCH 2/5] swscale/aarch64: Add rgb24 to yuv implementation
2024-06-05 6:29 ` Rémi Denis-Courmont
@ 2024-06-05 6:53 ` Zhao Zhili
2024-06-05 7:34 ` Rémi Denis-Courmont
2024-06-05 7:34 ` Martin Storsjö
0 siblings, 2 replies; 10+ messages in thread
From: Zhao Zhili @ 2024-06-05 6:53 UTC (permalink / raw)
To: FFmpeg development discussions and patches
> On Jun 5, 2024, at 14:29, Rémi Denis-Courmont <remi@remlab.net> wrote:
>
>
>
> Le 4 juin 2024 16:55:01 GMT+03:00, Zhao Zhili <quinkblack@foxmail.com <mailto:quinkblack@foxmail.com>> a écrit :
>> From: Zhao Zhili <zhilizhao@tencent.com>
>>
>> Test on Apple M1:
>>
>> rgb24_to_uv_1080_c: 7.2
>> rgb24_to_uv_1080_neon: 5.5
>> rgb24_to_uv_1280_c: 8.2
>> rgb24_to_uv_1280_neon: 6.2
>> rgb24_to_uv_1920_c: 12.5
>> rgb24_to_uv_1920_neon: 9.5
>>
>> rgb24_to_uv_half_540_c: 6.5
>> rgb24_to_uv_half_540_neon: 3.0
>> rgb24_to_uv_half_640_c: 7.5
>> rgb24_to_uv_half_640_neon: 3.2
>> rgb24_to_uv_half_960_c: 12.5
>> rgb24_to_uv_half_960_neon: 6.0
>>
>> rgb24_to_y_1080_c: 4.5
>> rgb24_to_y_1080_neon: 3.5
>> rgb24_to_y_1280_c: 5.2
>> rgb24_to_y_1280_neon: 4.2
>> rgb24_to_y_1920_c: 8.0
>> rgb24_to_y_1920_neon: 6.0
>>
>> Signed-off-by: Zhao Zhili <zhilizhao@tencent.com>
>> ---
>> libswscale/aarch64/Makefile | 1 +
>> libswscale/aarch64/input.S | 229 +++++++++++++++++++++++++++++++++++
>> libswscale/aarch64/swscale.c | 25 ++++
>> 3 files changed, 255 insertions(+)
>> create mode 100644 libswscale/aarch64/input.S
>>
>> diff --git a/libswscale/aarch64/Makefile b/libswscale/aarch64/Makefile
>> index da1d909561..adfd90a1b6 100644
>> --- a/libswscale/aarch64/Makefile
>> +++ b/libswscale/aarch64/Makefile
>> @@ -3,6 +3,7 @@ OBJS += aarch64/rgb2rgb.o \
>> aarch64/swscale_unscaled.o \
>>
>> NEON-OBJS += aarch64/hscale.o \
>> + aarch64/input.o \
>> aarch64/output.o \
>> aarch64/rgb2rgb_neon.o \
>> aarch64/yuv2rgb_neon.o \
>> diff --git a/libswscale/aarch64/input.S b/libswscale/aarch64/input.S
>> new file mode 100644
>> index 0000000000..ee0d223c6e
>> --- /dev/null
>> +++ b/libswscale/aarch64/input.S
>> @@ -0,0 +1,229 @@
>> +/*
>> + * Copyright (c) 2024 Zhao Zhili <quinkblack@foxmail.com>
>> + *
>> + * This file is part of FFmpeg.
>> + *
>> + * FFmpeg is free software; you can redistribute it and/or
>> + * modify it under the terms of the GNU Lesser General Public
>> + * License as published by the Free Software Foundation; either
>> + * version 2.1 of the License, or (at your option) any later version.
>> + *
>> + * FFmpeg is distributed in the hope that it will be useful,
>> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
>> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
>> + * Lesser General Public License for more details.
>> + *
>> + * You should have received a copy of the GNU Lesser General Public
>> + * License along with FFmpeg; if not, write to the Free Software
>> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
>> + */
>> +
>> +#include "libavutil/aarch64/asm.S"
>> +
>> +.macro rgb24_to_yuv_load_rgb, src
>> + ld3 { v16.16b, v17.16b, v18.16b }, [\src]
>> + ushll v19.8h, v16.8b, #0 // v19: r
>> + ushll v20.8h, v17.8b, #0 // v20: g
>> + ushll v21.8h, v18.8b, #0 // v21: b
>> + ushll2 v22.8h, v16.16b, #0 // v22: r
>> + ushll2 v23.8h, v17.16b, #0 // v23: g
>> + ushll2 v24.8h, v18.16b, #0 // v24: b
>> +.endm
>> +
>> +.macro rgb24_to_yuv_product, r, g, b, dst1, dst2, dst, coef0, coef1, coef2, right_shift
>> + mov \dst1\().16b, v6.16b // dst1 = const_offset
>> + mov \dst2\().16b, v6.16b // dst2 = const_offset
>> + smlal \dst1\().4s, \coef0\().4h, \r\().4h // dst1 += rx * r
>> + smlal2 \dst2\().4s, \coef0\().8h, \r\().8h // dst2 += rx * r
>> + smlal \dst1\().4s, \coef1\().4h, \g\().4h // dst1 += gx * g
>> + smlal2 \dst2\().4s, \coef1\().8h, \g\().8h // dst2 += gx * g
>> + smlal \dst1\().4s, \coef2\().4h, \b\().4h // dst1 += bx * b
>> + smlal2 \dst2\().4s, \coef2\().8h, \b\().8h // dst2 += bx * b
>> + sqshrn \dst\().4h, \dst1\().4s, \right_shift // dst_lower_half = dst1 >> right_shift
>> + sqshrn2 \dst\().8h, \dst2\().4s, \right_shift // dst_higher_half = dst2 >> right_shift
>> +.endm
>> +
>> +function ff_rgb24ToY_neon, export=1
>> + cmp w4, #0 // check width > 0
>> + b.le 4f
>> +
>> + ldp w10, w11, [x5], #8 // w10: ry, w11: gy
>
> I don't think it affects anything on your OoO execution hardware, but you're using the result of this load right off the bat in the next instruction. Ditto below. This may hurt perfs on not-so-fancy CPUs.
Will do.
>
>> + dup v0.8h, w10
>> + dup v1.8h, w11
>> + ldr w12, [x5] // w12: by
>> + dup v2.8h, w12
>> +
>> + mov w9, #256 // w9 = 1 << (RGB2YUV_SHIFT - 7)
>> + movk w9, #8, lsl #16 // w9 += 32 << (RGB2YUV_SHIFT - 1)
>> + dup v6.4s, w9 // w9: const_offset
>> +
>> + mov x2, #0 // w2: i
>> + and w3, w4, #0xFFFFFFF0 // w3 = width / 16 * 16
>> + cbz w3, 3f
>> +1:
>> + rgb24_to_yuv_load_rgb x1
>> + rgb24_to_yuv_product v19, v20, v21, v25, v26, v16, v0, v1, v2, #9
>> + rgb24_to_yuv_product v22, v23, v24, v27, v28, v17, v0, v1, v2, #9
>> + stp q16, q17, [x0], #32 // store to dst
>> +
>> + add w2, w2, #16 // i += 16
>> + add x1, x1, #48 // src += 48
>> + cmp w2, w3 // i < (width / 16 * 16)
>> + b.lt 1b
>> + b 3f
>> +2:
>> + ldrb w13, [x1] // w13: r
>> + ldrb w14, [x1, #1] // w14: g
>> + ldrb w15, [x1, #2] // w15: b
>
> You can reorder instructions a little to use post-index and eliminate the ADD, though that won't make much difference.
>
> I don't get why the perf gain is so low, or is this an artefact of Apple CPUs?
I have checked the assembly of the C version. The compiler has done pretty well on loop unrolling and
vectorization in this simple case.
>
>> +
>> + smaddl x13, w13, w10, x9 // x13 = ry * r + const_offset
>> + smaddl x13, w14, w11, x13 // x13 += gy * g
>> + smaddl x13, w15, w12, x13 // x13 += by * b
>> + asr w13, w13, #9 // x13 >>= 9
>> + strh w13, [x0], #2 // store to dst
>> +
>> + add w2, w2, #1 // i++
>> + add x1, x1, #3 // src += 3
>> +3:
>> + cmp w2, w4 // i < width
>> + b.lt 2b
>> +4:
>> + ret
>> +endfunc
>> +
>> +.macro rgb24_load_uv_coeff half
>> + add x6, x6, #12
>> +
>> + ldp w10, w11, [x6], #8 // w10: ru, w11: gu
>> + dup v0.8h, w10
>> + dup v1.8h, w11
>> +
>> + ldp w12, w13, [x6], #8 // w12: bu, w13: rv
>> + dup v2.8h, w12
>> + dup v3.8h, w13
>> +
>> + ldp w14, w15, [x6], #8 // w14: gv, w15: bv
>> + dup v4.8h, w14
>> + dup v5.8h, w15
>> +
>> + .if \half
>> + mov w9, #512
>> + movk w9, #128, lsl #16 // w9: const_offset
>> + .else
>> + mov w9, #256
>> + movk w9, #64, lsl #16 // w9: const_offset
>> + .endif
>> + dup v6.4s, w9
>> +.endm
>> +
>> +function ff_rgb24ToUV_half_neon, export=1
>> + cmp w5, #0 // check width > 0
>> + b.le 4f
>> +
>> + rgb24_load_uv_coeff half=1
>> +
>> + mov x9, #0 // x9: i
>> + and w7, w5, #0xFFFFFFF8 // w7 = width / 8 * 8
>> + cbz w7, 3f
>> +1:
>> + ld3 { v16.16b, v17.16b, v18.16b }, [x3]
>> + uaddlp v19.8h, v16.16b // v19: r
>> + uaddlp v20.8h, v17.16b // v20: g
>> + uaddlp v21.8h, v18.16b // v21: b
>> +
>> + rgb24_to_yuv_product v19, v20, v21, v22, v23, v16, v0, v1, v2, #10
>> + str q16, [x0], #16 // store dst_u
>> + rgb24_to_yuv_product v19, v20, v21, v24, v25, v17, v3, v4, v5, #10
>> + str q17, [x1], #16 // store dst_v
>> +
>> + add w9, w9, #8 // i += 8
>> + add x3, x3, #48 // src += 48
>> + cmp w9, w7 // i < (width * 8 / 8)
>> + b.lt 1b
>> + b 3f
>> +2:
>> + ldrb w2, [x3] // w2: r1
>> + ldrb w4, [x3, #3] // w4: r2
>> + add w2, w2, w4 // w2 = r1 + r2
>> +
>> + ldrb w4, [x3, #1] // w4: g1
>> + ldrb w7, [x3, #4] // w7: g2
>> + add w4, w4, w7 // w4 = g1 + g2
>> +
>> + ldrb w7, [x3, #2] // w7: b1
>> + ldrb w8, [x3, #5] // w8: b2
>> + add w7, w7, w8 // w7 = b1 + b2
>> +
>> + umov w8, v6.s[0] // dst_u = const_offset
>> + smaddl x8, w2, w10, x8 // dst_u += ru * r
>> + smaddl x8, w4, w11, x8 // dst_u += gu * g
>> + smaddl x8, w7, w12, x8 // dst_u += bu * b
>> + asr x8, x8, #10 // dst_u >>= 10
>> + strh w8, [x0], #2 // store dst_u
>> +
>> + umov w8, v6.s[0] // dst_v = const_offset
>> + smaddl x8, w2, w13, x8 // dst_v += rv * r
>> + smaddl x8, w4, w14, x8 // dst_v += gv * g
>> + smaddl x8, w7, w15, x8 // dst_v += bv * b
>> + asr x8, x8, #10 // dst_v >>= 10
>> + strh w8, [x1], #2 // store dst_v
>> +
>> + add w9, w9, #1 // i++
>> + add x3, x3, #6 // src += 6
>> +3:
>> + cmp w9, w5
>> + b.lt 2b
>> +4:
>> + ret
>> +endfunc
>> +
>> +function ff_rgb24ToUV_neon, export=1
>> + cmp w5, #0 // check width > 0
>> + b.le 4f
>> +
>> + rgb24_load_uv_coeff half=0
>> +
>> + mov x2, #0 // w2: i
>> + and w4, w5, #0xFFFFFFF0 // w4: width / 16 * 16
>> + cbz w4, 3f
>> +1:
>> + rgb24_to_yuv_load_rgb x3
>> + rgb24_to_yuv_product v19, v20, v21, v25, v26, v16, v0, v1, v2, #9
>> + rgb24_to_yuv_product v22, v23, v24, v27, v28, v17, v0, v1, v2, #9
>> + stp q16, q17, [x0], #32 // store to dst_u
>> + rgb24_to_yuv_product v19, v20, v21, v25, v26, v16, v3, v4, v5, #9
>> + rgb24_to_yuv_product v22, v23, v24, v27, v28, v17, v3, v4, v5, #9
>> + stp q16, q17, [x1], #32 // store to dst_v
>> +
>> + add w2, w2, #16 // i += 16
>> + add x3, x3, #48 // src += 48
>> + cmp w2, w4 // i < (width / 16 * 16)
>> + b.lt 1b
>> + b 3f
>> +2:
>> + ldrb w16, [x3] // w16: r
>> + ldrb w17, [x3, #1] // w17: g
>> + ldrb w4, [x3, #2] // w4: b
>> +
>> + umov w7, v6.s[0] // w7 = const_offset
>> +
>> + smaddl x8, w16, w10, x7 // x8 = ru * r + const_offset
>> + smaddl x8, w17, w11, x8 // x8 += gu * g
>> + smaddl x8, w4, w12, x8 // x8 += bu * b
>> + asr w8, w8, #9 // x8 >>= 9
>> + strh w8, [x0], #2 // store to dst_u
>> +
>> + smaddl x8, w16, w13, x7 // x8 = rv * r + const_offset
>> + smaddl x8, w17, w14, x8 // x8 += gv * g
>> + smaddl x8, w4, w15, x8 // x8 += bv * b
>> + asr w8, w8, #9 // x8 >>= 9
>> + strh w8, [x1], #2 // store to dst_v
>> +
>> + add w2, w2, #1 // i++
>> + add x3, x3, #3 // src += 3
>> +3:
>> + cmp w2, w5 // i < width
>> + b.lt 2b
>> +4:
>> + ret
>> +endfunc
>> diff --git a/libswscale/aarch64/swscale.c b/libswscale/aarch64/swscale.c
>> index bbd9719a44..4c4ea39dc1 100644
>> --- a/libswscale/aarch64/swscale.c
>> +++ b/libswscale/aarch64/swscale.c
>> @@ -201,6 +201,20 @@ void ff_yuv2plane1_8_neon(
>> default: break; \
>> }
>>
>> +void ff_rgb24ToY_neon(uint8_t *_dst, const uint8_t *src, const uint8_t *unused1,
>> + const uint8_t *unused2, int width,
>> + uint32_t *rgb2yuv, void *opq);
>> +
>> +void ff_rgb24ToUV_neon(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *unused0,
>> + const uint8_t *src1,
>> + const uint8_t *src2, int width, uint32_t *rgb2yuv,
>> + void *opq);
>> +
>> +void ff_rgb24ToUV_half_neon(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *unused0,
>> + const uint8_t *src1,
>> + const uint8_t *src2, int width, uint32_t *rgb2yuv,
>> + void *opq);
>> +
>> av_cold void ff_sws_init_swscale_aarch64(SwsContext *c)
>> {
>> int cpu_flags = av_get_cpu_flags();
>> @@ -212,5 +226,16 @@ av_cold void ff_sws_init_swscale_aarch64(SwsContext *c)
>> if (c->dstBpc == 8) {
>> c->yuv2planeX = ff_yuv2planeX_8_neon;
>> }
>> + switch (c->srcFormat) {
>> + case AV_PIX_FMT_RGB24:
>> + c->lumToYV12 = ff_rgb24ToY_neon;
>> + if (c->chrSrcHSubSample)
>> + c->chrToYV12 = ff_rgb24ToUV_half_neon;
>> + else
>> + c->chrToYV12 = ff_rgb24ToUV_neon;
>> + break;
>> + default:
>> + break;
>> + }
>> }
>> }
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org <mailto:ffmpeg-devel@ffmpeg.org>
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org <mailto:ffmpeg-devel-request@ffmpeg.org> with subject "unsubscribe".
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [FFmpeg-devel] [PATCH 2/5] swscale/aarch64: Add rgb24 to yuv implementation
2024-06-05 6:53 ` Zhao Zhili
@ 2024-06-05 7:34 ` Rémi Denis-Courmont
2024-06-05 7:34 ` Martin Storsjö
1 sibling, 0 replies; 10+ messages in thread
From: Rémi Denis-Courmont @ 2024-06-05 7:34 UTC (permalink / raw)
To: FFmpeg development discussions and patches
Le 5 juin 2024 09:53:45 GMT+03:00, Zhao Zhili <quinkblack@foxmail.com> a écrit :
>
>
>> On Jun 5, 2024, at 14:29, Rémi Denis-Courmont <remi@remlab.net> wrote:
>>
>>
>>
>> Le 4 juin 2024 16:55:01 GMT+03:00, Zhao Zhili <quinkblack@foxmail.com <mailto:quinkblack@foxmail.com>> a écrit :
>>> From: Zhao Zhili <zhilizhao@tencent.com>
>>>
>>> Test on Apple M1:
>>>
>>> rgb24_to_uv_1080_c: 7.2
>>> rgb24_to_uv_1080_neon: 5.5
>>> rgb24_to_uv_1280_c: 8.2
>>> rgb24_to_uv_1280_neon: 6.2
>>> rgb24_to_uv_1920_c: 12.5
>>> rgb24_to_uv_1920_neon: 9.5
>>>
>>> rgb24_to_uv_half_540_c: 6.5
>>> rgb24_to_uv_half_540_neon: 3.0
>>> rgb24_to_uv_half_640_c: 7.5
>>> rgb24_to_uv_half_640_neon: 3.2
>>> rgb24_to_uv_half_960_c: 12.5
>>> rgb24_to_uv_half_960_neon: 6.0
>>>
>>> rgb24_to_y_1080_c: 4.5
>>> rgb24_to_y_1080_neon: 3.5
>>> rgb24_to_y_1280_c: 5.2
>>> rgb24_to_y_1280_neon: 4.2
>>> rgb24_to_y_1920_c: 8.0
>>> rgb24_to_y_1920_neon: 6.0
>>>
>>> Signed-off-by: Zhao Zhili <zhilizhao@tencent.com>
>>> ---
>>> libswscale/aarch64/Makefile | 1 +
>>> libswscale/aarch64/input.S | 229 +++++++++++++++++++++++++++++++++++
>>> libswscale/aarch64/swscale.c | 25 ++++
>>> 3 files changed, 255 insertions(+)
>>> create mode 100644 libswscale/aarch64/input.S
>>>
>>> diff --git a/libswscale/aarch64/Makefile b/libswscale/aarch64/Makefile
>>> index da1d909561..adfd90a1b6 100644
>>> --- a/libswscale/aarch64/Makefile
>>> +++ b/libswscale/aarch64/Makefile
>>> @@ -3,6 +3,7 @@ OBJS += aarch64/rgb2rgb.o \
>>> aarch64/swscale_unscaled.o \
>>>
>>> NEON-OBJS += aarch64/hscale.o \
>>> + aarch64/input.o \
>>> aarch64/output.o \
>>> aarch64/rgb2rgb_neon.o \
>>> aarch64/yuv2rgb_neon.o \
>>> diff --git a/libswscale/aarch64/input.S b/libswscale/aarch64/input.S
>>> new file mode 100644
>>> index 0000000000..ee0d223c6e
>>> --- /dev/null
>>> +++ b/libswscale/aarch64/input.S
>>> @@ -0,0 +1,229 @@
>>> +/*
>>> + * Copyright (c) 2024 Zhao Zhili <quinkblack@foxmail.com>
>>> + *
>>> + * This file is part of FFmpeg.
>>> + *
>>> + * FFmpeg is free software; you can redistribute it and/or
>>> + * modify it under the terms of the GNU Lesser General Public
>>> + * License as published by the Free Software Foundation; either
>>> + * version 2.1 of the License, or (at your option) any later version.
>>> + *
>>> + * FFmpeg is distributed in the hope that it will be useful,
>>> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
>>> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
>>> + * Lesser General Public License for more details.
>>> + *
>>> + * You should have received a copy of the GNU Lesser General Public
>>> + * License along with FFmpeg; if not, write to the Free Software
>>> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
>>> + */
>>> +
>>> +#include "libavutil/aarch64/asm.S"
>>> +
>>> +.macro rgb24_to_yuv_load_rgb, src
>>> + ld3 { v16.16b, v17.16b, v18.16b }, [\src]
>>> + ushll v19.8h, v16.8b, #0 // v19: r
>>> + ushll v20.8h, v17.8b, #0 // v20: g
>>> + ushll v21.8h, v18.8b, #0 // v21: b
>>> + ushll2 v22.8h, v16.16b, #0 // v22: r
>>> + ushll2 v23.8h, v17.16b, #0 // v23: g
>>> + ushll2 v24.8h, v18.16b, #0 // v24: b
>>> +.endm
>>> +
>>> +.macro rgb24_to_yuv_product, r, g, b, dst1, dst2, dst, coef0, coef1, coef2, right_shift
>>> + mov \dst1\().16b, v6.16b // dst1 = const_offset
>>> + mov \dst2\().16b, v6.16b // dst2 = const_offset
>>> + smlal \dst1\().4s, \coef0\().4h, \r\().4h // dst1 += rx * r
>>> + smlal2 \dst2\().4s, \coef0\().8h, \r\().8h // dst2 += rx * r
>>> + smlal \dst1\().4s, \coef1\().4h, \g\().4h // dst1 += gx * g
>>> + smlal2 \dst2\().4s, \coef1\().8h, \g\().8h // dst2 += gx * g
>>> + smlal \dst1\().4s, \coef2\().4h, \b\().4h // dst1 += bx * b
>>> + smlal2 \dst2\().4s, \coef2\().8h, \b\().8h // dst2 += bx * b
>>> + sqshrn \dst\().4h, \dst1\().4s, \right_shift // dst_lower_half = dst1 >> right_shift
>>> + sqshrn2 \dst\().8h, \dst2\().4s, \right_shift // dst_higher_half = dst2 >> right_shift
>>> +.endm
>>> +
>>> +function ff_rgb24ToY_neon, export=1
>>> + cmp w4, #0 // check width > 0
>>> + b.le 4f
>>> +
>>> + ldp w10, w11, [x5], #8 // w10: ry, w11: gy
>>
>> I don't think it affects anything on your OoO execution hardware, but you're using the result of this load right off the bat in the next instruction. Ditto below. This may hurt perfs on not-so-fancy CPUs.
>
>Will do.
>
>>
>>> + dup v0.8h, w10
>>> + dup v1.8h, w11
>>> + ldr w12, [x5] // w12: by
>>> + dup v2.8h, w12
>>> +
>>> + mov w9, #256 // w9 = 1 << (RGB2YUV_SHIFT - 7)
>>> + movk w9, #8, lsl #16 // w9 += 32 << (RGB2YUV_SHIFT - 1)
>>> + dup v6.4s, w9 // w9: const_offset
>>> +
>>> + mov x2, #0 // w2: i
>>> + and w3, w4, #0xFFFFFFF0 // w3 = width / 16 * 16
>>> + cbz w3, 3f
>>> +1:
>>> + rgb24_to_yuv_load_rgb x1
>>> + rgb24_to_yuv_product v19, v20, v21, v25, v26, v16, v0, v1, v2, #9
>>> + rgb24_to_yuv_product v22, v23, v24, v27, v28, v17, v0, v1, v2, #9
>>> + stp q16, q17, [x0], #32 // store to dst
>>> +
>>> + add w2, w2, #16 // i += 16
>>> + add x1, x1, #48 // src += 48
>>> + cmp w2, w3 // i < (width / 16 * 16)
>>> + b.lt 1b
>>> + b 3f
>>> +2:
>>> + ldrb w13, [x1] // w13: r
>>> + ldrb w14, [x1, #1] // w14: g
>>> + ldrb w15, [x1, #2] // w15: b
>>
>> You can reorder instructions a little to use post-index and eliminate the ADD, though that won't make much difference.
>>
>> I don't get why the perf gain is so low, or is this an artefact of Apple CPUs?
>
>I have checked the assembly of C version. The compiler has done pretty well on loop unroll and
>vectorize on this simple case.
Uh, don't we disable auto-vectorisation in the configure script? Until/unless it is re-enabled, I think benchmarks should be done against non-auto-vectorised code, if only to stay representative of normal/default FFmpeg builds.
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [FFmpeg-devel] [PATCH 2/5] swscale/aarch64: Add rgb24 to yuv implementation
2024-06-05 6:53 ` Zhao Zhili
2024-06-05 7:34 ` Rémi Denis-Courmont
@ 2024-06-05 7:34 ` Martin Storsjö
1 sibling, 0 replies; 10+ messages in thread
From: Martin Storsjö @ 2024-06-05 7:34 UTC (permalink / raw)
To: FFmpeg development discussions and patches
On Wed, 5 Jun 2024, Zhao Zhili wrote:
>> On Jun 5, 2024, at 14:29, Rémi Denis-Courmont <remi@remlab.net> wrote:
>>
>> Le 4 juin 2024 16:55:01 GMT+03:00, Zhao Zhili <quinkblack@foxmail.com <mailto:quinkblack@foxmail.com>> a écrit :
>>> From: Zhao Zhili <zhilizhao@tencent.com>
>>>
>>> Test on Apple M1:
>>>
>>> rgb24_to_uv_1080_c: 7.2
>>> rgb24_to_uv_1080_neon: 5.5
>>> rgb24_to_uv_1280_c: 8.2
>>> rgb24_to_uv_1280_neon: 6.2
>>> rgb24_to_uv_1920_c: 12.5
>>> rgb24_to_uv_1920_neon: 9.5
>>>
>>> rgb24_to_uv_half_540_c: 6.5
>>> rgb24_to_uv_half_540_neon: 3.0
>>> rgb24_to_uv_half_640_c: 7.5
>>> rgb24_to_uv_half_640_neon: 3.2
>>> rgb24_to_uv_half_960_c: 12.5
>>> rgb24_to_uv_half_960_neon: 6.0
>>>
>>> rgb24_to_y_1080_c: 4.5
>>> rgb24_to_y_1080_neon: 3.5
>>> rgb24_to_y_1280_c: 5.2
>>> rgb24_to_y_1280_neon: 4.2
>>> rgb24_to_y_1920_c: 8.0
>>> rgb24_to_y_1920_neon: 6.0
>>>
>>> Signed-off-by: Zhao Zhili <zhilizhao@tencent.com>
>>> ---
>>> libswscale/aarch64/Makefile | 1 +
>>> libswscale/aarch64/input.S | 229 +++++++++++++++++++++++++++++++++++
>>> libswscale/aarch64/swscale.c | 25 ++++
>>> 3 files changed, 255 insertions(+)
>>> create mode 100644 libswscale/aarch64/input.S
>>>
>>> diff --git a/libswscale/aarch64/Makefile b/libswscale/aarch64/Makefile
>>> index da1d909561..adfd90a1b6 100644
>>> --- a/libswscale/aarch64/Makefile
>>> +++ b/libswscale/aarch64/Makefile
>>> @@ -3,6 +3,7 @@ OBJS += aarch64/rgb2rgb.o \
>>> aarch64/swscale_unscaled.o \
>>>
>>> NEON-OBJS += aarch64/hscale.o \
>>> + aarch64/input.o \
>>> aarch64/output.o \
>>> aarch64/rgb2rgb_neon.o \
>>> aarch64/yuv2rgb_neon.o \
>>> diff --git a/libswscale/aarch64/input.S b/libswscale/aarch64/input.S
>>> new file mode 100644
>>> index 0000000000..ee0d223c6e
>>> --- /dev/null
>>> +++ b/libswscale/aarch64/input.S
>>> @@ -0,0 +1,229 @@
>>> +/*
>>> + * Copyright (c) 2024 Zhao Zhili <quinkblack@foxmail.com>
>>> + *
>>> + * This file is part of FFmpeg.
>>> + *
>>> + * FFmpeg is free software; you can redistribute it and/or
>>> + * modify it under the terms of the GNU Lesser General Public
>>> + * License as published by the Free Software Foundation; either
>>> + * version 2.1 of the License, or (at your option) any later version.
>>> + *
>>> + * FFmpeg is distributed in the hope that it will be useful,
>>> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
>>> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
>>> + * Lesser General Public License for more details.
>>> + *
>>> + * You should have received a copy of the GNU Lesser General Public
>>> + * License along with FFmpeg; if not, write to the Free Software
>>> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
>>> + */
>>> +
>>> +#include "libavutil/aarch64/asm.S"
>>> +
>>> +.macro rgb24_to_yuv_load_rgb, src
>>> + ld3 { v16.16b, v17.16b, v18.16b }, [\src]
>>> + ushll v19.8h, v16.8b, #0 // v19: r
>>> + ushll v20.8h, v17.8b, #0 // v20: g
>>> + ushll v21.8h, v18.8b, #0 // v21: b
>>> + ushll2 v22.8h, v16.16b, #0 // v22: r
>>> + ushll2 v23.8h, v17.16b, #0 // v23: g
>>> + ushll2 v24.8h, v18.16b, #0 // v24: b
>>> +.endm
>>> +
>>> +.macro rgb24_to_yuv_product, r, g, b, dst1, dst2, dst, coef0, coef1, coef2, right_shift
>>> + mov \dst1\().16b, v6.16b // dst1 = const_offset
>>> + mov \dst2\().16b, v6.16b // dst2 = const_offset
>>> + smlal \dst1\().4s, \coef0\().4h, \r\().4h // dst1 += rx * r
>>> + smlal2 \dst2\().4s, \coef0\().8h, \r\().8h // dst2 += rx * r
>>> + smlal \dst1\().4s, \coef1\().4h, \g\().4h // dst1 += gx * g
>>> + smlal2 \dst2\().4s, \coef1\().8h, \g\().8h // dst2 += gx * g
>>> + smlal \dst1\().4s, \coef2\().4h, \b\().4h // dst1 += bx * b
>>> + smlal2 \dst2\().4s, \coef2\().8h, \b\().8h // dst2 += bx * b
>>> + sqshrn \dst\().4h, \dst1\().4s, \right_shift // dst_lower_half = dst1 >> right_shift
>>> + sqshrn2 \dst\().8h, \dst2\().4s, \right_shift // dst_higher_half = dst2 >> right_shift
>>> +.endm
>>> +
>>> +function ff_rgb24ToY_neon, export=1
>>> + cmp w4, #0 // check width > 0
>>> + b.le 4f
>>> +
>>> + ldp w10, w11, [x5], #8 // w10: ry, w11: gy
>>
>> I don't think it affects anything on your OoO execution hardware, but you're using the result of this load right off the bat in the next instruction. Ditto below. This may hurt perfs on not-so-fancy CPUs.
>
> Will do.
>
>>
>>> + dup v0.8h, w10
>>> + dup v1.8h, w11
>>> + ldr w12, [x5] // w12: by
>>> + dup v2.8h, w12
>>> +
>>> + mov w9, #256 // w9 = 1 << (RGB2YUV_SHIFT - 7)
>>> + movk w9, #8, lsl #16 // w9 += 32 << (RGB2YUV_SHIFT - 1)
>>> + dup v6.4s, w9 // w9: const_offset
>>> +
>>> + mov x2, #0 // w2: i
>>> + and w3, w4, #0xFFFFFFF0 // w3 = width / 16 * 16
>>> + cbz w3, 3f
>>> +1:
>>> + rgb24_to_yuv_load_rgb x1
>>> + rgb24_to_yuv_product v19, v20, v21, v25, v26, v16, v0, v1, v2, #9
>>> + rgb24_to_yuv_product v22, v23, v24, v27, v28, v17, v0, v1, v2, #9
>>> + stp q16, q17, [x0], #32 // store to dst
>>> +
>>> + add w2, w2, #16 // i += 16
>>> + add x1, x1, #48 // src += 48
>>> + cmp w2, w3 // i < (width / 16 * 16)
>>> + b.lt 1b
>>> + b 3f
>>> +2:
>>> + ldrb w13, [x1] // w13: r
>>> + ldrb w14, [x1, #1] // w14: g
>>> + ldrb w15, [x1, #2] // w15: b
>>
>> You can reorder instructions a little to use post-index and eliminate the ADD, though that won't make much difference.
>>
>> I don't get why the perf gain is so low, or is this an artefact of Apple CPUs?
>
> I have checked the assembly of C version. The compiler has done pretty well on loop unroll and
> vectorize on this simple case.
To add some context here: FFmpeg's configure disables autovectorization
with GCC (as it does miscompile things semi-regularly), but not with
Clang. This can give somewhat misleading numbers with regard to the relative speedup.
Then additionally, the Apple CPUs do have slightly different performance
characteristics than other cores too, indeed. Plus the very coarse timer
used on macOS doesn't help either...
FWIW, here are some numbers for this patch from some more traditional
CPUs, with a GCC build:
Cortex A53 A72 A78
rgb24_to_uv_1080_c: 19471.5 8720.7 7049.7
rgb24_to_uv_1080_neon: 5922.7 3147.5 2274.5
rgb24_to_uv_1280_c: 23067.0 10318.2 8348.5
rgb24_to_uv_1280_neon: 6842.5 3672.5 2656.5
rgb24_to_uv_1920_c: 34595.2 15483.2 12509.7
rgb24_to_uv_1920_neon: 10246.0 5496.7 3976.5
rgb24_to_uv_half_540_c: 11396.0 5481.0 4576.0
rgb24_to_uv_half_540_neon: 3655.7 1687.5 1382.5
rgb24_to_uv_half_640_c: 13546.0 6480.2 5399.0
rgb24_to_uv_half_640_neon: 4202.7 1958.2 1611.2
rgb24_to_uv_half_960_c: 20311.0 9724.2 8068.2
rgb24_to_uv_half_960_neon: 6282.7 2934.2 2372.2
rgb24_to_y_1080_c: 12984.2 4339.7 4074.2
rgb24_to_y_1080_neon: 3492.5 1960.5 1444.7
rgb24_to_y_1280_c: 15384.2 6709.2 4823.5
rgb24_to_y_1280_neon: 4038.2 2265.0 1674.0
rgb24_to_y_1920_c: 23069.7 7708.7 7224.7
rgb24_to_y_1920_neon: 6036.2 3389.0 2514.0
// Martin
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [FFmpeg-devel] [PATCH 2/5] swscale/aarch64: Add rgb24 to yuv implementation
2024-06-04 13:55 ` [FFmpeg-devel] [PATCH 2/5] swscale/aarch64: Add rgb24 to yuv implementation Zhao Zhili
2024-06-05 6:29 ` Rémi Denis-Courmont
@ 2024-06-05 8:16 ` Martin Storsjö
1 sibling, 0 replies; 10+ messages in thread
From: Martin Storsjö @ 2024-06-05 8:16 UTC (permalink / raw)
To: FFmpeg development discussions and patches; +Cc: Zhao Zhili
On Tue, 4 Jun 2024, Zhao Zhili wrote:
> From: Zhao Zhili <zhilizhao@tencent.com>
>
> Test on Apple M1:
>
> rgb24_to_uv_1080_c: 7.2
> rgb24_to_uv_1080_neon: 5.5
> rgb24_to_uv_1280_c: 8.2
> rgb24_to_uv_1280_neon: 6.2
> rgb24_to_uv_1920_c: 12.5
> rgb24_to_uv_1920_neon: 9.5
>
> rgb24_to_uv_half_540_c: 6.5
> rgb24_to_uv_half_540_neon: 3.0
> rgb24_to_uv_half_640_c: 7.5
> rgb24_to_uv_half_640_neon: 3.2
> rgb24_to_uv_half_960_c: 12.5
> rgb24_to_uv_half_960_neon: 6.0
>
> rgb24_to_y_1080_c: 4.5
> rgb24_to_y_1080_neon: 3.5
> rgb24_to_y_1280_c: 5.2
> rgb24_to_y_1280_neon: 4.2
> rgb24_to_y_1920_c: 8.0
> rgb24_to_y_1920_neon: 6.0
>
> Signed-off-by: Zhao Zhili <zhilizhao@tencent.com>
> ---
> libswscale/aarch64/Makefile | 1 +
> libswscale/aarch64/input.S | 229 +++++++++++++++++++++++++++++++++++
> libswscale/aarch64/swscale.c | 25 ++++
> 3 files changed, 255 insertions(+)
> create mode 100644 libswscale/aarch64/input.S
>
> diff --git a/libswscale/aarch64/Makefile b/libswscale/aarch64/Makefile
> index da1d909561..adfd90a1b6 100644
> --- a/libswscale/aarch64/Makefile
> +++ b/libswscale/aarch64/Makefile
> @@ -3,6 +3,7 @@ OBJS += aarch64/rgb2rgb.o \
> aarch64/swscale_unscaled.o \
>
> NEON-OBJS += aarch64/hscale.o \
> + aarch64/input.o \
> aarch64/output.o \
> aarch64/rgb2rgb_neon.o \
> aarch64/yuv2rgb_neon.o \
> diff --git a/libswscale/aarch64/input.S b/libswscale/aarch64/input.S
> new file mode 100644
> index 0000000000..ee0d223c6e
> --- /dev/null
> +++ b/libswscale/aarch64/input.S
> @@ -0,0 +1,229 @@
> +/*
> + * Copyright (c) 2024 Zhao Zhili <quinkblack@foxmail.com>
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#include "libavutil/aarch64/asm.S"
> +
> +.macro rgb24_to_yuv_load_rgb, src
> + ld3 { v16.16b, v17.16b, v18.16b }, [\src]
> + ushll v19.8h, v16.8b, #0 // v19: r
> + ushll v20.8h, v17.8b, #0 // v20: g
> + ushll v21.8h, v18.8b, #0 // v21: b
> + ushll2 v22.8h, v16.16b, #0 // v22: r
> + ushll2 v23.8h, v17.16b, #0 // v23: g
> + ushll2 v24.8h, v18.16b, #0 // v24: b
Doing "ushll #0" is perhaps a bit unusual; the common thing would be
"uxtl" instead. It doesn't matter in practice, though, since it assembles to the
same instruction anyway.
> +.endm
> +
> +.macro rgb24_to_yuv_product, r, g, b, dst1, dst2, dst, coef0, coef1, coef2, right_shift
> + mov \dst1\().16b, v6.16b // dst1 = const_offset
> + mov \dst2\().16b, v6.16b // dst2 = const_offset
> + smlal \dst1\().4s, \coef0\().4h, \r\().4h // dst1 += rx * r
> + smlal2 \dst2\().4s, \coef0\().8h, \r\().8h // dst2 += rx * r
> + smlal \dst1\().4s, \coef1\().4h, \g\().4h // dst1 += gx * g
> + smlal2 \dst2\().4s, \coef1\().8h, \g\().8h // dst2 += gx * g
> + smlal \dst1\().4s, \coef2\().4h, \b\().4h // dst1 += bx * b
> + smlal2 \dst2\().4s, \coef2\().8h, \b\().8h // dst2 += bx * b
For sequences like this, the Cortex A53 (and iirc at least the A55 too)
has got a fastpath; if you do multiple consequent smlal/smlsl (or regular
mla/mls) into the same register, you actually save a lot of time. E.g.
instead of this:
smlal dst1
smlal dst2
smlal dst1
smlal dst2
smlal dst1
smlal dst2
Do this:
smlal dst1
smlal dst1
smlal dst1
smlal dst2
smlal dst2
smlal dst2
For in-order cores (with this special fastpath - it is indeed a bit
non-obvious) this makes a huge difference, and for out of order cores,
they can reorder it as they prefer anyway (as this is not a very long
instruction sequence).
This makes a massive difference for the in-order cores.
Before: Cortex A53 A72 A73
rgb24_to_y_1920_neon: 6032.7 3385.7 2514.0
After:
rgb24_to_y_1920_neon: 5072.7 3388.2 2522.0
A 19% speedup on A53 with just this one change, and it makes almost
no difference for the other cores (mostly within measurement noise).
> +function ff_rgb24ToY_neon, export=1
> + cmp w4, #0 // check width > 0
> + b.le 4f
> +
> + ldp w10, w11, [x5], #8 // w10: ry, w11: gy
> + dup v0.8h, w10
> + dup v1.8h, w11
> + ldr w12, [x5] // w12: by
> + dup v2.8h, w12
> +
> + mov w9, #256 // w9 = 1 << (RGB2YUV_SHIFT - 7)
> + movk w9, #8, lsl #16 // w9 += 32 << (RGB2YUV_SHIFT - 1)
> + dup v6.4s, w9 // w9: const_offset
> +
> + mov x2, #0 // w2: i
> + and w3, w4, #0xFFFFFFF0 // w3 = width / 16 * 16
> + cbz w3, 3f
This blindly assumes that if width > 0, then width is also >= 16 at least,
as we always run the SIMD codepath once here. Is this a requirement in
swscale, or should we check whether width >= 16 here too?
> +1:
> + rgb24_to_yuv_load_rgb x1
> + rgb24_to_yuv_product v19, v20, v21, v25, v26, v16, v0, v1, v2, #9
> + rgb24_to_yuv_product v22, v23, v24, v27, v28, v17, v0, v1, v2, #9
> + stp q16, q17, [x0], #32 // store to dst
> +
> + add w2, w2, #16 // i += 16
> + add x1, x1, #48 // src += 48
> + cmp w2, w3 // i < (width / 16 * 16)
> + b.lt 1b
For the in-order cores, we can help with scheduling here too. You can do
the add/add/cmp before the stp. This adds a bit more distance between the
calculation of q17 and the store of it, and adds more distance between the
cmp and the b.lt that depends on it.
Secondly, it's a bit uncommon in SIMD like this, to count upwards
(incrementing i); instead, the common pattern is to keep the width in one
register and count it down.
Here, you can then do "cmp w2, #16", so you don't need to keep a register
with the aligned end value. And for the scalar codepath, when approaching
zero, you can do "subs w2, w2, #1", "b.gt 2b", so you don't need two
instructions for add+cmp.
> + b 3f
> +2:
> + ldrb w13, [x1] // w13: r
> + ldrb w14, [x1, #1] // w14: g
> + ldrb w15, [x1, #2] // w15: b
> +
> + smaddl x13, w13, w10, x9 // x13 = ry * r + const_offset
> + smaddl x13, w14, w11, x13 // x13 += gy * g
> + smaddl x13, w15, w12, x13 // x13 += by * b
> + asr w13, w13, #9 // x13 >>= 9
> + strh w13, [x0], #2 // store to dst
> +
> + add w2, w2, #1 // i++
> + add x1, x1, #3 // src += 3
> +3:
> + cmp w2, w4 // i < width
> + b.lt 2b
> +4:
> + ret
> +endfunc
> +
> +.macro rgb24_load_uv_coeff half
> + add x6, x6, #12
> +
> + ldp w10, w11, [x6], #8 // w10: ru, w11: gu
> + dup v0.8h, w10
> + dup v1.8h, w11
> +
> + ldp w12, w13, [x6], #8 // w12: bu, w13: rv
> + dup v2.8h, w12
> + dup v3.8h, w13
> +
> + ldp w14, w15, [x6], #8 // w14: gv, w15: bv
> + dup v4.8h, w14
> + dup v5.8h, w15
As Remi mentioned, scheduling instructions like these can help a bit. But
here, each "ldp" depends on the updated x6 from the previous one, so it
maybe doesn't make much difference.
> +
> + .if \half
> + mov w9, #512
> + movk w9, #128, lsl #16 // w9: const_offset
> + .else
> + mov w9, #256
> + movk w9, #64, lsl #16 // w9: const_offset
> + .endif
> + dup v6.4s, w9
> +.endm
> +
> +function ff_rgb24ToUV_half_neon, export=1
> + cmp w5, #0 // check width > 0
> + b.le 4f
> +
> + rgb24_load_uv_coeff half=1
> +
> + mov x9, #0 // x9: i
> + and w7, w5, #0xFFFFFFF8 // w7 = width / 8 * 8
> + cbz w7, 3f
> +1:
> + ld3 { v16.16b, v17.16b, v18.16b }, [x3]
> + uaddlp v19.8h, v16.16b // v19: r
> + uaddlp v20.8h, v17.16b // v20: g
> + uaddlp v21.8h, v18.16b // v21: b
> +
> + rgb24_to_yuv_product v19, v20, v21, v22, v23, v16, v0, v1, v2, #10
> + str q16, [x0], #16 // store dst_u
> + rgb24_to_yuv_product v19, v20, v21, v24, v25, v17, v3, v4, v5, #10
> + str q17, [x1], #16 // store dst_v
> +
> + add w9, w9, #8 // i += 8
> + add x3, x3, #48 // src += 48
> + cmp w9, w7 // i < (width * 8 / 8)
> + b.lt 1b
Here, you can also move both str instructions out to after the add/cmp, like above.
> + b 3f
> +2:
> + ldrb w2, [x3] // w2: r1
> + ldrb w4, [x3, #3] // w4: r2
> + add w2, w2, w4 // w2 = r1 + r2
> +
> + ldrb w4, [x3, #1] // w4: g1
> + ldrb w7, [x3, #4] // w7: g2
> + add w4, w4, w7 // w4 = g1 + g2
> +
> + ldrb w7, [x3, #2] // w7: b1
> + ldrb w8, [x3, #5] // w8: b2
> + add w7, w7, w8 // w7 = b1 + b2
> +
> + umov w8, v6.s[0] // dst_u = const_offset
> + smaddl x8, w2, w10, x8 // dst_u += ru * r
> + smaddl x8, w4, w11, x8 // dst_u += gu * g
> + smaddl x8, w7, w12, x8 // dst_u += bu * b
> + asr x8, x8, #10 // dst_u >>= 10
> + strh w8, [x0], #2 // store dst_u
> +
> + umov w8, v6.s[0] // dst_v = const_offset
> + smaddl x8, w2, w13, x8 // dst_v += rv * r
> + smaddl x8, w4, w14, x8 // dst_v += gv * g
> + smaddl x8, w7, w15, x8 // dst_v += bv * b
> + asr x8, x8, #10 // dst_v >>= 10
> + strh w8, [x1], #2 // store dst_v
> +
> + add w9, w9, #1 // i++
> + add x3, x3, #6 // src += 6
> +3:
> + cmp w9, w5
> + b.lt 2b
> +4:
> + ret
> +endfunc
> +
> +function ff_rgb24ToUV_neon, export=1
> + cmp w5, #0 // check width > 0
> + b.le 4f
> +
> + rgb24_load_uv_coeff half=0
> +
> + mov x2, #0 // w2: i
> + and w4, w5, #0xFFFFFFF0 // w4: width / 16 * 16
> + cbz w4, 3f
> +1:
> + rgb24_to_yuv_load_rgb x3
> + rgb24_to_yuv_product v19, v20, v21, v25, v26, v16, v0, v1, v2, #9
> + rgb24_to_yuv_product v22, v23, v24, v27, v28, v17, v0, v1, v2, #9
> + stp q16, q17, [x0], #32 // store to dst_u
> + rgb24_to_yuv_product v19, v20, v21, v25, v26, v16, v3, v4, v5, #9
> + rgb24_to_yuv_product v22, v23, v24, v27, v28, v17, v3, v4, v5, #9
> + stp q16, q17, [x1], #32 // store to dst_v
If you'd make the second pair of rgb24_to_yuv_product write into v18/v19
instead of v16/v17, you can move them out to after the add/add/cmp. (It
doesn't make much of a measurable difference, but is good scheduling in
general.)
> +
> + add w2, w2, #16 // i += 16
> + add x3, x3, #48 // src += 48
> + cmp w2, w4 // i < (width / 16 * 16)
> + b.lt 1b
> + b 3f
> +2:
> + ldrb w16, [x3] // w16: r
> + ldrb w17, [x3, #1] // w17: g
> + ldrb w4, [x3, #2] // w4: b
> +
> + umov w7, v6.s[0] // w7 = const_offset
SIMD->GPR moves are generally expensive, we shouldn't be doing this
within the loop, if the value is constant. Instead, you should do this in
a prologue to the scalar codepath.
But in this case, the value should already be present in w9, if I
understand the code correctly, so we don't need to fetch it from v6 - and
this is already what you do in ff_rgb24ToY_neon.
// Martin
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 10+ messages in thread
end of thread, other threads:[~2024-06-05 8:16 UTC | newest]
Thread overview: 10+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
[not found] <20240604135504.83169-1-quinkblack@foxmail.com>
2024-06-04 13:55 ` [FFmpeg-devel] [PATCH 2/5] swscale/aarch64: Add rgb24 to yuv implementation Zhao Zhili
2024-06-05 6:29 ` Rémi Denis-Courmont
2024-06-05 6:53 ` Zhao Zhili
2024-06-05 7:34 ` Rémi Denis-Courmont
2024-06-05 7:34 ` Martin Storsjö
2024-06-05 8:16 ` Martin Storsjö
2024-06-04 13:55 ` [FFmpeg-devel] [PATCH 3/5] avutil/aarch64: Skip define AV_READ_TIME for apple Zhao Zhili
2024-06-04 13:55 ` [FFmpeg-devel] [PATCH 4/5] avutil/timer: Add clock_gettime as a fallback of AV_READ_TIME Zhao Zhili
2024-06-04 13:55 ` [FFmpeg-devel] [PATCH 5/5] avutil/aarch64: Fallback to clock_gettime as timer on Android Zhao Zhili
2024-06-04 20:25 ` Martin Storsjö
Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
This inbox may be cloned and mirrored by anyone:
git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git
# If you have public-inbox 1.1+ installed, you may
# initialize and index your mirror using the following commands:
public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
ffmpegdev@gitmailbox.com
public-inbox-index ffmpegdev
Example config snippet for mirrors.
AGPL code for this site: git clone https://public-inbox.org/public-inbox.git