* [FFmpeg-devel] [PATCH 2/5] swscale/aarch64: Add rgb24 to yuv implementation [not found] <20240604135504.83169-1-quinkblack@foxmail.com> @ 2024-06-04 13:55 ` Zhao Zhili 2024-06-05 6:29 ` Rémi Denis-Courmont 2024-06-05 8:16 ` Martin Storsjö 2024-06-04 13:55 ` [FFmpeg-devel] [PATCH 3/5] avutil/aarch64: Skip define AV_READ_TIME for apple Zhao Zhili ` (2 subsequent siblings) 3 siblings, 2 replies; 10+ messages in thread From: Zhao Zhili @ 2024-06-04 13:55 UTC (permalink / raw) To: ffmpeg-devel; +Cc: Zhao Zhili From: Zhao Zhili <zhilizhao@tencent.com> Test on Apple M1: rgb24_to_uv_1080_c: 7.2 rgb24_to_uv_1080_neon: 5.5 rgb24_to_uv_1280_c: 8.2 rgb24_to_uv_1280_neon: 6.2 rgb24_to_uv_1920_c: 12.5 rgb24_to_uv_1920_neon: 9.5 rgb24_to_uv_half_540_c: 6.5 rgb24_to_uv_half_540_neon: 3.0 rgb24_to_uv_half_640_c: 7.5 rgb24_to_uv_half_640_neon: 3.2 rgb24_to_uv_half_960_c: 12.5 rgb24_to_uv_half_960_neon: 6.0 rgb24_to_y_1080_c: 4.5 rgb24_to_y_1080_neon: 3.5 rgb24_to_y_1280_c: 5.2 rgb24_to_y_1280_neon: 4.2 rgb24_to_y_1920_c: 8.0 rgb24_to_y_1920_neon: 6.0 Signed-off-by: Zhao Zhili <zhilizhao@tencent.com> --- libswscale/aarch64/Makefile | 1 + libswscale/aarch64/input.S | 229 +++++++++++++++++++++++++++++++++++ libswscale/aarch64/swscale.c | 25 ++++ 3 files changed, 255 insertions(+) create mode 100644 libswscale/aarch64/input.S diff --git a/libswscale/aarch64/Makefile b/libswscale/aarch64/Makefile index da1d909561..adfd90a1b6 100644 --- a/libswscale/aarch64/Makefile +++ b/libswscale/aarch64/Makefile @@ -3,6 +3,7 @@ OBJS += aarch64/rgb2rgb.o \ aarch64/swscale_unscaled.o \ NEON-OBJS += aarch64/hscale.o \ + aarch64/input.o \ aarch64/output.o \ aarch64/rgb2rgb_neon.o \ aarch64/yuv2rgb_neon.o \ diff --git a/libswscale/aarch64/input.S b/libswscale/aarch64/input.S new file mode 100644 index 0000000000..ee0d223c6e --- /dev/null +++ b/libswscale/aarch64/input.S @@ -0,0 +1,229 @@ +/* + * Copyright (c) 2024 Zhao Zhili <quinkblack@foxmail.com> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/aarch64/asm.S" + +.macro rgb24_to_yuv_load_rgb, src + ld3 { v16.16b, v17.16b, v18.16b }, [\src] + ushll v19.8h, v16.8b, #0 // v19: r + ushll v20.8h, v17.8b, #0 // v20: g + ushll v21.8h, v18.8b, #0 // v21: b + ushll2 v22.8h, v16.16b, #0 // v22: r + ushll2 v23.8h, v17.16b, #0 // v23: g + ushll2 v24.8h, v18.16b, #0 // v24: b +.endm + +.macro rgb24_to_yuv_product, r, g, b, dst1, dst2, dst, coef0, coef1, coef2, right_shift + mov \dst1\().16b, v6.16b // dst1 = const_offset + mov \dst2\().16b, v6.16b // dst2 = const_offset + smlal \dst1\().4s, \coef0\().4h, \r\().4h // dst1 += rx * r + smlal2 \dst2\().4s, \coef0\().8h, \r\().8h // dst2 += rx * r + smlal \dst1\().4s, \coef1\().4h, \g\().4h // dst1 += gx * g + smlal2 \dst2\().4s, \coef1\().8h, \g\().8h // dst2 += gx * g + smlal \dst1\().4s, \coef2\().4h, \b\().4h // dst1 += bx * b + smlal2 \dst2\().4s, \coef2\().8h, \b\().8h // dst2 += bx * b + sqshrn \dst\().4h, \dst1\().4s, \right_shift // dst_lower_half = dst1 >> right_shift + sqshrn2 \dst\().8h, \dst2\().4s, \right_shift // dst_higher_half = dst2 >> right_shift +.endm + +function ff_rgb24ToY_neon, export=1 + cmp w4, #0 // check width > 0 + b.le 4f + + ldp w10, w11, [x5], #8 // w10: ry, w11: gy + dup v0.8h, w10 + dup v1.8h, w11 + ldr w12, [x5] // w12: by + dup v2.8h, w12 + + mov w9, #256 // w9 = 1 << (RGB2YUV_SHIFT - 7) + movk w9, #8, lsl #16 // w9 += 32 << (RGB2YUV_SHIFT - 1) + dup v6.4s, w9 // w9: const_offset + + mov x2, #0 // w2: i + and w3, w4, #0xFFFFFFF0 // w3 = width / 16 * 16 + cbz w3, 3f +1: + rgb24_to_yuv_load_rgb x1 + rgb24_to_yuv_product v19, v20, v21, v25, v26, v16, v0, v1, v2, #9 + rgb24_to_yuv_product v22, v23, v24, v27, v28, v17, v0, v1, v2, #9 + stp q16, q17, [x0], #32 // store to dst + + add w2, w2, #16 // i += 16 + add x1, x1, #48 // src += 48 + cmp w2, w3 // i < (width / 16 * 16) + b.lt 1b + b 3f +2: + ldrb w13, [x1] // w13: r + ldrb w14, [x1, #1] // w14: g + ldrb w15, [x1, #2] // w15: b + + smaddl x13, w13, w10, x9 // x13 = ry * r + const_offset + smaddl x13, w14, w11, x13 // x13 += gy * g + smaddl x13, w15, w12, x13 // x13 += by * b + asr w13, w13, #9 // x13 >>= 9 + strh w13, [x0], #2 // store to dst + + add w2, w2, #1 // i++ + add x1, x1, #3 // src += 3 +3: + cmp w2, w4 // i < width + b.lt 2b +4: + ret +endfunc + +.macro rgb24_load_uv_coeff half + add x6, x6, #12 + + ldp w10, w11, [x6], #8 // w10: ru, w11: gu + dup v0.8h, w10 + dup v1.8h, w11 + + ldp w12, w13, [x6], #8 // w12: bu, w13: rv + dup v2.8h, w12 + dup v3.8h, w13 + + ldp w14, w15, [x6], #8 // w14: gv, w15: bv + dup v4.8h, w14 + dup v5.8h, w15 + + .if \half + mov w9, #512 + movk w9, #128, lsl #16 // w9: const_offset + .else + mov w9, #256 + movk w9, #64, lsl #16 // w9: const_offset + .endif + dup v6.4s, w9 +.endm + +function ff_rgb24ToUV_half_neon, export=1 + cmp w5, #0 // check width > 0 + b.le 4f + + rgb24_load_uv_coeff half=1 + + mov x9, #0 // x9: i + and w7, w5, #0xFFFFFFF8 // w7 = width / 8 * 8 + cbz w7, 3f +1: + ld3 { v16.16b, v17.16b, v18.16b }, [x3] + uaddlp v19.8h, v16.16b // v19: r + uaddlp v20.8h, v17.16b // v20: g + uaddlp v21.8h, v18.16b // v21: b + + rgb24_to_yuv_product v19, v20, v21, v22, v23, v16, v0, v1, v2, #10 + str q16, [x0], #16 // store dst_u + rgb24_to_yuv_product v19, v20, v21, v24, v25, v17, v3, v4, v5, #10 + str q17, [x1], #16 // store dst_v + 
+ add w9, w9, #8 // i += 8 + add x3, x3, #48 // src += 48 + cmp w9, w7 // i < (width * 8 / 8) + b.lt 1b + b 3f +2: + ldrb w2, [x3] // w2: r1 + ldrb w4, [x3, #3] // w4: r2 + add w2, w2, w4 // w2 = r1 + r2 + + ldrb w4, [x3, #1] // w4: g1 + ldrb w7, [x3, #4] // w7: g2 + add w4, w4, w7 // w4 = g1 + g2 + + ldrb w7, [x3, #2] // w7: b1 + ldrb w8, [x3, #5] // w8: b2 + add w7, w7, w8 // w7 = b1 + b2 + + umov w8, v6.s[0] // dst_u = const_offset + smaddl x8, w2, w10, x8 // dst_u += ru * r + smaddl x8, w4, w11, x8 // dst_u += gu * g + smaddl x8, w7, w12, x8 // dst_u += bu * b + asr x8, x8, #10 // dst_u >>= 10 + strh w8, [x0], #2 // store dst_u + + umov w8, v6.s[0] // dst_v = const_offset + smaddl x8, w2, w13, x8 // dst_v += rv * r + smaddl x8, w4, w14, x8 // dst_v += gv * g + smaddl x8, w7, w15, x8 // dst_v += bv * b + asr x8, x8, #10 // dst_v >>= 10 + strh w8, [x1], #2 // store dst_v + + add w9, w9, #1 // i++ + add x3, x3, #6 // src += 6 +3: + cmp w9, w5 + b.lt 2b +4: + ret +endfunc + +function ff_rgb24ToUV_neon, export=1 + cmp w5, #0 // check width > 0 + b.le 4f + + rgb24_load_uv_coeff half=0 + + mov x2, #0 // w2: i + and w4, w5, #0xFFFFFFF0 // w4: width / 16 * 16 + cbz w4, 3f +1: + rgb24_to_yuv_load_rgb x3 + rgb24_to_yuv_product v19, v20, v21, v25, v26, v16, v0, v1, v2, #9 + rgb24_to_yuv_product v22, v23, v24, v27, v28, v17, v0, v1, v2, #9 + stp q16, q17, [x0], #32 // store to dst_u + rgb24_to_yuv_product v19, v20, v21, v25, v26, v16, v3, v4, v5, #9 + rgb24_to_yuv_product v22, v23, v24, v27, v28, v17, v3, v4, v5, #9 + stp q16, q17, [x1], #32 // store to dst_v + + add w2, w2, #16 // i += 16 + add x3, x3, #48 // src += 48 + cmp w2, w4 // i < (width / 16 * 16) + b.lt 1b + b 3f +2: + ldrb w16, [x3] // w16: r + ldrb w17, [x3, #1] // w17: g + ldrb w4, [x3, #2] // w4: b + + umov w7, v6.s[0] // w7 = const_offset + + smaddl x8, w16, w10, x7 // x8 = ru * r + const_offset + smaddl x8, w17, w11, x8 // x8 += gu * g + smaddl x8, w4, w12, x8 // x8 += bu * b + asr w8, w8, #9 // x8 >>= 9 + strh w8, [x0], #2 // store to dst_u + + smaddl x8, w16, w13, x7 // x8 = rv * r + const_offset + smaddl x8, w17, w14, x8 // x8 += gv * g + smaddl x8, w4, w15, x8 // x8 += bv * b + asr w8, w8, #9 // x8 >>= 9 + strh w8, [x1], #2 // store to dst_v + + add w2, w2, #1 // i++ + add x3, x3, #3 // src += 3 +3: + cmp w2, w5 // i < width + b.lt 2b +4: + ret +endfunc diff --git a/libswscale/aarch64/swscale.c b/libswscale/aarch64/swscale.c index bbd9719a44..4c4ea39dc1 100644 --- a/libswscale/aarch64/swscale.c +++ b/libswscale/aarch64/swscale.c @@ -201,6 +201,20 @@ void ff_yuv2plane1_8_neon( default: break; \ } +void ff_rgb24ToY_neon(uint8_t *_dst, const uint8_t *src, const uint8_t *unused1, + const uint8_t *unused2, int width, + uint32_t *rgb2yuv, void *opq); + +void ff_rgb24ToUV_neon(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *unused0, + const uint8_t *src1, + const uint8_t *src2, int width, uint32_t *rgb2yuv, + void *opq); + +void ff_rgb24ToUV_half_neon(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *unused0, + const uint8_t *src1, + const uint8_t *src2, int width, uint32_t *rgb2yuv, + void *opq); + av_cold void ff_sws_init_swscale_aarch64(SwsContext *c) { int cpu_flags = av_get_cpu_flags(); @@ -212,5 +226,16 @@ av_cold void ff_sws_init_swscale_aarch64(SwsContext *c) if (c->dstBpc == 8) { c->yuv2planeX = ff_yuv2planeX_8_neon; } + switch (c->srcFormat) { + case AV_PIX_FMT_RGB24: + c->lumToYV12 = ff_rgb24ToY_neon; + if (c->chrSrcHSubSample) + c->chrToYV12 = ff_rgb24ToUV_half_neon; + else + c->chrToYV12 = ff_rgb24ToUV_neon; + break; + 
default: + break; + } } } -- 2.42.0 _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". ^ permalink raw reply [flat|nested] 10+ messages in thread
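
As an aside on the magic constants in ff_rgb24ToY_neon above: assuming
RGB2YUV_SHIFT is 15 (its value in libswscale), the rounding offset built with
mov/movk works out as follows (a worked expansion of the patch's own comments,
for reference only):

        // const_offset = (32 << (RGB2YUV_SHIFT - 1)) + (1 << (RGB2YUV_SHIFT - 7))
        //              = (32 << 14) + (1 << 8)
        //              = 0x80000 + 0x100 = 0x80100
        mov     w9, #256               // low 16 bits:  0x0100
        movk    w9, #8, lsl #16        // bits 16..31:  0x0008, so w9 = 0x80100
        dup     v6.4s, w9              // broadcast const_offset to all lanes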
* Re: [FFmpeg-devel] [PATCH 2/5] swscale/aarch64: Add rgb24 to yuv implementation 2024-06-04 13:55 ` [FFmpeg-devel] [PATCH 2/5] swscale/aarch64: Add rgb24 to yuv implementation Zhao Zhili @ 2024-06-05 6:29 ` Rémi Denis-Courmont 2024-06-05 6:53 ` Zhao Zhili 2024-06-05 8:16 ` Martin Storsjö 1 sibling, 1 reply; 10+ messages in thread From: Rémi Denis-Courmont @ 2024-06-05 6:29 UTC (permalink / raw) To: FFmpeg development discussions and patches Le 4 juin 2024 16:55:01 GMT+03:00, Zhao Zhili <quinkblack@foxmail.com> a écrit : >From: Zhao Zhili <zhilizhao@tencent.com> > >Test on Apple M1: > >rgb24_to_uv_1080_c: 7.2 >rgb24_to_uv_1080_neon: 5.5 >rgb24_to_uv_1280_c: 8.2 >rgb24_to_uv_1280_neon: 6.2 >rgb24_to_uv_1920_c: 12.5 >rgb24_to_uv_1920_neon: 9.5 > >rgb24_to_uv_half_540_c: 6.5 >rgb24_to_uv_half_540_neon: 3.0 >rgb24_to_uv_half_640_c: 7.5 >rgb24_to_uv_half_640_neon: 3.2 >rgb24_to_uv_half_960_c: 12.5 >rgb24_to_uv_half_960_neon: 6.0 > >rgb24_to_y_1080_c: 4.5 >rgb24_to_y_1080_neon: 3.5 >rgb24_to_y_1280_c: 5.2 >rgb24_to_y_1280_neon: 4.2 >rgb24_to_y_1920_c: 8.0 >rgb24_to_y_1920_neon: 6.0 > >Signed-off-by: Zhao Zhili <zhilizhao@tencent.com> >--- > libswscale/aarch64/Makefile | 1 + > libswscale/aarch64/input.S | 229 +++++++++++++++++++++++++++++++++++ > libswscale/aarch64/swscale.c | 25 ++++ > 3 files changed, 255 insertions(+) > create mode 100644 libswscale/aarch64/input.S > >diff --git a/libswscale/aarch64/Makefile b/libswscale/aarch64/Makefile >index da1d909561..adfd90a1b6 100644 >--- a/libswscale/aarch64/Makefile >+++ b/libswscale/aarch64/Makefile >@@ -3,6 +3,7 @@ OBJS += aarch64/rgb2rgb.o \ > aarch64/swscale_unscaled.o \ > > NEON-OBJS += aarch64/hscale.o \ >+ aarch64/input.o \ > aarch64/output.o \ > aarch64/rgb2rgb_neon.o \ > aarch64/yuv2rgb_neon.o \ >diff --git a/libswscale/aarch64/input.S b/libswscale/aarch64/input.S >new file mode 100644 >index 0000000000..ee0d223c6e >--- /dev/null >+++ b/libswscale/aarch64/input.S >@@ -0,0 +1,229 @@ >+/* >+ * Copyright (c) 2024 Zhao Zhili <quinkblack@foxmail.com> >+ * >+ * This file is part of FFmpeg. >+ * >+ * FFmpeg is free software; you can redistribute it and/or >+ * modify it under the terms of the GNU Lesser General Public >+ * License as published by the Free Software Foundation; either >+ * version 2.1 of the License, or (at your option) any later version. >+ * >+ * FFmpeg is distributed in the hope that it will be useful, >+ * but WITHOUT ANY WARRANTY; without even the implied warranty of >+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU >+ * Lesser General Public License for more details. 
>+ * >+ * You should have received a copy of the GNU Lesser General Public >+ * License along with FFmpeg; if not, write to the Free Software >+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA >+ */ >+ >+#include "libavutil/aarch64/asm.S" >+ >+.macro rgb24_to_yuv_load_rgb, src >+ ld3 { v16.16b, v17.16b, v18.16b }, [\src] >+ ushll v19.8h, v16.8b, #0 // v19: r >+ ushll v20.8h, v17.8b, #0 // v20: g >+ ushll v21.8h, v18.8b, #0 // v21: b >+ ushll2 v22.8h, v16.16b, #0 // v22: r >+ ushll2 v23.8h, v17.16b, #0 // v23: g >+ ushll2 v24.8h, v18.16b, #0 // v24: b >+.endm >+ >+.macro rgb24_to_yuv_product, r, g, b, dst1, dst2, dst, coef0, coef1, coef2, right_shift >+ mov \dst1\().16b, v6.16b // dst1 = const_offset >+ mov \dst2\().16b, v6.16b // dst2 = const_offset >+ smlal \dst1\().4s, \coef0\().4h, \r\().4h // dst1 += rx * r >+ smlal2 \dst2\().4s, \coef0\().8h, \r\().8h // dst2 += rx * r >+ smlal \dst1\().4s, \coef1\().4h, \g\().4h // dst1 += gx * g >+ smlal2 \dst2\().4s, \coef1\().8h, \g\().8h // dst2 += gx * g >+ smlal \dst1\().4s, \coef2\().4h, \b\().4h // dst1 += bx * b >+ smlal2 \dst2\().4s, \coef2\().8h, \b\().8h // dst2 += bx * b >+ sqshrn \dst\().4h, \dst1\().4s, \right_shift // dst_lower_half = dst1 >> right_shift >+ sqshrn2 \dst\().8h, \dst2\().4s, \right_shift // dst_higher_half = dst2 >> right_shift >+.endm >+ >+function ff_rgb24ToY_neon, export=1 >+ cmp w4, #0 // check width > 0 >+ b.le 4f >+ >+ ldp w10, w11, [x5], #8 // w10: ry, w11: gy I don't think it affects anything on your OoO execution hardware, but you're using the result of this load right off the bat in the next instruction. Ditto below. This may hurt perfs on not-so-fancy CPUs. >+ dup v0.8h, w10 >+ dup v1.8h, w11 >+ ldr w12, [x5] // w12: by >+ dup v2.8h, w12 >+ >+ mov w9, #256 // w9 = 1 << (RGB2YUV_SHIFT - 7) >+ movk w9, #8, lsl #16 // w9 += 32 << (RGB2YUV_SHIFT - 1) >+ dup v6.4s, w9 // w9: const_offset >+ >+ mov x2, #0 // w2: i >+ and w3, w4, #0xFFFFFFF0 // w3 = width / 16 * 16 >+ cbz w3, 3f >+1: >+ rgb24_to_yuv_load_rgb x1 >+ rgb24_to_yuv_product v19, v20, v21, v25, v26, v16, v0, v1, v2, #9 >+ rgb24_to_yuv_product v22, v23, v24, v27, v28, v17, v0, v1, v2, #9 >+ stp q16, q17, [x0], #32 // store to dst >+ >+ add w2, w2, #16 // i += 16 >+ add x1, x1, #48 // src += 48 >+ cmp w2, w3 // i < (width / 16 * 16) >+ b.lt 1b >+ b 3f >+2: >+ ldrb w13, [x1] // w13: r >+ ldrb w14, [x1, #1] // w14: g >+ ldrb w15, [x1, #2] // w15: b You can reorder instructions a little to use post-index and eliminate the ADD, though that won't make much difference. I don't get why the perf gain is so low, or is this an artefact of Apple CPUs? 
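
(For illustration, one way the scalar tail of ff_rgb24ToY_neon could pick up
the post-index suggestion, keeping the register assignments from the patch;
an untested sketch, not the author's code:)

2:
        ldrb    w14, [x1, #1]          // w14: g
        ldrb    w15, [x1, #2]          // w15: b
        ldrb    w13, [x1], #3          // w13: r; post-index replaces "add x1, x1, #3"

        smaddl  x13, w13, w10, x9      // x13 = ry * r + const_offset
        smaddl  x13, w14, w11, x13     // x13 += gy * g
        smaddl  x13, w15, w12, x13     // x13 += by * b
        asr     w13, w13, #9
        strh    w13, [x0], #2          // store to dst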
>+ >+ smaddl x13, w13, w10, x9 // x13 = ry * r + const_offset >+ smaddl x13, w14, w11, x13 // x13 += gy * g >+ smaddl x13, w15, w12, x13 // x13 += by * b >+ asr w13, w13, #9 // x13 >>= 9 >+ strh w13, [x0], #2 // store to dst >+ >+ add w2, w2, #1 // i++ >+ add x1, x1, #3 // src += 3 >+3: >+ cmp w2, w4 // i < width >+ b.lt 2b >+4: >+ ret >+endfunc >+ >+.macro rgb24_load_uv_coeff half >+ add x6, x6, #12 >+ >+ ldp w10, w11, [x6], #8 // w10: ru, w11: gu >+ dup v0.8h, w10 >+ dup v1.8h, w11 >+ >+ ldp w12, w13, [x6], #8 // w12: bu, w13: rv >+ dup v2.8h, w12 >+ dup v3.8h, w13 >+ >+ ldp w14, w15, [x6], #8 // w14: gv, w15: bv >+ dup v4.8h, w14 >+ dup v5.8h, w15 >+ >+ .if \half >+ mov w9, #512 >+ movk w9, #128, lsl #16 // w9: const_offset >+ .else >+ mov w9, #256 >+ movk w9, #64, lsl #16 // w9: const_offset >+ .endif >+ dup v6.4s, w9 >+.endm >+ >+function ff_rgb24ToUV_half_neon, export=1 >+ cmp w5, #0 // check width > 0 >+ b.le 4f >+ >+ rgb24_load_uv_coeff half=1 >+ >+ mov x9, #0 // x9: i >+ and w7, w5, #0xFFFFFFF8 // w7 = width / 8 * 8 >+ cbz w7, 3f >+1: >+ ld3 { v16.16b, v17.16b, v18.16b }, [x3] >+ uaddlp v19.8h, v16.16b // v19: r >+ uaddlp v20.8h, v17.16b // v20: g >+ uaddlp v21.8h, v18.16b // v21: b >+ >+ rgb24_to_yuv_product v19, v20, v21, v22, v23, v16, v0, v1, v2, #10 >+ str q16, [x0], #16 // store dst_u >+ rgb24_to_yuv_product v19, v20, v21, v24, v25, v17, v3, v4, v5, #10 >+ str q17, [x1], #16 // store dst_v >+ >+ add w9, w9, #8 // i += 8 >+ add x3, x3, #48 // src += 48 >+ cmp w9, w7 // i < (width * 8 / 8) >+ b.lt 1b >+ b 3f >+2: >+ ldrb w2, [x3] // w2: r1 >+ ldrb w4, [x3, #3] // w4: r2 >+ add w2, w2, w4 // w2 = r1 + r2 >+ >+ ldrb w4, [x3, #1] // w4: g1 >+ ldrb w7, [x3, #4] // w7: g2 >+ add w4, w4, w7 // w4 = g1 + g2 >+ >+ ldrb w7, [x3, #2] // w7: b1 >+ ldrb w8, [x3, #5] // w8: b2 >+ add w7, w7, w8 // w7 = b1 + b2 >+ >+ umov w8, v6.s[0] // dst_u = const_offset >+ smaddl x8, w2, w10, x8 // dst_u += ru * r >+ smaddl x8, w4, w11, x8 // dst_u += gu * g >+ smaddl x8, w7, w12, x8 // dst_u += bu * b >+ asr x8, x8, #10 // dst_u >>= 10 >+ strh w8, [x0], #2 // store dst_u >+ >+ umov w8, v6.s[0] // dst_v = const_offset >+ smaddl x8, w2, w13, x8 // dst_v += rv * r >+ smaddl x8, w4, w14, x8 // dst_v += gv * g >+ smaddl x8, w7, w15, x8 // dst_v += bv * b >+ asr x8, x8, #10 // dst_v >>= 10 >+ strh w8, [x1], #2 // store dst_v >+ >+ add w9, w9, #1 // i++ >+ add x3, x3, #6 // src += 6 >+3: >+ cmp w9, w5 >+ b.lt 2b >+4: >+ ret >+endfunc >+ >+function ff_rgb24ToUV_neon, export=1 >+ cmp w5, #0 // check width > 0 >+ b.le 4f >+ >+ rgb24_load_uv_coeff half=0 >+ >+ mov x2, #0 // w2: i >+ and w4, w5, #0xFFFFFFF0 // w4: width / 16 * 16 >+ cbz w4, 3f >+1: >+ rgb24_to_yuv_load_rgb x3 >+ rgb24_to_yuv_product v19, v20, v21, v25, v26, v16, v0, v1, v2, #9 >+ rgb24_to_yuv_product v22, v23, v24, v27, v28, v17, v0, v1, v2, #9 >+ stp q16, q17, [x0], #32 // store to dst_u >+ rgb24_to_yuv_product v19, v20, v21, v25, v26, v16, v3, v4, v5, #9 >+ rgb24_to_yuv_product v22, v23, v24, v27, v28, v17, v3, v4, v5, #9 >+ stp q16, q17, [x1], #32 // store to dst_v >+ >+ add w2, w2, #16 // i += 16 >+ add x3, x3, #48 // src += 48 >+ cmp w2, w4 // i < (width / 16 * 16) >+ b.lt 1b >+ b 3f >+2: >+ ldrb w16, [x3] // w16: r >+ ldrb w17, [x3, #1] // w17: g >+ ldrb w4, [x3, #2] // w4: b >+ >+ umov w7, v6.s[0] // w7 = const_offset >+ >+ smaddl x8, w16, w10, x7 // x8 = ru * r + const_offset >+ smaddl x8, w17, w11, x8 // x8 += gu * g >+ smaddl x8, w4, w12, x8 // x8 += bu * b >+ asr w8, w8, #9 // x8 >>= 9 >+ strh w8, [x0], #2 // store to dst_u >+ >+ smaddl 
x8, w16, w13, x7 // x8 = rv * r + const_offset >+ smaddl x8, w17, w14, x8 // x8 += gv * g >+ smaddl x8, w4, w15, x8 // x8 += bv * b >+ asr w8, w8, #9 // x8 >>= 9 >+ strh w8, [x1], #2 // store to dst_v >+ >+ add w2, w2, #1 // i++ >+ add x3, x3, #3 // src += 3 >+3: >+ cmp w2, w5 // i < width >+ b.lt 2b >+4: >+ ret >+endfunc >diff --git a/libswscale/aarch64/swscale.c b/libswscale/aarch64/swscale.c >index bbd9719a44..4c4ea39dc1 100644 >--- a/libswscale/aarch64/swscale.c >+++ b/libswscale/aarch64/swscale.c >@@ -201,6 +201,20 @@ void ff_yuv2plane1_8_neon( > default: break; \ > } > >+void ff_rgb24ToY_neon(uint8_t *_dst, const uint8_t *src, const uint8_t *unused1, >+ const uint8_t *unused2, int width, >+ uint32_t *rgb2yuv, void *opq); >+ >+void ff_rgb24ToUV_neon(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *unused0, >+ const uint8_t *src1, >+ const uint8_t *src2, int width, uint32_t *rgb2yuv, >+ void *opq); >+ >+void ff_rgb24ToUV_half_neon(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *unused0, >+ const uint8_t *src1, >+ const uint8_t *src2, int width, uint32_t *rgb2yuv, >+ void *opq); >+ > av_cold void ff_sws_init_swscale_aarch64(SwsContext *c) > { > int cpu_flags = av_get_cpu_flags(); >@@ -212,5 +226,16 @@ av_cold void ff_sws_init_swscale_aarch64(SwsContext *c) > if (c->dstBpc == 8) { > c->yuv2planeX = ff_yuv2planeX_8_neon; > } >+ switch (c->srcFormat) { >+ case AV_PIX_FMT_RGB24: >+ c->lumToYV12 = ff_rgb24ToY_neon; >+ if (c->chrSrcHSubSample) >+ c->chrToYV12 = ff_rgb24ToUV_half_neon; >+ else >+ c->chrToYV12 = ff_rgb24ToUV_neon; >+ break; >+ default: >+ break; >+ } > } > } _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". ^ permalink raw reply [flat|nested] 10+ messages in thread
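
(Regarding the load-use latency remark above: the coefficient setup could
issue all the loads before any of the dup instructions that consume them.
A sketch only, using a plain immediate offset instead of post-incrementing x5,
which appears unused afterwards:)

        ldp     w10, w11, [x5]         // w10: ry, w11: gy
        ldr     w12, [x5, #8]          // w12: by
        dup     v0.8h, w10             // consumers now sit a few
        dup     v1.8h, w11             // instructions after the loads
        dup     v2.8h, w12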
* Re: [FFmpeg-devel] [PATCH 2/5] swscale/aarch64: Add rgb24 to yuv implementation 2024-06-05 6:29 ` Rémi Denis-Courmont @ 2024-06-05 6:53 ` Zhao Zhili 2024-06-05 7:34 ` Rémi Denis-Courmont 2024-06-05 7:34 ` Martin Storsjö 0 siblings, 2 replies; 10+ messages in thread From: Zhao Zhili @ 2024-06-05 6:53 UTC (permalink / raw) To: FFmpeg development discussions and patches > On Jun 5, 2024, at 14:29, Rémi Denis-Courmont <remi@remlab.net> wrote: > > > > Le 4 juin 2024 16:55:01 GMT+03:00, Zhao Zhili <quinkblack@foxmail.com <mailto:quinkblack@foxmail.com>> a écrit : >> From: Zhao Zhili <zhilizhao@tencent.com> >> >> Test on Apple M1: >> >> rgb24_to_uv_1080_c: 7.2 >> rgb24_to_uv_1080_neon: 5.5 >> rgb24_to_uv_1280_c: 8.2 >> rgb24_to_uv_1280_neon: 6.2 >> rgb24_to_uv_1920_c: 12.5 >> rgb24_to_uv_1920_neon: 9.5 >> >> rgb24_to_uv_half_540_c: 6.5 >> rgb24_to_uv_half_540_neon: 3.0 >> rgb24_to_uv_half_640_c: 7.5 >> rgb24_to_uv_half_640_neon: 3.2 >> rgb24_to_uv_half_960_c: 12.5 >> rgb24_to_uv_half_960_neon: 6.0 >> >> rgb24_to_y_1080_c: 4.5 >> rgb24_to_y_1080_neon: 3.5 >> rgb24_to_y_1280_c: 5.2 >> rgb24_to_y_1280_neon: 4.2 >> rgb24_to_y_1920_c: 8.0 >> rgb24_to_y_1920_neon: 6.0 >> >> Signed-off-by: Zhao Zhili <zhilizhao@tencent.com> >> --- >> libswscale/aarch64/Makefile | 1 + >> libswscale/aarch64/input.S | 229 +++++++++++++++++++++++++++++++++++ >> libswscale/aarch64/swscale.c | 25 ++++ >> 3 files changed, 255 insertions(+) >> create mode 100644 libswscale/aarch64/input.S >> >> diff --git a/libswscale/aarch64/Makefile b/libswscale/aarch64/Makefile >> index da1d909561..adfd90a1b6 100644 >> --- a/libswscale/aarch64/Makefile >> +++ b/libswscale/aarch64/Makefile >> @@ -3,6 +3,7 @@ OBJS += aarch64/rgb2rgb.o \ >> aarch64/swscale_unscaled.o \ >> >> NEON-OBJS += aarch64/hscale.o \ >> + aarch64/input.o \ >> aarch64/output.o \ >> aarch64/rgb2rgb_neon.o \ >> aarch64/yuv2rgb_neon.o \ >> diff --git a/libswscale/aarch64/input.S b/libswscale/aarch64/input.S >> new file mode 100644 >> index 0000000000..ee0d223c6e >> --- /dev/null >> +++ b/libswscale/aarch64/input.S >> @@ -0,0 +1,229 @@ >> +/* >> + * Copyright (c) 2024 Zhao Zhili <quinkblack@foxmail.com> >> + * >> + * This file is part of FFmpeg. >> + * >> + * FFmpeg is free software; you can redistribute it and/or >> + * modify it under the terms of the GNU Lesser General Public >> + * License as published by the Free Software Foundation; either >> + * version 2.1 of the License, or (at your option) any later version. >> + * >> + * FFmpeg is distributed in the hope that it will be useful, >> + * but WITHOUT ANY WARRANTY; without even the implied warranty of >> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU >> + * Lesser General Public License for more details. 
>> + * >> + * You should have received a copy of the GNU Lesser General Public >> + * License along with FFmpeg; if not, write to the Free Software >> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA >> + */ >> + >> +#include "libavutil/aarch64/asm.S" >> + >> +.macro rgb24_to_yuv_load_rgb, src >> + ld3 { v16.16b, v17.16b, v18.16b }, [\src] >> + ushll v19.8h, v16.8b, #0 // v19: r >> + ushll v20.8h, v17.8b, #0 // v20: g >> + ushll v21.8h, v18.8b, #0 // v21: b >> + ushll2 v22.8h, v16.16b, #0 // v22: r >> + ushll2 v23.8h, v17.16b, #0 // v23: g >> + ushll2 v24.8h, v18.16b, #0 // v24: b >> +.endm >> + >> +.macro rgb24_to_yuv_product, r, g, b, dst1, dst2, dst, coef0, coef1, coef2, right_shift >> + mov \dst1\().16b, v6.16b // dst1 = const_offset >> + mov \dst2\().16b, v6.16b // dst2 = const_offset >> + smlal \dst1\().4s, \coef0\().4h, \r\().4h // dst1 += rx * r >> + smlal2 \dst2\().4s, \coef0\().8h, \r\().8h // dst2 += rx * r >> + smlal \dst1\().4s, \coef1\().4h, \g\().4h // dst1 += gx * g >> + smlal2 \dst2\().4s, \coef1\().8h, \g\().8h // dst2 += gx * g >> + smlal \dst1\().4s, \coef2\().4h, \b\().4h // dst1 += bx * b >> + smlal2 \dst2\().4s, \coef2\().8h, \b\().8h // dst2 += bx * b >> + sqshrn \dst\().4h, \dst1\().4s, \right_shift // dst_lower_half = dst1 >> right_shift >> + sqshrn2 \dst\().8h, \dst2\().4s, \right_shift // dst_higher_half = dst2 >> right_shift >> +.endm >> + >> +function ff_rgb24ToY_neon, export=1 >> + cmp w4, #0 // check width > 0 >> + b.le 4f >> + >> + ldp w10, w11, [x5], #8 // w10: ry, w11: gy > > I don't think it affects anything on your OoO execution hardware, but you're using the result of this load right off the bat in the next instruction. Ditto below. This may hurt perfs on not-so-fancy CPUs. Will do. > >> + dup v0.8h, w10 >> + dup v1.8h, w11 >> + ldr w12, [x5] // w12: by >> + dup v2.8h, w12 >> + >> + mov w9, #256 // w9 = 1 << (RGB2YUV_SHIFT - 7) >> + movk w9, #8, lsl #16 // w9 += 32 << (RGB2YUV_SHIFT - 1) >> + dup v6.4s, w9 // w9: const_offset >> + >> + mov x2, #0 // w2: i >> + and w3, w4, #0xFFFFFFF0 // w3 = width / 16 * 16 >> + cbz w3, 3f >> +1: >> + rgb24_to_yuv_load_rgb x1 >> + rgb24_to_yuv_product v19, v20, v21, v25, v26, v16, v0, v1, v2, #9 >> + rgb24_to_yuv_product v22, v23, v24, v27, v28, v17, v0, v1, v2, #9 >> + stp q16, q17, [x0], #32 // store to dst >> + >> + add w2, w2, #16 // i += 16 >> + add x1, x1, #48 // src += 48 >> + cmp w2, w3 // i < (width / 16 * 16) >> + b.lt 1b >> + b 3f >> +2: >> + ldrb w13, [x1] // w13: r >> + ldrb w14, [x1, #1] // w14: g >> + ldrb w15, [x1, #2] // w15: b > > You can reorder instructions a little to use post-index and eliminate the ADD, though that won't make much difference. > > I don't get why the perf gain is so low, or is this an artefact of Apple CPUs? I have checked the assembly of C version. The compiler has done pretty well on loop unroll and vectorize on this simple case. 
> >> + >> + smaddl x13, w13, w10, x9 // x13 = ry * r + const_offset >> + smaddl x13, w14, w11, x13 // x13 += gy * g >> + smaddl x13, w15, w12, x13 // x13 += by * b >> + asr w13, w13, #9 // x13 >>= 9 >> + strh w13, [x0], #2 // store to dst >> + >> + add w2, w2, #1 // i++ >> + add x1, x1, #3 // src += 3 >> +3: >> + cmp w2, w4 // i < width >> + b.lt 2b >> +4: >> + ret >> +endfunc >> + >> +.macro rgb24_load_uv_coeff half >> + add x6, x6, #12 >> + >> + ldp w10, w11, [x6], #8 // w10: ru, w11: gu >> + dup v0.8h, w10 >> + dup v1.8h, w11 >> + >> + ldp w12, w13, [x6], #8 // w12: bu, w13: rv >> + dup v2.8h, w12 >> + dup v3.8h, w13 >> + >> + ldp w14, w15, [x6], #8 // w14: gv, w15: bv >> + dup v4.8h, w14 >> + dup v5.8h, w15 >> + >> + .if \half >> + mov w9, #512 >> + movk w9, #128, lsl #16 // w9: const_offset >> + .else >> + mov w9, #256 >> + movk w9, #64, lsl #16 // w9: const_offset >> + .endif >> + dup v6.4s, w9 >> +.endm >> + >> +function ff_rgb24ToUV_half_neon, export=1 >> + cmp w5, #0 // check width > 0 >> + b.le 4f >> + >> + rgb24_load_uv_coeff half=1 >> + >> + mov x9, #0 // x9: i >> + and w7, w5, #0xFFFFFFF8 // w7 = width / 8 * 8 >> + cbz w7, 3f >> +1: >> + ld3 { v16.16b, v17.16b, v18.16b }, [x3] >> + uaddlp v19.8h, v16.16b // v19: r >> + uaddlp v20.8h, v17.16b // v20: g >> + uaddlp v21.8h, v18.16b // v21: b >> + >> + rgb24_to_yuv_product v19, v20, v21, v22, v23, v16, v0, v1, v2, #10 >> + str q16, [x0], #16 // store dst_u >> + rgb24_to_yuv_product v19, v20, v21, v24, v25, v17, v3, v4, v5, #10 >> + str q17, [x1], #16 // store dst_v >> + >> + add w9, w9, #8 // i += 8 >> + add x3, x3, #48 // src += 48 >> + cmp w9, w7 // i < (width * 8 / 8) >> + b.lt 1b >> + b 3f >> +2: >> + ldrb w2, [x3] // w2: r1 >> + ldrb w4, [x3, #3] // w4: r2 >> + add w2, w2, w4 // w2 = r1 + r2 >> + >> + ldrb w4, [x3, #1] // w4: g1 >> + ldrb w7, [x3, #4] // w7: g2 >> + add w4, w4, w7 // w4 = g1 + g2 >> + >> + ldrb w7, [x3, #2] // w7: b1 >> + ldrb w8, [x3, #5] // w8: b2 >> + add w7, w7, w8 // w7 = b1 + b2 >> + >> + umov w8, v6.s[0] // dst_u = const_offset >> + smaddl x8, w2, w10, x8 // dst_u += ru * r >> + smaddl x8, w4, w11, x8 // dst_u += gu * g >> + smaddl x8, w7, w12, x8 // dst_u += bu * b >> + asr x8, x8, #10 // dst_u >>= 10 >> + strh w8, [x0], #2 // store dst_u >> + >> + umov w8, v6.s[0] // dst_v = const_offset >> + smaddl x8, w2, w13, x8 // dst_v += rv * r >> + smaddl x8, w4, w14, x8 // dst_v += gv * g >> + smaddl x8, w7, w15, x8 // dst_v += bv * b >> + asr x8, x8, #10 // dst_v >>= 10 >> + strh w8, [x1], #2 // store dst_v >> + >> + add w9, w9, #1 // i++ >> + add x3, x3, #6 // src += 6 >> +3: >> + cmp w9, w5 >> + b.lt 2b >> +4: >> + ret >> +endfunc >> + >> +function ff_rgb24ToUV_neon, export=1 >> + cmp w5, #0 // check width > 0 >> + b.le 4f >> + >> + rgb24_load_uv_coeff half=0 >> + >> + mov x2, #0 // w2: i >> + and w4, w5, #0xFFFFFFF0 // w4: width / 16 * 16 >> + cbz w4, 3f >> +1: >> + rgb24_to_yuv_load_rgb x3 >> + rgb24_to_yuv_product v19, v20, v21, v25, v26, v16, v0, v1, v2, #9 >> + rgb24_to_yuv_product v22, v23, v24, v27, v28, v17, v0, v1, v2, #9 >> + stp q16, q17, [x0], #32 // store to dst_u >> + rgb24_to_yuv_product v19, v20, v21, v25, v26, v16, v3, v4, v5, #9 >> + rgb24_to_yuv_product v22, v23, v24, v27, v28, v17, v3, v4, v5, #9 >> + stp q16, q17, [x1], #32 // store to dst_v >> + >> + add w2, w2, #16 // i += 16 >> + add x3, x3, #48 // src += 48 >> + cmp w2, w4 // i < (width / 16 * 16) >> + b.lt 1b >> + b 3f >> +2: >> + ldrb w16, [x3] // w16: r >> + ldrb w17, [x3, #1] // w17: g >> + ldrb w4, [x3, #2] // w4: b >> + >> + 
umov w7, v6.s[0] // w7 = const_offset >> + >> + smaddl x8, w16, w10, x7 // x8 = ru * r + const_offset >> + smaddl x8, w17, w11, x8 // x8 += gu * g >> + smaddl x8, w4, w12, x8 // x8 += bu * b >> + asr w8, w8, #9 // x8 >>= 9 >> + strh w8, [x0], #2 // store to dst_u >> + >> + smaddl x8, w16, w13, x7 // x8 = rv * r + const_offset >> + smaddl x8, w17, w14, x8 // x8 += gv * g >> + smaddl x8, w4, w15, x8 // x8 += bv * b >> + asr w8, w8, #9 // x8 >>= 9 >> + strh w8, [x1], #2 // store to dst_v >> + >> + add w2, w2, #1 // i++ >> + add x3, x3, #3 // src += 3 >> +3: >> + cmp w2, w5 // i < width >> + b.lt 2b >> +4: >> + ret >> +endfunc >> diff --git a/libswscale/aarch64/swscale.c b/libswscale/aarch64/swscale.c >> index bbd9719a44..4c4ea39dc1 100644 >> --- a/libswscale/aarch64/swscale.c >> +++ b/libswscale/aarch64/swscale.c >> @@ -201,6 +201,20 @@ void ff_yuv2plane1_8_neon( >> default: break; \ >> } >> >> +void ff_rgb24ToY_neon(uint8_t *_dst, const uint8_t *src, const uint8_t *unused1, >> + const uint8_t *unused2, int width, >> + uint32_t *rgb2yuv, void *opq); >> + >> +void ff_rgb24ToUV_neon(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *unused0, >> + const uint8_t *src1, >> + const uint8_t *src2, int width, uint32_t *rgb2yuv, >> + void *opq); >> + >> +void ff_rgb24ToUV_half_neon(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *unused0, >> + const uint8_t *src1, >> + const uint8_t *src2, int width, uint32_t *rgb2yuv, >> + void *opq); >> + >> av_cold void ff_sws_init_swscale_aarch64(SwsContext *c) >> { >> int cpu_flags = av_get_cpu_flags(); >> @@ -212,5 +226,16 @@ av_cold void ff_sws_init_swscale_aarch64(SwsContext *c) >> if (c->dstBpc == 8) { >> c->yuv2planeX = ff_yuv2planeX_8_neon; >> } >> + switch (c->srcFormat) { >> + case AV_PIX_FMT_RGB24: >> + c->lumToYV12 = ff_rgb24ToY_neon; >> + if (c->chrSrcHSubSample) >> + c->chrToYV12 = ff_rgb24ToUV_half_neon; >> + else >> + c->chrToYV12 = ff_rgb24ToUV_neon; >> + break; >> + default: >> + break; >> + } >> } >> } > _______________________________________________ > ffmpeg-devel mailing list > ffmpeg-devel@ffmpeg.org <mailto:ffmpeg-devel@ffmpeg.org> > https://ffmpeg.org/mailman/listinfo/ffmpeg-devel > > To unsubscribe, visit link above, or email > ffmpeg-devel-request@ffmpeg.org <mailto:ffmpeg-devel-request@ffmpeg.org> with subject "unsubscribe". _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". ^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [FFmpeg-devel] [PATCH 2/5] swscale/aarch64: Add rgb24 to yuv implementation 2024-06-05 6:53 ` Zhao Zhili @ 2024-06-05 7:34 ` Rémi Denis-Courmont 2024-06-05 7:34 ` Martin Storsjö 1 sibling, 0 replies; 10+ messages in thread From: Rémi Denis-Courmont @ 2024-06-05 7:34 UTC (permalink / raw) To: FFmpeg development discussions and patches Le 5 juin 2024 09:53:45 GMT+03:00, Zhao Zhili <quinkblack@foxmail.com> a écrit : > > >> On Jun 5, 2024, at 14:29, Rémi Denis-Courmont <remi@remlab.net> wrote: >> >> >> >> Le 4 juin 2024 16:55:01 GMT+03:00, Zhao Zhili <quinkblack@foxmail.com <mailto:quinkblack@foxmail.com>> a écrit : >>> From: Zhao Zhili <zhilizhao@tencent.com> >>> >>> Test on Apple M1: >>> >>> rgb24_to_uv_1080_c: 7.2 >>> rgb24_to_uv_1080_neon: 5.5 >>> rgb24_to_uv_1280_c: 8.2 >>> rgb24_to_uv_1280_neon: 6.2 >>> rgb24_to_uv_1920_c: 12.5 >>> rgb24_to_uv_1920_neon: 9.5 >>> >>> rgb24_to_uv_half_540_c: 6.5 >>> rgb24_to_uv_half_540_neon: 3.0 >>> rgb24_to_uv_half_640_c: 7.5 >>> rgb24_to_uv_half_640_neon: 3.2 >>> rgb24_to_uv_half_960_c: 12.5 >>> rgb24_to_uv_half_960_neon: 6.0 >>> >>> rgb24_to_y_1080_c: 4.5 >>> rgb24_to_y_1080_neon: 3.5 >>> rgb24_to_y_1280_c: 5.2 >>> rgb24_to_y_1280_neon: 4.2 >>> rgb24_to_y_1920_c: 8.0 >>> rgb24_to_y_1920_neon: 6.0 >>> >>> Signed-off-by: Zhao Zhili <zhilizhao@tencent.com> >>> --- >>> libswscale/aarch64/Makefile | 1 + >>> libswscale/aarch64/input.S | 229 +++++++++++++++++++++++++++++++++++ >>> libswscale/aarch64/swscale.c | 25 ++++ >>> 3 files changed, 255 insertions(+) >>> create mode 100644 libswscale/aarch64/input.S >>> >>> diff --git a/libswscale/aarch64/Makefile b/libswscale/aarch64/Makefile >>> index da1d909561..adfd90a1b6 100644 >>> --- a/libswscale/aarch64/Makefile >>> +++ b/libswscale/aarch64/Makefile >>> @@ -3,6 +3,7 @@ OBJS += aarch64/rgb2rgb.o \ >>> aarch64/swscale_unscaled.o \ >>> >>> NEON-OBJS += aarch64/hscale.o \ >>> + aarch64/input.o \ >>> aarch64/output.o \ >>> aarch64/rgb2rgb_neon.o \ >>> aarch64/yuv2rgb_neon.o \ >>> diff --git a/libswscale/aarch64/input.S b/libswscale/aarch64/input.S >>> new file mode 100644 >>> index 0000000000..ee0d223c6e >>> --- /dev/null >>> +++ b/libswscale/aarch64/input.S >>> @@ -0,0 +1,229 @@ >>> +/* >>> + * Copyright (c) 2024 Zhao Zhili <quinkblack@foxmail.com> >>> + * >>> + * This file is part of FFmpeg. >>> + * >>> + * FFmpeg is free software; you can redistribute it and/or >>> + * modify it under the terms of the GNU Lesser General Public >>> + * License as published by the Free Software Foundation; either >>> + * version 2.1 of the License, or (at your option) any later version. >>> + * >>> + * FFmpeg is distributed in the hope that it will be useful, >>> + * but WITHOUT ANY WARRANTY; without even the implied warranty of >>> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU >>> + * Lesser General Public License for more details. 
>>> + * >>> + * You should have received a copy of the GNU Lesser General Public >>> + * License along with FFmpeg; if not, write to the Free Software >>> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA >>> + */ >>> + >>> +#include "libavutil/aarch64/asm.S" >>> + >>> +.macro rgb24_to_yuv_load_rgb, src >>> + ld3 { v16.16b, v17.16b, v18.16b }, [\src] >>> + ushll v19.8h, v16.8b, #0 // v19: r >>> + ushll v20.8h, v17.8b, #0 // v20: g >>> + ushll v21.8h, v18.8b, #0 // v21: b >>> + ushll2 v22.8h, v16.16b, #0 // v22: r >>> + ushll2 v23.8h, v17.16b, #0 // v23: g >>> + ushll2 v24.8h, v18.16b, #0 // v24: b >>> +.endm >>> + >>> +.macro rgb24_to_yuv_product, r, g, b, dst1, dst2, dst, coef0, coef1, coef2, right_shift >>> + mov \dst1\().16b, v6.16b // dst1 = const_offset >>> + mov \dst2\().16b, v6.16b // dst2 = const_offset >>> + smlal \dst1\().4s, \coef0\().4h, \r\().4h // dst1 += rx * r >>> + smlal2 \dst2\().4s, \coef0\().8h, \r\().8h // dst2 += rx * r >>> + smlal \dst1\().4s, \coef1\().4h, \g\().4h // dst1 += gx * g >>> + smlal2 \dst2\().4s, \coef1\().8h, \g\().8h // dst2 += gx * g >>> + smlal \dst1\().4s, \coef2\().4h, \b\().4h // dst1 += bx * b >>> + smlal2 \dst2\().4s, \coef2\().8h, \b\().8h // dst2 += bx * b >>> + sqshrn \dst\().4h, \dst1\().4s, \right_shift // dst_lower_half = dst1 >> right_shift >>> + sqshrn2 \dst\().8h, \dst2\().4s, \right_shift // dst_higher_half = dst2 >> right_shift >>> +.endm >>> + >>> +function ff_rgb24ToY_neon, export=1 >>> + cmp w4, #0 // check width > 0 >>> + b.le 4f >>> + >>> + ldp w10, w11, [x5], #8 // w10: ry, w11: gy >> >> I don't think it affects anything on your OoO execution hardware, but you're using the result of this load right off the bat in the next instruction. Ditto below. This may hurt perfs on not-so-fancy CPUs. > >Will do. > >> >>> + dup v0.8h, w10 >>> + dup v1.8h, w11 >>> + ldr w12, [x5] // w12: by >>> + dup v2.8h, w12 >>> + >>> + mov w9, #256 // w9 = 1 << (RGB2YUV_SHIFT - 7) >>> + movk w9, #8, lsl #16 // w9 += 32 << (RGB2YUV_SHIFT - 1) >>> + dup v6.4s, w9 // w9: const_offset >>> + >>> + mov x2, #0 // w2: i >>> + and w3, w4, #0xFFFFFFF0 // w3 = width / 16 * 16 >>> + cbz w3, 3f >>> +1: >>> + rgb24_to_yuv_load_rgb x1 >>> + rgb24_to_yuv_product v19, v20, v21, v25, v26, v16, v0, v1, v2, #9 >>> + rgb24_to_yuv_product v22, v23, v24, v27, v28, v17, v0, v1, v2, #9 >>> + stp q16, q17, [x0], #32 // store to dst >>> + >>> + add w2, w2, #16 // i += 16 >>> + add x1, x1, #48 // src += 48 >>> + cmp w2, w3 // i < (width / 16 * 16) >>> + b.lt 1b >>> + b 3f >>> +2: >>> + ldrb w13, [x1] // w13: r >>> + ldrb w14, [x1, #1] // w14: g >>> + ldrb w15, [x1, #2] // w15: b >> >> You can reorder instructions a little to use post-index and eliminate the ADD, though that won't make much difference. >> >> I don't get why the perf gain is so low, or is this an artefact of Apple CPUs? > >I have checked the assembly of C version. The compiler has done pretty well on loop unroll and >vectorize on this simple case. Uh, don't we disable auto-vectorisation in the configure script? Until/unless it is re-enabled, I think benchmarks should be done against non-auto-vectorised code, if only to stay representative of normal/default FFmpeg builds. _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". ^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [FFmpeg-devel] [PATCH 2/5] swscale/aarch64: Add rgb24 to yuv implementation 2024-06-05 6:53 ` Zhao Zhili 2024-06-05 7:34 ` Rémi Denis-Courmont @ 2024-06-05 7:34 ` Martin Storsjö 1 sibling, 0 replies; 10+ messages in thread From: Martin Storsjö @ 2024-06-05 7:34 UTC (permalink / raw) To: FFmpeg development discussions and patches On Wed, 5 Jun 2024, Zhao Zhili wrote: >> On Jun 5, 2024, at 14:29, Rémi Denis-Courmont <remi@remlab.net> wrote: >> >> Le 4 juin 2024 16:55:01 GMT+03:00, Zhao Zhili <quinkblack@foxmail.com <mailto:quinkblack@foxmail.com>> a écrit : >>> From: Zhao Zhili <zhilizhao@tencent.com> >>> >>> Test on Apple M1: >>> >>> rgb24_to_uv_1080_c: 7.2 >>> rgb24_to_uv_1080_neon: 5.5 >>> rgb24_to_uv_1280_c: 8.2 >>> rgb24_to_uv_1280_neon: 6.2 >>> rgb24_to_uv_1920_c: 12.5 >>> rgb24_to_uv_1920_neon: 9.5 >>> >>> rgb24_to_uv_half_540_c: 6.5 >>> rgb24_to_uv_half_540_neon: 3.0 >>> rgb24_to_uv_half_640_c: 7.5 >>> rgb24_to_uv_half_640_neon: 3.2 >>> rgb24_to_uv_half_960_c: 12.5 >>> rgb24_to_uv_half_960_neon: 6.0 >>> >>> rgb24_to_y_1080_c: 4.5 >>> rgb24_to_y_1080_neon: 3.5 >>> rgb24_to_y_1280_c: 5.2 >>> rgb24_to_y_1280_neon: 4.2 >>> rgb24_to_y_1920_c: 8.0 >>> rgb24_to_y_1920_neon: 6.0 >>> >>> Signed-off-by: Zhao Zhili <zhilizhao@tencent.com> >>> --- >>> libswscale/aarch64/Makefile | 1 + >>> libswscale/aarch64/input.S | 229 +++++++++++++++++++++++++++++++++++ >>> libswscale/aarch64/swscale.c | 25 ++++ >>> 3 files changed, 255 insertions(+) >>> create mode 100644 libswscale/aarch64/input.S >>> >>> diff --git a/libswscale/aarch64/Makefile b/libswscale/aarch64/Makefile >>> index da1d909561..adfd90a1b6 100644 >>> --- a/libswscale/aarch64/Makefile >>> +++ b/libswscale/aarch64/Makefile >>> @@ -3,6 +3,7 @@ OBJS += aarch64/rgb2rgb.o \ >>> aarch64/swscale_unscaled.o \ >>> >>> NEON-OBJS += aarch64/hscale.o \ >>> + aarch64/input.o \ >>> aarch64/output.o \ >>> aarch64/rgb2rgb_neon.o \ >>> aarch64/yuv2rgb_neon.o \ >>> diff --git a/libswscale/aarch64/input.S b/libswscale/aarch64/input.S >>> new file mode 100644 >>> index 0000000000..ee0d223c6e >>> --- /dev/null >>> +++ b/libswscale/aarch64/input.S >>> @@ -0,0 +1,229 @@ >>> +/* >>> + * Copyright (c) 2024 Zhao Zhili <quinkblack@foxmail.com> >>> + * >>> + * This file is part of FFmpeg. >>> + * >>> + * FFmpeg is free software; you can redistribute it and/or >>> + * modify it under the terms of the GNU Lesser General Public >>> + * License as published by the Free Software Foundation; either >>> + * version 2.1 of the License, or (at your option) any later version. >>> + * >>> + * FFmpeg is distributed in the hope that it will be useful, >>> + * but WITHOUT ANY WARRANTY; without even the implied warranty of >>> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU >>> + * Lesser General Public License for more details. 
>>> + * >>> + * You should have received a copy of the GNU Lesser General Public >>> + * License along with FFmpeg; if not, write to the Free Software >>> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA >>> + */ >>> + >>> +#include "libavutil/aarch64/asm.S" >>> + >>> +.macro rgb24_to_yuv_load_rgb, src >>> + ld3 { v16.16b, v17.16b, v18.16b }, [\src] >>> + ushll v19.8h, v16.8b, #0 // v19: r >>> + ushll v20.8h, v17.8b, #0 // v20: g >>> + ushll v21.8h, v18.8b, #0 // v21: b >>> + ushll2 v22.8h, v16.16b, #0 // v22: r >>> + ushll2 v23.8h, v17.16b, #0 // v23: g >>> + ushll2 v24.8h, v18.16b, #0 // v24: b >>> +.endm >>> + >>> +.macro rgb24_to_yuv_product, r, g, b, dst1, dst2, dst, coef0, coef1, coef2, right_shift >>> + mov \dst1\().16b, v6.16b // dst1 = const_offset >>> + mov \dst2\().16b, v6.16b // dst2 = const_offset >>> + smlal \dst1\().4s, \coef0\().4h, \r\().4h // dst1 += rx * r >>> + smlal2 \dst2\().4s, \coef0\().8h, \r\().8h // dst2 += rx * r >>> + smlal \dst1\().4s, \coef1\().4h, \g\().4h // dst1 += gx * g >>> + smlal2 \dst2\().4s, \coef1\().8h, \g\().8h // dst2 += gx * g >>> + smlal \dst1\().4s, \coef2\().4h, \b\().4h // dst1 += bx * b >>> + smlal2 \dst2\().4s, \coef2\().8h, \b\().8h // dst2 += bx * b >>> + sqshrn \dst\().4h, \dst1\().4s, \right_shift // dst_lower_half = dst1 >> right_shift >>> + sqshrn2 \dst\().8h, \dst2\().4s, \right_shift // dst_higher_half = dst2 >> right_shift >>> +.endm >>> + >>> +function ff_rgb24ToY_neon, export=1 >>> + cmp w4, #0 // check width > 0 >>> + b.le 4f >>> + >>> + ldp w10, w11, [x5], #8 // w10: ry, w11: gy >> >> I don't think it affects anything on your OoO execution hardware, but you're using the result of this load right off the bat in the next instruction. Ditto below. This may hurt perfs on not-so-fancy CPUs. > > Will do. > >> >>> + dup v0.8h, w10 >>> + dup v1.8h, w11 >>> + ldr w12, [x5] // w12: by >>> + dup v2.8h, w12 >>> + >>> + mov w9, #256 // w9 = 1 << (RGB2YUV_SHIFT - 7) >>> + movk w9, #8, lsl #16 // w9 += 32 << (RGB2YUV_SHIFT - 1) >>> + dup v6.4s, w9 // w9: const_offset >>> + >>> + mov x2, #0 // w2: i >>> + and w3, w4, #0xFFFFFFF0 // w3 = width / 16 * 16 >>> + cbz w3, 3f >>> +1: >>> + rgb24_to_yuv_load_rgb x1 >>> + rgb24_to_yuv_product v19, v20, v21, v25, v26, v16, v0, v1, v2, #9 >>> + rgb24_to_yuv_product v22, v23, v24, v27, v28, v17, v0, v1, v2, #9 >>> + stp q16, q17, [x0], #32 // store to dst >>> + >>> + add w2, w2, #16 // i += 16 >>> + add x1, x1, #48 // src += 48 >>> + cmp w2, w3 // i < (width / 16 * 16) >>> + b.lt 1b >>> + b 3f >>> +2: >>> + ldrb w13, [x1] // w13: r >>> + ldrb w14, [x1, #1] // w14: g >>> + ldrb w15, [x1, #2] // w15: b >> >> You can reorder instructions a little to use post-index and eliminate the ADD, though that won't make much difference. >> >> I don't get why the perf gain is so low, or is this an artefact of Apple CPUs? > > I have checked the assembly of C version. The compiler has done pretty well on loop unroll and > vectorize on this simple case. To add some context here; ffmpeg's configure disables autovectorization with GCC (as it does miscompile things semi regularly), but not with Clang. This can give somewhat misleading numbers wrt the relative speedup. Then additionally, the Apple CPUs do have slightly different performance characteristics than other cores too, indeed. Plus the very coarse timer used on macOS doesn't help either... 
FWIW, here are some numbers for this patch from some more traditional CPUs,
with a GCC build:

                              Cortex A53      A72       A78
rgb24_to_uv_1080_c:              19471.5   8720.7    7049.7
rgb24_to_uv_1080_neon:            5922.7   3147.5    2274.5
rgb24_to_uv_1280_c:              23067.0  10318.2    8348.5
rgb24_to_uv_1280_neon:            6842.5   3672.5    2656.5
rgb24_to_uv_1920_c:              34595.2  15483.2   12509.7
rgb24_to_uv_1920_neon:           10246.0   5496.7    3976.5

rgb24_to_uv_half_540_c:          11396.0   5481.0    4576.0
rgb24_to_uv_half_540_neon:        3655.7   1687.5    1382.5
rgb24_to_uv_half_640_c:          13546.0   6480.2    5399.0
rgb24_to_uv_half_640_neon:        4202.7   1958.2    1611.2
rgb24_to_uv_half_960_c:          20311.0   9724.2    8068.2
rgb24_to_uv_half_960_neon:        6282.7   2934.2    2372.2

rgb24_to_y_1080_c:               12984.2   4339.7    4074.2
rgb24_to_y_1080_neon:             3492.5   1960.5    1444.7
rgb24_to_y_1280_c:               15384.2   6709.2    4823.5
rgb24_to_y_1280_neon:             4038.2   2265.0    1674.0
rgb24_to_y_1920_c:               23069.7   7708.7    7224.7
rgb24_to_y_1920_neon:             6036.2   3389.0    2514.0

// Martin

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 10+ messages in thread
* Re: [FFmpeg-devel] [PATCH 2/5] swscale/aarch64: Add rgb24 to yuv implementation 2024-06-04 13:55 ` [FFmpeg-devel] [PATCH 2/5] swscale/aarch64: Add rgb24 to yuv implementation Zhao Zhili 2024-06-05 6:29 ` Rémi Denis-Courmont @ 2024-06-05 8:16 ` Martin Storsjö 1 sibling, 0 replies; 10+ messages in thread From: Martin Storsjö @ 2024-06-05 8:16 UTC (permalink / raw) To: FFmpeg development discussions and patches; +Cc: Zhao Zhili On Tue, 4 Jun 2024, Zhao Zhili wrote: > From: Zhao Zhili <zhilizhao@tencent.com> > > Test on Apple M1: > > rgb24_to_uv_1080_c: 7.2 > rgb24_to_uv_1080_neon: 5.5 > rgb24_to_uv_1280_c: 8.2 > rgb24_to_uv_1280_neon: 6.2 > rgb24_to_uv_1920_c: 12.5 > rgb24_to_uv_1920_neon: 9.5 > > rgb24_to_uv_half_540_c: 6.5 > rgb24_to_uv_half_540_neon: 3.0 > rgb24_to_uv_half_640_c: 7.5 > rgb24_to_uv_half_640_neon: 3.2 > rgb24_to_uv_half_960_c: 12.5 > rgb24_to_uv_half_960_neon: 6.0 > > rgb24_to_y_1080_c: 4.5 > rgb24_to_y_1080_neon: 3.5 > rgb24_to_y_1280_c: 5.2 > rgb24_to_y_1280_neon: 4.2 > rgb24_to_y_1920_c: 8.0 > rgb24_to_y_1920_neon: 6.0 > > Signed-off-by: Zhao Zhili <zhilizhao@tencent.com> > --- > libswscale/aarch64/Makefile | 1 + > libswscale/aarch64/input.S | 229 +++++++++++++++++++++++++++++++++++ > libswscale/aarch64/swscale.c | 25 ++++ > 3 files changed, 255 insertions(+) > create mode 100644 libswscale/aarch64/input.S > > diff --git a/libswscale/aarch64/Makefile b/libswscale/aarch64/Makefile > index da1d909561..adfd90a1b6 100644 > --- a/libswscale/aarch64/Makefile > +++ b/libswscale/aarch64/Makefile > @@ -3,6 +3,7 @@ OBJS += aarch64/rgb2rgb.o \ > aarch64/swscale_unscaled.o \ > > NEON-OBJS += aarch64/hscale.o \ > + aarch64/input.o \ > aarch64/output.o \ > aarch64/rgb2rgb_neon.o \ > aarch64/yuv2rgb_neon.o \ > diff --git a/libswscale/aarch64/input.S b/libswscale/aarch64/input.S > new file mode 100644 > index 0000000000..ee0d223c6e > --- /dev/null > +++ b/libswscale/aarch64/input.S > @@ -0,0 +1,229 @@ > +/* > + * Copyright (c) 2024 Zhao Zhili <quinkblack@foxmail.com> > + * > + * This file is part of FFmpeg. > + * > + * FFmpeg is free software; you can redistribute it and/or > + * modify it under the terms of the GNU Lesser General Public > + * License as published by the Free Software Foundation; either > + * version 2.1 of the License, or (at your option) any later version. > + * > + * FFmpeg is distributed in the hope that it will be useful, > + * but WITHOUT ANY WARRANTY; without even the implied warranty of > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + * Lesser General Public License for more details. > + * > + * You should have received a copy of the GNU Lesser General Public > + * License along with FFmpeg; if not, write to the Free Software > + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA > + */ > + > +#include "libavutil/aarch64/asm.S" > + > +.macro rgb24_to_yuv_load_rgb, src > + ld3 { v16.16b, v17.16b, v18.16b }, [\src] > + ushll v19.8h, v16.8b, #0 // v19: r > + ushll v20.8h, v17.8b, #0 // v20: g > + ushll v21.8h, v18.8b, #0 // v21: b > + ushll2 v22.8h, v16.16b, #0 // v22: r > + ushll2 v23.8h, v17.16b, #0 // v23: g > + ushll2 v24.8h, v18.16b, #0 // v24: b Doing "ushll #0" is perhaps a bit unusual, the common thing would be "uxtl" instead. It doesn't matter in practice though, it assembles to the same instruction anyway. 
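
(In other words, the more idiomatic spelling would be, with identical
encodings:)

        uxtl    v19.8h, v16.8b         // same as: ushll  v19.8h, v16.8b,  #0
        uxtl2   v22.8h, v16.16b        // same as: ushll2 v22.8h, v16.16b, #0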
> +.endm > + > +.macro rgb24_to_yuv_product, r, g, b, dst1, dst2, dst, coef0, coef1, coef2, right_shift > + mov \dst1\().16b, v6.16b // dst1 = const_offset > + mov \dst2\().16b, v6.16b // dst2 = const_offset > + smlal \dst1\().4s, \coef0\().4h, \r\().4h // dst1 += rx * r > + smlal2 \dst2\().4s, \coef0\().8h, \r\().8h // dst2 += rx * r > + smlal \dst1\().4s, \coef1\().4h, \g\().4h // dst1 += gx * g > + smlal2 \dst2\().4s, \coef1\().8h, \g\().8h // dst2 += gx * g > + smlal \dst1\().4s, \coef2\().4h, \b\().4h // dst1 += bx * b > + smlal2 \dst2\().4s, \coef2\().8h, \b\().8h // dst2 += bx * b For sequences like this, the Cortex A53 (and iirc at least the A55 too) has got a fastpath; if you do multiple consequent smlal/smlsl (or regular mla/mls) into the same register, you actually save a lot of time. E.g. instead of this: smlal dst1 smlal dst2 smlal dst1 smlal dst2 smlal dst1 smlal dst2 Do this: smlal dst1 smlal dst1 smlal dst1 smlal dst2 smlal dst2 smlal dst2 For in-order cores (with this special fastpath - it is indeed a bit non-obvious) this makes a huge difference, and for out of order cores, they can reorder it as they prefer anyway (as this is not a very long instruction sequence). This makes a massive difference for the in-order cores. Before: Cortex A53 A72 A73 rgb24_to_y_1920_neon: 6032.7 3385.7 2514.0 After: rgb24_to_y_1920_neon: 5072.7 3388.2 2522.0 A 19% speedup on A53 with just with this one change, and it makes almost no difference for the other cores (mostly within measurement noise). > +function ff_rgb24ToY_neon, export=1 > + cmp w4, #0 // check width > 0 > + b.le 4f > + > + ldp w10, w11, [x5], #8 // w10: ry, w11: gy > + dup v0.8h, w10 > + dup v1.8h, w11 > + ldr w12, [x5] // w12: by > + dup v2.8h, w12 > + > + mov w9, #256 // w9 = 1 << (RGB2YUV_SHIFT - 7) > + movk w9, #8, lsl #16 // w9 += 32 << (RGB2YUV_SHIFT - 1) > + dup v6.4s, w9 // w9: const_offset > + > + mov x2, #0 // w2: i > + and w3, w4, #0xFFFFFFF0 // w3 = width / 16 * 16 > + cbz w3, 3f This blindly assumes that if width > 0, then width is also >= 16 at least, as we always run the SIMD codepath once here. Is this a requirement in swscale, or should we check whether width >= 16 here too? > +1: > + rgb24_to_yuv_load_rgb x1 > + rgb24_to_yuv_product v19, v20, v21, v25, v26, v16, v0, v1, v2, #9 > + rgb24_to_yuv_product v22, v23, v24, v27, v28, v17, v0, v1, v2, #9 > + stp q16, q17, [x0], #32 // store to dst > + > + add w2, w2, #16 // i += 16 > + add x1, x1, #48 // src += 48 > + cmp w2, w3 // i < (width / 16 * 16) > + b.lt 1b For the in-order cores, we can help with scheduling here too. You can do the add/add/cmp before the stp. This adds a bit more distance between the calculation of q17 and the store of it, and adds more distance between the cmp and the b.lt that depends on it. Secondly, it's a bit uncommon in SIMD like this, to count upwards (incrementing i); instead, the common pattern is to keep the width in one register and count it down. Here, you can then do "cmp w2, #16", so you don't need to keep a register with the aligned end value. And for the scalar codepath, when approaching zero, you can do "subs w2, #1", "b.gt 2b", so you don't need two instructions for add+cmp. 
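
(Putting these suggestions together, the main loop of ff_rgb24ToY_neon might
be restructured along these lines. An untested sketch that reuses w4, the
width argument, as the remaining-pixel counter:)

        cmp     w4, #16
        b.lt    2f                     // fewer than 16 pixels left
1:
        rgb24_to_yuv_load_rgb x1
        rgb24_to_yuv_product v19, v20, v21, v25, v26, v16, v0, v1, v2, #9
        rgb24_to_yuv_product v22, v23, v24, v27, v28, v17, v0, v1, v2, #9
        sub     w4, w4, #16            // width -= 16
        add     x1, x1, #48            // src += 48
        cmp     w4, #16                // no separate end-value register needed
        stp     q16, q17, [x0], #32    // store moved after the bookkeeping
        b.ge    1b
2:
        cbz     w4, 4f
3:
        // ... scalar per-pixel code as before ...
        subs    w4, w4, #1             // one instruction instead of add + cmp
        b.gt    3b
4:
        ret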
> + b 3f > +2: > + ldrb w13, [x1] // w13: r > + ldrb w14, [x1, #1] // w14: g > + ldrb w15, [x1, #2] // w15: b > + > + smaddl x13, w13, w10, x9 // x13 = ry * r + const_offset > + smaddl x13, w14, w11, x13 // x13 += gy * g > + smaddl x13, w15, w12, x13 // x13 += by * b > + asr w13, w13, #9 // x13 >>= 9 > + strh w13, [x0], #2 // store to dst > + > + add w2, w2, #1 // i++ > + add x1, x1, #3 // src += 3 > +3: > + cmp w2, w4 // i < width > + b.lt 2b > +4: > + ret > +endfunc > + > +.macro rgb24_load_uv_coeff half > + add x6, x6, #12 > + > + ldp w10, w11, [x6], #8 // w10: ru, w11: gu > + dup v0.8h, w10 > + dup v1.8h, w11 > + > + ldp w12, w13, [x6], #8 // w12: bu, w13: rv > + dup v2.8h, w12 > + dup v3.8h, w13 > + > + ldp w14, w15, [x6], #8 // w14: gv, w15: bv > + dup v4.8h, w14 > + dup v5.8h, w15 As Remi mentioned, scheduling instructions like these can help a bit. But here, each "ldp" depends on the updated x6 from the previous one, so it maybe doesn't make much difference. > + > + .if \half > + mov w9, #512 > + movk w9, #128, lsl #16 // w9: const_offset > + .else > + mov w9, #256 > + movk w9, #64, lsl #16 // w9: const_offset > + .endif > + dup v6.4s, w9 > +.endm > + > +function ff_rgb24ToUV_half_neon, export=1 > + cmp w5, #0 // check width > 0 > + b.le 4f > + > + rgb24_load_uv_coeff half=1 > + > + mov x9, #0 // x9: i > + and w7, w5, #0xFFFFFFF8 // w7 = width / 8 * 8 > + cbz w7, 3f > +1: > + ld3 { v16.16b, v17.16b, v18.16b }, [x3] > + uaddlp v19.8h, v16.16b // v19: r > + uaddlp v20.8h, v17.16b // v20: g > + uaddlp v21.8h, v18.16b // v21: b > + > + rgb24_to_yuv_product v19, v20, v21, v22, v23, v16, v0, v1, v2, #10 > + str q16, [x0], #16 // store dst_u > + rgb24_to_yuv_product v19, v20, v21, v24, v25, v17, v3, v4, v5, #10 > + str q17, [x1], #16 // store dst_v > + > + add w9, w9, #8 // i += 8 > + add x3, x3, #48 // src += 48 > + cmp w9, w7 // i < (width * 8 / 8) > + b.lt 1b Here, you can also move the both str out to after the add/cmp, like above. 
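
(That is, for the ff_rgb24ToUV_half_neon loop, something along these lines;
sketch only, with just the two stores moved:)

1:
        ld3     { v16.16b, v17.16b, v18.16b }, [x3]
        uaddlp  v19.8h, v16.16b        // v19: r
        uaddlp  v20.8h, v17.16b        // v20: g
        uaddlp  v21.8h, v18.16b        // v21: b

        rgb24_to_yuv_product v19, v20, v21, v22, v23, v16, v0, v1, v2, #10
        rgb24_to_yuv_product v19, v20, v21, v24, v25, v17, v3, v4, v5, #10
        add     w9, w9, #8             // i += 8
        add     x3, x3, #48            // src += 48
        cmp     w9, w7
        str     q16, [x0], #16         // store dst_u after the bookkeeping
        str     q17, [x1], #16         // store dst_v
        b.lt    1b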
> + b 3f > +2: > + ldrb w2, [x3] // w2: r1 > + ldrb w4, [x3, #3] // w4: r2 > + add w2, w2, w4 // w2 = r1 + r2 > + > + ldrb w4, [x3, #1] // w4: g1 > + ldrb w7, [x3, #4] // w7: g2 > + add w4, w4, w7 // w4 = g1 + g2 > + > + ldrb w7, [x3, #2] // w7: b1 > + ldrb w8, [x3, #5] // w8: b2 > + add w7, w7, w8 // w7 = b1 + b2 > + > + umov w8, v6.s[0] // dst_u = const_offset > + smaddl x8, w2, w10, x8 // dst_u += ru * r > + smaddl x8, w4, w11, x8 // dst_u += gu * g > + smaddl x8, w7, w12, x8 // dst_u += bu * b > + asr x8, x8, #10 // dst_u >>= 10 > + strh w8, [x0], #2 // store dst_u > + > + umov w8, v6.s[0] // dst_v = const_offset > + smaddl x8, w2, w13, x8 // dst_v += rv * r > + smaddl x8, w4, w14, x8 // dst_v += gv * g > + smaddl x8, w7, w15, x8 // dst_v += bv * b > + asr x8, x8, #10 // dst_v >>= 10 > + strh w8, [x1], #2 // store dst_v > + > + add w9, w9, #1 // i++ > + add x3, x3, #6 // src += 6 > +3: > + cmp w9, w5 > + b.lt 2b > +4: > + ret > +endfunc > + > +function ff_rgb24ToUV_neon, export=1 > + cmp w5, #0 // check width > 0 > + b.le 4f > + > + rgb24_load_uv_coeff half=0 > + > + mov x2, #0 // w2: i > + and w4, w5, #0xFFFFFFF0 // w4: width / 16 * 16 > + cbz w4, 3f > +1: > + rgb24_to_yuv_load_rgb x3 > + rgb24_to_yuv_product v19, v20, v21, v25, v26, v16, v0, v1, v2, #9 > + rgb24_to_yuv_product v22, v23, v24, v27, v28, v17, v0, v1, v2, #9 > + stp q16, q17, [x0], #32 // store to dst_u > + rgb24_to_yuv_product v19, v20, v21, v25, v26, v16, v3, v4, v5, #9 > + rgb24_to_yuv_product v22, v23, v24, v27, v28, v17, v3, v4, v5, #9 > + stp q16, q17, [x1], #32 // store to dst_v If you'd make the second pair of rgb24_to_yuv_product write into v18/v19 instead of v16/v17, you can move them out to after the add/add/cmp. (It doesn't make much of a measurable difference, but is good scheduling in general.) > + > + add w2, w2, #16 // i += 16 > + add x3, x3, #48 // src += 48 > + cmp w2, w4 // i < (width / 16 * 16) > + b.lt 1b > + b 3f > +2: > + ldrb w16, [x3] // w16: r > + ldrb w17, [x3, #1] // w17: g > + ldrb w4, [x3, #2] // w4: b > + > + umov w7, v6.s[0] // w7 = const_offset SIMD->GPR moves are generally expensive, we shouldn't be doing this within the loop, if the value is constant. Instead, you should do this in a prologue to the scalar codepath. But in this case, the value should already be present in w9, if I understand the code correctly, so we don't need to fetch it from v6 - and this is already what you do in ff_rgb24ToY_neon. // Martin _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". ^ permalink raw reply [flat|nested] 10+ messages in thread
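
(Concretely, since rgb24_load_uv_coeff already leaves const_offset in w9, and
a write to w9 zero-extends into x9, the scalar tail of ff_rgb24ToUV_neon could
drop the per-iteration umov entirely; an untested sketch:)

2:
        ldrb    w16, [x3]              // w16: r
        ldrb    w17, [x3, #1]          // w17: g
        ldrb    w4,  [x3, #2]          // w4:  b

        smaddl  x8, w16, w10, x9       // x8 = ru * r + const_offset
        smaddl  x8, w17, w11, x8       // x8 += gu * g
        smaddl  x8, w4,  w12, x8       // x8 += bu * b
        asr     w8, w8, #9
        strh    w8, [x0], #2           // store to dst_u

        smaddl  x8, w16, w13, x9       // x8 = rv * r + const_offset
        smaddl  x8, w17, w14, x8       // x8 += gv * g
        smaddl  x8, w4,  w15, x8       // x8 += bv * b
        asr     w8, w8, #9
        strh    w8, [x1], #2           // store to dst_v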
* [FFmpeg-devel] [PATCH 3/5] avutil/aarch64: Skip define AV_READ_TIME for apple
       [not found] <20240604135504.83169-1-quinkblack@foxmail.com>
  2024-06-04 13:55 ` [FFmpeg-devel] [PATCH 2/5] swscale/aarch64: Add rgb24 to yuv implementation Zhao Zhili
@ 2024-06-04 13:55 ` Zhao Zhili
  2024-06-04 13:55 ` [FFmpeg-devel] [PATCH 4/5] avutil/timer: Add clock_gettime as a fallback of AV_READ_TIME Zhao Zhili
  2024-06-04 13:55 ` [FFmpeg-devel] [PATCH 5/5] avutil/aarch64: Fallback to clock_gettime as timer on Android Zhao Zhili
  3 siblings, 0 replies; 10+ messages in thread
From: Zhao Zhili @ 2024-06-04 13:55 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: Zhao Zhili

From: Zhao Zhili <zhilizhao@tencent.com>

It will fall back to mach_absolute_time inside libavutil/timer.h.
---
 libavutil/aarch64/timer.h | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/libavutil/aarch64/timer.h b/libavutil/aarch64/timer.h
index 8b28fd354c..fadc9568f8 100644
--- a/libavutil/aarch64/timer.h
+++ b/libavutil/aarch64/timer.h
@@ -24,13 +24,7 @@
 #include <stdint.h>
 #include "config.h"
 
-#if defined(__APPLE__)
-
-#include <mach/mach_time.h>
-
-#define AV_READ_TIME mach_absolute_time
-
-#elif HAVE_INLINE_ASM
+#if HAVE_INLINE_ASM && !defined(__APPLE__)
 
 #define AV_READ_TIME read_time
-- 
2.42.0
* [FFmpeg-devel] [PATCH 4/5] avutil/timer: Add clock_gettime as a fallback of AV_READ_TIME
       [not found] <20240604135504.83169-1-quinkblack@foxmail.com>
  2024-06-04 13:55 ` [FFmpeg-devel] [PATCH 2/5] swscale/aarch64: Add rgb24 to yuv implementation Zhao Zhili
  2024-06-04 13:55 ` [FFmpeg-devel] [PATCH 3/5] avutil/aarch64: Skip define AV_READ_TIME for apple Zhao Zhili
@ 2024-06-04 13:55 ` Zhao Zhili
  2024-06-04 13:55 ` [FFmpeg-devel] [PATCH 5/5] avutil/aarch64: Fallback to clock_gettime as timer on Android Zhao Zhili
  3 siblings, 0 replies; 10+ messages in thread
From: Zhao Zhili @ 2024-06-04 13:55 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: Zhao Zhili

From: Zhao Zhili <zhilizhao@tencent.com>

---
 libavutil/timer.h | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/libavutil/timer.h b/libavutil/timer.h
index 2cd299eca3..74c4d84e69 100644
--- a/libavutil/timer.h
+++ b/libavutil/timer.h
@@ -46,6 +46,8 @@
 #include "macos_kperf.h"
 #elif HAVE_MACH_ABSOLUTE_TIME
 #include <mach/mach_time.h>
+#elif HAVE_CLOCK_GETTIME
+#include <time.h>
 #endif
 
 #include "common.h"
@@ -70,6 +72,9 @@
 #  define AV_READ_TIME gethrtime
 # elif HAVE_MACH_ABSOLUTE_TIME
 #  define AV_READ_TIME mach_absolute_time
+# elif HAVE_CLOCK_GETTIME && defined(CLOCK_MONOTONIC)
+#  include "libavutil/time.h"
+#  define AV_READ_TIME av_gettime_relative
 # endif
 #endif
-- 
2.42.0
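For context (not part of the patch): AV_READ_TIME is what the
START_TIMER/STOP_TIMER profiling macros in libavutil/timer.h build on, so a
working fallback keeps timing available on platforms without a readable
cycle counter. A minimal usage sketch, with do_something() as a hypothetical
stand-in for the code being timed:

    #include "libavutil/timer.h"

    extern void do_something(void);   /* placeholder for the code being timed */

    static void profile_do_something(void)
    {
        /* START_TIMER samples AV_READ_TIME() before the block; STOP_TIMER
         * samples it again and logs the elapsed ticks under the given tag. */
        START_TIMER
        do_something();
        STOP_TIMER("do_something")
    }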
* [FFmpeg-devel] [PATCH 5/5] avutil/aarch64: Fallback to clock_gettime as timer on Android
       [not found] <20240604135504.83169-1-quinkblack@foxmail.com>
                   ` (2 preceding siblings ...)
  2024-06-04 13:55 ` [FFmpeg-devel] [PATCH 4/5] avutil/timer: Add clock_gettime as a fallback of AV_READ_TIME Zhao Zhili
@ 2024-06-04 13:55 ` Zhao Zhili
  2024-06-04 20:25 ` Martin Storsjö
  3 siblings, 1 reply; 10+ messages in thread
From: Zhao Zhili @ 2024-06-04 13:55 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: Zhao Zhili

From: Zhao Zhili <zhilizhao@tencent.com>

The inline asm doesn't work on Android.
---
 libavutil/aarch64/timer.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libavutil/aarch64/timer.h b/libavutil/aarch64/timer.h
index fadc9568f8..13a58b48e4 100644
--- a/libavutil/aarch64/timer.h
+++ b/libavutil/aarch64/timer.h
@@ -24,7 +24,7 @@
 #include <stdint.h>
 #include "config.h"
 
-#if HAVE_INLINE_ASM && !defined(__APPLE__)
+#if HAVE_INLINE_ASM && !defined(__APPLE__) && !defined(__ANDROID__)
 
 #define AV_READ_TIME read_time
-- 
2.42.0
* Re: [FFmpeg-devel] [PATCH 5/5] avutil/aarch64: Fallback to clock_gettime as timer on Android
  2024-06-04 13:55 ` [FFmpeg-devel] [PATCH 5/5] avutil/aarch64: Fallback to clock_gettime as timer on Android Zhao Zhili
@ 2024-06-04 20:25 ` Martin Storsjö
  0 siblings, 0 replies; 10+ messages in thread
From: Martin Storsjö @ 2024-06-04 20:25 UTC (permalink / raw)
  To: FFmpeg development discussions and patches; +Cc: Zhao Zhili

On Tue, 4 Jun 2024, Zhao Zhili wrote:

> From: Zhao Zhili <zhilizhao@tencent.com>
>
> The inline asm doesn't work on Android.

Using pmccntr_el0 doesn't work, no, but instead of falling back to
clock_gettime, you may want to use cntvct_el0 instead of pmccntr_el0. IIRC
that works on Android, at least it worked a number of years ago. It has less
precision than pmccntr_el0, but maybe it is better than clock_gettime?

I.e., use similar inline assembly as before, but with cntvct_el0 instead of
pmccntr_el0.

// Martin
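A sketch of what that could look like in libavutil/aarch64/timer.h
(illustrative only, mirroring the shape of the existing read_time() helper;
not a tested patch):

    static inline uint64_t read_time(void)
    {
        uint64_t cntvct;
        /* cntvct_el0 is the generic timer's virtual counter; unlike
         * pmccntr_el0 it is normally readable from EL0 on Android/Linux,
         * at the cost of a coarser tick rate. */
        __asm__ volatile("isb\n\t"
                         "mrs %0, cntvct_el0"
                         : "=r"(cntvct) :: "memory");
        return cntvct;
    }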