Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
* [FFmpeg-devel] [PATCH 2/5] swscale/aarch64: Add rgb24 to yuv implementation
       [not found] <20240604135504.83169-1-quinkblack@foxmail.com>
@ 2024-06-04 13:55 ` Zhao Zhili
  2024-06-05  6:29   ` Rémi Denis-Courmont
  2024-06-05  8:16   ` Martin Storsjö
  2024-06-04 13:55 ` [FFmpeg-devel] [PATCH 3/5] avutil/aarch64: Skip define AV_READ_TIME for apple Zhao Zhili
                   ` (2 subsequent siblings)
  3 siblings, 2 replies; 10+ messages in thread
From: Zhao Zhili @ 2024-06-04 13:55 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: Zhao Zhili

From: Zhao Zhili <zhilizhao@tencent.com>

Test on Apple M1:

rgb24_to_uv_1080_c: 7.2
rgb24_to_uv_1080_neon: 5.5
rgb24_to_uv_1280_c: 8.2
rgb24_to_uv_1280_neon: 6.2
rgb24_to_uv_1920_c: 12.5
rgb24_to_uv_1920_neon: 9.5

rgb24_to_uv_half_540_c: 6.5
rgb24_to_uv_half_540_neon: 3.0
rgb24_to_uv_half_640_c: 7.5
rgb24_to_uv_half_640_neon: 3.2
rgb24_to_uv_half_960_c: 12.5
rgb24_to_uv_half_960_neon: 6.0

rgb24_to_y_1080_c: 4.5
rgb24_to_y_1080_neon: 3.5
rgb24_to_y_1280_c: 5.2
rgb24_to_y_1280_neon: 4.2
rgb24_to_y_1920_c: 8.0
rgb24_to_y_1920_neon: 6.0

Signed-off-by: Zhao Zhili <zhilizhao@tencent.com>
---
 libswscale/aarch64/Makefile  |   1 +
 libswscale/aarch64/input.S   | 229 +++++++++++++++++++++++++++++++++++
 libswscale/aarch64/swscale.c |  25 ++++
 3 files changed, 255 insertions(+)
 create mode 100644 libswscale/aarch64/input.S
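
For reference, the per-pixel computation that ff_rgb24ToY_neon vectorizes can be
sketched in scalar C as below. This is only an illustrative sketch derived from
the coefficients, offset and shift visible in the assembly that follows (the
name rgb24_to_y_ref and the coeff[] layout are placeholders, not the exact
libswscale C code); the NEON version additionally saturates when narrowing with
sqshrn:

    #include <stdint.h>

    static void rgb24_to_y_ref(int16_t *dst, const uint8_t *src,
                               int width, const int32_t *coeff)
    {
        const int32_t ry = coeff[0], gy = coeff[1], by = coeff[2];
        /* same constant the asm builds with mov/movk: (32 << 14) + (1 << 8) */
        const int32_t const_offset = (32 << 14) + (1 << 8);

        for (int i = 0; i < width; i++) {
            int r = src[3 * i + 0];
            int g = src[3 * i + 1];
            int b = src[3 * i + 2];
            dst[i] = (ry * r + gy * g + by * b + const_offset) >> 9;
        }
    }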

diff --git a/libswscale/aarch64/Makefile b/libswscale/aarch64/Makefile
index da1d909561..adfd90a1b6 100644
--- a/libswscale/aarch64/Makefile
+++ b/libswscale/aarch64/Makefile
@@ -3,6 +3,7 @@ OBJS        += aarch64/rgb2rgb.o                \
                aarch64/swscale_unscaled.o       \
 
 NEON-OBJS   += aarch64/hscale.o                 \
+               aarch64/input.o                  \
                aarch64/output.o                 \
                aarch64/rgb2rgb_neon.o           \
                aarch64/yuv2rgb_neon.o           \
diff --git a/libswscale/aarch64/input.S b/libswscale/aarch64/input.S
new file mode 100644
index 0000000000..ee0d223c6e
--- /dev/null
+++ b/libswscale/aarch64/input.S
@@ -0,0 +1,229 @@
+/*
+ * Copyright (c) 2024 Zhao Zhili <quinkblack@foxmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+
+.macro rgb24_to_yuv_load_rgb, src
+        ld3             { v16.16b, v17.16b, v18.16b }, [\src]
+        ushll           v19.8h, v16.8b, #0         // v19: r
+        ushll           v20.8h, v17.8b, #0         // v20: g
+        ushll           v21.8h, v18.8b, #0         // v21: b
+        ushll2          v22.8h, v16.16b, #0        // v22: r
+        ushll2          v23.8h, v17.16b, #0        // v23: g
+        ushll2          v24.8h, v18.16b, #0        // v24: b
+.endm
+
+.macro rgb24_to_yuv_product, r, g, b, dst1, dst2, dst, coef0, coef1, coef2, right_shift
+        mov             \dst1\().16b, v6.16b                    // dst1 = const_offset
+        mov             \dst2\().16b, v6.16b                    // dst2 = const_offset
+        smlal           \dst1\().4s, \coef0\().4h, \r\().4h     // dst1 += rx * r
+        smlal2          \dst2\().4s, \coef0\().8h, \r\().8h     // dst2 += rx * r
+        smlal           \dst1\().4s, \coef1\().4h, \g\().4h     // dst1 += gx * g
+        smlal2          \dst2\().4s, \coef1\().8h, \g\().8h     // dst2 += gx * g
+        smlal           \dst1\().4s, \coef2\().4h, \b\().4h     // dst1 += bx * b
+        smlal2          \dst2\().4s, \coef2\().8h, \b\().8h     // dst2 += bx * b
+        sqshrn          \dst\().4h, \dst1\().4s, \right_shift   // dst_lower_half = dst1 >> right_shift
+        sqshrn2         \dst\().8h, \dst2\().4s, \right_shift   // dst_higher_half = dst2 >> right_shift
+.endm
+
+function ff_rgb24ToY_neon, export=1
+        cmp             w4, #0                  // check width > 0
+        b.le            4f
+
+        ldp             w10, w11, [x5], #8       // w10: ry, w11: gy
+        dup             v0.8h, w10
+        dup             v1.8h, w11
+        ldr             w12, [x5]               // w12: by
+        dup             v2.8h, w12
+
+        mov             w9, #256                // w9 = 1 << (RGB2YUV_SHIFT - 7)
+        movk            w9, #8, lsl #16         // w9 += 32 << (RGB2YUV_SHIFT - 1)
+        dup             v6.4s, w9               // w9: const_offset
+
+        mov             x2, #0                  // w2: i
+        and             w3, w4, #0xFFFFFFF0     // w3 = width / 16 * 16
+        cbz             w3, 3f
+1:
+        rgb24_to_yuv_load_rgb x1
+        rgb24_to_yuv_product v19, v20, v21, v25, v26, v16, v0, v1, v2, #9
+        rgb24_to_yuv_product v22, v23, v24, v27, v28, v17, v0, v1, v2, #9
+        stp             q16, q17, [x0], #32     // store to dst
+
+        add             w2, w2, #16             // i += 16
+        add             x1, x1, #48             // src += 48
+        cmp             w2, w3                  // i < (width / 16 * 16)
+        b.lt            1b
+        b               3f
+2:
+        ldrb            w13, [x1]               // w13: r
+        ldrb            w14, [x1, #1]           // w14: g
+        ldrb            w15, [x1, #2]           // w15: b
+
+        smaddl          x13, w13, w10, x9       // x13 = ry * r + const_offset
+        smaddl          x13, w14, w11, x13      // x13 += gy * g
+        smaddl          x13, w15, w12, x13      // x13 += by * b
+        asr             w13, w13, #9            // x13 >>= 9
+        strh            w13, [x0], #2           // store to dst
+
+        add             w2, w2, #1              // i++
+        add             x1, x1, #3              // src += 3
+3:
+        cmp             w2, w4                  // i < width
+        b.lt            2b
+4:
+        ret
+endfunc
+
+.macro rgb24_load_uv_coeff half
+        add             x6, x6, #12
+
+        ldp             w10, w11, [x6], #8      // w10: ru, w11: gu
+        dup             v0.8h, w10
+        dup             v1.8h, w11
+
+        ldp             w12, w13, [x6], #8      // w12: bu, w13: rv
+        dup             v2.8h, w12
+        dup             v3.8h, w13
+
+        ldp             w14, w15, [x6], #8      // w14: gv, w15: bv
+        dup             v4.8h, w14
+        dup             v5.8h, w15
+
+    .if \half
+        mov             w9, #512
+        movk            w9, #128, lsl #16       // w9: const_offset
+    .else
+        mov             w9, #256
+        movk            w9, #64, lsl #16        // w9: const_offset
+    .endif
+        dup             v6.4s, w9
+.endm
+
+function ff_rgb24ToUV_half_neon, export=1
+        cmp             w5, #0          // check width > 0
+        b.le            4f
+
+        rgb24_load_uv_coeff half=1
+
+        mov             x9, #0                  // x9: i
+        and             w7, w5, #0xFFFFFFF8     // w7 = width / 8 * 8
+        cbz             w7, 3f
+1:
+        ld3             { v16.16b, v17.16b, v18.16b }, [x3]
+        uaddlp          v19.8h, v16.16b         // v19: r
+        uaddlp          v20.8h, v17.16b         // v20: g
+        uaddlp          v21.8h, v18.16b         // v21: b
+
+        rgb24_to_yuv_product v19, v20, v21, v22, v23, v16, v0, v1, v2, #10
+        str             q16, [x0], #16          // store dst_u
+        rgb24_to_yuv_product v19, v20, v21, v24, v25, v17, v3, v4, v5, #10
+        str             q17, [x1], #16          // store dst_v
+
+        add             w9, w9, #8              // i += 8
+        add             x3, x3, #48             // src += 48
+        cmp             w9, w7                  // i < (width / 8 * 8)
+        b.lt            1b
+        b               3f
+2:
+        ldrb            w2, [x3]                // w2: r1
+        ldrb            w4, [x3, #3]            // w4: r2
+        add             w2, w2, w4              // w2 = r1 + r2
+
+        ldrb            w4, [x3, #1]            // w4: g1
+        ldrb            w7, [x3, #4]            // w7: g2
+        add             w4, w4, w7              // w4 = g1 + g2
+
+        ldrb            w7, [x3, #2]            // w7: b1
+        ldrb            w8, [x3, #5]            // w8: b2
+        add             w7, w7, w8              // w7 = b1 + b2
+
+        umov            w8, v6.s[0]             // dst_u = const_offset
+        smaddl          x8, w2, w10, x8         // dst_u += ru * r
+        smaddl          x8, w4, w11, x8         // dst_u += gu * g
+        smaddl          x8, w7, w12, x8         // dst_u += bu * b
+        asr             x8, x8, #10             // dst_u >>= 10
+        strh            w8, [x0], #2            // store dst_u
+
+        umov            w8, v6.s[0]             // dst_v = const_offset
+        smaddl          x8, w2, w13, x8         // dst_v += rv * r
+        smaddl          x8, w4, w14, x8         // dst_v += gv * g
+        smaddl          x8, w7, w15, x8         // dst_v += bv * b
+        asr             x8, x8, #10             // dst_v >>= 10
+        strh            w8, [x1], #2            // store dst_v
+
+        add             w9, w9, #1              // i++
+        add             x3, x3, #6              // src += 6
+3:
+        cmp             w9, w5
+        b.lt            2b
+4:
+        ret
+endfunc
+
+function ff_rgb24ToUV_neon, export=1
+        cmp             w5, #0                  // check width > 0
+        b.le            4f
+
+        rgb24_load_uv_coeff half=0
+
+        mov             x2, #0                  // w2: i
+        and             w4, w5, #0xFFFFFFF0     // w4: width / 16 * 16
+        cbz             w4, 3f
+1:
+        rgb24_to_yuv_load_rgb x3
+        rgb24_to_yuv_product v19, v20, v21, v25, v26, v16, v0, v1, v2, #9
+        rgb24_to_yuv_product v22, v23, v24, v27, v28, v17, v0, v1, v2, #9
+        stp             q16, q17, [x0], #32      // store to dst_u
+        rgb24_to_yuv_product v19, v20, v21, v25, v26, v16, v3, v4, v5, #9
+        rgb24_to_yuv_product v22, v23, v24, v27, v28, v17, v3, v4, v5, #9
+        stp             q16, q17, [x1], #32      // store to dst_v
+
+        add             w2, w2, #16             // i += 16
+        add             x3, x3, #48             // src += 48
+        cmp             w2, w4                  // i < (width / 16 * 16)
+        b.lt            1b
+        b               3f
+2:
+        ldrb            w16, [x3]               // w16: r
+        ldrb            w17, [x3, #1]           // w17: g
+        ldrb            w4, [x3, #2]            // w4: b
+
+        umov            w7, v6.s[0]            // w7 = const_offset
+
+        smaddl          x8, w16, w10, x7        // x8 = ru * r + const_offset
+        smaddl          x8, w17, w11, x8        // x8 += gu * g
+        smaddl          x8, w4, w12, x8         // x8 += bu * b
+        asr             w8, w8, #9              // x8 >>= 9
+        strh            w8, [x0], #2            // store to dst_u
+
+        smaddl          x8, w16, w13, x7        // x8 = rv * r + const_offset
+        smaddl          x8, w17, w14, x8        // x8 += gv * g
+        smaddl          x8, w4, w15, x8         // x8 += bv * b
+        asr             w8, w8, #9              // x8 >>= 9
+        strh            w8, [x1], #2            // store to dst_v
+
+        add             w2, w2, #1              // i++
+        add             x3, x3, #3              // src += 3
+3:
+        cmp             w2, w5                  // i < width
+        b.lt            2b
+4:
+        ret
+endfunc
diff --git a/libswscale/aarch64/swscale.c b/libswscale/aarch64/swscale.c
index bbd9719a44..4c4ea39dc1 100644
--- a/libswscale/aarch64/swscale.c
+++ b/libswscale/aarch64/swscale.c
@@ -201,6 +201,20 @@ void ff_yuv2plane1_8_neon(
     default: break;                                                     \
     }
 
+void ff_rgb24ToY_neon(uint8_t *_dst, const uint8_t *src, const uint8_t *unused1,
+                      const uint8_t *unused2, int width,
+                      uint32_t *rgb2yuv, void *opq);
+
+void ff_rgb24ToUV_neon(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *unused0,
+                       const uint8_t *src1,
+                       const uint8_t *src2, int width, uint32_t *rgb2yuv,
+                       void *opq);
+
+void ff_rgb24ToUV_half_neon(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *unused0,
+                       const uint8_t *src1,
+                       const uint8_t *src2, int width, uint32_t *rgb2yuv,
+                       void *opq);
+
 av_cold void ff_sws_init_swscale_aarch64(SwsContext *c)
 {
     int cpu_flags = av_get_cpu_flags();
@@ -212,5 +226,16 @@ av_cold void ff_sws_init_swscale_aarch64(SwsContext *c)
         if (c->dstBpc == 8) {
             c->yuv2planeX = ff_yuv2planeX_8_neon;
         }
+        switch (c->srcFormat) {
+        case AV_PIX_FMT_RGB24:
+            c->lumToYV12 = ff_rgb24ToY_neon;
+            if (c->chrSrcHSubSample)
+                c->chrToYV12 = ff_rgb24ToUV_half_neon;
+            else
+                c->chrToYV12 = ff_rgb24ToUV_neon;
+            break;
+        default:
+            break;
+        }
     }
 }
-- 
2.42.0

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 10+ messages in thread

* [FFmpeg-devel] [PATCH 3/5] avutil/aarch64: Skip define AV_READ_TIME for apple
       [not found] <20240604135504.83169-1-quinkblack@foxmail.com>
  2024-06-04 13:55 ` [FFmpeg-devel] [PATCH 2/5] swscale/aarch64: Add rgb24 to yuv implementation Zhao Zhili
@ 2024-06-04 13:55 ` Zhao Zhili
  2024-06-04 13:55 ` [FFmpeg-devel] [PATCH 4/5] avutil/timer: Add clock_gettime as a fallback of AV_READ_TIME Zhao Zhili
  2024-06-04 13:55 ` [FFmpeg-devel] [PATCH 5/5] avutil/aarch64: Fallback to clock_gettime as timer on Android Zhao Zhili
  3 siblings, 0 replies; 10+ messages in thread
From: Zhao Zhili @ 2024-06-04 13:55 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: Zhao Zhili

From: Zhao Zhili <zhilizhao@tencent.com>

It will fall back to mach_absolute_time inside libavutil/timer.h
---
 libavutil/aarch64/timer.h | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/libavutil/aarch64/timer.h b/libavutil/aarch64/timer.h
index 8b28fd354c..fadc9568f8 100644
--- a/libavutil/aarch64/timer.h
+++ b/libavutil/aarch64/timer.h
@@ -24,13 +24,7 @@
 #include <stdint.h>
 #include "config.h"
 
-#if defined(__APPLE__)
-
-#include <mach/mach_time.h>
-
-#define AV_READ_TIME mach_absolute_time
-
-#elif HAVE_INLINE_ASM
+#if HAVE_INLINE_ASM && !defined(__APPLE__)
 
 #define AV_READ_TIME read_time
 
-- 
2.42.0
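
For context, this is roughly what the timer read becomes on Apple aarch64 once
this branch is removed, since libavutil/timer.h selects mach_absolute_time when
HAVE_MACH_ABSOLUTE_TIME is set (a minimal sketch, not the header itself; the
wrapper name is illustrative):

    #include <stdint.h>
    #include <mach/mach_time.h>

    /* Illustrative wrapper: AV_READ_TIME ends up being mach_absolute_time here. */
    static inline uint64_t apple_read_time(void)
    {
        return mach_absolute_time();
    }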

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 10+ messages in thread

* [FFmpeg-devel] [PATCH 4/5] avutil/timer: Add clock_gettime as a fallback of AV_READ_TIME
       [not found] <20240604135504.83169-1-quinkblack@foxmail.com>
  2024-06-04 13:55 ` [FFmpeg-devel] [PATCH 2/5] swscale/aarch64: Add rgb24 to yuv implementation Zhao Zhili
  2024-06-04 13:55 ` [FFmpeg-devel] [PATCH 3/5] avutil/aarch64: Skip define AV_READ_TIME for apple Zhao Zhili
@ 2024-06-04 13:55 ` Zhao Zhili
  2024-06-04 13:55 ` [FFmpeg-devel] [PATCH 5/5] avutil/aarch64: Fallback to clock_gettime as timer on Android Zhao Zhili
  3 siblings, 0 replies; 10+ messages in thread
From: Zhao Zhili @ 2024-06-04 13:55 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: Zhao Zhili

From: Zhao Zhili <zhilizhao@tencent.com>

---
 libavutil/timer.h | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/libavutil/timer.h b/libavutil/timer.h
index 2cd299eca3..74c4d84e69 100644
--- a/libavutil/timer.h
+++ b/libavutil/timer.h
@@ -46,6 +46,8 @@
 #include "macos_kperf.h"
 #elif HAVE_MACH_ABSOLUTE_TIME
 #include <mach/mach_time.h>
+#elif HAVE_CLOCK_GETTIME
+#include <time.h>
 #endif
 
 #include "common.h"
@@ -70,6 +72,9 @@
 #       define AV_READ_TIME gethrtime
 #   elif HAVE_MACH_ABSOLUTE_TIME
 #       define AV_READ_TIME mach_absolute_time
+#   elif HAVE_CLOCK_GETTIME && defined(CLOCK_MONOTONIC)
+#       include "libavutil/time.h"
+#       define AV_READ_TIME av_gettime_relative
 #   endif
 #endif
 
-- 
2.42.0
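
For context, av_gettime_relative() is backed by clock_gettime(CLOCK_MONOTONIC)
where available and returns microseconds, so with this fallback the timer
measures wall-clock time in microseconds rather than cycles. A simplified
sketch of what the fallback measures (not the exact libavutil/time.c code; the
function name is illustrative):

    #include <stdint.h>
    #include <time.h>

    static int64_t read_time_us(void)
    {
        struct timespec ts;
        clock_gettime(CLOCK_MONOTONIC, &ts);
        return (int64_t)ts.tv_sec * 1000000 + ts.tv_nsec / 1000; /* microseconds */
    }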

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 10+ messages in thread

* [FFmpeg-devel] [PATCH 5/5] avutil/aarch64: Fallback to clock_gettime as timer on Android
       [not found] <20240604135504.83169-1-quinkblack@foxmail.com>
                   ` (2 preceding siblings ...)
  2024-06-04 13:55 ` [FFmpeg-devel] [PATCH 4/5] avutil/timer: Add clock_gettime as a fallback of AV_READ_TIME Zhao Zhili
@ 2024-06-04 13:55 ` Zhao Zhili
  2024-06-04 20:25   ` Martin Storsjö
  3 siblings, 1 reply; 10+ messages in thread
From: Zhao Zhili @ 2024-06-04 13:55 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: Zhao Zhili

From: Zhao Zhili <zhilizhao@tencent.com>

The inline asm doesn't work on Android.
---
 libavutil/aarch64/timer.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libavutil/aarch64/timer.h b/libavutil/aarch64/timer.h
index fadc9568f8..13a58b48e4 100644
--- a/libavutil/aarch64/timer.h
+++ b/libavutil/aarch64/timer.h
@@ -24,7 +24,7 @@
 #include <stdint.h>
 #include "config.h"
 
-#if HAVE_INLINE_ASM && !defined(__APPLE__)
+#if HAVE_INLINE_ASM && !defined(__APPLE__) && !defined(__ANDROID__)
 
 #define AV_READ_TIME read_time
 
-- 
2.42.0

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [FFmpeg-devel] [PATCH 5/5] avutil/aarch64: Fallback to clock_gettime as timer on Android
  2024-06-04 13:55 ` [FFmpeg-devel] [PATCH 5/5] avutil/aarch64: Fallback to clock_gettime as timer on Android Zhao Zhili
@ 2024-06-04 20:25   ` Martin Storsjö
  0 siblings, 0 replies; 10+ messages in thread
From: Martin Storsjö @ 2024-06-04 20:25 UTC (permalink / raw)
  To: FFmpeg development discussions and patches; +Cc: Zhao Zhili

On Tue, 4 Jun 2024, Zhao Zhili wrote:

> From: Zhao Zhili <zhilizhao@tencent.com>
>
> The inline asm doesn't work on Android.

Using pmccntr_el0 doesn't work, no, but instead of falling back to 
clock_gettime, you may want to use cntvct_el0 instead of pmccntr_el0. IIRC 
that works on Android; at least it worked a number of years ago. It has 
less precision than pmccntr_el0, but maybe it is better than clock_gettime?

I.e., use similar inline assembly as before, but with cntvct_el0 instead 
of pmccntr_el0.
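
A minimal sketch of that suggestion, assuming the same shape as the existing
read_time() inline asm in libavutil/aarch64/timer.h (cntvct_el0 counts
generic-timer ticks rather than CPU cycles):

    #include <stdint.h>

    static inline uint64_t read_time(void)
    {
        uint64_t cycle_counter;
        __asm__ volatile("isb \n\t"
                         "mrs %0, cntvct_el0"
                         : "=r" (cycle_counter)
                         :: "memory");
        return cycle_counter;
    }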

// Martin

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [FFmpeg-devel] [PATCH 2/5] swscale/aarch64: Add rgb24 to yuv implementation
  2024-06-04 13:55 ` [FFmpeg-devel] [PATCH 2/5] swscale/aarch64: Add rgb24 to yuv implementation Zhao Zhili
@ 2024-06-05  6:29   ` Rémi Denis-Courmont
  2024-06-05  6:53     ` Zhao Zhili
  2024-06-05  8:16   ` Martin Storsjö
  1 sibling, 1 reply; 10+ messages in thread
From: Rémi Denis-Courmont @ 2024-06-05  6:29 UTC (permalink / raw)
  To: FFmpeg development discussions and patches



Le 4 juin 2024 16:55:01 GMT+03:00, Zhao Zhili <quinkblack@foxmail.com> a écrit :
>From: Zhao Zhili <zhilizhao@tencent.com>
>
>Test on Apple M1:
>
>rgb24_to_uv_1080_c: 7.2
>rgb24_to_uv_1080_neon: 5.5
>rgb24_to_uv_1280_c: 8.2
>rgb24_to_uv_1280_neon: 6.2
>rgb24_to_uv_1920_c: 12.5
>rgb24_to_uv_1920_neon: 9.5
>
>rgb24_to_uv_half_540_c: 6.5
>rgb24_to_uv_half_540_neon: 3.0
>rgb24_to_uv_half_640_c: 7.5
>rgb24_to_uv_half_640_neon: 3.2
>rgb24_to_uv_half_960_c: 12.5
>rgb24_to_uv_half_960_neon: 6.0
>
>rgb24_to_y_1080_c: 4.5
>rgb24_to_y_1080_neon: 3.5
>rgb24_to_y_1280_c: 5.2
>rgb24_to_y_1280_neon: 4.2
>rgb24_to_y_1920_c: 8.0
>rgb24_to_y_1920_neon: 6.0
>
>Signed-off-by: Zhao Zhili <zhilizhao@tencent.com>
>---
> libswscale/aarch64/Makefile  |   1 +
> libswscale/aarch64/input.S   | 229 +++++++++++++++++++++++++++++++++++
> libswscale/aarch64/swscale.c |  25 ++++
> 3 files changed, 255 insertions(+)
> create mode 100644 libswscale/aarch64/input.S
>
>diff --git a/libswscale/aarch64/Makefile b/libswscale/aarch64/Makefile
>index da1d909561..adfd90a1b6 100644
>--- a/libswscale/aarch64/Makefile
>+++ b/libswscale/aarch64/Makefile
>@@ -3,6 +3,7 @@ OBJS        += aarch64/rgb2rgb.o                \
>                aarch64/swscale_unscaled.o       \
> 
> NEON-OBJS   += aarch64/hscale.o                 \
>+               aarch64/input.o                  \
>                aarch64/output.o                 \
>                aarch64/rgb2rgb_neon.o           \
>                aarch64/yuv2rgb_neon.o           \
>diff --git a/libswscale/aarch64/input.S b/libswscale/aarch64/input.S
>new file mode 100644
>index 0000000000..ee0d223c6e
>--- /dev/null
>+++ b/libswscale/aarch64/input.S
>@@ -0,0 +1,229 @@
>+/*
>+ * Copyright (c) 2024 Zhao Zhili <quinkblack@foxmail.com>
>+ *
>+ * This file is part of FFmpeg.
>+ *
>+ * FFmpeg is free software; you can redistribute it and/or
>+ * modify it under the terms of the GNU Lesser General Public
>+ * License as published by the Free Software Foundation; either
>+ * version 2.1 of the License, or (at your option) any later version.
>+ *
>+ * FFmpeg is distributed in the hope that it will be useful,
>+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
>+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
>+ * Lesser General Public License for more details.
>+ *
>+ * You should have received a copy of the GNU Lesser General Public
>+ * License along with FFmpeg; if not, write to the Free Software
>+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
>+ */
>+
>+#include "libavutil/aarch64/asm.S"
>+
>+.macro rgb24_to_yuv_load_rgb, src
>+        ld3             { v16.16b, v17.16b, v18.16b }, [\src]
>+        ushll           v19.8h, v16.8b, #0         // v19: r
>+        ushll           v20.8h, v17.8b, #0         // v20: g
>+        ushll           v21.8h, v18.8b, #0         // v21: b
>+        ushll2          v22.8h, v16.16b, #0        // v22: r
>+        ushll2          v23.8h, v17.16b, #0        // v23: g
>+        ushll2          v24.8h, v18.16b, #0        // v24: b
>+.endm
>+
>+.macro rgb24_to_yuv_product, r, g, b, dst1, dst2, dst, coef0, coef1, coef2, right_shift
>+        mov             \dst1\().16b, v6.16b                    // dst1 = const_offset
>+        mov             \dst2\().16b, v6.16b                    // dst2 = const_offset
>+        smlal           \dst1\().4s, \coef0\().4h, \r\().4h     // dst1 += rx * r
>+        smlal2          \dst2\().4s, \coef0\().8h, \r\().8h     // dst2 += rx * r
>+        smlal           \dst1\().4s, \coef1\().4h, \g\().4h     // dst1 += gx * g
>+        smlal2          \dst2\().4s, \coef1\().8h, \g\().8h     // dst2 += gx * g
>+        smlal           \dst1\().4s, \coef2\().4h, \b\().4h     // dst1 += bx * b
>+        smlal2          \dst2\().4s, \coef2\().8h, \b\().8h     // dst2 += bx * b
>+        sqshrn          \dst\().4h, \dst1\().4s, \right_shift   // dst_lower_half = dst1 >> right_shift
>+        sqshrn2         \dst\().8h, \dst2\().4s, \right_shift   // dst_higher_half = dst2 >> right_shift
>+.endm
>+
>+function ff_rgb24ToY_neon, export=1
>+        cmp             w4, #0                  // check width > 0
>+        b.le            4f
>+
>+        ldp             w10, w11, [x5], #8       // w10: ry, w11: gy

I don't think it affects anything on your OoO execution hardware, but you're using the result of this load right off the bat in the next instruction. Ditto below. This may hurt perfs on not-so-fancy CPUs.
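
// A possible reordering that addresses the load-use latency noted above
// (illustrative sketch only, not part of the patch): issue both loads
// before any dup consumes their results, so an in-order core is less
// likely to stall.
        ldp             w10, w11, [x5], #8      // w10: ry, w11: gy
        ldr             w12, [x5]               // w12: by
        dup             v0.8h, w10
        dup             v1.8h, w11
        dup             v2.8h, w12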

>+        dup             v0.8h, w10
>+        dup             v1.8h, w11
>+        ldr             w12, [x5]               // w12: by
>+        dup             v2.8h, w12
>+
>+        mov             w9, #256                // w9 = 1 << (RGB2YUV_SHIFT - 7)
>+        movk            w9, #8, lsl #16         // w9 += 32 << (RGB2YUV_SHIFT - 1)
>+        dup             v6.4s, w9               // w9: const_offset
>+
>+        mov             x2, #0                  // w2: i
>+        and             w3, w4, #0xFFFFFFF0     // w3 = width / 16 * 16
>+        cbz             w3, 3f
>+1:
>+        rgb24_to_yuv_load_rgb x1
>+        rgb24_to_yuv_product v19, v20, v21, v25, v26, v16, v0, v1, v2, #9
>+        rgb24_to_yuv_product v22, v23, v24, v27, v28, v17, v0, v1, v2, #9
>+        stp             q16, q17, [x0], #32     // store to dst
>+
>+        add             w2, w2, #16             // i += 16
>+        add             x1, x1, #48             // src += 48
>+        cmp             w2, w3                  // i < (width / 16 * 16)
>+        b.lt            1b
>+        b               3f
>+2:
>+        ldrb            w13, [x1]               // w13: r
>+        ldrb            w14, [x1, #1]           // w14: g
>+        ldrb            w15, [x1, #2]           // w15: b

You can reorder instructions a little to use post-index and eliminate the ADD, though that won't make much difference.
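
// One way to apply that suggestion (illustrative sketch only, not part of
// the patch): load r last with a post-index writeback so the separate
// "add x1, x1, #3" at the end of the scalar tail can be dropped.
        ldrb            w14, [x1, #1]           // w14: g
        ldrb            w15, [x1, #2]           // w15: b
        ldrb            w13, [x1], #3           // w13: r, then src += 3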

I don't get why the perf gain is so low, or is this an artefact of Apple CPUs?

>+
>+        smaddl          x13, w13, w10, x9       // x13 = ry * r + const_offset
>+        smaddl          x13, w14, w11, x13      // x13 += gy * g
>+        smaddl          x13, w15, w12, x13      // x13 += by * b
>+        asr             w13, w13, #9            // x13 >>= 9
>+        strh            w13, [x0], #2           // store to dst
>+
>+        add             w2, w2, #1              // i++
>+        add             x1, x1, #3              // src += 3
>+3:
>+        cmp             w2, w4                  // i < width
>+        b.lt            2b
>+4:
>+        ret
>+endfunc
>+
>+.macro rgb24_load_uv_coeff half
>+        add             x6, x6, #12
>+
>+        ldp             w10, w11, [x6], #8      // w10: ru, w11: gu
>+        dup             v0.8h, w10
>+        dup             v1.8h, w11
>+
>+        ldp             w12, w13, [x6], #8      // w12: bu, w13: rv
>+        dup             v2.8h, w12
>+        dup             v3.8h, w13
>+
>+        ldp             w14, w15, [x6], #8      // w14: gv, w15: bv
>+        dup             v4.8h, w14
>+        dup             v5.8h, w15
>+
>+    .if \half
>+        mov             w9, #512
>+        movk            w9, #128, lsl #16       // w9: const_offset
>+    .else
>+        mov             w9, #256
>+        movk            w9, #64, lsl #16        // w9: const_offset
>+    .endif
>+        dup             v6.4s, w9
>+.endm
>+
>+function ff_rgb24ToUV_half_neon, export=1
>+        cmp             w5, #0          // check width > 0
>+        b.le            4f
>+
>+        rgb24_load_uv_coeff half=1
>+
>+        mov             x9, #0                  // x9: i
>+        and             w7, w5, #0xFFFFFFF8     // w7 = width / 8 * 8
>+        cbz             w7, 3f
>+1:
>+        ld3             { v16.16b, v17.16b, v18.16b }, [x3]
>+        uaddlp          v19.8h, v16.16b         // v19: r
>+        uaddlp          v20.8h, v17.16b         // v20: g
>+        uaddlp          v21.8h, v18.16b         // v21: b
>+
>+        rgb24_to_yuv_product v19, v20, v21, v22, v23, v16, v0, v1, v2, #10
>+        str             q16, [x0], #16          // store dst_u
>+        rgb24_to_yuv_product v19, v20, v21, v24, v25, v17, v3, v4, v5, #10
>+        str             q17, [x1], #16          // store dst_v
>+
>+        add             w9, w9, #8              // i += 8
>+        add             x3, x3, #48             // src += 48
>+        cmp             w9, w7                  // i < (width * 8 / 8)
>+        b.lt            1b
>+        b               3f
>+2:
>+        ldrb            w2, [x3]                // w2: r1
>+        ldrb            w4, [x3, #3]            // w4: r2
>+        add             w2, w2, w4              // w2 = r1 + r2
>+
>+        ldrb            w4, [x3, #1]            // w4: g1
>+        ldrb            w7, [x3, #4]            // w7: g2
>+        add             w4, w4, w7              // w4 = g1 + g2
>+
>+        ldrb            w7, [x3, #2]            // w7: b1
>+        ldrb            w8, [x3, #5]            // w8: b2
>+        add             w7, w7, w8              // w7 = b1 + b2
>+
>+        umov            w8, v6.s[0]             // dst_u = const_offset
>+        smaddl          x8, w2, w10, x8         // dst_u += ru * r
>+        smaddl          x8, w4, w11, x8         // dst_u += gu * g
>+        smaddl          x8, w7, w12, x8         // dst_u += bu * b
>+        asr             x8, x8, #10             // dst_u >>= 10
>+        strh            w8, [x0], #2            // store dst_u
>+
>+        umov            w8, v6.s[0]             // dst_v = const_offset
>+        smaddl          x8, w2, w13, x8         // dst_v += rv * r
>+        smaddl          x8, w4, w14, x8         // dst_v += gv * g
>+        smaddl          x8, w7, w15, x8         // dst_v += bv * b
>+        asr             x8, x8, #10             // dst_v >>= 10
>+        strh            w8, [x1], #2            // store dst_v
>+
>+        add             w9, w9, #1              // i++
>+        add             x3, x3, #6              // src += 6
>+3:
>+        cmp             w9, w5
>+        b.lt            2b
>+4:
>+        ret
>+endfunc
>+
>+function ff_rgb24ToUV_neon, export=1
>+        cmp             w5, #0                  // check width > 0
>+        b.le            4f
>+
>+        rgb24_load_uv_coeff half=0
>+
>+        mov             x2, #0                  // w2: i
>+        and             w4, w5, #0xFFFFFFF0     // w4: width / 16 * 16
>+        cbz             w4, 3f
>+1:
>+        rgb24_to_yuv_load_rgb x3
>+        rgb24_to_yuv_product v19, v20, v21, v25, v26, v16, v0, v1, v2, #9
>+        rgb24_to_yuv_product v22, v23, v24, v27, v28, v17, v0, v1, v2, #9
>+        stp             q16, q17, [x0], #32      // store to dst_u
>+        rgb24_to_yuv_product v19, v20, v21, v25, v26, v16, v3, v4, v5, #9
>+        rgb24_to_yuv_product v22, v23, v24, v27, v28, v17, v3, v4, v5, #9
>+        stp             q16, q17, [x1], #32      // store to dst_v
>+
>+        add             w2, w2, #16             // i += 16
>+        add             x3, x3, #48             // src += 48
>+        cmp             w2, w4                  // i < (width / 16 * 16)
>+        b.lt            1b
>+        b               3f
>+2:
>+        ldrb            w16, [x3]               // w16: r
>+        ldrb            w17, [x3, #1]           // w17: g
>+        ldrb            w4, [x3, #2]            // w4: b
>+
>+        umov            w7, v6.s[0]            // w7 = const_offset
>+
>+        smaddl          x8, w16, w10, x7        // x8 = ru * r + const_offset
>+        smaddl          x8, w17, w11, x8        // x8 += gu * g
>+        smaddl          x8, w4, w12, x8         // x8 += bu * b
>+        asr             w8, w8, #9              // x8 >>= 9
>+        strh            w8, [x0], #2            // store to dst_u
>+
>+        smaddl          x8, w16, w13, x7        // x8 = rv * r + const_offset
>+        smaddl          x8, w17, w14, x8        // x8 += gv * g
>+        smaddl          x8, w4, w15, x8         // x8 += bv * b
>+        asr             w8, w8, #9              // x8 >>= 9
>+        strh            w8, [x1], #2            // store to dst_v
>+
>+        add             w2, w2, #1              // i++
>+        add             x3, x3, #3              // src += 3
>+3:
>+        cmp             w2, w5                  // i < width
>+        b.lt            2b
>+4:
>+        ret
>+endfunc
>diff --git a/libswscale/aarch64/swscale.c b/libswscale/aarch64/swscale.c
>index bbd9719a44..4c4ea39dc1 100644
>--- a/libswscale/aarch64/swscale.c
>+++ b/libswscale/aarch64/swscale.c
>@@ -201,6 +201,20 @@ void ff_yuv2plane1_8_neon(
>     default: break;                                                     \
>     }
> 
>+void ff_rgb24ToY_neon(uint8_t *_dst, const uint8_t *src, const uint8_t *unused1,
>+                      const uint8_t *unused2, int width,
>+                      uint32_t *rgb2yuv, void *opq);
>+
>+void ff_rgb24ToUV_neon(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *unused0,
>+                       const uint8_t *src1,
>+                       const uint8_t *src2, int width, uint32_t *rgb2yuv,
>+                       void *opq);
>+
>+void ff_rgb24ToUV_half_neon(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *unused0,
>+                       const uint8_t *src1,
>+                       const uint8_t *src2, int width, uint32_t *rgb2yuv,
>+                       void *opq);
>+
> av_cold void ff_sws_init_swscale_aarch64(SwsContext *c)
> {
>     int cpu_flags = av_get_cpu_flags();
>@@ -212,5 +226,16 @@ av_cold void ff_sws_init_swscale_aarch64(SwsContext *c)
>         if (c->dstBpc == 8) {
>             c->yuv2planeX = ff_yuv2planeX_8_neon;
>         }
>+        switch (c->srcFormat) {
>+        case AV_PIX_FMT_RGB24:
>+            c->lumToYV12 = ff_rgb24ToY_neon;
>+            if (c->chrSrcHSubSample)
>+                c->chrToYV12 = ff_rgb24ToUV_half_neon;
>+            else
>+                c->chrToYV12 = ff_rgb24ToUV_neon;
>+            break;
>+        default:
>+            break;
>+        }
>     }
> }
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [FFmpeg-devel] [PATCH 2/5] swscale/aarch64: Add rgb24 to yuv implementation
  2024-06-05  6:29   ` Rémi Denis-Courmont
@ 2024-06-05  6:53     ` Zhao Zhili
  2024-06-05  7:34       ` Rémi Denis-Courmont
  2024-06-05  7:34       ` Martin Storsjö
  0 siblings, 2 replies; 10+ messages in thread
From: Zhao Zhili @ 2024-06-05  6:53 UTC (permalink / raw)
  To: FFmpeg development discussions and patches



> On Jun 5, 2024, at 14:29, Rémi Denis-Courmont <remi@remlab.net> wrote:
> 
> 
> 
> Le 4 juin 2024 16:55:01 GMT+03:00, Zhao Zhili <quinkblack@foxmail.com <mailto:quinkblack@foxmail.com>> a écrit :
>> From: Zhao Zhili <zhilizhao@tencent.com>
>> 
>> Test on Apple M1:
>> 
>> rgb24_to_uv_1080_c: 7.2
>> rgb24_to_uv_1080_neon: 5.5
>> rgb24_to_uv_1280_c: 8.2
>> rgb24_to_uv_1280_neon: 6.2
>> rgb24_to_uv_1920_c: 12.5
>> rgb24_to_uv_1920_neon: 9.5
>> 
>> rgb24_to_uv_half_540_c: 6.5
>> rgb24_to_uv_half_540_neon: 3.0
>> rgb24_to_uv_half_640_c: 7.5
>> rgb24_to_uv_half_640_neon: 3.2
>> rgb24_to_uv_half_960_c: 12.5
>> rgb24_to_uv_half_960_neon: 6.0
>> 
>> rgb24_to_y_1080_c: 4.5
>> rgb24_to_y_1080_neon: 3.5
>> rgb24_to_y_1280_c: 5.2
>> rgb24_to_y_1280_neon: 4.2
>> rgb24_to_y_1920_c: 8.0
>> rgb24_to_y_1920_neon: 6.0
>> 
>> Signed-off-by: Zhao Zhili <zhilizhao@tencent.com>
>> ---
>> libswscale/aarch64/Makefile  |   1 +
>> libswscale/aarch64/input.S   | 229 +++++++++++++++++++++++++++++++++++
>> libswscale/aarch64/swscale.c |  25 ++++
>> 3 files changed, 255 insertions(+)
>> create mode 100644 libswscale/aarch64/input.S
>> 
>> diff --git a/libswscale/aarch64/Makefile b/libswscale/aarch64/Makefile
>> index da1d909561..adfd90a1b6 100644
>> --- a/libswscale/aarch64/Makefile
>> +++ b/libswscale/aarch64/Makefile
>> @@ -3,6 +3,7 @@ OBJS        += aarch64/rgb2rgb.o                \
>>               aarch64/swscale_unscaled.o       \
>> 
>> NEON-OBJS   += aarch64/hscale.o                 \
>> +               aarch64/input.o                  \
>>               aarch64/output.o                 \
>>               aarch64/rgb2rgb_neon.o           \
>>               aarch64/yuv2rgb_neon.o           \
>> diff --git a/libswscale/aarch64/input.S b/libswscale/aarch64/input.S
>> new file mode 100644
>> index 0000000000..ee0d223c6e
>> --- /dev/null
>> +++ b/libswscale/aarch64/input.S
>> @@ -0,0 +1,229 @@
>> +/*
>> + * Copyright (c) 2024 Zhao Zhili <quinkblack@foxmail.com>
>> + *
>> + * This file is part of FFmpeg.
>> + *
>> + * FFmpeg is free software; you can redistribute it and/or
>> + * modify it under the terms of the GNU Lesser General Public
>> + * License as published by the Free Software Foundation; either
>> + * version 2.1 of the License, or (at your option) any later version.
>> + *
>> + * FFmpeg is distributed in the hope that it will be useful,
>> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
>> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
>> + * Lesser General Public License for more details.
>> + *
>> + * You should have received a copy of the GNU Lesser General Public
>> + * License along with FFmpeg; if not, write to the Free Software
>> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
>> + */
>> +
>> +#include "libavutil/aarch64/asm.S"
>> +
>> +.macro rgb24_to_yuv_load_rgb, src
>> +        ld3             { v16.16b, v17.16b, v18.16b }, [\src]
>> +        ushll           v19.8h, v16.8b, #0         // v19: r
>> +        ushll           v20.8h, v17.8b, #0         // v20: g
>> +        ushll           v21.8h, v18.8b, #0         // v21: b
>> +        ushll2          v22.8h, v16.16b, #0        // v22: r
>> +        ushll2          v23.8h, v17.16b, #0        // v23: g
>> +        ushll2          v24.8h, v18.16b, #0        // v24: b
>> +.endm
>> +
>> +.macro rgb24_to_yuv_product, r, g, b, dst1, dst2, dst, coef0, coef1, coef2, right_shift
>> +        mov             \dst1\().16b, v6.16b                    // dst1 = const_offset
>> +        mov             \dst2\().16b, v6.16b                    // dst2 = const_offset
>> +        smlal           \dst1\().4s, \coef0\().4h, \r\().4h     // dst1 += rx * r
>> +        smlal2          \dst2\().4s, \coef0\().8h, \r\().8h     // dst2 += rx * r
>> +        smlal           \dst1\().4s, \coef1\().4h, \g\().4h     // dst1 += gx * g
>> +        smlal2          \dst2\().4s, \coef1\().8h, \g\().8h     // dst2 += gx * g
>> +        smlal           \dst1\().4s, \coef2\().4h, \b\().4h     // dst1 += bx * b
>> +        smlal2          \dst2\().4s, \coef2\().8h, \b\().8h     // dst2 += bx * b
>> +        sqshrn          \dst\().4h, \dst1\().4s, \right_shift   // dst_lower_half = dst1 >> right_shift
>> +        sqshrn2         \dst\().8h, \dst2\().4s, \right_shift   // dst_higher_half = dst2 >> right_shift
>> +.endm
>> +
>> +function ff_rgb24ToY_neon, export=1
>> +        cmp             w4, #0                  // check width > 0
>> +        b.le            4f
>> +
>> +        ldp             w10, w11, [x5], #8       // w10: ry, w11: gy
> 
> I don't think it affects anything on your OoO execution hardware, but you're using the result of this load right off the bat in the next instruction. Ditto below. This may hurt perfs on not-so-fancy CPUs.

Will do.

> 
>> +        dup             v0.8h, w10
>> +        dup             v1.8h, w11
>> +        ldr             w12, [x5]               // w12: by
>> +        dup             v2.8h, w12
>> +
>> +        mov             w9, #256                // w9 = 1 << (RGB2YUV_SHIFT - 7)
>> +        movk            w9, #8, lsl #16         // w9 += 32 << (RGB2YUV_SHIFT - 1)
>> +        dup             v6.4s, w9               // w9: const_offset
>> +
>> +        mov             x2, #0                  // w2: i
>> +        and             w3, w4, #0xFFFFFFF0     // w3 = width / 16 * 16
>> +        cbz             w3, 3f
>> +1:
>> +        rgb24_to_yuv_load_rgb x1
>> +        rgb24_to_yuv_product v19, v20, v21, v25, v26, v16, v0, v1, v2, #9
>> +        rgb24_to_yuv_product v22, v23, v24, v27, v28, v17, v0, v1, v2, #9
>> +        stp             q16, q17, [x0], #32     // store to dst
>> +
>> +        add             w2, w2, #16             // i += 16
>> +        add             x1, x1, #48             // src += 48
>> +        cmp             w2, w3                  // i < (width / 16 * 16)
>> +        b.lt            1b
>> +        b               3f
>> +2:
>> +        ldrb            w13, [x1]               // w13: r
>> +        ldrb            w14, [x1, #1]           // w14: g
>> +        ldrb            w15, [x1, #2]           // w15: b
> 
> You can reorder instructions a little to use post-index and eliminate the ADD, though that won't make much difference.
> 
> I don't get why the perf gain is so low, or is this an artefact of Apple CPUs?

I have checked the assembly of the C version. The compiler has done pretty well on loop unrolling and
vectorization in this simple case.

> 
>> +
>> +        smaddl          x13, w13, w10, x9       // x13 = ry * r + const_offset
>> +        smaddl          x13, w14, w11, x13      // x13 += gy * g
>> +        smaddl          x13, w15, w12, x13      // x13 += by * b
>> +        asr             w13, w13, #9            // x13 >>= 9
>> +        strh            w13, [x0], #2           // store to dst
>> +
>> +        add             w2, w2, #1              // i++
>> +        add             x1, x1, #3              // src += 3
>> +3:
>> +        cmp             w2, w4                  // i < width
>> +        b.lt            2b
>> +4:
>> +        ret
>> +endfunc
>> +
>> +.macro rgb24_load_uv_coeff half
>> +        add             x6, x6, #12
>> +
>> +        ldp             w10, w11, [x6], #8      // w10: ru, w11: gu
>> +        dup             v0.8h, w10
>> +        dup             v1.8h, w11
>> +
>> +        ldp             w12, w13, [x6], #8      // w12: bu, w13: rv
>> +        dup             v2.8h, w12
>> +        dup             v3.8h, w13
>> +
>> +        ldp             w14, w15, [x6], #8      // w14: gv, w15: bv
>> +        dup             v4.8h, w14
>> +        dup             v5.8h, w15
>> +
>> +    .if \half
>> +        mov             w9, #512
>> +        movk            w9, #128, lsl #16       // w9: const_offset
>> +    .else
>> +        mov             w9, #256
>> +        movk            w9, #64, lsl #16        // w9: const_offset
>> +    .endif
>> +        dup             v6.4s, w9
>> +.endm
>> +
>> +function ff_rgb24ToUV_half_neon, export=1
>> +        cmp             w5, #0          // check width > 0
>> +        b.le            4f
>> +
>> +        rgb24_load_uv_coeff half=1
>> +
>> +        mov             x9, #0                  // x9: i
>> +        and             w7, w5, #0xFFFFFFF8     // w7 = width / 8 * 8
>> +        cbz             w7, 3f
>> +1:
>> +        ld3             { v16.16b, v17.16b, v18.16b }, [x3]
>> +        uaddlp          v19.8h, v16.16b         // v19: r
>> +        uaddlp          v20.8h, v17.16b         // v20: g
>> +        uaddlp          v21.8h, v18.16b         // v21: b
>> +
>> +        rgb24_to_yuv_product v19, v20, v21, v22, v23, v16, v0, v1, v2, #10
>> +        str             q16, [x0], #16          // store dst_u
>> +        rgb24_to_yuv_product v19, v20, v21, v24, v25, v17, v3, v4, v5, #10
>> +        str             q17, [x1], #16          // store dst_v
>> +
>> +        add             w9, w9, #8              // i += 8
>> +        add             x3, x3, #48             // src += 48
>> +        cmp             w9, w7                  // i < (width * 8 / 8)
>> +        b.lt            1b
>> +        b               3f
>> +2:
>> +        ldrb            w2, [x3]                // w2: r1
>> +        ldrb            w4, [x3, #3]            // w4: r2
>> +        add             w2, w2, w4              // w2 = r1 + r2
>> +
>> +        ldrb            w4, [x3, #1]            // w4: g1
>> +        ldrb            w7, [x3, #4]            // w7: g2
>> +        add             w4, w4, w7              // w4 = g1 + g2
>> +
>> +        ldrb            w7, [x3, #2]            // w7: b1
>> +        ldrb            w8, [x3, #5]            // w8: b2
>> +        add             w7, w7, w8              // w7 = b1 + b2
>> +
>> +        umov            w8, v6.s[0]             // dst_u = const_offset
>> +        smaddl          x8, w2, w10, x8         // dst_u += ru * r
>> +        smaddl          x8, w4, w11, x8         // dst_u += gu * g
>> +        smaddl          x8, w7, w12, x8         // dst_u += bu * b
>> +        asr             x8, x8, #10             // dst_u >>= 10
>> +        strh            w8, [x0], #2            // store dst_u
>> +
>> +        umov            w8, v6.s[0]             // dst_v = const_offset
>> +        smaddl          x8, w2, w13, x8         // dst_v += rv * r
>> +        smaddl          x8, w4, w14, x8         // dst_v += gv * g
>> +        smaddl          x8, w7, w15, x8         // dst_v += bv * b
>> +        asr             x8, x8, #10             // dst_v >>= 10
>> +        strh            w8, [x1], #2            // store dst_v
>> +
>> +        add             w9, w9, #1              // i++
>> +        add             x3, x3, #6              // src += 6
>> +3:
>> +        cmp             w9, w5
>> +        b.lt            2b
>> +4:
>> +        ret
>> +endfunc
>> +
>> +function ff_rgb24ToUV_neon, export=1
>> +        cmp             w5, #0                  // check width > 0
>> +        b.le            4f
>> +
>> +        rgb24_load_uv_coeff half=0
>> +
>> +        mov             x2, #0                  // w2: i
>> +        and             w4, w5, #0xFFFFFFF0     // w4: width / 16 * 16
>> +        cbz             w4, 3f
>> +1:
>> +        rgb24_to_yuv_load_rgb x3
>> +        rgb24_to_yuv_product v19, v20, v21, v25, v26, v16, v0, v1, v2, #9
>> +        rgb24_to_yuv_product v22, v23, v24, v27, v28, v17, v0, v1, v2, #9
>> +        stp             q16, q17, [x0], #32      // store to dst_u
>> +        rgb24_to_yuv_product v19, v20, v21, v25, v26, v16, v3, v4, v5, #9
>> +        rgb24_to_yuv_product v22, v23, v24, v27, v28, v17, v3, v4, v5, #9
>> +        stp             q16, q17, [x1], #32      // store to dst_v
>> +
>> +        add             w2, w2, #16             // i += 16
>> +        add             x3, x3, #48             // src += 48
>> +        cmp             w2, w4                  // i < (width / 16 * 16)
>> +        b.lt            1b
>> +        b               3f
>> +2:
>> +        ldrb            w16, [x3]               // w16: r
>> +        ldrb            w17, [x3, #1]           // w17: g
>> +        ldrb            w4, [x3, #2]            // w4: b
>> +
>> +        umov            w7, v6.s[0]            // w7 = const_offset
>> +
>> +        smaddl          x8, w16, w10, x7        // x8 = ru * r + const_offset
>> +        smaddl          x8, w17, w11, x8        // x8 += gu * g
>> +        smaddl          x8, w4, w12, x8         // x8 += bu * b
>> +        asr             w8, w8, #9              // x8 >>= 9
>> +        strh            w8, [x0], #2            // store to dst_u
>> +
>> +        smaddl          x8, w16, w13, x7        // x8 = rv * r + const_offset
>> +        smaddl          x8, w17, w14, x8        // x8 += gv * g
>> +        smaddl          x8, w4, w15, x8         // x8 += bv * b
>> +        asr             w8, w8, #9              // x8 >>= 9
>> +        strh            w8, [x1], #2            // store to dst_v
>> +
>> +        add             w2, w2, #1              // i++
>> +        add             x3, x3, #3              // src += 3
>> +3:
>> +        cmp             w2, w5                  // i < width
>> +        b.lt            2b
>> +4:
>> +        ret
>> +endfunc
>> diff --git a/libswscale/aarch64/swscale.c b/libswscale/aarch64/swscale.c
>> index bbd9719a44..4c4ea39dc1 100644
>> --- a/libswscale/aarch64/swscale.c
>> +++ b/libswscale/aarch64/swscale.c
>> @@ -201,6 +201,20 @@ void ff_yuv2plane1_8_neon(
>>    default: break;                                                     \
>>    }
>> 
>> +void ff_rgb24ToY_neon(uint8_t *_dst, const uint8_t *src, const uint8_t *unused1,
>> +                      const uint8_t *unused2, int width,
>> +                      uint32_t *rgb2yuv, void *opq);
>> +
>> +void ff_rgb24ToUV_neon(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *unused0,
>> +                       const uint8_t *src1,
>> +                       const uint8_t *src2, int width, uint32_t *rgb2yuv,
>> +                       void *opq);
>> +
>> +void ff_rgb24ToUV_half_neon(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *unused0,
>> +                       const uint8_t *src1,
>> +                       const uint8_t *src2, int width, uint32_t *rgb2yuv,
>> +                       void *opq);
>> +
>> av_cold void ff_sws_init_swscale_aarch64(SwsContext *c)
>> {
>>    int cpu_flags = av_get_cpu_flags();
>> @@ -212,5 +226,16 @@ av_cold void ff_sws_init_swscale_aarch64(SwsContext *c)
>>        if (c->dstBpc == 8) {
>>            c->yuv2planeX = ff_yuv2planeX_8_neon;
>>        }
>> +        switch (c->srcFormat) {
>> +        case AV_PIX_FMT_RGB24:
>> +            c->lumToYV12 = ff_rgb24ToY_neon;
>> +            if (c->chrSrcHSubSample)
>> +                c->chrToYV12 = ff_rgb24ToUV_half_neon;
>> +            else
>> +                c->chrToYV12 = ff_rgb24ToUV_neon;
>> +            break;
>> +        default:
>> +            break;
>> +        }
>>    }
>> }
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org <mailto:ffmpeg-devel@ffmpeg.org>
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> 
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org <mailto:ffmpeg-devel-request@ffmpeg.org> with subject "unsubscribe".

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [FFmpeg-devel] [PATCH 2/5] swscale/aarch64: Add rgb24 to yuv implementation
  2024-06-05  6:53     ` Zhao Zhili
@ 2024-06-05  7:34       ` Rémi Denis-Courmont
  2024-06-05  7:34       ` Martin Storsjö
  1 sibling, 0 replies; 10+ messages in thread
From: Rémi Denis-Courmont @ 2024-06-05  7:34 UTC (permalink / raw)
  To: FFmpeg development discussions and patches



Le 5 juin 2024 09:53:45 GMT+03:00, Zhao Zhili <quinkblack@foxmail.com> a écrit :
>
>
>> On Jun 5, 2024, at 14:29, Rémi Denis-Courmont <remi@remlab.net> wrote:
>> 
>> 
>> 
>> Le 4 juin 2024 16:55:01 GMT+03:00, Zhao Zhili <quinkblack@foxmail.com <mailto:quinkblack@foxmail.com>> a écrit :
>>> From: Zhao Zhili <zhilizhao@tencent.com>
>>> 
>>> Test on Apple M1:
>>> 
>>> rgb24_to_uv_1080_c: 7.2
>>> rgb24_to_uv_1080_neon: 5.5
>>> rgb24_to_uv_1280_c: 8.2
>>> rgb24_to_uv_1280_neon: 6.2
>>> rgb24_to_uv_1920_c: 12.5
>>> rgb24_to_uv_1920_neon: 9.5
>>> 
>>> rgb24_to_uv_half_540_c: 6.5
>>> rgb24_to_uv_half_540_neon: 3.0
>>> rgb24_to_uv_half_640_c: 7.5
>>> rgb24_to_uv_half_640_neon: 3.2
>>> rgb24_to_uv_half_960_c: 12.5
>>> rgb24_to_uv_half_960_neon: 6.0
>>> 
>>> rgb24_to_y_1080_c: 4.5
>>> rgb24_to_y_1080_neon: 3.5
>>> rgb24_to_y_1280_c: 5.2
>>> rgb24_to_y_1280_neon: 4.2
>>> rgb24_to_y_1920_c: 8.0
>>> rgb24_to_y_1920_neon: 6.0
>>> 
>>> Signed-off-by: Zhao Zhili <zhilizhao@tencent.com>
>>> ---
>>> libswscale/aarch64/Makefile  |   1 +
>>> libswscale/aarch64/input.S   | 229 +++++++++++++++++++++++++++++++++++
>>> libswscale/aarch64/swscale.c |  25 ++++
>>> 3 files changed, 255 insertions(+)
>>> create mode 100644 libswscale/aarch64/input.S
>>> 
>>> diff --git a/libswscale/aarch64/Makefile b/libswscale/aarch64/Makefile
>>> index da1d909561..adfd90a1b6 100644
>>> --- a/libswscale/aarch64/Makefile
>>> +++ b/libswscale/aarch64/Makefile
>>> @@ -3,6 +3,7 @@ OBJS        += aarch64/rgb2rgb.o                \
>>>               aarch64/swscale_unscaled.o       \
>>> 
>>> NEON-OBJS   += aarch64/hscale.o                 \
>>> +               aarch64/input.o                  \
>>>               aarch64/output.o                 \
>>>               aarch64/rgb2rgb_neon.o           \
>>>               aarch64/yuv2rgb_neon.o           \
>>> diff --git a/libswscale/aarch64/input.S b/libswscale/aarch64/input.S
>>> new file mode 100644
>>> index 0000000000..ee0d223c6e
>>> --- /dev/null
>>> +++ b/libswscale/aarch64/input.S
>>> @@ -0,0 +1,229 @@
>>> +/*
>>> + * Copyright (c) 2024 Zhao Zhili <quinkblack@foxmail.com>
>>> + *
>>> + * This file is part of FFmpeg.
>>> + *
>>> + * FFmpeg is free software; you can redistribute it and/or
>>> + * modify it under the terms of the GNU Lesser General Public
>>> + * License as published by the Free Software Foundation; either
>>> + * version 2.1 of the License, or (at your option) any later version.
>>> + *
>>> + * FFmpeg is distributed in the hope that it will be useful,
>>> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
>>> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
>>> + * Lesser General Public License for more details.
>>> + *
>>> + * You should have received a copy of the GNU Lesser General Public
>>> + * License along with FFmpeg; if not, write to the Free Software
>>> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
>>> + */
>>> +
>>> +#include "libavutil/aarch64/asm.S"
>>> +
>>> +.macro rgb24_to_yuv_load_rgb, src
>>> +        ld3             { v16.16b, v17.16b, v18.16b }, [\src]
>>> +        ushll           v19.8h, v16.8b, #0         // v19: r
>>> +        ushll           v20.8h, v17.8b, #0         // v20: g
>>> +        ushll           v21.8h, v18.8b, #0         // v21: b
>>> +        ushll2          v22.8h, v16.16b, #0        // v22: r
>>> +        ushll2          v23.8h, v17.16b, #0        // v23: g
>>> +        ushll2          v24.8h, v18.16b, #0        // v24: b
>>> +.endm
>>> +
>>> +.macro rgb24_to_yuv_product, r, g, b, dst1, dst2, dst, coef0, coef1, coef2, right_shift
>>> +        mov             \dst1\().16b, v6.16b                    // dst1 = const_offset
>>> +        mov             \dst2\().16b, v6.16b                    // dst2 = const_offset
>>> +        smlal           \dst1\().4s, \coef0\().4h, \r\().4h     // dst1 += rx * r
>>> +        smlal2          \dst2\().4s, \coef0\().8h, \r\().8h     // dst2 += rx * r
>>> +        smlal           \dst1\().4s, \coef1\().4h, \g\().4h     // dst1 += gx * g
>>> +        smlal2          \dst2\().4s, \coef1\().8h, \g\().8h     // dst2 += gx * g
>>> +        smlal           \dst1\().4s, \coef2\().4h, \b\().4h     // dst1 += bx * b
>>> +        smlal2          \dst2\().4s, \coef2\().8h, \b\().8h     // dst2 += bx * b
>>> +        sqshrn          \dst\().4h, \dst1\().4s, \right_shift   // dst_lower_half = dst1 >> right_shift
>>> +        sqshrn2         \dst\().8h, \dst2\().4s, \right_shift   // dst_higher_half = dst2 >> right_shift
>>> +.endm
>>> +
>>> +function ff_rgb24ToY_neon, export=1
>>> +        cmp             w4, #0                  // check width > 0
>>> +        b.le            4f
>>> +
>>> +        ldp             w10, w11, [x5], #8       // w10: ry, w11: gy
>> 
>> I don't think it affects anything on your OoO execution hardware, but you're using the result of this load right off the bat in the next instruction. Ditto below. This may hurt perfs on not-so-fancy CPUs.
>
>Will do.
>
>> 
>>> +        dup             v0.8h, w10
>>> +        dup             v1.8h, w11
>>> +        ldr             w12, [x5]               // w12: by
>>> +        dup             v2.8h, w12
>>> +
>>> +        mov             w9, #256                // w9 = 1 << (RGB2YUV_SHIFT - 7)
>>> +        movk            w9, #8, lsl #16         // w9 += 32 << (RGB2YUV_SHIFT - 1)
>>> +        dup             v6.4s, w9               // w9: const_offset
>>> +
>>> +        mov             x2, #0                  // w2: i
>>> +        and             w3, w4, #0xFFFFFFF0     // w3 = width / 16 * 16
>>> +        cbz             w3, 3f
>>> +1:
>>> +        rgb24_to_yuv_load_rgb x1
>>> +        rgb24_to_yuv_product v19, v20, v21, v25, v26, v16, v0, v1, v2, #9
>>> +        rgb24_to_yuv_product v22, v23, v24, v27, v28, v17, v0, v1, v2, #9
>>> +        stp             q16, q17, [x0], #32     // store to dst
>>> +
>>> +        add             w2, w2, #16             // i += 16
>>> +        add             x1, x1, #48             // src += 48
>>> +        cmp             w2, w3                  // i < (width / 16 * 16)
>>> +        b.lt            1b
>>> +        b               3f
>>> +2:
>>> +        ldrb            w13, [x1]               // w13: r
>>> +        ldrb            w14, [x1, #1]           // w14: g
>>> +        ldrb            w15, [x1, #2]           // w15: b
>> 
>> You can reorder instructions a little to use post-index and eliminate the ADD, though that won't make much difference.
>> 
>> I don't get why the perf gain is so low, or is this an artefact of Apple CPUs?
>
>I have checked the assembly of the C version. The compiler has done a pretty good job of
>loop unrolling and vectorizing this simple case.

Uh, don't we disable auto-vectorisation in the configure script? Until/unless it is re-enabled, I think benchmarks should be done against non-auto-vectorised code, if only to stay representative of normal/default FFmpeg builds.

* Re: [FFmpeg-devel] [PATCH 2/5] swscale/aarch64: Add rgb24 to yuv implementation
  2024-06-05  6:53     ` Zhao Zhili
  2024-06-05  7:34       ` Rémi Denis-Courmont
@ 2024-06-05  7:34       ` Martin Storsjö
  1 sibling, 0 replies; 10+ messages in thread
From: Martin Storsjö @ 2024-06-05  7:34 UTC (permalink / raw)
  To: FFmpeg development discussions and patches

On Wed, 5 Jun 2024, Zhao Zhili wrote:

>> On Jun 5, 2024, at 14:29, Rémi Denis-Courmont <remi@remlab.net> wrote:
>> 
>> On 4 June 2024 16:55:01 GMT+03:00, Zhao Zhili <quinkblack@foxmail.com> wrote:
>>> From: Zhao Zhili <zhilizhao@tencent.com>
>>> 
>>> Test on Apple M1:
>>> 
>>> rgb24_to_uv_1080_c: 7.2
>>> rgb24_to_uv_1080_neon: 5.5
>>> rgb24_to_uv_1280_c: 8.2
>>> rgb24_to_uv_1280_neon: 6.2
>>> rgb24_to_uv_1920_c: 12.5
>>> rgb24_to_uv_1920_neon: 9.5
>>> 
>>> rgb24_to_uv_half_540_c: 6.5
>>> rgb24_to_uv_half_540_neon: 3.0
>>> rgb24_to_uv_half_640_c: 7.5
>>> rgb24_to_uv_half_640_neon: 3.2
>>> rgb24_to_uv_half_960_c: 12.5
>>> rgb24_to_uv_half_960_neon: 6.0
>>> 
>>> rgb24_to_y_1080_c: 4.5
>>> rgb24_to_y_1080_neon: 3.5
>>> rgb24_to_y_1280_c: 5.2
>>> rgb24_to_y_1280_neon: 4.2
>>> rgb24_to_y_1920_c: 8.0
>>> rgb24_to_y_1920_neon: 6.0
>>> 
>>> Signed-off-by: Zhao Zhili <zhilizhao@tencent.com>
>>> ---
>>> libswscale/aarch64/Makefile  |   1 +
>>> libswscale/aarch64/input.S   | 229 +++++++++++++++++++++++++++++++++++
>>> libswscale/aarch64/swscale.c |  25 ++++
>>> 3 files changed, 255 insertions(+)
>>> create mode 100644 libswscale/aarch64/input.S
>>> 
>>> diff --git a/libswscale/aarch64/Makefile b/libswscale/aarch64/Makefile
>>> index da1d909561..adfd90a1b6 100644
>>> --- a/libswscale/aarch64/Makefile
>>> +++ b/libswscale/aarch64/Makefile
>>> @@ -3,6 +3,7 @@ OBJS        += aarch64/rgb2rgb.o                \
>>>               aarch64/swscale_unscaled.o       \
>>> 
>>> NEON-OBJS   += aarch64/hscale.o                 \
>>> +               aarch64/input.o                  \
>>>               aarch64/output.o                 \
>>>               aarch64/rgb2rgb_neon.o           \
>>>               aarch64/yuv2rgb_neon.o           \
>>> diff --git a/libswscale/aarch64/input.S b/libswscale/aarch64/input.S
>>> new file mode 100644
>>> index 0000000000..ee0d223c6e
>>> --- /dev/null
>>> +++ b/libswscale/aarch64/input.S
>>> @@ -0,0 +1,229 @@
>>> +/*
>>> + * Copyright (c) 2024 Zhao Zhili <quinkblack@foxmail.com>
>>> + *
>>> + * This file is part of FFmpeg.
>>> + *
>>> + * FFmpeg is free software; you can redistribute it and/or
>>> + * modify it under the terms of the GNU Lesser General Public
>>> + * License as published by the Free Software Foundation; either
>>> + * version 2.1 of the License, or (at your option) any later version.
>>> + *
>>> + * FFmpeg is distributed in the hope that it will be useful,
>>> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
>>> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
>>> + * Lesser General Public License for more details.
>>> + *
>>> + * You should have received a copy of the GNU Lesser General Public
>>> + * License along with FFmpeg; if not, write to the Free Software
>>> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
>>> + */
>>> +
>>> +#include "libavutil/aarch64/asm.S"
>>> +
>>> +.macro rgb24_to_yuv_load_rgb, src
>>> +        ld3             { v16.16b, v17.16b, v18.16b }, [\src]
>>> +        ushll           v19.8h, v16.8b, #0         // v19: r
>>> +        ushll           v20.8h, v17.8b, #0         // v20: g
>>> +        ushll           v21.8h, v18.8b, #0         // v21: b
>>> +        ushll2          v22.8h, v16.16b, #0        // v22: r
>>> +        ushll2          v23.8h, v17.16b, #0        // v23: g
>>> +        ushll2          v24.8h, v18.16b, #0        // v24: b
>>> +.endm
>>> +
>>> +.macro rgb24_to_yuv_product, r, g, b, dst1, dst2, dst, coef0, coef1, coef2, right_shift
>>> +        mov             \dst1\().16b, v6.16b                    // dst1 = const_offset
>>> +        mov             \dst2\().16b, v6.16b                    // dst2 = const_offset
>>> +        smlal           \dst1\().4s, \coef0\().4h, \r\().4h     // dst1 += rx * r
>>> +        smlal2          \dst2\().4s, \coef0\().8h, \r\().8h     // dst2 += rx * r
>>> +        smlal           \dst1\().4s, \coef1\().4h, \g\().4h     // dst1 += gx * g
>>> +        smlal2          \dst2\().4s, \coef1\().8h, \g\().8h     // dst2 += gx * g
>>> +        smlal           \dst1\().4s, \coef2\().4h, \b\().4h     // dst1 += bx * b
>>> +        smlal2          \dst2\().4s, \coef2\().8h, \b\().8h     // dst2 += bx * b
>>> +        sqshrn          \dst\().4h, \dst1\().4s, \right_shift   // dst_lower_half = dst1 >> right_shift
>>> +        sqshrn2         \dst\().8h, \dst2\().4s, \right_shift   // dst_higher_half = dst2 >> right_shift
>>> +.endm
>>> +
>>> +function ff_rgb24ToY_neon, export=1
>>> +        cmp             w4, #0                  // check width > 0
>>> +        b.le            4f
>>> +
>>> +        ldp             w10, w11, [x5], #8       // w10: ry, w11: gy
>> 
>> I don't think it affects anything on your OoO execution hardware, but you're using the result of this load right off the bat in the next instruction. Ditto below. This may hurt performance on not-so-fancy CPUs.
>
> Will do.
>
>> 
>>> +        dup             v0.8h, w10
>>> +        dup             v1.8h, w11
>>> +        ldr             w12, [x5]               // w12: by
>>> +        dup             v2.8h, w12
>>> +
>>> +        mov             w9, #256                // w9 = 1 << (RGB2YUV_SHIFT - 7)
>>> +        movk            w9, #8, lsl #16         // w9 += 32 << (RGB2YUV_SHIFT - 1)
>>> +        dup             v6.4s, w9               // w9: const_offset
>>> +
>>> +        mov             x2, #0                  // w2: i
>>> +        and             w3, w4, #0xFFFFFFF0     // w3 = width / 16 * 16
>>> +        cbz             w3, 3f
>>> +1:
>>> +        rgb24_to_yuv_load_rgb x1
>>> +        rgb24_to_yuv_product v19, v20, v21, v25, v26, v16, v0, v1, v2, #9
>>> +        rgb24_to_yuv_product v22, v23, v24, v27, v28, v17, v0, v1, v2, #9
>>> +        stp             q16, q17, [x0], #32     // store to dst
>>> +
>>> +        add             w2, w2, #16             // i += 16
>>> +        add             x1, x1, #48             // src += 48
>>> +        cmp             w2, w3                  // i < (width / 16 * 16)
>>> +        b.lt            1b
>>> +        b               3f
>>> +2:
>>> +        ldrb            w13, [x1]               // w13: r
>>> +        ldrb            w14, [x1, #1]           // w14: g
>>> +        ldrb            w15, [x1, #2]           // w15: b
>> 
>> You can reorder instructions a little to use post-index and eliminate the ADD, though that won't make much difference.
>> 
>> I don't get why the perf gain is so low, or is this an artefact of Apple CPUs?
>
> I have checked the assembly of the C version. The compiler has done a pretty good job of
> loop unrolling and vectorizing this simple case.

To add some context here: ffmpeg's configure disables autovectorization 
with GCC (as it miscompiles things semi-regularly), but not with Clang. 
This can give somewhat misleading numbers with regard to the relative 
speedup.
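
(For a more directly comparable C baseline on a Clang build, one could 
presumably configure with something like 
--extra-cflags='-fno-vectorize -fno-slp-vectorize'; I haven't verified 
that for this particular case, though.)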

Additionally, the Apple CPUs do have slightly different performance 
characteristics than other cores. Plus, the very coarse timer 
used on macOS doesn't help either...

FWIW, here are some numbers for this patch from some more traditional 
CPUs, with a GCC build:

                          Cortex A53      A72      A78
rgb24_to_uv_1080_c:         19471.5   8720.7   7049.7
rgb24_to_uv_1080_neon:       5922.7   3147.5   2274.5
rgb24_to_uv_1280_c:         23067.0  10318.2   8348.5
rgb24_to_uv_1280_neon:       6842.5   3672.5   2656.5
rgb24_to_uv_1920_c:         34595.2  15483.2  12509.7
rgb24_to_uv_1920_neon:      10246.0   5496.7   3976.5
rgb24_to_uv_half_540_c:     11396.0   5481.0   4576.0
rgb24_to_uv_half_540_neon:   3655.7   1687.5   1382.5
rgb24_to_uv_half_640_c:     13546.0   6480.2   5399.0
rgb24_to_uv_half_640_neon:   4202.7   1958.2   1611.2
rgb24_to_uv_half_960_c:     20311.0   9724.2   8068.2
rgb24_to_uv_half_960_neon:   6282.7   2934.2   2372.2
rgb24_to_y_1080_c:          12984.2   4339.7   4074.2
rgb24_to_y_1080_neon:        3492.5   1960.5   1444.7
rgb24_to_y_1280_c:          15384.2   6709.2   4823.5
rgb24_to_y_1280_neon:        4038.2   2265.0   1674.0
rgb24_to_y_1920_c:          23069.7   7708.7   7224.7
rgb24_to_y_1920_neon:        6036.2   3389.0   2514.0


// Martin

* Re: [FFmpeg-devel] [PATCH 2/5] swscale/aarch64: Add rgb24 to yuv implementation
  2024-06-04 13:55 ` [FFmpeg-devel] [PATCH 2/5] swscale/aarch64: Add rgb24 to yuv implementation Zhao Zhili
  2024-06-05  6:29   ` Rémi Denis-Courmont
@ 2024-06-05  8:16   ` Martin Storsjö
  1 sibling, 0 replies; 10+ messages in thread
From: Martin Storsjö @ 2024-06-05  8:16 UTC (permalink / raw)
  To: FFmpeg development discussions and patches; +Cc: Zhao Zhili

On Tue, 4 Jun 2024, Zhao Zhili wrote:

> From: Zhao Zhili <zhilizhao@tencent.com>
>
> Test on Apple M1:
>
> rgb24_to_uv_1080_c: 7.2
> rgb24_to_uv_1080_neon: 5.5
> rgb24_to_uv_1280_c: 8.2
> rgb24_to_uv_1280_neon: 6.2
> rgb24_to_uv_1920_c: 12.5
> rgb24_to_uv_1920_neon: 9.5
>
> rgb24_to_uv_half_540_c: 6.5
> rgb24_to_uv_half_540_neon: 3.0
> rgb24_to_uv_half_640_c: 7.5
> rgb24_to_uv_half_640_neon: 3.2
> rgb24_to_uv_half_960_c: 12.5
> rgb24_to_uv_half_960_neon: 6.0
>
> rgb24_to_y_1080_c: 4.5
> rgb24_to_y_1080_neon: 3.5
> rgb24_to_y_1280_c: 5.2
> rgb24_to_y_1280_neon: 4.2
> rgb24_to_y_1920_c: 8.0
> rgb24_to_y_1920_neon: 6.0
>
> Signed-off-by: Zhao Zhili <zhilizhao@tencent.com>
> ---
> libswscale/aarch64/Makefile  |   1 +
> libswscale/aarch64/input.S   | 229 +++++++++++++++++++++++++++++++++++
> libswscale/aarch64/swscale.c |  25 ++++
> 3 files changed, 255 insertions(+)
> create mode 100644 libswscale/aarch64/input.S
>
> diff --git a/libswscale/aarch64/Makefile b/libswscale/aarch64/Makefile
> index da1d909561..adfd90a1b6 100644
> --- a/libswscale/aarch64/Makefile
> +++ b/libswscale/aarch64/Makefile
> @@ -3,6 +3,7 @@ OBJS        += aarch64/rgb2rgb.o                \
>                aarch64/swscale_unscaled.o       \
>
> NEON-OBJS   += aarch64/hscale.o                 \
> +               aarch64/input.o                  \
>                aarch64/output.o                 \
>                aarch64/rgb2rgb_neon.o           \
>                aarch64/yuv2rgb_neon.o           \
> diff --git a/libswscale/aarch64/input.S b/libswscale/aarch64/input.S
> new file mode 100644
> index 0000000000..ee0d223c6e
> --- /dev/null
> +++ b/libswscale/aarch64/input.S
> @@ -0,0 +1,229 @@
> +/*
> + * Copyright (c) 2024 Zhao Zhili <quinkblack@foxmail.com>
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#include "libavutil/aarch64/asm.S"
> +
> +.macro rgb24_to_yuv_load_rgb, src
> +        ld3             { v16.16b, v17.16b, v18.16b }, [\src]
> +        ushll           v19.8h, v16.8b, #0         // v19: r
> +        ushll           v20.8h, v17.8b, #0         // v20: g
> +        ushll           v21.8h, v18.8b, #0         // v21: b
> +        ushll2          v22.8h, v16.16b, #0        // v22: r
> +        ushll2          v23.8h, v17.16b, #0        // v23: g
> +        ushll2          v24.8h, v18.16b, #0        // v24: b

Doing "ushll #0" is perhaps a bit unusual, the common thing would be 
"uxtl" instead. It doesn't matter in practice though, it assembles to the 
same instruction anyway.
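
For example, these two spellings assemble to the same instruction:

        ushll           v19.8h, v16.8b, #0
        uxtl            v19.8h, v16.8b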

> +.endm
> +
> +.macro rgb24_to_yuv_product, r, g, b, dst1, dst2, dst, coef0, coef1, coef2, right_shift
> +        mov             \dst1\().16b, v6.16b                    // dst1 = const_offset
> +        mov             \dst2\().16b, v6.16b                    // dst2 = const_offset
> +        smlal           \dst1\().4s, \coef0\().4h, \r\().4h     // dst1 += rx * r
> +        smlal2          \dst2\().4s, \coef0\().8h, \r\().8h     // dst2 += rx * r
> +        smlal           \dst1\().4s, \coef1\().4h, \g\().4h     // dst1 += gx * g
> +        smlal2          \dst2\().4s, \coef1\().8h, \g\().8h     // dst2 += gx * g
> +        smlal           \dst1\().4s, \coef2\().4h, \b\().4h     // dst1 += bx * b
> +        smlal2          \dst2\().4s, \coef2\().8h, \b\().8h     // dst2 += bx * b

For sequences like this, the Cortex A53 (and iirc at least the A55 too) 
has got a fastpath; if you do multiple consecutive smlal/smlsl (or regular 
mla/mls) into the same register, you actually save a lot of time. E.g. 
instead of this:

     smlal dst1
     smlal dst2
     smlal dst1
     smlal dst2
     smlal dst1
     smlal dst2

Do this:

     smlal dst1
     smlal dst1
     smlal dst1
     smlal dst2
     smlal dst2
     smlal dst2

For in-order cores (with this special fastpath - which is indeed a bit 
non-obvious) this makes a huge difference, while out-of-order cores can 
reorder the instructions as they prefer anyway (this is not a very long 
instruction sequence).

On the in-order cores, the difference is massive:

Before:                  Cortex A53      A72      A73
rgb24_to_y_1920_neon:        6032.7   3385.7   2514.0
After:
rgb24_to_y_1920_neon:        5072.7   3388.2   2522.0

A 19% speedup on A53 with just this one change, and it makes almost 
no difference for the other cores (mostly within measurement noise).
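
Applied to rgb24_to_yuv_product, that would look roughly like this 
(untested; same operands as in the patch, just reordered):

.macro rgb24_to_yuv_product, r, g, b, dst1, dst2, dst, coef0, coef1, coef2, right_shift
        mov             \dst1\().16b, v6.16b                    // dst1 = const_offset
        mov             \dst2\().16b, v6.16b                    // dst2 = const_offset
        smlal           \dst1\().4s, \coef0\().4h, \r\().4h     // dst1 += rx * r
        smlal           \dst1\().4s, \coef1\().4h, \g\().4h     // dst1 += gx * g
        smlal           \dst1\().4s, \coef2\().4h, \b\().4h     // dst1 += bx * b
        smlal2          \dst2\().4s, \coef0\().8h, \r\().8h     // dst2 += rx * r
        smlal2          \dst2\().4s, \coef1\().8h, \g\().8h     // dst2 += gx * g
        smlal2          \dst2\().4s, \coef2\().8h, \b\().8h     // dst2 += bx * b
        sqshrn          \dst\().4h, \dst1\().4s, \right_shift   // dst_lower_half = dst1 >> right_shift
        sqshrn2         \dst\().8h, \dst2\().4s, \right_shift   // dst_higher_half = dst2 >> right_shift
.endm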

> +function ff_rgb24ToY_neon, export=1
> +        cmp             w4, #0                  // check width > 0
> +        b.le            4f
> +
> +        ldp             w10, w11, [x5], #8       // w10: ry, w11: gy
> +        dup             v0.8h, w10
> +        dup             v1.8h, w11
> +        ldr             w12, [x5]               // w12: by
> +        dup             v2.8h, w12
> +
> +        mov             w9, #256                // w9 = 1 << (RGB2YUV_SHIFT - 7)
> +        movk            w9, #8, lsl #16         // w9 += 32 << (RGB2YUV_SHIFT - 1)
> +        dup             v6.4s, w9               // w9: const_offset
> +
> +        mov             x2, #0                  // w2: i
> +        and             w3, w4, #0xFFFFFFF0     // w3 = width / 16 * 16
> +        cbz             w3, 3f

This blindly assumes that if width > 0, then width is also at least 16, 
as we always run the SIMD codepath once here. Is this a requirement in 
swscale, or should we check whether width >= 16 here too?

> +1:
> +        rgb24_to_yuv_load_rgb x1
> +        rgb24_to_yuv_product v19, v20, v21, v25, v26, v16, v0, v1, v2, #9
> +        rgb24_to_yuv_product v22, v23, v24, v27, v28, v17, v0, v1, v2, #9
> +        stp             q16, q17, [x0], #32     // store to dst
> +
> +        add             w2, w2, #16             // i += 16
> +        add             x1, x1, #48             // src += 48
> +        cmp             w2, w3                  // i < (width / 16 * 16)
> +        b.lt            1b

For the in-order cores, we can help with scheduling here too. You can do 
the add/add/cmp before the stp. This adds a bit more distance between the 
calculation of q17 and the store of it, and adds more distance between the 
cmp and the b.lt that depends on it.

Secondly, it's a bit uncommon in SIMD code like this to count upwards 
(incrementing i); instead, the common pattern is to keep the width in one 
register and count it down.

Here, you can then do "cmp w2, #16", so you don't need to keep a register 
with the aligned end value. And for the scalar codepath, when approaching 
zero, you can do "subs w2, w2, #1", "b.gt 2b", so you don't need two 
instructions for add+cmp.
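
For illustration, a rough sketch of that structure (untested; here w2 
holds the remaining width instead of i, the instructions are otherwise 
taken from the patch, the sub/add/cmp are scheduled ahead of the store as 
mentioned above, and the initial width > 0 check is left out):

        mov             w2, w4                  // w2: remaining width
        cmp             w2, #16
        b.lt            2f
1:
        rgb24_to_yuv_load_rgb x1
        rgb24_to_yuv_product v19, v20, v21, v25, v26, v16, v0, v1, v2, #9
        rgb24_to_yuv_product v22, v23, v24, v27, v28, v17, v0, v1, v2, #9
        sub             w2, w2, #16             // width -= 16
        add             x1, x1, #48             // src += 48
        cmp             w2, #16                 // at least 16 pixels left?
        stp             q16, q17, [x0], #32     // store to dst
        b.ge            1b
2:
        cbz             w2, 4f
3:
        ldrb            w13, [x1]               // w13: r
        ldrb            w14, [x1, #1]           // w14: g
        ldrb            w15, [x1, #2]           // w15: b
        smaddl          x13, w13, w10, x9       // x13 = ry * r + const_offset
        smaddl          x13, w14, w11, x13      // x13 += gy * g
        smaddl          x13, w15, w12, x13      // x13 += by * b
        asr             w13, w13, #9            // x13 >>= 9
        strh            w13, [x0], #2           // store to dst
        add             x1, x1, #3              // src += 3
        subs            w2, w2, #1              // width--
        b.gt            3b
4:
        ret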

> +        b               3f
> +2:
> +        ldrb            w13, [x1]               // w13: r
> +        ldrb            w14, [x1, #1]           // w14: g
> +        ldrb            w15, [x1, #2]           // w15: b
> +
> +        smaddl          x13, w13, w10, x9       // x13 = ry * r + const_offset
> +        smaddl          x13, w14, w11, x13      // x13 += gy * g
> +        smaddl          x13, w15, w12, x13      // x13 += by * b
> +        asr             w13, w13, #9            // x13 >>= 9
> +        strh            w13, [x0], #2           // store to dst
> +
> +        add             w2, w2, #1              // i++
> +        add             x1, x1, #3              // src += 3
> +3:
> +        cmp             w2, w4                  // i < width
> +        b.lt            2b
> +4:
> +        ret
> +endfunc
> +
> +.macro rgb24_load_uv_coeff half
> +        add             x6, x6, #12
> +
> +        ldp             w10, w11, [x6], #8      // w10: ru, w11: gu
> +        dup             v0.8h, w10
> +        dup             v1.8h, w11
> +
> +        ldp             w12, w13, [x6], #8      // w12: bu, w13: rv
> +        dup             v2.8h, w12
> +        dup             v3.8h, w13
> +
> +        ldp             w14, w15, [x6], #8      // w14: gv, w15: bv
> +        dup             v4.8h, w14
> +        dup             v5.8h, w15

As Rémi mentioned, scheduling instructions like these can help a bit. But 
here, each "ldp" depends on the updated x6 from the previous one, so it 
may not make much difference.
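
One way to break that dependency chain (assuming x6 isn't needed 
afterwards) would be to load everything with plain immediate offsets, 
e.g.:

        ldp             w10, w11, [x6, #12]     // w10: ru, w11: gu
        ldp             w12, w13, [x6, #20]     // w12: bu, w13: rv
        ldp             w14, w15, [x6, #28]     // w14: gv, w15: bv
        dup             v0.8h, w10
        dup             v1.8h, w11
        dup             v2.8h, w12
        dup             v3.8h, w13
        dup             v4.8h, w14
        dup             v5.8h, w15
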

> +
> +    .if \half
> +        mov             w9, #512
> +        movk            w9, #128, lsl #16       // w9: const_offset
> +    .else
> +        mov             w9, #256
> +        movk            w9, #64, lsl #16        // w9: const_offset
> +    .endif
> +        dup             v6.4s, w9
> +.endm
> +
> +function ff_rgb24ToUV_half_neon, export=1
> +        cmp             w5, #0          // check width > 0
> +        b.le            4f
> +
> +        rgb24_load_uv_coeff half=1
> +
> +        mov             x9, #0                  // x9: i
> +        and             w7, w5, #0xFFFFFFF8     // w7 = width / 8 * 8
> +        cbz             w7, 3f
> +1:
> +        ld3             { v16.16b, v17.16b, v18.16b }, [x3]
> +        uaddlp          v19.8h, v16.16b         // v19: r
> +        uaddlp          v20.8h, v17.16b         // v20: g
> +        uaddlp          v21.8h, v18.16b         // v21: b
> +
> +        rgb24_to_yuv_product v19, v20, v21, v22, v23, v16, v0, v1, v2, #10
> +        str             q16, [x0], #16          // store dst_u
> +        rgb24_to_yuv_product v19, v20, v21, v24, v25, v17, v3, v4, v5, #10
> +        str             q17, [x1], #16          // store dst_v
> +
> +        add             w9, w9, #8              // i += 8
> +        add             x3, x3, #48             // src += 48
> +        cmp             w9, w7                  // i < (width * 8 / 8)
> +        b.lt            1b

Here, you can also move both str instructions out to after the add/cmp, like above.

> +        b               3f
> +2:
> +        ldrb            w2, [x3]                // w2: r1
> +        ldrb            w4, [x3, #3]            // w4: r2
> +        add             w2, w2, w4              // w2 = r1 + r2
> +
> +        ldrb            w4, [x3, #1]            // w4: g1
> +        ldrb            w7, [x3, #4]            // w7: g2
> +        add             w4, w4, w7              // w4 = g1 + g2
> +
> +        ldrb            w7, [x3, #2]            // w7: b1
> +        ldrb            w8, [x3, #5]            // w8: b2
> +        add             w7, w7, w8              // w7 = b1 + b2
> +
> +        umov            w8, v6.s[0]             // dst_u = const_offset
> +        smaddl          x8, w2, w10, x8         // dst_u += ru * r
> +        smaddl          x8, w4, w11, x8         // dst_u += gu * g
> +        smaddl          x8, w7, w12, x8         // dst_u += bu * b
> +        asr             x8, x8, #10             // dst_u >>= 10
> +        strh            w8, [x0], #2            // store dst_u
> +
> +        umov            w8, v6.s[0]             // dst_v = const_offset
> +        smaddl          x8, w2, w13, x8         // dst_v += rv * r
> +        smaddl          x8, w4, w14, x8         // dst_v += gv * g
> +        smaddl          x8, w7, w15, x8         // dst_v += bv * b
> +        asr             x8, x8, #10             // dst_v >>= 10
> +        strh            w8, [x1], #2            // store dst_v
> +
> +        add             w9, w9, #1              // i++
> +        add             x3, x3, #6              // src += 6
> +3:
> +        cmp             w9, w5
> +        b.lt            2b
> +4:
> +        ret
> +endfunc
> +
> +function ff_rgb24ToUV_neon, export=1
> +        cmp             w5, #0                  // check width > 0
> +        b.le            4f
> +
> +        rgb24_load_uv_coeff half=0
> +
> +        mov             x2, #0                  // w2: i
> +        and             w4, w5, #0xFFFFFFF0     // w4: width / 16 * 16
> +        cbz             w4, 3f
> +1:
> +        rgb24_to_yuv_load_rgb x3
> +        rgb24_to_yuv_product v19, v20, v21, v25, v26, v16, v0, v1, v2, #9
> +        rgb24_to_yuv_product v22, v23, v24, v27, v28, v17, v0, v1, v2, #9
> +        stp             q16, q17, [x0], #32      // store to dst_u
> +        rgb24_to_yuv_product v19, v20, v21, v25, v26, v16, v3, v4, v5, #9
> +        rgb24_to_yuv_product v22, v23, v24, v27, v28, v17, v3, v4, v5, #9
> +        stp             q16, q17, [x1], #32      // store to dst_v

If you'd make the second pair of rgb24_to_yuv_product write into v18/v19 
instead of v16/v17, you can move both stores out to after the add/add/cmp. 
(It doesn't make much of a measurable difference, but it is good scheduling in 
general.)
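
Roughly (untested) - the low-half V product still reads v19, so it writes 
its result to v18, while the high-half one no longer needs v19 and can 
write there:

1:
        rgb24_to_yuv_load_rgb x3
        rgb24_to_yuv_product v19, v20, v21, v25, v26, v16, v0, v1, v2, #9
        rgb24_to_yuv_product v22, v23, v24, v27, v28, v17, v0, v1, v2, #9
        rgb24_to_yuv_product v19, v20, v21, v25, v26, v18, v3, v4, v5, #9
        rgb24_to_yuv_product v22, v23, v24, v27, v28, v19, v3, v4, v5, #9
        add             w2, w2, #16             // i += 16
        add             x3, x3, #48             // src += 48
        cmp             w2, w4                  // i < (width / 16 * 16)
        stp             q16, q17, [x0], #32     // store to dst_u
        stp             q18, q19, [x1], #32     // store to dst_v
        b.lt            1b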

> +
> +        add             w2, w2, #16             // i += 16
> +        add             x3, x3, #48             // src += 48
> +        cmp             w2, w4                  // i < (width / 16 * 16)
> +        b.lt            1b
> +        b               3f
> +2:
> +        ldrb            w16, [x3]               // w16: r
> +        ldrb            w17, [x3, #1]           // w17: g
> +        ldrb            w4, [x3, #2]            // w4: b
> +
> +        umov            w7, v6.s[0]            // w7 = const_offset

SIMD->GPR moves are generally expensive; we shouldn't be doing this 
within the loop if the value is constant. Instead, you should do it in 
a prologue to the scalar codepath.

But in this case, the value should already be present in w9, if I 
understand the code correctly, so we don't need to fetch it from v6 - and 
this is already what you do in ff_rgb24ToY_neon.
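
A minimal sketch (hypothetical, assuming the scalar path otherwise 
mirrors the _half variant and that nothing has clobbered w9 at this 
point):

        // x9 still holds const_offset from rgb24_load_uv_coeff
        smaddl          x7, w16, w10, x9        // dst_u = ru * r + const_offset
        smaddl          x7, w17, w11, x7        // dst_u += gu * g
        smaddl          x7, w4,  w12, x7        // dst_u += bu * b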

// Martin


Thread overview: 10+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
     [not found] <20240604135504.83169-1-quinkblack@foxmail.com>
2024-06-04 13:55 ` [FFmpeg-devel] [PATCH 2/5] swscale/aarch64: Add rgb24 to yuv implementation Zhao Zhili
2024-06-05  6:29   ` Rémi Denis-Courmont
2024-06-05  6:53     ` Zhao Zhili
2024-06-05  7:34       ` Rémi Denis-Courmont
2024-06-05  7:34       ` Martin Storsjö
2024-06-05  8:16   ` Martin Storsjö
2024-06-04 13:55 ` [FFmpeg-devel] [PATCH 3/5] avutil/aarch64: Skip define AV_READ_TIME for apple Zhao Zhili
2024-06-04 13:55 ` [FFmpeg-devel] [PATCH 4/5] avutil/timer: Add clock_gettime as a fallback of AV_READ_TIME Zhao Zhili
2024-06-04 13:55 ` [FFmpeg-devel] [PATCH 5/5] avutil/aarch64: Fallback to clock_gettime as timer on Android Zhao Zhili
2024-06-04 20:25   ` Martin Storsjö
