Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
 help / color / mirror / Atom feed
From: Krzysztof Pyrkosz via ffmpeg-devel <ffmpeg-devel@ffmpeg.org>
To: ffmpeg-devel@ffmpeg.org
Cc: Krzysztof Pyrkosz <ffmpeg@szaka.eu>
Subject: [FFmpeg-devel] [PATCH v2] swscale/aarch64: dotprod implementation of rgba32_to_Y
Date: Mon,  3 Mar 2025 22:00:23 +0100
Message-ID: <20250303210022.7884-2-ffmpeg@szaka.eu> (raw)
In-Reply-To: <eb9b4df7-2eaf-df0-33a1-cf718949be9@martin.st>

The idea is to split each 16-bit coefficient into its lower and upper byte:
run udot on the lower bytes, shift the accumulators right by 8, then
accumulate a second udot on the upper bytes.

Benchmark on A78:
bgra_to_y_128_c:                                       682.0 ( 1.00x)
bgra_to_y_128_neon:                                    181.2 ( 3.76x)
bgra_to_y_128_dotprod:                                 117.8 ( 5.79x)
bgra_to_y_1080_c:                                     5742.5 ( 1.00x)
bgra_to_y_1080_neon:                                  1472.5 ( 3.90x)
bgra_to_y_1080_dotprod:                                906.5 ( 6.33x)
bgra_to_y_1920_c:                                    10194.0 ( 1.00x)
bgra_to_y_1920_neon:                                  2589.8 ( 3.94x)
bgra_to_y_1920_dotprod:                               1573.8 ( 6.48x)
---
 libswscale/aarch64/input.S   | 88 ++++++++++++++++++++++++++++++++++++
 libswscale/aarch64/swscale.c | 17 +++++++
 2 files changed, 105 insertions(+)

diff --git a/libswscale/aarch64/input.S b/libswscale/aarch64/input.S
index 5cb18711fb..c1c0adffc8 100644
--- a/libswscale/aarch64/input.S
+++ b/libswscale/aarch64/input.S
@@ -313,3 +313,91 @@ rgbToUV_neon bgr24, rgb24, element=3
 rgbToUV_neon bgra32, rgba32, element=4
 
 rgbToUV_neon abgr32, argb32, element=4, alpha_first=1
+
+#if HAVE_DOTPROD
+ENABLE_DOTPROD
+
+function ff_bgra32ToY_neon_dotprod, export=1
+        cmp             w4, #0                  // check width > 0 (loads below leave the flags intact)
+        ldp             w12, w11, [x5]          // w12: ry, w11: gy (w12 is later applied to source byte 2)
+        ldr             w10, [x5, #8]           // w10: by (w10 is later applied to source byte 0)
+        b.gt            4f                      // width > 0: join the shared body at label 4 in ff_rgba32ToY_neon_dotprod
+        ret                                     // width <= 0: nothing to convert
+endfunc
+
+function ff_rgba32ToY_neon_dotprod, export=1
+        cmp             w4, #0                  // check width > 0
+        ldp             w10, w11, [x5]          // w10: ry, w11: gy
+        ldr             w12, [x5, #8]           // w12: by
+        b.le            3f                      // width <= 0: nothing to convert
+4:                                              // ff_bgra32ToY_neon_dotprod joins here with w10/w12 swapped
+        mov             w9, #256                // w9 = 1 << (RGB2YUV_SHIFT - 7)
+        movk            w9, #8, lsl #16         // w9 += 32 << (RGB2YUV_SHIFT - 1)
+        dup             v6.4s, w9               // w9: const_offset
+
+        cmp             w4, #16                 // flags are consumed by b.lt below; nothing in between sets them
+        mov             w7, w10
+        bfi             w7, w11, 8, 8           // the bfi instructions are used to assemble
+        bfi             w7, w12, 16, 8          // 4 byte r,g,b,0 mask to be then used by udot.
+        dup             v0.4s, w7               // v0 holds the lower byte of each coefficient
+
+        lsr             w6, w10, #8             // bits 15:8 of the byte 0 coefficient
+        lsr             w7, w11, #8             // bits 15:8 of the byte 1 coefficient
+        lsr             w8, w12, #8             // bits 15:8 of the byte 2 coefficient
+
+        bfi             w6, w7, 8, 8            // same 4 byte mask layout as v0; byte 3 keeps whatever
+        bfi             w6, w8, 16, 8           // w10 has above bit 15 (assumed 0 - TODO confirm)
+        dup             v1.4s, w6               // v1 holds the upper byte of each coefficient
+        b.lt            2f                      // width < 16: scalar loop only
+1:
+        ld1             { v16.16b, v17.16b, v18.16b, v19.16b }, [x1], #64 // load 16 pixels (64 bytes)
+        sub             w4, w4, #16             // width -= 16
+
+        mov             v2.16b, v6.16b          // preload all four accumulators
+        mov             v3.16b, v6.16b          // with const_offset
+        mov             v4.16b, v6.16b
+        mov             v5.16b, v6.16b
+        cmp             w4, #16                 // width >= 16 ?
+
+        udot            v2.4s, v16.16b, v0.16b  // acc += pixel bytes . lower coefficient bytes
+        udot            v3.4s, v17.16b, v0.16b
+        udot            v4.4s, v18.16b, v0.16b
+        udot            v5.4s, v19.16b, v0.16b
+
+        ushr            v2.4s, v2.4s, #8        // acc >>= 8, aligning it with the upper-byte products
+        ushr            v3.4s, v3.4s, #8
+        ushr            v4.4s, v4.4s, #8
+        ushr            v5.4s, v5.4s, #8
+
+        udot            v2.4s, v16.16b, v1.16b  // acc += pixel bytes . upper coefficient bytes
+        udot            v3.4s, v17.16b, v1.16b
+        udot            v4.4s, v18.16b, v1.16b
+        udot            v5.4s, v19.16b, v1.16b
+
+        sqshrn          v16.4h, v2.4s, #1       // remaining >> 1 (total >> 9) and narrow to 16 bit
+        sqshrn2         v16.8h, v3.4s, #1
+        sqshrn          v17.4h, v4.4s, #1
+        sqshrn2         v17.8h, v5.4s, #1
+
+        stp             q16, q17, [x0], #32     // store to dst
+        b.ge            1b                      // flags still from the cmp at the top of the loop
+        cbz             w4, 3f                  // no leftover pixels (width is 32 bit, so test w4)
+2:
+        ldrb            w13, [x1]               // w13: r
+        ldrb            w14, [x1, #1]           // w14: g
+        ldrb            w15, [x1, #2]           // w15: b
+
+        smaddl          x13, w13, w10, x9       // x13 = ry * r + const_offset
+        smaddl          x13, w14, w11, x13      // x13 += gy * g
+        smaddl          x13, w15, w12, x13      // x13 += by * b
+        asr             w13, w13, #9            // w13 >>= 9
+        sub             w4, w4, #1              // width--
+        add             x1, x1, #4              // next pixel; the 4th source byte is never read here
+        strh            w13, [x0], #2           // store to dst
+        cbnz            w4, 2b
+3:
+        ret
+endfunc
+
+DISABLE_DOTPROD
+#endif
diff --git a/libswscale/aarch64/swscale.c b/libswscale/aarch64/swscale.c
index 92c49dcf3a..18746b1b19 100644
--- a/libswscale/aarch64/swscale.c
+++ b/libswscale/aarch64/swscale.c
@@ -210,6 +210,9 @@ void ff_##name##ToUV_neon(uint8_t *, uint8_t *, const uint8_t *, \
 void ff_##name##ToUV_half_neon(uint8_t *, uint8_t *, const uint8_t *, \
                               const uint8_t *, const uint8_t *, int w, \
                               uint32_t *coeffs, void *)
+#define NEON_INPUT_DOTPROD(name) /* declares ff_<name>ToY_neon_dotprod; the user supplies the ';' */ \
+void ff_##name##ToY_neon_dotprod(uint8_t *dst, const uint8_t *src, const uint8_t *, \
+                                 const uint8_t *, int w, uint32_t *coeffs, void *)
 
 NEON_INPUT(abgr32);
 NEON_INPUT(argb32);
@@ -217,6 +220,8 @@ NEON_INPUT(bgr24);
 NEON_INPUT(bgra32);
 NEON_INPUT(rgb24);
 NEON_INPUT(rgba32);
+NEON_INPUT_DOTPROD(bgra32);
+NEON_INPUT_DOTPROD(rgba32);
 
 void ff_lumRangeFromJpeg8_neon(int16_t *dst, int width,
                                uint32_t coeff, int64_t offset);
@@ -295,6 +300,12 @@ av_cold void ff_sws_init_swscale_aarch64(SwsInternal *c)
                 c->chrToYV12 = ff_bgr24ToUV_neon;
             break;
         case AV_PIX_FMT_BGRA:
+#if HAVE_DOTPROD
+            if (have_dotprod(cpu_flags)) {
+                c->lumToYV12 = ff_bgra32ToY_neon_dotprod;
+            }
+            else
+#endif
             c->lumToYV12 = ff_bgra32ToY_neon;
             if (c->chrSrcHSubSample)
                 c->chrToYV12 = ff_bgra32ToUV_half_neon;
@@ -309,6 +320,12 @@ av_cold void ff_sws_init_swscale_aarch64(SwsInternal *c)
                 c->chrToYV12 = ff_rgb24ToUV_neon;
             break;
         case AV_PIX_FMT_RGBA:
+#if HAVE_DOTPROD
+            if (have_dotprod(cpu_flags)) {
+                c->lumToYV12 = ff_rgba32ToY_neon_dotprod;
+            }
+            else
+#endif
             c->lumToYV12 = ff_rgba32ToY_neon;
             if (c->chrSrcHSubSample)
                 c->chrToYV12 = ff_rgba32ToUV_half_neon;
-- 
2.47.2

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

      reply	other threads:[~2025-03-03 21:00 UTC|newest]

Thread overview: 8+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2025-02-27 22:44 [FFmpeg-devel] [PATCH] " Krzysztof Pyrkosz via ffmpeg-devel
2025-02-28  2:31 ` Zhao Zhili
2025-02-28 10:21   ` Niklas Haas
2025-02-28 10:43     ` Martin Storsjö
2025-02-28 10:49     ` Andreas Rheinhardt
2025-02-28 11:32       ` Niklas Haas
2025-03-01 22:55 ` Martin Storsjö
2025-03-03 21:00   ` Krzysztof Pyrkosz via ffmpeg-devel [this message]

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20250303210022.7884-2-ffmpeg@szaka.eu \
    --to=ffmpeg-devel@ffmpeg.org \
    --cc=ffmpeg@szaka.eu \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

This inbox may be cloned and mirrored by anyone:

	git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
		ffmpegdev@gitmailbox.com
	public-inbox-index ffmpegdev

Example config snippet for mirrors.


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git