* [FFmpeg-devel] [PATCH] swscale/aarch64: dotprod implementation of rgba32_to_Y
@ 2025-02-27 22:44 Krzysztof Pyrkosz via ffmpeg-devel
2025-02-28 2:31 ` Zhao Zhili
0 siblings, 1 reply; 2+ messages in thread
From: Krzysztof Pyrkosz via ffmpeg-devel @ 2025-02-27 22:44 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: Krzysztof Pyrkosz
---
I was curious whether it's possible to implement this function without
any widening, and it turns out it not only is, but it's quite
performant at the same time!
The idea is to split the 16-bit coefficients into lower and upper halves,
invoke udot for the lower halves, shift right by 8, and follow with udot for
the upper halves. The code is based upon the existing NEON version.
Benchmark on A78:
bgra_to_y_128_c: 682.0 ( 1.00x)
bgra_to_y_128_neon: 181.2 ( 3.76x)
bgra_to_y_128_dotprod: 117.8 ( 5.79x)
bgra_to_y_1080_c: 5742.5 ( 1.00x)
bgra_to_y_1080_neon: 1472.5 ( 3.90x)
bgra_to_y_1080_dotprod: 906.5 ( 6.33x)
bgra_to_y_1920_c: 10194.0 ( 1.00x)
bgra_to_y_1920_neon: 2589.8 ( 3.94x)
bgra_to_y_1920_dotprod: 1573.8 ( 6.48x)
Krzysztof
libswscale/aarch64/input.S | 88 ++++++++++++++++++++++++++++++++++++
libswscale/aarch64/swscale.c | 17 +++++++
2 files changed, 105 insertions(+)
diff --git a/libswscale/aarch64/input.S b/libswscale/aarch64/input.S
index 5cb18711fb..5fe6c3f6f5 100644
--- a/libswscale/aarch64/input.S
+++ b/libswscale/aarch64/input.S
@@ -313,3 +313,91 @@ rgbToUV_neon bgr24, rgb24, element=3
rgbToUV_neon bgra32, rgba32, element=4
rgbToUV_neon abgr32, argb32, element=4, alpha_first=1
+
+#if HAVE_DOTPROD
+ENABLE_DOTPROD
+
+function ff_bgra32ToY_neon_dotprod, export=1 // BGRA entry point: loads the coefficients with w10/w12 swapped, then reuses the RGBA body at label 4
+ cmp w4, #0 // check width > 0 (w4: width, per the C prototype's 5th argument)
+ ldp w12, w11, [x5] // w12: ry, w11: gy (x5: coeffs; first two 32-bit entries)
+ ldr w10, [x5, #8] // w10: by (third entry) -- so w10 is the multiplier for source byte 0, w12 for byte 2
+ b.gt 4f // width > 0: continue at label 4 inside ff_rgba32ToY_neon_dotprod (ldp/ldr do not alter the flags)
+ ret // width <= 0: nothing to convert
+endfunc
+
+function ff_rgba32ToY_neon_dotprod, export=1 // 4-byte-per-pixel RGBA -> 16-bit luma; x0: dst, x1: src, w4: width, x5: coeffs
+ cmp w4, #0 // check width > 0
+ ldp w10, w11, [x5] // w10: ry, w11: gy
+ ldr w12, [x5, #8] // w12: by
+ b.le 3f // width <= 0: return immediately
+4: // shared body; ff_bgra32ToY_neon_dotprod enters here with w10 and w12 swapped
+ mov w9, #256 // w9 = 1 << (RGB2YUV_SHIFT - 7)
+ movk w9, #8, lsl #16 // w9 += 32 << (RGB2YUV_SHIFT - 1)
+ dup v6.4s, w9 // w9: const_offset (0x80100), replicated into every 32-bit lane of v6
+ // v0 = low bytes of the three coefficients, one byte per source channel
+ cmp w4, #16 // fewer than 16 pixels? flags are consumed by the b.lt below; mov/bfi/lsr/dup leave them intact
+ mov w7, w10 // byte 0 = low 8 bits of w10
+ bfi w7, w11, 8, 8 // byte 1 = low 8 bits of w11
+ bfi w7, w12, 16, 8 // byte 2 = low 8 bits of w12
+ dup v0.4s, w7 // NOTE(review): byte 3 (multiplies the 4th source byte) still holds bits 24-31 of w10; zero only if that coefficient is < 1 << 24 -- confirm
+ // v1 = bits 8-15 of the three coefficients, same byte layout
+ lsr w6, w10, #8
+ lsr w7, w11, #8
+ lsr w8, w12, #8
+ // NOTE(review): coefficient bits 16 and above are dropped by the bfi pair below, so the vector path presumably assumes 16-bit coefficients -- confirm
+ bfi w6, w7, 8, 8
+ bfi w6, w8, 16, 8
+ dup v1.4s, w6 // byte 3 is zero here: lsr cleared bits 24-31 of w6
+ b.lt 2f // width < 16: skip the vector loop, use the scalar loop only
+1: // vector loop: 16 pixels (64 source bytes) per iteration
+ ld1 { v16.16b, v17.16b, v18.16b, v19.16b }, [x1], #64
+ sub w4, w4, #16 // width -= 16
+ cmp w4, #16 // width >= 16 ? (consumed by b.ge at the loop end; the NEON ops and stp in between do not write NZCV)
+ // start every 32-bit accumulator lane at const_offset
+ mov v2.16b, v6.16b
+ mov v3.16b, v6.16b
+ mov v4.16b, v6.16b
+ mov v5.16b, v6.16b
+ // pass 1: each 32-bit lane += dot product of one pixel's 4 bytes with the low coefficient bytes
+ udot v2.4s, v16.16b, v0.16b
+ udot v3.4s, v17.16b, v0.16b
+ udot v4.4s, v18.16b, v0.16b
+ udot v5.4s, v19.16b, v0.16b
+ // drop 8 bits so the high-byte products of pass 2 can be added at their proper weight
+ ushr v2.4s, v2.4s, #8
+ ushr v3.4s, v3.4s, #8
+ ushr v4.4s, v4.4s, #8
+ ushr v5.4s, v5.4s, #8
+ // pass 2: each lane += dot product of the same pixel with the high coefficient bytes
+ udot v2.4s, v16.16b, v1.16b
+ udot v3.4s, v17.16b, v1.16b
+ udot v4.4s, v18.16b, v1.16b
+ udot v5.4s, v19.16b, v1.16b
+ // shift right by 1 more (9 in total, matching the scalar loop's asr #9) and narrow to 16 bits with signed saturation
+ sqshrn v16.4h, v2.4s, #1
+ sqshrn2 v16.8h, v3.4s, #1
+ sqshrn v17.4h, v4.4s, #1
+ sqshrn2 v17.8h, v5.4s, #1
+ // 16 results x 2 bytes = 32 bytes
+ stp q16, q17, [x0], #32 // store to dst
+ b.ge 1b // at least 16 pixels left: another vector iteration
+ cbz x4, 3f // no leftover pixels: done (the sub above wrote w4, which zeroes the upper half of x4)
+2: // scalar loop: one pixel per iteration, for the remaining 1..15 pixels
+ ldrb w13, [x1] // w13: r (source byte 0; this is b when entered via ff_bgra32ToY_neon_dotprod)
+ ldrb w14, [x1, #1] // w14: g
+ ldrb w15, [x1, #2] // w15: b (source byte 2; this is r when entered via ff_bgra32ToY_neon_dotprod)
+ // the 4th source byte is never loaded in this loop
+ smaddl x13, w13, w10, x9 // x13 = ry * r + const_offset (x9 holds w9 zero-extended)
+ smaddl x13, w14, w11, x13 // x13 += gy * g
+ smaddl x13, w15, w12, x13 // x13 += by * b
+ asr w13, w13, #9 // w13 >>= 9 (operates on the low 32 bits of the sum)
+ sub w4, w4, #1 // width--
+ add x1, x1, #4 // advance src by one 4-byte pixel
+ strh w13, [x0], #2 // store to dst
+ cbnz w4, 2b // loop until no pixels remain
+3:
+ ret
+endfunc
+
+DISABLE_DOTPROD
+#endif
diff --git a/libswscale/aarch64/swscale.c b/libswscale/aarch64/swscale.c
index 92c49dcf3a..ffcc6a0605 100644
--- a/libswscale/aarch64/swscale.c
+++ b/libswscale/aarch64/swscale.c
@@ -210,6 +210,9 @@ void ff_##name##ToUV_neon(uint8_t *, uint8_t *, const uint8_t *, \
void ff_##name##ToUV_half_neon(uint8_t *, uint8_t *, const uint8_t *, \
const uint8_t *, const uint8_t *, int w, \
uint32_t *coeffs, void *)
+#define NEON_INPUT_DOTPROD(name) /* declares the prototype of ff_<name>ToY_neon_dotprod */ \
+void ff_##name##ToY_neon_dotprod(uint8_t *dst, const uint8_t *src, const uint8_t *, \
+ const uint8_t *, int w, uint32_t *coeffs, void *) /* no ';' here: each use site supplies it, as with NEON_INPUT; a second one would be a stray empty declaration */
NEON_INPUT(abgr32);
NEON_INPUT(argb32);
@@ -217,6 +220,8 @@ NEON_INPUT(bgr24);
NEON_INPUT(bgra32);
NEON_INPUT(rgb24);
NEON_INPUT(rgba32);
+NEON_INPUT_DOTPROD(bgra32);
+NEON_INPUT_DOTPROD(rgba32);
void ff_lumRangeFromJpeg8_neon(int16_t *dst, int width,
uint32_t coeff, int64_t offset);
@@ -295,6 +300,12 @@ av_cold void ff_sws_init_swscale_aarch64(SwsInternal *c)
c->chrToYV12 = ff_bgr24ToUV_neon;
break;
case AV_PIX_FMT_BGRA:
+#if HAVE_DOTPROD
+ if (have_dotprod(cpu_flags)) {
+ c->lumToYV12 = ff_bgra32ToY_neon_dotprod;
+ }
+ else
+#endif
c->lumToYV12 = ff_bgra32ToY_neon;
if (c->chrSrcHSubSample)
c->chrToYV12 = ff_bgra32ToUV_half_neon;
@@ -309,6 +320,12 @@ av_cold void ff_sws_init_swscale_aarch64(SwsInternal *c)
c->chrToYV12 = ff_rgb24ToUV_neon;
break;
case AV_PIX_FMT_RGBA:
+#if HAVE_DOTPROD
+ if (have_dotprod(cpu_flags)) {
+ c->lumToYV12 = ff_rgba32ToY_neon_dotprod;
+ }
+ else
+#endif
c->lumToYV12 = ff_rgba32ToY_neon;
if (c->chrSrcHSubSample)
c->chrToYV12 = ff_rgba32ToUV_half_neon;
--
2.47.2
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 2+ messages in thread
* Re: [FFmpeg-devel] [PATCH] swscale/aarch64: dotprod implementation of rgba32_to_Y
2025-02-27 22:44 [FFmpeg-devel] [PATCH] swscale/aarch64: dotprod implementation of rgba32_to_Y Krzysztof Pyrkosz via ffmpeg-devel
@ 2025-02-28 2:31 ` Zhao Zhili
0 siblings, 0 replies; 2+ messages in thread
From: Zhao Zhili @ 2025-02-28 2:31 UTC (permalink / raw)
To: FFmpeg development discussions and patches; +Cc: Niklas Haas
Cc haasn.
Libswscale is under refactoring. Will the current asm still work after the refactor, or will it need to be refactored or
rewritten afterwards? If it’s the second case, maybe we should hold off on adding more asm to libswscale
until haasn's work is done.
> On Feb 28, 2025, at 06:44, Krzysztof Pyrkosz via ffmpeg-devel <ffmpeg-devel@ffmpeg.org> wrote:
>
> ---
> I was curious whether it's possible to implement this function without
> any widening, and it turns out it not only is, but it's quite
> performant at the same time!
>
> The idea is to split the 16 bit coefficients into lower and upper half,
> invoke udot for the lower half, shift by 8, and follow by udot for the
> upper half. The code is based upon existing version.
>
> Benchmark on A78:
> bgra_to_y_128_c: 682.0 ( 1.00x)
> bgra_to_y_128_neon: 181.2 ( 3.76x)
> bgra_to_y_128_dotprod: 117.8 ( 5.79x)
> bgra_to_y_1080_c: 5742.5 ( 1.00x)
> bgra_to_y_1080_neon: 1472.5 ( 3.90x)
> bgra_to_y_1080_dotprod: 906.5 ( 6.33x)
> bgra_to_y_1920_c: 10194.0 ( 1.00x)
> bgra_to_y_1920_neon: 2589.8 ( 3.94x)
> bgra_to_y_1920_dotprod: 1573.8 ( 6.48x)
>
> Krzysztof
>
> libswscale/aarch64/input.S | 88 ++++++++++++++++++++++++++++++++++++
> libswscale/aarch64/swscale.c | 17 +++++++
> 2 files changed, 105 insertions(+)
>
> diff --git a/libswscale/aarch64/input.S b/libswscale/aarch64/input.S
> index 5cb18711fb..5fe6c3f6f5 100644
> --- a/libswscale/aarch64/input.S
> +++ b/libswscale/aarch64/input.S
> @@ -313,3 +313,91 @@ rgbToUV_neon bgr24, rgb24, element=3
> rgbToUV_neon bgra32, rgba32, element=4
>
> rgbToUV_neon abgr32, argb32, element=4, alpha_first=1
> +
> +#if HAVE_DOTPROD
> +ENABLE_DOTPROD
> +
> +function ff_bgra32ToY_neon_dotprod, export=1
> + cmp w4, #0 // check width > 0
> + ldp w12, w11, [x5] // w12: ry, w11: gy
> + ldr w10, [x5, #8] // w10: by
> + b.gt 4f
> + ret
> +endfunc
> +
> +function ff_rgba32ToY_neon_dotprod, export=1
> + cmp w4, #0 // check width > 0
> + ldp w10, w11, [x5] // w10: ry, w11: gy
> + ldr w12, [x5, #8] // w12: by
> + b.le 3f
> +4:
> + mov w9, #256 // w9 = 1 << (RGB2YUV_SHIFT - 7)
> + movk w9, #8, lsl #16 // w9 += 32 << (RGB2YUV_SHIFT - 1)
> + dup v6.4s, w9 // w9: const_offset
> +
> + cmp w4, #16
> + mov w7, w10
> + bfi w7, w11, 8, 8
> + bfi w7, w12, 16, 8
> + dup v0.4s, w7
> +
> + lsr w6, w10, #8
> + lsr w7, w11, #8
> + lsr w8, w12, #8
> +
> + bfi w6, w7, 8, 8
> + bfi w6, w8, 16, 8
> + dup v1.4s, w6
> + b.lt 2f
> +1:
> + ld1 { v16.16b, v17.16b, v18.16b, v19.16b }, [x1], #64
> + sub w4, w4, #16 // width -= 16
> + cmp w4, #16 // width >= 16 ?
> +
> + mov v2.16b, v6.16b
> + mov v3.16b, v6.16b
> + mov v4.16b, v6.16b
> + mov v5.16b, v6.16b
> +
> + udot v2.4s, v16.16b, v0.16b
> + udot v3.4s, v17.16b, v0.16b
> + udot v4.4s, v18.16b, v0.16b
> + udot v5.4s, v19.16b, v0.16b
> +
> + ushr v2.4s, v2.4s, #8
> + ushr v3.4s, v3.4s, #8
> + ushr v4.4s, v4.4s, #8
> + ushr v5.4s, v5.4s, #8
> +
> + udot v2.4s, v16.16b, v1.16b
> + udot v3.4s, v17.16b, v1.16b
> + udot v4.4s, v18.16b, v1.16b
> + udot v5.4s, v19.16b, v1.16b
> +
> + sqshrn v16.4h, v2.4s, #1
> + sqshrn2 v16.8h, v3.4s, #1
> + sqshrn v17.4h, v4.4s, #1
> + sqshrn2 v17.8h, v5.4s, #1
> +
> + stp q16, q17, [x0], #32 // store to dst
> + b.ge 1b
> + cbz x4, 3f
> +2:
> + ldrb w13, [x1] // w13: r
> + ldrb w14, [x1, #1] // w14: g
> + ldrb w15, [x1, #2] // w15: b
> +
> + smaddl x13, w13, w10, x9 // x13 = ry * r + const_offset
> + smaddl x13, w14, w11, x13 // x13 += gy * g
> + smaddl x13, w15, w12, x13 // x13 += by * b
> + asr w13, w13, #9 // x13 >>= 9
> + sub w4, w4, #1 // width--
> + add x1, x1, #4
> + strh w13, [x0], #2 // store to dst
> + cbnz w4, 2b
> +3:
> + ret
> +endfunc
> +
> +DISABLE_DOTPROD
> +#endif
> diff --git a/libswscale/aarch64/swscale.c b/libswscale/aarch64/swscale.c
> index 92c49dcf3a..ffcc6a0605 100644
> --- a/libswscale/aarch64/swscale.c
> +++ b/libswscale/aarch64/swscale.c
> @@ -210,6 +210,9 @@ void ff_##name##ToUV_neon(uint8_t *, uint8_t *, const uint8_t *, \
> void ff_##name##ToUV_half_neon(uint8_t *, uint8_t *, const uint8_t *, \
> const uint8_t *, const uint8_t *, int w, \
> uint32_t *coeffs, void *)
> +#define NEON_INPUT_DOTPROD(name) \
> +void ff_##name##ToY_neon_dotprod(uint8_t *dst, const uint8_t *src, const uint8_t *, \
> + const uint8_t *, int w, uint32_t *coeffs, void *);
>
> NEON_INPUT(abgr32);
> NEON_INPUT(argb32);
> @@ -217,6 +220,8 @@ NEON_INPUT(bgr24);
> NEON_INPUT(bgra32);
> NEON_INPUT(rgb24);
> NEON_INPUT(rgba32);
> +NEON_INPUT_DOTPROD(bgra32);
> +NEON_INPUT_DOTPROD(rgba32);
>
> void ff_lumRangeFromJpeg8_neon(int16_t *dst, int width,
> uint32_t coeff, int64_t offset);
> @@ -295,6 +300,12 @@ av_cold void ff_sws_init_swscale_aarch64(SwsInternal *c)
> c->chrToYV12 = ff_bgr24ToUV_neon;
> break;
> case AV_PIX_FMT_BGRA:
> +#if HAVE_DOTPROD
> + if (have_dotprod(cpu_flags)) {
> + c->lumToYV12 = ff_bgra32ToY_neon_dotprod;
> + }
> + else
> +#endif
> c->lumToYV12 = ff_bgra32ToY_neon;
> if (c->chrSrcHSubSample)
> c->chrToYV12 = ff_bgra32ToUV_half_neon;
> @@ -309,6 +320,12 @@ av_cold void ff_sws_init_swscale_aarch64(SwsInternal *c)
> c->chrToYV12 = ff_rgb24ToUV_neon;
> break;
> case AV_PIX_FMT_RGBA:
> +#if HAVE_DOTPROD
> + if (have_dotprod(cpu_flags)) {
> + c->lumToYV12 = ff_rgba32ToY_neon_dotprod;
> + }
> + else
> +#endif
> c->lumToYV12 = ff_rgba32ToY_neon;
> if (c->chrSrcHSubSample)
> c->chrToYV12 = ff_rgba32ToUV_half_neon;
> --
> 2.47.2
>
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 2+ messages in thread
end of thread, other threads:[~2025-02-28 2:31 UTC | newest]
Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2025-02-27 22:44 [FFmpeg-devel] [PATCH] swscale/aarch64: dotprod implementation of rgba32_to_Y Krzysztof Pyrkosz via ffmpeg-devel
2025-02-28 2:31 ` Zhao Zhili
Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
This inbox may be cloned and mirrored by anyone:
git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git
# If you have public-inbox 1.1+ installed, you may
# initialize and index your mirror using the following commands:
public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
ffmpegdev@gitmailbox.com
public-inbox-index ffmpegdev
Example config snippet for mirrors.
AGPL code for this site: git clone https://public-inbox.org/public-inbox.git