Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
 help / color / mirror / Atom feed
* [FFmpeg-devel] [PATCH] swscale/aarch64: dotprod implementation of rgba32_to_Y
@ 2025-02-27 22:44 Krzysztof Pyrkosz via ffmpeg-devel
  2025-02-28  2:31 ` Zhao Zhili
  0 siblings, 1 reply; 6+ messages in thread
From: Krzysztof Pyrkosz via ffmpeg-devel @ 2025-02-27 22:44 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: Krzysztof Pyrkosz

---
I was curious whether it's possible to implement this function without
any widening, and it turns out it not only is, but it's quite
performant at the same time!

The idea is to split the 16-bit coefficients into lower and upper halves,
invoke udot for the lower half, shift right by 8, and follow with udot for
the upper half. The code is based upon the existing NEON version.

Benchmark on A78:
bgra_to_y_128_c:                                       682.0 ( 1.00x)
bgra_to_y_128_neon:                                    181.2 ( 3.76x)
bgra_to_y_128_dotprod:                                 117.8 ( 5.79x)
bgra_to_y_1080_c:                                     5742.5 ( 1.00x)
bgra_to_y_1080_neon:                                  1472.5 ( 3.90x)
bgra_to_y_1080_dotprod:                                906.5 ( 6.33x)
bgra_to_y_1920_c:                                    10194.0 ( 1.00x)
bgra_to_y_1920_neon:                                  2589.8 ( 3.94x)
bgra_to_y_1920_dotprod:                               1573.8 ( 6.48x)

Krzysztof

 libswscale/aarch64/input.S   | 88 ++++++++++++++++++++++++++++++++++++
 libswscale/aarch64/swscale.c | 17 +++++++
 2 files changed, 105 insertions(+)

diff --git a/libswscale/aarch64/input.S b/libswscale/aarch64/input.S
index 5cb18711fb..5fe6c3f6f5 100644
--- a/libswscale/aarch64/input.S
+++ b/libswscale/aarch64/input.S
@@ -313,3 +313,91 @@ rgbToUV_neon bgr24, rgb24, element=3
 rgbToUV_neon bgra32, rgba32, element=4
 
 rgbToUV_neon abgr32, argb32, element=4, alpha_first=1
+
+#if HAVE_DOTPROD
+ENABLE_DOTPROD
+
+function ff_bgra32ToY_neon_dotprod, export=1
+        cmp             w4, #0                  // check width > 0
+        ldp             w12, w11, [x5]          // w12: ry, w11: gy
+        ldr             w10, [x5, #8]           // w10: by
+        b.gt            4f
+        ret
+endfunc
+
+function ff_rgba32ToY_neon_dotprod, export=1
+        cmp             w4, #0                  // check width > 0
+        ldp             w10, w11, [x5]          // w10: ry, w11: gy
+        ldr             w12, [x5, #8]           // w12: by
+        b.le            3f
+4:
+        mov             w9, #256                // w9 = 1 << (RGB2YUV_SHIFT - 7)
+        movk            w9, #8, lsl #16         // w9 += 32 << (RGB2YUV_SHIFT - 1)
+        dup             v6.4s, w9               // w9: const_offset
+
+        cmp             w4, #16
+        mov             w7, w10
+        bfi             w7, w11, 8, 8
+        bfi             w7, w12, 16, 8
+        dup             v0.4s, w7
+
+        lsr             w6, w10, #8
+        lsr             w7, w11, #8
+        lsr             w8, w12, #8
+
+        bfi             w6, w7, 8, 8
+        bfi             w6, w8, 16, 8
+        dup             v1.4s, w6
+        b.lt            2f
+1:
+        ld1             { v16.16b, v17.16b, v18.16b, v19.16b }, [x1], #64
+        sub             w4, w4, #16             // width -= 16
+        cmp             w4, #16                 // width >= 16 ?
+
+        mov             v2.16b, v6.16b
+        mov             v3.16b, v6.16b
+        mov             v4.16b, v6.16b
+        mov             v5.16b, v6.16b
+
+        udot            v2.4s, v16.16b, v0.16b
+        udot            v3.4s, v17.16b, v0.16b
+        udot            v4.4s, v18.16b, v0.16b
+        udot            v5.4s, v19.16b, v0.16b
+
+        ushr            v2.4s, v2.4s, #8
+        ushr            v3.4s, v3.4s, #8
+        ushr            v4.4s, v4.4s, #8
+        ushr            v5.4s, v5.4s, #8
+
+        udot            v2.4s, v16.16b, v1.16b
+        udot            v3.4s, v17.16b, v1.16b
+        udot            v4.4s, v18.16b, v1.16b
+        udot            v5.4s, v19.16b, v1.16b
+
+        sqshrn          v16.4h, v2.4s, #1
+        sqshrn2         v16.8h, v3.4s, #1
+        sqshrn          v17.4h, v4.4s, #1
+        sqshrn2         v17.8h, v5.4s, #1
+
+        stp             q16, q17, [x0], #32     // store to dst
+        b.ge            1b
+        cbz             x4, 3f
+2:
+        ldrb            w13, [x1]               // w13: r
+        ldrb            w14, [x1, #1]           // w14: g
+        ldrb            w15, [x1, #2]           // w15: b
+
+        smaddl          x13, w13, w10, x9       // x13 = ry * r + const_offset
+        smaddl          x13, w14, w11, x13      // x13 += gy * g
+        smaddl          x13, w15, w12, x13      // x13 += by * b
+        asr             w13, w13, #9            // x13 >>= 9
+        sub             w4, w4, #1              // width--
+        add             x1, x1, #4
+        strh            w13, [x0], #2           // store to dst
+        cbnz            w4, 2b
+3:
+        ret
+endfunc
+
+DISABLE_DOTPROD
+#endif
diff --git a/libswscale/aarch64/swscale.c b/libswscale/aarch64/swscale.c
index 92c49dcf3a..ffcc6a0605 100644
--- a/libswscale/aarch64/swscale.c
+++ b/libswscale/aarch64/swscale.c
@@ -210,6 +210,9 @@ void ff_##name##ToUV_neon(uint8_t *, uint8_t *, const uint8_t *, \
 void ff_##name##ToUV_half_neon(uint8_t *, uint8_t *, const uint8_t *, \
                               const uint8_t *, const uint8_t *, int w, \
                               uint32_t *coeffs, void *)
+#define NEON_INPUT_DOTPROD(name) \
+void ff_##name##ToY_neon_dotprod(uint8_t *dst, const uint8_t *src, const uint8_t *, \
+                                 const uint8_t *, int w, uint32_t *coeffs, void *);
 
 NEON_INPUT(abgr32);
 NEON_INPUT(argb32);
@@ -217,6 +220,8 @@ NEON_INPUT(bgr24);
 NEON_INPUT(bgra32);
 NEON_INPUT(rgb24);
 NEON_INPUT(rgba32);
+NEON_INPUT_DOTPROD(bgra32);
+NEON_INPUT_DOTPROD(rgba32);
 
 void ff_lumRangeFromJpeg8_neon(int16_t *dst, int width,
                                uint32_t coeff, int64_t offset);
@@ -295,6 +300,12 @@ av_cold void ff_sws_init_swscale_aarch64(SwsInternal *c)
                 c->chrToYV12 = ff_bgr24ToUV_neon;
             break;
         case AV_PIX_FMT_BGRA:
+#if HAVE_DOTPROD
+            if (have_dotprod(cpu_flags)) {
+                c->lumToYV12 = ff_bgra32ToY_neon_dotprod;
+            }
+	    else
+#endif
             c->lumToYV12 = ff_bgra32ToY_neon;
             if (c->chrSrcHSubSample)
                 c->chrToYV12 = ff_bgra32ToUV_half_neon;
@@ -309,6 +320,12 @@ av_cold void ff_sws_init_swscale_aarch64(SwsInternal *c)
                 c->chrToYV12 = ff_rgb24ToUV_neon;
             break;
         case AV_PIX_FMT_RGBA:
+#if HAVE_DOTPROD
+            if (have_dotprod(cpu_flags)) {
+                c->lumToYV12 = ff_rgba32ToY_neon_dotprod;
+            }
+	    else
+#endif
             c->lumToYV12 = ff_rgba32ToY_neon;
             if (c->chrSrcHSubSample)
                 c->chrToYV12 = ff_rgba32ToUV_half_neon;
-- 
2.47.2

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [FFmpeg-devel] [PATCH] swscale/aarch64: dotprod implementation of rgba32_to_Y
  2025-02-27 22:44 [FFmpeg-devel] [PATCH] swscale/aarch64: dotprod implementation of rgba32_to_Y Krzysztof Pyrkosz via ffmpeg-devel
@ 2025-02-28  2:31 ` Zhao Zhili
  2025-02-28 10:21   ` Niklas Haas
  0 siblings, 1 reply; 6+ messages in thread
From: Zhao Zhili @ 2025-02-28  2:31 UTC (permalink / raw)
  To: FFmpeg development discussions and patches; +Cc: Niklas Haas

Cc haasn.

Libswscale is being refactored. Will the current asm still work after the refactor, or will it need to be
refactored or rewritten afterwards? If it's the second case, maybe we should hold off on adding more asm
to libswscale until haasn's work is done.

> On Feb 28, 2025, at 06:44, Krzysztof Pyrkosz via ffmpeg-devel <ffmpeg-devel@ffmpeg.org> wrote:
> 
> ---
> I was curious whether it's possible to implement this function without
> any widening, and it turns out it not only is, but it's quite
> performant at the same time!
> 
> The idea is to split the 16 bit coefficients into lower and upper half,
> invoke udot for the lower half, shift by 8, and follow by udot for the
> upper half. The code is based upon existing version.
> 
> Benchmark on A78:
> bgra_to_y_128_c:                                       682.0 ( 1.00x)
> bgra_to_y_128_neon:                                    181.2 ( 3.76x)
> bgra_to_y_128_dotprod:                                 117.8 ( 5.79x)
> bgra_to_y_1080_c:                                     5742.5 ( 1.00x)
> bgra_to_y_1080_neon:                                  1472.5 ( 3.90x)
> bgra_to_y_1080_dotprod:                                906.5 ( 6.33x)
> bgra_to_y_1920_c:                                    10194.0 ( 1.00x)
> bgra_to_y_1920_neon:                                  2589.8 ( 3.94x)
> bgra_to_y_1920_dotprod:                               1573.8 ( 6.48x)
> 
> Krzysztof
> 
> libswscale/aarch64/input.S   | 88 ++++++++++++++++++++++++++++++++++++
> libswscale/aarch64/swscale.c | 17 +++++++
> 2 files changed, 105 insertions(+)
> 
> diff --git a/libswscale/aarch64/input.S b/libswscale/aarch64/input.S
> index 5cb18711fb..5fe6c3f6f5 100644
> --- a/libswscale/aarch64/input.S
> +++ b/libswscale/aarch64/input.S
> @@ -313,3 +313,91 @@ rgbToUV_neon bgr24, rgb24, element=3
> rgbToUV_neon bgra32, rgba32, element=4
> 
> rgbToUV_neon abgr32, argb32, element=4, alpha_first=1
> +
> +#if HAVE_DOTPROD
> +ENABLE_DOTPROD
> +
> +function ff_bgra32ToY_neon_dotprod, export=1
> +        cmp             w4, #0                  // check width > 0
> +        ldp             w12, w11, [x5]          // w12: ry, w11: gy
> +        ldr             w10, [x5, #8]           // w10: by
> +        b.gt            4f
> +        ret
> +endfunc
> +
> +function ff_rgba32ToY_neon_dotprod, export=1
> +        cmp             w4, #0                  // check width > 0
> +        ldp             w10, w11, [x5]          // w10: ry, w11: gy
> +        ldr             w12, [x5, #8]           // w12: by
> +        b.le            3f
> +4:
> +        mov             w9, #256                // w9 = 1 << (RGB2YUV_SHIFT - 7)
> +        movk            w9, #8, lsl #16         // w9 += 32 << (RGB2YUV_SHIFT - 1)
> +        dup             v6.4s, w9               // w9: const_offset
> +
> +        cmp             w4, #16
> +        mov             w7, w10
> +        bfi             w7, w11, 8, 8
> +        bfi             w7, w12, 16, 8
> +        dup             v0.4s, w7
> +
> +        lsr             w6, w10, #8
> +        lsr             w7, w11, #8
> +        lsr             w8, w12, #8
> +
> +        bfi             w6, w7, 8, 8
> +        bfi             w6, w8, 16, 8
> +        dup             v1.4s, w6
> +        b.lt            2f
> +1:
> +        ld1             { v16.16b, v17.16b, v18.16b, v19.16b }, [x1], #64
> +        sub             w4, w4, #16             // width -= 16
> +        cmp             w4, #16                 // width >= 16 ?
> +
> +        mov             v2.16b, v6.16b
> +        mov             v3.16b, v6.16b
> +        mov             v4.16b, v6.16b
> +        mov             v5.16b, v6.16b
> +
> +        udot            v2.4s, v16.16b, v0.16b
> +        udot            v3.4s, v17.16b, v0.16b
> +        udot            v4.4s, v18.16b, v0.16b
> +        udot            v5.4s, v19.16b, v0.16b
> +
> +        ushr            v2.4s, v2.4s, #8
> +        ushr            v3.4s, v3.4s, #8
> +        ushr            v4.4s, v4.4s, #8
> +        ushr            v5.4s, v5.4s, #8
> +
> +        udot            v2.4s, v16.16b, v1.16b
> +        udot            v3.4s, v17.16b, v1.16b
> +        udot            v4.4s, v18.16b, v1.16b
> +        udot            v5.4s, v19.16b, v1.16b
> +
> +        sqshrn          v16.4h, v2.4s, #1
> +        sqshrn2         v16.8h, v3.4s, #1
> +        sqshrn          v17.4h, v4.4s, #1
> +        sqshrn2         v17.8h, v5.4s, #1
> +
> +        stp             q16, q17, [x0], #32     // store to dst
> +        b.ge            1b
> +        cbz             x4, 3f
> +2:
> +        ldrb            w13, [x1]               // w13: r
> +        ldrb            w14, [x1, #1]           // w14: g
> +        ldrb            w15, [x1, #2]           // w15: b
> +
> +        smaddl          x13, w13, w10, x9       // x13 = ry * r + const_offset
> +        smaddl          x13, w14, w11, x13      // x13 += gy * g
> +        smaddl          x13, w15, w12, x13      // x13 += by * b
> +        asr             w13, w13, #9            // x13 >>= 9
> +        sub             w4, w4, #1              // width--
> +        add             x1, x1, #4
> +        strh            w13, [x0], #2           // store to dst
> +        cbnz            w4, 2b
> +3:
> +        ret
> +endfunc
> +
> +DISABLE_DOTPROD
> +#endif
> diff --git a/libswscale/aarch64/swscale.c b/libswscale/aarch64/swscale.c
> index 92c49dcf3a..ffcc6a0605 100644
> --- a/libswscale/aarch64/swscale.c
> +++ b/libswscale/aarch64/swscale.c
> @@ -210,6 +210,9 @@ void ff_##name##ToUV_neon(uint8_t *, uint8_t *, const uint8_t *, \
> void ff_##name##ToUV_half_neon(uint8_t *, uint8_t *, const uint8_t *, \
>                               const uint8_t *, const uint8_t *, int w, \
>                               uint32_t *coeffs, void *)
> +#define NEON_INPUT_DOTPROD(name) \
> +void ff_##name##ToY_neon_dotprod(uint8_t *dst, const uint8_t *src, const uint8_t *, \
> +                                 const uint8_t *, int w, uint32_t *coeffs, void *);
> 
> NEON_INPUT(abgr32);
> NEON_INPUT(argb32);
> @@ -217,6 +220,8 @@ NEON_INPUT(bgr24);
> NEON_INPUT(bgra32);
> NEON_INPUT(rgb24);
> NEON_INPUT(rgba32);
> +NEON_INPUT_DOTPROD(bgra32);
> +NEON_INPUT_DOTPROD(rgba32);
> 
> void ff_lumRangeFromJpeg8_neon(int16_t *dst, int width,
>                                uint32_t coeff, int64_t offset);
> @@ -295,6 +300,12 @@ av_cold void ff_sws_init_swscale_aarch64(SwsInternal *c)
>                 c->chrToYV12 = ff_bgr24ToUV_neon;
>             break;
>         case AV_PIX_FMT_BGRA:
> +#if HAVE_DOTPROD
> +            if (have_dotprod(cpu_flags)) {
> +                c->lumToYV12 = ff_bgra32ToY_neon_dotprod;
> +            }
> +	    else
> +#endif
>             c->lumToYV12 = ff_bgra32ToY_neon;
>             if (c->chrSrcHSubSample)
>                 c->chrToYV12 = ff_bgra32ToUV_half_neon;
> @@ -309,6 +320,12 @@ av_cold void ff_sws_init_swscale_aarch64(SwsInternal *c)
>                 c->chrToYV12 = ff_rgb24ToUV_neon;
>             break;
>         case AV_PIX_FMT_RGBA:
> +#if HAVE_DOTPROD
> +            if (have_dotprod(cpu_flags)) {
> +                c->lumToYV12 = ff_rgba32ToY_neon_dotprod;
> +            }
> +	    else
> +#endif
>             c->lumToYV12 = ff_rgba32ToY_neon;
>             if (c->chrSrcHSubSample)
>                 c->chrToYV12 = ff_rgba32ToUV_half_neon;
> -- 
> 2.47.2
> 
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> 
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [FFmpeg-devel] [PATCH] swscale/aarch64: dotprod implementation of rgba32_to_Y
  2025-02-28  2:31 ` Zhao Zhili
@ 2025-02-28 10:21   ` Niklas Haas
  2025-02-28 10:43     ` Martin Storsjö
  2025-02-28 10:49     ` Andreas Rheinhardt
  0 siblings, 2 replies; 6+ messages in thread
From: Niklas Haas @ 2025-02-28 10:21 UTC (permalink / raw)
  To: FFmpeg development discussions and patches

On Fri, 28 Feb 2025 10:31:19 +0800 Zhao Zhili <quinkblack@foxmail.com> wrote:
> Cc haasn.
>
> Libswscale in under refactor. Does current asm works after refactor, or they need to be refactored or
> rewrite after? If it’s the second case, maybe we should hold on to do more asm with libswscale
> before hassn work done.

No, almost all current asm will be unused after the rewrite. There are some we
can in theory reuse, but for the most part, it doesn't seem to be worth it.

Especially for the very bespoke functions like this one.

For context, in general, the focus in nu-swscale is more on smaller,
flexible primitives, with the calling code combining them as needed. So instead
of a "bgra_to_y" function, you would have a sequence that looks like this:

Operation list:
  [ u8 XXXX -> dddX] SWS_OP_READ         : 4 elem(s) packed >> 0
  [ u8 ...X -> dddX] SWS_OP_SWIZZLE      : 2103
  [ u8 ...X -> dddX] SWS_OP_CONVERT      : u8 -> f32
  [f32 ...X -> .XXX] SWS_OP_LINEAR       : dot3 [[0.299000 0.587000 0.114000 0 0] [0 1 0 0 0] [0 0 1 0 0] [0 0 0 1 0]]
  [f32 .XXX -> .XXX] SWS_OP_DITHER       : 16x16 {255 _ _ _}
  [f32 .XXX -> dXXX] SWS_OP_CONVERT      : f32 -> u8
  [ u8 .XXX -> XXXX] SWS_OP_WRITE        : 1 elem(s) packed >> 0

Where each low-level implementation can combine one, or multiple, such
operations together. For example, in the current prototype, SWS_OP_CONVERT and
SWS_OP_WRITE can be fused together into a single implementation.

Note also the conversion to float. I found that the cost of going through
floats seems to be lower on average, across all tested platforms, than the
extra cost of dealing with integers (which require extra shifting, extra
dithering, and extra width conversions - all of which exceed the cost of just
one extra float->int conversion step). This also comes with improved accuracy.

>
> > On Feb 28, 2025, at 06:44, Krzysztof Pyrkosz via ffmpeg-devel <ffmpeg-devel@ffmpeg.org> wrote:
> >
> > ---
> > I was curious whether it's possible to implement this function without
> > any widening, and it turns out it not only is, but it's quite
> > performant at the same time!
> >
> > The idea is to split the 16 bit coefficients into lower and upper half,
> > invoke udot for the lower half, shift by 8, and follow by udot for the
> > upper half. The code is based upon existing version.
> >
> > Benchmark on A78:
> > bgra_to_y_128_c:                                       682.0 ( 1.00x)
> > bgra_to_y_128_neon:                                    181.2 ( 3.76x)
> > bgra_to_y_128_dotprod:                                 117.8 ( 5.79x)
> > bgra_to_y_1080_c:                                     5742.5 ( 1.00x)
> > bgra_to_y_1080_neon:                                  1472.5 ( 3.90x)
> > bgra_to_y_1080_dotprod:                                906.5 ( 6.33x)
> > bgra_to_y_1920_c:                                    10194.0 ( 1.00x)
> > bgra_to_y_1920_neon:                                  2589.8 ( 3.94x)
> > bgra_to_y_1920_dotprod:                               1573.8 ( 6.48x)
> >
> > Krzysztof
> >
> > libswscale/aarch64/input.S   | 88 ++++++++++++++++++++++++++++++++++++
> > libswscale/aarch64/swscale.c | 17 +++++++
> > 2 files changed, 105 insertions(+)
> >
> > diff --git a/libswscale/aarch64/input.S b/libswscale/aarch64/input.S
> > index 5cb18711fb..5fe6c3f6f5 100644
> > --- a/libswscale/aarch64/input.S
> > +++ b/libswscale/aarch64/input.S
> > @@ -313,3 +313,91 @@ rgbToUV_neon bgr24, rgb24, element=3
> > rgbToUV_neon bgra32, rgba32, element=4
> >
> > rgbToUV_neon abgr32, argb32, element=4, alpha_first=1
> > +
> > +#if HAVE_DOTPROD
> > +ENABLE_DOTPROD
> > +
> > +function ff_bgra32ToY_neon_dotprod, export=1
> > +        cmp             w4, #0                  // check width > 0
> > +        ldp             w12, w11, [x5]          // w12: ry, w11: gy
> > +        ldr             w10, [x5, #8]           // w10: by
> > +        b.gt            4f
> > +        ret
> > +endfunc
> > +
> > +function ff_rgba32ToY_neon_dotprod, export=1
> > +        cmp             w4, #0                  // check width > 0
> > +        ldp             w10, w11, [x5]          // w10: ry, w11: gy
> > +        ldr             w12, [x5, #8]           // w12: by
> > +        b.le            3f
> > +4:
> > +        mov             w9, #256                // w9 = 1 << (RGB2YUV_SHIFT - 7)
> > +        movk            w9, #8, lsl #16         // w9 += 32 << (RGB2YUV_SHIFT - 1)
> > +        dup             v6.4s, w9               // w9: const_offset
> > +
> > +        cmp             w4, #16
> > +        mov             w7, w10
> > +        bfi             w7, w11, 8, 8
> > +        bfi             w7, w12, 16, 8
> > +        dup             v0.4s, w7
> > +
> > +        lsr             w6, w10, #8
> > +        lsr             w7, w11, #8
> > +        lsr             w8, w12, #8
> > +
> > +        bfi             w6, w7, 8, 8
> > +        bfi             w6, w8, 16, 8
> > +        dup             v1.4s, w6
> > +        b.lt            2f
> > +1:
> > +        ld1             { v16.16b, v17.16b, v18.16b, v19.16b }, [x1], #64
> > +        sub             w4, w4, #16             // width -= 16
> > +        cmp             w4, #16                 // width >= 16 ?
> > +
> > +        mov             v2.16b, v6.16b
> > +        mov             v3.16b, v6.16b
> > +        mov             v4.16b, v6.16b
> > +        mov             v5.16b, v6.16b
> > +
> > +        udot            v2.4s, v16.16b, v0.16b
> > +        udot            v3.4s, v17.16b, v0.16b
> > +        udot            v4.4s, v18.16b, v0.16b
> > +        udot            v5.4s, v19.16b, v0.16b
> > +
> > +        ushr            v2.4s, v2.4s, #8
> > +        ushr            v3.4s, v3.4s, #8
> > +        ushr            v4.4s, v4.4s, #8
> > +        ushr            v5.4s, v5.4s, #8
> > +
> > +        udot            v2.4s, v16.16b, v1.16b
> > +        udot            v3.4s, v17.16b, v1.16b
> > +        udot            v4.4s, v18.16b, v1.16b
> > +        udot            v5.4s, v19.16b, v1.16b
> > +
> > +        sqshrn          v16.4h, v2.4s, #1
> > +        sqshrn2         v16.8h, v3.4s, #1
> > +        sqshrn          v17.4h, v4.4s, #1
> > +        sqshrn2         v17.8h, v5.4s, #1
> > +
> > +        stp             q16, q17, [x0], #32     // store to dst
> > +        b.ge            1b
> > +        cbz             x4, 3f
> > +2:
> > +        ldrb            w13, [x1]               // w13: r
> > +        ldrb            w14, [x1, #1]           // w14: g
> > +        ldrb            w15, [x1, #2]           // w15: b
> > +
> > +        smaddl          x13, w13, w10, x9       // x13 = ry * r + const_offset
> > +        smaddl          x13, w14, w11, x13      // x13 += gy * g
> > +        smaddl          x13, w15, w12, x13      // x13 += by * b
> > +        asr             w13, w13, #9            // x13 >>= 9
> > +        sub             w4, w4, #1              // width--
> > +        add             x1, x1, #4
> > +        strh            w13, [x0], #2           // store to dst
> > +        cbnz            w4, 2b
> > +3:
> > +        ret
> > +endfunc
> > +
> > +DISABLE_DOTPROD
> > +#endif
> > diff --git a/libswscale/aarch64/swscale.c b/libswscale/aarch64/swscale.c
> > index 92c49dcf3a..ffcc6a0605 100644
> > --- a/libswscale/aarch64/swscale.c
> > +++ b/libswscale/aarch64/swscale.c
> > @@ -210,6 +210,9 @@ void ff_##name##ToUV_neon(uint8_t *, uint8_t *, const uint8_t *, \
> > void ff_##name##ToUV_half_neon(uint8_t *, uint8_t *, const uint8_t *, \
> >                               const uint8_t *, const uint8_t *, int w, \
> >                               uint32_t *coeffs, void *)
> > +#define NEON_INPUT_DOTPROD(name) \
> > +void ff_##name##ToY_neon_dotprod(uint8_t *dst, const uint8_t *src, const uint8_t *, \
> > +                                 const uint8_t *, int w, uint32_t *coeffs, void *);
> >
> > NEON_INPUT(abgr32);
> > NEON_INPUT(argb32);
> > @@ -217,6 +220,8 @@ NEON_INPUT(bgr24);
> > NEON_INPUT(bgra32);
> > NEON_INPUT(rgb24);
> > NEON_INPUT(rgba32);
> > +NEON_INPUT_DOTPROD(bgra32);
> > +NEON_INPUT_DOTPROD(rgba32);
> >
> > void ff_lumRangeFromJpeg8_neon(int16_t *dst, int width,
> >                                uint32_t coeff, int64_t offset);
> > @@ -295,6 +300,12 @@ av_cold void ff_sws_init_swscale_aarch64(SwsInternal *c)
> >                 c->chrToYV12 = ff_bgr24ToUV_neon;
> >             break;
> >         case AV_PIX_FMT_BGRA:
> > +#if HAVE_DOTPROD
> > +            if (have_dotprod(cpu_flags)) {
> > +                c->lumToYV12 = ff_bgra32ToY_neon_dotprod;
> > +            }
> > +	    else
> > +#endif
> >             c->lumToYV12 = ff_bgra32ToY_neon;
> >             if (c->chrSrcHSubSample)
> >                 c->chrToYV12 = ff_bgra32ToUV_half_neon;
> > @@ -309,6 +320,12 @@ av_cold void ff_sws_init_swscale_aarch64(SwsInternal *c)
> >                 c->chrToYV12 = ff_rgb24ToUV_neon;
> >             break;
> >         case AV_PIX_FMT_RGBA:
> > +#if HAVE_DOTPROD
> > +            if (have_dotprod(cpu_flags)) {
> > +                c->lumToYV12 = ff_rgba32ToY_neon_dotprod;
> > +            }
> > +	    else
> > +#endif
> >             c->lumToYV12 = ff_rgba32ToY_neon;
> >             if (c->chrSrcHSubSample)
> >                 c->chrToYV12 = ff_rgba32ToUV_half_neon;
> > --
> > 2.47.2
> >
> > _______________________________________________
> > ffmpeg-devel mailing list
> > ffmpeg-devel@ffmpeg.org
> > https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> >
> > To unsubscribe, visit link above, or email
> > ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
>
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [FFmpeg-devel] [PATCH] swscale/aarch64: dotprod implementation of rgba32_to_Y
  2025-02-28 10:21   ` Niklas Haas
@ 2025-02-28 10:43     ` Martin Storsjö
  2025-02-28 10:49     ` Andreas Rheinhardt
  1 sibling, 0 replies; 6+ messages in thread
From: Martin Storsjö @ 2025-02-28 10:43 UTC (permalink / raw)
  To: FFmpeg development discussions and patches

On Fri, 28 Feb 2025, Niklas Haas wrote:

> On Fri, 28 Feb 2025 10:31:19 +0800 Zhao Zhili <quinkblack@foxmail.com> wrote:
>> Cc haasn.
>>
>> Libswscale in under refactor. Does current asm works after refactor, or they need to be refactored or
>> rewrite after? If it’s the second case, maybe we should hold on to do more asm with libswscale
>> before hassn work done.
>
> No, almost all current asm will be unused after the rewrite. There are some we
> can in theory reuse, but for the most part, it doesn't seem to be worth it.

Despite that, I don't see a problem with adding more asm for what we
currently have, until we actually do replace it with the new rewrite.

// Martin
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [FFmpeg-devel] [PATCH] swscale/aarch64: dotprod implementation of rgba32_to_Y
  2025-02-28 10:21   ` Niklas Haas
  2025-02-28 10:43     ` Martin Storsjö
@ 2025-02-28 10:49     ` Andreas Rheinhardt
  2025-02-28 11:32       ` Niklas Haas
  1 sibling, 1 reply; 6+ messages in thread
From: Andreas Rheinhardt @ 2025-02-28 10:49 UTC (permalink / raw)
  To: ffmpeg-devel

Niklas Haas:
> On Fri, 28 Feb 2025 10:31:19 +0800 Zhao Zhili <quinkblack@foxmail.com> wrote:
>> Cc haasn.
>>
>> Libswscale in under refactor. Does current asm works after refactor, or they need to be refactored or
>> rewrite after? If it’s the second case, maybe we should hold on to do more asm with libswscale
>> before hassn work done.
> 
> No, almost all current asm will be unused after the rewrite. There are some we
> can in theory reuse, but for the most part, it doesn't seem to be worth it.
> 
> Especially for the very bespoke functions like this one.
> 
> For context, in general, the focus in nu-swscale is to focus more on smaller,
> flexible primitives and have the calling code combine them as needed. So instead
> of a "brga_to_y" function, you would have a sequence that looks like this:
> 
> Operation list:
>   [ u8 XXXX -> dddX] SWS_OP_READ         : 4 elem(s) packed >> 0
>   [ u8 ...X -> dddX] SWS_OP_SWIZZLE      : 2103
>   [ u8 ...X -> dddX] SWS_OP_CONVERT      : u8 -> f32
>   [f32 ...X -> .XXX] SWS_OP_LINEAR       : dot3 [[0.299000 0.587000 0.114000 0 0] [0 1 0 0 0] [0 0 1 0 0] [0 0 0 1 0]]
>   [f32 .XXX -> .XXX] SWS_OP_DITHER       : 16x16 {255 _ _ _}
>   [f32 .XXX -> dXXX] SWS_OP_CONVERT      : f32 -> u8
>   [ u8 .XXX -> XXXX] SWS_OP_WRITE        : 1 elem(s) packed >> 0
> 
> Where each low-level implementation can combine one, or multiple, such
> operations together. For example, in the current prototype, SWS_OP_CONVERT and
> SWS_OP_WRITE can be fused together into a single implementation.
> 
> Note also the conversion to float. I found that the cost of going through
> floats seems to be lower on average, across all tested platforms, than the
> extra cost of dealing with integers (which require extra shifting, extra
> dithering, and extra width conversions - all of which exceed the cost of just
> one extra float->int conversion step). This also comes with improved accuracy.
> 
But what about bitexactness?

- Andreas

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [FFmpeg-devel] [PATCH] swscale/aarch64: dotprod implementation of rgba32_to_Y
  2025-02-28 10:49     ` Andreas Rheinhardt
@ 2025-02-28 11:32       ` Niklas Haas
  0 siblings, 0 replies; 6+ messages in thread
From: Niklas Haas @ 2025-02-28 11:32 UTC (permalink / raw)
  To: ffmpeg-devel

On Fri, 28 Feb 2025 11:49:53 +0100 Andreas Rheinhardt <andreas.rheinhardt@outlook.com> wrote:
> Niklas Haas:
> > On Fri, 28 Feb 2025 10:31:19 +0800 Zhao Zhili <quinkblack@foxmail.com> wrote:
> >> Cc haasn.
> >>
> >> Libswscale in under refactor. Does current asm works after refactor, or they need to be refactored or
> >> rewrite after? If it’s the second case, maybe we should hold on to do more asm with libswscale
> >> before hassn work done.
> >
> > No, almost all current asm will be unused after the rewrite. There are some we
> > can in theory reuse, but for the most part, it doesn't seem to be worth it.
> >
> > Especially for the very bespoke functions like this one.
> >
> > For context, in general, the focus in nu-swscale is to focus more on smaller,
> > flexible primitives and have the calling code combine them as needed. So instead
> > of a "brga_to_y" function, you would have a sequence that looks like this:
> >
> > Operation list:
> >   [ u8 XXXX -> dddX] SWS_OP_READ         : 4 elem(s) packed >> 0
> >   [ u8 ...X -> dddX] SWS_OP_SWIZZLE      : 2103
> >   [ u8 ...X -> dddX] SWS_OP_CONVERT      : u8 -> f32
> >   [f32 ...X -> .XXX] SWS_OP_LINEAR       : dot3 [[0.299000 0.587000 0.114000 0 0] [0 1 0 0 0] [0 0 1 0 0] [0 0 0 1 0]]
> >   [f32 .XXX -> .XXX] SWS_OP_DITHER       : 16x16 {255 _ _ _}
> >   [f32 .XXX -> dXXX] SWS_OP_CONVERT      : f32 -> u8
> >   [ u8 .XXX -> XXXX] SWS_OP_WRITE        : 1 elem(s) packed >> 0
> >
> > Where each low-level implementation can combine one, or multiple, such
> > operations together. For example, in the current prototype, SWS_OP_CONVERT and
> > SWS_OP_WRITE can be fused together into a single implementation.
> >
> > Note also the conversion to float. I found that the cost of going through
> > floats seems to be lower on average, across all tested platforms, than the
> > extra cost of dealing with integers (which require extra shifting, extra
> > dithering, and extra width conversions - all of which exceed the cost of just
> > one extra float->int conversion step). This also comes with improved accuracy.
> >
> But what about bitexactness?

Are you worried about bitexactness relative to an integer implementation, or
bitexactness between platforms?

For the former, all coefficients are calculated as AVRational, using only
exact values (e.g. matrix coefficients as taken from the spec), and collapsed
down to a single linear operation in the end. This guarantees no loss of
precision, as long as we pick a floating point precision in the end that is
sufficient to store all of the needed bits of precision. (This can even be
determined automatically)

For example, if the input is 16 bit or below, a 32 bit float is enough to
guarantee bitexactness. For 32 bit or higher inputs, we would need to bump
up to 64 bit intermediates, although note that swscale currently does not
accept 32 bit integer coefficients in any case.

Also, in the special cases where all matrix coefficients are integers (which
we can easily check for AVRational), we can even skip the float conversion
(and dithering) steps entirely and collapse it down to e.g. a pure bit shift.

For the latter, as long as all platforms implement IEEE semantics I don't
think there is any room for deviation.

>
> - Andreas
>
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2025-02-28 11:32 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2025-02-27 22:44 [FFmpeg-devel] [PATCH] swscale/aarch64: dotprod implementation of rgba32_to_Y Krzysztof Pyrkosz via ffmpeg-devel
2025-02-28  2:31 ` Zhao Zhili
2025-02-28 10:21   ` Niklas Haas
2025-02-28 10:43     ` Martin Storsjö
2025-02-28 10:49     ` Andreas Rheinhardt
2025-02-28 11:32       ` Niklas Haas

Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

This inbox may be cloned and mirrored by anyone:

	git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
		ffmpegdev@gitmailbox.com
	public-inbox-index ffmpegdev

Example config snippet for mirrors.


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git