On Sun, Oct 30, 2022 at 05:32:35PM -0700, mindmark@gmail.com wrote:
> From: Mark Reid <mindmark@gmail.com>
> 
> ---
>  libswscale/output.c                      | 92 ++++++++++++++++++++++++
>  libswscale/swscale_unscaled.c            |  4 +-
>  libswscale/tests/floatimg_cmp.c          |  4 +-
>  libswscale/utils.c                       | 16 +++--
>  libswscale/yuv2rgb.c                     |  2 +
>  tests/ref/fate/filter-pixdesc-rgbaf32be  |  1 +
>  tests/ref/fate/filter-pixdesc-rgbaf32le  |  1 +
>  tests/ref/fate/filter-pixdesc-rgbf32be   |  1 +
>  tests/ref/fate/filter-pixdesc-rgbf32le   |  1 +
>  tests/ref/fate/filter-pixfmts-copy       |  4 ++
>  tests/ref/fate/filter-pixfmts-crop       |  4 ++
>  tests/ref/fate/filter-pixfmts-field      |  4 ++
>  tests/ref/fate/filter-pixfmts-fieldorder |  4 ++
>  tests/ref/fate/filter-pixfmts-hflip      |  4 ++
>  tests/ref/fate/filter-pixfmts-il         |  4 ++
>  tests/ref/fate/filter-pixfmts-null       |  4 ++
>  tests/ref/fate/filter-pixfmts-scale      |  4 ++
>  tests/ref/fate/filter-pixfmts-transpose  |  4 ++
>  tests/ref/fate/filter-pixfmts-vflip      |  4 ++
>  tests/ref/fate/sws-floatimg-cmp          | 16 +++++
>  20 files changed, 170 insertions(+), 8 deletions(-)
>  create mode 100644 tests/ref/fate/filter-pixdesc-rgbaf32be
>  create mode 100644 tests/ref/fate/filter-pixdesc-rgbaf32le
>  create mode 100644 tests/ref/fate/filter-pixdesc-rgbf32be
>  create mode 100644 tests/ref/fate/filter-pixdesc-rgbf32le
> 
> diff --git a/libswscale/output.c b/libswscale/output.c
> index 0e1c1225a0..e2ec9cbdf5 100644
> --- a/libswscale/output.c
> +++ b/libswscale/output.c
> @@ -2474,6 +2474,92 @@ yuv2gbrpf32_full_X_c(SwsContext *c, const int16_t *lumFilter,
>      }
>  }
>  
> +static void
> +yuv2rgbaf32_full_X_c(SwsContext *c, const int16_t *lumFilter,
> +                    const int16_t **lumSrcx, int lumFilterSize,
> +                    const int16_t *chrFilter, const int16_t **chrUSrcx,
> +                    const int16_t **chrVSrcx, int chrFilterSize,
> +                    const int16_t **alpSrcx, uint8_t *dest,
> +                    int dstW, int y)
> +{
> +    const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(c->dstFormat);
> +    int i;
> +    int alpha = desc->flags & AV_PIX_FMT_FLAG_ALPHA;
> +    int hasAlpha = alpha && alpSrcx;
> +    int pixelStep = alpha ? 4 : 3;
> +    uint32_t *dest32 = (uint32_t*)dest;
> +    const int32_t **lumSrc  = (const int32_t**)lumSrcx;
> +    const int32_t **chrUSrc = (const int32_t**)chrUSrcx;
> +    const int32_t **chrVSrc = (const int32_t**)chrVSrcx;
> +    const int32_t **alpSrc  = (const int32_t**)alpSrcx;
> +    static const float float_mult = 1.0f / 65535.0f;
> +    uint32_t a = av_float2int(1.0f);
> +
> +    for (i = 0; i < dstW; i++) {
> +        int j;
> +        int Y = -0x40000000;
> +        int U = -(128 << 23);
> +        int V = -(128 << 23);
> +        int R, G, B, A;
> +
> +        for (j = 0; j < lumFilterSize; j++)
> +            Y += lumSrc[j][i] * (unsigned)lumFilter[j];
> +
> +        for (j = 0; j < chrFilterSize; j++) {
> +            U += chrUSrc[j][i] * (unsigned)chrFilter[j];
> +            V += chrVSrc[j][i] * (unsigned)chrFilter[j];
> +        }
> +
> +        Y >>= 14;
> +        Y += 0x10000;
> +        U >>= 14;
> +        V >>= 14;
> +
> +        if (hasAlpha) {
> +            A = -0x40000000;
> +
> +            for (j = 0; j < lumFilterSize; j++)
> +                A += alpSrc[j][i] * (unsigned)lumFilter[j];
> +
> +            A >>= 1;
> +            A += 0x20002000;
> +            a = av_float2int(float_mult * (float)(av_clip_uintp2(A, 30) >> 14));
> +        }
> +
> +        Y -= c->yuv2rgb_y_offset;
> +        Y *= c->yuv2rgb_y_coeff;
> +        Y += 1 << 13;
> +        R = V * c->yuv2rgb_v2r_coeff;
> +        G = V * c->yuv2rgb_v2g_coeff + U * c->yuv2rgb_u2g_coeff;
> +        B =                            U * c->yuv2rgb_u2b_coeff;
> +

> +        R = av_clip_uintp2(Y + R, 30);
> +        G = av_clip_uintp2(Y + G, 30);
> +        B = av_clip_uintp2(Y + B, 30);

These additions (Y + R, Y + G, Y + B) can overflow, I think, given sufficiently
"bad" input, especially with the bt2020 matrix.
I've posted a proposed solution for the rgba64 / gbrp16/32f cases;
something similar can be done here.

thx

[...]
-- 
Michael     GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB

Opposition brings concord. Out of discord comes the fairest harmony.
-- Heraclitus