* [FFmpeg-devel] [PATCH] swscale/input: add rgbaf16 input support
@ 2022-08-08 18:23 Timo Rothenpieler
2022-08-08 19:39 ` Mark Reid
0 siblings, 1 reply; 6+ messages in thread
From: Timo Rothenpieler @ 2022-08-08 18:23 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: Timo Rothenpieler
This is by no means perfect, since at least ddagrab will return scRGB
data with values outside of 0.0f to 1.0f for HDR values.
Its primary purpose is to be able to work with the format at all.
_Float16 support was available on arm/aarch64 for a while, and with gcc
12 was enabled on x86 as long as SSE2 is supported.
If the target arch supports f16c, gcc emits fairly efficient assembly,
taking advantage of it. This is the case on x86-64-v3 or higher.
Without f16c, it emulates it in software using sse2 instructions.
---
I am by no means certain this is the correct way to implement this.
Tested it with ddagrab output in that format, and it looks like what I'd
expect.
Especially the order of arguments is a bit of a mystery. I'd have
expected them to be in order of the planes, so for packed formats, only
the first one would matter.
But a bunch of other packed formats left the first src unused, and so I
followed along, and it ended up working fine.
configure | 2 +
libswscale/input.c | 95 ++++++++++++++++++++++++++++++++++++++++++++
libswscale/utils.c | 3 ++
libswscale/version.h | 2 +-
4 files changed, 101 insertions(+), 1 deletion(-)
diff --git a/configure b/configure
index 6761d0cb32..d989498bba 100755
--- a/configure
+++ b/configure
@@ -2143,6 +2143,7 @@ ARCH_FEATURES="
fast_64bit
fast_clz
fast_cmov
+ float16
local_aligned
simd_align_16
simd_align_32
@@ -6228,6 +6229,7 @@ check_builtin MemoryBarrier windows.h "MemoryBarrier()"
check_builtin sync_val_compare_and_swap "" "int *ptr; int oldval, newval; __sync_val_compare_and_swap(ptr, oldval, newval)"
check_builtin gmtime_r time.h "time_t *time; struct tm *tm; gmtime_r(time, tm)"
check_builtin localtime_r time.h "time_t *time; struct tm *tm; localtime_r(time, tm)"
+check_builtin float16 "" "_Float16 f16var"
case "$custom_allocator" in
jemalloc)
diff --git a/libswscale/input.c b/libswscale/input.c
index 68abc4d62c..0b5bd952e8 100644
--- a/libswscale/input.c
+++ b/libswscale/input.c
@@ -1111,6 +1111,89 @@ static void grayf32##endian_name##ToY16_c(uint8_t *dst, const uint8_t *src,
rgbf32_planar_funcs_endian(le, 0)
rgbf32_planar_funcs_endian(be, 1)
+static void rgbaf16ToUV_half_c(uint8_t *_dstU, uint8_t *_dstV,
+ const uint8_t *unused0, const uint8_t *src1, const uint8_t *src2,
+ int width, uint32_t *_rgb2yuv)
+{
+#if HAVE_FLOAT16
+ const _Float16 *src = (const _Float16*)src1;
+ uint16_t *dstU = (uint16_t*)_dstU;
+ uint16_t *dstV = (uint16_t*)_dstV;
+ int32_t *rgb2yuv = (int32_t*)_rgb2yuv;
+ int32_t ru = rgb2yuv[RU_IDX], gu = rgb2yuv[GU_IDX], bu = rgb2yuv[BU_IDX];
+ int32_t rv = rgb2yuv[RV_IDX], gv = rgb2yuv[GV_IDX], bv = rgb2yuv[BV_IDX];
+ int i;
+ av_assert1(src1==src2);
+ for (i = 0; i < width; i++) {
+ int r = (lrintf(av_clipf(65535.0f * src[i*8+0], 0.0f, 65535.0f)) +
+ lrintf(av_clipf(65535.0f * src[i*8+4], 0.0f, 65535.0f))) >> 1;
+ int g = (lrintf(av_clipf(65535.0f * src[i*8+1], 0.0f, 65535.0f)) +
+ lrintf(av_clipf(65535.0f * src[i*8+5], 0.0f, 65535.0f))) >> 1;
+ int b = (lrintf(av_clipf(65535.0f * src[i*8+2], 0.0f, 65535.0f)) +
+ lrintf(av_clipf(65535.0f * src[i*8+6], 0.0f, 65535.0f))) >> 1;
+
+ dstU[i] = (ru*r + gu*g + bu*b + (0x10001<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
+ dstV[i] = (rv*r + gv*g + bv*b + (0x10001<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
+ }
+#endif
+}
+
+static void rgbaf16ToUV_c(uint8_t *_dstU, uint8_t *_dstV,
+ const uint8_t *unused0, const uint8_t *src1, const uint8_t *src2,
+ int width, uint32_t *_rgb2yuv)
+{
+#if HAVE_FLOAT16
+ const _Float16 *src = (const _Float16*)src1;
+ uint16_t *dstU = (uint16_t*)_dstU;
+ uint16_t *dstV = (uint16_t*)_dstV;
+ int32_t *rgb2yuv = (int32_t*)_rgb2yuv;
+ int32_t ru = rgb2yuv[RU_IDX], gu = rgb2yuv[GU_IDX], bu = rgb2yuv[BU_IDX];
+ int32_t rv = rgb2yuv[RV_IDX], gv = rgb2yuv[GV_IDX], bv = rgb2yuv[BV_IDX];
+ int i;
+ av_assert1(src1==src2);
+ for (i = 0; i < width; i++) {
+ int r = lrintf(av_clipf(65535.0f * src[i*4+0], 0.0f, 65535.0f));
+ int g = lrintf(av_clipf(65535.0f * src[i*4+1], 0.0f, 65535.0f));
+ int b = lrintf(av_clipf(65535.0f * src[i*4+2], 0.0f, 65535.0f));
+
+ dstU[i] = (ru*r + gu*g + bu*b + (0x10001<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
+ dstV[i] = (rv*r + gv*g + bv*b + (0x10001<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
+ }
+#endif
+}
+
+static void rgbaf16ToY_c(uint8_t *_dst, const uint8_t *_src, const uint8_t *unused0, const uint8_t *unused1,
+ int width, uint32_t *_rgb2yuv)
+{
+#if HAVE_FLOAT16
+ const _Float16 *src = (const _Float16*)_src;
+ uint16_t *dst = (uint16_t*)_dst;
+ int32_t *rgb2yuv = (int32_t*)_rgb2yuv;
+ int32_t ry = rgb2yuv[RY_IDX], gy = rgb2yuv[GY_IDX], by = rgb2yuv[BY_IDX];
+ int i;
+ for (i = 0; i < width; i++) {
+ int r = lrintf(av_clipf(65535.0f * src[i*4+0], 0.0f, 65535.0f));
+ int g = lrintf(av_clipf(65535.0f * src[i*4+1], 0.0f, 65535.0f));
+ int b = lrintf(av_clipf(65535.0f * src[i*4+2], 0.0f, 65535.0f));
+
+ dst[i] = (ry*r + gy*g + by*b + (0x2001<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
+ }
+#endif
+}
+
+static void rgbaf16ToA_c(uint8_t *_dst, const uint8_t *_src, const uint8_t *unused0, const uint8_t *unused1,
+ int width, uint32_t *unused2)
+{
+#if HAVE_FLOAT16
+ const _Float16 *src = (const _Float16*)_src;
+ uint16_t *dst = (uint16_t*)_dst;
+ int i;
+ for (i=0; i<width; i++) {
+ dst[i] = lrintf(av_clipf(65535.0f * src[i*4+3], 0.0f, 65535.0f));
+ }
+#endif
+}
+
av_cold void ff_sws_init_input_funcs(SwsContext *c)
{
enum AVPixelFormat srcFormat = c->srcFormat;
@@ -1375,6 +1458,9 @@ av_cold void ff_sws_init_input_funcs(SwsContext *c)
case AV_PIX_FMT_X2BGR10LE:
c->chrToYV12 = bgr30leToUV_half_c;
break;
+ case AV_PIX_FMT_RGBAF16:
+ c->chrToYV12 = rgbaf16ToUV_half_c;
+ break;
}
} else {
switch (srcFormat) {
@@ -1462,6 +1548,9 @@ av_cold void ff_sws_init_input_funcs(SwsContext *c)
case AV_PIX_FMT_X2BGR10LE:
c->chrToYV12 = bgr30leToUV_c;
break;
+ case AV_PIX_FMT_RGBAF16:
+ c->chrToYV12 = rgbaf16ToUV_c;
+ break;
}
}
@@ -1750,6 +1839,9 @@ av_cold void ff_sws_init_input_funcs(SwsContext *c)
case AV_PIX_FMT_X2BGR10LE:
c->lumToYV12 = bgr30leToY_c;
break;
+ case AV_PIX_FMT_RGBAF16:
+ c->lumToYV12 = rgbaf16ToY_c;
+ break;
}
if (c->needAlpha) {
if (is16BPS(srcFormat) || isNBPS(srcFormat)) {
@@ -1769,6 +1861,9 @@ av_cold void ff_sws_init_input_funcs(SwsContext *c)
case AV_PIX_FMT_ARGB:
c->alpToYV12 = abgrToA_c;
break;
+ case AV_PIX_FMT_RGBAF16:
+ c->alpToYV12 = rgbaf16ToA_c;
+ break;
case AV_PIX_FMT_YA8:
c->alpToYV12 = uyvyToY_c;
break;
diff --git a/libswscale/utils.c b/libswscale/utils.c
index 34503e57f4..c5c22017ff 100644
--- a/libswscale/utils.c
+++ b/libswscale/utils.c
@@ -259,6 +259,9 @@ static const FormatEntry format_entries[] = {
[AV_PIX_FMT_P416LE] = { 1, 1 },
[AV_PIX_FMT_NV16] = { 1, 1 },
[AV_PIX_FMT_VUYA] = { 1, 1 },
+#if HAVE_FLOAT16
+ [AV_PIX_FMT_RGBAF16] = { 1, 0 },
+#endif
};
int ff_shuffle_filter_coefficients(SwsContext *c, int *filterPos,
diff --git a/libswscale/version.h b/libswscale/version.h
index 3193562d18..d8694bb5c0 100644
--- a/libswscale/version.h
+++ b/libswscale/version.h
@@ -29,7 +29,7 @@
#include "version_major.h"
#define LIBSWSCALE_VERSION_MINOR 8
-#define LIBSWSCALE_VERSION_MICRO 102
+#define LIBSWSCALE_VERSION_MICRO 103
#define LIBSWSCALE_VERSION_INT AV_VERSION_INT(LIBSWSCALE_VERSION_MAJOR, \
LIBSWSCALE_VERSION_MINOR, \
--
2.34.1
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [FFmpeg-devel] [PATCH] swscale/input: add rgbaf16 input support
2022-08-08 18:23 [FFmpeg-devel] [PATCH] swscale/input: add rgbaf16 input support Timo Rothenpieler
@ 2022-08-08 19:39 ` Mark Reid
2022-08-08 20:59 ` Timo Rothenpieler
0 siblings, 1 reply; 6+ messages in thread
From: Mark Reid @ 2022-08-08 19:39 UTC (permalink / raw)
To: FFmpeg development discussions and patches; +Cc: Timo Rothenpieler
On Mon, Aug 8, 2022 at 11:24 AM Timo Rothenpieler <timo@rothenpieler.org>
wrote:
> This is by no means perfect, since at least ddagrab will return scRGB
> data with values outside of 0.0f to 1.0f for HDR values.
> Its primary purpose is to be able to work with the format at all.
>
> _Float16 support was available on arm/aarch64 for a while, and with gcc
> 12 was enabled on x86 as long as SSE2 is supported.
>
> If the target arch supports f16c, gcc emits fairly efficient assembly,
> taking advantage of it. This is the case on x86-64-v3 or higher.
> Without f16c, it emulates it in software using sse2 instructions.
> ---
>
> I am by no means certain this is the correct way to implement this.
> Tested it with ddagrab output in that format, and it looks like what I'd
> expect.
>
> Specially the order of arguments is a bit of a mystery. I'd have
> expected them to be in order of the planes, so for packed formats, only
> the first one would matter.
> But a bunch of other packed formats left the first src unused, and so I
> followed along, and it ended up working fine.
>
>
Have you looked at the exr decoder half2float.h? It already has f16 to f32
decoding functions.
> configure | 2 +
> libswscale/input.c | 95 ++++++++++++++++++++++++++++++++++++++++++++
> libswscale/utils.c | 3 ++
> libswscale/version.h | 2 +-
> 4 files changed, 101 insertions(+), 1 deletion(-)
>
> diff --git a/configure b/configure
> index 6761d0cb32..d989498bba 100755
> --- a/configure
> +++ b/configure
> @@ -2143,6 +2143,7 @@ ARCH_FEATURES="
> fast_64bit
> fast_clz
> fast_cmov
> + float16
> local_aligned
> simd_align_16
> simd_align_32
> @@ -6228,6 +6229,7 @@ check_builtin MemoryBarrier windows.h
> "MemoryBarrier()"
> check_builtin sync_val_compare_and_swap "" "int *ptr; int oldval, newval;
> __sync_val_compare_and_swap(ptr, oldval, newval)"
> check_builtin gmtime_r time.h "time_t *time; struct tm *tm;
> gmtime_r(time, tm)"
> check_builtin localtime_r time.h "time_t *time; struct tm *tm;
> localtime_r(time, tm)"
> +check_builtin float16 "" "_Float16 f16var"
>
> case "$custom_allocator" in
> jemalloc)
> diff --git a/libswscale/input.c b/libswscale/input.c
> index 68abc4d62c..0b5bd952e8 100644
> --- a/libswscale/input.c
> +++ b/libswscale/input.c
> @@ -1111,6 +1111,89 @@ static void grayf32##endian_name##ToY16_c(uint8_t
> *dst, const uint8_t *src,
> rgbf32_planar_funcs_endian(le, 0)
> rgbf32_planar_funcs_endian(be, 1)
>
> +static void rgbaf16ToUV_half_c(uint8_t *_dstU, uint8_t *_dstV,
> + const uint8_t *unused0, const uint8_t
> *src1, const uint8_t *src2,
> + int width, uint32_t *_rgb2yuv)
> +{
> +#if HAVE_FLOAT16
> + const _Float16 *src = (const _Float16*)src1;
> + uint16_t *dstU = (uint16_t*)_dstU;
> + uint16_t *dstV = (uint16_t*)_dstV;
> + int32_t *rgb2yuv = (int32_t*)_rgb2yuv;
> + int32_t ru = rgb2yuv[RU_IDX], gu = rgb2yuv[GU_IDX], bu =
> rgb2yuv[BU_IDX];
> + int32_t rv = rgb2yuv[RV_IDX], gv = rgb2yuv[GV_IDX], bv =
> rgb2yuv[BV_IDX];
> + int i;
> + av_assert1(src1==src2);
> + for (i = 0; i < width; i++) {
> + int r = (lrintf(av_clipf(65535.0f * src[i*8+0], 0.0f, 65535.0f)) +
> + lrintf(av_clipf(65535.0f * src[i*8+4], 0.0f, 65535.0f)))
> >> 1;
> + int g = (lrintf(av_clipf(65535.0f * src[i*8+1], 0.0f, 65535.0f)) +
> + lrintf(av_clipf(65535.0f * src[i*8+5], 0.0f, 65535.0f)))
> >> 1;
> + int b = (lrintf(av_clipf(65535.0f * src[i*8+2], 0.0f, 65535.0f)) +
> + lrintf(av_clipf(65535.0f * src[i*8+6], 0.0f, 65535.0f)))
> >> 1;
> +
> + dstU[i] = (ru*r + gu*g + bu*b + (0x10001<<(RGB2YUV_SHIFT-1))) >>
> RGB2YUV_SHIFT;
> + dstV[i] = (rv*r + gv*g + bv*b + (0x10001<<(RGB2YUV_SHIFT-1))) >>
> RGB2YUV_SHIFT;
> + }
> +#endif
> +}
> +
> +static void rgbaf16ToUV_c(uint8_t *_dstU, uint8_t *_dstV,
> + const uint8_t *unused0, const uint8_t *src1,
> const uint8_t *src2,
> + int width, uint32_t *_rgb2yuv)
> +{
> +#if HAVE_FLOAT16
> + const _Float16 *src = (const _Float16*)src1;
> + uint16_t *dstU = (uint16_t*)_dstU;
> + uint16_t *dstV = (uint16_t*)_dstV;
> + int32_t *rgb2yuv = (int32_t*)_rgb2yuv;
> + int32_t ru = rgb2yuv[RU_IDX], gu = rgb2yuv[GU_IDX], bu =
> rgb2yuv[BU_IDX];
> + int32_t rv = rgb2yuv[RV_IDX], gv = rgb2yuv[GV_IDX], bv =
> rgb2yuv[BV_IDX];
> + int i;
> + av_assert1(src1==src2);
> + for (i = 0; i < width; i++) {
> + int r = lrintf(av_clipf(65535.0f * src[i*4+0], 0.0f, 65535.0f));
> + int g = lrintf(av_clipf(65535.0f * src[i*4+1], 0.0f, 65535.0f));
> + int b = lrintf(av_clipf(65535.0f * src[i*4+2], 0.0f, 65535.0f));
> +
> + dstU[i] = (ru*r + gu*g + bu*b + (0x10001<<(RGB2YUV_SHIFT-1))) >>
> RGB2YUV_SHIFT;
> + dstV[i] = (rv*r + gv*g + bv*b + (0x10001<<(RGB2YUV_SHIFT-1))) >>
> RGB2YUV_SHIFT;
> + }
> +#endif
> +}
> +
> +static void rgbaf16ToY_c(uint8_t *_dst, const uint8_t *_src, const
> uint8_t *unused0, const uint8_t *unused1,
> + int width, uint32_t *_rgb2yuv)
> +{
> +#if HAVE_FLOAT16
> + const _Float16 *src = (const _Float16*)_src;
> + uint16_t *dst = (uint16_t*)_dst;
> + int32_t *rgb2yuv = (int32_t*)_rgb2yuv;
> + int32_t ry = rgb2yuv[RY_IDX], gy = rgb2yuv[GY_IDX], by =
> rgb2yuv[BY_IDX];
> + int i;
> + for (i = 0; i < width; i++) {
> + int r = lrintf(av_clipf(65535.0f * src[i*4+0], 0.0f, 65535.0f));
> + int g = lrintf(av_clipf(65535.0f * src[i*4+1], 0.0f, 65535.0f));
> + int b = lrintf(av_clipf(65535.0f * src[i*4+2], 0.0f, 65535.0f));
> +
> + dst[i] = (ry*r + gy*g + by*b + (0x2001<<(RGB2YUV_SHIFT-1))) >>
> RGB2YUV_SHIFT;
> + }
> +#endif
> +}
> +
> +static void rgbaf16ToA_c(uint8_t *_dst, const uint8_t *_src, const
> uint8_t *unused0, const uint8_t *unused1,
> + int width, uint32_t *unused2)
> +{
> +#if HAVE_FLOAT16
> + const _Float16 *src = (const _Float16*)_src;
> + uint16_t *dst = (uint16_t*)_dst;
> + int i;
> + for (i=0; i<width; i++) {
> + dst[i] = lrintf(av_clipf(65535.0f * src[i*4+3], 0.0f, 65535.0f));
> + }
> +#endif
> +}
> +
> av_cold void ff_sws_init_input_funcs(SwsContext *c)
> {
> enum AVPixelFormat srcFormat = c->srcFormat;
> @@ -1375,6 +1458,9 @@ av_cold void ff_sws_init_input_funcs(SwsContext *c)
> case AV_PIX_FMT_X2BGR10LE:
> c->chrToYV12 = bgr30leToUV_half_c;
> break;
> + case AV_PIX_FMT_RGBAF16:
> + c->chrToYV12 = rgbaf16ToUV_half_c;
> + break;
> }
> } else {
> switch (srcFormat) {
> @@ -1462,6 +1548,9 @@ av_cold void ff_sws_init_input_funcs(SwsContext *c)
> case AV_PIX_FMT_X2BGR10LE:
> c->chrToYV12 = bgr30leToUV_c;
> break;
> + case AV_PIX_FMT_RGBAF16:
> + c->chrToYV12 = rgbaf16ToUV_c;
> + break;
> }
> }
>
> @@ -1750,6 +1839,9 @@ av_cold void ff_sws_init_input_funcs(SwsContext *c)
> case AV_PIX_FMT_X2BGR10LE:
> c->lumToYV12 = bgr30leToY_c;
> break;
> + case AV_PIX_FMT_RGBAF16:
> + c->lumToYV12 = rgbaf16ToY_c;
> + break;
> }
> if (c->needAlpha) {
> if (is16BPS(srcFormat) || isNBPS(srcFormat)) {
> @@ -1769,6 +1861,9 @@ av_cold void ff_sws_init_input_funcs(SwsContext *c)
> case AV_PIX_FMT_ARGB:
> c->alpToYV12 = abgrToA_c;
> break;
> + case AV_PIX_FMT_RGBAF16:
> + c->alpToYV12 = rgbaf16ToA_c;
> + break;
> case AV_PIX_FMT_YA8:
> c->alpToYV12 = uyvyToY_c;
> break;
> diff --git a/libswscale/utils.c b/libswscale/utils.c
> index 34503e57f4..c5c22017ff 100644
> --- a/libswscale/utils.c
> +++ b/libswscale/utils.c
> @@ -259,6 +259,9 @@ static const FormatEntry format_entries[] = {
> [AV_PIX_FMT_P416LE] = { 1, 1 },
> [AV_PIX_FMT_NV16] = { 1, 1 },
> [AV_PIX_FMT_VUYA] = { 1, 1 },
> +#if HAVE_FLOAT16
> + [AV_PIX_FMT_RGBAF16] = { 1, 0 },
> +#endif
> };
>
> int ff_shuffle_filter_coefficients(SwsContext *c, int *filterPos,
> diff --git a/libswscale/version.h b/libswscale/version.h
> index 3193562d18..d8694bb5c0 100644
> --- a/libswscale/version.h
> +++ b/libswscale/version.h
> @@ -29,7 +29,7 @@
> #include "version_major.h"
>
> #define LIBSWSCALE_VERSION_MINOR 8
> -#define LIBSWSCALE_VERSION_MICRO 102
> +#define LIBSWSCALE_VERSION_MICRO 103
>
> #define LIBSWSCALE_VERSION_INT AV_VERSION_INT(LIBSWSCALE_VERSION_MAJOR, \
> LIBSWSCALE_VERSION_MINOR, \
> --
> 2.34.1
>
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
>
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [FFmpeg-devel] [PATCH] swscale/input: add rgbaf16 input support
2022-08-08 19:39 ` Mark Reid
@ 2022-08-08 20:59 ` Timo Rothenpieler
2022-08-08 22:07 ` Mark Reid
0 siblings, 1 reply; 6+ messages in thread
From: Timo Rothenpieler @ 2022-08-08 20:59 UTC (permalink / raw)
To: ffmpeg-devel
On 08.08.2022 21:39, Mark Reid wrote:
> On Mon, Aug 8, 2022 at 11:24 AM Timo Rothenpieler <timo@rothenpieler.org>
> wrote:
>
>> This is by no means perfect, since at least ddagrab will return scRGB
>> data with values outside of 0.0f to 1.0f for HDR values.
>> Its primary purpose is to be able to work with the format at all.
>>
>> _Float16 support was available on arm/aarch64 for a while, and with gcc
>> 12 was enabled on x86 as long as SSE2 is supported.
>>
>> If the target arch supports f16c, gcc emits fairly efficient assembly,
>> taking advantage of it. This is the case on x86-64-v3 or higher.
>> Without f16c, it emulates it in software using sse2 instructions.
>> ---
>>
>> I am by no means certain this is the correct way to implement this.
>> Tested it with ddagrab output in that format, and it looks like what I'd
>> expect.
>>
>> Specially the order of arguments is a bit of a mystery. I'd have
>> expected them to be in order of the planes, so for packed formats, only
>> the first one would matter.
>> But a bunch of other packed formats left the first src unused, and so I
>> followed along, and it ended up working fine.
>>
>>
> Have you looked at the exr decoder half2float.h? It already has f16 to f32
> decoding functions.
>
For performance, using the compiler's native, and potentially hardware
accelerated, support is probably preferable.
Though as a no-float16-fallback it's probably not too horrible.
Just not sure if it's worth the extra effort, given that by the time
this sees any use at all, gcc 12 will be very common.
Might even think about _Float16 support for exr in that case.
Would be an interesting benchmark.
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [FFmpeg-devel] [PATCH] swscale/input: add rgbaf16 input support
2022-08-08 20:59 ` Timo Rothenpieler
@ 2022-08-08 22:07 ` Mark Reid
2022-08-08 22:37 ` Timo Rothenpieler
0 siblings, 1 reply; 6+ messages in thread
From: Mark Reid @ 2022-08-08 22:07 UTC (permalink / raw)
To: FFmpeg development discussions and patches
On Mon, Aug 8, 2022 at 1:59 PM Timo Rothenpieler <timo@rothenpieler.org>
wrote:
> On 08.08.2022 21:39, Mark Reid wrote:
> > On Mon, Aug 8, 2022 at 11:24 AM Timo Rothenpieler <timo@rothenpieler.org
> >
> > wrote:
> >
> >> This is by no means perfect, since at least ddagrab will return scRGB
> >> data with values outside of 0.0f to 1.0f for HDR values.
> >> Its primary purpose is to be able to work with the format at all.
> >>
> >> _Float16 support was available on arm/aarch64 for a while, and with gcc
> >> 12 was enabled on x86 as long as SSE2 is supported.
> >>
> >> If the target arch supports f16c, gcc emits fairly efficient assembly,
> >> taking advantage of it. This is the case on x86-64-v3 or higher.
> >> Without f16c, it emulates it in software using sse2 instructions.
> >> ---
> >>
> >> I am by no means certain this is the correct way to implement this.
> >> Tested it with ddagrab output in that format, and it looks like what I'd
> >> expect.
> >>
> >> Specially the order of arguments is a bit of a mystery. I'd have
> >> expected them to be in order of the planes, so for packed formats, only
> >> the first one would matter.
> >> But a bunch of other packed formats left the first src unused, and so I
> >> followed along, and it ended up working fine.
> >>
> >>
> > Have you looked at the exr decoder half2float.h? It already has f16 to
> f32
> > decoding functions.
> >
>
> For performance, using the compilers native, and potentially hardware
> accelerated, support is probably preferable.
> Though as a no-float16-fallback it's probably not too horrible.
> Just not sure if it's worth the extra effort, given that by the time
> this sees any use at all, gcc 12 will be very common.
>
> Might even think about _Float16 support for exr in that case.
> Would be an interesting benchmark.
>
Having the fallback will likely be required to have this patch accepted;
this will also need FATE tests.
+static void rgbaf16ToUV_half_c(uint8_t *_dstU, uint8_t *_dstV,
> + const uint8_t *unused0, const uint8_t
> *src1, const uint8_t *src2,
> + int width, uint32_t *_rgb2yuv)
> +{
> +#if HAVE_FLOAT16
> + const _Float16 *src = (const _Float16*)src1;
> + uint16_t *dstU = (uint16_t*)_dstU;
> + uint16_t *dstV = (uint16_t*)_dstV;
> + int32_t *rgb2yuv = (int32_t*)_rgb2yuv;
> + int32_t ru = rgb2yuv[RU_IDX], gu = rgb2yuv[GU_IDX], bu =
> rgb2yuv[BU_IDX];
> + int32_t rv = rgb2yuv[RV_IDX], gv = rgb2yuv[GV_IDX], bv =
> rgb2yuv[BV_IDX];
> + int i;
> + av_assert1(src1==src2);
> + for (i = 0; i < width; i++) {
> + int r = (lrintf(av_clipf(65535.0f * src[i*8+0], 0.0f, 65535.0f)) +
> + lrintf(av_clipf(65535.0f * src[i*8+4], 0.0f, 65535.0f)))
> >> 1;
> + int g = (lrintf(av_clipf(65535.0f * src[i*8+1], 0.0f, 65535.0f)) +
> + lrintf(av_clipf(65535.0f * src[i*8+5], 0.0f, 65535.0f)))
> >> 1;
> + int b = (lrintf(av_clipf(65535.0f * src[i*8+2], 0.0f, 65535.0f)) +
> + lrintf(av_clipf(65535.0f * src[i*8+6], 0.0f, 65535.0f)))
> >> 1;
> +
> + dstU[i] = (ru*r + gu*g + bu*b + (0x10001<<(RGB2YUV_SHIFT-1))) >>
> RGB2YUV_SHIFT;
> + dstV[i] = (rv*r + gv*g + bv*b + (0x10001<<(RGB2YUV_SHIFT-1))) >>
> RGB2YUV_SHIFT;
> + }
> +#endif
> +}
Using #if to compile out the core of the function is not the best approach
here, specifically for platforms without HAVE_FLOAT16.
I would probably try and put the accelerated half2float conversion in
half2float.h and move that header to libavutil instead.
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
>
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [FFmpeg-devel] [PATCH] swscale/input: add rgbaf16 input support
2022-08-08 22:07 ` Mark Reid
@ 2022-08-08 22:37 ` Timo Rothenpieler
2022-08-08 22:59 ` Timo Rothenpieler
0 siblings, 1 reply; 6+ messages in thread
From: Timo Rothenpieler @ 2022-08-08 22:37 UTC (permalink / raw)
To: ffmpeg-devel
On 09.08.2022 00:07, Mark Reid wrote:
> On Mon, Aug 8, 2022 at 1:59 PM Timo Rothenpieler <timo@rothenpieler.org>
> wrote:
>
>> On 08.08.2022 21:39, Mark Reid wrote:
>>> On Mon, Aug 8, 2022 at 11:24 AM Timo Rothenpieler <timo@rothenpieler.org
>>>
>>> wrote:
>>>
>>>> This is by no means perfect, since at least ddagrab will return scRGB
>>>> data with values outside of 0.0f to 1.0f for HDR values.
>>>> Its primary purpose is to be able to work with the format at all.
>>>>
>>>> _Float16 support was available on arm/aarch64 for a while, and with gcc
>>>> 12 was enabled on x86 as long as SSE2 is supported.
>>>>
>>>> If the target arch supports f16c, gcc emits fairly efficient assembly,
>>>> taking advantage of it. This is the case on x86-64-v3 or higher.
>>>> Without f16c, it emulates it in software using sse2 instructions.
>>>> ---
>>>>
>>>> I am by no means certain this is the correct way to implement this.
>>>> Tested it with ddagrab output in that format, and it looks like what I'd
>>>> expect.
>>>>
>>>> Specially the order of arguments is a bit of a mystery. I'd have
>>>> expected them to be in order of the planes, so for packed formats, only
>>>> the first one would matter.
>>>> But a bunch of other packed formats left the first src unused, and so I
>>>> followed along, and it ended up working fine.
>>>>
>>>>
>>> Have you looked at the exr decoder half2float.h? It already has f16 to
>> f32
>>> decoding functions.
>>>
>>
>> For performance, using the compilers native, and potentially hardware
>> accelerated, support is probably preferable.
>> Though as a no-float16-fallback it's probably not too horrible.
>> Just not sure if it's worth the extra effort, given that by the time
>> this sees any use at all, gcc 12 will be very common.
>>
>> Might even think about _Float16 support for exr in that case.
>> Would be an interesting benchmark.
>>
>
> Having the fallback will likely be required to have this patch accepted,
> also this will need fate tests.
>
> +static void rgbaf16ToUV_half_c(uint8_t *_dstU, uint8_t *_dstV,
>> + const uint8_t *unused0, const uint8_t
>> *src1, const uint8_t *src2,
>> + int width, uint32_t *_rgb2yuv)
>> +{
>> +#if HAVE_FLOAT16
>> + const _Float16 *src = (const _Float16*)src1;
>> + uint16_t *dstU = (uint16_t*)_dstU;
>> + uint16_t *dstV = (uint16_t*)_dstV;
>> + int32_t *rgb2yuv = (int32_t*)_rgb2yuv;
>> + int32_t ru = rgb2yuv[RU_IDX], gu = rgb2yuv[GU_IDX], bu =
>> rgb2yuv[BU_IDX];
>> + int32_t rv = rgb2yuv[RV_IDX], gv = rgb2yuv[GV_IDX], bv =
>> rgb2yuv[BV_IDX];
>> + int i;
>> + av_assert1(src1==src2);
>> + for (i = 0; i < width; i++) {
>> + int r = (lrintf(av_clipf(65535.0f * src[i*8+0], 0.0f, 65535.0f)) +
>> + lrintf(av_clipf(65535.0f * src[i*8+4], 0.0f, 65535.0f)))
>>>> 1;
>> + int g = (lrintf(av_clipf(65535.0f * src[i*8+1], 0.0f, 65535.0f)) +
>> + lrintf(av_clipf(65535.0f * src[i*8+5], 0.0f, 65535.0f)))
>>>> 1;
>> + int b = (lrintf(av_clipf(65535.0f * src[i*8+2], 0.0f, 65535.0f)) +
>> + lrintf(av_clipf(65535.0f * src[i*8+6], 0.0f, 65535.0f)))
>>>> 1;
>> +
>> + dstU[i] = (ru*r + gu*g + bu*b + (0x10001<<(RGB2YUV_SHIFT-1))) >>
>> RGB2YUV_SHIFT;
>> + dstV[i] = (rv*r + gv*g + bv*b + (0x10001<<(RGB2YUV_SHIFT-1))) >>
>> RGB2YUV_SHIFT;
>> + }
>> +#endif
>> +}
>
>
> IF defining out the core of the function is not the best approach here,
> specifically for platforms without HAVE_FLOAT16.
> I would probably try and put the accelerated half2float conversion in
> half2float.h and move that header to libavutil instead.
The entire support for the format is removed from swscale in this case,
so the function ending up empty doesn't matter.
I'll see if it can be added to half2float, but I can't even tell if it
implements ieee floats, or something else.
One issue is that SIMD acceleration for half-to-single conversion
operates on either 4 or 8 values in parallel.
That doesn't work with how half2float.h is set up right now. For one,
it's always exactly one value, and then it's also taking in and
returning integers.
Looking at the current two consumers, it might be possible to make them
take advantage of the SIMD version. They seem to operate on blocks of
data most of the time.
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [FFmpeg-devel] [PATCH] swscale/input: add rgbaf16 input support
2022-08-08 22:37 ` Timo Rothenpieler
@ 2022-08-08 22:59 ` Timo Rothenpieler
0 siblings, 0 replies; 6+ messages in thread
From: Timo Rothenpieler @ 2022-08-08 22:59 UTC (permalink / raw)
To: ffmpeg-devel
On 09.08.2022 00:37, Timo Rothenpieler wrote:
> The entire support for the format is removed from swscale in this case,
> so the function ending up empty doesn't matter.
>
> I'll see if it can be added to half2float, but I can't even tell if it
> implements ieee floats, or something else.
Did a very straightforward implementation with unions:
> static uint32_t half2float(uint16_t h, const uint32_t *mantissatable, const uint32_t *exponenttable,
> const uint16_t *offsettable)
> {
> #if HAVE_FLOAT16
> union {
> uint16_t i;
> _Float16 f;
> } u16;
> union {
> uint32_t i;
> float f;
> } u32;
> u16.i = h;
> u32.f = u16.f;
> return u32.i;
> #else
> uint32_t f;
>
> f = mantissatable[offsettable[h >> 10] + (h & 0x3ff)] + exponenttable[h >> 10];
>
> return f;
> #endif
> }
Unfortunately, this makes all exr fate tests fail with differing output
checksums.
At least the checksums match between f16c SIMD version and fallback sse2
implementation.
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 6+ messages in thread
end of thread, other threads:[~2022-08-08 22:59 UTC | newest]
Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-08-08 18:23 [FFmpeg-devel] [PATCH] swscale/input: add rgbaf16 input support Timo Rothenpieler
2022-08-08 19:39 ` Mark Reid
2022-08-08 20:59 ` Timo Rothenpieler
2022-08-08 22:07 ` Mark Reid
2022-08-08 22:37 ` Timo Rothenpieler
2022-08-08 22:59 ` Timo Rothenpieler
Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
This inbox may be cloned and mirrored by anyone:
git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git
# If you have public-inbox 1.1+ installed, you may
# initialize and index your mirror using the following commands:
public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
ffmpegdev@gitmailbox.com
public-inbox-index ffmpegdev
Example config snippet for mirrors.
AGPL code for this site: git clone https://public-inbox.org/public-inbox.git