Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
 help / color / mirror / Atom feed
* [FFmpeg-devel] [PATCH 01/11] lavu/pixfmt: add packed RGBA float16 format
@ 2022-08-10 20:47 Timo Rothenpieler
  2022-08-10 20:47 ` [FFmpeg-devel] [PATCH 02/11] avutil/hwcontext_d3d11va: add support for rgbaf16 pixel format Timo Rothenpieler
                   ` (9 more replies)
  0 siblings, 10 replies; 39+ messages in thread
From: Timo Rothenpieler @ 2022-08-10 20:47 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: Timo Rothenpieler

This is the default format of the Windows compositor and what DXGI
Desktop Duplication will give you for any kind of HDR output.
---
 libavutil/pixdesc.c              | 28 ++++++++++++++++++++++++++++
 libavutil/pixfmt.h               |  5 +++++
 libavutil/version.h              |  4 ++--
 tests/ref/fate/imgutils          |  2 ++
 tests/ref/fate/sws-pixdesc-query | 13 +++++++++++++
 5 files changed, 50 insertions(+), 2 deletions(-)

diff --git a/libavutil/pixdesc.c b/libavutil/pixdesc.c
index e078fd5320..f7558ff8b9 100644
--- a/libavutil/pixdesc.c
+++ b/libavutil/pixdesc.c
@@ -2504,6 +2504,34 @@ static const AVPixFmtDescriptor av_pix_fmt_descriptors[AV_PIX_FMT_NB] = {
         },
         .flags = AV_PIX_FMT_FLAG_ALPHA,
     },
+    [AV_PIX_FMT_RGBAF16BE] = {
+        .name = "rgbaf16be",
+        .nb_components = 4,
+        .log2_chroma_w = 0,
+        .log2_chroma_h = 0,
+        .comp = {
+            { 0, 8, 0, 0, 16 },       /* R */
+            { 0, 8, 2, 0, 16 },       /* G */
+            { 0, 8, 4, 0, 16 },       /* B */
+            { 0, 8, 6, 0, 16 },       /* A */
+        },
+        .flags = AV_PIX_FMT_FLAG_BE | AV_PIX_FMT_FLAG_RGB |
+                 AV_PIX_FMT_FLAG_ALPHA | AV_PIX_FMT_FLAG_FLOAT,
+    },
+    [AV_PIX_FMT_RGBAF16LE] = {
+        .name = "rgbaf16le",
+        .nb_components = 4,
+        .log2_chroma_w = 0,
+        .log2_chroma_h = 0,
+        .comp = {
+            { 0, 8, 0, 0, 16 },       /* R */
+            { 0, 8, 2, 0, 16 },       /* G */
+            { 0, 8, 4, 0, 16 },       /* B */
+            { 0, 8, 6, 0, 16 },       /* A */
+        },
+        .flags = AV_PIX_FMT_FLAG_RGB | AV_PIX_FMT_FLAG_ALPHA |
+                 AV_PIX_FMT_FLAG_FLOAT,
+    },
 };
 
 static const char * const color_range_names[] = {
diff --git a/libavutil/pixfmt.h b/libavutil/pixfmt.h
index 9d1fdaf82d..86c9bdefeb 100644
--- a/libavutil/pixfmt.h
+++ b/libavutil/pixfmt.h
@@ -369,6 +369,9 @@ enum AVPixelFormat {
 
     AV_PIX_FMT_VUYA,        ///< packed VUYA 4:4:4, 32bpp, VUYAVUYA...
 
+    AV_PIX_FMT_RGBAF16BE,   ///< IEEE-754 half precision packed RGBA 16:16:16:16, 64bpp, RGBARGBA..., big-endian
+    AV_PIX_FMT_RGBAF16LE,   ///< IEEE-754 half precision packed RGBA 16:16:16:16, 64bpp, RGBARGBA..., little-endian
+
     AV_PIX_FMT_NB         ///< number of pixel formats, DO NOT USE THIS if you want to link with shared libav* because the number of formats might differ between versions
 };
 
@@ -466,6 +469,8 @@ enum AVPixelFormat {
 #define AV_PIX_FMT_P216       AV_PIX_FMT_NE(P216BE, P216LE)
 #define AV_PIX_FMT_P416       AV_PIX_FMT_NE(P416BE, P416LE)
 
+#define AV_PIX_FMT_RGBAF16    AV_PIX_FMT_NE(RGBAF16BE, RGBAF16LE)
+
 /**
   * Chromaticity coordinates of the source primaries.
   * These values match the ones defined by ISO/IEC 23091-2_2019 subclause 8.1 and ITU-T H.273.
diff --git a/libavutil/version.h b/libavutil/version.h
index ee43526dc6..f0a8b5c098 100644
--- a/libavutil/version.h
+++ b/libavutil/version.h
@@ -79,8 +79,8 @@
  */
 
 #define LIBAVUTIL_VERSION_MAJOR  57
-#define LIBAVUTIL_VERSION_MINOR  32
-#define LIBAVUTIL_VERSION_MICRO 101
+#define LIBAVUTIL_VERSION_MINOR  33
+#define LIBAVUTIL_VERSION_MICRO 100
 
 #define LIBAVUTIL_VERSION_INT   AV_VERSION_INT(LIBAVUTIL_VERSION_MAJOR, \
                                                LIBAVUTIL_VERSION_MINOR, \
diff --git a/tests/ref/fate/imgutils b/tests/ref/fate/imgutils
index 4ec66febb8..01c9877de5 100644
--- a/tests/ref/fate/imgutils
+++ b/tests/ref/fate/imgutils
@@ -247,3 +247,5 @@ p216le          planes: 2, linesizes: 128 128   0   0, plane_sizes:  6144  6144
 p416be          planes: 2, linesizes: 128 256   0   0, plane_sizes:  6144 12288     0     0, plane_offsets:  6144     0     0, total_size: 18432
 p416le          planes: 2, linesizes: 128 256   0   0, plane_sizes:  6144 12288     0     0, plane_offsets:  6144     0     0, total_size: 18432
 vuya            planes: 1, linesizes: 256   0   0   0, plane_sizes: 12288     0     0     0, plane_offsets:     0     0     0, total_size: 12288
+rgbaf16be       planes: 1, linesizes: 512   0   0   0, plane_sizes: 24576     0     0     0, plane_offsets:     0     0     0, total_size: 24576
+rgbaf16le       planes: 1, linesizes: 512   0   0   0, plane_sizes: 24576     0     0     0, plane_offsets:     0     0     0, total_size: 24576
diff --git a/tests/ref/fate/sws-pixdesc-query b/tests/ref/fate/sws-pixdesc-query
index bd0f1fcb82..f79d99e513 100644
--- a/tests/ref/fate/sws-pixdesc-query
+++ b/tests/ref/fate/sws-pixdesc-query
@@ -21,6 +21,8 @@ is16BPS:
   rgb48le
   rgba64be
   rgba64le
+  rgbaf16be
+  rgbaf16le
   ya16be
   ya16le
   yuv420p16be
@@ -157,6 +159,7 @@ isBE:
   rgb555be
   rgb565be
   rgba64be
+  rgbaf16be
   x2bgr10be
   x2rgb10be
   xyz12be
@@ -479,6 +482,8 @@ isRGB:
   rgb8
   rgba64be
   rgba64le
+  rgbaf16be
+  rgbaf16le
   x2bgr10be
   x2bgr10le
   x2rgb10be
@@ -629,6 +634,8 @@ AnyRGB:
   rgb8
   rgba64be
   rgba64le
+  rgbaf16be
+  rgbaf16le
   x2bgr10be
   x2bgr10le
   x2rgb10be
@@ -655,6 +662,8 @@ ALPHA:
   rgb32_1
   rgba64be
   rgba64le
+  rgbaf16be
+  rgbaf16le
   vuya
   ya16be
   ya16le
@@ -739,6 +748,8 @@ Packed:
   rgb8
   rgba64be
   rgba64le
+  rgbaf16be
+  rgbaf16le
   uyvy422
   uyyvyy411
   vuya
@@ -918,6 +929,8 @@ PackedRGB:
   rgb8
   rgba64be
   rgba64le
+  rgbaf16be
+  rgbaf16le
   x2bgr10be
   x2bgr10le
   x2rgb10be
-- 
2.34.1

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 39+ messages in thread

* [FFmpeg-devel] [PATCH 02/11] avutil/hwcontext_d3d11va: add support for rgbaf16 pixel format
  2022-08-10 20:47 [FFmpeg-devel] [PATCH 01/11] lavu/pixfmt: add packed RGBA float16 format Timo Rothenpieler
@ 2022-08-10 20:47 ` Timo Rothenpieler
  2022-08-10 20:47 ` [FFmpeg-devel] [PATCH 03/11] avfilter/vsrc_ddagrab: add rgbaf16 output support Timo Rothenpieler
                   ` (8 subsequent siblings)
  9 siblings, 0 replies; 39+ messages in thread
From: Timo Rothenpieler @ 2022-08-10 20:47 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: Timo Rothenpieler

---
 libavutil/hwcontext_d3d11va.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/libavutil/hwcontext_d3d11va.c b/libavutil/hwcontext_d3d11va.c
index 27c0c80413..363ec6a47d 100644
--- a/libavutil/hwcontext_d3d11va.c
+++ b/libavutil/hwcontext_d3d11va.c
@@ -88,6 +88,7 @@ static const struct {
     { DXGI_FORMAT_P010,         AV_PIX_FMT_P010 },
     { DXGI_FORMAT_B8G8R8A8_UNORM,    AV_PIX_FMT_BGRA },
     { DXGI_FORMAT_R10G10B10A2_UNORM, AV_PIX_FMT_X2BGR10 },
+    { DXGI_FORMAT_R16G16B16A16_FLOAT, AV_PIX_FMT_RGBAF16 },
     // Special opaque formats. The pix_fmt is merely a place holder, as the
     // opaque format cannot be accessed directly.
     { DXGI_FORMAT_420_OPAQUE,   AV_PIX_FMT_YUV420P },
-- 
2.34.1

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 39+ messages in thread

* [FFmpeg-devel] [PATCH 03/11] avfilter/vsrc_ddagrab: add rgbaf16 output support
  2022-08-10 20:47 [FFmpeg-devel] [PATCH 01/11] lavu/pixfmt: add packed RGBA float16 format Timo Rothenpieler
  2022-08-10 20:47 ` [FFmpeg-devel] [PATCH 02/11] avutil/hwcontext_d3d11va: add support for rgbaf16 pixel format Timo Rothenpieler
@ 2022-08-10 20:47 ` Timo Rothenpieler
  2022-08-10 20:47 ` [FFmpeg-devel] [PATCH 04/11] avfilter/vsrc_ddagrab: add options for more control over output format fallback Timo Rothenpieler
                   ` (7 subsequent siblings)
  9 siblings, 0 replies; 39+ messages in thread
From: Timo Rothenpieler @ 2022-08-10 20:47 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: Timo Rothenpieler

---
 libavfilter/version.h      |  2 +-
 libavfilter/vsrc_ddagrab.c | 13 +++++++++++++
 2 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/libavfilter/version.h b/libavfilter/version.h
index 19a009c110..fa67606495 100644
--- a/libavfilter/version.h
+++ b/libavfilter/version.h
@@ -32,7 +32,7 @@
 #include "version_major.h"
 
 #define LIBAVFILTER_VERSION_MINOR  46
-#define LIBAVFILTER_VERSION_MICRO 101
+#define LIBAVFILTER_VERSION_MICRO 102
 
 
 #define LIBAVFILTER_VERSION_INT AV_VERSION_INT(LIBAVFILTER_VERSION_MAJOR, \
diff --git a/libavfilter/vsrc_ddagrab.c b/libavfilter/vsrc_ddagrab.c
index ce36716281..252505b96d 100644
--- a/libavfilter/vsrc_ddagrab.c
+++ b/libavfilter/vsrc_ddagrab.c
@@ -115,6 +115,8 @@ static const AVOption ddagrab_options[] = {
     { "bgra",       "only output 8 Bit BGRA",            0,            AV_OPT_TYPE_CONST,      { .i64 = DXGI_FORMAT_B8G8R8A8_UNORM },    0, INT_MAX, FLAGS, "output_fmt" },
     { "10bit",      "only output default 10 Bit format", 0,            AV_OPT_TYPE_CONST,      { .i64 = DXGI_FORMAT_R10G10B10A2_UNORM }, 0, INT_MAX, FLAGS, "output_fmt" },
     { "x2bgr10",    "only output 10 Bit X2BGR10",        0,            AV_OPT_TYPE_CONST,      { .i64 = DXGI_FORMAT_R10G10B10A2_UNORM }, 0, INT_MAX, FLAGS, "output_fmt" },
+    { "16bit",      "only output default 16 Bit format", 0,            AV_OPT_TYPE_CONST,      { .i64 = DXGI_FORMAT_R16G16B16A16_FLOAT },0, INT_MAX, FLAGS, "output_fmt" },
+    { "rgbaf16",    "only output 16 Bit RGBAF16",        0,            AV_OPT_TYPE_CONST,      { .i64 = DXGI_FORMAT_R16G16B16A16_FLOAT },0, INT_MAX, FLAGS, "output_fmt" },
     { NULL }
 };
 
@@ -212,6 +214,7 @@ static av_cold int init_dxgi_dda(AVFilterContext *avctx)
     if (set_thread_dpi && SUCCEEDED(hr)) {
         DPI_AWARENESS_CONTEXT prev_dpi_ctx;
         DXGI_FORMAT formats[] = {
+            DXGI_FORMAT_R16G16B16A16_FLOAT,
             DXGI_FORMAT_R10G10B10A2_UNORM,
             DXGI_FORMAT_B8G8R8A8_UNORM
         };
@@ -665,6 +668,10 @@ static av_cold int init_hwframes_ctx(AVFilterContext *avctx)
         av_log(avctx, AV_LOG_VERBOSE, "Probed 10 bit RGB frame format\n");
         dda->frames_ctx->sw_format = AV_PIX_FMT_X2BGR10;
         break;
+    case DXGI_FORMAT_R16G16B16A16_FLOAT:
+        av_log(avctx, AV_LOG_VERBOSE, "Probed 16 bit float RGB frame format\n");
+        dda->frames_ctx->sw_format = AV_PIX_FMT_RGBAF16;
+        break;
     default:
         av_log(avctx, AV_LOG_ERROR, "Unexpected texture output format!\n");
         return AVERROR_BUG;
@@ -990,6 +997,12 @@ static int ddagrab_request_frame(AVFilterLink *outlink)
         frame->color_primaries = AVCOL_PRI_BT709;
         frame->color_trc       = AVCOL_TRC_IEC61966_2_1;
         frame->colorspace      = AVCOL_SPC_RGB;
+    } else if(desc.Format == DXGI_FORMAT_R16G16B16A16_FLOAT) {
+        // According to MSDN, all floating point formats contain sRGB image data with linear 1.0 gamma.
+        frame->color_range     = AVCOL_RANGE_JPEG;
+        frame->color_primaries = AVCOL_PRI_BT709;
+        frame->color_trc       = AVCOL_TRC_LINEAR;
+        frame->colorspace      = AVCOL_SPC_RGB;
     } else {
         ret = AVERROR_BUG;
         goto fail;
-- 
2.34.1

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 39+ messages in thread

* [FFmpeg-devel] [PATCH 04/11] avfilter/vsrc_ddagrab: add options for more control over output format fallback
  2022-08-10 20:47 [FFmpeg-devel] [PATCH 01/11] lavu/pixfmt: add packed RGBA float16 format Timo Rothenpieler
  2022-08-10 20:47 ` [FFmpeg-devel] [PATCH 02/11] avutil/hwcontext_d3d11va: add support for rgbaf16 pixel format Timo Rothenpieler
  2022-08-10 20:47 ` [FFmpeg-devel] [PATCH 03/11] avfilter/vsrc_ddagrab: add rgbaf16 output support Timo Rothenpieler
@ 2022-08-10 20:47 ` Timo Rothenpieler
  2022-08-10 20:47 ` [FFmpeg-devel] [PATCH 05/11] avutil: move half-precision float helper to avutil Timo Rothenpieler
                   ` (6 subsequent siblings)
  9 siblings, 0 replies; 39+ messages in thread
From: Timo Rothenpieler @ 2022-08-10 20:47 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: Timo Rothenpieler

---
 libavfilter/vsrc_ddagrab.c | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/libavfilter/vsrc_ddagrab.c b/libavfilter/vsrc_ddagrab.c
index 252505b96d..00c72187ea 100644
--- a/libavfilter/vsrc_ddagrab.c
+++ b/libavfilter/vsrc_ddagrab.c
@@ -98,6 +98,8 @@ typedef struct DdagrabContext {
     int        offset_x;
     int        offset_y;
     int        out_fmt;
+    int        allow_fallback;
+    int        force_fmt;
 } DdagrabContext;
 
 #define OFFSET(x) offsetof(DdagrabContext, x)
@@ -117,6 +119,10 @@ static const AVOption ddagrab_options[] = {
     { "x2bgr10",    "only output 10 Bit X2BGR10",        0,            AV_OPT_TYPE_CONST,      { .i64 = DXGI_FORMAT_R10G10B10A2_UNORM }, 0, INT_MAX, FLAGS, "output_fmt" },
     { "16bit",      "only output default 16 Bit format", 0,            AV_OPT_TYPE_CONST,      { .i64 = DXGI_FORMAT_R16G16B16A16_FLOAT },0, INT_MAX, FLAGS, "output_fmt" },
     { "rgbaf16",    "only output 16 Bit RGBAF16",        0,            AV_OPT_TYPE_CONST,      { .i64 = DXGI_FORMAT_R16G16B16A16_FLOAT },0, INT_MAX, FLAGS, "output_fmt" },
+    { "allow_fallback", "don't error on fallback to default 8 Bit format",
+                                                   OFFSET(allow_fallback), AV_OPT_TYPE_BOOL,   { .i64 = 0    },       0,       1, FLAGS },
+    { "force_fmt",  "exclude BGRA from format list (experimental, discouraged by Microsoft)",
+                                                   OFFSET(force_fmt),  AV_OPT_TYPE_BOOL,       { .i64 = 0    },       0,       1, FLAGS },
     { NULL }
 };
 
@@ -226,7 +232,7 @@ static av_cold int init_dxgi_dda(AVFilterContext *avctx)
         } else if (dda->out_fmt) {
             formats[0] = dda->out_fmt;
             formats[1] = DXGI_FORMAT_B8G8R8A8_UNORM;
-            nb_formats = 2;
+            nb_formats = dda->force_fmt ? 1 : 2;
         }
 
         IDXGIOutput_Release(dxgi_output);
@@ -262,7 +268,7 @@ static av_cold int init_dxgi_dda(AVFilterContext *avctx)
 #else
     {
 #endif
-        if (dda->out_fmt && dda->out_fmt != DXGI_FORMAT_B8G8R8A8_UNORM) {
+        if (dda->out_fmt && dda->out_fmt != DXGI_FORMAT_B8G8R8A8_UNORM && (!dda->allow_fallback || dda->force_fmt)) {
             av_log(avctx, AV_LOG_ERROR, "Only 8 bit output supported with legacy API\n");
             return AVERROR(ENOTSUP);
         }
@@ -733,7 +739,7 @@ static int ddagrab_config_props(AVFilterLink *outlink)
     if (ret < 0)
         return ret;
 
-    if (dda->out_fmt && dda->raw_format != dda->out_fmt) {
+    if (dda->out_fmt && dda->raw_format != dda->out_fmt && (!dda->allow_fallback || dda->force_fmt)) {
         av_log(avctx, AV_LOG_ERROR, "Requested output format unavailable.\n");
         return AVERROR(ENOTSUP);
     }
-- 
2.34.1

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 39+ messages in thread

* [FFmpeg-devel] [PATCH 05/11] avutil: move half-precision float helper to avutil
  2022-08-10 20:47 [FFmpeg-devel] [PATCH 01/11] lavu/pixfmt: add packed RGBA float16 format Timo Rothenpieler
                   ` (2 preceding siblings ...)
  2022-08-10 20:47 ` [FFmpeg-devel] [PATCH 04/11] avfilter/vsrc_ddagrab: add options for more control over output format fallback Timo Rothenpieler
@ 2022-08-10 20:47 ` Timo Rothenpieler
  2022-08-10 20:47 ` [FFmpeg-devel] [PATCH 06/11] avutil/half2float: adjust conversion of NaN Timo Rothenpieler
                   ` (5 subsequent siblings)
  9 siblings, 0 replies; 39+ messages in thread
From: Timo Rothenpieler @ 2022-08-10 20:47 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: Timo Rothenpieler

---
 libavcodec/exr.c                       | 2 +-
 libavcodec/exrenc.c                    | 2 +-
 libavcodec/pnmdec.c                    | 3 ++-
 libavcodec/pnmenc.c                    | 2 +-
 {libavcodec => libavutil}/float2half.h | 6 +++---
 {libavcodec => libavutil}/half2float.h | 6 +++---
 6 files changed, 11 insertions(+), 10 deletions(-)
 rename {libavcodec => libavutil}/float2half.h (96%)
 rename {libavcodec => libavutil}/half2float.h (96%)

diff --git a/libavcodec/exr.c b/libavcodec/exr.c
index 3a6b9c3014..5c6ca9adbf 100644
--- a/libavcodec/exr.c
+++ b/libavcodec/exr.c
@@ -41,6 +41,7 @@
 #include "libavutil/avstring.h"
 #include "libavutil/opt.h"
 #include "libavutil/color_utils.h"
+#include "libavutil/half2float.h"
 
 #include "avcodec.h"
 #include "bytestream.h"
@@ -53,7 +54,6 @@
 #include "exrdsp.h"
 #include "get_bits.h"
 #include "internal.h"
-#include "half2float.h"
 #include "mathops.h"
 #include "thread.h"
 
diff --git a/libavcodec/exrenc.c b/libavcodec/exrenc.c
index 8cf7827bb6..56c084d483 100644
--- a/libavcodec/exrenc.c
+++ b/libavcodec/exrenc.c
@@ -31,11 +31,11 @@
 #include "libavutil/intreadwrite.h"
 #include "libavutil/imgutils.h"
 #include "libavutil/pixdesc.h"
+#include "libavutil/float2half.h"
 #include "avcodec.h"
 #include "bytestream.h"
 #include "codec_internal.h"
 #include "encode.h"
-#include "float2half.h"
 
 enum ExrCompr {
     EXR_RAW,
diff --git a/libavcodec/pnmdec.c b/libavcodec/pnmdec.c
index 130407df25..9383dc8e60 100644
--- a/libavcodec/pnmdec.c
+++ b/libavcodec/pnmdec.c
@@ -21,12 +21,13 @@
 
 #include "config_components.h"
 
+#include "libavutil/half2float.h"
+
 #include "avcodec.h"
 #include "codec_internal.h"
 #include "internal.h"
 #include "put_bits.h"
 #include "pnm.h"
-#include "half2float.h"
 
 static void samplecpy(uint8_t *dst, const uint8_t *src, int n, int maxval)
 {
diff --git a/libavcodec/pnmenc.c b/libavcodec/pnmenc.c
index b16c93c88f..7ce534d06e 100644
--- a/libavcodec/pnmenc.c
+++ b/libavcodec/pnmenc.c
@@ -24,10 +24,10 @@
 #include "libavutil/intreadwrite.h"
 #include "libavutil/imgutils.h"
 #include "libavutil/pixdesc.h"
+#include "libavutil/float2half.h"
 #include "avcodec.h"
 #include "codec_internal.h"
 #include "encode.h"
-#include "float2half.h"
 
 typedef struct PHMEncContext {
     uint16_t basetable[512];
diff --git a/libavcodec/float2half.h b/libavutil/float2half.h
similarity index 96%
rename from libavcodec/float2half.h
rename to libavutil/float2half.h
index e05125088c..d6aaab8278 100644
--- a/libavcodec/float2half.h
+++ b/libavutil/float2half.h
@@ -16,8 +16,8 @@
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
-#ifndef AVCODEC_FLOAT2HALF_H
-#define AVCODEC_FLOAT2HALF_H
+#ifndef AVUTIL_FLOAT2HALF_H
+#define AVUTIL_FLOAT2HALF_H
 
 #include <stdint.h>
 
@@ -64,4 +64,4 @@ static uint16_t float2half(uint32_t f, uint16_t *basetable, uint8_t *shifttable)
     return h;
 }
 
-#endif /* AVCODEC_FLOAT2HALF_H */
+#endif /* AVUTIL_FLOAT2HALF_H */
diff --git a/libavcodec/half2float.h b/libavutil/half2float.h
similarity index 96%
rename from libavcodec/half2float.h
rename to libavutil/half2float.h
index 7df6747e50..1f6deade07 100644
--- a/libavcodec/half2float.h
+++ b/libavutil/half2float.h
@@ -16,8 +16,8 @@
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
-#ifndef AVCODEC_HALF2FLOAT_H
-#define AVCODEC_HALF2FLOAT_H
+#ifndef AVUTIL_HALF2FLOAT_H
+#define AVUTIL_HALF2FLOAT_H
 
 #include <stdint.h>
 
@@ -71,4 +71,4 @@ static uint32_t half2float(uint16_t h, const uint32_t *mantissatable, const uint
     return f;
 }
 
-#endif /* AVCODEC_HALF2FLOAT_H */
+#endif /* AVUTIL_HALF2FLOAT_H */
-- 
2.34.1

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 39+ messages in thread

* [FFmpeg-devel] [PATCH 06/11] avutil/half2float: adjust conversion of NaN
  2022-08-10 20:47 [FFmpeg-devel] [PATCH 01/11] lavu/pixfmt: add packed RGBA float16 format Timo Rothenpieler
                   ` (3 preceding siblings ...)
  2022-08-10 20:47 ` [FFmpeg-devel] [PATCH 05/11] avutil: move half-precision float helper to avutil Timo Rothenpieler
@ 2022-08-10 20:47 ` Timo Rothenpieler
  2022-08-10 21:24   ` Andreas Rheinhardt
  2022-08-10 20:47 ` [FFmpeg-devel] [PATCH 07/11] avutil/half2float: move tables to header-internal structs Timo Rothenpieler
                   ` (4 subsequent siblings)
  9 siblings, 1 reply; 39+ messages in thread
From: Timo Rothenpieler @ 2022-08-10 20:47 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: Timo Rothenpieler

IEEE-754 differentiates two different kinds of NaNs:
quiet and signaling ones. They are differentiated by the MSB of the
mantissa.

For whatever reason, actual hardware conversion of half to single always
sets that mantissa MSB to 1 if the mantissa is != 0, and to 0 if it's 0.
So our code has to follow suit or fate-testing hardware float16 will be
impossible.
---
 libavcodec/exr.c                                    | 2 +-
 libavcodec/pnm.h                                    | 2 +-
 libavutil/half2float.h                              | 5 +++++
 tests/ref/fate/exr-rgb-scanline-zip-half-0x0-0xFFFF | 2 +-
 4 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/libavcodec/exr.c b/libavcodec/exr.c
index 5c6ca9adbf..47f4786491 100644
--- a/libavcodec/exr.c
+++ b/libavcodec/exr.c
@@ -191,7 +191,7 @@ typedef struct EXRContext {
     float gamma;
     union av_intfloat32 gamma_table[65536];
 
-    uint32_t mantissatable[2048];
+    uint32_t mantissatable[3072];
     uint32_t exponenttable[64];
     uint16_t offsettable[64];
 } EXRContext;
diff --git a/libavcodec/pnm.h b/libavcodec/pnm.h
index 5bf2eaa4d9..7e5445f529 100644
--- a/libavcodec/pnm.h
+++ b/libavcodec/pnm.h
@@ -34,7 +34,7 @@ typedef struct PNMContext {
     int half;
     float scale;
 
-    uint32_t mantissatable[2048];
+    uint32_t mantissatable[3072];
     uint32_t exponenttable[64];
     uint16_t offsettable[64];
 } PNMContext;
diff --git a/libavutil/half2float.h b/libavutil/half2float.h
index 1f6deade07..5af4690cfe 100644
--- a/libavutil/half2float.h
+++ b/libavutil/half2float.h
@@ -45,6 +45,9 @@ static void half2float_table(uint32_t *mantissatable, uint32_t *exponenttable,
         mantissatable[i] = convertmantissa(i);
     for (int i = 1024; i < 2048; i++)
         mantissatable[i] = 0x38000000UL + ((i - 1024) << 13UL);
+    for (int i = 2048; i < 3072; i++)
+        mantissatable[i] = mantissatable[i - 1024] | 0x400000UL;
+    mantissatable[2048] = mantissatable[1024];
 
     exponenttable[0] = 0;
     for (int i = 1; i < 31; i++)
@@ -58,7 +61,9 @@ static void half2float_table(uint32_t *mantissatable, uint32_t *exponenttable,
     offsettable[0] = 0;
     for (int i = 1; i < 64; i++)
         offsettable[i] = 1024;
+    offsettable[31] = 2048;
     offsettable[32] = 0;
+    offsettable[63] = 2048;
 }
 
 static uint32_t half2float(uint16_t h, const uint32_t *mantissatable, const uint32_t *exponenttable,
diff --git a/tests/ref/fate/exr-rgb-scanline-zip-half-0x0-0xFFFF b/tests/ref/fate/exr-rgb-scanline-zip-half-0x0-0xFFFF
index b6201116fe..e45a40b498 100644
--- a/tests/ref/fate/exr-rgb-scanline-zip-half-0x0-0xFFFF
+++ b/tests/ref/fate/exr-rgb-scanline-zip-half-0x0-0xFFFF
@@ -3,4 +3,4 @@
 #codec_id 0: rawvideo
 #dimensions 0: 256x256
 #sar 0: 1/1
-0,          0,          0,        1,   786432, 0x1445e411
+0,          0,          0,        1,   786432, 0xce9be2be
-- 
2.34.1

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 39+ messages in thread

* [FFmpeg-devel] [PATCH 07/11] avutil/half2float: move tables to header-internal structs
  2022-08-10 20:47 [FFmpeg-devel] [PATCH 01/11] lavu/pixfmt: add packed RGBA float16 format Timo Rothenpieler
                   ` (4 preceding siblings ...)
  2022-08-10 20:47 ` [FFmpeg-devel] [PATCH 06/11] avutil/half2float: adjust conversion of NaN Timo Rothenpieler
@ 2022-08-10 20:47 ` Timo Rothenpieler
  2022-08-10 20:47 ` [FFmpeg-devel] [PATCH 08/11] avutil/half2float: move non-inline init code out of header Timo Rothenpieler
                   ` (3 subsequent siblings)
  9 siblings, 0 replies; 39+ messages in thread
From: Timo Rothenpieler @ 2022-08-10 20:47 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: Timo Rothenpieler

Having to put the knowledge of the size of those arrays into a multitude
of places is rather smelly.
---
 libavcodec/exr.c       | 27 ++++++++--------------
 libavcodec/exrenc.c    | 11 +++++----
 libavcodec/pnm.h       |  5 ++---
 libavcodec/pnmdec.c    | 42 ++++++++--------------------------
 libavcodec/pnmenc.c    | 13 +++++------
 libavutil/float2half.h | 51 +++++++++++++++++++++++-------------------
 libavutil/half2float.h | 46 ++++++++++++++++++++-----------------
 7 files changed, 84 insertions(+), 111 deletions(-)

diff --git a/libavcodec/exr.c b/libavcodec/exr.c
index 47f4786491..825354873d 100644
--- a/libavcodec/exr.c
+++ b/libavcodec/exr.c
@@ -191,9 +191,7 @@ typedef struct EXRContext {
     float gamma;
     union av_intfloat32 gamma_table[65536];
 
-    uint32_t mantissatable[3072];
-    uint32_t exponenttable[64];
-    uint16_t offsettable[64];
+    half2float_tables h2f_tables;
 } EXRContext;
 
 static int zip_uncompress(const EXRContext *s, const uint8_t *src, int compressed_size,
@@ -899,10 +897,7 @@ static int ac_uncompress(const EXRContext *s, GetByteContext *gb, float *block)
             n += val & 0xff;
         } else {
             ret = n;
-            block[ff_zigzag_direct[n]] = av_int2float(half2float(val,
-                                                      s->mantissatable,
-                                                      s->exponenttable,
-                                                      s->offsettable));
+            block[ff_zigzag_direct[n]] = av_int2float(half2float(val, &s->h2f_tables));
             n++;
         }
     }
@@ -1120,8 +1115,7 @@ static int dwa_uncompress(const EXRContext *s, const uint8_t *src, int compresse
                 uint16_t *dc = (uint16_t *)td->dc_data;
                 union av_intfloat32 dc_val;
 
-                dc_val.i = half2float(dc[idx], s->mantissatable,
-                                      s->exponenttable, s->offsettable);
+                dc_val.i = half2float(dc[idx], &s->h2f_tables);
 
                 block[0] = dc_val.f;
                 ac_uncompress(s, &agb, block);
@@ -1171,7 +1165,7 @@ static int dwa_uncompress(const EXRContext *s, const uint8_t *src, int compresse
         for (int x = 0; x < td->xsize; x++) {
             uint16_t ha = ai0[x] | (ai1[x] << 8);
 
-            ao[x] = half2float(ha, s->mantissatable, s->exponenttable, s->offsettable);
+            ao[x] = half2float(ha, &s->h2f_tables);
         }
     }
 
@@ -1427,10 +1421,7 @@ static int decode_block(AVCodecContext *avctx, void *tdata,
                         }
                     } else {
                         for (x = 0; x < xsize; x++) {
-                            ptr_x[0].i = half2float(bytestream_get_le16(&src),
-                                                    s->mantissatable,
-                                                    s->exponenttable,
-                                                    s->offsettable);
+                            ptr_x[0].i = half2float(bytestream_get_le16(&src), &s->h2f_tables);
                             ptr_x++;
                         }
                     }
@@ -2217,7 +2208,7 @@ static av_cold int decode_init(AVCodecContext *avctx)
     float one_gamma = 1.0f / s->gamma;
     avpriv_trc_function trc_func = NULL;
 
-    half2float_table(s->mantissatable, s->exponenttable, s->offsettable);
+    init_half2float_tables(&s->h2f_tables);
 
     s->avctx              = avctx;
 
@@ -2230,18 +2221,18 @@ static av_cold int decode_init(AVCodecContext *avctx)
     trc_func = avpriv_get_trc_function_from_trc(s->apply_trc_type);
     if (trc_func) {
         for (i = 0; i < 65536; ++i) {
-            t.i = half2float(i, s->mantissatable, s->exponenttable, s->offsettable);
+            t.i = half2float(i, &s->h2f_tables);
             t.f = trc_func(t.f);
             s->gamma_table[i] = t;
         }
     } else {
         if (one_gamma > 0.9999f && one_gamma < 1.0001f) {
             for (i = 0; i < 65536; ++i) {
-                s->gamma_table[i].i = half2float(i, s->mantissatable, s->exponenttable, s->offsettable);
+                s->gamma_table[i].i = half2float(i, &s->h2f_tables);
             }
         } else {
             for (i = 0; i < 65536; ++i) {
-                t.i = half2float(i, s->mantissatable, s->exponenttable, s->offsettable);
+                t.i = half2float(i, &s->h2f_tables);
                 /* If negative value we reuse half value */
                 if (t.f <= 0.0f) {
                     s->gamma_table[i] = t;
diff --git a/libavcodec/exrenc.c b/libavcodec/exrenc.c
index 56c084d483..6ab9400b7c 100644
--- a/libavcodec/exrenc.c
+++ b/libavcodec/exrenc.c
@@ -87,15 +87,14 @@ typedef struct EXRContext {
 
     EXRScanlineData *scanline;
 
-    uint16_t basetable[512];
-    uint8_t shifttable[512];
+    float2half_tables f2h_tables;
 } EXRContext;
 
 static av_cold int encode_init(AVCodecContext *avctx)
 {
     EXRContext *s = avctx->priv_data;
 
-    float2half_tables(s->basetable, s->shifttable);
+    init_float2half_tables(&s->f2h_tables);
 
     switch (avctx->pix_fmt) {
     case AV_PIX_FMT_GBRPF32:
@@ -256,7 +255,7 @@ static int encode_scanline_rle(EXRContext *s, const AVFrame *frame)
                 const uint32_t *src = (const uint32_t *)(frame->data[ch] + y * frame->linesize[ch]);
 
                 for (int x = 0; x < frame->width; x++)
-                    dst[x] = float2half(src[x], s->basetable, s->shifttable);
+                    dst[x] = float2half(src[x], &s->f2h_tables);
             }
             break;
         }
@@ -324,7 +323,7 @@ static int encode_scanline_zip(EXRContext *s, const AVFrame *frame)
                     const uint32_t *src = (const uint32_t *)(frame->data[ch] + (y * s->scanline_height + l) * frame->linesize[ch]);
 
                     for (int x = 0; x < frame->width; x++)
-                        dst[x] = float2half(src[x], s->basetable, s->shifttable);
+                        dst[x] = float2half(src[x], &s->f2h_tables);
                 }
             }
             break;
@@ -482,7 +481,7 @@ static int encode_frame(AVCodecContext *avctx, AVPacket *pkt,
                     const uint32_t *src = (const uint32_t *)(frame->data[ch] + y * frame->linesize[ch]);
 
                     for (int x = 0; x < frame->width; x++)
-                        bytestream2_put_le16(pb, float2half(src[x], s->basetable, s->shifttable));
+                        bytestream2_put_le16(pb, float2half(src[x], &s->f2h_tables));
                 }
             }
         }
diff --git a/libavcodec/pnm.h b/libavcodec/pnm.h
index 7e5445f529..25251d9e4a 100644
--- a/libavcodec/pnm.h
+++ b/libavcodec/pnm.h
@@ -22,6 +22,7 @@
 #ifndef AVCODEC_PNM_H
 #define AVCODEC_PNM_H
 
+#include "libavutil/half2float.h"
 #include "avcodec.h"
 
 typedef struct PNMContext {
@@ -34,9 +35,7 @@ typedef struct PNMContext {
     int half;
     float scale;
 
-    uint32_t mantissatable[3072];
-    uint32_t exponenttable[64];
-    uint16_t offsettable[64];
+    half2float_tables h2f_tables;
 } PNMContext;
 
 int ff_pnm_decode_header(AVCodecContext *avctx, PNMContext * const s);
diff --git a/libavcodec/pnmdec.c b/libavcodec/pnmdec.c
index 9383dc8e60..6adc348ec8 100644
--- a/libavcodec/pnmdec.c
+++ b/libavcodec/pnmdec.c
@@ -313,18 +313,9 @@ static int pnm_decode_frame(AVCodecContext *avctx, AVFrame *p,
                 b = (float *)p->data[1];
                 for (int i = 0; i < avctx->height; i++) {
                     for (int j = 0; j < avctx->width; j++) {
-                        r[j] = av_int2float(half2float(AV_RL16(s->bytestream+0),
-                                                       s->mantissatable,
-                                                       s->exponenttable,
-                                                       s->offsettable)) * scale;
-                        g[j] = av_int2float(half2float(AV_RL16(s->bytestream+2),
-                                                       s->mantissatable,
-                                                       s->exponenttable,
-                                                       s->offsettable)) * scale;
-                        b[j] = av_int2float(half2float(AV_RL16(s->bytestream+4),
-                                                       s->mantissatable,
-                                                       s->exponenttable,
-                                                       s->offsettable)) * scale;
+                        r[j] = av_int2float(half2float(AV_RL16(s->bytestream+0), &s->h2f_tables)) * scale;
+                        g[j] = av_int2float(half2float(AV_RL16(s->bytestream+2), &s->h2f_tables)) * scale;
+                        b[j] = av_int2float(half2float(AV_RL16(s->bytestream+4), &s->h2f_tables)) * scale;
                         s->bytestream += 6;
                     }
 
@@ -340,18 +331,9 @@ static int pnm_decode_frame(AVCodecContext *avctx, AVFrame *p,
                 b = (float *)p->data[1];
                 for (int i = 0; i < avctx->height; i++) {
                     for (int j = 0; j < avctx->width; j++) {
-                        r[j] = av_int2float(half2float(AV_RB16(s->bytestream+0),
-                                                       s->mantissatable,
-                                                       s->exponenttable,
-                                                       s->offsettable)) * scale;
-                        g[j] = av_int2float(half2float(AV_RB16(s->bytestream+2),
-                                                       s->mantissatable,
-                                                       s->exponenttable,
-                                                       s->offsettable)) * scale;
-                        b[j] = av_int2float(half2float(AV_RB16(s->bytestream+4),
-                                                       s->mantissatable,
-                                                       s->exponenttable,
-                                                       s->offsettable)) * scale;
+                        r[j] = av_int2float(half2float(AV_RB16(s->bytestream+0), &s->h2f_tables)) * scale;
+                        g[j] = av_int2float(half2float(AV_RB16(s->bytestream+2), &s->h2f_tables)) * scale;
+                        b[j] = av_int2float(half2float(AV_RB16(s->bytestream+4), &s->h2f_tables)) * scale;
                         s->bytestream += 6;
                     }
 
@@ -394,10 +376,7 @@ static int pnm_decode_frame(AVCodecContext *avctx, AVFrame *p,
                 float *g = (float *)p->data[0];
                 for (int i = 0; i < avctx->height; i++) {
                     for (int j = 0; j < avctx->width; j++) {
-                        g[j] = av_int2float(half2float(AV_RL16(s->bytestream),
-                                                       s->mantissatable,
-                                                       s->exponenttable,
-                                                       s->offsettable)) * scale;
+                        g[j] = av_int2float(half2float(AV_RL16(s->bytestream), &s->h2f_tables)) * scale;
                         s->bytestream += 2;
                     }
                     g += p->linesize[0] / 4;
@@ -406,10 +385,7 @@ static int pnm_decode_frame(AVCodecContext *avctx, AVFrame *p,
                 float *g = (float *)p->data[0];
                 for (int i = 0; i < avctx->height; i++) {
                     for (int j = 0; j < avctx->width; j++) {
-                        g[j] = av_int2float(half2float(AV_RB16(s->bytestream),
-                                                       s->mantissatable,
-                                                       s->exponenttable,
-                                                       s->offsettable)) * scale;
+                        g[j] = av_int2float(half2float(AV_RB16(s->bytestream), &s->h2f_tables)) * scale;
                         s->bytestream += 2;
                     }
                     g += p->linesize[0] / 4;
@@ -501,7 +477,7 @@ static av_cold int phm_dec_init(AVCodecContext *avctx)
 {
     PNMContext *s = avctx->priv_data;
 
-    half2float_table(s->mantissatable, s->exponenttable, s->offsettable);
+    init_half2float_tables(&s->h2f_tables);
 
     return 0;
 }
diff --git a/libavcodec/pnmenc.c b/libavcodec/pnmenc.c
index 7ce534d06e..70992531bf 100644
--- a/libavcodec/pnmenc.c
+++ b/libavcodec/pnmenc.c
@@ -30,8 +30,7 @@
 #include "encode.h"
 
 typedef struct PHMEncContext {
-    uint16_t basetable[512];
-    uint8_t shifttable[512];
+    float2half_tables f2h_tables;
 } PHMEncContext;
 
 static int pnm_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
@@ -169,9 +168,9 @@ static int pnm_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
 
         for (int i = 0; i < avctx->height; i++) {
             for (int j = 0; j < avctx->width; j++) {
-                AV_WN16(bytestream + 0, float2half(av_float2int(r[j]), s->basetable, s->shifttable));
-                AV_WN16(bytestream + 2, float2half(av_float2int(g[j]), s->basetable, s->shifttable));
-                AV_WN16(bytestream + 4, float2half(av_float2int(b[j]), s->basetable, s->shifttable));
+                AV_WN16(bytestream + 0, float2half(av_float2int(r[j]), &s->f2h_tables));
+                AV_WN16(bytestream + 2, float2half(av_float2int(g[j]), &s->f2h_tables));
+                AV_WN16(bytestream + 4, float2half(av_float2int(b[j]), &s->f2h_tables));
                 bytestream += 6;
             }
 
@@ -184,7 +183,7 @@ static int pnm_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
 
         for (int i = 0; i < avctx->height; i++) {
             for (int j = 0; j < avctx->width; j++) {
-                AV_WN16(bytestream, float2half(av_float2int(g[j]), s->basetable, s->shifttable));
+                AV_WN16(bytestream, float2half(av_float2int(g[j]), &s->f2h_tables));
                 bytestream += 2;
             }
 
@@ -295,7 +294,7 @@ static av_cold int phm_enc_init(AVCodecContext *avctx)
 {
     PHMEncContext *s = avctx->priv_data;
 
-    float2half_tables(s->basetable, s->shifttable);
+    init_float2half_tables(&s->f2h_tables);
 
     return 0;
 }
diff --git a/libavutil/float2half.h b/libavutil/float2half.h
index d6aaab8278..9252560649 100644
--- a/libavutil/float2half.h
+++ b/libavutil/float2half.h
@@ -21,45 +21,50 @@
 
 #include <stdint.h>
 
-static void float2half_tables(uint16_t *basetable, uint8_t *shifttable)
+typedef struct float2half_tables {
+    uint16_t basetable[512];
+    uint8_t shifttable[512];
+} float2half_tables;
+
+static void init_float2half_tables(float2half_tables *t)
 {
     for (int i = 0; i < 256; i++) {
         int e = i - 127;
 
         if (e < -24) { // Very small numbers map to zero
-            basetable[i|0x000]  = 0x0000;
-            basetable[i|0x100]  = 0x8000;
-            shifttable[i|0x000] = 24;
-            shifttable[i|0x100] = 24;
+            t->basetable[i|0x000]  = 0x0000;
+            t->basetable[i|0x100]  = 0x8000;
+            t->shifttable[i|0x000] = 24;
+            t->shifttable[i|0x100] = 24;
         } else if (e < -14) { // Small numbers map to denorms
-            basetable[i|0x000] = (0x0400>>(-e-14));
-            basetable[i|0x100] = (0x0400>>(-e-14)) | 0x8000;
-            shifttable[i|0x000] = -e-1;
-            shifttable[i|0x100] = -e-1;
+            t->basetable[i|0x000] = (0x0400>>(-e-14));
+            t->basetable[i|0x100] = (0x0400>>(-e-14)) | 0x8000;
+            t->shifttable[i|0x000] = -e-1;
+            t->shifttable[i|0x100] = -e-1;
         } else if (e <= 15) { // Normal numbers just lose precision
-            basetable[i|0x000] = ((e + 15) << 10);
-            basetable[i|0x100] = ((e + 15) << 10) | 0x8000;
-            shifttable[i|0x000] = 13;
-            shifttable[i|0x100] = 13;
+            t->basetable[i|0x000] = ((e + 15) << 10);
+            t->basetable[i|0x100] = ((e + 15) << 10) | 0x8000;
+            t->shifttable[i|0x000] = 13;
+            t->shifttable[i|0x100] = 13;
         } else if (e < 128) { // Large numbers map to Infinity
-            basetable[i|0x000]  = 0x7C00;
-            basetable[i|0x100]  = 0xFC00;
-            shifttable[i|0x000] = 24;
-            shifttable[i|0x100] = 24;
+            t->basetable[i|0x000]  = 0x7C00;
+            t->basetable[i|0x100]  = 0xFC00;
+            t->shifttable[i|0x000] = 24;
+            t->shifttable[i|0x100] = 24;
         } else { // Infinity and NaN's stay Infinity and NaN's
-            basetable[i|0x000]  = 0x7C00;
-            basetable[i|0x100]  = 0xFC00;
-            shifttable[i|0x000] = 13;
-            shifttable[i|0x100] = 13;
+            t->basetable[i|0x000]  = 0x7C00;
+            t->basetable[i|0x100]  = 0xFC00;
+            t->shifttable[i|0x000] = 13;
+            t->shifttable[i|0x100] = 13;
         }
     }
 }
 
-static uint16_t float2half(uint32_t f, uint16_t *basetable, uint8_t *shifttable)
+static uint16_t float2half(uint32_t f, const float2half_tables *t)
 {
     uint16_t h;
 
-    h = basetable[(f >> 23) & 0x1ff] + ((f & 0x007fffff) >> shifttable[(f >> 23) & 0x1ff]);
+    h = t->basetable[(f >> 23) & 0x1ff] + ((f & 0x007fffff) >> t->shifttable[(f >> 23) & 0x1ff]);
 
     return h;
 }
diff --git a/libavutil/half2float.h b/libavutil/half2float.h
index 5af4690cfe..10b6fef4e6 100644
--- a/libavutil/half2float.h
+++ b/libavutil/half2float.h
@@ -21,6 +21,12 @@
 
 #include <stdint.h>
 
+typedef struct half2float_tables {
+    uint32_t mantissatable[3072];
+    uint32_t exponenttable[64];
+    uint16_t offsettable[64];
+} half2float_tables;
+
 static uint32_t convertmantissa(uint32_t i)
 {
     int32_t m = i << 13; // Zero pad mantissa bits
@@ -37,41 +43,39 @@ static uint32_t convertmantissa(uint32_t i)
     return m | e; // Return combined number
 }
 
-static void half2float_table(uint32_t *mantissatable, uint32_t *exponenttable,
-                             uint16_t *offsettable)
+static void init_half2float_tables(half2float_tables *t)
 {
-    mantissatable[0] = 0;
+    t->mantissatable[0] = 0;
     for (int i = 1; i < 1024; i++)
-        mantissatable[i] = convertmantissa(i);
+        t->mantissatable[i] = convertmantissa(i);
     for (int i = 1024; i < 2048; i++)
-        mantissatable[i] = 0x38000000UL + ((i - 1024) << 13UL);
+        t->mantissatable[i] = 0x38000000UL + ((i - 1024) << 13UL);
     for (int i = 2048; i < 3072; i++)
-        mantissatable[i] = mantissatable[i - 1024] | 0x400000UL;
-    mantissatable[2048] = mantissatable[1024];
+        t->mantissatable[i] = t->mantissatable[i - 1024] | 0x400000UL;
+    t->mantissatable[2048] = t->mantissatable[1024];
 
-    exponenttable[0] = 0;
+    t->exponenttable[0] = 0;
     for (int i = 1; i < 31; i++)
-        exponenttable[i] = i << 23;
+        t->exponenttable[i] = i << 23;
     for (int i = 33; i < 63; i++)
-        exponenttable[i] = 0x80000000UL + ((i - 32) << 23UL);
-    exponenttable[31]= 0x47800000UL;
-    exponenttable[32]= 0x80000000UL;
-    exponenttable[63]= 0xC7800000UL;
+        t->exponenttable[i] = 0x80000000UL + ((i - 32) << 23UL);
+    t->exponenttable[31]= 0x47800000UL;
+    t->exponenttable[32]= 0x80000000UL;
+    t->exponenttable[63]= 0xC7800000UL;
 
-    offsettable[0] = 0;
+    t->offsettable[0] = 0;
     for (int i = 1; i < 64; i++)
-        offsettable[i] = 1024;
-    offsettable[31] = 2048;
-    offsettable[32] = 0;
-    offsettable[63] = 2048;
+        t->offsettable[i] = 1024;
+    t->offsettable[31] = 2048;
+    t->offsettable[32] = 0;
+    t->offsettable[63] = 2048;
 }
 
-static uint32_t half2float(uint16_t h, const uint32_t *mantissatable, const uint32_t *exponenttable,
-                           const uint16_t *offsettable)
+static uint32_t half2float(uint16_t h, const half2float_tables *t)
 {
     uint32_t f;
 
-    f = mantissatable[offsettable[h >> 10] + (h & 0x3ff)] + exponenttable[h >> 10];
+    f = t->mantissatable[t->offsettable[h >> 10] + (h & 0x3ff)] + t->exponenttable[h >> 10];
 
     return f;
 }
-- 
2.34.1

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 39+ messages in thread

* [FFmpeg-devel] [PATCH 08/11] avutil/half2float: move non-inline init code out of header
  2022-08-10 20:47 [FFmpeg-devel] [PATCH 01/11] lavu/pixfmt: add packed RGBA float16 format Timo Rothenpieler
                   ` (5 preceding siblings ...)
  2022-08-10 20:47 ` [FFmpeg-devel] [PATCH 07/11] avutil/half2float: move tables to header-internal structs Timo Rothenpieler
@ 2022-08-10 20:47 ` Timo Rothenpieler
  2022-08-11 20:46   ` Michael Niedermayer
  2022-08-10 20:47 ` [FFmpeg-devel] [PATCH 09/11] avutil/half2float: use native _Float16 if available Timo Rothenpieler
                   ` (2 subsequent siblings)
  9 siblings, 1 reply; 39+ messages in thread
From: Timo Rothenpieler @ 2022-08-10 20:47 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: Timo Rothenpieler

---
 libavcodec/Makefile     |  8 +++---
 libavcodec/exr.c        |  2 +-
 libavcodec/exrenc.c     |  2 +-
 libavcodec/float2half.c | 19 +++++++++++++
 libavcodec/half2float.c | 19 +++++++++++++
 libavcodec/pnmdec.c     |  2 +-
 libavcodec/pnmenc.c     |  2 +-
 libavutil/float2half.c  | 53 ++++++++++++++++++++++++++++++++++
 libavutil/float2half.h  | 36 ++---------------------
 libavutil/half2float.c  | 63 +++++++++++++++++++++++++++++++++++++++++
 libavutil/half2float.h  | 46 ++----------------------------
 11 files changed, 166 insertions(+), 86 deletions(-)
 create mode 100644 libavcodec/float2half.c
 create mode 100644 libavcodec/half2float.c
 create mode 100644 libavutil/float2half.c
 create mode 100644 libavutil/half2float.c

diff --git a/libavcodec/Makefile b/libavcodec/Makefile
index 029f1bad3d..cb80f73d99 100644
--- a/libavcodec/Makefile
+++ b/libavcodec/Makefile
@@ -337,8 +337,8 @@ OBJS-$(CONFIG_EIGHTSVX_FIB_DECODER)    += 8svx.o
 OBJS-$(CONFIG_ESCAPE124_DECODER)       += escape124.o
 OBJS-$(CONFIG_ESCAPE130_DECODER)       += escape130.o
 OBJS-$(CONFIG_EVRC_DECODER)            += evrcdec.o acelp_vectors.o lsp.o
-OBJS-$(CONFIG_EXR_DECODER)             += exr.o exrdsp.o
-OBJS-$(CONFIG_EXR_ENCODER)             += exrenc.o
+OBJS-$(CONFIG_EXR_DECODER)             += exr.o exrdsp.o half2float.o
+OBJS-$(CONFIG_EXR_ENCODER)             += exrenc.o float2half.o
 OBJS-$(CONFIG_FASTAUDIO_DECODER)       += fastaudio.o
 OBJS-$(CONFIG_FFV1_DECODER)            += ffv1dec.o ffv1.o
 OBJS-$(CONFIG_FFV1_ENCODER)            += ffv1enc.o ffv1.o
@@ -570,8 +570,8 @@ OBJS-$(CONFIG_PGMYUV_DECODER)          += pnmdec.o pnm.o
 OBJS-$(CONFIG_PGMYUV_ENCODER)          += pnmenc.o
 OBJS-$(CONFIG_PGSSUB_DECODER)          += pgssubdec.o
 OBJS-$(CONFIG_PGX_DECODER)             += pgxdec.o
-OBJS-$(CONFIG_PHM_DECODER)             += pnmdec.o pnm.o
-OBJS-$(CONFIG_PHM_ENCODER)             += pnmenc.o
+OBJS-$(CONFIG_PHM_DECODER)             += pnmdec.o pnm.o half2float.o
+OBJS-$(CONFIG_PHM_ENCODER)             += pnmenc.o float2half.o
 OBJS-$(CONFIG_PHOTOCD_DECODER)         += photocd.o
 OBJS-$(CONFIG_PICTOR_DECODER)          += pictordec.o cga_data.o
 OBJS-$(CONFIG_PIXLET_DECODER)          += pixlet.o
diff --git a/libavcodec/exr.c b/libavcodec/exr.c
index 825354873d..a3582bfdd6 100644
--- a/libavcodec/exr.c
+++ b/libavcodec/exr.c
@@ -2208,7 +2208,7 @@ static av_cold int decode_init(AVCodecContext *avctx)
     float one_gamma = 1.0f / s->gamma;
     avpriv_trc_function trc_func = NULL;
 
-    init_half2float_tables(&s->h2f_tables);
+    ff_init_half2float_tables(&s->h2f_tables);
 
     s->avctx              = avctx;
 
diff --git a/libavcodec/exrenc.c b/libavcodec/exrenc.c
index 6ab9400b7c..77b1ce052b 100644
--- a/libavcodec/exrenc.c
+++ b/libavcodec/exrenc.c
@@ -94,7 +94,7 @@ static av_cold int encode_init(AVCodecContext *avctx)
 {
     EXRContext *s = avctx->priv_data;
 
-    init_float2half_tables(&s->f2h_tables);
+    ff_init_float2half_tables(&s->f2h_tables);
 
     switch (avctx->pix_fmt) {
     case AV_PIX_FMT_GBRPF32:
diff --git a/libavcodec/float2half.c b/libavcodec/float2half.c
new file mode 100644
index 0000000000..90a6f63fac
--- /dev/null
+++ b/libavcodec/float2half.c
@@ -0,0 +1,19 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/float2half.c"
diff --git a/libavcodec/half2float.c b/libavcodec/half2float.c
new file mode 100644
index 0000000000..1b023f96a5
--- /dev/null
+++ b/libavcodec/half2float.c
@@ -0,0 +1,19 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/half2float.c"
diff --git a/libavcodec/pnmdec.c b/libavcodec/pnmdec.c
index 6adc348ec8..fbed282e93 100644
--- a/libavcodec/pnmdec.c
+++ b/libavcodec/pnmdec.c
@@ -477,7 +477,7 @@ static av_cold int phm_dec_init(AVCodecContext *avctx)
 {
     PNMContext *s = avctx->priv_data;
 
-    init_half2float_tables(&s->h2f_tables);
+    ff_init_half2float_tables(&s->h2f_tables);
 
     return 0;
 }
diff --git a/libavcodec/pnmenc.c b/libavcodec/pnmenc.c
index 70992531bf..50f55bb1b9 100644
--- a/libavcodec/pnmenc.c
+++ b/libavcodec/pnmenc.c
@@ -294,7 +294,7 @@ static av_cold int phm_enc_init(AVCodecContext *avctx)
 {
     PHMEncContext *s = avctx->priv_data;
 
-    init_float2half_tables(&s->f2h_tables);
+    ff_init_float2half_tables(&s->f2h_tables);
 
     return 0;
 }
diff --git a/libavutil/float2half.c b/libavutil/float2half.c
new file mode 100644
index 0000000000..dba14cef5d
--- /dev/null
+++ b/libavutil/float2half.c
@@ -0,0 +1,53 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/float2half.h"
+
+void ff_init_float2half_tables(float2half_tables *t)
+{
+    for (int i = 0; i < 256; i++) {
+        int e = i - 127;
+
+        if (e < -24) { // Very small numbers map to zero
+            t->basetable[i|0x000]  = 0x0000;
+            t->basetable[i|0x100]  = 0x8000;
+            t->shifttable[i|0x000] = 24;
+            t->shifttable[i|0x100] = 24;
+        } else if (e < -14) { // Small numbers map to denorms
+            t->basetable[i|0x000] = (0x0400>>(-e-14));
+            t->basetable[i|0x100] = (0x0400>>(-e-14)) | 0x8000;
+            t->shifttable[i|0x000] = -e-1;
+            t->shifttable[i|0x100] = -e-1;
+        } else if (e <= 15) { // Normal numbers just lose precision
+            t->basetable[i|0x000] = ((e + 15) << 10);
+            t->basetable[i|0x100] = ((e + 15) << 10) | 0x8000;
+            t->shifttable[i|0x000] = 13;
+            t->shifttable[i|0x100] = 13;
+        } else if (e < 128) { // Large numbers map to Infinity
+            t->basetable[i|0x000]  = 0x7C00;
+            t->basetable[i|0x100]  = 0xFC00;
+            t->shifttable[i|0x000] = 24;
+            t->shifttable[i|0x100] = 24;
+        } else { // Infinity and NaN's stay Infinity and NaN's
+            t->basetable[i|0x000]  = 0x7C00;
+            t->basetable[i|0x100]  = 0xFC00;
+            t->shifttable[i|0x000] = 13;
+            t->shifttable[i|0x100] = 13;
+        }
+    }
+}
diff --git a/libavutil/float2half.h b/libavutil/float2half.h
index 9252560649..b8c9cdfc4f 100644
--- a/libavutil/float2half.h
+++ b/libavutil/float2half.h
@@ -26,41 +26,9 @@ typedef struct float2half_tables {
     uint8_t shifttable[512];
 } float2half_tables;
 
-static void init_float2half_tables(float2half_tables *t)
-{
-    for (int i = 0; i < 256; i++) {
-        int e = i - 127;
-
-        if (e < -24) { // Very small numbers map to zero
-            t->basetable[i|0x000]  = 0x0000;
-            t->basetable[i|0x100]  = 0x8000;
-            t->shifttable[i|0x000] = 24;
-            t->shifttable[i|0x100] = 24;
-        } else if (e < -14) { // Small numbers map to denorms
-            t->basetable[i|0x000] = (0x0400>>(-e-14));
-            t->basetable[i|0x100] = (0x0400>>(-e-14)) | 0x8000;
-            t->shifttable[i|0x000] = -e-1;
-            t->shifttable[i|0x100] = -e-1;
-        } else if (e <= 15) { // Normal numbers just lose precision
-            t->basetable[i|0x000] = ((e + 15) << 10);
-            t->basetable[i|0x100] = ((e + 15) << 10) | 0x8000;
-            t->shifttable[i|0x000] = 13;
-            t->shifttable[i|0x100] = 13;
-        } else if (e < 128) { // Large numbers map to Infinity
-            t->basetable[i|0x000]  = 0x7C00;
-            t->basetable[i|0x100]  = 0xFC00;
-            t->shifttable[i|0x000] = 24;
-            t->shifttable[i|0x100] = 24;
-        } else { // Infinity and NaN's stay Infinity and NaN's
-            t->basetable[i|0x000]  = 0x7C00;
-            t->basetable[i|0x100]  = 0xFC00;
-            t->shifttable[i|0x000] = 13;
-            t->shifttable[i|0x100] = 13;
-        }
-    }
-}
+void ff_init_float2half_tables(float2half_tables *t);
 
-static uint16_t float2half(uint32_t f, const float2half_tables *t)
+static inline uint16_t float2half(uint32_t f, const float2half_tables *t)
 {
     uint16_t h;
 
diff --git a/libavutil/half2float.c b/libavutil/half2float.c
new file mode 100644
index 0000000000..baac8e4093
--- /dev/null
+++ b/libavutil/half2float.c
@@ -0,0 +1,63 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/half2float.h"
+
+static uint32_t convertmantissa(uint32_t i)
+{
+    int32_t m = i << 13; // Zero pad mantissa bits
+    int32_t e = 0; // Zero exponent
+
+    while (!(m & 0x00800000)) { // While not normalized
+        e -= 0x00800000; // Decrement exponent (1<<23)
+        m <<= 1; // Shift mantissa
+    }
+
+    m &= ~0x00800000; // Clear leading 1 bit
+    e +=  0x38800000; // Adjust bias ((127-14)<<23)
+
+    return m | e; // Return combined number
+}
+
+void ff_init_half2float_tables(half2float_tables *t)
+{
+    t->mantissatable[0] = 0;
+    for (int i = 1; i < 1024; i++)
+        t->mantissatable[i] = convertmantissa(i);
+    for (int i = 1024; i < 2048; i++)
+        t->mantissatable[i] = 0x38000000UL + ((i - 1024) << 13UL);
+    for (int i = 2048; i < 3072; i++)
+        t->mantissatable[i] = t->mantissatable[i - 1024] | 0x400000UL;
+    t->mantissatable[2048] = t->mantissatable[1024];
+
+    t->exponenttable[0] = 0;
+    for (int i = 1; i < 31; i++)
+        t->exponenttable[i] = i << 23;
+    for (int i = 33; i < 63; i++)
+        t->exponenttable[i] = 0x80000000UL + ((i - 32) << 23UL);
+    t->exponenttable[31]= 0x47800000UL;
+    t->exponenttable[32]= 0x80000000UL;
+    t->exponenttable[63]= 0xC7800000UL;
+
+    t->offsettable[0] = 0;
+    for (int i = 1; i < 64; i++)
+        t->offsettable[i] = 1024;
+    t->offsettable[31] = 2048;
+    t->offsettable[32] = 0;
+    t->offsettable[63] = 2048;
+}
diff --git a/libavutil/half2float.h b/libavutil/half2float.h
index 10b6fef4e6..cb58e44a1c 100644
--- a/libavutil/half2float.h
+++ b/libavutil/half2float.h
@@ -27,51 +27,9 @@ typedef struct half2float_tables {
     uint16_t offsettable[64];
 } half2float_tables;
 
-static uint32_t convertmantissa(uint32_t i)
-{
-    int32_t m = i << 13; // Zero pad mantissa bits
-    int32_t e = 0; // Zero exponent
-
-    while (!(m & 0x00800000)) { // While not normalized
-        e -= 0x00800000; // Decrement exponent (1<<23)
-        m <<= 1; // Shift mantissa
-    }
-
-    m &= ~0x00800000; // Clear leading 1 bit
-    e +=  0x38800000; // Adjust bias ((127-14)<<23)
-
-    return m | e; // Return combined number
-}
-
-static void init_half2float_tables(half2float_tables *t)
-{
-    t->mantissatable[0] = 0;
-    for (int i = 1; i < 1024; i++)
-        t->mantissatable[i] = convertmantissa(i);
-    for (int i = 1024; i < 2048; i++)
-        t->mantissatable[i] = 0x38000000UL + ((i - 1024) << 13UL);
-    for (int i = 2048; i < 3072; i++)
-        t->mantissatable[i] = t->mantissatable[i - 1024] | 0x400000UL;
-    t->mantissatable[2048] = t->mantissatable[1024];
-
-    t->exponenttable[0] = 0;
-    for (int i = 1; i < 31; i++)
-        t->exponenttable[i] = i << 23;
-    for (int i = 33; i < 63; i++)
-        t->exponenttable[i] = 0x80000000UL + ((i - 32) << 23UL);
-    t->exponenttable[31]= 0x47800000UL;
-    t->exponenttable[32]= 0x80000000UL;
-    t->exponenttable[63]= 0xC7800000UL;
-
-    t->offsettable[0] = 0;
-    for (int i = 1; i < 64; i++)
-        t->offsettable[i] = 1024;
-    t->offsettable[31] = 2048;
-    t->offsettable[32] = 0;
-    t->offsettable[63] = 2048;
-}
+void ff_init_half2float_tables(half2float_tables *t);
 
-static uint32_t half2float(uint16_t h, const half2float_tables *t)
+static inline uint32_t half2float(uint16_t h, const half2float_tables *t)
 {
     uint32_t f;
 
-- 
2.34.1

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 39+ messages in thread

* [FFmpeg-devel] [PATCH 09/11] avutil/half2float: use native _Float16 if available
  2022-08-10 20:47 [FFmpeg-devel] [PATCH 01/11] lavu/pixfmt: add packed RGBA float16 format Timo Rothenpieler
                   ` (6 preceding siblings ...)
  2022-08-10 20:47 ` [FFmpeg-devel] [PATCH 08/11] avutil/half2float: move non-inline init code out of header Timo Rothenpieler
@ 2022-08-10 20:47 ` Timo Rothenpieler
  2022-08-10 21:03   ` Andreas Rheinhardt
  2022-08-10 22:51   ` [FFmpeg-devel] [PATCH v2 " Timo Rothenpieler
  2022-08-10 20:47 ` [FFmpeg-devel] [PATCH 10/11] swscale: add SwsContext parameter to input functions Timo Rothenpieler
  2022-08-10 20:47 ` [FFmpeg-devel] [PATCH 11/11] swscale/input: add rgbaf16 input support Timo Rothenpieler
  9 siblings, 2 replies; 39+ messages in thread
From: Timo Rothenpieler @ 2022-08-10 20:47 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: Timo Rothenpieler

_Float16 support has been available on arm/aarch64 for a while, and with
gcc 12 it was also enabled on x86, as long as SSE2 is supported.

If the target arch supports F16C, gcc emits fairly efficient assembly
that takes advantage of it. This is the case on x86-64-v3 or higher.
Without F16C, gcc emulates the conversions in software using SSE2
instructions.
---
 configure              |  4 ++++
 libavutil/float2half.c |  2 ++
 libavutil/float2half.h | 16 ++++++++++++++++
 libavutil/half2float.c |  4 ++++
 libavutil/half2float.h | 16 ++++++++++++++++
 5 files changed, 42 insertions(+)

diff --git a/configure b/configure
index 6761d0cb32..2536ae012d 100755
--- a/configure
+++ b/configure
@@ -2143,6 +2143,7 @@ ARCH_FEATURES="
     fast_64bit
     fast_clz
     fast_cmov
+    float16
     local_aligned
     simd_align_16
     simd_align_32
@@ -5125,6 +5126,8 @@ elif enabled arm; then
             ;;
     esac
 
+    test_cflags -mfp16-format=ieee && add_cflags -mfp16-format=ieee
+
 elif enabled avr32; then
 
     case $cpu in
@@ -6228,6 +6231,7 @@ check_builtin MemoryBarrier windows.h "MemoryBarrier()"
 check_builtin sync_val_compare_and_swap "" "int *ptr; int oldval, newval; __sync_val_compare_and_swap(ptr, oldval, newval)"
 check_builtin gmtime_r time.h "time_t *time; struct tm *tm; gmtime_r(time, tm)"
 check_builtin localtime_r time.h "time_t *time; struct tm *tm; localtime_r(time, tm)"
+check_builtin float16 "" "_Float16 f16var"
 
 case "$custom_allocator" in
     jemalloc)
diff --git a/libavutil/float2half.c b/libavutil/float2half.c
index dba14cef5d..1390d3acc0 100644
--- a/libavutil/float2half.c
+++ b/libavutil/float2half.c
@@ -20,6 +20,7 @@
 
 void ff_init_float2half_tables(float2half_tables *t)
 {
+#if !HAVE_FLOAT16
     for (int i = 0; i < 256; i++) {
         int e = i - 127;
 
@@ -50,4 +51,5 @@ void ff_init_float2half_tables(float2half_tables *t)
             t->shifttable[i|0x100] = 13;
         }
     }
+#endif
 }
diff --git a/libavutil/float2half.h b/libavutil/float2half.h
index b8c9cdfc4f..8c1fb804b7 100644
--- a/libavutil/float2half.h
+++ b/libavutil/float2half.h
@@ -20,21 +20,37 @@
 #define AVUTIL_FLOAT2HALF_H
 
 #include <stdint.h>
+#include "intfloat.h"
+
+#include "config.h"
 
 typedef struct float2half_tables {
+#if HAVE_FLOAT16
+    uint8_t dummy;
+#else
     uint16_t basetable[512];
     uint8_t shifttable[512];
+#endif
 } float2half_tables;
 
 void ff_init_float2half_tables(float2half_tables *t);
 
 static inline uint16_t float2half(uint32_t f, const float2half_tables *t)
 {
+#if HAVE_FLOAT16
+    union {
+        _Float16 f;
+        uint16_t i;
+    } u;
+    u.f = av_int2float(f);
+    return u.i;
+#else
     uint16_t h;
 
     h = t->basetable[(f >> 23) & 0x1ff] + ((f & 0x007fffff) >> t->shifttable[(f >> 23) & 0x1ff]);
 
     return h;
+#endif
 }
 
 #endif /* AVUTIL_FLOAT2HALF_H */
diff --git a/libavutil/half2float.c b/libavutil/half2float.c
index baac8e4093..873226d3a0 100644
--- a/libavutil/half2float.c
+++ b/libavutil/half2float.c
@@ -18,6 +18,7 @@
 
 #include "libavutil/half2float.h"
 
+#if !HAVE_FLOAT16
 static uint32_t convertmantissa(uint32_t i)
 {
     int32_t m = i << 13; // Zero pad mantissa bits
@@ -33,9 +34,11 @@ static uint32_t convertmantissa(uint32_t i)
 
     return m | e; // Return combined number
 }
+#endif
 
 void ff_init_half2float_tables(half2float_tables *t)
 {
+#if !HAVE_FLOAT16
     t->mantissatable[0] = 0;
     for (int i = 1; i < 1024; i++)
         t->mantissatable[i] = convertmantissa(i);
@@ -60,4 +63,5 @@ void ff_init_half2float_tables(half2float_tables *t)
     t->offsettable[31] = 2048;
     t->offsettable[32] = 0;
     t->offsettable[63] = 2048;
+#endif
 }
diff --git a/libavutil/half2float.h b/libavutil/half2float.h
index cb58e44a1c..b2a7c934a6 100644
--- a/libavutil/half2float.h
+++ b/libavutil/half2float.h
@@ -20,22 +20,38 @@
 #define AVUTIL_HALF2FLOAT_H
 
 #include <stdint.h>
+#include "intfloat.h"
+
+#include "config.h"
 
 typedef struct half2float_tables {
+#if HAVE_FLOAT16
+    uint8_t dummy;
+#else
     uint32_t mantissatable[3072];
     uint32_t exponenttable[64];
     uint16_t offsettable[64];
+#endif
 } half2float_tables;
 
 void ff_init_half2float_tables(half2float_tables *t);
 
 static inline uint32_t half2float(uint16_t h, const half2float_tables *t)
 {
+#if HAVE_FLOAT16
+    union {
+        _Float16 f;
+        uint16_t i;
+    } u;
+    u.i = h;
+    return av_float2int(u.f);
+#else
     uint32_t f;
 
     f = t->mantissatable[t->offsettable[h >> 10] + (h & 0x3ff)] + t->exponenttable[h >> 10];
 
     return f;
+#endif
 }
 
 #endif /* AVUTIL_HALF2FLOAT_H */
-- 
2.34.1

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 39+ messages in thread

* [FFmpeg-devel] [PATCH 10/11] swscale: add SwsContext parameter to input functions
  2022-08-10 20:47 [FFmpeg-devel] [PATCH 01/11] lavu/pixfmt: add packed RGBA float16 format Timo Rothenpieler
                   ` (7 preceding siblings ...)
  2022-08-10 20:47 ` [FFmpeg-devel] [PATCH 09/11] avutil/half2float: use native _Float16 if available Timo Rothenpieler
@ 2022-08-10 20:47 ` Timo Rothenpieler
  2022-08-10 20:52   ` Timo Rothenpieler
  2022-08-10 21:55   ` Andreas Rheinhardt
  2022-08-10 20:47 ` [FFmpeg-devel] [PATCH 11/11] swscale/input: add rgbaf16 input support Timo Rothenpieler
  9 siblings, 2 replies; 39+ messages in thread
From: Timo Rothenpieler @ 2022-08-10 20:47 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: Timo Rothenpieler

---
 libswscale/hscale.c           |  12 +--
 libswscale/input.c            | 149 ++++++++++++++++++----------------
 libswscale/swscale_internal.h |  17 ++--
 libswscale/x86/swscale.c      |  13 +--
 4 files changed, 106 insertions(+), 85 deletions(-)

diff --git a/libswscale/hscale.c b/libswscale/hscale.c
index eca0635338..6789ce7540 100644
--- a/libswscale/hscale.c
+++ b/libswscale/hscale.c
@@ -105,18 +105,18 @@ static int lum_convert(SwsContext *c, SwsFilterDescriptor *desc, int sliceY, int
         uint8_t * dst = desc->dst->plane[0].line[i];
 
         if (c->lumToYV12) {
-            c->lumToYV12(dst, src[0], src[1], src[2], srcW, pal);
+            c->lumToYV12(dst, src[0], src[1], src[2], srcW, pal, c->input_opaque);
         } else if (c->readLumPlanar) {
-            c->readLumPlanar(dst, src, srcW, c->input_rgb2yuv_table);
+            c->readLumPlanar(dst, src, srcW, c->input_rgb2yuv_table, c->input_opaque);
         }
 
 
         if (desc->alpha) {
             dst = desc->dst->plane[3].line[i];
             if (c->alpToYV12) {
-                c->alpToYV12(dst, src[3], src[1], src[2], srcW, pal);
+                c->alpToYV12(dst, src[3], src[1], src[2], srcW, pal, c->input_opaque);
             } else if (c->readAlpPlanar) {
-                c->readAlpPlanar(dst, src, srcW, NULL);
+                c->readAlpPlanar(dst, src, srcW, NULL, c->input_opaque);
             }
         }
     }
@@ -224,9 +224,9 @@ static int chr_convert(SwsContext *c, SwsFilterDescriptor *desc, int sliceY, int
         uint8_t * dst1 = desc->dst->plane[1].line[i];
         uint8_t * dst2 = desc->dst->plane[2].line[i];
         if (c->chrToYV12) {
-            c->chrToYV12(dst1, dst2, src[0], src[1], src[2], srcW, pal);
+            c->chrToYV12(dst1, dst2, src[0], src[1], src[2], srcW, pal, c->input_opaque);
         } else if (c->readChrPlanar) {
-            c->readChrPlanar(dst1, dst2, src, srcW, c->input_rgb2yuv_table);
+            c->readChrPlanar(dst1, dst2, src, srcW, c->input_rgb2yuv_table, c->input_opaque);
         }
     }
     return sliceH;
diff --git a/libswscale/input.c b/libswscale/input.c
index 68abc4d62c..36ef1e43ac 100644
--- a/libswscale/input.c
+++ b/libswscale/input.c
@@ -88,7 +88,7 @@ rgb64ToUV_half_c_template(uint16_t *dstU, uint16_t *dstV,
 
 #define rgb64funcs(pattern, BE_LE, origin) \
 static void pattern ## 64 ## BE_LE ## ToY_c(uint8_t *_dst, const uint8_t *_src, const uint8_t *unused0, const uint8_t *unused1,\
-                                    int width, uint32_t *rgb2yuv) \
+                                    int width, uint32_t *rgb2yuv, void *opq) \
 { \
     const uint16_t *src = (const uint16_t *) _src; \
     uint16_t *dst = (uint16_t *) _dst; \
@@ -97,7 +97,7 @@ static void pattern ## 64 ## BE_LE ## ToY_c(uint8_t *_dst, const uint8_t *_src,
  \
 static void pattern ## 64 ## BE_LE ## ToUV_c(uint8_t *_dstU, uint8_t *_dstV, \
                                     const uint8_t *unused0, const uint8_t *_src1, const uint8_t *_src2, \
-                                    int width, uint32_t *rgb2yuv) \
+                                    int width, uint32_t *rgb2yuv, void *opq) \
 { \
     const uint16_t *src1 = (const uint16_t *) _src1, \
                    *src2 = (const uint16_t *) _src2; \
@@ -107,7 +107,7 @@ static void pattern ## 64 ## BE_LE ## ToUV_c(uint8_t *_dstU, uint8_t *_dstV, \
  \
 static void pattern ## 64 ## BE_LE ## ToUV_half_c(uint8_t *_dstU, uint8_t *_dstV, \
                                     const uint8_t *unused0, const uint8_t *_src1, const uint8_t *_src2, \
-                                    int width, uint32_t *rgb2yuv) \
+                                    int width, uint32_t *rgb2yuv, void *opq) \
 { \
     const uint16_t *src1 = (const uint16_t *) _src1, \
                    *src2 = (const uint16_t *) _src2; \
@@ -192,7 +192,8 @@ static void pattern ## 48 ## BE_LE ## ToY_c(uint8_t *_dst,              \
                                             const uint8_t *_src,        \
                                             const uint8_t *unused0, const uint8_t *unused1,\
                                             int width,                  \
-                                            uint32_t *rgb2yuv)          \
+                                            uint32_t *rgb2yuv,          \
+                                            void *opq)                  \
 {                                                                       \
     const uint16_t *src = (const uint16_t *)_src;                       \
     uint16_t *dst       = (uint16_t *)_dst;                             \
@@ -205,7 +206,8 @@ static void pattern ## 48 ## BE_LE ## ToUV_c(uint8_t *_dstU,            \
                                              const uint8_t *_src1,      \
                                              const uint8_t *_src2,      \
                                              int width,                 \
-                                             uint32_t *rgb2yuv)         \
+                                             uint32_t *rgb2yuv,         \
+                                             void *opq)                 \
 {                                                                       \
     const uint16_t *src1 = (const uint16_t *)_src1,                     \
                    *src2 = (const uint16_t *)_src2;                     \
@@ -220,7 +222,8 @@ static void pattern ## 48 ## BE_LE ## ToUV_half_c(uint8_t *_dstU,       \
                                                   const uint8_t *_src1, \
                                                   const uint8_t *_src2, \
                                                   int width,            \
-                                                  uint32_t *rgb2yuv)    \
+                                                  uint32_t *rgb2yuv,    \
+                                                  void *opq)            \
 {                                                                       \
     const uint16_t *src1 = (const uint16_t *)_src1,                     \
                    *src2 = (const uint16_t *)_src2;                     \
@@ -345,7 +348,7 @@ static av_always_inline void rgb16_32ToUV_half_c_template(int16_t *dstU,
 #define rgb16_32_wrapper(fmt, name, shr, shg, shb, shp, maskr,          \
                          maskg, maskb, rsh, gsh, bsh, S)                \
 static void name ## ToY_c(uint8_t *dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2,            \
-                          int width, uint32_t *tab)                     \
+                          int width, uint32_t *tab, void *opq)          \
 {                                                                       \
     rgb16_32ToY_c_template((int16_t*)dst, src, width, fmt, shr, shg, shb, shp,    \
                            maskr, maskg, maskb, rsh, gsh, bsh, S, tab); \
@@ -353,7 +356,7 @@ static void name ## ToY_c(uint8_t *dst, const uint8_t *src, const uint8_t *unuse
                                                                         \
 static void name ## ToUV_c(uint8_t *dstU, uint8_t *dstV,                \
                            const uint8_t *unused0, const uint8_t *src, const uint8_t *dummy,    \
-                           int width, uint32_t *tab)                    \
+                           int width, uint32_t *tab, void *opq)         \
 {                                                                       \
     rgb16_32ToUV_c_template((int16_t*)dstU, (int16_t*)dstV, src, width, fmt,                \
                             shr, shg, shb, shp,                         \
@@ -363,7 +366,7 @@ static void name ## ToUV_c(uint8_t *dstU, uint8_t *dstV,                \
 static void name ## ToUV_half_c(uint8_t *dstU, uint8_t *dstV,           \
                                 const uint8_t *unused0, const uint8_t *src,                     \
                                 const uint8_t *dummy,                   \
-                                int width, uint32_t *tab)               \
+                                int width, uint32_t *tab, void *opq)    \
 {                                                                       \
     rgb16_32ToUV_half_c_template((int16_t*)dstU, (int16_t*)dstV, src, width, fmt,           \
                                  shr, shg, shb, shp,                    \
@@ -392,7 +395,7 @@ rgb16_32_wrapper(AV_PIX_FMT_X2BGR10LE, bgr30le, 0, 6, 16, 0, 0x3FF, 0xFFC00, 0x3
 
 static void gbr24pToUV_half_c(uint8_t *_dstU, uint8_t *_dstV,
                          const uint8_t *gsrc, const uint8_t *bsrc, const uint8_t *rsrc,
-                         int width, uint32_t *rgb2yuv)
+                         int width, uint32_t *rgb2yuv, void *opq)
 {
     uint16_t *dstU = (uint16_t *)_dstU;
     uint16_t *dstV = (uint16_t *)_dstV;
@@ -411,7 +414,7 @@ static void gbr24pToUV_half_c(uint8_t *_dstU, uint8_t *_dstV,
 }
 
 static void rgba64leToA_c(uint8_t *_dst, const uint8_t *_src, const uint8_t *unused1,
-                          const uint8_t *unused2, int width, uint32_t *unused)
+                          const uint8_t *unused2, int width, uint32_t *unused, void *opq)
 {
     int16_t *dst = (int16_t *)_dst;
     const uint16_t *src = (const uint16_t *)_src;
@@ -421,7 +424,7 @@ static void rgba64leToA_c(uint8_t *_dst, const uint8_t *_src, const uint8_t *unu
 }
 
 static void rgba64beToA_c(uint8_t *_dst, const uint8_t *_src, const uint8_t *unused1,
-                          const uint8_t *unused2, int width, uint32_t *unused)
+                          const uint8_t *unused2, int width, uint32_t *unused, void *opq)
 {
     int16_t *dst = (int16_t *)_dst;
     const uint16_t *src = (const uint16_t *)_src;
@@ -430,7 +433,8 @@ static void rgba64beToA_c(uint8_t *_dst, const uint8_t *_src, const uint8_t *unu
         dst[i] = AV_RB16(src + 4 * i + 3);
 }
 
-static void abgrToA_c(uint8_t *_dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2, int width, uint32_t *unused)
+static void abgrToA_c(uint8_t *_dst, const uint8_t *src, const uint8_t *unused1,
+                      const uint8_t *unused2, int width, uint32_t *unused, void *opq)
 {
     int16_t *dst = (int16_t *)_dst;
     int i;
@@ -439,7 +443,8 @@ static void abgrToA_c(uint8_t *_dst, const uint8_t *src, const uint8_t *unused1,
     }
 }
 
-static void rgbaToA_c(uint8_t *_dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2, int width, uint32_t *unused)
+static void rgbaToA_c(uint8_t *_dst, const uint8_t *src, const uint8_t *unused1,
+                      const uint8_t *unused2, int width, uint32_t *unused, void *opq)
 {
     int16_t *dst = (int16_t *)_dst;
     int i;
@@ -448,7 +453,8 @@ static void rgbaToA_c(uint8_t *_dst, const uint8_t *src, const uint8_t *unused1,
     }
 }
 
-static void palToA_c(uint8_t *_dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2, int width, uint32_t *pal)
+static void palToA_c(uint8_t *_dst, const uint8_t *src, const uint8_t *unused1,
+                     const uint8_t *unused2, int width, uint32_t *pal, void *opq)
 {
     int16_t *dst = (int16_t *)_dst;
     int i;
@@ -459,7 +465,8 @@ static void palToA_c(uint8_t *_dst, const uint8_t *src, const uint8_t *unused1,
     }
 }
 
-static void palToY_c(uint8_t *_dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2, int width, uint32_t *pal)
+static void palToY_c(uint8_t *_dst, const uint8_t *src, const uint8_t *unused1,
+                     const uint8_t *unused2, int width, uint32_t *pal, void *opq)
 {
     int16_t *dst = (int16_t *)_dst;
     int i;
@@ -471,8 +478,8 @@ static void palToY_c(uint8_t *_dst, const uint8_t *src, const uint8_t *unused1,
 }
 
 static void palToUV_c(uint8_t *_dstU, uint8_t *_dstV,
-                           const uint8_t *unused0, const uint8_t *src1, const uint8_t *src2,
-                      int width, uint32_t *pal)
+                      const uint8_t *unused0, const uint8_t *src1, const uint8_t *src2,
+                      int width, uint32_t *pal, void *opq)
 {
     uint16_t *dstU = (uint16_t *)_dstU;
     int16_t *dstV = (int16_t *)_dstV;
@@ -486,7 +493,8 @@ static void palToUV_c(uint8_t *_dstU, uint8_t *_dstV,
     }
 }
 
-static void monowhite2Y_c(uint8_t *_dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2,  int width, uint32_t *unused)
+static void monowhite2Y_c(uint8_t *_dst, const uint8_t *src, const uint8_t *unused1,
+                          const uint8_t *unused2,  int width, uint32_t *unused, void *opq)
 {
     int16_t *dst = (int16_t *)_dst;
     int i, j;
@@ -503,7 +511,8 @@ static void monowhite2Y_c(uint8_t *_dst, const uint8_t *src, const uint8_t *unus
     }
 }
 
-static void monoblack2Y_c(uint8_t *_dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2,  int width, uint32_t *unused)
+static void monoblack2Y_c(uint8_t *_dst, const uint8_t *src, const uint8_t *unused1,
+                          const uint8_t *unused2,  int width, uint32_t *unused, void *opq)
 {
     int16_t *dst = (int16_t *)_dst;
     int i, j;
@@ -520,8 +529,8 @@ static void monoblack2Y_c(uint8_t *_dst, const uint8_t *src, const uint8_t *unus
     }
 }
 
-static void yuy2ToY_c(uint8_t *dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2,  int width,
-                      uint32_t *unused)
+static void yuy2ToY_c(uint8_t *dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2, int width,
+                      uint32_t *unused, void *opq)
 {
     int i;
     for (i = 0; i < width; i++)
@@ -529,7 +538,7 @@ static void yuy2ToY_c(uint8_t *dst, const uint8_t *src, const uint8_t *unused1,
 }
 
 static void yuy2ToUV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *unused0, const uint8_t *src1,
-                       const uint8_t *src2, int width, uint32_t *unused)
+                       const uint8_t *src2, int width, uint32_t *unused, void *opq)
 {
     int i;
     for (i = 0; i < width; i++) {
@@ -540,7 +549,7 @@ static void yuy2ToUV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *unused0, con
 }
 
 static void yvy2ToUV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *unused0, const uint8_t *src1,
-                       const uint8_t *src2, int width, uint32_t *unused)
+                       const uint8_t *src2, int width, uint32_t *unused, void *opq)
 {
     int i;
     for (i = 0; i < width; i++) {
@@ -551,7 +560,7 @@ static void yvy2ToUV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *unused0, con
 }
 
 static void y210le_UV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *unused0, const uint8_t *src,
-                        const uint8_t *unused1, int width, uint32_t *unused2)
+                        const uint8_t *unused1, int width, uint32_t *unused2, void *opq)
 {
     int i;
     for (i = 0; i < width; i++) {
@@ -561,7 +570,7 @@ static void y210le_UV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *unused0, co
 }
 
 static void y210le_Y_c(uint8_t *dst, const uint8_t *src, const uint8_t *unused0,
-                       const uint8_t *unused1, int width, uint32_t *unused2)
+                       const uint8_t *unused1, int width, uint32_t *unused2, void *opq)
 {
     int i;
     for (i = 0; i < width; i++)
@@ -569,7 +578,7 @@ static void y210le_Y_c(uint8_t *dst, const uint8_t *src, const uint8_t *unused0,
 }
 
 static void bswap16Y_c(uint8_t *_dst, const uint8_t *_src, const uint8_t *unused1, const uint8_t *unused2, int width,
-                       uint32_t *unused)
+                       uint32_t *unused, void *opq)
 {
     int i;
     const uint16_t *src = (const uint16_t *)_src;
@@ -579,7 +588,7 @@ static void bswap16Y_c(uint8_t *_dst, const uint8_t *_src, const uint8_t *unused
 }
 
 static void bswap16UV_c(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *unused0, const uint8_t *_src1,
-                        const uint8_t *_src2, int width, uint32_t *unused)
+                        const uint8_t *_src2, int width, uint32_t *unused, void *opq)
 {
     int i;
     const uint16_t *src1 = (const uint16_t *)_src1,
@@ -592,7 +601,7 @@ static void bswap16UV_c(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *unused0,
 }
 
 static void read_ya16le_gray_c(uint8_t *dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2, int width,
-                               uint32_t *unused)
+                               uint32_t *unused, void *opq)
 {
     int i;
     for (i = 0; i < width; i++)
@@ -600,7 +609,7 @@ static void read_ya16le_gray_c(uint8_t *dst, const uint8_t *src, const uint8_t *
 }
 
 static void read_ya16le_alpha_c(uint8_t *dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2, int width,
-                                uint32_t *unused)
+                                uint32_t *unused, void *opq)
 {
     int i;
     for (i = 0; i < width; i++)
@@ -608,7 +617,7 @@ static void read_ya16le_alpha_c(uint8_t *dst, const uint8_t *src, const uint8_t
 }
 
 static void read_ya16be_gray_c(uint8_t *dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2, int width,
-                               uint32_t *unused)
+                               uint32_t *unused, void *opq)
 {
     int i;
     for (i = 0; i < width; i++)
@@ -616,7 +625,7 @@ static void read_ya16be_gray_c(uint8_t *dst, const uint8_t *src, const uint8_t *
 }
 
 static void read_ya16be_alpha_c(uint8_t *dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2, int width,
-                                uint32_t *unused)
+                                uint32_t *unused, void *opq)
 {
     int i;
     for (i = 0; i < width; i++)
@@ -624,7 +633,7 @@ static void read_ya16be_alpha_c(uint8_t *dst, const uint8_t *src, const uint8_t
 }
 
 static void read_ayuv64le_Y_c(uint8_t *dst, const uint8_t *src, const uint8_t *unused0, const uint8_t *unused1, int width,
-                               uint32_t *unused2)
+                               uint32_t *unused2, void *opq)
 {
     int i;
     for (i = 0; i < width; i++)
@@ -633,7 +642,7 @@ static void read_ayuv64le_Y_c(uint8_t *dst, const uint8_t *src, const uint8_t *u
 
 
 static void read_ayuv64le_UV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *unused0, const uint8_t *src,
-                               const uint8_t *unused1, int width, uint32_t *unused2)
+                               const uint8_t *unused1, int width, uint32_t *unused2, void *opq)
 {
     int i;
     for (i = 0; i < width; i++) {
@@ -643,7 +652,7 @@ static void read_ayuv64le_UV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *unus
 }
 
 static void read_ayuv64le_A_c(uint8_t *dst, const uint8_t *src, const uint8_t *unused0, const uint8_t *unused1, int width,
-                                uint32_t *unused2)
+                              uint32_t *unused2, void *opq)
 {
     int i;
     for (i = 0; i < width; i++)
@@ -651,7 +660,7 @@ static void read_ayuv64le_A_c(uint8_t *dst, const uint8_t *src, const uint8_t *u
 }
 
 static void read_vuya_UV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *unused0, const uint8_t *src,
-                           const uint8_t *unused1, int width, uint32_t *unused2)
+                           const uint8_t *unused1, int width, uint32_t *unused2, void *opq)
 {
     int i;
     for (i = 0; i < width; i++) {
@@ -661,7 +670,7 @@ static void read_vuya_UV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *unused0,
 }
 
 static void read_vuya_Y_c(uint8_t *dst, const uint8_t *src, const uint8_t *unused0, const uint8_t *unused1, int width,
-                          uint32_t *unused2)
+                          uint32_t *unused2, void *opq)
 {
     int i;
     for (i = 0; i < width; i++)
@@ -669,7 +678,7 @@ static void read_vuya_Y_c(uint8_t *dst, const uint8_t *src, const uint8_t *unuse
 }
 
 static void read_vuya_A_c(uint8_t *dst, const uint8_t *src, const uint8_t *unused0, const uint8_t *unused1, int width,
-                          uint32_t *unused2)
+                          uint32_t *unused2, void *opq)
 {
     int i;
     for (i = 0; i < width; i++)
@@ -679,7 +688,7 @@ static void read_vuya_A_c(uint8_t *dst, const uint8_t *src, const uint8_t *unuse
 /* This is almost identical to the previous, end exists only because
  * yuy2ToY/UV)(dst, src + 1, ...) would have 100% unaligned accesses. */
 static void uyvyToY_c(uint8_t *dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2,  int width,
-                      uint32_t *unused)
+                      uint32_t *unused, void *opq)
 {
     int i;
     for (i = 0; i < width; i++)
@@ -687,7 +696,7 @@ static void uyvyToY_c(uint8_t *dst, const uint8_t *src, const uint8_t *unused1,
 }
 
 static void uyvyToUV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *unused0, const uint8_t *src1,
-                       const uint8_t *src2, int width, uint32_t *unused)
+                       const uint8_t *src2, int width, uint32_t *unused, void *opq)
 {
     int i;
     for (i = 0; i < width; i++) {
@@ -709,20 +718,20 @@ static av_always_inline void nvXXtoUV_c(uint8_t *dst1, uint8_t *dst2,
 
 static void nv12ToUV_c(uint8_t *dstU, uint8_t *dstV,
                        const uint8_t *unused0, const uint8_t *src1, const uint8_t *src2,
-                       int width, uint32_t *unused)
+                       int width, uint32_t *unused, void *opq)
 {
     nvXXtoUV_c(dstU, dstV, src1, width);
 }
 
 static void nv21ToUV_c(uint8_t *dstU, uint8_t *dstV,
                        const uint8_t *unused0, const uint8_t *src1, const uint8_t *src2,
-                       int width, uint32_t *unused)
+                       int width, uint32_t *unused, void *opq)
 {
     nvXXtoUV_c(dstV, dstU, src1, width);
 }
 
 static void p010LEToY_c(uint8_t *dst, const uint8_t *src, const uint8_t *unused1,
-                        const uint8_t *unused2, int width, uint32_t *unused)
+                        const uint8_t *unused2, int width, uint32_t *unused, void *opq)
 {
     int i;
     for (i = 0; i < width; i++) {
@@ -731,7 +740,7 @@ static void p010LEToY_c(uint8_t *dst, const uint8_t *src, const uint8_t *unused1
 }
 
 static void p010BEToY_c(uint8_t *dst, const uint8_t *src, const uint8_t *unused1,
-                        const uint8_t *unused2, int width, uint32_t *unused)
+                        const uint8_t *unused2, int width, uint32_t *unused, void *opq)
 {
     int i;
     for (i = 0; i < width; i++) {
@@ -741,7 +750,7 @@ static void p010BEToY_c(uint8_t *dst, const uint8_t *src, const uint8_t *unused1
 
 static void p010LEToUV_c(uint8_t *dstU, uint8_t *dstV,
                        const uint8_t *unused0, const uint8_t *src1, const uint8_t *src2,
-                       int width, uint32_t *unused)
+                       int width, uint32_t *unused, void *opq)
 {
     int i;
     for (i = 0; i < width; i++) {
@@ -751,8 +760,8 @@ static void p010LEToUV_c(uint8_t *dstU, uint8_t *dstV,
 }
 
 static void p010BEToUV_c(uint8_t *dstU, uint8_t *dstV,
-                       const uint8_t *unused0, const uint8_t *src1, const uint8_t *src2,
-                       int width, uint32_t *unused)
+                         const uint8_t *unused0, const uint8_t *src1, const uint8_t *src2,
+                         int width, uint32_t *unused, void *opq)
 {
     int i;
     for (i = 0; i < width; i++) {
@@ -762,8 +771,8 @@ static void p010BEToUV_c(uint8_t *dstU, uint8_t *dstV,
 }
 
 static void p016LEToUV_c(uint8_t *dstU, uint8_t *dstV,
-                       const uint8_t *unused0, const uint8_t *src1, const uint8_t *src2,
-                       int width, uint32_t *unused)
+                         const uint8_t *unused0, const uint8_t *src1, const uint8_t *src2,
+                         int width, uint32_t *unused, void *opq)
 {
     int i;
     for (i = 0; i < width; i++) {
@@ -773,8 +782,8 @@ static void p016LEToUV_c(uint8_t *dstU, uint8_t *dstV,
 }
 
 static void p016BEToUV_c(uint8_t *dstU, uint8_t *dstV,
-                       const uint8_t *unused0, const uint8_t *src1, const uint8_t *src2,
-                       int width, uint32_t *unused)
+                         const uint8_t *unused0, const uint8_t *src1, const uint8_t *src2,
+                         int width, uint32_t *unused, void *opq)
 {
     int i;
     for (i = 0; i < width; i++) {
@@ -786,7 +795,7 @@ static void p016BEToUV_c(uint8_t *dstU, uint8_t *dstV,
 #define input_pixel(pos) (isBE(origin) ? AV_RB16(pos) : AV_RL16(pos))
 
 static void bgr24ToY_c(uint8_t *_dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2,
-                       int width, uint32_t *rgb2yuv)
+                       int width, uint32_t *rgb2yuv, void *opq)
 {
     int16_t *dst = (int16_t *)_dst;
     int32_t ry = rgb2yuv[RY_IDX], gy = rgb2yuv[GY_IDX], by = rgb2yuv[BY_IDX];
@@ -801,7 +810,7 @@ static void bgr24ToY_c(uint8_t *_dst, const uint8_t *src, const uint8_t *unused1
 }
 
 static void bgr24ToUV_c(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *unused0, const uint8_t *src1,
-                        const uint8_t *src2, int width, uint32_t *rgb2yuv)
+                        const uint8_t *src2, int width, uint32_t *rgb2yuv, void *opq)
 {
     int16_t *dstU = (int16_t *)_dstU;
     int16_t *dstV = (int16_t *)_dstV;
@@ -820,7 +829,7 @@ static void bgr24ToUV_c(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *unused0,
 }
 
 static void bgr24ToUV_half_c(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *unused0, const uint8_t *src1,
-                             const uint8_t *src2, int width, uint32_t *rgb2yuv)
+                             const uint8_t *src2, int width, uint32_t *rgb2yuv, void *opq)
 {
     int16_t *dstU = (int16_t *)_dstU;
     int16_t *dstV = (int16_t *)_dstV;
@@ -839,7 +848,7 @@ static void bgr24ToUV_half_c(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *unus
 }
 
 static void rgb24ToY_c(uint8_t *_dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2, int width,
-                       uint32_t *rgb2yuv)
+                       uint32_t *rgb2yuv, void *opq)
 {
     int16_t *dst = (int16_t *)_dst;
     int32_t ry = rgb2yuv[RY_IDX], gy = rgb2yuv[GY_IDX], by = rgb2yuv[BY_IDX];
@@ -854,7 +863,7 @@ static void rgb24ToY_c(uint8_t *_dst, const uint8_t *src, const uint8_t *unused1
 }
 
 static void rgb24ToUV_c(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *unused0, const uint8_t *src1,
-                        const uint8_t *src2, int width, uint32_t *rgb2yuv)
+                        const uint8_t *src2, int width, uint32_t *rgb2yuv, void *opq)
 {
     int16_t *dstU = (int16_t *)_dstU;
     int16_t *dstV = (int16_t *)_dstV;
@@ -873,7 +882,7 @@ static void rgb24ToUV_c(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *unused0,
 }
 
 static void rgb24ToUV_half_c(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *unused0, const uint8_t *src1,
-                             const uint8_t *src2, int width, uint32_t *rgb2yuv)
+                             const uint8_t *src2, int width, uint32_t *rgb2yuv, void *opq)
 {
     int16_t *dstU = (int16_t *)_dstU;
     int16_t *dstV = (int16_t *)_dstV;
@@ -891,7 +900,7 @@ static void rgb24ToUV_half_c(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *unus
     }
 }
 
-static void planar_rgb_to_y(uint8_t *_dst, const uint8_t *src[4], int width, int32_t *rgb2yuv)
+static void planar_rgb_to_y(uint8_t *_dst, const uint8_t *src[4], int width, int32_t *rgb2yuv, void *opq)
 {
     uint16_t *dst = (uint16_t *)_dst;
     int32_t ry = rgb2yuv[RY_IDX], gy = rgb2yuv[GY_IDX], by = rgb2yuv[BY_IDX];
@@ -905,7 +914,7 @@ static void planar_rgb_to_y(uint8_t *_dst, const uint8_t *src[4], int width, int
     }
 }
 
-static void planar_rgb_to_a(uint8_t *_dst, const uint8_t *src[4], int width, int32_t *unused)
+static void planar_rgb_to_a(uint8_t *_dst, const uint8_t *src[4], int width, int32_t *unused, void *opq)
 {
     uint16_t *dst = (uint16_t *)_dst;
     int i;
@@ -913,7 +922,7 @@ static void planar_rgb_to_a(uint8_t *_dst, const uint8_t *src[4], int width, int
         dst[i] = src[3][i] << 6;
 }
 
-static void planar_rgb_to_uv(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *src[4], int width, int32_t *rgb2yuv)
+static void planar_rgb_to_uv(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *src[4], int width, int32_t *rgb2yuv, void *opq)
 {
     uint16_t *dstU = (uint16_t *)_dstU;
     uint16_t *dstV = (uint16_t *)_dstV;
@@ -1049,24 +1058,27 @@ static av_always_inline void grayf32ToY16_c(uint8_t *_dst, const uint8_t *_src,
 
 #define rgb9plus_planar_funcs_endian(nbits, endian_name, endian)                                    \
 static void planar_rgb##nbits##endian_name##_to_y(uint8_t *dst, const uint8_t *src[4],              \
-                                                  int w, int32_t *rgb2yuv)                          \
+                                                  int w, int32_t *rgb2yuv, void *opq)               \
 {                                                                                                   \
     planar_rgb16_to_y(dst, src, w, nbits, endian, rgb2yuv);                                         \
 }                                                                                                   \
 static void planar_rgb##nbits##endian_name##_to_uv(uint8_t *dstU, uint8_t *dstV,                    \
-                                                   const uint8_t *src[4], int w, int32_t *rgb2yuv)  \
+                                                   const uint8_t *src[4], int w, int32_t *rgb2yuv,  \
+                                                   void *opq)                                       \
 {                                                                                                   \
     planar_rgb16_to_uv(dstU, dstV, src, w, nbits, endian, rgb2yuv);                                 \
 }                                                                                                   \
 
 #define rgb9plus_planar_transparency_funcs(nbits)                           \
 static void planar_rgb##nbits##le_to_a(uint8_t *dst, const uint8_t *src[4], \
-                                       int w, int32_t *rgb2yuv)             \
+                                       int w, int32_t *rgb2yuv,             \
+                                       void *opq)                           \
 {                                                                           \
     planar_rgb16_to_a(dst, src, w, nbits, 0, rgb2yuv);                      \
 }                                                                           \
 static void planar_rgb##nbits##be_to_a(uint8_t *dst, const uint8_t *src[4], \
-                                       int w, int32_t *rgb2yuv)             \
+                                       int w, int32_t *rgb2yuv,             \
+                                       void *opq)                           \
 {                                                                           \
     planar_rgb16_to_a(dst, src, w, nbits, 1, rgb2yuv);                      \
 }
@@ -1087,23 +1099,24 @@ rgb9plus_planar_transparency_funcs(16)
 
 #define rgbf32_planar_funcs_endian(endian_name, endian)                                             \
 static void planar_rgbf32##endian_name##_to_y(uint8_t *dst, const uint8_t *src[4],                  \
-                                                  int w, int32_t *rgb2yuv)                          \
+                                                  int w, int32_t *rgb2yuv, void *opq)               \
 {                                                                                                   \
     planar_rgbf32_to_y(dst, src, w, endian, rgb2yuv);                                               \
 }                                                                                                   \
 static void planar_rgbf32##endian_name##_to_uv(uint8_t *dstU, uint8_t *dstV,                        \
-                                                   const uint8_t *src[4], int w, int32_t *rgb2yuv)  \
+                                               const uint8_t *src[4], int w, int32_t *rgb2yuv,      \
+                                               void *opq)                                           \
 {                                                                                                   \
     planar_rgbf32_to_uv(dstU, dstV, src, w, endian, rgb2yuv);                                       \
 }                                                                                                   \
 static void planar_rgbf32##endian_name##_to_a(uint8_t *dst, const uint8_t *src[4],                  \
-                                              int w, int32_t *rgb2yuv)                              \
+                                              int w, int32_t *rgb2yuv, void *opq)                   \
 {                                                                                                   \
     planar_rgbf32_to_a(dst, src, w, endian, rgb2yuv);                                               \
 }                                                                                                   \
 static void grayf32##endian_name##ToY16_c(uint8_t *dst, const uint8_t *src,                         \
                                           const uint8_t *unused1, const uint8_t *unused2,           \
-                                          int width, uint32_t *unused)                              \
+                                          int width, uint32_t *unused, void *opq)                   \
 {                                                                                                   \
     grayf32ToY16_c(dst, src, unused1, unused2, width, endian, unused);                              \
 }
diff --git a/libswscale/swscale_internal.h b/libswscale/swscale_internal.h
index e118b54457..9ab542933f 100644
--- a/libswscale/swscale_internal.h
+++ b/libswscale/swscale_internal.h
@@ -559,26 +559,31 @@ typedef struct SwsContext {
     yuv2packedX_fn yuv2packedX;
     yuv2anyX_fn yuv2anyX;
 
+    /// Opaque data pointer passed to all input functions.
+    void *input_opaque;
+
     /// Unscaled conversion of luma plane to YV12 for horizontal scaler.
     void (*lumToYV12)(uint8_t *dst, const uint8_t *src, const uint8_t *src2, const uint8_t *src3,
-                      int width, uint32_t *pal);
+                      int width, uint32_t *pal, void *opq);
     /// Unscaled conversion of alpha plane to YV12 for horizontal scaler.
     void (*alpToYV12)(uint8_t *dst, const uint8_t *src, const uint8_t *src2, const uint8_t *src3,
-                      int width, uint32_t *pal);
+                      int width, uint32_t *pal, void *opq);
     /// Unscaled conversion of chroma planes to YV12 for horizontal scaler.
     void (*chrToYV12)(uint8_t *dstU, uint8_t *dstV,
                       const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,
-                      int width, uint32_t *pal);
+                      int width, uint32_t *pal, void *opq);
 
     /**
      * Functions to read planar input, such as planar RGB, and convert
      * internally to Y/UV/A.
      */
     /** @{ */
-    void (*readLumPlanar)(uint8_t *dst, const uint8_t *src[4], int width, int32_t *rgb2yuv);
+    void (*readLumPlanar)(uint8_t *dst, const uint8_t *src[4], int width, int32_t *rgb2yuv,
+                          void *opq);
     void (*readChrPlanar)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src[4],
-                          int width, int32_t *rgb2yuv);
-    void (*readAlpPlanar)(uint8_t *dst, const uint8_t *src[4], int width, int32_t *rgb2yuv);
+                          int width, int32_t *rgb2yuv, void *opq);
+    void (*readAlpPlanar)(uint8_t *dst, const uint8_t *src[4], int width, int32_t *rgb2yuv,
+                          void *opq);
     /** @} */
 
     /**
diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c
index 628f12137c..270798ba3d 100644
--- a/libswscale/x86/swscale.c
+++ b/libswscale/x86/swscale.c
@@ -299,13 +299,13 @@ VSCALE_FUNCS(avx, avx);
 #define INPUT_Y_FUNC(fmt, opt) \
 void ff_ ## fmt ## ToY_  ## opt(uint8_t *dst, const uint8_t *src, \
                                 const uint8_t *unused1, const uint8_t *unused2, \
-                                int w, uint32_t *unused)
+                                int w, uint32_t *unused, void *opq)
 #define INPUT_UV_FUNC(fmt, opt) \
 void ff_ ## fmt ## ToUV_ ## opt(uint8_t *dstU, uint8_t *dstV, \
                                 const uint8_t *unused0, \
                                 const uint8_t *src1, \
                                 const uint8_t *src2, \
-                                int w, uint32_t *unused)
+                                int w, uint32_t *unused, void *opq)
 #define INPUT_FUNC(fmt, opt) \
     INPUT_Y_FUNC(fmt, opt); \
     INPUT_UV_FUNC(fmt, opt)
@@ -373,15 +373,18 @@ YUV2GBRP_DECL(avx2);
 
 #define INPUT_PLANAR_RGB_Y_FN_DECL(fmt, opt)                               \
 void ff_planar_##fmt##_to_y_##opt(uint8_t *dst,                            \
-                           const uint8_t *src[4], int w, int32_t *rgb2yuv)
+                           const uint8_t *src[4], int w, int32_t *rgb2yuv, \
+                           void *opq)
 
 #define INPUT_PLANAR_RGB_UV_FN_DECL(fmt, opt)                              \
 void ff_planar_##fmt##_to_uv_##opt(uint8_t *dstU, uint8_t *dstV,           \
-                           const uint8_t *src[4], int w, int32_t *rgb2yuv)
+                           const uint8_t *src[4], int w, int32_t *rgb2yuv, \
+                           void *opq)
 
 #define INPUT_PLANAR_RGB_A_FN_DECL(fmt, opt)                               \
 void ff_planar_##fmt##_to_a_##opt(uint8_t *dst,                            \
-                           const uint8_t *src[4], int w, int32_t *rgb2yuv)
+                           const uint8_t *src[4], int w, int32_t *rgb2yuv, \
+                           void *opq)
 
 
 #define INPUT_PLANAR_RGBXX_A_DECL(fmt, opt) \
-- 
2.34.1

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 39+ messages in thread

* [FFmpeg-devel] [PATCH 11/11] swscale/input: add rgbaf16 input support
  2022-08-10 20:47 [FFmpeg-devel] [PATCH 01/11] lavu/pixfmt: add packed RGBA float16 format Timo Rothenpieler
                   ` (8 preceding siblings ...)
  2022-08-10 20:47 ` [FFmpeg-devel] [PATCH 10/11] swscale: add SwsContext parameter to input functions Timo Rothenpieler
@ 2022-08-10 20:47 ` Timo Rothenpieler
  2022-08-10 21:37   ` Timo Rothenpieler
  9 siblings, 1 reply; 39+ messages in thread
From: Timo Rothenpieler @ 2022-08-10 20:47 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: Timo Rothenpieler

This is by no means perfect, since at least ddagrab will return scRGB
data with values outside of 0.0f to 1.0f for HDR values.
Its primary purpose is to be able to work with the format at all.
---
 libavutil/Makefile            |   1 +
 libswscale/half2float.c       |  19 +++++
 libswscale/input.c            | 130 ++++++++++++++++++++++++++++++++++
 libswscale/slice.c            |   9 ++-
 libswscale/swscale_internal.h |  10 +++
 libswscale/utils.c            |   2 +
 libswscale/version.h          |   2 +-
 7 files changed, 171 insertions(+), 2 deletions(-)
 create mode 100644 libswscale/half2float.c

diff --git a/libavutil/Makefile b/libavutil/Makefile
index 3d9c07aea8..1aac1a4cc0 100644
--- a/libavutil/Makefile
+++ b/libavutil/Makefile
@@ -131,6 +131,7 @@ OBJS = adler32.o                                                        \
        float_dsp.o                                                      \
        fixed_dsp.o                                                      \
        frame.o                                                          \
+       half2float.o                                                     \
        hash.o                                                           \
        hdr_dynamic_metadata.o                                           \
        hdr_dynamic_vivid_metadata.o                                     \
diff --git a/libswscale/half2float.c b/libswscale/half2float.c
new file mode 100644
index 0000000000..1b023f96a5
--- /dev/null
+++ b/libswscale/half2float.c
@@ -0,0 +1,19 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/half2float.c"
diff --git a/libswscale/input.c b/libswscale/input.c
index 36ef1e43ac..818b57d2c3 100644
--- a/libswscale/input.c
+++ b/libswscale/input.c
@@ -1124,6 +1124,112 @@ static void grayf32##endian_name##ToY16_c(uint8_t *dst, const uint8_t *src,
 rgbf32_planar_funcs_endian(le, 0)
 rgbf32_planar_funcs_endian(be, 1)
 
+#define rdpx(src) av_int2float(half2float(is_be ? AV_RB16(&src) : AV_RL16(&src), h2f_tbl))
+
+static av_always_inline void rgbaf16ToUV_half_endian(uint16_t *dstU, uint16_t *dstV, int is_be,
+                                                     const uint16_t *src, int width,
+                                                     int32_t *rgb2yuv, half2float_tables *h2f_tbl)
+{
+    int32_t ru = rgb2yuv[RU_IDX], gu = rgb2yuv[GU_IDX], bu = rgb2yuv[BU_IDX];
+    int32_t rv = rgb2yuv[RV_IDX], gv = rgb2yuv[GV_IDX], bv = rgb2yuv[BV_IDX];
+    int i;
+    for (i = 0; i < width; i++) {
+        int r = (lrintf(av_clipf(65535.0f * rdpx(src[i*8+0]), 0.0f, 65535.0f)) +
+                 lrintf(av_clipf(65535.0f * rdpx(src[i*8+4]), 0.0f, 65535.0f))) >> 1;
+        int g = (lrintf(av_clipf(65535.0f * rdpx(src[i*8+1]), 0.0f, 65535.0f)) +
+                 lrintf(av_clipf(65535.0f * rdpx(src[i*8+5]), 0.0f, 65535.0f))) >> 1;
+        int b = (lrintf(av_clipf(65535.0f * rdpx(src[i*8+2]), 0.0f, 65535.0f)) +
+                 lrintf(av_clipf(65535.0f * rdpx(src[i*8+6]), 0.0f, 65535.0f))) >> 1;
+
+        dstU[i] = (ru*r + gu*g + bu*b + (0x10001<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
+        dstV[i] = (rv*r + gv*g + bv*b + (0x10001<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
+    }
+}
+
+static av_always_inline void rgbaf16ToUV_endian(uint16_t *dstU, uint16_t *dstV, int is_be,
+                                                const uint16_t *src, int width,
+                                                int32_t *rgb2yuv, half2float_tables *h2f_tbl)
+{
+    int32_t ru = rgb2yuv[RU_IDX], gu = rgb2yuv[GU_IDX], bu = rgb2yuv[BU_IDX];
+    int32_t rv = rgb2yuv[RV_IDX], gv = rgb2yuv[GV_IDX], bv = rgb2yuv[BV_IDX];
+    int i;
+    for (i = 0; i < width; i++) {
+        int r = lrintf(av_clipf(65535.0f * rdpx(src[i*4+0]), 0.0f, 65535.0f));
+        int g = lrintf(av_clipf(65535.0f * rdpx(src[i*4+1]), 0.0f, 65535.0f));
+        int b = lrintf(av_clipf(65535.0f * rdpx(src[i*4+2]), 0.0f, 65535.0f));
+
+        dstU[i] = (ru*r + gu*g + bu*b + (0x10001<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
+        dstV[i] = (rv*r + gv*g + bv*b + (0x10001<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
+    }
+}
+
+static av_always_inline void rgbaf16ToY_endian(uint16_t *dst, const uint16_t *src, int is_be,
+                                               int width, int32_t *rgb2yuv, half2float_tables *h2f_tbl)
+{
+    int32_t ry = rgb2yuv[RY_IDX], gy = rgb2yuv[GY_IDX], by = rgb2yuv[BY_IDX];
+    int i;
+    for (i = 0; i < width; i++) {
+        int r = lrintf(av_clipf(65535.0f * rdpx(src[i*4+0]), 0.0f, 65535.0f));
+        int g = lrintf(av_clipf(65535.0f * rdpx(src[i*4+1]), 0.0f, 65535.0f));
+        int b = lrintf(av_clipf(65535.0f * rdpx(src[i*4+2]), 0.0f, 65535.0f));
+
+        dst[i] = (ry*r + gy*g + by*b + (0x2001<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
+    }
+}
+
+static av_always_inline void rgbaf16ToA_endian(uint16_t *dst, const uint16_t *src, int is_be,
+                                               int width, half2float_tables *h2f_tbl)
+{
+    int i;
+    for (i=0; i<width; i++) {
+        dst[i] = lrintf(av_clipf(65535.0f * rdpx(src[i*4+3]), 0.0f, 65535.0f));
+    }
+}
+
+#undef rdpx
+
+#define rgbaf16_funcs_endian(endian_name, endian)                                                         \
+static void rgbaf16##endian_name##ToUV_half_c(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *unused,      \
+                                              const uint8_t *src1, const uint8_t *src2,                   \
+                                              int width, uint32_t *_rgb2yuv, void *opq)                   \
+{                                                                                                         \
+    const uint16_t *src = (const uint16_t*)src1;                                                          \
+    uint16_t *dstU = (uint16_t*)_dstU;                                                                    \
+    uint16_t *dstV = (uint16_t*)_dstV;                                                                    \
+    int32_t *rgb2yuv = (int32_t*)_rgb2yuv;                                                                \
+    av_assert1(src1==src2);                                                                               \
+    rgbaf16ToUV_half_endian(dstU, dstV, endian, src, width, rgb2yuv, opq);                                \
+}                                                                                                         \
+static void rgbaf16##endian_name##ToUV_c(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *unused,           \
+                                         const uint8_t *src1, const uint8_t *src2,                        \
+                                         int width, uint32_t *_rgb2yuv, void *opq)                        \
+{                                                                                                         \
+    const uint16_t *src = (const uint16_t*)src1;                                                          \
+    uint16_t *dstU = (uint16_t*)_dstU;                                                                    \
+    uint16_t *dstV = (uint16_t*)_dstV;                                                                    \
+    int32_t *rgb2yuv = (int32_t*)_rgb2yuv;                                                                \
+    av_assert1(src1==src2);                                                                               \
+    rgbaf16ToUV_endian(dstU, dstV, endian, src, width, rgb2yuv, opq); /* 1 src pixel per sample */        \
+}                                                                                                         \
+static void rgbaf16##endian_name##ToY_c(uint8_t *_dst, const uint8_t *_src, const uint8_t *unused0,       \
+                                        const uint8_t *unused1, int width, uint32_t *_rgb2yuv, void *opq) \
+{                                                                                                         \
+    const uint16_t *src = (const uint16_t*)_src;                                                          \
+    uint16_t *dst = (uint16_t*)_dst;                                                                      \
+    int32_t *rgb2yuv = (int32_t*)_rgb2yuv;                                                                \
+    rgbaf16ToY_endian(dst, src, endian, width, rgb2yuv, opq);                                             \
+}                                                                                                         \
+static void rgbaf16##endian_name##ToA_c(uint8_t *_dst, const uint8_t *_src, const uint8_t *unused0,       \
+                                        const uint8_t *unused1, int width, uint32_t *unused2, void *opq)  \
+{                                                                                                         \
+    const uint16_t *src = (const uint16_t*)_src;                                                          \
+    uint16_t *dst = (uint16_t*)_dst;                                                                      \
+    rgbaf16ToA_endian(dst, src, endian, width, opq);                                                      \
+}
+
+rgbaf16_funcs_endian(le, 0)
+rgbaf16_funcs_endian(be, 1)
+
 av_cold void ff_sws_init_input_funcs(SwsContext *c)
 {
     enum AVPixelFormat srcFormat = c->srcFormat;
@@ -1388,6 +1494,12 @@ av_cold void ff_sws_init_input_funcs(SwsContext *c)
         case AV_PIX_FMT_X2BGR10LE:
             c->chrToYV12 = bgr30leToUV_half_c;
             break;
+        case AV_PIX_FMT_RGBAF16BE:
+            c->chrToYV12 = rgbaf16beToUV_half_c;
+            break;
+        case AV_PIX_FMT_RGBAF16LE:
+            c->chrToYV12 = rgbaf16leToUV_half_c;
+            break;
         }
     } else {
         switch (srcFormat) {
@@ -1475,6 +1587,12 @@ av_cold void ff_sws_init_input_funcs(SwsContext *c)
         case AV_PIX_FMT_X2BGR10LE:
             c->chrToYV12 = bgr30leToUV_c;
             break;
+        case AV_PIX_FMT_RGBAF16BE:
+            c->chrToYV12 = rgbaf16beToUV_c;
+            break;
+        case AV_PIX_FMT_RGBAF16LE:
+            c->chrToYV12 = rgbaf16leToUV_c;
+            break;
         }
     }
 
@@ -1763,6 +1881,12 @@ av_cold void ff_sws_init_input_funcs(SwsContext *c)
     case AV_PIX_FMT_X2BGR10LE:
         c->lumToYV12 = bgr30leToY_c;
         break;
+    case AV_PIX_FMT_RGBAF16BE:
+        c->lumToYV12 = rgbaf16beToY_c;
+        break;
+    case AV_PIX_FMT_RGBAF16LE:
+        c->lumToYV12 = rgbaf16leToY_c;
+        break;
     }
     if (c->needAlpha) {
         if (is16BPS(srcFormat) || isNBPS(srcFormat)) {
@@ -1782,6 +1906,12 @@ av_cold void ff_sws_init_input_funcs(SwsContext *c)
         case AV_PIX_FMT_ARGB:
             c->alpToYV12 = abgrToA_c;
             break;
+        case AV_PIX_FMT_RGBAF16BE:
+            c->alpToYV12 = rgbaf16beToA_c;
+            break;
+        case AV_PIX_FMT_RGBAF16LE:
+            c->alpToYV12 = rgbaf16leToA_c;
+            break;
         case AV_PIX_FMT_YA8:
             c->alpToYV12 = uyvyToY_c;
             break;
diff --git a/libswscale/slice.c b/libswscale/slice.c
index b3ee06d632..db1c696727 100644
--- a/libswscale/slice.c
+++ b/libswscale/slice.c
@@ -282,7 +282,13 @@ int ff_init_filters(SwsContext * c)
     c->descIndex[0] = num_ydesc + (need_gamma ? 1 : 0);
     c->descIndex[1] = num_ydesc + num_cdesc + (need_gamma ? 1 : 0);
 
-
+    if (isFloat16(c->srcFormat)) {
+        c->h2f_tables = av_malloc(sizeof(*c->h2f_tables));
+        if (!c->h2f_tables)
+            return AVERROR(ENOMEM);
+        ff_init_half2float_tables(c->h2f_tables);
+        c->input_opaque = c->h2f_tables;
+    }
 
     c->desc  = av_calloc(c->numDesc,  sizeof(*c->desc));
     if (!c->desc)
@@ -393,5 +399,6 @@ int ff_free_filters(SwsContext *c)
             free_slice(&c->slice[i]);
         av_freep(&c->slice);
     }
+    av_freep(&c->h2f_tables);
     return 0;
 }
diff --git a/libswscale/swscale_internal.h b/libswscale/swscale_internal.h
index 9ab542933f..7d9f785298 100644
--- a/libswscale/swscale_internal.h
+++ b/libswscale/swscale_internal.h
@@ -35,6 +35,7 @@
 #include "libavutil/pixdesc.h"
 #include "libavutil/slicethread.h"
 #include "libavutil/ppc/util_altivec.h"
+#include "libavutil/half2float.h"
 
 #define STR(s) AV_TOSTRING(s) // AV_STRINGIFY is too long
 
@@ -679,6 +680,8 @@ typedef struct SwsContext {
     unsigned int dst_slice_align;
     atomic_int   stride_unaligned_warned;
     atomic_int   data_unaligned_warned;
+
+    half2float_tables *h2f_tables;
 } SwsContext;
 //FIXME check init (where 0)
 
@@ -840,6 +843,13 @@ static av_always_inline int isFloat(enum AVPixelFormat pix_fmt)
     return desc->flags & AV_PIX_FMT_FLAG_FLOAT;
 }
 
+static av_always_inline int isFloat16(enum AVPixelFormat pix_fmt)
+{
+    const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(pix_fmt);
+    av_assert0(desc);
+    return (desc->flags & AV_PIX_FMT_FLAG_FLOAT) && desc->comp[0].depth == 16;
+}
+
 static av_always_inline int isALPHA(enum AVPixelFormat pix_fmt)
 {
     const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(pix_fmt);
diff --git a/libswscale/utils.c b/libswscale/utils.c
index 34503e57f4..81646c0d73 100644
--- a/libswscale/utils.c
+++ b/libswscale/utils.c
@@ -259,6 +259,8 @@ static const FormatEntry format_entries[] = {
     [AV_PIX_FMT_P416LE]      = { 1, 1 },
     [AV_PIX_FMT_NV16]        = { 1, 1 },
     [AV_PIX_FMT_VUYA]        = { 1, 1 },
+    [AV_PIX_FMT_RGBAF16BE]   = { 1, 0 },
+    [AV_PIX_FMT_RGBAF16LE]   = { 1, 0 },
 };
 
 int ff_shuffle_filter_coefficients(SwsContext *c, int *filterPos,
diff --git a/libswscale/version.h b/libswscale/version.h
index 3193562d18..d8694bb5c0 100644
--- a/libswscale/version.h
+++ b/libswscale/version.h
@@ -29,7 +29,7 @@
 #include "version_major.h"
 
 #define LIBSWSCALE_VERSION_MINOR   8
-#define LIBSWSCALE_VERSION_MICRO 102
+#define LIBSWSCALE_VERSION_MICRO 103
 
 #define LIBSWSCALE_VERSION_INT  AV_VERSION_INT(LIBSWSCALE_VERSION_MAJOR, \
                                                LIBSWSCALE_VERSION_MINOR, \
-- 
2.34.1

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [FFmpeg-devel] [PATCH 10/11] swscale: add SwsContext parameter to input functions
  2022-08-10 20:47 ` [FFmpeg-devel] [PATCH 10/11] swscale: add SwsContext parameter to input functions Timo Rothenpieler
@ 2022-08-10 20:52   ` Timo Rothenpieler
  2022-08-10 21:55   ` Andreas Rheinhardt
  1 sibling, 0 replies; 39+ messages in thread
From: Timo Rothenpieler @ 2022-08-10 20:52 UTC (permalink / raw)
  To: ffmpeg-devel

Forgot to update the commit message.
It no longer adds the SwsContext, but an opaque pointer which is easier 
to deal with from assembly, should any future code have a use for it.

Fixed locally
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [FFmpeg-devel] [PATCH 09/11] avutil/half2float: use native _Float16 if available
  2022-08-10 20:47 ` [FFmpeg-devel] [PATCH 09/11] avutil/half2float: use native _Float16 if available Timo Rothenpieler
@ 2022-08-10 21:03   ` Andreas Rheinhardt
  2022-08-10 21:58     ` Timo Rothenpieler
  2022-08-10 22:51   ` [FFmpeg-devel] [PATCH v2 " Timo Rothenpieler
  1 sibling, 1 reply; 39+ messages in thread
From: Andreas Rheinhardt @ 2022-08-10 21:03 UTC (permalink / raw)
  To: ffmpeg-devel

Timo Rothenpieler:
> _Float16 support was available on arm/aarch64 for a while, and with gcc
> 12 was enabled on x86 as long as SSE2 is supported.
> 
> If the target arch supports f16c, gcc emits fairly efficient assembly,
> taking advantage of it. This is the case on x86-64-v3 or higher.
> Without f16c, it emulates it in software using sse2 instructions.

How is the performance of this emulation compared to our current code?
And how is the native _Float16 performance compared to the current code?

> ---
>  configure              |  4 ++++
>  libavutil/float2half.c |  2 ++
>  libavutil/float2half.h | 16 ++++++++++++++++
>  libavutil/half2float.c |  4 ++++
>  libavutil/half2float.h | 16 ++++++++++++++++
>  5 files changed, 42 insertions(+)
> 
> diff --git a/configure b/configure
> index 6761d0cb32..2536ae012d 100755
> --- a/configure
> +++ b/configure
> @@ -2143,6 +2143,7 @@ ARCH_FEATURES="
>      fast_64bit
>      fast_clz
>      fast_cmov
> +    float16
>      local_aligned
>      simd_align_16
>      simd_align_32
> @@ -5125,6 +5126,8 @@ elif enabled arm; then
>              ;;
>      esac
>  
> +    test_cflags -mfp16-format=ieee && add_cflags -mfp16-format=ieee
> +
>  elif enabled avr32; then
>  
>      case $cpu in
> @@ -6228,6 +6231,7 @@ check_builtin MemoryBarrier windows.h "MemoryBarrier()"
>  check_builtin sync_val_compare_and_swap "" "int *ptr; int oldval, newval; __sync_val_compare_and_swap(ptr, oldval, newval)"
>  check_builtin gmtime_r time.h "time_t *time; struct tm *tm; gmtime_r(time, tm)"
>  check_builtin localtime_r time.h "time_t *time; struct tm *tm; localtime_r(time, tm)"
> +check_builtin float16 "" "_Float16 f16var"
>  
>  case "$custom_allocator" in
>      jemalloc)
> diff --git a/libavutil/float2half.c b/libavutil/float2half.c
> index dba14cef5d..1390d3acc0 100644
> --- a/libavutil/float2half.c
> +++ b/libavutil/float2half.c
> @@ -20,6 +20,7 @@
>  
>  void ff_init_float2half_tables(float2half_tables *t)
>  {
> +#if !HAVE_FLOAT16
>      for (int i = 0; i < 256; i++) {
>          int e = i - 127;
>  
> @@ -50,4 +51,5 @@ void ff_init_float2half_tables(float2half_tables *t)
>              t->shifttable[i|0x100] = 13;
>          }
>      }
> +#endif
>  }
> diff --git a/libavutil/float2half.h b/libavutil/float2half.h
> index b8c9cdfc4f..8c1fb804b7 100644
> --- a/libavutil/float2half.h
> +++ b/libavutil/float2half.h
> @@ -20,21 +20,37 @@
>  #define AVUTIL_FLOAT2HALF_H
>  
>  #include <stdint.h>
> +#include "intfloat.h"
> +
> +#include "config.h"
>  
>  typedef struct float2half_tables {
> +#if HAVE_FLOAT16
> +    uint8_t dummy;
> +#else
>      uint16_t basetable[512];
>      uint8_t shifttable[512];
> +#endif
>  } float2half_tables;
>  
>  void ff_init_float2half_tables(float2half_tables *t);
>  
>  static inline uint16_t float2half(uint32_t f, const float2half_tables *t)
>  {
> +#if HAVE_FLOAT16
> +    union {
> +        _Float16 f;
> +        uint16_t i;
> +    } u;
> +    u.f = av_int2float(f);
> +    return u.i;
> +#else
>      uint16_t h;
>  
>      h = t->basetable[(f >> 23) & 0x1ff] + ((f & 0x007fffff) >> t->shifttable[(f >> 23) & 0x1ff]);
>  
>      return h;
> +#endif
>  }
>  
>  #endif /* AVUTIL_FLOAT2HALF_H */
> diff --git a/libavutil/half2float.c b/libavutil/half2float.c
> index baac8e4093..873226d3a0 100644
> --- a/libavutil/half2float.c
> +++ b/libavutil/half2float.c
> @@ -18,6 +18,7 @@
>  
>  #include "libavutil/half2float.h"
>  
> +#if !HAVE_FLOAT16
>  static uint32_t convertmantissa(uint32_t i)
>  {
>      int32_t m = i << 13; // Zero pad mantissa bits
> @@ -33,9 +34,11 @@ static uint32_t convertmantissa(uint32_t i)
>  
>      return m | e; // Return combined number
>  }
> +#endif
>  
>  void ff_init_half2float_tables(half2float_tables *t)
>  {
> +#if !HAVE_FLOAT16
>      t->mantissatable[0] = 0;
>      for (int i = 1; i < 1024; i++)
>          t->mantissatable[i] = convertmantissa(i);
> @@ -60,4 +63,5 @@ void ff_init_half2float_tables(half2float_tables *t)
>      t->offsettable[31] = 2048;
>      t->offsettable[32] = 0;
>      t->offsettable[63] = 2048;
> +#endif
>  }
> diff --git a/libavutil/half2float.h b/libavutil/half2float.h
> index cb58e44a1c..b2a7c934a6 100644
> --- a/libavutil/half2float.h
> +++ b/libavutil/half2float.h
> @@ -20,22 +20,38 @@
>  #define AVUTIL_HALF2FLOAT_H
>  
>  #include <stdint.h>
> +#include "intfloat.h"
> +
> +#include "config.h"
>  
>  typedef struct half2float_tables {
> +#if HAVE_FLOAT16
> +    uint8_t dummy;
> +#else
>      uint32_t mantissatable[3072];
>      uint32_t exponenttable[64];
>      uint16_t offsettable[64];
> +#endif
>  } half2float_tables;
>  
>  void ff_init_half2float_tables(half2float_tables *t);
>  
>  static inline uint32_t half2float(uint16_t h, const half2float_tables *t)
>  {
> +#if HAVE_FLOAT16
> +    union {
> +        _Float16 f;
> +        uint16_t i;
> +    } u;
> +    u.i = h;
> +    return av_float2int(u.f);
> +#else
>      uint32_t f;
>  
>      f = t->mantissatable[t->offsettable[h >> 10] + (h & 0x3ff)] + t->exponenttable[h >> 10];
>  
>      return f;
> +#endif
>  }
>  
>  #endif /* AVUTIL_HALF2FLOAT_H */

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [FFmpeg-devel] [PATCH 06/11] avutil/half2float: adjust conversion of NaN
  2022-08-10 20:47 ` [FFmpeg-devel] [PATCH 06/11] avutil/half2float: adjust conversion of NaN Timo Rothenpieler
@ 2022-08-10 21:24   ` Andreas Rheinhardt
  2022-08-10 21:36     ` Timo Rothenpieler
  0 siblings, 1 reply; 39+ messages in thread
From: Andreas Rheinhardt @ 2022-08-10 21:24 UTC (permalink / raw)
  To: ffmpeg-devel

Timo Rothenpieler:
> IEEE-754 differentiates two different kinds of NaNs.
> Quiet and Signaling ones. They are differentiated by the MSB of the
> mantissa.
> 
> For whatever reason, actual hardware conversion of half to single always
> sets the signaling bit to 1 if the mantissa is != 0, and to 0 if it's 0.
> So our code has to follow suit or fate-testing hardware float16 will be
> impossible.

What does the exr spec say about quiet and signaling nans?

> ---
>  libavcodec/exr.c                                    | 2 +-
>  libavcodec/pnm.h                                    | 2 +-
>  libavutil/half2float.h                              | 5 +++++
>  tests/ref/fate/exr-rgb-scanline-zip-half-0x0-0xFFFF | 2 +-
>  4 files changed, 8 insertions(+), 3 deletions(-)
> 
> diff --git a/libavcodec/exr.c b/libavcodec/exr.c
> index 5c6ca9adbf..47f4786491 100644
> --- a/libavcodec/exr.c
> +++ b/libavcodec/exr.c
> @@ -191,7 +191,7 @@ typedef struct EXRContext {
>      float gamma;
>      union av_intfloat32 gamma_table[65536];
>  
> -    uint32_t mantissatable[2048];
> +    uint32_t mantissatable[3072];
>      uint32_t exponenttable[64];
>      uint16_t offsettable[64];
>  } EXRContext;
> diff --git a/libavcodec/pnm.h b/libavcodec/pnm.h
> index 5bf2eaa4d9..7e5445f529 100644
> --- a/libavcodec/pnm.h
> +++ b/libavcodec/pnm.h
> @@ -34,7 +34,7 @@ typedef struct PNMContext {
>      int half;
>      float scale;
>  
> -    uint32_t mantissatable[2048];
> +    uint32_t mantissatable[3072];
>      uint32_t exponenttable[64];
>      uint16_t offsettable[64];
>  } PNMContext;
> diff --git a/libavutil/half2float.h b/libavutil/half2float.h
> index 1f6deade07..5af4690cfe 100644
> --- a/libavutil/half2float.h
> +++ b/libavutil/half2float.h
> @@ -45,6 +45,9 @@ static void half2float_table(uint32_t *mantissatable, uint32_t *exponenttable,
>          mantissatable[i] = convertmantissa(i);
>      for (int i = 1024; i < 2048; i++)
>          mantissatable[i] = 0x38000000UL + ((i - 1024) << 13UL);
> +    for (int i = 2048; i < 3072; i++)
> +        mantissatable[i] = mantissatable[i - 1024] | 0x400000UL;
> +    mantissatable[2048] = mantissatable[1024];
>  
>      exponenttable[0] = 0;
>      for (int i = 1; i < 31; i++)
> @@ -58,7 +61,9 @@ static void half2float_table(uint32_t *mantissatable, uint32_t *exponenttable,
>      offsettable[0] = 0;
>      for (int i = 1; i < 64; i++)
>          offsettable[i] = 1024;
> +    offsettable[31] = 2048;
>      offsettable[32] = 0;
> +    offsettable[63] = 2048;
>  }
>  
>  static uint32_t half2float(uint16_t h, const uint32_t *mantissatable, const uint32_t *exponenttable,
> diff --git a/tests/ref/fate/exr-rgb-scanline-zip-half-0x0-0xFFFF b/tests/ref/fate/exr-rgb-scanline-zip-half-0x0-0xFFFF
> index b6201116fe..e45a40b498 100644
> --- a/tests/ref/fate/exr-rgb-scanline-zip-half-0x0-0xFFFF
> +++ b/tests/ref/fate/exr-rgb-scanline-zip-half-0x0-0xFFFF
> @@ -3,4 +3,4 @@
>  #codec_id 0: rawvideo
>  #dimensions 0: 256x256
>  #sar 0: 1/1
> -0,          0,          0,        1,   786432, 0x1445e411
> +0,          0,          0,        1,   786432, 0xce9be2be

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [FFmpeg-devel] [PATCH 06/11] avutil/half2float: adjust conversion of NaN
  2022-08-10 21:24   ` Andreas Rheinhardt
@ 2022-08-10 21:36     ` Timo Rothenpieler
  2022-08-10 21:43       ` Andreas Rheinhardt
  0 siblings, 1 reply; 39+ messages in thread
From: Timo Rothenpieler @ 2022-08-10 21:36 UTC (permalink / raw)
  To: ffmpeg-devel

On 10.08.2022 23:24, Andreas Rheinhardt wrote:
> Timo Rothenpieler:
>> IEEE-754 differentiates two different kinds of NaNs.
>> Quiet and Signaling ones. They are differentiated by the MSB of the
>> mantissa.
>>
>> For whatever reason, actual hardware conversion of half to single always
>> sets the signaling bit to 1 if the mantissa is != 0, and to 0 if it's 0.
>> So our code has to follow suit or fate-testing hardware float16 will be
>> impossible.
> 
> What does the exr spec say about quiet and signaling nans?

Not sure how exr would be involved here.
But I tested this on both aarch64, x86 with sse2 emulation and x86 f16c 
on alderlake and zen2.
They all perfectly agree and match 100% what this changed code produces 
for the entire range of 65k possible values.
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [FFmpeg-devel] [PATCH 11/11] swscale/input: add rgbaf16 input support
  2022-08-10 20:47 ` [FFmpeg-devel] [PATCH 11/11] swscale/input: add rgbaf16 input support Timo Rothenpieler
@ 2022-08-10 21:37   ` Timo Rothenpieler
  0 siblings, 0 replies; 39+ messages in thread
From: Timo Rothenpieler @ 2022-08-10 21:37 UTC (permalink / raw)
  To: ffmpeg-devel

On 10.08.2022 22:47, Timo Rothenpieler wrote:
...
> +#define rgbaf16_funcs_endian(endian_name, endian)                                                         \
> +static void rgbaf16##endian_name##ToUV_half_c(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *unused,      \
> +                                              const uint8_t *src1, const uint8_t *src2,                   \
> +                                              int width, uint32_t *_rgb2yuv, void *opq)                   \
> +{                                                                                                         \
> +    const uint16_t *src = (const uint16_t*)src1;                                                          \
> +    uint16_t *dstU = (uint16_t*)_dstU;                                                                    \
> +    uint16_t *dstV = (uint16_t*)_dstV;                                                                    \
> +    int32_t *rgb2yuv = (int32_t*)_rgb2yuv;                                                                \
> +    av_assert1(src1==src2);                                                                               \
> +    rgbaf16ToUV_half_endian(dstU, dstV, endian, src, width, rgb2yuv, opq);                                \
> +}                                                                                                         \
> +static void rgbaf16##endian_name##ToUV_c(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *unused,           \
> +                                         const uint8_t *src1, const uint8_t *src2,                        \
> +                                         int width, uint32_t *_rgb2yuv, void *opq)                        \
> +{                                                                                                         \
> +    const uint16_t *src = (const uint16_t*)src1;                                                          \
> +    uint16_t *dstU = (uint16_t*)_dstU;                                                                    \
> +    uint16_t *dstV = (uint16_t*)_dstV;                                                                    \
> +    int32_t *rgb2yuv = (int32_t*)_rgb2yuv;                                                                \
> +    av_assert1(src1==src2);                                                                               \
> +    rgbaf16ToUV_half_endian(dstU, dstV, endian, src, width, rgb2yuv, opq);                                \
> +}                                                                                                         \

copy/paste error here: This should be the non-half version. Fixed locally.
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [FFmpeg-devel] [PATCH 06/11] avutil/half2float: adjust conversion of NaN
  2022-08-10 21:36     ` Timo Rothenpieler
@ 2022-08-10 21:43       ` Andreas Rheinhardt
  2022-08-10 21:53         ` Timo Rothenpieler
  0 siblings, 1 reply; 39+ messages in thread
From: Andreas Rheinhardt @ 2022-08-10 21:43 UTC (permalink / raw)
  To: ffmpeg-devel

Timo Rothenpieler:
> On 10.08.2022 23:24, Andreas Rheinhardt wrote:
>> Timo Rothenpieler:
>>> IEEE-754 differentiates two different kinds of NaNs.
>>> Quiet and Signaling ones. They are differentiated by the MSB of the
>>> mantissa.
>>>
>>> For whatever reason, actual hardware conversion of half to single always
>>> sets the signaling bit to 1 if the mantissa is != 0, and to 0 if it's 0.
>>> So our code has to follow suit or fate-testing hardware float16 will be
>>> impossible.
>>
>> What does the exr spec say about quiet and signaling nans?
> 
> Not sure how exr would be involved here.

Your patch changes the output of an exr-test. The output of the exr
decoder is presumably determined by the exr spec. There is after all the
possibility that what hardware does in hardware and what this patch does
in software is incompatible with what exr specifies.

> But I tested this on both aarch64, x86 with sse2 emulation and x86 f16c
> on alderlake and zen2.
> They all perfectly agree and match 100% what this changed code produces
> for the entire range of 65k possible values.
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> 
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [FFmpeg-devel] [PATCH 06/11] avutil/half2float: adjust conversion of NaN
  2022-08-10 21:43       ` Andreas Rheinhardt
@ 2022-08-10 21:53         ` Timo Rothenpieler
  2022-08-10 22:14           ` Mark Reid
  0 siblings, 1 reply; 39+ messages in thread
From: Timo Rothenpieler @ 2022-08-10 21:53 UTC (permalink / raw)
  To: ffmpeg-devel

On 10.08.2022 23:43, Andreas Rheinhardt wrote:
> Timo Rothenpieler:
>> On 10.08.2022 23:24, Andreas Rheinhardt wrote:
>>> Timo Rothenpieler:
>>>> IEEE-754 differentiates two different kinds of NaNs.
>>>> Quiet and Signaling ones. They are differentiated by the MSB of the
>>>> mantissa.
>>>>
>>>> For whatever reason, actual hardware conversion of half to single always
>>>> sets the signaling bit to 1 if the mantissa is != 0, and to 0 if it's 0.
>>>> So our code has to follow suit or fate-testing hardware float16 will be
>>>> impossible.
>>>
>>> What does the exr spec say about quiet and signaling nans?
>>
>> Not sure how exr would be involved here.
> 
> Your patch changes the output of an exr-test. The output of the exr
> decoder is presumably determined by the exr spec. There is after all the
> possibility that what hardware does in hardware and what this patch does
> in software is incompatible with what exr specifies.

The exr spec just says something along the lines of "analogous to 
IEEE-754 floats": 
https://openexr.readthedocs.io/en/latest/TechnicalIntroduction.html?highlight=ieee#the-half-data-type
It barely ever mentions NaNs, other than that they exist. Which makes 
sense, given they don't typically appear in images.

The only output changed is that for how NaNs are converted.
And given the cross-validation with multiple hardware implementations, 
I'm confident that it's correct.
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [FFmpeg-devel] [PATCH 10/11] swscale: add SwsContext parameter to input functions
  2022-08-10 20:47 ` [FFmpeg-devel] [PATCH 10/11] swscale: add SwsContext parameter to input functions Timo Rothenpieler
  2022-08-10 20:52   ` Timo Rothenpieler
@ 2022-08-10 21:55   ` Andreas Rheinhardt
  2022-08-10 22:02     ` Timo Rothenpieler
  1 sibling, 1 reply; 39+ messages in thread
From: Andreas Rheinhardt @ 2022-08-10 21:55 UTC (permalink / raw)
  To: ffmpeg-devel

Timo Rothenpieler:
> ---
>  libswscale/hscale.c           |  12 +--
>  libswscale/input.c            | 149 ++++++++++++++++++----------------
>  libswscale/swscale_internal.h |  17 ++--
>  libswscale/x86/swscale.c      |  13 +--
>  4 files changed, 106 insertions(+), 85 deletions(-)
> 
> diff --git a/libswscale/hscale.c b/libswscale/hscale.c
> index eca0635338..6789ce7540 100644
> --- a/libswscale/hscale.c
> +++ b/libswscale/hscale.c
> @@ -105,18 +105,18 @@ static int lum_convert(SwsContext *c, SwsFilterDescriptor *desc, int sliceY, int
>          uint8_t * dst = desc->dst->plane[0].line[i];
>  
>          if (c->lumToYV12) {
> -            c->lumToYV12(dst, src[0], src[1], src[2], srcW, pal);
> +            c->lumToYV12(dst, src[0], src[1], src[2], srcW, pal, c->input_opaque);
>          } else if (c->readLumPlanar) {
> -            c->readLumPlanar(dst, src, srcW, c->input_rgb2yuv_table);
> +            c->readLumPlanar(dst, src, srcW, c->input_rgb2yuv_table, c->input_opaque);
>          }
>  
>  
>          if (desc->alpha) {
>              dst = desc->dst->plane[3].line[i];
>              if (c->alpToYV12) {
> -                c->alpToYV12(dst, src[3], src[1], src[2], srcW, pal);
> +                c->alpToYV12(dst, src[3], src[1], src[2], srcW, pal, c->input_opaque);
>              } else if (c->readAlpPlanar) {
> -                c->readAlpPlanar(dst, src, srcW, NULL);
> +                c->readAlpPlanar(dst, src, srcW, NULL, c->input_opaque);
>              }
>          }
>      }
> @@ -224,9 +224,9 @@ static int chr_convert(SwsContext *c, SwsFilterDescriptor *desc, int sliceY, int
>          uint8_t * dst1 = desc->dst->plane[1].line[i];
>          uint8_t * dst2 = desc->dst->plane[2].line[i];
>          if (c->chrToYV12) {
> -            c->chrToYV12(dst1, dst2, src[0], src[1], src[2], srcW, pal);
> +            c->chrToYV12(dst1, dst2, src[0], src[1], src[2], srcW, pal, c->input_opaque);
>          } else if (c->readChrPlanar) {
> -            c->readChrPlanar(dst1, dst2, src, srcW, c->input_rgb2yuv_table);
> +            c->readChrPlanar(dst1, dst2, src, srcW, c->input_rgb2yuv_table, c->input_opaque);
>          }
>      }
>      return sliceH;
> diff --git a/libswscale/input.c b/libswscale/input.c
> index 68abc4d62c..36ef1e43ac 100644
> --- a/libswscale/input.c
> +++ b/libswscale/input.c
> @@ -88,7 +88,7 @@ rgb64ToUV_half_c_template(uint16_t *dstU, uint16_t *dstV,
>  
>  #define rgb64funcs(pattern, BE_LE, origin) \
>  static void pattern ## 64 ## BE_LE ## ToY_c(uint8_t *_dst, const uint8_t *_src, const uint8_t *unused0, const uint8_t *unused1,\
> -                                    int width, uint32_t *rgb2yuv) \
> +                                    int width, uint32_t *rgb2yuv, void *opq) \
>  { \
>      const uint16_t *src = (const uint16_t *) _src; \
>      uint16_t *dst = (uint16_t *) _dst; \
> @@ -97,7 +97,7 @@ static void pattern ## 64 ## BE_LE ## ToY_c(uint8_t *_dst, const uint8_t *_src,
>   \
>  static void pattern ## 64 ## BE_LE ## ToUV_c(uint8_t *_dstU, uint8_t *_dstV, \
>                                      const uint8_t *unused0, const uint8_t *_src1, const uint8_t *_src2, \
> -                                    int width, uint32_t *rgb2yuv) \
> +                                    int width, uint32_t *rgb2yuv, void *opq) \
>  { \
>      const uint16_t *src1 = (const uint16_t *) _src1, \
>                     *src2 = (const uint16_t *) _src2; \
> @@ -107,7 +107,7 @@ static void pattern ## 64 ## BE_LE ## ToUV_c(uint8_t *_dstU, uint8_t *_dstV, \
>   \
>  static void pattern ## 64 ## BE_LE ## ToUV_half_c(uint8_t *_dstU, uint8_t *_dstV, \
>                                      const uint8_t *unused0, const uint8_t *_src1, const uint8_t *_src2, \
> -                                    int width, uint32_t *rgb2yuv) \
> +                                    int width, uint32_t *rgb2yuv, void *opq) \
>  { \
>      const uint16_t *src1 = (const uint16_t *) _src1, \
>                     *src2 = (const uint16_t *) _src2; \
> @@ -192,7 +192,8 @@ static void pattern ## 48 ## BE_LE ## ToY_c(uint8_t *_dst,              \
>                                              const uint8_t *_src,        \
>                                              const uint8_t *unused0, const uint8_t *unused1,\
>                                              int width,                  \
> -                                            uint32_t *rgb2yuv)          \
> +                                            uint32_t *rgb2yuv,          \
> +                                            void *opq)                  \
>  {                                                                       \
>      const uint16_t *src = (const uint16_t *)_src;                       \
>      uint16_t *dst       = (uint16_t *)_dst;                             \
> @@ -205,7 +206,8 @@ static void pattern ## 48 ## BE_LE ## ToUV_c(uint8_t *_dstU,            \
>                                               const uint8_t *_src1,      \
>                                               const uint8_t *_src2,      \
>                                               int width,                 \
> -                                             uint32_t *rgb2yuv)         \
> +                                             uint32_t *rgb2yuv,         \
> +                                             void *opq)                 \
>  {                                                                       \
>      const uint16_t *src1 = (const uint16_t *)_src1,                     \
>                     *src2 = (const uint16_t *)_src2;                     \
> @@ -220,7 +222,8 @@ static void pattern ## 48 ## BE_LE ## ToUV_half_c(uint8_t *_dstU,       \
>                                                    const uint8_t *_src1, \
>                                                    const uint8_t *_src2, \
>                                                    int width,            \
> -                                                  uint32_t *rgb2yuv)    \
> +                                                  uint32_t *rgb2yuv,    \
> +                                                  void *opq)            \
>  {                                                                       \
>      const uint16_t *src1 = (const uint16_t *)_src1,                     \
>                     *src2 = (const uint16_t *)_src2;                     \
> @@ -345,7 +348,7 @@ static av_always_inline void rgb16_32ToUV_half_c_template(int16_t *dstU,
>  #define rgb16_32_wrapper(fmt, name, shr, shg, shb, shp, maskr,          \
>                           maskg, maskb, rsh, gsh, bsh, S)                \
>  static void name ## ToY_c(uint8_t *dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2,            \
> -                          int width, uint32_t *tab)                     \
> +                          int width, uint32_t *tab, void *opq)          \
>  {                                                                       \
>      rgb16_32ToY_c_template((int16_t*)dst, src, width, fmt, shr, shg, shb, shp,    \
>                             maskr, maskg, maskb, rsh, gsh, bsh, S, tab); \
> @@ -353,7 +356,7 @@ static void name ## ToY_c(uint8_t *dst, const uint8_t *src, const uint8_t *unuse
>                                                                          \
>  static void name ## ToUV_c(uint8_t *dstU, uint8_t *dstV,                \
>                             const uint8_t *unused0, const uint8_t *src, const uint8_t *dummy,    \
> -                           int width, uint32_t *tab)                    \
> +                           int width, uint32_t *tab, void *opq)         \
>  {                                                                       \
>      rgb16_32ToUV_c_template((int16_t*)dstU, (int16_t*)dstV, src, width, fmt,                \
>                              shr, shg, shb, shp,                         \
> @@ -363,7 +366,7 @@ static void name ## ToUV_c(uint8_t *dstU, uint8_t *dstV,                \
>  static void name ## ToUV_half_c(uint8_t *dstU, uint8_t *dstV,           \
>                                  const uint8_t *unused0, const uint8_t *src,                     \
>                                  const uint8_t *dummy,                   \
> -                                int width, uint32_t *tab)               \
> +                                int width, uint32_t *tab, void *opq)    \
>  {                                                                       \
>      rgb16_32ToUV_half_c_template((int16_t*)dstU, (int16_t*)dstV, src, width, fmt,           \
>                                   shr, shg, shb, shp,                    \
> @@ -392,7 +395,7 @@ rgb16_32_wrapper(AV_PIX_FMT_X2BGR10LE, bgr30le, 0, 6, 16, 0, 0x3FF, 0xFFC00, 0x3
>  
>  static void gbr24pToUV_half_c(uint8_t *_dstU, uint8_t *_dstV,
>                           const uint8_t *gsrc, const uint8_t *bsrc, const uint8_t *rsrc,
> -                         int width, uint32_t *rgb2yuv)
> +                         int width, uint32_t *rgb2yuv, void *opq)
>  {
>      uint16_t *dstU = (uint16_t *)_dstU;
>      uint16_t *dstV = (uint16_t *)_dstV;
> @@ -411,7 +414,7 @@ static void gbr24pToUV_half_c(uint8_t *_dstU, uint8_t *_dstV,
>  }
>  
>  static void rgba64leToA_c(uint8_t *_dst, const uint8_t *_src, const uint8_t *unused1,
> -                          const uint8_t *unused2, int width, uint32_t *unused)
> +                          const uint8_t *unused2, int width, uint32_t *unused, void *opq)
>  {
>      int16_t *dst = (int16_t *)_dst;
>      const uint16_t *src = (const uint16_t *)_src;
> @@ -421,7 +424,7 @@ static void rgba64leToA_c(uint8_t *_dst, const uint8_t *_src, const uint8_t *unu
>  }
>  
>  static void rgba64beToA_c(uint8_t *_dst, const uint8_t *_src, const uint8_t *unused1,
> -                          const uint8_t *unused2, int width, uint32_t *unused)
> +                          const uint8_t *unused2, int width, uint32_t *unused, void *opq)
>  {
>      int16_t *dst = (int16_t *)_dst;
>      const uint16_t *src = (const uint16_t *)_src;
> @@ -430,7 +433,8 @@ static void rgba64beToA_c(uint8_t *_dst, const uint8_t *_src, const uint8_t *unu
>          dst[i] = AV_RB16(src + 4 * i + 3);
>  }
>  
> -static void abgrToA_c(uint8_t *_dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2, int width, uint32_t *unused)
> +static void abgrToA_c(uint8_t *_dst, const uint8_t *src, const uint8_t *unused1,
> +                      const uint8_t *unused2, int width, uint32_t *unused, void *opq)
>  {
>      int16_t *dst = (int16_t *)_dst;
>      int i;
> @@ -439,7 +443,8 @@ static void abgrToA_c(uint8_t *_dst, const uint8_t *src, const uint8_t *unused1,
>      }
>  }
>  
> -static void rgbaToA_c(uint8_t *_dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2, int width, uint32_t *unused)
> +static void rgbaToA_c(uint8_t *_dst, const uint8_t *src, const uint8_t *unused1,
> +                      const uint8_t *unused2, int width, uint32_t *unused, void *opq)
>  {
>      int16_t *dst = (int16_t *)_dst;
>      int i;
> @@ -448,7 +453,8 @@ static void rgbaToA_c(uint8_t *_dst, const uint8_t *src, const uint8_t *unused1,
>      }
>  }
>  
> -static void palToA_c(uint8_t *_dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2, int width, uint32_t *pal)
> +static void palToA_c(uint8_t *_dst, const uint8_t *src, const uint8_t *unused1,
> +                     const uint8_t *unused2, int width, uint32_t *pal, void *opq)
>  {
>      int16_t *dst = (int16_t *)_dst;
>      int i;
> @@ -459,7 +465,8 @@ static void palToA_c(uint8_t *_dst, const uint8_t *src, const uint8_t *unused1,
>      }
>  }
>  
> -static void palToY_c(uint8_t *_dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2, int width, uint32_t *pal)
> +static void palToY_c(uint8_t *_dst, const uint8_t *src, const uint8_t *unused1,
> +                     const uint8_t *unused2, int width, uint32_t *pal, void *opq)
>  {
>      int16_t *dst = (int16_t *)_dst;
>      int i;
> @@ -471,8 +478,8 @@ static void palToY_c(uint8_t *_dst, const uint8_t *src, const uint8_t *unused1,
>  }
>  
>  static void palToUV_c(uint8_t *_dstU, uint8_t *_dstV,
> -                           const uint8_t *unused0, const uint8_t *src1, const uint8_t *src2,
> -                      int width, uint32_t *pal)
> +                      const uint8_t *unused0, const uint8_t *src1, const uint8_t *src2,
> +                      int width, uint32_t *pal, void *opq)
>  {
>      uint16_t *dstU = (uint16_t *)_dstU;
>      int16_t *dstV = (int16_t *)_dstV;
> @@ -486,7 +493,8 @@ static void palToUV_c(uint8_t *_dstU, uint8_t *_dstV,
>      }
>  }
>  
> -static void monowhite2Y_c(uint8_t *_dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2,  int width, uint32_t *unused)
> +static void monowhite2Y_c(uint8_t *_dst, const uint8_t *src, const uint8_t *unused1,
> +                          const uint8_t *unused2,  int width, uint32_t *unused, void *opq)
>  {
>      int16_t *dst = (int16_t *)_dst;
>      int i, j;
> @@ -503,7 +511,8 @@ static void monowhite2Y_c(uint8_t *_dst, const uint8_t *src, const uint8_t *unus
>      }
>  }
>  
> -static void monoblack2Y_c(uint8_t *_dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2,  int width, uint32_t *unused)
> +static void monoblack2Y_c(uint8_t *_dst, const uint8_t *src, const uint8_t *unused1,
> +                          const uint8_t *unused2,  int width, uint32_t *unused, void *opq)
>  {
>      int16_t *dst = (int16_t *)_dst;
>      int i, j;
> @@ -520,8 +529,8 @@ static void monoblack2Y_c(uint8_t *_dst, const uint8_t *src, const uint8_t *unus
>      }
>  }
>  
> -static void yuy2ToY_c(uint8_t *dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2,  int width,
> -                      uint32_t *unused)
> +static void yuy2ToY_c(uint8_t *dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2, int width,
> +                      uint32_t *unused, void *opq)
>  {
>      int i;
>      for (i = 0; i < width; i++)
> @@ -529,7 +538,7 @@ static void yuy2ToY_c(uint8_t *dst, const uint8_t *src, const uint8_t *unused1,
>  }
>  
>  static void yuy2ToUV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *unused0, const uint8_t *src1,
> -                       const uint8_t *src2, int width, uint32_t *unused)
> +                       const uint8_t *src2, int width, uint32_t *unused, void *opq)
>  {
>      int i;
>      for (i = 0; i < width; i++) {
> @@ -540,7 +549,7 @@ static void yuy2ToUV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *unused0, con
>  }
>  
>  static void yvy2ToUV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *unused0, const uint8_t *src1,
> -                       const uint8_t *src2, int width, uint32_t *unused)
> +                       const uint8_t *src2, int width, uint32_t *unused, void *opq)
>  {
>      int i;
>      for (i = 0; i < width; i++) {
> @@ -551,7 +560,7 @@ static void yvy2ToUV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *unused0, con
>  }
>  
>  static void y210le_UV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *unused0, const uint8_t *src,
> -                        const uint8_t *unused1, int width, uint32_t *unused2)
> +                        const uint8_t *unused1, int width, uint32_t *unused2, void *opq)
>  {
>      int i;
>      for (i = 0; i < width; i++) {
> @@ -561,7 +570,7 @@ static void y210le_UV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *unused0, co
>  }
>  
>  static void y210le_Y_c(uint8_t *dst, const uint8_t *src, const uint8_t *unused0,
> -                       const uint8_t *unused1, int width, uint32_t *unused2)
> +                       const uint8_t *unused1, int width, uint32_t *unused2, void *opq)
>  {
>      int i;
>      for (i = 0; i < width; i++)
> @@ -569,7 +578,7 @@ static void y210le_Y_c(uint8_t *dst, const uint8_t *src, const uint8_t *unused0,
>  }
>  
>  static void bswap16Y_c(uint8_t *_dst, const uint8_t *_src, const uint8_t *unused1, const uint8_t *unused2, int width,
> -                       uint32_t *unused)
> +                       uint32_t *unused, void *opq)
>  {
>      int i;
>      const uint16_t *src = (const uint16_t *)_src;
> @@ -579,7 +588,7 @@ static void bswap16Y_c(uint8_t *_dst, const uint8_t *_src, const uint8_t *unused
>  }
>  
>  static void bswap16UV_c(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *unused0, const uint8_t *_src1,
> -                        const uint8_t *_src2, int width, uint32_t *unused)
> +                        const uint8_t *_src2, int width, uint32_t *unused, void *opq)
>  {
>      int i;
>      const uint16_t *src1 = (const uint16_t *)_src1,
> @@ -592,7 +601,7 @@ static void bswap16UV_c(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *unused0,
>  }
>  
>  static void read_ya16le_gray_c(uint8_t *dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2, int width,
> -                               uint32_t *unused)
> +                               uint32_t *unused, void *opq)
>  {
>      int i;
>      for (i = 0; i < width; i++)
> @@ -600,7 +609,7 @@ static void read_ya16le_gray_c(uint8_t *dst, const uint8_t *src, const uint8_t *
>  }
>  
>  static void read_ya16le_alpha_c(uint8_t *dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2, int width,
> -                                uint32_t *unused)
> +                                uint32_t *unused, void *opq)
>  {
>      int i;
>      for (i = 0; i < width; i++)
> @@ -608,7 +617,7 @@ static void read_ya16le_alpha_c(uint8_t *dst, const uint8_t *src, const uint8_t
>  }
>  
>  static void read_ya16be_gray_c(uint8_t *dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2, int width,
> -                               uint32_t *unused)
> +                               uint32_t *unused, void *opq)
>  {
>      int i;
>      for (i = 0; i < width; i++)
> @@ -616,7 +625,7 @@ static void read_ya16be_gray_c(uint8_t *dst, const uint8_t *src, const uint8_t *
>  }
>  
>  static void read_ya16be_alpha_c(uint8_t *dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2, int width,
> -                                uint32_t *unused)
> +                                uint32_t *unused, void *opq)
>  {
>      int i;
>      for (i = 0; i < width; i++)
> @@ -624,7 +633,7 @@ static void read_ya16be_alpha_c(uint8_t *dst, const uint8_t *src, const uint8_t
>  }
>  
>  static void read_ayuv64le_Y_c(uint8_t *dst, const uint8_t *src, const uint8_t *unused0, const uint8_t *unused1, int width,
> -                               uint32_t *unused2)
> +                               uint32_t *unused2, void *opq)
>  {
>      int i;
>      for (i = 0; i < width; i++)
> @@ -633,7 +642,7 @@ static void read_ayuv64le_Y_c(uint8_t *dst, const uint8_t *src, const uint8_t *u
>  
>  
>  static void read_ayuv64le_UV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *unused0, const uint8_t *src,
> -                               const uint8_t *unused1, int width, uint32_t *unused2)
> +                               const uint8_t *unused1, int width, uint32_t *unused2, void *opq)
>  {
>      int i;
>      for (i = 0; i < width; i++) {
> @@ -643,7 +652,7 @@ static void read_ayuv64le_UV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *unus
>  }
>  
>  static void read_ayuv64le_A_c(uint8_t *dst, const uint8_t *src, const uint8_t *unused0, const uint8_t *unused1, int width,
> -                                uint32_t *unused2)
> +                              uint32_t *unused2, void *opq)
>  {
>      int i;
>      for (i = 0; i < width; i++)
> @@ -651,7 +660,7 @@ static void read_ayuv64le_A_c(uint8_t *dst, const uint8_t *src, const uint8_t *u
>  }
>  
>  static void read_vuya_UV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *unused0, const uint8_t *src,
> -                           const uint8_t *unused1, int width, uint32_t *unused2)
> +                           const uint8_t *unused1, int width, uint32_t *unused2, void *opq)
>  {
>      int i;
>      for (i = 0; i < width; i++) {
> @@ -661,7 +670,7 @@ static void read_vuya_UV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *unused0,
>  }
>  
>  static void read_vuya_Y_c(uint8_t *dst, const uint8_t *src, const uint8_t *unused0, const uint8_t *unused1, int width,
> -                          uint32_t *unused2)
> +                          uint32_t *unused2, void *opq)
>  {
>      int i;
>      for (i = 0; i < width; i++)
> @@ -669,7 +678,7 @@ static void read_vuya_Y_c(uint8_t *dst, const uint8_t *src, const uint8_t *unuse
>  }
>  
>  static void read_vuya_A_c(uint8_t *dst, const uint8_t *src, const uint8_t *unused0, const uint8_t *unused1, int width,
> -                          uint32_t *unused2)
> +                          uint32_t *unused2, void *opq)
>  {
>      int i;
>      for (i = 0; i < width; i++)
> @@ -679,7 +688,7 @@ static void read_vuya_A_c(uint8_t *dst, const uint8_t *src, const uint8_t *unuse
>  /* This is almost identical to the previous, end exists only because
>   * yuy2ToY/UV)(dst, src + 1, ...) would have 100% unaligned accesses. */
>  static void uyvyToY_c(uint8_t *dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2,  int width,
> -                      uint32_t *unused)
> +                      uint32_t *unused, void *opq)
>  {
>      int i;
>      for (i = 0; i < width; i++)
> @@ -687,7 +696,7 @@ static void uyvyToY_c(uint8_t *dst, const uint8_t *src, const uint8_t *unused1,
>  }
>  
>  static void uyvyToUV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *unused0, const uint8_t *src1,
> -                       const uint8_t *src2, int width, uint32_t *unused)
> +                       const uint8_t *src2, int width, uint32_t *unused, void *opq)
>  {
>      int i;
>      for (i = 0; i < width; i++) {
> @@ -709,20 +718,20 @@ static av_always_inline void nvXXtoUV_c(uint8_t *dst1, uint8_t *dst2,
>  
>  static void nv12ToUV_c(uint8_t *dstU, uint8_t *dstV,
>                         const uint8_t *unused0, const uint8_t *src1, const uint8_t *src2,
> -                       int width, uint32_t *unused)
> +                       int width, uint32_t *unused, void *opq)
>  {
>      nvXXtoUV_c(dstU, dstV, src1, width);
>  }
>  
>  static void nv21ToUV_c(uint8_t *dstU, uint8_t *dstV,
>                         const uint8_t *unused0, const uint8_t *src1, const uint8_t *src2,
> -                       int width, uint32_t *unused)
> +                       int width, uint32_t *unused, void *opq)
>  {
>      nvXXtoUV_c(dstV, dstU, src1, width);
>  }
>  
>  static void p010LEToY_c(uint8_t *dst, const uint8_t *src, const uint8_t *unused1,
> -                        const uint8_t *unused2, int width, uint32_t *unused)
> +                        const uint8_t *unused2, int width, uint32_t *unused, void *opq)
>  {
>      int i;
>      for (i = 0; i < width; i++) {
> @@ -731,7 +740,7 @@ static void p010LEToY_c(uint8_t *dst, const uint8_t *src, const uint8_t *unused1
>  }
>  
>  static void p010BEToY_c(uint8_t *dst, const uint8_t *src, const uint8_t *unused1,
> -                        const uint8_t *unused2, int width, uint32_t *unused)
> +                        const uint8_t *unused2, int width, uint32_t *unused, void *opq)
>  {
>      int i;
>      for (i = 0; i < width; i++) {
> @@ -741,7 +750,7 @@ static void p010BEToY_c(uint8_t *dst, const uint8_t *src, const uint8_t *unused1
>  
>  static void p010LEToUV_c(uint8_t *dstU, uint8_t *dstV,
>                         const uint8_t *unused0, const uint8_t *src1, const uint8_t *src2,
> -                       int width, uint32_t *unused)
> +                       int width, uint32_t *unused, void *opq)
>  {
>      int i;
>      for (i = 0; i < width; i++) {
> @@ -751,8 +760,8 @@ static void p010LEToUV_c(uint8_t *dstU, uint8_t *dstV,
>  }
>  
>  static void p010BEToUV_c(uint8_t *dstU, uint8_t *dstV,
> -                       const uint8_t *unused0, const uint8_t *src1, const uint8_t *src2,
> -                       int width, uint32_t *unused)
> +                         const uint8_t *unused0, const uint8_t *src1, const uint8_t *src2,
> +                         int width, uint32_t *unused, void *opq)
>  {
>      int i;
>      for (i = 0; i < width; i++) {
> @@ -762,8 +771,8 @@ static void p010BEToUV_c(uint8_t *dstU, uint8_t *dstV,
>  }
>  
>  static void p016LEToUV_c(uint8_t *dstU, uint8_t *dstV,
> -                       const uint8_t *unused0, const uint8_t *src1, const uint8_t *src2,
> -                       int width, uint32_t *unused)
> +                         const uint8_t *unused0, const uint8_t *src1, const uint8_t *src2,
> +                         int width, uint32_t *unused, void *opq)
>  {
>      int i;
>      for (i = 0; i < width; i++) {
> @@ -773,8 +782,8 @@ static void p016LEToUV_c(uint8_t *dstU, uint8_t *dstV,
>  }
>  
>  static void p016BEToUV_c(uint8_t *dstU, uint8_t *dstV,
> -                       const uint8_t *unused0, const uint8_t *src1, const uint8_t *src2,
> -                       int width, uint32_t *unused)
> +                         const uint8_t *unused0, const uint8_t *src1, const uint8_t *src2,
> +                         int width, uint32_t *unused, void *opq)
>  {
>      int i;
>      for (i = 0; i < width; i++) {
> @@ -786,7 +795,7 @@ static void p016BEToUV_c(uint8_t *dstU, uint8_t *dstV,
>  #define input_pixel(pos) (isBE(origin) ? AV_RB16(pos) : AV_RL16(pos))
>  
>  static void bgr24ToY_c(uint8_t *_dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2,
> -                       int width, uint32_t *rgb2yuv)
> +                       int width, uint32_t *rgb2yuv, void *opq)
>  {
>      int16_t *dst = (int16_t *)_dst;
>      int32_t ry = rgb2yuv[RY_IDX], gy = rgb2yuv[GY_IDX], by = rgb2yuv[BY_IDX];
> @@ -801,7 +810,7 @@ static void bgr24ToY_c(uint8_t *_dst, const uint8_t *src, const uint8_t *unused1
>  }
>  
>  static void bgr24ToUV_c(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *unused0, const uint8_t *src1,
> -                        const uint8_t *src2, int width, uint32_t *rgb2yuv)
> +                        const uint8_t *src2, int width, uint32_t *rgb2yuv, void *opq)
>  {
>      int16_t *dstU = (int16_t *)_dstU;
>      int16_t *dstV = (int16_t *)_dstV;
> @@ -820,7 +829,7 @@ static void bgr24ToUV_c(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *unused0,
>  }
>  
>  static void bgr24ToUV_half_c(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *unused0, const uint8_t *src1,
> -                             const uint8_t *src2, int width, uint32_t *rgb2yuv)
> +                             const uint8_t *src2, int width, uint32_t *rgb2yuv, void *opq)
>  {
>      int16_t *dstU = (int16_t *)_dstU;
>      int16_t *dstV = (int16_t *)_dstV;
> @@ -839,7 +848,7 @@ static void bgr24ToUV_half_c(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *unus
>  }
>  
>  static void rgb24ToY_c(uint8_t *_dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2, int width,
> -                       uint32_t *rgb2yuv)
> +                       uint32_t *rgb2yuv, void *opq)
>  {
>      int16_t *dst = (int16_t *)_dst;
>      int32_t ry = rgb2yuv[RY_IDX], gy = rgb2yuv[GY_IDX], by = rgb2yuv[BY_IDX];
> @@ -854,7 +863,7 @@ static void rgb24ToY_c(uint8_t *_dst, const uint8_t *src, const uint8_t *unused1
>  }
>  
>  static void rgb24ToUV_c(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *unused0, const uint8_t *src1,
> -                        const uint8_t *src2, int width, uint32_t *rgb2yuv)
> +                        const uint8_t *src2, int width, uint32_t *rgb2yuv, void *opq)
>  {
>      int16_t *dstU = (int16_t *)_dstU;
>      int16_t *dstV = (int16_t *)_dstV;
> @@ -873,7 +882,7 @@ static void rgb24ToUV_c(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *unused0,
>  }
>  
>  static void rgb24ToUV_half_c(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *unused0, const uint8_t *src1,
> -                             const uint8_t *src2, int width, uint32_t *rgb2yuv)
> +                             const uint8_t *src2, int width, uint32_t *rgb2yuv, void *opq)
>  {
>      int16_t *dstU = (int16_t *)_dstU;
>      int16_t *dstV = (int16_t *)_dstV;
> @@ -891,7 +900,7 @@ static void rgb24ToUV_half_c(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *unus
>      }
>  }
>  
> -static void planar_rgb_to_y(uint8_t *_dst, const uint8_t *src[4], int width, int32_t *rgb2yuv)
> +static void planar_rgb_to_y(uint8_t *_dst, const uint8_t *src[4], int width, int32_t *rgb2yuv, void *opq)
>  {
>      uint16_t *dst = (uint16_t *)_dst;
>      int32_t ry = rgb2yuv[RY_IDX], gy = rgb2yuv[GY_IDX], by = rgb2yuv[BY_IDX];
> @@ -905,7 +914,7 @@ static void planar_rgb_to_y(uint8_t *_dst, const uint8_t *src[4], int width, int
>      }
>  }
>  
> -static void planar_rgb_to_a(uint8_t *_dst, const uint8_t *src[4], int width, int32_t *unused)
> +static void planar_rgb_to_a(uint8_t *_dst, const uint8_t *src[4], int width, int32_t *unused, void *opq)
>  {
>      uint16_t *dst = (uint16_t *)_dst;
>      int i;
> @@ -913,7 +922,7 @@ static void planar_rgb_to_a(uint8_t *_dst, const uint8_t *src[4], int width, int
>          dst[i] = src[3][i] << 6;
>  }
>  
> -static void planar_rgb_to_uv(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *src[4], int width, int32_t *rgb2yuv)
> +static void planar_rgb_to_uv(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *src[4], int width, int32_t *rgb2yuv, void *opq)
>  {
>      uint16_t *dstU = (uint16_t *)_dstU;
>      uint16_t *dstV = (uint16_t *)_dstV;
> @@ -1049,24 +1058,27 @@ static av_always_inline void grayf32ToY16_c(uint8_t *_dst, const uint8_t *_src,
>  
>  #define rgb9plus_planar_funcs_endian(nbits, endian_name, endian)                                    \
>  static void planar_rgb##nbits##endian_name##_to_y(uint8_t *dst, const uint8_t *src[4],              \
> -                                                  int w, int32_t *rgb2yuv)                          \
> +                                                  int w, int32_t *rgb2yuv, void *opq)               \
>  {                                                                                                   \
>      planar_rgb16_to_y(dst, src, w, nbits, endian, rgb2yuv);                                         \
>  }                                                                                                   \
>  static void planar_rgb##nbits##endian_name##_to_uv(uint8_t *dstU, uint8_t *dstV,                    \
> -                                                   const uint8_t *src[4], int w, int32_t *rgb2yuv)  \
> +                                                   const uint8_t *src[4], int w, int32_t *rgb2yuv,  \
> +                                                   void *opq)                                       \
>  {                                                                                                   \
>      planar_rgb16_to_uv(dstU, dstV, src, w, nbits, endian, rgb2yuv);                                 \
>  }                                                                                                   \
>  
>  #define rgb9plus_planar_transparency_funcs(nbits)                           \
>  static void planar_rgb##nbits##le_to_a(uint8_t *dst, const uint8_t *src[4], \
> -                                       int w, int32_t *rgb2yuv)             \
> +                                       int w, int32_t *rgb2yuv,             \
> +                                       void *opq)                           \
>  {                                                                           \
>      planar_rgb16_to_a(dst, src, w, nbits, 0, rgb2yuv);                      \
>  }                                                                           \
>  static void planar_rgb##nbits##be_to_a(uint8_t *dst, const uint8_t *src[4], \
> -                                       int w, int32_t *rgb2yuv)             \
> +                                       int w, int32_t *rgb2yuv,             \
> +                                       void *opq)                           \
>  {                                                                           \
>      planar_rgb16_to_a(dst, src, w, nbits, 1, rgb2yuv);                      \
>  }
> @@ -1087,23 +1099,24 @@ rgb9plus_planar_transparency_funcs(16)
>  
>  #define rgbf32_planar_funcs_endian(endian_name, endian)                                             \
>  static void planar_rgbf32##endian_name##_to_y(uint8_t *dst, const uint8_t *src[4],                  \
> -                                                  int w, int32_t *rgb2yuv)                          \
> +                                                  int w, int32_t *rgb2yuv, void *opq)               \
>  {                                                                                                   \
>      planar_rgbf32_to_y(dst, src, w, endian, rgb2yuv);                                               \
>  }                                                                                                   \
>  static void planar_rgbf32##endian_name##_to_uv(uint8_t *dstU, uint8_t *dstV,                        \
> -                                                   const uint8_t *src[4], int w, int32_t *rgb2yuv)  \
> +                                               const uint8_t *src[4], int w, int32_t *rgb2yuv,      \
> +                                               void *opq)                                           \
>  {                                                                                                   \
>      planar_rgbf32_to_uv(dstU, dstV, src, w, endian, rgb2yuv);                                       \
>  }                                                                                                   \
>  static void planar_rgbf32##endian_name##_to_a(uint8_t *dst, const uint8_t *src[4],                  \
> -                                              int w, int32_t *rgb2yuv)                              \
> +                                              int w, int32_t *rgb2yuv, void *opq)                   \
>  {                                                                                                   \
>      planar_rgbf32_to_a(dst, src, w, endian, rgb2yuv);                                               \
>  }                                                                                                   \
>  static void grayf32##endian_name##ToY16_c(uint8_t *dst, const uint8_t *src,                         \
>                                            const uint8_t *unused1, const uint8_t *unused2,           \
> -                                          int width, uint32_t *unused)                              \
> +                                          int width, uint32_t *unused, void *opq)                   \
>  {                                                                                                   \
>      grayf32ToY16_c(dst, src, unused1, unused2, width, endian, unused);                              \
>  }
> diff --git a/libswscale/swscale_internal.h b/libswscale/swscale_internal.h
> index e118b54457..9ab542933f 100644
> --- a/libswscale/swscale_internal.h
> +++ b/libswscale/swscale_internal.h
> @@ -559,26 +559,31 @@ typedef struct SwsContext {
>      yuv2packedX_fn yuv2packedX;
>      yuv2anyX_fn yuv2anyX;
>  
> +    /// Opaque data pointer passed to all input functions.
> +    void *input_opaque;
> +
>      /// Unscaled conversion of luma plane to YV12 for horizontal scaler.
>      void (*lumToYV12)(uint8_t *dst, const uint8_t *src, const uint8_t *src2, const uint8_t *src3,
> -                      int width, uint32_t *pal);
> +                      int width, uint32_t *pal, void *opq);
>      /// Unscaled conversion of alpha plane to YV12 for horizontal scaler.
>      void (*alpToYV12)(uint8_t *dst, const uint8_t *src, const uint8_t *src2, const uint8_t *src3,
> -                      int width, uint32_t *pal);
> +                      int width, uint32_t *pal, void *opq);
>      /// Unscaled conversion of chroma planes to YV12 for horizontal scaler.
>      void (*chrToYV12)(uint8_t *dstU, uint8_t *dstV,
>                        const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,
> -                      int width, uint32_t *pal);
> +                      int width, uint32_t *pal, void *opq);
>  
>      /**
>       * Functions to read planar input, such as planar RGB, and convert
>       * internally to Y/UV/A.
>       */
>      /** @{ */
> -    void (*readLumPlanar)(uint8_t *dst, const uint8_t *src[4], int width, int32_t *rgb2yuv);
> +    void (*readLumPlanar)(uint8_t *dst, const uint8_t *src[4], int width, int32_t *rgb2yuv,
> +                          void *opq);
>      void (*readChrPlanar)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src[4],
> -                          int width, int32_t *rgb2yuv);
> -    void (*readAlpPlanar)(uint8_t *dst, const uint8_t *src[4], int width, int32_t *rgb2yuv);
> +                          int width, int32_t *rgb2yuv, void *opq);
> +    void (*readAlpPlanar)(uint8_t *dst, const uint8_t *src[4], int width, int32_t *rgb2yuv,
> +                          void *opq);
>      /** @} */
>  
>      /**
> diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c
> index 628f12137c..270798ba3d 100644
> --- a/libswscale/x86/swscale.c
> +++ b/libswscale/x86/swscale.c
> @@ -299,13 +299,13 @@ VSCALE_FUNCS(avx, avx);
>  #define INPUT_Y_FUNC(fmt, opt) \
>  void ff_ ## fmt ## ToY_  ## opt(uint8_t *dst, const uint8_t *src, \
>                                  const uint8_t *unused1, const uint8_t *unused2, \
> -                                int w, uint32_t *unused)
> +                                int w, uint32_t *unused, void *opq)
>  #define INPUT_UV_FUNC(fmt, opt) \
>  void ff_ ## fmt ## ToUV_ ## opt(uint8_t *dstU, uint8_t *dstV, \
>                                  const uint8_t *unused0, \
>                                  const uint8_t *src1, \
>                                  const uint8_t *src2, \
> -                                int w, uint32_t *unused)
> +                                int w, uint32_t *unused, void *opq)
>  #define INPUT_FUNC(fmt, opt) \
>      INPUT_Y_FUNC(fmt, opt); \
>      INPUT_UV_FUNC(fmt, opt)
> @@ -373,15 +373,18 @@ YUV2GBRP_DECL(avx2);
>  
>  #define INPUT_PLANAR_RGB_Y_FN_DECL(fmt, opt)                               \
>  void ff_planar_##fmt##_to_y_##opt(uint8_t *dst,                            \
> -                           const uint8_t *src[4], int w, int32_t *rgb2yuv)
> +                           const uint8_t *src[4], int w, int32_t *rgb2yuv, \
> +                           void *opq)
>  
>  #define INPUT_PLANAR_RGB_UV_FN_DECL(fmt, opt)                              \
>  void ff_planar_##fmt##_to_uv_##opt(uint8_t *dstU, uint8_t *dstV,           \
> -                           const uint8_t *src[4], int w, int32_t *rgb2yuv)
> +                           const uint8_t *src[4], int w, int32_t *rgb2yuv, \
> +                           void *opq)
>  
>  #define INPUT_PLANAR_RGB_A_FN_DECL(fmt, opt)                               \
>  void ff_planar_##fmt##_to_a_##opt(uint8_t *dst,                            \
> -                           const uint8_t *src[4], int w, int32_t *rgb2yuv)
> +                           const uint8_t *src[4], int w, int32_t *rgb2yuv, \
> +                           void *opq)
>  
>  
>  #define INPUT_PLANAR_RGBXX_A_DECL(fmt, opt) \

Don't you need to update the assembly, too? (Do we support anything x86
where the callee has to clean up the stack?)

- Andreas
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [FFmpeg-devel] [PATCH 09/11] avutil/half2float: use native _Float16 if available
  2022-08-10 21:03   ` Andreas Rheinhardt
@ 2022-08-10 21:58     ` Timo Rothenpieler
  2022-08-10 22:02       ` James Almer
  0 siblings, 1 reply; 39+ messages in thread
From: Timo Rothenpieler @ 2022-08-10 21:58 UTC (permalink / raw)
  To: ffmpeg-devel

On 10.08.2022 23:03, Andreas Rheinhardt wrote:
> Timo Rothenpieler:
>> _Float16 support was available on arm/aarch64 for a while, and with gcc
>> 12 was enabled on x86 as long as SSE2 is supported.
>>
>> If the target arch supports f16c, gcc emits fairly efficient assembly,
>> taking advantage of it. This is the case on x86-64-v3 or higher.
>> Without f16c, it emulates it in software using sse2 instructions.
> 
> How is the performance of this emulation compared to our current code?
> And how is the native _Float16 performance compared to the current code?

The performance of the sse2 emulation is actually surprisingly poor, in 
a quick test:

./ffmpeg -s 512x512 -f rawvideo -pix_fmt rgbaf16 -i /dev/zero -vf 
format=yuv444p -f null -

_Float16 full SSE2 emulation:
frame=50074 fps=848 q=-0.0 size=N/A time=00:33:22.96 bitrate=N/A speed=33.9x

_Float16 f16c accelerated (Zen2, --cpu=znver2):
frame=50636 fps=1965 q=-0.0 Lsize=N/A time=00:33:45.40 bitrate=N/A 
speed=78.6x

classic half2float full software implementation:
frame=49926 fps=1605 q=-0.0 Lsize=N/A time=00:33:17.00 bitrate=N/A 
speed=64.2x

Unfortunately I don't see a good way to runtime-detect the presence of 
f16c without going full self-written assembly, which would diminish the 
compiler's ability to take advantage of f16c only ever operating on 4 or 
8 values at a time.
But the HAVE_FLOAT16 checks could be paired with a check for __F16C__, 
which seems to universally be the established define for "the code is 
being built with f16c optimizations".

That at least avoids the case of the apparently quite slow sse2 emulation.
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [FFmpeg-devel] [PATCH 10/11] swscale: add SwsContext parameter to input functions
  2022-08-10 21:55   ` Andreas Rheinhardt
@ 2022-08-10 22:02     ` Timo Rothenpieler
  0 siblings, 0 replies; 39+ messages in thread
From: Timo Rothenpieler @ 2022-08-10 22:02 UTC (permalink / raw)
  To: ffmpeg-devel

On 10.08.2022 23:55, Andreas Rheinhardt wrote:
> Don't you need to update the assembly, too? (Do we support anything x86
> where the callee has to clean up the stack?)

We concluded on IRC that that's not necessary.
The assembly is pretty hard written to be cdecl, in which the caller 
cleans up the stack.

I tried adding the parameter there, and broke it in the process.
But fate still passes and the assembly isn't suddenly gonna change 
calling convention.
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [FFmpeg-devel] [PATCH 09/11] avutil/half2float: use native _Float16 if available
  2022-08-10 21:58     ` Timo Rothenpieler
@ 2022-08-10 22:02       ` James Almer
  0 siblings, 0 replies; 39+ messages in thread
From: James Almer @ 2022-08-10 22:02 UTC (permalink / raw)
  To: ffmpeg-devel



On 8/10/2022 6:58 PM, Timo Rothenpieler wrote:
> On 10.08.2022 23:03, Andreas Rheinhardt wrote:
>> Timo Rothenpieler:
>>> _Float16 support was available on arm/aarch64 for a while, and with gcc
>>> 12 was enabled on x86 as long as SSE2 is supported.
>>>
>>> If the target arch supports f16c, gcc emits fairly efficient assembly,
>>> taking advantage of it. This is the case on x86-64-v3 or higher.
>>> Without f16c, it emulates it in software using sse2 instructions.
>>
>> How is the performance of this emulation compared to our current code?
>> And how is the native _Float16 performance compared to the current code?
> 
> The performance of the sse2 emulation is actually surprisingly poor, in 
> a quick test:
> 
> ./ffmpeg -s 512x512 -f rawvideo -pix_fmt rgbaf16 -i /dev/zero -vf 
> format=yuv444p -f null -
> 
> _Float16 full SSE2 emulation:
> frame=50074 fps=848 q=-0.0 size=N/A time=00:33:22.96 bitrate=N/A 
> speed=33.9x
> 
> _Float16 f16c accelerated (Zen2, --cpu=znver2):
> frame=50636 fps=1965 q=-0.0 Lsize=N/A time=00:33:45.40 bitrate=N/A 
> speed=78.6x
> 
> classic half2float full software implementation:
> frame=49926 fps=1605 q=-0.0 Lsize=N/A time=00:33:17.00 bitrate=N/A 
> speed=64.2x
> 
> Unfortunately I don't see a good way to runtime-detect the presence of 
> f16c without going full self-written assembly, which would diminish the 
> compiler's ability to take advantage of f16c only ever operating on 4 or 
> 8 values at a time.
> But the HAVE_FLOAT16 checks could be paired with a check for __F16C__, 
> which seems to universally be the established define for "the code is 
> being built with f16c optimizations".

That should do it, yes. We do check for __SSE__ and similar for some 
other lavu functions after all.

> 
> That at least avoids the case of the apparently quite slow sse2 emulation.
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> 
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [FFmpeg-devel] [PATCH 06/11] avutil/half2float: adjust conversion of NaN
  2022-08-10 21:53         ` Timo Rothenpieler
@ 2022-08-10 22:14           ` Mark Reid
  2022-08-10 22:18             ` James Almer
  0 siblings, 1 reply; 39+ messages in thread
From: Mark Reid @ 2022-08-10 22:14 UTC (permalink / raw)
  To: FFmpeg development discussions and patches

On Wed, Aug 10, 2022 at 2:53 PM Timo Rothenpieler <timo@rothenpieler.org>
wrote:

> On 10.08.2022 23:43, Andreas Rheinhardt wrote:
> > Timo Rothenpieler:
> >> On 10.08.2022 23:24, Andreas Rheinhardt wrote:
> >>> Timo Rothenpieler:
> >>>> IEEE-754 differentiates two different kind of NaNs.
> >>>> Quiet and Signaling ones. They are differentiated by the MSB of the
> >>>> mantissa.
> >>>>
> >>>> For whatever reason, actual hardware conversion of half to single
> always
> >>>> sets the signaling bit to 1 if the mantissa is != 0, and to 0 if it's
> 0.
> >>>> So our code has to follow suite or fate-testing hardware float16 will
> be
> >>>> impossible.
> >>>
> >>> What does the exr spec say about quiet and signaling nans?
> >>
> >> Not sure how exr would be involved here.
> >
> > Your patch changes the output of an exr-test. The output of the exr
> > decoder is presumably determined by the exr spec. There is after all the
> > possibility that what hardware does in hardware and what this patch does
> > in software is incompatible with what exr specifies.
>
> The exr spec just says something along the lines of analogous to
> ieee-754 floats:
>
> https://openexr.readthedocs.io/en/latest/TechnicalIntroduction.html?highlight=ieee#the-half-data-type
> It barely ever mentions NaNs, other than that they exist. Which makes
> sense, given they don't typically appear in images.
>
> The only output changed is that for how NaNs are converted.
> And given the cross-validation with multiple hardware implementations,
> I'm confident that it's correct.
>

Here is the OpenEXR implementation:
https://github.com/AcademySoftwareFoundation/Imath/blob/main/src/Imath/toFloat.cpp#L78
It has been a while since I checked, but I believe the current implementation
matches this.

The fate sample rgb_scanline_zip_half_float_0x0_to_0xFFFF.exr was created
to test this.
It contains every possible float16 value.


> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
>
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [FFmpeg-devel] [PATCH 06/11] avutil/half2float: adjust conversion of NaN
  2022-08-10 22:14           ` Mark Reid
@ 2022-08-10 22:18             ` James Almer
  2022-08-10 22:28               ` Timo Rothenpieler
  0 siblings, 1 reply; 39+ messages in thread
From: James Almer @ 2022-08-10 22:18 UTC (permalink / raw)
  To: ffmpeg-devel

On 8/10/2022 7:14 PM, Mark Reid wrote:
> On Wed, Aug 10, 2022 at 2:53 PM Timo Rothenpieler <timo@rothenpieler.org>
> wrote:
> 
>> On 10.08.2022 23:43, Andreas Rheinhardt wrote:
>>> Timo Rothenpieler:
>>>> On 10.08.2022 23:24, Andreas Rheinhardt wrote:
>>>>> Timo Rothenpieler:
>>>>>> IEEE-754 differentiates two different kind of NaNs.
>>>>>> Quiet and Signaling ones. They are differentiated by the MSB of the
>>>>>> mantissa.
>>>>>>
>>>>>> For whatever reason, actual hardware conversion of half to single
>> always
>>>>>> sets the signaling bit to 1 if the mantissa is != 0, and to 0 if it's
>> 0.
>>>>>> So our code has to follow suite or fate-testing hardware float16 will
>> be
>>>>>> impossible.
>>>>>
>>>>> What does the exr spec say about quiet and signaling nans?
>>>>
>>>> Not sure how exr would be involved here.
>>>
>>> Your patch changes the output of an exr-test. The output of the exr
>>> decoder is presumably determined by the exr spec. There is after all the
>>> possibility that what hardware does in hardware and what this patch does
>>> in software is incompatible with what exr specifies.
>>
>> The exr spec just says something along the lines of analogous to
>> ieee-754 floats:
>>
>> https://openexr.readthedocs.io/en/latest/TechnicalIntroduction.html?highlight=ieee#the-half-data-type
>> It barely ever mentions NaNs, other than that they exist. Which makes
>> sense, given they don't typically appear in images.
>>
>> The only output changed is that for how NaNs are converted.
>> And given the cross-validation with multiple hardware implementations,
>> I'm confident that it's correct.
>>
> 
> here is openexr implementation
> https://github.com/AcademySoftwareFoundation/Imath/blob/main/src/Imath/toFloat.cpp#L78
> It has been a while since I check but I believe the current implementation
> matches this.
> 
> The fate sample: rgb_scanline_zip_half_float_0x0_to_0xFFFF.exr was created
> to test this.
> it contains every possible float16 value

Then maybe the current implementation should be moved back to exr (it 
used to be internal to exr until Paul made it standalone), so this lavu 
module can match the existing hardware implementations of IEEE-754 half 
floats for the purpose of relevant pixel format support.
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [FFmpeg-devel] [PATCH 06/11] avutil/half2float: adjust conversion of NaN
  2022-08-10 22:18             ` James Almer
@ 2022-08-10 22:28               ` Timo Rothenpieler
  2022-08-10 22:37                 ` Mark Reid
  0 siblings, 1 reply; 39+ messages in thread
From: Timo Rothenpieler @ 2022-08-10 22:28 UTC (permalink / raw)
  To: ffmpeg-devel

On 11.08.2022 00:18, James Almer wrote:
> Then maybe the current implementation should be moved back to exr (it 
> used to be internal to exr until Paul made it standalone), so this lavu 
> module can match the existing hardware implementations of IEEE-754 half 
> floats for the purpose of relevant pixel format support.

That doesn't seem necessary to me.
The values produced before and now are both correct, just different.
But there is no functional difference in the values it produces.

Duplicating the entirety of that code just for that seems extremely 
unnecessary.
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [FFmpeg-devel] [PATCH 06/11] avutil/half2float: adjust conversion of NaN
  2022-08-10 22:28               ` Timo Rothenpieler
@ 2022-08-10 22:37                 ` Mark Reid
  2022-08-10 22:55                   ` Timo Rothenpieler
  0 siblings, 1 reply; 39+ messages in thread
From: Mark Reid @ 2022-08-10 22:37 UTC (permalink / raw)
  To: FFmpeg development discussions and patches

On Wed, Aug 10, 2022 at 3:28 PM Timo Rothenpieler <timo@rothenpieler.org>
wrote:

> On 11.08.2022 00:18, James Almer wrote:
> > Then maybe the current implementation should be moved back to exr (it
> > used to be internal to exr until Paul made it standalone), so this lavu
> > module can match the existing hardware implementations of IEEE-754 half
> > floats for the purpose of relevant pixel format support.
>
> That doesn't seem necessary to me.
> The values produced before and now are both correct, just different.
> But there is no functional difference in the values it produces.
>
> Duplicating the entirety of that code just for that seems extremely
> unnecessary.
>

openexr does note the Intel implementation's difference here
https://github.com/AcademySoftwareFoundation/Imath/blob/main/src/Imath/half.h#L288


> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
>
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 39+ messages in thread

* [FFmpeg-devel] [PATCH v2 09/11] avutil/half2float: use native _Float16 if available
  2022-08-10 20:47 ` [FFmpeg-devel] [PATCH 09/11] avutil/half2float: use native _Float16 if available Timo Rothenpieler
  2022-08-10 21:03   ` Andreas Rheinhardt
@ 2022-08-10 22:51   ` Timo Rothenpieler
  2022-08-11  0:14     ` James Almer
  1 sibling, 1 reply; 39+ messages in thread
From: Timo Rothenpieler @ 2022-08-10 22:51 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: Timo Rothenpieler

_Float16 support was available on arm/aarch64 for a while, and with gcc
12 was enabled on x86 as long as SSE2 is supported.

If the target arch supports f16c, gcc emits fairly efficient assembly,
taking advantage of it. This is the case on x86-64-v3 or higher.
Same goes on arm, which has native float16 support.
On x86, without f16c, it emulates it in software using sse2 instructions.

This has been shown to perform rather poorly:

_Float16 full SSE2 emulation:
frame=50074 fps=848 q=-0.0 size=N/A time=00:33:22.96 bitrate=N/A speed=33.9x

_Float16 f16c accelerated (Zen2, --cpu=znver2):
frame=50636 fps=1965 q=-0.0 Lsize=N/A time=00:33:45.40 bitrate=N/A speed=78.6x

classic half2float full software implementation:
frame=49926 fps=1605 q=-0.0 Lsize=N/A time=00:33:17.00 bitrate=N/A speed=64.2x

Hence an additional check was introduced, that only enables use of
_Float16 on x86 if f16c is being utilized.

On aarch64, a similar uplift in performance is seen:

RPi4 half2float full software implementation:
frame= 6088 fps=126 q=-0.0 Lsize=N/A time=00:04:03.48 bitrate=N/A speed=5.06x

RPi4 _Float16:
frame= 6103 fps=158 q=-0.0 Lsize=N/A time=00:04:04.08 bitrate=N/A speed=6.32x

Since arm/aarch64 always natively support 16 bit floats, it can always
be considered fast there.

I'm not aware of any additional platforms that currently support
_Float16. And if there are, they should be considered non-fast until
proven fast.
---
 configure              | 13 +++++++++++++
 libavutil/float2half.c |  2 ++
 libavutil/float2half.h | 16 ++++++++++++++++
 libavutil/half2float.c |  4 ++++
 libavutil/half2float.h | 16 ++++++++++++++++
 5 files changed, 51 insertions(+)

diff --git a/configure b/configure
index 6761d0cb32..6ede9a5a8f 100755
--- a/configure
+++ b/configure
@@ -2143,6 +2143,8 @@ ARCH_FEATURES="
     fast_64bit
     fast_clz
     fast_cmov
+    fast_float16
+    float16
     local_aligned
     simd_align_16
     simd_align_32
@@ -5125,6 +5127,8 @@ elif enabled arm; then
             ;;
     esac
 
+    test_cflags -mfp16-format=ieee && add_cflags -mfp16-format=ieee
+
 elif enabled avr32; then
 
     case $cpu in
@@ -6229,6 +6233,15 @@ check_builtin sync_val_compare_and_swap "" "int *ptr; int oldval, newval; __sync
 check_builtin gmtime_r time.h "time_t *time; struct tm *tm; gmtime_r(time, tm)"
 check_builtin localtime_r time.h "time_t *time; struct tm *tm; localtime_r(time, tm)"
 
+check_builtin float16 "" "_Float16 f16var"
+if enabled float16; then
+    if enabled x86; then
+        test_cpp_condition stddef.h "defined(__F16C__)" && enable fast_float16
+    elif enabled arm || enabled aarch64; then
+        enable fast_float16
+    fi
+fi
+
 case "$custom_allocator" in
     jemalloc)
         # jemalloc by default does not use a prefix
diff --git a/libavutil/float2half.c b/libavutil/float2half.c
index dba14cef5d..7002612194 100644
--- a/libavutil/float2half.c
+++ b/libavutil/float2half.c
@@ -20,6 +20,7 @@
 
 void ff_init_float2half_tables(float2half_tables *t)
 {
+#if !HAVE_FAST_FLOAT16
     for (int i = 0; i < 256; i++) {
         int e = i - 127;
 
@@ -50,4 +51,5 @@ void ff_init_float2half_tables(float2half_tables *t)
             t->shifttable[i|0x100] = 13;
         }
     }
+#endif
 }
diff --git a/libavutil/float2half.h b/libavutil/float2half.h
index b8c9cdfc4f..437666966b 100644
--- a/libavutil/float2half.h
+++ b/libavutil/float2half.h
@@ -20,21 +20,37 @@
 #define AVUTIL_FLOAT2HALF_H
 
 #include <stdint.h>
+#include "intfloat.h"
+
+#include "config.h"
 
 typedef struct float2half_tables {
+#if HAVE_FAST_FLOAT16
+    uint8_t dummy;
+#else
     uint16_t basetable[512];
     uint8_t shifttable[512];
+#endif
 } float2half_tables;
 
 void ff_init_float2half_tables(float2half_tables *t);
 
 static inline uint16_t float2half(uint32_t f, const float2half_tables *t)
 {
+#if HAVE_FAST_FLOAT16
+    union {
+        _Float16 f;
+        uint16_t i;
+    } u;
+    u.f = av_int2float(f);
+    return u.i;
+#else
     uint16_t h;
 
     h = t->basetable[(f >> 23) & 0x1ff] + ((f & 0x007fffff) >> t->shifttable[(f >> 23) & 0x1ff]);
 
     return h;
+#endif
 }
 
 #endif /* AVUTIL_FLOAT2HALF_H */
diff --git a/libavutil/half2float.c b/libavutil/half2float.c
index baac8e4093..ff198a8187 100644
--- a/libavutil/half2float.c
+++ b/libavutil/half2float.c
@@ -18,6 +18,7 @@
 
 #include "libavutil/half2float.h"
 
+#if !HAVE_FAST_FLOAT16
 static uint32_t convertmantissa(uint32_t i)
 {
     int32_t m = i << 13; // Zero pad mantissa bits
@@ -33,9 +34,11 @@ static uint32_t convertmantissa(uint32_t i)
 
     return m | e; // Return combined number
 }
+#endif
 
 void ff_init_half2float_tables(half2float_tables *t)
 {
+#if !HAVE_FAST_FLOAT16
     t->mantissatable[0] = 0;
     for (int i = 1; i < 1024; i++)
         t->mantissatable[i] = convertmantissa(i);
@@ -60,4 +63,5 @@ void ff_init_half2float_tables(half2float_tables *t)
     t->offsettable[31] = 2048;
     t->offsettable[32] = 0;
     t->offsettable[63] = 2048;
+#endif
 }
diff --git a/libavutil/half2float.h b/libavutil/half2float.h
index cb58e44a1c..57ee8372fe 100644
--- a/libavutil/half2float.h
+++ b/libavutil/half2float.h
@@ -20,22 +20,38 @@
 #define AVUTIL_HALF2FLOAT_H
 
 #include <stdint.h>
+#include "intfloat.h"
+
+#include "config.h"
 
 typedef struct half2float_tables {
+#if HAVE_FAST_FLOAT16
+    uint8_t dummy;
+#else
     uint32_t mantissatable[3072];
     uint32_t exponenttable[64];
     uint16_t offsettable[64];
+#endif
 } half2float_tables;
 
 void ff_init_half2float_tables(half2float_tables *t);
 
 static inline uint32_t half2float(uint16_t h, const half2float_tables *t)
 {
+#if HAVE_FAST_FLOAT16
+    union {
+        _Float16 f;
+        uint16_t i;
+    } u;
+    u.i = h;
+    return av_float2int(u.f);
+#else
     uint32_t f;
 
     f = t->mantissatable[t->offsettable[h >> 10] + (h & 0x3ff)] + t->exponenttable[h >> 10];
 
     return f;
+#endif
 }
 
 #endif /* AVUTIL_HALF2FLOAT_H */
-- 
2.34.1

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [FFmpeg-devel] [PATCH 06/11] avutil/half2float: adjust conversion of NaN
  2022-08-10 22:37                 ` Mark Reid
@ 2022-08-10 22:55                   ` Timo Rothenpieler
  2022-08-11  2:18                     ` Mark Reid
  0 siblings, 1 reply; 39+ messages in thread
From: Timo Rothenpieler @ 2022-08-10 22:55 UTC (permalink / raw)
  To: ffmpeg-devel

On 11.08.2022 00:37, Mark Reid wrote:
> On Wed, Aug 10, 2022 at 3:28 PM Timo Rothenpieler <timo@rothenpieler.org>
> wrote:
> 
>> On 11.08.2022 00:18, James Almer wrote:
>>> Then maybe the current implementation should be moved back to exr (it
>>> used to be internal to exr until Paul made it standalone), so this lavu
>>> module can match the existing hardware implementations of IEEE-754 half
>>> floats for the purpose of relevant pixel format support.
>>
>> That doesn't seem necessary to me.
>> The values produced before and now are both correct, just different.
>> But there is no functional difference in the values it produces.
>>
>> Duplicating the entirety of that code just for that seems extremely
>> unnecessary.
>>
> 
> openexr does note the intel implementations difference here
> https://github.com/AcademySoftwareFoundation/Imath/blob/main/src/Imath/half.h#L288

It's actually quite curious how that came to be.
My natural idea would be that our current code and EXR's code do it right.

But all hardware as well as gcc's software emulation agrees. Makes me 
wonder if it's fully intentional and according to some spec. But I 
couldn't find anything on the matter.
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [FFmpeg-devel] [PATCH v2 09/11] avutil/half2float: use native _Float16 if available
  2022-08-10 22:51   ` [FFmpeg-devel] [PATCH v2 " Timo Rothenpieler
@ 2022-08-11  0:14     ` James Almer
  2022-08-11 11:50       ` Timo Rothenpieler
  0 siblings, 1 reply; 39+ messages in thread
From: James Almer @ 2022-08-11  0:14 UTC (permalink / raw)
  To: ffmpeg-devel

On 8/10/2022 7:51 PM, Timo Rothenpieler wrote:
> _Float16 support was available on arm/aarch64 for a while, and with gcc
> 12 was enabled on x86 as long as SSE2 is supported.
> 
> If the target arch supports f16c, gcc emits fairly efficient assembly,
> taking advantage of it. This is the case on x86-64-v3 or higher.
> Same goes on arm, which has native float16 support.
> On x86, without f16c, it emulates it in software using sse2 instructions.
> 
> This has shown to perform rather poorly:
> 
> _Float16 full SSE2 emulation:
> frame=50074 fps=848 q=-0.0 size=N/A time=00:33:22.96 bitrate=N/A speed=33.9x
> 
> _Float16 f16c accelerated (Zen2, --cpu=znver2):
> frame=50636 fps=1965 q=-0.0 Lsize=N/A time=00:33:45.40 bitrate=N/A speed=78.6x
> 
> classic half2float full software implementation:
> frame=49926 fps=1605 q=-0.0 Lsize=N/A time=00:33:17.00 bitrate=N/A speed=64.2x
> 
> Hence an additional check was introduced, that only enables use of
> _Float16 on x86 if f16c is being utilized.
> 
> On aarch64, a similar uplift in performance is seen:
> 
> RPi4 half2float full software implementation:
> frame= 6088 fps=126 q=-0.0 Lsize=N/A time=00:04:03.48 bitrate=N/A speed=5.06x
> 
> RPi4 _Float16:
> frame= 6103 fps=158 q=-0.0 Lsize=N/A time=00:04:04.08 bitrate=N/A speed=6.32x
> 
> Since arm/aarch64 always natively support 16 bit floats, it can always
> be considered fast there.
> 
> I'm not aware of any additional platforms that currently support
> _Float16. And if there are, they should be considered non-fast until
> proven fast.
> ---
>   configure              | 13 +++++++++++++
>   libavutil/float2half.c |  2 ++
>   libavutil/float2half.h | 16 ++++++++++++++++
>   libavutil/half2float.c |  4 ++++
>   libavutil/half2float.h | 16 ++++++++++++++++
>   5 files changed, 51 insertions(+)
> 
> diff --git a/configure b/configure
> index 6761d0cb32..6ede9a5a8f 100755
> --- a/configure
> +++ b/configure
> @@ -2143,6 +2143,8 @@ ARCH_FEATURES="
>       fast_64bit
>       fast_clz
>       fast_cmov
> +    fast_float16
> +    float16

If HAVE_FLOAT16 is not going to be used, then don't export it here. 
Leave it as a configure internal variable.

>       local_aligned
>       simd_align_16
>       simd_align_32
> @@ -5125,6 +5127,8 @@ elif enabled arm; then
>               ;;
>       esac
>   
> +    test_cflags -mfp16-format=ieee && add_cflags -mfp16-format=ieee
> +
>   elif enabled avr32; then
>   
>       case $cpu in
> @@ -6229,6 +6233,15 @@ check_builtin sync_val_compare_and_swap "" "int *ptr; int oldval, newval; __sync
>   check_builtin gmtime_r time.h "time_t *time; struct tm *tm; gmtime_r(time, tm)"
>   check_builtin localtime_r time.h "time_t *time; struct tm *tm; localtime_r(time, tm)"
>   
> +check_builtin float16 "" "_Float16 f16var"
> +if enabled float16; then
> +    if enabled x86; then
> +        test_cpp_condition stddef.h "defined(__F16C__)" && enable fast_float16
> +    elif enabled arm || enabled aarch64; then
> +        enable fast_float16
> +    fi
> +fi
> +
>   case "$custom_allocator" in
>       jemalloc)
>           # jemalloc by default does not use a prefix
> diff --git a/libavutil/float2half.c b/libavutil/float2half.c
> index dba14cef5d..7002612194 100644
> --- a/libavutil/float2half.c
> +++ b/libavutil/float2half.c
> @@ -20,6 +20,7 @@
>   
>   void ff_init_float2half_tables(float2half_tables *t)
>   {
> +#if !HAVE_FAST_FLOAT16
>       for (int i = 0; i < 256; i++) {
>           int e = i - 127;
>   
> @@ -50,4 +51,5 @@ void ff_init_float2half_tables(float2half_tables *t)
>               t->shifttable[i|0x100] = 13;
>           }
>       }
> +#endif
>   }
> diff --git a/libavutil/float2half.h b/libavutil/float2half.h
> index b8c9cdfc4f..437666966b 100644
> --- a/libavutil/float2half.h
> +++ b/libavutil/float2half.h
> @@ -20,21 +20,37 @@
>   #define AVUTIL_FLOAT2HALF_H
>   
>   #include <stdint.h>
> +#include "intfloat.h"
> +
> +#include "config.h"
>   
>   typedef struct float2half_tables {
> +#if HAVE_FAST_FLOAT16
> +    uint8_t dummy;
> +#else
>       uint16_t basetable[512];
>       uint8_t shifttable[512];
> +#endif
>   } float2half_tables;
>   
>   void ff_init_float2half_tables(float2half_tables *t);
>   
>   static inline uint16_t float2half(uint32_t f, const float2half_tables *t)
>   {
> +#if HAVE_FAST_FLOAT16
> +    union {
> +        _Float16 f;
> +        uint16_t i;
> +    } u;
> +    u.f = av_int2float(f);
> +    return u.i;
> +#else
>       uint16_t h;
>   
>       h = t->basetable[(f >> 23) & 0x1ff] + ((f & 0x007fffff) >> t->shifttable[(f >> 23) & 0x1ff]);
>   
>       return h;
> +#endif
>   }
>   
>   #endif /* AVUTIL_FLOAT2HALF_H */
> diff --git a/libavutil/half2float.c b/libavutil/half2float.c
> index baac8e4093..ff198a8187 100644
> --- a/libavutil/half2float.c
> +++ b/libavutil/half2float.c
> @@ -18,6 +18,7 @@
>   
>   #include "libavutil/half2float.h"
>   
> +#if !HAVE_FAST_FLOAT16
>   static uint32_t convertmantissa(uint32_t i)
>   {
>       int32_t m = i << 13; // Zero pad mantissa bits
> @@ -33,9 +34,11 @@ static uint32_t convertmantissa(uint32_t i)
>   
>       return m | e; // Return combined number
>   }
> +#endif
>   
>   void ff_init_half2float_tables(half2float_tables *t)
>   {
> +#if !HAVE_FAST_FLOAT16
>       t->mantissatable[0] = 0;
>       for (int i = 1; i < 1024; i++)
>           t->mantissatable[i] = convertmantissa(i);
> @@ -60,4 +63,5 @@ void ff_init_half2float_tables(half2float_tables *t)
>       t->offsettable[31] = 2048;
>       t->offsettable[32] = 0;
>       t->offsettable[63] = 2048;
> +#endif
>   }
> diff --git a/libavutil/half2float.h b/libavutil/half2float.h
> index cb58e44a1c..57ee8372fe 100644
> --- a/libavutil/half2float.h
> +++ b/libavutil/half2float.h
> @@ -20,22 +20,38 @@
>   #define AVUTIL_HALF2FLOAT_H
>   
>   #include <stdint.h>
> +#include "intfloat.h"
> +
> +#include "config.h"
>   
>   typedef struct half2float_tables {
> +#if HAVE_FAST_FLOAT16
> +    uint8_t dummy;
> +#else
>       uint32_t mantissatable[3072];
>       uint32_t exponenttable[64];
>       uint16_t offsettable[64];
> +#endif
>   } half2float_tables;
>   
>   void ff_init_half2float_tables(half2float_tables *t);
>   
>   static inline uint32_t half2float(uint16_t h, const half2float_tables *t)
>   {
> +#if HAVE_FAST_FLOAT16
> +    union {
> +        _Float16 f;
> +        uint16_t i;
> +    } u;
> +    u.i = h;
> +    return av_float2int(u.f);
> +#else
>       uint32_t f;
>   
>       f = t->mantissatable[t->offsettable[h >> 10] + (h & 0x3ff)] + t->exponenttable[h >> 10];
>   
>       return f;
> +#endif
>   }
>   
>   #endif /* AVUTIL_HALF2FLOAT_H */
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [FFmpeg-devel] [PATCH 06/11] avutil/half2float: adjust conversion of NaN
  2022-08-10 22:55                   ` Timo Rothenpieler
@ 2022-08-11  2:18                     ` Mark Reid
  0 siblings, 0 replies; 39+ messages in thread
From: Mark Reid @ 2022-08-11  2:18 UTC (permalink / raw)
  To: FFmpeg development discussions and patches

On Wed, Aug 10, 2022 at 3:56 PM Timo Rothenpieler <timo@rothenpieler.org>
wrote:

> On 11.08.2022 00:37, Mark Reid wrote:
> > On Wed, Aug 10, 2022 at 3:28 PM Timo Rothenpieler <timo@rothenpieler.org
> >
> > wrote:
> >
> >> On 11.08.2022 00:18, James Almer wrote:
> >>> Then maybe the current implementation should be moved back to exr (it
> >>> used to be internal to exr until Paul made it standalone), so this lavu
> >>> module can match the existing hardware implementations of IEEE-754 half
> >>> floats for the purpose of relevant pixel format support.
> >>
> >> That doesn't seem necessary to me.
> >> The values produced before and now are both correct, just different.
> >> But there is no functional difference in the values it produces.
> >>
> >> Duplicating the entirety of that code just for that seems extremely
> >> unnecessary.
> >>
> >
> > openexr does note the intel implementations difference here
> >
> https://github.com/AcademySoftwareFoundation/Imath/blob/main/src/Imath/half.h#L288
>
> It's actually quite curious how that came to be.
> My natural idea would be that our current and EXRs code does it right.
>
> But all hardware as well as gccs software emulation agrees. Makes me
> wonder if it's fully intentional and according to some spec. But I
> couldn't find anything on the matter.
>

Ya I'm curious too now. I might ask the exr folks.
I noticed the difference when I fixed the subnormal bug a couple years ago.
That is why I changed it to match the openexr's halfToFloat() version that
preserves the NaN values instead of changing them.
This new behavior might have been what it was before I changed it
in 8d19b3c4a5.

I looked at the intel architecture developer's manual and sadly it only
describes the float32 to float16 algorithm.
The change back seems pretty benign to me too. The openexr implementation
is relying on the hardware instruction too if it can.
I don't know what one would do with the exact NaN value anyway.


_______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
>
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [FFmpeg-devel] [PATCH v2 09/11] avutil/half2float: use native _Float16 if available
  2022-08-11  0:14     ` James Almer
@ 2022-08-11 11:50       ` Timo Rothenpieler
  0 siblings, 0 replies; 39+ messages in thread
From: Timo Rothenpieler @ 2022-08-11 11:50 UTC (permalink / raw)
  To: ffmpeg-devel

On 11/08/2022 02:14, James Almer wrote:
> On 8/10/2022 7:51 PM, Timo Rothenpieler wrote:
>> _Float16 support was available on arm/aarch64 for a while, and with gcc
>> 12 was enabled on x86 as long as SSE2 is supported.
>>
>> If the target arch supports f16c, gcc emits fairly efficient assembly,
>> taking advantage of it. This is the case on x86-64-v3 or higher.
>> Same goes on arm, which has native float16 support.
>> On x86, without f16c, it emulates it in software using sse2 instructions.
>>
>> This has shown to perform rather poorly:
>>
>> _Float16 full SSE2 emulation:
>> frame=50074 fps=848 q=-0.0 size=N/A time=00:33:22.96 bitrate=N/A 
>> speed=33.9x
>>
>> _Float16 f16c accelerated (Zen2, --cpu=znver2):
>> frame=50636 fps=1965 q=-0.0 Lsize=N/A time=00:33:45.40 bitrate=N/A 
>> speed=78.6x
>>
>> classic half2float full software implementation:
>> frame=49926 fps=1605 q=-0.0 Lsize=N/A time=00:33:17.00 bitrate=N/A 
>> speed=64.2x
>>
>> Hence an additional check was introduced, that only enables use of
>> _Float16 on x86 if f16c is being utilized.
>>
>> On aarch64, a similar uplift in performance is seen:
>>
>> RPi4 half2float full software implementation:
>> frame= 6088 fps=126 q=-0.0 Lsize=N/A time=00:04:03.48 bitrate=N/A 
>> speed=5.06x
>>
>> RPi4 _Float16:
>> frame= 6103 fps=158 q=-0.0 Lsize=N/A time=00:04:04.08 bitrate=N/A 
>> speed=6.32x
>>
>> Since arm/aarch64 always natively support 16 bit floats, it can always
>> be considered fast there.
>>
>> I'm not aware of any additional platforms that currently support
>> _Float16. And if there are, they should be considered non-fast until
>> proven fast.
>> ---
>>   configure              | 13 +++++++++++++
>>   libavutil/float2half.c |  2 ++
>>   libavutil/float2half.h | 16 ++++++++++++++++
>>   libavutil/half2float.c |  4 ++++
>>   libavutil/half2float.h | 16 ++++++++++++++++
>>   5 files changed, 51 insertions(+)
>>
>> diff --git a/configure b/configure
>> index 6761d0cb32..6ede9a5a8f 100755
>> --- a/configure
>> +++ b/configure
>> @@ -2143,6 +2143,8 @@ ARCH_FEATURES="
>>       fast_64bit
>>       fast_clz
>>       fast_cmov
>> +    fast_float16
>> +    float16
> 
> If HAVE_FLOAT16 is not going to be used, then don't export it here. 
> Leave it as a configure internal variable.
> 

Good point, fixed locally.

>>       local_aligned
>>       simd_align_16
>>       simd_align_32
>> @@ -5125,6 +5127,8 @@ elif enabled arm; then
>>               ;;
>>       esac
>> +    test_cflags -mfp16-format=ieee && add_cflags -mfp16-format=ieee
>> +
>>   elif enabled avr32; then
>>       case $cpu in
>> @@ -6229,6 +6233,15 @@ check_builtin sync_val_compare_and_swap "" "int 
>> *ptr; int oldval, newval; __sync
>>   check_builtin gmtime_r time.h "time_t *time; struct tm *tm; 
>> gmtime_r(time, tm)"
>>   check_builtin localtime_r time.h "time_t *time; struct tm *tm; 
>> localtime_r(time, tm)"
>> +check_builtin float16 "" "_Float16 f16var"
>> +if enabled float16; then
>> +    if enabled x86; then
>> +        test_cpp_condition stddef.h "defined(__F16C__)" && enable 
>> fast_float16
>> +    elif enabled arm || enabled aarch64; then
>> +        enable fast_float16
>> +    fi
>> +fi
>> +
>>   case "$custom_allocator" in
>>       jemalloc)
>>           # jemalloc by default does not use a prefix
>> diff --git a/libavutil/float2half.c b/libavutil/float2half.c
>> index dba14cef5d..7002612194 100644
>> --- a/libavutil/float2half.c
>> +++ b/libavutil/float2half.c
>> @@ -20,6 +20,7 @@
>>   void ff_init_float2half_tables(float2half_tables *t)
>>   {
>> +#if !HAVE_FAST_FLOAT16
>>       for (int i = 0; i < 256; i++) {
>>           int e = i - 127;
>> @@ -50,4 +51,5 @@ void ff_init_float2half_tables(float2half_tables *t)
>>               t->shifttable[i|0x100] = 13;
>>           }
>>       }
>> +#endif
>>   }
>> diff --git a/libavutil/float2half.h b/libavutil/float2half.h
>> index b8c9cdfc4f..437666966b 100644
>> --- a/libavutil/float2half.h
>> +++ b/libavutil/float2half.h
>> @@ -20,21 +20,37 @@
>>   #define AVUTIL_FLOAT2HALF_H
>>   #include <stdint.h>
>> +#include "intfloat.h"
>> +
>> +#include "config.h"
>>   typedef struct float2half_tables {
>> +#if HAVE_FAST_FLOAT16
>> +    uint8_t dummy;
>> +#else
>>       uint16_t basetable[512];
>>       uint8_t shifttable[512];
>> +#endif
>>   } float2half_tables;
>>   void ff_init_float2half_tables(float2half_tables *t);
>>   static inline uint16_t float2half(uint32_t f, const 
>> float2half_tables *t)
>>   {
>> +#if HAVE_FAST_FLOAT16
>> +    union {
>> +        _Float16 f;
>> +        uint16_t i;
>> +    } u;
>> +    u.f = av_int2float(f);
>> +    return u.i;
>> +#else
>>       uint16_t h;
>>       h = t->basetable[(f >> 23) & 0x1ff] + ((f & 0x007fffff) >> 
>> t->shifttable[(f >> 23) & 0x1ff]);
>>       return h;
>> +#endif
>>   }
>>   #endif /* AVUTIL_FLOAT2HALF_H */
>> diff --git a/libavutil/half2float.c b/libavutil/half2float.c
>> index baac8e4093..ff198a8187 100644
>> --- a/libavutil/half2float.c
>> +++ b/libavutil/half2float.c
>> @@ -18,6 +18,7 @@
>>   #include "libavutil/half2float.h"
>> +#if !HAVE_FAST_FLOAT16
>>   static uint32_t convertmantissa(uint32_t i)
>>   {
>>       int32_t m = i << 13; // Zero pad mantissa bits
>> @@ -33,9 +34,11 @@ static uint32_t convertmantissa(uint32_t i)
>>       return m | e; // Return combined number
>>   }
>> +#endif
>>   void ff_init_half2float_tables(half2float_tables *t)
>>   {
>> +#if !HAVE_FAST_FLOAT16
>>       t->mantissatable[0] = 0;
>>       for (int i = 1; i < 1024; i++)
>>           t->mantissatable[i] = convertmantissa(i);
>> @@ -60,4 +63,5 @@ void ff_init_half2float_tables(half2float_tables *t)
>>       t->offsettable[31] = 2048;
>>       t->offsettable[32] = 0;
>>       t->offsettable[63] = 2048;
>> +#endif
>>   }
>> diff --git a/libavutil/half2float.h b/libavutil/half2float.h
>> index cb58e44a1c..57ee8372fe 100644
>> --- a/libavutil/half2float.h
>> +++ b/libavutil/half2float.h
>> @@ -20,22 +20,38 @@
>>   #define AVUTIL_HALF2FLOAT_H
>>   #include <stdint.h>
>> +#include "intfloat.h"
>> +
>> +#include "config.h"
>>   typedef struct half2float_tables {
>> +#if HAVE_FAST_FLOAT16
>> +    uint8_t dummy;
>> +#else
>>       uint32_t mantissatable[3072];
>>       uint32_t exponenttable[64];
>>       uint16_t offsettable[64];
>> +#endif
>>   } half2float_tables;
>>   void ff_init_half2float_tables(half2float_tables *t);
>>   static inline uint32_t half2float(uint16_t h, const 
>> half2float_tables *t)
>>   {
>> +#if HAVE_FAST_FLOAT16
>> +    union {
>> +        _Float16 f;
>> +        uint16_t i;
>> +    } u;
>> +    u.i = h;
>> +    return av_float2int(u.f);
>> +#else
>>       uint32_t f;
>>       f = t->mantissatable[t->offsettable[h >> 10] + (h & 0x3ff)] + 
>> t->exponenttable[h >> 10];
>>       return f;
>> +#endif
>>   }
>>   #endif /* AVUTIL_HALF2FLOAT_H */
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> 
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [FFmpeg-devel] [PATCH 08/11] avutil/half2float: move non-inline init code out of header
  2022-08-10 20:47 ` [FFmpeg-devel] [PATCH 08/11] avutil/half2float: move non-inline init code out of header Timo Rothenpieler
@ 2022-08-11 20:46   ` Michael Niedermayer
  2022-08-11 20:50     ` Andreas Rheinhardt
  0 siblings, 1 reply; 39+ messages in thread
From: Michael Niedermayer @ 2022-08-11 20:46 UTC (permalink / raw)
  To: FFmpeg development discussions and patches


[-- Attachment #1.1: Type: text/plain, Size: 3467 bytes --]

On Wed, Aug 10, 2022 at 10:47:09PM +0200, Timo Rothenpieler wrote:
> ---
>  libavcodec/Makefile     |  8 +++---
>  libavcodec/exr.c        |  2 +-
>  libavcodec/exrenc.c     |  2 +-
>  libavcodec/float2half.c | 19 +++++++++++++
>  libavcodec/half2float.c | 19 +++++++++++++
>  libavcodec/pnmdec.c     |  2 +-
>  libavcodec/pnmenc.c     |  2 +-
>  libavutil/float2half.c  | 53 ++++++++++++++++++++++++++++++++++
>  libavutil/float2half.h  | 36 ++---------------------
>  libavutil/half2float.c  | 63 +++++++++++++++++++++++++++++++++++++++++
>  libavutil/half2float.h  | 46 ++----------------------------
>  11 files changed, 166 insertions(+), 86 deletions(-)
>  create mode 100644 libavcodec/float2half.c
>  create mode 100644 libavcodec/half2float.c
>  create mode 100644 libavutil/float2half.c
>  create mode 100644 libavutil/half2float.c
> 
> diff --git a/libavcodec/Makefile b/libavcodec/Makefile
> index 029f1bad3d..cb80f73d99 100644
> --- a/libavcodec/Makefile
> +++ b/libavcodec/Makefile
> @@ -337,8 +337,8 @@ OBJS-$(CONFIG_EIGHTSVX_FIB_DECODER)    += 8svx.o
>  OBJS-$(CONFIG_ESCAPE124_DECODER)       += escape124.o
>  OBJS-$(CONFIG_ESCAPE130_DECODER)       += escape130.o
>  OBJS-$(CONFIG_EVRC_DECODER)            += evrcdec.o acelp_vectors.o lsp.o
> -OBJS-$(CONFIG_EXR_DECODER)             += exr.o exrdsp.o
> -OBJS-$(CONFIG_EXR_ENCODER)             += exrenc.o
> +OBJS-$(CONFIG_EXR_DECODER)             += exr.o exrdsp.o half2float.o
> +OBJS-$(CONFIG_EXR_ENCODER)             += exrenc.o float2half.o
>  OBJS-$(CONFIG_FASTAUDIO_DECODER)       += fastaudio.o
>  OBJS-$(CONFIG_FFV1_DECODER)            += ffv1dec.o ffv1.o
>  OBJS-$(CONFIG_FFV1_ENCODER)            += ffv1enc.o ffv1.o
> @@ -570,8 +570,8 @@ OBJS-$(CONFIG_PGMYUV_DECODER)          += pnmdec.o pnm.o
>  OBJS-$(CONFIG_PGMYUV_ENCODER)          += pnmenc.o
>  OBJS-$(CONFIG_PGSSUB_DECODER)          += pgssubdec.o
>  OBJS-$(CONFIG_PGX_DECODER)             += pgxdec.o
> -OBJS-$(CONFIG_PHM_DECODER)             += pnmdec.o pnm.o
> -OBJS-$(CONFIG_PHM_ENCODER)             += pnmenc.o
> +OBJS-$(CONFIG_PHM_DECODER)             += pnmdec.o pnm.o half2float.o
> +OBJS-$(CONFIG_PHM_ENCODER)             += pnmenc.o float2half.o
>  OBJS-$(CONFIG_PHOTOCD_DECODER)         += photocd.o
>  OBJS-$(CONFIG_PICTOR_DECODER)          += pictordec.o cga_data.o
>  OBJS-$(CONFIG_PIXLET_DECODER)          += pixlet.o
> diff --git a/libavcodec/exr.c b/libavcodec/exr.c
> index 825354873d..a3582bfdd6 100644
> --- a/libavcodec/exr.c
> +++ b/libavcodec/exr.c
> @@ -2208,7 +2208,7 @@ static av_cold int decode_init(AVCodecContext *avctx)
>      float one_gamma = 1.0f / s->gamma;
>      avpriv_trc_function trc_func = NULL;
>  
> -    init_half2float_tables(&s->h2f_tables);
> +    ff_init_half2float_tables(&s->h2f_tables);
[...]
> diff --git a/libavutil/float2half.c b/libavutil/float2half.c
> new file mode 100644
> index 0000000000..dba14cef5d
> --- /dev/null
> +++ b/libavutil/float2half.c
[...]
> +void ff_init_float2half_tables(float2half_tables *t)

This will need the avpriv_ prefix; otherwise it will break linking with shared libs.

thx

[...]
-- 
Michael     GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB

"You are 36 times more likely to die in a bathtub than at the hands of a
terrorist. Also, you are 2.5 times more likely to become a president and
2 times more likely to become an astronaut, than to die in a terrorist
attack." -- Thoughty2


[-- Attachment #1.2: signature.asc --]
[-- Type: application/pgp-signature, Size: 195 bytes --]

[-- Attachment #2: Type: text/plain, Size: 251 bytes --]

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [FFmpeg-devel] [PATCH 08/11] avutil/half2float: move non-inline init code out of header
  2022-08-11 20:46   ` Michael Niedermayer
@ 2022-08-11 20:50     ` Andreas Rheinhardt
  2022-08-11 21:16       ` Michael Niedermayer
  0 siblings, 1 reply; 39+ messages in thread
From: Andreas Rheinhardt @ 2022-08-11 20:50 UTC (permalink / raw)
  To: ffmpeg-devel

Michael Niedermayer:
> On Wed, Aug 10, 2022 at 10:47:09PM +0200, Timo Rothenpieler wrote:
>> ---
>>  libavcodec/Makefile     |  8 +++---
>>  libavcodec/exr.c        |  2 +-
>>  libavcodec/exrenc.c     |  2 +-
>>  libavcodec/float2half.c | 19 +++++++++++++
>>  libavcodec/half2float.c | 19 +++++++++++++
>>  libavcodec/pnmdec.c     |  2 +-
>>  libavcodec/pnmenc.c     |  2 +-
>>  libavutil/float2half.c  | 53 ++++++++++++++++++++++++++++++++++
>>  libavutil/float2half.h  | 36 ++---------------------
>>  libavutil/half2float.c  | 63 +++++++++++++++++++++++++++++++++++++++++
>>  libavutil/half2float.h  | 46 ++----------------------------
>>  11 files changed, 166 insertions(+), 86 deletions(-)
>>  create mode 100644 libavcodec/float2half.c
>>  create mode 100644 libavcodec/half2float.c
>>  create mode 100644 libavutil/float2half.c
>>  create mode 100644 libavutil/half2float.c
>>
>> diff --git a/libavcodec/Makefile b/libavcodec/Makefile
>> index 029f1bad3d..cb80f73d99 100644
>> --- a/libavcodec/Makefile
>> +++ b/libavcodec/Makefile
>> @@ -337,8 +337,8 @@ OBJS-$(CONFIG_EIGHTSVX_FIB_DECODER)    += 8svx.o
>>  OBJS-$(CONFIG_ESCAPE124_DECODER)       += escape124.o
>>  OBJS-$(CONFIG_ESCAPE130_DECODER)       += escape130.o
>>  OBJS-$(CONFIG_EVRC_DECODER)            += evrcdec.o acelp_vectors.o lsp.o
>> -OBJS-$(CONFIG_EXR_DECODER)             += exr.o exrdsp.o
>> -OBJS-$(CONFIG_EXR_ENCODER)             += exrenc.o
>> +OBJS-$(CONFIG_EXR_DECODER)             += exr.o exrdsp.o half2float.o
>> +OBJS-$(CONFIG_EXR_ENCODER)             += exrenc.o float2half.o
>>  OBJS-$(CONFIG_FASTAUDIO_DECODER)       += fastaudio.o
>>  OBJS-$(CONFIG_FFV1_DECODER)            += ffv1dec.o ffv1.o
>>  OBJS-$(CONFIG_FFV1_ENCODER)            += ffv1enc.o ffv1.o
>> @@ -570,8 +570,8 @@ OBJS-$(CONFIG_PGMYUV_DECODER)          += pnmdec.o pnm.o
>>  OBJS-$(CONFIG_PGMYUV_ENCODER)          += pnmenc.o
>>  OBJS-$(CONFIG_PGSSUB_DECODER)          += pgssubdec.o
>>  OBJS-$(CONFIG_PGX_DECODER)             += pgxdec.o
>> -OBJS-$(CONFIG_PHM_DECODER)             += pnmdec.o pnm.o
>> -OBJS-$(CONFIG_PHM_ENCODER)             += pnmenc.o
>> +OBJS-$(CONFIG_PHM_DECODER)             += pnmdec.o pnm.o half2float.o
>> +OBJS-$(CONFIG_PHM_ENCODER)             += pnmenc.o float2half.o
>>  OBJS-$(CONFIG_PHOTOCD_DECODER)         += photocd.o
>>  OBJS-$(CONFIG_PICTOR_DECODER)          += pictordec.o cga_data.o
>>  OBJS-$(CONFIG_PIXLET_DECODER)          += pixlet.o
>> diff --git a/libavcodec/exr.c b/libavcodec/exr.c
>> index 825354873d..a3582bfdd6 100644
>> --- a/libavcodec/exr.c
>> +++ b/libavcodec/exr.c
>> @@ -2208,7 +2208,7 @@ static av_cold int decode_init(AVCodecContext *avctx)
>>      float one_gamma = 1.0f / s->gamma;
>>      avpriv_trc_function trc_func = NULL;
>>  
>> -    init_half2float_tables(&s->h2f_tables);
>> +    ff_init_half2float_tables(&s->h2f_tables);
> [...]
>> diff --git a/libavutil/float2half.c b/libavutil/float2half.c
>> new file mode 100644
>> index 0000000000..dba14cef5d
>> --- /dev/null
>> +++ b/libavutil/float2half.c
> [...]
>> +void ff_init_float2half_tables(float2half_tables *t)
> 
> this will need avpriv or break linking with shared libs
> 

No, because this code is duplicated into all libraries that need it.
(In case of static linking, only one of the variants will be used,
namely the first one encountered in the link.)

- Andreas
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [FFmpeg-devel] [PATCH 08/11] avutil/half2float: move non-inline init code out of header
  2022-08-11 20:50     ` Andreas Rheinhardt
@ 2022-08-11 21:16       ` Michael Niedermayer
  2022-08-11 21:31         ` Andreas Rheinhardt
  0 siblings, 1 reply; 39+ messages in thread
From: Michael Niedermayer @ 2022-08-11 21:16 UTC (permalink / raw)
  To: FFmpeg development discussions and patches


[-- Attachment #1.1: Type: text/plain, Size: 4624 bytes --]

On Thu, Aug 11, 2022 at 10:50:10PM +0200, Andreas Rheinhardt wrote:
> Michael Niedermayer:
> > On Wed, Aug 10, 2022 at 10:47:09PM +0200, Timo Rothenpieler wrote:
> >> ---
> >>  libavcodec/Makefile     |  8 +++---
> >>  libavcodec/exr.c        |  2 +-
> >>  libavcodec/exrenc.c     |  2 +-
> >>  libavcodec/float2half.c | 19 +++++++++++++
> >>  libavcodec/half2float.c | 19 +++++++++++++
> >>  libavcodec/pnmdec.c     |  2 +-
> >>  libavcodec/pnmenc.c     |  2 +-
> >>  libavutil/float2half.c  | 53 ++++++++++++++++++++++++++++++++++
> >>  libavutil/float2half.h  | 36 ++---------------------
> >>  libavutil/half2float.c  | 63 +++++++++++++++++++++++++++++++++++++++++
> >>  libavutil/half2float.h  | 46 ++----------------------------
> >>  11 files changed, 166 insertions(+), 86 deletions(-)
> >>  create mode 100644 libavcodec/float2half.c
> >>  create mode 100644 libavcodec/half2float.c
> >>  create mode 100644 libavutil/float2half.c
> >>  create mode 100644 libavutil/half2float.c
> >>
> >> diff --git a/libavcodec/Makefile b/libavcodec/Makefile
> >> index 029f1bad3d..cb80f73d99 100644
> >> --- a/libavcodec/Makefile
> >> +++ b/libavcodec/Makefile
> >> @@ -337,8 +337,8 @@ OBJS-$(CONFIG_EIGHTSVX_FIB_DECODER)    += 8svx.o
> >>  OBJS-$(CONFIG_ESCAPE124_DECODER)       += escape124.o
> >>  OBJS-$(CONFIG_ESCAPE130_DECODER)       += escape130.o
> >>  OBJS-$(CONFIG_EVRC_DECODER)            += evrcdec.o acelp_vectors.o lsp.o
> >> -OBJS-$(CONFIG_EXR_DECODER)             += exr.o exrdsp.o
> >> -OBJS-$(CONFIG_EXR_ENCODER)             += exrenc.o
> >> +OBJS-$(CONFIG_EXR_DECODER)             += exr.o exrdsp.o half2float.o
> >> +OBJS-$(CONFIG_EXR_ENCODER)             += exrenc.o float2half.o
> >>  OBJS-$(CONFIG_FASTAUDIO_DECODER)       += fastaudio.o
> >>  OBJS-$(CONFIG_FFV1_DECODER)            += ffv1dec.o ffv1.o
> >>  OBJS-$(CONFIG_FFV1_ENCODER)            += ffv1enc.o ffv1.o
> >> @@ -570,8 +570,8 @@ OBJS-$(CONFIG_PGMYUV_DECODER)          += pnmdec.o pnm.o
> >>  OBJS-$(CONFIG_PGMYUV_ENCODER)          += pnmenc.o
> >>  OBJS-$(CONFIG_PGSSUB_DECODER)          += pgssubdec.o
> >>  OBJS-$(CONFIG_PGX_DECODER)             += pgxdec.o
> >> -OBJS-$(CONFIG_PHM_DECODER)             += pnmdec.o pnm.o
> >> -OBJS-$(CONFIG_PHM_ENCODER)             += pnmenc.o
> >> +OBJS-$(CONFIG_PHM_DECODER)             += pnmdec.o pnm.o half2float.o
> >> +OBJS-$(CONFIG_PHM_ENCODER)             += pnmenc.o float2half.o
> >>  OBJS-$(CONFIG_PHOTOCD_DECODER)         += photocd.o
> >>  OBJS-$(CONFIG_PICTOR_DECODER)          += pictordec.o cga_data.o
> >>  OBJS-$(CONFIG_PIXLET_DECODER)          += pixlet.o
> >> diff --git a/libavcodec/exr.c b/libavcodec/exr.c
> >> index 825354873d..a3582bfdd6 100644
> >> --- a/libavcodec/exr.c
> >> +++ b/libavcodec/exr.c
> >> @@ -2208,7 +2208,7 @@ static av_cold int decode_init(AVCodecContext *avctx)
> >>      float one_gamma = 1.0f / s->gamma;
> >>      avpriv_trc_function trc_func = NULL;
> >>  
> >> -    init_half2float_tables(&s->h2f_tables);
> >> +    ff_init_half2float_tables(&s->h2f_tables);
> > [...]
> >> diff --git a/libavutil/float2half.c b/libavutil/float2half.c
> >> new file mode 100644
> >> index 0000000000..dba14cef5d
> >> --- /dev/null
> >> +++ b/libavutil/float2half.c
> > [...]
> >> +void ff_init_float2half_tables(float2half_tables *t)
> > 
> > this will need avpriv or break linking with shared libs
> > 
> 
> No, because this code is duplicated into all libraries that need it.
> (In case of static linking, only one of the variants will be used
> (namely the first one encountered in the link.)

libswscale/libswscale.so: undefined reference to `ff_init_half2float_tables'
libswscale/libswscale.so: undefined reference to `ff_init_half2float_tables'
clang: error: linker command failed with exit code 1 (use -v to see invocation)
Makefile:131: recipe for target 'ffplay_g' failed
make: *** [ffplay_g] Error 1
make: *** Waiting for unfinished jobs....
clang: error: linker command failed with exit code 1 (use -v to see invocation)
libswscale/libswscale.so: undefined reference to `ff_init_half2float_tables'
Makefile:131: recipe for target 'ffprobe_g' failed
make: *** [ffprobe_g] Error 1
clang: error: linker command failed with exit code 1 (use -v to see invocation)
Makefile:131: recipe for target 'ffmpeg_g' failed
make: *** [ffmpeg_g] Error 1


[...]

-- 
Michael     GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB

Dictatorship naturally arises out of democracy, and the most aggravated
form of tyranny and slavery out of the most extreme liberty. -- Plato

[-- Attachment #1.2: signature.asc --]
[-- Type: application/pgp-signature, Size: 195 bytes --]

[-- Attachment #2: Type: text/plain, Size: 251 bytes --]

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [FFmpeg-devel] [PATCH 08/11] avutil/half2float: move non-inline init code out of header
  2022-08-11 21:16       ` Michael Niedermayer
@ 2022-08-11 21:31         ` Andreas Rheinhardt
  2022-08-14 19:32           ` Michael Niedermayer
  2022-08-14 21:54           ` Timo Rothenpieler
  0 siblings, 2 replies; 39+ messages in thread
From: Andreas Rheinhardt @ 2022-08-11 21:31 UTC (permalink / raw)
  To: ffmpeg-devel

Michael Niedermayer:
> On Thu, Aug 11, 2022 at 10:50:10PM +0200, Andreas Rheinhardt wrote:
>> Michael Niedermayer:
>>> On Wed, Aug 10, 2022 at 10:47:09PM +0200, Timo Rothenpieler wrote:
>>>> ---
>>>>  libavcodec/Makefile     |  8 +++---
>>>>  libavcodec/exr.c        |  2 +-
>>>>  libavcodec/exrenc.c     |  2 +-
>>>>  libavcodec/float2half.c | 19 +++++++++++++
>>>>  libavcodec/half2float.c | 19 +++++++++++++
>>>>  libavcodec/pnmdec.c     |  2 +-
>>>>  libavcodec/pnmenc.c     |  2 +-
>>>>  libavutil/float2half.c  | 53 ++++++++++++++++++++++++++++++++++
>>>>  libavutil/float2half.h  | 36 ++---------------------
>>>>  libavutil/half2float.c  | 63 +++++++++++++++++++++++++++++++++++++++++
>>>>  libavutil/half2float.h  | 46 ++----------------------------
>>>>  11 files changed, 166 insertions(+), 86 deletions(-)
>>>>  create mode 100644 libavcodec/float2half.c
>>>>  create mode 100644 libavcodec/half2float.c
>>>>  create mode 100644 libavutil/float2half.c
>>>>  create mode 100644 libavutil/half2float.c
>>>>
>>>> diff --git a/libavcodec/Makefile b/libavcodec/Makefile
>>>> index 029f1bad3d..cb80f73d99 100644
>>>> --- a/libavcodec/Makefile
>>>> +++ b/libavcodec/Makefile
>>>> @@ -337,8 +337,8 @@ OBJS-$(CONFIG_EIGHTSVX_FIB_DECODER)    += 8svx.o
>>>>  OBJS-$(CONFIG_ESCAPE124_DECODER)       += escape124.o
>>>>  OBJS-$(CONFIG_ESCAPE130_DECODER)       += escape130.o
>>>>  OBJS-$(CONFIG_EVRC_DECODER)            += evrcdec.o acelp_vectors.o lsp.o
>>>> -OBJS-$(CONFIG_EXR_DECODER)             += exr.o exrdsp.o
>>>> -OBJS-$(CONFIG_EXR_ENCODER)             += exrenc.o
>>>> +OBJS-$(CONFIG_EXR_DECODER)             += exr.o exrdsp.o half2float.o
>>>> +OBJS-$(CONFIG_EXR_ENCODER)             += exrenc.o float2half.o
>>>>  OBJS-$(CONFIG_FASTAUDIO_DECODER)       += fastaudio.o
>>>>  OBJS-$(CONFIG_FFV1_DECODER)            += ffv1dec.o ffv1.o
>>>>  OBJS-$(CONFIG_FFV1_ENCODER)            += ffv1enc.o ffv1.o
>>>> @@ -570,8 +570,8 @@ OBJS-$(CONFIG_PGMYUV_DECODER)          += pnmdec.o pnm.o
>>>>  OBJS-$(CONFIG_PGMYUV_ENCODER)          += pnmenc.o
>>>>  OBJS-$(CONFIG_PGSSUB_DECODER)          += pgssubdec.o
>>>>  OBJS-$(CONFIG_PGX_DECODER)             += pgxdec.o
>>>> -OBJS-$(CONFIG_PHM_DECODER)             += pnmdec.o pnm.o
>>>> -OBJS-$(CONFIG_PHM_ENCODER)             += pnmenc.o
>>>> +OBJS-$(CONFIG_PHM_DECODER)             += pnmdec.o pnm.o half2float.o
>>>> +OBJS-$(CONFIG_PHM_ENCODER)             += pnmenc.o float2half.o
>>>>  OBJS-$(CONFIG_PHOTOCD_DECODER)         += photocd.o
>>>>  OBJS-$(CONFIG_PICTOR_DECODER)          += pictordec.o cga_data.o
>>>>  OBJS-$(CONFIG_PIXLET_DECODER)          += pixlet.o
>>>> diff --git a/libavcodec/exr.c b/libavcodec/exr.c
>>>> index 825354873d..a3582bfdd6 100644
>>>> --- a/libavcodec/exr.c
>>>> +++ b/libavcodec/exr.c
>>>> @@ -2208,7 +2208,7 @@ static av_cold int decode_init(AVCodecContext *avctx)
>>>>      float one_gamma = 1.0f / s->gamma;
>>>>      avpriv_trc_function trc_func = NULL;
>>>>  
>>>> -    init_half2float_tables(&s->h2f_tables);
>>>> +    ff_init_half2float_tables(&s->h2f_tables);
>>> [...]
>>>> diff --git a/libavutil/float2half.c b/libavutil/float2half.c
>>>> new file mode 100644
>>>> index 0000000000..dba14cef5d
>>>> --- /dev/null
>>>> +++ b/libavutil/float2half.c
>>> [...]
>>>> +void ff_init_float2half_tables(float2half_tables *t)
>>>
>>> this will need avpriv or break linking with shared libs
>>>
>>
>> No, because this code is duplicated into all libraries that need it.
>> (In case of static linking, only one of the variants will be used
>> (namely the first one encountered in the link.)
> 
> libswscale/libswscale.so: undefined reference to `ff_init_half2float_tables'
> libswscale/libswscale.so: undefined reference to `ff_init_half2float_tables'
> clang: error: linker command failed with exit code 1 (use -v to see invocation)
> Makefile:131: recipe for target 'ffplay_g' failed
> make: *** [ffplay_g] Error 1
> make: *** Waiting for unfinished jobs....
> clang: error: linker command failed with exit code 1 (use -v to see invocation)
> libswscale/libswscale.so: undefined reference to `ff_init_half2float_tables'
> Makefile:131: recipe for target 'ffprobe_g' failed
> make: *** [ffprobe_g] Error 1
> clang: error: linker command failed with exit code 1 (use -v to see invocation)
> Makefile:131: recipe for target 'ffmpeg_g' failed
> make: *** [ffmpeg_g] Error 1
> 

That is with the whole patchset applied, isn't it!? Duplicating the init
stuff into swscale has been forgotten.

- Andreas
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [FFmpeg-devel] [PATCH 08/11] avutil/half2float: move non-inline init code out of header
  2022-08-11 21:31         ` Andreas Rheinhardt
@ 2022-08-14 19:32           ` Michael Niedermayer
  2022-08-15  4:20             ` Andreas Rheinhardt
  2022-08-14 21:54           ` Timo Rothenpieler
  1 sibling, 1 reply; 39+ messages in thread
From: Michael Niedermayer @ 2022-08-14 19:32 UTC (permalink / raw)
  To: FFmpeg development discussions and patches


[-- Attachment #1.1: Type: text/plain, Size: 7568 bytes --]

On Thu, Aug 11, 2022 at 11:31:32PM +0200, Andreas Rheinhardt wrote:
> Michael Niedermayer:
> > On Thu, Aug 11, 2022 at 10:50:10PM +0200, Andreas Rheinhardt wrote:
> >> Michael Niedermayer:
> >>> On Wed, Aug 10, 2022 at 10:47:09PM +0200, Timo Rothenpieler wrote:
> >>>> ---
> >>>>  libavcodec/Makefile     |  8 +++---
> >>>>  libavcodec/exr.c        |  2 +-
> >>>>  libavcodec/exrenc.c     |  2 +-
> >>>>  libavcodec/float2half.c | 19 +++++++++++++
> >>>>  libavcodec/half2float.c | 19 +++++++++++++
> >>>>  libavcodec/pnmdec.c     |  2 +-
> >>>>  libavcodec/pnmenc.c     |  2 +-
> >>>>  libavutil/float2half.c  | 53 ++++++++++++++++++++++++++++++++++
> >>>>  libavutil/float2half.h  | 36 ++---------------------
> >>>>  libavutil/half2float.c  | 63 +++++++++++++++++++++++++++++++++++++++++
> >>>>  libavutil/half2float.h  | 46 ++----------------------------
> >>>>  11 files changed, 166 insertions(+), 86 deletions(-)
> >>>>  create mode 100644 libavcodec/float2half.c
> >>>>  create mode 100644 libavcodec/half2float.c
> >>>>  create mode 100644 libavutil/float2half.c
> >>>>  create mode 100644 libavutil/half2float.c
> >>>>
> >>>> diff --git a/libavcodec/Makefile b/libavcodec/Makefile
> >>>> index 029f1bad3d..cb80f73d99 100644
> >>>> --- a/libavcodec/Makefile
> >>>> +++ b/libavcodec/Makefile
> >>>> @@ -337,8 +337,8 @@ OBJS-$(CONFIG_EIGHTSVX_FIB_DECODER)    += 8svx.o
> >>>>  OBJS-$(CONFIG_ESCAPE124_DECODER)       += escape124.o
> >>>>  OBJS-$(CONFIG_ESCAPE130_DECODER)       += escape130.o
> >>>>  OBJS-$(CONFIG_EVRC_DECODER)            += evrcdec.o acelp_vectors.o lsp.o
> >>>> -OBJS-$(CONFIG_EXR_DECODER)             += exr.o exrdsp.o
> >>>> -OBJS-$(CONFIG_EXR_ENCODER)             += exrenc.o
> >>>> +OBJS-$(CONFIG_EXR_DECODER)             += exr.o exrdsp.o half2float.o
> >>>> +OBJS-$(CONFIG_EXR_ENCODER)             += exrenc.o float2half.o
> >>>>  OBJS-$(CONFIG_FASTAUDIO_DECODER)       += fastaudio.o
> >>>>  OBJS-$(CONFIG_FFV1_DECODER)            += ffv1dec.o ffv1.o
> >>>>  OBJS-$(CONFIG_FFV1_ENCODER)            += ffv1enc.o ffv1.o
> >>>> @@ -570,8 +570,8 @@ OBJS-$(CONFIG_PGMYUV_DECODER)          += pnmdec.o pnm.o
> >>>>  OBJS-$(CONFIG_PGMYUV_ENCODER)          += pnmenc.o
> >>>>  OBJS-$(CONFIG_PGSSUB_DECODER)          += pgssubdec.o
> >>>>  OBJS-$(CONFIG_PGX_DECODER)             += pgxdec.o
> >>>> -OBJS-$(CONFIG_PHM_DECODER)             += pnmdec.o pnm.o
> >>>> -OBJS-$(CONFIG_PHM_ENCODER)             += pnmenc.o
> >>>> +OBJS-$(CONFIG_PHM_DECODER)             += pnmdec.o pnm.o half2float.o
> >>>> +OBJS-$(CONFIG_PHM_ENCODER)             += pnmenc.o float2half.o
> >>>>  OBJS-$(CONFIG_PHOTOCD_DECODER)         += photocd.o
> >>>>  OBJS-$(CONFIG_PICTOR_DECODER)          += pictordec.o cga_data.o
> >>>>  OBJS-$(CONFIG_PIXLET_DECODER)          += pixlet.o
> >>>> diff --git a/libavcodec/exr.c b/libavcodec/exr.c
> >>>> index 825354873d..a3582bfdd6 100644
> >>>> --- a/libavcodec/exr.c
> >>>> +++ b/libavcodec/exr.c
> >>>> @@ -2208,7 +2208,7 @@ static av_cold int decode_init(AVCodecContext *avctx)
> >>>>      float one_gamma = 1.0f / s->gamma;
> >>>>      avpriv_trc_function trc_func = NULL;
> >>>>  
> >>>> -    init_half2float_tables(&s->h2f_tables);
> >>>> +    ff_init_half2float_tables(&s->h2f_tables);
> >>> [...]
> >>>> diff --git a/libavutil/float2half.c b/libavutil/float2half.c
> >>>> new file mode 100644
> >>>> index 0000000000..dba14cef5d
> >>>> --- /dev/null
> >>>> +++ b/libavutil/float2half.c
> >>> [...]
> >>>> +void ff_init_float2half_tables(float2half_tables *t)
> >>>
> >>> this will need avpriv or break linking with shared libs
> >>>
> >>
> >> No, because this code is duplicated into all libraries that need it.
> >> (In case of static linking, only one of the variants will be used
> >> (namely the first one encountered in the link.)
> > 
> > libswscale/libswscale.so: undefined reference to `ff_init_half2float_tables'
> > libswscale/libswscale.so: undefined reference to `ff_init_half2float_tables'
> > clang: error: linker command failed with exit code 1 (use -v to see invocation)
> > Makefile:131: recipe for target 'ffplay_g' failed
> > make: *** [ffplay_g] Error 1
> > make: *** Waiting for unfinished jobs....
> > clang: error: linker command failed with exit code 1 (use -v to see invocation)
> > libswscale/libswscale.so: undefined reference to `ff_init_half2float_tables'
> > Makefile:131: recipe for target 'ffprobe_g' failed
> > make: *** [ffprobe_g] Error 1
> > clang: error: linker command failed with exit code 1 (use -v to see invocation)
> > Makefile:131: recipe for target 'ffmpeg_g' failed
> > make: *** [ffmpeg_g] Error 1
> > 
> 
> That is with the whole patchset applied, isn't it!? Duplicating the init
> stuff into swscale has been forgotten.

I've tried the latest patchset and this still happens:

make distclean ; ../configure --enable-shared --cc='ccache clang-6.0'  --enable-pthreads --samples=fate/fate-suite/ --enable-version3  --extra-cflags='-O1 -fno-omit-frame-pointer'  && make -j32

LD	ffmpeg_g
LD	ffplay_g
LD	ffprobe_g
libswscale/libswscale.so: undefined reference to `ff_init_half2float_tables'
clang: error: linker command failed with exit code 1 (use -v to see invocation)
Makefile:131: recipe for target 'ffprobe_g' failed
make: *** [ffprobe_g] Error 1
make: *** Waiting for unfinished jobs....
libswscale/libswscale.so: undefined reference to `ff_init_half2float_tables'
clang: error: linker command failed with exit code 1 (use -v to see invocation)
Makefile:131: recipe for target 'ffplay_g' failed
make: *** [ffplay_g] Error 1
libswscale/libswscale.so: undefined reference to `ff_init_half2float_tables'
clang: error: linker command failed with exit code 1 (use -v to see invocation)
Makefile:131: recipe for target 'ffmpeg_g' failed
make: *** [ffmpeg_g] Error 1

git grep ff_init_half2float_tables
libavcodec/exr.c:    ff_init_half2float_tables(&s->h2f_tables);
libavcodec/pnmdec.c:    ff_init_half2float_tables(&s->h2f_tables);
libavutil/half2float.c:void ff_init_half2float_tables(Half2FloatTables *t)
libavutil/half2float.h:void ff_init_half2float_tables(Half2FloatTables *t);
libswscale/slice.c:        ff_init_half2float_tables(c->h2f_tables);

libavcodec/half2float.c:#include "libavutil/half2float.c"
libswscale/half2float.c:#include "libavutil/half2float.c"

git grep half2float.o
libavcodec/Makefile:OBJS-$(CONFIG_EXR_DECODER)             += exr.o exrdsp.o half2float.o
libavcodec/Makefile:OBJS-$(CONFIG_PHM_DECODER)             += pnmdec.o pnm.o half2float.o
libavutil/Makefile:       half2float.o                                                     \

I also tried this:
-#include "libavutil/half2float.c"
+#inklude "libavutil/half2float.c"

and it still fails at the linking stage rather than at compile time, so the
file is apparently never built. Really impressive, the compiler speaks German C too.


Here are the patches applied:

46e36b5371 swscale/input: add rgbaf16 input support
34446f3971 swscale: add opaque parameter to input functions
88cf148514 avutil/half2float: use native _Float16 if available
af6c41450c avutil/half2float: move non-inline init code out of header
2f2ac4c8c9 avutil/half2float: move tables to header-internal structs
19ac9a570e avutil/half2float: adjust conversion of NaN
45180741cd avutil: move half-precision float helper to avutil

(some more unrelated patches)

thx

[...]
-- 
Michael     GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB

No great genius has ever existed without some touch of madness. -- Aristotle

[-- Attachment #1.2: signature.asc --]
[-- Type: application/pgp-signature, Size: 195 bytes --]

[-- Attachment #2: Type: text/plain, Size: 251 bytes --]

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [FFmpeg-devel] [PATCH 08/11] avutil/half2float: move non-inline init code out of header
  2022-08-11 21:31         ` Andreas Rheinhardt
  2022-08-14 19:32           ` Michael Niedermayer
@ 2022-08-14 21:54           ` Timo Rothenpieler
  1 sibling, 0 replies; 39+ messages in thread
From: Timo Rothenpieler @ 2022-08-14 21:54 UTC (permalink / raw)
  To: ffmpeg-devel

On 11.08.2022 23:31, Andreas Rheinhardt wrote:
> Michael Niedermayer:
>> On Thu, Aug 11, 2022 at 10:50:10PM +0200, Andreas Rheinhardt wrote:
>>> Michael Niedermayer:
>>>> On Wed, Aug 10, 2022 at 10:47:09PM +0200, Timo Rothenpieler wrote:
>>>>> ---
>>>>>   libavcodec/Makefile     |  8 +++---
>>>>>   libavcodec/exr.c        |  2 +-
>>>>>   libavcodec/exrenc.c     |  2 +-
>>>>>   libavcodec/float2half.c | 19 +++++++++++++
>>>>>   libavcodec/half2float.c | 19 +++++++++++++
>>>>>   libavcodec/pnmdec.c     |  2 +-
>>>>>   libavcodec/pnmenc.c     |  2 +-
>>>>>   libavutil/float2half.c  | 53 ++++++++++++++++++++++++++++++++++
>>>>>   libavutil/float2half.h  | 36 ++---------------------
>>>>>   libavutil/half2float.c  | 63 +++++++++++++++++++++++++++++++++++++++++
>>>>>   libavutil/half2float.h  | 46 ++----------------------------
>>>>>   11 files changed, 166 insertions(+), 86 deletions(-)
>>>>>   create mode 100644 libavcodec/float2half.c
>>>>>   create mode 100644 libavcodec/half2float.c
>>>>>   create mode 100644 libavutil/float2half.c
>>>>>   create mode 100644 libavutil/half2float.c
>>>>>
>>>>> diff --git a/libavcodec/Makefile b/libavcodec/Makefile
>>>>> index 029f1bad3d..cb80f73d99 100644
>>>>> --- a/libavcodec/Makefile
>>>>> +++ b/libavcodec/Makefile
>>>>> @@ -337,8 +337,8 @@ OBJS-$(CONFIG_EIGHTSVX_FIB_DECODER)    += 8svx.o
>>>>>   OBJS-$(CONFIG_ESCAPE124_DECODER)       += escape124.o
>>>>>   OBJS-$(CONFIG_ESCAPE130_DECODER)       += escape130.o
>>>>>   OBJS-$(CONFIG_EVRC_DECODER)            += evrcdec.o acelp_vectors.o lsp.o
>>>>> -OBJS-$(CONFIG_EXR_DECODER)             += exr.o exrdsp.o
>>>>> -OBJS-$(CONFIG_EXR_ENCODER)             += exrenc.o
>>>>> +OBJS-$(CONFIG_EXR_DECODER)             += exr.o exrdsp.o half2float.o
>>>>> +OBJS-$(CONFIG_EXR_ENCODER)             += exrenc.o float2half.o
>>>>>   OBJS-$(CONFIG_FASTAUDIO_DECODER)       += fastaudio.o
>>>>>   OBJS-$(CONFIG_FFV1_DECODER)            += ffv1dec.o ffv1.o
>>>>>   OBJS-$(CONFIG_FFV1_ENCODER)            += ffv1enc.o ffv1.o
>>>>> @@ -570,8 +570,8 @@ OBJS-$(CONFIG_PGMYUV_DECODER)          += pnmdec.o pnm.o
>>>>>   OBJS-$(CONFIG_PGMYUV_ENCODER)          += pnmenc.o
>>>>>   OBJS-$(CONFIG_PGSSUB_DECODER)          += pgssubdec.o
>>>>>   OBJS-$(CONFIG_PGX_DECODER)             += pgxdec.o
>>>>> -OBJS-$(CONFIG_PHM_DECODER)             += pnmdec.o pnm.o
>>>>> -OBJS-$(CONFIG_PHM_ENCODER)             += pnmenc.o
>>>>> +OBJS-$(CONFIG_PHM_DECODER)             += pnmdec.o pnm.o half2float.o
>>>>> +OBJS-$(CONFIG_PHM_ENCODER)             += pnmenc.o float2half.o
>>>>>   OBJS-$(CONFIG_PHOTOCD_DECODER)         += photocd.o
>>>>>   OBJS-$(CONFIG_PICTOR_DECODER)          += pictordec.o cga_data.o
>>>>>   OBJS-$(CONFIG_PIXLET_DECODER)          += pixlet.o
>>>>> diff --git a/libavcodec/exr.c b/libavcodec/exr.c
>>>>> index 825354873d..a3582bfdd6 100644
>>>>> --- a/libavcodec/exr.c
>>>>> +++ b/libavcodec/exr.c
>>>>> @@ -2208,7 +2208,7 @@ static av_cold int decode_init(AVCodecContext *avctx)
>>>>>       float one_gamma = 1.0f / s->gamma;
>>>>>       avpriv_trc_function trc_func = NULL;
>>>>>   
>>>>> -    init_half2float_tables(&s->h2f_tables);
>>>>> +    ff_init_half2float_tables(&s->h2f_tables);
>>>> [...]
>>>>> diff --git a/libavutil/float2half.c b/libavutil/float2half.c
>>>>> new file mode 100644
>>>>> index 0000000000..dba14cef5d
>>>>> --- /dev/null
>>>>> +++ b/libavutil/float2half.c
>>>> [...]
>>>>> +void ff_init_float2half_tables(float2half_tables *t)
>>>>
>>>> this will need avpriv or break linking with shared libs
>>>>
>>>
>>> No, because this code is duplicated into all libraries that need it.
>>> (In case of static linking, only one of the variants will be used
>>> (namely the first one encountered in the link.)
>>
>> libswscale/libswscale.so: undefined reference to `ff_init_half2float_tables'
>> libswscale/libswscale.so: undefined reference to `ff_init_half2float_tables'
>> clang: error: linker command failed with exit code 1 (use -v to see invocation)
>> Makefile:131: recipe for target 'ffplay_g' failed
>> make: *** [ffplay_g] Error 1
>> make: *** Waiting for unfinished jobs....
>> clang: error: linker command failed with exit code 1 (use -v to see invocation)
>> libswscale/libswscale.so: undefined reference to `ff_init_half2float_tables'
>> Makefile:131: recipe for target 'ffprobe_g' failed
>> make: *** [ffprobe_g] Error 1
>> clang: error: linker command failed with exit code 1 (use -v to see invocation)
>> Makefile:131: recipe for target 'ffmpeg_g' failed
>> make: *** [ffmpeg_g] Error 1
>>
> 
> That is with the whole patchset applied, isn't it!? Duplicating the init
> stuff into swscale has been forgotten.

It must have gotten lost in some rebase of the whole set.
Added back to the Makefile locally.
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [FFmpeg-devel] [PATCH 08/11] avutil/half2float: move non-inline init code out of header
  2022-08-14 19:32           ` Michael Niedermayer
@ 2022-08-15  4:20             ` Andreas Rheinhardt
  2022-08-15 18:09               ` Michael Niedermayer
  0 siblings, 1 reply; 39+ messages in thread
From: Andreas Rheinhardt @ 2022-08-15  4:20 UTC (permalink / raw)
  To: ffmpeg-devel

Michael Niedermayer:
> On Thu, Aug 11, 2022 at 11:31:32PM +0200, Andreas Rheinhardt wrote:
>> Michael Niedermayer:
>>> On Thu, Aug 11, 2022 at 10:50:10PM +0200, Andreas Rheinhardt wrote:
>>>> Michael Niedermayer:
>>>>> On Wed, Aug 10, 2022 at 10:47:09PM +0200, Timo Rothenpieler wrote:
>>>>>> ---
>>>>>>  libavcodec/Makefile     |  8 +++---
>>>>>>  libavcodec/exr.c        |  2 +-
>>>>>>  libavcodec/exrenc.c     |  2 +-
>>>>>>  libavcodec/float2half.c | 19 +++++++++++++
>>>>>>  libavcodec/half2float.c | 19 +++++++++++++
>>>>>>  libavcodec/pnmdec.c     |  2 +-
>>>>>>  libavcodec/pnmenc.c     |  2 +-
>>>>>>  libavutil/float2half.c  | 53 ++++++++++++++++++++++++++++++++++
>>>>>>  libavutil/float2half.h  | 36 ++---------------------
>>>>>>  libavutil/half2float.c  | 63 +++++++++++++++++++++++++++++++++++++++++
>>>>>>  libavutil/half2float.h  | 46 ++----------------------------
>>>>>>  11 files changed, 166 insertions(+), 86 deletions(-)
>>>>>>  create mode 100644 libavcodec/float2half.c
>>>>>>  create mode 100644 libavcodec/half2float.c
>>>>>>  create mode 100644 libavutil/float2half.c
>>>>>>  create mode 100644 libavutil/half2float.c
>>>>>>
>>>>>> diff --git a/libavcodec/Makefile b/libavcodec/Makefile
>>>>>> index 029f1bad3d..cb80f73d99 100644
>>>>>> --- a/libavcodec/Makefile
>>>>>> +++ b/libavcodec/Makefile
>>>>>> @@ -337,8 +337,8 @@ OBJS-$(CONFIG_EIGHTSVX_FIB_DECODER)    += 8svx.o
>>>>>>  OBJS-$(CONFIG_ESCAPE124_DECODER)       += escape124.o
>>>>>>  OBJS-$(CONFIG_ESCAPE130_DECODER)       += escape130.o
>>>>>>  OBJS-$(CONFIG_EVRC_DECODER)            += evrcdec.o acelp_vectors.o lsp.o
>>>>>> -OBJS-$(CONFIG_EXR_DECODER)             += exr.o exrdsp.o
>>>>>> -OBJS-$(CONFIG_EXR_ENCODER)             += exrenc.o
>>>>>> +OBJS-$(CONFIG_EXR_DECODER)             += exr.o exrdsp.o half2float.o
>>>>>> +OBJS-$(CONFIG_EXR_ENCODER)             += exrenc.o float2half.o
>>>>>>  OBJS-$(CONFIG_FASTAUDIO_DECODER)       += fastaudio.o
>>>>>>  OBJS-$(CONFIG_FFV1_DECODER)            += ffv1dec.o ffv1.o
>>>>>>  OBJS-$(CONFIG_FFV1_ENCODER)            += ffv1enc.o ffv1.o
>>>>>> @@ -570,8 +570,8 @@ OBJS-$(CONFIG_PGMYUV_DECODER)          += pnmdec.o pnm.o
>>>>>>  OBJS-$(CONFIG_PGMYUV_ENCODER)          += pnmenc.o
>>>>>>  OBJS-$(CONFIG_PGSSUB_DECODER)          += pgssubdec.o
>>>>>>  OBJS-$(CONFIG_PGX_DECODER)             += pgxdec.o
>>>>>> -OBJS-$(CONFIG_PHM_DECODER)             += pnmdec.o pnm.o
>>>>>> -OBJS-$(CONFIG_PHM_ENCODER)             += pnmenc.o
>>>>>> +OBJS-$(CONFIG_PHM_DECODER)             += pnmdec.o pnm.o half2float.o
>>>>>> +OBJS-$(CONFIG_PHM_ENCODER)             += pnmenc.o float2half.o
>>>>>>  OBJS-$(CONFIG_PHOTOCD_DECODER)         += photocd.o
>>>>>>  OBJS-$(CONFIG_PICTOR_DECODER)          += pictordec.o cga_data.o
>>>>>>  OBJS-$(CONFIG_PIXLET_DECODER)          += pixlet.o
>>>>>> diff --git a/libavcodec/exr.c b/libavcodec/exr.c
>>>>>> index 825354873d..a3582bfdd6 100644
>>>>>> --- a/libavcodec/exr.c
>>>>>> +++ b/libavcodec/exr.c
>>>>>> @@ -2208,7 +2208,7 @@ static av_cold int decode_init(AVCodecContext *avctx)
>>>>>>      float one_gamma = 1.0f / s->gamma;
>>>>>>      avpriv_trc_function trc_func = NULL;
>>>>>>  
>>>>>> -    init_half2float_tables(&s->h2f_tables);
>>>>>> +    ff_init_half2float_tables(&s->h2f_tables);
>>>>> [...]
>>>>>> diff --git a/libavutil/float2half.c b/libavutil/float2half.c
>>>>>> new file mode 100644
>>>>>> index 0000000000..dba14cef5d
>>>>>> --- /dev/null
>>>>>> +++ b/libavutil/float2half.c
>>>>> [...]
>>>>>> +void ff_init_float2half_tables(float2half_tables *t)
>>>>>
>>>>> this will need avpriv or break linking with shared libs
>>>>>
>>>>
>>>> No, because this code is duplicated into all libraries that need it.
>>>> (In case of static linking, only one of the variants will be used,
>>>> namely the first one encountered in the link.)
>>>
>>> libswscale/libswscale.so: undefined reference to `ff_init_half2float_tables'
>>> libswscale/libswscale.so: undefined reference to `ff_init_half2float_tables'
>>> clang: error: linker command failed with exit code 1 (use -v to see invocation)
>>> Makefile:131: recipe for target 'ffplay_g' failed
>>> make: *** [ffplay_g] Error 1
>>> make: *** Waiting for unfinished jobs....
>>> clang: error: linker command failed with exit code 1 (use -v to see invocation)
>>> libswscale/libswscale.so: undefined reference to `ff_init_half2float_tables'
>>> Makefile:131: recipe for target 'ffprobe_g' failed
>>> make: *** [ffprobe_g] Error 1
>>> clang: error: linker command failed with exit code 1 (use -v to see invocation)
>>> Makefile:131: recipe for target 'ffmpeg_g' failed
>>> make: *** [ffmpeg_g] Error 1
>>>
>>
>> That is with the whole patchset applied, isn't it!? Duplicating the init
>> stuff into swscale has been forgotten.
> 
> I've tried the latest patchset and this still happens
> 

As explained here:
https://ffmpeg.org/pipermail/ffmpeg-devel/2022-August/300045.html, the
reason is that the patch that makes use of this in libswscale adds the
half2float.o dependency to libavutil's Makefile instead of libswscale's.

> make distclean ; ../configure --enable-shared --cc='ccache clang-6.0'  --enable-pthreads --samples=fate/fate-suite/ --enable-version3  --extra-cflags='-O1 -fno-omit-frame-pointer'  && make -j32
> 
> LD	ffmpeg_g
> LD	ffplay_g
> LD	ffprobe_g
> libswscale/libswscale.so: undefined reference to `ff_init_half2float_tables'
> clang: error: linker command failed with exit code 1 (use -v to see invocation)
> Makefile:131: recipe for target 'ffprobe_g' failed
> make: *** [ffprobe_g] Error 1
> make: *** Waiting for unfinished jobs....
> libswscale/libswscale.so: undefined reference to `ff_init_half2float_tables'
> clang: error: linker command failed with exit code 1 (use -v to see invocation)
> Makefile:131: recipe for target 'ffplay_g' failed
> make: *** [ffplay_g] Error 1
> libswscale/libswscale.so: undefined reference to `ff_init_half2float_tables'
> clang: error: linker command failed with exit code 1 (use -v to see invocation)
> Makefile:131: recipe for target 'ffmpeg_g' failed
> make: *** [ffmpeg_g] Error 1
> 
> git grep ff_init_half2float_tables
> libavcodec/exr.c:    ff_init_half2float_tables(&s->h2f_tables);
> libavcodec/pnmdec.c:    ff_init_half2float_tables(&s->h2f_tables);
> libavutil/half2float.c:void ff_init_half2float_tables(Half2FloatTables *t)
> libavutil/half2float.h:void ff_init_half2float_tables(Half2FloatTables *t);
> libswscale/slice.c:        ff_init_half2float_tables(c->h2f_tables);
> 
> libavcodec/half2float.c:#include "libavutil/half2float.c"
> libswscale/half2float.c:#include "libavutil/half2float.c"
> 
> git grep half2float.o
> libavcodec/Makefile:OBJS-$(CONFIG_EXR_DECODER)             += exr.o exrdsp.o half2float.o
> libavcodec/Makefile:OBJS-$(CONFIG_PHM_DECODER)             += pnmdec.o pnm.o half2float.o
> libavutil/Makefile:       half2float.o                                                     \
> 
> also i tried this:
> -#include "libavutil/half2float.c"
> +#inklude "libavutil/half2float.c"
> 
> and it still fails at the linking stage, not at compile time; really impressive,
> the compiler speaks German C too
> 

No, the libswscale/half2float.c file is just not compiled at all, see above.

> 
> Here are the patches applied:
> 
> 46e36b5371 swscale/input: add rgbaf16 input support
> 34446f3971 swscale: add opaque parameter to input functions
> 88cf148514 avutil/half2float: use native _Float16 if available
> af6c41450c avutil/half2float: move non-inline init code out of header
> 2f2ac4c8c9 avutil/half2float: move tables to header-internal structs
> 19ac9a570e avutil/half2float: adjust conversion of NaN
> 45180741cd avutil: move half-precision float helper to avutil
> 
> (some more unrelated patches)
> 
> thx
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [FFmpeg-devel] [PATCH 08/11] avutil/half2float: move non-inline init code out of header
  2022-08-15  4:20             ` Andreas Rheinhardt
@ 2022-08-15 18:09               ` Michael Niedermayer
  0 siblings, 0 replies; 39+ messages in thread
From: Michael Niedermayer @ 2022-08-15 18:09 UTC (permalink / raw)
  To: FFmpeg development discussions and patches


[-- Attachment #1.1: Type: text/plain, Size: 2678 bytes --]

On Mon, Aug 15, 2022 at 06:20:45AM +0200, Andreas Rheinhardt wrote:
> Michael Niedermayer:
> > On Thu, Aug 11, 2022 at 11:31:32PM +0200, Andreas Rheinhardt wrote:
[...]
> > make distclean ; ../configure --enable-shared --cc='ccache clang-6.0'  --enable-pthreads --samples=fate/fate-suite/ --enable-version3  --extra-cflags='-O1 -fno-omit-frame-pointer'  && make -j32
> > 
> > LD	ffmpeg_g
> > LD	ffplay_g
> > LD	ffprobe_g
> > libswscale/libswscale.so: undefined reference to `ff_init_half2float_tables'
> > clang: error: linker command failed with exit code 1 (use -v to see invocation)
> > Makefile:131: recipe for target 'ffprobe_g' failed
> > make: *** [ffprobe_g] Error 1
> > make: *** Waiting for unfinished jobs....
> > libswscale/libswscale.so: undefined reference to `ff_init_half2float_tables'
> > clang: error: linker command failed with exit code 1 (use -v to see invocation)
> > Makefile:131: recipe for target 'ffplay_g' failed
> > make: *** [ffplay_g] Error 1
> > libswscale/libswscale.so: undefined reference to `ff_init_half2float_tables'
> > clang: error: linker command failed with exit code 1 (use -v to see invocation)
> > Makefile:131: recipe for target 'ffmpeg_g' failed
> > make: *** [ffmpeg_g] Error 1
> > 
> > git grep ff_init_half2float_tables
> > libavcodec/exr.c:    ff_init_half2float_tables(&s->h2f_tables);
> > libavcodec/pnmdec.c:    ff_init_half2float_tables(&s->h2f_tables);
> > libavutil/half2float.c:void ff_init_half2float_tables(Half2FloatTables *t)
> > libavutil/half2float.h:void ff_init_half2float_tables(Half2FloatTables *t);
> > libswscale/slice.c:        ff_init_half2float_tables(c->h2f_tables);
> > 
> > libavcodec/half2float.c:#include "libavutil/half2float.c"
> > libswscale/half2float.c:#include "libavutil/half2float.c"
> > 
> > git grep half2float.o
> > libavcodec/Makefile:OBJS-$(CONFIG_EXR_DECODER)             += exr.o exrdsp.o half2float.o
> > libavcodec/Makefile:OBJS-$(CONFIG_PHM_DECODER)             += pnmdec.o pnm.o half2float.o
> > libavutil/Makefile:       half2float.o                                                     \
> > 
> > also i tried this:
> > -#include "libavutil/half2float.c"
> > +#inklude "libavutil/half2float.c"
> > 
> > and it still fails at the linking stage, not at compile time; really impressive,
> > the compiler speaks German C too
> > 
> 
> No, the libswscale/half2float.c file is just not compiled at all, see above.

You seem to have a failure in humor.so

thx

[...]
-- 
Michael     GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB

It is a danger to trust the dream we wish for rather than
the science we have, -- Dr. Kenneth Brown

[-- Attachment #1.2: signature.asc --]
[-- Type: application/pgp-signature, Size: 195 bytes --]

[-- Attachment #2: Type: text/plain, Size: 251 bytes --]

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 39+ messages in thread

end of thread, other threads:[~2022-08-15 18:10 UTC | newest]

Thread overview: 39+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-08-10 20:47 [FFmpeg-devel] [PATCH 01/11] lavu/pixfmt: add packed RGBA float16 format Timo Rothenpieler
2022-08-10 20:47 ` [FFmpeg-devel] [PATCH 02/11] avutil/hwcontext_d3d11va: add support for rgbaf16 pixel format Timo Rothenpieler
2022-08-10 20:47 ` [FFmpeg-devel] [PATCH 03/11] avfilter/vsrc_ddagrab: add rgbaf16 output support Timo Rothenpieler
2022-08-10 20:47 ` [FFmpeg-devel] [PATCH 04/11] avfilter/vsrc_ddagrab: add options for more control over output format fallback Timo Rothenpieler
2022-08-10 20:47 ` [FFmpeg-devel] [PATCH 05/11] avutil: move half-precision float helper to avutil Timo Rothenpieler
2022-08-10 20:47 ` [FFmpeg-devel] [PATCH 06/11] avutil/half2float: adjust conversion of NaN Timo Rothenpieler
2022-08-10 21:24   ` Andreas Rheinhardt
2022-08-10 21:36     ` Timo Rothenpieler
2022-08-10 21:43       ` Andreas Rheinhardt
2022-08-10 21:53         ` Timo Rothenpieler
2022-08-10 22:14           ` Mark Reid
2022-08-10 22:18             ` James Almer
2022-08-10 22:28               ` Timo Rothenpieler
2022-08-10 22:37                 ` Mark Reid
2022-08-10 22:55                   ` Timo Rothenpieler
2022-08-11  2:18                     ` Mark Reid
2022-08-10 20:47 ` [FFmpeg-devel] [PATCH 07/11] avutil/half2float: move tables to header-internal structs Timo Rothenpieler
2022-08-10 20:47 ` [FFmpeg-devel] [PATCH 08/11] avutil/half2float: move non-inline init code out of header Timo Rothenpieler
2022-08-11 20:46   ` Michael Niedermayer
2022-08-11 20:50     ` Andreas Rheinhardt
2022-08-11 21:16       ` Michael Niedermayer
2022-08-11 21:31         ` Andreas Rheinhardt
2022-08-14 19:32           ` Michael Niedermayer
2022-08-15  4:20             ` Andreas Rheinhardt
2022-08-15 18:09               ` Michael Niedermayer
2022-08-14 21:54           ` Timo Rothenpieler
2022-08-10 20:47 ` [FFmpeg-devel] [PATCH 09/11] avutil/half2float: use native _Float16 if available Timo Rothenpieler
2022-08-10 21:03   ` Andreas Rheinhardt
2022-08-10 21:58     ` Timo Rothenpieler
2022-08-10 22:02       ` James Almer
2022-08-10 22:51   ` [FFmpeg-devel] [PATCH v2 " Timo Rothenpieler
2022-08-11  0:14     ` James Almer
2022-08-11 11:50       ` Timo Rothenpieler
2022-08-10 20:47 ` [FFmpeg-devel] [PATCH 10/11] swscale: add SwsContext parameter to input functions Timo Rothenpieler
2022-08-10 20:52   ` Timo Rothenpieler
2022-08-10 21:55   ` Andreas Rheinhardt
2022-08-10 22:02     ` Timo Rothenpieler
2022-08-10 20:47 ` [FFmpeg-devel] [PATCH 11/11] swscale/input: add rgbaf16 input support Timo Rothenpieler
2022-08-10 21:37   ` Timo Rothenpieler

Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

This inbox may be cloned and mirrored by anyone:

	git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
		ffmpegdev@gitmailbox.com
	public-inbox-index ffmpegdev

Example config snippet for mirrors.


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git