Re: [FFmpeg-devel] [PATCH v2 09/11] avutil/half2float: use native _Float16 if available

From: Timo Rothenpieler <timo@rothenpieler.org>
To: ffmpeg-devel@ffmpeg.org
Subject: Re: [FFmpeg-devel] [PATCH v2 09/11] avutil/half2float: use native _Float16 if available
Date: Thu, 11 Aug 2022 13:50:35 +0200
Message-ID: <e09ddf6b-7c93-790b-83a0-842d86149237@rothenpieler.org> (raw)
In-Reply-To: <314f36cb-5fe1-551b-81bc-b3b902dd6c79@gmail.com>

On 11/08/2022 02:14, James Almer wrote:
> On 8/10/2022 7:51 PM, Timo Rothenpieler wrote:
>> _Float16 support has been available on arm/aarch64 for a while, and with
>> gcc 12 it was also enabled on x86, as long as SSE2 is supported.
>>
>> If the target arch supports f16c, gcc emits fairly efficient assembly,
>> taking advantage of it. This is the case on x86-64-v3 or higher.
>> The same goes for arm, which has native float16 support.
>> On x86, without f16c, it emulates it in software using sse2 instructions.
>>
>> This has been shown to perform rather poorly:
>>
>> _Float16 full SSE2 emulation:
>> frame=50074 fps=848 q=-0.0 size=N/A time=00:33:22.96 bitrate=N/A 
>> speed=33.9x
>>
>> _Float16 f16c accelerated (Zen2, --cpu=znver2):
>> frame=50636 fps=1965 q=-0.0 Lsize=N/A time=00:33:45.40 bitrate=N/A 
>> speed=78.6x
>>
>> classic half2float full software implementation:
>> frame=49926 fps=1605 q=-0.0 Lsize=N/A time=00:33:17.00 bitrate=N/A 
>> speed=64.2x
>>
>> Hence, an additional check was introduced that only enables use of
>> _Float16 on x86 if f16c is being utilized.
>>
>> On aarch64, a similar uplift in performance is seen:
>>
>> RPi4 half2float full software implementation:
>> frame= 6088 fps=126 q=-0.0 Lsize=N/A time=00:04:03.48 bitrate=N/A 
>> speed=5.06x
>>
>> RPi4 _Float16:
>> frame= 6103 fps=158 q=-0.0 Lsize=N/A time=00:04:04.08 bitrate=N/A 
>> speed=6.32x
>>
>> Since arm/aarch64 always natively supports 16-bit floats, it can always
>> be considered fast there.
>>
>> I'm not aware of any additional platforms that currently support
>> _Float16. And if there are, they should be considered non-fast until
>> proven fast.
>> ---
>>   configure              | 13 +++++++++++++
>>   libavutil/float2half.c |  2 ++
>>   libavutil/float2half.h | 16 ++++++++++++++++
>>   libavutil/half2float.c |  4 ++++
>>   libavutil/half2float.h | 16 ++++++++++++++++
>>   5 files changed, 51 insertions(+)
>>
>> diff --git a/configure b/configure
>> index 6761d0cb32..6ede9a5a8f 100755
>> --- a/configure
>> +++ b/configure
>> @@ -2143,6 +2143,8 @@ ARCH_FEATURES="
>>       fast_64bit
>>       fast_clz
>>       fast_cmov
>> +    fast_float16
>> +    float16
> 
> If HAVE_FLOAT16 is not going to be used, then don't export it here. 
> Leave it as a configure internal variable.
> 

Good point, fixed locally.

>>       local_aligned
>>       simd_align_16
>>       simd_align_32
>> @@ -5125,6 +5127,8 @@ elif enabled arm; then
>>               ;;
>>       esac
>> +    test_cflags -mfp16-format=ieee && add_cflags -mfp16-format=ieee
>> +
>>   elif enabled avr32; then
>>       case $cpu in
>> @@ -6229,6 +6233,15 @@ check_builtin sync_val_compare_and_swap "" "int 
>> *ptr; int oldval, newval; __sync
>>   check_builtin gmtime_r time.h "time_t *time; struct tm *tm; 
>> gmtime_r(time, tm)"
>>   check_builtin localtime_r time.h "time_t *time; struct tm *tm; 
>> localtime_r(time, tm)"
>> +check_builtin float16 "" "_Float16 f16var"
>> +if enabled float16; then
>> +    if enabled x86; then
>> +        test_cpp_condition stddef.h "defined(__F16C__)" && enable 
>> fast_float16
>> +    elif enabled arm || enabled aarch64; then
>> +        enable fast_float16
>> +    fi
>> +fi
>> +
>>   case "$custom_allocator" in
>>       jemalloc)
>>           # jemalloc by default does not use a prefix
>> diff --git a/libavutil/float2half.c b/libavutil/float2half.c
>> index dba14cef5d..7002612194 100644
>> --- a/libavutil/float2half.c
>> +++ b/libavutil/float2half.c
>> @@ -20,6 +20,7 @@
>>   void ff_init_float2half_tables(float2half_tables *t)
>>   {
>> +#if !HAVE_FAST_FLOAT16
>>       for (int i = 0; i < 256; i++) {
>>           int e = i - 127;
>> @@ -50,4 +51,5 @@ void ff_init_float2half_tables(float2half_tables *t)
>>               t->shifttable[i|0x100] = 13;
>>           }
>>       }
>> +#endif
>>   }
>> diff --git a/libavutil/float2half.h b/libavutil/float2half.h
>> index b8c9cdfc4f..437666966b 100644
>> --- a/libavutil/float2half.h
>> +++ b/libavutil/float2half.h
>> @@ -20,21 +20,37 @@
>>   #define AVUTIL_FLOAT2HALF_H
>>   #include <stdint.h>
>> +#include "intfloat.h"
>> +
>> +#include "config.h"
>>   typedef struct float2half_tables {
>> +#if HAVE_FAST_FLOAT16
>> +    uint8_t dummy;
>> +#else
>>       uint16_t basetable[512];
>>       uint8_t shifttable[512];
>> +#endif
>>   } float2half_tables;
>>   void ff_init_float2half_tables(float2half_tables *t);
>>   static inline uint16_t float2half(uint32_t f, const 
>> float2half_tables *t)
>>   {
>> +#if HAVE_FAST_FLOAT16
>> +    union {
>> +        _Float16 f;
>> +        uint16_t i;
>> +    } u;
>> +    u.f = av_int2float(f);
>> +    return u.i;
>> +#else
>>       uint16_t h;
>>       h = t->basetable[(f >> 23) & 0x1ff] + ((f & 0x007fffff) >> 
>> t->shifttable[(f >> 23) & 0x1ff]);
>>       return h;
>> +#endif
>>   }
>>   #endif /* AVUTIL_FLOAT2HALF_H */
>> diff --git a/libavutil/half2float.c b/libavutil/half2float.c
>> index baac8e4093..ff198a8187 100644
>> --- a/libavutil/half2float.c
>> +++ b/libavutil/half2float.c
>> @@ -18,6 +18,7 @@
>>   #include "libavutil/half2float.h"
>> +#if !HAVE_FAST_FLOAT16
>>   static uint32_t convertmantissa(uint32_t i)
>>   {
>>       int32_t m = i << 13; // Zero pad mantissa bits
>> @@ -33,9 +34,11 @@ static uint32_t convertmantissa(uint32_t i)
>>       return m | e; // Return combined number
>>   }
>> +#endif
>>   void ff_init_half2float_tables(half2float_tables *t)
>>   {
>> +#if !HAVE_FAST_FLOAT16
>>       t->mantissatable[0] = 0;
>>       for (int i = 1; i < 1024; i++)
>>           t->mantissatable[i] = convertmantissa(i);
>> @@ -60,4 +63,5 @@ void ff_init_half2float_tables(half2float_tables *t)
>>       t->offsettable[31] = 2048;
>>       t->offsettable[32] = 0;
>>       t->offsettable[63] = 2048;
>> +#endif
>>   }
>> diff --git a/libavutil/half2float.h b/libavutil/half2float.h
>> index cb58e44a1c..57ee8372fe 100644
>> --- a/libavutil/half2float.h
>> +++ b/libavutil/half2float.h
>> @@ -20,22 +20,38 @@
>>   #define AVUTIL_HALF2FLOAT_H
>>   #include <stdint.h>
>> +#include "intfloat.h"
>> +
>> +#include "config.h"
>>   typedef struct half2float_tables {
>> +#if HAVE_FAST_FLOAT16
>> +    uint8_t dummy;
>> +#else
>>       uint32_t mantissatable[3072];
>>       uint32_t exponenttable[64];
>>       uint16_t offsettable[64];
>> +#endif
>>   } half2float_tables;
>>   void ff_init_half2float_tables(half2float_tables *t);
>>   static inline uint32_t half2float(uint16_t h, const 
>> half2float_tables *t)
>>   {
>> +#if HAVE_FAST_FLOAT16
>> +    union {
>> +        _Float16 f;
>> +        uint16_t i;
>> +    } u;
>> +    u.i = h;
>> +    return av_float2int(u.f);
>> +#else
>>       uint32_t f;
>>       f = t->mantissatable[t->offsettable[h >> 10] + (h & 0x3ff)] + 
>> t->exponenttable[h >> 10];
>>       return f;
>> +#endif
>>   }
>>   #endif /* AVUTIL_HALF2FLOAT_H */
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> 
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".