Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
 help / color / mirror / Atom feed
* [FFmpeg-devel] [PATCH v2 1/2] swscale/input: Avoid calls to av_pix_fmt_desc_get()
@ 2022-09-16 14:52 Andreas Rheinhardt
  2022-09-16 14:55 ` [FFmpeg-devel] [PATCH v2 2/2] swscale/output: Don't call av_pix_fmt_desc_get() in a loop Andreas Rheinhardt
  2022-09-19 14:32 ` [FFmpeg-devel] [PATCH v2 1/2] swscale/input: Avoid calls to av_pix_fmt_desc_get() Andreas Rheinhardt
  0 siblings, 2 replies; 5+ messages in thread
From: Andreas Rheinhardt @ 2022-09-16 14:52 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: Andreas Rheinhardt

Up until now, libswscale/input.c used a macro to read
an input pixel which involved a call to av_pix_fmt_desc_get()
to find out whether the input pixel format is BE or LE
despite this being known at compile-time (there are templates
per pixfmt). Even worse, these calls are made in a loop,
so that e.g. there are six calls to av_pix_fmt_desc_get()
for every pair of UV pixels processed in
rgb64ToUV_half_c_template().

This commit modifies these macros to ensure that isBE()
is evaluated at compile-time. This saved 9743B of .text
for me (GCC 11.2, -O3). For a simple RGBA64LE->YUV420P
transformation like
ffmpeg -f lavfi -i haldclutsrc,format=rgba64le -pix_fmt yuv420p \
-threads 1  -t 1:00  -f null -
the amount of decicycles spent in rgb64LEToUV_half_c
(which is created via the template mentioned above)
decreases from 19751 to 5341; for RGBA64BE the number
went down from 11945 to 5393. For shared builds (where
the call to av_pix_fmt_desc_get() is indirect) the old numbers
are 15230 for RGBA64BE and 27502 for RGBA64LE, whereas
the numbers with this patch are indistinguishable from
the numbers from a static build.

Also make the macros that are touched conform to the
usual convention of using uppercase names while at it.

Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
 libswscale/input.c | 122 +++++++++++++++++++++++++--------------------
 1 file changed, 68 insertions(+), 54 deletions(-)

diff --git a/libswscale/input.c b/libswscale/input.c
index 88e318e664..7ff7bfaa01 100644
--- a/libswscale/input.c
+++ b/libswscale/input.c
@@ -28,14 +28,21 @@
 #include "config.h"
 #include "swscale_internal.h"
 
-#define input_pixel(pos) (isBE(origin) ? AV_RB16(pos) : AV_RL16(pos))
+#define input_pixel(pos) (is_be ? AV_RB16(pos) : AV_RL16(pos))
+
+#define IS_BE_LE 0
+#define IS_BE_BE 1
+#define IS_BE_   0
+/* ENDIAN_IDENTIFIER needs to be "BE", "LE" or "". The latter is intended
+ * for single-byte cases where the concept of endianness does not apply. */
+#define IS_BE(ENDIAN_IDENTIFIER) IS_BE_ ## ENDIAN_IDENTIFIER
 
 #define r ((origin == AV_PIX_FMT_BGR48BE || origin == AV_PIX_FMT_BGR48LE || origin == AV_PIX_FMT_BGRA64BE || origin == AV_PIX_FMT_BGRA64LE) ? b_r : r_b)
 #define b ((origin == AV_PIX_FMT_BGR48BE || origin == AV_PIX_FMT_BGR48LE || origin == AV_PIX_FMT_BGRA64BE || origin == AV_PIX_FMT_BGRA64LE) ? r_b : b_r)
 
 static av_always_inline void
 rgb64ToY_c_template(uint16_t *dst, const uint16_t *src, int width,
-                    enum AVPixelFormat origin, int32_t *rgb2yuv)
+                    enum AVPixelFormat origin, int32_t *rgb2yuv, int is_be)
 {
     int32_t ry = rgb2yuv[RY_IDX], gy = rgb2yuv[GY_IDX], by = rgb2yuv[BY_IDX];
     int i;
@@ -51,7 +58,7 @@ rgb64ToY_c_template(uint16_t *dst, const uint16_t *src, int width,
 static av_always_inline void
 rgb64ToUV_c_template(uint16_t *dstU, uint16_t *dstV,
                     const uint16_t *src1, const uint16_t *src2,
-                    int width, enum AVPixelFormat origin, int32_t *rgb2yuv)
+                    int width, enum AVPixelFormat origin, int32_t *rgb2yuv, int is_be)
 {
     int i;
     int32_t ru = rgb2yuv[RU_IDX], gu = rgb2yuv[GU_IDX], bu = rgb2yuv[BU_IDX];
@@ -70,7 +77,7 @@ rgb64ToUV_c_template(uint16_t *dstU, uint16_t *dstV,
 static av_always_inline void
 rgb64ToUV_half_c_template(uint16_t *dstU, uint16_t *dstV,
                           const uint16_t *src1, const uint16_t *src2,
-                          int width, enum AVPixelFormat origin, int32_t *rgb2yuv)
+                          int width, enum AVPixelFormat origin, int32_t *rgb2yuv, int is_be)
 {
     int i;
     int32_t ru = rgb2yuv[RU_IDX], gu = rgb2yuv[GU_IDX], bu = rgb2yuv[BU_IDX];
@@ -86,13 +93,13 @@ rgb64ToUV_half_c_template(uint16_t *dstU, uint16_t *dstV,
     }
 }
 
-#define rgb64funcs(pattern, BE_LE, origin) \
+#define RGB64FUNCS_EXT(pattern, BE_LE, origin, is_be) \
 static void pattern ## 64 ## BE_LE ## ToY_c(uint8_t *_dst, const uint8_t *_src, const uint8_t *unused0, const uint8_t *unused1,\
                                     int width, uint32_t *rgb2yuv, void *opq) \
 { \
     const uint16_t *src = (const uint16_t *) _src; \
     uint16_t *dst = (uint16_t *) _dst; \
-    rgb64ToY_c_template(dst, src, width, origin, rgb2yuv); \
+    rgb64ToY_c_template(dst, src, width, origin, rgb2yuv, is_be); \
 } \
  \
 static void pattern ## 64 ## BE_LE ## ToUV_c(uint8_t *_dstU, uint8_t *_dstV, \
@@ -102,7 +109,7 @@ static void pattern ## 64 ## BE_LE ## ToUV_c(uint8_t *_dstU, uint8_t *_dstV, \
     const uint16_t *src1 = (const uint16_t *) _src1, \
                    *src2 = (const uint16_t *) _src2; \
     uint16_t *dstU = (uint16_t *) _dstU, *dstV = (uint16_t *) _dstV; \
-    rgb64ToUV_c_template(dstU, dstV, src1, src2, width, origin, rgb2yuv); \
+    rgb64ToUV_c_template(dstU, dstV, src1, src2, width, origin, rgb2yuv, is_be); \
 } \
  \
 static void pattern ## 64 ## BE_LE ## ToUV_half_c(uint8_t *_dstU, uint8_t *_dstV, \
@@ -112,18 +119,20 @@ static void pattern ## 64 ## BE_LE ## ToUV_half_c(uint8_t *_dstU, uint8_t *_dstV
     const uint16_t *src1 = (const uint16_t *) _src1, \
                    *src2 = (const uint16_t *) _src2; \
     uint16_t *dstU = (uint16_t *) _dstU, *dstV = (uint16_t *) _dstV; \
-    rgb64ToUV_half_c_template(dstU, dstV, src1, src2, width, origin, rgb2yuv); \
+    rgb64ToUV_half_c_template(dstU, dstV, src1, src2, width, origin, rgb2yuv, is_be); \
 }
+#define RGB64FUNCS(pattern, endianness, base_fmt) \
+        RGB64FUNCS_EXT(pattern, endianness, base_fmt ## endianness, IS_BE(endianness))
 
-rgb64funcs(rgb, LE, AV_PIX_FMT_RGBA64LE)
-rgb64funcs(rgb, BE, AV_PIX_FMT_RGBA64BE)
-rgb64funcs(bgr, LE, AV_PIX_FMT_BGRA64LE)
-rgb64funcs(bgr, BE, AV_PIX_FMT_BGRA64BE)
+RGB64FUNCS(rgb, LE, AV_PIX_FMT_RGBA64)
+RGB64FUNCS(rgb, BE, AV_PIX_FMT_RGBA64)
+RGB64FUNCS(bgr, LE, AV_PIX_FMT_BGRA64)
+RGB64FUNCS(bgr, BE, AV_PIX_FMT_BGRA64)
 
 static av_always_inline void rgb48ToY_c_template(uint16_t *dst,
                                                  const uint16_t *src, int width,
                                                  enum AVPixelFormat origin,
-                                                 int32_t *rgb2yuv)
+                                                 int32_t *rgb2yuv, int is_be)
 {
     int32_t ry = rgb2yuv[RY_IDX], gy = rgb2yuv[GY_IDX], by = rgb2yuv[BY_IDX];
     int i;
@@ -142,7 +151,7 @@ static av_always_inline void rgb48ToUV_c_template(uint16_t *dstU,
                                                   const uint16_t *src2,
                                                   int width,
                                                   enum AVPixelFormat origin,
-                                                  int32_t *rgb2yuv)
+                                                  int32_t *rgb2yuv, int is_be)
 {
     int i;
     int32_t ru = rgb2yuv[RU_IDX], gu = rgb2yuv[GU_IDX], bu = rgb2yuv[BU_IDX];
@@ -164,7 +173,7 @@ static av_always_inline void rgb48ToUV_half_c_template(uint16_t *dstU,
                                                        const uint16_t *src2,
                                                        int width,
                                                        enum AVPixelFormat origin,
-                                                       int32_t *rgb2yuv)
+                                                       int32_t *rgb2yuv, int is_be)
 {
     int i;
     int32_t ru = rgb2yuv[RU_IDX], gu = rgb2yuv[GU_IDX], bu = rgb2yuv[BU_IDX];
@@ -187,7 +196,7 @@ static av_always_inline void rgb48ToUV_half_c_template(uint16_t *dstU,
 #undef b
 #undef input_pixel
 
-#define rgb48funcs(pattern, BE_LE, origin)                              \
+#define RGB48FUNCS_EXT(pattern, BE_LE, origin, is_be)                   \
 static void pattern ## 48 ## BE_LE ## ToY_c(uint8_t *_dst,              \
                                             const uint8_t *_src,        \
                                             const uint8_t *unused0, const uint8_t *unused1,\
@@ -197,7 +206,7 @@ static void pattern ## 48 ## BE_LE ## ToY_c(uint8_t *_dst,              \
 {                                                                       \
     const uint16_t *src = (const uint16_t *)_src;                       \
     uint16_t *dst       = (uint16_t *)_dst;                             \
-    rgb48ToY_c_template(dst, src, width, origin, rgb2yuv);              \
+    rgb48ToY_c_template(dst, src, width, origin, rgb2yuv, is_be);       \
 }                                                                       \
                                                                         \
 static void pattern ## 48 ## BE_LE ## ToUV_c(uint8_t *_dstU,            \
@@ -213,7 +222,7 @@ static void pattern ## 48 ## BE_LE ## ToUV_c(uint8_t *_dstU,            \
                    *src2 = (const uint16_t *)_src2;                     \
     uint16_t *dstU = (uint16_t *)_dstU,                                 \
              *dstV = (uint16_t *)_dstV;                                 \
-    rgb48ToUV_c_template(dstU, dstV, src1, src2, width, origin, rgb2yuv);        \
+    rgb48ToUV_c_template(dstU, dstV, src1, src2, width, origin, rgb2yuv, is_be); \
 }                                                                       \
                                                                         \
 static void pattern ## 48 ## BE_LE ## ToUV_half_c(uint8_t *_dstU,       \
@@ -229,13 +238,15 @@ static void pattern ## 48 ## BE_LE ## ToUV_half_c(uint8_t *_dstU,       \
                    *src2 = (const uint16_t *)_src2;                     \
     uint16_t *dstU = (uint16_t *)_dstU,                                 \
              *dstV = (uint16_t *)_dstV;                                 \
-    rgb48ToUV_half_c_template(dstU, dstV, src1, src2, width, origin, rgb2yuv);   \
+    rgb48ToUV_half_c_template(dstU, dstV, src1, src2, width, origin, rgb2yuv, is_be); \
 }
+#define RGB48FUNCS(pattern, endianness, base_fmt) \
+        RGB48FUNCS_EXT(pattern, endianness, base_fmt ## endianness, IS_BE(endianness))
 
-rgb48funcs(rgb, LE, AV_PIX_FMT_RGB48LE)
-rgb48funcs(rgb, BE, AV_PIX_FMT_RGB48BE)
-rgb48funcs(bgr, LE, AV_PIX_FMT_BGR48LE)
-rgb48funcs(bgr, BE, AV_PIX_FMT_BGR48BE)
+RGB48FUNCS(rgb, LE, AV_PIX_FMT_RGB48)
+RGB48FUNCS(rgb, BE, AV_PIX_FMT_RGB48)
+RGB48FUNCS(bgr, LE, AV_PIX_FMT_BGR48)
+RGB48FUNCS(bgr, BE, AV_PIX_FMT_BGR48)
 
 #define input_pixel(i) ((origin == AV_PIX_FMT_RGBA ||                      \
                          origin == AV_PIX_FMT_BGRA ||                      \
@@ -245,7 +256,7 @@ rgb48funcs(bgr, BE, AV_PIX_FMT_BGR48BE)
                         : ((origin == AV_PIX_FMT_X2RGB10LE ||              \
                             origin == AV_PIX_FMT_X2BGR10LE)                \
                            ? AV_RL32(&src[(i) * 4])                        \
-                           : (isBE(origin) ? AV_RB16(&src[(i) * 2])        \
+                           : (is_be ? AV_RB16(&src[(i) * 2])               \
                               : AV_RL16(&src[(i) * 2]))))
 
 static av_always_inline void rgb16_32ToY_c_template(int16_t *dst,
@@ -257,7 +268,7 @@ static av_always_inline void rgb16_32ToY_c_template(int16_t *dst,
                                                     int maskr, int maskg,
                                                     int maskb, int rsh,
                                                     int gsh, int bsh, int S,
-                                                    int32_t *rgb2yuv)
+                                                    int32_t *rgb2yuv, int is_be)
 {
     const int ry       = rgb2yuv[RY_IDX]<<rsh, gy = rgb2yuv[GY_IDX]<<gsh, by = rgb2yuv[BY_IDX]<<bsh;
     const unsigned rnd = (32<<((S)-1)) + (1<<(S-7));
@@ -283,7 +294,7 @@ static av_always_inline void rgb16_32ToUV_c_template(int16_t *dstU,
                                                      int maskr, int maskg,
                                                      int maskb, int rsh,
                                                      int gsh, int bsh, int S,
-                                                     int32_t *rgb2yuv)
+                                                     int32_t *rgb2yuv, int is_be)
 {
     const int ru       = rgb2yuv[RU_IDX] * (1 << rsh), gu = rgb2yuv[GU_IDX] * (1 << gsh), bu = rgb2yuv[BU_IDX] * (1 << bsh),
               rv       = rgb2yuv[RV_IDX] * (1 << rsh), gv = rgb2yuv[GV_IDX] * (1 << gsh), bv = rgb2yuv[BV_IDX] * (1 << bsh);
@@ -311,7 +322,7 @@ static av_always_inline void rgb16_32ToUV_half_c_template(int16_t *dstU,
                                                           int maskr, int maskg,
                                                           int maskb, int rsh,
                                                           int gsh, int bsh, int S,
-                                                          int32_t *rgb2yuv)
+                                                          int32_t *rgb2yuv, int is_be)
 {
     const int ru       = rgb2yuv[RU_IDX] * (1 << rsh), gu = rgb2yuv[GU_IDX] * (1 << gsh), bu = rgb2yuv[BU_IDX] * (1 << bsh),
               rv       = rgb2yuv[RV_IDX] * (1 << rsh), gv = rgb2yuv[GV_IDX] * (1 << gsh), bv = rgb2yuv[BV_IDX] * (1 << bsh),
@@ -345,13 +356,13 @@ static av_always_inline void rgb16_32ToUV_half_c_template(int16_t *dstU,
 
 #undef input_pixel
 
-#define rgb16_32_wrapper(fmt, name, shr, shg, shb, shp, maskr,          \
-                         maskg, maskb, rsh, gsh, bsh, S)                \
+#define RGB16_32FUNCS_EXT(fmt, name, shr, shg, shb, shp, maskr,         \
+                          maskg, maskb, rsh, gsh, bsh, S, is_be)        \
 static void name ## ToY_c(uint8_t *dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2,            \
                           int width, uint32_t *tab, void *opq)          \
 {                                                                       \
     rgb16_32ToY_c_template((int16_t*)dst, src, width, fmt, shr, shg, shb, shp,    \
-                           maskr, maskg, maskb, rsh, gsh, bsh, S, tab); \
+                           maskr, maskg, maskb, rsh, gsh, bsh, S, tab, is_be); \
 }                                                                       \
                                                                         \
 static void name ## ToUV_c(uint8_t *dstU, uint8_t *dstV,                \
@@ -360,7 +371,7 @@ static void name ## ToUV_c(uint8_t *dstU, uint8_t *dstV,                \
 {                                                                       \
     rgb16_32ToUV_c_template((int16_t*)dstU, (int16_t*)dstV, src, width, fmt,                \
                             shr, shg, shb, shp,                         \
-                            maskr, maskg, maskb, rsh, gsh, bsh, S, tab);\
+                            maskr, maskg, maskb, rsh, gsh, bsh, S, tab, is_be); \
 }                                                                       \
                                                                         \
 static void name ## ToUV_half_c(uint8_t *dstU, uint8_t *dstV,           \
@@ -371,27 +382,32 @@ static void name ## ToUV_half_c(uint8_t *dstU, uint8_t *dstV,           \
     rgb16_32ToUV_half_c_template((int16_t*)dstU, (int16_t*)dstV, src, width, fmt,           \
                                  shr, shg, shb, shp,                    \
                                  maskr, maskg, maskb,                   \
-                                 rsh, gsh, bsh, S, tab);                \
-}
-
-rgb16_32_wrapper(AV_PIX_FMT_BGR32,    bgr32,  16, 0,  0, 0, 0xFF0000, 0xFF00,   0x00FF,  8, 0,  8, RGB2YUV_SHIFT + 8)
-rgb16_32_wrapper(AV_PIX_FMT_BGR32_1,  bgr321, 16, 0,  0, 8, 0xFF0000, 0xFF00,   0x00FF,  8, 0,  8, RGB2YUV_SHIFT + 8)
-rgb16_32_wrapper(AV_PIX_FMT_RGB32,    rgb32,   0, 0, 16, 0,   0x00FF, 0xFF00, 0xFF0000,  8, 0,  8, RGB2YUV_SHIFT + 8)
-rgb16_32_wrapper(AV_PIX_FMT_RGB32_1,  rgb321,  0, 0, 16, 8,   0x00FF, 0xFF00, 0xFF0000,  8, 0,  8, RGB2YUV_SHIFT + 8)
-rgb16_32_wrapper(AV_PIX_FMT_BGR565LE, bgr16le, 0, 0,  0, 0,   0x001F, 0x07E0,   0xF800, 11, 5,  0, RGB2YUV_SHIFT + 8)
-rgb16_32_wrapper(AV_PIX_FMT_BGR555LE, bgr15le, 0, 0,  0, 0,   0x001F, 0x03E0,   0x7C00, 10, 5,  0, RGB2YUV_SHIFT + 7)
-rgb16_32_wrapper(AV_PIX_FMT_BGR444LE, bgr12le, 0, 0,  0, 0,   0x000F, 0x00F0,   0x0F00,  8, 4,  0, RGB2YUV_SHIFT + 4)
-rgb16_32_wrapper(AV_PIX_FMT_RGB565LE, rgb16le, 0, 0,  0, 0,   0xF800, 0x07E0,   0x001F,  0, 5, 11, RGB2YUV_SHIFT + 8)
-rgb16_32_wrapper(AV_PIX_FMT_RGB555LE, rgb15le, 0, 0,  0, 0,   0x7C00, 0x03E0,   0x001F,  0, 5, 10, RGB2YUV_SHIFT + 7)
-rgb16_32_wrapper(AV_PIX_FMT_RGB444LE, rgb12le, 0, 0,  0, 0,   0x0F00, 0x00F0,   0x000F,  0, 4,  8, RGB2YUV_SHIFT + 4)
-rgb16_32_wrapper(AV_PIX_FMT_BGR565BE, bgr16be, 0, 0,  0, 0,   0x001F, 0x07E0,   0xF800, 11, 5,  0, RGB2YUV_SHIFT + 8)
-rgb16_32_wrapper(AV_PIX_FMT_BGR555BE, bgr15be, 0, 0,  0, 0,   0x001F, 0x03E0,   0x7C00, 10, 5,  0, RGB2YUV_SHIFT + 7)
-rgb16_32_wrapper(AV_PIX_FMT_BGR444BE, bgr12be, 0, 0,  0, 0,   0x000F, 0x00F0,   0x0F00,  8, 4,  0, RGB2YUV_SHIFT + 4)
-rgb16_32_wrapper(AV_PIX_FMT_RGB565BE, rgb16be, 0, 0,  0, 0,   0xF800, 0x07E0,   0x001F,  0, 5, 11, RGB2YUV_SHIFT + 8)
-rgb16_32_wrapper(AV_PIX_FMT_RGB555BE, rgb15be, 0, 0,  0, 0,   0x7C00, 0x03E0,   0x001F,  0, 5, 10, RGB2YUV_SHIFT + 7)
-rgb16_32_wrapper(AV_PIX_FMT_RGB444BE, rgb12be, 0, 0,  0, 0,   0x0F00, 0x00F0,   0x000F,  0, 4,  8, RGB2YUV_SHIFT + 4)
-rgb16_32_wrapper(AV_PIX_FMT_X2RGB10LE, rgb30le, 16, 6, 0, 0, 0x3FF00000, 0xFFC00, 0x3FF, 0, 0, 4, RGB2YUV_SHIFT + 6)
-rgb16_32_wrapper(AV_PIX_FMT_X2BGR10LE, bgr30le, 0, 6, 16, 0, 0x3FF, 0xFFC00, 0x3FF00000, 4, 0, 0, RGB2YUV_SHIFT + 6)
+                                 rsh, gsh, bsh, S, tab, is_be);         \
+}
+
+#define RGB16_32FUNCS(base_fmt, endianness, name, shr, shg, shb, shp, maskr, \
+                      maskg, maskb, rsh, gsh, bsh, S) \
+    RGB16_32FUNCS_EXT(base_fmt ## endianness, name, shr, shg, shb, shp, maskr, \
+                      maskg, maskb, rsh, gsh, bsh, S, IS_BE(endianness))
+
+RGB16_32FUNCS(AV_PIX_FMT_BGR32,     , bgr32,  16, 0,  0, 0, 0xFF0000, 0xFF00,   0x00FF,  8, 0,  8, RGB2YUV_SHIFT + 8)
+RGB16_32FUNCS(AV_PIX_FMT_BGR32_1,   , bgr321, 16, 0,  0, 8, 0xFF0000, 0xFF00,   0x00FF,  8, 0,  8, RGB2YUV_SHIFT + 8)
+RGB16_32FUNCS(AV_PIX_FMT_RGB32,     , rgb32,   0, 0, 16, 0,   0x00FF, 0xFF00, 0xFF0000,  8, 0,  8, RGB2YUV_SHIFT + 8)
+RGB16_32FUNCS(AV_PIX_FMT_RGB32_1,   , rgb321,  0, 0, 16, 8,   0x00FF, 0xFF00, 0xFF0000,  8, 0,  8, RGB2YUV_SHIFT + 8)
+RGB16_32FUNCS(AV_PIX_FMT_BGR565,  LE, bgr16le, 0, 0,  0, 0,   0x001F, 0x07E0,   0xF800, 11, 5,  0, RGB2YUV_SHIFT + 8)
+RGB16_32FUNCS(AV_PIX_FMT_BGR555,  LE, bgr15le, 0, 0,  0, 0,   0x001F, 0x03E0,   0x7C00, 10, 5,  0, RGB2YUV_SHIFT + 7)
+RGB16_32FUNCS(AV_PIX_FMT_BGR444,  LE, bgr12le, 0, 0,  0, 0,   0x000F, 0x00F0,   0x0F00,  8, 4,  0, RGB2YUV_SHIFT + 4)
+RGB16_32FUNCS(AV_PIX_FMT_RGB565,  LE, rgb16le, 0, 0,  0, 0,   0xF800, 0x07E0,   0x001F,  0, 5, 11, RGB2YUV_SHIFT + 8)
+RGB16_32FUNCS(AV_PIX_FMT_RGB555,  LE, rgb15le, 0, 0,  0, 0,   0x7C00, 0x03E0,   0x001F,  0, 5, 10, RGB2YUV_SHIFT + 7)
+RGB16_32FUNCS(AV_PIX_FMT_RGB444,  LE, rgb12le, 0, 0,  0, 0,   0x0F00, 0x00F0,   0x000F,  0, 4,  8, RGB2YUV_SHIFT + 4)
+RGB16_32FUNCS(AV_PIX_FMT_BGR565,  BE, bgr16be, 0, 0,  0, 0,   0x001F, 0x07E0,   0xF800, 11, 5,  0, RGB2YUV_SHIFT + 8)
+RGB16_32FUNCS(AV_PIX_FMT_BGR555,  BE, bgr15be, 0, 0,  0, 0,   0x001F, 0x03E0,   0x7C00, 10, 5,  0, RGB2YUV_SHIFT + 7)
+RGB16_32FUNCS(AV_PIX_FMT_BGR444,  BE, bgr12be, 0, 0,  0, 0,   0x000F, 0x00F0,   0x0F00,  8, 4,  0, RGB2YUV_SHIFT + 4)
+RGB16_32FUNCS(AV_PIX_FMT_RGB565,  BE, rgb16be, 0, 0,  0, 0,   0xF800, 0x07E0,   0x001F,  0, 5, 11, RGB2YUV_SHIFT + 8)
+RGB16_32FUNCS(AV_PIX_FMT_RGB555,  BE, rgb15be, 0, 0,  0, 0,   0x7C00, 0x03E0,   0x001F,  0, 5, 10, RGB2YUV_SHIFT + 7)
+RGB16_32FUNCS(AV_PIX_FMT_RGB444,  BE, rgb12be, 0, 0,  0, 0,   0x0F00, 0x00F0,   0x000F,  0, 4,  8, RGB2YUV_SHIFT + 4)
+RGB16_32FUNCS(AV_PIX_FMT_X2RGB10, LE, rgb30le, 16, 6, 0, 0, 0x3FF00000, 0xFFC00, 0x3FF, 0, 0, 4, RGB2YUV_SHIFT + 6)
+RGB16_32FUNCS(AV_PIX_FMT_X2BGR10, LE, bgr30le, 0, 6, 16, 0, 0x3FF, 0xFFC00, 0x3FF00000, 4, 0, 0, RGB2YUV_SHIFT + 6)
 
 static void gbr24pToUV_half_c(uint8_t *_dstU, uint8_t *_dstV,
                          const uint8_t *gsrc, const uint8_t *bsrc, const uint8_t *rsrc,
@@ -832,8 +848,6 @@ p01x_wrapper(10, 6)
 p01x_wrapper(12, 4)
 p01x_uv_wrapper(16, 0)
 
-#define input_pixel(pos) (isBE(origin) ? AV_RB16(pos) : AV_RL16(pos))
-
 static void bgr24ToY_c(uint8_t *_dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2,
                        int width, uint32_t *rgb2yuv, void *opq)
 {
-- 
2.34.1

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 5+ messages in thread

* [FFmpeg-devel] [PATCH v2 2/2] swscale/output: Don't call av_pix_fmt_desc_get() in a loop
  2022-09-16 14:52 [FFmpeg-devel] [PATCH v2 1/2] swscale/input: Avoid calls to av_pix_fmt_desc_get() Andreas Rheinhardt
@ 2022-09-16 14:55 ` Andreas Rheinhardt
  2022-09-19 14:34   ` Michael Niedermayer
  2022-09-19 14:32 ` [FFmpeg-devel] [PATCH v2 1/2] swscale/input: Avoid calls to av_pix_fmt_desc_get() Andreas Rheinhardt
  1 sibling, 1 reply; 5+ messages in thread
From: Andreas Rheinhardt @ 2022-09-16 14:55 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: Andreas Rheinhardt

Up until now, libswscale/output.c used a macro to write
an output pixel which involved a call to av_pix_fmt_desc_get()
to find out whether the output pixel format is BE or LE
despite this being known at compile-time (there are templates
per pixfmt). Even worse, these calls are made in a loop,
so that e.g. there are eight calls to av_pix_fmt_desc_get()
for every pixel processed in yuv2rgba64_X_c_template()
for 64bit RGB formats.

This commit modifies these macros to ensure that isBE()
is evaluated at compile-time. This saved 41184B of .text
for me (GCC 11.2, -O3). Of course, it also improved performance.
E.g. for ffmpeg_g -f lavfi -i testsrc2,format=yuva420p -pix_fmt rgba64le \
-threads 1  -t 1:00  -f null - (which uses yuv2rgba64le_X_c,
which is an invocation of yuv2rgba64_X_c_template() mentioned above),
performance improved from 95589 to 41387 decicycles for one call
to yuv2packedX; for the be variant the numbers went down from
76087 to 43024 decicycles.

Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
 libswscale/output.c | 100 +++++++++++++++++++++++++-------------------
 1 file changed, 58 insertions(+), 42 deletions(-)

diff --git a/libswscale/output.c b/libswscale/output.c
index 2f599698e9..0e1c1225a0 100644
--- a/libswscale/output.c
+++ b/libswscale/output.c
@@ -133,6 +133,11 @@ DECLARE_ALIGNED(8, const uint8_t, ff_dither_8x8_220)[][8] = {
 };
 #endif
 
+#define IS_BE_LE 0
+#define IS_BE_BE 1
+/* ENDIAN_IDENTIFIER needs to be "BE" or "LE". */
+#define IS_BE(ENDIAN_IDENTIFIER) IS_BE_ ## ENDIAN_IDENTIFIER
+
 #define output_pixel(pos, val, bias, signedness) \
     if (big_endian) { \
         AV_WB16(pos, bias + av_clip_ ## signedness ## 16(val >> shift)); \
@@ -935,7 +940,7 @@ YUV2PACKEDWRAPPER(yuv2, 422, uyvy422, AV_PIX_FMT_UYVY422)
 #define R_B ((target == AV_PIX_FMT_RGB48LE || target == AV_PIX_FMT_RGB48BE || target == AV_PIX_FMT_RGBA64LE || target == AV_PIX_FMT_RGBA64BE) ? R : B)
 #define B_R ((target == AV_PIX_FMT_RGB48LE || target == AV_PIX_FMT_RGB48BE || target == AV_PIX_FMT_RGBA64LE || target == AV_PIX_FMT_RGBA64BE) ? B : R)
 #define output_pixel(pos, val) \
-    if (isBE(target)) { \
+    if (is_be) { \
         AV_WB16(pos, val); \
     } else { \
         AV_WL16(pos, val); \
@@ -947,7 +952,8 @@ yuv2ya16_X_c_template(SwsContext *c, const int16_t *lumFilter,
                         const int16_t *chrFilter, const int32_t **unused_chrUSrc,
                         const int32_t **unused_chrVSrc, int unused_chrFilterSize,
                         const int32_t **alpSrc, uint16_t *dest, int dstW,
-                        int y, enum AVPixelFormat target, int unused_hasAlpha, int unused_eightbytes)
+                        int y, enum AVPixelFormat target,
+                        int unused_hasAlpha, int unused_eightbytes, int is_be)
 {
     int hasAlpha = !!alpSrc;
     int i;
@@ -984,7 +990,8 @@ yuv2ya16_2_c_template(SwsContext *c, const int32_t *buf[2],
                         const int32_t *unused_ubuf[2], const int32_t *unused_vbuf[2],
                         const int32_t *abuf[2], uint16_t *dest, int dstW,
                         int yalpha, int unused_uvalpha, int y,
-                        enum AVPixelFormat target, int unused_hasAlpha, int unused_eightbytes)
+                        enum AVPixelFormat target, int unused_hasAlpha,
+                        int unused_eightbytes, int is_be)
 {
     int hasAlpha = abuf && abuf[0] && abuf[1];
     const int32_t *buf0  = buf[0],  *buf1  = buf[1],
@@ -1015,7 +1022,8 @@ static av_always_inline void
 yuv2ya16_1_c_template(SwsContext *c, const int32_t *buf0,
                         const int32_t *unused_ubuf[2], const int32_t *unused_vbuf[2],
                         const int32_t *abuf0, uint16_t *dest, int dstW,
-                        int unused_uvalpha, int y, enum AVPixelFormat target, int unused_hasAlpha, int unused_eightbytes)
+                        int unused_uvalpha, int y, enum AVPixelFormat target,
+                        int unused_hasAlpha, int unused_eightbytes, int is_be)
 {
     int hasAlpha = !!abuf0;
     int i;
@@ -1043,7 +1051,8 @@ yuv2rgba64_X_c_template(SwsContext *c, const int16_t *lumFilter,
                        const int16_t *chrFilter, const int32_t **chrUSrc,
                        const int32_t **chrVSrc, int chrFilterSize,
                        const int32_t **alpSrc, uint16_t *dest, int dstW,
-                       int y, enum AVPixelFormat target, int hasAlpha, int eightbytes)
+                       int y, enum AVPixelFormat target, int hasAlpha, int eightbytes,
+                       int is_be)
 {
     int i;
     int A1 = 0xffff<<14, A2 = 0xffff<<14;
@@ -1124,7 +1133,8 @@ yuv2rgba64_2_c_template(SwsContext *c, const int32_t *buf[2],
                        const int32_t *ubuf[2], const int32_t *vbuf[2],
                        const int32_t *abuf[2], uint16_t *dest, int dstW,
                        int yalpha, int uvalpha, int y,
-                       enum AVPixelFormat target, int hasAlpha, int eightbytes)
+                       enum AVPixelFormat target, int hasAlpha, int eightbytes,
+                       int is_be)
 {
     const int32_t *buf0  = buf[0],  *buf1  = buf[1],
                   *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
@@ -1188,7 +1198,8 @@ static av_always_inline void
 yuv2rgba64_1_c_template(SwsContext *c, const int32_t *buf0,
                        const int32_t *ubuf[2], const int32_t *vbuf[2],
                        const int32_t *abuf0, uint16_t *dest, int dstW,
-                       int uvalpha, int y, enum AVPixelFormat target, int hasAlpha, int eightbytes)
+                       int uvalpha, int y, enum AVPixelFormat target,
+                       int hasAlpha, int eightbytes, int is_be)
 {
     const int32_t *ubuf0 = ubuf[0], *vbuf0 = vbuf[0];
     int i;
@@ -1293,7 +1304,8 @@ yuv2rgba64_full_X_c_template(SwsContext *c, const int16_t *lumFilter,
                        const int16_t *chrFilter, const int32_t **chrUSrc,
                        const int32_t **chrVSrc, int chrFilterSize,
                        const int32_t **alpSrc, uint16_t *dest, int dstW,
-                       int y, enum AVPixelFormat target, int hasAlpha, int eightbytes)
+                       int y, enum AVPixelFormat target, int hasAlpha,
+                       int eightbytes, int is_be)
 {
     int i;
     int A = 0xffff<<14;
@@ -1356,7 +1368,8 @@ yuv2rgba64_full_2_c_template(SwsContext *c, const int32_t *buf[2],
                        const int32_t *ubuf[2], const int32_t *vbuf[2],
                        const int32_t *abuf[2], uint16_t *dest, int dstW,
                        int yalpha, int uvalpha, int y,
-                       enum AVPixelFormat target, int hasAlpha, int eightbytes)
+                       enum AVPixelFormat target, int hasAlpha, int eightbytes,
+                       int is_be)
 {
     const int32_t *buf0  = buf[0],  *buf1  = buf[1],
                   *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
@@ -1407,7 +1420,8 @@ static av_always_inline void
 yuv2rgba64_full_1_c_template(SwsContext *c, const int32_t *buf0,
                        const int32_t *ubuf[2], const int32_t *vbuf[2],
                        const int32_t *abuf0, uint16_t *dest, int dstW,
-                       int uvalpha, int y, enum AVPixelFormat target, int hasAlpha, int eightbytes)
+                       int uvalpha, int y, enum AVPixelFormat target,
+                       int hasAlpha, int eightbytes, int is_be)
 {
     const int32_t *ubuf0 = ubuf[0], *vbuf0 = vbuf[0];
     int i;
@@ -1484,7 +1498,7 @@ yuv2rgba64_full_1_c_template(SwsContext *c, const int32_t *buf0,
 #undef r_b
 #undef b_r
 
-#define YUV2PACKED16WRAPPER(name, base, ext, fmt, hasAlpha, eightbytes) \
+#define YUV2PACKED16WRAPPER_EXT(name, base, ext, fmt, is_be, hasAlpha, eightbytes) \
 static void name ## ext ## _X_c(SwsContext *c, const int16_t *lumFilter, \
                         const int16_t **_lumSrc, int lumFilterSize, \
                         const int16_t *chrFilter, const int16_t **_chrUSrc, \
@@ -1499,7 +1513,7 @@ static void name ## ext ## _X_c(SwsContext *c, const int16_t *lumFilter, \
     uint16_t *dest = (uint16_t *) _dest; \
     name ## base ## _X_c_template(c, lumFilter, lumSrc, lumFilterSize, \
                           chrFilter, chrUSrc, chrVSrc, chrFilterSize, \
-                          alpSrc, dest, dstW, y, fmt, hasAlpha, eightbytes); \
+                          alpSrc, dest, dstW, y, fmt, hasAlpha, eightbytes, is_be); \
 } \
  \
 static void name ## ext ## _2_c(SwsContext *c, const int16_t *_buf[2], \
@@ -1513,7 +1527,7 @@ static void name ## ext ## _2_c(SwsContext *c, const int16_t *_buf[2], \
                   **abuf = (const int32_t **) _abuf; \
     uint16_t *dest = (uint16_t *) _dest; \
     name ## base ## _2_c_template(c, buf, ubuf, vbuf, abuf, \
-                          dest, dstW, yalpha, uvalpha, y, fmt, hasAlpha, eightbytes); \
+                          dest, dstW, yalpha, uvalpha, y, fmt, hasAlpha, eightbytes, is_be); \
 } \
  \
 static void name ## ext ## _1_c(SwsContext *c, const int16_t *_buf0, \
@@ -1527,36 +1541,38 @@ static void name ## ext ## _1_c(SwsContext *c, const int16_t *_buf0, \
                   *abuf0 = (const int32_t *)  _abuf0; \
     uint16_t *dest = (uint16_t *) _dest; \
     name ## base ## _1_c_template(c, buf0, ubuf, vbuf, abuf0, dest, \
-                                  dstW, uvalpha, y, fmt, hasAlpha, eightbytes); \
+                                  dstW, uvalpha, y, fmt, hasAlpha, eightbytes, is_be); \
 }
-
-YUV2PACKED16WRAPPER(yuv2, rgba64, rgb48be, AV_PIX_FMT_RGB48BE, 0, 0)
-YUV2PACKED16WRAPPER(yuv2, rgba64, rgb48le, AV_PIX_FMT_RGB48LE, 0, 0)
-YUV2PACKED16WRAPPER(yuv2, rgba64, bgr48be, AV_PIX_FMT_BGR48BE, 0, 0)
-YUV2PACKED16WRAPPER(yuv2, rgba64, bgr48le, AV_PIX_FMT_BGR48LE, 0, 0)
-YUV2PACKED16WRAPPER(yuv2, rgba64, rgba64be, AV_PIX_FMT_RGBA64BE, 1, 1)
-YUV2PACKED16WRAPPER(yuv2, rgba64, rgba64le, AV_PIX_FMT_RGBA64LE, 1, 1)
-YUV2PACKED16WRAPPER(yuv2, rgba64, rgbx64be, AV_PIX_FMT_RGBA64BE, 0, 1)
-YUV2PACKED16WRAPPER(yuv2, rgba64, rgbx64le, AV_PIX_FMT_RGBA64LE, 0, 1)
-YUV2PACKED16WRAPPER(yuv2, rgba64, bgra64be, AV_PIX_FMT_BGRA64BE, 1, 1)
-YUV2PACKED16WRAPPER(yuv2, rgba64, bgra64le, AV_PIX_FMT_BGRA64LE, 1, 1)
-YUV2PACKED16WRAPPER(yuv2, rgba64, bgrx64be, AV_PIX_FMT_BGRA64BE, 0, 1)
-YUV2PACKED16WRAPPER(yuv2, rgba64, bgrx64le, AV_PIX_FMT_BGRA64LE, 0, 1)
-YUV2PACKED16WRAPPER(yuv2, ya16, ya16be, AV_PIX_FMT_YA16BE, 1, 0)
-YUV2PACKED16WRAPPER(yuv2, ya16, ya16le, AV_PIX_FMT_YA16LE, 1, 0)
-
-YUV2PACKED16WRAPPER(yuv2, rgba64_full, rgb48be_full, AV_PIX_FMT_RGB48BE, 0, 0)
-YUV2PACKED16WRAPPER(yuv2, rgba64_full, rgb48le_full, AV_PIX_FMT_RGB48LE, 0, 0)
-YUV2PACKED16WRAPPER(yuv2, rgba64_full, bgr48be_full, AV_PIX_FMT_BGR48BE, 0, 0)
-YUV2PACKED16WRAPPER(yuv2, rgba64_full, bgr48le_full, AV_PIX_FMT_BGR48LE, 0, 0)
-YUV2PACKED16WRAPPER(yuv2, rgba64_full, rgba64be_full, AV_PIX_FMT_RGBA64BE, 1, 1)
-YUV2PACKED16WRAPPER(yuv2, rgba64_full, rgba64le_full, AV_PIX_FMT_RGBA64LE, 1, 1)
-YUV2PACKED16WRAPPER(yuv2, rgba64_full, rgbx64be_full, AV_PIX_FMT_RGBA64BE, 0, 1)
-YUV2PACKED16WRAPPER(yuv2, rgba64_full, rgbx64le_full, AV_PIX_FMT_RGBA64LE, 0, 1)
-YUV2PACKED16WRAPPER(yuv2, rgba64_full, bgra64be_full, AV_PIX_FMT_BGRA64BE, 1, 1)
-YUV2PACKED16WRAPPER(yuv2, rgba64_full, bgra64le_full, AV_PIX_FMT_BGRA64LE, 1, 1)
-YUV2PACKED16WRAPPER(yuv2, rgba64_full, bgrx64be_full, AV_PIX_FMT_BGRA64BE, 0, 1)
-YUV2PACKED16WRAPPER(yuv2, rgba64_full, bgrx64le_full, AV_PIX_FMT_BGRA64LE, 0, 1)
+#define YUV2PACKED16WRAPPER(name, base, ext, base_fmt, endianness, hasAlpha, eightbytes) \
+    YUV2PACKED16WRAPPER_EXT(name, base, ext, base_fmt ## endianness, IS_BE(endianness), hasAlpha, eightbytes)
+
+YUV2PACKED16WRAPPER(yuv2, rgba64, rgb48be, AV_PIX_FMT_RGB48, BE, 0, 0)
+YUV2PACKED16WRAPPER(yuv2, rgba64, rgb48le, AV_PIX_FMT_RGB48, LE, 0, 0)
+YUV2PACKED16WRAPPER(yuv2, rgba64, bgr48be, AV_PIX_FMT_BGR48, BE, 0, 0)
+YUV2PACKED16WRAPPER(yuv2, rgba64, bgr48le, AV_PIX_FMT_BGR48, LE, 0, 0)
+YUV2PACKED16WRAPPER(yuv2, rgba64, rgba64be, AV_PIX_FMT_RGBA64, BE, 1, 1)
+YUV2PACKED16WRAPPER(yuv2, rgba64, rgba64le, AV_PIX_FMT_RGBA64, LE, 1, 1)
+YUV2PACKED16WRAPPER(yuv2, rgba64, rgbx64be, AV_PIX_FMT_RGBA64, BE, 0, 1)
+YUV2PACKED16WRAPPER(yuv2, rgba64, rgbx64le, AV_PIX_FMT_RGBA64, LE, 0, 1)
+YUV2PACKED16WRAPPER(yuv2, rgba64, bgra64be, AV_PIX_FMT_BGRA64, BE, 1, 1)
+YUV2PACKED16WRAPPER(yuv2, rgba64, bgra64le, AV_PIX_FMT_BGRA64, LE, 1, 1)
+YUV2PACKED16WRAPPER(yuv2, rgba64, bgrx64be, AV_PIX_FMT_BGRA64, BE, 0, 1)
+YUV2PACKED16WRAPPER(yuv2, rgba64, bgrx64le, AV_PIX_FMT_BGRA64, LE, 0, 1)
+YUV2PACKED16WRAPPER(yuv2, ya16, ya16be, AV_PIX_FMT_YA16, BE, 1, 0)
+YUV2PACKED16WRAPPER(yuv2, ya16, ya16le, AV_PIX_FMT_YA16, LE, 1, 0)
+
+YUV2PACKED16WRAPPER(yuv2, rgba64_full, rgb48be_full, AV_PIX_FMT_RGB48, BE, 0, 0)
+YUV2PACKED16WRAPPER(yuv2, rgba64_full, rgb48le_full, AV_PIX_FMT_RGB48, LE, 0, 0)
+YUV2PACKED16WRAPPER(yuv2, rgba64_full, bgr48be_full, AV_PIX_FMT_BGR48, BE, 0, 0)
+YUV2PACKED16WRAPPER(yuv2, rgba64_full, bgr48le_full, AV_PIX_FMT_BGR48, LE, 0, 0)
+YUV2PACKED16WRAPPER(yuv2, rgba64_full, rgba64be_full, AV_PIX_FMT_RGBA64, BE, 1, 1)
+YUV2PACKED16WRAPPER(yuv2, rgba64_full, rgba64le_full, AV_PIX_FMT_RGBA64, LE, 1, 1)
+YUV2PACKED16WRAPPER(yuv2, rgba64_full, rgbx64be_full, AV_PIX_FMT_RGBA64, BE, 0, 1)
+YUV2PACKED16WRAPPER(yuv2, rgba64_full, rgbx64le_full, AV_PIX_FMT_RGBA64, LE, 0, 1)
+YUV2PACKED16WRAPPER(yuv2, rgba64_full, bgra64be_full, AV_PIX_FMT_BGRA64, BE, 1, 1)
+YUV2PACKED16WRAPPER(yuv2, rgba64_full, bgra64le_full, AV_PIX_FMT_BGRA64, LE, 1, 1)
+YUV2PACKED16WRAPPER(yuv2, rgba64_full, bgrx64be_full, AV_PIX_FMT_BGRA64, BE, 0, 1)
+YUV2PACKED16WRAPPER(yuv2, rgba64_full, bgrx64le_full, AV_PIX_FMT_BGRA64, LE, 0, 1)
 
 /*
  * Write out 2 RGB pixels in the target pixel format. This function takes a
-- 
2.34.1

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [FFmpeg-devel] [PATCH v2 1/2] swscale/input: Avoid calls to av_pix_fmt_desc_get()
  2022-09-16 14:52 [FFmpeg-devel] [PATCH v2 1/2] swscale/input: Avoid calls to av_pix_fmt_desc_get() Andreas Rheinhardt
  2022-09-16 14:55 ` [FFmpeg-devel] [PATCH v2 2/2] swscale/output: Don't call av_pix_fmt_desc_get() in a loop Andreas Rheinhardt
@ 2022-09-19 14:32 ` Andreas Rheinhardt
  1 sibling, 0 replies; 5+ messages in thread
From: Andreas Rheinhardt @ 2022-09-19 14:32 UTC (permalink / raw)
  To: ffmpeg-devel

Andreas Rheinhardt:
> Up until now, libswscale/input.c used a macro to read
> an input pixel which involved a call to av_pix_fmt_desc_get()
> to find out whether the input pixel format is BE or LE
> despite this being known at compile-time (there are templates
> per pixfmt). Even worse, these calls are made in a loop,
> so that e.g. there are six calls to av_pix_fmt_desc_get()
> for every pair of UV pixels processed in
> rgb64ToUV_half_c_template().
> 
> This commit modifies these macros to ensure that isBE()
> is evaluated at compile-time. This saved 9743B of .text
> for me (GCC 11.2, -O3). For a simple RGB64LE->YUV420P
> transformation like
> ffmpeg -f lavfi -i haldclutsrc,format=rgba64le -pix_fmt yuv420p \
> -threads 1  -t 1:00  -f null -
> the amount of decicycles spent in rgb64LEToUV_half_c
> (which is created via the template mentioned above)
> decreases from 19751 to 5341; for RGBA64BE the number
> went down from 11945 to 5393. For shared builds (where
> the call to av_pix_fmt_desc_get() is indirect) the old numbers
> are 15230 for RGBA64BE and 27502 for RGBA64LE, whereas
> the numbers with this patch are indistinguishable from
> the numbers from a static build.
> 
> Also make the macros that are touched conform to the
> usual convention of using uppercase names while at it.
> 
> Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
> ---
>  libswscale/input.c | 122 +++++++++++++++++++++++++--------------------
>  1 file changed, 68 insertions(+), 54 deletions(-)
> 
> diff --git a/libswscale/input.c b/libswscale/input.c
> index 88e318e664..7ff7bfaa01 100644
> --- a/libswscale/input.c
> +++ b/libswscale/input.c
> @@ -28,14 +28,21 @@
>  #include "config.h"
>  #include "swscale_internal.h"
>  
> -#define input_pixel(pos) (isBE(origin) ? AV_RB16(pos) : AV_RL16(pos))
> +#define input_pixel(pos) (is_be ? AV_RB16(pos) : AV_RL16(pos))
> +
> +#define IS_BE_LE 0
> +#define IS_BE_BE 1
> +#define IS_BE_   0
> +/* ENDIAN_IDENTIFIER needs to be "BE", "LE" or "". The latter is intended
> + * for single-byte cases where the concept of endianness does not apply. */
> +#define IS_BE(ENDIAN_IDENTIFIER) IS_BE_ ## ENDIAN_IDENTIFIER
>  
>  #define r ((origin == AV_PIX_FMT_BGR48BE || origin == AV_PIX_FMT_BGR48LE || origin == AV_PIX_FMT_BGRA64BE || origin == AV_PIX_FMT_BGRA64LE) ? b_r : r_b)
>  #define b ((origin == AV_PIX_FMT_BGR48BE || origin == AV_PIX_FMT_BGR48LE || origin == AV_PIX_FMT_BGRA64BE || origin == AV_PIX_FMT_BGRA64LE) ? r_b : b_r)
>  
>  static av_always_inline void
>  rgb64ToY_c_template(uint16_t *dst, const uint16_t *src, int width,
> -                    enum AVPixelFormat origin, int32_t *rgb2yuv)
> +                    enum AVPixelFormat origin, int32_t *rgb2yuv, int is_be)
>  {
>      int32_t ry = rgb2yuv[RY_IDX], gy = rgb2yuv[GY_IDX], by = rgb2yuv[BY_IDX];
>      int i;
> @@ -51,7 +58,7 @@ rgb64ToY_c_template(uint16_t *dst, const uint16_t *src, int width,
>  static av_always_inline void
>  rgb64ToUV_c_template(uint16_t *dstU, uint16_t *dstV,
>                      const uint16_t *src1, const uint16_t *src2,
> -                    int width, enum AVPixelFormat origin, int32_t *rgb2yuv)
> +                    int width, enum AVPixelFormat origin, int32_t *rgb2yuv, int is_be)
>  {
>      int i;
>      int32_t ru = rgb2yuv[RU_IDX], gu = rgb2yuv[GU_IDX], bu = rgb2yuv[BU_IDX];
> @@ -70,7 +77,7 @@ rgb64ToUV_c_template(uint16_t *dstU, uint16_t *dstV,
>  static av_always_inline void
>  rgb64ToUV_half_c_template(uint16_t *dstU, uint16_t *dstV,
>                            const uint16_t *src1, const uint16_t *src2,
> -                          int width, enum AVPixelFormat origin, int32_t *rgb2yuv)
> +                          int width, enum AVPixelFormat origin, int32_t *rgb2yuv, int is_be)
>  {
>      int i;
>      int32_t ru = rgb2yuv[RU_IDX], gu = rgb2yuv[GU_IDX], bu = rgb2yuv[BU_IDX];
> @@ -86,13 +93,13 @@ rgb64ToUV_half_c_template(uint16_t *dstU, uint16_t *dstV,
>      }
>  }
>  
> -#define rgb64funcs(pattern, BE_LE, origin) \
> +#define RGB64FUNCS_EXT(pattern, BE_LE, origin, is_be) \
>  static void pattern ## 64 ## BE_LE ## ToY_c(uint8_t *_dst, const uint8_t *_src, const uint8_t *unused0, const uint8_t *unused1,\
>                                      int width, uint32_t *rgb2yuv, void *opq) \
>  { \
>      const uint16_t *src = (const uint16_t *) _src; \
>      uint16_t *dst = (uint16_t *) _dst; \
> -    rgb64ToY_c_template(dst, src, width, origin, rgb2yuv); \
> +    rgb64ToY_c_template(dst, src, width, origin, rgb2yuv, is_be); \
>  } \
>   \
>  static void pattern ## 64 ## BE_LE ## ToUV_c(uint8_t *_dstU, uint8_t *_dstV, \
> @@ -102,7 +109,7 @@ static void pattern ## 64 ## BE_LE ## ToUV_c(uint8_t *_dstU, uint8_t *_dstV, \
>      const uint16_t *src1 = (const uint16_t *) _src1, \
>                     *src2 = (const uint16_t *) _src2; \
>      uint16_t *dstU = (uint16_t *) _dstU, *dstV = (uint16_t *) _dstV; \
> -    rgb64ToUV_c_template(dstU, dstV, src1, src2, width, origin, rgb2yuv); \
> +    rgb64ToUV_c_template(dstU, dstV, src1, src2, width, origin, rgb2yuv, is_be); \
>  } \
>   \
>  static void pattern ## 64 ## BE_LE ## ToUV_half_c(uint8_t *_dstU, uint8_t *_dstV, \
> @@ -112,18 +119,20 @@ static void pattern ## 64 ## BE_LE ## ToUV_half_c(uint8_t *_dstU, uint8_t *_dstV
>      const uint16_t *src1 = (const uint16_t *) _src1, \
>                     *src2 = (const uint16_t *) _src2; \
>      uint16_t *dstU = (uint16_t *) _dstU, *dstV = (uint16_t *) _dstV; \
> -    rgb64ToUV_half_c_template(dstU, dstV, src1, src2, width, origin, rgb2yuv); \
> +    rgb64ToUV_half_c_template(dstU, dstV, src1, src2, width, origin, rgb2yuv, is_be); \
>  }
> +#define RGB64FUNCS(pattern, endianness, base_fmt) \
> +        RGB64FUNCS_EXT(pattern, endianness, base_fmt ## endianness, IS_BE(endianness))
>  
> -rgb64funcs(rgb, LE, AV_PIX_FMT_RGBA64LE)
> -rgb64funcs(rgb, BE, AV_PIX_FMT_RGBA64BE)
> -rgb64funcs(bgr, LE, AV_PIX_FMT_BGRA64LE)
> -rgb64funcs(bgr, BE, AV_PIX_FMT_BGRA64BE)
> +RGB64FUNCS(rgb, LE, AV_PIX_FMT_RGBA64)
> +RGB64FUNCS(rgb, BE, AV_PIX_FMT_RGBA64)
> +RGB64FUNCS(bgr, LE, AV_PIX_FMT_BGRA64)
> +RGB64FUNCS(bgr, BE, AV_PIX_FMT_BGRA64)
>  
>  static av_always_inline void rgb48ToY_c_template(uint16_t *dst,
>                                                   const uint16_t *src, int width,
>                                                   enum AVPixelFormat origin,
> -                                                 int32_t *rgb2yuv)
> +                                                 int32_t *rgb2yuv, int is_be)
>  {
>      int32_t ry = rgb2yuv[RY_IDX], gy = rgb2yuv[GY_IDX], by = rgb2yuv[BY_IDX];
>      int i;
> @@ -142,7 +151,7 @@ static av_always_inline void rgb48ToUV_c_template(uint16_t *dstU,
>                                                    const uint16_t *src2,
>                                                    int width,
>                                                    enum AVPixelFormat origin,
> -                                                  int32_t *rgb2yuv)
> +                                                  int32_t *rgb2yuv, int is_be)
>  {
>      int i;
>      int32_t ru = rgb2yuv[RU_IDX], gu = rgb2yuv[GU_IDX], bu = rgb2yuv[BU_IDX];
> @@ -164,7 +173,7 @@ static av_always_inline void rgb48ToUV_half_c_template(uint16_t *dstU,
>                                                         const uint16_t *src2,
>                                                         int width,
>                                                         enum AVPixelFormat origin,
> -                                                       int32_t *rgb2yuv)
> +                                                       int32_t *rgb2yuv, int is_be)
>  {
>      int i;
>      int32_t ru = rgb2yuv[RU_IDX], gu = rgb2yuv[GU_IDX], bu = rgb2yuv[BU_IDX];
> @@ -187,7 +196,7 @@ static av_always_inline void rgb48ToUV_half_c_template(uint16_t *dstU,
>  #undef b
>  #undef input_pixel
>  
> -#define rgb48funcs(pattern, BE_LE, origin)                              \
> +#define RGB48FUNCS_EXT(pattern, BE_LE, origin, is_be)                   \
>  static void pattern ## 48 ## BE_LE ## ToY_c(uint8_t *_dst,              \
>                                              const uint8_t *_src,        \
>                                              const uint8_t *unused0, const uint8_t *unused1,\
> @@ -197,7 +206,7 @@ static void pattern ## 48 ## BE_LE ## ToY_c(uint8_t *_dst,              \
>  {                                                                       \
>      const uint16_t *src = (const uint16_t *)_src;                       \
>      uint16_t *dst       = (uint16_t *)_dst;                             \
> -    rgb48ToY_c_template(dst, src, width, origin, rgb2yuv);              \
> +    rgb48ToY_c_template(dst, src, width, origin, rgb2yuv, is_be);       \
>  }                                                                       \
>                                                                          \
>  static void pattern ## 48 ## BE_LE ## ToUV_c(uint8_t *_dstU,            \
> @@ -213,7 +222,7 @@ static void pattern ## 48 ## BE_LE ## ToUV_c(uint8_t *_dstU,            \
>                     *src2 = (const uint16_t *)_src2;                     \
>      uint16_t *dstU = (uint16_t *)_dstU,                                 \
>               *dstV = (uint16_t *)_dstV;                                 \
> -    rgb48ToUV_c_template(dstU, dstV, src1, src2, width, origin, rgb2yuv);        \
> +    rgb48ToUV_c_template(dstU, dstV, src1, src2, width, origin, rgb2yuv, is_be); \
>  }                                                                       \
>                                                                          \
>  static void pattern ## 48 ## BE_LE ## ToUV_half_c(uint8_t *_dstU,       \
> @@ -229,13 +238,15 @@ static void pattern ## 48 ## BE_LE ## ToUV_half_c(uint8_t *_dstU,       \
>                     *src2 = (const uint16_t *)_src2;                     \
>      uint16_t *dstU = (uint16_t *)_dstU,                                 \
>               *dstV = (uint16_t *)_dstV;                                 \
> -    rgb48ToUV_half_c_template(dstU, dstV, src1, src2, width, origin, rgb2yuv);   \
> +    rgb48ToUV_half_c_template(dstU, dstV, src1, src2, width, origin, rgb2yuv, is_be); \
>  }
> +#define RGB48FUNCS(pattern, endianness, base_fmt) \
> +        RGB48FUNCS_EXT(pattern, endianness, base_fmt ## endianness, IS_BE(endianness))
>  
> -rgb48funcs(rgb, LE, AV_PIX_FMT_RGB48LE)
> -rgb48funcs(rgb, BE, AV_PIX_FMT_RGB48BE)
> -rgb48funcs(bgr, LE, AV_PIX_FMT_BGR48LE)
> -rgb48funcs(bgr, BE, AV_PIX_FMT_BGR48BE)
> +RGB48FUNCS(rgb, LE, AV_PIX_FMT_RGB48)
> +RGB48FUNCS(rgb, BE, AV_PIX_FMT_RGB48)
> +RGB48FUNCS(bgr, LE, AV_PIX_FMT_BGR48)
> +RGB48FUNCS(bgr, BE, AV_PIX_FMT_BGR48)
>  
>  #define input_pixel(i) ((origin == AV_PIX_FMT_RGBA ||                      \
>                           origin == AV_PIX_FMT_BGRA ||                      \
> @@ -245,7 +256,7 @@ rgb48funcs(bgr, BE, AV_PIX_FMT_BGR48BE)
>                          : ((origin == AV_PIX_FMT_X2RGB10LE ||              \
>                              origin == AV_PIX_FMT_X2BGR10LE)                \
>                             ? AV_RL32(&src[(i) * 4])                        \
> -                           : (isBE(origin) ? AV_RB16(&src[(i) * 2])        \
> +                           : (is_be ? AV_RB16(&src[(i) * 2])               \
>                                : AV_RL16(&src[(i) * 2]))))
>  
>  static av_always_inline void rgb16_32ToY_c_template(int16_t *dst,
> @@ -257,7 +268,7 @@ static av_always_inline void rgb16_32ToY_c_template(int16_t *dst,
>                                                      int maskr, int maskg,
>                                                      int maskb, int rsh,
>                                                      int gsh, int bsh, int S,
> -                                                    int32_t *rgb2yuv)
> +                                                    int32_t *rgb2yuv, int is_be)
>  {
>      const int ry       = rgb2yuv[RY_IDX]<<rsh, gy = rgb2yuv[GY_IDX]<<gsh, by = rgb2yuv[BY_IDX]<<bsh;
>      const unsigned rnd = (32<<((S)-1)) + (1<<(S-7));
> @@ -283,7 +294,7 @@ static av_always_inline void rgb16_32ToUV_c_template(int16_t *dstU,
>                                                       int maskr, int maskg,
>                                                       int maskb, int rsh,
>                                                       int gsh, int bsh, int S,
> -                                                     int32_t *rgb2yuv)
> +                                                     int32_t *rgb2yuv, int is_be)
>  {
>      const int ru       = rgb2yuv[RU_IDX] * (1 << rsh), gu = rgb2yuv[GU_IDX] * (1 << gsh), bu = rgb2yuv[BU_IDX] * (1 << bsh),
>                rv       = rgb2yuv[RV_IDX] * (1 << rsh), gv = rgb2yuv[GV_IDX] * (1 << gsh), bv = rgb2yuv[BV_IDX] * (1 << bsh);
> @@ -311,7 +322,7 @@ static av_always_inline void rgb16_32ToUV_half_c_template(int16_t *dstU,
>                                                            int maskr, int maskg,
>                                                            int maskb, int rsh,
>                                                            int gsh, int bsh, int S,
> -                                                          int32_t *rgb2yuv)
> +                                                          int32_t *rgb2yuv, int is_be)
>  {
>      const int ru       = rgb2yuv[RU_IDX] * (1 << rsh), gu = rgb2yuv[GU_IDX] * (1 << gsh), bu = rgb2yuv[BU_IDX] * (1 << bsh),
>                rv       = rgb2yuv[RV_IDX] * (1 << rsh), gv = rgb2yuv[GV_IDX] * (1 << gsh), bv = rgb2yuv[BV_IDX] * (1 << bsh),
> @@ -345,13 +356,13 @@ static av_always_inline void rgb16_32ToUV_half_c_template(int16_t *dstU,
>  
>  #undef input_pixel
>  
> -#define rgb16_32_wrapper(fmt, name, shr, shg, shb, shp, maskr,          \
> -                         maskg, maskb, rsh, gsh, bsh, S)                \
> +#define RGB16_32FUNCS_EXT(fmt, name, shr, shg, shb, shp, maskr,         \
> +                          maskg, maskb, rsh, gsh, bsh, S, is_be)        \
>  static void name ## ToY_c(uint8_t *dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2,            \
>                            int width, uint32_t *tab, void *opq)          \
>  {                                                                       \
>      rgb16_32ToY_c_template((int16_t*)dst, src, width, fmt, shr, shg, shb, shp,    \
> -                           maskr, maskg, maskb, rsh, gsh, bsh, S, tab); \
> +                           maskr, maskg, maskb, rsh, gsh, bsh, S, tab, is_be); \
>  }                                                                       \
>                                                                          \
>  static void name ## ToUV_c(uint8_t *dstU, uint8_t *dstV,                \
> @@ -360,7 +371,7 @@ static void name ## ToUV_c(uint8_t *dstU, uint8_t *dstV,                \
>  {                                                                       \
>      rgb16_32ToUV_c_template((int16_t*)dstU, (int16_t*)dstV, src, width, fmt,                \
>                              shr, shg, shb, shp,                         \
> -                            maskr, maskg, maskb, rsh, gsh, bsh, S, tab);\
> +                            maskr, maskg, maskb, rsh, gsh, bsh, S, tab, is_be); \
>  }                                                                       \
>                                                                          \
>  static void name ## ToUV_half_c(uint8_t *dstU, uint8_t *dstV,           \
> @@ -371,27 +382,32 @@ static void name ## ToUV_half_c(uint8_t *dstU, uint8_t *dstV,           \
>      rgb16_32ToUV_half_c_template((int16_t*)dstU, (int16_t*)dstV, src, width, fmt,           \
>                                   shr, shg, shb, shp,                    \
>                                   maskr, maskg, maskb,                   \
> -                                 rsh, gsh, bsh, S, tab);                \
> -}
> -
> -rgb16_32_wrapper(AV_PIX_FMT_BGR32,    bgr32,  16, 0,  0, 0, 0xFF0000, 0xFF00,   0x00FF,  8, 0,  8, RGB2YUV_SHIFT + 8)
> -rgb16_32_wrapper(AV_PIX_FMT_BGR32_1,  bgr321, 16, 0,  0, 8, 0xFF0000, 0xFF00,   0x00FF,  8, 0,  8, RGB2YUV_SHIFT + 8)
> -rgb16_32_wrapper(AV_PIX_FMT_RGB32,    rgb32,   0, 0, 16, 0,   0x00FF, 0xFF00, 0xFF0000,  8, 0,  8, RGB2YUV_SHIFT + 8)
> -rgb16_32_wrapper(AV_PIX_FMT_RGB32_1,  rgb321,  0, 0, 16, 8,   0x00FF, 0xFF00, 0xFF0000,  8, 0,  8, RGB2YUV_SHIFT + 8)
> -rgb16_32_wrapper(AV_PIX_FMT_BGR565LE, bgr16le, 0, 0,  0, 0,   0x001F, 0x07E0,   0xF800, 11, 5,  0, RGB2YUV_SHIFT + 8)
> -rgb16_32_wrapper(AV_PIX_FMT_BGR555LE, bgr15le, 0, 0,  0, 0,   0x001F, 0x03E0,   0x7C00, 10, 5,  0, RGB2YUV_SHIFT + 7)
> -rgb16_32_wrapper(AV_PIX_FMT_BGR444LE, bgr12le, 0, 0,  0, 0,   0x000F, 0x00F0,   0x0F00,  8, 4,  0, RGB2YUV_SHIFT + 4)
> -rgb16_32_wrapper(AV_PIX_FMT_RGB565LE, rgb16le, 0, 0,  0, 0,   0xF800, 0x07E0,   0x001F,  0, 5, 11, RGB2YUV_SHIFT + 8)
> -rgb16_32_wrapper(AV_PIX_FMT_RGB555LE, rgb15le, 0, 0,  0, 0,   0x7C00, 0x03E0,   0x001F,  0, 5, 10, RGB2YUV_SHIFT + 7)
> -rgb16_32_wrapper(AV_PIX_FMT_RGB444LE, rgb12le, 0, 0,  0, 0,   0x0F00, 0x00F0,   0x000F,  0, 4,  8, RGB2YUV_SHIFT + 4)
> -rgb16_32_wrapper(AV_PIX_FMT_BGR565BE, bgr16be, 0, 0,  0, 0,   0x001F, 0x07E0,   0xF800, 11, 5,  0, RGB2YUV_SHIFT + 8)
> -rgb16_32_wrapper(AV_PIX_FMT_BGR555BE, bgr15be, 0, 0,  0, 0,   0x001F, 0x03E0,   0x7C00, 10, 5,  0, RGB2YUV_SHIFT + 7)
> -rgb16_32_wrapper(AV_PIX_FMT_BGR444BE, bgr12be, 0, 0,  0, 0,   0x000F, 0x00F0,   0x0F00,  8, 4,  0, RGB2YUV_SHIFT + 4)
> -rgb16_32_wrapper(AV_PIX_FMT_RGB565BE, rgb16be, 0, 0,  0, 0,   0xF800, 0x07E0,   0x001F,  0, 5, 11, RGB2YUV_SHIFT + 8)
> -rgb16_32_wrapper(AV_PIX_FMT_RGB555BE, rgb15be, 0, 0,  0, 0,   0x7C00, 0x03E0,   0x001F,  0, 5, 10, RGB2YUV_SHIFT + 7)
> -rgb16_32_wrapper(AV_PIX_FMT_RGB444BE, rgb12be, 0, 0,  0, 0,   0x0F00, 0x00F0,   0x000F,  0, 4,  8, RGB2YUV_SHIFT + 4)
> -rgb16_32_wrapper(AV_PIX_FMT_X2RGB10LE, rgb30le, 16, 6, 0, 0, 0x3FF00000, 0xFFC00, 0x3FF, 0, 0, 4, RGB2YUV_SHIFT + 6)
> -rgb16_32_wrapper(AV_PIX_FMT_X2BGR10LE, bgr30le, 0, 6, 16, 0, 0x3FF, 0xFFC00, 0x3FF00000, 4, 0, 0, RGB2YUV_SHIFT + 6)
> +                                 rsh, gsh, bsh, S, tab, is_be);         \
> +}
> +
> +#define RGB16_32FUNCS(base_fmt, endianness, name, shr, shg, shb, shp, maskr, \
> +                      maskg, maskb, rsh, gsh, bsh, S) \
> +    RGB16_32FUNCS_EXT(base_fmt ## endianness, name, shr, shg, shb, shp, maskr, \
> +                      maskg, maskb, rsh, gsh, bsh, S, IS_BE(endianness))
> +
> +RGB16_32FUNCS(AV_PIX_FMT_BGR32,     , bgr32,  16, 0,  0, 0, 0xFF0000, 0xFF00,   0x00FF,  8, 0,  8, RGB2YUV_SHIFT + 8)
> +RGB16_32FUNCS(AV_PIX_FMT_BGR32_1,   , bgr321, 16, 0,  0, 8, 0xFF0000, 0xFF00,   0x00FF,  8, 0,  8, RGB2YUV_SHIFT + 8)
> +RGB16_32FUNCS(AV_PIX_FMT_RGB32,     , rgb32,   0, 0, 16, 0,   0x00FF, 0xFF00, 0xFF0000,  8, 0,  8, RGB2YUV_SHIFT + 8)
> +RGB16_32FUNCS(AV_PIX_FMT_RGB32_1,   , rgb321,  0, 0, 16, 8,   0x00FF, 0xFF00, 0xFF0000,  8, 0,  8, RGB2YUV_SHIFT + 8)
> +RGB16_32FUNCS(AV_PIX_FMT_BGR565,  LE, bgr16le, 0, 0,  0, 0,   0x001F, 0x07E0,   0xF800, 11, 5,  0, RGB2YUV_SHIFT + 8)
> +RGB16_32FUNCS(AV_PIX_FMT_BGR555,  LE, bgr15le, 0, 0,  0, 0,   0x001F, 0x03E0,   0x7C00, 10, 5,  0, RGB2YUV_SHIFT + 7)
> +RGB16_32FUNCS(AV_PIX_FMT_BGR444,  LE, bgr12le, 0, 0,  0, 0,   0x000F, 0x00F0,   0x0F00,  8, 4,  0, RGB2YUV_SHIFT + 4)
> +RGB16_32FUNCS(AV_PIX_FMT_RGB565,  LE, rgb16le, 0, 0,  0, 0,   0xF800, 0x07E0,   0x001F,  0, 5, 11, RGB2YUV_SHIFT + 8)
> +RGB16_32FUNCS(AV_PIX_FMT_RGB555,  LE, rgb15le, 0, 0,  0, 0,   0x7C00, 0x03E0,   0x001F,  0, 5, 10, RGB2YUV_SHIFT + 7)
> +RGB16_32FUNCS(AV_PIX_FMT_RGB444,  LE, rgb12le, 0, 0,  0, 0,   0x0F00, 0x00F0,   0x000F,  0, 4,  8, RGB2YUV_SHIFT + 4)
> +RGB16_32FUNCS(AV_PIX_FMT_BGR565,  BE, bgr16be, 0, 0,  0, 0,   0x001F, 0x07E0,   0xF800, 11, 5,  0, RGB2YUV_SHIFT + 8)
> +RGB16_32FUNCS(AV_PIX_FMT_BGR555,  BE, bgr15be, 0, 0,  0, 0,   0x001F, 0x03E0,   0x7C00, 10, 5,  0, RGB2YUV_SHIFT + 7)
> +RGB16_32FUNCS(AV_PIX_FMT_BGR444,  BE, bgr12be, 0, 0,  0, 0,   0x000F, 0x00F0,   0x0F00,  8, 4,  0, RGB2YUV_SHIFT + 4)
> +RGB16_32FUNCS(AV_PIX_FMT_RGB565,  BE, rgb16be, 0, 0,  0, 0,   0xF800, 0x07E0,   0x001F,  0, 5, 11, RGB2YUV_SHIFT + 8)
> +RGB16_32FUNCS(AV_PIX_FMT_RGB555,  BE, rgb15be, 0, 0,  0, 0,   0x7C00, 0x03E0,   0x001F,  0, 5, 10, RGB2YUV_SHIFT + 7)
> +RGB16_32FUNCS(AV_PIX_FMT_RGB444,  BE, rgb12be, 0, 0,  0, 0,   0x0F00, 0x00F0,   0x000F,  0, 4,  8, RGB2YUV_SHIFT + 4)
> +RGB16_32FUNCS(AV_PIX_FMT_X2RGB10, LE, rgb30le, 16, 6, 0, 0, 0x3FF00000, 0xFFC00, 0x3FF, 0, 0, 4, RGB2YUV_SHIFT + 6)
> +RGB16_32FUNCS(AV_PIX_FMT_X2BGR10, LE, bgr30le, 0, 6, 16, 0, 0x3FF, 0xFFC00, 0x3FF00000, 4, 0, 0, RGB2YUV_SHIFT + 6)
>  
>  static void gbr24pToUV_half_c(uint8_t *_dstU, uint8_t *_dstV,
>                           const uint8_t *gsrc, const uint8_t *bsrc, const uint8_t *rsrc,
> @@ -832,8 +848,6 @@ p01x_wrapper(10, 6)
>  p01x_wrapper(12, 4)
>  p01x_uv_wrapper(16, 0)
>  
> -#define input_pixel(pos) (isBE(origin) ? AV_RB16(pos) : AV_RL16(pos))
> -
>  static void bgr24ToY_c(uint8_t *_dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2,
>                         int width, uint32_t *rgb2yuv, void *opq)
>  {

Will apply this patchset tomorrow unless there are objections.

- Andreas
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [FFmpeg-devel] [PATCH v2 2/2] swscale/output: Don't call av_pix_fmt_desc_get() in a loop
  2022-09-16 14:55 ` [FFmpeg-devel] [PATCH v2 2/2] swscale/output: Don't call av_pix_fmt_desc_get() in a loop Andreas Rheinhardt
@ 2022-09-19 14:34   ` Michael Niedermayer
  2022-09-19 14:36     ` Andreas Rheinhardt
  0 siblings, 1 reply; 5+ messages in thread
From: Michael Niedermayer @ 2022-09-19 14:34 UTC (permalink / raw)
  To: FFmpeg development discussions and patches


[-- Attachment #1.1: Type: text/plain, Size: 1756 bytes --]

On Fri, Sep 16, 2022 at 04:55:39PM +0200, Andreas Rheinhardt wrote:
> Up until now, libswscale/output.c used a macro to write
> an output pixel which involved a call to av_pix_fmt_desc_get()
> to find out whether the output pixel format is BE or LE
> despite this being known at compile-time (there are templates
> per pixfmt). Even worse, these calls are made in a loop,
> so that e.g. there are eight calls to av_pix_fmt_desc_get()
> for every pixel processed in yuv2rgba64_X_c_template()
> for 64bit RGB formats.
> 
> This commit modifies these macros to ensure that isBE()
> is evaluated at compile-time. This saved 41184B of .text
> for me (GCC 11.2, -O3). Of course, it also improved performance.
> E.g. ffmpeg_g -f lavfi -i testsrc2,format=yuva420p -pix_fmt rgba64le \
> -threads 1  -t 1:00  -f null - (which uses yuv2rgba64le_X_c,
> which is an invocation of yuv2rgba64_X_c_template() mentioned above),
> performance improved from 95589 to 41387 decicycles for one call
> to yuv2packedX; for the be variant the numbers went down from
> 76087 to 43024 decicycles.
> 
> Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
> ---
>  libswscale/output.c | 100 +++++++++++++++++++++++++-------------------
>  1 file changed, 58 insertions(+), 42 deletions(-)

This looks a lot better than before

thx

PS: I still think that broader support for compile-time evaluation of
"pure" functions would be useful. Ideally with minimal mess on the source
side, more on the build tool side


[...]
-- 
Michael     GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB

When you are offended at any man's fault, turn to yourself and study your
own failings. Then you will forget your anger. -- Epictetus

[-- Attachment #1.2: signature.asc --]
[-- Type: application/pgp-signature, Size: 195 bytes --]

[-- Attachment #2: Type: text/plain, Size: 251 bytes --]

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [FFmpeg-devel] [PATCH v2 2/2] swscale/output: Don't call av_pix_fmt_desc_get() in a loop
  2022-09-19 14:34   ` Michael Niedermayer
@ 2022-09-19 14:36     ` Andreas Rheinhardt
  0 siblings, 0 replies; 5+ messages in thread
From: Andreas Rheinhardt @ 2022-09-19 14:36 UTC (permalink / raw)
  To: ffmpeg-devel

Michael Niedermayer:
> On Fri, Sep 16, 2022 at 04:55:39PM +0200, Andreas Rheinhardt wrote:
>> Up until now, libswscale/output.c used a macro to write
>> an output pixel which involved a call to av_pix_fmt_desc_get()
>> to find out whether the output pixel format is BE or LE
>> despite this being known at compile-time (there are templates
>> per pixfmt). Even worse, these calls are made in a loop,
>> so that e.g. there are eight calls to av_pix_fmt_desc_get()
>> for every pixel processed in yuv2rgba64_X_c_template()
>> for 64bit RGB formats.
>>
>> This commit modifies these macros to ensure that isBE()
>> is evaluated at compile-time. This saved 41184B of .text
>> for me (GCC 11.2, -O3). Of course, it also improved performance.
>> E.g. ffmpeg_g -f lavfi -i testsrc2,format=yuva420p -pix_fmt rgba64le \
>> -threads 1  -t 1:00  -f null - (which uses yuv2rgba64le_X_c,
>> which is an invocation of yuv2rgba64_X_c_template() mentioned above),
>> performance improved from 95589 to 41387 decicycles for one call
>> to yuv2packedX; for the be variant the numbers went down from
>> 76087 to 43024 decicycles.
>>
>> Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
>> ---
>>  libswscale/output.c | 100 +++++++++++++++++++++++++-------------------
>>  1 file changed, 58 insertions(+), 42 deletions(-)
> 
> This looks a lot better than before
> 
> thx
> 
> PS: I still think that broader support for compile-time evaluation of
> "pure" functions would be useful. Ideally with minimal mess on the source
> side, and more on the build tool side.
> 

I agree with that. Hopefully we find a solution.

- Andreas
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2022-09-19 14:36 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-09-16 14:52 [FFmpeg-devel] [PATCH v2 1/2] swscale/input: Avoid calls to av_pix_fmt_desc_get() Andreas Rheinhardt
2022-09-16 14:55 ` [FFmpeg-devel] [PATCH v2 2/2] swscale/output: Don't call av_pix_fmt_desc_get() in a loop Andreas Rheinhardt
2022-09-19 14:34   ` Michael Niedermayer
2022-09-19 14:36     ` Andreas Rheinhardt
2022-09-19 14:32 ` [FFmpeg-devel] [PATCH v2 1/2] swscale/input: Avoid calls to av_pix_fmt_desc_get() Andreas Rheinhardt

Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

This inbox may be cloned and mirrored by anyone:

	git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
		ffmpegdev@gitmailbox.com
	public-inbox-index ffmpegdev

Example config snippet for mirrors.


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git