[FFmpeg-devel] [PATCH] avfilter/vf_colordetect: optimize C functions a bit (PR #20138)

* [FFmpeg-devel] [PATCH] avfilter/vf_colordetect: optimize C functions a bit (PR #20138)
@ 2025-08-06 16:18 Kacper Michajłow
  0 siblings, 0 replies; only message in thread
From: Kacper Michajłow @ 2025-08-06 16:18 UTC (permalink / raw)
  To: ffmpeg-devel

PR #20138 opened by Kacper Michajłow (kasper93)
URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/20138
Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/20138.patch

They are used to process tail, so it's still good to have them faster.
Even if AVX version are used.

GCC 14.2.0 | x86_64 (default config) | Before:

detect_alpha_8_full_c:                                3803.0 ( 1.00x)
detect_alpha_8_full_avx2:                              166.4 (22.86x)
detect_alpha_8_full_avx512icl:                         144.2 (26.37x)
detect_alpha_8_limited_c:                            10454.4 ( 1.00x)
detect_alpha_8_limited_avx2:                           616.5 (16.96x)
detect_alpha_8_limited_avx512icl:                      509.4 (20.52x)
detect_alpha_16_full_c:                               1903.0 ( 1.00x)
detect_alpha_16_full_avx2:                             172.4 (11.04x)
detect_alpha_16_full_avx512icl:                        163.4 (11.65x)
detect_alpha_16_limited_c:                            3703.6 ( 1.00x)
detect_alpha_16_limited_avx2:                          644.4 ( 5.75x)
detect_alpha_16_limited_avx512icl:                     558.0 ( 6.64x)
detect_range_8_c:                                     5855.9 ( 1.00x)
detect_range_8_avx2:                                   150.4 (38.94x)
detect_range_8_avx512icl:                              146.7 (39.91x)
detect_range_16_c:                                    2702.2 ( 1.00x)
detect_range_16_avx2:                                  256.7 (10.53x)
detect_range_16_avx512icl:                             116.8 (23.13x)

GCC 14.2.0 | x86_64 (default config) | After:

detect_alpha_8_full_c:                                 376.3 ( 1.00x)
detect_alpha_8_full_avx2:                              169.2 ( 2.22x)
detect_alpha_8_full_avx512icl:                         134.6 ( 2.80x)
detect_alpha_8_limited_c:                             6024.1 ( 1.00x)
detect_alpha_8_limited_avx2:                           641.8 ( 9.39x)
detect_alpha_8_limited_avx512icl:                      493.0 (12.22x)
detect_alpha_16_full_c:                                436.4 ( 1.00x)
detect_alpha_16_full_avx2:                             156.3 ( 2.79x)
detect_alpha_16_full_avx512icl:                        151.8 ( 2.87x)
detect_alpha_16_limited_c:                            3679.9 ( 1.00x)
detect_alpha_16_limited_avx2:                          642.0 ( 5.73x)
detect_alpha_16_limited_avx512icl:                     555.2 ( 6.63x)
detect_range_8_c:                                      655.2 ( 1.00x)
detect_range_8_avx2:                                   153.9 ( 4.26x)
detect_range_8_avx512icl:                              147.4 ( 4.45x)
detect_range_16_c:                                     743.3 ( 1.00x)
detect_range_16_avx2:                                  258.6 ( 2.87x)
detect_range_16_avx512icl:                             107.7 ( 6.90x)

Clang 19.1.7 | x86_64 (default config) | Before:

detect_alpha_8_full_c:                                7013.4 ( 1.00x)
detect_alpha_8_full_avx2:                              141.8 (49.46x)
detect_alpha_8_full_avx512icl:                         133.8 (52.40x)
detect_alpha_8_limited_c:                             7038.8 ( 1.00x)
detect_alpha_8_limited_avx2:                           605.0 (11.63x)
detect_alpha_8_limited_avx512icl:                      506.5 (13.90x)
detect_alpha_16_full_c:                               1799.5 ( 1.00x)
detect_alpha_16_full_avx2:                             143.0 (12.59x)
detect_alpha_16_full_avx512icl:                        127.5 (14.12x)
detect_alpha_16_limited_c:                            3499.6 ( 1.00x)
detect_alpha_16_limited_avx2:                          633.6 ( 5.52x)
detect_alpha_16_limited_avx512icl:                     551.9 ( 6.34x)
detect_range_8_c:                                     5253.6 ( 1.00x)
detect_range_8_avx2:                                   125.0 (42.01x)
detect_range_8_avx512icl:                              123.2 (42.65x)
detect_range_16_c:                                    3055.2 ( 1.00x)
detect_range_16_avx2:                                  230.0 (13.28x)
detect_range_16_avx512icl:                              95.9 (31.86x)

Clang 19.1.7 | x86_64 (default config) | After:

detect_alpha_8_full_c:                                 323.3 ( 1.00x)
detect_alpha_8_full_avx2:                              149.7 ( 2.16x)
detect_alpha_8_full_avx512icl:                         127.7 ( 2.53x)
detect_alpha_8_limited_c:                             5075.9 ( 1.00x)
detect_alpha_8_limited_avx2:                           625.4 ( 8.12x)
detect_alpha_8_limited_avx512icl:                      493.0 (10.30x)
detect_alpha_16_full_c:                                421.0 ( 1.00x)
detect_alpha_16_full_avx2:                             238.8 ( 1.76x)
detect_alpha_16_full_avx512icl:                        126.0 ( 3.34x)
detect_alpha_16_limited_c:                            3516.8 ( 1.00x)
detect_alpha_16_limited_avx2:                          624.7 ( 5.63x)
detect_alpha_16_limited_avx512icl:                     544.7 ( 6.46x)
detect_range_8_c:                                      609.1 ( 1.00x)
detect_range_8_avx2:                                   239.4 ( 2.54x)
detect_range_8_avx512icl:                               89.0 ( 6.84x)
detect_range_16_c:                                     463.9 ( 1.00x)
detect_range_16_avx2:                                  127.4 ( 3.64x)
detect_range_16_avx512icl:                              86.4 ( 5.37x)


From 486e65d4334387c7c4f0b8cf008d5ffc30351dd5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Kacper=20Michaj=C5=82ow?= <kasper93@gmail.com>
Date: Wed, 6 Aug 2025 18:16:08 +0200
Subject: [PATCH] avfilter/vf_colordetect: optimize C functions a bit
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

They are used to process tail, so it's still good to have them faster.
Even if AVX version are used.

GCC 14.2.0 | x86_64 (default config) | Before:

detect_alpha_8_full_c:                                3803.0 ( 1.00x)
detect_alpha_8_full_avx2:                              166.4 (22.86x)
detect_alpha_8_full_avx512icl:                         144.2 (26.37x)
detect_alpha_8_limited_c:                            10454.4 ( 1.00x)
detect_alpha_8_limited_avx2:                           616.5 (16.96x)
detect_alpha_8_limited_avx512icl:                      509.4 (20.52x)
detect_alpha_16_full_c:                               1903.0 ( 1.00x)
detect_alpha_16_full_avx2:                             172.4 (11.04x)
detect_alpha_16_full_avx512icl:                        163.4 (11.65x)
detect_alpha_16_limited_c:                            3703.6 ( 1.00x)
detect_alpha_16_limited_avx2:                          644.4 ( 5.75x)
detect_alpha_16_limited_avx512icl:                     558.0 ( 6.64x)
detect_range_8_c:                                     5855.9 ( 1.00x)
detect_range_8_avx2:                                   150.4 (38.94x)
detect_range_8_avx512icl:                              146.7 (39.91x)
detect_range_16_c:                                    2702.2 ( 1.00x)
detect_range_16_avx2:                                  256.7 (10.53x)
detect_range_16_avx512icl:                             116.8 (23.13x)

GCC 14.2.0 | x86_64 (default config) | After:

detect_alpha_8_full_c:                                 376.3 ( 1.00x)
detect_alpha_8_full_avx2:                              169.2 ( 2.22x)
detect_alpha_8_full_avx512icl:                         134.6 ( 2.80x)
detect_alpha_8_limited_c:                             6024.1 ( 1.00x)
detect_alpha_8_limited_avx2:                           641.8 ( 9.39x)
detect_alpha_8_limited_avx512icl:                      493.0 (12.22x)
detect_alpha_16_full_c:                                436.4 ( 1.00x)
detect_alpha_16_full_avx2:                             156.3 ( 2.79x)
detect_alpha_16_full_avx512icl:                        151.8 ( 2.87x)
detect_alpha_16_limited_c:                            3679.9 ( 1.00x)
detect_alpha_16_limited_avx2:                          642.0 ( 5.73x)
detect_alpha_16_limited_avx512icl:                     555.2 ( 6.63x)
detect_range_8_c:                                      655.2 ( 1.00x)
detect_range_8_avx2:                                   153.9 ( 4.26x)
detect_range_8_avx512icl:                              147.4 ( 4.45x)
detect_range_16_c:                                     743.3 ( 1.00x)
detect_range_16_avx2:                                  258.6 ( 2.87x)
detect_range_16_avx512icl:                             107.7 ( 6.90x)

Clang 19.1.7 | x86_64 (default config) | Before:

detect_alpha_8_full_c:                                7013.4 ( 1.00x)
detect_alpha_8_full_avx2:                              141.8 (49.46x)
detect_alpha_8_full_avx512icl:                         133.8 (52.40x)
detect_alpha_8_limited_c:                             7038.8 ( 1.00x)
detect_alpha_8_limited_avx2:                           605.0 (11.63x)
detect_alpha_8_limited_avx512icl:                      506.5 (13.90x)
detect_alpha_16_full_c:                               1799.5 ( 1.00x)
detect_alpha_16_full_avx2:                             143.0 (12.59x)
detect_alpha_16_full_avx512icl:                        127.5 (14.12x)
detect_alpha_16_limited_c:                            3499.6 ( 1.00x)
detect_alpha_16_limited_avx2:                          633.6 ( 5.52x)
detect_alpha_16_limited_avx512icl:                     551.9 ( 6.34x)
detect_range_8_c:                                     5253.6 ( 1.00x)
detect_range_8_avx2:                                   125.0 (42.01x)
detect_range_8_avx512icl:                              123.2 (42.65x)
detect_range_16_c:                                    3055.2 ( 1.00x)
detect_range_16_avx2:                                  230.0 (13.28x)
detect_range_16_avx512icl:                              95.9 (31.86x)

Clang 19.1.7 | x86_64 (default config) | After:

detect_alpha_8_full_c:                                 323.3 ( 1.00x)
detect_alpha_8_full_avx2:                              149.7 ( 2.16x)
detect_alpha_8_full_avx512icl:                         127.7 ( 2.53x)
detect_alpha_8_limited_c:                             5075.9 ( 1.00x)
detect_alpha_8_limited_avx2:                           625.4 ( 8.12x)
detect_alpha_8_limited_avx512icl:                      493.0 (10.30x)
detect_alpha_16_full_c:                                421.0 ( 1.00x)
detect_alpha_16_full_avx2:                             238.8 ( 1.76x)
detect_alpha_16_full_avx512icl:                        126.0 ( 3.34x)
detect_alpha_16_limited_c:                            3516.8 ( 1.00x)
detect_alpha_16_limited_avx2:                          624.7 ( 5.63x)
detect_alpha_16_limited_avx512icl:                     544.7 ( 6.46x)
detect_range_8_c:                                      609.1 ( 1.00x)
detect_range_8_avx2:                                   239.4 ( 2.54x)
detect_range_8_avx512icl:                               89.0 ( 6.84x)
detect_range_16_c:                                     463.9 ( 1.00x)
detect_range_16_avx2:                                  127.4 ( 3.64x)
detect_range_16_avx512icl:                              86.4 ( 5.37x)

Signed-off-by: Kacper Michajłow <kasper93@gmail.com>
---
 libavfilter/vf_colordetect.h | 78 ++++++++++++++++++++++++------------
 1 file changed, 52 insertions(+), 26 deletions(-)

diff --git a/libavfilter/vf_colordetect.h b/libavfilter/vf_colordetect.h
index afd2db9c26..24279643d3 100644
--- a/libavfilter/vf_colordetect.h
+++ b/libavfilter/vf_colordetect.h
@@ -22,6 +22,7 @@
 #include <stddef.h>
 #include <stdint.h>
 
+#include <libavutil/avassert.h>
 #include <libavutil/macros.h>
 #include <libavutil/pixfmt.h>
 
@@ -44,16 +45,46 @@ void ff_color_detect_dsp_init(FFColorDetectDSPContext *dsp, int depth,
 void ff_color_detect_dsp_init_x86(FFColorDetectDSPContext *dsp, int depth,
                                   enum AVColorRange color_range);
 
+static inline int ff_detect_range_impl_c(const uint8_t *data, ptrdiff_t stride,
+                                    ptrdiff_t width, ptrdiff_t height,
+                                    uint8_t mpeg_min, uint8_t mpeg_max)
+{
+    while (height--) {
+        uint8_t cond = 0;
+        for (int x = 0; x < width; x++) {
+            const uint8_t val = data[x];
+            cond |= val < mpeg_min || val > mpeg_max;
+        }
+        if (cond)
+            return 1;
+        data += stride;
+    }
+
+    return 0;
+}
+
 static inline int ff_detect_range_c(const uint8_t *data, ptrdiff_t stride,
                                     ptrdiff_t width, ptrdiff_t height,
                                     int mpeg_min, int mpeg_max)
+{
+    av_assume(mpeg_min >= 0 && mpeg_min <= UINT8_MAX);
+    av_assume(mpeg_max >= 0 && mpeg_max <= UINT8_MAX);
+    return ff_detect_range_impl_c(data, stride, width, height, mpeg_min, mpeg_max);
+}
+
+static inline int ff_detect_range16_impl_c(const uint8_t *data, ptrdiff_t stride,
+                                      ptrdiff_t width, ptrdiff_t height,
+                                      uint16_t mpeg_min, uint16_t mpeg_max)
 {
     while (height--) {
+        const uint16_t *data16 = (const uint16_t *) data;
+        uint8_t cond = 0;
         for (int x = 0; x < width; x++) {
-            const uint8_t val = data[x];
-            if (val < mpeg_min || val > mpeg_max)
-                return 1;
+            const uint16_t val = data16[x];
+            cond |= val < mpeg_min || val > mpeg_max;
         }
+        if (cond)
+            return 1;
         data += stride;
     }
 
@@ -64,17 +95,9 @@ static inline int ff_detect_range16_c(const uint8_t *data, ptrdiff_t stride,
                                       ptrdiff_t width, ptrdiff_t height,
                                       int mpeg_min, int mpeg_max)
 {
-    while (height--) {
-        const uint16_t *data16 = (const uint16_t *) data;
-        for (int x = 0; x < width; x++) {
-            const uint16_t val = data16[x];
-            if (val < mpeg_min || val > mpeg_max)
-                return 1;
-        }
-        data += stride;
-    }
-
-    return 0;
+    av_assume(mpeg_min >= 0 && mpeg_min <= UINT16_MAX);
+    av_assume(mpeg_max >= 0 && mpeg_max <= UINT16_MAX);
+    return ff_detect_range16_impl_c(data, stride, width, height, mpeg_min, mpeg_max);
 }
 
 static inline int
@@ -84,10 +107,11 @@ ff_detect_alpha_full_c(const uint8_t *color, ptrdiff_t color_stride,
                        int p, int q, int k)
 {
     while (height--) {
-        for (int x = 0; x < width; x++) {
-            if (color[x] > alpha[x])
-                return 1;
-        }
+        uint8_t cond = 0;
+        for (int x = 0; x < width; x++)
+            cond |= color[x] > alpha[x];
+        if (cond)
+            return 1;
         color += color_stride;
         alpha += alpha_stride;
     }
@@ -101,10 +125,11 @@ ff_detect_alpha_limited_c(const uint8_t *color, ptrdiff_t color_stride,
                           int p, int q, int k)
 {
     while (height--) {
-        for (int x = 0; x < width; x++) {
-            if (p * color[x] - k > q * alpha[x])
-                return 1;
-        }
+        uint8_t cond = 0;
+        for (int x = 0; x < width; x++)
+            cond |= p * color[x] - k > q * alpha[x];
+        if (cond)
+            return 1;
         color += color_stride;
         alpha += alpha_stride;
     }
@@ -120,10 +145,11 @@ ff_detect_alpha16_full_c(const uint8_t *color, ptrdiff_t color_stride,
     while (height--) {
         const uint16_t *color16 = (const uint16_t *) color;
         const uint16_t *alpha16 = (const uint16_t *) alpha;
-        for (int x = 0; x < width; x++) {
-            if (color16[x] > alpha16[x])
-                return 1;
-        }
+        uint8_t cond = 0;
+        for (int x = 0; x < width; x++)
+            cond |= color16[x] > alpha16[x];
+        if (cond)
+            return 1;
         color += color_stride;
         alpha += alpha_stride;
     }
-- 
2.49.1

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] only message in thread