[FFmpeg-devel] [PATCH 1/6] avfilter/vf_overlay: Don't perform UB pointer arithmetic

From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
To: FFmpeg development discussions and patches <ffmpeg-devel@ffmpeg.org>
Subject: [FFmpeg-devel] [PATCH 1/6] avfilter/vf_overlay: Don't perform UB pointer arithmetic
Date: Sat, 7 Jun 2025 22:09:47 +0200
Message-ID: <GV1P250MB0737E4CE269C4D662F8B58A68F69A@GV1P250MB0737.EURP250.PROD.OUTLOOK.COM> (raw)

[-- Attachment #1: Type: text/plain, Size: 29 bytes --]

Patches attached.

- Andreas

[-- Attachment #2: 0001-avfilter-vf_overlay-Don-t-perform-UB-pointer-arithme.patch --]
[-- Type: text/x-patch, Size: 7168 bytes --]

From 9de09c4db6c914eeec505d4365850fda3bf86b8d Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Wed, 4 Jun 2025 03:20:02 +0200
Subject: [PATCH 1/6] avfilter/vf_overlay: Don't perform UB pointer arithmetic

This happens when the pixel format of the output does not
have an alpha channel. It leads to FATE failures with
the ffmpeg-filter_colorkey, filter-overlay-dvdsub-2397
filter-overlay,
filter-overlay_{gbrp_gbrap,nv12,nv21,yuv420,yuv420_yuva420,
yuv420p10,yuv422_yuva422,yuv422p10,yuv444_yuva444,yuv444p10}
and sub2video tests when using Clang UBSan.

Fix this by only performing the pointer arithmetic when
it is going to be used. This can be checked via variables
that compile-time constants due to inlining, so that the
checks are free. Given that the pointer is potentially
used as a function argument, the compiler could elide
the calculation, but not it can. The size of .text decreased
by 1632B with GCC 14 and by 1392B with Clang 19 (both -O3).

Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
 libavfilter/vf_overlay.c | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/libavfilter/vf_overlay.c b/libavfilter/vf_overlay.c
index 8560ed7c17..0a9ff60ebb 100644
--- a/libavfilter/vf_overlay.c
+++ b/libavfilter/vf_overlay.c
@@ -467,7 +467,7 @@ static av_always_inline void blend_plane_##depth##_##nbits##bits(AVFilterContext
     int dst_hp = AV_CEIL_RSHIFT(dst_h, vsub);                                                              \
     int yp = y>>vsub;                                                                                      \
     int xp = x>>hsub;                                                                                      \
-    uint##depth##_t *s, *sp, *d, *dp, *dap, *a, *da, *ap;                                                  \
+    uint##depth##_t *s, *sp, *d, *dp, *dap, *a, *ap;                                                       \
     int jmax, j, k, kmax;                                                                                  \
     int slice_start, slice_end;                                                                            \
     const uint##depth##_t max = (1 << nbits) - 1;                                                          \
@@ -486,14 +486,15 @@ static av_always_inline void blend_plane_##depth##_##nbits##bits(AVFilterContext
                       + (yp + slice_start) * dst->linesize[dst_plane]                                      \
                       + dst_offset);                                                                       \
     ap = (uint##depth##_t *)(src->data[3] + (slice_start << vsub) * src->linesize[3]);                     \
-    dap = (uint##depth##_t *)(dst->data[3] + ((yp + slice_start) << vsub) * dst->linesize[3]);             \
+    if (main_has_alpha)                                                                                    \
+        dap = (uint##depth##_t *)(dst->data[3] + ((yp + slice_start) << vsub) * dst->linesize[3]);         \
                                                                                                            \
     for (j = slice_start; j < slice_end; j++) {                                                            \
         k = FFMAX(-xp, 0);                                                                                 \
         d = dp + (xp+k) * dst_step;                                                                        \
         s = sp + k;                                                                                        \
         a = ap + (k<<hsub);                                                                                \
-        da = dap + ((xp+k) << hsub);                                                                       \
+        uint##depth##_t *da = main_has_alpha ? dap + ((xp+k) << hsub) : NULL;                              \
         kmax = FFMIN(-xp + dst_wp, src_wp);                                                                \
                                                                                                            \
         if (nbits == 8 && ((vsub && j+1 < src_hp) || !vsub) && octx->blend_row[i]) {                       \
@@ -502,7 +503,8 @@ static av_always_inline void blend_plane_##depth##_##nbits##bits(AVFilterContext
                                                                                                            \
             s += c;                                                                                        \
             d += dst_step * c;                                                                             \
-            da += (1 << hsub) * c;                                                                         \
+            if (main_has_alpha)                                                                            \
+                da += (1 << hsub) * c;                                                                     \
             a += (1 << hsub) * c;                                                                          \
             k += c;                                                                                        \
         }                                                                                                  \
@@ -560,13 +562,15 @@ static av_always_inline void blend_plane_##depth##_##nbits##bits(AVFilterContext
             }                                                                                              \
             s++;                                                                                           \
             d += dst_step;                                                                                 \
-            da += 1 << hsub;                                                                               \
+            if (main_has_alpha)                                                                            \
+                da += 1 << hsub;                                                                           \
             a += 1 << hsub;                                                                                \
         }                                                                                                  \
         dp += dst->linesize[dst_plane] / bytes;                                                            \
         sp += src->linesize[i] / bytes;                                                                    \
         ap += (1 << vsub) * src->linesize[3] / bytes;                                                      \
-        dap += (1 << vsub) * dst->linesize[3] / bytes;                                                     \
+        if (main_has_alpha)                                                                                \
+            dap += (1 << vsub) * dst->linesize[3] / bytes;                                                 \
     }                                                                                                      \
 }
 DEFINE_BLEND_PLANE(8, 8)
-- 
2.45.2


[-- Attachment #3: 0002-avfilter-vf_overlay-Avoid-converting-stride-to-uint1.patch --]
[-- Type: text/x-patch, Size: 10764 bytes --]

From e25ce3fdc6333c7d6408a221d0fc0b90eb057973 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Wed, 4 Jun 2025 15:54:55 +0200
Subject: [PATCH 2/6] avfilter/vf_overlay: Avoid converting stride to uint16_t
 and back

Just keep the pointers for the beginning of a line uint8_t*
and use uint16_t* to do the actual processing.

Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
 libavfilter/vf_overlay.c | 42 ++++++++++++++++++----------------------
 1 file changed, 19 insertions(+), 23 deletions(-)

diff --git a/libavfilter/vf_overlay.c b/libavfilter/vf_overlay.c
index 0a9ff60ebb..88aaad2aad 100644
--- a/libavfilter/vf_overlay.c
+++ b/libavfilter/vf_overlay.c
@@ -467,7 +467,6 @@ static av_always_inline void blend_plane_##depth##_##nbits##bits(AVFilterContext
     int dst_hp = AV_CEIL_RSHIFT(dst_h, vsub);                                                              \
     int yp = y>>vsub;                                                                                      \
     int xp = x>>hsub;                                                                                      \
-    uint##depth##_t *s, *sp, *d, *dp, *dap, *a, *ap;                                                       \
     int jmax, j, k, kmax;                                                                                  \
     int slice_start, slice_end;                                                                            \
     const uint##depth##_t max = (1 << nbits) - 1;                                                          \
@@ -481,20 +480,19 @@ static av_always_inline void blend_plane_##depth##_##nbits##bits(AVFilterContext
     slice_start = j + (jmax * jobnr) / nb_jobs;                                                            \
     slice_end = j + (jmax * (jobnr+1)) / nb_jobs;                                                          \
                                                                                                            \
-    sp = (uint##depth##_t *)(src->data[i] + (slice_start) * src->linesize[i]);                             \
-    dp = (uint##depth##_t *)(dst->data[dst_plane]                                                          \
+    const uint8_t *sp = src->data[i] + (slice_start) * src->linesize[i];                                   \
+    uint8_t       *dp = dst->data[dst_plane]                                                               \
                       + (yp + slice_start) * dst->linesize[dst_plane]                                      \
-                      + dst_offset);                                                                       \
-    ap = (uint##depth##_t *)(src->data[3] + (slice_start << vsub) * src->linesize[3]);                     \
-    if (main_has_alpha)                                                                                    \
-        dap = (uint##depth##_t *)(dst->data[3] + ((yp + slice_start) << vsub) * dst->linesize[3]);         \
+                      + dst_offset;                                                                        \
+    const uint8_t *ap = src->data[3] + (slice_start << vsub) * src->linesize[3];                           \
+    const uint8_t *dap = main_has_alpha ? dst->data[3] + ((yp + slice_start) << vsub) * dst->linesize[3] : NULL; \
                                                                                                            \
     for (j = slice_start; j < slice_end; j++) {                                                            \
         k = FFMAX(-xp, 0);                                                                                 \
-        d = dp + (xp+k) * dst_step;                                                                        \
-        s = sp + k;                                                                                        \
-        a = ap + (k<<hsub);                                                                                \
-        uint##depth##_t *da = main_has_alpha ? dap + ((xp+k) << hsub) : NULL;                              \
+        const uint##depth##_t  *s = (const uint##depth##_t *)sp + k;                                       \
+        const uint##depth##_t  *a = (const uint##depth##_t *)ap + (k<<hsub);                               \
+        const uint##depth##_t *da = main_has_alpha ? (uint##depth##_t *)dap + ((xp + k) << hsub) : NULL;   \
+        uint##depth##_t *d = (uint##depth##_t *)dp + (xp + k) * dst_step;                                  \
         kmax = FFMIN(-xp + dst_wp, src_wp);                                                                \
                                                                                                            \
         if (nbits == 8 && ((vsub && j+1 < src_hp) || !vsub) && octx->blend_row[i]) {                       \
@@ -566,11 +564,11 @@ static av_always_inline void blend_plane_##depth##_##nbits##bits(AVFilterContext
                 da += 1 << hsub;                                                                           \
             a += 1 << hsub;                                                                                \
         }                                                                                                  \
-        dp += dst->linesize[dst_plane] / bytes;                                                            \
-        sp += src->linesize[i] / bytes;                                                                    \
-        ap += (1 << vsub) * src->linesize[3] / bytes;                                                      \
+        dp += dst->linesize[dst_plane];                                                                    \
+        sp += src->linesize[i];                                                                            \
+        ap += (1 << vsub) * src->linesize[3];                                                              \
         if (main_has_alpha)                                                                                \
-            dap += (1 << vsub) * dst->linesize[3] / bytes;                                                 \
+            dap += (1 << vsub) * dst->linesize[3];                                                         \
     }                                                                                                      \
 }
 DEFINE_BLEND_PLANE(8, 8)
@@ -584,11 +582,9 @@ static inline void alpha_composite_##depth##_##nbits##bits(const AVFrame *src, c
                                    int jobnr, int nb_jobs)                                                 \
 {                                                                                                          \
     uint##depth##_t alpha;          /* the amount of overlay to blend on to main */                        \
-    uint##depth##_t *s, *sa, *d, *da;                                                                      \
     int i, imax, j, jmax;                                                                                  \
     int slice_start, slice_end;                                                                            \
     const uint##depth##_t max = (1 << nbits) - 1;                                                          \
-    int bytes = depth / 8;                                                                                 \
                                                                                                            \
     imax = FFMIN3(-y + dst_h, FFMIN(src_h, dst_h), y + src_h);                                             \
     i = FFMAX(-y, 0);                                                                                      \
@@ -596,13 +592,13 @@ static inline void alpha_composite_##depth##_##nbits##bits(const AVFrame *src, c
     slice_start = i + (imax * jobnr) / nb_jobs;                                                            \
     slice_end = i + ((imax * (jobnr+1)) / nb_jobs);                                                        \
                                                                                                            \
-    sa = (uint##depth##_t *)(src->data[3] + (slice_start) * src->linesize[3]);                             \
-    da = (uint##depth##_t *)(dst->data[3] + (y + slice_start) * dst->linesize[3]);                         \
+    const uint8_t *sa = src->data[3] +     (slice_start) * src->linesize[3];                               \
+    uint8_t       *da = dst->data[3] + (y + slice_start) * dst->linesize[3];                               \
                                                                                                            \
     for (i = slice_start; i < slice_end; i++) {                                                            \
         j = FFMAX(-x, 0);                                                                                  \
-        s = sa + j;                                                                                        \
-        d = da + x+j;                                                                                      \
+        const uint##depth##_t *s = (const uint##depth##_t *)sa + j;                                        \
+        uint##depth##_t *d = (uint##depth##_t *)da + x+j;                                                  \
                                                                                                            \
         for (jmax = FFMIN(-x + dst_w, src_w); j < jmax; j++) {                                             \
             alpha = *s;                                                                                    \
@@ -622,8 +618,8 @@ static inline void alpha_composite_##depth##_##nbits##bits(const AVFrame *src, c
             d += 1;                                                                                        \
             s += 1;                                                                                        \
         }                                                                                                  \
-        da += dst->linesize[3] / bytes;                                                                    \
-        sa += src->linesize[3] / bytes;                                                                    \
+        da += dst->linesize[3];                                                                            \
+        sa += src->linesize[3];                                                                            \
     }                                                                                                      \
 }
 DEFINE_ALPHA_COMPOSITE(8, 8)
-- 
2.45.2


[-- Attachment #4: 0003-avfilter-vf_overlay-Pass-variable-type-directly-in-m.patch --]
[-- Type: text/x-patch, Size: 8350 bytes --]

From 3e8107e7dcaf969051f0895e1de98ec064513857 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Wed, 4 Jun 2025 16:28:08 +0200
Subject: [PATCH 3/6] avfilter/vf_overlay: Pass variable type directly in macro

Improves readability.

Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
 libavfilter/vf_overlay.c | 32 ++++++++++++++++----------------
 1 file changed, 16 insertions(+), 16 deletions(-)

diff --git a/libavfilter/vf_overlay.c b/libavfilter/vf_overlay.c
index 88aaad2aad..7d8d280a41 100644
--- a/libavfilter/vf_overlay.c
+++ b/libavfilter/vf_overlay.c
@@ -444,7 +444,7 @@ static av_always_inline void blend_slice_packed_rgb(AVFilterContext *ctx,
     }
 }
 
-#define DEFINE_BLEND_PLANE(depth, nbits)                                                                   \
+#define DEFINE_BLEND_PLANE(depth, T, nbits)                                                                \
 static av_always_inline void blend_plane_##depth##_##nbits##bits(AVFilterContext *ctx,                     \
                                          AVFrame *dst, const AVFrame *src,                                 \
                                          int src_w, int src_h,                                             \
@@ -469,8 +469,8 @@ static av_always_inline void blend_plane_##depth##_##nbits##bits(AVFilterContext
     int xp = x>>hsub;                                                                                      \
     int jmax, j, k, kmax;                                                                                  \
     int slice_start, slice_end;                                                                            \
-    const uint##depth##_t max = (1 << nbits) - 1;                                                          \
-    const uint##depth##_t mid = (1 << (nbits -1)) ;                                                        \
+    const T max = (1 << nbits) - 1;                                                                        \
+    const T mid = (1 << (nbits - 1));                                                                      \
     int bytes = depth / 8;                                                                                 \
                                                                                                            \
     dst_step /= bytes;                                                                                     \
@@ -489,10 +489,10 @@ static av_always_inline void blend_plane_##depth##_##nbits##bits(AVFilterContext
                                                                                                            \
     for (j = slice_start; j < slice_end; j++) {                                                            \
         k = FFMAX(-xp, 0);                                                                                 \
-        const uint##depth##_t  *s = (const uint##depth##_t *)sp + k;                                       \
-        const uint##depth##_t  *a = (const uint##depth##_t *)ap + (k<<hsub);                               \
-        const uint##depth##_t *da = main_has_alpha ? (uint##depth##_t *)dap + ((xp + k) << hsub) : NULL;   \
-        uint##depth##_t *d = (uint##depth##_t *)dp + (xp + k) * dst_step;                                  \
+        const T  *s = (const T *)sp + k;                                                                   \
+        const T  *a = (const T *)ap + (k << hsub);                                                         \
+        const T *da = main_has_alpha ? (T *)dap + ((xp + k) << hsub) : NULL;                               \
+        T *d  = (T *)dp + (xp + k) * dst_step;                                                             \
         kmax = FFMIN(-xp + dst_wp, src_wp);                                                                \
                                                                                                            \
         if (nbits == 8 && ((vsub && j+1 < src_hp) || !vsub) && octx->blend_row[i]) {                       \
@@ -571,20 +571,20 @@ static av_always_inline void blend_plane_##depth##_##nbits##bits(AVFilterContext
             dap += (1 << vsub) * dst->linesize[3];                                                         \
     }                                                                                                      \
 }
-DEFINE_BLEND_PLANE(8, 8)
-DEFINE_BLEND_PLANE(16, 10)
+DEFINE_BLEND_PLANE(8,  uint8_t,  8)
+DEFINE_BLEND_PLANE(16, uint16_t, 10)
 
-#define DEFINE_ALPHA_COMPOSITE(depth, nbits)                                                               \
+#define DEFINE_ALPHA_COMPOSITE(depth, T, nbits)                                                            \
 static inline void alpha_composite_##depth##_##nbits##bits(const AVFrame *src, const AVFrame *dst,         \
                                    int src_w, int src_h,                                                   \
                                    int dst_w, int dst_h,                                                   \
                                    int x, int y,                                                           \
                                    int jobnr, int nb_jobs)                                                 \
 {                                                                                                          \
-    uint##depth##_t alpha;          /* the amount of overlay to blend on to main */                        \
+    T alpha;          /* the amount of overlay to blend on to main */                                      \
     int i, imax, j, jmax;                                                                                  \
     int slice_start, slice_end;                                                                            \
-    const uint##depth##_t max = (1 << nbits) - 1;                                                          \
+    const T max = (1 << nbits) - 1;                                                                        \
                                                                                                            \
     imax = FFMIN3(-y + dst_h, FFMIN(src_h, dst_h), y + src_h);                                             \
     i = FFMAX(-y, 0);                                                                                      \
@@ -597,8 +597,8 @@ static inline void alpha_composite_##depth##_##nbits##bits(const AVFrame *src, c
                                                                                                            \
     for (i = slice_start; i < slice_end; i++) {                                                            \
         j = FFMAX(-x, 0);                                                                                  \
-        const uint##depth##_t *s = (const uint##depth##_t *)sa + j;                                        \
-        uint##depth##_t *d = (uint##depth##_t *)da + x+j;                                                  \
+        const T *s = (const T *)sa + j;                                                                    \
+        T *d = (T *)da + x + j;                                                                            \
                                                                                                            \
         for (jmax = FFMIN(-x + dst_w, src_w); j < jmax; j++) {                                             \
             alpha = *s;                                                                                    \
@@ -622,8 +622,8 @@ static inline void alpha_composite_##depth##_##nbits##bits(const AVFrame *src, c
         sa += src->linesize[3];                                                                            \
     }                                                                                                      \
 }
-DEFINE_ALPHA_COMPOSITE(8, 8)
-DEFINE_ALPHA_COMPOSITE(16, 10)
+DEFINE_ALPHA_COMPOSITE(8,  uint8_t,  8)
+DEFINE_ALPHA_COMPOSITE(16, uint16_t, 10)
 
 #define DEFINE_BLEND_SLICE_YUV(depth, nbits)                                                               \
 static av_always_inline void blend_slice_yuv_##depth##_##nbits##bits(AVFilterContext *ctx,                 \
-- 
2.45.2


[-- Attachment #5: 0004-avfilter-vf_overlay-Use-correct-alpha-when-8-bits.patch --]
[-- Type: text/x-patch, Size: 5640 bytes --]

From 8b374a0b9fc847c4d786995fa9486fdffa6edcb2 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Wed, 4 Jun 2025 19:34:01 +0200
Subject: [PATCH 4/6] avfilter/vf_overlay: Use correct alpha when > 8 bits

When chroma subsampling is in use, the filter averages
the corresponding (non subsampled) alpha values to get
the actual alpha value. When vertical subsampling is
in use, the next line is accessed via a[src->linesize[3]],
yet a is an uint16_t* for >8 bit formats and linesize
is always in bytes, so that this actually uses the second
line below the current one. This is fixed in this commit.

No FATE test needed updates, because the filter-overlay-yuv420p10
and filter-overlay-yuv444p10 tests use a yuv420p test file
that has constant opacity after conversion to yuva.

Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
 libavfilter/vf_overlay.c | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/libavfilter/vf_overlay.c b/libavfilter/vf_overlay.c
index 7d8d280a41..cd92c7323b 100644
--- a/libavfilter/vf_overlay.c
+++ b/libavfilter/vf_overlay.c
@@ -353,6 +353,9 @@ static int config_output(AVFilterLink *outlink)
 // ((((x) + (y)) << 8) - ((x) + (y)) - (y) * (x)) is a faster version of: 255 * (x + y)
 #define UNPREMULTIPLY_ALPHA(x, y) ((((x) << 16) - ((x) << 9) + (x)) / ((((x) + (y)) << 8) - ((x) + (y)) - (y) * (x)))
 
+#define PTR_ADD(TYPE, ptr, byte_addend) ((TYPE*)((uint8_t*)ptr + (byte_addend)))
+#define CPTR_ADD(TYPE, ptr, byte_addend) ((const TYPE*)((const uint8_t*)ptr + (byte_addend)))
+
 /**
  * Blend image in src to destination buffer dst at position (x, y).
  */
@@ -511,13 +514,14 @@ static av_always_inline void blend_plane_##depth##_##nbits##bits(AVFilterContext
                                                                                                            \
             /* average alpha for color components, improve quality */                                      \
             if (hsub && vsub && j+1 < src_hp && k+1 < src_wp) {                                            \
-                alpha = (a[0] + a[src->linesize[3]] +                                                      \
-                         a[1] + a[src->linesize[3]+1]) >> 2;                                               \
+                const T *next_line = CPTR_ADD(T, a, src->linesize[3]);                                     \
+                alpha = (a[0] + next_line[0] +                                                             \
+                         a[1] + next_line[1]) >> 2;                                                        \
             } else if (hsub || vsub) {                                                                     \
                 alpha_h = hsub && k+1 < src_wp ?                                                           \
                     (a[0] + a[1]) >> 1 : a[0];                                                             \
                 alpha_v = vsub && j+1 < src_hp ?                                                           \
-                    (a[0] + a[src->linesize[3]]) >> 1 : a[0];                                              \
+                    (a[0] + *CPTR_ADD(T, a, src->linesize[3])) >> 1 : a[0];                                \
                 alpha = (alpha_v + alpha_h) >> 1;                                                          \
             } else                                                                                         \
                 alpha = a[0];                                                                              \
@@ -527,13 +531,14 @@ static av_always_inline void blend_plane_##depth##_##nbits##bits(AVFilterContext
                 /* average alpha for color components, improve quality */                                  \
                 uint8_t alpha_d;                                                                           \
                 if (hsub && vsub && j+1 < src_hp && k+1 < src_wp) {                                        \
-                    alpha_d = (da[0] + da[dst->linesize[3]] +                                              \
-                               da[1] + da[dst->linesize[3]+1]) >> 2;                                       \
+                    const T *next_line = CPTR_ADD(T, da, dst->linesize[3]);                                \
+                    alpha_d = (da[0] + next_line[0] +                                                      \
+                               da[1] + next_line[1]) >> 2;                                                 \
                 } else if (hsub || vsub) {                                                                 \
                     alpha_h = hsub && k+1 < src_wp ?                                                       \
                         (da[0] + da[1]) >> 1 : da[0];                                                      \
                     alpha_v = vsub && j+1 < src_hp ?                                                       \
-                        (da[0] + da[dst->linesize[3]]) >> 1 : da[0];                                       \
+                        (da[0] + *CPTR_ADD(T, da, dst->linesize[3])) >> 1 : da[0];                         \
                     alpha_d = (alpha_v + alpha_h) >> 1;                                                    \
                 } else                                                                                     \
                     alpha_d = da[0];                                                                       \
-- 
2.45.2


[-- Attachment #6: 0005-avfilter-vf_overlay-Keep-dst_step-in-bytes.patch --]
[-- Type: text/x-patch, Size: 4601 bytes --]

From 99d166480e89e78c5d6f8dfff407e896a296fccc Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Wed, 4 Jun 2025 20:06:01 +0200
Subject: [PATCH 5/6] avfilter/vf_overlay: Keep dst_step in bytes

Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
 libavfilter/vf_overlay.c | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/libavfilter/vf_overlay.c b/libavfilter/vf_overlay.c
index cd92c7323b..17fe8e47c9 100644
--- a/libavfilter/vf_overlay.c
+++ b/libavfilter/vf_overlay.c
@@ -474,9 +474,7 @@ static av_always_inline void blend_plane_##depth##_##nbits##bits(AVFilterContext
     int slice_start, slice_end;                                                                            \
     const T max = (1 << nbits) - 1;                                                                        \
     const T mid = (1 << (nbits - 1));                                                                      \
-    int bytes = depth / 8;                                                                                 \
                                                                                                            \
-    dst_step /= bytes;                                                                                     \
     j = FFMAX(-yp, 0);                                                                                     \
     jmax = FFMIN3(-yp + dst_hp, FFMIN(src_hp, dst_hp), yp + src_hp);                                       \
                                                                                                            \
@@ -495,7 +493,7 @@ static av_always_inline void blend_plane_##depth##_##nbits##bits(AVFilterContext
         const T  *s = (const T *)sp + k;                                                                   \
         const T  *a = (const T *)ap + (k << hsub);                                                         \
         const T *da = main_has_alpha ? (T *)dap + ((xp + k) << hsub) : NULL;                               \
-        T *d  = (T *)dp + (xp + k) * dst_step;                                                             \
+        T *d  = (T *)(dp + (xp + k) * dst_step);                                                           \
         kmax = FFMIN(-xp + dst_wp, src_wp);                                                                \
                                                                                                            \
         if (nbits == 8 && ((vsub && j+1 < src_hp) || !vsub) && octx->blend_row[i]) {                       \
@@ -503,7 +501,7 @@ static av_always_inline void blend_plane_##depth##_##nbits##bits(AVFilterContext
                     (uint8_t*)a, kmax - k, src->linesize[3]);                                              \
                                                                                                            \
             s += c;                                                                                        \
-            d += dst_step * c;                                                                             \
+            d  = PTR_ADD(T, d, dst_step * c);                                                              \
             if (main_has_alpha)                                                                            \
                 da += (1 << hsub) * c;                                                                     \
             a += (1 << hsub) * c;                                                                          \
@@ -564,7 +562,7 @@ static av_always_inline void blend_plane_##depth##_##nbits##bits(AVFilterContext
                 }                                                                                          \
             }                                                                                              \
             s++;                                                                                           \
-            d += dst_step;                                                                                 \
+            d  = PTR_ADD(T, d, dst_step);                                                                  \
             if (main_has_alpha)                                                                            \
                 da += 1 << hsub;                                                                           \
             a += 1 << hsub;                                                                                \
-- 
2.45.2


[-- Attachment #7: 0006-avfilter-vf_overlay-Hoist-calculations-out-of-loop.patch --]
[-- Type: text/x-patch, Size: 8436 bytes --]

From 36483d23c356a049ca6c3dd6034dbb6453062c8a Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Wed, 4 Jun 2025 20:27:46 +0200
Subject: [PATCH 6/6] avfilter/vf_overlay: Hoist calculations out of loop

Also use const where appropriate.

Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
 libavfilter/vf_overlay.c | 36 ++++++++++++++----------------------
 1 file changed, 14 insertions(+), 22 deletions(-)

diff --git a/libavfilter/vf_overlay.c b/libavfilter/vf_overlay.c
index 17fe8e47c9..d77fe87c05 100644
--- a/libavfilter/vf_overlay.c
+++ b/libavfilter/vf_overlay.c
@@ -470,16 +470,13 @@ static av_always_inline void blend_plane_##depth##_##nbits##bits(AVFilterContext
     int dst_hp = AV_CEIL_RSHIFT(dst_h, vsub);                                                              \
     int yp = y>>vsub;                                                                                      \
     int xp = x>>hsub;                                                                                      \
-    int jmax, j, k, kmax;                                                                                  \
-    int slice_start, slice_end;                                                                            \
     const T max = (1 << nbits) - 1;                                                                        \
     const T mid = (1 << (nbits - 1));                                                                      \
                                                                                                            \
-    j = FFMAX(-yp, 0);                                                                                     \
-    jmax = FFMIN3(-yp + dst_hp, FFMIN(src_hp, dst_hp), yp + src_hp);                                       \
-                                                                                                           \
-    slice_start = j + (jmax * jobnr) / nb_jobs;                                                            \
-    slice_end = j + (jmax * (jobnr+1)) / nb_jobs;                                                          \
+    const int jmin = FFMAX(-yp, 0), jmax = FFMIN3(-yp + dst_hp, FFMIN(src_hp, dst_hp), yp + src_hp);       \
+    const int kmin = FFMAX(-xp, 0), kmax = FFMIN(-xp + dst_wp, src_wp);                                    \
+    const int slice_start = jmin + (jmax *  jobnr)      / nb_jobs;                                         \
+    const int slice_end   = jmin + (jmax * (jobnr + 1)) / nb_jobs;                                         \
                                                                                                            \
     const uint8_t *sp = src->data[i] + (slice_start) * src->linesize[i];                                   \
     uint8_t       *dp = dst->data[dst_plane]                                                               \
@@ -488,13 +485,12 @@ static av_always_inline void blend_plane_##depth##_##nbits##bits(AVFilterContext
     const uint8_t *ap = src->data[3] + (slice_start << vsub) * src->linesize[3];                           \
     const uint8_t *dap = main_has_alpha ? dst->data[3] + ((yp + slice_start) << vsub) * dst->linesize[3] : NULL; \
                                                                                                            \
-    for (j = slice_start; j < slice_end; j++) {                                                            \
-        k = FFMAX(-xp, 0);                                                                                 \
+    for (int j = slice_start; j < slice_end; ++j) {                                                        \
+        int k = kmin;                                                                                      \
         const T  *s = (const T *)sp + k;                                                                   \
         const T  *a = (const T *)ap + (k << hsub);                                                         \
         const T *da = main_has_alpha ? (T *)dap + ((xp + k) << hsub) : NULL;                               \
         T *d  = (T *)(dp + (xp + k) * dst_step);                                                           \
-        kmax = FFMIN(-xp + dst_wp, src_wp);                                                                \
                                                                                                            \
         if (nbits == 8 && ((vsub && j+1 < src_hp) || !vsub) && octx->blend_row[i]) {                       \
             int c = octx->blend_row[i]((uint8_t*)d, (uint8_t*)da, (uint8_t*)s,                             \
@@ -585,25 +581,21 @@ static inline void alpha_composite_##depth##_##nbits##bits(const AVFrame *src, c
                                    int jobnr, int nb_jobs)                                                 \
 {                                                                                                          \
     T alpha;          /* the amount of overlay to blend on to main */                                      \
-    int i, imax, j, jmax;                                                                                  \
-    int slice_start, slice_end;                                                                            \
     const T max = (1 << nbits) - 1;                                                                        \
                                                                                                            \
-    imax = FFMIN3(-y + dst_h, FFMIN(src_h, dst_h), y + src_h);                                             \
-    i = FFMAX(-y, 0);                                                                                      \
-                                                                                                           \
-    slice_start = i + (imax * jobnr) / nb_jobs;                                                            \
-    slice_end = i + ((imax * (jobnr+1)) / nb_jobs);                                                        \
+    const int imin = FFMAX(-y, 0), imax = FFMIN3(-y + dst_h, FFMIN(src_h, dst_h), y + src_h);              \
+    const int jmin = FFMAX(-x, 0), jmax = FFMIN(-x + dst_w, src_w);                                        \
+    const int slice_start = imin + ( imax *  jobnr)      / nb_jobs;                                        \
+    const int slice_end   = imin + ((imax * (jobnr + 1)) / nb_jobs);                                       \
                                                                                                            \
     const uint8_t *sa = src->data[3] +     (slice_start) * src->linesize[3];                               \
     uint8_t       *da = dst->data[3] + (y + slice_start) * dst->linesize[3];                               \
                                                                                                            \
-    for (i = slice_start; i < slice_end; i++) {                                                            \
-        j = FFMAX(-x, 0);                                                                                  \
-        const T *s = (const T *)sa + j;                                                                    \
-        T *d = (T *)da + x + j;                                                                            \
+    for (int i = slice_start; i < slice_end; ++i) {                                                        \
+        const T *s = (const T *)sa + jmin;                                                                 \
+        T *d = (T *)da + x + jmin;                                                                         \
                                                                                                            \
-        for (jmax = FFMIN(-x + dst_w, src_w); j < jmax; j++) {                                             \
+        for (int j = jmin; j < jmax; ++j) {                                                                \
             alpha = *s;                                                                                    \
             if (alpha != 0 && alpha != max) {                                                              \
                 uint8_t alpha_d = *d;                                                                      \
-- 
2.45.2


[-- Attachment #8: Type: text/plain, Size: 251 bytes --]

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".