From: Niklas Haas via ffmpeg-devel <ffmpeg-devel@ffmpeg.org>
To: ffmpeg-devel@ffmpeg.org
Cc: Niklas Haas <code@ffmpeg.org>
Subject: [FFmpeg-devel] [PR] swscale/ops: minor fixes (some UB, some missing x86 functions) (PR #22275)
Date: Tue, 24 Feb 2026 12:04:35 -0000
Message-ID: <177193467597.25.8893915831943615934@29965ddac10e> (raw)
PR #22275 opened by Niklas Haas (haasn)
URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/22275
Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/22275.patch
Makes the self-test pass under `ubsan` again.
>From 0f194da38d3087fbf32070365da41c7e7f5350d1 Mon Sep 17 00:00:00 2001
From: Niklas Haas <git@haasn.dev>
Date: Tue, 24 Feb 2026 12:21:11 +0100
Subject: [PATCH 1/8] swscale/ops_chain: properly mark unreachable branch
Break to the `av_unreachable` below instead of returning. This branch is
unreachable because of the `if (entry->flexible)` branch further above.
Signed-off-by: Niklas Haas <git@haasn.dev>
---
libswscale/ops_chain.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/libswscale/ops_chain.c b/libswscale/ops_chain.c
index d1ec1ef83d..e5fbc4f016 100644
--- a/libswscale/ops_chain.c
+++ b/libswscale/ops_chain.c
@@ -152,7 +152,7 @@ static int op_match(const SwsOp *op, const SwsOpEntry *entry, const SwsComps nex
case SWS_OP_LSHIFT:
case SWS_OP_RSHIFT:
av_assert1(entry->flexible);
- return score;
+ break;
case SWS_OP_SWIZZLE:
for (int i = 0; i < 4; i++) {
if (op->swizzle.in[i] != entry->swizzle.in[i] && !next.unused[i])
@@ -169,7 +169,7 @@ static int op_match(const SwsOp *op, const SwsOpEntry *entry, const SwsComps nex
case SWS_OP_MIN:
case SWS_OP_MAX:
av_assert1(entry->flexible);
- return score;
+ break;
case SWS_OP_LINEAR:
/* All required elements must be present */
if (op->lin.mask & ~entry->linear_mask)
--
2.52.0
>From b2a6b54b7ee083cbe273cd598ad949f650759d3d Mon Sep 17 00:00:00 2001
From: Niklas Haas <git@haasn.dev>
Date: Tue, 24 Feb 2026 12:24:57 +0100
Subject: [PATCH 2/8] swscale/ops_chain: add ability to match fixed scale
factor
This is useful especially for the special case of scaling by common
not-quite-power-of-two constants like 255 or 1023.
Signed-off-by: Niklas Haas <git@haasn.dev>
---
libswscale/ops_chain.c | 2 +-
libswscale/ops_chain.h | 1 +
2 files changed, 2 insertions(+), 1 deletion(-)
diff --git a/libswscale/ops_chain.c b/libswscale/ops_chain.c
index e5fbc4f016..2445154186 100644
--- a/libswscale/ops_chain.c
+++ b/libswscale/ops_chain.c
@@ -184,7 +184,7 @@ static int op_match(const SwsOp *op, const SwsOpEntry *entry, const SwsComps nex
score += av_popcount(SWS_MASK_ALL ^ entry->linear_mask);
return score;
case SWS_OP_SCALE:
- return score;
+ return av_cmp_q(op->c.q, entry->scale) ? 0 : score;
case SWS_OP_TYPE_NB:
break;
}
diff --git a/libswscale/ops_chain.h b/libswscale/ops_chain.h
index 2f5a31793e..0bc8c01283 100644
--- a/libswscale/ops_chain.h
+++ b/libswscale/ops_chain.h
@@ -111,6 +111,7 @@ typedef struct SwsOpEntry {
uint32_t linear_mask; /* subset of SwsLinearOp */
int dither_size; /* subset of SwsDitherOp */
int clear_value; /* clear value for integer clears */
+ AVRational scale; /* scale factor for SWS_OP_SCALE */
};
/* Kernel implementation */
--
2.52.0
>From 6bf73d2044875201856ff38af827878b4547fe89 Mon Sep 17 00:00:00 2001
From: Niklas Haas <git@haasn.dev>
Date: Tue, 24 Feb 2026 11:51:18 +0100
Subject: [PATCH 3/8] swscale/x86/ops: allow matching planar rw against
1-element packed fmt
Otherwise, the x86 backend fails to serve e.g. rgb565le.
For -src rgb565le:
Before: Overall speedup=2.210x faster, min=0.256x max=60.465x
After: Overall speedup=4.929x faster, min=0.638x max=181.260x
Signed-off-by: Niklas Haas <git@haasn.dev>
---
libswscale/x86/ops.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/libswscale/x86/ops.c b/libswscale/x86/ops.c
index 44dbe05b35..fadc1ce8c9 100644
--- a/libswscale/x86/ops.c
+++ b/libswscale/x86/ops.c
@@ -551,7 +551,7 @@ static bool op_is_type_invariant(const SwsOp *op)
switch (op->op) {
case SWS_OP_READ:
case SWS_OP_WRITE:
- return !op->rw.packed && !op->rw.frac;
+ return !(op->rw.elems > 1 && op->rw.packed) && !op->rw.frac;
case SWS_OP_SWIZZLE:
case SWS_OP_CLEAR:
return true;
--
2.52.0
>From 3c96548547e0d555a61465172b12bb149f914e79 Mon Sep 17 00:00:00 2001
From: Niklas Haas <git@haasn.dev>
Date: Tue, 24 Feb 2026 12:02:18 +0100
Subject: [PATCH 4/8] swscale/x86/ops: add missing U32 <-> F32 conversions
For -src x2rgb10le:
Before: Overall speedup=1.634x faster, min=0.356x max=44.083x
After: Overall speedup=4.662x faster, min=0.676x max=137.445x
Signed-off-by: Niklas Haas <git@haasn.dev>
---
libswscale/x86/ops.c | 4 ++++
libswscale/x86/ops_float.asm | 30 ++++++++++++++++++++++++++++++
2 files changed, 34 insertions(+)
diff --git a/libswscale/x86/ops.c b/libswscale/x86/ops.c
index fadc1ce8c9..e9598ba437 100644
--- a/libswscale/x86/ops.c
+++ b/libswscale/x86/ops.c
@@ -439,6 +439,8 @@ static const SwsOpTable ops16##EXT = {
DECL_CONVERT(EXT, F32, U8) \
DECL_CONVERT(EXT, U16, F32) \
DECL_CONVERT(EXT, F32, U16) \
+ DECL_CONVERT(EXT, U32, F32) \
+ DECL_CONVERT(EXT, F32, U32) \
DECL_EXPAND(EXT, U8, U32) \
DECL_MIN_MAX(EXT) \
DECL_SCALE(EXT) \
@@ -489,6 +491,8 @@ static const SwsOpTable ops32##EXT = {
REF_COMMON_PATTERNS(convert_F32_U8##EXT), \
REF_COMMON_PATTERNS(convert_U16_F32##EXT), \
REF_COMMON_PATTERNS(convert_F32_U16##EXT), \
+ REF_COMMON_PATTERNS(convert_U32_F32##EXT), \
+ REF_COMMON_PATTERNS(convert_F32_U32##EXT), \
REF_COMMON_PATTERNS(expand_U8_U32##EXT), \
REF_COMMON_PATTERNS(min##EXT), \
REF_COMMON_PATTERNS(max##EXT), \
diff --git a/libswscale/x86/ops_float.asm b/libswscale/x86/ops_float.asm
index 2863085a8e..5336adb50b 100644
--- a/libswscale/x86/ops_float.asm
+++ b/libswscale/x86/ops_float.asm
@@ -77,6 +77,20 @@ IF W, vcvtdq2ps mw2, mw2
CONTINUE tmp0q
%endmacro
+%macro conv32to32f 0
+op convert_U32_F32
+ LOAD_CONT tmp0q
+IF X, vcvtdq2ps mx, mx
+IF Y, vcvtdq2ps my, my
+IF Z, vcvtdq2ps mz, mz
+IF W, vcvtdq2ps mw, mw
+IF X, vcvtdq2ps mx2, mx2
+IF Y, vcvtdq2ps my2, my2
+IF Z, vcvtdq2ps mz2, mz2
+IF W, vcvtdq2ps mw2, mw2
+ CONTINUE tmp0q
+%endmacro
+
%macro conv32fto8 0
op convert_F32_U8
LOAD_CONT tmp0q
@@ -130,6 +144,20 @@ IF W, vpermq mw, mw, q3120
CONTINUE tmp0q
%endmacro
+%macro conv32fto32 0
+op convert_F32_U32
+ LOAD_CONT tmp0q
+IF X, cvttps2dq mx, mx
+IF Y, cvttps2dq my, my
+IF Z, cvttps2dq mz, mz
+IF W, cvttps2dq mw, mw
+IF X, cvttps2dq mx2, mx2
+IF Y, cvttps2dq my2, my2
+IF Z, cvttps2dq mz2, mz2
+IF W, cvttps2dq mw2, mw2
+ CONTINUE tmp0q
+%endmacro
+
%macro min_max 0
op min
IF X, vbroadcastss m8, [implq + SwsOpImpl.priv + 0]
@@ -375,8 +403,10 @@ op dot3
INIT_YMM avx2
decl_common_patterns conv8to32f
decl_common_patterns conv16to32f
+decl_common_patterns conv32to32f
decl_common_patterns conv32fto8
decl_common_patterns conv32fto16
+decl_common_patterns conv32fto32
decl_common_patterns min_max
decl_common_patterns scale
decl_common_patterns dither_fns
--
2.52.0
>From 3032759f60e79239306822f20adc370d08a360f2 Mon Sep 17 00:00:00 2001
From: Niklas Haas <git@haasn.dev>
Date: Tue, 24 Feb 2026 12:24:32 +0100
Subject: [PATCH 5/8] swscale/x86/ops: properly mark SWS_OP_SCALE as flexible
---
libswscale/x86/ops.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/libswscale/x86/ops.c b/libswscale/x86/ops.c
index e9598ba437..0c6ec03a76 100644
--- a/libswscale/x86/ops.c
+++ b/libswscale/x86/ops.c
@@ -182,6 +182,7 @@ static int setup_shift(const SwsOp *op, SwsOpPriv *out)
DECL_COMMON_PATTERNS(F32, scale##EXT, \
.op = SWS_OP_SCALE, \
.setup = ff_sws_setup_q, \
+ .flexible = true, \
);
static int setup_dither(const SwsOp *op, SwsOpPriv *out)
--
2.52.0
>From b498e723623cdb0f844546d6b85e9fcad1592d20 Mon Sep 17 00:00:00 2001
From: Niklas Haas <git@haasn.dev>
Date: Tue, 24 Feb 2026 12:41:38 +0100
Subject: [PATCH 6/8] swscale/x86/ops: add special case for expanding bits to
bytes/words
Not super useful but also not expensive to carry.
monob -> gbrp:
Before: time=84 us, ref=137 us, speedup=1.618x faster
After: time=23 us, ref=185 us, speedup=7.773x faster
monob -> gray16le:
Before: time=75 us, ref=108 us, speedup=1.440x faster
After: time=20 us, ref=108 us, speedup=5.192x faster
Signed-off-by: Niklas Haas <git@haasn.dev>
---
libswscale/x86/ops.c | 10 ++++++++++
libswscale/x86/ops_int.asm | 19 +++++++++++++++++--
2 files changed, 27 insertions(+), 2 deletions(-)
diff --git a/libswscale/x86/ops.c b/libswscale/x86/ops.c
index 0c6ec03a76..82e85635d6 100644
--- a/libswscale/x86/ops.c
+++ b/libswscale/x86/ops.c
@@ -185,6 +185,12 @@ static int setup_shift(const SwsOp *op, SwsOpPriv *out)
.flexible = true, \
);
+#define DECL_EXPAND_BITS(EXT, BITS) \
+ DECL_ASM(U##BITS, expand_bits##BITS##EXT, \
+ .op = SWS_OP_SCALE, \
+ .scale = Q((1 << (BITS)) - 1), \
+ );
+
static int setup_dither(const SwsOp *op, SwsOpPriv *out)
{
/* 1x1 matrix / single constant */
@@ -268,6 +274,7 @@ static int setup_linear(const SwsOp *op, SwsOpPriv *out)
DECL_RW(EXT, U8, read_nibbles, READ, 1, false, 1) \
DECL_RW(EXT, U8, read_bits, READ, 1, false, 3) \
DECL_RW(EXT, U8, write_bits, WRITE, 1, false, 3) \
+ DECL_EXPAND_BITS(EXT, 8) \
DECL_PACKED_RW(EXT, 8) \
DECL_PACK_UNPACK(EXT, U8, 1, 2, 1, 0) \
DECL_PACK_UNPACK(EXT, U8, 3, 3, 2, 0) \
@@ -336,6 +343,7 @@ static const SwsOpTable ops8##EXT = {
&op_read_nibbles1##EXT, \
&op_read_bits1##EXT, \
&op_write_bits1##EXT, \
+ &op_expand_bits8##EXT, \
&op_pack_1210##EXT, \
&op_pack_3320##EXT, \
&op_pack_2330##EXT, \
@@ -386,6 +394,7 @@ static const SwsOpTable ops8##EXT = {
#define DECL_FUNCS_16(SIZE, EXT, FLAG) \
DECL_PACKED_RW(EXT, 16) \
+ DECL_EXPAND_BITS(EXT, 16) \
DECL_PACK_UNPACK(EXT, U16, 4, 4, 4, 0) \
DECL_PACK_UNPACK(EXT, U16, 5, 5, 5, 0) \
DECL_PACK_UNPACK(EXT, U16, 5, 6, 5, 0) \
@@ -414,6 +423,7 @@ static const SwsOpTable ops16##EXT = {
&op_unpack_4440##EXT, \
&op_unpack_5550##EXT, \
&op_unpack_5650##EXT, \
+ &op_expand_bits16##EXT, \
REF_COMMON_PATTERNS(swap_bytes_U16##EXT), \
REF_COMMON_PATTERNS(convert_U8_U16##EXT), \
REF_COMMON_PATTERNS(convert_U16_U8##EXT), \
diff --git a/libswscale/x86/ops_int.asm b/libswscale/x86/ops_int.asm
index 44af92a7da..bc9e43a098 100644
--- a/libswscale/x86/ops_int.asm
+++ b/libswscale/x86/ops_int.asm
@@ -52,6 +52,9 @@ mask2: times 32 db 0x03
mask3: times 32 db 0x07
mask4: times 32 db 0x0F
+const1b equ mask1
+const1w: times 16 dw 0x01
+
SECTION .text
;---------------------------------------------------------
@@ -456,7 +459,7 @@ IF V2, movd mx2, [in0q + 2]
%endif
mova m8, [bits_shuf]
VBROADCASTI128 m9, [bits_mask]
- VBROADCASTI128 m10, [mask1]
+ VBROADCASTI128 m10, [const1b]
LOAD_CONT tmp0q
add in0q, (mmsize >> 3) * (1 + V2)
pshufb mx, m8
@@ -947,7 +950,7 @@ IF W, vpermq mw, mw, q3120
%endmacro
;---------------------------------------------------------
-; Shifting
+; Shifting and scaling
%macro lshift16 0
op lshift16
@@ -983,6 +986,16 @@ IF W, psrlw mw2, xm8
CONTINUE tmp0q
%endmacro
+; special cases for expanding bits to full range
+%macro expand_bits 2 ; bits, suffix
+op expand_bits%1
+ mova m8, [const1%2]
+ LOAD_CONT tmp0q
+ pcmpeq%2 mx, m8
+IF V2, pcmpeq%2 mx2, m8
+ CONTINUE tmp0q
+%endmacro
+
;---------------------------------------------------------
; Macro instantiations for kernel functions
@@ -1000,6 +1013,7 @@ IF W, psrlw mw2, xm8
read_nibbles
read_bits
write_bits
+ expand_bits 8, b
pack_generic 1, 2, 1
pack_generic 3, 3, 2
@@ -1022,6 +1036,7 @@ IF W, psrlw mw2, xm8
%macro funcs_u16 0
rw_packed 16
+ expand_bits 16, w
pack_generic 4, 4, 4
pack_generic 5, 5, 5
pack_generic 5, 6, 5
--
2.52.0
>From 8f450150cb2877629f920d9d730a9139fc558262 Mon Sep 17 00:00:00 2001
From: Niklas Haas <git@haasn.dev>
Date: Tue, 24 Feb 2026 12:54:27 +0100
Subject: [PATCH 7/8] swscale/ops_backend: avoid UB from incorrect function
signature
Annoying C-ism; we can't overload the function type even though they will
always be pointers. We can't even get away with using (void *) in the
function signature, despite casts to void * being technically valid.
Avoid the issue altogether by moving the process loop into the
type-specific template, and referring to the correct compiled process
function at runtime. Hopefully, the compiler will be able to optimize
these into a single implementation.
Signed-off-by: Niklas Haas <git@haasn.dev>
---
libswscale/ops_backend.c | 36 ++++++++++++------------------------
libswscale/ops_tmpl_common.c | 24 ++++++++++++++++++++++++
2 files changed, 36 insertions(+), 24 deletions(-)
diff --git a/libswscale/ops_backend.c b/libswscale/ops_backend.c
index a503139016..449ba8c975 100644
--- a/libswscale/ops_backend.c
+++ b/libswscale/ops_backend.c
@@ -48,29 +48,6 @@ typedef float f32block_t[SWS_BLOCK_SIZE];
# include "ops_tmpl_float.c"
#undef BIT_DEPTH
-static void process(const SwsOpExec *exec, const void *priv,
- const int bx_start, const int y_start, int bx_end, int y_end)
-{
- const SwsOpChain *chain = priv;
- const SwsOpImpl *impl = chain->impl;
- u32block_t x, y, z, w; /* allocate enough space for any intermediate */
-
- SwsOpIter iterdata;
- SwsOpIter *iter = &iterdata; /* for CONTINUE() macro to work */
-
- for (iter->y = y_start; iter->y < y_end; iter->y++) {
- for (int i = 0; i < 4; i++) {
- iter->in[i] = exec->in[i] + (iter->y - y_start) * exec->in_stride[i];
- iter->out[i] = exec->out[i] + (iter->y - y_start) * exec->out_stride[i];
- }
-
- for (int block = bx_start; block < bx_end; block++) {
- iter->x = block * SWS_BLOCK_SIZE;
- CONTINUE(u32block_t, x, y, z, w);
- }
- }
-}
-
static int compile(SwsContext *ctx, SwsOpList *ops, SwsCompiledOp *out)
{
int ret;
@@ -79,6 +56,9 @@ static int compile(SwsContext *ctx, SwsOpList *ops, SwsCompiledOp *out)
if (!chain)
return AVERROR(ENOMEM);
+ av_assert0(ops->num_ops > 0);
+ const SwsPixelType read_type = ops->ops[0].type;
+
static const SwsOpTable *const tables[] = {
&bitfn(op_table_int, u8),
&bitfn(op_table_int, u16),
@@ -96,12 +76,20 @@ static int compile(SwsContext *ctx, SwsOpList *ops, SwsCompiledOp *out)
}
*out = (SwsCompiledOp) {
- .func = process,
.block_size = SWS_BLOCK_SIZE,
.cpu_flags = chain->cpu_flags,
.priv = chain,
.free = ff_sws_op_chain_free_cb,
};
+
+ switch (read_type) {
+ case SWS_PIXEL_U8: out->func = process_u8; break;
+ case SWS_PIXEL_U16: out->func = process_u16; break;
+ case SWS_PIXEL_U32: out->func = process_u32; break;
+ case SWS_PIXEL_F32: out->func = process_f32; break;
+ default: return AVERROR(EINVAL);
+ }
+
return 0;
}
diff --git a/libswscale/ops_tmpl_common.c b/libswscale/ops_tmpl_common.c
index 7cfec4e3f6..c0e0d9f3fb 100644
--- a/libswscale/ops_tmpl_common.c
+++ b/libswscale/ops_tmpl_common.c
@@ -175,3 +175,27 @@ WRAP_COMMON_PATTERNS(scale,
.setup = ff_sws_setup_q,
.flexible = true,
);
+
+static void fn(process)(const SwsOpExec *exec, const void *priv,
+ const int bx_start, const int y_start,
+ int bx_end, int y_end)
+{
+ const SwsOpChain *chain = priv;
+ const SwsOpImpl *impl = chain->impl;
+ u32block_t x, y, z, w; /* allocate enough space for any intermediate */
+
+ SwsOpIter iterdata;
+ SwsOpIter *iter = &iterdata; /* for CONTINUE() macro to work */
+
+ for (iter->y = y_start; iter->y < y_end; iter->y++) {
+ for (int i = 0; i < 4; i++) {
+ iter->in[i] = exec->in[i] + (iter->y - y_start) * exec->in_stride[i];
+ iter->out[i] = exec->out[i] + (iter->y - y_start) * exec->out_stride[i];
+ }
+
+ for (int block = bx_start; block < bx_end; block++) {
+ iter->x = block * SWS_BLOCK_SIZE;
+ CONTINUE(block_t, (void *) x, (void *) y, (void *) z, (void *) w);
+ }
+ }
+}
--
2.52.0
>From 95a3b32c2a204a792a20af95da276f3d4b786762 Mon Sep 17 00:00:00 2001
From: Niklas Haas <git@haasn.dev>
Date: Tue, 24 Feb 2026 12:58:30 +0100
Subject: [PATCH 8/8] swscale/ops: avoid UB in handle_tail()
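Adding an offset to a NULL pointer is undefined behavior in C, even when
the offset is 0 (the stupid NULL + 0 rule). Skip the pointer increment
when the pointer is NULL.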
Signed-off-by: Niklas Haas <git@haasn.dev>
---
libswscale/ops.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/libswscale/ops.c b/libswscale/ops.c
index 900077584a..cf5950aa7d 100644
--- a/libswscale/ops.c
+++ b/libswscale/ops.c
@@ -1028,9 +1028,9 @@ handle_tail(const SwsOpPass *p, SwsOpExec *exec,
}
for (int i = 0; i < 4; i++) {
- if (!copy_in)
+ if (!copy_in && exec->in[i])
exec->in[i] += in.linesize[i];
- if (!copy_out)
+ if (!copy_out && exec->out[i])
exec->out[i] += out.linesize[i];
}
}
--
2.52.0
_______________________________________________
ffmpeg-devel mailing list -- ffmpeg-devel@ffmpeg.org
To unsubscribe send an email to ffmpeg-devel-leave@ffmpeg.org
reply other threads:[~2026-02-24 21:33 UTC|newest]
Thread overview: [no followups] expand[flat|nested] mbox.gz Atom feed
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=177193467597.25.8893915831943615934@29965ddac10e \
--to=ffmpeg-devel@ffmpeg.org \
--cc=code@ffmpeg.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
This inbox may be cloned and mirrored by anyone:
git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git
# If you have public-inbox 1.1+ installed, you may
# initialize and index your mirror using the following commands:
public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
ffmpegdev@gitmailbox.com
public-inbox-index ffmpegdev
Example config snippet for mirrors.
AGPL code for this site: git clone https://public-inbox.org/public-inbox.git