* [FFmpeg-devel] [PR] libswscale: directly ref untouched planes instead of memcpy (PR #22293)
@ 2026-02-26 13:33 Niklas Haas via ffmpeg-devel
0 siblings, 0 replies; only message in thread
From: Niklas Haas via ffmpeg-devel @ 2026-02-26 13:33 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: Niklas Haas
PR #22293 opened by Niklas Haas (haasn)
URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/22293
Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/22293.patch
Currently, libswscale always goes through a full memcpy pass even when converting e.g. yuva444p -> gbrap. This series adds partial support for the new ops backend to detect and promote such per-plane no-ops to refcopies. This is done by "bubbling" metadata about plane copies upwards to SwsGraph, where the caller (swscale.c) can use it to ref planes directly.
This does unfortunately also require avoiding `av_frame_get_buffer()` in favor of a custom allocator for plane buffers, which I have loosely based on avfilter/framepool.c.
>From 5d1f06693be18e5bc1c29ced7cf5a3f2b5b7b389 Mon Sep 17 00:00:00 2001
From: Niklas Haas <git@haasn.dev>
Date: Wed, 18 Feb 2026 15:07:47 +0100
Subject: [PATCH 01/18] tests/swscale: exclude init time from benchmark
This was originally intended to also include performance gains/losses
due to complicated setup logic, but in practice it just means that changing
the number of iterations dramatically affects the measured speedup, which
makes it harder to do quick bench runs during development.
---
libswscale/tests/swscale.c | 14 ++++++++++----
1 file changed, 10 insertions(+), 4 deletions(-)
diff --git a/libswscale/tests/swscale.c b/libswscale/tests/swscale.c
index 6f6240e4b6..63f566ba18 100644
--- a/libswscale/tests/swscale.c
+++ b/libswscale/tests/swscale.c
@@ -193,7 +193,7 @@ static float get_loss(const float ssim[4])
}
static int scale_legacy(AVFrame *dst, const AVFrame *src, struct mode mode,
- struct options opts)
+ struct options opts, int64_t *out_time)
{
SwsContext *sws_legacy;
int ret;
@@ -215,8 +215,10 @@ static int scale_legacy(AVFrame *dst, const AVFrame *src, struct mode mode,
if ((ret = sws_init_context(sws_legacy, NULL, NULL)) < 0)
goto error;
+ int64_t time = av_gettime_relative();
for (int i = 0; ret >= 0 && i < opts.iters; i++)
ret = sws_scale_frame(sws_legacy, dst, src);
+ *out_time = av_gettime_relative() - time;
error:
sws_freeContext(sws_legacy);
@@ -273,6 +275,12 @@ static int run_test(enum AVPixelFormat src_fmt, enum AVPixelFormat dst_fmt,
sws[1]->dither = mode.dither;
sws[1]->threads = opts.threads;
+ if (sws_frame_setup(sws[1], dst, src) < 0) {
+ av_log(NULL, AV_LOG_ERROR, "Failed to setup %s ---> %s\n",
+ av_get_pix_fmt_name(src->format), av_get_pix_fmt_name(dst->format));
+ goto error;
+ }
+
time = av_gettime_relative();
for (int i = 0; i < opts.iters; i++) {
@@ -316,13 +324,11 @@ static int run_test(enum AVPixelFormat src_fmt, enum AVPixelFormat dst_fmt,
if (!ssim_ref && sws_isSupportedInput(src->format) && sws_isSupportedOutput(dst->format)) {
/* Compare against the legacy swscale API as a reference */
- time_ref = av_gettime_relative();
- if (scale_legacy(dst, src, mode, opts) < 0) {
+ if (scale_legacy(dst, src, mode, opts, &time_ref) < 0) {
av_log(NULL, AV_LOG_ERROR, "Failed ref %s ---> %s\n",
av_get_pix_fmt_name(src->format), av_get_pix_fmt_name(dst->format));
goto error;
}
- time_ref = av_gettime_relative() - time_ref;
if (sws_scale_frame(sws[2], out, dst) < 0)
goto error;
--
2.52.0
>From 877868ab83f349854c05e27c49d31e9e20177ab7 Mon Sep 17 00:00:00 2001
From: Niklas Haas <git@haasn.dev>
Date: Sun, 22 Feb 2026 19:47:58 +0100
Subject: [PATCH 02/18] tests/swscale: unref buffers before each iteration
Otherwise, we always pass frames that already have buffers allocated, which
breaks the no-op refcopy optimizations.
Testing with -p 0.1 -threads 16 -bench 10, on an AMD Ryzen 9 9950X3D:
Before:
Overall speedup=2.776x faster, min=0.133x max=629.496x
yuv444p 1920x1080 -> yuv444p 1920x1080, flags=0x100000 dither=1
time=9 us, ref=9 us, speedup=1.043x faster
After:
Overall speedup=2.721x faster, min=0.140x max=574.034x
yuv444p 1920x1080 -> yuv444p 1920x1080, flags=0x100000 dither=1
time=0 us, ref=28 us, speedup=516.504x faster
(The slowdown in the legacy swscale case is from swscale's lack of a no-op
refcopy optimization, plus the fact that it's now actually doing memory
work instead of a no-op / redundant memset)
Signed-off-by: Niklas Haas <git@haasn.dev>
---
libswscale/tests/swscale.c | 17 ++++++++++++++++-
1 file changed, 16 insertions(+), 1 deletion(-)
diff --git a/libswscale/tests/swscale.c b/libswscale/tests/swscale.c
index 63f566ba18..8001f0d105 100644
--- a/libswscale/tests/swscale.c
+++ b/libswscale/tests/swscale.c
@@ -192,6 +192,18 @@ static float get_loss(const float ssim[4])
return 1.0 - sum;
}
+static void unref_buffers(AVFrame *frame)
+{
+ for (int i = 0; i < FF_ARRAY_ELEMS(frame->buf); i++) {
+ if (!frame->buf[i])
+ break;
+ av_buffer_unref(&frame->buf[i]);
+ }
+
+ memset(frame->data, 0, sizeof(frame->data));
+ memset(frame->linesize, 0, sizeof(frame->linesize));
+}
+
static int scale_legacy(AVFrame *dst, const AVFrame *src, struct mode mode,
struct options opts, int64_t *out_time)
{
@@ -216,8 +228,10 @@ static int scale_legacy(AVFrame *dst, const AVFrame *src, struct mode mode,
goto error;
int64_t time = av_gettime_relative();
- for (int i = 0; ret >= 0 && i < opts.iters; i++)
+ for (int i = 0; ret >= 0 && i < opts.iters; i++) {
+ unref_buffers(dst);
ret = sws_scale_frame(sws_legacy, dst, src);
+ }
*out_time = av_gettime_relative() - time;
error:
@@ -284,6 +298,7 @@ static int run_test(enum AVPixelFormat src_fmt, enum AVPixelFormat dst_fmt,
time = av_gettime_relative();
for (int i = 0; i < opts.iters; i++) {
+ unref_buffers(dst);
if (sws_scale_frame(sws[1], dst, src) < 0) {
av_log(NULL, AV_LOG_ERROR, "Failed %s ---> %s\n",
av_get_pix_fmt_name(src->format), av_get_pix_fmt_name(dst->format));
--
2.52.0
>From 495ac476adebc988f90c1d8e300fd69a2116b7ec Mon Sep 17 00:00:00 2001
From: Niklas Haas <git@haasn.dev>
Date: Sat, 21 Feb 2026 17:59:22 +0100
Subject: [PATCH 03/18] swscale: add internal buffer pool abstraction
These can be used by swscale.c instead of performing naked allocations when
the output buffers are not provided by the caller. Comes with the advantage
of being able to zero-initialize buffers, for valgrind etc. to be a bit
happier, and may also save some performance in theory.
Code mostly copied from libavfilter/framepool.c, which we unfortunately
can't easily re-use due to not being exposed in e.g. libavutil, but I've
taken the plane alignment strategy of libavutil/frame.c instead of the naive
one in framepool.c because this makes frames more likely to match the
stride of AVFrames allocated by an external API user.
(Note that vf_scale.c already pre-allocates buffers from the downstream
filter's get_buffer() callback)
Signed-off-by: Niklas Haas <git@haasn.dev>
---
libswscale/swscale.c | 1 +
libswscale/swscale_internal.h | 12 +++++
libswscale/utils.c | 88 +++++++++++++++++++++++++++++++++++
3 files changed, 101 insertions(+)
diff --git a/libswscale/swscale.c b/libswscale/swscale.c
index 0e1831800c..a8fe3d03f1 100644
--- a/libswscale/swscale.c
+++ b/libswscale/swscale.c
@@ -27,6 +27,7 @@
#include "libavutil/common.h"
#include "libavutil/cpu.h"
#include "libavutil/emms.h"
+#include "libavutil/imgutils.h"
#include "libavutil/intreadwrite.h"
#include "libavutil/mem.h"
#include "libavutil/mem_internal.h"
diff --git a/libswscale/swscale_internal.h b/libswscale/swscale_internal.h
index 5c58272664..9979133b5c 100644
--- a/libswscale/swscale_internal.h
+++ b/libswscale/swscale_internal.h
@@ -326,6 +326,18 @@ typedef void (*planarX2_YV12_fn)(uint8_t *dst, uint8_t *dst2,
struct SwsSlice;
struct SwsFilterDescriptor;
+typedef struct SwsBufferPool {
+ AVBufferPool *pools[4]; /* pool for each plane allocation */
+ int linesize[4];
+ int format;
+ int width;
+ int height;
+} SwsBufferPool;
+
+int ff_sws_buffer_pool_reinit(SwsBufferPool *pool, const AVFrame *dst);
+void ff_sws_buffer_pool_uninit(SwsBufferPool *pool);
+int ff_sws_buffer_pool_get(const SwsBufferPool *pool, AVFrame *dst, int plane);
+
/* This struct should be aligned on at least a 32-byte boundary. */
struct SwsInternal {
/* Currently active user-facing options. Also contains AVClass */
diff --git a/libswscale/utils.c b/libswscale/utils.c
index 52095ab2c7..345e1b186c 100644
--- a/libswscale/utils.c
+++ b/libswscale/utils.c
@@ -2456,3 +2456,91 @@ int ff_range_add(RangeList *rl, unsigned int start, unsigned int len)
return 0;
}
+
+void ff_sws_buffer_pool_uninit(SwsBufferPool *pool)
+{
+ for (int i = 0; i < FF_ARRAY_ELEMS(pool->pools); i++)
+ av_buffer_pool_uninit(&pool->pools[i]);
+ memset(pool, 0, sizeof(*pool));
+}
+
+int ff_sws_buffer_pool_reinit(SwsBufferPool *pool, const AVFrame *dst)
+{
+ const int align = av_cpu_max_align();
+ if (pool->format == dst->format &&
+ pool->width == dst->width &&
+ pool->height == dst->height)
+ return 0; /* pool already compatible */
+
+ ff_sws_buffer_pool_uninit(pool);
+
+ int ret = av_image_check_size2(dst->width, dst->height, INT64_MAX,
+ dst->format, 0, NULL);
+ if (ret < 0)
+ goto fail;
+
+ for (int i = 1; i <= align; i += i) {
+ ret = av_image_fill_linesizes(pool->linesize, dst->format, FFALIGN(dst->width, i));
+ if (ret < 0)
+ goto fail;
+ if (!(pool->linesize[0] & (align - 1)))
+ break;
+ }
+
+ ptrdiff_t linesize1[4];
+ for (int i = 0; i < 4; i++)
+ linesize1[i] = pool->linesize[i] = FFALIGN(pool->linesize[i], align);
+
+ size_t sizes[4];
+ ret = av_image_fill_plane_sizes(sizes, dst->format, dst->height, linesize1);
+ if (ret < 0)
+ goto fail;
+
+ for (int i = 0; i < 4 && sizes[i]; i++) {
+ if (sizes[i] > SIZE_MAX - align) {
+ ret = AVERROR(EINVAL);
+ goto fail;
+ }
+
+ pool->pools[i] = av_buffer_pool_init(sizes[i] + align, NULL);
+ if (!pool->pools[i]) {
+ ret = AVERROR(ENOMEM);
+ goto fail;
+ }
+ }
+
+ pool->format = dst->format;
+ pool->width = dst->width;
+ pool->height = dst->height;
+ return 0;
+
+fail:
+ ff_sws_buffer_pool_uninit(pool);
+ return ret;
+}
+
+int ff_sws_buffer_pool_get(const SwsBufferPool *pool, AVFrame *dst, int plane)
+{
+ av_assert0(plane >= 0 && plane < FF_ARRAY_ELEMS(pool->pools));
+ if (!pool->pools[plane])
+ return AVERROR(EINVAL);
+
+ AVBufferRef *buf = av_buffer_pool_get(pool->pools[plane]);
+ if (!buf)
+ return AVERROR(ENOMEM);
+
+ const int align = av_cpu_max_align();
+ uint8_t *aligned_data = (uint8_t *) FFALIGN((uintptr_t) buf->data, align);
+ for (int i = 0; i < FF_ARRAY_ELEMS(dst->buf); i++) {
+ if (!dst->buf[i]) {
+ dst->data[plane] = aligned_data;
+ dst->linesize[plane] = pool->linesize[plane];
+ dst->buf[i] = buf;
+ return 0;
+ }
+ }
+
+ /* No free buffer slot found? Shouldn't be possible */
+ av_buffer_unref(&buf);
+ return AVERROR_BUG;
+}
--
2.52.0
>From 1b01a037d67f93e65925c599775ff3f5a560607e Mon Sep 17 00:00:00 2001
From: Niklas Haas <git@haasn.dev>
Date: Sat, 21 Feb 2026 18:09:46 +0100
Subject: [PATCH 04/18] swscale: add buffer pool to SwsContext and use it for
allocations
This will be needed down the line anyway for partial plane allocations.
Testing with -p 0.1 -threads 16 -bench 10, on an AMD Ryzen 9 9950X3D:
Before: Overall speedup=2.721x faster, min=0.140x max=574.034x
After: Overall speedup=2.648x faster, min=0.121x max=656.435x
Signed-off-by: Niklas Haas <git@haasn.dev>
---
libswscale/swscale.c | 21 ++++++++++++++++++++-
libswscale/swscale_internal.h | 6 ++++--
libswscale/utils.c | 1 +
libswscale/x86/output.asm | 2 +-
4 files changed, 26 insertions(+), 4 deletions(-)
diff --git a/libswscale/swscale.c b/libswscale/swscale.c
index a8fe3d03f1..e05310a724 100644
--- a/libswscale/swscale.c
+++ b/libswscale/swscale.c
@@ -1368,6 +1368,21 @@ static int frame_ref(AVFrame *dst, const AVFrame *src)
return 0;
}
+static int frame_alloc_buffers(SwsContext *sws, AVFrame *frame)
+{
+ SwsInternal *c = sws_internal(sws);
+ const SwsBufferPool *pool = &c->pool;
+
+ const int nb_planes = av_pix_fmt_count_planes(frame->format);
+ for (int i = 0; i < nb_planes; i++) {
+ int ret = ff_sws_buffer_pool_get(pool, frame, i);
+ if (ret < 0)
+ return ret;
+ }
+
+ return 0;
+}
+
int sws_scale_frame(SwsContext *sws, AVFrame *dst, const AVFrame *src)
{
int ret;
@@ -1408,7 +1423,7 @@ int sws_scale_frame(SwsContext *sws, AVFrame *dst, const AVFrame *src)
return ret;
} else {
if (!dst->data[0]) {
- ret = av_frame_get_buffer(dst, 0);
+ ret = frame_alloc_buffers(sws, dst);
if (ret < 0)
return ret;
}
@@ -1452,6 +1467,10 @@ int sws_frame_setup(SwsContext *ctx, const AVFrame *dst, const AVFrame *src)
if ((ret = validate_params(ctx)) < 0)
return ret;
+ ret = ff_sws_buffer_pool_reinit(&s->pool, dst);
+ if (ret < 0)
+ return ret;
+
for (int field = 0; field < 2; field++) {
SwsFormat src_fmt = ff_fmt_from_frame(src, field);
SwsFormat dst_fmt = ff_fmt_from_frame(dst, field);
diff --git a/libswscale/swscale_internal.h b/libswscale/swscale_internal.h
index 9979133b5c..9c3d575ab7 100644
--- a/libswscale/swscale_internal.h
+++ b/libswscale/swscale_internal.h
@@ -353,6 +353,7 @@ struct SwsInternal {
/* Scaling graph, reinitialized dynamically as needed. */
SwsGraph *graph[2]; /* top, bottom fields */
+ SwsBufferPool pool; /* pool for allocations */
// values passed to current sws_receive_slice() call
int dst_slice_start;
@@ -721,8 +722,9 @@ static_assert(offsetof(SwsInternal, redDither) + DITHER32_INT == offsetof(SwsInt
#if ARCH_X86_64
/* x86 yuv2gbrp uses the SwsInternal for yuv coefficients
- if struct offsets change the asm needs to be updated too */
-static_assert(offsetof(SwsInternal, yuv2rgb_y_offset) == 40348,
+ * if struct offsets change the asm needs to be updated too
+ * See: libswscale/x86/output.asm */
+static_assert(offsetof(SwsInternal, yuv2rgb_y_offset) == 40412,
"yuv2rgb_y_offset must be updated in x86 asm");
#endif
diff --git a/libswscale/utils.c b/libswscale/utils.c
index 345e1b186c..1b16ab9667 100644
--- a/libswscale/utils.c
+++ b/libswscale/utils.c
@@ -2261,6 +2261,7 @@ void sws_freeContext(SwsContext *sws)
for (i = 0; i < FF_ARRAY_ELEMS(c->graph); i++)
ff_sws_graph_free(&c->graph[i]);
+ ff_sws_buffer_pool_uninit(&c->pool);
for (i = 0; i < c->nb_slice_ctx; i++)
sws_freeContext(c->slice_ctx[i]);
diff --git a/libswscale/x86/output.asm b/libswscale/x86/output.asm
index f2e884780a..9d5c7a9f7a 100644
--- a/libswscale/x86/output.asm
+++ b/libswscale/x86/output.asm
@@ -582,7 +582,7 @@ yuv2nv12cX_fn yuv2nv21
%if ARCH_X86_64
struc SwsInternal
- .padding: resb 40348 ; offsetof(SwsInternal, yuv2rgb_y_offset)
+ .padding: resb 40412 ; offsetof(SwsInternal, yuv2rgb_y_offset)
.yuv2rgb_y_offset: resd 1
.yuv2rgb_y_coeff: resd 1
.yuv2rgb_v2r_coeff: resd 1
--
2.52.0
>From bc1e6a11b88d21a2bc903ff6084dcd0dff26ae97 Mon Sep 17 00:00:00 2001
From: Niklas Haas <git@haasn.dev>
Date: Sat, 21 Feb 2026 16:50:40 +0100
Subject: [PATCH 05/18] swscale/graph: only setup correct number of planes
Just for sanity.
Signed-off-by: Niklas Haas <git@haasn.dev>
---
libswscale/graph.c | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/libswscale/graph.c b/libswscale/graph.c
index e6843a7752..290c891761 100644
--- a/libswscale/graph.c
+++ b/libswscale/graph.c
@@ -841,9 +841,11 @@ static SwsImg pass_output(const SwsPass *pass, const SwsImg *fallback)
if (!pass)
return *fallback;
+ const int nb_planes = av_pix_fmt_count_planes(pass->format);
SwsImg img = pass->output->img;
- for (int i = 0; i < FF_ARRAY_ELEMS(img.data); i++) {
+ for (int i = 0; i < nb_planes; i++) {
if (!img.data[i]) {
+ av_assert0(fallback->data[i]);
img.data[i] = fallback->data[i];
img.linesize[i] = fallback->linesize[i];
}
--
2.52.0
>From 1e20e777c7ff8e91cf04eed88914dc6ffac19f32 Mon Sep 17 00:00:00 2001
From: Niklas Haas <git@haasn.dev>
Date: Wed, 11 Feb 2026 18:17:23 +0100
Subject: [PATCH 06/18] swscale/graph: add plane copy metadata
This can be set opportunistically by passes which are effectively memcopies.
Information about end-to-end plane copies will also propagate upwards to the
SwsGraph, and can be used there by the caller.
The reason we need to solve it this way is deeply tied to the way SwsGraph
is designed; specifically the fact that it can be used with arbitrary data
pointers and also lives per-field (rather than per-frame), meaning
that we can't just directly mutate the output frame somehow from the run()
call to ref the output planes.
Signed-off-by: Niklas Haas <git@haasn.dev>
---
libswscale/graph.c | 22 ++++++++++++++++++++++
libswscale/graph.h | 19 +++++++++++++++++++
2 files changed, 41 insertions(+)
diff --git a/libswscale/graph.c b/libswscale/graph.c
index 290c891761..1658748c32 100644
--- a/libswscale/graph.c
+++ b/libswscale/graph.c
@@ -105,6 +105,7 @@ SwsPass *ff_sws_graph_add_pass(SwsGraph *graph, enum AVPixelFormat fmt,
if (!pass)
return NULL;
+ memset(pass->plane_copy, -1, sizeof(pass->plane_copy));
pass->graph = graph;
pass->run = run;
pass->priv = priv;
@@ -717,12 +718,32 @@ static int init_passes(SwsGraph *graph)
if (!pass) {
/* No passes were added, so no operations were necessary */
graph->noop = 1;
+ const int nb_planes = av_pix_fmt_count_planes(dst.format);
+ for (int i = 0; i < nb_planes; i++)
+ graph->plane_copy[i] = i;
/* Add threaded memcpy pass */
pass = ff_sws_graph_add_pass(graph, dst.format, dst.width, dst.height,
pass, 1, NULL, run_copy);
if (!pass)
return AVERROR(ENOMEM);
+ } else {
+ /* Compute end-to-end plane copy map */
+ for (int n = 0; n < graph->num_passes; n++) {
+ const SwsPass *pass = graph->passes[n];
+ for (int i = 0; i < FF_ARRAY_ELEMS(graph->plane_copy); i++) {
+ const int idx = pass->plane_copy[i];
+ /* This pass writes to an output buffer other than the image
+ * output, or copies from the output of a different pass */
+ if (idx < 0 || pass->output->buf[i] ||
+ (pass->input && pass->input->output->buf[idx]))
+ continue;
+ av_assert0(graph->plane_copy[i] == -1);
+ graph->plane_copy[i] = idx;
+ av_log(graph->ctx, AV_LOG_DEBUG, "Plane %d passthrough from "
+ "plane %d\n", i, idx);
+ }
+ }
}
return 0;
@@ -752,6 +773,7 @@ int ff_sws_graph_create(SwsContext *ctx, const SwsFormat *dst, const SwsFormat *
graph->dst = *dst;
graph->field = field;
graph->opts_copy = *ctx;
+ memset(graph->plane_copy, -1, sizeof(graph->plane_copy));
ret = avpriv_slicethread_create(&graph->slicethread, (void *) graph,
sws_graph_worker, NULL, ctx->threads);
diff --git a/libswscale/graph.h b/libswscale/graph.h
index b9ad2425bd..ddbba64ef5 100644
--- a/libswscale/graph.h
+++ b/libswscale/graph.h
@@ -101,6 +101,15 @@ struct SwsPass {
*/
SwsPassBuffer *output; /* refstruct */
+ /**
+ * Map of planes which are directly copied from the pass input. These
+ * may be promoted from a memcpy to a refcopy.
+ *
+ * Each entry maps the output index to the corresponding input plane
+ * index, or -1 for no copythrough.
+ */
+ int plane_copy[4];
+
/**
* Called once from the main thread before running the filter. Optional.
*/
@@ -123,6 +132,16 @@ typedef struct SwsGraph {
bool incomplete; /* set during init() if formats had to be inferred */
bool noop; /* set during init() if the graph is a no-op */
+ /**
+ * Map of planes which are directly copied from the input. These may be
+ * promoted from a memcpy to a refcopy. This requires special handling
+ * by the caller.
+ *
+ * Each entry maps the output index to the corresponding input plane
+ * index, or -1 for no copythrough.
+ */
+ int plane_copy[4];
+
/** Sorted sequence of filter passes to apply */
SwsPass **passes;
int num_passes;
--
2.52.0
>From 3981945858d7b4a5014f7e60862c301ac104f6cb Mon Sep 17 00:00:00 2001
From: Niklas Haas <git@haasn.dev>
Date: Mon, 9 Feb 2026 14:35:54 +0100
Subject: [PATCH 07/18] swscale/ops_memcpy: omit memcpy() if src and dst are
identical
See the previous commit. This allows already referenced planes to be skipped
(and avoids UB in this case).
Note that this is already safe for the other backends, because reading from
and writing to the same pointer via e.g. AVX operations is also effectively
a no-op.
Signed-off-by: Niklas Haas <git@haasn.dev>
---
libswscale/ops_memcpy.c | 3 +++
1 file changed, 3 insertions(+)
diff --git a/libswscale/ops_memcpy.c b/libswscale/ops_memcpy.c
index 15002ae5ff..005b12de7d 100644
--- a/libswscale/ops_memcpy.c
+++ b/libswscale/ops_memcpy.c
@@ -42,6 +42,9 @@ static void process(const SwsOpExec *exec, const void *priv,
const int idx = p->index[i];
if (idx < 0) {
memset(out, p->clear_value[i], exec->out_stride[i] * lines);
+ } else if (out == exec->in[idx]) {
+ av_assert1(exec->out_stride[i] == exec->in_stride[idx]);
+ continue; /* plane was already ref'd */
} else if (exec->out_stride[i] == exec->in_stride[idx]) {
memcpy(out, exec->in[idx], exec->out_stride[i] * lines);
} else {
--
2.52.0
>From b2f2d79ec719a00ebecb528774b74b4e91503746 Mon Sep 17 00:00:00 2001
From: Niklas Haas <git@haasn.dev>
Date: Thu, 12 Feb 2026 10:44:19 +0100
Subject: [PATCH 08/18] swscale/ops: use SwsCompFlags typedef instead of plain
int
This improves the debugging experience. These are all internal structs so
there is no need to worry about ABI stability as a result of adding flags.
Signed-off-by: Niklas Haas <git@haasn.dev>
---
libswscale/ops.c | 14 +++++++-------
libswscale/ops.h | 8 ++++----
2 files changed, 11 insertions(+), 11 deletions(-)
diff --git a/libswscale/ops.c b/libswscale/ops.c
index b709ffef79..e3f5668501 100644
--- a/libswscale/ops.c
+++ b/libswscale/ops.c
@@ -215,11 +215,11 @@ void ff_sws_apply_op_q(const SwsOp *op, AVRational x[4])
}
/* merge_comp_flags() forms a monoid with flags_identity as the null element */
-static const unsigned flags_identity = SWS_COMP_ZERO | SWS_COMP_EXACT;
-static unsigned merge_comp_flags(unsigned a, unsigned b)
+static const SwsCompFlags flags_identity = SWS_COMP_ZERO | SWS_COMP_EXACT;
+static SwsCompFlags merge_comp_flags(SwsCompFlags a, SwsCompFlags b)
{
- const unsigned flags_or = SWS_COMP_GARBAGE;
- const unsigned flags_and = SWS_COMP_ZERO | SWS_COMP_EXACT;
+ const SwsCompFlags flags_or = SWS_COMP_GARBAGE;
+ const SwsCompFlags flags_and = SWS_COMP_ZERO | SWS_COMP_EXACT;
return ((a & b) & flags_and) | ((a | b) & flags_or);
}
@@ -322,7 +322,7 @@ void ff_sws_op_list_update_comps(SwsOpList *ops)
}
break;
case SWS_OP_PACK: {
- unsigned flags = flags_identity;
+ SwsCompFlags flags = flags_identity;
for (int i = 0; i < 4; i++) {
if (op->pack.pattern[i])
flags = merge_comp_flags(flags, prev.flags[i]);
@@ -358,7 +358,7 @@ void ff_sws_op_list_update_comps(SwsOpList *ops)
break;
case SWS_OP_LINEAR:
for (int i = 0; i < 4; i++) {
- unsigned flags = flags_identity;
+ SwsCompFlags flags = flags_identity;
AVRational min = Q(0), max = Q(0);
for (int j = 0; j < 4; j++) {
const AVRational k = op->lin.m[i][j];
@@ -667,7 +667,7 @@ static const char *describe_lin_mask(uint32_t mask)
return "ERR";
}
-static char describe_comp_flags(unsigned flags)
+static char describe_comp_flags(SwsCompFlags flags)
{
if (flags & SWS_COMP_GARBAGE)
return 'X';
diff --git a/libswscale/ops.h b/libswscale/ops.h
index d1576b9325..86a5f27a58 100644
--- a/libswscale/ops.h
+++ b/libswscale/ops.h
@@ -69,12 +69,12 @@ typedef enum SwsOpType {
SWS_OP_TYPE_NB,
} SwsOpType;
-enum SwsCompFlags {
+typedef enum SwsCompFlags {
SWS_COMP_GARBAGE = 1 << 0, /* contents are undefined / garbage data */
SWS_COMP_EXACT = 1 << 1, /* value is an exact integer */
SWS_COMP_ZERO = 1 << 2, /* known to be a constant zero */
SWS_COMP_SWAPPED = 1 << 3, /* byte order is swapped */
-};
+} SwsCompFlags;
typedef union SwsConst {
/* Generic constant value */
@@ -87,8 +87,8 @@ static_assert(sizeof(SwsConst) == sizeof(AVRational) * 4,
"First field of SwsConst should span the entire union");
typedef struct SwsComps {
- unsigned flags[4]; /* knowledge about (output) component contents */
- bool unused[4]; /* which input components are definitely unused */
+ SwsCompFlags flags[4]; /* knowledge about (output) component contents */
+ bool unused[4]; /* which input components are definitely unused */
/* Keeps track of the known possible value range, or {0, 0} for undefined
* or (unknown range) floating point inputs */
--
2.52.0
>From 6641a07a91ea1d23a1ddfdf867f06f5711a7a411 Mon Sep 17 00:00:00 2001
From: Niklas Haas <git@haasn.dev>
Date: Mon, 16 Feb 2026 11:09:11 +0100
Subject: [PATCH 09/18] swscale/ops: add ff_sws_op_type_name
Sponsored-by: Sovereign Tech Fund
Signed-off-by: Niklas Haas <git@haasn.dev>
---
libswscale/ops.c | 26 ++++++++++++++++++++++++++
libswscale/ops.h | 2 ++
2 files changed, 28 insertions(+)
diff --git a/libswscale/ops.c b/libswscale/ops.c
index e3f5668501..6da080732e 100644
--- a/libswscale/ops.c
+++ b/libswscale/ops.c
@@ -94,6 +94,32 @@ bool ff_sws_pixel_type_is_int(SwsPixelType type)
return false;
}
+const char *ff_sws_op_type_name(SwsOpType op)
+{
+ switch (op) {
+ case SWS_OP_READ: return "SWS_OP_READ";
+ case SWS_OP_WRITE: return "SWS_OP_WRITE";
+ case SWS_OP_SWAP_BYTES: return "SWS_OP_SWAP_BYTES";
+ case SWS_OP_SWIZZLE: return "SWS_OP_SWIZZLE";
+ case SWS_OP_UNPACK: return "SWS_OP_UNPACK";
+ case SWS_OP_PACK: return "SWS_OP_PACK";
+ case SWS_OP_LSHIFT: return "SWS_OP_LSHIFT";
+ case SWS_OP_RSHIFT: return "SWS_OP_RSHIFT";
+ case SWS_OP_CLEAR: return "SWS_OP_CLEAR";
+ case SWS_OP_CONVERT: return "SWS_OP_CONVERT";
+ case SWS_OP_MIN: return "SWS_OP_MIN";
+ case SWS_OP_MAX: return "SWS_OP_MAX";
+ case SWS_OP_SCALE: return "SWS_OP_SCALE";
+ case SWS_OP_LINEAR: return "SWS_OP_LINEAR";
+ case SWS_OP_DITHER: return "SWS_OP_DITHER";
+ case SWS_OP_INVALID: return "SWS_OP_INVALID";
+ case SWS_OP_TYPE_NB: break;
+ }
+
+ av_unreachable("Invalid operation type!");
+ return false;
+}
+
/* biased towards `a` */
static AVRational av_min_q(AVRational a, AVRational b)
{
diff --git a/libswscale/ops.h b/libswscale/ops.h
index 86a5f27a58..fe83739b5a 100644
--- a/libswscale/ops.h
+++ b/libswscale/ops.h
@@ -69,6 +69,8 @@ typedef enum SwsOpType {
SWS_OP_TYPE_NB,
} SwsOpType;
+const char *ff_sws_op_type_name(SwsOpType op);
+
typedef enum SwsCompFlags {
SWS_COMP_GARBAGE = 1 << 0, /* contents are undefined / garbage data */
SWS_COMP_EXACT = 1 << 1, /* value is an exact integer */
--
2.52.0
>From 112c9385ca3b423196dd8012b48248aab935d224 Mon Sep 17 00:00:00 2001
From: Niklas Haas <git@haasn.dev>
Date: Mon, 16 Feb 2026 12:04:09 +0100
Subject: [PATCH 10/18] swscale/ops: simplify ff_sws_op_list_print
Using the new ff_sws_op_type_name() helper.
Sponsored-by: Sovereign Tech Fund
Signed-off-by: Niklas Haas <git@haasn.dev>
---
libswscale/ops.c | 38 ++++++++++++++++----------------------
1 file changed, 16 insertions(+), 22 deletions(-)
diff --git a/libswscale/ops.c b/libswscale/ops.c
index 6da080732e..f4bf8ab255 100644
--- a/libswscale/ops.c
+++ b/libswscale/ops.c
@@ -746,8 +746,9 @@ void ff_sws_op_list_print(void *log, int lev, int lev_extra,
}
for (int i = 0; i < ops->num_ops; i++) {
- const SwsOp *op = &ops->ops[i];
+ const SwsOp *op = &ops->ops[i];
const SwsOp *next = i + 1 < ops->num_ops ? &ops->ops[i + 1] : op;
+ const char *name = ff_sws_op_type_name(op->op);
char buf[32];
av_log(log, lev, " [%3s %c%c%c%c -> %c%c%c%c] ",
@@ -763,68 +764,62 @@ void ff_sws_op_list_print(void *log, int lev, int lev_extra,
switch (op->op) {
case SWS_OP_INVALID:
- av_log(log, lev, "SWS_OP_INVALID\n");
+ case SWS_OP_SWAP_BYTES:
+ av_log(log, lev, "%s\n", name);
break;
case SWS_OP_READ:
case SWS_OP_WRITE:
- av_log(log, lev, "%-20s: %d elem(s) %s >> %d%s\n",
- op->op == SWS_OP_READ ? "SWS_OP_READ"
- : "SWS_OP_WRITE",
+ av_log(log, lev, "%-20s: %d elem(s) %s >> %d%s\n", name,
op->rw.elems, op->rw.packed ? "packed" : "planar",
op->rw.frac,
describe_order(op->op == SWS_OP_READ ? ops->order_src
: ops->order_dst,
op->rw.packed ? 1 : op->rw.elems, buf));
break;
- case SWS_OP_SWAP_BYTES:
- av_log(log, lev, "SWS_OP_SWAP_BYTES\n");
- break;
case SWS_OP_LSHIFT:
- av_log(log, lev, "%-20s: << %u\n", "SWS_OP_LSHIFT", op->c.u);
+ av_log(log, lev, "%-20s: << %u\n", name, op->c.u);
break;
case SWS_OP_RSHIFT:
- av_log(log, lev, "%-20s: >> %u\n", "SWS_OP_RSHIFT", op->c.u);
+ av_log(log, lev, "%-20s: >> %u\n", name, op->c.u);
break;
case SWS_OP_PACK:
case SWS_OP_UNPACK:
- av_log(log, lev, "%-20s: {%d %d %d %d}\n",
- op->op == SWS_OP_PACK ? "SWS_OP_PACK"
- : "SWS_OP_UNPACK",
+ av_log(log, lev, "%-20s: {%d %d %d %d}\n", name,
op->pack.pattern[0], op->pack.pattern[1],
op->pack.pattern[2], op->pack.pattern[3]);
break;
case SWS_OP_CLEAR:
- av_log(log, lev, "%-20s: {%s %s %s %s}\n", "SWS_OP_CLEAR",
+ av_log(log, lev, "%-20s: {%s %s %s %s}\n", name,
op->c.q4[0].den ? PRINTQ(op->c.q4[0]) : "_",
op->c.q4[1].den ? PRINTQ(op->c.q4[1]) : "_",
op->c.q4[2].den ? PRINTQ(op->c.q4[2]) : "_",
op->c.q4[3].den ? PRINTQ(op->c.q4[3]) : "_");
break;
case SWS_OP_SWIZZLE:
- av_log(log, lev, "%-20s: %d%d%d%d\n", "SWS_OP_SWIZZLE",
+ av_log(log, lev, "%-20s: %d%d%d%d\n", name,
op->swizzle.x, op->swizzle.y, op->swizzle.z, op->swizzle.w);
break;
case SWS_OP_CONVERT:
- av_log(log, lev, "%-20s: %s -> %s%s\n", "SWS_OP_CONVERT",
+ av_log(log, lev, "%-20s: %s -> %s%s\n", name,
ff_sws_pixel_type_name(op->type),
ff_sws_pixel_type_name(op->convert.to),
op->convert.expand ? " (expand)" : "");
break;
case SWS_OP_DITHER:
- av_log(log, lev, "%-20s: %dx%d matrix + {%d %d %d %d}\n", "SWS_OP_DITHER",
+ av_log(log, lev, "%-20s: %dx%d matrix + {%d %d %d %d}\n", name,
1 << op->dither.size_log2, 1 << op->dither.size_log2,
op->dither.y_offset[0], op->dither.y_offset[1],
op->dither.y_offset[2], op->dither.y_offset[3]);
break;
case SWS_OP_MIN:
- av_log(log, lev, "%-20s: x <= {%s %s %s %s}\n", "SWS_OP_MIN",
+ av_log(log, lev, "%-20s: x <= {%s %s %s %s}\n", name,
op->c.q4[0].den ? PRINTQ(op->c.q4[0]) : "_",
op->c.q4[1].den ? PRINTQ(op->c.q4[1]) : "_",
op->c.q4[2].den ? PRINTQ(op->c.q4[2]) : "_",
op->c.q4[3].den ? PRINTQ(op->c.q4[3]) : "_");
break;
case SWS_OP_MAX:
- av_log(log, lev, "%-20s: {%s %s %s %s} <= x\n", "SWS_OP_MAX",
+ av_log(log, lev, "%-20s: {%s %s %s %s} <= x\n", name,
op->c.q4[0].den ? PRINTQ(op->c.q4[0]) : "_",
op->c.q4[1].den ? PRINTQ(op->c.q4[1]) : "_",
op->c.q4[2].den ? PRINTQ(op->c.q4[2]) : "_",
@@ -835,15 +830,14 @@ void ff_sws_op_list_print(void *log, int lev, int lev_extra,
"[%s %s %s %s %s] "
"[%s %s %s %s %s] "
"[%s %s %s %s %s]]\n",
- "SWS_OP_LINEAR", describe_lin_mask(op->lin.mask),
+ name, describe_lin_mask(op->lin.mask),
PRINTQ(op->lin.m[0][0]), PRINTQ(op->lin.m[0][1]), PRINTQ(op->lin.m[0][2]), PRINTQ(op->lin.m[0][3]), PRINTQ(op->lin.m[0][4]),
PRINTQ(op->lin.m[1][0]), PRINTQ(op->lin.m[1][1]), PRINTQ(op->lin.m[1][2]), PRINTQ(op->lin.m[1][3]), PRINTQ(op->lin.m[1][4]),
PRINTQ(op->lin.m[2][0]), PRINTQ(op->lin.m[2][1]), PRINTQ(op->lin.m[2][2]), PRINTQ(op->lin.m[2][3]), PRINTQ(op->lin.m[2][4]),
PRINTQ(op->lin.m[3][0]), PRINTQ(op->lin.m[3][1]), PRINTQ(op->lin.m[3][2]), PRINTQ(op->lin.m[3][3]), PRINTQ(op->lin.m[3][4]));
break;
case SWS_OP_SCALE:
- av_log(log, lev, "%-20s: * %s\n", "SWS_OP_SCALE",
- PRINTQ(op->c.q));
+ av_log(log, lev, "%-20s: * %s\n", name, PRINTQ(op->c.q));
break;
case SWS_OP_TYPE_NB:
break;
--
2.52.0
>From fdc2445f676e371cc4dd8b64934d83c7f19268c6 Mon Sep 17 00:00:00 2001
From: Niklas Haas <git@haasn.dev>
Date: Thu, 8 Jan 2026 13:08:23 +0100
Subject: [PATCH 11/18] swscale/ops: solve for plane dependencies
When output planes are independent of each other, we can try and split them
into separate output paths.
Generates benign diffs that just reflect the new addition.
Sponsored-by: Sovereign Tech Fund
Signed-off-by: Niklas Haas <git@haasn.dev>
---
libswscale/ops.c | 36 ++++++++++++++++++++++++++++--------
libswscale/ops.h | 4 ++++
tests/ref/fate/sws-ops-list | 2 +-
3 files changed, 33 insertions(+), 9 deletions(-)
diff --git a/libswscale/ops.c b/libswscale/ops.c
index f4bf8ab255..9aad4c9b44 100644
--- a/libswscale/ops.c
+++ b/libswscale/ops.c
@@ -244,8 +244,10 @@ void ff_sws_apply_op_q(const SwsOp *op, AVRational x[4])
static const SwsCompFlags flags_identity = SWS_COMP_ZERO | SWS_COMP_EXACT;
static SwsCompFlags merge_comp_flags(SwsCompFlags a, SwsCompFlags b)
{
- const SwsCompFlags flags_or = SWS_COMP_GARBAGE;
const SwsCompFlags flags_and = SWS_COMP_ZERO | SWS_COMP_EXACT;
+ const SwsCompFlags flags_or = SWS_COMP_PLANE0 | SWS_COMP_PLANE1 |
+ SWS_COMP_PLANE2 | SWS_COMP_PLANE3 |
+ SWS_COMP_GARBAGE;
return ((a & b) & flags_and) | ((a | b) & flags_or);
}
@@ -306,6 +308,9 @@ void ff_sws_op_list_update_comps(SwsOpList *ops)
op->comps.min[i] = prev.min[i];
op->comps.max[i] = prev.max[i];
}
+ /* Mark plane dependencies */
+ for (int i = 0; i < op->rw.elems; i++)
+ op->comps.flags[i] |= SWS_COMP_PLANE0 << (op->rw.packed ? 0 : i);
break;
case SWS_OP_SWAP_BYTES:
for (int i = 0; i < 4; i++) {
@@ -719,6 +724,19 @@ static const char *describe_order(SwsSwizzleOp order, int planes, char buf[32])
return buf;
}
+static char *describe_comp_planes(SwsCompFlags flags, char buf[5])
+{
+ int num = 0;
+ for (int i = 0; i < 4; i++) {
+ if (flags & (SWS_COMP_PLANE0 << i))
+ buf[num++] = '0' + i;
+ }
+ if (!num)
+ buf[num++] = '_';
+ buf[num] = '\0';
+ return buf;
+}
+
static const char *print_q(const AVRational q, char buf[], int buf_len)
{
if (!q.den) {
@@ -843,12 +861,11 @@ void ff_sws_op_list_print(void *log, int lev, int lev_extra,
break;
}
- if (op->comps.min[0].den || op->comps.min[1].den ||
- op->comps.min[2].den || op->comps.min[3].den ||
- op->comps.max[0].den || op->comps.max[1].den ||
- op->comps.max[2].den || op->comps.max[3].den)
+ if (op->comps.flags[0] || op->comps.flags[1] ||
+ op->comps.flags[2] || op->comps.flags[3])
{
- av_log(log, lev_extra, " min: {%s, %s, %s, %s}, max: {%s, %s, %s, %s}\n",
+ av_log(log, lev_extra, " min: {%s, %s, %s, %s}, "
+ "max: {%s, %s, %s, %s}, deps: {%s, %s, %s, %s}\n",
next->comps.unused[0] ? "_" : PRINTQ(op->comps.min[0]),
next->comps.unused[1] ? "_" : PRINTQ(op->comps.min[1]),
next->comps.unused[2] ? "_" : PRINTQ(op->comps.min[2]),
@@ -856,9 +873,12 @@ void ff_sws_op_list_print(void *log, int lev, int lev_extra,
next->comps.unused[0] ? "_" : PRINTQ(op->comps.max[0]),
next->comps.unused[1] ? "_" : PRINTQ(op->comps.max[1]),
next->comps.unused[2] ? "_" : PRINTQ(op->comps.max[2]),
- next->comps.unused[3] ? "_" : PRINTQ(op->comps.max[3]));
+ next->comps.unused[3] ? "_" : PRINTQ(op->comps.max[3]),
+ describe_comp_planes(op->comps.flags[0], (char[5]) {0}),
+ describe_comp_planes(op->comps.flags[1], (char[5]) {0}),
+ describe_comp_planes(op->comps.flags[2], (char[5]) {0}),
+ describe_comp_planes(op->comps.flags[3], (char[5]) {0}));
}
-
}
av_log(log, lev, " (X = unused, z = byteswapped, + = exact, 0 = zero)\n");
diff --git a/libswscale/ops.h b/libswscale/ops.h
index fe83739b5a..35de769490 100644
--- a/libswscale/ops.h
+++ b/libswscale/ops.h
@@ -76,6 +76,10 @@ typedef enum SwsCompFlags {
SWS_COMP_EXACT = 1 << 1, /* value is an exact integer */
SWS_COMP_ZERO = 1 << 2, /* known to be a constant zero */
SWS_COMP_SWAPPED = 1 << 3, /* byte order is swapped */
+ SWS_COMP_PLANE0 = 1 << 4, /* depends on values from plane 0 */
+ SWS_COMP_PLANE1 = 1 << 5, /* depends on values from plane 1 */
+ SWS_COMP_PLANE2 = 1 << 6, /* depends on values from plane 2 */
+ SWS_COMP_PLANE3 = 1 << 7, /* depends on values from plane 3 */
} SwsCompFlags;
typedef union SwsConst {
diff --git a/tests/ref/fate/sws-ops-list b/tests/ref/fate/sws-ops-list
index 13049a0c14..cd7d6b6e55 100644
--- a/tests/ref/fate/sws-ops-list
+++ b/tests/ref/fate/sws-ops-list
@@ -1 +1 @@
-a312bd79cadff3e2e02fd14ae7e54e26
+061b8014b9e05a9b91af1fcfc0556e41
--
2.52.0
>From cf5f0a459f30ca6c9140343400d8f8768e7eba0d Mon Sep 17 00:00:00 2001
From: Niklas Haas <git@haasn.dev>
Date: Thu, 12 Feb 2026 10:50:09 +0100
Subject: [PATCH 12/18] swscale/ops: keep track of copied/cleared components
These flags mark components which have not (yet) been modified from their
input values (i.e. directly after a read or a clear). Such components can
simply be passed through via a refcopy (where applicable); the flags also help
to distinguish dissimilar types of planes for plane splitting.
Generates benign diffs like:
gbrp -> rgba:
- [ u8 XXXX -> +++X] SWS_OP_READ : 3 elem(s) planar >> 0, via {2, 0, 1}
- [ u8 ...X -> ++++] SWS_OP_CLEAR : {_ _ _ 255}
- [ u8 .... -> ++++] SWS_OP_WRITE : 4 elem(s) packed >> 0
- (X = unused, z = byteswapped, + = exact, 0 = zero)
+ [ u8 XXXX -> ===X] SWS_OP_READ : 3 elem(s) planar >> 0, via {2, 0, 1}
+ [ u8 ...X -> ===$] SWS_OP_CLEAR : {_ _ _ 255}
+ [ u8 .... -> ===$] SWS_OP_WRITE : 4 elem(s) packed >> 0
+ ('X' unused, 'z' byteswapped, '=' copied, '$' const, '+' integer, '0' zero)
Sponsored-by: Sovereign Tech Fund
Signed-off-by: Niklas Haas <git@haasn.dev>
---
libswscale/ops.c | 86 ++++++++++++++++++++++++-------------
libswscale/ops.h | 2 +
tests/ref/fate/sws-ops-list | 2 +-
3 files changed, 59 insertions(+), 31 deletions(-)
diff --git a/libswscale/ops.c b/libswscale/ops.c
index 9aad4c9b44..71150a464e 100644
--- a/libswscale/ops.c
+++ b/libswscale/ops.c
@@ -240,24 +240,23 @@ void ff_sws_apply_op_q(const SwsOp *op, AVRational x[4])
av_unreachable("Invalid operation type!");
}
-/* merge_comp_flags() forms a monoid with flags_identity as the null element */
-static const SwsCompFlags flags_identity = SWS_COMP_ZERO | SWS_COMP_EXACT;
+enum {
+ SWS_COMP_IDENTITY = SWS_COMP_ZERO | SWS_COMP_EXACT |
+ SWS_COMP_COPY | SWS_COMP_CONST,
+
+ SWS_COMP_DIRTY = ~(SWS_COMP_COPY | SWS_COMP_CONST),
+};
+
+/* merge_comp_flags() forms a monoid with SWS_COMP_IDENTITY as the null element */
static SwsCompFlags merge_comp_flags(SwsCompFlags a, SwsCompFlags b)
{
- const SwsCompFlags flags_and = SWS_COMP_ZERO | SWS_COMP_EXACT;
+ const SwsCompFlags flags_and = SWS_COMP_IDENTITY;
const SwsCompFlags flags_or = SWS_COMP_PLANE0 | SWS_COMP_PLANE1 |
SWS_COMP_PLANE2 | SWS_COMP_PLANE3 |
SWS_COMP_GARBAGE;
return ((a & b) & flags_and) | ((a | b) & flags_or);
}
-/* Linearly propagate flags per component */
-static void propagate_flags(SwsOp *op, const SwsComps *prev)
-{
- for (int i = 0; i < 4; i++)
- op->comps.flags[i] = prev->flags[i];
-}
-
/* Clear undefined values in dst with src */
static void clear_undefined_values(AVRational dst[4], const AVRational src[4])
{
@@ -309,12 +308,23 @@ void ff_sws_op_list_update_comps(SwsOpList *ops)
op->comps.max[i] = prev.max[i];
}
/* Mark plane dependencies */
- for (int i = 0; i < op->rw.elems; i++)
+ for (int i = 0; i < op->rw.elems; i++) {
op->comps.flags[i] |= SWS_COMP_PLANE0 << (op->rw.packed ? 0 : i);
+
+ /**
+ * Don't mark packed or fractional reads as a copy, because the
+ * read operation implicitly unpacks the data into separate
+ * components. The only case in which op lists involving such
+ * reads can be refcopies is in the case of a true noop, which
+ * is already covered by the no-op check.
+ */
+ if (!op->rw.packed && !op->rw.frac)
+ op->comps.flags[i] |= SWS_COMP_COPY;
+ }
break;
case SWS_OP_SWAP_BYTES:
for (int i = 0; i < 4; i++) {
- op->comps.flags[i] = prev.flags[i] ^ SWS_COMP_SWAPPED;
+ op->comps.flags[i] = (prev.flags[i] ^ SWS_COMP_SWAPPED) & SWS_COMP_DIRTY;
op->comps.min[i] = prev.min[i];
op->comps.max[i] = prev.max[i];
}
@@ -322,30 +332,36 @@ void ff_sws_op_list_update_comps(SwsOpList *ops)
case SWS_OP_WRITE:
for (int i = 0; i < op->rw.elems; i++)
av_assert1(!(prev.flags[i] & SWS_COMP_GARBAGE));
- /* fall through */
+ for (int i = 0; i < 4; i++)
+ op->comps.flags[i] = prev.flags[i];
+ break;
case SWS_OP_LSHIFT:
case SWS_OP_RSHIFT:
- propagate_flags(op, &prev);
+ for (int i = 0; i < 4; i++)
+ op->comps.flags[i] = prev.flags[i] & SWS_COMP_DIRTY;
break;
case SWS_OP_MIN:
- propagate_flags(op, &prev);
- clear_undefined_values(op->comps.max, op->c.q4);
- break;
case SWS_OP_MAX:
- propagate_flags(op, &prev);
- clear_undefined_values(op->comps.min, op->c.q4);
+ for (int i = 0; i < 4; i++) {
+ op->comps.flags[i] = prev.flags[i];
+ if (op->c.q4[i].den)
+ op->comps.flags[i] &= SWS_COMP_DIRTY;
+ }
+ clear_undefined_values(op->op == SWS_OP_MIN ? op->comps.max : op->comps.min, op->c.q4);
break;
case SWS_OP_DITHER:
- /* Strip zero flag because of the nonzero dithering offset */
- for (int i = 0; i < 4; i++)
- op->comps.flags[i] = prev.flags[i] & ~SWS_COMP_ZERO;
+ for (int i = 0; i < 4; i++) {
+ op->comps.flags[i] = prev.flags[i] & SWS_COMP_DIRTY;
+ /* Strip zero flag because of the nonzero dithering offset */
+ op->comps.flags[i] &= ~SWS_COMP_ZERO;
+ }
break;
case SWS_OP_UNPACK:
for (int i = 0; i < 4; i++) {
const int pattern = op->pack.pattern[i];
if (pattern) {
av_assert1(pattern < 32);
- op->comps.flags[i] = prev.flags[0];
+ op->comps.flags[i] = prev.flags[0] & SWS_COMP_DIRTY;
op->comps.min[i] = Q(0);
op->comps.max[i] = Q((1ULL << pattern) - 1);
} else
@@ -353,20 +369,20 @@ void ff_sws_op_list_update_comps(SwsOpList *ops)
}
break;
case SWS_OP_PACK: {
- SwsCompFlags flags = flags_identity;
+ SwsCompFlags flags = SWS_COMP_IDENTITY;
for (int i = 0; i < 4; i++) {
if (op->pack.pattern[i])
flags = merge_comp_flags(flags, prev.flags[i]);
if (i > 0) /* clear remaining comps for sanity */
op->comps.flags[i] = SWS_COMP_GARBAGE;
}
- op->comps.flags[0] = flags;
+ op->comps.flags[0] = flags & SWS_COMP_DIRTY;
break;
}
case SWS_OP_CLEAR:
for (int i = 0; i < 4; i++) {
if (op->c.q4[i].den) {
- op->comps.flags[i] = 0;
+ op->comps.flags[i] = SWS_COMP_CONST;
if (op->c.q4[i].num == 0)
op->comps.flags[i] |= SWS_COMP_ZERO;
if (op->c.q4[i].den == 1)
@@ -383,14 +399,17 @@ void ff_sws_op_list_update_comps(SwsOpList *ops)
case SWS_OP_CONVERT:
for (int i = 0; i < 4; i++) {
op->comps.flags[i] = prev.flags[i];
+ if (!(prev.flags[i] & SWS_COMP_EXACT) || op->convert.expand)
+ op->comps.flags[i] &= SWS_COMP_DIRTY;
if (ff_sws_pixel_type_is_int(op->convert.to))
op->comps.flags[i] |= SWS_COMP_EXACT;
}
break;
case SWS_OP_LINEAR:
for (int i = 0; i < 4; i++) {
- SwsCompFlags flags = flags_identity;
+ SwsCompFlags flags = SWS_COMP_IDENTITY;
AVRational min = Q(0), max = Q(0);
+ AVRational sum = Q(0);
for (int j = 0; j < 4; j++) {
const AVRational k = op->lin.m[i][j];
AVRational mink = av_mul_q(prev.min[j], k);
@@ -403,10 +422,13 @@ void ff_sws_op_list_update_comps(SwsOpList *ops)
FFSWAP(AVRational, mink, maxk);
min = av_add_q(min, mink);
max = av_add_q(max, maxk);
+ if (sum.num || av_cmp_q(k, Q(1)))
+ flags &= SWS_COMP_DIRTY;
+ sum = av_add_q(sum, k);
}
}
if (op->lin.m[i][4].num) { /* nonzero offset */
- flags &= ~SWS_COMP_ZERO;
+ flags &= ~SWS_COMP_ZERO | SWS_COMP_DIRTY;
if (op->lin.m[i][4].den != 1) /* fractional offset */
flags &= ~SWS_COMP_EXACT;
min = av_add_q(min, op->lin.m[i][4]);
@@ -419,7 +441,7 @@ void ff_sws_op_list_update_comps(SwsOpList *ops)
break;
case SWS_OP_SCALE:
for (int i = 0; i < 4; i++) {
- op->comps.flags[i] = prev.flags[i];
+ op->comps.flags[i] = prev.flags[i] & SWS_COMP_DIRTY;
if (op->c.q.den != 1) /* fractional scale */
op->comps.flags[i] &= ~SWS_COMP_EXACT;
if (op->c.q.num < 0)
@@ -706,6 +728,10 @@ static char describe_comp_flags(SwsCompFlags flags)
return '0';
else if (flags & SWS_COMP_SWAPPED)
return 'z';
+ else if (flags & SWS_COMP_CONST)
+ return '$';
+ else if (flags & SWS_COMP_COPY)
+ return '=';
else if (flags & SWS_COMP_EXACT)
return '+';
else
@@ -881,7 +907,7 @@ void ff_sws_op_list_print(void *log, int lev, int lev_extra,
}
}
- av_log(log, lev, " (X = unused, z = byteswapped, + = exact, 0 = zero)\n");
+ av_log(log, lev, " ('X' unused, 'z' byteswapped, '=' copied, '$' const, '+' integer, '0' zero)\n");
}
int ff_sws_ops_compile_backend(SwsContext *ctx, const SwsOpBackend *backend,
diff --git a/libswscale/ops.h b/libswscale/ops.h
index 35de769490..e1de49c569 100644
--- a/libswscale/ops.h
+++ b/libswscale/ops.h
@@ -80,6 +80,8 @@ typedef enum SwsCompFlags {
SWS_COMP_PLANE1 = 1 << 5, /* depends on values from plane 1 */
SWS_COMP_PLANE2 = 1 << 6, /* depends on values from plane 2 */
SWS_COMP_PLANE3 = 1 << 7, /* depends on values from plane 3 */
+ SWS_COMP_COPY = 1 << 8, /* value is unmodified from the source plane */
+ SWS_COMP_CONST = 1 << 9, /* value is a fixed constant */
} SwsCompFlags;
typedef union SwsConst {
diff --git a/tests/ref/fate/sws-ops-list b/tests/ref/fate/sws-ops-list
index cd7d6b6e55..95237bbbeb 100644
--- a/tests/ref/fate/sws-ops-list
+++ b/tests/ref/fate/sws-ops-list
@@ -1 +1 @@
-061b8014b9e05a9b91af1fcfc0556e41
+d718767aa22a99a0a2e3a66608fb8a77
--
2.52.0
>From d3099a80b3a2595b00de0199a87c8f017f441e29 Mon Sep 17 00:00:00 2001
From: Niklas Haas <git@haasn.dev>
Date: Sat, 7 Feb 2026 11:55:09 +0100
Subject: [PATCH 13/18] swscale/ops: make `ff_sws_compile_pass` not mutate the
input list
This also simplifies memory management in an upcoming change, apart from being
a less surprising API overall.
Sponsored-by: Sovereign Tech Fund
Signed-off-by: Niklas Haas <git@haasn.dev>
---
libswscale/ops.c | 46 ++++++++++++++++++++++++++++++----------------
libswscale/ops.h | 6 ++----
2 files changed, 32 insertions(+), 20 deletions(-)
diff --git a/libswscale/ops.c b/libswscale/ops.c
index 71150a464e..cf05c3f219 100644
--- a/libswscale/ops.c
+++ b/libswscale/ops.c
@@ -1204,14 +1204,13 @@ static int rw_pixel_bits(const SwsOp *op)
return elems * size * bits;
}
-int ff_sws_compile_pass(SwsGraph *graph, SwsOpList *ops, int flags, SwsFormat dst,
- SwsPass *input, SwsPass **output)
+int ff_sws_compile_pass(SwsGraph *graph, const SwsOpList *ops, int flags,
+ SwsFormat dst, SwsPass *input, SwsPass **output)
{
SwsContext *ctx = graph->ctx;
+ SwsOpList *copy = NULL;
+ SwsPass *pass = NULL;
SwsOpPass *p = NULL;
- const SwsOp *read = &ops->ops[0];
- const SwsOp *write = &ops->ops[ops->num_ops - 1];
- SwsPass *pass;
int ret;
/* Check if the whole operation graph is an end-to-end no-op */
@@ -1225,25 +1224,37 @@ int ff_sws_compile_pass(SwsGraph *graph, SwsOpList *ops, int flags, SwsFormat ds
return AVERROR(EINVAL);
}
- if (read->op != SWS_OP_READ || write->op != SWS_OP_WRITE) {
+ if (ops->ops[0].op != SWS_OP_READ || ops->ops[ops->num_ops - 1].op != SWS_OP_WRITE) {
av_log(ctx, AV_LOG_ERROR, "First and last operations must be a read "
"and write, respectively.\n");
return AVERROR(EINVAL);
}
- if (flags & SWS_OP_FLAG_OPTIMIZE)
- RET(ff_sws_op_list_optimize(ops));
- else
- ff_sws_op_list_update_comps(ops);
-
- p = av_mallocz(sizeof(*p));
- if (!p)
+ /* Make mutable copy of `ops` */
+ copy = ff_sws_op_list_duplicate(ops);
+ if (!copy)
return AVERROR(ENOMEM);
- ret = ff_sws_ops_compile(ctx, ops, &p->comp);
+ if (flags & SWS_OP_FLAG_OPTIMIZE) {
+ ret = ff_sws_op_list_optimize(copy);
+ if (ret < 0)
+ goto fail;
+ } else {
+ ff_sws_op_list_update_comps(copy);
+ }
+
+ p = av_mallocz(sizeof(*p));
+ if (!p) {
+ ret = AVERROR(ENOMEM);
+ goto fail;
+ }
+
+ ret = ff_sws_ops_compile(ctx, copy, &p->comp);
if (ret < 0)
goto fail;
+ const SwsOp *read = &copy->ops[0];
+ const SwsOp *write = &copy->ops[copy->num_ops - 1];
p->planes_in = rw_planes(read);
p->planes_out = rw_planes(write);
p->pixel_bits_in = rw_pixel_bits(read);
@@ -1256,8 +1267,8 @@ int ff_sws_compile_pass(SwsGraph *graph, SwsOpList *ops, int flags, SwsFormat ds
};
for (int i = 0; i < 4; i++) {
- p->idx_in[i] = i < p->planes_in ? ops->order_src.in[i] : -1;
- p->idx_out[i] = i < p->planes_out ? ops->order_dst.in[i] : -1;
+ p->idx_in[i] = i < p->planes_in ? copy->order_src.in[i] : -1;
+ p->idx_out[i] = i < p->planes_out ? copy->order_dst.in[i] : -1;
}
pass = ff_sws_graph_add_pass(graph, dst.format, dst.width, dst.height, input,
@@ -1268,11 +1279,14 @@ int ff_sws_compile_pass(SwsGraph *graph, SwsOpList *ops, int flags, SwsFormat ds
}
pass->setup = op_pass_setup;
pass->free = op_pass_free;
+ p = NULL; /* ownership transferred to pass */
+ ff_sws_op_list_free(&copy);
*output = pass;
return 0;
fail:
+ ff_sws_op_list_free(&copy);
op_pass_free(p);
return ret;
}
diff --git a/libswscale/ops.h b/libswscale/ops.h
index e1de49c569..429827eb4f 100644
--- a/libswscale/ops.h
+++ b/libswscale/ops.h
@@ -302,10 +302,8 @@ enum SwsOpCompileFlags {
/**
* Resolves an operation list to a graph pass. The first and last operations
* must be a read/write respectively. `flags` is a list of SwsOpCompileFlags.
- *
- * Note: `ops` may be modified by this function.
*/
-int ff_sws_compile_pass(SwsGraph *graph, SwsOpList *ops, int flags, SwsFormat dst,
- SwsPass *input, SwsPass **output);
+int ff_sws_compile_pass(SwsGraph *graph, const SwsOpList *ops, int flags,
+ SwsFormat dst, SwsPass *input, SwsPass **output);
#endif
--
2.52.0
>From 04f2894484be17829ce80c5703f309f47475b3c5 Mon Sep 17 00:00:00 2001
From: Niklas Haas <git@haasn.dev>
Date: Sat, 7 Feb 2026 11:59:40 +0100
Subject: [PATCH 14/18] swscale/ops: remove unneeded macro
Sponsored-by: Sovereign Tech Fund
Signed-off-by: Niklas Haas <git@haasn.dev>
---
libswscale/ops.c | 6 ------
1 file changed, 6 deletions(-)
diff --git a/libswscale/ops.c b/libswscale/ops.c
index cf05c3f219..c7869d6b48 100644
--- a/libswscale/ops.c
+++ b/libswscale/ops.c
@@ -41,12 +41,6 @@ const SwsOpBackend * const ff_sws_op_backends[] = {
NULL
};
-#define RET(x) \
- do { \
- if ((ret = (x)) < 0) \
- return ret; \
- } while (0)
-
const char *ff_sws_pixel_type_name(SwsPixelType type)
{
switch (type) {
--
2.52.0
>From ce6bf40ca02a3cf27d789b2bb15da5188e7c4261 Mon Sep 17 00:00:00 2001
From: Niklas Haas <git@haasn.dev>
Date: Wed, 11 Feb 2026 18:37:55 +0100
Subject: [PATCH 15/18] swscale/ops: generate plane copy map for passes
Uses the previously added SWS_COMP_COPY to determine which planes are direct
copies of other planes.
Signed-off-by: Niklas Haas <git@haasn.dev>
---
libswscale/ops.c | 44 ++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 44 insertions(+)
diff --git a/libswscale/ops.c b/libswscale/ops.c
index c7869d6b48..81da6fc48c 100644
--- a/libswscale/ops.c
+++ b/libswscale/ops.c
@@ -1198,6 +1198,49 @@ static int rw_pixel_bits(const SwsOp *op)
return elems * size * bits;
}
+/* Updates the plane copy (no-op) map for this operation list */
+static void op_list_get_plane_copy(const SwsOpList *ops, int plane_copy[4])
+{
+ if (!ops->num_ops)
+ return;
+
+ const SwsOp *write = &ops->ops[ops->num_ops - 1];
+ av_assert0(write->op == SWS_OP_WRITE);
+ if (write->rw.packed || write->rw.frac)
+ return; /* only regular planes can be directly ref'd */
+
+ /**
+ * Check if the type still matches (for sanity). Note that SWS_OP_CONVERT
+ * doesn't clear SWS_COMP_COPY, to allow the copythrough property to
+ * survive e.g. redundant (no-op) conversions through f32 and back.
+ *
+ * In practice, this check shouldn't ever matter, because any conversion
+ * to a different output type is overwhelmingly likely to involve at least
+ * _some_ change to the pixel data (otherwise why bother?), but it helps
+ * me sleep at night.
+ */
+ const SwsOp *read = &ops->ops[0];
+ if (read->op != SWS_OP_READ || read->type != write->type)
+ return;
+
+ for (int i = 0; i < write->rw.elems; i++) {
+ enum SwsCompFlags flags = write->comps.flags[i];
+ if (!(flags & SWS_COMP_COPY))
+ continue;
+
+ const int out_idx = ops->order_dst.in[i];
+ const int depmask = SWS_COMP_PLANE0 | SWS_COMP_PLANE1 |
+ SWS_COMP_PLANE2 | SWS_COMP_PLANE3;
+
+ switch (flags & depmask) {
+ case SWS_COMP_PLANE0: plane_copy[out_idx] = ops->order_src.in[0]; break;
+ case SWS_COMP_PLANE1: plane_copy[out_idx] = ops->order_src.in[1]; break;
+ case SWS_COMP_PLANE2: plane_copy[out_idx] = ops->order_src.in[2]; break;
+ case SWS_COMP_PLANE3: plane_copy[out_idx] = ops->order_src.in[3]; break;
+ }
+ }
+}
+
int ff_sws_compile_pass(SwsGraph *graph, const SwsOpList *ops, int flags,
SwsFormat dst, SwsPass *input, SwsPass **output)
{
@@ -1271,6 +1314,7 @@ int ff_sws_compile_pass(SwsGraph *graph, const SwsOpList *ops, int flags,
ret = AVERROR(ENOMEM);
goto fail;
}
+ op_list_get_plane_copy(copy, pass->plane_copy);
pass->setup = op_pass_setup;
pass->free = op_pass_free;
p = NULL; /* ownership transferred to pass */
--
2.52.0
>From e9b57df4b778c198b40fec7aebab29671fbe9393 Mon Sep 17 00:00:00 2001
From: Niklas Haas <git@haasn.dev>
Date: Sat, 21 Feb 2026 16:15:58 +0100
Subject: [PATCH 16/18] swscale/graph: internally ref copied intermediate
buffers
If the pass we are allocating output buffers for has a plane copy map,
we can optimize away the internal buffer by setting it to a ref of its
input buffer, if one exists.
In theory, we can also do this if there is not an input buffer, by
directly referencing a sentinel or placeholder SwsPassBuffer corresponding to
the original input image, but this will require a bit more work so I have
decided to hold off on it for now.
Signed-off-by: Niklas Haas <git@haasn.dev>
---
libswscale/graph.c | 12 ++++++++++++
1 file changed, 12 insertions(+)
diff --git a/libswscale/graph.c b/libswscale/graph.c
index 1658748c32..90ee0e4e2b 100644
--- a/libswscale/graph.c
+++ b/libswscale/graph.c
@@ -75,6 +75,18 @@ static int pass_alloc_output(SwsPass *pass)
const int align = av_cpu_max_align();
for (int i = 0; i < 4; i++) {
+ int src_idx = pass->plane_copy[i];
+ if (src_idx >= 0 && pass->input) {
+ /* Ref the source plane instead of allocating a new buffer */
+ const SwsPassBuffer *src = pass->input->output;
+ output->buf[i] = av_buffer_ref(src->buf[src_idx]);
+ if (!output->buf[i])
+ return AVERROR(ENOMEM);
+ output->img.data[i] = src->img.data[src_idx];
+ output->img.linesize[i] = src->img.linesize[src_idx];
+ continue;
+ }
+
if (!sizes[i])
break;
if (sizes[i] > SIZE_MAX - align)
--
2.52.0
>From e3648cca4d5a7f40075ea068a441fb20516aaa4a Mon Sep 17 00:00:00 2001
From: Niklas Haas <git@haasn.dev>
Date: Sun, 22 Feb 2026 19:30:34 +0100
Subject: [PATCH 17/18] swscale: restructure sws_scale_frame() slightly
Cosmetic restructuring to prepare for the upcoming non-cosmetic change in
logic.
Signed-off-by: Niklas Haas <git@haasn.dev>
---
libswscale/swscale.c | 47 +++++++++++++++++++++++---------------------
1 file changed, 25 insertions(+), 22 deletions(-)
diff --git a/libswscale/swscale.c b/libswscale/swscale.c
index e05310a724..9c4f8cda43 100644
--- a/libswscale/swscale.c
+++ b/libswscale/swscale.c
@@ -1413,29 +1413,32 @@ int sws_scale_frame(SwsContext *sws, AVFrame *dst, const AVFrame *src)
if (!src->data[0])
return 0;
- if (c->graph[FIELD_TOP]->noop &&
- (!c->graph[FIELD_BOTTOM] || c->graph[FIELD_BOTTOM]->noop) &&
- src->buf[0] && !dst->buf[0] && !dst->data[0])
- {
- /* Lightweight refcopy */
- ret = frame_ref(dst, src);
- if (ret < 0)
- return ret;
- } else {
- if (!dst->data[0]) {
- ret = frame_alloc_buffers(sws, dst);
- if (ret < 0)
- return ret;
- }
+ const SwsGraph *top = c->graph[FIELD_TOP];
+ const SwsGraph *bot = c->graph[FIELD_BOTTOM];
+ if (dst->data[0]) /* user-provided buffers */
+ goto process_frame;
- for (int field = 0; field < 2; field++) {
- SwsGraph *graph = c->graph[field];
- SwsImg input = get_frame_img(src, field);
- SwsImg output = get_frame_img(dst, field);
- ff_sws_graph_run(graph, &output, &input);
- if (!graph->dst.interlaced)
- break;
- }
+ /* Sanity */
+ memset(dst->buf, 0, sizeof(dst->buf));
+ memset(dst->data, 0, sizeof(dst->data));
+ memset(dst->linesize, 0, sizeof(dst->linesize));
+ dst->extended_data = dst->data;
+
+ if (src->buf[0] && top->noop && (!bot || bot->noop))
+ return frame_ref(dst, src);
+
+ ret = frame_alloc_buffers(sws, dst);
+ if (ret < 0)
+ return ret;
+
+process_frame:
+ for (int field = 0; field < 2; field++) {
+ SwsGraph *graph = c->graph[field];
+ SwsImg input = get_frame_img(src, field);
+ SwsImg output = get_frame_img(dst, field);
+ ff_sws_graph_run(graph, &output, &input);
+ if (!bot)
+ break;
}
return 0;
--
2.52.0
>From ee2df995fda251bc6c27a312c054f8b6ff347575 Mon Sep 17 00:00:00 2001
From: Niklas Haas <git@haasn.dev>
Date: Sun, 22 Feb 2026 19:31:52 +0100
Subject: [PATCH 18/18] swscale: partially ref copied planes
Instead of all-or-nothing, this commit allows ref'ing some planes while
copying others, by referencing the SwsGraph plane copy map.
Testing with -p 0.1 -threads 16 -bench 10, on an AMD Ryzen 9 9950X3D:
Before:
Overall speedup=2.648x faster, min=0.121x max=656.435x
gray 1920x1080 -> yuvj444p 1920x1080, flags=0x100000 dither=1
time=27 us, ref=25 us, speedup=0.932x slower
After:
Overall speedup=2.749x faster, min=0.172x max=565.059x
gray 1920x1080 -> yuvj444p 1920x1080, flags=0x100000 dither=1
time=16 us, ref=25 us, speedup=1.490x faster
Note that the overall impact of this commit is rather limited on its own
due to the fact that most planes are not sufficiently recognized as
independent copies in the absence of plane splitting. This will be improved
by an upcoming series.
Signed-off-by: Niklas Haas <git@haasn.dev>
---
libswscale/swscale.c | 58 +++++++++++++++++++++++++++++++++++---------
1 file changed, 46 insertions(+), 12 deletions(-)
diff --git a/libswscale/swscale.c b/libswscale/swscale.c
index 9c4f8cda43..1d629f35fa 100644
--- a/libswscale/swscale.c
+++ b/libswscale/swscale.c
@@ -1351,20 +1351,37 @@ static SwsImg get_frame_img(const AVFrame *frame, int field)
return img;
}
-/* Subset of av_frame_ref() that only references (video) data buffers */
-static int frame_ref(AVFrame *dst, const AVFrame *src)
+/* Similar to av_frame_ref() but only references planes in the given map */
+static int frame_ref(AVFrame *dst, const AVFrame *src, const int plane_copy[4])
{
- /* ref the buffers */
- for (int i = 0; i < FF_ARRAY_ELEMS(src->buf); i++) {
- if (!src->buf[i])
+ int copied[4] = {0};
+ int nb_copied = 0;
+
+ for (int i = 0; i < 4; i++) {
+ const int idx = plane_copy[i];
+ if (idx < 0)
continue;
- dst->buf[i] = av_buffer_ref(src->buf[i]);
- if (!dst->buf[i])
- return AVERROR(ENOMEM);
+ /* Find corresponding source buffer */
+ uint8_t *src_data = src->data[idx];
+ av_assert0(src_data);
+ for (int j = 0; j < 4; j++) {
+ AVBufferRef *buf = src->buf[j];
+ if (!buf)
+ break;
+ if (src_data < buf->data || src_data >= buf->data + buf->size)
+ continue; /* wrong buffer */
+ if (!copied[j]) {
+ AVBufferRef *ref = av_buffer_ref(buf);
+ if (!ref)
+ return AVERROR(ENOMEM);
+ dst->buf[nb_copied++] = ref;
+ copied[j] = 1;
+ }
+ dst->data[i] = src_data;
+ dst->linesize[i] = src->linesize[idx];
+ }
}
- memcpy(dst->data, src->data, sizeof(src->data));
- memcpy(dst->linesize, src->linesize, sizeof(src->linesize));
return 0;
}
@@ -1375,6 +1392,8 @@ static int frame_alloc_buffers(SwsContext *sws, AVFrame *frame)
const int nb_planes = av_pix_fmt_count_planes(frame->format);
for (int i = 0; i < nb_planes; i++) {
+ if (frame->data[i])
+ continue; /* already ref'd by frame_ref */
int ret = ff_sws_buffer_pool_get(pool, frame, i);
if (ret < 0)
return ret;
@@ -1424,9 +1443,24 @@ int sws_scale_frame(SwsContext *sws, AVFrame *dst, const AVFrame *src)
memset(dst->linesize, 0, sizeof(dst->linesize));
dst->extended_data = dst->data;
- if (src->buf[0] && top->noop && (!bot || bot->noop))
- return frame_ref(dst, src);
+ if (src->buf[0]) {
+ /* Determine end-to-end plane copy map */
+ int plane_copy[FF_ARRAY_ELEMS(top->plane_copy)];
+ memcpy(plane_copy, top->plane_copy, sizeof(plane_copy));
+ for (int i = 0; bot && i < FF_ARRAY_ELEMS(plane_copy); i++) {
+ if (bot->plane_copy[i] != plane_copy[i])
+ plane_copy[i] = -1;
+ }
+ ret = frame_ref(dst, src, plane_copy);
+ if (ret < 0)
+ return ret;
+
+ if (top->noop && (!bot || bot->noop))
+ return 0; /* all planes should be ref'd now */
+ }
+
+ /* Allocate any missing buffers not yet ref'd */
ret = frame_alloc_buffers(sws, dst);
if (ret < 0)
return ret;
--
2.52.0
_______________________________________________
ffmpeg-devel mailing list -- ffmpeg-devel@ffmpeg.org
To unsubscribe send an email to ffmpeg-devel-leave@ffmpeg.org
^ permalink raw reply [flat|nested] only message in thread
only message in thread, other threads:[~2026-02-26 13:34 UTC | newest]
Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2026-02-26 13:33 [FFmpeg-devel] [PR] libswscale: directly ref untouched planes instead of memcpy (PR #22293) Niklas Haas via ffmpeg-devel
Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
This inbox may be cloned and mirrored by anyone:
git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git
# If you have public-inbox 1.1+ installed, you may
# initialize and index your mirror using the following commands:
public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
ffmpegdev@gitmailbox.com
public-inbox-index ffmpegdev
Example config snippet for mirrors.
AGPL code for this site: git clone https://public-inbox.org/public-inbox.git