Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
 help / color / mirror / Atom feed
From: Niklas Haas via ffmpeg-devel <ffmpeg-devel@ffmpeg.org>
To: ffmpeg-devel@ffmpeg.org
Cc: Niklas Haas <code@ffmpeg.org>
Subject: [FFmpeg-devel] [PR] swscale: refactor dispatch layer to handle multiple passes per op list (PR #22319)
Date: Fri, 27 Feb 2026 17:28:04 -0000
Message-ID: <177221328498.25.6802438838394186248@29965ddac10e> (raw)

PR #22319 opened by Niklas Haas (haasn)
URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/22319
Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/22319.patch

This is sorely needed for backends to be able to compile op lists involving resizes, because in that case the backend has to compile a single op list into multiple passes.

I explored many different designs for resolving this gracefully at the API level, including one where backends add passes directly to the SwsGraph instead of returning them to the caller, but in the end, each of these designs either did not work well enough or had issues that prevented it from being a good solution to the problem.

So in the end, I settled on allowing backends to return partial results: a backend may return EAGAIN and leave the remainder in `ops`.


>From 3e5f1925527e9658605ce0897fb3d7615d2d2a9b Mon Sep 17 00:00:00 2001
From: Niklas Haas <git@haasn.dev>
Date: Fri, 27 Feb 2026 15:02:00 +0100
Subject: [PATCH 1/6] swscale/ops: move pass compilation logic to helper
 function

Purely cosmetic.

Sponsored-by: Sovereign Tech Fund
Signed-off-by: Niklas Haas <git@haasn.dev>
---
 libswscale/ops.c | 97 +++++++++++++++++++++++++++---------------------
 1 file changed, 54 insertions(+), 43 deletions(-)

diff --git a/libswscale/ops.c b/libswscale/ops.c
index b3db758682..7a307ca29e 100644
--- a/libswscale/ops.c
+++ b/libswscale/ops.c
@@ -1145,14 +1145,60 @@ static int rw_pixel_bits(const SwsOp *op)
     return elems * size * bits;
 }
 
+static int compile(SwsGraph *graph, const SwsOpList *ops,
+                   const SwsFormat *dst, SwsPass *input, SwsPass **output)
+{
+    SwsContext *ctx = graph->ctx;
+    SwsOpPass *p = av_mallocz(sizeof(*p));
+    if (!p)
+        return AVERROR(ENOMEM);
+
+    int ret = ff_sws_ops_compile(ctx, ops, &p->comp);
+    if (ret < 0)
+        goto fail;
+
+    const SwsOp *read = &ops->ops[0];
+    const SwsOp *write = &ops->ops[ops->num_ops - 1];
+    p->planes_in  = rw_planes(read);
+    p->planes_out = rw_planes(write);
+    p->pixel_bits_in  = rw_pixel_bits(read);
+    p->pixel_bits_out = rw_pixel_bits(write);
+    p->exec_base = (SwsOpExec) {
+        .width  = dst->width,
+        .height = dst->height,
+        .block_size_in  = p->comp.block_size * p->pixel_bits_in  >> 3,
+        .block_size_out = p->comp.block_size * p->pixel_bits_out >> 3,
+    };
+
+    for (int i = 0; i < 4; i++) {
+        p->idx_in[i]  = i < p->planes_in  ? ops->order_src.in[i] : -1;
+        p->idx_out[i] = i < p->planes_out ? ops->order_dst.in[i] : -1;
+    }
+
+    SwsPass *pass;
+    pass = ff_sws_graph_add_pass(graph, dst->format, dst->width, dst->height,
+                                 input, p->comp.slice_align, p, op_pass_run);
+    if (!pass) {
+        ret = AVERROR(ENOMEM);
+        goto fail;
+    }
+    pass->setup = op_pass_setup;
+    pass->free  = op_pass_free;
+
+    *output = pass;
+    return 0;
+
+fail:
+    op_pass_free(p);
+    return ret;
+}
+
 int ff_sws_compile_pass(SwsGraph *graph, SwsOpList *ops, int flags,
                         const SwsFormat *dst, SwsPass *input, SwsPass **output)
 {
     SwsContext *ctx = graph->ctx;
-    SwsOpPass *p = NULL;
     const SwsOp *read = &ops->ops[0];
     const SwsOp *write = &ops->ops[ops->num_ops - 1];
-    SwsPass *pass;
     int ret;
 
     /* Check if the whole operation graph is an end-to-end no-op */
@@ -1172,48 +1218,13 @@ int ff_sws_compile_pass(SwsGraph *graph, SwsOpList *ops, int flags,
         return AVERROR(EINVAL);
     }
 
-    if (flags & SWS_OP_FLAG_OPTIMIZE)
-        RET(ff_sws_op_list_optimize(ops));
-    else
+    if (flags & SWS_OP_FLAG_OPTIMIZE) {
+        ret = ff_sws_op_list_optimize(ops);
+        if (ret < 0)
+            return ret;
+    } else {
         ff_sws_op_list_update_comps(ops);
-
-    p = av_mallocz(sizeof(*p));
-    if (!p)
-        return AVERROR(ENOMEM);
-
-    ret = ff_sws_ops_compile(ctx, ops, &p->comp);
-    if (ret < 0)
-        goto fail;
-
-    p->planes_in  = rw_planes(read);
-    p->planes_out = rw_planes(write);
-    p->pixel_bits_in  = rw_pixel_bits(read);
-    p->pixel_bits_out = rw_pixel_bits(write);
-    p->exec_base = (SwsOpExec) {
-        .width  = dst->width,
-        .height = dst->height,
-        .block_size_in  = p->comp.block_size * p->pixel_bits_in  >> 3,
-        .block_size_out = p->comp.block_size * p->pixel_bits_out >> 3,
-    };
-
-    for (int i = 0; i < 4; i++) {
-        p->idx_in[i]  = i < p->planes_in  ? ops->order_src.in[i] : -1;
-        p->idx_out[i] = i < p->planes_out ? ops->order_dst.in[i] : -1;
     }
 
-    pass = ff_sws_graph_add_pass(graph, dst->format, dst->width, dst->height,
-                                 input, p->comp.slice_align, p, op_pass_run);
-    if (!pass) {
-        ret = AVERROR(ENOMEM);
-        goto fail;
-    }
-    pass->setup = op_pass_setup;
-    pass->free  = op_pass_free;
-
-    *output = pass;
-    return 0;
-
-fail:
-    op_pass_free(p);
-    return ret;
+    return compile(graph, ops, dst, input, output);
 }
-- 
2.52.0


>From 6f40015e03d3a0653e3fc7cc6905a8072986bab1 Mon Sep 17 00:00:00 2001
From: Niklas Haas <git@haasn.dev>
Date: Fri, 27 Feb 2026 15:17:24 +0100
Subject: [PATCH 2/6] swscale/ops_dispatch: split off compile/dispatch code
 from ops.c

This code is self-contained and logically distinct from the ops-related
helpers in ops.c, so it belongs in its own file.

Purely cosmetic; no functional change.

Sponsored-by: Sovereign Tech Fund
Signed-off-by: Niklas Haas <git@haasn.dev>
---
 libswscale/Makefile       |   1 +
 libswscale/ops.c          | 381 -----------------------------------
 libswscale/ops_dispatch.c | 408 ++++++++++++++++++++++++++++++++++++++
 libswscale/ops_dispatch.h | 103 ++++++++++
 libswscale/ops_internal.h |  65 +-----
 5 files changed, 513 insertions(+), 445 deletions(-)
 create mode 100644 libswscale/ops_dispatch.c
 create mode 100644 libswscale/ops_dispatch.h

diff --git a/libswscale/Makefile b/libswscale/Makefile
index 4331768cfa..568d7de605 100644
--- a/libswscale/Makefile
+++ b/libswscale/Makefile
@@ -30,6 +30,7 @@ OBJS-$(CONFIG_UNSTABLE) +=                              \
        ops.o                                            \
        ops_backend.o                                    \
        ops_chain.o                                      \
+       ops_dispatch.o                                   \
        ops_memcpy.o                                     \
        ops_optimizer.o                                  \
 
diff --git a/libswscale/ops.c b/libswscale/ops.c
index 7a307ca29e..5b01776f4d 100644
--- a/libswscale/ops.c
+++ b/libswscale/ops.c
@@ -847,384 +847,3 @@ void ff_sws_op_list_print(void *log, int lev, int lev_extra,
 
     av_log(log, lev, "    (X = unused, z = byteswapped, + = exact, 0 = zero)\n");
 }
-
-int ff_sws_ops_compile_backend(SwsContext *ctx, const SwsOpBackend *backend,
-                               const SwsOpList *ops, SwsCompiledOp *out)
-{
-    SwsOpList *copy, rest;
-    SwsCompiledOp compiled = {0};
-    int ret = 0;
-
-    copy = ff_sws_op_list_duplicate(ops);
-    if (!copy)
-        return AVERROR(ENOMEM);
-
-    /* Ensure these are always set during compilation */
-    ff_sws_op_list_update_comps(copy);
-
-    /* Make an on-stack copy of `ops` to ensure we can still properly clean up
-     * the copy afterwards */
-    rest = *copy;
-
-    ret = backend->compile(ctx, &rest, &compiled);
-    if (ret < 0) {
-        int msg_lev = ret == AVERROR(ENOTSUP) ? AV_LOG_TRACE : AV_LOG_ERROR;
-        av_log(ctx, msg_lev, "Backend '%s' failed to compile operations: %s\n",
-               backend->name, av_err2str(ret));
-        if (rest.num_ops != ops->num_ops) {
-            av_log(ctx, msg_lev, "Uncompiled remainder:\n");
-            ff_sws_op_list_print(ctx, msg_lev, AV_LOG_TRACE, &rest);
-        }
-    } else {
-        *out = compiled;
-    }
-
-    ff_sws_op_list_free(&copy);
-    return ret;
-}
-
-int ff_sws_ops_compile(SwsContext *ctx, const SwsOpList *ops, SwsCompiledOp *out)
-{
-    for (int n = 0; ff_sws_op_backends[n]; n++) {
-        const SwsOpBackend *backend = ff_sws_op_backends[n];
-        if (ops->src.hw_format != backend->hw_format ||
-            ops->dst.hw_format != backend->hw_format)
-            continue;
-        if (ff_sws_ops_compile_backend(ctx, backend, ops, out) < 0)
-            continue;
-
-        av_log(ctx, AV_LOG_VERBOSE, "Compiled using backend '%s': "
-               "block size = %d, over-read = %d, over-write = %d, cpu flags = 0x%x\n",
-               backend->name, out->block_size, out->over_read, out->over_write,
-               out->cpu_flags);
-        return 0;
-    }
-
-    av_log(ctx, AV_LOG_WARNING, "No backend found for operations:\n");
-    ff_sws_op_list_print(ctx, AV_LOG_WARNING, AV_LOG_TRACE, ops);
-    return AVERROR(ENOTSUP);
-}
-
-typedef struct SwsOpPass {
-    SwsCompiledOp comp;
-    SwsOpExec exec_base;
-    int num_blocks;
-    int tail_off_in;
-    int tail_off_out;
-    int tail_size_in;
-    int tail_size_out;
-    int planes_in;
-    int planes_out;
-    int pixel_bits_in;
-    int pixel_bits_out;
-    int idx_in[4];
-    int idx_out[4];
-    bool memcpy_in;
-    bool memcpy_out;
-} SwsOpPass;
-
-static void op_pass_free(void *ptr)
-{
-    SwsOpPass *p = ptr;
-    if (!p)
-        return;
-
-    if (p->comp.free)
-        p->comp.free(p->comp.priv);
-
-    av_free(p);
-}
-
-static inline void get_row_data(const SwsOpPass *p, const int y,
-                                const uint8_t *in[4], uint8_t *out[4])
-{
-    const SwsOpExec *base = &p->exec_base;
-    for (int i = 0; i < p->planes_in; i++)
-        in[i] = base->in[i] + (y >> base->in_sub_y[i]) * base->in_stride[i];
-    for (int i = 0; i < p->planes_out; i++)
-        out[i] = base->out[i] + (y >> base->out_sub_y[i]) * base->out_stride[i];
-}
-
-static void op_pass_setup(const AVFrame *out, const AVFrame *in,
-                          const SwsPass *pass)
-{
-    const AVPixFmtDescriptor *indesc  = av_pix_fmt_desc_get(in->format);
-    const AVPixFmtDescriptor *outdesc = av_pix_fmt_desc_get(out->format);
-
-    SwsOpPass *p = pass->priv;
-    SwsOpExec *exec = &p->exec_base;
-    const SwsCompiledOp *comp = &p->comp;
-    const int block_size = comp->block_size;
-    p->num_blocks = (pass->width + block_size - 1) / block_size;
-
-    /* Set up main loop parameters */
-    const int aligned_w  = p->num_blocks * block_size;
-    const int safe_width = (p->num_blocks - 1) * block_size;
-    const int tail_size  = pass->width - safe_width;
-    p->tail_off_in   = safe_width * p->pixel_bits_in  >> 3;
-    p->tail_off_out  = safe_width * p->pixel_bits_out >> 3;
-    p->tail_size_in  = tail_size  * p->pixel_bits_in  >> 3;
-    p->tail_size_out = tail_size  * p->pixel_bits_out >> 3;
-    p->memcpy_in     = false;
-    p->memcpy_out    = false;
-
-    for (int i = 0; i < p->planes_in; i++) {
-        const int idx        = p->idx_in[i];
-        const int chroma     = idx == 1 || idx == 2;
-        const int sub_x      = chroma ? indesc->log2_chroma_w : 0;
-        const int sub_y      = chroma ? indesc->log2_chroma_h : 0;
-        const int plane_w    = (aligned_w + sub_x) >> sub_x;
-        const int plane_pad  = (comp->over_read + sub_x) >> sub_x;
-        const int plane_size = plane_w * p->pixel_bits_in >> 3;
-        if (comp->slice_align)
-            p->memcpy_in |= plane_size + plane_pad > in->linesize[idx];
-        exec->in[i]        = in->data[idx];
-        exec->in_stride[i] = in->linesize[idx];
-        exec->in_sub_y[i]  = sub_y;
-        exec->in_sub_x[i]  = sub_x;
-    }
-
-    for (int i = 0; i < p->planes_out; i++) {
-        const int idx        = p->idx_out[i];
-        const int chroma     = idx == 1 || idx == 2;
-        const int sub_x      = chroma ? outdesc->log2_chroma_w : 0;
-        const int sub_y      = chroma ? outdesc->log2_chroma_h : 0;
-        const int plane_w    = (aligned_w + sub_x) >> sub_x;
-        const int plane_pad  = (comp->over_write + sub_x) >> sub_x;
-        const int plane_size = plane_w * p->pixel_bits_out >> 3;
-        if (comp->slice_align)
-            p->memcpy_out |= plane_size + plane_pad > out->linesize[idx];
-        exec->out[i]        = out->data[idx];
-        exec->out_stride[i] = out->linesize[idx];
-        exec->out_sub_y[i]  = sub_y;
-        exec->out_sub_x[i]  = sub_x;
-    }
-
-    /* Pre-fill pointer bump for the main section only; this value does not
-     * matter at all for the tail / last row handlers because they only ever
-     * process a single line */
-    const int blocks_main = p->num_blocks - p->memcpy_out;
-    for (int i = 0; i < 4; i++) {
-        exec->in_bump[i]  = exec->in_stride[i]  - blocks_main * exec->block_size_in;
-        exec->out_bump[i] = exec->out_stride[i] - blocks_main * exec->block_size_out;
-    }
-
-    exec->src_frame_ptr = in;
-    exec->dst_frame_ptr = out;
-}
-
-/* Dispatch kernel over the last column of the image using memcpy */
-static av_always_inline void
-handle_tail(const SwsOpPass *p, SwsOpExec *exec,
-            const bool copy_out, const bool copy_in,
-            int y, const int h)
-{
-    DECLARE_ALIGNED_64(uint8_t, tmp)[2][4][sizeof(uint32_t[128])];
-
-    const SwsOpExec *base = &p->exec_base;
-    const SwsCompiledOp *comp = &p->comp;
-    const int tail_size_in  = p->tail_size_in;
-    const int tail_size_out = p->tail_size_out;
-    const int bx = p->num_blocks - 1;
-
-    const uint8_t *in_data[4];
-    uint8_t *out_data[4];
-    get_row_data(p, y, in_data, out_data);
-
-    for (int i = 0; i < p->planes_in; i++) {
-        in_data[i] += p->tail_off_in;
-        if (copy_in) {
-            exec->in[i] = (void *) tmp[0][i];
-            exec->in_stride[i] = sizeof(tmp[0][i]);
-        } else {
-            exec->in[i] = in_data[i];
-        }
-    }
-
-    for (int i = 0; i < p->planes_out; i++) {
-        out_data[i] += p->tail_off_out;
-        if (copy_out) {
-            exec->out[i] = (void *) tmp[1][i];
-            exec->out_stride[i] = sizeof(tmp[1][i]);
-        } else {
-            exec->out[i] = out_data[i];
-        }
-    }
-
-    for (int y_end = y + h; y < y_end; y++) {
-        if (copy_in) {
-            for (int i = 0; i < p->planes_in; i++) {
-                av_assert2(tmp[0][i] + tail_size_in < (uint8_t *) tmp[1]);
-                memcpy(tmp[0][i], in_data[i], tail_size_in);
-                in_data[i] += base->in_stride[i]; /* exec->in_stride was clobbered */
-            }
-        }
-
-        comp->func(exec, comp->priv, bx, y, p->num_blocks, y + 1);
-
-        if (copy_out) {
-            for (int i = 0; i < p->planes_out; i++) {
-                av_assert2(tmp[1][i] + tail_size_out < (uint8_t *) tmp[2]);
-                memcpy(out_data[i], tmp[1][i], tail_size_out);
-                out_data[i] += base->out_stride[i];
-            }
-        }
-
-        for (int i = 0; i < 4; i++) {
-            if (!copy_in && exec->in[i])
-                exec->in[i] += exec->in_stride[i];
-            if (!copy_out && exec->out[i])
-                exec->out[i] += exec->out_stride[i];
-        }
-    }
-}
-
-static void op_pass_run(const AVFrame *out, const AVFrame *in, const int y,
-                        const int h, const SwsPass *pass)
-{
-    const SwsOpPass *p = pass->priv;
-    const SwsCompiledOp *comp = &p->comp;
-
-    /* Fill exec metadata for this slice */
-    DECLARE_ALIGNED_32(SwsOpExec, exec) = p->exec_base;
-    exec.slice_y = y;
-    exec.slice_h = h;
-
-    /**
-     *  To ensure safety, we need to consider the following:
-     *
-     * 1. We can overread the input, unless this is the last line of an
-     *    unpadded buffer. All defined operations can handle arbitrary pixel
-     *    input, so overread of arbitrary data is fine.
-     *
-     * 2. We can overwrite the output, as long as we don't write more than the
-     *    amount of pixels that fit into one linesize. So we always need to
-     *    memcpy the last column on the output side if unpadded.
-     *
-     * 3. For the last row, we also need to memcpy the remainder of the input,
-     *    to avoid reading past the end of the buffer. Note that since we know
-     *    the run() function is called on stripes of the same buffer, we don't
-     *    need to worry about this for the end of a slice.
-     */
-
-    const int last_slice  = y + h == pass->height;
-    const bool memcpy_in  = last_slice && p->memcpy_in;
-    const bool memcpy_out = p->memcpy_out;
-    const int num_blocks  = p->num_blocks;
-    const int blocks_main = num_blocks - memcpy_out;
-    const int h_main      = h - memcpy_in;
-
-    /* Handle main section */
-    get_row_data(p, y, exec.in, exec.out);
-    comp->func(&exec, comp->priv, 0, y, blocks_main, y + h_main);
-
-    if (memcpy_in) {
-        /* Safe part of last row */
-        get_row_data(p, y + h_main, exec.in, exec.out);
-        comp->func(&exec, comp->priv, 0, y + h_main, num_blocks - 1, y + h);
-    }
-
-    /* Handle last column via memcpy, takes over `exec` so call these last */
-    if (memcpy_out)
-        handle_tail(p, &exec, true, false, y, h_main);
-    if (memcpy_in)
-        handle_tail(p, &exec, memcpy_out, true, y + h_main, 1);
-}
-
-static int rw_planes(const SwsOp *op)
-{
-    return op->rw.packed ? 1 : op->rw.elems;
-}
-
-static int rw_pixel_bits(const SwsOp *op)
-{
-    const int elems = op->rw.packed ? op->rw.elems : 1;
-    const int size  = ff_sws_pixel_type_size(op->type);
-    const int bits  = 8 >> op->rw.frac;
-    av_assert1(bits >= 1);
-    return elems * size * bits;
-}
-
-static int compile(SwsGraph *graph, const SwsOpList *ops,
-                   const SwsFormat *dst, SwsPass *input, SwsPass **output)
-{
-    SwsContext *ctx = graph->ctx;
-    SwsOpPass *p = av_mallocz(sizeof(*p));
-    if (!p)
-        return AVERROR(ENOMEM);
-
-    int ret = ff_sws_ops_compile(ctx, ops, &p->comp);
-    if (ret < 0)
-        goto fail;
-
-    const SwsOp *read = &ops->ops[0];
-    const SwsOp *write = &ops->ops[ops->num_ops - 1];
-    p->planes_in  = rw_planes(read);
-    p->planes_out = rw_planes(write);
-    p->pixel_bits_in  = rw_pixel_bits(read);
-    p->pixel_bits_out = rw_pixel_bits(write);
-    p->exec_base = (SwsOpExec) {
-        .width  = dst->width,
-        .height = dst->height,
-        .block_size_in  = p->comp.block_size * p->pixel_bits_in  >> 3,
-        .block_size_out = p->comp.block_size * p->pixel_bits_out >> 3,
-    };
-
-    for (int i = 0; i < 4; i++) {
-        p->idx_in[i]  = i < p->planes_in  ? ops->order_src.in[i] : -1;
-        p->idx_out[i] = i < p->planes_out ? ops->order_dst.in[i] : -1;
-    }
-
-    SwsPass *pass;
-    pass = ff_sws_graph_add_pass(graph, dst->format, dst->width, dst->height,
-                                 input, p->comp.slice_align, p, op_pass_run);
-    if (!pass) {
-        ret = AVERROR(ENOMEM);
-        goto fail;
-    }
-    pass->setup = op_pass_setup;
-    pass->free  = op_pass_free;
-
-    *output = pass;
-    return 0;
-
-fail:
-    op_pass_free(p);
-    return ret;
-}
-
-int ff_sws_compile_pass(SwsGraph *graph, SwsOpList *ops, int flags,
-                        const SwsFormat *dst, SwsPass *input, SwsPass **output)
-{
-    SwsContext *ctx = graph->ctx;
-    const SwsOp *read = &ops->ops[0];
-    const SwsOp *write = &ops->ops[ops->num_ops - 1];
-    int ret;
-
-    /* Check if the whole operation graph is an end-to-end no-op */
-    if (ff_sws_op_list_is_noop(ops)) {
-        *output = input;
-        return 0;
-    }
-
-    if (ops->num_ops < 2) {
-        av_log(ctx, AV_LOG_ERROR, "Need at least two operations.\n");
-        return AVERROR(EINVAL);
-    }
-
-    if (read->op != SWS_OP_READ || write->op != SWS_OP_WRITE) {
-        av_log(ctx, AV_LOG_ERROR, "First and last operations must be a read "
-               "and write, respectively.\n");
-        return AVERROR(EINVAL);
-    }
-
-    if (flags & SWS_OP_FLAG_OPTIMIZE) {
-        ret = ff_sws_op_list_optimize(ops);
-        if (ret < 0)
-            return ret;
-    } else {
-        ff_sws_op_list_update_comps(ops);
-    }
-
-    return compile(graph, ops, dst, input, output);
-}
diff --git a/libswscale/ops_dispatch.c b/libswscale/ops_dispatch.c
new file mode 100644
index 0000000000..8f25de2881
--- /dev/null
+++ b/libswscale/ops_dispatch.c
@@ -0,0 +1,408 @@
+/**
+ * Copyright (C) 2025 Niklas Haas
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/avassert.h"
+#include "libavutil/mem.h"
+#include "libavutil/mem_internal.h"
+
+#include "ops.h"
+#include "ops_internal.h"
+#include "ops_dispatch.h"
+
+typedef struct SwsOpPass {
+    SwsCompiledOp comp;
+    SwsOpExec exec_base;
+    int num_blocks;
+    int tail_off_in;
+    int tail_off_out;
+    int tail_size_in;
+    int tail_size_out;
+    int planes_in;
+    int planes_out;
+    int pixel_bits_in;
+    int pixel_bits_out;
+    int idx_in[4];
+    int idx_out[4];
+    bool memcpy_in;
+    bool memcpy_out;
+} SwsOpPass;
+
+int ff_sws_ops_compile_backend(SwsContext *ctx, const SwsOpBackend *backend,
+                               const SwsOpList *ops, SwsCompiledOp *out)
+{
+    SwsOpList *copy, rest;
+    SwsCompiledOp compiled = {0};
+    int ret = 0;
+
+    copy = ff_sws_op_list_duplicate(ops);
+    if (!copy)
+        return AVERROR(ENOMEM);
+
+    /* Ensure these are always set during compilation */
+    ff_sws_op_list_update_comps(copy);
+
+    /* Make an on-stack copy of `ops` to ensure we can still properly clean up
+     * the copy afterwards */
+    rest = *copy;
+
+    ret = backend->compile(ctx, &rest, &compiled);
+    if (ret < 0) {
+        int msg_lev = ret == AVERROR(ENOTSUP) ? AV_LOG_TRACE : AV_LOG_ERROR;
+        av_log(ctx, msg_lev, "Backend '%s' failed to compile operations: %s\n",
+               backend->name, av_err2str(ret));
+        if (rest.num_ops != ops->num_ops) {
+            av_log(ctx, msg_lev, "Uncompiled remainder:\n");
+            ff_sws_op_list_print(ctx, msg_lev, AV_LOG_TRACE, &rest);
+        }
+    } else {
+        *out = compiled;
+    }
+
+    ff_sws_op_list_free(&copy);
+    return ret;
+}
+
+int ff_sws_ops_compile(SwsContext *ctx, const SwsOpList *ops, SwsCompiledOp *out)
+{
+    for (int n = 0; ff_sws_op_backends[n]; n++) {
+        const SwsOpBackend *backend = ff_sws_op_backends[n];
+        if (ops->src.hw_format != backend->hw_format ||
+            ops->dst.hw_format != backend->hw_format)
+            continue;
+        if (ff_sws_ops_compile_backend(ctx, backend, ops, out) < 0)
+            continue;
+
+        av_log(ctx, AV_LOG_VERBOSE, "Compiled using backend '%s': "
+               "block size = %d, over-read = %d, over-write = %d, cpu flags = 0x%x\n",
+               backend->name, out->block_size, out->over_read, out->over_write,
+               out->cpu_flags);
+        return 0;
+    }
+
+    av_log(ctx, AV_LOG_WARNING, "No backend found for operations:\n");
+    ff_sws_op_list_print(ctx, AV_LOG_WARNING, AV_LOG_TRACE, ops);
+    return AVERROR(ENOTSUP);
+}
+
+static void op_pass_free(void *ptr)
+{
+    SwsOpPass *p = ptr;
+    if (!p)
+        return;
+
+    if (p->comp.free)
+        p->comp.free(p->comp.priv);
+
+    av_free(p);
+}
+
+static inline void get_row_data(const SwsOpPass *p, const int y,
+                                const uint8_t *in[4], uint8_t *out[4])
+{
+    const SwsOpExec *base = &p->exec_base;
+    for (int i = 0; i < p->planes_in; i++)
+        in[i] = base->in[i] + (y >> base->in_sub_y[i]) * base->in_stride[i];
+    for (int i = 0; i < p->planes_out; i++)
+        out[i] = base->out[i] + (y >> base->out_sub_y[i]) * base->out_stride[i];
+}
+
+static void op_pass_setup(const AVFrame *out, const AVFrame *in,
+                          const SwsPass *pass)
+{
+    const AVPixFmtDescriptor *indesc  = av_pix_fmt_desc_get(in->format);
+    const AVPixFmtDescriptor *outdesc = av_pix_fmt_desc_get(out->format);
+
+    SwsOpPass *p = pass->priv;
+    SwsOpExec *exec = &p->exec_base;
+    const SwsCompiledOp *comp = &p->comp;
+    const int block_size = comp->block_size;
+    p->num_blocks = (pass->width + block_size - 1) / block_size;
+
+    /* Set up main loop parameters */
+    const int aligned_w  = p->num_blocks * block_size;
+    const int safe_width = (p->num_blocks - 1) * block_size;
+    const int tail_size  = pass->width - safe_width;
+    p->tail_off_in   = safe_width * p->pixel_bits_in  >> 3;
+    p->tail_off_out  = safe_width * p->pixel_bits_out >> 3;
+    p->tail_size_in  = tail_size  * p->pixel_bits_in  >> 3;
+    p->tail_size_out = tail_size  * p->pixel_bits_out >> 3;
+    p->memcpy_in     = false;
+    p->memcpy_out    = false;
+
+    for (int i = 0; i < p->planes_in; i++) {
+        const int idx        = p->idx_in[i];
+        const int chroma     = idx == 1 || idx == 2;
+        const int sub_x      = chroma ? indesc->log2_chroma_w : 0;
+        const int sub_y      = chroma ? indesc->log2_chroma_h : 0;
+        const int plane_w    = (aligned_w + sub_x) >> sub_x;
+        const int plane_pad  = (comp->over_read + sub_x) >> sub_x;
+        const int plane_size = plane_w * p->pixel_bits_in >> 3;
+        if (comp->slice_align)
+            p->memcpy_in |= plane_size + plane_pad > in->linesize[idx];
+        exec->in[i]        = in->data[idx];
+        exec->in_stride[i] = in->linesize[idx];
+        exec->in_sub_y[i]  = sub_y;
+        exec->in_sub_x[i]  = sub_x;
+    }
+
+    for (int i = 0; i < p->planes_out; i++) {
+        const int idx        = p->idx_out[i];
+        const int chroma     = idx == 1 || idx == 2;
+        const int sub_x      = chroma ? outdesc->log2_chroma_w : 0;
+        const int sub_y      = chroma ? outdesc->log2_chroma_h : 0;
+        const int plane_w    = (aligned_w + sub_x) >> sub_x;
+        const int plane_pad  = (comp->over_write + sub_x) >> sub_x;
+        const int plane_size = plane_w * p->pixel_bits_out >> 3;
+        if (comp->slice_align)
+            p->memcpy_out |= plane_size + plane_pad > out->linesize[idx];
+        exec->out[i]        = out->data[idx];
+        exec->out_stride[i] = out->linesize[idx];
+        exec->out_sub_y[i]  = sub_y;
+        exec->out_sub_x[i]  = sub_x;
+    }
+
+    /* Pre-fill pointer bump for the main section only; this value does not
+     * matter at all for the tail / last row handlers because they only ever
+     * process a single line */
+    const int blocks_main = p->num_blocks - p->memcpy_out;
+    for (int i = 0; i < 4; i++) {
+        exec->in_bump[i]  = exec->in_stride[i]  - blocks_main * exec->block_size_in;
+        exec->out_bump[i] = exec->out_stride[i] - blocks_main * exec->block_size_out;
+    }
+
+    exec->src_frame_ptr = in;
+    exec->dst_frame_ptr = out;
+}
+
+/* Dispatch kernel over the last column of the image using memcpy */
+static av_always_inline void
+handle_tail(const SwsOpPass *p, SwsOpExec *exec,
+            const bool copy_out, const bool copy_in,
+            int y, const int h)
+{
+    DECLARE_ALIGNED_64(uint8_t, tmp)[2][4][sizeof(uint32_t[128])];
+
+    const SwsOpExec *base = &p->exec_base;
+    const SwsCompiledOp *comp = &p->comp;
+    const int tail_size_in  = p->tail_size_in;
+    const int tail_size_out = p->tail_size_out;
+    const int bx = p->num_blocks - 1;
+
+    const uint8_t *in_data[4];
+    uint8_t *out_data[4];
+    get_row_data(p, y, in_data, out_data);
+
+    for (int i = 0; i < p->planes_in; i++) {
+        in_data[i] += p->tail_off_in;
+        if (copy_in) {
+            exec->in[i] = (void *) tmp[0][i];
+            exec->in_stride[i] = sizeof(tmp[0][i]);
+        } else {
+            exec->in[i] = in_data[i];
+        }
+    }
+
+    for (int i = 0; i < p->planes_out; i++) {
+        out_data[i] += p->tail_off_out;
+        if (copy_out) {
+            exec->out[i] = (void *) tmp[1][i];
+            exec->out_stride[i] = sizeof(tmp[1][i]);
+        } else {
+            exec->out[i] = out_data[i];
+        }
+    }
+
+    for (int y_end = y + h; y < y_end; y++) {
+        if (copy_in) {
+            for (int i = 0; i < p->planes_in; i++) {
+                av_assert2(tmp[0][i] + tail_size_in < (uint8_t *) tmp[1]);
+                memcpy(tmp[0][i], in_data[i], tail_size_in);
+                in_data[i] += base->in_stride[i]; /* exec->in_stride was clobbered */
+            }
+        }
+
+        comp->func(exec, comp->priv, bx, y, p->num_blocks, y + 1);
+
+        if (copy_out) {
+            for (int i = 0; i < p->planes_out; i++) {
+                av_assert2(tmp[1][i] + tail_size_out < (uint8_t *) tmp[2]);
+                memcpy(out_data[i], tmp[1][i], tail_size_out);
+                out_data[i] += base->out_stride[i];
+            }
+        }
+
+        for (int i = 0; i < 4; i++) {
+            if (!copy_in && exec->in[i])
+                exec->in[i] += exec->in_stride[i];
+            if (!copy_out && exec->out[i])
+                exec->out[i] += exec->out_stride[i];
+        }
+    }
+}
+
+static void op_pass_run(const AVFrame *out, const AVFrame *in, const int y,
+                        const int h, const SwsPass *pass)
+{
+    const SwsOpPass *p = pass->priv;
+    const SwsCompiledOp *comp = &p->comp;
+
+    /* Fill exec metadata for this slice */
+    DECLARE_ALIGNED_32(SwsOpExec, exec) = p->exec_base;
+    exec.slice_y = y;
+    exec.slice_h = h;
+
+    /**
+     *  To ensure safety, we need to consider the following:
+     *
+     * 1. We can overread the input, unless this is the last line of an
+     *    unpadded buffer. All defined operations can handle arbitrary pixel
+     *    input, so overread of arbitrary data is fine.
+     *
+     * 2. We can overwrite the output, as long as we don't write more than the
+     *    amount of pixels that fit into one linesize. So we always need to
+     *    memcpy the last column on the output side if unpadded.
+     *
+     * 3. For the last row, we also need to memcpy the remainder of the input,
+     *    to avoid reading past the end of the buffer. Note that since we know
+     *    the run() function is called on stripes of the same buffer, we don't
+     *    need to worry about this for the end of a slice.
+     */
+
+    const int last_slice  = y + h == pass->height;
+    const bool memcpy_in  = last_slice && p->memcpy_in;
+    const bool memcpy_out = p->memcpy_out;
+    const int num_blocks  = p->num_blocks;
+    const int blocks_main = num_blocks - memcpy_out;
+    const int h_main      = h - memcpy_in;
+
+    /* Handle main section */
+    get_row_data(p, y, exec.in, exec.out);
+    comp->func(&exec, comp->priv, 0, y, blocks_main, y + h_main);
+
+    if (memcpy_in) {
+        /* Safe part of last row */
+        get_row_data(p, y + h_main, exec.in, exec.out);
+        comp->func(&exec, comp->priv, 0, y + h_main, num_blocks - 1, y + h);
+    }
+
+    /* Handle last column via memcpy, takes over `exec` so call these last */
+    if (memcpy_out)
+        handle_tail(p, &exec, true, false, y, h_main);
+    if (memcpy_in)
+        handle_tail(p, &exec, memcpy_out, true, y + h_main, 1);
+}
+
+static int rw_planes(const SwsOp *op)
+{
+    return op->rw.packed ? 1 : op->rw.elems;
+}
+
+static int rw_pixel_bits(const SwsOp *op)
+{
+    const int elems = op->rw.packed ? op->rw.elems : 1;
+    const int size  = ff_sws_pixel_type_size(op->type);
+    const int bits  = 8 >> op->rw.frac;
+    av_assert1(bits >= 1);
+    return elems * size * bits;
+}
+
+static int compile(SwsGraph *graph, const SwsOpList *ops,
+                   const SwsFormat *dst, SwsPass *input, SwsPass **output)
+{
+    SwsContext *ctx = graph->ctx;
+    SwsOpPass *p = av_mallocz(sizeof(*p));
+    if (!p)
+        return AVERROR(ENOMEM);
+
+    int ret = ff_sws_ops_compile(ctx, ops, &p->comp);
+    if (ret < 0)
+        goto fail;
+
+    const SwsOp *read = &ops->ops[0];
+    const SwsOp *write = &ops->ops[ops->num_ops - 1];
+    p->planes_in  = rw_planes(read);
+    p->planes_out = rw_planes(write);
+    p->pixel_bits_in  = rw_pixel_bits(read);
+    p->pixel_bits_out = rw_pixel_bits(write);
+    p->exec_base = (SwsOpExec) {
+        .width  = dst->width,
+        .height = dst->height,
+        .block_size_in  = p->comp.block_size * p->pixel_bits_in  >> 3,
+        .block_size_out = p->comp.block_size * p->pixel_bits_out >> 3,
+    };
+
+    for (int i = 0; i < 4; i++) {
+        p->idx_in[i]  = i < p->planes_in  ? ops->order_src.in[i] : -1;
+        p->idx_out[i] = i < p->planes_out ? ops->order_dst.in[i] : -1;
+    }
+
+    SwsPass *pass;
+    pass = ff_sws_graph_add_pass(graph, dst->format, dst->width, dst->height,
+                                 input, p->comp.slice_align, p, op_pass_run);
+    if (!pass) {
+        ret = AVERROR(ENOMEM);
+        goto fail;
+    }
+    pass->setup = op_pass_setup;
+    pass->free  = op_pass_free;
+
+    *output = pass;
+    return 0;
+
+fail:
+    op_pass_free(p);
+    return ret;
+}
+
+int ff_sws_compile_pass(SwsGraph *graph, SwsOpList *ops, int flags,
+                        const SwsFormat *dst, SwsPass *input, SwsPass **output)
+{
+    SwsContext *ctx = graph->ctx;
+    const SwsOp *read = &ops->ops[0];
+    const SwsOp *write = &ops->ops[ops->num_ops - 1];
+    int ret;
+
+    /* Check if the whole operation graph is an end-to-end no-op */
+    if (ff_sws_op_list_is_noop(ops)) {
+        *output = input;
+        return 0;
+    }
+
+    if (ops->num_ops < 2) {
+        av_log(ctx, AV_LOG_ERROR, "Need at least two operations.\n");
+        return AVERROR(EINVAL);
+    }
+
+    if (read->op != SWS_OP_READ || write->op != SWS_OP_WRITE) {
+        av_log(ctx, AV_LOG_ERROR, "First and last operations must be a read "
+               "and write, respectively.\n");
+        return AVERROR(EINVAL);
+    }
+
+    if (flags & SWS_OP_FLAG_OPTIMIZE) {
+        ret = ff_sws_op_list_optimize(ops);
+        if (ret < 0)
+            return ret;
+    } else {
+        ff_sws_op_list_update_comps(ops);
+    }
+
+    return compile(graph, ops, dst, input, output);
+}
diff --git a/libswscale/ops_dispatch.h b/libswscale/ops_dispatch.h
new file mode 100644
index 0000000000..51b7f26d13
--- /dev/null
+++ b/libswscale/ops_dispatch.h
@@ -0,0 +1,103 @@
+/**
+ * Copyright (C) 2026 Niklas Haas
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef SWSCALE_OPS_DISPATCH_H
+#define SWSCALE_OPS_DISPATCH_H
+
+#include <assert.h>
+
+#include "libavutil/frame.h"
+#include "graph.h"
+
+/**
+ * Global execution context for all compiled functions.
+ *
+ * Note: This struct is hard-coded in assembly, so do not change the layout
+ * without updating the corresponding assembly definitions.
+ */
+typedef struct SwsOpExec {
+    /* The data pointers point to the first pixel to process */
+    const uint8_t *in[4];
+    uint8_t *out[4];
+
+    /* Separation between lines in bytes */
+    ptrdiff_t in_stride[4];
+    ptrdiff_t out_stride[4];
+
+    /* Pointer bump, difference between stride and processed line size */
+    ptrdiff_t in_bump[4];
+    ptrdiff_t out_bump[4];
+
+    /* Extra metadata, may or may not be useful */
+    int32_t width, height;      /* Overall image dimensions */
+    int32_t slice_y, slice_h;   /* Start and height of current slice */
+    int32_t block_size_in;      /* Size of a block of pixels in bytes */
+    int32_t block_size_out;
+
+    /* Subsampling factors for each plane */
+    uint8_t in_sub_y[4], out_sub_y[4];
+    uint8_t in_sub_x[4], out_sub_x[4];
+
+    const AVFrame *src_frame_ptr;
+    const AVFrame *dst_frame_ptr;
+} SwsOpExec;
+
+static_assert(sizeof(SwsOpExec) == 24 * sizeof(void *) +
+                                   6  * sizeof(int32_t) +
+                                   16 * sizeof(uint8_t) +
+                                   2  * sizeof(void *),
+              "SwsOpExec layout mismatch");
+
+/**
+ * Process a given range of pixel blocks.
+ *
+ * Note: `bx_start` and `bx_end` are in units of `SwsCompiledOp.block_size`.
+ */
+typedef void (*SwsOpFunc)(const SwsOpExec *exec, const void *priv,
+                          int bx_start, int y_start, int bx_end, int y_end);
+
+#define SWS_DECL_FUNC(NAME) \
+    void NAME(const SwsOpExec *, const void *, int, int, int, int)
+
+typedef struct SwsCompiledOp {
+    SwsOpFunc func;
+
+    int slice_align; /* slice height alignment */
+    int block_size;  /* number of pixels processed per iteration */
+    int over_read;   /* implementation over-reads input by this many bytes */
+    int over_write;  /* implementation over-writes output by this many bytes */
+    int cpu_flags;   /* active set of CPU flags (informative) */
+
+    /* Arbitrary private data */
+    void *priv;
+    void (*free)(void *priv);
+} SwsCompiledOp;
+
+/**
+ * Helper to add an SwsPass for CPU-based block/line processing of passes,
+ * using the given execution function and parameters.
+ *
+ * Returns 0 or a negative error code.
+ */
+int ff_sws_add_cpu_dispatch_pass(SwsGraph *graph, const SwsOpList *ops,
+                                 const SwsFormat *dst, SwsPass *input,
+                                 SwsPass **output);
+
+#endif /* SWSCALE_OPS_DISPATCH_H */
diff --git a/libswscale/ops_internal.h b/libswscale/ops_internal.h
index e07ef5285f..3db850c290 100644
--- a/libswscale/ops_internal.h
+++ b/libswscale/ops_internal.h
@@ -24,6 +24,7 @@
 #include "libavutil/mem_internal.h"
 
 #include "ops.h"
+#include "ops_dispatch.h"
 
 #define Q(N) ((AVRational) { N, 1 })
 
@@ -51,70 +52,6 @@ static inline void ff_sws_pack_op_decode(const SwsOp *op, uint64_t mask[4], int
     }
 }
 
-/**
- * Global execution context for all compiled functions.
- *
- * Note: This struct is hard-coded in assembly, so do not change the layout
- * without updating the corresponding assembly definitions.
- */
-typedef struct SwsOpExec {
-    /* The data pointers point to the first pixel to process */
-    const uint8_t *in[4];
-    uint8_t *out[4];
-
-    /* Separation between lines in bytes */
-    ptrdiff_t in_stride[4];
-    ptrdiff_t out_stride[4];
-
-    /* Pointer bump, difference between stride and processed line size */
-    ptrdiff_t in_bump[4];
-    ptrdiff_t out_bump[4];
-
-    /* Extra metadata, may or may not be useful */
-    int32_t width, height;      /* Overall image dimensions */
-    int32_t slice_y, slice_h;   /* Start and height of current slice */
-    int32_t block_size_in;      /* Size of a block of pixels in bytes */
-    int32_t block_size_out;
-
-    /* Subsampling factors for each plane */
-    uint8_t in_sub_y[4], out_sub_y[4];
-    uint8_t in_sub_x[4], out_sub_x[4];
-
-    const AVFrame *src_frame_ptr;
-    const AVFrame *dst_frame_ptr;
-} SwsOpExec;
-
-static_assert(sizeof(SwsOpExec) == 24 * sizeof(void *) +
-                                   6  * sizeof(int32_t) +
-                                   16 * sizeof(uint8_t) +
-                                   2  * sizeof(void *),
-              "SwsOpExec layout mismatch");
-
-/**
- * Process a given range of pixel blocks.
- *
- * Note: `bx_start` and `bx_end` are in units of `SwsCompiledOp.block_size`.
- */
-typedef void (*SwsOpFunc)(const SwsOpExec *exec, const void *priv,
-                          int bx_start, int y_start, int bx_end, int y_end);
-
-#define SWS_DECL_FUNC(NAME) \
-    void NAME(const SwsOpExec *, const void *, int, int, int, int)
-
-typedef struct SwsCompiledOp {
-    SwsOpFunc func;
-
-    int slice_align; /* slice height alignment */
-    int block_size;  /* number of pixels processed per iteration */
-    int over_read;   /* implementation over-reads input by this many bytes */
-    int over_write;  /* implementation over-writes output by this many bytes */
-    int cpu_flags;   /* active set of CPU flags (informative) */
-
-    /* Arbitrary private data */
-    void *priv;
-    void (*free)(void *priv);
-} SwsCompiledOp;
-
 typedef struct SwsOpBackend {
     const char *name; /* Descriptive name for this backend */
 
-- 
2.52.0


>From 7960eabf633d1303abf5202a79235f57b40c0d14 Mon Sep 17 00:00:00 2001
From: Niklas Haas <git@haasn.dev>
Date: Fri, 27 Feb 2026 15:44:49 +0100
Subject: [PATCH 3/6] swscale/ops_dispatch: make ff_sws_ops_compile() private

I want to refactor this mechanism, and nothing else depends on it.

Sponsored-by: Sovereign Tech Fund
Signed-off-by: Niklas Haas <git@haasn.dev>
---
 libswscale/ops_dispatch.c | 4 ++--
 libswscale/ops_internal.h | 7 -------
 2 files changed, 2 insertions(+), 9 deletions(-)

diff --git a/libswscale/ops_dispatch.c b/libswscale/ops_dispatch.c
index 8f25de2881..5ec1bb82c7 100644
--- a/libswscale/ops_dispatch.c
+++ b/libswscale/ops_dispatch.c
@@ -79,7 +79,7 @@ int ff_sws_ops_compile_backend(SwsContext *ctx, const SwsOpBackend *backend,
     return ret;
 }
 
-int ff_sws_ops_compile(SwsContext *ctx, const SwsOpList *ops, SwsCompiledOp *out)
+static int compile_ops(SwsContext *ctx, const SwsOpList *ops, SwsCompiledOp *out)
 {
     for (int n = 0; ff_sws_op_backends[n]; n++) {
         const SwsOpBackend *backend = ff_sws_op_backends[n];
@@ -331,7 +331,7 @@ static int compile(SwsGraph *graph, const SwsOpList *ops,
     if (!p)
         return AVERROR(ENOMEM);
 
-    int ret = ff_sws_ops_compile(ctx, ops, &p->comp);
+    int ret = compile_ops(ctx, ops, &p->comp);
     if (ret < 0)
         goto fail;
 
diff --git a/libswscale/ops_internal.h b/libswscale/ops_internal.h
index 3db850c290..cfc386a081 100644
--- a/libswscale/ops_internal.h
+++ b/libswscale/ops_internal.h
@@ -82,13 +82,6 @@ extern const SwsOpBackend *const ff_sws_op_backends[];
 int ff_sws_ops_compile_backend(SwsContext *ctx, const SwsOpBackend *backend,
                                const SwsOpList *ops, SwsCompiledOp *out);
 
-/**
- * Compile a list of operations using the best available backend.
- *
- * Returns 0 on success, or a negative error code on failure.
- */
-int ff_sws_ops_compile(SwsContext *ctx, const SwsOpList *ops, SwsCompiledOp *out);
-
 /**
  * "Solve" an op list into a fixed shuffle mask, with an optional ability to
  * also directly clear the output value (for e.g. rgb24 -> rgb0). This can
-- 
2.52.0


>From db7dc3aee085b533a8459dd75847d8e40539d67f Mon Sep 17 00:00:00 2001
From: Niklas Haas <git@haasn.dev>
Date: Fri, 27 Feb 2026 15:48:27 +0100
Subject: [PATCH 4/6] swscale/ops_dispatch: split copy out of
 ff_sws_ops_compile_backend()

The main motivation here is that I want to be able to iterate over
compile_backend() calls within the same backend, which requires re-calling
this helper several times on the same op list _without_ resetting it.

However, we still need to preserve the API, e.g. for checkasm/sw_ops.

Signed-off-by: Niklas Haas <git@haasn.dev>
---
 libswscale/ops_dispatch.c | 48 +++++++++++++++++++++++++--------------
 1 file changed, 31 insertions(+), 17 deletions(-)

diff --git a/libswscale/ops_dispatch.c b/libswscale/ops_dispatch.c
index 5ec1bb82c7..7aaa81212c 100644
--- a/libswscale/ops_dispatch.c
+++ b/libswscale/ops_dispatch.c
@@ -44,37 +44,40 @@ typedef struct SwsOpPass {
     bool memcpy_out;
 } SwsOpPass;
 
-int ff_sws_ops_compile_backend(SwsContext *ctx, const SwsOpBackend *backend,
-                               const SwsOpList *ops, SwsCompiledOp *out)
+static int compile_backend(SwsContext *ctx, const SwsOpBackend *backend,
+                           SwsOpList *ops, SwsCompiledOp *out)
 {
-    SwsOpList *copy, rest;
     SwsCompiledOp compiled = {0};
-    int ret = 0;
-
-    copy = ff_sws_op_list_duplicate(ops);
-    if (!copy)
-        return AVERROR(ENOMEM);
 
     /* Ensure these are always set during compilation */
-    ff_sws_op_list_update_comps(copy);
+    ff_sws_op_list_update_comps(ops);
 
-    /* Make an on-stack copy of `ops` to ensure we can still properly clean up
-     * the copy afterwards */
-    rest = *copy;
-
-    ret = backend->compile(ctx, &rest, &compiled);
+    const int orig_num_ops = ops->num_ops;
+    int ret = backend->compile(ctx, ops, &compiled);
     if (ret < 0) {
         int msg_lev = ret == AVERROR(ENOTSUP) ? AV_LOG_TRACE : AV_LOG_ERROR;
         av_log(ctx, msg_lev, "Backend '%s' failed to compile operations: %s\n",
                backend->name, av_err2str(ret));
-        if (rest.num_ops != ops->num_ops) {
+        if (ops->num_ops != orig_num_ops) {
             av_log(ctx, msg_lev, "Uncompiled remainder:\n");
-            ff_sws_op_list_print(ctx, msg_lev, AV_LOG_TRACE, &rest);
+            ff_sws_op_list_print(ctx, msg_lev, AV_LOG_TRACE, ops);
         }
     } else {
         *out = compiled;
     }
 
+    return ret;
+}
+
+int ff_sws_ops_compile_backend(SwsContext *ctx, const SwsOpBackend *backend,
+                               const SwsOpList *ops, SwsCompiledOp *out)
+{
+    SwsOpList *copy = ff_sws_op_list_duplicate(ops);
+    if (!copy)
+        return AVERROR(ENOMEM);
+
+    SwsOpList rest = *copy;
+    int ret = compile_backend(ctx, backend, &rest, out);
     ff_sws_op_list_free(&copy);
     return ret;
 }
@@ -86,8 +89,19 @@ static int compile_ops(SwsContext *ctx, const SwsOpList *ops, SwsCompiledOp *out
         if (ops->src.hw_format != backend->hw_format ||
             ops->dst.hw_format != backend->hw_format)
             continue;
-        if (ff_sws_ops_compile_backend(ctx, backend, ops, out) < 0)
+
+        /* Make an on-stack copy of `ops` to ensure we can still properly clean up
+         * the copy afterwards */
+        SwsOpList *copy = ff_sws_op_list_duplicate(ops);
+        SwsOpList rest = *copy;
+
+        int ret = compile_backend(ctx, backend, &rest, out);
+        ff_sws_op_list_free(&copy);
+        if (ret < 0) {
+            /* Reset state for each new backend tried */
+            memset(out, 0, sizeof(*out));
             continue;
+        }
 
         av_log(ctx, AV_LOG_VERBOSE, "Compiled using backend '%s': "
                "block size = %d, over-read = %d, over-write = %d, cpu flags = 0x%x\n",
-- 
2.52.0


>From 5479a25efbf7643d57097dfe86f3b1b9198e384e Mon Sep 17 00:00:00 2001
From: Niklas Haas <git@haasn.dev>
Date: Fri, 27 Feb 2026 18:07:36 +0100
Subject: [PATCH 5/6] swscale/ops_dispatch: set up static pass metadata during
 ops compile

This moves some code around to prepare for an upcoming refactor. The main
motivation is to avoid compile_pass() depending on any particular
SwsOpList state, in particular after we change backends to allow
returning multiple different compiled ops.

This does require some slight sanitization to ensure each backend tried sees a
clean version of the SwsOpPass/SwsCompiledOp struct, as a result of needing
to set the read/write-dependent metadata before the backend has a chance to
mutate the op list.

Sponsored-by: Sovereign Tech Fund
Signed-off-by: Niklas Haas <git@haasn.dev>
---
 libswscale/ops_dispatch.c | 85 ++++++++++++++++++++++-----------------
 1 file changed, 48 insertions(+), 37 deletions(-)

diff --git a/libswscale/ops_dispatch.c b/libswscale/ops_dispatch.c
index 7aaa81212c..87caf11a4b 100644
--- a/libswscale/ops_dispatch.c
+++ b/libswscale/ops_dispatch.c
@@ -82,7 +82,47 @@ int ff_sws_ops_compile_backend(SwsContext *ctx, const SwsOpBackend *backend,
     return ret;
 }
 
-static int compile_ops(SwsContext *ctx, const SwsOpList *ops, SwsCompiledOp *out)
+static int rw_planes(const SwsOp *op)
+{
+    return op->rw.packed ? 1 : op->rw.elems;
+}
+
+static int rw_pixel_bits(const SwsOp *op)
+{
+    const int elems = op->rw.packed ? op->rw.elems : 1;
+    const int size  = ff_sws_pixel_type_size(op->type);
+    const int bits  = 8 >> op->rw.frac;
+    av_assert1(bits >= 1);
+    return elems * size * bits;
+}
+
+/* Compile and set up static metadata for a single pass */
+static int compile_single_pass(SwsContext *ctx, const SwsOpBackend *backend,
+                               SwsOpList *ops, SwsOpPass *p)
+{
+    av_assert0(ops->num_ops >= 2);
+    const SwsOp *read  = &ops->ops[0];
+    const SwsOp *write = &ops->ops[ops->num_ops - 1];
+    p->planes_in  = rw_planes(read);
+    p->planes_out = rw_planes(write);
+    p->pixel_bits_in  = rw_pixel_bits(read);
+    p->pixel_bits_out = rw_pixel_bits(write);
+    for (int i = 0; i < 4; i++) {
+        p->idx_in[i]  = i < p->planes_in  ? ops->order_src.in[i] : -1;
+        p->idx_out[i] = i < p->planes_out ? ops->order_dst.in[i] : -1;
+    }
+
+    int ret = compile_backend(ctx, backend, ops, &p->comp);
+    if (ret < 0)
+        return ret;
+
+    SwsOpExec *exec = &p->exec_base;
+    exec->block_size_in  = p->comp.block_size * p->pixel_bits_in  >> 3;
+    exec->block_size_out = p->comp.block_size * p->pixel_bits_out >> 3;
+    return ret;
+}
+
+static int compile_ops(SwsContext *ctx, const SwsOpList *ops, SwsOpPass *out)
 {
     for (int n = 0; ff_sws_op_backends[n]; n++) {
         const SwsOpBackend *backend = ff_sws_op_backends[n];
@@ -95,7 +135,7 @@ static int compile_ops(SwsContext *ctx, const SwsOpList *ops, SwsCompiledOp *out
         SwsOpList *copy = ff_sws_op_list_duplicate(ops);
         SwsOpList rest = *copy;
 
-        int ret = compile_backend(ctx, backend, &rest, out);
+        int ret = compile_single_pass(ctx, backend, &rest, out);
         ff_sws_op_list_free(&copy);
         if (ret < 0) {
             /* Reset state for each new backend tried */
@@ -103,10 +143,11 @@ static int compile_ops(SwsContext *ctx, const SwsOpList *ops, SwsCompiledOp *out
             continue;
         }
 
+        const SwsCompiledOp *comp = &out->comp;
         av_log(ctx, AV_LOG_VERBOSE, "Compiled using backend '%s': "
                "block size = %d, over-read = %d, over-write = %d, cpu flags = 0x%x\n",
-               backend->name, out->block_size, out->over_read, out->over_write,
-               out->cpu_flags);
+               backend->name, comp->block_size, comp->over_read, comp->over_write,
+               comp->cpu_flags);
         return 0;
     }
 
@@ -201,6 +242,8 @@ static void op_pass_setup(const AVFrame *out, const AVFrame *in,
         exec->out_bump[i] = exec->out_stride[i] - blocks_main * exec->block_size_out;
     }
 
+    exec->width  = pass->width;
+    exec->height = pass->height;
     exec->src_frame_ptr = in;
     exec->dst_frame_ptr = out;
 }
@@ -323,20 +366,6 @@ static void op_pass_run(const AVFrame *out, const AVFrame *in, const int y,
         handle_tail(p, &exec, memcpy_out, true, y + h_main, 1);
 }
 
-static int rw_planes(const SwsOp *op)
-{
-    return op->rw.packed ? 1 : op->rw.elems;
-}
-
-static int rw_pixel_bits(const SwsOp *op)
-{
-    const int elems = op->rw.packed ? op->rw.elems : 1;
-    const int size  = ff_sws_pixel_type_size(op->type);
-    const int bits  = 8 >> op->rw.frac;
-    av_assert1(bits >= 1);
-    return elems * size * bits;
-}
-
 static int compile(SwsGraph *graph, const SwsOpList *ops,
                    const SwsFormat *dst, SwsPass *input, SwsPass **output)
 {
@@ -345,28 +374,10 @@ static int compile(SwsGraph *graph, const SwsOpList *ops,
     if (!p)
         return AVERROR(ENOMEM);
 
-    int ret = compile_ops(ctx, ops, &p->comp);
+    int ret = compile_ops(ctx, ops, p);
     if (ret < 0)
         goto fail;
 
-    const SwsOp *read = &ops->ops[0];
-    const SwsOp *write = &ops->ops[ops->num_ops - 1];
-    p->planes_in  = rw_planes(read);
-    p->planes_out = rw_planes(write);
-    p->pixel_bits_in  = rw_pixel_bits(read);
-    p->pixel_bits_out = rw_pixel_bits(write);
-    p->exec_base = (SwsOpExec) {
-        .width  = dst->width,
-        .height = dst->height,
-        .block_size_in  = p->comp.block_size * p->pixel_bits_in  >> 3,
-        .block_size_out = p->comp.block_size * p->pixel_bits_out >> 3,
-    };
-
-    for (int i = 0; i < 4; i++) {
-        p->idx_in[i]  = i < p->planes_in  ? ops->order_src.in[i] : -1;
-        p->idx_out[i] = i < p->planes_out ? ops->order_dst.in[i] : -1;
-    }
-
     SwsPass *pass;
     pass = ff_sws_graph_add_pass(graph, dst->format, dst->width, dst->height,
                                  input, p->comp.slice_align, p, op_pass_run);
-- 
2.52.0


>From e1f23d7cc5e09af092636a517cd6e789e79a73d8 Mon Sep 17 00:00:00 2001
From: Niklas Haas <git@haasn.dev>
Date: Fri, 27 Feb 2026 16:24:55 +0100
Subject: [PATCH 6/6] swscale/ops: allow backends to return EAGAIN on compile()

This allows backends to handle op lists that require multiple passes,
e.g. when applying multiple filters, or filtering the result of a previous op.

In order for each pass to self-contain its state, we first generate a linked
list of SwsOpPass structs, and only turn them into SwsPass instances once
compilation has succeeded.

Sponsored-by: Sovereign Tech Fund
Signed-off-by: Niklas Haas <git@haasn.dev>
---
 libswscale/ops_dispatch.c | 118 +++++++++++++++++++++++++-------------
 libswscale/ops_internal.h |   5 ++
 2 files changed, 83 insertions(+), 40 deletions(-)

diff --git a/libswscale/ops_dispatch.c b/libswscale/ops_dispatch.c
index 87caf11a4b..7f9cf88dd6 100644
--- a/libswscale/ops_dispatch.c
+++ b/libswscale/ops_dispatch.c
@@ -42,8 +42,19 @@ typedef struct SwsOpPass {
     int idx_out[4];
     bool memcpy_in;
     bool memcpy_out;
+
+    struct SwsOpPass *next; /* next pass in chain */
 } SwsOpPass;
 
+static void op_pass_free(void *ptr)
+{
+    SwsOpPass *p = ptr;
+    if (p->comp.free)
+        p->comp.free(p->comp.priv);
+
+    av_free(p);
+}
+
 static int compile_backend(SwsContext *ctx, const SwsOpBackend *backend,
                            SwsOpList *ops, SwsCompiledOp *out)
 {
@@ -54,7 +65,7 @@ static int compile_backend(SwsContext *ctx, const SwsOpBackend *backend,
 
     const int orig_num_ops = ops->num_ops;
     int ret = backend->compile(ctx, ops, &compiled);
-    if (ret < 0) {
+    if (ret < 0 && ret != AVERROR(EAGAIN)) {
         int msg_lev = ret == AVERROR(ENOTSUP) ? AV_LOG_TRACE : AV_LOG_ERROR;
         av_log(ctx, msg_lev, "Backend '%s' failed to compile operations: %s\n",
                backend->name, av_err2str(ret));
@@ -113,7 +124,7 @@ static int compile_single_pass(SwsContext *ctx, const SwsOpBackend *backend,
     }
 
     int ret = compile_backend(ctx, backend, ops, &p->comp);
-    if (ret < 0)
+    if (ret < 0 && ret != AVERROR(EAGAIN))
         return ret;
 
     SwsOpExec *exec = &p->exec_base;
@@ -122,7 +133,8 @@ static int compile_single_pass(SwsContext *ctx, const SwsOpBackend *backend,
     return ret;
 }
 
-static int compile_ops(SwsContext *ctx, const SwsOpList *ops, SwsOpPass *out)
+static int compile_ops(SwsContext *ctx, const SwsOpList *ops,
+                       SwsOpPass **out_passes)
 {
     for (int n = 0; ff_sws_op_backends[n]; n++) {
         const SwsOpBackend *backend = ff_sws_op_backends[n];
@@ -135,19 +147,49 @@ static int compile_ops(SwsContext *ctx, const SwsOpList *ops, SwsOpPass *out)
         SwsOpList *copy = ff_sws_op_list_duplicate(ops);
         SwsOpList rest = *copy;
 
-        int ret = compile_single_pass(ctx, backend, &rest, out);
+        SwsOpPass *passes = NULL, *prev = NULL;
+        int ret;
+
+        do {
+            SwsOpPass *p = av_mallocz(sizeof(*p));
+            if (!p) {
+                ret = AVERROR(ENOMEM);
+                break;
+            }
+
+            /* Link to previous pass */
+            if (prev) {
+                prev->next = p;
+                prev = p;
+            } else {
+                passes = prev = p;
+            }
+
+            ret = compile_single_pass(ctx, backend, &rest, p);
+        } while (ret == AVERROR(EAGAIN));
+
         ff_sws_op_list_free(&copy);
         if (ret < 0) {
-            /* Reset state for each new backend tried */
-            memset(out, 0, sizeof(*out));
+            /* Free any passes that may have been partially initialized */
+            while (passes) {
+                SwsOpPass *next = passes->next;
+                op_pass_free(passes);
+                passes = next;
+            }
             continue;
         }
 
-        const SwsCompiledOp *comp = &out->comp;
-        av_log(ctx, AV_LOG_VERBOSE, "Compiled using backend '%s': "
-               "block size = %d, over-read = %d, over-write = %d, cpu flags = 0x%x\n",
-               backend->name, comp->block_size, comp->over_read, comp->over_write,
-               comp->cpu_flags);
+        *out_passes = passes;
+
+        av_log(ctx, AV_LOG_VERBOSE, "Compiled using backend '%s'\n", backend->name);
+        for (int i = 0; passes; i++) {
+            const SwsCompiledOp *comp = &passes->comp;
+            av_log(ctx, AV_LOG_DEBUG, "  Pass %d: block size = %d, "
+                   "over-read = %d, over-write = %d, cpu flags = 0x%x\n",
+                   i, comp->block_size, comp->over_read, comp->over_write,
+                   comp->cpu_flags);
+            passes = passes->next;
+        }
         return 0;
     }
 
@@ -156,18 +198,6 @@ static int compile_ops(SwsContext *ctx, const SwsOpList *ops, SwsOpPass *out)
     return AVERROR(ENOTSUP);
 }
 
-static void op_pass_free(void *ptr)
-{
-    SwsOpPass *p = ptr;
-    if (!p)
-        return;
-
-    if (p->comp.free)
-        p->comp.free(p->comp.priv);
-
-    av_free(p);
-}
-
 static inline void get_row_data(const SwsOpPass *p, const int y,
                                 const uint8_t *in[4], uint8_t *out[4])
 {
@@ -370,30 +400,38 @@ static int compile(SwsGraph *graph, const SwsOpList *ops,
                    const SwsFormat *dst, SwsPass *input, SwsPass **output)
 {
     SwsContext *ctx = graph->ctx;
-    SwsOpPass *p = av_mallocz(sizeof(*p));
-    if (!p)
-        return AVERROR(ENOMEM);
+    SwsPass *pass = NULL;
+    SwsOpPass *p;
 
-    int ret = compile_ops(ctx, ops, p);
+    int ret = compile_ops(ctx, ops, &p);
     if (ret < 0)
-        goto fail;
+        return ret;
 
-    SwsPass *pass;
-    pass = ff_sws_graph_add_pass(graph, dst->format, dst->width, dst->height,
-                                 input, p->comp.slice_align, p, op_pass_run);
-    if (!pass) {
-        ret = AVERROR(ENOMEM);
-        goto fail;
+    while (p) {
+        pass = ff_sws_graph_add_pass(graph, dst->format, dst->width, dst->height,
+                                     input, p->comp.slice_align, p, op_pass_run);
+        if (!pass) {
+            /* Free any remaining passes */
+            while (p) {
+                SwsOpPass *next = p->next;
+                op_pass_free(p);
+                p = next;
+            }
+
+            return AVERROR(ENOMEM);
+        }
+
+        pass->setup = op_pass_setup;
+        pass->free  = op_pass_free;
+        p = p->next;
+
+        /* Chain passes together */
+        input = pass;
     }
-    pass->setup = op_pass_setup;
-    pass->free  = op_pass_free;
 
+    /* Return last pass added */
     *output = pass;
     return 0;
-
-fail:
-    op_pass_free(p);
-    return ret;
 }
 
 int ff_sws_compile_pass(SwsGraph *graph, SwsOpList *ops, int flags,
diff --git a/libswscale/ops_internal.h b/libswscale/ops_internal.h
index cfc386a081..80215dcb71 100644
--- a/libswscale/ops_internal.h
+++ b/libswscale/ops_internal.h
@@ -59,6 +59,11 @@ typedef struct SwsOpBackend {
      * Compile an operation list to an implementation chain. May modify `ops`
      * freely; the original list will be freed automatically by the caller.
      *
+     * A special return value of AVERROR(EAGAIN) indicates that the backend
+     * produced a partial output and must be called again on the remainder.
+     * In this case, the caller will call the function again with the same
+     * parameters (including any changes to `ops` made by the backend).
+     *
      * Returns 0 or a negative error code.
      */
     int (*compile)(SwsContext *ctx, SwsOpList *ops, SwsCompiledOp *out);
-- 
2.52.0

_______________________________________________
ffmpeg-devel mailing list -- ffmpeg-devel@ffmpeg.org
To unsubscribe send an email to ffmpeg-devel-leave@ffmpeg.org

                 reply	other threads:[~2026-02-27 17:29 UTC|newest]

Thread overview: [no followups] expand[flat|nested]  mbox.gz  Atom feed

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=177221328498.25.6802438838394186248@29965ddac10e \
    --to=ffmpeg-devel@ffmpeg.org \
    --cc=code@ffmpeg.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

This inbox may be cloned and mirrored by anyone:

	git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
		ffmpegdev@gitmailbox.com
	public-inbox-index ffmpegdev

Example config snippet for mirrors.


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git