Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
 help / color / mirror / Atom feed
From: Niklas Haas <ffmpeg@haasn.xyz>
To: ffmpeg-devel@ffmpeg.org
Cc: Niklas Haas <git@haasn.dev>
Subject: [FFmpeg-devel] [PATCH v6 10/18] swscale/ops: add dispatch layer
Date: Wed, 11 Jun 2025 14:47:16 +0200
Message-ID: <20250611124805.73196-11-ffmpeg@haasn.xyz> (raw)
In-Reply-To: <20250611124805.73196-1-ffmpeg@haasn.xyz>

From: Niklas Haas <git@haasn.dev>

This handles the low-level execution of an op list, and integration into
the SwsGraph infrastructure. To handle frames with insufficient padding in
the stride (or a width smaller than one block size), we use a fallback loop
that pads the last column of pixels using `memcpy` into an appropriately
sized buffer.
---
 libswscale/ops.c          | 269 ++++++++++++++++++++++++++++++++++++++
 libswscale/ops.h          |  14 ++
 libswscale/ops_internal.h |  18 ++-
 3 files changed, 294 insertions(+), 7 deletions(-)

diff --git a/libswscale/ops.c b/libswscale/ops.c
index 101ceda568..762eb6370c 100644
--- a/libswscale/ops.c
+++ b/libswscale/ops.c
@@ -587,3 +587,272 @@ int ff_sws_ops_compile(SwsContext *ctx, const SwsOpList *ops, SwsCompiledOp *out
     ff_sws_op_list_print(ctx, AV_LOG_WARNING, ops);
     return AVERROR(ENOTSUP);
 }
+
+/* Private state of an op-list pass; stored in SwsPass.priv */
+typedef struct SwsOpPass {
+    SwsCompiledOp comp;   /* compiled kernel plus its block size and padding needs */
+    SwsOpExec exec_base;  /* template copied into the per-slice SwsOpExec */
+    int num_blocks;       /* blocks per line, with the width rounded up */
+
+    /* Geometry of the last block of each line, in bytes */
+    int tail_off_in,  tail_off_out;  /* offset from the start of the line */
+    int tail_size_in, tail_size_out; /* bytes that hold real pixels */
+
+    int pixel_bits_in, pixel_bits_out; /* bits per pixel on the read / write side */
+    bool memcpy_in, memcpy_out; /* last block may need a scratch buffer */
+} SwsOpPass;
+
+/* SwsPass.free callback: tear down the compiled kernel, then the pass state */
+static void op_pass_free(void *ptr)
+{
+    SwsOpPass *const state = ptr;
+
+    if (state) {
+        if (state->comp.free)
+            state->comp.free(state->comp.priv);
+        av_free(state);
+    }
+}
+
+/* SwsPass.setup callback: lay out blocks, tails and strides for these buffers */
+static void op_pass_setup(const SwsImg *out, const SwsImg *in, const SwsPass *pass)
+{
+    SwsOpPass *p = pass->priv;
+    SwsOpExec *exec = &p->exec_base;
+    const SwsCompiledOp *comp = &p->comp;
+    const AVPixFmtDescriptor *desc_in  = av_pix_fmt_desc_get(in->fmt);
+    const AVPixFmtDescriptor *desc_out = av_pix_fmt_desc_get(out->fmt);
+    const int bs = comp->block_size;
+
+    /* Whole blocks per line; all of them except the last lie fully inside it */
+    p->num_blocks = (pass->width + bs - 1) / bs;
+    const int padded_w = p->num_blocks * bs;
+    const int safe_w   = (p->num_blocks - 1) * bs;
+    const int tail_w   = pass->width - safe_w;
+    p->tail_off_in   = safe_w * p->pixel_bits_in  >> 3;
+    p->tail_size_in  = tail_w * p->pixel_bits_in  >> 3;
+    p->tail_off_out  = safe_w * p->pixel_bits_out >> 3;
+    p->tail_size_out = tail_w * p->pixel_bits_out >> 3;
+    p->memcpy_in = p->memcpy_out = false;
+
+    /* A padded line plus the kernel's over-read has to fit into the stride */
+    for (int i = 0; i < 4 && in->data[i]; i++) {
+        const int shift = (i == 1 || i == 2) ? desc_in->log2_chroma_w : 0;
+        const int bytes = ((padded_w + shift) >> shift) * p->pixel_bits_in >> 3;
+        const int pad   = (comp->over_read + shift) >> shift;
+        if (bytes + pad > in->linesize[i])
+            p->memcpy_in = true;
+        exec->in_stride[i] = in->linesize[i];
+    }
+
+    /* Same test on the output side, using the kernel's over-write */
+    for (int i = 0; i < 4 && out->data[i]; i++) {
+        const int shift = (i == 1 || i == 2) ? desc_out->log2_chroma_w : 0;
+        const int bytes = ((padded_w + shift) >> shift) * p->pixel_bits_out >> 3;
+        const int pad   = (comp->over_write + shift) >> shift;
+        if (bytes + pad > out->linesize[i])
+            p->memcpy_out = true;
+        exec->out_stride[i] = out->linesize[i];
+    }
+
+    /* Pointer bump for the main section only; the tail and last-row handlers
+     * process a single line per call, so the value does not matter to them */
+    const int main_blocks = p->num_blocks - p->memcpy_out;
+    for (int i = 0; i < 4; i++) {
+        exec->in_bump[i]  = in->linesize[i]  - main_blocks * exec->block_size_in;
+        exec->out_bump[i] = out->linesize[i] - main_blocks * exec->block_size_out;
+    }
+}
+
+/* Dispatch kernel over the last column of the image using memcpy.
+ *
+ * Runs the last block of rows [y, y + h), one row per kernel call. A side with
+ * copy_in / copy_out set goes through an aligned scratch buffer, moving only
+ * its tail_size_* bytes. Takes over the pointers and strides in `exec`. */
+static av_always_inline void
+handle_tail(const SwsOpPass *p, SwsOpExec *exec,
+            const SwsImg *out_base, const bool copy_out,
+            const SwsImg *in_base, const bool copy_in,
+            int y, const int h)
+{
+    DECLARE_ALIGNED_64(uint8_t, tmp)[2][4][sizeof(uint32_t[128])];
+
+    const SwsCompiledOp *comp = &p->comp;
+    const int tail_size_in  = p->tail_size_in;
+    const int tail_size_out = p->tail_size_out;
+    const int bx = p->num_blocks - 1;
+
+    /* Copies are per plane, so bound them by a single plane's scratch slot */
+    av_assert2(!copy_in  || tail_size_in  <= (int) sizeof(tmp[0][0]));
+    av_assert2(!copy_out || tail_size_out <= (int) sizeof(tmp[1][0]));
+
+    SwsImg in  = ff_sws_img_shift(in_base,  y);
+    SwsImg out = ff_sws_img_shift(out_base, y);
+    for (int i = 0; i < 4 && in.data[i]; i++) {
+        in.data[i] += p->tail_off_in;
+        exec->in[i] = copy_in ? (void *) tmp[0][i] : in.data[i];
+        if (copy_in)
+            exec->in_stride[i] = sizeof(tmp[0][i]);
+    }
+
+    for (int i = 0; i < 4 && out.data[i]; i++) {
+        out.data[i] += p->tail_off_out;
+        exec->out[i] = copy_out ? (void *) tmp[1][i] : out.data[i];
+        if (copy_out)
+            exec->out_stride[i] = sizeof(tmp[1][i]);
+    }
+
+    for (int y_end = y + h; y < y_end; y++) {
+        if (copy_in) {
+            for (int i = 0; i < 4 && in.data[i]; i++) {
+                memcpy(tmp[0][i], in.data[i], tail_size_in);
+                in.data[i] += in.linesize[i];
+            }
+        }
+
+        comp->func(exec, comp->priv, bx, y, p->num_blocks, y + 1);
+
+        if (copy_out) {
+            for (int i = 0; i < 4 && out.data[i]; i++) {
+                memcpy(out.data[i], tmp[1][i], tail_size_out);
+                out.data[i] += out.linesize[i];
+            }
+        }
+
+        for (int i = 0; i < 4; i++) {
+            if (!copy_in)
+                exec->in[i] += in.linesize[i];
+            if (!copy_out)
+                exec->out[i] += out.linesize[i];
+        }
+    }
+}
+
+/* Pass run function: execute the compiled kernel on rows [y, y + h) of the
+ * pass, bouncing the last block / row through memcpy where padding is missing */
+static void op_pass_run(const SwsImg *out_base, const SwsImg *in_base,
+                        const int y, const int h, const SwsPass *pass)
+{
+    const SwsOpPass *p = pass->priv;
+    const SwsCompiledOp *comp = &p->comp;
+    const SwsImg src = ff_sws_img_shift(in_base,  y);
+    const SwsImg dst = ff_sws_img_shift(out_base, y);
+
+    /* Per-slice copy of the exec template, aimed at the first row of the slice */
+    DECLARE_ALIGNED_32(SwsOpExec, exec) = p->exec_base;
+    exec.slice_y = y;
+    exec.slice_h = h;
+    for (int i = 0; i < 4; i++) {
+        exec.in[i]  = src.data[i];
+        exec.out[i] = dst.data[i];
+    }
+
+    /**
+     * Safety rules when the stride does not provide enough padding:
+     *
+     * 1. Over-reading the input is harmless, since every defined operation
+     *    copes with arbitrary pixel data. The one exception is the last line
+     *    of an unpadded buffer, where the read would leave the buffer.
+     *
+     * 2. Over-writing the output is only tolerable within one linesize worth
+     *    of pixels, so an unpadded output always has its last column written
+     *    through memcpy.
+     *
+     * 3. On the input side only the very last row needs memcpy. run() is
+     *    called on stripes of the same buffer, so the end of an earlier slice
+     *    is still followed by readable data.
+     */
+
+    /* Blocks / rows handled in place; the remainder goes through handle_tail() */
+    const bool bounce_out  = p->memcpy_out;
+    const bool bounce_in   = p->memcpy_in && y + h == pass->height;
+    const int  full_blocks = p->num_blocks - bounce_out;
+    const int  full_rows   = h - bounce_in;
+
+    /* Main section: all rows and blocks that can be accessed in place */
+    comp->func(&exec, comp->priv, 0, y, full_blocks, y + full_rows);
+
+    if (bounce_in) {
+        /* Last row, up to but excluding its final block */
+        for (int i = 0; i < 4; i++) {
+            exec.in[i]  += full_rows * src.linesize[i];
+            exec.out[i] += full_rows * dst.linesize[i];
+        }
+        comp->func(&exec, comp->priv, 0, y + full_rows, p->num_blocks - 1, y + h);
+    }
+
+    /* handle_tail() takes over `exec`, so these calls have to come last */
+    if (bounce_out)
+        handle_tail(p, &exec, out_base, true, in_base, false, y, full_rows);
+    if (bounce_in)
+        handle_tail(p, &exec, out_base, bounce_out, in_base, true, y + full_rows, 1);
+}
+
+/* Bits per pixel of a read/write op; elements only add up when packed */
+static int rw_pixel_bits(const SwsOp *op)
+{
+    const int unit_bits = 8 >> op->rw.frac;
+    const int channels  = op->rw.packed ? op->rw.elems : 1;
+    av_assert1(unit_bits >= 1);
+    return channels * ff_sws_pixel_type_size(op->type) * unit_bits;
+}
+
+/* Compile `ops` into a new pass of `graph`; returns 0 or a negative AVERROR */
+int ff_sws_compile_pass(SwsGraph *graph, SwsOpList *ops, int flags, SwsFormat dst,
+                        SwsPass *input, SwsPass **output)
+{
+    SwsContext *ctx = graph->ctx;
+    SwsOpPass *p = NULL;
+    SwsPass *pass;
+    int ret;
+
+    if (ops->num_ops < 2) {
+        av_log(ctx, AV_LOG_ERROR, "Need at least two operations.\n");
+        return AVERROR(EINVAL);
+    }
+
+    if (ops->ops[0].op != SWS_OP_READ ||
+        ops->ops[ops->num_ops - 1].op != SWS_OP_WRITE) {
+        av_log(ctx, AV_LOG_ERROR, "First and last operations must be a read "
+               "and write, respectively.\n");
+        return AVERROR(EINVAL);
+    }
+
+    if (flags & SWS_OP_FLAG_OPTIMIZE)
+        RET(ff_sws_op_list_optimize(ops));
+    else
+        ff_sws_op_list_update_comps(ops);
+
+    p = av_mallocz(sizeof(*p));
+    if (!p)
+        return AVERROR(ENOMEM);
+    ret = ff_sws_ops_compile(ctx, ops, &p->comp);
+    if (ret < 0)
+        goto fail;
+
+    /* Look the read/write ops up only now: the list may have been modified
+     * above, so pointers into it taken before that point could be stale */
+    p->pixel_bits_in  = rw_pixel_bits(&ops->ops[0]);
+    p->pixel_bits_out = rw_pixel_bits(&ops->ops[ops->num_ops - 1]);
+    p->exec_base = (SwsOpExec) {
+        .width  = dst.width,
+        .height = dst.height,
+        .block_size_in  = p->comp.block_size * p->pixel_bits_in  >> 3,
+        .block_size_out = p->comp.block_size * p->pixel_bits_out >> 3,
+    };
+
+    pass = ff_sws_graph_add_pass(graph, dst.format, dst.width, dst.height, input,
+                                 1, p, op_pass_run);
+    if (!pass) {
+        ret = AVERROR(ENOMEM);
+        goto fail;
+    }
+    pass->setup = op_pass_setup;
+    pass->free  = op_pass_free;
+    *output = pass;
+    return 0;
+
+fail:
+    op_pass_free(p);
+    return ret;
+}
diff --git a/libswscale/ops.h b/libswscale/ops.h
index a90701cf50..c4701404e1 100644
--- a/libswscale/ops.h
+++ b/libswscale/ops.h
@@ -249,4 +249,18 @@ void ff_sws_op_list_update_comps(SwsOpList *ops);
  */
 int ff_sws_op_list_optimize(SwsOpList *ops);
 
+enum SwsOpCompileFlags {
+    /* Run ff_sws_op_list_optimize() on the operation list before compiling */
+    SWS_OP_FLAG_OPTIMIZE = 1 << 0,
+};
+
+/**
+ * Compiles an operation list into a new graph pass, returned in `*output`. The
+ * first and last operations must be a read/write respectively. `flags` is a
+ * bitmask of SwsOpCompileFlags. Returns 0 or a negative AVERROR code.
+ * Note: `ops` may be modified by this function.
+ */
+int ff_sws_compile_pass(SwsGraph *graph, SwsOpList *ops, int flags, SwsFormat dst,
+                        SwsPass *input, SwsPass **output);
+
 #endif
diff --git a/libswscale/ops_internal.h b/libswscale/ops_internal.h
index 9fd866430b..2fbd8a55d0 100644
--- a/libswscale/ops_internal.h
+++ b/libswscale/ops_internal.h
@@ -33,21 +33,25 @@
  */
 typedef struct SwsOpExec {
     /* The data pointers point to the first pixel to process */
-    DECLARE_ALIGNED_32(const uint8_t, *in[4]);
-    DECLARE_ALIGNED_32(uint8_t, *out[4]);
+    const uint8_t *in[4];
+    uint8_t *out[4];
 
     /* Separation between lines in bytes */
-    DECLARE_ALIGNED_32(ptrdiff_t, in_stride[4]);
-    DECLARE_ALIGNED_32(ptrdiff_t, out_stride[4]);
+    ptrdiff_t in_stride[4];
+    ptrdiff_t out_stride[4];
+
+    /* Pointer bump, difference between stride and processed line size */
+    ptrdiff_t in_bump[4];
+    ptrdiff_t out_bump[4];
 
     /* Extra metadata, may or may not be useful */
     int32_t width, height;      /* Overall image dimensions */
     int32_t slice_y, slice_h;   /* Start and height of current slice */
-    int32_t pixel_bits_in;      /* Bits per input pixel */
-    int32_t pixel_bits_out;     /* Bits per output pixel */
+    int32_t block_size_in;      /* Size of a block of pixels in bytes */
+    int32_t block_size_out;
 } SwsOpExec;
 
-static_assert(sizeof(SwsOpExec) == 16 * sizeof(void *) + 8 * sizeof(int32_t),
+static_assert(sizeof(SwsOpExec) == 24 * sizeof(void *) + 6 * sizeof(int32_t),
               "SwsOpExec layout mismatch");
 
 /**
-- 
2.49.0

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

  parent reply	other threads:[~2025-06-11 12:51 UTC|newest]

Thread overview: 21+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2025-06-11 12:47 [FFmpeg-devel] [PATCH v6 00/18] swscale: new ops framework Niklas Haas
2025-06-11 12:47 ` [FFmpeg-devel] [PATCH v6 01/18] swscale/graph: pass per-pass image pointers to setup() Niklas Haas
2025-06-11 12:47 ` [FFmpeg-devel] [PATCH v6 02/18] swscale/format: rename legacy format conversion table Niklas Haas
2025-06-11 12:47 ` [FFmpeg-devel] [PATCH v6 03/18] swscale/format: add ff_fmt_clear() Niklas Haas
2025-06-11 12:47 ` [FFmpeg-devel] [PATCH v6 04/18] tests/checkasm: increase number of runs in between measurements Niklas Haas
2025-06-11 12:47 ` [FFmpeg-devel] [PATCH v6 05/18] tests/checkasm: generalize DEF_CHECKASM_CHECK_FUNC to floats Niklas Haas
2025-06-11 12:47 ` [FFmpeg-devel] [PATCH v6 06/18] swscale: add SWS_UNSTABLE flag Niklas Haas
2025-06-11 12:47 ` [FFmpeg-devel] [PATCH v6 07/18] swscale/ops: introduce new low level framework Niklas Haas
2025-06-11 12:47 ` [FFmpeg-devel] [PATCH v6 08/18] swscale/optimizer: add high-level ops optimizer Niklas Haas
2025-06-11 12:47 ` [FFmpeg-devel] [PATCH v6 09/18] swscale/ops_internal: add internal ops backend API Niklas Haas
2025-06-11 12:47 ` Niklas Haas [this message]
2025-06-11 12:47 ` [FFmpeg-devel] [PATCH v6 11/18] swscale/optimizer: add packed shuffle solver Niklas Haas
2025-06-11 12:47 ` [FFmpeg-devel] [PATCH v6 12/18] swscale/ops_chain: add internal abstraction for kernel linking Niklas Haas
2025-06-11 12:47 ` [FFmpeg-devel] [PATCH v6 13/18] swscale/ops_backend: add reference backend basend on C templates Niklas Haas
2025-06-11 12:47 ` [FFmpeg-devel] [PATCH v6 14/18] swscale/ops_memcpy: add 'memcpy' backend for plane->plane copies Niklas Haas
2025-06-11 12:47 ` [FFmpeg-devel] [PATCH v6 15/18] swscale/x86: add SIMD backend Niklas Haas
2025-06-11 12:47 ` [FFmpeg-devel] [PATCH v6 16/18] tests/checkasm: add checkasm tests for swscale ops Niklas Haas
2025-06-11 12:47 ` [FFmpeg-devel] [PATCH v6 17/18] swscale/format: add new format decode/encode logic Niklas Haas
2025-06-11 12:47 ` [FFmpeg-devel] [PATCH v6 18/18] swscale/graph: allow experimental use of new format handler Niklas Haas
2025-06-11 17:59 ` [FFmpeg-devel] [PATCH v6 00/18] swscale: new ops framework Niklas Haas
2025-06-16 12:31 ` Niklas Haas

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20250611124805.73196-11-ffmpeg@haasn.xyz \
    --to=ffmpeg@haasn.xyz \
    --cc=ffmpeg-devel@ffmpeg.org \
    --cc=git@haasn.dev \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

This inbox may be cloned and mirrored by anyone:

	git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
		ffmpegdev@gitmailbox.com
	public-inbox-index ffmpegdev

Example config snippet for mirrors.


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git