From: Niklas Haas <ffmpeg@haasn.xyz> To: ffmpeg-devel@ffmpeg.org Cc: Niklas Haas <git@haasn.dev> Subject: [FFmpeg-devel] [PATCH v2 09/17] swscale/ops: add dispatch layer Date: Wed, 21 May 2025 14:43:55 +0200 Message-ID: <20250521124824.49657-10-ffmpeg@haasn.xyz> (raw) In-Reply-To: <20250521124824.49657-1-ffmpeg@haasn.xyz> From: Niklas Haas <git@haasn.dev> This handles the low-level execution of an op list, and integration into the SwsGraph infrastructure. To handle frames with insufficient padding in the stride (or a width smaller than one block size), we use a fallback loop that pads the last column of pixels using `memcpy` into an appropriately sized buffer. --- libswscale/ops.c | 256 +++++++++++++++++++++++++++++++++++++++++++++++ libswscale/ops.h | 14 +++ 2 files changed, 270 insertions(+) diff --git a/libswscale/ops.c b/libswscale/ops.c index 8491bd9cad..d466f5e45c 100644 --- a/libswscale/ops.c +++ b/libswscale/ops.c @@ -582,3 +582,259 @@ int ff_sws_ops_compile(SwsContext *ctx, const SwsOpList *ops, SwsCompiledOp *out ff_sws_op_list_print(ctx, AV_LOG_WARNING, ops); return AVERROR(ENOTSUP); } + +typedef struct SwsOpPass { + SwsCompiledOp comp; + SwsOpExec exec_base; + int num_blocks; + int tail_off_in; + int tail_off_out; + int tail_size_in; + int tail_size_out; + bool memcpy_in; + bool memcpy_out; +} SwsOpPass; + +static void op_pass_free(void *ptr) +{ + SwsOpPass *p = ptr; + if (!p) + return; + + if (p->comp.free) + p->comp.free(p->comp.priv); + + av_free(p); +} + +static void op_pass_setup(const SwsImg *out, const SwsImg *in, const SwsPass *pass) +{ + const AVPixFmtDescriptor *indesc = av_pix_fmt_desc_get(in->fmt); + const AVPixFmtDescriptor *outdesc = av_pix_fmt_desc_get(out->fmt); + + SwsOpPass *p = pass->priv; + SwsOpExec *exec = &p->exec_base; + const SwsCompiledOp *comp = &p->comp; + const int block_size = comp->block_size; + p->num_blocks = (pass->width + block_size - 1) / block_size; + + /* Set up main loop parameters */ + const int aligned_w = 
p->num_blocks * block_size; + const int safe_width = (p->num_blocks - 1) * block_size; + const int tail_size = pass->width - safe_width; + p->tail_off_in = safe_width * exec->pixel_bits_in >> 3; + p->tail_off_out = safe_width * exec->pixel_bits_out >> 3; + p->tail_size_in = tail_size * exec->pixel_bits_in >> 3; + p->tail_size_out = tail_size * exec->pixel_bits_out >> 3; + p->memcpy_in = false; + p->memcpy_out = false; + + for (int i = 0; i < 4 && in->data[i]; i++) { + const int sub_x = (i == 1 || i == 2) ? indesc->log2_chroma_w : 0; + const int plane_w = (aligned_w + sub_x) >> sub_x; + const int plane_pad = (comp->over_read + sub_x) >> sub_x; + const int plane_size = plane_w * exec->pixel_bits_in >> 3; + p->memcpy_in |= plane_size + plane_pad > in->linesize[i]; + exec->in_stride[i] = in->linesize[i]; + } + + for (int i = 0; i < 4 && out->data[i]; i++) { + const int sub_x = (i == 1 || i == 2) ? outdesc->log2_chroma_w : 0; + const int plane_w = (aligned_w + sub_x) >> sub_x; + const int plane_pad = (comp->over_write + sub_x) >> sub_x; + const int plane_size = plane_w * exec->pixel_bits_out >> 3; + p->memcpy_out |= plane_size + plane_pad > out->linesize[i]; + exec->out_stride[i] = out->linesize[i]; + } +} + +/* Dispatch kernel over the last column of the image using memcpy */ +static av_always_inline void +handle_tail(const SwsOpPass *p, SwsOpExec *exec, + const SwsImg *out_base, const bool copy_out, + const SwsImg *in_base, const bool copy_in, + int y, const int h) +{ + DECLARE_ALIGNED_64(uint8_t, tmp)[2][4][sizeof(uint32_t[128])]; + + const SwsCompiledOp *comp = &p->comp; + const int tail_size_in = p->tail_size_in; + const int tail_size_out = p->tail_size_out; + const int bx = p->num_blocks - 1; + + SwsImg in = ff_sws_img_shift(in_base, y); + SwsImg out = ff_sws_img_shift(out_base, y); + for (int i = 0; i < 4 && in.data[i]; i++) { + in.data[i] += p->tail_off_in; + if (copy_in) { + exec->in[i] = (void *) tmp[0][i]; + exec->in_stride[i] = sizeof(tmp[0][i]); + } else { 
+ exec->in[i] = in.data[i]; + } + } + + for (int i = 0; i < 4 && out.data[i]; i++) { + out.data[i] += p->tail_off_out; + if (copy_out) { + exec->out[i] = (void *) tmp[1][i]; + exec->out_stride[i] = sizeof(tmp[1][i]); + } else { + exec->out[i] = out.data[i]; + } + } + + for (int y_end = y + h; y < y_end; y++) { + if (copy_in) { + for (int i = 0; i < 4 && in.data[i]; i++) { + av_assert2(tmp[0][i] + tail_size_in < (uint8_t *) tmp[1]); + memcpy(tmp[0][i], in.data[i], tail_size_in); + in.data[i] += in.linesize[i]; + } + } + + comp->func(exec, comp->priv, bx, y, p->num_blocks, y + 1); + + if (copy_out) { + for (int i = 0; i < 4 && out.data[i]; i++) { + av_assert2(tmp[1][i] + tail_size_out < (uint8_t *) tmp[2]); + memcpy(out.data[i], tmp[1][i], tail_size_out); + out.data[i] += out.linesize[i]; + } + } + + for (int i = 0; i < 4; i++) { + if (!copy_in) + exec->in[i] += in.linesize[i]; + if (!copy_out) + exec->out[i] += out.linesize[i]; + } + } +} + +static void op_pass_run(const SwsImg *out_base, const SwsImg *in_base, + const int y, const int h, const SwsPass *pass) +{ + const SwsOpPass *p = pass->priv; + const SwsCompiledOp *comp = &p->comp; + + /* Fill exec metadata for this slice */ + const SwsImg in = ff_sws_img_shift(in_base, y); + const SwsImg out = ff_sws_img_shift(out_base, y); + SwsOpExec exec = p->exec_base; + exec.slice_y = y; + exec.slice_h = h; + for (int i = 0; i < 4; i++) { + exec.in[i] = in.data[i]; + exec.out[i] = out.data[i]; + } + + /** + * To ensure safety, we need to consider the following: + * + * 1. We can overread the input, unless this is the last line of an + * unpadded buffer. All defined operations can handle arbitrary pixel + * input, so overread of arbitrary data is fine. + * + * 2. We can overwrite the output, as long as we don't write more than the + * amount of pixels that fit into one linesize. So we always need to + * memcpy the last column on the output side if unpadded. + * + * 3. 
For the last row, we also need to memcpy the remainder of the input, + * to avoid reading past the end of the buffer. Note that since we know + * the run() function is called on stripes of the same buffer, we don't + * need to worry about this for the end of a slice. + */ + + const int last_slice = y + h == pass->height; + const bool memcpy_in = last_slice && p->memcpy_in; + const bool memcpy_out = p->memcpy_out; + const int num_blocks = p->num_blocks; + const int blocks_main = num_blocks - memcpy_out; + const int h_main = h - memcpy_in; + + /* Handle main section */ + comp->func(&exec, comp->priv, 0, y, blocks_main, y + h_main); + + if (memcpy_in) { + /* Safe part of last row */ + for (int i = 0; i < 4; i++) { + exec.in[i] += h_main * in.linesize[i]; + exec.out[i] += h_main * out.linesize[i]; + } + comp->func(&exec, comp->priv, 0, y + h_main, num_blocks - 1, y + h); + } + + /* Handle last column via memcpy, takes over `exec` so call these last */ + if (memcpy_out) + handle_tail(p, &exec, out_base, true, in_base, false, y, h_main); + if (memcpy_in) + handle_tail(p, &exec, out_base, memcpy_out, in_base, true, y + h_main, 1); +} + +static int rw_pixel_bits(const SwsOp *op) +{ + const int elems = op->rw.packed ? 
op->rw.elems : 1; + const int size = ff_sws_pixel_type_size(op->type); + const int bits = 8 >> op->rw.frac; + av_assert1(bits >= 1); + return elems * size * bits; +} + +int ff_sws_compile_pass(SwsGraph *graph, SwsOpList *ops, int flags, SwsFormat dst, + SwsPass *input, SwsPass **output) +{ + SwsContext *ctx = graph->ctx; + SwsOpPass *p = NULL; + const SwsOp *read = &ops->ops[0]; + const SwsOp *write = &ops->ops[ops->num_ops - 1]; + SwsPass *pass; + int ret; + + if (ops->num_ops < 2) { + av_log(ctx, AV_LOG_ERROR, "Need at least two operations.\n"); + return AVERROR(EINVAL); + } + + if (read->op != SWS_OP_READ || write->op != SWS_OP_WRITE) { + av_log(ctx, AV_LOG_ERROR, "First and last operations must be a read " + "and write, respectively.\n"); + return AVERROR(EINVAL); + } + + if (flags & SWS_OP_FLAG_OPTIMIZE) + RET(ff_sws_op_list_optimize(ops)); + else + ff_sws_op_list_update_comps(ops); + + p = av_mallocz(sizeof(*p)); + if (!p) + return AVERROR(ENOMEM); + + p->exec_base = (SwsOpExec) { + .width = dst.width, + .height = dst.height, + .pixel_bits_in = rw_pixel_bits(read), + .pixel_bits_out = rw_pixel_bits(write), + }; + + ret = ff_sws_ops_compile(ctx, ops, &p->comp); + if (ret < 0) + goto fail; + + pass = ff_sws_graph_add_pass(graph, dst.format, dst.width, dst.height, input, + 1, p, op_pass_run); + if (!pass) { + ret = AVERROR(ENOMEM); + goto fail; + } + pass->setup = op_pass_setup; + pass->free = op_pass_free; + + *output = pass; + return 0; + +fail: + op_pass_free(p); + return ret; +} diff --git a/libswscale/ops.h b/libswscale/ops.h index ae65d578b3..1a992f42ec 100644 --- a/libswscale/ops.h +++ b/libswscale/ops.h @@ -249,4 +249,18 @@ void ff_sws_op_list_update_comps(SwsOpList *ops); */ int ff_sws_op_list_optimize(SwsOpList *ops); +enum SwsOpCompileFlags { + /* Automatically optimize the operations when compiling */ + SWS_OP_FLAG_OPTIMIZE = 1 << 0, +}; + +/** + * Resolves an operation list to a graph pass. 
The first and last operations + * must be a read/write respectively. `flags` is a list of SwsOpCompileFlags. + * + * Note: `ops` may be modified by this function. + */ +int ff_sws_compile_pass(SwsGraph *graph, SwsOpList *ops, int flags, SwsFormat dst, + SwsPass *input, SwsPass **output); + #endif -- 2.49.0 _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
next prev parent reply other threads:[~2025-05-21 12:52 UTC|newest] Thread overview: 21+ messages / expand[flat|nested] mbox.gz Atom feed top 2025-05-21 12:43 [FFmpeg-devel] [PATCH v2 00/17] swscale: new ops framework Niklas Haas 2025-05-21 12:43 ` [FFmpeg-devel] [PATCH v2 01/17] swscale/format: rename legacy format conversion table Niklas Haas 2025-05-21 12:43 ` [FFmpeg-devel] [PATCH v2 02/17] swscale/format: add ff_fmt_clear() Niklas Haas 2025-05-21 12:43 ` [FFmpeg-devel] [PATCH v2 03/17] tests/checkasm: increase number of runs in between measurements Niklas Haas 2025-05-21 12:43 ` [FFmpeg-devel] [PATCH v2 04/17] tests/checkasm: generalize DEF_CHECKASM_CHECK_FUNC to floats Niklas Haas 2025-05-21 12:43 ` [FFmpeg-devel] [PATCH v2 05/17] swscale: add SWS_UNSTABLE flag Niklas Haas 2025-05-21 12:43 ` [FFmpeg-devel] [PATCH v2 06/17] swscale/ops: introduce new low level framework Niklas Haas 2025-05-21 12:43 ` [FFmpeg-devel] [PATCH v2 07/17] swscale/optimizer: add high-level ops optimizer Niklas Haas 2025-05-21 12:43 ` [FFmpeg-devel] [PATCH v2 08/17] swscale/ops_internal: add internal ops backend API Niklas Haas 2025-05-23 16:27 ` Michael Niedermayer 2025-05-23 16:52 ` Niklas Haas 2025-05-21 12:43 ` Niklas Haas [this message] 2025-05-21 12:43 ` [FFmpeg-devel] [PATCH v2 10/17] swscale/optimizer: add packed shuffle solver Niklas Haas 2025-05-21 12:43 ` [FFmpeg-devel] [PATCH v2 11/17] swscale/ops_chain: add internal abstraction for kernel linking Niklas Haas 2025-05-21 12:43 ` [FFmpeg-devel] [PATCH v2 12/17] swscale/ops_backend: add reference backend based on C templates Niklas Haas 2025-05-21 12:43 ` [FFmpeg-devel] [PATCH v2 13/17] swscale/ops_memcpy: add 'memcpy' backend for plane->plane copies Niklas Haas 2025-05-21 12:44 ` [FFmpeg-devel] [PATCH v2 14/17] swscale/x86: add SIMD backend Niklas Haas 2025-05-21 14:11 ` Kieran Kunhya via ffmpeg-devel 2025-05-21 12:44 ` [FFmpeg-devel] [PATCH v2 15/17] tests/checkasm: add checkasm tests for swscale ops Niklas Haas 2025-05-21
12:44 ` [FFmpeg-devel] [PATCH v2 16/17] swscale/format: add new format decode/encode logic Niklas Haas 2025-05-21 12:44 ` [FFmpeg-devel] [PATCH v2 17/17] swscale/graph: allow experimental use of new format handler Niklas Haas
Reply instructions: You may reply publicly to this message via plain-text email using any one of the following methods: * Save the following mbox file, import it into your mail client, and reply-to-all from there: mbox Avoid top-posting and favor interleaved quoting: https://en.wikipedia.org/wiki/Posting_style#Interleaved_style * Reply using the --to, --cc, and --in-reply-to switches of git-send-email(1): git send-email \ --in-reply-to=20250521124824.49657-10-ffmpeg@haasn.xyz \ --to=ffmpeg@haasn.xyz \ --cc=ffmpeg-devel@ffmpeg.org \ --cc=git@haasn.dev \ /path/to/YOUR_REPLY https://kernel.org/pub/software/scm/git/docs/git-send-email.html * If your mail client supports setting the In-Reply-To header via mailto: links, try the mailto: link
Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel This inbox may be cloned and mirrored by anyone: git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git # If you have public-inbox 1.1+ installed, you may # initialize and index your mirror using the following commands: public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \ ffmpegdev@gitmailbox.com public-inbox-index ffmpegdev Example config snippet for mirrors. AGPL code for this site: git clone https://public-inbox.org/public-inbox.git