From: Niklas Haas <ffmpeg@haasn.xyz> To: ffmpeg-devel@ffmpeg.org Cc: Niklas Haas <git@haasn.dev> Subject: [FFmpeg-devel] [PATCH v2 11/17] swscale/ops_chain: add internal abstraction for kernel linking Date: Wed, 21 May 2025 14:43:57 +0200 Message-ID: <20250521124824.49657-12-ffmpeg@haasn.xyz> (raw) In-Reply-To: <20250521124824.49657-1-ffmpeg@haasn.xyz> From: Niklas Haas <git@haasn.dev> See doc/swscale-v2.txt for design details. --- libswscale/Makefile | 1 + libswscale/ops_chain.c | 293 +++++++++++++++++++++++++++++++++++++++++ libswscale/ops_chain.h | 109 +++++++++++++++ 3 files changed, 403 insertions(+) create mode 100644 libswscale/ops_chain.c create mode 100644 libswscale/ops_chain.h diff --git a/libswscale/Makefile b/libswscale/Makefile index 810c9dee78..c9dfa78c89 100644 --- a/libswscale/Makefile +++ b/libswscale/Makefile @@ -16,6 +16,7 @@ OBJS = alphablend.o \ input.o \ lut3d.o \ ops.o \ + ops_chain.o \ ops_optimizer.o \ options.o \ output.o \ diff --git a/libswscale/ops_chain.c b/libswscale/ops_chain.c new file mode 100644 index 0000000000..cba825ee41 --- /dev/null +++ b/libswscale/ops_chain.c @@ -0,0 +1,293 @@ +/** + * Copyright (C) 2025 Niklas Haas + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/avassert.h" +#include "libavutil/mem.h" +#include "libavutil/rational.h" + +#include "ops_chain.h" + +SwsOpChain *ff_sws_op_chain_alloc(void) +{ + return av_mallocz(sizeof(SwsOpChain)); +} + +void ff_sws_op_chain_free(SwsOpChain *chain) +{ + if (!chain) + return; + + for (int i = 0; i < chain->num_impl + 1; i++) { + if (chain->free[i]) + chain->free[i](chain->impl[i].priv.ptr); + } + + av_free(chain); +} + +int ff_sws_op_chain_append(SwsOpChain *chain, SwsFuncPtr func, + void (*free)(void *), SwsOpPriv priv) +{ + const int idx = chain->num_impl; + if (idx == SWS_MAX_OPS) + return AVERROR(EINVAL); + + av_assert1(func); + chain->impl[idx].cont = func; + chain->impl[idx + 1].priv = priv; + chain->free[idx + 1] = free; + chain->num_impl++; + return 0; +} + +/** + * Match an operation against a reference operation. Returns a score for how + * well the reference matches the operation, or 0 if there is no match. + * + * If `ref->comps` has any flags set, they must be set in `op` as well. + * Likewise, if `ref->comps` has any components marked as unused, they must be + * marked as as unused in `ops` as well. + * + * For SWS_OP_LINEAR, `ref->linear.mask` must be a strict superset of + * `op->linear.mask`, but may not contain any columns explicitly ignored by + * `op->comps.unused`. + * + * For SWS_OP_READ, SWS_OP_WRITE, SWS_OP_SWAP_BYTES and SWS_OP_SWIZZLE, the + * exact type is not checked, just the size. + * + * Components set in `next.unused` are ignored when matching. If `flexible` + * is true, the op body is ignored - only the operation, pixel type, and + * component masks are checked. 
+ */ +static int op_match(const SwsOp *op, const SwsOpEntry *entry, const SwsComps next) +{ + const SwsOp *ref = &entry->op; + int score = 10; + if (op->op != ref->op) + return 0; + + switch (op->op) { + case SWS_OP_READ: + case SWS_OP_WRITE: + case SWS_OP_SWAP_BYTES: + case SWS_OP_SWIZZLE: + /* Only the size matters for these operations */ + if (ff_sws_pixel_type_size(op->type) != ff_sws_pixel_type_size(ref->type)) + return 0; + break; + default: + if (op->type != ref->type) + return 0; + break; + } + + for (int i = 0; i < 4; i++) { + if (ref->comps.unused[i]) { + if (op->comps.unused[i]) + score += 1; /* Operating on fewer components is better .. */ + else + return false; /* .. but not too few! */ + } + + if (ref->comps.flags[i]) { + if (ref->comps.flags[i] & ~op->comps.flags[i]) { + return false; /* Missing required output assumptions */ + } else { + /* Implementation is more specialized */ + score += av_popcount(ref->comps.flags[i]); + } + } + } + + /* Flexible variants always match, but lower the score to prioritize more + * specific implementations if they exist */ + if (entry->flexible) + return score - 5; + + switch (op->op) { + case SWS_OP_INVALID: + return 0; + case SWS_OP_READ: + case SWS_OP_WRITE: + if (op->rw.elems != ref->rw.elems || + op->rw.frac != ref->rw.frac || + (op->rw.elems > 1 && op->rw.packed != ref->rw.packed)) + return 0; + return score; + case SWS_OP_SWAP_BYTES: + return score; + case SWS_OP_PACK: + case SWS_OP_UNPACK: + for (int i = 0; i < 4 && op->pack.pattern[i]; i++) { + if (op->pack.pattern[i] != ref->pack.pattern[i]) + return 0; + } + return score; + case SWS_OP_CLEAR: + for (int i = 0; i < 4; i++) { + if (!op->c.q4[i].den) + continue; + if (av_cmp_q(op->c.q4[i], ref->c.q4[i]) && !next.unused[i]) + return 0; + } + return score; + case SWS_OP_LSHIFT: + case SWS_OP_RSHIFT: + return op->c.u == ref->c.u ? 
score : 0; + case SWS_OP_SWIZZLE: + for (int i = 0; i < 4; i++) { + if (op->swizzle.in[i] != ref->swizzle.in[i] && !next.unused[i]) + return 0; + } + return score; + case SWS_OP_CONVERT: + if (op->convert.to != ref->convert.to || + op->convert.expand != ref->convert.expand) + return 0; + return score; + case SWS_OP_DITHER: + return op->dither.size_log2 == ref->dither.size_log2 ? score : 0; + case SWS_OP_MIN: + case SWS_OP_MAX: + for (int i = 0; i < 4; i++) { + if (av_cmp_q(op->c.q4[i], ref->c.q4[i]) && !next.unused[i]) + return 0; + } + return score; + case SWS_OP_LINEAR: + /* All required elements must be present */ + if (op->lin.mask & ~ref->lin.mask) + return 0; + /* To avoid operating on possibly undefined memory, filter out + * implementations that operate on more input components */ + for (int i = 0; i < 4; i++) { + if ((ref->lin.mask & SWS_MASK_COL(i)) && op->comps.unused[i]) + return 0; + } + /* Prioritize smaller implementations */ + score += av_popcount(SWS_MASK_ALL ^ ref->lin.mask); + return score; + case SWS_OP_SCALE: + return score; + case SWS_OP_TYPE_NB: + break; + } + + av_assert0(!"Invalid operation type!"); + return 0; +} + +int ff_sws_op_compile_tables(const SwsOpTable *const tables[], int num_tables, + SwsOpList *ops, const int block_size, + SwsOpChain *chain) +{ + static const SwsOp dummy = { .comps.unused = { true, true, true, true }}; + const SwsOp *next = ops->num_ops > 1 ? 
&ops->ops[1] : &dummy; + const unsigned cpu_flags = av_get_cpu_flags(); + const SwsOpEntry *best = NULL; + const SwsOp *op = &ops->ops[0]; + int ret, best_score = 0, best_cpu_flags; + SwsOpPriv priv = {0}; + + for (int n = 0; n < num_tables; n++) { + const SwsOpTable *table = tables[n]; + if (table->block_size && table->block_size != block_size || + table->cpu_flags & ~cpu_flags) + continue; + + for (int i = 0; table->entries[i]; i++) { + const SwsOpEntry *entry = table->entries[i]; + int score = op_match(op, entry, next->comps); + if (score > best_score) { + best_score = score; + best_cpu_flags = table->cpu_flags; + best = entry; + } + } + } + + if (!best) + return AVERROR(ENOTSUP); + + if (best->setup) { + ret = best->setup(op, &priv); + if (ret < 0) + return ret; + } + + chain->cpu_flags |= best_cpu_flags; + ret = ff_sws_op_chain_append(chain, best->func, best->free, priv); + if (ret < 0) { + if (best->free) + best->free(&priv); + return ret; + } + + ops->ops++; + ops->num_ops--; + return ops->num_ops ? AVERROR(EAGAIN) : 0; +} + +#define q2pixel(type, q) ((q).den ? 
(type) (q).num / (q).den : 0) + +int ff_sws_setup_u8(const SwsOp *op, SwsOpPriv *out) +{ + out->u8[0] = op->c.u; + return 0; +} + +int ff_sws_setup_u(const SwsOp *op, SwsOpPriv *out) +{ + switch (op->type) { + case SWS_PIXEL_U8: out->u8[0] = op->c.u; return 0; + case SWS_PIXEL_U16: out->u16[0] = op->c.u; return 0; + case SWS_PIXEL_U32: out->u32[0] = op->c.u; return 0; + case SWS_PIXEL_F32: out->f32[0] = op->c.u; return 0; + default: return AVERROR(EINVAL); + } +} + +int ff_sws_setup_q(const SwsOp *op, SwsOpPriv *out) +{ + switch (op->type) { + case SWS_PIXEL_U8: out->u8[0] = q2pixel(uint8_t, op->c.q); return 0; + case SWS_PIXEL_U16: out->u16[0] = q2pixel(uint16_t, op->c.q); return 0; + case SWS_PIXEL_U32: out->u32[0] = q2pixel(uint32_t, op->c.q); return 0; + case SWS_PIXEL_F32: out->f32[0] = q2pixel(float, op->c.q); return 0; + default: return AVERROR(EINVAL); + } + + return 0; +} + +int ff_sws_setup_q4(const SwsOp *op, SwsOpPriv *out) +{ + for (int i = 0; i < 4; i++) { + switch (op->type) { + case SWS_PIXEL_U8: out->u8[i] = q2pixel(uint8_t, op->c.q4[i]); break; + case SWS_PIXEL_U16: out->u16[i] = q2pixel(uint16_t, op->c.q4[i]); break; + case SWS_PIXEL_U32: out->u32[i] = q2pixel(uint32_t, op->c.q4[i]); break; + case SWS_PIXEL_F32: out->f32[i] = q2pixel(float, op->c.q4[i]); break; + default: return AVERROR(EINVAL); + } + } + + return 0; +} diff --git a/libswscale/ops_chain.h b/libswscale/ops_chain.h new file mode 100644 index 0000000000..6cbc3adabb --- /dev/null +++ b/libswscale/ops_chain.h @@ -0,0 +1,109 @@ +/** + * Copyright (C) 2025 Niklas Haas + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef SWSCALE_OPS_CHAIN_H
+#define SWSCALE_OPS_CHAIN_H
+
+#include "libavutil/cpu.h"
+
+#include "ops_internal.h"
+
+/**
+ * Helpers for SIMD implementations based on chained kernels, using a
+ * continuation passing style to link them together.
+ */
+
+/**
+ * Private data for each kernel.
+ *
+ * A union of exactly 16 bytes (checked by the static_assert below). The
+ * ff_sws_setup_*() helpers fill the typed array members; `ptr` is the member
+ * that ff_sws_op_chain_free() hands to a slot's free callback.
+ */
+typedef union SwsOpPriv {
+    DECLARE_ALIGNED_16(char, data)[16]; /* raw view; presumably forces 16-byte alignment (macro defined elsewhere) - TODO confirm */
+
+    /* Common types */
+    void *ptr; /* passed to the free callback by ff_sws_op_chain_free() */
+    uint8_t u8[16];
+    uint16_t u16[8];
+    uint32_t u32[4];
+    float f32[4];
+} SwsOpPriv;
+
+static_assert(sizeof(SwsOpPriv) == 16, "SwsOpPriv size mismatch");
+
+/* Setup helpers
+ *
+ * Each one converts a constant stored in `op->c` and writes it to `out`:
+ *  - ff_sws_setup_u:  `op->c.u`  into element 0 of the member matching op->type
+ *  - ff_sws_setup_u8: `op->c.u`  into `out->u8[0]`, independent of op->type
+ *  - ff_sws_setup_q:  `op->c.q`  into element 0 of the member matching op->type
+ *  - ff_sws_setup_q4: `op->c.q4` into elements 0..3 of the matching member
+ *
+ * They return 0 on success; all but ff_sws_setup_u8 return AVERROR(EINVAL)
+ * for a pixel type they do not handle.
+ */
+int ff_sws_setup_u(const SwsOp *op, SwsOpPriv *out);
+int ff_sws_setup_u8(const SwsOp *op, SwsOpPriv *out);
+int ff_sws_setup_q(const SwsOp *op, SwsOpPriv *out);
+int ff_sws_setup_q4(const SwsOp *op, SwsOpPriv *out);
+
+/**
+ * Per-kernel execution context.
+ *
+ * Note: This struct is hard-coded in assembly, so do not change the layout.
+ * The static_asserts after the struct pin its size (32) and the offset of
+ * `priv` (16).
+ */
+typedef void (*SwsFuncPtr)(void); /* generic function pointer type used to store kernels */
+typedef struct SwsOpImpl {
+    SwsFuncPtr cont; /* [offset = 0] Continuation for this operation. */
+    SwsOpPriv priv; /* [offset = 16] Private data for this operation. */
+} SwsOpImpl;
+
+static_assert(sizeof(SwsOpImpl) == 32, "SwsOpImpl layout mismatch");
+static_assert(offsetof(SwsOpImpl, priv) == 16, "SwsOpImpl layout mismatch");
+
+/* Compiled chain of operations, which can be dispatched efficiently
+ *
+ * ff_sws_op_chain_append() stores the n-th kernel's function in
+ * `impl[n].cont`, and its private data and free callback in slot n + 1. */
+typedef struct SwsOpChain {
+#define SWS_MAX_OPS 16
+    SwsOpImpl impl[SWS_MAX_OPS + 1]; /* reserve extra space for the entrypoint */
+    void (*free[SWS_MAX_OPS + 1])(void *); /* per-slot callback for impl[i].priv.ptr, or NULL */
+    int num_impl; /* number of kernels appended so far, at most SWS_MAX_OPS */
+    int cpu_flags; /* set of all used CPU flags */
+} SwsOpChain;
+
+SwsOpChain *ff_sws_op_chain_alloc(void); /* returns a zero-initialized chain from av_mallocz() */
+void ff_sws_op_chain_free(SwsOpChain *chain); /* runs the per-slot free callbacks, then frees the chain; NULL is a no-op */
+
+/* Returns 0 on success, or a negative error code.
+ * AVERROR(EINVAL) is returned when the chain already holds SWS_MAX_OPS
+ * kernels. */
+int ff_sws_op_chain_append(SwsOpChain *chain, SwsFuncPtr func,
+                           void (*free)(void *), SwsOpPriv priv);
+
+typedef struct SwsOpEntry {
+    SwsOp op; /* reference operation that requested ops are matched against */
+    SwsFuncPtr func; /* kernel appended to the chain when this entry is chosen */
+    bool flexible; /* if true, the op body is not compared; only op, pixel type and component masks are matched */
+    int (*setup)(const SwsOp *op, SwsOpPriv *out); /* optional */
+    void (*free)(void *priv); /* optional; stored in the chain and called on priv.ptr by ff_sws_op_chain_free() */
+} SwsOpEntry;
+
+typedef struct SwsOpTable {
+    unsigned cpu_flags; /* required CPU flags for this table */
+    int block_size; /* fixed block size of this table */
+    const SwsOpEntry *entries[]; /* terminated by NULL */
+} SwsOpTable;
+
+/**
+ * "Compile" a single op by looking it up in a list of fixed size op tables.
+ * See `op_match` in `ops_chain.c` for details on how the matching works.
+ *
+ * The first op of `ops` is matched and, on success, removed from the list
+ * (`ops->ops` is advanced and `ops->num_ops` decremented).
+ *
+ * Returns 0, AVERROR(EAGAIN), or a negative error code.
+ * 0 means the list is now empty, AVERROR(EAGAIN) means more ops remain, and
+ * AVERROR(ENOTSUP) means no table entry matched.
+ */
+int ff_sws_op_compile_tables(const SwsOpTable *const tables[], int num_tables,
+                             SwsOpList *ops, const int block_size,
+                             SwsOpChain *chain);
+
+#endif
-- 2.49.0 _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
next prev parent reply other threads:[~2025-05-21 12:49 UTC|newest] Thread overview: 21+ messages / expand[flat|nested] mbox.gz Atom feed top 2025-05-21 12:43 [FFmpeg-devel] [PATCH v2 00/17] swscale: new ops framework Niklas Haas 2025-05-21 12:43 ` [FFmpeg-devel] [PATCH v2 01/17] swscale/format: rename legacy format conversion table Niklas Haas 2025-05-21 12:43 ` [FFmpeg-devel] [PATCH v2 02/17] swscale/format: add ff_fmt_clear() Niklas Haas 2025-05-21 12:43 ` [FFmpeg-devel] [PATCH v2 03/17] tests/checkasm: increase number of runs in between measurements Niklas Haas 2025-05-21 12:43 ` [FFmpeg-devel] [PATCH v2 04/17] tests/checkasm: generalize DEF_CHECKASM_CHECK_FUNC to floats Niklas Haas 2025-05-21 12:43 ` [FFmpeg-devel] [PATCH v2 05/17] swscale: add SWS_UNSTABLE flag Niklas Haas 2025-05-21 12:43 ` [FFmpeg-devel] [PATCH v2 06/17] swscale/ops: introduce new low level framework Niklas Haas 2025-05-21 12:43 ` [FFmpeg-devel] [PATCH v2 07/17] swscale/optimizer: add high-level ops optimizer Niklas Haas 2025-05-21 12:43 ` [FFmpeg-devel] [PATCH v2 08/17] swscale/ops_internal: add internal ops backend API Niklas Haas 2025-05-23 16:27 ` Michael Niedermayer 2025-05-23 16:52 ` Niklas Haas 2025-05-21 12:43 ` [FFmpeg-devel] [PATCH v2 09/17] swscale/ops: add dispatch layer Niklas Haas 2025-05-21 12:43 ` [FFmpeg-devel] [PATCH v2 10/17] swscale/optimizer: add packed shuffle solver Niklas Haas 2025-05-21 12:43 ` Niklas Haas [this message] 2025-05-21 12:43 ` [FFmpeg-devel] [PATCH v2 12/17] swscale/ops_backend: add reference backend basend on C templates Niklas Haas 2025-05-21 12:43 ` [FFmpeg-devel] [PATCH v2 13/17] swscale/ops_memcpy: add 'memcpy' backend for plane->plane copies Niklas Haas 2025-05-21 12:44 ` [FFmpeg-devel] [PATCH v2 14/17] swscale/x86: add SIMD backend Niklas Haas 2025-05-21 14:11 ` Kieran Kunhya via ffmpeg-devel 2025-05-21 12:44 ` [FFmpeg-devel] [PATCH v2 15/17] tests/checkasm: add checkasm tests for swscale ops Niklas Haas 2025-05-21 12:44 ` [FFmpeg-devel] [PATCH 
v2 16/17] swscale/format: add new format decode/encode logic Niklas Haas 2025-05-21 12:44 ` [FFmpeg-devel] [PATCH v2 17/17] swscale/graph: allow experimental use of new format handler Niklas Haas
Reply instructions: You may reply publicly to this message via plain-text email using any one of the following methods: * Save the following mbox file, import it into your mail client, and reply-to-all from there: mbox Avoid top-posting and favor interleaved quoting: https://en.wikipedia.org/wiki/Posting_style#Interleaved_style * Reply using the --to, --cc, and --in-reply-to switches of git-send-email(1): git send-email \ --in-reply-to=20250521124824.49657-12-ffmpeg@haasn.xyz \ --to=ffmpeg@haasn.xyz \ --cc=ffmpeg-devel@ffmpeg.org \ --cc=git@haasn.dev \ /path/to/YOUR_REPLY https://kernel.org/pub/software/scm/git/docs/git-send-email.html * If your mail client supports setting the In-Reply-To header via mailto: links, try the mailto: link
Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel This inbox may be cloned and mirrored by anyone: git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git # If you have public-inbox 1.1+ installed, you may # initialize and index your mirror using the following commands: public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \ ffmpegdev@gitmailbox.com public-inbox-index ffmpegdev Example config snippet for mirrors. AGPL code for this site: git clone https://public-inbox.org/public-inbox.git