[FFmpeg-devel] [PATCH v6 11/18] swscale/optimizer: add packed shuffle solver

From: Niklas Haas <ffmpeg@haasn.xyz>
To: ffmpeg-devel@ffmpeg.org
Cc: Niklas Haas <git@haasn.dev>
Subject: [FFmpeg-devel] [PATCH v6 11/18] swscale/optimizer: add packed shuffle solver
Date: Wed, 11 Jun 2025 14:47:17 +0200
Message-ID: <20250611124805.73196-12-ffmpeg@haasn.xyz> (raw)
In-Reply-To: <20250611124805.73196-1-ffmpeg@haasn.xyz>

From: Niklas Haas <git@haasn.dev>

This can turn any compatible sequence of operations into a single packed
shuffle, including packed swizzling, grayscale->RGB conversion, endianness
swapping, RGB bit depth conversions, rgb24->rgb0 alpha clearing and more.
---
 libswscale/ops_internal.h  | 28 +++++++++++
 libswscale/ops_optimizer.c | 96 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 124 insertions(+)

diff --git a/libswscale/ops_internal.h b/libswscale/ops_internal.h
index 2fbd8a55d0..e7b6fb1c4c 100644
--- a/libswscale/ops_internal.h
+++ b/libswscale/ops_internal.h
@@ -109,4 +109,32 @@ int ff_sws_ops_compile_backend(SwsContext *ctx, const SwsOpBackend *backend,
  */
 int ff_sws_ops_compile(SwsContext *ctx, const SwsOpList *ops, SwsCompiledOp *out);
 
+/**
+ * "Solve" an op list into a fixed shuffle mask, with an optional ability to
+ * also directly clear the output value (for e.g. rgb24 -> rgb0). This can
+ * accept any operation chain that only consists of the following operations:
+ *
+ * - SWS_OP_READ (packed or single-component, non-fractional)
+ * - SWS_OP_SWIZZLE
+ * - SWS_OP_SWAP_BYTES
+ * - SWS_OP_CLEAR to zero (when clear_val is specified)
+ * - SWS_OP_CONVERT (integer expand)
+ * - SWS_OP_WRITE (packed or single-component, non-fractional)
+ *
+ * Basically, any operation that purely consists of moving around and
+ * reordering bytes within a single plane can be turned into a shuffle mask.
+ *
+ * @param ops         The operation list to decompose.
+ * @param shuffle     The output shuffle mask.
+ * @param size        The size (in bytes) of the output shuffle mask.
+ * @param clear_val   If nonzero, this index will be used to clear the output.
+ * @param read_bytes  Returns the number of bytes read per shuffle iteration.
+ * @param write_bytes Returns the number of bytes written per shuffle iteration.
+ *
+ * @return  The number of pixels processed per iteration, or a negative error
+ *          code; in particular AVERROR(ENOTSUP) for unsupported operations.
+ */
+int ff_sws_solve_shuffle(const SwsOpList *ops, uint8_t shuffle[], int size,
+                         uint8_t clear_val, int *read_bytes, int *write_bytes);
+
 #endif
diff --git a/libswscale/ops_optimizer.c b/libswscale/ops_optimizer.c
index 80e75ed34b..23c9aae76d 100644
--- a/libswscale/ops_optimizer.c
+++ b/libswscale/ops_optimizer.c
@@ -19,9 +19,11 @@
  */
 
 #include "libavutil/avassert.h"
+#include <libavutil/bswap.h>
 #include "libavutil/rational.h"
 
 #include "ops.h"
+#include "ops_internal.h"
 
 #define Q(N) ((AVRational) { N, 1 })
 
@@ -780,3 +782,97 @@ retry:
 
     return 0;
 }
+
+int ff_sws_solve_shuffle(const SwsOpList *const ops, uint8_t shuffle[],
+                         int shuffle_size, uint8_t clear_val,
+                         int *out_read_bytes, int *out_write_bytes)
+{
+    /* Reject an empty list before ops->ops[0] is ever dereferenced */
+    if (!ops->num_ops || ops->ops[0].op != SWS_OP_READ)
+        return AVERROR(EINVAL);
+    const SwsOp read = ops->ops[0];
+    const int read_size = ff_sws_pixel_type_size(read.type);
+    if (read.rw.frac || (!read.rw.packed && read.rw.elems > 1))
+        return AVERROR(ENOTSUP);
+    /* Byte b of mask[i] is the source offset of byte b of component i */
+    uint32_t mask[4] = {0};
+    for (int i = 0; i < read.rw.elems; i++)
+        mask[i] = 0x01010101 * i * read_size + 0x03020100;
+
+    for (int opidx = 1; opidx < ops->num_ops; opidx++) {
+        const SwsOp *op = &ops->ops[opidx];
+        switch (op->op) {
+        case SWS_OP_SWIZZLE: {
+            uint32_t orig[4] = { mask[0], mask[1], mask[2], mask[3] };
+            for (int i = 0; i < 4; i++)
+                mask[i] = orig[op->swizzle.in[i]];
+            break;
+        }
+
+        case SWS_OP_SWAP_BYTES:
+            for (int i = 0; i < 4; i++) {
+                switch (ff_sws_pixel_type_size(op->type)) {
+                case 2: mask[i] = av_bswap16(mask[i]); break;
+                case 4: mask[i] = av_bswap32(mask[i]); break;
+                }
+            }
+            break;
+
+        case SWS_OP_CLEAR:
+            /* Only clearing to zero is supported, and only with a clear_val */
+            for (int i = 0; i < 4; i++) {
+                if (!op->c.q4[i].den)
+                    continue;
+                if (op->c.q4[i].num != 0 || !clear_val)
+                    return AVERROR(ENOTSUP);
+                mask[i] = 0x1010101ul * clear_val;
+            }
+            break;
+
+        case SWS_OP_CONVERT:
+            if (!op->convert.expand)
+                return AVERROR(ENOTSUP);
+            /* Repeat the low source indices to fill the whole 32-bit mask */
+            for (int i = 0; i < 4; i++) {
+                switch (ff_sws_pixel_type_size(op->type)) {
+                case 1: mask[i] = 0x01010101 * (mask[i] & 0xFF);   break;
+                case 2: mask[i] = 0x00010001 * (mask[i] & 0xFFFF); break;
+                }
+            }
+            break;
+
+        case SWS_OP_WRITE: {
+            if (op->rw.frac || (!op->rw.packed && op->rw.elems > 1))
+                return AVERROR(ENOTSUP);
+            /* Bytes not covered by a complete group stay at clear_val */
+            memset(shuffle, clear_val, shuffle_size);
+            const int write_size  = ff_sws_pixel_type_size(op->type);
+            const int read_chunk  = read.rw.elems * read_size;
+            const int write_chunk = op->rw.elems * write_size;
+            const int num_groups  = shuffle_size / FFMAX(read_chunk, write_chunk);
+            for (int n = 0; n < num_groups; n++) {
+                const int base_in  = n * read_chunk;
+                const int base_out = n * write_chunk;
+                for (int i = 0; i < op->rw.elems; i++) {
+                    const int offset = base_out + i * write_size;
+                    for (int b = 0; b < write_size; b++) {
+                        const uint8_t idx = mask[i] >> (b * 8);
+                        /* clear_val == 0: index 0 is an ordinary source byte */
+                        if (!clear_val || idx != clear_val)
+                            shuffle[offset + b] = base_in + idx;
+                    }
+                }
+            }
+
+            *out_read_bytes  = num_groups * read_chunk;
+            *out_write_bytes = num_groups * write_chunk;
+            return num_groups;
+        }
+
+        default:
+            return AVERROR(ENOTSUP);
+        }
+    }
+    /* Reached the end of the list without encountering SWS_OP_WRITE */
+    return AVERROR(EINVAL);
+}
-- 
2.49.0

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".