[FFmpeg-devel] [PATCH 1/3] vulkan: Support samplerless images

Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
 help / color / mirror / Atom feed

* [FFmpeg-devel] [PATCH 1/3] vulkan: Support samplerless images
@ 2025-09-04 20:10 IndecisiveTurtle via ffmpeg-devel
  2025-09-04 20:10 ` [FFmpeg-devel] [PATCH 2/3] lavc: Split out common components used by vulkan prores encoder IndecisiveTurtle via ffmpeg-devel
  2025-09-04 20:10 ` [FFmpeg-devel] [PATCH 3/3] lavc: implement a Vulkan-based " IndecisiveTurtle via ffmpeg-devel
  0 siblings, 2 replies; 3+ messages in thread
From: IndecisiveTurtle via ffmpeg-devel @ 2025-09-04 20:10 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: IndecisiveTurtle

From: IndecisiveTurtle <geoster3d@gmail.com>

---
 libavutil/vulkan.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/libavutil/vulkan.c b/libavutil/vulkan.c
index ef755ad6f7..74eab88434 100644
--- a/libavutil/vulkan.c
+++ b/libavutil/vulkan.c
@@ -2507,7 +2507,8 @@ print:
         GLSLA("layout (set = %i, binding = %i", FFMAX(shd->nb_descriptor_sets - 1, 0), i);
 
         if (desc[i].mem_layout &&
-            (desc[i].type != VK_DESCRIPTOR_TYPE_STORAGE_IMAGE))
+            (desc[i].type != VK_DESCRIPTOR_TYPE_STORAGE_IMAGE) &&
+            (desc[i].type != VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE))
             GLSLA(", %s", desc[i].mem_layout);
 
         GLSLA(")");
@@ -2520,7 +2521,7 @@ print:
 
         if (prop->type) {
             GLSLA(" ");
-            if (desc[i].type == VK_DESCRIPTOR_TYPE_STORAGE_IMAGE) {
+            if (desc[i].type == VK_DESCRIPTOR_TYPE_STORAGE_IMAGE || desc[i].type == VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE) {
                 if (desc[i].mem_layout) {
                     int len = strlen(desc[i].mem_layout);
                     if (desc[i].mem_layout[len - 1] == 'i' &&
-- 
2.50.1

_______________________________________________
ffmpeg-devel mailing list -- ffmpeg-devel@ffmpeg.org
To unsubscribe send an email to ffmpeg-devel-leave@ffmpeg.org

^ permalink raw reply	[flat|nested] 3+ messages in thread

* [FFmpeg-devel] [PATCH 2/3] lavc: Split out common components used by vulkan prores encoder
  2025-09-04 20:10 [FFmpeg-devel] [PATCH 1/3] vulkan: Support samplerless images IndecisiveTurtle via ffmpeg-devel
@ 2025-09-04 20:10 ` IndecisiveTurtle via ffmpeg-devel
  2025-09-04 20:10 ` [FFmpeg-devel] [PATCH 3/3] lavc: implement a Vulkan-based " IndecisiveTurtle via ffmpeg-devel
  1 sibling, 0 replies; 3+ messages in thread
From: IndecisiveTurtle via ffmpeg-devel @ 2025-09-04 20:10 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: IndecisiveTurtle

From: IndecisiveTurtle <geoster3d@gmail.com>

---
 libavcodec/Makefile                  |   2 +-
 libavcodec/proresenc_kostya.c        | 414 +--------------------------
 libavcodec/proresenc_kostya_common.c | 364 +++++++++++++++++++++++
 libavcodec/proresenc_kostya_common.h | 131 +++++++++
 4 files changed, 511 insertions(+), 400 deletions(-)
 create mode 100644 libavcodec/proresenc_kostya_common.c
 create mode 100644 libavcodec/proresenc_kostya_common.h

diff --git a/libavcodec/Makefile b/libavcodec/Makefile
index 3d036de4b6..d8e1ac5a54 100644
--- a/libavcodec/Makefile
+++ b/libavcodec/Makefile
@@ -638,7 +638,7 @@ OBJS-$(CONFIG_PPM_ENCODER)             += pnmenc.o
 OBJS-$(CONFIG_PRORES_DECODER)          += proresdec.o proresdsp.o proresdata.o
 OBJS-$(CONFIG_PRORES_ENCODER)          += proresenc_anatoliy.o proresdata.o
 OBJS-$(CONFIG_PRORES_AW_ENCODER)       += proresenc_anatoliy.o proresdata.o
-OBJS-$(CONFIG_PRORES_KS_ENCODER)       += proresenc_kostya.o proresdata.o
+OBJS-$(CONFIG_PRORES_KS_ENCODER)       += proresenc_kostya.o proresdata.o proresenc_kostya_common.o
 OBJS-$(CONFIG_PRORES_RAW_DECODER)      += prores_raw.o proresdsp.o proresdata.o
 OBJS-$(CONFIG_PRORES_VIDEOTOOLBOX_ENCODER) += videotoolboxenc.o
 OBJS-$(CONFIG_PROSUMER_DECODER)        += prosumer.o
diff --git a/libavcodec/proresenc_kostya.c b/libavcodec/proresenc_kostya.c
index b98bc5c195..31d22a14ac 100644
--- a/libavcodec/proresenc_kostya.c
+++ b/libavcodec/proresenc_kostya.c
@@ -33,179 +33,7 @@
 #include "profiles.h"
 #include "bytestream.h"
 #include "proresdata.h"
-
-#define CFACTOR_Y422 2
-#define CFACTOR_Y444 3
-
-#define MAX_MBS_PER_SLICE 8
-
-#define MAX_PLANES 4
-
-enum {
-    PRORES_PROFILE_AUTO  = -1,
-    PRORES_PROFILE_PROXY = 0,
-    PRORES_PROFILE_LT,
-    PRORES_PROFILE_STANDARD,
-    PRORES_PROFILE_HQ,
-    PRORES_PROFILE_4444,
-    PRORES_PROFILE_4444XQ,
-};
-
-enum {
-    QUANT_MAT_PROXY = 0,
-    QUANT_MAT_PROXY_CHROMA,
-    QUANT_MAT_LT,
-    QUANT_MAT_STANDARD,
-    QUANT_MAT_HQ,
-    QUANT_MAT_XQ_LUMA,
-    QUANT_MAT_DEFAULT,
-};
-
-static const uint8_t prores_quant_matrices[][64] = {
-    { // proxy
-         4,  7,  9, 11, 13, 14, 15, 63,
-         7,  7, 11, 12, 14, 15, 63, 63,
-         9, 11, 13, 14, 15, 63, 63, 63,
-        11, 11, 13, 14, 63, 63, 63, 63,
-        11, 13, 14, 63, 63, 63, 63, 63,
-        13, 14, 63, 63, 63, 63, 63, 63,
-        13, 63, 63, 63, 63, 63, 63, 63,
-        63, 63, 63, 63, 63, 63, 63, 63,
-    },
-    { // proxy chromas
-        4,  7,  9, 11, 13, 14, 63, 63,
-        7,  7, 11, 12, 14, 63, 63, 63,
-        9, 11, 13, 14, 63, 63, 63, 63,
-        11, 11, 13, 14, 63, 63, 63, 63,
-        11, 13, 14, 63, 63, 63, 63, 63,
-        13, 14, 63, 63, 63, 63, 63, 63,
-        13, 63, 63, 63, 63, 63, 63, 63,
-        63, 63, 63, 63, 63, 63, 63, 63
-    },
-    { // LT
-         4,  5,  6,  7,  9, 11, 13, 15,
-         5,  5,  7,  8, 11, 13, 15, 17,
-         6,  7,  9, 11, 13, 15, 15, 17,
-         7,  7,  9, 11, 13, 15, 17, 19,
-         7,  9, 11, 13, 14, 16, 19, 23,
-         9, 11, 13, 14, 16, 19, 23, 29,
-         9, 11, 13, 15, 17, 21, 28, 35,
-        11, 13, 16, 17, 21, 28, 35, 41,
-    },
-    { // standard
-         4,  4,  5,  5,  6,  7,  7,  9,
-         4,  4,  5,  6,  7,  7,  9,  9,
-         5,  5,  6,  7,  7,  9,  9, 10,
-         5,  5,  6,  7,  7,  9,  9, 10,
-         5,  6,  7,  7,  8,  9, 10, 12,
-         6,  7,  7,  8,  9, 10, 12, 15,
-         6,  7,  7,  9, 10, 11, 14, 17,
-         7,  7,  9, 10, 11, 14, 17, 21,
-    },
-    { // high quality
-         4,  4,  4,  4,  4,  4,  4,  4,
-         4,  4,  4,  4,  4,  4,  4,  4,
-         4,  4,  4,  4,  4,  4,  4,  4,
-         4,  4,  4,  4,  4,  4,  4,  5,
-         4,  4,  4,  4,  4,  4,  5,  5,
-         4,  4,  4,  4,  4,  5,  5,  6,
-         4,  4,  4,  4,  5,  5,  6,  7,
-         4,  4,  4,  4,  5,  6,  7,  7,
-    },
-    { // XQ luma
-        2,  2,  2,  2,  2,  2,  2,  2,
-        2,  2,  2,  2,  2,  2,  2,  2,
-        2,  2,  2,  2,  2,  2,  2,  2,
-        2,  2,  2,  2,  2,  2,  2,  3,
-        2,  2,  2,  2,  2,  2,  3,  3,
-        2,  2,  2,  2,  2,  3,  3,  3,
-        2,  2,  2,  2,  3,  3,  3,  4,
-        2,  2,  2,  2,  3,  3,  4,  4,
-    },
-    { // codec default
-         4,  4,  4,  4,  4,  4,  4,  4,
-         4,  4,  4,  4,  4,  4,  4,  4,
-         4,  4,  4,  4,  4,  4,  4,  4,
-         4,  4,  4,  4,  4,  4,  4,  4,
-         4,  4,  4,  4,  4,  4,  4,  4,
-         4,  4,  4,  4,  4,  4,  4,  4,
-         4,  4,  4,  4,  4,  4,  4,  4,
-         4,  4,  4,  4,  4,  4,  4,  4,
-    },
-};
-
-#define NUM_MB_LIMITS 4
-static const int prores_mb_limits[NUM_MB_LIMITS] = {
-    1620, // up to 720x576
-    2700, // up to 960x720
-    6075, // up to 1440x1080
-    9216, // up to 2048x1152
-};
-
-static const struct prores_profile {
-    const char *full_name;
-    uint32_t    tag;
-    int         min_quant;
-    int         max_quant;
-    int         br_tab[NUM_MB_LIMITS];
-    int         quant;
-    int         quant_chroma;
-} prores_profile_info[6] = {
-    {
-        .full_name = "proxy",
-        .tag       = MKTAG('a', 'p', 'c', 'o'),
-        .min_quant = 4,
-        .max_quant = 8,
-        .br_tab    = { 300, 242, 220, 194 },
-        .quant     = QUANT_MAT_PROXY,
-        .quant_chroma = QUANT_MAT_PROXY_CHROMA,
-    },
-    {
-        .full_name = "LT",
-        .tag       = MKTAG('a', 'p', 'c', 's'),
-        .min_quant = 1,
-        .max_quant = 9,
-        .br_tab    = { 720, 560, 490, 440 },
-        .quant     = QUANT_MAT_LT,
-        .quant_chroma = QUANT_MAT_LT,
-    },
-    {
-        .full_name = "standard",
-        .tag       = MKTAG('a', 'p', 'c', 'n'),
-        .min_quant = 1,
-        .max_quant = 6,
-        .br_tab    = { 1050, 808, 710, 632 },
-        .quant     = QUANT_MAT_STANDARD,
-        .quant_chroma = QUANT_MAT_STANDARD,
-    },
-    {
-        .full_name = "high quality",
-        .tag       = MKTAG('a', 'p', 'c', 'h'),
-        .min_quant = 1,
-        .max_quant = 6,
-        .br_tab    = { 1566, 1216, 1070, 950 },
-        .quant     = QUANT_MAT_HQ,
-        .quant_chroma = QUANT_MAT_HQ,
-    },
-    {
-        .full_name = "4444",
-        .tag       = MKTAG('a', 'p', '4', 'h'),
-        .min_quant = 1,
-        .max_quant = 6,
-        .br_tab    = { 2350, 1828, 1600, 1425 },
-        .quant     = QUANT_MAT_HQ,
-        .quant_chroma = QUANT_MAT_HQ,
-    },
-    {
-        .full_name = "4444XQ",
-        .tag       = MKTAG('a', 'p', '4', 'x'),
-        .min_quant = 1,
-        .max_quant = 6,
-        .br_tab    = { 3525, 2742, 2400, 2137 },
-        .quant     = QUANT_MAT_HQ, /* Fix me : use QUANT_MAT_XQ_LUMA */
-        .quant_chroma = QUANT_MAT_HQ,
-    }
-};
+#include "proresenc_kostya_common.h"
 
 #define TRELLIS_WIDTH 16
 #define SCORE_LIMIT   INT_MAX / 2
@@ -217,8 +45,6 @@ struct TrellisNode {
     int score;
 };
 
-#define MAX_STORED_Q 16
-
 typedef struct ProresThreadData {
     DECLARE_ALIGNED(16, int16_t, blocks)[MAX_PLANES][64 * 4 * MAX_MBS_PER_SLICE];
     DECLARE_ALIGNED(16, uint16_t, emu_buf)[16 * 16];
@@ -227,49 +53,6 @@ typedef struct ProresThreadData {
     struct TrellisNode *nodes;
 } ProresThreadData;
 
-typedef struct ProresContext {
-    AVClass *class;
-    DECLARE_ALIGNED(16, int16_t, blocks)[MAX_PLANES][64 * 4 * MAX_MBS_PER_SLICE];
-    DECLARE_ALIGNED(16, uint16_t, emu_buf)[16*16];
-    int16_t quants[MAX_STORED_Q][64];
-    int16_t quants_chroma[MAX_STORED_Q][64];
-    int16_t custom_q[64];
-    int16_t custom_chroma_q[64];
-    const uint8_t *quant_mat;
-    const uint8_t *quant_chroma_mat;
-    const uint8_t *scantable;
-
-    void (*fdct)(FDCTDSPContext *fdsp, const uint16_t *src,
-                 ptrdiff_t linesize, int16_t *block);
-    FDCTDSPContext fdsp;
-
-    const AVFrame *pic;
-    int mb_width, mb_height;
-    int mbs_per_slice;
-    int num_chroma_blocks, chroma_factor;
-    int slices_width;
-    int slices_per_picture;
-    int pictures_per_frame; // 1 for progressive, 2 for interlaced
-    int cur_picture_idx;
-    int num_planes;
-    int bits_per_mb;
-    int force_quant;
-    int alpha_bits;
-    int warn;
-
-    char *vendor;
-    int quant_sel;
-
-    int frame_size_upper_bound;
-
-    int profile;
-    const struct prores_profile *profile_info;
-
-    int *slice_q;
-
-    ProresThreadData *tdata;
-} ProresContext;
-
 static void get_slice_data(ProresContext *ctx, const uint16_t *src,
                            ptrdiff_t linesize, int x, int y, int w, int h,
                            int16_t *blocks, uint16_t *emu_buf,
@@ -369,6 +152,8 @@ static void get_alpha_data(ProresContext *ctx, const uint16_t *src,
     }
 }
 
+int slice = 0; /* NOTE(review): new non-static file-scope variable; no use of it is visible in this patch -- confirm it is needed, otherwise drop it or make it static */
+
 /**
  * Write an unsigned rice/exp golomb codeword.
  */
@@ -437,7 +222,6 @@ static void encode_acs(PutBitContext *pb, int16_t *blocks,
     int prev_level = 2;
     int run = 0, level;
     int max_coeffs, abs_level;
-
     max_coeffs = blocks_per_slice << 6;
 
     for (i = 1; i < 64; i++) {
@@ -685,7 +469,6 @@ static int estimate_acs(int *error, int16_t *blocks, int blocks_per_slice,
                 bits += estimate_vlc(ff_prores_run_to_cb[prev_run], run);
                 bits += estimate_vlc(ff_prores_level_to_cb[prev_level],
                                      abs_level - 1) + 1;
-
                 prev_run   = FFMIN(run, 15);
                 prev_level = FFMIN(abs_level, 9);
                 run    = 0;
@@ -905,7 +688,6 @@ static int find_slice_quant(AVCodecContext *avctx,
 
         for (q = min_quant; q < max_quant + 2; q++) {
             cur = trellis_node + q;
-
             bits  = td->nodes[prev].bits + slice_bits[q];
             error = slice_score[q];
             if (bits > bits_limit)
@@ -965,67 +747,33 @@ static int encode_frame(AVCodecContext *avctx, AVPacket *pkt,
                         const AVFrame *pic, int *got_packet)
 {
     ProresContext *ctx = avctx->priv_data;
-    uint8_t *orig_buf, *buf, *slice_hdr, *slice_sizes, *tmp;
+    uint8_t *orig_buf, *buf, *slice_hdr, *slice_sizes;
     uint8_t *picture_size_pos;
     PutBitContext pb;
     int x, y, i, mb, q = 0;
     int sizes[4] = { 0 };
-    int slice_hdr_size = 2 + 2 * (ctx->num_planes - 1);
+    int slice_hdr_size = 2 * ctx->num_planes;
     int frame_size, picture_size, slice_size;
     int pkt_size, ret;
     int max_slice_size = (ctx->frame_size_upper_bound - 200) / (ctx->pictures_per_frame * ctx->slices_per_picture + 1);
-    uint8_t frame_flags;
+    pkt_size = ctx->frame_size_upper_bound;
 
     ctx->pic = pic;
-    pkt_size = ctx->frame_size_upper_bound;
 
     if ((ret = ff_alloc_packet(avctx, pkt, pkt_size + FF_INPUT_BUFFER_MIN_SIZE)) < 0)
         return ret;
 
     orig_buf = pkt->data;
-
-    // frame atom
-    orig_buf += 4;                              // frame size
-    bytestream_put_be32  (&orig_buf, FRAME_ID); // frame container ID
-    buf = orig_buf;
-
-    // frame header
-    tmp = buf;
-    buf += 2;                                   // frame header size will be stored here
-    bytestream_put_be16  (&buf, ctx->chroma_factor != CFACTOR_Y422 || ctx->alpha_bits ? 1 : 0);
-    bytestream_put_buffer(&buf, ctx->vendor, 4);
-    bytestream_put_be16  (&buf, avctx->width);
-    bytestream_put_be16  (&buf, avctx->height);
-
-    frame_flags = ctx->chroma_factor << 6;
-    if (avctx->flags & AV_CODEC_FLAG_INTERLACED_DCT)
-        frame_flags |= (pic->flags & AV_FRAME_FLAG_TOP_FIELD_FIRST) ? 0x04 : 0x08;
-    bytestream_put_byte  (&buf, frame_flags);
-
-    bytestream_put_byte  (&buf, 0);             // reserved
-    bytestream_put_byte  (&buf, pic->color_primaries);
-    bytestream_put_byte  (&buf, pic->color_trc);
-    bytestream_put_byte  (&buf, pic->colorspace);
-    bytestream_put_byte  (&buf, ctx->alpha_bits >> 3);
-    bytestream_put_byte  (&buf, 0);             // reserved
-    if (ctx->quant_sel != QUANT_MAT_DEFAULT) {
-        bytestream_put_byte  (&buf, 0x03);      // matrix flags - both matrices are present
-        bytestream_put_buffer(&buf, ctx->quant_mat, 64);        // luma quantisation matrix
-        bytestream_put_buffer(&buf, ctx->quant_chroma_mat, 64); // chroma quantisation matrix
-    } else {
-        bytestream_put_byte  (&buf, 0x00);      // matrix flags - default matrices are used
-    }
-    bytestream_put_be16  (&tmp, buf - orig_buf); // write back frame header size
+    buf = ff_prores_kostya_write_frame_header(avctx, ctx, &orig_buf, pic->flags,
+                              pic->color_primaries, pic->color_trc,
+                                   pic->colorspace);
 
     for (ctx->cur_picture_idx = 0;
          ctx->cur_picture_idx < ctx->pictures_per_frame;
          ctx->cur_picture_idx++) {
         // picture header
         picture_size_pos = buf + 1;
-        bytestream_put_byte  (&buf, 0x40);          // picture header size (in bits)
-        buf += 4;                                   // picture data size will be stored here
-        bytestream_put_be16  (&buf, ctx->slices_per_picture);
-        bytestream_put_byte  (&buf, av_log2(ctx->mbs_per_slice) << 4); // slice width and height in MBs
+        buf = ff_prores_kostya_write_picture_header(ctx, buf);
 
         // seek table - will be filled during slice encoding
         slice_sizes = buf;
@@ -1048,7 +796,7 @@ static int encode_frame(AVCodecContext *avctx, AVPacket *pkt,
                 while (ctx->mb_width - x < mbs_per_slice)
                     mbs_per_slice >>= 1;
 
-                bytestream_put_byte(&buf, slice_hdr_size << 3);
+                bytestream_put_byte(&buf, slice_hdr_size * 8);
                 slice_hdr = buf;
                 buf += slice_hdr_size - 1;
                 if (pkt_size <= buf - orig_buf + 2 * max_slice_size) {
@@ -1076,13 +824,11 @@ static int encode_frame(AVCodecContext *avctx, AVPacket *pkt,
                         return ret;
 
                     pkt_size += delta;
-                    // restore pointers
                     orig_buf         = pkt->data + (orig_buf         - start);
                     buf              = pkt->data + (buf              - start);
                     picture_size_pos = pkt->data + (picture_size_pos - start);
                     slice_sizes      = pkt->data + (slice_sizes      - start);
                     slice_hdr        = pkt->data + (slice_hdr        - start);
-                    tmp              = pkt->data + (tmp              - start);
                 }
                 init_put_bits(&pb, buf, (pkt_size - (buf - orig_buf)));
                 ret = encode_slice(avctx, pic, &pb, sizes, x, y, q,
@@ -1149,105 +895,18 @@ static void prores_fdct(FDCTDSPContext *fdsp, const uint16_t *src,
 static av_cold int encode_init(AVCodecContext *avctx)
 {
     ProresContext *ctx = avctx->priv_data;
-    int mps;
-    int i, j;
-    int min_quant, max_quant;
-    int interlaced = !!(avctx->flags & AV_CODEC_FLAG_INTERLACED_DCT);
+    int err = 0, i, j, min_quant, max_quant;
 
-    avctx->bits_per_raw_sample = 10;
+    err = ff_prores_kostya_encode_init(avctx, ctx, avctx->pix_fmt);
+    if (err < 0)
+        return err;
 
     ctx->fdct      = prores_fdct;
-    ctx->scantable = interlaced ? ff_prores_interlaced_scan
-                                : ff_prores_progressive_scan;
     ff_fdctdsp_init(&ctx->fdsp, avctx);
 
-    mps = ctx->mbs_per_slice;
-    if (mps & (mps - 1)) {
-        av_log(avctx, AV_LOG_ERROR,
-               "there should be an integer power of two MBs per slice\n");
-        return AVERROR(EINVAL);
-    }
-    if (ctx->profile == PRORES_PROFILE_AUTO) {
-        const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(avctx->pix_fmt);
-        ctx->profile = (desc->flags & AV_PIX_FMT_FLAG_ALPHA ||
-                        !(desc->log2_chroma_w + desc->log2_chroma_h))
-                     ? PRORES_PROFILE_4444 : PRORES_PROFILE_HQ;
-        av_log(avctx, AV_LOG_INFO, "Autoselected %s. It can be overridden "
-               "through -profile option.\n", ctx->profile == PRORES_PROFILE_4444
-               ? "4:4:4:4 profile because of the used input colorspace"
-               : "HQ profile to keep best quality");
-    }
-    if (av_pix_fmt_desc_get(avctx->pix_fmt)->flags & AV_PIX_FMT_FLAG_ALPHA) {
-        if (ctx->profile != PRORES_PROFILE_4444 &&
-            ctx->profile != PRORES_PROFILE_4444XQ) {
-            // force alpha and warn
-            av_log(avctx, AV_LOG_WARNING, "Profile selected will not "
-                   "encode alpha. Override with -profile if needed.\n");
-            ctx->alpha_bits = 0;
-        }
-        if (ctx->alpha_bits & 7) {
-            av_log(avctx, AV_LOG_ERROR, "alpha bits should be 0, 8 or 16\n");
-            return AVERROR(EINVAL);
-        }
-        avctx->bits_per_coded_sample = 32;
-    } else {
-        ctx->alpha_bits = 0;
-    }
-
-    ctx->chroma_factor = avctx->pix_fmt == AV_PIX_FMT_YUV422P10
-                         ? CFACTOR_Y422
-                         : CFACTOR_Y444;
-    ctx->profile_info  = prores_profile_info + ctx->profile;
-    ctx->num_planes    = 3 + !!ctx->alpha_bits;
-
-    ctx->mb_width      = FFALIGN(avctx->width,  16) >> 4;
-
-    if (interlaced)
-        ctx->mb_height = FFALIGN(avctx->height, 32) >> 5;
-    else
-        ctx->mb_height = FFALIGN(avctx->height, 16) >> 4;
-
-    ctx->slices_width  = ctx->mb_width / mps;
-    ctx->slices_width += av_popcount(ctx->mb_width - ctx->slices_width * mps);
-    ctx->slices_per_picture = ctx->mb_height * ctx->slices_width;
-    ctx->pictures_per_frame = 1 + interlaced;
-
-    if (ctx->quant_sel == -1) {
-        ctx->quant_mat = prores_quant_matrices[ctx->profile_info->quant];
-        ctx->quant_chroma_mat = prores_quant_matrices[ctx->profile_info->quant_chroma];
-    } else {
-        ctx->quant_mat = prores_quant_matrices[ctx->quant_sel];
-        ctx->quant_chroma_mat = prores_quant_matrices[ctx->quant_sel];
-    }
-
-    if (strlen(ctx->vendor) != 4) {
-        av_log(avctx, AV_LOG_ERROR, "vendor ID should be 4 bytes\n");
-        return AVERROR_INVALIDDATA;
-    }
-
-    ctx->force_quant = avctx->global_quality / FF_QP2LAMBDA;
     if (!ctx->force_quant) {
-        if (!ctx->bits_per_mb) {
-            for (i = 0; i < NUM_MB_LIMITS - 1; i++)
-                if (prores_mb_limits[i] >= ctx->mb_width * ctx->mb_height *
-                                           ctx->pictures_per_frame)
-                    break;
-            ctx->bits_per_mb   = ctx->profile_info->br_tab[i];
-            if (ctx->alpha_bits)
-                ctx->bits_per_mb *= 20;
-        } else if (ctx->bits_per_mb < 128) {
-            av_log(avctx, AV_LOG_ERROR, "too few bits per MB, please set at least 128\n");
-            return AVERROR_INVALIDDATA;
-        }
-
         min_quant = ctx->profile_info->min_quant;
         max_quant = ctx->profile_info->max_quant;
-        for (i = min_quant; i < MAX_STORED_Q; i++) {
-            for (j = 0; j < 64; j++) {
-                ctx->quants[i][j] = ctx->quant_mat[j] * i;
-                ctx->quants_chroma[i][j] = ctx->quant_chroma_mat[j] * i;
-            }
-        }
 
         ctx->slice_q = av_malloc_array(ctx->slices_per_picture, sizeof(*ctx->slice_q));
         if (!ctx->slice_q)
@@ -1269,51 +928,8 @@ static av_cold int encode_init(AVCodecContext *avctx)
                 ctx->tdata[j].nodes[i].score     = 0;
             }
         }
-    } else {
-        int ls = 0;
-        int ls_chroma = 0;
-
-        if (ctx->force_quant > 64) {
-            av_log(avctx, AV_LOG_ERROR, "too large quantiser, maximum is 64\n");
-            return AVERROR_INVALIDDATA;
-        }
-
-        for (j = 0; j < 64; j++) {
-            ctx->quants[0][j] = ctx->quant_mat[j] * ctx->force_quant;
-            ctx->quants_chroma[0][j] = ctx->quant_chroma_mat[j] * ctx->force_quant;
-            ls += av_log2((1 << 11)  / ctx->quants[0][j]) * 2 + 1;
-            ls_chroma += av_log2((1 << 11)  / ctx->quants_chroma[0][j]) * 2 + 1;
-        }
-
-        ctx->bits_per_mb = ls * 4 + ls_chroma * 4;
-        if (ctx->chroma_factor == CFACTOR_Y444)
-            ctx->bits_per_mb += ls_chroma * 4;
     }
 
-    ctx->frame_size_upper_bound = (ctx->pictures_per_frame *
-                                   ctx->slices_per_picture + 1) *
-                                  (2 + 2 * ctx->num_planes +
-                                   (mps * ctx->bits_per_mb) / 8)
-                                  + 200;
-
-    if (ctx->alpha_bits) {
-         // The alpha plane is run-coded and might exceed the bit budget.
-         ctx->frame_size_upper_bound += (ctx->pictures_per_frame *
-                                         ctx->slices_per_picture + 1) *
-         /* num pixels per slice */     (ctx->mbs_per_slice * 256 *
-         /* bits per pixel */            (1 + ctx->alpha_bits + 1) + 7 >> 3);
-    }
-
-    avctx->codec_tag   = ctx->profile_info->tag;
-    avctx->profile = ctx->profile;
-
-    av_log(avctx, AV_LOG_DEBUG,
-           "profile %d, %d slices, interlacing: %s, %d bits per MB\n",
-           ctx->profile, ctx->slices_per_picture * ctx->pictures_per_frame,
-           interlaced ? "yes" : "no", ctx->bits_per_mb);
-    av_log(avctx, AV_LOG_DEBUG, "frame size upper bound: %d\n",
-           ctx->frame_size_upper_bound);
-
     return 0;
 }
 
diff --git a/libavcodec/proresenc_kostya_common.c b/libavcodec/proresenc_kostya_common.c
new file mode 100644
index 0000000000..d432d10369
--- /dev/null
+++ b/libavcodec/proresenc_kostya_common.c
@@ -0,0 +1,364 @@
+/*
+ * Apple ProRes encoder
+ *
+ * Copyright (c) 2011 Anatoliy Wasserman
+ * Copyright (c) 2012 Konstantin Shishkov
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/pixdesc.h"
+#include "avcodec.h"
+#include "bytestream.h"
+#include "proresdata.h"
+#include <sys/types.h>
+#include "proresenc_kostya_common.h"
+
+static const uint8_t prores_quant_matrices[][64] = { // one 64-entry matrix per QUANT_MAT_* index
+    { // proxy (QUANT_MAT_PROXY)
+         4,  7,  9, 11, 13, 14, 15, 63,
+         7,  7, 11, 12, 14, 15, 63, 63,
+         9, 11, 13, 14, 15, 63, 63, 63,
+        11, 11, 13, 14, 63, 63, 63, 63,
+        11, 13, 14, 63, 63, 63, 63, 63,
+        13, 14, 63, 63, 63, 63, 63, 63,
+        13, 63, 63, 63, 63, 63, 63, 63,
+        63, 63, 63, 63, 63, 63, 63, 63,
+    },
+    { // proxy chromas (QUANT_MAT_PROXY_CHROMA)
+        4,  7,  9, 11, 13, 14, 63, 63,
+        7,  7, 11, 12, 14, 63, 63, 63,
+        9, 11, 13, 14, 63, 63, 63, 63,
+        11, 11, 13, 14, 63, 63, 63, 63,
+        11, 13, 14, 63, 63, 63, 63, 63,
+        13, 14, 63, 63, 63, 63, 63, 63,
+        13, 63, 63, 63, 63, 63, 63, 63,
+        63, 63, 63, 63, 63, 63, 63, 63
+    },
+    { // LT (QUANT_MAT_LT)
+         4,  5,  6,  7,  9, 11, 13, 15,
+         5,  5,  7,  8, 11, 13, 15, 17,
+         6,  7,  9, 11, 13, 15, 15, 17,
+         7,  7,  9, 11, 13, 15, 17, 19,
+         7,  9, 11, 13, 14, 16, 19, 23,
+         9, 11, 13, 14, 16, 19, 23, 29,
+         9, 11, 13, 15, 17, 21, 28, 35,
+        11, 13, 16, 17, 21, 28, 35, 41,
+    },
+    { // standard (QUANT_MAT_STANDARD)
+         4,  4,  5,  5,  6,  7,  7,  9,
+         4,  4,  5,  6,  7,  7,  9,  9,
+         5,  5,  6,  7,  7,  9,  9, 10,
+         5,  5,  6,  7,  7,  9,  9, 10,
+         5,  6,  7,  7,  8,  9, 10, 12,
+         6,  7,  7,  8,  9, 10, 12, 15,
+         6,  7,  7,  9, 10, 11, 14, 17,
+         7,  7,  9, 10, 11, 14, 17, 21,
+    },
+    { // high quality (QUANT_MAT_HQ)
+         4,  4,  4,  4,  4,  4,  4,  4,
+         4,  4,  4,  4,  4,  4,  4,  4,
+         4,  4,  4,  4,  4,  4,  4,  4,
+         4,  4,  4,  4,  4,  4,  4,  5,
+         4,  4,  4,  4,  4,  4,  5,  5,
+         4,  4,  4,  4,  4,  5,  5,  6,
+         4,  4,  4,  4,  5,  5,  6,  7,
+         4,  4,  4,  4,  5,  6,  7,  7,
+    },
+    { // XQ luma (QUANT_MAT_XQ_LUMA); not referenced by the profile table in this file
+        2,  2,  2,  2,  2,  2,  2,  2,
+        2,  2,  2,  2,  2,  2,  2,  2,
+        2,  2,  2,  2,  2,  2,  2,  2,
+        2,  2,  2,  2,  2,  2,  2,  3,
+        2,  2,  2,  2,  2,  2,  3,  3,
+        2,  2,  2,  2,  2,  3,  3,  3,
+        2,  2,  2,  2,  3,  3,  3,  4,
+        2,  2,  2,  2,  3,  3,  4,  4,
+    },
+    { // codec default (QUANT_MAT_DEFAULT)
+         4,  4,  4,  4,  4,  4,  4,  4,
+         4,  4,  4,  4,  4,  4,  4,  4,
+         4,  4,  4,  4,  4,  4,  4,  4,
+         4,  4,  4,  4,  4,  4,  4,  4,
+         4,  4,  4,  4,  4,  4,  4,  4,
+         4,  4,  4,  4,  4,  4,  4,  4,
+         4,  4,  4,  4,  4,  4,  4,  4,
+         4,  4,  4,  4,  4,  4,  4,  4,
+    },
+};
+
+static const int prores_mb_limits[NUM_MB_LIMITS] = { // macroblock-count tiers; the tier found at init indexes br_tab[]
+    1620, // up to 720x576
+    2700, // up to 960x720
+    6075, // up to 1440x1080
+    9216, // up to 2048x1152 (the init loop stops at NUM_MB_LIMITS - 1, so this entry is never compared; the last tier is the fallback)
+};
+
+static const prores_profile prores_profile_info[6] = { // indexed by ctx->profile at init (prores_profile_info + ctx->profile)
+    {
+        .full_name = "proxy",
+        .tag       = MKTAG('a', 'p', 'c', 'o'),
+        .min_quant = 4, // first quantiser for which the init code precomputes scaled matrices
+        .max_quant = 8,
+        .br_tab    = { 300, 242, 220, 194 }, // default bits per MB, one entry per prores_mb_limits tier
+        .quant     = QUANT_MAT_PROXY, // row of prores_quant_matrices used for ctx->quant_mat
+        .quant_chroma = QUANT_MAT_PROXY_CHROMA, // row of prores_quant_matrices used for ctx->quant_chroma_mat
+    },
+    {
+        .full_name = "LT",
+        .tag       = MKTAG('a', 'p', 'c', 's'),
+        .min_quant = 1,
+        .max_quant = 9,
+        .br_tab    = { 720, 560, 490, 440 },
+        .quant     = QUANT_MAT_LT,
+        .quant_chroma = QUANT_MAT_LT,
+    },
+    {
+        .full_name = "standard",
+        .tag       = MKTAG('a', 'p', 'c', 'n'),
+        .min_quant = 1,
+        .max_quant = 6,
+        .br_tab    = { 1050, 808, 710, 632 },
+        .quant     = QUANT_MAT_STANDARD,
+        .quant_chroma = QUANT_MAT_STANDARD,
+    },
+    {
+        .full_name = "high quality",
+        .tag       = MKTAG('a', 'p', 'c', 'h'),
+        .min_quant = 1,
+        .max_quant = 6,
+        .br_tab    = { 1566, 1216, 1070, 950 },
+        .quant     = QUANT_MAT_HQ,
+        .quant_chroma = QUANT_MAT_HQ,
+    },
+    {
+        .full_name = "4444",
+        .tag       = MKTAG('a', 'p', '4', 'h'),
+        .min_quant = 1,
+        .max_quant = 6,
+        .br_tab    = { 2350, 1828, 1600, 1425 },
+        .quant     = QUANT_MAT_HQ,
+        .quant_chroma = QUANT_MAT_HQ,
+    },
+    {
+        .full_name = "4444XQ",
+        .tag       = MKTAG('a', 'p', '4', 'x'),
+        .min_quant = 1,
+        .max_quant = 6,
+        .br_tab    = { 3525, 2742, 2400, 2137 },
+        .quant     = QUANT_MAT_HQ, /* Fix me : use QUANT_MAT_XQ_LUMA */
+        .quant_chroma = QUANT_MAT_HQ,
+    }
+};
+
+av_cold int ff_prores_kostya_encode_init(AVCodecContext *avctx, ProresContext *ctx,
+                                         enum AVPixelFormat pix_fmt)
+{
+    int mps, i, j, min_quant;
+    int interlaced = !!(avctx->flags & AV_CODEC_FLAG_INTERLACED_DCT);
+
+    avctx->bits_per_raw_sample = 10;
+
+    ctx->scantable = interlaced ? ff_prores_interlaced_scan
+                                : ff_prores_progressive_scan;
+
+    mps = ctx->mbs_per_slice;
+    if (mps & (mps - 1)) {
+        av_log(avctx, AV_LOG_ERROR,
+               "there should be an integer power of two MBs per slice\n");
+        return AVERROR(EINVAL);
+    }
+    if (ctx->profile == PRORES_PROFILE_AUTO) {
+        const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(pix_fmt);
+        ctx->profile = (desc->flags & AV_PIX_FMT_FLAG_ALPHA ||
+                        !(desc->log2_chroma_w + desc->log2_chroma_h))
+                     ? PRORES_PROFILE_4444 : PRORES_PROFILE_HQ;
+        av_log(avctx, AV_LOG_INFO, "Autoselected %s. It can be overridden "
+               "through -profile option.\n", ctx->profile == PRORES_PROFILE_4444
+               ? "4:4:4:4 profile because of the used input colorspace"
+               : "HQ profile to keep best quality");
+    }
+    if (av_pix_fmt_desc_get(pix_fmt)->flags & AV_PIX_FMT_FLAG_ALPHA) {
+        if (ctx->profile != PRORES_PROFILE_4444 &&
+            ctx->profile != PRORES_PROFILE_4444XQ) {
+            // force alpha and warn
+            av_log(avctx, AV_LOG_WARNING, "Profile selected will not "
+                   "encode alpha. Override with -profile if needed.\n");
+            ctx->alpha_bits = 0;
+        }
+        if (ctx->alpha_bits & 7) {
+            av_log(avctx, AV_LOG_ERROR, "alpha bits should be 0, 8 or 16\n");
+            return AVERROR(EINVAL);
+        }
+        avctx->bits_per_coded_sample = 32;
+    } else {
+        ctx->alpha_bits = 0;
+    }
+
+    ctx->chroma_factor = pix_fmt == AV_PIX_FMT_YUV422P10
+                         ? CFACTOR_Y422
+                         : CFACTOR_Y444;
+    ctx->profile_info  = prores_profile_info + ctx->profile;
+    ctx->num_planes    = 3 + !!ctx->alpha_bits;
+
+    ctx->mb_width      = FFALIGN(avctx->width,  16) >> 4;
+
+    if (interlaced)
+        ctx->mb_height = FFALIGN(avctx->height, 32) >> 5;
+    else
+        ctx->mb_height = FFALIGN(avctx->height, 16) >> 4;
+
+    ctx->slices_width  = ctx->mb_width / mps;
+    ctx->slices_width += av_popcount(ctx->mb_width - ctx->slices_width * mps);
+    ctx->slices_per_picture = ctx->mb_height * ctx->slices_width;
+    ctx->pictures_per_frame = 1 + interlaced;
+
+    if (ctx->quant_sel == -1) {
+        ctx->quant_mat = prores_quant_matrices[ctx->profile_info->quant];
+        ctx->quant_chroma_mat = prores_quant_matrices[ctx->profile_info->quant_chroma];
+    } else {
+        ctx->quant_mat = prores_quant_matrices[ctx->quant_sel];
+        ctx->quant_chroma_mat = prores_quant_matrices[ctx->quant_sel];
+    }
+
+    if (strlen(ctx->vendor) != 4) {
+        av_log(avctx, AV_LOG_ERROR, "vendor ID should be 4 bytes\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    ctx->force_quant = avctx->global_quality / FF_QP2LAMBDA;
+    if (!ctx->force_quant) {
+        if (!ctx->bits_per_mb) {
+            for (i = 0; i < NUM_MB_LIMITS - 1; i++)
+                if (prores_mb_limits[i] >= ctx->mb_width * ctx->mb_height *
+                                           ctx->pictures_per_frame)
+                    break;
+            ctx->bits_per_mb   = ctx->profile_info->br_tab[i];
+            if (ctx->alpha_bits)
+                ctx->bits_per_mb *= 20;
+        } else if (ctx->bits_per_mb < 128) {
+            av_log(avctx, AV_LOG_ERROR, "too few bits per MB, please set at least 128\n");
+            return AVERROR_INVALIDDATA;
+        }
+
+        min_quant = ctx->profile_info->min_quant;
+        for (i = min_quant; i < MAX_STORED_Q; i++) {
+            for (j = 0; j < 64; j++) {
+                ctx->quants[i][j] = ctx->quant_mat[j] * i;
+                ctx->quants_chroma[i][j] = ctx->quant_chroma_mat[j] * i;
+            }
+        }
+    } else {
+        int ls = 0;
+        int ls_chroma = 0;
+
+        if (ctx->force_quant > 64) {
+            av_log(avctx, AV_LOG_ERROR, "too large quantiser, maximum is 64\n");
+            return AVERROR_INVALIDDATA;
+        }
+
+        for (j = 0; j < 64; j++) {
+            ctx->quants[0][j] = ctx->quant_mat[j] * ctx->force_quant;
+            ctx->quants_chroma[0][j] = ctx->quant_chroma_mat[j] * ctx->force_quant;
+            ls += av_log2((1 << 11)  / ctx->quants[0][j]) * 2 + 1;
+            ls_chroma += av_log2((1 << 11)  / ctx->quants_chroma[0][j]) * 2 + 1;
+        }
+
+        ctx->bits_per_mb = ls * 4 + ls_chroma * 4;
+        if (ctx->chroma_factor == CFACTOR_Y444)
+            ctx->bits_per_mb += ls_chroma * 4;
+    }
+
+    ctx->frame_size_upper_bound = (ctx->pictures_per_frame *
+                                   ctx->slices_per_picture + 1) *
+                                  (2 + 2 * ctx->num_planes +
+                                   (mps * ctx->bits_per_mb) / 8)
+                                  + 200;
+
+    if (ctx->alpha_bits) {
+         // The alpha plane is run-coded and might exceed the bit budget.
+         ctx->frame_size_upper_bound += (ctx->pictures_per_frame *
+                                         ctx->slices_per_picture + 1) *
+         /* num pixels per slice */     (ctx->mbs_per_slice * 256 *
+         /* bits per pixel */            (1 + ctx->alpha_bits + 1) + 7 >> 3);
+    }
+
+    avctx->codec_tag   = ctx->profile_info->tag;
+    avctx->profile = ctx->profile;
+
+    av_log(avctx, AV_LOG_DEBUG,
+           "profile %d, %d slices, interlacing: %s, %d bits per MB\n",
+           ctx->profile, ctx->slices_per_picture * ctx->pictures_per_frame,
+           interlaced ? "yes" : "no", ctx->bits_per_mb);
+    av_log(avctx, AV_LOG_DEBUG, "frame size upper bound: %d\n",
+           ctx->frame_size_upper_bound);
+
+    return 0;
+}
+
+uint8_t* ff_prores_kostya_write_frame_header(AVCodecContext *avctx, ProresContext *ctx,
+                                             uint8_t **orig_buf, int flags,
+                                             enum AVColorPrimaries color_primaries,
+                                             enum AVColorTransferCharacteristic color_trc,
+                                             enum AVColorSpace colorspace)
+{
+    /* Emits the frame container ID and the frame header at *orig_buf and
+     * returns the write position just past the header. The first 4 bytes
+     * (frame size) are only skipped here, not written. */
+    const int interlaced  = avctx->flags & AV_CODEC_FLAG_INTERLACED_DCT;
+    const int custom_mats = ctx->quant_sel != QUANT_MAT_DEFAULT;
+    uint8_t *hdr, *wp, *size_slot;
+    uint8_t frame_flags = ctx->chroma_factor << 6;
+
+    *orig_buf += 4;                             // room for the frame size
+    bytestream_put_be32(orig_buf, FRAME_ID);    // frame container ID
+    hdr = size_slot = *orig_buf;                // header begins with its own 16-bit size
+    wp  = hdr + 2;
+
+    bytestream_put_be16(&wp, (ctx->chroma_factor != CFACTOR_Y422 || ctx->alpha_bits) ? 1 : 0);
+    bytestream_put_buffer(&wp, (uint8_t*)ctx->vendor, 4);
+    bytestream_put_be16(&wp, avctx->width);
+    bytestream_put_be16(&wp, avctx->height);
+
+    if (interlaced)
+        frame_flags |= (flags & AV_FRAME_FLAG_TOP_FIELD_FIRST) ? 0x04 : 0x08;
+    bytestream_put_byte(&wp, frame_flags);
+
+    bytestream_put_byte(&wp, 0);                // reserved
+    bytestream_put_byte(&wp, color_primaries);
+    bytestream_put_byte(&wp, color_trc);
+    bytestream_put_byte(&wp, colorspace);
+    bytestream_put_byte(&wp, ctx->alpha_bits >> 3);
+    bytestream_put_byte(&wp, 0);                // reserved
+    bytestream_put_byte(&wp, custom_mats ? 0x03 : 0x00); // matrix flags: both present / defaults
+    if (custom_mats) {
+        bytestream_put_buffer(&wp, ctx->quant_mat, 64);        // luma quantisation matrix
+        bytestream_put_buffer(&wp, ctx->quant_chroma_mat, 64); // chroma quantisation matrix
+    }
+
+    bytestream_put_be16(&size_slot, wp - hdr);  // write back frame header size
+    return wp;
+}
+
+uint8_t* ff_prores_kostya_write_picture_header(ProresContext *ctx, uint8_t *buf)
+{
+    uint8_t *wp = buf + 5;      // 1 header-size byte + 4 bytes where the picture data size will be stored
+    *buf = 0x40;                // picture header size (in bits)
+    bytestream_put_be16(&wp, ctx->slices_per_picture);
+    bytestream_put_byte(&wp, av_log2(ctx->mbs_per_slice) << 4); // slice width and height in MBs
+    return wp;                  // first byte after the picture header
+}
\ No newline at end of file
diff --git a/libavcodec/proresenc_kostya_common.h b/libavcodec/proresenc_kostya_common.h
new file mode 100644
index 0000000000..f18adc36af
--- /dev/null
+++ b/libavcodec/proresenc_kostya_common.h
@@ -0,0 +1,131 @@
+/*
+ * Apple ProRes encoder
+ *
+ * Copyright (c) 2011 Anatoliy Wasserman
+ * Copyright (c) 2012 Konstantin Shishkov
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_PRORESENC_KOSTYA_COMMON_H
+#define AVCODEC_PRORESENC_KOSTYA_COMMON_H
+#include "libavutil/attributes_internal.h"
+#include "libavutil/mem_internal.h"
+#include "libavutil/pixfmt.h"
+#include "avcodec.h" // AVCodecContext, AVFrame and AVClass are used below
+#include "fdctdsp.h"
+
+FF_VISIBILITY_PUSH_HIDDEN
+
+#define CFACTOR_Y422 2 // chroma_factor used for AV_PIX_FMT_YUV422P10 input
+#define CFACTOR_Y444 3 // chroma_factor used for every other input format
+
+#define MAX_MBS_PER_SLICE 8 // sizes the per-plane coefficient storage in ProresContext.blocks
+
+#define MAX_PLANES 4 // first dimension of ProresContext.blocks
+
+#define NUM_MB_LIMITS 4 // number of entries in prores_profile.br_tab
+
+#define MAX_STORED_Q 16 // number of quantiser rows in ProresContext.quants / quants_chroma
+
+enum {
+    PRORES_PROFILE_AUTO  = -1, // replaced by 4444 or HQ during encoder init
+    PRORES_PROFILE_PROXY = 0, // non-negative values are used as an offset into prores_profile_info
+    PRORES_PROFILE_LT,
+    PRORES_PROFILE_STANDARD,
+    PRORES_PROFILE_HQ,
+    PRORES_PROFILE_4444,
+    PRORES_PROFILE_4444XQ,
+};
+
+enum {
+    QUANT_MAT_PROXY = 0, // values are used as indices into prores_quant_matrices
+    QUANT_MAT_PROXY_CHROMA,
+    QUANT_MAT_LT,
+    QUANT_MAT_STANDARD,
+    QUANT_MAT_HQ,
+    QUANT_MAT_XQ_LUMA,
+    QUANT_MAT_DEFAULT, // with this selection the frame header carries no explicit matrices
+};
+
+typedef struct prores_profile {
+    const char *full_name;
+    uint32_t    tag; // copied to avctx->codec_tag by the encoder init
+    int         min_quant; // lowest quantiser for which quant tables are precomputed
+    int         max_quant;
+    int         br_tab[NUM_MB_LIMITS]; // default bits_per_mb, chosen by comparing the MB count with prores_mb_limits
+    int         quant; // index of the luma matrix in prores_quant_matrices
+    int         quant_chroma; // index of the chroma matrix in prores_quant_matrices
+} prores_profile;
+
+typedef struct ProresContext {
+    AVClass *class;
+    DECLARE_ALIGNED(16, int16_t, blocks)[MAX_PLANES][64 * 4 * MAX_MBS_PER_SLICE];
+    DECLARE_ALIGNED(16, uint16_t, emu_buf)[16*16];
+    int16_t quants[MAX_STORED_Q][64]; // quant_mat[j] * q per stored q; only row 0 is filled when force_quant is set
+    int16_t quants_chroma[MAX_STORED_Q][64]; // same as quants, built from quant_chroma_mat
+    int16_t custom_q[64];
+    int16_t custom_chroma_q[64];
+    const uint8_t *quant_mat; // luma matrix taken from prores_quant_matrices
+    const uint8_t *quant_chroma_mat; // chroma matrix taken from prores_quant_matrices
+    const uint8_t *scantable;
+
+    void (*fdct)(FDCTDSPContext *fdsp, const uint16_t *src,
+                 ptrdiff_t linesize, int16_t *block);
+    FDCTDSPContext fdsp;
+
+    const AVFrame *pic;
+    int mb_width, mb_height; // size in macroblocks; mb_height counts 32-line units when interlaced
+    int mbs_per_slice; // must be a power of two
+    int num_chroma_blocks, chroma_factor; // chroma_factor is CFACTOR_Y422 or CFACTOR_Y444
+    int slices_width; // slices per macroblock row
+    int slices_per_picture; // mb_height * slices_width
+    int pictures_per_frame; // 1 for progressive, 2 for interlaced
+    int cur_picture_idx;
+    int num_planes; // 3, plus 1 when alpha_bits is non-zero
+    int bits_per_mb;
+    int force_quant; // global_quality / FF_QP2LAMBDA; non-zero selects a fixed quantiser
+    int alpha_bits; // 0, 8 or 16
+    int warn;
+
+    char *vendor; // must be exactly 4 characters long
+    int quant_sel; // -1 selects the profile's own matrices, otherwise an index into prores_quant_matrices
+
+    int frame_size_upper_bound; // computed at init from the slice count and bits_per_mb
+
+    int profile;
+    const struct prores_profile *profile_info; // points at prores_profile_info[profile]
+
+    int *slice_q;
+
+    struct ProresThreadData *tdata;
+} ProresContext;
+
+av_cold int ff_prores_kostya_encode_init(AVCodecContext *avctx, ProresContext *ctx,
+                                         enum AVPixelFormat pixfmt); // returns 0 on success, a negative AVERROR code on invalid options
+
+uint8_t* ff_prores_kostya_write_frame_header(AVCodecContext *avctx, ProresContext *ctx,
+                                             uint8_t **orig_buf, int flags,
+                                             enum AVColorPrimaries color_primaries,
+                                             enum AVColorTransferCharacteristic color_trc,
+                                             enum AVColorSpace colorspace); // advances *orig_buf by 8 and returns the position after the frame header
+
+uint8_t* ff_prores_kostya_write_picture_header(ProresContext *ctx, uint8_t *buf); // returns the position after the 8-byte picture header
+
+FF_VISIBILITY_POP_HIDDEN
+
+#endif
\ No newline at end of file
-- 
2.50.1

_______________________________________________
ffmpeg-devel mailing list -- ffmpeg-devel@ffmpeg.org
To unsubscribe send an email to ffmpeg-devel-leave@ffmpeg.org

^ permalink raw reply	[flat|nested] 3+ messages in thread

* [FFmpeg-devel] [PATCH 3/3] lavc: implement a Vulkan-based prores encoder
  2025-09-04 20:10 [FFmpeg-devel] [PATCH 1/3] vulkan: Support samplerless images IndecisiveTurtle via ffmpeg-devel
  2025-09-04 20:10 ` [FFmpeg-devel] [PATCH 2/3] lavc: Split out common components used by vulkan prores encoder IndecisiveTurtle via ffmpeg-devel
@ 2025-09-04 20:10 ` IndecisiveTurtle via ffmpeg-devel
  1 sibling, 0 replies; 3+ messages in thread
From: IndecisiveTurtle via ffmpeg-devel @ 2025-09-04 20:10 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: IndecisiveTurtle

From: IndecisiveTurtle <geoster3d@gmail.com>

Adds a Vulkan implementation of the reference ProRes (kostya) encoder. It provides about a 3-4x speedup over the CPU code.
---
 configure                                     |    1 +
 libavcodec/Makefile                           |    1 +
 libavcodec/allcodecs.c                        |    1 +
 libavcodec/proresenc_kostya_vulkan.c          | 1068 +++++++++++++++++
 libavcodec/vulkan/Makefile                    |    7 +
 libavcodec/vulkan/prores_ks_alpha_data.comp   |   67 ++
 libavcodec/vulkan/prores_ks_encode_slice.comp |  230 ++++
 .../vulkan/prores_ks_estimate_slice.comp      |  267 +++++
 libavcodec/vulkan/prores_ks_slice_data.comp   |  265 ++++
 libavcodec/vulkan/prores_ks_trellis_node.comp |  177 +++
 10 files changed, 2084 insertions(+)
 create mode 100644 libavcodec/proresenc_kostya_vulkan.c
 create mode 100644 libavcodec/vulkan/prores_ks_alpha_data.comp
 create mode 100644 libavcodec/vulkan/prores_ks_encode_slice.comp
 create mode 100644 libavcodec/vulkan/prores_ks_estimate_slice.comp
 create mode 100644 libavcodec/vulkan/prores_ks_slice_data.comp
 create mode 100644 libavcodec/vulkan/prores_ks_trellis_node.comp

diff --git a/configure b/configure
index 7ec4c3975b..4db8a7c581 100755
--- a/configure
+++ b/configure
@@ -3099,6 +3099,7 @@ prores_decoder_select="blockdsp idctdsp"
 prores_encoder_select="fdctdsp"
 prores_aw_encoder_select="fdctdsp"
 prores_ks_encoder_select="fdctdsp"
+prores_ks_vulkan_encoder_select="vulkan spirv_compiler"
 prores_raw_decoder_select="blockdsp idctdsp"
 qcelp_decoder_select="lsp"
 qdm2_decoder_select="mpegaudiodsp"
diff --git a/libavcodec/Makefile b/libavcodec/Makefile
index d8e1ac5a54..1964c787d7 100644
--- a/libavcodec/Makefile
+++ b/libavcodec/Makefile
@@ -639,6 +639,7 @@ OBJS-$(CONFIG_PRORES_DECODER)          += proresdec.o proresdsp.o proresdata.o
 OBJS-$(CONFIG_PRORES_ENCODER)          += proresenc_anatoliy.o proresdata.o
 OBJS-$(CONFIG_PRORES_AW_ENCODER)       += proresenc_anatoliy.o proresdata.o
 OBJS-$(CONFIG_PRORES_KS_ENCODER)       += proresenc_kostya.o proresdata.o proresenc_kostya_common.o
+OBJS-$(CONFIG_PRORES_KS_VULKAN_ENCODER) += proresenc_kostya_vulkan.o proresdata.o proresenc_kostya_common.o
 OBJS-$(CONFIG_PRORES_RAW_DECODER)      += prores_raw.o proresdsp.o proresdata.o
 OBJS-$(CONFIG_PRORES_VIDEOTOOLBOX_ENCODER) += videotoolboxenc.o
 OBJS-$(CONFIG_PROSUMER_DECODER)        += prosumer.o
diff --git a/libavcodec/allcodecs.c b/libavcodec/allcodecs.c
index f5ec2e01e8..1b4a5f769c 100644
--- a/libavcodec/allcodecs.c
+++ b/libavcodec/allcodecs.c
@@ -269,6 +269,7 @@ extern const FFCodec ff_prores_encoder;
 extern const FFCodec ff_prores_decoder;
 extern const FFCodec ff_prores_aw_encoder;
 extern const FFCodec ff_prores_ks_encoder;
+extern const FFCodec ff_prores_ks_vulkan_encoder;
 extern const FFCodec ff_prores_raw_decoder;
 extern const FFCodec ff_prosumer_decoder;
 extern const FFCodec ff_psd_decoder;
diff --git a/libavcodec/proresenc_kostya_vulkan.c b/libavcodec/proresenc_kostya_vulkan.c
new file mode 100644
index 0000000000..6413b2f9d4
--- /dev/null
+++ b/libavcodec/proresenc_kostya_vulkan.c
@@ -0,0 +1,1068 @@
+/*
+ * Apple ProRes encoder
+ *
+ * Copyright (c) 2011 Anatoliy Wasserman
+ * Copyright (c) 2012 Konstantin Shishkov
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/buffer.h"
+#include "libavutil/macros.h"
+#include "libavutil/mem.h"
+#include "libavutil/mem_internal.h"
+#include "libavutil/opt.h"
+#include "libavutil/pixdesc.h"
+#include "libavutil/vulkan_spirv.h"
+#include "libavutil/hwcontext_vulkan.h"
+#include "libavutil/vulkan_loader.h"
+#include "libavutil/vulkan.h"
+#include "avcodec.h"
+#include "codec.h"
+#include "codec_internal.h"
+#include "encode.h"
+#include "packet.h"
+#include "put_bits.h"
+#include "profiles.h"
+#include "bytestream.h"
+#include "proresdata.h"
+#include "proresenc_kostya_common.h"
+#include "hwconfig.h"
+
+#define DCTSIZE 8 // the same value is emitted as "#define DCTSIZE 8" into the shader sources
+
+typedef struct ProresDataTables { // field list matches the "ProresDataTables" uniform block declared for the shaders
+    int16_t qmat[128][64];
+    int16_t qmat_chroma[128][64];
+    uint8_t scan[64];
+    uint8_t dc_codebook[7];
+    uint8_t run_to_cb[16];
+    uint8_t level_to_cb[10];
+} ProresDataTables;
+
+typedef struct SliceDataInfo { // its size is registered as the push-constant range of the slice data shader
+    int plane;
+    int pictures_per_frame;
+    int line_add;
+} SliceDataInfo;
+
+typedef struct EstimateSliceInfo { // its size is registered as the push-constant range of the estimate slice shader
+    int slices_per_picture;
+    int min_quant;
+    int max_quant;
+    int bits_per_mb;
+} EstimateSliceInfo;
+
+typedef struct EncodeSliceInfo { // its size is registered as the push-constant range of the encode slice shader
+    VkDeviceAddress bytestream; // presumably the device address of the output packet buffer - TODO confirm
+    VkDeviceAddress seek_table;
+    int num_planes;
+    int slices_per_picture;
+    int max_quant;
+} EncodeSliceInfo;
+
+typedef struct TrellisNodeInfo { // its size is registered as the push-constant range of the trellis node shader
+    int min_quant;
+    int max_quant;
+    int mbs_per_slice;
+    int bits_per_mb;
+} TrellisNodeInfo;
+
+#define TRELLIS_WIDTH 16
+#define SCORE_LIMIT   (INT_MAX / 2) // parenthesised so the macro always expands as a single operand
+
+struct TrellisNode { // NOTE(review): not referenced in the visible part of this file - confirm it is still needed
+    int prev_node;
+    int quant;
+    int bits;
+    int score;
+};
+
+typedef struct SliceData { // sizeof(SliceData) * slices_per_picture sizes the slice data buffer
+    uint32_t mbs_per_slice;
+    int16_t rows[MAX_PLANES * MAX_MBS_PER_SLICE * 256]; // 256 = 4 blocks of DCTSIZE * DCTSIZE coefficients per macroblock
+} SliceData;
+
+typedef struct SliceScore { // the GLSL "struct SliceScore" strings hard-code 16 entries, i.e. MAX_STORED_Q
+    int bits[MAX_STORED_Q][4]; // declared as ivec4 bits[16] on the shader side
+    int error[MAX_STORED_Q][4]; // declared as ivec4 score[16] on the shader side
+    int total_bits[MAX_STORED_Q];
+    int total_error[MAX_STORED_Q];
+    int overquant; // NOTE(review): declared uint in two of the three GLSL struct strings
+    int buf_start;
+    int quant;
+} SliceScore;
+
+typedef struct VulkanEncodeProresFrameData { // stored as the opaque pointer of an FFVkExecContext
+    /* Intermediate buffers, indexed by picture_idx */
+    AVBufferRef *out_data_ref[2]; // pooled output buffer, also added as an exec dependency
+    AVBufferRef *slice_data_ref[2]; // pooled buffer of SliceData entries
+    AVBufferRef *slice_score_ref[2];
+    AVBufferRef *frame_size_ref[2];
+
+    /* Copied from the source */
+    int64_t pts;
+    int64_t duration;
+    void        *frame_opaque;
+    AVBufferRef *frame_opaque_ref;
+    enum AVColorTransferCharacteristic color_trc;
+    enum AVColorSpace colorspace;
+    enum AVColorPrimaries color_primaries;
+    int key_frame;
+    int flags;
+} VulkanEncodeProresFrameData;
+
+typedef struct ProresVulkanContext { // the codec's priv_data
+    ProresContext ctx; // encoder state in the layout the common ProRes helpers take
+
+    /* Vulkan state */
+    FFVulkanContext vkctx;
+    AVVulkanDeviceQueueFamily *qf;
+    FFVkExecPool e; // execution pool every compute shader is registered with
+    AVVulkanDeviceQueueFamily *transfer_qf;
+    FFVkExecPool transfer_exec_pool;
+    AVBufferPool *pkt_buf_pool; // pool for the per-picture output buffers
+    AVBufferPool *slice_data_buf_pool; // pool for the per-picture SliceData buffers
+    AVBufferPool *slice_score_buf_pool; // pool for the per-picture slice score buffers
+    AVBufferPool *frame_size_buf_pool;
+
+    FFVulkanShader alpha_data_shd;
+    FFVulkanShader slice_data_shd[2]; // NOTE(review): presumably one variant per blocks_per_mb value - confirm at the init site
+    FFVulkanShader estimate_slice_shd;
+    FFVulkanShader encode_slice_shd;
+    FFVulkanShader trellis_node_shd;
+    FFVkBuffer prores_data_tables_buf;
+
+    int *slice_quants;
+    SliceScore *slice_scores;
+    ProresDataTables *tables;
+
+    int in_flight;
+    int async_depth;
+    AVFrame *frame;
+    VulkanEncodeProresFrameData *exec_ctx_info;
+} ProresVulkanContext;
+
+extern const char *ff_source_common_comp; // appended ahead of the encode slice shader source
+extern const char *ff_source_prores_ks_alpha_data_comp; // each source below is appended to its shader with GLSLD()
+extern const char *ff_source_prores_ks_slice_data_comp;
+extern const char *ff_source_prores_ks_estimate_slice_comp;
+extern const char *ff_source_prores_ks_trellis_node_comp;
+extern const char *ff_source_prores_ks_encode_slice_comp;
+
+static int init_slice_data_pipeline(ProresVulkanContext *pv, FFVkSPIRVCompiler *spv,
+                                    FFVulkanShader* shd, const char* pl_name, int blocks_per_mb)
+{
+    /* Build, compile and register the slice data shader; returns 0 or a negative AVERROR code. */
+    int err = 0;
+    uint8_t *spv_data;
+    size_t spv_len;
+    void *spv_opaque = NULL;
+    FFVulkanContext *vkctx = &pv->vkctx;
+    FFVulkanDescriptorSetBinding *desc;
+
+    RET(ff_vk_shader_init(vkctx, shd, pl_name, VK_SHADER_STAGE_COMPUTE_BIT,
+                          NULL, 0, DCTSIZE, blocks_per_mb, pv->ctx.mbs_per_slice, 0));
+    av_bprintf(&shd->src, "#define DCTSIZE 8\n");
+    av_bprintf(&shd->src, "#define MAX_PLANES 4\n");
+    av_bprintf(&shd->src, "#define MAX_MBS_PER_SLICE %d\n", pv->ctx.mbs_per_slice);
+    av_bprintf(&shd->src, "#define BLOCKS_PER_MB %d\n", blocks_per_mb);
+    av_bprintf(&shd->src, "#define WIDTH_IN_MB %d\n", pv->ctx.mb_width);
+    av_bprintf(&shd->src, "struct SliceData { uint32_t mbs_per_slice; i16vec4 rows[MAX_PLANES][MAX_MBS_PER_SLICE * 4 * DCTSIZE][DCTSIZE / 4]; };\n");
+
+    desc = (FFVulkanDescriptorSetBinding []) {
+        {
+            .name        = "SliceBuffer",
+            .type        = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
+            .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
+            .mem_layout  = "scalar",
+            .buf_content = "SliceData slices[];",
+        },
+        {
+            .name       = "planes",
+            .type       = VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE,
+            .dimensions = 2,
+            .elems      = 3,
+            .stages     = VK_SHADER_STAGE_COMPUTE_BIT,
+            .mem_layout  = "r16i",
+        },
+    };
+    RET(ff_vk_shader_add_descriptor_set(vkctx, shd, desc, 2, 0, 0));
+    RET(ff_vk_shader_add_push_const(shd, 0, sizeof(SliceDataInfo), VK_SHADER_STAGE_COMPUTE_BIT));
+    GLSLD(ff_source_prores_ks_slice_data_comp);
+
+    RET(spv->compile_shader(vkctx, spv, shd, &spv_data, &spv_len, "main", &spv_opaque));
+    RET(ff_vk_shader_link(vkctx, shd, spv_data, spv_len, "main"));
+    RET(ff_vk_shader_register_exec(vkctx, &pv->e, shd));
+fail: /* also reached on success: the compiler's per-shader state must be released on every path */
+    if (spv_opaque)
+        spv->free_shader(spv, &spv_opaque);
+    return err;
+}
+
+static int init_alpha_data_pipeline(ProresVulkanContext *pv, FFVkSPIRVCompiler *spv,
+                                    FFVulkanShader* shd, const char* pl_name)
+{
+    /* Build, compile and register the alpha data shader; returns 0 or a negative AVERROR code. */
+    int err = 0;
+    uint8_t *spv_data;
+    size_t spv_len;
+    void *spv_opaque = NULL;
+    FFVulkanContext *vkctx = &pv->vkctx;
+    FFVulkanDescriptorSetBinding *desc;
+
+    RET(ff_vk_shader_init(vkctx, shd, pl_name, VK_SHADER_STAGE_COMPUTE_BIT,
+                          NULL, 0, DCTSIZE * 2, DCTSIZE * 2, 1, 0));
+    av_bprintf(&shd->src, "#define DCTSIZE 8\n");
+    av_bprintf(&shd->src, "#define MAX_PLANES 4\n");
+    av_bprintf(&shd->src, "#define MAX_MBS_PER_SLICE %d\n", pv->ctx.mbs_per_slice);
+    av_bprintf(&shd->src, "#define WIDTH_IN_MB %d\n", pv->ctx.mb_width);
+    av_bprintf(&shd->src, "#define SLICES_PITCH %d\n", pv->ctx.slices_width);
+    av_bprintf(&shd->src, "#define ALPHA_BITS %d\n", pv->ctx.alpha_bits);
+    av_bprintf(&shd->src, "struct SliceData { uint32_t mbs_per_slice; int16_t coeffs[MAX_PLANES][MAX_MBS_PER_SLICE * 4 * DCTSIZE * DCTSIZE]; };\n");
+
+    desc = (FFVulkanDescriptorSetBinding []) {
+        {
+            .name        = "SliceBuffer",
+            .type        = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
+            .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
+            .mem_layout  = "scalar",
+            .buf_content = "SliceData slices[];",
+        },
+        {
+            .name       = "plane",
+            .type       = VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE,
+            .dimensions = 2,
+            .stages     = VK_SHADER_STAGE_COMPUTE_BIT,
+            .mem_layout  = "r16i",
+        },
+    };
+    RET(ff_vk_shader_add_descriptor_set(vkctx, shd, desc, 2, 0, 0));
+    RET(ff_vk_shader_add_push_const(shd, 0, sizeof(int), VK_SHADER_STAGE_COMPUTE_BIT));
+    GLSLD(ff_source_prores_ks_alpha_data_comp);
+
+    RET(spv->compile_shader(vkctx, spv, shd, &spv_data, &spv_len, "main", &spv_opaque));
+    RET(ff_vk_shader_link(vkctx, shd, spv_data, spv_len, "main"));
+    RET(ff_vk_shader_register_exec(vkctx, &pv->e, shd));
+fail: /* also reached on success: the compiler's per-shader state must be released on every path */
+    if (spv_opaque)
+        spv->free_shader(spv, &spv_opaque);
+    return err;
+}
+
+static int init_estimate_slice_pipeline(ProresVulkanContext *pv, FFVkSPIRVCompiler *spv,
+                                        FFVulkanShader* shd, const char* pl_name)
+{
+    /* Build, compile and register the slice estimation shader; returns 0 or a negative AVERROR code. */
+    int err = 0;
+    uint8_t *spv_data;
+    size_t spv_len;
+    void *spv_opaque = NULL;
+    FFVulkanContext *vkctx = &pv->vkctx;
+    FFVulkanDescriptorSetBinding *desc;
+    int subgroup_size = vkctx->subgroup_props.maxSubgroupSize;
+    int dim_x = pv->ctx.alpha_bits ? subgroup_size : (subgroup_size / 3) * 3; // without alpha: rounded down to a multiple of 3
+
+    RET(ff_vk_shader_init(vkctx, shd, pl_name, VK_SHADER_STAGE_COMPUTE_BIT,
+                          NULL, 0, dim_x, 1, 1, 0));
+    av_bprintf(&shd->src, "#define DCTSIZE 8\n");
+    av_bprintf(&shd->src, "#define MAX_PLANES 4\n");
+    av_bprintf(&shd->src, "#define MAX_MBS_PER_SLICE %d\n", pv->ctx.mbs_per_slice);
+    av_bprintf(&shd->src, "#define CHROMA_FACTOR %d\n", pv->ctx.chroma_factor);
+    av_bprintf(&shd->src, "#define ALPHA_BITS %d\n", pv->ctx.alpha_bits);
+    av_bprintf(&shd->src, "#define MAX_STORED_Q %d\n", MAX_STORED_Q);
+    av_bprintf(&shd->src, "#define NUM_PLANES %d\n", pv->ctx.num_planes);
+    av_bprintf(&shd->src, "struct SliceData { uint32_t mbs_per_slice; int16_t coeffs[MAX_PLANES][MAX_MBS_PER_SLICE * 4 * DCTSIZE * DCTSIZE]; };\n");
+    av_bprintf(&shd->src, "struct SliceScore { ivec4 bits[16]; ivec4 score[16]; int total_bits[16]; int total_score[16]; uint overquant; int buf_start; int quant; };\n");
+
+    desc = (FFVulkanDescriptorSetBinding []) {
+        {
+            .name        = "SliceBuffer",
+            .type        = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
+            .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
+            .mem_layout  = "scalar",
+            .buf_content = "SliceData slices[];",
+        },
+        {
+            .name        = "SliceScores",
+            .type        = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
+            .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
+            .mem_layout  = "scalar",
+            .buf_content = "SliceScore scores[];",
+        },
+        {
+            .name        = "ProresDataTables",
+            .type        = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER,
+            .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
+            .mem_layout  = "scalar",
+            .buf_content = "int16_t qmat[128][64]; int16_t qmat_chroma[128][64]; uint8_t scan[64]; "
+                           "uint8_t dc_codebook[7]; uint8_t run_to_cb[16]; uint8_t level_to_cb[10];",
+        },
+    };
+    RET(ff_vk_shader_add_descriptor_set(vkctx, shd, desc, 3, 0, 0));
+    RET(ff_vk_shader_add_push_const(shd, 0, sizeof(EstimateSliceInfo), VK_SHADER_STAGE_COMPUTE_BIT));
+    GLSLD(ff_source_prores_ks_estimate_slice_comp);
+
+    RET(spv->compile_shader(vkctx, spv, shd, &spv_data, &spv_len, "main", &spv_opaque));
+    RET(ff_vk_shader_link(vkctx, shd, spv_data, spv_len, "main"));
+    RET(ff_vk_shader_register_exec(vkctx, &pv->e, shd));
+fail: /* also reached on success: the compiler's per-shader state must be released on every path */
+    if (spv_opaque)
+        spv->free_shader(spv, &spv_opaque);
+    return err;
+}
+
+static int init_trellis_node_pipeline(ProresVulkanContext *pv, FFVkSPIRVCompiler *spv,
+                                      FFVulkanShader* shd, const char* pl_name)
+{
+    /* Build, compile and register the trellis node shader; returns 0 or a negative AVERROR code. */
+    int err = 0;
+    uint8_t *spv_data;
+    size_t spv_len;
+    void *spv_opaque = NULL;
+    FFVulkanContext *vkctx = &pv->vkctx;
+    FFVulkanDescriptorSetBinding *desc;
+    int subgroup_size = vkctx->subgroup_props.maxSubgroupSize;
+
+    RET(ff_vk_shader_init(vkctx, shd, pl_name, VK_SHADER_STAGE_COMPUTE_BIT,
+                          NULL, 0, pv->ctx.mb_height, 1, 1, 0));
+    av_bprintf(&shd->src, "#define SLICES_WIDTH %d\n", pv->ctx.slices_width);
+    av_bprintf(&shd->src, "#define NUM_SUBGROUPS %d\n", FFALIGN(pv->ctx.mb_height, subgroup_size) / subgroup_size);
+    av_bprintf(&shd->src, "#define NUM_PLANES %d\n", pv->ctx.num_planes);
+    av_bprintf(&shd->src, "#define FORCE_QUANT %d\n", pv->ctx.force_quant);
+    av_bprintf(&shd->src, "struct SliceScore { ivec4 bits[16]; ivec4 score[16]; int total_bits[16]; int total_score[16]; int overquant; int buf_start; int quant; };\n");
+
+    desc = (FFVulkanDescriptorSetBinding []) {
+        {
+            .name        = "FrameSize",
+            .type        = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
+            .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
+            .mem_layout  = "scalar",
+            .buf_content = "int frame_size;",
+        },
+        {
+            .name        = "SliceScores",
+            .type        = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
+            .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
+            .mem_layout  = "scalar",
+            .buf_content = "SliceScore scores[];",
+        },
+    };
+    RET(ff_vk_shader_add_descriptor_set(vkctx, shd, desc, 2, 0, 0));
+    RET(ff_vk_shader_add_push_const(shd, 0, sizeof(TrellisNodeInfo), VK_SHADER_STAGE_COMPUTE_BIT));
+    GLSLD(ff_source_prores_ks_trellis_node_comp);
+
+    RET(spv->compile_shader(vkctx, spv, shd, &spv_data, &spv_len, "main", &spv_opaque));
+    RET(ff_vk_shader_link(vkctx, shd, spv_data, spv_len, "main"));
+    RET(ff_vk_shader_register_exec(vkctx, &pv->e, shd));
+fail: /* also reached on success: the compiler's per-shader state must be released on every path */
+    if (spv_opaque)
+        spv->free_shader(spv, &spv_opaque);
+    return err;
+}
+
+static int init_encode_slice_pipeline(ProresVulkanContext *pv, FFVkSPIRVCompiler *spv,
+                                      FFVulkanShader* shd, const char* pl_name)
+{
+    /* Build, compile and register the slice encoding shader; returns 0 or a negative AVERROR code. */
+    int err = 0;
+    uint8_t *spv_data;
+    size_t spv_len;
+    void *spv_opaque = NULL;
+    FFVulkanContext *vkctx = &pv->vkctx;
+    FFVulkanDescriptorSetBinding *desc;
+
+    RET(ff_vk_shader_init(vkctx, shd, pl_name, VK_SHADER_STAGE_COMPUTE_BIT,
+                          (const char *[]) { "GL_EXT_buffer_reference", /* requested here instead of */
+                                             "GL_EXT_buffer_reference2" }, 2, /* after the declarations */
+                          64, 1, 1, 0));
+    av_bprintf(&shd->src, "#define DCTSIZE 8\n");
+    av_bprintf(&shd->src, "#define MAX_PLANES 4\n");
+    av_bprintf(&shd->src, "#define MAX_MBS_PER_SLICE %d\n", pv->ctx.mbs_per_slice);
+    av_bprintf(&shd->src, "#define CHROMA_FACTOR %d\n", pv->ctx.chroma_factor);
+    av_bprintf(&shd->src, "#define ALPHA_BITS %d\n", pv->ctx.alpha_bits);
+    av_bprintf(&shd->src, "struct SliceData { uint32_t mbs_per_slice; int16_t coeffs[MAX_PLANES][MAX_MBS_PER_SLICE * 4 * DCTSIZE * DCTSIZE]; };\n");
+    av_bprintf(&shd->src, "struct SliceScore { ivec4 bits[16]; ivec4 score[16]; int total_bits[16]; int total_score[16]; uint overquant; int buf_start; int quant; };\n");
+
+    desc = (FFVulkanDescriptorSetBinding []) {
+        {
+            .name        = "SliceBuffer",
+            .type        = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
+            .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
+            .mem_layout  = "scalar",
+            .buf_content = "SliceData slices[];",
+        },
+        {
+            .name        = "SliceScores",
+            .type        = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
+            .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
+            .mem_layout  = "scalar",
+            .buf_content = "SliceScore scores[];",
+        },
+        {
+            .name        = "ProresDataTables",
+            .type        = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER,
+            .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
+            .mem_layout  = "scalar",
+            .buf_content = "int16_t qmat[128][64]; int16_t qmat_chroma[128][64]; uint8_t scan[64]; "
+                           "uint8_t dc_codebook[7]; uint8_t run_to_cb[16]; uint8_t level_to_cb[10];",
+        },
+    };
+    RET(ff_vk_shader_add_descriptor_set(vkctx, shd, desc, 3, 0, 0));
+    RET(ff_vk_shader_add_push_const(shd, 0, sizeof(EncodeSliceInfo), VK_SHADER_STAGE_COMPUTE_BIT));
+    av_bprintf(&shd->src, "#define PB_UNALIGNED\n");
+    GLSLD(ff_source_common_comp);
+    GLSLD(ff_source_prores_ks_encode_slice_comp);
+
+    RET(spv->compile_shader(vkctx, spv, shd, &spv_data, &spv_len, "main", &spv_opaque));
+    RET(ff_vk_shader_link(vkctx, shd, spv_data, spv_len, "main"));
+    RET(ff_vk_shader_register_exec(vkctx, &pv->e, shd));
+fail: /* also reached on success: the compiler's per-shader state must be released on every path */
+    if (spv_opaque)
+        spv->free_shader(spv, &spv_opaque);
+    return err;
+}
+
+/**
+ * Record a barrier that orders compute-shader accesses to the whole of buf
+ * against later compute-shader accesses in the same command buffer.
+ *
+ * @param vk          loaded Vulkan function table
+ * @param exec        execution context whose command buffer is being recorded
+ * @param buf         buffer to synchronize
+ * @param src_access  accesses performed by the preceding dispatches
+ * @param dst_access  accesses performed by the following dispatches
+ */
+static void prores_vk_compute_buffer_barrier(FFVulkanFunctions *vk, FFVkExecContext *exec,
+                                             FFVkBuffer *buf,
+                                             VkAccessFlags2 src_access,
+                                             VkAccessFlags2 dst_access)
+{
+    vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) {
+        .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
+        .pBufferMemoryBarriers = &(VkBufferMemoryBarrier2) {
+            .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2,
+            .pNext = NULL,
+            .srcStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
+            .srcAccessMask = src_access,
+            .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
+            .dstAccessMask = dst_access,
+            .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
+            .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
+            .buffer = buf->buf,
+            .offset = 0U,
+            .size = buf->size,
+        },
+        .bufferMemoryBarrierCount = 1u,
+    });
+}
+
+/**
+ * Record all compute passes needed to encode one picture of a frame.
+ *
+ * The passes are, in order: per-plane slice data generation (plus the alpha
+ * shader for a fourth plane), per-quantiser slice estimation, the trellis
+ * pass, and the final slice encode. The command buffer is only recorded
+ * here; submission is left to the caller.
+ *
+ * @param avctx        codec context (priv_data is a ProresVulkanContext)
+ * @param exec         execution context to record into; exec->opaque must
+ *                     point to a VulkanEncodeProresFrameData
+ * @param frame        input frame
+ * @param picture_idx  index of the picture (field) inside the frame
+ * @return 0 on success, a negative AVERROR code on failure
+ *
+ * NOTE(review): the caller invokes this function twice on the same exec for
+ * two-picture frames, so ff_vk_exec_start() runs twice before a single
+ * submit - confirm that ff_vk_exec_start() tolerates this.
+ */
+static int vulkan_encode_prores_submit_frame(AVCodecContext *avctx, FFVkExecContext *exec,
+                                             AVFrame *frame, int picture_idx)
+{
+    ProresVulkanContext *pv = avctx->priv_data;
+    ProresContext *ctx = &pv->ctx;
+    VulkanEncodeProresFrameData *pd = exec->opaque;
+    FFVulkanContext *vkctx = &pv->vkctx;
+    FFVulkanFunctions *vk = &vkctx->vkfn;
+    int err = 0, nb_img_bar = 0, i;
+    int min_quant = ctx->profile_info->min_quant;
+    int max_quant = ctx->profile_info->max_quant;
+    int subgroup_size = vkctx->subgroup_props.maxSubgroupSize;
+    /* Without alpha the estimate workgroup width is rounded down to a multiple of 3 */
+    int estimate_dim_x = ctx->alpha_bits ? subgroup_size : (subgroup_size / 3) * 3;
+    int transfer_slices = vkctx->extensions & FF_VK_EXT_EXTERNAL_HOST_MEMORY;
+    VkImageView views[AV_NUM_DATA_POINTERS];
+    VkImageMemoryBarrier2 img_bar[AV_NUM_DATA_POINTERS];
+    FFVkBuffer *pkt_vk_buf, *slice_data_buf, *slice_score_buf, *frame_size_buf;
+    SliceDataInfo slice_data_info;
+    EstimateSliceInfo estimate_info;
+    TrellisNodeInfo trellis_node_info;
+    EncodeSliceInfo encode_info;
+    FFVulkanShader *shd;
+
+    /* Start recording */
+    RET(ff_vk_exec_start(vkctx, exec));
+
+    /* Get a pooled buffer for writing output data. It only has to be
+     * host-visible when get_packet() will memcpy from it, i.e. when the
+     * host-mapped transfer path is unavailable. */
+    RET(ff_vk_get_pooled_buffer(vkctx, &pv->pkt_buf_pool, &pd->out_data_ref[picture_idx],
+                                VK_BUFFER_USAGE_STORAGE_BUFFER_BIT |
+                                VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT, NULL,
+                                ctx->frame_size_upper_bound + FF_INPUT_BUFFER_MIN_SIZE,
+                                transfer_slices ? VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT
+                                                : (VK_MEMORY_PROPERTY_HOST_CACHED_BIT |
+                                                   VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT |
+                                                   VK_MEMORY_PROPERTY_HOST_COHERENT_BIT)));
+    pkt_vk_buf = (FFVkBuffer*)pd->out_data_ref[picture_idx]->data;
+    RET(ff_vk_exec_add_dep_buf(vkctx, exec, &pd->out_data_ref[picture_idx], 1, 1));
+
+    /* Allocate buffer for writing slice data */
+    RET(ff_vk_get_pooled_buffer(vkctx, &pv->slice_data_buf_pool, &pd->slice_data_ref[picture_idx],
+                                VK_BUFFER_USAGE_STORAGE_BUFFER_BIT |
+                                VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT, NULL,
+                                ctx->slices_per_picture * sizeof(SliceData),
+                                VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT));
+    slice_data_buf = (FFVkBuffer*)pd->slice_data_ref[picture_idx]->data;
+    RET(ff_vk_exec_add_dep_buf(vkctx, exec, &pd->slice_data_ref[picture_idx], 1, 1));
+
+    /* Allocate buffer for writing slice scores */
+    RET(ff_vk_get_pooled_buffer(vkctx, &pv->slice_score_buf_pool, &pd->slice_score_ref[picture_idx],
+                                VK_BUFFER_USAGE_STORAGE_BUFFER_BIT |
+                                VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT, NULL,
+                                ctx->slices_per_picture * sizeof(SliceScore),
+                                VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT));
+    slice_score_buf = (FFVkBuffer*)pd->slice_score_ref[picture_idx]->data;
+    RET(ff_vk_exec_add_dep_buf(vkctx, exec, &pd->slice_score_ref[picture_idx], 1, 1));
+
+    /* Allocate buffer for writing frame size; get_packet() reads it on the host */
+    RET(ff_vk_get_pooled_buffer(vkctx, &pv->frame_size_buf_pool, &pd->frame_size_ref[picture_idx],
+                                VK_BUFFER_USAGE_STORAGE_BUFFER_BIT |
+                                VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT, NULL,
+                                sizeof(int),
+                                VK_MEMORY_PROPERTY_HOST_CACHED_BIT |
+                                VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT |
+                                VK_MEMORY_PROPERTY_HOST_COHERENT_BIT));
+    frame_size_buf = (FFVkBuffer*)pd->frame_size_ref[picture_idx]->data;
+    RET(ff_vk_exec_add_dep_buf(vkctx, exec, &pd->frame_size_ref[picture_idx], 1, 1));
+
+    /* Generate barriers and image views for frame images. */
+    RET(ff_vk_exec_add_dep_frame(vkctx, exec, frame,
+                                 VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT,
+                                 VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT));
+    RET(ff_vk_create_imageviews(vkctx, exec, views, frame, FF_VK_REP_INT));
+    ff_vk_frame_barrier(vkctx, exec, frame, img_bar, &nb_img_bar,
+                        VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT,
+                        VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
+                        VK_ACCESS_SHADER_READ_BIT,
+                        VK_IMAGE_LAYOUT_GENERAL,
+                        VK_QUEUE_FAMILY_IGNORED);
+
+    /* Submit the image barriers. */
+    vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) {
+                                           .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
+                                           .pImageMemoryBarriers = img_bar,
+                                           .imageMemoryBarrierCount = nb_img_bar,
+                                       });
+
+    /* Apply FDCT on input image data for future passes */
+    slice_data_info = (SliceDataInfo) {
+        .pictures_per_frame = ctx->pictures_per_frame,
+        .line_add = ctx->pictures_per_frame == 1 ? 0 : picture_idx ^ !(frame->flags & AV_FRAME_FLAG_TOP_FIELD_FIRST),
+    };
+    for (i = 0; i < ctx->num_planes; i++) {
+        if (i < 3) {
+            /* Shader index 1 is used for luma and for 4:4:4 chroma,
+             * index 0 for subsampled chroma planes. */
+            int is_chroma = (i == 1 || i == 2);
+            shd = &pv->slice_data_shd[!is_chroma || ctx->chroma_factor == CFACTOR_Y444];
+
+            slice_data_info.plane = i;
+            ff_vk_shader_update_desc_buffer(vkctx, exec, shd, 0, 0, 0,
+                                            slice_data_buf, 0, slice_data_buf->size,
+                                            VK_FORMAT_UNDEFINED);
+            ff_vk_shader_update_img_array(vkctx, exec, shd, frame, views, 0, 1,
+                                          VK_IMAGE_LAYOUT_GENERAL, VK_NULL_HANDLE);
+            ff_vk_exec_bind_shader(vkctx, exec, shd);
+            ff_vk_shader_update_push_const(vkctx, exec, shd, VK_SHADER_STAGE_COMPUTE_BIT,
+                                           0, sizeof(SliceDataInfo), &slice_data_info);
+            vk->CmdDispatch(exec->buf, ctx->slices_width, ctx->mb_height, 1);
+        } else {
+            /* Fourth plane: alpha goes through its own shader, one workgroup per macroblock */
+            ff_vk_shader_update_desc_buffer(vkctx, exec, &pv->alpha_data_shd, 0, 0, 0,
+                                            slice_data_buf, 0, slice_data_buf->size,
+                                            VK_FORMAT_UNDEFINED);
+            ff_vk_shader_update_img(vkctx, exec, &pv->alpha_data_shd, 0, 1, 0, views[3],
+                                    VK_IMAGE_LAYOUT_GENERAL, VK_NULL_HANDLE);
+            ff_vk_exec_bind_shader(vkctx, exec, &pv->alpha_data_shd);
+            vk->CmdDispatch(exec->buf, ctx->mb_width, ctx->mb_height, 1);
+        }
+    }
+
+    /* Wait for writes to slice buffer. */
+    prores_vk_compute_buffer_barrier(vk, exec, slice_data_buf,
+                                     VK_ACCESS_2_SHADER_WRITE_BIT,
+                                     VK_ACCESS_2_SHADER_READ_BIT);
+
+    /* Estimate slice bits and error for each quant */
+    estimate_info = (EstimateSliceInfo) {
+        .slices_per_picture = ctx->slices_per_picture,
+        .min_quant = ctx->force_quant ? ctx->force_quant : min_quant,
+        .max_quant = ctx->force_quant ? ctx->force_quant : max_quant,
+        .bits_per_mb = ctx->bits_per_mb,
+    };
+    ff_vk_shader_update_desc_buffer(vkctx, exec, &pv->estimate_slice_shd, 0, 0, 0,
+                                    slice_data_buf, 0, slice_data_buf->size,
+                                    VK_FORMAT_UNDEFINED);
+    ff_vk_shader_update_desc_buffer(vkctx, exec, &pv->estimate_slice_shd, 0, 1, 0,
+                                    slice_score_buf, 0, slice_score_buf->size,
+                                    VK_FORMAT_UNDEFINED);
+    ff_vk_shader_update_desc_buffer(vkctx, exec, &pv->estimate_slice_shd, 0, 2, 0,
+                                    &pv->prores_data_tables_buf, 0, pv->prores_data_tables_buf.size,
+                                    VK_FORMAT_UNDEFINED);
+    ff_vk_exec_bind_shader(vkctx, exec, &pv->estimate_slice_shd);
+
+    ff_vk_shader_update_push_const(vkctx, exec, &pv->estimate_slice_shd,
+                                   VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(estimate_info),
+                                   &estimate_info);
+    /* X covers every (slice, plane) pair; Y covers one row per candidate quantiser */
+    vk->CmdDispatch(exec->buf, (ctx->slices_per_picture * ctx->num_planes + estimate_dim_x - 1) / estimate_dim_x,
+                               ctx->force_quant ? 1 : (max_quant - min_quant + 1), 1);
+
+    /* Wait for writes to score buffer. */
+    prores_vk_compute_buffer_barrier(vk, exec, slice_score_buf,
+                                     VK_ACCESS_2_SHADER_WRITE_BIT | VK_ACCESS_2_SHADER_READ_BIT,
+                                     VK_ACCESS_2_SHADER_WRITE_BIT | VK_ACCESS_2_SHADER_READ_BIT);
+
+    /* Compute optimal quant value for each slice */
+    trellis_node_info = (TrellisNodeInfo) {
+        .min_quant = min_quant,
+        .max_quant = max_quant,
+        .bits_per_mb = ctx->bits_per_mb,
+        .mbs_per_slice = ctx->mbs_per_slice,
+    };
+    ff_vk_shader_update_desc_buffer(vkctx, exec, &pv->trellis_node_shd, 0, 0, 0,
+                                    frame_size_buf, 0, frame_size_buf->size,
+                                    VK_FORMAT_UNDEFINED);
+    ff_vk_shader_update_desc_buffer(vkctx, exec, &pv->trellis_node_shd, 0, 1, 0,
+                                    slice_score_buf, 0, slice_score_buf->size,
+                                    VK_FORMAT_UNDEFINED);
+    ff_vk_exec_bind_shader(vkctx, exec, &pv->trellis_node_shd);
+    ff_vk_shader_update_push_const(vkctx, exec, &pv->trellis_node_shd, VK_SHADER_STAGE_COMPUTE_BIT,
+                                    0, sizeof(TrellisNodeInfo), &trellis_node_info);
+    vk->CmdDispatch(exec->buf, 1, 1, 1);
+
+    /* Wait for the trellis pass's writes to the frame size buffer.
+     * NOTE(review): slice_score_buf is also bound to both the trellis and the
+     * encode shader but gets no barrier here - confirm the trellis pass does
+     * not write anything into it that the encode pass reads. */
+    prores_vk_compute_buffer_barrier(vk, exec, frame_size_buf,
+                                     VK_ACCESS_2_SHADER_WRITE_BIT,
+                                     VK_ACCESS_2_SHADER_WRITE_BIT | VK_ACCESS_2_SHADER_READ_BIT);
+
+    /* Encode slices. The output buffer starts with the seek table
+     * (2 bytes per slice), followed by the slice bytestream. */
+    encode_info = (EncodeSliceInfo) {
+        .seek_table = pkt_vk_buf->address,
+        .bytestream = pkt_vk_buf->address + ctx->slices_per_picture * 2,
+        .num_planes = ctx->num_planes,
+        .slices_per_picture = ctx->slices_per_picture,
+        .max_quant = ctx->force_quant ? ctx->force_quant : max_quant,
+    };
+    ff_vk_shader_update_desc_buffer(vkctx, exec, &pv->encode_slice_shd, 0, 0, 0,
+                                    slice_data_buf, 0, slice_data_buf->size,
+                                    VK_FORMAT_UNDEFINED);
+    ff_vk_shader_update_desc_buffer(vkctx, exec, &pv->encode_slice_shd, 0, 1, 0,
+                                    slice_score_buf, 0, slice_score_buf->size,
+                                    VK_FORMAT_UNDEFINED);
+    ff_vk_shader_update_desc_buffer(vkctx, exec, &pv->encode_slice_shd, 0, 2, 0,
+                                    &pv->prores_data_tables_buf, 0, pv->prores_data_tables_buf.size,
+                                    VK_FORMAT_UNDEFINED);
+    ff_vk_exec_bind_shader(vkctx, exec, &pv->encode_slice_shd);
+    ff_vk_shader_update_push_const(vkctx, exec, &pv->encode_slice_shd,
+                                   VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(encode_info), &encode_info);
+    vk->CmdDispatch(exec->buf, FFALIGN(ctx->slices_per_picture, 64) / 64,
+                               ctx->num_planes, 1);
+
+fail:
+    return err;
+}
+
+/**
+ * Build the output packet for a frame whose encode work was submitted on exec.
+ *
+ * Writes the frame and picture headers on the host, waits for the GPU encode
+ * to finish, then moves the seek table and slice data of every picture into
+ * the packet. When FF_VK_EXT_EXTERNAL_HOST_MEMORY is available the packet
+ * memory is host-mapped and filled with a GPU buffer copy; otherwise the data
+ * is copied with memcpy from the mapped output buffer.
+ *
+ * @param avctx  codec context (priv_data is a ProresVulkanContext)
+ * @param exec   execution context that encoded the frame; exec->opaque holds
+ *               the per-frame VulkanEncodeProresFrameData
+ * @param pkt    packet to allocate and fill
+ * @return 0 on success, a negative AVERROR code on failure
+ */
+static int get_packet(AVCodecContext *avctx, FFVkExecContext *exec, AVPacket *pkt)
+{
+    ProresVulkanContext *pv = avctx->priv_data;
+    ProresContext *ctx = &pv->ctx;
+    VulkanEncodeProresFrameData *pd = exec->opaque;
+    FFVulkanContext *vkctx = &pv->vkctx;
+    FFVulkanFunctions *vk = &vkctx->vkfn;
+    FFVkExecContext *transfer_exec = NULL;
+    uint8_t *orig_buf, *buf, *slice_sizes;
+    uint8_t *picture_size_pos;
+    int picture_idx, err = 0;
+    int frame_size, picture_size;
+    int pkt_size = ctx->frame_size_upper_bound;
+    int transfer_slices = vkctx->extensions & FF_VK_EXT_EXTERNAL_HOST_MEMORY;
+    FFVkBuffer *out_data_buf, *frame_size_buf;
+    VkMappedMemoryRange invalidate_data;
+    AVBufferRef *mapped_ref = NULL;
+    FFVkBuffer *mapped_buf = NULL;
+
+    /* Allocate packet */
+    RET(ff_get_encode_buffer(avctx, pkt, pkt_size + FF_INPUT_BUFFER_MIN_SIZE, 0));
+
+    /* Initialize packet. */
+    pkt->pts      = pd->pts;
+    pkt->dts      = pd->pts;
+    pkt->duration = pd->duration;
+    pkt->flags   |= AV_PKT_FLAG_KEY * pd->key_frame;
+
+    if (avctx->flags & AV_CODEC_FLAG_COPY_OPAQUE) {
+        pkt->opaque          = pd->frame_opaque;
+        pkt->opaque_ref      = pd->frame_opaque_ref;
+        pd->frame_opaque_ref = NULL;
+    }
+
+    /* Write frame atom */
+    orig_buf = pkt->data;
+    buf = ff_prores_kostya_write_frame_header(avctx, ctx, &orig_buf, pd->flags,
+                                              pd->color_primaries, pd->color_trc,
+                                              pd->colorspace);
+
+    /* Make sure encoding's done */
+    ff_vk_exec_wait(vkctx, exec);
+
+    /* Roll transfer execution context */
+    if (transfer_slices) {
+        RET(ff_vk_host_map_buffer(vkctx, &mapped_ref, pkt->data, pkt->buf,
+                                  VK_BUFFER_USAGE_TRANSFER_DST_BIT));
+        mapped_buf = (FFVkBuffer *)mapped_ref->data;
+        transfer_exec = ff_vk_exec_get(vkctx, &pv->transfer_exec_pool);
+
+        /* Hand the mapping to the transfer context exactly once. It is passed
+         * without taking a new reference, so registering it again for every
+         * picture would give the same reference away more than once. */
+        err = ff_vk_exec_start(vkctx, transfer_exec);
+        if (err >= 0)
+            err = ff_vk_exec_add_dep_buf(vkctx, transfer_exec, &mapped_ref, 1, 0);
+        if (err < 0) {
+            av_buffer_unref(&mapped_ref);
+            goto fail;
+        }
+    }
+
+    for (picture_idx = 0; picture_idx < ctx->pictures_per_frame; picture_idx++) {
+        /* Fetch buffers for the current picture. */
+        out_data_buf = (FFVkBuffer *)pd->out_data_ref[picture_idx]->data;
+        frame_size_buf = (FFVkBuffer *)pd->frame_size_ref[picture_idx]->data;
+
+        /* Invalidate slice/output data if needed */
+        invalidate_data = (VkMappedMemoryRange) {
+            .sType = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE,
+            .offset = 0,
+            .size = VK_WHOLE_SIZE,
+        };
+        if (!(frame_size_buf->flags & VK_MEMORY_PROPERTY_HOST_COHERENT_BIT)) {
+            invalidate_data.memory = frame_size_buf->mem;
+            vk->InvalidateMappedMemoryRanges(vkctx->hwctx->act_dev, 1, &invalidate_data);
+        }
+
+        /* Write picture header; the picture size field sits one byte in and
+         * is filled once the final size is known. */
+        picture_size_pos = buf + 1;
+        buf = ff_prores_kostya_write_picture_header(ctx, buf);
+
+        /* Skip over seek table */
+        slice_sizes = buf;
+        buf += ctx->slices_per_picture * 2;
+
+        /* Calculate final size from the byte count stored by the GPU */
+        buf += *(int*)frame_size_buf->mapped_mem;
+
+        if (transfer_slices) {
+            /* Perform host mapped transfer of slice data */
+            RET(ff_vk_exec_add_dep_buf(vkctx, transfer_exec, &pd->out_data_ref[picture_idx], 1, 0));
+            vk->CmdCopyBuffer(transfer_exec->buf, out_data_buf->buf, mapped_buf->buf, 1, & (VkBufferCopy) {
+                .srcOffset = 0,
+                .dstOffset = mapped_buf->virtual_offset + slice_sizes - pkt->data,
+                .size = buf - slice_sizes,
+            });
+        } else {
+            /* Fallback to regular memcpy if transfer is not available */
+            if (!(out_data_buf->flags & VK_MEMORY_PROPERTY_HOST_COHERENT_BIT)) {
+                invalidate_data.memory = out_data_buf->mem;
+                vk->InvalidateMappedMemoryRanges(vkctx->hwctx->act_dev, 1, &invalidate_data);
+            }
+            memcpy(slice_sizes, out_data_buf->mapped_mem, buf - slice_sizes);
+            av_buffer_unref(&pd->out_data_ref[picture_idx]);
+        }
+
+        /* Write picture size with header */
+        picture_size = buf - (picture_size_pos - 1);
+        bytestream_put_be32(&picture_size_pos, picture_size);
+
+        /* Slice output buffers no longer needed */
+        av_buffer_unref(&pd->slice_data_ref[picture_idx]);
+        av_buffer_unref(&pd->slice_score_ref[picture_idx]);
+        av_buffer_unref(&pd->frame_size_ref[picture_idx]);
+    }
+
+    /* Write frame size in header */
+    orig_buf -= 8;
+    frame_size = buf - orig_buf;
+    bytestream_put_be32(&orig_buf, frame_size);
+
+    av_shrink_packet(pkt, frame_size);
+    av_log(avctx, AV_LOG_VERBOSE, "Encoded data: %iMiB\n", pkt->size / (1024*1024));
+
+    /* Wait for slice transfer */
+    if (transfer_slices) {
+        RET(ff_vk_exec_submit(vkctx, transfer_exec));
+        ff_vk_exec_wait(vkctx, transfer_exec);
+    }
+
+fail:
+    return err;
+}
+
+/**
+ * receive_packet callback: feeds frames to the GPU and returns finished packets.
+ *
+ * Each loop iteration takes the next execution context from the pool. If that
+ * context still carries a submitted frame, its packet is produced and
+ * returned. Otherwise a new input frame is fetched, its picture(s) are
+ * recorded and submitted, and AVERROR(EAGAIN) is returned until async_depth
+ * frames are in flight.
+ *
+ * @param avctx  codec context
+ * @param pkt    packet to fill
+ * @return 0 when a packet was produced, AVERROR(EAGAIN) when more input is
+ *         needed, AVERROR_EOF once everything is drained, or another
+ *         negative AVERROR code on failure
+ */
+static int vulkan_encode_prores_receive_packet(AVCodecContext *avctx, AVPacket *pkt)
+{
+    int err, submit_err;
+    ProresVulkanContext *pv = avctx->priv_data;
+    ProresContext *ctx = &pv->ctx;
+    VulkanEncodeProresFrameData *pd;
+    FFVkExecContext *exec;
+    AVFrame *frame;
+
+    while (1) {
+        /* Roll an execution context */
+        exec = ff_vk_exec_get(&pv->vkctx, &pv->e);
+
+        /* If it had a frame, immediately output it */
+        if (exec->had_submission) {
+            exec->had_submission = 0;
+            pv->in_flight--;
+            return get_packet(avctx, exec, pkt);
+        }
+
+        /* Get next frame to encode */
+        frame = pv->frame;
+        err = ff_encode_get_frame(avctx, frame);
+        if (err == AVERROR_EOF) {
+            /* Draining: keep rolling contexts until all in-flight frames are output */
+            if (!pv->in_flight)
+                return err;
+            continue;
+        }
+        if (err < 0)
+            return err;
+
+        /* Save the frame properties get_packet() needs after the frame is unreferenced */
+        pd = exec->opaque;
+        pd->color_primaries = frame->color_primaries;
+        pd->color_trc = frame->color_trc;
+        pd->colorspace = frame->colorspace;
+        pd->pts = frame->pts;
+        pd->duration = frame->duration;
+        pd->flags = frame->flags;
+        if (avctx->flags & AV_CODEC_FLAG_COPY_OPAQUE) {
+            pd->frame_opaque     = frame->opaque;
+            pd->frame_opaque_ref = frame->opaque_ref;
+            frame->opaque_ref    = NULL;
+        }
+
+        /* Encode frame; a failure in either picture must be reported */
+        err = vulkan_encode_prores_submit_frame(avctx, exec, frame, 0);
+        if (err >= 0 && ctx->pictures_per_frame > 1)
+            err = vulkan_encode_prores_submit_frame(avctx, exec, frame, 1);
+
+        /* Submit execution context. This is done even after a recording
+         * error, as before; the first error encountered is the one returned. */
+        submit_err = ff_vk_exec_submit(&pv->vkctx, exec);
+        av_frame_unref(frame);
+        if (err >= 0)
+            err = submit_err;
+        if (err < 0)
+            return err;
+
+        pv->in_flight++;
+        if (pv->in_flight < pv->async_depth)
+            return AVERROR(EAGAIN);
+    }
+
+    return 0;
+}
+
+/**
+ * Free everything encode_init() and the encode path allocated.
+ *
+ * The codec sets FF_CODEC_CAP_INIT_CLEANUP, so this may run on a partially
+ * initialized context; every release below must tolerate unset members.
+ *
+ * @param avctx  codec context (priv_data is a ProresVulkanContext)
+ * @return always 0
+ */
+static av_cold int encode_close(AVCodecContext *avctx)
+{
+    ProresVulkanContext *pv = avctx->priv_data;
+    ProresContext *ctx = &pv->ctx;
+    FFVulkanContext *vkctx = &pv->vkctx;
+
+    ff_vk_exec_pool_free(vkctx, &pv->e);
+    ff_vk_exec_pool_free(vkctx, &pv->transfer_exec_pool);
+
+    /* Per-context frame data and the temporary input frame are allocated in
+     * encode_init() and were previously leaked. */
+    av_freep(&pv->exec_ctx_info);
+    av_frame_free(&pv->frame);
+
+    /* The alpha shader is only compiled when an alpha plane is encoded */
+    if (ctx->alpha_bits)
+        ff_vk_shader_free(vkctx, &pv->alpha_data_shd);
+
+    ff_vk_shader_free(vkctx, &pv->slice_data_shd[0]);
+    ff_vk_shader_free(vkctx, &pv->slice_data_shd[1]);
+    ff_vk_shader_free(vkctx, &pv->estimate_slice_shd);
+    ff_vk_shader_free(vkctx, &pv->encode_slice_shd);
+    ff_vk_shader_free(vkctx, &pv->trellis_node_shd);
+
+    ff_vk_free_buf(vkctx, &pv->prores_data_tables_buf);
+
+    av_buffer_pool_uninit(&pv->pkt_buf_pool);
+    av_buffer_pool_uninit(&pv->slice_data_buf_pool);
+    av_buffer_pool_uninit(&pv->slice_score_buf_pool);
+    av_buffer_pool_uninit(&pv->frame_size_buf_pool);
+
+    ff_vk_uninit(vkctx);
+
+    return 0;
+}
+
+/**
+ * Initialize the Vulkan ProRes encoder.
+ *
+ * Sets up the Vulkan context, the compute and transfer execution pools, the
+ * shared ProRes encoder state, the per-context frame data, all compute
+ * shaders and the uniform buffer holding the quantisation/scan/codebook
+ * tables.
+ *
+ * @param avctx  codec context (priv_data is a ProresVulkanContext)
+ * @return 0 on success, a negative AVERROR code on failure. Partially
+ *         initialized state is released by encode_close().
+ */
+static av_cold int encode_init(AVCodecContext *avctx)
+{
+    ProresVulkanContext *pv = avctx->priv_data;
+    ProresContext *ctx = &pv->ctx;
+    int err = 0, i, q;
+    FFVulkanContext *vkctx = &pv->vkctx;
+    FFVkSPIRVCompiler *spv = NULL;
+
+    /* Init vulkan */
+    RET(ff_vk_init(vkctx, avctx, NULL, avctx->hw_frames_ctx));
+
+    pv->qf = ff_vk_qf_find(vkctx, VK_QUEUE_COMPUTE_BIT, 0);
+    if (!pv->qf) {
+        av_log(avctx, AV_LOG_ERROR, "Device has no compute queues!\n");
+        return AVERROR(ENOTSUP);
+    }
+
+    spv = ff_vk_spirv_init();
+    if (!spv) {
+        av_log(avctx, AV_LOG_ERROR, "Unable to initialize SPIR-V compiler!\n");
+        return AVERROR_EXTERNAL;
+    }
+
+    RET(ff_vk_exec_pool_init(vkctx, pv->qf, &pv->e, 1, 0, 0, 0, NULL));
+
+    pv->transfer_qf = ff_vk_qf_find(vkctx, VK_QUEUE_TRANSFER_BIT, 0);
+    if (!pv->transfer_qf) {
+        av_log(avctx, AV_LOG_ERROR, "Device has no transfer queues!\n");
+        /* err is still 0 here; returning it would report success */
+        err = AVERROR(ENOTSUP);
+        goto fail;
+    }
+
+    RET(ff_vk_exec_pool_init(vkctx, pv->transfer_qf, &pv->transfer_exec_pool, 1, 0, 0, 0, NULL));
+
+    /* Init common prores structures */
+    RET(ff_prores_kostya_encode_init(avctx, ctx, vkctx->frames->sw_format));
+
+    /* Temporary frame */
+    pv->frame = av_frame_alloc();
+    if (!pv->frame) {
+        err = AVERROR(ENOMEM);
+        goto fail;
+    }
+
+    /* Async data pool: one VulkanEncodeProresFrameData per execution context.
+     * NOTE(review): this overwrites the user-supplied async_depth option with
+     * the pool size, and the pool above is created with a single context -
+     * confirm whether the option is meant to size the pool. */
+    pv->async_depth = pv->e.pool_size;
+    pv->exec_ctx_info = av_calloc(pv->async_depth, sizeof(*pv->exec_ctx_info));
+    if (!pv->exec_ctx_info) {
+        err = AVERROR(ENOMEM);
+        goto fail;
+    }
+    for (i = 0; i < pv->async_depth; i++)
+        pv->e.contexts[i].opaque = &pv->exec_ctx_info[i];
+
+    /* Compile shaders used by encoder; a failed compile must abort init */
+    RET(init_slice_data_pipeline(pv, spv, &pv->slice_data_shd[0], "slice_data_blocks2", 2));
+    RET(init_slice_data_pipeline(pv, spv, &pv->slice_data_shd[1], "slice_data_blocks4", 4));
+    RET(init_estimate_slice_pipeline(pv, spv, &pv->estimate_slice_shd, "estimate_slice"));
+    RET(init_trellis_node_pipeline(pv, spv, &pv->trellis_node_shd, "trellis_node"));
+    RET(init_encode_slice_pipeline(pv, spv, &pv->encode_slice_shd, "encode_slice"));
+    if (ctx->alpha_bits)
+        RET(init_alpha_data_pipeline(pv, spv, &pv->alpha_data_shd, "alpha_data"));
+
+    /* Create prores data tables uniform buffer.
+     * NOTE(review): the memory is host-visible but not requested as
+     * host-coherent, and the mapping is never flushed - confirm the writes
+     * below are guaranteed to reach the device. */
+    RET(ff_vk_create_buf(vkctx, &pv->prores_data_tables_buf,
+                         sizeof(ProresDataTables), NULL, NULL,
+                         VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT |
+                         VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT,
+                         VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT |
+                         VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT));
+    RET(ff_vk_map_buffer(vkctx, &pv->prores_data_tables_buf, (void *)&pv->tables, 0));
+    memcpy(pv->tables->qmat, ctx->quants, sizeof(ctx->quants));
+    memcpy(pv->tables->qmat_chroma, ctx->quants_chroma, sizeof(ctx->quants_chroma));
+    memcpy(pv->tables->scan, ctx->scantable, sizeof(ff_prores_progressive_scan));
+    memcpy(pv->tables->dc_codebook, ff_prores_dc_codebook, sizeof(ff_prores_dc_codebook));
+    memcpy(pv->tables->run_to_cb, ff_prores_run_to_cb, sizeof(ff_prores_run_to_cb));
+    memcpy(pv->tables->level_to_cb, ff_prores_level_to_cb, sizeof(ff_prores_level_to_cb));
+
+    /* Quantisers from MAX_STORED_Q upwards are not covered by the copies
+     * above; derive them by scaling the base matrices. */
+    for (q = MAX_STORED_Q; q < 128; ++q) {
+        for (i = 0; i < 64; i++) {
+            pv->tables->qmat[q][i] = ctx->quant_mat[i] * q;
+            pv->tables->qmat_chroma[q][i] = ctx->quant_chroma_mat[i] * q;
+        }
+    }
+
+fail:
+    /* The compiler is only needed while building the shaders */
+    if (spv)
+        spv->uninit(&spv);
+    return err;
+}
+
+/* Byte offset of an option's backing field inside ProresVulkanContext */
+#define OFFSET(x) offsetof(ProresVulkanContext, x)
+/* Flags shared by every option below: video, encoding parameter */
+#define VE     AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_ENCODING_PARAM
+
+/*
+ * Private options of the Vulkan ProRes encoder. All but async_depth are
+ * stored in the embedded ProresContext (ctx.*).
+ */
+static const AVOption options[] = {
+    { "mbs_per_slice", "macroblocks per slice", OFFSET(ctx.mbs_per_slice),
+        AV_OPT_TYPE_INT, { .i64 = 8 }, 1, MAX_MBS_PER_SLICE, VE },
+    /* "profile" and its named constants */
+    { "profile",       NULL, OFFSET(ctx.profile), AV_OPT_TYPE_INT,
+        { .i64 = PRORES_PROFILE_AUTO },
+        PRORES_PROFILE_AUTO, PRORES_PROFILE_4444XQ, VE, .unit = "profile" },
+    { "auto",         NULL, 0, AV_OPT_TYPE_CONST, { .i64 = PRORES_PROFILE_AUTO },
+        0, 0, VE, .unit = "profile" },
+    { "proxy",         NULL, 0, AV_OPT_TYPE_CONST, { .i64 = PRORES_PROFILE_PROXY },
+        0, 0, VE, .unit = "profile" },
+    { "lt",            NULL, 0, AV_OPT_TYPE_CONST, { .i64 = PRORES_PROFILE_LT },
+        0, 0, VE, .unit = "profile" },
+    { "standard",      NULL, 0, AV_OPT_TYPE_CONST, { .i64 = PRORES_PROFILE_STANDARD },
+        0, 0, VE, .unit = "profile" },
+    { "hq",            NULL, 0, AV_OPT_TYPE_CONST, { .i64 = PRORES_PROFILE_HQ },
+        0, 0, VE, .unit = "profile" },
+    { "4444",          NULL, 0, AV_OPT_TYPE_CONST, { .i64 = PRORES_PROFILE_4444 },
+        0, 0, VE, .unit = "profile" },
+    { "4444xq",        NULL, 0, AV_OPT_TYPE_CONST, { .i64 = PRORES_PROFILE_4444XQ },
+        0, 0, VE, .unit = "profile" },
+    { "vendor", "vendor ID", OFFSET(ctx.vendor),
+        AV_OPT_TYPE_STRING, { .str = "Lavc" }, 0, 0, VE },
+    { "bits_per_mb", "desired bits per macroblock", OFFSET(ctx.bits_per_mb),
+        AV_OPT_TYPE_INT, { .i64 = 0 }, 0, 8192, VE },
+    /* "quant_mat" and its named constants; -1 selects "auto" */
+    { "quant_mat", "quantiser matrix", OFFSET(ctx.quant_sel), AV_OPT_TYPE_INT,
+        { .i64 = -1 }, -1, QUANT_MAT_DEFAULT, VE, .unit = "quant_mat" },
+    { "auto",          NULL, 0, AV_OPT_TYPE_CONST, { .i64 = -1 },
+        0, 0, VE, .unit = "quant_mat" },
+    { "proxy",         NULL, 0, AV_OPT_TYPE_CONST, { .i64 = QUANT_MAT_PROXY },
+        0, 0, VE, .unit = "quant_mat" },
+    { "lt",            NULL, 0, AV_OPT_TYPE_CONST, { .i64 = QUANT_MAT_LT },
+        0, 0, VE, .unit = "quant_mat" },
+    { "standard",      NULL, 0, AV_OPT_TYPE_CONST, { .i64 = QUANT_MAT_STANDARD },
+        0, 0, VE, .unit = "quant_mat" },
+    { "hq",            NULL, 0, AV_OPT_TYPE_CONST, { .i64 = QUANT_MAT_HQ },
+        0, 0, VE, .unit = "quant_mat" },
+    { "default",       NULL, 0, AV_OPT_TYPE_CONST, { .i64 = QUANT_MAT_DEFAULT },
+        0, 0, VE, .unit = "quant_mat" },
+    { "alpha_bits", "bits for alpha plane", OFFSET(ctx.alpha_bits), AV_OPT_TYPE_INT,
+        { .i64 = 16 }, 0, 16, VE },
+    /* NOTE(review): encode_init() overwrites async_depth with the exec pool
+     * size, so the user-supplied value currently appears to have no effect. */
+    { "async_depth", "Internal parallelization depth", OFFSET(async_depth), AV_OPT_TYPE_INT,
+            { .i64 = 1 }, 1, INT_MAX, VE },
+    { NULL }
+};
+
+/* AVClass exposing the private options table above for this encoder */
+static const AVClass proresenc_class = {
+    .class_name = "ProRes vulkan encoder",
+    .item_name  = av_default_item_name,
+    .option     = options,
+    .version    = LIBAVUTIL_VERSION_INT,
+};
+
+/* NULL-terminated list of hardware configurations advertised by the encoder:
+ * one frames-context entry and one device-context entry, both for Vulkan. */
+static const AVCodecHWConfigInternal *const prores_ks_hw_configs[] = {
+    HW_CONFIG_ENCODER_FRAMES(VULKAN, VULKAN),
+    HW_CONFIG_ENCODER_DEVICE(NONE,  VULKAN),
+    NULL,
+};
+
+/*
+ * Codec registration entry for the Vulkan ProRes encoder.
+ * It uses the receive_packet API and accepts only AV_PIX_FMT_VULKAN frames.
+ */
+const FFCodec ff_prores_ks_vulkan_encoder = {
+    .p.name         = "prores_ks_vulkan",
+    CODEC_LONG_NAME("Apple ProRes (iCodec Pro)"),
+    .p.type         = AVMEDIA_TYPE_VIDEO,
+    .p.id           = AV_CODEC_ID_PRORES,
+    .priv_data_size = sizeof(ProresVulkanContext),
+    .init           = encode_init,
+    .close          = encode_close,
+    FF_CODEC_RECEIVE_PACKET_CB(&vulkan_encode_prores_receive_packet),
+    /* NOTE(review): AV_CODEC_CAP_ENCODER_FLUSH is advertised but no .flush
+     * callback is set in this entry - confirm this is intended. */
+    .p.capabilities = AV_CODEC_CAP_DELAY |
+                      AV_CODEC_CAP_HARDWARE |
+                      AV_CODEC_CAP_ENCODER_FLUSH |
+                      AV_CODEC_CAP_ENCODER_REORDERED_OPAQUE,
+    CODEC_PIXFMTS(AV_PIX_FMT_VULKAN),
+    .hw_configs     = prores_ks_hw_configs,
+    .color_ranges   = AVCOL_RANGE_MPEG,
+    .p.priv_class   = &proresenc_class,
+    .p.profiles     = NULL_IF_CONFIG_SMALL(ff_prores_profiles),
+    /* INIT_CLEANUP: encode_close() also runs when encode_init() fails */
+    .caps_internal  = FF_CODEC_CAP_INIT_CLEANUP | FF_CODEC_CAP_EOF_FLUSH,
+};
diff --git a/libavcodec/vulkan/Makefile b/libavcodec/vulkan/Makefile
index d8e1471fa6..f69e430c33 100644
--- a/libavcodec/vulkan/Makefile
+++ b/libavcodec/vulkan/Makefile
@@ -9,6 +9,13 @@ OBJS-$(CONFIG_FFV1_VULKAN_ENCODER)  +=  vulkan/common.o \
 					vulkan/ffv1_enc_rct.o vulkan/ffv1_enc_setup.o \
 					vulkan/ffv1_rct_search.o vulkan/ffv1_enc.o
 
+OBJS-$(CONFIG_PRORES_KS_VULKAN_ENCODER) += vulkan/common.o \
+					vulkan/prores_ks_alpha_data.o \
+					vulkan/prores_ks_slice_data.o \
+					vulkan/prores_ks_estimate_slice.o \
+					vulkan/prores_ks_encode_slice.o \
+					vulkan/prores_ks_trellis_node.o
+
 OBJS-$(CONFIG_FFV1_VULKAN_HWACCEL)  +=  vulkan/common.o \
 					vulkan/rangecoder.o vulkan/ffv1_vlc.o \
 					vulkan/ffv1_common.o vulkan/ffv1_reset.o \
diff --git a/libavcodec/vulkan/prores_ks_alpha_data.comp b/libavcodec/vulkan/prores_ks_alpha_data.comp
new file mode 100644
index 0000000000..825ba28a4f
--- /dev/null
+++ b/libavcodec/vulkan/prores_ks_alpha_data.comp
@@ -0,0 +1,67 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#extension GL_EXT_samplerless_texture_functions : require
+
+/* Table of possible edge slice configurations.
+ * Indexed by the number of macroblocks left over after the full-width slices
+ * (WIDTH_IN_MB - mb_width in main()). Each entry lists the macroblock count
+ * of up to three edge slices; 0 marks an unused slot. */
+const uvec3 edge_mps_table[8] = uvec3[](
+    uvec3(0, 0, 0),
+    uvec3(1, 0, 0),
+    uvec3(2, 0, 0),
+    uvec3(2, 1, 0),
+    uvec3(4, 0, 0),
+    uvec3(4, 1, 0),
+    uvec3(4, 2, 0),
+    uvec3(4, 2, 1)
+);
+
+/* Fetch one alpha sample, rescale it and store it as coefficient plane 3
+ * of the slice that contains it. */
+void main()
+{
+    /* Clamp to the image so invocations past the edge repeat the border texel */
+    ivec2 coord = min(ivec2(gl_GlobalInvocationID.xy), textureSize(plane, 0) - ivec2(1));
+    int alpha = texelFetch(plane, coord, 0).x;
+
+    /* NOTE(review): both branches look like they assume a 10-bit input sample
+     * (>> 2 gives 8 bits, (<< 6) | (>> 4) gives 16 bits) - confirm. */
+#if ALPHA_BITS == 8
+    alpha >>= 2;
+#else
+    alpha = (alpha << 6) | (alpha >> 4);
+#endif
+
+    uint mbs_per_slice = MAX_MBS_PER_SLICE;
+    /* Number of full-size slices per row and the macroblocks they cover */
+    uint slices_width = WIDTH_IN_MB / mbs_per_slice;
+    uint mb_width = slices_width * mbs_per_slice;
+    uint slice_x = gl_WorkGroupID.x / mbs_per_slice;
+    uint slice_y = gl_WorkGroupID.y;
+    /* Pixel origin of the slice; the factor 16u is the per-macroblock pixel stride */
+    uvec2 slice_base = uvec2(slice_x, slice_y) * (mbs_per_slice * 16u);
+
+    /* Handle slice macroblock size reduction on edge slices */
+    if (gl_WorkGroupID.x >= mb_width)
+    {
+        uint edge_mb = gl_WorkGroupID.x - mb_width;
+        uvec3 table = edge_mps_table[WIDTH_IN_MB - mb_width];
+        /* First macroblock index of each of the up-to-three edge slices */
+        uvec3 base = uvec3(0, table.x, table.x + table.y);
+        uint edge_slice = edge_mb < base.y ? 0 : (edge_mb < base.z ? 1 : 2);
+        slice_x += edge_slice;
+        slice_base += base[edge_slice] * (DCTSIZE * 2u);
+        mbs_per_slice = table[edge_slice];
+    }
+
+    uint slice = slice_y * SLICES_PITCH + slice_x;
+    /* Position inside the slice, stored row-major with a row pitch of
+     * mbs_per_slice * 16 samples */
+    uvec2 coeff_coord = uvec2(coord) - slice_base;
+    uint coeff = coeff_coord.y * (mbs_per_slice * 16u) + coeff_coord.x;
+    slices[slice].coeffs[3][coeff] = int16_t(alpha);
+}
\ No newline at end of file
diff --git a/libavcodec/vulkan/prores_ks_encode_slice.comp b/libavcodec/vulkan/prores_ks_encode_slice.comp
new file mode 100644
index 0000000000..2c06388a46
--- /dev/null
+++ b/libavcodec/vulkan/prores_ks_encode_slice.comp
@@ -0,0 +1,230 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/* CHROMA_FACTOR value for which chroma macroblocks hold 4 blocks instead of 2 */
+#define CFACTOR_Y444 3
+
+/* Push constants of the slice encoding pass */
+layout(push_constant, scalar) uniform EncodeSliceInfo {
+    u8buf bytestream;        /* output bytes; a slice starts at scores[slice].buf_start */
+    u8vec2buf seek_table;    /* one byte-swapped 16-bit slice size per slice */
+    int num_planes;          /* planes per slice; the slice header takes 2 * num_planes bytes */
+    int slices_per_picture;  /* invocations with a slice index >= this value exit at once */
+    int max_quant;           /* score tables are read at index min(quant, max_quant + 1) */
+};
+
+/* Returns the low p bits of a, with every higher bit cleared. */
+int av_zero_extend(int a, uint p)
+{
+    uint low_mask = (1U << p) - 1;
+    return int(uint(a) & low_mask);
+}
+
+/*
+ * Writes val with the adaptive Rice / exp-Golomb code selected by codebook.
+ *
+ * codebook packs three fields: bits 7..5 are the Rice order, bits 4..2 the
+ * exp-Golomb order and bits 1..0 the number of prefix bits minus one.
+ * Values below (prefix bits << Rice order) use the Rice code, all others the
+ * exp-Golomb code.
+ */
+void encode_vlc_codeword(inout PutBitContext pb, uint codebook, int val)
+{
+    uint rice_k      =  codebook >> 5;
+    uint golomb_k    = (codebook >> 2) & 7;
+    uint prefix_bits = (codebook & 3) + 1;
+    uint threshold   = prefix_bits << rice_k;
+
+    if (val < threshold) {
+        /* Rice: unary quotient, stop bit, then rice_k remainder bits */
+        int quotient = val >> rice_k;
+        if (quotient != 0)
+            put_bits(pb, quotient, 0);
+        put_bits(pb, 1, 1);
+        if (rice_k != 0)
+            put_bits(pb, rice_k, av_zero_extend(val, rice_k));
+        return;
+    }
+
+    /* exp-Golomb: zero prefix followed by the rebased value */
+    val -= int(threshold - (1 << golomb_k));
+    int msb = findMSB(val);
+
+    put_bits(pb, msb - golomb_k + prefix_bits, 0);
+    put_bits(pb, msb + 1, val);
+}
+
+/* Sign mask of a 32-bit int: 0 when x >= 0, -1 (all bits set) when x < 0 */
+#define GET_SIGN(x)  ((x) >> 31)
+/* Folds a signed value into a non-negative code: x >= 0 -> 2x, x < 0 -> -2x - 1 */
+#define MAKE_CODE(x) (((x) * 2) ^ GET_SIGN(x))
+
+/* Codebook of the first DC coefficient: rice_order = 5, exp_golomb_order = 6,
+ * switch-bits field = 0 (encode_vlc_codeword() adds 1, giving one prefix bit) */
+#define FIRST_DC_CB 0xB8
+
+/*
+ * Writes the DC coefficients of the plane selected by gl_GlobalInvocationID.y
+ * for the slice selected by gl_GlobalInvocationID.x.
+ *
+ * Every DC is offset by 0x4000 and divided by the first quantiser entry for
+ * q. The first value is coded directly with FIRST_DC_CB; each later one is
+ * coded as a difference to its predecessor, sign-flipped by the sign of the
+ * previous difference, with a codebook chosen from the previous code.
+ */
+void encode_dcs(inout PutBitContext pb, bool is_chroma, int q)
+{
+    uint slice = gl_GlobalInvocationID.x;
+    uint plane = gl_GlobalInvocationID.y;
+    uint blocks_per_mb = is_chroma && CHROMA_FACTOR != CFACTOR_Y444 ? 2 : 4;
+    uint num_blocks = slices[slice].mbs_per_slice * blocks_per_mb;
+    int scale = is_chroma ? qmat_chroma[q][0] : qmat[q][0];
+
+    int first = slices[slice].coeffs[plane][0];
+    int prev_dc = (first - 0x4000) / scale;
+    encode_vlc_codeword(pb, FIRST_DC_CB, MAKE_CODE(prev_dc));
+
+    int cb_index = 5;
+    int prev_sign = 0;
+    for (uint blk = 1; blk < num_blocks; blk++) {
+        /* One DC per 64-coefficient block */
+        int raw = slices[slice].coeffs[plane][blk * 64];
+        int dc = (raw - 0x4000) / scale;
+        int delta = dc - prev_dc;
+        int delta_sign = GET_SIGN(delta);
+        delta = (delta ^ prev_sign) - prev_sign;
+        int code = MAKE_CODE(delta);
+        encode_vlc_codeword(pb, dc_codebook[cb_index], code);
+        cb_index = min(code, 6);
+        prev_sign = delta_sign;
+        prev_dc = dc;
+    }
+}
+
+/*
+ * Writes the AC coefficients of the plane selected by gl_GlobalInvocationID.y
+ * for the slice selected by gl_GlobalInvocationID.x.
+ *
+ * Coefficients are visited one scan position at a time across all blocks of
+ * the slice. Every non-zero quantised level is written as the preceding zero
+ * run, the magnitude minus one and a sign bit; the codebooks for run and
+ * level follow the previously written run and level.
+ */
+void encode_acs(inout PutBitContext pb, bool is_chroma, int q)
+{
+    uint slice = gl_GlobalInvocationID.x;
+    uint plane = gl_GlobalInvocationID.y;
+    uint blocks_per_mb = is_chroma && CHROMA_FACTOR != CFACTOR_Y444 ? 2 : 4;
+    uint coeff_count = (slices[slice].mbs_per_slice * blocks_per_mb) << 6;
+    int last_run = 4;
+    int last_level = 2;
+    int zero_run = 0;
+
+    for (int i = 1; i < 64; i++) {
+        for (int idx = scan[i]; idx < coeff_count; idx += 64) {
+            int coeff = slices[slice].coeffs[plane][idx];
+            int level = coeff / (is_chroma ? qmat_chroma[q][scan[i]] : qmat[q][scan[i]]);
+            if (level == 0) {
+                zero_run++;
+                continue;
+            }
+
+            int magnitude = abs(level);
+            encode_vlc_codeword(pb, run_to_cb[last_run], zero_run);
+            encode_vlc_codeword(pb, level_to_cb[last_level], magnitude - 1);
+            put_bits(pb, 1, av_zero_extend(GET_SIGN(level), 1));
+            last_run = min(zero_run, 15);
+            last_level = min(magnitude, 9);
+            zero_run = 0;
+        }
+    }
+}
+
+/* Writes the DC and then the AC coefficients of this invocation's plane;
+ * planes 1 and 2 use the chroma quantiser and block layout. */
+void encode_slice_plane(inout PutBitContext pb, int q)
+{
+    bool chroma = gl_GlobalInvocationID.y == 1 || gl_GlobalInvocationID.y == 2;
+
+    encode_dcs(pb, chroma, q);
+    encode_acs(pb, chroma, q);
+}
+
+/*
+ * Writes the step from prev to cur of an alpha sample.
+ *
+ * The difference is taken modulo 2^ALPHA_BITS and re-centred. A non-zero
+ * difference of magnitude up to dsize is written as flag 0, magnitude - 1 in
+ * dbits - 1 bits and a sign bit; anything else is written as flag 1 followed
+ * by ALPHA_BITS bits of the difference.
+ */
+void put_alpha_diff(inout PutBitContext pb, int cur, int prev)
+{
+    const int dbits = (ALPHA_BITS == 8) ? 4 : 7;
+    const int dsize = 1 << (dbits - 1);
+    const int range = 1 << ALPHA_BITS;
+
+    int diff = av_zero_extend(cur - prev, ALPHA_BITS);
+    if (diff >= range - dsize)
+        diff -= range;
+
+    bool short_form = diff != 0 && diff >= -dsize && diff <= dsize;
+    if (short_form) {
+        put_bits(pb, 1, 0);
+        put_bits(pb, dbits - 1, abs(diff) - 1);
+        put_bits(pb, 1, int(diff < 0));
+    } else {
+        put_bits(pb, 1, 1);
+        put_bits(pb, ALPHA_BITS, diff);
+    }
+}
+
+/* Writes a repeat count: a single 1 bit for no repeat, otherwise a 0 bit
+ * followed by the count in 4 bits (count < 16) or 15 bits. */
+void put_alpha_run(inout PutBitContext pb, int run)
+{
+    if (run == 0) {
+        put_bits(pb, 1, 1);
+        return;
+    }
+
+    put_bits(pb, 1, 0);
+    put_bits(pb, run < 0x10 ? 4 : 15, run);
+}
+
+/*
+ * Writes the alpha samples (coeffs[3]) of the slice selected by
+ * gl_GlobalInvocationID.x as alternating value differences and repeat
+ * counts. The first sample is coded against the all-ones value
+ * (1 << ALPHA_BITS) - 1; mbs_per_slice * 256 samples are consumed.
+ */
+void encode_alpha_plane(inout PutBitContext pb)
+{
+    uint slice = gl_GlobalInvocationID.x;
+    int total = int(slices[slice].mbs_per_slice) * 256;
+
+    int prev = slices[slice].coeffs[3][0];
+    put_alpha_diff(pb, prev, (1 << ALPHA_BITS) - 1);
+
+    int repeats = 0;
+    int idx = 1;
+    do {
+        int cur = slices[slice].coeffs[3][idx];
+        idx++;
+        if (cur == prev) {
+            repeats++;
+        } else {
+            /* Value changed: close the pending run, then code the step */
+            put_alpha_run(pb, repeats);
+            put_alpha_diff(pb, cur, prev);
+            prev = cur;
+            repeats = 0;
+        }
+    } while (idx < total);
+    put_alpha_run(pb, repeats);
+}
+
+/* Returns the two bytes of the low 16 bits of value in swapped order.
+ * NOTE(review): presumably this yields big-endian order - confirm the
+ * component order of unpack8(). */
+u8vec2 byteswap16(int value)
+{
+    u8vec2 bytes = unpack8(uint16_t(value));
+    return bytes.yx;
+}
+
+/*
+ * One invocation per (slice, plane): gl_GlobalInvocationID.x selects the
+ * slice and gl_GlobalInvocationID.y the plane.
+ *
+ * The plane 0 invocation additionally writes the slice header (header size
+ * in bits, quantiser, byte size of every plane but the last) and the slice's
+ * seek table entry. Each invocation then writes its plane after the header
+ * and after the preceding planes, whose byte sizes are taken from the
+ * estimates in scores[].
+ */
+void main()
+{
+    uint slice = gl_GlobalInvocationID.x;
+    uint plane = gl_GlobalInvocationID.y;
+    if (slice >= slices_per_picture)
+        return;
+
+    int q = scores[slice].quant;
+    /* Quantisers above max_quant share the table slot max_quant + 1 */
+    int q_idx = min(q, max_quant + 1);
+    int slice_hdr_size = 2 * num_planes;
+    u8buf buf = OFFBUF(u8buf, bytestream, scores[slice].buf_start);
+
+    if (plane == 0)
+    {
+        /* Slice header and seek table entry */
+        buf[0].v = uint8_t(slice_hdr_size * 8);
+        buf[1].v = uint8_t(q);
+        u8vec2buf slice_hdr = OFFBUF(u8vec2buf, buf, 2);
+        for (int i = 0; i < num_planes - 1; i++)
+        {
+            int plane_bytes = scores[slice].bits[q_idx][i] / 8;
+            slice_hdr[i].v = byteswap16(plane_bytes);
+        }
+
+        int slice_size = slice_hdr_size + (scores[slice].total_bits[q_idx] / 8);
+        seek_table[slice].v = byteswap16(slice_size);
+    }
+
+    /* Byte offset of this plane behind the header */
+    int plane_offset = 0;
+    for (uint p = 0; p < plane; ++p)
+        plane_offset += scores[slice].bits[q_idx][p] / 8;
+
+    PutBitContext pb;
+    init_put_bits(pb, OFFBUF(u8buf, buf, slice_hdr_size + plane_offset), 0);
+    if (plane == 3)
+        encode_alpha_plane(pb);
+    else
+        encode_slice_plane(pb, q);
+    flush_put_bits(pb);
+}
\ No newline at end of file
diff --git a/libavcodec/vulkan/prores_ks_estimate_slice.comp b/libavcodec/vulkan/prores_ks_estimate_slice.comp
new file mode 100644
index 0000000000..5f9b39cd75
--- /dev/null
+++ b/libavcodec/vulkan/prores_ks_estimate_slice.comp
@@ -0,0 +1,267 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#extension GL_KHR_shader_subgroup_clustered : require
+#extension GL_KHR_shader_subgroup_shuffle : require
+
+/* CHROMA_FACTOR value for which chroma macroblocks hold 4 blocks instead of 2 */
+#define CFACTOR_Y444 3
+
+/* Push constants of the slice size estimation pass */
+layout(push_constant, scalar) uniform EstimateSliceInfo {
+    uint slices_per_picture;  /* invocations with a slice index >= this value exit at once */
+    uint min_quant;           /* quantiser evaluated by gl_GlobalInvocationID.y == 0 */
+    uint max_quant;           /* invocations at this quantiser also run the overquant search */
+    uint bits_per_mb;         /* bit budget per macroblock; a slice gets bits_per_mb * mbs_per_slice */
+};
+
+/* Returns the low p bits of a, with every higher bit cleared. */
+int av_zero_extend(int a, uint p)
+{
+    uint low_mask = (1U << p) - 1;
+    return int(uint(a) & low_mask);
+}
+
+/* Sign mask of a 32-bit int: 0 when x >= 0, -1 (all bits set) when x < 0 */
+#define GET_SIGN(x)  ((x) >> 31)
+/* Folds a signed value into a non-negative code: x >= 0 -> 2x, x < 0 -> -2x - 1 */
+#define MAKE_CODE(x) (((x) * 2) ^ GET_SIGN(x))
+
+/*
+ * Returns the number of bits the adaptive Rice / exp-Golomb code selected by
+ * codebook needs for val.
+ *
+ * codebook packs three fields: bits 7..5 are the Rice order, bits 4..2 the
+ * exp-Golomb order and bits 1..0 the number of prefix bits minus one.
+ */
+int estimate_vlc(uint codebook, int val)
+{
+    uint rice_k      =  codebook >> 5;
+    uint golomb_k    = (codebook >> 2) & 7;
+    uint prefix_bits = (codebook & 3) + 1;
+    uint threshold   = prefix_bits << rice_k;
+
+    /* Rice code: unary quotient, stop bit, rice_k remainder bits */
+    if (val < threshold)
+        return int((val >> rice_k) + rice_k + 1);
+
+    /* exp-Golomb code: zero prefix plus the rebased value */
+    val -= int(threshold - (1 << golomb_k));
+    int msb = findMSB(val);
+    return int(msb * 2 - golomb_k + prefix_bits + 1);
+}
+
+/* Codebook of the first DC coefficient: rice_order = 5, exp_golomb_order = 6,
+ * switch-bits field = 0 (estimate_vlc() adds 1, giving one prefix bit) */
+#define FIRST_DC_CB 0xB8
+
+/*
+ * Returns the number of bits needed for the DC coefficients of one plane of
+ * a slice at quantiser index q, and adds the quantisation remainders to
+ * error. Plane 0 uses qmat, every other plane qmat_chroma.
+ *
+ * Every DC is offset by 0x4000 and divided by the first quantiser entry; the
+ * first value is costed with FIRST_DC_CB, later ones as sign-adjusted
+ * differences with a codebook chosen from the previous code.
+ */
+int estimate_dcs(inout int error, uint slice, uint plane, uint q)
+{
+    uint blocks_per_mb = plane != 0 && CHROMA_FACTOR != CFACTOR_Y444 ? 2 : 4;
+    uint blocks_per_slice = slices[slice].mbs_per_slice * blocks_per_mb;
+    int codebook = 5;
+    int coeff = slices[slice].coeffs[plane][0];
+    int scale = plane != 0 ? qmat_chroma[q][0] : qmat[q][0];
+    int prev_dc = (coeff - 0x4000) / scale;
+    int bits = estimate_vlc(FIRST_DC_CB, MAKE_CODE(prev_dc));
+    int sign = 0;
+    /* NOTE(review): this adds the remainder of block 1 (index 64), which the
+     * loop below adds again for i == 1, while the remainder of block 0 is
+     * never added - confirm whether this is intended */
+    coeff = slices[slice].coeffs[plane][64];
+    error += abs(coeff - 0x4000) % scale;
+
+    for (int i = 1; i < blocks_per_slice; ++i) {
+        /* One DC per 64-coefficient block */
+        coeff = slices[slice].coeffs[plane][i * 64];
+        int dc = (coeff - 0x4000) / scale;
+        error += abs(coeff - 0x4000) % scale;
+        int delta = dc - prev_dc;
+        int new_sign = GET_SIGN(delta);
+        /* Flip the difference when the previous difference was negative */
+        delta = (delta ^ sign) - sign;
+        int code = MAKE_CODE(delta);
+        bits += estimate_vlc(dc_codebook[codebook], code);
+        codebook = min(code, 6);
+        sign = new_sign;
+        prev_dc = dc;
+    }
+
+    return bits;
+}
+
+/* Rounds x up to the next multiple of a; the mask arithmetic requires a to be a power of two */
+#define FFALIGN(x, a) (((x)+(a)-1)&~((a)-1))
+/* (1 << 30) - 1: score given to a slice whose estimate exceeds 65000 * 8 bits (see main()) */
+#define SCORE_LIMIT   1073741823
+
+/*
+ * Returns the number of bits needed for the AC coefficients of one plane of
+ * a slice at quantiser index q, and adds the quantisation remainders to
+ * error. Plane 0 uses qmat, every other plane qmat_chroma.
+ *
+ * Coefficients are visited one scan position at a time across all blocks of
+ * the slice; each non-zero level costs its zero run, its magnitude minus one
+ * and one sign bit.
+ */
+int estimate_acs(inout int error, uint slice, uint plane, uint q)
+{
+    uint blocks_per_mb = plane != 0 && CHROMA_FACTOR != CFACTOR_Y444 ? 2 : 4;
+    uint coeff_count = (slices[slice].mbs_per_slice * blocks_per_mb) << 6;
+    int last_run = 4;
+    int last_level = 2;
+    int total_bits = 0;
+    int zero_run = 0;
+
+    for (int i = 1; i < 64; i++) {
+        /* The quantiser depends on the scan position only */
+        int quant = plane != 0 ? qmat_chroma[q][scan[i]] : qmat[q][scan[i]];
+
+        for (int idx = scan[i]; idx < coeff_count; idx += 64) {
+            int coeff = slices[slice].coeffs[plane][idx];
+            error += abs(coeff) % quant;
+
+            int level = coeff / quant;
+            if (level == 0) {
+                zero_run++;
+                continue;
+            }
+
+            int magnitude = abs(level);
+            total_bits += estimate_vlc(run_to_cb[last_run], zero_run);
+            total_bits += estimate_vlc(level_to_cb[last_level], magnitude - 1) + 1;
+            last_run = min(zero_run, 15);
+            last_level = min(magnitude, 9);
+            zero_run = 0;
+        }
+    }
+
+    return total_bits;
+}
+
+/* Returns the DC plus AC bit count of one plane of a slice, rounded up to a
+ * whole number of bytes; quantisation remainders are added to error. */
+int estimate_slice_plane(inout int error, uint slice, uint plane, uint q)
+{
+    int plane_bits = estimate_dcs(error, slice, plane, q);
+    plane_bits += estimate_acs(error, slice, plane, q);
+    return FFALIGN(plane_bits, 8);
+}
+
+/*
+ * Returns the number of bits needed for the step from prev to cur of an
+ * alpha sample: dbits + 1 for a non-zero difference of magnitude up to
+ * dsize (after wrapping modulo 2^ALPHA_BITS), ALPHA_BITS + 1 otherwise.
+ */
+int est_alpha_diff(int cur, int prev)
+{
+    const int dbits = (ALPHA_BITS == 8) ? 4 : 7;
+    const int dsize = 1 << (dbits - 1);
+    const int range = 1 << ALPHA_BITS;
+
+    int diff = av_zero_extend(cur - prev, ALPHA_BITS);
+    if (diff >= range - dsize)
+        diff -= range;
+
+    bool short_form = diff != 0 && diff >= -dsize && diff <= dsize;
+    return short_form ? dbits + 1 : ALPHA_BITS + 1;
+}
+
+/* Returns the number of bits put_alpha_run() of the encode pass writes for a
+ * repeat count: the flag bit alone, or the flag bit plus 4 or 15 count bits. */
+int est_alpha_run(int run)
+{
+    if (run == 0)
+        return 1;
+    return run < 0x10 ? 1 + 4 : 1 + 15;
+}
+
+/*
+ * Returns the size in bits, rounded up to whole bytes, of the alpha samples
+ * (coeffs[3]) of a slice as written by encode_alpha_plane().
+ *
+ * The encode pass derives the slice size it records from these numbers, so
+ * the result has to match the written data exactly. The previous version
+ * left out the flag bit of every non-zero run and of a final zero run, and
+ * did not round up to the byte boundary flush_put_bits() pads to.
+ */
+int estimate_alpha_plane(uint slice)
+{
+    const int mask  = (1 << ALPHA_BITS) - 1;
+    const int num_coeffs = int(slices[slice].mbs_per_slice) * 256;
+    int prev = mask, cur;
+    int idx = 0;
+    int run = 0;
+    int bits;
+
+    /* The first sample is coded against the all-ones value */
+    cur = slices[slice].coeffs[3][idx++];
+    bits = est_alpha_diff(cur, prev);
+    prev = cur;
+    do {
+        cur = slices[slice].coeffs[3][idx++];
+        if (cur != prev) {
+            /* Value changed: pending run, then the step itself */
+            bits += est_alpha_run(run);
+            bits += est_alpha_diff(cur, prev);
+            prev = cur;
+            run  = 0;
+        } else {
+            run++;
+        }
+    } while (idx < num_coeffs);
+
+    /* The encoder always closes the plane with a run, even an empty one */
+    bits += est_alpha_run(run);
+
+    return FFALIGN(bits, 8);
+}
+
+/*
+ * Returns the sum of value over the invocations that handle the planes of
+ * one slice.
+ *
+ * With 3 planes the group is the three consecutive subgroup lanes starting
+ * at a multiple of 3, read one by one with subgroupShuffle(); otherwise the
+ * sum is taken over clusters of 4 consecutive lanes.
+ *
+ * NOTE(review): this is only correct if the plane index used by main()
+ * (gl_LocalInvocationID.x % NUM_PLANES) lines up with these lane groups and
+ * no group straddles a subgroup boundary - confirm the workgroup and
+ * subgroup sizes chosen by the host code.
+ */
+int sum_of_planes(int value)
+{
+#if NUM_PLANES == 3
+    uint base = (gl_SubgroupInvocationID / 3) * 3;
+    return subgroupShuffle(value, base) + subgroupShuffle(value, base + 1) + subgroupShuffle(value, base + 2);
+#else
+    return subgroupClusteredAdd(value, 4);
+#endif
+}
+
+/*
+ * One invocation per (slice, plane, quantiser): gl_GlobalInvocationID.x
+ * selects slice and plane, gl_GlobalInvocationID.y the quantiser relative to
+ * min_quant.
+ *
+ * Each invocation stores the bit count and error of its plane in
+ * scores[slice], and the totals over all planes of the slice. Invocations at
+ * max_quant also fill slot max_quant + 1: either a copy of the max_quant
+ * totals when the slice fits its budget, or the result of searching higher
+ * quantisers until it does.
+ */
+void main()
+{
+    /* NOTE(review): slice comes from the global ID and plane from the local
+     * ID; they agree only if the workgroup width is a multiple of NUM_PLANES
+     * - confirm against the host code */
+    uint slice = gl_GlobalInvocationID.x / NUM_PLANES;
+    uint plane = gl_LocalInvocationID.x % NUM_PLANES;
+    uint q = min_quant + gl_GlobalInvocationID.y;
+    if (slice >= slices_per_picture)
+        return;
+
+    /* Estimate slice bits and error for specified quantizer and plane */
+    int error = 0;
+    int bits = 0;
+    if (plane == 3)
+        bits = estimate_alpha_plane(slice);
+    else
+        bits = estimate_slice_plane(error, slice, plane, q);
+
+    /* Write results to score buffer */
+    scores[slice].bits[q][plane] = bits;
+    scores[slice].score[q][plane] = error;
+
+    /* Accumulate total bits and error of all planes */
+    int total_bits = sum_of_planes(bits);
+    int total_score = sum_of_planes(error);
+    /* Slices above 65000 bytes get the worst possible score */
+    if (total_bits > 65000 * 8)
+        total_score = SCORE_LIMIT;
+    /* Every plane invocation of the slice stores the same totals */
+    scores[slice].total_bits[q] = total_bits;
+    scores[slice].total_score[q] = total_score;
+
+    if (q != max_quant)
+        return;
+
+    /* Task threads that computed max_quant to also compute overquant if necessary */
+    uint mbs_per_slice = slices[slice].mbs_per_slice;
+    if (total_bits <= bits_per_mb * mbs_per_slice)
+    {
+        /* Overquant isn't needed for this slice */
+        scores[slice].total_bits[max_quant + 1] = total_bits;
+        scores[slice].total_score[max_quant + 1] = total_score + 1;
+        scores[slice].overquant = max_quant;
+    }
+    else
+    {
+        /* Keep searching until an encoding fits our budget */
+        /* NOTE(review): if no quantiser below 128 fits, the loop ends with
+         * q == 128 and that value is stored as overquant - confirm that the
+         * quantiser tables cover it */
+        for (q = max_quant + 1; q < 128; ++q)
+        {
+            /* Estimate slice bits and error for specified quantizer and plane */
+            error = 0;
+            bits = 0;
+            if (plane == 3)
+                bits = estimate_alpha_plane(slice);
+            else
+                bits = estimate_slice_plane(error, slice, plane, q);
+
+            /* Accumulate total bits and error of all planes */
+            total_bits = sum_of_planes(bits);
+            total_score = sum_of_planes(error);
+
+            /* If estimated bits fit within budget, we are done */
+            if (total_bits <= bits_per_mb * mbs_per_slice)
+                break;
+        }
+
+        /* Slot max_quant + 1 holds the result for the quantiser stored in overquant */
+        scores[slice].bits[max_quant + 1][plane] = bits;
+        scores[slice].score[max_quant + 1][plane] = error;
+        scores[slice].total_bits[max_quant + 1] = total_bits;
+        scores[slice].total_score[max_quant + 1] = total_score;
+        scores[slice].overquant = q;
+    }
+}
\ No newline at end of file
diff --git a/libavcodec/vulkan/prores_ks_slice_data.comp b/libavcodec/vulkan/prores_ks_slice_data.comp
new file mode 100644
index 0000000000..6a943532c5
--- /dev/null
+++ b/libavcodec/vulkan/prores_ks_slice_data.comp
@@ -0,0 +1,265 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#extension GL_EXT_samplerless_texture_functions : require
+
+/* Push constants of the slice data (forward DCT) pass */
+layout(push_constant, scalar) uniform SliceDataInfo {
+    int plane;               /* index into planes[] and into slices[].rows[] */
+    int pictures_per_frame;  /* factor applied to the source row coordinate */
+    int line_add;            /* offset added to the scaled source row coordinate */
+    /* NOTE(review): the last two presumably select one field of an
+     * interlaced frame - confirm against the host code */
+};
+
+/* Workgroup-shared DCT scratch, indexed [macroblock][block][row][half row];
+ * each i16vec4 holds four neighbouring columns of one row */
+shared i16vec4 coeffs[MAX_MBS_PER_SLICE][BLOCKS_PER_MB][DCTSIZE][DCTSIZE / 4];
+
+/* Fractional bits of the FIX_* constants below */
+#define CONST_BITS  13
+#define PASS1_BITS  1   /* lose a little precision to avoid overflow */
+/* Extra right shift applied by the column pass on top of CONST_BITS */
+#define OUT_SHIFT   (PASS1_BITS + 1)
+
+#define FIX_0_541196100 4433  /* FIX(0.541196100) */
+#define FIX_0_765366865 6270 /* FIX(0.765366865) */
+#define FIX_1_847759065 15137 /* FIX(1.847759065) */
+#define FIX_1_175875602 9633 /* FIX(1.175875602) */
+#define FIX_0_298631336 2446 /* FIX(0.298631336) */
+#define FIX_3_072711026 25172 /* FIX(3.072711026) */
+#define FIX_1_501321110 12299 /* FIX(1.501321110) */
+#define FIX_0_899976223 7373 /* FIX(0.899976223) */
+#define FIX_1_961570560 16069 /* FIX(1.961570560) */
+#define FIX_2_053119869 16819 /* FIX(2.053119869) */
+#define FIX_2_562915447 20995 /* FIX(2.562915447) */
+#define FIX_0_390180644 3196 /* FIX(0.390180644) */
+
+/* Multiplies in unsigned 32-bit arithmetic and converts the product to type */
+#define MULTIPLY(type, var, cons) type(uint32_t(var) * uint32_t(cons))
+#define RIGHT_SHIFT(x, n) ((x) >> (n))
+/* Right shift of x by n bits with rounding: adds half of 1 << n before shifting */
+#define DESCALE(x,n)  RIGHT_SHIFT(int32_t(x) + (1 << ((n) - 1)), n)
+
+/*
+ * Row pass of the 8x8 forward DCT for one row of one block.
+ *
+ * data_lo holds columns 0..3 and data_hi columns 4..7 of the row. Both are
+ * passed by value; the result is stored in the shared coeffs array at
+ * [gl_LocalInvocationID.z][gl_LocalInvocationID.y][gl_LocalInvocationID.x],
+ * so each invocation writes exactly one row.
+ */
+void row_fdct(i32vec4 data_lo, i32vec4 data_hi)
+{
+    uint row_idx = gl_LocalInvocationID.x;
+    uint block = gl_LocalInvocationID.y;
+    uint mb = gl_LocalInvocationID.z;
+
+    /* Pass 1: process rows. */
+    /* Note results are scaled up by sqrt(8) compared to a true DCT; */
+    /* furthermore, we scale the results by 2**PASS1_BITS. */
+    /* Butterfly: sums and differences of mirrored columns (0/7, 1/6, 2/5, 3/4) */
+    int32_t tmp0 = data_lo.x + data_hi.w;
+    int32_t tmp7 = data_lo.x - data_hi.w;
+    int32_t tmp1 = data_lo.y + data_hi.z;
+    int32_t tmp6 = data_lo.y - data_hi.z;
+    int32_t tmp2 = data_lo.z + data_hi.y;
+    int32_t tmp5 = data_lo.z - data_hi.y;
+    int32_t tmp3 = data_lo.w + data_hi.x;
+    int32_t tmp4 = data_lo.w - data_hi.x;
+
+    /* Even part per LL&M figure 1 --- note that published figure is faulty;
+     * rotator "sqrt(2)*c1" should be "sqrt(2)*c6".
+     */
+    int32_t tmp10 = tmp0 + tmp3;
+    int32_t tmp13 = tmp0 - tmp3;
+    int32_t tmp11 = tmp1 + tmp2;
+    int32_t tmp12 = tmp1 - tmp2;
+
+    /* Outputs 0 and 4 */
+    data_lo.x = (tmp10 + tmp11) * (1 << PASS1_BITS);
+    data_hi.x = (tmp10 - tmp11) * (1 << PASS1_BITS);
+
+    /* Outputs 2 and 6 */
+    uint32_t z1 = MULTIPLY(uint32_t, tmp12 + tmp13, FIX_0_541196100);
+    data_lo.z = DESCALE(z1 + MULTIPLY(uint32_t, tmp13, FIX_0_765366865), CONST_BITS-PASS1_BITS);
+    data_hi.z = DESCALE(z1 + MULTIPLY(uint32_t, tmp12, -FIX_1_847759065), CONST_BITS-PASS1_BITS);
+
+    /* Odd part per figure 8 --- note paper omits factor of sqrt(2).
+     * cK represents cos(K*pi/16).
+     * i0..i3 in the paper are tmp4..tmp7 here.
+     */
+    z1 = tmp4 + tmp7;
+    uint32_t z2 = tmp5 + tmp6;
+    uint32_t z3 = tmp4 + tmp6;
+    uint32_t z4 = tmp5 + tmp7;
+    uint32_t z5 = (z3 + z4) * FIX_1_175875602; /* sqrt(2) * c3 */
+
+    tmp4 = MULTIPLY(int32_t, tmp4, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */
+    tmp5 = MULTIPLY(int32_t, tmp5, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */
+    tmp6 = MULTIPLY(int32_t, tmp6, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */
+    tmp7 = MULTIPLY(int32_t, tmp7, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */
+    z1 = MULTIPLY(uint32_t, z1, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */
+    z2 = MULTIPLY(uint32_t, z2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
+    z3 = MULTIPLY(uint32_t, z3, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
+    z4 = MULTIPLY(uint32_t, z4, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */
+
+    z3 += z5;
+    z4 += z5;
+
+    /* Outputs 7, 5, 3 and 1 */
+    data_hi.w = DESCALE(uint32_t(tmp4) + z1 + z3, CONST_BITS - PASS1_BITS);
+    data_hi.y = DESCALE(uint32_t(tmp5) + z2 + z4, CONST_BITS - PASS1_BITS);
+    data_lo.w = DESCALE(uint32_t(tmp6) + z2 + z3, CONST_BITS - PASS1_BITS);
+    data_lo.y = DESCALE(uint32_t(tmp7) + z1 + z4, CONST_BITS - PASS1_BITS);
+
+    /* Publish the transformed row, narrowed to 16 bits, for the column pass */
+    coeffs[mb][block][row_idx][0] = i16vec4(data_lo);
+    coeffs[mb][block][row_idx][1] = i16vec4(data_hi);
+}
+
+/*
+ * Column pass of the 8x8 forward DCT, in place in the shared coeffs array.
+ *
+ * gl_LocalInvocationID.x selects the column (0..7) of block
+ * [gl_LocalInvocationID.z][gl_LocalInvocationID.y]; the invocation reads
+ * that column from all eight rows and writes it back.
+ *
+ * NOTE(review): the rows read here are written by other invocations in
+ * row_fdct(), so the caller has to synchronise the workgroup between the
+ * two passes.
+ */
+void ff_jpeg_fdct_islow_10()
+{
+    /* Column c is component (c & 3) of the i16vec4 at index c / 4 */
+    uint col_half = gl_LocalInvocationID.x / 4;
+    uint col = gl_LocalInvocationID.x & 3u;
+    uint block = gl_LocalInvocationID.y;
+    uint mb = gl_LocalInvocationID.z;
+
+    /* Gather the column: rows 0..3 into col_lo, rows 4..7 into col_hi */
+    i16vec4 col_lo = i16vec4(coeffs[mb][block][0][col_half][col],
+                             coeffs[mb][block][1][col_half][col],
+                             coeffs[mb][block][2][col_half][col],
+                             coeffs[mb][block][3][col_half][col]);
+    i16vec4 col_hi = i16vec4(coeffs[mb][block][4][col_half][col],
+                             coeffs[mb][block][5][col_half][col],
+                             coeffs[mb][block][6][col_half][col],
+                             coeffs[mb][block][7][col_half][col]);
+    i32vec4 data_lo = i32vec4(col_lo);
+    i32vec4 data_hi = i32vec4(col_hi);
+
+    /* Pass 2: process columns.
+     * We remove the PASS1_BITS scaling, but leave the results scaled up
+     * by an overall factor of 8.
+     */
+    int32_t tmp0 = data_lo.x + data_hi.w;
+    int32_t tmp7 = data_lo.x - data_hi.w;
+    int32_t tmp1 = data_lo.y + data_hi.z;
+    int32_t tmp6 = data_lo.y - data_hi.z;
+    int32_t tmp2 = data_lo.z + data_hi.y;
+    int32_t tmp5 = data_lo.z - data_hi.y;
+    int32_t tmp3 = data_lo.w + data_hi.x;
+    int32_t tmp4 = data_lo.w - data_hi.x;
+
+    /* Even part per LL&M figure 1 --- note that published figure is faulty;
+     * rotator "sqrt(2)*c1" should be "sqrt(2)*c6".
+     */
+    int32_t tmp10 = tmp0 + tmp3;
+    int32_t tmp13 = tmp0 - tmp3;
+    int32_t tmp11 = tmp1 + tmp2;
+    int32_t tmp12 = tmp1 - tmp2;
+
+    /* Outputs 0 and 4 */
+    data_lo.x = DESCALE(tmp10 + tmp11, OUT_SHIFT);
+    data_hi.x = DESCALE(tmp10 - tmp11, OUT_SHIFT);
+
+    /* Outputs 2 and 6 */
+    uint32_t z1 = uint32_t((tmp12 + tmp13) * FIX_0_541196100);
+    data_lo.z = DESCALE(z1 + uint32_t(tmp13 * FIX_0_765366865), CONST_BITS + OUT_SHIFT);
+    data_hi.z = DESCALE(z1 + uint32_t(tmp12 * (-FIX_1_847759065)), CONST_BITS + OUT_SHIFT);
+
+    /* Odd part per figure 8 --- note paper omits factor of sqrt(2).
+     * cK represents cos(K*pi/16).
+     * i0..i3 in the paper are tmp4..tmp7 here.
+     */
+    z1 = tmp4 + tmp7;
+    uint32_t z2 = tmp5 + tmp6;
+    uint32_t z3 = tmp4 + tmp6;
+    uint32_t z4 = tmp5 + tmp7;
+    uint32_t z5 = MULTIPLY(uint32_t, z3 + z4, FIX_1_175875602); /* sqrt(2) * c3 */
+
+    tmp4 = MULTIPLY(int32_t, tmp4, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */
+    tmp5 = MULTIPLY(int32_t, tmp5, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */
+    tmp6 = MULTIPLY(int32_t, tmp6, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */
+    tmp7 = MULTIPLY(int32_t, tmp7, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */
+    z1 = MULTIPLY(uint32_t, z1, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */
+    z2 = MULTIPLY(uint32_t, z2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
+    z3 = MULTIPLY(uint32_t, z3, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
+    z4 = MULTIPLY(uint32_t, z4, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */
+
+    z3 += z5;
+    z4 += z5;
+
+    /* Outputs 7, 5, 3 and 1 */
+    data_hi.w = DESCALE(tmp4 + z1 + z3, CONST_BITS + OUT_SHIFT);
+    data_hi.y = DESCALE(tmp5 + z2 + z4, CONST_BITS + OUT_SHIFT);
+    data_lo.w = DESCALE(tmp6 + z2 + z3, CONST_BITS + OUT_SHIFT);
+    data_lo.y = DESCALE(tmp7 + z1 + z4, CONST_BITS + OUT_SHIFT);
+
+    /* Scatter the column back, narrowed to 16 bits */
+    col_lo = i16vec4(data_lo);
+    col_hi = i16vec4(data_hi);
+    coeffs[mb][block][0][col_half][col] = col_lo.x;
+    coeffs[mb][block][1][col_half][col] = col_lo.y;
+    coeffs[mb][block][2][col_half][col] = col_lo.z;
+    coeffs[mb][block][3][col_half][col] = col_lo.w;
+    coeffs[mb][block][4][col_half][col] = col_hi.x;
+    coeffs[mb][block][5][col_half][col] = col_hi.y;
+    coeffs[mb][block][6][col_half][col] = col_hi.z;
+    coeffs[mb][block][7][col_half][col] = col_hi.w;
+}
+
+/* Table of possible edge slice configurations.
+ * Indexed by the number of macroblocks left over once a row has been cut
+ * into full MAX_MBS_PER_SLICE-wide slices (see main()). Each entry holds the
+ * macroblock counts of up to three edge slices, left to right, with 0 marking
+ * an unused slot; the counts are the powers of two that sum to the index. */
+const uvec3 edge_mps_table[8] = uvec3[](
+    uvec3(0, 0, 0),
+    uvec3(1, 0, 0),
+    uvec3(2, 0, 0),
+    uvec3(2, 1, 0),
+    uvec3(4, 0, 0),
+    uvec3(4, 1, 0),
+    uvec3(4, 2, 0),
+    uvec3(4, 2, 1)
+);
+
+/*
+ * One workgroup per slice; inside it gl_LocalInvocationID selects
+ * (row of the block, block of the macroblock, macroblock of the slice).
+ *
+ * Each invocation loads one 8-sample row of its block, runs the row pass of
+ * the DCT, then the column pass for the column with the same index, and
+ * finally copies one finished row from shared memory to the slice buffer.
+ *
+ * Changes relative to the previous version:
+ *  - a barrier() separates the row pass from the column pass, which reads
+ *    rows written by the other invocations of the block through shared
+ *    memory;
+ *  - every sample's x coordinate is clamped on its own, so a row that
+ *    crosses the right plane edge repeats the edge texel instead of fetching
+ *    outside the image.
+ */
+void main()
+{
+    bool is_chroma = plane == 1 || plane == 2;
+    uint row_idx = gl_LocalInvocationID.x;
+    uint block = gl_LocalInvocationID.y;
+    uint macroblock = gl_LocalInvocationID.z;
+    uint slice_x = gl_WorkGroupID.x;
+
+    /* Calculate the current thread coordinate in input plane */
+    uint mbs_per_slice = MAX_MBS_PER_SLICE;
+    /* Macroblock width in texels of this plane */
+    uint mb_width = 4u * BLOCKS_PER_MB;
+    uint slices_width = WIDTH_IN_MB / MAX_MBS_PER_SLICE;
+    uvec2 slice_base = gl_WorkGroupID.xy * uvec2(MAX_MBS_PER_SLICE * mb_width, DCTSIZE * 2u);
+
+    /* Handle slice macroblock size reduction on edge slices */
+    if (slice_x >= slices_width)
+    {
+        uint edge_slice = slice_x - slices_width;
+        uvec3 table = edge_mps_table[WIDTH_IN_MB - slices_width * MAX_MBS_PER_SLICE];
+        /* First macroblock of each edge slice, relative to the full slices */
+        uvec3 base = uvec3(0u, table.x, table.x + table.y);
+        slice_base.x = (MAX_MBS_PER_SLICE * slices_width + base[edge_slice]) * mb_width;
+        mbs_per_slice = table[edge_slice];
+    }
+
+    uvec2 mb_base = slice_base + uvec2(macroblock * mb_width, 0u);
+    /* Blocks are ordered row-major in luma and column-major in chroma */
+    uvec2 block_coord = is_chroma ? uvec2(block >> 1u, block & 1u) : uvec2(block & 1u, block >> 1u);
+    ivec2 coord = ivec2(mb_base + block_coord * DCTSIZE + uvec2(0u, row_idx));
+    ivec2 size = textureSize(planes[plane], 0);
+    coord.y = coord.y * pictures_per_frame + line_add;
+    coord.y = min(coord.y, size.y - 1);
+
+    /* Load coefficients from input planes, repeating the last column past the edge */
+    int last_x = size.x - 1;
+    i32vec4 row_lo;
+    i32vec4 row_hi;
+    for (int k = 0; k < 4; k++)
+    {
+        row_lo[k] = texelFetch(planes[plane], ivec2(min(coord.x + k, last_x), coord.y), 0).x;
+        row_hi[k] = texelFetch(planes[plane], ivec2(min(coord.x + 4 + k, last_x), coord.y), 0).x;
+    }
+
+    /* Perform DCT on the coefficients */
+    row_fdct(row_lo, row_hi);
+    /* All rows of the block must be in shared memory before columns are read */
+    barrier();
+    ff_jpeg_fdct_islow_10();
+    /* All columns must be written back before rows are copied out */
+    barrier();
+
+    /* Store DCT result to slice buffer */
+    uint slice = gl_WorkGroupID.y * gl_NumWorkGroups.x + gl_WorkGroupID.x;
+    uint slice_row = macroblock * BLOCKS_PER_MB * DCTSIZE + block * DCTSIZE + row_idx;
+    slices[slice].mbs_per_slice = mbs_per_slice;
+    slices[slice].rows[plane][slice_row] = coeffs[macroblock][block][row_idx];
+}
\ No newline at end of file
diff --git a/libavcodec/vulkan/prores_ks_trellis_node.comp b/libavcodec/vulkan/prores_ks_trellis_node.comp
new file mode 100644
index 0000000000..052e47ac5f
--- /dev/null
+++ b/libavcodec/vulkan/prores_ks_trellis_node.comp
@@ -0,0 +1,177 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#extension GL_KHR_shader_subgroup_arithmetic : require
+
+layout(push_constant, scalar) uniform TrellisNodeInfo { /* Push-constant parameters read by the quantiser search in this shader. */
+    int min_quant; /* First quantiser index visited by every q / pq loop. */
+    int max_quant; /* Loops run up to max_quant + 1 inclusive; that extra slot carries the overquant entry. */
+    int mbs_per_slice; /* Multiplied by (slice_x + 1) to get the macroblock count behind the bit limit. */
+    int bits_per_mb; /* Multiplied by that macroblock count to form bits_limit. */
+};
+
+#define TRELLIS_WIDTH 16 /* Stride between consecutive slice columns inside nodes[]. */
+#define SCORE_LIMIT   1073741823 /* 2^30 - 1; saturated score assigned to any path that goes over its bit limit. */
+
+struct TrellisNode { /* One (slice column, quantiser slot) entry of the trellis. */
+    int prev_node; /* Index into nodes[] of the chosen predecessor; -1 while none has been recorded. */
+    int quant; /* Quantiser value copied to scores[slice].quant when this node lies on the chosen path. */
+    int bits; /* total_bits accumulated along the path that ends at this node. */
+    int score; /* total_score accumulated along that path, capped at SCORE_LIMIT. */
+};
+
+shared int subgroup_sizes[NUM_SUBGROUPS]; /* Workgroup-shared: summed slice-row size of each subgroup, filled in main(). */
+
+int slice_sizes[SLICES_WIDTH]; /* Private to each invocation: total_bits / 8 for every slice in the row this invocation handles. */
+
+TrellisNode nodes[(SLICES_WIDTH + 1) * TRELLIS_WIDTH]; /* Private to each invocation: column 0 is the start state, column x + 1 belongs to slice x. */
+
+int find_slice_quant(int slice_x) /* Fills trellis column slice_x + 1 from column slice_x and picks its lowest-score node. */
+{ /* Returns an index into nodes[] (not a quantiser value); expects column slice_x to be populated already. */
+    int slice = int(gl_LocalInvocationID.x) * SLICES_WIDTH + slice_x; /* Index into scores[]: each invocation owns one row of SLICES_WIDTH slices. */
+
+    int trellis_node = int(slice_x + 1) * TRELLIS_WIDTH; /* Base index of this slice's column in nodes[]. */
+    for (int q = min_quant; q < max_quant + 2; q++) /* NOTE(review): q reaches max_quant + 1, so this stays inside the column only while max_quant + 1 < TRELLIS_WIDTH. */
+    {
+        nodes[trellis_node + q].prev_node = -1; /* Mark as not yet reached. */
+        nodes[trellis_node + q].quant = q;
+    }
+
+    int mbs = int(slice_x + 1) * mbs_per_slice; /* Macroblock count for slices 0..slice_x, taking mbs_per_slice for each. */
+    nodes[trellis_node + max_quant + 1].quant = scores[slice].overquant; /* The extra slot reports the overquant value instead of its own index. */
+
+    int bits_limit = mbs * bits_per_mb; /* Bit budget for the row up to and including this slice. */
+    for (int pq = min_quant; pq < max_quant + 2; pq++) /* Every node of the previous column... */
+    {
+        int prev = trellis_node - TRELLIS_WIDTH + pq;
+        for (int q = min_quant; q < max_quant + 2; q++) /* ...paired with every node of this column. */
+        {
+            int cur = trellis_node + q;
+            int bits = nodes[prev].bits + scores[slice].total_bits[q]; /* Bits used so far if this slice takes slot q. */
+            int error = scores[slice].total_score[q];
+            if (bits > bits_limit) /* Over budget: treat this transition as worst-case. */
+                error = SCORE_LIMIT;
+
+            int new_score;
+            if (nodes[prev].score < SCORE_LIMIT && error < SCORE_LIMIT) /* Only add when neither side is already saturated. */
+                new_score = nodes[prev].score + error;
+            else
+                new_score = SCORE_LIMIT;
+            if (nodes[cur].prev_node == -1 || nodes[cur].score >= new_score) /* >= means a tie is taken by the later (larger pq) predecessor. */
+            {
+                nodes[cur].bits      = bits;
+                nodes[cur].score     = new_score;
+                nodes[cur].prev_node = prev;
+            }
+        }
+    }
+
+    int error = nodes[trellis_node + min_quant].score; /* Best score seen so far in this column. */
+    int pq = trellis_node + min_quant; /* Index of the node holding that score. */
+    for (int q = min_quant + 1; q < max_quant + 2; q++)
+    {
+        if (nodes[trellis_node + q].score <= error) /* <= means a tie is taken by the larger q. */
+        {
+            error = nodes[trellis_node + q].score;
+            pq = trellis_node + q;
+        }
+    }
+
+    return pq;
+}
+
+int find_slice_row_quants() /* Runs the trellis over this invocation's slice row and stores the chosen quantiser of every slice. */
+{ /* Returns the row size: per-slice header sizes plus the per-slice total_bits / 8 values. */
+    for (int i = min_quant; i < max_quant + 2; i++) /* Column 0 is the start state: no predecessor, zero bits, zero score. */
+    {
+        nodes[i].prev_node = -1;
+        nodes[i].bits = 0;
+        nodes[i].score = 0;
+    }
+
+    int q = 0; /* Ends up as the nodes[] index of the best node in the last column. */
+    for (int slice_x = 0; slice_x < SLICES_WIDTH; ++slice_x) /* Forward pass, one column per slice. */
+    {
+        q = find_slice_quant(slice_x);
+    }
+
+    int slice_hdr_size = 2 * NUM_PLANES; /* Presumably header bytes per slice - TODO confirm against the bitstream writer. */
+    int slice_row_size = slice_hdr_size * SLICES_WIDTH; /* Start with the headers of every slice in the row. */
+    int y = int(gl_LocalInvocationID.x); /* Slice row handled by this invocation. */
+    for (int x = SLICES_WIDTH - 1; x >= 0; x--) /* Backward pass: follow prev_node links from the last slice to the first. */
+    {
+        int slice = x + y * SLICES_WIDTH;
+        int quant = nodes[q].quant;
+        int q_idx = min(quant, max_quant + 1); /* Clamp to the extra slot; quant comes from overquant there and may be larger - TODO confirm. */
+        slice_sizes[x] = scores[slice].total_bits[q_idx] / 8; /* Integer division by 8 (bits to bytes, truncating). */
+        slice_row_size += slice_sizes[x];
+        scores[slice].quant = quant; /* The unclamped value is what gets stored. */
+        q = nodes[q].prev_node; /* Step to the predecessor in the previous column. */
+    }
+
+    return slice_row_size;
+}
+
+int force_slice_row_quants() /* Fixed-quantiser path: assigns FORCE_QUANT to every slice of this invocation's row, no trellis search. */
+{ /* Returns the row size: per-slice header sizes plus the per-slice total_bits / 8 values. */
+    int slice_hdr_size = 2 * NUM_PLANES; /* Presumably header bytes per slice - TODO confirm against the bitstream writer. */
+    int slice_row_size = slice_hdr_size * SLICES_WIDTH; /* Start with the headers of every slice in the row. */
+    int y = int(gl_LocalInvocationID.x); /* Slice row handled by this invocation. */
+    for (int x = SLICES_WIDTH - 1; x >= 0; x--)
+    {
+        int slice = x + y * SLICES_WIDTH;
+        slice_sizes[x] = scores[slice].total_bits[FORCE_QUANT] / 8; /* Integer division by 8 (bits to bytes, truncating). */
+        slice_row_size += slice_sizes[x];
+        scores[slice].quant = FORCE_QUANT;
+    }
+
+    return slice_row_size;
+}
+
+void main() /* Chooses quantisers for one slice row per invocation, then assigns every slice its start offset via a prefix sum of row sizes. */
+{ /* Writes scores[slice].quant, scores[slice].buf_start and frame_size (scores and frame_size are declared elsewhere). */
+#if FORCE_QUANT == 0
+    int slice_row_size = find_slice_row_quants(); /* Trellis search when no quantiser is forced at compile time. */
+#else
+    int slice_row_size = force_slice_row_quants();
+#endif
+
+    int subgroup_sum = subgroupAdd(slice_row_size); /* Total size of all rows handled by this subgroup. */
+    subgroup_sizes[gl_SubgroupID] = subgroup_sum; /* Every invocation of the subgroup stores the same total. */
+    barrier(); /* Wait until all subgroups have stored their totals before reading them below. */
+
+    int buf_start = subgroupExclusiveAdd(slice_row_size); /* Sizes of the rows that precede this one inside the subgroup. */
+    [[unroll]] for (int i = 0; i < NUM_SUBGROUPS; ++i) /* Add the totals of every earlier subgroup. */
+    {
+        if (i >= gl_SubgroupID)
+            break;
+        buf_start += subgroup_sizes[i];
+    }
+
+    int slice_hdr_size = 2 * NUM_PLANES;
+    int y = int(gl_LocalInvocationID.x); /* NOTE(review): only the local ID is used, so offsets are relative to this workgroup. */
+    [[unroll]] for (int x = 0; x < SLICES_WIDTH; ++x) /* Lay the slices of this row out back to back. */
+    {
+        int slice = x + y * SLICES_WIDTH;
+        scores[slice].buf_start = buf_start;
+        buf_start += slice_hdr_size + slice_sizes[x]; /* Advance past this slice's header and payload. */
+    }
+
+    if (y == gl_WorkGroupSize.x - 1) /* The last invocation's running offset is now the end of the final slice. */
+        frame_size = buf_start;
+}
\ No newline at end of file
-- 
2.50.1

_______________________________________________
ffmpeg-devel mailing list -- ffmpeg-devel@ffmpeg.org
To unsubscribe send an email to ffmpeg-devel-leave@ffmpeg.org

^ permalink raw reply	[flat|nested] 3+ messages in thread

end of thread, other threads:[~2025-09-04 20:12 UTC | newest]

Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2025-09-04 20:10 [FFmpeg-devel] [PATCH 1/3] vulkan: Support samplerless images IndecisiveTurtle via ffmpeg-devel
2025-09-04 20:10 ` [FFmpeg-devel] [PATCH 2/3] lavc: Split out common components used by vulkan prores encoder IndecisiveTurtle via ffmpeg-devel
2025-09-04 20:10 ` [FFmpeg-devel] [PATCH 3/3] lavc: implement a Vulkan-based " IndecisiveTurtle via ffmpeg-devel

Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

This inbox may be cloned and mirrored by anyone:

	git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
		ffmpegdev@gitmailbox.com
	public-inbox-index ffmpegdev

Example config snippet for mirrors.


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git