[FFmpeg-devel] [PATCH] [GSoC 2025] Add vulkan compute based prores encoder (PR #20477)

Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
 help / color / mirror / Atom feed

* [FFmpeg-devel] [PATCH] [GSoC 2025] Add vulkan compute based prores encoder (PR #20477)
@ 2025-09-09 12:59 indecisive_turtle via ffmpeg-devel
  0 siblings, 0 replies; only message in thread
From: indecisive_turtle via ffmpeg-devel @ 2025-09-09 12:59 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: indecisive_turtle

PR #20477 opened by indecisive_turtle
URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/20477
Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/20477.patch

This is a Vulkan compute implementation of the ProRes "kostya" encoder. On my NVIDIA GTX 1650 it is about 3-4x faster than the reference code. There is also a new option, ported from the FFv1 Vulkan encoder, to use multiple compute queues, which could help more powerful GPUs.

All profiles supported by the reference encoder are also supported here. Some common initialization and bytestream-writing code has been split into a separate translation unit so that it can be shared by the encoders.

There are still some possible improvements, such as splitting DC coefficient estimation into a separate dispatch with one thread per block instead of a loop per slice, but this is a good starting point.


>From 1fdd5187c2ff538d3273cb2a5f3c7aa5772a6910 Mon Sep 17 00:00:00 2001
From: IndecisiveTurtle <geoster3d@gmail.com>
Date: Wed, 3 Sep 2025 22:52:58 +0300
Subject: [PATCH 1/3] vulkan: Support samplerless images

---
 libavutil/vulkan.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/libavutil/vulkan.c b/libavutil/vulkan.c
index ef755ad6f7..74eab88434 100644
--- a/libavutil/vulkan.c
+++ b/libavutil/vulkan.c
@@ -2507,7 +2507,8 @@ print:
         GLSLA("layout (set = %i, binding = %i", FFMAX(shd->nb_descriptor_sets - 1, 0), i);
 
         if (desc[i].mem_layout &&
-            (desc[i].type != VK_DESCRIPTOR_TYPE_STORAGE_IMAGE))
+            (desc[i].type != VK_DESCRIPTOR_TYPE_STORAGE_IMAGE) &&
+            (desc[i].type != VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE))
             GLSLA(", %s", desc[i].mem_layout);
 
         GLSLA(")");
@@ -2520,7 +2521,7 @@ print:
 
         if (prop->type) {
             GLSLA(" ");
-            if (desc[i].type == VK_DESCRIPTOR_TYPE_STORAGE_IMAGE) {
+            if (desc[i].type == VK_DESCRIPTOR_TYPE_STORAGE_IMAGE || desc[i].type == VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE) {
                 if (desc[i].mem_layout) {
                     int len = strlen(desc[i].mem_layout);
                     if (desc[i].mem_layout[len - 1] == 'i' &&
-- 
2.49.1


>From 594a8d5afaaac75887ddb68edcae39f2fcfb2edc Mon Sep 17 00:00:00 2001
From: IndecisiveTurtle <geoster3d@gmail.com>
Date: Wed, 3 Sep 2025 21:54:48 +0300
Subject: [PATCH 2/3] lavc: Split out common components used by vulkan prores
 encoder

---
 libavcodec/Makefile                  |   2 +-
 libavcodec/proresenc_kostya.c        | 414 +--------------------------
 libavcodec/proresenc_kostya_common.c | 364 +++++++++++++++++++++++
 libavcodec/proresenc_kostya_common.h | 131 +++++++++
 4 files changed, 511 insertions(+), 400 deletions(-)
 create mode 100644 libavcodec/proresenc_kostya_common.c
 create mode 100644 libavcodec/proresenc_kostya_common.h

diff --git a/libavcodec/Makefile b/libavcodec/Makefile
index 3d036de4b6..d8e1ac5a54 100644
--- a/libavcodec/Makefile
+++ b/libavcodec/Makefile
@@ -638,7 +638,7 @@ OBJS-$(CONFIG_PPM_ENCODER)             += pnmenc.o
 OBJS-$(CONFIG_PRORES_DECODER)          += proresdec.o proresdsp.o proresdata.o
 OBJS-$(CONFIG_PRORES_ENCODER)          += proresenc_anatoliy.o proresdata.o
 OBJS-$(CONFIG_PRORES_AW_ENCODER)       += proresenc_anatoliy.o proresdata.o
-OBJS-$(CONFIG_PRORES_KS_ENCODER)       += proresenc_kostya.o proresdata.o
+OBJS-$(CONFIG_PRORES_KS_ENCODER)       += proresenc_kostya.o proresdata.o proresenc_kostya_common.o
 OBJS-$(CONFIG_PRORES_RAW_DECODER)      += prores_raw.o proresdsp.o proresdata.o
 OBJS-$(CONFIG_PRORES_VIDEOTOOLBOX_ENCODER) += videotoolboxenc.o
 OBJS-$(CONFIG_PROSUMER_DECODER)        += prosumer.o
diff --git a/libavcodec/proresenc_kostya.c b/libavcodec/proresenc_kostya.c
index b98bc5c195..31d22a14ac 100644
--- a/libavcodec/proresenc_kostya.c
+++ b/libavcodec/proresenc_kostya.c
@@ -33,179 +33,7 @@
 #include "profiles.h"
 #include "bytestream.h"
 #include "proresdata.h"
-
-#define CFACTOR_Y422 2
-#define CFACTOR_Y444 3
-
-#define MAX_MBS_PER_SLICE 8
-
-#define MAX_PLANES 4
-
-enum {
-    PRORES_PROFILE_AUTO  = -1,
-    PRORES_PROFILE_PROXY = 0,
-    PRORES_PROFILE_LT,
-    PRORES_PROFILE_STANDARD,
-    PRORES_PROFILE_HQ,
-    PRORES_PROFILE_4444,
-    PRORES_PROFILE_4444XQ,
-};
-
-enum {
-    QUANT_MAT_PROXY = 0,
-    QUANT_MAT_PROXY_CHROMA,
-    QUANT_MAT_LT,
-    QUANT_MAT_STANDARD,
-    QUANT_MAT_HQ,
-    QUANT_MAT_XQ_LUMA,
-    QUANT_MAT_DEFAULT,
-};
-
-static const uint8_t prores_quant_matrices[][64] = {
-    { // proxy
-         4,  7,  9, 11, 13, 14, 15, 63,
-         7,  7, 11, 12, 14, 15, 63, 63,
-         9, 11, 13, 14, 15, 63, 63, 63,
-        11, 11, 13, 14, 63, 63, 63, 63,
-        11, 13, 14, 63, 63, 63, 63, 63,
-        13, 14, 63, 63, 63, 63, 63, 63,
-        13, 63, 63, 63, 63, 63, 63, 63,
-        63, 63, 63, 63, 63, 63, 63, 63,
-    },
-    { // proxy chromas
-        4,  7,  9, 11, 13, 14, 63, 63,
-        7,  7, 11, 12, 14, 63, 63, 63,
-        9, 11, 13, 14, 63, 63, 63, 63,
-        11, 11, 13, 14, 63, 63, 63, 63,
-        11, 13, 14, 63, 63, 63, 63, 63,
-        13, 14, 63, 63, 63, 63, 63, 63,
-        13, 63, 63, 63, 63, 63, 63, 63,
-        63, 63, 63, 63, 63, 63, 63, 63
-    },
-    { // LT
-         4,  5,  6,  7,  9, 11, 13, 15,
-         5,  5,  7,  8, 11, 13, 15, 17,
-         6,  7,  9, 11, 13, 15, 15, 17,
-         7,  7,  9, 11, 13, 15, 17, 19,
-         7,  9, 11, 13, 14, 16, 19, 23,
-         9, 11, 13, 14, 16, 19, 23, 29,
-         9, 11, 13, 15, 17, 21, 28, 35,
-        11, 13, 16, 17, 21, 28, 35, 41,
-    },
-    { // standard
-         4,  4,  5,  5,  6,  7,  7,  9,
-         4,  4,  5,  6,  7,  7,  9,  9,
-         5,  5,  6,  7,  7,  9,  9, 10,
-         5,  5,  6,  7,  7,  9,  9, 10,
-         5,  6,  7,  7,  8,  9, 10, 12,
-         6,  7,  7,  8,  9, 10, 12, 15,
-         6,  7,  7,  9, 10, 11, 14, 17,
-         7,  7,  9, 10, 11, 14, 17, 21,
-    },
-    { // high quality
-         4,  4,  4,  4,  4,  4,  4,  4,
-         4,  4,  4,  4,  4,  4,  4,  4,
-         4,  4,  4,  4,  4,  4,  4,  4,
-         4,  4,  4,  4,  4,  4,  4,  5,
-         4,  4,  4,  4,  4,  4,  5,  5,
-         4,  4,  4,  4,  4,  5,  5,  6,
-         4,  4,  4,  4,  5,  5,  6,  7,
-         4,  4,  4,  4,  5,  6,  7,  7,
-    },
-    { // XQ luma
-        2,  2,  2,  2,  2,  2,  2,  2,
-        2,  2,  2,  2,  2,  2,  2,  2,
-        2,  2,  2,  2,  2,  2,  2,  2,
-        2,  2,  2,  2,  2,  2,  2,  3,
-        2,  2,  2,  2,  2,  2,  3,  3,
-        2,  2,  2,  2,  2,  3,  3,  3,
-        2,  2,  2,  2,  3,  3,  3,  4,
-        2,  2,  2,  2,  3,  3,  4,  4,
-    },
-    { // codec default
-         4,  4,  4,  4,  4,  4,  4,  4,
-         4,  4,  4,  4,  4,  4,  4,  4,
-         4,  4,  4,  4,  4,  4,  4,  4,
-         4,  4,  4,  4,  4,  4,  4,  4,
-         4,  4,  4,  4,  4,  4,  4,  4,
-         4,  4,  4,  4,  4,  4,  4,  4,
-         4,  4,  4,  4,  4,  4,  4,  4,
-         4,  4,  4,  4,  4,  4,  4,  4,
-    },
-};
-
-#define NUM_MB_LIMITS 4
-static const int prores_mb_limits[NUM_MB_LIMITS] = {
-    1620, // up to 720x576
-    2700, // up to 960x720
-    6075, // up to 1440x1080
-    9216, // up to 2048x1152
-};
-
-static const struct prores_profile {
-    const char *full_name;
-    uint32_t    tag;
-    int         min_quant;
-    int         max_quant;
-    int         br_tab[NUM_MB_LIMITS];
-    int         quant;
-    int         quant_chroma;
-} prores_profile_info[6] = {
-    {
-        .full_name = "proxy",
-        .tag       = MKTAG('a', 'p', 'c', 'o'),
-        .min_quant = 4,
-        .max_quant = 8,
-        .br_tab    = { 300, 242, 220, 194 },
-        .quant     = QUANT_MAT_PROXY,
-        .quant_chroma = QUANT_MAT_PROXY_CHROMA,
-    },
-    {
-        .full_name = "LT",
-        .tag       = MKTAG('a', 'p', 'c', 's'),
-        .min_quant = 1,
-        .max_quant = 9,
-        .br_tab    = { 720, 560, 490, 440 },
-        .quant     = QUANT_MAT_LT,
-        .quant_chroma = QUANT_MAT_LT,
-    },
-    {
-        .full_name = "standard",
-        .tag       = MKTAG('a', 'p', 'c', 'n'),
-        .min_quant = 1,
-        .max_quant = 6,
-        .br_tab    = { 1050, 808, 710, 632 },
-        .quant     = QUANT_MAT_STANDARD,
-        .quant_chroma = QUANT_MAT_STANDARD,
-    },
-    {
-        .full_name = "high quality",
-        .tag       = MKTAG('a', 'p', 'c', 'h'),
-        .min_quant = 1,
-        .max_quant = 6,
-        .br_tab    = { 1566, 1216, 1070, 950 },
-        .quant     = QUANT_MAT_HQ,
-        .quant_chroma = QUANT_MAT_HQ,
-    },
-    {
-        .full_name = "4444",
-        .tag       = MKTAG('a', 'p', '4', 'h'),
-        .min_quant = 1,
-        .max_quant = 6,
-        .br_tab    = { 2350, 1828, 1600, 1425 },
-        .quant     = QUANT_MAT_HQ,
-        .quant_chroma = QUANT_MAT_HQ,
-    },
-    {
-        .full_name = "4444XQ",
-        .tag       = MKTAG('a', 'p', '4', 'x'),
-        .min_quant = 1,
-        .max_quant = 6,
-        .br_tab    = { 3525, 2742, 2400, 2137 },
-        .quant     = QUANT_MAT_HQ, /* Fix me : use QUANT_MAT_XQ_LUMA */
-        .quant_chroma = QUANT_MAT_HQ,
-    }
-};
+#include "proresenc_kostya_common.h"
 
 #define TRELLIS_WIDTH 16
 #define SCORE_LIMIT   INT_MAX / 2
@@ -217,8 +45,6 @@ struct TrellisNode {
     int score;
 };
 
-#define MAX_STORED_Q 16
-
 typedef struct ProresThreadData {
     DECLARE_ALIGNED(16, int16_t, blocks)[MAX_PLANES][64 * 4 * MAX_MBS_PER_SLICE];
     DECLARE_ALIGNED(16, uint16_t, emu_buf)[16 * 16];
@@ -227,49 +53,6 @@ typedef struct ProresThreadData {
     struct TrellisNode *nodes;
 } ProresThreadData;
 
-typedef struct ProresContext {
-    AVClass *class;
-    DECLARE_ALIGNED(16, int16_t, blocks)[MAX_PLANES][64 * 4 * MAX_MBS_PER_SLICE];
-    DECLARE_ALIGNED(16, uint16_t, emu_buf)[16*16];
-    int16_t quants[MAX_STORED_Q][64];
-    int16_t quants_chroma[MAX_STORED_Q][64];
-    int16_t custom_q[64];
-    int16_t custom_chroma_q[64];
-    const uint8_t *quant_mat;
-    const uint8_t *quant_chroma_mat;
-    const uint8_t *scantable;
-
-    void (*fdct)(FDCTDSPContext *fdsp, const uint16_t *src,
-                 ptrdiff_t linesize, int16_t *block);
-    FDCTDSPContext fdsp;
-
-    const AVFrame *pic;
-    int mb_width, mb_height;
-    int mbs_per_slice;
-    int num_chroma_blocks, chroma_factor;
-    int slices_width;
-    int slices_per_picture;
-    int pictures_per_frame; // 1 for progressive, 2 for interlaced
-    int cur_picture_idx;
-    int num_planes;
-    int bits_per_mb;
-    int force_quant;
-    int alpha_bits;
-    int warn;
-
-    char *vendor;
-    int quant_sel;
-
-    int frame_size_upper_bound;
-
-    int profile;
-    const struct prores_profile *profile_info;
-
-    int *slice_q;
-
-    ProresThreadData *tdata;
-} ProresContext;
-
 static void get_slice_data(ProresContext *ctx, const uint16_t *src,
                            ptrdiff_t linesize, int x, int y, int w, int h,
                            int16_t *blocks, uint16_t *emu_buf,
@@ -369,6 +152,8 @@ static void get_alpha_data(ProresContext *ctx, const uint16_t *src,
     }
 }
 
+int slice = 0;
+
 /**
  * Write an unsigned rice/exp golomb codeword.
  */
@@ -437,7 +222,6 @@ static void encode_acs(PutBitContext *pb, int16_t *blocks,
     int prev_level = 2;
     int run = 0, level;
     int max_coeffs, abs_level;
-
     max_coeffs = blocks_per_slice << 6;
 
     for (i = 1; i < 64; i++) {
@@ -685,7 +469,6 @@ static int estimate_acs(int *error, int16_t *blocks, int blocks_per_slice,
                 bits += estimate_vlc(ff_prores_run_to_cb[prev_run], run);
                 bits += estimate_vlc(ff_prores_level_to_cb[prev_level],
                                      abs_level - 1) + 1;
-
                 prev_run   = FFMIN(run, 15);
                 prev_level = FFMIN(abs_level, 9);
                 run    = 0;
@@ -905,7 +688,6 @@ static int find_slice_quant(AVCodecContext *avctx,
 
         for (q = min_quant; q < max_quant + 2; q++) {
             cur = trellis_node + q;
-
             bits  = td->nodes[prev].bits + slice_bits[q];
             error = slice_score[q];
             if (bits > bits_limit)
@@ -965,67 +747,33 @@ static int encode_frame(AVCodecContext *avctx, AVPacket *pkt,
                         const AVFrame *pic, int *got_packet)
 {
     ProresContext *ctx = avctx->priv_data;
-    uint8_t *orig_buf, *buf, *slice_hdr, *slice_sizes, *tmp;
+    uint8_t *orig_buf, *buf, *slice_hdr, *slice_sizes;
     uint8_t *picture_size_pos;
     PutBitContext pb;
     int x, y, i, mb, q = 0;
     int sizes[4] = { 0 };
-    int slice_hdr_size = 2 + 2 * (ctx->num_planes - 1);
+    int slice_hdr_size = 2 * ctx->num_planes;
     int frame_size, picture_size, slice_size;
     int pkt_size, ret;
     int max_slice_size = (ctx->frame_size_upper_bound - 200) / (ctx->pictures_per_frame * ctx->slices_per_picture + 1);
-    uint8_t frame_flags;
+    pkt_size = ctx->frame_size_upper_bound;
 
     ctx->pic = pic;
-    pkt_size = ctx->frame_size_upper_bound;
 
     if ((ret = ff_alloc_packet(avctx, pkt, pkt_size + FF_INPUT_BUFFER_MIN_SIZE)) < 0)
         return ret;
 
     orig_buf = pkt->data;
-
-    // frame atom
-    orig_buf += 4;                              // frame size
-    bytestream_put_be32  (&orig_buf, FRAME_ID); // frame container ID
-    buf = orig_buf;
-
-    // frame header
-    tmp = buf;
-    buf += 2;                                   // frame header size will be stored here
-    bytestream_put_be16  (&buf, ctx->chroma_factor != CFACTOR_Y422 || ctx->alpha_bits ? 1 : 0);
-    bytestream_put_buffer(&buf, ctx->vendor, 4);
-    bytestream_put_be16  (&buf, avctx->width);
-    bytestream_put_be16  (&buf, avctx->height);
-
-    frame_flags = ctx->chroma_factor << 6;
-    if (avctx->flags & AV_CODEC_FLAG_INTERLACED_DCT)
-        frame_flags |= (pic->flags & AV_FRAME_FLAG_TOP_FIELD_FIRST) ? 0x04 : 0x08;
-    bytestream_put_byte  (&buf, frame_flags);
-
-    bytestream_put_byte  (&buf, 0);             // reserved
-    bytestream_put_byte  (&buf, pic->color_primaries);
-    bytestream_put_byte  (&buf, pic->color_trc);
-    bytestream_put_byte  (&buf, pic->colorspace);
-    bytestream_put_byte  (&buf, ctx->alpha_bits >> 3);
-    bytestream_put_byte  (&buf, 0);             // reserved
-    if (ctx->quant_sel != QUANT_MAT_DEFAULT) {
-        bytestream_put_byte  (&buf, 0x03);      // matrix flags - both matrices are present
-        bytestream_put_buffer(&buf, ctx->quant_mat, 64);        // luma quantisation matrix
-        bytestream_put_buffer(&buf, ctx->quant_chroma_mat, 64); // chroma quantisation matrix
-    } else {
-        bytestream_put_byte  (&buf, 0x00);      // matrix flags - default matrices are used
-    }
-    bytestream_put_be16  (&tmp, buf - orig_buf); // write back frame header size
+    buf = ff_prores_kostya_write_frame_header(avctx, ctx, &orig_buf, pic->flags,
+                              pic->color_primaries, pic->color_trc,
+                                   pic->colorspace);
 
     for (ctx->cur_picture_idx = 0;
          ctx->cur_picture_idx < ctx->pictures_per_frame;
          ctx->cur_picture_idx++) {
         // picture header
         picture_size_pos = buf + 1;
-        bytestream_put_byte  (&buf, 0x40);          // picture header size (in bits)
-        buf += 4;                                   // picture data size will be stored here
-        bytestream_put_be16  (&buf, ctx->slices_per_picture);
-        bytestream_put_byte  (&buf, av_log2(ctx->mbs_per_slice) << 4); // slice width and height in MBs
+        buf = ff_prores_kostya_write_picture_header(ctx, buf);
 
         // seek table - will be filled during slice encoding
         slice_sizes = buf;
@@ -1048,7 +796,7 @@ static int encode_frame(AVCodecContext *avctx, AVPacket *pkt,
                 while (ctx->mb_width - x < mbs_per_slice)
                     mbs_per_slice >>= 1;
 
-                bytestream_put_byte(&buf, slice_hdr_size << 3);
+                bytestream_put_byte(&buf, slice_hdr_size * 8);
                 slice_hdr = buf;
                 buf += slice_hdr_size - 1;
                 if (pkt_size <= buf - orig_buf + 2 * max_slice_size) {
@@ -1076,13 +824,11 @@ static int encode_frame(AVCodecContext *avctx, AVPacket *pkt,
                         return ret;
 
                     pkt_size += delta;
-                    // restore pointers
                     orig_buf         = pkt->data + (orig_buf         - start);
                     buf              = pkt->data + (buf              - start);
                     picture_size_pos = pkt->data + (picture_size_pos - start);
                     slice_sizes      = pkt->data + (slice_sizes      - start);
                     slice_hdr        = pkt->data + (slice_hdr        - start);
-                    tmp              = pkt->data + (tmp              - start);
                 }
                 init_put_bits(&pb, buf, (pkt_size - (buf - orig_buf)));
                 ret = encode_slice(avctx, pic, &pb, sizes, x, y, q,
@@ -1149,105 +895,18 @@ static void prores_fdct(FDCTDSPContext *fdsp, const uint16_t *src,
 static av_cold int encode_init(AVCodecContext *avctx)
 {
     ProresContext *ctx = avctx->priv_data;
-    int mps;
-    int i, j;
-    int min_quant, max_quant;
-    int interlaced = !!(avctx->flags & AV_CODEC_FLAG_INTERLACED_DCT);
+    int err = 0, i, j, min_quant, max_quant;
 
-    avctx->bits_per_raw_sample = 10;
+    err = ff_prores_kostya_encode_init(avctx, ctx, avctx->pix_fmt);
+    if (err < 0)
+        return err;
 
     ctx->fdct      = prores_fdct;
-    ctx->scantable = interlaced ? ff_prores_interlaced_scan
-                                : ff_prores_progressive_scan;
     ff_fdctdsp_init(&ctx->fdsp, avctx);
 
-    mps = ctx->mbs_per_slice;
-    if (mps & (mps - 1)) {
-        av_log(avctx, AV_LOG_ERROR,
-               "there should be an integer power of two MBs per slice\n");
-        return AVERROR(EINVAL);
-    }
-    if (ctx->profile == PRORES_PROFILE_AUTO) {
-        const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(avctx->pix_fmt);
-        ctx->profile = (desc->flags & AV_PIX_FMT_FLAG_ALPHA ||
-                        !(desc->log2_chroma_w + desc->log2_chroma_h))
-                     ? PRORES_PROFILE_4444 : PRORES_PROFILE_HQ;
-        av_log(avctx, AV_LOG_INFO, "Autoselected %s. It can be overridden "
-               "through -profile option.\n", ctx->profile == PRORES_PROFILE_4444
-               ? "4:4:4:4 profile because of the used input colorspace"
-               : "HQ profile to keep best quality");
-    }
-    if (av_pix_fmt_desc_get(avctx->pix_fmt)->flags & AV_PIX_FMT_FLAG_ALPHA) {
-        if (ctx->profile != PRORES_PROFILE_4444 &&
-            ctx->profile != PRORES_PROFILE_4444XQ) {
-            // force alpha and warn
-            av_log(avctx, AV_LOG_WARNING, "Profile selected will not "
-                   "encode alpha. Override with -profile if needed.\n");
-            ctx->alpha_bits = 0;
-        }
-        if (ctx->alpha_bits & 7) {
-            av_log(avctx, AV_LOG_ERROR, "alpha bits should be 0, 8 or 16\n");
-            return AVERROR(EINVAL);
-        }
-        avctx->bits_per_coded_sample = 32;
-    } else {
-        ctx->alpha_bits = 0;
-    }
-
-    ctx->chroma_factor = avctx->pix_fmt == AV_PIX_FMT_YUV422P10
-                         ? CFACTOR_Y422
-                         : CFACTOR_Y444;
-    ctx->profile_info  = prores_profile_info + ctx->profile;
-    ctx->num_planes    = 3 + !!ctx->alpha_bits;
-
-    ctx->mb_width      = FFALIGN(avctx->width,  16) >> 4;
-
-    if (interlaced)
-        ctx->mb_height = FFALIGN(avctx->height, 32) >> 5;
-    else
-        ctx->mb_height = FFALIGN(avctx->height, 16) >> 4;
-
-    ctx->slices_width  = ctx->mb_width / mps;
-    ctx->slices_width += av_popcount(ctx->mb_width - ctx->slices_width * mps);
-    ctx->slices_per_picture = ctx->mb_height * ctx->slices_width;
-    ctx->pictures_per_frame = 1 + interlaced;
-
-    if (ctx->quant_sel == -1) {
-        ctx->quant_mat = prores_quant_matrices[ctx->profile_info->quant];
-        ctx->quant_chroma_mat = prores_quant_matrices[ctx->profile_info->quant_chroma];
-    } else {
-        ctx->quant_mat = prores_quant_matrices[ctx->quant_sel];
-        ctx->quant_chroma_mat = prores_quant_matrices[ctx->quant_sel];
-    }
-
-    if (strlen(ctx->vendor) != 4) {
-        av_log(avctx, AV_LOG_ERROR, "vendor ID should be 4 bytes\n");
-        return AVERROR_INVALIDDATA;
-    }
-
-    ctx->force_quant = avctx->global_quality / FF_QP2LAMBDA;
     if (!ctx->force_quant) {
-        if (!ctx->bits_per_mb) {
-            for (i = 0; i < NUM_MB_LIMITS - 1; i++)
-                if (prores_mb_limits[i] >= ctx->mb_width * ctx->mb_height *
-                                           ctx->pictures_per_frame)
-                    break;
-            ctx->bits_per_mb   = ctx->profile_info->br_tab[i];
-            if (ctx->alpha_bits)
-                ctx->bits_per_mb *= 20;
-        } else if (ctx->bits_per_mb < 128) {
-            av_log(avctx, AV_LOG_ERROR, "too few bits per MB, please set at least 128\n");
-            return AVERROR_INVALIDDATA;
-        }
-
         min_quant = ctx->profile_info->min_quant;
         max_quant = ctx->profile_info->max_quant;
-        for (i = min_quant; i < MAX_STORED_Q; i++) {
-            for (j = 0; j < 64; j++) {
-                ctx->quants[i][j] = ctx->quant_mat[j] * i;
-                ctx->quants_chroma[i][j] = ctx->quant_chroma_mat[j] * i;
-            }
-        }
 
         ctx->slice_q = av_malloc_array(ctx->slices_per_picture, sizeof(*ctx->slice_q));
         if (!ctx->slice_q)
@@ -1269,51 +928,8 @@ static av_cold int encode_init(AVCodecContext *avctx)
                 ctx->tdata[j].nodes[i].score     = 0;
             }
         }
-    } else {
-        int ls = 0;
-        int ls_chroma = 0;
-
-        if (ctx->force_quant > 64) {
-            av_log(avctx, AV_LOG_ERROR, "too large quantiser, maximum is 64\n");
-            return AVERROR_INVALIDDATA;
-        }
-
-        for (j = 0; j < 64; j++) {
-            ctx->quants[0][j] = ctx->quant_mat[j] * ctx->force_quant;
-            ctx->quants_chroma[0][j] = ctx->quant_chroma_mat[j] * ctx->force_quant;
-            ls += av_log2((1 << 11)  / ctx->quants[0][j]) * 2 + 1;
-            ls_chroma += av_log2((1 << 11)  / ctx->quants_chroma[0][j]) * 2 + 1;
-        }
-
-        ctx->bits_per_mb = ls * 4 + ls_chroma * 4;
-        if (ctx->chroma_factor == CFACTOR_Y444)
-            ctx->bits_per_mb += ls_chroma * 4;
     }
 
-    ctx->frame_size_upper_bound = (ctx->pictures_per_frame *
-                                   ctx->slices_per_picture + 1) *
-                                  (2 + 2 * ctx->num_planes +
-                                   (mps * ctx->bits_per_mb) / 8)
-                                  + 200;
-
-    if (ctx->alpha_bits) {
-         // The alpha plane is run-coded and might exceed the bit budget.
-         ctx->frame_size_upper_bound += (ctx->pictures_per_frame *
-                                         ctx->slices_per_picture + 1) *
-         /* num pixels per slice */     (ctx->mbs_per_slice * 256 *
-         /* bits per pixel */            (1 + ctx->alpha_bits + 1) + 7 >> 3);
-    }
-
-    avctx->codec_tag   = ctx->profile_info->tag;
-    avctx->profile = ctx->profile;
-
-    av_log(avctx, AV_LOG_DEBUG,
-           "profile %d, %d slices, interlacing: %s, %d bits per MB\n",
-           ctx->profile, ctx->slices_per_picture * ctx->pictures_per_frame,
-           interlaced ? "yes" : "no", ctx->bits_per_mb);
-    av_log(avctx, AV_LOG_DEBUG, "frame size upper bound: %d\n",
-           ctx->frame_size_upper_bound);
-
     return 0;
 }
 
diff --git a/libavcodec/proresenc_kostya_common.c b/libavcodec/proresenc_kostya_common.c
new file mode 100644
index 0000000000..d432d10369
--- /dev/null
+++ b/libavcodec/proresenc_kostya_common.c
@@ -0,0 +1,364 @@
+/*
+ * Apple ProRes encoder
+ *
+ * Copyright (c) 2011 Anatoliy Wasserman
+ * Copyright (c) 2012 Konstantin Shishkov
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/pixdesc.h"
+#include "avcodec.h"
+#include "bytestream.h"
+#include "proresdata.h"
+#include <sys/types.h>
+#include "proresenc_kostya_common.h"
+
+static const uint8_t prores_quant_matrices[][64] = {
+    { // proxy
+         4,  7,  9, 11, 13, 14, 15, 63,
+         7,  7, 11, 12, 14, 15, 63, 63,
+         9, 11, 13, 14, 15, 63, 63, 63,
+        11, 11, 13, 14, 63, 63, 63, 63,
+        11, 13, 14, 63, 63, 63, 63, 63,
+        13, 14, 63, 63, 63, 63, 63, 63,
+        13, 63, 63, 63, 63, 63, 63, 63,
+        63, 63, 63, 63, 63, 63, 63, 63,
+    },
+    { // proxy chromas
+        4,  7,  9, 11, 13, 14, 63, 63,
+        7,  7, 11, 12, 14, 63, 63, 63,
+        9, 11, 13, 14, 63, 63, 63, 63,
+        11, 11, 13, 14, 63, 63, 63, 63,
+        11, 13, 14, 63, 63, 63, 63, 63,
+        13, 14, 63, 63, 63, 63, 63, 63,
+        13, 63, 63, 63, 63, 63, 63, 63,
+        63, 63, 63, 63, 63, 63, 63, 63
+    },
+    { // LT
+         4,  5,  6,  7,  9, 11, 13, 15,
+         5,  5,  7,  8, 11, 13, 15, 17,
+         6,  7,  9, 11, 13, 15, 15, 17,
+         7,  7,  9, 11, 13, 15, 17, 19,
+         7,  9, 11, 13, 14, 16, 19, 23,
+         9, 11, 13, 14, 16, 19, 23, 29,
+         9, 11, 13, 15, 17, 21, 28, 35,
+        11, 13, 16, 17, 21, 28, 35, 41,
+    },
+    { // standard
+         4,  4,  5,  5,  6,  7,  7,  9,
+         4,  4,  5,  6,  7,  7,  9,  9,
+         5,  5,  6,  7,  7,  9,  9, 10,
+         5,  5,  6,  7,  7,  9,  9, 10,
+         5,  6,  7,  7,  8,  9, 10, 12,
+         6,  7,  7,  8,  9, 10, 12, 15,
+         6,  7,  7,  9, 10, 11, 14, 17,
+         7,  7,  9, 10, 11, 14, 17, 21,
+    },
+    { // high quality
+         4,  4,  4,  4,  4,  4,  4,  4,
+         4,  4,  4,  4,  4,  4,  4,  4,
+         4,  4,  4,  4,  4,  4,  4,  4,
+         4,  4,  4,  4,  4,  4,  4,  5,
+         4,  4,  4,  4,  4,  4,  5,  5,
+         4,  4,  4,  4,  4,  5,  5,  6,
+         4,  4,  4,  4,  5,  5,  6,  7,
+         4,  4,  4,  4,  5,  6,  7,  7,
+    },
+    { // XQ luma
+        2,  2,  2,  2,  2,  2,  2,  2,
+        2,  2,  2,  2,  2,  2,  2,  2,
+        2,  2,  2,  2,  2,  2,  2,  2,
+        2,  2,  2,  2,  2,  2,  2,  3,
+        2,  2,  2,  2,  2,  2,  3,  3,
+        2,  2,  2,  2,  2,  3,  3,  3,
+        2,  2,  2,  2,  3,  3,  3,  4,
+        2,  2,  2,  2,  3,  3,  4,  4,
+    },
+    { // codec default
+         4,  4,  4,  4,  4,  4,  4,  4,
+         4,  4,  4,  4,  4,  4,  4,  4,
+         4,  4,  4,  4,  4,  4,  4,  4,
+         4,  4,  4,  4,  4,  4,  4,  4,
+         4,  4,  4,  4,  4,  4,  4,  4,
+         4,  4,  4,  4,  4,  4,  4,  4,
+         4,  4,  4,  4,  4,  4,  4,  4,
+         4,  4,  4,  4,  4,  4,  4,  4,
+    },
+};
+
+static const int prores_mb_limits[NUM_MB_LIMITS] = {
+    1620, // up to 720x576
+    2700, // up to 960x720
+    6075, // up to 1440x1080
+    9216, // up to 2048x1152
+};
+
+static const prores_profile prores_profile_info[6] = {
+    {
+        .full_name = "proxy",
+        .tag       = MKTAG('a', 'p', 'c', 'o'),
+        .min_quant = 4,
+        .max_quant = 8,
+        .br_tab    = { 300, 242, 220, 194 },
+        .quant     = QUANT_MAT_PROXY,
+        .quant_chroma = QUANT_MAT_PROXY_CHROMA,
+    },
+    {
+        .full_name = "LT",
+        .tag       = MKTAG('a', 'p', 'c', 's'),
+        .min_quant = 1,
+        .max_quant = 9,
+        .br_tab    = { 720, 560, 490, 440 },
+        .quant     = QUANT_MAT_LT,
+        .quant_chroma = QUANT_MAT_LT,
+    },
+    {
+        .full_name = "standard",
+        .tag       = MKTAG('a', 'p', 'c', 'n'),
+        .min_quant = 1,
+        .max_quant = 6,
+        .br_tab    = { 1050, 808, 710, 632 },
+        .quant     = QUANT_MAT_STANDARD,
+        .quant_chroma = QUANT_MAT_STANDARD,
+    },
+    {
+        .full_name = "high quality",
+        .tag       = MKTAG('a', 'p', 'c', 'h'),
+        .min_quant = 1,
+        .max_quant = 6,
+        .br_tab    = { 1566, 1216, 1070, 950 },
+        .quant     = QUANT_MAT_HQ,
+        .quant_chroma = QUANT_MAT_HQ,
+    },
+    {
+        .full_name = "4444",
+        .tag       = MKTAG('a', 'p', '4', 'h'),
+        .min_quant = 1,
+        .max_quant = 6,
+        .br_tab    = { 2350, 1828, 1600, 1425 },
+        .quant     = QUANT_MAT_HQ,
+        .quant_chroma = QUANT_MAT_HQ,
+    },
+    {
+        .full_name = "4444XQ",
+        .tag       = MKTAG('a', 'p', '4', 'x'),
+        .min_quant = 1,
+        .max_quant = 6,
+        .br_tab    = { 3525, 2742, 2400, 2137 },
+        .quant     = QUANT_MAT_HQ, /* Fix me : use QUANT_MAT_XQ_LUMA */
+        .quant_chroma = QUANT_MAT_HQ,
+    }
+};
+
+av_cold int ff_prores_kostya_encode_init(AVCodecContext *avctx, ProresContext *ctx,
+                                         enum AVPixelFormat pix_fmt)
+{
+    int mps, i, j, min_quant;
+    int interlaced = !!(avctx->flags & AV_CODEC_FLAG_INTERLACED_DCT);
+
+    avctx->bits_per_raw_sample = 10;
+
+    ctx->scantable = interlaced ? ff_prores_interlaced_scan
+                                : ff_prores_progressive_scan;
+
+    mps = ctx->mbs_per_slice;
+    if (mps & (mps - 1)) {
+        av_log(avctx, AV_LOG_ERROR,
+               "there should be an integer power of two MBs per slice\n");
+        return AVERROR(EINVAL);
+    }
+    if (ctx->profile == PRORES_PROFILE_AUTO) {
+        const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(pix_fmt);
+        ctx->profile = (desc->flags & AV_PIX_FMT_FLAG_ALPHA ||
+                        !(desc->log2_chroma_w + desc->log2_chroma_h))
+                     ? PRORES_PROFILE_4444 : PRORES_PROFILE_HQ;
+        av_log(avctx, AV_LOG_INFO, "Autoselected %s. It can be overridden "
+               "through -profile option.\n", ctx->profile == PRORES_PROFILE_4444
+               ? "4:4:4:4 profile because of the used input colorspace"
+               : "HQ profile to keep best quality");
+    }
+    if (av_pix_fmt_desc_get(pix_fmt)->flags & AV_PIX_FMT_FLAG_ALPHA) {
+        if (ctx->profile != PRORES_PROFILE_4444 &&
+            ctx->profile != PRORES_PROFILE_4444XQ) {
+            // force alpha and warn
+            av_log(avctx, AV_LOG_WARNING, "Profile selected will not "
+                   "encode alpha. Override with -profile if needed.\n");
+            ctx->alpha_bits = 0;
+        }
+        if (ctx->alpha_bits & 7) {
+            av_log(avctx, AV_LOG_ERROR, "alpha bits should be 0, 8 or 16\n");
+            return AVERROR(EINVAL);
+        }
+        avctx->bits_per_coded_sample = 32;
+    } else {
+        ctx->alpha_bits = 0;
+    }
+
+    ctx->chroma_factor = pix_fmt == AV_PIX_FMT_YUV422P10
+                         ? CFACTOR_Y422
+                         : CFACTOR_Y444;
+    ctx->profile_info  = prores_profile_info + ctx->profile;
+    ctx->num_planes    = 3 + !!ctx->alpha_bits;
+
+    ctx->mb_width      = FFALIGN(avctx->width,  16) >> 4;
+
+    if (interlaced)
+        ctx->mb_height = FFALIGN(avctx->height, 32) >> 5;
+    else
+        ctx->mb_height = FFALIGN(avctx->height, 16) >> 4;
+
+    ctx->slices_width  = ctx->mb_width / mps;
+    ctx->slices_width += av_popcount(ctx->mb_width - ctx->slices_width * mps);
+    ctx->slices_per_picture = ctx->mb_height * ctx->slices_width;
+    ctx->pictures_per_frame = 1 + interlaced;
+
+    if (ctx->quant_sel == -1) {
+        ctx->quant_mat = prores_quant_matrices[ctx->profile_info->quant];
+        ctx->quant_chroma_mat = prores_quant_matrices[ctx->profile_info->quant_chroma];
+    } else {
+        ctx->quant_mat = prores_quant_matrices[ctx->quant_sel];
+        ctx->quant_chroma_mat = prores_quant_matrices[ctx->quant_sel];
+    }
+
+    if (strlen(ctx->vendor) != 4) {
+        av_log(avctx, AV_LOG_ERROR, "vendor ID should be 4 bytes\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    ctx->force_quant = avctx->global_quality / FF_QP2LAMBDA;
+    if (!ctx->force_quant) {
+        if (!ctx->bits_per_mb) {
+            for (i = 0; i < NUM_MB_LIMITS - 1; i++)
+                if (prores_mb_limits[i] >= ctx->mb_width * ctx->mb_height *
+                                           ctx->pictures_per_frame)
+                    break;
+            ctx->bits_per_mb   = ctx->profile_info->br_tab[i];
+            if (ctx->alpha_bits)
+                ctx->bits_per_mb *= 20;
+        } else if (ctx->bits_per_mb < 128) {
+            av_log(avctx, AV_LOG_ERROR, "too few bits per MB, please set at least 128\n");
+            return AVERROR_INVALIDDATA;
+        }
+
+        min_quant = ctx->profile_info->min_quant;
+        for (i = min_quant; i < MAX_STORED_Q; i++) {
+            for (j = 0; j < 64; j++) {
+                ctx->quants[i][j] = ctx->quant_mat[j] * i;
+                ctx->quants_chroma[i][j] = ctx->quant_chroma_mat[j] * i;
+            }
+        }
+    } else {
+        int ls = 0;
+        int ls_chroma = 0;
+
+        if (ctx->force_quant > 64) {
+            av_log(avctx, AV_LOG_ERROR, "too large quantiser, maximum is 64\n");
+            return AVERROR_INVALIDDATA;
+        }
+
+        for (j = 0; j < 64; j++) {
+            ctx->quants[0][j] = ctx->quant_mat[j] * ctx->force_quant;
+            ctx->quants_chroma[0][j] = ctx->quant_chroma_mat[j] * ctx->force_quant;
+            ls += av_log2((1 << 11)  / ctx->quants[0][j]) * 2 + 1;
+            ls_chroma += av_log2((1 << 11)  / ctx->quants_chroma[0][j]) * 2 + 1;
+        }
+
+        ctx->bits_per_mb = ls * 4 + ls_chroma * 4;
+        if (ctx->chroma_factor == CFACTOR_Y444)
+            ctx->bits_per_mb += ls_chroma * 4;
+    }
+
+    ctx->frame_size_upper_bound = (ctx->pictures_per_frame *
+                                   ctx->slices_per_picture + 1) *
+                                  (2 + 2 * ctx->num_planes +
+                                   (mps * ctx->bits_per_mb) / 8)
+                                  + 200;
+
+    if (ctx->alpha_bits) {
+         // The alpha plane is run-coded and might exceed the bit budget.
+         ctx->frame_size_upper_bound += (ctx->pictures_per_frame *
+                                         ctx->slices_per_picture + 1) *
+         /* num pixels per slice */     (ctx->mbs_per_slice * 256 *
+         /* bits per pixel */            (1 + ctx->alpha_bits + 1) + 7 >> 3);
+    }
+
+    avctx->codec_tag   = ctx->profile_info->tag;
+    avctx->profile = ctx->profile;
+
+    av_log(avctx, AV_LOG_DEBUG,
+           "profile %d, %d slices, interlacing: %s, %d bits per MB\n",
+           ctx->profile, ctx->slices_per_picture * ctx->pictures_per_frame,
+           interlaced ? "yes" : "no", ctx->bits_per_mb);
+    av_log(avctx, AV_LOG_DEBUG, "frame size upper bound: %d\n",
+           ctx->frame_size_upper_bound);
+
+    return 0;
+}
+
+uint8_t* ff_prores_kostya_write_frame_header(AVCodecContext *avctx, ProresContext *ctx,
+                                             uint8_t **orig_buf, int flags,
+                                             enum AVColorPrimaries color_primaries,
+                                             enum AVColorTransferCharacteristic color_trc,
+                                             enum AVColorSpace colorspace)
+{
+    const int interlaced = avctx->flags & AV_CODEC_FLAG_INTERLACED_DCT;
+    const int is_444_or_alpha = ctx->chroma_factor != CFACTOR_Y422 || ctx->alpha_bits;
+    uint8_t frame_flags = ctx->chroma_factor << 6;
+    uint8_t *hdr_size_pos, *wr;
+
+    /* Frame atom: skip the 4-byte frame size slot, then write the frame container ID. */
+    *orig_buf += 4;
+    bytestream_put_be32(orig_buf, FRAME_ID);
+
+    /* Frame header: its first two bytes hold the header size, written back at the end. */
+    hdr_size_pos = wr = *orig_buf;
+    wr += 2;
+    bytestream_put_be16(&wr, is_444_or_alpha ? 1 : 0);
+    bytestream_put_buffer(&wr, (uint8_t*)ctx->vendor, 4);
+    bytestream_put_be16(&wr, avctx->width);
+    bytestream_put_be16(&wr, avctx->height);
+
+    /* Chroma factor sits in the top two bits; field order is flagged only when interlaced. */
+    if (interlaced)
+        frame_flags |= (flags & AV_FRAME_FLAG_TOP_FIELD_FIRST) ? 0x04 : 0x08;
+    bytestream_put_byte(&wr, frame_flags);
+    bytestream_put_byte(&wr, 0);                     /* reserved */
+    bytestream_put_byte(&wr, color_primaries);
+    bytestream_put_byte(&wr, color_trc);
+    bytestream_put_byte(&wr, colorspace);
+    bytestream_put_byte(&wr, ctx->alpha_bits >> 3);
+    bytestream_put_byte(&wr, 0);                     /* reserved */
+    if (ctx->quant_sel == QUANT_MAT_DEFAULT) {
+        bytestream_put_byte(&wr, 0x00);              /* matrix flags: default matrices are used */
+    } else {
+        bytestream_put_byte(&wr, 0x03);              /* matrix flags: both matrices are present */
+        bytestream_put_buffer(&wr, ctx->quant_mat, 64);        /* luma quantisation matrix */
+        bytestream_put_buffer(&wr, ctx->quant_chroma_mat, 64); /* chroma quantisation matrix */
+    }
+    bytestream_put_be16(&hdr_size_pos, wr - *orig_buf); /* write back frame header size */
+    return wr;
+}
+
+uint8_t* ff_prores_kostya_write_picture_header(ProresContext *ctx, uint8_t *buf)
+{
+    bytestream_put_byte  (&buf, 0x40); // picture header size (in bits)
+    buf += 4;                                   // picture data size will be stored here
+    bytestream_put_be16  (&buf, ctx->slices_per_picture); // number of slices in this picture
+    bytestream_put_byte  (&buf, av_log2(ctx->mbs_per_slice) << 4); // slice width and height in MBs
+    return buf; // first byte after the picture header
+}
diff --git a/libavcodec/proresenc_kostya_common.h b/libavcodec/proresenc_kostya_common.h
new file mode 100644
index 0000000000..f18adc36af
--- /dev/null
+++ b/libavcodec/proresenc_kostya_common.h
@@ -0,0 +1,131 @@
+/*
+ * Apple ProRes encoder
+ *
+ * Copyright (c) 2011 Anatoliy Wasserman
+ * Copyright (c) 2012 Konstantin Shishkov
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_PRORESENC_KOSTYA_COMMON_H
+#define AVCODEC_PRORESENC_KOSTYA_COMMON_H
+
+#include "libavutil/attributes_internal.h"
+#include "libavutil/mem_internal.h"
+#include "libavutil/pixfmt.h"
+#include "fdctdsp.h"
+
+FF_VISIBILITY_PUSH_HIDDEN
+
+#define CFACTOR_Y422 2 // chroma_factor used for AV_PIX_FMT_YUV422P10 input
+#define CFACTOR_Y444 3 // chroma_factor used for all other input formats
+
+#define MAX_MBS_PER_SLICE 8 // sizes the per-slice coefficient storage (ProresContext.blocks)
+
+#define MAX_PLANES 4 // first dimension of ProresContext.blocks
+
+#define NUM_MB_LIMITS 4 // number of entries in prores_profile.br_tab
+
+#define MAX_STORED_Q 16 // number of rows in ProresContext.quants / quants_chroma
+
+enum {
+    PRORES_PROFILE_AUTO  = -1, // replaced at init by 4444 or HQ depending on the pixel format
+    PRORES_PROFILE_PROXY = 0, // from here on the values are offsets into prores_profile_info
+    PRORES_PROFILE_LT,
+    PRORES_PROFILE_STANDARD,
+    PRORES_PROFILE_HQ,
+    PRORES_PROFILE_4444, // alpha is only kept for this profile and 4444XQ
+    PRORES_PROFILE_4444XQ,
+};
+
+enum {
+    QUANT_MAT_PROXY = 0, // the values index prores_quant_matrices
+    QUANT_MAT_PROXY_CHROMA,
+    QUANT_MAT_LT,
+    QUANT_MAT_STANDARD,
+    QUANT_MAT_HQ,
+    QUANT_MAT_XQ_LUMA,
+    QUANT_MAT_DEFAULT, // as quant_sel: the frame header flags "default matrices" instead of storing them
+};
+
+typedef struct prores_profile {
+    const char *full_name;
+    uint32_t    tag; // copied to avctx->codec_tag at init
+    int         min_quant; // lower end of the quantiser range
+    int         max_quant; // upper end of the quantiser range
+    int         br_tab[NUM_MB_LIMITS]; // default bits per MB, picked by comparing the MB count with prores_mb_limits
+    int         quant; // index into prores_quant_matrices (luma)
+    int         quant_chroma; // index into prores_quant_matrices (chroma)
+} prores_profile;
+
+typedef struct ProresContext {
+    AVClass *class;
+    DECLARE_ALIGNED(16, int16_t, blocks)[MAX_PLANES][64 * 4 * MAX_MBS_PER_SLICE];
+    DECLARE_ALIGNED(16, uint16_t, emu_buf)[16*16];
+    int16_t quants[MAX_STORED_Q][64]; // luma matrix multiplied by each stored quantiser
+    int16_t quants_chroma[MAX_STORED_Q][64]; // chroma counterpart of quants
+    int16_t custom_q[64];
+    int16_t custom_chroma_q[64];
+    const uint8_t *quant_mat; // selected row of prores_quant_matrices (luma)
+    const uint8_t *quant_chroma_mat; // selected row of prores_quant_matrices (chroma)
+    const uint8_t *scantable; // interlaced or progressive scan order
+
+    void (*fdct)(FDCTDSPContext *fdsp, const uint16_t *src,
+                 ptrdiff_t linesize, int16_t *block);
+    FDCTDSPContext fdsp;
+
+    const AVFrame *pic;
+    int mb_width, mb_height; // picture size in macroblocks (mb_height counts 32 rows each when interlaced)
+    int mbs_per_slice; // must be a power of two
+    int num_chroma_blocks, chroma_factor; // chroma_factor is CFACTOR_Y422 or CFACTOR_Y444
+    int slices_width; // number of slices per macroblock row
+    int slices_per_picture; // mb_height * slices_width
+    int pictures_per_frame; // 1 for progressive, 2 for interlaced
+    int cur_picture_idx;
+    int num_planes; // 3, plus 1 when alpha is encoded
+    int bits_per_mb;
+    int force_quant; // global_quality / FF_QP2LAMBDA; non-zero selects a fixed quantiser
+    int alpha_bits; // 0, 8 or 16
+    int warn;
+
+    char *vendor; // must be exactly 4 characters long
+    int quant_sel; // -1: use the profile's matrices, otherwise an index into prores_quant_matrices
+
+    int frame_size_upper_bound; // worst-case packet size computed at init
+
+    int profile;
+    const struct prores_profile *profile_info; // prores_profile_info + profile
+
+    int *slice_q;
+
+    struct ProresThreadData *tdata;
+} ProresContext;
+
+av_cold int ff_prores_kostya_encode_init(AVCodecContext *avctx, ProresContext *ctx,
+                                         enum AVPixelFormat pixfmt);
+
+uint8_t* ff_prores_kostya_write_frame_header(AVCodecContext *avctx, ProresContext *ctx,
+                                             uint8_t **orig_buf, int flags,
+                                             enum AVColorPrimaries color_primaries,
+                                             enum AVColorTransferCharacteristic color_trc,
+                                             enum AVColorSpace colorspace); // advances *orig_buf by 8 bytes; returns the end of the frame header
+
+uint8_t* ff_prores_kostya_write_picture_header(ProresContext *ctx, uint8_t *buf); // returns buf advanced past the picture header
+
+FF_VISIBILITY_POP_HIDDEN
+
+#endif
-- 
2.49.1


>From 6d06ce52d2407cfb5f71746ac24801647a0b4f29 Mon Sep 17 00:00:00 2001
From: IndecisiveTurtle <geoster3d@gmail.com>
Date: Wed, 3 Sep 2025 22:28:34 +0300
Subject: [PATCH 3/3] lavc: implement a Vulkan-based prores encoder

Adds a vulkan implementation of the reference prores kostya encoder. Provides about 3-4x speedup over the CPU code
---
 configure                                     |    1 +
 libavcodec/Makefile                           |    1 +
 libavcodec/allcodecs.c                        |    1 +
 libavcodec/proresenc_kostya_vulkan.c          | 1068 +++++++++++++++++
 libavcodec/vulkan/Makefile                    |    7 +
 libavcodec/vulkan/prores_ks_alpha_data.comp   |   67 ++
 libavcodec/vulkan/prores_ks_encode_slice.comp |  230 ++++
 .../vulkan/prores_ks_estimate_slice.comp      |  267 +++++
 libavcodec/vulkan/prores_ks_slice_data.comp   |  265 ++++
 libavcodec/vulkan/prores_ks_trellis_node.comp |  177 +++
 10 files changed, 2084 insertions(+)
 create mode 100644 libavcodec/proresenc_kostya_vulkan.c
 create mode 100644 libavcodec/vulkan/prores_ks_alpha_data.comp
 create mode 100644 libavcodec/vulkan/prores_ks_encode_slice.comp
 create mode 100644 libavcodec/vulkan/prores_ks_estimate_slice.comp
 create mode 100644 libavcodec/vulkan/prores_ks_slice_data.comp
 create mode 100644 libavcodec/vulkan/prores_ks_trellis_node.comp

diff --git a/configure b/configure
index 7ec4c3975b..4db8a7c581 100755
--- a/configure
+++ b/configure
@@ -3099,6 +3099,7 @@ prores_decoder_select="blockdsp idctdsp"
 prores_encoder_select="fdctdsp"
 prores_aw_encoder_select="fdctdsp"
 prores_ks_encoder_select="fdctdsp"
+prores_ks_vulkan_encoder_select="vulkan spirv_compiler"
 prores_raw_decoder_select="blockdsp idctdsp"
 qcelp_decoder_select="lsp"
 qdm2_decoder_select="mpegaudiodsp"
diff --git a/libavcodec/Makefile b/libavcodec/Makefile
index d8e1ac5a54..1964c787d7 100644
--- a/libavcodec/Makefile
+++ b/libavcodec/Makefile
@@ -639,6 +639,7 @@ OBJS-$(CONFIG_PRORES_DECODER)          += proresdec.o proresdsp.o proresdata.o
 OBJS-$(CONFIG_PRORES_ENCODER)          += proresenc_anatoliy.o proresdata.o
 OBJS-$(CONFIG_PRORES_AW_ENCODER)       += proresenc_anatoliy.o proresdata.o
 OBJS-$(CONFIG_PRORES_KS_ENCODER)       += proresenc_kostya.o proresdata.o proresenc_kostya_common.o
+OBJS-$(CONFIG_PRORES_KS_VULKAN_ENCODER) += proresenc_kostya_vulkan.o proresdata.o proresenc_kostya_common.o
 OBJS-$(CONFIG_PRORES_RAW_DECODER)      += prores_raw.o proresdsp.o proresdata.o
 OBJS-$(CONFIG_PRORES_VIDEOTOOLBOX_ENCODER) += videotoolboxenc.o
 OBJS-$(CONFIG_PROSUMER_DECODER)        += prosumer.o
diff --git a/libavcodec/allcodecs.c b/libavcodec/allcodecs.c
index f5ec2e01e8..1b4a5f769c 100644
--- a/libavcodec/allcodecs.c
+++ b/libavcodec/allcodecs.c
@@ -269,6 +269,7 @@ extern const FFCodec ff_prores_encoder;
 extern const FFCodec ff_prores_decoder;
 extern const FFCodec ff_prores_aw_encoder;
 extern const FFCodec ff_prores_ks_encoder;
+extern const FFCodec ff_prores_ks_vulkan_encoder;
 extern const FFCodec ff_prores_raw_decoder;
 extern const FFCodec ff_prosumer_decoder;
 extern const FFCodec ff_psd_decoder;
diff --git a/libavcodec/proresenc_kostya_vulkan.c b/libavcodec/proresenc_kostya_vulkan.c
new file mode 100644
index 0000000000..6413b2f9d4
--- /dev/null
+++ b/libavcodec/proresenc_kostya_vulkan.c
@@ -0,0 +1,1068 @@
+/*
+ * Apple ProRes encoder
+ *
+ * Copyright (c) 2011 Anatoliy Wasserman
+ * Copyright (c) 2012 Konstantin Shishkov
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/buffer.h"
+#include "libavutil/macros.h"
+#include "libavutil/mem.h"
+#include "libavutil/mem_internal.h"
+#include "libavutil/opt.h"
+#include "libavutil/pixdesc.h"
+#include "libavutil/vulkan_spirv.h"
+#include "libavutil/hwcontext_vulkan.h"
+#include "libavutil/vulkan_loader.h"
+#include "libavutil/vulkan.h"
+#include "avcodec.h"
+#include "codec.h"
+#include "codec_internal.h"
+#include "encode.h"
+#include "packet.h"
+#include "put_bits.h"
+#include "profiles.h"
+#include "bytestream.h"
+#include "proresdata.h"
+#include "proresenc_kostya_common.h"
+#include "hwconfig.h"
+
+#define DCTSIZE 8 /* used for workgroup dimensions in this file; the generated shader sources define the same name */
+
+typedef struct ProresDataTables { /* same field list as the "ProresDataTables" uniform block given to the shaders */
+    int16_t qmat[128][64];
+    int16_t qmat_chroma[128][64];
+    uint8_t scan[64];
+    uint8_t dc_codebook[7];
+    uint8_t run_to_cb[16];
+    uint8_t level_to_cb[10];
+} ProresDataTables;
+
+typedef struct SliceDataInfo { /* its size is registered as the push-constant range of the slice-data shaders */
+    int plane;
+    int pictures_per_frame;
+    int line_add;
+} SliceDataInfo;
+
+typedef struct EstimateSliceInfo { /* its size is registered as the push-constant range of the estimate-slice shader */
+    int slices_per_picture;
+    int min_quant;
+    int max_quant;
+    int bits_per_mb;
+} EstimateSliceInfo;
+
+typedef struct EncodeSliceInfo { /* its size is registered as the push-constant range of the encode-slice shader */
+    VkDeviceAddress bytestream; /* passed as a device address, not through a descriptor */
+    VkDeviceAddress seek_table; /* passed as a device address, not through a descriptor */
+    int num_planes;
+    int slices_per_picture;
+    int max_quant;
+} EncodeSliceInfo;
+
+typedef struct TrellisNodeInfo { /* its size is registered as the push-constant range of the trellis-node shader */
+    int min_quant;
+    int max_quant;
+    int mbs_per_slice;
+    int bits_per_mb;
+} TrellisNodeInfo;
+
+#define TRELLIS_WIDTH 16
+#define SCORE_LIMIT   (INT_MAX / 2) /* parenthesised so the macro is safe inside larger expressions */
+
+struct TrellisNode { /* NOTE(review): no use of this type is visible in the reviewed part of the file - confirm it is still needed */
+    int prev_node;
+    int quant;
+    int bits;
+    int score;
+};
+
+typedef struct SliceData { /* sizeof(SliceData) * slices_per_picture sizes the slice data buffer */
+    uint32_t mbs_per_slice;
+    int16_t rows[MAX_PLANES * MAX_MBS_PER_SLICE * 256]; /* NOTE(review): the shaders size this array with the runtime mbs_per_slice, so the strides only agree when it equals MAX_MBS_PER_SLICE */
+} SliceData;
+
+typedef struct SliceScore { /* host-side counterpart of the GLSL "struct SliceScore" emitted into the shaders */
+    int bits[MAX_STORED_Q][4]; /* ivec4 bits[16] on the shader side */
+    int error[MAX_STORED_Q][4]; /* ivec4 score[16] on the shader side */
+    int total_bits[MAX_STORED_Q];
+    int total_error[MAX_STORED_Q]; /* total_score[16] on the shader side */
+    int overquant;
+    int buf_start;
+    int quant;
+} SliceScore;
+
+typedef struct VulkanEncodeProresFrameData {
+    /* Intermediate buffers */
+    AVBufferRef *out_data_ref[2]; /* indexed by picture_idx, like the three arrays below */
+    AVBufferRef *slice_data_ref[2];
+    AVBufferRef *slice_score_ref[2];
+    AVBufferRef *frame_size_ref[2];
+
+    /* Copied from the source */
+    int64_t pts;
+    int64_t duration;
+    void        *frame_opaque;
+    AVBufferRef *frame_opaque_ref;
+    enum AVColorTransferCharacteristic color_trc;
+    enum AVColorSpace colorspace;
+    enum AVColorPrimaries color_primaries;
+    int key_frame;
+    int flags;
+} VulkanEncodeProresFrameData;
+
+typedef struct ProresVulkanContext {
+    ProresContext ctx; /* shared encoder state (profile, slice geometry, quantiser tables) */
+
+    /* Vulkan state */
+    FFVulkanContext vkctx;
+    AVVulkanDeviceQueueFamily *qf;
+    FFVkExecPool e; /* exec pool every shader in this file is registered with */
+    AVVulkanDeviceQueueFamily *transfer_qf;
+    FFVkExecPool transfer_exec_pool;
+    AVBufferPool *pkt_buf_pool; /* pool for the output data buffers */
+    AVBufferPool *slice_data_buf_pool; /* pool for the per-picture SliceData arrays */
+    AVBufferPool *slice_score_buf_pool; /* pool for the per-picture SliceScore arrays */
+    AVBufferPool *frame_size_buf_pool;
+
+    FFVulkanShader alpha_data_shd;
+    FFVulkanShader slice_data_shd[2];
+    FFVulkanShader estimate_slice_shd;
+    FFVulkanShader encode_slice_shd;
+    FFVulkanShader trellis_node_shd;
+    FFVkBuffer prores_data_tables_buf;
+
+    int *slice_quants;
+    SliceScore *slice_scores;
+    ProresDataTables *tables;
+
+    int in_flight;
+    int async_depth;
+    AVFrame *frame;
+    VulkanEncodeProresFrameData *exec_ctx_info; /* installed as exec->opaque for the submit function */
+} ProresVulkanContext;
+
+extern const char *ff_source_common_comp; /* GLSL sources appended to the shaders with GLSLD() */
+extern const char *ff_source_prores_ks_alpha_data_comp;
+extern const char *ff_source_prores_ks_slice_data_comp;
+extern const char *ff_source_prores_ks_estimate_slice_comp;
+extern const char *ff_source_prores_ks_trellis_node_comp;
+extern const char *ff_source_prores_ks_encode_slice_comp;
+
+static int init_slice_data_pipeline(ProresVulkanContext *pv, FFVkSPIRVCompiler *spv,
+                                    FFVulkanShader* shd, const char* pl_name, int blocks_per_mb)
+{
+    /* Builds, compiles and registers the slice-data compute shader in shd; returns 0 or the first RET() error. */
+    int err = 0;
+    uint8_t *spv_data;
+    size_t spv_len;
+    void *spv_opaque = NULL; /* compiler-side handle, released under fail: */
+    FFVulkanContext *vkctx = &pv->vkctx;
+    FFVulkanDescriptorSetBinding *desc;
+
+    ff_vk_shader_init(vkctx, shd, pl_name, VK_SHADER_STAGE_COMPUTE_BIT,
+                      NULL, 0, DCTSIZE, blocks_per_mb, pv->ctx.mbs_per_slice, 0);
+
+    av_bprintf(&shd->src, "#define DCTSIZE %d\n#define MAX_PLANES %d\n", DCTSIZE, MAX_PLANES);
+    av_bprintf(&shd->src, "#define MAX_MBS_PER_SLICE %d\n", pv->ctx.mbs_per_slice);
+    av_bprintf(&shd->src, "#define BLOCKS_PER_MB %d\n#define WIDTH_IN_MB %d\n", blocks_per_mb, pv->ctx.mb_width);
+    av_bprintf(&shd->src, "struct SliceData { uint32_t mbs_per_slice; i16vec4 rows[MAX_PLANES][MAX_MBS_PER_SLICE * 4 * DCTSIZE][DCTSIZE / 4]; };\n");
+
+    desc = (FFVulkanDescriptorSetBinding []) {
+        {
+            .name        = "SliceBuffer",
+            .type        = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
+            .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
+            .mem_layout  = "scalar",
+            .buf_content = "SliceData slices[];",
+        },
+        {
+            .name       = "planes",
+            .type       = VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE,
+            .dimensions = 2,
+            .elems      = 3,
+            .stages     = VK_SHADER_STAGE_COMPUTE_BIT,
+            .mem_layout  = "r16i",
+        },
+    };
+    RET(ff_vk_shader_add_descriptor_set(vkctx, shd, desc, 2, 0, 0));
+    ff_vk_shader_add_push_const(shd, 0, sizeof(SliceDataInfo), VK_SHADER_STAGE_COMPUTE_BIT);
+    GLSLD(ff_source_prores_ks_slice_data_comp);
+
+    RET(spv->compile_shader(vkctx, spv, shd, &spv_data, &spv_len, "main", &spv_opaque));
+    RET(ff_vk_shader_link(vkctx, shd, spv_data, spv_len, "main"));
+    RET(ff_vk_shader_register_exec(vkctx, &pv->e, shd));
+
+fail:
+    if (spv_opaque) /* reached on success too: the compiler's result is released on every path */
+        spv->free_shader(spv, &spv_opaque);
+    return err;
+}
+
+static int init_alpha_data_pipeline(ProresVulkanContext *pv, FFVkSPIRVCompiler *spv,
+                                    FFVulkanShader* shd, const char* pl_name)
+{
+    /* Builds, compiles and registers the alpha-data compute shader in shd; returns 0 or the first RET() error. */
+    int err = 0;
+    uint8_t *spv_data;
+    size_t spv_len;
+    void *spv_opaque = NULL; /* compiler-side handle, released under fail: */
+    FFVulkanContext *vkctx = &pv->vkctx;
+    FFVulkanDescriptorSetBinding *desc;
+
+    ff_vk_shader_init(vkctx, shd, pl_name, VK_SHADER_STAGE_COMPUTE_BIT,
+                      NULL, 0, DCTSIZE * 2, DCTSIZE * 2, 1, 0);
+
+    av_bprintf(&shd->src, "#define DCTSIZE %d\n#define MAX_PLANES %d\n", DCTSIZE, MAX_PLANES);
+    av_bprintf(&shd->src, "#define MAX_MBS_PER_SLICE %d\n", pv->ctx.mbs_per_slice);
+    av_bprintf(&shd->src, "#define WIDTH_IN_MB %d\n#define SLICES_PITCH %d\n", pv->ctx.mb_width, pv->ctx.slices_width);
+    av_bprintf(&shd->src, "#define ALPHA_BITS %d\n", pv->ctx.alpha_bits);
+    av_bprintf(&shd->src, "struct SliceData { uint32_t mbs_per_slice; int16_t coeffs[MAX_PLANES][MAX_MBS_PER_SLICE * 4 * DCTSIZE * DCTSIZE]; };\n");
+
+    desc = (FFVulkanDescriptorSetBinding []) {
+        {
+            .name        = "SliceBuffer",
+            .type        = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
+            .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
+            .mem_layout  = "scalar",
+            .buf_content = "SliceData slices[];",
+        },
+        {
+            .name       = "plane",
+            .type       = VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE,
+            .dimensions = 2,
+            .stages     = VK_SHADER_STAGE_COMPUTE_BIT,
+            .mem_layout  = "r16i",
+        },
+    };
+    RET(ff_vk_shader_add_descriptor_set(vkctx, shd, desc, 2, 0, 0));
+    ff_vk_shader_add_push_const(shd, 0, sizeof(int), VK_SHADER_STAGE_COMPUTE_BIT);
+    GLSLD(ff_source_prores_ks_alpha_data_comp);
+
+    RET(spv->compile_shader(vkctx, spv, shd, &spv_data, &spv_len, "main", &spv_opaque));
+    RET(ff_vk_shader_link(vkctx, shd, spv_data, spv_len, "main"));
+    RET(ff_vk_shader_register_exec(vkctx, &pv->e, shd));
+
+fail:
+    if (spv_opaque) /* reached on success too: the compiler's result is released on every path */
+        spv->free_shader(spv, &spv_opaque);
+    return err;
+}
+
+static int init_estimate_slice_pipeline(ProresVulkanContext *pv, FFVkSPIRVCompiler *spv,
+                                        FFVulkanShader* shd, const char* pl_name)
+{
+    /* Builds, compiles and registers the estimate-slice compute shader in shd; returns 0 or the first RET() error. */
+    int err = 0;
+    uint8_t *spv_data;
+    size_t spv_len;
+    void *spv_opaque = NULL; /* compiler-side handle, released under fail: */
+    FFVulkanContext *vkctx = &pv->vkctx;
+    FFVulkanDescriptorSetBinding *desc;
+    int subgroup_size = vkctx->subgroup_props.maxSubgroupSize;
+    int dim_x = pv->ctx.alpha_bits ? subgroup_size : (subgroup_size / 3) * 3; /* without alpha: rounded down to a multiple of 3 */
+
+    ff_vk_shader_init(vkctx, shd, pl_name, VK_SHADER_STAGE_COMPUTE_BIT,
+                      NULL, 0, dim_x, 1, 1, 0);
+
+    av_bprintf(&shd->src, "#define DCTSIZE %d\n#define MAX_PLANES %d\n", DCTSIZE, MAX_PLANES);
+    av_bprintf(&shd->src, "#define MAX_MBS_PER_SLICE %d\n", pv->ctx.mbs_per_slice);
+    av_bprintf(&shd->src, "#define CHROMA_FACTOR %d\n#define ALPHA_BITS %d\n", pv->ctx.chroma_factor, pv->ctx.alpha_bits);
+    av_bprintf(&shd->src, "#define MAX_STORED_Q %d\n", MAX_STORED_Q);
+    av_bprintf(&shd->src, "#define NUM_PLANES %d\n", pv->ctx.num_planes);
+    av_bprintf(&shd->src, "struct SliceData { uint32_t mbs_per_slice; int16_t coeffs[MAX_PLANES][MAX_MBS_PER_SLICE * 4 * DCTSIZE * DCTSIZE]; };\n");
+    av_bprintf(&shd->src, "struct SliceScore { ivec4 bits[16]; ivec4 score[16]; int total_bits[16]; int total_score[16]; uint overquant; int buf_start; int quant; };\n");
+
+    desc = (FFVulkanDescriptorSetBinding []) {
+        {
+            .name        = "SliceBuffer",
+            .type        = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
+            .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
+            .mem_layout  = "scalar",
+            .buf_content = "SliceData slices[];",
+        },
+        {
+            .name        = "SliceScores",
+            .type        = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
+            .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
+            .mem_layout  = "scalar",
+            .buf_content = "SliceScore scores[];",
+        },
+        {
+            .name        = "ProresDataTables",
+            .type        = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER,
+            .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
+            .mem_layout  = "scalar",
+            .buf_content = "int16_t qmat[128][64]; int16_t qmat_chroma[128][64]; uint8_t scan[64]; "
+                           "uint8_t dc_codebook[7]; uint8_t run_to_cb[16]; uint8_t level_to_cb[10];",
+        },
+    };
+    RET(ff_vk_shader_add_descriptor_set(vkctx, shd, desc, 3, 0, 0));
+    ff_vk_shader_add_push_const(shd, 0, sizeof(EstimateSliceInfo), VK_SHADER_STAGE_COMPUTE_BIT);
+    GLSLD(ff_source_prores_ks_estimate_slice_comp);
+
+    RET(spv->compile_shader(vkctx, spv, shd, &spv_data, &spv_len, "main", &spv_opaque));
+    RET(ff_vk_shader_link(vkctx, shd, spv_data, spv_len, "main"));
+    RET(ff_vk_shader_register_exec(vkctx, &pv->e, shd));
+
+fail:
+    if (spv_opaque) /* reached on success too: the compiler's result is released on every path */
+        spv->free_shader(spv, &spv_opaque);
+    return err;
+}
+
+static int init_trellis_node_pipeline(ProresVulkanContext *pv, FFVkSPIRVCompiler *spv,
+                                      FFVulkanShader* shd, const char* pl_name)
+{
+    /* Builds, compiles and registers the trellis-node compute shader in shd; returns 0 or the first RET() error. */
+    FFVulkanContext *vkctx = &pv->vkctx;
+    int err = 0, subgroup_size = vkctx->subgroup_props.maxSubgroupSize;
+    uint8_t *spv_data;
+    size_t spv_len;
+    void *spv_opaque = NULL; /* compiler-side handle, released under fail: */
+    FFVulkanDescriptorSetBinding *desc;
+
+    ff_vk_shader_init(vkctx, shd, pl_name, VK_SHADER_STAGE_COMPUTE_BIT,
+                      NULL, 0, pv->ctx.mb_height, 1, 1, 0);
+
+    av_bprintf(&shd->src, "#define SLICES_WIDTH %d\n", pv->ctx.slices_width);
+    av_bprintf(&shd->src, "#define NUM_SUBGROUPS %d\n", FFALIGN(pv->ctx.mb_height, subgroup_size) / subgroup_size);
+    av_bprintf(&shd->src, "#define NUM_PLANES %d\n#define FORCE_QUANT %d\n", pv->ctx.num_planes, pv->ctx.force_quant);
+    av_bprintf(&shd->src, "struct SliceScore { ivec4 bits[16]; ivec4 score[16]; int total_bits[16]; int total_score[16]; int overquant; int buf_start; int quant; };\n");
+
+    desc = (FFVulkanDescriptorSetBinding []) {
+        {
+            .name        = "FrameSize",
+            .type        = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
+            .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
+            .mem_layout  = "scalar",
+            .buf_content = "int frame_size;",
+        },
+        {
+            .name        = "SliceScores",
+            .type        = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
+            .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
+            .mem_layout  = "scalar",
+            .buf_content = "SliceScore scores[];",
+        },
+    };
+    RET(ff_vk_shader_add_descriptor_set(vkctx, shd, desc, 2, 0, 0));
+    ff_vk_shader_add_push_const(shd, 0, sizeof(TrellisNodeInfo), VK_SHADER_STAGE_COMPUTE_BIT);
+    GLSLD(ff_source_prores_ks_trellis_node_comp);
+
+    RET(spv->compile_shader(vkctx, spv, shd, &spv_data, &spv_len, "main", &spv_opaque));
+    RET(ff_vk_shader_link(vkctx, shd, spv_data, spv_len, "main"));
+    RET(ff_vk_shader_register_exec(vkctx, &pv->e, shd));
+
+fail:
+    if (spv_opaque) /* reached on success too: the compiler's result is released on every path */
+        spv->free_shader(spv, &spv_opaque);
+    return err;
+}
+
+static int init_encode_slice_pipeline(ProresVulkanContext *pv, FFVkSPIRVCompiler *spv,
+                                      FFVulkanShader* shd, const char* pl_name)
+{
+    /* Builds, compiles and registers the encode-slice compute shader in shd; returns 0 or the first RET() error. */
+    int err = 0;
+    uint8_t *spv_data;
+    size_t spv_len;
+    void *spv_opaque = NULL; /* compiler-side handle, released under fail: */
+    FFVulkanContext *vkctx = &pv->vkctx;
+    FFVulkanDescriptorSetBinding *desc;
+
+    ff_vk_shader_init(vkctx, shd, pl_name, VK_SHADER_STAGE_COMPUTE_BIT,
+                      NULL, 0, 64, 1, 1, 0);
+
+    av_bprintf(&shd->src, "#define DCTSIZE %d\n#define MAX_PLANES %d\n", DCTSIZE, MAX_PLANES);
+    av_bprintf(&shd->src, "#define MAX_MBS_PER_SLICE %d\n", pv->ctx.mbs_per_slice);
+    av_bprintf(&shd->src, "#define CHROMA_FACTOR %d\n#define ALPHA_BITS %d\n", pv->ctx.chroma_factor, pv->ctx.alpha_bits);
+    av_bprintf(&shd->src, "struct SliceData { uint32_t mbs_per_slice; int16_t coeffs[MAX_PLANES][MAX_MBS_PER_SLICE * 4 * DCTSIZE * DCTSIZE]; };\n");
+    av_bprintf(&shd->src, "struct SliceScore { ivec4 bits[16]; ivec4 score[16]; int total_bits[16]; int total_score[16]; uint overquant; int buf_start; int quant; };\n");
+
+    desc = (FFVulkanDescriptorSetBinding []) {
+        {
+            .name        = "SliceBuffer",
+            .type        = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
+            .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
+            .mem_layout  = "scalar",
+            .buf_content = "SliceData slices[];",
+        },
+        {
+            .name        = "SliceScores",
+            .type        = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
+            .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
+            .mem_layout  = "scalar",
+            .buf_content = "SliceScore scores[];",
+        },
+        {
+            .name        = "ProresDataTables",
+            .type        = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER,
+            .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
+            .mem_layout  = "scalar",
+            .buf_content = "int16_t qmat[128][64]; int16_t qmat_chroma[128][64]; uint8_t scan[64]; "
+                           "uint8_t dc_codebook[7]; uint8_t run_to_cb[16]; uint8_t level_to_cb[10];",
+        },
+    };
+    RET(ff_vk_shader_add_descriptor_set(vkctx, shd, desc, 3, 0, 0));
+    ff_vk_shader_add_push_const(shd, 0, sizeof(EncodeSliceInfo), VK_SHADER_STAGE_COMPUTE_BIT);
+    av_bprintf(&shd->src, "#define PB_UNALIGNED\n");
+    av_bprintf(&shd->src, "#extension GL_EXT_buffer_reference : require\n");
+    av_bprintf(&shd->src, "#extension GL_EXT_buffer_reference2 : require\n");
+    GLSLD(ff_source_common_comp);
+    GLSLD(ff_source_prores_ks_encode_slice_comp);
+
+    RET(spv->compile_shader(vkctx, spv, shd, &spv_data, &spv_len, "main", &spv_opaque));
+    RET(ff_vk_shader_link(vkctx, shd, spv_data, spv_len, "main"));
+    RET(ff_vk_shader_register_exec(vkctx, &pv->e, shd));
+
+fail:
+    if (spv_opaque) /* reached on success too: the compiler's result is released on every path */
+        spv->free_shader(spv, &spv_opaque);
+    return err;
+}
+
+/**
+ * Record the compute passes that encode one picture of a frame into exec.
+ *
+ * Fetches the per-picture pooled buffers (packet output, slice data, slice
+ * scores, frame size), then records four stages separated by buffer barriers:
+ * slice data generation per plane, slice estimation per quantiser, quantiser
+ * selection, and slice encoding. Nothing is submitted here.
+ *
+ * NOTE(review): the shader descriptors are rewritten for every picture that is
+ * recorded into the same exec; confirm that the dispatches of an earlier
+ * picture still see their own buffers when two pictures are recorded.
+ *
+ * @param avctx       codec context; priv_data is a ProresVulkanContext
+ * @param exec        execution context to record into; exec->opaque is the
+ *                    VulkanEncodeProresFrameData that receives the buffer refs
+ * @param frame       input frame whose planes are read by the shaders
+ * @param picture_idx index of the picture within the frame; recording is only
+ *                    started for index 0, a further picture appends to the
+ *                    same command buffer
+ * @return a non-negative value on success, a negative AVERROR code on failure
+ */
+static int vulkan_encode_prores_submit_frame(AVCodecContext *avctx, FFVkExecContext *exec,
+                                             AVFrame *frame, int picture_idx)
+{
+    ProresVulkanContext *pv = avctx->priv_data;
+    ProresContext *ctx = &pv->ctx;
+    VulkanEncodeProresFrameData *pd = exec->opaque;
+    FFVulkanContext *vkctx = &pv->vkctx;
+    FFVulkanFunctions *vk = &vkctx->vkfn;
+    int err = 0, nb_img_bar = 0, i, is_chroma;
+    int min_quant = ctx->profile_info->min_quant;
+    int max_quant = ctx->profile_info->max_quant;
+    int subgroup_size = vkctx->subgroup_props.maxSubgroupSize;
+    /* Without alpha the X dimension is rounded down to a multiple of 3 */
+    int estimate_dim_x = ctx->alpha_bits ? subgroup_size : (subgroup_size / 3) * 3;
+    int transfer_slices = vkctx->extensions & FF_VK_EXT_EXTERNAL_HOST_MEMORY;
+    VkImageView views[AV_NUM_DATA_POINTERS];
+    VkImageMemoryBarrier2 img_bar[AV_NUM_DATA_POINTERS];
+    FFVkBuffer *pkt_vk_buf, *slice_data_buf, *slice_score_buf, *frame_size_buf;
+    SliceDataInfo slice_data_info;
+    EstimateSliceInfo estimate_info;
+    TrellisNodeInfo trellis_node_info;
+    EncodeSliceInfo encode_info;
+    FFVulkanShader *shd;
+
+    /* Start recording. All pictures of a frame share this execution context,
+     * so only the first picture may begin the command buffer. */
+    if (!picture_idx)
+        RET(ff_vk_exec_start(vkctx, exec));
+
+    /* Get a pooled buffer for writing output data. It only has to be host
+     * visible when the packet is filled with memcpy instead of a transfer. */
+    RET(ff_vk_get_pooled_buffer(vkctx, &pv->pkt_buf_pool, &pd->out_data_ref[picture_idx],
+                                VK_BUFFER_USAGE_STORAGE_BUFFER_BIT |
+                                VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT, NULL,
+                                ctx->frame_size_upper_bound + FF_INPUT_BUFFER_MIN_SIZE,
+                                transfer_slices ? VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT
+                                                : (VK_MEMORY_PROPERTY_HOST_CACHED_BIT |
+                                                   VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT |
+                                                   VK_MEMORY_PROPERTY_HOST_COHERENT_BIT)));
+    pkt_vk_buf = (FFVkBuffer*)pd->out_data_ref[picture_idx]->data;
+    RET(ff_vk_exec_add_dep_buf(vkctx, exec, &pd->out_data_ref[picture_idx], 1, 1));
+
+    /* Allocate buffer for writing slice data */
+    RET(ff_vk_get_pooled_buffer(vkctx, &pv->slice_data_buf_pool, &pd->slice_data_ref[picture_idx],
+                                VK_BUFFER_USAGE_STORAGE_BUFFER_BIT |
+                                VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT, NULL,
+                                ctx->slices_per_picture * sizeof(SliceData),
+                                VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT));
+    slice_data_buf = (FFVkBuffer*)pd->slice_data_ref[picture_idx]->data;
+    RET(ff_vk_exec_add_dep_buf(vkctx, exec, &pd->slice_data_ref[picture_idx], 1, 1));
+
+    /* Allocate buffer for writing slice scores */
+    RET(ff_vk_get_pooled_buffer(vkctx, &pv->slice_score_buf_pool, &pd->slice_score_ref[picture_idx],
+                                VK_BUFFER_USAGE_STORAGE_BUFFER_BIT |
+                                VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT, NULL,
+                                ctx->slices_per_picture * sizeof(SliceScore),
+                                VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT));
+    slice_score_buf = (FFVkBuffer*)pd->slice_score_ref[picture_idx]->data;
+    RET(ff_vk_exec_add_dep_buf(vkctx, exec, &pd->slice_score_ref[picture_idx], 1, 1));
+
+    /* Allocate buffer for writing frame size; the host reads it back later */
+    RET(ff_vk_get_pooled_buffer(vkctx, &pv->frame_size_buf_pool, &pd->frame_size_ref[picture_idx],
+                                VK_BUFFER_USAGE_STORAGE_BUFFER_BIT |
+                                VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT, NULL,
+                                sizeof(int),
+                                VK_MEMORY_PROPERTY_HOST_CACHED_BIT |
+                                VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT |
+                                VK_MEMORY_PROPERTY_HOST_COHERENT_BIT));
+    frame_size_buf = (FFVkBuffer*)pd->frame_size_ref[picture_idx]->data;
+    RET(ff_vk_exec_add_dep_buf(vkctx, exec, &pd->frame_size_ref[picture_idx], 1, 1));
+
+    /* Generate barriers and image views for frame images. */
+    RET(ff_vk_exec_add_dep_frame(vkctx, exec, frame,
+                                 VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT,
+                                 VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT));
+    RET(ff_vk_create_imageviews(vkctx, exec, views, frame, FF_VK_REP_INT));
+    ff_vk_frame_barrier(vkctx, exec, frame, img_bar, &nb_img_bar,
+                        VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT,
+                        VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
+                        VK_ACCESS_SHADER_READ_BIT,
+                        VK_IMAGE_LAYOUT_GENERAL,
+                        VK_QUEUE_FAMILY_IGNORED);
+
+    /* Submit the image barriers. */
+    vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) {
+                                           .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
+                                           .pImageMemoryBarriers = img_bar,
+                                           .imageMemoryBarrierCount = nb_img_bar,
+                                       });
+
+    /* Apply FDCT on input image data for future passes */
+    slice_data_info = (SliceDataInfo) {
+        .pictures_per_frame = ctx->pictures_per_frame,
+        .line_add = ctx->pictures_per_frame == 1 ? 0 : picture_idx ^ !(frame->flags & AV_FRAME_FLAG_TOP_FIELD_FIRST),
+    };
+    for (i = 0; i < ctx->num_planes; i++) {
+        is_chroma = (i == 1 || i == 2);
+        /* Shader 1 handles luma and 4:4:4 chroma, shader 0 subsampled chroma */
+        shd = &pv->slice_data_shd[!is_chroma || ctx->chroma_factor == CFACTOR_Y444];
+        if (i < 3) {
+            slice_data_info.plane = i;
+            ff_vk_shader_update_desc_buffer(vkctx, exec, shd, 0, 0, 0,
+                                            slice_data_buf, 0, slice_data_buf->size,
+                                            VK_FORMAT_UNDEFINED);
+            ff_vk_shader_update_img_array(vkctx, exec, shd, frame, views, 0, 1,
+                                          VK_IMAGE_LAYOUT_GENERAL, VK_NULL_HANDLE);
+            ff_vk_exec_bind_shader(vkctx, exec, shd);
+            ff_vk_shader_update_push_const(vkctx, exec, shd, VK_SHADER_STAGE_COMPUTE_BIT,
+                                           0, sizeof(SliceDataInfo), &slice_data_info);
+            vk->CmdDispatch(exec->buf, ctx->slices_width, ctx->mb_height, 1);
+        } else {
+            /* Plane 3 is alpha and goes through its own shader */
+            ff_vk_shader_update_desc_buffer(vkctx, exec, &pv->alpha_data_shd, 0, 0, 0,
+                                            slice_data_buf, 0, slice_data_buf->size,
+                                            VK_FORMAT_UNDEFINED);
+            ff_vk_shader_update_img(vkctx, exec, &pv->alpha_data_shd, 0, 1, 0, views[3],
+                                    VK_IMAGE_LAYOUT_GENERAL, VK_NULL_HANDLE);
+            ff_vk_exec_bind_shader(vkctx, exec, &pv->alpha_data_shd);
+            vk->CmdDispatch(exec->buf, ctx->mb_width, ctx->mb_height, 1);
+        }
+    }
+
+    /* Wait for writes to slice buffer. */
+    vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) {
+        .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
+        .pBufferMemoryBarriers = & (VkBufferMemoryBarrier2) {
+            .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2,
+            .pNext = NULL,
+            .srcStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
+            .srcAccessMask = VK_ACCESS_2_SHADER_WRITE_BIT,
+            .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
+            .dstAccessMask = VK_ACCESS_2_SHADER_READ_BIT,
+            .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
+            .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
+            .buffer = slice_data_buf->buf,
+            .offset = 0U,
+            .size = slice_data_buf->size,
+        },
+        .bufferMemoryBarrierCount = 1u,
+    });
+
+    /* Estimate slice bits and error for each quant */
+    estimate_info = (EstimateSliceInfo) {
+        .slices_per_picture = ctx->slices_per_picture,
+        .min_quant = ctx->force_quant ? ctx->force_quant : min_quant,
+        .max_quant = ctx->force_quant ? ctx->force_quant : max_quant,
+        .bits_per_mb = ctx->bits_per_mb,
+    };
+    ff_vk_shader_update_desc_buffer(vkctx, exec, &pv->estimate_slice_shd, 0, 0, 0,
+                                    slice_data_buf, 0, slice_data_buf->size,
+                                    VK_FORMAT_UNDEFINED);
+    ff_vk_shader_update_desc_buffer(vkctx, exec, &pv->estimate_slice_shd, 0, 1, 0,
+                                    slice_score_buf, 0, slice_score_buf->size,
+                                    VK_FORMAT_UNDEFINED);
+    ff_vk_shader_update_desc_buffer(vkctx, exec, &pv->estimate_slice_shd, 0, 2, 0,
+                                    &pv->prores_data_tables_buf, 0, pv->prores_data_tables_buf.size,
+                                    VK_FORMAT_UNDEFINED);
+    ff_vk_exec_bind_shader(vkctx, exec, &pv->estimate_slice_shd);
+
+    ff_vk_shader_update_push_const(vkctx, exec, &pv->estimate_slice_shd,
+                                   VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(estimate_info),
+                                   &estimate_info);
+    /* X covers every (slice, plane) pair, Y covers every candidate quantiser */
+    vk->CmdDispatch(exec->buf, (ctx->slices_per_picture * ctx->num_planes + estimate_dim_x - 1) / estimate_dim_x,
+                               ctx->force_quant ? 1 : (max_quant - min_quant + 1), 1);
+
+    /* Wait for writes to score buffer. */
+    vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) {
+        .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
+        .pBufferMemoryBarriers = & (VkBufferMemoryBarrier2) {
+            .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2,
+            .pNext = NULL,
+            .srcStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
+            .srcAccessMask = VK_ACCESS_2_SHADER_WRITE_BIT | VK_ACCESS_2_SHADER_READ_BIT,
+            .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
+            .dstAccessMask = VK_ACCESS_2_SHADER_WRITE_BIT | VK_ACCESS_2_SHADER_READ_BIT,
+            .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
+            .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
+            .buffer = slice_score_buf->buf,
+            .offset = 0U,
+            .size = slice_score_buf->size,
+        },
+        .bufferMemoryBarrierCount = 1u,
+    });
+
+    /* Compute optimal quant value for each slice */
+    trellis_node_info = (TrellisNodeInfo) {
+        .min_quant = min_quant,
+        .max_quant = max_quant,
+        .bits_per_mb = ctx->bits_per_mb,
+        .mbs_per_slice = ctx->mbs_per_slice,
+    };
+    ff_vk_shader_update_desc_buffer(vkctx, exec, &pv->trellis_node_shd, 0, 0, 0,
+                                    frame_size_buf, 0, frame_size_buf->size,
+                                    VK_FORMAT_UNDEFINED);
+    ff_vk_shader_update_desc_buffer(vkctx, exec, &pv->trellis_node_shd, 0, 1, 0,
+                                    slice_score_buf, 0, slice_score_buf->size,
+                                    VK_FORMAT_UNDEFINED);
+    ff_vk_exec_bind_shader(vkctx, exec, &pv->trellis_node_shd);
+    ff_vk_shader_update_push_const(vkctx, exec, &pv->trellis_node_shd, VK_SHADER_STAGE_COMPUTE_BIT,
+                                    0, sizeof(TrellisNodeInfo), &trellis_node_info);
+    vk->CmdDispatch(exec->buf, 1, 1, 1);
+
+    /* Wait for writes to the frame size buffer. */
+    vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) {
+        .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
+        .pBufferMemoryBarriers = & (VkBufferMemoryBarrier2) {
+            .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2,
+            .pNext = NULL,
+            .srcStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
+            .srcAccessMask = VK_ACCESS_2_SHADER_WRITE_BIT,
+            .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
+            .dstAccessMask = VK_ACCESS_2_SHADER_WRITE_BIT | VK_ACCESS_2_SHADER_READ_BIT,
+            .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
+            .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
+            .buffer = frame_size_buf->buf,
+            .offset = 0U,
+            .size = frame_size_buf->size,
+        },
+        .bufferMemoryBarrierCount = 1u,
+    });
+
+    /* Encode slices. The seek table (2 bytes per slice) sits at the start of
+     * the output buffer and the slice bytestream follows it. */
+    encode_info = (EncodeSliceInfo) {
+        .seek_table = pkt_vk_buf->address,
+        .bytestream = pkt_vk_buf->address + ctx->slices_per_picture * 2,
+        .num_planes = ctx->num_planes,
+        .slices_per_picture = ctx->slices_per_picture,
+        .max_quant = ctx->force_quant ? ctx->force_quant : max_quant,
+    };
+    ff_vk_shader_update_desc_buffer(vkctx, exec, &pv->encode_slice_shd, 0, 0, 0,
+                                    slice_data_buf, 0, slice_data_buf->size,
+                                    VK_FORMAT_UNDEFINED);
+    ff_vk_shader_update_desc_buffer(vkctx, exec, &pv->encode_slice_shd, 0, 1, 0,
+                                    slice_score_buf, 0, slice_score_buf->size,
+                                    VK_FORMAT_UNDEFINED);
+    ff_vk_shader_update_desc_buffer(vkctx, exec, &pv->encode_slice_shd, 0, 2, 0,
+                                    &pv->prores_data_tables_buf, 0, pv->prores_data_tables_buf.size,
+                                    VK_FORMAT_UNDEFINED);
+    ff_vk_exec_bind_shader(vkctx, exec, &pv->encode_slice_shd);
+    ff_vk_shader_update_push_const(vkctx, exec, &pv->encode_slice_shd,
+                                   VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(encode_info), &encode_info);
+    vk->CmdDispatch(exec->buf, FFALIGN(ctx->slices_per_picture, 64) / 64,
+                               ctx->num_planes, 1);
+
+fail:
+    return err;
+}
+
+/**
+ * Build the output packet for a frame whose encode was submitted on exec.
+ *
+ * Writes the frame and picture headers on the host, waits for the encode to
+ * finish, then moves the seek table and slice data of every picture into the
+ * packet: through a GPU copy into the host-mapped packet when
+ * FF_VK_EXT_EXTERNAL_HOST_MEMORY is available, with memcpy otherwise. The
+ * picture and frame sizes are patched into their headers and the packet is
+ * shrunk to the final size.
+ *
+ * @param avctx codec context; priv_data is a ProresVulkanContext
+ * @param exec  execution context of the frame; exec->opaque is its
+ *              VulkanEncodeProresFrameData
+ * @param pkt   packet to allocate and fill
+ * @return 0 on success, a negative AVERROR code on failure
+ */
+static int get_packet(AVCodecContext *avctx, FFVkExecContext *exec, AVPacket *pkt)
+{
+    ProresVulkanContext *pv = avctx->priv_data;
+    ProresContext *ctx = &pv->ctx;
+    VulkanEncodeProresFrameData *pd = exec->opaque;
+    FFVulkanContext *vkctx = &pv->vkctx;
+    FFVulkanFunctions *vk = &vkctx->vkfn;
+    FFVkExecContext *transfer_exec = NULL;
+    uint8_t *orig_buf, *buf, *slice_sizes;
+    uint8_t *picture_size_pos;
+    int picture_idx, err = 0;
+    int frame_size, picture_size;
+    int pkt_size = ctx->frame_size_upper_bound;
+    int transfer_slices = vkctx->extensions & FF_VK_EXT_EXTERNAL_HOST_MEMORY;
+    FFVkBuffer *out_data_buf, *frame_size_buf;
+    VkMappedMemoryRange invalidate_data;
+    AVBufferRef *mapped_ref = NULL;
+    FFVkBuffer *mapped_buf = NULL;
+
+    /* Allocate packet */
+    RET(ff_get_encode_buffer(avctx, pkt, pkt_size + FF_INPUT_BUFFER_MIN_SIZE, 0));
+
+    /* Initialize packet. */
+    pkt->pts      = pd->pts;
+    pkt->dts      = pd->pts;
+    pkt->duration = pd->duration;
+    pkt->flags   |= AV_PKT_FLAG_KEY * pd->key_frame;
+
+    if (avctx->flags & AV_CODEC_FLAG_COPY_OPAQUE) {
+        pkt->opaque          = pd->frame_opaque;
+        pkt->opaque_ref      = pd->frame_opaque_ref;
+        pd->frame_opaque_ref = NULL;
+    }
+
+    /* Write frame atom */
+    orig_buf = pkt->data;
+    buf = ff_prores_kostya_write_frame_header(avctx,  ctx, &orig_buf, pd->flags,
+                              pd->color_primaries, pd->color_trc,
+                                   pd->colorspace);
+
+    /* Make sure encoding's done */
+    ff_vk_exec_wait(vkctx, exec);
+
+    /* Roll transfer execution context */
+    if (transfer_slices) {
+        RET(ff_vk_host_map_buffer(vkctx, &mapped_ref, pkt->data, pkt->buf,
+                                  VK_BUFFER_USAGE_TRANSFER_DST_BIT));
+        mapped_buf = (FFVkBuffer *)mapped_ref->data;
+        transfer_exec = ff_vk_exec_get(vkctx, &pv->transfer_exec_pool);
+        ff_vk_exec_start(vkctx, transfer_exec);
+
+        /* Hand the mapping to the transfer context exactly once. Registering
+         * the same reference again for a second picture would have it
+         * released twice. */
+        ff_vk_exec_add_dep_buf(vkctx, transfer_exec, &mapped_ref, 1, 0);
+        mapped_ref = NULL; /* Ownership passed */
+    }
+
+    for (picture_idx = 0; picture_idx < ctx->pictures_per_frame; picture_idx++) {
+        /* Fetch buffers for the current picture. */
+        out_data_buf = (FFVkBuffer *)pd->out_data_ref[picture_idx]->data;
+        frame_size_buf = (FFVkBuffer *)pd->frame_size_ref[picture_idx]->data;
+
+        /* Invalidate slice/output data if needed */
+        invalidate_data = (VkMappedMemoryRange) {
+            .sType = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE,
+            .offset = 0,
+            .size = VK_WHOLE_SIZE,
+        };
+        if (!(frame_size_buf->flags & VK_MEMORY_PROPERTY_HOST_COHERENT_BIT)) {
+            invalidate_data.memory = frame_size_buf->mem;
+            vk->InvalidateMappedMemoryRanges(vkctx->hwctx->act_dev, 1, &invalidate_data);
+        }
+
+        /* Write picture header; the 32-bit picture size goes one byte in */
+        picture_size_pos = buf + 1;
+        buf = ff_prores_kostya_write_picture_header(ctx, buf);
+
+        /* Skip over seek table */
+        slice_sizes = buf;
+        buf += ctx->slices_per_picture * 2;
+
+        /* Calculate final size */
+        buf += *(int*)frame_size_buf->mapped_mem;
+
+        if (transfer_slices) {
+            /* Perform host mapped transfer of slice data */
+            ff_vk_exec_add_dep_buf(vkctx, transfer_exec, &pd->out_data_ref[picture_idx], 1, 0);
+            pd->out_data_ref[picture_idx] = NULL; /* Ownership passed */
+            vk->CmdCopyBuffer(transfer_exec->buf, out_data_buf->buf, mapped_buf->buf, 1, & (VkBufferCopy) {
+                .srcOffset = 0,
+                .dstOffset = mapped_buf->virtual_offset + slice_sizes - pkt->data,
+                .size = buf - slice_sizes,
+            });
+        } else {
+            /* Fallback to regular memcpy if transfer is not available */
+            if (!(out_data_buf->flags & VK_MEMORY_PROPERTY_HOST_COHERENT_BIT)) {
+                invalidate_data.memory = out_data_buf->mem;
+                vk->InvalidateMappedMemoryRanges(vkctx->hwctx->act_dev, 1, &invalidate_data);
+            }
+            memcpy(slice_sizes, out_data_buf->mapped_mem, buf - slice_sizes);
+            av_buffer_unref(&pd->out_data_ref[picture_idx]);
+        }
+
+        /* Write picture size with header */
+        picture_size = buf - (picture_size_pos - 1);
+        bytestream_put_be32(&picture_size_pos, picture_size);
+
+        /* Slice output buffers no longer needed */
+        av_buffer_unref(&pd->slice_data_ref[picture_idx]);
+        av_buffer_unref(&pd->slice_score_ref[picture_idx]);
+        av_buffer_unref(&pd->frame_size_ref[picture_idx]);
+    }
+
+    /* Write frame size in header */
+    orig_buf -= 8;
+    frame_size = buf - orig_buf;
+    bytestream_put_be32(&orig_buf, frame_size);
+
+    av_shrink_packet(pkt, frame_size);
+    av_log(avctx, AV_LOG_VERBOSE, "Encoded data: %iMiB\n", pkt->size / (1024*1024));
+
+    /* Wait for slice transfer */
+    if (transfer_slices) {
+        RET(ff_vk_exec_submit(vkctx, transfer_exec));
+        ff_vk_exec_wait(vkctx, transfer_exec);
+    }
+
+fail:
+    return err;
+}
+
+/**
+ * receive_packet callback: outputs the packet of a finished execution
+ * context, or pulls the next input frame and submits its encode.
+ *
+ * Each iteration rolls to the next execution context. If that context
+ * carries a submission, its packet is returned; otherwise a new frame is
+ * recorded and submitted on it.
+ *
+ * @param avctx codec context; priv_data is a ProresVulkanContext
+ * @param pkt   packet to fill when output is available
+ * @return the result of get_packet() when a packet is produced,
+ *         AVERROR(EAGAIN) while fewer than async_depth frames are in flight,
+ *         AVERROR_EOF once the input is drained and nothing is in flight,
+ *         or another negative AVERROR code on failure
+ */
+static int vulkan_encode_prores_receive_packet(AVCodecContext *avctx, AVPacket *pkt)
+{
+    int err, submit_err;
+    ProresVulkanContext *pv = avctx->priv_data;
+    ProresContext *ctx = &pv->ctx;
+    VulkanEncodeProresFrameData *pd;
+    FFVkExecContext *exec;
+    AVFrame *frame;
+
+    while (1) {
+        /* Roll an execution context */
+        exec = ff_vk_exec_get(&pv->vkctx, &pv->e);
+
+        /* If it had a frame, immediately output it */
+        if (exec->had_submission) {
+            exec->had_submission = 0;
+            pv->in_flight--;
+            return get_packet(avctx, exec, pkt);
+        }
+
+        /* Get next frame to encode */
+        frame = pv->frame;
+        err = ff_encode_get_frame(avctx, frame);
+        if (err < 0 && err != AVERROR_EOF) {
+            return err;
+        } else if (err == AVERROR_EOF) {
+            if (!pv->in_flight)
+                return err;
+            /* Draining: keep rolling until a pending context is found */
+            continue;
+        }
+
+        /* Encode frame */
+        pd = exec->opaque;
+        pd->color_primaries = frame->color_primaries;
+        pd->color_trc = frame->color_trc;
+        pd->colorspace = frame->colorspace;
+        pd->pts = frame->pts;
+        pd->duration = frame->duration;
+        pd->flags = frame->flags;
+        /* ProRes is intra-only, so every packet is a keyframe */
+        pd->key_frame = 1;
+        if (avctx->flags & AV_CODEC_FLAG_COPY_OPAQUE) {
+            pd->frame_opaque     = frame->opaque;
+            pd->frame_opaque_ref = frame->opaque_ref;
+            frame->opaque_ref    = NULL;
+        }
+
+        /* Record every picture; a failure on either one is reported */
+        err = vulkan_encode_prores_submit_frame(avctx, exec, frame, 0);
+        if (err >= 0 && ctx->pictures_per_frame > 1)
+            err = vulkan_encode_prores_submit_frame(avctx, exec, frame, 1);
+
+        /* Submit execution context regardless of the recording result */
+        submit_err = ff_vk_exec_submit(&pv->vkctx, exec);
+        av_frame_unref(frame);
+        if (err < 0)
+            return err;
+        if (submit_err < 0)
+            return submit_err;
+
+        pv->in_flight++;
+        if (pv->in_flight < pv->async_depth)
+            return AVERROR(EAGAIN);
+    }
+
+    return 0;
+}
+
+/**
+ * close callback: releases everything encode_init() created.
+ *
+ * Frees the execution pools, the per-context frame data array, the temporary
+ * input frame, all shaders, the data tables buffer and the buffer pools, then
+ * tears down the Vulkan context.
+ *
+ * @param avctx codec context; priv_data is a ProresVulkanContext
+ * @return always 0
+ */
+static av_cold int encode_close(AVCodecContext *avctx)
+{
+    ProresVulkanContext *pv = avctx->priv_data;
+    ProresContext *ctx = &pv->ctx;
+    FFVulkanContext *vkctx = &pv->vkctx;
+
+    ff_vk_exec_pool_free(vkctx, &pv->e);
+    ff_vk_exec_pool_free(vkctx, &pv->transfer_exec_pool);
+
+    /* The execution contexts pointed into this array through ->opaque, so it
+     * is released only after the pools are gone. */
+    av_freep(&pv->exec_ctx_info);
+
+    /* Temporary frame allocated at init */
+    av_frame_free(&pv->frame);
+
+    /* The alpha shader is only built when alpha is encoded */
+    if (ctx->alpha_bits)
+        ff_vk_shader_free(vkctx, &pv->alpha_data_shd);
+
+    ff_vk_shader_free(vkctx, &pv->slice_data_shd[0]);
+    ff_vk_shader_free(vkctx, &pv->slice_data_shd[1]);
+    ff_vk_shader_free(vkctx, &pv->estimate_slice_shd);
+    ff_vk_shader_free(vkctx, &pv->encode_slice_shd);
+    ff_vk_shader_free(vkctx, &pv->trellis_node_shd);
+
+    ff_vk_free_buf(vkctx, &pv->prores_data_tables_buf);
+
+    av_buffer_pool_uninit(&pv->pkt_buf_pool);
+    av_buffer_pool_uninit(&pv->slice_data_buf_pool);
+    av_buffer_pool_uninit(&pv->slice_score_buf_pool);
+    av_buffer_pool_uninit(&pv->frame_size_buf_pool);
+
+    ff_vk_uninit(vkctx);
+
+    return 0;
+}
+
+/**
+ * init callback: sets up the Vulkan context, the compute and transfer
+ * execution pools, the shared ProRes state, the per-context frame data, the
+ * shaders and the quantiser/codebook table buffer.
+ *
+ * The SPIR-V compiler is only needed while the shaders are built and is
+ * released before returning. Everything stored in the private context is
+ * released by encode_close().
+ *
+ * @param avctx codec context; priv_data is a ProresVulkanContext
+ * @return 0 on success, a negative AVERROR code on failure
+ */
+static av_cold int encode_init(AVCodecContext *avctx)
+{
+    ProresVulkanContext *pv = avctx->priv_data;
+    ProresContext *ctx = &pv->ctx;
+    int err = 0, i, q;
+    FFVulkanContext *vkctx = &pv->vkctx;
+    FFVkSPIRVCompiler *spv = NULL;
+
+    /* Init vulkan */
+    RET(ff_vk_init(vkctx, avctx, NULL, avctx->hw_frames_ctx));
+
+    pv->qf = ff_vk_qf_find(vkctx, VK_QUEUE_COMPUTE_BIT, 0);
+    if (!pv->qf) {
+        av_log(avctx, AV_LOG_ERROR, "Device has no compute queues!\n");
+        err = AVERROR(ENOTSUP);
+        goto fail;
+    }
+
+    spv = ff_vk_spirv_init();
+    if (!spv) {
+        av_log(avctx, AV_LOG_ERROR, "Unable to initialize SPIR-V compiler!\n");
+        err = AVERROR_EXTERNAL;
+        goto fail;
+    }
+
+    RET(ff_vk_exec_pool_init(vkctx, pv->qf, &pv->e, 1, 0, 0, 0, NULL));
+
+    pv->transfer_qf = ff_vk_qf_find(vkctx, VK_QUEUE_TRANSFER_BIT, 0);
+    if (!pv->transfer_qf) {
+        av_log(avctx, AV_LOG_ERROR, "Device has no transfer queues!\n");
+        /* err is still 0 here, so an explicit error code is required */
+        err = AVERROR(ENOTSUP);
+        goto fail;
+    }
+
+    RET(ff_vk_exec_pool_init(vkctx, pv->transfer_qf, &pv->transfer_exec_pool, 1, 0, 0, 0, NULL));
+
+    /* Init common prores structures */
+    RET(ff_prores_kostya_encode_init(avctx, ctx, vkctx->frames->sw_format));
+
+    /* Temporary frame */
+    pv->frame = av_frame_alloc();
+    if (!pv->frame) {
+        err = AVERROR(ENOMEM);
+        goto fail;
+    }
+
+    /* Async data pool.
+     * NOTE(review): this replaces the user-supplied async_depth option with
+     * the size of the pool created above, which is requested with a single
+     * context; confirm that is intended. */
+    pv->async_depth = pv->e.pool_size;
+    pv->exec_ctx_info = av_calloc(pv->async_depth, sizeof(*pv->exec_ctx_info));
+    if (!pv->exec_ctx_info) {
+        err = AVERROR(ENOMEM);
+        goto fail;
+    }
+    for (i = 0; i < pv->async_depth; i++)
+        pv->e.contexts[i].opaque = &pv->exec_ctx_info[i];
+
+    /* Compile shaders used by encoder; any failure aborts initialisation */
+    RET(init_slice_data_pipeline(pv, spv, &pv->slice_data_shd[0], "slice_data_blocks2", 2));
+    RET(init_slice_data_pipeline(pv, spv, &pv->slice_data_shd[1], "slice_data_blocks4", 4));
+    RET(init_estimate_slice_pipeline(pv, spv, &pv->estimate_slice_shd, "estimate_slice"));
+    RET(init_trellis_node_pipeline(pv, spv, &pv->trellis_node_shd, "trellis_node"));
+    RET(init_encode_slice_pipeline(pv, spv, &pv->encode_slice_shd, "encode_slice"));
+    if (ctx->alpha_bits) {
+        RET(init_alpha_data_pipeline(pv, spv, &pv->alpha_data_shd, "alpha_data"));
+    }
+
+    /* Create prores data tables uniform buffer. */
+    RET(ff_vk_create_buf(vkctx, &pv->prores_data_tables_buf,
+                         sizeof(ProresDataTables), NULL, NULL,
+                         VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT |
+                         VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT,
+                         VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT |
+                         VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT));
+    RET(ff_vk_map_buffer(vkctx, &pv->prores_data_tables_buf, (void *)&pv->tables, 0));
+    memcpy(pv->tables->qmat, ctx->quants, sizeof(ctx->quants));
+    memcpy(pv->tables->qmat_chroma, ctx->quants_chroma, sizeof(ctx->quants_chroma));
+    memcpy(pv->tables->scan, ctx->scantable, sizeof(ff_prores_progressive_scan));
+    memcpy(pv->tables->dc_codebook, ff_prores_dc_codebook, sizeof(ff_prores_dc_codebook));
+    memcpy(pv->tables->run_to_cb, ff_prores_run_to_cb, sizeof(ff_prores_run_to_cb));
+    memcpy(pv->tables->level_to_cb, ff_prores_level_to_cb, sizeof(ff_prores_level_to_cb));
+
+    /* Quantisers from MAX_STORED_Q upwards are not covered by the copies
+     * above, so their matrices are computed by scaling the base matrices. */
+    for (q = MAX_STORED_Q; q < 128; ++q) {
+        for (i = 0; i < 64; i++) {
+            pv->tables->qmat[q][i] = ctx->quant_mat[i] * q;
+            pv->tables->qmat_chroma[q][i] = ctx->quant_chroma_mat[i] * q;
+        }
+    }
+
+fail:
+    /* The compiler is not kept in the context; release it on every path */
+    if (spv)
+        spv->uninit(&spv);
+    return err;
+}
+
+/* Byte offset of a field inside the private context, for the option table */
+#define OFFSET(x) offsetof(ProresVulkanContext, x)
+/* Flags shared by every option below: video, encoding */
+#define VE     AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_ENCODING_PARAM
+
+/* Private options of the encoder. Entries of type AV_OPT_TYPE_CONST are the
+ * named values of the option that shares their .unit. */
+static const AVOption options[] = {
+    { "mbs_per_slice", "macroblocks per slice", OFFSET(ctx.mbs_per_slice),
+        AV_OPT_TYPE_INT, { .i64 = 8 }, 1, MAX_MBS_PER_SLICE, VE },
+    /* Profile selection and its named values */
+    { "profile",       NULL, OFFSET(ctx.profile), AV_OPT_TYPE_INT,
+        { .i64 = PRORES_PROFILE_AUTO },
+        PRORES_PROFILE_AUTO, PRORES_PROFILE_4444XQ, VE, .unit = "profile" },
+    { "auto",         NULL, 0, AV_OPT_TYPE_CONST, { .i64 = PRORES_PROFILE_AUTO },
+        0, 0, VE, .unit = "profile" },
+    { "proxy",         NULL, 0, AV_OPT_TYPE_CONST, { .i64 = PRORES_PROFILE_PROXY },
+        0, 0, VE, .unit = "profile" },
+    { "lt",            NULL, 0, AV_OPT_TYPE_CONST, { .i64 = PRORES_PROFILE_LT },
+        0, 0, VE, .unit = "profile" },
+    { "standard",      NULL, 0, AV_OPT_TYPE_CONST, { .i64 = PRORES_PROFILE_STANDARD },
+        0, 0, VE, .unit = "profile" },
+    { "hq",            NULL, 0, AV_OPT_TYPE_CONST, { .i64 = PRORES_PROFILE_HQ },
+        0, 0, VE, .unit = "profile" },
+    { "4444",          NULL, 0, AV_OPT_TYPE_CONST, { .i64 = PRORES_PROFILE_4444 },
+        0, 0, VE, .unit = "profile" },
+    { "4444xq",        NULL, 0, AV_OPT_TYPE_CONST, { .i64 = PRORES_PROFILE_4444XQ },
+        0, 0, VE, .unit = "profile" },
+    { "vendor", "vendor ID", OFFSET(ctx.vendor),
+        AV_OPT_TYPE_STRING, { .str = "Lavc" }, 0, 0, VE },
+    { "bits_per_mb", "desired bits per macroblock", OFFSET(ctx.bits_per_mb),
+        AV_OPT_TYPE_INT, { .i64 = 0 }, 0, 8192, VE },
+    /* Quantiser matrix selection and its named values; -1 selects "auto" */
+    { "quant_mat", "quantiser matrix", OFFSET(ctx.quant_sel), AV_OPT_TYPE_INT,
+        { .i64 = -1 }, -1, QUANT_MAT_DEFAULT, VE, .unit = "quant_mat" },
+    { "auto",          NULL, 0, AV_OPT_TYPE_CONST, { .i64 = -1 },
+        0, 0, VE, .unit = "quant_mat" },
+    { "proxy",         NULL, 0, AV_OPT_TYPE_CONST, { .i64 = QUANT_MAT_PROXY },
+        0, 0, VE, .unit = "quant_mat" },
+    { "lt",            NULL, 0, AV_OPT_TYPE_CONST, { .i64 = QUANT_MAT_LT },
+        0, 0, VE, .unit = "quant_mat" },
+    { "standard",      NULL, 0, AV_OPT_TYPE_CONST, { .i64 = QUANT_MAT_STANDARD },
+        0, 0, VE, .unit = "quant_mat" },
+    { "hq",            NULL, 0, AV_OPT_TYPE_CONST, { .i64 = QUANT_MAT_HQ },
+        0, 0, VE, .unit = "quant_mat" },
+    { "default",       NULL, 0, AV_OPT_TYPE_CONST, { .i64 = QUANT_MAT_DEFAULT },
+        0, 0, VE, .unit = "quant_mat" },
+    { "alpha_bits", "bits for alpha plane", OFFSET(ctx.alpha_bits), AV_OPT_TYPE_INT,
+        { .i64 = 16 }, 0, 16, VE },
+    { "async_depth", "Internal parallelization depth", OFFSET(async_depth), AV_OPT_TYPE_INT,
+            { .i64 = 1 }, 1, INT_MAX, VE },
+    { NULL }
+};
+
+/* AVClass of the encoder's private context; exposes the options table above */
+static const AVClass proresenc_class = {
+    .class_name = "ProRes vulkan encoder",
+    .item_name  = av_default_item_name,
+    .option     = options,
+    .version    = LIBAVUTIL_VERSION_INT,
+};
+
+/* NULL-terminated list of hardware configurations advertised by the encoder:
+ * a Vulkan frames context, or a Vulkan device. */
+static const AVCodecHWConfigInternal *const prores_ks_hw_configs[] = {
+    HW_CONFIG_ENCODER_FRAMES(VULKAN, VULKAN),
+    HW_CONFIG_ENCODER_DEVICE(NONE,  VULKAN),
+    NULL,
+};
+
+/* Codec registration entry for the Vulkan compute ProRes encoder */
+const FFCodec ff_prores_ks_vulkan_encoder = {
+    .p.name         = "prores_ks_vulkan",
+    CODEC_LONG_NAME("Apple ProRes (iCodec Pro)"),
+    .p.type         = AVMEDIA_TYPE_VIDEO,
+    .p.id           = AV_CODEC_ID_PRORES,
+    .priv_data_size = sizeof(ProresVulkanContext),
+    .init           = encode_init,
+    .close          = encode_close,
+    /* Uses the receive_packet API: the encoder pulls its own input frames */
+    FF_CODEC_RECEIVE_PACKET_CB(&vulkan_encode_prores_receive_packet),
+    .p.capabilities = AV_CODEC_CAP_DELAY |
+                      AV_CODEC_CAP_HARDWARE |
+                      AV_CODEC_CAP_ENCODER_FLUSH |
+                      AV_CODEC_CAP_ENCODER_REORDERED_OPAQUE,
+    /* Input is accepted as Vulkan hardware frames only */
+    CODEC_PIXFMTS(AV_PIX_FMT_VULKAN),
+    .hw_configs     = prores_ks_hw_configs,
+    .color_ranges   = AVCOL_RANGE_MPEG,
+    .p.priv_class   = &proresenc_class,
+    .p.profiles     = NULL_IF_CONFIG_SMALL(ff_prores_profiles),
+    .caps_internal  = FF_CODEC_CAP_INIT_CLEANUP | FF_CODEC_CAP_EOF_FLUSH,
+};
diff --git a/libavcodec/vulkan/Makefile b/libavcodec/vulkan/Makefile
index d8e1471fa6..f69e430c33 100644
--- a/libavcodec/vulkan/Makefile
+++ b/libavcodec/vulkan/Makefile
@@ -9,6 +9,13 @@ OBJS-$(CONFIG_FFV1_VULKAN_ENCODER)  +=  vulkan/common.o \
 					vulkan/ffv1_enc_rct.o vulkan/ffv1_enc_setup.o \
 					vulkan/ffv1_rct_search.o vulkan/ffv1_enc.o
 
+OBJS-$(CONFIG_PRORES_KS_VULKAN_ENCODER) += vulkan/common.o \
+					vulkan/prores_ks_alpha_data.o \
+					vulkan/prores_ks_slice_data.o \
+					vulkan/prores_ks_estimate_slice.o \
+					vulkan/prores_ks_encode_slice.o \
+					vulkan/prores_ks_trellis_node.o
+
 OBJS-$(CONFIG_FFV1_VULKAN_HWACCEL)  +=  vulkan/common.o \
 					vulkan/rangecoder.o vulkan/ffv1_vlc.o \
 					vulkan/ffv1_common.o vulkan/ffv1_reset.o \
diff --git a/libavcodec/vulkan/prores_ks_alpha_data.comp b/libavcodec/vulkan/prores_ks_alpha_data.comp
new file mode 100644
index 0000000000..825ba28a4f
--- /dev/null
+++ b/libavcodec/vulkan/prores_ks_alpha_data.comp
@@ -0,0 +1,67 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#extension GL_EXT_samplerless_texture_functions : require
+
+/* Table of possible edge slice configurations.
+ * Indexed by the number of macroblocks left over after the full-width slices
+ * (0..7). Each entry lists the widths, in macroblocks, of up to three edge
+ * slices, largest first; unused positions are 0. */
+const uvec3 edge_mps_table[8] = uvec3[](
+    uvec3(0, 0, 0),
+    uvec3(1, 0, 0),
+    uvec3(2, 0, 0),
+    uvec3(2, 1, 0),
+    uvec3(4, 0, 0),
+    uvec3(4, 1, 0),
+    uvec3(4, 2, 0),
+    uvec3(4, 2, 1)
+);
+
+/* Copies one alpha sample per invocation from the plane image into the alpha
+ * coefficient array (coeffs[3]) of the slice that contains it.
+ * Presumably one workgroup covers one 16x16 macroblock; confirm against the
+ * local size, which is declared outside this file. */
+void main()
+{
+    /* Clamp to the last valid texel; the clamped coordinate is used for both
+     * the fetch and the destination index computed below */
+    ivec2 coord = min(ivec2(gl_GlobalInvocationID.xy), textureSize(plane, 0) - ivec2(1));
+    int alpha = texelFetch(plane, coord, 0).x;
+
+    /* Rescale the fetched sample to the configured alpha depth: drop two bits
+     * for 8-bit alpha, otherwise shift up by six and fill the low bits from
+     * the high ones (assumes a 10-bit input sample; TODO confirm) */
+#if ALPHA_BITS == 8
+    alpha >>= 2;
+#else
+    alpha = (alpha << 6) | (alpha >> 4);
+#endif
+
+    /* Start from a full-width slice; mb_width is the macroblock column at
+     * which the narrower edge slices begin */
+    uint mbs_per_slice = MAX_MBS_PER_SLICE;
+    uint slices_width = WIDTH_IN_MB / mbs_per_slice;
+    uint mb_width = slices_width * mbs_per_slice;
+    uint slice_x = gl_WorkGroupID.x / mbs_per_slice;
+    uint slice_y = gl_WorkGroupID.y;
+    /* NOTE(review): the y component is scaled by mbs_per_slice * 16 as well,
+     * while slice_y advances once per workgroup row; confirm this matches the
+     * intended slice height. */
+    uvec2 slice_base = uvec2(slice_x, slice_y) * (mbs_per_slice * 16u);
+
+    /* Handle slice macroblock size reduction on edge slices */
+    if (gl_WorkGroupID.x >= mb_width)
+    {
+        uint edge_mb = gl_WorkGroupID.x - mb_width;
+        uvec3 table = edge_mps_table[WIDTH_IN_MB - mb_width];
+        /* Starting macroblock of each of the up to three edge slices */
+        uvec3 base = uvec3(0, table.x, table.x + table.y);
+        uint edge_slice = edge_mb < base.y ? 0 : (edge_mb < base.z ? 1 : 2);
+        slice_x += edge_slice;
+        slice_base += base[edge_slice] * (DCTSIZE * 2u);
+        mbs_per_slice = table[edge_slice];
+    }
+
+    /* Row-major index within the slice; the row stride is the slice width in
+     * samples */
+    uint slice = slice_y * SLICES_PITCH + slice_x;
+    uvec2 coeff_coord = uvec2(coord) - slice_base;
+    uint coeff = coeff_coord.y * (mbs_per_slice * 16u) + coeff_coord.x;
+    slices[slice].coeffs[3][coeff] = int16_t(alpha);
+}
\ No newline at end of file
diff --git a/libavcodec/vulkan/prores_ks_encode_slice.comp b/libavcodec/vulkan/prores_ks_encode_slice.comp
new file mode 100644
index 0000000000..2c06388a46
--- /dev/null
+++ b/libavcodec/vulkan/prores_ks_encode_slice.comp
@@ -0,0 +1,230 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#define CFACTOR_Y444 3 /* CHROMA_FACTOR value for which chroma planes keep four blocks per macroblock */
+
+layout(push_constant, scalar) uniform EncodeSliceInfo {
+    u8buf bytestream;        /* output buffer; each slice starts at its scores[].buf_start offset */
+    u8vec2buf seek_table;    /* receives one byte-swapped 16-bit slice size per slice */
+    int num_planes;          /* sizes the slice header: 2 * num_planes bytes */
+    int slices_per_picture;  /* invocations with a slice index >= this exit early */
+    int max_quant;           /* score tables are indexed by min(quant, max_quant + 1) */
+};
+
+int av_zero_extend(int a, uint p)
+{
+    return int(uint(a) & ~(~0U << p)); /* clear every bit above the p lowest ones */
+}
+
+void encode_vlc_codeword(inout PutBitContext pb, uint codebook, int val)
+{
+    uint rice_order  =  codebook >> 5;       /* Rice code order */
+    uint exp_order   = (codebook >> 2) & 7;  /* exp-Golomb code order */
+    uint switch_bits = (codebook & 3) + 1;   /* prefix bits that switch from Rice to exp-Golomb */
+    uint switch_val  = switch_bits << rice_order; /* first value coded with exp-Golomb */
+
+    if (val < switch_val) {
+        /* Rice code: unary quotient, stop bit, then rice_order remainder bits */
+        int quotient = val >> rice_order;
+        if (quotient != 0)
+            put_bits(pb, quotient, 0);
+        put_bits(pb, 1, 1);
+        if (rice_order != 0)
+            put_bits(pb, rice_order, av_zero_extend(val, rice_order));
+        return;
+    }
+
+    /* exp-Golomb code: zero prefix followed by the rebased value */
+    val -= int(switch_val - (1 << exp_order));
+    int msb = findMSB(val);
+    put_bits(pb, msb - exp_order + switch_bits, 0);
+    put_bits(pb, msb + 1, val);
+}
+
+#define GET_SIGN(x)  ((x) >> 31)               /* 0 for non-negative, -1 (all ones) for negative 32-bit ints */
+#define MAKE_CODE(x) (((x) * 2) ^ GET_SIGN(x)) /* fold a signed value into a non-negative code: 2x or -2x - 1 */
+
+#define FIRST_DC_CB 0xB8 // rice_order = 5, exp_golomb_order = 6, switch_bits field = 0 (decoded as 0 + 1 = 1 prefix bit)
+
+void encode_dcs(inout PutBitContext pb, bool is_chroma, int q)
+{
+    uint slice = gl_GlobalInvocationID.x;
+    uint plane = gl_GlobalInvocationID.y;
+    /* Chroma has two blocks per macroblock unless CHROMA_FACTOR is CFACTOR_Y444 */
+    uint mb_blocks = (is_chroma && CHROMA_FACTOR != CFACTOR_Y444) ? 2 : 4;
+    uint num_blocks = slices[slice].mbs_per_slice * mb_blocks;
+    int dc_quant = is_chroma ? qmat_chroma[q][0] : qmat[q][0];
+    /* The first DC is coded on its own with the fixed FIRST_DC_CB codebook */
+    int sample = slices[slice].coeffs[plane][0];
+    int last_dc = (sample - 0x4000) / dc_quant;
+    encode_vlc_codeword(pb, FIRST_DC_CB, MAKE_CODE(last_dc));
+    /* Every later DC is coded as a delta whose sign is flipped by the previous delta's sign */
+    int cb_idx = 5, last_sign = 0;
+    for (int blk = 1; blk < num_blocks; blk++) {
+        sample = slices[slice].coeffs[plane][blk * 64];
+        int cur_dc = (sample - 0x4000) / dc_quant;
+        int diff = cur_dc - last_dc;
+        int code = MAKE_CODE((diff ^ last_sign) - last_sign);
+        encode_vlc_codeword(pb, dc_codebook[cb_idx], code);
+        cb_idx = min(code, 6);
+        last_sign = GET_SIGN(diff);
+        last_dc = cur_dc;
+    }
+}
+
+void encode_acs(inout PutBitContext pb, bool is_chroma, int q)
+{
+    uint slice = gl_GlobalInvocationID.x;
+    uint plane = gl_GlobalInvocationID.y;
+    uint mb_blocks = (is_chroma && CHROMA_FACTOR != CFACTOR_Y444) ? 2 : 4;
+    uint coeff_end = (slices[slice].mbs_per_slice * mb_blocks) << 6;
+    int last_run = 4;
+    int last_level = 2;
+    int zeros = 0;
+    /* Walk the AC positions in scan order; each one is visited in every block of the slice */
+    for (int pos = 1; pos < 64; pos++) {
+        int first = scan[pos];
+        int quant = is_chroma ? qmat_chroma[q][first] : qmat[q][first];
+        for (int idx = first; idx < coeff_end; idx += 64) {
+            int level = int(slices[slice].coeffs[plane][idx]) / quant;
+            if (level == 0) {
+                zeros++;
+                continue;
+            }
+            /* Emit the zero run, then magnitude - 1 and the sign bit */
+            encode_vlc_codeword(pb, run_to_cb[last_run], zeros);
+            encode_vlc_codeword(pb, level_to_cb[last_level], abs(level) - 1);
+            put_bits(pb, 1, av_zero_extend(GET_SIGN(level), 1));
+            last_run = min(zeros, 15);
+            last_level = min(abs(level), 9);
+            zeros = 0;
+        }
+    }
+}
+
+void encode_slice_plane(inout PutBitContext pb, int q)
+{
+    /* Planes 1 and 2 are quantised with the chroma matrices, any other plane with the luma ones */
+    bool chroma = gl_GlobalInvocationID.y == 1 || gl_GlobalInvocationID.y == 2;
+    encode_dcs(pb, chroma, q);
+    encode_acs(pb, chroma, q);
+}
+
+void put_alpha_diff(inout PutBitContext pb, int cur, int prev)
+{
+    const int dbits = (ALPHA_BITS == 8) ? 4 : 7;
+    const int dsize = 1 << (dbits - 1);
+    /* Wrap the difference into the signed range of an ALPHA_BITS-wide sample */
+    int diff = av_zero_extend(cur - prev, ALPHA_BITS);
+    if (diff >= (1 << ALPHA_BITS) - dsize)
+        diff -= 1 << ALPHA_BITS;
+
+    /* Small non-zero differences use the short form, everything else is written raw */
+    bool short_form = diff != 0 && diff >= -dsize && diff <= dsize;
+    put_bits(pb, 1, short_form ? 0 : 1);
+    if (short_form) {
+        put_bits(pb, dbits - 1, abs(diff) - 1);
+        put_bits(pb, 1, int(diff < 0));
+    } else {
+        put_bits(pb, ALPHA_BITS, diff);
+    }
+}
+
+void put_alpha_run(inout PutBitContext pb, int run)
+{
+    /* A zero-length run is signalled by a single set bit */
+    if (run == 0) {
+        put_bits(pb, 1, 1);
+        return;
+    }
+
+    /* Otherwise a clear bit precedes the run length, stored in 4 or 15 bits */
+    put_bits(pb, 1, 0);
+    put_bits(pb, run < 0x10 ? 4 : 15, run);
+}
+
+void encode_alpha_plane(inout PutBitContext pb)
+{
+    uint slice = gl_GlobalInvocationID.x;
+    const int total = int(slices[slice].mbs_per_slice) * 256;
+    /* The first sample is coded as a difference against the all-ones value */
+    int last = (1 << ALPHA_BITS) - 1;
+    int sample = slices[slice].coeffs[3][0];
+    put_alpha_diff(pb, sample, last);
+    last = sample;
+
+    /* Afterwards count repeated samples and emit run + difference at each change */
+    int repeats = 0, pos = 1;
+    do {
+        sample = slices[slice].coeffs[3][pos++];
+        if (sample == last) {
+            repeats++;
+        } else {
+            put_alpha_run(pb, repeats);
+            put_alpha_diff(pb, sample, last);
+            last = sample;
+            repeats = 0;
+        }
+    } while (pos < total);
+    put_alpha_run(pb, repeats);
+}
+
+u8vec2 byteswap16(int value)
+{
+    return unpack8(uint16_t(value)).yx; /* low 16 bits of value split into two bytes, returned in swapped (.yx) order */
+}
+
+void main()
+{
+    uint slice = gl_GlobalInvocationID.x; /* x selects the slice, y the plane */
+    if (slice >= slices_per_picture)
+        return;
+
+    uint plane = gl_GlobalInvocationID.y;
+    int q = scores[slice].quant;
+    int q_idx = min(q, max_quant + 1); /* quantisers above max_quant share the last score slot */
+    int slice_hdr_size = 2 * num_planes;
+    int slice_size = slice_hdr_size + (scores[slice].total_bits[q_idx] / 8);
+    u8buf buf = OFFBUF(u8buf, bytestream, scores[slice].buf_start);
+
+    /* Write slice header (plane 0 only): header size in bits, quantiser, byte-swapped sizes of all planes but the last */
+    if (plane == 0)
+    {
+        buf[0].v = uint8_t(slice_hdr_size * 8);
+        buf[1].v = uint8_t(q);
+        u8vec2buf slice_hdr = OFFBUF(u8vec2buf, buf, 2);
+        for (int i = 0; i < num_planes - 1; i++)
+        {
+            int bits = scores[slice].bits[q_idx][i] / 8; /* despite the name this is a size in bytes */
+            slice_hdr[i].v = byteswap16(bits);
+        }
+        seek_table[slice].v = byteswap16(slice_size);
+    }
+
+    int plane_offset = 0; /* byte offset of this plane: sum of the stored sizes of the planes before it */
+    for (int i = 0; i < plane; ++i)
+        plane_offset += scores[slice].bits[q_idx][i] / 8;
+
+    /* Encode slice plane; NOTE(review): offsets and sizes come from stored estimates, confirm they equal the bytes written here */
+    PutBitContext pb;
+    init_put_bits(pb, OFFBUF(u8buf, buf, slice_hdr_size + plane_offset), 0);
+    if (plane == 3)
+        encode_alpha_plane(pb);
+    else
+        encode_slice_plane(pb, q);
+    flush_put_bits(pb);
+}
\ No newline at end of file
diff --git a/libavcodec/vulkan/prores_ks_estimate_slice.comp b/libavcodec/vulkan/prores_ks_estimate_slice.comp
new file mode 100644
index 0000000000..5f9b39cd75
--- /dev/null
+++ b/libavcodec/vulkan/prores_ks_estimate_slice.comp
@@ -0,0 +1,267 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#extension GL_KHR_shader_subgroup_clustered : require
+#extension GL_KHR_shader_subgroup_shuffle : require
+
+#define CFACTOR_Y444 3 /* CHROMA_FACTOR value for which chroma planes keep four blocks per macroblock */
+
+layout(push_constant, scalar) uniform EstimateSliceInfo {
+    uint slices_per_picture; /* invocations whose slice index is >= this exit early */
+    uint min_quant;          /* quantiser evaluated when gl_GlobalInvocationID.y == 0 */
+    uint max_quant;          /* last regular quantiser; its invocations also run the overquant search */
+    uint bits_per_mb;        /* per-macroblock bit budget that decides whether overquant is needed */
+};
+
+int av_zero_extend(int a, uint p)
+{
+    return int(uint(a) & ~(~0U << p)); /* clear every bit above the p lowest ones */
+}
+
+#define GET_SIGN(x)  ((x) >> 31)               /* 0 for non-negative, -1 (all ones) for negative 32-bit ints */
+#define MAKE_CODE(x) (((x) * 2) ^ GET_SIGN(x)) /* fold a signed value into a non-negative code: 2x or -2x - 1 */
+
+int estimate_vlc(uint codebook, int val)
+{
+    /* Unpack the codebook descriptor */
+    uint rice_order  =  codebook >> 5;       /* Rice code order */
+    uint exp_order   = (codebook >> 2) & 7;  /* exp-Golomb code order */
+    uint switch_bits = (codebook & 3) + 1;   /* prefix bits that switch from Rice to exp-Golomb */
+
+    uint switch_val  = switch_bits << rice_order;
+
+    /* Rice code: unary quotient, stop bit and rice_order remainder bits */
+    if (val < switch_val)
+        return int((val >> rice_order) + rice_order + 1);
+
+    /* exp-Golomb code: zero prefix plus the rebased value */
+    val -= int(switch_val - (1 << exp_order));
+    int msb = findMSB(val);
+    uint prefix_len = msb - exp_order + switch_bits;
+    uint suffix_len = uint(msb + 1);
+    return int(prefix_len + suffix_len);
+}
+
+#define FIRST_DC_CB 0xB8 // rice_order = 5, exp_golomb_order = 6, switch_bits field = 0 (decoded as 0 + 1 = 1 prefix bit)
+
+int estimate_dcs(inout int error, uint slice, uint plane, uint q)
+{
+    uint mb_blocks = (plane != 0 && CHROMA_FACTOR != CFACTOR_Y444) ? 2 : 4;
+    uint num_blocks = slices[slice].mbs_per_slice * mb_blocks;
+    int dc_quant = plane != 0 ? qmat_chroma[q][0] : qmat[q][0];
+
+    /* Cost of the first DC, coded on its own with the fixed FIRST_DC_CB codebook */
+    int sample = slices[slice].coeffs[plane][0];
+    int last_dc = (sample - 0x4000) / dc_quant;
+    int bits = estimate_vlc(FIRST_DC_CB, MAKE_CODE(last_dc));
+
+    /* Block 1's remainder is added here and again in the loop; block 0's never is */
+    sample = slices[slice].coeffs[plane][64];
+    error += abs(sample - 0x4000) % dc_quant;
+
+    int cb_idx = 5, last_sign = 0;
+    for (int blk = 1; blk < num_blocks; ++blk) {
+        sample = slices[slice].coeffs[plane][blk * 64];
+        int cur_dc = (sample - 0x4000) / dc_quant;
+        error += abs(sample - 0x4000) % dc_quant;
+        int diff = cur_dc - last_dc;
+        int code = MAKE_CODE((diff ^ last_sign) - last_sign);
+        bits += estimate_vlc(dc_codebook[cb_idx], code);
+        cb_idx = min(code, 6);
+        last_sign = GET_SIGN(diff);
+        last_dc = cur_dc;
+    }
+    return bits;
+}
+
+#define FFALIGN(x, a) (((x)+(a)-1)&~((a)-1)) /* round x up to a multiple of a; a must be a power of two */
+#define SCORE_LIMIT   1073741823              /* 2^30 - 1; assigned as total score when a slice exceeds 65000 bytes */
+
+int estimate_acs(inout int error, uint slice, uint plane, uint q)
+{
+    uint mb_blocks = (plane != 0 && CHROMA_FACTOR != CFACTOR_Y444) ? 2 : 4;
+    uint coeff_end = (slices[slice].mbs_per_slice * mb_blocks) << 6;
+    int last_run = 4;
+    int last_level = 2;
+    int zeros = 0;
+    int bits = 0;
+    /* Walk the AC positions in scan order; each one is visited in every block of the slice */
+    for (int pos = 1; pos < 64; pos++) {
+        int first = scan[pos];
+        int quant = plane != 0 ? qmat_chroma[q][first] : qmat[q][first];
+        for (int idx = first; idx < coeff_end; idx += 64) {
+            int coeff = slices[slice].coeffs[plane][idx];
+            error += abs(coeff) % quant;
+            if (coeff / quant == 0) {
+                zeros++;
+                continue;
+            }
+            /* Cost of the zero run, of magnitude - 1 and of one sign bit */
+            int magnitude = abs(coeff / quant);
+            bits += estimate_vlc(run_to_cb[last_run], zeros);
+            bits += estimate_vlc(level_to_cb[last_level], magnitude - 1) + 1;
+            last_run = min(zeros, 15);
+            last_level = min(magnitude, 9);
+            zeros = 0;
+        }
+    }
+
+    return bits;
+}
+
+int estimate_slice_plane(inout int error, uint slice, uint plane, uint q)
+{
+    /* DC plus AC bits of one plane, rounded up to whole bytes; both passes add to error */
+    int dc_bits = estimate_dcs(error, slice, plane, q);
+    int ac_bits = estimate_acs(error, slice, plane, q);
+    return FFALIGN(dc_bits + ac_bits, 8);
+}
+
+int est_alpha_diff(int cur, int prev)
+{
+    const int dbits = (ALPHA_BITS == 8) ? 4 : 7;
+    const int dsize = 1 << (dbits - 1);
+
+    /* Wrap the difference into the signed range of an ALPHA_BITS-wide sample */
+    int diff = av_zero_extend(cur - prev, ALPHA_BITS);
+    if (diff >= (1 << ALPHA_BITS) - dsize)
+        diff -= 1 << ALPHA_BITS;
+
+    /* Small non-zero differences take the short form, all others a raw sample */
+    bool short_form = diff != 0 && diff >= -dsize && diff <= dsize;
+    return (short_form ? dbits : ALPHA_BITS) + 1;
+}
+
+int estimate_alpha_plane(uint slice)
+{
+    /*
+     * Returns the size in bits of the alpha plane of one slice, rounded up to
+     * whole bytes. The encode shader derives the slice size and its seek table
+     * entry from this value, so it has to match what encode_alpha_plane() writes:
+     *   - every run costs one flag bit, plus 4 or 15 length bits when non-zero,
+     *   - the trailing run is always written, even when its length is zero,
+     *   - the plane is flushed at the end; NOTE(review): assumed to pad to a byte.
+     */
+    const int mask  = (1 << ALPHA_BITS) - 1;
+    const int num_coeffs = int(slices[slice].mbs_per_slice) * 256;
+    int prev = mask, cur;
+    int idx = 0;
+    int run = 0;
+    int bits;
+
+    /* The first sample is coded as a difference against the all-ones value */
+    cur = slices[slice].coeffs[3][idx++];
+    bits = est_alpha_diff(cur, prev);
+    prev = cur;
+    do {
+        cur = slices[slice].coeffs[3][idx++];
+        if (cur != prev) {
+            /* Cost of put_alpha_run(): flag bit plus optional run length */
+            bits += run == 0 ? 1 : (run < 0x10 ? 5 : 16);
+            bits += est_alpha_diff(cur, prev);
+            prev = cur;
+            run  = 0;
+        } else {
+            run++;
+        }
+    } while (idx < num_coeffs);
+
+    /* Trailing run, then byte alignment like the colour planes */
+    return FFALIGN(bits + (run == 0 ? 1 : (run < 0x10 ? 5 : 16)), 8);
+}
+
+int sum_of_planes(int value) /* adds value across a group of NUM_PLANES lanes; NOTE(review): assumes a slice's planes sit in consecutive, group-aligned lanes */
+{
+#if NUM_PLANES == 3
+    uint base = (gl_SubgroupInvocationID / 3) * 3; /* first lane of this lane's group of three */
+    return subgroupShuffle(value, base) + subgroupShuffle(value, base + 1) + subgroupShuffle(value, base + 2);
+#else
+    return subgroupClusteredAdd(value, 4); /* sum within each cluster of four lanes; presumably NUM_PLANES == 4 here */
+#endif
+}
+
+void main()
+{
+    uint slice = gl_GlobalInvocationID.x / NUM_PLANES; /* x enumerates (slice, plane) pairs, y the quantisers */
+    uint plane = gl_LocalInvocationID.x % NUM_PLANES; /* NOTE(review): local ID here, global ID above; they agree only if the workgroup width is a multiple of NUM_PLANES */
+    uint q = min_quant + gl_GlobalInvocationID.y;
+    if (slice >= slices_per_picture)
+        return;
+
+    /* Estimate slice bits and error for specified quantizer and plane */
+    int error = 0;
+    int bits = 0;
+    if (plane == 3)
+        bits = estimate_alpha_plane(slice); /* alpha cost does not depend on q and adds no error */
+    else
+        bits = estimate_slice_plane(error, slice, plane, q);
+
+    /* Write results to score buffer */
+    scores[slice].bits[q][plane] = bits;
+    scores[slice].score[q][plane] = error;
+
+    /* Accumulate total bits and error of all planes */
+    int total_bits = sum_of_planes(bits);
+    int total_score = sum_of_planes(error);
+    if (total_bits > 65000 * 8)
+        total_score = SCORE_LIMIT; /* slices above 65000 bytes get the limit score */
+    scores[slice].total_bits[q] = total_bits; /* every plane invocation of the slice stores the same totals */
+    scores[slice].total_score[q] = total_score;
+
+    if (q != max_quant)
+        return;
+
+    /* Task threads that computed max_quant to also compute overquant if necessary */
+    uint mbs_per_slice = slices[slice].mbs_per_slice;
+    if (total_bits <= bits_per_mb * mbs_per_slice)
+    {
+        /* Overquant isn't needed for this slice */
+        scores[slice].total_bits[max_quant + 1] = total_bits;
+        scores[slice].total_score[max_quant + 1] = total_score + 1; /* one worse than max_quant itself */
+        scores[slice].overquant = max_quant;
+    }
+    else
+    {
+        /* Keep searching until an encoding fits our budget */
+        for (q = max_quant + 1; q < 128; ++q)
+        {
+            /* Estimate slice bits and error for specified quantizer and plane */
+            error = 0;
+            bits = 0;
+            if (plane == 3)
+                bits = estimate_alpha_plane(slice);
+            else
+                bits = estimate_slice_plane(error, slice, plane, q);
+
+            /* Accumulate total bits and error of all planes */
+            total_bits = sum_of_planes(bits); /* subgroup operation inside a loop that different slices may leave at different times */
+            total_score = sum_of_planes(error);
+
+            /* If estimated bits fit within budget, we are done */
+            if (total_bits <= bits_per_mb * mbs_per_slice)
+                break;
+        }
+
+        /* Results of the last quantiser tried go to slot max_quant + 1; q is 128 if nothing fitted */
+        scores[slice].bits[max_quant + 1][plane] = bits;
+        scores[slice].score[max_quant + 1][plane] = error;
+        scores[slice].total_bits[max_quant + 1] = total_bits;
+        scores[slice].total_score[max_quant + 1] = total_score;
+        scores[slice].overquant = q;
+    }
+}
\ No newline at end of file
diff --git a/libavcodec/vulkan/prores_ks_slice_data.comp b/libavcodec/vulkan/prores_ks_slice_data.comp
new file mode 100644
index 0000000000..6a943532c5
--- /dev/null
+++ b/libavcodec/vulkan/prores_ks_slice_data.comp
@@ -0,0 +1,265 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#extension GL_EXT_samplerless_texture_functions : require
+
+layout(push_constant, scalar) uniform SliceDataInfo {
+    int plane;              /* index into planes[] of the plane to transform; 1 and 2 are treated as chroma */
+    int pictures_per_frame; /* factor applied to the row coordinate before fetching */
+    int line_add;           /* row offset added after scaling by pictures_per_frame */
+};
+
+shared i16vec4 coeffs[MAX_MBS_PER_SLICE][BLOCKS_PER_MB][DCTSIZE][DCTSIZE / 4]; /* workgroup DCT scratch: [macroblock][block][row][group of four coefficients] */
+
+#define CONST_BITS  13  /* fractional bits of the FIX_* constants (value * 2^13, rounded) */
+#define PASS1_BITS  1   /* lose a little precision to avoid overflow */
+#define OUT_SHIFT   (PASS1_BITS + 1) /* right shift applied by the column pass */
+
+#define FIX_0_541196100 4433  /* FIX(0.541196100) */
+#define FIX_0_765366865 6270 /* FIX(0.765366865) */
+#define FIX_1_847759065 15137 /* FIX(1.847759065) */
+#define FIX_1_175875602 9633 /* FIX(1.175875602) */
+#define FIX_0_298631336 2446 /* FIX(0.298631336) */
+#define FIX_3_072711026 25172 /* FIX(3.072711026) */
+#define FIX_1_501321110 12299 /* FIX(1.501321110) */
+#define FIX_0_899976223 7373 /* FIX(0.899976223) */
+#define FIX_1_961570560 16069 /* FIX(1.961570560) */
+#define FIX_2_053119869 16819 /* FIX(2.053119869) */
+#define FIX_2_562915447 20995 /* FIX(2.562915447) */
+#define FIX_0_390180644 3196 /* FIX(0.390180644) */
+
+#define MULTIPLY(type, var, cons) type(uint32_t(var) * uint32_t(cons)) /* multiply as unsigned 32-bit, then convert to type */
+#define RIGHT_SHIFT(x, n) ((x) >> (n))
+#define DESCALE(x,n)  RIGHT_SHIFT(int32_t(x) + (1 << ((n) - 1)), n) /* signed right shift by n with rounding */
+
+void row_fdct(i32vec4 data_lo, i32vec4 data_hi)
+{
+    uint row_idx = gl_LocalInvocationID.x;
+    uint block = gl_LocalInvocationID.y;
+    uint mb = gl_LocalInvocationID.z;
+
+    /* Pass 1: process rows. Each invocation transforms one 8-sample row */
+    /* (data_lo = samples 0..3, data_hi = samples 4..7). Results are scaled up by */
+    /* sqrt(8) compared to a true DCT and additionally by 2**PASS1_BITS. */
+    int32_t tmp0 = data_lo.x + data_hi.w;
+    int32_t tmp7 = data_lo.x - data_hi.w;
+    int32_t tmp1 = data_lo.y + data_hi.z;
+    int32_t tmp6 = data_lo.y - data_hi.z;
+    int32_t tmp2 = data_lo.z + data_hi.y;
+    int32_t tmp5 = data_lo.z - data_hi.y;
+    int32_t tmp3 = data_lo.w + data_hi.x;
+    int32_t tmp4 = data_lo.w - data_hi.x;
+
+    /* Even part per LL&M figure 1 --- note that published figure is faulty;
+     * rotator "sqrt(2)*c1" should be "sqrt(2)*c6".
+     */
+    int32_t tmp10 = tmp0 + tmp3;
+    int32_t tmp13 = tmp0 - tmp3;
+    int32_t tmp11 = tmp1 + tmp2;
+    int32_t tmp12 = tmp1 - tmp2;
+
+    data_lo.x = (tmp10 + tmp11) * (1 << PASS1_BITS); /* output coefficient 0 */
+    data_hi.x = (tmp10 - tmp11) * (1 << PASS1_BITS); /* output coefficient 4 */
+
+    uint32_t z1 = MULTIPLY(uint32_t, tmp12 + tmp13, FIX_0_541196100);
+    data_lo.z = DESCALE(z1 + MULTIPLY(uint32_t, tmp13, FIX_0_765366865), CONST_BITS-PASS1_BITS);
+    data_hi.z = DESCALE(z1 + MULTIPLY(uint32_t, tmp12, -FIX_1_847759065), CONST_BITS-PASS1_BITS);
+
+    /* Odd part per figure 8 --- note paper omits factor of sqrt(2).
+     * cK represents cos(K*pi/16).
+     * i0..i3 in the paper are tmp4..tmp7 here.
+     */
+    z1 = tmp4 + tmp7;
+    uint32_t z2 = tmp5 + tmp6;
+    uint32_t z3 = tmp4 + tmp6;
+    uint32_t z4 = tmp5 + tmp7;
+    uint32_t z5 = (z3 + z4) * FIX_1_175875602; /* sqrt(2) * c3 */
+
+    tmp4 = MULTIPLY(int32_t, tmp4, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */
+    tmp5 = MULTIPLY(int32_t, tmp5, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */
+    tmp6 = MULTIPLY(int32_t, tmp6, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */
+    tmp7 = MULTIPLY(int32_t, tmp7, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */
+    z1 = MULTIPLY(uint32_t, z1, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */
+    z2 = MULTIPLY(uint32_t, z2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
+    z3 = MULTIPLY(uint32_t, z3, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
+    z4 = MULTIPLY(uint32_t, z4, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */
+
+    z3 += z5;
+    z4 += z5;
+
+    data_hi.w = DESCALE(uint32_t(tmp4) + z1 + z3, CONST_BITS - PASS1_BITS);
+    data_hi.y = DESCALE(uint32_t(tmp5) + z2 + z4, CONST_BITS - PASS1_BITS);
+    data_lo.w = DESCALE(uint32_t(tmp6) + z2 + z3, CONST_BITS - PASS1_BITS);
+    data_lo.y = DESCALE(uint32_t(tmp7) + z1 + z4, CONST_BITS - PASS1_BITS);
+
+    coeffs[mb][block][row_idx][0] = i16vec4(data_lo); /* publish the row in shared memory, narrowed to 16 bits */
+    coeffs[mb][block][row_idx][1] = i16vec4(data_hi); /* other invocations must not read it before a barrier() */
+}
+
+void ff_jpeg_fdct_islow_10()
+{
+    uint col_half = gl_LocalInvocationID.x / 4; /* which i16vec4 of each row holds this invocation's column */
+    uint col = gl_LocalInvocationID.x & 3u;     /* component inside that i16vec4 */
+    uint block = gl_LocalInvocationID.y;
+    uint mb = gl_LocalInvocationID.z;
+
+    /* Gather one column: element r comes from row r, which a different invocation wrote */
+    i16vec4 col_lo = i16vec4(coeffs[mb][block][0][col_half][col],
+                             coeffs[mb][block][1][col_half][col],
+                             coeffs[mb][block][2][col_half][col],
+                             coeffs[mb][block][3][col_half][col]);
+    i16vec4 col_hi = i16vec4(coeffs[mb][block][4][col_half][col],
+                             coeffs[mb][block][5][col_half][col],
+                             coeffs[mb][block][6][col_half][col],
+                             coeffs[mb][block][7][col_half][col]);
+    i32vec4 data_lo = i32vec4(col_lo);
+    i32vec4 data_hi = i32vec4(col_hi);
+
+    /* Pass 2: process columns.
+     * We remove the PASS1_BITS scaling, but leave the results scaled up
+     * by an overall factor of 8.
+     */
+    int32_t tmp0 = data_lo.x + data_hi.w;
+    int32_t tmp7 = data_lo.x - data_hi.w;
+    int32_t tmp1 = data_lo.y + data_hi.z;
+    int32_t tmp6 = data_lo.y - data_hi.z;
+    int32_t tmp2 = data_lo.z + data_hi.y;
+    int32_t tmp5 = data_lo.z - data_hi.y;
+    int32_t tmp3 = data_lo.w + data_hi.x;
+    int32_t tmp4 = data_lo.w - data_hi.x;
+
+    /* Even part per LL&M figure 1 --- note that published figure is faulty;
+     * rotator "sqrt(2)*c1" should be "sqrt(2)*c6".
+     */
+    int32_t tmp10 = tmp0 + tmp3;
+    int32_t tmp13 = tmp0 - tmp3;
+    int32_t tmp11 = tmp1 + tmp2;
+    int32_t tmp12 = tmp1 - tmp2;
+
+    data_lo.x = DESCALE(tmp10 + tmp11, OUT_SHIFT);
+    data_hi.x = DESCALE(tmp10 - tmp11, OUT_SHIFT);
+
+    uint32_t z1 = uint32_t((tmp12 + tmp13) * FIX_0_541196100);
+    data_lo.z = DESCALE(z1 + uint32_t(tmp13 * FIX_0_765366865), CONST_BITS + OUT_SHIFT);
+    data_hi.z = DESCALE(z1 + uint32_t(tmp12 * (-FIX_1_847759065)), CONST_BITS + OUT_SHIFT);
+
+    /* Odd part per figure 8 --- note paper omits factor of sqrt(2).
+     * cK represents cos(K*pi/16).
+     * i0..i3 in the paper are tmp4..tmp7 here.
+     */
+    z1 = tmp4 + tmp7;
+    uint32_t z2 = tmp5 + tmp6;
+    uint32_t z3 = tmp4 + tmp6;
+    uint32_t z4 = tmp5 + tmp7;
+    uint32_t z5 = MULTIPLY(uint32_t, z3 + z4, FIX_1_175875602); /* sqrt(2) * c3 */
+
+    tmp4 = MULTIPLY(int32_t, tmp4, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */
+    tmp5 = MULTIPLY(int32_t, tmp5, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */
+    tmp6 = MULTIPLY(int32_t, tmp6, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */
+    tmp7 = MULTIPLY(int32_t, tmp7, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */
+    z1 = MULTIPLY(uint32_t, z1, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */
+    z2 = MULTIPLY(uint32_t, z2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
+    z3 = MULTIPLY(uint32_t, z3, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
+    z4 = MULTIPLY(uint32_t, z4, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */
+
+    z3 += z5;
+    z4 += z5;
+
+    data_hi.w = DESCALE(tmp4 + z1 + z3, CONST_BITS + OUT_SHIFT);
+    data_hi.y = DESCALE(tmp5 + z2 + z4, CONST_BITS + OUT_SHIFT);
+    data_lo.w = DESCALE(tmp6 + z2 + z3, CONST_BITS + OUT_SHIFT);
+    data_lo.y = DESCALE(tmp7 + z1 + z4, CONST_BITS + OUT_SHIFT);
+
+    col_lo = i16vec4(data_lo);
+    col_hi = i16vec4(data_hi);
+    coeffs[mb][block][0][col_half][col] = col_lo.x; /* scatter the column back, one element into each row */
+    coeffs[mb][block][1][col_half][col] = col_lo.y;
+    coeffs[mb][block][2][col_half][col] = col_lo.z;
+    coeffs[mb][block][3][col_half][col] = col_lo.w;
+    coeffs[mb][block][4][col_half][col] = col_hi.x;
+    coeffs[mb][block][5][col_half][col] = col_hi.y;
+    coeffs[mb][block][6][col_half][col] = col_hi.z;
+    coeffs[mb][block][7][col_half][col] = col_hi.w;
+}
+
+/* Edge slice layouts: entry [n] is used when n macroblocks are left over after the full-width slices; x, y, z = macroblocks in each of up to three edge slices */
+const uvec3 edge_mps_table[8] = uvec3[](
+    uvec3(0, 0, 0),
+    uvec3(1, 0, 0),
+    uvec3(2, 0, 0),
+    uvec3(2, 1, 0),
+    uvec3(4, 0, 0),
+    uvec3(4, 1, 0),
+    uvec3(4, 2, 0),
+    uvec3(4, 2, 1)
+);
+
+void main()
+{
+    /*
+     * Each invocation handles one 8-sample row of one DCT block: it loads the
+     * row (replicating the plane edge for padding samples), takes part in the
+     * row and column passes through the shared coeffs array and finally
+     * stores its transformed row into the slice buffer.
+     */
+    bool is_chroma = plane == 1 || plane == 2;
+    uint row_idx = gl_LocalInvocationID.x;
+    uint block = gl_LocalInvocationID.y;
+    uint macroblock = gl_LocalInvocationID.z;
+    uint slice_x = gl_WorkGroupID.x;
+
+    /* Calculate the current thread coordinate in input plane */
+    uint mbs_per_slice = MAX_MBS_PER_SLICE;
+    uint mb_width = 4u * BLOCKS_PER_MB;
+    uint slices_width = WIDTH_IN_MB / MAX_MBS_PER_SLICE;
+    uvec2 slice_base = gl_WorkGroupID.xy * uvec2(MAX_MBS_PER_SLICE * mb_width, DCTSIZE * 2u);
+
+    /* Handle slice macroblock size reduction on edge slices */
+    if (slice_x >= slices_width)
+    {
+        uint edge_slice = slice_x - slices_width;
+        uvec3 table = edge_mps_table[WIDTH_IN_MB - slices_width * MAX_MBS_PER_SLICE];
+        uvec3 base = uvec3(0u, table.x, table.x + table.y);
+        slice_base.x = (MAX_MBS_PER_SLICE * slices_width + base[edge_slice]) * mb_width;
+        mbs_per_slice = table[edge_slice];
+    }
+
+    uvec2 mb_base = slice_base + uvec2(macroblock * mb_width, 0u);
+    uvec2 block_coord = is_chroma ? uvec2(block >> 1u, block & 1u) : uvec2(block & 1u, block >> 1u);
+    ivec2 coord = ivec2(mb_base + block_coord * DCTSIZE + uvec2(0u, row_idx));
+    ivec2 last = textureSize(planes[plane], 0) - ivec2(1);
+    coord.y = min(coord.y * pictures_per_frame + line_add, last.y);
+
+    /* Load the row, clamping every fetch so that no texel outside the plane is read */
+    i32vec4 row_lo, row_hi;
+    for (int i = 0; i < 4; i++) {
+        row_lo[i] = texelFetch(planes[plane], ivec2(min(coord.x + i, last.x), coord.y), 0).x;
+        row_hi[i] = texelFetch(planes[plane], ivec2(min(coord.x + i + 4, last.x), coord.y), 0).x;
+    }
+
+    /* Row pass, column pass; each barrier makes shared writes visible before other invocations read them */
+    row_fdct(row_lo, row_hi);
+    barrier();
+    ff_jpeg_fdct_islow_10();
+    barrier();
+
+    /* Store DCT result to slice buffer */
+    uint slice = gl_WorkGroupID.y * gl_NumWorkGroups.x + gl_WorkGroupID.x;
+    uint slice_row = macroblock * BLOCKS_PER_MB * DCTSIZE + block * DCTSIZE + row_idx;
+    slices[slice].mbs_per_slice = mbs_per_slice;
+    slices[slice].rows[plane][slice_row] = coeffs[macroblock][block][row_idx];
+}
\ No newline at end of file
diff --git a/libavcodec/vulkan/prores_ks_trellis_node.comp b/libavcodec/vulkan/prores_ks_trellis_node.comp
new file mode 100644
index 0000000000..052e47ac5f
--- /dev/null
+++ b/libavcodec/vulkan/prores_ks_trellis_node.comp
@@ -0,0 +1,177 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#extension GL_KHR_shader_subgroup_arithmetic : require
+
+layout(push_constant, scalar) uniform TrellisNodeInfo { /* parameters read by the trellis search below */
+    int min_quant; /* first quantiser index visited by the trellis loops */
+    int max_quant; /* last regular quantiser index; slot max_quant + 1 carries the per-slice overquant */
+    int mbs_per_slice; /* multiplied by (slice_x + 1) to get the macroblock count used for the bit limit */
+    int bits_per_mb; /* multiplied by that macroblock count to get bits_limit */
+};
+
+#define TRELLIS_WIDTH 16 /* number of node slots per trellis column in nodes[] */
+#define SCORE_LIMIT   1073741823 /* 2^30 - 1: score of an over-limit path; two scores below it add without overflowing a 32-bit int */
+
+struct TrellisNode { /* one (trellis column, quantiser slot) state of the search */
+    int prev_node; /* nodes[] index of the chosen predecessor; -1 until a predecessor is recorded */
+    int quant; /* quantiser stored for this slot (the overquant value for slot max_quant + 1) */
+    int bits; /* predecessor's bits plus this slice's total_bits for the slot */
+    int score; /* predecessor's score plus this slice's score, or SCORE_LIMIT when either is at the limit */
+};
+
+shared int subgroup_sizes[NUM_SUBGROUPS]; /* per-subgroup sum of slice_row_size, written in main() before the barrier */
+
+int slice_sizes[SLICES_WIDTH]; /* total_bits / 8 of each slice in the row handled by this invocation */
+
+TrellisNode nodes[(SLICES_WIDTH + 1) * TRELLIS_WIDTH]; /* column 0 is the zeroed start state; column x + 1 is filled for slice x */
+
+/*
+ * Extend the trellis by one column for slice slice_x of this invocation's
+ * slice row.
+ *
+ * Column slice_x + 1 of nodes[] is rebuilt from column slice_x using the
+ * per-quantiser bit counts and scores stored in scores[].
+ *
+ * Returns the nodes[] index of the lowest-score node in the new column.
+ */
+int find_slice_quant(int slice_x)
+{
+    int q_end     = max_quant + 2;
+    int slice_idx = int(gl_LocalInvocationID.x) * SLICES_WIDTH + slice_x;
+    int col       = (slice_x + 1) * TRELLIS_WIDTH;
+    int prev_col  = col - TRELLIS_WIDTH;
+    int budget    = (slice_x + 1) * mbs_per_slice * bits_per_mb;
+
+    /* Reset the column; the extra slot past max_quant carries the overquant. */
+    for (int q = min_quant; q < q_end; q++) {
+        nodes[col + q].prev_node = -1;
+        nodes[col + q].quant     = q;
+    }
+    nodes[col + max_quant + 1].quant = scores[slice_idx].overquant;
+
+    /* Try every (previous slot, current slot) transition. */
+    for (int pq = min_quant; pq < q_end; pq++) {
+        int from = prev_col + pq;
+        for (int q = min_quant; q < q_end; q++) {
+            int to        = col + q;
+            int path_bits = nodes[from].bits + scores[slice_idx].total_bits[q];
+            int cost      = path_bits > budget ? SCORE_LIMIT : scores[slice_idx].total_score[q];
+            int total     = SCORE_LIMIT;
+            if (nodes[from].score < SCORE_LIMIT && cost < SCORE_LIMIT)
+                total = nodes[from].score + cost;
+
+            /* ">=" lets a later predecessor replace an equally scored one. */
+            if (nodes[to].prev_node == -1 || nodes[to].score >= total) {
+                nodes[to].bits      = path_bits;
+                nodes[to].score     = total;
+                nodes[to].prev_node = from;
+            }
+        }
+    }
+
+    /* Pick the best node of the column; "<=" keeps the highest slot on ties. */
+    int best = col + min_quant;
+    for (int q = min_quant + 1; q < q_end; q++) {
+        if (nodes[col + q].score <= nodes[best].score)
+            best = col + q;
+    }
+
+    return best;
+}
+
+/* Run the trellis over this invocation's slice row; store each slice's quantiser
+ * in scores[] and its total_bits / 8 in slice_sizes[]. Returns the row size:
+ * 2 * NUM_PLANES header units per slice plus the sum of slice_sizes[]. */
+int find_slice_row_quants()
+{
+    /* Column 0 is the start state: no predecessor, no bits, no score. */
+    for (int q = min_quant; q < max_quant + 2; q++) {
+        nodes[q].prev_node = -1;
+        nodes[q].bits      = 0;
+        nodes[q].score     = 0;
+    }
+
+    /* Forward pass; keeps the best node of the last column. */
+    int node = 0;
+    for (int x = 0; x < SLICES_WIDTH; ++x)
+        node = find_slice_quant(x);
+
+    /* Backtrack from the last slice to the first along prev_node links. */
+    int row      = int(gl_LocalInvocationID.x);
+    int row_size = (2 * NUM_PLANES) * SLICES_WIDTH;
+    for (int x = SLICES_WIDTH - 1; x >= 0; x--) {
+        int slice_idx = row * SLICES_WIDTH + x;
+        int quant     = nodes[node].quant;
+        /* The stored quantiser is clamped so it indexes at most slot max_quant + 1. */
+        slice_sizes[x] = scores[slice_idx].total_bits[min(quant, max_quant + 1)] / 8;
+        row_size      += slice_sizes[x];
+        scores[slice_idx].quant = quant;
+        node = nodes[node].prev_node;
+    }
+    return row_size;
+}
+
+/* Fixed-quantiser path: assign FORCE_QUANT to every slice of this invocation's
+ * row, fill slice_sizes[] and return the row size (headers plus slice sizes). */
+int force_slice_row_quants()
+{
+    int row      = int(gl_LocalInvocationID.x);
+    int row_size = (2 * NUM_PLANES) * SLICES_WIDTH;
+    for (int x = SLICES_WIDTH - 1; x >= 0; x--) {
+        int slice_idx  = row * SLICES_WIDTH + x;
+        slice_sizes[x] = scores[slice_idx].total_bits[FORCE_QUANT] / 8;
+        row_size      += slice_sizes[x];
+        scores[slice_idx].quant = FORCE_QUANT;
+    }
+
+    return row_size;
+}
+
+/* Each invocation sizes one slice row; the row sizes are then accumulated
+ * across subgroups to give every slice its buf_start and the frame size. */
+void main()
+{
+#if FORCE_QUANT == 0
+    int row_size = find_slice_row_quants();
+#else
+    int row_size = force_slice_row_quants();
+#endif
+
+    /* Publish each subgroup's total so higher-numbered subgroups can add it. */
+    subgroup_sizes[gl_SubgroupID] = subgroupAdd(row_size);
+    barrier();
+
+    /* Offset inside the subgroup plus the totals of all lower-numbered subgroups. */
+    int offset = subgroupExclusiveAdd(row_size);
+    [[unroll]] for (int i = 0; i < NUM_SUBGROUPS; ++i) {
+        if (i >= gl_SubgroupID)
+            break;
+        offset += subgroup_sizes[i];
+    }
+
+    int row = int(gl_LocalInvocationID.x);
+    [[unroll]] for (int x = 0; x < SLICES_WIDTH; ++x) {
+        scores[row * SLICES_WIDTH + x].buf_start = offset;
+        offset += (2 * NUM_PLANES) + slice_sizes[x];
+    }
+
+    /* The invocation with the highest local index stores its final offset. */
+    if (row == gl_WorkGroupSize.x - 1)
+        frame_size = offset;
+}
\ No newline at end of file
-- 
2.49.1

_______________________________________________
ffmpeg-devel mailing list -- ffmpeg-devel@ffmpeg.org
To unsubscribe send an email to ffmpeg-devel-leave@ffmpeg.org

^ permalink raw reply	[flat|nested] only message in thread

only message in thread, other threads:[~2025-09-09 13:00 UTC | newest]

Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2025-09-09 12:59 [FFmpeg-devel] [PATCH] [GSoC 2025] Add vulkan compute based prores encoder (PR #20477) indecisive_turtle via ffmpeg-devel

Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

This inbox may be cloned and mirrored by anyone:

	git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
		ffmpegdev@gitmailbox.com
	public-inbox-index ffmpegdev

Example config snippet for mirrors.


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git