[FFmpeg-devel] [PATCH 7/7] lavc: add a ProRes RAW Vulkan hwaccel

From: Lynne <dev@lynne.ee>
To: ffmpeg-devel@ffmpeg.org
Cc: Lynne <dev@lynne.ee>
Subject: [FFmpeg-devel] [PATCH 7/7] lavc: add a ProRes RAW Vulkan hwaccel
Date: Fri, 11 Jul 2025 00:13:35 +0900
Message-ID: <20250710151349.1157547-7-dev@lynne.ee> (raw)
In-Reply-To: <20250710151349.1157547-1-dev@lynne.ee>

This commit adds a ProRes RAW hardware implementation written in Vulkan.
Both version 0 and version 1 streams are supported.
The implementation is highly parallelized, with 512 invocations dispatched
for each tile, and generally around 4k tiles in a 5.8k stream.

Thanks to unlord for the 8-point iDCT.

Benchmark for a generic 5.8k RAW HQ file:
6900XT: 63fps
7900XTX: 84fps
6000 Ada: 120fps
Intel: 9fps
---
 configure                         |   2 +
 libavcodec/Makefile               |   1 +
 libavcodec/hwaccels.h             |   1 +
 libavcodec/proresdec_raw.c        |  10 +
 libavcodec/vulkan/Makefile        |   3 +
 libavcodec/vulkan/prores_raw.comp | 348 +++++++++++++++++++++
 libavcodec/vulkan_decode.c        |   9 +-
 libavcodec/vulkan_prores_raw.c    | 498 ++++++++++++++++++++++++++++++
 8 files changed, 871 insertions(+), 1 deletion(-)
 create mode 100644 libavcodec/vulkan/prores_raw.comp
 create mode 100644 libavcodec/vulkan_prores_raw.c

diff --git a/configure b/configure
index e0503337cd..8c34dc7cf1 100755
--- a/configure
+++ b/configure
@@ -3290,6 +3290,8 @@ mpeg4_videotoolbox_hwaccel_deps="videotoolbox"
 mpeg4_videotoolbox_hwaccel_select="mpeg4_decoder"
 prores_videotoolbox_hwaccel_deps="videotoolbox"
 prores_videotoolbox_hwaccel_select="prores_decoder"
+prores_raw_vulkan_hwaccel_deps="vulkan spirv_compiler"
+prores_raw_vulkan_hwaccel_select="prores_raw_decoder"
 vc1_d3d11va_hwaccel_deps="d3d11va"
 vc1_d3d11va_hwaccel_select="vc1_decoder"
 vc1_d3d11va2_hwaccel_deps="d3d11va"
diff --git a/libavcodec/Makefile b/libavcodec/Makefile
index 47d16f3312..23721de65f 100644
--- a/libavcodec/Makefile
+++ b/libavcodec/Makefile
@@ -1080,6 +1080,7 @@ OBJS-$(CONFIG_VP9_VDPAU_HWACCEL)          += vdpau_vp9.o
 OBJS-$(CONFIG_VP9_VIDEOTOOLBOX_HWACCEL)   += videotoolbox_vp9.o
 OBJS-$(CONFIG_VP8_QSV_HWACCEL)            += qsvdec.o
 OBJS-$(CONFIG_VVC_VAAPI_HWACCEL)          += vaapi_vvc.o
+OBJS-$(CONFIG_PRORES_RAW_VULKAN_HWACCEL)  += vulkan_decode.o vulkan_prores_raw.o
 
 # Objects duplicated from other libraries for shared builds
 SHLIBOBJS                              += log2_tab.o reverse.o
diff --git a/libavcodec/hwaccels.h b/libavcodec/hwaccels.h
index 0b2c725247..fb9b850233 100644
--- a/libavcodec/hwaccels.h
+++ b/libavcodec/hwaccels.h
@@ -67,6 +67,7 @@ extern const struct FFHWAccel ff_mpeg4_vaapi_hwaccel;
 extern const struct FFHWAccel ff_mpeg4_vdpau_hwaccel;
 extern const struct FFHWAccel ff_mpeg4_videotoolbox_hwaccel;
 extern const struct FFHWAccel ff_prores_videotoolbox_hwaccel;
+extern const struct FFHWAccel ff_prores_raw_vulkan_hwaccel;
 extern const struct FFHWAccel ff_vc1_d3d11va_hwaccel;
 extern const struct FFHWAccel ff_vc1_d3d11va2_hwaccel;
 extern const struct FFHWAccel ff_vc1_d3d12va_hwaccel;
diff --git a/libavcodec/proresdec_raw.c b/libavcodec/proresdec_raw.c
index caebed9e96..6a0773c8a5 100644
--- a/libavcodec/proresdec_raw.c
+++ b/libavcodec/proresdec_raw.c
@@ -20,6 +20,7 @@
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
+#include "config_components.h"
 #include "libavutil/intreadwrite.h"
 #include "libavutil/mem_internal.h"
 #include "libavutil/mem.h"
@@ -30,10 +31,13 @@
 #include "bytestream.h"
 #include "codec_internal.h"
 #include "decode.h"
+#include "hwconfig.h"
 #include "get_bits.h"
 #include "idctdsp.h"
 #include "proresdata.h"
 #include "thread.h"
+#include "hwconfig.h"
+#include "hwaccel_internal.h"
 
 #include "proresdec_raw.h"
 
@@ -312,6 +316,9 @@ static enum AVPixelFormat get_pixel_format(AVCodecContext *avctx,
                                            enum AVPixelFormat pix_fmt)
 {
     enum AVPixelFormat pix_fmts[] = {
+#if CONFIG_PRORES_RAW_VULKAN_HWACCEL
+        AV_PIX_FMT_VULKAN,
+#endif
         pix_fmt,
         AV_PIX_FMT_NONE,
     };
@@ -514,6 +521,9 @@ const FFCodec ff_prores_raw_decoder = {
                         AV_CODEC_CAP_SLICE_THREADS,
     .caps_internal  = FF_CODEC_CAP_INIT_CLEANUP,
     .hw_configs     = (const AVCodecHWConfigInternal *const []) {
+#if CONFIG_PRORES_RAW_VULKAN_HWACCEL
+        HWACCEL_VULKAN(prores_raw),
+#endif
         NULL
     },
 };
diff --git a/libavcodec/vulkan/Makefile b/libavcodec/vulkan/Makefile
index 729cb4f15c..d8e1471fa6 100644
--- a/libavcodec/vulkan/Makefile
+++ b/libavcodec/vulkan/Makefile
@@ -14,6 +14,9 @@ OBJS-$(CONFIG_FFV1_VULKAN_HWACCEL)  +=  vulkan/common.o \
 					vulkan/ffv1_common.o vulkan/ffv1_reset.o \
 					vulkan/ffv1_dec_setup.o vulkan/ffv1_dec.o
 
+OBJS-$(CONFIG_PRORES_RAW_VULKAN_HWACCEL) += vulkan/common.o \
+                                            vulkan/prores_raw.o
+
 VULKAN = $(subst $(SRC_PATH)/,,$(wildcard $(SRC_PATH)/libavcodec/vulkan/*.comp))
 .SECONDARY: $(VULKAN:.comp=.c)
 libavcodec/vulkan/%.c: TAG = VULKAN
diff --git a/libavcodec/vulkan/prores_raw.comp b/libavcodec/vulkan/prores_raw.comp
new file mode 100644
index 0000000000..fe0606e0b8
--- /dev/null
+++ b/libavcodec/vulkan/prores_raw.comp
@@ -0,0 +1,348 @@
+/*
+ * ProRes RAW decoder
+ *
+ * Copyright (c) 2025 Lynne <dev@lynne.ee>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#define I16(x) (int16_t(x)) /* Shorthand for an explicit conversion to int16_t */
+/* Roles of the three local invocation dimensions inside a workgroup */
+#define COMP_ID (gl_LocalInvocationID.z) /* First index into block[]/transposed[] */
+#define BLOCK_ID (gl_LocalInvocationID.y) /* Second index: which 8x8 block */
+#define ROW_ID (gl_LocalInvocationID.x) /* Row passed to the iDCT when PARALLEL_ROWS is defined */
+/* Bit reader state; main() only initializes it in the invocation that parses the bitstream */
+GetBitContext gb;
+shared float block[gl_WorkGroupSize.z][16][64] = { }; /* Coefficients in, samples out; null-initialized */
+shared float transposed[gl_WorkGroupSize.z][16][64]; /* Intermediate result between the two iDCT passes */
+/* First iDCT pass: transforms the 8 values of row `row_id` in block[COMP_ID][BLOCK_ID] and stores them transposed into `transposed`. */
+void idct8_horiz(const uint row_id)
+{
+    float t0, t1, t2, t3, t4, t5, t6, t7, u8;
+    float u0, u1, u2, u3, u4, u5, u6, u7;
+
+    /* Input: even-indexed coefficients feed the t0..t3 path, odd-indexed ones the u4..u7 path.
+     * main() has already multiplied every coefficient by its idct_8x8_scales entry. */
+    t0 = block[COMP_ID][BLOCK_ID][8*row_id + 0];
+    u4 = block[COMP_ID][BLOCK_ID][8*row_id + 1];
+    t2 = block[COMP_ID][BLOCK_ID][8*row_id + 2];
+    u6 = block[COMP_ID][BLOCK_ID][8*row_id + 3];
+    t1 = block[COMP_ID][BLOCK_ID][8*row_id + 4];
+    u5 = block[COMP_ID][BLOCK_ID][8*row_id + 5];
+    t3 = block[COMP_ID][BLOCK_ID][8*row_id + 6];
+    u7 = block[COMP_ID][BLOCK_ID][8*row_id + 7];
+
+    /* Embedded scaled inverse 4-point Type-II DCT (even half) */
+    u0 = t0 + t1;
+    u1 = t0 - t1;
+    u3 = t2 + t3;
+    u2 = (t2 - t3)*(1.4142135623730950488016887242097f) - u3; /* constant is sqrt(2) */
+    t0 = u0 + u3;
+    t3 = u0 - u3;
+    t1 = u1 + u2;
+    t2 = u1 - u2;
+
+    /* Embedded scaled inverse 4-point Type-IV DST (odd half) */
+    t5 = u5 + u6;
+    t6 = u5 - u6;
+    t7 = u4 + u7;
+    t4 = u4 - u7;
+    u7 = t7 + t5;
+    u5 = (t7 - t5)*(1.4142135623730950488016887242097f);
+    u8 = (t4 + t6)*(1.8477590650225735122563663787936f); /* shared term reused by u4 and u6 */
+    u4 = u8 - t4*(1.0823922002923939687994464107328f);
+    u6 = u8 - t6*(2.6131259297527530557132863468544f);
+    t7 = u7;
+    t6 = t7 - u6;
+    t5 = t6 + u5;
+    t4 = t5 - u4;
+
+    /* Butterflies combining the even (t0..t3) and odd (t4..t7) halves */
+    u0 = t0 + t7;
+    u7 = t0 - t7;
+    u6 = t1 + t6;
+    u1 = t1 - t6;
+    u2 = t2 + t5;
+    u5 = t2 - t5;
+    u4 = t3 + t4;
+    u3 = t3 - t4;
+
+    /* Output: element k of this row lands at [k*8 + row_id], i.e. the row is written as a column */
+    transposed[COMP_ID][BLOCK_ID][0*8 + row_id] = u0;
+    transposed[COMP_ID][BLOCK_ID][1*8 + row_id] = u1;
+    transposed[COMP_ID][BLOCK_ID][2*8 + row_id] = u2;
+    transposed[COMP_ID][BLOCK_ID][3*8 + row_id] = u3;
+    transposed[COMP_ID][BLOCK_ID][4*8 + row_id] = u4;
+    transposed[COMP_ID][BLOCK_ID][5*8 + row_id] = u5;
+    transposed[COMP_ID][BLOCK_ID][6*8 + row_id] = u6;
+    transposed[COMP_ID][BLOCK_ID][7*8 + row_id] = u7;
+}
+/* Second iDCT pass: transforms row `row_id` of `transposed` and writes it, transposed again, back into block[COMP_ID][BLOCK_ID]. */
+void idct8_vert(const uint row_id)
+{
+    float t0, t1, t2, t3, t4, t5, t6, t7, u8;
+    float u0, u1, u2, u3, u4, u5, u6, u7;
+
+    /* Input: same even/odd split as the first pass, but read from `transposed` */
+    t0 = transposed[COMP_ID][BLOCK_ID][8*row_id + 0] + 0.5f; // NOTE: t0 reaches all eight outputs with weight 1, so every output gains 0.5 (presumably a rounding bias - confirm)
+    u4 = transposed[COMP_ID][BLOCK_ID][8*row_id + 1];
+    t2 = transposed[COMP_ID][BLOCK_ID][8*row_id + 2];
+    u6 = transposed[COMP_ID][BLOCK_ID][8*row_id + 3];
+    t1 = transposed[COMP_ID][BLOCK_ID][8*row_id + 4];
+    u5 = transposed[COMP_ID][BLOCK_ID][8*row_id + 5];
+    t3 = transposed[COMP_ID][BLOCK_ID][8*row_id + 6];
+    u7 = transposed[COMP_ID][BLOCK_ID][8*row_id + 7];
+
+    /* Embedded scaled inverse 4-point Type-II DCT (even half) */
+    u0 = t0 + t1;
+    u1 = t0 - t1;
+    u3 = t2 + t3;
+    u2 = (t2 - t3)*(1.4142135623730950488016887242097f) - u3; /* constant is sqrt(2) */
+    t0 = u0 + u3;
+    t3 = u0 - u3;
+    t1 = u1 + u2;
+    t2 = u1 - u2;
+
+    /* Embedded scaled inverse 4-point Type-IV DST (odd half) */
+    t5 = u5 + u6;
+    t6 = u5 - u6;
+    t7 = u4 + u7;
+    t4 = u4 - u7;
+    u7 = t7 + t5;
+    u5 = (t7 - t5)*(1.4142135623730950488016887242097f);
+    u8 = (t4 + t6)*(1.8477590650225735122563663787936f); /* shared term reused by u4 and u6 */
+    u4 = u8 - t4*(1.0823922002923939687994464107328f);
+    u6 = u8 - t6*(2.6131259297527530557132863468544f);
+    t7 = u7;
+    t6 = t7 - u6;
+    t5 = t6 + u5;
+    t4 = t5 - u4;
+
+    /* Butterflies combining the even (t0..t3) and odd (t4..t7) halves */
+    u0 = t0 + t7;
+    u7 = t0 - t7;
+    u6 = t1 + t6;
+    u1 = t1 - t6;
+    u2 = t2 + t5;
+    u5 = t2 - t5;
+    u4 = t3 + t4;
+    u3 = t3 - t4;
+
+    /* Output: written as a column again, which undoes the transpose of the first pass */
+    block[COMP_ID][BLOCK_ID][0*8 + row_id] = u0;
+    block[COMP_ID][BLOCK_ID][1*8 + row_id] = u1;
+    block[COMP_ID][BLOCK_ID][2*8 + row_id] = u2;
+    block[COMP_ID][BLOCK_ID][3*8 + row_id] = u3;
+    block[COMP_ID][BLOCK_ID][4*8 + row_id] = u4;
+    block[COMP_ID][BLOCK_ID][5*8 + row_id] = u5;
+    block[COMP_ID][BLOCK_ID][6*8 + row_id] = u6;
+    block[COMP_ID][BLOCK_ID][7*8 + row_id] = u7;
+}
+/* Note the precedence: `+` binds tighter than `>>`, so this evaluates as (((x) & 1) + (x)) >> 1. */
+#define TODCCODEBOOK(x) (((x) & 1) + (x) >> 1)
+/* Reads one variable-length code from `gb`; `codebook` packs switch_bits (bits 8 and up), exp_order (bits 4-7) and rice_order (bits 0-3). */
+int16_t get_value(int16_t codebook)
+{
+    const int16_t switch_bits = codebook >> 8;
+    const int16_t rice_order  = codebook & I16(0xf);
+    const int16_t exp_order   = (codebook >> 4) & I16(0xf);
+    /* Peek at the next 32 bits; an all-zero window yields 0 and consumes nothing */
+    uint b = show_bits(gb, 32);
+    if (expectEXT(b == 0, false))
+        return I16(0);
+    int16_t q = I16(31) - I16(findMSB(b)); /* number of leading zero bits in the window */
+    /* Top bit set: the code is 1 + rice_order bits long and the value is the rice_order bits after the top bit */
+    if ((b & 0x80000000) > 0) {
+        skip_bits(gb, 1 + rice_order);
+        return I16((b & 0x7FFFFFFF) >> (31 - rice_order));
+    }
+    /* Short prefix (q <= switch_bits): q zeros, a one, then rice_order suffix bits */
+    if (q <= switch_bits) {
+        skip_bits(gb, q + rice_order + 1);
+        return I16((q << rice_order) +
+                   (((b << (q + 1)) >> 1) >> (31 - rice_order)));
+    }
+    /* Long prefix: the code length grows by two bits per extra leading zero */
+    int16_t bits = exp_order + (q << 1) - switch_bits;
+    skip_bits(gb, bits);
+    return I16((b >> (32 - bits)) +
+               ((switch_bits + 1) << rice_order) -
+               (1 << exp_order));
+}
+/* Decodes the DC coefficient of each of the `nb_blocks` blocks of this component into block[COMP_ID][n][0]. */
+void read_dc_vals(const uint nb_blocks)
+{
+    int16_t dc;
+    int16_t prev_dc = I16(0), dc_add = I16(0), sign = I16(0);
+
+    /* Special handling for first block: fixed codebook 700, value stored directly */
+    dc = get_value(I16(700));
+    prev_dc = int16_t(((dc & 1) + (dc >> 1) ^ -int((dc & 1))) + (dc & 1));
+    block[COMP_ID][0][0] = prev_dc;
+    /* Remaining blocks are coded as differences added to the running prev_dc */
+    for (uint n = 1; n < nb_blocks; n++) {
+        if (expectEXT(left_bits(gb) <= 0, false))
+            break; /* bitstream exhausted: remaining DC values stay as they were */
+        /* Codebook 100 whenever (n & 15) == 1, otherwise chosen from the previous code via dc_cb[] (index clamped to 12) */
+        int16_t dc_codebook;
+        if ((n & 15) == 1)
+            dc_codebook = I16(100);
+        else
+            dc_codebook = dc_cb[min(TODCCODEBOOK(dc), 13 - 1)];
+
+        dc = get_value(dc_codebook);
+        /* The low bit of the code toggles the sign carried over from the previous difference */
+        sign = sign ^ dc & int16_t(1);
+        dc_add = (-sign ^ I16(TODCCODEBOOK(dc))) + sign;
+        sign = I16(dc_add < 0);
+        prev_dc += dc_add;
+
+        block[COMP_ID][n][0] = prev_dc;
+    }
+}
+/* Decodes the AC coefficients of all `nb_blocks` blocks of this component; coefficient index n maps to block (n & block_mask) and scan position (n >> log2_nb_blocks). */
+void read_ac_vals(const uint nb_blocks)
+{
+    uint8_t idx;
+    const uint nb_codes = nb_blocks << 6; /* 64 coefficients per block */
+    const uint log2_nb_blocks = findMSB(nb_blocks);
+    const uint block_mask = (1 << log2_nb_blocks) - 1; /* NOTE(review): the mask/shift mapping assumes nb_blocks is a power of two - confirm */
+
+    int16_t sign;
+    int16_t ac, rn, ln;
+    int16_t ac_codebook = I16(49);
+    int16_t rn_codebook = I16( 0);
+    int16_t ln_codebook = I16(66);
+    int16_t val;
+    /* n starts at nb_blocks because indices 0..nb_blocks-1 are the DC terms */
+    for (uint n = nb_blocks; n <= nb_codes;) {
+        if (expectEXT(left_bits(gb) <= 0, false))
+            break;
+        /* `ln` consecutive coefficients follow, each as a magnitude code plus one sign bit */
+        ln = get_value(ln_codebook);
+        for (uint i = 0; i < ln; i++) {
+            if (expectEXT(left_bits(gb) <= 0, false))
+                break;
+
+            if (expectEXT((n + i) >= nb_codes, false))
+                break;
+
+            ac = get_value(ac_codebook);
+            ac_codebook = ac_cb[min(ac, 95 - 1)];
+            sign = -int16_t(get_bits(gb, 1)); /* 0 or -1 */
+
+            idx = scan[(n + i) >> log2_nb_blocks];
+            val = int16_t(((ac + I16(1)) ^ sign) - sign); /* (ac + 1), negated when the sign bit was set */
+            block[COMP_ID][(n + i) & block_mask][idx] = val;
+        }
+
+        n += ln;
+        if (expectEXT(n >= nb_codes, false))
+            break;
+        /* `rn` + 1 positions are skipped; they keep whatever block[] already holds */
+        rn = get_value(rn_codebook);
+        rn_codebook = rn_cb[min(rn, 28 - 1)];
+
+        n += rn + 1;
+        if (expectEXT(n >= nb_codes, false))
+            break;
+
+        if (expectEXT(left_bits(gb) <= 0, false))
+            break;
+        /* One coefficient always follows a skip */
+        ac = get_value(ac_codebook);
+        sign = -int16_t(get_bits(gb, 1));
+
+        idx = scan[n >> log2_nb_blocks];
+        val = int16_t(((ac + 1) ^ sign) - sign);
+        block[COMP_ID][n & block_mask][idx] = val;
+        /* Both the next magnitude codebook and the next run-length codebook adapt to this value */
+        ac_codebook = ac_cb[min(ac, 95 - 1)];
+        ln_codebook = ln_cb[min(ac, 15 - 1)];
+
+        n++;
+    }
+}
+/* One workgroup per tile: one invocation per component parses the bitstream, then all invocations dequantize, run the iDCT and store the samples. */
+void main(void)
+{
+    const uint tile_idx = gl_WorkGroupID.y*gl_NumWorkGroups.x + gl_WorkGroupID.x;
+    TileData td = tile_data[tile_idx];
+    /* The condition depends only on per-tile data, so the whole workgroup leaves together */
+    if (expectEXT(td.pos.x >= frame_size.x, false))
+        return;
+    /* 8-byte tile header of four 16-bit fields (byte order depends on pack16, defined outside this file) */
+    uint64_t pkt_offset = uint64_t(pkt_data) + td.offset;
+    u8vec2buf hdr_data = u8vec2buf(pkt_offset);
+    int qscale = int(pack16(hdr_data[0].v.yx));
+    /* size[1..3] come from the header; size[0] is whatever remains of the tile after them and the header */
+    ivec4 size = ivec4(td.size,
+                       pack16(hdr_data[2].v.yx),
+                       pack16(hdr_data[1].v.yx),
+                       pack16(hdr_data[3].v.yx));
+    size[0] = size[0] - size[1] - size[2] - size[3] - 8;
+    if (expectEXT(size[0] < 0, false))
+        return;
+    /* Each component owns one position of a 2x2 pixel group: bit 0 of COMP_ID is x, bit 1 is y */
+    const ivec2 offs = td.pos + ivec2(COMP_ID & 1, COMP_ID >> 1);
+    const float n = float(qscale - 16384) / (2.0f*4096.0f); /* dequantization factor applied to every coefficient */
+    const uint w = min(tile_size.x, frame_size.x - td.pos.x) / 2; /* per-component width of the visible part of the tile */
+    const uint nb_blocks = w / 8;
+    /* Byte offset of each component's data after the header: component 2 first, then 1, 3 and finally 0 */
+    const ivec4 comp_offset = ivec4(size[2] + size[1] + size[3],
+                                    size[2],
+                                    0,
+                                    size[2] + size[1]);
+    /* Entropy decoding is serial: only invocation (x = 0, y = 0) of each component runs it */
+    if (BLOCK_ID == 0 && ROW_ID == 0) {
+        init_get_bits(gb, u8buf(pkt_offset + 8 + comp_offset[COMP_ID]),
+                      size[COMP_ID]);
+        read_dc_vals(nb_blocks);
+        read_ac_vals(nb_blocks);
+    }
+    /* Wait until all coefficients of the tile have been written to block[] */
+    barrier();
+    /* Dequantize and apply the iDCT scale factors; the 64 entries are split across the x invocations */
+    [[unroll]]
+    for (uint i = gl_LocalInvocationID.x; i < 64; i += gl_WorkGroupSize.x)
+        block[COMP_ID][BLOCK_ID][i] *= n * idct_8x8_scales[i];
+
+    barrier();
+
+#ifdef PARALLEL_ROWS
+    idct8_horiz(ROW_ID); /* one row per invocation */
+
+    barrier();
+
+    idct8_vert(ROW_ID);
+#else
+    for (uint j = 0; j < 8; j++) /* a single invocation per block walks all eight rows */
+        idct8_horiz(j);
+
+    barrier();
+
+    for (uint j = 0; j < 8; j++)
+        idct8_vert(j);
+#endif
+
+    barrier();
+    /* Store with a stride of two pixels in both directions, starting at this component's offset */
+    [[unroll]]
+    for (uint i = gl_LocalInvocationID.x; i < 64; i += gl_WorkGroupSize.x)
+         imageStore(dst,
+                    offs + 2*ivec2(BLOCK_ID*8 + (i & 7), i >> 3),
+                    vec4(block[COMP_ID][BLOCK_ID][i]));
+}
diff --git a/libavcodec/vulkan_decode.c b/libavcodec/vulkan_decode.c
index 7310ba1547..857f16bc0a 100644
--- a/libavcodec/vulkan_decode.c
+++ b/libavcodec/vulkan_decode.c
@@ -25,7 +25,8 @@
 #include "libavutil/vulkan_loader.h"
 
 #define DECODER_IS_SDR(codec_id) \
-    ((codec_id) == AV_CODEC_ID_FFV1)
+    (((codec_id) == AV_CODEC_ID_FFV1) || \
+     ((codec_id) == AV_CODEC_ID_PRORES_RAW))
 
 #if CONFIG_H264_VULKAN_HWACCEL
 extern const FFVulkanDecodeDescriptor ff_vk_dec_h264_desc;
@@ -39,6 +40,9 @@ extern const FFVulkanDecodeDescriptor ff_vk_dec_av1_desc;
 #if CONFIG_FFV1_VULKAN_HWACCEL
 extern const FFVulkanDecodeDescriptor ff_vk_dec_ffv1_desc;
 #endif
+#if CONFIG_PRORES_RAW_VULKAN_HWACCEL
+extern const FFVulkanDecodeDescriptor ff_vk_dec_prores_raw_desc;
+#endif
 
 static const FFVulkanDecodeDescriptor *dec_descs[] = {
 #if CONFIG_H264_VULKAN_HWACCEL
@@ -53,6 +57,9 @@ static const FFVulkanDecodeDescriptor *dec_descs[] = {
 #if CONFIG_FFV1_VULKAN_HWACCEL
     &ff_vk_dec_ffv1_desc,
 #endif
+#if CONFIG_PRORES_RAW_VULKAN_HWACCEL
+    &ff_vk_dec_prores_raw_desc,
+#endif
 };
 
 static const FFVulkanDecodeDescriptor *get_codecdesc(enum AVCodecID codec_id)
diff --git a/libavcodec/vulkan_prores_raw.c b/libavcodec/vulkan_prores_raw.c
new file mode 100644
index 0000000000..31320ffa18
--- /dev/null
+++ b/libavcodec/vulkan_prores_raw.c
@@ -0,0 +1,498 @@
+/*
+ * Copyright (c) 2025 Lynne <dev@lynne.ee>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "vulkan_decode.h"
+#include "hwaccel_internal.h"
+
+#include "proresdec_raw.h"
+#include "libavutil/vulkan_spirv.h"
+#include "libavutil/mem.h"
+
+extern const char *ff_source_common_comp;
+extern const char *ff_source_prores_raw_comp;
+/* Decoder descriptor; vulkan_decode.c lists it in dec_descs[] when this hwaccel is enabled */
+const FFVulkanDecodeDescriptor ff_vk_dec_prores_raw_desc = {
+    .codec_id         = AV_CODEC_ID_PRORES_RAW,
+    .decode_extension = FF_VK_EXT_PUSH_DESCRIPTOR,
+    .queue_flags      = VK_QUEUE_COMPUTE_BIT, /* decoding is done by a compute shader */
+};
+/* Per-picture hwaccel state, reached through prr->hwaccel_picture_private */
+typedef struct ProResRAWVulkanDecodePicture {
+    FFVulkanDecodePicture vp;
+
+    AVBufferRef *tile_data; /* pooled buffer sized for prr->nb_tiles TileData entries */
+    uint32_t nb_tiles; /* number of tiles recorded so far by decode_slice */
+} ProResRAWVulkanDecodePicture;
+/* Codec-specific state stored in FFVulkanDecodeShared.sd_ctx */
+typedef struct ProResRAWVulkanDecodeContext {
+    FFVulkanShader decode[2]; /* selected with prr->version when dispatching */
+
+    AVBufferPool *tile_data_pool; /* backs ProResRAWVulkanDecodePicture.tile_data */
+
+    FFVkBuffer uniform_buf; /* iDCT scale table, scan table and the four codebooks */
+} ProResRAWVulkanDecodeContext;
+/* Push constants; field order and sizes must match the pushConstants block emitted in init_decode_shader() */
+typedef struct DecodePushData {
+    VkDeviceAddress tile_data; /* device address of the TileData array */
+    VkDeviceAddress pkt_data; /* device address of the buffer holding the tile bitstreams */
+    uint32_t frame_size[2];
+    uint32_t tile_size[2];
+} DecodePushData;
+/* Per-tile record read by the shader; must match the GLSL TileData buffer_reference layout */
+typedef struct TileData {
+    int32_t pos[2]; /* tile position taken from prr->tiles[].x / .y */
+    uint32_t offset; /* byte offset of the tile relative to pkt_data */
+    uint32_t size; /* size of the tile in bytes, header included */
+} TileData;
+/*
+ * Per-frame setup: optionally imports the packet memory, grabs a buffer for
+ * the per-tile metadata and prepares the output frame for the shader.
+ */
+static int vk_prores_raw_start_frame(AVCodecContext          *avctx,
+                                     const AVBufferRef       *buffer_ref,
+                                     av_unused const uint8_t *buffer,
+                                     av_unused uint32_t       size)
+{
+    ProResRAWContext *prr = avctx->priv_data;
+    ProResRAWVulkanDecodePicture *pic = prr->hwaccel_picture_private;
+    FFVulkanDecodePicture *vp = &pic->vp;
+
+    FFVulkanDecodeContext *dec = avctx->internal->hwaccel_priv_data;
+    FFVulkanDecodeShared *shared = dec->shared_ctx;
+    ProResRAWVulkanDecodeContext *prv = shared->sd_ctx;
+    int ret;
+
+    /* Best effort: the result of the host mapping is deliberately not checked */
+    if (shared->s.extensions & FF_VK_EXT_EXTERNAL_HOST_MEMORY)
+        ff_vk_host_map_buffer(&shared->s, &vp->slices_buf, buffer_ref->data,
+                              buffer_ref,
+                              VK_BUFFER_USAGE_STORAGE_BUFFER_BIT |
+                              VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT);
+
+    /* Room for one TileData entry per tile of the frame */
+    ret = ff_vk_get_pooled_buffer(&shared->s, &prv->tile_data_pool,
+                                  &pic->tile_data,
+                                  VK_BUFFER_USAGE_STORAGE_BUFFER_BIT |
+                                  VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT,
+                                  NULL, prr->nb_tiles*sizeof(TileData),
+                                  VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT |
+                                  VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT);
+    if (ret < 0)
+        return ret;
+
+    /* Get the output frame ready for writing */
+    ret = ff_vk_decode_prepare_frame_sdr(dec, prr->frame, vp, 1,
+                                         FF_VK_REP_FLOAT, 0);
+    return ret < 0 ? ret : 0;
+}
+/* Records the position, size and bitstream offset of one tile. */
+static int vk_prores_raw_decode_slice(AVCodecContext *avctx,
+                                      const uint8_t  *data,
+                                      uint32_t        size)
+{
+    ProResRAWContext *prr = avctx->priv_data;
+    ProResRAWVulkanDecodePicture *pic = prr->hwaccel_picture_private;
+    FFVulkanDecodePicture *vp = &pic->vp;
+
+    FFVkBuffer *meta_buf = (FFVkBuffer *)pic->tile_data->data;
+    FFVkBuffer *pkt_buf = vp->slices_buf ? (FFVkBuffer *)vp->slices_buf->data : NULL;
+    TileData *tile = (TileData *)meta_buf->mapped_mem + pic->nb_tiles;
+    int ret;
+
+    tile->pos[0] = prr->tiles[pic->nb_tiles].x;
+    tile->pos[1] = prr->tiles[pic->nb_tiles].y;
+    tile->size   = size;
+
+    /* Host-mapped packet: the tile is referenced in place */
+    if (pkt_buf && pkt_buf->host_ref) {
+        tile->offset = data - pkt_buf->mapped_mem;
+        pic->nb_tiles++;
+        return 0;
+    }
+
+    /* Otherwise the tile is appended to the slices buffer, so it starts at
+     * the current end of that buffer */
+    tile->offset = vp->slices_size;
+    ret = ff_vk_decode_add_slice(avctx, vp, data, size, 0,
+                                 &pic->nb_tiles, NULL);
+    return ret < 0 ? ret : 0;
+}
+
+static int vk_prores_raw_end_frame(AVCodecContext *avctx)
+{
+    int err;
+    FFVulkanDecodeContext *dec = avctx->internal->hwaccel_priv_data;
+    FFVulkanDecodeShared *ctx = dec->shared_ctx;
+    FFVulkanFunctions *vk = &ctx->s.vkfn;
+
+    ProResRAWContext *prr = avctx->priv_data;
+    ProResRAWVulkanDecodeContext *prv = ctx->sd_ctx;
+
+    DecodePushData pd_decode;
+    FFVulkanShader *decode_shader;
+
+    ProResRAWVulkanDecodePicture *pp = prr->hwaccel_picture_private;
+    FFVulkanDecodePicture *vp = &pp->vp;
+
+    FFVkBuffer *slices_buf = (FFVkBuffer *)vp->slices_buf->data;
+    FFVkBuffer *tile_data = (FFVkBuffer *)pp->tile_data->data;
+
+    VkImageMemoryBarrier2 img_bar[8];
+    int nb_img_bar = 0;
+
+    FFVkExecContext *exec = ff_vk_exec_get(&ctx->s, &ctx->exec_pool);
+    ff_vk_exec_start(&ctx->s, exec);
+
+    /* Prepare deps */
+    RET(ff_vk_exec_add_dep_frame(&ctx->s, exec, prr->frame,
+                                 VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT,
+                                 VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT));
+
+    RET(ff_vk_exec_add_dep_buf(&ctx->s, exec, &pp->tile_data, 1, 0));
+    pp->tile_data = NULL;
+    RET(ff_vk_exec_add_dep_buf(&ctx->s, exec, &vp->slices_buf, 1, 0));
+    vp->slices_buf = NULL;
+
+    ff_vk_frame_barrier(&ctx->s, exec, prr->frame, img_bar, &nb_img_bar,
+                        VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT,
+                        VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT,
+                        VK_ACCESS_2_TRANSFER_WRITE_BIT,
+                        VK_IMAGE_LAYOUT_GENERAL,
+                        VK_QUEUE_FAMILY_IGNORED);
+
+    vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) {
+        .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
+        .pImageMemoryBarriers = img_bar,
+        .imageMemoryBarrierCount = nb_img_bar,
+    });
+    nb_img_bar = 0;
+
+    decode_shader = &prv->decode[prr->version];
+    ff_vk_shader_update_img_array(&ctx->s, exec, decode_shader,
+                                  prr->frame, vp->view.out,
+                                  0, 0,
+                                  VK_IMAGE_LAYOUT_GENERAL,
+                                  VK_NULL_HANDLE);
+
+    ff_vk_exec_bind_shader(&ctx->s, exec, decode_shader);
+    pd_decode = (DecodePushData) {
+        .tile_data = tile_data->address,
+        .pkt_data = slices_buf->address,
+        .frame_size[0] = avctx->width,
+        .frame_size[1] = avctx->height,
+        .tile_size[0] = prr->tw,
+        .tile_size[1] = prr->th,
+    };
+    ff_vk_shader_update_push_const(&ctx->s, exec, decode_shader,
+                                   VK_SHADER_STAGE_COMPUTE_BIT,
+                                   0, sizeof(pd_decode), &pd_decode);
+
+    vk->CmdDispatch(exec->buf, prr->nb_tw, prr->nb_th, 1);
+
+    err = ff_vk_exec_submit(&ctx->s, exec);
+    if (err < 0)
+        return err;
+
+fail:
+    return 0;
+}
+/* Builds, compiles and registers the decode shader for one bitstream `version`; `prr` is not used in this function. */
+static int init_decode_shader(ProResRAWContext *prr, FFVulkanContext *s,
+                              FFVkExecPool *pool, FFVkSPIRVCompiler *spv,
+                              FFVulkanShader *shd, int version)
+{
+    int err;
+    FFVulkanDescriptorSetBinding *desc_set;
+    int parallel_rows = 1;
+
+    uint8_t *spv_data;
+    size_t spv_len;
+    void *spv_opaque = NULL;
+    /* Use the single-invocation-per-block iDCT on devices with small workgroups and on integrated GPUs */
+    if (s->props.properties.limits.maxComputeWorkGroupInvocations < 512 ||
+        s->props.properties.deviceType == VK_PHYSICAL_DEVICE_TYPE_INTEGRATED_GPU)
+        parallel_rows = 0;
+
+    RET(ff_vk_shader_init(s, shd, "prores_raw",
+                          VK_SHADER_STAGE_COMPUTE_BIT,
+                          (const char *[]) { "GL_EXT_buffer_reference",
+                                             "GL_EXT_buffer_reference2",
+                                             "GL_EXT_null_initializer",
+                                             "GL_AMD_gpu_shader_half_float" }, 4,
+                          parallel_rows ? 8 : 1 /* 8x8 transforms, 8-point width */,
+                          version == 0 ? 8 : 16 /* Horizontal blocks */,
+                          4 /* Components */,
+                          0));
+    /* The shader source switches its iDCT code path on this define */
+    if (parallel_rows)
+        GLSLC(0, #define PARALLEL_ROWS                                               );
+
+    /* Common codec header */
+    GLSLD(ff_source_common_comp);
+    /* GLSL mirrors of the C TileData and DecodePushData structs */
+    GLSLC(0, layout(buffer_reference, buffer_reference_align = 16) buffer TileData { );
+    GLSLC(1,    ivec2 pos;                                                           );
+    GLSLC(1,    uint offset;                                                         );
+    GLSLC(1,    uint size;                                                           );
+    GLSLC(0, };                                                                      );
+    GLSLC(0,                                                                         );
+    GLSLC(0, layout(push_constant, scalar) uniform pushConstants {                   );
+    GLSLC(1,    TileData tile_data;                                                  );
+    GLSLC(1,    u8buf pkt_data;                                                      );
+    GLSLC(1,    uvec2 frame_size;                                                    );
+    GLSLC(1,    uvec2 tile_size;                                                     );
+    GLSLC(0, };                                                                      );
+    GLSLC(0,                                                                         );
+    ff_vk_shader_add_push_const(shd, 0, sizeof(DecodePushData),
+                                VK_SHADER_STAGE_COMPUTE_BIT);
+    /* First descriptor set: the output image */
+    desc_set = (FFVulkanDescriptorSetBinding []) {
+        {
+            .name       = "dst",
+            .type       = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
+            .mem_layout = "r16",
+            .mem_quali  = "writeonly",
+            .dimensions = 2,
+            .stages     = VK_SHADER_STAGE_COMPUTE_BIT,
+        },
+    };
+    RET(ff_vk_shader_add_descriptor_set(s, shd, desc_set, 1, 0, 0));
+    /* Second descriptor set: six uniform buffers with the constant tables read by the shader */
+    desc_set = (FFVulkanDescriptorSetBinding []) {
+        {
+            .name        = "dct_scale_buf",
+            .type        = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER,
+            .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
+            .mem_layout  = "scalar",
+            .buf_content = "float idct_8x8_scales[64];",
+        },
+        {
+            .name        = "scan_buf",
+            .type        = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER,
+            .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
+            .mem_layout  = "scalar",
+            .buf_content = "uint8_t scan[64];",
+        },
+        {
+            .name        = "dc_cb_buf",
+            .type        = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER,
+            .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
+            .mem_layout  = "scalar",
+            .buf_content = "int16_t dc_cb[13];",
+        },
+        {
+            .name        = "ac_cb_buf",
+            .type        = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER,
+            .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
+            .mem_layout  = "scalar",
+            .buf_content = "int16_t ac_cb[95];",
+        },
+        {
+            .name        = "rn_cb_buf",
+            .type        = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER,
+            .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
+            .mem_layout  = "scalar",
+            .buf_content = "int16_t rn_cb[28];",
+        },
+        {
+            .name        = "ln_cb_buf",
+            .type        = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER,
+            .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
+            .mem_layout  = "scalar",
+            .buf_content = "int16_t ln_cb[15];",
+        },
+    };
+    RET(ff_vk_shader_add_descriptor_set(s, shd, desc_set, 6, 1, 0));
+    /* The shader body goes last, after all declarations it refers to */
+    GLSLD(ff_source_prores_raw_comp);
+
+    RET(spv->compile_shader(s, spv, shd, &spv_data, &spv_len, "main",
+                            &spv_opaque));
+    RET(ff_vk_shader_link(s, shd, spv_data, spv_len, "main"));
+
+    RET(ff_vk_shader_register_exec(s, pool, shd));
+    /* Reached on success and on failure; err holds the result of the last RET() call */
+fail:
+    if (spv_opaque)
+        spv->free_shader(spv, &spv_opaque);
+
+    return err;
+}
+/* Frees the shaders, the uniform buffer and the tile pool of the codec-specific context, then the context itself. */
+static void vk_decode_prores_raw_uninit(FFVulkanDecodeShared *ctx)
+{
+    ProResRAWVulkanDecodeContext *fv = ctx->sd_ctx;
+
+    ff_vk_shader_free(&ctx->s, &fv->decode[0]);
+    ff_vk_shader_free(&ctx->s, &fv->decode[1]);
+
+    ff_vk_free_buf(&ctx->s, &fv->uniform_buf);
+
+    av_buffer_pool_uninit(&fv->tile_data_pool);
+    /* Free through the owning field so ctx->sd_ctx is reset to NULL, not only a local copy */
+    av_freep(&ctx->sd_ctx);
+}
+
+/**
+ * Hwaccel init callback for the ProRes RAW Vulkan decoder.
+ *
+ * Initializes the generic Vulkan decode state, allocates the ProRes RAW
+ * specific context, builds the two decode shader variants and fills a single
+ * buffer with the constant tables used by the shaders:
+ *   - 64 floats: 8x8 products of the iDCT scale factors,
+ *   - 64 bytes:  the scan table taken from the software decoder context,
+ *   - 4 codebooks (dc, ac, rn, ln), each at its own aligned offset.
+ * The sub-ranges are then bound to descriptor set 1 of both shaders.
+ *
+ * @param avctx codec context; priv_data is the ProResRAWContext and
+ *              internal->hwaccel_priv_data the FFVulkanDecodeContext
+ * @return a non-negative value on success, a negative AVERROR code on failure
+ */
+static int vk_decode_prores_raw_init(AVCodecContext *avctx)
+{
+    int err;
+    ProResRAWContext *prr = avctx->priv_data;
+
+    FFVulkanDecodeContext *dec = avctx->internal->hwaccel_priv_data;
+    FFVulkanDecodeShared *ctx = NULL;
+    size_t ua;
+
+    ProResRAWVulkanDecodeContext *prv;
+    FFVkSPIRVCompiler *spv;
+
+    uint8_t *uniform_buf;
+    float *dct_scale_buf;
+    double idct_8_scales[8] = {
+        cos(4.0*M_PI/16.0)/2.0,
+        cos(1.0*M_PI/16.0)/2.0,
+        cos(2.0*M_PI/16.0)/2.0,
+        cos(3.0*M_PI/16.0)/2.0,
+        cos(4.0*M_PI/16.0)/2.0,
+        cos(5.0*M_PI/16.0)/2.0,
+        cos(6.0*M_PI/16.0)/2.0,
+        cos(7.0*M_PI/16.0)/2.0,
+    };
+    uint8_t *scan_buf;
+    /* Size of the iDCT scale table plus the scan table at the buffer start */
+    const size_t tables_size = 64*sizeof(float) + 64*sizeof(uint8_t);
+    /* Descriptor ranges of the dc, ac, rn and ln codebooks, in that order */
+    size_t cb_size[4] = {
+        13*sizeof(int16_t),
+        95*sizeof(int16_t),
+        28*sizeof(int16_t),
+        15*sizeof(int16_t),
+    };
+    /* cb_offset[0..3]: start of each codebook, cb_offset[4]: end of the last */
+    size_t cb_offset[5];
+
+    spv = ff_vk_spirv_init();
+    if (!spv) {
+        av_log(avctx, AV_LOG_ERROR, "Unable to initialize SPIR-V compiler!\n");
+        return AVERROR_EXTERNAL;
+    }
+
+    /* From here on every failure must go through the fail label, otherwise
+     * the SPIR-V compiler instance created above is leaked. */
+    RET(ff_vk_decode_init(avctx));
+    ctx = dec->shared_ctx;
+
+    prv = ctx->sd_ctx = av_mallocz(sizeof(*prv));
+    if (!prv) {
+        err = AVERROR(ENOMEM);
+        goto fail;
+    }
+
+    ctx->sd_ctx_free = &vk_decode_prores_raw_uninit;
+
+    /* Every codebook is bound as its own descriptor range, so each start
+     * offset, including the first one, is rounded up to the device's uniform
+     * buffer offset alignment (tables_size alone is 320 bytes, which is not
+     * a multiple of 128 or 256).
+     * NOTE(review): assumes the bindings are uniform buffers, as the buffer
+     * usage flags below suggest - confirm against init_decode_shader(). */
+    ua = ctx->s.props.properties.limits.minUniformBufferOffsetAlignment;
+    cb_offset[0] = FFALIGN(tables_size, ua);
+    for (int i = 0; i < 4; i++)
+        cb_offset[i + 1] = cb_offset[i] + FFALIGN(cb_size[i], ua);
+
+    /* Setup decode shader */
+    RET(init_decode_shader(prr, &ctx->s, &ctx->exec_pool, spv, &prv->decode[0], 0));
+    RET(init_decode_shader(prr, &ctx->s, &ctx->exec_pool, spv, &prv->decode[1], 1));
+
+    /* The size keeps the original head room on top of cb_offset[4] */
+    RET(ff_vk_create_buf(&ctx->s, &prv->uniform_buf,
+                         tables_size + cb_offset[4] + 256,
+                         NULL, NULL,
+                         VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT |
+                         VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT,
+                         VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT |
+                         VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT));
+
+    RET(ff_vk_map_buffer(&ctx->s, &prv->uniform_buf, &uniform_buf, 0));
+
+    /* 8x8 table: product of the row (i >> 3) and column (i & 7) scales */
+    dct_scale_buf = (float *)uniform_buf;
+    for (int i = 0; i < 64; i++)
+        dct_scale_buf[i] = (float)(idct_8_scales[i >> 3] * idct_8_scales[i & 7]);
+
+    scan_buf = uniform_buf + 64*sizeof(float);
+    for (int i = 0; i < 64; i++)
+        scan_buf[i] = prr->scan[i];
+
+    memcpy(uniform_buf + cb_offset[0], ff_prores_raw_dc_cb,
+           sizeof(ff_prores_raw_dc_cb));
+    memcpy(uniform_buf + cb_offset[1], ff_prores_raw_ac_cb,
+           sizeof(ff_prores_raw_ac_cb));
+    memcpy(uniform_buf + cb_offset[2], ff_prores_raw_rn_cb,
+           sizeof(ff_prores_raw_rn_cb));
+    memcpy(uniform_buf + cb_offset[3], ff_prores_raw_ln_cb,
+           sizeof(ff_prores_raw_ln_cb));
+
+    RET(ff_vk_unmap_buffer(&ctx->s, &prv->uniform_buf, 1));
+
+    /* Bind the table sub-ranges to set 1 of both shader variants:
+     * binding 0 = iDCT scales, 1 = scan table, 2..5 = codebooks */
+    for (int i = 0; i < 2; i++) {
+        RET(ff_vk_shader_update_desc_buffer(&ctx->s, &ctx->exec_pool.contexts[0],
+                                            &prv->decode[i], 1, 0, 0,
+                                            &prv->uniform_buf,
+                                            0, 64*sizeof(float),
+                                            VK_FORMAT_UNDEFINED));
+        RET(ff_vk_shader_update_desc_buffer(&ctx->s, &ctx->exec_pool.contexts[0],
+                                            &prv->decode[i], 1, 1, 0,
+                                            &prv->uniform_buf,
+                                            64*sizeof(float), 64*sizeof(uint8_t),
+                                            VK_FORMAT_UNDEFINED));
+        for (int j = 0; j < 4; j++)
+            RET(ff_vk_shader_update_desc_buffer(&ctx->s, &ctx->exec_pool.contexts[0],
+                                                &prv->decode[i], 1, 2 + j, 0,
+                                                &prv->uniform_buf,
+                                                cb_offset[j], cb_size[j],
+                                                VK_FORMAT_UNDEFINED));
+    }
+
+fail:
+    /* The compiler is only needed while building the shaders; it is released
+     * on both the success and the error path. */
+    spv->uninit(&spv);
+
+    return err;
+}
+
+/**
+ * Free callback for the per-frame hwaccel private data: hands the embedded
+ * FFVulkanDecodePicture to ff_vk_decode_free_frame().
+ *
+ * @param _hwctx opaque whose nc member is the AVHWDeviceContext
+ * @param data   the frame's ProResRAWVulkanDecodePicture
+ */
+static void vk_prores_raw_free_frame_priv(AVRefStructOpaque _hwctx, void *data)
+{
+    ProResRAWVulkanDecodePicture *pic = data;
+    AVHWDeviceContext *hwdev = _hwctx.nc;
+
+    ff_vk_decode_free_frame(hwdev, &pic->vp);
+}
+
+/**
+ * Hwaccel descriptor for decoding AV_CODEC_ID_PRORES_RAW video into
+ * AV_PIX_FMT_VULKAN frames.
+ *
+ * The frame callbacks, the init callback and the per-frame free callback are
+ * the ProRes RAW specific functions of this file; the remaining entries are
+ * ff_vk_* helpers (presumably the ones shared by all Vulkan hwaccels).
+ */
+const FFHWAccel ff_prores_raw_vulkan_hwaccel = {
+    .p.name                = "prores_raw_vulkan",
+    .p.type                = AVMEDIA_TYPE_VIDEO,
+    .p.id                  = AV_CODEC_ID_PRORES_RAW,
+    .p.pix_fmt             = AV_PIX_FMT_VULKAN,
+    /* Per-frame decoding entry points */
+    .start_frame           = &vk_prores_raw_start_frame,
+    .decode_slice          = &vk_prores_raw_decode_slice,
+    .end_frame             = &vk_prores_raw_end_frame,
+    /* Per-frame private data: one ProResRAWVulkanDecodePicture per frame */
+    .free_frame_priv       = &vk_prores_raw_free_frame_priv,
+    .frame_priv_data_size  = sizeof(ProResRAWVulkanDecodePicture),
+    .init                  = &vk_decode_prores_raw_init,
+    .update_thread_context = &ff_vk_update_thread_context,
+    .decode_params         = &ff_vk_params_invalidate,
+    .flush                 = &ff_vk_decode_flush,
+    .uninit                = &ff_vk_decode_uninit,
+    .frame_params          = &ff_vk_frame_params,
+    .priv_data_size        = sizeof(FFVulkanDecodeContext),
+    .caps_internal         = HWACCEL_CAP_ASYNC_SAFE | HWACCEL_CAP_THREAD_SAFE,
+};
-- 
2.49.0.395.g12beb8f557c
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".