From: Lynne <dev@lynne.ee> To: ffmpeg-devel@ffmpeg.org Cc: Lynne <dev@lynne.ee> Subject: [FFmpeg-devel] [PATCH 7/7] lavc: add a ProRes RAW Vulkan hwaccel Date: Fri, 11 Jul 2025 00:13:35 +0900 Message-ID: <20250710151349.1157547-7-dev@lynne.ee> (raw) In-Reply-To: <20250710151349.1157547-1-dev@lynne.ee> This commit adds a ProRes RAW hardware implementation written in Vulkan. Both version 0 and version 1 streams are supported. The implementation is highly parallelized: 512 invocations are dispatched per tile, and a 5.8k stream generally has 4k tiles. Thanks to unlord for the 8-point iDCT. Benchmark for a generic 5.8k RAW HQ file: 6900XT: 63fps 7900XTX: 84fps 6000 Ada: 120fps Intel: 9fps --- configure | 2 + libavcodec/Makefile | 1 + libavcodec/hwaccels.h | 1 + libavcodec/proresdec_raw.c | 10 + libavcodec/vulkan/Makefile | 3 + libavcodec/vulkan/prores_raw.comp | 348 +++++++++++++++++++++ libavcodec/vulkan_decode.c | 9 +- libavcodec/vulkan_prores_raw.c | 498 ++++++++++++++++++++++++++++++ 8 files changed, 871 insertions(+), 1 deletion(-) create mode 100644 libavcodec/vulkan/prores_raw.comp create mode 100644 libavcodec/vulkan_prores_raw.c diff --git a/configure b/configure index e0503337cd..8c34dc7cf1 100755 --- a/configure +++ b/configure @@ -3290,6 +3290,8 @@ mpeg4_videotoolbox_hwaccel_deps="videotoolbox" mpeg4_videotoolbox_hwaccel_select="mpeg4_decoder" prores_videotoolbox_hwaccel_deps="videotoolbox" prores_videotoolbox_hwaccel_select="prores_decoder" +prores_raw_vulkan_hwaccel_deps="vulkan spirv_compiler" +prores_raw_vulkan_hwaccel_select="prores_raw_decoder" vc1_d3d11va_hwaccel_deps="d3d11va" vc1_d3d11va_hwaccel_select="vc1_decoder" vc1_d3d11va2_hwaccel_deps="d3d11va" diff --git a/libavcodec/Makefile b/libavcodec/Makefile index 47d16f3312..23721de65f 100644 --- a/libavcodec/Makefile +++ b/libavcodec/Makefile @@ -1080,6 +1080,7 @@ OBJS-$(CONFIG_VP9_VDPAU_HWACCEL) += vdpau_vp9.o OBJS-$(CONFIG_VP9_VIDEOTOOLBOX_HWACCEL) += videotoolbox_vp9.o 
OBJS-$(CONFIG_VP8_QSV_HWACCEL) += qsvdec.o OBJS-$(CONFIG_VVC_VAAPI_HWACCEL) += vaapi_vvc.o +OBJS-$(CONFIG_PRORES_RAW_VULKAN_HWACCEL) += vulkan_decode.o vulkan_prores_raw.o # Objects duplicated from other libraries for shared builds SHLIBOBJS += log2_tab.o reverse.o diff --git a/libavcodec/hwaccels.h b/libavcodec/hwaccels.h index 0b2c725247..fb9b850233 100644 --- a/libavcodec/hwaccels.h +++ b/libavcodec/hwaccels.h @@ -67,6 +67,7 @@ extern const struct FFHWAccel ff_mpeg4_vaapi_hwaccel; extern const struct FFHWAccel ff_mpeg4_vdpau_hwaccel; extern const struct FFHWAccel ff_mpeg4_videotoolbox_hwaccel; extern const struct FFHWAccel ff_prores_videotoolbox_hwaccel; +extern const struct FFHWAccel ff_prores_raw_vulkan_hwaccel; extern const struct FFHWAccel ff_vc1_d3d11va_hwaccel; extern const struct FFHWAccel ff_vc1_d3d11va2_hwaccel; extern const struct FFHWAccel ff_vc1_d3d12va_hwaccel; diff --git a/libavcodec/proresdec_raw.c b/libavcodec/proresdec_raw.c index caebed9e96..6a0773c8a5 100644 --- a/libavcodec/proresdec_raw.c +++ b/libavcodec/proresdec_raw.c @@ -20,6 +20,7 @@ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ +#include "config_components.h" #include "libavutil/intreadwrite.h" #include "libavutil/mem_internal.h" #include "libavutil/mem.h" @@ -30,10 +31,13 @@ #include "bytestream.h" #include "codec_internal.h" #include "decode.h" +#include "hwconfig.h" #include "get_bits.h" #include "idctdsp.h" #include "proresdata.h" #include "thread.h" +#include "hwconfig.h" +#include "hwaccel_internal.h" #include "proresdec_raw.h" @@ -312,6 +316,9 @@ static enum AVPixelFormat get_pixel_format(AVCodecContext *avctx, enum AVPixelFormat pix_fmt) { enum AVPixelFormat pix_fmts[] = { +#if CONFIG_PRORES_RAW_VULKAN_HWACCEL + AV_PIX_FMT_VULKAN, +#endif pix_fmt, AV_PIX_FMT_NONE, }; @@ -514,6 +521,9 @@ const FFCodec ff_prores_raw_decoder = { AV_CODEC_CAP_SLICE_THREADS, .caps_internal = FF_CODEC_CAP_INIT_CLEANUP, .hw_configs = (const AVCodecHWConfigInternal 
*const []) { +#if CONFIG_PRORES_RAW_VULKAN_HWACCEL + HWACCEL_VULKAN(prores_raw), +#endif NULL }, }; diff --git a/libavcodec/vulkan/Makefile b/libavcodec/vulkan/Makefile index 729cb4f15c..d8e1471fa6 100644 --- a/libavcodec/vulkan/Makefile +++ b/libavcodec/vulkan/Makefile @@ -14,6 +14,9 @@ OBJS-$(CONFIG_FFV1_VULKAN_HWACCEL) += vulkan/common.o \ vulkan/ffv1_common.o vulkan/ffv1_reset.o \ vulkan/ffv1_dec_setup.o vulkan/ffv1_dec.o +OBJS-$(CONFIG_PRORES_RAW_VULKAN_HWACCEL) += vulkan/common.o \ + vulkan/prores_raw.o + VULKAN = $(subst $(SRC_PATH)/,,$(wildcard $(SRC_PATH)/libavcodec/vulkan/*.comp)) .SECONDARY: $(VULKAN:.comp=.c) libavcodec/vulkan/%.c: TAG = VULKAN diff --git a/libavcodec/vulkan/prores_raw.comp b/libavcodec/vulkan/prores_raw.comp new file mode 100644 index 0000000000..fe0606e0b8 --- /dev/null +++ b/libavcodec/vulkan/prores_raw.comp @@ -0,0 +1,348 @@ +/* + * ProRes RAW decoder + * + * Copyright (c) 2025 Lynne <dev@lynne.ee> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#define I16(x) (int16_t(x)) + +#define COMP_ID (gl_LocalInvocationID.z) +#define BLOCK_ID (gl_LocalInvocationID.y) +#define ROW_ID (gl_LocalInvocationID.x) + +GetBitContext gb; +shared float block[gl_WorkGroupSize.z][16][64] = { }; +shared float transposed[gl_WorkGroupSize.z][16][64]; + +void idct8_horiz(const uint row_id) +{ + float t0, t1, t2, t3, t4, t5, t6, t7, u8; + float u0, u1, u2, u3, u4, u5, u6, u7; + + /* Input */ + t0 = block[COMP_ID][BLOCK_ID][8*row_id + 0]; + u4 = block[COMP_ID][BLOCK_ID][8*row_id + 1]; + t2 = block[COMP_ID][BLOCK_ID][8*row_id + 2]; + u6 = block[COMP_ID][BLOCK_ID][8*row_id + 3]; + t1 = block[COMP_ID][BLOCK_ID][8*row_id + 4]; + u5 = block[COMP_ID][BLOCK_ID][8*row_id + 5]; + t3 = block[COMP_ID][BLOCK_ID][8*row_id + 6]; + u7 = block[COMP_ID][BLOCK_ID][8*row_id + 7]; + + /* Embedded scaled inverse 4-point Type-II DCT */ + u0 = t0 + t1; + u1 = t0 - t1; + u3 = t2 + t3; + u2 = (t2 - t3)*(1.4142135623730950488016887242097f) - u3; + t0 = u0 + u3; + t3 = u0 - u3; + t1 = u1 + u2; + t2 = u1 - u2; + + /* Embedded scaled inverse 4-point Type-IV DST */ + t5 = u5 + u6; + t6 = u5 - u6; + t7 = u4 + u7; + t4 = u4 - u7; + u7 = t7 + t5; + u5 = (t7 - t5)*(1.4142135623730950488016887242097f); + u8 = (t4 + t6)*(1.8477590650225735122563663787936f); + u4 = u8 - t4*(1.0823922002923939687994464107328f); + u6 = u8 - t6*(2.6131259297527530557132863468544f); + t7 = u7; + t6 = t7 - u6; + t5 = t6 + u5; + t4 = t5 - u4; + + /* Butterflies */ + u0 = t0 + t7; + u7 = t0 - t7; + u6 = t1 + t6; + u1 = t1 - t6; + u2 = t2 + t5; + u5 = t2 - t5; + u4 = t3 + t4; + u3 = t3 - t4; + + /* Output */ + transposed[COMP_ID][BLOCK_ID][0*8 + row_id] = u0; + transposed[COMP_ID][BLOCK_ID][1*8 + row_id] = u1; + transposed[COMP_ID][BLOCK_ID][2*8 + row_id] = u2; + 
transposed[COMP_ID][BLOCK_ID][3*8 + row_id] = u3; + transposed[COMP_ID][BLOCK_ID][4*8 + row_id] = u4; + transposed[COMP_ID][BLOCK_ID][5*8 + row_id] = u5; + transposed[COMP_ID][BLOCK_ID][6*8 + row_id] = u6; + transposed[COMP_ID][BLOCK_ID][7*8 + row_id] = u7; +} + +void idct8_vert(const uint row_id) +{ + float t0, t1, t2, t3, t4, t5, t6, t7, u8; + float u0, u1, u2, u3, u4, u5, u6, u7; + + /* Input */ + t0 = transposed[COMP_ID][BLOCK_ID][8*row_id + 0] + 0.5f; // NOTE + u4 = transposed[COMP_ID][BLOCK_ID][8*row_id + 1]; + t2 = transposed[COMP_ID][BLOCK_ID][8*row_id + 2]; + u6 = transposed[COMP_ID][BLOCK_ID][8*row_id + 3]; + t1 = transposed[COMP_ID][BLOCK_ID][8*row_id + 4]; + u5 = transposed[COMP_ID][BLOCK_ID][8*row_id + 5]; + t3 = transposed[COMP_ID][BLOCK_ID][8*row_id + 6]; + u7 = transposed[COMP_ID][BLOCK_ID][8*row_id + 7]; + + /* Embedded scaled inverse 4-point Type-II DCT */ + u0 = t0 + t1; + u1 = t0 - t1; + u3 = t2 + t3; + u2 = (t2 - t3)*(1.4142135623730950488016887242097f) - u3; + t0 = u0 + u3; + t3 = u0 - u3; + t1 = u1 + u2; + t2 = u1 - u2; + + /* Embedded scaled inverse 4-point Type-IV DST */ + t5 = u5 + u6; + t6 = u5 - u6; + t7 = u4 + u7; + t4 = u4 - u7; + u7 = t7 + t5; + u5 = (t7 - t5)*(1.4142135623730950488016887242097f); + u8 = (t4 + t6)*(1.8477590650225735122563663787936f); + u4 = u8 - t4*(1.0823922002923939687994464107328f); + u6 = u8 - t6*(2.6131259297527530557132863468544f); + t7 = u7; + t6 = t7 - u6; + t5 = t6 + u5; + t4 = t5 - u4; + + /* Butterflies */ + u0 = t0 + t7; + u7 = t0 - t7; + u6 = t1 + t6; + u1 = t1 - t6; + u2 = t2 + t5; + u5 = t2 - t5; + u4 = t3 + t4; + u3 = t3 - t4; + + /* Output */ + block[COMP_ID][BLOCK_ID][0*8 + row_id] = u0; + block[COMP_ID][BLOCK_ID][1*8 + row_id] = u1; + block[COMP_ID][BLOCK_ID][2*8 + row_id] = u2; + block[COMP_ID][BLOCK_ID][3*8 + row_id] = u3; + block[COMP_ID][BLOCK_ID][4*8 + row_id] = u4; + block[COMP_ID][BLOCK_ID][5*8 + row_id] = u5; + block[COMP_ID][BLOCK_ID][6*8 + row_id] = u6; + block[COMP_ID][BLOCK_ID][7*8 + 
row_id] = u7; +} + +#define TODCCODEBOOK(x) (((x) & 1) + (x) >> 1) + +int16_t get_value(int16_t codebook) +{ + const int16_t switch_bits = codebook >> 8; + const int16_t rice_order = codebook & I16(0xf); + const int16_t exp_order = (codebook >> 4) & I16(0xf); + + uint b = show_bits(gb, 32); + if (expectEXT(b == 0, false)) + return I16(0); + int16_t q = I16(31) - I16(findMSB(b)); + + if ((b & 0x80000000) > 0) { + skip_bits(gb, 1 + rice_order); + return I16((b & 0x7FFFFFFF) >> (31 - rice_order)); + } + + if (q <= switch_bits) { + skip_bits(gb, q + rice_order + 1); + return I16((q << rice_order) + + (((b << (q + 1)) >> 1) >> (31 - rice_order))); + } + + int16_t bits = exp_order + (q << 1) - switch_bits; + skip_bits(gb, bits); + return I16((b >> (32 - bits)) + + ((switch_bits + 1) << rice_order) - + (1 << exp_order)); +} + +void read_dc_vals(const uint nb_blocks) +{ + int16_t dc; + int16_t prev_dc = I16(0), dc_add = I16(0), sign = I16(0); + + /* Special handling for first block */ + dc = get_value(I16(700)); + prev_dc = int16_t(((dc & 1) + (dc >> 1) ^ -int((dc & 1))) + (dc & 1)); + block[COMP_ID][0][0] = prev_dc; + + for (uint n = 1; n < nb_blocks; n++) { + if (expectEXT(left_bits(gb) <= 0, false)) + break; + + int16_t dc_codebook; + if ((n & 15) == 1) + dc_codebook = I16(100); + else + dc_codebook = dc_cb[min(TODCCODEBOOK(dc), 13 - 1)]; + + dc = get_value(dc_codebook); + + sign = sign ^ dc & int16_t(1); + dc_add = (-sign ^ I16(TODCCODEBOOK(dc))) + sign; + sign = I16(dc_add < 0); + prev_dc += dc_add; + + block[COMP_ID][n][0] = prev_dc; + } +} + +void read_ac_vals(const uint nb_blocks) +{ + uint8_t idx; + const uint nb_codes = nb_blocks << 6; + const uint log2_nb_blocks = findMSB(nb_blocks); + const uint block_mask = (1 << log2_nb_blocks) - 1; + + int16_t sign; + int16_t ac, rn, ln; + int16_t ac_codebook = I16(49); + int16_t rn_codebook = I16( 0); + int16_t ln_codebook = I16(66); + int16_t val; + + for (uint n = nb_blocks; n <= nb_codes;) { + if (expectEXT(left_bits(gb) 
<= 0, false)) + break; + + ln = get_value(ln_codebook); + for (uint i = 0; i < ln; i++) { + if (expectEXT(left_bits(gb) <= 0, false)) + break; + + if (expectEXT((n + i) >= nb_codes, false)) + break; + + ac = get_value(ac_codebook); + ac_codebook = ac_cb[min(ac, 95 - 1)]; + sign = -int16_t(get_bits(gb, 1)); + + idx = scan[(n + i) >> log2_nb_blocks]; + val = int16_t(((ac + I16(1)) ^ sign) - sign); + block[COMP_ID][(n + i) & block_mask][idx] = val; + } + + n += ln; + if (expectEXT(n >= nb_codes, false)) + break; + + rn = get_value(rn_codebook); + rn_codebook = rn_cb[min(rn, 28 - 1)]; + + n += rn + 1; + if (expectEXT(n >= nb_codes, false)) + break; + + if (expectEXT(left_bits(gb) <= 0, false)) + break; + + ac = get_value(ac_codebook); + sign = -int16_t(get_bits(gb, 1)); + + idx = scan[n >> log2_nb_blocks]; + val = int16_t(((ac + 1) ^ sign) - sign); + block[COMP_ID][n & block_mask][idx] = val; + + ac_codebook = ac_cb[min(ac, 95 - 1)]; + ln_codebook = ln_cb[min(ac, 15 - 1)]; + + n++; + } +} + +void main(void) +{ + const uint tile_idx = gl_WorkGroupID.y*gl_NumWorkGroups.x + gl_WorkGroupID.x; + TileData td = tile_data[tile_idx]; + + if (expectEXT(td.pos.x >= frame_size.x, false)) + return; + + uint64_t pkt_offset = uint64_t(pkt_data) + td.offset; + u8vec2buf hdr_data = u8vec2buf(pkt_offset); + int qscale = int(pack16(hdr_data[0].v.yx)); + + ivec4 size = ivec4(td.size, + pack16(hdr_data[2].v.yx), + pack16(hdr_data[1].v.yx), + pack16(hdr_data[3].v.yx)); + size[0] = size[0] - size[1] - size[2] - size[3] - 8; + if (expectEXT(size[0] < 0, false)) + return; + + const ivec2 offs = td.pos + ivec2(COMP_ID & 1, COMP_ID >> 1); + const float n = float(qscale - 16384) / (2.0f*4096.0f); + const uint w = min(tile_size.x, frame_size.x - td.pos.x) / 2; + const uint nb_blocks = w / 8; + + const ivec4 comp_offset = ivec4(size[2] + size[1] + size[3], + size[2], + 0, + size[2] + size[1]); + + if (BLOCK_ID == 0 && ROW_ID == 0) { + init_get_bits(gb, u8buf(pkt_offset + 8 + comp_offset[COMP_ID]), 
+ size[COMP_ID]); + read_dc_vals(nb_blocks); + read_ac_vals(nb_blocks); + } + + barrier(); + + [[unroll]] + for (uint i = gl_LocalInvocationID.x; i < 64; i += gl_WorkGroupSize.x) + block[COMP_ID][BLOCK_ID][i] *= n * idct_8x8_scales[i]; + + barrier(); + +#ifdef PARALLEL_ROWS + idct8_horiz(ROW_ID); + + barrier(); + + idct8_vert(ROW_ID); +#else + for (uint j = 0; j < 8; j++) + idct8_horiz(j); + + barrier(); + + for (uint j = 0; j < 8; j++) + idct8_vert(j); +#endif + + barrier(); + + [[unroll]] + for (uint i = gl_LocalInvocationID.x; i < 64; i += gl_WorkGroupSize.x) + imageStore(dst, + offs + 2*ivec2(BLOCK_ID*8 + (i & 7), i >> 3), + vec4(block[COMP_ID][BLOCK_ID][i])); +} diff --git a/libavcodec/vulkan_decode.c b/libavcodec/vulkan_decode.c index 7310ba1547..857f16bc0a 100644 --- a/libavcodec/vulkan_decode.c +++ b/libavcodec/vulkan_decode.c @@ -25,7 +25,8 @@ #include "libavutil/vulkan_loader.h" #define DECODER_IS_SDR(codec_id) \ - ((codec_id) == AV_CODEC_ID_FFV1) + (((codec_id) == AV_CODEC_ID_FFV1) || \ + ((codec_id) == AV_CODEC_ID_PRORES_RAW)) #if CONFIG_H264_VULKAN_HWACCEL extern const FFVulkanDecodeDescriptor ff_vk_dec_h264_desc; @@ -39,6 +40,9 @@ extern const FFVulkanDecodeDescriptor ff_vk_dec_av1_desc; #if CONFIG_FFV1_VULKAN_HWACCEL extern const FFVulkanDecodeDescriptor ff_vk_dec_ffv1_desc; #endif +#if CONFIG_PRORES_RAW_VULKAN_HWACCEL +extern const FFVulkanDecodeDescriptor ff_vk_dec_prores_raw_desc; +#endif static const FFVulkanDecodeDescriptor *dec_descs[] = { #if CONFIG_H264_VULKAN_HWACCEL @@ -53,6 +57,9 @@ static const FFVulkanDecodeDescriptor *dec_descs[] = { #if CONFIG_FFV1_VULKAN_HWACCEL &ff_vk_dec_ffv1_desc, #endif +#if CONFIG_PRORES_RAW_VULKAN_HWACCEL + &ff_vk_dec_prores_raw_desc, +#endif }; static const FFVulkanDecodeDescriptor *get_codecdesc(enum AVCodecID codec_id) diff --git a/libavcodec/vulkan_prores_raw.c b/libavcodec/vulkan_prores_raw.c new file mode 100644 index 0000000000..31320ffa18 --- /dev/null +++ b/libavcodec/vulkan_prores_raw.c @@ -0,0 +1,498 
@@ +/* + * Copyright (c) 2025 Lynne <dev@lynne.ee> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "vulkan_decode.h" +#include "hwaccel_internal.h" + +#include "proresdec_raw.h" +#include "libavutil/vulkan_spirv.h" +#include "libavutil/mem.h" + +extern const char *ff_source_common_comp; +extern const char *ff_source_prores_raw_comp; + +const FFVulkanDecodeDescriptor ff_vk_dec_prores_raw_desc = { + .codec_id = AV_CODEC_ID_PRORES_RAW, + .decode_extension = FF_VK_EXT_PUSH_DESCRIPTOR, + .queue_flags = VK_QUEUE_COMPUTE_BIT, +}; + +typedef struct ProResRAWVulkanDecodePicture { + FFVulkanDecodePicture vp; + + AVBufferRef *tile_data; + uint32_t nb_tiles; +} ProResRAWVulkanDecodePicture; + +typedef struct ProResRAWVulkanDecodeContext { + FFVulkanShader decode[2]; + + AVBufferPool *tile_data_pool; + + FFVkBuffer uniform_buf; +} ProResRAWVulkanDecodeContext; + +typedef struct DecodePushData { + VkDeviceAddress tile_data; + VkDeviceAddress pkt_data; + uint32_t frame_size[2]; + uint32_t tile_size[2]; +} DecodePushData; + +typedef struct TileData { + int32_t pos[2]; + uint32_t offset; + uint32_t size; +} TileData; + +static int vk_prores_raw_start_frame(AVCodecContext *avctx, + const AVBufferRef *buffer_ref, + av_unused const uint8_t *buffer, + 
av_unused uint32_t size) +{ + int err; + FFVulkanDecodeContext *dec = avctx->internal->hwaccel_priv_data; + FFVulkanDecodeShared *ctx = dec->shared_ctx; + ProResRAWVulkanDecodeContext *prv = ctx->sd_ctx; + ProResRAWContext *prr = avctx->priv_data; + + ProResRAWVulkanDecodePicture *pp = prr->hwaccel_picture_private; + FFVulkanDecodePicture *vp = &pp->vp; + + /* Host map the input tile data if supported */ + if (ctx->s.extensions & FF_VK_EXT_EXTERNAL_HOST_MEMORY) + ff_vk_host_map_buffer(&ctx->s, &vp->slices_buf, buffer_ref->data, + buffer_ref, + VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | + VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT); + + /* Allocate tile data */ + err = ff_vk_get_pooled_buffer(&ctx->s, &prv->tile_data_pool, + &pp->tile_data, + VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | + VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT, + NULL, prr->nb_tiles*sizeof(TileData), + VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT | + VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT); + if (err < 0) + return err; + + /* Prepare frame to be used */ + err = ff_vk_decode_prepare_frame_sdr(dec, prr->frame, vp, 1, + FF_VK_REP_FLOAT, 0); + if (err < 0) + return err; + + return 0; +} + +static int vk_prores_raw_decode_slice(AVCodecContext *avctx, + const uint8_t *data, + uint32_t size) +{ + ProResRAWContext *prr = avctx->priv_data; + + ProResRAWVulkanDecodePicture *pp = prr->hwaccel_picture_private; + FFVulkanDecodePicture *vp = &pp->vp; + + FFVkBuffer *tile_data_buf = (FFVkBuffer *)pp->tile_data->data; + TileData *td = (TileData *)tile_data_buf->mapped_mem; + FFVkBuffer *slices_buf = vp->slices_buf ? 
(FFVkBuffer *)vp->slices_buf->data : NULL; + + td[pp->nb_tiles].pos[0] = prr->tiles[pp->nb_tiles].x; + td[pp->nb_tiles].pos[1] = prr->tiles[pp->nb_tiles].y; + td[pp->nb_tiles].size = size; + + if (vp->slices_buf && slices_buf->host_ref) { + td[pp->nb_tiles].offset = data - slices_buf->mapped_mem; + pp->nb_tiles++; + } else { + int err; + td[pp->nb_tiles].offset = vp->slices_size; + err = ff_vk_decode_add_slice(avctx, vp, data, size, 0, + &pp->nb_tiles, NULL); + if (err < 0) + return err; + } + + return 0; +} + +static int vk_prores_raw_end_frame(AVCodecContext *avctx) +{ + int err; + FFVulkanDecodeContext *dec = avctx->internal->hwaccel_priv_data; + FFVulkanDecodeShared *ctx = dec->shared_ctx; + FFVulkanFunctions *vk = &ctx->s.vkfn; + + ProResRAWContext *prr = avctx->priv_data; + ProResRAWVulkanDecodeContext *prv = ctx->sd_ctx; + + DecodePushData pd_decode; + FFVulkanShader *decode_shader; + + ProResRAWVulkanDecodePicture *pp = prr->hwaccel_picture_private; + FFVulkanDecodePicture *vp = &pp->vp; + + FFVkBuffer *slices_buf = (FFVkBuffer *)vp->slices_buf->data; + FFVkBuffer *tile_data = (FFVkBuffer *)pp->tile_data->data; + + VkImageMemoryBarrier2 img_bar[8]; + int nb_img_bar = 0; + + FFVkExecContext *exec = ff_vk_exec_get(&ctx->s, &ctx->exec_pool); + ff_vk_exec_start(&ctx->s, exec); + + /* Prepare deps */ + RET(ff_vk_exec_add_dep_frame(&ctx->s, exec, prr->frame, + VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT, + VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT)); + + RET(ff_vk_exec_add_dep_buf(&ctx->s, exec, &pp->tile_data, 1, 0)); + pp->tile_data = NULL; + RET(ff_vk_exec_add_dep_buf(&ctx->s, exec, &vp->slices_buf, 1, 0)); + vp->slices_buf = NULL; + + ff_vk_frame_barrier(&ctx->s, exec, prr->frame, img_bar, &nb_img_bar, + VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT, + VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT, + VK_ACCESS_2_TRANSFER_WRITE_BIT, + VK_IMAGE_LAYOUT_GENERAL, + VK_QUEUE_FAMILY_IGNORED); + + vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) { + .sType = 
VK_STRUCTURE_TYPE_DEPENDENCY_INFO, + .pImageMemoryBarriers = img_bar, + .imageMemoryBarrierCount = nb_img_bar, + }); + nb_img_bar = 0; + + decode_shader = &prv->decode[prr->version]; + ff_vk_shader_update_img_array(&ctx->s, exec, decode_shader, + prr->frame, vp->view.out, + 0, 0, + VK_IMAGE_LAYOUT_GENERAL, + VK_NULL_HANDLE); + + ff_vk_exec_bind_shader(&ctx->s, exec, decode_shader); + pd_decode = (DecodePushData) { + .tile_data = tile_data->address, + .pkt_data = slices_buf->address, + .frame_size[0] = avctx->width, + .frame_size[1] = avctx->height, + .tile_size[0] = prr->tw, + .tile_size[1] = prr->th, + }; + ff_vk_shader_update_push_const(&ctx->s, exec, decode_shader, + VK_SHADER_STAGE_COMPUTE_BIT, + 0, sizeof(pd_decode), &pd_decode); + + vk->CmdDispatch(exec->buf, prr->nb_tw, prr->nb_th, 1); + + err = ff_vk_exec_submit(&ctx->s, exec); + if (err < 0) + return err; + +fail: + return 0; +} + +static int init_decode_shader(ProResRAWContext *prr, FFVulkanContext *s, + FFVkExecPool *pool, FFVkSPIRVCompiler *spv, + FFVulkanShader *shd, int version) +{ + int err; + FFVulkanDescriptorSetBinding *desc_set; + int parallel_rows = 1; + + uint8_t *spv_data; + size_t spv_len; + void *spv_opaque = NULL; + + if (s->props.properties.limits.maxComputeWorkGroupInvocations < 512 || + s->props.properties.deviceType == VK_PHYSICAL_DEVICE_TYPE_INTEGRATED_GPU) + parallel_rows = 0; + + RET(ff_vk_shader_init(s, shd, "prores_raw", + VK_SHADER_STAGE_COMPUTE_BIT, + (const char *[]) { "GL_EXT_buffer_reference", + "GL_EXT_buffer_reference2", + "GL_EXT_null_initializer", + "GL_AMD_gpu_shader_half_float" }, 4, + parallel_rows ? 8 : 1 /* 8x8 transforms, 8-point width */, + version == 0 ? 
8 : 16 /* Horizontal blocks */, + 4 /* Components */, + 0)); + + if (parallel_rows) + GLSLC(0, #define PARALLEL_ROWS ); + + /* Common codec header */ + GLSLD(ff_source_common_comp); + + GLSLC(0, layout(buffer_reference, buffer_reference_align = 16) buffer TileData { ); + GLSLC(1, ivec2 pos; ); + GLSLC(1, uint offset; ); + GLSLC(1, uint size; ); + GLSLC(0, }; ); + GLSLC(0, ); + GLSLC(0, layout(push_constant, scalar) uniform pushConstants { ); + GLSLC(1, TileData tile_data; ); + GLSLC(1, u8buf pkt_data; ); + GLSLC(1, uvec2 frame_size; ); + GLSLC(1, uvec2 tile_size; ); + GLSLC(0, }; ); + GLSLC(0, ); + ff_vk_shader_add_push_const(shd, 0, sizeof(DecodePushData), + VK_SHADER_STAGE_COMPUTE_BIT); + + desc_set = (FFVulkanDescriptorSetBinding []) { + { + .name = "dst", + .type = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, + .mem_layout = "r16", + .mem_quali = "writeonly", + .dimensions = 2, + .stages = VK_SHADER_STAGE_COMPUTE_BIT, + }, + }; + RET(ff_vk_shader_add_descriptor_set(s, shd, desc_set, 1, 0, 0)); + + desc_set = (FFVulkanDescriptorSetBinding []) { + { + .name = "dct_scale_buf", + .type = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, + .stages = VK_SHADER_STAGE_COMPUTE_BIT, + .mem_layout = "scalar", + .buf_content = "float idct_8x8_scales[64];", + }, + { + .name = "scan_buf", + .type = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, + .stages = VK_SHADER_STAGE_COMPUTE_BIT, + .mem_layout = "scalar", + .buf_content = "uint8_t scan[64];", + }, + { + .name = "dc_cb_buf", + .type = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, + .stages = VK_SHADER_STAGE_COMPUTE_BIT, + .mem_layout = "scalar", + .buf_content = "int16_t dc_cb[13];", + }, + { + .name = "ac_cb_buf", + .type = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, + .stages = VK_SHADER_STAGE_COMPUTE_BIT, + .mem_layout = "scalar", + .buf_content = "int16_t ac_cb[95];", + }, + { + .name = "rn_cb_buf", + .type = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, + .stages = VK_SHADER_STAGE_COMPUTE_BIT, + .mem_layout = "scalar", + .buf_content = "int16_t rn_cb[28];", + }, + { + .name = 
"ln_cb_buf", + .type = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, + .stages = VK_SHADER_STAGE_COMPUTE_BIT, + .mem_layout = "scalar", + .buf_content = "int16_t ln_cb[15];", + }, + }; + RET(ff_vk_shader_add_descriptor_set(s, shd, desc_set, 6, 1, 0)); + + GLSLD(ff_source_prores_raw_comp); + + RET(spv->compile_shader(s, spv, shd, &spv_data, &spv_len, "main", + &spv_opaque)); + RET(ff_vk_shader_link(s, shd, spv_data, spv_len, "main")); + + RET(ff_vk_shader_register_exec(s, pool, shd)); + +fail: + if (spv_opaque) + spv->free_shader(spv, &spv_opaque); + + return err; +} + +static void vk_decode_prores_raw_uninit(FFVulkanDecodeShared *ctx) +{ + ProResRAWVulkanDecodeContext *fv = ctx->sd_ctx; + + ff_vk_shader_free(&ctx->s, &fv->decode[0]); + ff_vk_shader_free(&ctx->s, &fv->decode[1]); + + ff_vk_free_buf(&ctx->s, &fv->uniform_buf); + + av_buffer_pool_uninit(&fv->tile_data_pool); + + av_freep(&fv); +} + +static int vk_decode_prores_raw_init(AVCodecContext *avctx) +{ + int err; + ProResRAWContext *prr = avctx->priv_data; + + FFVulkanDecodeContext *dec = avctx->internal->hwaccel_priv_data; + FFVulkanDecodeShared *ctx = NULL; + size_t ua; + + ProResRAWVulkanDecodeContext *prv; + FFVkSPIRVCompiler *spv; + + uint8_t *uniform_buf; + float *dct_scale_buf; + double idct_8_scales[8] = { + cos(4.0*M_PI/16.0)/2.0, + cos(1.0*M_PI/16.0)/2.0, + cos(2.0*M_PI/16.0)/2.0, + cos(3.0*M_PI/16.0)/2.0, + cos(4.0*M_PI/16.0)/2.0, + cos(5.0*M_PI/16.0)/2.0, + cos(6.0*M_PI/16.0)/2.0, + cos(7.0*M_PI/16.0)/2.0, + }; + uint8_t *scan_buf; + size_t cb_size[5] = { + 13*sizeof(int16_t), + 95*sizeof(int16_t), + 28*sizeof(int16_t), + 15*sizeof(int16_t), + }; + size_t cb_offset[5]; + + spv = ff_vk_spirv_init(); + if (!spv) { + av_log(avctx, AV_LOG_ERROR, "Unable to initialize SPIR-V compiler!\n"); + return AVERROR_EXTERNAL; + } + + err = ff_vk_decode_init(avctx); + if (err < 0) + return err; + ctx = dec->shared_ctx; + + prv = ctx->sd_ctx = av_mallocz(sizeof(*prv)); + if (!prv) { + err = AVERROR(ENOMEM); + goto fail; + } 
+ + ctx->sd_ctx_free = &vk_decode_prores_raw_uninit; + + ua = ctx->s.props.properties.limits.minUniformBufferOffsetAlignment; + cb_offset[0] = 64*sizeof(float) + 64*sizeof(uint8_t); + cb_offset[1] = cb_offset[0] + FFALIGN(cb_size[0], ua); + cb_offset[2] = cb_offset[1] + FFALIGN(cb_size[1], ua); + cb_offset[3] = cb_offset[2] + FFALIGN(cb_size[2], ua); + cb_offset[4] = cb_offset[3] + FFALIGN(cb_size[3], ua); + + /* Setup decode shader */ + RET(init_decode_shader(prr, &ctx->s, &ctx->exec_pool, spv, &prv->decode[0], 0)); + RET(init_decode_shader(prr, &ctx->s, &ctx->exec_pool, spv, &prv->decode[1], 1)); + + RET(ff_vk_create_buf(&ctx->s, &prv->uniform_buf, + 64*sizeof(float) + 64*sizeof(uint8_t) + cb_offset[4] + 256, + NULL, NULL, + VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT | + VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT, + VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT | + VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT)); + + RET(ff_vk_map_buffer(&ctx->s, &prv->uniform_buf, &uniform_buf, 0)); + + dct_scale_buf = (float *)uniform_buf; + for (int i = 0; i < 64; i++) + dct_scale_buf[i] = (float)(idct_8_scales[i >> 3] * idct_8_scales[i & 7]); + + scan_buf = uniform_buf + 64*sizeof(float); + for (int i = 0; i < 64; i++) + scan_buf[i] = prr->scan[i]; + + memcpy(uniform_buf + cb_offset[0], ff_prores_raw_dc_cb, + sizeof(ff_prores_raw_dc_cb)); + memcpy(uniform_buf + cb_offset[1], ff_prores_raw_ac_cb, + sizeof(ff_prores_raw_ac_cb)); + memcpy(uniform_buf + cb_offset[2], ff_prores_raw_rn_cb, + sizeof(ff_prores_raw_rn_cb)); + memcpy(uniform_buf + cb_offset[3], ff_prores_raw_ln_cb, + sizeof(ff_prores_raw_ln_cb)); + + RET(ff_vk_unmap_buffer(&ctx->s, &prv->uniform_buf, 1)); + + for (int i = 0; i < 2; i++) { + RET(ff_vk_shader_update_desc_buffer(&ctx->s, &ctx->exec_pool.contexts[0], + &prv->decode[i], 1, 0, 0, + &prv->uniform_buf, + 0, 64*sizeof(float), + VK_FORMAT_UNDEFINED)); + RET(ff_vk_shader_update_desc_buffer(&ctx->s, &ctx->exec_pool.contexts[0], + &prv->decode[i], 1, 1, 0, + &prv->uniform_buf, + 
64*sizeof(float), 64*sizeof(uint8_t), + VK_FORMAT_UNDEFINED)); + for (int j = 0; j < 4; j++) + RET(ff_vk_shader_update_desc_buffer(&ctx->s, &ctx->exec_pool.contexts[0], + &prv->decode[i], 1, 2 + j, 0, + &prv->uniform_buf, + cb_offset[j], cb_size[j], + VK_FORMAT_UNDEFINED)); + } + +fail: + spv->uninit(&spv); + + return err; +} + +static void vk_prores_raw_free_frame_priv(AVRefStructOpaque _hwctx, void *data) +{ + AVHWDeviceContext *dev_ctx = _hwctx.nc; + + ProResRAWVulkanDecodePicture *pp = data; + FFVulkanDecodePicture *vp = &pp->vp; + + ff_vk_decode_free_frame(dev_ctx, vp); +} + +const FFHWAccel ff_prores_raw_vulkan_hwaccel = { + .p.name = "prores_raw_vulkan", + .p.type = AVMEDIA_TYPE_VIDEO, + .p.id = AV_CODEC_ID_PRORES_RAW, + .p.pix_fmt = AV_PIX_FMT_VULKAN, + .start_frame = &vk_prores_raw_start_frame, + .decode_slice = &vk_prores_raw_decode_slice, + .end_frame = &vk_prores_raw_end_frame, + .free_frame_priv = &vk_prores_raw_free_frame_priv, + .frame_priv_data_size = sizeof(ProResRAWVulkanDecodePicture), + .init = &vk_decode_prores_raw_init, + .update_thread_context = &ff_vk_update_thread_context, + .decode_params = &ff_vk_params_invalidate, + .flush = &ff_vk_decode_flush, + .uninit = &ff_vk_decode_uninit, + .frame_params = &ff_vk_frame_params, + .priv_data_size = sizeof(FFVulkanDecodeContext), + .caps_internal = HWACCEL_CAP_ASYNC_SAFE | HWACCEL_CAP_THREAD_SAFE, +}; -- 2.49.0.395.g12beb8f557c _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
prev parent reply other threads:[~2025-07-10 15:15 UTC|newest] Thread overview: 7+ messages / expand[flat|nested] mbox.gz Atom feed top 2025-07-10 15:13 [FFmpeg-devel] [PATCH 1/7] vf_libplacebo: add support for specifying a LUT for the input Lynne 2025-07-10 15:13 ` [FFmpeg-devel] [PATCH 2/7] hwcontext_vulkan: temporarily disable host_image_copy Lynne 2025-07-10 15:13 ` [FFmpeg-devel] [PATCH 3/7] vulkan: add support for 16-bit RGGB Bayer pixfmt Lynne 2025-07-10 15:13 ` [FFmpeg-devel] [PATCH 4/7] lavc/vulkan/common: sign-ify lengths Lynne 2025-07-10 15:13 ` [FFmpeg-devel] [PATCH 5/7] lavc: add codec ID and profiles for ProRes RAW Lynne 2025-07-10 15:13 ` [FFmpeg-devel] [PATCH 6/7] lavc: add a ProRes RAW decoder Lynne 2025-07-10 15:13 ` Lynne [this message]
Reply instructions: You may reply publicly to this message via plain-text email using any one of the following methods: * Save the following mbox file, import it into your mail client, and reply-to-all from there: mbox Avoid top-posting and favor interleaved quoting: https://en.wikipedia.org/wiki/Posting_style#Interleaved_style * Reply using the --to, --cc, and --in-reply-to switches of git-send-email(1): git send-email \ --in-reply-to=20250710151349.1157547-7-dev@lynne.ee \ --to=dev@lynne.ee \ --cc=ffmpeg-devel@ffmpeg.org \ /path/to/YOUR_REPLY https://kernel.org/pub/software/scm/git/docs/git-send-email.html * If your mail client supports setting the In-Reply-To header via mailto: links, try the mailto: link
Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel This inbox may be cloned and mirrored by anyone: git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git # If you have public-inbox 1.1+ installed, you may # initialize and index your mirror using the following commands: public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \ ffmpegdev@gitmailbox.com public-inbox-index ffmpegdev Example config snippet for mirrors. AGPL code for this site: git clone https://public-inbox.org/public-inbox.git