From: IndecisiveTurtle via ffmpeg-devel <ffmpeg-devel@ffmpeg.org> To: ffmpeg-devel@ffmpeg.org Cc: IndecisiveTurtle <geoster3d@gmail.com> Subject: [FFmpeg-devel] [PATCH 3/3] lavc: implement a Vulkan-based prores encoder Date: Thu, 4 Sep 2025 23:10:02 +0300 Message-ID: <20250904201002.10446-3-47210458+raphaelthegreat@users.noreply.github.com> (raw) In-Reply-To: <20250904201002.10446-1-47210458+raphaelthegreat@users.noreply.github.com> From: IndecisiveTurtle <geoster3d@gmail.com> Adds a vulkan implementation of the reference prores kostya encoder. Provides about 3-4x speedup over the CPU code --- configure | 1 + libavcodec/Makefile | 1 + libavcodec/allcodecs.c | 1 + libavcodec/proresenc_kostya_vulkan.c | 1068 +++++++++++++++++ libavcodec/vulkan/Makefile | 7 + libavcodec/vulkan/prores_ks_alpha_data.comp | 67 ++ libavcodec/vulkan/prores_ks_encode_slice.comp | 230 ++++ .../vulkan/prores_ks_estimate_slice.comp | 267 +++++ libavcodec/vulkan/prores_ks_slice_data.comp | 265 ++++ libavcodec/vulkan/prores_ks_trellis_node.comp | 177 +++ 10 files changed, 2084 insertions(+) create mode 100644 libavcodec/proresenc_kostya_vulkan.c create mode 100644 libavcodec/vulkan/prores_ks_alpha_data.comp create mode 100644 libavcodec/vulkan/prores_ks_encode_slice.comp create mode 100644 libavcodec/vulkan/prores_ks_estimate_slice.comp create mode 100644 libavcodec/vulkan/prores_ks_slice_data.comp create mode 100644 libavcodec/vulkan/prores_ks_trellis_node.comp diff --git a/configure b/configure index 7ec4c3975b..4db8a7c581 100755 --- a/configure +++ b/configure @@ -3099,6 +3099,7 @@ prores_decoder_select="blockdsp idctdsp" prores_encoder_select="fdctdsp" prores_aw_encoder_select="fdctdsp" prores_ks_encoder_select="fdctdsp" +prores_ks_vulkan_encoder_select="vulkan spirv_compiler" prores_raw_decoder_select="blockdsp idctdsp" qcelp_decoder_select="lsp" qdm2_decoder_select="mpegaudiodsp" diff --git a/libavcodec/Makefile b/libavcodec/Makefile index d8e1ac5a54..1964c787d7 100644 --- 
a/libavcodec/Makefile +++ b/libavcodec/Makefile @@ -639,6 +639,7 @@ OBJS-$(CONFIG_PRORES_DECODER) += proresdec.o proresdsp.o proresdata.o OBJS-$(CONFIG_PRORES_ENCODER) += proresenc_anatoliy.o proresdata.o OBJS-$(CONFIG_PRORES_AW_ENCODER) += proresenc_anatoliy.o proresdata.o OBJS-$(CONFIG_PRORES_KS_ENCODER) += proresenc_kostya.o proresdata.o proresenc_kostya_common.o +OBJS-$(CONFIG_PRORES_KS_VULKAN_ENCODER) += proresenc_kostya_vulkan.o proresdata.o proresenc_kostya_common.o OBJS-$(CONFIG_PRORES_RAW_DECODER) += prores_raw.o proresdsp.o proresdata.o OBJS-$(CONFIG_PRORES_VIDEOTOOLBOX_ENCODER) += videotoolboxenc.o OBJS-$(CONFIG_PROSUMER_DECODER) += prosumer.o diff --git a/libavcodec/allcodecs.c b/libavcodec/allcodecs.c index f5ec2e01e8..1b4a5f769c 100644 --- a/libavcodec/allcodecs.c +++ b/libavcodec/allcodecs.c @@ -269,6 +269,7 @@ extern const FFCodec ff_prores_encoder; extern const FFCodec ff_prores_decoder; extern const FFCodec ff_prores_aw_encoder; extern const FFCodec ff_prores_ks_encoder; +extern const FFCodec ff_prores_ks_vulkan_encoder; extern const FFCodec ff_prores_raw_decoder; extern const FFCodec ff_prosumer_decoder; extern const FFCodec ff_psd_decoder; diff --git a/libavcodec/proresenc_kostya_vulkan.c b/libavcodec/proresenc_kostya_vulkan.c new file mode 100644 index 0000000000..6413b2f9d4 --- /dev/null +++ b/libavcodec/proresenc_kostya_vulkan.c @@ -0,0 +1,1068 @@ +/* + * Apple ProRes encoder + * + * Copyright (c) 2011 Anatoliy Wasserman + * Copyright (c) 2012 Konstantin Shishkov + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/buffer.h" +#include "libavutil/macros.h" +#include "libavutil/mem.h" +#include "libavutil/mem_internal.h" +#include "libavutil/opt.h" +#include "libavutil/pixdesc.h" +#include "libavutil/vulkan_spirv.h" +#include "libavutil/hwcontext_vulkan.h" +#include "libavutil/vulkan_loader.h" +#include "libavutil/vulkan.h" +#include "avcodec.h" +#include "codec.h" +#include "codec_internal.h" +#include "encode.h" +#include "packet.h" +#include "put_bits.h" +#include "profiles.h" +#include "bytestream.h" +#include "proresdata.h" +#include "proresenc_kostya_common.h" +#include "hwconfig.h" + +#define DCTSIZE 8 + +typedef struct ProresDataTables { + int16_t qmat[128][64]; + int16_t qmat_chroma[128][64]; + uint8_t scan[64]; + uint8_t dc_codebook[7]; + uint8_t run_to_cb[16]; + uint8_t level_to_cb[10]; +} ProresDataTables; + +typedef struct SliceDataInfo { + int plane; + int pictures_per_frame; + int line_add; +} SliceDataInfo; + +typedef struct EstimateSliceInfo { + int slices_per_picture; + int min_quant; + int max_quant; + int bits_per_mb; +} EstimateSliceInfo; + +typedef struct EncodeSliceInfo { + VkDeviceAddress bytestream; + VkDeviceAddress seek_table; + int num_planes; + int slices_per_picture; + int max_quant; +} EncodeSliceInfo; + +typedef struct TrellisNodeInfo { + int min_quant; + int max_quant; + int mbs_per_slice; + int bits_per_mb; +} TrellisNodeInfo; + +#define TRELLIS_WIDTH 16 +#define SCORE_LIMIT INT_MAX / 2 + +struct TrellisNode { + int prev_node; + int quant; + int bits; 
+ int score; +}; + +typedef struct SliceData { + uint32_t mbs_per_slice; + int16_t rows[MAX_PLANES * MAX_MBS_PER_SLICE * 256]; +} SliceData; + +typedef struct SliceScore { + int bits[MAX_STORED_Q][4]; + int error[MAX_STORED_Q][4]; + int total_bits[MAX_STORED_Q]; + int total_error[MAX_STORED_Q]; + int overquant; + int buf_start; + int quant; +} SliceScore; + +typedef struct VulkanEncodeProresFrameData { + /* Intermediate buffers */ + AVBufferRef *out_data_ref[2]; + AVBufferRef *slice_data_ref[2]; + AVBufferRef *slice_score_ref[2]; + AVBufferRef *frame_size_ref[2]; + + /* Copied from the source */ + int64_t pts; + int64_t duration; + void *frame_opaque; + AVBufferRef *frame_opaque_ref; + enum AVColorTransferCharacteristic color_trc; + enum AVColorSpace colorspace; + enum AVColorPrimaries color_primaries; + int key_frame; + int flags; +} VulkanEncodeProresFrameData; + +typedef struct ProresVulkanContext { + ProresContext ctx; + + /* Vulkan state */ + FFVulkanContext vkctx; + AVVulkanDeviceQueueFamily *qf; + FFVkExecPool e; + AVVulkanDeviceQueueFamily *transfer_qf; + FFVkExecPool transfer_exec_pool; + AVBufferPool *pkt_buf_pool; + AVBufferPool *slice_data_buf_pool; + AVBufferPool *slice_score_buf_pool; + AVBufferPool *frame_size_buf_pool; + + FFVulkanShader alpha_data_shd; + FFVulkanShader slice_data_shd[2]; + FFVulkanShader estimate_slice_shd; + FFVulkanShader encode_slice_shd; + FFVulkanShader trellis_node_shd; + FFVkBuffer prores_data_tables_buf; + + int *slice_quants; + SliceScore *slice_scores; + ProresDataTables *tables; + + int in_flight; + int async_depth; + AVFrame *frame; + VulkanEncodeProresFrameData *exec_ctx_info; +} ProresVulkanContext; + +extern const char *ff_source_common_comp; +extern const char *ff_source_prores_ks_alpha_data_comp; +extern const char *ff_source_prores_ks_slice_data_comp; +extern const char *ff_source_prores_ks_estimate_slice_comp; +extern const char *ff_source_prores_ks_trellis_node_comp; +extern const char 
*ff_source_prores_ks_encode_slice_comp;
+
+/**
+ * Build, compile and register one of the slice-data compute shaders (the pass
+ * the submit code uses to apply the FDCT to a colour plane).
+ *
+ * @param pv             encoder context; provides the Vulkan context, the
+ *                       execution pool and the slice geometry
+ * @param spv            SPIR-V compiler used to compile the generated GLSL
+ * @param shd            shader object to initialise
+ * @param pl_name        name given to the shader
+ * @param blocks_per_mb  Y size of the workgroup and value of BLOCKS_PER_MB
+ * @return 0 or a positive value on success, a negative AVERROR code on failure
+ */
+static int init_slice_data_pipeline(ProresVulkanContext *pv, FFVkSPIRVCompiler *spv,
+                                    FFVulkanShader* shd, const char* pl_name, int blocks_per_mb)
+{
+    int err = 0;
+    uint8_t *spv_data;
+    size_t spv_len;
+    void *spv_opaque = NULL;
+    FFVulkanContext *vkctx = &pv->vkctx;
+    FFVulkanDescriptorSetBinding *desc;
+
+    /* Workgroup: DCTSIZE x blocks_per_mb x mbs_per_slice invocations */
+    RET(ff_vk_shader_init(vkctx, shd, pl_name, VK_SHADER_STAGE_COMPUTE_BIT,
+                          NULL, 0, DCTSIZE, blocks_per_mb, pv->ctx.mbs_per_slice, 0));
+
+    av_bprintf(&shd->src, "#define DCTSIZE 8\n");
+    av_bprintf(&shd->src, "#define MAX_PLANES 4\n");
+    av_bprintf(&shd->src, "#define MAX_MBS_PER_SLICE %d\n", pv->ctx.mbs_per_slice);
+    av_bprintf(&shd->src, "#define BLOCKS_PER_MB %d\n", blocks_per_mb);
+    av_bprintf(&shd->src, "#define WIDTH_IN_MB %d\n", pv->ctx.mb_width);
+    av_bprintf(&shd->src, "struct SliceData { uint32_t mbs_per_slice; i16vec4 rows[MAX_PLANES][MAX_MBS_PER_SLICE * 4 * DCTSIZE][DCTSIZE / 4]; };\n");
+
+    desc = (FFVulkanDescriptorSetBinding []) {
+        {
+            .name = "SliceBuffer",
+            .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
+            .stages = VK_SHADER_STAGE_COMPUTE_BIT,
+            .mem_layout = "scalar",
+            .buf_content = "SliceData slices[];",
+        },
+        {
+            .name = "planes",
+            .type = VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE,
+            .dimensions = 2,
+            .elems = 3,
+            .stages = VK_SHADER_STAGE_COMPUTE_BIT,
+            .mem_layout = "r16i",
+        },
+    };
+    RET(ff_vk_shader_add_descriptor_set(vkctx, shd, desc, 2, 0, 0));
+
+    ff_vk_shader_add_push_const(shd, 0, sizeof(SliceDataInfo), VK_SHADER_STAGE_COMPUTE_BIT);
+    GLSLD(ff_source_prores_ks_slice_data_comp);
+
+    RET(spv->compile_shader(vkctx, spv, shd, &spv_data, &spv_len, "main", &spv_opaque));
+    RET(ff_vk_shader_link(vkctx, shd, spv_data, spv_len, "main"));
+    RET(ff_vk_shader_register_exec(vkctx, &pv->e, shd));
+
+fail:
+    /* Release the compiler's per-shader state on success and failure alike */
+    if (spv_opaque)
+        spv->free_shader(spv, &spv_opaque);
+
+    return err;
+}
+
+/**
+ * Build, compile and register the alpha-plane compute shader.
+ *
+ * @param pv       encoder context; provides the Vulkan context, the execution
+ *                 pool, the slice geometry and the alpha bit depth
+ * @param spv      SPIR-V compiler used to compile the generated GLSL
+ * @param shd      shader object to initialise
+ * @param pl_name  name given to the shader
+ * @return 0 or a positive value on success, a negative AVERROR code on failure
+ */
+static int init_alpha_data_pipeline(ProresVulkanContext *pv, FFVkSPIRVCompiler *spv,
+                                    FFVulkanShader* shd, const char* pl_name)
+{
+    int err = 0;
+    uint8_t *spv_data;
+    size_t spv_len;
+    void *spv_opaque = NULL;
+    FFVulkanContext *vkctx = &pv->vkctx;
+    FFVulkanDescriptorSetBinding *desc;
+
+    /* Workgroup: (2 * DCTSIZE) x (2 * DCTSIZE) invocations */
+    RET(ff_vk_shader_init(vkctx, shd, pl_name, VK_SHADER_STAGE_COMPUTE_BIT,
+                          NULL, 0, DCTSIZE * 2, DCTSIZE * 2, 1, 0));
+
+    av_bprintf(&shd->src, "#define DCTSIZE 8\n");
+    av_bprintf(&shd->src, "#define MAX_PLANES 4\n");
+    av_bprintf(&shd->src, "#define MAX_MBS_PER_SLICE %d\n", pv->ctx.mbs_per_slice);
+    av_bprintf(&shd->src, "#define WIDTH_IN_MB %d\n", pv->ctx.mb_width);
+    av_bprintf(&shd->src, "#define SLICES_PITCH %d\n", pv->ctx.slices_width);
+    av_bprintf(&shd->src, "#define ALPHA_BITS %d\n", pv->ctx.alpha_bits);
+    av_bprintf(&shd->src, "struct SliceData { uint32_t mbs_per_slice; int16_t coeffs[MAX_PLANES][MAX_MBS_PER_SLICE * 4 * DCTSIZE * DCTSIZE]; };\n");
+
+    desc = (FFVulkanDescriptorSetBinding []) {
+        {
+            .name = "SliceBuffer",
+            .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
+            .stages = VK_SHADER_STAGE_COMPUTE_BIT,
+            .mem_layout = "scalar",
+            .buf_content = "SliceData slices[];",
+        },
+        {
+            .name = "plane",
+            .type = VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE,
+            .dimensions = 2,
+            .stages = VK_SHADER_STAGE_COMPUTE_BIT,
+            .mem_layout = "r16i",
+        },
+    };
+    RET(ff_vk_shader_add_descriptor_set(vkctx, shd, desc, 2, 0, 0));
+
+    ff_vk_shader_add_push_const(shd, 0, sizeof(int), VK_SHADER_STAGE_COMPUTE_BIT);
+    GLSLD(ff_source_prores_ks_alpha_data_comp);
+
+    RET(spv->compile_shader(vkctx, spv, shd, &spv_data, &spv_len, "main", &spv_opaque));
+    RET(ff_vk_shader_link(vkctx, shd, spv_data, spv_len, "main"));
+    RET(ff_vk_shader_register_exec(vkctx, &pv->e, shd));
+
+fail:
+    /* Release the compiler's per-shader state on success and failure alike */
+    if (spv_opaque)
+        spv->free_shader(spv, &spv_opaque);
+
+    return err;
+}
+
+static int init_estimate_slice_pipeline(ProresVulkanContext *pv, FFVkSPIRVCompiler *spv,
+                                        FFVulkanShader* shd, const char* pl_name)
+{
+    int err = 0;
+    uint8_t *spv_data;
+    size_t spv_len;
+    void *spv_opaque = NULL;
+    FFVulkanContext *vkctx = &pv->vkctx;
+    FFVulkanDescriptorSetBinding *desc;
+    int subgroup_size = vkctx->subgroup_props.maxSubgroupSize;
+    int dim_x = pv->ctx.alpha_bits ?
subgroup_size : (subgroup_size / 3) * 3;
+
+    /* Without alpha the X size of the workgroup is rounded down to a multiple
+     * of three. */
+    RET(ff_vk_shader_init(vkctx, shd, pl_name, VK_SHADER_STAGE_COMPUTE_BIT,
+                          NULL, 0, dim_x, 1, 1, 0));
+
+    av_bprintf(&shd->src, "#define DCTSIZE 8\n");
+    av_bprintf(&shd->src, "#define MAX_PLANES 4\n");
+    av_bprintf(&shd->src, "#define MAX_MBS_PER_SLICE %d\n", pv->ctx.mbs_per_slice);
+    av_bprintf(&shd->src, "#define CHROMA_FACTOR %d\n", pv->ctx.chroma_factor);
+    av_bprintf(&shd->src, "#define ALPHA_BITS %d\n", pv->ctx.alpha_bits);
+    av_bprintf(&shd->src, "#define MAX_STORED_Q %d\n", MAX_STORED_Q);
+    av_bprintf(&shd->src, "#define NUM_PLANES %d\n", pv->ctx.num_planes);
+    av_bprintf(&shd->src, "struct SliceData { uint32_t mbs_per_slice; int16_t coeffs[MAX_PLANES][MAX_MBS_PER_SLICE * 4 * DCTSIZE * DCTSIZE]; };\n");
+    av_bprintf(&shd->src, "struct SliceScore { ivec4 bits[16]; ivec4 score[16]; int total_bits[16]; int total_score[16]; uint overquant; int buf_start; int quant; };\n");
+
+    desc = (FFVulkanDescriptorSetBinding []) {
+        {
+            .name = "SliceBuffer",
+            .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
+            .stages = VK_SHADER_STAGE_COMPUTE_BIT,
+            .mem_layout = "scalar",
+            .buf_content = "SliceData slices[];",
+        },
+        {
+            .name = "SliceScores",
+            .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
+            .stages = VK_SHADER_STAGE_COMPUTE_BIT,
+            .mem_layout = "scalar",
+            .buf_content = "SliceScore scores[];",
+        },
+        {
+            .name = "ProresDataTables",
+            .type = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER,
+            .stages = VK_SHADER_STAGE_COMPUTE_BIT,
+            .mem_layout = "scalar",
+            .buf_content = "int16_t qmat[128][64]; int16_t qmat_chroma[128][64]; uint8_t scan[64]; "
+                           "uint8_t dc_codebook[7]; uint8_t run_to_cb[16]; uint8_t level_to_cb[10];",
+        },
+    };
+    RET(ff_vk_shader_add_descriptor_set(vkctx, shd, desc, 3, 0, 0));
+
+    ff_vk_shader_add_push_const(shd, 0, sizeof(EstimateSliceInfo), VK_SHADER_STAGE_COMPUTE_BIT);
+    GLSLD(ff_source_prores_ks_estimate_slice_comp);
+
+    RET(spv->compile_shader(vkctx, spv, shd, &spv_data, &spv_len, "main", &spv_opaque));
+    RET(ff_vk_shader_link(vkctx, shd, spv_data, spv_len, "main"));
+    RET(ff_vk_shader_register_exec(vkctx, &pv->e, shd));
+
+fail:
+    /* Release the compiler's per-shader state on success and failure alike */
+    if (spv_opaque)
+        spv->free_shader(spv, &spv_opaque);
+
+    return err;
+}
+
+/**
+ * Build, compile and register the trellis compute shader, the pass the submit
+ * code uses to pick a quantiser for every slice.
+ *
+ * @param pv       encoder context; provides the Vulkan context, the execution
+ *                 pool and the picture geometry
+ * @param spv      SPIR-V compiler used to compile the generated GLSL
+ * @param shd      shader object to initialise
+ * @param pl_name  name given to the shader
+ * @return 0 or a positive value on success, a negative AVERROR code on failure
+ */
+static int init_trellis_node_pipeline(ProresVulkanContext *pv, FFVkSPIRVCompiler *spv,
+                                      FFVulkanShader* shd, const char* pl_name)
+{
+    int err = 0;
+    uint8_t *spv_data;
+    size_t spv_len;
+    void *spv_opaque = NULL;
+    FFVulkanContext *vkctx = &pv->vkctx;
+    FFVulkanDescriptorSetBinding *desc;
+    int subgroup_size = vkctx->subgroup_props.maxSubgroupSize;
+
+    /* One invocation per macroblock row; the pass is dispatched as a single
+     * workgroup.
+     * NOTE(review): mb_height is not checked against the device's workgroup
+     * size limit here - confirm it is validated elsewhere. */
+    RET(ff_vk_shader_init(vkctx, shd, pl_name, VK_SHADER_STAGE_COMPUTE_BIT,
+                          NULL, 0, pv->ctx.mb_height, 1, 1, 0));
+
+    av_bprintf(&shd->src, "#define SLICES_WIDTH %d\n", pv->ctx.slices_width);
+    av_bprintf(&shd->src, "#define NUM_SUBGROUPS %d\n", FFALIGN(pv->ctx.mb_height, subgroup_size) / subgroup_size);
+    av_bprintf(&shd->src, "#define NUM_PLANES %d\n", pv->ctx.num_planes);
+    av_bprintf(&shd->src, "#define FORCE_QUANT %d\n", pv->ctx.force_quant);
+    av_bprintf(&shd->src, "struct SliceScore { ivec4 bits[16]; ivec4 score[16]; int total_bits[16]; int total_score[16]; int overquant; int buf_start; int quant; };\n");
+
+    desc = (FFVulkanDescriptorSetBinding []) {
+        {
+            .name = "FrameSize",
+            .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
+            .stages = VK_SHADER_STAGE_COMPUTE_BIT,
+            .mem_layout = "scalar",
+            .buf_content = "int frame_size;",
+        },
+        {
+            .name = "SliceScores",
+            .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
+            .stages = VK_SHADER_STAGE_COMPUTE_BIT,
+            .mem_layout = "scalar",
+            .buf_content = "SliceScore scores[];",
+        },
+    };
+    RET(ff_vk_shader_add_descriptor_set(vkctx, shd, desc, 2, 0, 0));
+
+    ff_vk_shader_add_push_const(shd, 0, sizeof(TrellisNodeInfo), VK_SHADER_STAGE_COMPUTE_BIT);
+    GLSLD(ff_source_prores_ks_trellis_node_comp);
+
+    RET(spv->compile_shader(vkctx, spv, shd, &spv_data, &spv_len, "main", &spv_opaque));
+    RET(ff_vk_shader_link(vkctx, shd, spv_data, spv_len, "main"));
+    RET(ff_vk_shader_register_exec(vkctx, &pv->e, shd));
+
+fail:
+    /* Release the compiler's per-shader state on success and failure alike */
+    if (spv_opaque)
+        spv->free_shader(spv, &spv_opaque);
+
+    return err;
+}
+
+static int
init_encode_slice_pipeline(ProresVulkanContext *pv, FFVkSPIRVCompiler *spv,
+                                      FFVulkanShader* shd, const char* pl_name)
+{
+    /* Builds, compiles and registers the slice-encoding compute shader.
+     * Returns 0 or a positive value on success, a negative AVERROR code on
+     * failure. */
+    int err = 0;
+    uint8_t *spv_data;
+    size_t spv_len;
+    void *spv_opaque = NULL;
+    FFVulkanContext *vkctx = &pv->vkctx;
+    FFVulkanDescriptorSetBinding *desc;
+
+    /* 64 invocations per workgroup; the dispatch rounds the slice count up
+     * to a multiple of 64 accordingly. */
+    RET(ff_vk_shader_init(vkctx, shd, pl_name, VK_SHADER_STAGE_COMPUTE_BIT,
+                          NULL, 0, 64, 1, 1, 0));
+
+    av_bprintf(&shd->src, "#define DCTSIZE 8\n");
+    av_bprintf(&shd->src, "#define MAX_PLANES 4\n");
+    av_bprintf(&shd->src, "#define MAX_MBS_PER_SLICE %d\n", pv->ctx.mbs_per_slice);
+    av_bprintf(&shd->src, "#define CHROMA_FACTOR %d\n", pv->ctx.chroma_factor);
+    av_bprintf(&shd->src, "#define ALPHA_BITS %d\n", pv->ctx.alpha_bits);
+    av_bprintf(&shd->src, "struct SliceData { uint32_t mbs_per_slice; int16_t coeffs[MAX_PLANES][MAX_MBS_PER_SLICE * 4 * DCTSIZE * DCTSIZE]; };\n");
+    av_bprintf(&shd->src, "struct SliceScore { ivec4 bits[16]; ivec4 score[16]; int total_bits[16]; int total_score[16]; uint overquant; int buf_start; int quant; };\n");
+
+    desc = (FFVulkanDescriptorSetBinding []) {
+        {
+            .name = "SliceBuffer",
+            .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
+            .stages = VK_SHADER_STAGE_COMPUTE_BIT,
+            .mem_layout = "scalar",
+            .buf_content = "SliceData slices[];",
+        },
+        {
+            .name = "SliceScores",
+            .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
+            .stages = VK_SHADER_STAGE_COMPUTE_BIT,
+            .mem_layout = "scalar",
+            .buf_content = "SliceScore scores[];",
+        },
+        {
+            .name = "ProresDataTables",
+            .type = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER,
+            .stages = VK_SHADER_STAGE_COMPUTE_BIT,
+            .mem_layout = "scalar",
+            .buf_content = "int16_t qmat[128][64]; int16_t qmat_chroma[128][64]; uint8_t scan[64]; "
+                           "uint8_t dc_codebook[7]; uint8_t run_to_cb[16]; uint8_t level_to_cb[10];",
+        },
+    };
+    RET(ff_vk_shader_add_descriptor_set(vkctx, shd, desc, 3, 0, 0));
+
+    ff_vk_shader_add_push_const(shd, 0, sizeof(EncodeSliceInfo), VK_SHADER_STAGE_COMPUTE_BIT);
+    av_bprintf(&shd->src, "#define PB_UNALIGNED\n");
+    av_bprintf(&shd->src, "#extension GL_EXT_buffer_reference : require\n");
+    av_bprintf(&shd->src, "#extension GL_EXT_buffer_reference2 : require\n");
+    GLSLD(ff_source_common_comp);
+    GLSLD(ff_source_prores_ks_encode_slice_comp);
+
+    RET(spv->compile_shader(vkctx, spv, shd, &spv_data, &spv_len, "main", &spv_opaque));
+    RET(ff_vk_shader_link(vkctx, shd, spv_data, spv_len, "main"));
+    RET(ff_vk_shader_register_exec(vkctx, &pv->e, shd));
+
+fail:
+    /* Release the compiler's per-shader state on success and failure alike */
+    if (spv_opaque)
+        spv->free_shader(spv, &spv_opaque);
+
+    return err;
+}
+
+/**
+ * Record all compute passes needed to encode one picture of a frame into an
+ * execution context. The context is not submitted here; the caller does that
+ * once after recording every picture of the frame.
+ *
+ * @param avctx        codec context
+ * @param exec         execution context to record into
+ * @param frame        source frame
+ * @param picture_idx  0 for the first (or only) picture, 1 for the second one
+ * @return 0 or a positive value on success, a negative AVERROR code on failure
+ */
+static int vulkan_encode_prores_submit_frame(AVCodecContext *avctx, FFVkExecContext *exec,
+                                             AVFrame *frame, int picture_idx)
+{
+    ProresVulkanContext *pv = avctx->priv_data;
+    ProresContext *ctx = &pv->ctx;
+    VulkanEncodeProresFrameData *pd = exec->opaque;
+    FFVulkanContext *vkctx = &pv->vkctx;
+    FFVulkanFunctions *vk = &vkctx->vkfn;
+    int err = 0, nb_img_bar = 0, i, is_chroma;
+    int min_quant = ctx->profile_info->min_quant;
+    int max_quant = ctx->profile_info->max_quant;
+    int subgroup_size = vkctx->subgroup_props.maxSubgroupSize;
+    int estimate_dim_x = ctx->alpha_bits ? subgroup_size : (subgroup_size / 3) * 3;
+    int transfer_slices = vkctx->extensions & FF_VK_EXT_EXTERNAL_HOST_MEMORY;
+    VkImageView views[AV_NUM_DATA_POINTERS];
+    VkImageMemoryBarrier2 img_bar[AV_NUM_DATA_POINTERS];
+    FFVkBuffer *pkt_vk_buf, *slice_data_buf, *slice_score_buf, *frame_size_buf;
+    SliceDataInfo slice_data_info;
+    EstimateSliceInfo estimate_info;
+    TrellisNodeInfo trellis_node_info;
+    EncodeSliceInfo encode_info;
+    FFVulkanShader *shd;
+
+    /* Start recording. Both pictures of an interlaced frame are recorded into
+     * the same execution context and submitted together by the caller, so
+     * recording is only started for the first picture; restarting it for the
+     * second one would act on a context that is already recording. */
+    if (!picture_idx)
+        RET(ff_vk_exec_start(vkctx, exec));
+
+    /* Get a pooled buffer for writing output data */
+    RET(ff_vk_get_pooled_buffer(vkctx, &pv->pkt_buf_pool, &pd->out_data_ref[picture_idx],
+                                VK_BUFFER_USAGE_STORAGE_BUFFER_BIT |
+                                VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT, NULL,
+                                ctx->frame_size_upper_bound + FF_INPUT_BUFFER_MIN_SIZE,
+                                transfer_slices ?
VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT + : (VK_MEMORY_PROPERTY_HOST_CACHED_BIT | + VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | + VK_MEMORY_PROPERTY_HOST_COHERENT_BIT))); + pkt_vk_buf = (FFVkBuffer*)pd->out_data_ref[picture_idx]->data; + ff_vk_exec_add_dep_buf(vkctx, exec, &pd->out_data_ref[picture_idx], 1, 1); + + /* Allocate buffer for writing slice data */ + RET(ff_vk_get_pooled_buffer(vkctx, &pv->slice_data_buf_pool, &pd->slice_data_ref[picture_idx], + VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | + VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT, NULL, + ctx->slices_per_picture * sizeof(SliceData), + VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT)); + slice_data_buf = (FFVkBuffer*)pd->slice_data_ref[picture_idx]->data; + ff_vk_exec_add_dep_buf(vkctx, exec, &pd->slice_data_ref[picture_idx], 1, 1); + + /* Allocate buffer for writing slice scores */ + RET(ff_vk_get_pooled_buffer(vkctx, &pv->slice_score_buf_pool, &pd->slice_score_ref[picture_idx], + VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | + VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT, NULL, + ctx->slices_per_picture * sizeof(SliceScore), + VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT)); + slice_score_buf = (FFVkBuffer*)pd->slice_score_ref[picture_idx]->data; + ff_vk_exec_add_dep_buf(vkctx, exec, &pd->slice_score_ref[picture_idx], 1, 1); + + /* Allocate buffer for writing frame size */ + RET(ff_vk_get_pooled_buffer(vkctx, &pv->frame_size_buf_pool, &pd->frame_size_ref[picture_idx], + VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | + VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT, NULL, + sizeof(int), + VK_MEMORY_PROPERTY_HOST_CACHED_BIT | + VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | + VK_MEMORY_PROPERTY_HOST_COHERENT_BIT)); + frame_size_buf = (FFVkBuffer*)pd->frame_size_ref[picture_idx]->data; + ff_vk_exec_add_dep_buf(vkctx, exec, &pd->frame_size_ref[picture_idx], 1, 1); + + /* Generate barriers and image views for frame images. 
*/ + RET(ff_vk_exec_add_dep_frame(vkctx, exec, frame, + VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT, + VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT)); + RET(ff_vk_create_imageviews(vkctx, exec, views, frame, FF_VK_REP_INT)); + ff_vk_frame_barrier(vkctx, exec, frame, img_bar, &nb_img_bar, + VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT, + VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT, + VK_ACCESS_SHADER_READ_BIT, + VK_IMAGE_LAYOUT_GENERAL, + VK_QUEUE_FAMILY_IGNORED); + + /* Submit the image barriers. */ + vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) { + .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO, + .pImageMemoryBarriers = img_bar, + .imageMemoryBarrierCount = nb_img_bar, + }); + + /* Apply FDCT on input image data for future passes */ + slice_data_info = (SliceDataInfo) { + .pictures_per_frame = ctx->pictures_per_frame, + .line_add = ctx->pictures_per_frame == 1 ? 0 : picture_idx ^ !(frame->flags & AV_FRAME_FLAG_TOP_FIELD_FIRST), + }; + for (i = 0; i < ctx->num_planes; i++) { + is_chroma = (i == 1 || i == 2); + shd = &pv->slice_data_shd[!is_chroma || ctx->chroma_factor == CFACTOR_Y444]; + if (i < 3) { + slice_data_info.plane = i; + ff_vk_shader_update_desc_buffer(vkctx, exec, shd, 0, 0, 0, + slice_data_buf, 0, slice_data_buf->size, + VK_FORMAT_UNDEFINED); + ff_vk_shader_update_img_array(vkctx, exec, shd, frame, views, 0, 1, + VK_IMAGE_LAYOUT_GENERAL, VK_NULL_HANDLE); + ff_vk_exec_bind_shader(vkctx, exec, shd); + ff_vk_shader_update_push_const(vkctx, exec, shd, VK_SHADER_STAGE_COMPUTE_BIT, + 0, sizeof(SliceDataInfo), &slice_data_info); + vk->CmdDispatch(exec->buf, ctx->slices_width, ctx->mb_height, 1); + } else { + ff_vk_shader_update_desc_buffer(vkctx, exec, &pv->alpha_data_shd, 0, 0, 0, + slice_data_buf, 0, slice_data_buf->size, + VK_FORMAT_UNDEFINED); + ff_vk_shader_update_img(vkctx, exec, &pv->alpha_data_shd, 0, 1, 0, views[3], + VK_IMAGE_LAYOUT_GENERAL, VK_NULL_HANDLE); + ff_vk_exec_bind_shader(vkctx, exec, &pv->alpha_data_shd); + vk->CmdDispatch(exec->buf, ctx->mb_width, 
ctx->mb_height, 1); + } + } + + /* Wait for writes to slice buffer. */ + vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) { + .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO, + .pBufferMemoryBarriers = & (VkBufferMemoryBarrier2) { + .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2, + .pNext = NULL, + .srcStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT, + .srcAccessMask = VK_ACCESS_2_SHADER_WRITE_BIT, + .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT, + .dstAccessMask = VK_ACCESS_2_SHADER_READ_BIT, + .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .buffer = slice_data_buf->buf, + .offset = 0U, + .size = slice_data_buf->size, + }, + .bufferMemoryBarrierCount = 1u, + }); + + /* Estimate slice bits and error for each quant */ + estimate_info = (EstimateSliceInfo) { + .slices_per_picture = ctx->slices_per_picture, + .min_quant = ctx->force_quant ? ctx->force_quant : min_quant, + .max_quant = ctx->force_quant ? ctx->force_quant : max_quant, + .bits_per_mb = ctx->bits_per_mb, + }; + ff_vk_shader_update_desc_buffer(vkctx, exec, &pv->estimate_slice_shd, 0, 0, 0, + slice_data_buf, 0, slice_data_buf->size, + VK_FORMAT_UNDEFINED); + ff_vk_shader_update_desc_buffer(vkctx, exec, &pv->estimate_slice_shd, 0, 1, 0, + slice_score_buf, 0, slice_score_buf->size, + VK_FORMAT_UNDEFINED); + ff_vk_shader_update_desc_buffer(vkctx, exec, &pv->estimate_slice_shd, 0, 2, 0, + &pv->prores_data_tables_buf, 0, pv->prores_data_tables_buf.size, + VK_FORMAT_UNDEFINED); + ff_vk_exec_bind_shader(vkctx, exec, &pv->estimate_slice_shd); + + ff_vk_shader_update_push_const(vkctx, exec, &pv->estimate_slice_shd, + VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(estimate_info), + &estimate_info); + vk->CmdDispatch(exec->buf, (ctx->slices_per_picture * ctx->num_planes + estimate_dim_x - 1) / estimate_dim_x, + ctx->force_quant ? 1 : (max_quant - min_quant + 1), 1); + + /* Wait for writes to score buffer. 
*/ + vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) { + .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO, + .pBufferMemoryBarriers = & (VkBufferMemoryBarrier2) { + .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2, + .pNext = NULL, + .srcStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT, + .srcAccessMask = VK_ACCESS_2_SHADER_WRITE_BIT | VK_ACCESS_2_SHADER_READ_BIT, + .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT, + .dstAccessMask = VK_ACCESS_2_SHADER_WRITE_BIT | VK_ACCESS_2_SHADER_READ_BIT, + .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .buffer = slice_score_buf->buf, + .offset = 0U, + .size = slice_score_buf->size, + }, + .bufferMemoryBarrierCount = 1u, + }); + + /* Compute optimal quant value for each slice */ + trellis_node_info = (TrellisNodeInfo) { + .min_quant = min_quant, + .max_quant = max_quant, + .bits_per_mb = ctx->bits_per_mb, + .mbs_per_slice = ctx->mbs_per_slice, + }; + ff_vk_shader_update_desc_buffer(vkctx, exec, &pv->trellis_node_shd, 0, 0, 0, + frame_size_buf, 0, frame_size_buf->size, + VK_FORMAT_UNDEFINED); + ff_vk_shader_update_desc_buffer(vkctx, exec, &pv->trellis_node_shd, 0, 1, 0, + slice_score_buf, 0, slice_score_buf->size, + VK_FORMAT_UNDEFINED); + ff_vk_exec_bind_shader(vkctx, exec, &pv->trellis_node_shd); + ff_vk_shader_update_push_const(vkctx, exec, &pv->trellis_node_shd, VK_SHADER_STAGE_COMPUTE_BIT, + 0, sizeof(TrellisNodeInfo), &trellis_node_info); + vk->CmdDispatch(exec->buf, 1, 1, 1); + + /* Wait for writes to quant buffer. 
*/ + vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) { + .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO, + .pBufferMemoryBarriers = & (VkBufferMemoryBarrier2) { + .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2, + .pNext = NULL, + .srcStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT, + .srcAccessMask = VK_ACCESS_2_SHADER_WRITE_BIT, + .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT, + .dstAccessMask = VK_ACCESS_2_SHADER_WRITE_BIT | VK_ACCESS_2_SHADER_READ_BIT, + .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .buffer = frame_size_buf->buf, + .offset = 0U, + .size = frame_size_buf->size, + }, + .bufferMemoryBarrierCount = 1u, + }); + + /* Encode slices. */ + encode_info = (EncodeSliceInfo) { + .seek_table = pkt_vk_buf->address, + .bytestream = pkt_vk_buf->address + ctx->slices_per_picture * 2, + .num_planes = ctx->num_planes, + .slices_per_picture = ctx->slices_per_picture, + .max_quant = ctx->force_quant ? ctx->force_quant : max_quant, + }; + ff_vk_shader_update_desc_buffer(vkctx, exec, &pv->encode_slice_shd, 0, 0, 0, + slice_data_buf, 0, slice_data_buf->size, + VK_FORMAT_UNDEFINED); + ff_vk_shader_update_desc_buffer(vkctx, exec, &pv->encode_slice_shd, 0, 1, 0, + slice_score_buf, 0, slice_score_buf->size, + VK_FORMAT_UNDEFINED); + ff_vk_shader_update_desc_buffer(vkctx, exec, &pv->encode_slice_shd, 0, 2, 0, + &pv->prores_data_tables_buf, 0, pv->prores_data_tables_buf.size, + VK_FORMAT_UNDEFINED); + ff_vk_exec_bind_shader(vkctx, exec, &pv->encode_slice_shd); + ff_vk_shader_update_push_const(vkctx, exec, &pv->encode_slice_shd, + VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(encode_info), &encode_info); + vk->CmdDispatch(exec->buf, FFALIGN(ctx->slices_per_picture, 64) / 64, + ctx->num_planes, 1); + +fail: + return err; +} + +static int get_packet(AVCodecContext *avctx, FFVkExecContext *exec, AVPacket *pkt) +{ + ProresVulkanContext *pv = avctx->priv_data; + ProresContext *ctx = &pv->ctx; + 
VulkanEncodeProresFrameData *pd = exec->opaque; + FFVulkanContext *vkctx = &pv->vkctx; + FFVulkanFunctions *vk = &vkctx->vkfn; + FFVkExecContext *transfer_exec; + uint8_t *orig_buf, *buf, *slice_sizes; + uint8_t *picture_size_pos; + int picture_idx, err = 0; + int frame_size, picture_size; + int pkt_size = ctx->frame_size_upper_bound; + int transfer_slices = vkctx->extensions & FF_VK_EXT_EXTERNAL_HOST_MEMORY; + FFVkBuffer *out_data_buf, *frame_size_buf; + VkMappedMemoryRange invalidate_data; + AVBufferRef *mapped_ref; + FFVkBuffer *mapped_buf; + + /* Allocate packet */ + RET(ff_get_encode_buffer(avctx, pkt, pkt_size + FF_INPUT_BUFFER_MIN_SIZE, 0)); + + /* Initialize packet. */ + pkt->pts = pd->pts; + pkt->dts = pd->pts; + pkt->duration = pd->duration; + pkt->flags |= AV_PKT_FLAG_KEY * pd->key_frame; + + if (avctx->flags & AV_CODEC_FLAG_COPY_OPAQUE) { + pkt->opaque = pd->frame_opaque; + pkt->opaque_ref = pd->frame_opaque_ref; + pd->frame_opaque_ref = NULL; + } + + /* Write frame atom */ + orig_buf = pkt->data; + buf = ff_prores_kostya_write_frame_header(avctx, ctx, &orig_buf, pd->flags, + pd->color_primaries, pd->color_trc, + pd->colorspace); + + /* Make sure encoding's done */ + ff_vk_exec_wait(vkctx, exec); + + /* Roll transfer execution context */ + if (transfer_slices) { + RET(ff_vk_host_map_buffer(vkctx, &mapped_ref, pkt->data, pkt->buf, + VK_BUFFER_USAGE_TRANSFER_DST_BIT)); + mapped_buf = (FFVkBuffer *)mapped_ref->data; + transfer_exec = ff_vk_exec_get(vkctx, &pv->transfer_exec_pool); + ff_vk_exec_start(vkctx, transfer_exec); + } + + for (picture_idx = 0; picture_idx < ctx->pictures_per_frame; picture_idx++) { + /* Fetch buffers for the current picture. 
*/ + out_data_buf = (FFVkBuffer *)pd->out_data_ref[picture_idx]->data; + frame_size_buf = (FFVkBuffer *)pd->frame_size_ref[picture_idx]->data; + + /* Invalidate slice/output data if needed */ + invalidate_data = (VkMappedMemoryRange) { + .sType = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE, + .offset = 0, + .size = VK_WHOLE_SIZE, + }; + if (!(frame_size_buf->flags & VK_MEMORY_PROPERTY_HOST_COHERENT_BIT)) { + invalidate_data.memory = frame_size_buf->mem; + vk->InvalidateMappedMemoryRanges(vkctx->hwctx->act_dev, 1, &invalidate_data); + } + + /* Write picture header */ + picture_size_pos = buf + 1; + buf = ff_prores_kostya_write_picture_header(ctx, buf); + + /* Skip over seek table */ + slice_sizes = buf; + buf += ctx->slices_per_picture * 2; + + /* Calculate final size */ + buf += *(int*)frame_size_buf->mapped_mem; + + if (transfer_slices) { + /* Perform host mapped transfer of slice data */ + ff_vk_exec_add_dep_buf(vkctx, transfer_exec, &pd->out_data_ref[picture_idx], 1, 0); + ff_vk_exec_add_dep_buf(vkctx, transfer_exec, &mapped_ref, 1, 0); + vk->CmdCopyBuffer(transfer_exec->buf, out_data_buf->buf, mapped_buf->buf, 1, & (VkBufferCopy) { + .srcOffset = 0, + .dstOffset = mapped_buf->virtual_offset + slice_sizes - pkt->data, + .size = buf - slice_sizes, + }); + } else { + /* Fallback to regular memcpy if transfer is not available */ + if (!(out_data_buf->flags & VK_MEMORY_PROPERTY_HOST_COHERENT_BIT)) { + invalidate_data.memory = out_data_buf->mem; + vk->InvalidateMappedMemoryRanges(vkctx->hwctx->act_dev, 1, &invalidate_data); + } + memcpy(slice_sizes, out_data_buf->mapped_mem, buf - slice_sizes); + av_buffer_unref(&pd->out_data_ref[picture_idx]); + } + + /* Write picture size with header */ + picture_size = buf - (picture_size_pos - 1); + bytestream_put_be32(&picture_size_pos, picture_size); + + /* Slice output buffers no longer needed */ + av_buffer_unref(&pd->slice_data_ref[picture_idx]); + av_buffer_unref(&pd->slice_score_ref[picture_idx]); + 
av_buffer_unref(&pd->frame_size_ref[picture_idx]); + } + + /* Write frame size in header */ + orig_buf -= 8; + frame_size = buf - orig_buf; + bytestream_put_be32(&orig_buf, frame_size); + + av_shrink_packet(pkt, frame_size); + av_log(avctx, AV_LOG_VERBOSE, "Encoded data: %iMiB\n", pkt->size / (1024*1024)); + + /* Wait for slice transfer */ + if (transfer_slices) { + RET(ff_vk_exec_submit(vkctx, transfer_exec)); + ff_vk_exec_wait(vkctx, transfer_exec); + } + +fail: + return err; +} + +static int vulkan_encode_prores_receive_packet(AVCodecContext *avctx, AVPacket *pkt) +{ + int err; + ProresVulkanContext *pv = avctx->priv_data; + ProresContext *ctx = &pv->ctx; + VulkanEncodeProresFrameData *pd; + FFVkExecContext *exec; + AVFrame *frame; + + while (1) { + /* Roll an execution context */ + exec = ff_vk_exec_get(&pv->vkctx, &pv->e); + + /* If it had a frame, immediately output it */ + if (exec->had_submission) { + exec->had_submission = 0; + pv->in_flight--; + return get_packet(avctx, exec, pkt); + } + + /* Get next frame to encode */ + frame = pv->frame; + err = ff_encode_get_frame(avctx, frame); + if (err < 0 && err != AVERROR_EOF) { + return err; + } else if (err == AVERROR_EOF) { + if (!pv->in_flight) + return err; + continue; + } + + /* Encode frame */ + pd = exec->opaque; + pd->color_primaries = frame->color_primaries; + pd->color_trc = frame->color_trc; + pd->colorspace = frame->colorspace; + pd->pts = frame->pts; + pd->duration = frame->duration; + pd->flags = frame->flags; + if (avctx->flags & AV_CODEC_FLAG_COPY_OPAQUE) { + pd->frame_opaque = frame->opaque; + pd->frame_opaque_ref = frame->opaque_ref; + frame->opaque_ref = NULL; + } + + err = vulkan_encode_prores_submit_frame(avctx, exec, frame, 0); + if (ctx->pictures_per_frame > 1) + vulkan_encode_prores_submit_frame(avctx, exec, frame, 1); + + /* Submit execution context */ + ff_vk_exec_submit(&pv->vkctx, exec); + av_frame_unref(frame); + if (err < 0) + return err; + + pv->in_flight++; + if (pv->in_flight < 
pv->async_depth) + return AVERROR(EAGAIN); + } + + return 0; +} + +static av_cold int encode_close(AVCodecContext *avctx) +{ + ProresVulkanContext *pv = avctx->priv_data; + ProresContext *ctx = &pv->ctx; + FFVulkanContext *vkctx = &pv->vkctx; + + ff_vk_exec_pool_free(vkctx, &pv->e); + ff_vk_exec_pool_free(vkctx, &pv->transfer_exec_pool); + + if (ctx->alpha_bits) + ff_vk_shader_free(vkctx, &pv->alpha_data_shd); + + ff_vk_shader_free(vkctx, &pv->slice_data_shd[0]); + ff_vk_shader_free(vkctx, &pv->slice_data_shd[1]); + ff_vk_shader_free(vkctx, &pv->estimate_slice_shd); + ff_vk_shader_free(vkctx, &pv->encode_slice_shd); + ff_vk_shader_free(vkctx, &pv->trellis_node_shd); + + ff_vk_free_buf(vkctx, &pv->prores_data_tables_buf); + + av_buffer_pool_uninit(&pv->pkt_buf_pool); + av_buffer_pool_uninit(&pv->slice_data_buf_pool); + av_buffer_pool_uninit(&pv->slice_score_buf_pool); + av_buffer_pool_uninit(&pv->frame_size_buf_pool); + + ff_vk_uninit(vkctx); + + return 0; +} + +static av_cold int encode_init(AVCodecContext *avctx) +{ + ProresVulkanContext *pv = avctx->priv_data; + ProresContext *ctx = &pv->ctx; + int err = 0, i, q; + FFVulkanContext *vkctx = &pv->vkctx; + FFVkSPIRVCompiler *spv; + + /* Init vulkan */ + RET(ff_vk_init(vkctx, avctx, NULL, avctx->hw_frames_ctx)); + + pv->qf = ff_vk_qf_find(vkctx, VK_QUEUE_COMPUTE_BIT, 0); + if (!pv->qf) { + av_log(avctx, AV_LOG_ERROR, "Device has no compute queues!\n"); + return AVERROR(ENOTSUP); + } + + spv = ff_vk_spirv_init(); + if (!spv) { + av_log(avctx, AV_LOG_ERROR, "Unable to initialize SPIR-V compiler!\n"); + return AVERROR_EXTERNAL; + } + + RET(ff_vk_exec_pool_init(vkctx, pv->qf, &pv->e, 1, 0, 0, 0, NULL)); + + pv->transfer_qf = ff_vk_qf_find(vkctx, VK_QUEUE_TRANSFER_BIT, 0); + if (!pv->transfer_qf) { + av_log(avctx, AV_LOG_ERROR, "Device has no transfer queues!\n"); + return err; + } + + RET(ff_vk_exec_pool_init(vkctx, pv->transfer_qf, &pv->transfer_exec_pool, 1, 0, 0, 0, NULL)); + + /* Init common prores structures */ + 
err = ff_prores_kostya_encode_init(avctx, ctx, vkctx->frames->sw_format); + if (err < 0) + return err; + + /* Temporary frame */ + pv->frame = av_frame_alloc(); + if (!pv->frame) + return AVERROR(ENOMEM); + + /* Async data pool */ + pv->async_depth = pv->e.pool_size; + pv->exec_ctx_info = av_calloc(pv->async_depth, sizeof(*pv->exec_ctx_info)); + if (!pv->exec_ctx_info) + return AVERROR(ENOMEM); + for (int i = 0; i < pv->async_depth; i++) + pv->e.contexts[i].opaque = &pv->exec_ctx_info[i]; + + /* Compile shaders used by encoder */ + init_slice_data_pipeline(pv, spv, &pv->slice_data_shd[0], "slice_data_blocks2", 2); + init_slice_data_pipeline(pv, spv, &pv->slice_data_shd[1], "slice_data_blocks4", 4); + init_estimate_slice_pipeline(pv, spv, &pv->estimate_slice_shd, "estimate_slice"); + init_trellis_node_pipeline(pv, spv, &pv->trellis_node_shd, "trellis_node"); + init_encode_slice_pipeline(pv, spv, &pv->encode_slice_shd, "encode_slice"); + if (ctx->alpha_bits) { + init_alpha_data_pipeline(pv, spv, &pv->alpha_data_shd, "alpha_data"); + } + + /* Create prores data tables uniform buffer. 
*/ + RET(ff_vk_create_buf(vkctx, &pv->prores_data_tables_buf, + sizeof(ProresDataTables), NULL, NULL, + VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT | + VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT, + VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT | + VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT)); + RET(ff_vk_map_buffer(vkctx, &pv->prores_data_tables_buf, (void *)&pv->tables, 0)); + memcpy(pv->tables->qmat, ctx->quants, sizeof(ctx->quants)); + memcpy(pv->tables->qmat_chroma, ctx->quants_chroma, sizeof(ctx->quants_chroma)); + memcpy(pv->tables->scan, ctx->scantable, sizeof(ff_prores_progressive_scan)); + memcpy(pv->tables->dc_codebook, ff_prores_dc_codebook, sizeof(ff_prores_dc_codebook)); + memcpy(pv->tables->run_to_cb, ff_prores_run_to_cb, sizeof(ff_prores_run_to_cb)); + memcpy(pv->tables->level_to_cb, ff_prores_level_to_cb, sizeof(ff_prores_level_to_cb)); + + for (q = MAX_STORED_Q; q < 128; ++q) { + for (i = 0; i < 64; i++) { + pv->tables->qmat[q][i] = ctx->quant_mat[i] * q; + pv->tables->qmat_chroma[q][i] = ctx->quant_chroma_mat[i] * q; + } + } + +fail: + return err; +} + +#define OFFSET(x) offsetof(ProresVulkanContext, x) +#define VE AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_ENCODING_PARAM + +static const AVOption options[] = { + { "mbs_per_slice", "macroblocks per slice", OFFSET(ctx.mbs_per_slice), + AV_OPT_TYPE_INT, { .i64 = 8 }, 1, MAX_MBS_PER_SLICE, VE }, + { "profile", NULL, OFFSET(ctx.profile), AV_OPT_TYPE_INT, + { .i64 = PRORES_PROFILE_AUTO }, + PRORES_PROFILE_AUTO, PRORES_PROFILE_4444XQ, VE, .unit = "profile" }, + { "auto", NULL, 0, AV_OPT_TYPE_CONST, { .i64 = PRORES_PROFILE_AUTO }, + 0, 0, VE, .unit = "profile" }, + { "proxy", NULL, 0, AV_OPT_TYPE_CONST, { .i64 = PRORES_PROFILE_PROXY }, + 0, 0, VE, .unit = "profile" }, + { "lt", NULL, 0, AV_OPT_TYPE_CONST, { .i64 = PRORES_PROFILE_LT }, + 0, 0, VE, .unit = "profile" }, + { "standard", NULL, 0, AV_OPT_TYPE_CONST, { .i64 = PRORES_PROFILE_STANDARD }, + 0, 0, VE, .unit = "profile" }, + { "hq", NULL, 0, AV_OPT_TYPE_CONST, { .i64 = 
PRORES_PROFILE_HQ }, + 0, 0, VE, .unit = "profile" }, + { "4444", NULL, 0, AV_OPT_TYPE_CONST, { .i64 = PRORES_PROFILE_4444 }, + 0, 0, VE, .unit = "profile" }, + { "4444xq", NULL, 0, AV_OPT_TYPE_CONST, { .i64 = PRORES_PROFILE_4444XQ }, + 0, 0, VE, .unit = "profile" }, + { "vendor", "vendor ID", OFFSET(ctx.vendor), + AV_OPT_TYPE_STRING, { .str = "Lavc" }, 0, 0, VE }, + { "bits_per_mb", "desired bits per macroblock", OFFSET(ctx.bits_per_mb), + AV_OPT_TYPE_INT, { .i64 = 0 }, 0, 8192, VE }, + { "quant_mat", "quantiser matrix", OFFSET(ctx.quant_sel), AV_OPT_TYPE_INT, + { .i64 = -1 }, -1, QUANT_MAT_DEFAULT, VE, .unit = "quant_mat" }, + { "auto", NULL, 0, AV_OPT_TYPE_CONST, { .i64 = -1 }, + 0, 0, VE, .unit = "quant_mat" }, + { "proxy", NULL, 0, AV_OPT_TYPE_CONST, { .i64 = QUANT_MAT_PROXY }, + 0, 0, VE, .unit = "quant_mat" }, + { "lt", NULL, 0, AV_OPT_TYPE_CONST, { .i64 = QUANT_MAT_LT }, + 0, 0, VE, .unit = "quant_mat" }, + { "standard", NULL, 0, AV_OPT_TYPE_CONST, { .i64 = QUANT_MAT_STANDARD }, + 0, 0, VE, .unit = "quant_mat" }, + { "hq", NULL, 0, AV_OPT_TYPE_CONST, { .i64 = QUANT_MAT_HQ }, + 0, 0, VE, .unit = "quant_mat" }, + { "default", NULL, 0, AV_OPT_TYPE_CONST, { .i64 = QUANT_MAT_DEFAULT }, + 0, 0, VE, .unit = "quant_mat" }, + { "alpha_bits", "bits for alpha plane", OFFSET(ctx.alpha_bits), AV_OPT_TYPE_INT, + { .i64 = 16 }, 0, 16, VE }, + { "async_depth", "Internal parallelization depth", OFFSET(async_depth), AV_OPT_TYPE_INT, + { .i64 = 1 }, 1, INT_MAX, VE }, + { NULL } +}; + +static const AVClass proresenc_class = { + .class_name = "ProRes vulkan encoder", + .item_name = av_default_item_name, + .option = options, + .version = LIBAVUTIL_VERSION_INT, +}; + +static const AVCodecHWConfigInternal *const prores_ks_hw_configs[] = { + HW_CONFIG_ENCODER_FRAMES(VULKAN, VULKAN), + HW_CONFIG_ENCODER_DEVICE(NONE, VULKAN), + NULL, +}; + +const FFCodec ff_prores_ks_vulkan_encoder = { + .p.name = "prores_ks_vulkan", + CODEC_LONG_NAME("Apple ProRes (iCodec Pro)"), + .p.type = 
AVMEDIA_TYPE_VIDEO, + .p.id = AV_CODEC_ID_PRORES, + .priv_data_size = sizeof(ProresVulkanContext), + .init = encode_init, + .close = encode_close, + FF_CODEC_RECEIVE_PACKET_CB(&vulkan_encode_prores_receive_packet), + .p.capabilities = AV_CODEC_CAP_DELAY | + AV_CODEC_CAP_HARDWARE | + AV_CODEC_CAP_ENCODER_FLUSH | + AV_CODEC_CAP_ENCODER_REORDERED_OPAQUE, + CODEC_PIXFMTS(AV_PIX_FMT_VULKAN), + .hw_configs = prores_ks_hw_configs, + .color_ranges = AVCOL_RANGE_MPEG, + .p.priv_class = &proresenc_class, + .p.profiles = NULL_IF_CONFIG_SMALL(ff_prores_profiles), + .caps_internal = FF_CODEC_CAP_INIT_CLEANUP | FF_CODEC_CAP_EOF_FLUSH, +}; diff --git a/libavcodec/vulkan/Makefile b/libavcodec/vulkan/Makefile index d8e1471fa6..f69e430c33 100644 --- a/libavcodec/vulkan/Makefile +++ b/libavcodec/vulkan/Makefile @@ -9,6 +9,13 @@ OBJS-$(CONFIG_FFV1_VULKAN_ENCODER) += vulkan/common.o \ vulkan/ffv1_enc_rct.o vulkan/ffv1_enc_setup.o \ vulkan/ffv1_rct_search.o vulkan/ffv1_enc.o +OBJS-$(CONFIG_PRORES_KS_VULKAN_ENCODER) += vulkan/common.o \ + vulkan/prores_ks_alpha_data.o \ + vulkan/prores_ks_slice_data.o \ + vulkan/prores_ks_estimate_slice.o \ + vulkan/prores_ks_encode_slice.o \ + vulkan/prores_ks_trellis_node.o + OBJS-$(CONFIG_FFV1_VULKAN_HWACCEL) += vulkan/common.o \ vulkan/rangecoder.o vulkan/ffv1_vlc.o \ vulkan/ffv1_common.o vulkan/ffv1_reset.o \ diff --git a/libavcodec/vulkan/prores_ks_alpha_data.comp b/libavcodec/vulkan/prores_ks_alpha_data.comp new file mode 100644 index 0000000000..825ba28a4f --- /dev/null +++ b/libavcodec/vulkan/prores_ks_alpha_data.comp @@ -0,0 +1,67 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#extension GL_EXT_samplerless_texture_functions : require + +/* Table of possible edge slice configurations */ +const uvec3 edge_mps_table[8] = uvec3[]( + uvec3(0, 0, 0), + uvec3(1, 0, 0), + uvec3(2, 0, 0), + uvec3(2, 1, 0), + uvec3(4, 0, 0), + uvec3(4, 1, 0), + uvec3(4, 2, 0), + uvec3(4, 2, 1) +); + +void main() +{ + ivec2 coord = min(ivec2(gl_GlobalInvocationID.xy), textureSize(plane, 0) - ivec2(1)); + int alpha = texelFetch(plane, coord, 0).x; + +#if ALPHA_BITS == 8 + alpha >>= 2; +#else + alpha = (alpha << 6) | (alpha >> 4); +#endif + + uint mbs_per_slice = MAX_MBS_PER_SLICE; + uint slices_width = WIDTH_IN_MB / mbs_per_slice; + uint mb_width = slices_width * mbs_per_slice; + uint slice_x = gl_WorkGroupID.x / mbs_per_slice; + uint slice_y = gl_WorkGroupID.y; + uvec2 slice_base = uvec2(slice_x, slice_y) * (mbs_per_slice * 16u); + + /* Handle slice macroblock size reduction on edge slices */ + if (gl_WorkGroupID.x >= mb_width) + { + uint edge_mb = gl_WorkGroupID.x - mb_width; + uvec3 table = edge_mps_table[WIDTH_IN_MB - mb_width]; + uvec3 base = uvec3(0, table.x, table.x + table.y); + uint edge_slice = edge_mb < base.y ? 0 : (edge_mb < base.z ? 
1 : 2); + slice_x += edge_slice; + slice_base += base[edge_slice] * (DCTSIZE * 2u); + mbs_per_slice = table[edge_slice]; + } + + uint slice = slice_y * SLICES_PITCH + slice_x; + uvec2 coeff_coord = uvec2(coord) - slice_base; + uint coeff = coeff_coord.y * (mbs_per_slice * 16u) + coeff_coord.x; + slices[slice].coeffs[3][coeff] = int16_t(alpha); +} \ No newline at end of file diff --git a/libavcodec/vulkan/prores_ks_encode_slice.comp b/libavcodec/vulkan/prores_ks_encode_slice.comp new file mode 100644 index 0000000000..2c06388a46 --- /dev/null +++ b/libavcodec/vulkan/prores_ks_encode_slice.comp @@ -0,0 +1,230 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#define CFACTOR_Y444 3 + +layout(push_constant, scalar) uniform EncodeSliceInfo { + u8buf bytestream; + u8vec2buf seek_table; + int num_planes; + int slices_per_picture; + int max_quant; +}; + +int av_zero_extend(int a, uint p) +{ + return int(uint(a) & ((1U << p) - 1)); +} + +void encode_vlc_codeword(inout PutBitContext pb, uint codebook, int val) +{ + /* number of prefix bits to switch between Rice and expGolomb */ + uint switch_bits = (codebook & 3) + 1; + uint rice_order = codebook >> 5; /* rice code order */ + uint exp_order = (codebook >> 2) & 7; /* exp golomb code order */ + + uint switch_val = switch_bits << rice_order; + + if (val >= switch_val) { + val -= int(switch_val - (1 << exp_order)); + int exponent = findMSB(val); + + put_bits(pb, exponent - exp_order + switch_bits, 0); + put_bits(pb, exponent + 1, val); + } else { + int exponent = val >> rice_order; + if (exponent != 0) + put_bits(pb, exponent, 0); + put_bits(pb, 1, 1); + if (rice_order != 0) + put_bits(pb, rice_order, av_zero_extend(val, rice_order)); + } +} + +#define GET_SIGN(x) ((x) >> 31) +#define MAKE_CODE(x) (((x) * 2) ^ GET_SIGN(x)) + +#define FIRST_DC_CB 0xB8 // rice_order = 5, exp_golomb_order = 6, switch_bits = 0 + +void encode_dcs(inout PutBitContext pb, bool is_chroma, int q) +{ + uint slice = gl_GlobalInvocationID.x; + uint plane = gl_GlobalInvocationID.y; + uint blocks_per_mb = is_chroma && CHROMA_FACTOR != CFACTOR_Y444 ? 2 : 4; + uint blocks_per_slice = slices[slice].mbs_per_slice * blocks_per_mb; + int codebook = 5; + int scale = is_chroma ? 
qmat_chroma[q][0] : qmat[q][0]; + int coeff = slices[slice].coeffs[plane][0]; + int prev_dc = (coeff - 0x4000) / scale; + encode_vlc_codeword(pb, FIRST_DC_CB, MAKE_CODE(prev_dc)); + int sign = 0; + for (int i = 1; i < blocks_per_slice; i++) { + coeff = slices[slice].coeffs[plane][i * 64]; + int dc = (coeff - 0x4000) / scale; + int delta = dc - prev_dc; + int new_sign = GET_SIGN(delta); + delta = (delta ^ sign) - sign; + int code = MAKE_CODE(delta); + encode_vlc_codeword(pb, dc_codebook[codebook], code); + codebook = min(code, 6); + sign = new_sign; + prev_dc = dc; + } +} + +void encode_acs(inout PutBitContext pb, bool is_chroma, int q) +{ + uint slice = gl_GlobalInvocationID.x; + uint plane = gl_GlobalInvocationID.y; + uint blocks_per_mb = is_chroma && CHROMA_FACTOR != CFACTOR_Y444 ? 2 : 4; + uint blocks_per_slice = slices[slice].mbs_per_slice * blocks_per_mb; + uint max_coeffs = blocks_per_slice << 6; + int prev_run = 4; + int prev_level = 2; + int run = 0; + + for (int i = 1; i < 64; i++) { + for (int idx = scan[i]; idx < max_coeffs; idx += 64) { + int coeff = slices[slice].coeffs[plane][idx]; + int level = coeff / (is_chroma ? qmat_chroma[q][scan[i]] : qmat[q][scan[i]]); + if (level != 0) { + int abs_level = abs(level); + encode_vlc_codeword(pb, run_to_cb[prev_run], run); + encode_vlc_codeword(pb, level_to_cb[prev_level], abs_level - 1); + put_bits(pb, 1, av_zero_extend(GET_SIGN(level), 1)); + prev_run = min(run, 15); + prev_level = min(abs_level, 9); + run = 0; + } else { + run++; + } + } + } +} + +void encode_slice_plane(inout PutBitContext pb, int q) +{ + uint plane = gl_GlobalInvocationID.y; + bool is_chroma = plane == 1 || plane == 2; + encode_dcs(pb, is_chroma, q); + encode_acs(pb, is_chroma, q); +} + +void put_alpha_diff(inout PutBitContext pb, int cur, int prev) +{ + const int dbits = (ALPHA_BITS == 8) ? 
4 : 7; + const int dsize = 1 << dbits - 1; + int diff = cur - prev; + + diff = av_zero_extend(diff, ALPHA_BITS); + if (diff >= (1 << ALPHA_BITS) - dsize) + diff -= 1 << ALPHA_BITS; + if (diff < -dsize || diff > dsize || diff == 0) { + put_bits(pb, 1, 1); + put_bits(pb, ALPHA_BITS, diff); + } else { + put_bits(pb, 1, 0); + put_bits(pb, dbits - 1, abs(diff) - 1); + put_bits(pb, 1, int(diff < 0)); + } +} + +void put_alpha_run(inout PutBitContext pb, int run) +{ + if (run != 0) { + put_bits(pb, 1, 0); + if (run < 0x10) + put_bits(pb, 4, run); + else + put_bits(pb, 15, run); + } else { + put_bits(pb, 1, 1); + } +} + +void encode_alpha_plane(inout PutBitContext pb) +{ + uint slice = gl_GlobalInvocationID.x; + const int mask = (1 << ALPHA_BITS) - 1; + const int num_coeffs = int(slices[slice].mbs_per_slice) * 256; + int prev = mask, cur; + int idx = 0; + int run = 0; + + cur = slices[slice].coeffs[3][idx++]; + put_alpha_diff(pb, cur, prev); + prev = cur; + do { + cur = slices[slice].coeffs[3][idx++]; + if (cur != prev) { + put_alpha_run(pb, run); + put_alpha_diff(pb, cur, prev); + prev = cur; + run = 0; + } else { + run++; + } + } while (idx < num_coeffs); + put_alpha_run(pb, run); +} + +u8vec2 byteswap16(int value) +{ + return unpack8(uint16_t(value)).yx; +} + +void main() +{ + uint slice = gl_GlobalInvocationID.x; + if (slice >= slices_per_picture) + return; + + uint plane = gl_GlobalInvocationID.y; + int q = scores[slice].quant; + int q_idx = min(q, max_quant + 1); + int slice_hdr_size = 2 * num_planes; + int slice_size = slice_hdr_size + (scores[slice].total_bits[q_idx] / 8); + u8buf buf = OFFBUF(u8buf, bytestream, scores[slice].buf_start); + + /* Write slice header */ + if (plane == 0) + { + buf[0].v = uint8_t(slice_hdr_size * 8); + buf[1].v = uint8_t(q); + u8vec2buf slice_hdr = OFFBUF(u8vec2buf, buf, 2); + for (int i = 0; i < num_planes - 1; i++) + { + int bits = scores[slice].bits[q_idx][i] / 8; + slice_hdr[i].v = byteswap16(bits); + } + seek_table[slice].v = 
byteswap16(slice_size); + } + + int plane_offset = 0; + for (int i = 0; i < plane; ++i) + plane_offset += scores[slice].bits[q_idx][i] / 8; + + /* Encode slice plane */ + PutBitContext pb; + init_put_bits(pb, OFFBUF(u8buf, buf, slice_hdr_size + plane_offset), 0); + if (plane == 3) + encode_alpha_plane(pb); + else + encode_slice_plane(pb, q); + flush_put_bits(pb); +} \ No newline at end of file diff --git a/libavcodec/vulkan/prores_ks_estimate_slice.comp b/libavcodec/vulkan/prores_ks_estimate_slice.comp new file mode 100644 index 0000000000..5f9b39cd75 --- /dev/null +++ b/libavcodec/vulkan/prores_ks_estimate_slice.comp @@ -0,0 +1,267 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#extension GL_KHR_shader_subgroup_clustered : require +#extension GL_KHR_shader_subgroup_shuffle : require + +#define CFACTOR_Y444 3 + +layout(push_constant, scalar) uniform EstimateSliceInfo { + uint slices_per_picture; + uint min_quant; + uint max_quant; + uint bits_per_mb; +}; + +int av_zero_extend(int a, uint p) +{ + return int(uint(a) & ((1U << p) - 1)); +} + +#define GET_SIGN(x) ((x) >> 31) +#define MAKE_CODE(x) (((x) * 2) ^ GET_SIGN(x)) + +int estimate_vlc(uint codebook, int val) +{ + /* number of prefix bits to switch between Rice and expGolomb */ + uint switch_bits = (codebook & 3) + 1; + uint rice_order = codebook >> 5; /* rice code order */ + uint exp_order = (codebook >> 2) & 7; /* exp golomb code order */ + + uint switch_val = switch_bits << rice_order; + + if (val >= switch_val) + { + val -= int(switch_val - (1 << exp_order)); + int exponent = findMSB(val); + return int(exponent * 2 - exp_order + switch_bits + 1); + } + else + { + return int((val >> rice_order) + rice_order + 1); + } +} + +#define FIRST_DC_CB 0xB8 // rice_order = 5, exp_golomb_order = 6, switch_bits = 0 + +int estimate_dcs(inout int error, uint slice, uint plane, uint q) +{ + uint blocks_per_mb = plane != 0 && CHROMA_FACTOR != CFACTOR_Y444 ? 2 : 4; + uint blocks_per_slice = slices[slice].mbs_per_slice * blocks_per_mb; + int codebook = 5; + int coeff = slices[slice].coeffs[plane][0]; + int scale = plane != 0 ? 
qmat_chroma[q][0] : qmat[q][0]; + int prev_dc = (coeff - 0x4000) / scale; + int bits = estimate_vlc(FIRST_DC_CB, MAKE_CODE(prev_dc)); + int sign = 0; + coeff = slices[slice].coeffs[plane][64]; + error += abs(coeff - 0x4000) % scale; + + for (int i = 1; i < blocks_per_slice; ++i) { + coeff = slices[slice].coeffs[plane][i * 64]; + int dc = (coeff - 0x4000) / scale; + error += abs(coeff - 0x4000) % scale; + int delta = dc - prev_dc; + int new_sign = GET_SIGN(delta); + delta = (delta ^ sign) - sign; + int code = MAKE_CODE(delta); + bits += estimate_vlc(dc_codebook[codebook], code); + codebook = min(code, 6); + sign = new_sign; + prev_dc = dc; + } + + return bits; +} + +#define FFALIGN(x, a) (((x)+(a)-1)&~((a)-1)) +#define SCORE_LIMIT 1073741823 + +int estimate_acs(inout int error, uint slice, uint plane, uint q) +{ + uint blocks_per_mb = plane != 0 && CHROMA_FACTOR != CFACTOR_Y444 ? 2 : 4; + uint blocks_per_slice = slices[slice].mbs_per_slice * blocks_per_mb; + uint max_coeffs = blocks_per_slice << 6; + int prev_run = 4; + int prev_level = 2; + int bits = 0; + int run = 0; + + for (int i = 1; i < 64; i++) { + for (int idx = scan[i]; idx < max_coeffs; idx += 64) { + int coeff = slices[slice].coeffs[plane][idx]; + int quant = plane != 0 ? qmat_chroma[q][scan[i]] : qmat[q][scan[i]]; + int level = coeff / quant; + error += abs(coeff) % quant; + if (level != 0) { + int abs_level = abs(level); + bits += estimate_vlc(run_to_cb[prev_run], run); + bits += estimate_vlc(level_to_cb[prev_level], abs_level - 1) + 1; + prev_run = min(run, 15); + prev_level = min(abs_level, 9); + run = 0; + } else { + run++; + } + } + } + + return bits; +} + +int estimate_slice_plane(inout int error, uint slice, uint plane, uint q) +{ + int bits = 0; + bits += estimate_dcs(error, slice, plane, q); + bits += estimate_acs(error, slice, plane, q); + return FFALIGN(bits, 8); +} + +int est_alpha_diff(int cur, int prev) +{ + const int dbits = (ALPHA_BITS == 8) ? 
4 : 7; + const int dsize = 1 << dbits - 1; + int diff = cur - prev; + + diff = av_zero_extend(diff, ALPHA_BITS); + if (diff >= (1 << ALPHA_BITS) - dsize) + diff -= 1 << ALPHA_BITS; + if (diff < -dsize || diff > dsize || diff == 0) + return ALPHA_BITS + 1; + else + return dbits + 1; +} + +int estimate_alpha_plane(uint slice) +{ + const int mask = (1 << ALPHA_BITS) - 1; + const int num_coeffs = int(slices[slice].mbs_per_slice) * 256; + int prev = mask, cur; + int idx = 0; + int run = 0; + int bits; + + cur = slices[slice].coeffs[3][idx++]; + bits = est_alpha_diff(cur, prev); + prev = cur; + do { + cur = slices[slice].coeffs[3][idx++]; + if (cur != prev) { + if (run == 0) + bits++; + else if (run < 0x10) + bits += 4; + else + bits += 15; + bits += est_alpha_diff(cur, prev); + prev = cur; + run = 0; + } else { + run++; + } + } while (idx < num_coeffs); + + if (run != 0) { + if (run < 0x10) + bits += 4; + else + bits += 15; + } + + return bits; +} + +int sum_of_planes(int value) +{ +#if NUM_PLANES == 3 + uint base = (gl_SubgroupInvocationID / 3) * 3; + return subgroupShuffle(value, base) + subgroupShuffle(value, base + 1) + subgroupShuffle(value, base + 2); +#else + return subgroupClusteredAdd(value, 4); +#endif +} + +void main() +{ + uint slice = gl_GlobalInvocationID.x / NUM_PLANES; + uint plane = gl_LocalInvocationID.x % NUM_PLANES; + uint q = min_quant + gl_GlobalInvocationID.y; + if (slice >= slices_per_picture) + return; + + /* Estimate slice bits and error for specified quantizer and plane */ + int error = 0; + int bits = 0; + if (plane == 3) + bits = estimate_alpha_plane(slice); + else + bits = estimate_slice_plane(error, slice, plane, q); + + /* Write results to score buffer */ + scores[slice].bits[q][plane] = bits; + scores[slice].score[q][plane] = error; + + /* Accumulate total bits and error of all planes */ + int total_bits = sum_of_planes(bits); + int total_score = sum_of_planes(error); + if (total_bits > 65000 * 8) + total_score = SCORE_LIMIT; + 
scores[slice].total_bits[q] = total_bits; + scores[slice].total_score[q] = total_score; + + if (q != max_quant) + return; + + /* Task threads that computed max_quant to also compute overquant if necessary */ + uint mbs_per_slice = slices[slice].mbs_per_slice; + if (total_bits <= bits_per_mb * mbs_per_slice) + { + /* Overquant isn't needed for this slice */ + scores[slice].total_bits[max_quant + 1] = total_bits; + scores[slice].total_score[max_quant + 1] = total_score + 1; + scores[slice].overquant = max_quant; + } + else + { + /* Keep searching until an encoding fits our budget */ + for (q = max_quant + 1; q < 128; ++q) + { + /* Estimate slice bits and error for specified quantizer and plane */ + error = 0; + bits = 0; + if (plane == 3) + bits = estimate_alpha_plane(slice); + else + bits = estimate_slice_plane(error, slice, plane, q); + + /* Accumulate total bits and error of all planes */ + total_bits = sum_of_planes(bits); + total_score = sum_of_planes(error); + + /* If estimated bits fit within budget, we are done */ + if (total_bits <= bits_per_mb * mbs_per_slice) + break; + } + + scores[slice].bits[max_quant + 1][plane] = bits; + scores[slice].score[max_quant + 1][plane] = error; + scores[slice].total_bits[max_quant + 1] = total_bits; + scores[slice].total_score[max_quant + 1] = total_score; + scores[slice].overquant = q; + } +} \ No newline at end of file diff --git a/libavcodec/vulkan/prores_ks_slice_data.comp b/libavcodec/vulkan/prores_ks_slice_data.comp new file mode 100644 index 0000000000..6a943532c5 --- /dev/null +++ b/libavcodec/vulkan/prores_ks_slice_data.comp @@ -0,0 +1,265 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#extension GL_EXT_samplerless_texture_functions : require + +layout(push_constant, scalar) uniform SliceDataInfo { + int plane; + int pictures_per_frame; + int line_add; +}; + +shared i16vec4 coeffs[MAX_MBS_PER_SLICE][BLOCKS_PER_MB][DCTSIZE][DCTSIZE / 4]; + +#define CONST_BITS 13 +#define PASS1_BITS 1 /* lose a little precision to avoid overflow */ +#define OUT_SHIFT (PASS1_BITS + 1) + +#define FIX_0_541196100 4433 /* FIX(0.541196100) */ +#define FIX_0_765366865 6270 /* FIX(0.765366865) */ +#define FIX_1_847759065 15137 /* FIX(1.847759065) */ +#define FIX_1_175875602 9633 /* FIX(1.175875602) */ +#define FIX_0_298631336 2446 /* FIX(0.298631336) */ +#define FIX_3_072711026 25172 /* FIX(3.072711026) */ +#define FIX_1_501321110 12299 /* FIX(1.501321110) */ +#define FIX_0_899976223 7373 /* FIX(0.899976223) */ +#define FIX_1_961570560 16069 /* FIX(1.961570560) */ +#define FIX_2_053119869 16819 /* FIX(2.053119869) */ +#define FIX_2_562915447 20995 /* FIX(2.562915447) */ +#define FIX_0_390180644 3196 /* FIX(0.390180644) */ + +#define MULTIPLY(type, var, cons) type(uint32_t(var) * uint32_t(cons)) +#define RIGHT_SHIFT(x, n) ((x) >> (n)) +#define DESCALE(x,n) RIGHT_SHIFT(int32_t(x) + (1 << ((n) - 1)), n) + +void row_fdct(i32vec4 data_lo, i32vec4 data_hi) +{ + uint row_idx = gl_LocalInvocationID.x; + uint block = gl_LocalInvocationID.y; + uint mb = gl_LocalInvocationID.z; + + /* Pass 1: process rows. 
*/ + /* Note results are scaled up by sqrt(8) compared to a true DCT; */ + /* furthermore, we scale the results by 2**PASS1_BITS. */ + int32_t tmp0 = data_lo.x + data_hi.w; + int32_t tmp7 = data_lo.x - data_hi.w; + int32_t tmp1 = data_lo.y + data_hi.z; + int32_t tmp6 = data_lo.y - data_hi.z; + int32_t tmp2 = data_lo.z + data_hi.y; + int32_t tmp5 = data_lo.z - data_hi.y; + int32_t tmp3 = data_lo.w + data_hi.x; + int32_t tmp4 = data_lo.w - data_hi.x; + + /* Even part per LL&M figure 1 --- note that published figure is faulty; + * rotator "sqrt(2)*c1" should be "sqrt(2)*c6". + */ + int32_t tmp10 = tmp0 + tmp3; + int32_t tmp13 = tmp0 - tmp3; + int32_t tmp11 = tmp1 + tmp2; + int32_t tmp12 = tmp1 - tmp2; + + data_lo.x = (tmp10 + tmp11) * (1 << PASS1_BITS); + data_hi.x = (tmp10 - tmp11) * (1 << PASS1_BITS); + + uint32_t z1 = MULTIPLY(uint32_t, tmp12 + tmp13, FIX_0_541196100); + data_lo.z = DESCALE(z1 + MULTIPLY(uint32_t, tmp13, FIX_0_765366865), CONST_BITS-PASS1_BITS); + data_hi.z = DESCALE(z1 + MULTIPLY(uint32_t, tmp12, -FIX_1_847759065), CONST_BITS-PASS1_BITS); + + /* Odd part per figure 8 --- note paper omits factor of sqrt(2). + * cK represents cos(K*pi/16). + * i0..i3 in the paper are tmp4..tmp7 here. 
+     */
+    z1 = tmp4 + tmp7;
+    uint32_t z2 = tmp5 + tmp6;
+    uint32_t z3 = tmp4 + tmp6;
+    uint32_t z4 = tmp5 + tmp7;
+    uint32_t z5 = (z3 + z4) * FIX_1_175875602; /* sqrt(2) * c3 */
+
+    tmp4 = MULTIPLY(int32_t, tmp4, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */
+    tmp5 = MULTIPLY(int32_t, tmp5, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */
+    tmp6 = MULTIPLY(int32_t, tmp6, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */
+    tmp7 = MULTIPLY(int32_t, tmp7, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */
+    z1 = MULTIPLY(uint32_t, z1, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */
+    z2 = MULTIPLY(uint32_t, z2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
+    z3 = MULTIPLY(uint32_t, z3, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
+    z4 = MULTIPLY(uint32_t, z4, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */
+
+    z3 += z5;
+    z4 += z5;
+
+    data_hi.w = DESCALE(uint32_t(tmp4) + z1 + z3, CONST_BITS - PASS1_BITS);
+    data_hi.y = DESCALE(uint32_t(tmp5) + z2 + z4, CONST_BITS - PASS1_BITS);
+    data_lo.w = DESCALE(uint32_t(tmp6) + z2 + z3, CONST_BITS - PASS1_BITS);
+    data_lo.y = DESCALE(uint32_t(tmp7) + z1 + z4, CONST_BITS - PASS1_BITS);
+
+    coeffs[mb][block][row_idx][0] = i16vec4(data_lo);
+    coeffs[mb][block][row_idx][1] = i16vec4(data_hi);
+}
+
+/*
+ * Pass 2 (columns) of the 8x8 forward DCT, performed in place on
+ * coeffs[mb][block].
+ *
+ * Each invocation selects one column from gl_LocalInvocationID.x
+ * (x / 4 picks the i16vec4 half of a row, x & 3 the component inside it),
+ * gathers that column from all eight rows, transforms it and scatters the
+ * result back to the same positions.
+ *
+ * All eight rows of the block are read here, so the caller must make the
+ * row-pass writes visible (barrier()) before calling this function and must
+ * synchronize again before reading whole rows back.
+ */
+void ff_jpeg_fdct_islow_10()
+{
+    uint col_half = gl_LocalInvocationID.x / 4;
+    uint col = gl_LocalInvocationID.x & 3u;
+    uint block = gl_LocalInvocationID.y;
+    uint mb = gl_LocalInvocationID.z;
+
+    /* Gather one column: rows 0..3 into col_lo, rows 4..7 into col_hi */
+    i16vec4 col_lo = i16vec4(coeffs[mb][block][0][col_half][col],
+                             coeffs[mb][block][1][col_half][col],
+                             coeffs[mb][block][2][col_half][col],
+                             coeffs[mb][block][3][col_half][col]);
+    i16vec4 col_hi = i16vec4(coeffs[mb][block][4][col_half][col],
+                             coeffs[mb][block][5][col_half][col],
+                             coeffs[mb][block][6][col_half][col],
+                             coeffs[mb][block][7][col_half][col]);
+    i32vec4 data_lo = i32vec4(col_lo);
+    i32vec4 data_hi = i32vec4(col_hi);
+
+    /* Pass 2: process columns.
+     * We remove the PASS1_BITS scaling, but leave the results scaled up
+     * by an overall factor of 8.
+     */
+    int32_t tmp0 = data_lo.x + data_hi.w;
+    int32_t tmp7 = data_lo.x - data_hi.w;
+    int32_t tmp1 = data_lo.y + data_hi.z;
+    int32_t tmp6 = data_lo.y - data_hi.z;
+    int32_t tmp2 = data_lo.z + data_hi.y;
+    int32_t tmp5 = data_lo.z - data_hi.y;
+    int32_t tmp3 = data_lo.w + data_hi.x;
+    int32_t tmp4 = data_lo.w - data_hi.x;
+
+    /* Even part per LL&M figure 1 --- note that published figure is faulty;
+     * rotator "sqrt(2)*c1" should be "sqrt(2)*c6".
+     */
+    int32_t tmp10 = tmp0 + tmp3;
+    int32_t tmp13 = tmp0 - tmp3;
+    int32_t tmp11 = tmp1 + tmp2;
+    int32_t tmp12 = tmp1 - tmp2;
+
+    data_lo.x = DESCALE(tmp10 + tmp11, OUT_SHIFT);
+    data_hi.x = DESCALE(tmp10 - tmp11, OUT_SHIFT);
+
+    uint32_t z1 = uint32_t((tmp12 + tmp13) * FIX_0_541196100);
+    data_lo.z = DESCALE(z1 + uint32_t(tmp13 * FIX_0_765366865), CONST_BITS + OUT_SHIFT);
+    data_hi.z = DESCALE(z1 + uint32_t(tmp12 * (-FIX_1_847759065)), CONST_BITS + OUT_SHIFT);
+
+    /* Odd part per figure 8 --- note paper omits factor of sqrt(2).
+     * cK represents cos(K*pi/16).
+     * i0..i3 in the paper are tmp4..tmp7 here.
+     */
+    z1 = tmp4 + tmp7;
+    uint32_t z2 = tmp5 + tmp6;
+    uint32_t z3 = tmp4 + tmp6;
+    uint32_t z4 = tmp5 + tmp7;
+    uint32_t z5 = MULTIPLY(uint32_t, z3 + z4, FIX_1_175875602); /* sqrt(2) * c3 */
+
+    tmp4 = MULTIPLY(int32_t, tmp4, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */
+    tmp5 = MULTIPLY(int32_t, tmp5, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */
+    tmp6 = MULTIPLY(int32_t, tmp6, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */
+    tmp7 = MULTIPLY(int32_t, tmp7, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */
+    z1 = MULTIPLY(uint32_t, z1, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */
+    z2 = MULTIPLY(uint32_t, z2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
+    z3 = MULTIPLY(uint32_t, z3, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
+    z4 = MULTIPLY(uint32_t, z4, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */
+
+    z3 += z5;
+    z4 += z5;
+
+    data_hi.w = DESCALE(tmp4 + z1 + z3, CONST_BITS + OUT_SHIFT);
+    data_hi.y = DESCALE(tmp5 + z2 + z4, CONST_BITS + OUT_SHIFT);
+    data_lo.w = DESCALE(tmp6 + z2 + z3, CONST_BITS + OUT_SHIFT);
+    data_lo.y = DESCALE(tmp7 + z1 + z4, CONST_BITS + OUT_SHIFT);
+
+    /* Scatter the transformed column back to its eight rows */
+    col_lo = i16vec4(data_lo);
+    col_hi = i16vec4(data_hi);
+    coeffs[mb][block][0][col_half][col] = col_lo.x;
+    coeffs[mb][block][1][col_half][col] = col_lo.y;
+    coeffs[mb][block][2][col_half][col] = col_lo.z;
+    coeffs[mb][block][3][col_half][col] = col_lo.w;
+    coeffs[mb][block][4][col_half][col] = col_hi.x;
+    coeffs[mb][block][5][col_half][col] = col_hi.y;
+    coeffs[mb][block][6][col_half][col] = col_hi.z;
+    coeffs[mb][block][7][col_half][col] = col_hi.w;
+}
+
+/* Table of possible edge slice configurations.
+ * Indexed by the number of macroblocks left over after the full-width slices
+ * (WIDTH_IN_MB - slices_width * MAX_MBS_PER_SLICE); each entry holds the
+ * macroblock counts of up to three edge slices, from left to right.
+ * NOTE(review): with 8 entries the leftover must be < 8, which presumably
+ * means MAX_MBS_PER_SLICE <= 8 - confirm against the host code.
+ */
+const uvec3 edge_mps_table[8] = uvec3[](
+    uvec3(0, 0, 0),
+    uvec3(1, 0, 0),
+    uvec3(2, 0, 0),
+    uvec3(2, 1, 0),
+    uvec3(4, 0, 0),
+    uvec3(4, 1, 0),
+    uvec3(4, 2, 0),
+    uvec3(4, 2, 1)
+);
+
+/*
+ * One workgroup handles one slice (gl_WorkGroupID.x = slice column,
+ * gl_WorkGroupID.y = slice row). Inside the workgroup gl_LocalInvocationID
+ * is (row inside the block, block inside the macroblock, macroblock inside
+ * the slice). Every invocation loads one row of eight texels of the current
+ * plane, the workgroup runs the row and column DCT passes through coeffs,
+ * and every invocation then stores its transformed row to the slice buffer.
+ */
+void main()
+{
+    bool is_chroma = plane == 1 || plane == 2;
+    uint row_idx = gl_LocalInvocationID.x;
+    uint block = gl_LocalInvocationID.y;
+    uint macroblock = gl_LocalInvocationID.z;
+    uint slice_x = gl_WorkGroupID.x;
+
+    /* Calculate the current thread coordinate in input plane */
+    uint mbs_per_slice = MAX_MBS_PER_SLICE;
+    uint mb_width = 4u * BLOCKS_PER_MB;
+    uint slices_width = WIDTH_IN_MB / MAX_MBS_PER_SLICE;
+    uvec2 slice_base = gl_WorkGroupID.xy * uvec2(MAX_MBS_PER_SLICE * mb_width, DCTSIZE * 2u);
+
+    /* Handle slice macroblock size reduction on edge slices */
+    if (slice_x >= slices_width)
+    {
+        uint edge_slice = slice_x - slices_width;
+        uvec3 table = edge_mps_table[WIDTH_IN_MB - slices_width * MAX_MBS_PER_SLICE];
+        /* Macroblock offset of each edge slice = sum of the sizes before it */
+        uvec3 base = uvec3(0u, table.x, table.x + table.y);
+        slice_base.x = (MAX_MBS_PER_SLICE * slices_width + base[edge_slice]) * mb_width;
+        mbs_per_slice = table[edge_slice];
+    }
+
+    uvec2 mb_base = slice_base + uvec2(macroblock * mb_width, 0u);
+    uvec2 block_coord = is_chroma ? uvec2(block >> 1u, block & 1u) : uvec2(block & 1u, block >> 1u);
+    ivec2 coord = ivec2(mb_base + block_coord * DCTSIZE + uvec2(0u, row_idx));
+    ivec2 size = textureSize(planes[plane], 0);
+    coord.y = coord.y * pictures_per_frame + line_add;
+    coord = min(coord, size - ivec2(1));
+
+    /* Load coefficients from input planes.
+     * The x coordinate is clamped for every texel on its own: clamping only
+     * the base coordinate and then adding offsets 1..7 would fetch outside
+     * the image for rows that touch or cross the right picture edge. With
+     * the per-texel clamp such rows repeat the last valid column instead.
+     */
+    int max_x = size.x - 1;
+
+    i32vec4 row_lo;
+    row_lo.x = texelFetch(planes[plane], ivec2(min(coord.x + 0, max_x), coord.y), 0).x;
+    row_lo.y = texelFetch(planes[plane], ivec2(min(coord.x + 1, max_x), coord.y), 0).x;
+    row_lo.z = texelFetch(planes[plane], ivec2(min(coord.x + 2, max_x), coord.y), 0).x;
+    row_lo.w = texelFetch(planes[plane], ivec2(min(coord.x + 3, max_x), coord.y), 0).x;
+
+    i32vec4 row_hi;
+    row_hi.x = texelFetch(planes[plane], ivec2(min(coord.x + 4, max_x), coord.y), 0).x;
+    row_hi.y = texelFetch(planes[plane], ivec2(min(coord.x + 5, max_x), coord.y), 0).x;
+    row_hi.z = texelFetch(planes[plane], ivec2(min(coord.x + 6, max_x), coord.y), 0).x;
+    row_hi.w = texelFetch(planes[plane], ivec2(min(coord.x + 7, max_x), coord.y), 0).x;
+
+    /* Perform DCT on the coefficients.
+     * The column pass reads all eight rows of the block, which were written
+     * by eight different invocations in the row pass, so the workgroup has
+     * to synchronize between the two passes as well as after the second one.
+     */
+    row_fdct(row_lo, row_hi);
+    barrier();
+    ff_jpeg_fdct_islow_10();
+    barrier();
+
+    /* Store DCT result to slice buffer */
+    uint slice = gl_WorkGroupID.y * gl_NumWorkGroups.x + gl_WorkGroupID.x;
+    uint slice_row = macroblock * BLOCKS_PER_MB * DCTSIZE + block * DCTSIZE + row_idx;
+    slices[slice].mbs_per_slice = mbs_per_slice;
+    slices[slice].rows[plane][slice_row] = coeffs[macroblock][block][row_idx];
+}
\ No newline at end of file
diff --git a/libavcodec/vulkan/prores_ks_trellis_node.comp b/libavcodec/vulkan/prores_ks_trellis_node.comp
new file mode 100644
index 0000000000..052e47ac5f
--- /dev/null
+++ b/libavcodec/vulkan/prores_ks_trellis_node.comp
@@ -0,0 +1,177 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#extension GL_KHR_shader_subgroup_arithmetic : require
+/* Push constants supplied by the host.
+ * Quantizer indices min_quant .. max_quant + 1 are evaluated below; the extra
+ * index max_quant + 1 is the slot whose quantizer is scores[].overquant. */
+layout(push_constant, scalar) uniform TrellisNodeInfo {
+    int min_quant;
+    int max_quant;
+    int mbs_per_slice; /* multiplied by (slice_x + 1) to get the macroblock count of the bit budget */
+    int bits_per_mb;   /* bit budget per macroblock */
+};
+/* Node slots per trellis column: nodes are addressed as column * TRELLIS_WIDTH + q.
+ * NOTE(review): this needs max_quant + 1 < TRELLIS_WIDTH; nothing in this file checks it. */
+#define TRELLIS_WIDTH 16
+#define SCORE_LIMIT 1073741823 /* 2^30 - 1, score given to over-budget / unusable paths */
+/* One trellis cell: the best path found so far that ends at a given (slice, quantizer index). */
+struct TrellisNode {
+    int prev_node; /* index into nodes[] of the predecessor, -1 while no path reaches this node */
+    int quant;     /* quantizer written to scores[].quant if this node is on the chosen path */
+    int bits;      /* total_bits accumulated along the path */
+    int score;     /* total_score accumulated along the path, SCORE_LIMIT if unusable */
+};
+/* Sum of slice_row_size over each subgroup, indexed by gl_SubgroupID (workgroup-shared). */
+shared int subgroup_sizes[NUM_SUBGROUPS];
+/* total_bits / 8 of every slice in this invocation's slice row; no `shared`
+ * qualifier, so every invocation has its own copy. */
+int slice_sizes[SLICES_WIDTH];
+/* Trellis storage, also one copy per invocation: column 0 is the start state,
+ * column x + 1 belongs to slice x of the row. */
+TrellisNode nodes[(SLICES_WIDTH + 1) * TRELLIS_WIDTH];
+/* Builds the trellis column for slice `slice_x` of this invocation's slice row
+ * (row index = gl_LocalInvocationID.x). For every pair (previous index pq,
+ * current index q) the accumulated bits and score are computed and the best
+ * predecessor is kept per q. Returns the nodes[] index of the node with the
+ * lowest score in the new column; on equal scores the higher q wins. */
+int find_slice_quant(int slice_x)
+{
+    int slice = int(gl_LocalInvocationID.x) * SLICES_WIDTH + slice_x;
+    /* First node of the column being filled; the previous column is TRELLIS_WIDTH nodes earlier */
+    int trellis_node = int(slice_x + 1) * TRELLIS_WIDTH;
+    for (int q = min_quant; q < max_quant + 2; q++)
+    {
+        nodes[trellis_node + q].prev_node = -1;
+        nodes[trellis_node + q].quant = q;
+    }
+    /* NOTE(review): counts mbs_per_slice macroblocks for every slice so far; confirm this is intended for narrower edge slices */
+    int mbs = int(slice_x + 1) * mbs_per_slice;
+    nodes[trellis_node + max_quant + 1].quant = scores[slice].overquant;
+    /* Cumulative bit budget for all slices of the row up to and including this one */
+    int bits_limit = mbs * bits_per_mb;
+    for (int pq = min_quant; pq < max_quant + 2; pq++)
+    {
+        int prev = trellis_node - TRELLIS_WIDTH + pq;
+        for (int q = min_quant; q < max_quant + 2; q++)
+        {
+            int cur = trellis_node + q;
+            int bits = nodes[prev].bits + scores[slice].total_bits[q];
+            int error = scores[slice].total_score[q];
+            if (bits > bits_limit)
+                error = SCORE_LIMIT; /* over budget: mark this transition unusable */
+            /* Only add when both terms are below the limit, otherwise pin to SCORE_LIMIT */
+            int new_score;
+            if (nodes[prev].score < SCORE_LIMIT && error < SCORE_LIMIT)
+                new_score = nodes[prev].score + error;
+            else
+                new_score = SCORE_LIMIT;
+            if (nodes[cur].prev_node == -1 || nodes[cur].score >= new_score) /* `>=`: a later pq replaces an equal score */
+            {
+                nodes[cur].bits = bits;
+                nodes[cur].score = new_score;
+                nodes[cur].prev_node = prev;
+            }
+        }
+    }
+    /* Pick the node with the lowest score in the new column */
+    int error = nodes[trellis_node + min_quant].score;
+    int pq = trellis_node + min_quant;
+    for (int q = min_quant + 1; q < max_quant + 2; q++)
+    {
+        if (nodes[trellis_node + q].score <= error)
+        {
+            error = nodes[trellis_node + q].score;
+            pq = trellis_node + q;
+        }
+    }
+
+    return pq;
+}
+/* Runs the trellis over all SLICES_WIDTH slices of this invocation's row, then
+ * walks the prev_node links backwards from the best final node, storing the
+ * chosen quantizer in scores[].quant and the size estimate in slice_sizes[].
+ * Returns the sum of all slice_sizes[] plus 2 * NUM_PLANES per slice. */
+int find_slice_row_quants()
+{
+    for (int i = min_quant; i < max_quant + 2; i++) /* start column: nothing spent, no predecessor */
+    {
+        nodes[i].prev_node = -1;
+        nodes[i].bits = 0;
+        nodes[i].score = 0;
+    }
+    /* Only the result for the last slice is kept: it is the end of the best path */
+    int q = 0;
+    for (int slice_x = 0; slice_x < SLICES_WIDTH; ++slice_x)
+    {
+        q = find_slice_quant(slice_x);
+    }
+
+    int slice_hdr_size = 2 * NUM_PLANES;
+    int slice_row_size = slice_hdr_size * SLICES_WIDTH;
+    int y = int(gl_LocalInvocationID.x);
+    for (int x = SLICES_WIDTH - 1; x >= 0; x--) /* backtrack from the last slice to the first */
+    {
+        int slice = x + y * SLICES_WIDTH;
+        int quant = nodes[q].quant;
+        int q_idx = min(quant, max_quant + 1); /* clamp to max_quant + 1, the last total_bits index filled by the loops above */
+        slice_sizes[x] = scores[slice].total_bits[q_idx] / 8;
+        slice_row_size += slice_sizes[x];
+        scores[slice].quant = quant;
+        q = nodes[q].prev_node;
+    }
+
+    return slice_row_size;
+}
+/* Variant without a search: every slice of the row gets FORCE_QUANT.
+ * Fills slice_sizes[] and scores[].quant and returns the row size computed
+ * the same way as in find_slice_row_quants(). */
+int force_slice_row_quants()
+{
+    int slice_hdr_size = 2 * NUM_PLANES;
+    int slice_row_size = slice_hdr_size * SLICES_WIDTH;
+    int y = int(gl_LocalInvocationID.x);
+    for (int x = SLICES_WIDTH - 1; x >= 0; x--)
+    {
+        int slice = x + y * SLICES_WIDTH;
+        slice_sizes[x] = scores[slice].total_bits[FORCE_QUANT] / 8;
+        slice_row_size += slice_sizes[x];
+        scores[slice].quant = FORCE_QUANT;
+    }
+
+    return slice_row_size;
+}
+/* Each invocation handles one slice row: it chooses the quantizers of the row,
+ * then a prefix sum over the row sizes gives the start offset of every slice
+ * (scores[].buf_start); the last invocation also writes frame_size. */
+void main()
+{
+#if FORCE_QUANT == 0
+    int slice_row_size = find_slice_row_quants();
+#else
+    int slice_row_size = force_slice_row_quants();
+#endif
+    /* Publish the total of each subgroup so that later subgroups can add it to their offsets */
+    int subgroup_sum = subgroupAdd(slice_row_size);
+    subgroup_sizes[gl_SubgroupID] = subgroup_sum;
+    barrier();
+    /* Offset of this row = rows before it inside the subgroup + totals of all lower-numbered subgroups */
+    int buf_start = subgroupExclusiveAdd(slice_row_size);
+    [[unroll]] for (int i = 0; i < NUM_SUBGROUPS; ++i)
+    {
+        if (i >= gl_SubgroupID)
+            break;
+        buf_start += subgroup_sizes[i];
+    }
+
+    int slice_hdr_size = 2 * NUM_PLANES;
+    int y = int(gl_LocalInvocationID.x);
+    [[unroll]] for (int x = 0; x < SLICES_WIDTH; ++x)
+    {
+        int slice = x + y * SLICES_WIDTH;
+        scores[slice].buf_start = buf_start;
+        buf_start += slice_hdr_size + slice_sizes[x];
+    }
+    /* After the loop buf_start is the end of this row; for the last invocation that is stored as frame_size */
+    if (y == gl_WorkGroupSize.x - 1)
+        frame_size = buf_start;
+}
\ No newline at end of file
-- 
2.50.1

_______________________________________________
ffmpeg-devel mailing list -- ffmpeg-devel@ffmpeg.org
To unsubscribe send an email to ffmpeg-devel-leave@ffmpeg.org
prev parent reply other threads:[~2025-09-04 20:12 UTC|newest] Thread overview: 3+ messages / expand[flat|nested] mbox.gz Atom feed top 2025-09-04 20:10 [FFmpeg-devel] [PATCH 1/3] vulkan: Support samplerless images IndecisiveTurtle via ffmpeg-devel 2025-09-04 20:10 ` [FFmpeg-devel] [PATCH 2/3] lavc: Split out common components used by vulkan prores encoder IndecisiveTurtle via ffmpeg-devel 2025-09-04 20:10 ` IndecisiveTurtle via ffmpeg-devel [this message]
Reply instructions: You may reply publicly to this message via plain-text email using any one of the following methods: * Save the following mbox file, import it into your mail client, and reply-to-all from there: mbox Avoid top-posting and favor interleaved quoting: https://en.wikipedia.org/wiki/Posting_style#Interleaved_style * Reply using the --to, --cc, and --in-reply-to switches of git-send-email(1): git send-email \ --in-reply-to=20250904201002.10446-3-47210458+raphaelthegreat@users.noreply.github.com \ --to=ffmpeg-devel@ffmpeg.org \ --cc=geoster3d@gmail.com \ /path/to/YOUR_REPLY https://kernel.org/pub/software/scm/git/docs/git-send-email.html * If your mail client supports setting the In-Reply-To header via mailto: links, try the mailto: link
Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel This inbox may be cloned and mirrored by anyone: git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git # If you have public-inbox 1.1+ installed, you may # initialize and index your mirror using the following commands: public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \ ffmpegdev@gitmailbox.com public-inbox-index ffmpegdev Example config snippet for mirrors. AGPL code for this site: git clone https://public-inbox.org/public-inbox.git