From: IndecisiveTurtle <geoster3d@gmail.com>
To: ffmpeg-devel@ffmpeg.org
Cc: IndecisiveTurtle <47210458+raphaelthegreat@users.noreply.github.com>
Subject: [FFmpeg-devel] [PATCH 3/4] libavcodec/vulkan: Add vulkan vc2 shaders
Date: Sat, 8 Mar 2025 14:21:40 +0200
Message-ID: <20250308122140.59850-4-47210458+raphaelthegreat@users.noreply.github.com> (raw)
In-Reply-To: <20250308122140.59850-2-47210458+raphaelthegreat@users.noreply.github.com>

---
 libavcodec/vulkan/vc2_dwt_haar.comp          |  83 +++++++++
 libavcodec/vulkan/vc2_dwt_haar_subgroup.comp |  89 +++++++++
 libavcodec/vulkan/vc2_dwt_hor_legall.comp    |  60 ++++++
 libavcodec/vulkan/vc2_dwt_upload.comp        |  35 ++++
 libavcodec/vulkan/vc2_dwt_ver_legall.comp    |  56 ++++++
 libavcodec/vulkan/vc2_encode.comp            | 173 ++++++++++++++++++
 libavcodec/vulkan/vc2_slice_sizes.comp       | 183 +++++++++++++++++++
 7 files changed, 679 insertions(+)
 create mode 100644 libavcodec/vulkan/vc2_dwt_haar.comp
 create mode 100644 libavcodec/vulkan/vc2_dwt_haar_subgroup.comp
 create mode 100644 libavcodec/vulkan/vc2_dwt_hor_legall.comp
 create mode 100644 libavcodec/vulkan/vc2_dwt_upload.comp
 create mode 100644 libavcodec/vulkan/vc2_dwt_ver_legall.comp
 create mode 100644 libavcodec/vulkan/vc2_encode.comp
 create mode 100644 libavcodec/vulkan/vc2_slice_sizes.comp

diff --git a/libavcodec/vulkan/vc2_dwt_haar.comp b/libavcodec/vulkan/vc2_dwt_haar.comp
new file mode 100644
index 0000000000..e67c0c1d9d
--- /dev/null
+++ b/libavcodec/vulkan/vc2_dwt_haar.comp
@@ -0,0 +1,83 @@
+#extension GL_EXT_scalar_block_layout : require
+#extension GL_EXT_buffer_reference : require
+
+#define LOCAL_X 256
+
+struct Plane {
+    ivec2 dim;
+    ivec2 dwt_dim;
+};
+
+layout(push_constant, scalar) uniform ComputeInfo {
+    int s;
+    int plane_idx;
+    int wavelet_depth;
+    Plane planes[3];
+};
+
+/* Two exchange buffers: [0] feeds the horizontal pass, [1] the vertical pass.
+ * Alternating them means no buffer is rewritten while another invocation may
+ * still be reading it, so one barrier per pass is enough. */
+shared int local_coef[2][LOCAL_X];
+
+/* One Haar lifting step on the pair (value, other). is_low tells which half
+ * of the pair this invocation holds; both outputs are scaled by 2^shift. */
+int haar_lift(int value, int other, bool is_low, int shift)
+{
+    int a = is_low ? value : other;
+    int b = is_low ? other : value;
+    int dst_b = (b - a) * (1 << shift);
+    int dst_a = a * (1 << shift) + ((dst_b + 1) >> 1);
+    return is_low ? dst_a : dst_b;
+}
+
+void main()
+{
+    ivec2 coord = ivec2(gl_GlobalInvocationID.xy);
+    ivec2 dwt_dim = planes[plane_idx].dwt_dim;
+
+    /* No early return here: every invocation of the workgroup has to reach
+     * the barrier() calls below, so out-of-range ones just carry a zero. */
+    bool in_bounds = all(lessThan(coord, dwt_dim));
+    int value = 0;
+    if (in_bounds)
+        value = imageLoad(planes0[plane_idx], coord).x;
+
+    /* Perform Haar wavelet on the 16x16 local workgroup with shared memory.
+     * NOTE(review): partners stay inside the tile only while i < 4; this
+     * presumably relies on the host limiting wavelet_depth - confirm. */
+    for (int i = 0; i < wavelet_depth; i++)
+    {
+        /* Only coordinates that are multiples of 2^i take part in level i */
+        ivec2 mask = ivec2((1 << i) - 1);
+        bool is_active = in_bounds && all(equal(coord & mask, ivec2(0)));
+
+        /* Offset between valid hor pixels for each level, +1, +2, +4 etc */
+        uint dist = 1u << i;
+        uint other_id = gl_LocalInvocationIndex ^ dist;
+
+        local_coef[0][gl_LocalInvocationIndex] = value;
+        barrier();
+
+        /* Horizontal haar wavelet */
+        if (is_active)
+            value = haar_lift(value, local_coef[0][other_id],
+                              gl_LocalInvocationIndex < other_id, s);
+
+        /* Offset between valid ver pixels for each level, +1, +2, +4 etc */
+        dist <<= 4;
+        other_id = gl_LocalInvocationIndex ^ dist;
+
+        local_coef[1][gl_LocalInvocationIndex] = value;
+        barrier();
+
+        /* Vertical haar wavelet */
+        if (is_active)
+            value = haar_lift(value, local_coef[1][other_id],
+                              gl_LocalInvocationIndex < other_id, 0);
+    }
+
+    /* Store value */
+    if (in_bounds)
+        imageStore(planes0[plane_idx], coord, ivec4(value));
+}
diff --git a/libavcodec/vulkan/vc2_dwt_haar_subgroup.comp b/libavcodec/vulkan/vc2_dwt_haar_subgroup.comp
new file mode 100644
index 0000000000..da0b9f72ca
--- /dev/null
+++ b/libavcodec/vulkan/vc2_dwt_haar_subgroup.comp
@@ -0,0 +1,89 @@
+#extension GL_EXT_scalar_block_layout : require
+#extension GL_KHR_shader_subgroup_basic : require
+#extension GL_KHR_shader_subgroup_shuffle : require
+#extension GL_EXT_buffer_reference : require
+
+#define TILE_DIM 8
+
+layout(scalar, buffer_reference, buffer_reference_align = 4) buffer DwtCoef {
+    int coef_buf[];
+};
+
+struct Plane {
+    ivec2 dim;
+    ivec2 dwt_dim;
+};
+
+layout(push_constant, scalar) uniform ComputeInfo {
+    int s;
+    int plane_idx;
+    int wavelet_depth;
+    Plane planes[3];
+    DwtCoef pbuf[3];
+};
+
+/* One Haar level (horizontal, then vertical) on an 8-wide tile; partner values
+ * are fetched with subgroupShuffle().
+ * NOTE(review): assumes gl_SubgroupInvocationID walks the tile row by row the
+ * same way gl_LocalInvocationIndex does - confirm against the host setup. */
+int dwt_haar_subgroup(int value, int i)
+{
+    /* Offset between valid hor pixels for each level, +1, +2, +4 etc */
+    uint dist = 1u << i;
+
+    /* Horizontal haar wavelet */
+    uint other_sub_id = gl_SubgroupInvocationID ^ dist;
+    bool is_low = gl_SubgroupInvocationID < other_sub_id;
+    int other = subgroupShuffle(value, other_sub_id);
+    int a = is_low ? value : other;
+    int b = is_low ? other : value;
+    int dst_b = (b - a) * (1 << s);
+    int dst_a = a * (1 << s) + ((dst_b + 1) >> 1);
+    value = is_low ? dst_a : dst_b;
+
+    /* Offset between valid ver pixels for each level, +1, +2, +4 etc */
+    dist <<= 3;
+
+    /* Vertical haar wavelet */
+    other_sub_id = gl_SubgroupInvocationID ^ dist;
+    is_low = gl_SubgroupInvocationID < other_sub_id;
+    other = subgroupShuffle(value, other_sub_id);
+    a = is_low ? value : other;
+    b = is_low ? other : value;
+    dst_b = b - a;
+    dst_a = a + ((dst_b + 1) >> 1);
+    return is_low ? dst_a : dst_b;
+}
+
+/* True while this invocation still holds a coefficient at level i */
+bool is_thread_active(int i, ivec2 coord)
+{
+    if (i >= wavelet_depth)
+        return false;
+    ivec2 mask = ivec2((1 << i) - 1);
+    return all(equal(coord & mask, ivec2(0)));
+}
+
+void main() {
+    ivec2 tile_coord = ivec2(gl_WorkGroupID.xy);
+    ivec2 local_coord = ivec2(gl_LocalInvocationIndex & 7, gl_LocalInvocationIndex >> 3);
+    ivec2 coord = tile_coord * ivec2(TILE_DIM) + local_coord;
+    ivec2 dwt_dim = planes[plane_idx].dwt_dim;
+    if (any(greaterThanEqual(coord, dwt_dim))) {
+        return;
+    }
+    int index = dwt_dim.x * coord.y + coord.x;
+    int value = pbuf[plane_idx].coef_buf[index];
+
+    /* A 64-wide subgroup covers the whole 8x8 tile, so 3 levels fit; every
+     * other subgroup size runs 2 levels. */
+    int levels = gl_SubgroupSize == 64 ? 3 : 2;
+    for (int i = 0; i < levels; i++) {
+        if (!is_thread_active(i, local_coord))
+            break;
+        value = dwt_haar_subgroup(value, i);
+    }
+
+    // Store value
+    pbuf[plane_idx].coef_buf[index] = value;
+}
diff --git a/libavcodec/vulkan/vc2_dwt_hor_legall.comp b/libavcodec/vulkan/vc2_dwt_hor_legall.comp
new file mode 100644
index 0000000000..3eece4ab48
--- /dev/null
+++ b/libavcodec/vulkan/vc2_dwt_hor_legall.comp
@@ -0,0 +1,60 @@
+#extension GL_EXT_scalar_block_layout : require
+#extension GL_EXT_buffer_reference : require
+
+struct Plane {
+    ivec2 dim;
+    ivec2 dwt_dim;
+};
+
+layout(push_constant, scalar) uniform ComputeInfo {
+    int s;
+    int diff_offset;
+    int level;
+    Plane planes[3];
+};
+
+/* One invocation per row: global x picks the row, global z picks the plane */
+int image_load(int coord_x)
+{
+    int coord_y = int(gl_GlobalInvocationID.x);
+    return imageLoad(planes0[gl_GlobalInvocationID.z], ivec2(coord_x, coord_y)).x;
+}
+
+void image_store(int coord_x, int value)
+{
+    int coord_y = int(gl_GlobalInvocationID.x);
+    imageStore(planes0[gl_GlobalInvocationID.z], ivec2(coord_x, coord_y), ivec4(value));
+}
+
+void main()
+{
+    int coord_y = int(gl_GlobalInvocationID.x);
+    uint plane_idx = gl_GlobalInvocationID.z;
+    ivec2 work_area = planes[plane_idx].dwt_dim;
+    int dist = 1 << level;
+    int step = 2 * dist;
+    int last = work_area.x - dist;
+
+    /* Skip rows outside the plane or off this level's grid, and rows too
+     * short for one even/odd pair (the edge taps would index below zero) */
+    if (coord_y >= work_area.y || (coord_y & (dist - 1)) != 0 || work_area.x < step)
+        return;
+
+    // Shift in one bit that is used for additional precision
+    for (int x = 0; x < work_area.x; x += dist)
+        image_store(x, image_load(x) << 1);
+
+    // Lifting stage 2: odd samples; the last one reuses its only neighbour
+    for (int x = dist; x < last; x += step) {
+        int sum = image_load(x - dist) + image_load(x + dist);
+        image_store(x, image_load(x) - ((sum + 1) >> 1));
+    }
+    image_store(last, image_load(last) - ((2 * image_load(last - dist) + 1) >> 1));
+
+    // Lifting stage 1: even samples; the first one reuses its only neighbour
+    image_store(0, image_load(0) + ((2 * image_load(dist) + 2) >> 2));
+    for (int x = step; x <= last - dist; x += step) {
+        int sum = image_load(x - dist) + image_load(x + dist);
+        image_store(x, image_load(x) + ((sum + 2) >> 2));
+    }
+}
diff --git a/libavcodec/vulkan/vc2_dwt_upload.comp b/libavcodec/vulkan/vc2_dwt_upload.comp
new file mode 100644
index 0000000000..6de3721d3b
--- /dev/null
+++ b/libavcodec/vulkan/vc2_dwt_upload.comp
@@ -0,0 +1,35 @@
+#extension GL_EXT_scalar_block_layout : require
+#extension GL_EXT_shader_explicit_arithmetic_types : require
+#extension GL_EXT_buffer_reference : require
+
+layout(scalar, buffer_reference, buffer_reference_align = 1) buffer PlaneBuf {
+    uint8_t data[];
+};
+
+struct Plane {
+    ivec2 dim;
+    ivec2 dwt_dim;
+};
+
+layout(push_constant, scalar) uniform ComputeInfo {
+    int s;
+    int diff_offset;
+    int level;
+    Plane planes[3];
+};
+
+void main()
+{
+    ivec2 coord = ivec2(gl_GlobalInvocationID.xy);
+    uint plane_idx = gl_GlobalInvocationID.z;
+    if (any(greaterThanEqual(coord, planes[plane_idx].dwt_dim)))
+        return;
+
+    /* Padding beyond the picture repeats the nearest edge pixel. The last
+     * valid texel is dim - 1 (assuming dim is the picture size in pixels -
+     * TODO confirm); clamping to dim itself would read out of bounds. */
+    ivec2 coord_i = clamp(coord, ivec2(0), planes[plane_idx].dim - 1);
+    uint texel = imageLoad(planes1[plane_idx], coord_i).x;
+    int result = int(texel) - diff_offset;
+    imageStore(planes0[plane_idx], coord, ivec4(result));
+}
diff --git a/libavcodec/vulkan/vc2_dwt_ver_legall.comp b/libavcodec/vulkan/vc2_dwt_ver_legall.comp
new file mode 100644
index 0000000000..28cfb97a7a
--- /dev/null
+++ b/libavcodec/vulkan/vc2_dwt_ver_legall.comp
@@ -0,0 +1,56 @@
+#extension GL_EXT_scalar_block_layout : require
+#extension GL_EXT_buffer_reference : require
+
+struct Plane {
+    ivec2 dim;
+    ivec2 dwt_dim;
+};
+
+layout(push_constant, scalar) uniform ComputeInfo {
+    int s;
+    int diff_offset;
+    int level;
+    Plane planes[3];
+};
+
+/* One invocation per column: global x picks the column, global z the plane */
+int image_load(int coord_y)
+{
+    int coord_x = int(gl_GlobalInvocationID.x);
+    return imageLoad(planes0[gl_GlobalInvocationID.z], ivec2(coord_x, coord_y)).x;
+}
+
+void image_store(int coord_y, int value)
+{
+    int coord_x = int(gl_GlobalInvocationID.x);
+    imageStore(planes0[gl_GlobalInvocationID.z], ivec2(coord_x, coord_y), ivec4(value));
+}
+
+void main()
+{
+    int coord_x = int(gl_GlobalInvocationID.x);
+    uint plane_idx = gl_GlobalInvocationID.z;
+    ivec2 work_area = planes[plane_idx].dwt_dim;
+    int dist = 1 << level;
+    int step = 2 * dist;
+    int last = work_area.y - dist;
+
+    /* Skip columns outside the plane or off this level's grid, and columns
+     * too short for one even/odd pair (the edge taps would index below zero) */
+    if (coord_x >= work_area.x || (coord_x & (dist - 1)) != 0 || work_area.y < step)
+        return;
+
+    // Lifting stage 2: odd samples; the last one reuses its only neighbour
+    for (int y = dist; y < last - dist; y += step) {
+        int sum = image_load(y - dist) + image_load(y + dist);
+        image_store(y, image_load(y) - ((sum + 1) >> 1));
+    }
+    image_store(last, image_load(last) - ((2 * image_load(last - dist) + 1) >> 1));
+
+    // Lifting stage 1: even samples; the first one reuses its only neighbour
+    image_store(0, image_load(0) + ((2 * image_load(dist) + 2) >> 2));
+    for (int y = step; y <= last - dist; y += step) {
+        int sum = image_load(y - dist) + image_load(y + dist);
+        image_store(y, image_load(y) + ((sum + 2) >> 2));
+    }
+}
diff --git a/libavcodec/vulkan/vc2_encode.comp b/libavcodec/vulkan/vc2_encode.comp
new file mode 100644
index
0000000000..da64d9c6d8
--- /dev/null
+++ b/libavcodec/vulkan/vc2_encode.comp
@@ -0,0 +1,173 @@
+#extension GL_EXT_shader_explicit_arithmetic_types : require
+#extension GL_EXT_scalar_block_layout : require
+#extension GL_EXT_buffer_reference : require
+
+#define MAX_DWT_LEVELS (5)
+
+struct SliceArgs {
+    int quant_idx;
+    int bytes;
+    int pb_start;
+    int pad;
+};
+
+struct Plane {
+    ivec2 dim;
+    ivec2 dwt_dim;
+};
+
+layout(std430, buffer_reference, buffer_reference_align = 16) buffer SliceArgBuf {
+    SliceArgs args[];
+};
+layout(scalar, buffer_reference, buffer_reference_align = 1) buffer BitBuf {
+    uint data[];
+};
+layout(scalar, buffer_reference, buffer_reference_align = 4) buffer QuantLuts {
+    int quant[5][4];
+    int ff_dirac_qscale_tab[116];
+};
+
+layout(push_constant, scalar) uniform ComputeInfo {
+    BitBuf bytestream;
+    QuantLuts luts;
+    SliceArgBuf slice;
+    ivec2 num_slices;
+    Plane planes[3];
+    int wavelet_depth;
+    int size_scaler;
+    int prefix_bytes;
+};
+
+/* Write val as an interleaved exp-Golomb code: each bit of val + 1 below its
+ * leading one is preceded by a 0 bit, and a single 1 bit ends the code */
+void put_vc2_ue_uint(inout PutBitContext pb, uint val)
+{
+    int pbits = 0, topbit = 1, maxval = 1, bits = 0;
+    if (val == 0)
+    {
+        put_bits(pb, 1, 1);
+        return;
+    }
+    val++;
+
+    while (val > maxval)
+    {
+        topbit <<= 1;
+        bits++;
+        maxval <<= 1;
+        maxval |= 1;
+    }
+
+    for (int i = 0; i < bits; i++)
+    {
+        topbit >>= 1;
+        pbits <<= 2;
+        if ((val & topbit) != 0)
+            pbits |= 1;
+    }
+
+    put_bits(pb, bits * 2 + 1, (pbits << 1) | 1);
+}
+
+int quants[MAX_DWT_LEVELS][4];
+
+/* Map a sample index inside a subband to its coordinate in the plane image */
+int subband_coord(int index, int h, int lvl)
+{
+    return ((index << 1) | h) << (wavelet_depth - lvl - 1);
+}
+
+void main()
+{
+    int slice_index = int(gl_GlobalInvocationID.x);
+    int max_index = num_slices.x * num_slices.y;
+    if (slice_index >= max_index)
+        return;
+
+    /* Step 2. Quantize and encode */
+    /* pb_start is per workgroup: add the byte totals of all earlier workgroups */
+    int pb_start = slice.args[slice_index].pb_start;
+    int workgroup_x = int(gl_WorkGroupSize.x);
+    for (int i = 0, index = workgroup_x - 1; i < gl_WorkGroupID.x; i++) {
+        pb_start += slice.args[index].pb_start + slice.args[index].bytes;
+        index += workgroup_x;
+    }
+    ivec2 slice_coord = ivec2(slice_index % num_slices.x, slice_index / num_slices.x);
+    int slice_bytes_max = slice.args[slice_index].bytes;
+    int quant_index = slice.args[slice_index].quant_idx;
+
+    PutBitContext pb;
+    init_put_bits(pb, OFFBUF(u8buf, bytestream, pb_start), slice_bytes_max);
+
+    for (int level = 0; level < wavelet_depth; level++)
+        for (int orientation = int(level > 0); orientation < 4; orientation++)
+            quants[level][orientation] = max(quant_index - luts.quant[level][orientation], 0);
+
+    /* The slice size pass budgets prefix_bytes ahead of the quant index;
+     * emit them as zero bytes so the slice layout matches that budget */
+    for (int i = 0; i < prefix_bytes; i++)
+        put_bits(pb, 8, 0);
+
+    /* Write quant index for this slice */
+    put_bits(pb, 8, quant_index);
+
+    /* Luma + 2 Chroma planes */
+    for (int p = 0; p < 3; p++)
+    {
+        int bytes_start = put_bytes_count(pb);
+
+        /* Save current location and write a zero value */
+        uint64_t write_ptr_start = pb.buf;
+        int bit_left_start = pb.bit_left;
+        put_bits(pb, 8, 0);
+
+        for (int level = 0; level < wavelet_depth; level++)
+        {
+            ivec2 band_size = planes[p].dwt_dim >> (wavelet_depth - level);
+            for (int o = int(level > 0); o < 4; o++)
+            {
+                /* Encode subband */
+                int left = band_size.x * (slice_coord.x) / num_slices.x;
+                int right = band_size.x * (slice_coord.x+1) / num_slices.x;
+                int top = band_size.y * (slice_coord.y) / num_slices.y;
+                int bottom = band_size.y * (slice_coord.y+1) / num_slices.y;
+
+                const int q_idx = quants[level][o];
+                const int qfactor = luts.ff_dirac_qscale_tab[q_idx];
+
+                const int yh = o >> 1;
+                const int xh = o & 1;
+
+                for (int y = top; y < bottom; y++)
+                {
+                    for (int x = left; x < right; x++)
+                    {
+                        int sx = subband_coord(x, xh, level);
+                        int sy = subband_coord(y, yh, level);
+                        int coef = imageLoad(planes0[p], ivec2(sx, sy)).x;
+                        uint c_abs = uint(abs(coef));
+                        c_abs = (c_abs << 2) / qfactor;
+                        put_vc2_ue_uint(pb, c_abs);
+                        if (c_abs != 0)
+                            put_bits(pb, 1, int(coef < 0));
+                    }
+                }
+            }
+        }
+        flush_put_bits(pb);
+        int bytes_len = put_bytes_count(pb) - bytes_start - 1;
+
+        /* The last plane also absorbs whatever is left of the slice budget */
+        int padded_len = bytes_len;
+        if (p == 2)
+            padded_len += slice_bytes_max - put_bytes_count(pb);
+        int pad_s = align(padded_len, size_scaler)/size_scaler;
+        int pad_c = (pad_s*size_scaler) - bytes_len;
+
+        /* Patch the length byte that was reserved at the start of this plane */
+        uint64_t start_ptr = write_ptr_start + ((BUF_BITS - bit_left_start) >> 3);
+        u8buf(start_ptr).v = uint8_t(pad_s);
+        /* vc2-reference uses that padding that decodes to '0' coeffs */
+        skip_put_bytes(pb, pad_c);
+    }
+}
diff --git a/libavcodec/vulkan/vc2_slice_sizes.comp b/libavcodec/vulkan/vc2_slice_sizes.comp
new file mode 100644
index 0000000000..9c048f3664
--- /dev/null
+++ b/libavcodec/vulkan/vc2_slice_sizes.comp
@@ -0,0 +1,183 @@
+#extension GL_EXT_shader_explicit_arithmetic_types : require
+#extension GL_EXT_scalar_block_layout : require
+#extension GL_EXT_buffer_reference : require
+
+#define DIRAC_MAX_QUANT_INDEX 116
+#define MAX_DWT_LEVELS 5
+
+struct SliceArgs {
+    int quant_idx;
+    int bytes;
+    int pb_start;
+    int pad;
+};
+
+struct Plane {
+    ivec2 dim;
+    ivec2 dwt_dim;
+};
+
+layout(std430, buffer_reference, buffer_reference_align = 16) buffer SliceArgBuf {
+    SliceArgs args[];
+};
+layout(scalar, buffer_reference, buffer_reference_align = 4) buffer QuantLuts {
+    int quant[5][4];
+    int ff_dirac_qscale_tab[116];
+};
+
+layout(push_constant, scalar) uniform ComputeInfo {
+    QuantLuts luts;
+    SliceArgBuf slice;
+    ivec2 num_slices;
+    Plane planes[3];
+    int wavelet_depth;
+    int size_scaler;
+    int prefix_bytes;
+    int bits_ceil;
+    int bits_floor;
+};
+
+/* Bit length of the interleaved exp-Golomb code that encodes val */
+int count_vc2_ue_uint(uint val)
+{
+    /* Two bits per bit of val + 1 below its leading one, plus the stop bit */
+    return 2 * findMSB(val + 1) + 1;
+}
+
+int cache[DIRAC_MAX_QUANT_INDEX];
+int quants[MAX_DWT_LEVELS][4];
+shared int slice_sizes[gl_WorkGroupSize.x];
+
+int subband_coord(int index, int h, int lvl)
+{
+    return ((index << 1) | h) << (wavelet_depth - lvl - 1);
+}
+
+/* Size in bits of this invocation's slice at quant_index, memoised in cache[] */
+int count_hq_slice(int quant_index)
+{
+    int bits = 0;
+    if (cache[quant_index] != 0)
+        return cache[quant_index];
+
+    bits += 8*prefix_bytes;
+    bits += 8; /* quant_idx */
+
+    for (int level = 0; level < wavelet_depth; level++)
+        for (int orientation = int(level > 0); orientation < 4; orientation++)
+            quants[level][orientation] = max(quant_index - luts.quant[level][orientation], 0);
+
+    int slice_index = int(gl_GlobalInvocationID.x);
+    ivec2 slice_coord = ivec2(slice_index % num_slices.x, slice_index / num_slices.x);
+    for (int p = 0; p < 3; p++)
+    {
+        int bytes_start = bits >> 3;
+        bits += 8;
+
+        for (int level = 0; level < wavelet_depth; level++)
+        {
+            ivec2 band_dim = planes[p].dwt_dim >> (wavelet_depth - level);
+            for (int o = int(level > 0); o < 4; o++)
+            {
+                const int left = band_dim.x * slice_coord.x / num_slices.x;
+                const int right = band_dim.x * (slice_coord.x+1) / num_slices.x;
+                const int top = band_dim.y * slice_coord.y / num_slices.y;
+                const int bottom = band_dim.y * (slice_coord.y+1) / num_slices.y;
+
+                const int q_idx = quants[level][o];
+                const int qfactor = luts.ff_dirac_qscale_tab[q_idx];
+
+                const int yh = o >> 1;
+                const int xh = o & 1;
+
+                for (int y = top; y < bottom; y++)
+                {
+                    for (int x = left; x < right; x++)
+                    {
+                        int sx = subband_coord(x, xh, level);
+                        int sy = subband_coord(y, yh, level);
+                        int coef = imageLoad(planes0[p], ivec2(sx, sy)).x;
+                        uint c_abs = uint(abs(coef));
+                        c_abs = (c_abs << 2) / qfactor;
+                        bits += count_vc2_ue_uint(c_abs);
+                        bits += int(c_abs > 0);
+                    }
+                }
+            }
+        }
+        bits = align(bits, 8);
+        int bytes_len = (bits >> 3) - bytes_start - 1;
+        int pad_s = align(bytes_len, size_scaler) / size_scaler;
+        int pad_c = (pad_s * size_scaler) - bytes_len;
+        bits += pad_c * 8;
+    }
+
+    cache[quant_index] = bits;
+    return bits;
+}
+
+int ssize_round(int b)
+{
+    return align(b, size_scaler) + 4 + prefix_bytes;
+}
+
+/* Rate control for one slice: step quant_idx until the coded size falls in
+ * [bits_floor, bits_ceil] or the search starts to oscillate, then store the
+ * chosen quantiser and slice size. Returns the slice size in bytes. */
+int fit_slice(int slice_index)
+{
+    for (int i = 0; i < DIRAC_MAX_QUANT_INDEX; i++)
+        cache[i] = 0;
+
+    const int q_ceil = DIRAC_MAX_QUANT_INDEX;
+    const int top = bits_ceil;
+    const int bottom = bits_floor;
+    int quant_buf[2] = int[2](-1, -1);
+    /* Clamp the seed: it indexes cache[] inside count_hq_slice() */
+    int quant = clamp(slice.args[slice_index].quant_idx, 0, q_ceil-1);
+    int step = 1;
+    int bits_last = 0;
+    int bits = count_hq_slice(quant);
+    while ((bits > top) || (bits < bottom))
+    {
+        const int signed_step = bits > top ? +step : -step;
+        quant = clamp(quant + signed_step, 0, q_ceil-1);
+        bits = count_hq_slice(quant);
+        if (quant_buf[1] == quant)
+        {
+            quant = max(quant_buf[0], quant);
+            bits = quant == quant_buf[0] ? bits_last : bits;
+            break;
+        }
+        step = clamp(step / 2, 1, (q_ceil - 1) / 2);
+        quant_buf[1] = quant_buf[0];
+        quant_buf[0] = quant;
+        bits_last = bits;
+    }
+    int bytes = ssize_round(bits >> 3);
+    slice.args[slice_index].quant_idx = clamp(quant, 0, q_ceil-1);
+    slice.args[slice_index].bytes = bytes;
+    return bytes;
+}
+
+void main()
+{
+    int slice_index = int(gl_GlobalInvocationID.x);
+    int max_index = num_slices.x * num_slices.y;
+    bool in_range = slice_index < max_index;
+
+    /* No early return before barrier(): surplus invocations publish 0 bytes */
+    int bytes = 0;
+    if (in_range)
+        bytes = fit_slice(slice_index);
+    slice_sizes[gl_LocalInvocationIndex] = bytes;
+    barrier();
+    if (!in_range)
+        return;
+
+    /* Prefix sum for all slices in current workgroup */
+    int total_bytes = 0;
+    for (int i = 0; i < gl_LocalInvocationIndex; i++)
+        total_bytes += slice_sizes[i];
+    slice.args[slice_index].pb_start = total_bytes;
+}
-- 
2.48.1

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
next prev parent reply other threads:[~2025-03-08 12:22 UTC|newest] Thread overview: 8+ messages / expand[flat|nested] mbox.gz Atom feed top 2025-03-08 12:21 [FFmpeg-devel] [PATCH 1/4] libavcodec/vc2enc: Split out common functions between software and hardware encoders IndecisiveTurtle 2025-03-08 12:21 ` [FFmpeg-devel] [PATCH 2/4] libavcodec/vulkan: Add modifications to common shader for VC2 vulkan encoder IndecisiveTurtle 2025-03-08 12:21 ` IndecisiveTurtle [this message] 2025-03-08 12:21 ` [FFmpeg-devel] [PATCH 4/4] avcodec/vc2enc: Initial vulkan VC2 encoder Implements a Vulkan based dirac encoder. Supports Haar and Legall wavelets and should work with all wavelet depths IndecisiveTurtle 2025-03-08 13:13 ` [FFmpeg-devel] [PATCH 1/4] libavcodec/vc2enc: Split out common functions between software and hardware encoders Andreas Rheinhardt 2025-03-08 13:52 ` IndecisiveTurtle 2025-03-08 14:05 ` Andreas Rheinhardt 2025-03-08 14:22 ` IndecisiveTurtle
Reply instructions: You may reply publicly to this message via plain-text email using any one of the following methods: * Save the following mbox file, import it into your mail client, and reply-to-all from there: mbox Avoid top-posting and favor interleaved quoting: https://en.wikipedia.org/wiki/Posting_style#Interleaved_style * Reply using the --to, --cc, and --in-reply-to switches of git-send-email(1): git send-email \ --in-reply-to=20250308122140.59850-4-47210458+raphaelthegreat@users.noreply.github.com \ --to=geoster3d@gmail.com \ --cc=47210458+raphaelthegreat@users.noreply.github.com \ --cc=ffmpeg-devel@ffmpeg.org \ /path/to/YOUR_REPLY https://kernel.org/pub/software/scm/git/docs/git-send-email.html * If your mail client supports setting the In-Reply-To header via mailto: links, try the mailto: link
Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel This inbox may be cloned and mirrored by anyone: git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git # If you have public-inbox 1.1+ installed, you may # initialize and index your mirror using the following commands: public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \ ffmpegdev@gitmailbox.com public-inbox-index ffmpegdev Example config snippet for mirrors. AGPL code for this site: git clone https://public-inbox.org/public-inbox.git