From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from ffbox0-bg.mplayerhq.hu (ffbox0-bg.ffmpeg.org [79.124.17.100]) by master.gitmailbox.com (Postfix) with ESMTPS id EA1C44EECB for ; Wed, 14 May 2025 19:05:00 +0000 (UTC) Received: from [127.0.1.1] (localhost [127.0.0.1]) by ffbox0-bg.mplayerhq.hu (Postfix) with ESMTP id 21EE768C339; Wed, 14 May 2025 22:03:18 +0300 (EEST) Received: from vidala.pars.ee (vidala.pars.ee [116.203.72.101]) by ffbox0-bg.mplayerhq.hu (Postfix) with ESMTPS id 4A4A168BF86 for ; Wed, 14 May 2025 22:02:57 +0300 (EEST) DKIM-Signature: v=1; a=rsa-sha256; s=202405r; d=lynne.ee; c=relaxed/relaxed; h=Message-ID:Date:Subject:To:From; t=1747249376; bh=++jSUnJB/Q9+sMuOR9ahZPk KXzZme/VEvG3Kh639Hqk=; b=EMuWlNBHB5Unyxzrjr0JGzwFBStD9y83DdFi2JUcTkOUV/XObt 0yExhidxSZqgaMMx+d+A7heEVoh2UDZjVp8v2wiTeCxQM/GO28sB47WkcIPdKGKG9JLHopS0EVn ntsv+5jU0BaPWiV1nidEI2lmqakE9HFuTeSb3oaNGYIiXsb6erYaNwWJQDQt6yWmDNqMjlFpWiE NcJlpfNVOauam6C9Znz6+8TGmUbQV+UdiOjlroDxehrSuptlF+VAaMfAY7nm2kKle729PwUwrUC t+qAal8fWNtIDvZUihZ98H6Wu0X8NrEkCfkRx61YgFb2XSrQ+O6EJJbQtSkxK+HTQyg==; DKIM-Signature: v=1; a=ed25519-sha256; s=202405e; d=lynne.ee; c=relaxed/relaxed; h=Message-ID:Date:Subject:To:From; t=1747249376; bh=++jSUnJB/Q9+sMuOR9ahZPk KXzZme/VEvG3Kh639Hqk=; b=y7jMTOdpK+hl6I4MhWmftkuXy4UavVA28pc1zaFQQ9ii2fjd+N oc5jGVJmGnWcPLLJjh6I2H3/YMgnWQvrjXBg==; From: Lynne To: ffmpeg-devel@ffmpeg.org Date: Wed, 14 May 2025 21:02:39 +0200 Message-ID: <20250514190253.162819-10-dev@lynne.ee> X-Mailer: git-send-email 2.49.0.395.g12beb8f557c In-Reply-To: <20250514190253.162819-1-dev@lynne.ee> References: <20250514190253.162819-1-dev@lynne.ee> MIME-Version: 1.0 Subject: [FFmpeg-devel] [PATCH 10/16] ffv1enc_vulkan: implement the cached EC writer from the decoder X-BeenThere: ffmpeg-devel@ffmpeg.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: FFmpeg development discussions and patches List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Reply-To: FFmpeg development discussions and patches Cc: Lynne Content-Type: text/plain; charset="us-ascii" Content-Transfer-Encoding: 7bit Errors-To: ffmpeg-devel-bounces@ffmpeg.org Sender: "ffmpeg-devel" Archived-At: List-Archive: List-Post: This gives a 35% speedup on AMD and 50% on Nvidia. --- libavcodec/ffv1enc_vulkan.c | 6 ++- libavcodec/vulkan/ffv1_enc.comp | 68 ++++++++++++++++++++++----------- 2 files changed, 50 insertions(+), 24 deletions(-) diff --git a/libavcodec/ffv1enc_vulkan.c b/libavcodec/ffv1enc_vulkan.c index c2eb73ca53..5de16d5b02 100644 --- a/libavcodec/ffv1enc_vulkan.c +++ b/libavcodec/ffv1enc_vulkan.c @@ -1099,12 +1099,13 @@ static int init_encode_shader(AVCodecContext *avctx, FFVkSPIRVCompiler *spv) uint8_t *spv_data; size_t spv_len; void *spv_opaque = NULL; + int use_cached_reader = fv->ctx.ac != AC_GOLOMB_RICE; RET(ff_vk_shader_init(&fv->s, shd, "ffv1_enc", VK_SHADER_STAGE_COMPUTE_BIT, (const char *[]) { "GL_EXT_buffer_reference", "GL_EXT_buffer_reference2" }, 2, - 1, 1, 1, + use_cached_reader ? CONTEXT_SIZE : 1, 1, 1, 0)); /* Common codec header */ @@ -1116,6 +1117,9 @@ static int init_encode_shader(AVCodecContext *avctx, FFVkSPIRVCompiler *spv) av_bprintf(&shd->src, "#define MAX_CONTEXT_INPUTS %i\n", MAX_CONTEXT_INPUTS); av_bprintf(&shd->src, "#define MAX_QUANT_TABLE_SIZE %i\n", MAX_QUANT_TABLE_SIZE); + if (use_cached_reader) + av_bprintf(&shd->src, "#define CACHED_SYMBOL_READER 1\n"); + desc_set = (FFVulkanDescriptorSetBinding []) { { .name = "rangecoder_static_buf", diff --git a/libavcodec/vulkan/ffv1_enc.comp b/libavcodec/vulkan/ffv1_enc.comp index db33c414e1..65a7df1359 100644 --- a/libavcodec/vulkan/ffv1_enc.comp +++ b/libavcodec/vulkan/ffv1_enc.comp @@ -21,27 +21,32 @@ */ #ifndef GOLOMB +#ifdef CACHED_SYMBOL_READER +shared uint8_t state[CONTEXT_SIZE]; +#define WRITE(c, off, val) put_rac_direct(c, state[off], val) +#else +#define WRITE(c, off, val) put_rac(c, uint64_t(slice_state) + (state_off + off), val) +#endif + /* Note - only handles signed values */ -void put_symbol(inout RangeCoder c, uint64_t state, int v) +void put_symbol(inout RangeCoder c, uint state_off, int v) { bool is_nil = (v == 0); - put_rac(c, state, is_nil); + WRITE(c, 0, is_nil); if (is_nil) return; const int a = abs(v); const int e = findMSB(a); - state += 1; for (int i = 0; i < e; i++) - put_rac(c, state + min(i, 9), true); - put_rac(c, state + min(e, 9), false); + WRITE(c, 1 + min(i, 9), true); + WRITE(c, 1 + min(e, 9), false); - state += 21; for (int i = e - 1; i >= 0; i--) - put_rac(c, state + min(i, 9), bool(bitfieldExtract(a, i, 1))); + WRITE(c, 22 + min(i, 9), bool(bitfieldExtract(a, i, 1))); - put_rac(c, state - 11 + min(e, 10), v < 0); + WRITE(c, 22 - 11 + min(e, 10), v < 0); } void encode_line_pcm(inout SliceContext sc, readonly uimage2D img, @@ -49,6 +54,11 @@ void encode_line_pcm(inout SliceContext sc, readonly uimage2D img, { int w = sc.slice_dim.x; +#ifdef CACHED_SYMBOL_READER + if (gl_LocalInvocationID.x > 0) + return; +#endif + #ifndef RGB if (p > 0 && p < 3) { w >>= chroma_shift.x; @@ -63,7 +73,7 @@ void encode_line_pcm(inout SliceContext sc, readonly uimage2D img, } } -void encode_line(inout SliceContext sc, readonly uimage2D img, uint64_t state, +void encode_line(inout SliceContext sc, readonly uimage2D img, uint state_off, ivec2 sp, int y, int p, int comp, int bits, uint8_t quant_table_idx, const int run_index) { @@ -86,13 +96,25 @@ void encode_line(inout SliceContext sc, readonly uimage2D img, uint64_t state, d[1] = fold(d[1], bits); - put_symbol(sc.c, state + CONTEXT_SIZE*d[0], d[1]); + uint context_off = state_off + CONTEXT_SIZE*d[0]; +#ifdef CACHED_SYMBOL_READER + u8buf sb = u8buf(uint64_t(slice_state) + context_off + gl_LocalInvocationID.x); + state[gl_LocalInvocationID.x] = sb.v; + barrier(); + if (gl_LocalInvocationID.x == 0) +#endif + + put_symbol(sc.c, context_off, d[1]); + +#ifdef CACHED_SYMBOL_READER + sb.v = state[gl_LocalInvocationID.x]; +#endif } } #else /* GOLOMB */ -void encode_line(inout SliceContext sc, readonly uimage2D img, uint64_t state, +void encode_line(inout SliceContext sc, readonly uimage2D img, uint state_off, ivec2 sp, int y, int p, int comp, int bits, uint8_t quant_table_idx, inout int run_index) { @@ -143,7 +165,7 @@ void encode_line(inout SliceContext sc, readonly uimage2D img, uint64_t state, } if (!run_mode) { - VlcState sb = VlcState(state + VLC_STATE_SIZE*d[0]); + VlcState sb = VlcState(uint64_t(slice_state) + state_off + VLC_STATE_SIZE*d[0]); Symbol sym = get_vlc_symbol(sb, d[1], bits); put_bits(sc.pb, sym.bits, sym.val); } @@ -245,8 +267,7 @@ void encode_slice(inout SliceContext sc, const uint slice_idx) #endif { u8vec4 quant_table_idx = sc.quant_table_idx.xyyz; - uint64_t slice_state_off = uint64_t(slice_state) + - slice_idx*plane_state_size*codec_planes; + u32vec4 slice_state_off = (slice_idx*codec_planes + uvec4(0, 1, 1, 2))*plane_state_size; #ifndef RGB for (int c = 0; c < components; c++) { @@ -260,26 +281,22 @@ void encode_slice(inout SliceContext sc, const uint slice_idx) int comp = c - p; for (int y = 0; y < h; y++) - encode_line(sc, src[p], slice_state_off, sp, y, p, + encode_line(sc, src[p], slice_state_off[c], sp, y, p, comp, bits, quant_table_idx[c], run_index); - - /* For the second chroma plane, reuse the first plane's state */ - if (c != 1) - slice_state_off += plane_state_size; } #else int run_index = 0; for (int y = 0; y < sc.slice_dim.y; y++) { preload_rgb(sc, sp, sc.slice_dim.x, y, true); - encode_line(sc, tmp, slice_state_off + plane_state_size*0, + encode_line(sc, tmp, slice_state_off[0], sp, y, 0, 1, bits, quant_table_idx[0], run_index); - encode_line(sc, tmp, slice_state_off + plane_state_size*1, + encode_line(sc, tmp, slice_state_off[1], sp, y, 0, 2, bits, quant_table_idx[1], run_index); - encode_line(sc, tmp, slice_state_off + plane_state_size*1, + encode_line(sc, tmp, slice_state_off[2], sp, y, 0, 0, bits, quant_table_idx[2], run_index); if (transparency == 1) - encode_line(sc, tmp, slice_state_off + plane_state_size*2, + encode_line(sc, tmp, slice_state_off[3], sp, y, 0, 3, bits, quant_table_idx[3], run_index); } #endif @@ -288,6 +305,11 @@ void encode_slice(inout SliceContext sc, const uint slice_idx) void finalize_slice(inout SliceContext sc, const uint slice_idx) { +#ifdef CACHED_SYMBOL_READER + if (gl_LocalInvocationID.x > 0) + return; +#endif + #ifdef GOLOMB uint32_t enc_len = sc.hdr_len + flush_put_bits(sc.pb); #else -- 2.49.0.395.g12beb8f557c _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".