From: Lynne <dev@lynne.ee> To: ffmpeg-devel@ffmpeg.org Cc: Lynne <dev@lynne.ee> Subject: [FFmpeg-devel] [PATCH 18/18] vulkan_ffv1: add cached symbol reader for AMD Date: Sat, 12 Apr 2025 09:22:49 +0200 Message-ID: <20250412072256.77815-18-dev@lynne.ee> (raw) In-Reply-To: <20250412072256.77815-1-dev@lynne.ee> Speeds up everything on AMD by 3x. This uses 32 local invocations to load state into cache, as well as to do the RCT faster. --- libavcodec/vulkan/ffv1_dec.comp | 71 ++++++++++++++++++++------------- libavcodec/vulkan_ffv1.c | 7 +++- 2 files changed, 50 insertions(+), 28 deletions(-) diff --git a/libavcodec/vulkan/ffv1_dec.comp b/libavcodec/vulkan/ffv1_dec.comp index 9eba322b27..3c46ee1771 100644 --- a/libavcodec/vulkan/ffv1_dec.comp +++ b/libavcodec/vulkan/ffv1_dec.comp @@ -108,34 +108,37 @@ ivec2 get_pred(ivec2 sp, ivec2 off, int p, int sw, uint8_t quant_table_idx) #endif #ifndef GOLOMB -int get_isymbol(inout RangeCoder c, uint64_t state) +#ifdef CACHED_SYMBOL_READER +shared uint8_t state[CONTEXT_SIZE]; +#define READ(c, off) get_rac_direct(c, state[off]) +#else +#define READ(c, off) get_rac(c, uint64_t(slice_state) + state_off + off) +#endif + +int get_isymbol(inout RangeCoder c, uint state_off) { - if (expectEXT(get_rac(c, state), false)) + if (expectEXT(READ(c, 0), false)) return 0; - state += 1; - - int e; - for (e = 0; e < 32; e++) - if (!get_rac(c, state + min(e, 9))) + int e = 1; + for (; e < 33; e++) + if (!READ(c, min(e, 10))) break; - if (expectEXT(e == 0, false)) { - return get_rac(c, state + 10) ? -1 : 1; - } else if (expectEXT(e > 31, false)) { + if (expectEXT(e == 1, false)) { + return READ(c, 11) ? -1 : 1; + } else if (expectEXT(e == 33, false)) { corrupt = true; return 0; } - state += 21; - int a = 1; - for (int i = e - 1; i >= 0; i--) { + for (int i = e + 20; i >= 22; i--) { a <<= 1; - a |= int(get_rac(c, state + min(i, 9))); // 22..31 + a |= int(READ(c, min(i, 31))); } - return get_rac(c, state - 11 + min(e, 10)) ? 
-a : a; + return READ(c, min(e + 10, 21)) ? -a : a; } void decode_line_pcm(inout SliceContext sc, ivec2 sp, int w, int y, int p, int bits) @@ -157,7 +160,7 @@ void decode_line_pcm(inout SliceContext sc, ivec2 sp, int w, int y, int p, int b } void decode_line(inout SliceContext sc, ivec2 sp, int w, - int y, int p, int bits, uint64_t state, + int y, int p, int bits, uint state_off, uint8_t quant_table_idx, const int run_index) { #ifndef RGB @@ -171,19 +174,33 @@ void decode_line(inout SliceContext sc, ivec2 sp, int w, ivec2 pr = get_pred(sp, ivec2(x, y), p, w, quant_table_idx); - int diff = get_isymbol(sc.c, state + CONTEXT_SIZE*abs(pr[0])); - if (pr[0] < 0) - diff = -diff; + uint context_off = state_off + CONTEXT_SIZE*abs(pr[0]); +#ifdef CACHED_SYMBOL_READER + u8buf sb = u8buf(uint64_t(slice_state) + context_off + gl_LocalInvocationID.x); + state[gl_LocalInvocationID.x] = sb.v; + barrier(); + if (gl_LocalInvocationID.x == 0) { - uint v = zero_extend(pr[1] + diff, bits); - imageStore(dec[p], sp + LADDR(ivec2(x, y)), uvec4(v)); +#endif + + int diff = get_isymbol(sc.c, context_off); + if (pr[0] < 0) + diff = -diff; + + uint v = zero_extend(pr[1] + diff, bits); + imageStore(dec[p], sp + LADDR(ivec2(x, y)), uvec4(v)); + +#ifdef CACHED_SYMBOL_READER + } + sb.v = state[gl_LocalInvocationID.x]; +#endif } } #else /* GOLOMB */ void decode_line(inout SliceContext sc, ivec2 sp, int w, - int y, int p, int bits, uint64_t state, + int y, int p, int bits, uint state_off, uint8_t quant_table_idx, inout int run_index) { #ifndef RGB @@ -202,7 +219,7 @@ void decode_line(inout SliceContext sc, ivec2 sp, int w, ivec2 pr = get_pred(sp, ivec2(x, y), p, w, quant_table_idx); - VlcState sb = VlcState(state + VLC_STATE_SIZE*abs(pr[0])); + VlcState sb = VlcState(uint64_t(slice_state) + state_off + VLC_STATE_SIZE*abs(pr[0])); if (pr[0] == 0 && run_mode == 0) run_mode = 1; @@ -263,7 +280,7 @@ ivec4 transform_sample(ivec4 pix, ivec2 rct_coef) void writeout_rgb(in SliceContext sc, ivec2 sp, int w, 
int y, bool apply_rct) { - for (int x = 0; x < w; x++) { + for (uint x = gl_LocalInvocationID.x; x < w; x += gl_WorkGroupSize.x) { ivec2 lpos = sp + LADDR(ivec2(x, y)); ivec2 pos = sc.slice_pos + ivec2(x, y); @@ -305,6 +322,8 @@ void decode_slice(inout SliceContext sc, const uint slice_idx) /* PCM coding */ #ifndef GOLOMB if (sc.slice_coding_mode == 1) { + if (gl_LocalInvocationID.x > 0) + return; #ifndef RGB for (int p = 0; p < planes; p++) { int h = sc.slice_dim.y; @@ -328,9 +347,7 @@ void decode_slice(inout SliceContext sc, const uint slice_idx) #endif { u8vec4 quant_table_idx = sc.quant_table_idx.xyyz; - u64vec4 slice_state_off = (uint64_t(slice_state) + - slice_idx*plane_state_size*codec_planes) + - plane_state_size*uvec4(0, 1, 1, 2); + u32vec4 slice_state_off = (slice_idx*codec_planes + uvec4(0, 1, 1, 2))*plane_state_size; #ifndef RGB for (int p = 0; p < planes; p++) { diff --git a/libavcodec/vulkan_ffv1.c b/libavcodec/vulkan_ffv1.c index c1875711bc..33c4e9114d 100644 --- a/libavcodec/vulkan_ffv1.c +++ b/libavcodec/vulkan_ffv1.c @@ -823,12 +823,14 @@ static int init_decode_shader(FFV1Context *f, FFVulkanContext *s, uint8_t *spv_data; size_t spv_len; void *spv_opaque = NULL; + int use_cached_reader = ac != AC_GOLOMB_RICE && + s->driver_props.driverID == VK_DRIVER_ID_MESA_RADV; RET(ff_vk_shader_init(s, shd, "ffv1_dec", VK_SHADER_STAGE_COMPUTE_BIT, (const char *[]) { "GL_EXT_buffer_reference", "GL_EXT_buffer_reference2" }, 2, - 1, 1, 1, + use_cached_reader ? 
32 : 1, 1, 1, 0)); if (ac == AC_GOLOMB_RICE) @@ -837,6 +839,9 @@ static int init_decode_shader(FFV1Context *f, FFVulkanContext *s, if (rgb) av_bprintf(&shd->src, "#define RGB\n"); + if (use_cached_reader) + av_bprintf(&shd->src, "#define CACHED_SYMBOL_READER 1\n"); + /* Common codec header */ GLSLD(ff_source_common_comp); -- 2.47.2 _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
next prev parent reply other threads:[~2025-04-12 7:26 UTC|newest] Thread overview: 20+ messages / expand[flat|nested] mbox.gz Atom feed top 2025-04-12 7:22 [FFmpeg-devel] [PATCH 01/18] hwcontext_vulkan: disable descriptor buffer extension on Intel Lynne 2025-04-12 7:22 ` [FFmpeg-devel] [PATCH 02/18] vulkan_ffv1: enable acceleration " Lynne 2025-04-12 7:22 ` [FFmpeg-devel] [PATCH 03/18] vulkan_ffv1: remove unused define Lynne 2025-04-12 7:22 ` [FFmpeg-devel] [PATCH 04/18] vulkan_ffv1: slightly optimize the range decoder Lynne 2025-04-12 7:22 ` [FFmpeg-devel] [PATCH 05/18] vulkan_ffv1: optimize symbol reader Lynne 2025-04-12 7:22 ` [FFmpeg-devel] [PATCH 06/18] vulkan_ffv1: allocate just as much memory for slice state as needed Lynne 2025-04-12 7:22 ` [FFmpeg-devel] [PATCH 07/18] vulkan_ffv1: init overread/corrupt fields Lynne 2025-04-12 7:22 ` [FFmpeg-devel] [PATCH 08/18] vulkan_ffv1: fallback to upload if mapping packet fails, fix fallback Lynne 2025-04-12 7:22 ` [FFmpeg-devel] [PATCH 09/18] vulkan_ffv1: fix reset shader dependencies Lynne 2025-04-12 7:22 ` [FFmpeg-devel] [PATCH 10/18] vulkan_ffv1: improve buffer barrier correctness for slice state Lynne 2025-04-12 7:22 ` [FFmpeg-devel] [PATCH 11/18] vulkan_ffv1: fix left-2 sample addressing Lynne 2025-04-12 7:22 ` [FFmpeg-devel] [PATCH 12/18] vulkan_ffv1: cache only 2 lines when decoding RGB Lynne 2025-04-12 7:22 ` [FFmpeg-devel] [PATCH 13/18] ffv1/vulkan: redo context count tracking and quant_table_idx management Lynne 2025-04-13 20:39 ` Jerome Martinez 2025-04-12 7:22 ` [FFmpeg-devel] [PATCH 14/18] vulkan_ffv1: externalize extended lookup check Lynne 2025-04-12 7:22 ` [FFmpeg-devel] [PATCH 15/18] vulkan_ffv1: remove need for scratch data during setup Lynne 2025-04-12 7:22 ` [FFmpeg-devel] [PATCH 16/18] vulkan_ffv1: shortcut +-1 coeffs in symbol reading Lynne 2025-04-12 7:22 ` [FFmpeg-devel] [PATCH 17/18] vulkan: add support for expect/assume Lynne 2025-04-12 7:22 ` Lynne [this message] 2025-04-13 13:38 ` 
[FFmpeg-devel] [PATCH 01/18] hwcontext_vulkan: disable descriptor buffer extension on Intel Jerome Martinez
Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20250412072256.77815-18-dev@lynne.ee \
    --to=dev@lynne.ee \
    --cc=ffmpeg-devel@ffmpeg.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

This inbox may be cloned and mirrored by anyone:

	git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
		ffmpegdev@gitmailbox.com
	public-inbox-index ffmpegdev

Example config snippet for mirrors.

AGPL code for this site: git clone https://public-inbox.org/public-inbox.git