[FFmpeg-devel] [PATCH] libavcodec/vulkan: cache bitstream in shared memory (PR #21067)

Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
 help / color / mirror / Atom feed

From: averne via ffmpeg-devel <ffmpeg-devel@ffmpeg.org>
To: ffmpeg-devel@ffmpeg.org
Cc: averne <code@ffmpeg.org>
Subject: [FFmpeg-devel] [PATCH] libavcodec/vulkan: cache bitstream in shared memory (PR #21067)
Date: Sun, 30 Nov 2025 20:15:28 -0000
Message-ID: <176453372863.39.14202436331499423423@2cb04c0e5124> (raw)

PR #21067 opened by averne
URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/21067
Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/21067.patch

This introduces a small per-invocation buffer in shared memory that stores bitstream data and decreases the number of global memory reads. The bitstream read head is first aligned to a 4-byte boundary, so that a u32vec4 can be used to perform the read. On architectures that support it (AMD and NVIDIA; I have not checked others), this compiles to a single instruction.
It can probably be used in other VK-based decoders, with minimal modifications.

On the ProRes decoder, I measured the following speedup (5760x3040 422p10, 3840x2160 422p10):
NVIDIA 3050: 65%, 87%
AMD 6700XT: 16%, 22%
Intel Tiger Lake GT2: 6%, 4%


>From ef7354d471c18ec5c998220eaa4a95bdd36ccb6a Mon Sep 17 00:00:00 2001
From: averne <averne381@gmail.com>
Date: Sat, 29 Nov 2025 22:33:26 +0100
Subject: [PATCH 1/3] libavcodec/vulkan: introduce cached bitstream reader

This stores a small buffer (16 bytes) in shared memory per decode thread,
which helps reduce the number of memory accesses.
The bitstream read position is first aligned to a 4-byte boundary, so
that the buffer can be filled with a single memory request.
---
 libavcodec/vulkan/common.comp | 42 +++++++++++++++++++++++++++++++++++
 1 file changed, 42 insertions(+)

diff --git a/libavcodec/vulkan/common.comp b/libavcodec/vulkan/common.comp
index eda92ce28d..4b71dfd2f4 100644
--- a/libavcodec/vulkan/common.comp
+++ b/libavcodec/vulkan/common.comp
@@ -42,6 +42,10 @@ layout(buffer_reference, buffer_reference_align = 4) buffer u32vec2buf {
     u32vec2 v;
 };
 
+layout(buffer_reference, buffer_reference_align = 4) buffer u32vec4buf {
+    u32vec4 v;
+};
+
 layout(buffer_reference, buffer_reference_align = 8) buffer u64buf {
     uint64_t v;
 };
@@ -198,8 +202,12 @@ struct GetBitContext {
     uint64_t bits;
     int bits_valid;
     int size_in_bits;
+#ifdef GET_BITS_SMEM
+    int cur_smem_pos;
+#endif
 };
 
+#ifndef GET_BITS_SMEM
 #define LOAD64()                                       \
     {                                                  \
         u8vec4buf ptr = u8vec4buf(gb.buf);             \
@@ -218,6 +226,40 @@ struct GetBitContext {
         gb.bits = uint64_t(rf) << (32 - gb.bits_valid) | gb.bits; \
         gb.bits_valid += 32;                                      \
     }
+#else /* GET_BITS_SMEM */
+shared u32vec4 gb_storage[gl_WorkGroupSize.x*gl_WorkGroupSize.y*gl_WorkGroupSize.z];
+
+#define FILL_SMEM()                                      \
+    {                                                    \
+        u32vec4buf ptr = u32vec4buf(gb.buf);             \
+        gb_storage[gl_LocalInvocationIndex] = ptr[0].v;  \
+        gb.cur_smem_pos = 0;                             \
+    }
+
+#define LOAD64()                                                    \
+    {                                                               \
+        gb.bits = 0;                                                \
+        gb.bits_valid = 0;                                          \
+        u8buf ptr = u8buf(gb.buf);                                  \
+        for (uint i = 0; i < ((4 - uint(gb.buf_start)) & 3); ++i) { \
+            gb.bits |= uint64_t(ptr[i].v) << (56 - i * 8);          \
+            gb.bits_valid += 8;                                     \
+            gb.buf += 1;                                            \
+        }                                                           \
+        FILL_SMEM();                                                \
+    }
+
+#define RELOAD32()                                                         \
+    {                                                                      \
+        if (gb.cur_smem_pos >= 4)                                          \
+            FILL_SMEM();                                                   \
+        uint v = gb_storage[gl_LocalInvocationIndex][gb.cur_smem_pos];     \
+        gb.buf += 4;                                                       \
+        gb.bits = uint64_t(reverse4(v)) << (32 - gb.bits_valid) | gb.bits; \
+        gb.bits_valid += 32;                                               \
+        gb.cur_smem_pos += 1;                                              \
+    }
+#endif /* GET_BITS_SMEM */
 
 void init_get_bits(inout GetBitContext gb, u8buf data, int len)
 {
-- 
2.49.1


>From fd2fd3828c1e7384d06afee7481d61865ab6cfa7 Mon Sep 17 00:00:00 2001
From: averne <averne381@gmail.com>
Date: Sun, 30 Nov 2025 13:25:37 +0100
Subject: [PATCH 2/3] libavcodec/vulkan: remove unnecessary member in
 GetBitContext

The number of remaining bits can be calculated using existing state.
This simplifies calculations and frees up one register.
---
 libavcodec/vulkan/common.comp | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/libavcodec/vulkan/common.comp b/libavcodec/vulkan/common.comp
index 4b71dfd2f4..f5f466ce31 100644
--- a/libavcodec/vulkan/common.comp
+++ b/libavcodec/vulkan/common.comp
@@ -201,7 +201,6 @@ struct GetBitContext {
 
     uint64_t bits;
     int bits_valid;
-    int size_in_bits;
 #ifdef GET_BITS_SMEM
     int cur_smem_pos;
 #endif
@@ -265,7 +264,6 @@ void init_get_bits(inout GetBitContext gb, u8buf data, int len)
 {
     gb.buf = gb.buf_start = uint64_t(data);
     gb.buf_end = uint64_t(data) + len;
-    gb.size_in_bits = len * 8;
 
     /* Preload */
     LOAD64()
@@ -320,5 +318,5 @@ int tell_bits(in GetBitContext gb)
 
 int left_bits(in GetBitContext gb)
 {
-    return gb.size_in_bits - int(gb.buf - gb.buf_start) * 8 + gb.bits_valid;
+    return int(gb.buf_end - gb.buf) * 8 + gb.bits_valid;
 }
-- 
2.49.1


>From 9d65c4af8baf8a8c35d7f1eb3b3ddb3d2d46cc59 Mon Sep 17 00:00:00 2001
From: averne <averne381@gmail.com>
Date: Sat, 29 Nov 2025 22:33:45 +0100
Subject: [PATCH 3/3] vulkan/prores: use cached bitstream reader when possible

As a sanity check, verify that the hardware has enough shared memory
to hold the required data.
Speedup is around 25%.
---
 libavcodec/vulkan_prores.c | 24 +++++++++++++-----------
 1 file changed, 13 insertions(+), 11 deletions(-)

diff --git a/libavcodec/vulkan_prores.c b/libavcodec/vulkan_prores.c
index 0c704c3d1c..b945ab9c38 100644
--- a/libavcodec/vulkan_prores.c
+++ b/libavcodec/vulkan_prores.c
@@ -21,7 +21,6 @@
 #include "hwaccel_internal.h"
 #include "libavutil/mem.h"
 #include "libavutil/vulkan.h"
-#include "libavutil/vulkan_loader.h"
 #include "libavutil/vulkan_spirv.h"
 
 extern const char *ff_source_common_comp;
@@ -207,14 +206,12 @@ static int vk_prores_end_frame(AVCodecContext *avctx)
     RET(ff_vk_exec_mirror_sem_value(&ctx->s, exec, &vp->sem, &vp->sem_value,
                                     pr->frame));
 
+    /* Transfer ownership to the exec context */
     RET(ff_vk_exec_add_dep_buf(&ctx->s, exec, &vp->slices_buf, 1, 0));
     vp->slices_buf = NULL;
     RET(ff_vk_exec_add_dep_buf(&ctx->s, exec, &pp->metadata_buf, 1, 0));
     pp->metadata_buf = NULL;
 
-    /* Transfer ownership to the exec context */
-    vp->slices_buf = pp->metadata_buf = NULL;
-
     /* Input barrier */
     ff_vk_frame_barrier(&ctx->s, exec, pr->frame, img_bar, &nb_img_bar,
                         VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT,
@@ -390,7 +387,7 @@ static int init_shader(AVCodecContext *avctx, FFVulkanContext *s,
                        FFVkExecPool *pool, FFVkSPIRVCompiler *spv,
                        FFVulkanShader *shd, const char *name, const char *entrypoint,
                        FFVulkanDescriptorSetBinding *descs, int num_descs,
-                       const char *source, int local_size, int interlaced)
+                       const char *source, int local_size, int interlaced, int gb_smem)
 {
     uint8_t *spv_data;
     size_t spv_len;
@@ -404,6 +401,12 @@ static int init_shader(AVCodecContext *avctx, FFVulkanContext *s,
                           local_size >> 16 & 0xff, local_size >> 8 & 0xff, local_size >> 0 & 0xff,
                           0));
 
+    if (gb_smem)
+        av_bprintf(&shd->src, "#define GET_BITS_SMEM\n");
+
+    if (interlaced)
+        av_bprintf(&shd->src, "#define INTERLACED\n");
+
     /* Common code */
     GLSLD(ff_source_common_comp);
 
@@ -412,9 +415,6 @@ static int init_shader(AVCodecContext *avctx, FFVulkanContext *s,
 
     RET(ff_vk_shader_add_descriptor_set(s, shd, descs, num_descs, 0, 0));
 
-    if (interlaced)
-        av_bprintf(&shd->src, "#define INTERLACED\n");
-
     /* Main code */
     GLSLD(source);
 
@@ -493,7 +493,8 @@ static int vk_decode_prores_init(AVCodecContext *avctx)
     };
     RET(init_shader(avctx, &ctx->s, &ctx->exec_pool, spv, &pv->reset,
                     "prores_dec_reset", "main", desc_set, 1,
-                    ff_source_prores_reset_comp, 0x080801, pr->frame_type != 0));
+                    ff_source_prores_reset_comp, 0x080801, pr->frame_type != 0, 0));
+
     desc_set = (FFVulkanDescriptorSetBinding []) {
         {
             .name        = "slice_offsets_buf",
@@ -524,7 +525,8 @@ static int vk_decode_prores_init(AVCodecContext *avctx)
     };
     RET(init_shader(avctx, &ctx->s, &ctx->exec_pool, spv, &pv->vld,
                     "prores_dec_vld", "main", desc_set, 3,
-                    ff_source_prores_vld_comp, 0x080801, pr->frame_type != 0));
+                    ff_source_prores_vld_comp, 0x080801, pr->frame_type != 0,
+                    ctx->s.props.properties.limits.maxComputeSharedMemorySize >= 64*16));
 
     desc_set = (FFVulkanDescriptorSetBinding []) {
         {
@@ -547,7 +549,7 @@ static int vk_decode_prores_init(AVCodecContext *avctx)
     };
     RET(init_shader(avctx, &ctx->s, &ctx->exec_pool, spv, &pv->idct,
                     "prores_dec_idct", "main", desc_set, 2,
-                    ff_source_prores_idct_comp, 0x200201, pr->frame_type != 0));
+                    ff_source_prores_idct_comp, 0x200201, pr->frame_type != 0, 0));
 
     err = 0;
 
-- 
2.49.1

_______________________________________________
ffmpeg-devel mailing list -- ffmpeg-devel@ffmpeg.org
To unsubscribe send an email to ffmpeg-devel-leave@ffmpeg.org

                 reply	other threads:[~2025-11-30 20:16 UTC|newest]

Thread overview: [no followups] expand[flat|nested]  mbox.gz  Atom feed

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=176453372863.39.14202436331499423423@2cb04c0e5124 \
    --to=ffmpeg-devel@ffmpeg.org \
    --cc=code@ffmpeg.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

This inbox may be cloned and mirrored by anyone:

	git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
		ffmpegdev@gitmailbox.com
	public-inbox-index ffmpegdev

Example config snippet for mirrors.


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git