* [FFmpeg-devel] [PATCH] vk-proresdec-vldoptim (PR #21203)
@ 2025-12-15 9:50 averne via ffmpeg-devel
0 siblings, 0 replies; only message in thread
From: averne via ffmpeg-devel @ 2025-12-15 9:50 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: averne
PR #21203 opened by averne
URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/21203
Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/21203.patch
This introduces more aggressive bitstream caching and stores the codebook/scan tables in shared memory, to reduce the overhead of hitting global memory.
It gives a nice speedup on AMD, and a smaller one on NVIDIA/Intel:
For a 4k, 422p10 file:
- AMD 6700XT: 18% (249 vs 211 fps)
- NVIDIA RTX 3050: 4% (98 vs 94 fps)
- Intel Tiger Lake GT2: 12% (38 vs 34 fps)
>From 540834ad991af1144d4b6d8d34737eff488fa303 Mon Sep 17 00:00:00 2001
From: averne <averne381@gmail.com>
Date: Sun, 14 Dec 2025 23:01:45 +0100
Subject: [PATCH 1/3] lavc/vulkan/common: allow configurable bitstream caching
in shared memory
---
libavcodec/vulkan/common.comp | 31 ++++++++++++++++---------------
1 file changed, 16 insertions(+), 15 deletions(-)
diff --git a/libavcodec/vulkan/common.comp b/libavcodec/vulkan/common.comp
index d50e629f06..1e34c9bab2 100644
--- a/libavcodec/vulkan/common.comp
+++ b/libavcodec/vulkan/common.comp
@@ -229,13 +229,14 @@ struct GetBitContext {
gb.bits_valid += 32; \
}
#else /* GET_BITS_SMEM */
-shared u32vec4 gb_storage[gl_WorkGroupSize.x*gl_WorkGroupSize.y*gl_WorkGroupSize.z];
+shared u32vec4 gb_storage[gl_WorkGroupSize.x*gl_WorkGroupSize.y*gl_WorkGroupSize.z*GET_BITS_SMEM];
-#define FILL_SMEM() \
- { \
- u32vec4buf ptr = u32vec4buf(gb.buf); \
- gb_storage[gl_LocalInvocationIndex] = ptr[0].v; \
- gb.cur_smem_pos = 0; \
+#define FILL_SMEM() \
+ { \
+ u32vec4buf ptr = u32vec4buf(gb.buf); \
+ [[unroll]] for (uint i = 0; i < GET_BITS_SMEM; ++i) \
+ gb_storage[gl_LocalInvocationIndex*GET_BITS_SMEM+i] = ptr[i].v; \
+ gb.cur_smem_pos = 0; \
}
#define LOAD64() \
@@ -251,15 +252,15 @@ shared u32vec4 gb_storage[gl_WorkGroupSize.x*gl_WorkGroupSize.y*gl_WorkGroupSize
FILL_SMEM(); \
}
-#define RELOAD32() \
- { \
- if (gb.cur_smem_pos >= 4) \
- FILL_SMEM(); \
- uint v = gb_storage[gl_LocalInvocationIndex][gb.cur_smem_pos]; \
- gb.buf += 4; \
- gb.bits = uint64_t(reverse4(v)) << (32 - gb.bits_valid) | gb.bits; \
- gb.bits_valid += 32; \
- gb.cur_smem_pos += 1; \
+#define RELOAD32() /* append the next cached 32-bit word to gb.bits */ \
+ { \
+ if (gb.cur_smem_pos >= 4*GET_BITS_SMEM) /* every cached word consumed */ \
+ FILL_SMEM(); \
+ uint v = gb_storage[gl_LocalInvocationIndex*GET_BITS_SMEM + gb.cur_smem_pos/4][gb.cur_smem_pos%4]; /* vec4 slot, then component: a u32vec4 only has 4 */ \
+ gb.buf += 4; \
+ gb.bits = uint64_t(reverse4(v)) << (32 - gb.bits_valid) | gb.bits; \
+ gb.bits_valid += 32; \
+ gb.cur_smem_pos += 1; \
+ }
#endif /* GET_BITS_SMEM */
--
2.49.1
>From 04f0ab1992e5b019a9bb4e6f0c9b6130339cc575 Mon Sep 17 00:00:00 2001
From: averne <averne381@gmail.com>
Date: Sun, 14 Dec 2025 23:05:07 +0100
Subject: [PATCH 2/3] vulkan/prores: increase bitstream caching
Now caches 64B of data when the reader hits the refill codepath
---
libavcodec/vulkan_prores.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/libavcodec/vulkan_prores.c b/libavcodec/vulkan_prores.c
index 338c09d46f..da10f93548 100644
--- a/libavcodec/vulkan_prores.c
+++ b/libavcodec/vulkan_prores.c
@@ -405,7 +405,7 @@ static int init_shader(AVCodecContext *avctx, FFVulkanContext *s,
local_size >> 16 & 0xff, local_size >> 8 & 0xff, local_size >> 0 & 0xff,
0));
- av_bprintf(&shd->src, "#define GET_BITS_SMEM\n");
+ av_bprintf(&shd->src, "#define GET_BITS_SMEM %d\n", 4);
if (interlaced)
av_bprintf(&shd->src, "#define INTERLACED\n");
--
2.49.1
>From d33674642084a857b348cc8a256ad5cfd4b67e42 Mon Sep 17 00:00:00 2001
From: averne <averne381@gmail.com>
Date: Sun, 14 Dec 2025 23:13:11 +0100
Subject: [PATCH 3/3] vulkan/prores: copy constant tables to shared memory
The shader needs ~3 loads per DCT coeff.
This data was not observed to be cached efficiently
in the upper cache levels; loading it explicitly into
shared memory fixes that.
---
libavcodec/vulkan/prores_vld.comp | 106 +++++++++++++++++-------------
1 file changed, 59 insertions(+), 47 deletions(-)
diff --git a/libavcodec/vulkan/prores_vld.comp b/libavcodec/vulkan/prores_vld.comp
index 298a5baf4c..30d5dcb04d 100644
--- a/libavcodec/vulkan/prores_vld.comp
+++ b/libavcodec/vulkan/prores_vld.comp
@@ -19,6 +19,58 @@
#define U8(x) (uint8_t (x))
#define U16(x) (uint16_t(x))
+/**
+ * Table 9, encoded as (last_rice_q << 0) | (krice or kexp << 4) | ((kexp or kexp + 1) << 8)
+ * According to the SMPTE document, abs(prev_dc_diff) should be used
+ * to index the table, duplicating the entries removes the abs operation.
+ */
+const uint16_t k_dc_codebook[] = { U16(0x100),
+ U16(0x210), U16(0x210),
+ U16(0x321), U16(0x321),
+ U16(0x430), U16(0x430), };
+
+/* Table 10 */
+const uint16_t k_ac_run_codebook [] = { U16(0x102), U16(0x102), U16(0x101), U16(0x101),
+ U16(0x100), U16(0x211), U16(0x211), U16(0x211),
+ U16(0x211), U16(0x210), U16(0x210), U16(0x210),
+ U16(0x210), U16(0x210), U16(0x210), U16(0x320), };
+/* Table 11 */
+const uint16_t k_ac_level_codebook[] = { U16(0x202), U16(0x101), U16(0x102), U16(0x100),
+ U16(0x210), U16(0x210), U16(0x210), U16(0x210),
+ U16(0x320) };
+
+#ifndef INTERLACED
+ /* Figure 4, encoded as (x << 0) | (y << 4) */
+ const uint8_t k_scan_tbl[] = {
+ U8(0x00), U8(0x01), U8(0x10), U8(0x11), U8(0x02), U8(0x03), U8(0x12), U8(0x13),
+ U8(0x20), U8(0x21), U8(0x30), U8(0x31), U8(0x22), U8(0x23), U8(0x32), U8(0x33),
+ U8(0x04), U8(0x05), U8(0x14), U8(0x24), U8(0x15), U8(0x06), U8(0x07), U8(0x16),
+ U8(0x25), U8(0x34), U8(0x35), U8(0x26), U8(0x17), U8(0x27), U8(0x36), U8(0x37),
+ U8(0x40), U8(0x41), U8(0x50), U8(0x60), U8(0x51), U8(0x42), U8(0x43), U8(0x52),
+ U8(0x61), U8(0x70), U8(0x71), U8(0x62), U8(0x53), U8(0x44), U8(0x45), U8(0x54),
+ U8(0x63), U8(0x72), U8(0x73), U8(0x64), U8(0x55), U8(0x46), U8(0x47), U8(0x56),
+ U8(0x65), U8(0x74), U8(0x75), U8(0x66), U8(0x57), U8(0x67), U8(0x76), U8(0x77),
+ };
+#else
+ /* Figure 5 */
+ const uint8_t k_scan_tbl[] = {
+ U8(0x00), U8(0x10), U8(0x01), U8(0x11), U8(0x20), U8(0x30), U8(0x21), U8(0x31),
+ U8(0x02), U8(0x12), U8(0x03), U8(0x13), U8(0x22), U8(0x32), U8(0x23), U8(0x33),
+ U8(0x40), U8(0x50), U8(0x41), U8(0x42), U8(0x51), U8(0x60), U8(0x70), U8(0x61),
+ U8(0x52), U8(0x43), U8(0x53), U8(0x62), U8(0x71), U8(0x72), U8(0x63), U8(0x73),
+ U8(0x04), U8(0x14), U8(0x05), U8(0x06), U8(0x15), U8(0x24), U8(0x34), U8(0x25),
+ U8(0x16), U8(0x07), U8(0x17), U8(0x26), U8(0x35), U8(0x44), U8(0x54), U8(0x45),
+ U8(0x36), U8(0x27), U8(0x37), U8(0x46), U8(0x55), U8(0x64), U8(0x74), U8(0x65),
+ U8(0x56), U8(0x47), U8(0x57), U8(0x66), U8(0x75), U8(0x76), U8(0x67), U8(0x77),
+ };
+#endif
+
+shared uint16_t dc_codebook [k_dc_codebook .length()],
+ ac_run_codebook [k_ac_run_codebook .length()],
+ ac_level_codebook[k_ac_level_codebook.length()];
+
+shared uint8_t scan_tbl[k_scan_tbl.length()];
+
void put_px(uint tex_idx, ivec2 pos, uint v)
{
#ifndef INTERLACED
@@ -72,16 +124,6 @@ void decode_comp(in GetBitContext gb, uvec2 mb_pos, uint mb_count)
uint c = to_signed(decode_codeword(gb, 0x650));
put_px(gid.z, base_pos, c);
- /**
- * Table 9, encoded as (last_rice_q << 0) | (krice or kexp << 4) | ((kexp or kexp + 1) << 8)
- * According to the SMPTE document, abs(prev_dc_diff) should be used
- * to index the table, duplicating the entries removes the abs operation.
- */
- const uint16_t dc_codebook[] = { U16(0x100),
- U16(0x210), U16(0x210),
- U16(0x321), U16(0x321),
- U16(0x430), U16(0x430), };
-
uint cw = 5, prev_dc_diff = 0;
for (int i = 1; i < num_blocks; ++i) {
cw = decode_codeword(gb, dc_codebook[min(cw, 6)]);
@@ -95,43 +137,6 @@ void decode_comp(in GetBitContext gb, uvec2 mb_pos, uint mb_count)
/* 7.1.1.4 AC Coefficients */
{
- /* Table 10 */
- const uint16_t ac_run_codebook [] = { U16(0x102), U16(0x102), U16(0x101), U16(0x101),
- U16(0x100), U16(0x211), U16(0x211), U16(0x211),
- U16(0x211), U16(0x210), U16(0x210), U16(0x210),
- U16(0x210), U16(0x210), U16(0x210), U16(0x320), };
-
- /* Table 11 */
- const uint16_t ac_level_codebook[] = { U16(0x202), U16(0x101), U16(0x102), U16(0x100),
- U16(0x210), U16(0x210), U16(0x210), U16(0x210),
- U16(0x320) };
-
-#ifndef INTERLACED
- /* Figure 4, encoded as (x << 0) | (y << 4) */
- const uint8_t scan_tbl[] = {
- U8(0x00), U8(0x01), U8(0x10), U8(0x11), U8(0x02), U8(0x03), U8(0x12), U8(0x13),
- U8(0x20), U8(0x21), U8(0x30), U8(0x31), U8(0x22), U8(0x23), U8(0x32), U8(0x33),
- U8(0x04), U8(0x05), U8(0x14), U8(0x24), U8(0x15), U8(0x06), U8(0x07), U8(0x16),
- U8(0x25), U8(0x34), U8(0x35), U8(0x26), U8(0x17), U8(0x27), U8(0x36), U8(0x37),
- U8(0x40), U8(0x41), U8(0x50), U8(0x60), U8(0x51), U8(0x42), U8(0x43), U8(0x52),
- U8(0x61), U8(0x70), U8(0x71), U8(0x62), U8(0x53), U8(0x44), U8(0x45), U8(0x54),
- U8(0x63), U8(0x72), U8(0x73), U8(0x64), U8(0x55), U8(0x46), U8(0x47), U8(0x56),
- U8(0x65), U8(0x74), U8(0x75), U8(0x66), U8(0x57), U8(0x67), U8(0x76), U8(0x77),
- };
-#else
- /* Figure 5 */
- const uint8_t scan_tbl[] = {
- U8(0x00), U8(0x10), U8(0x01), U8(0x11), U8(0x20), U8(0x30), U8(0x21), U8(0x31),
- U8(0x02), U8(0x12), U8(0x03), U8(0x13), U8(0x22), U8(0x32), U8(0x23), U8(0x33),
- U8(0x40), U8(0x50), U8(0x41), U8(0x42), U8(0x51), U8(0x60), U8(0x70), U8(0x61),
- U8(0x52), U8(0x43), U8(0x53), U8(0x62), U8(0x71), U8(0x72), U8(0x63), U8(0x73),
- U8(0x04), U8(0x14), U8(0x05), U8(0x06), U8(0x15), U8(0x24), U8(0x34), U8(0x25),
- U8(0x16), U8(0x07), U8(0x17), U8(0x26), U8(0x35), U8(0x44), U8(0x54), U8(0x45),
- U8(0x36), U8(0x27), U8(0x37), U8(0x46), U8(0x55), U8(0x64), U8(0x74), U8(0x65),
- U8(0x56), U8(0x47), U8(0x57), U8(0x66), U8(0x75), U8(0x76), U8(0x67), U8(0x77),
- };
-#endif
-
uint block_mask = num_blocks - 1;
uint block_shift = findLSB(num_blocks);
@@ -276,6 +281,13 @@ void main(void)
if (left_bits(gb) == 0)
return;
+ /* Copy constant tables to local memory */
+ dc_codebook = k_dc_codebook;
+ ac_run_codebook = k_ac_run_codebook;
+ ac_level_codebook = k_ac_level_codebook;
+
+ scan_tbl = k_scan_tbl;
+
/**
* 4 ProRes Frame Structure
* ProRes tiles pictures into a grid of slices, whose size is determined
--
2.49.1
_______________________________________________
ffmpeg-devel mailing list -- ffmpeg-devel@ffmpeg.org
To unsubscribe send an email to ffmpeg-devel-leave@ffmpeg.org
^ permalink raw reply [flat|nested] only message in thread
only message in thread, other threads:[~2025-12-15 21:30 UTC | newest]
Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2025-12-15 9:50 [FFmpeg-devel] [PATCH] vk-proresdec-vldoptim (PR #21203) averne via ffmpeg-devel
Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
This inbox may be cloned and mirrored by anyone:
git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git
# If you have public-inbox 1.1+ installed, you may
# initialize and index your mirror using the following commands:
public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
ffmpegdev@gitmailbox.com
public-inbox-index ffmpegdev
Example config snippet for mirrors.
AGPL code for this site: git clone https://public-inbox.org/public-inbox.git