From: Lynne <dev@lynne.ee>
To: ffmpeg-devel@ffmpeg.org
Cc: Lynne <dev@lynne.ee>
Subject: [FFmpeg-devel] [PATCH 18/18] vulkan_ffv1: add cached symbol reader for AMD
Date: Sat, 12 Apr 2025 09:22:49 +0200
Message-ID: <20250412072256.77815-18-dev@lynne.ee> (raw)
In-Reply-To: <20250412072256.77815-1-dev@lynne.ee>
Speeds up decoding on AMD by 3x.
This uses 32 local invocations to load the range coder state into
shared memory (as a cache), as well as to do the RCT in parallel.
---
libavcodec/vulkan/ffv1_dec.comp | 71 ++++++++++++++++++++-------------
libavcodec/vulkan_ffv1.c | 7 +++-
2 files changed, 50 insertions(+), 28 deletions(-)
diff --git a/libavcodec/vulkan/ffv1_dec.comp b/libavcodec/vulkan/ffv1_dec.comp
index 9eba322b27..3c46ee1771 100644
--- a/libavcodec/vulkan/ffv1_dec.comp
+++ b/libavcodec/vulkan/ffv1_dec.comp
@@ -108,34 +108,37 @@ ivec2 get_pred(ivec2 sp, ivec2 off, int p, int sw, uint8_t quant_table_idx)
#endif
#ifndef GOLOMB
-int get_isymbol(inout RangeCoder c, uint64_t state)
+#ifdef CACHED_SYMBOL_READER
+shared uint8_t state[CONTEXT_SIZE];
+#define READ(c, off) get_rac_direct(c, state[off])
+#else
+#define READ(c, off) get_rac(c, uint64_t(slice_state) + state_off + off)
+#endif
+
+int get_isymbol(inout RangeCoder c, uint state_off)
{
- if (expectEXT(get_rac(c, state), false))
+ if (expectEXT(READ(c, 0), false))
return 0;
- state += 1;
-
- int e;
- for (e = 0; e < 32; e++)
- if (!get_rac(c, state + min(e, 9)))
+ int e = 1;
+ for (; e < 33; e++)
+ if (!READ(c, min(e, 10)))
break;
- if (expectEXT(e == 0, false)) {
- return get_rac(c, state + 10) ? -1 : 1;
- } else if (expectEXT(e > 31, false)) {
+ if (expectEXT(e == 1, false)) {
+ return READ(c, 11) ? -1 : 1;
+ } else if (expectEXT(e == 33, false)) {
corrupt = true;
return 0;
}
- state += 21;
-
int a = 1;
- for (int i = e - 1; i >= 0; i--) {
+ for (int i = e + 20; i >= 22; i--) {
a <<= 1;
- a |= int(get_rac(c, state + min(i, 9))); // 22..31
+ a |= int(READ(c, min(i, 31)));
}
- return get_rac(c, state - 11 + min(e, 10)) ? -a : a;
+ return READ(c, min(e + 10, 21)) ? -a : a;
}
void decode_line_pcm(inout SliceContext sc, ivec2 sp, int w, int y, int p, int bits)
@@ -157,7 +160,7 @@ void decode_line_pcm(inout SliceContext sc, ivec2 sp, int w, int y, int p, int b
}
void decode_line(inout SliceContext sc, ivec2 sp, int w,
- int y, int p, int bits, uint64_t state,
+ int y, int p, int bits, uint state_off,
uint8_t quant_table_idx, const int run_index)
{
#ifndef RGB
@@ -171,19 +174,33 @@ void decode_line(inout SliceContext sc, ivec2 sp, int w,
ivec2 pr = get_pred(sp, ivec2(x, y), p, w,
quant_table_idx);
- int diff = get_isymbol(sc.c, state + CONTEXT_SIZE*abs(pr[0]));
- if (pr[0] < 0)
- diff = -diff;
+ uint context_off = state_off + CONTEXT_SIZE*abs(pr[0]);
+#ifdef CACHED_SYMBOL_READER
+ u8buf sb = u8buf(uint64_t(slice_state) + context_off + gl_LocalInvocationID.x);
+ state[gl_LocalInvocationID.x] = sb.v;
+ barrier();
+ if (gl_LocalInvocationID.x == 0) {
- uint v = zero_extend(pr[1] + diff, bits);
- imageStore(dec[p], sp + LADDR(ivec2(x, y)), uvec4(v));
+#endif
+
+ int diff = get_isymbol(sc.c, context_off);
+ if (pr[0] < 0)
+ diff = -diff;
+
+ uint v = zero_extend(pr[1] + diff, bits);
+ imageStore(dec[p], sp + LADDR(ivec2(x, y)), uvec4(v));
+
+#ifdef CACHED_SYMBOL_READER
+ }
+ sb.v = state[gl_LocalInvocationID.x];
+#endif
}
}
#else /* GOLOMB */
void decode_line(inout SliceContext sc, ivec2 sp, int w,
- int y, int p, int bits, uint64_t state,
+ int y, int p, int bits, uint state_off,
uint8_t quant_table_idx, inout int run_index)
{
#ifndef RGB
@@ -202,7 +219,7 @@ void decode_line(inout SliceContext sc, ivec2 sp, int w,
ivec2 pr = get_pred(sp, ivec2(x, y), p, w,
quant_table_idx);
- VlcState sb = VlcState(state + VLC_STATE_SIZE*abs(pr[0]));
+ VlcState sb = VlcState(uint64_t(slice_state) + state_off + VLC_STATE_SIZE*abs(pr[0]));
if (pr[0] == 0 && run_mode == 0)
run_mode = 1;
@@ -263,7 +280,7 @@ ivec4 transform_sample(ivec4 pix, ivec2 rct_coef)
void writeout_rgb(in SliceContext sc, ivec2 sp, int w, int y, bool apply_rct)
{
- for (int x = 0; x < w; x++) {
+ for (uint x = gl_LocalInvocationID.x; x < w; x += gl_WorkGroupSize.x) {
ivec2 lpos = sp + LADDR(ivec2(x, y));
ivec2 pos = sc.slice_pos + ivec2(x, y);
@@ -305,6 +322,8 @@ void decode_slice(inout SliceContext sc, const uint slice_idx)
/* PCM coding */
#ifndef GOLOMB
if (sc.slice_coding_mode == 1) {
+ if (gl_LocalInvocationID.x > 0)
+ return;
#ifndef RGB
for (int p = 0; p < planes; p++) {
int h = sc.slice_dim.y;
@@ -328,9 +347,7 @@ void decode_slice(inout SliceContext sc, const uint slice_idx)
#endif
{
u8vec4 quant_table_idx = sc.quant_table_idx.xyyz;
- u64vec4 slice_state_off = (uint64_t(slice_state) +
- slice_idx*plane_state_size*codec_planes) +
- plane_state_size*uvec4(0, 1, 1, 2);
+ u32vec4 slice_state_off = (slice_idx*codec_planes + uvec4(0, 1, 1, 2))*plane_state_size;
#ifndef RGB
for (int p = 0; p < planes; p++) {
diff --git a/libavcodec/vulkan_ffv1.c b/libavcodec/vulkan_ffv1.c
index c1875711bc..33c4e9114d 100644
--- a/libavcodec/vulkan_ffv1.c
+++ b/libavcodec/vulkan_ffv1.c
@@ -823,12 +823,14 @@ static int init_decode_shader(FFV1Context *f, FFVulkanContext *s,
uint8_t *spv_data;
size_t spv_len;
void *spv_opaque = NULL;
+ int use_cached_reader = ac != AC_GOLOMB_RICE &&
+ s->driver_props.driverID == VK_DRIVER_ID_MESA_RADV;
RET(ff_vk_shader_init(s, shd, "ffv1_dec",
VK_SHADER_STAGE_COMPUTE_BIT,
(const char *[]) { "GL_EXT_buffer_reference",
"GL_EXT_buffer_reference2" }, 2,
- 1, 1, 1,
+ use_cached_reader ? 32 : 1, 1, 1,
0));
if (ac == AC_GOLOMB_RICE)
@@ -837,6 +839,9 @@ static int init_decode_shader(FFV1Context *f, FFVulkanContext *s,
if (rgb)
av_bprintf(&shd->src, "#define RGB\n");
+ if (use_cached_reader)
+ av_bprintf(&shd->src, "#define CACHED_SYMBOL_READER 1\n");
+
/* Common codec header */
GLSLD(ff_source_common_comp);
--
2.47.2
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
next prev parent reply other threads:[~2025-04-12 7:26 UTC|newest]
Thread overview: 20+ messages / expand[flat|nested] mbox.gz Atom feed top
2025-04-12 7:22 [FFmpeg-devel] [PATCH 01/18] hwcontext_vulkan: disable descriptor buffer extension on Intel Lynne
2025-04-12 7:22 ` [FFmpeg-devel] [PATCH 02/18] vulkan_ffv1: enable acceleration " Lynne
2025-04-12 7:22 ` [FFmpeg-devel] [PATCH 03/18] vulkan_ffv1: remove unused define Lynne
2025-04-12 7:22 ` [FFmpeg-devel] [PATCH 04/18] vulkan_ffv1: slightly optimize the range decoder Lynne
2025-04-12 7:22 ` [FFmpeg-devel] [PATCH 05/18] vulkan_ffv1: optimize symbol reader Lynne
2025-04-12 7:22 ` [FFmpeg-devel] [PATCH 06/18] vulkan_ffv1: allocate just as much memory for slice state as needed Lynne
2025-04-12 7:22 ` [FFmpeg-devel] [PATCH 07/18] vulkan_ffv1: init overread/corrupt fields Lynne
2025-04-12 7:22 ` [FFmpeg-devel] [PATCH 08/18] vulkan_ffv1: fallback to upload if mapping packet fails, fix fallback Lynne
2025-04-12 7:22 ` [FFmpeg-devel] [PATCH 09/18] vulkan_ffv1: fix reset shader dependencies Lynne
2025-04-12 7:22 ` [FFmpeg-devel] [PATCH 10/18] vulkan_ffv1: improve buffer barrier correctness for slice state Lynne
2025-04-12 7:22 ` [FFmpeg-devel] [PATCH 11/18] vulkan_ffv1: fix left-2 sample addressing Lynne
2025-04-12 7:22 ` [FFmpeg-devel] [PATCH 12/18] vulkan_ffv1: cache only 2 lines when decoding RGB Lynne
2025-04-12 7:22 ` [FFmpeg-devel] [PATCH 13/18] ffv1/vulkan: redo context count tracking and quant_table_idx management Lynne
2025-04-13 20:39 ` Jerome Martinez
2025-04-12 7:22 ` [FFmpeg-devel] [PATCH 14/18] vulkan_ffv1: externalize extended lookup check Lynne
2025-04-12 7:22 ` [FFmpeg-devel] [PATCH 15/18] vulkan_ffv1: remove need for scratch data during setup Lynne
2025-04-12 7:22 ` [FFmpeg-devel] [PATCH 16/18] vulkan_ffv1: shortcut +-1 coeffs in symbol reading Lynne
2025-04-12 7:22 ` [FFmpeg-devel] [PATCH 17/18] vulkan: add support for expect/assume Lynne
2025-04-12 7:22 ` Lynne [this message]
2025-04-13 13:38 ` [FFmpeg-devel] [PATCH 01/18] hwcontext_vulkan: disable descriptor buffer extension on Intel Jerome Martinez
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20250412072256.77815-18-dev@lynne.ee \
--to=dev@lynne.ee \
--cc=ffmpeg-devel@ffmpeg.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
This inbox may be cloned and mirrored by anyone:
git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git
# If you have public-inbox 1.1+ installed, you may
# initialize and index your mirror using the following commands:
public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
ffmpegdev@gitmailbox.com
public-inbox-index ffmpegdev
Example config snippet for mirrors.
AGPL code for this site: git clone https://public-inbox.org/public-inbox.git