Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
 help / color / mirror / Atom feed
From: Lynne <dev@lynne.ee>
To: ffmpeg-devel@ffmpeg.org
Cc: Lynne <dev@lynne.ee>
Subject: [FFmpeg-devel] [PATCH 10/16] ffv1enc_vulkan: implement the cached EC writer from the decoder
Date: Wed, 14 May 2025 21:02:39 +0200
Message-ID: <20250514190253.162819-10-dev@lynne.ee> (raw)
In-Reply-To: <20250514190253.162819-1-dev@lynne.ee>

This gives a 35% encoding speedup on AMD and 50% on Nvidia in range coder mode.
---
 libavcodec/ffv1enc_vulkan.c     |  6 ++-
 libavcodec/vulkan/ffv1_enc.comp | 68 ++++++++++++++++++++++-----------
 2 files changed, 50 insertions(+), 24 deletions(-)

diff --git a/libavcodec/ffv1enc_vulkan.c b/libavcodec/ffv1enc_vulkan.c
index c2eb73ca53..5de16d5b02 100644
--- a/libavcodec/ffv1enc_vulkan.c
+++ b/libavcodec/ffv1enc_vulkan.c
@@ -1099,12 +1099,13 @@ static int init_encode_shader(AVCodecContext *avctx, FFVkSPIRVCompiler *spv)
     uint8_t *spv_data;
     size_t spv_len;
     void *spv_opaque = NULL;
+    int use_cached_reader = fv->ctx.ac != AC_GOLOMB_RICE;
 
     RET(ff_vk_shader_init(&fv->s, shd, "ffv1_enc",
                           VK_SHADER_STAGE_COMPUTE_BIT,
                           (const char *[]) { "GL_EXT_buffer_reference",
                                              "GL_EXT_buffer_reference2" }, 2,
-                          1, 1, 1,
+                          use_cached_reader ? CONTEXT_SIZE : 1, 1, 1,
                           0));
 
     /* Common codec header */
@@ -1116,6 +1117,9 @@ static int init_encode_shader(AVCodecContext *avctx, FFVkSPIRVCompiler *spv)
     av_bprintf(&shd->src, "#define MAX_CONTEXT_INPUTS %i\n", MAX_CONTEXT_INPUTS);
     av_bprintf(&shd->src, "#define MAX_QUANT_TABLE_SIZE %i\n", MAX_QUANT_TABLE_SIZE);
 
+    if (use_cached_reader)
+        av_bprintf(&shd->src, "#define CACHED_SYMBOL_READER 1\n");
+
     desc_set = (FFVulkanDescriptorSetBinding []) {
         {
             .name        = "rangecoder_static_buf",
diff --git a/libavcodec/vulkan/ffv1_enc.comp b/libavcodec/vulkan/ffv1_enc.comp
index db33c414e1..65a7df1359 100644
--- a/libavcodec/vulkan/ffv1_enc.comp
+++ b/libavcodec/vulkan/ffv1_enc.comp
@@ -21,27 +21,32 @@
  */
 
 #ifndef GOLOMB
+#ifdef CACHED_SYMBOL_READER
+shared uint8_t state[CONTEXT_SIZE];
+#define WRITE(c, off, val) put_rac_direct(c, state[off], val)
+#else
+#define WRITE(c, off, val) put_rac(c, uint64_t(slice_state) + (state_off + off), val)
+#endif
+
 /* Note - only handles signed values */
-void put_symbol(inout RangeCoder c, uint64_t state, int v)
+void put_symbol(inout RangeCoder c, uint state_off, int v)
 {
     bool is_nil = (v == 0);
-    put_rac(c, state, is_nil);
+    WRITE(c, 0, is_nil);
     if (is_nil)
         return;
 
     const int a = abs(v);
     const int e = findMSB(a);
 
-    state += 1;
     for (int i = 0; i < e; i++)
-        put_rac(c, state + min(i, 9), true);
-    put_rac(c, state + min(e, 9), false);
+        WRITE(c, 1 + min(i, 9), true);
+    WRITE(c, 1 + min(e, 9), false);
 
-    state += 21;
     for (int i = e - 1; i >= 0; i--)
-        put_rac(c, state + min(i, 9), bool(bitfieldExtract(a, i, 1)));
+        WRITE(c, 22 + min(i, 9), bool(bitfieldExtract(a, i, 1)));
 
-    put_rac(c, state - 11 + min(e, 10), v < 0);
+    WRITE(c, 22 - 11 + min(e, 10), v < 0);
 }
 
 void encode_line_pcm(inout SliceContext sc, readonly uimage2D img,
@@ -49,6 +54,11 @@ void encode_line_pcm(inout SliceContext sc, readonly uimage2D img,
 {
     int w = sc.slice_dim.x;
 
+#ifdef CACHED_SYMBOL_READER
+    if (gl_LocalInvocationID.x > 0)
+        return;
+#endif
+
 #ifndef RGB
     if (p > 0 && p < 3) {
         w >>= chroma_shift.x;
@@ -63,7 +73,7 @@ void encode_line_pcm(inout SliceContext sc, readonly uimage2D img,
     }
 }
 
-void encode_line(inout SliceContext sc, readonly uimage2D img, uint64_t state,
+void encode_line(inout SliceContext sc, readonly uimage2D img, uint state_off,
                  ivec2 sp, int y, int p, int comp, int bits,
                  uint8_t quant_table_idx, const int run_index)
 {
@@ -86,13 +96,25 @@ void encode_line(inout SliceContext sc, readonly uimage2D img, uint64_t state,
 
         d[1] = fold(d[1], bits);
 
-        put_symbol(sc.c, state + CONTEXT_SIZE*d[0], d[1]);
+        uint context_off = state_off + CONTEXT_SIZE*d[0];
+#ifdef CACHED_SYMBOL_READER
+        u8buf sb = u8buf(uint64_t(slice_state) + context_off + gl_LocalInvocationID.x);
+        state[gl_LocalInvocationID.x] = sb.v;
+        barrier();
+        if (gl_LocalInvocationID.x == 0)
+#endif
+
+            put_symbol(sc.c, context_off, d[1]);
+
+#ifdef CACHED_SYMBOL_READER
+        sb.v = state[gl_LocalInvocationID.x];
+#endif
     }
 }
 
 #else /* GOLOMB */
 
-void encode_line(inout SliceContext sc, readonly uimage2D img, uint64_t state,
+void encode_line(inout SliceContext sc, readonly uimage2D img, uint state_off,
                  ivec2 sp, int y, int p, int comp, int bits,
                  uint8_t quant_table_idx, inout int run_index)
 {
@@ -143,7 +165,7 @@ void encode_line(inout SliceContext sc, readonly uimage2D img, uint64_t state,
         }
 
         if (!run_mode) {
-            VlcState sb = VlcState(state + VLC_STATE_SIZE*d[0]);
+            VlcState sb = VlcState(uint64_t(slice_state) + state_off + VLC_STATE_SIZE*d[0]);
             Symbol sym = get_vlc_symbol(sb, d[1], bits);
             put_bits(sc.pb, sym.bits, sym.val);
         }
@@ -245,8 +267,7 @@ void encode_slice(inout SliceContext sc, const uint slice_idx)
 #endif
     {
         u8vec4 quant_table_idx = sc.quant_table_idx.xyyz;
-        uint64_t slice_state_off = uint64_t(slice_state) +
-                                   slice_idx*plane_state_size*codec_planes;
+        u32vec4 slice_state_off = (slice_idx*codec_planes + uvec4(0, 1, 1, 2))*plane_state_size;
 
 #ifndef RGB
         for (int c = 0; c < components; c++) {
@@ -260,26 +281,22 @@ void encode_slice(inout SliceContext sc, const uint slice_idx)
             int comp = c - p;
 
             for (int y = 0; y < h; y++)
-                encode_line(sc, src[p], slice_state_off, sp, y, p,
+                encode_line(sc, src[p], slice_state_off[c], sp, y, p,
                             comp, bits, quant_table_idx[c], run_index);
-
-            /* For the second chroma plane, reuse the first plane's state */
-            if (c != 1)
-                slice_state_off += plane_state_size;
         }
 #else
         int run_index = 0;
         for (int y = 0; y < sc.slice_dim.y; y++) {
             preload_rgb(sc, sp, sc.slice_dim.x, y, true);
 
-            encode_line(sc, tmp, slice_state_off + plane_state_size*0,
+            encode_line(sc, tmp, slice_state_off[0],
                         sp, y, 0, 1, bits, quant_table_idx[0], run_index);
-            encode_line(sc, tmp, slice_state_off + plane_state_size*1,
+            encode_line(sc, tmp, slice_state_off[1],
                         sp, y, 0, 2, bits, quant_table_idx[1], run_index);
-            encode_line(sc, tmp, slice_state_off + plane_state_size*1,
+            encode_line(sc, tmp, slice_state_off[2],
                         sp, y, 0, 0, bits, quant_table_idx[2], run_index);
             if (transparency == 1)
-                encode_line(sc, tmp, slice_state_off + plane_state_size*2,
+                encode_line(sc, tmp, slice_state_off[3],
                             sp, y, 0, 3, bits, quant_table_idx[3], run_index);
         }
 #endif
@@ -288,6 +305,11 @@ void encode_slice(inout SliceContext sc, const uint slice_idx)
 
 void finalize_slice(inout SliceContext sc, const uint slice_idx)
 {
+#ifdef CACHED_SYMBOL_READER
+    if (gl_LocalInvocationID.x > 0)
+        return;
+#endif
+
 #ifdef GOLOMB
     uint32_t enc_len = sc.hdr_len + flush_put_bits(sc.pb);
 #else
-- 
2.49.0.395.g12beb8f557c
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

  parent reply	other threads:[~2025-05-14 19:05 UTC|newest]

Thread overview: 17+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2025-05-14 19:02 [FFmpeg-devel] [PATCH 01/16] ffv1enc_vulkan: merge all encoder variants into one file Lynne
2025-05-14 19:02 ` [FFmpeg-devel] [PATCH 02/16] vulkan/ffv1: synchronize get_pred implementations between encoder and decoder Lynne
2025-05-14 19:02 ` [FFmpeg-devel] [PATCH 03/16] ffv1enc_vulkan: get rid of temporary data for the setup shader Lynne
2025-05-14 19:02 ` [FFmpeg-devel] [PATCH 04/16] ffv1enc_vulkan: unify EC code between setup and encode Lynne
2025-05-14 19:02 ` [FFmpeg-devel] [PATCH 05/16] ffv1enc_vulkan: minor EC optimizations Lynne
2025-05-14 19:02 ` [FFmpeg-devel] [PATCH 06/16] ffv1enc_vulkan: switch to 2-line cache, unify prediction code Lynne
2025-05-23 14:38   ` [FFmpeg-devel] [PATCH] ffv1enc_vulkan: fix array overflow Jerome Martinez
2025-05-14 19:02 ` [FFmpeg-devel] [PATCH 07/16] ffv1_common: minor RGB optimization Lynne
2025-05-14 19:02 ` [FFmpeg-devel] [PATCH 08/16] ffv1enc_vulkan: use ff_get_encode_buffer Lynne
2025-05-14 19:02 ` [FFmpeg-devel] [PATCH 09/16] vulkan_ffv1: fix PCM + cached symbol reader Lynne
2025-05-14 19:02 ` Lynne [this message]
2025-05-14 19:02 ` [FFmpeg-devel] [PATCH 11/16] ffv1enc_vulkan: implement RCT search for level >= 4 Lynne
2025-05-14 19:02 ` [FFmpeg-devel] [PATCH 12/16] vulkan/ffv1: unify encode and decode get/put primitives Lynne
2025-05-14 19:02 ` [FFmpeg-devel] [PATCH 13/16] vulkan_ffv1: pipe through slice decoding status Lynne
2025-05-14 19:02 ` [FFmpeg-devel] [PATCH 14/16] vulkan: enable VK_KHR_shader_subgroup_rotate Lynne
2025-05-14 19:02 ` [FFmpeg-devel] [PATCH 15/16] hwcontext_vulkan: correct image transfer usage flags Lynne
2025-05-14 19:02 ` [FFmpeg-devel] [PATCH 16/16] hwcontext_vulkan: only try exporting DMABUF memory on !WIN32 and only for DMABUF tiling Lynne

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20250514190253.162819-10-dev@lynne.ee \
    --to=dev@lynne.ee \
    --cc=ffmpeg-devel@ffmpeg.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

This inbox may be cloned and mirrored by anyone:

	git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
		ffmpegdev@gitmailbox.com
	public-inbox-index ffmpegdev

Example config snippet for mirrors.


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git