Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
 help / color / mirror / Atom feed
* [FFmpeg-devel] [PATCH 01/16] ffv1enc_vulkan: merge all encoder variants into one file
@ 2025-05-14 19:02 Lynne
  2025-05-14 19:02 ` [FFmpeg-devel] [PATCH 02/16] vulkan/ffv1: synchronize get_pred implementations between encoder and decoder Lynne
                   ` (14 more replies)
  0 siblings, 15 replies; 17+ messages in thread
From: Lynne @ 2025-05-14 19:02 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: Lynne

Makes it easier to work with, despite the heavy ifdeffery.
---
 libavcodec/ffv1enc_vulkan.c            |  21 +--
 libavcodec/vulkan/Makefile             |   4 +-
 libavcodec/vulkan/ffv1_enc.comp        | 240 ++++++++++++++++++++++++-
 libavcodec/vulkan/ffv1_enc_ac.comp     |  83 ---------
 libavcodec/vulkan/ffv1_enc_common.comp | 101 -----------
 libavcodec/vulkan/ffv1_enc_rgb.comp    |  83 ---------
 libavcodec/vulkan/ffv1_enc_vlc.comp    | 112 ------------
 7 files changed, 244 insertions(+), 400 deletions(-)
 delete mode 100644 libavcodec/vulkan/ffv1_enc_ac.comp
 delete mode 100644 libavcodec/vulkan/ffv1_enc_common.comp
 delete mode 100644 libavcodec/vulkan/ffv1_enc_rgb.comp
 delete mode 100644 libavcodec/vulkan/ffv1_enc_vlc.comp

diff --git a/libavcodec/ffv1enc_vulkan.c b/libavcodec/ffv1enc_vulkan.c
index 42a98a5efa..f4b54b8375 100644
--- a/libavcodec/ffv1enc_vulkan.c
+++ b/libavcodec/ffv1enc_vulkan.c
@@ -114,13 +114,9 @@ extern const char *ff_source_rangecoder_comp;
 extern const char *ff_source_ffv1_vlc_comp;
 extern const char *ff_source_ffv1_common_comp;
 extern const char *ff_source_ffv1_reset_comp;
-extern const char *ff_source_ffv1_enc_common_comp;
 extern const char *ff_source_ffv1_enc_rct_comp;
-extern const char *ff_source_ffv1_enc_vlc_comp;
-extern const char *ff_source_ffv1_enc_ac_comp;
 extern const char *ff_source_ffv1_enc_setup_comp;
 extern const char *ff_source_ffv1_enc_comp;
-extern const char *ff_source_ffv1_enc_rgb_comp;
 
 typedef struct FFv1VkParameters {
     VkDeviceAddress slice_state;
@@ -961,6 +957,9 @@ static void define_shared_code(AVCodecContext *avctx, FFVulkanShader *shd)
         av_bprintf(&shd->src, "#define GOLOMB\n"                         );
     }
 
+    if (fv->is_rgb)
+        av_bprintf(&shd->src, "#define RGB\n");
+
     GLSLF(0, #define TYPE int%i_t                                        ,smp_bits);
     GLSLF(0, #define VTYPE2 i%ivec2                                      ,smp_bits);
     GLSLF(0, #define VTYPE3 i%ivec3                                      ,smp_bits);
@@ -1260,7 +1259,6 @@ static int init_encode_shader(AVCodecContext *avctx, FFVkSPIRVCompiler *spv)
 {
     int err;
     VulkanEncodeFFv1Context *fv = avctx->priv_data;
-    FFV1Context *f = &fv->ctx;
     FFVulkanShader *shd = &fv->enc;
     FFVulkanDescriptorSetBinding *desc_set;
 
@@ -1344,18 +1342,7 @@ static int init_encode_shader(AVCodecContext *avctx, FFVkSPIRVCompiler *spv)
     };
     RET(ff_vk_shader_add_descriptor_set(&fv->s, shd, desc_set, 3, 0, 0));
 
-    /* Assemble the shader body */
-    GLSLD(ff_source_ffv1_enc_common_comp);
-
-    if (f->ac == AC_GOLOMB_RICE)
-        GLSLD(ff_source_ffv1_enc_vlc_comp);
-    else
-        GLSLD(ff_source_ffv1_enc_ac_comp);
-
-    if (fv->is_rgb)
-        GLSLD(ff_source_ffv1_enc_rgb_comp);
-    else
-        GLSLD(ff_source_ffv1_enc_comp);
+    GLSLD(ff_source_ffv1_enc_comp);
 
     RET(spv->compile_shader(&fv->s, spv, shd, &spv_data, &spv_len, "main",
                             &spv_opaque));
diff --git a/libavcodec/vulkan/Makefile b/libavcodec/vulkan/Makefile
index feb5d2ea51..4bbcb38c6a 100644
--- a/libavcodec/vulkan/Makefile
+++ b/libavcodec/vulkan/Makefile
@@ -6,10 +6,8 @@ clean::
 OBJS-$(CONFIG_FFV1_VULKAN_ENCODER)  +=  vulkan/common.o \
 					vulkan/rangecoder.o vulkan/ffv1_vlc.o \
 					vulkan/ffv1_common.o vulkan/ffv1_reset.o \
-					vulkan/ffv1_enc_common.o \
 					vulkan/ffv1_enc_rct.o vulkan/ffv1_enc_setup.o \
-					vulkan/ffv1_enc_vlc.o vulkan/ffv1_enc_ac.o \
-					vulkan/ffv1_enc.o vulkan/ffv1_enc_rgb.o
+					vulkan/ffv1_enc.o
 
 OBJS-$(CONFIG_FFV1_VULKAN_HWACCEL)  +=  vulkan/common.o \
 					vulkan/rangecoder.o vulkan/ffv1_vlc.o \
diff --git a/libavcodec/vulkan/ffv1_enc.comp b/libavcodec/vulkan/ffv1_enc.comp
index 4b851fd711..9854ecad51 100644
--- a/libavcodec/vulkan/ffv1_enc.comp
+++ b/libavcodec/vulkan/ffv1_enc.comp
@@ -20,12 +20,186 @@
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
+ivec2 get_diff(ivec2 pos, ivec2 off, int p, int comp, int sw, int bits)
+{ /* Returns ivec2(context, folded residual) for component comp of src[p] at pos; off is the sample position relative to the slice origin, sw the line width */
+    const ivec2 yoff_border1 = off.x == 0 ? ivec2(1, -1) : ivec2(0, 0); /* column 0: redirect the "x - 1" loads one column right and one row up */
+    const ivec2 yoff_border2 = off.x == 1 ? ivec2(1, -1) : ivec2(0, 0); /* column 1: same redirection for the "x - 2" load */
+
+    TYPE top2 = TYPE(0); /* sample two rows up; stays zero on the first two rows of the slice */
+    if (off.y > 1)
+        top2 = TYPE(imageLoad(src[p], pos + ivec2(0, -2))[comp]);
+
+    VTYPE3 top  = VTYPE3(TYPE(0), /* top[0..2]: top-left, top and top-right neighbours; all zero on the first row */
+                         TYPE(0),
+                         TYPE(0));
+    if (off.y > 0 && off != ivec2(0, 1)) /* top[0] is also left at zero for off == (0, 1) */
+        top[0] = TYPE(imageLoad(src[p], pos + ivec2(-1, -1) + yoff_border1)[comp]);
+    if (off.y > 0) {
+        top[1] = TYPE(imageLoad(src[p], pos + ivec2(0, -1))[comp]);
+        top[2] = TYPE(imageLoad(src[p], pos + ivec2(min(1, sw - off.x - 1), -1))[comp]); /* x offset clamps to 0 on the last sample of the line */
+    }
+
+    VTYPE3 cur = VTYPE3(TYPE(0), /* cur[0]: two samples to the left, cur[1]: left neighbour, cur[2]: the sample being coded */
+                        TYPE(0),
+                        imageLoad(src[p], pos)[comp]);
+    if (off.x > 0 && off != ivec2(1, 0)) /* cur[0] stays zero in column 0 and at off == (1, 0) */
+        cur[0] = TYPE(imageLoad(src[p], pos + ivec2(-2,  0) + yoff_border2)[comp]);
+    if (off != ivec2(0, 0)) /* cur[1] stays zero only for the very first sample of the slice */
+        cur[1] = TYPE(imageLoad(src[p], pos + ivec2(-1,  0) + yoff_border1)[comp]);
+
+    /* d[0] = context from get_context(), d[1] = current sample minus predict() of its neighbours */
+    ivec2 d = ivec2(get_context(VTYPE2(cur), top, top2, context_model),
+                    cur[2] - predict(cur[1], VTYPE2(top)));
+
+    if (d[0] < 0) /* negative context: negate both the context and the residual */
+        d = -d;
+
+    d[1] = fold(d[1], bits); /* fold() is defined outside this file; it receives the residual and the bit depth */
+
+    return d;
+}
+
+#ifndef GOLOMB
+void put_rac(inout RangeCoder c, uint64_t state, bool bit)
+{ /* Codes one bit through put_rac_norenorm() (state is forwarded unchanged), then renormalizes the coder when required */
+    put_rac_norenorm(c, state, bit);
+    if (c.range < 0x100) /* range fell below 0x100: hand over to renorm_encoder() */
+        renorm_encoder(c);
+}
+
+/* Codes v as: zero flag, unary exponent e = findMSB(abs(v)), the e low bits of abs(v) (MSB first), then a sign bit. Note - only handles signed values */
+void put_symbol(inout RangeCoder c, uint64_t state, int v)
+{
+    bool is_nil = (v == 0);
+    put_rac(c, state, is_nil); /* zero flag uses the base state */
+    if (is_nil)
+        return;
+
+    const int a = abs(v);
+    const int e = findMSB(a); /* index of the highest set bit of the magnitude */
+
+    state += 1; /* exponent bits use base + 1 + min(i, 9) */
+    for (int i = 0; i < e; i++)
+        put_rac(c, state + min(i, 9), true);
+    put_rac(c, state + min(e, 9), false); /* terminating zero of the unary exponent */
+
+    state += 21; /* magnitude bits use base + 22 + min(i, 9) */
+    for (int i = e - 1; i >= 0; i--)
+        put_rac(c, state + min(i, 9), bool(bitfieldExtract(a, i, 1))); /* bit e itself is implied by the exponent and is not written */
+
+    put_rac(c, state - 11 + min(e, 10), v < 0); /* sign bit uses base + 11 + min(e, 10) */
+}
+
+void encode_line_pcm(inout SliceContext sc, int y, int p, int comp,
+                     int bits)
+{ /* Writes row y of component comp of src[p] as raw samples: `bits` bits each, MSB first, through put_rac_equi() */
+    ivec2 sp = sc.slice_pos;
+    int w = sc.slice_dim.x;
+    if (p > 0 && p < 3) { /* planes 1 and 2 (presumably the chroma planes - confirm): scale width and origin by chroma_shift */
+        w >>= chroma_shift.x;
+        sp >>= chroma_shift;
+    }
+
+    for (int x = 0; x < w; x++) {
+        uint v = imageLoad(src[p], (sp + ivec2(x, y)))[comp];
+        for (int i = (bits - 1); i >= 0; i--) /* no prediction or context is involved in this path */
+            put_rac_equi(sc.c, bool(bitfieldExtract(v, i, 1)));
+    }
+}
+
+void encode_line(inout SliceContext sc, uint64_t state,
+                 int y, int p, int comp, int bits, const int run_index)
+{ /* Range-coder variant (built when GOLOMB is not defined): codes row y of component comp of src[p]; run_index is not used in this variant */
+    ivec2 sp = sc.slice_pos;
+
+    int w = sc.slice_dim.x;
+    if (p > 0 && p < 3) { /* planes 1 and 2 (presumably the chroma planes - confirm): scale width and origin by chroma_shift */
+        w >>= chroma_shift.x;
+        sp >>= chroma_shift;
+    }
+
+    for (int x = 0; x < w; x++) {
+        const ivec2 d = get_diff(sp + ivec2(x, y), ivec2(x, y), p, comp, w, bits); /* d[0] = context, d[1] = folded residual */
+        put_symbol(sc.c, state + CONTEXT_SIZE*d[0], d[1]); /* each context owns CONTEXT_SIZE units of state starting at `state` */
+    }
+}
+
+#else /* GOLOMB */
+
+void encode_line(inout SliceContext sc, uint64_t state,
+                 int y, int p, int comp, int bits, inout int run_index)
+{ /* GOLOMB variant: codes row y of component comp of src[p] with VLC symbols plus run-length coding of zero residuals; run_index is inout and is updated as runs are coded */
+    ivec2 sp = sc.slice_pos;
+
+    int w = sc.slice_dim.x;
+    if (p > 0 && p < 3) { /* planes 1 and 2 (presumably the chroma planes - confirm): scale width and origin by chroma_shift */
+        w >>= chroma_shift.x;
+        sp >>= chroma_shift;
+    }
+
+    int run_count = 0; /* number of zero residuals collected in the current run */
+    bool run_mode = false;
+
+    for (int x = 0; x < w; x++) {
+        ivec2 d = get_diff(sp + ivec2(x, y), ivec2(x, y), p, comp, w, bits); /* d[0] = context, d[1] = folded residual */
+
+        if (d[0] == 0) /* context 0 switches run mode on */
+            run_mode = true;
+
+        if (run_mode) {
+            if (d[1] != 0) { /* a non-zero residual ends the run */
+                /* A very unlikely loop: only entered when the pending run is at least 1 << log2_run[run_index] long */
+                while (run_count >= 1 << log2_run[run_index]) {
+                    run_count -= 1 << log2_run[run_index];
+                    run_index++;
+                    put_bits(sc.pb, 1, 1);
+                }
+
+                put_bits(sc.pb, 1 + log2_run[run_index], run_count); /* run_count < 1 << log2_run[run_index] here, so the top bit of this field is 0 */
+                if (run_index != 0)
+                    run_index--;
+                run_count = 0;
+                run_mode  = false;
+                if (d[1] > 0) /* d[1] != 0 in this branch, so positive values are shifted down by one */
+                    d[1]--;
+            } else {
+                run_count++;
+            }
+        }
+
+        if (!run_mode) { /* also taken for the sample that just ended a run */
+            VlcState sb = VlcState(state + VLC_STATE_SIZE*d[0]); /* each context owns VLC_STATE_SIZE units of state starting at `state` */
+            Symbol sym = get_vlc_symbol(sb, d[1], bits);
+            put_bits(sc.pb, sym.bits, sym.val);
+        }
+    }
+
+    if (run_mode) { /* the line ended while a run was still open */
+        while (run_count >= (1 << log2_run[run_index])) {
+            run_count -= 1 << log2_run[run_index];
+            run_index++;
+            put_bits(sc.pb, 1, 1);
+        }
+
+        if (run_count > 0) /* a leftover partial run is signalled with a single 1 bit */
+            put_bits(sc.pb, 1, 1);
+    }
+}
+#endif
+
 void encode_slice(inout SliceContext sc, const uint slice_idx)
 {
+#ifndef RGB
     int bits = bits_per_raw_sample;
+#else
+    int bits = 9;
+    if (bits != 8 || sc.slice_coding_mode != 0)
+        bits = bits_per_raw_sample + int(sc.slice_coding_mode != 1);
+#endif
 
 #ifndef GOLOMB
     if (sc.slice_coding_mode == 1) {
+#ifndef RGB
         for (int c = 0; c < components; c++) {
 
             int h = sc.slice_dim.y;
@@ -39,12 +213,22 @@ void encode_slice(inout SliceContext sc, const uint slice_idx)
             for (int y = 0; y < h; y++)
                 encode_line_pcm(sc, y, p, comp, bits);
         }
+#else
+        for (int y = 0; y < sc.slice_dim.y; y++) {
+            encode_line_pcm(sc, y, 0, 1, bits);
+            encode_line_pcm(sc, y, 0, 2, bits);
+            encode_line_pcm(sc, y, 0, 0, bits);
+            if (transparency == 1)
+                encode_line_pcm(sc, y, 0, 3, bits);
+        }
+#endif
     } else
 #endif
     {
         uint64_t slice_state_off = uint64_t(slice_state) +
                                    slice_idx*plane_state_size*codec_planes;
 
+#ifndef RGB
         for (int c = 0; c < components; c++) {
             int run_index = 0;
 
@@ -62,13 +246,67 @@ void encode_slice(inout SliceContext sc, const uint slice_idx)
             if (c != 1)
                 slice_state_off += plane_state_size;
         }
+#else
+        int run_index = 0;
+        for (int y = 0; y < sc.slice_dim.y; y++) {
+            encode_line(sc, slice_state_off + plane_state_size*0,
+                        y, 0, 1, bits, run_index);
+            encode_line(sc, slice_state_off + plane_state_size*1,
+                        y, 0, 2, bits, run_index);
+            encode_line(sc, slice_state_off + plane_state_size*1,
+                        y, 0, 0, bits, run_index);
+            if (transparency == 1)
+                encode_line(sc, slice_state_off + plane_state_size*2,
+                            y, 0, 3, bits, run_index);
+        }
+#endif
+    }
+}
+
+void finalize_slice(inout SliceContext sc, const uint slice_idx)
+{ /* Terminates the slice bitstream, appends the length (and optional CRC) trailer and stores the result in slice_results */
+#ifdef GOLOMB
+    uint32_t enc_len = sc.hdr_len + flush_put_bits(sc.pb); /* header length plus whatever flush_put_bits() reports */
+#else
+    uint32_t enc_len = rac_terminate(sc.c);
+#endif
+
+    u8buf bs = u8buf(sc.c.bytestream_start); /* byte view of the slice output, indexed from its start */
+
+    /* Append slice length: 3 bytes, components z, y, x of unpack8(enc_len) (big-endian 24-bit if unpack8() is LSB-first - confirm) */
+    u8vec4 enc_len_p = unpack8(enc_len);
+    bs[enc_len + 0].v = enc_len_p.z;
+    bs[enc_len + 1].v = enc_len_p.y;
+    bs[enc_len + 2].v = enc_len_p.x;
+    enc_len += 3;
+
+    /* Calculate and write CRC; only done when ec is non-zero */
+    if (ec != 0) {
+        bs[enc_len].v = uint8_t(0); /* one zero byte precedes the CRC and is covered by it */
+        enc_len++;
+
+        uint32_t crc = crcref; /* crcref is the starting value of the CRC */
+        for (int i = 0; i < enc_len; i++) /* covers payload, length field and the zero byte */
+            crc = crc_ieee[(crc & 0xFF) ^ uint32_t(bs[i].v)] ^ (crc >> 8);
+
+        if (crcref != 0x00000000)
+            crc ^= 0x8CD88196; /* NOTE(review): the derivation of this constant is not visible here - confirm */
+
+        u8vec4 crc_p = unpack8(crc);
+        bs[enc_len + 0].v = crc_p.x; /* CRC bytes are stored in x, y, z, w order, the reverse of the length field */
+        bs[enc_len + 1].v = crc_p.y;
+        bs[enc_len + 2].v = crc_p.z;
+        bs[enc_len + 3].v = crc_p.w;
+        enc_len += 4;
     }
 
-    finalize_slice(sc, slice_idx);
+    slice_results[slice_idx*2 + 0] = enc_len; /* total slice size including the trailer */
+    slice_results[slice_idx*2 + 1] = uint64_t(bs) - uint64_t(out_data); /* offset of the slice start relative to out_data */
 }
 
 void main(void)
 {
     const uint slice_idx = gl_WorkGroupID.y*gl_NumWorkGroups.x + gl_WorkGroupID.x;
     encode_slice(slice_ctx[slice_idx], slice_idx);
+    finalize_slice(slice_ctx[slice_idx], slice_idx); /* appends the length/CRC trailer and records the slice in slice_results */
 }
diff --git a/libavcodec/vulkan/ffv1_enc_ac.comp b/libavcodec/vulkan/ffv1_enc_ac.comp
deleted file mode 100644
index 0bbf58c5dd..0000000000
--- a/libavcodec/vulkan/ffv1_enc_ac.comp
+++ /dev/null
@@ -1,83 +0,0 @@
-/*
- * Copyright (c) 2024 Lynne <dev@lynne.ee>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-void put_rac(inout RangeCoder c, uint64_t state, bool bit)
-{
-    put_rac_norenorm(c, state, bit);
-    if (c.range < 0x100)
-        renorm_encoder(c);
-}
-
-/* Note - only handles signed values */
-void put_symbol(inout RangeCoder c, uint64_t state, int v)
-{
-    bool is_nil = (v == 0);
-    put_rac(c, state, is_nil);
-    if (is_nil)
-        return;
-
-    const int a = abs(v);
-    const int e = findMSB(a);
-
-    state += 1;
-    for (int i = 0; i < e; i++)
-        put_rac(c, state + min(i, 9), true);
-    put_rac(c, state + min(e, 9), false);
-
-    state += 21;
-    for (int i = e - 1; i >= 0; i--)
-        put_rac(c, state + min(i, 9), bool(bitfieldExtract(a, i, 1)));
-
-    put_rac(c, state - 11 + min(e, 10), v < 0);
-}
-
-void encode_line_pcm(inout SliceContext sc, int y, int p, int comp,
-                     int bits)
-{
-    ivec2 sp = sc.slice_pos;
-    int w = sc.slice_dim.x;
-    if (p > 0 && p < 3) {
-        w >>= chroma_shift.x;
-        sp >>= chroma_shift;
-    }
-
-    for (int x = 0; x < w; x++) {
-        uint v = imageLoad(src[p], (sp + ivec2(x, y)))[comp];
-        for (int i = (bits - 1); i >= 0; i--)
-            put_rac_equi(sc.c, bool(bitfieldExtract(v, i, 1)));
-    }
-}
-
-void encode_line(inout SliceContext sc, uint64_t state,
-                 int y, int p, int comp, int bits, const int run_index)
-{
-    ivec2 sp = sc.slice_pos;
-
-    int w = sc.slice_dim.x;
-    if (p > 0 && p < 3) {
-        w >>= chroma_shift.x;
-        sp >>= chroma_shift;
-    }
-
-    for (int x = 0; x < w; x++) {
-        const ivec2 d = get_diff(sp + ivec2(x, y), ivec2(x, y), p, comp, w, bits);
-        put_symbol(sc.c, state + CONTEXT_SIZE*d[0], d[1]);
-    }
-}
diff --git a/libavcodec/vulkan/ffv1_enc_common.comp b/libavcodec/vulkan/ffv1_enc_common.comp
deleted file mode 100644
index 62c0624b0e..0000000000
--- a/libavcodec/vulkan/ffv1_enc_common.comp
+++ /dev/null
@@ -1,101 +0,0 @@
-/*
- * FFv1 codec
- *
- * Copyright (c) 2024 Lynne <dev@lynne.ee>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-ivec2 get_diff(ivec2 pos, ivec2 off, int p, int comp, int sw, int bits)
-{
-    const ivec2 yoff_border1 = off.x == 0 ? ivec2(1, -1) : ivec2(0, 0);
-    const ivec2 yoff_border2 = off.x == 1 ? ivec2(1, -1) : ivec2(0, 0);
-
-    TYPE top2 = TYPE(0);
-    if (off.y > 1)
-        top2 = TYPE(imageLoad(src[p], pos + ivec2(0, -2))[comp]);
-
-    VTYPE3 top  = VTYPE3(TYPE(0),
-                         TYPE(0),
-                         TYPE(0));
-    if (off.y > 0 && off != ivec2(0, 1))
-        top[0] = TYPE(imageLoad(src[p], pos + ivec2(-1, -1) + yoff_border1)[comp]);
-    if (off.y > 0) {
-        top[1] = TYPE(imageLoad(src[p], pos + ivec2(0, -1))[comp]);
-        top[2] = TYPE(imageLoad(src[p], pos + ivec2(min(1, sw - off.x - 1), -1))[comp]);
-    }
-
-    VTYPE3 cur = VTYPE3(TYPE(0),
-                        TYPE(0),
-                        imageLoad(src[p], pos)[comp]);
-    if (off.x > 0 && off != ivec2(1, 0))
-        cur[0] = TYPE(imageLoad(src[p], pos + ivec2(-2,  0) + yoff_border2)[comp]);
-    if (off != ivec2(0, 0))
-        cur[1] = TYPE(imageLoad(src[p], pos + ivec2(-1,  0) + yoff_border1)[comp]);
-
-    /* context, diff */
-    ivec2 d = ivec2(get_context(VTYPE2(cur), top, top2, context_model),
-                    cur[2] - predict(cur[1], VTYPE2(top)));
-
-    if (d[0] < 0)
-        d = -d;
-
-    d[1] = fold(d[1], bits);
-
-    return d;
-}
-
-void finalize_slice(inout SliceContext sc, const uint slice_idx)
-{
-#ifdef GOLOMB
-    uint32_t enc_len = sc.hdr_len + flush_put_bits(sc.pb);
-#else
-    uint32_t enc_len = rac_terminate(sc.c);
-#endif
-
-    u8buf bs = u8buf(sc.c.bytestream_start);
-
-    /* Append slice length */
-    u8vec4 enc_len_p = unpack8(enc_len);
-    bs[enc_len + 0].v = enc_len_p.z;
-    bs[enc_len + 1].v = enc_len_p.y;
-    bs[enc_len + 2].v = enc_len_p.x;
-    enc_len += 3;
-
-    /* Calculate and write CRC */
-    if (ec != 0) {
-        bs[enc_len].v = uint8_t(0);
-        enc_len++;
-
-        uint32_t crc = crcref;
-        for (int i = 0; i < enc_len; i++)
-            crc = crc_ieee[(crc & 0xFF) ^ uint32_t(bs[i].v)] ^ (crc >> 8);
-
-        if (crcref != 0x00000000)
-            crc ^= 0x8CD88196;
-
-        u8vec4 crc_p = unpack8(crc);
-        bs[enc_len + 0].v = crc_p.x;
-        bs[enc_len + 1].v = crc_p.y;
-        bs[enc_len + 2].v = crc_p.z;
-        bs[enc_len + 3].v = crc_p.w;
-        enc_len += 4;
-    }
-
-    slice_results[slice_idx*2 + 0] = enc_len;
-    slice_results[slice_idx*2 + 1] = uint64_t(bs) - uint64_t(out_data);
-}
diff --git a/libavcodec/vulkan/ffv1_enc_rgb.comp b/libavcodec/vulkan/ffv1_enc_rgb.comp
deleted file mode 100644
index c176d94e8b..0000000000
--- a/libavcodec/vulkan/ffv1_enc_rgb.comp
+++ /dev/null
@@ -1,83 +0,0 @@
-/*
- * FFv1 codec
- *
- * Copyright (c) 2024 Lynne <dev@lynne.ee>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-void encode_slice_rgb(inout SliceContext sc, const uint slice_idx)
-{
-    int bits = 9;
-    if (bits != 8 || sc.slice_coding_mode != 0)
-        bits = bits_per_raw_sample + int(sc.slice_coding_mode != 1);
-
-    int run_index = 0;
-
-#ifndef GOLOMB
-    if (sc.slice_coding_mode == 1) {
-        if (transparency == 1) {
-            for (int y = 0; y < sc.slice_dim.y; y++) {
-                encode_line_pcm(sc, y, 0, 1, bits);
-                encode_line_pcm(sc, y, 0, 2, bits);
-                encode_line_pcm(sc, y, 0, 0, bits);
-                encode_line_pcm(sc, y, 0, 3, bits);
-            }
-        } else {
-            for (int y = 0; y < sc.slice_dim.y; y++) {
-                encode_line_pcm(sc, y, 0, 1, bits);
-                encode_line_pcm(sc, y, 0, 2, bits);
-                encode_line_pcm(sc, y, 0, 0, bits);
-            }
-        }
-    } else
-#endif
-    {
-        uint64_t slice_state_off = uint64_t(slice_state) +
-                                   slice_idx*plane_state_size*codec_planes;
-
-        if (transparency == 1) {
-            for (int y = 0; y < sc.slice_dim.y; y++) {
-                encode_line(sc, slice_state_off + plane_state_size*0,
-                            y, 0, 1, bits, run_index);
-                encode_line(sc, slice_state_off + plane_state_size*1,
-                            y, 0, 2, bits, run_index);
-                encode_line(sc, slice_state_off + plane_state_size*1,
-                            y, 0, 0, bits, run_index);
-                encode_line(sc, slice_state_off + plane_state_size*2,
-                            y, 0, 3, bits, run_index);
-            }
-        } else {
-            for (int y = 0; y < sc.slice_dim.y; y++) {
-                encode_line(sc, slice_state_off + plane_state_size*0,
-                            y, 0, 1, bits, run_index);
-                encode_line(sc, slice_state_off + plane_state_size*1,
-                            y, 0, 2, bits, run_index);
-                encode_line(sc, slice_state_off + plane_state_size*1,
-                            y, 0, 0, bits, run_index);
-            }
-        }
-    }
-
-    finalize_slice(sc, slice_idx);
-}
-
-void main(void)
-{
-    const uint slice_idx = gl_WorkGroupID.y*gl_NumWorkGroups.x + gl_WorkGroupID.x;
-    encode_slice_rgb(slice_ctx[slice_idx], slice_idx);
-}
diff --git a/libavcodec/vulkan/ffv1_enc_vlc.comp b/libavcodec/vulkan/ffv1_enc_vlc.comp
deleted file mode 100644
index 7a4d39e307..0000000000
--- a/libavcodec/vulkan/ffv1_enc_vlc.comp
+++ /dev/null
@@ -1,112 +0,0 @@
-/*
- * FFv1 codec
- *
- * Copyright (c) 2024 Lynne <dev@lynne.ee>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-struct RLEState {
-    int count;
-    int diff;
-    int index;
-    bool mode;
-};
-
-void calc_new_state(inout RLEState state, int context)
-{
-    if (context == 0)
-        state.mode = false;
-
-    if (!state.mode)
-        return;
-
-    if (state.diff > 0) {
-        while (state.count >= (1 << log2_run[state.index])) {
-            state.count -= 1 << log2_run[state.index];
-            state.index++;
-        }
-        if (state.index > 0)
-            state.index--;
-        state.count = 0;
-        state.mode = false;
-        if (state.diff > 0)
-            state.diff--;
-    } else {
-        state.count++;
-    }
-}
-
-void encode_line(inout SliceContext sc, uint64_t state,
-                 int y, int p, int comp, int bits, inout int run_index)
-{
-    ivec2 sp = sc.slice_pos;
-
-    int w = sc.slice_dim.x;
-    if (p > 0 && p < 3) {
-        w >>= chroma_shift.x;
-        sp >>= chroma_shift;
-    }
-
-    int run_count = 0;
-    bool run_mode = false;
-
-    for (int x = 0; x < w; x++) {
-        ivec2 d = get_diff(sp + ivec2(x, y), ivec2(x, y), p, comp, w, bits);
-
-        if (d[0] == 0)
-            run_mode = true;
-
-        if (run_mode) {
-            if (d[1] != 0) {
-                /* A very unlikely loop */
-                while (run_count >= 1 << log2_run[run_index]) {
-                    run_count -= 1 << log2_run[run_index];
-                    run_index++;
-                    put_bits(sc.pb, 1, 1);
-                }
-
-                put_bits(sc.pb, 1 + log2_run[run_index], run_count);
-                if (run_index != 0)
-                    run_index--;
-                run_count = 0;
-                run_mode  = false;
-                if (d[1] > 0)
-                    d[1]--;
-            } else {
-                run_count++;
-            }
-        }
-
-        if (!run_mode) {
-            VlcState sb = VlcState(state + VLC_STATE_SIZE*d[0]);
-            Symbol sym = get_vlc_symbol(sb, d[1], bits);
-            put_bits(sc.pb, sym.bits, sym.val);
-        }
-    }
-
-    if (run_mode) {
-        while (run_count >= (1 << log2_run[run_index])) {
-            run_count -= 1 << log2_run[run_index];
-            run_index++;
-            put_bits(sc.pb, 1, 1);
-        }
-
-        if (run_count > 0)
-            put_bits(sc.pb, 1, 1);
-    }
-}
-- 
2.49.0.395.g12beb8f557c
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 17+ messages in thread

* [FFmpeg-devel] [PATCH 02/16] vulkan/ffv1: synchronize get_pred implementations between encoder and decoder
  2025-05-14 19:02 [FFmpeg-devel] [PATCH 01/16] ffv1enc_vulkan: merge all encoder variants into one file Lynne
@ 2025-05-14 19:02 ` Lynne
  2025-05-14 19:02 ` [FFmpeg-devel] [PATCH 03/16] ffv1enc_vulkan: get rid of temporary data for the setup shader Lynne
                   ` (13 subsequent siblings)
  14 siblings, 0 replies; 17+ messages in thread
From: Lynne @ 2025-05-14 19:02 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: Lynne

---
 libavcodec/vulkan/ffv1_dec.comp | 32 ++++++-------
 libavcodec/vulkan/ffv1_enc.comp | 85 ++++++++++++++++++++-------------
 2 files changed, 68 insertions(+), 49 deletions(-)

diff --git a/libavcodec/vulkan/ffv1_dec.comp b/libavcodec/vulkan/ffv1_dec.comp
index fc0175c715..1c313b3168 100644
--- a/libavcodec/vulkan/ffv1_dec.comp
+++ b/libavcodec/vulkan/ffv1_dec.comp
@@ -29,19 +29,19 @@
 #endif
 
 #ifdef RGB
-ivec2 get_pred(ivec2 sp, ivec2 off, int p, int sw, uint8_t quant_table_idx)
+ivec2 get_pred(readonly uimage2D pred, ivec2 sp, ivec2 off, int comp, int sw, uint8_t quant_table_idx)
 {
     const ivec2 yoff_border1 = expectEXT(off.x == 0, false) ? ivec2(1, -1) : ivec2(0, 0);
 
     /* Thanks to the same coincidence as below, we can skip checking if off == 0, 1 */
-    VTYPE3 top  = VTYPE3(TYPE(imageLoad(dec[p], sp + LADDR(off + ivec2(-1, -1) + yoff_border1))[0]),
-                         TYPE(imageLoad(dec[p], sp + LADDR(off + ivec2(0, -1)))[0]),
-                         TYPE(imageLoad(dec[p], sp + LADDR(off + ivec2(min(1, sw - off.x - 1), -1)))[0]));
+    VTYPE3 top  = VTYPE3(TYPE(imageLoad(pred, sp + LADDR(off + ivec2(-1, -1) + yoff_border1))[comp]),
+                         TYPE(imageLoad(pred, sp + LADDR(off + ivec2(0, -1)))[comp]),
+                         TYPE(imageLoad(pred, sp + LADDR(off + ivec2(min(1, sw - off.x - 1), -1)))[comp]));
 
     /* Normally, we'd need to check if off != ivec2(0, 0) here, since otherwise, we must
      * return zero. However, ivec2(-1,  0) + ivec2(1, -1) == ivec2(0, -1), e.g. previous
      * row, 0 offset, same slice, which is zero since we zero out the buffer for RGB */
-    TYPE cur = TYPE(imageLoad(dec[p], sp + LADDR(off + ivec2(-1,  0) + yoff_border1))[0]);
+    TYPE cur = TYPE(imageLoad(pred, sp + LADDR(off + ivec2(-1,  0) + yoff_border1))[comp]);
 
     int base = quant_table[quant_table_idx][0][(cur    - top[0]) & MAX_QUANT_TABLE_MASK] +
                quant_table[quant_table_idx][1][(top[0] - top[1]) & MAX_QUANT_TABLE_MASK] +
@@ -51,12 +51,12 @@ ivec2 get_pred(ivec2 sp, ivec2 off, int p, int sw, uint8_t quant_table_idx)
         TYPE cur2 = TYPE(0);
         if (expectEXT(off.x > 0, true)) {
             const ivec2 yoff_border2 = expectEXT(off.x == 1, false) ? ivec2(-1, -1) : ivec2(-2, 0);
-            cur2 = TYPE(imageLoad(dec[p], sp + LADDR(off + yoff_border2))[0]);
+            cur2 = TYPE(imageLoad(pred, sp + LADDR(off + yoff_border2))[comp]);
         }
         base += quant_table[quant_table_idx][3][(cur2 - cur) & MAX_QUANT_TABLE_MASK];
 
         /* top-2 became current upon swap */
-        TYPE top2 = TYPE(imageLoad(dec[p], sp + LADDR(off))[0]);
+        TYPE top2 = TYPE(imageLoad(pred, sp + LADDR(off))[comp]);
         base += quant_table[quant_table_idx][4][(top2 - top[1]) & MAX_QUANT_TABLE_MASK];
     }
 
@@ -64,7 +64,7 @@ ivec2 get_pred(ivec2 sp, ivec2 off, int p, int sw, uint8_t quant_table_idx)
     return ivec2(base, predict(cur, VTYPE2(top)));
 }
 #else
-ivec2 get_pred(ivec2 sp, ivec2 off, int p, int sw, uint8_t quant_table_idx)
+ivec2 get_pred(readonly uimage2D pred, ivec2 sp, ivec2 off, int comp, int sw, uint8_t quant_table_idx)
 {
     const ivec2 yoff_border1 = off.x == 0 ? ivec2(1, -1) : ivec2(0, 0);
     sp += off;
@@ -73,15 +73,15 @@ ivec2 get_pred(ivec2 sp, ivec2 off, int p, int sw, uint8_t quant_table_idx)
                          TYPE(0),
                          TYPE(0));
     if (off.y > 0 && off != ivec2(0, 1))
-        top[0] = TYPE(imageLoad(dec[p], sp + ivec2(-1, -1) + yoff_border1)[0]);
+        top[0] = TYPE(imageLoad(pred, sp + ivec2(-1, -1) + yoff_border1)[comp]);
     if (off.y > 0) {
-        top[1] = TYPE(imageLoad(dec[p], sp + ivec2(0, -1))[0]);
-        top[2] = TYPE(imageLoad(dec[p], sp + ivec2(min(1, sw - off.x - 1), -1))[0]);
+        top[1] = TYPE(imageLoad(pred, sp + ivec2(0, -1))[comp]);
+        top[2] = TYPE(imageLoad(pred, sp + ivec2(min(1, sw - off.x - 1), -1))[comp]);
     }
 
     TYPE cur = TYPE(0);
     if (off != ivec2(0, 0))
-        cur = TYPE(imageLoad(dec[p], sp + ivec2(-1,  0) + yoff_border1)[0]);
+        cur = TYPE(imageLoad(pred, sp + ivec2(-1,  0) + yoff_border1)[comp]);
 
     int base = quant_table[quant_table_idx][0][(cur - top[0]) & MAX_QUANT_TABLE_MASK] +
                quant_table[quant_table_idx][1][(top[0] - top[1]) & MAX_QUANT_TABLE_MASK] +
@@ -92,13 +92,13 @@ ivec2 get_pred(ivec2 sp, ivec2 off, int p, int sw, uint8_t quant_table_idx)
         TYPE cur2 = TYPE(0);
         if (off.x > 0 && off != ivec2(1, 0)) {
             const ivec2 yoff_border2 = off.x == 1 ? ivec2(1, -1) : ivec2(0, 0);
-            cur2 = TYPE(imageLoad(dec[p], sp + ivec2(-2,  0) + yoff_border2)[0]);
+            cur2 = TYPE(imageLoad(pred, sp + ivec2(-2,  0) + yoff_border2)[comp]);
         }
         base += quant_table[quant_table_idx][3][(cur2 - cur) & MAX_QUANT_TABLE_MASK];
 
         TYPE top2 = TYPE(0);
         if (off.y > 1)
-            top2 = TYPE(imageLoad(dec[p], sp + ivec2(0, -2))[0]);
+            top2 = TYPE(imageLoad(pred, sp + ivec2(0, -2))[comp]);
         base += quant_table[quant_table_idx][4][(top2 - top[1]) & MAX_QUANT_TABLE_MASK];
     }
 
@@ -171,7 +171,7 @@ void decode_line(inout SliceContext sc, ivec2 sp, int w,
 #endif
 
     for (int x = 0; x < w; x++) {
-        ivec2 pr = get_pred(sp, ivec2(x, y), p, w,
+        ivec2 pr = get_pred(dec[p], sp, ivec2(x, y), 0, w,
                             quant_table_idx);
 
         uint context_off = state_off + CONTEXT_SIZE*abs(pr[0]);
@@ -216,7 +216,7 @@ void decode_line(inout SliceContext sc, ivec2 sp, int w,
     for (int x = 0; x < w; x++) {
         ivec2 pos = sp + ivec2(x, y);
         int diff;
-        ivec2 pr = get_pred(sp, ivec2(x, y), p, w,
+        ivec2 pr = get_pred(dec[p], sp, ivec2(x, y), 0, w,
                             quant_table_idx);
 
         VlcState sb = VlcState(uint64_t(slice_state) + state_off + VLC_STATE_SIZE*abs(pr[0]));
diff --git a/libavcodec/vulkan/ffv1_enc.comp b/libavcodec/vulkan/ffv1_enc.comp
index 9854ecad51..7f8c831efa 100644
--- a/libavcodec/vulkan/ffv1_enc.comp
+++ b/libavcodec/vulkan/ffv1_enc.comp
@@ -20,43 +20,46 @@
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
-ivec2 get_diff(ivec2 pos, ivec2 off, int p, int comp, int sw, int bits)
+ivec2 get_pred(readonly uimage2D pred, ivec2 sp, ivec2 off, int comp, int sw, uint8_t quant_table_idx)
 {
     const ivec2 yoff_border1 = off.x == 0 ? ivec2(1, -1) : ivec2(0, 0);
-    const ivec2 yoff_border2 = off.x == 1 ? ivec2(1, -1) : ivec2(0, 0);
-
-    TYPE top2 = TYPE(0);
-    if (off.y > 1)
-        top2 = TYPE(imageLoad(src[p], pos + ivec2(0, -2))[comp]);
+    sp += off;
 
     VTYPE3 top  = VTYPE3(TYPE(0),
                          TYPE(0),
                          TYPE(0));
     if (off.y > 0 && off != ivec2(0, 1))
-        top[0] = TYPE(imageLoad(src[p], pos + ivec2(-1, -1) + yoff_border1)[comp]);
+        top[0] = TYPE(imageLoad(pred, sp + ivec2(-1, -1) + yoff_border1)[comp]);
     if (off.y > 0) {
-        top[1] = TYPE(imageLoad(src[p], pos + ivec2(0, -1))[comp]);
-        top[2] = TYPE(imageLoad(src[p], pos + ivec2(min(1, sw - off.x - 1), -1))[comp]);
+        top[1] = TYPE(imageLoad(pred, sp + ivec2(0, -1))[comp]);
+        top[2] = TYPE(imageLoad(pred, sp + ivec2(min(1, sw - off.x - 1), -1))[comp]);
     }
 
-    VTYPE3 cur = VTYPE3(TYPE(0),
-                        TYPE(0),
-                        imageLoad(src[p], pos)[comp]);
-    if (off.x > 0 && off != ivec2(1, 0))
-        cur[0] = TYPE(imageLoad(src[p], pos + ivec2(-2,  0) + yoff_border2)[comp]);
+    TYPE cur = TYPE(0);
     if (off != ivec2(0, 0))
-        cur[1] = TYPE(imageLoad(src[p], pos + ivec2(-1,  0) + yoff_border1)[comp]);
-
-    /* context, diff */
-    ivec2 d = ivec2(get_context(VTYPE2(cur), top, top2, context_model),
-                    cur[2] - predict(cur[1], VTYPE2(top)));
-
-    if (d[0] < 0)
-        d = -d;
+        cur = TYPE(imageLoad(pred, sp + ivec2(-1,  0) + yoff_border1)[comp]);
+
+    int base = quant_table[quant_table_idx][0][(cur - top[0]) & MAX_QUANT_TABLE_MASK] +
+               quant_table[quant_table_idx][1][(top[0] - top[1]) & MAX_QUANT_TABLE_MASK] +
+               quant_table[quant_table_idx][2][(top[1] - top[2]) & MAX_QUANT_TABLE_MASK];
+
+    if ((quant_table[quant_table_idx][3][127] != 0) ||
+        (quant_table[quant_table_idx][4][127] != 0)) {
+        TYPE cur2 = TYPE(0);
+        if (off.x > 0 && off != ivec2(1, 0)) {
+            const ivec2 yoff_border2 = off.x == 1 ? ivec2(1, -1) : ivec2(0, 0);
+            cur2 = TYPE(imageLoad(pred, sp + ivec2(-2,  0) + yoff_border2)[comp]);
+        }
+        base += quant_table[quant_table_idx][3][(cur2 - cur) & MAX_QUANT_TABLE_MASK];
 
-    d[1] = fold(d[1], bits);
+        TYPE top2 = TYPE(0);
+        if (off.y > 1)
+            top2 = TYPE(imageLoad(pred, sp + ivec2(0, -2))[comp]);
+        base += quant_table[quant_table_idx][4][(top2 - top[1]) & MAX_QUANT_TABLE_MASK];
+    }
 
-    return d;
+    /* context, prediction */
+    return ivec2(base, predict(cur, VTYPE2(top)));
 }
 
 #ifndef GOLOMB
@@ -108,7 +111,8 @@ void encode_line_pcm(inout SliceContext sc, int y, int p, int comp,
 }
 
 void encode_line(inout SliceContext sc, uint64_t state,
-                 int y, int p, int comp, int bits, const int run_index)
+                 int y, int p, int comp, int bits,
+                 uint8_t quant_table_idx, const int run_index)
 {
     ivec2 sp = sc.slice_pos;
 
@@ -119,7 +123,14 @@ void encode_line(inout SliceContext sc, uint64_t state,
     }
 
     for (int x = 0; x < w; x++) {
-        const ivec2 d = get_diff(sp + ivec2(x, y), ivec2(x, y), p, comp, w, bits);
+        ivec2 d = get_pred(src[p], sp, ivec2(x, y), comp, w, quant_table_idx);
+        d[1] = int(imageLoad(src[p], sp + ivec2(x, y))[comp]) - d[1];
+
+        if (d[0] < 0)
+            d = -d;
+
+        d[1] = fold(d[1], bits);
+
         put_symbol(sc.c, state + CONTEXT_SIZE*d[0], d[1]);
     }
 }
@@ -127,7 +138,8 @@ void encode_line(inout SliceContext sc, uint64_t state,
 #else /* GOLOMB */
 
 void encode_line(inout SliceContext sc, uint64_t state,
-                 int y, int p, int comp, int bits, inout int run_index)
+                 int y, int p, int comp, int bits,
+                 uint8_t quant_table_idx, inout int run_index)
 {
     ivec2 sp = sc.slice_pos;
 
@@ -141,7 +153,13 @@ void encode_line(inout SliceContext sc, uint64_t state,
     bool run_mode = false;
 
     for (int x = 0; x < w; x++) {
-        ivec2 d = get_diff(sp + ivec2(x, y), ivec2(x, y), p, comp, w, bits);
+        ivec2 d = get_pred(src[p], sp, ivec2(x, y), comp, w, quant_table_idx);
+        d[1] = int(imageLoad(src[p], sp + ivec2(x, y))[comp]) - d[1];
+
+        if (d[0] < 0)
+            d = -d;
+
+        d[1] = fold(d[1], bits);
 
         if (d[0] == 0)
             run_mode = true;
@@ -225,6 +243,7 @@ void encode_slice(inout SliceContext sc, const uint slice_idx)
     } else
 #endif
     {
+        u8vec4 quant_table_idx = sc.quant_table_idx.xyyz;
         uint64_t slice_state_off = uint64_t(slice_state) +
                                    slice_idx*plane_state_size*codec_planes;
 
@@ -240,7 +259,7 @@ void encode_slice(inout SliceContext sc, const uint slice_idx)
             int comp = c - p;
 
             for (int y = 0; y < h; y++)
-                encode_line(sc, slice_state_off, y, p, comp, bits, run_index);
+                encode_line(sc, slice_state_off, y, p, comp, bits, quant_table_idx[c], run_index);
 
             /* For the second chroma plane, reuse the first plane's state */
             if (c != 1)
@@ -250,14 +269,14 @@ void encode_slice(inout SliceContext sc, const uint slice_idx)
         int run_index = 0;
         for (int y = 0; y < sc.slice_dim.y; y++) {
             encode_line(sc, slice_state_off + plane_state_size*0,
-                        y, 0, 1, bits, run_index);
+                        y, 0, 1, bits, quant_table_idx[0], run_index);
             encode_line(sc, slice_state_off + plane_state_size*1,
-                        y, 0, 2, bits, run_index);
+                        y, 0, 2, bits, quant_table_idx[1], run_index);
             encode_line(sc, slice_state_off + plane_state_size*1,
-                        y, 0, 0, bits, run_index);
+                        y, 0, 0, bits, quant_table_idx[2], run_index);
             if (transparency == 1)
                 encode_line(sc, slice_state_off + plane_state_size*2,
-                            y, 0, 3, bits, run_index);
+                            y, 0, 3, bits, quant_table_idx[3], run_index);
         }
 #endif
     }
-- 
2.49.0.395.g12beb8f557c
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 17+ messages in thread

* [FFmpeg-devel] [PATCH 03/16] ffv1enc_vulkan: get rid of temporary data for the setup shader
  2025-05-14 19:02 [FFmpeg-devel] [PATCH 01/16] ffv1enc_vulkan: merge all encoder variants into one file Lynne
  2025-05-14 19:02 ` [FFmpeg-devel] [PATCH 02/16] vulkan/ffv1: synchronize get_pred implementations between encoder and decoder Lynne
@ 2025-05-14 19:02 ` Lynne
  2025-05-14 19:02 ` [FFmpeg-devel] [PATCH 04/16] ffv1enc_vulkan: unify EC code between setup and encode Lynne
                   ` (12 subsequent siblings)
  14 siblings, 0 replies; 17+ messages in thread
From: Lynne @ 2025-05-14 19:02 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: Lynne

---
 libavcodec/ffv1enc_vulkan.c           | 21 ---------
 libavcodec/vulkan/ffv1_enc_setup.comp | 65 +++++++++++----------------
 libavcodec/vulkan/rangecoder.comp     | 28 +++++++-----
 3 files changed, 42 insertions(+), 72 deletions(-)

diff --git a/libavcodec/ffv1enc_vulkan.c b/libavcodec/ffv1enc_vulkan.c
index f4b54b8375..d78ba3aca8 100644
--- a/libavcodec/ffv1enc_vulkan.c
+++ b/libavcodec/ffv1enc_vulkan.c
@@ -88,9 +88,6 @@ typedef struct VulkanEncodeFFv1Context {
     AVBufferPool *out_data_pool;
     AVBufferPool *pkt_data_pool;
 
-    /* Temporary data buffer */
-    AVBufferPool *tmp_data_pool;
-
     /* Slice results buffer */
     AVBufferPool *results_data_pool;
 
@@ -303,11 +300,6 @@ static int vulkan_encode_ffv1_submit_frame(AVCodecContext *avctx,
 
     AVFrame *intermediate_frame = NULL;
 
-    /* Temporary data */
-    size_t tmp_data_size;
-    AVBufferRef *tmp_data_ref;
-    FFVkBuffer *tmp_data_buf;
-
     /* Slice data */
     AVBufferRef *slice_data_ref;
     FFVkBuffer *slice_data_buf;
@@ -352,17 +344,6 @@ static int vulkan_encode_ffv1_submit_frame(AVCodecContext *avctx,
 
     f->slice_count = f->max_slice_count;
 
-    /* Allocate temporary data buffer */
-    tmp_data_size = f->slice_count*CONTEXT_SIZE;
-    RET(ff_vk_get_pooled_buffer(&fv->s, &fv->tmp_data_pool,
-                                &tmp_data_ref,
-                                VK_BUFFER_USAGE_STORAGE_BUFFER_BIT |
-                                VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT,
-                                NULL, tmp_data_size,
-                                VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT));
-    tmp_data_buf = (FFVkBuffer *)tmp_data_ref->data;
-    ff_vk_exec_add_dep_buf(&fv->s, exec, &tmp_data_ref, 1, 0);
-
     /* Allocate slice buffer data */
     if (f->ac == AC_GOLOMB_RICE)
         plane_state_size = 8;
@@ -481,7 +462,6 @@ static int vulkan_encode_ffv1_submit_frame(AVCodecContext *avctx,
     ff_vk_exec_bind_shader(&fv->s, exec, &fv->setup);
     pd = (FFv1VkParameters) {
         .slice_state = slice_data_buf->address + f->slice_count*256,
-        .scratch_data = tmp_data_buf->address,
         .out_data = out_data_buf->address,
         .bits_per_raw_sample = f->bits_per_raw_sample,
         .sar[0] = pict->sample_aspect_ratio.num,
@@ -1698,7 +1678,6 @@ static av_cold int vulkan_encode_ffv1_close(AVCodecContext *avctx)
 
     av_buffer_pool_uninit(&fv->out_data_pool);
     av_buffer_pool_uninit(&fv->pkt_data_pool);
-    av_buffer_pool_uninit(&fv->tmp_data_pool);
 
     av_buffer_unref(&fv->keyframe_slice_data_ref);
     av_buffer_pool_uninit(&fv->slice_data_pool);
diff --git a/libavcodec/vulkan/ffv1_enc_setup.comp b/libavcodec/vulkan/ffv1_enc_setup.comp
index 44c13404d8..d395770ba8 100644
--- a/libavcodec/vulkan/ffv1_enc_setup.comp
+++ b/libavcodec/vulkan/ffv1_enc_setup.comp
@@ -20,6 +20,8 @@
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
+uint8_t state[CONTEXT_SIZE];
+
 void init_slice(out SliceContext sc, const uint slice_idx)
 {
     /* Set coordinates */
@@ -45,67 +47,54 @@ void init_slice(out SliceContext sc, const uint slice_idx)
              slice_size_max);
 }
 
-void put_rac_full(inout RangeCoder c, uint64_t state, bool bit)
-{
-    put_rac_norenorm(c, state, bit);
-    if (c.range < 0x100)
-        renorm_encoder_full(c);
-}
-
-void put_symbol_unsigned(inout RangeCoder c, uint64_t state, uint v)
+void put_usymbol(inout RangeCoder c, uint v)
 {
     bool is_nil = (v == 0);
-    put_rac_full(c, state, is_nil);
+    put_rac(c, state[0], is_nil);
     if (is_nil)
         return;
 
     const int e = findMSB(v);
 
-    state += 1;
     for (int i = 0; i < e; i++)
-        put_rac_full(c, state + min(i, 9), true);
-    put_rac_full(c, state + min(e, 9), false);
+        put_rac(c, state[1 + min(i, 9)], true);
+    put_rac(c, state[1 + min(e, 9)], false);
 
-    state += 21;
     for (int i = e - 1; i >= 0; i--)
-        put_rac_full(c, state + min(i, 9), bool(bitfieldExtract(v, i, 1)));
+        put_rac(c, state[22 + min(i, 9)], bool(bitfieldExtract(v, i, 1)));
 }
 
-void write_slice_header(inout SliceContext sc, uint64_t state)
+void write_slice_header(inout SliceContext sc)
 {
-    u8buf sb = u8buf(state);
-
     [[unroll]]
     for (int i = 0; i < CONTEXT_SIZE; i++)
-        sb[i].v = uint8_t(128);
+        state[i] = uint8_t(128);
 
-    put_symbol_unsigned(sc.c, state, gl_WorkGroupID.x);
-    put_symbol_unsigned(sc.c, state, gl_WorkGroupID.y);
-    put_symbol_unsigned(sc.c, state, 0);
-    put_symbol_unsigned(sc.c, state, 0);
+    put_usymbol(sc.c, gl_WorkGroupID.x);
+    put_usymbol(sc.c, gl_WorkGroupID.y);
+    put_usymbol(sc.c, 0);
+    put_usymbol(sc.c, 0);
 
     for (int i = 0; i < codec_planes; i++)
-        put_symbol_unsigned(sc.c, state, sc.quant_table_idx[i]);
+        put_usymbol(sc.c, sc.quant_table_idx[i]);
 
-    put_symbol_unsigned(sc.c, state, pic_mode);
-    put_symbol_unsigned(sc.c, state, sar.x);
-    put_symbol_unsigned(sc.c, state, sar.y);
+    put_usymbol(sc.c, pic_mode);
+    put_usymbol(sc.c, sar.x);
+    put_usymbol(sc.c, sar.y);
 
     if (version >= 4) {
-        put_rac_full(sc.c, state, sc.slice_reset_contexts);
-        put_symbol_unsigned(sc.c, state, sc.slice_coding_mode);
+        put_rac(sc.c, state[0], sc.slice_reset_contexts);
+        put_usymbol(sc.c, sc.slice_coding_mode);
         if (sc.slice_coding_mode != 1 && colorspace == 1) {
-            put_symbol_unsigned(sc.c, state, sc.slice_rct_coef.y);
-            put_symbol_unsigned(sc.c, state, sc.slice_rct_coef.x);
+            put_usymbol(sc.c, sc.slice_rct_coef.y);
+            put_usymbol(sc.c, sc.slice_rct_coef.x);
         }
     }
 }
 
-void write_frame_header(inout SliceContext sc, uint64_t state)
+void write_frame_header(inout SliceContext sc)
 {
-    u8buf sb = u8buf(state);
-    sb.v = uint8_t(128);
-    put_rac_full(sc.c, state, bool(key_frame));
+    put_rac_equi(sc.c, bool(key_frame));
 }
 
 #ifdef GOLOMB
@@ -122,16 +111,12 @@ void main(void)
 {
     const uint slice_idx = gl_WorkGroupID.y*gl_NumWorkGroups.x + gl_WorkGroupID.x;
 
-    /* Write slice data */
-    uint64_t scratch_state = uint64_t(scratch_data) + slice_idx*CONTEXT_SIZE;
-    u8buf sb = u8buf(scratch_state);
-
     init_slice(slice_ctx[slice_idx], slice_idx);
 
     if (slice_idx == 0)
-        write_frame_header(slice_ctx[slice_idx], scratch_state);
+        write_frame_header(slice_ctx[slice_idx]);
 
-    write_slice_header(slice_ctx[slice_idx], scratch_state);
+    write_slice_header(slice_ctx[slice_idx]);
 
 #ifdef GOLOMB
     init_golomb(slice_ctx[slice_idx]);
diff --git a/libavcodec/vulkan/rangecoder.comp b/libavcodec/vulkan/rangecoder.comp
index 256b5f0e79..1db42e1dc9 100644
--- a/libavcodec/vulkan/rangecoder.comp
+++ b/libavcodec/vulkan/rangecoder.comp
@@ -91,15 +91,13 @@ void renorm_encoder(inout RangeCoder c)
         bs[i].v = fill;
 }
 
-void put_rac_norenorm(inout RangeCoder c, uint64_t state, bool bit)
+void put_rac_direct(inout RangeCoder c, uint8_t state, bool bit)
 {
-    u8buf sb = u8buf(state);
-    uint val = uint(sb.v);
-    int range1 = uint16_t((c.range * val) >> 8);
+    int range1 = uint16_t((c.range * state) >> 8);
 
 #ifdef DEBUG
-    if (val == 0)
-        debugPrintfEXT("Error: state is zero (addr: 0x%lx)", uint64_t(sb));
+    if (state == 0)
+        debugPrintfEXT("Error: state is zero");
     if (range1 >= c.range)
         debugPrintfEXT("Error: range1 >= c.range");
     if (range1 <= 0)
@@ -113,13 +111,21 @@ void put_rac_norenorm(inout RangeCoder c, uint64_t state, bool bit)
     } else {
         c.range  = diff;
     }
+}
 
-    sb.v = zero_one_state[(uint(bit) << 8) + val];
+void put_rac_norenorm(inout RangeCoder c, uint64_t state, bool bit)
+{
+    put_rac_direct(c, u8buf(state).v, bit);
 
-#ifdef DEBUG
-    if (sb.v == 0)
-        debugPrintfEXT("Error: inserted zero state from tab %i idx %i", bit, val);
-#endif
+    u8buf(state).v = zero_one_state[(uint(bit) << 8) + u8buf(state).v];
+}
+
+void put_rac(inout RangeCoder c, inout uint8_t state, bool bit)
+{
+    put_rac_direct(c, state, bit);
+    if (c.range < 0x100)
+        renorm_encoder_full(c);
+    state = zero_one_state[(uint(bit) << 8) + state];
 }
 
 /* Equiprobable bit */
-- 
2.49.0.395.g12beb8f557c
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 17+ messages in thread

* [FFmpeg-devel] [PATCH 04/16] ffv1enc_vulkan: unify EC code between setup and encode
  2025-05-14 19:02 [FFmpeg-devel] [PATCH 01/16] ffv1enc_vulkan: merge all encoder variants into one file Lynne
  2025-05-14 19:02 ` [FFmpeg-devel] [PATCH 02/16] vulkan/ffv1: synchronize get_pred implementations between encoder and decoder Lynne
  2025-05-14 19:02 ` [FFmpeg-devel] [PATCH 03/16] ffv1enc_vulkan: get rid of temporary data for the setup shader Lynne
@ 2025-05-14 19:02 ` Lynne
  2025-05-14 19:02 ` [FFmpeg-devel] [PATCH 05/16] ffv1enc_vulkan: minor EC optimizations Lynne
                   ` (11 subsequent siblings)
  14 siblings, 0 replies; 17+ messages in thread
From: Lynne @ 2025-05-14 19:02 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: Lynne

---
 libavcodec/ffv1enc_vulkan.c           |  1 +
 libavcodec/vulkan/ffv1_enc.comp       |  7 -------
 libavcodec/vulkan/ffv1_enc_setup.comp | 10 +++++-----
 libavcodec/vulkan/rangecoder.comp     | 23 +++++++++++------------
 4 files changed, 17 insertions(+), 24 deletions(-)

diff --git a/libavcodec/ffv1enc_vulkan.c b/libavcodec/ffv1enc_vulkan.c
index d78ba3aca8..956463e932 100644
--- a/libavcodec/ffv1enc_vulkan.c
+++ b/libavcodec/ffv1enc_vulkan.c
@@ -976,6 +976,7 @@ static int init_setup_shader(AVCodecContext *avctx, FFVkSPIRVCompiler *spv)
     av_bprintf(&shd->src, "#define MAX_QUANT_TABLES %i\n", MAX_QUANT_TABLES);
     av_bprintf(&shd->src, "#define MAX_CONTEXT_INPUTS %i\n", MAX_CONTEXT_INPUTS);
     av_bprintf(&shd->src, "#define MAX_QUANT_TABLE_SIZE %i\n", MAX_QUANT_TABLE_SIZE);
+    av_bprintf(&shd->src, "#define FULL_RENORM\n");
 
     desc_set = (FFVulkanDescriptorSetBinding []) {
         {
diff --git a/libavcodec/vulkan/ffv1_enc.comp b/libavcodec/vulkan/ffv1_enc.comp
index 7f8c831efa..a3c22f7459 100644
--- a/libavcodec/vulkan/ffv1_enc.comp
+++ b/libavcodec/vulkan/ffv1_enc.comp
@@ -63,13 +63,6 @@ ivec2 get_pred(readonly uimage2D pred, ivec2 sp, ivec2 off, int comp, int sw, ui
 }
 
 #ifndef GOLOMB
-void put_rac(inout RangeCoder c, uint64_t state, bool bit)
-{
-    put_rac_norenorm(c, state, bit);
-    if (c.range < 0x100)
-        renorm_encoder(c);
-}
-
 /* Note - only handles signed values */
 void put_symbol(inout RangeCoder c, uint64_t state, int v)
 {
diff --git a/libavcodec/vulkan/ffv1_enc_setup.comp b/libavcodec/vulkan/ffv1_enc_setup.comp
index d395770ba8..6f21e47523 100644
--- a/libavcodec/vulkan/ffv1_enc_setup.comp
+++ b/libavcodec/vulkan/ffv1_enc_setup.comp
@@ -50,18 +50,18 @@ void init_slice(out SliceContext sc, const uint slice_idx)
 void put_usymbol(inout RangeCoder c, uint v)
 {
     bool is_nil = (v == 0);
-    put_rac(c, state[0], is_nil);
+    put_rac_direct(c, state[0], is_nil);
     if (is_nil)
         return;
 
     const int e = findMSB(v);
 
     for (int i = 0; i < e; i++)
-        put_rac(c, state[1 + min(i, 9)], true);
-    put_rac(c, state[1 + min(e, 9)], false);
+        put_rac_direct(c, state[1 + min(i, 9)], true);
+    put_rac_direct(c, state[1 + min(e, 9)], false);
 
     for (int i = e - 1; i >= 0; i--)
-        put_rac(c, state[22 + min(i, 9)], bool(bitfieldExtract(v, i, 1)));
+        put_rac_direct(c, state[22 + min(i, 9)], bool(bitfieldExtract(v, i, 1)));
 }
 
 void write_slice_header(inout SliceContext sc)
@@ -83,7 +83,7 @@ void write_slice_header(inout SliceContext sc)
     put_usymbol(sc.c, sar.y);
 
     if (version >= 4) {
-        put_rac(sc.c, state[0], sc.slice_reset_contexts);
+        put_rac_direct(sc.c, state[0], sc.slice_reset_contexts);
         put_usymbol(sc.c, sc.slice_coding_mode);
         if (sc.slice_coding_mode != 1 && colorspace == 1) {
             put_usymbol(sc.c, sc.slice_rct_coef.y);
diff --git a/libavcodec/vulkan/rangecoder.comp b/libavcodec/vulkan/rangecoder.comp
index 1db42e1dc9..badc65293f 100644
--- a/libavcodec/vulkan/rangecoder.comp
+++ b/libavcodec/vulkan/rangecoder.comp
@@ -31,8 +31,9 @@ struct RangeCoder {
     uint8_t outstanding_byte;
 };
 
+#ifdef FULL_RENORM
 /* Full renorm version that can handle outstanding_byte == 0xFF */
-void renorm_encoder_full(inout RangeCoder c)
+void renorm_encoder(inout RangeCoder c)
 {
     int bs_cnt = 0;
     u8buf bytestream = u8buf(c.bytestream);
@@ -62,6 +63,8 @@ void renorm_encoder_full(inout RangeCoder c)
     c.low = bitfieldInsert(0, c.low, 8, 8);
 }
 
+#else
+
 /* Cannot deal with outstanding_byte == -1 in the name of speed */
 void renorm_encoder(inout RangeCoder c)
 {
@@ -90,8 +93,9 @@ void renorm_encoder(inout RangeCoder c)
     for (int i = 1; i < oc; i++)
         bs[i].v = fill;
 }
+#endif
 
-void put_rac_direct(inout RangeCoder c, uint8_t state, bool bit)
+void put_rac_direct(inout RangeCoder c, inout uint8_t state, bool bit)
 {
     int range1 = uint16_t((c.range * state) >> 8);
 
@@ -111,21 +115,16 @@ void put_rac_direct(inout RangeCoder c, uint8_t state, bool bit)
     } else {
         c.range  = diff;
     }
-}
 
-void put_rac_norenorm(inout RangeCoder c, uint64_t state, bool bit)
-{
-    put_rac_direct(c, u8buf(state).v, bit);
+    if (c.range < 0x100)
+        renorm_encoder(c);
 
-    u8buf(state).v = zero_one_state[(uint(bit) << 8) + u8buf(state).v];
+    state = zero_one_state[(uint(bit) << 8) + state];
 }
 
-void put_rac(inout RangeCoder c, inout uint8_t state, bool bit)
+void put_rac(inout RangeCoder c, uint64_t state, bool bit)
 {
-    put_rac_direct(c, state, bit);
-    if (c.range < 0x100)
-        renorm_encoder_full(c);
-    state = zero_one_state[(uint(bit) << 8) + state];
+    put_rac_direct(c, u8buf(state).v, bit);
 }
 
 /* Equiprobable bit */
-- 
2.49.0.395.g12beb8f557c
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 17+ messages in thread

* [FFmpeg-devel] [PATCH 05/16] ffv1enc_vulkan: minor EC optimizations
  2025-05-14 19:02 [FFmpeg-devel] [PATCH 01/16] ffv1enc_vulkan: merge all encoder variants into one file Lynne
                   ` (2 preceding siblings ...)
  2025-05-14 19:02 ` [FFmpeg-devel] [PATCH 04/16] ffv1enc_vulkan: unify EC code between setup and encode Lynne
@ 2025-05-14 19:02 ` Lynne
  2025-05-14 19:02 ` [FFmpeg-devel] [PATCH 06/16] ffv1enc_vulkan: switch to 2-line cache, unify prediction code Lynne
                   ` (10 subsequent siblings)
  14 siblings, 0 replies; 17+ messages in thread
From: Lynne @ 2025-05-14 19:02 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: Lynne

---
 libavcodec/vulkan/rangecoder.comp | 19 ++++++-------------
 1 file changed, 6 insertions(+), 13 deletions(-)

diff --git a/libavcodec/vulkan/rangecoder.comp b/libavcodec/vulkan/rangecoder.comp
index badc65293f..9e2c5fbecf 100644
--- a/libavcodec/vulkan/rangecoder.comp
+++ b/libavcodec/vulkan/rangecoder.comp
@@ -109,14 +109,10 @@ void put_rac_direct(inout RangeCoder c, inout uint8_t state, bool bit)
 #endif
 
     int diff = c.range - range1;
-    if (bit) {
-        c.low   += diff;
-        c.range  = range1;
-    } else {
-        c.range  = diff;
-    }
+    c.low += bit ? diff : 0;
+    c.range = bit ? range1 : diff;
 
-    if (c.range < 0x100)
+    if (expectEXT(c.range < 0x100, false))
         renorm_encoder(c);
 
     state = zero_one_state[(uint(bit) << 8) + state];
@@ -139,12 +135,9 @@ void put_rac_equi(inout RangeCoder c, bool bit)
         debugPrintfEXT("Error: range1 <= 0");
 #endif
 
-    if (bit) {
-        c.low   += c.range - range1;
-        c.range  = range1;
-    } else {
-        c.range -= range1;
-    }
+    int diff = c.range - range1;
+    c.low += bit ? diff : 0;
+    c.range = bit ? range1 : diff;
 
     if (expectEXT(c.range < 0x100, false))
         renorm_encoder(c);
-- 
2.49.0.395.g12beb8f557c
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 17+ messages in thread

* [FFmpeg-devel] [PATCH 06/16] ffv1enc_vulkan: switch to 2-line cache, unify prediction code
  2025-05-14 19:02 [FFmpeg-devel] [PATCH 01/16] ffv1enc_vulkan: merge all encoder variants into one file Lynne
                   ` (3 preceding siblings ...)
  2025-05-14 19:02 ` [FFmpeg-devel] [PATCH 05/16] ffv1enc_vulkan: minor EC optimizations Lynne
@ 2025-05-14 19:02 ` Lynne
  2025-05-23 14:38   ` [FFmpeg-devel] [PATCH] ffv1enc_vulkan: fix array overflow Jerome Martinez
  2025-05-14 19:02 ` [FFmpeg-devel] [PATCH 07/16] ffv1_common: minor RGB optimization Lynne
                   ` (9 subsequent siblings)
  14 siblings, 1 reply; 17+ messages in thread
From: Lynne @ 2025-05-14 19:02 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: Lynne

---
 libavcodec/ffv1enc_vulkan.c        | 379 +++++++++--------------------
 libavcodec/vulkan/ffv1_common.comp |  87 +++++++
 libavcodec/vulkan/ffv1_dec.comp    |  91 +------
 libavcodec/vulkan/ffv1_enc.comp    | 155 ++++++------
 libavcodec/vulkan_ffv1.c           |   5 +-
 5 files changed, 288 insertions(+), 429 deletions(-)

diff --git a/libavcodec/ffv1enc_vulkan.c b/libavcodec/ffv1enc_vulkan.c
index 956463e932..bab9bb640b 100644
--- a/libavcodec/ffv1enc_vulkan.c
+++ b/libavcodec/ffv1enc_vulkan.c
@@ -37,6 +37,9 @@
 #define LG_ALIGN_W 32
 #define LG_ALIGN_H 32
 
+/* Unlike the decoder, we need 4 lines (but really only 3) */
+#define RGB_LINECACHE 4
+
 typedef struct VulkanEncodeFFv1FrameData {
     /* Output data */
     AVBufferRef *out_data_ref;
@@ -72,7 +75,6 @@ typedef struct VulkanEncodeFFv1Context {
 
     FFVulkanShader setup;
     FFVulkanShader reset;
-    FFVulkanShader rct;
     FFVulkanShader enc;
 
     /* Constant read-only buffers */
@@ -111,7 +113,6 @@ extern const char *ff_source_rangecoder_comp;
 extern const char *ff_source_ffv1_vlc_comp;
 extern const char *ff_source_ffv1_common_comp;
 extern const char *ff_source_ffv1_reset_comp;
-extern const char *ff_source_ffv1_enc_rct_comp;
 extern const char *ff_source_ffv1_enc_setup_comp;
 extern const char *ff_source_ffv1_enc_comp;
 
@@ -120,6 +121,7 @@ typedef struct FFv1VkParameters {
     VkDeviceAddress scratch_data;
     VkDeviceAddress out_data;
 
+    int32_t fmt_lut[4];
     int32_t sar[2];
     uint32_t chroma_shift[2];
 
@@ -127,7 +129,9 @@ typedef struct FFv1VkParameters {
     uint32_t context_count;
     uint32_t crcref;
     uint32_t slice_size_max;
+    int      rct_offset;
 
+    uint8_t extend_lookup[8];
     uint8_t bits_per_raw_sample;
     uint8_t context_model;
     uint8_t version;
@@ -137,13 +141,14 @@ typedef struct FFv1VkParameters {
     uint8_t components;
     uint8_t planes;
     uint8_t codec_planes;
+    uint8_t planar_rgb;
     uint8_t transparency;
     uint8_t colorspace;
     uint8_t pic_mode;
     uint8_t ec;
     uint8_t ppi;
     uint8_t chunks;
-    uint8_t padding[1];
+    uint8_t padding[4];
 } FFv1VkParameters;
 
 static void add_push_data(FFVulkanShader *shd)
@@ -153,6 +158,7 @@ static void add_push_data(FFVulkanShader *shd)
     GLSLC(1,    u8buf scratch_data;                                           );
     GLSLC(1,    u8buf out_data;                                               );
     GLSLC(0,                                                                  );
+    GLSLC(1,    ivec4 fmt_lut;                                                );
     GLSLC(1,    ivec2 sar;                                                    );
     GLSLC(1,    uvec2 chroma_shift;                                           );
     GLSLC(0,                                                                  );
@@ -160,7 +166,9 @@ static void add_push_data(FFVulkanShader *shd)
     GLSLC(1,    uint context_count;                                           );
     GLSLC(1,    uint32_t crcref;                                              );
     GLSLC(1,    uint32_t slice_size_max;                                      );
+    GLSLC(1,    int rct_offset;                                               );
     GLSLC(0,                                                                  );
+    GLSLC(1,    uint8_t extend_lookup[8];                                     );
     GLSLC(1,    uint8_t bits_per_raw_sample;                                  );
     GLSLC(1,    uint8_t context_model;                                        );
     GLSLC(1,    uint8_t version;                                              );
@@ -170,122 +178,19 @@ static void add_push_data(FFVulkanShader *shd)
     GLSLC(1,    uint8_t components;                                           );
     GLSLC(1,    uint8_t planes;                                               );
     GLSLC(1,    uint8_t codec_planes;                                         );
+    GLSLC(1,    uint8_t planar_rgb;                                           );
     GLSLC(1,    uint8_t transparency;                                         );
     GLSLC(1,    uint8_t colorspace;                                           );
     GLSLC(1,    uint8_t pic_mode;                                             );
     GLSLC(1,    uint8_t ec;                                                   );
     GLSLC(1,    uint8_t ppi;                                                  );
     GLSLC(1,    uint8_t chunks;                                               );
-    GLSLC(1,    uint8_t padding[1];                                           );
+    GLSLC(1,    uint8_t padding[4];                                           );
     GLSLC(0, };                                                               );
     ff_vk_shader_add_push_const(shd, 0, sizeof(FFv1VkParameters),
                                 VK_SHADER_STAGE_COMPUTE_BIT);
 }
 
-static int run_rct(AVCodecContext *avctx, FFVkExecContext *exec,
-                   AVFrame *enc_in, VkImageView *enc_in_views,
-                   AVFrame **intermediate_frame, VkImageView *intermediate_views,
-                   VkImageMemoryBarrier2 *img_bar, int *nb_img_bar,
-                   VkBufferMemoryBarrier2 *buf_bar, int *nb_buf_bar,
-                   FFVkBuffer *slice_data_buf, uint32_t slice_data_size)
-{
-    int err;
-    VulkanEncodeFFv1Context *fv = avctx->priv_data;
-    FFV1Context *f = &fv->ctx;
-    FFVulkanFunctions *vk = &fv->s.vkfn;
-    AVHWFramesContext *src_hwfc = (AVHWFramesContext *)enc_in->hw_frames_ctx->data;
-    FFv1VkRCTParameters pd;
-
-    /* Create a temporaty frame */
-    *intermediate_frame = av_frame_alloc();
-    if (!(*intermediate_frame))
-        return AVERROR(ENOMEM);
-
-    RET(av_hwframe_get_buffer(fv->intermediate_frames_ref,
-                              *intermediate_frame, 0));
-
-    RET(ff_vk_exec_add_dep_frame(&fv->s, exec, *intermediate_frame,
-                                 VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT,
-                                 VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT));
-    RET(ff_vk_create_imageviews(&fv->s, exec, intermediate_views,
-                                *intermediate_frame,
-                                fv->rep_fmt));
-
-    /* Update descriptors */
-    ff_vk_shader_update_desc_buffer(&fv->s, exec, &fv->rct,
-                                    1, 0, 0,
-                                    slice_data_buf,
-                                    0, slice_data_size*f->slice_count,
-                                    VK_FORMAT_UNDEFINED);
-    ff_vk_shader_update_img_array(&fv->s, exec, &fv->rct,
-                                  enc_in, enc_in_views,
-                                  1, 1,
-                                  VK_IMAGE_LAYOUT_GENERAL,
-                                  VK_NULL_HANDLE);
-    ff_vk_shader_update_img_array(&fv->s, exec, &fv->rct,
-                                  *intermediate_frame, intermediate_views,
-                                  1, 2,
-                                  VK_IMAGE_LAYOUT_GENERAL,
-                                  VK_NULL_HANDLE);
-
-    ff_vk_frame_barrier(&fv->s, exec, *intermediate_frame, img_bar, nb_img_bar,
-                        VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT,
-                        VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
-                        VK_ACCESS_SHADER_WRITE_BIT,
-                        VK_IMAGE_LAYOUT_GENERAL,
-                        VK_QUEUE_FAMILY_IGNORED);
-
-    /* Prep the input/output images */
-    vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) {
-            .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
-            .pImageMemoryBarriers = img_bar,
-            .imageMemoryBarrierCount = *nb_img_bar,
-            .pBufferMemoryBarriers = buf_bar,
-            .bufferMemoryBarrierCount = *nb_buf_bar,
-    });
-    *nb_img_bar = 0;
-    if (*nb_buf_bar) {
-        slice_data_buf->stage = buf_bar[0].dstStageMask;
-        slice_data_buf->access = buf_bar[0].dstAccessMask;
-        *nb_buf_bar = 0;
-    }
-
-    /* Run the shader */
-    ff_vk_exec_bind_shader(&fv->s, exec, &fv->rct);
-    pd = (FFv1VkRCTParameters) {
-        .offset = 1 << f->bits_per_raw_sample,
-        .bits = f->bits_per_raw_sample,
-        .planar_rgb = ff_vk_mt_is_np_rgb(src_hwfc->sw_format) &&
-                      (ff_vk_count_images((AVVkFrame *)enc_in->data[0]) > 1),
-        .transparency = f->transparency,
-    };
-
-    /* For some reason the C FFv1 encoder/decoder treats these differently */
-    if (src_hwfc->sw_format == AV_PIX_FMT_GBRP10 ||
-        src_hwfc->sw_format == AV_PIX_FMT_GBRP12 ||
-        src_hwfc->sw_format == AV_PIX_FMT_GBRP14)
-        memcpy(pd.fmt_lut, (int [4]) { 2, 1, 0, 3 }, 4*sizeof(int));
-    else
-        ff_vk_set_perm(src_hwfc->sw_format, pd.fmt_lut, 1);
-
-    ff_vk_shader_update_push_const(&fv->s, exec, &fv->rct,
-                                   VK_SHADER_STAGE_COMPUTE_BIT,
-                                   0, sizeof(pd), &pd);
-
-    vk->CmdDispatch(exec->buf, fv->ctx.num_h_slices, fv->ctx.num_v_slices, 1);
-
-    /* Add a post-dispatch barrier before encoding */
-    ff_vk_frame_barrier(&fv->s, exec, *intermediate_frame, img_bar, nb_img_bar,
-                        VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT,
-                        VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
-                        VK_ACCESS_SHADER_READ_BIT,
-                        VK_IMAGE_LAYOUT_GENERAL,
-                        VK_QUEUE_FAMILY_IGNORED);
-
-fail:
-    return err;
-}
-
 static int vulkan_encode_ffv1_submit_frame(AVCodecContext *avctx,
                                            FFVkExecContext *exec,
                                            const AVFrame *pict)
@@ -298,8 +203,6 @@ static int vulkan_encode_ffv1_submit_frame(AVCodecContext *avctx,
     VulkanEncodeFFv1FrameData *fd = exec->opaque;
     FFv1VkParameters pd;
 
-    AVFrame *intermediate_frame = NULL;
-
     /* Slice data */
     AVBufferRef *slice_data_ref;
     FFVkBuffer *slice_data_buf;
@@ -318,11 +221,11 @@ static int vulkan_encode_ffv1_submit_frame(AVCodecContext *avctx,
     uint32_t context_count = f->context_count[f->context_model];
     const AVPixFmtDescriptor *fmt_desc = av_pix_fmt_desc_get(avctx->sw_pix_fmt);
 
-    VkImageView in_views[AV_NUM_DATA_POINTERS];
-    VkImageView intermediate_views[AV_NUM_DATA_POINTERS];
+    AVFrame *src = (AVFrame *)pict;
+    VkImageView src_views[AV_NUM_DATA_POINTERS];
 
-    AVFrame *enc_in = (AVFrame *)pict;
-    VkImageView *enc_in_views = in_views;
+    AVFrame *tmp = NULL;
+    VkImageView tmp_views[AV_NUM_DATA_POINTERS];
 
     VkImageMemoryBarrier2 img_bar[37];
     int nb_img_bar = 0;
@@ -402,27 +305,48 @@ static int vulkan_encode_ffv1_submit_frame(AVCodecContext *avctx,
     ff_vk_exec_add_dep_buf(&fv->s, exec, &fd->out_data_ref, 1, 1);
 
     /* Prepare input frame */
-    RET(ff_vk_exec_add_dep_frame(&fv->s, exec, enc_in,
+    RET(ff_vk_exec_add_dep_frame(&fv->s, exec, src,
                                  VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT,
                                  VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT));
 
-    RET(ff_vk_create_imageviews(&fv->s, exec, enc_in_views, enc_in,
+    RET(ff_vk_create_imageviews(&fv->s, exec, src_views, src,
                                 fv->rep_fmt));
-    ff_vk_frame_barrier(&fv->s, exec, enc_in, img_bar, &nb_img_bar,
+    ff_vk_frame_barrier(&fv->s, exec, src, img_bar, &nb_img_bar,
                         VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT,
                         VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
                         VK_ACCESS_SHADER_READ_BIT,
                         VK_IMAGE_LAYOUT_GENERAL,
                         VK_QUEUE_FAMILY_IGNORED);
 
-    /* Setup shader needs the original input */
+    if (fv->is_rgb) {
+        /* Create a temporary frame */
+        tmp = av_frame_alloc();
+        if (!tmp) {
+            /* Dependencies were already added to exec above, so leave via
+             * fail to discard them instead of returning directly. */
+            err = AVERROR(ENOMEM);
+            goto fail;
+        }
+
+        RET(av_hwframe_get_buffer(fv->intermediate_frames_ref,
+                                  tmp, 0));
+
+        RET(ff_vk_exec_add_dep_frame(&fv->s, exec, tmp,
+                                     VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT,
+                                     VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT));
+        RET(ff_vk_create_imageviews(&fv->s, exec, tmp_views,
+                                    tmp,
+                                    fv->rep_fmt));
+    }
+
+    /* Setup shader */
     ff_vk_shader_update_desc_buffer(&fv->s, exec, &fv->setup,
                                     1, 0, 0,
                                     slice_data_buf,
                                     0, slice_data_size*f->slice_count,
                                     VK_FORMAT_UNDEFINED);
     ff_vk_shader_update_img_array(&fv->s, exec, &fv->setup,
-                                  enc_in, enc_in_views,
+                                  src, src_views,
                                   1, 1,
                                   VK_IMAGE_LAYOUT_GENERAL,
                                   VK_NULL_HANDLE);
@@ -471,6 +391,7 @@ static int vulkan_encode_ffv1_submit_frame(AVCodecContext *avctx,
         .plane_state_size = plane_state_size,
         .context_count = context_count,
         .crcref = f->crcref,
+        .rct_offset = 1 << f->bits_per_raw_sample,
         .slice_size_max = out_data_buf->size / f->slice_count,
         .context_model = fv->ctx.context_model,
         .version = f->version,
@@ -480,6 +401,8 @@ static int vulkan_encode_ffv1_submit_frame(AVCodecContext *avctx,
         .components = fmt_desc->nb_components,
         .planes = av_pix_fmt_count_planes(avctx->sw_pix_fmt),
         .codec_planes = f->plane_count,
+        .planar_rgb = ff_vk_mt_is_np_rgb(avctx->sw_pix_fmt) &&
+                      (ff_vk_count_images((AVVkFrame *)src->data[0]) > 1),
         .transparency = f->transparency,
         .colorspace = f->colorspace,
         .pic_mode = !(pict->flags & AV_FRAME_FLAG_INTERLACED) ? 3 :
@@ -488,11 +411,35 @@ static int vulkan_encode_ffv1_submit_frame(AVCodecContext *avctx,
         .ppi = fv->ppi,
         .chunks = fv->chunks,
     };
+
+    /* For some reason the C FFv1 encoder/decoder treats these differently */
+    if (avctx->sw_pix_fmt == AV_PIX_FMT_GBRP10 ||
+        avctx->sw_pix_fmt == AV_PIX_FMT_GBRP12 ||
+        avctx->sw_pix_fmt == AV_PIX_FMT_GBRP14)
+        memcpy(pd.fmt_lut, (int [4]) { 2, 1, 0, 3 }, 4*sizeof(int));
+    else
+        ff_vk_set_perm(avctx->sw_pix_fmt, pd.fmt_lut, 1);
+
+    for (int i = 0; i < f->quant_table_count; i++)
+        pd.extend_lookup[i] = (f->quant_tables[i][3][127] != 0) ||
+                              (f->quant_tables[i][4][127] != 0);
     ff_vk_shader_update_push_const(&fv->s, exec, &fv->setup,
                                    VK_SHADER_STAGE_COMPUTE_BIT,
                                    0, sizeof(pd), &pd);
     vk->CmdDispatch(exec->buf, fv->ctx.num_h_slices, fv->ctx.num_v_slices, 1);
 
+    /* Clean up temporary image */
+    if (fv->is_rgb) {
+        AVVkFrame *vkf = (AVVkFrame *)tmp->data[0];
+        vk->CmdClearColorImage(exec->buf, vkf->img[0], VK_IMAGE_LAYOUT_GENERAL,
+                               &((VkClearColorValue) { 0 }),
+                               1, &((VkImageSubresourceRange) {
+                                   .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT,
+                                   .levelCount = 1,
+                                   .layerCount = 1,
+                               }));
+    }
+
     /* Setup shader modified the slice data buffer */
     buf_bar[nb_buf_bar++] = (VkBufferMemoryBarrier2) {
         .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2,
@@ -546,19 +493,6 @@ static int vulkan_encode_ffv1_submit_frame(AVCodecContext *avctx,
                         f->plane_count);
     }
 
-    /* Run RCT shader */
-    if (fv->is_rgb) {
-        RET(run_rct(avctx, exec,
-                    enc_in, enc_in_views,
-                    &intermediate_frame, intermediate_views,
-                    img_bar, &nb_img_bar, buf_bar, &nb_buf_bar,
-                    slice_data_buf, slice_data_size));
-
-        /* Use the new frame */
-        enc_in = intermediate_frame;
-        enc_in_views = intermediate_views;
-    }
-
     /* If the reset shader ran, insert a barrier now. */
     if (f->key_frame || f->version > 3) {
         /* Reset shader modified the slice data buffer */
@@ -577,6 +511,15 @@ static int vulkan_encode_ffv1_submit_frame(AVCodecContext *avctx,
         };
     }
 
+    if (fv->is_rgb) {
+        ff_vk_frame_barrier(&fv->s, exec, tmp, img_bar, &nb_img_bar,
+                            VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT,
+                            VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
+                            VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT,
+                            VK_IMAGE_LAYOUT_GENERAL,
+                            VK_QUEUE_FAMILY_IGNORED);
+    }
+
     /* Final barrier before encoding */
     vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) {
         .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
@@ -599,7 +542,7 @@ static int vulkan_encode_ffv1_submit_frame(AVCodecContext *avctx,
                                     0, slice_data_size*f->slice_count,
                                     VK_FORMAT_UNDEFINED);
     ff_vk_shader_update_img_array(&fv->s, exec, &fv->enc,
-                                  enc_in, enc_in_views,
+                                  src, src_views,
                                   1, 1,
                                   VK_IMAGE_LAYOUT_GENERAL,
                                   VK_NULL_HANDLE);
@@ -608,6 +551,12 @@ static int vulkan_encode_ffv1_submit_frame(AVCodecContext *avctx,
                                     results_data_buf,
                                     0, results_data_buf->size,
                                     VK_FORMAT_UNDEFINED);
+    if (fv->is_rgb)
+        ff_vk_shader_update_img_array(&fv->s, exec, &fv->enc,
+                                      tmp, tmp_views,
+                                      1, 3,
+                                      VK_IMAGE_LAYOUT_GENERAL,
+                                      VK_NULL_HANDLE);
 
     ff_vk_exec_bind_shader(&fv->s, exec, &fv->enc);
     ff_vk_shader_update_push_const(&fv->s, exec, &fv->enc,
@@ -624,11 +573,11 @@ static int vulkan_encode_ffv1_submit_frame(AVCodecContext *avctx,
 
     /* This, if needed, was referenced by the execution context
      * as it was declared as a dependency. */
-    av_frame_free(&intermediate_frame);
+    av_frame_free(&tmp);
     return 0;
 
 fail:
-    av_frame_free(&intermediate_frame);
+    av_frame_free(&tmp);
     ff_vk_exec_discard_deps(&fv->s, exec);
 
     return err;
@@ -846,6 +795,7 @@ static int init_indirect(AVCodecContext *avctx, enum AVPixelFormat sw_format)
 {
     int err;
     VulkanEncodeFFv1Context *fv = avctx->priv_data;
+    FFV1Context *f = &fv->ctx;
     AVHWFramesContext *frames_ctx;
     AVVulkanFramesContext *vk_frames;
 
@@ -856,12 +806,13 @@ static int init_indirect(AVCodecContext *avctx, enum AVPixelFormat sw_format)
     frames_ctx = (AVHWFramesContext *)fv->intermediate_frames_ref->data;
     frames_ctx->format    = AV_PIX_FMT_VULKAN;
     frames_ctx->sw_format = sw_format;
-    frames_ctx->width     = FFALIGN(fv->s.frames->width, 32);
-    frames_ctx->height    = FFALIGN(fv->s.frames->height, 32);
+    frames_ctx->width     = fv->s.frames->width;
+    frames_ctx->height    = f->num_v_slices*RGB_LINECACHE;
 
     vk_frames = frames_ctx->hwctx;
     vk_frames->tiling    = VK_IMAGE_TILING_OPTIMAL;
-    vk_frames->usage     = VK_IMAGE_USAGE_STORAGE_BIT;
+    vk_frames->usage     = VK_IMAGE_USAGE_STORAGE_BIT |
+                           VK_IMAGE_USAGE_TRANSFER_DST_BIT;
     vk_frames->img_flags = VK_IMAGE_CREATE_MUTABLE_FORMAT_BIT;
 
     err = av_hwframe_ctx_init(fv->intermediate_frames_ref);
@@ -929,6 +880,7 @@ static void define_shared_code(AVCodecContext *avctx, FFVulkanShader *shd)
     FFV1Context *f = &fv->ctx;
     int smp_bits = fv->ctx.use32bit ? 32 : 16;
 
+    av_bprintf(&shd->src, "#define RGB_LINECACHE %i\n"                   ,RGB_LINECACHE);
     av_bprintf(&shd->src, "#define CONTEXT_SIZE %i\n"                    ,CONTEXT_SIZE);
     av_bprintf(&shd->src, "#define MAX_QUANT_TABLE_MASK 0x%x\n"          ,MAX_QUANT_TABLE_MASK);
 
@@ -1120,122 +1072,6 @@ fail:
     return err;
 }
 
-static int init_rct_shader(AVCodecContext *avctx, FFVkSPIRVCompiler *spv)
-{
-    int err;
-    VulkanEncodeFFv1Context *fv = avctx->priv_data;
-    FFVulkanShader *shd = &fv->rct;
-    FFVulkanDescriptorSetBinding *desc_set;
-
-    uint8_t *spv_data;
-    size_t spv_len;
-    void *spv_opaque = NULL;
-    int wg_count = sqrt(fv->s.props.properties.limits.maxComputeWorkGroupInvocations);
-
-    enum AVPixelFormat intermediate_fmt = get_supported_rgb_buffer_fmt(avctx);
-    if (intermediate_fmt == AV_PIX_FMT_NONE) {
-        av_log(avctx, AV_LOG_ERROR, "Unable to find a supported compatible "
-                                    "pixel format for RCT buffer!\n");
-        return AVERROR(ENOTSUP);
-    }
-
-    RET(init_indirect(avctx, intermediate_fmt));
-
-    RET(ff_vk_shader_init(&fv->s, shd, "ffv1_rct",
-                          VK_SHADER_STAGE_COMPUTE_BIT,
-                          (const char *[]) { "GL_EXT_buffer_reference",
-                                             "GL_EXT_buffer_reference2" }, 2,
-                          wg_count, wg_count, 1,
-                          0));
-
-    /* Common codec header */
-    GLSLD(ff_source_common_comp);
-
-    GLSLC(0, layout(push_constant, scalar) uniform pushConstants {             );
-    GLSLC(1,    ivec4 fmt_lut;                                                 );
-    GLSLC(1,    int offset;                                                    );
-    GLSLC(1,    uint8_t bits;                                                  );
-    GLSLC(1,    uint8_t planar_rgb;                                            );
-    GLSLC(1,    uint8_t color_planes;                                          );
-    GLSLC(1,    uint8_t transparency;                                          );
-    GLSLC(1,    uint8_t version;                                               );
-    GLSLC(1,    uint8_t micro_version;                                         );
-    GLSLC(1,    uint8_t padding[2];                                            );
-    GLSLC(0, };                                                                );
-    ff_vk_shader_add_push_const(shd, 0, sizeof(FFv1VkRCTParameters),
-                                VK_SHADER_STAGE_COMPUTE_BIT);
-
-    av_bprintf(&shd->src, "#define MAX_QUANT_TABLES %i\n", MAX_QUANT_TABLES);
-    av_bprintf(&shd->src, "#define MAX_CONTEXT_INPUTS %i\n", MAX_CONTEXT_INPUTS);
-    av_bprintf(&shd->src, "#define MAX_QUANT_TABLE_SIZE %i\n", MAX_QUANT_TABLE_SIZE);
-
-    desc_set = (FFVulkanDescriptorSetBinding []) {
-        {
-            .name        = "rangecoder_static_buf",
-            .type        = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER,
-            .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
-            .mem_layout  = "scalar",
-            .buf_content = "uint8_t zero_one_state[512];",
-        },
-        {
-            .name        = "quant_buf",
-            .type        = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER,
-            .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
-            .mem_layout  = "scalar",
-            .buf_content = "int16_t quant_table[MAX_QUANT_TABLES]"
-                           "[MAX_CONTEXT_INPUTS][MAX_QUANT_TABLE_SIZE];",
-        },
-    };
-    RET(ff_vk_shader_add_descriptor_set(&fv->s, shd, desc_set, 2, 1, 0));
-
-    define_shared_code(avctx, shd);
-
-    desc_set = (FFVulkanDescriptorSetBinding []) {
-        {
-            .name        = "slice_data_buf",
-            .type        = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
-            .mem_quali   = "readonly",
-            .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
-            .buf_content = "SliceContext slice_ctx[1024];",
-        },
-        {
-            .name       = "src",
-            .type       = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
-            .dimensions = 2,
-            .mem_layout = ff_vk_shader_rep_fmt(fv->s.frames->sw_format,
-                                               fv->rep_fmt),
-            .elems      = av_pix_fmt_count_planes(fv->s.frames->sw_format),
-            .mem_quali  = "readonly",
-            .stages     = VK_SHADER_STAGE_COMPUTE_BIT,
-        },
-        {
-            .name       = "dst",
-            .type       = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
-            .dimensions = 2,
-            .mem_layout = ff_vk_shader_rep_fmt(intermediate_fmt,
-                                               fv->rep_fmt),
-            .elems      = av_pix_fmt_count_planes(intermediate_fmt),
-            .mem_quali  = "writeonly",
-            .stages     = VK_SHADER_STAGE_COMPUTE_BIT,
-        },
-    };
-    RET(ff_vk_shader_add_descriptor_set(&fv->s, shd, desc_set, 3, 0, 0));
-
-    GLSLD(ff_source_ffv1_enc_rct_comp);
-
-    RET(spv->compile_shader(&fv->s, spv, shd, &spv_data, &spv_len, "main",
-                            &spv_opaque));
-    RET(ff_vk_shader_link(&fv->s, shd, spv_data, spv_len, "main"));
-
-    RET(ff_vk_shader_register_exec(&fv->s, &fv->exec_pool, shd));
-
-fail:
-    if (spv_opaque)
-        spv->free_shader(spv, &spv_opaque);
-
-    return err;
-}
-
 static int init_encode_shader(AVCodecContext *avctx, FFVkSPIRVCompiler *spv)
 {
     int err;
@@ -1243,10 +1079,6 @@ static int init_encode_shader(AVCodecContext *avctx, FFVkSPIRVCompiler *spv)
     FFVulkanShader *shd = &fv->enc;
     FFVulkanDescriptorSetBinding *desc_set;
 
-    AVHWFramesContext *frames_ctx = fv->intermediate_frames_ref ?
-                                    (AVHWFramesContext *)fv->intermediate_frames_ref->data :
-                                    fv->s.frames;
-
     uint8_t *spv_data;
     size_t spv_len;
     void *spv_opaque = NULL;
@@ -1307,9 +1139,9 @@ static int init_encode_shader(AVCodecContext *avctx, FFVkSPIRVCompiler *spv)
             .name       = "src",
             .type       = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
             .dimensions = 2,
-            .mem_layout = ff_vk_shader_rep_fmt(frames_ctx->sw_format,
+            .mem_layout = ff_vk_shader_rep_fmt(fv->s.frames->sw_format,
                                                fv->rep_fmt),
-            .elems      = av_pix_fmt_count_planes(frames_ctx->sw_format),
+            .elems      = av_pix_fmt_count_planes(fv->s.frames->sw_format),
             .mem_quali  = "readonly",
             .stages     = VK_SHADER_STAGE_COMPUTE_BIT,
         },
@@ -1321,7 +1153,20 @@ static int init_encode_shader(AVCodecContext *avctx, FFVkSPIRVCompiler *spv)
             .buf_content = "uint64_t slice_results[2048];",
         },
+        { 0 }, /* Placeholder: overwritten with the "tmp" image binding for RGB */
     };
-    RET(ff_vk_shader_add_descriptor_set(&fv->s, shd, desc_set, 3, 0, 0));
+    if (fv->is_rgb) {
+        AVHWFramesContext *intermediate_frames_ctx;
+        intermediate_frames_ctx = (AVHWFramesContext *)fv->intermediate_frames_ref->data;
+        desc_set[3] = (FFVulkanDescriptorSetBinding) {
+            .name       = "tmp",
+            .type       = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
+            .dimensions = 2,
+            .mem_layout = ff_vk_shader_rep_fmt(intermediate_frames_ctx->sw_format,
+                                               FF_VK_REP_NATIVE),
+            .stages     = VK_SHADER_STAGE_COMPUTE_BIT,
+        };
+    }
+    RET(ff_vk_shader_add_descriptor_set(&fv->s, shd, desc_set, 3 + fv->is_rgb, 0, 0));
 
     GLSLD(ff_source_ffv1_enc_comp);
 
@@ -1566,13 +1410,21 @@ static av_cold int vulkan_encode_ffv1_init(AVCodecContext *avctx)
         return err;
     }
 
-    /* Init RCT shader */
     if (fv->is_rgb) {
-        err = init_rct_shader(avctx, spv);
-        if (err < 0) {
-            spv->uninit(&spv);
-            return err;
+        enum AVPixelFormat intermediate_fmt = get_supported_rgb_buffer_fmt(avctx);
+        if (intermediate_fmt == AV_PIX_FMT_NONE) {
+            av_log(avctx, AV_LOG_ERROR, "Unable to find a supported compatible "
+                                        "pixel format for RCT buffer!\n");
+            /* spv is still live here; release it before bailing out */
+            spv->uninit(&spv);
+            return AVERROR(ENOTSUP);
         }
+
+        err = init_indirect(avctx, intermediate_fmt);
+        if (err < 0) {
+            spv->uninit(&spv);
+            return err;
+        }
     }
 
     /* Encode shader */
@@ -1659,7 +1505,6 @@ static av_cold int vulkan_encode_ffv1_close(AVCodecContext *avctx)
     ff_vk_exec_pool_free(&fv->s, &fv->transfer_exec_pool);
 
     ff_vk_shader_free(&fv->s, &fv->enc);
-    ff_vk_shader_free(&fv->s, &fv->rct);
     ff_vk_shader_free(&fv->s, &fv->reset);
     ff_vk_shader_free(&fv->s, &fv->setup);
 
diff --git a/libavcodec/vulkan/ffv1_common.comp b/libavcodec/vulkan/ffv1_common.comp
index 64c1c2ce80..1f222bdc42 100644
--- a/libavcodec/vulkan/ffv1_common.comp
+++ b/libavcodec/vulkan/ffv1_common.comp
@@ -92,3 +92,90 @@ uint slice_coord(uint width, uint sx, uint num_h_slices, uint chroma_shift)
 
     return sx;
 }
+
+#ifdef RGB
+#define RGB_LBUF (RGB_LINECACHE - 1)
+#define LADDR(p) (ivec2((p).x, ((p).y & RGB_LBUF)))
+
+ivec2 get_pred(readonly uimage2D pred, ivec2 sp, ivec2 off,
+               int comp, int sw, uint8_t quant_table_idx, bool extend_lookup)
+{
+    const ivec2 yoff_border1 = expectEXT(off.x == 0, false) ? ivec2(1, -1) : ivec2(0, 0);
+
+    /* Thanks to the same coincidence as below, we can skip checking if off == 0, 1 */
+    VTYPE3 top  = VTYPE3(TYPE(imageLoad(pred, sp + LADDR(off + ivec2(-1, -1) + yoff_border1))[comp]),
+                         TYPE(imageLoad(pred, sp + LADDR(off + ivec2(0, -1)))[comp]),
+                         TYPE(imageLoad(pred, sp + LADDR(off + ivec2(min(1, sw - off.x - 1), -1)))[comp]));
+
+    /* Normally, we'd need to check if off != ivec2(0, 0) here, since otherwise, we must
+     * return zero. However, ivec2(-1,  0) + ivec2(1, -1) == ivec2(0, -1), i.e. previous
+     * row, 0 offset, same slice, which is zero since we zero out the buffer for RGB */
+    TYPE cur = TYPE(imageLoad(pred, sp + LADDR(off + ivec2(-1,  0) + yoff_border1))[comp]);
+
+    int base = quant_table[quant_table_idx][0][(cur    - top[0]) & MAX_QUANT_TABLE_MASK] +
+               quant_table[quant_table_idx][1][(top[0] - top[1]) & MAX_QUANT_TABLE_MASK] +
+               quant_table[quant_table_idx][2][(top[1] - top[2]) & MAX_QUANT_TABLE_MASK];
+
+    if (expectEXT(extend_lookup, false)) {
+        TYPE cur2 = TYPE(0);
+        if (expectEXT(off.x > 0, true)) {
+            const ivec2 yoff_border2 = expectEXT(off.x == 1, false) ? ivec2(-1, -1) : ivec2(-2, 0);
+            cur2 = TYPE(imageLoad(pred, sp + LADDR(off + yoff_border2))[comp]);
+        }
+        base += quant_table[quant_table_idx][3][(cur2 - cur) & MAX_QUANT_TABLE_MASK];
+
+        /* top-2 became current upon swap */
+        TYPE top2 = TYPE(imageLoad(pred, sp + LADDR(off))[comp]);
+        base += quant_table[quant_table_idx][4][(top2 - top[1]) & MAX_QUANT_TABLE_MASK];
+    }
+
+    /* context, prediction */
+    return ivec2(base, predict(cur, VTYPE2(top)));
+}
+
+#else /* RGB */
+
+#define LADDR(p) (p)
+
+ivec2 get_pred(readonly uimage2D pred, ivec2 sp, ivec2 off,
+               int comp, int sw, uint8_t quant_table_idx, bool extend_lookup)
+{
+    const ivec2 yoff_border1 = off.x == 0 ? ivec2(1, -1) : ivec2(0, 0);
+    sp += off;
+
+    VTYPE3 top  = VTYPE3(TYPE(0),
+                         TYPE(0),
+                         TYPE(0));
+    if (off.y > 0 && off != ivec2(0, 1))
+        top[0] = TYPE(imageLoad(pred, sp + ivec2(-1, -1) + yoff_border1)[comp]);
+    if (off.y > 0) {
+        top[1] = TYPE(imageLoad(pred, sp + ivec2(0, -1))[comp]);
+        top[2] = TYPE(imageLoad(pred, sp + ivec2(min(1, sw - off.x - 1), -1))[comp]);
+    }
+
+    TYPE cur = TYPE(0);
+    if (off != ivec2(0, 0))
+        cur = TYPE(imageLoad(pred, sp + ivec2(-1,  0) + yoff_border1)[comp]);
+
+    int base = quant_table[quant_table_idx][0][(cur - top[0]) & MAX_QUANT_TABLE_MASK] +
+               quant_table[quant_table_idx][1][(top[0] - top[1]) & MAX_QUANT_TABLE_MASK] +
+               quant_table[quant_table_idx][2][(top[1] - top[2]) & MAX_QUANT_TABLE_MASK];
+
+    if (expectEXT(extend_lookup, false)) {
+        TYPE cur2 = TYPE(0);
+        if (off.x > 0 && off != ivec2(1, 0)) {
+            const ivec2 yoff_border2 = off.x == 1 ? ivec2(1, -1) : ivec2(0, 0);
+            cur2 = TYPE(imageLoad(pred, sp + ivec2(-2,  0) + yoff_border2)[comp]);
+        }
+        base += quant_table[quant_table_idx][3][(cur2 - cur) & MAX_QUANT_TABLE_MASK];
+
+        TYPE top2 = TYPE(0);
+        if (off.y > 1)
+            top2 = TYPE(imageLoad(pred, sp + ivec2(0, -2))[comp]);
+        base += quant_table[quant_table_idx][4][(top2 - top[1]) & MAX_QUANT_TABLE_MASK];
+    }
+
+    /* context, prediction */
+    return ivec2(base, predict(cur, VTYPE2(top)));
+}
+#endif
diff --git a/libavcodec/vulkan/ffv1_dec.comp b/libavcodec/vulkan/ffv1_dec.comp
index 1c313b3168..c74af4bf6a 100644
--- a/libavcodec/vulkan/ffv1_dec.comp
+++ b/libavcodec/vulkan/ffv1_dec.comp
@@ -20,93 +20,6 @@
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
-#ifndef RGB
-#define LADDR(p) (p)
-#else
-#define RGB_LINECACHE 2
-#define RGB_LBUF (RGB_LINECACHE - 1)
-#define LADDR(p) (ivec2((p).x, ((p).y & RGB_LBUF)))
-#endif
-
-#ifdef RGB
-ivec2 get_pred(readonly uimage2D pred, ivec2 sp, ivec2 off, int comp, int sw, uint8_t quant_table_idx)
-{
-    const ivec2 yoff_border1 = expectEXT(off.x == 0, false) ? ivec2(1, -1) : ivec2(0, 0);
-
-    /* Thanks to the same coincidence as below, we can skip checking if off == 0, 1 */
-    VTYPE3 top  = VTYPE3(TYPE(imageLoad(pred, sp + LADDR(off + ivec2(-1, -1) + yoff_border1))[comp]),
-                         TYPE(imageLoad(pred, sp + LADDR(off + ivec2(0, -1)))[comp]),
-                         TYPE(imageLoad(pred, sp + LADDR(off + ivec2(min(1, sw - off.x - 1), -1)))[comp]));
-
-    /* Normally, we'd need to check if off != ivec2(0, 0) here, since otherwise, we must
-     * return zero. However, ivec2(-1,  0) + ivec2(1, -1) == ivec2(0, -1), e.g. previous
-     * row, 0 offset, same slice, which is zero since we zero out the buffer for RGB */
-    TYPE cur = TYPE(imageLoad(pred, sp + LADDR(off + ivec2(-1,  0) + yoff_border1))[comp]);
-
-    int base = quant_table[quant_table_idx][0][(cur    - top[0]) & MAX_QUANT_TABLE_MASK] +
-               quant_table[quant_table_idx][1][(top[0] - top[1]) & MAX_QUANT_TABLE_MASK] +
-               quant_table[quant_table_idx][2][(top[1] - top[2]) & MAX_QUANT_TABLE_MASK];
-
-    if (expectEXT(extend_lookup[quant_table_idx] > 0, false)) {
-        TYPE cur2 = TYPE(0);
-        if (expectEXT(off.x > 0, true)) {
-            const ivec2 yoff_border2 = expectEXT(off.x == 1, false) ? ivec2(-1, -1) : ivec2(-2, 0);
-            cur2 = TYPE(imageLoad(pred, sp + LADDR(off + yoff_border2))[comp]);
-        }
-        base += quant_table[quant_table_idx][3][(cur2 - cur) & MAX_QUANT_TABLE_MASK];
-
-        /* top-2 became current upon swap */
-        TYPE top2 = TYPE(imageLoad(pred, sp + LADDR(off))[comp]);
-        base += quant_table[quant_table_idx][4][(top2 - top[1]) & MAX_QUANT_TABLE_MASK];
-    }
-
-    /* context, prediction */
-    return ivec2(base, predict(cur, VTYPE2(top)));
-}
-#else
-ivec2 get_pred(readonly uimage2D pred, ivec2 sp, ivec2 off, int comp, int sw, uint8_t quant_table_idx)
-{
-    const ivec2 yoff_border1 = off.x == 0 ? ivec2(1, -1) : ivec2(0, 0);
-    sp += off;
-
-    VTYPE3 top  = VTYPE3(TYPE(0),
-                         TYPE(0),
-                         TYPE(0));
-    if (off.y > 0 && off != ivec2(0, 1))
-        top[0] = TYPE(imageLoad(pred, sp + ivec2(-1, -1) + yoff_border1)[comp]);
-    if (off.y > 0) {
-        top[1] = TYPE(imageLoad(pred, sp + ivec2(0, -1))[comp]);
-        top[2] = TYPE(imageLoad(pred, sp + ivec2(min(1, sw - off.x - 1), -1))[comp]);
-    }
-
-    TYPE cur = TYPE(0);
-    if (off != ivec2(0, 0))
-        cur = TYPE(imageLoad(pred, sp + ivec2(-1,  0) + yoff_border1)[comp]);
-
-    int base = quant_table[quant_table_idx][0][(cur - top[0]) & MAX_QUANT_TABLE_MASK] +
-               quant_table[quant_table_idx][1][(top[0] - top[1]) & MAX_QUANT_TABLE_MASK] +
-               quant_table[quant_table_idx][2][(top[1] - top[2]) & MAX_QUANT_TABLE_MASK];
-
-    if ((quant_table[quant_table_idx][3][127] != 0) ||
-        (quant_table[quant_table_idx][4][127] != 0)) {
-        TYPE cur2 = TYPE(0);
-        if (off.x > 0 && off != ivec2(1, 0)) {
-            const ivec2 yoff_border2 = off.x == 1 ? ivec2(1, -1) : ivec2(0, 0);
-            cur2 = TYPE(imageLoad(pred, sp + ivec2(-2,  0) + yoff_border2)[comp]);
-        }
-        base += quant_table[quant_table_idx][3][(cur2 - cur) & MAX_QUANT_TABLE_MASK];
-
-        TYPE top2 = TYPE(0);
-        if (off.y > 1)
-            top2 = TYPE(imageLoad(pred, sp + ivec2(0, -2))[comp]);
-        base += quant_table[quant_table_idx][4][(top2 - top[1]) & MAX_QUANT_TABLE_MASK];
-    }
-
-    /* context, prediction */
-    return ivec2(base, predict(cur, VTYPE2(top)));
-}
-#endif
-
 #ifndef GOLOMB
 #ifdef CACHED_SYMBOL_READER
 shared uint8_t state[CONTEXT_SIZE];
@@ -172,7 +85,7 @@ void decode_line(inout SliceContext sc, ivec2 sp, int w,
 
     for (int x = 0; x < w; x++) {
         ivec2 pr = get_pred(dec[p], sp, ivec2(x, y), 0, w,
-                            quant_table_idx);
+                            quant_table_idx, extend_lookup[quant_table_idx] > 0);
 
         uint context_off = state_off + CONTEXT_SIZE*abs(pr[0]);
 #ifdef CACHED_SYMBOL_READER
@@ -217,7 +130,7 @@ void decode_line(inout SliceContext sc, ivec2 sp, int w,
         ivec2 pos = sp + ivec2(x, y);
         int diff;
         ivec2 pr = get_pred(dec[p], sp, ivec2(x, y), 0, w,
-                            quant_table_idx);
+                            quant_table_idx, extend_lookup[quant_table_idx] > 0);
 
         VlcState sb = VlcState(uint64_t(slice_state) + state_off + VLC_STATE_SIZE*abs(pr[0]));
 
diff --git a/libavcodec/vulkan/ffv1_enc.comp b/libavcodec/vulkan/ffv1_enc.comp
index a3c22f7459..db33c414e1 100644
--- a/libavcodec/vulkan/ffv1_enc.comp
+++ b/libavcodec/vulkan/ffv1_enc.comp
@@ -20,48 +20,6 @@
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
-ivec2 get_pred(readonly uimage2D pred, ivec2 sp, ivec2 off, int comp, int sw, uint8_t quant_table_idx)
-{
-    const ivec2 yoff_border1 = off.x == 0 ? ivec2(1, -1) : ivec2(0, 0);
-    sp += off;
-
-    VTYPE3 top  = VTYPE3(TYPE(0),
-                         TYPE(0),
-                         TYPE(0));
-    if (off.y > 0 && off != ivec2(0, 1))
-        top[0] = TYPE(imageLoad(pred, sp + ivec2(-1, -1) + yoff_border1)[comp]);
-    if (off.y > 0) {
-        top[1] = TYPE(imageLoad(pred, sp + ivec2(0, -1))[comp]);
-        top[2] = TYPE(imageLoad(pred, sp + ivec2(min(1, sw - off.x - 1), -1))[comp]);
-    }
-
-    TYPE cur = TYPE(0);
-    if (off != ivec2(0, 0))
-        cur = TYPE(imageLoad(pred, sp + ivec2(-1,  0) + yoff_border1)[comp]);
-
-    int base = quant_table[quant_table_idx][0][(cur - top[0]) & MAX_QUANT_TABLE_MASK] +
-               quant_table[quant_table_idx][1][(top[0] - top[1]) & MAX_QUANT_TABLE_MASK] +
-               quant_table[quant_table_idx][2][(top[1] - top[2]) & MAX_QUANT_TABLE_MASK];
-
-    if ((quant_table[quant_table_idx][3][127] != 0) ||
-        (quant_table[quant_table_idx][4][127] != 0)) {
-        TYPE cur2 = TYPE(0);
-        if (off.x > 0 && off != ivec2(1, 0)) {
-            const ivec2 yoff_border2 = off.x == 1 ? ivec2(1, -1) : ivec2(0, 0);
-            cur2 = TYPE(imageLoad(pred, sp + ivec2(-2,  0) + yoff_border2)[comp]);
-        }
-        base += quant_table[quant_table_idx][3][(cur2 - cur) & MAX_QUANT_TABLE_MASK];
-
-        TYPE top2 = TYPE(0);
-        if (off.y > 1)
-            top2 = TYPE(imageLoad(pred, sp + ivec2(0, -2))[comp]);
-        base += quant_table[quant_table_idx][4][(top2 - top[1]) & MAX_QUANT_TABLE_MASK];
-    }
-
-    /* context, prediction */
-    return ivec2(base, predict(cur, VTYPE2(top)));
-}
-
 #ifndef GOLOMB
 /* Note - only handles signed values */
 void put_symbol(inout RangeCoder c, uint64_t state, int v)
@@ -86,38 +44,42 @@ void put_symbol(inout RangeCoder c, uint64_t state, int v)
     put_rac(c, state - 11 + min(e, 10), v < 0);
 }
 
-void encode_line_pcm(inout SliceContext sc, int y, int p, int comp,
-                     int bits)
+void encode_line_pcm(inout SliceContext sc, readonly uimage2D img,
+                     ivec2 sp, int y, int p, int comp, int bits)
 {
-    ivec2 sp = sc.slice_pos;
     int w = sc.slice_dim.x;
+
+#ifndef RGB
     if (p > 0 && p < 3) {
         w >>= chroma_shift.x;
         sp >>= chroma_shift;
     }
+#endif
 
     for (int x = 0; x < w; x++) {
-        uint v = imageLoad(src[p], (sp + ivec2(x, y)))[comp];
+        uint v = imageLoad(img, sp + LADDR(ivec2(x, y)))[comp];
         for (int i = (bits - 1); i >= 0; i--)
             put_rac_equi(sc.c, bool(bitfieldExtract(v, i, 1)));
     }
 }
 
-void encode_line(inout SliceContext sc, uint64_t state,
-                 int y, int p, int comp, int bits,
+void encode_line(inout SliceContext sc, readonly uimage2D img, uint64_t state,
+                 ivec2 sp, int y, int p, int comp, int bits,
                  uint8_t quant_table_idx, const int run_index)
 {
-    ivec2 sp = sc.slice_pos;
-
     int w = sc.slice_dim.x;
+
+#ifndef RGB
     if (p > 0 && p < 3) {
         w >>= chroma_shift.x;
         sp >>= chroma_shift;
     }
+#endif
 
     for (int x = 0; x < w; x++) {
-        ivec2 d = get_pred(src[p], sp, ivec2(x, y), comp, w, quant_table_idx);
-        d[1] = int(imageLoad(src[p], sp + ivec2(x, y))[comp]) - d[1];
+        ivec2 d = get_pred(img, sp, ivec2(x, y), comp, w,
+                           quant_table_idx, extend_lookup[quant_table_idx] > 0);
+        d[1] = int(imageLoad(img, sp + LADDR(ivec2(x, y)))[comp]) - d[1];
 
         if (d[0] < 0)
             d = -d;
@@ -130,24 +92,26 @@ void encode_line(inout SliceContext sc, uint64_t state,
 
 #else /* GOLOMB */
 
-void encode_line(inout SliceContext sc, uint64_t state,
-                 int y, int p, int comp, int bits,
+void encode_line(inout SliceContext sc, readonly uimage2D img, uint64_t state,
+                 ivec2 sp, int y, int p, int comp, int bits,
                  uint8_t quant_table_idx, inout int run_index)
 {
-    ivec2 sp = sc.slice_pos;
-
     int w = sc.slice_dim.x;
+
+#ifndef RGB
     if (p > 0 && p < 3) {
         w >>= chroma_shift.x;
         sp >>= chroma_shift;
     }
+#endif
 
     int run_count = 0;
     bool run_mode = false;
 
     for (int x = 0; x < w; x++) {
-        ivec2 d = get_pred(src[p], sp, ivec2(x, y), comp, w, quant_table_idx);
-        d[1] = int(imageLoad(src[p], sp + ivec2(x, y))[comp]) - d[1];
+        ivec2 d = get_pred(img, sp, ivec2(x, y), comp, w,
+                           quant_table_idx, extend_lookup[quant_table_idx] > 0);
+        d[1] = int(imageLoad(img, sp + LADDR(ivec2(x, y)))[comp]) - d[1];
 
         if (d[0] < 0)
             d = -d;
@@ -198,14 +162,56 @@ void encode_line(inout SliceContext sc, uint64_t state,
 }
 #endif
 
+#ifdef RGB
+ivec4 load_components(ivec2 pos)
+{
+    ivec4 pix = ivec4(imageLoad(src[0], pos));
+    if (planar_rgb != 0) {
+        for (int i = 1; i < (3 + transparency); i++)
+            pix[i] = int(imageLoad(src[i], pos)[0]);
+    }
+
+    return ivec4(pix[fmt_lut[0]], pix[fmt_lut[1]],
+                 pix[fmt_lut[2]], pix[fmt_lut[3]]);
+}
+
+void transform_sample(inout ivec4 pix, ivec2 rct_coef)
+{
+    pix.b -= pix.g;
+    pix.r -= pix.g;
+    pix.g += (pix.r*rct_coef.x + pix.b*rct_coef.y) >> 2;
+    pix.b += rct_offset;
+    pix.r += rct_offset;
+}
+
+void preload_rgb(in SliceContext sc, ivec2 sp, int w, int y, bool apply_rct)
+{
+    for (uint x = gl_LocalInvocationID.x; x < w; x += gl_WorkGroupSize.x) {
+        ivec2 lpos = sp + LADDR(ivec2(x, y));
+        ivec2 pos = sc.slice_pos + ivec2(x, y);
+
+        ivec4 pix = load_components(pos);
+
+        if (expectEXT(apply_rct, true))
+            transform_sample(pix, sc.slice_rct_coef);
+
+        imageStore(tmp, lpos, pix);
+    }
+}
+#endif
+
 void encode_slice(inout SliceContext sc, const uint slice_idx)
 {
+    ivec2 sp = sc.slice_pos;
+
 #ifndef RGB
     int bits = bits_per_raw_sample;
 #else
     int bits = 9;
     if (bits != 8 || sc.slice_coding_mode != 0)
         bits = bits_per_raw_sample + int(sc.slice_coding_mode != 1);
+
+    sp.y = int(gl_WorkGroupID.y)*RGB_LINECACHE;
 #endif
 
 #ifndef GOLOMB
@@ -222,15 +228,17 @@ void encode_slice(inout SliceContext sc, const uint slice_idx)
             int comp = c - p;
 
             for (int y = 0; y < h; y++)
-                encode_line_pcm(sc, y, p, comp, bits);
+                encode_line_pcm(sc, src[p], sp, y, p, comp, bits);
         }
 #else
         for (int y = 0; y < sc.slice_dim.y; y++) {
-            encode_line_pcm(sc, y, 0, 1, bits);
-            encode_line_pcm(sc, y, 0, 2, bits);
-            encode_line_pcm(sc, y, 0, 0, bits);
+            preload_rgb(sc, sp, sc.slice_dim.x, y, false);
+
+            encode_line_pcm(sc, tmp, sp, y, 0, 1, bits);
+            encode_line_pcm(sc, tmp, sp, y, 0, 2, bits);
+            encode_line_pcm(sc, tmp, sp, y, 0, 0, bits);
             if (transparency == 1)
-                encode_line_pcm(sc, y, 0, 3, bits);
+                encode_line_pcm(sc, tmp, sp, y, 0, 3, bits);
         }
 #endif
     } else
@@ -252,7 +260,8 @@ void encode_slice(inout SliceContext sc, const uint slice_idx)
             int comp = c - p;
 
             for (int y = 0; y < h; y++)
-                encode_line(sc, slice_state_off, y, p, comp, bits, quant_table_idx[c], run_index);
+                encode_line(sc, src[p], slice_state_off, sp, y, p,
+                            comp, bits, quant_table_idx[c], run_index);
 
             /* For the second chroma plane, reuse the first plane's state */
             if (c != 1)
@@ -261,15 +270,17 @@ void encode_slice(inout SliceContext sc, const uint slice_idx)
 #else
         int run_index = 0;
         for (int y = 0; y < sc.slice_dim.y; y++) {
-            encode_line(sc, slice_state_off + plane_state_size*0,
-                        y, 0, 1, bits, quant_table_idx[0], run_index);
-            encode_line(sc, slice_state_off + plane_state_size*1,
-                        y, 0, 2, bits, quant_table_idx[1], run_index);
-            encode_line(sc, slice_state_off + plane_state_size*1,
-                        y, 0, 0, bits, quant_table_idx[2], run_index);
+            preload_rgb(sc, sp, sc.slice_dim.x, y, true);
+
+            encode_line(sc, tmp, slice_state_off + plane_state_size*0,
+                        sp, y, 0, 1, bits, quant_table_idx[0], run_index);
+            encode_line(sc, tmp, slice_state_off + plane_state_size*1,
+                        sp, y, 0, 2, bits, quant_table_idx[1], run_index);
+            encode_line(sc, tmp, slice_state_off + plane_state_size*1,
+                        sp, y, 0, 0, bits, quant_table_idx[2], run_index);
             if (transparency == 1)
-                encode_line(sc, slice_state_off + plane_state_size*2,
-                            y, 0, 3, bits, quant_table_idx[3], run_index);
+                encode_line(sc, tmp, slice_state_off + plane_state_size*2,
+                            sp, y, 0, 3, bits, quant_table_idx[3], run_index);
         }
 #endif
     }
diff --git a/libavcodec/vulkan_ffv1.c b/libavcodec/vulkan_ffv1.c
index cbde2f319a..efbf5fa953 100644
--- a/libavcodec/vulkan_ffv1.c
+++ b/libavcodec/vulkan_ffv1.c
@@ -26,6 +26,8 @@
 #include "libavutil/vulkan_spirv.h"
 #include "libavutil/mem.h"
 
+#define RGB_LINECACHE 2
+
 extern const char *ff_source_common_comp;
 extern const char *ff_source_rangecoder_comp;
 extern const char *ff_source_ffv1_vlc_comp;
@@ -610,6 +612,7 @@ static void define_shared_code(FFVulkanShader *shd, int use32bit)
 
     GLSLC(0, #define DECODE                                              );
 
+    av_bprintf(&shd->src, "#define RGB_LINECACHE %i\n"                   ,RGB_LINECACHE);
     av_bprintf(&shd->src, "#define CONTEXT_SIZE %i\n"                    ,CONTEXT_SIZE);
     av_bprintf(&shd->src, "#define MAX_QUANT_TABLE_MASK 0x%x\n"          ,MAX_QUANT_TABLE_MASK);
 
@@ -936,7 +939,7 @@ static int init_indirect(AVCodecContext *avctx, FFVulkanContext *s,
     frames_ctx->format    = AV_PIX_FMT_VULKAN;
     frames_ctx->sw_format = sw_format;
     frames_ctx->width     = s->frames->width;
-    frames_ctx->height    = f->num_v_slices*2;
+    frames_ctx->height    = f->num_v_slices*RGB_LINECACHE;
 
     vk_frames = frames_ctx->hwctx;
     vk_frames->tiling    = VK_IMAGE_TILING_OPTIMAL;
-- 
2.49.0.395.g12beb8f557c
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 17+ messages in thread

* [FFmpeg-devel] [PATCH 07/16] ffv1_common: minor RGB optimization
  2025-05-14 19:02 [FFmpeg-devel] [PATCH 01/16] ffv1enc_vulkan: merge all encoder variants into one file Lynne
                   ` (4 preceding siblings ...)
  2025-05-14 19:02 ` [FFmpeg-devel] [PATCH 06/16] ffv1enc_vulkan: switch to 2-line cache, unify prediction code Lynne
@ 2025-05-14 19:02 ` Lynne
  2025-05-14 19:02 ` [FFmpeg-devel] [PATCH 08/16] ffv1enc_vulkan: use ff_get_encode_buffer Lynne
                   ` (8 subsequent siblings)
  14 siblings, 0 replies; 17+ messages in thread
From: Lynne @ 2025-05-14 19:02 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: Lynne

---
 libavcodec/vulkan/ffv1_common.comp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/libavcodec/vulkan/ffv1_common.comp b/libavcodec/vulkan/ffv1_common.comp
index 1f222bdc42..3d40592739 100644
--- a/libavcodec/vulkan/ffv1_common.comp
+++ b/libavcodec/vulkan/ffv1_common.comp
@@ -100,17 +100,17 @@ uint slice_coord(uint width, uint sx, uint num_h_slices, uint chroma_shift)
 ivec2 get_pred(readonly uimage2D pred, ivec2 sp, ivec2 off,
                int comp, int sw, uint8_t quant_table_idx, bool extend_lookup)
 {
-    const ivec2 yoff_border1 = expectEXT(off.x == 0, false) ? ivec2(1, -1) : ivec2(0, 0);
+    const ivec2 yoff_border1 = expectEXT(off.x == 0, false) ? off + ivec2(1, -1) : off;
 
     /* Thanks to the same coincidence as below, we can skip checking if off == 0, 1 */
-    VTYPE3 top  = VTYPE3(TYPE(imageLoad(pred, sp + LADDR(off + ivec2(-1, -1) + yoff_border1))[comp]),
+    VTYPE3 top  = VTYPE3(TYPE(imageLoad(pred, sp + LADDR(yoff_border1 + ivec2(-1, -1)))[comp]),
                          TYPE(imageLoad(pred, sp + LADDR(off + ivec2(0, -1)))[comp]),
                          TYPE(imageLoad(pred, sp + LADDR(off + ivec2(min(1, sw - off.x - 1), -1)))[comp]));
 
     /* Normally, we'd need to check if off != ivec2(0, 0) here, since otherwise, we must
      * return zero. However, ivec2(-1,  0) + ivec2(1, -1) == ivec2(0, -1), e.g. previous
      * row, 0 offset, same slice, which is zero since we zero out the buffer for RGB */
-    TYPE cur = TYPE(imageLoad(pred, sp + LADDR(off + ivec2(-1,  0) + yoff_border1))[comp]);
+    TYPE cur = TYPE(imageLoad(pred, sp + LADDR(yoff_border1 + ivec2(-1,  0)))[comp]);
 
     int base = quant_table[quant_table_idx][0][(cur    - top[0]) & MAX_QUANT_TABLE_MASK] +
                quant_table[quant_table_idx][1][(top[0] - top[1]) & MAX_QUANT_TABLE_MASK] +
-- 
2.49.0.395.g12beb8f557c
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 17+ messages in thread

* [FFmpeg-devel] [PATCH 08/16] ffv1enc_vulkan: use ff_get_encode_buffer
  2025-05-14 19:02 [FFmpeg-devel] [PATCH 01/16] ffv1enc_vulkan: merge all encoder variants into one file Lynne
                   ` (5 preceding siblings ...)
  2025-05-14 19:02 ` [FFmpeg-devel] [PATCH 07/16] ffv1_common: minor RGB optimization Lynne
@ 2025-05-14 19:02 ` Lynne
  2025-05-14 19:02 ` [FFmpeg-devel] [PATCH 09/16] vulkan_ffv1: fix PCM + cached symbol reader Lynne
                   ` (7 subsequent siblings)
  14 siblings, 0 replies; 17+ messages in thread
From: Lynne @ 2025-05-14 19:02 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: Lynne

We used to create our own packet buffer, but still used the DR1 flag,
which is not how DR1 is supposed to work.

Instead, use ff_get_encode_buffer, and either host-map the buffer
before copying each slice via GPU transfers, or just copy each
slice manually if that fails or is unavailable.
---
 libavcodec/ffv1enc_vulkan.c | 98 +++++++++++++++++++++----------------
 1 file changed, 57 insertions(+), 41 deletions(-)

diff --git a/libavcodec/ffv1enc_vulkan.c b/libavcodec/ffv1enc_vulkan.c
index bab9bb640b..c2eb73ca53 100644
--- a/libavcodec/ffv1enc_vulkan.c
+++ b/libavcodec/ffv1enc_vulkan.c
@@ -88,7 +88,6 @@ typedef struct VulkanEncodeFFv1Context {
 
     /* Output data buffer */
     AVBufferPool *out_data_pool;
-    AVBufferPool *pkt_data_pool;
 
     /* Slice results buffer */
     AVBufferPool *results_data_pool;
@@ -299,8 +298,11 @@ static int vulkan_encode_ffv1_submit_frame(AVCodecContext *avctx,
                                 VK_BUFFER_USAGE_STORAGE_BUFFER_BIT |
                                 VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT,
                                 NULL, maxsize,
-                                maxsize < fv->max_heap_size ?
-                                VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT : 0x0));
+                                VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT |
+                                (maxsize < fv->max_heap_size ?
+                                 VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT : 0x0) |
+                                (!(fv->s.extensions & FF_VK_EXT_EXTERNAL_HOST_MEMORY) ?
+                                 VK_MEMORY_PROPERTY_HOST_CACHED_BIT : 0x0)));
     out_data_buf = (FFVkBuffer *)fd->out_data_ref->data;
     ff_vk_exec_add_dep_buf(&fv->s, exec, &fd->out_data_ref, 1, 1);
 
@@ -583,10 +585,10 @@ fail:
     return err;
 }
 
-static int download_slices(AVCodecContext *avctx,
+static int transfer_slices(AVCodecContext *avctx,
                            VkBufferCopy *buf_regions, int nb_regions,
                            VulkanEncodeFFv1FrameData *fd,
-                           AVBufferRef *pkt_data_ref)
+                           uint8_t *dst, AVBufferRef *dst_ref)
 {
     int err;
     VulkanEncodeFFv1Context *fv = avctx->priv_data;
@@ -594,11 +596,20 @@ static int download_slices(AVCodecContext *avctx,
     FFVkExecContext *exec;
 
     FFVkBuffer *out_data_buf = (FFVkBuffer *)fd->out_data_ref->data;
-    FFVkBuffer *pkt_data_buf = (FFVkBuffer *)pkt_data_ref->data;
+
+    AVBufferRef *mapped_ref;
+    FFVkBuffer *mapped_buf;
 
     VkBufferMemoryBarrier2 buf_bar[8];
     int nb_buf_bar = 0;
 
+    err = ff_vk_host_map_buffer(&fv->s, &mapped_ref, dst, dst_ref,
+                                VK_BUFFER_USAGE_TRANSFER_DST_BIT);
+    if (err < 0)
+        return err;
+
+    mapped_buf = (FFVkBuffer *)mapped_ref->data;
+
     /* Transfer the slices */
     exec = ff_vk_exec_get(&fv->s, &fv->transfer_exec_pool);
     ff_vk_exec_start(&fv->s, exec);
@@ -606,7 +617,8 @@ static int download_slices(AVCodecContext *avctx,
     ff_vk_exec_add_dep_buf(&fv->s, exec, &fd->out_data_ref, 1, 0);
     fd->out_data_ref = NULL; /* Ownership passed */
 
-    ff_vk_exec_add_dep_buf(&fv->s, exec, &pkt_data_ref, 1, 1);
+    ff_vk_exec_add_dep_buf(&fv->s, exec, &mapped_ref, 1, 0);
+    mapped_ref = NULL; /* Ownership passed */
 
     /* Ensure the output buffer is finished */
     buf_bar[nb_buf_bar++] = (VkBufferMemoryBarrier2) {
@@ -630,8 +642,11 @@ static int download_slices(AVCodecContext *avctx,
     out_data_buf->access = buf_bar[0].dstAccessMask;
     nb_buf_bar = 0;
 
+    for (int i = 0; i < nb_regions; i++)
+        buf_regions[i].dstOffset += mapped_buf->virtual_offset;
+
     vk->CmdCopyBuffer(exec->buf,
-                      out_data_buf->buf, pkt_data_buf->buf,
+                      out_data_buf->buf, mapped_buf->buf,
                       nb_regions, buf_regions);
 
     /* Submit */
@@ -642,18 +657,6 @@ static int download_slices(AVCodecContext *avctx,
     /* We need the encoded data immediately */
     ff_vk_exec_wait(&fv->s, exec);
 
-    /* Invalidate slice/output data if needed */
-    if (!(pkt_data_buf->flags & VK_MEMORY_PROPERTY_HOST_COHERENT_BIT)) {
-        VkMappedMemoryRange invalidate_data = {
-            .sType = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE,
-            .memory = pkt_data_buf->mem,
-            .offset = 0,
-            .size = VK_WHOLE_SIZE,
-        };
-        vk->InvalidateMappedMemoryRanges(fv->s.hwctx->act_dev,
-                                         1, &invalidate_data);
-    }
-
     return 0;
 }
 
@@ -664,13 +667,9 @@ static int get_packet(AVCodecContext *avctx, FFVkExecContext *exec,
     VulkanEncodeFFv1Context *fv = avctx->priv_data;
     FFV1Context *f = &fv->ctx;
     FFVulkanFunctions *vk = &fv->s.vkfn;
-
-    /* Packet data */
-    AVBufferRef *pkt_data_ref;
-    FFVkBuffer *pkt_data_buf;
-
     VulkanEncodeFFv1FrameData *fd = exec->opaque;
 
+    FFVkBuffer *out_data_buf = (FFVkBuffer *)fd->out_data_ref->data;
     FFVkBuffer *results_data_buf = (FFVkBuffer *)fd->results_data_ref->data;
     uint64_t *sc;
 
@@ -707,20 +706,9 @@ static int get_packet(AVCodecContext *avctx, FFVkExecContext *exec,
     av_log(avctx, AV_LOG_VERBOSE, "Encoded data: %iMiB\n", pkt->size / (1024*1024));
     av_buffer_unref(&fd->results_data_ref); /* No need for this buffer anymore */
 
-    /* Allocate packet buffer */
-    err = ff_vk_get_pooled_buffer(&fv->s, &fv->pkt_data_pool,
-                                  &pkt_data_ref,
-                                  VK_BUFFER_USAGE_TRANSFER_DST_BIT,
-                                  NULL, pkt->size,
-                                  VK_MEMORY_PROPERTY_HOST_CACHED_BIT |
-                                  VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT);
-    if (err < 0)
+    /* Allocate packet */
+    if ((err = ff_get_encode_buffer(avctx, pkt, pkt->size, 0)) < 0)
         return err;
-    pkt_data_buf = (FFVkBuffer *)pkt_data_ref->data;
-
-    /* Setup packet data */
-    pkt->data     = pkt_data_buf->mapped_mem;
-    pkt->buf      = pkt_data_ref;
 
     pkt->pts      = fd->pts;
     pkt->dts      = fd->pts;
@@ -733,8 +721,37 @@ static int get_packet(AVCodecContext *avctx, FFVkExecContext *exec,
         fd->frame_opaque_ref = NULL;
     }
 
-    return download_slices(avctx, fv->buf_regions, f->slice_count, fd,
-                           pkt_data_ref);
+    /* Try using host mapped memory transfers first */
+    if (fv->s.extensions & FF_VK_EXT_EXTERNAL_HOST_MEMORY) {
+        err = transfer_slices(avctx, fv->buf_regions, f->slice_count, fd,
+                              pkt->data, pkt->buf);
+        if (err >= 0)
+            return err;
+    }
+
+    /* Invalidate slice/output data if needed */
+    if (!(out_data_buf->flags & VK_MEMORY_PROPERTY_HOST_COHERENT_BIT)) {
+        VkMappedMemoryRange invalidate_data = {
+            .sType = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE,
+            .memory = out_data_buf->mem,
+            .offset = 0,
+            .size = VK_WHOLE_SIZE,
+        };
+        vk->InvalidateMappedMemoryRanges(fv->s.hwctx->act_dev,
+                                         1, &invalidate_data);
+    }
+
+    /* Copy each slice */
+    for (int i = 0; i < f->slice_count; i++) {
+        VkBufferCopy *region = &fv->buf_regions[i];
+        memcpy(pkt->data + region->dstOffset,
+               out_data_buf->mapped_mem + region->srcOffset,
+               region->size);
+    }
+
+    av_buffer_unref(&fd->out_data_ref);
+
+    return 0;
 }
 
 static int vulkan_encode_ffv1_receive_packet(AVCodecContext *avctx,
@@ -1523,7 +1540,6 @@ static av_cold int vulkan_encode_ffv1_close(AVCodecContext *avctx)
     av_buffer_pool_uninit(&fv->results_data_pool);
 
     av_buffer_pool_uninit(&fv->out_data_pool);
-    av_buffer_pool_uninit(&fv->pkt_data_pool);
 
     av_buffer_unref(&fv->keyframe_slice_data_ref);
     av_buffer_pool_uninit(&fv->slice_data_pool);
-- 
2.49.0.395.g12beb8f557c
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 17+ messages in thread

* [FFmpeg-devel] [PATCH 09/16] vulkan_ffv1: fix PCM + cached symbol reader
  2025-05-14 19:02 [FFmpeg-devel] [PATCH 01/16] ffv1enc_vulkan: merge all encoder variants into one file Lynne
                   ` (6 preceding siblings ...)
  2025-05-14 19:02 ` [FFmpeg-devel] [PATCH 08/16] ffv1enc_vulkan: use ff_get_encode_buffer Lynne
@ 2025-05-14 19:02 ` Lynne
  2025-05-14 19:02 ` [FFmpeg-devel] [PATCH 10/16] ffv1enc_vulkan: implement the cached EC writer from the decoder Lynne
                   ` (6 subsequent siblings)
  14 siblings, 0 replies; 17+ messages in thread
From: Lynne @ 2025-05-14 19:02 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: Lynne

writeout_rgb requires that all subgroups are active.
---
 libavcodec/vulkan/ffv1_dec.comp | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/libavcodec/vulkan/ffv1_dec.comp b/libavcodec/vulkan/ffv1_dec.comp
index c74af4bf6a..e73b3f1dc0 100644
--- a/libavcodec/vulkan/ffv1_dec.comp
+++ b/libavcodec/vulkan/ffv1_dec.comp
@@ -56,6 +56,11 @@ int get_isymbol(inout RangeCoder c, uint state_off)
 
 void decode_line_pcm(inout SliceContext sc, ivec2 sp, int w, int y, int p, int bits)
 {
+#ifdef CACHED_SYMBOL_READER
+    if (gl_LocalInvocationID.x > 0)
+        return;
+#endif
+
 #ifndef RGB
     if (p > 0 && p < 3) {
         w >>= chroma_shift.x;
@@ -235,8 +240,6 @@ void decode_slice(inout SliceContext sc, const uint slice_idx)
     /* PCM coding */
 #ifndef GOLOMB
     if (sc.slice_coding_mode == 1) {
-        if (gl_LocalInvocationID.x > 0)
-            return;
 #ifndef RGB
         for (int p = 0; p < planes; p++) {
             int h = sc.slice_dim.y;
-- 
2.49.0.395.g12beb8f557c
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 17+ messages in thread

* [FFmpeg-devel] [PATCH 10/16] ffv1enc_vulkan: implement the cached EC writer from the decoder
  2025-05-14 19:02 [FFmpeg-devel] [PATCH 01/16] ffv1enc_vulkan: merge all encoder variants into one file Lynne
                   ` (7 preceding siblings ...)
  2025-05-14 19:02 ` [FFmpeg-devel] [PATCH 09/16] vulkan_ffv1: fix PCM + cached symbol reader Lynne
@ 2025-05-14 19:02 ` Lynne
  2025-05-14 19:02 ` [FFmpeg-devel] [PATCH 11/16] ffv1enc_vulkan: implement RCT search for level >= 4 Lynne
                   ` (5 subsequent siblings)
  14 siblings, 0 replies; 17+ messages in thread
From: Lynne @ 2025-05-14 19:02 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: Lynne

This gives a 35% speedup on AMD and 50% on Nvidia.
---
 libavcodec/ffv1enc_vulkan.c     |  6 ++-
 libavcodec/vulkan/ffv1_enc.comp | 68 ++++++++++++++++++++++-----------
 2 files changed, 50 insertions(+), 24 deletions(-)

diff --git a/libavcodec/ffv1enc_vulkan.c b/libavcodec/ffv1enc_vulkan.c
index c2eb73ca53..5de16d5b02 100644
--- a/libavcodec/ffv1enc_vulkan.c
+++ b/libavcodec/ffv1enc_vulkan.c
@@ -1099,12 +1099,13 @@ static int init_encode_shader(AVCodecContext *avctx, FFVkSPIRVCompiler *spv)
     uint8_t *spv_data;
     size_t spv_len;
     void *spv_opaque = NULL;
+    int use_cached_reader = fv->ctx.ac != AC_GOLOMB_RICE;
 
     RET(ff_vk_shader_init(&fv->s, shd, "ffv1_enc",
                           VK_SHADER_STAGE_COMPUTE_BIT,
                           (const char *[]) { "GL_EXT_buffer_reference",
                                              "GL_EXT_buffer_reference2" }, 2,
-                          1, 1, 1,
+                          use_cached_reader ? CONTEXT_SIZE : 1, 1, 1,
                           0));
 
     /* Common codec header */
@@ -1116,6 +1117,9 @@ static int init_encode_shader(AVCodecContext *avctx, FFVkSPIRVCompiler *spv)
     av_bprintf(&shd->src, "#define MAX_CONTEXT_INPUTS %i\n", MAX_CONTEXT_INPUTS);
     av_bprintf(&shd->src, "#define MAX_QUANT_TABLE_SIZE %i\n", MAX_QUANT_TABLE_SIZE);
 
+    if (use_cached_reader)
+        av_bprintf(&shd->src, "#define CACHED_SYMBOL_READER 1\n");
+
     desc_set = (FFVulkanDescriptorSetBinding []) {
         {
             .name        = "rangecoder_static_buf",
diff --git a/libavcodec/vulkan/ffv1_enc.comp b/libavcodec/vulkan/ffv1_enc.comp
index db33c414e1..65a7df1359 100644
--- a/libavcodec/vulkan/ffv1_enc.comp
+++ b/libavcodec/vulkan/ffv1_enc.comp
@@ -21,27 +21,32 @@
  */
 
 #ifndef GOLOMB
+#ifdef CACHED_SYMBOL_READER
+shared uint8_t state[CONTEXT_SIZE];
+#define WRITE(c, off, val) put_rac_direct(c, state[off], val)
+#else
+#define WRITE(c, off, val) put_rac(c, uint64_t(slice_state) + (state_off + off), val)
+#endif
+
 /* Note - only handles signed values */
-void put_symbol(inout RangeCoder c, uint64_t state, int v)
+void put_symbol(inout RangeCoder c, uint state_off, int v)
 {
     bool is_nil = (v == 0);
-    put_rac(c, state, is_nil);
+    WRITE(c, 0, is_nil);
     if (is_nil)
         return;
 
     const int a = abs(v);
     const int e = findMSB(a);
 
-    state += 1;
     for (int i = 0; i < e; i++)
-        put_rac(c, state + min(i, 9), true);
-    put_rac(c, state + min(e, 9), false);
+        WRITE(c, 1 + min(i, 9), true);
+    WRITE(c, 1 + min(e, 9), false);
 
-    state += 21;
     for (int i = e - 1; i >= 0; i--)
-        put_rac(c, state + min(i, 9), bool(bitfieldExtract(a, i, 1)));
+        WRITE(c, 22 + min(i, 9), bool(bitfieldExtract(a, i, 1)));
 
-    put_rac(c, state - 11 + min(e, 10), v < 0);
+    WRITE(c, 22 - 11 + min(e, 10), v < 0);
 }
 
 void encode_line_pcm(inout SliceContext sc, readonly uimage2D img,
@@ -49,6 +54,11 @@ void encode_line_pcm(inout SliceContext sc, readonly uimage2D img,
 {
     int w = sc.slice_dim.x;
 
+#ifdef CACHED_SYMBOL_READER
+    if (gl_LocalInvocationID.x > 0)
+        return;
+#endif
+
 #ifndef RGB
     if (p > 0 && p < 3) {
         w >>= chroma_shift.x;
@@ -63,7 +73,7 @@ void encode_line_pcm(inout SliceContext sc, readonly uimage2D img,
     }
 }
 
-void encode_line(inout SliceContext sc, readonly uimage2D img, uint64_t state,
+void encode_line(inout SliceContext sc, readonly uimage2D img, uint state_off,
                  ivec2 sp, int y, int p, int comp, int bits,
                  uint8_t quant_table_idx, const int run_index)
 {
@@ -86,13 +96,25 @@ void encode_line(inout SliceContext sc, readonly uimage2D img, uint64_t state,
 
         d[1] = fold(d[1], bits);
 
-        put_symbol(sc.c, state + CONTEXT_SIZE*d[0], d[1]);
+        uint context_off = state_off + CONTEXT_SIZE*d[0];
+#ifdef CACHED_SYMBOL_READER
+        u8buf sb = u8buf(uint64_t(slice_state) + context_off + gl_LocalInvocationID.x);
+        state[gl_LocalInvocationID.x] = sb.v;
+        barrier();
+        if (gl_LocalInvocationID.x == 0)
+#endif
+
+            put_symbol(sc.c, context_off, d[1]);
+
+#ifdef CACHED_SYMBOL_READER
+        sb.v = state[gl_LocalInvocationID.x];
+#endif
     }
 }
 
 #else /* GOLOMB */
 
-void encode_line(inout SliceContext sc, readonly uimage2D img, uint64_t state,
+void encode_line(inout SliceContext sc, readonly uimage2D img, uint state_off,
                  ivec2 sp, int y, int p, int comp, int bits,
                  uint8_t quant_table_idx, inout int run_index)
 {
@@ -143,7 +165,7 @@ void encode_line(inout SliceContext sc, readonly uimage2D img, uint64_t state,
         }
 
         if (!run_mode) {
-            VlcState sb = VlcState(state + VLC_STATE_SIZE*d[0]);
+            VlcState sb = VlcState(uint64_t(slice_state) + state_off + VLC_STATE_SIZE*d[0]);
             Symbol sym = get_vlc_symbol(sb, d[1], bits);
             put_bits(sc.pb, sym.bits, sym.val);
         }
@@ -245,8 +267,7 @@ void encode_slice(inout SliceContext sc, const uint slice_idx)
 #endif
     {
         u8vec4 quant_table_idx = sc.quant_table_idx.xyyz;
-        uint64_t slice_state_off = uint64_t(slice_state) +
-                                   slice_idx*plane_state_size*codec_planes;
+        u32vec4 slice_state_off = (slice_idx*codec_planes + uvec4(0, 1, 1, 2))*plane_state_size;
 
 #ifndef RGB
         for (int c = 0; c < components; c++) {
@@ -260,26 +281,22 @@ void encode_slice(inout SliceContext sc, const uint slice_idx)
             int comp = c - p;
 
             for (int y = 0; y < h; y++)
-                encode_line(sc, src[p], slice_state_off, sp, y, p,
+                encode_line(sc, src[p], slice_state_off[c], sp, y, p,
                             comp, bits, quant_table_idx[c], run_index);
-
-            /* For the second chroma plane, reuse the first plane's state */
-            if (c != 1)
-                slice_state_off += plane_state_size;
         }
 #else
         int run_index = 0;
         for (int y = 0; y < sc.slice_dim.y; y++) {
             preload_rgb(sc, sp, sc.slice_dim.x, y, true);
 
-            encode_line(sc, tmp, slice_state_off + plane_state_size*0,
+            encode_line(sc, tmp, slice_state_off[0],
                         sp, y, 0, 1, bits, quant_table_idx[0], run_index);
-            encode_line(sc, tmp, slice_state_off + plane_state_size*1,
+            encode_line(sc, tmp, slice_state_off[1],
                         sp, y, 0, 2, bits, quant_table_idx[1], run_index);
-            encode_line(sc, tmp, slice_state_off + plane_state_size*1,
+            encode_line(sc, tmp, slice_state_off[2],
                         sp, y, 0, 0, bits, quant_table_idx[2], run_index);
             if (transparency == 1)
-                encode_line(sc, tmp, slice_state_off + plane_state_size*2,
+                encode_line(sc, tmp, slice_state_off[3],
                             sp, y, 0, 3, bits, quant_table_idx[3], run_index);
         }
 #endif
@@ -288,6 +305,11 @@ void encode_slice(inout SliceContext sc, const uint slice_idx)
 
 void finalize_slice(inout SliceContext sc, const uint slice_idx)
 {
+#ifdef CACHED_SYMBOL_READER
+    if (gl_LocalInvocationID.x > 0)
+        return;
+#endif
+
 #ifdef GOLOMB
     uint32_t enc_len = sc.hdr_len + flush_put_bits(sc.pb);
 #else
-- 
2.49.0.395.g12beb8f557c
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 17+ messages in thread

* [FFmpeg-devel] [PATCH 11/16] ffv1enc_vulkan: implement RCT search for level >= 4
  2025-05-14 19:02 [FFmpeg-devel] [PATCH 01/16] ffv1enc_vulkan: merge all encoder variants into one file Lynne
                   ` (8 preceding siblings ...)
  2025-05-14 19:02 ` [FFmpeg-devel] [PATCH 10/16] ffv1enc_vulkan: implement the cached EC writer from the decoder Lynne
@ 2025-05-14 19:02 ` Lynne
  2025-05-14 19:02 ` [FFmpeg-devel] [PATCH 12/16] vulkan/ffv1: unify encode and decode get/put primitives Lynne
                   ` (4 subsequent siblings)
  14 siblings, 0 replies; 17+ messages in thread
From: Lynne @ 2025-05-14 19:02 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: Lynne

---
 libavcodec/ffv1enc_vulkan.c            | 204 ++++++++++++++++++++++++-
 libavcodec/vulkan/Makefile             |   2 +-
 libavcodec/vulkan/ffv1_enc_setup.comp  |   6 +-
 libavcodec/vulkan/ffv1_rct_search.comp | 139 +++++++++++++++++
 4 files changed, 346 insertions(+), 5 deletions(-)
 create mode 100644 libavcodec/vulkan/ffv1_rct_search.comp

diff --git a/libavcodec/ffv1enc_vulkan.c b/libavcodec/ffv1enc_vulkan.c
index 5de16d5b02..d9e12f5fae 100644
--- a/libavcodec/ffv1enc_vulkan.c
+++ b/libavcodec/ffv1enc_vulkan.c
@@ -74,6 +74,7 @@ typedef struct VulkanEncodeFFv1Context {
     size_t max_heap_size;
 
     FFVulkanShader setup;
+    FFVulkanShader rct_search;
     FFVulkanShader reset;
     FFVulkanShader enc;
 
@@ -101,6 +102,7 @@ typedef struct VulkanEncodeFFv1Context {
     int num_h_slices;
     int num_v_slices;
     int force_pcm;
+    int optimize_rct;
 
     int is_rgb;
     int ppi;
@@ -112,6 +114,7 @@ extern const char *ff_source_rangecoder_comp;
 extern const char *ff_source_ffv1_vlc_comp;
 extern const char *ff_source_ffv1_common_comp;
 extern const char *ff_source_ffv1_reset_comp;
+extern const char *ff_source_ffv1_rct_search_comp;
 extern const char *ff_source_ffv1_enc_setup_comp;
 extern const char *ff_source_ffv1_enc_comp;
 
@@ -147,7 +150,8 @@ typedef struct FFv1VkParameters {
     uint8_t ec;
     uint8_t ppi;
     uint8_t chunks;
-    uint8_t padding[4];
+    uint8_t rct_search;
+    uint8_t padding[3];
 } FFv1VkParameters;
 
 static void add_push_data(FFVulkanShader *shd)
@@ -184,12 +188,76 @@ static void add_push_data(FFVulkanShader *shd)
     GLSLC(1,    uint8_t ec;                                                   );
     GLSLC(1,    uint8_t ppi;                                                  );
     GLSLC(1,    uint8_t chunks;                                               );
-    GLSLC(1,    uint8_t padding[4];                                           );
+    GLSLC(1,    uint8_t rct_search;                                           );
+    GLSLC(1,    uint8_t padding[3];                                           );
     GLSLC(0, };                                                               );
     ff_vk_shader_add_push_const(shd, 0, sizeof(FFv1VkParameters),
                                 VK_SHADER_STAGE_COMPUTE_BIT);
 }
 
+/* Push constant payload uploaded by run_rct_search() for the RCT search
+ * shader. The shader-side declaration is the pushConstants block emitted in
+ * init_rct_search_shader(); both must be kept in sync by hand. */
+typedef struct FFv1VkRCTSearchParameters {
+    int fmt_lut[4];        /* component permutation applied by load_components() */
+    int rct_offset;        /* set to 1 << bits_per_raw_sample by run_rct_search() */
+    uint8_t planar_rgb;    /* nonzero: components 1 and 2 are loaded from src[1], src[2] */
+    uint8_t transparency;  /* copied from FFV1Context.transparency */
+    uint8_t key_frame;     /* copied from FFV1Context.key_frame */
+    uint8_t force_pcm;     /* when 1, the search shader returns immediately */
+    uint8_t version;       /* copied from FFV1Context.version */
+    uint8_t micro_version; /* copied from FFV1Context.micro_version */
+    uint8_t padding[2];    /* explicit tail padding, left zero by the initializer */
+} FFv1VkRCTSearchParameters;
+
+/* Records the RCT coefficient search pass into exec: binds the slice data
+ * buffer and the input image views, uploads the push constants and dispatches
+ * a num_h_slices x num_v_slices grid of workgroups. Always returns 0. */
+static int run_rct_search(AVCodecContext *avctx, FFVkExecContext *exec,
+                          AVFrame *enc_in, VkImageView *enc_in_views,
+                          FFVkBuffer *slice_data_buf, uint32_t slice_data_size)
+{
+    VulkanEncodeFFv1Context *fv = avctx->priv_data;
+    FFV1Context *f = &fv->ctx;
+    FFVulkanFunctions *vk = &fv->s.vkfn;
+    AVHWFramesContext *src_hwfc = (AVHWFramesContext *)enc_in->hw_frames_ctx->data;
+
+    /* The shader only reads extra planes for an ff_vk_mt_is_np_rgb() format
+     * that is backed by more than one image. */
+    const int multi_image_rgb = ff_vk_mt_is_np_rgb(src_hwfc->sw_format) &&
+                                (ff_vk_count_images((AVVkFrame *)enc_in->data[0]) > 1);
+
+    /* Fields not listed here (fmt_lut, padding) start out zeroed */
+    FFv1VkRCTSearchParameters pd = {
+        .rct_offset    = 1 << f->bits_per_raw_sample,
+        .planar_rgb    = multi_image_rgb,
+        .transparency  = f->transparency,
+        .key_frame     = f->key_frame,
+        .force_pcm     = fv->force_pcm,
+        .version       = f->version,
+        .micro_version = f->micro_version,
+    };
+
+    /* Component order: fixed for the high bit depth GBRP formats, otherwise
+     * whatever ff_vk_set_perm() reports for the software format. */
+    switch (avctx->sw_pix_fmt) {
+    case AV_PIX_FMT_GBRP10:
+    case AV_PIX_FMT_GBRP12:
+    case AV_PIX_FMT_GBRP14:
+        pd.fmt_lut[0] = 2;
+        pd.fmt_lut[1] = 1;
+        pd.fmt_lut[2] = 0;
+        pd.fmt_lut[3] = 3;
+        break;
+    default:
+        ff_vk_set_perm(avctx->sw_pix_fmt, pd.fmt_lut, 1);
+        break;
+    }
+
+    /* Descriptors: slice contexts in binding 0, source image views in binding 1 */
+    ff_vk_shader_update_desc_buffer(&fv->s, exec, &fv->rct_search,
+                                    0, 0, 0,
+                                    slice_data_buf,
+                                    0, slice_data_size*f->slice_count,
+                                    VK_FORMAT_UNDEFINED);
+    ff_vk_shader_update_img_array(&fv->s, exec, &fv->rct_search,
+                                  enc_in, enc_in_views,
+                                  0, 1,
+                                  VK_IMAGE_LAYOUT_GENERAL,
+                                  VK_NULL_HANDLE);
+
+    ff_vk_exec_bind_shader(&fv->s, exec, &fv->rct_search);
+
+    ff_vk_shader_update_push_const(&fv->s, exec, &fv->rct_search,
+                                   VK_SHADER_STAGE_COMPUTE_BIT,
+                                   0, sizeof(pd), &pd);
+
+    vk->CmdDispatch(exec->buf, f->num_h_slices, f->num_v_slices, 1);
+
+    return 0;
+}
+
 static int vulkan_encode_ffv1_submit_frame(AVCodecContext *avctx,
                                            FFVkExecContext *exec,
                                            const AVFrame *pict)
@@ -366,6 +434,25 @@ static int vulkan_encode_ffv1_submit_frame(AVCodecContext *avctx,
         };
     }
 
+    if (fv->optimize_rct) {
+        RET(run_rct_search(avctx, exec,
+                           src, src_views,
+                           slice_data_buf, slice_data_size));
+
+        buf_bar[nb_buf_bar++] = (VkBufferMemoryBarrier2) {
+            .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2,
+            .srcStageMask = slice_data_buf->stage,
+            .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
+            .srcAccessMask = slice_data_buf->access,
+            .dstAccessMask = VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT,
+            .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
+            .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
+            .buffer = slice_data_buf->buf,
+            .size = slice_data_size*f->slice_count,
+            .offset = 0,
+        };
+    }
+
     vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) {
         .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
         .pImageMemoryBarriers = img_bar,
@@ -412,6 +499,7 @@ static int vulkan_encode_ffv1_submit_frame(AVCodecContext *avctx,
         .ec = f->ec,
         .ppi = fv->ppi,
         .chunks = fv->chunks,
+        .rct_search = fv->optimize_rct,
     };
 
     /* For some reason the C FFv1 encoder/decoder treats these differently */
@@ -920,6 +1008,103 @@ static void define_shared_code(AVCodecContext *avctx, FFVulkanShader *shd)
     GLSLD(ff_source_ffv1_common_comp);
 }
 
+/* Builds, compiles, links and registers the RCT search compute shader
+ * (fv->rct_search). Returns the value left in err by the last RET() step;
+ * the caller treats a negative value as failure. The SPIR-V compiler's
+ * per-shader opaque data is released on every path. */
+static int init_rct_search_shader(AVCodecContext *avctx, FFVkSPIRVCompiler *spv)
+{
+    int err;
+    VulkanEncodeFFv1Context *fv = avctx->priv_data;
+    FFVulkanShader *shd = &fv->rct_search;
+    FFVulkanDescriptorSetBinding *desc_set;
+
+    uint8_t *spv_data;
+    size_t spv_len;
+    void *spv_opaque = NULL;
+
+    /* GL_EXT_null_initializer is needed for the "= { }" shared arrays in
+     * ffv1_rct_search.comp */
+    RET(ff_vk_shader_init(&fv->s, shd, "ffv1_rct_search",
+                          VK_SHADER_STAGE_COMPUTE_BIT,
+                          (const char *[]) { "GL_EXT_buffer_reference",
+                                             "GL_EXT_buffer_reference2",
+                                             "GL_EXT_null_initializer" }, 3,
+                          32, 32, 1,
+                          0));
+
+    /* Common codec header */
+    GLSLD(ff_source_common_comp);
+
+    /* Shader-side view of FFv1VkRCTSearchParameters. With the scalar layout
+     * the fields must match the C struct one for one, including the size of
+     * the trailing padding, since run_rct_search() pushes sizeof() of it. */
+    GLSLC(0, layout(push_constant, scalar) uniform pushConstants {             );
+    GLSLC(1,    ivec4 fmt_lut;                                                 );
+    GLSLC(1,    int rct_offset;                                                );
+    GLSLC(1,    uint8_t planar_rgb;                                            );
+    GLSLC(1,    uint8_t transparency;                                          );
+    GLSLC(1,    uint8_t key_frame;                                             );
+    GLSLC(1,    uint8_t force_pcm;                                             );
+    GLSLC(1,    uint8_t version;                                               );
+    GLSLC(1,    uint8_t micro_version;                                         );
+    GLSLC(1,    uint8_t padding[2];                                            );
+    GLSLC(0, };                                                                );
+    /* The range must be sized from the struct that is actually pushed */
+    ff_vk_shader_add_push_const(shd, 0, sizeof(FFv1VkRCTSearchParameters),
+                                VK_SHADER_STAGE_COMPUTE_BIT);
+
+    av_bprintf(&shd->src, "#define MAX_QUANT_TABLES %i\n", MAX_QUANT_TABLES);
+    av_bprintf(&shd->src, "#define MAX_CONTEXT_INPUTS %i\n", MAX_CONTEXT_INPUTS);
+    av_bprintf(&shd->src, "#define MAX_QUANT_TABLE_SIZE %i\n", MAX_QUANT_TABLE_SIZE);
+
+    /* Never used */
+    /* NOTE(review): presumably declared only so the shared code added by
+     * define_shared_code() below compiles - confirm */
+    desc_set = (FFVulkanDescriptorSetBinding []) {
+        {
+            .name        = "rangecoder_static_buf",
+            .type        = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER,
+            .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
+            .mem_layout  = "scalar",
+            .buf_content = "uint8_t zero_one_state[512];",
+        },
+        {
+            .name        = "quant_buf",
+            .type        = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER,
+            .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
+            .mem_layout  = "scalar",
+            .buf_content = "int16_t quant_table[MAX_QUANT_TABLES]"
+                           "[MAX_CONTEXT_INPUTS][MAX_QUANT_TABLE_SIZE];",
+        },
+    };
+    RET(ff_vk_shader_add_descriptor_set(&fv->s, shd, desc_set, 2, 1, 1));
+
+    define_shared_code(avctx, shd);
+
+    /* The bindings run_rct_search() fills in: the per-slice contexts and the
+     * source image, one array element per plane of the software format */
+    desc_set = (FFVulkanDescriptorSetBinding []) {
+        {
+            .name        = "slice_data_buf",
+            .type        = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
+            .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
+            .buf_content = "SliceContext slice_ctx[1024];",
+        },
+        {
+            .name       = "src",
+            .type       = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
+            .dimensions = 2,
+            .mem_layout = ff_vk_shader_rep_fmt(fv->s.frames->sw_format,
+                                               fv->rep_fmt),
+            .elems      = av_pix_fmt_count_planes(fv->s.frames->sw_format),
+            .mem_quali  = "readonly",
+            .stages     = VK_SHADER_STAGE_COMPUTE_BIT,
+        },
+    };
+    RET(ff_vk_shader_add_descriptor_set(&fv->s, shd, desc_set, 2, 0, 0));
+
+    GLSLD(ff_source_ffv1_rct_search_comp);
+
+    RET(spv->compile_shader(&fv->s, spv, shd, &spv_data, &spv_len, "main",
+                            &spv_opaque));
+    RET(ff_vk_shader_link(&fv->s, shd, spv_data, spv_len, "main"));
+
+    RET(ff_vk_shader_register_exec(&fv->s, &fv->exec_pool, shd));
+
+fail:
+    /* Reached on success as well; spv_opaque is only set by compile_shader() */
+    if (spv_opaque)
+        spv->free_shader(spv, &spv_opaque);
+
+    return err;
+}
+
 static int init_setup_shader(AVCodecContext *avctx, FFVkSPIRVCompiler *spv)
 {
     int err;
@@ -1417,6 +1602,17 @@ static av_cold int vulkan_encode_ffv1_init(AVCodecContext *avctx)
     if (!fv->is_rgb && f->bits_per_raw_sample > 8)
         fv->rep_fmt = FF_VK_REP_INT;
 
+    /* Init rct search shader */
+    fv->optimize_rct = fv->is_rgb && f->version >= 4 &&
+                       !fv->force_pcm && fv->optimize_rct;
+    if (fv->optimize_rct) {
+        err = init_rct_search_shader(avctx, spv);
+        if (err < 0) {
+            spv->uninit(&spv);
+            return err;
+        }
+    }
+
     /* Init setup shader */
     err = init_setup_shader(avctx, spv);
     if (err < 0) {
@@ -1528,6 +1724,7 @@ static av_cold int vulkan_encode_ffv1_close(AVCodecContext *avctx)
     ff_vk_shader_free(&fv->s, &fv->enc);
     ff_vk_shader_free(&fv->s, &fv->reset);
     ff_vk_shader_free(&fv->s, &fv->setup);
+    ff_vk_shader_free(&fv->s, &fv->rct_search);
 
     if (fv->exec_ctx_info) {
         for (int i = 0; i < fv->async_depth; i++) {
@@ -1591,6 +1788,9 @@ static const AVOption vulkan_encode_ffv1_options[] = {
     { "force_pcm", "Code all slices with no prediction", OFFSET(force_pcm), AV_OPT_TYPE_BOOL,
             { .i64 = 0 }, 0, 1, VE },
 
+    { "rct_search", "Run a search for RCT parameters (level 4 only)", OFFSET(optimize_rct), AV_OPT_TYPE_BOOL,
+            { .i64 = 1 }, 0, 1, VE },
+
     { "async_depth", "Internal parallelization depth", OFFSET(async_depth), AV_OPT_TYPE_INT,
             { .i64 = 1 }, 1, INT_MAX, VE },
 
diff --git a/libavcodec/vulkan/Makefile b/libavcodec/vulkan/Makefile
index 4bbcb38c6a..729cb4f15c 100644
--- a/libavcodec/vulkan/Makefile
+++ b/libavcodec/vulkan/Makefile
@@ -7,7 +7,7 @@ OBJS-$(CONFIG_FFV1_VULKAN_ENCODER)  +=  vulkan/common.o \
 					vulkan/rangecoder.o vulkan/ffv1_vlc.o \
 					vulkan/ffv1_common.o vulkan/ffv1_reset.o \
 					vulkan/ffv1_enc_rct.o vulkan/ffv1_enc_setup.o \
-					vulkan/ffv1_enc.o
+					vulkan/ffv1_rct_search.o vulkan/ffv1_enc.o
 
 OBJS-$(CONFIG_FFV1_VULKAN_HWACCEL)  +=  vulkan/common.o \
 					vulkan/rangecoder.o vulkan/ffv1_vlc.o \
diff --git a/libavcodec/vulkan/ffv1_enc_setup.comp b/libavcodec/vulkan/ffv1_enc_setup.comp
index 6f21e47523..5f8e6704b0 100644
--- a/libavcodec/vulkan/ffv1_enc_setup.comp
+++ b/libavcodec/vulkan/ffv1_enc_setup.comp
@@ -22,7 +22,7 @@
 
 uint8_t state[CONTEXT_SIZE];
 
-void init_slice(out SliceContext sc, const uint slice_idx)
+void init_slice(inout SliceContext sc, const uint slice_idx)
 {
     /* Set coordinates */
     uvec2 img_size = imageSize(src[0]);
@@ -37,11 +37,13 @@ void init_slice(out SliceContext sc, const uint slice_idx)
 
     sc.slice_pos = ivec2(sxs, sys);
     sc.slice_dim = ivec2(sxe - sxs, sye - sys);
-    sc.slice_rct_coef = ivec2(1, 1);
     sc.slice_coding_mode = int(force_pcm == 1);
     sc.slice_reset_contexts = sc.slice_coding_mode == 1;
     sc.quant_table_idx = u8vec3(context_model);
 
+    if ((rct_search == 0) || (sc.slice_coding_mode == 1))
+        sc.slice_rct_coef = ivec2(1, 1);
+
     rac_init(sc.c,
              OFFBUF(u8buf, out_data, slice_idx * slice_size_max),
              slice_size_max);
diff --git a/libavcodec/vulkan/ffv1_rct_search.comp b/libavcodec/vulkan/ffv1_rct_search.comp
new file mode 100644
index 0000000000..055bde46c4
--- /dev/null
+++ b/libavcodec/vulkan/ffv1_rct_search.comp
@@ -0,0 +1,139 @@
+/*
+ * FFv1 codec
+ *
+ * Copyright (c) 2024 Lynne <dev@lynne.ee>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/* Loads the pixel at pos and returns its three components reordered through
+ * fmt_lut. When planar_rgb is set, components 1 and 2 are taken from the
+ * first channel of src[1] and src[2] instead of from src[0]. */
+ivec3 load_components(ivec2 pos)
+{
+    ivec3 raw = ivec3(imageLoad(src[0], pos));
+
+    /* The plane index stays a loop variable, as src[] is declared with one
+     * element per plane of the input format. */
+    for (int plane = 1; planar_rgb != 0 && plane < 3; plane++)
+        raw[plane] = int(imageLoad(src[plane], pos)[0]);
+
+    ivec3 ordered;
+    ordered.x = raw[fmt_lut[0]];
+    ordered.y = raw[fmt_lut[1]];
+    ordered.z = raw[fmt_lut[2]];
+    return ordered;
+}
+
+/* Candidate coefficient pairs scored by process(). transform_sample() uses a
+ * pair (x, y) as g += (r*x + b*y) >> 2 after g has been subtracted from r and
+ * b, so R, G and B end up weighted by x, 4 - x - y and y quarters. The
+ * comment on each entry lists those weights. coeff_search() starts from
+ * index 3, i.e. ivec2(1, 1), before comparing scores. */
+#define NUM_CHECKS 15
+const ivec2 rct_y_coeff[NUM_CHECKS] = {
+    ivec2(0, 0), //      4G
+
+    ivec2(0, 1), //      3G +  B
+    ivec2(1, 0), //  R + 3G
+    ivec2(1, 1), //  R + 2G + B
+
+    ivec2(0, 2), //      2G + 2B
+    ivec2(2, 0), // 2R + 2G
+    ivec2(2, 2), // 2R      + 2B
+
+    ivec2(0, 3), //      1G + 3B
+    ivec2(3, 0), // 3R + 1G
+
+    ivec2(0, 4), //           4B
+    ivec2(4, 0), // 4R
+
+    ivec2(1, 2), //  R +  G + 2B
+    ivec2(2, 1), // 2R +  G +  B
+
+    ivec2(3, 1), // 3R      +  B
+    ivec2(1, 3), //  R      + 3B
+};
+
+/* Transformed samples shared across the workgroup. process() stores at
+ * [local x + 1][local y + 1] and get_dist() reads the entries at x - 1 and/or
+ * y - 1 from there; index 0 in either dimension is never written in this
+ * file and keeps whatever the empty initializer gives it. */
+shared ivec3 pix_buf[gl_WorkGroupSize.x + 1][gl_WorkGroupSize.y + 1] = { };
+
+/* Applies the colour transform with the given coefficient pair: the first and
+ * third components become differences against the second (biased by
+ * rct_offset), and the second gets a quarter-weighted mix of both
+ * differences added to it. */
+ivec3 transform_sample(ivec3 pix, ivec2 rct_coef)
+{
+    int diff_b = pix.b - pix.g;
+    int diff_r = pix.r - pix.g;
+    int mixed  = pix.g + ((diff_r*rct_coef.x + diff_b*rct_coef.y) >> 2);
+
+    return ivec3(diff_r + rct_offset, mixed, diff_b + rct_offset);
+}
+
+/* Predicts cur from the left, top-left and top entries of pix_buf around this
+ * invocation's slot and returns mid_pred() (presumably the median - confirm)
+ * of the three per-component absolute prediction errors. */
+uint get_dist(ivec3 cur)
+{
+    const uint lx = gl_LocalInvocationID.x;
+    const uint ly = gl_LocalInvocationID.y;
+
+    /* This invocation's own sample lives at [lx + 1][ly + 1] */
+    ivec3 left     = pix_buf[lx + 0][ly + 1];
+    ivec3 top_left = pix_buf[lx + 0][ly + 0];
+    ivec3 top      = pix_buf[lx + 1][ly + 0];
+
+    ivec3 pred;
+    pred.r = predict(left.r, ivec2(top_left.r, top.r));
+    pred.g = predict(left.g, ivec2(top_left.g, top.g));
+    pred.b = predict(left.b, ivec2(top_left.b, top.b));
+
+    uvec3 err = abs(pred - cur);
+    return mid_pred(err.r, err.g, err.b);
+}
+
+/* Per-candidate score accumulators, shared by the whole workgroup. process()
+ * adds to entry i for rct_y_coeff[i]; only the first NUM_CHECKS are used. */
+shared uint score_mode[16] = { };
+
+/* Scores every candidate coefficient pair for the pixel at pos: transforms
+ * the pixel, stores the result in this invocation's pix_buf slot, and adds
+ * the prediction distance returned by get_dist() to the candidate's
+ * score_mode entry. */
+void process(ivec2 pos)
+{
+    ivec3 pix = load_components(pos);
+
+    for (int i = 0; i < NUM_CHECKS; i++) {
+        ivec3 tx_pix = transform_sample(pix, rct_y_coeff[i]);
+        pix_buf[gl_LocalInvocationID.x + 1][gl_LocalInvocationID.y + 1] = tx_pix;
+        /* NOTE(review): this is a memory barrier only, there is no barrier()
+         * here, so the neighbouring slots read by get_dist() may not hold
+         * the other invocations' samples for candidate i yet - confirm that
+         * an approximate score is acceptable. */
+        memoryBarrierShared();
+
+        uint dist = get_dist(tx_pix);
+        /* score_mode is written by every invocation, hence the atomic */
+        atomicAdd(score_mode[i], dist);
+    }
+}
+
+/* Searches the candidate RCT coefficients for the slice handled by this
+ * workgroup. Every invocation walks the slice with a stride of the workgroup
+ * size and feeds its pixels to process(); invocation (0, 0) then writes the
+ * lowest-scoring pair to sc.slice_rct_coef. */
+void coeff_search(inout SliceContext sc)
+{
+    /* Slice rectangle, derived from the workgroup's position in the grid */
+    uvec2 img_size = imageSize(src[0]);
+    uint sxs = slice_coord(img_size.x, gl_WorkGroupID.x + 0,
+                           gl_NumWorkGroups.x, 0);
+    uint sxe = slice_coord(img_size.x, gl_WorkGroupID.x + 1,
+                           gl_NumWorkGroups.x, 0);
+    uint sys = slice_coord(img_size.y, gl_WorkGroupID.y + 0,
+                           gl_NumWorkGroups.y, 0);
+    uint sye = slice_coord(img_size.y, gl_WorkGroupID.y + 1,
+                           gl_NumWorkGroups.y, 0);
+
+    for (uint y = sys + gl_LocalInvocationID.y; y < sye; y += gl_WorkGroupSize.y) {
+        for (uint x = sxs + gl_LocalInvocationID.x; x < sxe; x += gl_WorkGroupSize.x) {
+            process(ivec2(x, y));
+        }
+    }
+
+    /* Wait until every invocation has finished adding to score_mode[] before
+     * the totals are compared; without this, invocation (0, 0) could pick a
+     * minimum from partial sums. All invocations reach this point, so the
+     * barrier is in uniform control flow. */
+    memoryBarrierShared();
+    barrier();
+
+    if (gl_LocalInvocationID.x == 0 && gl_LocalInvocationID.y == 0) {
+        uint min_score = 0xFFFFFFFF;
+        uint min_idx = 3; /* rct_y_coeff[3] is ivec2(1, 1) */
+        for (int i = 0; i < NUM_CHECKS; i++) {
+            /* Strict comparison: the earliest candidate wins a tie */
+            if (score_mode[i] < min_score) {
+                min_score = score_mode[i];
+                min_idx = i;
+            }
+        }
+        sc.slice_rct_coef = rct_y_coeff[min_idx];
+    }
+}
+
+/* Entry point. Each workgroup runs the coefficient search on the slice
+ * context selected by its position in the dispatch grid; nothing is done
+ * when force_pcm is 1. */
+void main(void)
+{
+    if (force_pcm != 1) {
+        const uint slice_idx = gl_WorkGroupID.x + gl_NumWorkGroups.x*gl_WorkGroupID.y;
+        coeff_search(slice_ctx[slice_idx]);
+    }
+}
-- 
2.49.0.395.g12beb8f557c
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 17+ messages in thread

* [FFmpeg-devel] [PATCH 12/16] vulkan/ffv1: unify encode and decode get/put primitives
  2025-05-14 19:02 [FFmpeg-devel] [PATCH 01/16] ffv1enc_vulkan: merge all encoder variants into one file Lynne
                   ` (9 preceding siblings ...)
  2025-05-14 19:02 ` [FFmpeg-devel] [PATCH 11/16] ffv1enc_vulkan: implement RCT search for level >= 4 Lynne
@ 2025-05-14 19:02 ` Lynne
  2025-05-14 19:02 ` [FFmpeg-devel] [PATCH 13/16] vulkan_ffv1: pipe through slice decoding status Lynne
                   ` (3 subsequent siblings)
  14 siblings, 0 replies; 17+ messages in thread
From: Lynne @ 2025-05-14 19:02 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: Lynne

This simply adds get_rac_internal/put_rac_internal helpers that take a
precomputed range1, so the adaptive and equiprobable variants can share them.
---
 libavcodec/vulkan/rangecoder.comp | 57 +++++++++----------------------
 1 file changed, 17 insertions(+), 40 deletions(-)

diff --git a/libavcodec/vulkan/rangecoder.comp b/libavcodec/vulkan/rangecoder.comp
index 9e2c5fbecf..8687b8bc3c 100644
--- a/libavcodec/vulkan/rangecoder.comp
+++ b/libavcodec/vulkan/rangecoder.comp
@@ -95,26 +95,26 @@ void renorm_encoder(inout RangeCoder c)
 }
 #endif
 
-void put_rac_direct(inout RangeCoder c, inout uint8_t state, bool bit)
+void put_rac_internal(inout RangeCoder c, const int range1, bool bit)
 {
-    int range1 = uint16_t((c.range * state) >> 8);
-
 #ifdef DEBUG
-    if (state == 0)
-        debugPrintfEXT("Error: state is zero");
     if (range1 >= c.range)
         debugPrintfEXT("Error: range1 >= c.range");
     if (range1 <= 0)
         debugPrintfEXT("Error: range1 <= 0");
 #endif
 
-    int diff = c.range - range1;
-    c.low += bit ? diff : 0;
-    c.range = bit ? range1 : diff;
+    int ranged = c.range - range1;
+    c.low += bit ? ranged : 0;
+    c.range = bit ? range1 : ranged;
 
     if (expectEXT(c.range < 0x100, false))
         renorm_encoder(c);
+}
 
+void put_rac_direct(inout RangeCoder c, inout uint8_t state, bool bit)
+{
+    put_rac_internal(c, (c.range * state) >> 8, bit);
     state = zero_one_state[(uint(bit) << 8) + state];
 }
 
@@ -126,21 +126,7 @@ void put_rac(inout RangeCoder c, uint64_t state, bool bit)
 /* Equiprobable bit */
 void put_rac_equi(inout RangeCoder c, bool bit)
 {
-    int range1 = c.range >> 1;
-
-#ifdef DEBUG
-    if (range1 >= c.range)
-        debugPrintfEXT("Error: range1 >= c.range");
-    if (range1 <= 0)
-        debugPrintfEXT("Error: range1 <= 0");
-#endif
-
-    int diff = c.range - range1;
-    c.low += bit ? diff : 0;
-    c.range = bit ? range1 : diff;
-
-    if (expectEXT(c.range < 0x100, false))
-        renorm_encoder(c);
+    put_rac_internal(c, c.range >> 1, bit);
 }
 
 void put_rac_terminate(inout RangeCoder c)
@@ -224,11 +210,9 @@ void refill(inout RangeCoder c)
     }
 }
 
-bool get_rac_direct(inout RangeCoder c, inout uint8_t state)
+bool get_rac_internal(inout RangeCoder c, const int range1)
 {
-    int range1 = c.range * state >> 8;
     int ranged = c.range - range1;
-
     bool bit = c.low >= ranged;
     c.low -= bit ? ranged : 0;
     c.range = (bit ? 0 : ranged) + (bit ? range1 : 0);
@@ -236,6 +220,12 @@ bool get_rac_direct(inout RangeCoder c, inout uint8_t state)
     if (expectEXT(c.range < 0x100, false))
         refill(c);
 
+    return bit;
+}
+
+bool get_rac_direct(inout RangeCoder c, inout uint8_t state)
+{
+    bool bit = get_rac_internal(c, c.range * state >> 8);
     state = zero_one_state[state + (bit ? 256 : 0)];
     return bit;
 }
@@ -247,18 +237,5 @@ bool get_rac(inout RangeCoder c, uint64_t state)
 
 bool get_rac_equi(inout RangeCoder c)
 {
-    int range1 = c.range >> 1;
-
-    c.range -= range1;
-
-    bool bit = c.low >= c.range;
-    if (bit) {
-        c.low -= c.range;
-        c.range = range1;
-    }
-
-    if (expectEXT(c.range < 0x100, false))
-        refill(c);
-
-    return bit;
+    return get_rac_internal(c, c.range >> 1);
 }
-- 
2.49.0.395.g12beb8f557c
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 17+ messages in thread

* [FFmpeg-devel] [PATCH 13/16] vulkan_ffv1: pipe through slice decoding status
  2025-05-14 19:02 [FFmpeg-devel] [PATCH 01/16] ffv1enc_vulkan: merge all encoder variants into one file Lynne
                   ` (10 preceding siblings ...)
  2025-05-14 19:02 ` [FFmpeg-devel] [PATCH 12/16] vulkan/ffv1: unify encode and decode get/put primitives Lynne
@ 2025-05-14 19:02 ` Lynne
  2025-05-14 19:02 ` [FFmpeg-devel] [PATCH 14/16] vulkan: enable VK_KHR_shader_subgroup_rotate Lynne
                   ` (2 subsequent siblings)
  14 siblings, 0 replies; 17+ messages in thread
From: Lynne @ 2025-05-14 19:02 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: Lynne

---
 libavcodec/vulkan/ffv1_dec.comp       |  4 ++
 libavcodec/vulkan/ffv1_dec_setup.comp |  4 +-
 libavcodec/vulkan_decode.c            |  1 +
 libavcodec/vulkan_decode.h            |  1 +
 libavcodec/vulkan_ffv1.c              | 60 +++++++++++++++++++--------
 5 files changed, 52 insertions(+), 18 deletions(-)

diff --git a/libavcodec/vulkan/ffv1_dec.comp b/libavcodec/vulkan/ffv1_dec.comp
index e73b3f1dc0..1d33b32c6b 100644
--- a/libavcodec/vulkan/ffv1_dec.comp
+++ b/libavcodec/vulkan/ffv1_dec.comp
@@ -291,4 +291,8 @@ void main(void)
 {
     const uint slice_idx = gl_WorkGroupID.y*gl_NumWorkGroups.x + gl_WorkGroupID.x;
     decode_slice(slice_ctx[slice_idx], slice_idx);
+
+    uint32_t status = corrupt ? uint32_t(corrupt) : overread;
+    if (status != 0)
+        slice_status[2*slice_idx + 1] = status;
 }
diff --git a/libavcodec/vulkan/ffv1_dec_setup.comp b/libavcodec/vulkan/ffv1_dec_setup.comp
index a27a878927..671f28e7e7 100644
--- a/libavcodec/vulkan/ffv1_dec_setup.comp
+++ b/libavcodec/vulkan/ffv1_dec_setup.comp
@@ -133,6 +133,8 @@ void main(void)
         for (int i = 0; i < slice_size; i++)
             crc = crc_ieee[(crc & 0xFF) ^ uint32_t(bs[i].v)] ^ (crc >> 8);
 
-        slice_crc_mismatch[slice_idx] = crc;
+        slice_status[2*slice_idx + 0] = crc;
     }
+
+    slice_status[2*slice_idx + 1] = corrupt ? uint32_t(corrupt) : overread;
 }
diff --git a/libavcodec/vulkan_decode.c b/libavcodec/vulkan_decode.c
index f1313c8409..7310ba1547 100644
--- a/libavcodec/vulkan_decode.c
+++ b/libavcodec/vulkan_decode.c
@@ -142,6 +142,7 @@ static void init_frame(FFVulkanDecodeContext *dec, FFVulkanDecodePicture *vkpic)
 
     vkpic->destroy_image_view = vk->DestroyImageView;
     vkpic->wait_semaphores = vk->WaitSemaphores;
+    vkpic->invalidate_memory_ranges = vk->InvalidateMappedMemoryRanges;
 }
 
 int ff_vk_decode_prepare_frame(FFVulkanDecodeContext *dec, AVFrame *pic,
diff --git a/libavcodec/vulkan_decode.h b/libavcodec/vulkan_decode.h
index cbd22b3591..bf6506f280 100644
--- a/libavcodec/vulkan_decode.h
+++ b/libavcodec/vulkan_decode.h
@@ -114,6 +114,7 @@ typedef struct FFVulkanDecodePicture {
     /* Vulkan functions needed for destruction, as no other context is guaranteed to exist */
     PFN_vkWaitSemaphores            wait_semaphores;
     PFN_vkDestroyImageView          destroy_image_view;
+    PFN_vkInvalidateMappedMemoryRanges invalidate_memory_ranges;
 } FFVulkanDecodePicture;
 
 /**
diff --git a/libavcodec/vulkan_ffv1.c b/libavcodec/vulkan_ffv1.c
index efbf5fa953..c839f4c387 100644
--- a/libavcodec/vulkan_ffv1.c
+++ b/libavcodec/vulkan_ffv1.c
@@ -221,7 +221,7 @@ static int vk_ffv1_start_frame(AVCodecContext          *avctx,
                                   &fp->slice_status_buf,
                                   VK_BUFFER_USAGE_STORAGE_BUFFER_BIT |
                                   VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT,
-                                  NULL, f->slice_count*sizeof(uint32_t),
+                                  NULL, 2*f->slice_count*sizeof(uint32_t),
                                   VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT |
                                   VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT);
     if (err < 0)
@@ -408,7 +408,7 @@ static int vk_ffv1_end_frame(AVCodecContext *avctx)
     ff_vk_shader_update_desc_buffer(&ctx->s, exec, &fv->setup,
                                     1, 2, 0,
                                     slice_status,
-                                    0, f->slice_count*sizeof(uint32_t),
+                                    0, 2*f->slice_count*sizeof(uint32_t),
                                     VK_FORMAT_UNDEFINED);
 
     ff_vk_exec_bind_shader(&ctx->s, exec, &fv->setup);
@@ -538,10 +538,15 @@ static int vk_ffv1_end_frame(AVCodecContext *avctx)
                                   1, 1,
                                   VK_IMAGE_LAYOUT_GENERAL,
                                   VK_NULL_HANDLE);
+    ff_vk_shader_update_desc_buffer(&ctx->s, exec, decode_shader,
+                                    1, 2, 0,
+                                    slice_status,
+                                    0, 2*f->slice_count*sizeof(uint32_t),
+                                    VK_FORMAT_UNDEFINED);
     if (is_rgb)
         ff_vk_shader_update_img_array(&ctx->s, exec, decode_shader,
                                       f->picture.f, vp->view.out,
-                                      1, 2,
+                                      1, 3,
                                       VK_IMAGE_LAYOUT_GENERAL,
                                       VK_NULL_HANDLE);
 
@@ -700,8 +705,8 @@ static int init_setup_shader(FFV1Context *f, FFVulkanContext *s,
             .type        = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
             .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
             .mem_quali   = "writeonly",
-            .buf_content = "uint32_t slice_crc_mismatch",
-            .buf_elems   = f->max_slice_count,
+            .buf_content = "uint32_t slice_status",
+            .buf_elems   = 2*f->max_slice_count,
         },
     };
     RET(ff_vk_shader_add_descriptor_set(s, shd, desc_set, 3, 0, 0));
@@ -895,6 +900,14 @@ static int init_decode_shader(FFV1Context *f, FFVulkanContext *s,
             .elems      = av_pix_fmt_count_planes(dec_frames_ctx->sw_format),
             .stages     = VK_SHADER_STAGE_COMPUTE_BIT,
         },
+        {
+            .name        = "slice_status_buf",
+            .type        = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
+            .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
+            .mem_quali   = "writeonly",
+            .buf_content = "uint32_t slice_status",
+            .buf_elems   = 2*f->max_slice_count,
+        },
         {
             .name       = "dst",
             .type       = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
@@ -906,7 +919,7 @@ static int init_decode_shader(FFV1Context *f, FFVulkanContext *s,
             .stages     = VK_SHADER_STAGE_COMPUTE_BIT,
         },
     };
-    RET(ff_vk_shader_add_descriptor_set(s, shd, desc_set, 2 + rgb, 0, 0));
+    RET(ff_vk_shader_add_descriptor_set(s, shd, desc_set, 3 + rgb, 0, 0));
 
     GLSLD(ff_source_ffv1_dec_comp);
 
@@ -1114,22 +1127,35 @@ fail:
 
 static void vk_ffv1_free_frame_priv(AVRefStructOpaque _hwctx, void *data)
 {
-    AVHWDeviceContext *hwctx = _hwctx.nc;
+    AVHWDeviceContext *dev_ctx = _hwctx.nc;
+    AVVulkanDeviceContext *hwctx = dev_ctx->hwctx;
 
     FFv1VulkanDecodePicture *fp = data;
     FFVulkanDecodePicture *vp = &fp->vp;
+    FFVkBuffer *slice_status = (FFVkBuffer *)fp->slice_status_buf->data;
 
-    ff_vk_decode_free_frame(hwctx, vp);
+    ff_vk_decode_free_frame(dev_ctx, vp);
+
+    /* Invalidate slice/output data if needed */
+    if (!(slice_status->flags & VK_MEMORY_PROPERTY_HOST_COHERENT_BIT)) {
+        VkMappedMemoryRange invalidate_data = {
+            .sType = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE,
+            .memory = slice_status->mem,
+            .offset = 0,
+            .size = 2*fp->slice_num*sizeof(uint32_t),
+        };
+        vp->invalidate_memory_ranges(hwctx->act_dev,
+                                     1, &invalidate_data);
+    }
 
-    if (fp->crc_checked) {
-        FFVkBuffer *slice_status = (FFVkBuffer *)fp->slice_status_buf->data;
-        for (int i = 0; i < fp->slice_num; i++) {
-            uint32_t crc_res;
-            crc_res = AV_RN32(slice_status->mapped_mem + i*sizeof(uint32_t));
-            if (crc_res != 0)
-                av_log(hwctx, AV_LOG_ERROR, "CRC mismatch in slice %i, res: 0x%x\n",
-                       i, crc_res);
-        }
+    for (int i = 0; i < fp->slice_num; i++) {
+        uint32_t crc_res = 0;
+        if (fp->crc_checked)
+            crc_res = AV_RN32(slice_status->mapped_mem + 2*i*sizeof(uint32_t) + 0);
+        uint32_t status = AV_RN32(slice_status->mapped_mem + 2*i*sizeof(uint32_t) + 4);
+        if (status || crc_res)
+            av_log(dev_ctx, AV_LOG_ERROR, "Slice %i status: 0x%x, CRC 0x%x\n",
+                   i, status, crc_res);
     }
 
     av_buffer_unref(&vp->slices_buf);
-- 
2.49.0.395.g12beb8f557c
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 17+ messages in thread

* [FFmpeg-devel] [PATCH 14/16] vulkan: enable VK_KHR_shader_subgroup_rotate
  2025-05-14 19:02 [FFmpeg-devel] [PATCH 01/16] ffv1enc_vulkan: merge all encoder variants into one file Lynne
                   ` (11 preceding siblings ...)
  2025-05-14 19:02 ` [FFmpeg-devel] [PATCH 13/16] vulkan_ffv1: pipe through slice decoding status Lynne
@ 2025-05-14 19:02 ` Lynne
  2025-05-14 19:02 ` [FFmpeg-devel] [PATCH 15/16] hwcontext_vulkan: correct image transfer usage flags Lynne
  2025-05-14 19:02 ` [FFmpeg-devel] [PATCH 16/16] hwcontext_vulkan: only try exporting DMABUF memory on !WIN32 and only for DMABUF tiling Lynne
  14 siblings, 0 replies; 17+ messages in thread
From: Lynne @ 2025-05-14 19:02 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: Lynne

Yet another thing that should always have been present.
---
 libavutil/hwcontext_vulkan.c | 5 +++++
 libavutil/vulkan_functions.h | 1 +
 libavutil/vulkan_loader.h    | 1 +
 3 files changed, 7 insertions(+)

diff --git a/libavutil/hwcontext_vulkan.c b/libavutil/hwcontext_vulkan.c
index 978d7e29d3..eded36bc01 100644
--- a/libavutil/hwcontext_vulkan.c
+++ b/libavutil/hwcontext_vulkan.c
@@ -79,6 +79,7 @@ typedef struct VulkanDeviceFeatures {
     VkPhysicalDeviceVulkan12Features vulkan_1_2;
     VkPhysicalDeviceVulkan13Features vulkan_1_3;
     VkPhysicalDeviceTimelineSemaphoreFeatures timeline_semaphore;
+    VkPhysicalDeviceShaderSubgroupRotateFeaturesKHR subgroup_rotate;
 
 #ifdef VK_KHR_shader_expect_assume
     VkPhysicalDeviceShaderExpectAssumeFeaturesKHR expect_assume;
@@ -205,6 +206,8 @@ static void device_features_init(AVHWDeviceContext *ctx, VulkanDeviceFeatures *f
 
     FF_VK_STRUCT_EXT(s, &feats->device, &feats->timeline_semaphore, FF_VK_EXT_PORTABILITY_SUBSET,
                      VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_TIMELINE_SEMAPHORE_FEATURES);
+    FF_VK_STRUCT_EXT(s, &feats->device, &feats->subgroup_rotate, FF_VK_EXT_SUBGROUP_ROTATE,
+                     VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_SUBGROUP_ROTATE_FEATURES);
 
 #ifdef VK_KHR_shader_expect_assume
     FF_VK_STRUCT_EXT(s, &feats->device, &feats->expect_assume, FF_VK_EXT_EXPECT_ASSUME,
@@ -283,6 +286,7 @@ static void device_features_copy_needed(VulkanDeviceFeatures *dst, VulkanDeviceF
     COPY_VAL(vulkan_1_3.dynamicRendering);
 
     COPY_VAL(timeline_semaphore.timelineSemaphore);
+    COPY_VAL(subgroup_rotate.shaderSubgroupRotate);
 
     COPY_VAL(video_maintenance_1.videoMaintenance1);
 #ifdef VK_KHR_video_maintenance2
@@ -588,6 +592,7 @@ static const VulkanOptExtension optional_device_exts[] = {
     { VK_KHR_COOPERATIVE_MATRIX_EXTENSION_NAME,               FF_VK_EXT_COOP_MATRIX            },
     { VK_NV_OPTICAL_FLOW_EXTENSION_NAME,                      FF_VK_EXT_OPTICAL_FLOW           },
     { VK_EXT_SHADER_OBJECT_EXTENSION_NAME,                    FF_VK_EXT_SHADER_OBJECT          },
+    { VK_KHR_SHADER_SUBGROUP_ROTATE_EXTENSION_NAME,           FF_VK_EXT_SUBGROUP_ROTATE        },
 #ifdef VK_KHR_shader_expect_assume
     { VK_KHR_SHADER_EXPECT_ASSUME_EXTENSION_NAME,             FF_VK_EXT_EXPECT_ASSUME          },
 #endif
diff --git a/libavutil/vulkan_functions.h b/libavutil/vulkan_functions.h
index cd61d71577..8b413013e6 100644
--- a/libavutil/vulkan_functions.h
+++ b/libavutil/vulkan_functions.h
@@ -48,6 +48,7 @@ typedef uint64_t FFVulkanExtensions;
 #define FF_VK_EXT_PUSH_DESCRIPTOR        (1ULL << 14) /* VK_KHR_push_descriptor */
 #define FF_VK_EXT_RELAXED_EXTENDED_INSTR (1ULL << 15) /* VK_KHR_shader_relaxed_extended_instruction */
 #define FF_VK_EXT_EXPECT_ASSUME          (1ULL << 16) /* VK_KHR_shader_expect_assume */
+#define FF_VK_EXT_SUBGROUP_ROTATE        (1ULL << 17) /* VK_KHR_shader_subgroup_rotate */
 
 /* Video extensions */
 #define FF_VK_EXT_VIDEO_QUEUE            (1ULL << 36) /* VK_KHR_video_queue */
diff --git a/libavutil/vulkan_loader.h b/libavutil/vulkan_loader.h
index eaf6e2e6bb..a7976fe560 100644
--- a/libavutil/vulkan_loader.h
+++ b/libavutil/vulkan_loader.h
@@ -58,6 +58,7 @@ static inline uint64_t ff_vk_extensions_to_mask(const char * const *extensions,
         { VK_KHR_COOPERATIVE_MATRIX_EXTENSION_NAME,        FF_VK_EXT_COOP_MATRIX            },
         { VK_NV_OPTICAL_FLOW_EXTENSION_NAME,               FF_VK_EXT_OPTICAL_FLOW           },
         { VK_EXT_SHADER_OBJECT_EXTENSION_NAME,             FF_VK_EXT_SHADER_OBJECT          },
+        { VK_KHR_SHADER_SUBGROUP_ROTATE_EXTENSION_NAME,    FF_VK_EXT_SUBGROUP_ROTATE        },
         { VK_KHR_VIDEO_MAINTENANCE_1_EXTENSION_NAME,       FF_VK_EXT_VIDEO_MAINTENANCE_1    },
 #ifdef VK_KHR_video_maintenance2
         { VK_KHR_VIDEO_MAINTENANCE_2_EXTENSION_NAME,       FF_VK_EXT_VIDEO_MAINTENANCE_2    },
-- 
2.49.0.395.g12beb8f557c
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 17+ messages in thread

* [FFmpeg-devel] [PATCH 15/16] hwcontext_vulkan: correct image transfer usage flags
  2025-05-14 19:02 [FFmpeg-devel] [PATCH 01/16] ffv1enc_vulkan: merge all encoder variants into one file Lynne
                   ` (12 preceding siblings ...)
  2025-05-14 19:02 ` [FFmpeg-devel] [PATCH 14/16] vulkan: enable VK_KHR_shader_subgroup_rotate Lynne
@ 2025-05-14 19:02 ` Lynne
  2025-05-14 19:02 ` [FFmpeg-devel] [PATCH 16/16] hwcontext_vulkan: only try exporting DMABUF memory on !WIN32 and only for DMABUF tiling Lynne
  14 siblings, 0 replies; 17+ messages in thread
From: Lynne @ 2025-05-14 19:02 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: Lynne

By pure coincidence, the VK_BUFFER_USAGE_* and VK_IMAGE_USAGE_* flag values
are equal for the two transfer usage types (TRANSFER_SRC and TRANSFER_DST).
---
 libavutil/hwcontext_vulkan.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/libavutil/hwcontext_vulkan.c b/libavutil/hwcontext_vulkan.c
index eded36bc01..9f9df91e5d 100644
--- a/libavutil/hwcontext_vulkan.c
+++ b/libavutil/hwcontext_vulkan.c
@@ -2784,8 +2784,8 @@ static int vulkan_frames_init(AVHWFramesContext *hwfc)
 
     /* Image usage flags */
     if (!hwctx->usage) {
-        hwctx->usage = supported_usage & (VK_BUFFER_USAGE_TRANSFER_DST_BIT |
-                                          VK_BUFFER_USAGE_TRANSFER_SRC_BIT |
+        hwctx->usage = supported_usage & (VK_IMAGE_USAGE_TRANSFER_DST_BIT |
+                                          VK_IMAGE_USAGE_TRANSFER_SRC_BIT |
                                           VK_IMAGE_USAGE_STORAGE_BIT       |
                                           VK_IMAGE_USAGE_SAMPLED_BIT);
 
-- 
2.49.0.395.g12beb8f557c
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 17+ messages in thread

* [FFmpeg-devel] [PATCH 16/16] hwcontext_vulkan: only try exporting DMABUF memory on !WIN32 and only for DMABUF tiling
  2025-05-14 19:02 [FFmpeg-devel] [PATCH 01/16] ffv1enc_vulkan: merge all encoder variants into one file Lynne
                   ` (13 preceding siblings ...)
  2025-05-14 19:02 ` [FFmpeg-devel] [PATCH 15/16] hwcontext_vulkan: correct image transfer usage flags Lynne
@ 2025-05-14 19:02 ` Lynne
  14 siblings, 0 replies; 17+ messages in thread
From: Lynne @ 2025-05-14 19:02 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: Lynne

---
 libavutil/hwcontext_vulkan.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/libavutil/hwcontext_vulkan.c b/libavutil/hwcontext_vulkan.c
index 9f9df91e5d..4f205137eb 100644
--- a/libavutil/hwcontext_vulkan.c
+++ b/libavutil/hwcontext_vulkan.c
@@ -2643,11 +2643,12 @@ static AVBufferRef *vulkan_pool_alloc(void *opaque, size_t size)
     if (p->vkctx.extensions & FF_VK_EXT_EXTERNAL_FD_MEMORY)
         try_export_flags(hwfc, &eiinfo.handleTypes, &e,
                          VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT);
-#endif
 
-    if (p->vkctx.extensions & FF_VK_EXT_EXTERNAL_DMABUF_MEMORY)
+    if (p->vkctx.extensions & FF_VK_EXT_EXTERNAL_DMABUF_MEMORY &&
+        hwctx->tiling == VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT)
         try_export_flags(hwfc, &eiinfo.handleTypes, &e,
                          VK_EXTERNAL_MEMORY_HANDLE_TYPE_DMA_BUF_BIT_EXT);
+#endif
 
     for (int i = 0; i < av_pix_fmt_count_planes(hwfc->sw_format); i++) {
         eminfo[i].sType       = VK_STRUCTURE_TYPE_EXPORT_MEMORY_ALLOCATE_INFO;
-- 
2.49.0.395.g12beb8f557c
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 17+ messages in thread

* [FFmpeg-devel] [PATCH] ffv1enc_vulkan: fix array overflow
  2025-05-14 19:02 ` [FFmpeg-devel] [PATCH 06/16] ffv1enc_vulkan: switch to 2-line cache, unify prediction code Lynne
@ 2025-05-23 14:38   ` Jerome Martinez
  0 siblings, 0 replies; 17+ messages in thread
From: Jerome Martinez @ 2025-05-23 14:38 UTC (permalink / raw)
  To: ffmpeg-devel

[-- Attachment #1: Type: text/plain, Size: 206 bytes --]

Fix a crash (with some GCC builds) or a silent exit (with the Microsoft
compiler) that occurs after applying
"[PATCH 06/16] ffv1enc_vulkan: switch to 2-line cache, unify prediction code":
https://ffmpeg.org/pipermail/ffmpeg-devel/2025-May/343502.html

[-- Attachment #2: 0001-vulkan-ffv1-fix-array-overflow.patch --]
[-- Type: text/plain, Size: 739 bytes --]

From: Maxime Gervais <maxime@mediaarea.net>
Date: Fri, 23 May 2025 16:20:41 +0200
Subject: [PATCH] ffv1enc_vulkan: fix array overflow

---
 libavcodec/ffv1enc_vulkan.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/libavcodec/ffv1enc_vulkan.c b/libavcodec/ffv1enc_vulkan.c
index d9e12f5fae..15aaddac98 100644
--- a/libavcodec/ffv1enc_vulkan.c
+++ b/libavcodec/ffv1enc_vulkan.c
@@ -1358,6 +1358,8 @@ static int init_encode_shader(AVCodecContext *avctx, FFVkSPIRVCompiler *spv)
             .mem_quali   = "writeonly",
             .buf_content = "uint64_t slice_results[2048];",
         },
+        { /* place holder for desc_set[3] */
+        },
     };
     if (fv->is_rgb) {
         AVHWFramesContext *intermediate_frames_ctx;

[-- Attachment #3: Type: text/plain, Size: 251 bytes --]

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 17+ messages in thread

end of thread, other threads:[~2025-05-23 14:39 UTC | newest]

Thread overview: 17+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2025-05-14 19:02 [FFmpeg-devel] [PATCH 01/16] ffv1enc_vulkan: merge all encoder variants into one file Lynne
2025-05-14 19:02 ` [FFmpeg-devel] [PATCH 02/16] vulkan/ffv1: synchronize get_pred implementations between encoder and decoder Lynne
2025-05-14 19:02 ` [FFmpeg-devel] [PATCH 03/16] ffv1enc_vulkan: get rid of temporary data for the setup shader Lynne
2025-05-14 19:02 ` [FFmpeg-devel] [PATCH 04/16] ffv1enc_vulkan: unify EC code between setup and encode Lynne
2025-05-14 19:02 ` [FFmpeg-devel] [PATCH 05/16] ffv1enc_vulkan: minor EC optimizations Lynne
2025-05-14 19:02 ` [FFmpeg-devel] [PATCH 06/16] ffv1enc_vulkan: switch to 2-line cache, unify prediction code Lynne
2025-05-23 14:38   ` [FFmpeg-devel] [PATCH] ffv1enc_vulkan: fix array overflow Jerome Martinez
2025-05-14 19:02 ` [FFmpeg-devel] [PATCH 07/16] ffv1_common: minor RGB optimization Lynne
2025-05-14 19:02 ` [FFmpeg-devel] [PATCH 08/16] ffv1enc_vulkan: use ff_get_encode_buffer Lynne
2025-05-14 19:02 ` [FFmpeg-devel] [PATCH 09/16] vulkan_ffv1: fix PCM + cached symbol reader Lynne
2025-05-14 19:02 ` [FFmpeg-devel] [PATCH 10/16] ffv1enc_vulkan: implement the cached EC writer from the decoder Lynne
2025-05-14 19:02 ` [FFmpeg-devel] [PATCH 11/16] ffv1enc_vulkan: implement RCT search for level >= 4 Lynne
2025-05-14 19:02 ` [FFmpeg-devel] [PATCH 12/16] vulkan/ffv1: unify encode and decode get/put primitives Lynne
2025-05-14 19:02 ` [FFmpeg-devel] [PATCH 13/16] vulkan_ffv1: pipe through slice decoding status Lynne
2025-05-14 19:02 ` [FFmpeg-devel] [PATCH 14/16] vulkan: enable VK_KHR_shader_subgroup_rotate Lynne
2025-05-14 19:02 ` [FFmpeg-devel] [PATCH 15/16] hwcontext_vulkan: correct image transfer usage flags Lynne
2025-05-14 19:02 ` [FFmpeg-devel] [PATCH 16/16] hwcontext_vulkan: only try exporting DMABUF memory on !WIN32 and only for DMABUF tiling Lynne

Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

This inbox may be cloned and mirrored by anyone:

	git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
		ffmpegdev@gitmailbox.com
	public-inbox-index ffmpegdev

Example config snippet for mirrors.


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git