From mboxrd@z Thu Jan  1 00:00:00 1970
Return-Path: <ffmpeg-devel-bounces@ffmpeg.org>
Received: from ffbox0-bg.mplayerhq.hu (ffbox0-bg.ffmpeg.org [79.124.17.100])
	by master.gitmailbox.com (Postfix) with ESMTPS id 74C864CA3D
	for <ffmpegdev@gitmailbox.com>; Sat, 12 Apr 2025 07:26:33 +0000 (UTC)
Received: from [127.0.1.1] (localhost [127.0.0.1])
	by ffbox0-bg.mplayerhq.hu (Postfix) with ESMTP id E767268C7E7;
	Sat, 12 Apr 2025 10:23:31 +0300 (EEST)
Received: from vidala.pars.ee (vidala.pars.ee [116.203.72.101])
 by ffbox0-bg.mplayerhq.hu (Postfix) with ESMTPS id BEC6468C4D1
 for <ffmpeg-devel@ffmpeg.org>; Sat, 12 Apr 2025 10:23:03 +0300 (EEST)
DKIM-Signature: v=1; a=rsa-sha256; s=202405r; d=lynne.ee; c=relaxed/relaxed;
 h=Message-ID:Date:Subject:To:From; t=1744442583; bh=KaLnbNI2lac3z62sBMyANmF
 fU4usOGtenbXWQmgmpcE=; b=UYEfOuva02lG/LdmiT0T54kb0vNECAcibWl3SamTbBttA0/Jqv
 L+8Ko7tRwBrbIFfPjBy8Habg27nyi9qoCXjVlMx4GGGOP20CI50wt8gIkGEmPZVdC+5T/vbJCX+
 Q4OKrvR2VJHQkiLaQNE+npywNS+YQqNVfgIC9Ti13Qw92sHiwI28ktJS75NvwYxuhNCER0YZ8PJ
 UnwUszpl+9Lef5SrsHkFypr6NK+dYqAcIUOAH6drQhZGCaxFP7rD9pCt3NtSxaV5jrJAnsgW4mD
 G2Mj1AfoyiszK7xEK3bT3uWKfgMfI8iH/FWuvwwY0u8Ig8lQyAlo0FYzmq3WjQOIaBA==;
DKIM-Signature: v=1; a=ed25519-sha256; s=202405e; d=lynne.ee; c=relaxed/relaxed;
 h=Message-ID:Date:Subject:To:From; t=1744442583; bh=KaLnbNI2lac3z62sBMyANmF
 fU4usOGtenbXWQmgmpcE=; b=w8P8BPUDgHBuLaT/wTl6rhw2L82Nt5EsYCwf4kalUyzB5i0JED
 vzwOjiO1BKsD4cX+feeinBxBfgF4KjGMRJDg==;
From: Lynne <dev@lynne.ee>
To: ffmpeg-devel@ffmpeg.org
Date: Sat, 12 Apr 2025 09:22:49 +0200
Message-ID: <20250412072256.77815-18-dev@lynne.ee>
X-Mailer: git-send-email 2.49.0
In-Reply-To: <20250412072256.77815-1-dev@lynne.ee>
References: <20250412072256.77815-1-dev@lynne.ee>
MIME-Version: 1.0
Subject: [FFmpeg-devel] [PATCH 18/18] vulkan_ffv1: add cached symbol reader
 for AMD
X-BeenThere: ffmpeg-devel@ffmpeg.org
X-Mailman-Version: 2.1.29
Precedence: list
List-Id: FFmpeg development discussions and patches <ffmpeg-devel.ffmpeg.org>
List-Unsubscribe: <https://ffmpeg.org/mailman/options/ffmpeg-devel>,
 <mailto:ffmpeg-devel-request@ffmpeg.org?subject=unsubscribe>
List-Archive: <https://ffmpeg.org/pipermail/ffmpeg-devel>
List-Post: <mailto:ffmpeg-devel@ffmpeg.org>
List-Help: <mailto:ffmpeg-devel-request@ffmpeg.org?subject=help>
List-Subscribe: <https://ffmpeg.org/mailman/listinfo/ffmpeg-devel>,
 <mailto:ffmpeg-devel-request@ffmpeg.org?subject=subscribe>
Reply-To: FFmpeg development discussions and patches <ffmpeg-devel@ffmpeg.org>
Cc: Lynne <dev@lynne.ee>
Content-Type: text/plain; charset="us-ascii"
Content-Transfer-Encoding: 7bit
Errors-To: ffmpeg-devel-bounces@ffmpeg.org
Sender: "ffmpeg-devel" <ffmpeg-devel-bounces@ffmpeg.org>
Archived-At: <https://master.gitmailbox.com/ffmpegdev/20250412072256.77815-18-dev@lynne.ee/>
List-Archive: <https://master.gitmailbox.com/ffmpegdev/>
List-Post: <mailto:ffmpegdev@gitmailbox.com>

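Speeds up range-coder decoding on AMD (RADV driver) by 3x.
This uses 32 local invocations to load the context state into shared
memory, which acts as a cache, as well as to do the RCT writeout faster.
The cached reader is not enabled for Golomb-Rice coding or other drivers.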
---
 libavcodec/vulkan/ffv1_dec.comp | 71 ++++++++++++++++++++-------------
 libavcodec/vulkan_ffv1.c        |  7 +++-
 2 files changed, 50 insertions(+), 28 deletions(-)

diff --git a/libavcodec/vulkan/ffv1_dec.comp b/libavcodec/vulkan/ffv1_dec.comp
index 9eba322b27..3c46ee1771 100644
--- a/libavcodec/vulkan/ffv1_dec.comp
+++ b/libavcodec/vulkan/ffv1_dec.comp
@@ -108,34 +108,37 @@ ivec2 get_pred(ivec2 sp, ivec2 off, int p, int sw, uint8_t quant_table_idx)
 #endif
 
 #ifndef GOLOMB
-int get_isymbol(inout RangeCoder c, uint64_t state)
+#ifdef CACHED_SYMBOL_READER
+shared uint8_t state[CONTEXT_SIZE];
+#define READ(c, off) get_rac_direct(c, state[off])
+#else
+#define READ(c, off) get_rac(c, uint64_t(slice_state) + state_off + off)
+#endif
+
+int get_isymbol(inout RangeCoder c, uint state_off)
 {
-    if (expectEXT(get_rac(c, state), false))
+    if (expectEXT(READ(c, 0), false))
         return 0;
 
-    state += 1;
-
-    int e;
-    for (e = 0; e < 32; e++)
-        if (!get_rac(c, state + min(e, 9)))
+    int e = 1;
+    for (; e < 33; e++)
+        if (!READ(c, min(e, 10)))
             break;
 
-    if (expectEXT(e == 0, false)) {
-        return get_rac(c, state + 10) ? -1 : 1;
-    } else if (expectEXT(e > 31, false)) {
+    if (expectEXT(e == 1, false)) {
+        return READ(c, 11) ? -1 : 1;
+    } else if (expectEXT(e == 33, false)) {
         corrupt = true;
         return 0;
     }
 
-    state += 21;
-
     int a = 1;
-    for (int i = e - 1; i >= 0; i--) {
+    for (int i = e + 20; i >= 22; i--) {
         a <<= 1;
-        a |= int(get_rac(c, state + min(i, 9)));  // 22..31
+        a |= int(READ(c, min(i, 31)));
     }
 
-    return get_rac(c, state - 11 + min(e, 10)) ? -a : a;
+    return READ(c, min(e + 10, 21)) ? -a : a;
 }
 
 void decode_line_pcm(inout SliceContext sc, ivec2 sp, int w, int y, int p, int bits)
@@ -157,7 +160,7 @@ void decode_line_pcm(inout SliceContext sc, ivec2 sp, int w, int y, int p, int b
 }
 
 void decode_line(inout SliceContext sc, ivec2 sp, int w,
-                 int y, int p, int bits, uint64_t state,
+                 int y, int p, int bits, uint state_off,
                  uint8_t quant_table_idx, const int run_index)
 {
 #ifndef RGB
@@ -171,19 +174,33 @@ void decode_line(inout SliceContext sc, ivec2 sp, int w,
         ivec2 pr = get_pred(sp, ivec2(x, y), p, w,
                             quant_table_idx);
 
-        int diff = get_isymbol(sc.c, state + CONTEXT_SIZE*abs(pr[0]));
-        if (pr[0] < 0)
-            diff = -diff;
+        uint context_off = state_off + CONTEXT_SIZE*abs(pr[0]);
+#ifdef CACHED_SYMBOL_READER
+        u8buf sb = u8buf(uint64_t(slice_state) + context_off + gl_LocalInvocationID.x);
+        state[gl_LocalInvocationID.x] = sb.v;
+        barrier();
+        if (gl_LocalInvocationID.x == 0) {
 
-        uint v = zero_extend(pr[1] + diff, bits);
-        imageStore(dec[p], sp + LADDR(ivec2(x, y)), uvec4(v));
+#endif
+
+            int diff = get_isymbol(sc.c, context_off);
+            if (pr[0] < 0)
+                diff = -diff;
+
+            uint v = zero_extend(pr[1] + diff, bits);
+            imageStore(dec[p], sp + LADDR(ivec2(x, y)), uvec4(v));
+
+#ifdef CACHED_SYMBOL_READER
+        }
+        sb.v = state[gl_LocalInvocationID.x];
+#endif
     }
 }
 
 #else /* GOLOMB */
 
 void decode_line(inout SliceContext sc, ivec2 sp, int w,
-                 int y, int p, int bits, uint64_t state,
+                 int y, int p, int bits, uint state_off,
                  uint8_t quant_table_idx, inout int run_index)
 {
 #ifndef RGB
@@ -202,7 +219,7 @@ void decode_line(inout SliceContext sc, ivec2 sp, int w,
         ivec2 pr = get_pred(sp, ivec2(x, y), p, w,
                             quant_table_idx);
 
-        VlcState sb = VlcState(state + VLC_STATE_SIZE*abs(pr[0]));
+        VlcState sb = VlcState(uint64_t(slice_state) + state_off + VLC_STATE_SIZE*abs(pr[0]));
 
         if (pr[0] == 0 && run_mode == 0)
             run_mode = 1;
@@ -263,7 +280,7 @@ ivec4 transform_sample(ivec4 pix, ivec2 rct_coef)
 
 void writeout_rgb(in SliceContext sc, ivec2 sp, int w, int y, bool apply_rct)
 {
-    for (int x = 0; x < w; x++) {
+    for (uint x = gl_LocalInvocationID.x; x < w; x += gl_WorkGroupSize.x) {
         ivec2 lpos = sp + LADDR(ivec2(x, y));
         ivec2 pos = sc.slice_pos + ivec2(x, y);
 
@@ -305,6 +322,8 @@ void decode_slice(inout SliceContext sc, const uint slice_idx)
     /* PCM coding */
 #ifndef GOLOMB
     if (sc.slice_coding_mode == 1) {
+        if (gl_LocalInvocationID.x > 0)
+            return;
 #ifndef RGB
         for (int p = 0; p < planes; p++) {
             int h = sc.slice_dim.y;
@@ -328,9 +347,7 @@ void decode_slice(inout SliceContext sc, const uint slice_idx)
 #endif
     {
         u8vec4 quant_table_idx = sc.quant_table_idx.xyyz;
-        u64vec4 slice_state_off = (uint64_t(slice_state) +
-                                   slice_idx*plane_state_size*codec_planes) +
-                                  plane_state_size*uvec4(0, 1, 1, 2);
+        u32vec4 slice_state_off = (slice_idx*codec_planes + uvec4(0, 1, 1, 2))*plane_state_size;
 
 #ifndef RGB
         for (int p = 0; p < planes; p++) {
diff --git a/libavcodec/vulkan_ffv1.c b/libavcodec/vulkan_ffv1.c
index c1875711bc..33c4e9114d 100644
--- a/libavcodec/vulkan_ffv1.c
+++ b/libavcodec/vulkan_ffv1.c
@@ -823,12 +823,14 @@ static int init_decode_shader(FFV1Context *f, FFVulkanContext *s,
     uint8_t *spv_data;
     size_t spv_len;
     void *spv_opaque = NULL;
+    int use_cached_reader = ac != AC_GOLOMB_RICE &&
+                            s->driver_props.driverID == VK_DRIVER_ID_MESA_RADV;
 
     RET(ff_vk_shader_init(s, shd, "ffv1_dec",
                           VK_SHADER_STAGE_COMPUTE_BIT,
                           (const char *[]) { "GL_EXT_buffer_reference",
                                              "GL_EXT_buffer_reference2" }, 2,
-                          1, 1, 1,
+                          use_cached_reader ? 32 : 1, 1, 1,
                           0));
 
     if (ac == AC_GOLOMB_RICE)
@@ -837,6 +839,9 @@ static int init_decode_shader(FFV1Context *f, FFVulkanContext *s,
     if (rgb)
         av_bprintf(&shd->src, "#define RGB\n");
 
+    if (use_cached_reader)
+        av_bprintf(&shd->src, "#define CACHED_SYMBOL_READER 1\n");
+
     /* Common codec header */
     GLSLD(ff_source_common_comp);
 
-- 
2.47.2
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".