From: my4ng via ffmpeg-devel <ffmpeg-devel@ffmpeg.org>
To: ffmpeg-devel@ffmpeg.org
Cc: my4ng <code@ffmpeg.org>
Subject: [FFmpeg-devel] [PATCH] avfilter/vf_nlmeans_vulkan: rewrite filter (PR #20689)
Date: Sat, 11 Oct 2025 05:25:43 -0000
Message-ID: <176016034378.49.16726346481155709312@bf249f23a2c8> (raw)

PR #20689 opened by my4ng
URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/20689
Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/20689.patch

This is a major rewrite of the existing nlmeans Vulkan code, with bug fixes and major performance improvements.

- Fix visual artifacts reported in tickets #10661 and #10733.
- Add OOB checks for image loads and for the patch-sized area around the border.
- Correct the chroma plane height, strength, and buffer barrier index.
- Improve parallelism with a per-component workgroup axis and more, smaller workgroups.
- Split the weights pass into vertical/horizontal (integral) passes and a separate weights pass.
- Remove the h/v ordering logic and always compute the sum in the vertical pass.
- Remove the atomic-float requirement, which caused heavy memory-locking contention, at the cost of a larger weights/sums buffer.
- Use cache blocking in the horizontal pass to reduce memory bandwidth usage.
- Change the default parallelism `t` to 8, with range `[1,64]`.

Performance, measured on an AMD Radeon RX 9070 XT (RADV GFX1201) with the optimal `t` in each case:

`ffmpeg -init_hw_device vulkan -f lavfi -i color=black:s=1920x1080:d=30 -vf "format=nv12,hwupload,nlmeans_vulkan=t=<t>" -f null -`

| Resolution | Speed before (t=16) | Speed after (t=8) | Improvement |
|------------|---------------------|-------------------|-------------|
| 848x480    | 3.26x | 7.91x | 2.42x |
| 1280x720   | 1.19x | 3.00x | 2.52x |
| 1920x1080  | 0.55x | 1.42x | 2.58x |
| 2560x1440  | 0.29x | 0.77x | 2.66x |
| 3840x2160  | 0.12x | 0.35x | 2.92x |

From 16b9284620ae9e2a518c04ec257eed3567ecd6ca Mon Sep 17 00:00:00 2001
From: Michael Yang <admin@my4ng.dev>
Date: Tue, 30 Sep 2025 12:40:21 +1000
Subject: [PATCH] avfilter/vf_nlmeans_vulkan: rewrite filter

This is a major rewrite of the existing nlmeans Vulkan code, with bug
fixes and major performance improvements. Fix visual artifacts reported
in tickets #10661 and #10733. Add OOB checks for image loads and for
the patch-sized area around the border. Correct the chroma plane
height, strength and buffer barrier index. Improve parallelism with a
per-component workgroup axis and more, smaller workgroups. Split the
weights pass into vertical/horizontal (integral) passes and a separate
weights pass. Remove the h/v ordering logic and always compute the sum
in the vertical pass. Remove the atomic-float requirement, which caused
heavy memory-locking contention, at the cost of a larger weights/sums
buffer. Use cache blocking in the horizontal pass to reduce memory
bandwidth usage.
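For reviewers unfamiliar with the scheme: the vertical and horizontal passes build, per research-window offset, an integral image (summed-area table) of squared pixel differences, and the weights pass then gets each pixel's patch SSD from four corner lookups (the `d + a - b - c` pattern in the shader). The following is a minimal CPU-side sketch of that idea in plain C; it is an illustration only, not the patch's GLSL, and the function names, buffer layout and weight normalisation are assumptions made for clarity.

```c
#include <math.h>

/* Build an inclusive 2D prefix sum ("integral image") of the squared
 * difference between a plane and a copy of itself shifted by (dx, dy).
 * The Vulkan shaders do the same work in two passes: the vertical pass
 * computes the squared differences and the column prefix sums, and the
 * horizontal pass finishes the row prefix sums. */
static void build_integral_ssd(const float *src, int w, int h,
                               int dx, int dy, float *integral)
{
    /* Vertical pass: squared difference, accumulated as a column sum. */
    for (int x = 0; x < w; x++) {
        float colsum = 0.0f;
        for (int y = 0; y < h; y++) {
            int ox = x + dx, oy = y + dy;
            float s1 = src[y * w + x];
            /* Out-of-bounds neighbours contribute a zero difference,
             * in the spirit of the OOB checks added by the patch. */
            float s2 = (ox >= 0 && ox < w && oy >= 0 && oy < h)
                       ? src[oy * w + ox] : s1;
            colsum += (s1 - s2) * (s1 - s2);
            integral[y * w + x] = colsum;
        }
    }
    /* Horizontal pass: row prefix sum over the column sums, yielding
     * the full 2D integral image. */
    for (int y = 0; y < h; y++) {
        float rowsum = 0.0f;
        for (int x = 0; x < w; x++) {
            rowsum += integral[y * w + x];
            integral[y * w + x] = rowsum;
        }
    }
}

/* Patch SSD and NL-means weight at (x, y) from four corner lookups,
 * mirroring the weights shader; (x, y) must stay at least p pixels
 * away from the border.  The exp() normalisation here is assumed. */
static float nlm_weight(const float *integral, int w, int x, int y,
                        int p, float strength)
{
    float a = integral[(y - p) * w + (x - p)];
    float c = integral[(y - p) * w + (x + p)];
    float b = integral[(y + p) * w + (x - p)];
    float d = integral[(y + p) * w + (x + p)];
    float ssd = d + a - b - c;
    return expf(-ssd / (strength * strength));
}
```

With the integral image in place, the per-pixel cost of a patch comparison is constant regardless of patch size, which is why the rewrite can afford many more, smaller workgroups without atomic-float accumulation.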
--- libavfilter/vf_nlmeans_vulkan.c | 970 +++++++++++++++++--------------- 1 file changed, 524 insertions(+), 446 deletions(-) diff --git a/libavfilter/vf_nlmeans_vulkan.c b/libavfilter/vf_nlmeans_vulkan.c index 22a2a73eae..bffca4066a 100644 --- a/libavfilter/vf_nlmeans_vulkan.c +++ b/libavfilter/vf_nlmeans_vulkan.c @@ -30,6 +30,9 @@ #define TYPE_NAME "vec4" #define TYPE_ELEMS 4 #define TYPE_SIZE (TYPE_ELEMS*4) +#define TYPE_BLOCK_ELEMS 16 +#define TYPE_BLOCK_SIZE (TYPE_SIZE * TYPE_BLOCK_ELEMS) +#define WG_SIZE 32 typedef struct NLMeansVulkanContext { FFVulkanContext vkctx; @@ -43,7 +46,8 @@ typedef struct NLMeansVulkanContext { FFVkBuffer xyoffsets_buf; - int pl_weights_rows; + FFVulkanShader shd_horizontal; + FFVulkanShader shd_vertical; FFVulkanShader shd_weights; FFVulkanShader shd_denoise; @@ -63,204 +67,39 @@ typedef struct NLMeansVulkanContext { } opts; } NLMeansVulkanContext; -static void insert_first(FFVulkanShader *shd, int r, const char *off, int horiz, int plane, int comp) -{ - GLSLF(4, s1 = imageLoad(input_img[%i], pos + ivec2(%i + %s, %i + %s))[%i]; - ,plane, horiz ? r : 0, horiz ? off : "0", !horiz ? r : 0, !horiz ? off : "0", comp); - - GLSLF(4, s2[0] = imageLoad(input_img[%i], pos + offs[0] + ivec2(%i + %s, %i + %s))[%i]; - ,plane, horiz ? r : 0, horiz ? off : "0", !horiz ? r : 0, !horiz ? off : "0", comp); - GLSLF(4, s2[1] = imageLoad(input_img[%i], pos + offs[1] + ivec2(%i + %s, %i + %s))[%i]; - ,plane, horiz ? r : 0, horiz ? off : "0", !horiz ? r : 0, !horiz ? off : "0", comp); - GLSLF(4, s2[2] = imageLoad(input_img[%i], pos + offs[2] + ivec2(%i + %s, %i + %s))[%i]; - ,plane, horiz ? r : 0, horiz ? off : "0", !horiz ? r : 0, !horiz ? off : "0", comp); - GLSLF(4, s2[3] = imageLoad(input_img[%i], pos + offs[3] + ivec2(%i + %s, %i + %s))[%i]; - ,plane, horiz ? r : 0, horiz ? off : "0", !horiz ? r : 0, !horiz ? 
off : "0", comp); - - GLSLC(4, s2 = (s1 - s2) * (s1 - s2); ); -} - -static void insert_horizontal_pass(FFVulkanShader *shd, int nb_rows, int first, int plane, int comp) -{ - GLSLF(1, pos.y = int(gl_GlobalInvocationID.x) * %i; ,nb_rows); - if (!first) - GLSLC(1, barrier(); ); - GLSLC(0, ); - GLSLF(1, if (pos.y < height[%i]) { ,plane); - GLSLC(2, #pragma unroll(1) ); - GLSLF(2, for (r = 0; r < %i; r++) { ,nb_rows); - GLSLC(3, prefix_sum = DTYPE(0); ); - GLSLC(3, offset = int_stride * uint64_t(pos.y + r); ); - GLSLC(3, dst = DataBuffer(uint64_t(integral_data) + offset); ); - GLSLC(0, ); - GLSLF(3, for (pos.x = 0; pos.x < width[%i]; pos.x++) { ,plane); - if (first) - insert_first(shd, 0, "r", 0, plane, comp); - else - GLSLC(4, s2 = dst.v[pos.x]; ); - GLSLC(4, dst.v[pos.x] = s2 + prefix_sum; ); - GLSLC(4, prefix_sum += s2; ); - GLSLC(3, } ); - GLSLC(2, } ); - GLSLC(1, } ); - GLSLC(0, ); -} - -static void insert_vertical_pass(FFVulkanShader *shd, int nb_rows, int first, int plane, int comp) -{ - GLSLF(1, pos.x = int(gl_GlobalInvocationID.x) * %i; ,nb_rows); - GLSLC(1, #pragma unroll(1) ); - GLSLF(1, for (r = 0; r < %i; r++) ,nb_rows); - GLSLC(2, psum[r] = DTYPE(0); ); - GLSLC(0, ); - if (!first) - GLSLC(1, barrier(); ); - GLSLC(0, ); - GLSLF(1, if (pos.x < width[%i]) { ,plane); - GLSLF(2, for (pos.y = 0; pos.y < height[%i]; pos.y++) { ,plane); - GLSLC(3, offset = int_stride * uint64_t(pos.y); ); - GLSLC(3, dst = DataBuffer(uint64_t(integral_data) + offset); ); - GLSLC(0, ); - GLSLC(3, #pragma unroll(1) ); - GLSLF(3, for (r = 0; r < %i; r++) { ,nb_rows); - if (first) - insert_first(shd, 0, "r", 1, plane, comp); - else - GLSLC(4, s2 = dst.v[pos.x + r]; ); - GLSLC(4, dst.v[pos.x + r] = s2 + psum[r]; ); - GLSLC(4, psum[r] += s2; ); - GLSLC(3, } ); - GLSLC(2, } ); - GLSLC(1, } ); - GLSLC(0, ); -} - -static void insert_weights_pass(FFVulkanShader *shd, int nb_rows, int vert, - int t, int dst_comp, int plane, int comp) -{ - GLSLF(1, p = patch_size[%i]; ,dst_comp); - GLSLC(0, ); - GLSLC(1, barrier(); ); - GLSLC(0, ); - if (!vert) { - GLSLF(1, for (pos.y = 0; pos.y < height[%i]; pos.y++) { ,plane); - GLSLF(2, if (gl_GlobalInvocationID.x*%i >= width[%i]) ,nb_rows, plane); - GLSLC(3, break; ); - GLSLF(2, for (r = 0; r < %i; r++) { ,nb_rows); - GLSLF(3, pos.x = int(gl_GlobalInvocationID.x) * %i + r; ,nb_rows); - } else { - GLSLF(1, for (pos.x = 0; pos.x < width[%i]; pos.x++) { ,plane); - GLSLF(2, if (gl_GlobalInvocationID.x*%i >= height[%i]) ,nb_rows, plane); - GLSLC(3, break; ); - GLSLF(2, for (r = 0; r < %i; r++) { ,nb_rows); - GLSLF(3, pos.y = int(gl_GlobalInvocationID.x) * %i + r; ,nb_rows); - } - GLSLC(0, ); - GLSLC(3, a = DTYPE(0); ); - GLSLC(3, b = DTYPE(0); ); - GLSLC(3, c = DTYPE(0); ); - GLSLC(3, d = DTYPE(0); ); - GLSLC(0, ); - GLSLC(3, lt = ((pos.x - p) < 0) || ((pos.y - p) < 0); ); - GLSLC(0, ); - GLSLF(3, src[0] = imageLoad(input_img[%i], pos + offs[0])[%i]; ,plane, comp); - GLSLF(3, src[1] = imageLoad(input_img[%i], pos + offs[1])[%i]; ,plane, comp); - GLSLF(3, src[2] = imageLoad(input_img[%i], pos + offs[2])[%i]; ,plane, comp); - GLSLF(3, src[3] = imageLoad(input_img[%i], pos + offs[3])[%i]; ,plane, comp); - GLSLC(0, ); - GLSLC(3, if (lt == false) { ); - GLSLC(3, offset = int_stride * uint64_t(pos.y - p); ); - GLSLC(3, dst = DataBuffer(uint64_t(integral_data) + offset); ); - GLSLC(4, a = dst.v[pos.x - p]; ); - GLSLC(4, c = dst.v[pos.x + p]; ); - GLSLC(3, offset = int_stride * uint64_t(pos.y + p); ); - GLSLC(3, dst = DataBuffer(uint64_t(integral_data) + offset); ); - GLSLC(4, b = dst.v[pos.x 
- p]; ); - GLSLC(4, d = dst.v[pos.x + p]; ); - GLSLC(3, } ); - GLSLC(0, ); - GLSLC(3, patch_diff = d + a - b - c; ); - GLSLF(3, w = exp(patch_diff * strength[%i]); ,dst_comp); - GLSLC(3, w_sum = w[0] + w[1] + w[2] + w[3]; ); - GLSLC(3, sum = dot(w, src*255); ); - GLSLC(0, ); - if (t > 1) { - GLSLF(3, atomicAdd(weights_%i[pos.y*ws_stride[%i] + pos.x], w_sum); ,dst_comp, dst_comp); - GLSLF(3, atomicAdd(sums_%i[pos.y*ws_stride[%i] + pos.x], sum); ,dst_comp, dst_comp); - } else { - GLSLF(3, weights_%i[pos.y*ws_stride[%i] + pos.x] += w_sum; ,dst_comp, dst_comp); - GLSLF(3, sums_%i[pos.y*ws_stride[%i] + pos.x] += sum; ,dst_comp, dst_comp); - } - GLSLC(2, } ); - GLSLC(1, } ); -} - -typedef struct HorizontalPushData { +typedef struct IntegralPushData { uint32_t width[4]; uint32_t height[4]; - uint32_t ws_stride[4]; - int32_t patch_size[4]; - float strength[4]; + uint32_t comp_off[4]; + uint32_t comp_plane[4]; VkDeviceAddress integral_base; uint64_t integral_size; uint64_t int_stride; uint32_t xyoffs_start; -} HorizontalPushData; +} IntegralPushData; -static av_cold int init_weights_pipeline(FFVulkanContext *vkctx, FFVkExecPool *exec, - FFVulkanShader *shd, - FFVkSPIRVCompiler *spv, - int width, int height, int t, - const AVPixFmtDescriptor *desc, - int planes, int *nb_rows) -{ - int err; - uint8_t *spv_data; - size_t spv_len; - void *spv_opaque = NULL; - FFVulkanDescriptorSetBinding *desc_set; - int max_dim = FFMAX(width, height); - uint32_t max_wg = vkctx->props.properties.limits.maxComputeWorkGroupSize[0]; - int wg_size, wg_rows; - - /* Round the max workgroup size to the previous power of two */ - wg_size = max_wg; - wg_rows = 1; - - if (max_wg > max_dim) { - wg_size = max_dim; - } else if (max_wg < max_dim) { - /* Make it fit */ - while (wg_size*wg_rows < max_dim) - wg_rows++; - } - - RET(ff_vk_shader_init(vkctx, shd, "nlmeans_weights", - VK_SHADER_STAGE_COMPUTE_BIT, - (const char *[]) { "GL_EXT_buffer_reference", - "GL_EXT_buffer_reference2" }, 2, - wg_size, 1, 1, - 0)); - - *nb_rows = wg_rows; - - if (t > 1) - GLSLC(0, #extension GL_EXT_shader_atomic_float : require ); +static void shared_shd_def(FFVulkanShader *shd) { GLSLC(0, #extension GL_ARB_gpu_shader_int64 : require ); GLSLC(0, ); GLSLF(0, #define DTYPE %s ,TYPE_NAME); GLSLF(0, #define T_ALIGN %i ,TYPE_SIZE); + GLSLF(0, #define T_BLOCK_ELEMS %i ,TYPE_BLOCK_ELEMS); + GLSLF(0, #define T_BLOCK_ALIGN %i ,TYPE_BLOCK_SIZE); GLSLC(0, ); GLSLC(0, layout(buffer_reference, buffer_reference_align = T_ALIGN) buffer DataBuffer { ); GLSLC(1, DTYPE v[]; ); GLSLC(0, }; ); - GLSLC(0, ); + GLSLC(0, struct Block { ); + GLSLC(1, DTYPE data[T_BLOCK_ELEMS]; ); + GLSLC(0, }; ); + GLSLC(0, layout(buffer_reference, buffer_reference_align = T_BLOCK_ALIGN) buffer BlockBuffer { ); + GLSLC(1, Block v[]; ); + GLSLC(0, }; ); GLSLC(0, layout(push_constant, std430) uniform pushConstants { ); GLSLC(1, uvec4 width; ); GLSLC(1, uvec4 height; ); - GLSLC(1, uvec4 ws_stride; ); - GLSLC(1, ivec4 patch_size; ); - GLSLC(1, vec4 strength; ); + GLSLC(1, uvec4 comp_off; ); + GLSLC(1, uvec4 comp_plane; ); GLSLC(1, DataBuffer integral_base; ); GLSLC(1, uint64_t integral_size; ); GLSLC(1, uint64_t int_stride; ); @@ -268,8 +107,88 @@ static av_cold int init_weights_pipeline(FFVulkanContext *vkctx, FFVkExecPool *e GLSLC(0, }; ); GLSLC(0, ); - ff_vk_shader_add_push_const(shd, 0, sizeof(HorizontalPushData), + ff_vk_shader_add_push_const(shd, 0, sizeof(IntegralPushData), VK_SHADER_STAGE_COMPUTE_BIT); +} + +static av_cold int init_integral_pipeline(FFVulkanContext *vkctx, FFVkExecPool 
*exec, + FFVulkanShader *shd_horizontal, + FFVulkanShader *shd_vertical, + FFVkSPIRVCompiler *spv, + const AVPixFmtDescriptor *desc, int planes) +{ + int err; + uint8_t *spv_data; + size_t spv_len; + void *spv_opaque = NULL; + FFVulkanShader *shd; + FFVulkanDescriptorSetBinding *desc_set; + + shd = shd_horizontal; + RET(ff_vk_shader_init(vkctx, shd, "nlmeans_horizontal", + VK_SHADER_STAGE_COMPUTE_BIT, + (const char *[]) { "GL_EXT_buffer_reference", + "GL_EXT_buffer_reference2" }, 2, + WG_SIZE, 1, 1, + 0)); + shared_shd_def(shd); + + GLSLC(0, ); + GLSLC(0, void main() ); + GLSLC(0, { ); + GLSLC(1, uint64_t offset; ); + GLSLC(1, DataBuffer dst; ); + GLSLC(1, BlockBuffer b_dst; ); + GLSLC(1, Block block; ); + GLSLC(1, DTYPE s2; ); + GLSLC(1, DTYPE prefix_sum; ); + GLSLC(1, ivec2 pos; ); + GLSLC(1, int k; ); + GLSLC(1, int o; ); + GLSLC(0, ); + GLSLC(1, DataBuffer integral_data; ); + GLSLC(0, ); + GLSLC(1, uint c_plane; ); + GLSLC(0, ); + GLSLC(1, int comp_idx = int(gl_WorkGroupID.y); ); + GLSLC(1, int invoc_idx = int(gl_WorkGroupID.z); ); + GLSLC(0, ); + GLSLF(1, offset = integral_size * (invoc_idx * %i + comp_idx); ,desc->nb_components); + GLSLC(1, integral_data = DataBuffer(uint64_t(integral_base) + offset); ); + GLSLC(0, ); + GLSLC(1, c_plane = comp_plane[comp_idx]; ); + GLSLC(0, ); + GLSLC(1, pos.y = int(gl_GlobalInvocationID.x); ); + GLSLC(1, if (pos.y < height[c_plane]) { ); + GLSLC(2, prefix_sum = DTYPE(0); ); + GLSLC(2, offset = int_stride * uint64_t(pos.y); ); + GLSLC(2, b_dst = BlockBuffer(uint64_t(integral_data) + offset); ); + GLSLC(0, ); + GLSLC(2, for (k = 0; k * T_BLOCK_ELEMS < width[c_plane]; k++) { ); + GLSLC(3, block = b_dst.v[k]; ); + GLSLC(3, for (o = 0; o < T_BLOCK_ELEMS; o++) { ); + GLSLC(4, s2 = block.data[o]; ); + GLSLC(4, block.data[o] = s2 + prefix_sum; ); + GLSLC(4, prefix_sum += s2; ); + GLSLC(3, } ); + GLSLC(3, b_dst.v[k] = block; ); + GLSLC(2, } ); + GLSLC(1, } ); + GLSLC(0, } ); + + RET(spv->compile_shader(vkctx, spv, shd, &spv_data, &spv_len, "main", &spv_opaque)); + RET(ff_vk_shader_link(vkctx, shd, spv_data, spv_len, "main")); + + RET(ff_vk_shader_register_exec(vkctx, exec, shd)); + + shd = shd_vertical; + RET(ff_vk_shader_init(vkctx, shd, "nlmeans_vertical", + VK_SHADER_STAGE_COMPUTE_BIT, + (const char *[]) { "GL_EXT_buffer_reference", + "GL_EXT_buffer_reference2" }, 2, + WG_SIZE, 1, 1, + 0)); + shared_shd_def(shd); desc_set = (FFVulkanDescriptorSetBinding []) { { @@ -281,56 +200,8 @@ static av_cold int init_weights_pipeline(FFVulkanContext *vkctx, FFVkExecPool *e .elems = planes, .stages = VK_SHADER_STAGE_COMPUTE_BIT, }, - { - .name = "weights_buffer_0", - .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, - .stages = VK_SHADER_STAGE_COMPUTE_BIT, - .buf_content = "float weights_0[];", - }, - { - .name = "sums_buffer_0", - .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, - .stages = VK_SHADER_STAGE_COMPUTE_BIT, - .buf_content = "float sums_0[];", - }, - { - .name = "weights_buffer_1", - .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, - .stages = VK_SHADER_STAGE_COMPUTE_BIT, - .buf_content = "float weights_1[];", - }, - { - .name = "sums_buffer_1", - .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, - .stages = VK_SHADER_STAGE_COMPUTE_BIT, - .buf_content = "float sums_1[];", - }, - { - .name = "weights_buffer_2", - .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, - .stages = VK_SHADER_STAGE_COMPUTE_BIT, - .buf_content = "float weights_2[];", - }, - { - .name = "sums_buffer_2", - .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, - .stages = VK_SHADER_STAGE_COMPUTE_BIT, - .buf_content = "float 
sums_2[];", - }, - { - .name = "weights_buffer_3", - .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, - .stages = VK_SHADER_STAGE_COMPUTE_BIT, - .buf_content = "float weights_3[];", - }, - { - .name = "sums_buffer_3", - .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, - .stages = VK_SHADER_STAGE_COMPUTE_BIT, - .buf_content = "float sums_3[];", - }, }; - RET(ff_vk_shader_add_descriptor_set(vkctx, shd, desc_set, 1 + 2*desc->nb_components, 0, 0)); + RET(ff_vk_shader_add_descriptor_set(vkctx, shd, desc_set, 1, 0, 0)); desc_set = (FFVulkanDescriptorSetBinding []) { { @@ -351,54 +222,230 @@ static av_cold int init_weights_pipeline(FFVulkanContext *vkctx, FFVkExecPool *e GLSLC(1, float s1; ); GLSLC(1, DTYPE s2; ); GLSLC(1, DTYPE prefix_sum; ); - GLSLF(1, DTYPE psum[%i]; ,*nb_rows); - GLSLC(1, int r; ); + GLSLC(1, uvec2 size; ); GLSLC(1, ivec2 pos; ); + GLSLC(1, ivec2 pos_off; ); + GLSLC(0, ); + GLSLC(1, DataBuffer integral_data; ); + GLSLF(1, ivec2 offs[%i]; ,TYPE_ELEMS); + GLSLC(0, ); + GLSLC(1, uint c_off; ); + GLSLC(1, uint c_plane; ); + GLSLC(0, ); + GLSLC(1, int comp_idx = int(gl_WorkGroupID.y); ); + GLSLC(1, int invoc_idx = int(gl_WorkGroupID.z); ); + GLSLC(0, ); + GLSLF(1, offset = integral_size * (invoc_idx * %i + comp_idx); ,desc->nb_components); + GLSLC(1, integral_data = DataBuffer(uint64_t(integral_base) + offset); ); + for (int i = 0; i < TYPE_ELEMS; i++) + GLSLF(1, offs[%i] = xyoffsets[xyoffs_start + %i*invoc_idx + %i]; ,i,TYPE_ELEMS,i); + GLSLC(0, ); + GLSLC(1, c_off = comp_off[comp_idx]; ); + GLSLC(1, c_plane = comp_plane[comp_idx]; ); + GLSLC(1, size = imageSize(input_img[c_plane]); ); + GLSLC(0, ); + GLSLC(1, pos.x = int(gl_GlobalInvocationID.x); ); + GLSLC(1, if (pos.x < width[c_plane]) { ); + GLSLC(2, prefix_sum = DTYPE(0); ); + GLSLC(2, for (pos.y = 0; pos.y < height[c_plane]; pos.y++) { ); + GLSLC(3, offset = int_stride * uint64_t(pos.y); ); + GLSLC(3, dst = DataBuffer(uint64_t(integral_data) + offset); ); + GLSLC(4, s1 = imageLoad(input_img[c_plane], pos)[c_off]; ); + for (int i = 0; i < TYPE_ELEMS; i++) { + GLSLF(4, pos_off = pos + offs[%i]; ,i); + GLSLC(4, if (!IS_WITHIN(uvec2(pos_off), size)) ); + GLSLF(5, s2[%i] = s1; ,i); + GLSLC(4, else ); + GLSLF(5, s2[%i] = imageLoad(input_img[c_plane], pos_off)[c_off]; ,i); + } + GLSLC(4, s2 = (s1 - s2) * (s1 - s2); ); + GLSLC(3, dst.v[pos.x] = s2 + prefix_sum; ); + GLSLC(3, prefix_sum += s2; ); + GLSLC(2, } ); + GLSLC(1, } ); + GLSLC(0, } ); + + RET(spv->compile_shader(vkctx, spv, shd, &spv_data, &spv_len, "main", &spv_opaque)); + RET(ff_vk_shader_link(vkctx, shd, spv_data, spv_len, "main")); + + RET(ff_vk_shader_register_exec(vkctx, exec, shd)); + +fail: + if (spv_opaque) + spv->free_shader(spv, &spv_opaque); + + return err; +} + +typedef struct WeightsPushData { + uint32_t width[4]; + uint32_t height[4]; + uint32_t ws_offset[4]; + uint32_t ws_stride[4]; + int32_t patch_size[4]; + float strength[4]; + uint32_t comp_off[4]; + uint32_t comp_plane[4]; + VkDeviceAddress integral_base; + uint64_t integral_size; + uint64_t int_stride; + uint32_t xyoffs_start; + uint32_t ws_total_count; +} WeightsPushData; + +static av_cold int init_weights_pipeline(FFVulkanContext *vkctx, FFVkExecPool *exec, + FFVulkanShader *shd, + FFVkSPIRVCompiler *spv, + const AVPixFmtDescriptor *desc, + int planes) +{ + int err; + uint8_t *spv_data; + size_t spv_len; + void *spv_opaque = NULL; + FFVulkanDescriptorSetBinding *desc_set; + + RET(ff_vk_shader_init(vkctx, shd, "nlmeans_weights", + VK_SHADER_STAGE_COMPUTE_BIT, + (const char *[]) { "GL_EXT_buffer_reference", + 
"GL_EXT_buffer_reference2" }, 2, + WG_SIZE, WG_SIZE, 1, + 0)); + + GLSLC(0, #extension GL_ARB_gpu_shader_int64 : require ); + GLSLC(0, ); + GLSLF(0, #define DTYPE %s ,TYPE_NAME); + GLSLF(0, #define T_ALIGN %i ,TYPE_SIZE); + GLSLC(0, ); + GLSLC(0, layout(buffer_reference, buffer_reference_align = T_ALIGN) buffer DataBuffer { ); + GLSLC(1, DTYPE v[]; ); + GLSLC(0, }; ); + GLSLC(0, layout(push_constant, std430) uniform pushConstants { ); + GLSLC(1, uvec4 width; ); + GLSLC(1, uvec4 height; ); + GLSLC(1, uvec4 ws_offset; ); + GLSLC(1, uvec4 ws_stride; ); + GLSLC(1, ivec4 patch_size; ); + GLSLC(1, vec4 strength; ); + GLSLC(1, uvec4 comp_off; ); + GLSLC(1, uvec4 comp_plane; ); + GLSLC(1, DataBuffer integral_base; ); + GLSLC(1, uint64_t integral_size; ); + GLSLC(1, uint64_t int_stride; ); + GLSLC(1, uint xyoffs_start; ); + GLSLC(1, uint ws_total_count; ); + GLSLC(0, }; ); + GLSLC(0, ); + + ff_vk_shader_add_push_const(shd, 0, sizeof(WeightsPushData), + VK_SHADER_STAGE_COMPUTE_BIT); + + desc_set = (FFVulkanDescriptorSetBinding []) { + { + .name = "input_img", + .type = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, + .mem_layout = ff_vk_shader_rep_fmt(vkctx->input_format, FF_VK_REP_FLOAT), + .mem_quali = "readonly", + .dimensions = 2, + .elems = planes, + .stages = VK_SHADER_STAGE_COMPUTE_BIT, + }, + { + .name = "weights_buffer", + .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, + .stages = VK_SHADER_STAGE_COMPUTE_BIT, + .buf_content = "float weights[];", + }, + { + .name = "sums_buffer", + .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, + .stages = VK_SHADER_STAGE_COMPUTE_BIT, + .buf_content = "float sums[];", + }, + }; + RET(ff_vk_shader_add_descriptor_set(vkctx, shd, desc_set, 3, 0, 0)); + + desc_set = (FFVulkanDescriptorSetBinding []) { + { + .name = "xyoffsets_buffer", + .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, + .mem_quali = "readonly", + .stages = VK_SHADER_STAGE_COMPUTE_BIT, + .buf_content = "ivec2 xyoffsets[];", + }, + }; + RET(ff_vk_shader_add_descriptor_set(vkctx, shd, desc_set, 1, 1, 0)); + + GLSLC(0, ); + GLSLC(0, void main() ); + GLSLC(0, { ); + GLSLC(1, uint64_t offset; ); + GLSLC(1, DataBuffer dst; ); + GLSLC(1, uvec2 size; ); + GLSLC(1, ivec2 pos; ); + GLSLC(1, ivec2 pos_off; ); GLSLC(1, int p; ); GLSLC(0, ); GLSLC(1, DataBuffer integral_data; ); GLSLF(1, ivec2 offs[%i]; ,TYPE_ELEMS); GLSLC(0, ); - GLSLC(1, int invoc_idx = int(gl_WorkGroupID.z); ); + GLSLC(1, uint c_off; ); + GLSLC(1, uint c_plane; ); + GLSLC(1, uint ws_off; ); GLSLC(0, ); - GLSLC(1, offset = integral_size * invoc_idx; ); + GLSLC(1, pos = ivec2(gl_GlobalInvocationID.xy); ); + GLSLF(1, int comp_idx = int(gl_WorkGroupID.z) %% %i; ,desc->nb_components); + GLSLF(1, int invoc_idx = int(gl_WorkGroupID.z) / %i; ,desc->nb_components); + GLSLC(0, ); + GLSLC(1, c_off = comp_off[comp_idx]; ); + GLSLC(1, c_plane = comp_plane[comp_idx]; ); + GLSLC(1, p = patch_size[comp_idx]; ); + GLSLC(1, if (pos.y < p || pos.y >= height[c_plane] - p || pos.x < p || pos.x >= width[c_plane] - p) ); + GLSLC(2, return; ); + GLSLC(0, ); + GLSLF(1, offset = integral_size * (invoc_idx * %i + comp_idx); ,desc->nb_components); GLSLC(1, integral_data = DataBuffer(uint64_t(integral_base) + offset); ); for (int i = 0; i < TYPE_ELEMS; i++) GLSLF(1, offs[%i] = xyoffsets[xyoffs_start + %i*invoc_idx + %i]; ,i,TYPE_ELEMS,i); GLSLC(0, ); + GLSLC(1, ws_off = ws_total_count * invoc_idx + ws_offset[comp_idx] + pos.y * ws_stride[comp_idx]; ); + GLSLC(1, size = imageSize(input_img[c_plane]); ); + GLSLC(0, ); GLSLC(1, DTYPE a; ); GLSLC(1, DTYPE b; ); GLSLC(1, DTYPE c; ); GLSLC(1, DTYPE 
d; ); GLSLC(0, ); GLSLC(1, DTYPE patch_diff; ); - if (TYPE_ELEMS == 4) { - GLSLC(1, vec4 src; ); - GLSLC(1, vec4 w; ); - } else { - GLSLC(1, vec4 src[4]; ); - GLSLC(1, vec4 w[4]; ); - } + GLSLC(1, vec4 src; ); + GLSLC(1, vec4 w; ); GLSLC(1, float w_sum; ); GLSLC(1, float sum; ); GLSLC(0, ); - GLSLC(1, bool lt; ); - GLSLC(1, bool gt; ); - GLSLC(0, ); - - for (int i = 0; i < desc->nb_components; i++) { - int off = desc->comp[i].offset / (FFALIGN(desc->comp[i].depth, 8)/8); - if (width >= height) { - insert_horizontal_pass(shd, *nb_rows, 1, desc->comp[i].plane, off); - insert_vertical_pass(shd, *nb_rows, 0, desc->comp[i].plane, off); - insert_weights_pass(shd, *nb_rows, 0, t, i, desc->comp[i].plane, off); - } else { - insert_vertical_pass(shd, *nb_rows, 1, desc->comp[i].plane, off); - insert_horizontal_pass(shd, *nb_rows, 0, desc->comp[i].plane, off); - insert_weights_pass(shd, *nb_rows, 1, t, i, desc->comp[i].plane, off); - } + for (int i = 0; i < 4; i++) { + GLSLF(1, pos_off = pos + offs[%i]; ,i); + GLSLC(1, if (!IS_WITHIN(uvec2(pos_off), size)) ); + GLSLF(2, src[%i] = imageLoad(input_img[c_plane], pos)[c_off]; ,i); + GLSLC(1, else ); + GLSLF(2, src[%i] = imageLoad(input_img[c_plane], pos_off)[c_off]; ,i); } - + GLSLC(0, ); + GLSLC(1, offset = int_stride * uint64_t(pos.y - p); ); + GLSLC(1, dst = DataBuffer(uint64_t(integral_data) + offset); ); + GLSLC(1, a = dst.v[pos.x - p]; ); + GLSLC(1, c = dst.v[pos.x + p]; ); + GLSLC(1, offset = int_stride * uint64_t(pos.y + p); ); + GLSLC(1, dst = DataBuffer(uint64_t(integral_data) + offset); ); + GLSLC(1, b = dst.v[pos.x - p]; ); + GLSLC(1, d = dst.v[pos.x + p]; ); + GLSLC(0, ); + GLSLC(1, patch_diff = d + a - b - c; ); + GLSLC(1, w = exp(patch_diff * strength[comp_idx]); ); + GLSLC(1, w_sum = w[0] + w[1] + w[2] + w[3]; ); + GLSLC(1, sum = dot(w, src * 255); ); + GLSLC(0, ); + GLSLC(1, weights[ws_off + pos.x] += w_sum; ); + GLSLC(1, sums[ws_off + pos.x] += sum; ); GLSLC(0, } ); RET(spv->compile_shader(vkctx, spv, shd, &spv_data, &spv_len, "main", &spv_opaque)); @@ -414,7 +461,12 @@ fail: } typedef struct DenoisePushData { + uint32_t comp_off[4]; + uint32_t comp_plane[4]; + uint32_t ws_offset[4]; uint32_t ws_stride[4]; + uint32_t ws_total_count; + uint32_t t; } DenoisePushData; static av_cold int init_denoise_pipeline(FFVulkanContext *vkctx, FFVkExecPool *exec, @@ -426,16 +478,20 @@ static av_cold int init_denoise_pipeline(FFVulkanContext *vkctx, FFVkExecPool *e size_t spv_len; void *spv_opaque = NULL; FFVulkanDescriptorSetBinding *desc_set; - RET(ff_vk_shader_init(vkctx, shd, "nlmeans_denoise", VK_SHADER_STAGE_COMPUTE_BIT, (const char *[]) { "GL_EXT_buffer_reference", "GL_EXT_buffer_reference2" }, 2, - 32, 32, 1, + WG_SIZE, WG_SIZE, 1, 0)); GLSLC(0, layout(push_constant, std430) uniform pushConstants { ); + GLSLC(1, uvec4 comp_off; ); + GLSLC(1, uvec4 comp_plane; ); + GLSLC(1, uvec4 ws_offset; ); GLSLC(1, uvec4 ws_stride; ); + GLSLC(1, uint32_t ws_total_count; ); + GLSLC(1, uint32_t t; ); GLSLC(0, }; ); ff_vk_shader_add_push_const(shd, 0, sizeof(DenoisePushData), @@ -465,92 +521,58 @@ static av_cold int init_denoise_pipeline(FFVulkanContext *vkctx, FFVkExecPool *e desc_set = (FFVulkanDescriptorSetBinding []) { { - .name = "weights_buffer_0", + .name = "weights_buffer", .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, .mem_quali = "readonly", .stages = VK_SHADER_STAGE_COMPUTE_BIT, - .buf_content = "float weights_0[];", + .buf_content = "float weights[];", }, { - .name = "sums_buffer_0", + .name = "sums_buffer", .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, 
.mem_quali = "readonly", .stages = VK_SHADER_STAGE_COMPUTE_BIT, - .buf_content = "float sums_0[];", - }, - { - .name = "weights_buffer_1", - .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, - .mem_quali = "readonly", - .stages = VK_SHADER_STAGE_COMPUTE_BIT, - .buf_content = "float weights_1[];", - }, - { - .name = "sums_buffer_1", - .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, - .mem_quali = "readonly", - .stages = VK_SHADER_STAGE_COMPUTE_BIT, - .buf_content = "float sums_1[];", - }, - { - .name = "weights_buffer_2", - .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, - .mem_quali = "readonly", - .stages = VK_SHADER_STAGE_COMPUTE_BIT, - .buf_content = "float weights_2[];", - }, - { - .name = "sums_buffer_2", - .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, - .mem_quali = "readonly", - .stages = VK_SHADER_STAGE_COMPUTE_BIT, - .buf_content = "float sums_2[];", - }, - { - .name = "weights_buffer_3", - .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, - .mem_quali = "readonly", - .stages = VK_SHADER_STAGE_COMPUTE_BIT, - .buf_content = "float weights_3[];", - }, - { - .name = "sums_buffer_3", - .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, - .mem_quali = "readonly", - .stages = VK_SHADER_STAGE_COMPUTE_BIT, - .buf_content = "float sums_3[];", + .buf_content = "float sums[];", }, }; - RET(ff_vk_shader_add_descriptor_set(vkctx, shd, desc_set, 2*desc->nb_components, 0, 0)); + RET(ff_vk_shader_add_descriptor_set(vkctx, shd, desc_set, 2, 0, 0)); GLSLC(0, void main() ); GLSLC(0, { ); - GLSLC(1, ivec2 size; ); GLSLC(1, const ivec2 pos = ivec2(gl_GlobalInvocationID.xy); ); GLSLC(1, const uint plane = uint(gl_WorkGroupID.z); ); + GLSLC(1, const uvec2 size = imageSize(output_img[plane]); ); + GLSLC(0, ); + GLSLC(1, uint c_off; ); + GLSLC(1, uint c_plane; ); + GLSLC(1, uint ws_off; ); GLSLC(0, ); GLSLC(1, float w_sum; ); GLSLC(1, float sum; ); GLSLC(1, vec4 src; ); GLSLC(1, vec4 r; ); + GLSLC(1, int invoc_idx; ); + GLSLC(1, int comp_idx; ); GLSLC(0, ); - GLSLC(1, size = imageSize(output_img[plane]); ); GLSLC(1, if (!IS_WITHIN(pos, size)) ); GLSLC(2, return; ); GLSLC(0, ); GLSLC(1, src = imageLoad(input_img[plane], pos); ); - GLSLC(0, ); - for (int c = 0; c < desc->nb_components; c++) { - int off = desc->comp[c].offset / (FFALIGN(desc->comp[c].depth, 8)/8); - GLSLF(1, if (plane == %i) { ,desc->comp[c].plane); - GLSLF(2, w_sum = weights_%i[pos.y*ws_stride[%i] + pos.x]; ,c, c); - GLSLF(2, sum = sums_%i[pos.y*ws_stride[%i] + pos.x]; ,c, c); - GLSLF(2, r[%i] = (sum + src[%i]*255) / (1.0 + w_sum) / 255; ,off, off); - GLSLC(1, } ); - GLSLC(0, ); - } - GLSLC(1, imageStore(output_img[plane], pos, r); ); + GLSLF(1, for (comp_idx = 0; comp_idx < %i; comp_idx++) { ,desc->nb_components); + GLSLC(2, if (plane == comp_plane[comp_idx]) { ); + GLSLC(3, w_sum = 0.0; ); + GLSLC(3, sum = 0.0; ); + GLSLC(3, for (invoc_idx = 0; invoc_idx < t; invoc_idx++) { ); + GLSLC(4, ws_off = ws_total_count * invoc_idx + ws_offset[comp_idx] + pos.y * ws_stride[comp_idx] + pos.x; ); + GLSLC(4, w_sum += weights[ws_off]; ); + GLSLC(4, sum += sums[ws_off]; ); + GLSLC(3, } ); + GLSLC(3, c_off = comp_off[comp_idx]; ); + GLSLC(3, r[c_off] = (sum + src[c_off] * 255) / (1.0 + w_sum) / 255; ); + GLSLC(2, } ); + GLSLC(1, } ); + GLSLC(1, imageStore(output_img[plane], pos, r); ); GLSLC(0, } ); RET(spv->compile_shader(vkctx, spv, shd, &spv_data, &spv_len, "main", &spv_opaque)); @@ -640,11 +662,6 @@ static av_cold int init_filter(AVFilterContext *ctx) RET(ff_vk_unmap_buffer(&s->vkctx, &s->xyoffsets_buf, 1)); s->opts.t = FFMIN(s->opts.t, (FFALIGN(s->nb_offsets, TYPE_ELEMS) / 
TYPE_ELEMS)); - if (!vkctx->atomic_float_feats.shaderBufferFloat32AtomicAdd) { - av_log(ctx, AV_LOG_WARNING, "Device doesn't support atomic float adds, " - "disabling dispatch parallelism\n"); - s->opts.t = 1; - } spv = ff_vk_spirv_init(); if (!spv) { @@ -661,21 +678,25 @@ static av_cold int init_filter(AVFilterContext *ctx) RET(ff_vk_exec_pool_init(vkctx, s->qf, &s->e, 1, 0, 0, 0, NULL)); - RET(init_weights_pipeline(vkctx, &s->e, &s->shd_weights, - spv, s->vkctx.output_width, s->vkctx.output_height, - s->opts.t, desc, planes, &s->pl_weights_rows)); + RET(init_integral_pipeline(vkctx, &s->e, &s->shd_horizontal, &s->shd_vertical, + spv, desc, planes)); - RET(init_denoise_pipeline(vkctx, &s->e, &s->shd_denoise, - spv, desc, planes)); + RET(init_weights_pipeline(vkctx, &s->e, &s->shd_weights, spv, desc, planes)); + + RET(init_denoise_pipeline(vkctx, &s->e, &s->shd_denoise, spv, desc, planes)); + + RET(ff_vk_shader_update_desc_buffer(vkctx, &s->e.contexts[0], &s->shd_vertical, + 1, 0, 0, + &s->xyoffsets_buf, 0, s->xyoffsets_buf.size, + VK_FORMAT_UNDEFINED)); RET(ff_vk_shader_update_desc_buffer(vkctx, &s->e.contexts[0], &s->shd_weights, 1, 0, 0, - &s->xyoffsets_buf, 0, s->xyoffsets_buf.size, - VK_FORMAT_UNDEFINED)); + &s->xyoffsets_buf, 0, s->xyoffsets_buf.size, + VK_FORMAT_UNDEFINED)); do { int wg_invoc = FFMIN((s->nb_offsets - offsets_dispatched)/TYPE_ELEMS, s->opts.t); - wg_invoc = FFMIN(wg_invoc, vkctx->props.properties.limits.maxComputeWorkGroupCount[2]); offsets_dispatched += wg_invoc * TYPE_ELEMS; nb_dispatches++; } while (offsets_dispatched < s->nb_offsets); @@ -693,15 +714,22 @@ fail: } static int denoise_pass(NLMeansVulkanContext *s, FFVkExecContext *exec, - FFVkBuffer *ws_vk, uint32_t ws_stride[4]) + FFVkBuffer *ws_vk, uint32_t comp_offs[4], uint32_t comp_planes[4], + uint32_t ws_offset[4], uint32_t ws_stride[4], + uint32_t ws_total_count, int t) { FFVulkanContext *vkctx = &s->vkctx; FFVulkanFunctions *vk = &vkctx->vkfn; - VkBufferMemoryBarrier2 buf_bar[8]; + VkBufferMemoryBarrier2 buf_bar[2]; int nb_buf_bar = 0; DenoisePushData pd = { + { comp_offs[0], comp_offs[1], comp_offs[2], comp_offs[3] }, + { comp_planes[0], comp_planes[1], comp_planes[2], comp_planes[3] }, + { ws_offset[0], ws_offset[1], ws_offset[2], ws_offset[3] }, { ws_stride[0], ws_stride[1], ws_stride[2], ws_stride[3] }, + ws_total_count, + t, }; /* Denoise pass pipeline */ @@ -753,6 +781,8 @@ static int nlmeans_vulkan_filter_frame(AVFilterLink *link, AVFrame *in) FFVulkanFunctions *vk = &vkctx->vkfn; const AVPixFmtDescriptor *desc; + int comp_offs[4]; + int comp_planes[4]; int plane_widths[4]; int plane_heights[4]; @@ -767,18 +797,17 @@ static int nlmeans_vulkan_filter_frame(AVFilterLink *link, AVFrame *in) /* Weights/sums */ AVBufferRef *ws_buf = NULL; FFVkBuffer *ws_vk; - VkDeviceSize weights_offs[4]; - VkDeviceSize sums_offs[4]; + uint32_t ws_total_count = 0; + uint32_t ws_offset[4]; uint32_t ws_stride[4]; - size_t ws_size[4]; - size_t ws_total_size = 0; + size_t ws_total_size; FFVkExecContext *exec; VkImageView in_views[AV_NUM_DATA_POINTERS]; VkImageView out_views[AV_NUM_DATA_POINTERS]; VkImageMemoryBarrier2 img_bar[8]; int nb_img_bar = 0; - VkBufferMemoryBarrier2 buf_bar[8]; + VkBufferMemoryBarrier2 buf_bar[2]; int nb_buf_bar = 0; if (!s->initialized) @@ -789,27 +818,32 @@ static int nlmeans_vulkan_filter_frame(AVFilterLink *link, AVFrame *in) return AVERROR(EINVAL); /* Integral image */ - int_stride = s->shd_weights.lg_size[0]*s->pl_weights_rows*TYPE_SIZE; - int_size = 
s->shd_weights.lg_size[0]*s->pl_weights_rows*int_stride; + int_stride = FFALIGN(vkctx->output_width, s->shd_vertical.lg_size[0]) * TYPE_SIZE; + int_size = FFALIGN(vkctx->output_height, s->shd_horizontal.lg_size[0]) * int_stride; /* Plane dimensions */ for (int i = 0; i < desc->nb_components; i++) { plane_widths[i] = !i || (i == 3) ? vkctx->output_width : AV_CEIL_RSHIFT(vkctx->output_width, desc->log2_chroma_w); - plane_heights[i] = !i || (i == 3) ? vkctx->output_height : AV_CEIL_RSHIFT(vkctx->output_height, desc->log2_chroma_w); + plane_heights[i] = !i || (i == 3) ? vkctx->output_height : AV_CEIL_RSHIFT(vkctx->output_height, desc->log2_chroma_h); plane_widths[i] = FFALIGN(plane_widths[i], s->shd_denoise.lg_size[0]); plane_heights[i] = FFALIGN(plane_heights[i], s->shd_denoise.lg_size[1]); + comp_offs[i] = desc->comp[i].offset / (FFALIGN(desc->comp[i].depth, 8)/8); + comp_planes[i] = desc->comp[i].plane; + ws_stride[i] = plane_widths[i]; - ws_size[i] = ws_stride[i] * plane_heights[i] * sizeof(float); - ws_total_size += ws_size[i]; + ws_offset[i] = ws_total_count; + ws_total_count += ws_stride[i] * plane_heights[i]; } + ws_total_size = ws_total_count * sizeof(float); + /* Buffers */ err = ff_vk_get_pooled_buffer(&s->vkctx, &s->integral_buf_pool, &integral_buf, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT, NULL, - s->opts.t * int_size, + int_size * s->opts.t * desc->nb_components, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT); if (err < 0) return err; @@ -820,19 +854,12 @@ static int nlmeans_vulkan_filter_frame(AVFilterLink *link, AVFrame *in) VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT, NULL, - ws_total_size * 2, + ws_total_size * s-> opts.t * 2, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT); if (err < 0) return err; ws_vk = (FFVkBuffer *)ws_buf->data; - weights_offs[0] = 0; - sums_offs[0] = ws_total_size; - for (int i = 1; i < desc->nb_components; i++) { - weights_offs[i] = weights_offs[i - 1] + ws_size[i - 1]; - sums_offs[i] = sums_offs[i - 1] + ws_size[i - 1]; - } - /* Output frame */ out = ff_get_video_buffer(outlink, outlink->w, outlink->h); if (!out) { @@ -889,19 +916,6 @@ static int nlmeans_vulkan_filter_frame(AVFilterLink *link, AVFrame *in) .size = ws_vk->size, .offset = 0, }; - buf_bar[nb_buf_bar++] = (VkBufferMemoryBarrier2) { - .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2, - .srcStageMask = integral_vk->stage, - .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT, - .srcAccessMask = integral_vk->access, - .dstAccessMask = VK_ACCESS_2_SHADER_STORAGE_READ_BIT | - VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT, - .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, - .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, - .buffer = integral_vk->buf, - .size = integral_vk->size, - .offset = 0, - }; vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) { .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO, @@ -912,118 +926,180 @@ static int nlmeans_vulkan_filter_frame(AVFilterLink *link, AVFrame *in) }); ws_vk->stage = buf_bar[0].dstStageMask; ws_vk->access = buf_bar[0].dstAccessMask; - integral_vk->stage = buf_bar[1].dstStageMask; - integral_vk->access = buf_bar[1].dstAccessMask; /* Buffer zeroing */ vk->CmdFillBuffer(exec->buf, ws_vk->buf, 0, ws_vk->size, 0x0); - nb_buf_bar = 0; - buf_bar[nb_buf_bar++] = (VkBufferMemoryBarrier2) { - .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2, - .srcStageMask = ws_vk->stage, - .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT, - .srcAccessMask = ws_vk->access, - .dstAccessMask = 
VK_ACCESS_2_SHADER_STORAGE_READ_BIT | - VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT, - .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, - .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, - .buffer = ws_vk->buf, - .size = ws_vk->size, - .offset = 0, - }; - - vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) { - .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO, - .pBufferMemoryBarriers = buf_bar, - .bufferMemoryBarrierCount = nb_buf_bar, - }); - ws_vk->stage = buf_bar[0].dstStageMask; - ws_vk->access = buf_bar[0].dstAccessMask; - + /* Update integral descriptors */ + ff_vk_shader_update_img_array(vkctx, exec, &s->shd_vertical, in, in_views, 0, 0, + VK_IMAGE_LAYOUT_GENERAL, VK_NULL_HANDLE); /* Update weights descriptors */ ff_vk_shader_update_img_array(vkctx, exec, &s->shd_weights, in, in_views, 0, 0, VK_IMAGE_LAYOUT_GENERAL, VK_NULL_HANDLE); - for (int i = 0; i < desc->nb_components; i++) { - RET(ff_vk_shader_update_desc_buffer(&s->vkctx, exec, &s->shd_weights, 0, 1 + i*2 + 0, 0, - ws_vk, weights_offs[i], ws_size[i], - VK_FORMAT_UNDEFINED)); - RET(ff_vk_shader_update_desc_buffer(&s->vkctx, exec, &s->shd_weights, 0, 1 + i*2 + 1, 0, - ws_vk, sums_offs[i], ws_size[i], - VK_FORMAT_UNDEFINED)); - } + RET(ff_vk_shader_update_desc_buffer(&s->vkctx, exec, &s->shd_weights, 0, 1, 0, + ws_vk, 0, ws_total_size * s-> opts.t, + VK_FORMAT_UNDEFINED)); + RET(ff_vk_shader_update_desc_buffer(&s->vkctx, exec, &s->shd_weights, 0, 2, 0, + ws_vk, ws_total_size * s-> opts.t, ws_total_size * s-> opts.t, + VK_FORMAT_UNDEFINED)); /* Update denoise descriptors */ ff_vk_shader_update_img_array(vkctx, exec, &s->shd_denoise, in, in_views, 0, 0, VK_IMAGE_LAYOUT_GENERAL, VK_NULL_HANDLE); ff_vk_shader_update_img_array(vkctx, exec, &s->shd_denoise, out, out_views, 0, 1, VK_IMAGE_LAYOUT_GENERAL, VK_NULL_HANDLE); - for (int i = 0; i < desc->nb_components; i++) { - RET(ff_vk_shader_update_desc_buffer(&s->vkctx, exec, &s->shd_denoise, 1, i*2 + 0, 0, - ws_vk, weights_offs[i], ws_size[i], - VK_FORMAT_UNDEFINED)); - RET(ff_vk_shader_update_desc_buffer(&s->vkctx, exec, &s->shd_denoise, 1, i*2 + 1, 0, - ws_vk, sums_offs[i], ws_size[i], - VK_FORMAT_UNDEFINED)); - } - - /* Weights pipeline */ - ff_vk_exec_bind_shader(vkctx, exec, &s->shd_weights); + RET(ff_vk_shader_update_desc_buffer(&s->vkctx, exec, &s->shd_denoise, 1, 0, 0, + ws_vk, 0, ws_total_size * s-> opts.t, + VK_FORMAT_UNDEFINED)); + RET(ff_vk_shader_update_desc_buffer(&s->vkctx, exec, &s->shd_denoise, 1, 1, 0, + ws_vk, ws_total_size * s-> opts.t, ws_total_size * s-> opts.t, + VK_FORMAT_UNDEFINED)); do { - int wg_invoc; - HorizontalPushData pd = { + int wg_invoc = FFMIN((s->nb_offsets - offsets_dispatched)/TYPE_ELEMS, s->opts.t); + + /* Integral pipeline */ + IntegralPushData pd = { { plane_widths[0], plane_widths[1], plane_widths[2], plane_widths[3] }, { plane_heights[0], plane_heights[1], plane_heights[2], plane_heights[3] }, - { ws_stride[0], ws_stride[1], ws_stride[2], ws_stride[3] }, - { s->patch[0], s->patch[1], s->patch[2], s->patch[3] }, - { s->strength[0], s->strength[1], s->strength[2], s->strength[2], }, + { comp_offs[0], comp_offs[1], comp_offs[2], comp_offs[3] }, + { comp_planes[0], comp_planes[1], comp_planes[2], comp_planes[3] }, integral_vk->address, (uint64_t)int_size, (uint64_t)int_stride, offsets_dispatched, }; - /* Push data */ - ff_vk_shader_update_push_const(vkctx, exec, &s->shd_weights, + ff_vk_exec_bind_shader(vkctx, exec, &s->shd_vertical); + ff_vk_shader_update_push_const(vkctx, exec, &s->shd_vertical, VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(pd), &pd); - if 
(offsets_dispatched) { - nb_buf_bar = 0; - buf_bar[nb_buf_bar++] = (VkBufferMemoryBarrier2) { - .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2, - .srcStageMask = integral_vk->stage, - .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT, - .srcAccessMask = integral_vk->access, - .dstAccessMask = VK_ACCESS_2_SHADER_STORAGE_READ_BIT | - VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT, - .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, - .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, - .buffer = integral_vk->buf, - .size = integral_vk->size, - .offset = 0, - }; + nb_buf_bar = 0; + buf_bar[nb_buf_bar++] = (VkBufferMemoryBarrier2) { + .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2, + .srcStageMask = integral_vk->stage, + .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT, + .srcAccessMask = integral_vk->access, + .dstAccessMask = VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT, + .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .buffer = integral_vk->buf, + .size = integral_vk->size, + .offset = 0, + }; + vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) { + .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO, + .pBufferMemoryBarriers = buf_bar, + .bufferMemoryBarrierCount = nb_buf_bar, + }); + integral_vk->stage = buf_bar[0].dstStageMask; + integral_vk->access = buf_bar[0].dstAccessMask; - vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) { - .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO, - .pBufferMemoryBarriers = buf_bar, - .bufferMemoryBarrierCount = nb_buf_bar, - }); - integral_vk->stage = buf_bar[1].dstStageMask; - integral_vk->access = buf_bar[1].dstAccessMask; - } + /* End of vertical pass */ + vk->CmdDispatch(exec->buf, FFALIGN(vkctx->output_width, s->shd_vertical.lg_size[0])/s->shd_vertical.lg_size[0], + desc->nb_components, wg_invoc); - wg_invoc = FFMIN((s->nb_offsets - offsets_dispatched)/TYPE_ELEMS, s->opts.t); - wg_invoc = FFMIN(wg_invoc, vkctx->props.properties.limits.maxComputeWorkGroupCount[2]); + ff_vk_exec_bind_shader(vkctx, exec, &s->shd_horizontal); + ff_vk_shader_update_push_const(vkctx, exec, &s->shd_horizontal, + VK_SHADER_STAGE_COMPUTE_BIT, + 0, sizeof(pd), &pd); + + nb_buf_bar = 0; + buf_bar[nb_buf_bar++] = (VkBufferMemoryBarrier2) { + .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2, + .srcStageMask = integral_vk->stage, + .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT, + .srcAccessMask = integral_vk->access, + .dstAccessMask = VK_ACCESS_2_SHADER_STORAGE_READ_BIT | + VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT, + .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .buffer = integral_vk->buf, + .size = integral_vk->size, + .offset = 0, + }; + vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) { + .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO, + .pBufferMemoryBarriers = buf_bar, + .bufferMemoryBarrierCount = nb_buf_bar, + }); + integral_vk->stage = buf_bar[0].dstStageMask; + integral_vk->access = buf_bar[0].dstAccessMask; /* End of horizontal pass */ - vk->CmdDispatch(exec->buf, 1, 1, wg_invoc); + vk->CmdDispatch(exec->buf, FFALIGN(vkctx->output_height, s->shd_horizontal.lg_size[0])/s->shd_horizontal.lg_size[0], + desc->nb_components, wg_invoc); + + /* Weights pipeline */ + WeightsPushData wpd = { + { plane_widths[0], plane_widths[1], plane_widths[2], plane_widths[3] }, + { plane_heights[0], plane_heights[1], plane_heights[2], plane_heights[3] }, + { ws_offset[0], ws_offset[1], ws_offset[2], ws_offset[3] }, + { ws_stride[0], ws_stride[1], ws_stride[2], ws_stride[3] }, + { 
s->patch[0], s->patch[1], s->patch[2], s->patch[3] }, + { s->strength[0], s->strength[1], s->strength[2], s->strength[3], }, + { comp_offs[0], comp_offs[1], comp_offs[2], comp_offs[3] }, + { comp_planes[0], comp_planes[1], comp_planes[2], comp_planes[3] }, + integral_vk->address, + (uint64_t)int_size, + (uint64_t)int_stride, + offsets_dispatched, + ws_total_count, + }; + + ff_vk_exec_bind_shader(vkctx, exec, &s->shd_weights); + ff_vk_shader_update_push_const(vkctx, exec, &s->shd_weights, + VK_SHADER_STAGE_COMPUTE_BIT, + 0, sizeof(wpd), &wpd); + + nb_buf_bar = 0; + buf_bar[nb_buf_bar++] = (VkBufferMemoryBarrier2) { + .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2, + .srcStageMask = integral_vk->stage, + .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT, + .srcAccessMask = integral_vk->access, + .dstAccessMask = VK_ACCESS_2_SHADER_STORAGE_READ_BIT, + .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .buffer = integral_vk->buf, + .size = integral_vk->size, + .offset = 0, + }; + buf_bar[nb_buf_bar++] = (VkBufferMemoryBarrier2) { + .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2, + .srcStageMask = ws_vk->stage, + .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT, + .srcAccessMask = ws_vk->access, + .dstAccessMask = VK_ACCESS_2_SHADER_STORAGE_READ_BIT | + VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT, + .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .buffer = ws_vk->buf, + .size = ws_vk->size, + .offset = 0, + }; + vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) { + .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO, + .pBufferMemoryBarriers = buf_bar, + .bufferMemoryBarrierCount = nb_buf_bar, + }); + integral_vk->stage = buf_bar[0].dstStageMask; + integral_vk->access = buf_bar[0].dstAccessMask; + ws_vk->stage = buf_bar[1].dstStageMask; + ws_vk->access = buf_bar[1].dstAccessMask; + + /* End of weights pass */ + vk->CmdDispatch(exec->buf, + FFALIGN(vkctx->output_width, s->shd_weights.lg_size[0])/s->shd_weights.lg_size[0], + FFALIGN(vkctx->output_height, s->shd_weights.lg_size[1])/s->shd_weights.lg_size[1], + wg_invoc * desc->nb_components); offsets_dispatched += wg_invoc * TYPE_ELEMS; } while (offsets_dispatched < s->nb_offsets); - RET(denoise_pass(s, exec, ws_vk, ws_stride)); + RET(denoise_pass(s, exec, ws_vk, comp_offs, comp_planes, ws_offset, ws_stride, + ws_total_count, s->opts.t)); err = ff_vk_exec_submit(vkctx, exec); if (err < 0) @@ -1051,6 +1127,8 @@ static void nlmeans_vulkan_uninit(AVFilterContext *avctx) FFVulkanContext *vkctx = &s->vkctx; ff_vk_exec_pool_free(vkctx, &s->e); + ff_vk_shader_free(vkctx, &s->shd_horizontal); + ff_vk_shader_free(vkctx, &s->shd_vertical); ff_vk_shader_free(vkctx, &s->shd_weights); ff_vk_shader_free(vkctx, &s->shd_denoise); @@ -1071,7 +1149,7 @@ static const AVOption nlmeans_vulkan_options[] = { { "s", "denoising strength for all components", OFFSET(opts.s), AV_OPT_TYPE_DOUBLE, { .dbl = 1.0 }, 1.0, 100.0, FLAGS }, { "p", "patch size for all components", OFFSET(opts.p), AV_OPT_TYPE_INT, { .i64 = 3*2+1 }, 0, 99, FLAGS }, { "r", "research window radius", OFFSET(opts.r), AV_OPT_TYPE_INT, { .i64 = 7*2+1 }, 0, 99, FLAGS }, - { "t", "parallelism", OFFSET(opts.t), AV_OPT_TYPE_INT, { .i64 = 36 }, 1, 168, FLAGS }, + { "t", "parallelism", OFFSET(opts.t), AV_OPT_TYPE_INT, { .i64 = 8 }, 1, 64, FLAGS }, { "s1", "denoising strength for component 1", OFFSET(opts.sc[0]), AV_OPT_TYPE_DOUBLE, { .dbl = 1.0 }, 1.0, 100.0, FLAGS }, { "s2", "denoising strength for 
component 2", OFFSET(opts.sc[1]), AV_OPT_TYPE_DOUBLE, { .dbl = 1.0 }, 1.0, 100.0, FLAGS }, -- 2.49.1 _______________________________________________ ffmpeg-devel mailing list -- ffmpeg-devel@ffmpeg.org To unsubscribe send an email to ffmpeg-devel-leave@ffmpeg.org