[FFmpeg-devel] [PATCH] avfilter: add alphamerge_cuda filter

From: Jorge Estrada <jestrada.list@gmail.com>
To: ffmpeg-devel@ffmpeg.org
Cc: Jorge Estrada <jestrada.list@gmail.com>
Subject: [FFmpeg-devel] [PATCH] avfilter: add alphamerge_cuda filter
Date: Wed,  6 Aug 2025 19:53:58 -0700
Message-ID: <20250807025358.49096-1-jestrada.list@gmail.com> (raw)

This patch adds the alphamerge_cuda video filter

Example usage:
ffmpeg -f lavfi -i "color=c=red:s=1280x720:d=5,format=yuva420p" \
-f lavfi -i "color=c=black:s=1280x720:d=5,geq=lum='255*gt(W/4,hypot(X-W/2,Y-H/2))'" \
-f lavfi -i "color=c=blue:s=1280x720:d=5" \
-filter_complex \
"[0:v]hwupload_cuda[base]; \
 [1:v]hwupload_cuda[mask]; \
 [2:v]hwupload_cuda[bkgd]; \
 [base][mask]alphamerge_cuda[merged]; \
 [bkgd][merged]overlay_cuda" \
-c:v h264_nvenc -y out.mp4
---
 configure                         |   2 +
 doc/filters.texi                  |  21 ++
 libavfilter/Makefile              |   2 +
 libavfilter/allfilters.c          |   1 +
 libavfilter/version.h             |   2 +-
 libavfilter/vf_alphamerge_cuda.c  | 345 ++++++++++++++++++++++++++++++
 libavfilter/vf_alphamerge_cuda.cu |  44 ++++
 7 files changed, 416 insertions(+), 1 deletion(-)
 create mode 100644 libavfilter/vf_alphamerge_cuda.c
 create mode 100644 libavfilter/vf_alphamerge_cuda.cu

diff --git a/configure b/configure
index 30e61c5bb5..268e3c3e28 100755
--- a/configure
+++ b/configure
@@ -3349,6 +3349,8 @@ vaapi_encode_deps="vaapi"
 vulkan_encode_deps="vulkan"
 v4l2_m2m_deps="linux_videodev2_h sem_timedwait"
 
+alphamerge_cuda_filter_deps="ffnvcodec"
+alphamerge_cuda_filter_deps_any="cuda_nvcc cuda_llvm"
 bilateral_cuda_filter_deps="ffnvcodec"
 bilateral_cuda_filter_deps_any="cuda_nvcc cuda_llvm"
 chromakey_cuda_filter_deps="ffnvcodec"
diff --git a/doc/filters.texi b/doc/filters.texi
index 61ece1d000..e51e11eb28 100644
--- a/doc/filters.texi
+++ b/doc/filters.texi
@@ -26500,6 +26500,27 @@ Note: If FFmpeg detects the Nvidia CUDA Toolkit during configuration, it will en
 @item Configure FFmpeg with @code{--enable-cuda-llvm}. Additional requirement: @code{llvm} lib must be installed.
 @end itemize
 
+@subsection alphamerge_cuda
+Add or replace the alpha component of the primary input stream with the luma plane of a second input stream, using CUDA hardware acceleration.
+
+@subsubsection Examples
+@itemize
+@item
+Apply a circular alpha mask to a red video before overlaying it onto a blue background.
+@example
+ffmpeg -f lavfi -i "color=c=red:s=1280x720:d=5,format=yuva420p" \
+-f lavfi -i "color=c=black:s=1280x720:d=5,geq=lum='255*gt(W/4,hypot(X-W/2,Y-H/2))'" \
+-f lavfi -i "color=c=blue:s=1280x720:d=5" \
+-filter_complex \
+"[0:v]hwupload_cuda[base]; \
+ [1:v]hwupload_cuda[mask]; \
+ [2:v]hwupload_cuda[bkgd]; \
+ [base][mask]alphamerge_cuda[merged]; \
+ [bkgd][merged]overlay_cuda" \
+-c:v h264_nvenc -y out.mp4
+@end example
+@end itemize
+
 @subsection bilateral_cuda
 CUDA accelerated bilateral filter, an edge preserving filter.
 This filter is mathematically accurate thanks to the use of GPU acceleration.
diff --git a/libavfilter/Makefile b/libavfilter/Makefile
index 9a906bd342..3df6444f83 100644
--- a/libavfilter/Makefile
+++ b/libavfilter/Makefile
@@ -194,6 +194,8 @@ OBJS-$(CONFIG_ANULLSINK_FILTER)              += asink_anullsink.o
 OBJS-$(CONFIG_ADDROI_FILTER)                 += vf_addroi.o
 OBJS-$(CONFIG_ALPHAEXTRACT_FILTER)           += vf_extractplanes.o
 OBJS-$(CONFIG_ALPHAMERGE_FILTER)             += vf_alphamerge.o framesync.o
+OBJS-$(CONFIG_ALPHAMERGE_CUDA_FILTER)        += vf_alphamerge_cuda.o framesync.o vf_alphamerge_cuda.ptx.o \
+                                                cuda/load_helper.o
 OBJS-$(CONFIG_AMPLIFY_FILTER)                += vf_amplify.o
 OBJS-$(CONFIG_ASS_FILTER)                    += vf_subtitles.o
 OBJS-$(CONFIG_ATADENOISE_FILTER)             += vf_atadenoise.o
diff --git a/libavfilter/allfilters.c b/libavfilter/allfilters.c
index 44b4de2a14..87e9f2bdd4 100644
--- a/libavfilter/allfilters.c
+++ b/libavfilter/allfilters.c
@@ -179,6 +179,7 @@ extern const FFFilter ff_asink_anullsink;
 extern const FFFilter ff_vf_addroi;
 extern const FFFilter ff_vf_alphaextract;
 extern const FFFilter ff_vf_alphamerge;
+extern const FFFilter ff_vf_alphamerge_cuda;
 extern const FFFilter ff_vf_amplify;
 extern const FFFilter ff_vf_ass;
 extern const FFFilter ff_vf_atadenoise;
diff --git a/libavfilter/version.h b/libavfilter/version.h
index 7e0eb9af97..4d8f28e512 100644
--- a/libavfilter/version.h
+++ b/libavfilter/version.h
@@ -31,7 +31,7 @@
 
 #include "version_major.h"
 
-#define LIBAVFILTER_VERSION_MINOR   3
+#define LIBAVFILTER_VERSION_MINOR   4
 #define LIBAVFILTER_VERSION_MICRO 100
 
 
diff --git a/libavfilter/vf_alphamerge_cuda.c b/libavfilter/vf_alphamerge_cuda.c
new file mode 100644
index 0000000000..b8ab78cb2e
--- /dev/null
+++ b/libavfilter/vf_alphamerge_cuda.c
@@ -0,0 +1,345 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * Copy the luma value of the second input into the alpha channel of the first input using CUDA.
+ */
+
+#include "libavutil/internal.h"
+#include "libavutil/opt.h"
+#include "libavutil/pixdesc.h"
+#include "libavutil/hwcontext.h"
+#include "libavutil/hwcontext_cuda_internal.h"
+#include "libavutil/cuda_check.h"
+#include "libavutil/mem.h"
+
+#include "avfilter.h"
+#include "filters.h"
+#include "formats.h"
+#include "framesync.h"
+
+#include "cuda/load_helper.h"
+
+#define CHECK_CU(call) FF_CUDA_CHECK_DL(ctx, s->hwctx->internal->cuda_dl, call)
+#define DIV_UP(a, b) ( ((a) + (b) - 1) / (b) )
+
+#define BLOCK_X 32
+#define BLOCK_Y 16
+
+#define MAIN_INPUT 0
+#define ALPHA_INPUT 1
+
+#define ALPHA_PLANE_INDEX 3
+
+static const enum AVPixelFormat supported_main_formats[] = {
+    AV_PIX_FMT_YUVA444P,
+    AV_PIX_FMT_YUVA420P,
+    AV_PIX_FMT_NONE,
+};
+
+static const enum AVPixelFormat supported_alpha_mask_formats[] = {
+    AV_PIX_FMT_YUV420P,
+    AV_PIX_FMT_YUV444P,
+    AV_PIX_FMT_YUVA420P,
+    AV_PIX_FMT_YUVA444P,
+    AV_PIX_FMT_NV12,
+    AV_PIX_FMT_NONE,
+};
+
+typedef struct AlphaMergeCUDAContext {
+    const AVClass *class;
+
+    enum AVPixelFormat sw_format_main;
+    enum AVPixelFormat sw_format_alpha_mask;
+
+    AVBufferRef *hw_device_ctx;
+    AVCUDADeviceContext *hwctx;
+
+    CUcontext cu_ctx;
+    CUmodule cu_module;
+    CUfunction cu_func_alphamerge_planar;
+    CUstream cu_stream;
+
+    FFFrameSync fs;
+
+    int alpha_plane_idx;
+
+} AlphaMergeCUDAContext;
+
+
+static int format_is_supported(const enum AVPixelFormat supported_formats[], enum AVPixelFormat fmt)
+{
+    for (int i = 0; supported_formats[i] != AV_PIX_FMT_NONE; i++)
+        if (supported_formats[i] == fmt)
+            return 1;
+    return 0;
+}
+
+static int query_formats(const AVFilterContext *ctx,
+                         AVFilterFormatsConfig **cfg_in,
+                         AVFilterFormatsConfig **cfg_out)
+{
+    static const int pix_fmts[] = { AV_PIX_FMT_CUDA, AV_PIX_FMT_NONE };
+    static const int alpha_mask_ranges[] = { AVCOL_RANGE_JPEG };
+    AVFilterFormats *formats = NULL;
+    int ret = 0;
+
+    formats = ff_make_format_list(pix_fmts);
+    if (!formats)
+        return AVERROR(ENOMEM);
+
+    if ((ret = ff_set_common_formats2(ctx, cfg_in, cfg_out, ff_make_format_list(pix_fmts))) < 0)
+        return ret;
+
+    formats = ff_make_format_list(alpha_mask_ranges);
+    if (!formats)
+        return AVERROR(ENOMEM);
+
+    ret = ff_formats_ref(formats, &cfg_in[ALPHA_INPUT]->color_ranges);
+    ff_formats_unref(&formats);
+    if (ret < 0)
+        return ret;
+
+    return 0;
+}
+
+static int do_alphamerge_cuda(FFFrameSync *fs)
+{
+    AVFilterContext *ctx = fs->parent;
+    AlphaMergeCUDAContext *s = ctx->priv;
+    AVFilterLink *outlink = ctx->outputs[0];
+    AVFrame *main_frame = NULL;
+    AVFrame *alpha_mask_frame = NULL;
+    CudaFunctions *cu = s->hwctx->internal->cuda_dl;
+    CUcontext dummy_cu_ctx;
+    int ret;
+
+    ret = ff_framesync_dualinput_get_writable(fs, &main_frame, &alpha_mask_frame);
+    if (ret < 0)
+        return ret;
+
+    if (!alpha_mask_frame)
+        return ff_filter_frame(outlink, main_frame);
+
+    ret = CHECK_CU(cu->cuCtxPushCurrent(s->cu_ctx));
+    if (ret < 0)
+        return ret;
+
+    void *kernel_args[] = {
+        &main_frame->data[s->alpha_plane_idx],
+        &main_frame->linesize[s->alpha_plane_idx],
+        &alpha_mask_frame->data[0],
+        &alpha_mask_frame->linesize[0],
+        &main_frame->width,
+        &main_frame->height
+    };
+    unsigned int grid_x = DIV_UP(main_frame->width, BLOCK_X);
+    unsigned int grid_y = DIV_UP(main_frame->height, BLOCK_Y);
+
+    ret = CHECK_CU(cu->cuLaunchKernel(s->cu_func_alphamerge_planar, grid_x, grid_y, 1,
+                                      BLOCK_X, BLOCK_Y, 1,
+                                      0, s->cu_stream, kernel_args, NULL));
+
+    CHECK_CU(cu->cuCtxPopCurrent(&dummy_cu_ctx));
+
+    if (ret < 0) {
+        av_log(ctx, AV_LOG_ERROR, "Failed to launch CUDA kernel\n");
+        return ret;
+    }
+
+    return ff_filter_frame(outlink, main_frame);
+}
+
+
+static int alphamerge_cuda_config_output(AVFilterLink *outlink)
+{
+    AVFilterContext *ctx = outlink->src;
+    AlphaMergeCUDAContext *s = ctx->priv;
+
+    AVFilterLink *main_inlink = ctx->inputs[MAIN_INPUT];
+    AVFilterLink *alpha_inlink = ctx->inputs[ALPHA_INPUT];
+
+    FilterLink *main_inl = ff_filter_link(main_inlink);
+    FilterLink *alpha_inl = ff_filter_link(alpha_inlink);
+
+    AVHWFramesContext *main_frames_ctx = (AVHWFramesContext*)main_inl->hw_frames_ctx->data;
+    AVHWFramesContext *alpha_frames_ctx = (AVHWFramesContext*)alpha_inl->hw_frames_ctx->data;
+
+    const AVPixFmtDescriptor *main_desc;
+    CUcontext dummy_cu_ctx;
+    CudaFunctions *cu;
+    int ret = 0;
+
+    extern const unsigned char ff_vf_alphamerge_cuda_ptx_data[];
+    extern const unsigned int ff_vf_alphamerge_cuda_ptx_len;
+
+    s->sw_format_main = main_frames_ctx->sw_format;
+    if (!format_is_supported(supported_main_formats, s->sw_format_main)) {
+        av_log(ctx, AV_LOG_ERROR, "Unsupported main input software pixel format: %s\n",
+               av_get_pix_fmt_name(s->sw_format_main));
+        return AVERROR(ENOSYS);
+    }
+
+    s->sw_format_alpha_mask = alpha_frames_ctx->sw_format;
+    if (!format_is_supported(supported_alpha_mask_formats, s->sw_format_alpha_mask)) {
+        av_log(ctx, AV_LOG_ERROR, "Unsupported alpha mask input software pixel format: %s.\n",
+               av_get_pix_fmt_name(s->sw_format_alpha_mask));
+        return AVERROR(ENOSYS);
+    }
+
+    if (main_inlink->w != alpha_inlink->w || main_inlink->h != alpha_inlink->h) {
+        av_log(ctx, AV_LOG_ERROR, "Input frame sizes do not match (%dx%d vs %dx%d).\n",
+               main_inlink->w, main_inlink->h, alpha_inlink->w, alpha_inlink->h);
+        return AVERROR(EINVAL);
+    }
+
+    s->hw_device_ctx = av_buffer_ref(main_frames_ctx->device_ref);
+    if (!s->hw_device_ctx)
+        return AVERROR(ENOMEM);
+
+    s->hwctx = ((AVHWDeviceContext*)s->hw_device_ctx->data)->hwctx;
+    s->cu_ctx = s->hwctx->cuda_ctx;
+    s->cu_stream = s->hwctx->stream;
+    cu = s->hwctx->internal->cuda_dl;
+
+    ret = CHECK_CU(cu->cuCtxPushCurrent(s->cu_ctx));
+    if (ret < 0)
+        return ret;
+
+    ret = ff_cuda_load_module(ctx, s->hwctx, &s->cu_module,
+                              ff_vf_alphamerge_cuda_ptx_data, ff_vf_alphamerge_cuda_ptx_len);
+    if (ret < 0) {
+        av_log(ctx, AV_LOG_ERROR, "Failed to load CUDA module.\n");
+        goto end;
+    }
+
+    ret = CHECK_CU(cu->cuModuleGetFunction(&s->cu_func_alphamerge_planar, s->cu_module, "alphamerge_planar"));
+    if (ret < 0) {
+        av_log(ctx, AV_LOG_ERROR, "Failed to get kernel function 'alphamerge_planar'.\n");
+        goto end;
+    }
+
+    main_desc = av_pix_fmt_desc_get(s->sw_format_main);
+    if (!main_desc || !(main_desc->flags & AV_PIX_FMT_FLAG_ALPHA)) {
+        av_log(ctx, AV_LOG_ERROR, "Main input sw_format %s is not a supported format with an alpha channel.\n",
+               av_get_pix_fmt_name(s->sw_format_main));
+        ret = AVERROR(EINVAL);
+        goto end;
+    }
+    s->alpha_plane_idx = main_desc->comp[ALPHA_PLANE_INDEX].plane;
+
+    ff_filter_link(outlink)->hw_frames_ctx = av_buffer_ref(main_inl->hw_frames_ctx);
+    if (!ff_filter_link(outlink)->hw_frames_ctx) {
+        ret = AVERROR(ENOMEM);
+        goto end;
+    }
+
+    s->fs.time_base = main_inlink->time_base;
+    if ((ret = ff_framesync_init_dualinput(&s->fs, ctx)) < 0)
+        goto end;
+
+    outlink->w = main_inlink->w;
+    outlink->h = main_inlink->h;
+    outlink->time_base = main_inlink->time_base;
+    outlink->sample_aspect_ratio = main_inlink->sample_aspect_ratio;
+    ff_filter_link(outlink)->frame_rate = ff_filter_link(main_inlink)->frame_rate;
+
+    ret = ff_framesync_configure(&s->fs);
+
+end:
+    CHECK_CU(cu->cuCtxPopCurrent(&dummy_cu_ctx));
+    return ret;
+}
+
+static av_cold int alphamerge_cuda_init(AVFilterContext *ctx)
+{
+    AlphaMergeCUDAContext *s = ctx->priv;
+    s->fs.on_event = &do_alphamerge_cuda;
+    return 0;
+}
+
+static av_cold void alphamerge_cuda_uninit(AVFilterContext *ctx)
+{
+    AlphaMergeCUDAContext *s = ctx->priv;
+    CUcontext dummy;
+
+    ff_framesync_uninit(&s->fs);
+
+    if (s->cu_module) {
+        CudaFunctions *cu = s->hwctx->internal->cuda_dl;
+        CHECK_CU(cu->cuCtxPushCurrent(s->cu_ctx));
+
+        if (s->cu_stream)
+            CHECK_CU(cu->cuStreamSynchronize(s->cu_stream));
+
+        CHECK_CU(cu->cuModuleUnload(s->cu_module));
+        CHECK_CU(cu->cuCtxPopCurrent(&dummy));
+    }
+
+    av_buffer_unref(&s->hw_device_ctx);
+}
+
+static int alphamerge_cuda_activate(AVFilterContext *ctx)
+{
+    AlphaMergeCUDAContext *s = ctx->priv;
+    return ff_framesync_activate(&s->fs);
+}
+
+static const AVFilterPad alphamerge_cuda_inputs[] = {
+    {
+        .name = "main",
+        .type = AVMEDIA_TYPE_VIDEO,
+    },
+    {
+        .name = "alpha",
+        .type = AVMEDIA_TYPE_VIDEO,
+    }
+};
+
+static const AVFilterPad alphamerge_cuda_outputs[] = {
+    {
+        .name = "default",
+        .type = AVMEDIA_TYPE_VIDEO,
+        .config_props = &alphamerge_cuda_config_output,
+    }
+};
+
+static const AVOption alphamerge_cuda_options[] = {
+    { NULL },
+};
+
+FRAMESYNC_DEFINE_CLASS(alphamerge_cuda, AlphaMergeCUDAContext, fs);
+
+const FFFilter ff_vf_alphamerge_cuda = {
+    .p.name          = "alphamerge_cuda",
+    .p.description   = NULL_IF_CONFIG_SMALL("Copy the luma value of the second input into the alpha channel of the first input using CUDA."),
+
+    .priv_size     = sizeof(AlphaMergeCUDAContext),
+    .p.priv_class    = &alphamerge_cuda_class,
+
+    .init          = &alphamerge_cuda_init,
+    .uninit        = &alphamerge_cuda_uninit,
+
+    .activate      = &alphamerge_cuda_activate,
+    FILTER_INPUTS(alphamerge_cuda_inputs),
+    FILTER_OUTPUTS(alphamerge_cuda_outputs),
+    FILTER_QUERY_FUNC2(query_formats),
+    .preinit       = alphamerge_cuda_framesync_preinit,
+    .flags_internal = FF_FILTER_FLAG_HWFRAME_AWARE,
+};
\ No newline at end of file
diff --git a/libavfilter/vf_alphamerge_cuda.cu b/libavfilter/vf_alphamerge_cuda.cu
new file mode 100644
index 0000000000..99a9dc91ec
--- /dev/null
+++ b/libavfilter/vf_alphamerge_cuda.cu
@@ -0,0 +1,44 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+
+template <typename T>
+__device__ void alphamerge_impl(T *dst, int dst_pitch,
+                                const T *src, int src_pitch,
+                                int width, int height)
+{
+    const int x = blockIdx.x * blockDim.x + threadIdx.x;
+    const int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+    if (x < width && y < height) {
+        dst[y * dst_pitch + x] = src[y * src_pitch + x];
+    }
+}
+
+extern "C" {
+    __global__ void alphamerge_planar(unsigned char* main_alpha_plane,
+                                      int main_alpha_linesize,
+                                      const unsigned char* alpha_mask_luma_plane,
+                                      int alpha_mask_luma_linesize,
+                                      int width, int height)
+    {
+        alphamerge_impl<unsigned char>(main_alpha_plane, main_alpha_linesize,
+                                       alpha_mask_luma_plane, alpha_mask_luma_linesize,
+                                       width, height);
+    }
+}
\ No newline at end of file
-- 
2.34.1

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".