From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from ffbox0-bg.mplayerhq.hu (ffbox0-bg.ffmpeg.org [79.124.17.100]) by master.gitmailbox.com (Postfix) with ESMTP id 0E8D843959 for ; Sun, 3 Jul 2022 16:07:44 +0000 (UTC) Received: from [127.0.1.1] (localhost [127.0.0.1]) by ffbox0-bg.mplayerhq.hu (Postfix) with ESMTP id 1BC9768B575; Sun, 3 Jul 2022 19:07:42 +0300 (EEST) Received: from btbn.de (btbn.de [136.243.74.85]) by ffbox0-bg.mplayerhq.hu (Postfix) with ESMTPS id 4066D68B575 for ; Sun, 3 Jul 2022 19:07:35 +0300 (EEST) Received: from [authenticated] by btbn.de (Postfix) with ESMTPSA id CAB0935661C; Sun, 3 Jul 2022 18:07:34 +0200 (CEST) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=rothenpieler.org; s=mail; t=1656864454; bh=WfEaBeOpdW0k8eXmt0sakBSnP+q9bcmg2lzVQEdd2eg=; h=Date:Subject:To:References:From:Cc:In-Reply-To; b=b85cYco2PwqzhWtA2yb592Lsb5ybcDfG70ynb4Tt4mf/5N/ga8PyyI9TCduU/VfjL kQKJKaGtEZ/PhXXveBk03NLn9ognE+MZTQtjKC6kbBD3/G2eqbOSEbC5ygSPbi2Y7B L2VoKxw718jpXrmP59XUnzNAI+IpP3OLqCH03eElmwawA66O6TdjdgRjaHrfxgcAtq rUo4MgXW7dc2gNmPwyX/lm/PwecFlL6NPQtD4ap0jzE7UIkjMOCSnqr01aq7GnHt9R 8iC9qk2QhszBG26MVxeoLANLQVCp99oYeP/4jrPlzJCc37Y8VS3Ys+SODuB0IW5saw yg26TpnwNf1Aw== Message-ID: <7fd0398d-78eb-3f43-a6fd-fb3240781922@rothenpieler.org> Date: Sun, 3 Jul 2022 18:07:34 +0200 MIME-Version: 1.0 User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:102.0) Gecko/20100101 Thunderbird/102.0 Content-Language: en-US To: FFmpeg development discussions and patches References: From: Timo Rothenpieler In-Reply-To: Subject: Re: [FFmpeg-devel] [GSoC'22] Added Chromakey CUDA filter X-BeenThere: ffmpeg-devel@ffmpeg.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: FFmpeg development discussions and patches List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Reply-To: FFmpeg development discussions and patches Cc: mohamed Elhadidy Content-Transfer-Encoding: 7bit Content-Type: text/plain; charset="us-ascii"; Format="flowed" Errors-To: ffmpeg-devel-bounces@ffmpeg.org Sender: "ffmpeg-devel" Archived-At: List-Archive: List-Post: On 30.06.2022 02:01, mohamed Elhadidy wrote: > GSoC'22 GPU accelerated video filters > Added CUDA chromakeyfilter > libavfilter/vf_chromakey_cuda.cu:the CUDA kernel for the filter > libavfilter/vf_chromakey_cuda.c: the C side that calls the kernel and gets user input > libavfilter/allfilters.c: added the filter to it > libavfilter/Makefile: added the filter to it > cuda/cuda_runtime.h: added two math CUDA functions that are used in the filter > --- > compat/cuda/cuda_runtime.h | 2 + > libavfilter/Makefile | 2 + > libavfilter/allfilters.c | 1 + > libavfilter/vf_chromakey_cuda.c | 520 +++++++++++++++++++++++++++++++ > libavfilter/vf_chromakey_cuda.cu | 248 +++++++++++++++ > 5 files changed, 773 insertions(+) > create mode 100644 libavfilter/vf_chromakey_cuda.c > create mode 100644 libavfilter/vf_chromakey_cuda.cu > > diff --git a/compat/cuda/cuda_runtime.h b/compat/cuda/cuda_runtime.h > index 30cd085e48..5837c1ad37 100644 > --- a/compat/cuda/cuda_runtime.h > +++ b/compat/cuda/cuda_runtime.h > @@ -181,7 +181,9 @@ static inline __device__ double trunc(double a) { return __builtin_trunc(a); } > static inline __device__ float fabsf(float a) { return __builtin_fabsf(a); } > static inline __device__ float fabs(float a) { return __builtin_fabsf(a); } > static inline __device__ double fabs(double a) { return __builtin_fabs(a); } > +static inline __device__ float sqrtf(float a) { return __builtin_sqrtf(a); } > > +static inline __device__ float __saturatef(float a) { return __nvvm_saturate_f(a); } > static inline __device__ float __sinf(float a) { return __nvvm_sin_approx_f(a); } > static inline __device__ float __cosf(float a) { return __nvvm_cos_approx_f(a); } > static inline __device__ float __expf(float a) { return __nvvm_ex2_approx_f(a * (float)__builtin_log2(__builtin_exp(1))); } > diff --git a/libavfilter/Makefile b/libavfilter/Makefile > index 22b0a0ca15..8aee10fc76 100644 > --- a/libavfilter/Makefile > +++ b/libavfilter/Makefile > @@ -210,6 +210,8 @@ OBJS-$(CONFIG_CAS_FILTER) += vf_cas.o > OBJS-$(CONFIG_CHROMABER_VULKAN_FILTER) += vf_chromaber_vulkan.o vulkan.o vulkan_filter.o > OBJS-$(CONFIG_CHROMAHOLD_FILTER) += vf_chromakey.o > OBJS-$(CONFIG_CHROMAKEY_FILTER) += vf_chromakey.o > +OBJS-$(CONFIG_CHROMAKEY_CUDA_FILTER) += vf_chromakey_cuda.o vf_chromakey_cuda.ptx.o framesync.o This doesn't use framesync at all, does it? Only the overlay filter does. > OBJS-$(CONFIG_CHROMANR_FILTER) += vf_chromanr.o > OBJS-$(CONFIG_CHROMASHIFT_FILTER) += vf_chromashift.o > OBJS-$(CONFIG_CIESCOPE_FILTER) += vf_ciescope.o > diff --git a/libavfilter/allfilters.c b/libavfilter/allfilters.c > index ec70feef11..da1a96b23c 100644 > --- a/libavfilter/allfilters.c > +++ b/libavfilter/allfilters.c > @@ -195,6 +195,7 @@ extern const AVFilter ff_vf_cas; > extern const AVFilter ff_vf_chromaber_vulkan; > extern const AVFilter ff_vf_chromahold; > extern const AVFilter ff_vf_chromakey; > +extern const AVFilter ff_vf_chromakey_cuda; > extern const AVFilter ff_vf_chromanr; > extern const AVFilter ff_vf_chromashift; > extern const AVFilter ff_vf_ciescope; > diff --git a/libavfilter/vf_chromakey_cuda.c b/libavfilter/vf_chromakey_cuda.c > new file mode 100644 > index 0000000000..822488d436 > --- /dev/null > +++ b/libavfilter/vf_chromakey_cuda.c > @@ -0,0 +1,520 @@ > +/* > +* Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved. This probably should be changed, there's basically none of the original code left. > +* Permission is hereby granted, free of charge, to any person obtaining a > +* copy of this software and associated documentation files (the "Software"), > +* to deal in the Software without restriction, including without limitation > +* the rights to use, copy, modify, merge, publish, distribute, sublicense, > +* and/or sell copies of the Software, and to permit persons to whom the > +* Software is furnished to do so, subject to the following conditions: > +* > +* The above copyright notice and this permission notice shall be included in > +* all copies or substantial portions of the Software. > +* > +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR > +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, > +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL > +* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER > +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING > +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER > +* DEALINGS IN THE SOFTWARE. > +*/ > + > +#include > +#include > +#include > + > +#include "libavutil/avstring.h" > +#include "libavutil/common.h" > +#include "libavutil/hwcontext.h" > +#include "libavutil/hwcontext_cuda_internal.h" > +#include "libavutil/cuda_check.h" > +#include "libavutil/internal.h" > +#include "libavutil/opt.h" > +#include "libavutil/pixdesc.h" > + > +#include "avfilter.h" > +#include "formats.h" > +#include "internal.h" > +#include "video.h" > + > +#include "cuda/load_helper.h" > + > + > + > +#define FIXNUM(x) lrint((x) * (1 << 10)) > +#define RGB_TO_U(rgb) (((- FIXNUM(0.16874) * rgb[0] - FIXNUM(0.33126) * rgb[1] + FIXNUM(0.50000) * rgb[2] + (1 << 9) - 1) >> 10) + 128) > +#define RGB_TO_V(rgb) ((( FIXNUM(0.50000) * rgb[0] - FIXNUM(0.41869) * rgb[1] - FIXNUM(0.08131) * rgb[2] + (1 << 9) - 1) >> 10) + 128) > + > + > +static const enum AVPixelFormat supported_formats[] = { > + AV_PIX_FMT_YUV420P, > + AV_PIX_FMT_NV12, > + AV_PIX_FMT_YUV444P, > + AV_PIX_FMT_YUVA420P > + spurious empty line > +}; Does the filter actually support yuv444p? Looking at the kernels, it seems to be hardcoded for 4:2:0 subsampling. Should be easy to adapt though, see comment in Kernel below. > +#define DIV_UP(a, b) ( ((a) + (b) - 1) / (b) ) > +#define BLOCKX 32 > +#define BLOCKY 16 > + > +#define CHECK_CU(x) FF_CUDA_CHECK_DL(ctx, s->hwctx->internal->cuda_dl, x) > + > + > +typedef struct ChromakeyCUDAContext { > + const AVClass *class; > + > + AVCUDADeviceContext *hwctx; > + > + enum AVPixelFormat in_fmt, out_fmt; > + const AVPixFmtDescriptor *in_desc, *out_desc; > + int in_planes, out_planes; > + int in_plane_depths[4]; > + int in_plane_channels[4]; > + > + > + uint8_t chromakey_rgba[4]; > + uint16_t chromakey_uv[2]; > + int is_yuv; > + float similarity; > + float blend; > + > + > + Superfluous empty lines here and above in the struct. > + AVBufferRef *frames_ctx; > + AVFrame *frame; > + > + AVFrame *tmp_frame; > + > + char *w_expr; ///< width expression string > + char *h_expr; ///< height expression string Comments can probably go, given no other elements here have them. > + CUcontext cu_ctx; > + CUmodule cu_module; > + CUfunction cu_func; > + CUfunction cu_func_uv; > + CUstream cu_stream; > +} ChromakeyCUDAContext; > + > +static av_cold int cudachromakey_init(AVFilterContext *ctx) > +{ > + ChromakeyCUDAContext *s = ctx->priv; > + > + s->frame = av_frame_alloc(); > + if (!s->frame) > + return AVERROR(ENOMEM); > + > + s->tmp_frame = av_frame_alloc(); > + if (!s->tmp_frame) > + return AVERROR(ENOMEM); > + > + return 0; > +} > + > +static av_cold void cudachromakey_uninit(AVFilterContext *ctx) > +{ > + ChromakeyCUDAContext *s = ctx->priv; > + > + if (s->hwctx && s->cu_module) { > + CudaFunctions *cu = s->hwctx->internal->cuda_dl; > + CUcontext context; > + > + CHECK_CU(cu->cuCtxPushCurrent(s->hwctx->cuda_ctx)); > + CHECK_CU(cu->cuModuleUnload(s->cu_module)); > + s->cu_module = NULL; > + CHECK_CU(cu->cuCtxPopCurrent(&context)); > + } > + > + av_frame_free(&s->frame); > + av_buffer_unref(&s->frames_ctx); > + av_frame_free(&s->tmp_frame); > +} > + > +static av_cold int init_hwframe_ctx(ChromakeyCUDAContext *s, AVBufferRef *device_ctx, int width, int height) > +{ > + AVBufferRef *out_ref = NULL; > + AVHWFramesContext *out_ctx; > + int ret; > + > + out_ref = av_hwframe_ctx_alloc(device_ctx); > + if (!out_ref) > + return AVERROR(ENOMEM); > + out_ctx = (AVHWFramesContext*)out_ref->data; > + > + out_ctx->format = AV_PIX_FMT_CUDA; > + out_ctx->sw_format = s->out_fmt; > + out_ctx->width = width; > + out_ctx->height = height; > + > + ret = av_hwframe_ctx_init(out_ref); > + if (ret < 0) > + goto fail; > + > + av_frame_unref(s->frame); > + ret = av_hwframe_get_buffer(out_ref, s->frame, 0); > + if (ret < 0) > + goto fail; > + > + av_buffer_unref(&s->frames_ctx); > + s->frames_ctx = out_ref; > + > + return 0; > +fail: > + av_buffer_unref(&out_ref); > + return ret; > +} > + > +static int format_is_supported(enum AVPixelFormat fmt) > +{ > + int i; > + > + for (i = 0; i < FF_ARRAY_ELEMS(supported_formats); i++) > + if (supported_formats[i] == fmt) > + return 1; > + return 0; > +} > + > +static av_cold void set_format_info(AVFilterContext *ctx, enum AVPixelFormat in_format, enum AVPixelFormat out_format) > +{ > + ChromakeyCUDAContext *s = ctx->priv; > + int i, p, d; > + > + s->in_fmt = in_format; > + s->out_fmt = out_format; > + > + s->in_desc = av_pix_fmt_desc_get(s->in_fmt); > + s->out_desc = av_pix_fmt_desc_get(s->out_fmt); > + s->in_planes = av_pix_fmt_count_planes(s->in_fmt); > + s->out_planes = av_pix_fmt_count_planes(s->out_fmt); > + > + // find maximum step of each component of each plane > + // For our subset of formats, this should accurately tell us how many channels CUDA needs > + // i.e. 1 for Y plane, 2 for UV plane of NV12, 4 for single plane of RGB0 formats > + > + for (i = 0; i < s->in_desc->nb_components; i++) { > + d = (s->in_desc->comp[i].depth + 7) / 8; > + p = s->in_desc->comp[i].plane; > + s->in_plane_channels[p] = FFMAX(s->in_plane_channels[p], s->in_desc->comp[i].step / d); > + > + s->in_plane_depths[p] = s->in_desc->comp[i].depth; > + } > +} > + > +static av_cold int init_processing_chain(AVFilterContext *ctx, int width, int height) > +{ > + ChromakeyCUDAContext *s = ctx->priv; > + AVHWFramesContext *in_frames_ctx; > + int ret; > + > + /* check that we have a hw context */ > + if (!ctx->inputs[0]->hw_frames_ctx) { > + av_log(ctx, AV_LOG_ERROR, "No hw context provided on input\n"); > + return AVERROR(EINVAL); > + } > + in_frames_ctx = (AVHWFramesContext*)ctx->inputs[0]->hw_frames_ctx->data; > + > + if (!format_is_supported(in_frames_ctx->sw_format)) { > + av_log(ctx, AV_LOG_ERROR, "Unsupported format: %s\n", av_get_pix_fmt_name(in_frames_ctx->sw_format)); > + return AVERROR(ENOSYS); > + } > + > + set_format_info(ctx, in_frames_ctx->sw_format,AV_PIX_FMT_YUVA420P ); > + > + ret = init_hwframe_ctx(s, in_frames_ctx->device_ref, width, height); > + if (ret < 0) > + return ret; > + > + ctx->outputs[0]->hw_frames_ctx = av_buffer_ref(s->frames_ctx); > + if (!ctx->outputs[0]->hw_frames_ctx) > + return AVERROR(ENOMEM); > + > + return 0; > +} > + > +static av_cold int cudachromakey_load_functions(AVFilterContext *ctx) > +{ > + ChromakeyCUDAContext *s = ctx->priv; > + CUcontext context, cuda_ctx = s->hwctx->cuda_ctx; > + CudaFunctions *cu = s->hwctx->internal->cuda_dl; > + int ret; > + > + extern const unsigned char ff_vf_chromakey_cuda_ptx_data[]; > + extern const unsigned int ff_vf_chromakey_cuda_ptx_len; > + > + ret = CHECK_CU(cu->cuCtxPushCurrent(cuda_ctx)); > + if (ret < 0) > + return ret; > + > + ret = ff_cuda_load_module(ctx, s->hwctx, &s->cu_module, > + ff_vf_chromakey_cuda_ptx_data, ff_vf_chromakey_cuda_ptx_len); > + if (ret < 0) > + goto fail; > + > + ret = CHECK_CU(cu->cuModuleGetFunction(&s->cu_func, s->cu_module, "Process_uchar")); > + if (ret < 0) { > + av_log(ctx, AV_LOG_FATAL, "Failed loading Process_uchar\n"); > + goto fail; > + } > + > + ret = CHECK_CU(cu->cuModuleGetFunction(&s->cu_func_uv, s->cu_module, "Process_uchar2")); > + if (ret < 0) { > + av_log(ctx, AV_LOG_FATAL, "Failed loading Process_uchar2\n"); > + goto fail; > + } > + > +fail: > + CHECK_CU(cu->cuCtxPopCurrent(&context)); > + > + return ret; > +} > + > +static av_cold int cudachromakey_config_props(AVFilterLink *outlink) > +{ > + AVFilterContext *ctx = outlink->src; > + AVFilterLink *inlink = outlink->src->inputs[0]; > + ChromakeyCUDAContext *s = ctx->priv; > + AVHWFramesContext *frames_ctx = (AVHWFramesContext*)inlink->hw_frames_ctx->data; > + AVCUDADeviceContext *device_hwctx = frames_ctx->device_ctx->hwctx; > + int ret; > + > + s->hwctx = device_hwctx; > + s->cu_stream = s->hwctx->stream; > + > + ret = init_processing_chain(ctx, inlink->w, inlink->h); > + if (ret < 0) > + return ret; > + > + outlink->sample_aspect_ratio = inlink->sample_aspect_ratio; > + > + ret = cudachromakey_load_functions(ctx); > + if (ret < 0) > + return ret; > + > + return 0; > +} > + > +static int call_cuda_kernel(AVFilterContext *ctx, CUfunction func, > + CUtexObject src_tex[3], AVFrame *out_frame, > + int width, int height, int pitch, > + int width_uv, int height_uv, int pitch_uv, > + float u_key,float v_key, float similarity, > + float blend ) > +{ > + ChromakeyCUDAContext *s = ctx->priv; > + CudaFunctions *cu = s->hwctx->internal->cuda_dl; > + int ret; > + > + CUdeviceptr dst_devptr[4] = { > + (CUdeviceptr)out_frame->data[0], (CUdeviceptr)out_frame->data[1], > + (CUdeviceptr)out_frame->data[2], (CUdeviceptr)out_frame->data[3] > + > + }; > + > + void *args_uchar[] = { > + &src_tex[0], &src_tex[1], &src_tex[2], > + &dst_devptr[0], &dst_devptr[1], &dst_devptr[2],&dst_devptr[3], > + &width, &height, &pitch, > + &width_uv, &height_uv, &pitch_uv,&u_key,&v_key,&similarity,&blend > + }; > + > + ret = CHECK_CU(cu->cuLaunchKernel(func, > + DIV_UP(width, BLOCKX), DIV_UP(height, BLOCKY), 1, > + BLOCKX, BLOCKY, 1, 0, s->cu_stream, args_uchar, NULL)); > + if (ret < 0) > + return ret; > + > + // This ruins performance, but catches errors that happened in the Kernel immediately > + // Remove for performance/production test > + ret = CHECK_CU(cu->cuStreamSynchronize(s->cu_stream)); > + > + return ret; > +} > + > +static int cudachromakey_process_internal(AVFilterContext *ctx, > + AVFrame *out, AVFrame *in) > +{ > + ChromakeyCUDAContext *s = ctx->priv; > + CudaFunctions *cu = s->hwctx->internal->cuda_dl; > + CUcontext context, cuda_ctx = s->hwctx->cuda_ctx; > + float u_key,v_key; > + int i, ret; > + > + CUtexObject tex[3] = { 0, 0, 0 }; > + if(s->is_yuv){ > + u_key=s->chromakey_rgba[1]; > + v_key=s->chromakey_rgba[2]; > + > + }else{ > + u_key = RGB_TO_U(s->chromakey_rgba); > + v_key = RGB_TO_V(s->chromakey_rgba); > + } > + ret = CHECK_CU(cu->cuCtxPushCurrent(cuda_ctx)); > + if (ret < 0) > + return ret; > + > + for (i = 0; i < s->in_planes; i++) { > + CUDA_TEXTURE_DESC tex_desc = { > + .filterMode = CU_TR_FILTER_MODE_LINEAR, > + .flags = 0, // CU_TRSF_READ_AS_INTEGER to get raw ints instead of normalized floats from tex2D > + }; > + > + CUDA_RESOURCE_DESC res_desc = { > + .resType = CU_RESOURCE_TYPE_PITCH2D, > + .res.pitch2D.format = CU_AD_FORMAT_UNSIGNED_INT8, > + .res.pitch2D.numChannels = s->in_plane_channels[i], > + .res.pitch2D.pitchInBytes = in->linesize[i], > + .res.pitch2D.devPtr = (CUdeviceptr)in->data[i], > + }; > + > + if (i == 1 || i == 2) { > + res_desc.res.pitch2D.width = AV_CEIL_RSHIFT(in->width, s->in_desc->log2_chroma_w); > + res_desc.res.pitch2D.height = AV_CEIL_RSHIFT(in->height, s->in_desc->log2_chroma_h); > + } else { > + res_desc.res.pitch2D.width = in->width; > + res_desc.res.pitch2D.height = in->height; > + } > + > + ret = CHECK_CU(cu->cuTexObjectCreate(&tex[i], &res_desc, &tex_desc, NULL)); > + if (ret < 0) > + goto exit; > + } > + > + ret = call_cuda_kernel(ctx, (s->in_plane_channels[1] > 1) ? s->cu_func_uv : s->cu_func, > + tex, out, > + out->width, out->height, out->linesize[0], > + AV_CEIL_RSHIFT(out->width, s->out_desc->log2_chroma_w), > + AV_CEIL_RSHIFT(out->height, s->out_desc->log2_chroma_h), > + out->linesize[1], > + u_key,v_key,s->similarity,s->blend ); extra whitespace before ) > + if (ret < 0) > + goto exit; > + > +exit: > + for (i = 0; i < s->in_planes; i++) > + if (tex[i]) > + CHECK_CU(cu->cuTexObjectDestroy(tex[i])); > + > + CHECK_CU(cu->cuCtxPopCurrent(&context)); > + > + return ret; > +} > + > +static int cudachromakey_process(AVFilterContext *ctx, AVFrame *out, AVFrame *in) > +{ > + ChromakeyCUDAContext *s = ctx->priv; > + AVFrame *src = in; > + int ret; > + > + ret = cudachromakey_process_internal(ctx, s->frame, src); > + if (ret < 0) > + return ret; > + > + src = s->frame; > + ret = av_hwframe_get_buffer(src->hw_frames_ctx, s->tmp_frame, 0); > + if (ret < 0) > + return ret; > + > + av_frame_move_ref(out, s->frame); > + av_frame_move_ref(s->frame, s->tmp_frame); > + > + ret = av_frame_copy_props(out, in); > + if (ret < 0) > + return ret; > + > + return 0; > +} > + > +static int cudachromakey_filter_frame(AVFilterLink *link, AVFrame *in) > +{ > + AVFilterContext *ctx = link->dst; > + ChromakeyCUDAContext *s = ctx->priv; the '=' got out of alignment with the others > + AVFilterLink *outlink = ctx->outputs[0]; > + CudaFunctions *cu = s->hwctx->internal->cuda_dl; > + > + AVFrame *out = NULL; > + CUcontext context; > + int ret = 0; > + > + out = av_frame_alloc(); > + if (!out) { > + ret = AVERROR(ENOMEM); > + goto fail; > + } > + > + ret = CHECK_CU(cu->cuCtxPushCurrent(s->hwctx->cuda_ctx)); > + if (ret < 0) > + goto fail; > + > + ret = cudachromakey_process(ctx, out, in); > + > + CHECK_CU(cu->cuCtxPopCurrent(&context)); > + if (ret < 0) > + goto fail; > + > + av_frame_free(&in); > + return ff_filter_frame(outlink, out); > +fail: > + av_frame_free(&in); > + av_frame_free(&out); > + return ret; > +} > + > +static AVFrame *cudachromakey_get_video_buffer(AVFilterLink *inlink, int w, int h) > +{ > + return ff_default_get_video_buffer(inlink, w, h); > +} > + > +#define OFFSET(x) offsetof(ChromakeyCUDAContext, x) > +#define FLAGS (AV_OPT_FLAG_FILTERING_PARAM|AV_OPT_FLAG_VIDEO_PARAM) > +static const AVOption options[] = { > + { "color", "set the chromakey key color", OFFSET(chromakey_rgba), AV_OPT_TYPE_COLOR, { .str = "black" }, 0, 0, FLAGS }, > + { "similarity", "set the chromakey similarity value", OFFSET(similarity), AV_OPT_TYPE_FLOAT, { .dbl = 0.01 }, 0.01, 1.0, FLAGS }, > + { "blend", "set the chromakey key blend value", OFFSET(blend), AV_OPT_TYPE_FLOAT, { .dbl = 0.0 }, 0.0, 1.0, FLAGS }, > + { "yuv", "color parameter is in yuv instead of rgb", OFFSET(is_yuv), AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, FLAGS }, > + { "w", "Output video width", OFFSET(w_expr), AV_OPT_TYPE_STRING, { .str = "iw" }, .flags = FLAGS }, > + { "h", "Output video height", OFFSET(h_expr), AV_OPT_TYPE_STRING, { .str = "ih" }, .flags = FLAGS }, > + { NULL }, > +}; > + > +static const AVClass cudachromakey_class = { > + .class_name = "cudachromakey", > + .item_name = av_default_item_name, > + .option = options, > + .version = LIBAVUTIL_VERSION_INT, > +}; > + > +static const AVFilterPad cudachromakey_inputs[] = { > + { > + .name = "default", > + .type = AVMEDIA_TYPE_VIDEO, > + .filter_frame = cudachromakey_filter_frame, > + .get_buffer.video = cudachromakey_get_video_buffer, > + }, > +}; > + > +static const AVFilterPad cudachromakey_outputs[] = { > + { > + .name = "default", > + .type = AVMEDIA_TYPE_VIDEO, > + .config_props = cudachromakey_config_props, > + }, > +}; > + > +const AVFilter ff_vf_chromakey_cuda = { > + .name = "chromakey_cuda", > + .description = NULL_IF_CONFIG_SMALL("GPU accelerated chromakey filter"), > + > + .init = cudachromakey_init, > + .uninit = cudachromakey_uninit, > + > + .priv_size = sizeof(ChromakeyCUDAContext), > + .priv_class = &cudachromakey_class, > + > + FILTER_INPUTS(cudachromakey_inputs), > + FILTER_OUTPUTS(cudachromakey_outputs), > + > + FILTER_SINGLE_PIXFMT(AV_PIX_FMT_CUDA), > + > + .flags_internal = FF_FILTER_FLAG_HWFRAME_AWARE, > +}; > diff --git a/libavfilter/vf_chromakey_cuda.cu b/libavfilter/vf_chromakey_cuda.cu > new file mode 100644 > index 0000000000..fef118a0f8 > --- /dev/null > +++ b/libavfilter/vf_chromakey_cuda.cu > @@ -0,0 +1,248 @@ > +/* > + * Copyright (c) 2022, Chromakey > + * > + * Permission is hereby granted, free of charge, to any person obtaining a > + * copy of this software and associated documentation files (the "Software"), > + * to deal in the Software without restriction, including without limitation > + * the rights to use, copy, modify, merge, publish, distribute, sublicense, > + * and/or sell copies of the Software, and to permit persons to whom the > + * Software is furnished to do so, subject to the following conditions: > + * > + * The above copyright notice and this permission notice shall be included in > + * all copies or substantial portions of the Software. > + * > + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR > + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, > + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL > + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER > + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING > + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER > + * DEALINGS IN THE SOFTWARE. > + */ > + > +#include "cuda/vector_helpers.cuh" > + > +extern "C" > +{ > + > + > +/** > + * @brief function contains the main logic of chroma keying, and changes the alpahc channel with the suitable value > + * > + * @param src_tex texture U or texture UV , decided based on the passed is_uchar2 flag > + * @param src_tex_V texture V , used only if is_uchar2 flag is false > + * @param dst_A alpha channel destination > + * @param width_uv width of uv channels > + * @param height_uv height of uv channels > + * @param width width of alpha channel > + * @param height height of alpha channel > + * @param pitch pitch of alpha channel > + * @param x current x coordinate of pixel > + * @param y current y coordinate of pixel > + * @param chromakey_uv uv values for chroma keying > + * @param similarity similarity of keying > + * @param blend blend of keying > + * @param is_uchar2 true means that sent texture is uchar2 for the UV channels, false means U and V are sent seperatley > + * @param resize_ratio the ratio of alpha channel to UV channels > + * @return void > + */ > +__device__ static inline void change_alpha_channel( > + cudaTextureObject_t src_tex, cudaTextureObject_t src_tex_V, > + uchar *dst_A, int width_uv, > + int height_uv, int width, > + int height, int pitch, > + int x, int y, Can you rearrange those, so that corresponding values share a line? As in: int width_uv, int height_uv, int width, int height, int pitch, And so on? Having them intermixed like this makes this quite hard to read imo. Same goes for the other two functions. > + float2 chromakey_uv, float similarity, > + float blend, bool is_uchar2, > + uchar resize_ratio) > +{ > + Nit: we don't typically have empty first lines in functions > + uchar window_size = 3; > + int start_r = x - window_size / 2; > + int start_c = y - window_size / 2; > + > + uchar counter = 0; > + float diff = 0.0f; > + float du, dv; > + > + // loop over the eight neighbourhood of the current pixel(x,y) > + for (uchar i = 0; i < window_size; i++) > + { > + for (uchar j = 0; j < window_size; j++) > + { > + int r = start_r + i; > + int c = start_c + j; > + bool check_flag = (r >= 0 && r < width_uv && c >= 0 && c < height_uv); > + > + if (!check_flag) > + continue; > + > + float u_value, v_value; > + if (is_uchar2){ You should be able to just check if src_tex_V is valid, saving you one of the already quite numerous parameters. > + float2 temp_uv = tex2D(src_tex, r, c); > + u_value = temp_uv.x; > + v_value = temp_uv.y; > + }else{ > + u_value = tex2D(src_tex, r, c); > + v_value = tex2D(src_tex_V, r, c); > + } > + > + du = (u_value * 255.0f) - chromakey_uv.x; > + dv = (v_value * 255.0f) - chromakey_uv.y; > + diff += sqrtf((du * du + dv * dv) / (255.0f * 255.0f * 2.f)); > + counter++; > + } > + } > + > + if (counter > 0){ > + diff = diff / counter; > + }else{ > + diff /= 9.0f; > + } > + > + uchar alpha_value; One too many whitespaces. > + if(blend>0.0001f){ > + alpha_value=__saturatef((diff - similarity) / blend)*255; > + }else{ > + alpha_value=(diff < similarity ? 0 : 1)*255; > + } No {} for single-line if/else statements. > + //write the value in the aloha channel with regarding the ratio of (alpha_size : uv_size) Typo, "alpha" > + for (uchar k = 0; k < resize_ratio; k++) > + { > + for (uchar l = 0; l < resize_ratio; l++) > + { > + int x_resize = x * resize_ratio + k; > + int y_resize = y * resize_ratio + l; > + int a_channel_resize = y_resize * pitch + x_resize; > + if (y_resize >= height || x_resize >= width) > + continue; > + dst_A[a_channel_resize] = alpha_value; > + } > + } > + > + Two more spurious empty lines. > +} > + > + > +/** > + * @brief Function to process yuv420p video , and pass it to change_alpha_channel function, the output video is yuva420p > + * > + * @param src_tex_Y Y channel of input video frame > + * @param src_tex_U U channel of input video frame > + * @param src_tex_V V channel of input video frame > + * @param dst_Y Y channel of output video frame > + * @param dst_U U channel of output video frame > + * @param dst_V V channel of output video frame > + * @param dst_A A channel of output video frame > + * @param width width of Y channel for input,video frame > + * @param height height of Y channel for input,video frame > + * @param pitch pitch(line size) of Y channel for input,video frame > + * @param width_uv width of UV for input,output video frame > + * @param height_uv height of UV for input,output video frame > + * @param pitch_uv pitch(line size) of UV for input,output video frame > + * @param u_key u part of chromakey (user input) > + * @param v_key v part of chromakey (user input) > + * @param similarity similairty user want (user input) > + * @param blend blend user want (user input) > + * @return void > + */ You don't need to document all of these for all the functions, given they're all pretty much identical. Describing them on the shared function is more than plenty. > +__global__ void Process_uchar( > + cudaTextureObject_t src_tex_Y, cudaTextureObject_t src_tex_U, > + cudaTextureObject_t src_tex_V,uchar *dst_Y, > + uchar *dst_U, uchar *dst_V, > + uchar *dst_A,int width, > + int height, int pitch, > + int width_uv, int height_uv, > + int pitch_uv,float u_key, > + float v_key, float similarity, > + float blend) > +{ > + > + uchar resize_ratio = 2; If you manage to derive this from the actual information given, this function could otherwise support yuv444p as-is. Like, if it's 444, width_uv will be equal to width. For 420, width_uv will be only half of it. > + int x = blockIdx.x * blockDim.x + threadIdx.x; > + int y = blockIdx.y * blockDim.y + threadIdx.y; > + > + if (y >= height || x >= width) > + return; > + dst_Y[y * pitch + x] = tex2D(src_tex_Y, x, y)*255; > + if (y >= height_uv || x >= width_uv) > + return; > + > + int uv_index = y * pitch_uv + x; > + dst_U[uv_index]=tex2D(src_tex_U,x,y)*255; > + dst_V[uv_index]=tex2D(src_tex_V,x,y)*255; > + > + > + > + change_alpha_channel(src_tex_U,src_tex_V, > + dst_A,width_uv,height_uv, > + width,height, > + pitch,x, > + y,make_float2(u_key,v_key), > + similarity,blend, > + false,resize_ratio); > + > +} > + > + > + > +/** > + * @brief Function to process nv12 video , and pass it to change_alpha_channel function, the output video is yuva420p > + * > + * @param src_tex_Y Y channel of input video frame > + * @param src_tex_UV UV channels of input video frame > + * @param unused1 unused parameter > + * @param dst_Y Y channel of output video frame > + * @param dst_U U channel of output video frame > + * @param dst_V V channel of output video frame > + * @param dst_A A channel of output video frame > + * @param width width of Y channel for input,video frame > + * @param height height of Y channel for input,video frame > + * @param pitch pitch(line size) of Y channel for input,video frame > + * @param width_uv width of UV for input,output video frame > + * @param height_uv height of UV for input,output video frame > + * @param pitch_uv pitch(line size) of UV for input, output video frame > + * @param u_key u part of chromakey (user input) > + * @param v_key v part of chromakey (user input) > + * @param similarity similairty user want (user input) > + * @param blend blend user want (user input) > + * @return void > + */ > +__global__ void Process_uchar2( > + cudaTextureObject_t src_tex_Y, cudaTextureObject_t src_tex_UV, > + cudaTextureObject_t unused1,uchar *dst_Y, > + uchar *dst_U, uchar *dst_V, > + uchar *dst_A,int width, > + int height, int pitch, > + int width_uv, int height_uv, > + int pitch_uv,float u_key, > + float v_key, float similarity, > + float blend) > +{ > + > + uchar resize_ratio = 2; // alphachannel_size : uv_channel_size ratio > + int x = blockIdx.x * blockDim.x + threadIdx.x; // x coordinate of current pixel > + int y = blockIdx.y * blockDim.y + threadIdx.y; // y coordinate of current pixel > + > + if (y >= height || x >= width) > + return; > + dst_Y[y * pitch + x] = tex2D(src_tex_Y, x, y)*255; > + > + if (y >= height_uv || x >= width_uv) > + return; > + int uv_index= y * pitch_uv + x; > + float2 uv_temp=tex2D(src_tex_UV,x,y); > + dst_U[uv_index]=uv_temp.x*255; > + dst_V[uv_index]=uv_temp.y*255; > + > + change_alpha_channel(src_tex_UV, unused1, > + dst_A, width_uv, > + height_uv, width, > + height, pitch, > + x, y, > + make_float2(u_key,v_key), similarity, > + blend, true, > + resize_ratio); > +} > +} Looks good otherwise, and works as expected for all I could test, thank you! Make sure to mark a re-send of the fixed patch as v2 and send it as reply to this mail, so it shows up in the same thread. _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".