From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from ffbox0-bg.mplayerhq.hu (ffbox0-bg.ffmpeg.org [79.124.17.100]) by master.gitmailbox.com (Postfix) with ESMTP id D418E406B5 for ; Sun, 28 Aug 2022 11:07:42 +0000 (UTC) Received: from [127.0.1.1] (localhost [127.0.0.1]) by ffbox0-bg.mplayerhq.hu (Postfix) with ESMTP id 3E4AB68B8F2; Sun, 28 Aug 2022 14:07:39 +0300 (EEST) Received: from btbn.de (btbn.de [136.243.74.85]) by ffbox0-bg.mplayerhq.hu (Postfix) with ESMTPS id AC82968B34D for ; Sun, 28 Aug 2022 14:07:32 +0300 (EEST) Received: from [authenticated] by btbn.de (Postfix) with ESMTPSA id 24CCD3AAE90; Sun, 28 Aug 2022 13:07:31 +0200 (CEST) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=rothenpieler.org; s=mail; t=1661684851; bh=G4H5IjLNJEuXw/BU5QVa3wGHLT7gFw2VBmIf0YWShdI=; h=Date:Subject:To:References:From:In-Reply-To; b=hwqSiXE+NwwhOivC7FpQVrpgqQ5TgDZtDW1w0c5zysZxE+Sytu4D1pB7pHyJimeX+ 2W7Ky2AfEkm5PK3XTTanankgfo37yH1DeaMhXyJat58BVCWMk4ysEXgc5BsTu5lP9o SNTVgsRMvCxEOUrreCnQibAsq8HmcGi7qzKTm+YVCueNkArYVg7qWTh4u+D0fg9lYr lvMe97Zw++za3QoN78hpNbKCY2Tpq4/Hu8mdt5PdvHtWgPLdL7sOOep6CGNIWjWkgw caHVZxnDycoZxrltwm3x4ym20eQQWptq7mzwxW/VYVPBHyXyXS/X2lASkMAQAjnrcw vP1vQfa5yyz/g== Message-ID: <1ba626ee-8b2a-c849-15a4-2ba36ccf7823@rothenpieler.org> Date: Sun, 28 Aug 2022 13:07:30 +0200 MIME-Version: 1.0 User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:102.0) Gecko/20100101 Thunderbird/102.2.0 To: FFmpeg development discussions and patches , Mohamed Khaled Mohamed References: <743b6181-9f33-23e8-d983-96c912628f3a@eng-st.cu.edu.eg> Content-Language: en-US From: Timo Rothenpieler In-Reply-To: <743b6181-9f33-23e8-d983-96c912628f3a@eng-st.cu.edu.eg> Subject: Re: [FFmpeg-devel] [PATCH] avfilter: add bilateral_cuda filter X-BeenThere: ffmpeg-devel@ffmpeg.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: FFmpeg development discussions and patches List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Reply-To: FFmpeg development discussions and patches Content-Transfer-Encoding: 7bit Content-Type: text/plain; charset="us-ascii"; Format="flowed" Errors-To: ffmpeg-devel-bounces@ffmpeg.org Sender: "ffmpeg-devel" Archived-At: List-Archive: List-Post: On 22.08.2022 22:03, Mohamed Khaled Mohamed wrote: > avfilter: add bilateral_cuda filter > GSoC'22 > > libavfilter/vf_bilateral_cuda.cu:the CUDA kernel for the filter > libavfilter/vf_bilateral_cuda.c: the C side that calls the kernel and gets user input > libavfilter/allfilters.c: added the filter to it > libavfilter/Makefile: added the filter to it > cuda/cuda_runtime.h: added two math CUDA functions that are used in the filter > configure: add cuda dependencies for the filter > > Signed-off-by: Mohamed Khaled > --- > Changelog | 1 + > compat/cuda/cuda_runtime.h | 1 + > configure | 2 + > doc/filters.texi | 39 +++ > libavfilter/Makefile | 1 + > libavfilter/allfilters.c | 1 + > libavfilter/version.h | 2 +- > libavfilter/vf_bilateral_cuda.c | 479 +++++++++++++++++++++++++++++++ > libavfilter/vf_bilateral_cuda.cu | 177 ++++++++++++ > 9 files changed, 702 insertions(+), 1 deletion(-) > create mode 100644 libavfilter/vf_bilateral_cuda.c > create mode 100644 libavfilter/vf_bilateral_cuda.cu > > diff --git a/Changelog b/Changelog > index 121cfc3d90..d008be1577 100644 > --- a/Changelog > +++ b/Changelog > @@ -28,6 +28,7 @@ version 5.1: > - PHM image format support > - remap_opencl filter > - added chromakey_cuda filter > +- added bilateral_cuda filter > > > version 5.0: > diff --git a/compat/cuda/cuda_runtime.h b/compat/cuda/cuda_runtime.h > index 5837c1ad37..58bc4c41af 100644 > --- a/compat/cuda/cuda_runtime.h > +++ b/compat/cuda/cuda_runtime.h > @@ -182,6 +182,7 @@ static inline __device__ float fabsf(float a) { return __builtin_fabsf(a); } > static inline __device__ float fabs(float a) { return __builtin_fabsf(a); } > static inline __device__ double fabs(double a) { return __builtin_fabs(a); } > static inline __device__ float sqrtf(float a) { return __builtin_sqrtf(a); } > +static inline __device__ float powf(float a,float y) { return __builtin_powf(a,y); } nit: space after ',' > static inline __device__ float __saturatef(float a) { return __nvvm_saturate_f(a); } > static inline __device__ float __sinf(float a) { return __nvvm_sin_approx_f(a); } > diff --git a/configure b/configure > index 9d6457d81b..c71cb11fda 100755 > --- a/configure > +++ b/configure > @@ -3144,6 +3144,8 @@ v4l2_m2m_deps="linux_videodev2_h sem_timedwait" > > chromakey_cuda_filter_deps="ffnvcodec" > chromakey_cuda_filter_deps_any="cuda_nvcc cuda_llvm" > +bilateral_cuda_filter_deps="ffnvcodec" > +bilateral_cuda_filter_deps_any="cuda_nvcc cuda_llvm" > hwupload_cuda_filter_deps="ffnvcodec" > scale_npp_filter_deps="ffnvcodec libnpp" > scale2ref_npp_filter_deps="ffnvcodec libnpp" > diff --git a/doc/filters.texi b/doc/filters.texi > index 40f21fb34c..70c1f60291 100644 > --- a/doc/filters.texi > +++ b/doc/filters.texi > @@ -7965,6 +7965,45 @@ Set planes to filter. Default is first only. > > This filter supports the all above options as @ref{commands}. > > +@section bilateral_cuda > +CUDA accelerated bilateral filter, an edge pereseving filter. preserving I assume? > +This filter is mathematically accurate thanks to the use of GPU. > +For best output quality, use one to one chroma subsampling like yuv444 format. > + > +The filter accepts the following options: > +@table @option > +@item sigmaS > +Set sigma of gaussian function to calculate spatial weight. > +Allowed range is 0 to 512. Default is 0.1. > + > +@item sigmaR > +Set sigma of gaussian function to calculate range weight. > +Allowed range is 0 to 1. Default is 0.1. > + > +@item window_size > +Set window size of the bilateral function to determine the number of neighbours to loop on. > +If the number entered is even, a one will be added automatically. > +Allowed range is 1 to 255. Default is 1. > +@end table > +@subsection Examples > + > +@itemize > +@item > +Apply the bilateral filter on a video. > + > +@example > +./ffmpeg -v verbose \ > +-hwaccel cuda -hwaccel_output_format cuda -i input.mp4 \ > +-init_hw_device cuda \ > +-filter_complex \ > +" \ > +[0:v]scale_cuda=format=yuv444p[scaled_video]; > +[scaled_video]bilateral_cuda=window_size=9:sigmaS=3.0:sigmaR=50.0" \ > +-an -sn -c:v h264_nvenc -cq 20 out.mp4 > +@end example > + > +@end itemize > + > @section bitplanenoise > > Show and measure bit plane noise. > diff --git a/libavfilter/Makefile b/libavfilter/Makefile > index 30cc329fb6..d3284e2511 100644 > --- a/libavfilter/Makefile > +++ b/libavfilter/Makefile > @@ -194,6 +194,7 @@ OBJS-$(CONFIG_AVGBLUR_VULKAN_FILTER) += vf_avgblur_vulkan.o vulkan.o vul > OBJS-$(CONFIG_BBOX_FILTER) += bbox.o vf_bbox.o > OBJS-$(CONFIG_BENCH_FILTER) += f_bench.o > OBJS-$(CONFIG_BILATERAL_FILTER) += vf_bilateral.o > +OBJS-$(CONFIG_BILATERAL_CUDA_FILTER) += vf_bilateral_cuda.o vf_bilateral_cuda.ptx.o > OBJS-$(CONFIG_BITPLANENOISE_FILTER) += vf_bitplanenoise.o > OBJS-$(CONFIG_BLACKDETECT_FILTER) += vf_blackdetect.o > OBJS-$(CONFIG_BLACKFRAME_FILTER) += vf_blackframe.o > diff --git a/libavfilter/allfilters.c b/libavfilter/allfilters.c > index 5ebacfde27..b5e05e8b9e 100644 > --- a/libavfilter/allfilters.c > +++ b/libavfilter/allfilters.c > @@ -180,6 +180,7 @@ extern const AVFilter ff_vf_avgblur_vulkan; > extern const AVFilter ff_vf_bbox; > extern const AVFilter ff_vf_bench; > extern const AVFilter ff_vf_bilateral; > +extern const AVFilter ff_vf_bilateral_cuda; > extern const AVFilter ff_vf_bitplanenoise; > extern const AVFilter ff_vf_blackdetect; > extern const AVFilter ff_vf_blackframe; > diff --git a/libavfilter/version.h b/libavfilter/version.h > index b1915afcea..0946ee91e8 100644 > --- a/libavfilter/version.h > +++ b/libavfilter/version.h > @@ -31,7 +31,7 @@ > > #include "version_major.h" > > -#define LIBAVFILTER_VERSION_MINOR 45 > +#define LIBAVFILTER_VERSION_MINOR 46 > #define LIBAVFILTER_VERSION_MICRO 100 > > > diff --git a/libavfilter/vf_bilateral_cuda.c b/libavfilter/vf_bilateral_cuda.c > new file mode 100644 > index 0000000000..b05954d2aa > --- /dev/null > +++ b/libavfilter/vf_bilateral_cuda.c > @@ -0,0 +1,479 @@ > +/* > + * Copyright (c) 2022 Mohamed Khaled > + * > + * This file is part of FFmpeg. > + * > + * FFmpeg is free software; you can redistribute it and/or > + * modify it under the terms of the GNU Lesser General Public > + * License as published by the Free Software Foundation; either > + * version 2.1 of the License, or (at your option) any later version. > + * > + * FFmpeg is distributed in the hope that it will be useful, > + * but WITHOUT ANY WARRANTY; without even the implied warranty of > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + * Lesser General Public License for more details. > + * > + * You should have received a copy of the GNU Lesser General Public > + * License along with FFmpeg; if not, write to the Free Software > + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA > + */ > + > +#include > +#include > +#include > + > +#include "libavutil/avstring.h" > +#include "libavutil/common.h" > +#include "libavutil/hwcontext.h" > +#include "libavutil/hwcontext_cuda_internal.h" > +#include "libavutil/cuda_check.h" > +#include "libavutil/internal.h" > +#include "libavutil/opt.h" > +#include "libavutil/pixdesc.h" > + > +#include "avfilter.h" > +#include "formats.h" > +#include "internal.h" > +#include "video.h" > + > +#include "cuda/load_helper.h" > + > +static const enum AVPixelFormat supported_formats[] = { > + AV_PIX_FMT_YUV420P, > + AV_PIX_FMT_NV12, > + AV_PIX_FMT_YUV444P > +}; > + > +#define DIV_UP(a, b) ( ((a) + (b) - 1) / (b) ) > +#define BLOCKX 32 > +#define BLOCKY 16 > + > +#define CHECK_CU(x) FF_CUDA_CHECK_DL(ctx, s->hwctx->internal->cuda_dl, x) > + > + > +typedef struct CUDABilateralContext { > + const AVClass *class; > + AVCUDADeviceContext *hwctx; > + > + enum AVPixelFormat in_fmt, out_fmt; > + const AVPixFmtDescriptor *in_desc, *out_desc; > + int in_planes, out_planes; > + int in_plane_depths[4]; > + int in_plane_channels[4]; > + > + int window_size; > + float sigmaS; > + float sigmaR; > + > + AVBufferRef *frames_ctx; > + AVFrame *frame; > + AVFrame *tmp_frame; > + > + CUcontext cu_ctx; > + CUmodule cu_module; > + CUfunction cu_func; > + CUfunction cu_func_uv; > + CUstream cu_stream; > +} CUDABilateralContext; > + > +static av_cold int cudabilateral_init(AVFilterContext *ctx) > +{ > + CUDABilateralContext *s = ctx->priv; > + > + s->frame = av_frame_alloc(); > + if (!s->frame) > + return AVERROR(ENOMEM); > + > + s->tmp_frame = av_frame_alloc(); > + if (!s->tmp_frame) > + return AVERROR(ENOMEM); > + > + return 0; > +} > + > +static av_cold void cudabilateral_uninit(AVFilterContext *ctx) > +{ > + CUDABilateralContext *s = ctx->priv; > + > + if (s->hwctx && s->cu_module) { > + CudaFunctions *cu = s->hwctx->internal->cuda_dl; > + CUcontext bilateral; > + > + CHECK_CU(cu->cuCtxPushCurrent(s->hwctx->cuda_ctx)); > + CHECK_CU(cu->cuModuleUnload(s->cu_module)); > + s->cu_module = NULL; > + CHECK_CU(cu->cuCtxPopCurrent(&bilateral)); > + } > + > + av_frame_free(&s->frame); > + av_buffer_unref(&s->frames_ctx); > + av_frame_free(&s->tmp_frame); > +} > + > +static av_cold int init_hwframe_ctx(CUDABilateralContext *s, AVBufferRef *device_ctx, int width, int height) > +{ > + AVBufferRef *out_ref = NULL; > + AVHWFramesContext *out_ctx; > + int ret; > + > + out_ref = av_hwframe_ctx_alloc(device_ctx); > + if (!out_ref) > + return AVERROR(ENOMEM); > + out_ctx = (AVHWFramesContext*)out_ref->data; > + > + out_ctx->format = AV_PIX_FMT_CUDA; > + out_ctx->sw_format = s->out_fmt; > + out_ctx->width = width; > + out_ctx->height = height; > + > + ret = av_hwframe_ctx_init(out_ref); > + if (ret < 0) > + goto fail; > + > + av_frame_unref(s->frame); > + ret = av_hwframe_get_buffer(out_ref, s->frame, 0); > + if (ret < 0) > + goto fail; > + > + av_buffer_unref(&s->frames_ctx); > + s->frames_ctx = out_ref; > + > + return 0; > +fail: > + av_buffer_unref(&out_ref); > + return ret; > +} > + > +static int format_is_supported(enum AVPixelFormat fmt) > +{ > + int i; > + > + for (i = 0; i < FF_ARRAY_ELEMS(supported_formats); i++) > + if (supported_formats[i] == fmt) > + return 1; > + return 0; > +} > + > +static av_cold void set_format_info(AVFilterContext *ctx, enum AVPixelFormat in_format, enum AVPixelFormat out_format) > +{ > + CUDABilateralContext *s = ctx->priv; > + int i, p, d; > + > + s->in_fmt = in_format; > + s->out_fmt = out_format; > + > + s->in_desc = av_pix_fmt_desc_get(s->in_fmt); > + s->out_desc = av_pix_fmt_desc_get(s->out_fmt); > + s->in_planes = av_pix_fmt_count_planes(s->in_fmt); > + s->out_planes = av_pix_fmt_count_planes(s->out_fmt); > + > + // find maximum step of each component of each plane > + // For our subset of formats, this should accurately tell us how many channels CUDA needs > + // i.e. 1 for Y plane, 2 for UV plane of NV12, 4 for single plane of RGB0 formats > + > + for (i = 0; i < s->in_desc->nb_components; i++) { > + d = (s->in_desc->comp[i].depth + 7) / 8; > + p = s->in_desc->comp[i].plane; > + s->in_plane_channels[p] = FFMAX(s->in_plane_channels[p], s->in_desc->comp[i].step / d); > + > + s->in_plane_depths[p] = s->in_desc->comp[i].depth; > + } > +} Unrelated to this patch, but we have this exact code in quite a bunch of filters now. It should probably get a proper function, maybe even part of the public API, at some point. > +static av_cold int init_processing_chain(AVFilterContext *ctx, int width, int height) > +{ > + CUDABilateralContext *s = ctx->priv; > + AVHWFramesContext *in_frames_ctx; > + int ret; > + > + /* check that we have a hw context */ > + if (!ctx->inputs[0]->hw_frames_ctx) { > + av_log(ctx, AV_LOG_ERROR, "No hw context provided on input\n"); > + return AVERROR(EINVAL); > + } > + in_frames_ctx = (AVHWFramesContext*)ctx->inputs[0]->hw_frames_ctx->data; > + > + if (!format_is_supported(in_frames_ctx->sw_format)) { > + av_log(ctx, AV_LOG_ERROR, "Unsupported format: %s\n", av_get_pix_fmt_name(in_frames_ctx->sw_format)); > + return AVERROR(ENOSYS); > + } > + > + set_format_info(ctx, in_frames_ctx->sw_format, in_frames_ctx->sw_format); > + > + ret = init_hwframe_ctx(s, in_frames_ctx->device_ref, width, height); > + if (ret < 0) > + return ret; > + > + ctx->outputs[0]->hw_frames_ctx = av_buffer_ref(s->frames_ctx); > + if (!ctx->outputs[0]->hw_frames_ctx) > + return AVERROR(ENOMEM); > + > + return 0; > +} > + > +static av_cold int cuda_bilateral_load_functions(AVFilterContext *ctx) > +{ > + CUDABilateralContext *s = ctx->priv; > + CUcontext bilateral, cuda_ctx = s->hwctx->cuda_ctx; > + CudaFunctions *cu = s->hwctx->internal->cuda_dl; > + int ret; > + > + extern const unsigned char ff_vf_bilateral_cuda_ptx_data[]; > + extern const unsigned int ff_vf_bilateral_cuda_ptx_len; > + > + ret = CHECK_CU(cu->cuCtxPushCurrent(cuda_ctx)); > + if (ret < 0) > + return ret; > + > + ret = ff_cuda_load_module(ctx, s->hwctx, &s->cu_module, > + ff_vf_bilateral_cuda_ptx_data, ff_vf_bilateral_cuda_ptx_len); > + if (ret < 0) > + goto fail; > + > + ret = CHECK_CU(cu->cuModuleGetFunction(&s->cu_func, s->cu_module, "Process_uchar")); > + if (ret < 0) { > + av_log(ctx, AV_LOG_FATAL, "Failed loading Process_uchar\n"); > + goto fail; > + } > + > + ret = CHECK_CU(cu->cuModuleGetFunction(&s->cu_func_uv, s->cu_module, "Process_uchar2")); > + if (ret < 0) { > + av_log(ctx, AV_LOG_FATAL, "Failed loading Process_uchar2\n"); > + goto fail; > + } > + > +fail: > + CHECK_CU(cu->cuCtxPopCurrent(&bilateral)); > + > + return ret; > +} > + > +static av_cold int cuda_bilateral_config_props(AVFilterLink *outlink) > +{ > + AVFilterContext *ctx = outlink->src; > + AVFilterLink *inlink = outlink->src->inputs[0]; > + CUDABilateralContext *s = ctx->priv; > + AVHWFramesContext *frames_ctx = (AVHWFramesContext*)inlink->hw_frames_ctx->data; > + AVCUDADeviceContext *device_hwctx = frames_ctx->device_ctx->hwctx; > + int ret; > + > + s->hwctx = device_hwctx; > + s->cu_stream = s->hwctx->stream; > + > + ret = init_processing_chain(ctx, inlink->w, inlink->h); > + if (ret < 0) > + return ret; > + > + outlink->sample_aspect_ratio = inlink->sample_aspect_ratio; > + > + // the window_size makes more sense when it is odd, so add 1 if even > + s->window_size= (s->window_size%2) ? s->window_size : s->window_size+1; > + > + ret = cuda_bilateral_load_functions(ctx); > + if (ret < 0) > + return ret; > + > + return 0; > +} > + > +static int call_cuda_kernel(AVFilterContext *ctx, CUfunction func, > + CUtexObject src_tex[3], AVFrame *out_frame, > + int width, int height, int pitch, > + int width_uv, int height_uv, int pitch_uv, > + int window_size, float sigmaS, float sigmaR) > +{ > + CUDABilateralContext *s = ctx->priv; > + CudaFunctions *cu = s->hwctx->internal->cuda_dl; > + int ret; > + > + CUdeviceptr dst_devptr[3] = { > + (CUdeviceptr)out_frame->data[0], (CUdeviceptr)out_frame->data[1], (CUdeviceptr)out_frame->data[2] > + }; > + > + void *args_uchar[] = { > + &src_tex[0], &src_tex[1], &src_tex[2], > + &dst_devptr[0], &dst_devptr[1], &dst_devptr[2], > + &width, &height, &pitch, > + &width_uv, &height_uv, &pitch_uv, > + &window_size, &sigmaS, &sigmaR > + }; > + > + ret = CHECK_CU(cu->cuLaunchKernel(func, > + DIV_UP(width, BLOCKX), DIV_UP(height, BLOCKY), 1, > + BLOCKX, BLOCKY, 1, 0, s->cu_stream, args_uchar, NULL)); > + if (ret < 0) > + return ret; > + > + return ret; > +} > + > +static int cuda_bilateral_process_internal(AVFilterContext *ctx, > + AVFrame *out, AVFrame *in) > +{ > + CUDABilateralContext *s = ctx->priv; > + CudaFunctions *cu = s->hwctx->internal->cuda_dl; > + CUcontext bilateral, cuda_ctx = s->hwctx->cuda_ctx; > + int i, ret; > + > + CUtexObject tex[3] = { 0, 0, 0 }; > + > + ret = CHECK_CU(cu->cuCtxPushCurrent(cuda_ctx)); > + if (ret < 0) > + return ret; > + > + for (i = 0; i < s->in_planes; i++) { > + CUDA_TEXTURE_DESC tex_desc = { > + .filterMode = CU_TR_FILTER_MODE_LINEAR, > + .flags = 0, // CU_TRSF_READ_AS_INTEGER to get raw ints instead of normalized floats from tex2D > + }; > + > + CUDA_RESOURCE_DESC res_desc = { > + .resType = CU_RESOURCE_TYPE_PITCH2D, > + .res.pitch2D.format = CU_AD_FORMAT_UNSIGNED_INT8, > + .res.pitch2D.numChannels = s->in_plane_channels[i], > + .res.pitch2D.pitchInBytes = in->linesize[i], > + .res.pitch2D.devPtr = (CUdeviceptr)in->data[i], > + }; > + > + if (i == 1 || i == 2) { > + res_desc.res.pitch2D.width = AV_CEIL_RSHIFT(in->width, s->in_desc->log2_chroma_w); > + res_desc.res.pitch2D.height = AV_CEIL_RSHIFT(in->height, s->in_desc->log2_chroma_h); > + } else { > + res_desc.res.pitch2D.width = in->width; > + res_desc.res.pitch2D.height = in->height; > + } > + > + ret = CHECK_CU(cu->cuTexObjectCreate(&tex[i], &res_desc, &tex_desc, NULL)); > + if (ret < 0) > + goto exit; > + } > + > + ret = call_cuda_kernel(ctx, (s->in_plane_channels[1] > 1) ? s->cu_func_uv : s->cu_func, > + tex, out, > + out->width, out->height, out->linesize[0], > + AV_CEIL_RSHIFT(out->width, s->out_desc->log2_chroma_w), > + AV_CEIL_RSHIFT(out->height, s->out_desc->log2_chroma_h), > + out->linesize[1] >> ((s->in_plane_channels[1] > 1) ? 1 : 0), > + s->window_size, s->sigmaS, s->sigmaR); > + if (ret < 0) > + goto exit; > + > +exit: > + for (i = 0; i < s->in_planes; i++) > + if (tex[i]) > + CHECK_CU(cu->cuTexObjectDestroy(tex[i])); > + > + CHECK_CU(cu->cuCtxPopCurrent(&bilateral)); > + > + return ret; > +} > + > +static int cuda_bilateral_process(AVFilterContext *ctx, AVFrame *out, AVFrame *in) > +{ > + CUDABilateralContext *s = ctx->priv; > + AVFrame *src = in; > + int ret; > + > + ret = cuda_bilateral_process_internal(ctx, s->frame, src); > + if (ret < 0) > + return ret; > + > + src = s->frame; > + ret = av_hwframe_get_buffer(src->hw_frames_ctx, s->tmp_frame, 0); > + if (ret < 0) > + return ret; > + > + av_frame_move_ref(out, s->frame); > + av_frame_move_ref(s->frame, s->tmp_frame); > + > + ret = av_frame_copy_props(out, in); > + if (ret < 0) > + return ret; > + > + return 0; > +} > + > +static int cuda_bilateral_filter_frame(AVFilterLink *link, AVFrame *in) > +{ > + AVFilterContext *ctx = link->dst; > + CUDABilateralContext *s = ctx->priv; > + AVFilterLink *outlink = ctx->outputs[0]; > + CudaFunctions *cu = s->hwctx->internal->cuda_dl; > + > + AVFrame *out = NULL; > + CUcontext bilateral; > + int ret = 0; > + > + out = av_frame_alloc(); > + if (!out) { > + ret = AVERROR(ENOMEM); > + goto fail; > + } > + > + ret = CHECK_CU(cu->cuCtxPushCurrent(s->hwctx->cuda_ctx)); > + if (ret < 0) > + goto fail; > + > + ret = cuda_bilateral_process(ctx, out, in); > + > + CHECK_CU(cu->cuCtxPopCurrent(&bilateral)); > + if (ret < 0) > + goto fail; > + > + av_frame_free(&in); > + return ff_filter_frame(outlink, out); > +fail: > + av_frame_free(&in); > + av_frame_free(&out); > + return ret; > +} > + > +#define OFFSET(x) offsetof(CUDABilateralContext, x) > +#define FLAGS (AV_OPT_FLAG_FILTERING_PARAM|AV_OPT_FLAG_VIDEO_PARAM) > +static const AVOption options[] = { > + { "sigmaS", "set spatial sigma", OFFSET(sigmaS), AV_OPT_TYPE_FLOAT, {.dbl=0.1}, 0.0, 512, FLAGS }, > + { "sigmaR", "set range sigmaR", OFFSET(sigmaR), AV_OPT_TYPE_FLOAT, {.dbl=0.1}, 0.0, 512, FLAGS }, > + { "window_size", "set neighbours window_size", OFFSET(window_size), AV_OPT_TYPE_INT, {.i64=1}, 1, 255, FLAGS }, > + { NULL } > +}; > + > +static const AVClass cuda_bilateral_class = { > + .class_name = "cudabilateral", > + .item_name = av_default_item_name, > + .option = options, > + .version = LIBAVUTIL_VERSION_INT, > +}; > + > +static const AVFilterPad cuda_bilateral_inputs[] = { > + { > + .name = "default", > + .type = AVMEDIA_TYPE_VIDEO, > + .filter_frame = cuda_bilateral_filter_frame, > + }, > +}; > + > +static const AVFilterPad cuda_bilateral_outputs[] = { > + { > + .name = "default", > + .type = AVMEDIA_TYPE_VIDEO, > + .config_props = cuda_bilateral_config_props, > + }, > +}; > + > +const AVFilter ff_vf_bilateral_cuda = { > + .name = "bilateral_cuda", > + .description = NULL_IF_CONFIG_SMALL("GPU accelerated bilateral filter"), > + > + .init = cudabilateral_init, > + .uninit = cudabilateral_uninit, > + > + .priv_size = sizeof(CUDABilateralContext), > + .priv_class = &cuda_bilateral_class, > + > + FILTER_INPUTS(cuda_bilateral_inputs), > + FILTER_OUTPUTS(cuda_bilateral_outputs), > + > + FILTER_SINGLE_PIXFMT(AV_PIX_FMT_CUDA), > + > + .flags_internal = FF_FILTER_FLAG_HWFRAME_AWARE, > +}; > diff --git a/libavfilter/vf_bilateral_cuda.cu b/libavfilter/vf_bilateral_cuda.cu > new file mode 100644 > index 0000000000..f477e5aa29 > --- /dev/null > +++ b/libavfilter/vf_bilateral_cuda.cu > @@ -0,0 +1,177 @@ > +/* > + * Copyright (c) 2022 Mohamed Khaled > + * > + * This file is part of FFmpeg. > + * > + * FFmpeg is free software; you can redistribute it and/or > + * modify it under the terms of the GNU Lesser General Public > + * License as published by the Free Software Foundation; either > + * version 2.1 of the License, or (at your option) any later version. > + * > + * FFmpeg is distributed in the hope that it will be useful, > + * but WITHOUT ANY WARRANTY; without even the implied warranty of > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + * Lesser General Public License for more details. > + * > + * You should have received a copy of the GNU Lesser General Public > + * License along with FFmpeg; if not, write to the Free Software > + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA > + */ > + > +#include "cuda/vector_helpers.cuh" > + > +extern "C" > +{ > + /** > + * @brief calculated squared norm difference between two 3-dimension vecors ||first_vector-second_vector||^2 > + * used float4 for better performance > + * > + * @param first_yuv first color vector > + * @param second_yuv second color vecotr > + * @return answer of squared norm difference > + */ > + __device__ static inline float norm_squared(float4 first_yuv, float4 second_yuv) > + { > + float ans = 0; > + ans += powf(first_yuv.x - second_yuv.x, 2); > + ans += powf(first_yuv.y - second_yuv.y, 2); > + ans += powf(first_yuv.z - second_yuv.z, 2); > + return ans; > + } > + > + /** > + * @brief calculate w as stated in bilateral filter research paper > + * > + * @param first_yuv first color vector > + * @param second_yuv second color vecotr > + * @return the calculated w > + */ > + __device__ static inline float calculate_w(int x, int y, int r, int c, > + float4 pixel_value, float4 neighbor_value, > + float sigma_space, float sigma_color) > + { > + float first_term, second_term, w; > + first_term = (powf(x - r, 2) + powf(y - c, 2)) / (2 * sigma_space * sigma_space); > + second_term = norm_squared(pixel_value,neighbor_value)/ (2 * sigma_color * sigma_color); > + w = __expf(-first_term - second_term); > + return w; > + } > + > + /** > + * @brief apply the bilateral filter on the pixel sent > + * > + * @param src_tex_Y Y channel of source image > + * @param src_tex_U U channel of source image or UV channels if format is nv12 > + * @param src_tex_V V channel of source image > + * @param dst_Y Y channel of destination image > + * @param dst_U U channel of destination image if format is in yuv > + * @param dst_V V channel of destination image if format is in yuv > + * @param dst_UV UV channels of destination image if format is in nv12 > + * @param width width of Y channel > + * @param height height of Y channel > + * @param width_uv width of UV channels > + * @param height_uv height of UV channels > + * @param pitch pitch of Y channel > + * @param pitch_uv pitch of UV channels > + * @param x x coordinate of pixel to be filtered > + * @param y y coordinate of pixel to be filtered > + * @param sigma_space sigma space parameter > + * @param sigma_color sigma color parameter > + * @param window_size window size parameter > + * @return void > + */ > + __device__ static inline void apply_biltaeral( > + cudaTextureObject_t src_tex_Y, cudaTextureObject_t src_tex_U, cudaTextureObject_t src_tex_V, > + uchar *dst_Y, uchar *dst_U, uchar *dst_V, uchar2 *dst_UV, > + int width, int height, int width_uv, int height_uv, int pitch,int pitch_uv, > + int x, int y, > + float sigma_space, float sigma_color, int window_size) > + { > + int channel_ratio=width/width_uv; // ratio between Y channel and UV channels > + float4 currrent_pixel=make_float4(tex2D(src_tex_Y, x, y) * 255.f, > + tex2D(src_tex_U, x/channel_ratio, y/channel_ratio) * 255.f, > + tex2D(src_tex_V, x/channel_ratio, y/channel_ratio) * 255.f, 0.f); > + int start_r = x - window_size / 2; > + int start_c = y - window_size / 2; > + float4 neighbor_pixel=make_float4(0.f,0.f,0.f,0.f); > + float Wp = 0.f; > + float4 new_pixel_value=make_float4(0.f,0.f,0.f,0.f); > + float w = 0.f; > + for(int i=0;i + { > + for(int j=0;j + { > + int r=start_r+i; > + int c=start_c+j; > + bool in_bounds=r>=0 && r=0 && c + if(in_bounds) > + { > + if(!src_tex_V){ > + float2 temp_uv = tex2D(src_tex_U, r/channel_ratio, c/channel_ratio); > + neighbor_pixel=make_float4(tex2D(src_tex_Y, r, c) * 255.f, > + temp_uv.x * 255.f, > + temp_uv.y * 255.f, 0.f); > + > + } else { > + neighbor_pixel=make_float4(tex2D(src_tex_Y, r, c) * 255.f, > + tex2D(src_tex_U, r/channel_ratio, c/channel_ratio) * 255.f, > + tex2D(src_tex_V, r/channel_ratio, c/channel_ratio) * 255.f, 0.f); > + } > + w=calculate_w(x,y,r,c,currrent_pixel,neighbor_pixel,sigma_space,sigma_color); > + Wp+=w; > + new_pixel_value+= neighbor_pixel*w; > + } > + } > + } > + > + new_pixel_value=new_pixel_value/Wp; > + dst_Y[y*pitch+x]=new_pixel_value.x; > + if(!src_tex_V){ > + dst_UV[(y/channel_ratio)*pitch_uv+(x/channel_ratio)]=make_uchar2(new_pixel_value.y,new_pixel_value.z); > + } else { > + dst_U[(y/channel_ratio)*pitch_uv+(x/channel_ratio)]=new_pixel_value.y; > + dst_V[(y/channel_ratio)*pitch_uv+(x/channel_ratio)]=new_pixel_value.z; > + } > + > + return; > + } > + > + > + __global__ void Process_uchar(cudaTextureObject_t src_tex_Y, cudaTextureObject_t src_tex_U, cudaTextureObject_t src_tex_V, > + uchar *dst_Y, uchar *dst_U, uchar *dst_V, > + int width, int height, int pitch, > + int width_uv, int height_uv, int pitch_uv, > + int window_size, float sigmaS, float sigmaR) > + { > + > + int x = blockIdx.x * blockDim.x + threadIdx.x; > + int y = blockIdx.y * blockDim.y + threadIdx.y; > + if (y >= height || x >= width) > + return; > + > + apply_biltaeral(src_tex_Y, src_tex_U, src_tex_V, > + dst_Y, dst_U, dst_V, (uchar2*)nullptr, > + width, height, width_uv, height_uv, pitch, pitch_uv, > + x, y, > + sigmaS, sigmaR, window_size); > + } > + > + > + __global__ void Process_uchar2(cudaTextureObject_t src_tex_Y, cudaTextureObject_t src_tex_UV, cudaTextureObject_t unused1, > + uchar *dst_Y, uchar2 *dst_UV, uchar *unused2, > + int width, int height, int pitch, > + int width_uv, int height_uv, int pitch_uv, > + int window_size, float sigmaS, float sigmaR) > + { > + int x = blockIdx.x * blockDim.x + threadIdx.x; > + int y = blockIdx.y * blockDim.y + threadIdx.y; > + if (y >= height || x >= width) > + return; > + > + apply_biltaeral(src_tex_Y, src_tex_UV, (cudaTextureObject_t)nullptr, > + dst_Y, (uchar*)nullptr, (uchar*)nullptr, dst_UV, > + width, height, width_uv, height_uv, pitch, pitch_uv, > + x, y, > + sigmaS, sigmaR, window_size); > + } > +} Filter code looks good. A few super minor stylistic issues that are more personal taste than anything. Will give it a test run, and apply soon if no issues or further comments come up. _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".