From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from ffbox0-bg.mplayerhq.hu (ffbox0-bg.ffmpeg.org [79.124.17.100]) by master.gitmailbox.com (Postfix) with ESMTP id 10A724318D for ; Mon, 22 Aug 2022 20:03:43 +0000 (UTC) Received: from [127.0.1.1] (localhost [127.0.0.1]) by ffbox0-bg.mplayerhq.hu (Postfix) with ESMTP id E61EA68B9C7; Mon, 22 Aug 2022 23:03:40 +0300 (EEST) Received: from mail-wr1-f50.google.com (mail-wr1-f50.google.com [209.85.221.50]) by ffbox0-bg.mplayerhq.hu (Postfix) with ESMTPS id BBA9768B9AE for ; Mon, 22 Aug 2022 23:03:33 +0300 (EEST) Received: by mail-wr1-f50.google.com with SMTP id u5so6720308wrt.11 for ; Mon, 22 Aug 2022 13:03:33 -0700 (PDT) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=eng-st.cu.edu.eg; s=google; h=content-transfer-encoding:content-language:to:subject:from :user-agent:mime-version:date:message-id:from:to:cc; bh=QkaU87M5i6CQHLeL2JF7nuq2V/dA5E/1p6kP/NFGKbw=; b=DITwY0W74ofkWeD05C/t2bZeqfwGbkGQeW6FeDqMwLKJ8CBCI8QWcM9nMamB2pH5aM aAd3FkYHpFYN6SfDntolowV9ZtLtlinbk4sJpHjOtvyaOFVxEO2zIYEgMNGzyvZ/zrP7 GtflaUoV8maf7HD1N0W9yh6MUpE5KE1H7zA4s= X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=1e100.net; s=20210112; h=content-transfer-encoding:content-language:to:subject:from :user-agent:mime-version:date:message-id:x-gm-message-state:from:to :cc; bh=QkaU87M5i6CQHLeL2JF7nuq2V/dA5E/1p6kP/NFGKbw=; b=uxXeLm9R/8L/8myXSpLZKxWhb+L4ZCWdtkB+hzc0dTTvY/QKj6vYz4YKW0UCOLJKXz 9h4qrCXAg/rYMcqHRepZ64q0G7nkZRwvqxyhEX1tKS9TGtDI4NZrFAoFsvFRT+GR6llT h/RzHkG/ZRrQ4U0TSkEaZZt+PKJtVLOQgqZ+bMSjNWrz/0TgJrgulMHbQjl5950Drs8u ZXsvzVS7udCOqyROKWr73/fQUU1qH1zw5AtlnLkLrMIkyY/zjyhro4oCJvZlwnO7i72m xmltX6c3pHMrVPXU4A6vmuQXH4AdMiuHNqkRsZeorsSXgjNo4ZDcaUZZZbErLOwm0UC/ JWsA== X-Gm-Message-State: ACgBeo0vV0bvDBkZGTvOxD2RSOjDvPQEfoMIbMsZvC6INxePw1AhKNlO njzAw4wZCxIuv0f5VTFQgVKC6xNW62rH1SFM X-Google-Smtp-Source: AA6agR6Rf1qjjFBLc7zwJgNrsBEP9vwNK/EXZDM7VRtAO+lLNrXvXmUlNu2kpWLgSHmlVqTIH8Ttpg== X-Received: by 2002:a05:6000:1f01:b0:225:5307:132e with SMTP id bv1-20020a0560001f0100b002255307132emr4594923wrb.303.1661198612141; Mon, 22 Aug 2022 13:03:32 -0700 (PDT) Received: from [192.168.1.108] ([102.44.207.183]) by smtp.gmail.com with ESMTPSA id g1-20020adffc81000000b0022520aba90asm12650657wrr.107.2022.08.22.13.03.31 for (version=TLS1_3 cipher=TLS_AES_128_GCM_SHA256 bits=128/128); Mon, 22 Aug 2022 13:03:31 -0700 (PDT) Message-ID: <743b6181-9f33-23e8-d983-96c912628f3a@eng-st.cu.edu.eg> Date: Mon, 22 Aug 2022 22:03:35 +0200 MIME-Version: 1.0 User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:91.0) Gecko/20100101 Thunderbird/91.11.0 From: Mohamed Khaled Mohamed To: ffmpeg-devel@ffmpeg.org Content-Language: en-US Subject: [FFmpeg-devel] [PATCH] avfilter: add bilateral_cuda filter X-BeenThere: ffmpeg-devel@ffmpeg.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: FFmpeg development discussions and patches List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Reply-To: FFmpeg development discussions and patches Content-Type: text/plain; charset="us-ascii" Content-Transfer-Encoding: 7bit Errors-To: ffmpeg-devel-bounces@ffmpeg.org Sender: "ffmpeg-devel" Archived-At: List-Archive: List-Post: avfilter: add bilateral_cuda filter GSoC'22 libavfilter/vf_bilateral_cuda.cu:the CUDA kernel for the filter libavfilter/vf_bilateral_cuda.c: the C side that calls the kernel and gets user input libavfilter/allfilters.c: added the filter to it libavfilter/Makefile: added the filter to it cuda/cuda_runtime.h: added two math CUDA functions that are used in the filter configure: add cuda dependencies for the filter Signed-off-by: Mohamed Khaled --- Changelog | 1 + compat/cuda/cuda_runtime.h | 1 + configure | 2 + doc/filters.texi | 39 +++ libavfilter/Makefile | 1 + libavfilter/allfilters.c | 1 + libavfilter/version.h | 2 +- libavfilter/vf_bilateral_cuda.c | 479 +++++++++++++++++++++++++++++++ libavfilter/vf_bilateral_cuda.cu | 177 ++++++++++++ 9 files changed, 702 insertions(+), 1 deletion(-) create mode 100644 libavfilter/vf_bilateral_cuda.c create mode 100644 libavfilter/vf_bilateral_cuda.cu diff --git a/Changelog b/Changelog index 121cfc3d90..d008be1577 100644 --- a/Changelog +++ b/Changelog @@ -28,6 +28,7 @@ version 5.1: - PHM image format support - remap_opencl filter - added chromakey_cuda filter +- added bilateral_cuda filter version 5.0: diff --git a/compat/cuda/cuda_runtime.h b/compat/cuda/cuda_runtime.h index 5837c1ad37..58bc4c41af 100644 --- a/compat/cuda/cuda_runtime.h +++ b/compat/cuda/cuda_runtime.h @@ -182,6 +182,7 @@ static inline __device__ float fabsf(float a) { return __builtin_fabsf(a); } static inline __device__ float fabs(float a) { return __builtin_fabsf(a); } static inline __device__ double fabs(double a) { return __builtin_fabs(a); } static inline __device__ float sqrtf(float a) { return __builtin_sqrtf(a); } +static inline __device__ float powf(float a,float y) { return __builtin_powf(a,y); } static inline __device__ float __saturatef(float a) { return __nvvm_saturate_f(a); } static inline __device__ float __sinf(float a) { return __nvvm_sin_approx_f(a); } diff --git a/configure b/configure index 9d6457d81b..c71cb11fda 100755 --- a/configure +++ b/configure @@ -3144,6 +3144,8 @@ v4l2_m2m_deps="linux_videodev2_h sem_timedwait" chromakey_cuda_filter_deps="ffnvcodec" chromakey_cuda_filter_deps_any="cuda_nvcc cuda_llvm" +bilateral_cuda_filter_deps="ffnvcodec" +bilateral_cuda_filter_deps_any="cuda_nvcc cuda_llvm" hwupload_cuda_filter_deps="ffnvcodec" scale_npp_filter_deps="ffnvcodec libnpp" scale2ref_npp_filter_deps="ffnvcodec libnpp" diff --git a/doc/filters.texi b/doc/filters.texi index 40f21fb34c..70c1f60291 100644 --- a/doc/filters.texi +++ b/doc/filters.texi @@ -7965,6 +7965,45 @@ Set planes to filter. Default is first only. This filter supports the all above options as @ref{commands}. +@section bilateral_cuda +CUDA accelerated bilateral filter, an edge pereseving filter. +This filter is mathematically accurate thanks to the use of GPU. +For best output quality, use one to one chroma subsampling like yuv444 format. + +The filter accepts the following options: +@table @option +@item sigmaS +Set sigma of gaussian function to calculate spatial weight. +Allowed range is 0 to 512. Default is 0.1. + +@item sigmaR +Set sigma of gaussian function to calculate range weight. +Allowed range is 0 to 1. Default is 0.1. + +@item window_size +Set window size of the bilateral function to determine the number of neighbours to loop on. +If the number entered is even, a one will be added automatically. +Allowed range is 1 to 255. Default is 1. +@end table +@subsection Examples + +@itemize +@item +Apply the bilateral filter on a video. + +@example +./ffmpeg -v verbose \ +-hwaccel cuda -hwaccel_output_format cuda -i input.mp4 \ +-init_hw_device cuda \ +-filter_complex \ +" \ +[0:v]scale_cuda=format=yuv444p[scaled_video]; +[scaled_video]bilateral_cuda=window_size=9:sigmaS=3.0:sigmaR=50.0" \ +-an -sn -c:v h264_nvenc -cq 20 out.mp4 +@end example + +@end itemize + @section bitplanenoise Show and measure bit plane noise. diff --git a/libavfilter/Makefile b/libavfilter/Makefile index 30cc329fb6..d3284e2511 100644 --- a/libavfilter/Makefile +++ b/libavfilter/Makefile @@ -194,6 +194,7 @@ OBJS-$(CONFIG_AVGBLUR_VULKAN_FILTER) += vf_avgblur_vulkan.o vulkan.o vul OBJS-$(CONFIG_BBOX_FILTER) += bbox.o vf_bbox.o OBJS-$(CONFIG_BENCH_FILTER) += f_bench.o OBJS-$(CONFIG_BILATERAL_FILTER) += vf_bilateral.o +OBJS-$(CONFIG_BILATERAL_CUDA_FILTER) += vf_bilateral_cuda.o vf_bilateral_cuda.ptx.o OBJS-$(CONFIG_BITPLANENOISE_FILTER) += vf_bitplanenoise.o OBJS-$(CONFIG_BLACKDETECT_FILTER) += vf_blackdetect.o OBJS-$(CONFIG_BLACKFRAME_FILTER) += vf_blackframe.o diff --git a/libavfilter/allfilters.c b/libavfilter/allfilters.c index 5ebacfde27..b5e05e8b9e 100644 --- a/libavfilter/allfilters.c +++ b/libavfilter/allfilters.c @@ -180,6 +180,7 @@ extern const AVFilter ff_vf_avgblur_vulkan; extern const AVFilter ff_vf_bbox; extern const AVFilter ff_vf_bench; extern const AVFilter ff_vf_bilateral; +extern const AVFilter ff_vf_bilateral_cuda; extern const AVFilter ff_vf_bitplanenoise; extern const AVFilter ff_vf_blackdetect; extern const AVFilter ff_vf_blackframe; diff --git a/libavfilter/version.h b/libavfilter/version.h index b1915afcea..0946ee91e8 100644 --- a/libavfilter/version.h +++ b/libavfilter/version.h @@ -31,7 +31,7 @@ #include "version_major.h" -#define LIBAVFILTER_VERSION_MINOR 45 +#define LIBAVFILTER_VERSION_MINOR 46 #define LIBAVFILTER_VERSION_MICRO 100 diff --git a/libavfilter/vf_bilateral_cuda.c b/libavfilter/vf_bilateral_cuda.c new file mode 100644 index 0000000000..b05954d2aa --- /dev/null +++ b/libavfilter/vf_bilateral_cuda.c @@ -0,0 +1,479 @@ +/* + * Copyright (c) 2022 Mohamed Khaled + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include +#include +#include + +#include "libavutil/avstring.h" +#include "libavutil/common.h" +#include "libavutil/hwcontext.h" +#include "libavutil/hwcontext_cuda_internal.h" +#include "libavutil/cuda_check.h" +#include "libavutil/internal.h" +#include "libavutil/opt.h" +#include "libavutil/pixdesc.h" + +#include "avfilter.h" +#include "formats.h" +#include "internal.h" +#include "video.h" + +#include "cuda/load_helper.h" + +static const enum AVPixelFormat supported_formats[] = { + AV_PIX_FMT_YUV420P, + AV_PIX_FMT_NV12, + AV_PIX_FMT_YUV444P +}; + +#define DIV_UP(a, b) ( ((a) + (b) - 1) / (b) ) +#define BLOCKX 32 +#define BLOCKY 16 + +#define CHECK_CU(x) FF_CUDA_CHECK_DL(ctx, s->hwctx->internal->cuda_dl, x) + + +typedef struct CUDABilateralContext { + const AVClass *class; + AVCUDADeviceContext *hwctx; + + enum AVPixelFormat in_fmt, out_fmt; + const AVPixFmtDescriptor *in_desc, *out_desc; + int in_planes, out_planes; + int in_plane_depths[4]; + int in_plane_channels[4]; + + int window_size; + float sigmaS; + float sigmaR; + + AVBufferRef *frames_ctx; + AVFrame *frame; + AVFrame *tmp_frame; + + CUcontext cu_ctx; + CUmodule cu_module; + CUfunction cu_func; + CUfunction cu_func_uv; + CUstream cu_stream; +} CUDABilateralContext; + +static av_cold int cudabilateral_init(AVFilterContext *ctx) +{ + CUDABilateralContext *s = ctx->priv; + + s->frame = av_frame_alloc(); + if (!s->frame) + return AVERROR(ENOMEM); + + s->tmp_frame = av_frame_alloc(); + if (!s->tmp_frame) + return AVERROR(ENOMEM); + + return 0; +} + +static av_cold void cudabilateral_uninit(AVFilterContext *ctx) +{ + CUDABilateralContext *s = ctx->priv; + + if (s->hwctx && s->cu_module) { + CudaFunctions *cu = s->hwctx->internal->cuda_dl; + CUcontext bilateral; + + CHECK_CU(cu->cuCtxPushCurrent(s->hwctx->cuda_ctx)); + CHECK_CU(cu->cuModuleUnload(s->cu_module)); + s->cu_module = NULL; + CHECK_CU(cu->cuCtxPopCurrent(&bilateral)); + } + + av_frame_free(&s->frame); + av_buffer_unref(&s->frames_ctx); + av_frame_free(&s->tmp_frame); +} + +static av_cold int init_hwframe_ctx(CUDABilateralContext *s, AVBufferRef *device_ctx, int width, int height) +{ + AVBufferRef *out_ref = NULL; + AVHWFramesContext *out_ctx; + int ret; + + out_ref = av_hwframe_ctx_alloc(device_ctx); + if (!out_ref) + return AVERROR(ENOMEM); + out_ctx = (AVHWFramesContext*)out_ref->data; + + out_ctx->format = AV_PIX_FMT_CUDA; + out_ctx->sw_format = s->out_fmt; + out_ctx->width = width; + out_ctx->height = height; + + ret = av_hwframe_ctx_init(out_ref); + if (ret < 0) + goto fail; + + av_frame_unref(s->frame); + ret = av_hwframe_get_buffer(out_ref, s->frame, 0); + if (ret < 0) + goto fail; + + av_buffer_unref(&s->frames_ctx); + s->frames_ctx = out_ref; + + return 0; +fail: + av_buffer_unref(&out_ref); + return ret; +} + +static int format_is_supported(enum AVPixelFormat fmt) +{ + int i; + + for (i = 0; i < FF_ARRAY_ELEMS(supported_formats); i++) + if (supported_formats[i] == fmt) + return 1; + return 0; +} + +static av_cold void set_format_info(AVFilterContext *ctx, enum AVPixelFormat in_format, enum AVPixelFormat out_format) +{ + CUDABilateralContext *s = ctx->priv; + int i, p, d; + + s->in_fmt = in_format; + s->out_fmt = out_format; + + s->in_desc = av_pix_fmt_desc_get(s->in_fmt); + s->out_desc = av_pix_fmt_desc_get(s->out_fmt); + s->in_planes = av_pix_fmt_count_planes(s->in_fmt); + s->out_planes = av_pix_fmt_count_planes(s->out_fmt); + + // find maximum step of each component of each plane + // For our subset of formats, this should accurately tell us how many channels CUDA needs + // i.e. 1 for Y plane, 2 for UV plane of NV12, 4 for single plane of RGB0 formats + + for (i = 0; i < s->in_desc->nb_components; i++) { + d = (s->in_desc->comp[i].depth + 7) / 8; + p = s->in_desc->comp[i].plane; + s->in_plane_channels[p] = FFMAX(s->in_plane_channels[p], s->in_desc->comp[i].step / d); + + s->in_plane_depths[p] = s->in_desc->comp[i].depth; + } +} + +static av_cold int init_processing_chain(AVFilterContext *ctx, int width, int height) +{ + CUDABilateralContext *s = ctx->priv; + AVHWFramesContext *in_frames_ctx; + int ret; + + /* check that we have a hw context */ + if (!ctx->inputs[0]->hw_frames_ctx) { + av_log(ctx, AV_LOG_ERROR, "No hw context provided on input\n"); + return AVERROR(EINVAL); + } + in_frames_ctx = (AVHWFramesContext*)ctx->inputs[0]->hw_frames_ctx->data; + + if (!format_is_supported(in_frames_ctx->sw_format)) { + av_log(ctx, AV_LOG_ERROR, "Unsupported format: %s\n", av_get_pix_fmt_name(in_frames_ctx->sw_format)); + return AVERROR(ENOSYS); + } + + set_format_info(ctx, in_frames_ctx->sw_format, in_frames_ctx->sw_format); + + ret = init_hwframe_ctx(s, in_frames_ctx->device_ref, width, height); + if (ret < 0) + return ret; + + ctx->outputs[0]->hw_frames_ctx = av_buffer_ref(s->frames_ctx); + if (!ctx->outputs[0]->hw_frames_ctx) + return AVERROR(ENOMEM); + + return 0; +} + +static av_cold int cuda_bilateral_load_functions(AVFilterContext *ctx) +{ + CUDABilateralContext *s = ctx->priv; + CUcontext bilateral, cuda_ctx = s->hwctx->cuda_ctx; + CudaFunctions *cu = s->hwctx->internal->cuda_dl; + int ret; + + extern const unsigned char ff_vf_bilateral_cuda_ptx_data[]; + extern const unsigned int ff_vf_bilateral_cuda_ptx_len; + + ret = CHECK_CU(cu->cuCtxPushCurrent(cuda_ctx)); + if (ret < 0) + return ret; + + ret = ff_cuda_load_module(ctx, s->hwctx, &s->cu_module, + ff_vf_bilateral_cuda_ptx_data, ff_vf_bilateral_cuda_ptx_len); + if (ret < 0) + goto fail; + + ret = CHECK_CU(cu->cuModuleGetFunction(&s->cu_func, s->cu_module, "Process_uchar")); + if (ret < 0) { + av_log(ctx, AV_LOG_FATAL, "Failed loading Process_uchar\n"); + goto fail; + } + + ret = CHECK_CU(cu->cuModuleGetFunction(&s->cu_func_uv, s->cu_module, "Process_uchar2")); + if (ret < 0) { + av_log(ctx, AV_LOG_FATAL, "Failed loading Process_uchar2\n"); + goto fail; + } + +fail: + CHECK_CU(cu->cuCtxPopCurrent(&bilateral)); + + return ret; +} + +static av_cold int cuda_bilateral_config_props(AVFilterLink *outlink) +{ + AVFilterContext *ctx = outlink->src; + AVFilterLink *inlink = outlink->src->inputs[0]; + CUDABilateralContext *s = ctx->priv; + AVHWFramesContext *frames_ctx = (AVHWFramesContext*)inlink->hw_frames_ctx->data; + AVCUDADeviceContext *device_hwctx = frames_ctx->device_ctx->hwctx; + int ret; + + s->hwctx = device_hwctx; + s->cu_stream = s->hwctx->stream; + + ret = init_processing_chain(ctx, inlink->w, inlink->h); + if (ret < 0) + return ret; + + outlink->sample_aspect_ratio = inlink->sample_aspect_ratio; + + // the window_size makes more sense when it is odd, so add 1 if even + s->window_size= (s->window_size%2) ? s->window_size : s->window_size+1; + + ret = cuda_bilateral_load_functions(ctx); + if (ret < 0) + return ret; + + return 0; +} + +static int call_cuda_kernel(AVFilterContext *ctx, CUfunction func, + CUtexObject src_tex[3], AVFrame *out_frame, + int width, int height, int pitch, + int width_uv, int height_uv, int pitch_uv, + int window_size, float sigmaS, float sigmaR) +{ + CUDABilateralContext *s = ctx->priv; + CudaFunctions *cu = s->hwctx->internal->cuda_dl; + int ret; + + CUdeviceptr dst_devptr[3] = { + (CUdeviceptr)out_frame->data[0], (CUdeviceptr)out_frame->data[1], (CUdeviceptr)out_frame->data[2] + }; + + void *args_uchar[] = { + &src_tex[0], &src_tex[1], &src_tex[2], + &dst_devptr[0], &dst_devptr[1], &dst_devptr[2], + &width, &height, &pitch, + &width_uv, &height_uv, &pitch_uv, + &window_size, &sigmaS, &sigmaR + }; + + ret = CHECK_CU(cu->cuLaunchKernel(func, + DIV_UP(width, BLOCKX), DIV_UP(height, BLOCKY), 1, + BLOCKX, BLOCKY, 1, 0, s->cu_stream, args_uchar, NULL)); + if (ret < 0) + return ret; + + return ret; +} + +static int cuda_bilateral_process_internal(AVFilterContext *ctx, + AVFrame *out, AVFrame *in) +{ + CUDABilateralContext *s = ctx->priv; + CudaFunctions *cu = s->hwctx->internal->cuda_dl; + CUcontext bilateral, cuda_ctx = s->hwctx->cuda_ctx; + int i, ret; + + CUtexObject tex[3] = { 0, 0, 0 }; + + ret = CHECK_CU(cu->cuCtxPushCurrent(cuda_ctx)); + if (ret < 0) + return ret; + + for (i = 0; i < s->in_planes; i++) { + CUDA_TEXTURE_DESC tex_desc = { + .filterMode = CU_TR_FILTER_MODE_LINEAR, + .flags = 0, // CU_TRSF_READ_AS_INTEGER to get raw ints instead of normalized floats from tex2D + }; + + CUDA_RESOURCE_DESC res_desc = { + .resType = CU_RESOURCE_TYPE_PITCH2D, + .res.pitch2D.format = CU_AD_FORMAT_UNSIGNED_INT8, + .res.pitch2D.numChannels = s->in_plane_channels[i], + .res.pitch2D.pitchInBytes = in->linesize[i], + .res.pitch2D.devPtr = (CUdeviceptr)in->data[i], + }; + + if (i == 1 || i == 2) { + res_desc.res.pitch2D.width = AV_CEIL_RSHIFT(in->width, s->in_desc->log2_chroma_w); + res_desc.res.pitch2D.height = AV_CEIL_RSHIFT(in->height, s->in_desc->log2_chroma_h); + } else { + res_desc.res.pitch2D.width = in->width; + res_desc.res.pitch2D.height = in->height; + } + + ret = CHECK_CU(cu->cuTexObjectCreate(&tex[i], &res_desc, &tex_desc, NULL)); + if (ret < 0) + goto exit; + } + + ret = call_cuda_kernel(ctx, (s->in_plane_channels[1] > 1) ? s->cu_func_uv : s->cu_func, + tex, out, + out->width, out->height, out->linesize[0], + AV_CEIL_RSHIFT(out->width, s->out_desc->log2_chroma_w), + AV_CEIL_RSHIFT(out->height, s->out_desc->log2_chroma_h), + out->linesize[1] >> ((s->in_plane_channels[1] > 1) ? 1 : 0), + s->window_size, s->sigmaS, s->sigmaR); + if (ret < 0) + goto exit; + +exit: + for (i = 0; i < s->in_planes; i++) + if (tex[i]) + CHECK_CU(cu->cuTexObjectDestroy(tex[i])); + + CHECK_CU(cu->cuCtxPopCurrent(&bilateral)); + + return ret; +} + +static int cuda_bilateral_process(AVFilterContext *ctx, AVFrame *out, AVFrame *in) +{ + CUDABilateralContext *s = ctx->priv; + AVFrame *src = in; + int ret; + + ret = cuda_bilateral_process_internal(ctx, s->frame, src); + if (ret < 0) + return ret; + + src = s->frame; + ret = av_hwframe_get_buffer(src->hw_frames_ctx, s->tmp_frame, 0); + if (ret < 0) + return ret; + + av_frame_move_ref(out, s->frame); + av_frame_move_ref(s->frame, s->tmp_frame); + + ret = av_frame_copy_props(out, in); + if (ret < 0) + return ret; + + return 0; +} + +static int cuda_bilateral_filter_frame(AVFilterLink *link, AVFrame *in) +{ + AVFilterContext *ctx = link->dst; + CUDABilateralContext *s = ctx->priv; + AVFilterLink *outlink = ctx->outputs[0]; + CudaFunctions *cu = s->hwctx->internal->cuda_dl; + + AVFrame *out = NULL; + CUcontext bilateral; + int ret = 0; + + out = av_frame_alloc(); + if (!out) { + ret = AVERROR(ENOMEM); + goto fail; + } + + ret = CHECK_CU(cu->cuCtxPushCurrent(s->hwctx->cuda_ctx)); + if (ret < 0) + goto fail; + + ret = cuda_bilateral_process(ctx, out, in); + + CHECK_CU(cu->cuCtxPopCurrent(&bilateral)); + if (ret < 0) + goto fail; + + av_frame_free(&in); + return ff_filter_frame(outlink, out); +fail: + av_frame_free(&in); + av_frame_free(&out); + return ret; +} + +#define OFFSET(x) offsetof(CUDABilateralContext, x) +#define FLAGS (AV_OPT_FLAG_FILTERING_PARAM|AV_OPT_FLAG_VIDEO_PARAM) +static const AVOption options[] = { + { "sigmaS", "set spatial sigma", OFFSET(sigmaS), AV_OPT_TYPE_FLOAT, {.dbl=0.1}, 0.0, 512, FLAGS }, + { "sigmaR", "set range sigmaR", OFFSET(sigmaR), AV_OPT_TYPE_FLOAT, {.dbl=0.1}, 0.0, 512, FLAGS }, + { "window_size", "set neighbours window_size", OFFSET(window_size), AV_OPT_TYPE_INT, {.i64=1}, 1, 255, FLAGS }, + { NULL } +}; + +static const AVClass cuda_bilateral_class = { + .class_name = "cudabilateral", + .item_name = av_default_item_name, + .option = options, + .version = LIBAVUTIL_VERSION_INT, +}; + +static const AVFilterPad cuda_bilateral_inputs[] = { + { + .name = "default", + .type = AVMEDIA_TYPE_VIDEO, + .filter_frame = cuda_bilateral_filter_frame, + }, +}; + +static const AVFilterPad cuda_bilateral_outputs[] = { + { + .name = "default", + .type = AVMEDIA_TYPE_VIDEO, + .config_props = cuda_bilateral_config_props, + }, +}; + +const AVFilter ff_vf_bilateral_cuda = { + .name = "bilateral_cuda", + .description = NULL_IF_CONFIG_SMALL("GPU accelerated bilateral filter"), + + .init = cudabilateral_init, + .uninit = cudabilateral_uninit, + + .priv_size = sizeof(CUDABilateralContext), + .priv_class = &cuda_bilateral_class, + + FILTER_INPUTS(cuda_bilateral_inputs), + FILTER_OUTPUTS(cuda_bilateral_outputs), + + FILTER_SINGLE_PIXFMT(AV_PIX_FMT_CUDA), + + .flags_internal = FF_FILTER_FLAG_HWFRAME_AWARE, +}; diff --git a/libavfilter/vf_bilateral_cuda.cu b/libavfilter/vf_bilateral_cuda.cu new file mode 100644 index 0000000000..f477e5aa29 --- /dev/null +++ b/libavfilter/vf_bilateral_cuda.cu @@ -0,0 +1,177 @@ +/* + * Copyright (c) 2022 Mohamed Khaled + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "cuda/vector_helpers.cuh" + +extern "C" +{ + /** + * @brief calculated squared norm difference between two 3-dimension vecors ||first_vector-second_vector||^2 + * used float4 for better performance + * + * @param first_yuv first color vector + * @param second_yuv second color vecotr + * @return answer of squared norm difference + */ + __device__ static inline float norm_squared(float4 first_yuv, float4 second_yuv) + { + float ans = 0; + ans += powf(first_yuv.x - second_yuv.x, 2); + ans += powf(first_yuv.y - second_yuv.y, 2); + ans += powf(first_yuv.z - second_yuv.z, 2); + return ans; + } + + /** + * @brief calculate w as stated in bilateral filter research paper + * + * @param first_yuv first color vector + * @param second_yuv second color vecotr + * @return the calculated w + */ + __device__ static inline float calculate_w(int x, int y, int r, int c, + float4 pixel_value, float4 neighbor_value, + float sigma_space, float sigma_color) + { + float first_term, second_term, w; + first_term = (powf(x - r, 2) + powf(y - c, 2)) / (2 * sigma_space * sigma_space); + second_term = norm_squared(pixel_value,neighbor_value)/ (2 * sigma_color * sigma_color); + w = __expf(-first_term - second_term); + return w; + } + + /** + * @brief apply the bilateral filter on the pixel sent + * + * @param src_tex_Y Y channel of source image + * @param src_tex_U U channel of source image or UV channels if format is nv12 + * @param src_tex_V V channel of source image + * @param dst_Y Y channel of destination image + * @param dst_U U channel of destination image if format is in yuv + * @param dst_V V channel of destination image if format is in yuv + * @param dst_UV UV channels of destination image if format is in nv12 + * @param width width of Y channel + * @param height height of Y channel + * @param width_uv width of UV channels + * @param height_uv height of UV channels + * @param pitch pitch of Y channel + * @param pitch_uv pitch of UV channels + * @param x x coordinate of pixel to be filtered + * @param y y coordinate of pixel to be filtered + * @param sigma_space sigma space parameter + * @param sigma_color sigma color parameter + * @param window_size window size parameter + * @return void + */ + __device__ static inline void apply_biltaeral( + cudaTextureObject_t src_tex_Y, cudaTextureObject_t src_tex_U, cudaTextureObject_t src_tex_V, + uchar *dst_Y, uchar *dst_U, uchar *dst_V, uchar2 *dst_UV, + int width, int height, int width_uv, int height_uv, int pitch,int pitch_uv, + int x, int y, + float sigma_space, float sigma_color, int window_size) + { + int channel_ratio=width/width_uv; // ratio between Y channel and UV channels + float4 currrent_pixel=make_float4(tex2D(src_tex_Y, x, y) * 255.f, + tex2D(src_tex_U, x/channel_ratio, y/channel_ratio) * 255.f, + tex2D(src_tex_V, x/channel_ratio, y/channel_ratio) * 255.f, 0.f); + int start_r = x - window_size / 2; + int start_c = y - window_size / 2; + float4 neighbor_pixel=make_float4(0.f,0.f,0.f,0.f); + float Wp = 0.f; + float4 new_pixel_value=make_float4(0.f,0.f,0.f,0.f); + float w = 0.f; + for(int i=0;i=0 && r=0 && c(src_tex_U, r/channel_ratio, c/channel_ratio); + neighbor_pixel=make_float4(tex2D(src_tex_Y, r, c) * 255.f, + temp_uv.x * 255.f, + temp_uv.y * 255.f, 0.f); + + } else { + neighbor_pixel=make_float4(tex2D(src_tex_Y, r, c) * 255.f, + tex2D(src_tex_U, r/channel_ratio, c/channel_ratio) * 255.f, + tex2D(src_tex_V, r/channel_ratio, c/channel_ratio) * 255.f, 0.f); + } + w=calculate_w(x,y,r,c,currrent_pixel,neighbor_pixel,sigma_space,sigma_color); + Wp+=w; + new_pixel_value+= neighbor_pixel*w; + } + } + } + + new_pixel_value=new_pixel_value/Wp; + dst_Y[y*pitch+x]=new_pixel_value.x; + if(!src_tex_V){ + dst_UV[(y/channel_ratio)*pitch_uv+(x/channel_ratio)]=make_uchar2(new_pixel_value.y,new_pixel_value.z); + } else { + dst_U[(y/channel_ratio)*pitch_uv+(x/channel_ratio)]=new_pixel_value.y; + dst_V[(y/channel_ratio)*pitch_uv+(x/channel_ratio)]=new_pixel_value.z; + } + + return; + } + + + __global__ void Process_uchar(cudaTextureObject_t src_tex_Y, cudaTextureObject_t src_tex_U, cudaTextureObject_t src_tex_V, + uchar *dst_Y, uchar *dst_U, uchar *dst_V, + int width, int height, int pitch, + int width_uv, int height_uv, int pitch_uv, + int window_size, float sigmaS, float sigmaR) + { + + int x = blockIdx.x * blockDim.x + threadIdx.x; + int y = blockIdx.y * blockDim.y + threadIdx.y; + if (y >= height || x >= width) + return; + + apply_biltaeral(src_tex_Y, src_tex_U, src_tex_V, + dst_Y, dst_U, dst_V, (uchar2*)nullptr, + width, height, width_uv, height_uv, pitch, pitch_uv, + x, y, + sigmaS, sigmaR, window_size); + } + + + __global__ void Process_uchar2(cudaTextureObject_t src_tex_Y, cudaTextureObject_t src_tex_UV, cudaTextureObject_t unused1, + uchar *dst_Y, uchar2 *dst_UV, uchar *unused2, + int width, int height, int pitch, + int width_uv, int height_uv, int pitch_uv, + int window_size, float sigmaS, float sigmaR) + { + int x = blockIdx.x * blockDim.x + threadIdx.x; + int y = blockIdx.y * blockDim.y + threadIdx.y; + if (y >= height || x >= width) + return; + + apply_biltaeral(src_tex_Y, src_tex_UV, (cudaTextureObject_t)nullptr, + dst_Y, (uchar*)nullptr, (uchar*)nullptr, dst_UV, + width, height, width_uv, height_uv, pitch, pitch_uv, + x, y, + sigmaS, sigmaR, window_size); + } +} -- 2.34.1 _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".