* [FFmpeg-devel] [PATCH 1/3] libavfilter/dnn_backend_openvino: Add automatic input/output detection
@ 2024-01-17 7:21 wenbin.chen-at-intel.com
2024-01-17 7:21 ` [FFmpeg-devel] [PATCH 2/3] libavfilter/dnn_interface: use dims to represent shapes wenbin.chen-at-intel.com
2024-01-17 7:21 ` [FFmpeg-devel] [PATCH 3/3] libavfilter/vf_dnn_detect: Use class confidence to filter boxes wenbin.chen-at-intel.com
0 siblings, 2 replies; 4+ messages in thread
From: wenbin.chen-at-intel.com @ 2024-01-17 7:21 UTC (permalink / raw)
To: ffmpeg-devel
From: Wenbin Chen <wenbin.chen@intel.com>
With the OpenVINO backend, the user no longer needs to set input/output
names on the command line; model ports are detected automatically.
For example:
ffmpeg -i input.png -vf \
dnn_detect=dnn_backend=openvino:model=model.xml:input=image:\
output=detection_out -y output.png
can be simplified to:
ffmpeg -i input.png -vf dnn_detect=dnn_backend=openvino:model=model.xml\
-y output.png
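The detection follows the usual name-or-default pattern; a minimal
sketch of the idea, using the OpenVINO C API calls that appear in this
patch (error handling trimmed):

    /* If the user supplied an input name, look the port up by name;
     * otherwise fall back to the model's default input port. */
    if (task->input_name)
        status = ov_model_const_input_by_name(ov_model->ov_model,
                                              task->input_name,
                                              &ov_model->input_port);
    else
        status = ov_model_const_input(ov_model->ov_model,
                                      &ov_model->input_port);
    if (status != OK)
        return ov2_map_error(status, NULL);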
Signed-off-by: Wenbin Chen <wenbin.chen@intel.com>
---
libavfilter/dnn/dnn_backend_openvino.c | 64 ++++++++++++++++++++++----
libavfilter/dnn_filter_common.c | 21 +++++----
2 files changed, 67 insertions(+), 18 deletions(-)
diff --git a/libavfilter/dnn/dnn_backend_openvino.c b/libavfilter/dnn/dnn_backend_openvino.c
index e207d44584..590ddd586c 100644
--- a/libavfilter/dnn/dnn_backend_openvino.c
+++ b/libavfilter/dnn/dnn_backend_openvino.c
@@ -205,6 +205,7 @@ static int fill_model_input_ov(OVModel *ov_model, OVRequestItem *request)
ov_tensor_t* tensor = NULL;
ov_shape_t input_shape = {0};
ov_element_type_e precision;
+ char *port_name;
#else
dimensions_t dims;
precision_e precision;
@@ -223,11 +224,23 @@ static int fill_model_input_ov(OVModel *ov_model, OVRequestItem *request)
ov_output_const_port_free(ov_model->input_port);
ov_model->input_port = NULL;
}
- status = ov_model_const_input_by_name(ov_model->ov_model, task->input_name, &ov_model->input_port);
+ if (task->input_name)
+ status = ov_model_const_input_by_name(ov_model->ov_model, task->input_name, &ov_model->input_port);
+ else
+ status = ov_model_const_input(ov_model->ov_model, &ov_model->input_port);
if (status != OK) {
av_log(ctx, AV_LOG_ERROR, "Failed to get input port shape.\n");
return ov2_map_error(status, NULL);
}
+ status = ov_port_get_any_name(ov_model->input_port, &port_name);
+ if (status != OK) {
+ av_log(ctx, AV_LOG_ERROR, "Failed to get input port name.\n");
+ return ov2_map_error(status, NULL);
+ }
+ av_log(ctx, AV_LOG_VERBOSE, "OpenVINO model input: %s\n", port_name);
+ ov_free(port_name);
+ port_name = NULL;
+
status = ov_const_port_get_shape(ov_model->input_port, &input_shape);
if (status != OK) {
av_log(ctx, AV_LOG_ERROR, "Failed to get input port shape.\n");
@@ -620,7 +633,10 @@ static int init_model_ov(OVModel *ov_model, const char *input_name, const char *
goto err;
}
- status = ov_preprocess_prepostprocessor_get_input_info_by_name(ov_model->preprocess, input_name, &ov_model->input_info);
+ if (input_name)
+ status = ov_preprocess_prepostprocessor_get_input_info_by_name(ov_model->preprocess, input_name, &ov_model->input_info);
+ else
+ status = ov_preprocess_prepostprocessor_get_input_info(ov_model->preprocess, &ov_model->input_info);
if (status != OK) {
av_log(ctx, AV_LOG_ERROR, "Failed to get input info from preprocess.\n");
ret = ov2_map_error(status, NULL);
@@ -673,10 +689,24 @@ static int init_model_ov(OVModel *ov_model, const char *input_name, const char *
goto err;
}
+ if (!nb_outputs) {
+ size_t output_size;
+ status = ov_model_outputs_size(ov_model->ov_model, &output_size);
+ if (status != OK) {
+ av_log(ctx, AV_LOG_ERROR, "Failed to get output size.\n");
+ ret = ov2_map_error(status, NULL);
+ goto err;
+ }
+ nb_outputs = output_size;
+ }
ov_model->nb_outputs = nb_outputs;
for (int i = 0; i < nb_outputs; i++) {
- status = ov_preprocess_prepostprocessor_get_output_info_by_name(
- ov_model->preprocess, output_names[i], &ov_model->output_info);
+ if (output_names)
+ status = ov_preprocess_prepostprocessor_get_output_info_by_name(
+ ov_model->preprocess, output_names[i], &ov_model->output_info);
+ else
+ status = ov_preprocess_prepostprocessor_get_output_info_by_index(
+ ov_model->preprocess, i, &ov_model->output_info);
if (status != OK) {
av_log(ctx, AV_LOG_ERROR, "Failed to get output info from preprocess.\n");
ret = ov2_map_error(status, NULL);
@@ -758,12 +788,25 @@ static int init_model_ov(OVModel *ov_model, const char *input_name, const char *
}
for (int i = 0; i < nb_outputs; i++) {
- status = ov_model_const_output_by_name(ov_model->ov_model, output_names[i],
- &ov_model->output_ports[i]);
+ char *port_name;
+ if (output_names)
+ status = ov_model_const_output_by_name(ov_model->ov_model, output_names[i],
+ &ov_model->output_ports[i]);
+ else
+ status = ov_model_const_output_by_index(ov_model->ov_model, i,
+ &ov_model->output_ports[i]);
if (status != OK) {
av_log(ctx, AV_LOG_ERROR, "Failed to get output port %s.\n", output_names[i]);
goto err;
}
+ status = ov_port_get_any_name(ov_model->output_ports[i], &port_name);
+ if (status != OK) {
+ av_log(ctx, AV_LOG_ERROR, "Failed to get output port name.\n");
+ goto err;
+ }
+ av_log(ctx, AV_LOG_VERBOSE, "OpenVINO model outputs: %s\n", port_name);
+ ov_free(port_name);
+ port_name = NULL;
}
//compile network
status = ov_core_compile_model(ov_model->core, ov_model->ov_model, device, 0, &ov_model->compiled_model);
@@ -1044,7 +1087,10 @@ static int get_input_ov(void *model, DNNData *input, const char *input_name)
ov_element_type_e precision;
int64_t* dims;
ov_status_e status;
- status = ov_model_const_input_by_name(ov_model->ov_model, input_name, &ov_model->input_port);
+ if (input_name)
+ status = ov_model_const_input_by_name(ov_model->ov_model, input_name, &ov_model->input_port);
+ else
+ status = ov_model_const_input(ov_model->ov_model, &ov_model->input_port);
if (status != OK) {
av_log(ctx, AV_LOG_ERROR, "Failed to get input port shape.\n");
return ov2_map_error(status, NULL);
@@ -1241,7 +1287,7 @@ static int get_output_ov(void *model, const char *input_name, int input_width, i
OVRequestItem *request;
DNNExecBaseParams exec_params = {
.input_name = input_name,
- .output_names = &output_name,
+ .output_names = output_name ? &output_name : NULL,
.nb_output = 1,
.in_frame = NULL,
.out_frame = NULL,
@@ -1297,7 +1343,7 @@ static int get_output_ov(void *model, const char *input_name, int input_width, i
}
if (!ov_model->exe_network) {
#endif
- ret = init_model_ov(ov_model, input_name, &output_name, 1);
+ ret = init_model_ov(ov_model, input_name, output_name ? &output_name : NULL, 1);
if (ret != 0) {
av_log(ctx, AV_LOG_ERROR, "Failed init OpenVINO exectuable network or inference request\n");
return ret;
diff --git a/libavfilter/dnn_filter_common.c b/libavfilter/dnn_filter_common.c
index 3b9182c1d1..f012d450a2 100644
--- a/libavfilter/dnn_filter_common.c
+++ b/libavfilter/dnn_filter_common.c
@@ -57,15 +57,17 @@ int ff_dnn_init(DnnContext *ctx, DNNFunctionType func_type, AVFilterContext *fil
av_log(filter_ctx, AV_LOG_ERROR, "model file for network is not specified\n");
return AVERROR(EINVAL);
}
- if (!ctx->model_inputname) {
- av_log(filter_ctx, AV_LOG_ERROR, "input name of the model network is not specified\n");
- return AVERROR(EINVAL);
- }
- ctx->model_outputnames = separate_output_names(ctx->model_outputnames_string, "&", &ctx->nb_outputs);
- if (!ctx->model_outputnames) {
- av_log(filter_ctx, AV_LOG_ERROR, "could not parse model output names\n");
- return AVERROR(EINVAL);
+ if (ctx->backend_type == DNN_TF) {
+ if (!ctx->model_inputname) {
+ av_log(filter_ctx, AV_LOG_ERROR, "input name of the model network is not specified\n");
+ return AVERROR(EINVAL);
+ }
+ ctx->model_outputnames = separate_output_names(ctx->model_outputnames_string, "&", &ctx->nb_outputs);
+ if (!ctx->model_outputnames) {
+ av_log(filter_ctx, AV_LOG_ERROR, "could not parse model output names\n");
+ return AVERROR(EINVAL);
+ }
}
ctx->dnn_module = ff_get_dnn_module(ctx->backend_type, filter_ctx);
@@ -113,8 +115,9 @@ int ff_dnn_get_input(DnnContext *ctx, DNNData *input)
int ff_dnn_get_output(DnnContext *ctx, int input_width, int input_height, int *output_width, int *output_height)
{
+ char * output_name = ctx->model_outputnames ? ctx->model_outputnames[0] : NULL;
return ctx->model->get_output(ctx->model->model, ctx->model_inputname, input_width, input_height,
- (const char *)ctx->model_outputnames[0], output_width, output_height);
+ (const char *)output_name, output_width, output_height);
}
int ff_dnn_execute_model(DnnContext *ctx, AVFrame *in_frame, AVFrame *out_frame)
--
2.34.1
* [FFmpeg-devel] [PATCH 2/3] libavfilter/dnn_interface: use dims to represent shapes
2024-01-17 7:21 [FFmpeg-devel] [PATCH 1/3] libavfilter/dnn_backend_openvino: Add automatic input/output detection wenbin.chen-at-intel.com
@ 2024-01-17 7:21 ` wenbin.chen-at-intel.com
2024-01-17 7:21 ` [FFmpeg-devel] [PATCH 3/3] libavfilter/vf_dnn_detect: Use class confidence to filter boxes wenbin.chen-at-intel.com
1 sibling, 0 replies; 4+ messages in thread
From: wenbin.chen-at-intel.com @ 2024-01-17 7:21 UTC (permalink / raw)
To: ffmpeg-devel
From: Wenbin Chen <wenbin.chen@intel.com>
For detect and classify outputs, width and height make no sense, so
replace width, height, and channels with dims to represent the shape of
the tensor. Use layout and dims to derive width, height, and channel.
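With the dnn_get_*_idx_by_layout() helpers this patch adds to
dnn_interface.h, a caller recovers the per-axis sizes from dims and
layout; a minimal sketch (the DNNData pointer name is illustrative):

    /* dims[] is ordered NCHW or NHWC depending on data->layout. */
    int w = data->dims[dnn_get_width_idx_by_layout(data->layout)];
    int h = data->dims[dnn_get_height_idx_by_layout(data->layout)];
    int c = data->dims[dnn_get_channel_idx_by_layout(data->layout)];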
Signed-off-by: Wenbin Chen <wenbin.chen@intel.com>
---
libavfilter/dnn/dnn_backend_openvino.c | 80 ++++++++++++++------------
libavfilter/dnn/dnn_backend_tf.c | 32 +++++++----
libavfilter/dnn/dnn_io_proc.c | 30 +++++++---
libavfilter/dnn_interface.h | 17 +++++-
libavfilter/vf_dnn_classify.c | 6 +-
libavfilter/vf_dnn_detect.c | 50 ++++++++--------
libavfilter/vf_dnn_processing.c | 21 ++++---
7 files changed, 146 insertions(+), 90 deletions(-)
diff --git a/libavfilter/dnn/dnn_backend_openvino.c b/libavfilter/dnn/dnn_backend_openvino.c
index 590ddd586c..73b42c32b1 100644
--- a/libavfilter/dnn/dnn_backend_openvino.c
+++ b/libavfilter/dnn/dnn_backend_openvino.c
@@ -253,9 +253,9 @@ static int fill_model_input_ov(OVModel *ov_model, OVRequestItem *request)
ov_shape_free(&input_shape);
return ov2_map_error(status, NULL);
}
- input.height = dims[1];
- input.width = dims[2];
- input.channels = dims[3];
+ for (int i = 0; i < input_shape.rank; i++)
+ input.dims[i] = dims[i];
+ input.layout = DL_NHWC;
input.dt = precision_to_datatype(precision);
#else
status = ie_infer_request_get_blob(request->infer_request, task->input_name, &input_blob);
@@ -278,9 +278,9 @@ static int fill_model_input_ov(OVModel *ov_model, OVRequestItem *request)
av_log(ctx, AV_LOG_ERROR, "Failed to get input blob buffer\n");
return DNN_GENERIC_ERROR;
}
- input.height = dims.dims[2];
- input.width = dims.dims[3];
- input.channels = dims.dims[1];
+ for (int i = 0; i < input_shape.rank; i++)
+ input.dims[i] = dims[i];
+ input.layout = DL_NCHW;
input.data = blob_buffer.buffer;
input.dt = precision_to_datatype(precision);
#endif
@@ -339,8 +339,8 @@ static int fill_model_input_ov(OVModel *ov_model, OVRequestItem *request)
av_assert0(!"should not reach here");
break;
}
- input.data = (uint8_t *)input.data
- + input.width * input.height * input.channels * get_datatype_size(input.dt);
+ input.data = (uint8_t *)input.data +
+ input.dims[1] * input.dims[2] * input.dims[3] * get_datatype_size(input.dt);
}
#if HAVE_OPENVINO2
ov_tensor_free(tensor);
@@ -403,10 +403,11 @@ static void infer_completion_callback(void *args)
goto end;
}
outputs[i].dt = precision_to_datatype(precision);
-
- outputs[i].channels = output_shape.rank > 2 ? dims[output_shape.rank - 3] : 1;
- outputs[i].height = output_shape.rank > 1 ? dims[output_shape.rank - 2] : 1;
- outputs[i].width = output_shape.rank > 0 ? dims[output_shape.rank - 1] : 1;
+ outputs[i].layout = DL_NCHW;
+ outputs[i].dims[0] = 1;
+ outputs[i].dims[1] = output_shape.rank > 2 ? dims[output_shape.rank - 3] : 1;
+ outputs[i].dims[2] = output_shape.rank > 1 ? dims[output_shape.rank - 2] : 1;
+ outputs[i].dims[3] = output_shape.rank > 0 ? dims[output_shape.rank - 1] : 1;
av_assert0(request->lltask_count <= dims[0]);
outputs[i].layout = ctx->options.layout;
outputs[i].scale = ctx->options.scale;
@@ -445,9 +446,9 @@ static void infer_completion_callback(void *args)
return;
}
output.data = blob_buffer.buffer;
- output.channels = dims.dims[1];
- output.height = dims.dims[2];
- output.width = dims.dims[3];
+ output.layout = DL_NCHW;
+ for (int i = 0; i < 4; i++)
+ output.dims[i] = dims.dims[i];
av_assert0(request->lltask_count <= dims.dims[0]);
output.dt = precision_to_datatype(precision);
output.layout = ctx->options.layout;
@@ -469,8 +470,10 @@ static void infer_completion_callback(void *args)
ff_proc_from_dnn_to_frame(task->out_frame, outputs, ctx);
}
} else {
- task->out_frame->width = outputs[0].width;
- task->out_frame->height = outputs[0].height;
+ task->out_frame->width =
+ outputs[0].dims[dnn_get_width_idx_by_layout(outputs[0].layout)];
+ task->out_frame->height =
+ outputs[0].dims[dnn_get_height_idx_by_layout(outputs[0].layout)];
}
break;
case DFT_ANALYTICS_DETECT:
@@ -501,7 +504,8 @@ static void infer_completion_callback(void *args)
av_freep(&request->lltasks[i]);
for (int i = 0; i < ov_model->nb_outputs; i++)
outputs[i].data = (uint8_t *)outputs[i].data +
- outputs[i].width * outputs[i].height * outputs[i].channels * get_datatype_size(outputs[i].dt);
+ outputs[i].dims[1] * outputs[i].dims[2] * outputs[i].dims[3] *
+ get_datatype_size(outputs[i].dt);
}
end:
#if HAVE_OPENVINO2
@@ -1085,7 +1089,6 @@ static int get_input_ov(void *model, DNNData *input, const char *input_name)
#if HAVE_OPENVINO2
ov_shape_t input_shape = {0};
ov_element_type_e precision;
- int64_t* dims;
ov_status_e status;
if (input_name)
status = ov_model_const_input_by_name(ov_model->ov_model, input_name, &ov_model->input_port);
@@ -1105,16 +1108,18 @@ static int get_input_ov(void *model, DNNData *input, const char *input_name)
av_log(ctx, AV_LOG_ERROR, "Failed to get input port shape.\n");
return ov2_map_error(status, NULL);
}
- dims = input_shape.dims;
- if (dims[1] <= 3) { // NCHW
- input->channels = dims[1];
- input->height = input_resizable ? -1 : dims[2];
- input->width = input_resizable ? -1 : dims[3];
- } else { // NHWC
- input->height = input_resizable ? -1 : dims[1];
- input->width = input_resizable ? -1 : dims[2];
- input->channels = dims[3];
+ for (int i = 0; i < 4; i++)
+ input->dims[i] = input_shape.dims[i];
+ if (input_resizable) {
+ input->dims[dnn_get_width_idx_by_layout(input->layout)] = -1;
+ input->dims[dnn_get_height_idx_by_layout(input->layout)] = -1;
}
+
+ if (input_shape.dims[1] <= 3) // NCHW
+ input->layout = DL_NCHW;
+ else // NHWC
+ input->layout = DL_NHWC;
+
input->dt = precision_to_datatype(precision);
ov_shape_free(&input_shape);
return 0;
@@ -1144,15 +1149,18 @@ static int get_input_ov(void *model, DNNData *input, const char *input_name)
return DNN_GENERIC_ERROR;
}
- if (dims[1] <= 3) { // NCHW
- input->channels = dims[1];
- input->height = input_resizable ? -1 : dims[2];
- input->width = input_resizable ? -1 : dims[3];
- } else { // NHWC
- input->height = input_resizable ? -1 : dims[1];
- input->width = input_resizable ? -1 : dims[2];
- input->channels = dims[3];
+ for (int i = 0; i < 4; i++)
+ input->dims[i] = input_shape.dims[i];
+ if (input_resizable) {
+ input->dims[dnn_get_width_idx_by_layout(input->layout)] = -1;
+ input->dims[dnn_get_height_idx_by_layout(input->layout)] = -1;
}
+
+ if (input_shape.dims[1] <= 3) // NCHW
+ input->layout = DL_NCHW;
+ else // NHWC
+ input->layout = DL_NHWC;
+
input->dt = precision_to_datatype(precision);
return 0;
}
diff --git a/libavfilter/dnn/dnn_backend_tf.c b/libavfilter/dnn/dnn_backend_tf.c
index 25046b58d9..27c5178bb5 100644
--- a/libavfilter/dnn/dnn_backend_tf.c
+++ b/libavfilter/dnn/dnn_backend_tf.c
@@ -251,7 +251,12 @@ static TF_Tensor *allocate_input_tensor(const DNNData *input)
{
TF_DataType dt;
size_t size;
- int64_t input_dims[] = {1, input->height, input->width, input->channels};
+ int64_t input_dims[4] = { 0 };
+
+ input_dims[0] = 1;
+ input_dims[1] = input->dims[dnn_get_height_idx_by_layout(input->layout)];
+ input_dims[2] = input->dims[dnn_get_width_idx_by_layout(input->layout)];
+ input_dims[3] = input->dims[dnn_get_channel_idx_by_layout(input->layout)];
switch (input->dt) {
case DNN_FLOAT:
dt = TF_FLOAT;
@@ -310,9 +315,9 @@ static int get_input_tf(void *model, DNNData *input, const char *input_name)
// currently only NHWC is supported
av_assert0(dims[0] == 1 || dims[0] == -1);
- input->height = dims[1];
- input->width = dims[2];
- input->channels = dims[3];
+ for (int i = 0; i < 4; i++)
+ input->dims[i] = dims[i];
+ input->layout = DL_NHWC;
return 0;
}
@@ -640,8 +645,8 @@ static int fill_model_input_tf(TFModel *tf_model, TFRequestItem *request) {
}
infer_request = request->infer_request;
- input.height = task->in_frame->height;
- input.width = task->in_frame->width;
+ input.dims[1] = task->in_frame->height;
+ input.dims[2] = task->in_frame->width;
infer_request->tf_input = av_malloc(sizeof(TF_Output));
if (!infer_request->tf_input) {
@@ -731,9 +736,12 @@ static void infer_completion_callback(void *args) {
}
for (uint32_t i = 0; i < task->nb_output; ++i) {
- outputs[i].height = TF_Dim(infer_request->output_tensors[i], 1);
- outputs[i].width = TF_Dim(infer_request->output_tensors[i], 2);
- outputs[i].channels = TF_Dim(infer_request->output_tensors[i], 3);
+ outputs[i].dims[dnn_get_height_idx_by_layout(outputs[i].layout)] =
+ TF_Dim(infer_request->output_tensors[i], 1);
+ outputs[i].dims[dnn_get_width_idx_by_layout(outputs[i].layout)] =
+ TF_Dim(infer_request->output_tensors[i], 2);
+ outputs[i].dims[dnn_get_channel_idx_by_layout(outputs[i].layout)] =
+ TF_Dim(infer_request->output_tensors[i], 3);
outputs[i].data = TF_TensorData(infer_request->output_tensors[i]);
outputs[i].dt = (DNNDataType)TF_TensorType(infer_request->output_tensors[i]);
}
@@ -747,8 +755,10 @@ static void infer_completion_callback(void *args) {
ff_proc_from_dnn_to_frame(task->out_frame, outputs, ctx);
}
} else {
- task->out_frame->width = outputs[0].width;
- task->out_frame->height = outputs[0].height;
+ task->out_frame->width =
+ outputs[0].dims[dnn_get_width_idx_by_layout(outputs[0].layout)];
+ task->out_frame->height =
+ outputs[0].dims[dnn_get_height_idx_by_layout(outputs[0].layout)];
}
break;
case DFT_ANALYTICS_DETECT:
diff --git a/libavfilter/dnn/dnn_io_proc.c b/libavfilter/dnn/dnn_io_proc.c
index ab656e8ed7..e5d6edb301 100644
--- a/libavfilter/dnn/dnn_io_proc.c
+++ b/libavfilter/dnn/dnn_io_proc.c
@@ -70,7 +70,7 @@ int ff_proc_from_dnn_to_frame(AVFrame *frame, DNNData *output, void *log_ctx)
dst_data = (void **)frame->data;
linesize[0] = frame->linesize[0];
if (output->layout == DL_NCHW) {
- middle_data = av_malloc(plane_size * output->channels);
+ middle_data = av_malloc(plane_size * output->dims[1]);
if (!middle_data) {
ret = AVERROR(ENOMEM);
goto err;
@@ -209,7 +209,7 @@ int ff_proc_from_frame_to_dnn(AVFrame *frame, DNNData *input, void *log_ctx)
src_data = (void **)frame->data;
linesize[0] = frame->linesize[0];
if (input->layout == DL_NCHW) {
- middle_data = av_malloc(plane_size * input->channels);
+ middle_data = av_malloc(plane_size * input->dims[1]);
if (!middle_data) {
ret = AVERROR(ENOMEM);
goto err;
@@ -346,6 +346,7 @@ int ff_frame_to_dnn_classify(AVFrame *frame, DNNData *input, uint32_t bbox_index
int ret = 0;
enum AVPixelFormat fmt;
int left, top, width, height;
+ int width_idx, height_idx;
const AVDetectionBBoxHeader *header;
const AVDetectionBBox *bbox;
AVFrameSideData *sd = av_frame_get_side_data(frame, AV_FRAME_DATA_DETECTION_BBOXES);
@@ -364,6 +365,9 @@ int ff_frame_to_dnn_classify(AVFrame *frame, DNNData *input, uint32_t bbox_index
return AVERROR(ENOSYS);
}
+ width_idx = dnn_get_width_idx_by_layout(input->layout);
+ height_idx = dnn_get_height_idx_by_layout(input->layout);
+
header = (const AVDetectionBBoxHeader *)sd->data;
bbox = av_get_detection_bbox(header, bbox_index);
@@ -374,17 +378,20 @@ int ff_frame_to_dnn_classify(AVFrame *frame, DNNData *input, uint32_t bbox_index
fmt = get_pixel_format(input);
sws_ctx = sws_getContext(width, height, frame->format,
- input->width, input->height, fmt,
+ input->dims[width_idx],
+ input->dims[height_idx], fmt,
SWS_FAST_BILINEAR, NULL, NULL, NULL);
if (!sws_ctx) {
av_log(log_ctx, AV_LOG_ERROR, "Failed to create scale context for the conversion "
"fmt:%s s:%dx%d -> fmt:%s s:%dx%d\n",
av_get_pix_fmt_name(frame->format), width, height,
- av_get_pix_fmt_name(fmt), input->width, input->height);
+ av_get_pix_fmt_name(fmt),
+ input->dims[width_idx],
+ input->dims[height_idx]);
return AVERROR(EINVAL);
}
- ret = av_image_fill_linesizes(linesizes, fmt, input->width);
+ ret = av_image_fill_linesizes(linesizes, fmt, input->dims[width_idx]);
if (ret < 0) {
av_log(log_ctx, AV_LOG_ERROR, "unable to get linesizes with av_image_fill_linesizes");
sws_freeContext(sws_ctx);
@@ -414,7 +421,7 @@ int ff_frame_to_dnn_detect(AVFrame *frame, DNNData *input, void *log_ctx)
{
struct SwsContext *sws_ctx;
int linesizes[4];
- int ret = 0;
+ int ret = 0, width_idx, height_idx;
enum AVPixelFormat fmt = get_pixel_format(input);
/* (scale != 1 and scale != 0) or mean != 0 */
@@ -430,18 +437,23 @@ int ff_frame_to_dnn_detect(AVFrame *frame, DNNData *input, void *log_ctx)
return AVERROR(ENOSYS);
}
+ width_idx = dnn_get_width_idx_by_layout(input->layout);
+ height_idx = dnn_get_height_idx_by_layout(input->layout);
+
sws_ctx = sws_getContext(frame->width, frame->height, frame->format,
- input->width, input->height, fmt,
+ input->dims[width_idx],
+ input->dims[height_idx], fmt,
SWS_FAST_BILINEAR, NULL, NULL, NULL);
if (!sws_ctx) {
av_log(log_ctx, AV_LOG_ERROR, "Impossible to create scale context for the conversion "
"fmt:%s s:%dx%d -> fmt:%s s:%dx%d\n",
av_get_pix_fmt_name(frame->format), frame->width, frame->height,
- av_get_pix_fmt_name(fmt), input->width, input->height);
+ av_get_pix_fmt_name(fmt), input->dims[width_idx],
+ input->dims[height_idx]);
return AVERROR(EINVAL);
}
- ret = av_image_fill_linesizes(linesizes, fmt, input->width);
+ ret = av_image_fill_linesizes(linesizes, fmt, input->dims[width_idx]);
if (ret < 0) {
av_log(log_ctx, AV_LOG_ERROR, "unable to get linesizes with av_image_fill_linesizes");
sws_freeContext(sws_ctx);
diff --git a/libavfilter/dnn_interface.h b/libavfilter/dnn_interface.h
index 183d8418b2..852d88baa8 100644
--- a/libavfilter/dnn_interface.h
+++ b/libavfilter/dnn_interface.h
@@ -64,7 +64,7 @@ typedef enum {
typedef struct DNNData{
void *data;
- int width, height, channels;
+ int dims[4];
// dt and order together decide the color format
DNNDataType dt;
DNNColorOrder order;
@@ -134,4 +134,19 @@ typedef struct DNNModule{
// Initializes DNNModule depending on chosen backend.
const DNNModule *ff_get_dnn_module(DNNBackendType backend_type, void *log_ctx);
+static inline int dnn_get_width_idx_by_layout(DNNLayout layout)
+{
+ return layout == DL_NHWC ? 2 : 3;
+}
+
+static inline int dnn_get_height_idx_by_layout(DNNLayout layout)
+{
+ return layout == DL_NHWC ? 1 : 2;
+}
+
+static inline int dnn_get_channel_idx_by_layout(DNNLayout layout)
+{
+ return layout == DL_NHWC ? 3 : 1;
+}
+
#endif
diff --git a/libavfilter/vf_dnn_classify.c b/libavfilter/vf_dnn_classify.c
index e88e59d09c..d180c3b461 100644
--- a/libavfilter/vf_dnn_classify.c
+++ b/libavfilter/vf_dnn_classify.c
@@ -68,8 +68,8 @@ static int dnn_classify_post_proc(AVFrame *frame, DNNData *output, uint32_t bbox
uint32_t label_id;
float confidence;
AVFrameSideData *sd;
-
- if (output->channels <= 0) {
+ int output_size = output->dims[3] * output->dims[2] * output->dims[1];
+ if (output_size <= 0) {
return -1;
}
@@ -88,7 +88,7 @@ static int dnn_classify_post_proc(AVFrame *frame, DNNData *output, uint32_t bbox
classifications = output->data;
label_id = 0;
confidence= classifications[0];
- for (int i = 1; i < output->channels; i++) {
+ for (int i = 1; i < output_size; i++) {
if (classifications[i] > confidence) {
label_id = i;
confidence= classifications[i];
diff --git a/libavfilter/vf_dnn_detect.c b/libavfilter/vf_dnn_detect.c
index 249cbba0f7..caccbf7a12 100644
--- a/libavfilter/vf_dnn_detect.c
+++ b/libavfilter/vf_dnn_detect.c
@@ -166,14 +166,14 @@ static int dnn_detect_parse_yolo_output(AVFrame *frame, DNNData *output, int out
scale_w = cell_w;
scale_h = cell_h;
} else {
- if (output[output_index].height != output[output_index].width &&
- output[output_index].height == output[output_index].channels) {
+ if (output[output_index].dims[2] != output[output_index].dims[3] &&
+ output[output_index].dims[2] == output[output_index].dims[1]) {
is_NHWC = 1;
- cell_w = output[output_index].height;
- cell_h = output[output_index].channels;
+ cell_w = output[output_index].dims[2];
+ cell_h = output[output_index].dims[1];
} else {
- cell_w = output[output_index].width;
- cell_h = output[output_index].height;
+ cell_w = output[output_index].dims[3];
+ cell_h = output[output_index].dims[2];
}
scale_w = ctx->scale_width;
scale_h = ctx->scale_height;
@@ -205,14 +205,14 @@ static int dnn_detect_parse_yolo_output(AVFrame *frame, DNNData *output, int out
return AVERROR(EINVAL);
}
- if (output[output_index].channels * output[output_index].width *
- output[output_index].height % (box_size * cell_w * cell_h)) {
+ if (output[output_index].dims[1] * output[output_index].dims[2] *
+ output[output_index].dims[3] % (box_size * cell_w * cell_h)) {
av_log(filter_ctx, AV_LOG_ERROR, "wrong cell_w, cell_h or nb_classes\n");
return AVERROR(EINVAL);
}
- detection_boxes = output[output_index].channels *
- output[output_index].height *
- output[output_index].width / box_size / cell_w / cell_h;
+ detection_boxes = output[output_index].dims[1] *
+ output[output_index].dims[2] *
+ output[output_index].dims[3] / box_size / cell_w / cell_h;
anchors = anchors + (detection_boxes * output_index * 2);
/**
@@ -373,18 +373,18 @@ static int dnn_detect_post_proc_ssd(AVFrame *frame, DNNData *output, int nb_outp
int scale_w = ctx->scale_width;
int scale_h = ctx->scale_height;
- if (nb_outputs == 1 && output->width == 7) {
- proposal_count = output->height;
- detect_size = output->width;
+ if (nb_outputs == 1 && output->dims[3] == 7) {
+ proposal_count = output->dims[2];
+ detect_size = output->dims[3];
detections = output->data;
- } else if (nb_outputs == 2 && output[0].width == 5) {
- proposal_count = output[0].height;
- detect_size = output[0].width;
+ } else if (nb_outputs == 2 && output[0].dims[3] == 5) {
+ proposal_count = output[0].dims[2];
+ detect_size = output[0].dims[3];
detections = output[0].data;
labels = output[1].data;
- } else if (nb_outputs == 2 && output[1].width == 5) {
- proposal_count = output[1].height;
- detect_size = output[1].width;
+ } else if (nb_outputs == 2 && output[1].dims[3] == 5) {
+ proposal_count = output[1].dims[2];
+ detect_size = output[1].dims[3];
detections = output[1].data;
labels = output[0].data;
} else {
@@ -821,15 +821,19 @@ static int config_input(AVFilterLink *inlink)
AVFilterContext *context = inlink->dst;
DnnDetectContext *ctx = context->priv;
DNNData model_input;
- int ret;
+ int ret, width_idx, height_idx;
ret = ff_dnn_get_input(&ctx->dnnctx, &model_input);
if (ret != 0) {
av_log(ctx, AV_LOG_ERROR, "could not get input from the model\n");
return ret;
}
- ctx->scale_width = model_input.width == -1 ? inlink->w : model_input.width;
- ctx->scale_height = model_input.height == -1 ? inlink->h : model_input.height;
+ width_idx = dnn_get_width_idx_by_layout(model_input.layout);
+ height_idx = dnn_get_height_idx_by_layout(model_input.layout);
+ ctx->scale_width = model_input.dims[width_idx] == -1 ? inlink->w :
+ model_input.dims[width_idx];
+ ctx->scale_height = model_input.dims[height_idx] == -1 ? inlink->h :
+ model_input.dims[height_idx];
return 0;
}
diff --git a/libavfilter/vf_dnn_processing.c b/libavfilter/vf_dnn_processing.c
index 6829e94585..0b70c8e024 100644
--- a/libavfilter/vf_dnn_processing.c
+++ b/libavfilter/vf_dnn_processing.c
@@ -77,22 +77,29 @@ static const enum AVPixelFormat pix_fmts[] = {
"the frame's format %s does not match " \
"the model input channel %d\n", \
av_get_pix_fmt_name(fmt), \
- model_input->channels);
+ model_input->dims[dnn_get_channel_idx_by_layout(model_input->layout)]);
static int check_modelinput_inlink(const DNNData *model_input, const AVFilterLink *inlink)
{
AVFilterContext *ctx = inlink->dst;
enum AVPixelFormat fmt = inlink->format;
+ int width_idx, height_idx;
+ width_idx = dnn_get_width_idx_by_layout(model_input->layout);
+ height_idx = dnn_get_height_idx_by_layout(model_input->layout);
// the design is to add explicit scale filter before this filter
- if (model_input->height != -1 && model_input->height != inlink->h) {
+ if (model_input->dims[height_idx] != -1 &&
+ model_input->dims[height_idx] != inlink->h) {
av_log(ctx, AV_LOG_ERROR, "the model requires frame height %d but got %d\n",
- model_input->height, inlink->h);
+ model_input->dims[height_idx],
+ inlink->h);
return AVERROR(EIO);
}
- if (model_input->width != -1 && model_input->width != inlink->w) {
+ if (model_input->dims[width_idx] != -1 &&
+ model_input->dims[width_idx] != inlink->w) {
av_log(ctx, AV_LOG_ERROR, "the model requires frame width %d but got %d\n",
- model_input->width, inlink->w);
+ model_input->dims[width_idx],
+ inlink->w);
return AVERROR(EIO);
}
if (model_input->dt != DNN_FLOAT) {
@@ -103,7 +110,7 @@ static int check_modelinput_inlink(const DNNData *model_input, const AVFilterLin
switch (fmt) {
case AV_PIX_FMT_RGB24:
case AV_PIX_FMT_BGR24:
- if (model_input->channels != 3) {
+ if (model_input->dims[dnn_get_channel_idx_by_layout(model_input->layout)] != 3) {
LOG_FORMAT_CHANNEL_MISMATCH();
return AVERROR(EIO);
}
@@ -116,7 +123,7 @@ static int check_modelinput_inlink(const DNNData *model_input, const AVFilterLin
case AV_PIX_FMT_YUV410P:
case AV_PIX_FMT_YUV411P:
case AV_PIX_FMT_NV12:
- if (model_input->channels != 1) {
+ if (model_input->dims[dnn_get_channel_idx_by_layout(model_input->layout)] != 1) {
LOG_FORMAT_CHANNEL_MISMATCH();
return AVERROR(EIO);
}
--
2.34.1
* [FFmpeg-devel] [PATCH 3/3] libavfilter/vf_dnn_detect: Use class confidence to filter boxes
2024-01-17 7:21 [FFmpeg-devel] [PATCH 1/3] libavfilter/dnn_backend_openvino: Add automatic input/output detection wenbin.chen-at-intel.com
2024-01-17 7:21 ` [FFmpeg-devel] [PATCH 2/3] libavfilter/dnn_interface: use dims to represent shapes wenbin.chen-at-intel.com
@ 2024-01-17 7:21 ` wenbin.chen-at-intel.com
2024-01-28 1:38 ` Guo, Yejun
1 sibling, 1 reply; 4+ messages in thread
From: wenbin.chen-at-intel.com @ 2024-01-17 7:21 UTC (permalink / raw)
To: ffmpeg-devel
From: Wenbin Chen <wenbin.chen@intel.com>
Use class confidence instead of box_score to filter boxes, which is
more accurate. Class confidence is obtained by multiplying the class
probability distribution by box_score.
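In outline, the threshold check now runs only after the class
probability is folded in; a minimal sketch with illustrative names
(box_score and class_prob are not the literal variables in the code):

    /* Class confidence = objectness (box_score) times the per-class
     * probability; only then is the threshold applied. */
    conf = box_score * class_prob[label_id];
    if (conf < conf_threshold)
        continue;   /* discard low-confidence boxes */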
Signed-off-by: Wenbin Chen <wenbin.chen@intel.com>
---
libavfilter/vf_dnn_detect.c | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/libavfilter/vf_dnn_detect.c b/libavfilter/vf_dnn_detect.c
index caccbf7a12..2bf5ed7476 100644
--- a/libavfilter/vf_dnn_detect.c
+++ b/libavfilter/vf_dnn_detect.c
@@ -236,9 +236,6 @@ static int dnn_detect_parse_yolo_output(AVFrame *frame, DNNData *output, int out
conf = post_process_raw_data(
detection_boxes_data[cy * cell_w + cx + 4 * cell_w * cell_h]);
}
- if (conf < conf_threshold) {
- continue;
- }
if (is_NHWC) {
x = post_process_raw_data(detection_boxes_data[0]);
@@ -257,6 +254,9 @@ static int dnn_detect_parse_yolo_output(AVFrame *frame, DNNData *output, int out
conf = conf * post_process_raw_data(
detection_boxes_data[cy * cell_w + cx + (label_id + 5) * cell_w * cell_h]);
}
+ if (conf < conf_threshold) {
+ continue;
+ }
bbox = av_mallocz(sizeof(*bbox));
if (!bbox)
--
2.34.1
* Re: [FFmpeg-devel] [PATCH 3/3] libavfilter/vf_dnn_detect: Use class confidence to filter boxes
2024-01-17 7:21 ` [FFmpeg-devel] [PATCH 3/3] libavfilter/vf_dnn_detect: Use class confidence to filter boxes wenbin.chen-at-intel.com
@ 2024-01-28 1:38 ` Guo, Yejun
0 siblings, 0 replies; 4+ messages in thread
From: Guo, Yejun @ 2024-01-28 1:38 UTC (permalink / raw)
To: FFmpeg development discussions and patches
> -----Original Message-----
> From: ffmpeg-devel <ffmpeg-devel-bounces@ffmpeg.org> On Behalf Of
> wenbin.chen-at-intel.com@ffmpeg.org
> Sent: Wednesday, January 17, 2024 3:22 PM
> To: ffmpeg-devel@ffmpeg.org
> Subject: [FFmpeg-devel] [PATCH 3/3] libavfilter/vf_dnn_detect: Use class
> confidence to filter boxes
>
> From: Wenbin Chen <wenbin.chen@intel.com>
>
> Use class confidence instead of box_score to filter boxes, which is
> more accurate. Class confidence is obtained by multiplying the class
> probability distribution by box_score.
>
> Signed-off-by: Wenbin Chen <wenbin.chen@intel.com>
> ---
Looks good to me, will push soon.