Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
 help / color / mirror / Atom feed
* [FFmpeg-devel] [PATCH 1/4] libavfiter/dnn/dnn_backend_openvino: add multiple output support
@ 2023-12-04  5:36 wenbin.chen-at-intel.com
  2023-12-04  5:36 ` [FFmpeg-devel] [PATCH 2/4] libavfilter/vf_dnn_detect: Add input pad wenbin.chen-at-intel.com
                   ` (2 more replies)
  0 siblings, 3 replies; 4+ messages in thread
From: wenbin.chen-at-intel.com @ 2023-12-04  5:36 UTC (permalink / raw)
  To: ffmpeg-devel

From: Wenbin Chen <wenbin.chen@intel.com>

Add multiple-output support to the OpenVINO backend. Use '&' to separate
the output names when setting them on the command line.

Signed-off-by: Wenbin Chen <wenbin.chen@intel.com>
---
 libavfilter/dnn/dnn_backend_common.c   |   7 -
 libavfilter/dnn/dnn_backend_openvino.c | 216 +++++++++++++++++--------
 libavfilter/vf_dnn_detect.c            |  11 +-
 3 files changed, 150 insertions(+), 84 deletions(-)

diff --git a/libavfilter/dnn/dnn_backend_common.c b/libavfilter/dnn/dnn_backend_common.c
index 91a4a3c4bf..632832ec36 100644
--- a/libavfilter/dnn/dnn_backend_common.c
+++ b/libavfilter/dnn/dnn_backend_common.c
@@ -43,13 +43,6 @@ int ff_check_exec_params(void *ctx, DNNBackendType backend, DNNFunctionType func
         return AVERROR(EINVAL);
     }
 
-    if (exec_params->nb_output != 1 && backend != DNN_TF) {
-        // currently, the filter does not need multiple outputs,
-        // so we just pending the support until we really need it.
-        avpriv_report_missing_feature(ctx, "multiple outputs");
-        return AVERROR(ENOSYS);
-    }
-
     return 0;
 }
 
diff --git a/libavfilter/dnn/dnn_backend_openvino.c b/libavfilter/dnn/dnn_backend_openvino.c
index 6fe8b9c243..089e028818 100644
--- a/libavfilter/dnn/dnn_backend_openvino.c
+++ b/libavfilter/dnn/dnn_backend_openvino.c
@@ -64,7 +64,7 @@ typedef struct OVModel{
     ov_compiled_model_t *compiled_model;
     ov_output_const_port_t* input_port;
     ov_preprocess_input_info_t* input_info;
-    ov_output_const_port_t* output_port;
+    ov_output_const_port_t** output_ports;
     ov_preprocess_output_info_t* output_info;
     ov_preprocess_prepostprocessor_t* preprocess;
 #else
@@ -77,6 +77,7 @@ typedef struct OVModel{
     SafeQueue *request_queue;   // holds OVRequestItem
     Queue *task_queue;          // holds TaskItem
     Queue *lltask_queue;     // holds LastLevelTaskItem
+    int nb_outputs;
 } OVModel;
 
 // one request for one call to openvino
@@ -349,7 +350,7 @@ static void infer_completion_callback(void *args)
     TaskItem *task = lltask->task;
     OVModel *ov_model = task->model;
     SafeQueue *requestq = ov_model->request_queue;
-    DNNData output;
+    DNNData *outputs;
     OVContext *ctx = &ov_model->ctx;
 #if HAVE_OPENVINO2
     size_t* dims;
@@ -358,45 +359,61 @@ static void infer_completion_callback(void *args)
     ov_shape_t output_shape = {0};
     ov_element_type_e precision;
 
-    memset(&output, 0, sizeof(output));
-    status = ov_infer_request_get_output_tensor_by_index(request->infer_request, 0, &output_tensor);
-    if (status != OK) {
-        av_log(ctx, AV_LOG_ERROR,
-               "Failed to get output tensor.");
+    outputs = av_calloc(ov_model->nb_outputs, sizeof(*outputs));
+    if (!outputs) {
+        av_log(ctx, AV_LOG_ERROR, "Failed to alloc outputs.");
         return;
     }
 
-    status = ov_tensor_data(output_tensor, &output.data);
-    if (status != OK) {
-        av_log(ctx, AV_LOG_ERROR,
-               "Failed to get output data.");
-        return;
-    }
+    for (int i = 0; i < ov_model->nb_outputs; i++) {
+        status = ov_infer_request_get_tensor_by_const_port(request->infer_request,
+                                                           ov_model->output_ports[i],
+                                                           &output_tensor);
+        if (status != OK) {
+            av_log(ctx, AV_LOG_ERROR,
+                "Failed to get output tensor.");
+            goto end;
+        }
 
-    status = ov_tensor_get_shape(output_tensor, &output_shape);
-    if (status != OK) {
-        av_log(ctx, AV_LOG_ERROR, "Failed to get output port shape.\n");
-        return;
-    }
-    dims = output_shape.dims;
+        status = ov_tensor_data(output_tensor, &outputs[i].data);
+        if (status != OK) {
+            av_log(ctx, AV_LOG_ERROR,
+                "Failed to get output data.");
+            goto end;
+        }
 
-    status = ov_port_get_element_type(ov_model->output_port, &precision);
-    if (status != OK) {
-        av_log(ctx, AV_LOG_ERROR, "Failed to get output port data type.\n");
+        status = ov_tensor_get_shape(output_tensor, &output_shape);
+        if (status != OK) {
+            av_log(ctx, AV_LOG_ERROR, "Failed to get output port shape.\n");
+            goto end;
+        }
+        dims = output_shape.dims;
+
+        status = ov_port_get_element_type(ov_model->output_ports[i], &precision);
+        if (status != OK) {
+            av_log(ctx, AV_LOG_ERROR, "Failed to get output port data type.\n");
+            goto end;
+        }
+        outputs[i].dt       = precision_to_datatype(precision);
+
+        outputs[i].channels = output_shape.rank > 2 ? dims[output_shape.rank - 3] : 1;
+        outputs[i].height   = output_shape.rank > 1 ? dims[output_shape.rank - 2] : 1;
+        outputs[i].width    = output_shape.rank > 0 ? dims[output_shape.rank - 1] : 1;
+        av_assert0(request->lltask_count <= dims[0]);
+        outputs[i].layout   = ctx->options.layout;
+        outputs[i].scale    = ctx->options.scale;
+        outputs[i].mean     = ctx->options.mean;
         ov_shape_free(&output_shape);
-        return;
+        ov_tensor_free(output_tensor);
+        output_tensor = NULL;
     }
-    output.channels = output_shape.rank > 2 ? dims[output_shape.rank - 3] : 1;
-    output.height   = output_shape.rank > 1 ? dims[output_shape.rank - 2] : 1;
-    output.width    = output_shape.rank > 0 ? dims[output_shape.rank - 1] : 1;
-    av_assert0(request->lltask_count <= dims[0]);
-    ov_shape_free(&output_shape);
 #else
     IEStatusCode status;
     dimensions_t dims;
     ie_blob_t *output_blob = NULL;
     ie_blob_buffer_t blob_buffer;
     precision_e precision;
+    DNNData output;
     status = ie_infer_request_get_blob(request->infer_request, task->output_names[0], &output_blob);
     if (status != OK) {
         av_log(ctx, AV_LOG_ERROR,
@@ -424,11 +441,12 @@ static void infer_completion_callback(void *args)
     output.height   = dims.dims[2];
     output.width    = dims.dims[3];
     av_assert0(request->lltask_count <= dims.dims[0]);
-#endif
     output.dt       = precision_to_datatype(precision);
     output.layout   = ctx->options.layout;
     output.scale    = ctx->options.scale;
     output.mean     = ctx->options.mean;
+    outputs = &output;
+#endif
 
     av_assert0(request->lltask_count >= 1);
     for (int i = 0; i < request->lltask_count; ++i) {
@@ -438,28 +456,33 @@ static void infer_completion_callback(void *args)
         case DFT_PROCESS_FRAME:
             if (task->do_ioproc) {
                 if (ov_model->model->frame_post_proc != NULL) {
-                    ov_model->model->frame_post_proc(task->out_frame, &output, ov_model->model->filter_ctx);
+                    ov_model->model->frame_post_proc(task->out_frame, outputs, ov_model->model->filter_ctx);
                 } else {
-                    ff_proc_from_dnn_to_frame(task->out_frame, &output, ctx);
+                    ff_proc_from_dnn_to_frame(task->out_frame, outputs, ctx);
                 }
             } else {
-                task->out_frame->width = output.width;
-                task->out_frame->height = output.height;
+                task->out_frame->width = outputs[0].width;
+                task->out_frame->height = outputs[0].height;
             }
             break;
         case DFT_ANALYTICS_DETECT:
             if (!ov_model->model->detect_post_proc) {
                 av_log(ctx, AV_LOG_ERROR, "detect filter needs to provide post proc\n");
-                return;
+                goto end;
             }
-            ov_model->model->detect_post_proc(task->in_frame, &output, 1, ov_model->model->filter_ctx);
+            ov_model->model->detect_post_proc(task->in_frame, outputs,
+                                              ov_model->nb_outputs,
+                                              ov_model->model->filter_ctx);
             break;
         case DFT_ANALYTICS_CLASSIFY:
             if (!ov_model->model->classify_post_proc) {
                 av_log(ctx, AV_LOG_ERROR, "classify filter needs to provide post proc\n");
-                return;
+                goto end;
             }
-            ov_model->model->classify_post_proc(task->in_frame, &output, request->lltasks[i]->bbox_index, ov_model->model->filter_ctx);
+            for (int output_i = 0; output_i < ov_model->nb_outputs; output_i++)
+                ov_model->model->classify_post_proc(task->in_frame, outputs,
+                                                    request->lltasks[i]->bbox_index,
+                                                    ov_model->model->filter_ctx);
             break;
         default:
             av_assert0(!"should not reach here");
@@ -468,10 +491,17 @@ static void infer_completion_callback(void *args)
 
         task->inference_done++;
         av_freep(&request->lltasks[i]);
-        output.data = (uint8_t *)output.data
-                      + output.width * output.height * output.channels * get_datatype_size(output.dt);
+        for (int i = 0; i < ov_model->nb_outputs; i++)
+            outputs[i].data = (uint8_t *)outputs[i].data +
+                outputs[i].width * outputs[i].height * outputs[i].channels * get_datatype_size(outputs[i].dt);
     }
-#if !HAVE_OPENVINO2
+end:
+#if HAVE_OPENVINO2
+    av_freep(&outputs);
+    ov_shape_free(&output_shape);
+    if (output_tensor)
+        ov_tensor_free(output_tensor);
+#else
     ie_blob_free(&output_blob);
 #endif
     request->lltask_count = 0;
@@ -525,8 +555,10 @@ static void dnn_free_model_ov(DNNModel **model)
 #if HAVE_OPENVINO2
     if (ov_model->input_port)
         ov_output_const_port_free(ov_model->input_port);
-    if (ov_model->output_port)
-        ov_output_const_port_free(ov_model->output_port);
+    for (int i = 0; i < ov_model->nb_outputs; i++)
+        if (ov_model->output_ports[i])
+            ov_output_const_port_free(ov_model->output_ports[i]);
+    av_freep(&ov_model->output_ports);
     if (ov_model->preprocess)
         ov_preprocess_prepostprocessor_free(ov_model->preprocess);
     if (ov_model->compiled_model)
@@ -551,7 +583,7 @@ static void dnn_free_model_ov(DNNModel **model)
 }
 
 
-static int init_model_ov(OVModel *ov_model, const char *input_name, const char *output_name)
+static int init_model_ov(OVModel *ov_model, const char *input_name, const char **output_names, int nb_outputs)
 {
     int ret = 0;
     OVContext *ctx = &ov_model->ctx;
@@ -594,17 +626,15 @@ static int init_model_ov(OVModel *ov_model, const char *input_name, const char *
     }
 
     status = ov_preprocess_prepostprocessor_get_input_info_by_name(ov_model->preprocess, input_name, &ov_model->input_info);
-    status |= ov_preprocess_prepostprocessor_get_output_info_by_name(ov_model->preprocess, output_name, &ov_model->output_info);
     if (status != OK) {
-        av_log(ctx, AV_LOG_ERROR, "Failed to get input/output info from preprocess.\n");
+        av_log(ctx, AV_LOG_ERROR, "Failed to get input info from preprocess.\n");
         ret = ov2_map_error(status, NULL);
         goto err;
     }
 
     status = ov_preprocess_input_info_get_tensor_info(ov_model->input_info, &input_tensor_info);
-    status |= ov_preprocess_output_info_get_tensor_info(ov_model->output_info, &output_tensor_info);
     if (status != OK) {
-        av_log(ctx, AV_LOG_ERROR, "Failed to get tensor info from input/output.\n");
+        av_log(ctx, AV_LOG_ERROR, "Failed to get tensor info from input.\n");
         ret = ov2_map_error(status, NULL);
         goto err;
     }
@@ -642,17 +672,43 @@ static int init_model_ov(OVModel *ov_model, const char *input_name, const char *
     }
 
     status = ov_preprocess_input_tensor_info_set_element_type(input_tensor_info, U8);
-    if (ov_model->model->func_type != DFT_PROCESS_FRAME)
-        status |= ov_preprocess_output_set_element_type(output_tensor_info, F32);
-    else if (fabsf(ctx->options.scale - 1) > 1e-6f || fabsf(ctx->options.mean) > 1e-6f)
-        status |= ov_preprocess_output_set_element_type(output_tensor_info, F32);
-    else
-        status |= ov_preprocess_output_set_element_type(output_tensor_info, U8);
     if (status != OK) {
-        av_log(ctx, AV_LOG_ERROR, "Failed to set input/output element type\n");
+        av_log(ctx, AV_LOG_ERROR, "Failed to set input element type\n");
         ret = ov2_map_error(status, NULL);
         goto err;
     }
+
+    ov_model->nb_outputs = nb_outputs;
+    for (int i = 0; i < nb_outputs; i++) {
+        status = ov_preprocess_prepostprocessor_get_output_info_by_name(
+                ov_model->preprocess, output_names[i], &ov_model->output_info);
+        if (status != OK) {
+            av_log(ctx, AV_LOG_ERROR, "Failed to get output info from preprocess.\n");
+            ret = ov2_map_error(status, NULL);
+            goto err;
+        }
+        status |= ov_preprocess_output_info_get_tensor_info(ov_model->output_info, &output_tensor_info);
+        if (status != OK) {
+            av_log(ctx, AV_LOG_ERROR, "Failed to get tensor info from input/output.\n");
+            ret = ov2_map_error(status, NULL);
+            goto err;
+        }
+        if (ov_model->model->func_type != DFT_PROCESS_FRAME)
+            status |= ov_preprocess_output_set_element_type(output_tensor_info, F32);
+        else if (fabsf(ctx->options.scale - 1) > 1e-6f || fabsf(ctx->options.mean) > 1e-6f)
+            status |= ov_preprocess_output_set_element_type(output_tensor_info, F32);
+        else
+            status |= ov_preprocess_output_set_element_type(output_tensor_info, U8);
+        if (status != OK) {
+            av_log(ctx, AV_LOG_ERROR, "Failed to set output element type\n");
+            ret = ov2_map_error(status, NULL);
+            goto err;
+        }
+        ov_preprocess_output_tensor_info_free(output_tensor_info);
+        output_tensor_info = NULL;
+        ov_preprocess_output_info_free(ov_model->output_info);
+        ov_model->output_info = NULL;
+    }
     // set preprocess steps.
     if (fabsf(ctx->options.scale - 1) > 1e-6f || fabsf(ctx->options.mean) > 1e-6f) {
         ov_preprocess_preprocess_steps_t* input_process_steps = NULL;
@@ -667,11 +723,18 @@ static int init_model_ov(OVModel *ov_model, const char *input_name, const char *
         status |= ov_preprocess_preprocess_steps_scale(input_process_steps, ctx->options.scale);
         if (status != OK) {
             av_log(ctx, AV_LOG_ERROR, "Failed to set preprocess steps\n");
+            ov_preprocess_preprocess_steps_free(input_process_steps);
+            input_process_steps = NULL;
             ret = ov2_map_error(status, NULL);
             goto err;
         }
         ov_preprocess_preprocess_steps_free(input_process_steps);
+        input_process_steps = NULL;
     }
+    ov_preprocess_input_tensor_info_free(input_tensor_info);
+    input_tensor_info = NULL;
+    ov_preprocess_input_info_free(ov_model->input_info);
+    ov_model->input_info = NULL;
 
     //update model
     if(ov_model->ov_model)
@@ -679,20 +742,33 @@ static int init_model_ov(OVModel *ov_model, const char *input_name, const char *
     status = ov_preprocess_prepostprocessor_build(ov_model->preprocess, &ov_model->ov_model);
     if (status != OK) {
         av_log(ctx, AV_LOG_ERROR, "Failed to update OV model\n");
+        ov_model_free(tmp_ov_model);
+        tmp_ov_model = NULL;
         ret = ov2_map_error(status, NULL);
         goto err;
     }
     ov_model_free(tmp_ov_model);
 
     //update output_port
-    if (ov_model->output_port) {
-        ov_output_const_port_free(ov_model->output_port);
-        ov_model->output_port = NULL;
-    }
-    status = ov_model_const_output_by_name(ov_model->ov_model, output_name, &ov_model->output_port);
-    if (status != OK) {
-        av_log(ctx, AV_LOG_ERROR, "Failed to get output port.\n");
-        goto err;
+    if (!ov_model->output_ports) {
+        ov_model->output_ports = av_calloc(nb_outputs, sizeof(*ov_model->output_ports));
+        if (!ov_model->output_ports) {
+            ret = AVERROR(ENOMEM);
+            goto err;
+        }
+    } else
+        for (int i = 0; i < nb_outputs; i++) {
+            ov_output_const_port_free(ov_model->output_ports[i]);
+            ov_model->output_ports[i] = NULL;
+        }
+
+    for (int i = 0; i < nb_outputs; i++) {
+        status = ov_model_const_output_by_name(ov_model->ov_model, output_names[i],
+                                               &ov_model->output_ports[i]);
+        if (status != OK) {
+            av_log(ctx, AV_LOG_ERROR, "Failed to get output port %s.\n", output_names[i]);
+            goto err;
+        }
     }
     //compile network
     status = ov_core_compile_model(ov_model->core, ov_model->ov_model, device, 0, &ov_model->compiled_model);
@@ -701,6 +777,7 @@ static int init_model_ov(OVModel *ov_model, const char *input_name, const char *
         goto err;
     }
     ov_preprocess_input_model_info_free(input_model_info);
+    input_model_info = NULL;
     ov_layout_free(NCHW_layout);
     ov_layout_free(NHWC_layout);
 #else
@@ -745,6 +822,7 @@ static int init_model_ov(OVModel *ov_model, const char *input_name, const char *
         ret = DNN_GENERIC_ERROR;
         goto err;
     }
+    ov_model->nb_outputs = 1;
 
     // all models in openvino open model zoo use BGR with range [0.0f, 255.0f] as input,
     // we don't have a AVPixelFormat to describe it, so we'll use AV_PIX_FMT_BGR24 and
@@ -848,6 +926,10 @@ static int init_model_ov(OVModel *ov_model, const char *input_name, const char *
 
 err:
 #if HAVE_OPENVINO2
+    if (output_tensor_info)
+        ov_preprocess_output_tensor_info_free(output_tensor_info);
+    if (ov_model->output_info)
+        ov_preprocess_output_info_free(ov_model->output_info);
     if (NCHW_layout)
         ov_layout_free(NCHW_layout);
     if (NHWC_layout)
@@ -1204,11 +1286,6 @@ static int get_output_ov(void *model, const char *input_name, int input_width, i
         }
     }
 
-    status = ov_model_const_output_by_name(ov_model->ov_model, output_name, &ov_model->output_port);
-    if (status != OK) {
-        av_log(ctx, AV_LOG_ERROR, "Failed to get output port.\n");
-        return ov2_map_error(status, NULL);
-    }
     if (!ov_model->compiled_model) {
 #else
     if (ctx->options.input_resizable) {
@@ -1224,7 +1301,7 @@ static int get_output_ov(void *model, const char *input_name, int input_width, i
     }
     if (!ov_model->exe_network) {
 #endif
-        ret = init_model_ov(ov_model, input_name, output_name);
+        ret = init_model_ov(ov_model, input_name, &output_name, 1);
         if (ret != 0) {
             av_log(ctx, AV_LOG_ERROR, "Failed init OpenVINO exectuable network or inference request\n");
             return ret;
@@ -1397,7 +1474,8 @@ static int dnn_execute_model_ov(const DNNModel *model, DNNExecBaseParams *exec_p
 #else
     if (!ov_model->exe_network) {
 #endif
-        ret = init_model_ov(ov_model, exec_params->input_name, exec_params->output_names[0]);
+        ret = init_model_ov(ov_model, exec_params->input_name,
+                            exec_params->output_names, exec_params->nb_output);
         if (ret != 0) {
             av_log(ctx, AV_LOG_ERROR, "Failed init OpenVINO exectuable network or inference request\n");
             return ret;
diff --git a/libavfilter/vf_dnn_detect.c b/libavfilter/vf_dnn_detect.c
index 7ac3bb0b58..373dda58bf 100644
--- a/libavfilter/vf_dnn_detect.c
+++ b/libavfilter/vf_dnn_detect.c
@@ -354,11 +354,11 @@ static int dnn_detect_post_proc_ssd(AVFrame *frame, DNNData *output, AVFilterCon
             break;
         }
     }
-
     return 0;
 }
 
-static int dnn_detect_post_proc_ov(AVFrame *frame, DNNData *output, AVFilterContext *filter_ctx)
+static int dnn_detect_post_proc_ov(AVFrame *frame, DNNData *output, int nb_outputs,
+                                   AVFilterContext *filter_ctx)
 {
     AVFrameSideData *sd;
     DnnDetectContext *ctx = filter_ctx->priv;
@@ -466,7 +466,7 @@ static int dnn_detect_post_proc(AVFrame *frame, DNNData *output, uint32_t nb, AV
     DnnContext *dnn_ctx = &ctx->dnnctx;
     switch (dnn_ctx->backend_type) {
     case DNN_OV:
-        return dnn_detect_post_proc_ov(frame, output, filter_ctx);
+        return dnn_detect_post_proc_ov(frame, output, nb, filter_ctx);
     case DNN_TF:
         return dnn_detect_post_proc_tf(frame, output, filter_ctx);
     default:
@@ -553,11 +553,6 @@ static int check_output_nb(DnnDetectContext *ctx, DNNBackendType backend_type, i
         }
         return 0;
     case DNN_OV:
-        if (output_nb != 1) {
-            av_log(ctx, AV_LOG_ERROR, "Dnn detect filter with openvino backend needs 1 output only, \
-                                       but get %d instead\n", output_nb);
-            return AVERROR(EINVAL);
-        }
         return 0;
     default:
         avpriv_report_missing_feature(ctx, "Dnn detect filter does not support current backend\n");
-- 
2.34.1

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 4+ messages in thread

* [FFmpeg-devel] [PATCH 2/4] libavfilter/vf_dnn_detect: Add input pad
  2023-12-04  5:36 [FFmpeg-devel] [PATCH 1/4] libavfiter/dnn/dnn_backend_openvino: add multiple output support wenbin.chen-at-intel.com
@ 2023-12-04  5:36 ` wenbin.chen-at-intel.com
  2023-12-04  5:36 ` [FFmpeg-devel] [PATCH 3/4] libavfilter/vf_dnn_detect: Add yolov3 support wenbin.chen-at-intel.com
  2023-12-04  5:36 ` [FFmpeg-devel] [PATCH 4/4] libavfilter/vf_dnn_detect: Add yolov4 support wenbin.chen-at-intel.com
  2 siblings, 0 replies; 4+ messages in thread
From: wenbin.chen-at-intel.com @ 2023-12-04  5:36 UTC (permalink / raw)
  To: ffmpeg-devel

From: Wenbin Chen <wenbin.chen@intel.com>

Add an input pad to get the model input resolution. Detection models
always have a fixed input size, and the output coordinates are based on
the input resolution, so we need the input size to map the coordinates
onto our real output frames.

Signed-off-by: Wenbin Chen <wenbin.chen@intel.com>
---
 libavfilter/dnn/dnn_backend_openvino.c | 24 ++++++++++++++++------
 libavfilter/vf_dnn_detect.c            | 28 +++++++++++++++++++++++++-
 2 files changed, 45 insertions(+), 7 deletions(-)

diff --git a/libavfilter/dnn/dnn_backend_openvino.c b/libavfilter/dnn/dnn_backend_openvino.c
index 089e028818..671a995c70 100644
--- a/libavfilter/dnn/dnn_backend_openvino.c
+++ b/libavfilter/dnn/dnn_backend_openvino.c
@@ -1073,9 +1073,15 @@ static int get_input_ov(void *model, DNNData *input, const char *input_name)
         return AVERROR(ENOSYS);
     }
 
-    input->channels = dims[1];
-    input->height   = input_resizable ? -1 : dims[2];
-    input->width    = input_resizable ? -1 : dims[3];
+    if (dims[1] <= 3) { // NCHW
+        input->channels = dims[1];
+        input->height   = input_resizable ? -1 : dims[2];
+        input->width    = input_resizable ? -1 : dims[3];
+    } else { // NHWC
+        input->height   = input_resizable ? -1 : dims[1];
+        input->width    = input_resizable ? -1 : dims[2];
+        input->channels = dims[3];
+    }
     input->dt       = precision_to_datatype(precision);
 
     return 0;
@@ -1105,9 +1111,15 @@ static int get_input_ov(void *model, DNNData *input, const char *input_name)
                 return DNN_GENERIC_ERROR;
             }
 
-            input->channels = dims.dims[1];
-            input->height   = input_resizable ? -1 : dims.dims[2];
-            input->width    = input_resizable ? -1 : dims.dims[3];
+            if (dims[1] <= 3) { // NCHW
+                input->channels = dims[1];
+                input->height   = input_resizable ? -1 : dims[2];
+                input->width    = input_resizable ? -1 : dims[3];
+            } else { // NHWC
+                input->height   = input_resizable ? -1 : dims[1];
+                input->width    = input_resizable ? -1 : dims[2];
+                input->channels = dims[3];
+            }
             input->dt       = precision_to_datatype(precision);
             return 0;
         }
diff --git a/libavfilter/vf_dnn_detect.c b/libavfilter/vf_dnn_detect.c
index 373dda58bf..86f61c9907 100644
--- a/libavfilter/vf_dnn_detect.c
+++ b/libavfilter/vf_dnn_detect.c
@@ -699,13 +699,39 @@ static av_cold void dnn_detect_uninit(AVFilterContext *context)
     free_detect_labels(ctx);
 }
 
+static int config_input(AVFilterLink *inlink)
+{
+    AVFilterContext *context     = inlink->dst;
+    DnnDetectContext *ctx = context->priv;
+    DNNData model_input;
+    int ret;
+
+    ret = ff_dnn_get_input(&ctx->dnnctx, &model_input);
+    if (ret != 0) {
+        av_log(ctx, AV_LOG_ERROR, "could not get input from the model\n");
+        return ret;
+    }
+    ctx->scale_width = model_input.width == -1 ? inlink->w : model_input.width;
+    ctx->scale_height = model_input.height ==  -1 ? inlink->h : model_input.height;
+
+    return 0;
+}
+
+static const AVFilterPad dnn_detect_inputs[] = {
+    {
+        .name         = "default",
+        .type         = AVMEDIA_TYPE_VIDEO,
+        .config_props = config_input,
+    },
+};
+
 const AVFilter ff_vf_dnn_detect = {
     .name          = "dnn_detect",
     .description   = NULL_IF_CONFIG_SMALL("Apply DNN detect filter to the input."),
     .priv_size     = sizeof(DnnDetectContext),
     .init          = dnn_detect_init,
     .uninit        = dnn_detect_uninit,
-    FILTER_INPUTS(ff_video_default_filterpad),
+    FILTER_INPUTS(dnn_detect_inputs),
     FILTER_OUTPUTS(ff_video_default_filterpad),
     FILTER_PIXFMTS_ARRAY(pix_fmts),
     .priv_class    = &dnn_detect_class,
-- 
2.34.1

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 4+ messages in thread

* [FFmpeg-devel] [PATCH 3/4] libavfilter/vf_dnn_detect: Add yolov3 support
  2023-12-04  5:36 [FFmpeg-devel] [PATCH 1/4] libavfiter/dnn/dnn_backend_openvino: add multiple output support wenbin.chen-at-intel.com
  2023-12-04  5:36 ` [FFmpeg-devel] [PATCH 2/4] libavfilter/vf_dnn_detect: Add input pad wenbin.chen-at-intel.com
@ 2023-12-04  5:36 ` wenbin.chen-at-intel.com
  2023-12-04  5:36 ` [FFmpeg-devel] [PATCH 4/4] libavfilter/vf_dnn_detect: Add yolov4 support wenbin.chen-at-intel.com
  2 siblings, 0 replies; 4+ messages in thread
From: wenbin.chen-at-intel.com @ 2023-12-04  5:36 UTC (permalink / raw)
  To: ffmpeg-devel

From: Wenbin Chen <wenbin.chen@intel.com>

Add yolov3 support. yolov3 differs in that it has multiple outputs at
different scales, so that it performs better on both large and small
objects.

For model details, refer to: https://github.com/openvinotoolkit/open_model_zoo/tree/master/models/public/yolo-v3-tf

Signed-off-by: Wenbin Chen <wenbin.chen@intel.com>
---
 libavfilter/vf_dnn_detect.c | 28 +++++++++++++++++++++++++++-
 1 file changed, 27 insertions(+), 1 deletion(-)

diff --git a/libavfilter/vf_dnn_detect.c b/libavfilter/vf_dnn_detect.c
index 86f61c9907..7a32b191c3 100644
--- a/libavfilter/vf_dnn_detect.c
+++ b/libavfilter/vf_dnn_detect.c
@@ -35,6 +35,7 @@
 typedef enum {
     DDMT_SSD,
     DDMT_YOLOV1V2,
+    DDMT_YOLOV3
 } DNNDetectionModelType;
 
 typedef struct DnnDetectContext {
@@ -73,6 +74,7 @@ static const AVOption dnn_detect_options[] = {
     { "model_type",  "DNN detection model type",   OFFSET2(model_type),      AV_OPT_TYPE_INT,       { .i64 = DDMT_SSD },    INT_MIN, INT_MAX, FLAGS, "model_type" },
         { "ssd",     "output shape [1, 1, N, 7]",  0,                        AV_OPT_TYPE_CONST,       { .i64 = DDMT_SSD },    0, 0, FLAGS, "model_type" },
         { "yolo",    "output shape [1, N*Cx*Cy*DetectionBox]",  0,           AV_OPT_TYPE_CONST,       { .i64 = DDMT_YOLOV1V2 },    0, 0, FLAGS, "model_type" },
+        { "yolov3",  "outputs shape [1, N*D, Cx, Cy]",  0,                   AV_OPT_TYPE_CONST,       { .i64 = DDMT_YOLOV3 },      0, 0, FLAGS, "model_type" },
     { "cell_w",      "cell width",                 OFFSET2(cell_w),          AV_OPT_TYPE_INT,       { .i64 = 0 },    0, INTMAX_MAX, FLAGS },
     { "cell_h",      "cell height",                OFFSET2(cell_h),          AV_OPT_TYPE_INT,       { .i64 = 0 },    0, INTMAX_MAX, FLAGS },
     { "nb_classes",  "The number of class",        OFFSET2(nb_classes),      AV_OPT_TYPE_INT,       { .i64 = 0 },    0, INTMAX_MAX, FLAGS },
@@ -146,6 +148,11 @@ static int dnn_detect_parse_yolo_output(AVFrame *frame, DNNData *output, int out
         cell_h = ctx->cell_h;
         scale_w = cell_w;
         scale_h = cell_h;
+    } else {
+        cell_w = output[output_index].width;
+        cell_h = output[output_index].height;
+        scale_w = ctx->scale_width;
+        scale_h = ctx->scale_height;
     }
     box_size = nb_classes + 5;
 
@@ -173,6 +180,7 @@ static int dnn_detect_parse_yolo_output(AVFrame *frame, DNNData *output, int out
                       output[output_index].height *
                       output[output_index].width / box_size / cell_w / cell_h;
 
+    anchors = anchors + (detection_boxes * output_index * 2);
     /**
      * find all candidate bbox
      * yolo output can be reshaped to [B, N*D, Cx, Cy]
@@ -284,6 +292,21 @@ static int dnn_detect_post_proc_yolo(AVFrame *frame, DNNData *output, AVFilterCo
     return 0;
 }
 
+static int dnn_detect_post_proc_yolov3(AVFrame *frame, DNNData *output,
+                                       AVFilterContext *filter_ctx, int nb_outputs)
+{
+    int ret = 0;
+    for (int i = 0; i < nb_outputs; i++) {
+        ret = dnn_detect_parse_yolo_output(frame, output, i, filter_ctx);
+        if (ret < 0)
+            return ret;
+    }
+    ret = dnn_detect_fill_side_data(frame, filter_ctx);
+    if (ret < 0)
+        return ret;
+    return 0;
+}
+
 static int dnn_detect_post_proc_ssd(AVFrame *frame, DNNData *output, AVFilterContext *filter_ctx)
 {
     DnnDetectContext *ctx = filter_ctx->priv;
@@ -380,8 +403,11 @@ static int dnn_detect_post_proc_ov(AVFrame *frame, DNNData *output, int nb_outpu
         ret = dnn_detect_post_proc_yolo(frame, output, filter_ctx);
         if (ret < 0)
             return ret;
+    case DDMT_YOLOV3:
+        ret = dnn_detect_post_proc_yolov3(frame, output, filter_ctx, nb_outputs);
+        if (ret < 0)
+            return ret;
     }
-
     return 0;
 }
 
-- 
2.34.1

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 4+ messages in thread

* [FFmpeg-devel] [PATCH 4/4] libavfilter/vf_dnn_detect: Add yolov4 support
  2023-12-04  5:36 [FFmpeg-devel] [PATCH 1/4] libavfiter/dnn/dnn_backend_openvino: add multiple output support wenbin.chen-at-intel.com
  2023-12-04  5:36 ` [FFmpeg-devel] [PATCH 2/4] libavfilter/vf_dnn_detect: Add input pad wenbin.chen-at-intel.com
  2023-12-04  5:36 ` [FFmpeg-devel] [PATCH 3/4] libavfilter/vf_dnn_detect: Add yolov3 support wenbin.chen-at-intel.com
@ 2023-12-04  5:36 ` wenbin.chen-at-intel.com
  2 siblings, 0 replies; 4+ messages in thread
From: wenbin.chen-at-intel.com @ 2023-12-04  5:36 UTC (permalink / raw)
  To: ffmpeg-devel

From: Wenbin Chen <wenbin.chen@intel.com>

The difference in yolov4 is that the sigmoid function needs to be applied
to the x, y coordinates. Also make it compatible with NHWC output, as the
yolov4 model from the openvino model zoo has an NHWC output layout.

Model refer to: https://github.com/openvinotoolkit/open_model_zoo/tree/master/models/public/yolo-v4-tf

Signed-off-by: Wenbin Chen <wenbin.chen@intel.com>
---
 libavfilter/vf_dnn_detect.c | 71 ++++++++++++++++++++++++++++++-------
 1 file changed, 59 insertions(+), 12 deletions(-)

diff --git a/libavfilter/vf_dnn_detect.c b/libavfilter/vf_dnn_detect.c
index 7a32b191c3..1b04a2cb98 100644
--- a/libavfilter/vf_dnn_detect.c
+++ b/libavfilter/vf_dnn_detect.c
@@ -35,7 +35,8 @@
 typedef enum {
     DDMT_SSD,
     DDMT_YOLOV1V2,
-    DDMT_YOLOV3
+    DDMT_YOLOV3,
+    DDMT_YOLOV4
 } DNNDetectionModelType;
 
 typedef struct DnnDetectContext {
@@ -75,6 +76,7 @@ static const AVOption dnn_detect_options[] = {
         { "ssd",     "output shape [1, 1, N, 7]",  0,                        AV_OPT_TYPE_CONST,       { .i64 = DDMT_SSD },    0, 0, FLAGS, "model_type" },
         { "yolo",    "output shape [1, N*Cx*Cy*DetectionBox]",  0,           AV_OPT_TYPE_CONST,       { .i64 = DDMT_YOLOV1V2 },    0, 0, FLAGS, "model_type" },
         { "yolov3",  "outputs shape [1, N*D, Cx, Cy]",  0,                   AV_OPT_TYPE_CONST,       { .i64 = DDMT_YOLOV3 },      0, 0, FLAGS, "model_type" },
+        { "yolov4",  "outputs shape [1, N*D, Cx, Cy]",  0,                   AV_OPT_TYPE_CONST,       { .i64 = DDMT_YOLOV4 },    0, 0, FLAGS, "model_type" },
     { "cell_w",      "cell width",                 OFFSET2(cell_w),          AV_OPT_TYPE_INT,       { .i64 = 0 },    0, INTMAX_MAX, FLAGS },
     { "cell_h",      "cell height",                OFFSET2(cell_h),          AV_OPT_TYPE_INT,       { .i64 = 0 },    0, INTMAX_MAX, FLAGS },
     { "nb_classes",  "The number of class",        OFFSET2(nb_classes),      AV_OPT_TYPE_INT,       { .i64 = 0 },    0, INTMAX_MAX, FLAGS },
@@ -84,6 +86,14 @@ static const AVOption dnn_detect_options[] = {
 
 AVFILTER_DEFINE_CLASS(dnn_detect);
 
+static inline float sigmoid(float x) {
+    return 1.f / (1.f + exp(-x));
+}
+
+static inline float linear(float x) {
+    return x;
+}
+
 static int dnn_detect_get_label_id(int nb_classes, int cell_size, float *label_data)
 {
     float max_prob = 0;
@@ -142,6 +152,8 @@ static int dnn_detect_parse_yolo_output(AVFrame *frame, DNNData *output, int out
     float *output_data = output[output_index].data;
     float *anchors = ctx->anchors;
     AVDetectionBBox *bbox;
+    float (*post_process_raw_data)(float x);
+    int is_NHWC = 0;
 
     if (ctx->model_type == DDMT_YOLOV1V2) {
         cell_w = ctx->cell_w;
@@ -149,13 +161,30 @@ static int dnn_detect_parse_yolo_output(AVFrame *frame, DNNData *output, int out
         scale_w = cell_w;
         scale_h = cell_h;
     } else {
-        cell_w = output[output_index].width;
-        cell_h = output[output_index].height;
+        if (output[output_index].height != output[output_index].width &&
+            output[output_index].height == output[output_index].channels) {
+            is_NHWC = 1;
+            cell_w = output[output_index].height;
+            cell_h = output[output_index].channels;
+        } else {
+            cell_w = output[output_index].width;
+            cell_h = output[output_index].height;
+        }
         scale_w = ctx->scale_width;
         scale_h = ctx->scale_height;
     }
     box_size = nb_classes + 5;
 
+    switch (ctx->model_type) {
+    case DDMT_YOLOV1V2:
+    case DDMT_YOLOV3:
+        post_process_raw_data = linear;
+        break;
+    case DDMT_YOLOV4:
+        post_process_raw_data = sigmoid;
+         break;
+    }
+
     if (!cell_h || !cell_w) {
         av_log(filter_ctx, AV_LOG_ERROR, "cell_w and cell_h are detected\n");
         return AVERROR(EINVAL);
@@ -193,19 +222,36 @@ static int dnn_detect_parse_yolo_output(AVFrame *frame, DNNData *output, int out
                 float *detection_boxes_data;
                 int label_id;
 
-                detection_boxes_data = output_data + box_id * box_size * cell_w * cell_h;
-                conf = detection_boxes_data[cy * cell_w + cx + 4 * cell_w * cell_h];
+                if (is_NHWC) {
+                    detection_boxes_data = output_data +
+                        ((cy * cell_w + cx) * detection_boxes + box_id) * box_size;
+                    conf = post_process_raw_data(detection_boxes_data[4]);
+                } else {
+                    detection_boxes_data = output_data + box_id * box_size * cell_w * cell_h;
+                    conf = post_process_raw_data(
+                                detection_boxes_data[cy * cell_w + cx + 4 * cell_w * cell_h]);
+                }
                 if (conf < conf_threshold) {
                     continue;
                 }
 
-                x    = detection_boxes_data[cy * cell_w + cx];
-                y    = detection_boxes_data[cy * cell_w + cx + cell_w * cell_h];
-                w    = detection_boxes_data[cy * cell_w + cx + 2 * cell_w * cell_h];
-                h    = detection_boxes_data[cy * cell_w + cx + 3 * cell_w * cell_h];
-                label_id = dnn_detect_get_label_id(ctx->nb_classes, cell_w * cell_h,
-                                    detection_boxes_data + cy * cell_w + cx + 5 * cell_w * cell_h);
-                conf = conf * detection_boxes_data[cy * cell_w + cx + (label_id + 5) * cell_w * cell_h];
+                if (is_NHWC) {
+                    x = post_process_raw_data(detection_boxes_data[0]);
+                    y = post_process_raw_data(detection_boxes_data[1]);
+                    w = detection_boxes_data[2];
+                    h = detection_boxes_data[3];
+                    label_id = dnn_detect_get_label_id(ctx->nb_classes, 1, detection_boxes_data + 5);
+                    conf = conf * post_process_raw_data(detection_boxes_data[label_id + 5]);
+                } else {
+                    x = post_process_raw_data(detection_boxes_data[cy * cell_w + cx]);
+                    y = post_process_raw_data(detection_boxes_data[cy * cell_w + cx + cell_w * cell_h]);
+                    w = detection_boxes_data[cy * cell_w + cx + 2 * cell_w * cell_h];
+                    h = detection_boxes_data[cy * cell_w + cx + 3 * cell_w * cell_h];
+                    label_id = dnn_detect_get_label_id(ctx->nb_classes, cell_w * cell_h,
+                        detection_boxes_data + cy * cell_w + cx + 5 * cell_w * cell_h);
+                    conf = conf * post_process_raw_data(
+                                detection_boxes_data[cy * cell_w + cx + (label_id + 5) * cell_w * cell_h]);
+                }
 
                 bbox = av_mallocz(sizeof(*bbox));
                 if (!bbox)
@@ -404,6 +450,7 @@ static int dnn_detect_post_proc_ov(AVFrame *frame, DNNData *output, int nb_outpu
         if (ret < 0)
             return ret;
     case DDMT_YOLOV3:
+    case DDMT_YOLOV4:
         ret = dnn_detect_post_proc_yolov3(frame, output, filter_ctx, nb_outputs);
         if (ret < 0)
             return ret;
-- 
2.34.1

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2023-12-04  5:37 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-12-04  5:36 [FFmpeg-devel] [PATCH 1/4] libavfiter/dnn/dnn_backend_openvino: add multiple output support wenbin.chen-at-intel.com
2023-12-04  5:36 ` [FFmpeg-devel] [PATCH 2/4] libavfilter/vf_dnn_detect: Add input pad wenbin.chen-at-intel.com
2023-12-04  5:36 ` [FFmpeg-devel] [PATCH 3/4] libavfilter/vf_dnn_detect: Add yolov3 support wenbin.chen-at-intel.com
2023-12-04  5:36 ` [FFmpeg-devel] [PATCH 4/4] libavfilter/vf_dnn_detect: Add yolov4 support wenbin.chen-at-intel.com

Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

This inbox may be cloned and mirrored by anyone:

	git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
		ffmpegdev@gitmailbox.com
	public-inbox-index ffmpegdev

Example config snippet for mirrors.


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git