From: wenbin.chen-at-intel.com@ffmpeg.org To: ffmpeg-devel@ffmpeg.org Subject: [FFmpeg-devel] [PATCH 2/2] libavfilter/vf_dnn_detect: Add yolo support Date: Tue, 21 Nov 2023 10:20:18 +0800 Message-ID: <20231121022018.285533-2-wenbin.chen@intel.com> (raw) In-Reply-To: <20231121022018.285533-1-wenbin.chen@intel.com> From: Wenbin Chen <wenbin.chen@intel.com> Add yolo support. Yolo model doesn't output final result. It outputs candidate boxes, so we need post-process to remove overlap boxes to get final results. Also, the box's coordinators relate to cell and anchors, so we need these information to calculate boxes as well. Model detail please refer to: https://github.com/openvinotoolkit/open_model_zoo/tree/master/models/public/yolo-v2-tf Signed-off-by: Wenbin Chen <wenbin.chen@intel.com> --- libavfilter/dnn/dnn_backend_openvino.c | 6 +- libavfilter/vf_dnn_detect.c | 242 ++++++++++++++++++++++++- 2 files changed, 244 insertions(+), 4 deletions(-) diff --git a/libavfilter/dnn/dnn_backend_openvino.c b/libavfilter/dnn/dnn_backend_openvino.c index d3af8c34ce..6fe8b9c243 100644 --- a/libavfilter/dnn/dnn_backend_openvino.c +++ b/libavfilter/dnn/dnn_backend_openvino.c @@ -386,9 +386,9 @@ static void infer_completion_callback(void *args) ov_shape_free(&output_shape); return; } - output.channels = dims[1]; - output.height = dims[2]; - output.width = dims[3]; + output.channels = output_shape.rank > 2 ? dims[output_shape.rank - 3] : 1; + output.height = output_shape.rank > 1 ? dims[output_shape.rank - 2] : 1; + output.width = output_shape.rank > 0 ? dims[output_shape.rank - 1] : 1; av_assert0(request->lltask_count <= dims[0]); ov_shape_free(&output_shape); #else diff --git a/libavfilter/vf_dnn_detect.c b/libavfilter/vf_dnn_detect.c index 9db90ee4cf..7ac3bb0b58 100644 --- a/libavfilter/vf_dnn_detect.c +++ b/libavfilter/vf_dnn_detect.c @@ -30,9 +30,11 @@ #include "libavutil/time.h" #include "libavutil/avstring.h" #include "libavutil/detection_bbox.h" +#include "libavutil/fifo.h" typedef enum { - DDMT_SSD + DDMT_SSD, + DDMT_YOLOV1V2, } DNNDetectionModelType; typedef struct DnnDetectContext { @@ -43,6 +45,15 @@ typedef struct DnnDetectContext { char **labels; int label_count; DNNDetectionModelType model_type; + int cell_w; + int cell_h; + int nb_classes; + AVFifo *bboxes_fifo; + int scale_width; + int scale_height; + char *anchors_str; + float *anchors; + int nb_anchor; } DnnDetectContext; #define OFFSET(x) offsetof(DnnDetectContext, dnnctx.x) @@ -61,11 +72,218 @@ static const AVOption dnn_detect_options[] = { { "labels", "path to labels file", OFFSET2(labels_filename), AV_OPT_TYPE_STRING, { .str = NULL }, 0, 0, FLAGS }, { "model_type", "DNN detection model type", OFFSET2(model_type), AV_OPT_TYPE_INT, { .i64 = DDMT_SSD }, INT_MIN, INT_MAX, FLAGS, "model_type" }, { "ssd", "output shape [1, 1, N, 7]", 0, AV_OPT_TYPE_CONST, { .i64 = DDMT_SSD }, 0, 0, FLAGS, "model_type" }, + { "yolo", "output shape [1, N*Cx*Cy*DetectionBox]", 0, AV_OPT_TYPE_CONST, { .i64 = DDMT_YOLOV1V2 }, 0, 0, FLAGS, "model_type" }, + { "cell_w", "cell width", OFFSET2(cell_w), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, INTMAX_MAX, FLAGS }, + { "cell_h", "cell height", OFFSET2(cell_h), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, INTMAX_MAX, FLAGS }, + { "nb_classes", "The number of class", OFFSET2(nb_classes), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, INTMAX_MAX, FLAGS }, + { "anchors", "anchors, splited by '&'", OFFSET2(anchors_str), AV_OPT_TYPE_STRING, { .str = NULL }, 0, 0, FLAGS }, { NULL } }; AVFILTER_DEFINE_CLASS(dnn_detect); +static int dnn_detect_get_label_id(int nb_classes, int cell_size, float *label_data) +{ + float max_prob = 0; + int label_id = 0; + for (int i = 0; i < nb_classes; i++) { + if (label_data[i * cell_size] > max_prob) { + max_prob = label_data[i * cell_size]; + label_id = i; + } + } + return label_id; +} + +static int dnn_detect_parse_anchors(char *anchors_str, float **anchors) +{ + char *saveptr = NULL, *token; + float *anchors_buf; + int nb_anchor = 0, i = 0; + while(anchors_str[i] != '\0') { + if(anchors_str[i] == '&') + nb_anchor++; + i++; + } + nb_anchor++; + anchors_buf = av_mallocz(nb_anchor * sizeof(*anchors)); + if (!anchors_buf) { + return 0; + } + for (int i = 0; i < nb_anchor; i++) { + token = av_strtok(anchors_str, "&", &saveptr); + anchors_buf[i] = strtof(token, NULL); + anchors_str = NULL; + } + *anchors = anchors_buf; + return nb_anchor; +} + +/* Calculate Intersection Over Union */ +static float dnn_detect_IOU(AVDetectionBBox *bbox1, AVDetectionBBox *bbox2) +{ + float overlapping_width = FFMIN(bbox1->x + bbox1->w, bbox2->x + bbox2->w) - FFMAX(bbox1->x, bbox2->x); + float overlapping_height = FFMIN(bbox1->y + bbox1->h, bbox2->y + bbox2->h) - FFMAX(bbox1->y, bbox2->y); + float intersection_area = + (overlapping_width < 0 || overlapping_height < 0) ? 0 : overlapping_height * overlapping_width; + float union_area = bbox1->w * bbox1->h + bbox2->w * bbox2->h - intersection_area; + return intersection_area / union_area; +} + +static int dnn_detect_parse_yolo_output(AVFrame *frame, DNNData *output, int output_index, + AVFilterContext *filter_ctx) +{ + DnnDetectContext *ctx = filter_ctx->priv; + float conf_threshold = ctx->confidence; + int detection_boxes, box_size, cell_w, cell_h, scale_w, scale_h; + int nb_classes = ctx->nb_classes; + float *output_data = output[output_index].data; + float *anchors = ctx->anchors; + AVDetectionBBox *bbox; + + if (ctx->model_type == DDMT_YOLOV1V2) { + cell_w = ctx->cell_w; + cell_h = ctx->cell_h; + scale_w = cell_w; + scale_h = cell_h; + } + box_size = nb_classes + 5; + + if (!cell_h || !cell_w) { + av_log(filter_ctx, AV_LOG_ERROR, "cell_w and cell_h are detected\n"); + return AVERROR(EINVAL); + } + + if (!nb_classes) { + av_log(filter_ctx, AV_LOG_ERROR, "nb_classes is not set\n"); + return AVERROR(EINVAL); + } + + if (!anchors) { + av_log(filter_ctx, AV_LOG_ERROR, "anchors is not set\n"); + return AVERROR(EINVAL); + } + + if (output[output_index].channels * output[output_index].width * + output[output_index].height % (box_size * cell_w * cell_h)) { + av_log(filter_ctx, AV_LOG_ERROR, "wrong cell_w, cell_h or nb_classes\n"); + return AVERROR(EINVAL); + } + detection_boxes = output[output_index].channels * + output[output_index].height * + output[output_index].width / box_size / cell_w / cell_h; + + /** + * find all candidate bbox + * yolo output can be reshaped to [B, N*D, Cx, Cy] + * Detection box 'D' has format [`x`, `y`, `h`, `w`, `box_score`, `class_no_1`, ...,] + **/ + for (int box_id = 0; box_id < detection_boxes; box_id++) { + for (int cx = 0; cx < cell_w; cx++) + for (int cy = 0; cy < cell_h; cy++) { + float x, y, w, h, conf; + float *detection_boxes_data; + int label_id; + + detection_boxes_data = output_data + box_id * box_size * cell_w * cell_h; + conf = detection_boxes_data[cy * cell_w + cx + 4 * cell_w * cell_h]; + if (conf < conf_threshold) { + continue; + } + + x = detection_boxes_data[cy * cell_w + cx]; + y = detection_boxes_data[cy * cell_w + cx + cell_w * cell_h]; + w = detection_boxes_data[cy * cell_w + cx + 2 * cell_w * cell_h]; + h = detection_boxes_data[cy * cell_w + cx + 3 * cell_w * cell_h]; + label_id = dnn_detect_get_label_id(ctx->nb_classes, cell_w * cell_h, + detection_boxes_data + cy * cell_w + cx + 5 * cell_w * cell_h); + conf = conf * detection_boxes_data[cy * cell_w + cx + (label_id + 5) * cell_w * cell_h]; + + bbox = av_mallocz(sizeof(*bbox)); + if (!bbox) + return AVERROR(ENOMEM); + + bbox->w = exp(w) * anchors[box_id * 2] * frame->width / scale_w; + bbox->h = exp(h) * anchors[box_id * 2 + 1] * frame->height / scale_h; + bbox->x = (cx + x) / cell_w * frame->width - bbox->w / 2; + bbox->y = (cy + y) / cell_h * frame->height - bbox->h / 2; + bbox->detect_confidence = av_make_q((int)(conf * 10000), 10000); + if (ctx->labels && label_id < ctx->label_count) { + av_strlcpy(bbox->detect_label, ctx->labels[label_id], sizeof(bbox->detect_label)); + } else { + snprintf(bbox->detect_label, sizeof(bbox->detect_label), "%d", label_id); + } + + if (av_fifo_write(ctx->bboxes_fifo, &bbox, 1) < 0) { + av_freep(&bbox); + return AVERROR(ENOMEM); + } + } + } + return 0; +} + +static int dnn_detect_fill_side_data(AVFrame *frame, AVFilterContext *filter_ctx) +{ + DnnDetectContext *ctx = filter_ctx->priv; + float conf_threshold = ctx->confidence; + AVDetectionBBox *bbox; + int nb_bboxes = 0; + AVDetectionBBoxHeader *header; + if (av_fifo_can_read(ctx->bboxes_fifo) == 0) { + av_log(filter_ctx, AV_LOG_VERBOSE, "nothing detected in this frame.\n"); + return 0; + } + + /* remove overlap bboxes */ + for (int i = 0; i < av_fifo_can_read(ctx->bboxes_fifo); i++){ + av_fifo_peek(ctx->bboxes_fifo, &bbox, 1, i); + for (int j = 0; j < av_fifo_can_read(ctx->bboxes_fifo); j++) { + AVDetectionBBox *overlap_bbox; + av_fifo_peek(ctx->bboxes_fifo, &overlap_bbox, 1, j); + if (!strcmp(bbox->detect_label, overlap_bbox->detect_label) && + av_cmp_q(bbox->detect_confidence, overlap_bbox->detect_confidence) < 0 && + dnn_detect_IOU(bbox, overlap_bbox) >= conf_threshold) { + bbox->classify_count = -1; // bad result + nb_bboxes++; + break; + } + } + } + nb_bboxes = av_fifo_can_read(ctx->bboxes_fifo) - nb_bboxes; + header = av_detection_bbox_create_side_data(frame, nb_bboxes); + if (!header) { + av_log(filter_ctx, AV_LOG_ERROR, "failed to create side data with %d bounding boxes\n", nb_bboxes); + return -1; + } + av_strlcpy(header->source, ctx->dnnctx.model_filename, sizeof(header->source)); + + while(av_fifo_can_read(ctx->bboxes_fifo)) { + AVDetectionBBox *candidate_bbox; + av_fifo_read(ctx->bboxes_fifo, &candidate_bbox, 1); + + if (nb_bboxes > 0 && candidate_bbox->classify_count != -1) { + bbox = av_get_detection_bbox(header, header->nb_bboxes - nb_bboxes); + memcpy(bbox, candidate_bbox, sizeof(*bbox)); + nb_bboxes--; + } + av_freep(&candidate_bbox); + } + return 0; +} + +static int dnn_detect_post_proc_yolo(AVFrame *frame, DNNData *output, AVFilterContext *filter_ctx) +{ + int ret = 0; + ret = dnn_detect_parse_yolo_output(frame, output, 0, filter_ctx); + if (ret < 0) + return ret; + ret = dnn_detect_fill_side_data(frame, filter_ctx); + if (ret < 0) + return ret; + return 0; +} + static int dnn_detect_post_proc_ssd(AVFrame *frame, DNNData *output, AVFilterContext *filter_ctx) { DnnDetectContext *ctx = filter_ctx->priv; @@ -158,6 +376,10 @@ static int dnn_detect_post_proc_ov(AVFrame *frame, DNNData *output, AVFilterCont if (ret < 0) return ret; break; + case DDMT_YOLOV1V2: + ret = dnn_detect_post_proc_yolo(frame, output, filter_ctx); + if (ret < 0) + return ret; } return 0; @@ -356,11 +578,22 @@ static av_cold int dnn_detect_init(AVFilterContext *context) ret = check_output_nb(ctx, dnn_ctx->backend_type, dnn_ctx->nb_outputs); if (ret < 0) return ret; + ctx->bboxes_fifo = av_fifo_alloc2(1, sizeof(AVDetectionBBox *), AV_FIFO_FLAG_AUTO_GROW); + if (!ctx->bboxes_fifo) + return AVERROR(ENOMEM); ff_dnn_set_detect_post_proc(&ctx->dnnctx, dnn_detect_post_proc); if (ctx->labels_filename) { return read_detect_label_file(context); } + if (ctx->anchors_str) { + ret = dnn_detect_parse_anchors(ctx->anchors_str, &ctx->anchors); + if (!ctx->anchors) { + av_log(context, AV_LOG_ERROR, "failed to parse anchors_str\n"); + return AVERROR(EINVAL); + } + ctx->nb_anchor = ret; + } return 0; } @@ -460,7 +693,14 @@ static int dnn_detect_activate(AVFilterContext *filter_ctx) static av_cold void dnn_detect_uninit(AVFilterContext *context) { DnnDetectContext *ctx = context->priv; + AVDetectionBBox *bbox; ff_dnn_uninit(&ctx->dnnctx); + while(av_fifo_can_read(ctx->bboxes_fifo)) { + av_fifo_read(ctx->bboxes_fifo, &bbox, 1); + av_freep(&bbox); + } + av_fifo_freep2(&ctx->bboxes_fifo); + av_freep(&ctx->anchors); free_detect_labels(ctx); } -- 2.34.1 _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
next prev parent reply other threads:[~2023-11-21 2:20 UTC|newest] Thread overview: 3+ messages / expand[flat|nested] mbox.gz Atom feed top 2023-11-21 2:20 [FFmpeg-devel] [PATCH 1/2] libavfilter/vf_dnn_detect: Add model_type option wenbin.chen-at-intel.com 2023-11-21 2:20 ` wenbin.chen-at-intel.com [this message] 2023-11-25 4:52 ` [FFmpeg-devel] [PATCH 2/2] libavfilter/vf_dnn_detect: Add yolo support Guo, Yejun
Reply instructions: You may reply publicly to this message via plain-text email using any one of the following methods: * Save the following mbox file, import it into your mail client, and reply-to-all from there: mbox Avoid top-posting and favor interleaved quoting: https://en.wikipedia.org/wiki/Posting_style#Interleaved_style * Reply using the --to, --cc, and --in-reply-to switches of git-send-email(1): git send-email \ --in-reply-to=20231121022018.285533-2-wenbin.chen@intel.com \ --to=wenbin.chen-at-intel.com@ffmpeg.org \ --cc=ffmpeg-devel@ffmpeg.org \ /path/to/YOUR_REPLY https://kernel.org/pub/software/scm/git/docs/git-send-email.html * If your mail client supports setting the In-Reply-To header via mailto: links, try the mailto: link
Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel This inbox may be cloned and mirrored by anyone: git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git # If you have public-inbox 1.1+ installed, you may # initialize and index your mirror using the following commands: public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \ ffmpegdev@gitmailbox.com public-inbox-index ffmpegdev Example config snippet for mirrors. AGPL code for this site: git clone https://public-inbox.org/public-inbox.git