From: <m.kaindl0208@gmail.com> To: <ffmpeg-devel@ffmpeg.org> Subject: [FFmpeg-devel] [PATCH v2 FFmpeg 17/20] libavfilter: turn dnn_classify to multimedia filter. Classify CLIP/CLAP implementation. Date: Mon, 10 Mar 2025 20:55:33 +0100 Message-ID: <004501db91f6$62c2d680$28488380$@gmail.com> (raw) Signed-off-by: MaximilianKaindl <m.kaindl0208@gmail.com> --- libavfilter/Makefile | 2 +- libavfilter/allfilters.c | 2 +- libavfilter/avf_dnn_classify.c | 1283 ++++++++++++++++++++++++++++++++ libavfilter/vf_dnn_classify.c | 308 -------- 4 files changed, 1285 insertions(+), 310 deletions(-) create mode 100644 libavfilter/avf_dnn_classify.c delete mode 100644 libavfilter/vf_dnn_classify.c diff --git a/libavfilter/Makefile b/libavfilter/Makefile index 7c0d879ec9..987dbcb82b 100644 --- a/libavfilter/Makefile +++ b/libavfilter/Makefile @@ -286,7 +286,6 @@ OBJS-$(CONFIG_DILATION_FILTER) += vf_neighbor.o OBJS-$(CONFIG_DILATION_OPENCL_FILTER) += vf_neighbor_opencl.o opencl.o \ opencl/neighbor.o OBJS-$(CONFIG_DISPLACE_FILTER) += vf_displace.o framesync.o -OBJS-$(CONFIG_DNN_CLASSIFY_FILTER) += vf_dnn_classify.o OBJS-$(CONFIG_DNN_DETECT_FILTER) += vf_dnn_detect.o OBJS-$(CONFIG_DNN_PROCESSING_FILTER) += vf_dnn_processing.o OBJS-$(CONFIG_DOUBLEWEAVE_FILTER) += vf_weave.o @@ -635,6 +634,7 @@ OBJS-$(CONFIG_AHISTOGRAM_FILTER) += avf_ahistogram.o OBJS-$(CONFIG_APHASEMETER_FILTER) += avf_aphasemeter.o OBJS-$(CONFIG_AVECTORSCOPE_FILTER) += avf_avectorscope.o OBJS-$(CONFIG_CONCAT_FILTER) += avf_concat.o +OBJS-$(CONFIG_DNN_CLASSIFY_FILTER) += avf_dnn_classify.o OBJS-$(CONFIG_SHOWCQT_FILTER) += avf_showcqt.o lswsutils.o lavfutils.o OBJS-$(CONFIG_SHOWCWT_FILTER) += avf_showcwt.o OBJS-$(CONFIG_SHOWFREQS_FILTER) += avf_showfreqs.o diff --git a/libavfilter/allfilters.c b/libavfilter/allfilters.c index 740d9ab265..5385173dc1 100644 --- a/libavfilter/allfilters.c +++ b/libavfilter/allfilters.c @@ -261,7 +261,6 @@ extern const FFFilter ff_vf_detelecine; extern const FFFilter ff_vf_dilation; extern 
const FFFilter ff_vf_dilation_opencl; extern const FFFilter ff_vf_displace; -extern const FFFilter ff_vf_dnn_classify; extern const FFFilter ff_vf_dnn_detect; extern const FFFilter ff_vf_dnn_processing; extern const FFFilter ff_vf_doubleweave; @@ -596,6 +595,7 @@ extern const FFFilter ff_avf_ahistogram; extern const FFFilter ff_avf_aphasemeter; extern const FFFilter ff_avf_avectorscope; extern const FFFilter ff_avf_concat; +extern const FFFilter ff_avf_dnn_classify; extern const FFFilter ff_avf_showcqt; extern const FFFilter ff_avf_showcwt; extern const FFFilter ff_avf_showfreqs; diff --git a/libavfilter/avf_dnn_classify.c b/libavfilter/avf_dnn_classify.c new file mode 100644 index 0000000000..7b469e3af0 --- /dev/null +++ b/libavfilter/avf_dnn_classify.c @@ -0,0 +1,1283 @@ +/* +* This file is part of FFmpeg. +* +* FFmpeg is free software; you can redistribute it and/or +* modify it under the terms of the GNU Lesser General Public +* License as published by the Free Software Foundation; either +* version 2.1 of the License, or (at your option) any later version. +* +* FFmpeg is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +* Lesser General Public License for more details. 
+* +* You should have received a copy of the GNU Lesser General Public +* License along with FFmpeg; if not, write to the Free Software +* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +*/ + +/** + * @file + * DNN classification filter supporting both video (standard and CLIP) + * and audio (CLAP) classification + */ + +#include "audio.h" +#include "avfilter.h" +#include "dnn_filter_common.h" +#include "filters.h" +#include "formats.h" +#include "libavutil/avstring.h" +#include "libavutil/detection_bbox.h" +#include "libavutil/file_open.h" +#include "libavutil/mem.h" +#include "libavutil/opt.h" +#include "libavutil/time.h" +#include "video.h" + +/* + Labels that are being used to classify the image +*/ +typedef struct LabelContext { + char **labels; + int label_count; +} LabelContext; + +/* + Category that holds multiple Labels +*/ +typedef struct CategoryContext { + char *name; + LabelContext *labels; + int label_count; + float total_probability; +} CategoryContext; + +/* +Header (Attribute) that is being described by all its labels. +(name) in labels file + +e.g. +(Comic) +a drawn image +a fictional character +... + +Can also be used to substitute the labeltext with the header text +so that the header is displayed instead of the label with the probability +*/ +typedef struct CategoriesContext { + char *name; + CategoryContext *categories; + int category_count; + int label_count; + int max_categories; +} CategoriesContext; + +/* +Unit that is being classified. Each one can have multiple categories +[name] in categories file + +e.g. +[RecordingSystem] +(Professional) +a photo with high level of detail +... +(HomeRecording) +a photo with low level of detail +... 
+ +empowers to do multiple classification "runs" +result per unit will be best category (rated by the sum over all confidence values) one of the categories +softmax is applied over each unit +*/ +typedef struct CategoryClassifcationContext { +CategoriesContext **category_units; +int num_contexts; +int max_contexts; +int total_labels; +int total_categories; +} CategoryClassifcationContext; + +typedef struct DnnClassifyContext { + const AVClass *class; + DnnContext dnnctx; + float confidence; + char *labels_filename; + char *target; + enum AVMediaType type; // AVMEDIA_TYPE_VIDEO or AVMEDIA_TYPE_AUDIO + + // Standard classification + LabelContext *label_classification_ctx; + + CategoryClassifcationContext *category_classification_ctx; + + char *categories_filename; + char *tokenizer_path; + + // Audio-specific parameters + int is_audio; // New flag to indicate if input will be audio +} DnnClassifyContext; + +#define OFFSET(x) offsetof(DnnClassifyContext, dnnctx.x) +#define OFFSET2(x) offsetof(DnnClassifyContext, x) +#if (CONFIG_LIBTORCH == 1) +#define OFFSET3(x) offsetof(DnnClassifyContext, dnnctx.torch_option.x) +#endif +#define FLAGS AV_OPT_FLAG_FILTERING_PARAM | AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_AUDIO_PARAM + +static const AVOption dnn_classify_options[] = { + { "dnn_backend", "DNN backend", OFFSET(backend_type), AV_OPT_TYPE_INT, { .i64 = DNN_OV }, INT_MIN, INT_MAX, FLAGS, .unit = "backend" }, +#if (CONFIG_LIBOPENVINO == 1) + { "openvino", "openvino backend flag", 0, AV_OPT_TYPE_CONST, { .i64 = DNN_OV }, 0, 0, FLAGS, .unit = "backend" }, +#endif +#if (CONFIG_LIBTORCH == 1) + { "torch", "torch backend flag", 0, AV_OPT_TYPE_CONST, { .i64 = DNN_TH }, 0, 0, FLAGS, .unit = "backend" }, + { "logit_scale", "logit scale for similarity calculation", OFFSET3(logit_scale), AV_OPT_TYPE_FLOAT, { .dbl = -1.0 }, -1.0, 100.0, FLAGS }, + { "temperature", "softmax temperature", OFFSET3(temperature), AV_OPT_TYPE_FLOAT, { .dbl = -1.0 }, -1.0, 100.0, FLAGS }, + { "forward_order", 
"Order of forward output (0: media text, 1: text media) (CLIP/CLAP only)", OFFSET3(forward_order), AV_OPT_TYPE_BOOL, { .i64 = -1 }, -1, 1, FLAGS }, + { "normalize", "Normalize the input tensor (CLIP/CLAP only)", OFFSET3(normalize), AV_OPT_TYPE_BOOL, { .i64 = -1 }, -1, 1, FLAGS }, + { "input_res", "video processing model expected input size", OFFSET3(input_resolution), AV_OPT_TYPE_INT64, { .i64 = -1 }, -1, 10000, FLAGS }, + { "sample_rate", "audio processing model expected sample rate", OFFSET3(sample_rate), AV_OPT_TYPE_INT64, { .i64 = 44100 }, 1600, 192000, FLAGS }, + { "sample_duration","audio processing model expected sample duration",OFFSET3(sample_duration), AV_OPT_TYPE_INT64, { .i64 = 7 }, 1, 100, FLAGS }, + { "token_dimension","dimension of token vector", OFFSET3(token_dimension), AV_OPT_TYPE_INT64, { .i64 = 77 }, 1, 10000, FLAGS }, +#endif + { "confidence", "threshold of confidence", OFFSET2(confidence), AV_OPT_TYPE_FLOAT, { .dbl = 0.5 }, 0, 1, FLAGS }, + { "labels", "path to labels file", OFFSET2(labels_filename), AV_OPT_TYPE_STRING, { .str = NULL }, 0, 0, FLAGS }, + { "target", "which one to be classified", OFFSET2(target), AV_OPT_TYPE_STRING, { .str = NULL }, 0, 0, FLAGS }, + { "categories", "path to categories file (CLIP/CLAP only)", OFFSET2(categories_filename), AV_OPT_TYPE_STRING, { .str = NULL }, 0, 0, FLAGS }, + { "tokenizer", "path to text tokenizer.json file (CLIP/CLAP only)", OFFSET2(tokenizer_path), AV_OPT_TYPE_STRING, { .str = NULL }, 0, 0, FLAGS }, + { "is_audio", "audio processing mode", OFFSET2(is_audio), AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, FLAGS }, + { NULL } +}; + +AVFILTER_DNN_DEFINE_CLASS(dnn_classify, DNN_OV); + +static void free_label_context(LabelContext *ctx) +{ + if (!ctx) + return; + + if (ctx->labels) { + for (int i = 0; i < ctx->label_count; i++) { + av_freep(&ctx->labels[i]); + } + av_freep(&ctx->labels); + } + ctx->label_count = 0; +} + +static void free_category_context(CategoryContext *category) +{ + if (!category) + return; 
+ + if (category->name) { + av_freep(&category->name); + } + + if (category->labels) { + free_label_context(category->labels); + av_freep(&category->labels); + } +} + +static void free_categories_context(CategoriesContext *ctx) +{ + if (!ctx) + return; + + if (ctx->categories) { + for (int i = 0; i < ctx->category_count; i++) { + free_category_context(&ctx->categories[i]); + } + // Now free the array of categories + av_freep(&ctx->categories); + ctx->categories = NULL; + } + + if (ctx->name) { + av_freep(&ctx->name); + ctx->name = NULL; + } + + ctx->category_count = 0; + ctx->max_categories = 0; + ctx->label_count = 0; +} + +static void free_category_classfication_context(CategoryClassifcationContext *category_classification_ctx) +{ + if (category_classification_ctx) { + if (category_classification_ctx->category_units) { + for (int i = 0; i < category_classification_ctx->num_contexts; i++) { + if (category_classification_ctx->category_units[i]) { + free_categories_context(category_classification_ctx->category_units[i]); + av_freep(&category_classification_ctx->category_units[i]); + } + } + av_freep(&category_classification_ctx->category_units); + category_classification_ctx->category_units = NULL; + } + category_classification_ctx->num_contexts = 0; + category_classification_ctx->max_contexts = 0; + } +} + +/* Store one classification result (label text and probability) in slot `index` of the bbox. A label that does not fit is truncated and a warning is logged. Always returns 0. */ +static int detection_bbox_set_content(AVDetectionBBox *bbox, char *label, int index, float probability) +{ + // Set probability + bbox->classify_confidences[index] = av_make_q((int)(probability * 10000), 10000); + + // Copy label with size checking + if (av_strlcpy(bbox->classify_labels[index], label, AV_DETECTION_BBOX_LABEL_NAME_MAX_SIZE) >= + AV_DETECTION_BBOX_LABEL_NAME_MAX_SIZE) { + av_log(NULL, AV_LOG_WARNING, "Label truncated in detection_bbox_set_content\n"); + } + + return 0; +} + +/* Add every label whose probability reaches confidence_threshold to the bbox. Once max_classes_per_box slots are in use, a new label only replaces the stored entry with the lowest confidence, and only if it is more probable. Returns 0 on success or AVERROR(EINVAL) when a required pointer is NULL. The return type is spelled out: implicit int is not valid C99. */ +static int fill_detection_bbox_with_best_labels(char **labels, float *probabilities, int num_labels, AVDetectionBBox *bbox, int max_classes_per_box, float confidence_threshold) +{ 
+ int i, j, minpos, ret; + float min; + + if (!labels || !probabilities || !bbox) { + return AVERROR(EINVAL); + } + + for (i = 0; i < num_labels; i++) { + if (probabilities[i] >= confidence_threshold) { + if (bbox->classify_count >= max_classes_per_box) { + // Find lowest probability classification + min = av_q2d(bbox->classify_confidences[0]); + minpos = 0; + for (j = 1; j < bbox->classify_count; j++) { + float prob = av_q2d(bbox->classify_confidences[j]); + if (prob < min) { + min = prob; + minpos = j; + } + } + + if (probabilities[i] > min) { + ret = detection_bbox_set_content(bbox, labels[i], minpos, probabilities[i]); + if (ret < 0) + return ret; + } + } else { + ret = detection_bbox_set_content(bbox, labels[i], bbox->classify_count, probabilities[i]); + if (ret < 0) + return ret; + bbox->classify_count++; + } + } + } + return 0; +} + +static int read_classify_label_file(AVFilterContext *context, LabelContext *label_classification_ctx, + char *labels_filename, int max_line_length) +{ + int line_len; + FILE *file; + + file = avpriv_fopen_utf8(labels_filename, "r"); + if (!file) { + av_log(context, AV_LOG_ERROR, "failed to open file %s\n", labels_filename); + return AVERROR(EINVAL); + } + + while (!feof(file)) { + char *label; + char buf[256]; + if (!fgets(buf, 256, file)) { + break; + } + + line_len = strlen(buf); + while (line_len) { + int i = line_len - 1; + if (buf[i] == '\n' || buf[i] == '\r' || buf[i] == ' ') { + buf[i] = '\0'; + line_len--; + } else { + break; + } + } + + if (line_len == 0) // empty line + continue; + + if (line_len >= max_line_length) { + av_log(context, AV_LOG_ERROR, "label %s too long\n", buf); + fclose(file); + return AVERROR(EINVAL); + } + + label = av_strdup(buf); + if (!label) { + av_log(context, AV_LOG_ERROR, "failed to allocate memory for label %s\n", buf); + fclose(file); + return AVERROR(ENOMEM); + } + + if (av_dynarray_add_nofree(&label_classification_ctx->labels, &label_classification_ctx->label_count, label) < + 0) { + 
av_log(context, AV_LOG_ERROR, "failed to do av_dynarray_add\n"); + fclose(file); + av_freep(&label); + return AVERROR(ENOMEM); + } + } + + fclose(file); + return 0; +} + +static int read_classify_categories_file(AVFilterContext *context, CategoryClassifcationContext *cat_class_ctx, + char *categories_filename, int max_line_length) +{ + FILE *file; + char buf[256]; + int ret = 0; + CategoriesContext *current_ctx = NULL; + CategoryContext *current_category = NULL; + + file = avpriv_fopen_utf8(categories_filename, "r"); + if (!file) { + av_log(context, AV_LOG_ERROR, "Failed to open categories file %s\n", categories_filename); + return AVERROR(EINVAL); + } + + // Initialize contexts array + cat_class_ctx->max_contexts = 10; + cat_class_ctx->num_contexts = 0; + cat_class_ctx->category_units = av_calloc(cat_class_ctx->max_contexts, sizeof(CategoriesContext *)); + if (!cat_class_ctx->category_units) { + fclose(file); + return AVERROR(ENOMEM); + } + + while (fgets(buf, sizeof(buf), file)) { + char *line = buf; + int line_len = strlen(line); + + // Trim whitespace and newlines + while (line_len > 0 && + (line[line_len - 1] == '\n' || line[line_len - 1] == '\r' || line[line_len - 1] == ' ')) { + line[--line_len] = '\0'; + } + + if (line_len == 0) + continue; + + if (line_len >= max_line_length) { + av_log(context, AV_LOG_ERROR, "Label %s too long\n", buf); + ret = AVERROR(ENOMEM); + goto end; + } + + // Check for context marker [ContextName] + if (line[0] == '[' && line[line_len - 1] == ']') { + if (current_ctx != NULL) { + // Store previous context + if (cat_class_ctx->num_contexts >= cat_class_ctx->max_contexts) { + int new_size = cat_class_ctx->max_contexts * 2; + CategoriesContext **new_contexts = + av_realloc_array(cat_class_ctx->category_units, new_size, sizeof(CategoriesContext *)); + if (!new_contexts) { + ret = AVERROR(ENOMEM); + goto end; + } + cat_class_ctx->category_units = new_contexts; + cat_class_ctx->max_contexts = new_size; + } + 
cat_class_ctx->category_units[cat_class_ctx->num_contexts++] = current_ctx; + } + + // Create new context + current_ctx = av_calloc(1, sizeof(CategoriesContext)); + if (!current_ctx) { + ret = AVERROR(ENOMEM); + goto end; + } + + // Extract context name + line[line_len - 1] = '\0'; + current_ctx->name = av_strdup(line + 1); + if (!current_ctx->name) { + av_freep(&current_ctx); + ret = AVERROR(ENOMEM); + goto end; + } + + current_ctx->category_count = 0; + current_ctx->max_categories = 10; + current_ctx->categories = av_calloc(current_ctx->max_categories, sizeof(CategoryContext)); + if (!current_ctx->categories) { + av_freep(&current_ctx->name); + av_freep(&current_ctx); + ret = AVERROR(ENOMEM); + goto end; + } + + current_category = NULL; + } + // Check for category marker (CategoryName) + else if (line[0] == '(' && line[line_len - 1] == ')') { + if (!current_ctx) { + av_log(context, AV_LOG_ERROR, "Category found without context\n"); + ret = AVERROR(EINVAL); + goto end; + } + + if (current_ctx->category_count >= current_ctx->max_categories) { + int new_size = current_ctx->max_categories * 2; + CategoryContext *new_categories = + av_realloc_array(current_ctx->categories, new_size, sizeof(CategoryContext)); + if (!new_categories) { + ret = AVERROR(ENOMEM); + goto end; + } + current_ctx->categories = new_categories; + current_ctx->max_categories = new_size; + } + + line[line_len - 1] = '\0'; + current_category = &current_ctx->categories[current_ctx->category_count++]; + /* Slots gained through av_realloc_array() are not zeroed; clear the entry so that the error cleanup never frees a stale name/labels pointer */ + *current_category = (CategoryContext){ 0 }; + cat_class_ctx->total_categories++; + + current_category->name = av_strdup(line + 1); + if (!current_category->name) { + ret = AVERROR(ENOMEM); + goto end; + } + + current_category->labels = av_calloc(1, sizeof(LabelContext)); + if (!current_category->labels) { + av_freep(&current_category->name); + ret = AVERROR(ENOMEM); + goto end; + } + current_category->label_count = 0; + current_category->total_probability = 0.0f; + } + // Must be a label + else if (line[0] != '\0') { + if (!current_category) { + av_log(context, AV_LOG_ERROR, 
"Label found without category\n"); + ret = AVERROR(EINVAL); + goto end; + } + char *label = av_strdup(line); + if (!label) { + ret = AVERROR(ENOMEM); + goto end; + } + + if (av_dynarray_add_nofree(&current_category->labels->labels, &current_category->labels->label_count, + label) < 0) { + av_freep(&label); + ret = AVERROR(ENOMEM); + goto end; + } + + current_category->label_count++; + current_ctx->label_count++; + cat_class_ctx->total_labels++; + } + } + + // Store the last context + if (current_ctx) { + if (cat_class_ctx->num_contexts >= cat_class_ctx->max_contexts) { + int new_size = cat_class_ctx->max_contexts * 2; + CategoriesContext **new_contexts = + av_realloc_array(cat_class_ctx->category_units, new_size, sizeof(CategoriesContext *)); + if (!new_contexts) { + ret = AVERROR(ENOMEM); + goto end; + } + cat_class_ctx->category_units = new_contexts; + cat_class_ctx->max_contexts = new_size; + } + cat_class_ctx->category_units[cat_class_ctx->num_contexts++] = current_ctx; + } + +end: + if (ret < 0) { + // Clean up current context if it wasn't added to the array + if (current_ctx) { + free_categories_context(current_ctx); + /* free_categories_context() releases only the members; the struct itself was av_calloc'ed in this function and is not yet owned by category_units */ + av_freep(&current_ctx); + } + } + + fclose(file); + return ret; +} + +/* Build a flat LabelContext whose labels array points at every label of every category, in file order. Only the pointer array is newly allocated; the strings stay owned by the categories. Returns 0 or AVERROR(ENOMEM). */ +static int combine_all_category_labels(LabelContext **label_ctx, CategoryClassifcationContext *cat_class_ctx) +{ + char **combined_labels = NULL; + int combined_idx = 0; + + *label_ctx = av_calloc(1, sizeof(LabelContext)); + if (!(*label_ctx)) + return AVERROR(ENOMEM); + + (*label_ctx)->label_count = cat_class_ctx->total_labels; + (*label_ctx)->labels = av_calloc(cat_class_ctx->total_labels, sizeof(char *)); + if (!(*label_ctx)->labels) { + av_freep(label_ctx); + return AVERROR(ENOMEM); + } + + combined_labels = (*label_ctx)->labels; + + // Combine all labels from all categories + for (int c = 0; c < cat_class_ctx->num_contexts; c++) { + CategoriesContext *current_ctx = cat_class_ctx->category_units[c]; + for (int i = 0; i < current_ctx->category_count; i++) { + CategoryContext *category = &current_ctx->categories[i]; 
+ for (int j = 0; j < category->labels->label_count; j++) { + combined_labels[combined_idx] = category->labels->labels[j]; + combined_idx++; + } + } + } + + return 0; +} + +static int get_category_total_label_count(CategoryClassifcationContext *cat_ctx, int **label_counts) +{ + if (!cat_ctx || cat_ctx->num_contexts <= 0) { + return 0; + } + + // Allocate memory for the label counts array + *label_counts = av_calloc(cat_ctx->num_contexts, sizeof(int)); + if (!*label_counts) { + return AVERROR(ENOMEM); + } + + // Fill the array with label counts from each context + for (int i = 0; i < cat_ctx->num_contexts; i++) { + CategoriesContext *categories = cat_ctx->category_units[i]; + if (categories) { + (*label_counts)[i] = categories->label_count; + } else { + (*label_counts)[i] = 0; + } + } + + return cat_ctx->num_contexts; +} + +static CategoryContext *get_best_category(CategoriesContext *categories_ctx, float *probabilities) +{ + CategoryContext *best_category = NULL; + float best_probability = -1.0f; + int prob_offset = 0; + + // Calculate total probability for each category + for (int cat_idx = 0; cat_idx < categories_ctx->category_count; cat_idx++) { + CategoryContext *category = &categories_ctx->categories[cat_idx]; + + // Sum probabilities for all labels in this category + category->total_probability = 0.0f; + for (int label_idx = 0; label_idx < category->label_count; label_idx++) { + category->total_probability += probabilities[prob_offset + label_idx]; + } + + if (category->total_probability > best_probability) { + best_probability = category->total_probability; + best_category = category; + } + + prob_offset += category->label_count; + } + + return best_category; +} + +static AVDetectionBBox *find_or_create_detection_bbox(AVFrame *frame, uint32_t bbox_index, AVFilterContext *filter_ctx, + DnnClassifyContext *ctx) +{ + AVFrameSideData *sd; + AVDetectionBBoxHeader *header; + AVDetectionBBox *bbox; + + sd = av_frame_get_side_data(frame, 
AV_FRAME_DATA_DETECTION_BBOXES); + if (!sd) { + header = av_detection_bbox_create_side_data(frame, 1); + if (!header) { + av_log(filter_ctx, AV_LOG_ERROR, "Cannot get side data in labels processing\n"); + return NULL; + } + } else { + header = (AVDetectionBBoxHeader *)sd->data; + } + + if (bbox_index == 0) { + av_strlcat(header->source, ", ", sizeof(header->source)); + av_strlcat(header->source, ctx->dnnctx.model_filename, sizeof(header->source)); + } + + // Get bbox for current index + bbox = av_get_detection_bbox(header, bbox_index); + if (!bbox) { + av_log(filter_ctx, AV_LOG_ERROR, "Failed to get bbox %d\n", bbox_index); + return NULL; + } + + return bbox; +} + +// Processing functions for standard classification (video only) +static int post_proc_standard(AVFrame *frame, DNNData *output, uint32_t bbox_index, AVFilterContext *filter_ctx) +{ + DnnClassifyContext *ctx = filter_ctx->priv; + float conf_threshold = ctx->confidence; + AVDetectionBBoxHeader *header; + AVDetectionBBox *bbox; + float *classifications; + uint32_t label_id; + float confidence; + AVFrameSideData *sd; + int output_size = output->dims[3] * output->dims[2] * output->dims[1]; + + if (output_size <= 0) { + return -1; + } + + sd = av_frame_get_side_data(frame, AV_FRAME_DATA_DETECTION_BBOXES); + if (!sd) { + av_log(filter_ctx, AV_LOG_ERROR, "Cannot get side data in post_proc_standard\n"); + return -1; + } + header = (AVDetectionBBoxHeader *)sd->data; + + if (bbox_index == 0) { + av_strlcat(header->source, ", ", sizeof(header->source)); + av_strlcat(header->source, ctx->dnnctx.model_filename, sizeof(header->source)); + } + + classifications = output->data; + label_id = 0; + confidence = classifications[0]; + for (int i = 1; i < output_size; i++) { + if (classifications[i] > confidence) { + label_id = i; + confidence = classifications[i]; + } + } + + if (confidence < conf_threshold) { + return 0; + } + + bbox = av_get_detection_bbox(header, bbox_index); + 
bbox->classify_confidences[bbox->classify_count] = av_make_q((int)(confidence * 10000), 10000); + + if (ctx->label_classification_ctx->labels && label_id < ctx->label_classification_ctx->label_count) { + av_strlcpy(bbox->classify_labels[bbox->classify_count], ctx->label_classification_ctx->labels[label_id], + sizeof(bbox->classify_labels[bbox->classify_count])); + } else { + snprintf(bbox->classify_labels[bbox->classify_count], sizeof(bbox->classify_labels[bbox->classify_count]), "%d", + label_id); + } + bbox->classify_count++; + + return 0; +} + +static int post_proc_clxp_labels(AVFrame *frame, DNNData *output, uint32_t bbox_index, AVFilterContext *filter_ctx) +{ + DnnClassifyContext *ctx = filter_ctx->priv; + const int max_classes_per_box = AV_NUM_DETECTION_BBOX_CLASSIFY; + float *probabilities = (float *)output->data; + int num_labels = ctx->label_classification_ctx->label_count; + AVDetectionBBox *bbox; + float confidence_threshold = ctx->confidence; + int ret; + + bbox = find_or_create_detection_bbox(frame, bbox_index, filter_ctx, ctx); + if (!bbox) { + return AVERROR(EINVAL); + } + + ret = fill_detection_bbox_with_best_labels(ctx->label_classification_ctx->labels, probabilities, num_labels, + bbox, max_classes_per_box, confidence_threshold); + if (ret < 0) { + av_log(filter_ctx, AV_LOG_ERROR, "Failed to fill bbox with best labels\n"); + return ret; + } + return 0; +} + +static int post_proc_clxp_categories(AVFrame *frame, DNNData *output, uint32_t bbox_index, AVFilterContext *filter_ctx) +{ + DnnClassifyContext *ctx = filter_ctx->priv; + CategoryClassifcationContext *cat_class_ctx = ctx->category_classification_ctx; + CategoryContext *best_category; + AVDetectionBBox *bbox; + float *probabilities = output->data; + int ret, prob_offset = 0; + char **ctx_labels; + float *ctx_probabilities; + + bbox = find_or_create_detection_bbox(frame, bbox_index, filter_ctx, ctx); + if (!bbox) { + return AVERROR(EINVAL); + } + + // Allocate temporary arrays for category 
results + ctx_labels = av_malloc_array(cat_class_ctx->num_contexts, sizeof(char *)); + if (!ctx_labels) { + return AVERROR(ENOMEM); + } + + for (int i = 0; i < cat_class_ctx->num_contexts; i++) { + ctx_labels[i] = av_mallocz(AV_DETECTION_BBOX_LABEL_NAME_MAX_SIZE); + if (!ctx_labels[i]) { + // Clean up previously allocated memory + for (int j = 0; j < i; j++) { + av_freep(&ctx_labels[j]); + } + av_freep(&ctx_labels); + return AVERROR(ENOMEM); + } + } + + ctx_probabilities = av_malloc_array(cat_class_ctx->num_contexts, sizeof(float)); + if (!ctx_probabilities) { + // Clean up + for (int i = 0; i < cat_class_ctx->num_contexts; i++) { + av_freep(&ctx_labels[i]); + } + av_freep(&ctx_labels); + return AVERROR(ENOMEM); + } + + // Process each context + for (int ctx_idx = 0; ctx_idx < cat_class_ctx->num_contexts; ctx_idx++) { + CategoriesContext *categories_ctx = cat_class_ctx->category_units[ctx_idx]; + if (!categories_ctx) { + av_log(filter_ctx, AV_LOG_ERROR, "Missing classification data at context %d\n", ctx_idx); + /* ctx_probabilities is not zero-initialized; -1 is below every allowed confidence threshold (0..1), so this slot is skipped when the bbox is filled */ + ctx_probabilities[ctx_idx] = -1.0f; + continue; + } + + // Find best category in context + best_category = get_best_category(categories_ctx, probabilities + prob_offset); + if (!best_category || !best_category->name) { + av_log(filter_ctx, AV_LOG_ERROR, "Invalid best category at context %d\n", ctx_idx); + /* mark the slot as unusable and still step over this unit's labels so the following units read their own probabilities */ + ctx_probabilities[ctx_idx] = -1.0f; + prob_offset += categories_ctx->label_count; + continue; + } + + // Copy category name instead of assigning pointer + av_strlcpy(ctx_labels[ctx_idx], best_category->name, AV_DETECTION_BBOX_LABEL_NAME_MAX_SIZE); + ctx_probabilities[ctx_idx] = best_category->total_probability; + + prob_offset += categories_ctx->label_count; + } + + // Fill bbox with best labels + ret = fill_detection_bbox_with_best_labels(ctx_labels, ctx_probabilities, cat_class_ctx->num_contexts, bbox, + AV_NUM_DETECTION_BBOX_CLASSIFY, ctx->confidence); + + // Clean up + for (int i = 0; i < cat_class_ctx->num_contexts; i++) { + av_freep(&ctx_labels[i]); + } + av_freep(&ctx_labels); + av_freep(&ctx_probabilities); + + return ret; +} + +static int 
dnn_classify_post_proc(AVFrame *frame, DNNData *output, uint32_t bbox_index, AVFilterContext *filter_ctx) +{ + DnnClassifyContext *ctx = filter_ctx->priv; + + if (ctx->dnnctx.backend_type == DNN_TH) { + if (ctx->category_classification_ctx) { + return post_proc_clxp_categories(frame, output, bbox_index, filter_ctx); + } else if (ctx->label_classification_ctx) { + return post_proc_clxp_labels(frame, output, bbox_index, filter_ctx); + } + av_log(filter_ctx, AV_LOG_ERROR, "No valid classification context available\n"); + return AVERROR(EINVAL); + } else { + return post_proc_standard(frame, output, bbox_index, filter_ctx); + } +} + +/* Release the label and category contexts owned by the filter and reset both pointers to NULL. Safe to call with either or both contexts unset. */ +static void free_contexts(DnnClassifyContext *ctx) +{ + if (!ctx) + return; + if (ctx->category_classification_ctx) { + free_category_classfication_context(ctx->category_classification_ctx); + av_freep(&ctx->category_classification_ctx); + /* In category mode the label strings belong to the categories freed above; only the combined pointer array and the struct are owned here */ + if (ctx->label_classification_ctx) + av_freep(&ctx->label_classification_ctx->labels); + av_freep(&ctx->label_classification_ctx); + } else if (ctx->label_classification_ctx) { + /* free_label_context() releases the strings and the array, not the struct itself */ + free_label_context(ctx->label_classification_ctx); + av_freep(&ctx->label_classification_ctx); + } +} + +static int config_input(AVFilterLink *inlink) +{ + AVFilterContext *context = inlink->dst; + DnnClassifyContext *ctx = context->priv; + AVFilterLink *outlink = context->outputs[0]; + int ret; + DNNFunctionType goal_mode; + + // Set media type based on is_audio flag + if (ctx->is_audio) { + ctx->type = AVMEDIA_TYPE_AUDIO; + } else { + ctx->type = inlink->type; + } + + // Set the output link type to match the input link type + outlink->type = inlink->type; + outlink->w = inlink->w; + outlink->h = inlink->h; + outlink->sample_aspect_ratio = inlink->sample_aspect_ratio; + outlink->time_base = inlink->time_base; + + int64_t sample_rate = ctx->dnnctx.torch_option.sample_rate; + + // Validate media type + if (ctx->type != AVMEDIA_TYPE_AUDIO && ctx->type != AVMEDIA_TYPE_VIDEO) { + av_log(context, AV_LOG_ERROR, "Invalid media type. 
Only audio or video is supported\n"); + return AVERROR(EINVAL); + } + + // Set type-specific parameters and check compatibility + if (ctx->type == AVMEDIA_TYPE_AUDIO) { + goal_mode = DFT_ANALYTICS_CLAP; + + // Check backend compatibility + if (ctx->dnnctx.backend_type != DNN_TH) { + av_log(context, AV_LOG_ERROR, "Audio classification requires Torch backend\n"); + return AVERROR(EINVAL); + } + + if (inlink->sample_rate != sample_rate) { + av_log(context, AV_LOG_ERROR, "Invalid sample rate. CLAP requires 44100 Hz\n"); + return AVERROR(EINVAL); + } + + // Copy audio properties to output + outlink->sample_rate = inlink->sample_rate; + outlink->ch_layout = inlink->ch_layout; + } else { + // Video mode + goal_mode = (ctx->dnnctx.backend_type == DNN_TH) ? DFT_ANALYTICS_CLIP : DFT_ANALYTICS_CLASSIFY; + } + // Initialize label and category contexts based on provided files + if (ctx->dnnctx.backend_type == DNN_TH) { + if (ctx->labels_filename) { + ctx->label_classification_ctx = av_calloc(1, sizeof(LabelContext)); + if (!ctx->label_classification_ctx) + return AVERROR(ENOMEM); + + ret = read_classify_label_file(context, ctx->label_classification_ctx, ctx->labels_filename, + AV_DETECTION_BBOX_LABEL_NAME_MAX_SIZE); + if (ret < 0) { + av_log(context, AV_LOG_ERROR, "Failed to read labels file\n"); + return ret; + } + ret = ff_dnn_init_with_tokenizer(&ctx->dnnctx, goal_mode, ctx->label_classification_ctx->labels, + ctx->label_classification_ctx->label_count, NULL, 0, ctx->tokenizer_path, + context); + if (ret < 0) { + free_contexts(ctx); + return ret; + } + } else if (ctx->categories_filename) { + ctx->category_classification_ctx = av_calloc(1, sizeof(CategoryClassifcationContext)); + if (!ctx->category_classification_ctx) + return AVERROR(ENOMEM); + + ret = read_classify_categories_file(context, ctx->category_classification_ctx, ctx->categories_filename, + AV_DETECTION_BBOX_LABEL_NAME_MAX_SIZE); + if (ret < 0) { + av_log(context, AV_LOG_ERROR, "Failed to read categories 
file\n"); + free_contexts(ctx); + return ret; + } + + ret = combine_all_category_labels(&ctx->label_classification_ctx, ctx->category_classification_ctx); + if (ret < 0) { + av_log(context, AV_LOG_ERROR, "Failed to combine labels\n"); + free_contexts(ctx); + return ret; + } + // Get total label count of all categories + int total_labels; + int *label_counts = NULL; + + total_labels = get_category_total_label_count(ctx->category_classification_ctx, &label_counts); + if (total_labels <= 0) { + av_log(context, AV_LOG_ERROR, "Failed to get category label counts or no labels found\n"); + free_contexts(ctx); + /* ret is still 0 from the successful call above; report the helper's error code, or EINVAL when no classification unit was found */ + return total_labels < 0 ? total_labels : AVERROR(EINVAL); + } + + // Initialize DNN with tokenizer for CLIP/CLAP models + ret = ff_dnn_init_with_tokenizer(&ctx->dnnctx, goal_mode, ctx->label_classification_ctx->labels, + ctx->label_classification_ctx->label_count, label_counts, total_labels, + ctx->tokenizer_path, context); + if (ret < 0) { + av_freep(&label_counts); + free_contexts(ctx); + return ret; + } + av_freep(&label_counts); + } + } else if (ctx->dnnctx.backend_type == DNN_OV) { + // Initialize standard DNN for OpenVINO + ret = ff_dnn_init(&ctx->dnnctx, goal_mode, context); + if (ret < 0) + return ret; + + // Read labels file + ctx->label_classification_ctx = av_calloc(1, sizeof(LabelContext)); + if (!ctx->label_classification_ctx) + return AVERROR(ENOMEM); + + ret = read_classify_label_file(context, ctx->label_classification_ctx, ctx->labels_filename, + AV_DETECTION_BBOX_LABEL_NAME_MAX_SIZE); + if (ret < 0) { + av_log(context, AV_LOG_ERROR, "Failed to read labels file\n"); + free_contexts(ctx); + return ret; + } + } + + // Set the post-processing callback + ff_dnn_set_classify_post_proc(&ctx->dnnctx, dnn_classify_post_proc); + return 0; +} + +/* Filter init: appends one input and one output pad whose media type follows the is_audio option, then checks that the file options required by the selected backend are present. Returns 0 or a negative AVERROR code. */ +static av_cold int dnn_classify_init(AVFilterContext *context) +{ + DnnClassifyContext *ctx = context->priv; + int ret; + + // Create a static pad with the appropriate media type + /* The name is a string literal: the pad is appended without a free-name flag, so an av_strdup()'ed name would be unchecked for NULL and never released */ + AVFilterPad pad = { + .name = "default", + .type = ctx->is_audio ? 
AVMEDIA_TYPE_AUDIO : AVMEDIA_TYPE_VIDEO, + .config_props = config_input, + }; + + ret = ff_append_inpad(context, &pad); + if (ret < 0) + return ret; + + // Create a matching output pad + AVFilterPad outpad = { + .name = "default", + .type = ctx->is_audio ? AVMEDIA_TYPE_AUDIO : AVMEDIA_TYPE_VIDEO, + }; + + ret = ff_append_outpad(context, &outpad); + if (ret < 0) + return ret; + + // Check backend and file parameters (parameter validation only) + if (ctx->dnnctx.backend_type == DNN_TH) { + // Check CLIP/CLAP specific parameters + if (ctx->labels_filename && ctx->categories_filename) { + av_log(context, AV_LOG_ERROR, "Labels and categories file cannot be used together\n"); + return AVERROR(EINVAL); + } + + if (!ctx->labels_filename && !ctx->categories_filename) { + av_log(context, AV_LOG_ERROR, "Labels or categories file is required for classification\n"); + return AVERROR(EINVAL); + } + + if (!ctx->tokenizer_path) { + av_log(context, AV_LOG_ERROR, "Tokenizer file is required for CLIP/CLAP classification\n"); + return AVERROR(EINVAL); + } + } else if (ctx->dnnctx.backend_type == DNN_OV) { + // Check OpenVINO specific parameters + if (!ctx->labels_filename) { + av_log(context, AV_LOG_ERROR, "Labels file is required for classification\n"); + return AVERROR(EINVAL); + } + + if (ctx->categories_filename) { + av_log(context, AV_LOG_ERROR, "Categories file is only supported for CLIP/CLAP models\n"); + return AVERROR(EINVAL); + } + + // Audio classification is not supported with OpenVINO backend + if (ctx->is_audio) { + av_log(context, AV_LOG_ERROR, "Audio classification requires Torch backend\n"); + return AVERROR(EINVAL); + } + } + return 0; +} + +static const enum AVPixelFormat pix_fmts[] = {AV_PIX_FMT_RGB24, AV_PIX_FMT_BGR24, AV_PIX_FMT_GRAY8, + AV_PIX_FMT_GRAYF32, AV_PIX_FMT_YUV420P, AV_PIX_FMT_YUV422P, + AV_PIX_FMT_YUV444P, AV_PIX_FMT_YUV410P, AV_PIX_FMT_YUV411P, + AV_PIX_FMT_NV12, AV_PIX_FMT_NONE}; + +static const enum AVSampleFormat sample_fmts[] = 
{AV_SAMPLE_FMT_FLT, AV_SAMPLE_FMT_NONE}; + +static int query_formats(AVFilterContext *ctx) +{ + DnnClassifyContext *classify_ctx = ctx->priv; + + int ret; + // Get the type from the first input pad + enum AVMediaType type = ctx->inputs[0]->type; + + if (type == AVMEDIA_TYPE_VIDEO) { + ret = ff_set_common_formats(ctx, ff_make_format_list(pix_fmts)); + if (ret < 0) + return ret; + } else if (type == AVMEDIA_TYPE_AUDIO) { + + ret = ff_set_common_formats(ctx, ff_make_format_list(sample_fmts)); + if (ret < 0) + return ret; +#if (CONFIG_LIBTORCH == 1) + ret = ff_set_common_samplerates( + ctx, ff_make_format_list((const int[]){classify_ctx->dnnctx.torch_option.sample_rate, -1})); + if (ret < 0) + return ret; +#endif + ret = ff_set_common_channel_layouts(ctx, ff_all_channel_layouts()); + if (ret < 0) + return ret; + } else { + av_log(ctx, AV_LOG_ERROR, "Unsupported media type: %d\n", type); + return AVERROR(EINVAL); + } + + return 0; +} +static int dnn_classify_flush_frame(AVFilterLink *outlink, int64_t pts, int64_t *out_pts) +{ + AVFilterContext *context = outlink->src; + DnnClassifyContext *ctx = context->priv; + int ret; + DNNAsyncStatusType async_state; + + ret = ff_dnn_flush(&ctx->dnnctx); + if (ret != 0) { + return -1; + } + + do { + AVFrame *in_frame = NULL; + AVFrame *out_frame = NULL; + async_state = ff_dnn_get_result(&ctx->dnnctx, &in_frame, &out_frame); + if (async_state == DAST_SUCCESS) { + ret = ff_filter_frame(outlink, in_frame); + if (ret < 0) + return ret; + if (out_pts) + *out_pts = in_frame->pts + pts; + } + av_usleep(5000); + } while (async_state >= DAST_NOT_READY); + + return 0; +} + +static int process_video_frame(DnnClassifyContext *ctx, AVFrame *frame) +{ + int ret; + + if (ctx->dnnctx.backend_type == DNN_TH) { + ret = ff_dnn_execute_model_clip(&ctx->dnnctx, frame, NULL, ctx->label_classification_ctx->labels, + ctx->label_classification_ctx->label_count, ctx->tokenizer_path, ctx->target); + } else { + ret = 
ff_dnn_execute_model_classification(&ctx->dnnctx, frame, NULL, ctx->target); + } + + if (ret != 0) { + av_frame_free(&frame); + return AVERROR(EIO); + } + + return 0; +} + +static int process_audio_frame(DnnClassifyContext *ctx, AVFrame *frame) +{ + int ret = ff_dnn_execute_model_clap(&ctx->dnnctx, frame, NULL, ctx->label_classification_ctx->labels, + ctx->label_classification_ctx->label_count, ctx->tokenizer_path); + + if (ret != 0) { + av_frame_free(&frame); + return AVERROR(EIO); + } + + return 0; +} + +static int process_audio_buffer(DnnClassifyContext *ctx, AVFilterLink *inlink) +{ + static AVFrame *audio_buffer = NULL; + static int buffer_offset = 0; + int64_t required_samples = ctx->dnnctx.torch_option.sample_rate * ctx->dnnctx.torch_option.sample_duration; + int ret = 0, samples_to_copy = 0; + AVFrame *in = NULL; + + while (buffer_offset < required_samples) { + ret = ff_inlink_consume_frame(inlink, &in); + if (ret < 0) + return ret; + if (ret == 0) + break; // No more frames available right now + + // First frame - initialize our buffer + if (!audio_buffer) { + audio_buffer = av_frame_alloc(); + if (!audio_buffer) { + av_frame_free(&in); + return AVERROR(ENOMEM); + } + + // Allocate our buffer to hold exactly required_samples + audio_buffer->format = in->format; + audio_buffer->ch_layout = in->ch_layout; + audio_buffer->sample_rate = in->sample_rate; + audio_buffer->nb_samples = required_samples; + audio_buffer->pts = in->pts; + + ret = av_frame_get_buffer(audio_buffer, 0); + if (ret < 0) { + av_frame_free(&audio_buffer); + av_frame_free(&in); + return ret; + } + } + + // Copy samples to our buffer + samples_to_copy = FFMIN(in->nb_samples, required_samples - buffer_offset); + for (int ch = 0; ch < inlink->ch_layout.nb_channels; ch++) { + if (!in->data[ch] || !audio_buffer->data[ch]) { + continue; + } + memcpy((float *)audio_buffer->data[ch] + buffer_offset, (float *)in->data[ch], + samples_to_copy * sizeof(float)); + } + + buffer_offset += samples_to_copy; 
+ av_frame_free(&in); + + // If we've filled our buffer, process it + if (buffer_offset >= required_samples) { + ret = process_audio_frame(ctx, audio_buffer); + if (ret < 0) + return ret; + + // Reset for next frame + audio_buffer = NULL; + buffer_offset = 0; + break; + } + } + return ret; +} + +static int dnn_classify_activate(AVFilterContext *context) +{ + DnnClassifyContext *ctx = context->priv; + AVFilterLink *inlink = context->inputs[0]; + AVFilterLink *outlink = context->outputs[0]; + int ret, status; + int64_t pts; + AVFrame *in = NULL; + int got_frame = 0; + DNNAsyncStatusType async_state; + + // Check for EOF or other status + if (ff_inlink_acknowledge_status(inlink, &status, &pts)) { + if (status == AVERROR_EOF) { + int64_t out_pts = pts; + ret = dnn_classify_flush_frame(outlink, pts, &out_pts); + ff_outlink_set_status(outlink, status, out_pts); + return ret; + } + } + + if (ctx->type == AVMEDIA_TYPE_AUDIO) { + ret = process_audio_buffer(ctx, inlink); + if (ret < 0) { + return ret; + } + } else { + ret = ff_inlink_consume_frame(inlink, &in); + if (ret < 0) + return ret; + if (ret > 0) { + ret = process_video_frame(ctx, in); + if (ret < 0) { + av_frame_free(&in); + return ret; + } + } + } + + // Get processed results + do { + AVFrame *in_frame = NULL; + AVFrame *out_frame = NULL; + async_state = ff_dnn_get_result(&ctx->dnnctx, &in_frame, &out_frame); + if (async_state == DAST_SUCCESS) { + ret = ff_filter_frame(outlink, in_frame); + if (ret < 0) + return ret; + got_frame = 1; + } + } while (async_state == DAST_SUCCESS); + + if (got_frame) + return 0; + + // Request more frames if needed + if (ff_outlink_frame_wanted(outlink)) + ff_inlink_request_frame(inlink); + + return FFERROR_NOT_READY; +} + +static av_cold void dnn_classify_uninit(AVFilterContext *context) +{ + DnnClassifyContext *ctx = context->priv; + ff_dnn_uninit(&ctx->dnnctx); + free_contexts(ctx); +} + +const FFFilter ff_avf_dnn_classify = { + .p.name = "dnn_classify", + .p.description = 
NULL_IF_CONFIG_SMALL("Apply DNN classification filter to the input."), + .p.priv_class = &dnn_classify_class, + .p.flags = AVFILTER_FLAG_SUPPORT_TIMELINE_GENERIC, + .priv_size = sizeof(DnnClassifyContext), + .preinit = ff_dnn_filter_init_child_class, + .init = dnn_classify_init, + .uninit = dnn_classify_uninit, + .activate = dnn_classify_activate, + FILTER_QUERY_FUNC2(query_formats), +}; diff --git a/libavfilter/vf_dnn_classify.c b/libavfilter/vf_dnn_classify.c deleted file mode 100644 index f92c41ab76..0000000000 --- a/libavfilter/vf_dnn_classify.c +++ /dev/null @@ -1,308 +0,0 @@ -/* - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -/** - * @file - * implementing an classification filter using deep learning networks. 
- */ - -#include "libavutil/file_open.h" -#include "libavutil/mem.h" -#include "libavutil/opt.h" -#include "filters.h" -#include "dnn_filter_common.h" -#include "video.h" -#include "libavutil/time.h" -#include "libavutil/avstring.h" -#include "libavutil/detection_bbox.h" - -typedef struct DnnClassifyContext { - const AVClass *class; - DnnContext dnnctx; - float confidence; - char *labels_filename; - char *target; - char **labels; - int label_count; -} DnnClassifyContext; - -#define OFFSET(x) offsetof(DnnClassifyContext, dnnctx.x) -#define OFFSET2(x) offsetof(DnnClassifyContext, x) -#define FLAGS AV_OPT_FLAG_FILTERING_PARAM | AV_OPT_FLAG_VIDEO_PARAM -static const AVOption dnn_classify_options[] = { - { "dnn_backend", "DNN backend", OFFSET(backend_type), AV_OPT_TYPE_INT, { .i64 = DNN_OV }, INT_MIN, INT_MAX, FLAGS, .unit = "backend" }, -#if (CONFIG_LIBOPENVINO == 1) - { "openvino", "openvino backend flag", 0, AV_OPT_TYPE_CONST, { .i64 = DNN_OV }, 0, 0, FLAGS, .unit = "backend" }, -#endif - { "confidence", "threshold of confidence", OFFSET2(confidence), AV_OPT_TYPE_FLOAT, { .dbl = 0.5 }, 0, 1, FLAGS}, - { "labels", "path to labels file", OFFSET2(labels_filename), AV_OPT_TYPE_STRING, { .str = NULL }, 0, 0, FLAGS }, - { "target", "which one to be classified", OFFSET2(target), AV_OPT_TYPE_STRING, { .str = NULL }, 0, 0, FLAGS }, - { NULL } -}; - -AVFILTER_DNN_DEFINE_CLASS(dnn_classify, DNN_OV); - -static int dnn_classify_post_proc(AVFrame *frame, DNNData *output, uint32_t bbox_index, AVFilterContext *filter_ctx) -{ - DnnClassifyContext *ctx = filter_ctx->priv; - float conf_threshold = ctx->confidence; - AVDetectionBBoxHeader *header; - AVDetectionBBox *bbox; - float *classifications; - uint32_t label_id; - float confidence; - AVFrameSideData *sd; - int output_size = output->dims[3] * output->dims[2] * output->dims[1]; - if (output_size <= 0) { - return -1; - } - - sd = av_frame_get_side_data(frame, AV_FRAME_DATA_DETECTION_BBOXES); - if (!sd) { - av_log(filter_ctx, 
AV_LOG_ERROR, "Cannot get side data in dnn_classify_post_proc\n"); - return -1; - } - header = (AVDetectionBBoxHeader *)sd->data; - - if (bbox_index == 0) { - av_strlcat(header->source, ", ", sizeof(header->source)); - av_strlcat(header->source, ctx->dnnctx.model_filename, sizeof(header->source)); - } - - classifications = output->data; - label_id = 0; - confidence= classifications[0]; - for (int i = 1; i < output_size; i++) { - if (classifications[i] > confidence) { - label_id = i; - confidence= classifications[i]; - } - } - - if (confidence < conf_threshold) { - return 0; - } - - bbox = av_get_detection_bbox(header, bbox_index); - bbox->classify_confidences[bbox->classify_count] = av_make_q((int)(confidence * 10000), 10000); - - if (ctx->labels && label_id < ctx->label_count) { - av_strlcpy(bbox->classify_labels[bbox->classify_count], ctx->labels[label_id], sizeof(bbox->classify_labels[bbox->classify_count])); - } else { - snprintf(bbox->classify_labels[bbox->classify_count], sizeof(bbox->classify_labels[bbox->classify_count]), "%d", label_id); - } - - bbox->classify_count++; - - return 0; -} - -static void free_classify_labels(DnnClassifyContext *ctx) -{ - for (int i = 0; i < ctx->label_count; i++) { - av_freep(&ctx->labels[i]); - } - ctx->label_count = 0; - av_freep(&ctx->labels); -} - -static int read_classify_label_file(AVFilterContext *context) -{ - int line_len; - FILE *file; - DnnClassifyContext *ctx = context->priv; - - file = avpriv_fopen_utf8(ctx->labels_filename, "r"); - if (!file){ - av_log(context, AV_LOG_ERROR, "failed to open file %s\n", ctx->labels_filename); - return AVERROR(EINVAL); - } - - while (!feof(file)) { - char *label; - char buf[256]; - if (!fgets(buf, 256, file)) { - break; - } - - line_len = strlen(buf); - while (line_len) { - int i = line_len - 1; - if (buf[i] == '\n' || buf[i] == '\r' || buf[i] == ' ') { - buf[i] = '\0'; - line_len--; - } else { - break; - } - } - - if (line_len == 0) // empty line - continue; - - if (line_len >= 
AV_DETECTION_BBOX_LABEL_NAME_MAX_SIZE) { - av_log(context, AV_LOG_ERROR, "label %s too long\n", buf); - fclose(file); - return AVERROR(EINVAL); - } - - label = av_strdup(buf); - if (!label) { - av_log(context, AV_LOG_ERROR, "failed to allocate memory for label %s\n", buf); - fclose(file); - return AVERROR(ENOMEM); - } - - if (av_dynarray_add_nofree(&ctx->labels, &ctx->label_count, label) < 0) { - av_log(context, AV_LOG_ERROR, "failed to do av_dynarray_add\n"); - fclose(file); - av_freep(&label); - return AVERROR(ENOMEM); - } - } - - fclose(file); - return 0; -} - -static av_cold int dnn_classify_init(AVFilterContext *context) -{ - DnnClassifyContext *ctx = context->priv; - int ret = ff_dnn_init(&ctx->dnnctx, DFT_ANALYTICS_CLASSIFY, context); - if (ret < 0) - return ret; - ff_dnn_set_classify_post_proc(&ctx->dnnctx, dnn_classify_post_proc); - - if (ctx->labels_filename) { - return read_classify_label_file(context); - } - return 0; -} - -static const enum AVPixelFormat pix_fmts[] = { - AV_PIX_FMT_RGB24, AV_PIX_FMT_BGR24, - AV_PIX_FMT_GRAY8, AV_PIX_FMT_GRAYF32, - AV_PIX_FMT_YUV420P, AV_PIX_FMT_YUV422P, - AV_PIX_FMT_YUV444P, AV_PIX_FMT_YUV410P, AV_PIX_FMT_YUV411P, - AV_PIX_FMT_NV12, - AV_PIX_FMT_NONE -}; - -static int dnn_classify_flush_frame(AVFilterLink *outlink, int64_t pts, int64_t *out_pts) -{ - DnnClassifyContext *ctx = outlink->src->priv; - int ret; - DNNAsyncStatusType async_state; - - ret = ff_dnn_flush(&ctx->dnnctx); - if (ret != 0) { - return -1; - } - - do { - AVFrame *in_frame = NULL; - AVFrame *out_frame = NULL; - async_state = ff_dnn_get_result(&ctx->dnnctx, &in_frame, &out_frame); - if (async_state == DAST_SUCCESS) { - ret = ff_filter_frame(outlink, in_frame); - if (ret < 0) - return ret; - if (out_pts) - *out_pts = in_frame->pts + pts; - } - av_usleep(5000); - } while (async_state >= DAST_NOT_READY); - - return 0; -} - -static int dnn_classify_activate(AVFilterContext *filter_ctx) -{ - AVFilterLink *inlink = filter_ctx->inputs[0]; - AVFilterLink 
*outlink = filter_ctx->outputs[0]; - DnnClassifyContext *ctx = filter_ctx->priv; - AVFrame *in = NULL; - int64_t pts; - int ret, status; - int got_frame = 0; - int async_state; - - FF_FILTER_FORWARD_STATUS_BACK(outlink, inlink); - - do { - // drain all input frames - ret = ff_inlink_consume_frame(inlink, &in); - if (ret < 0) - return ret; - if (ret > 0) { - if (ff_dnn_execute_model_classification(&ctx->dnnctx, in, NULL, ctx->target) != 0) { - return AVERROR(EIO); - } - } - } while (ret > 0); - - // drain all processed frames - do { - AVFrame *in_frame = NULL; - AVFrame *out_frame = NULL; - async_state = ff_dnn_get_result(&ctx->dnnctx, &in_frame, &out_frame); - if (async_state == DAST_SUCCESS) { - ret = ff_filter_frame(outlink, in_frame); - if (ret < 0) - return ret; - got_frame = 1; - } - } while (async_state == DAST_SUCCESS); - - // if frame got, schedule to next filter - if (got_frame) - return 0; - - if (ff_inlink_acknowledge_status(inlink, &status, &pts)) { - if (status == AVERROR_EOF) { - int64_t out_pts = pts; - ret = dnn_classify_flush_frame(outlink, pts, &out_pts); - ff_outlink_set_status(outlink, status, out_pts); - return ret; - } - } - - FF_FILTER_FORWARD_WANTED(outlink, inlink); - - return 0; -} - -static av_cold void dnn_classify_uninit(AVFilterContext *context) -{ - DnnClassifyContext *ctx = context->priv; - ff_dnn_uninit(&ctx->dnnctx); - free_classify_labels(ctx); -} - -const FFFilter ff_vf_dnn_classify = { - .p.name = "dnn_classify", - .p.description = NULL_IF_CONFIG_SMALL("Apply DNN classify filter to the input."), - .p.priv_class = &dnn_classify_class, - .priv_size = sizeof(DnnClassifyContext), - .preinit = ff_dnn_filter_init_child_class, - .init = dnn_classify_init, - .uninit = dnn_classify_uninit, - FILTER_INPUTS(ff_video_default_filterpad), - FILTER_OUTPUTS(ff_video_default_filterpad), - FILTER_PIXFMTS_ARRAY(pix_fmts), - .activate = dnn_classify_activate, -}; -- 2.34.1 _______________________________________________ ffmpeg-devel mailing list 
ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
reply other threads: [~2025-03-10 19:55 UTC | newest]

Thread overview: [no followups] expand [flat | nested] mbox.gz Atom feed
Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to='004501db91f6$62c2d680$28488380$@gmail.com' \
    --to=m.kaindl0208@gmail.com \
    --cc=ffmpeg-devel@ffmpeg.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

This inbox may be cloned and mirrored by anyone:

    git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git

    # If you have public-inbox 1.1+ installed, you may
    # initialize and index your mirror using the following commands:
    public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
        ffmpegdev@gitmailbox.com
    public-inbox-index ffmpegdev

Example config snippet for mirrors.

AGPL code for this site: git clone https://public-inbox.org/public-inbox.git