[FFmpeg-devel] [PATCH v2 FFmpeg 17/20] libavfilter: turn dnn_classify to multimedia filter. Classify CLIP/CLAP implementation.

From: <m.kaindl0208@gmail.com>
To: <ffmpeg-devel@ffmpeg.org>
Subject: [FFmpeg-devel] [PATCH v2 FFmpeg 17/20] libavfilter: turn dnn_classify to multimedia filter. Classify CLIP/CLAP implementation.
Date: Mon, 10 Mar 2025 20:55:33 +0100
Message-ID: <004501db91f6$62c2d680$28488380$@gmail.com> (raw)

Signed-off-by: MaximilianKaindl <m.kaindl0208@gmail.com>
---
 libavfilter/Makefile           |    2 +-
 libavfilter/allfilters.c       |    2 +-
 libavfilter/avf_dnn_classify.c | 1283 ++++++++++++++++++++++++++++++++
 libavfilter/vf_dnn_classify.c  |  308 --------
 4 files changed, 1285 insertions(+), 310 deletions(-)
 create mode 100644 libavfilter/avf_dnn_classify.c
 delete mode 100644 libavfilter/vf_dnn_classify.c

diff --git a/libavfilter/Makefile b/libavfilter/Makefile
index 7c0d879ec9..987dbcb82b 100644
--- a/libavfilter/Makefile
+++ b/libavfilter/Makefile
@@ -286,7 +286,6 @@ OBJS-$(CONFIG_DILATION_FILTER)               += vf_neighbor.o
 OBJS-$(CONFIG_DILATION_OPENCL_FILTER)        += vf_neighbor_opencl.o opencl.o \
                                                 opencl/neighbor.o
 OBJS-$(CONFIG_DISPLACE_FILTER)               += vf_displace.o framesync.o
-OBJS-$(CONFIG_DNN_CLASSIFY_FILTER)           += vf_dnn_classify.o
 OBJS-$(CONFIG_DNN_DETECT_FILTER)             += vf_dnn_detect.o
 OBJS-$(CONFIG_DNN_PROCESSING_FILTER)         += vf_dnn_processing.o
 OBJS-$(CONFIG_DOUBLEWEAVE_FILTER)            += vf_weave.o
@@ -635,6 +634,7 @@ OBJS-$(CONFIG_AHISTOGRAM_FILTER)             += avf_ahistogram.o
 OBJS-$(CONFIG_APHASEMETER_FILTER)            += avf_aphasemeter.o
 OBJS-$(CONFIG_AVECTORSCOPE_FILTER)           += avf_avectorscope.o
 OBJS-$(CONFIG_CONCAT_FILTER)                 += avf_concat.o
+OBJS-$(CONFIG_DNN_CLASSIFY_FILTER)           += avf_dnn_classify.o
 OBJS-$(CONFIG_SHOWCQT_FILTER)                += avf_showcqt.o lswsutils.o lavfutils.o
 OBJS-$(CONFIG_SHOWCWT_FILTER)                += avf_showcwt.o
 OBJS-$(CONFIG_SHOWFREQS_FILTER)              += avf_showfreqs.o
diff --git a/libavfilter/allfilters.c b/libavfilter/allfilters.c
index 740d9ab265..5385173dc1 100644
--- a/libavfilter/allfilters.c
+++ b/libavfilter/allfilters.c
@@ -261,7 +261,6 @@ extern const FFFilter ff_vf_detelecine;
 extern const FFFilter ff_vf_dilation;
 extern const FFFilter ff_vf_dilation_opencl;
 extern const FFFilter ff_vf_displace;
-extern const FFFilter ff_vf_dnn_classify;
 extern const FFFilter ff_vf_dnn_detect;
 extern const FFFilter ff_vf_dnn_processing;
 extern const FFFilter ff_vf_doubleweave;
@@ -596,6 +595,7 @@ extern const FFFilter ff_avf_ahistogram;
 extern const FFFilter ff_avf_aphasemeter;
 extern const FFFilter ff_avf_avectorscope;
 extern const FFFilter ff_avf_concat;
+extern const FFFilter ff_avf_dnn_classify;
 extern const FFFilter ff_avf_showcqt;
 extern const FFFilter ff_avf_showcwt;
 extern const FFFilter ff_avf_showfreqs;
diff --git a/libavfilter/avf_dnn_classify.c b/libavfilter/avf_dnn_classify.c
new file mode 100644
index 0000000000..7b469e3af0
--- /dev/null
+++ b/libavfilter/avf_dnn_classify.c
@@ -0,0 +1,1283 @@
+/*
+* This file is part of FFmpeg.
+*
+* FFmpeg is free software; you can redistribute it and/or
+* modify it under the terms of the GNU Lesser General Public
+* License as published by the Free Software Foundation; either
+* version 2.1 of the License, or (at your option) any later version.
+*
+* FFmpeg is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+* Lesser General Public License for more details.
+*
+* You should have received a copy of the GNU Lesser General Public
+* License along with FFmpeg; if not, write to the Free Software
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+*/
+
+/**
+ * @file
+ * DNN classification filter supporting both video (standard and CLIP)
+ * and audio (CLAP) classification
+ */
+
+#include "audio.h"
+#include "avfilter.h"
+#include "dnn_filter_common.h"
+#include "filters.h"
+#include "formats.h"
+#include "libavutil/avstring.h"
+#include "libavutil/detection_bbox.h"
+#include "libavutil/file_open.h"
+#include "libavutil/mem.h"
+#include "libavutil/opt.h"
+#include "libavutil/time.h"
+#include "video.h"
+
+/*
+    Labels that are being used to classify the image
+*/
+typedef struct LabelContext {
+    char **labels;
+    int label_count;
+} LabelContext;
+
+/*
+    Category that holds multiple Labels
+*/
+typedef struct CategoryContext {
+    char *name;
+    LabelContext *labels;
+    int label_count;
+    float total_probability;
+} CategoryContext;
+
+/*
+Header (Attribute) that is being described by all its labels.
+(name) in labels file
+
+e.g.
+(Comic)
+a drawn image
+a fictional character
+...
+
+Can also be used to substitute the labeltext with the header text
+so that the header is displayed instead of the label with the probability
+*/
+typedef struct CategoriesContext {
+    char *name;
+    CategoryContext *categories;
+    int category_count;
+    int label_count;
+    int max_categories;
+} CategoriesContext;
+
+/*
+Unit that is being classified. Each one can have multiple categories
+[name] in categories file
+
+e.g.
+[RecordingSystem]
+(Professional)
+a photo with high level of detail
+...
+(HomeRecording)
+a photo with low level of detail
+...
+
+empowers to do multiple classification "runs"
+result per unit will be best category (rated by the sum over all confidence values) one of the categories
+softmax is applied over each unit
+*/
+typedef struct CategoryClassifcationContext {
+CategoriesContext **category_units;
+int num_contexts;
+int max_contexts;
+int total_labels;
+int total_categories;
+} CategoryClassifcationContext;
+
+typedef struct DnnClassifyContext {
+    const AVClass *class;
+    DnnContext dnnctx;
+    float confidence;
+    char *labels_filename;
+    char *target;
+    enum AVMediaType type;  // AVMEDIA_TYPE_VIDEO or AVMEDIA_TYPE_AUDIO
+
+    // Standard classification
+    LabelContext *label_classification_ctx;
+
+    CategoryClassifcationContext *category_classification_ctx;
+
+    char *categories_filename;
+    char *tokenizer_path;
+
+    // Audio-specific parameters
+    int is_audio;               // New flag to indicate if input will be audio
+} DnnClassifyContext;
+
+#define OFFSET(x) offsetof(DnnClassifyContext, dnnctx.x)
+#define OFFSET2(x) offsetof(DnnClassifyContext, x)
+#if (CONFIG_LIBTORCH == 1)
+#define OFFSET3(x) offsetof(DnnClassifyContext, dnnctx.torch_option.x)
+#endif
+#define FLAGS AV_OPT_FLAG_FILTERING_PARAM | AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_AUDIO_PARAM
+
+static const AVOption dnn_classify_options[] = {
+    { "dnn_backend",    "DNN backend",                                   OFFSET(backend_type),          AV_OPT_TYPE_INT,    { .i64 = DNN_OV },   INT_MIN, INT_MAX, FLAGS, .unit = "backend" },
+#if (CONFIG_LIBOPENVINO == 1)
+    { "openvino",       "openvino backend flag",                         0,                             AV_OPT_TYPE_CONST,  { .i64 = DNN_OV },   0,       0,       FLAGS, .unit = "backend" },
+#endif
+#if (CONFIG_LIBTORCH == 1)
+    { "torch",          "torch backend flag",                            0,                             AV_OPT_TYPE_CONST,  { .i64 = DNN_TH },   0,       0,       FLAGS, .unit = "backend" },
+    { "logit_scale",    "logit scale for similarity calculation",        OFFSET3(logit_scale),          AV_OPT_TYPE_FLOAT,  { .dbl = -1.0 },     -1.0,    100.0,   FLAGS },
+    { "temperature",    "softmax temperature",                           OFFSET3(temperature),          AV_OPT_TYPE_FLOAT,  { .dbl = -1.0 },     -1.0,       100.0,   FLAGS },
+    { "forward_order",  "Order of forward output (0: media text, 1: text media) (CLIP/CLAP only)", OFFSET3(forward_order), AV_OPT_TYPE_BOOL,   { .i64 = -1 },     -1,      1,       FLAGS },
+    { "normalize",      "Normalize the input tensor (CLIP/CLAP only)",   OFFSET3(normalize),            AV_OPT_TYPE_BOOL,   { .i64 = -1 },       -1,      1,       FLAGS },
+    { "input_res",      "video processing model expected input size",    OFFSET3(input_resolution),     AV_OPT_TYPE_INT64,  { .i64 = -1 },       -1,      10000,   FLAGS },
+    { "sample_rate",    "audio processing model expected sample rate",   OFFSET3(sample_rate),          AV_OPT_TYPE_INT64,  { .i64 = 44100 },    1600,    192000,  FLAGS },
+    { "sample_duration","audio processing model expected sample duration",OFFSET3(sample_duration),     AV_OPT_TYPE_INT64,  { .i64 = 7 },        1,       100,     FLAGS },
+    { "token_dimension","dimension of token vector",                     OFFSET3(token_dimension),      AV_OPT_TYPE_INT64,  { .i64 = 77 },       1,       10000,   FLAGS },
+#endif
+    { "confidence",     "threshold of confidence",                       OFFSET2(confidence),           AV_OPT_TYPE_FLOAT,  { .dbl = 0.5 },      0,       1,       FLAGS },
+    { "labels",         "path to labels file",                           OFFSET2(labels_filename),      AV_OPT_TYPE_STRING, { .str = NULL },     0,       0,       FLAGS },
+    { "target",         "which one to be classified",                    OFFSET2(target),               AV_OPT_TYPE_STRING, { .str = NULL },     0,       0,       FLAGS },
+    { "categories",     "path to categories file (CLIP/CLAP only)",      OFFSET2(categories_filename),  AV_OPT_TYPE_STRING, { .str = NULL },     0,       0,       FLAGS },
+    { "tokenizer",      "path to text tokenizer.json file (CLIP/CLAP only)", OFFSET2(tokenizer_path),   AV_OPT_TYPE_STRING, { .str = NULL },     0,       0,       FLAGS },
+    { "is_audio",       "audio processing mode",                         OFFSET2(is_audio),             AV_OPT_TYPE_BOOL,   { .i64 = 0 },        0,       1,       FLAGS },
+    { NULL }
+};
+
+AVFILTER_DNN_DEFINE_CLASS(dnn_classify, DNN_OV);
+
+static void free_label_context(LabelContext *ctx)
+{
+    if (!ctx)
+        return;
+
+    if (ctx->labels) {
+        for (int i = 0; i < ctx->label_count; i++) {
+            av_freep(&ctx->labels[i]);
+        }
+        av_freep(&ctx->labels);
+    }
+    ctx->label_count = 0;
+}
+
+static void free_category_context(CategoryContext *category)
+{
+    if (!category)
+        return;
+
+    if (category->name) {
+        av_freep(&category->name);
+    }
+
+    if (category->labels) {
+        free_label_context(category->labels);
+        av_freep(&category->labels);
+    }
+}
+
+static void free_categories_context(CategoriesContext *ctx)
+{
+    if (!ctx)
+        return;
+
+    if (ctx->categories) {
+        for (int i = 0; i < ctx->category_count; i++) {
+            free_category_context(&ctx->categories[i]);
+        }
+        // Now free the array of categories
+        av_freep(&ctx->categories);
+        ctx->categories = NULL;
+    }
+
+    if (ctx->name) {
+        av_freep(&ctx->name);
+        ctx->name = NULL;
+    }
+
+    ctx->category_count = 0;
+    ctx->max_categories = 0;
+    ctx->label_count = 0;
+}
+
+static void free_category_classfication_context(CategoryClassifcationContext *category_classification_ctx)
+{
+    if (category_classification_ctx) {
+        if (category_classification_ctx->category_units) {
+            for (int i = 0; i < category_classification_ctx->num_contexts; i++) {
+                if (category_classification_ctx->category_units[i]) {
+                    free_categories_context(category_classification_ctx->category_units[i]);
+                    av_freep(&category_classification_ctx->category_units[i]);
+                }
+            }
+            av_freep(&category_classification_ctx->category_units);
+            category_classification_ctx->category_units = NULL;
+        }
+        category_classification_ctx->num_contexts = 0;
+        category_classification_ctx->max_contexts = 0;
+    }
+}
+
+static int detection_bbox_set_content(AVDetectionBBox *bbox, char *label, int index, float probability)
+{
+    // Set probability
+    bbox->classify_confidences[index] = av_make_q((int)(probability * 10000), 10000);
+
+    // Copy label with size checking
+    if (av_strlcpy(bbox->classify_labels[index], label, AV_DETECTION_BBOX_LABEL_NAME_MAX_SIZE) >=
+        AV_DETECTION_BBOX_LABEL_NAME_MAX_SIZE) {
+        av_log(NULL, AV_LOG_WARNING, "Label truncated in set_prob_and_label_of_bbox\n");
+    }
+
+    return 0;
+}
+
+static fill_detection_bbox_with_best_labels(char **labels, float *probabilities, int num_labels, AVDetectionBBox *bbox, int max_classes_per_box, float confidence_threshold)
+{
+    int i, j, minpos, ret;
+    float min;
+
+    if (!labels || !probabilities || !bbox) {
+        return AVERROR(EINVAL);
+    }
+
+    for (i = 0; i < num_labels; i++) {
+        if (probabilities[i] >= confidence_threshold) {
+            if (bbox->classify_count >= max_classes_per_box) {
+                // Find lowest probability classification
+                min = av_q2d(bbox->classify_confidences[0]);
+                minpos = 0;
+                for (j = 1; j < bbox->classify_count; j++) {
+                    float prob = av_q2d(bbox->classify_confidences[j]);
+                    if (prob < min) {
+                        min = prob;
+                        minpos = j;
+                    }
+                }
+
+                if (probabilities[i] > min) {
+                    ret = detection_bbox_set_content(bbox, labels[i], minpos, probabilities[i]);
+                    if (ret < 0)
+                        return ret;
+                }
+            } else {
+                ret = detection_bbox_set_content(bbox, labels[i], bbox->classify_count, probabilities[i]);
+                if (ret < 0)
+                    return ret;
+                bbox->classify_count++;
+            }
+        }
+    }
+    return 0;
+}
+
+static int read_classify_label_file(AVFilterContext *context, LabelContext *label_classification_ctx,
+                                    char *labels_filename, int max_line_length)
+{
+    int line_len;
+    FILE *file;
+
+    file = avpriv_fopen_utf8(labels_filename, "r");
+    if (!file) {
+        av_log(context, AV_LOG_ERROR, "failed to open file %s\n", labels_filename);
+        return AVERROR(EINVAL);
+    }
+
+    while (!feof(file)) {
+        char *label;
+        char buf[256];
+        if (!fgets(buf, 256, file)) {
+            break;
+        }
+
+        line_len = strlen(buf);
+        while (line_len) {
+            int i = line_len - 1;
+            if (buf[i] == '\n' || buf[i] == '\r' || buf[i] == ' ') {
+                buf[i] = '\0';
+                line_len--;
+            } else {
+                break;
+            }
+        }
+
+        if (line_len == 0) // empty line
+            continue;
+
+        if (line_len >= max_line_length) {
+            av_log(context, AV_LOG_ERROR, "label %s too long\n", buf);
+            fclose(file);
+            return AVERROR(EINVAL);
+        }
+
+        label = av_strdup(buf);
+        if (!label) {
+            av_log(context, AV_LOG_ERROR, "failed to allocate memory for label %s\n", buf);
+            fclose(file);
+            return AVERROR(ENOMEM);
+        }
+
+        if (av_dynarray_add_nofree(&label_classification_ctx->labels, &label_classification_ctx->label_count, label) <
+            0) {
+            av_log(context, AV_LOG_ERROR, "failed to do av_dynarray_add\n");
+            fclose(file);
+            av_freep(&label);
+            return AVERROR(ENOMEM);
+        }
+    }
+
+    fclose(file);
+    return 0;
+}
+
+static int read_classify_categories_file(AVFilterContext *context, CategoryClassifcationContext *cat_class_ctx,
+                                        char *categories_filename, int max_line_length)
+{
+    FILE *file;
+    char buf[256];
+    int ret = 0;
+    CategoriesContext *current_ctx = NULL;
+    CategoryContext *current_category = NULL;
+
+    file = avpriv_fopen_utf8(categories_filename, "r");
+    if (!file) {
+        av_log(context, AV_LOG_ERROR, "Failed to open categories file %s\n", categories_filename);
+        return AVERROR(EINVAL);
+    }
+
+    // Initialize contexts array
+    cat_class_ctx->max_contexts = 10;
+    cat_class_ctx->num_contexts = 0;
+    cat_class_ctx->category_units = av_calloc(cat_class_ctx->max_contexts, sizeof(CategoriesContext *));
+    if (!cat_class_ctx->category_units) {
+        fclose(file);
+        return AVERROR(ENOMEM);
+    }
+
+    while (fgets(buf, sizeof(buf), file)) {
+        char *line = buf;
+        int line_len = strlen(line);
+
+        // Trim whitespace and newlines
+        while (line_len > 0 &&
+            (line[line_len - 1] == '\n' || line[line_len - 1] == '\r' || line[line_len - 1] == ' ')) {
+            line[--line_len] = '\0';
+        }
+
+        if (line_len == 0)
+            continue;
+
+        if (line_len >= max_line_length) {
+            av_log(context, AV_LOG_ERROR, "Label %s too long\n", buf);
+            ret = AVERROR(ENOMEM);
+            goto end;
+        }
+
+        // Check for context marker [ContextName]
+        if (line[0] == '[' && line[line_len - 1] == ']') {
+            if (current_ctx != NULL) {
+                // Store previous context
+                if (cat_class_ctx->num_contexts >= cat_class_ctx->max_contexts) {
+                    int new_size = cat_class_ctx->max_contexts * 2;
+                    CategoriesContext **new_contexts =
+                        av_realloc_array(cat_class_ctx->category_units, new_size, sizeof(CategoriesContext *));
+                    if (!new_contexts) {
+                        ret = AVERROR(ENOMEM);
+                        goto end;
+                    }
+                    cat_class_ctx->category_units = new_contexts;
+                    cat_class_ctx->max_contexts = new_size;
+                }
+                cat_class_ctx->category_units[cat_class_ctx->num_contexts++] = current_ctx;
+            }
+
+            // Create new context
+            current_ctx = av_calloc(1, sizeof(CategoriesContext));
+            if (!current_ctx) {
+                ret = AVERROR(ENOMEM);
+                goto end;
+            }
+
+            // Extract context name
+            line[line_len - 1] = '\0';
+            current_ctx->name = av_strdup(line + 1);
+            if (!current_ctx->name) {
+                av_freep(&current_ctx);
+                ret = AVERROR(ENOMEM);
+                goto end;
+            }
+
+            current_ctx->category_count = 0;
+            current_ctx->max_categories = 10;
+            current_ctx->categories = av_calloc(current_ctx->max_categories, sizeof(CategoryContext));
+            if (!current_ctx->categories) {
+                av_freep(&current_ctx->name);
+                av_freep(&current_ctx);
+                ret = AVERROR(ENOMEM);
+                goto end;
+            }
+
+            current_category = NULL;
+        }
+        // Check for category marker (CategoryName)
+        else if (line[0] == '(' && line[line_len - 1] == ')') {
+            if (!current_ctx) {
+                av_log(context, AV_LOG_ERROR, "Category found without context\n");
+                ret = AVERROR(EINVAL);
+                goto end;
+            }
+
+            if (current_ctx->category_count >= current_ctx->max_categories) {
+                int new_size = current_ctx->max_categories * 2;
+                CategoryContext *new_categories =
+                    av_realloc_array(current_ctx->categories, new_size, sizeof(CategoryContext));
+                if (!new_categories) {
+                    ret = AVERROR(ENOMEM);
+                    goto end;
+                }
+                current_ctx->categories = new_categories;
+                current_ctx->max_categories = new_size;
+            }
+
+            line[line_len - 1] = '\0';
+            current_category = &current_ctx->categories[current_ctx->category_count++];
+            cat_class_ctx->total_categories++;
+
+            current_category->name = av_strdup(line + 1);
+            if (!current_category->name) {
+                ret = AVERROR(ENOMEM);
+                goto end;
+            }
+
+            current_category->labels = av_calloc(1, sizeof(LabelContext));
+            if (!current_category->labels) {
+                av_freep(&current_category->name);
+                ret = AVERROR(ENOMEM);
+                goto end;
+            }
+            current_category->label_count = 0;
+            current_category->total_probability = 0.0f;
+        }
+        // Must be a label
+        else if (line[0] != '\0') {
+            if (!current_category) {
+                av_log(context, AV_LOG_ERROR, "Label found without category\n");
+                ret = AVERROR(EINVAL);
+                goto end;
+            }
+            char *label = av_strdup(line);
+            if (!label) {
+                ret = AVERROR(ENOMEM);
+                goto end;
+            }
+
+            if (av_dynarray_add_nofree(&current_category->labels->labels, &current_category->labels->label_count,
+                                    label) < 0) {
+                av_freep(&label);
+                ret = AVERROR(ENOMEM);
+                goto end;
+            }
+
+            current_category->label_count++;
+            current_ctx->label_count++;
+            cat_class_ctx->total_labels++;
+        }
+    }
+
+    // Store the last context
+    if (current_ctx) {
+        if (cat_class_ctx->num_contexts >= cat_class_ctx->max_contexts) {
+            int new_size = cat_class_ctx->max_contexts * 2;
+            CategoriesContext **new_contexts =
+                av_realloc_array(cat_class_ctx->category_units, new_size, sizeof(CategoriesContext *));
+            if (!new_contexts) {
+                ret = AVERROR(ENOMEM);
+                goto end;
+            }
+            cat_class_ctx->category_units = new_contexts;
+            cat_class_ctx->max_contexts = new_size;
+        }
+        cat_class_ctx->category_units[cat_class_ctx->num_contexts++] = current_ctx;
+    }
+
+end:
+    if (ret < 0) {
+        // Clean up current context if it wasn't added to the array
+        if (current_ctx) {
+            free_categories_context(current_ctx);
+        }
+    }
+
+    fclose(file);
+    return ret;
+}
+
+static int combine_all_category_labels(LabelContext **label_ctx, CategoryClassifcationContext *cat_class_ctx)
+{
+    char **combined_labels = NULL;
+    int combined_idx = 0;
+
+    *label_ctx = av_calloc(1, sizeof(LabelContext));
+    if (!(*label_ctx))
+        return AVERROR(ENOMEM);
+
+    (*label_ctx)->label_count = cat_class_ctx->total_labels;
+    (*label_ctx)->labels = av_calloc(cat_class_ctx->total_labels, sizeof(char *));
+    if (!(*label_ctx)->labels) {
+        av_freep(label_ctx);
+        return AVERROR(ENOMEM);
+    }
+
+    combined_labels = (*label_ctx)->labels;
+
+    // Combine all labels from all categories
+    for (int c = 0; c < cat_class_ctx->num_contexts; c++) {
+        CategoriesContext *current_ctx = cat_class_ctx->category_units[c];
+        for (int i = 0; i < current_ctx->category_count; i++) {
+            CategoryContext *category = &current_ctx->categories[i];
+            for (int j = 0; j < category->labels->label_count; j++) {
+                combined_labels[combined_idx] = category->labels->labels[j];
+                combined_idx++;
+            }
+        }
+    }
+
+    return 0;
+}
+
+static int get_category_total_label_count(CategoryClassifcationContext *cat_ctx, int **label_counts)
+{
+    if (!cat_ctx || cat_ctx->num_contexts <= 0) {
+        return 0;
+    }
+
+    // Allocate memory for the label counts array
+    *label_counts = av_calloc(cat_ctx->num_contexts, sizeof(int));
+    if (!*label_counts) {
+        return AVERROR(ENOMEM);
+    }
+
+    // Fill the array with label counts from each context
+    for (int i = 0; i < cat_ctx->num_contexts; i++) {
+        CategoriesContext *categories = cat_ctx->category_units[i];
+        if (categories) {
+            (*label_counts)[i] = categories->label_count;
+        } else {
+            (*label_counts)[i] = 0;
+        }
+    }
+
+    return cat_ctx->num_contexts;
+}
+
+static CategoryContext *get_best_category(CategoriesContext *categories_ctx, float *probabilities)
+{
+    CategoryContext *best_category = NULL;
+    float best_probability = -1.0f;
+    int prob_offset = 0;
+
+    // Calculate total probability for each category
+    for (int cat_idx = 0; cat_idx < categories_ctx->category_count; cat_idx++) {
+        CategoryContext *category = &categories_ctx->categories[cat_idx];
+
+        // Sum probabilities for all labels in this category
+        category->total_probability = 0.0f;
+        for (int label_idx = 0; label_idx < category->label_count; label_idx++) {
+            category->total_probability += probabilities[prob_offset + label_idx];
+        }
+
+        if (category->total_probability > best_probability) {
+            best_probability = category->total_probability;
+            best_category = category;
+        }
+
+        prob_offset += category->label_count;
+    }
+
+    return best_category;
+}
+
+static AVDetectionBBox *find_or_create_detection_bbox(AVFrame *frame, uint32_t bbox_index, AVFilterContext *filter_ctx,
+                                                    DnnClassifyContext *ctx)
+{
+    AVFrameSideData *sd;
+    AVDetectionBBoxHeader *header;
+    AVDetectionBBox *bbox;
+
+    sd = av_frame_get_side_data(frame, AV_FRAME_DATA_DETECTION_BBOXES);
+    if (!sd) {
+        header = av_detection_bbox_create_side_data(frame, 1);
+        if (!header) {
+            av_log(filter_ctx, AV_LOG_ERROR, "Cannot get side data in labels processing\n");
+            return NULL;
+        }
+    } else {
+        header = (AVDetectionBBoxHeader *)sd->data;
+    }
+
+    if (bbox_index == 0) {
+        av_strlcat(header->source, ", ", sizeof(header->source));
+        av_strlcat(header->source, ctx->dnnctx.model_filename, sizeof(header->source));
+    }
+
+    // Get bbox for current index
+    bbox = av_get_detection_bbox(header, bbox_index);
+    if (!bbox) {
+        av_log(filter_ctx, AV_LOG_ERROR, "Failed to get bbox %d\n", bbox_index);
+        return NULL;
+    }
+
+    return bbox;
+}
+
+// Processing functions for standard classification (video only)
+static int post_proc_standard(AVFrame *frame, DNNData *output, uint32_t bbox_index, AVFilterContext *filter_ctx)
+{
+    DnnClassifyContext *ctx = filter_ctx->priv;
+    float conf_threshold = ctx->confidence;
+    AVDetectionBBoxHeader *header;
+    AVDetectionBBox *bbox;
+    float *classifications;
+    uint32_t label_id;
+    float confidence;
+    AVFrameSideData *sd;
+    int output_size = output->dims[3] * output->dims[2] * output->dims[1];
+
+    if (output_size <= 0) {
+        return -1;
+    }
+
+    sd = av_frame_get_side_data(frame, AV_FRAME_DATA_DETECTION_BBOXES);
+    if (!sd) {
+        av_log(filter_ctx, AV_LOG_ERROR, "Cannot get side data in post_proc_standard\n");
+        return -1;
+    }
+    header = (AVDetectionBBoxHeader *)sd->data;
+
+    if (bbox_index == 0) {
+        av_strlcat(header->source, ", ", sizeof(header->source));
+        av_strlcat(header->source, ctx->dnnctx.model_filename, sizeof(header->source));
+    }
+
+    classifications = output->data;
+    label_id = 0;
+    confidence = classifications[0];
+    for (int i = 1; i < output_size; i++) {
+        if (classifications[i] > confidence) {
+            label_id = i;
+            confidence = classifications[i];
+        }
+    }
+
+    if (confidence < conf_threshold) {
+        return 0;
+    }
+
+    bbox = av_get_detection_bbox(header, bbox_index);
+    bbox->classify_confidences[bbox->classify_count] = av_make_q((int)(confidence * 10000), 10000);
+
+    if (ctx->label_classification_ctx->labels && label_id < ctx->label_classification_ctx->label_count) {
+        av_strlcpy(bbox->classify_labels[bbox->classify_count], ctx->label_classification_ctx->labels[label_id],
+                sizeof(bbox->classify_labels[bbox->classify_count]));
+    } else {
+        snprintf(bbox->classify_labels[bbox->classify_count], sizeof(bbox->classify_labels[bbox->classify_count]), "%d",
+                label_id);
+    }
+    bbox->classify_count++;
+
+    return 0;
+}
+
+static int post_proc_clxp_labels(AVFrame *frame, DNNData *output, uint32_t bbox_index, AVFilterContext *filter_ctx)
+{
+    DnnClassifyContext *ctx = filter_ctx->priv;
+    const int max_classes_per_box = AV_NUM_DETECTION_BBOX_CLASSIFY;
+    float *probabilities = (float *)output->data;
+    int num_labels = ctx->label_classification_ctx->label_count;
+    AVDetectionBBox *bbox;
+    float confidence_threshold = ctx->confidence;
+    int ret;
+
+    bbox = find_or_create_detection_bbox(frame, bbox_index, filter_ctx, ctx);
+    if (!bbox) {
+        return AVERROR(EINVAL);
+    }
+
+    ret = fill_detection_bbox_with_best_labels(ctx->label_classification_ctx->labels, probabilities, num_labels,
+                                                bbox, max_classes_per_box, confidence_threshold);
+    if (ret < 0) {
+        av_log(filter_ctx, AV_LOG_ERROR, "Failed to fill bbox with best labels\n");
+        return ret;
+    }
+    return 0;
+}
+
+static int post_proc_clxp_categories(AVFrame *frame, DNNData *output, uint32_t bbox_index, AVFilterContext *filter_ctx)
+{
+    DnnClassifyContext *ctx = filter_ctx->priv;
+    CategoryClassifcationContext *cat_class_ctx = ctx->category_classification_ctx;
+    CategoryContext *best_category;
+    AVDetectionBBox *bbox;
+    float *probabilities = output->data;
+    int ret, prob_offset = 0;
+    char **ctx_labels;
+    float *ctx_probabilities;
+
+    bbox = find_or_create_detection_bbox(frame, bbox_index, filter_ctx, ctx);
+    if (!bbox) {
+        return AVERROR(EINVAL);
+    }
+
+    // Allocate temporary arrays for category results
+    ctx_labels = av_malloc_array(cat_class_ctx->num_contexts, sizeof(char *));
+    if (!ctx_labels) {
+        return AVERROR(ENOMEM);
+    }
+
+    for (int i = 0; i < cat_class_ctx->num_contexts; i++) {
+        ctx_labels[i] = av_mallocz(AV_DETECTION_BBOX_LABEL_NAME_MAX_SIZE);
+        if (!ctx_labels[i]) {
+            // Clean up previously allocated memory
+            for (int j = 0; j < i; j++) {
+                av_freep(&ctx_labels[j]);
+            }
+            av_freep(&ctx_labels);
+            return AVERROR(ENOMEM);
+        }
+    }
+
+    ctx_probabilities = av_malloc_array(cat_class_ctx->num_contexts, sizeof(float));
+    if (!ctx_probabilities) {
+        // Clean up
+        for (int i = 0; i < cat_class_ctx->num_contexts; i++) {
+            av_freep(&ctx_labels[i]);
+        }
+        av_freep(&ctx_labels);
+        return AVERROR(ENOMEM);
+    }
+
+    // Process each context
+    for (int ctx_idx = 0; ctx_idx < cat_class_ctx->num_contexts; ctx_idx++) {
+        CategoriesContext *categories_ctx = cat_class_ctx->category_units[ctx_idx];
+        if (!categories_ctx) {
+            av_log(filter_ctx, AV_LOG_ERROR, "Missing classification data at context %d\n", ctx_idx);
+            continue;
+        }
+
+        // Find best category in context
+        best_category = get_best_category(categories_ctx, probabilities + prob_offset);
+        if (!best_category || !best_category->name) {
+            av_log(filter_ctx, AV_LOG_ERROR, "Invalid best category at context %d\n", ctx_idx);
+            continue;
+        }
+
+        // Copy category name instead of assigning pointer
+        av_strlcpy(ctx_labels[ctx_idx], best_category->name, AV_DETECTION_BBOX_LABEL_NAME_MAX_SIZE);
+        ctx_probabilities[ctx_idx] = best_category->total_probability;
+
+        prob_offset += categories_ctx->label_count;
+    }
+
+    // Fill bbox with best labels
+    ret = fill_detection_bbox_with_best_labels(ctx_labels, ctx_probabilities, cat_class_ctx->num_contexts, bbox,
+                                                AV_NUM_DETECTION_BBOX_CLASSIFY, ctx->confidence);
+
+    // Clean up
+    for (int i = 0; i < cat_class_ctx->num_contexts; i++) {
+        av_freep(&ctx_labels[i]);
+    }
+    av_freep(&ctx_labels);
+    av_freep(&ctx_probabilities);
+
+    return ret;
+}
+
+static int dnn_classify_post_proc(AVFrame *frame, DNNData *output, uint32_t bbox_index, AVFilterContext *filter_ctx)
+{
+    DnnClassifyContext *ctx = filter_ctx->priv;
+
+    if (ctx->dnnctx.backend_type == DNN_TH) {
+        if (ctx->category_classification_ctx) {
+            return post_proc_clxp_categories(frame, output, bbox_index, filter_ctx);
+        } else if (ctx->label_classification_ctx) {
+            return post_proc_clxp_labels(frame, output, bbox_index, filter_ctx);
+        }
+        av_log(filter_ctx, AV_LOG_ERROR, "No valid classification context available\n");
+        return AVERROR(EINVAL);
+    } else {
+        return post_proc_standard(frame, output, bbox_index, filter_ctx);
+    }
+}
+
+static void free_contexts(DnnClassifyContext *ctx)
+{
+    if (!ctx)
+        return;
+    if (ctx->category_classification_ctx) {
+        free_category_classfication_context(ctx->category_classification_ctx);
+        av_freep(&ctx->category_classification_ctx);
+        av_freep(&ctx->label_classification_ctx);
+        ctx->category_classification_ctx = NULL;
+        ctx->label_classification_ctx = NULL;
+    } else if (ctx->label_classification_ctx) {
+        free_label_context(ctx->label_classification_ctx);
+        ctx->label_classification_ctx = NULL;
+    }
+}
+
+static int config_input(AVFilterLink *inlink)
+{
+    AVFilterContext *context = inlink->dst;
+    DnnClassifyContext *ctx = context->priv;
+    AVFilterLink *outlink = context->outputs[0];
+    int ret;
+    DNNFunctionType goal_mode;
+
+    // Set media type based on is_audio flag
+    if (ctx->is_audio) {
+        ctx->type = AVMEDIA_TYPE_AUDIO;
+    } else {
+        ctx->type = inlink->type;
+    }
+
+    // Set the output link type to match the input link type
+    outlink->type = inlink->type;
+    outlink->w = inlink->w;
+    outlink->h = inlink->h;
+    outlink->sample_aspect_ratio = inlink->sample_aspect_ratio;
+    outlink->time_base = inlink->time_base;
+
+    int64_t sample_rate = ctx->dnnctx.torch_option.sample_rate;
+
+    // Validate media type
+    if (ctx->type != AVMEDIA_TYPE_AUDIO && ctx->type != AVMEDIA_TYPE_VIDEO) {
+        av_log(context, AV_LOG_ERROR, "Invalid media type. Only audio or video is supported\n");
+        return AVERROR(EINVAL);
+    }
+
+    // Set type-specific parameters and check compatibility
+    if (ctx->type == AVMEDIA_TYPE_AUDIO) {
+        goal_mode = DFT_ANALYTICS_CLAP;
+
+        // Check backend compatibility
+        if (ctx->dnnctx.backend_type != DNN_TH) {
+            av_log(context, AV_LOG_ERROR, "Audio classification requires Torch backend\n");
+            return AVERROR(EINVAL);
+        }
+
+        if (inlink->sample_rate != sample_rate) {
+            av_log(context, AV_LOG_ERROR, "Invalid sample rate. CLAP requires 44100 Hz\n");
+            return AVERROR(EINVAL);
+        }
+
+        // Copy audio properties to output
+        outlink->sample_rate = inlink->sample_rate;
+        outlink->ch_layout = inlink->ch_layout;
+    } else {
+        // Video mode
+        goal_mode = (ctx->dnnctx.backend_type == DNN_TH) ? DFT_ANALYTICS_CLIP : DFT_ANALYTICS_CLASSIFY;
+    }
+    // Initialize label and category contexts based on provided files
+    if (ctx->dnnctx.backend_type == DNN_TH) {
+        if (ctx->labels_filename) {
+            ctx->label_classification_ctx = av_calloc(1, sizeof(LabelContext));
+            if (!ctx->label_classification_ctx)
+                return AVERROR(ENOMEM);
+
+            ret = read_classify_label_file(context, ctx->label_classification_ctx, ctx->labels_filename,
+                                        AV_DETECTION_BBOX_LABEL_NAME_MAX_SIZE);
+            if (ret < 0) {
+                av_log(context, AV_LOG_ERROR, "Failed to read labels file\n");
+                return ret;
+            }
+            ret = ff_dnn_init_with_tokenizer(&ctx->dnnctx, goal_mode, ctx->label_classification_ctx->labels,
+                                            ctx->label_classification_ctx->label_count, NULL, 0, ctx->tokenizer_path,
+                                            context);
+            if (ret < 0) {
+                free_contexts(ctx);
+                return ret;
+            }
+        } else if (ctx->categories_filename) {
+            ctx->category_classification_ctx = av_calloc(1, sizeof(CategoryClassifcationContext));
+            if (!ctx->category_classification_ctx)
+                return AVERROR(ENOMEM);
+
+            ret = read_classify_categories_file(context, ctx->category_classification_ctx, ctx->categories_filename,
+                                                AV_DETECTION_BBOX_LABEL_NAME_MAX_SIZE);
+            if (ret < 0) {
+                av_log(context, AV_LOG_ERROR, "Failed to read categories file\n");
+                free_contexts(ctx);
+                return ret;
+            }
+
+            ret = combine_all_category_labels(&ctx->label_classification_ctx, ctx->category_classification_ctx);
+            if (ret < 0) {
+                av_log(context, AV_LOG_ERROR, "Failed to combine labels\n");
+                free_contexts(ctx);
+                return ret;
+            }
+            // Get total label count of all categories
+            int total_labels;
+            int *label_counts = NULL;
+
+            total_labels = get_category_total_label_count(ctx->category_classification_ctx, &label_counts);
+            if (total_labels <= 0) {
+                av_log(context, AV_LOG_ERROR, "Failed to get category label counts or no labels found\n");
+                free_contexts(ctx);
+                return ret;
+            }
+
+            // Initialize DNN with tokenizer for CLIP/CLAP models
+            ret = ff_dnn_init_with_tokenizer(&ctx->dnnctx, goal_mode, ctx->label_classification_ctx->labels,
+                                            ctx->label_classification_ctx->label_count, label_counts, total_labels,
+                                            ctx->tokenizer_path, context);
+            if (ret < 0) {
+                av_freep(&label_counts);
+                free_contexts(ctx);
+                return ret;
+            }
+            av_freep(&label_counts);
+        }
+    } else if (ctx->dnnctx.backend_type == DNN_OV) {
+        // Initialize standard DNN for OpenVINO
+        ret = ff_dnn_init(&ctx->dnnctx, goal_mode, context);
+        if (ret < 0)
+            return ret;
+
+        // Read labels file
+        ctx->label_classification_ctx = av_calloc(1, sizeof(LabelContext));
+        if (!ctx->label_classification_ctx)
+            return AVERROR(ENOMEM);
+
+        ret = read_classify_label_file(context, ctx->label_classification_ctx, ctx->labels_filename,
+                                    AV_DETECTION_BBOX_LABEL_NAME_MAX_SIZE);
+        if (ret < 0) {
+            av_log(context, AV_LOG_ERROR, "Failed to read labels file\n");
+            free_contexts(ctx);
+            return ret;
+        }
+    }
+
+    // Set the post-processing callback
+    ff_dnn_set_classify_post_proc(&ctx->dnnctx, dnn_classify_post_proc);
+    return 0;
+}
+
+static av_cold int dnn_classify_init(AVFilterContext *context)
+{
+    DnnClassifyContext *ctx = context->priv;
+    int ret;
+
+    // Create a static pad with the appropriate media type
+    AVFilterPad pad = {
+        .name = av_strdup("default"),
+        .type = ctx->is_audio ? AVMEDIA_TYPE_AUDIO : AVMEDIA_TYPE_VIDEO,
+        .config_props = config_input,
+    };
+
+    ret = ff_append_inpad(context, &pad);
+    if (ret < 0)
+        return ret;
+
+    // Create a matching output pad
+    AVFilterPad outpad = {
+        .name = av_strdup("default"),
+        .type = ctx->is_audio ? AVMEDIA_TYPE_AUDIO : AVMEDIA_TYPE_VIDEO,
+    };
+
+    ret = ff_append_outpad(context, &outpad);
+    if (ret < 0)
+        return ret;
+
+    // Check backend and file parameters (parameter validation only)
+    if (ctx->dnnctx.backend_type == DNN_TH) {
+        // Check CLIP/CLAP specific parameters
+        if (ctx->labels_filename && ctx->categories_filename) {
+            av_log(context, AV_LOG_ERROR, "Labels and categories file cannot be used together\n");
+            return AVERROR(EINVAL);
+        }
+
+        if (!ctx->labels_filename && !ctx->categories_filename) {
+            av_log(context, AV_LOG_ERROR, "Labels or categories file is required for classification\n");
+            return AVERROR(EINVAL);
+        }
+
+        if (!ctx->tokenizer_path) {
+            av_log(context, AV_LOG_ERROR, "Tokenizer file is required for CLIP/CLAP classification\n");
+            return AVERROR(EINVAL);
+        }
+    } else if (ctx->dnnctx.backend_type == DNN_OV) {
+        // Check OpenVINO specific parameters
+        if (!ctx->labels_filename) {
+            av_log(context, AV_LOG_ERROR, "Labels file is required for classification\n");
+            return AVERROR(EINVAL);
+        }
+
+        if (ctx->categories_filename) {
+            av_log(context, AV_LOG_ERROR, "Categories file is only supported for CLIP/CLAP models\n");
+            return AVERROR(EINVAL);
+        }
+
+        // Audio classification is not supported with OpenVINO backend
+        if (ctx->is_audio) {
+            av_log(context, AV_LOG_ERROR, "Audio classification requires Torch backend\n");
+            return AVERROR(EINVAL);
+        }
+    }
+    return 0;
+}
+
+static const enum AVPixelFormat pix_fmts[] = {AV_PIX_FMT_RGB24,   AV_PIX_FMT_BGR24,   AV_PIX_FMT_GRAY8,
+                                            AV_PIX_FMT_GRAYF32, AV_PIX_FMT_YUV420P, AV_PIX_FMT_YUV422P,
+                                            AV_PIX_FMT_YUV444P, AV_PIX_FMT_YUV410P, AV_PIX_FMT_YUV411P,
+                                            AV_PIX_FMT_NV12,    AV_PIX_FMT_NONE};
+
+static const enum AVSampleFormat sample_fmts[] = {AV_SAMPLE_FMT_FLT, AV_SAMPLE_FMT_NONE};
+
+static int query_formats(AVFilterContext *ctx)
+{
+    DnnClassifyContext *classify_ctx = ctx->priv;
+
+    int ret;
+    // Get the type from the first input pad
+    enum AVMediaType type = ctx->inputs[0]->type;
+
+    if (type == AVMEDIA_TYPE_VIDEO) {
+        ret = ff_set_common_formats(ctx, ff_make_format_list(pix_fmts));
+        if (ret < 0)
+            return ret;
+    } else if (type == AVMEDIA_TYPE_AUDIO) {
+
+        ret = ff_set_common_formats(ctx, ff_make_format_list(sample_fmts));
+        if (ret < 0)
+            return ret;
+#if (CONFIG_LIBTORCH == 1)
+        ret = ff_set_common_samplerates(
+            ctx, ff_make_format_list((const int[]){classify_ctx->dnnctx.torch_option.sample_rate, -1}));
+        if (ret < 0)
+            return ret;
+#endif
+        ret = ff_set_common_channel_layouts(ctx, ff_all_channel_layouts());
+        if (ret < 0)
+            return ret;
+    } else {
+        av_log(ctx, AV_LOG_ERROR, "Unsupported media type: %d\n", type);
+        return AVERROR(EINVAL);
+    }
+
+    return 0;
+}
+static int dnn_classify_flush_frame(AVFilterLink *outlink, int64_t pts, int64_t *out_pts)
+{
+    AVFilterContext *context = outlink->src;
+    DnnClassifyContext *ctx = context->priv;
+    int ret;
+    DNNAsyncStatusType async_state;
+
+    ret = ff_dnn_flush(&ctx->dnnctx);
+    if (ret != 0) {
+        return -1;
+    }
+
+    do {
+        AVFrame *in_frame = NULL;
+        AVFrame *out_frame = NULL;
+        async_state = ff_dnn_get_result(&ctx->dnnctx, &in_frame, &out_frame);
+        if (async_state == DAST_SUCCESS) {
+            ret = ff_filter_frame(outlink, in_frame);
+            if (ret < 0)
+                return ret;
+            if (out_pts)
+                *out_pts = in_frame->pts + pts;
+        }
+        av_usleep(5000);
+    } while (async_state >= DAST_NOT_READY);
+
+    return 0;
+}
+
+static int process_video_frame(DnnClassifyContext *ctx, AVFrame *frame)
+{
+    int ret;
+
+    if (ctx->dnnctx.backend_type == DNN_TH) {
+        ret = ff_dnn_execute_model_clip(&ctx->dnnctx, frame, NULL, ctx->label_classification_ctx->labels,
+                                        ctx->label_classification_ctx->label_count, ctx->tokenizer_path, ctx->target);
+    } else {
+        ret = ff_dnn_execute_model_classification(&ctx->dnnctx, frame, NULL, ctx->target);
+    }
+
+    if (ret != 0) {
+        av_frame_free(&frame);
+        return AVERROR(EIO);
+    }
+
+    return 0;
+}
+
+static int process_audio_frame(DnnClassifyContext *ctx, AVFrame *frame)
+{
+    int ret = ff_dnn_execute_model_clap(&ctx->dnnctx, frame, NULL, ctx->label_classification_ctx->labels,
+                                        ctx->label_classification_ctx->label_count, ctx->tokenizer_path);
+
+    if (ret != 0) {
+        av_frame_free(&frame);
+        return AVERROR(EIO);
+    }
+
+    return 0;
+}
+
+static int process_audio_buffer(DnnClassifyContext *ctx, AVFilterLink *inlink)
+{
+    static AVFrame *audio_buffer = NULL;
+    static int buffer_offset = 0;
+    int64_t required_samples = ctx->dnnctx.torch_option.sample_rate * ctx->dnnctx.torch_option.sample_duration;
+    int ret = 0, samples_to_copy = 0;
+    AVFrame *in = NULL;
+
+    while (buffer_offset < required_samples) {
+        ret = ff_inlink_consume_frame(inlink, &in);
+        if (ret < 0)
+            return ret;
+        if (ret == 0)
+            break; // No more frames available right now
+
+        // First frame - initialize our buffer
+        if (!audio_buffer) {
+            audio_buffer = av_frame_alloc();
+            if (!audio_buffer) {
+                av_frame_free(&in);
+                return AVERROR(ENOMEM);
+            }
+
+            // Allocate our buffer to hold exactly required_samples
+            audio_buffer->format = in->format;
+            audio_buffer->ch_layout = in->ch_layout;
+            audio_buffer->sample_rate = in->sample_rate;
+            audio_buffer->nb_samples = required_samples;
+            audio_buffer->pts = in->pts;
+
+            ret = av_frame_get_buffer(audio_buffer, 0);
+            if (ret < 0) {
+                av_frame_free(&audio_buffer);
+                av_frame_free(&in);
+                return ret;
+            }
+        }
+
+        // Copy samples to our buffer
+        samples_to_copy = FFMIN(in->nb_samples, required_samples - buffer_offset);
+        for (int ch = 0; ch < inlink->ch_layout.nb_channels; ch++) {
+            if (!in->data[ch] || !audio_buffer->data[ch]) {
+                continue;
+            }
+            memcpy((float *)audio_buffer->data[ch] + buffer_offset, (float *)in->data[ch],
+                samples_to_copy * sizeof(float));
+        }
+
+        buffer_offset += samples_to_copy;
+        av_frame_free(&in);
+
+        // If we've filled our buffer, process it
+        if (buffer_offset >= required_samples) {
+            ret = process_audio_frame(ctx, audio_buffer);
+            if (ret < 0)
+                return ret;
+
+            // Reset for next frame
+            audio_buffer = NULL;
+            buffer_offset = 0;
+            break;
+        }
+    }
+    return ret;
+}
+
+static int dnn_classify_activate(AVFilterContext *context)
+{
+    DnnClassifyContext *ctx = context->priv;
+    AVFilterLink *inlink = context->inputs[0];
+    AVFilterLink *outlink = context->outputs[0];
+    int ret, status;
+    int64_t pts;
+    AVFrame *in = NULL;
+    int got_frame = 0;
+    DNNAsyncStatusType async_state;
+
+    // Check for EOF or other status
+    if (ff_inlink_acknowledge_status(inlink, &status, &pts)) {
+        if (status == AVERROR_EOF) {
+            int64_t out_pts = pts;
+            ret = dnn_classify_flush_frame(outlink, pts, &out_pts);
+            ff_outlink_set_status(outlink, status, out_pts);
+            return ret;
+        }
+    }
+
+    if (ctx->type == AVMEDIA_TYPE_AUDIO) {
+        ret = process_audio_buffer(ctx, inlink);
+        if (ret < 0) {
+            return ret;
+        }
+    } else {
+        ret = ff_inlink_consume_frame(inlink, &in);
+        if (ret < 0)
+            return ret;
+        if (ret > 0) {
+            ret = process_video_frame(ctx, in);
+            if (ret < 0) {
+                av_frame_free(&in);
+                return ret;
+            }
+        }
+    }
+
+    // Get processed results
+    do {
+        AVFrame *in_frame = NULL;
+        AVFrame *out_frame = NULL;
+        async_state = ff_dnn_get_result(&ctx->dnnctx, &in_frame, &out_frame);
+        if (async_state == DAST_SUCCESS) {
+            ret = ff_filter_frame(outlink, in_frame);
+            if (ret < 0)
+                return ret;
+            got_frame = 1;
+        }
+    } while (async_state == DAST_SUCCESS);
+
+    if (got_frame)
+        return 0;
+
+    // Request more frames if needed
+    if (ff_outlink_frame_wanted(outlink))
+        ff_inlink_request_frame(inlink);
+
+    return FFERROR_NOT_READY;
+}
+
+static av_cold void dnn_classify_uninit(AVFilterContext *context)
+{
+    DnnClassifyContext *ctx = context->priv;
+    ff_dnn_uninit(&ctx->dnnctx);
+    free_contexts(ctx);
+}
+
+const FFFilter ff_avf_dnn_classify = {
+    .p.name         = "dnn_classify",
+    .p.description  = NULL_IF_CONFIG_SMALL("Apply DNN classification filter to the input."),
+    .p.priv_class   = &dnn_classify_class,
+    .p.flags        = AVFILTER_FLAG_SUPPORT_TIMELINE_GENERIC,
+    .priv_size      = sizeof(DnnClassifyContext),
+    .preinit        = ff_dnn_filter_init_child_class,
+    .init           = dnn_classify_init,
+    .uninit         = dnn_classify_uninit,
+    .activate       = dnn_classify_activate,
+    FILTER_QUERY_FUNC2(query_formats),
+};
diff --git a/libavfilter/vf_dnn_classify.c b/libavfilter/vf_dnn_classify.c
deleted file mode 100644
index f92c41ab76..0000000000
--- a/libavfilter/vf_dnn_classify.c
+++ /dev/null
@@ -1,308 +0,0 @@
-/*
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-/**
- * @file
- * implementing an classification filter using deep learning networks.
- */
-
-#include "libavutil/file_open.h"
-#include "libavutil/mem.h"
-#include "libavutil/opt.h"
-#include "filters.h"
-#include "dnn_filter_common.h"
-#include "video.h"
-#include "libavutil/time.h"
-#include "libavutil/avstring.h"
-#include "libavutil/detection_bbox.h"
-
-typedef struct DnnClassifyContext {
-    const AVClass *class;
-    DnnContext dnnctx;
-    float confidence;
-    char *labels_filename;
-    char *target;
-    char **labels;
-    int label_count;
-} DnnClassifyContext;
-
-#define OFFSET(x) offsetof(DnnClassifyContext, dnnctx.x)
-#define OFFSET2(x) offsetof(DnnClassifyContext, x)
-#define FLAGS AV_OPT_FLAG_FILTERING_PARAM | AV_OPT_FLAG_VIDEO_PARAM
-static const AVOption dnn_classify_options[] = {
-    { "dnn_backend", "DNN backend",                OFFSET(backend_type),     AV_OPT_TYPE_INT,       { .i64 = DNN_OV },    INT_MIN, INT_MAX, FLAGS, .unit = "backend" },
-#if (CONFIG_LIBOPENVINO == 1)
-    { "openvino",    "openvino backend flag",      0,                        AV_OPT_TYPE_CONST,     { .i64 = DNN_OV },    0, 0, FLAGS, .unit = "backend" },
-#endif
-    { "confidence",  "threshold of confidence",    OFFSET2(confidence),      AV_OPT_TYPE_FLOAT,     { .dbl = 0.5 },  0, 1, FLAGS},
-    { "labels",      "path to labels file",        OFFSET2(labels_filename), AV_OPT_TYPE_STRING,    { .str = NULL }, 0, 0, FLAGS },
-    { "target",      "which one to be classified", OFFSET2(target),          AV_OPT_TYPE_STRING,    { .str = NULL }, 0, 0, FLAGS },
-    { NULL }
-};
-
-AVFILTER_DNN_DEFINE_CLASS(dnn_classify, DNN_OV);
-
-static int dnn_classify_post_proc(AVFrame *frame, DNNData *output, uint32_t bbox_index, AVFilterContext *filter_ctx)
-{
-    DnnClassifyContext *ctx = filter_ctx->priv;
-    float conf_threshold = ctx->confidence;
-    AVDetectionBBoxHeader *header;
-    AVDetectionBBox *bbox;
-    float *classifications;
-    uint32_t label_id;
-    float confidence;
-    AVFrameSideData *sd;
-    int output_size = output->dims[3] * output->dims[2] * output->dims[1];
-    if (output_size <= 0) {
-        return -1;
-    }
-
-    sd = av_frame_get_side_data(frame, AV_FRAME_DATA_DETECTION_BBOXES);
-    if (!sd) {
-        av_log(filter_ctx, AV_LOG_ERROR, "Cannot get side data in dnn_classify_post_proc\n");
-        return -1;
-    }
-    header = (AVDetectionBBoxHeader *)sd->data;
-
-    if (bbox_index == 0) {
-        av_strlcat(header->source, ", ", sizeof(header->source));
-        av_strlcat(header->source, ctx->dnnctx.model_filename, sizeof(header->source));
-    }
-
-    classifications = output->data;
-    label_id = 0;
-    confidence= classifications[0];
-    for (int i = 1; i < output_size; i++) {
-        if (classifications[i] > confidence) {
-            label_id = i;
-            confidence= classifications[i];
-        }
-    }
-
-    if (confidence < conf_threshold) {
-        return 0;
-    }
-
-    bbox = av_get_detection_bbox(header, bbox_index);
-    bbox->classify_confidences[bbox->classify_count] = av_make_q((int)(confidence * 10000), 10000);
-
-    if (ctx->labels && label_id < ctx->label_count) {
-        av_strlcpy(bbox->classify_labels[bbox->classify_count], ctx->labels[label_id], sizeof(bbox->classify_labels[bbox->classify_count]));
-    } else {
-        snprintf(bbox->classify_labels[bbox->classify_count], sizeof(bbox->classify_labels[bbox->classify_count]), "%d", label_id);
-    }
-
-    bbox->classify_count++;
-
-    return 0;
-}
-
-static void free_classify_labels(DnnClassifyContext *ctx)
-{
-    for (int i = 0; i < ctx->label_count; i++) {
-        av_freep(&ctx->labels[i]);
-    }
-    ctx->label_count = 0;
-    av_freep(&ctx->labels);
-}
-
-static int read_classify_label_file(AVFilterContext *context)
-{
-    int line_len;
-    FILE *file;
-    DnnClassifyContext *ctx = context->priv;
-
-    file = avpriv_fopen_utf8(ctx->labels_filename, "r");
-    if (!file){
-        av_log(context, AV_LOG_ERROR, "failed to open file %s\n", ctx->labels_filename);
-        return AVERROR(EINVAL);
-    }
-
-    while (!feof(file)) {
-        char *label;
-        char buf[256];
-        if (!fgets(buf, 256, file)) {
-            break;
-        }
-
-        line_len = strlen(buf);
-        while (line_len) {
-            int i = line_len - 1;
-            if (buf[i] == '\n' || buf[i] == '\r' || buf[i] == ' ') {
-                buf[i] = '\0';
-                line_len--;
-            } else {
-                break;
-            }
-        }
-
-        if (line_len == 0)  // empty line
-            continue;
-
-        if (line_len >= AV_DETECTION_BBOX_LABEL_NAME_MAX_SIZE) {
-            av_log(context, AV_LOG_ERROR, "label %s too long\n", buf);
-            fclose(file);
-            return AVERROR(EINVAL);
-        }
-
-        label = av_strdup(buf);
-        if (!label) {
-            av_log(context, AV_LOG_ERROR, "failed to allocate memory for label %s\n", buf);
-            fclose(file);
-            return AVERROR(ENOMEM);
-        }
-
-        if (av_dynarray_add_nofree(&ctx->labels, &ctx->label_count, label) < 0) {
-            av_log(context, AV_LOG_ERROR, "failed to do av_dynarray_add\n");
-            fclose(file);
-            av_freep(&label);
-            return AVERROR(ENOMEM);
-        }
-    }
-
-    fclose(file);
-    return 0;
-}
-
-static av_cold int dnn_classify_init(AVFilterContext *context)
-{
-    DnnClassifyContext *ctx = context->priv;
-    int ret = ff_dnn_init(&ctx->dnnctx, DFT_ANALYTICS_CLASSIFY, context);
-    if (ret < 0)
-        return ret;
-    ff_dnn_set_classify_post_proc(&ctx->dnnctx, dnn_classify_post_proc);
-
-    if (ctx->labels_filename) {
-        return read_classify_label_file(context);
-    }
-    return 0;
-}
-
-static const enum AVPixelFormat pix_fmts[] = {
-    AV_PIX_FMT_RGB24, AV_PIX_FMT_BGR24,
-    AV_PIX_FMT_GRAY8, AV_PIX_FMT_GRAYF32,
-    AV_PIX_FMT_YUV420P, AV_PIX_FMT_YUV422P,
-    AV_PIX_FMT_YUV444P, AV_PIX_FMT_YUV410P, AV_PIX_FMT_YUV411P,
-    AV_PIX_FMT_NV12,
-    AV_PIX_FMT_NONE
-};
-
-static int dnn_classify_flush_frame(AVFilterLink *outlink, int64_t pts, int64_t *out_pts)
-{
-    DnnClassifyContext *ctx = outlink->src->priv;
-    int ret;
-    DNNAsyncStatusType async_state;
-
-    ret = ff_dnn_flush(&ctx->dnnctx);
-    if (ret != 0) {
-        return -1;
-    }
-
-    do {
-        AVFrame *in_frame = NULL;
-        AVFrame *out_frame = NULL;
-        async_state = ff_dnn_get_result(&ctx->dnnctx, &in_frame, &out_frame);
-        if (async_state == DAST_SUCCESS) {
-            ret = ff_filter_frame(outlink, in_frame);
-            if (ret < 0)
-                return ret;
-            if (out_pts)
-                *out_pts = in_frame->pts + pts;
-        }
-        av_usleep(5000);
-    } while (async_state >= DAST_NOT_READY);
-
-    return 0;
-}
-
-static int dnn_classify_activate(AVFilterContext *filter_ctx)
-{
-    AVFilterLink *inlink = filter_ctx->inputs[0];
-    AVFilterLink *outlink = filter_ctx->outputs[0];
-    DnnClassifyContext *ctx = filter_ctx->priv;
-    AVFrame *in = NULL;
-    int64_t pts;
-    int ret, status;
-    int got_frame = 0;
-    int async_state;
-
-    FF_FILTER_FORWARD_STATUS_BACK(outlink, inlink);
-
-    do {
-        // drain all input frames
-        ret = ff_inlink_consume_frame(inlink, &in);
-        if (ret < 0)
-            return ret;
-        if (ret > 0) {
-            if (ff_dnn_execute_model_classification(&ctx->dnnctx, in, NULL, ctx->target) != 0) {
-                return AVERROR(EIO);
-            }
-        }
-    } while (ret > 0);
-
-    // drain all processed frames
-    do {
-        AVFrame *in_frame = NULL;
-        AVFrame *out_frame = NULL;
-        async_state = ff_dnn_get_result(&ctx->dnnctx, &in_frame, &out_frame);
-        if (async_state == DAST_SUCCESS) {
-            ret = ff_filter_frame(outlink, in_frame);
-            if (ret < 0)
-                return ret;
-            got_frame = 1;
-        }
-    } while (async_state == DAST_SUCCESS);
-
-    // if frame got, schedule to next filter
-    if (got_frame)
-        return 0;
-
-    if (ff_inlink_acknowledge_status(inlink, &status, &pts)) {
-        if (status == AVERROR_EOF) {
-            int64_t out_pts = pts;
-            ret = dnn_classify_flush_frame(outlink, pts, &out_pts);
-            ff_outlink_set_status(outlink, status, out_pts);
-            return ret;
-        }
-    }
-
-    FF_FILTER_FORWARD_WANTED(outlink, inlink);
-
-    return 0;
-}
-
-static av_cold void dnn_classify_uninit(AVFilterContext *context)
-{
-    DnnClassifyContext *ctx = context->priv;
-    ff_dnn_uninit(&ctx->dnnctx);
-    free_classify_labels(ctx);
-}
-
-const FFFilter ff_vf_dnn_classify = {
-    .p.name        = "dnn_classify",
-    .p.description = NULL_IF_CONFIG_SMALL("Apply DNN classify filter to the input."),
-    .p.priv_class  = &dnn_classify_class,
-    .priv_size     = sizeof(DnnClassifyContext),
-    .preinit       = ff_dnn_filter_init_child_class,
-    .init          = dnn_classify_init,
-    .uninit        = dnn_classify_uninit,
-    FILTER_INPUTS(ff_video_default_filterpad),
-    FILTER_OUTPUTS(ff_video_default_filterpad),
-    FILTER_PIXFMTS_ARRAY(pix_fmts),
-    .activate      = dnn_classify_activate,
-};
--
2.34.1


_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".