* [FFmpeg-devel] [PATCH] avfilter/dnn: add zero-shot image classification using CLIP models
@ 2025-01-29 20:33 m.kaindl0208
From: m.kaindl0208 @ 2025-01-29 20:33 UTC (permalink / raw)
To: ffmpeg-devel
Add a new filter 'dnn_clip' that performs zero-shot image classification using CLIP (Contrastive Language-Image Pre-Training) models. The filter supports:
- Loading and running CLIP models through the LibTorch backend
- Outputting classification confidence scores as frame side data
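The scores are attached as AV_FRAME_DATA_DETECTION_BBOXES side data (the same structure dnn_classify uses), so a downstream consumer can read them along these lines — a minimal sketch, not part of the patch, and print_clip_scores is a hypothetical helper:

#include <stdio.h>
#include "libavutil/frame.h"
#include "libavutil/detection_bbox.h"

/* Sketch: dump the per-prompt confidences dnn_clip attaches to a frame. */
static void print_clip_scores(const AVFrame *frame)
{
    AVFrameSideData *sd =
        av_frame_get_side_data(frame, AV_FRAME_DATA_DETECTION_BBOXES);
    if (!sd)
        return;

    AVDetectionBBoxHeader *header = (AVDetectionBBoxHeader *)sd->data;
    for (uint32_t i = 0; i < header->nb_bboxes; i++) {
        AVDetectionBBox *bbox = av_get_detection_bbox(header, i);
        for (uint32_t j = 0; j < bbox->classify_count; j++)
            printf("%s: %d/%d\n", bbox->classify_labels[j],
                   bbox->classify_confidences[j].num,
                   bbox->classify_confidences[j].den);
    }
}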
Requires the tokenizers-cpp library (https://github.com/mlc-ai/tokenizers-cpp) for text tokenization. Build it as described in the project's README.
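Roughly, building it looks like this (a hedged sketch that assumes the example's CMake project is used, matching the example/build/tokenizers library path in the configure line below; a Rust toolchain is required, and the upstream README is authoritative):

git clone --recursive https://github.com/mlc-ai/tokenizers-cpp.git
cd tokenizers-cpp/example
cmake -B build
cmake --build build
# expected to leave libtokenizers_cpp/libtokenizers_c under example/build/tokenizers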
Example for running CLIP inference:
./ffmpeg -i duck.mp4 -vf "dnn_clip=dnn_backend=torch:model=openclip-vit-l-14.pt:labels=labels_duck.txt:tokenizer=tokenizer.json,showinfo" -y out.mp4
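Here labels_duck.txt is just a plain text file with one prompt per line (trailing whitespace is stripped and empty lines are skipped); the prompts below are only an illustration:

a photo of a duck
a photo of a cat
a photo of a dog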
Example configure invocation:
Set LIBTORCH_ROOT and TOKENIZER_ROOT to the respective installation root directories.
./configure \
--enable-debug \
--enable-libtorch \
--enable-libtokenizers \
--extra-cflags="-I${LIBTORCH_ROOT}/include \
-I${LIBTORCH_ROOT}/include/torch/csrc/api/include \
-I${TOKENIZER_ROOT}/include " \
--extra-ldflags="-L${LIBTORCH_ROOT}/lib \
-L${TOKENIZER_ROOT}/example/build/tokenizers "
Do not forget to add the corresponding library and binary paths to PATH and LD_LIBRARY_PATH.
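For example (a sketch; the exact directories depend on the local setup):

export LD_LIBRARY_PATH="${LIBTORCH_ROOT}/lib:${TOKENIZER_ROOT}/example/build/tokenizers:${LD_LIBRARY_PATH}"
export PATH="${LIBTORCH_ROOT}/bin:${PATH}"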
Note: the configure check only verifies headers and linking; it skips verification of the tokenizer functionality, since Tokenizer is an abstract interface class.
For testing, I mainly used the LAION CLIP-ViT-L-14-DataComp.XL-s13B-b90K model, which is available on Hugging Face. The repository contains both the required tokenizer.json file and the model itself. To use this model, it needs to be converted to TorchScript (.pt) format, which can be done with a short Python script.
I have implemented this conversion; the script is available in my GitHub repository at https://github.com/MaximilianKaindl/PyVideoClassifcation/blob/main/src/converters/clip_to_pt.py. It downloads the CLIP model from Hugging Face, converts it, and then tests it on an input image.
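For reference, the core of such a conversion looks roughly like the following (a hedged sketch, not the linked script; the open_clip model/pretrained tags and the tracing approach are assumptions, and the real converter also takes care of the exact encode_image/encode_text signatures the backend expects):

import torch
import open_clip

# Load the pretrained CLIP model (tag follows open_clip's naming; assumption)
model, _, _ = open_clip.create_model_and_transforms(
    "ViT-L-14", pretrained="datacomp_xl_s13b_b90k")
model.eval()

# Dummy inputs: one 224x224 RGB image and one 77-token text sequence
image = torch.randn(1, 3, 224, 224)
text = torch.randint(0, 49408, (1, 77), dtype=torch.long)

# Export encode_image/encode_text so torch::jit::load can call them by name
scripted = torch.jit.trace_module(
    model, {"encode_image": (image,), "encode_text": (text,)})
scripted.save("openclip-vit-l-14.pt")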
I think this functionality would be a great addition to FFmpeg. I plan to implement an additional filter that takes the average classification over all frames and saves it to a file or prints it to the console.
Using this, content recognition could be achieved with prompts such as:
a photo of a sports show
a photo of a documentary
....
Furthermore, I have also implemented FATE tests. Should I upload them? The necessary files are around 1.6 GB, mostly because of the model.
Signed-off-by: MaximilianKaindl <m.kaindl0208@gmail.com>
---
configure | 6 +-
libavfilter/Makefile | 1 +
libavfilter/allfilters.c | 1 +
libavfilter/dnn/Makefile | 1 +
libavfilter/dnn/dnn_backend_torch.cpp | 128 ++++++--
libavfilter/dnn/dnn_backend_torch_clip.cpp | 356 +++++++++++++++++++++
libavfilter/dnn/dnn_backend_torch_clip.h | 58 ++++
libavfilter/dnn/dnn_backend_torch_common.h | 66 ++++
libavfilter/dnn/dnn_io_proc.c | 52 +++
libavfilter/dnn/dnn_io_proc.h | 1 +
libavfilter/dnn_filter_common.c | 16 +
libavfilter/dnn_filter_common.h | 1 +
libavfilter/dnn_interface.h | 8 +
libavfilter/vf_dnn_clip.c | 342 ++++++++++++++++++++
14 files changed, 1003 insertions(+), 34 deletions(-)
create mode 100644 libavfilter/dnn/dnn_backend_torch_clip.cpp
create mode 100644 libavfilter/dnn/dnn_backend_torch_clip.h
create mode 100644 libavfilter/dnn/dnn_backend_torch_common.h
create mode 100644 libavfilter/vf_dnn_clip.c
diff --git a/configure b/configure
index 06f641ead0..9fb439f2a8 100755
--- a/configure
+++ b/configure
@@ -285,6 +285,7 @@ External library support:
--enable-libtls enable LibreSSL (via libtls), needed for https support
if openssl, gnutls or mbedtls is not used [no]
--enable-libtorch enable Torch as one DNN backend [no]
+ --enable-libtokenizers enable tokenizers-cpp library [no]
--enable-libtwolame enable MP2 encoding via libtwolame [no]
--enable-libuavs3d enable AVS3 decoding via libuavs3d [no]
--enable-libv4l2 enable libv4l2/v4l-utils [no]
@@ -1967,6 +1968,7 @@ EXTERNAL_LIBRARY_LIST="
libtesseract
libtheora
libtorch
+ libtokenizers
libtwolame
libuavs3d
libv4l2
@@ -2870,7 +2872,7 @@ dirac_parse_select="golomb"
dovi_rpudec_select="golomb"
dovi_rpuenc_select="golomb"
dnn_deps="avformat swscale"
-dnn_deps_any="libtensorflow libopenvino libtorch"
+dnn_deps_any="libtensorflow libopenvino libtorch libtokenizers"
error_resilience_select="me_cmp"
evcparse_select="golomb"
faandct_deps="faan"
@@ -3892,6 +3894,7 @@ dilation_opencl_filter_deps="opencl"
dnn_classify_filter_select="dnn"
dnn_detect_filter_select="dnn"
dnn_processing_filter_select="dnn"
+dnn_clip_filter_select="dnn"
drawtext_filter_deps="libfreetype libharfbuzz"
drawtext_filter_suggest="libfontconfig libfribidi"
elbg_filter_deps="avcodec"
@@ -7034,6 +7037,7 @@ enabled libtesseract && require_pkg_config libtesseract tesseract tesseract
enabled libtheora && require libtheora theora/theoraenc.h th_info_init -ltheoraenc -ltheoradec -logg
enabled libtls && require_pkg_config libtls libtls tls.h tls_configure
enabled libtorch && check_cxxflags -std=c++17 && require_cpp libtorch torch/torch.h "torch::Tensor" -ltorch -lc10 -ltorch_cpu -lstdc++ -lpthread
+enabled libtokenizers && check_cxxflags -std=c++17 && require_cpp libtokenizers "tokenizers_cpp.h tokenizers_c.h" "" -ltokenizers_cpp -ltokenizers_c -lstdc++
enabled libtwolame && require libtwolame twolame.h twolame_init -ltwolame &&
{ check_lib libtwolame twolame.h twolame_encode_buffer_float32_interleaved -ltwolame ||
die "ERROR: libtwolame must be installed and version must be >= 0.3.10"; }
diff --git a/libavfilter/Makefile b/libavfilter/Makefile
index 4d9681768b..efb5a8164e 100644
--- a/libavfilter/Makefile
+++ b/libavfilter/Makefile
@@ -289,6 +289,7 @@ OBJS-$(CONFIG_DISPLACE_FILTER) += vf_displace.o framesync.o
OBJS-$(CONFIG_DNN_CLASSIFY_FILTER) += vf_dnn_classify.o
OBJS-$(CONFIG_DNN_DETECT_FILTER) += vf_dnn_detect.o
OBJS-$(CONFIG_DNN_PROCESSING_FILTER) += vf_dnn_processing.o
+OBJS-$(CONFIG_DNN_CLIP_FILTER) += vf_dnn_clip.o
OBJS-$(CONFIG_DOUBLEWEAVE_FILTER) += vf_weave.o
OBJS-$(CONFIG_DRAWBOX_FILTER) += vf_drawbox.o
OBJS-$(CONFIG_DRAWGRAPH_FILTER) += f_drawgraph.o
diff --git a/libavfilter/allfilters.c b/libavfilter/allfilters.c
index c9178ba27b..da2117f038 100644
--- a/libavfilter/allfilters.c
+++ b/libavfilter/allfilters.c
@@ -264,6 +264,7 @@ extern const FFFilter ff_vf_displace;
extern const FFFilter ff_vf_dnn_classify;
extern const FFFilter ff_vf_dnn_detect;
extern const FFFilter ff_vf_dnn_processing;
+extern const FFFilter ff_vf_dnn_clip;
extern const FFFilter ff_vf_doubleweave;
extern const FFFilter ff_vf_drawbox;
extern const FFFilter ff_vf_drawgraph;
diff --git a/libavfilter/dnn/Makefile b/libavfilter/dnn/Makefile
index 3d09927c98..92d374e2bb 100644
--- a/libavfilter/dnn/Makefile
+++ b/libavfilter/dnn/Makefile
@@ -7,5 +7,6 @@ OBJS-$(CONFIG_DNN) += dnn/dnn_backend_common.o
DNN-OBJS-$(CONFIG_LIBTENSORFLOW) += dnn/dnn_backend_tf.o
DNN-OBJS-$(CONFIG_LIBOPENVINO) += dnn/dnn_backend_openvino.o
DNN-OBJS-$(CONFIG_LIBTORCH) += dnn/dnn_backend_torch.o
+DNN-OBJS-$(CONFIG_LIBTORCH) += dnn/dnn_backend_torch_clip.o
OBJS-$(CONFIG_DNN) += $(DNN-OBJS-yes)
diff --git a/libavfilter/dnn/dnn_backend_torch.cpp b/libavfilter/dnn/dnn_backend_torch.cpp
index 2e4326d9d4..5afef3ca60 100644
--- a/libavfilter/dnn/dnn_backend_torch.cpp
+++ b/libavfilter/dnn/dnn_backend_torch.cpp
@@ -23,38 +23,11 @@
* DNN Torch backend implementation.
*/
-#include <torch/torch.h>
-#include <torch/script.h>
-
-extern "C" {
-#include "dnn_io_proc.h"
-#include "dnn_backend_common.h"
-#include "libavutil/opt.h"
-#include "libavutil/mem.h"
-#include "queue.h"
-#include "safe_queue.h"
-}
-
-typedef struct THModel {
- DNNModel model;
- DnnContext *ctx;
- torch::jit::Module *jit_model;
- SafeQueue *request_queue;
- Queue *task_queue;
- Queue *lltask_queue;
-} THModel;
-
-typedef struct THInferRequest {
- torch::Tensor *output;
- torch::Tensor *input_tensor;
-} THInferRequest;
-
-typedef struct THRequestItem {
- THInferRequest *infer_request;
- LastLevelTaskItem *lltask;
- DNNAsyncExecModule exec_module;
-} THRequestItem;
+#include "dnn_backend_torch_common.h"
+#if (CONFIG_LIBTOKENIZERS == 1)
+#include "dnn_backend_torch_clip.h"
+#endif
#define OFFSET(x) offsetof(THOptions, x)
#define FLAGS AV_OPT_FLAG_FILTERING_PARAM
@@ -95,6 +68,12 @@ static void th_free_request(THInferRequest *request)
delete(request->input_tensor);
request->input_tensor = NULL;
}
+ #if (CONFIG_LIBTOKENIZERS == 1)
+ if (request->text_embeddings) {
+ delete(request->text_embeddings);
+ request->text_embeddings = NULL;
+ }
+ #endif
return;
}
@@ -139,6 +118,11 @@ static void dnn_free_model_th(DNNModel **model)
}
ff_queue_destroy(th_model->task_queue);
delete th_model->jit_model;
+ #if (CONFIG_LIBTOKENIZERS == 1)
+ if (th_model->is_clip_model) {
+ free_clip_context(th_model->clip_ctx);
+ }
+ #endif
av_freep(&th_model);
*model = NULL;
}
@@ -185,8 +169,17 @@ static int fill_model_input_th(THModel *th_model, THRequestItem *request)
width_idx = dnn_get_width_idx_by_layout(input.layout);
height_idx = dnn_get_height_idx_by_layout(input.layout);
channel_idx = dnn_get_channel_idx_by_layout(input.layout);
- input.dims[height_idx] = task->in_frame->height;
- input.dims[width_idx] = task->in_frame->width;
+ #if (CONFIG_LIBTOKENIZERS == 1)
+ if (th_model->is_clip_model) {
+ input.dims[height_idx] = 224;
+ input.dims[width_idx] = 224;
+ } else {
+ #endif
+ input.dims[height_idx] = task->in_frame->height;
+ input.dims[width_idx] = task->in_frame->width;
+ #if (CONFIG_LIBTOKENIZERS == 1)
+ }
+ #endif
input.data = av_malloc(input.dims[height_idx] * input.dims[width_idx] *
input.dims[channel_idx] * sizeof(float));
if (!input.data)
@@ -205,6 +198,13 @@ static int fill_model_input_th(THModel *th_model, THRequestItem *request)
}
}
break;
+ #if (CONFIG_LIBTOKENIZERS == 1)
+ case DFT_ANALYTICS_CLIP:
+ if (task->do_ioproc) {
+ ff_frame_to_dnn_clip(task->in_frame, &input, ctx);
+ }
+ break;
+ #endif
default:
avpriv_report_missing_feature(NULL, "model function type %d", th_model->model.func_type);
break;
@@ -212,6 +212,15 @@ static int fill_model_input_th(THModel *th_model, THRequestItem *request)
*infer_request->input_tensor = torch::from_blob(input.data,
{1, input.dims[channel_idx], input.dims[height_idx], input.dims[width_idx]},
deleter, torch::kFloat32);
+ #if (CONFIG_LIBTOKENIZERS == 1)
+ if(th_model->is_clip_model){
+ ret = fill_model_input_clip(th_model, request, input);
+ if (ret < 0) {
+ goto err;
+ }
+ return 0;
+ }
+ #endif
return 0;
err:
@@ -251,6 +260,15 @@ static int th_start_inference(void *args)
}
// Transfer tensor to the same device as model
c10::Device device = (*th_model->jit_model->parameters().begin()).device();
+ #if (CONFIG_LIBTOKENIZERS == 1)
+ if (th_model->is_clip_model) {
+ int ret = forward_clip(th_model,request,device);
+ if(ret < 0){
+ return ret;
+ }
+ return 0;
+ }
+ #endif
if (infer_request->input_tensor->device() != device)
*infer_request->input_tensor = infer_request->input_tensor->to(device);
inputs.push_back(*infer_request->input_tensor);
@@ -273,6 +291,12 @@ static void infer_completion_callback(void *args) {
outputs.order = DCO_RGB;
outputs.layout = DL_NCHW;
outputs.dt = DNN_FLOAT;
+ #if (CONFIG_LIBTOKENIZERS == 1)
+ if (th_model->is_clip_model && sizes.size() == 1) {
+ // Do nothing: the CLIP output has only one dimension, the similarity scores
+ }
+ else
+ #endif
if (sizes.size() == 4) {
// 4 dimensions: [batch_size, channel, height, width]
// this format of data is normally used for video frame SR
@@ -303,6 +327,24 @@ static void infer_completion_callback(void *args) {
task->out_frame->height = outputs.dims[dnn_get_height_idx_by_layout(outputs.layout)];
}
break;
+ #if (CONFIG_LIBTOKENIZERS == 1)
+ case DFT_ANALYTICS_CLIP:
+ if (task->do_ioproc) {
+ // Post process can only deal with CPU memory.
+ if (output->device() != torch::kCPU)
+ *output = output->to(torch::kCPU);
+ outputs.data = output->data_ptr<float>();
+ if (!th_model->model.classify_post_proc) {
+ av_log(th_model->ctx, AV_LOG_ERROR, "clip filter needs to provide post proc\n");
+ goto err;
+ }
+ th_model->model.classify_post_proc(task->in_frame, &outputs, lltask->bbox_index, th_model->model.filter_ctx);
+ } else {
+ task->out_frame->width = outputs.dims[dnn_get_width_idx_by_layout(outputs.layout)];
+ task->out_frame->height = outputs.dims[dnn_get_height_idx_by_layout(outputs.layout)];
+ }
+ break;
+ #endif
default:
avpriv_report_missing_feature(th_model->ctx, "model function type %d", th_model->model.func_type);
goto err;
@@ -413,6 +455,9 @@ static THInferRequest *th_create_inference_request(void)
}
request->input_tensor = NULL;
request->output = NULL;
+ #if (CONFIG_LIBTOKENIZERS == 1)
+ request->text_embeddings = NULL;
+ #endif
return request;
}
@@ -445,6 +490,13 @@ static DNNModel *dnn_load_model_th(DnnContext *ctx, DNNFunctionType func_type, A
th_model->jit_model = new torch::jit::Module;
(*th_model->jit_model) = torch::jit::load(ctx->model_filename);
th_model->jit_model->to(device);
+ #if (CONFIG_LIBTOKENIZERS == 1)
+ th_model->is_clip_model = false;
+ // Check if this is a CLIP model and initialize accordingly
+ if (func_type == DFT_ANALYTICS_CLIP && init_clip_model(th_model, filter_ctx) < 0) {
+ goto fail;
+ }
+ #endif
} catch (const c10::Error& e) {
av_log(ctx, AV_LOG_ERROR, "Failed to load torch model\n");
goto fail;
@@ -545,6 +597,16 @@ static int dnn_execute_model_th(const DNNModel *model, DNNExecBaseParams *exec_p
return AVERROR(EINVAL);
}
+ #if (CONFIG_LIBTOKENIZERS == 1)
+ if(model->func_type == DFT_ANALYTICS_CLIP) {
+ DNNExecZeroShotClassificationParams *params = (DNNExecZeroShotClassificationParams *) exec_params;
+ ret = set_params_clip(th_model, params->labels, params->label_count, params->tokenizer_path);
+ if (ret < 0) {
+ return ret;
+ }
+ }
+ #endif
+
return execute_model_th(request, th_model->lltask_queue);
}
diff --git a/libavfilter/dnn/dnn_backend_torch_clip.cpp b/libavfilter/dnn/dnn_backend_torch_clip.cpp
new file mode 100644
index 0000000000..4cfc5cab9a
--- /dev/null
+++ b/libavfilter/dnn/dnn_backend_torch_clip.cpp
@@ -0,0 +1,356 @@
+/*
+* This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "dnn_backend_torch_clip.h"
+#if (CONFIG_LIBTOKENIZERS == 1)
+
+extern "C" {
+#include "libavutil/mem.h"
+#include "libavutil/log.h"
+#include "libswscale/swscale.h"
+#include "libavformat/avio.h"
+}
+
+static torch::Tensor get_tokens(const THModel *th_model, const std::string& prompt) {
+ DnnContext *ctx = th_model->ctx;
+ const int expected_length = EMBEDDING_SIZE_CLIP;
+
+ try {
+ if (!th_model->clip_ctx || !th_model->clip_ctx->tokenizer) {
+ throw std::runtime_error("Tokenizer not initialized");
+ }
+
+ int32_t start_token = th_model->clip_ctx->tokenizer->TokenToId(START_TOKEN_CLIP);
+ int32_t end_token = th_model->clip_ctx->tokenizer->TokenToId(END_TOKEN_CLIP);
+
+ // Create vector with correct size, filled with padding tokens
+ std::vector<int64_t> padded_ids(expected_length, PADDING_TOKEN_CLIP);
+
+ // Add start token
+ padded_ids[0] = start_token;
+
+ try {
+ // Get tokens from the tokenizer
+ std::vector<int> tokens = th_model->clip_ctx->tokenizer->Encode(prompt);
+
+ // Calculate how many tokens we can copy (leaving space for start and end tokens)
+ const size_t max_text_tokens = expected_length - 2;
+
+ const size_t num_tokens = tokens.size();
+ if(num_tokens > max_text_tokens) {
+ av_log(ctx, AV_LOG_WARNING, "Input text is too long, truncating to %zu tokens\n", max_text_tokens);
+ }
+ // Copy tokens after the start token, truncating to leave room for the end token
+ size_t i;
+ for (i = 0; i < (num_tokens < max_text_tokens ? num_tokens : max_text_tokens); i++) {
+ padded_ids[i + 1] = tokens[i];
+ }
+ padded_ids[i + 1] = end_token;
+
+ auto tensor = torch::from_blob(
+ padded_ids.data(),
+ {1, expected_length},
+ torch::kInt64
+ ).clone();
+
+ return tensor;
+
+ } catch (const std::exception& e) {
+ av_log(ctx, AV_LOG_ERROR, "Token encoding failed: %s\n", e.what());
+ // Return empty tensor with correct dimensions on error
+ return torch::zeros({1, expected_length}, torch::kInt64);
+ }
+
+ } catch (const std::exception& e) {
+ av_log(ctx, AV_LOG_ERROR, "Token generation failed: %s\n", e.what());
+ return torch::zeros({1, expected_length}, torch::kInt64);
+ }
+}
+
+static int load_bytes_from_file(const std::string& path, std::string& data, DnnContext* log_ctx) {
+ AVIOContext *ctx = NULL;
+ int ret;
+ int64_t size;
+
+ ret = avio_open(&ctx, path.c_str(), AVIO_FLAG_READ);
+ if (ret < 0) {
+ av_log(log_ctx, AV_LOG_ERROR, "Cannot open file: %s\n", path.c_str());
+ return ret;
+ }
+
+ size = avio_size(ctx);
+ if (size < 0) {
+ av_log(log_ctx, AV_LOG_ERROR, "Failed to determine file size: %s\n", path.c_str());
+ return size;
+ }
+
+ try {
+ data.resize(size);
+ ret = avio_read(ctx, (unsigned char*)data.data(), size);
+ if (ret < 0) {
+ av_log(log_ctx, AV_LOG_ERROR, "Failed to read file: %s\n", path.c_str());
+ return ret;
+ }
+ if (ret != size) {
+ av_log(log_ctx, AV_LOG_ERROR, "Incomplete read: %s\n", path.c_str());
+ return AVERROR(EIO);
+ }
+ } catch (const std::exception& e) {
+ av_log(log_ctx, AV_LOG_ERROR, "Exception while reading file %s: %s\n",
+ path.c_str(), e.what());
+ return AVERROR(ENOMEM);
+ }
+
+ return 0;
+}
+
+int create_tokenizer(const THModel *th_model, const std::string& tokenizer_path) {
+ //Dont create tokenizer if it already exists
+ if (th_model->clip_ctx->tokenizer) {
+ return 0;
+ }
+
+ std::string blob;
+ int ret = load_bytes_from_file(tokenizer_path, blob, th_model->ctx);
+ if (ret < 0) {
+ return ret;
+ }
+
+ try {
+ th_model->clip_ctx->tokenizer = Tokenizer::FromBlobJSON(blob);
+ } catch (const c10::Error& e) {
+ av_log(th_model->ctx, AV_LOG_ERROR, "Error creating tokenizer: %s\n", e.what());
+ return AVERROR(EINVAL);
+ }
+ return 0;
+}
+
+int init_clip_model(THModel *th_model, const AVFilterContext *filter_ctx) {
+ try {
+ //Should throw exception if not existing
+ auto encode_image = th_model->jit_model->get_method("encode_image");
+ auto encode_text = th_model->jit_model->get_method("encode_text");
+ th_model->is_clip_model = true;
+ th_model->clip_ctx = (THClipContext *)av_mallocz(sizeof(THClipContext));
+ th_model->clip_ctx->logit_scale = std::exp(std::log(1.0f / 0.07f));
+ av_log(th_model->ctx, AV_LOG_INFO,
+ "Successfully initialized CLIP model\n");
+ return 0;
+
+ } catch (const c10::Error& e) {
+ av_log(th_model->ctx, AV_LOG_ERROR,
+ "Error during CLIP model initialization: %s\n", e.what());
+ return AVERROR(EINVAL);
+ }
+}
+
+
+int encode_image_clip(const THModel *th_model, const THRequestItem *request, const c10::Device& device) {
+ THInferRequest *infer_request = request->infer_request;
+ DnnContext *ctx = th_model->ctx;
+
+ try {
+ if (infer_request->input_tensor->device() != device)
+ *infer_request->input_tensor = infer_request->input_tensor->to(device);
+
+ // Apply CLIP specific normalization
+ auto options = torch::TensorOptions().dtype(torch::kFloat32);
+ auto mean = torch::tensor({0.48145466, 0.4578275, 0.40821073}, options).view({1, 3, 1, 1});
+ auto std = torch::tensor({0.26862954, 0.26130258, 0.27577711}, options).view({1, 3, 1, 1});
+
+ *infer_request->input_tensor = (*infer_request->input_tensor - mean) / std;
+
+ // Get image features
+ auto image_features = th_model->jit_model->run_method(
+ "encode_image",
+ *infer_request->input_tensor,
+ true // normalize
+ );
+
+ if (!image_features.isTensor()) {
+ av_log(ctx, AV_LOG_ERROR, "Model returned invalid non-tensor output\n");
+ return AVERROR(EINVAL);
+ }
+ *infer_request->input_tensor = image_features.toTensor();
+ return 0;
+
+ } catch (const c10::Error& e) {
+ av_log(ctx, AV_LOG_ERROR, "Image encoding error: %s\n", e.what());
+ return AVERROR(EINVAL);
+ }
+}
+
+int encode_text_clip(const THModel *th_model, const THRequestItem *request, const c10::Device& device) {
+ THInferRequest *infer_request = request->infer_request;
+ DnnContext *ctx = th_model->ctx;
+ THClipContext *clip_ctx = th_model->clip_ctx;
+ infer_request->text_embeddings = new std::vector<torch::Tensor>();
+
+ try {
+ infer_request->text_embeddings->reserve(clip_ctx->labels.size());
+
+ for (const auto& label : clip_ctx->labels) {
+ torch::Tensor tokens = get_tokens(th_model, label);
+
+ if (tokens.device() != device)
+ tokens = tokens.to(device);
+
+ auto text_embedding = th_model->jit_model->run_method(
+ "encode_text",
+ tokens,
+ true // normalize
+ );
+
+ if (!text_embedding.isTensor()) {
+ av_log(ctx, AV_LOG_ERROR, "Model returned invalid non-tensor output for text encoding\n");
+ return AVERROR(EINVAL);
+ }
+ infer_request->text_embeddings->push_back(text_embedding.toTensor());
+ }
+ return 0;
+ } catch (const c10::Error& e) {
+ av_log(ctx, AV_LOG_ERROR, "Text encoding error: %s\n", e.what());
+ return AVERROR(EINVAL);
+ }
+}
+
+int forward_clip(const THModel *th_model, const THRequestItem *request, const c10::Device& device)
+{
+ int ret;
+ ret = encode_image_clip(th_model, request, device);
+ if (ret < 0) {
+ av_log(th_model->ctx, AV_LOG_ERROR, "Image encoding failed in CLIP preprocessing\n");
+ return ret;
+ }
+ ret = encode_text_clip(th_model, request, device);
+ if (ret < 0) {
+ av_log(th_model->ctx, AV_LOG_ERROR, "Text encoding failed in CLIP preprocessing\n");
+ return ret;
+ }
+ ret = process_clip_similarity(th_model, request, device);
+ if (ret < 0) {
+ av_log(th_model->ctx, AV_LOG_ERROR, "Error in CLIP Similarity calculation\n");
+ return ret;
+ }
+ return 0;
+}
+
+int fill_model_input_clip(const THModel *th_model, const THRequestItem *request, const DNNData& input)
+{
+ DnnContext *ctx = th_model->ctx;
+ THInferRequest *infer_request = request->infer_request;
+ *infer_request->output = infer_request->input_tensor->clone().detach();
+
+ // Verify the clone worked
+ if (!infer_request->output->defined() || infer_request->output->sizes() != infer_request->input_tensor->sizes()) {
+ av_log(ctx, AV_LOG_ERROR, "Tensor cloning failed\n");
+ return AVERROR(EINVAL);
+ }
+
+ int ret;
+ ret = create_tokenizer(th_model, th_model->clip_ctx->tokenizer_path);
+ if(ret < 0) {
+ av_log(ctx, AV_LOG_ERROR, "Error creating tokenizer\n");
+ return ret;
+ }
+ return 0;
+}
+
+int set_params_clip(const THModel *th_model, const char **labels, const int& label_count, const char *tokenizer_path) {
+ if (!labels || label_count <= 0) {
+ av_log(th_model->ctx, AV_LOG_ERROR, "Label file invalid.\n");
+ return AVERROR(EINVAL);
+ }
+
+ std::vector<std::string> label_vector;
+ label_vector.reserve(label_count);
+
+ for (int i = 0; i < label_count; i++) {
+ if (labels[i]) {
+ label_vector.emplace_back(labels[i]);
+ }
+ }
+ th_model->clip_ctx->labels = label_vector;
+ th_model->clip_ctx->tokenizer_path = tokenizer_path;
+ return 0;
+}
+
+static torch::Tensor calculate_clip_similarity_matrix(const torch::Tensor& image_features, const torch::Tensor& text_embedding, const float& logit_scale, DnnContext *ctx, float temperature = 1.0) {
+ try {
+ auto similarity = torch::matmul(image_features, text_embedding.transpose(0,1));
+ similarity = similarity * logit_scale;
+ return similarity.div(temperature);
+ } catch (const c10::Error& e) {
+ av_log(ctx, AV_LOG_ERROR, "Similarity computation failed: %s\n", e.what());
+ return {};
+ }
+}
+
+int process_clip_similarity(const THModel *th_model, const THRequestItem *request, const c10::Device& device) {
+ DnnContext *ctx = th_model->ctx;
+ THInferRequest *infer_request = request->infer_request;
+ std::vector<float> similarity_scores;
+ auto embedding_count = infer_request->text_embeddings->size();
+ similarity_scores.reserve(embedding_count);
+
+ try {
+ if(infer_request->input_tensor->device() != device)
+ *infer_request->input_tensor = infer_request->input_tensor->to(device);
+
+ for (size_t i = 0; i < embedding_count; i++) {
+ if((*infer_request->text_embeddings)[i].device() != device) {
+ (*infer_request->text_embeddings)[i] = (*infer_request->text_embeddings)[i].to(device);
+ }
+ auto similarity = calculate_clip_similarity_matrix(*infer_request->input_tensor, (*infer_request->text_embeddings)[i], th_model->clip_ctx->logit_scale, ctx);
+ auto similarity_value = similarity.item<float>();
+ similarity_scores.push_back(similarity_value);
+
+ av_log(ctx, AV_LOG_DEBUG, "Label %s: logit_value=%.4f\n",
+ th_model->clip_ctx->labels[i].c_str(), similarity_value);
+ }
+
+ // Convert scores to tensor and compute softmax
+ auto scores_tensor = torch::tensor(similarity_scores);
+ auto softmax_scores = torch::softmax(scores_tensor, 0);
+
+ infer_request->output = new torch::Tensor(softmax_scores);
+
+ if (!infer_request->output->defined()) {
+ av_log(ctx, AV_LOG_ERROR, "Failed to create output tensor\n");
+ return AVERROR(EINVAL);
+ }
+ return 0;
+
+ } catch (const c10::Error& e) {
+ av_log(ctx, AV_LOG_ERROR, "CLIP similarity computation error: %s\n", e.what());
+ return AVERROR(EINVAL);
+ } catch (const std::exception& e) {
+ av_log(ctx, AV_LOG_ERROR, "Error computing similarities: %s\n", e.what());
+ return AVERROR(EINVAL);
+ }
+}
+
+void free_clip_context(THClipContext *clip_ctx) {
+ if (!clip_ctx)
+ return;
+
+ clip_ctx->labels.clear();
+ clip_ctx->tokenizer.reset();
+ av_freep(&clip_ctx);
+}
+#endif
\ No newline at end of file
diff --git a/libavfilter/dnn/dnn_backend_torch_clip.h b/libavfilter/dnn/dnn_backend_torch_clip.h
new file mode 100644
index 0000000000..46d40b71c2
--- /dev/null
+++ b/libavfilter/dnn/dnn_backend_torch_clip.h
@@ -0,0 +1,58 @@
+/*
+* This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#ifndef AVFILTER_DNN_DNN_BACKEND_TORCH_CLIP_H
+#define AVFILTER_DNN_DNN_BACKEND_TORCH_CLIP_H
+
+#include "dnn_backend_torch_common.h"
+
+#if (CONFIG_LIBTOKENIZERS == 1)
+#include <string>
+#include <memory>
+#include <vector>
+#include <torch/script.h>
+#include <tokenizers_cpp.h>
+
+using tokenizers::Tokenizer;
+
+typedef struct THClipContext {
+ std::unique_ptr<Tokenizer> tokenizer;
+ std::vector<std::string> labels;
+ std::string tokenizer_path;
+ float logit_scale;
+} THClipContext;
+
+const std::string START_TOKEN_CLIP = "<|startoftext|>";
+const std::string END_TOKEN_CLIP = "<|endoftext|>";
+const int32_t PADDING_TOKEN_CLIP = 0;
+#define EMBEDDING_SIZE_CLIP 77
+
+int init_clip_model(THModel *th_model, const AVFilterContext *filter_ctx);
+int fill_model_input_clip(const THModel *th_model, const THRequestItem *request, const DNNData& input);
+int forward_clip(const THModel *th_model, const THRequestItem *request, const c10::Device& device);
+int process_clip_similarity(const THModel *th_model, const THRequestItem *request, const c10::Device& device);
+
+int create_tokenizer(const THModel *th_model, const std::string& tokenizer_path);
+int encode_image_clip(const THModel *th_model, const THRequestItem *request, const c10::Device& device);
+int encode_text_clip(const THModel *th_model, const THRequestItem *request, const c10::Device& device);
+
+int set_params_clip(const THModel *th_model, const char **labels, const int& label_count,
+ const char *tokenizer_path);
+void free_clip_context(THClipContext *clip_ctx);
+
+#endif
+#endif
\ No newline at end of file
diff --git a/libavfilter/dnn/dnn_backend_torch_common.h b/libavfilter/dnn/dnn_backend_torch_common.h
new file mode 100644
index 0000000000..d0ee7cb4a8
--- /dev/null
+++ b/libavfilter/dnn/dnn_backend_torch_common.h
@@ -0,0 +1,66 @@
+/*
+* This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#ifndef AVFILTER_DNN_DNN_BACKEND_TORCH_COMMON_H
+#define AVFILTER_DNN_DNN_BACKEND_TORCH_COMMON_H
+
+#include <torch/script.h>
+#include <vector>
+
+struct THClipContext;
+
+extern "C" {
+#include "dnn_io_proc.h"
+#include "dnn_backend_common.h"
+#include "libavutil/opt.h"
+#include "libavutil/mem.h"
+#include "queue.h"
+#include "safe_queue.h"
+}
+
+typedef struct THModel {
+ DNNModel model;
+ DnnContext *ctx;
+ torch::jit::Module *jit_model;
+ SafeQueue *request_queue;
+ Queue *task_queue;
+ Queue *lltask_queue;
+
+ #if CONFIG_LIBTOKENIZERS
+ bool is_clip_model;
+ THClipContext *clip_ctx;
+ #endif
+
+} THModel;
+
+typedef struct THInferRequest {
+ torch::Tensor *output;
+ torch::Tensor *input_tensor;
+
+ #if CONFIG_LIBTOKENIZERS
+ std::vector<torch::Tensor> *text_embeddings;
+ #endif
+
+} THInferRequest;
+
+typedef struct THRequestItem {
+ THInferRequest *infer_request;
+ LastLevelTaskItem *lltask;
+ DNNAsyncExecModule exec_module;
+} THRequestItem;
+
+#endif
\ No newline at end of file
diff --git a/libavfilter/dnn/dnn_io_proc.c b/libavfilter/dnn/dnn_io_proc.c
index 826110dab0..b943c3861b 100644
--- a/libavfilter/dnn/dnn_io_proc.c
+++ b/libavfilter/dnn/dnn_io_proc.c
@@ -420,6 +420,58 @@ int ff_frame_to_dnn_classify(AVFrame *frame, DNNData *input, uint32_t bbox_index
return ret;
}
+int ff_frame_to_dnn_clip(AVFrame *frame, DNNData *input, void *log_ctx)
+{
+ struct SwsContext *sws_ctx;
+ int linesizes[4];
+ int ret = 0;
+ enum AVPixelFormat fmt;
+ fmt = AV_PIX_FMT_RGB24;
+ float *float_data = (float *)input->data;
+
+ // Initialize scaling context to 224x224 RGB24
+ sws_ctx = sws_getContext(frame->width, frame->height, frame->format,
+ 224, 224, AV_PIX_FMT_RGB24,
+ SWS_FAST_BILINEAR | SWS_FULL_CHR_H_INT | SWS_ACCURATE_RND,
+ sws_getDefaultFilter(0, 0, 0, 0, 0, 0, 0),
+ NULL,
+ NULL);
+ if (!sws_ctx) {
+ av_log(log_ctx, AV_LOG_ERROR, "Failed to create scale context\n");
+ return AVERROR(EINVAL);
+ }
+
+ ret = av_image_fill_linesizes(linesizes, fmt, 224);
+ if (ret < 0) {
+ av_log(log_ctx, AV_LOG_ERROR, "Unable to get linesizes\n");
+ sws_freeContext(sws_ctx);
+ return ret;
+ }
+
+ // Temporary buffer for RGB24 data
+ uint8_t *rgb_data = av_malloc(224 * 224 * 3);
+ if (!rgb_data) {
+ sws_freeContext(sws_ctx);
+ return AVERROR(ENOMEM);
+ }
+
+ // Scale to RGB24
+ sws_scale(sws_ctx, frame->data, frame->linesize,
+ 0, frame->height,
+ (uint8_t *const [4]){rgb_data, 0, 0, 0}, linesizes);
+
+ // Convert RGB24 to float and normalize to [0,1]
+ for (int i = 0; i < 224 * 224; i++) {
+ float_data[i] = rgb_data[i * 3] / 255.0f; // R
+ float_data[i + 224 * 224] = rgb_data[i * 3 + 1] / 255.0f; // G
+ float_data[i + 2 * 224 * 224] = rgb_data[i * 3 + 2] / 255.0f; // B
+ }
+
+ av_free(rgb_data);
+ sws_freeContext(sws_ctx);
+ return ret;
+}
+
int ff_frame_to_dnn_detect(AVFrame *frame, DNNData *input, void *log_ctx)
{
struct SwsContext *sws_ctx;
diff --git a/libavfilter/dnn/dnn_io_proc.h b/libavfilter/dnn/dnn_io_proc.h
index a3dd94675b..12b9611cea 100644
--- a/libavfilter/dnn/dnn_io_proc.h
+++ b/libavfilter/dnn/dnn_io_proc.h
@@ -34,5 +34,6 @@ int ff_proc_from_frame_to_dnn(AVFrame *frame, DNNData *input, void *log_ctx);
int ff_proc_from_dnn_to_frame(AVFrame *frame, DNNData *output, void *log_ctx);
int ff_frame_to_dnn_detect(AVFrame *frame, DNNData *input, void *log_ctx);
int ff_frame_to_dnn_classify(AVFrame *frame, DNNData *input, uint32_t bbox_index, void *log_ctx);
+int ff_frame_to_dnn_clip(AVFrame *frame, DNNData *input, void *log_ctx);
#endif
diff --git a/libavfilter/dnn_filter_common.c b/libavfilter/dnn_filter_common.c
index 6b9c6f8d7f..9e7cb1f2cd 100644
--- a/libavfilter/dnn_filter_common.c
+++ b/libavfilter/dnn_filter_common.c
@@ -194,6 +194,22 @@ int ff_dnn_execute_model_classification(DnnContext *ctx, AVFrame *in_frame, AVFr
};
return (ctx->dnn_module->execute_model)(ctx->model, &class_params.base);
}
+int ff_dnn_execute_model_clip(DnnContext *ctx, AVFrame *in_frame, AVFrame *out_frame, const char **labels, const char* tokenizer_path, int label_count)
+{
+ DNNExecZeroShotClassificationParams class_params = {
+ {
+ .input_name = ctx->model_inputname,
+ .output_names = (const char **)ctx->model_outputnames,
+ .nb_output = ctx->nb_outputs,
+ .in_frame = in_frame,
+ .out_frame = out_frame,
+ },
+ .labels = labels,
+ .tokenizer_path = tokenizer_path,
+ .label_count = label_count,
+ };
+ return (ctx->dnn_module->execute_model)(ctx->model, &class_params.base);
+}
DNNAsyncStatusType ff_dnn_get_result(DnnContext *ctx, AVFrame **in_frame, AVFrame **out_frame)
{
diff --git a/libavfilter/dnn_filter_common.h b/libavfilter/dnn_filter_common.h
index 42a4719997..9de2c2772d 100644
--- a/libavfilter/dnn_filter_common.h
+++ b/libavfilter/dnn_filter_common.h
@@ -59,6 +59,7 @@ int ff_dnn_get_input(DnnContext *ctx, DNNData *input);
int ff_dnn_get_output(DnnContext *ctx, int input_width, int input_height, int *output_width, int *output_height);
int ff_dnn_execute_model(DnnContext *ctx, AVFrame *in_frame, AVFrame *out_frame);
int ff_dnn_execute_model_classification(DnnContext *ctx, AVFrame *in_frame, AVFrame *out_frame, const char *target);
+int ff_dnn_execute_model_clip(DnnContext *ctx, AVFrame *in_frame, AVFrame *out_frame, const char **labels, const char* tokenizer_path, int label_count);
DNNAsyncStatusType ff_dnn_get_result(DnnContext *ctx, AVFrame **in_frame, AVFrame **out_frame);
int ff_dnn_flush(DnnContext *ctx);
void ff_dnn_uninit(DnnContext *ctx);
diff --git a/libavfilter/dnn_interface.h b/libavfilter/dnn_interface.h
index 66086409be..84bd5e7a8a 100644
--- a/libavfilter/dnn_interface.h
+++ b/libavfilter/dnn_interface.h
@@ -58,6 +58,7 @@ typedef enum {
DFT_PROCESS_FRAME, // process the whole frame
DFT_ANALYTICS_DETECT, // detect from the whole frame
DFT_ANALYTICS_CLASSIFY, // classify for each bounding box
+ DFT_ANALYTICS_CLIP // classify whole frame with zero-shot classification
}DNNFunctionType;
typedef enum {
@@ -90,6 +91,13 @@ typedef struct DNNExecClassificationParams {
const char *target;
} DNNExecClassificationParams;
+typedef struct DNNExecZeroShotClassificationParams {
+ DNNExecBaseParams base;
+ const char **labels;
+ const int label_count;
+ const char *tokenizer_path;
+} DNNExecZeroShotClassificationParams;
+
typedef int (*FramePrePostProc)(AVFrame *frame, DNNData *model, AVFilterContext *filter_ctx);
typedef int (*DetectPostProc)(AVFrame *frame, DNNData *output, uint32_t nb, AVFilterContext *filter_ctx);
typedef int (*ClassifyPostProc)(AVFrame *frame, DNNData *output, uint32_t bbox_index, AVFilterContext *filter_ctx);
diff --git a/libavfilter/vf_dnn_clip.c b/libavfilter/vf_dnn_clip.c
new file mode 100644
index 0000000000..73a81cf02c
--- /dev/null
+++ b/libavfilter/vf_dnn_clip.c
@@ -0,0 +1,342 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * DNN CLIP filter - Zero-shot image classification using CLIP models
+ */
+
+#include "libavutil/file_open.h"
+#include "libavutil/mem.h"
+#include "libavutil/opt.h"
+#include "filters.h"
+#include "dnn_filter_common.h"
+#include "video.h"
+#include "libavutil/time.h"
+#include "libavutil/detection_bbox.h"
+#include "libavutil/avstring.h"
+
+typedef struct DNNCLIPContext {
+ const AVClass *clazz;
+ DnnContext dnnctx;
+ char *labels_filename;
+ char *tokenizer_path;
+ char **labels;
+ int label_count;
+} DNNCLIPContext;
+
+#define OFFSET(x) offsetof(DNNCLIPContext, dnnctx.x)
+#define OFFSET2(x) offsetof(DNNCLIPContext, x)
+#define FLAGS AV_OPT_FLAG_FILTERING_PARAM | AV_OPT_FLAG_VIDEO_PARAM
+
+static const AVOption dnn_clip_options[] = {
+ { "dnn_backend", "DNN backend",
+ OFFSET(backend_type), AV_OPT_TYPE_INT,
+ { .i64 = DNN_TH }, INT_MIN, INT_MAX, FLAGS, .unit = "backend" },
+#if (CONFIG_LIBTORCH == 1)
+ { "torch", "torch backend flag",
+ 0, AV_OPT_TYPE_CONST, { .i64 = DNN_TH }, 0, 0, FLAGS, .unit = "backend" },
+#endif
+ { "labels", "path to text prompts file",
+ OFFSET2(labels_filename), AV_OPT_TYPE_STRING, { .str = NULL }, 0, 0, FLAGS },
+ { "tokenizer", "path to text tokenizer.json file",
+ OFFSET2(tokenizer_path), AV_OPT_TYPE_STRING, { .str = NULL }, 0, 0, FLAGS },
+ { NULL }
+};
+
+AVFILTER_DNN_DEFINE_CLASS(dnn_clip, DNN_TH);
+
+static int dnn_clip_post_proc(AVFrame *frame, DNNData *output, uint32_t bbox_index, AVFilterContext *filter_ctx)
+{
+ DNNCLIPContext *ctx = filter_ctx->priv;
+ const int max_classes_per_box = AV_NUM_DETECTION_BBOX_CLASSIFY;
+ int num_labels = ctx->label_count;
+ float *probabilities = (float*)output->data;
+ int num_bboxes;
+ size_t side_data_size;
+ AVFrameSideData *sd;
+ AVDetectionBBoxHeader *header;
+ AVDetectionBBox *bbox;
+ int i, j;
+ int start_idx, end_idx;
+ int percentage;
+
+ // Calculate number of bounding boxes needed
+ num_bboxes = (num_labels + max_classes_per_box - 1) / max_classes_per_box;
+
+ // Calculate total size needed
+ side_data_size = sizeof(AVDetectionBBoxHeader) +
+ (num_bboxes * sizeof(AVDetectionBBox));
+
+ sd = av_frame_get_side_data(frame, AV_FRAME_DATA_DETECTION_BBOXES);
+ if (sd) {
+ av_log(filter_ctx, AV_LOG_ERROR, "Found Detection Box of Detect Filter. Detect is not compatible with CLIP Filter yet. Detection Boxes get replaced ... %zu\n", side_data_size);
+ av_frame_remove_side_data(frame, AV_FRAME_DATA_DETECTION_BBOXES);
+ }
+
+ sd = av_frame_new_side_data(frame, AV_FRAME_DATA_DETECTION_BBOXES, side_data_size);
+ if (!sd) {
+ av_log(filter_ctx, AV_LOG_ERROR, "Failed to allocate side data of size %zu\n", side_data_size);
+ return AVERROR(ENOMEM);
+ }
+
+ // Zero initialize the entire side data
+ memset(sd->data, 0, side_data_size);
+
+ header = (AVDetectionBBoxHeader *)sd->data;
+ header->nb_bboxes = num_bboxes;
+ header->bbox_size = sizeof(AVDetectionBBox);
+ av_strlcpy(header->source, "clip", sizeof(header->source));
+
+ //Process each bbox
+ for (i = 0; i < num_bboxes; i++) {
+ bbox = av_get_detection_bbox(header, i);
+ if (!bbox) {
+ av_log(filter_ctx, AV_LOG_ERROR, "Failed to get bbox %d\n", i);
+ return AVERROR(EINVAL);
+ }
+
+ // Initialize bbox
+ bbox->x = 0;
+ bbox->y = 0;
+ bbox->w = frame->width;
+ bbox->h = frame->height;
+ bbox->classify_count = 0;
+ bbox->detect_label[0] = '\0';
+
+ start_idx = i * max_classes_per_box;
+ end_idx = FFMIN(num_labels, (i + 1) * max_classes_per_box);
+
+ // Set classifications for this bbox
+ for (j = start_idx; j < end_idx && bbox->classify_count < max_classes_per_box; j++) {
+ if (!ctx->labels[j]) {
+ av_log(filter_ctx, AV_LOG_ERROR, "Invalid label at index %d\n", j);
+ continue;
+ }
+
+ percentage = (int)(probabilities[j] * 100);
+ bbox->classify_confidences[bbox->classify_count] = av_make_q(percentage, 100);
+ av_strlcpy(bbox->classify_labels[bbox->classify_count],
+ ctx->labels[j],
+ sizeof(bbox->classify_labels[0]));
+
+ bbox->classify_count++;
+ }
+ }
+
+ return 0;
+}
+
+static void free_classify_labels(DNNCLIPContext *ctx)
+{
+ for (int i = 0; i < ctx->label_count; i++)
+ av_freep(&ctx->labels[i]);
+ ctx->label_count = 0;
+ av_freep(&ctx->labels);
+}
+
+static int read_classify_label_file(AVFilterContext *context)
+{
+ int line_len;
+ FILE *file;
+ DNNCLIPContext *ctx = context->priv;
+
+ file = avpriv_fopen_utf8(ctx->labels_filename, "r");
+ if (!file) {
+ av_log(context, AV_LOG_ERROR, "Failed to open file %s\n", ctx->labels_filename);
+ return AVERROR(EINVAL);
+ }
+
+ while (!feof(file)) {
+ char *prompt;
+ char buf[256];
+ if (!fgets(buf, sizeof(buf), file))
+ break;
+
+ line_len = strlen(buf);
+ while (line_len) {
+ int i = line_len - 1;
+ if (buf[i] == '\n' || buf[i] == '\r' || buf[i] == ' ') {
+ buf[i] = '\0';
+ line_len--;
+ } else
+ break;
+ }
+
+ if (line_len == 0)
+ continue;
+
+ if (line_len >= AV_DETECTION_BBOX_LABEL_NAME_MAX_SIZE) {
+ av_log(context, AV_LOG_ERROR, "Text prompt %s too long\n", buf);
+ fclose(file);
+ return AVERROR(EINVAL);
+ }
+
+ prompt = av_strdup(buf);
+ if (!prompt) {
+ av_log(context, AV_LOG_ERROR, "Failed to allocate memory for prompt %s\n", buf);
+ fclose(file);
+ return AVERROR(ENOMEM);
+ }
+
+ if (av_dynarray_add_nofree(&ctx->labels, &ctx->label_count, prompt) < 0) {
+ av_log(context, AV_LOG_ERROR, "Failed to add prompt to array\n");
+ fclose(file);
+ av_freep(&prompt);
+ return AVERROR(ENOMEM);
+ }
+ }
+
+ fclose(file);
+ return 0;
+}
+
+static av_cold int dnn_clip_init(AVFilterContext *context)
+{
+ DNNCLIPContext *ctx = context->priv;
+ int ret;
+
+ ret = ff_dnn_init(&ctx->dnnctx, DFT_ANALYTICS_CLIP, context);
+ if (ret < 0)
+ return ret;
+ ff_dnn_set_classify_post_proc(&ctx->dnnctx, dnn_clip_post_proc);
+
+ if (!ctx->labels_filename) {
+ av_log(context, AV_LOG_ERROR, "Text prompts file is required for CLIP classification\n");
+ return AVERROR(EINVAL);
+ }
+ if (!ctx->tokenizer_path) {
+ av_log(context, AV_LOG_ERROR, "Tokenizer file is required for CLIP classification\n");
+ return AVERROR(EINVAL);
+ }
+ return read_classify_label_file(context);
+}
+
+static av_cold void dnn_clip_uninit(AVFilterContext *context)
+{
+ DNNCLIPContext *ctx = context->priv;
+ ff_dnn_uninit(&ctx->dnnctx);
+ free_classify_labels(ctx);
+}
+
+static int dnn_clip_flush_frame(AVFilterLink *outlink, int64_t pts, int64_t *out_pts)
+{
+ DNNCLIPContext *ctx = outlink->src->priv;
+ int ret;
+ DNNAsyncStatusType async_state;
+
+ ret = ff_dnn_flush(&ctx->dnnctx);
+ if (ret != 0) {
+ return -1;
+ }
+
+ do {
+ AVFrame *in_frame = NULL;
+ AVFrame *out_frame = NULL;
+ async_state = ff_dnn_get_result(&ctx->dnnctx, &in_frame, &out_frame);
+ if (async_state == DAST_SUCCESS) {
+ ret = ff_filter_frame(outlink, in_frame);
+ if (ret < 0)
+ return ret;
+ if (out_pts)
+ *out_pts = in_frame->pts + pts;
+ }
+ av_usleep(5000);
+ } while (async_state >= DAST_NOT_READY);
+
+ return 0;
+}
+
+static int dnn_clip_activate(AVFilterContext *filter_ctx)
+{
+ AVFilterLink *inlink = filter_ctx->inputs[0];
+ AVFilterLink *outlink = filter_ctx->outputs[0];
+ DNNCLIPContext *ctx = filter_ctx->priv;
+ AVFrame *in = NULL;
+ int64_t pts;
+ int ret, status;
+ int got_frame = 0;
+ int async_state;
+
+ FF_FILTER_FORWARD_STATUS_BACK(outlink, inlink);
+
+ do {
+ // Process all available input frames
+ ret = ff_inlink_consume_frame(inlink, &in);
+ if (ret < 0)
+ return ret;
+ if (ret > 0) {
+ if (ff_dnn_execute_model_clip(&ctx->dnnctx, in, NULL, ctx->labels, ctx->tokenizer_path, ctx->label_count) != 0) {
+ return AVERROR(EIO);
+ }
+ }
+ } while (ret > 0);
+
+ // Handle processed frames
+ do {
+ AVFrame *in_frame = NULL;
+ AVFrame *out_frame = NULL;
+ async_state = ff_dnn_get_result(&ctx->dnnctx, &in_frame, &out_frame);
+ if (async_state == DAST_SUCCESS) {
+ ret = ff_filter_frame(outlink, in_frame);
+ if (ret < 0)
+ return ret;
+ got_frame = 1;
+ }
+ } while (async_state == DAST_SUCCESS);
+
+ // Schedule next filter if frame was processed
+ if (got_frame)
+ return 0;
+
+ if (ff_inlink_acknowledge_status(inlink, &status, &pts)) {
+ if (status == AVERROR_EOF) {
+ int64_t out_pts = pts;
+ ret = dnn_clip_flush_frame(outlink, pts, &out_pts);
+ ff_outlink_set_status(outlink, status, out_pts);
+ return ret;
+ }
+ }
+
+ FF_FILTER_FORWARD_WANTED(outlink, inlink);
+
+ return 0;
+}
+
+static const enum AVPixelFormat pix_fmts[] = {
+ AV_PIX_FMT_RGB24, AV_PIX_FMT_BGR24,
+ AV_PIX_FMT_GRAY8, AV_PIX_FMT_GRAYF32,
+ AV_PIX_FMT_YUV420P, AV_PIX_FMT_YUV422P, AV_PIX_FMT_YUVJ420P,
+ AV_PIX_FMT_YUV444P, AV_PIX_FMT_YUV410P, AV_PIX_FMT_YUV411P,
+ AV_PIX_FMT_NV12,
+ AV_PIX_FMT_NONE
+};
+
+const FFFilter ff_vf_dnn_clip = {
+ .p.name = "dnn_clip",
+ .p.description = NULL_IF_CONFIG_SMALL("Apply CLIP zero-shot classification."),
+ .p.priv_class = &dnn_clip_class,
+ .preinit = ff_dnn_filter_init_child_class,
+ .priv_size = sizeof(DNNCLIPContext),
+ .init = dnn_clip_init,
+ .uninit = dnn_clip_uninit,
+ .activate = dnn_clip_activate,
+ FILTER_INPUTS(ff_video_default_filterpad),
+ FILTER_OUTPUTS(ff_video_default_filterpad),
+ FILTER_PIXFMTS_ARRAY(pix_fmts),
+};
\ No newline at end of file
--
2.34.1