* [FFmpeg-devel] [PATCH v2 FFmpeg 15/20] libavfilter/dnn/dnn_backend_torch: Audio and Video preprocessing for CLIP/CLAP models
Date: 2025-03-10 19:55 UTC
From: m.kaindl0208
To: ffmpeg-devel

Add audio preprocessing for CLAP models and image preprocessing for CLIP
models to the Torch backend. Audio frames are validated against the
configured sample rate and sample format; clips shorter than the target
length are padded by tiling, and longer clips are cropped to a seeded,
center-biased segment. Images are resized to the model input resolution
with bicubic interpolation.

Signed-off-by: MaximilianKaindl <m.kaindl0208@gmail.com>
---
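Notes (not part of the commit message): below is a minimal standalone
sketch of the audio padding/cropping strategy implemented by this patch,
for reviewers who want to try it against plain libtorch. The clip values
and the 8-sample target are illustrative assumptions, not taken from the
patch:

    #include <torch/torch.h>
    #include <iostream>

    int main() {
        // Short clip: tile it until the target length is covered, then truncate.
        torch::Tensor clip = torch::tensor({1.0f, 2.0f, 3.0f});
        int64_t target = 8;
        int64_t n = clip.size(0);
        int64_t repeat_factor = (target + n - 1) / n;         // ceil(8 / 3) = 3
        torch::Tensor padded =
            clip.repeat({repeat_factor}).slice(0, 0, target); // 1 2 3 1 2 3 1 2
        std::cout << padded << "\n";

        // Long clip: take one target-sized window. The patch derives the
        // start index from a per-frame seed and prefers the center segment.
        torch::Tensor longclip = torch::arange(16, torch::kFloat32);
        int64_t start = (longclip.size(0) - target) / 2;      // center window
        std::cout << longclip.slice(0, start, start + target) << "\n";
        return 0;
    }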
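For the image path, preprocessing amounts to a single bicubic resize of an
NCHW tensor to a square resolution. A toy equivalent follows; the 224x224
target is an assumed example (a typical CLIP input size), not read from the
patch:

    #include <torch/torch.h>

    int main() {
        torch::Tensor img = torch::rand({1, 3, 480, 640});    // NCHW input
        int64_t res = 224;                                    // assumed resolution
        torch::Tensor out = torch::nn::functional::interpolate(
            img,
            torch::nn::functional::InterpolateFuncOptions()
                .size(std::vector<int64_t>{res, res})
                .mode(torch::kBicubic)
                .align_corners(false));
        // out.sizes() == {1, 3, 224, 224}
        return 0;
    }
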
libavfilter/dnn/dnn_backend_torch.cpp | 131 ++++++++++++++++++++++++++
1 file changed, 131 insertions(+)
diff --git a/libavfilter/dnn/dnn_backend_torch.cpp b/libavfilter/dnn/dnn_backend_torch.cpp
index 12ba2674b3..1d2bfb191a 100644
--- a/libavfilter/dnn/dnn_backend_torch.cpp
+++ b/libavfilter/dnn/dnn_backend_torch.cpp
@@ -458,6 +458,137 @@ static torch::Tensor apply_softmax(torch::Tensor input_tensor, float temperature
     }
 }
+static torch::Tensor handle_short_audio_tensor(torch::Tensor audio_tensor, int target_samples)
+{
+    int nb_samples = audio_tensor.size(0);
+    int repeat_factor = (target_samples + nb_samples - 1) / nb_samples;
+
+    // Repeat tensor along dimension 0 to fill required length
+    torch::Tensor repeated = audio_tensor.repeat({repeat_factor});
+
+    // Take only the needed samples
+    return repeated.slice(0, 0, target_samples);
+}
+
+static torch::Tensor handle_long_audio_tensor(torch::Tensor audio_tensor, int target_samples, TaskItem *task)
+{
+    int nb_samples = audio_tensor.size(0);
+    int max_start = nb_samples - target_samples;
+
+    // Derive a seed from the task pointer and the frame pts; the chosen
+    // segment is therefore stable for a given frame within a run
+    unsigned int seed =
+        (unsigned int)((uintptr_t)task ^ (uintptr_t)(task->in_frame->pts ? task->in_frame->pts : nb_samples));
+
+    int start_idx;
+
+    // Prefer the center segment for better representation, with some seeded variation
+    if (seed % 3 == 0) { // ~33% chance for the center segment
+        start_idx = (nb_samples - target_samples) / 2;
+    } else {
+        // Otherwise use a seeded start position
+        start_idx = seed % (max_start + 1);
+    }
+
+    // Extract the segment using a slice operation
+    return audio_tensor.slice(0, start_idx, start_idx + target_samples);
+}
+
+static int prepare_audio_tensor(const THModel *th_model, const THRequestItem *request)
+{
+    THInferRequest *infer_request = request->infer_request;
+    LastLevelTaskItem *lltask = request->lltasks[0];
+    TaskItem *task = lltask->task;
+    DnnContext *ctx = th_model->ctx;
+    int ret = 0;
+
+    const int target_samples = ctx->torch_option.sample_rate * ctx->torch_option.sample_duration;
+
+    // Validate the input frame
+    if (!task->in_frame->data[0]) {
+        av_log(ctx, AV_LOG_ERROR, "Invalid frame input data\n");
+        return AVERROR(EINVAL);
+    }
+
+    // Get the audio data from the frame
+    float *audio_data = (float *)task->in_frame->data[0];
+    int nb_samples = task->in_frame->nb_samples;
+
+    // Validate the audio parameters
+    if (task->in_frame->sample_rate != ctx->torch_option.sample_rate) {
+        av_log(ctx, AV_LOG_ERROR, "Sample rate mismatch. Expected %ld Hz, got %d Hz\n",
+               ctx->torch_option.sample_rate, task->in_frame->sample_rate);
+        return AVERROR(EINVAL);
+    }
+
+    if (task->in_frame->format != AV_SAMPLE_FMT_FLT) {
+        av_log(ctx, AV_LOG_ERROR, "Unsupported sample format. Expected AV_SAMPLE_FMT_FLT\n");
+        return AVERROR(EINVAL);
+    }
+
+    try {
+        torch::Tensor audio_tensor = torch::from_blob(audio_data, {nb_samples}, torch::kFloat32).clone();
+
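+        // Move the samples to the device where the model weights live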
+        c10::Device device = (*th_model->jit_model->parameters().begin()).device();
+        if (audio_tensor.device() != device) {
+            audio_tensor = audio_tensor.to(device);
+        }
+
+        // Create target tensor based on the audio length
+        torch::Tensor processed_tensor;
+
+        if (nb_samples < target_samples) {
+            // Handle short audio using tensor repeat operation
+            processed_tensor = handle_short_audio_tensor(audio_tensor, target_samples);
+        } else if (nb_samples > target_samples) {
+            // Handle long audio using tensor slice operation
+            processed_tensor = handle_long_audio_tensor(audio_tensor, target_samples, task);
+        } else {
+            // Exact length, just use the tensor as is
+            processed_tensor = audio_tensor;
+        }
+
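+        // Add a batch dimension: [samples] -> [1, samples]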
+        processed_tensor = processed_tensor.reshape({1, -1});
+
+        // Assign to output
+        *infer_request->input_tensor = processed_tensor;
+    } catch (const c10::Error &e) {
+        av_log(ctx, AV_LOG_ERROR, "Audio tensor processing failed: %s\n", e.what());
+        return AVERROR(EINVAL);
+    } catch (const std::exception &e) {
+        av_log(ctx, AV_LOG_ERROR, "Audio tensor processing failed: %s\n", e.what());
+        return AVERROR(EINVAL);
+    }
+
+    return ret;
+}
+
+static int preprocess_image_tensor(const THModel *th_model, torch::Tensor *input_tensor, const c10::Device &device)
+{
+    DnnContext *ctx = th_model->ctx;
+    try {
+        if (input_tensor->device() != device) {
+            *input_tensor = input_tensor->to(device);
+        }
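+        // Resize to the square input resolution expected by the model (bicubic)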
+        *input_tensor = torch::nn::functional::interpolate(
+            *input_tensor,
+            torch::nn::functional::InterpolateFuncOptions()
+                .size(std::vector<int64_t>{ctx->torch_option.input_resolution, ctx->torch_option.input_resolution})
+                .mode(torch::kBicubic)
+                .align_corners(false));
+        return 0;
+    } catch (const c10::Error &e) {
+        av_log(ctx, AV_LOG_ERROR, "Image preprocessing failed: %s\n", e.what());
+        return AVERROR(EINVAL);
+    } catch (const std::exception &e) {
+        av_log(ctx, AV_LOG_ERROR, "Image preprocessing failed: %s\n", e.what());
+        return AVERROR(EINVAL);
+    }
+}
+
 static int fill_model_input_th(THModel *th_model, THRequestItem *request)
 {
     LastLevelTaskItem *lltask = NULL;
--
2.34.1