From: <m.kaindl0208@gmail.com>
To: <ffmpeg-devel@ffmpeg.org>
Subject: [FFmpeg-devel] [PATCH v2 FFmpeg 15/20] libavfilter/dnn/dnn_backend_torch: Audio and Video preprocessing for CLIP/CLAP models
Date: Mon, 10 Mar 2025 20:55:14 +0100
Message-ID: <004301db91f6$56fa2ce0$04ee86a0$@gmail.com>

Signed-off-by: MaximilianKaindl <m.kaindl0208@gmail.com>
---
 libavfilter/dnn/dnn_backend_torch.cpp | 128 ++++++++++++++++++++++++++
 1 file changed, 128 insertions(+)

diff --git a/libavfilter/dnn/dnn_backend_torch.cpp b/libavfilter/dnn/dnn_backend_torch.cpp
index 12ba2674b3..1d2bfb191a 100644
--- a/libavfilter/dnn/dnn_backend_torch.cpp
+++ b/libavfilter/dnn/dnn_backend_torch.cpp
@@ -458,6 +458,134 @@ static torch::Tensor apply_softmax(torch::Tensor input_tensor, float temperature
     }
 }
 
+static torch::Tensor handle_short_audio_tensor(torch::Tensor audio_tensor, int target_samples)
+{
+    int nb_samples = audio_tensor.size(0);
+    int repeat_factor = (target_samples + nb_samples - 1) / nb_samples;
+
+    // Repeat tensor along dimension 0 to fill required length
+    torch::Tensor repeated = audio_tensor.repeat({repeat_factor});
+
+    // Take only the needed samples
+    return repeated.slice(0, 0, target_samples);
+}
+
+static torch::Tensor handle_long_audio_tensor(torch::Tensor audio_tensor, int target_samples, TaskItem *task)
+{
+    int nb_samples = audio_tensor.size(0);
+    int max_start = nb_samples - target_samples;
+
+    // Use a deterministic seed based on frame properties
+    unsigned int seed =
+        (unsigned int)((uintptr_t)task ^ (uintptr_t)(task->in_frame->pts ? task->in_frame->pts : nb_samples));
+
+    // Determine start position - center-biased for better representation
+    int start_idx;
+
+    // Prefer center segments for better representation, with some randomness
+    if (seed % 3 == 0) { // ~33% chance for center segment
+        start_idx = (nb_samples - target_samples) / 2;
+    } else {
+        // Otherwise use seeded position
+        start_idx = seed % (max_start + 1);
+    }
+
+    // Extract the segment using slice operation
+    return audio_tensor.slice(0, start_idx, start_idx + target_samples);
+}
+
+static int prepare_audio_tensor(const THModel *th_model, const THRequestItem *request)
+{
+    THInferRequest *infer_request = request->infer_request;
+    LastLevelTaskItem *lltask = request->lltasks[0];
+    TaskItem *task = lltask->task;
+    DnnContext *ctx = th_model->ctx;
+    int ret = 0;
+
+    const int target_samples = th_model->ctx->torch_option.sample_rate * th_model->ctx->torch_option.sample_duration;
+
+    // Validate input frame
+    if (!task->in_frame->data[0]) {
+        av_log(ctx, AV_LOG_ERROR, "Invalid frame input data\n");
+        return AVERROR(EINVAL);
+    }
+
+    // Get audio data from the frame
+    float *audio_data = (float *)task->in_frame->data[0];
+    int nb_samples = task->in_frame->nb_samples;
+
+    // Validate audio parameters
+    if (task->in_frame->sample_rate != th_model->ctx->torch_option.sample_rate) {
+        av_log(ctx, AV_LOG_ERROR, "Sample rate mismatch. Expected %ld Hz, got %d Hz\n",
+               th_model->ctx->torch_option.sample_rate, task->in_frame->sample_rate);
+        return AVERROR(EINVAL);
+    }
+
+    if (task->in_frame->format != AV_SAMPLE_FMT_FLT) {
+        av_log(ctx, AV_LOG_ERROR, "Unsupported sample format. Expected float\n");
+        return AVERROR(EINVAL);
+    }
+
+    try {
+        torch::Tensor audio_tensor = torch::from_blob(audio_data, {nb_samples}, torch::kFloat32).clone();
+
+        c10::Device device = (*th_model->jit_model->parameters().begin()).device();
+        if (audio_tensor.device() != device) {
+            audio_tensor = audio_tensor.to(device);
+        }
+
+        // Create target tensor based on the audio length
+        torch::Tensor processed_tensor;
+
+        if (nb_samples < target_samples) {
+            // Handle short audio using tensor repeat operation
+            processed_tensor = handle_short_audio_tensor(audio_tensor, target_samples);
+        } else if (nb_samples > target_samples) {
+            // Handle long audio using tensor slice operation
+            processed_tensor = handle_long_audio_tensor(audio_tensor, target_samples, task);
+        } else {
+            // Exact length, just use the tensor as is
+            processed_tensor = audio_tensor;
+        }
+
+        processed_tensor = processed_tensor.reshape({1, -1});
+
+        // Assign to output
+        *infer_request->input_tensor = processed_tensor;
+    } catch (const c10::Error &e) {
+        av_log(ctx, AV_LOG_ERROR, "Audio tensor processing failed: %s\n", e.what());
+        return AVERROR(EINVAL);
+    } catch (const std::exception &e) {
+        av_log(ctx, AV_LOG_ERROR, "Audio tensor processing failed: %s\n", e.what());
+        return AVERROR(EINVAL);
+    }
+
+    return ret;
+}
+
+static int preprocess_image_tensor(const THModel *th_model, torch::Tensor *input_tensor, const c10::Device &device)
+{
+    DnnContext *ctx = th_model->ctx;
+    try {
+        if (input_tensor->device() != device) {
+            *input_tensor = input_tensor->to(device);
+        }
+        *input_tensor = torch::nn::functional::interpolate(
+            *input_tensor,
+            torch::nn::functional::InterpolateFuncOptions()
+                .size(std::vector<int64_t>{ctx->torch_option.input_resolution, ctx->torch_option.input_resolution})
+                .mode(torch::kBicubic)
+                .align_corners(false));
+        return 0;
+    } catch (const c10::Error &e) {
+        av_log(ctx, AV_LOG_ERROR, "Image encoding error: %s\n", e.what());
+        return AVERROR(EINVAL);
+    } catch (const std::exception &e) {
+        av_log(ctx, AV_LOG_ERROR, "Image encoding error: %s\n", e.what());
+        return AVERROR(EINVAL);
+    }
+}
+
 static int fill_model_input_th(THModel *th_model, THRequestItem *request)
 {
     LastLevelTaskItem *lltask = NULL;
-- 
2.34.1
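
For reference, the fixed-length audio preparation above can be exercised
outside FFmpeg with a minimal standalone LibTorch program. This is a
sketch of the same repeat-and-slice technique, not part of the patch:
fit_to_length and the 48000 Hz * 7 s target are hypothetical stand-ins
for the patch's ctx->torch_option.sample_rate and sample_duration.

    #include <torch/torch.h>
    #include <iostream>

    // Sketch of the patch's fixed-length audio preparation. All names
    // and constants here are hypothetical; the real values come from
    // ctx->torch_option in dnn_backend_torch.cpp.
    static torch::Tensor fit_to_length(torch::Tensor audio, int64_t target_samples,
                                       uint64_t seed)
    {
        int64_t nb_samples = audio.size(0);
        if (nb_samples < target_samples) {
            // Short input: tile the signal, then cut to the exact length
            // (the repeat-and-slice approach of handle_short_audio_tensor).
            int64_t repeat_factor = (target_samples + nb_samples - 1) / nb_samples;
            return audio.repeat({repeat_factor}).slice(0, 0, target_samples);
        }
        if (nb_samples > target_samples) {
            // Long input: pick one segment, center-biased roughly one time
            // in three, as in handle_long_audio_tensor.
            int64_t max_start = nb_samples - target_samples;
            int64_t start = (seed % 3 == 0) ? max_start / 2
                                            : (int64_t)(seed % (uint64_t)(max_start + 1));
            return audio.slice(0, start, start + target_samples);
        }
        return audio; // Exact length already, pass through unchanged.
    }

    int main()
    {
        const int64_t target = 48000 * 7; // hypothetical sample_rate * sample_duration
        torch::Tensor short_clip = torch::randn({16000});
        torch::Tensor long_clip  = torch::randn({1000000});

        std::cout << fit_to_length(short_clip, target, 1).sizes() << "\n"; // [336000]
        std::cout << fit_to_length(long_clip,  target, 1).sizes() << "\n"; // [336000]
    }

The image path needs no such helper: preprocess_image_tensor resizes
directly to the square input_resolution via
torch::nn::functional::interpolate with bicubic mode, as shown in the
diff above.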