From: Michael Niedermayer <michael@niedermayer.cc> To: FFmpeg development discussions and patches <ffmpeg-devel@ffmpeg.org> Subject: Re: [FFmpeg-devel] [PATCH] Whisper audio filter Date: Sat, 19 Jul 2025 02:15:53 +0200 Message-ID: <20250719001553.GP29660@pb2> (raw) In-Reply-To: <20250717085157.88889-1-vpalmisano@gmail.com> [-- Attachment #1.1: Type: text/plain, Size: 7350 bytes --] Hi Vittorio On Thu, Jul 17, 2025 at 10:51:57AM +0200, Vittorio Palmisano wrote: > It adds a new audio filter for running audio transcriptions with the whisper model. > Documentation and examples are included into the patch. > > Signed-off-by: Vittorio Palmisano <vpalmisano@gmail.com> > --- > configure | 5 + > doc/filters.texi | 107 +++++++++ > libavfilter/Makefile | 2 + > libavfilter/af_whisper.c | 452 +++++++++++++++++++++++++++++++++++++++ > libavfilter/allfilters.c | 2 + > 5 files changed, 568 insertions(+) > create mode 100644 libavfilter/af_whisper.c [...] > +static void cb_log(enum ggml_log_level level, const char *text, void *user_data) > +{ > + AVFilterContext *ctx = (AVFilterContext *) user_data; > + switch (level) { > + case GGML_LOG_LEVEL_ERROR: > + av_log(ctx, AV_LOG_ERROR, "%s", text); > + break; > + case GGML_LOG_LEVEL_WARN: > + av_log(ctx, AV_LOG_WARNING, "%s", text); > + break; > + case GGML_LOG_LEVEL_INFO: > + case GGML_LOG_LEVEL_DEBUG: > + av_log(ctx, AV_LOG_DEBUG, "%s", text); > + break; > + } > +} you can factor the function calls out of the switch/case > + > +static int init(AVFilterContext *ctx) > +{ > + WhisperContext *wctx = ctx->priv; > + > + static AVOnce init_static_once = AV_ONCE_INIT; > + ff_thread_once(&init_static_once, ggml_backend_load_all); > + > + whisper_log_set(cb_log, ctx); > + > + // Init whisper context > + if (!wctx->model_path) { > + av_log(ctx, AV_LOG_ERROR, "No whisper model path specified. 
Use the 'model' option.\n"); > + return AVERROR(EINVAL); > + } > + > + struct whisper_context_params params = whisper_context_default_params(); > + params.use_gpu = wctx->use_gpu; > + params.gpu_device = wctx->gpu_device; > + > + wctx->ctx_wsp = whisper_init_from_file_with_params(wctx->model_path, params); > + if (wctx->ctx_wsp == NULL) { > + av_log(ctx, AV_LOG_ERROR, "Failed to initialize whisper context from model: %s\n", wctx->model_path); > + return AVERROR(EIO); > + } > + > + // Init buffer > + wctx->audio_buffer_queue_size = WHISPER_SAMPLE_RATE * wctx->queue / 1000000; The multiplication can overflow; also, the 32bit output could overflow. Best is probably to limit queue to a more reasonable value than INT64_MAX > + wctx->audio_buffer = av_malloc(wctx->audio_buffer_queue_size * sizeof(*wctx->audio_buffer)); av_calloc() or av_malloc_array() [...]
> + wctx->timestamp += duration * 1000; floats are not precise and the accumulated rounding errors will add up and lead to synchronization issues between the subtitles and audio or video over a long enough timespan. Also, for reproducibility this should use integers. What you could do, is to use: wctx->timestamp += end_pos; and then replace every use of wctx->timestamp by wctx->timestamp / WHISPER_SAMPLE_RATE or wctx->timestamp / (double)WHISPER_SAMPLE_RATE if the context demands a double for example that way the code is exact and no errors accumulate > + > + if (metadata && segments_text) { > + av_dict_set(metadata, "lavfi.whisper.text", segments_text, 0); > + char *duration_text = av_asprintf("%f", duration); > + av_dict_set(metadata, "lavfi.whisper.duration", duration_text, AV_DICT_DONT_STRDUP_VAL); > + } > + av_freep(&segments_text); > + > + memcpy(wctx->audio_buffer, wctx->audio_buffer + end_pos, end_pos * sizeof(*wctx->audio_buffer)); > + wctx->audio_buffer_fill_size -= end_pos; > + wctx->audio_buffer_vad_size = wctx->audio_buffer_fill_size; > +} > + > +static int filter_frame(AVFilterLink *inlink, AVFrame *frame) > +{ > + AVFilterContext *ctx = inlink->dst; > + WhisperContext *wctx = ctx->priv; > + AVFilterLink *outlink = ctx->outputs[0]; > + AVDictionary **metadata = &frame->metadata; > + > + const int samples = frame->nb_samples; > + const float *input_data = (const float *) frame->data[0]; > + > + if (wctx->audio_buffer_fill_size + samples > wctx->audio_buffer_queue_size) { > + run_transcription(ctx, metadata, wctx->audio_buffer_fill_size); > + } > + > + memcpy(wctx->audio_buffer + wctx->audio_buffer_fill_size, input_data, samples * sizeof(*wctx->audio_buffer)); > + wctx->audio_buffer_fill_size += samples; > + > + if (wctx->ctx_vad > + && (wctx->audio_buffer_fill_size - wctx->audio_buffer_vad_size) >= > + WHISPER_SAMPLE_RATE * (wctx->vad_min_speech_duration + wctx->vad_min_silence_duration) / 1000000) { > + struct whisper_vad_segments *segments = 
whisper_vad_segments_from_samples(wctx->ctx_vad, > + wctx->vad_params, > + wctx->audio_buffer, > + wctx->audio_buffer_fill_size); > + wctx->audio_buffer_vad_size = wctx->audio_buffer_fill_size; > + > + if (!segments) { > + av_log(ctx, AV_LOG_ERROR, "failed to detect VAD\n"); > + } else { > + int n_segments = whisper_vad_segments_n_segments(segments); > + > + if (n_segments > 0) { > + const float start_ms = whisper_vad_segments_get_segment_t0(segments, 0) * 10.0; > + const float end_ms = whisper_vad_segments_get_segment_t1(segments, n_segments - 1) * 10.0; > + int end_pos = (int) (end_ms * WHISPER_SAMPLE_RATE / 1000); > + > + if (end_pos <= wctx->audio_buffer_fill_size - WHISPER_SAMPLE_RATE * wctx->vad_min_silence_duration / 1000000) { > + av_log(ctx, AV_LOG_INFO, > + "VAD detected %d segments, start: %.0f ms, end: %.0f ms (buffer: %d ms)\n", > + n_segments, start_ms, end_ms, 1000 * wctx->audio_buffer_fill_size / WHISPER_SAMPLE_RATE); > + run_transcription(ctx, metadata, end_pos); > + } > + } > + > + whisper_vad_free_segments(segments); > + } > + } else if (wctx->audio_buffer_fill_size >= wctx->audio_buffer_queue_size) > + run_transcription(ctx, metadata, wctx->audio_buffer_fill_size); > + > + wctx->next_pts = frame->pts + av_rescale_q(frame->nb_samples, (AVRational) { > + 1, inlink->sample_rate} > + , inlink->time_base); I think you should consistently use samples or frame->nb_samples; they are the same value, I think thx [...] -- Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB Never trust a computer, one day, it may think you are the virus. -- Compn [-- Attachment #1.2: signature.asc --] [-- Type: application/pgp-signature, Size: 195 bytes --] [-- Attachment #2: Type: text/plain, Size: 251 bytes --] _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
next prev parent reply other threads:[~2025-07-19 0:16 UTC|newest] Thread overview: 27+ messages / expand[flat|nested] mbox.gz Atom feed top 2025-07-09 7:23 Vittorio Palmisano 2025-07-09 13:36 ` Marvin Scholz 2025-07-09 15:24 ` Zhao Zhili 2025-07-10 8:43 ` Vittorio Palmisano 2025-07-10 9:47 ` Zhao Zhili 2025-07-10 12:41 ` Michael Niedermayer 2025-07-09 23:37 ` Michael Niedermayer 2025-07-10 8:34 ` Vittorio Palmisano 2025-07-10 10:05 ` Marvin Scholz 2025-07-10 10:20 ` Vittorio Palmisano 2025-07-10 10:25 ` Vittorio Palmisano 2025-07-10 12:20 ` Michael Niedermayer 2025-07-11 8:41 ` Vittorio Palmisano 2025-07-11 9:07 ` Vittorio Palmisano 2025-07-11 19:05 ` Marvin Scholz 2025-07-12 0:03 ` Michael Niedermayer 2025-07-13 11:16 ` Vittorio Palmisano 2025-07-14 10:34 ` Vittorio Palmisano 2025-07-14 21:47 ` Michael Niedermayer 2025-07-15 7:44 ` Vittorio Palmisano 2025-07-17 8:51 ` Vittorio Palmisano 2025-07-19 0:15 ` Michael Niedermayer [this message] 2025-07-18 23:24 ` Michael Niedermayer 2025-07-10 11:31 ` Michael Niedermayer 2025-07-10 12:07 ` Nicolas George 2025-07-10 12:10 ` Nicolas George 2025-07-09 23:41 ` Michael Niedermayer
Reply instructions: You may reply publicly to this message via plain-text email using any one of the following methods: * Save the following mbox file, import it into your mail client, and reply-to-all from there: mbox Avoid top-posting and favor interleaved quoting: https://en.wikipedia.org/wiki/Posting_style#Interleaved_style * Reply using the --to, --cc, and --in-reply-to switches of git-send-email(1): git send-email \ --in-reply-to=20250719001553.GP29660@pb2 \ --to=michael@niedermayer.cc \ --cc=ffmpeg-devel@ffmpeg.org \ /path/to/YOUR_REPLY https://kernel.org/pub/software/scm/git/docs/git-send-email.html * If your mail client supports setting the In-Reply-To header via mailto: links, try the mailto: link
Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel This inbox may be cloned and mirrored by anyone: git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git # If you have public-inbox 1.1+ installed, you may # initialize and index your mirror using the following commands: public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \ ffmpegdev@gitmailbox.com public-inbox-index ffmpegdev Example config snippet for mirrors. AGPL code for this site: git clone https://public-inbox.org/public-inbox.git