From: Michael Niedermayer <michael@niedermayer.cc> To: FFmpeg development discussions and patches <ffmpeg-devel@ffmpeg.org> Subject: Re: [FFmpeg-devel] [PATCH] libavfilter: Whisper audio filter Date: Sun, 20 Jul 2025 03:22:09 +0200 Message-ID: <20250720012209.GW29660@pb2> (raw) In-Reply-To: <20250719125526.389239-1-vpalmisano@gmail.com> [-- Attachment #1.1: Type: text/plain, Size: 6872 bytes --] On Sat, Jul 19, 2025 at 02:55:26PM +0200, Vittorio Palmisano wrote: > It adds a new audio filter for running audio transcriptions with the whisper model. > Documentation and examples are included into the patch. > > Signed-off-by: Vittorio Palmisano <vpalmisano@gmail.com> > --- > configure | 5 + > doc/filters.texi | 107 +++++++++ > libavfilter/Makefile | 2 + > libavfilter/af_whisper.c | 456 +++++++++++++++++++++++++++++++++++++++ > libavfilter/allfilters.c | 2 + > 5 files changed, 572 insertions(+) > create mode 100644 libavfilter/af_whisper.c > [...] > +static int init(AVFilterContext *ctx) > +{ > + WhisperContext *wctx = ctx->priv; > + > + static AVOnce init_static_once = AV_ONCE_INIT; > + ff_thread_once(&init_static_once, ggml_backend_load_all); > + > + whisper_log_set(cb_log, ctx); > + > + // Init whisper context > + if (!wctx->model_path) { > + av_log(ctx, AV_LOG_ERROR, "No whisper model path specified. 
Use the 'model' option.\n"); > + return AVERROR(EINVAL); > + } > + > + struct whisper_context_params params = whisper_context_default_params(); > + params.use_gpu = wctx->use_gpu; > + params.gpu_device = wctx->gpu_device; > + > + wctx->ctx_wsp = whisper_init_from_file_with_params(wctx->model_path, params); > + if (wctx->ctx_wsp == NULL) { > + av_log(ctx, AV_LOG_ERROR, "Failed to initialize whisper context from model: %s\n", wctx->model_path); > + return AVERROR(EIO); > + } > + > + // Init buffer > + wctx->audio_buffer_queue_size = WHISPER_SAMPLE_RATE * wctx->queue / 1000000; > + wctx->audio_buffer = av_malloc_array(wctx->audio_buffer_queue_size, sizeof(*wctx->audio_buffer)); > + if (!wctx->audio_buffer) > + return AVERROR(ENOMEM); > + > + // Init VAD model context > + if (wctx->vad_model_path) { > + struct whisper_vad_context_params ctx_params = whisper_vad_default_context_params(); > + ctx_params.n_threads = ff_filter_get_nb_threads(ctx); > + // ctx_params.use_gpu = wctx->use_gpu; TODO (see: whisper_vad_init_context) > + ctx_params.gpu_device = wctx->gpu_device; > + wctx->ctx_vad = whisper_vad_init_from_file_with_params(wctx->vad_model_path, ctx_params); > + > + wctx->vad_params = whisper_vad_default_params(); > + wctx->vad_params.threshold = wctx->vad_threshold; > + wctx->vad_params.min_speech_duration_ms = wctx->vad_min_speech_duration / 1000; > + wctx->vad_params.min_silence_duration_ms = wctx->vad_min_silence_duration / 1000; > + wctx->vad_params.max_speech_duration_s = wctx->queue / 1000000.0; > + wctx->vad_params.speech_pad_ms = 0; > + wctx->vad_params.samples_overlap = 0; > + } > + > + wctx->next_pts = AV_NOPTS_VALUE; > + > + if (wctx->destination && strcmp("", wctx->destination)) { > + const char *dst = wctx->destination; > + if (!strcmp("-", dst)) > + dst = "pipe:1"; > + int ret = avio_open(&wctx->avio_context, dst, AVIO_FLAG_WRITE); > + > + if (ret < 0) { > + av_log(ctx, AV_LOG_ERROR, "Could not open %s: %s\n", wctx->destination, av_err2str(ret)); > + 
return ret; > + } > + > + wctx->avio_context->direct = AVIO_FLAG_DIRECT; > + } > + > + av_log(ctx, AV_LOG_INFO, > + "Whisper filter initialized: model: %s lang: %s queue: %ld ms\n", > + wctx->model_path, wctx->language, wctx->queue / 1000); > + > + return 0; > +} > + > +static void uninit(AVFilterContext *ctx) > +{ > + WhisperContext *wctx = ctx->priv; > + > + if (wctx->audio_buffer_fill_size > 0) { > + av_log(ctx, AV_LOG_WARNING, > + "Remaining audio buffer %d samples (%d seconds) after stopping\n", > + wctx->audio_buffer_fill_size, wctx->audio_buffer_fill_size / WHISPER_SAMPLE_RATE); > + } > + > + if (wctx->ctx_vad) { > + whisper_vad_free(wctx->ctx_vad); > + wctx->ctx_vad = NULL; > + } > + > + if (wctx->ctx_wsp) { > + whisper_free(wctx->ctx_wsp); > + wctx->ctx_wsp = NULL; > + } > + > + av_freep(&wctx->audio_buffer); > + > + if (wctx->avio_context) > + avio_closep(&wctx->avio_context); > +} > + > +static void run_transcription(AVFilterContext *ctx, AVDictionary **metadata, int frames) > +{ > + WhisperContext *wctx = ctx->priv; > + frames = FFMAX(0, FFMIN(frames, wctx->audio_buffer_fill_size)); I would call it samples, sample_count or nb_samples. Why are you clipping the number of samples? I assume run_transcription() would be called with the correct number, or am I missing something? 
> + > + if (!wctx->ctx_wsp || frames == 0) > + return; > + > + float duration = (float) frames / WHISPER_SAMPLE_RATE; > + > + av_log(ctx, AV_LOG_INFO, > + "run transcription %d/%d samples (%.2f seconds)...\n", frames, wctx->audio_buffer_fill_size, duration); > + > + struct whisper_full_params params = whisper_full_default_params(WHISPER_SAMPLING_GREEDY); > + params.language = wctx->language; > + params.n_threads = ff_filter_get_nb_threads(ctx); > + params.print_special = 0; > + params.print_progress = 0; > + params.print_realtime = 0; > + params.print_timestamps = 0; > + > + if (whisper_full(wctx->ctx_wsp, params, wctx->audio_buffer, frames) != 0) { > + av_log(ctx, AV_LOG_ERROR, "Failed to process audio with whisper.cpp\n"); > + return; > + } > + > + const int64_t timestamp = wctx->frames * 1000 / WHISPER_SAMPLE_RATE; To make this a bit easier to understand, I suggest calling it timestamp_ms, as it's a timestamp in milliseconds and we have timestamps in the timebase too. But that's not really important, just an idea. A bigger problem is that the input frame->pts are not passed through to the output srt/json timestamps. To understand why this is a problem, consider some audio input device which samples at 16 kHz. This hardware contains, let's say for simplicity, a 16 kHz crystal and samples based on that. But depending on the temperature of this crystal it will really sample, let's say, between 15990 and 16010 Hz. So simply counting samples alone is not enough; the frame->pts need to be used too, if the subtitles should be perfectly in sync with the video. It's probably best to give the user the option to produce srt/json times based purely on sample numbers but also on pts. Sorry I am bringing this up just now; I just realized it as I was reviewing the timestamp code. thx [...] -- Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB No great genius has ever existed without some touch of madness. 
-- Aristotle [-- Attachment #1.2: signature.asc --] [-- Type: application/pgp-signature, Size: 195 bytes --] [-- Attachment #2: Type: text/plain, Size: 251 bytes --] _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
next prev parent reply other threads:[~2025-07-20 1:22 UTC|newest] Thread overview: 30+ messages / expand[flat|nested] mbox.gz Atom feed top 2025-07-09 7:23 [FFmpeg-devel] [PATCH] " Vittorio Palmisano 2025-07-09 13:36 ` Marvin Scholz 2025-07-09 15:24 ` Zhao Zhili 2025-07-10 8:43 ` Vittorio Palmisano 2025-07-10 9:47 ` Zhao Zhili 2025-07-10 12:41 ` Michael Niedermayer 2025-07-09 23:37 ` Michael Niedermayer 2025-07-10 8:34 ` Vittorio Palmisano 2025-07-10 10:05 ` Marvin Scholz 2025-07-10 10:20 ` Vittorio Palmisano 2025-07-10 10:25 ` Vittorio Palmisano 2025-07-10 12:20 ` Michael Niedermayer 2025-07-11 8:41 ` Vittorio Palmisano 2025-07-11 9:07 ` Vittorio Palmisano 2025-07-11 19:05 ` Marvin Scholz 2025-07-12 0:03 ` Michael Niedermayer 2025-07-13 11:16 ` Vittorio Palmisano 2025-07-14 10:34 ` Vittorio Palmisano 2025-07-14 21:47 ` Michael Niedermayer 2025-07-15 7:44 ` Vittorio Palmisano 2025-07-17 8:51 ` Vittorio Palmisano 2025-07-19 0:15 ` Michael Niedermayer 2025-07-19 12:55 ` [FFmpeg-devel] [PATCH] libavfilter: " Vittorio Palmisano 2025-07-20 1:22 ` Michael Niedermayer [this message] 2025-07-19 12:58 ` [FFmpeg-devel] [PATCH] " Vittorio Palmisano 2025-07-18 23:24 ` Michael Niedermayer 2025-07-10 11:31 ` Michael Niedermayer 2025-07-10 12:07 ` Nicolas George 2025-07-10 12:10 ` Nicolas George 2025-07-09 23:41 ` Michael Niedermayer
Reply instructions: You may reply publicly to this message via plain-text email using any one of the following methods: * Save the following mbox file, import it into your mail client, and reply-to-all from there: mbox Avoid top-posting and favor interleaved quoting: https://en.wikipedia.org/wiki/Posting_style#Interleaved_style * Reply using the --to, --cc, and --in-reply-to switches of git-send-email(1): git send-email \ --in-reply-to=20250720012209.GW29660@pb2 \ --to=michael@niedermayer.cc \ --cc=ffmpeg-devel@ffmpeg.org \ /path/to/YOUR_REPLY https://kernel.org/pub/software/scm/git/docs/git-send-email.html * If your mail client supports setting the In-Reply-To header via mailto: links, try the mailto: link
Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel This inbox may be cloned and mirrored by anyone: git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git # If you have public-inbox 1.1+ installed, you may # initialize and index your mirror using the following commands: public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \ ffmpegdev@gitmailbox.com public-inbox-index ffmpegdev Example config snippet for mirrors. AGPL code for this site: git clone https://public-inbox.org/public-inbox.git