Hi Vittorio On Wed, Jul 09, 2025 at 09:23:48AM +0200, Vittorio Palmisano wrote: > It adds a new audio filter for running audio transcriptions with the whisper model. I am happy to see someone contribute a whisper filter! [...] > +@example > +ffmpeg -i input.mp4 -vn -af "aformat=sample_rates=16000:channel_layouts=mono,whisper= Is there a reason why we convert to 16 kHz mono here? > +model=../whisper.cpp/models/ggml-base.en.bin\ It would be nice if the models would be in a standard location, so the user just has to specify the model name and not the path. Maybe the filter could check some "standard" locations. I don't know what path is standard, but maybe something like: /usr/local/share/whisper.cpp/models ~/.whisper.cpp/models > +:language=en\ > +:queue=3000\ > +:destination=output.srt\ > +:format=srt" -f null - format can be deduced from the destination file extension. I tried this: ./ffmpeg -i matrixbench_mpeg2.mpg -vn -af "aformat=sample_rates=16000:channel_layouts=mono,whisper=model=/home/michael/whisper.cpp/models/ggml-base.en.bin:language=en:queue=3000:destination=output.srt:format=srt" -f null - but the output.srt is empty (0 bytes) [...] 
> +static void cb_log_disable(enum ggml_log_level, const char *, void *) {} libavfilter/af_whisper.c: In function ‘cb_log_disable’: libavfilter/af_whisper.c:75:28: error: parameter name omitted 75 | static void cb_log_disable(enum ggml_log_level, const char *, void *) {} libavfilter/af_whisper.c:75:49: error: parameter name omitted 75 | static void cb_log_disable(enum ggml_log_level, const char *, void *) {} | ^~~~~~~~~~~~ libavfilter/af_whisper.c:75:63: error: parameter name omitted 75 | static void cb_log_disable(enum ggml_log_level, const char *, void *) {} > + > +static int init(AVFilterContext *ctx) > +{ > + WhisperContext *wctx = ctx->priv; > + > + ggml_backend_load_all(); > + whisper_log_set(cb_log_disable, NULL); > + > + // Init whisper context > + if (!wctx->model_path) > + { > + av_log(ctx, AV_LOG_ERROR, "No whisper model path specified. Use the 'model' option.\n"); > + return AVERROR(EINVAL); > + } > + > + struct whisper_context_params params = whisper_context_default_params(); > + params.use_gpu = wctx->use_gpu; > + params.gpu_device = wctx->gpu_device; > + > + wctx->ctx_wsp = whisper_init_from_file_with_params(wctx->model_path, params); > + if (wctx->ctx_wsp == NULL) > + { > + av_log(ctx, AV_LOG_ERROR, "Failed to initialize whisper context from model: %s\n", wctx->model_path); > + return AVERROR(EIO); > + } > + > + wctx->whisper_state = whisper_init_state(wctx->ctx_wsp); > + if (wctx->whisper_state == NULL) > + { > + av_log(ctx, AV_LOG_ERROR, "Failed to get whisper state from context\n"); > + whisper_free(wctx->ctx_wsp); > + wctx->ctx_wsp = NULL; > + return AVERROR(EIO); > + } > + > + // Init VAD model context > + if (wctx->vad_model_path) > + { > + struct whisper_vad_context_params ctx_params = whisper_vad_default_context_params(); > + ctx_params.n_threads = 4; > + // ctx_params.use_gpu = wctx->use_gpu; TODO (see: whisper_vad_init_context) > + ctx_params.gpu_device = wctx->gpu_device; > + wctx->ctx_vad = whisper_vad_init_from_file_with_params( > + 
wctx->vad_model_path, > + ctx_params); > + > + wctx->vad_params = whisper_vad_default_params(); > + wctx->vad_params.threshold = wctx->vad_threshold; > + wctx->vad_params.min_speech_duration_ms = wctx->vad_min_speech_duration; > + wctx->vad_params.min_silence_duration_ms = wctx->vad_min_silence_duration; > + wctx->vad_params.max_speech_duration_s = (float)(wctx->audio_buffer_queue_size / 1000.0f); the float cast is unneeded > + wctx->vad_params.speech_pad_ms = 0; > + wctx->vad_params.samples_overlap = 0; > + } > + > + // Init buffer > + wctx->audio_buffer_queue_size = WHISPER_SAMPLE_RATE * wctx->queue / 1000; > + wctx->audio_buffer = av_malloc(wctx->audio_buffer_queue_size * sizeof(float)); > + if (!wctx->audio_buffer) > + { > + return AVERROR(ENOMEM); > + } > + > + wctx->audio_buffer_fill_size = 0; > + > + wctx->next_pts = AV_NOPTS_VALUE; > + > + wctx->avio_context = NULL; aren't things already initialized to 0? > + if (wctx->destination && strcmp("", wctx->destination)) > + { > + int ret = 0; useless initialization > + > + if (!strcmp("-", wctx->destination)) > + { > + ret = avio_open(&wctx->avio_context, "pipe:1", AVIO_FLAG_WRITE); > + } > + else > + { > + ret = avio_open(&wctx->avio_context, wctx->destination, AVIO_FLAG_WRITE); > + } const char *dst = wctx->destination; if (!strcmp("-", wctx->destination)) dst = "pipe:1"; int ret = avio_open(&wctx->avio_context, dst, AVIO_FLAG_WRITE); [...] > + if (segments_text) > + { > + av_free(segments_text); > + } the NULL check isn't needed and please use av_freep(&) instead of av_free() as it clears the pointer and that's just more robust thx [...] -- Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB The educated differ from the uneducated as much as the living from the dead. -- Aristotle