From: Michael Niedermayer <michael@niedermayer.cc>
To: FFmpeg development discussions and patches <ffmpeg-devel@ffmpeg.org>
Subject: Re: [FFmpeg-devel] [PATCH] Whisper audio filter
Date: Sat, 19 Jul 2025 02:15:53 +0200
Message-ID: <20250719001553.GP29660@pb2>
In-Reply-To: <20250717085157.88889-1-vpalmisano@gmail.com>



Hi Vittorio

On Thu, Jul 17, 2025 at 10:51:57AM +0200, Vittorio Palmisano wrote:
> It adds a new audio filter for running audio transcriptions with the whisper model.
> Documentation and examples are included into the patch.
> 
> Signed-off-by: Vittorio Palmisano <vpalmisano@gmail.com>
> ---
>  configure                |   5 +
>  doc/filters.texi         | 107 +++++++++
>  libavfilter/Makefile     |   2 +
>  libavfilter/af_whisper.c | 452 +++++++++++++++++++++++++++++++++++++++
>  libavfilter/allfilters.c |   2 +
>  5 files changed, 568 insertions(+)
>  create mode 100644 libavfilter/af_whisper.c
[...]

> +static void cb_log(enum ggml_log_level level, const char *text, void *user_data)
> +{
> +    AVFilterContext *ctx = (AVFilterContext *) user_data;
> +    switch (level) {
> +    case GGML_LOG_LEVEL_ERROR:
> +        av_log(ctx, AV_LOG_ERROR, "%s", text);
> +        break;
> +    case GGML_LOG_LEVEL_WARN:
> +        av_log(ctx, AV_LOG_WARNING, "%s", text);
> +        break;
> +    case GGML_LOG_LEVEL_INFO:
> +    case GGML_LOG_LEVEL_DEBUG:
> +        av_log(ctx, AV_LOG_DEBUG, "%s", text);
> +        break;
> +    }
> +}

You can factor the av_log() calls out of the switch/case by having the switch
just map the ggml log level to the matching AV_LOG_* level.

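For example, something like this should work (untested sketch):

static void cb_log(enum ggml_log_level level, const char *text, void *user_data)
{
    AVFilterContext *ctx = user_data;
    int av_level;

    switch (level) {
    case GGML_LOG_LEVEL_ERROR: av_level = AV_LOG_ERROR;   break;
    case GGML_LOG_LEVEL_WARN:  av_level = AV_LOG_WARNING; break;
    default:                   av_level = AV_LOG_DEBUG;   break; /* INFO, DEBUG and anything else */
    }
    av_log(ctx, av_level, "%s", text);
}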

> +
> +static int init(AVFilterContext *ctx)
> +{
> +    WhisperContext *wctx = ctx->priv;
> +
> +    static AVOnce init_static_once = AV_ONCE_INIT;
> +    ff_thread_once(&init_static_once, ggml_backend_load_all);
> +
> +    whisper_log_set(cb_log, ctx);
> +
> +    // Init whisper context
> +    if (!wctx->model_path) {
> +        av_log(ctx, AV_LOG_ERROR, "No whisper model path specified. Use the 'model' option.\n");
> +        return AVERROR(EINVAL);
> +    }
> +
> +    struct whisper_context_params params = whisper_context_default_params();
> +    params.use_gpu = wctx->use_gpu;
> +    params.gpu_device = wctx->gpu_device;
> +
> +    wctx->ctx_wsp = whisper_init_from_file_with_params(wctx->model_path, params);
> +    if (wctx->ctx_wsp == NULL) {
> +        av_log(ctx, AV_LOG_ERROR, "Failed to initialize whisper context from model: %s\n", wctx->model_path);
> +        return AVERROR(EIO);
> +    }
> +
> +    // Init buffer

> +    wctx->audio_buffer_queue_size = WHISPER_SAMPLE_RATE * wctx->queue / 1000000;

The multiplication can overflow, and so can the 32-bit result.
The best fix is probably to limit the queue option to a more reasonable value than INT64_MAX.
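On top of that, something like this avoids the intermediate overflow (untested):

    wctx->audio_buffer_queue_size = av_rescale(wctx->queue, WHISPER_SAMPLE_RATE, 1000000);

(av_rescale() uses a 64-bit-safe intermediate; it rounds to nearest rather than
truncating, which should be fine here.)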


> +    wctx->audio_buffer = av_malloc(wctx->audio_buffer_queue_size * sizeof(*wctx->audio_buffer));

av_calloc() or av_malloc_array()

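i.e. (untested, assuming the allocation failure is not already checked further down):

    wctx->audio_buffer = av_malloc_array(wctx->audio_buffer_queue_size, sizeof(*wctx->audio_buffer));
    if (!wctx->audio_buffer)
        return AVERROR(ENOMEM);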

[...]
> +static void run_transcription(AVFilterContext *ctx, AVDictionary **metadata, int end_pos)
> +{
> +    WhisperContext *wctx = ctx->priv;
> +    end_pos = FFMAX(0, FFMIN(end_pos, wctx->audio_buffer_fill_size));
> +
> +    if (!wctx->ctx_wsp || end_pos == 0)
> +        return;
> +
> +    float duration = (float) end_pos / WHISPER_SAMPLE_RATE;
[...]

> +    wctx->timestamp += duration * 1000;

floats are not precise, and the accumulated rounding errors will
add up and lead to synchronization issues between the subtitles
and the audio or video over a long enough timespan.

Also, for reproducibility this should use integers.

What you could do is use:
wctx->timestamp += end_pos;

and then replace every use of wctx->timestamp with
wctx->timestamp / WHISPER_SAMPLE_RATE, or
wctx->timestamp / (double)WHISPER_SAMPLE_RATE if the context demands a
double.

That way the code stays exact and no errors accumulate.


> +
> +    if (metadata && segments_text) {
> +        av_dict_set(metadata, "lavfi.whisper.text", segments_text, 0);
> +        char *duration_text = av_asprintf("%f", duration);
> +        av_dict_set(metadata, "lavfi.whisper.duration", duration_text, AV_DICT_DONT_STRDUP_VAL);
> +    }
> +    av_freep(&segments_text);
> +
> +    memcpy(wctx->audio_buffer, wctx->audio_buffer + end_pos, end_pos * sizeof(*wctx->audio_buffer));
> +    wctx->audio_buffer_fill_size -= end_pos;
> +    wctx->audio_buffer_vad_size = wctx->audio_buffer_fill_size;
> +}
> +

> +static int filter_frame(AVFilterLink *inlink, AVFrame *frame)
> +{
> +    AVFilterContext *ctx = inlink->dst;
> +    WhisperContext *wctx = ctx->priv;
> +    AVFilterLink *outlink = ctx->outputs[0];
> +    AVDictionary **metadata = &frame->metadata;
> +
> +    const int samples = frame->nb_samples;
> +    const float *input_data = (const float *) frame->data[0];
> +
> +    if (wctx->audio_buffer_fill_size + samples > wctx->audio_buffer_queue_size) {
> +        run_transcription(ctx, metadata, wctx->audio_buffer_fill_size);
> +    }
> +
> +    memcpy(wctx->audio_buffer + wctx->audio_buffer_fill_size, input_data, samples * sizeof(*wctx->audio_buffer));
> +    wctx->audio_buffer_fill_size += samples;
> +
> +    if (wctx->ctx_vad
> +        && (wctx->audio_buffer_fill_size - wctx->audio_buffer_vad_size) >=
> +        WHISPER_SAMPLE_RATE * (wctx->vad_min_speech_duration + wctx->vad_min_silence_duration) / 1000000) {
> +        struct whisper_vad_segments *segments = whisper_vad_segments_from_samples(wctx->ctx_vad,
> +                                                                                  wctx->vad_params,
> +                                                                                  wctx->audio_buffer,
> +                                                                                  wctx->audio_buffer_fill_size);
> +        wctx->audio_buffer_vad_size = wctx->audio_buffer_fill_size;
> +
> +        if (!segments) {
> +            av_log(ctx, AV_LOG_ERROR, "failed to detect VAD\n");
> +        } else {
> +            int n_segments = whisper_vad_segments_n_segments(segments);
> +
> +            if (n_segments > 0) {
> +                const float start_ms = whisper_vad_segments_get_segment_t0(segments, 0) * 10.0;
> +                const float end_ms = whisper_vad_segments_get_segment_t1(segments, n_segments - 1) * 10.0;
> +                int end_pos = (int) (end_ms * WHISPER_SAMPLE_RATE / 1000);
> +
> +                if (end_pos <= wctx->audio_buffer_fill_size - WHISPER_SAMPLE_RATE * wctx->vad_min_silence_duration / 1000000) {
> +                    av_log(ctx, AV_LOG_INFO,
> +                            "VAD detected %d segments, start: %.0f ms, end: %.0f ms (buffer: %d ms)\n",
> +                            n_segments, start_ms, end_ms, 1000 * wctx->audio_buffer_fill_size / WHISPER_SAMPLE_RATE);
> +                    run_transcription(ctx, metadata, end_pos);
> +                }
> +            }
> +
> +            whisper_vad_free_segments(segments);
> +        }
> +    } else if (wctx->audio_buffer_fill_size >= wctx->audio_buffer_queue_size)
> +        run_transcription(ctx, metadata, wctx->audio_buffer_fill_size);
> +
> +    wctx->next_pts = frame->pts + av_rescale_q(frame->nb_samples, (AVRational) {
> +                                               1, inlink->sample_rate}
> +                                               , inlink->time_base);

I think you should consistently use either samples or frame->nb_samples; they are
the same value.
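e.g. simply (untested):

    wctx->next_pts = frame->pts + av_rescale_q(samples, (AVRational){ 1, inlink->sample_rate }, inlink->time_base);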

thx

[...]
-- 
Michael     GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB

Never trust a computer, one day, it may think you are the virus. -- Compn
