Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
 help / color / mirror / Atom feed
From: Michael Niedermayer <michael@niedermayer.cc>
To: FFmpeg development discussions and patches <ffmpeg-devel@ffmpeg.org>
Subject: Re: [FFmpeg-devel] [PATCH] Whisper audio filter
Date: Thu, 10 Jul 2025 14:20:08 +0200
Message-ID: <20250710122008.GP29660@pb2> (raw)
In-Reply-To: <20250710102543.1002696-1-vpalmisano@gmail.com>


[-- Attachment #1.1: Type: text/plain, Size: 9837 bytes --]

Hi Vittorio

On Thu, Jul 10, 2025 at 12:25:41PM +0200, Vittorio Palmisano wrote:
> It adds a new audio filter for running audio transcriptions with the whisper model.
> Documentation and examples are included into the patch.
> 
> Signed-off-by: Vittorio Palmisano <vpalmisano@gmail.com>
> ---
>  configure                |   5 +
>  doc/filters.texi         | 101 ++++++++
>  libavfilter/Makefile     |   2 +
>  libavfilter/af_whisper.c | 488 +++++++++++++++++++++++++++++++++++++++
>  libavfilter/allfilters.c |   2 +
>  5 files changed, 598 insertions(+)
>  create mode 100644 libavfilter/af_whisper.c
> 
[...]
> diff --git a/libavfilter/af_whisper.c b/libavfilter/af_whisper.c
> new file mode 100644
> index 0000000000..81d90a77d7
> --- /dev/null
> +++ b/libavfilter/af_whisper.c
> @@ -0,0 +1,488 @@
> +/*
> + * Copyright (c) 2025 Vittorio Palmisano
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public License
> + * as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public License
> + * along with FFmpeg; if not, write to the Free Software Foundation, Inc.,
> + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#include <stdio.h>
> +#include <stdint.h>
> +#include <stdlib.h>
> +
> +#include "libavutil/avutil.h"
> +#include "libavutil/opt.h"
> +#include "libavutil/channel_layout.h"
> +#include "libavutil/samplefmt.h"
> +#include "libavfilter/avfilter.h"
> +#include "libavfilter/audio.h"
> +#include "libavutil/mem.h"
> +#include "libavutil/avstring.h"
> +#include "libavutil/internal.h"
> +#include "libavformat/avio.h"
> +#include "libavutil/thread.h"
> +
> +#include "formats.h"
> +

> +#include "whisper.h"

I presume that's meant to be #include <whisper.h> or something similar.


> +
> +typedef struct WhisperContext {
> +    const AVClass *class;
> +    char *model_path;
> +    char *language;
> +    bool use_gpu;
> +    int gpu_device;
> +    char *vad_model_path;
> +    float vad_threshold;
> +    int vad_min_speech_duration;
> +    int vad_min_silence_duration;
> +
> +    int queue;
> +    char *destination;
> +    char *format;
> +
> +    struct whisper_context *ctx_wsp;
> +    struct whisper_vad_context *ctx_vad;
> +    struct whisper_vad_params vad_params;
> +
> +    float *audio_buffer;
> +    int audio_buffer_queue_size;
> +    int audio_buffer_fill_size;
> +    int audio_buffer_vad_size;
> +
> +    int eof;
> +    int64_t next_pts;
> +
> +    AVIOContext *avio_context;
> +    int index;
> +    int64_t timestamp;
> +} WhisperContext;
> +

> +static void cb_log_disable(enum ggml_log_level level, const char *text, void *user_data)
> +{
> +}

This should probably be forwarded to av_log().


> +
> +static int init(AVFilterContext *ctx)
> +{
> +    WhisperContext *wctx = ctx->priv;
> +

> +    ggml_backend_load_all();

Is this thread-safe?


> +    whisper_log_set(cb_log_disable, NULL);

This is not thread-safe; it directly changes global state:

void whisper_log_set(ggml_log_callback log_callback, void * user_data) {
    g_state.log_callback = log_callback ? log_callback : whisper_log_callback_default;
    g_state.log_callback_user_data = user_data;
    ggml_log_set(g_state.log_callback, g_state.log_callback_user_data);
}

Not a bug in this patch, of course, but in whisper itself.

Until whisper is actually thread-safe, this should be wrapped with
ff_thread_once(), though that only protects af_whisper from itself, not
from any other potential callers.


[...]
[...]

> +    wctx->timestamp += (int64_t) (duration * 1000);

That cast looks weirdly placed.


> +
> +    if (metadata && segments_text) {
> +        av_dict_set(metadata, "lavfi.whisper.text", segments_text, 0);

> +        char *duration_text = av_asprintf("%f", duration);
> +        av_dict_set(metadata, "lavfi.whisper.duration", duration_text, 0);
> +        av_freep(&duration_text);

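Use AV_DICT_DONT_STRDUP_VAL here so that the dictionary takes ownership of
duration_text; the extra copy and the av_freep() are then unnecessary.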


> +    }
> +    av_freep(&segments_text);
> +
> +    memcpy(wctx->audio_buffer, wctx->audio_buffer + end_pos,
> +           end_pos * sizeof(float));

sizeof(*wctx->audio_buffer) is more robust than sizeof(float).


[...]

> +#define OFFSET(x) offsetof(WhisperContext, x)
> +#define FLAGS AV_OPT_FLAG_AUDIO_PARAM | AV_OPT_FLAG_FILTERING_PARAM
> +
> +static const AVOption whisper_options[] = {
> +    { "model", "Path to the whisper.cpp model file", OFFSET(model_path),
> +     AV_OPT_TYPE_STRING,.flags = FLAGS },
> +    { "language", "Language for transcription ('auto' for auto-detect)",
> +     OFFSET(language), AV_OPT_TYPE_STRING, {.str = "auto"},.flags =
> +     FLAGS },
> +    { "queue", "Audio queue size in milliseconds", OFFSET(queue),
> +     AV_OPT_TYPE_INT, {.i64 = 3000}, 20, INT_MAX,.flags = FLAGS },
> +    { "use_gpu", "Use GPU for processing", OFFSET(use_gpu),
> +     AV_OPT_TYPE_BOOL, {.i64 = 1}, 0, 1,.flags = FLAGS },
> +    { "gpu_device", "GPU device to use", OFFSET(gpu_device),
> +     AV_OPT_TYPE_INT, {.i64 = 0}, 0, INT_MAX,.flags = FLAGS },
> +    { "destination", "Output destination", OFFSET(destination),
> +     AV_OPT_TYPE_STRING, {.str = ""},.flags = FLAGS },
> +    { "format", "Output format (text|srt|json)", OFFSET(format),
> +     AV_OPT_TYPE_STRING, {.str = "text"},.flags = FLAGS },
> +    { "vad_model", "Path to the VAD model file", OFFSET(vad_model_path),
> +     AV_OPT_TYPE_STRING,.flags = FLAGS },
> +    { "vad_threshold", "VAD threshold", OFFSET(vad_threshold),
> +     AV_OPT_TYPE_FLOAT, {.dbl = 0.5}, 0.0, 1.0,.flags = FLAGS },
> +    { "vad_min_speech_duration",
> +     "Minimum speech duration in milliseconds for VAD",
> +     OFFSET(vad_min_speech_duration), AV_OPT_TYPE_INT, {.i64 = 50}, 20,
> +     INT_MAX,.flags = FLAGS },
> +    { "vad_min_silence_duration",
> +     "Minimum silence duration in milliseconds for VAD",
> +     OFFSET(vad_min_silence_duration), AV_OPT_TYPE_INT, {.i64 = 500}, 0,
> +     INT_MAX,.flags = FLAGS },
> +    { NULL }

I am not sure what others think of this, but I would ignore the 80-character limit and format this like:

static const AVOption whisper_options[] = {
    { "model"   , "Path to the whisper.cpp model file"                 , OFFSET(model_path), AV_OPT_TYPE_STRING,.flags = FLAGS },
    { "language", "Language for transcription ('auto' for auto-detect)", OFFSET(language)  , AV_OPT_TYPE_STRING, {.str = "auto"},             .flags = FLAGS },
    { "queue"   , "Audio queue size in milliseconds"                   , OFFSET(queue)     , AV_OPT_TYPE_INT   , {.i64 = 3000  }, 20, INT_MAX,.flags = FLAGS },
    { "use_gpu" , "Use GPU for processing"                             , OFFSET(use_gpu)   , AV_OPT_TYPE_BOOL  , {.i64 = 1     }, 0 , 1      ,.flags = FLAGS },
....

or:

static const AVOption whisper_options[] = {
    { "model"   , "Path to the whisper.cpp model file"                 ,
        OFFSET(model_path), AV_OPT_TYPE_STRING,.flags = FLAGS },
    { "language", "Language for transcription ('auto' for auto-detect)",
        OFFSET(language)  , AV_OPT_TYPE_STRING, {.str = "auto"},             .flags = FLAGS },
    { "queue"   , "Audio queue size in milliseconds"                   ,
        OFFSET(queue)     , AV_OPT_TYPE_INT   , {.i64 = 3000  }, 20, INT_MAX,.flags = FLAGS },
    { "use_gpu" , "Use GPU for processing"                             ,
        OFFSET(use_gpu)   , AV_OPT_TYPE_BOOL  , {.i64 = 1     }, 0 , 1      ,.flags = FLAGS },
....

Also, it seems this is a lot slower than whisper-cli:

time whisper-cli  matrix.wav -m ~/whisper.cpp/models/ggml-base.en.bin  --output-srt
real	0m16,283s
user	1m3,644s
sys	0m0,581s


time ./ffmpeg -v 99 -i matrix.wav -af "aformat=sample_rates=16000:channel_layouts=mono,whisper=model=/home/michael/whisper.cpp/models/ggml-base.en.bin:language=en:queue=3000:destination=output.srt:format=srt" -f null - 2> /tmp/log
real	1m30,827s
user	6m0,590s
sys	0m0,756s


And it's af_whisper, not the other processing:
time ./ffmpeg -v 99 -i matrix.wav -af "aformat=sample_rates=16000:channel_layouts=mono" -f null - 2> /tmp/nolog
real	0m0,151s
user	0m0,185s
sys	0m0,048s

Also, the SRT output is different:

whisper-cli:
00:00:17,500 --> 00:00:22,000
 Would you please remove any metallic items you're carrying, keys, boost change?
3
00:00:22,000 --> 00:00:24,000
 [Music]
4
00:00:24,000 --> 00:00:26,000
 Holy shit!
5
00:00:26,000 --> 00:00:37,000
 [Music]
6
00:00:37,000 --> 00:00:38,000
 Back up!
7
00:00:38,000 --> 00:00:39,000
 Stand back up!


vs.

af_whisper
6
00:00:17.915 --> 00:00:20.815
Please remove any metallic items you're carrying, keys.
7
00:00:20.901 --> 00:00:21.741
boost change.
8
00:00:23.887 --> 00:00:25.887
Holy shit.
10
00:00:29.859 --> 00:00:32.119
(explosion)
11
00:00:32.845 --> 00:00:35.105
(explosion)
12
00:00:35.831 --> 00:00:37.831
Back on!
12
00:00:37.831 --> 00:00:38.831
Stand back on!






[...]
-- 
Michael     GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB

Frequently ignored answer#1 FFmpeg bugs should be sent to our bugtracker. User
questions about the command line tools should be sent to the ffmpeg-user ML.
And questions about how to use libav* should be sent to the libav-user ML.

[-- Attachment #1.2: signature.asc --]
[-- Type: application/pgp-signature, Size: 195 bytes --]

[-- Attachment #2: Type: text/plain, Size: 251 bytes --]

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

  reply	other threads:[~2025-07-10 12:20 UTC|newest]

Thread overview: 16+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2025-07-09  7:23 Vittorio Palmisano
2025-07-09 13:36 ` Marvin Scholz
2025-07-09 15:24 ` Zhao Zhili
2025-07-10  8:43   ` Vittorio Palmisano
2025-07-10  9:47     ` Zhao Zhili
2025-07-10 12:41   ` Michael Niedermayer
2025-07-09 23:37 ` Michael Niedermayer
2025-07-10  8:34   ` Vittorio Palmisano
2025-07-10 10:05     ` Marvin Scholz
2025-07-10 10:20       ` Vittorio Palmisano
2025-07-10 10:25         ` Vittorio Palmisano
2025-07-10 12:20           ` Michael Niedermayer [this message]
2025-07-10 11:31     ` Michael Niedermayer
2025-07-10 12:07       ` Nicolas George
2025-07-10 12:10         ` Nicolas George
2025-07-09 23:41 ` Michael Niedermayer

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20250710122008.GP29660@pb2 \
    --to=michael@niedermayer.cc \
    --cc=ffmpeg-devel@ffmpeg.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

This inbox may be cloned and mirrored by anyone:

	git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
		ffmpegdev@gitmailbox.com
	public-inbox-index ffmpegdev

Example config snippet for mirrors.


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git