From: Y0SH1M4S73R via ffmpeg-devel <ffmpeg-devel@ffmpeg.org> To: ffmpeg-devel@ffmpeg.org Cc: Y0SH1M4S73R <code@ffmpeg.org> Subject: [FFmpeg-devel] [PATCH] avfilter_flite_voicefile_support (PR #20644) Date: Sat, 04 Oct 2025 06:58:50 -0000 Message-ID: <175956113143.69.17145999310343513605@bf249f23a2c8> (raw) PR #20644 opened by Y0SH1M4S73R URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/20644 Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/20644.patch See commit messages for details. >From e4590f078e26dcee8ff8ff3c48fd0d50a9f231fa Mon Sep 17 00:00:00 2001 From: Y0SH1M4S73R <Y0SH1M4S73R@github.com> Date: Sat, 4 Oct 2025 01:10:54 -0400 Subject: [PATCH 1/2] avfilter/flite: add .flitevox file support libflite allows users to load voices from .flitevox files. The flite filter can now load a voice from such a file using the voicefile option. The global initialization logic for the flite filter has been updated to register language and lexicon initialization functions for the "eng" and "usenglish" language strings. This is necessary to allow .flitevox files to be loaded correctly. Due to the relatively large size of loaded voices (~15 MB), I have chosen to cache them in `voice_entry` structs contained in a static array. However, this implementation does not account for the possibility of the voice file at a specified path being changed while a voice previously loaded from that file is still in use. As such, changes to this implementation, and whether loaded voices should even be cached in the first place, should be discussed. 
--- libavfilter/asrc_flite.c | 141 +++++++++++++++++++++++++++++++++++++-- 1 file changed, 136 insertions(+), 5 deletions(-) diff --git a/libavfilter/asrc_flite.c b/libavfilter/asrc_flite.c index 33576beade..6f7ef467cd 100644 --- a/libavfilter/asrc_flite.c +++ b/libavfilter/asrc_flite.c @@ -31,6 +31,7 @@ #include "libavutil/mem.h" #include "libavutil/opt.h" #include "libavutil/thread.h" +#include "libavutil/tree.h" #include "avfilter.h" #include "filters.h" #include "audio.h" @@ -39,6 +40,7 @@ typedef struct FliteContext { const AVClass *class; char *voice_str; + char *voice_file; char *textfile; char *text; char *text_p; @@ -65,6 +67,7 @@ static const AVOption flite_options[] = { { "textfile", "set filename of the text to speak", OFFSET(textfile), AV_OPT_TYPE_STRING, {.str=NULL}, 0, 0, FLAGS }, { "v", "set voice", OFFSET(voice_str), AV_OPT_TYPE_STRING, {.str="kal"}, 0, 0, FLAGS }, { "voice", "set voice", OFFSET(voice_str), AV_OPT_TYPE_STRING, {.str="kal"}, 0, 0, FLAGS }, + { "voicefile", "set flitevox voice file", OFFSET(voice_file), AV_OPT_TYPE_STRING, {.str=NULL}, 0, 0, FLAGS}, { NULL } }; @@ -74,7 +77,7 @@ static AVMutex flite_mutex = AV_MUTEX_INITIALIZER; static int flite_inited = 0; -/* declare functions for all the supported voices */ +/* declare functions for all the built-in voices */ #define DECLARE_REGISTER_VOICE_FN(name) \ cst_voice *register_cmu_us_## name(const char *); \ void unregister_cmu_us_## name(cst_voice *) @@ -84,8 +87,12 @@ DECLARE_REGISTER_VOICE_FN(kal16); DECLARE_REGISTER_VOICE_FN(rms); DECLARE_REGISTER_VOICE_FN(slt); +void usenglish_init(cst_voice *); +cst_lexicon *cmulex_init(void); + struct voice_entry { const char *name; + char *voice_file; cst_voice * (*register_fn)(const char *); void (*unregister_fn)(cst_voice *); cst_voice *voice; @@ -145,6 +152,106 @@ static int select_voice(struct voice_entry **entry_ret, const char *voice_name, return AVERROR(EINVAL); } +static int loaded_voice_entry_count = 0; +static int 
loaded_voice_entry_capacity = 0; +static struct voice_entry **loaded_voice_entries = NULL; + +static int add_loaded_entry(struct voice_entry *entry) { + if (loaded_voice_entry_count == loaded_voice_entry_capacity) { + if(av_dynarray_add_nofree(&loaded_voice_entries, &loaded_voice_entry_capacity, entry) < 0) + return AVERROR(ENOMEM); + loaded_voice_entry_count++; + } else { + for (int i = 0; i < loaded_voice_entry_capacity; i++) { + if(!loaded_voice_entries[i]) { + loaded_voice_entries[i] = entry; + loaded_voice_entry_count++; + break; + } + } + } + return 0; +} + +static struct voice_entry *get_loaded_entry(void *key, int (predicate)(const void *key, const struct voice_entry *entry)) { + for (int i = 0; i < loaded_voice_entry_capacity; i++) { + struct voice_entry *entry = loaded_voice_entries[i]; + if (entry && predicate(key, entry)) + return entry; + } + return NULL; +} + +static int path_predicate(const void *key, const struct voice_entry *entry) { + return !strcmp((const char *)key, entry->voice_file); +} + +static int voice_predicate(const void *key, const struct voice_entry *entry) { + return (const cst_voice *)key == entry->voice; +} + +static void remove_loaded_entry(const struct voice_entry *removing_entry) { + for (int i = 0; i < loaded_voice_entry_capacity; i++) { + const struct voice_entry *entry = loaded_voice_entries[i]; + if (entry == removing_entry) { + loaded_voice_entries[i] = NULL; + loaded_voice_entry_count--; + } + } +} + +static void unregister_loaded_voice(cst_voice *voice) { + struct voice_entry *entry = get_loaded_entry(voice, voice_predicate); + if (!entry) { + av_log(NULL, AV_LOG_ERROR, "unregister_loaded_voice failed: no voice entry\n"); + return; + } + if (entry->voice != voice) { + av_log(NULL, AV_LOG_ERROR, "unregister_loaded_voice failed: voice mismatch\n"); + return; + } + remove_loaded_entry(entry); + delete_voice(entry->voice); + av_free((char *)entry->name); + av_free(entry->voice_file); + av_free(entry); + return; +} + +static 
int load_voice(struct voice_entry **entry_ret, char *voice_path, void *log_ctx) { + pthread_mutex_lock(&flite_mutex); + struct voice_entry *entry = get_loaded_entry(voice_path, path_predicate); + if (!entry) { + cst_voice *voice; + if (!(voice = flite_voice_load(voice_path))) { + pthread_mutex_unlock(&flite_mutex); + av_log(log_ctx, AV_LOG_ERROR, "the voice file '%s' can not be read\n", voice_path); + return AVERROR_EXTERNAL; + } + + entry = av_mallocz(sizeof(struct voice_entry)); + entry->name = av_strdup(voice->name); + entry->voice_file = av_strdup(voice_path); + entry->unregister_fn = unregister_loaded_voice; + entry->voice = voice; + + int ret; + if ((ret = add_loaded_entry(entry)) < 0) { + pthread_mutex_unlock(&flite_mutex); + delete_voice(voice); + av_free((char *)entry->name); + av_free(entry->voice_file); + av_free(entry); + return ret; + } + } + entry->usage_count++; + pthread_mutex_unlock(&flite_mutex); + *entry_ret = entry; + return 0; + +} + static int audio_stream_chunk_by_word(const cst_wave *wave, int start, int size, int last, cst_audio_streaming_info *asi) { @@ -164,6 +271,17 @@ static int audio_stream_chunk_by_word(const cst_wave *wave, int start, int size, return CST_AUDIO_STREAM_CONT; } +static int perform_flite_initializations(void) { + int ret = 0; + if ((ret = flite_init()) < 0) + return ret; + if ((ret = flite_add_lang("eng", usenglish_init, cmulex_init)) < 0) + return ret; + if ((ret = flite_add_lang("usenglish", usenglish_init, cmulex_init)) < 0) + return ret; + return 0; +} + static av_cold int init(AVFilterContext *ctx) { FliteContext *flite = ctx->priv; @@ -177,7 +295,7 @@ static av_cold int init(AVFilterContext *ctx) pthread_mutex_lock(&flite_mutex); if (!flite_inited) { - if ((ret = flite_init()) >= 0) + if ((ret = perform_flite_initializations()) >= 0) flite_inited = 1; } pthread_mutex_unlock(&flite_mutex); @@ -186,8 +304,14 @@ static av_cold int init(AVFilterContext *ctx) return AVERROR_EXTERNAL; } - if ((ret = 
select_voice(&flite->voice_entry, flite->voice_str, ctx)) < 0) - return ret; + if (flite->voice_file) { + if ((ret = load_voice(&flite->voice_entry, flite->voice_file, ctx)) < 0) + return ret; + } else { + if ((ret = select_voice(&flite->voice_entry, flite->voice_str, ctx)) < 0) + return ret; + } + flite->voice = flite->voice_entry->voice; if (flite->textfile && flite->text) { @@ -297,8 +421,15 @@ static int config_props(AVFilterLink *outlink) outlink->sample_rate = flite->sample_rate; outlink->time_base = (AVRational){1, flite->sample_rate}; + const char *voice_name; + if (flite->voice_file) { + voice_name = av_asprintf("%s (%s)", flite->voice->name, flite->voice_file); + } else { + voice_name = flite->voice_str; + } + av_log(ctx, AV_LOG_VERBOSE, "voice:%s fmt:%s sample_rate:%d\n", - flite->voice_str, + voice_name, av_get_sample_fmt_name(outlink->format), outlink->sample_rate); return 0; -- 2.49.1 >From 7d7a83f78e78928f7c561dee06ace8ab7ee68f50 Mon Sep 17 00:00:00 2001 From: Y0SH1M4S73R <Y0SH1M4S73R@github.com> Date: Sat, 4 Oct 2025 02:52:34 -0400 Subject: [PATCH 2/2] avfilter/ffmpeg: corrects formatting mistakes --- libavfilter/asrc_flite.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/libavfilter/asrc_flite.c b/libavfilter/asrc_flite.c index 6f7ef467cd..90936b9028 100644 --- a/libavfilter/asrc_flite.c +++ b/libavfilter/asrc_flite.c @@ -67,7 +67,7 @@ static const AVOption flite_options[] = { { "textfile", "set filename of the text to speak", OFFSET(textfile), AV_OPT_TYPE_STRING, {.str=NULL}, 0, 0, FLAGS }, { "v", "set voice", OFFSET(voice_str), AV_OPT_TYPE_STRING, {.str="kal"}, 0, 0, FLAGS }, { "voice", "set voice", OFFSET(voice_str), AV_OPT_TYPE_STRING, {.str="kal"}, 0, 0, FLAGS }, - { "voicefile", "set flitevox voice file", OFFSET(voice_file), AV_OPT_TYPE_STRING, {.str=NULL}, 0, 0, FLAGS}, + { "voicefile", "set flitevox voice file", OFFSET(voice_file), AV_OPT_TYPE_STRING, {.str=NULL}, 0, 0, FLAGS }, { NULL } }; @@ -158,12 +158,12 @@ 
static struct voice_entry **loaded_voice_entries = NULL; static int add_loaded_entry(struct voice_entry *entry) { if (loaded_voice_entry_count == loaded_voice_entry_capacity) { - if(av_dynarray_add_nofree(&loaded_voice_entries, &loaded_voice_entry_capacity, entry) < 0) + if (av_dynarray_add_nofree(&loaded_voice_entries, &loaded_voice_entry_capacity, entry) < 0) return AVERROR(ENOMEM); loaded_voice_entry_count++; } else { for (int i = 0; i < loaded_voice_entry_capacity; i++) { - if(!loaded_voice_entries[i]) { + if (!loaded_voice_entries[i]) { loaded_voice_entries[i] = entry; loaded_voice_entry_count++; break; -- 2.49.1 _______________________________________________ ffmpeg-devel mailing list -- ffmpeg-devel@ffmpeg.org To unsubscribe send an email to ffmpeg-devel-leave@ffmpeg.org
reply other threads:[~2025-10-04 6:59 UTC|newest] Thread overview: [no followups] expand[flat|nested] mbox.gz Atom feed
Reply instructions: You may reply publicly to this message via plain-text email using any one of the following methods: * Save the following mbox file, import it into your mail client, and reply-to-all from there: mbox Avoid top-posting and favor interleaved quoting: https://en.wikipedia.org/wiki/Posting_style#Interleaved_style * Reply using the --to, --cc, and --in-reply-to switches of git-send-email(1): git send-email \ --in-reply-to=175956113143.69.17145999310343513605@bf249f23a2c8 \ --to=ffmpeg-devel@ffmpeg.org \ --cc=code@ffmpeg.org \ /path/to/YOUR_REPLY https://kernel.org/pub/software/scm/git/docs/git-send-email.html * If your mail client supports setting the In-Reply-To header via mailto: links, try the mailto: link
Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel This inbox may be cloned and mirrored by anyone: git clone --mirror http://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git # If you have public-inbox 1.1+ installed, you may # initialize and index your mirror using the following commands: public-inbox-init -V2 ffmpegdev ffmpegdev/ http://master.gitmailbox.com/ffmpegdev \ ffmpegdev@gitmailbox.com public-inbox-index ffmpegdev Example config snippet for mirrors. AGPL code for this site: git clone https://public-inbox.org/public-inbox.git