[FFmpeg-devel] [PATCH] avfilter_flite_voicefile_support (PR #20644)

Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
 help / color / mirror / Atom feed

From: Y0SH1M4S73R via ffmpeg-devel <ffmpeg-devel@ffmpeg.org>
To: ffmpeg-devel@ffmpeg.org
Cc: Y0SH1M4S73R <code@ffmpeg.org>
Subject: [FFmpeg-devel] [PATCH] avfilter_flite_voicefile_support (PR #20644)
Date: Sat, 04 Oct 2025 06:58:50 -0000
Message-ID: <175956113143.69.17145999310343513605@bf249f23a2c8> (raw)

PR #20644 opened by Y0SH1M4S73R
URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/20644
Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/20644.patch

See commit messages for details.


>From e4590f078e26dcee8ff8ff3c48fd0d50a9f231fa Mon Sep 17 00:00:00 2001
From: Y0SH1M4S73R <Y0SH1M4S73R@github.com>
Date: Sat, 4 Oct 2025 01:10:54 -0400
Subject: [PATCH 1/2] avfilter/flite: add .flitevox file support

libflite allows users to load voices from .flitevox files. The flite filter can now load a voice from such a file using the voicefile option.

The global initialization logic for the flite filter has been updated to register the language and lexicon initialization functions for the "eng" and "usenglish" language strings. This is necessary to allow .flitevox files to be loaded correctly.

Due to the relatively large size of loaded voices (~15 MB), I have chosen to cache them in `voice_entry` structs tracked in a file-static, dynamically grown array and looked up by voice file path. However, this implementation does not account for the voice file at a given path being changed while a voice previously loaded from that path is still in use: a later request for the same path gets the already-cached voice rather than the new file contents. As such, changes to this implementation, and whether loaded voices should even be cached in the first place, should be discussed.
---
 libavfilter/asrc_flite.c | 141 +++++++++++++++++++++++++++++++++++++--
 1 file changed, 136 insertions(+), 5 deletions(-)

diff --git a/libavfilter/asrc_flite.c b/libavfilter/asrc_flite.c
index 33576beade..6f7ef467cd 100644
--- a/libavfilter/asrc_flite.c
+++ b/libavfilter/asrc_flite.c
@@ -31,6 +31,7 @@
 #include "libavutil/mem.h"
 #include "libavutil/opt.h"
 #include "libavutil/thread.h"
+#include "libavutil/tree.h"
 #include "avfilter.h"
 #include "filters.h"
 #include "audio.h"
@@ -39,6 +40,7 @@
 typedef struct FliteContext {
     const AVClass *class;
     char *voice_str;
+    char *voice_file;
     char *textfile;
     char *text;
     char *text_p;
@@ -65,6 +67,7 @@ static const AVOption flite_options[] = {
     { "textfile",    "set filename of the text to speak", OFFSET(textfile),  AV_OPT_TYPE_STRING, {.str=NULL}, 0, 0, FLAGS },
     { "v",           "set voice",                         OFFSET(voice_str), AV_OPT_TYPE_STRING, {.str="kal"}, 0, 0, FLAGS },
     { "voice",       "set voice",                         OFFSET(voice_str), AV_OPT_TYPE_STRING, {.str="kal"}, 0, 0, FLAGS },
+    { "voicefile",   "set flitevox voice file",           OFFSET(voice_file), AV_OPT_TYPE_STRING, {.str=NULL}, 0, 0, FLAGS},
     { NULL }
 };
 
@@ -74,7 +77,7 @@ static AVMutex flite_mutex = AV_MUTEX_INITIALIZER;
 
 static int flite_inited = 0;
 
-/* declare functions for all the supported voices */
+/* declare functions for all the built-in voices */
 #define DECLARE_REGISTER_VOICE_FN(name) \
     cst_voice *register_cmu_us_## name(const char *); \
     void     unregister_cmu_us_## name(cst_voice *)
@@ -84,8 +87,12 @@ DECLARE_REGISTER_VOICE_FN(kal16);
 DECLARE_REGISTER_VOICE_FN(rms);
 DECLARE_REGISTER_VOICE_FN(slt);
 
+void usenglish_init(cst_voice *);
+cst_lexicon *cmulex_init(void);
+
 struct voice_entry {
     const char *name;
+    char *voice_file;
     cst_voice * (*register_fn)(const char *);
     void (*unregister_fn)(cst_voice *);
     cst_voice *voice;
@@ -145,6 +152,106 @@ static int select_voice(struct voice_entry **entry_ret, const char *voice_name,
     return AVERROR(EINVAL);
 }
 
+static int loaded_voice_entry_count = 0;
+static int loaded_voice_entry_capacity = 0;
+static struct voice_entry **loaded_voice_entries = NULL;
+
+static int add_loaded_entry(struct voice_entry *entry) {
+    if (loaded_voice_entry_count == loaded_voice_entry_capacity) {
+        if(av_dynarray_add_nofree(&loaded_voice_entries, &loaded_voice_entry_capacity, entry) < 0)
+            return AVERROR(ENOMEM);
+        loaded_voice_entry_count++;
+    } else {
+        for (int i = 0; i < loaded_voice_entry_capacity; i++) {
+            if(!loaded_voice_entries[i]) {
+                loaded_voice_entries[i] = entry;
+                loaded_voice_entry_count++;
+                break;
+            }
+        }
+    }
+    return 0;
+}
+
+static struct voice_entry *get_loaded_entry(void *key, int (predicate)(const void *key, const struct voice_entry *entry)) {
+    for (int i = 0; i < loaded_voice_entry_capacity; i++) {
+        struct voice_entry *entry = loaded_voice_entries[i];
+        if (entry && predicate(key, entry))
+            return entry;
+    }
+    return NULL;
+}
+
+static int path_predicate(const void *key, const struct voice_entry *entry) {
+    return !strcmp((const char *)key, entry->voice_file);
+}
+
+static int voice_predicate(const void *key, const struct voice_entry *entry) {
+    return (const cst_voice *)key == entry->voice;
+}
+
+static void remove_loaded_entry(const struct voice_entry *removing_entry) {
+    for (int i = 0; i < loaded_voice_entry_capacity; i++) {
+        const struct voice_entry *entry = loaded_voice_entries[i];
+        if (entry == removing_entry) {
+            loaded_voice_entries[i] = NULL;
+            loaded_voice_entry_count--;
+        }
+    }
+}
+
+static void unregister_loaded_voice(cst_voice *voice) {
+    struct voice_entry *entry = get_loaded_entry(voice, voice_predicate);
+    if (!entry) {
+        av_log(NULL, AV_LOG_ERROR, "unregister_loaded_voice failed: no voice entry\n");
+        return;
+    }
+    if (entry->voice != voice) {
+        av_log(NULL, AV_LOG_ERROR, "unregister_loaded_voice failed: voice mismatch\n");
+        return;
+    }
+    remove_loaded_entry(entry);
+    delete_voice(entry->voice);
+    av_free((char *)entry->name);
+    av_free(entry->voice_file);
+    av_free(entry);
+    return;
+}
+
+static int load_voice(struct voice_entry **entry_ret, char *voice_path, void *log_ctx) {
+    pthread_mutex_lock(&flite_mutex);
+    struct voice_entry *entry = get_loaded_entry(voice_path, path_predicate);
+    if (!entry) {
+        cst_voice *voice;
+        if (!(voice = flite_voice_load(voice_path))) {
+            pthread_mutex_unlock(&flite_mutex);
+            av_log(log_ctx, AV_LOG_ERROR, "the voice file '%s' can not be read\n", voice_path);
+            return AVERROR_EXTERNAL;
+        }
+
+        entry = av_mallocz(sizeof(struct voice_entry));
+        entry->name = av_strdup(voice->name);
+        entry->voice_file = av_strdup(voice_path);
+        entry->unregister_fn = unregister_loaded_voice;
+        entry->voice = voice;
+
+        int ret;
+        if ((ret = add_loaded_entry(entry)) < 0) {
+            pthread_mutex_unlock(&flite_mutex);
+            delete_voice(voice);
+            av_free((char *)entry->name);
+            av_free(entry->voice_file);
+            av_free(entry);
+            return ret;
+        }
+    }
+    entry->usage_count++;
+    pthread_mutex_unlock(&flite_mutex);
+    *entry_ret = entry;
+    return 0;
+
+}
+
 static int audio_stream_chunk_by_word(const cst_wave *wave, int start, int size,
                                       int last, cst_audio_streaming_info *asi)
 {
@@ -164,6 +271,17 @@ static int audio_stream_chunk_by_word(const cst_wave *wave, int start, int size,
     return CST_AUDIO_STREAM_CONT;
 }
 
+static int perform_flite_initializations(void) {
+    int ret = 0;
+    if ((ret = flite_init()) < 0)
+        return ret;
+    if ((ret = flite_add_lang("eng", usenglish_init, cmulex_init)) < 0)
+        return ret;
+    if ((ret = flite_add_lang("usenglish", usenglish_init, cmulex_init)) < 0)
+        return ret;
+    return 0;
+}
+
 static av_cold int init(AVFilterContext *ctx)
 {
     FliteContext *flite = ctx->priv;
@@ -177,7 +295,7 @@ static av_cold int init(AVFilterContext *ctx)
 
     pthread_mutex_lock(&flite_mutex);
     if (!flite_inited) {
-        if ((ret = flite_init()) >= 0)
+        if ((ret = perform_flite_initializations()) >= 0)
             flite_inited = 1;
     }
     pthread_mutex_unlock(&flite_mutex);
@@ -186,8 +304,14 @@ static av_cold int init(AVFilterContext *ctx)
         return AVERROR_EXTERNAL;
     }
 
-    if ((ret = select_voice(&flite->voice_entry, flite->voice_str, ctx)) < 0)
-        return ret;
+    if (flite->voice_file) {
+        if ((ret = load_voice(&flite->voice_entry, flite->voice_file, ctx)) < 0)
+            return ret;
+    } else {
+        if ((ret = select_voice(&flite->voice_entry, flite->voice_str, ctx)) < 0)
+            return ret;
+    }
+
     flite->voice = flite->voice_entry->voice;
 
     if (flite->textfile && flite->text) {
@@ -297,8 +421,15 @@ static int config_props(AVFilterLink *outlink)
     outlink->sample_rate = flite->sample_rate;
     outlink->time_base = (AVRational){1, flite->sample_rate};
 
+    const char *voice_name;
+    if (flite->voice_file) {
+        voice_name = av_asprintf("%s (%s)", flite->voice->name, flite->voice_file);
+    } else {
+        voice_name = flite->voice_str;
+    }
+
     av_log(ctx, AV_LOG_VERBOSE, "voice:%s fmt:%s sample_rate:%d\n",
-           flite->voice_str,
+           voice_name,
            av_get_sample_fmt_name(outlink->format), outlink->sample_rate);
 
     return 0;
-- 
2.49.1


>From 7d7a83f78e78928f7c561dee06ace8ab7ee68f50 Mon Sep 17 00:00:00 2001
From: Y0SH1M4S73R <Y0SH1M4S73R@github.com>
Date: Sat, 4 Oct 2025 02:52:34 -0400
Subject: [PATCH 2/2] avfilter/flite: correct formatting mistakes

---
 libavfilter/asrc_flite.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/libavfilter/asrc_flite.c b/libavfilter/asrc_flite.c
index 6f7ef467cd..90936b9028 100644
--- a/libavfilter/asrc_flite.c
+++ b/libavfilter/asrc_flite.c
@@ -67,7 +67,7 @@ static const AVOption flite_options[] = {
     { "textfile",    "set filename of the text to speak", OFFSET(textfile),  AV_OPT_TYPE_STRING, {.str=NULL}, 0, 0, FLAGS },
     { "v",           "set voice",                         OFFSET(voice_str), AV_OPT_TYPE_STRING, {.str="kal"}, 0, 0, FLAGS },
     { "voice",       "set voice",                         OFFSET(voice_str), AV_OPT_TYPE_STRING, {.str="kal"}, 0, 0, FLAGS },
-    { "voicefile",   "set flitevox voice file",           OFFSET(voice_file), AV_OPT_TYPE_STRING, {.str=NULL}, 0, 0, FLAGS},
+    { "voicefile",   "set flitevox voice file",           OFFSET(voice_file), AV_OPT_TYPE_STRING, {.str=NULL}, 0, 0, FLAGS },
     { NULL }
 };
 
@@ -158,12 +158,12 @@ static struct voice_entry **loaded_voice_entries = NULL;
 
 static int add_loaded_entry(struct voice_entry *entry) {
     if (loaded_voice_entry_count == loaded_voice_entry_capacity) {
-        if(av_dynarray_add_nofree(&loaded_voice_entries, &loaded_voice_entry_capacity, entry) < 0)
+        if (av_dynarray_add_nofree(&loaded_voice_entries, &loaded_voice_entry_capacity, entry) < 0)
             return AVERROR(ENOMEM);
         loaded_voice_entry_count++;
     } else {
         for (int i = 0; i < loaded_voice_entry_capacity; i++) {
-            if(!loaded_voice_entries[i]) {
+            if (!loaded_voice_entries[i]) {
                 loaded_voice_entries[i] = entry;
                 loaded_voice_entry_count++;
                 break;
-- 
2.49.1

_______________________________________________
ffmpeg-devel mailing list -- ffmpeg-devel@ffmpeg.org
To unsubscribe send an email to ffmpeg-devel-leave@ffmpeg.org

                 reply	other threads:[~2025-10-04  6:59 UTC|newest]

Thread overview: [no followups] expand[flat|nested]  mbox.gz  Atom feed

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=175956113143.69.17145999310343513605@bf249f23a2c8 \
    --to=ffmpeg-devel@ffmpeg.org \
    --cc=code@ffmpeg.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

This inbox may be cloned and mirrored by anyone:

	git clone --mirror http://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 ffmpegdev ffmpegdev/ http://master.gitmailbox.com/ffmpegdev \
		ffmpegdev@gitmailbox.com
	public-inbox-index ffmpegdev

Example config snippet for mirrors.


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git