From c77d994dc44a9ef8f87dd36661201f499877fc34 Mon Sep 17 00:00:00 2001 From: Přemysl Eric Janouch Date: Sun, 11 Jun 2023 17:45:38 +0200 Subject: Rename tools, make them installable --- src/tdv-add-pronunciation.c | 469 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 469 insertions(+) create mode 100644 src/tdv-add-pronunciation.c (limited to 'src/tdv-add-pronunciation.c') diff --git a/src/tdv-add-pronunciation.c b/src/tdv-add-pronunciation.c new file mode 100644 index 0000000..90d9673 --- /dev/null +++ b/src/tdv-add-pronunciation.c @@ -0,0 +1,469 @@ +/* + * A tool to add eSpeak-generated pronunciation to dictionaries + * + * Here I use the `espeak' process rather than libespeak because of the GPL. + * It's far from ideal, rather good as a starting point. + * + * Copyright (c) 2013, Přemysl Eric Janouch + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY + * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION + * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN + * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + * + */ + +#include +#include +#include +#include + +#include +#include + +#include "stardict.h" +#include "stardict-private.h" +#include "generator.h" +#include "utils.h" + + +// --- Pronunciation generator ------------------------------------------------- + +typedef struct worker_data WorkerData; + +struct worker_data +{ + gchar **cmdline; ///< eSpeak command line + guint ignore_acronyms : 1; ///< Don't spell out acronyms + GRegex *re_stop; ///< Regex for stop sequences + GRegex *re_acronym; ///< Regex for ACRONYMS + + guint32 start_entry; ///< The first entry to be processed + guint32 end_entry; ///< Past the last entry to be processed + + // Reader, writer + GMutex *dict_mutex; ///< Locks the dictionary object + + // Reader + GThread *main_thread; ///< A handle to the reader thread + StardictDict *dict; ///< The dictionary object + gpointer output; ///< Linked-list of pronunciation data + + GMutex *remaining_mutex; ///< Locks the progress stats + GCond *remaining_cond; ///< Signals a change in progress + guint32 remaining; ///< How many entries remain + guint32 total; ///< Total number of entries + + // Writer + StardictIterator *iterator; ///< Iterates over the dictionary + FILE *child_stdin; ///< Standard input of eSpeak +}; + +/// eSpeak splits the output on certain characters. +#define LINE_SPLITTING_CHARS ".,:;?!" + +/// We don't want to include brackets either. +#define OTHER_STOP_CHARS "([{<" + +/// A void word used to make a unique "no pronunciation available" mark. +#define VOID_ENTRY "not present in any dictionary" + + +/// Adds dots between characters. +static gboolean +writer_acronym_cb (const GMatchInfo *info, GString *res, + G_GNUC_UNUSED gpointer data) +{ + gchar *preceding = g_match_info_fetch (info, 1); + g_string_append (res, preceding); + g_free (preceding); + + gchar *word = g_match_info_fetch (info, 2); + + g_string_append_c (res, *word); + const gchar *p; + for (p = word + 1; *p; p++) + { + g_string_append_c (res, '.'); + g_string_append_c (res, *p); + } + + g_free (word); + return FALSE; +} + +/// Writes to espeak's stdin. +static gpointer +worker_writer (WorkerData *data) +{ + GError *error = NULL; + GMatchInfo *match_info; + while (stardict_iterator_get_offset (data->iterator) != data->end_entry) + { + g_mutex_lock (data->dict_mutex); + const gchar *word = stardict_iterator_get_word (data->iterator); + g_mutex_unlock (data->dict_mutex); + + word += strspn (word, LINE_SPLITTING_CHARS " \t"); + gchar *x = g_strdup (word); + + // Cut the word if needed be + error = NULL; + if (g_regex_match_full (data->re_stop, + x, -1, 0, 0, &match_info, &error)) + { + gint start_pos; + g_match_info_fetch_pos (match_info, 0, &start_pos, NULL); + x[start_pos] = 0; + } + g_match_info_free (match_info); + + // Change acronyms so that they're not pronounced as words + if (!error && !data->ignore_acronyms) + { + char *tmp = g_regex_replace_eval (data->re_acronym, + x, -1, 0, 0, writer_acronym_cb, NULL, &error); + g_free (x); + x = tmp; + } + + if (error) + { + g_printerr ("Notice: error processing '%s': %s\n", + word, error->message); + g_clear_error (&error); + *x = 0; + } + + // We might have accidentally cut off everything + if (!*x) + { + g_free (x); + x = g_strdup (VOID_ENTRY); + } + + stardict_iterator_next (data->iterator); + if (fprintf (data->child_stdin, "%s\n", x) < 0) + fatal ("write to eSpeak failed: %s\n", g_strerror (errno)); + + g_free (x); + } + + g_object_unref (data->iterator); + return GINT_TO_POINTER (fclose (data->child_stdin)); +} + +/// Get the void entry (and test if espeak works). +static gchar * +get_void_entry (gchar *cmdline[]) +{ + gchar *output; + gint exit_status; + + GError *error = NULL; + if (!g_spawn_sync (NULL, cmdline, NULL, + G_SPAWN_SEARCH_PATH | G_SPAWN_STDERR_TO_DEV_NULL, NULL, NULL, + &output, NULL, &exit_status, &error)) + fatal ("Error: couldn't spawn espeak: %s\n", error->message); + + if (exit_status) + fatal ("Error: espeak returned %d\n", exit_status); + + return output; +} + +/// Reads from espeak's stdout. +static gpointer +worker (WorkerData *data) +{ + // Spawn eSpeak + GError *error = NULL; + gint child_in, child_out; + if (!g_spawn_async_with_pipes (NULL, data->cmdline, NULL, + G_SPAWN_SEARCH_PATH, NULL, NULL, + NULL, &child_in, &child_out, NULL, &error)) + fatal ("g_spawn: %s\n", error->message); + + data->child_stdin = fdopen (child_in, "wb"); + if (!data->child_stdin) + perror ("fdopen"); + + FILE *child_stdout = fdopen (child_out, "rb"); + if (!child_stdout) + perror ("fdopen"); + + // Spawn a writer thread + g_mutex_lock (data->dict_mutex); + data->iterator = stardict_iterator_new (data->dict, data->start_entry); + g_mutex_unlock (data->dict_mutex); + + GThread *writer = g_thread_new ("write worker", + (GThreadFunc) worker_writer, data); + + // Read the output + g_mutex_lock (data->remaining_mutex); + guint32 remaining = data->remaining; + g_mutex_unlock (data->remaining_mutex); + + data->output = NULL; + gpointer *output_end = &data->output; + while (remaining) + { + static gchar next[sizeof (gpointer)]; + GString *s = g_string_new (NULL); + g_string_append_len (s, next, sizeof next); + + gint c; + while ((c = fgetc (child_stdout)) != EOF && c != '\n') + g_string_append_c (s, c); + if (c == EOF) + fatal ("eSpeak process died too soon\n"); + + gchar *translation = g_string_free (s, FALSE); + *output_end = translation; + output_end = (gpointer *) translation; + + // We limit progress reporting so that + // the mutex doesn't spin like crazy + if ((--remaining & 255) != 0) + continue; + + g_mutex_lock (data->remaining_mutex); + data->remaining = remaining; + g_cond_broadcast (data->remaining_cond); + g_mutex_unlock (data->remaining_mutex); + } + + if (fgetc (child_stdout) != EOF) + fatal ("Error: eSpeak has written more lines than it should. " + "The output would be corrupt, aborting.\n"); + + fclose (child_stdout); + return g_thread_join (writer); +} + +// --- Main -------------------------------------------------------------------- + +int +main (int argc, char *argv[]) +{ + gint n_processes = 1; + gchar *voice = NULL; + gboolean ignore_acronyms = FALSE; + + GOptionEntry entries[] = + { + { "processes", 'N', G_OPTION_FLAG_IN_MAIN, + G_OPTION_ARG_INT, &n_processes, + "The number of espeak processes run in parallel", "PROCESSES" }, + { "voice", 'v', G_OPTION_FLAG_IN_MAIN, + G_OPTION_ARG_STRING, &voice, + "The voice to be used by eSpeak to pronounce the words", "VOICE" }, + { "ignore-acronyms", 0, G_OPTION_FLAG_IN_MAIN, + G_OPTION_ARG_NONE, &ignore_acronyms, + "Don't spell out words composed of big letters only", NULL }, + { NULL } + }; + +G_GNUC_BEGIN_IGNORE_DEPRECATIONS + if (glib_check_version (2, 36, 0)) + g_type_init (); +G_GNUC_END_IGNORE_DEPRECATIONS + + GError *error = NULL; + GOptionContext *ctx = g_option_context_new + ("input.ifo output-basename - add pronunciation to dictionaries"); + g_option_context_add_main_entries (ctx, entries, NULL); + if (!g_option_context_parse (ctx, &argc, &argv, &error)) + fatal ("Error: option parsing failed: %s\n", error->message); + + if (argc != 3) + fatal ("%s", g_option_context_get_help (ctx, TRUE, NULL)); + + g_option_context_free (ctx); + + // See if we can run espeak + static gchar *cmdline[] = { "espeak", "--ipa", "-q", NULL, NULL, NULL }; + + if (voice) + { + cmdline[3] = "-v"; + cmdline[4] = voice; + } + + gchar *void_entry = g_strstrip (get_void_entry (cmdline)); + + // Load the dictionary + printf ("Loading the original dictionary...\n"); + StardictDict *dict = stardict_dict_new (argv[1], &error); + if (!dict) + fatal ("Error: opening the dictionary failed: %s\n", error->message); + + gsize n_words = stardict_info_get_word_count + (stardict_dict_get_info (dict)); + + if (n_processes <= 0) + fatal ("Error: there must be at least one process\n"); + + if ((gsize) n_processes > n_words * 1024) + { + n_processes = n_words / 1024; + if (!n_processes) + n_processes = 1; + g_printerr ("Warning: too many processes, reducing to %d\n", + n_processes); + } + + // Spawn worker threads to generate pronunciation data + static GMutex dict_mutex; + + static GMutex remaining_mutex; + static GCond remaining_cond; + + WorkerData *data = g_alloca (sizeof *data * n_processes); + + GRegex *re_stop = g_regex_new ("[" LINE_SPLITTING_CHARS "][ ?]" + "|\\.\\.\\.|[" OTHER_STOP_CHARS "]", G_REGEX_OPTIMIZE, 0, &error); + g_assert (re_stop != NULL); + + GRegex *re_acronym = g_regex_new ("(^|\\pZ)(\\p{Lu}+)(?=\\pZ|$)", + G_REGEX_OPTIMIZE, 0, &error); + g_assert (re_acronym != NULL); + + gint i; + for (i = 0; i < n_processes; i++) + { + data[i].start_entry = n_words * i / n_processes; + data[i].end_entry = n_words * (i + 1) / n_processes; + + data[i].total = data[i].remaining = + data[i].end_entry - data[i].start_entry; + data[i].remaining_mutex = &remaining_mutex; + data[i].remaining_cond = &remaining_cond; + + data[i].dict = dict; + data[i].dict_mutex = &dict_mutex; + + data[i].re_stop = re_stop; + data[i].re_acronym = re_acronym; + + data[i].cmdline = cmdline; + data[i].ignore_acronyms = ignore_acronyms; + data[i].main_thread = + g_thread_new ("worker", (GThreadFunc) worker, &data[i]); + } + + // Loop while the threads still have some work to do and report status + g_mutex_lock (&remaining_mutex); + for (;;) + { + gboolean all_finished = TRUE; + printf ("\rRetrieving pronunciation... "); + for (i = 0; i < n_processes; i++) + { + printf ("%3u%% ", 100 - data[i].remaining * 100 / data[i].total); + if (data[i].remaining) + all_finished = FALSE; + } + + if (all_finished) + break; + g_cond_wait (&remaining_cond, &remaining_mutex); + } + g_mutex_unlock (&remaining_mutex); + + putchar ('\n'); + for (i = 0; i < n_processes; i++) + g_thread_join (data[i].main_thread); + + g_regex_unref (re_stop); + g_regex_unref (re_acronym); + + // Put extended entries into a new dictionary + Generator *generator = generator_new (argv[2], &error); + if (!generator) + fatal ("Error: failed to create the output dictionary: %s\n", + error->message); + + StardictInfo *info = generator->info; + stardict_info_copy (info, stardict_dict_get_info (dict)); + + // This gets incremented each time an entry is finished + info->word_count = 0; + + if (info->same_type_sequence) + { + gchar *new_sts = g_strconcat ("t", info->same_type_sequence, NULL); + g_free (info->same_type_sequence); + info->same_type_sequence = new_sts; + } + + // Write out all the entries together with the pronunciation + for (i = 0; i < n_processes; i++) + { + StardictIterator *iterator = + stardict_iterator_new (dict, data[i].start_entry); + + gpointer *output = data[i].output; + while (stardict_iterator_get_offset (iterator) != data[i].end_entry) + { + printf ("\rCreating a new dictionary... %3lu%%", + (gulong) stardict_iterator_get_offset (iterator) * 100 + / stardict_dict_get_info (dict)->word_count); + + g_assert (output != NULL); + + gchar *pronunciation = g_strstrip ((gchar *) (output + 1)); + StardictEntry *entry = stardict_iterator_get_entry (iterator); + + generator_begin_entry (generator); + + if (!strcmp (pronunciation, void_entry)) + *pronunciation = 0; + +// g_printerr ("%s /%s/\n", +// stardict_iterator_get_word (iterator), pronunciation); + + // For the sake of simplicity we fake a new start; + // write_fields() only iterates the list in one direction. + StardictEntryField field; + field.type = 't'; + field.data = pronunciation; + + GList start_link; + start_link.next = entry->fields; + start_link.data = &field; + + if (!generator_write_fields (generator, &start_link, &error) + || !generator_finish_entry (generator, + stardict_iterator_get_word (iterator), &error)) + fatal ("Error: write failed: %s\n", error->message); + + g_object_unref (entry); + + gpointer *tmp = output; + output = *output; + g_free (tmp); + + stardict_iterator_next (iterator); + } + + g_assert (output == NULL); + g_object_unref (iterator); + } + + putchar ('\n'); + if (!generator_finish (generator, &error)) + fatal ("Error: failed to write the dictionary: %s\n", error->message); + + generator_free (generator); + g_object_unref (dict); + g_free (void_entry); + return 0; +} -- cgit v1.2.3-70-g09d2