diff options
Diffstat (limited to 'src/add-pronunciation.c')
-rw-r--r-- | src/add-pronunciation.c | 469 |
1 files changed, 0 insertions, 469 deletions
diff --git a/src/add-pronunciation.c b/src/add-pronunciation.c deleted file mode 100644 index 90d9673..0000000 --- a/src/add-pronunciation.c +++ /dev/null @@ -1,469 +0,0 @@ -/* - * A tool to add eSpeak-generated pronunciation to dictionaries - * - * Here I use the `espeak' process rather than libespeak because of the GPL. - * It's far from ideal, rather good as a starting point. - * - * Copyright (c) 2013, Přemysl Eric Janouch <p@janouch.name> - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY - * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION - * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN - * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - * - */ - -#include <stdio.h> -#include <stdlib.h> -#include <string.h> -#include <errno.h> - -#include <glib.h> -#include <gio/gio.h> - -#include "stardict.h" -#include "stardict-private.h" -#include "generator.h" -#include "utils.h" - - -// --- Pronunciation generator ------------------------------------------------- - -typedef struct worker_data WorkerData; - -struct worker_data -{ - gchar **cmdline; ///< eSpeak command line - guint ignore_acronyms : 1; ///< Don't spell out acronyms - GRegex *re_stop; ///< Regex for stop sequences - GRegex *re_acronym; ///< Regex for ACRONYMS - - guint32 start_entry; ///< The first entry to be processed - guint32 end_entry; ///< Past the last entry to be processed - - // Reader, writer - GMutex *dict_mutex; ///< Locks the dictionary object - - // Reader - GThread *main_thread; ///< A handle to the reader thread - StardictDict *dict; ///< The dictionary object - gpointer output; ///< Linked-list of pronunciation data - - GMutex *remaining_mutex; ///< Locks the progress stats - GCond *remaining_cond; ///< Signals a change in progress - guint32 remaining; ///< How many entries remain - guint32 total; ///< Total number of entries - - // Writer - StardictIterator *iterator; ///< Iterates over the dictionary - FILE *child_stdin; ///< Standard input of eSpeak -}; - -/// eSpeak splits the output on certain characters. -#define LINE_SPLITTING_CHARS ".,:;?!" - -/// We don't want to include brackets either. -#define OTHER_STOP_CHARS "([{<" - -/// A void word used to make a unique "no pronunciation available" mark. -#define VOID_ENTRY "not present in any dictionary" - - -/// Adds dots between characters. -static gboolean -writer_acronym_cb (const GMatchInfo *info, GString *res, - G_GNUC_UNUSED gpointer data) -{ - gchar *preceding = g_match_info_fetch (info, 1); - g_string_append (res, preceding); - g_free (preceding); - - gchar *word = g_match_info_fetch (info, 2); - - g_string_append_c (res, *word); - const gchar *p; - for (p = word + 1; *p; p++) - { - g_string_append_c (res, '.'); - g_string_append_c (res, *p); - } - - g_free (word); - return FALSE; -} - -/// Writes to espeak's stdin. -static gpointer -worker_writer (WorkerData *data) -{ - GError *error = NULL; - GMatchInfo *match_info; - while (stardict_iterator_get_offset (data->iterator) != data->end_entry) - { - g_mutex_lock (data->dict_mutex); - const gchar *word = stardict_iterator_get_word (data->iterator); - g_mutex_unlock (data->dict_mutex); - - word += strspn (word, LINE_SPLITTING_CHARS " \t"); - gchar *x = g_strdup (word); - - // Cut the word if needed be - error = NULL; - if (g_regex_match_full (data->re_stop, - x, -1, 0, 0, &match_info, &error)) - { - gint start_pos; - g_match_info_fetch_pos (match_info, 0, &start_pos, NULL); - x[start_pos] = 0; - } - g_match_info_free (match_info); - - // Change acronyms so that they're not pronounced as words - if (!error && !data->ignore_acronyms) - { - char *tmp = g_regex_replace_eval (data->re_acronym, - x, -1, 0, 0, writer_acronym_cb, NULL, &error); - g_free (x); - x = tmp; - } - - if (error) - { - g_printerr ("Notice: error processing '%s': %s\n", - word, error->message); - g_clear_error (&error); - *x = 0; - } - - // We might have accidentally cut off everything - if (!*x) - { - g_free (x); - x = g_strdup (VOID_ENTRY); - } - - stardict_iterator_next (data->iterator); - if (fprintf (data->child_stdin, "%s\n", x) < 0) - fatal ("write to eSpeak failed: %s\n", g_strerror (errno)); - - g_free (x); - } - - g_object_unref (data->iterator); - return GINT_TO_POINTER (fclose (data->child_stdin)); -} - -/// Get the void entry (and test if espeak works). -static gchar * -get_void_entry (gchar *cmdline[]) -{ - gchar *output; - gint exit_status; - - GError *error = NULL; - if (!g_spawn_sync (NULL, cmdline, NULL, - G_SPAWN_SEARCH_PATH | G_SPAWN_STDERR_TO_DEV_NULL, NULL, NULL, - &output, NULL, &exit_status, &error)) - fatal ("Error: couldn't spawn espeak: %s\n", error->message); - - if (exit_status) - fatal ("Error: espeak returned %d\n", exit_status); - - return output; -} - -/// Reads from espeak's stdout. -static gpointer -worker (WorkerData *data) -{ - // Spawn eSpeak - GError *error = NULL; - gint child_in, child_out; - if (!g_spawn_async_with_pipes (NULL, data->cmdline, NULL, - G_SPAWN_SEARCH_PATH, NULL, NULL, - NULL, &child_in, &child_out, NULL, &error)) - fatal ("g_spawn: %s\n", error->message); - - data->child_stdin = fdopen (child_in, "wb"); - if (!data->child_stdin) - perror ("fdopen"); - - FILE *child_stdout = fdopen (child_out, "rb"); - if (!child_stdout) - perror ("fdopen"); - - // Spawn a writer thread - g_mutex_lock (data->dict_mutex); - data->iterator = stardict_iterator_new (data->dict, data->start_entry); - g_mutex_unlock (data->dict_mutex); - - GThread *writer = g_thread_new ("write worker", - (GThreadFunc) worker_writer, data); - - // Read the output - g_mutex_lock (data->remaining_mutex); - guint32 remaining = data->remaining; - g_mutex_unlock (data->remaining_mutex); - - data->output = NULL; - gpointer *output_end = &data->output; - while (remaining) - { - static gchar next[sizeof (gpointer)]; - GString *s = g_string_new (NULL); - g_string_append_len (s, next, sizeof next); - - gint c; - while ((c = fgetc (child_stdout)) != EOF && c != '\n') - g_string_append_c (s, c); - if (c == EOF) - fatal ("eSpeak process died too soon\n"); - - gchar *translation = g_string_free (s, FALSE); - *output_end = translation; - output_end = (gpointer *) translation; - - // We limit progress reporting so that - // the mutex doesn't spin like crazy - if ((--remaining & 255) != 0) - continue; - - g_mutex_lock (data->remaining_mutex); - data->remaining = remaining; - g_cond_broadcast (data->remaining_cond); - g_mutex_unlock (data->remaining_mutex); - } - - if (fgetc (child_stdout) != EOF) - fatal ("Error: eSpeak has written more lines than it should. " - "The output would be corrupt, aborting.\n"); - - fclose (child_stdout); - return g_thread_join (writer); -} - -// --- Main -------------------------------------------------------------------- - -int -main (int argc, char *argv[]) -{ - gint n_processes = 1; - gchar *voice = NULL; - gboolean ignore_acronyms = FALSE; - - GOptionEntry entries[] = - { - { "processes", 'N', G_OPTION_FLAG_IN_MAIN, - G_OPTION_ARG_INT, &n_processes, - "The number of espeak processes run in parallel", "PROCESSES" }, - { "voice", 'v', G_OPTION_FLAG_IN_MAIN, - G_OPTION_ARG_STRING, &voice, - "The voice to be used by eSpeak to pronounce the words", "VOICE" }, - { "ignore-acronyms", 0, G_OPTION_FLAG_IN_MAIN, - G_OPTION_ARG_NONE, &ignore_acronyms, - "Don't spell out words composed of big letters only", NULL }, - { NULL } - }; - -G_GNUC_BEGIN_IGNORE_DEPRECATIONS - if (glib_check_version (2, 36, 0)) - g_type_init (); -G_GNUC_END_IGNORE_DEPRECATIONS - - GError *error = NULL; - GOptionContext *ctx = g_option_context_new - ("input.ifo output-basename - add pronunciation to dictionaries"); - g_option_context_add_main_entries (ctx, entries, NULL); - if (!g_option_context_parse (ctx, &argc, &argv, &error)) - fatal ("Error: option parsing failed: %s\n", error->message); - - if (argc != 3) - fatal ("%s", g_option_context_get_help (ctx, TRUE, NULL)); - - g_option_context_free (ctx); - - // See if we can run espeak - static gchar *cmdline[] = { "espeak", "--ipa", "-q", NULL, NULL, NULL }; - - if (voice) - { - cmdline[3] = "-v"; - cmdline[4] = voice; - } - - gchar *void_entry = g_strstrip (get_void_entry (cmdline)); - - // Load the dictionary - printf ("Loading the original dictionary...\n"); - StardictDict *dict = stardict_dict_new (argv[1], &error); - if (!dict) - fatal ("Error: opening the dictionary failed: %s\n", error->message); - - gsize n_words = stardict_info_get_word_count - (stardict_dict_get_info (dict)); - - if (n_processes <= 0) - fatal ("Error: there must be at least one process\n"); - - if ((gsize) n_processes > n_words * 1024) - { - n_processes = n_words / 1024; - if (!n_processes) - n_processes = 1; - g_printerr ("Warning: too many processes, reducing to %d\n", - n_processes); - } - - // Spawn worker threads to generate pronunciation data - static GMutex dict_mutex; - - static GMutex remaining_mutex; - static GCond remaining_cond; - - WorkerData *data = g_alloca (sizeof *data * n_processes); - - GRegex *re_stop = g_regex_new ("[" LINE_SPLITTING_CHARS "][ ?]" - "|\\.\\.\\.|[" OTHER_STOP_CHARS "]", G_REGEX_OPTIMIZE, 0, &error); - g_assert (re_stop != NULL); - - GRegex *re_acronym = g_regex_new ("(^|\\pZ)(\\p{Lu}+)(?=\\pZ|$)", - G_REGEX_OPTIMIZE, 0, &error); - g_assert (re_acronym != NULL); - - gint i; - for (i = 0; i < n_processes; i++) - { - data[i].start_entry = n_words * i / n_processes; - data[i].end_entry = n_words * (i + 1) / n_processes; - - data[i].total = data[i].remaining = - data[i].end_entry - data[i].start_entry; - data[i].remaining_mutex = &remaining_mutex; - data[i].remaining_cond = &remaining_cond; - - data[i].dict = dict; - data[i].dict_mutex = &dict_mutex; - - data[i].re_stop = re_stop; - data[i].re_acronym = re_acronym; - - data[i].cmdline = cmdline; - data[i].ignore_acronyms = ignore_acronyms; - data[i].main_thread = - g_thread_new ("worker", (GThreadFunc) worker, &data[i]); - } - - // Loop while the threads still have some work to do and report status - g_mutex_lock (&remaining_mutex); - for (;;) - { - gboolean all_finished = TRUE; - printf ("\rRetrieving pronunciation... "); - for (i = 0; i < n_processes; i++) - { - printf ("%3u%% ", 100 - data[i].remaining * 100 / data[i].total); - if (data[i].remaining) - all_finished = FALSE; - } - - if (all_finished) - break; - g_cond_wait (&remaining_cond, &remaining_mutex); - } - g_mutex_unlock (&remaining_mutex); - - putchar ('\n'); - for (i = 0; i < n_processes; i++) - g_thread_join (data[i].main_thread); - - g_regex_unref (re_stop); - g_regex_unref (re_acronym); - - // Put extended entries into a new dictionary - Generator *generator = generator_new (argv[2], &error); - if (!generator) - fatal ("Error: failed to create the output dictionary: %s\n", - error->message); - - StardictInfo *info = generator->info; - stardict_info_copy (info, stardict_dict_get_info (dict)); - - // This gets incremented each time an entry is finished - info->word_count = 0; - - if (info->same_type_sequence) - { - gchar *new_sts = g_strconcat ("t", info->same_type_sequence, NULL); - g_free (info->same_type_sequence); - info->same_type_sequence = new_sts; - } - - // Write out all the entries together with the pronunciation - for (i = 0; i < n_processes; i++) - { - StardictIterator *iterator = - stardict_iterator_new (dict, data[i].start_entry); - - gpointer *output = data[i].output; - while (stardict_iterator_get_offset (iterator) != data[i].end_entry) - { - printf ("\rCreating a new dictionary... %3lu%%", - (gulong) stardict_iterator_get_offset (iterator) * 100 - / stardict_dict_get_info (dict)->word_count); - - g_assert (output != NULL); - - gchar *pronunciation = g_strstrip ((gchar *) (output + 1)); - StardictEntry *entry = stardict_iterator_get_entry (iterator); - - generator_begin_entry (generator); - - if (!strcmp (pronunciation, void_entry)) - *pronunciation = 0; - -// g_printerr ("%s /%s/\n", -// stardict_iterator_get_word (iterator), pronunciation); - - // For the sake of simplicity we fake a new start; - // write_fields() only iterates the list in one direction. - StardictEntryField field; - field.type = 't'; - field.data = pronunciation; - - GList start_link; - start_link.next = entry->fields; - start_link.data = &field; - - if (!generator_write_fields (generator, &start_link, &error) - || !generator_finish_entry (generator, - stardict_iterator_get_word (iterator), &error)) - fatal ("Error: write failed: %s\n", error->message); - - g_object_unref (entry); - - gpointer *tmp = output; - output = *output; - g_free (tmp); - - stardict_iterator_next (iterator); - } - - g_assert (output == NULL); - g_object_unref (iterator); - } - - putchar ('\n'); - if (!generator_finish (generator, &error)) - fatal ("Error: failed to write the dictionary: %s\n", error->message); - - generator_free (generator); - g_object_unref (dict); - g_free (void_entry); - return 0; -} |