From e783f31de91771d0f94402f9b9f088dae07bc084 Mon Sep 17 00:00:00 2001 From: Přemysl Janouch Date: Fri, 10 May 2013 02:06:57 +0200 Subject: Finish the add-pronunciation tool --- src/add-pronunciation.c | 339 ++++++++++++++++++++++++++++++++++++++++++++---- src/generator.c | 4 + 2 files changed, 319 insertions(+), 24 deletions(-) (limited to 'src') diff --git a/src/add-pronunciation.c b/src/add-pronunciation.c index 3b0a6ce..2abf8e6 100644 --- a/src/add-pronunciation.c +++ b/src/add-pronunciation.c @@ -2,6 +2,7 @@ * A tool to add eSpeak-generated pronunciation to dictionaries * * Here I use the `espeak' process rather than libespeak because of the GPL. + * It's far from ideal, rather good as a starting point. * * Copyright (c) 2013, Přemysl Janouch * All rights reserved. @@ -39,6 +40,11 @@ typedef struct worker_data WorkerData; struct worker_data { + gchar **cmdline; //! eSpeak command line + guint ignore_acronyms : 1; //! Don't spell out acronyms + GRegex *re_stop; //! Regex for stop sequences + GRegex *re_acronym; //! Regex for ACRONYMS + guint32 start_entry; //! The first entry to be processed guint32 end_entry; //! Past the last entry to be processed @@ -53,41 +59,140 @@ struct worker_data GMutex *remaining_mutex; //! Locks the progress stats GCond *remaining_cond; //! Signals a change in progress guint32 remaining; //! How many entries remain + guint32 total; //! Total number of entries /* Writer */ StardictIterator *iterator; //! Iterates over the dictionary FILE *child_stdin; //! Standard input of eSpeak }; +/** eSpeak splits the output on certain characters. */ +#define LINE_SPLITTING_CHARS ".,:;?!" + +/** We don't want to include brackets either. */ +#define OTHER_STOP_CHARS "([{<" + +/** A void word used to make a unique "no pronunciation available" mark. */ +#define VOID_ENTRY "not present in any dictionary" + + +/** Adds dots between characters. 
*/
+/* GRegex eval callback used with re_acronym: copies capture group 1 (the text
+ * preceding the acronym) verbatim, then capture group 2 (the acronym itself)
+ * with a '.' between every two characters.
+ * Returns FALSE so that the replacement process continues. */
+static gboolean
+writer_acronym_cb (const GMatchInfo *info, GString *res,
+	G_GNUC_UNUSED gpointer data)
+{
+	gchar *preceding = g_match_info_fetch (info, 1);
+	g_string_append (res, preceding);
+	g_free (preceding);
+
+	gchar *word = g_match_info_fetch (info, 2);
+
+	/* The regex matches in UTF-8 mode, so the acronym may contain multi-byte
+	 * letters; step by whole characters so no dot lands inside a sequence */
+	const gchar *p = word;
+	while (*p)
+	{
+		const gchar *next = g_utf8_next_char (p);
+		if (p != word)
+			g_string_append_c (res, '.');
+		g_string_append_len (res, p, next - p);
+		p = next;
+	}
+
+	g_free (word);
+	return FALSE;
+}
+
 /** Writes to espeak's stdin. */
 static gpointer
 worker_writer (WorkerData *data)
 {
+	GError *error = NULL;
+	GMatchInfo *match_info = NULL;
 	while (stardict_iterator_get_offset (data->iterator) != data->end_entry)
 	{
 		g_mutex_lock (data->dict_mutex);
 		const gchar *word = stardict_iterator_get_word (data->iterator);
 		g_mutex_unlock (data->dict_mutex);
 
+		/* Skip leading line-splitting characters and blanks */
+		word += strspn (word, LINE_SPLITTING_CHARS " \t");
+		gchar *x = g_strdup (word);
+
+		/* Cut the word if need be */
+		error = NULL;
+		match_info = NULL;
+		if (g_regex_match_full (data->re_stop,
+			x, -1, 0, 0, &match_info, &error))
+		{
+			gint start_pos;
+			g_match_info_fetch_pos (match_info, 0, &start_pos, NULL);
+			x[start_pos] = 0;
+		}
+		g_match_info_free (match_info);
+
+		/* Change acronyms so that they're not pronounced as words */
+		if (!error && !data->ignore_acronyms)
+		{
+			char *tmp = g_regex_replace_eval (data->re_acronym,
+				x, -1, 0, 0, writer_acronym_cb, NULL, &error);
+			g_free (x);
+			x = tmp;
+		}
+
+		if (error)
+		{
+			g_printerr ("Notice: error processing '%s': %s\n",
+				word, error->message);
+			g_clear_error (&error);
+
+			/* A failed replacement leaves x == NULL, so there is nothing
+			 * to truncate; drop it and take the fallback below instead */
+			g_free (x);
+			x = NULL;
+		}
+
+		/* We might have accidentally cut off everything */
+		if (!x || !*x)
+		{
+			g_free (x);
+			x = g_strdup (VOID_ENTRY);
+		}
+
 		stardict_iterator_next (data->iterator);
-		if (fprintf (data->child_stdin, "%s\n", word) < 0)
+		if (fprintf (data->child_stdin, "%s\n", x) < 0)
 			g_error ("write to eSpeak failed: %s", strerror (errno));
+
+		g_free (x);
 	}
 
 	g_object_unref (data->iterator);
 	return GINT_TO_POINTER (fclose (data->child_stdin));
 }
 
+/** Get the void entry (and test if 
espeak works). */
+/* Runs `cmdline' synchronously with stderr discarded and returns what the
+ * child wrote to its standard output; the caller releases it with g_free().
+ * Terminates the program when the child cannot be spawned or when a non-zero
+ * exit status is reported. */
+static gchar *
+get_void_entry (gchar *cmdline[])
+{
+	gchar *output = NULL;
+	gint exit_status = 0;
+
+	/* GLib requires the GError location to hold NULL before the call */
+	GError *error = NULL;
+	if (!g_spawn_sync (NULL, cmdline, NULL,
+		G_SPAWN_SEARCH_PATH | G_SPAWN_STDERR_TO_DEV_NULL, NULL, NULL,
+		&output, NULL, &exit_status, &error))
+	{
+		g_printerr ("Error: couldn't spawn espeak: %s\n", error->message);
+		exit (EXIT_FAILURE);
+	}
+
+	if (exit_status)
+	{
+		g_printerr ("Error: espeak returned %d\n", exit_status);
+		exit (EXIT_FAILURE);
+	}
+
+	return output;
+}
+
 /** Reads from espeak's stdout. */
 static gpointer
 worker (WorkerData *data)
 {
 	/* Spawn eSpeak */
-	static gchar *cmdline[] = { "espeak", "--ipa", "-q", NULL };
-	gint child_in, child_out;
-
 	GError *error;
-	if (!g_spawn_async_with_pipes (NULL, cmdline, NULL,
+	gint child_in, child_out;
+	if (!g_spawn_async_with_pipes (NULL, data->cmdline, NULL,
 		G_SPAWN_SEARCH_PATH, NULL, NULL, NULL,
 		&child_in, &child_out, NULL, &error))
 		g_error ("g_spawn() failed: %s", error->message);
@@ -133,7 +238,7 @@ worker (WorkerData *data)
 
 		/* We limit progress reporting so that
 		 * the mutex doesn't spin like crazy */
-		if ((--remaining & 1023) != 0)
+		if ((--remaining & 255) != 0)
 			continue;
 
 		g_mutex_lock (data->remaining_mutex);
@@ -142,47 +247,133 @@ worker (WorkerData *data)
 		g_mutex_unlock (data->remaining_mutex);
 	}
 
+	if (fgetc (child_stdout) != EOF)
+	{
+		g_printerr ("Error: eSpeak has written more lines than it should. "
+			"The output would be corrupt, aborting.\n");
+		exit (EXIT_FAILURE);
+	}
+
 	fclose (child_stdout);
 	return g_thread_join (writer);
 }
 
 // --- Main --------------------------------------------------------------------
 
+/** Copy the contents of one StardictInfo object into another. Ignores path. 
*/
+static void
+stardict_info_copy (StardictInfo *dest, const StardictInfo *src)
+{
+	dest->version = src->version;
+
+	/* Walk the table of known .ifo keys and copy each member by its offset */
+	guint i;
+	for (i = 0; i < _stardict_ifo_keys_length; i++)
+	{
+		const struct stardict_ifo_key *key = &_stardict_ifo_keys[i];
+		if (key->type == IFO_STRING)
+		{
+			gchar **p = &G_STRUCT_MEMBER (gchar *, dest, key->offset);
+			gchar *q = G_STRUCT_MEMBER (gchar *, src, key->offset);
+
+			/* Release the previous destination string and store a duplicate
+			 * of the source one, or NULL when the source has none */
+			g_free (*p);
+			*p = q ? g_strdup (q) : NULL;
+		}
+		else
+			/* Every other key type is copied as a gulong value */
+			G_STRUCT_MEMBER (gulong, dest, key->offset) =
+				G_STRUCT_MEMBER (gulong, src, key->offset);
+	}
+}
+
+/** Write a list of data fields back to a dictionary.
+ *
+ * @param generator  Generator that receives the output
+ * @param fields     List of StardictEntryField items, followed via ->next
+ * @param sts        If FALSE, the type of each field is written before its
+ *                   data and the mark_end flag is set for every field;
+ *                   if TRUE, no types are written and mark_end is cleared
+ *                   for the last field only
+ * @param error      Handed on to the generator_write_*() calls
+ * @return TRUE if everything was written, FALSE as soon as one write fails
+ */
+static gboolean
+write_fields (Generator *generator, GList *fields, gboolean sts, GError **error)
+{
+	while (fields)
+	{
+		StardictEntryField *field = fields->data;
+		if (!sts && !generator_write_type (generator, field->type, error))
+			return FALSE;
+
+		gboolean mark_end = !sts || fields->next != NULL;
+		/* Lower-case field types go out as strings, all others as raw
+		 * data of field->data_size bytes */
+		if (g_ascii_islower (field->type))
+		{
+			if (!generator_write_string (generator,
+				field->data, mark_end, error))
+				return FALSE;
+		}
+		else if (!generator_write_raw (generator,
+			field->data, field->data_size, mark_end, error))
+			return FALSE;
+
+		fields = fields->next;
+	}
+	return TRUE;
+}
+
 int
 main (int argc, char *argv[])
 {
 	gint n_processes = 1;
+	gchar *voice = NULL;
+	gboolean ignore_acronyms = FALSE;
 
 	GOptionEntry entries[] =
 	{
 		{ "processes", 'N', G_OPTION_FLAG_IN_MAIN,
 		  G_OPTION_ARG_INT, &n_processes,
-		  "the number of espeak processes run in parallel", "PROCESSES" },
+		  "The number of espeak processes run in parallel", "PROCESSES" },
+		{ "voice", 'v', G_OPTION_FLAG_IN_MAIN,
+		  G_OPTION_ARG_STRING, &voice,
+		  "The voice to be used by eSpeak to pronounce the words", "VOICE" },
+		{ "ignore-acronyms", 0, G_OPTION_FLAG_IN_MAIN,
+		  G_OPTION_ARG_NONE, &ignore_acronyms,
+		  "Don't spell out words composed of big letters only", NULL },
 		{ NULL }
 	};
 
+G_GNUC_BEGIN_IGNORE_DEPRECATIONS
+	if (glib_check_version (2, 36, 0))
+		g_type_init ();
+G_GNUC_END_IGNORE_DEPRECATIONS + GError *error = NULL; GOptionContext *ctx = g_option_context_new - ("input.ifo output.ifo - add pronunciation to dictionaries"); + ("input.ifo output-basename - add pronunciation to dictionaries"); g_option_context_add_main_entries (ctx, entries, NULL); if (!g_option_context_parse (ctx, &argc, &argv, &error)) { - g_print ("option parsing failed: %s\n", error->message); + g_printerr ("Error: option parsing failed: %s\n", error->message); exit (EXIT_FAILURE); } if (argc != 3) { gchar *help = g_option_context_get_help (ctx, TRUE, FALSE); - g_print ("%s", help); + g_printerr ("%s", help); g_free (help); exit (EXIT_FAILURE); } + g_option_context_free (ctx); + + /* See if we can run espeak */ + static gchar *cmdline[] = { "espeak", "--ipa", "-q", NULL, NULL, NULL }; + + if (voice) + { + cmdline[3] = "-v"; + cmdline[4] = voice; + } + + gchar *void_entry = g_strstrip (get_void_entry (cmdline)); + + /* Load the dictionary */ + printf ("Loading the original dictionary...\n"); StardictDict *dict = stardict_dict_new (argv[1], &error); if (!dict) { - g_printerr ("opening the dictionary failed: %s\n", error->message); + g_printerr ("Error: opening the dictionary failed: %s\n", + error->message); exit (EXIT_FAILURE); } @@ -204,7 +395,7 @@ main (int argc, char *argv[]) n_processes); } - /* Spawn worker threads to generate pronunciations */ + /* Spawn worker threads to generate pronunciation data */ static GMutex dict_mutex; static GMutex remaining_mutex; @@ -212,20 +403,35 @@ main (int argc, char *argv[]) WorkerData *data = g_alloca (sizeof *data * n_processes); + GRegex *re_stop = g_regex_new ("[" LINE_SPLITTING_CHARS "][ ?]" + "|\\.\\.\\.|[" OTHER_STOP_CHARS "]", G_REGEX_OPTIMIZE, 0, &error); + g_assert (re_stop != NULL); + + GRegex *re_acronym = g_regex_new ("(^|\\pZ)(\\p{Lu}+)(?=\\pZ|$)", + G_REGEX_OPTIMIZE, 0, &error); + g_assert (re_acronym != NULL); + gint i; for (i = 0; i < n_processes; i++) { - data[i].start_entry = (n_words - 1) * i / 
n_processes; - data[i].end_entry = (n_words - 1) * (i + 1) / n_processes; + data[i].start_entry = n_words * i / n_processes; + data[i].end_entry = n_words * (i + 1) / n_processes; - data[i].remaining = data[i].end_entry - data[i].start_entry; + data[i].total = data[i].remaining = + data[i].end_entry - data[i].start_entry; data[i].remaining_mutex = &remaining_mutex; data[i].remaining_cond = &remaining_cond; data[i].dict = dict; data[i].dict_mutex = &dict_mutex; - data->main_thread = g_thread_new ("worker", (GThreadFunc) worker, data); + data[i].re_stop = re_stop; + data[i].re_acronym = re_acronym; + + data[i].cmdline = cmdline; + data[i].ignore_acronyms = ignore_acronyms; + data[i].main_thread = + g_thread_new ("worker", (GThreadFunc) worker, &data[i]); } /* Loop while the threads still have some work to do and report status */ @@ -236,8 +442,7 @@ main (int argc, char *argv[]) printf ("\rRetrieving pronunciation... "); for (i = 0; i < n_processes; i++) { - printf ("%3u%% ", data[i].remaining * 100 - / (data[i].end_entry - data[i].start_entry)); + printf ("%3u%% ", 100 - data[i].remaining * 100 / data[i].total); if (data[i].remaining) all_finished = FALSE; } @@ -248,17 +453,103 @@ main (int argc, char *argv[]) } g_mutex_unlock (&remaining_mutex); + putchar ('\n'); for (i = 0; i < n_processes; i++) g_thread_join (data[i].main_thread); - // TODO after all processing is done, the program will go through the whole - // dictionary and put extended data entries into a new one. - StardictIterator *iterator = stardict_iterator_new (dict, 0); - while (stardict_iterator_is_valid (iterator)) + g_regex_unref (re_stop); + g_regex_unref (re_acronym); + + /* Put extended entries into a new dictionary */ + Generator *generator = generator_new (argv[2], &error); + if (!generator) { - // ... 
- stardict_iterator_next (iterator); + g_printerr ("Error: failed to create the output dictionary: %s\n", + error->message); + exit (EXIT_FAILURE); + } + + StardictInfo *info = generator->info; + stardict_info_copy (info, stardict_dict_get_info (dict)); + + /* This gets incremented each time an entry is finished */ + info->word_count = 0; + + if (info->same_type_sequence) + { + gchar *new_sts = g_strconcat ("t", info->same_type_sequence, NULL); + g_free (info->same_type_sequence); + info->same_type_sequence = new_sts; + } + + /* Write out all the entries together with the pronunciation */ + for (i = 0; i < n_processes; i++) + { + StardictIterator *iterator = + stardict_iterator_new (dict, data[i].start_entry); + + gpointer *output = data[i].output; + while (stardict_iterator_get_offset (iterator) != data[i].end_entry) + { + printf ("\rCreating a new dictionary... %3lu%%", + (gulong) stardict_iterator_get_offset (iterator) * 100 + / stardict_dict_get_info (dict)->word_count); + + g_assert (output != NULL); + + gchar *pronunciation = g_strstrip ((gchar *) (output + 1)); + StardictEntry *entry = stardict_iterator_get_entry (iterator); + + generator_begin_entry (generator); + + if (!strcmp (pronunciation, void_entry)) + *pronunciation = 0; + +// g_printerr ("%s /%s/\n", +// stardict_iterator_get_word (iterator), pronunciation); + + /* For the sake of simplicity we fake a new start; + * write_fields() only iterates the list in one direction. 
*/ + StardictEntryField field; + field.type = 't'; + field.data = pronunciation; + + GList start_link; + start_link.next = entry->fields; + start_link.data = &field; + + if (!write_fields (generator, &start_link, + info->same_type_sequence != NULL, &error) + || !generator_finish_entry (generator, + stardict_iterator_get_word (iterator), &error)) + { + g_printerr ("Error: write failed: %s\n", error->message); + exit (EXIT_FAILURE); + } + + g_object_unref (entry); + + gpointer *tmp = output; + output = *output; + g_free (tmp); + + stardict_iterator_next (iterator); + } + + g_assert (output == NULL); + g_object_unref (iterator); + } + + putchar ('\n'); + if (!generator_finish (generator, &error)) + { + g_printerr ("Error: failed to write the dictionary: %s\n", + error->message); + exit (EXIT_FAILURE); } + generator_free (generator); + g_object_unref (dict); + g_free (void_entry); return 0; } diff --git a/src/generator.c b/src/generator.c index b4bec9d..ac704ca 100644 --- a/src/generator.c +++ b/src/generator.c @@ -114,6 +114,10 @@ generator_finish (Generator *self, GError **error) } else { + if (self->info->version == SD_VERSION_2_4_2 + && !strcmp (key->name, "idxoffsetbits")) + continue; + gulong value = G_STRUCT_MEMBER (gulong, self->info, key->offset); if (value) -- cgit v1.2.3-70-g09d2