From 04ae72158e8bb0bd11553eacece6901714c9d455 Mon Sep 17 00:00:00 2001
From: Přemysl Janouch
Date: Wed, 8 May 2013 20:42:35 +0200
Subject: Add a WIP tool to add pronunciations to dicts
---
Makefile | 5 +-
add-pronunciation.c | 262 ++++++++++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 266 insertions(+), 1 deletion(-)
create mode 100644 add-pronunciation.c
diff --git a/Makefile b/Makefile
index 4ca0280..41e69c1 100644
--- a/Makefile
+++ b/Makefile
@@ -2,7 +2,7 @@ SHELL = /bin/sh
pkgs = ncursesw glib-2.0 gio-2.0
tests = test-stardict
-targets = sdcli $(tests)
+targets = sdcli add-pronunciation $(tests)
CC = clang
CFLAGS = -ggdb -std=gnu99 -Wall -Wextra -Wno-missing-field-initializers \
@@ -19,6 +19,9 @@ clean:
sdcli: sdcli.o stardict.o
$(CC) $^ -o $@ $(LDFLAGS)
+add-pronunciation: add-pronunciation.o stardict.o
+ $(CC) $^ -o $@ $(LDFLAGS)
+
test-stardict: test-stardict.o stardict.o
$(CC) $^ -o $@ $(LDFLAGS)
diff --git a/add-pronunciation.c b/add-pronunciation.c
new file mode 100644
index 0000000..45eae61
--- /dev/null
+++ b/add-pronunciation.c
@@ -0,0 +1,262 @@
+/*
+ * A tool to add eSpeak-generated pronunciation to dictionaries
+ *
+ * Here I use the `espeak' process rather than libespeak because of the GPL.
+ *
+ * Copyright (c) 2013, Přemysl Janouch
+ * All rights reserved.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+ * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ *
+ */
+
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+#include "stardict.h"
+
+
+// --- Pronunciation generator -------------------------------------------------
+
+/**
+ * State of one pronunciation worker: a reader thread that consumes eSpeak's
+ * output, paired with a writer thread that feeds eSpeak its input.
+ */
+typedef struct worker_data
+{
+	/* Half-open range [start_entry, end_entry) of dictionary entries */
+	guint32 start_entry;                //! Index of the first entry to process
+	guint32 end_entry;                  //! Index one past the last entry
+
+	/* Used by both the reader and the writer */
+	GMutex *dict_mutex;                 //! Guards access to the dictionary
+
+	/* Used by the reader */
+	GThread *main_thread;               //! Handle of the reader thread
+	StardictDict *dict;                 //! The dictionary being processed
+	gpointer output;                    //! Head of the pronunciation list
+
+	GMutex *remaining_mutex;            //! Guards the progress counter
+	GCond *remaining_cond;              //! Signalled when progress changes
+	guint32 remaining;                  //! Number of entries still to be read
+
+	/* Used by the writer */
+	StardictIterator *iterator;         //! Walks the assigned entry range
+	FILE *child_stdin;                  //! Stream to eSpeak's standard input
+}
+WorkerData;
+
+/**
+ * Writer thread: sends every word from the iterator's current position up to
+ * (but not including) data->end_entry to eSpeak, one word per line.
+ *
+ * Releases the iterator and closes eSpeak's standard input when done.
+ *
+ * @return The result of fclose() on eSpeak's stdin, packed into a pointer.
+ */
+static gpointer
+worker_writer (WorkerData *data)
+{
+	StardictIterator *cursor = data->iterator;
+	FILE *espeak_in = data->child_stdin;
+
+	for (;;)
+	{
+		if (stardict_iterator_get_offset (cursor) == data->end_entry)
+			break;
+
+		/* The word lookup is performed under the dictionary lock */
+		g_mutex_lock (data->dict_mutex);
+		const gchar *headword = stardict_iterator_get_word (cursor);
+		g_mutex_unlock (data->dict_mutex);
+
+		stardict_iterator_next (cursor);
+
+		if (fprintf (espeak_in, "%s\n", headword) >= 0)
+			continue;
+		g_error ("write to eSpeak failed: %s", strerror (errno));
+	}
+
+	g_object_unref (cursor);
+
+	int close_status = fclose (espeak_in);
+	return GINT_TO_POINTER (close_status);
+}
+
+/**
+ * Reader thread: spawns one eSpeak process, starts a writer thread that feeds
+ * it the words of the assigned range, and collects one output line per entry.
+ *
+ * The lines are chained into a singly linked list whose head is stored in
+ * data->output.  Each node is a single heap buffer: its first
+ * sizeof (gpointer) bytes hold the pointer to the next node (all zeros in the
+ * last node), followed by the NUL-terminated text read from eSpeak.
+ *
+ * Progress is published through data->remaining under data->remaining_mutex,
+ * and data->remaining_cond is broadcast on every update.
+ *
+ * @param data  The worker state; start_entry, end_entry, remaining, dict and
+ *              the mutex/condition pointers must be set by the caller.
+ * @return The writer thread's result, i.e. the status of fclose() on
+ *         eSpeak's standard input packed into a pointer.
+ */
+static gpointer
+worker (WorkerData *data)
+{
+	/* Spawn eSpeak */
+	static gchar *cmdline[] = { "espeak", "--ipa", "-q", NULL };
+	gint child_in, child_out;
+
+	/* GLib requires the error location to hold NULL before the call */
+	GError *error = NULL;
+	if (!g_spawn_async_with_pipes (NULL, cmdline, NULL,
+		G_SPAWN_SEARCH_PATH, NULL, NULL,
+		NULL, &child_in, &child_out, NULL, &error))
+		g_error ("g_spawn() failed: %s", error->message);
+
+	/* Without the streams nothing can be done; g_error() aborts instead of
+	 * letting the code below dereference a NULL FILE pointer */
+	data->child_stdin = fdopen (child_in, "wb");
+	if (!data->child_stdin)
+		g_error ("fdopen() failed: %s", strerror (errno));
+
+	FILE *child_stdout = fdopen (child_out, "rb");
+	if (!child_stdout)
+		g_error ("fdopen() failed: %s", strerror (errno));
+
+	/* Spawn a writer thread */
+	g_mutex_lock (data->dict_mutex);
+	data->iterator = stardict_iterator_new (data->dict, data->start_entry);
+	g_mutex_unlock (data->dict_mutex);
+
+	GThread *writer = g_thread_new ("write worker",
+		(GThreadFunc) worker_writer, data);
+
+	/* Read the output */
+	g_mutex_lock (data->remaining_mutex);
+	guint32 remaining = data->remaining;
+	g_mutex_unlock (data->remaining_mutex);
+
+	data->output = NULL;
+	gpointer *output_end = &data->output;
+	while (remaining)
+	{
+		/* Zero-filled placeholder reserving room for the node's link */
+		static gchar next[sizeof (gpointer)];
+		GString *s = g_string_new (NULL);
+		g_string_append_len (s, next, sizeof next);
+
+		gint c;
+		while ((c = fgetc (child_stdout)) != EOF && c != '\n')
+			g_string_append_c (s, c);
+		if (c == EOF)
+			g_error ("eSpeak process died too soon");
+
+		/* Append the node: store its address in the previous link, then
+		 * make its own leading bytes the link to be filled in next */
+		gchar *translation = g_string_free (s, FALSE);
+		*output_end = translation;
+		output_end = (gpointer *) translation;
+
+		/* We limit progress reporting so that
+		 * the mutex doesn't spin like crazy */
+		if ((--remaining & 1023) != 0)
+			continue;
+
+		g_mutex_lock (data->remaining_mutex);
+		data->remaining = remaining;
+		g_cond_broadcast (data->remaining_cond);
+		g_mutex_unlock (data->remaining_mutex);
+	}
+
+	fclose (child_stdout);
+	return g_thread_join (writer);
+}
+
+// --- Main --------------------------------------------------------------------
+
+/**
+ * Entry point: parses the command line, opens the input dictionary, splits
+ * its entries into contiguous ranges, runs one eSpeak worker per range while
+ * printing progress, and finally walks the dictionary (writing the output
+ * dictionary is still a TODO).
+ *
+ * Usage: add-pronunciation [-N PROCESSES] input.ifo output.ifo
+ *
+ * @return 0 on success; the process exits with EXIT_FAILURE on invalid
+ *         options, wrong argument count or a dictionary that cannot be opened.
+ */
+int
+main (int argc, char *argv[])
+{
+	gint n_processes = 1;
+
+	GOptionEntry entries[] =
+	{
+		{ "processes", 'N', G_OPTION_FLAG_IN_MAIN,
+		  G_OPTION_ARG_INT, &n_processes,
+		  "the number of espeak processes run in parallel", "PROCESSES" },
+		{ NULL }
+	};
+
+	GError *error = NULL;
+	GOptionContext *ctx = g_option_context_new
+		("input.ifo output.ifo - add pronunciation to dictionaries");
+	g_option_context_add_main_entries (ctx, entries, NULL);
+	if (!g_option_context_parse (ctx, &argc, &argv, &error))
+	{
+		g_printerr ("option parsing failed: %s\n", error->message);
+		exit (EXIT_FAILURE);
+	}
+
+	if (argc != 3)
+	{
+		gchar *help = g_option_context_get_help (ctx, TRUE, FALSE);
+		g_print ("%s", help);
+		g_free (help);
+		exit (EXIT_FAILURE);
+	}
+
+	g_option_context_free (ctx);
+
+	StardictDict *dict = stardict_dict_new (argv[1], &error);
+	if (!dict)
+	{
+		g_printerr ("opening the dictionary failed: %s\n", error->message);
+		exit (EXIT_FAILURE);
+	}
+
+	gsize n_words = stardict_info_get_word_count
+		(stardict_dict_get_info (dict));
+
+	if (n_processes <= 0)
+	{
+		g_printerr ("Error: there must be at least one process\n");
+		exit (EXIT_FAILURE);
+	}
+
+	/* Give every process at least 1024 entries, but always keep one */
+	gsize max_processes = n_words / 1024;
+	if (max_processes < 1)
+		max_processes = 1;
+
+	if ((gsize) n_processes > max_processes)
+	{
+		n_processes = max_processes;
+		g_printerr ("Warning: too many processes, reducing to %d\n",
+			n_processes);
+	}
+
+	/* Spawn worker threads to generate pronunciations */
+	static GMutex dict_mutex;
+
+	static GMutex remaining_mutex;
+	static GCond remaining_cond;
+
+	WorkerData *data = g_alloca (sizeof *data * n_processes);
+
+	gint i;
+	for (i = 0; i < n_processes; i++)
+	{
+		/* Half-open ranges that together cover all entries [0, n_words);
+		 * 64-bit math keeps the product from overflowing */
+		guint32 start = (guint64) n_words * i / n_processes;
+		guint32 end = (guint64) n_words * (i + 1) / n_processes;
+
+		/* The compound literal zeroes the fields the worker fills in */
+		data[i] = (WorkerData)
+		{
+			.start_entry = start,
+			.end_entry = end,
+
+			.remaining = end - start,
+			.remaining_mutex = &remaining_mutex,
+			.remaining_cond = &remaining_cond,
+
+			.dict = dict,
+			.dict_mutex = &dict_mutex,
+		};
+
+		/* Each thread gets its own slot and its handle is stored there */
+		data[i].main_thread = g_thread_new ("worker",
+			(GThreadFunc) worker, &data[i]);
+	}
+
+	/* Loop while the threads still have some work to do and report status */
+	g_mutex_lock (&remaining_mutex);
+	for (;;)
+	{
+		gboolean all_finished = TRUE;
+		printf ("\rRetrieving pronunciation... ");
+		for (i = 0; i < n_processes; i++)
+		{
+			/* Share of this worker's entries that is still remaining;
+			 * an empty range is reported as 0% instead of dividing by 0 */
+			guint32 total = data[i].end_entry - data[i].start_entry;
+			guint percent = 0;
+			if (total)
+				percent = (guint64) data[i].remaining * 100 / total;
+
+			printf ("%3u%% ", percent);
+			if (data[i].remaining)
+				all_finished = FALSE;
+		}
+
+		/* The line has no newline, so it must be flushed explicitly */
+		fflush (stdout);
+
+		if (all_finished)
+			break;
+		g_cond_wait (&remaining_cond, &remaining_mutex);
+	}
+	g_mutex_unlock (&remaining_mutex);
+
+	for (i = 0; i < n_processes; i++)
+		g_thread_join (data[i].main_thread);
+
+	// TODO after all processing is done, the program will go through the whole
+	// dictionary and put extended data entries into a new one.
+	StardictIterator *iterator = stardict_iterator_new (dict, 0);
+	while (stardict_iterator_is_valid (iterator))
+	{
+		// ...
+		stardict_iterator_next (iterator);
+	}
+	g_object_unref (iterator);
+
+	return 0;
+}
--
cgit v1.2.3-70-g09d2