Add a WIP tool to add pronunciations to dicts

author: Přemysl Janouch <p.janouch@gmail.com> 2013-05-08 20:42:35 +0200
committer: Přemysl Janouch <p.janouch@gmail.com> 2013-05-08 20:44:41 +0200
commit: 04ae72158e8bb0bd11553eacece6901714c9d455 (patch)
tree: 53b946c7153c1de33b409612f5a78a2b9a9b94f0
parent: 818ee593aadece3ffe8ac36579c241c0bf157047 (diff)
download: tdv-04ae72158e8bb0bd11553eacece6901714c9d455.tar.gz
tdv-04ae72158e8bb0bd11553eacece6901714c9d455.tar.xz
tdv-04ae72158e8bb0bd11553eacece6901714c9d455.zip
2 files changed, 266 insertions, 1 deletions
diff --git a/Makefile b/Makefile
index 4ca0280..41e69c1 100644
--- a/Makefile
+++ b/Makefile
@@ -2,7 +2,7 @@ SHELL = /bin/sh
 
 pkgs = ncursesw glib-2.0 gio-2.0
 tests = test-stardict
-targets = sdcli $(tests)
+targets = sdcli add-pronunciation $(tests)
 
 CC = clang
 CFLAGS = -ggdb -std=gnu99 -Wall -Wextra -Wno-missing-field-initializers \
@@ -19,6 +19,9 @@ clean:
 sdcli: sdcli.o stardict.o
 	$(CC) $^ -o $@ $(LDFLAGS)
 
+add-pronunciation: add-pronunciation.o stardict.o
+	$(CC) $^ -o $@ $(LDFLAGS)
+
 test-stardict: test-stardict.o stardict.o
 	$(CC) $^ -o $@ $(LDFLAGS)
 
diff --git a/add-pronunciation.c b/add-pronunciation.c
new file mode 100644
index 0000000..45eae61
--- /dev/null
+++ b/add-pronunciation.c
@@ -0,0 +1,262 @@
+/*
+ * A tool to add eSpeak-generated pronunciation to dictionaries
+ *
+ * Here I use the `espeak' process rather than libespeak because of the GPL.
+ *
+ * Copyright (c) 2013, Přemysl Janouch <p.janouch@gmail.com>
+ * All rights reserved.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+ * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+
+#include <glib.h>
+#include <gio/gio.h>
+
+#include "stardict.h"
+
+
+// --- Pronunciation generator -------------------------------------------------
+
+typedef struct worker_data WorkerData;
+/** State of one eSpeak worker: a reader thread plus its writer thread. */
+struct worker_data
+{
+	guint32 start_entry;                //! Offset of the first entry to process
+	guint32 end_entry;                  //! One past the last entry to process
+
+	/* Used by both the reader and the writer thread */
+	GMutex *dict_mutex;                 //! Guards access to the dictionary
+
+	/* Used by the reader thread; main also reads them to report progress */
+	GThread *main_thread;               //! Handle of the reader thread
+	StardictDict *dict;                 //! The dictionary being processed
+	gpointer output;                    //! Head of the list of output lines
+
+	GMutex *remaining_mutex;            //! Guards the progress stats
+	GCond *remaining_cond;              //! Signalled when progress changes
+	guint32 remaining;                  //! Entries not yet read from eSpeak
+
+	/* Set up by the reader thread, then used and released by the writer */
+	StardictIterator *iterator;         //! The writer's position in the dict
+	FILE *child_stdin;                  //! Standard input of eSpeak
+};
+
+/** Writer thread: sends the assigned words to eSpeak's stdin, one per line. */
+static gpointer
+worker_writer (WorkerData *data)
+{
+	StardictIterator *it = data->iterator;
+	while (stardict_iterator_get_offset (it) != data->end_entry)
+	{
+		g_mutex_lock (data->dict_mutex);
+		const gchar *entry = stardict_iterator_get_word (it);
+		g_mutex_unlock (data->dict_mutex);
+		stardict_iterator_next (it);
+		int written = fprintf (data->child_stdin, "%s\n", entry);
+		if (written < 0)
+			g_error ("write to eSpeak failed: %s", strerror (errno));
+	}
+	g_object_unref (it);
+	return GINT_TO_POINTER (fclose (data->child_stdin));
+}
+
+/** Reader thread: runs eSpeak and collects its stdout into data->output. */
+static gpointer
+worker (WorkerData *data)
+{
+	/* Spawn eSpeak */
+	static gchar *cmdline[] = { "espeak", "--ipa", "-q", NULL };
+	gint child_in, child_out;
+	/* GLib requires a GError pointer to be NULL before it is passed in */
+	GError *error = NULL;
+	if (!g_spawn_async_with_pipes (NULL, cmdline, NULL,
+		G_SPAWN_SEARCH_PATH, NULL, NULL,
+		NULL, &child_in, &child_out, NULL, &error))
+		g_error ("g_spawn() failed: %s", error->message);
+
+	data->child_stdin = fdopen (child_in, "wb");
+	if (!data->child_stdin)
+		g_error ("fdopen() failed: %s", strerror (errno));
+
+	FILE *child_stdout = fdopen (child_out, "rb");
+	if (!child_stdout)
+		g_error ("fdopen() failed: %s", strerror (errno));
+
+	/* Spawn a writer thread */
+	g_mutex_lock (data->dict_mutex);
+	data->iterator = stardict_iterator_new (data->dict, data->start_entry);
+	g_mutex_unlock (data->dict_mutex);
+
+	GThread *writer = g_thread_new ("write worker",
+		(GThreadFunc) worker_writer, data);
+
+	/* Read the output */
+	g_mutex_lock (data->remaining_mutex);
+	guint32 remaining = data->remaining;
+	g_mutex_unlock (data->remaining_mutex);
+	/* Each list node is a pointer-sized link followed by one output line */
+	data->output = NULL;
+	gpointer *output_end = &data->output;
+	while (remaining)
+	{
+		static gchar next[sizeof (gpointer)];
+		GString *s = g_string_new (NULL);
+		g_string_append_len (s, next, sizeof next);
+
+		gint c;
+		while ((c = fgetc (child_stdout)) != EOF && c != '\n')
+			g_string_append_c (s, c);
+		if (c == EOF)
+			g_error ("eSpeak process died too soon");
+
+		gchar *translation = g_string_free (s, FALSE);
+		*output_end = translation;
+		output_end = (gpointer *) translation;
+
+		/* We limit progress reporting so that
+		 * the mutex doesn't spin like crazy */
+		if ((--remaining & 1023) != 0)
+			continue;
+
+		g_mutex_lock (data->remaining_mutex);
+		data->remaining = remaining;
+		g_cond_broadcast (data->remaining_cond);
+		g_mutex_unlock (data->remaining_mutex);
+	}
+
+	fclose (child_stdout);
+	return g_thread_join (writer);
+}
+
+// --- Main --------------------------------------------------------------------
+
+/** Runs eSpeak over every word of the input dictionary (output is TODO). */
+int
+main (int argc, char *argv[])
+{
+	gint n_processes = 1;
+	GOptionEntry entries[] =
+	{
+		{ "processes", 'N', G_OPTION_FLAG_IN_MAIN,
+		  G_OPTION_ARG_INT, &n_processes,
+		  "the number of espeak processes run in parallel", "PROCESSES" },
+		{ NULL }
+	};
+
+	GError *error = NULL;
+	GOptionContext *ctx = g_option_context_new
+		("input.ifo output.ifo - add pronunciation to dictionaries");
+	g_option_context_add_main_entries (ctx, entries, NULL);
+	if (!g_option_context_parse (ctx, &argc, &argv, &error))
+	{
+		g_printerr ("option parsing failed: %s\n", error->message);
+		exit (EXIT_FAILURE);
+	}
+
+	if (argc != 3)
+	{
+		gchar *help = g_option_context_get_help (ctx, TRUE, FALSE);
+		g_print ("%s", help);
+		g_free (help);
+		exit (EXIT_FAILURE);
+	}
+
+	StardictDict *dict = stardict_dict_new (argv[1], &error);
+	if (!dict)
+	{
+		g_printerr ("opening the dictionary failed: %s\n", error->message);
+		exit (EXIT_FAILURE);
+	}
+
+	gsize n_words = stardict_info_get_word_count
+		(stardict_dict_get_info (dict));
+
+	if (n_processes <= 0)
+	{
+		g_printerr ("Error: there must be at least one process\n");
+		exit (EXIT_FAILURE);
+	}
+
+	/* Keep at least 1024 words per process, but always run one process */
+	gsize max_processes = MAX (n_words / 1024, (gsize) 1);
+	if ((gsize) n_processes > max_processes)
+	{
+		n_processes = max_processes;
+		g_printerr ("Warning: too many processes, reducing to %d\n",
+			n_processes);
+	}
+
+	/* Spawn worker threads to generate pronunciations */
+	static GMutex dict_mutex;
+
+	static GMutex remaining_mutex;
+	static GCond remaining_cond;
+
+	WorkerData *data = g_alloca (sizeof *data * n_processes);
+
+	gint i;
+	for (i = 0; i < n_processes; i++)
+	{
+		data[i].start_entry = n_words *  i      / n_processes;
+		data[i].end_entry   = n_words * (i + 1) / n_processes;
+		data[i].remaining = data[i].end_entry - data[i].start_entry;
+		data[i].remaining_mutex = &remaining_mutex;
+		data[i].remaining_cond = &remaining_cond;
+		data[i].dict = dict;
+		data[i].dict_mutex = &dict_mutex;
+
+		/* Each worker runs on its own slot, whose handle is joined later */
+		data[i].main_thread = g_thread_new ("worker",
+			(GThreadFunc) worker, &data[i]);
+	}
+
+	/* Loop while the threads still have some work to do and report status */
+	g_mutex_lock (&remaining_mutex);
+	for (;;)
+	{
+		gboolean all_finished = TRUE;
+		printf ("\rRetrieving pronunciation... ");
+		for (i = 0; i < n_processes; i++)
+		{
+			guint32 total = data[i].end_entry - data[i].start_entry;
+			printf ("%3u%% ", total ? data[i].remaining * 100 / total : 0);
+			if (data[i].remaining)
+				all_finished = FALSE;
+		}
+		fflush (stdout);  /* The line has no newline to trigger a flush */
+		if (all_finished)
+			break;
+		g_cond_wait (&remaining_cond, &remaining_mutex);
+	}
+	g_mutex_unlock (&remaining_mutex);
+
+	for (i = 0; i < n_processes; i++)
+		g_thread_join (data[i].main_thread);
+
+	// TODO after all processing is done, the program will go through the whole
+	//      dictionary and put extended data entries into a new one.
+	StardictIterator *iterator = stardict_iterator_new (dict, 0);
+	while (stardict_iterator_is_valid (iterator))
+	{
+		// ...
+		stardict_iterator_next (iterator);
+	}
+
+	return 0;
+}
author	Přemysl Janouch <p.janouch@gmail.com>	2013-05-08 20:42:35 +0200
committer	Přemysl Janouch <p.janouch@gmail.com>	2013-05-08 20:44:41 +0200
commit	04ae72158e8bb0bd11553eacece6901714c9d455 (patch)
tree	53b946c7153c1de33b409612f5a78a2b9a9b94f0
parent	818ee593aadece3ffe8ac36579c241c0bf157047 (diff)
download	tdv-04ae72158e8bb0bd11553eacece6901714c9d455.tar.gz tdv-04ae72158e8bb0bd11553eacece6901714c9d455.tar.xz tdv-04ae72158e8bb0bd11553eacece6901714c9d455.zip