Rename tools, make them installable

author: Přemysl Eric Janouch <p@janouch.name> 2023-06-11 17:45:38 +0200
committer: Přemysl Eric Janouch <p@janouch.name> 2023-06-11 18:08:03 +0200
commit: c77d994dc44a9ef8f87dd36661201f499877fc34 (patch)
tree: 0ff850d9807f53b9acfe4e9ea95e3346b214ef37 /src/add-pronunciation.c
parent: 238e7a2bb961eb448dee1542e03cbdb84dea027d (diff)
download: tdv-c77d994dc44a9ef8f87dd36661201f499877fc34.tar.gz
tdv-c77d994dc44a9ef8f87dd36661201f499877fc34.tar.xz
tdv-c77d994dc44a9ef8f87dd36661201f499877fc34.zip
1 files changed, 0 insertions, 469 deletions
diff --git a/src/add-pronunciation.c b/src/add-pronunciation.c
deleted file mode 100644
index 90d9673..0000000
--- a/src/add-pronunciation.c
+++ /dev/null
@@ -1,469 +0,0 @@
-/*
- * A tool to add eSpeak-generated pronunciation to dictionaries
- *
- * Here I use the `espeak' process rather than libespeak because of the GPL.
- * It's far from ideal, rather good as a starting point.
- *
- * Copyright (c) 2013, Přemysl Eric Janouch <p@janouch.name>
- *
- * Permission to use, copy, modify, and/or distribute this software for any
- * purpose with or without fee is hereby granted.
- *
- * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
- * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
- * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
- * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
- * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
- *
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <errno.h>
-
-#include <glib.h>
-#include <gio/gio.h>
-
-#include "stardict.h"
-#include "stardict-private.h"
-#include "generator.h"
-#include "utils.h"
-
-
-// --- Pronunciation generator -------------------------------------------------
-
-typedef struct worker_data WorkerData;
-
-struct worker_data
-{
-	gchar **cmdline;                    ///< eSpeak command line
-	guint ignore_acronyms : 1;          ///< Don't spell out acronyms
-	GRegex *re_stop;                    ///< Regex for stop sequences
-	GRegex *re_acronym;                 ///< Regex for ACRONYMS
-
-	guint32 start_entry;                ///< The first entry to be processed
-	guint32 end_entry;                  ///< Past the last entry to be processed
-
-	// Reader, writer
-	GMutex *dict_mutex;                 ///< Locks the dictionary object
-
-	// Reader
-	GThread *main_thread;               ///< A handle to the reader thread
-	StardictDict *dict;                 ///< The dictionary object
-	gpointer output;                    ///< Linked-list of pronunciation data
-
-	GMutex *remaining_mutex;            ///< Locks the progress stats
-	GCond *remaining_cond;              ///< Signals a change in progress
-	guint32 remaining;                  ///< How many entries remain
-	guint32 total;                      ///< Total number of entries
-
-	// Writer
-	StardictIterator *iterator;         ///< Iterates over the dictionary
-	FILE *child_stdin;                  ///< Standard input of eSpeak
-};
-
-/// eSpeak splits the output on certain characters.
-#define LINE_SPLITTING_CHARS            ".,:;?!"
-
-/// We don't want to include brackets either.
-#define OTHER_STOP_CHARS                "([{<"
-
-/// A void word used to make a unique "no pronunciation available" mark.
-#define VOID_ENTRY                      "not present in any dictionary"
-
-
-/// Adds dots between characters.
-static gboolean
-writer_acronym_cb (const GMatchInfo *info, GString *res,
-	G_GNUC_UNUSED gpointer data)
-{
-	gchar *preceding = g_match_info_fetch (info, 1);
-	g_string_append (res, preceding);
-	g_free (preceding);
-
-	gchar *word = g_match_info_fetch (info, 2);
-
-	g_string_append_c (res, *word);
-	const gchar *p;
-	for (p = word + 1; *p; p++)
-	{
-		g_string_append_c (res, '.');
-		g_string_append_c (res, *p);
-	}
-
-	g_free (word);
-	return FALSE;
-}
-
-/// Writes to espeak's stdin.
-static gpointer
-worker_writer (WorkerData *data)
-{
-	GError *error = NULL;
-	GMatchInfo *match_info;
-	while (stardict_iterator_get_offset (data->iterator) != data->end_entry)
-	{
-		g_mutex_lock (data->dict_mutex);
-		const gchar *word = stardict_iterator_get_word (data->iterator);
-		g_mutex_unlock (data->dict_mutex);
-
-		word += strspn (word, LINE_SPLITTING_CHARS " \t");
-		gchar *x = g_strdup (word);
-
-		// Cut the word if needed be
-		error = NULL;
-		if (g_regex_match_full (data->re_stop,
-			x, -1, 0, 0, &match_info, &error))
-		{
-			gint start_pos;
-			g_match_info_fetch_pos (match_info, 0, &start_pos, NULL);
-			x[start_pos] = 0;
-		}
-		g_match_info_free (match_info);
-
-		// Change acronyms so that they're not pronounced as words
-		if (!error && !data->ignore_acronyms)
-		{
-			char *tmp = g_regex_replace_eval (data->re_acronym,
-				x, -1, 0, 0, writer_acronym_cb, NULL, &error);
-			g_free (x);
-			x = tmp;
-		}
-
-		if (error)
-		{
-			g_printerr ("Notice: error processing '%s': %s\n",
-				word, error->message);
-			g_clear_error (&error);
-			*x = 0;
-		}
-
-		// We might have accidentally cut off everything
-		if (!*x)
-		{
-			g_free (x);
-			x = g_strdup (VOID_ENTRY);
-		}
-
-		stardict_iterator_next (data->iterator);
-		if (fprintf (data->child_stdin, "%s\n", x) < 0)
-			fatal ("write to eSpeak failed: %s\n", g_strerror (errno));
-
-		g_free (x);
-	}
-
-	g_object_unref (data->iterator);
-	return GINT_TO_POINTER (fclose (data->child_stdin));
-}
-
-/// Get the void entry (and test if espeak works).
-static gchar *
-get_void_entry (gchar *cmdline[])
-{
-	gchar *output;
-	gint exit_status;
-
-	GError *error = NULL;
-	if (!g_spawn_sync (NULL, cmdline, NULL,
-		G_SPAWN_SEARCH_PATH | G_SPAWN_STDERR_TO_DEV_NULL, NULL, NULL,
-		&output, NULL, &exit_status, &error))
-		fatal ("Error: couldn't spawn espeak: %s\n", error->message);
-
-	if (exit_status)
-		fatal ("Error: espeak returned %d\n", exit_status);
-
-	return output;
-}
-
-/// Reads from espeak's stdout.
-static gpointer
-worker (WorkerData *data)
-{
-	// Spawn eSpeak
-	GError *error = NULL;
-	gint child_in, child_out;
-	if (!g_spawn_async_with_pipes (NULL, data->cmdline, NULL,
-		G_SPAWN_SEARCH_PATH, NULL, NULL,
-		NULL, &child_in, &child_out, NULL, &error))
-		fatal ("g_spawn: %s\n", error->message);
-
-	data->child_stdin = fdopen (child_in, "wb");
-	if (!data->child_stdin)
-		perror ("fdopen");
-
-	FILE *child_stdout = fdopen (child_out, "rb");
-	if (!child_stdout)
-		perror ("fdopen");
-
-	// Spawn a writer thread
-	g_mutex_lock (data->dict_mutex);
-	data->iterator = stardict_iterator_new (data->dict, data->start_entry);
-	g_mutex_unlock (data->dict_mutex);
-
-	GThread *writer = g_thread_new ("write worker",
-		(GThreadFunc) worker_writer, data);
-
-	// Read the output
-	g_mutex_lock (data->remaining_mutex);
-	guint32 remaining = data->remaining;
-	g_mutex_unlock (data->remaining_mutex);
-
-	data->output = NULL;
-	gpointer *output_end = &data->output;
-	while (remaining)
-	{
-		static gchar next[sizeof (gpointer)];
-		GString *s = g_string_new (NULL);
-		g_string_append_len (s, next, sizeof next);
-
-		gint c;
-		while ((c = fgetc (child_stdout)) != EOF && c != '\n')
-			g_string_append_c (s, c);
-		if (c == EOF)
-			fatal ("eSpeak process died too soon\n");
-
-		gchar *translation = g_string_free (s, FALSE);
-		*output_end = translation;
-		output_end = (gpointer *) translation;
-
-		// We limit progress reporting so that
-		// the mutex doesn't spin like crazy
-		if ((--remaining & 255) != 0)
-			continue;
-
-		g_mutex_lock (data->remaining_mutex);
-		data->remaining = remaining;
-		g_cond_broadcast (data->remaining_cond);
-		g_mutex_unlock (data->remaining_mutex);
-	}
-
-	if (fgetc (child_stdout) != EOF)
-		fatal ("Error: eSpeak has written more lines than it should. "
-			"The output would be corrupt, aborting.\n");
-
-	fclose (child_stdout);
-	return g_thread_join (writer);
-}
-
-// --- Main --------------------------------------------------------------------
-
-int
-main (int argc, char *argv[])
-{
-	gint n_processes = 1;
-	gchar *voice = NULL;
-	gboolean ignore_acronyms = FALSE;
-
-	GOptionEntry entries[] =
-	{
-		{ "processes", 'N', G_OPTION_FLAG_IN_MAIN,
-		  G_OPTION_ARG_INT, &n_processes,
-		  "The number of espeak processes run in parallel", "PROCESSES" },
-		{ "voice", 'v', G_OPTION_FLAG_IN_MAIN,
-		  G_OPTION_ARG_STRING, &voice,
-		  "The voice to be used by eSpeak to pronounce the words", "VOICE" },
-		{ "ignore-acronyms", 0, G_OPTION_FLAG_IN_MAIN,
-		  G_OPTION_ARG_NONE, &ignore_acronyms,
-		  "Don't spell out words composed of big letters only", NULL },
-		{ NULL }
-	};
-
-G_GNUC_BEGIN_IGNORE_DEPRECATIONS
-	if (glib_check_version (2, 36, 0))
-		g_type_init ();
-G_GNUC_END_IGNORE_DEPRECATIONS
-
-	GError *error = NULL;
-	GOptionContext *ctx = g_option_context_new
-		("input.ifo output-basename - add pronunciation to dictionaries");
-	g_option_context_add_main_entries (ctx, entries, NULL);
-	if (!g_option_context_parse (ctx, &argc, &argv, &error))
-		fatal ("Error: option parsing failed: %s\n", error->message);
-
-	if (argc != 3)
-		fatal ("%s", g_option_context_get_help (ctx, TRUE, NULL));
-
-	g_option_context_free (ctx);
-
-	// See if we can run espeak
-	static gchar *cmdline[] = { "espeak", "--ipa", "-q", NULL, NULL, NULL };
-
-	if (voice)
-	{
-		cmdline[3] = "-v";
-		cmdline[4] = voice;
-	}
-
-	gchar *void_entry = g_strstrip (get_void_entry (cmdline));
-
-	// Load the dictionary
-	printf ("Loading the original dictionary...\n");
-	StardictDict *dict = stardict_dict_new (argv[1], &error);
-	if (!dict)
-		fatal ("Error: opening the dictionary failed: %s\n", error->message);
-
-	gsize n_words = stardict_info_get_word_count
-		(stardict_dict_get_info (dict));
-
-	if (n_processes <= 0)
-		fatal ("Error: there must be at least one process\n");
-
-	if ((gsize) n_processes > n_words * 1024)
-	{
-		n_processes = n_words / 1024;
-		if (!n_processes)
-			n_processes = 1;
-		g_printerr ("Warning: too many processes, reducing to %d\n",
-			n_processes);
-	}
-
-	// Spawn worker threads to generate pronunciation data
-	static GMutex dict_mutex;
-
-	static GMutex remaining_mutex;
-	static GCond remaining_cond;
-
-	WorkerData *data = g_alloca (sizeof *data * n_processes);
-
-	GRegex *re_stop = g_regex_new ("[" LINE_SPLITTING_CHARS "][ ?]"
-		"|\\.\\.\\.|[" OTHER_STOP_CHARS "]", G_REGEX_OPTIMIZE, 0, &error);
-	g_assert (re_stop != NULL);
-
-	GRegex *re_acronym = g_regex_new ("(^|\\pZ)(\\p{Lu}+)(?=\\pZ|$)",
-		G_REGEX_OPTIMIZE, 0, &error);
-	g_assert (re_acronym != NULL);
-
-	gint i;
-	for (i = 0; i < n_processes; i++)
-	{
-		data[i].start_entry = n_words *  i      / n_processes;
-		data[i].end_entry   = n_words * (i + 1) / n_processes;
-
-		data[i].total = data[i].remaining =
-			data[i].end_entry - data[i].start_entry;
-		data[i].remaining_mutex = &remaining_mutex;
-		data[i].remaining_cond = &remaining_cond;
-
-		data[i].dict = dict;
-		data[i].dict_mutex = &dict_mutex;
-
-		data[i].re_stop = re_stop;
-		data[i].re_acronym = re_acronym;
-
-		data[i].cmdline = cmdline;
-		data[i].ignore_acronyms = ignore_acronyms;
-		data[i].main_thread =
-			g_thread_new ("worker", (GThreadFunc) worker, &data[i]);
-	}
-
-	// Loop while the threads still have some work to do and report status
-	g_mutex_lock (&remaining_mutex);
-	for (;;)
-	{
-		gboolean all_finished = TRUE;
-		printf ("\rRetrieving pronunciation... ");
-		for (i = 0; i < n_processes; i++)
-		{
-			printf ("%3u%% ", 100 - data[i].remaining * 100 / data[i].total);
-			if (data[i].remaining)
-				all_finished = FALSE;
-		}
-
-		if (all_finished)
-			break;
-		g_cond_wait (&remaining_cond, &remaining_mutex);
-	}
-	g_mutex_unlock (&remaining_mutex);
-
-	putchar ('\n');
-	for (i = 0; i < n_processes; i++)
-		g_thread_join (data[i].main_thread);
-
-	g_regex_unref (re_stop);
-	g_regex_unref (re_acronym);
-
-	// Put extended entries into a new dictionary
-	Generator *generator = generator_new (argv[2], &error);
-	if (!generator)
-		fatal ("Error: failed to create the output dictionary: %s\n",
-			error->message);
-
-	StardictInfo *info = generator->info;
-	stardict_info_copy (info, stardict_dict_get_info (dict));
-
-	// This gets incremented each time an entry is finished
-	info->word_count = 0;
-
-	if (info->same_type_sequence)
-	{
-		gchar *new_sts = g_strconcat ("t", info->same_type_sequence, NULL);
-		g_free (info->same_type_sequence);
-		info->same_type_sequence = new_sts;
-	}
-
-	// Write out all the entries together with the pronunciation
-	for (i = 0; i < n_processes; i++)
-	{
-		StardictIterator *iterator =
-			stardict_iterator_new (dict, data[i].start_entry);
-
-		gpointer *output = data[i].output;
-		while (stardict_iterator_get_offset (iterator) != data[i].end_entry)
-		{
-			printf ("\rCreating a new dictionary... %3lu%%",
-				(gulong) stardict_iterator_get_offset (iterator) * 100
-				/ stardict_dict_get_info (dict)->word_count);
-
-			g_assert (output != NULL);
-
-			gchar *pronunciation = g_strstrip ((gchar *) (output + 1));
-			StardictEntry *entry = stardict_iterator_get_entry (iterator);
-
-			generator_begin_entry (generator);
-
-			if (!strcmp (pronunciation, void_entry))
-				*pronunciation = 0;
-
-//			g_printerr ("%s /%s/\n",
-//				stardict_iterator_get_word (iterator), pronunciation);
-
-			// For the sake of simplicity we fake a new start;
-			// write_fields() only iterates the list in one direction.
-			StardictEntryField field;
-			field.type = 't';
-			field.data = pronunciation;
-
-			GList start_link;
-			start_link.next = entry->fields;
-			start_link.data = &field;
-
-			if (!generator_write_fields (generator, &start_link, &error)
-			 || !generator_finish_entry (generator,
-					stardict_iterator_get_word (iterator), &error))
-				fatal ("Error: write failed: %s\n", error->message);
-
-			g_object_unref (entry);
-
-			gpointer *tmp = output;
-			output = *output;
-			g_free (tmp);
-
-			stardict_iterator_next (iterator);
-		}
-
-		g_assert (output == NULL);
-		g_object_unref (iterator);
-	}
-
-	putchar ('\n');
-	if (!generator_finish (generator, &error))
-		fatal ("Error: failed to write the dictionary: %s\n", error->message);
-
-	generator_free (generator);
-	g_object_unref (dict);
-	g_free (void_entry);
-	return 0;
-}
author	Přemysl Eric Janouch <p@janouch.name>	2023-06-11 17:45:38 +0200
committer	Přemysl Eric Janouch <p@janouch.name>	2023-06-11 18:08:03 +0200
commit	c77d994dc44a9ef8f87dd36661201f499877fc34 (patch)
tree	0ff850d9807f53b9acfe4e9ea95e3346b214ef37 /src/add-pronunciation.c
parent	238e7a2bb961eb448dee1542e03cbdb84dea027d (diff)
download	tdv-c77d994dc44a9ef8f87dd36661201f499877fc34.tar.gz tdv-c77d994dc44a9ef8f87dd36661201f499877fc34.tar.xz tdv-c77d994dc44a9ef8f87dd36661201f499877fc34.zip