From c77d994dc44a9ef8f87dd36661201f499877fc34 Mon Sep 17 00:00:00 2001
From: Přemysl Eric Janouch
Date: Sun, 11 Jun 2023 17:45:38 +0200
Subject: Rename tools, make them installable
---
CMakeLists.txt | 10 +-
README.adoc | 8 +-
src/add-pronunciation.c | 469 --------------------------------------------
src/query-tool.c | 313 -----------------------------
src/tabfile.c | 223 ---------------------
src/tdv-add-pronunciation.c | 469 ++++++++++++++++++++++++++++++++++++++++++++
src/tdv-query-tool.c | 313 +++++++++++++++++++++++++++++
src/tdv-tabfile.c | 223 +++++++++++++++++++++
src/tdv-transform.c | 226 +++++++++++++++++++++
src/transform.c | 226 ---------------------
10 files changed, 1243 insertions(+), 1237 deletions(-)
delete mode 100644 src/add-pronunciation.c
delete mode 100644 src/query-tool.c
delete mode 100644 src/tabfile.c
create mode 100644 src/tdv-add-pronunciation.c
create mode 100644 src/tdv-query-tool.c
create mode 100644 src/tdv-tabfile.c
create mode 100644 src/tdv-transform.c
delete mode 100644 src/transform.c
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 9d4c494..f995dd4 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -285,14 +285,17 @@ endif ()
target_link_libraries (${PROJECT_NAME} ${project_libraries})
# Tools
-set (tools tabfile add-pronunciation query-tool transform)
+set (tools tdv-tabfile tdv-add-pronunciation tdv-query-tool tdv-transform)
foreach (tool ${tools})
add_executable (${tool} EXCLUDE_FROM_ALL
src/${tool}.c ${project_common_sources})
target_link_libraries (${tool} ${project_common_libraries})
endforeach ()
-add_custom_target (tools DEPENDS ${tools})
+option (WITH_TOOLS "Build and install some StarDict tools" ${UNIX})
+if (WITH_TOOLS)
+ add_custom_target (tools ALL DEPENDS ${tools})
+endif ()
# Example dictionaries
file (GLOB dicts_scripts "${PROJECT_SOURCE_DIR}/dicts/*.*")
@@ -315,6 +318,9 @@ if (NOT WIN32)
install (TARGETS ${PROJECT_NAME} DESTINATION ${CMAKE_INSTALL_BINDIR})
install (FILES LICENSE DESTINATION ${CMAKE_INSTALL_DOCDIR})
+ if (WITH_TOOLS)
+ install (TARGETS ${tools} DESTINATION ${CMAKE_INSTALL_BINDIR})
+ endif ()
if (WITH_GUI)
install (FILES ${PROJECT_NAME}.svg
DESTINATION ${CMAKE_INSTALL_DATADIR}/icons/hicolor/scalable/apps)
diff --git a/README.adoc b/README.adoc
index 0d9ca4e..ab2b4be 100644
--- a/README.adoc
+++ b/README.adoc
@@ -81,10 +81,10 @@ The `make dicts` command will build some examples from freely available sources:
- Czech foreign words (the site's export is broken as of 2022/08, no response)
- Czech WordNet 1.9 PDT (synonyms, hypernyms, hyponyms)
-You can use the included 'transform' tool to convert already existing StarDict
-dictionaries that are nearly good as they are. Remember that you can change
-the `sametypesequence` of the resulting '.ifo' file to another format, or run
-'dictzip' on '.dict' files to make them compact.
+You can use the included 'tdv-transform' tool to convert existing StarDict
+dictionaries that are already almost usable as they are. Remember that you can
+change the `sametypesequence` of the resulting '.ifo' file to another format,
+or run 'dictzip' on '.dict' files to make them compact.
https://mega.co.nz/#!axtD0QRK!sbtBgizksyfkPqKvKEgr8GQ11rsWhtqyRgUUV0B7pwg[CZ <--> EN/DE/PL/RU dictionaries]
diff --git a/src/add-pronunciation.c b/src/add-pronunciation.c
deleted file mode 100644
index 90d9673..0000000
--- a/src/add-pronunciation.c
+++ /dev/null
@@ -1,469 +0,0 @@
-/*
- * A tool to add eSpeak-generated pronunciation to dictionaries
- *
- * Here I use the `espeak' process rather than libespeak because of the GPL.
- * It's far from ideal, rather good as a starting point.
- *
- * Copyright (c) 2013, Přemysl Eric Janouch
- *
- * Permission to use, copy, modify, and/or distribute this software for any
- * purpose with or without fee is hereby granted.
- *
- * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
- * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
- * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
- * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
- * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
- *
- */
-
-#include
-#include
-#include
-#include
-
-#include
-#include
-
-#include "stardict.h"
-#include "stardict-private.h"
-#include "generator.h"
-#include "utils.h"
-
-
-// --- Pronunciation generator -------------------------------------------------
-
-typedef struct worker_data WorkerData;
-
-struct worker_data
-{
- gchar **cmdline; ///< eSpeak command line
- guint ignore_acronyms : 1; ///< Don't spell out acronyms
- GRegex *re_stop; ///< Regex for stop sequences
- GRegex *re_acronym; ///< Regex for ACRONYMS
-
- guint32 start_entry; ///< The first entry to be processed
- guint32 end_entry; ///< Past the last entry to be processed
-
- // Reader, writer
- GMutex *dict_mutex; ///< Locks the dictionary object
-
- // Reader
- GThread *main_thread; ///< A handle to the reader thread
- StardictDict *dict; ///< The dictionary object
- gpointer output; ///< Linked-list of pronunciation data
-
- GMutex *remaining_mutex; ///< Locks the progress stats
- GCond *remaining_cond; ///< Signals a change in progress
- guint32 remaining; ///< How many entries remain
- guint32 total; ///< Total number of entries
-
- // Writer
- StardictIterator *iterator; ///< Iterates over the dictionary
- FILE *child_stdin; ///< Standard input of eSpeak
-};
-
-/// eSpeak splits the output on certain characters.
-#define LINE_SPLITTING_CHARS ".,:;?!"
-
-/// We don't want to include brackets either.
-#define OTHER_STOP_CHARS "([{<"
-
-/// A void word used to make a unique "no pronunciation available" mark.
-#define VOID_ENTRY "not present in any dictionary"
-
-
-/// Adds dots between characters.
-static gboolean
-writer_acronym_cb (const GMatchInfo *info, GString *res,
- G_GNUC_UNUSED gpointer data)
-{
- gchar *preceding = g_match_info_fetch (info, 1);
- g_string_append (res, preceding);
- g_free (preceding);
-
- gchar *word = g_match_info_fetch (info, 2);
-
- g_string_append_c (res, *word);
- const gchar *p;
- for (p = word + 1; *p; p++)
- {
- g_string_append_c (res, '.');
- g_string_append_c (res, *p);
- }
-
- g_free (word);
- return FALSE;
-}
-
-/// Writes to espeak's stdin.
-static gpointer
-worker_writer (WorkerData *data)
-{
- GError *error = NULL;
- GMatchInfo *match_info;
- while (stardict_iterator_get_offset (data->iterator) != data->end_entry)
- {
- g_mutex_lock (data->dict_mutex);
- const gchar *word = stardict_iterator_get_word (data->iterator);
- g_mutex_unlock (data->dict_mutex);
-
- word += strspn (word, LINE_SPLITTING_CHARS " \t");
- gchar *x = g_strdup (word);
-
- // Cut the word if needed be
- error = NULL;
- if (g_regex_match_full (data->re_stop,
- x, -1, 0, 0, &match_info, &error))
- {
- gint start_pos;
- g_match_info_fetch_pos (match_info, 0, &start_pos, NULL);
- x[start_pos] = 0;
- }
- g_match_info_free (match_info);
-
- // Change acronyms so that they're not pronounced as words
- if (!error && !data->ignore_acronyms)
- {
- char *tmp = g_regex_replace_eval (data->re_acronym,
- x, -1, 0, 0, writer_acronym_cb, NULL, &error);
- g_free (x);
- x = tmp;
- }
-
- if (error)
- {
- g_printerr ("Notice: error processing '%s': %s\n",
- word, error->message);
- g_clear_error (&error);
- *x = 0;
- }
-
- // We might have accidentally cut off everything
- if (!*x)
- {
- g_free (x);
- x = g_strdup (VOID_ENTRY);
- }
-
- stardict_iterator_next (data->iterator);
- if (fprintf (data->child_stdin, "%s\n", x) < 0)
- fatal ("write to eSpeak failed: %s\n", g_strerror (errno));
-
- g_free (x);
- }
-
- g_object_unref (data->iterator);
- return GINT_TO_POINTER (fclose (data->child_stdin));
-}
-
-/// Get the void entry (and test if espeak works).
-static gchar *
-get_void_entry (gchar *cmdline[])
-{
- gchar *output;
- gint exit_status;
-
- GError *error = NULL;
- if (!g_spawn_sync (NULL, cmdline, NULL,
- G_SPAWN_SEARCH_PATH | G_SPAWN_STDERR_TO_DEV_NULL, NULL, NULL,
- &output, NULL, &exit_status, &error))
- fatal ("Error: couldn't spawn espeak: %s\n", error->message);
-
- if (exit_status)
- fatal ("Error: espeak returned %d\n", exit_status);
-
- return output;
-}
-
-/// Reads from espeak's stdout.
-static gpointer
-worker (WorkerData *data)
-{
- // Spawn eSpeak
- GError *error = NULL;
- gint child_in, child_out;
- if (!g_spawn_async_with_pipes (NULL, data->cmdline, NULL,
- G_SPAWN_SEARCH_PATH, NULL, NULL,
- NULL, &child_in, &child_out, NULL, &error))
- fatal ("g_spawn: %s\n", error->message);
-
- data->child_stdin = fdopen (child_in, "wb");
- if (!data->child_stdin)
- perror ("fdopen");
-
- FILE *child_stdout = fdopen (child_out, "rb");
- if (!child_stdout)
- perror ("fdopen");
-
- // Spawn a writer thread
- g_mutex_lock (data->dict_mutex);
- data->iterator = stardict_iterator_new (data->dict, data->start_entry);
- g_mutex_unlock (data->dict_mutex);
-
- GThread *writer = g_thread_new ("write worker",
- (GThreadFunc) worker_writer, data);
-
- // Read the output
- g_mutex_lock (data->remaining_mutex);
- guint32 remaining = data->remaining;
- g_mutex_unlock (data->remaining_mutex);
-
- data->output = NULL;
- gpointer *output_end = &data->output;
- while (remaining)
- {
- static gchar next[sizeof (gpointer)];
- GString *s = g_string_new (NULL);
- g_string_append_len (s, next, sizeof next);
-
- gint c;
- while ((c = fgetc (child_stdout)) != EOF && c != '\n')
- g_string_append_c (s, c);
- if (c == EOF)
- fatal ("eSpeak process died too soon\n");
-
- gchar *translation = g_string_free (s, FALSE);
- *output_end = translation;
- output_end = (gpointer *) translation;
-
- // We limit progress reporting so that
- // the mutex doesn't spin like crazy
- if ((--remaining & 255) != 0)
- continue;
-
- g_mutex_lock (data->remaining_mutex);
- data->remaining = remaining;
- g_cond_broadcast (data->remaining_cond);
- g_mutex_unlock (data->remaining_mutex);
- }
-
- if (fgetc (child_stdout) != EOF)
- fatal ("Error: eSpeak has written more lines than it should. "
- "The output would be corrupt, aborting.\n");
-
- fclose (child_stdout);
- return g_thread_join (writer);
-}
-
-// --- Main --------------------------------------------------------------------
-
-int
-main (int argc, char *argv[])
-{
- gint n_processes = 1;
- gchar *voice = NULL;
- gboolean ignore_acronyms = FALSE;
-
- GOptionEntry entries[] =
- {
- { "processes", 'N', G_OPTION_FLAG_IN_MAIN,
- G_OPTION_ARG_INT, &n_processes,
- "The number of espeak processes run in parallel", "PROCESSES" },
- { "voice", 'v', G_OPTION_FLAG_IN_MAIN,
- G_OPTION_ARG_STRING, &voice,
- "The voice to be used by eSpeak to pronounce the words", "VOICE" },
- { "ignore-acronyms", 0, G_OPTION_FLAG_IN_MAIN,
- G_OPTION_ARG_NONE, &ignore_acronyms,
- "Don't spell out words composed of big letters only", NULL },
- { NULL }
- };
-
-G_GNUC_BEGIN_IGNORE_DEPRECATIONS
- if (glib_check_version (2, 36, 0))
- g_type_init ();
-G_GNUC_END_IGNORE_DEPRECATIONS
-
- GError *error = NULL;
- GOptionContext *ctx = g_option_context_new
- ("input.ifo output-basename - add pronunciation to dictionaries");
- g_option_context_add_main_entries (ctx, entries, NULL);
- if (!g_option_context_parse (ctx, &argc, &argv, &error))
- fatal ("Error: option parsing failed: %s\n", error->message);
-
- if (argc != 3)
- fatal ("%s", g_option_context_get_help (ctx, TRUE, NULL));
-
- g_option_context_free (ctx);
-
- // See if we can run espeak
- static gchar *cmdline[] = { "espeak", "--ipa", "-q", NULL, NULL, NULL };
-
- if (voice)
- {
- cmdline[3] = "-v";
- cmdline[4] = voice;
- }
-
- gchar *void_entry = g_strstrip (get_void_entry (cmdline));
-
- // Load the dictionary
- printf ("Loading the original dictionary...\n");
- StardictDict *dict = stardict_dict_new (argv[1], &error);
- if (!dict)
- fatal ("Error: opening the dictionary failed: %s\n", error->message);
-
- gsize n_words = stardict_info_get_word_count
- (stardict_dict_get_info (dict));
-
- if (n_processes <= 0)
- fatal ("Error: there must be at least one process\n");
-
- if ((gsize) n_processes > n_words * 1024)
- {
- n_processes = n_words / 1024;
- if (!n_processes)
- n_processes = 1;
- g_printerr ("Warning: too many processes, reducing to %d\n",
- n_processes);
- }
-
- // Spawn worker threads to generate pronunciation data
- static GMutex dict_mutex;
-
- static GMutex remaining_mutex;
- static GCond remaining_cond;
-
- WorkerData *data = g_alloca (sizeof *data * n_processes);
-
- GRegex *re_stop = g_regex_new ("[" LINE_SPLITTING_CHARS "][ ?]"
- "|\\.\\.\\.|[" OTHER_STOP_CHARS "]", G_REGEX_OPTIMIZE, 0, &error);
- g_assert (re_stop != NULL);
-
- GRegex *re_acronym = g_regex_new ("(^|\\pZ)(\\p{Lu}+)(?=\\pZ|$)",
- G_REGEX_OPTIMIZE, 0, &error);
- g_assert (re_acronym != NULL);
-
- gint i;
- for (i = 0; i < n_processes; i++)
- {
- data[i].start_entry = n_words * i / n_processes;
- data[i].end_entry = n_words * (i + 1) / n_processes;
-
- data[i].total = data[i].remaining =
- data[i].end_entry - data[i].start_entry;
- data[i].remaining_mutex = &remaining_mutex;
- data[i].remaining_cond = &remaining_cond;
-
- data[i].dict = dict;
- data[i].dict_mutex = &dict_mutex;
-
- data[i].re_stop = re_stop;
- data[i].re_acronym = re_acronym;
-
- data[i].cmdline = cmdline;
- data[i].ignore_acronyms = ignore_acronyms;
- data[i].main_thread =
- g_thread_new ("worker", (GThreadFunc) worker, &data[i]);
- }
-
- // Loop while the threads still have some work to do and report status
- g_mutex_lock (&remaining_mutex);
- for (;;)
- {
- gboolean all_finished = TRUE;
- printf ("\rRetrieving pronunciation... ");
- for (i = 0; i < n_processes; i++)
- {
- printf ("%3u%% ", 100 - data[i].remaining * 100 / data[i].total);
- if (data[i].remaining)
- all_finished = FALSE;
- }
-
- if (all_finished)
- break;
- g_cond_wait (&remaining_cond, &remaining_mutex);
- }
- g_mutex_unlock (&remaining_mutex);
-
- putchar ('\n');
- for (i = 0; i < n_processes; i++)
- g_thread_join (data[i].main_thread);
-
- g_regex_unref (re_stop);
- g_regex_unref (re_acronym);
-
- // Put extended entries into a new dictionary
- Generator *generator = generator_new (argv[2], &error);
- if (!generator)
- fatal ("Error: failed to create the output dictionary: %s\n",
- error->message);
-
- StardictInfo *info = generator->info;
- stardict_info_copy (info, stardict_dict_get_info (dict));
-
- // This gets incremented each time an entry is finished
- info->word_count = 0;
-
- if (info->same_type_sequence)
- {
- gchar *new_sts = g_strconcat ("t", info->same_type_sequence, NULL);
- g_free (info->same_type_sequence);
- info->same_type_sequence = new_sts;
- }
-
- // Write out all the entries together with the pronunciation
- for (i = 0; i < n_processes; i++)
- {
- StardictIterator *iterator =
- stardict_iterator_new (dict, data[i].start_entry);
-
- gpointer *output = data[i].output;
- while (stardict_iterator_get_offset (iterator) != data[i].end_entry)
- {
- printf ("\rCreating a new dictionary... %3lu%%",
- (gulong) stardict_iterator_get_offset (iterator) * 100
- / stardict_dict_get_info (dict)->word_count);
-
- g_assert (output != NULL);
-
- gchar *pronunciation = g_strstrip ((gchar *) (output + 1));
- StardictEntry *entry = stardict_iterator_get_entry (iterator);
-
- generator_begin_entry (generator);
-
- if (!strcmp (pronunciation, void_entry))
- *pronunciation = 0;
-
-// g_printerr ("%s /%s/\n",
-// stardict_iterator_get_word (iterator), pronunciation);
-
- // For the sake of simplicity we fake a new start;
- // write_fields() only iterates the list in one direction.
- StardictEntryField field;
- field.type = 't';
- field.data = pronunciation;
-
- GList start_link;
- start_link.next = entry->fields;
- start_link.data = &field;
-
- if (!generator_write_fields (generator, &start_link, &error)
- || !generator_finish_entry (generator,
- stardict_iterator_get_word (iterator), &error))
- fatal ("Error: write failed: %s\n", error->message);
-
- g_object_unref (entry);
-
- gpointer *tmp = output;
- output = *output;
- g_free (tmp);
-
- stardict_iterator_next (iterator);
- }
-
- g_assert (output == NULL);
- g_object_unref (iterator);
- }
-
- putchar ('\n');
- if (!generator_finish (generator, &error))
- fatal ("Error: failed to write the dictionary: %s\n", error->message);
-
- generator_free (generator);
- g_object_unref (dict);
- g_free (void_entry);
- return 0;
-}
diff --git a/src/query-tool.c b/src/query-tool.c
deleted file mode 100644
index 6cfdc66..0000000
--- a/src/query-tool.c
+++ /dev/null
@@ -1,313 +0,0 @@
-/*
- * A tool to query multiple dictionaries for the specified word
- *
- * Intended for use in IRC bots and similar silly things---words go in,
- * one per each line, and entries come out, one dictionary at a time,
- * finalised with an empty line. Newlines are escaped with `\n',
- * backslashes with `\\'.
- *
- * So far only the `m', `g`, and `x` fields are supported, as in tdv.
- *
- * Copyright (c) 2013 - 2021, Přemysl Eric Janouch
- *
- * Permission to use, copy, modify, and/or distribute this software for any
- * purpose with or without fee is hereby granted.
- *
- * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
- * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
- * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
- * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
- * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
- *
- */
-
-#include
-#include
-#include
-#include
-
-#include
-#include
-#include
-
-#include "stardict.h"
-#include "stardict-private.h"
-#include "generator.h"
-#include "utils.h"
-
-
-// --- Output formatting -------------------------------------------------------
-
-/// Transform Pango attributes to in-line formatting sequences (non-reentrant)
-typedef const gchar *(*FormatterFunc) (PangoAttrIterator *);
-
-static const gchar *
-pango_attrs_ignore (G_GNUC_UNUSED PangoAttrIterator *iterator)
-{
- return "";
-}
-
-static const gchar *
-pango_attrs_to_irc (PangoAttrIterator *iterator)
-{
- static gchar buf[5];
- gchar *p = buf;
- *p++ = 0x0f;
-
- if (!iterator)
- goto reset_formatting;
-
- PangoAttrInt *attr = NULL;
- if ((attr = (PangoAttrInt *) pango_attr_iterator_get (iterator,
- PANGO_ATTR_WEIGHT)) && attr->value >= PANGO_WEIGHT_BOLD)
- *p++ = 0x02;
- if ((attr = (PangoAttrInt *) pango_attr_iterator_get (iterator,
- PANGO_ATTR_UNDERLINE)) && attr->value == PANGO_UNDERLINE_SINGLE)
- *p++ = 0x1f;
- if ((attr = (PangoAttrInt *) pango_attr_iterator_get (iterator,
- PANGO_ATTR_STYLE)) && attr->value == PANGO_STYLE_ITALIC)
- *p++ = 0x1d;
-
-reset_formatting:
- *p++ = 0;
- return buf;
-}
-
-static const gchar *
-pango_attrs_to_ansi (PangoAttrIterator *iterator)
-{
- static gchar buf[16];
- g_strlcpy (buf, "\x1b[0", sizeof buf);
- if (!iterator)
- goto reset_formatting;
-
- PangoAttrInt *attr = NULL;
- if ((attr = (PangoAttrInt *) pango_attr_iterator_get (iterator,
- PANGO_ATTR_WEIGHT)) && attr->value >= PANGO_WEIGHT_BOLD)
- g_strlcat (buf, ";1", sizeof buf);
- if ((attr = (PangoAttrInt *) pango_attr_iterator_get (iterator,
- PANGO_ATTR_UNDERLINE)) && attr->value == PANGO_UNDERLINE_SINGLE)
- g_strlcat (buf, ";4", sizeof buf);
- if ((attr = (PangoAttrInt *) pango_attr_iterator_get (iterator,
- PANGO_ATTR_STYLE)) && attr->value == PANGO_STYLE_ITALIC)
- g_strlcat (buf, ";3", sizeof buf);
-
-reset_formatting:
- g_strlcat (buf, "m", sizeof buf);
- return buf;
-}
-
-static gchar *
-pango_to_output_text (const gchar *markup, FormatterFunc formatter)
-{
- // This function skips leading whitespace, but it's the canonical one
- gchar *text = NULL;
- PangoAttrList *attrs = NULL;
- if (!pango_parse_markup (markup, -1, 0, &attrs, &text, NULL, NULL))
- return g_strdup_printf ("<%s>", ("error in entry"));
-
- PangoAttrIterator *iterator = pango_attr_list_get_iterator (attrs);
- GString *result = g_string_new ("");
- do
- {
- gint start = 0, end = 0;
- pango_attr_iterator_range (iterator, &start, &end);
- if (end == G_MAXINT)
- end = strlen (text);
-
- g_string_append (result, formatter (iterator));
- g_string_append_len (result, text + start, end - start);
- }
- while (pango_attr_iterator_next (iterator));
- g_string_append (result, formatter (NULL));
-
- g_free (text);
- pango_attr_iterator_destroy (iterator);
- pango_attr_list_unref (attrs);
- return g_string_free (result, FALSE);
-}
-
-static gchar *
-field_to_output_text (const StardictEntryField *field, FormatterFunc formatter)
-{
- const gchar *definition = field->data;
- if (field->type == STARDICT_FIELD_MEANING)
- return g_strdup (definition);
- if (field->type == STARDICT_FIELD_PANGO)
- return pango_to_output_text (definition, formatter);
- if (field->type == STARDICT_FIELD_XDXF)
- {
- gchar *markup = xdxf_to_pango_markup_with_reduced_effort (definition);
- gchar *result = pango_to_output_text (markup, formatter);
- g_free (markup);
- return result;
- }
- return NULL;
-}
-
-// --- Main --------------------------------------------------------------------
-
-static guint
-count_equal_chars (const gchar *a, const gchar *b)
-{
- guint count = 0;
- while (*a && *b)
- if (*a++ == *b++)
- count++;
- return count;
-}
-
-static void
-do_dictionary (StardictDict *dict, const gchar *word, FormatterFunc formatter)
-{
- gboolean found;
- StardictIterator *iter = stardict_dict_search (dict, word, &found);
- if (!found)
- goto out;
-
- // Default Stardict ordering is ASCII case-insensitive,
- // which may be further exacerbated by our own collation feature.
- // Try to find a better matching entry:
-
- gint64 best_offset = stardict_iterator_get_offset (iter);
- guint best_score = count_equal_chars
- (stardict_iterator_get_word (iter), word);
-
- while (TRUE)
- {
- stardict_iterator_next (iter);
- if (!stardict_iterator_is_valid (iter))
- break;
-
- const gchar *iter_word = stardict_iterator_get_word (iter);
- if (g_ascii_strcasecmp (iter_word, word))
- break;
-
- guint score = count_equal_chars (iter_word, word);
- if (score > best_score)
- {
- best_offset = stardict_iterator_get_offset (iter);
- best_score = score;
- }
- }
-
- stardict_iterator_set_offset (iter, best_offset, FALSE);
-
- StardictEntry *entry = stardict_iterator_get_entry (iter);
- StardictInfo *info = stardict_dict_get_info (dict);
- const GList *list = stardict_entry_get_fields (entry);
- for (; list; list = list->next)
- {
- StardictEntryField *field = list->data;
- gchar *definitions = field_to_output_text (field, formatter);
- if (!definitions)
- continue;
-
- printf ("%s\t", info->book_name);
- for (const gchar *p = definitions; *p; p++)
- {
- if (*p == '\\')
- printf ("\\\\");
- else if (*p == '\n')
- printf ("\\n");
- else
- putchar (*p);
- }
- putchar ('\n');
- g_free (definitions);
- }
- g_object_unref (entry);
-out:
- g_object_unref (iter);
-}
-
-static FormatterFunc
-parse_options (int *argc, char ***argv)
-{
- GError *error = NULL;
- GOptionContext *ctx = g_option_context_new
- ("DICTIONARY.ifo... - query multiple dictionaries");
-
- gboolean format_with_ansi = FALSE;
- gboolean format_with_irc = FALSE;
- GOptionEntry entries[] =
- {
- { "ansi", 'a', 0, G_OPTION_ARG_NONE, &format_with_ansi,
- "Format with ANSI sequences", NULL },
- { "irc", 'i', 0, G_OPTION_ARG_NONE, &format_with_irc,
- "Format with IRC codes", NULL },
- { }
- };
-
- g_option_context_add_main_entries (ctx, entries, NULL);
- if (!g_option_context_parse (ctx, argc, argv, &error))
- {
- g_printerr ("Error: option parsing failed: %s\n", error->message);
- exit (EXIT_FAILURE);
- }
- if (*argc < 2)
- {
- g_printerr ("%s\n", g_option_context_get_help (ctx, TRUE, NULL));
- exit (EXIT_FAILURE);
- }
- g_option_context_free (ctx);
-
- if (format_with_ansi)
- return pango_attrs_to_ansi;
- if (format_with_irc)
- return pango_attrs_to_irc;
-
- return pango_attrs_ignore;
-}
-
-int
-main (int argc, char *argv[])
-{
-G_GNUC_BEGIN_IGNORE_DEPRECATIONS
- if (glib_check_version (2, 36, 0))
- g_type_init ();
-G_GNUC_END_IGNORE_DEPRECATIONS
-
- FormatterFunc formatter = parse_options (&argc, &argv);
-
- guint n_dicts = argc - 1;
- StardictDict **dicts = g_alloca (sizeof *dicts * n_dicts);
-
- guint i;
- for (i = 1; i <= n_dicts; i++)
- {
- GError *error = NULL;
- dicts[i - 1] = stardict_dict_new (argv[i], &error);
- if (error)
- {
- g_printerr ("Error: opening dictionary `%s' failed: %s\n",
- argv[i], error->message);
- exit (EXIT_FAILURE);
- }
- }
-
- gint c;
- do
- {
- GString *s = g_string_new (NULL);
- while ((c = getchar ()) != EOF && c != '\n')
- if (c != '\r')
- g_string_append_c (s, c);
-
- if (s->len)
- for (i = 0; i < n_dicts; i++)
- do_dictionary (dicts[i], s->str, formatter);
-
- printf ("\n");
- fflush (NULL);
- g_string_free (s, TRUE);
- }
- while (c != EOF);
-
- for (i = 0; i < n_dicts; i++)
- g_object_unref (dicts[i]);
-
- return 0;
-}
diff --git a/src/tabfile.c b/src/tabfile.c
deleted file mode 100644
index fab0ef2..0000000
--- a/src/tabfile.c
+++ /dev/null
@@ -1,223 +0,0 @@
-/*
- * A clean reimplementation of StarDict's tabfile
- *
- * Copyright (c) 2020 - 2021, Přemysl Eric Janouch
- *
- * Permission to use, copy, modify, and/or distribute this software for any
- * purpose with or without fee is hereby granted.
- *
- * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
- * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
- * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
- * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
- * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
- *
- */
-
-#include
-#include
-#include
-#include
-#include
-
-#include
-#include
-#include
-
-#include
-
-#include "config.h"
-#include "stardict.h"
-#include "stardict-private.h"
-#include "generator.h"
-#include "utils.h"
-
-
-static gboolean
-set_data_error (GError **error, const gchar *message)
-{
- g_set_error_literal (error, G_IO_ERROR, G_IO_ERROR_INVALID_DATA, message);
- return FALSE;
-}
-
-static const gchar escapes[256] = { ['n'] = '\n', ['t'] = '\t', ['\\'] = '\\' };
-
-static gboolean
-inplace_unescape (gchar *line, GError **error)
-{
- gboolean escape = FALSE;
- gchar *dest = line;
- for (gchar *src = line; *src; src++)
- {
- if (escape)
- {
- escape = FALSE;
- if (!(*dest++ = escapes[(guchar) *src]))
- return set_data_error (error, "unsupported escape");
- }
- else if (*src == '\\')
- escape = TRUE;
- else
- *dest++ = *src;
- }
- if (escape)
- return set_data_error (error, "trailing escape character");
-
- *dest = 0;
- return TRUE;
-}
-
-static gboolean
-import_line (Generator *generator, gchar *line, gsize len, GError **error)
-{
- if (!len)
- return TRUE;
- if (!g_utf8_validate_len (line, len, NULL))
- return set_data_error (error, "not valid UTF-8");
-
- gchar *separator = strchr (line, '\t');
- if (!separator)
- return set_data_error (error, "keyword separator not found");
-
- *separator++ = 0;
- if (strchr (line, '\\'))
- // The index wouldn't be sorted correctly with our method
- return set_data_error (error, "escapes not allowed in keywords");
-
- gchar *newline = strpbrk (separator, "\r\n");
- if (newline)
- *newline = 0;
-
- if (!inplace_unescape (line, error)
- || !inplace_unescape (separator, error))
- return FALSE;
-
- if (generator->info->same_type_sequence
- && *generator->info->same_type_sequence == STARDICT_FIELD_PANGO
- && !pango_parse_markup (separator, -1, 0, NULL, NULL, NULL, error))
- return FALSE;
-
- generator_begin_entry (generator);
- return generator_write_string (generator, separator, TRUE, error)
- && generator_finish_entry (generator, line, error);
-}
-
-static gboolean
-transform (FILE *fsorted, Generator *generator, GError **error)
-{
- gchar *line = NULL;
- gsize size = 0, ln = 1;
- for (ssize_t read; (read = getline (&line, &size, fsorted)) >= 0; ln++)
- if (!import_line (generator, line, read, error))
- break;
-
- free (line);
- if (ferror (fsorted))
- {
- g_set_error_literal (error, G_IO_ERROR,
- g_io_error_from_errno (errno), g_strerror (errno));
- return FALSE;
- }
- if (!feof (fsorted))
- {
- // You'll only get good line number output with presorted input!
- g_prefix_error (error, "line %zu: ", ln);
- return FALSE;
- }
- return TRUE;
-}
-
-static void
-validate_collation_locale (const gchar *locale)
-{
- UErrorCode error = U_ZERO_ERROR;
- UCollator *collator = ucol_open (locale, &error);
- if (!collator)
- fatal ("failed to create a collator for %s: %s\n",
- locale, u_errorName (error));
- ucol_close (collator);
-}
-
-int
-main (int argc, char *argv[])
-{
- // The GLib help includes an ellipsis character, for some reason
- (void) setlocale (LC_ALL, "");
-
- GError *error = NULL;
- GOptionContext *ctx = g_option_context_new ("output-basename < input");
- g_option_context_set_summary (ctx,
- "Create a StarDict dictionary from plaintext.");
-
- gboolean pango_markup = FALSE;
- StardictInfo template = {};
- GOptionEntry entries[] =
- {
- { "pango", 'p', 0, G_OPTION_ARG_NONE, &pango_markup,
- "Entries use Pango markup", NULL },
-
- { "book-name", 'b', 0, G_OPTION_ARG_STRING, &template.book_name,
- "Set the book name field", "TEXT" },
- { "author", 'a', 0, G_OPTION_ARG_STRING, &template.author,
- "Set the author field ", "NAME" },
- { "e-mail", 'e', 0, G_OPTION_ARG_STRING, &template.email,
- "Set the e-mail field", "ADDRESS" },
- { "website", 'w', 0, G_OPTION_ARG_STRING, &template.website,
- "Set the website field", "LINK" },
- { "description", 'd', 0, G_OPTION_ARG_STRING, &template.description,
- "Set the description field (newlines supported)", "TEXT" },
- { "date", 'D', 0, G_OPTION_ARG_STRING, &template.date,
- "Set the date field", "DATE" },
- { "collation", 'c', 0, G_OPTION_ARG_STRING, &template.collation,
- "Set the collation field (for ICU)", "LOCALE" },
- { }
- };
-
- g_option_context_add_main_entries (ctx, entries, GETTEXT_PACKAGE);
- if (!g_option_context_parse (ctx, &argc, &argv, &error))
- fatal ("Error: option parsing failed: %s\n", error->message);
- if (argc != 2)
- fatal ("%s", g_option_context_get_help (ctx, TRUE, NULL));
- g_option_context_free (ctx);
-
- template.version = SD_VERSION_3_0_0;
- template.same_type_sequence = pango_markup
- ? (char[]) { STARDICT_FIELD_PANGO, 0 }
- : (char[]) { STARDICT_FIELD_MEANING, 0 };
-
- if (!template.book_name)
- template.book_name = argv[1];
- if (template.description)
- {
- gchar **lines = g_strsplit (template.description, "\n", -1);
- g_free (template.description);
- gchar *in_one_line = g_strjoinv ("<br>", lines);
- g_strfreev (lines);
- template.description = in_one_line;
- }
- if (template.collation)
- validate_collation_locale (template.collation);
-
- // This actually implements stardict_strcmp(), POSIX-compatibly.
- // Your sort(1) is not expected to be stable by default, like bsdsort is.
- FILE *fsorted = popen ("LC_ALL=C sort -t'\t' -k1f,1", "r");
- if (!fsorted)
- fatal ("%s: %s\n", "popen", g_strerror (errno));
-
- Generator *generator = generator_new (argv[1], &error);
- if (!generator)
- fatal ("Error: failed to create the output dictionary: %s\n",
- error->message);
-
- StardictInfo *info = generator->info;
- stardict_info_copy (info, &template);
- if (!transform (fsorted, generator, &error)
- || !generator_finish (generator, &error))
- fatal ("Error: failed to write the dictionary: %s\n", error->message);
-
- generator_free (generator);
- pclose (fsorted);
- return 0;
-}
diff --git a/src/tdv-add-pronunciation.c b/src/tdv-add-pronunciation.c
new file mode 100644
index 0000000..90d9673
--- /dev/null
+++ b/src/tdv-add-pronunciation.c
@@ -0,0 +1,469 @@
+/*
+ * A tool to add eSpeak-generated pronunciation to dictionaries
+ *
+ * Here I use the `espeak' process rather than libespeak because of the GPL.
+ * It's far from ideal, rather good as a starting point.
+ *
+ * Copyright (c) 2013, Přemysl Eric Janouch
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+ * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ *
+ */
+
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+#include "stardict.h"
+#include "stardict-private.h"
+#include "generator.h"
+#include "utils.h"
+
+
+// --- Pronunciation generator -------------------------------------------------
+
+typedef struct worker_data WorkerData;
+
+struct worker_data
+{
+ gchar **cmdline; ///< eSpeak command line
+ guint ignore_acronyms : 1; ///< Don't spell out acronyms
+ GRegex *re_stop; ///< Regex for stop sequences
+ GRegex *re_acronym; ///< Regex for ACRONYMS
+
+ guint32 start_entry; ///< The first entry to be processed
+ guint32 end_entry; ///< Past the last entry to be processed
+
+ // Reader, writer
+ GMutex *dict_mutex; ///< Locks the dictionary object
+
+ // Reader
+ GThread *main_thread; ///< A handle to the reader thread
+ StardictDict *dict; ///< The dictionary object
+ gpointer output; ///< Linked-list of pronunciation data
+
+ GMutex *remaining_mutex; ///< Locks the progress stats
+ GCond *remaining_cond; ///< Signals a change in progress
+ guint32 remaining; ///< How many entries remain
+ guint32 total; ///< Total number of entries
+
+ // Writer
+ StardictIterator *iterator; ///< Iterates over the dictionary
+ FILE *child_stdin; ///< Standard input of eSpeak
+};
+
+/// eSpeak splits the output on certain characters.
+#define LINE_SPLITTING_CHARS ".,:;?!"
+
+/// We don't want to include brackets either.
+#define OTHER_STOP_CHARS "([{<"
+
+/// A void word used to make a unique "no pronunciation available" mark.
+#define VOID_ENTRY "not present in any dictionary"
+
+
+/// Adds dots between characters.
+static gboolean
+writer_acronym_cb (const GMatchInfo *info, GString *res,
+ G_GNUC_UNUSED gpointer data)
+{
+ gchar *preceding = g_match_info_fetch (info, 1);
+ g_string_append (res, preceding);
+ g_free (preceding);
+
+ gchar *word = g_match_info_fetch (info, 2);
+
+ g_string_append_c (res, *word);
+ const gchar *p;
+ for (p = word + 1; *p; p++)
+ {
+ g_string_append_c (res, '.');
+ g_string_append_c (res, *p);
+ }
+
+ g_free (word);
+ return FALSE;
+}
+
+/// Writes to espeak's stdin.
+static gpointer
+worker_writer (WorkerData *data)
+{
+ GError *error = NULL;
+ GMatchInfo *match_info;
+ while (stardict_iterator_get_offset (data->iterator) != data->end_entry)
+ {
+ g_mutex_lock (data->dict_mutex);
+ const gchar *word = stardict_iterator_get_word (data->iterator);
+ g_mutex_unlock (data->dict_mutex);
+
+ word += strspn (word, LINE_SPLITTING_CHARS " \t");
+ gchar *x = g_strdup (word);
+
+ // Cut the word if need be
+ error = NULL;
+ if (g_regex_match_full (data->re_stop,
+ x, -1, 0, 0, &match_info, &error))
+ {
+ gint start_pos;
+ g_match_info_fetch_pos (match_info, 0, &start_pos, NULL);
+ x[start_pos] = 0;
+ }
+ g_match_info_free (match_info);
+
+ // Change acronyms so that they're not pronounced as words
+ if (!error && !data->ignore_acronyms)
+ {
+ char *tmp = g_regex_replace_eval (data->re_acronym,
+ x, -1, 0, 0, writer_acronym_cb, NULL, &error);
+ g_free (x);
+ x = tmp;
+ }
+
+ if (error)
+ {
+ g_printerr ("Notice: error processing '%s': %s\n",
+ word, error->message);
+ g_clear_error (&error);
+ *x = 0;
+ }
+
+ // We might have accidentally cut off everything
+ if (!*x)
+ {
+ g_free (x);
+ x = g_strdup (VOID_ENTRY);
+ }
+
+ stardict_iterator_next (data->iterator);
+ if (fprintf (data->child_stdin, "%s\n", x) < 0)
+ fatal ("write to eSpeak failed: %s\n", g_strerror (errno));
+
+ g_free (x);
+ }
+
+ g_object_unref (data->iterator);
+ return GINT_TO_POINTER (fclose (data->child_stdin));
+}
+
+/// Get the void entry (and test if espeak works).
+static gchar *
+get_void_entry (gchar *cmdline[])
+{
+ gchar *output;
+ gint exit_status;
+
+ GError *error = NULL;
+ if (!g_spawn_sync (NULL, cmdline, NULL,
+ G_SPAWN_SEARCH_PATH | G_SPAWN_STDERR_TO_DEV_NULL, NULL, NULL,
+ &output, NULL, &exit_status, &error))
+ fatal ("Error: couldn't spawn espeak: %s\n", error->message);
+
+ if (exit_status)
+ fatal ("Error: espeak returned %d\n", exit_status);
+
+ return output;
+}
+
+/// Reads from espeak's stdout.
+static gpointer
+worker (WorkerData *data)
+{
+ // Spawn eSpeak
+ GError *error = NULL;
+ gint child_in, child_out;
+ if (!g_spawn_async_with_pipes (NULL, data->cmdline, NULL,
+ G_SPAWN_SEARCH_PATH, NULL, NULL,
+ NULL, &child_in, &child_out, NULL, &error))
+ fatal ("g_spawn: %s\n", error->message);
+
+ data->child_stdin = fdopen (child_in, "wb");
+ if (!data->child_stdin)
+ perror ("fdopen");
+
+ FILE *child_stdout = fdopen (child_out, "rb");
+ if (!child_stdout)
+ perror ("fdopen");
+
+ // Spawn a writer thread
+ g_mutex_lock (data->dict_mutex);
+ data->iterator = stardict_iterator_new (data->dict, data->start_entry);
+ g_mutex_unlock (data->dict_mutex);
+
+ GThread *writer = g_thread_new ("write worker",
+ (GThreadFunc) worker_writer, data);
+
+ // Read the output
+ g_mutex_lock (data->remaining_mutex);
+ guint32 remaining = data->remaining;
+ g_mutex_unlock (data->remaining_mutex);
+
+ data->output = NULL;
+ gpointer *output_end = &data->output;
+ while (remaining)
+ {
+ static gchar next[sizeof (gpointer)];
+ GString *s = g_string_new (NULL);
+ g_string_append_len (s, next, sizeof next);
+
+ gint c;
+ while ((c = fgetc (child_stdout)) != EOF && c != '\n')
+ g_string_append_c (s, c);
+ if (c == EOF)
+ fatal ("eSpeak process died too soon\n");
+
+ gchar *translation = g_string_free (s, FALSE);
+ *output_end = translation;
+ output_end = (gpointer *) translation;
+
+ // We limit progress reporting so that
+ // the mutex doesn't spin like crazy
+ if ((--remaining & 255) != 0)
+ continue;
+
+ g_mutex_lock (data->remaining_mutex);
+ data->remaining = remaining;
+ g_cond_broadcast (data->remaining_cond);
+ g_mutex_unlock (data->remaining_mutex);
+ }
+
+ if (fgetc (child_stdout) != EOF)
+ fatal ("Error: eSpeak has written more lines than it should. "
+ "The output would be corrupt, aborting.\n");
+
+ fclose (child_stdout);
+ return g_thread_join (writer);
+}
+
+// --- Main --------------------------------------------------------------------
+
+int
+main (int argc, char *argv[])
+{
+ gint n_processes = 1;
+ gchar *voice = NULL;
+ gboolean ignore_acronyms = FALSE;
+
+ GOptionEntry entries[] =
+ {
+ { "processes", 'N', G_OPTION_FLAG_IN_MAIN,
+ G_OPTION_ARG_INT, &n_processes,
+ "The number of espeak processes run in parallel", "PROCESSES" },
+ { "voice", 'v', G_OPTION_FLAG_IN_MAIN,
+ G_OPTION_ARG_STRING, &voice,
+ "The voice to be used by eSpeak to pronounce the words", "VOICE" },
+ { "ignore-acronyms", 0, G_OPTION_FLAG_IN_MAIN,
+ G_OPTION_ARG_NONE, &ignore_acronyms,
+ "Don't spell out words composed of big letters only", NULL },
+ { NULL }
+ };
+
+G_GNUC_BEGIN_IGNORE_DEPRECATIONS
+ if (glib_check_version (2, 36, 0))
+ g_type_init ();
+G_GNUC_END_IGNORE_DEPRECATIONS
+
+ GError *error = NULL;
+ GOptionContext *ctx = g_option_context_new
+ ("input.ifo output-basename - add pronunciation to dictionaries");
+ g_option_context_add_main_entries (ctx, entries, NULL);
+ if (!g_option_context_parse (ctx, &argc, &argv, &error))
+ fatal ("Error: option parsing failed: %s\n", error->message);
+
+ if (argc != 3)
+ fatal ("%s", g_option_context_get_help (ctx, TRUE, NULL));
+
+ g_option_context_free (ctx);
+
+ // See if we can run espeak
+ static gchar *cmdline[] = { "espeak", "--ipa", "-q", NULL, NULL, NULL };
+
+ if (voice)
+ {
+ cmdline[3] = "-v";
+ cmdline[4] = voice;
+ }
+
+ gchar *void_entry = g_strstrip (get_void_entry (cmdline));
+
+ // Load the dictionary
+ printf ("Loading the original dictionary...\n");
+ StardictDict *dict = stardict_dict_new (argv[1], &error);
+ if (!dict)
+ fatal ("Error: opening the dictionary failed: %s\n", error->message);
+
+ gsize n_words = stardict_info_get_word_count
+ (stardict_dict_get_info (dict));
+
+ if (n_processes <= 0)
+ fatal ("Error: there must be at least one process\n");
+
+ if ((gsize) n_processes > n_words * 1024)
+ {
+ n_processes = n_words / 1024;
+ if (!n_processes)
+ n_processes = 1;
+ g_printerr ("Warning: too many processes, reducing to %d\n",
+ n_processes);
+ }
+
+ // Spawn worker threads to generate pronunciation data
+ static GMutex dict_mutex;
+
+ static GMutex remaining_mutex;
+ static GCond remaining_cond;
+
+ WorkerData *data = g_alloca (sizeof *data * n_processes);
+
+ GRegex *re_stop = g_regex_new ("[" LINE_SPLITTING_CHARS "][ ?]"
+ "|\\.\\.\\.|[" OTHER_STOP_CHARS "]", G_REGEX_OPTIMIZE, 0, &error);
+ g_assert (re_stop != NULL);
+
+ GRegex *re_acronym = g_regex_new ("(^|\\pZ)(\\p{Lu}+)(?=\\pZ|$)",
+ G_REGEX_OPTIMIZE, 0, &error);
+ g_assert (re_acronym != NULL);
+
+ gint i;
+ for (i = 0; i < n_processes; i++)
+ {
+ data[i].start_entry = n_words * i / n_processes;
+ data[i].end_entry = n_words * (i + 1) / n_processes;
+
+ data[i].total = data[i].remaining =
+ data[i].end_entry - data[i].start_entry;
+ data[i].remaining_mutex = &remaining_mutex;
+ data[i].remaining_cond = &remaining_cond;
+
+ data[i].dict = dict;
+ data[i].dict_mutex = &dict_mutex;
+
+ data[i].re_stop = re_stop;
+ data[i].re_acronym = re_acronym;
+
+ data[i].cmdline = cmdline;
+ data[i].ignore_acronyms = ignore_acronyms;
+ data[i].main_thread =
+ g_thread_new ("worker", (GThreadFunc) worker, &data[i]);
+ }
+
+ // Loop while the threads still have some work to do and report status
+ g_mutex_lock (&remaining_mutex);
+ for (;;)
+ {
+ gboolean all_finished = TRUE;
+ printf ("\rRetrieving pronunciation... ");
+ for (i = 0; i < n_processes; i++)
+ {
+ printf ("%3u%% ", 100 - data[i].remaining * 100 / data[i].total);
+ if (data[i].remaining)
+ all_finished = FALSE;
+ }
+
+ if (all_finished)
+ break;
+ g_cond_wait (&remaining_cond, &remaining_mutex);
+ }
+ g_mutex_unlock (&remaining_mutex);
+
+ putchar ('\n');
+ for (i = 0; i < n_processes; i++)
+ g_thread_join (data[i].main_thread);
+
+ g_regex_unref (re_stop);
+ g_regex_unref (re_acronym);
+
+ // Put extended entries into a new dictionary
+ Generator *generator = generator_new (argv[2], &error);
+ if (!generator)
+ fatal ("Error: failed to create the output dictionary: %s\n",
+ error->message);
+
+ StardictInfo *info = generator->info;
+ stardict_info_copy (info, stardict_dict_get_info (dict));
+
+ // This gets incremented each time an entry is finished
+ info->word_count = 0;
+
+ if (info->same_type_sequence)
+ {
+ gchar *new_sts = g_strconcat ("t", info->same_type_sequence, NULL);
+ g_free (info->same_type_sequence);
+ info->same_type_sequence = new_sts;
+ }
+
+ // Write out all the entries together with the pronunciation
+ for (i = 0; i < n_processes; i++)
+ {
+ StardictIterator *iterator =
+ stardict_iterator_new (dict, data[i].start_entry);
+
+ gpointer *output = data[i].output;
+ while (stardict_iterator_get_offset (iterator) != data[i].end_entry)
+ {
+ printf ("\rCreating a new dictionary... %3lu%%",
+ (gulong) stardict_iterator_get_offset (iterator) * 100
+ / stardict_dict_get_info (dict)->word_count);
+
+ g_assert (output != NULL);
+
+ gchar *pronunciation = g_strstrip ((gchar *) (output + 1));
+ StardictEntry *entry = stardict_iterator_get_entry (iterator);
+
+ generator_begin_entry (generator);
+
+ if (!strcmp (pronunciation, void_entry))
+ *pronunciation = 0;
+
+// g_printerr ("%s /%s/\n",
+// stardict_iterator_get_word (iterator), pronunciation);
+
+ // For the sake of simplicity we fake a new start;
+ // write_fields() only iterates the list in one direction.
+ StardictEntryField field;
+ field.type = 't';
+ field.data = pronunciation;
+
+ GList start_link;
+ start_link.next = entry->fields;
+ start_link.data = &field;
+
+ if (!generator_write_fields (generator, &start_link, &error)
+ || !generator_finish_entry (generator,
+ stardict_iterator_get_word (iterator), &error))
+ fatal ("Error: write failed: %s\n", error->message);
+
+ g_object_unref (entry);
+
+ gpointer *tmp = output;
+ output = *output;
+ g_free (tmp);
+
+ stardict_iterator_next (iterator);
+ }
+
+ g_assert (output == NULL);
+ g_object_unref (iterator);
+ }
+
+ putchar ('\n');
+ if (!generator_finish (generator, &error))
+ fatal ("Error: failed to write the dictionary: %s\n", error->message);
+
+ generator_free (generator);
+ g_object_unref (dict);
+ g_free (void_entry);
+ return 0;
+}
diff --git a/src/tdv-query-tool.c b/src/tdv-query-tool.c
new file mode 100644
index 0000000..6cfdc66
--- /dev/null
+++ b/src/tdv-query-tool.c
@@ -0,0 +1,313 @@
+/*
+ * A tool to query multiple dictionaries for the specified word
+ *
+ * Intended for use in IRC bots and similar silly things---words go in,
+ * one per each line, and entries come out, one dictionary at a time,
+ * finalised with an empty line. Newlines are escaped with `\n',
+ * backslashes with `\\'.
+ *
+ * So far only the `m', `g`, and `x` fields are supported, as in tdv.
+ *
+ * Copyright (c) 2013 - 2021, Přemysl Eric Janouch
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+ * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ *
+ */
+
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+
+#include "stardict.h"
+#include "stardict-private.h"
+#include "generator.h"
+#include "utils.h"
+
+
+// --- Output formatting -------------------------------------------------------
+
+/// Transform Pango attributes to in-line formatting sequences (non-reentrant)
+typedef const gchar *(*FormatterFunc) (PangoAttrIterator *);
+
+static const gchar *
+pango_attrs_ignore (G_GNUC_UNUSED PangoAttrIterator *iterator)
+{
+ return "";
+}
+
+static const gchar *
+pango_attrs_to_irc (PangoAttrIterator *iterator)
+{
+ static gchar buf[5];
+ gchar *p = buf;
+ *p++ = 0x0f;
+
+ if (!iterator)
+ goto reset_formatting;
+
+ PangoAttrInt *attr = NULL;
+ if ((attr = (PangoAttrInt *) pango_attr_iterator_get (iterator,
+ PANGO_ATTR_WEIGHT)) && attr->value >= PANGO_WEIGHT_BOLD)
+ *p++ = 0x02;
+ if ((attr = (PangoAttrInt *) pango_attr_iterator_get (iterator,
+ PANGO_ATTR_UNDERLINE)) && attr->value == PANGO_UNDERLINE_SINGLE)
+ *p++ = 0x1f;
+ if ((attr = (PangoAttrInt *) pango_attr_iterator_get (iterator,
+ PANGO_ATTR_STYLE)) && attr->value == PANGO_STYLE_ITALIC)
+ *p++ = 0x1d;
+
+reset_formatting:
+ *p++ = 0;
+ return buf;
+}
+
+static const gchar *
+pango_attrs_to_ansi (PangoAttrIterator *iterator)
+{
+ static gchar buf[16];
+ g_strlcpy (buf, "\x1b[0", sizeof buf);
+ if (!iterator)
+ goto reset_formatting;
+
+ PangoAttrInt *attr = NULL;
+ if ((attr = (PangoAttrInt *) pango_attr_iterator_get (iterator,
+ PANGO_ATTR_WEIGHT)) && attr->value >= PANGO_WEIGHT_BOLD)
+ g_strlcat (buf, ";1", sizeof buf);
+ if ((attr = (PangoAttrInt *) pango_attr_iterator_get (iterator,
+ PANGO_ATTR_UNDERLINE)) && attr->value == PANGO_UNDERLINE_SINGLE)
+ g_strlcat (buf, ";4", sizeof buf);
+ if ((attr = (PangoAttrInt *) pango_attr_iterator_get (iterator,
+ PANGO_ATTR_STYLE)) && attr->value == PANGO_STYLE_ITALIC)
+ g_strlcat (buf, ";3", sizeof buf);
+
+reset_formatting:
+ g_strlcat (buf, "m", sizeof buf);
+ return buf;
+}
+
+static gchar *
+pango_to_output_text (const gchar *markup, FormatterFunc formatter)
+{
+ // This function skips leading whitespace, but it's the canonical one
+ gchar *text = NULL;
+ PangoAttrList *attrs = NULL;
+ if (!pango_parse_markup (markup, -1, 0, &attrs, &text, NULL, NULL))
+ return g_strdup_printf ("<%s>", ("error in entry"));
+
+ PangoAttrIterator *iterator = pango_attr_list_get_iterator (attrs);
+ GString *result = g_string_new ("");
+ do
+ {
+ gint start = 0, end = 0;
+ pango_attr_iterator_range (iterator, &start, &end);
+ if (end == G_MAXINT)
+ end = strlen (text);
+
+ g_string_append (result, formatter (iterator));
+ g_string_append_len (result, text + start, end - start);
+ }
+ while (pango_attr_iterator_next (iterator));
+ g_string_append (result, formatter (NULL));
+
+ g_free (text);
+ pango_attr_iterator_destroy (iterator);
+ pango_attr_list_unref (attrs);
+ return g_string_free (result, FALSE);
+}
+
+static gchar *
+field_to_output_text (const StardictEntryField *field, FormatterFunc formatter)
+{
+ const gchar *definition = field->data;
+ if (field->type == STARDICT_FIELD_MEANING)
+ return g_strdup (definition);
+ if (field->type == STARDICT_FIELD_PANGO)
+ return pango_to_output_text (definition, formatter);
+ if (field->type == STARDICT_FIELD_XDXF)
+ {
+ gchar *markup = xdxf_to_pango_markup_with_reduced_effort (definition);
+ gchar *result = pango_to_output_text (markup, formatter);
+ g_free (markup);
+ return result;
+ }
+ return NULL;
+}
+
+// --- Main --------------------------------------------------------------------
+
+static guint
+count_equal_chars (const gchar *a, const gchar *b)
+{
+ guint count = 0;
+ while (*a && *b)
+ if (*a++ == *b++)
+ count++;
+ return count;
+}
+
+static void
+do_dictionary (StardictDict *dict, const gchar *word, FormatterFunc formatter)
+{
+ gboolean found;
+ StardictIterator *iter = stardict_dict_search (dict, word, &found);
+ if (!found)
+ goto out;
+
+ // Default Stardict ordering is ASCII case-insensitive,
+ // which may be further exacerbated by our own collation feature.
+ // Try to find a better matching entry:
+
+ gint64 best_offset = stardict_iterator_get_offset (iter);
+ guint best_score = count_equal_chars
+ (stardict_iterator_get_word (iter), word);
+
+ while (TRUE)
+ {
+ stardict_iterator_next (iter);
+ if (!stardict_iterator_is_valid (iter))
+ break;
+
+ const gchar *iter_word = stardict_iterator_get_word (iter);
+ if (g_ascii_strcasecmp (iter_word, word))
+ break;
+
+ guint score = count_equal_chars (iter_word, word);
+ if (score > best_score)
+ {
+ best_offset = stardict_iterator_get_offset (iter);
+ best_score = score;
+ }
+ }
+
+ stardict_iterator_set_offset (iter, best_offset, FALSE);
+
+ StardictEntry *entry = stardict_iterator_get_entry (iter);
+ StardictInfo *info = stardict_dict_get_info (dict);
+ const GList *list = stardict_entry_get_fields (entry);
+ for (; list; list = list->next)
+ {
+ StardictEntryField *field = list->data;
+ gchar *definitions = field_to_output_text (field, formatter);
+ if (!definitions)
+ continue;
+
+ printf ("%s\t", info->book_name);
+ for (const gchar *p = definitions; *p; p++)
+ {
+ if (*p == '\\')
+ printf ("\\\\");
+ else if (*p == '\n')
+ printf ("\\n");
+ else
+ putchar (*p);
+ }
+ putchar ('\n');
+ g_free (definitions);
+ }
+ g_object_unref (entry);
+out:
+ g_object_unref (iter);
+}
+
+static FormatterFunc
+parse_options (int *argc, char ***argv)
+{
+ GError *error = NULL;
+ GOptionContext *ctx = g_option_context_new
+ ("DICTIONARY.ifo... - query multiple dictionaries");
+
+ gboolean format_with_ansi = FALSE;
+ gboolean format_with_irc = FALSE;
+ GOptionEntry entries[] =
+ {
+ { "ansi", 'a', 0, G_OPTION_ARG_NONE, &format_with_ansi,
+ "Format with ANSI sequences", NULL },
+ { "irc", 'i', 0, G_OPTION_ARG_NONE, &format_with_irc,
+ "Format with IRC codes", NULL },
+ { }
+ };
+
+ g_option_context_add_main_entries (ctx, entries, NULL);
+ if (!g_option_context_parse (ctx, argc, argv, &error))
+ {
+ g_printerr ("Error: option parsing failed: %s\n", error->message);
+ exit (EXIT_FAILURE);
+ }
+ if (*argc < 2)
+ {
+ g_printerr ("%s\n", g_option_context_get_help (ctx, TRUE, NULL));
+ exit (EXIT_FAILURE);
+ }
+ g_option_context_free (ctx);
+
+ if (format_with_ansi)
+ return pango_attrs_to_ansi;
+ if (format_with_irc)
+ return pango_attrs_to_irc;
+
+ return pango_attrs_ignore;
+}
+
+int
+main (int argc, char *argv[])
+{
+G_GNUC_BEGIN_IGNORE_DEPRECATIONS
+ if (glib_check_version (2, 36, 0))
+ g_type_init ();
+G_GNUC_END_IGNORE_DEPRECATIONS
+
+ FormatterFunc formatter = parse_options (&argc, &argv);
+
+ guint n_dicts = argc - 1;
+ StardictDict **dicts = g_alloca (sizeof *dicts * n_dicts);
+
+ guint i;
+ for (i = 1; i <= n_dicts; i++)
+ {
+ GError *error = NULL;
+ dicts[i - 1] = stardict_dict_new (argv[i], &error);
+ if (error)
+ {
+ g_printerr ("Error: opening dictionary `%s' failed: %s\n",
+ argv[i], error->message);
+ exit (EXIT_FAILURE);
+ }
+ }
+
+ gint c;
+ do
+ {
+ GString *s = g_string_new (NULL);
+ while ((c = getchar ()) != EOF && c != '\n')
+ if (c != '\r')
+ g_string_append_c (s, c);
+
+ if (s->len)
+ for (i = 0; i < n_dicts; i++)
+ do_dictionary (dicts[i], s->str, formatter);
+
+ printf ("\n");
+ fflush (NULL);
+ g_string_free (s, TRUE);
+ }
+ while (c != EOF);
+
+ for (i = 0; i < n_dicts; i++)
+ g_object_unref (dicts[i]);
+
+ return 0;
+}
diff --git a/src/tdv-tabfile.c b/src/tdv-tabfile.c
new file mode 100644
index 0000000..fab0ef2
--- /dev/null
+++ b/src/tdv-tabfile.c
@@ -0,0 +1,223 @@
+/*
+ * A clean reimplementation of StarDict's tabfile
+ *
+ * Copyright (c) 2020 - 2021, Přemysl Eric Janouch
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+ * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ *
+ */
+
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+
+#include
+
+#include "config.h"
+#include "stardict.h"
+#include "stardict-private.h"
+#include "generator.h"
+#include "utils.h"
+
+
+static gboolean
+set_data_error (GError **error, const gchar *message)
+{
+ g_set_error_literal (error, G_IO_ERROR, G_IO_ERROR_INVALID_DATA, message);
+ return FALSE;
+}
+
+static const gchar escapes[256] = { ['n'] = '\n', ['t'] = '\t', ['\\'] = '\\' };
+
+static gboolean
+inplace_unescape (gchar *line, GError **error)
+{
+ gboolean escape = FALSE;
+ gchar *dest = line;
+ for (gchar *src = line; *src; src++)
+ {
+ if (escape)
+ {
+ escape = FALSE;
+ if (!(*dest++ = escapes[(guchar) *src]))
+ return set_data_error (error, "unsupported escape");
+ }
+ else if (*src == '\\')
+ escape = TRUE;
+ else
+ *dest++ = *src;
+ }
+ if (escape)
+ return set_data_error (error, "trailing escape character");
+
+ *dest = 0;
+ return TRUE;
+}
+
+static gboolean
+import_line (Generator *generator, gchar *line, gsize len, GError **error)
+{
+ if (!len)
+ return TRUE;
+ if (!g_utf8_validate_len (line, len, NULL))
+ return set_data_error (error, "not valid UTF-8");
+
+ gchar *separator = strchr (line, '\t');
+ if (!separator)
+ return set_data_error (error, "keyword separator not found");
+
+ *separator++ = 0;
+ if (strchr (line, '\\'))
+ // The index wouldn't be sorted correctly with our method
+ return set_data_error (error, "escapes not allowed in keywords");
+
+ gchar *newline = strpbrk (separator, "\r\n");
+ if (newline)
+ *newline = 0;
+
+ if (!inplace_unescape (line, error)
+ || !inplace_unescape (separator, error))
+ return FALSE;
+
+ if (generator->info->same_type_sequence
+ && *generator->info->same_type_sequence == STARDICT_FIELD_PANGO
+ && !pango_parse_markup (separator, -1, 0, NULL, NULL, NULL, error))
+ return FALSE;
+
+ generator_begin_entry (generator);
+ return generator_write_string (generator, separator, TRUE, error)
+ && generator_finish_entry (generator, line, error);
+}
+
+static gboolean
+transform (FILE *fsorted, Generator *generator, GError **error)
+{
+ gchar *line = NULL;
+ gsize size = 0, ln = 1;
+ for (ssize_t read; (read = getline (&line, &size, fsorted)) >= 0; ln++)
+ if (!import_line (generator, line, read, error))
+ break;
+
+ free (line);
+ if (ferror (fsorted))
+ {
+ g_set_error_literal (error, G_IO_ERROR,
+ g_io_error_from_errno (errno), g_strerror (errno));
+ return FALSE;
+ }
+ if (!feof (fsorted))
+ {
+ // You'll only get good line number output with presorted input!
+ g_prefix_error (error, "line %zu: ", ln);
+ return FALSE;
+ }
+ return TRUE;
+}
+
+static void
+validate_collation_locale (const gchar *locale)
+{
+ UErrorCode error = U_ZERO_ERROR;
+ UCollator *collator = ucol_open (locale, &error);
+ if (!collator)
+ fatal ("failed to create a collator for %s: %s\n",
+ locale, u_errorName (error));
+ ucol_close (collator);
+}
+
+int
+main (int argc, char *argv[])
+{
+ // The GLib help includes an ellipsis character, for some reason
+ (void) setlocale (LC_ALL, "");
+
+ GError *error = NULL;
+ GOptionContext *ctx = g_option_context_new ("output-basename < input");
+ g_option_context_set_summary (ctx,
+ "Create a StarDict dictionary from plaintext.");
+
+ gboolean pango_markup = FALSE;
+ StardictInfo template = {};
+ GOptionEntry entries[] =
+ {
+ { "pango", 'p', 0, G_OPTION_ARG_NONE, &pango_markup,
+ "Entries use Pango markup", NULL },
+
+ { "book-name", 'b', 0, G_OPTION_ARG_STRING, &template.book_name,
+ "Set the book name field", "TEXT" },
+ { "author", 'a', 0, G_OPTION_ARG_STRING, &template.author,
+ "Set the author field ", "NAME" },
+ { "e-mail", 'e', 0, G_OPTION_ARG_STRING, &template.email,
+ "Set the e-mail field", "ADDRESS" },
+ { "website", 'w', 0, G_OPTION_ARG_STRING, &template.website,
+ "Set the website field", "LINK" },
+ { "description", 'd', 0, G_OPTION_ARG_STRING, &template.description,
+ "Set the description field (newlines supported)", "TEXT" },
+ { "date", 'D', 0, G_OPTION_ARG_STRING, &template.date,
+ "Set the date field", "DATE" },
+ { "collation", 'c', 0, G_OPTION_ARG_STRING, &template.collation,
+ "Set the collation field (for ICU)", "LOCALE" },
+ { }
+ };
+
+ g_option_context_add_main_entries (ctx, entries, GETTEXT_PACKAGE);
+ if (!g_option_context_parse (ctx, &argc, &argv, &error))
+ fatal ("Error: option parsing failed: %s\n", error->message);
+ if (argc != 2)
+ fatal ("%s", g_option_context_get_help (ctx, TRUE, NULL));
+ g_option_context_free (ctx);
+
+ template.version = SD_VERSION_3_0_0;
+ template.same_type_sequence = pango_markup
+ ? (char[]) { STARDICT_FIELD_PANGO, 0 }
+ : (char[]) { STARDICT_FIELD_MEANING, 0 };
+
+ if (!template.book_name)
+ template.book_name = argv[1];
+ if (template.description)
+ {
+ gchar **lines = g_strsplit (template.description, "\n", -1);
+ g_free (template.description);
+ gchar *in_one_line = g_strjoinv ("<br>", lines);
+ g_strfreev (lines);
+ template.description = in_one_line;
+ }
+ if (template.collation)
+ validate_collation_locale (template.collation);
+
+ // This actually implements stardict_strcmp(), POSIX-compatibly.
+ // Your sort(1) is not expected to be stable by default, like bsdsort is.
+ FILE *fsorted = popen ("LC_ALL=C sort -t'\t' -k1f,1", "r");
+ if (!fsorted)
+ fatal ("%s: %s\n", "popen", g_strerror (errno));
+
+ Generator *generator = generator_new (argv[1], &error);
+ if (!generator)
+ fatal ("Error: failed to create the output dictionary: %s\n",
+ error->message);
+
+ StardictInfo *info = generator->info;
+ stardict_info_copy (info, &template);
+ if (!transform (fsorted, generator, &error)
+ || !generator_finish (generator, &error))
+ fatal ("Error: failed to write the dictionary: %s\n", error->message);
+
+ generator_free (generator);
+ pclose (fsorted);
+ return 0;
+}
diff --git a/src/tdv-transform.c b/src/tdv-transform.c
new file mode 100644
index 0000000..7520eb8
--- /dev/null
+++ b/src/tdv-transform.c
@@ -0,0 +1,226 @@
+/*
+ * A tool to transform dictionaries by an external filter
+ *
+ * The external filter needs to process NUL-separated textual entries.
+ *
+ * Example: tdv-transform input.ifo output -- perl -p0e s/bullshit/soykaf/g
+ *
+ * Copyright (c) 2020, Přemysl Eric Janouch
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+ * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ *
+ */
+
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+
+#include "stardict.h"
+#include "stardict-private.h"
+#include "generator.h"
+#include "utils.h"
+
+enum { PIPE_READ, PIPE_WRITE };
+
+
+// --- Main --------------------------------------------------------------------
+
+static inline void
+print_progress (gulong *last_percent, StardictIterator *iterator, gsize total)
+{
+ gulong percent =
+ (gulong) stardict_iterator_get_offset (iterator) * 100 / total;
+ if (percent != *last_percent)
+ {
+ printf ("\r Writing entries... %3lu%%", percent);
+ *last_percent = percent;
+ }
+}
+
+static gboolean
+write_to_filter (StardictDict *dict, gint fd, GError **error)
+{
+ StardictInfo *info = stardict_dict_get_info (dict);
+ gsize n_words = stardict_info_get_word_count (info);
+
+ StardictIterator *iterator = stardict_iterator_new (dict, 0);
+ gulong last_percent = -1;
+ while (stardict_iterator_is_valid (iterator))
+ {
+ print_progress (&last_percent, iterator, n_words);
+
+ StardictEntry *entry = stardict_iterator_get_entry (iterator);
+ for (const GList *fields = stardict_entry_get_fields (entry);
+ fields; fields = fields->next)
+ {
+ StardictEntryField *field = fields->data;
+ if (!g_ascii_islower (field->type))
+ continue;
+
+ if (write (fd, field->data, field->data_size)
+ != (ssize_t) field->data_size)
+ {
+ g_set_error (error, G_IO_ERROR, g_io_error_from_errno (errno),
+ "%s", g_strerror (errno));
+ return FALSE;
+ }
+ }
+
+ g_object_unref (entry);
+ stardict_iterator_next (iterator);
+ }
+ printf ("\n");
+ return TRUE;
+}
+
+static gboolean
+update_from_filter (StardictDict *dict, Generator *generator,
+ GMappedFile *filtered_file, GError **error)
+{
+ gchar *filtered = g_mapped_file_get_contents (filtered_file);
+ gchar *filtered_end = filtered + g_mapped_file_get_length (filtered_file);
+
+ StardictInfo *info = stardict_dict_get_info (dict);
+ gsize n_words = stardict_info_get_word_count (info);
+
+ StardictIterator *iterator = stardict_iterator_new (dict, 0);
+ gulong last_percent = -1;
+ while (stardict_iterator_is_valid (iterator))
+ {
+ print_progress (&last_percent, iterator, n_words);
+
+ StardictEntry *entry = stardict_iterator_get_entry (iterator);
+ generator_begin_entry (generator);
+
+ for (GList *fields = entry->fields; fields; fields = fields->next)
+ {
+ StardictEntryField *field = fields->data;
+ if (!g_ascii_islower (field->type))
+ continue;
+
+ gchar *end = memchr (filtered, 0, filtered_end - filtered);
+ if (!end)
+ {
+ g_set_error (error, G_IO_ERROR, G_IO_ERROR_PARTIAL_INPUT,
+ "filter seems to have ended too early");
+ return FALSE;
+ }
+
+ g_free (field->data);
+ field->data = g_strdup (filtered);
+ field->data_size = end - filtered + 1;
+ filtered = end + 1;
+ }
+
+ if (!generator_write_fields (generator, entry->fields, error)
+ || !generator_finish_entry (generator,
+ stardict_iterator_get_word (iterator), error))
+ return FALSE;
+
+ g_object_unref (entry);
+ stardict_iterator_next (iterator);
+ }
+ printf ("\n");
+ return TRUE;
+}
+
+int
+main (int argc, char *argv[])
+{
+ // The GLib help includes an ellipsis character, for some reason
+ (void) setlocale (LC_ALL, "");
+
+ GError *error = NULL;
+ GOptionContext *ctx = g_option_context_new
+ ("input.ifo output-basename -- FILTER [ARG...]");
+ g_option_context_set_summary
+ (ctx, "Transform dictionaries using a filter program.");
+ if (!g_option_context_parse (ctx, &argc, &argv, &error))
+ fatal ("Error: option parsing failed: %s\n", error->message);
+
+ if (argc < 3)
+ fatal ("%s", g_option_context_get_help (ctx, TRUE, NULL));
+
+ // GLib is bullshit, getopt_long() always correctly removes this
+ gint program_argv_start = 3;
+ if (!strcmp (argv[program_argv_start], "--"))
+ program_argv_start++;
+
+ g_option_context_free (ctx);
+
+ printf ("Loading the original dictionary...\n");
+ StardictDict *dict = stardict_dict_new (argv[1], &error);
+ if (!dict)
+ fatal ("Error: opening the dictionary failed: %s\n", error->message);
+
+ printf ("Filtering entries...\n");
+ gint child_in[2];
+ if (!g_unix_open_pipe (child_in, 0, &error))
+ fatal ("g_unix_open_pipe: %s\n", error->message);
+
+ FILE *child_out = tmpfile ();
+ if (!child_out)
+ fatal ("tmpfile: %s\n", g_strerror (errno));
+
+ GPid pid = -1;
+ if (!g_spawn_async_with_fds (NULL /* working_directory */,
+ argv + program_argv_start /* forward a part of ours */, NULL /* envp */,
+ G_SPAWN_SEARCH_PATH | G_SPAWN_DO_NOT_REAP_CHILD,
+ NULL /* child_setup */, NULL /* user_data */,
+ &pid, child_in[PIPE_READ], fileno (child_out), STDERR_FILENO, &error))
+ fatal ("g_spawn: %s\n", error->message);
+ if (!write_to_filter (dict, child_in[PIPE_WRITE], &error))
+ fatal ("write_to_filter: %s\n", error->message);
+ if (!g_close (child_in[PIPE_READ], &error)
+ || !g_close (child_in[PIPE_WRITE], &error))
+ fatal ("g_close: %s\n", error->message);
+
+ printf ("Waiting for the filter to finish...\n");
+ int wstatus = errno = 0;
+ if (waitpid (pid, &wstatus, 0) < 1
+ || !WIFEXITED (wstatus) || WEXITSTATUS (wstatus) > 0)
+ fatal ("Filter failed (%s, status %d)\n", g_strerror (errno), wstatus);
+
+ GMappedFile *filtered = g_mapped_file_new_from_fd (fileno (child_out),
+ FALSE /* writable */, &error);
+ if (!filtered)
+ fatal ("g_mapped_file_new_from_fd: %s\n", error->message);
+
+ printf ("Writing the new dictionary...\n");
+ Generator *generator = generator_new (argv[2], &error);
+ if (!generator)
+ fatal ("Error: failed to create the output dictionary: %s\n",
+ error->message);
+
+ StardictInfo *info = generator->info;
+ stardict_info_copy (info, stardict_dict_get_info (dict));
+
+ // This gets incremented each time an entry is finished
+ info->word_count = 0;
+
+ if (!update_from_filter (dict, generator, filtered, &error)
+ || !generator_finish (generator, &error))
+ fatal ("Error: failed to write the dictionary: %s\n", error->message);
+
+ g_mapped_file_unref (filtered);
+ fclose (child_out);
+ generator_free (generator);
+ g_object_unref (dict);
+ return 0;
+}
diff --git a/src/transform.c b/src/transform.c
deleted file mode 100644
index ba33dee..0000000
--- a/src/transform.c
+++ /dev/null
@@ -1,226 +0,0 @@
-/*
- * A tool to transform dictionaries dictionaries by an external filter
- *
- * The external filter needs to process NUL-separated textual entries.
- *
- * Example: transform input.ifo output -- perl -p0e s/bullshit/soykaf/g
- *
- * Copyright (c) 2020, Přemysl Eric Janouch
- *
- * Permission to use, copy, modify, and/or distribute this software for any
- * purpose with or without fee is hereby granted.
- *
- * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
- * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
- * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
- * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
- * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
- *
- */
-
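-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <locale.h>
-#include <errno.h>
-
-#include <glib.h>
-#include <gio/gio.h>
-#include <glib/gstdio.h>
-#include <glib-unix.h>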
-
-#include "stardict.h"
-#include "stardict-private.h"
-#include "generator.h"
-#include "utils.h"
-
-enum { PIPE_READ, PIPE_WRITE };
-
-
-// --- Main --------------------------------------------------------------------
-
-static inline void
-print_progress (gulong *last_percent, StardictIterator *iterator, gsize total)
-{
- gulong percent =
- (gulong) stardict_iterator_get_offset (iterator) * 100 / total;
- if (percent != *last_percent)
- {
- printf ("\r Writing entries... %3lu%%", percent);
- *last_percent = percent;
- }
-}
-
-static gboolean
-write_to_filter (StardictDict *dict, gint fd, GError **error)
-{
- StardictInfo *info = stardict_dict_get_info (dict);
- gsize n_words = stardict_info_get_word_count (info);
-
- StardictIterator *iterator = stardict_iterator_new (dict, 0);
- gulong last_percent = -1;
- while (stardict_iterator_is_valid (iterator))
- {
- print_progress (&last_percent, iterator, n_words);
-
- StardictEntry *entry = stardict_iterator_get_entry (iterator);
- for (const GList *fields = stardict_entry_get_fields (entry);
- fields; fields = fields->next)
- {
- StardictEntryField *field = fields->data;
- if (!g_ascii_islower (field->type))
- continue;
-
- if (write (fd, field->data, field->data_size)
- != (ssize_t) field->data_size)
- {
- g_set_error (error, G_IO_ERROR, g_io_error_from_errno (errno),
- "%s", g_strerror (errno));
- return FALSE;
- }
- }
-
- g_object_unref (entry);
- stardict_iterator_next (iterator);
- }
- printf ("\n");
- return TRUE;
-}
-
-static gboolean
-update_from_filter (StardictDict *dict, Generator *generator,
- GMappedFile *filtered_file, GError **error)
-{
- gchar *filtered = g_mapped_file_get_contents (filtered_file);
- gchar *filtered_end = filtered + g_mapped_file_get_length (filtered_file);
-
- StardictInfo *info = stardict_dict_get_info (dict);
- gsize n_words = stardict_info_get_word_count (info);
-
- StardictIterator *iterator = stardict_iterator_new (dict, 0);
- gulong last_percent = -1;
- while (stardict_iterator_is_valid (iterator))
- {
- print_progress (&last_percent, iterator, n_words);
-
- StardictEntry *entry = stardict_iterator_get_entry (iterator);
- generator_begin_entry (generator);
-
- for (GList *fields = entry->fields; fields; fields = fields->next)
- {
- StardictEntryField *field = fields->data;
- if (!g_ascii_islower (field->type))
- continue;
-
- gchar *end = memchr (filtered, 0, filtered_end - filtered);
- if (!end)
- {
- g_set_error (error, G_IO_ERROR, G_IO_ERROR_PARTIAL_INPUT,
- "filter seems to have ended too early");
- return FALSE;
- }
-
- g_free (field->data);
- field->data = g_strdup (filtered);
- field->data_size = end - filtered + 1;
- filtered = end + 1;
- }
-
- if (!generator_write_fields (generator, entry->fields, error)
- || !generator_finish_entry (generator,
- stardict_iterator_get_word (iterator), error))
- return FALSE;
-
- g_object_unref (entry);
- stardict_iterator_next (iterator);
- }
- printf ("\n");
- return TRUE;
-}
-
-int
-main (int argc, char *argv[])
-{
- // The GLib help includes an ellipsis character, for some reason
- (void) setlocale (LC_ALL, "");
-
- GError *error = NULL;
- GOptionContext *ctx = g_option_context_new
- ("input.ifo output-basename -- FILTER [ARG...]");
- g_option_context_set_summary
- (ctx, "Transform dictionaries using a filter program.");
- if (!g_option_context_parse (ctx, &argc, &argv, &error))
- fatal ("Error: option parsing failed: %s\n", error->message);
-
- if (argc < 3)
- fatal ("%s", g_option_context_get_help (ctx, TRUE, NULL));
-
- // GLib is bullshit, getopt_long() always correctly removes this
- gint program_argv_start = 3;
- if (!strcmp (argv[program_argv_start], "--"))
- program_argv_start++;
-
- g_option_context_free (ctx);
-
- printf ("Loading the original dictionary...\n");
- StardictDict *dict = stardict_dict_new (argv[1], &error);
- if (!dict)
- fatal ("Error: opening the dictionary failed: %s\n", error->message);
-
- printf ("Filtering entries...\n");
- gint child_in[2];
- if (!g_unix_open_pipe (child_in, 0, &error))
- fatal ("g_unix_open_pipe: %s\n", error->message);
-
- FILE *child_out = tmpfile ();
- if (!child_out)
- fatal ("tmpfile: %s\n", g_strerror (errno));
-
- GPid pid = -1;
- if (!g_spawn_async_with_fds (NULL /* working_directory */,
- argv + program_argv_start /* forward a part of ours */, NULL /* envp */,
- G_SPAWN_SEARCH_PATH | G_SPAWN_DO_NOT_REAP_CHILD,
- NULL /* child_setup */, NULL /* user_data */,
- &pid, child_in[PIPE_READ], fileno (child_out), STDERR_FILENO, &error))
- fatal ("g_spawn: %s\n", error->message);
- if (!write_to_filter (dict, child_in[PIPE_WRITE], &error))
- fatal ("write_to_filter: %s\n", error->message);
- if (!g_close (child_in[PIPE_READ], &error)
- || !g_close (child_in[PIPE_WRITE], &error))
- fatal ("g_close: %s\n", error->message);
-
- printf ("Waiting for the filter to finish...\n");
- int wstatus = errno = 0;
- if (waitpid (pid, &wstatus, 0) < 1
- || !WIFEXITED (wstatus) || WEXITSTATUS (wstatus) > 0)
- fatal ("Filter failed (%s, status %d)\n", g_strerror (errno), wstatus);
-
- GMappedFile *filtered = g_mapped_file_new_from_fd (fileno (child_out),
- FALSE /* writable */, &error);
- if (!filtered)
- fatal ("g_mapped_file_new_from_fd: %s\n", error->message);
-
- printf ("Writing the new dictionary...\n");
- Generator *generator = generator_new (argv[2], &error);
- if (!generator)
- fatal ("Error: failed to create the output dictionary: %s\n",
- error->message);
-
- StardictInfo *info = generator->info;
- stardict_info_copy (info, stardict_dict_get_info (dict));
-
- // This gets incremented each time an entry is finished
- info->word_count = 0;
-
- if (!update_from_filter (dict, generator, filtered, &error)
- || !generator_finish (generator, &error))
- fatal ("Error: failed to write the dictionary: %s\n", error->message);
-
- g_mapped_file_unref (filtered);
- fclose (child_out);
- generator_free (generator);
- g_object_unref (dict);
- return 0;
-}
--
cgit v1.2.3-70-g09d2