From c77d994dc44a9ef8f87dd36661201f499877fc34 Mon Sep 17 00:00:00 2001 From: Přemysl Eric Janouch Date: Sun, 11 Jun 2023 17:45:38 +0200 Subject: Rename tools, make them installable --- CMakeLists.txt | 10 +- README.adoc | 8 +- src/add-pronunciation.c | 469 -------------------------------------------- src/query-tool.c | 313 ----------------------------- src/tabfile.c | 223 --------------------- src/tdv-add-pronunciation.c | 469 ++++++++++++++++++++++++++++++++++++++++++++ src/tdv-query-tool.c | 313 +++++++++++++++++++++++++++++ src/tdv-tabfile.c | 223 +++++++++++++++++++++ src/tdv-transform.c | 226 +++++++++++++++++++++ src/transform.c | 226 --------------------- 10 files changed, 1243 insertions(+), 1237 deletions(-) delete mode 100644 src/add-pronunciation.c delete mode 100644 src/query-tool.c delete mode 100644 src/tabfile.c create mode 100644 src/tdv-add-pronunciation.c create mode 100644 src/tdv-query-tool.c create mode 100644 src/tdv-tabfile.c create mode 100644 src/tdv-transform.c delete mode 100644 src/transform.c diff --git a/CMakeLists.txt b/CMakeLists.txt index 9d4c494..f995dd4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -285,14 +285,17 @@ endif () target_link_libraries (${PROJECT_NAME} ${project_libraries}) # Tools -set (tools tabfile add-pronunciation query-tool transform) +set (tools tdv-tabfile tdv-add-pronunciation tdv-query-tool tdv-transform) foreach (tool ${tools}) add_executable (${tool} EXCLUDE_FROM_ALL src/${tool}.c ${project_common_sources}) target_link_libraries (${tool} ${project_common_libraries}) endforeach () -add_custom_target (tools DEPENDS ${tools}) +option (WITH_TOOLS "Build and install some StarDict tools" ${UNIX}) +if (WITH_TOOLS) + add_custom_target (tools ALL DEPENDS ${tools}) +endif () # Example dictionaries file (GLOB dicts_scripts "${PROJECT_SOURCE_DIR}/dicts/*.*") @@ -315,6 +318,9 @@ if (NOT WIN32) install (TARGETS ${PROJECT_NAME} DESTINATION ${CMAKE_INSTALL_BINDIR}) install (FILES LICENSE DESTINATION ${CMAKE_INSTALL_DOCDIR}) + if (WITH_TOOLS) + install (TARGETS ${tools} DESTINATION ${CMAKE_INSTALL_BINDIR}) + endif () if (WITH_GUI) install (FILES ${PROJECT_NAME}.svg DESTINATION ${CMAKE_INSTALL_DATADIR}/icons/hicolor/scalable/apps) diff --git a/README.adoc b/README.adoc index 0d9ca4e..ab2b4be 100644 --- a/README.adoc +++ b/README.adoc @@ -81,10 +81,10 @@ The `make dicts` command will build some examples from freely available sources: - Czech foreign words (the site's export is broken as of 2022/08, no response) - Czech WordNet 1.9 PDT (synonyms, hypernyms, hyponyms) -You can use the included 'transform' tool to convert already existing StarDict -dictionaries that are nearly good as they are. Remember that you can change -the `sametypesequence` of the resulting '.ifo' file to another format, or run -'dictzip' on '.dict' files to make them compact. +You can use the included 'tdv-transform' tool to convert already existing +StarDict dictionaries that are nearly good as they are. Remember that you can +change the `sametypesequence` of the resulting '.ifo' file to another format, +or run 'dictzip' on '.dict' files to make them compact. https://mega.co.nz/#!axtD0QRK!sbtBgizksyfkPqKvKEgr8GQ11rsWhtqyRgUUV0B7pwg[CZ <--> EN/DE/PL/RU dictionaries] diff --git a/src/add-pronunciation.c b/src/add-pronunciation.c deleted file mode 100644 index 90d9673..0000000 --- a/src/add-pronunciation.c +++ /dev/null @@ -1,469 +0,0 @@ -/* - * A tool to add eSpeak-generated pronunciation to dictionaries - * - * Here I use the `espeak' process rather than libespeak because of the GPL. - * It's far from ideal, rather good as a starting point. - * - * Copyright (c) 2013, Přemysl Eric Janouch - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY - * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION - * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN - * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - * - */ - -#include -#include -#include -#include - -#include -#include - -#include "stardict.h" -#include "stardict-private.h" -#include "generator.h" -#include "utils.h" - - -// --- Pronunciation generator ------------------------------------------------- - -typedef struct worker_data WorkerData; - -struct worker_data -{ - gchar **cmdline; ///< eSpeak command line - guint ignore_acronyms : 1; ///< Don't spell out acronyms - GRegex *re_stop; ///< Regex for stop sequences - GRegex *re_acronym; ///< Regex for ACRONYMS - - guint32 start_entry; ///< The first entry to be processed - guint32 end_entry; ///< Past the last entry to be processed - - // Reader, writer - GMutex *dict_mutex; ///< Locks the dictionary object - - // Reader - GThread *main_thread; ///< A handle to the reader thread - StardictDict *dict; ///< The dictionary object - gpointer output; ///< Linked-list of pronunciation data - - GMutex *remaining_mutex; ///< Locks the progress stats - GCond *remaining_cond; ///< Signals a change in progress - guint32 remaining; ///< How many entries remain - guint32 total; ///< Total number of entries - - // Writer - StardictIterator *iterator; ///< Iterates over the dictionary - FILE *child_stdin; ///< Standard input of eSpeak -}; - -/// eSpeak splits the output on certain characters. -#define LINE_SPLITTING_CHARS ".,:;?!" - -/// We don't want to include brackets either. -#define OTHER_STOP_CHARS "([{<" - -/// A void word used to make a unique "no pronunciation available" mark. -#define VOID_ENTRY "not present in any dictionary" - - -/// Adds dots between characters. -static gboolean -writer_acronym_cb (const GMatchInfo *info, GString *res, - G_GNUC_UNUSED gpointer data) -{ - gchar *preceding = g_match_info_fetch (info, 1); - g_string_append (res, preceding); - g_free (preceding); - - gchar *word = g_match_info_fetch (info, 2); - - g_string_append_c (res, *word); - const gchar *p; - for (p = word + 1; *p; p++) - { - g_string_append_c (res, '.'); - g_string_append_c (res, *p); - } - - g_free (word); - return FALSE; -} - -/// Writes to espeak's stdin. -static gpointer -worker_writer (WorkerData *data) -{ - GError *error = NULL; - GMatchInfo *match_info; - while (stardict_iterator_get_offset (data->iterator) != data->end_entry) - { - g_mutex_lock (data->dict_mutex); - const gchar *word = stardict_iterator_get_word (data->iterator); - g_mutex_unlock (data->dict_mutex); - - word += strspn (word, LINE_SPLITTING_CHARS " \t"); - gchar *x = g_strdup (word); - - // Cut the word if needed be - error = NULL; - if (g_regex_match_full (data->re_stop, - x, -1, 0, 0, &match_info, &error)) - { - gint start_pos; - g_match_info_fetch_pos (match_info, 0, &start_pos, NULL); - x[start_pos] = 0; - } - g_match_info_free (match_info); - - // Change acronyms so that they're not pronounced as words - if (!error && !data->ignore_acronyms) - { - char *tmp = g_regex_replace_eval (data->re_acronym, - x, -1, 0, 0, writer_acronym_cb, NULL, &error); - g_free (x); - x = tmp; - } - - if (error) - { - g_printerr ("Notice: error processing '%s': %s\n", - word, error->message); - g_clear_error (&error); - *x = 0; - } - - // We might have accidentally cut off everything - if (!*x) - { - g_free (x); - x = g_strdup (VOID_ENTRY); - } - - stardict_iterator_next (data->iterator); - if (fprintf (data->child_stdin, "%s\n", x) < 0) - fatal ("write to eSpeak failed: %s\n", g_strerror (errno)); - - g_free (x); - } - - g_object_unref (data->iterator); - return GINT_TO_POINTER (fclose (data->child_stdin)); -} - -/// Get the void entry (and test if espeak works). -static gchar * -get_void_entry (gchar *cmdline[]) -{ - gchar *output; - gint exit_status; - - GError *error = NULL; - if (!g_spawn_sync (NULL, cmdline, NULL, - G_SPAWN_SEARCH_PATH | G_SPAWN_STDERR_TO_DEV_NULL, NULL, NULL, - &output, NULL, &exit_status, &error)) - fatal ("Error: couldn't spawn espeak: %s\n", error->message); - - if (exit_status) - fatal ("Error: espeak returned %d\n", exit_status); - - return output; -} - -/// Reads from espeak's stdout. -static gpointer -worker (WorkerData *data) -{ - // Spawn eSpeak - GError *error = NULL; - gint child_in, child_out; - if (!g_spawn_async_with_pipes (NULL, data->cmdline, NULL, - G_SPAWN_SEARCH_PATH, NULL, NULL, - NULL, &child_in, &child_out, NULL, &error)) - fatal ("g_spawn: %s\n", error->message); - - data->child_stdin = fdopen (child_in, "wb"); - if (!data->child_stdin) - perror ("fdopen"); - - FILE *child_stdout = fdopen (child_out, "rb"); - if (!child_stdout) - perror ("fdopen"); - - // Spawn a writer thread - g_mutex_lock (data->dict_mutex); - data->iterator = stardict_iterator_new (data->dict, data->start_entry); - g_mutex_unlock (data->dict_mutex); - - GThread *writer = g_thread_new ("write worker", - (GThreadFunc) worker_writer, data); - - // Read the output - g_mutex_lock (data->remaining_mutex); - guint32 remaining = data->remaining; - g_mutex_unlock (data->remaining_mutex); - - data->output = NULL; - gpointer *output_end = &data->output; - while (remaining) - { - static gchar next[sizeof (gpointer)]; - GString *s = g_string_new (NULL); - g_string_append_len (s, next, sizeof next); - - gint c; - while ((c = fgetc (child_stdout)) != EOF && c != '\n') - g_string_append_c (s, c); - if (c == EOF) - fatal ("eSpeak process died too soon\n"); - - gchar *translation = g_string_free (s, FALSE); - *output_end = translation; - output_end = (gpointer *) translation; - - // We limit progress reporting so that - // the mutex doesn't spin like crazy - if ((--remaining & 255) != 0) - continue; - - g_mutex_lock (data->remaining_mutex); - data->remaining = remaining; - g_cond_broadcast (data->remaining_cond); - g_mutex_unlock (data->remaining_mutex); - } - - if (fgetc (child_stdout) != EOF) - fatal ("Error: eSpeak has written more lines than it should. " - "The output would be corrupt, aborting.\n"); - - fclose (child_stdout); - return g_thread_join (writer); -} - -// --- Main -------------------------------------------------------------------- - -int -main (int argc, char *argv[]) -{ - gint n_processes = 1; - gchar *voice = NULL; - gboolean ignore_acronyms = FALSE; - - GOptionEntry entries[] = - { - { "processes", 'N', G_OPTION_FLAG_IN_MAIN, - G_OPTION_ARG_INT, &n_processes, - "The number of espeak processes run in parallel", "PROCESSES" }, - { "voice", 'v', G_OPTION_FLAG_IN_MAIN, - G_OPTION_ARG_STRING, &voice, - "The voice to be used by eSpeak to pronounce the words", "VOICE" }, - { "ignore-acronyms", 0, G_OPTION_FLAG_IN_MAIN, - G_OPTION_ARG_NONE, &ignore_acronyms, - "Don't spell out words composed of big letters only", NULL }, - { NULL } - }; - -G_GNUC_BEGIN_IGNORE_DEPRECATIONS - if (glib_check_version (2, 36, 0)) - g_type_init (); -G_GNUC_END_IGNORE_DEPRECATIONS - - GError *error = NULL; - GOptionContext *ctx = g_option_context_new - ("input.ifo output-basename - add pronunciation to dictionaries"); - g_option_context_add_main_entries (ctx, entries, NULL); - if (!g_option_context_parse (ctx, &argc, &argv, &error)) - fatal ("Error: option parsing failed: %s\n", error->message); - - if (argc != 3) - fatal ("%s", g_option_context_get_help (ctx, TRUE, NULL)); - - g_option_context_free (ctx); - - // See if we can run espeak - static gchar *cmdline[] = { "espeak", "--ipa", "-q", NULL, NULL, NULL }; - - if (voice) - { - cmdline[3] = "-v"; - cmdline[4] = voice; - } - - gchar *void_entry = g_strstrip (get_void_entry (cmdline)); - - // Load the dictionary - printf ("Loading the original dictionary...\n"); - StardictDict *dict = stardict_dict_new (argv[1], &error); - if (!dict) - fatal ("Error: opening the dictionary failed: %s\n", error->message); - - gsize n_words = stardict_info_get_word_count - (stardict_dict_get_info (dict)); - - if (n_processes <= 0) - fatal ("Error: there must be at least one process\n"); - - if ((gsize) n_processes > n_words * 1024) - { - n_processes = n_words / 1024; - if (!n_processes) - n_processes = 1; - g_printerr ("Warning: too many processes, reducing to %d\n", - n_processes); - } - - // Spawn worker threads to generate pronunciation data - static GMutex dict_mutex; - - static GMutex remaining_mutex; - static GCond remaining_cond; - - WorkerData *data = g_alloca (sizeof *data * n_processes); - - GRegex *re_stop = g_regex_new ("[" LINE_SPLITTING_CHARS "][ ?]" - "|\\.\\.\\.|[" OTHER_STOP_CHARS "]", G_REGEX_OPTIMIZE, 0, &error); - g_assert (re_stop != NULL); - - GRegex *re_acronym = g_regex_new ("(^|\\pZ)(\\p{Lu}+)(?=\\pZ|$)", - G_REGEX_OPTIMIZE, 0, &error); - g_assert (re_acronym != NULL); - - gint i; - for (i = 0; i < n_processes; i++) - { - data[i].start_entry = n_words * i / n_processes; - data[i].end_entry = n_words * (i + 1) / n_processes; - - data[i].total = data[i].remaining = - data[i].end_entry - data[i].start_entry; - data[i].remaining_mutex = &remaining_mutex; - data[i].remaining_cond = &remaining_cond; - - data[i].dict = dict; - data[i].dict_mutex = &dict_mutex; - - data[i].re_stop = re_stop; - data[i].re_acronym = re_acronym; - - data[i].cmdline = cmdline; - data[i].ignore_acronyms = ignore_acronyms; - data[i].main_thread = - g_thread_new ("worker", (GThreadFunc) worker, &data[i]); - } - - // Loop while the threads still have some work to do and report status - g_mutex_lock (&remaining_mutex); - for (;;) - { - gboolean all_finished = TRUE; - printf ("\rRetrieving pronunciation... "); - for (i = 0; i < n_processes; i++) - { - printf ("%3u%% ", 100 - data[i].remaining * 100 / data[i].total); - if (data[i].remaining) - all_finished = FALSE; - } - - if (all_finished) - break; - g_cond_wait (&remaining_cond, &remaining_mutex); - } - g_mutex_unlock (&remaining_mutex); - - putchar ('\n'); - for (i = 0; i < n_processes; i++) - g_thread_join (data[i].main_thread); - - g_regex_unref (re_stop); - g_regex_unref (re_acronym); - - // Put extended entries into a new dictionary - Generator *generator = generator_new (argv[2], &error); - if (!generator) - fatal ("Error: failed to create the output dictionary: %s\n", - error->message); - - StardictInfo *info = generator->info; - stardict_info_copy (info, stardict_dict_get_info (dict)); - - // This gets incremented each time an entry is finished - info->word_count = 0; - - if (info->same_type_sequence) - { - gchar *new_sts = g_strconcat ("t", info->same_type_sequence, NULL); - g_free (info->same_type_sequence); - info->same_type_sequence = new_sts; - } - - // Write out all the entries together with the pronunciation - for (i = 0; i < n_processes; i++) - { - StardictIterator *iterator = - stardict_iterator_new (dict, data[i].start_entry); - - gpointer *output = data[i].output; - while (stardict_iterator_get_offset (iterator) != data[i].end_entry) - { - printf ("\rCreating a new dictionary... %3lu%%", - (gulong) stardict_iterator_get_offset (iterator) * 100 - / stardict_dict_get_info (dict)->word_count); - - g_assert (output != NULL); - - gchar *pronunciation = g_strstrip ((gchar *) (output + 1)); - StardictEntry *entry = stardict_iterator_get_entry (iterator); - - generator_begin_entry (generator); - - if (!strcmp (pronunciation, void_entry)) - *pronunciation = 0; - -// g_printerr ("%s /%s/\n", -// stardict_iterator_get_word (iterator), pronunciation); - - // For the sake of simplicity we fake a new start; - // write_fields() only iterates the list in one direction. - StardictEntryField field; - field.type = 't'; - field.data = pronunciation; - - GList start_link; - start_link.next = entry->fields; - start_link.data = &field; - - if (!generator_write_fields (generator, &start_link, &error) - || !generator_finish_entry (generator, - stardict_iterator_get_word (iterator), &error)) - fatal ("Error: write failed: %s\n", error->message); - - g_object_unref (entry); - - gpointer *tmp = output; - output = *output; - g_free (tmp); - - stardict_iterator_next (iterator); - } - - g_assert (output == NULL); - g_object_unref (iterator); - } - - putchar ('\n'); - if (!generator_finish (generator, &error)) - fatal ("Error: failed to write the dictionary: %s\n", error->message); - - generator_free (generator); - g_object_unref (dict); - g_free (void_entry); - return 0; -} diff --git a/src/query-tool.c b/src/query-tool.c deleted file mode 100644 index 6cfdc66..0000000 --- a/src/query-tool.c +++ /dev/null @@ -1,313 +0,0 @@ -/* - * A tool to query multiple dictionaries for the specified word - * - * Intended for use in IRC bots and similar silly things---words go in, - * one per each line, and entries come out, one dictionary at a time, - * finalised with an empty line. Newlines are escaped with `\n', - * backslashes with `\\'. - * - * So far only the `m', `g`, and `x` fields are supported, as in tdv. - * - * Copyright (c) 2013 - 2021, Přemysl Eric Janouch - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY - * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION - * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN - * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - * - */ - -#include -#include -#include -#include - -#include -#include -#include - -#include "stardict.h" -#include "stardict-private.h" -#include "generator.h" -#include "utils.h" - - -// --- Output formatting ------------------------------------------------------- - -/// Transform Pango attributes to in-line formatting sequences (non-reentrant) -typedef const gchar *(*FormatterFunc) (PangoAttrIterator *); - -static const gchar * -pango_attrs_ignore (G_GNUC_UNUSED PangoAttrIterator *iterator) -{ - return ""; -} - -static const gchar * -pango_attrs_to_irc (PangoAttrIterator *iterator) -{ - static gchar buf[5]; - gchar *p = buf; - *p++ = 0x0f; - - if (!iterator) - goto reset_formatting; - - PangoAttrInt *attr = NULL; - if ((attr = (PangoAttrInt *) pango_attr_iterator_get (iterator, - PANGO_ATTR_WEIGHT)) && attr->value >= PANGO_WEIGHT_BOLD) - *p++ = 0x02; - if ((attr = (PangoAttrInt *) pango_attr_iterator_get (iterator, - PANGO_ATTR_UNDERLINE)) && attr->value == PANGO_UNDERLINE_SINGLE) - *p++ = 0x1f; - if ((attr = (PangoAttrInt *) pango_attr_iterator_get (iterator, - PANGO_ATTR_STYLE)) && attr->value == PANGO_STYLE_ITALIC) - *p++ = 0x1d; - -reset_formatting: - *p++ = 0; - return buf; -} - -static const gchar * -pango_attrs_to_ansi (PangoAttrIterator *iterator) -{ - static gchar buf[16]; - g_strlcpy (buf, "\x1b[0", sizeof buf); - if (!iterator) - goto reset_formatting; - - PangoAttrInt *attr = NULL; - if ((attr = (PangoAttrInt *) pango_attr_iterator_get (iterator, - PANGO_ATTR_WEIGHT)) && attr->value >= PANGO_WEIGHT_BOLD) - g_strlcat (buf, ";1", sizeof buf); - if ((attr = (PangoAttrInt *) pango_attr_iterator_get (iterator, - PANGO_ATTR_UNDERLINE)) && attr->value == PANGO_UNDERLINE_SINGLE) - g_strlcat (buf, ";4", sizeof buf); - if ((attr = (PangoAttrInt *) pango_attr_iterator_get (iterator, - PANGO_ATTR_STYLE)) && attr->value == PANGO_STYLE_ITALIC) - g_strlcat (buf, ";3", sizeof buf); - -reset_formatting: - g_strlcat (buf, "m", sizeof buf); - return buf; -} - -static gchar * -pango_to_output_text (const gchar *markup, FormatterFunc formatter) -{ - // This function skips leading whitespace, but it's the canonical one - gchar *text = NULL; - PangoAttrList *attrs = NULL; - if (!pango_parse_markup (markup, -1, 0, &attrs, &text, NULL, NULL)) - return g_strdup_printf ("<%s>", ("error in entry")); - - PangoAttrIterator *iterator = pango_attr_list_get_iterator (attrs); - GString *result = g_string_new (""); - do - { - gint start = 0, end = 0; - pango_attr_iterator_range (iterator, &start, &end); - if (end == G_MAXINT) - end = strlen (text); - - g_string_append (result, formatter (iterator)); - g_string_append_len (result, text + start, end - start); - } - while (pango_attr_iterator_next (iterator)); - g_string_append (result, formatter (NULL)); - - g_free (text); - pango_attr_iterator_destroy (iterator); - pango_attr_list_unref (attrs); - return g_string_free (result, FALSE); -} - -static gchar * -field_to_output_text (const StardictEntryField *field, FormatterFunc formatter) -{ - const gchar *definition = field->data; - if (field->type == STARDICT_FIELD_MEANING) - return g_strdup (definition); - if (field->type == STARDICT_FIELD_PANGO) - return pango_to_output_text (definition, formatter); - if (field->type == STARDICT_FIELD_XDXF) - { - gchar *markup = xdxf_to_pango_markup_with_reduced_effort (definition); - gchar *result = pango_to_output_text (markup, formatter); - g_free (markup); - return result; - } - return NULL; -} - -// --- Main -------------------------------------------------------------------- - -static guint -count_equal_chars (const gchar *a, const gchar *b) -{ - guint count = 0; - while (*a && *b) - if (*a++ == *b++) - count++; - return count; -} - -static void -do_dictionary (StardictDict *dict, const gchar *word, FormatterFunc formatter) -{ - gboolean found; - StardictIterator *iter = stardict_dict_search (dict, word, &found); - if (!found) - goto out; - - // Default Stardict ordering is ASCII case-insensitive, - // which may be further exacerbated by our own collation feature. - // Try to find a better matching entry: - - gint64 best_offset = stardict_iterator_get_offset (iter); - guint best_score = count_equal_chars - (stardict_iterator_get_word (iter), word); - - while (TRUE) - { - stardict_iterator_next (iter); - if (!stardict_iterator_is_valid (iter)) - break; - - const gchar *iter_word = stardict_iterator_get_word (iter); - if (g_ascii_strcasecmp (iter_word, word)) - break; - - guint score = count_equal_chars (iter_word, word); - if (score > best_score) - { - best_offset = stardict_iterator_get_offset (iter); - best_score = score; - } - } - - stardict_iterator_set_offset (iter, best_offset, FALSE); - - StardictEntry *entry = stardict_iterator_get_entry (iter); - StardictInfo *info = stardict_dict_get_info (dict); - const GList *list = stardict_entry_get_fields (entry); - for (; list; list = list->next) - { - StardictEntryField *field = list->data; - gchar *definitions = field_to_output_text (field, formatter); - if (!definitions) - continue; - - printf ("%s\t", info->book_name); - for (const gchar *p = definitions; *p; p++) - { - if (*p == '\\') - printf ("\\\\"); - else if (*p == '\n') - printf ("\\n"); - else - putchar (*p); - } - putchar ('\n'); - g_free (definitions); - } - g_object_unref (entry); -out: - g_object_unref (iter); -} - -static FormatterFunc -parse_options (int *argc, char ***argv) -{ - GError *error = NULL; - GOptionContext *ctx = g_option_context_new - ("DICTIONARY.ifo... - query multiple dictionaries"); - - gboolean format_with_ansi = FALSE; - gboolean format_with_irc = FALSE; - GOptionEntry entries[] = - { - { "ansi", 'a', 0, G_OPTION_ARG_NONE, &format_with_ansi, - "Format with ANSI sequences", NULL }, - { "irc", 'i', 0, G_OPTION_ARG_NONE, &format_with_irc, - "Format with IRC codes", NULL }, - { } - }; - - g_option_context_add_main_entries (ctx, entries, NULL); - if (!g_option_context_parse (ctx, argc, argv, &error)) - { - g_printerr ("Error: option parsing failed: %s\n", error->message); - exit (EXIT_FAILURE); - } - if (*argc < 2) - { - g_printerr ("%s\n", g_option_context_get_help (ctx, TRUE, NULL)); - exit (EXIT_FAILURE); - } - g_option_context_free (ctx); - - if (format_with_ansi) - return pango_attrs_to_ansi; - if (format_with_irc) - return pango_attrs_to_irc; - - return pango_attrs_ignore; -} - -int -main (int argc, char *argv[]) -{ -G_GNUC_BEGIN_IGNORE_DEPRECATIONS - if (glib_check_version (2, 36, 0)) - g_type_init (); -G_GNUC_END_IGNORE_DEPRECATIONS - - FormatterFunc formatter = parse_options (&argc, &argv); - - guint n_dicts = argc - 1; - StardictDict **dicts = g_alloca (sizeof *dicts * n_dicts); - - guint i; - for (i = 1; i <= n_dicts; i++) - { - GError *error = NULL; - dicts[i - 1] = stardict_dict_new (argv[i], &error); - if (error) - { - g_printerr ("Error: opening dictionary `%s' failed: %s\n", - argv[i], error->message); - exit (EXIT_FAILURE); - } - } - - gint c; - do - { - GString *s = g_string_new (NULL); - while ((c = getchar ()) != EOF && c != '\n') - if (c != '\r') - g_string_append_c (s, c); - - if (s->len) - for (i = 0; i < n_dicts; i++) - do_dictionary (dicts[i], s->str, formatter); - - printf ("\n"); - fflush (NULL); - g_string_free (s, TRUE); - } - while (c != EOF); - - for (i = 0; i < n_dicts; i++) - g_object_unref (dicts[i]); - - return 0; -} diff --git a/src/tabfile.c b/src/tabfile.c deleted file mode 100644 index fab0ef2..0000000 --- a/src/tabfile.c +++ /dev/null @@ -1,223 +0,0 @@ -/* - * A clean reimplementation of StarDict's tabfile - * - * Copyright (c) 2020 - 2021, Přemysl Eric Janouch - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY - * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION - * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN - * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - * - */ - -#include -#include -#include -#include -#include - -#include -#include -#include - -#include - -#include "config.h" -#include "stardict.h" -#include "stardict-private.h" -#include "generator.h" -#include "utils.h" - - -static gboolean -set_data_error (GError **error, const gchar *message) -{ - g_set_error_literal (error, G_IO_ERROR, G_IO_ERROR_INVALID_DATA, message); - return FALSE; -} - -static const gchar escapes[256] = { ['n'] = '\n', ['t'] = '\t', ['\\'] = '\\' }; - -static gboolean -inplace_unescape (gchar *line, GError **error) -{ - gboolean escape = FALSE; - gchar *dest = line; - for (gchar *src = line; *src; src++) - { - if (escape) - { - escape = FALSE; - if (!(*dest++ = escapes[(guchar) *src])) - return set_data_error (error, "unsupported escape"); - } - else if (*src == '\\') - escape = TRUE; - else - *dest++ = *src; - } - if (escape) - return set_data_error (error, "trailing escape character"); - - *dest = 0; - return TRUE; -} - -static gboolean -import_line (Generator *generator, gchar *line, gsize len, GError **error) -{ - if (!len) - return TRUE; - if (!g_utf8_validate_len (line, len, NULL)) - return set_data_error (error, "not valid UTF-8"); - - gchar *separator = strchr (line, '\t'); - if (!separator) - return set_data_error (error, "keyword separator not found"); - - *separator++ = 0; - if (strchr (line, '\\')) - // The index wouldn't be sorted correctly with our method - return set_data_error (error, "escapes not allowed in keywords"); - - gchar *newline = strpbrk (separator, "\r\n"); - if (newline) - *newline = 0; - - if (!inplace_unescape (line, error) - || !inplace_unescape (separator, error)) - return FALSE; - - if (generator->info->same_type_sequence - && *generator->info->same_type_sequence == STARDICT_FIELD_PANGO - && !pango_parse_markup (separator, -1, 0, NULL, NULL, NULL, error)) - return FALSE; - - generator_begin_entry (generator); - return generator_write_string (generator, separator, TRUE, error) - && generator_finish_entry (generator, line, error); -} - -static gboolean -transform (FILE *fsorted, Generator *generator, GError **error) -{ - gchar *line = NULL; - gsize size = 0, ln = 1; - for (ssize_t read; (read = getline (&line, &size, fsorted)) >= 0; ln++) - if (!import_line (generator, line, read, error)) - break; - - free (line); - if (ferror (fsorted)) - { - g_set_error_literal (error, G_IO_ERROR, - g_io_error_from_errno (errno), g_strerror (errno)); - return FALSE; - } - if (!feof (fsorted)) - { - // You'll only get good line number output with presorted input! - g_prefix_error (error, "line %zu: ", ln); - return FALSE; - } - return TRUE; -} - -static void -validate_collation_locale (const gchar *locale) -{ - UErrorCode error = U_ZERO_ERROR; - UCollator *collator = ucol_open (locale, &error); - if (!collator) - fatal ("failed to create a collator for %s: %s\n", - locale, u_errorName (error)); - ucol_close (collator); -} - -int -main (int argc, char *argv[]) -{ - // The GLib help includes an ellipsis character, for some reason - (void) setlocale (LC_ALL, ""); - - GError *error = NULL; - GOptionContext *ctx = g_option_context_new ("output-basename < input"); - g_option_context_set_summary (ctx, - "Create a StarDict dictionary from plaintext."); - - gboolean pango_markup = FALSE; - StardictInfo template = {}; - GOptionEntry entries[] = - { - { "pango", 'p', 0, G_OPTION_ARG_NONE, &pango_markup, - "Entries use Pango markup", NULL }, - - { "book-name", 'b', 0, G_OPTION_ARG_STRING, &template.book_name, - "Set the book name field", "TEXT" }, - { "author", 'a', 0, G_OPTION_ARG_STRING, &template.author, - "Set the author field ", "NAME" }, - { "e-mail", 'e', 0, G_OPTION_ARG_STRING, &template.email, - "Set the e-mail field", "ADDRESS" }, - { "website", 'w', 0, G_OPTION_ARG_STRING, &template.website, - "Set the website field", "LINK" }, - { "description", 'd', 0, G_OPTION_ARG_STRING, &template.description, - "Set the description field (newlines supported)", "TEXT" }, - { "date", 'D', 0, G_OPTION_ARG_STRING, &template.date, - "Set the date field", "DATE" }, - { "collation", 'c', 0, G_OPTION_ARG_STRING, &template.collation, - "Set the collation field (for ICU)", "LOCALE" }, - { } - }; - - g_option_context_add_main_entries (ctx, entries, GETTEXT_PACKAGE); - if (!g_option_context_parse (ctx, &argc, &argv, &error)) - fatal ("Error: option parsing failed: %s\n", error->message); - if (argc != 2) - fatal ("%s", g_option_context_get_help (ctx, TRUE, NULL)); - g_option_context_free (ctx); - - template.version = SD_VERSION_3_0_0; - template.same_type_sequence = pango_markup - ? (char[]) { STARDICT_FIELD_PANGO, 0 } - : (char[]) { STARDICT_FIELD_MEANING, 0 }; - - if (!template.book_name) - template.book_name = argv[1]; - if (template.description) - { - gchar **lines = g_strsplit (template.description, "\n", -1); - g_free (template.description); - gchar *in_one_line = g_strjoinv ("
", lines); - g_strfreev (lines); - template.description = in_one_line; - } - if (template.collation) - validate_collation_locale (template.collation); - - // This actually implements stardict_strcmp(), POSIX-compatibly. - // Your sort(1) is not expected to be stable by default, like bsdsort is. - FILE *fsorted = popen ("LC_ALL=C sort -t'\t' -k1f,1", "r"); - if (!fsorted) - fatal ("%s: %s\n", "popen", g_strerror (errno)); - - Generator *generator = generator_new (argv[1], &error); - if (!generator) - fatal ("Error: failed to create the output dictionary: %s\n", - error->message); - - StardictInfo *info = generator->info; - stardict_info_copy (info, &template); - if (!transform (fsorted, generator, &error) - || !generator_finish (generator, &error)) - fatal ("Error: failed to write the dictionary: %s\n", error->message); - - generator_free (generator); - pclose (fsorted); - return 0; -} diff --git a/src/tdv-add-pronunciation.c b/src/tdv-add-pronunciation.c new file mode 100644 index 0000000..90d9673 --- /dev/null +++ b/src/tdv-add-pronunciation.c @@ -0,0 +1,469 @@ +/* + * A tool to add eSpeak-generated pronunciation to dictionaries + * + * Here I use the `espeak' process rather than libespeak because of the GPL. + * It's far from ideal, rather good as a starting point. + * + * Copyright (c) 2013, Přemysl Eric Janouch + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY + * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION + * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN + * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + * + */ + +#include +#include +#include +#include + +#include +#include + +#include "stardict.h" +#include "stardict-private.h" +#include "generator.h" +#include "utils.h" + + +// --- Pronunciation generator ------------------------------------------------- + +typedef struct worker_data WorkerData; + +struct worker_data +{ + gchar **cmdline; ///< eSpeak command line + guint ignore_acronyms : 1; ///< Don't spell out acronyms + GRegex *re_stop; ///< Regex for stop sequences + GRegex *re_acronym; ///< Regex for ACRONYMS + + guint32 start_entry; ///< The first entry to be processed + guint32 end_entry; ///< Past the last entry to be processed + + // Reader, writer + GMutex *dict_mutex; ///< Locks the dictionary object + + // Reader + GThread *main_thread; ///< A handle to the reader thread + StardictDict *dict; ///< The dictionary object + gpointer output; ///< Linked-list of pronunciation data + + GMutex *remaining_mutex; ///< Locks the progress stats + GCond *remaining_cond; ///< Signals a change in progress + guint32 remaining; ///< How many entries remain + guint32 total; ///< Total number of entries + + // Writer + StardictIterator *iterator; ///< Iterates over the dictionary + FILE *child_stdin; ///< Standard input of eSpeak +}; + +/// eSpeak splits the output on certain characters. +#define LINE_SPLITTING_CHARS ".,:;?!" + +/// We don't want to include brackets either. +#define OTHER_STOP_CHARS "([{<" + +/// A void word used to make a unique "no pronunciation available" mark. +#define VOID_ENTRY "not present in any dictionary" + + +/// Adds dots between characters. +static gboolean +writer_acronym_cb (const GMatchInfo *info, GString *res, + G_GNUC_UNUSED gpointer data) +{ + gchar *preceding = g_match_info_fetch (info, 1); + g_string_append (res, preceding); + g_free (preceding); + + gchar *word = g_match_info_fetch (info, 2); + + g_string_append_c (res, *word); + const gchar *p; + for (p = word + 1; *p; p++) + { + g_string_append_c (res, '.'); + g_string_append_c (res, *p); + } + + g_free (word); + return FALSE; +} + +/// Writes to espeak's stdin. +static gpointer +worker_writer (WorkerData *data) +{ + GError *error = NULL; + GMatchInfo *match_info; + while (stardict_iterator_get_offset (data->iterator) != data->end_entry) + { + g_mutex_lock (data->dict_mutex); + const gchar *word = stardict_iterator_get_word (data->iterator); + g_mutex_unlock (data->dict_mutex); + + word += strspn (word, LINE_SPLITTING_CHARS " \t"); + gchar *x = g_strdup (word); + + // Cut the word if needed be + error = NULL; + if (g_regex_match_full (data->re_stop, + x, -1, 0, 0, &match_info, &error)) + { + gint start_pos; + g_match_info_fetch_pos (match_info, 0, &start_pos, NULL); + x[start_pos] = 0; + } + g_match_info_free (match_info); + + // Change acronyms so that they're not pronounced as words + if (!error && !data->ignore_acronyms) + { + char *tmp = g_regex_replace_eval (data->re_acronym, + x, -1, 0, 0, writer_acronym_cb, NULL, &error); + g_free (x); + x = tmp; + } + + if (error) + { + g_printerr ("Notice: error processing '%s': %s\n", + word, error->message); + g_clear_error (&error); + *x = 0; + } + + // We might have accidentally cut off everything + if (!*x) + { + g_free (x); + x = g_strdup (VOID_ENTRY); + } + + stardict_iterator_next (data->iterator); + if (fprintf (data->child_stdin, "%s\n", x) < 0) + fatal ("write to eSpeak failed: %s\n", g_strerror (errno)); + + g_free (x); + } + + g_object_unref (data->iterator); + return GINT_TO_POINTER (fclose (data->child_stdin)); +} + +/// Get the void entry (and test if espeak works). +static gchar * +get_void_entry (gchar *cmdline[]) +{ + gchar *output; + gint exit_status; + + GError *error = NULL; + if (!g_spawn_sync (NULL, cmdline, NULL, + G_SPAWN_SEARCH_PATH | G_SPAWN_STDERR_TO_DEV_NULL, NULL, NULL, + &output, NULL, &exit_status, &error)) + fatal ("Error: couldn't spawn espeak: %s\n", error->message); + + if (exit_status) + fatal ("Error: espeak returned %d\n", exit_status); + + return output; +} + +/// Reads from espeak's stdout. +static gpointer +worker (WorkerData *data) +{ + // Spawn eSpeak + GError *error = NULL; + gint child_in, child_out; + if (!g_spawn_async_with_pipes (NULL, data->cmdline, NULL, + G_SPAWN_SEARCH_PATH, NULL, NULL, + NULL, &child_in, &child_out, NULL, &error)) + fatal ("g_spawn: %s\n", error->message); + + data->child_stdin = fdopen (child_in, "wb"); + if (!data->child_stdin) + perror ("fdopen"); + + FILE *child_stdout = fdopen (child_out, "rb"); + if (!child_stdout) + perror ("fdopen"); + + // Spawn a writer thread + g_mutex_lock (data->dict_mutex); + data->iterator = stardict_iterator_new (data->dict, data->start_entry); + g_mutex_unlock (data->dict_mutex); + + GThread *writer = g_thread_new ("write worker", + (GThreadFunc) worker_writer, data); + + // Read the output + g_mutex_lock (data->remaining_mutex); + guint32 remaining = data->remaining; + g_mutex_unlock (data->remaining_mutex); + + data->output = NULL; + gpointer *output_end = &data->output; + while (remaining) + { + static gchar next[sizeof (gpointer)]; + GString *s = g_string_new (NULL); + g_string_append_len (s, next, sizeof next); + + gint c; + while ((c = fgetc (child_stdout)) != EOF && c != '\n') + g_string_append_c (s, c); + if (c == EOF) + fatal ("eSpeak process died too soon\n"); + + gchar *translation = g_string_free (s, FALSE); + *output_end = translation; + output_end = (gpointer *) translation; + + // We limit progress reporting so that + // the mutex doesn't spin like crazy + if ((--remaining & 255) != 0) + continue; + + g_mutex_lock (data->remaining_mutex); + data->remaining = remaining; + g_cond_broadcast (data->remaining_cond); + g_mutex_unlock (data->remaining_mutex); + } + + if (fgetc (child_stdout) != EOF) + fatal ("Error: eSpeak has written more lines than it should. " + "The output would be corrupt, aborting.\n"); + + fclose (child_stdout); + return g_thread_join (writer); +} + +// --- Main -------------------------------------------------------------------- + +int +main (int argc, char *argv[]) +{ + gint n_processes = 1; + gchar *voice = NULL; + gboolean ignore_acronyms = FALSE; + + GOptionEntry entries[] = + { + { "processes", 'N', G_OPTION_FLAG_IN_MAIN, + G_OPTION_ARG_INT, &n_processes, + "The number of espeak processes run in parallel", "PROCESSES" }, + { "voice", 'v', G_OPTION_FLAG_IN_MAIN, + G_OPTION_ARG_STRING, &voice, + "The voice to be used by eSpeak to pronounce the words", "VOICE" }, + { "ignore-acronyms", 0, G_OPTION_FLAG_IN_MAIN, + G_OPTION_ARG_NONE, &ignore_acronyms, + "Don't spell out words composed of big letters only", NULL }, + { NULL } + }; + +G_GNUC_BEGIN_IGNORE_DEPRECATIONS + if (glib_check_version (2, 36, 0)) + g_type_init (); +G_GNUC_END_IGNORE_DEPRECATIONS + + GError *error = NULL; + GOptionContext *ctx = g_option_context_new + ("input.ifo output-basename - add pronunciation to dictionaries"); + g_option_context_add_main_entries (ctx, entries, NULL); + if (!g_option_context_parse (ctx, &argc, &argv, &error)) + fatal ("Error: option parsing failed: %s\n", error->message); + + if (argc != 3) + fatal ("%s", g_option_context_get_help (ctx, TRUE, NULL)); + + g_option_context_free (ctx); + + // See if we can run espeak + static gchar *cmdline[] = { "espeak", "--ipa", "-q", NULL, NULL, NULL }; + + if (voice) + { + cmdline[3] = "-v"; + cmdline[4] = voice; + } + + gchar *void_entry = g_strstrip (get_void_entry (cmdline)); + + // Load the dictionary + printf ("Loading the original dictionary...\n"); + StardictDict *dict = stardict_dict_new (argv[1], &error); + if (!dict) + fatal ("Error: opening the dictionary failed: %s\n", error->message); + + gsize n_words = stardict_info_get_word_count + (stardict_dict_get_info (dict)); + + if (n_processes <= 0) + fatal ("Error: there must be at least one process\n"); + + if ((gsize) n_processes > n_words * 1024) + { + n_processes = n_words / 1024; + if (!n_processes) + n_processes = 1; + g_printerr ("Warning: too many processes, reducing to %d\n", + n_processes); + } + + // Spawn worker threads to generate pronunciation data + static GMutex dict_mutex; + + static GMutex remaining_mutex; + static GCond remaining_cond; + + WorkerData *data = g_alloca (sizeof *data * n_processes); + + GRegex *re_stop = g_regex_new ("[" LINE_SPLITTING_CHARS "][ ?]" + "|\\.\\.\\.|[" OTHER_STOP_CHARS "]", G_REGEX_OPTIMIZE, 0, &error); + g_assert (re_stop != NULL); + + GRegex *re_acronym = g_regex_new ("(^|\\pZ)(\\p{Lu}+)(?=\\pZ|$)", + G_REGEX_OPTIMIZE, 0, &error); + g_assert (re_acronym != NULL); + + gint i; + for (i = 0; i < n_processes; i++) + { + data[i].start_entry = n_words * i / n_processes; + data[i].end_entry = n_words * (i + 1) / n_processes; + + data[i].total = data[i].remaining = + data[i].end_entry - data[i].start_entry; + data[i].remaining_mutex = &remaining_mutex; + data[i].remaining_cond = &remaining_cond; + + data[i].dict = dict; + data[i].dict_mutex = &dict_mutex; + + data[i].re_stop = re_stop; + data[i].re_acronym = re_acronym; + + data[i].cmdline = cmdline; + data[i].ignore_acronyms = ignore_acronyms; + data[i].main_thread = + g_thread_new ("worker", (GThreadFunc) worker, &data[i]); + } + + // Loop while the threads still have some work to do and report status + g_mutex_lock (&remaining_mutex); + for (;;) + { + gboolean all_finished = TRUE; + printf ("\rRetrieving pronunciation... "); + for (i = 0; i < n_processes; i++) + { + printf ("%3u%% ", 100 - data[i].remaining * 100 / data[i].total); + if (data[i].remaining) + all_finished = FALSE; + } + + if (all_finished) + break; + g_cond_wait (&remaining_cond, &remaining_mutex); + } + g_mutex_unlock (&remaining_mutex); + + putchar ('\n'); + for (i = 0; i < n_processes; i++) + g_thread_join (data[i].main_thread); + + g_regex_unref (re_stop); + g_regex_unref (re_acronym); + + // Put extended entries into a new dictionary + Generator *generator = generator_new (argv[2], &error); + if (!generator) + fatal ("Error: failed to create the output dictionary: %s\n", + error->message); + + StardictInfo *info = generator->info; + stardict_info_copy (info, stardict_dict_get_info (dict)); + + // This gets incremented each time an entry is finished + info->word_count = 0; + + if (info->same_type_sequence) + { + gchar *new_sts = g_strconcat ("t", info->same_type_sequence, NULL); + g_free (info->same_type_sequence); + info->same_type_sequence = new_sts; + } + + // Write out all the entries together with the pronunciation + for (i = 0; i < n_processes; i++) + { + StardictIterator *iterator = + stardict_iterator_new (dict, data[i].start_entry); + + gpointer *output = data[i].output; + while (stardict_iterator_get_offset (iterator) != data[i].end_entry) + { + printf ("\rCreating a new dictionary... %3lu%%", + (gulong) stardict_iterator_get_offset (iterator) * 100 + / stardict_dict_get_info (dict)->word_count); + + g_assert (output != NULL); + + gchar *pronunciation = g_strstrip ((gchar *) (output + 1)); + StardictEntry *entry = stardict_iterator_get_entry (iterator); + + generator_begin_entry (generator); + + if (!strcmp (pronunciation, void_entry)) + *pronunciation = 0; + +// g_printerr ("%s /%s/\n", +// stardict_iterator_get_word (iterator), pronunciation); + + // For the sake of simplicity we fake a new start; + // write_fields() only iterates the list in one direction. + StardictEntryField field; + field.type = 't'; + field.data = pronunciation; + + GList start_link; + start_link.next = entry->fields; + start_link.data = &field; + + if (!generator_write_fields (generator, &start_link, &error) + || !generator_finish_entry (generator, + stardict_iterator_get_word (iterator), &error)) + fatal ("Error: write failed: %s\n", error->message); + + g_object_unref (entry); + + gpointer *tmp = output; + output = *output; + g_free (tmp); + + stardict_iterator_next (iterator); + } + + g_assert (output == NULL); + g_object_unref (iterator); + } + + putchar ('\n'); + if (!generator_finish (generator, &error)) + fatal ("Error: failed to write the dictionary: %s\n", error->message); + + generator_free (generator); + g_object_unref (dict); + g_free (void_entry); + return 0; +} diff --git a/src/tdv-query-tool.c b/src/tdv-query-tool.c new file mode 100644 index 0000000..6cfdc66 --- /dev/null +++ b/src/tdv-query-tool.c @@ -0,0 +1,313 @@ +/* + * A tool to query multiple dictionaries for the specified word + * + * Intended for use in IRC bots and similar silly things---words go in, + * one per each line, and entries come out, one dictionary at a time, + * finalised with an empty line. Newlines are escaped with `\n', + * backslashes with `\\'. + * + * So far only the `m', `g`, and `x` fields are supported, as in tdv. + * + * Copyright (c) 2013 - 2021, Přemysl Eric Janouch + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY + * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION + * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN + * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + * + */ + +#include +#include +#include +#include + +#include +#include +#include + +#include "stardict.h" +#include "stardict-private.h" +#include "generator.h" +#include "utils.h" + + +// --- Output formatting ------------------------------------------------------- + +/// Transform Pango attributes to in-line formatting sequences (non-reentrant) +typedef const gchar *(*FormatterFunc) (PangoAttrIterator *); + +static const gchar * +pango_attrs_ignore (G_GNUC_UNUSED PangoAttrIterator *iterator) +{ + return ""; +} + +static const gchar * +pango_attrs_to_irc (PangoAttrIterator *iterator) +{ + static gchar buf[5]; + gchar *p = buf; + *p++ = 0x0f; + + if (!iterator) + goto reset_formatting; + + PangoAttrInt *attr = NULL; + if ((attr = (PangoAttrInt *) pango_attr_iterator_get (iterator, + PANGO_ATTR_WEIGHT)) && attr->value >= PANGO_WEIGHT_BOLD) + *p++ = 0x02; + if ((attr = (PangoAttrInt *) pango_attr_iterator_get (iterator, + PANGO_ATTR_UNDERLINE)) && attr->value == PANGO_UNDERLINE_SINGLE) + *p++ = 0x1f; + if ((attr = (PangoAttrInt *) pango_attr_iterator_get (iterator, + PANGO_ATTR_STYLE)) && attr->value == PANGO_STYLE_ITALIC) + *p++ = 0x1d; + +reset_formatting: + *p++ = 0; + return buf; +} + +static const gchar * +pango_attrs_to_ansi (PangoAttrIterator *iterator) +{ + static gchar buf[16]; + g_strlcpy (buf, "\x1b[0", sizeof buf); + if (!iterator) + goto reset_formatting; + + PangoAttrInt *attr = NULL; + if ((attr = (PangoAttrInt *) pango_attr_iterator_get (iterator, + PANGO_ATTR_WEIGHT)) && attr->value >= PANGO_WEIGHT_BOLD) + g_strlcat (buf, ";1", sizeof buf); + if ((attr = (PangoAttrInt *) pango_attr_iterator_get (iterator, + PANGO_ATTR_UNDERLINE)) && attr->value == PANGO_UNDERLINE_SINGLE) + g_strlcat (buf, ";4", sizeof buf); + if ((attr = (PangoAttrInt *) pango_attr_iterator_get (iterator, + PANGO_ATTR_STYLE)) && attr->value == PANGO_STYLE_ITALIC) + g_strlcat (buf, ";3", sizeof buf); + +reset_formatting: + g_strlcat (buf, "m", sizeof buf); + return buf; +} + +static gchar * +pango_to_output_text (const gchar *markup, FormatterFunc formatter) +{ + // This function skips leading whitespace, but it's the canonical one + gchar *text = NULL; + PangoAttrList *attrs = NULL; + if (!pango_parse_markup (markup, -1, 0, &attrs, &text, NULL, NULL)) + return g_strdup_printf ("<%s>", ("error in entry")); + + PangoAttrIterator *iterator = pango_attr_list_get_iterator (attrs); + GString *result = g_string_new (""); + do + { + gint start = 0, end = 0; + pango_attr_iterator_range (iterator, &start, &end); + if (end == G_MAXINT) + end = strlen (text); + + g_string_append (result, formatter (iterator)); + g_string_append_len (result, text + start, end - start); + } + while (pango_attr_iterator_next (iterator)); + g_string_append (result, formatter (NULL)); + + g_free (text); + pango_attr_iterator_destroy (iterator); + pango_attr_list_unref (attrs); + return g_string_free (result, FALSE); +} + +static gchar * +field_to_output_text (const StardictEntryField *field, FormatterFunc formatter) +{ + const gchar *definition = field->data; + if (field->type == STARDICT_FIELD_MEANING) + return g_strdup (definition); + if (field->type == STARDICT_FIELD_PANGO) + return pango_to_output_text (definition, formatter); + if (field->type == STARDICT_FIELD_XDXF) + { + gchar *markup = xdxf_to_pango_markup_with_reduced_effort (definition); + gchar *result = pango_to_output_text (markup, formatter); + g_free (markup); + return result; + } + return NULL; +} + +// --- Main -------------------------------------------------------------------- + +static guint +count_equal_chars (const gchar *a, const gchar *b) +{ + guint count = 0; + while (*a && *b) + if (*a++ == *b++) + count++; + return count; +} + +static void +do_dictionary (StardictDict *dict, const gchar *word, FormatterFunc formatter) +{ + gboolean found; + StardictIterator *iter = stardict_dict_search (dict, word, &found); + if (!found) + goto out; + + // Default Stardict ordering is ASCII case-insensitive, + // which may be further exacerbated by our own collation feature. + // Try to find a better matching entry: + + gint64 best_offset = stardict_iterator_get_offset (iter); + guint best_score = count_equal_chars + (stardict_iterator_get_word (iter), word); + + while (TRUE) + { + stardict_iterator_next (iter); + if (!stardict_iterator_is_valid (iter)) + break; + + const gchar *iter_word = stardict_iterator_get_word (iter); + if (g_ascii_strcasecmp (iter_word, word)) + break; + + guint score = count_equal_chars (iter_word, word); + if (score > best_score) + { + best_offset = stardict_iterator_get_offset (iter); + best_score = score; + } + } + + stardict_iterator_set_offset (iter, best_offset, FALSE); + + StardictEntry *entry = stardict_iterator_get_entry (iter); + StardictInfo *info = stardict_dict_get_info (dict); + const GList *list = stardict_entry_get_fields (entry); + for (; list; list = list->next) + { + StardictEntryField *field = list->data; + gchar *definitions = field_to_output_text (field, formatter); + if (!definitions) + continue; + + printf ("%s\t", info->book_name); + for (const gchar *p = definitions; *p; p++) + { + if (*p == '\\') + printf ("\\\\"); + else if (*p == '\n') + printf ("\\n"); + else + putchar (*p); + } + putchar ('\n'); + g_free (definitions); + } + g_object_unref (entry); +out: + g_object_unref (iter); +} + +static FormatterFunc +parse_options (int *argc, char ***argv) +{ + GError *error = NULL; + GOptionContext *ctx = g_option_context_new + ("DICTIONARY.ifo... - query multiple dictionaries"); + + gboolean format_with_ansi = FALSE; + gboolean format_with_irc = FALSE; + GOptionEntry entries[] = + { + { "ansi", 'a', 0, G_OPTION_ARG_NONE, &format_with_ansi, + "Format with ANSI sequences", NULL }, + { "irc", 'i', 0, G_OPTION_ARG_NONE, &format_with_irc, + "Format with IRC codes", NULL }, + { } + }; + + g_option_context_add_main_entries (ctx, entries, NULL); + if (!g_option_context_parse (ctx, argc, argv, &error)) + { + g_printerr ("Error: option parsing failed: %s\n", error->message); + exit (EXIT_FAILURE); + } + if (*argc < 2) + { + g_printerr ("%s\n", g_option_context_get_help (ctx, TRUE, NULL)); + exit (EXIT_FAILURE); + } + g_option_context_free (ctx); + + if (format_with_ansi) + return pango_attrs_to_ansi; + if (format_with_irc) + return pango_attrs_to_irc; + + return pango_attrs_ignore; +} + +int +main (int argc, char *argv[]) +{ +G_GNUC_BEGIN_IGNORE_DEPRECATIONS + if (glib_check_version (2, 36, 0)) + g_type_init (); +G_GNUC_END_IGNORE_DEPRECATIONS + + FormatterFunc formatter = parse_options (&argc, &argv); + + guint n_dicts = argc - 1; + StardictDict **dicts = g_alloca (sizeof *dicts * n_dicts); + + guint i; + for (i = 1; i <= n_dicts; i++) + { + GError *error = NULL; + dicts[i - 1] = stardict_dict_new (argv[i], &error); + if (error) + { + g_printerr ("Error: opening dictionary `%s' failed: %s\n", + argv[i], error->message); + exit (EXIT_FAILURE); + } + } + + gint c; + do + { + GString *s = g_string_new (NULL); + while ((c = getchar ()) != EOF && c != '\n') + if (c != '\r') + g_string_append_c (s, c); + + if (s->len) + for (i = 0; i < n_dicts; i++) + do_dictionary (dicts[i], s->str, formatter); + + printf ("\n"); + fflush (NULL); + g_string_free (s, TRUE); + } + while (c != EOF); + + for (i = 0; i < n_dicts; i++) + g_object_unref (dicts[i]); + + return 0; +} diff --git a/src/tdv-tabfile.c b/src/tdv-tabfile.c new file mode 100644 index 0000000..fab0ef2 --- /dev/null +++ b/src/tdv-tabfile.c @@ -0,0 +1,223 @@ +/* + * A clean reimplementation of StarDict's tabfile + * + * Copyright (c) 2020 - 2021, Přemysl Eric Janouch + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY + * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION + * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN + * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + * + */ + +#include +#include +#include +#include +#include + +#include +#include +#include + +#include + +#include "config.h" +#include "stardict.h" +#include "stardict-private.h" +#include "generator.h" +#include "utils.h" + + +static gboolean +set_data_error (GError **error, const gchar *message) +{ + g_set_error_literal (error, G_IO_ERROR, G_IO_ERROR_INVALID_DATA, message); + return FALSE; +} + +static const gchar escapes[256] = { ['n'] = '\n', ['t'] = '\t', ['\\'] = '\\' }; + +static gboolean +inplace_unescape (gchar *line, GError **error) +{ + gboolean escape = FALSE; + gchar *dest = line; + for (gchar *src = line; *src; src++) + { + if (escape) + { + escape = FALSE; + if (!(*dest++ = escapes[(guchar) *src])) + return set_data_error (error, "unsupported escape"); + } + else if (*src == '\\') + escape = TRUE; + else + *dest++ = *src; + } + if (escape) + return set_data_error (error, "trailing escape character"); + + *dest = 0; + return TRUE; +} + +static gboolean +import_line (Generator *generator, gchar *line, gsize len, GError **error) +{ + if (!len) + return TRUE; + if (!g_utf8_validate_len (line, len, NULL)) + return set_data_error (error, "not valid UTF-8"); + + gchar *separator = strchr (line, '\t'); + if (!separator) + return set_data_error (error, "keyword separator not found"); + + *separator++ = 0; + if (strchr (line, '\\')) + // The index wouldn't be sorted correctly with our method + return set_data_error (error, "escapes not allowed in keywords"); + + gchar *newline = strpbrk (separator, "\r\n"); + if (newline) + *newline = 0; + + if (!inplace_unescape (line, error) + || !inplace_unescape (separator, error)) + return FALSE; + + if (generator->info->same_type_sequence + && *generator->info->same_type_sequence == STARDICT_FIELD_PANGO + && !pango_parse_markup (separator, -1, 0, NULL, NULL, NULL, error)) + return FALSE; + + generator_begin_entry (generator); + return generator_write_string (generator, separator, TRUE, error) + && generator_finish_entry (generator, line, error); +} + +static gboolean +transform (FILE *fsorted, Generator *generator, GError **error) +{ + gchar *line = NULL; + gsize size = 0, ln = 1; + for (ssize_t read; (read = getline (&line, &size, fsorted)) >= 0; ln++) + if (!import_line (generator, line, read, error)) + break; + + free (line); + if (ferror (fsorted)) + { + g_set_error_literal (error, G_IO_ERROR, + g_io_error_from_errno (errno), g_strerror (errno)); + return FALSE; + } + if (!feof (fsorted)) + { + // You'll only get good line number output with presorted input! + g_prefix_error (error, "line %zu: ", ln); + return FALSE; + } + return TRUE; +} + +static void +validate_collation_locale (const gchar *locale) +{ + UErrorCode error = U_ZERO_ERROR; + UCollator *collator = ucol_open (locale, &error); + if (!collator) + fatal ("failed to create a collator for %s: %s\n", + locale, u_errorName (error)); + ucol_close (collator); +} + +int +main (int argc, char *argv[]) +{ + // The GLib help includes an ellipsis character, for some reason + (void) setlocale (LC_ALL, ""); + + GError *error = NULL; + GOptionContext *ctx = g_option_context_new ("output-basename < input"); + g_option_context_set_summary (ctx, + "Create a StarDict dictionary from plaintext."); + + gboolean pango_markup = FALSE; + StardictInfo template = {}; + GOptionEntry entries[] = + { + { "pango", 'p', 0, G_OPTION_ARG_NONE, &pango_markup, + "Entries use Pango markup", NULL }, + + { "book-name", 'b', 0, G_OPTION_ARG_STRING, &template.book_name, + "Set the book name field", "TEXT" }, + { "author", 'a', 0, G_OPTION_ARG_STRING, &template.author, + "Set the author field ", "NAME" }, + { "e-mail", 'e', 0, G_OPTION_ARG_STRING, &template.email, + "Set the e-mail field", "ADDRESS" }, + { "website", 'w', 0, G_OPTION_ARG_STRING, &template.website, + "Set the website field", "LINK" }, + { "description", 'd', 0, G_OPTION_ARG_STRING, &template.description, + "Set the description field (newlines supported)", "TEXT" }, + { "date", 'D', 0, G_OPTION_ARG_STRING, &template.date, + "Set the date field", "DATE" }, + { "collation", 'c', 0, G_OPTION_ARG_STRING, &template.collation, + "Set the collation field (for ICU)", "LOCALE" }, + { } + }; + + g_option_context_add_main_entries (ctx, entries, GETTEXT_PACKAGE); + if (!g_option_context_parse (ctx, &argc, &argv, &error)) + fatal ("Error: option parsing failed: %s\n", error->message); + if (argc != 2) + fatal ("%s", g_option_context_get_help (ctx, TRUE, NULL)); + g_option_context_free (ctx); + + template.version = SD_VERSION_3_0_0; + template.same_type_sequence = pango_markup + ? (char[]) { STARDICT_FIELD_PANGO, 0 } + : (char[]) { STARDICT_FIELD_MEANING, 0 }; + + if (!template.book_name) + template.book_name = argv[1]; + if (template.description) + { + gchar **lines = g_strsplit (template.description, "\n", -1); + g_free (template.description); + gchar *in_one_line = g_strjoinv ("
", lines); + g_strfreev (lines); + template.description = in_one_line; + } + if (template.collation) + validate_collation_locale (template.collation); + + // This actually implements stardict_strcmp(), POSIX-compatibly. + // Your sort(1) is not expected to be stable by default, like bsdsort is. + FILE *fsorted = popen ("LC_ALL=C sort -t'\t' -k1f,1", "r"); + if (!fsorted) + fatal ("%s: %s\n", "popen", g_strerror (errno)); + + Generator *generator = generator_new (argv[1], &error); + if (!generator) + fatal ("Error: failed to create the output dictionary: %s\n", + error->message); + + StardictInfo *info = generator->info; + stardict_info_copy (info, &template); + if (!transform (fsorted, generator, &error) + || !generator_finish (generator, &error)) + fatal ("Error: failed to write the dictionary: %s\n", error->message); + + generator_free (generator); + pclose (fsorted); + return 0; +} diff --git a/src/tdv-transform.c b/src/tdv-transform.c new file mode 100644 index 0000000..7520eb8 --- /dev/null +++ b/src/tdv-transform.c @@ -0,0 +1,226 @@ +/* + * A tool to transform dictionaries dictionaries by an external filter + * + * The external filter needs to process NUL-separated textual entries. + * + * Example: tdv-transform input.ifo output -- perl -p0e s/bullshit/soykaf/g + * + * Copyright (c) 2020, Přemysl Eric Janouch + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY + * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION + * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN + * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + * + */ + +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "stardict.h" +#include "stardict-private.h" +#include "generator.h" +#include "utils.h" + +enum { PIPE_READ, PIPE_WRITE }; + + +// --- Main -------------------------------------------------------------------- + +static inline void +print_progress (gulong *last_percent, StardictIterator *iterator, gsize total) +{ + gulong percent = + (gulong) stardict_iterator_get_offset (iterator) * 100 / total; + if (percent != *last_percent) + { + printf ("\r Writing entries... %3lu%%", percent); + *last_percent = percent; + } +} + +static gboolean +write_to_filter (StardictDict *dict, gint fd, GError **error) +{ + StardictInfo *info = stardict_dict_get_info (dict); + gsize n_words = stardict_info_get_word_count (info); + + StardictIterator *iterator = stardict_iterator_new (dict, 0); + gulong last_percent = -1; + while (stardict_iterator_is_valid (iterator)) + { + print_progress (&last_percent, iterator, n_words); + + StardictEntry *entry = stardict_iterator_get_entry (iterator); + for (const GList *fields = stardict_entry_get_fields (entry); + fields; fields = fields->next) + { + StardictEntryField *field = fields->data; + if (!g_ascii_islower (field->type)) + continue; + + if (write (fd, field->data, field->data_size) + != (ssize_t) field->data_size) + { + g_set_error (error, G_IO_ERROR, g_io_error_from_errno (errno), + "%s", g_strerror (errno)); + return FALSE; + } + } + + g_object_unref (entry); + stardict_iterator_next (iterator); + } + printf ("\n"); + return TRUE; +} + +static gboolean +update_from_filter (StardictDict *dict, Generator *generator, + GMappedFile *filtered_file, GError **error) +{ + gchar *filtered = g_mapped_file_get_contents (filtered_file); + gchar *filtered_end = filtered + g_mapped_file_get_length (filtered_file); + + StardictInfo *info = stardict_dict_get_info (dict); + gsize n_words = stardict_info_get_word_count (info); + + StardictIterator *iterator = stardict_iterator_new (dict, 0); + gulong last_percent = -1; + while (stardict_iterator_is_valid (iterator)) + { + print_progress (&last_percent, iterator, n_words); + + StardictEntry *entry = stardict_iterator_get_entry (iterator); + generator_begin_entry (generator); + + for (GList *fields = entry->fields; fields; fields = fields->next) + { + StardictEntryField *field = fields->data; + if (!g_ascii_islower (field->type)) + continue; + + gchar *end = memchr (filtered, 0, filtered_end - filtered); + if (!end) + { + g_set_error (error, G_IO_ERROR, G_IO_ERROR_PARTIAL_INPUT, + "filter seems to have ended too early"); + return FALSE; + } + + g_free (field->data); + field->data = g_strdup (filtered); + field->data_size = end - filtered + 1; + filtered = end + 1; + } + + if (!generator_write_fields (generator, entry->fields, error) + || !generator_finish_entry (generator, + stardict_iterator_get_word (iterator), error)) + return FALSE; + + g_object_unref (entry); + stardict_iterator_next (iterator); + } + printf ("\n"); + return TRUE; +} + +int +main (int argc, char *argv[]) +{ + // The GLib help includes an ellipsis character, for some reason + (void) setlocale (LC_ALL, ""); + + GError *error = NULL; + GOptionContext *ctx = g_option_context_new + ("input.ifo output-basename -- FILTER [ARG...]"); + g_option_context_set_summary + (ctx, "Transform dictionaries using a filter program."); + if (!g_option_context_parse (ctx, &argc, &argv, &error)) + fatal ("Error: option parsing failed: %s\n", error->message); + + if (argc < 3) + fatal ("%s", g_option_context_get_help (ctx, TRUE, NULL)); + + // GLib is bullshit, getopt_long() always correctly removes this + gint program_argv_start = 3; + if (!strcmp (argv[program_argv_start], "--")) + program_argv_start++; + + g_option_context_free (ctx); + + printf ("Loading the original dictionary...\n"); + StardictDict *dict = stardict_dict_new (argv[1], &error); + if (!dict) + fatal ("Error: opening the dictionary failed: %s\n", error->message); + + printf ("Filtering entries...\n"); + gint child_in[2]; + if (!g_unix_open_pipe (child_in, 0, &error)) + fatal ("g_unix_open_pipe: %s\n", error->message); + + FILE *child_out = tmpfile (); + if (!child_out) + fatal ("tmpfile: %s\n", g_strerror (errno)); + + GPid pid = -1; + if (!g_spawn_async_with_fds (NULL /* working_directory */, + argv + program_argv_start /* forward a part of ours */, NULL /* envp */, + G_SPAWN_SEARCH_PATH | G_SPAWN_DO_NOT_REAP_CHILD, + NULL /* child_setup */, NULL /* user_data */, + &pid, child_in[PIPE_READ], fileno (child_out), STDERR_FILENO, &error)) + fatal ("g_spawn: %s\n", error->message); + if (!write_to_filter (dict, child_in[PIPE_WRITE], &error)) + fatal ("write_to_filter: %s\n", error->message); + if (!g_close (child_in[PIPE_READ], &error) + || !g_close (child_in[PIPE_WRITE], &error)) + fatal ("g_close: %s\n", error->message); + + printf ("Waiting for the filter to finish...\n"); + int wstatus = errno = 0; + if (waitpid (pid, &wstatus, 0) < 1 + || !WIFEXITED (wstatus) || WEXITSTATUS (wstatus) > 0) + fatal ("Filter failed (%s, status %d)\n", g_strerror (errno), wstatus); + + GMappedFile *filtered = g_mapped_file_new_from_fd (fileno (child_out), + FALSE /* writable */, &error); + if (!filtered) + fatal ("g_mapped_file_new_from_fd: %s\n", error->message); + + printf ("Writing the new dictionary...\n"); + Generator *generator = generator_new (argv[2], &error); + if (!generator) + fatal ("Error: failed to create the output dictionary: %s\n", + error->message); + + StardictInfo *info = generator->info; + stardict_info_copy (info, stardict_dict_get_info (dict)); + + // This gets incremented each time an entry is finished + info->word_count = 0; + + if (!update_from_filter (dict, generator, filtered, &error) + || !generator_finish (generator, &error)) + fatal ("Error: failed to write the dictionary: %s\n", error->message); + + g_mapped_file_unref (filtered); + fclose (child_out); + generator_free (generator); + g_object_unref (dict); + return 0; +} diff --git a/src/transform.c b/src/transform.c deleted file mode 100644 index ba33dee..0000000 --- a/src/transform.c +++ /dev/null @@ -1,226 +0,0 @@ -/* - * A tool to transform dictionaries dictionaries by an external filter - * - * The external filter needs to process NUL-separated textual entries. - * - * Example: transform input.ifo output -- perl -p0e s/bullshit/soykaf/g - * - * Copyright (c) 2020, Přemysl Eric Janouch - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY - * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION - * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN - * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - * - */ - -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -#include "stardict.h" -#include "stardict-private.h" -#include "generator.h" -#include "utils.h" - -enum { PIPE_READ, PIPE_WRITE }; - - -// --- Main -------------------------------------------------------------------- - -static inline void -print_progress (gulong *last_percent, StardictIterator *iterator, gsize total) -{ - gulong percent = - (gulong) stardict_iterator_get_offset (iterator) * 100 / total; - if (percent != *last_percent) - { - printf ("\r Writing entries... %3lu%%", percent); - *last_percent = percent; - } -} - -static gboolean -write_to_filter (StardictDict *dict, gint fd, GError **error) -{ - StardictInfo *info = stardict_dict_get_info (dict); - gsize n_words = stardict_info_get_word_count (info); - - StardictIterator *iterator = stardict_iterator_new (dict, 0); - gulong last_percent = -1; - while (stardict_iterator_is_valid (iterator)) - { - print_progress (&last_percent, iterator, n_words); - - StardictEntry *entry = stardict_iterator_get_entry (iterator); - for (const GList *fields = stardict_entry_get_fields (entry); - fields; fields = fields->next) - { - StardictEntryField *field = fields->data; - if (!g_ascii_islower (field->type)) - continue; - - if (write (fd, field->data, field->data_size) - != (ssize_t) field->data_size) - { - g_set_error (error, G_IO_ERROR, g_io_error_from_errno (errno), - "%s", g_strerror (errno)); - return FALSE; - } - } - - g_object_unref (entry); - stardict_iterator_next (iterator); - } - printf ("\n"); - return TRUE; -} - -static gboolean -update_from_filter (StardictDict *dict, Generator *generator, - GMappedFile *filtered_file, GError **error) -{ - gchar *filtered = g_mapped_file_get_contents (filtered_file); - gchar *filtered_end = filtered + g_mapped_file_get_length (filtered_file); - - StardictInfo *info = stardict_dict_get_info (dict); - gsize n_words = stardict_info_get_word_count (info); - - StardictIterator *iterator = stardict_iterator_new (dict, 0); - gulong last_percent = -1; - while (stardict_iterator_is_valid (iterator)) - { - print_progress (&last_percent, iterator, n_words); - - StardictEntry *entry = stardict_iterator_get_entry (iterator); - generator_begin_entry (generator); - - for (GList *fields = entry->fields; fields; fields = fields->next) - { - StardictEntryField *field = fields->data; - if (!g_ascii_islower (field->type)) - continue; - - gchar *end = memchr (filtered, 0, filtered_end - filtered); - if (!end) - { - g_set_error (error, G_IO_ERROR, G_IO_ERROR_PARTIAL_INPUT, - "filter seems to have ended too early"); - return FALSE; - } - - g_free (field->data); - field->data = g_strdup (filtered); - field->data_size = end - filtered + 1; - filtered = end + 1; - } - - if (!generator_write_fields (generator, entry->fields, error) - || !generator_finish_entry (generator, - stardict_iterator_get_word (iterator), error)) - return FALSE; - - g_object_unref (entry); - stardict_iterator_next (iterator); - } - printf ("\n"); - return TRUE; -} - -int -main (int argc, char *argv[]) -{ - // The GLib help includes an ellipsis character, for some reason - (void) setlocale (LC_ALL, ""); - - GError *error = NULL; - GOptionContext *ctx = g_option_context_new - ("input.ifo output-basename -- FILTER [ARG...]"); - g_option_context_set_summary - (ctx, "Transform dictionaries using a filter program."); - if (!g_option_context_parse (ctx, &argc, &argv, &error)) - fatal ("Error: option parsing failed: %s\n", error->message); - - if (argc < 3) - fatal ("%s", g_option_context_get_help (ctx, TRUE, NULL)); - - // GLib is bullshit, getopt_long() always correctly removes this - gint program_argv_start = 3; - if (!strcmp (argv[program_argv_start], "--")) - program_argv_start++; - - g_option_context_free (ctx); - - printf ("Loading the original dictionary...\n"); - StardictDict *dict = stardict_dict_new (argv[1], &error); - if (!dict) - fatal ("Error: opening the dictionary failed: %s\n", error->message); - - printf ("Filtering entries...\n"); - gint child_in[2]; - if (!g_unix_open_pipe (child_in, 0, &error)) - fatal ("g_unix_open_pipe: %s\n", error->message); - - FILE *child_out = tmpfile (); - if (!child_out) - fatal ("tmpfile: %s\n", g_strerror (errno)); - - GPid pid = -1; - if (!g_spawn_async_with_fds (NULL /* working_directory */, - argv + program_argv_start /* forward a part of ours */, NULL /* envp */, - G_SPAWN_SEARCH_PATH | G_SPAWN_DO_NOT_REAP_CHILD, - NULL /* child_setup */, NULL /* user_data */, - &pid, child_in[PIPE_READ], fileno (child_out), STDERR_FILENO, &error)) - fatal ("g_spawn: %s\n", error->message); - if (!write_to_filter (dict, child_in[PIPE_WRITE], &error)) - fatal ("write_to_filter: %s\n", error->message); - if (!g_close (child_in[PIPE_READ], &error) - || !g_close (child_in[PIPE_WRITE], &error)) - fatal ("g_close: %s\n", error->message); - - printf ("Waiting for the filter to finish...\n"); - int wstatus = errno = 0; - if (waitpid (pid, &wstatus, 0) < 1 - || !WIFEXITED (wstatus) || WEXITSTATUS (wstatus) > 0) - fatal ("Filter failed (%s, status %d)\n", g_strerror (errno), wstatus); - - GMappedFile *filtered = g_mapped_file_new_from_fd (fileno (child_out), - FALSE /* writable */, &error); - if (!filtered) - fatal ("g_mapped_file_new_from_fd: %s\n", error->message); - - printf ("Writing the new dictionary...\n"); - Generator *generator = generator_new (argv[2], &error); - if (!generator) - fatal ("Error: failed to create the output dictionary: %s\n", - error->message); - - StardictInfo *info = generator->info; - stardict_info_copy (info, stardict_dict_get_info (dict)); - - // This gets incremented each time an entry is finished - info->word_count = 0; - - if (!update_from_filter (dict, generator, filtered, &error) - || !generator_finish (generator, &error)) - fatal ("Error: failed to write the dictionary: %s\n", error->message); - - g_mapped_file_unref (filtered); - fclose (child_out); - generator_free (generator); - g_object_unref (dict); - return 0; -} -- cgit v1.2.3-70-g09d2