diff options
| author | Přemysl Eric Janouch <p@janouch.name> | 2020-09-03 23:17:17 +0200 | 
|---|---|---|
| committer | Přemysl Eric Janouch <p@janouch.name> | 2020-09-04 00:13:34 +0200 | 
| commit | 8d19acd91af9592d862ef2a7aa8e95eea4160152 (patch) | |
| tree | fb2a422cf9446829c41e152415f128b47babe938 | |
| parent | dd2bd04a07030f15e6eb6875041f95c74023dd35 (diff) | |
| download | tdv-8d19acd91af9592d862ef2a7aa8e95eea4160152.tar.gz tdv-8d19acd91af9592d862ef2a7aa8e95eea4160152.tar.xz tdv-8d19acd91af9592d862ef2a7aa8e95eea4160152.zip | |
Add a tool to transform dictionaries
| -rw-r--r-- | CMakeLists.txt | 15 | ||||
| -rw-r--r-- | README.adoc | 5 | ||||
| -rw-r--r-- | src/add-pronunciation.c | 29 | ||||
| -rw-r--r-- | src/generator.c | 30 | ||||
| -rw-r--r-- | src/generator.h | 7 | ||||
| -rw-r--r-- | src/transform.c | 270 | 
6 files changed, 317 insertions, 39 deletions
| diff --git a/CMakeLists.txt b/CMakeLists.txt index 6edd410..3bb97aa 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -180,15 +180,14 @@ add_executable (${PROJECT_NAME}  target_link_libraries (${PROJECT_NAME} ${project_common_libraries})  # Tools -add_executable (query-tool EXCLUDE_FROM_ALL -	src/query-tool.c ${project_common_sources}) -target_link_libraries (query-tool ${project_common_libraries}) +set (tools add-pronunciation query-tool transform) +foreach (tool ${tools}) +	add_executable (${tool} EXCLUDE_FROM_ALL +		src/${tool}.c ${project_common_sources}) +	target_link_libraries (${tool} ${project_common_libraries}) +endforeach (tool) -add_executable (add-pronunciation EXCLUDE_FROM_ALL -	src/add-pronunciation.c ${project_common_sources}) -target_link_libraries (add-pronunciation ${project_common_libraries}) - -add_custom_target (tools DEPENDS add-pronunciation query-tool) +add_custom_target (tools DEPENDS ${tools})  # The files to be installed  include (GNUInstallDirs) diff --git a/README.adoc b/README.adoc index cfad569..fb89f18 100644 --- a/README.adoc +++ b/README.adoc @@ -100,6 +100,11 @@ Dictionaries  Unfortunately this application only really works with specific dictionaries.  Word definitions have to be in plain text, separated by newlines. +You may use the included transform tool to transform existing dictionaries that +are almost useful as they are, e.g. after stripping XML tags.  You might want to +fix up the `sametypesequence` of the resulting '.ifo' file afterwards, and run +dictzip on the resulting '.dict' file. +  https://mega.co.nz/#!axtD0QRK!sbtBgizksyfkPqKvKEgr8GQ11rsWhtqyRgUUV0B7pwg[  CZ <--> { EN, DE, PL, RU } dictionaries] diff --git a/src/add-pronunciation.c b/src/add-pronunciation.c index 6ca5ad3..26261f9 100644 --- a/src/add-pronunciation.c +++ b/src/add-pronunciation.c @@ -282,32 +282,6 @@ stardict_info_copy (StardictInfo *dest, const StardictInfo *src)  	}  } -/// Write a list of data fields back to a dictionary. 
-static gboolean -write_fields (Generator *generator, GList *fields, gboolean sts, GError **error) -{ -	while (fields) -	{ -		StardictEntryField *field = fields->data; -		if (!sts && !generator_write_type (generator, field->type, error)) -			return FALSE; - -		gboolean mark_end = !sts || fields->next != NULL; -		if (g_ascii_islower (field->type)) -		{ -			if (!generator_write_string (generator, -				field->data, mark_end, error)) -				return FALSE; -		} -		else if (!generator_write_raw (generator, -			field->data, field->data_size, mark_end, error)) -			return FALSE; - -		fields = fields->next; -	} -	return TRUE; -} -  int  main (int argc, char *argv[])  { @@ -516,8 +490,7 @@ G_GNUC_END_IGNORE_DEPRECATIONS  			start_link.next = entry->fields;  			start_link.data = &field; -			if (!write_fields (generator, &start_link, -					info->same_type_sequence != NULL, &error) +			if (!generator_write_fields (generator, &start_link, &error)  			 || !generator_finish_entry (generator,  					stardict_iterator_get_word (iterator), &error))  			{ diff --git a/src/generator.c b/src/generator.c index 9f6be9b..25c8e43 100644 --- a/src/generator.c +++ b/src/generator.c @@ -1,7 +1,7 @@  /*   * generator.c: dictionary generator   * - * Copyright (c) 2013, Přemysl Eric Janouch <p@janouch.name> + * Copyright (c) 2013 - 2020, Přemysl Eric Janouch <p@janouch.name>   *   * Permission to use, copy, modify, and/or distribute this software for any   * purpose with or without fee is hereby granted. @@ -170,6 +170,34 @@ generator_write_string (Generator *self,  	return TRUE;  } +/// Write a list of data fields back to a dictionary.  The list has to be +/// acceptable for the generated dictionary's sametypesequence (or lack of). 
+gboolean +generator_write_fields (Generator *self, const GList *fields, GError **error) +{ +	gboolean sts = self->info->same_type_sequence != NULL; +	while (fields) +	{ +		StardictEntryField *field = fields->data; +		if (!sts && !generator_write_type (self, field->type, error)) +			return FALSE; + +		gboolean mark_end = !sts || fields->next != NULL; +		if (g_ascii_islower (field->type)) +		{ +			if (!generator_write_string (self, +				field->data, mark_end, error)) +				return FALSE; +		} +		else if (!generator_write_raw (self, +			field->data, field->data_size, mark_end, error)) +			return FALSE; + +		fields = fields->next; +	} +	return TRUE; +} +  /// Finishes the current entry and writes it into the index.  gboolean  generator_finish_entry (Generator *self, const gchar *word, GError **error) diff --git a/src/generator.h b/src/generator.h index 554e7ed..ba19d58 100644 --- a/src/generator.h +++ b/src/generator.h @@ -4,7 +4,7 @@   * Nothing fancy.  Just something moved out off the `stardict' test to be   * conveniently reused by the included tools.   * - * Copyright (c) 2013, Přemysl Eric Janouch <p@janouch.name> + * Copyright (c) 2013 - 2020, Přemysl Eric Janouch <p@janouch.name>   *   * Permission to use, copy, modify, and/or distribute this software for any   * purpose with or without fee is hereby granted. 
@@ -42,12 +42,15 @@ Generator *generator_new (const gchar *base, GError **error);  gboolean generator_finish (Generator *self, GError **error);  void generator_free (Generator *self); -void generator_begin_entry (Generator *self);  gboolean generator_write_type (Generator *self, gchar type, GError **error);  gboolean generator_write_raw (Generator *self,  	gpointer data, gsize data_size, gboolean mark_end, GError **error);  gboolean generator_write_string (Generator *self,  	const gchar *s, gboolean mark_end, GError **error); + +void generator_begin_entry (Generator *self); +gboolean generator_write_fields (Generator *self, +	const GList *fields, GError **error);  gboolean generator_finish_entry (Generator *self,  	const gchar *word, GError **error); diff --git a/src/transform.c b/src/transform.c new file mode 100644 index 0000000..2d5c2f2 --- /dev/null +++ b/src/transform.c @@ -0,0 +1,270 @@ +/* + * A tool to transform dictionaries by an external filter + * + * The external filter needs to process NUL-separated textual entries. + * + * Example: transform input.ifo output -- perl -p0e s/bullshit/soykaf/g + * + * Copyright (c) 2020, Přemysl Eric Janouch <p@janouch.name> + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY + * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION + * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN + * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
+ * + */ + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <errno.h> +#include <locale.h> + +#include <glib.h> +#include <glib/gstdio.h> +#include <glib-unix.h> +#include <gio/gio.h> + +#include "stardict.h" +#include "stardict-private.h" +#include "generator.h" + +enum { PIPE_READ, PIPE_WRITE }; + + +// --- Main -------------------------------------------------------------------- + +static inline void +print_progress (gulong *last_percent, StardictIterator *iterator, gsize total) +{ +	gulong percent = +		(gulong) stardict_iterator_get_offset (iterator) * 100 / total; +	if (percent != *last_percent) +	{ +		printf ("\r  Writing entries... %3lu%%", percent); +		*last_percent = percent; +	} +} + +static gboolean +write_to_filter (StardictDict *dict, gint fd, GError **error) +{ +	StardictInfo *info = stardict_dict_get_info (dict); +	gsize n_words = stardict_info_get_word_count (info); + +	StardictIterator *iterator = stardict_iterator_new (dict, 0); +	gulong last_percent = -1; +	while (stardict_iterator_is_valid (iterator)) +	{ +		print_progress (&last_percent, iterator, n_words); + +		StardictEntry *entry = stardict_iterator_get_entry (iterator); +		for (const GList *fields = stardict_entry_get_fields (entry); +			fields; fields = fields->next) +		{ +			StardictEntryField *field = fields->data; +			if (!g_ascii_islower (field->type)) +				continue; + +			if (write (fd, field->data, field->data_size) +				!= (ssize_t) field->data_size) +			{ +				g_set_error (error, G_IO_ERROR, G_IO_ERROR_FAILED, +					"%s", strerror (errno)); +				return FALSE; +			} +		} + +		g_object_unref (entry); +		stardict_iterator_next (iterator); +	} +	printf ("\n"); +	return TRUE; +} + +static gboolean +update_from_filter (StardictDict *dict, Generator *generator, +	GMappedFile *filtered_file, GError **error) +{ +	gchar *filtered = g_mapped_file_get_contents (filtered_file); +	gchar *filtered_end = filtered + g_mapped_file_get_length (filtered_file); + +	StardictInfo 
*info = stardict_dict_get_info (dict); +	gsize n_words = stardict_info_get_word_count (info); + +	StardictIterator *iterator = stardict_iterator_new (dict, 0); +	gulong last_percent = -1; +	while (stardict_iterator_is_valid (iterator)) +	{ +		print_progress (&last_percent, iterator, n_words); + +		StardictEntry *entry = stardict_iterator_get_entry (iterator); +		generator_begin_entry (generator); + +		for (GList *fields = entry->fields; fields; fields = fields->next) +		{ +			StardictEntryField *field = fields->data; +			if (!g_ascii_islower (field->type)) +				continue; + +			gchar *end = memchr (filtered, 0, filtered_end - filtered); +			if (!end) +			{ +				g_set_error (error, G_IO_ERROR, G_IO_ERROR_FAILED, +					"filter seems to have ended too early"); +				return FALSE; +			} + +			g_free (field->data); +			field->data = g_strdup (filtered); +			field->data_size = end - filtered + 1; +			filtered = end + 1; +		} + +		if (!generator_write_fields (generator, entry->fields, error) +		 || !generator_finish_entry (generator, +				stardict_iterator_get_word (iterator), error)) +			return FALSE; + +		g_object_unref (entry); +		stardict_iterator_next (iterator); +	} +	printf ("\n"); +	return TRUE; +} + +// FIXME: copied from add-pronunciation.c, should merge it somewhere (utils?) +/// Copy the contents of one StardictInfo object into another.  Ignores path. +static void +stardict_info_copy (StardictInfo *dest, const StardictInfo *src) +{ +	dest->version = src->version; + +	guint i; +	for (i = 0; i < _stardict_ifo_keys_length; i++) +	{ +		const struct stardict_ifo_key *key = &_stardict_ifo_keys[i]; +		if (key->type == IFO_STRING) +		{ +			gchar **p = &G_STRUCT_MEMBER (gchar *, dest, key->offset); +			gchar  *q =  G_STRUCT_MEMBER (gchar *, src,  key->offset); + +			g_free (*p); +			*p = q ? 
g_strdup (q) : NULL; +		} +		else +			G_STRUCT_MEMBER (gulong, dest, key->offset) = +				G_STRUCT_MEMBER (gulong, src, key->offset); +	} +} + +int +main (int argc, char *argv[]) +{ +	// The GLib help includes an ellipsis character, for some reason +	(void) setlocale (LC_ALL, ""); + +	GError *error = NULL; +	GOptionContext *ctx = g_option_context_new +		("input.ifo output-basename -- FILTER [ARG...]"); +	g_option_context_set_summary +		(ctx, "Transform dictionaries using a filter program."); +	g_option_context_set_description (ctx, "Test?"); +	if (!g_option_context_parse (ctx, &argc, &argv, &error)) +	{ +		g_printerr ("Error: option parsing failed: %s\n", error->message); +		exit (EXIT_FAILURE); +	} + +	if (argc < 3) +	{ +		gchar *help = g_option_context_get_help (ctx, TRUE, FALSE); +		g_printerr ("%s", help); +		g_free (help); +		exit (EXIT_FAILURE); +	} + +	// GLib is bullshit, getopt_long() always correctly removes this +	gint program_argv_start = 3; +	if (!strcmp (argv[program_argv_start], "--")) +		program_argv_start++; + +	g_option_context_free (ctx); + +	printf ("Loading the original dictionary...\n"); +	StardictDict *dict = stardict_dict_new (argv[1], &error); +	if (!dict) +	{ +		g_printerr ("Error: opening the dictionary failed: %s\n", +			error->message); +		exit (EXIT_FAILURE); +	} + +	printf ("Filtering entries...\n"); +	gint child_in[2]; +	if (!g_unix_open_pipe (child_in, 0, &error)) +		g_error ("g_unix_open_pipe: %s", error->message); + +	FILE *child_out = tmpfile (); +	if (!child_out) +		g_error ("tmpfile: %s", strerror (errno)); + +	GPid pid = -1; +	if (!g_spawn_async_with_fds (NULL /* working_directory */, +		argv + program_argv_start /* forward a part of ours */, NULL /* envp */, +		G_SPAWN_SEARCH_PATH | G_SPAWN_DO_NOT_REAP_CHILD, +		NULL /* child_setup */, NULL /* user_data */, +		&pid, child_in[PIPE_READ], fileno (child_out), STDERR_FILENO, &error)) +		g_error ("g_spawn: %s", error->message); +	if (!write_to_filter (dict, child_in[PIPE_WRITE], 
&error)) +		g_error ("write_to_filter: %s", error->message); +	if (!g_close (child_in[PIPE_READ], &error) +	 || !g_close (child_in[PIPE_WRITE], &error)) +		g_error ("g_close: %s", error->message); + +	printf ("Waiting for the filter to finish...\n"); +	int wstatus = errno = 0; +	if (waitpid (pid, &wstatus, 0) < 1 +	 || !WIFEXITED (wstatus) || WEXITSTATUS (wstatus) > 0) +		g_error ("Filter failed (%s, status %d)", strerror (errno), wstatus); + +	GMappedFile *filtered = g_mapped_file_new_from_fd (fileno (child_out), +		FALSE /* writable */, &error); +	if (!filtered) +		g_error ("g_mapped_file_new_from_fd: %s", error->message); + +	printf ("Writing the new dictionary...\n"); +	Generator *generator = generator_new (argv[2], &error); +	if (!generator) +	{ +		g_printerr ("Error: failed to create the output dictionary: %s\n", +			error->message); +		exit (EXIT_FAILURE); +	} + +	StardictInfo *info = generator->info; +	stardict_info_copy (info, stardict_dict_get_info (dict)); + +	// This gets incremented each time an entry is finished +	info->word_count = 0; + +	if (!update_from_filter (dict, generator, filtered, &error) +	 || !generator_finish (generator, &error)) +	{ +		g_printerr ("Error: failed to write the dictionary: %s\n", +			error->message); +		exit (EXIT_FAILURE); +	} + +	g_mapped_file_unref (filtered); +	fclose (child_out); +	generator_free (generator); +	g_object_unref (dict); +	return 0; +} | 
