From 6c364dc99766bfd3bd86cd262db37b1a766dca1e Mon Sep 17 00:00:00 2001
From: Přemysl Eric Janouch
Date: Wed, 6 Oct 2021 20:13:43 +0200
Subject: Add an implementation of tabfile
The original one is a horrible thing. Now we're self-reliant.
---
CMakeLists.txt | 2 +-
src/tabfile.c | 163 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
src/transform.c | 2 +-
3 files changed, 165 insertions(+), 2 deletions(-)
create mode 100644 src/tabfile.c
diff --git a/CMakeLists.txt b/CMakeLists.txt
index e1cff26..bd42553 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -163,7 +163,7 @@ if (gtk_FOUND)
endif ()
# Tools
-set (tools add-pronunciation query-tool transform)
+set (tools tabfile add-pronunciation query-tool transform)
foreach (tool ${tools})
add_executable (${tool} EXCLUDE_FROM_ALL
src/${tool}.c ${project_common_sources})
diff --git a/src/tabfile.c b/src/tabfile.c
new file mode 100644
index 0000000..0bcbff3
--- /dev/null
+++ b/src/tabfile.c
@@ -0,0 +1,163 @@
+/*
+ * A clean reimplementation of StarDict's tabfile
+ *
+ * Copyright (c) 2020 - 2021, Přemysl Eric Janouch
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+ * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+#include <locale.h>
+
+#include <glib.h>
+#include <gio/gio.h>
+
+#include "stardict.h"
+#include "stardict-private.h"
+#include "generator.h"
+#include "utils.h"
+
+// Store an "invalid data" I/O error with the given literal message in "error".
+// Always returns FALSE, so that callers can report and fail in one statement.
+static gboolean
+set_data_error (GError **error, const char *message)
+{
+	const GQuark domain = G_IO_ERROR;
+	const gint code = G_IO_ERROR_INVALID_DATA;
+
+	g_set_error_literal (error, domain, code, message);
+	return FALSE;
+}
+
+// Maps the character following a backslash to the character it stands for;
+// zero entries mark escape sequences that are not supported
+static const gchar escapes[256] =
+{
+	['n']  = '\n',
+	['t']  = '\t',
+	['\\'] = '\\',
+};
+
+// Resolve backslash escapes within the NUL-terminated string "line",
+// overwriting it in place (the result can never be longer than the input).
+// Returns FALSE and sets "error" when an escape sequence is unsupported,
+// or when the string ends right after a backslash.
+static gboolean
+inplace_unescape (char *line, GError **error)
+{
+	const char *in = line;
+	char *out = line;
+	while (*in)
+	{
+		char c = *in++;
+		if (c != '\\')
+		{
+			*out++ = c;
+			continue;
+		}
+
+		// A backslash needs to be followed by something
+		if (!*in)
+			return set_data_error (error, "trailing escape character");
+
+		// The output pointer never overtakes the input pointer,
+		// so writing here cannot damage any unread input
+		char replacement = escapes[(guchar) *in++];
+		*out++ = replacement;
+		if (!replacement)
+			return set_data_error (error, "unsupported escape");
+	}
+
+	*out = 0;
+	return TRUE;
+}
+
+// Turn one "keyword<TAB>definition" line into a dictionary entry.
+// The buffer in "line" is modified: it gets split at the first tab,
+// the definition is cut at the first CR or LF, and both parts are unescaped.
+// A "len" of zero means there is nothing to import, which is not an error.
+// Returns FALSE with "error" set when the line is malformed,
+// or when the generator fails to write the entry.
+static gboolean
+import_line (Generator *generator, char *line, size_t len, GError **error)
+{
+	if (len == 0)
+		return TRUE;
+
+	char *tab = strchr (line, '\t');
+	if (tab == NULL)
+		return set_data_error (error, "keyword separator not found");
+
+	// Split the buffer into two separate strings
+	char *keyword = line, *definition = tab + 1;
+	*tab = 0;
+
+	// The index wouldn't be sorted correctly with our method
+	if (strchr (keyword, '\\') != NULL)
+		return set_data_error (error, "escapes not allowed in keywords");
+
+	// The line terminator is not a part of the definition
+	char *eol = strpbrk (definition, "\r\n");
+	if (eol != NULL)
+		*eol = 0;
+
+	if (!inplace_unescape (keyword, error))
+		return FALSE;
+	if (!inplace_unescape (definition, error))
+		return FALSE;
+
+	generator_begin_entry (generator);
+	if (!generator_write_string (generator, definition, TRUE, error))
+		return FALSE;
+	return generator_finish_entry (generator, keyword, error);
+}
+
+// Read lines from "fsorted" until getline() fails, and pass each of them
+// to import_line().  Returns TRUE once all input has been imported.
+// Otherwise returns FALSE with "error" set: import failures are prefixed
+// with the number of the offending line, read failures carry the errno text.
+static gboolean
+transform (FILE *fsorted, Generator *generator, GError **error)
+{
+	char *line = NULL;
+	size_t size = 0, ln = 1;
+
+	// Import failures are tracked explicitly: when the last line lacks
+	// a terminator, the stream is already at EOF by the time it is imported,
+	// so feof() alone cannot tell success from failure
+	gboolean imported = TRUE;
+	ssize_t length;
+	while ((length = getline (&line, &size, fsorted)) >= 0)
+	{
+		if (!(imported = import_line (generator, line, length, error)))
+			break;
+		ln++;
+	}
+
+	// Remember why getline() has stopped before free() can clobber it
+	int saved_errno = errno;
+	free (line);
+
+	if (!imported)
+	{
+		// You'll only get good line number output with presorted input!
+		g_prefix_error (error, "line %zu: ", ln);
+		return FALSE;
+	}
+
+	// getline() may also fail without reaching EOF, e.g., on read errors
+	// or when out of memory; always hand a proper error to the caller then
+	if (ferror (fsorted) || !feof (fsorted))
+	{
+		g_set_error_literal (error, G_IO_ERROR,
+			g_io_error_from_errno (saved_errno), g_strerror (saved_errno));
+		return FALSE;
+	}
+	return TRUE;
+}
+
+// Create a StarDict dictionary named by the only positional argument
+// from tab-separated "keyword<TAB>definition" lines on standard input.
+// The input is piped through sort(1), as run by popen() with inherited stdin.
+// NOTE(review): fatal() is assumed to print its message and never return,
+// since the code following its invocations relies on that--confirm in utils.h
+int
+main (int argc, char *argv[])
+{
+	// The GLib help includes an ellipsis character, for some reason
+	(void) setlocale (LC_ALL, "");
+
+	GError *error = NULL;
+	GOptionContext *ctx = g_option_context_new ("output-basename < input");
+	g_option_context_set_summary (ctx,
+		"Create a StarDict dictionary from plaintext.");
+	if (!g_option_context_parse (ctx, &argc, &argv, &error))
+		fatal ("Error: option parsing failed: %s\n", error->message);
+
+	if (argc != 2)
+		fatal ("%s", g_option_context_get_help (ctx, TRUE, FALSE));
+	g_option_context_free (ctx);
+
+	// This actually implements stardict_strcmp(), POSIX-compatibly.
+	// Your sort(1) is not expected to be stable by default, like bsdsort is.
+	FILE *fsorted = popen ("LC_ALL=C sort -t'\t' -k1f,1", "r");
+	if (!fsorted)
+		fatal ("%s: %s\n", "popen", g_strerror (errno));
+
+	Generator *generator = generator_new (argv[1], &error);
+	if (!generator)
+		fatal ("Error: failed to create the output dictionary: %s\n",
+			error->message);
+
+	StardictInfo *info = generator->info;
+	info->version = SD_VERSION_3_0_0;
+	info->book_name = g_strdup (argv[1]);
+	info->same_type_sequence = g_strdup ("m");
+
+	// This gets incremented each time an entry is finished
+	info->word_count = 0;
+
+	if (!transform (fsorted, generator, &error))
+		fatal ("Error: failed to write the dictionary: %s\n", error->message);
+
+	// Streams created by popen() must be closed with pclose(), not fclose().
+	// It also yields the exit status of the command: when sort(1) has failed,
+	// whatever we have read from it cannot be trusted to be complete or sorted.
+	int status = pclose (fsorted);
+	if (status == -1)
+		fatal ("%s: %s\n", "pclose", g_strerror (errno));
+	if (status != 0)
+		fatal ("Error: failed to sort the input (wait status %d)\n", status);
+
+	if (!generator_finish (generator, &error))
+		fatal ("Error: failed to write the dictionary: %s\n", error->message);
+
+	generator_free (generator);
+	return 0;
+}
diff --git a/src/transform.c b/src/transform.c
index e3e8d4d..2144c6b 100644
--- a/src/transform.c
+++ b/src/transform.c
@@ -3,7 +3,7 @@
*
* The external filter needs to process NUL-separated textual entries.
*
- * Example: transform input.info output -- perl -p0e s/bullshit/soykaf/g
+ * Example: transform input.ifo output -- perl -p0e s/bullshit/soykaf/g
*
* Copyright (c) 2020, Přemysl Eric Janouch
*
--
cgit v1.2.3-70-g09d2