Add a custom collation mechanism

A lot better than that StarDict shitfuckery.
author: Přemysl Janouch <p.janouch@gmail.com> 2015-02-24 08:38:28 +0100
committer: Přemysl Janouch <p.janouch@gmail.com> 2015-02-24 09:27:38 +0100
commit: 2abbe7017fcc835c06edcc20ab4647edadb43035 (patch)
tree: 136e1561e3d01fd9fddb532b4c0a42101be2050e /src
parent: d93b241a65b7b0c4dcb6e42f957da1ed212221db (diff)
download: tdv-2abbe7017fcc835c06edcc20ab4647edadb43035.tar.gz
tdv-2abbe7017fcc835c06edcc20ab4647edadb43035.tar.xz
tdv-2abbe7017fcc835c06edcc20ab4647edadb43035.zip
2 files changed, 158 insertions, 15 deletions
diff --git a/src/stardict-private.h b/src/stardict-private.h
index 4a97eea..123c0c3 100644
--- a/src/stardict-private.h
+++ b/src/stardict-private.h
@@ -1,7 +1,7 @@
 /*
  * stardict-private.h: internal StarDict API
  *
- * Copyright (c) 2013, Přemysl Janouch <p.janouch@gmail.com>
+ * Copyright (c) 2013 - 2015, Přemysl Janouch <p.janouch@gmail.com>
  * All rights reserved.
  *
  * Permission to use, copy, modify, and/or distribute this software for any
@@ -47,6 +47,8 @@ struct stardict_info
 	gchar           * description;
 	gchar           * date;
 	gchar           * same_type_sequence;
+
+	gchar           * collation;
 };
 
 struct stardict_index_entry
diff --git a/src/stardict.c b/src/stardict.c
index d81848c..b7f09ab 100644
--- a/src/stardict.c
+++ b/src/stardict.c
@@ -1,7 +1,7 @@
 /*
  * stardict.c: StarDict API
  *
- * Copyright (c) 2013, Přemysl Janouch <p.janouch@gmail.com>
+ * Copyright (c) 2013 - 2015, Přemysl Janouch <p.janouch@gmail.com>
  * All rights reserved.
  *
  * Permission to use, copy, modify, and/or distribute this software for any
@@ -27,6 +27,8 @@
 #include <gio/gio.h>
 #include <glib/gi18n.h>
 
+#include <unicode/ucol.h>
+
 #include "stardict.h"
 #include "stardict-private.h"
 #include "dictzip-input-stream.h"
@@ -177,6 +179,8 @@ stardict_info_free (StardictInfo *sdi)
 	g_free (sdi->description);
 	g_free (sdi->date);
 	g_free (sdi->same_type_sequence);
+
+	g_free (sdi->collation);
 	g_free (sdi);
 }
 
@@ -194,7 +198,10 @@ const struct stardict_ifo_key _stardict_ifo_keys[] =
 	DEFINE_IFO_KEY ("website",          STRING, website),
 	DEFINE_IFO_KEY ("description",      STRING, description),
 	DEFINE_IFO_KEY ("date",             STRING, date),
-	DEFINE_IFO_KEY ("sametypesequence", STRING, same_type_sequence)
+	DEFINE_IFO_KEY ("sametypesequence", STRING, same_type_sequence),
+
+	// These are our own custom extensions
+	DEFINE_IFO_KEY ("collation",        STRING, collation)
 };
 
 gsize _stardict_ifo_keys_length = G_N_ELEMENTS (_stardict_ifo_keys);
@@ -358,6 +365,12 @@ struct stardict_dict_private
 	GArray        * index;              //!< Word index
 	GArray        * synonyms;           //!< Synonyms
 
+	/* The collated indexes are only permutations of their normal selves. */
+
+	UCollator     * collator;           //!< ICU index collator
+	GArray        * collated_index;     //!< Sorted indexes into @a index
+	GArray        * collated_synonyms;  //!< Sorted indexes into @a synonyms
+
 	/* There are currently three ways the dictionary data can be read:
 	 * through mmap(), from a seekable GInputStream, or from a preallocated
 	 * chunk of memory that the whole dictionary has been decompressed into.
@@ -384,6 +397,13 @@ stardict_dict_finalize (GObject *self)
 	g_array_free (priv->index, TRUE);
 	g_array_free (priv->synonyms, TRUE);
 
+	if (priv->collator)
+		ucol_close (priv->collator);
+	if (priv->collated_index)
+		g_array_free (priv->collated_index, TRUE);
+	if (priv->collated_synonyms)
+		g_array_free (priv->collated_synonyms, TRUE);
+
 	if (priv->mapped_dict)
 		g_mapped_file_unref (priv->mapped_dict);
 	else if (priv->dict_stream)
@@ -641,6 +661,90 @@ cannot_open:
 	return TRUE;
 }
 
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+
+/** Collate two UTF-8 strings with the collator of the @a data dictionary. */
+static inline gint
+stardict_dict_strcoll (gconstpointer s1, gconstpointer s2, gpointer data)
+{
+	UErrorCode status = U_ZERO_ERROR;
+	UCollator *collator = ((StardictDict *) data)->priv->collator;
+	return ucol_strcollUTF8 (collator, s1, -1, s2, -1, &status);
+}
+
+/** Like stardict_dict_strcoll() but breaks collation ties with strcmp(). */
+static inline gint
+stardict_dict_strcoll_for_sorting
+	(gconstpointer s1, gconstpointer s2, gpointer data)
+{
+	UCollationResult order = stardict_dict_strcoll (s1, s2, data);
+	return order != UCOL_EQUAL ? (gint) order : strcmp (s1, s2);
+}
+
+/** Compare two @a collated_index elements by the names they refer to. */
+static inline gint
+stardict_dict_index_coll_for_sorting
+	(gconstpointer x1, gconstpointer x2, gpointer data)
+{
+	GArray *index = ((StardictDict *) data)->priv->index;
+	const guint32 i1 = *(const guint32 *) x1, i2 = *(const guint32 *) x2;
+	return stardict_dict_strcoll_for_sorting
+		(g_array_index (index, StardictIndexEntry, i1).name,
+		 g_array_index (index, StardictIndexEntry, i2).name, data);
+}
+
+/** Compare two @a collated_synonyms elements by the words they refer to. */
+static inline gint
+stardict_dict_synonyms_coll_for_sorting
+	(gconstpointer x1, gconstpointer x2, gpointer data)
+{
+	GArray *synonyms = ((StardictDict *) data)->priv->synonyms;
+	const guint32 i1 = *(const guint32 *) x1, i2 = *(const guint32 *) x2;
+	return stardict_dict_strcoll_for_sorting
+		(g_array_index (synonyms, StardictSynonymEntry, i1).word,
+		 g_array_index (synonyms, StardictSynonymEntry, i2).word, data);
+}
+
+/** Open an ICU collator for @a collation and build both collated indexes.
+ *  @return TRUE on success, FALSE if the collator could not be opened
+ */
+static gboolean
+stardict_dict_set_collation (StardictDict *sd, const gchar *collation)
+{
+	StardictDictPrivate *priv = sd->priv;
+	UErrorCode status = U_ZERO_ERROR;
+	priv->collator = ucol_open (collation, &status);
+	if (!priv->collator)
+	{
+		// TODO: set a meaningful error
+		g_info ("failed to create a collator for `%s'", collation);
+		return FALSE;
+	}
+
+	// TODO: if status != U_ZERO_ERROR, report a meaningful message
+	ucol_setAttribute (priv->collator, UCOL_CASE_FIRST, UCOL_OFF, &status);
+
+	GArray *by_word = priv->collated_index = g_array_sized_new
+		(FALSE, FALSE, sizeof (guint32), priv->index->len);
+	for (guint32 n = 0; n < priv->index->len; n++)
+		g_array_append_val (by_word, n);
+	g_array_sort_with_data (by_word, stardict_dict_index_coll_for_sorting, sd);
+
+	GArray *by_synonym = priv->collated_synonyms = g_array_sized_new
+		(FALSE, FALSE, sizeof (guint32), priv->synonyms->len);
+	for (guint32 n = 0; n < priv->synonyms->len; n++)
+		g_array_append_val (by_synonym, n);
+	g_array_sort_with_data (by_synonym,
+		stardict_dict_synonyms_coll_for_sorting, sd);
+
+	// Sorting is done; weaken the collator so that lookups behave roughly
+	// case-insensitively: http://userguide.icu-project.org/collation/concepts
+	ucol_setStrength (priv->collator, UCOL_SECONDARY);
+	return TRUE;
+}
+
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+
 /** Load a StarDict dictionary.
  *  @param[in] sdi  Parsed .ifo data.  The dictionary assumes ownership.
  */
@@ -709,9 +813,12 @@ stardict_dict_new_from_info (StardictInfo *sdi, GError **error)
 
 	gchar *base_syn = g_strconcat (base, ".syn", NULL);
 	if (g_file_test (base_syn, G_FILE_TEST_EXISTS | G_FILE_TEST_IS_REGULAR))
-		load_syn (sd, base_syn, NULL);
+		(void) load_syn (sd, base_syn, NULL);
 	g_free (base_syn);
 
+	if (sdi->collation)
+		(void) stardict_dict_set_collation (sd, sdi->collation);
+
 	g_free (base);
 	return sd;
 
@@ -722,6 +829,20 @@ error:
 	return NULL;
 }
 
+/** Compare @a word with the i-th synonym in the current lookup order. */
+static gint
+stardict_dict_cmp_synonym (StardictDict *sd, const gchar *word, gint i)
+{
+	StardictDictPrivate *priv = sd->priv;
+	if (!priv->collator)
+		return g_ascii_strcasecmp (word,
+			g_array_index (priv->synonyms, StardictSynonymEntry, i).word);
+	// With a collator, position i has to be mapped through the permutation
+	guint32 real = g_array_index (priv->collated_synonyms, guint32, i);
+	return stardict_dict_strcoll (word,
+		g_array_index (priv->synonyms, StardictSynonymEntry, real).word, sd);
+}
+
 /** Return words for which the argument is a synonym of or NULL
  *  if there are no such words.
  */
@@ -731,12 +852,12 @@ stardict_dict_get_synonyms (StardictDict *sd, const gchar *word)
 	GArray *synonyms = sd->priv->synonyms;
 	GArray *index = sd->priv->index;
 
-	BINARY_SEARCH_BEGIN (synonyms->len - 1, g_ascii_strcasecmp (word,
-			g_array_index (synonyms, StardictSynonymEntry, imid).word))
+	BINARY_SEARCH_BEGIN (synonyms->len - 1,
+		stardict_dict_cmp_synonym (sd, word, imid))
 
 	// Back off to the first matching entry
-	while (imid > 0 && !g_ascii_strcasecmp (word,
-		g_array_index (synonyms, StardictSynonymEntry, --imid).word));
+	while (imid > 0 && !stardict_dict_cmp_synonym (sd, word, --imid))
+		;
 
 	GPtrArray *array = g_ptr_array_new ();
 
@@ -751,10 +872,23 @@ stardict_dict_get_synonyms (StardictDict *sd, const gchar *word)
 	return (gchar **) g_ptr_array_free (array, FALSE);
 
 	BINARY_SEARCH_END
-
 	return NULL;
 }
 
+/** Compare @a word with the i-th index entry in the current lookup order. */
+static gint
+stardict_dict_cmp_index (StardictDict *sd, const gchar *word, gint i)
+{
+	StardictDictPrivate *priv = sd->priv;
+	if (!priv->collator)
+		return g_ascii_strcasecmp (word,
+			g_array_index (priv->index, StardictIndexEntry, i).name);
+	// With a collator, position i has to be mapped through the permutation
+	guint32 real = g_array_index (priv->collated_index, guint32, i);
+	return stardict_dict_strcoll (word,
+		g_array_index (priv->index, StardictIndexEntry, real).name, sd);
+}
+
 /** Search for a word.  The search is ASCII-case-insensitive.
  *  @param[in] word  The word in utf-8 encoding
  *  @param[out] success  TRUE if found
@@ -765,12 +899,11 @@ stardict_dict_search (StardictDict *sd, const gchar *word, gboolean *success)
 {
 	GArray *index = sd->priv->index;
 
-	BINARY_SEARCH_BEGIN (index->len - 1, g_ascii_strcasecmp (word,
-		g_array_index (index, StardictIndexEntry, imid).name))
+	BINARY_SEARCH_BEGIN (index->len - 1,
+		stardict_dict_cmp_index (sd, word, imid))
 
 	// Back off to the first matching entry
-	while (imid > 0 && !g_ascii_strcasecmp (word,
-		g_array_index (index, StardictIndexEntry, imid - 1).name))
+	while (imid > 0 && !stardict_dict_cmp_index (sd, word, imid - 1))
 		imid--;
 
 	if (success) *success = TRUE;
@@ -1051,6 +1184,13 @@ stardict_iterator_new (StardictDict *sd, guint32 offset)
 	return si;
 }
 
+static gint64  /* offset mapped through collated_index if a collator is set */
+stardict_iterator_get_real_offset (StardictIterator *sdi)
+{
+	return sdi->owner->priv->collator ? g_array_index
+		(sdi->owner->priv->collated_index, guint32, sdi->offset) : sdi->offset;
+}
+
 /** Return the word in the index that the iterator points at, or NULL. */
 const gchar *
 stardict_iterator_get_word (StardictIterator *sdi)
@@ -1059,7 +1199,7 @@ stardict_iterator_get_word (StardictIterator *sdi)
 	if (!stardict_iterator_is_valid (sdi))
 		return NULL;
 	return g_array_index (sdi->owner->priv->index,
-		StardictIndexEntry, sdi->offset).name;
+		StardictIndexEntry, stardict_iterator_get_real_offset (sdi)).name;
 }
 
 /** Return the dictionary entry that the iterator points at, or NULL. */
@@ -1069,7 +1209,8 @@ stardict_iterator_get_entry (StardictIterator *sdi)
 	g_return_val_if_fail (STARDICT_IS_ITERATOR (sdi), NULL);
 	if (!stardict_iterator_is_valid (sdi))
 		return FALSE;
-	return stardict_dict_get_entry (sdi->owner, sdi->offset);
+	return stardict_dict_get_entry (sdi->owner,
+		stardict_iterator_get_real_offset (sdi));
 }
 
 /** Return whether the iterator points to a valid index entry. */
author	Přemysl Janouch <p.janouch@gmail.com>	2015-02-24 08:38:28 +0100
committer	Přemysl Janouch <p.janouch@gmail.com>	2015-02-24 09:27:38 +0100
commit	2abbe7017fcc835c06edcc20ab4647edadb43035 (patch)
tree	136e1561e3d01fd9fddb532b4c0a42101be2050e /src
parent	d93b241a65b7b0c4dcb6e42f957da1ed212221db (diff)
download	tdv-2abbe7017fcc835c06edcc20ab4647edadb43035.tar.gz tdv-2abbe7017fcc835c06edcc20ab4647edadb43035.tar.xz tdv-2abbe7017fcc835c06edcc20ab4647edadb43035.zip