diff options
| author | Přemysl Janouch <p.janouch@gmail.com> | 2015-02-24 08:38:28 +0100 | 
|---|---|---|
| committer | Přemysl Janouch <p.janouch@gmail.com> | 2015-02-24 09:27:38 +0100 | 
| commit | 2abbe7017fcc835c06edcc20ab4647edadb43035 (patch) | |
| tree | 136e1561e3d01fd9fddb532b4c0a42101be2050e /src | |
| parent | d93b241a65b7b0c4dcb6e42f957da1ed212221db (diff) | |
| download | tdv-2abbe7017fcc835c06edcc20ab4647edadb43035.tar.gz tdv-2abbe7017fcc835c06edcc20ab4647edadb43035.tar.xz tdv-2abbe7017fcc835c06edcc20ab4647edadb43035.zip | |
Add a custom collation mechanism
A lot better than that StarDict shitfuckery.
Diffstat (limited to 'src')
| Mode | File | Lines changed |
|---|---|---|
| -rw-r--r-- | src/stardict-private.h | 4 |
| -rw-r--r-- | src/stardict.c | 169 |
2 files changed, 158 insertions, 15 deletions
| diff --git a/src/stardict-private.h b/src/stardict-private.h index 4a97eea..123c0c3 100644 --- a/src/stardict-private.h +++ b/src/stardict-private.h @@ -1,7 +1,7 @@  /*   * stardict-private.h: internal StarDict API   * - * Copyright (c) 2013, Přemysl Janouch <p.janouch@gmail.com> + * Copyright (c) 2013 - 2015, Přemysl Janouch <p.janouch@gmail.com>   * All rights reserved.   *   * Permission to use, copy, modify, and/or distribute this software for any @@ -47,6 +47,8 @@ struct stardict_info  	gchar           * description;  	gchar           * date;  	gchar           * same_type_sequence; + +	gchar           * collation;  };  struct stardict_index_entry diff --git a/src/stardict.c b/src/stardict.c index d81848c..b7f09ab 100644 --- a/src/stardict.c +++ b/src/stardict.c @@ -1,7 +1,7 @@  /*   * stardict.c: StarDict API   * - * Copyright (c) 2013, Přemysl Janouch <p.janouch@gmail.com> + * Copyright (c) 2013 - 2015, Přemysl Janouch <p.janouch@gmail.com>   * All rights reserved.   *   * Permission to use, copy, modify, and/or distribute this software for any @@ -27,6 +27,8 @@  #include <gio/gio.h>  #include <glib/gi18n.h> +#include <unicode/ucol.h> +  #include "stardict.h"  #include "stardict-private.h"  #include "dictzip-input-stream.h" @@ -177,6 +179,8 @@ stardict_info_free (StardictInfo *sdi)  	g_free (sdi->description);  	g_free (sdi->date);  	g_free (sdi->same_type_sequence); + +	g_free (sdi->collation);  	g_free (sdi);  } @@ -194,7 +198,10 @@ const struct stardict_ifo_key _stardict_ifo_keys[] =  	DEFINE_IFO_KEY ("website",          STRING, website),  	DEFINE_IFO_KEY ("description",      STRING, description),  	DEFINE_IFO_KEY ("date",             STRING, date), -	DEFINE_IFO_KEY ("sametypesequence", STRING, same_type_sequence) +	DEFINE_IFO_KEY ("sametypesequence", STRING, same_type_sequence), + +	// These are our own custom +	DEFINE_IFO_KEY ("collation",        STRING, collation)  };  gsize _stardict_ifo_keys_length = G_N_ELEMENTS (_stardict_ifo_keys); @@ -358,6 
+365,12 @@ struct stardict_dict_private  	GArray        * index;              //!< Word index  	GArray        * synonyms;           //!< Synonyms +	/* The collated indexes are only permutations of their normal selves. */ + +	UCollator     * collator;           //!< ICU index collator +	GArray        * collated_index;     //!< Sorted indexes into @a index +	GArray        * collated_synonyms;  //!< Sorted indexes into @a synonyms +  	/* There are currently three ways the dictionary data can be read:  	 * through mmap(), from a seekable GInputStream, or from a preallocated  	 * chunk of memory that the whole dictionary has been decompressed into. @@ -384,6 +397,13 @@ stardict_dict_finalize (GObject *self)  	g_array_free (priv->index, TRUE);  	g_array_free (priv->synonyms, TRUE); +	if (priv->collator) +		ucol_close (priv->collator); +	if (priv->collated_index) +		g_array_free (priv->collated_index, TRUE); +	if (priv->collated_synonyms) +		g_array_free (priv->collated_synonyms, TRUE); +  	if (priv->mapped_dict)  		g_mapped_file_unref (priv->mapped_dict);  	else if (priv->dict_stream) @@ -641,6 +661,90 @@ cannot_open:  	return TRUE;  } +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + +/** Compare the two strings by collation rules. */ +static inline gint +stardict_dict_strcoll (gconstpointer s1, gconstpointer s2, gpointer data) +{ +	StardictDict *sd = data; +	UErrorCode error = U_ZERO_ERROR; +	return ucol_strcollUTF8 (sd->priv->collator, s1, -1, s2, -1, &error); +} + +/** Stricter stardict_dict_strcoll() used to sort the collated index. */ +static inline gint +stardict_dict_strcoll_for_sorting +	(gconstpointer s1, gconstpointer s2, gpointer data) +{ +	UCollationResult a = stardict_dict_strcoll (s1, s2, data); +	return a ? 
a : strcmp (s1, s2); +} + +static inline gint +stardict_dict_index_coll_for_sorting +	(gconstpointer x1, gconstpointer x2, gpointer data) +{ +	StardictDict *sd = data; +	const gchar *s1 = g_array_index +		(sd->priv->index, StardictIndexEntry, *(guint32 *) x1).name; +	const gchar *s2 = g_array_index +		(sd->priv->index, StardictIndexEntry, *(guint32 *) x2).name; +	return stardict_dict_strcoll_for_sorting (s1, s2, data); +} + +static inline gint +stardict_dict_synonyms_coll_for_sorting +	(gconstpointer x1, gconstpointer x2, gpointer data) +{ +	StardictDict *sd = data; +	const gchar *s1 = g_array_index +		(sd->priv->index, StardictSynonymEntry, *(guint32 *) x1).word; +	const gchar *s2 = g_array_index +		(sd->priv->index, StardictSynonymEntry, *(guint32 *) x2).word; +	return stardict_dict_strcoll_for_sorting (s1, s2, data); +} + +static gboolean +stardict_dict_set_collation (StardictDict *sd, const gchar *collation) +{ +	StardictDictPrivate *priv = sd->priv; +	UErrorCode error = U_ZERO_ERROR; +	if (!(priv->collator = ucol_open (collation, &error))) +	{ +		// TODO: set a meaningful error +		g_info ("failed to create a collator for `%s'", collation); +		return FALSE; +	} + +	// TODO: if error != U_ZERO_ERROR, report a meaningful message + +	ucol_setAttribute (priv->collator, UCOL_CASE_FIRST, UCOL_OFF, &error); + +	priv->collated_index = g_array_sized_new (FALSE, FALSE, +		sizeof (guint32), priv->index->len); +	for (guint32 i = 0; i < priv->index->len; i++) +		g_array_append_val (priv->collated_index, i); +	g_array_sort_with_data (sd->priv->collated_index, +		stardict_dict_index_coll_for_sorting, sd); + +	priv->collated_synonyms = g_array_sized_new (FALSE, FALSE, +		sizeof (guint32), priv->synonyms->len); +	for (guint32 i = 0; i < priv->synonyms->len; i++) +		g_array_append_val (priv->collated_synonyms, i); +	g_array_sort_with_data (sd->priv->collated_synonyms, +		stardict_dict_synonyms_coll_for_sorting, sd); + +	// Make the collator something like case-insensitive, see: 
+	// http://userguide.icu-project.org/collation/concepts +	// We shouldn't need to sort the data anymore, and if we did, we could just +	// reset the strength to its default value for the given locale. +	ucol_setStrength (priv->collator, UCOL_SECONDARY); +	return TRUE; +} + +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +  /** Load a StarDict dictionary.   *  @param[in] sdi  Parsed .ifo data.  The dictionary assumes ownership.   */ @@ -709,9 +813,12 @@ stardict_dict_new_from_info (StardictInfo *sdi, GError **error)  	gchar *base_syn = g_strconcat (base, ".syn", NULL);  	if (g_file_test (base_syn, G_FILE_TEST_EXISTS | G_FILE_TEST_IS_REGULAR)) -		load_syn (sd, base_syn, NULL); +		(void) load_syn (sd, base_syn, NULL);  	g_free (base_syn); +	if (sdi->collation) +		(void) stardict_dict_set_collation (sd, sdi->collation); +  	g_free (base);  	return sd; @@ -722,6 +829,20 @@ error:  	return NULL;  } +static gint +stardict_dict_cmp_synonym (StardictDict *sd, const gchar *word, gint i) +{ +	GArray *collated = sd->priv->collated_synonyms; +	GArray *synonyms = sd->priv->synonyms; + +	if (sd->priv->collator) +		return stardict_dict_strcoll (word, +			g_array_index (synonyms, StardictSynonymEntry, +				g_array_index (collated, guint32, i)).word, sd); +	return g_ascii_strcasecmp (word, +		g_array_index (synonyms, StardictSynonymEntry, i).word); +} +  /** Return words for which the argument is a synonym of or NULL   *  if there are no such words.   
*/ @@ -731,12 +852,12 @@ stardict_dict_get_synonyms (StardictDict *sd, const gchar *word)  	GArray *synonyms = sd->priv->synonyms;  	GArray *index = sd->priv->index; -	BINARY_SEARCH_BEGIN (synonyms->len - 1, g_ascii_strcasecmp (word, -			g_array_index (synonyms, StardictSynonymEntry, imid).word)) +	BINARY_SEARCH_BEGIN (synonyms->len - 1, +		stardict_dict_cmp_synonym (sd, word, imid))  	// Back off to the first matching entry -	while (imid > 0 && !g_ascii_strcasecmp (word, -		g_array_index (synonyms, StardictSynonymEntry, --imid).word)); +	while (imid > 0 && !stardict_dict_cmp_synonym (sd, word, --imid)) +		;  	GPtrArray *array = g_ptr_array_new (); @@ -751,10 +872,23 @@ stardict_dict_get_synonyms (StardictDict *sd, const gchar *word)  	return (gchar **) g_ptr_array_free (array, FALSE);  	BINARY_SEARCH_END -  	return NULL;  } +static gint +stardict_dict_cmp_index (StardictDict *sd, const gchar *word, gint i) +{ +	GArray *collated = sd->priv->collated_index; +	GArray *index = sd->priv->index; + +	if (sd->priv->collator) +		return stardict_dict_strcoll (word, +			g_array_index (index, StardictIndexEntry, +				g_array_index (collated, guint32, i)).name, sd); +	return g_ascii_strcasecmp (word, +		g_array_index (index, StardictIndexEntry, i).name); +} +  /** Search for a word.  The search is ASCII-case-insensitive.   
*  @param[in] word  The word in utf-8 encoding   *  @param[out] success  TRUE if found @@ -765,12 +899,11 @@ stardict_dict_search (StardictDict *sd, const gchar *word, gboolean *success)  {  	GArray *index = sd->priv->index; -	BINARY_SEARCH_BEGIN (index->len - 1, g_ascii_strcasecmp (word, -		g_array_index (index, StardictIndexEntry, imid).name)) +	BINARY_SEARCH_BEGIN (index->len - 1, +		stardict_dict_cmp_index (sd, word, imid))  	// Back off to the first matching entry -	while (imid > 0 && !g_ascii_strcasecmp (word, -		g_array_index (index, StardictIndexEntry, imid - 1).name)) +	while (imid > 0 && !stardict_dict_cmp_index (sd, word, imid - 1))  		imid--;  	if (success) *success = TRUE; @@ -1051,6 +1184,13 @@ stardict_iterator_new (StardictDict *sd, guint32 offset)  	return si;  } +static gint64 +stardict_iterator_get_real_offset (StardictIterator *sdi) +{ +	return sdi->owner->priv->collator ? g_array_index +		(sdi->owner->priv->collated_index, guint32, sdi->offset) : sdi->offset; +} +  /** Return the word in the index that the iterator points at, or NULL. */  const gchar *  stardict_iterator_get_word (StardictIterator *sdi) @@ -1059,7 +1199,7 @@ stardict_iterator_get_word (StardictIterator *sdi)  	if (!stardict_iterator_is_valid (sdi))  		return NULL;  	return g_array_index (sdi->owner->priv->index, -		StardictIndexEntry, sdi->offset).name; +		StardictIndexEntry, stardict_iterator_get_real_offset (sdi)).name;  }  /** Return the dictionary entry that the iterator points at, or NULL. */ @@ -1069,7 +1209,8 @@ stardict_iterator_get_entry (StardictIterator *sdi)  	g_return_val_if_fail (STARDICT_IS_ITERATOR (sdi), NULL);  	if (!stardict_iterator_is_valid (sdi))  		return FALSE; -	return stardict_dict_get_entry (sdi->owner, sdi->offset); +	return stardict_dict_get_entry (sdi->owner, +		stardict_iterator_get_real_offset (sdi));  }  /** Return whether the iterator points to a valid index entry. */ | 
