aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorPřemysl Janouch <p.janouch@gmail.com>2015-02-24 08:38:28 +0100
committerPřemysl Janouch <p.janouch@gmail.com>2015-02-24 09:27:38 +0100
commit2abbe7017fcc835c06edcc20ab4647edadb43035 (patch)
tree136e1561e3d01fd9fddb532b4c0a42101be2050e
parentd93b241a65b7b0c4dcb6e42f957da1ed212221db (diff)
downloadtdv-2abbe7017fcc835c06edcc20ab4647edadb43035.tar.gz
tdv-2abbe7017fcc835c06edcc20ab4647edadb43035.tar.xz
tdv-2abbe7017fcc835c06edcc20ab4647edadb43035.zip
Add a custom collation mechanism
A lot better than that StarDict shitfuckery.
-rw-r--r--.travis.yml2
-rw-r--r--CMakeLists.txt3
-rw-r--r--README10
-rw-r--r--src/stardict-private.h4
-rw-r--r--src/stardict.c169
5 files changed, 170 insertions, 18 deletions
diff --git a/.travis.yml b/.travis.yml
index 159fd4d..068dc7c 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -23,7 +23,7 @@ before_install:
- sudo apt-get update -qq
install:
- sudo apt-get install -y xsltproc docbook-xsl zlib1g-dev libncursesw5-dev
- - sudo apt-get install -y libgtk-3-dev libpango1.0-dev
+ - sudo apt-get install -y libgtk-3-dev libpango1.0-dev libicu-dev
before_script:
- mkdir build
- cd build
diff --git a/CMakeLists.txt b/CMakeLists.txt
index e7ac15e..c50450e 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -24,7 +24,8 @@ set (project_VERSION "${project_VERSION}.${project_VERSION_PATCH}")
# Dependencies
find_package (ZLIB REQUIRED)
find_package (PkgConfig REQUIRED)
-pkg_check_modules (dependencies REQUIRED ncursesw glib-2.0 gio-2.0 pango)
+pkg_check_modules (dependencies REQUIRED
+ ncursesw glib-2.0 gio-2.0 pango icu-uc icu-i18n)
if (USE_SYSTEM_TERMO)
find_package (Termo REQUIRED)
diff --git a/README b/README
index 0e19183..3be700b 100644
--- a/README
+++ b/README
@@ -19,7 +19,7 @@ this regard.
Building and Running
--------------------
Build dependencies: CMake, pkg-config, xsltproc, docbook-xsl,
- ncursesw, zlib, termo (included),
+ ncursesw, zlib, ICU, termo (included),
glib-2.0, pango, gtk+ (optional, any version)
$ git clone https://github.com/pjanouch/sdtui.git
@@ -45,6 +45,14 @@ argument. If you want the application to watch the X11 primary selection for
changes and automatically search for the selected text, use the -w switch.
This feature requires GTK+.
+Extensions
+----------
+As the original StarDict is rather messy with regard to collation of
+dictionary entries, I had to introduce an additional "collation" field into the
+.ifo file. When sdtui discovers this field while reading the dictionary, it
+automatically reorders the index according to that locale (e.g. "cs_CZ").
+This operation may take a little while.
+
Dictionaries
------------
Unfortunately this application only really works with specific dictionaries.
diff --git a/src/stardict-private.h b/src/stardict-private.h
index 4a97eea..123c0c3 100644
--- a/src/stardict-private.h
+++ b/src/stardict-private.h
@@ -1,7 +1,7 @@
/*
* stardict-private.h: internal StarDict API
*
- * Copyright (c) 2013, Přemysl Janouch <p.janouch@gmail.com>
+ * Copyright (c) 2013 - 2015, Přemysl Janouch <p.janouch@gmail.com>
* All rights reserved.
*
* Permission to use, copy, modify, and/or distribute this software for any
@@ -47,6 +47,8 @@ struct stardict_info
gchar * description;
gchar * date;
gchar * same_type_sequence;
+
+ gchar * collation;
};
struct stardict_index_entry
diff --git a/src/stardict.c b/src/stardict.c
index d81848c..b7f09ab 100644
--- a/src/stardict.c
+++ b/src/stardict.c
@@ -1,7 +1,7 @@
/*
* stardict.c: StarDict API
*
- * Copyright (c) 2013, Přemysl Janouch <p.janouch@gmail.com>
+ * Copyright (c) 2013 - 2015, Přemysl Janouch <p.janouch@gmail.com>
* All rights reserved.
*
* Permission to use, copy, modify, and/or distribute this software for any
@@ -27,6 +27,8 @@
#include <gio/gio.h>
#include <glib/gi18n.h>
+#include <unicode/ucol.h>
+
#include "stardict.h"
#include "stardict-private.h"
#include "dictzip-input-stream.h"
@@ -177,6 +179,8 @@ stardict_info_free (StardictInfo *sdi)
g_free (sdi->description);
g_free (sdi->date);
g_free (sdi->same_type_sequence);
+
+ g_free (sdi->collation);
g_free (sdi);
}
@@ -194,7 +198,10 @@ const struct stardict_ifo_key _stardict_ifo_keys[] =
DEFINE_IFO_KEY ("website", STRING, website),
DEFINE_IFO_KEY ("description", STRING, description),
DEFINE_IFO_KEY ("date", STRING, date),
- DEFINE_IFO_KEY ("sametypesequence", STRING, same_type_sequence)
+ DEFINE_IFO_KEY ("sametypesequence", STRING, same_type_sequence),
+
+ // These are our own custom extensions
+ DEFINE_IFO_KEY ("collation", STRING, collation)
};
gsize _stardict_ifo_keys_length = G_N_ELEMENTS (_stardict_ifo_keys);
@@ -358,6 +365,12 @@ struct stardict_dict_private
GArray * index; //!< Word index
GArray * synonyms; //!< Synonyms
+ /* The collated indexes are only permutations of their normal selves. */
+
+ UCollator * collator; //!< ICU index collator
+ GArray * collated_index; //!< Sorted indexes into @a index
+ GArray * collated_synonyms; //!< Sorted indexes into @a synonyms
+
/* There are currently three ways the dictionary data can be read:
* through mmap(), from a seekable GInputStream, or from a preallocated
* chunk of memory that the whole dictionary has been decompressed into.
@@ -384,6 +397,13 @@ stardict_dict_finalize (GObject *self)
g_array_free (priv->index, TRUE);
g_array_free (priv->synonyms, TRUE);
+ if (priv->collator)
+ ucol_close (priv->collator);
+ if (priv->collated_index)
+ g_array_free (priv->collated_index, TRUE);
+ if (priv->collated_synonyms)
+ g_array_free (priv->collated_synonyms, TRUE);
+
if (priv->mapped_dict)
g_mapped_file_unref (priv->mapped_dict);
else if (priv->dict_stream)
@@ -641,6 +661,90 @@ cannot_open:
return TRUE;
}
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+
+/** Compare two strings with the dictionary's ICU collator (@a data). */
+static inline gint
+stardict_dict_strcoll (gconstpointer s1, gconstpointer s2, gpointer data)
+{
+ StardictDict *sd = data; // the dictionary whose collator is to be used
+ UErrorCode error = U_ZERO_ERROR; // filled in by ICU, never inspected here
+ return ucol_strcollUTF8 (sd->priv->collator, s1, -1, s2, -1, &error);
+}
+
+/** Like stardict_dict_strcoll(), but breaks collation ties using strcmp(). */
+static inline gint
+stardict_dict_strcoll_for_sorting
+ (gconstpointer s1, gconstpointer s2, gpointer data)
+{
+ UCollationResult a = stardict_dict_strcoll (s1, s2, data);
+ return a ? a : strcmp (s1, s2); // used to sort the collated arrays
+}
+
+static inline gint
+stardict_dict_index_coll_for_sorting
+ (gconstpointer x1, gconstpointer x2, gpointer data)
+{ // x1, x2: pointers to guint32 positions in priv->index; data: the dictionary
+ StardictDict *sd = data;
+ const gchar *s1 = g_array_index
+ (sd->priv->index, StardictIndexEntry, *(guint32 *) x1).name;
+ const gchar *s2 = g_array_index
+ (sd->priv->index, StardictIndexEntry, *(guint32 *) x2).name;
+ return stardict_dict_strcoll_for_sorting (s1, s2, data); // order by name
+}
+
+/** Sort callback for the collated synonyms: @a x1 and @a x2 point at guint32
+ * positions into the synonyms array, @a data is the owning StardictDict.
+ * Entries are ordered by their word using the strict collation comparison. */
+static inline gint
+stardict_dict_synonyms_coll_for_sorting
+ (gconstpointer x1, gconstpointer x2, gpointer data)
+{
+ StardictDict *sd = data;
+ // The positions refer to priv->synonyms, whose elements are
+ // StardictSynonymEntry; they must not be looked up in priv->index
+ const gchar *s1 = g_array_index
+ (sd->priv->synonyms, StardictSynonymEntry, *(guint32 *) x1).word;
+ const gchar *s2 = g_array_index
+ (sd->priv->synonyms, StardictSynonymEntry, *(guint32 *) x2).word;
+ return stardict_dict_strcoll_for_sorting (s1, s2, data);
+}
+
+static gboolean // FALSE if the collator cannot be opened, TRUE otherwise
+stardict_dict_set_collation (StardictDict *sd, const gchar *collation)
+{ // Opens an ICU collator and builds the collated index/synonym permutations
+ StardictDictPrivate *priv = sd->priv;
+ UErrorCode error = U_ZERO_ERROR;
+ if (!(priv->collator = ucol_open (collation, &error)))
+ {
+ // TODO: set a meaningful error; for now the failure is only logged
+ g_info ("failed to create a collator for `%s'", collation);
+ return FALSE; // priv->collator has just been assigned NULL
+ }
+
+ // TODO: if error != U_ZERO_ERROR, report a meaningful message
+
+ ucol_setAttribute (priv->collator, UCOL_CASE_FIRST, UCOL_OFF, &error);
+
+ priv->collated_index = g_array_sized_new (FALSE, FALSE,
+ sizeof (guint32), priv->index->len);
+ for (guint32 i = 0; i < priv->index->len; i++) // fill with 0 .. len - 1
+ g_array_append_val (priv->collated_index, i);
+ g_array_sort_with_data (sd->priv->collated_index,
+ stardict_dict_index_coll_for_sorting, sd); // permute into collated order
+
+ priv->collated_synonyms = g_array_sized_new (FALSE, FALSE,
+ sizeof (guint32), priv->synonyms->len);
+ for (guint32 i = 0; i < priv->synonyms->len; i++) // fill with 0 .. len - 1
+ g_array_append_val (priv->collated_synonyms, i);
+ g_array_sort_with_data (sd->priv->collated_synonyms,
+ stardict_dict_synonyms_coll_for_sorting, sd); // permute into collated order
+
+ // Make the collator something like case-insensitive, see:
+ // http://userguide.icu-project.org/collation/concepts
+ // We shouldn't need to sort the data anymore, and if we did, we could just
+ // reset the strength to its default value for the given locale.
+ ucol_setStrength (priv->collator, UCOL_SECONDARY); // only after both sorts
+ return TRUE;
+}
+
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+
/** Load a StarDict dictionary.
* @param[in] sdi Parsed .ifo data. The dictionary assumes ownership.
*/
@@ -709,9 +813,12 @@ stardict_dict_new_from_info (StardictInfo *sdi, GError **error)
gchar *base_syn = g_strconcat (base, ".syn", NULL);
if (g_file_test (base_syn, G_FILE_TEST_EXISTS | G_FILE_TEST_IS_REGULAR))
- load_syn (sd, base_syn, NULL);
+ (void) load_syn (sd, base_syn, NULL);
g_free (base_syn);
+ if (sdi->collation)
+ (void) stardict_dict_set_collation (sd, sdi->collation);
+
g_free (base);
return sd;
@@ -722,6 +829,20 @@ error:
return NULL;
}
+static gint
+stardict_dict_cmp_synonym (StardictDict *sd, const gchar *word, gint i)
+{ // Compare @a word with the synonym found at search position @a i
+ GArray *collated = sd->priv->collated_synonyms;
+ GArray *synonyms = sd->priv->synonyms;
+
+ if (sd->priv->collator) // @a i is a position within collated_synonyms
+ return stardict_dict_strcoll (word,
+ g_array_index (synonyms, StardictSynonymEntry,
+ g_array_index (collated, guint32, i)).word, sd);
+ return g_ascii_strcasecmp (word, // no collator: @a i indexes synonyms itself
+ g_array_index (synonyms, StardictSynonymEntry, i).word);
+}
+
/** Return words for which the argument is a synonym of or NULL
* if there are no such words.
*/
@@ -731,12 +852,12 @@ stardict_dict_get_synonyms (StardictDict *sd, const gchar *word)
GArray *synonyms = sd->priv->synonyms;
GArray *index = sd->priv->index;
- BINARY_SEARCH_BEGIN (synonyms->len - 1, g_ascii_strcasecmp (word,
- g_array_index (synonyms, StardictSynonymEntry, imid).word))
+ BINARY_SEARCH_BEGIN (synonyms->len - 1,
+ stardict_dict_cmp_synonym (sd, word, imid))
// Back off to the first matching entry
- while (imid > 0 && !g_ascii_strcasecmp (word,
- g_array_index (synonyms, StardictSynonymEntry, --imid).word));
+ while (imid > 0 && !stardict_dict_cmp_synonym (sd, word, --imid))
+ ;
GPtrArray *array = g_ptr_array_new ();
@@ -751,10 +872,23 @@ stardict_dict_get_synonyms (StardictDict *sd, const gchar *word)
return (gchar **) g_ptr_array_free (array, FALSE);
BINARY_SEARCH_END
-
return NULL;
}
+static gint
+stardict_dict_cmp_index (StardictDict *sd, const gchar *word, gint i)
+{ // Compare @a word with the index entry found at search position @a i
+ GArray *collated = sd->priv->collated_index;
+ GArray *index = sd->priv->index;
+
+ if (sd->priv->collator) // @a i is a position within collated_index
+ return stardict_dict_strcoll (word,
+ g_array_index (index, StardictIndexEntry,
+ g_array_index (collated, guint32, i)).name, sd);
+ return g_ascii_strcasecmp (word, // no collator: @a i indexes the index itself
+ g_array_index (index, StardictIndexEntry, i).name);
+}
+
/** Search for a word. The search is ASCII-case-insensitive.
* @param[in] word The word in utf-8 encoding
* @param[out] success TRUE if found
@@ -765,12 +899,11 @@ stardict_dict_search (StardictDict *sd, const gchar *word, gboolean *success)
{
GArray *index = sd->priv->index;
- BINARY_SEARCH_BEGIN (index->len - 1, g_ascii_strcasecmp (word,
- g_array_index (index, StardictIndexEntry, imid).name))
+ BINARY_SEARCH_BEGIN (index->len - 1,
+ stardict_dict_cmp_index (sd, word, imid))
// Back off to the first matching entry
- while (imid > 0 && !g_ascii_strcasecmp (word,
- g_array_index (index, StardictIndexEntry, imid - 1).name))
+ while (imid > 0 && !stardict_dict_cmp_index (sd, word, imid - 1))
imid--;
if (success) *success = TRUE;
@@ -1051,6 +1184,13 @@ stardict_iterator_new (StardictDict *sd, guint32 offset)
return si;
}
+static gint64 // position to use with priv->index for the iterator's offset
+stardict_iterator_get_real_offset (StardictIterator *sdi)
+{ // With a collator, sdi->offset is looked up in collated_index; no range check
+ return sdi->owner->priv->collator ? g_array_index
+ (sdi->owner->priv->collated_index, guint32, sdi->offset) : sdi->offset;
+}
+
/** Return the word in the index that the iterator points at, or NULL. */
const gchar *
stardict_iterator_get_word (StardictIterator *sdi)
@@ -1059,7 +1199,7 @@ stardict_iterator_get_word (StardictIterator *sdi)
if (!stardict_iterator_is_valid (sdi))
return NULL;
return g_array_index (sdi->owner->priv->index,
- StardictIndexEntry, sdi->offset).name;
+ StardictIndexEntry, stardict_iterator_get_real_offset (sdi)).name;
}
/** Return the dictionary entry that the iterator points at, or NULL. */
@@ -1069,7 +1209,8 @@ stardict_iterator_get_entry (StardictIterator *sdi)
g_return_val_if_fail (STARDICT_IS_ITERATOR (sdi), NULL);
if (!stardict_iterator_is_valid (sdi))
return FALSE;
- return stardict_dict_get_entry (sdi->owner, sdi->offset);
+ return stardict_dict_get_entry (sdi->owner,
+ stardict_iterator_get_real_offset (sdi));
}
/** Return whether the iterator points to a valid index entry. */