diff options
author | Přemysl Janouch <p.janouch@gmail.com> | 2016-09-26 15:59:26 +0200 |
---|---|---|
committer | Přemysl Janouch <p.janouch@gmail.com> | 2016-09-26 15:59:26 +0200 |
commit | a59104191270b74d0a1312a6b83884ffb9334691 (patch) | |
tree | f2cfc11f27110e8570871773db0236a08bff7778 /src/stardict.c | |
parent | a87aca9c76c57770a69740b7e1ac9a56eb73f22c (diff) | |
download | tdv-a59104191270b74d0a1312a6b83884ffb9334691.tar.gz tdv-a59104191270b74d0a1312a6b83884ffb9334691.tar.xz tdv-a59104191270b74d0a1312a6b83884ffb9334691.zip |
Highlight the longest matching prefix of entries
Diffstat (limited to 'src/stardict.c')
-rw-r--r-- | src/stardict.c | 68 |
1 files changed, 68 insertions, 0 deletions
diff --git a/src/stardict.c b/src/stardict.c index 42c7548..9ce6059 100644 --- a/src/stardict.c +++ b/src/stardict.c @@ -29,6 +29,7 @@ #include <unicode/ucol.h> #include <unicode/ustring.h> +#include <unicode/ubrk.h> #include "stardict.h" #include "stardict-private.h" @@ -934,6 +935,73 @@ stardict_dict_search (StardictDict *sd, const gchar *word, gboolean *success) return stardict_iterator_new (sd, imin); } +/// Return the longest sequence of bytes from @a s1 that form a common prefix +/// with @a s2 wrt. collation rules for this dictionary. +size_t +stardict_longest_common_collation_prefix (StardictDict *sd, + const gchar *s1, const gchar *s2) +{ + UErrorCode error; + int32_t uc1_len = 0; + int32_t uc2_len = 0; + + // It sets the error to overflow each time, even during pre-flight + error = U_ZERO_ERROR; + u_strFromUTF8 (NULL, 0, &uc1_len, s1, -1, &error); + error = U_ZERO_ERROR; + u_strFromUTF8 (NULL, 0, &uc2_len, s2, -1, &error); + error = U_ZERO_ERROR; + + UChar uc1[uc1_len]; + UChar uc2[uc2_len]; + u_strFromUTF8 (uc1, uc1_len, NULL, s1, -1, &error); + u_strFromUTF8 (uc2, uc2_len, NULL, s2, -1, &error); + + // Both inputs need to be valid UTF-8 because of all the iteration mess + if (U_FAILURE (error)) + return 0; + + // ucol_getSortKey() can't be used for these purposes, so the only + // reasonable thing remaining is iterating by full graphemes. It doesn't + // work entirely correctly (e.g. Czech "ch" should be regarded as a single + // unit, and punctuation could be ignored). It's just good enough. + // + // In theory we could set the strength to UCOL_PRIMARY and ignore accents + // but that's likely not what the user wants most of the time. + // + // Locale shouldn't matter much with graphemes, let's use the default. + UBreakIterator *it1 = + ubrk_open (UBRK_CHARACTER, NULL, uc1, uc1_len, &error); + UBreakIterator *it2 = + ubrk_open (UBRK_CHARACTER, NULL, uc2, uc2_len, &error); + + int32_t longest = 0; + int32_t pos1, pos2; + while ((pos1 = ubrk_next (it1)) != UBRK_DONE + && (pos2 = ubrk_next (it2)) != UBRK_DONE) + { + if (!ucol_strcoll (sd->priv->collator, uc1, pos1, uc2, pos2)) + longest = pos1; + } + ubrk_close (it1); + ubrk_close (it2); + + if (!longest) + return 0; + + int32_t common_len = 0; + u_strToUTF8 (NULL, 0, &common_len, uc1, longest, &error); + + // Since this heavily depends on UTF-16 <-> UTF-8 not modifying the chars + // (surrogate pairs interference?), let's add some paranoia here + char common[common_len]; + error = U_ZERO_ERROR; + u_strToUTF8 (common, common_len, NULL, uc1, longest, &error); + g_return_val_if_fail (!memcmp (s1, common, common_len), 0); + + return (size_t) common_len; +} + static void stardict_entry_field_free (StardictEntryField *sef) { |