about | summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Přemysl Eric Janouch <p@janouch.name> 2020-10-12 22:34:46 +0200
committer Přemysl Eric Janouch <p@janouch.name> 2020-10-12 22:56:22 +0200
commit 9d14562f7e81441b244dc9b2e69004ad285a5ff2 (patch)
tree 08c44be11f4023f6bd4a008d64c28c67dcd1b07b
parent 9b723049635329feda63fab675df794e429aa151 (diff)
download liberty-9d14562f7e81441b244dc9b2e69004ad285a5ff2.tar.gz
liberty-9d14562f7e81441b244dc9b2e69004ad285a5ff2.tar.xz
liberty-9d14562f7e81441b244dc9b2e69004ad285a5ff2.zip
Improve the UTF-8 API
We need to be able to detect partial sequences.
-rw-r--r-- liberty.c 54
-rw-r--r-- tests/liberty.c 5
2 files changed, 30 insertions, 29 deletions
diff --git a/liberty.c b/liberty.c
index 6c9cee3..5e15ee2 100644
--- a/liberty.c
+++ b/liberty.c
@@ -2722,63 +2722,59 @@ isspace_ascii (int c)
// --- UTF-8 -------------------------------------------------------------------
-/// Return a pointer to the next UTF-8 character, or NULL on error
-static const char *
-utf8_next (const char *s, size_t len, int32_t *codepoint)
+/// Return the value of the UTF-8 character at `*s` and advance the pointer
+/// to the next one. Returns -2 if there is only a partial but possibly valid
+/// character sequence, or -1 on other errors. Either way, `*s` is untouched.
+static int32_t
+utf8_decode (const char **s, size_t len)
{
// End of string, we go no further
if (!len)
- return NULL;
+ return -1;
// Find out how long the sequence is (0 for ASCII)
unsigned mask = 0x80;
unsigned sequence_len = 0;
- const uint8_t *p = (const uint8_t *) s;
+ const uint8_t *p = (const uint8_t *) *s, *end = p + len;
while ((*p & mask) == mask)
{
// Invalid start of sequence
if (mask == 0xFE)
- return NULL;
+ return -1;
mask |= mask >> 1;
sequence_len++;
}
- // In the middle of a character or the input is too short
- if (sequence_len == 1 || sequence_len > len)
- return NULL;
+ // In the middle of a character
+ if (sequence_len == 1)
+ return -1;
// Check the rest of the sequence
uint32_t cp = *p++ & ~mask;
while (sequence_len && --sequence_len)
{
+ if (p == end)
+ return -2;
if ((*p & 0xC0) != 0x80)
- return NULL;
+ return -1;
cp = cp << 6 | (*p++ & 0x3F);
}
- if (codepoint)
- *codepoint = cp;
- return (const char *) p;
+ *s = (const char *) p;
+ return cp;
}
/// Very rough UTF-8 validation, just makes sure codepoints can be iterated
static bool
utf8_validate (const char *s, size_t len)
{
- const char *next;
- while (len)
- {
- int32_t codepoint;
- // TODO: better validations
- if (!(next = utf8_next (s, len, &codepoint))
- || codepoint > 0x10FFFF)
- return false;
-
- len -= next - s;
- s = next;
- }
- return true;
+ const char *end = s + len;
+ int32_t codepoint;
+ while ((codepoint = utf8_decode (&s, end - s)) >= 0
+ && codepoint <= 0x10FFFF /* TODO: better validations */)
+ ;
+ return s == end;
}
// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
@@ -2802,12 +2798,12 @@ utf8_iter_next (struct utf8_iter *self, size_t *len)
return -1;
const char *old = self->s;
- int32_t codepoint;
- if (!soft_assert ((self->s = utf8_next (old, self->len, &codepoint))))
+ int32_t codepoint = utf8_decode (&self->s, self->len);
+ if (!soft_assert (codepoint >= 0))
{
// Invalid UTF-8
self->len = 0;
- return -1;
+ return codepoint;
}
size_t advance = self->s - old;
diff --git a/tests/liberty.c b/tests/liberty.c
index 65847fe..b55fe2c 100644
--- a/tests/liberty.c
+++ b/tests/liberty.c
@@ -326,6 +326,11 @@ test_str_map (void)
static void
test_utf8 (void)
{
+ const char *full = "\xc5\x99", *partial = full, *empty = full;
+ soft_assert (utf8_decode (&full, 2) == 0x0159);
+ soft_assert (utf8_decode (&partial, 1) == -2);
+ soft_assert (utf8_decode (&empty, 0) == -1);
+
const char valid [] = "2H₂ + O₂ ⇌ 2H₂O, R = 4.7 kΩ, ⌀ 200 mm";
const char invalid[] = "\xf0\x90\x28\xbc";
soft_assert ( utf8_validate (valid, sizeof valid));