From 9d14562f7e81441b244dc9b2e69004ad285a5ff2 Mon Sep 17 00:00:00 2001
From: Přemysl Eric Janouch
Date: Mon, 12 Oct 2020 22:34:46 +0200
Subject: Improve the UTF-8 API

We need to be able to detect partial sequences.
---
Review note (text in this position is ignored when the patch is applied):
utf8_validate() rejects code points above 0x10FFFF from inside the loop.
utf8_decode() has already advanced the pointer by the time the range can
be checked, so merely leaving the loop would let an out-of-range final
character satisfy `s == end` and be reported as valid.

 liberty.c       | 54 +++++++++++++++++++++++++-----------------------------
 tests/liberty.c |  5 +++++
 2 files changed, 30 insertions(+), 29 deletions(-)

diff --git a/liberty.c b/liberty.c
index 6c9cee3..5e15ee2 100644
--- a/liberty.c
+++ b/liberty.c
@@ -2722,63 +2722,59 @@ isspace_ascii (int c)
 
 // --- UTF-8 -------------------------------------------------------------------
 
-/// Return a pointer to the next UTF-8 character, or NULL on error
-static const char *
-utf8_next (const char *s, size_t len, int32_t *codepoint)
+/// Return the value of the UTF-8 character at `*s` and advance the pointer
+/// to the next one. Returns -2 if there is only a partial but possibly valid
+/// character sequence, or -1 on other errors. Either way, `*s` is untouched.
+static int32_t
+utf8_decode (const char **s, size_t len)
 {
 	// End of string, we go no further
 	if (!len)
-		return NULL;
+		return -1;
 
 	// Find out how long the sequence is (0 for ASCII)
 	unsigned mask = 0x80;
 	unsigned sequence_len = 0;
 
-	const uint8_t *p = (const uint8_t *) s;
+	const uint8_t *p = (const uint8_t *) *s, *end = p + len;
 	while ((*p & mask) == mask)
 	{
 		// Invalid start of sequence
 		if (mask == 0xFE)
-			return NULL;
+			return -1;
 
 		mask |= mask >> 1;
 		sequence_len++;
 	}
 
-	// In the middle of a character or the input is too short
-	if (sequence_len == 1 || sequence_len > len)
-		return NULL;
+	// In the middle of a character
+	if (sequence_len == 1)
+		return -1;
 
 	// Check the rest of the sequence
 	uint32_t cp = *p++ & ~mask;
 	while (sequence_len && --sequence_len)
 	{
+		if (p == end)
+			return -2;
 		if ((*p & 0xC0) != 0x80)
-			return NULL;
+			return -1;
 		cp = cp << 6 | (*p++ & 0x3F);
 	}
-	if (codepoint)
-		*codepoint = cp;
-	return (const char *) p;
+	*s = (const char *) p;
+	return cp;
 }
 
 /// Very rough UTF-8 validation, just makes sure codepoints can be iterated
 static bool
 utf8_validate (const char *s, size_t len)
 {
-	const char *next;
-	while (len)
-	{
-		int32_t codepoint;
-		// TODO: better validations
-		if (!(next = utf8_next (s, len, &codepoint))
-		 || codepoint > 0x10FFFF)
-			return false;
-
-		len -= next - s;
-		s = next;
-	}
-	return true;
+	const char *end = s + len;
+	int32_t codepoint;
+	while ((codepoint = utf8_decode (&s, end - s)) >= 0)
+		if (codepoint > 0x10FFFF /* TODO: better validations */)
+			return false;
+	return s == end;
 }
 
 // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
@@ -2802,12 +2798,12 @@ utf8_iter_next (struct utf8_iter *self, size_t *len)
 		return -1;
 
 	const char *old = self->s;
-	int32_t codepoint;
-	if (!soft_assert ((self->s = utf8_next (old, self->len, &codepoint))))
+	int32_t codepoint = utf8_decode (&self->s, self->len);
+	if (!soft_assert (codepoint >= 0))
 	{
 		// Invalid UTF-8
 		self->len = 0;
-		return -1;
+		return codepoint;
 	}
 
 	size_t advance = self->s - old;
diff --git a/tests/liberty.c b/tests/liberty.c
index 65847fe..b55fe2c 100644
--- a/tests/liberty.c
+++ b/tests/liberty.c
@@ -326,6 +326,11 @@ test_str_map (void)
 static void
 test_utf8 (void)
 {
+	const char *full = "\xc5\x99", *partial = full, *empty = full;
+	soft_assert (utf8_decode (&full, 2) == 0x0159);
+	soft_assert (utf8_decode (&partial, 1) == -2);
+	soft_assert (utf8_decode (&empty, 0) == -1);
+
 	const char valid [] = "2H₂ + O₂ ⇌ 2H₂O, R = 4.7 kΩ, ⌀ 200 mm";
 	const char invalid[] = "\xf0\x90\x28\xbc";
 	soft_assert ( utf8_validate (valid, sizeof valid));
-- 
cgit v1.2.3-70-g09d2