about | summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Paul LeoNerd Evans <leonerd@leonerd.org.uk> 2011-03-31 23:42:52 +0100
committer: Paul LeoNerd Evans <leonerd@leonerd.org.uk> 2011-03-31 23:42:52 +0100
commit: f1b3dff4c2075d9304dd4c298db433c7d404f3a3 (patch)
tree: 0705094f83a89c098ce3293edacc6e36c888f7ef
parent: 739be0e55d1301856ae6e86f85903a7f86b3b878 (diff)
download: termo-f1b3dff4c2075d9304dd4c298db433c7d404f3a3.tar.gz
download: termo-f1b3dff4c2075d9304dd4c298db433c7d404f3a3.tar.xz
download: termo-f1b3dff4c2075d9304dd4c298db433c7d404f3a3.zip
Extract UTF-8 sequence parser into its own function, fix bugs, apply unit tests
-rw-r--r-- t/03utf8.c 93
-rw-r--r-- termkey.c 145
2 files changed, 169 insertions, 69 deletions
diff --git a/t/03utf8.c b/t/03utf8.c
index d4d8da3..34fce6f 100644
--- a/t/03utf8.c
+++ b/t/03utf8.c
@@ -7,7 +7,7 @@ int main(int argc, char *argv[])
TermKey *tk;
TermKeyKey key;
- plan_tests(21);
+ plan_tests(57);
pipe(fd);
@@ -72,6 +72,97 @@ int main(int argc, char *argv[])
is_int(key.type, TERMKEY_TYPE_UNICODE, "key.type UTF-8 4 high");
is_int(key.code.number, 0x10FFFF, "key.code.number UTF-8 4 high");
+ /* Invalid continuations */
+
+ write(fd[1], "\xC2!", 2);
+
+ termkey_advisereadable(tk);
+ is_int(termkey_getkey(tk, &key), TERMKEY_RES_KEY, "getkey yields RES_KEY UTF-8 2 invalid cont");
+ is_int(key.code.number, 0xFFFD, "key.code.number UTF-8 2 invalid cont");
+ is_int(termkey_getkey(tk, &key), TERMKEY_RES_KEY, "getkey yields RES_KEY UTF-8 2 invalid after");
+ is_int(key.code.number, '!', "key.code.number UTF-8 2 invalid after");
+
+ write(fd[1], "\xE0!", 2);
+
+ termkey_advisereadable(tk);
+ is_int(termkey_getkey(tk, &key), TERMKEY_RES_KEY, "getkey yields RES_KEY UTF-8 3 invalid cont");
+ is_int(key.code.number, 0xFFFD, "key.code.number UTF-8 3 invalid cont");
+ is_int(termkey_getkey(tk, &key), TERMKEY_RES_KEY, "getkey yields RES_KEY UTF-8 3 invalid after");
+ is_int(key.code.number, '!', "key.code.number UTF-8 3 invalid after");
+
+ write(fd[1], "\xE0\xA0!", 3);
+
+ termkey_advisereadable(tk);
+ is_int(termkey_getkey(tk, &key), TERMKEY_RES_KEY, "getkey yields RES_KEY UTF-8 3 invalid cont 2");
+ is_int(key.code.number, 0xFFFD, "key.code.number UTF-8 3 invalid cont 2");
+ is_int(termkey_getkey(tk, &key), TERMKEY_RES_KEY, "getkey yields RES_KEY UTF-8 3 invalid after");
+ is_int(key.code.number, '!', "key.code.number UTF-8 3 invalid after");
+
+ write(fd[1], "\xF0!", 2);
+
+ termkey_advisereadable(tk);
+ is_int(termkey_getkey(tk, &key), TERMKEY_RES_KEY, "getkey yields RES_KEY UTF-8 4 invalid cont");
+ is_int(key.code.number, 0xFFFD, "key.code.number UTF-8 4 invalid cont");
+ is_int(termkey_getkey(tk, &key), TERMKEY_RES_KEY, "getkey yields RES_KEY UTF-8 4 invalid after");
+ is_int(key.code.number, '!', "key.code.number UTF-8 4 invalid after");
+
+ write(fd[1], "\xF0\x90!", 3);
+
+ termkey_advisereadable(tk);
+ is_int(termkey_getkey(tk, &key), TERMKEY_RES_KEY, "getkey yields RES_KEY UTF-8 4 invalid cont 2");
+ is_int(key.code.number, 0xFFFD, "key.code.number UTF-8 4 invalid cont 2");
+ is_int(termkey_getkey(tk, &key), TERMKEY_RES_KEY, "getkey yields RES_KEY UTF-8 4 invalid after");
+ is_int(key.code.number, '!', "key.code.number UTF-8 4 invalid after");
+
+ write(fd[1], "\xF0\x90\x80!", 4);
+
+ termkey_advisereadable(tk);
+ is_int(termkey_getkey(tk, &key), TERMKEY_RES_KEY, "getkey yields RES_KEY UTF-8 4 invalid cont 3");
+ is_int(key.code.number, 0xFFFD, "key.code.number UTF-8 4 invalid cont 3");
+ is_int(termkey_getkey(tk, &key), TERMKEY_RES_KEY, "getkey yields RES_KEY UTF-8 4 invalid after");
+ is_int(key.code.number, '!', "key.code.number UTF-8 4 invalid after");
+
+ /* Partials */
+
+ write(fd[1], "\xC2", 1);
+ termkey_advisereadable(tk);
+ is_int(termkey_getkey(tk, &key), TERMKEY_RES_AGAIN, "getkey yields RES_AGAIN UTF-8 2 partial");
+
+ write(fd[1], "\xA0", 1);
+ termkey_advisereadable(tk);
+ is_int(termkey_getkey(tk, &key), TERMKEY_RES_KEY, "getkey yields RES_KEY UTF-8 2 partial");
+ is_int(key.code.number, 0x00A0, "key.code.number UTF-8 2 partial");
+
+ write(fd[1], "\xE0", 1);
+ termkey_advisereadable(tk);
+ is_int(termkey_getkey(tk, &key), TERMKEY_RES_AGAIN, "getkey yields RES_AGAIN UTF-8 3 partial");
+
+ write(fd[1], "\xA0", 1);
+ termkey_advisereadable(tk);
+ is_int(termkey_getkey(tk, &key), TERMKEY_RES_AGAIN, "getkey yields RES_AGAIN UTF-8 3 partial");
+
+ write(fd[1], "\x80", 1);
+ termkey_advisereadable(tk);
+ is_int(termkey_getkey(tk, &key), TERMKEY_RES_KEY, "getkey yields RES_KEY UTF-8 3 partial");
+ is_int(key.code.number, 0x0800, "key.code.number UTF-8 3 partial");
+
+ write(fd[1], "\xF0", 1);
+ termkey_advisereadable(tk);
+ is_int(termkey_getkey(tk, &key), TERMKEY_RES_AGAIN, "getkey yields RES_AGAIN UTF-8 4 partial");
+
+ write(fd[1], "\x90", 1);
+ termkey_advisereadable(tk);
+ is_int(termkey_getkey(tk, &key), TERMKEY_RES_AGAIN, "getkey yields RES_AGAIN UTF-8 4 partial");
+
+ write(fd[1], "\x80", 1);
+ termkey_advisereadable(tk);
+ is_int(termkey_getkey(tk, &key), TERMKEY_RES_AGAIN, "getkey yields RES_AGAIN UTF-8 4 partial");
+
+ write(fd[1], "\x80", 1);
+ termkey_advisereadable(tk);
+ is_int(termkey_getkey(tk, &key), TERMKEY_RES_KEY, "getkey yields RES_KEY UTF-8 4 partial");
+ is_int(key.code.number, 0x10000, "key.code.number UTF-8 4 partial");
+
termkey_destroy(tk);
return exit_status();
diff --git a/termkey.c b/termkey.c
index 881122e..05c1512 100644
--- a/termkey.c
+++ b/termkey.c
@@ -422,6 +422,76 @@ static void fill_utf8(TermKeyKey *key)
}
}
+#define UTF8_INVALID 0xFFFD // code point stored in *cp whenever parse_utf8 rejects its input
+static TermKeyResult parse_utf8(const unsigned char *bytes, size_t len, long *cp, size_t *nbytep) // Decode one UTF-8 sequence from the first len bytes at bytes; result goes to *cp, bytes consumed to *nbytep
+{ // Returns TERMKEY_RES_KEY with *cp and *nbytep set, or TERMKEY_RES_AGAIN when len is too short to finish the sequence
+ unsigned int nbytes; // total sequence length implied by the lead byte
+
+ unsigned char b0 = bytes[0]; // lead byte; read without consulting len
+
+ if(b0 < 0xc0) { // NOTE(review): this test is also true for b0 below 0x80 - presumably callers never pass such bytes; confirm
+ // Starts with a continuation byte - that's not right
+ *cp = UTF8_INVALID;
+ *nbytep = 1; // only the offending byte is counted
+ return TERMKEY_RES_KEY;
+ }
+ else if(b0 < 0xe0) {
+ nbytes = 2;
+ *cp = b0 & 0x1f; // keep the low 5 bits of the lead byte
+ }
+ else if(b0 < 0xf0) {
+ nbytes = 3;
+ *cp = b0 & 0x0f; // keep the low 4 bits of the lead byte
+ }
+ else if(b0 < 0xf8) {
+ nbytes = 4;
+ *cp = b0 & 0x07; // keep the low 3 bits of the lead byte
+ }
+ else if(b0 < 0xfc) {
+ nbytes = 5;
+ *cp = b0 & 0x03; // keep the low 2 bits of the lead byte
+ }
+ else if(b0 < 0xfe) {
+ nbytes = 6;
+ *cp = b0 & 0x01; // keep the low bit of the lead byte
+ }
+ else { // 0xfe and 0xff are not accepted as lead bytes
+ *cp = UTF8_INVALID;
+ *nbytep = 1;
+ return TERMKEY_RES_KEY;
+ }
+
+ for(unsigned int b = 1; b < nbytes; b++) { // fold in each expected continuation byte
+ unsigned char cb;
+
+ if(b >= len) // input ends before the sequence does
+ return TERMKEY_RES_AGAIN; // *nbytep is not written on this path; *cp holds only the bits gathered so far
+
+ cb = bytes[b];
+ if(cb < 0x80 || cb >= 0xc0) { // outside 0x80..0xbf, so not a continuation byte
+ *cp = UTF8_INVALID;
+ *nbytep = b; // counts the bytes before the bad one; the bad byte itself is not included
+ return TERMKEY_RES_KEY;
+ }
+
+ *cp <<= 6; // make room for the 6 bits taken from this continuation byte
+ *cp |= cb & 0x3f;
+ }
+
+ // Check for overlong sequences
+ if(nbytes > utf8_seqlen(*cp)) // utf8_seqlen is defined outside this hunk; presumably the shortest encoding length for *cp - confirm
+ *cp = UTF8_INVALID;
+
+ // Check for UTF-16 surrogates or invalid codepoints (0xFFFE, 0xFFFF)
+ if((*cp >= 0xD800 && *cp <= 0xDFFF) ||
+ *cp == 0xFFFE ||
+ *cp == 0xFFFF)
+ *cp = UTF8_INVALID;
+
+ *nbytep = nbytes; // the full sequence is counted even when *cp was replaced by UTF8_INVALID above
+ return TERMKEY_RES_KEY;
+}
+
static void emit_codepoint(TermKey *tk, long codepoint, TermKeyKey *key)
{
if(codepoint < 0x20) {
@@ -487,8 +557,6 @@ static void emit_codepoint(TermKey *tk, long codepoint, TermKeyKey *key)
fill_utf8(key);
}
-#define UTF8_INVALID 0xFFFD
-
static TermKeyResult peekkey(TermKey *tk, TermKeyKey *key, int force, size_t *nbytep)
{
int again = 0;
@@ -604,83 +672,24 @@ static TermKeyResult peekkey_simple(TermKey *tk, TermKeyKey *key, int force, siz
}
else if(tk->flags & TERMKEY_FLAG_UTF8) {
// Some UTF-8
- unsigned int nbytes;
long codepoint;
+ TermKeyResult res = parse_utf8(tk->buffer + tk->buffstart, tk->buffcount, &codepoint, nbytep);
- key->type = TERMKEY_TYPE_UNICODE;
- key->modifiers = 0;
-
- if(b0 < 0xc0) {
- // Starts with a continuation byte - that's not right
- (*tk->method.emit_codepoint)(tk, UTF8_INVALID, key);
- *nbytep = 1;
- return TERMKEY_RES_KEY;
- }
- else if(b0 < 0xe0) {
- nbytes = 2;
- codepoint = b0 & 0x1f;
- }
- else if(b0 < 0xf0) {
- nbytes = 3;
- codepoint = b0 & 0x0f;
- }
- else if(b0 < 0xf8) {
- nbytes = 4;
- codepoint = b0 & 0x07;
- }
- else if(b0 < 0xfc) {
- nbytes = 5;
- codepoint = b0 & 0x03;
- }
- else if(b0 < 0xfe) {
- nbytes = 6;
- codepoint = b0 & 0x01;
- }
- else {
- (*tk->method.emit_codepoint)(tk, UTF8_INVALID, key);
- *nbytep = 1;
- return TERMKEY_RES_KEY;
- }
-
- if(tk->buffcount < nbytes) {
- if(!force)
- return TERMKEY_RES_AGAIN;
-
+ if(res == TERMKEY_RES_AGAIN && force) {
/* There weren't enough bytes for a complete UTF-8 sequence but caller
* demands an answer. About the best thing we can do here is eat as many
* bytes as we have, and emit a UTF8_INVALID. If the remaining bytes
* arrive later, they'll be invalid too.
*/
- (*tk->method.emit_codepoint)(tk, UTF8_INVALID, key);
+ codepoint = UTF8_INVALID;
*nbytep = tk->buffcount;
- return TERMKEY_RES_KEY;
- }
-
- for(unsigned int b = 1; b < nbytes; b++) {
- unsigned char cb = CHARAT(b);
- if(cb < 0x80 || cb >= 0xc0) {
- (*tk->method.emit_codepoint)(tk, UTF8_INVALID, key);
- *nbytep = b - 1;
- return TERMKEY_RES_KEY;
- }
-
- codepoint <<= 6;
- codepoint |= cb & 0x3f;
+ res = TERMKEY_RES_KEY;
}
- // Check for overlong sequences
- if(nbytes > utf8_seqlen(codepoint))
- codepoint = UTF8_INVALID;
-
- // Check for UTF-16 surrogates or invalid codepoints
- if((codepoint >= 0xD800 && codepoint <= 0xDFFF) ||
- codepoint == 0xFFFE ||
- codepoint == 0xFFFF)
- codepoint = UTF8_INVALID;
-
+ key->type = TERMKEY_TYPE_UNICODE;
+ key->modifiers = 0;
(*tk->method.emit_codepoint)(tk, codepoint, key);
- *nbytep = nbytes;
- return TERMKEY_RES_KEY;
+ return res;
}
else {
// Non UTF-8 case - just report the raw byte