Extract UTF-8 sequence parser into its own function, fix bugs, apply unit tests

author: Paul LeoNerd Evans <leonerd@leonerd.org.uk> 2011-03-31 23:42:52 +0100
committer: Paul LeoNerd Evans <leonerd@leonerd.org.uk> 2011-03-31 23:42:52 +0100
commit: f1b3dff4c2075d9304dd4c298db433c7d404f3a3 (patch)
tree: 0705094f83a89c098ce3293edacc6e36c888f7ef /termkey.c
parent: 739be0e55d1301856ae6e86f85903a7f86b3b878 (diff)
download: termo-f1b3dff4c2075d9304dd4c298db433c7d404f3a3.tar.gz
termo-f1b3dff4c2075d9304dd4c298db433c7d404f3a3.tar.xz
termo-f1b3dff4c2075d9304dd4c298db433c7d404f3a3.zip
1 files changed, 77 insertions, 68 deletions
diff --git a/termkey.c b/termkey.c
index 881122e..05c1512 100644
--- a/termkey.c
+++ b/termkey.c
@@ -422,6 +422,76 @@ static void fill_utf8(TermKeyKey *key)
   }
 }
 
+#define UTF8_INVALID 0xFFFD  // U+FFFD; stored in *cp whenever a sequence is rejected
+static TermKeyResult parse_utf8(const unsigned char *bytes, size_t len, long *cp, size_t *nbytep)
+{
+  unsigned int nbytes;  // total sequence length implied by the lead byte
+  // Decodes one multi-byte UTF-8 sequence from bytes[0..len) into *cp and stores the bytes consumed in *nbytep.
+  unsigned char b0 = bytes[0];  // NOTE(review): read before len is examined - presumably callers pass len >= 1; confirm
+  // Returns TERMKEY_RES_KEY once *cp and *nbytep are set, or TERMKEY_RES_AGAIN if len cuts the sequence short.
+  if(b0 < 0xc0) {
+    // 0x80..0xbf is a continuation byte and cannot start a sequence. NOTE(review): bytes below 0x80 also land here - presumably the caller handles ASCII first; confirm
+    *cp = UTF8_INVALID;
+    *nbytep = 1;
+    return TERMKEY_RES_KEY;
+  }
+  else if(b0 < 0xe0) {  // 110xxxxx: 2-byte form, 5 payload bits in the lead byte
+    nbytes = 2;
+    *cp = b0 & 0x1f;
+  }
+  else if(b0 < 0xf0) {  // 1110xxxx: 3-byte form, 4 payload bits
+    nbytes = 3;
+    *cp = b0 & 0x0f;
+  }
+  else if(b0 < 0xf8) {  // 11110xxx: 4-byte form, 3 payload bits
+    nbytes = 4;
+    *cp = b0 & 0x07;
+  }
+  else if(b0 < 0xfc) {  // 111110xx: 5-byte form, 2 payload bits
+    nbytes = 5;
+    *cp = b0 & 0x03;
+  }
+  else if(b0 < 0xfe) {  // 1111110x: 6-byte form, 1 payload bit
+    nbytes = 6;
+    *cp = b0 & 0x01;
+  }
+  else {  // 0xfe and 0xff are not accepted as lead bytes
+    *cp = UTF8_INVALID;
+    *nbytep = 1;
+    return TERMKEY_RES_KEY;
+  }
+  // Fold in the remaining nbytes-1 bytes, six payload bits from each
+  for(unsigned int b = 1; b < nbytes; b++) {
+    unsigned char cb;
+    // Input ends before the sequence does: *nbytep is left unwritten and *cp holds only a partial value
+    if(b >= len)
+      return TERMKEY_RES_AGAIN;
+    // Anything outside 0x80..0xbf is not a continuation byte; consume only the b bytes that precede it
+    cb = bytes[b];
+    if(cb < 0x80 || cb >= 0xc0) {
+      *cp = UTF8_INVALID;
+      *nbytep = b;
+      return TERMKEY_RES_KEY;
+    }
+
+    *cp <<= 6;
+    *cp |= cb & 0x3f;
+  }
+
+  // Check for overlong sequences: more bytes were used than utf8_seqlen() (defined outside this hunk) reports for the value
+  if(nbytes > utf8_seqlen(*cp))
+    *cp = UTF8_INVALID;
+
+  // Check for UTF-16 surrogates or invalid codepoints (0xFFFE, 0xFFFF)
+  if((*cp >= 0xD800 && *cp <= 0xDFFF) ||
+     *cp == 0xFFFE ||
+     *cp == 0xFFFF)
+    *cp = UTF8_INVALID;
+  // A complete sequence consumes all nbytes even when its value was replaced with UTF8_INVALID above
+  *nbytep = nbytes;
+  return TERMKEY_RES_KEY;
+}
+
 static void emit_codepoint(TermKey *tk, long codepoint, TermKeyKey *key)
 {
   if(codepoint < 0x20) {
@@ -487,8 +557,6 @@ static void emit_codepoint(TermKey *tk, long codepoint, TermKeyKey *key)
     fill_utf8(key);
 }
 
-#define UTF8_INVALID 0xFFFD
-
 static TermKeyResult peekkey(TermKey *tk, TermKeyKey *key, int force, size_t *nbytep)
 {
   int again = 0;
@@ -604,83 +672,24 @@ static TermKeyResult peekkey_simple(TermKey *tk, TermKeyKey *key, int force, siz
   }
   else if(tk->flags & TERMKEY_FLAG_UTF8) {
     // Some UTF-8
-    unsigned int nbytes;
     long codepoint;
+    TermKeyResult res = parse_utf8(tk->buffer + tk->buffstart, tk->buffcount, &codepoint, nbytep);
 
-    key->type = TERMKEY_TYPE_UNICODE;
-    key->modifiers = 0;
-
-    if(b0 < 0xc0) {
-      // Starts with a continuation byte - that's not right
-      (*tk->method.emit_codepoint)(tk, UTF8_INVALID, key);
-      *nbytep = 1;
-      return TERMKEY_RES_KEY;
-    }
-    else if(b0 < 0xe0) {
-      nbytes = 2;
-      codepoint = b0 & 0x1f;
-    }
-    else if(b0 < 0xf0) {
-      nbytes = 3;
-      codepoint = b0 & 0x0f;
-    }
-    else if(b0 < 0xf8) {
-      nbytes = 4;
-      codepoint = b0 & 0x07;
-    }
-    else if(b0 < 0xfc) {
-      nbytes = 5;
-      codepoint = b0 & 0x03;
-    }
-    else if(b0 < 0xfe) {
-      nbytes = 6;
-      codepoint = b0 & 0x01;
-    }
-    else {
-      (*tk->method.emit_codepoint)(tk, UTF8_INVALID, key);
-      *nbytep = 1;
-      return TERMKEY_RES_KEY;
-    }
-
-    if(tk->buffcount < nbytes) {
-      if(!force)
-        return TERMKEY_RES_AGAIN;
-
+    if(res == TERMKEY_RES_AGAIN && force) {
       /* There weren't enough bytes for a complete UTF-8 sequence but caller
        * demands an answer. About the best thing we can do here is eat as many
        * bytes as we have, and emit a UTF8_INVALID. If the remaining bytes
        * arrive later, they'll be invalid too.
        */
-      (*tk->method.emit_codepoint)(tk, UTF8_INVALID, key);
+      codepoint = UTF8_INVALID;
       *nbytep = tk->buffcount;
-      return TERMKEY_RES_KEY;
-    }
-
-    for(unsigned int b = 1; b < nbytes; b++) {
-      unsigned char cb = CHARAT(b);
-      if(cb < 0x80 || cb >= 0xc0) {
-        (*tk->method.emit_codepoint)(tk, UTF8_INVALID, key);
-        *nbytep = b - 1;
-        return TERMKEY_RES_KEY;
-      }
-
-      codepoint <<= 6;
-      codepoint |= cb & 0x3f;
+      res = TERMKEY_RES_KEY;
     }
 
-    // Check for overlong sequences
-    if(nbytes > utf8_seqlen(codepoint))
-      codepoint = UTF8_INVALID;
-
-    // Check for UTF-16 surrogates or invalid codepoints
-    if((codepoint >= 0xD800 && codepoint <= 0xDFFF) ||
-       codepoint == 0xFFFE ||
-       codepoint == 0xFFFF)
-      codepoint = UTF8_INVALID;
-
+    key->type = TERMKEY_TYPE_UNICODE;
+    key->modifiers = 0;
     (*tk->method.emit_codepoint)(tk, codepoint, key);
-    *nbytep = nbytes;
-    return TERMKEY_RES_KEY;
+    return res;
   }
   else {
     // Non UTF-8 case - just report the raw byte
author	Paul LeoNerd Evans <leonerd@leonerd.org.uk>	2011-03-31 23:42:52 +0100
committer	Paul LeoNerd Evans <leonerd@leonerd.org.uk>	2011-03-31 23:42:52 +0100
commit	f1b3dff4c2075d9304dd4c298db433c7d404f3a3 (patch)
tree	0705094f83a89c098ce3293edacc6e36c888f7ef /termkey.c
parent	739be0e55d1301856ae6e86f85903a7f86b3b878 (diff)
download	termo-f1b3dff4c2075d9304dd4c298db433c7d404f3a3.tar.gz termo-f1b3dff4c2075d9304dd4c298db433c7d404f3a3.tar.xz termo-f1b3dff4c2075d9304dd4c298db433c7d404f3a3.zip