From f7c67b2a2eeaa0d62c4473751e2d062b2b082cc8 Mon Sep 17 00:00:00 2001
From: Přemysl Janouch
Date: Tue, 4 Oct 2016 13:21:45 +0200
Subject: Add termo_str{f,p}key_utf8()
Adding back some original UTF-8 handling code.
---
termo.c | 187 ++++++++++++++++++++++++++++++++++++++++++++++++++++------------
termo.h | 5 ++
2 files changed, 157 insertions(+), 35 deletions(-)
diff --git a/termo.c b/termo.c
index 75800e6..477242b 100644
--- a/termo.c
+++ b/termo.c
@@ -1048,28 +1048,25 @@ peekkey_simple (termo_t *tk, termo_key_t *key, int flags, size_t *nbytep)
}
}
-// XXX: With the current infrastructure I'm not sure how to properly handle
-// this. peekkey() isn't made for skipping invalid inputs.
-#define INVALID_1005 0x20
+// REPLACEMENT CHARACTER
+#define UTF8_INVALID 0xFFFD
-static termo_result_t
-parse_1005_value (const unsigned char **bytes, size_t *len, uint32_t *cp)
+static size_t
+parse_utf8_fast (const unsigned char *bytes, size_t len, uint32_t *cp)
{
- unsigned int nbytes;
- unsigned char b0 = (*bytes)[0];
+ size_t nbytes;
+ unsigned char b0 = bytes[0];
if (b0 < 0x80)
{
// Single byte ASCII
*cp = b0;
- nbytes = 1;
- goto end;
+ return 1;
}
else if (b0 < 0xc0)
{
// Starts with a continuation byte - that's not right
- *cp = INVALID_1005;
- nbytes = 1;
- goto end;
+ *cp = UTF8_INVALID;
+ return 1;
}
else if (b0 < 0xe0)
{
@@ -1098,28 +1095,39 @@ parse_1005_value (const unsigned char **bytes, size_t *len, uint32_t *cp)
}
else
{
- *cp = INVALID_1005;
- nbytes = 1;
- goto end;
+ *cp = UTF8_INVALID;
+ return 1;
}
- for (unsigned int b = 1; b < nbytes; b++)
+ for (size_t b = 1; b < nbytes; b++)
{
- if (b >= *len)
- return TERMO_RES_AGAIN;
+ if (b >= len)
+ return 0;
- unsigned char cb = (*bytes)[b];
+ unsigned char cb = bytes[b];
if (cb < 0x80 || cb >= 0xc0)
{
- *cp = INVALID_1005;
- nbytes = b;
- goto end;
+ *cp = UTF8_INVALID;
+ return b;
}
*cp <<= 6;
*cp |= cb & 0x3f;
}
+ return nbytes;
+}
+
+static termo_result_t
+parse_1005_value (const unsigned char **bytes, size_t *len, uint32_t *cp)
+{
+ size_t nbytes = parse_utf8_fast (*bytes, *len, cp);
+ if (nbytes == 0)
+ return TERMO_RES_AGAIN;
+
+ // XXX: With the current infrastructure I'm not sure how to properly handle
+ // this. peekkey() isn't made for skipping invalid inputs.
+ if (*cp == UTF8_INVALID)
+ *cp = 0x20;
-end:
(*bytes) += nbytes;
(*len) -= nbytes;
return TERMO_RES_KEY;
@@ -1430,6 +1438,8 @@ register_c0_full (termo_t *tk, termo_sym_t sym,
return sym;
}
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+
static struct modnames
{
const char *shift, *alt, *ctrl;
@@ -1446,9 +1456,11 @@ modnames[] =
{ "shift", "meta", "ctrl" }, // LOWERMOD+ALTISMETA+LONGMOD
};
-size_t
-termo_strfkey (termo_t *tk, char *buffer, size_t len,
- termo_key_t *key, termo_format_t format)
+typedef const char *(*strfkey_emit_fn) (termo_t *, termo_key_t *, char *);
+
+static size_t
+termo_strfkey_generic (termo_t *tk, char *buffer, size_t len,
+ termo_key_t *key, termo_format_t format, strfkey_emit_fn emit)
{
size_t pos = 0;
int l = 0;
@@ -1524,10 +1536,11 @@ termo_strfkey (termo_t *tk, char *buffer, size_t len,
switch (key->type)
{
case TERMO_TYPE_KEY:
- if (!key->multibyte[0]) // In case of user-supplied key structures
- fill_multibyte (tk, key);
- l = snprintf (buffer + pos, len - pos, "%s", key->multibyte);
+ {
+ char buf[MB_LEN_MAX + 1];
+ l = snprintf (buffer + pos, len - pos, "%s", emit (tk, key, buf));
break;
+ }
case TERMO_TYPE_KEYSYM:
{
const char *name = termo_get_keyname (tk, key->code.sym);
@@ -1596,9 +1609,74 @@ termo_strfkey (termo_t *tk, char *buffer, size_t len,
return pos;
}
-const char *
-termo_strpkey (termo_t *tk,
- const char *str, termo_key_t *key, termo_format_t format)
+static const char *
+strfkey_emit_locale (termo_t *tk, termo_key_t *key, char buf[])
+{
+ (void) buf;
+ if (!key->multibyte[0]) // In case of user-supplied key structures
+ fill_multibyte (tk, key);
+ return key->multibyte;
+}
+
+size_t
+termo_strfkey (termo_t *tk, char *buffer, size_t len,
+ termo_key_t *key, termo_format_t format)
+{
+ return termo_strfkey_generic
+ (tk, buffer, len, key, format, strfkey_emit_locale);
+}
+
+static inline size_t
+utf8_seqlen (uint32_t codepoint)
+{
+ if (codepoint < 0x0000080) return 1;
+ if (codepoint < 0x0000800) return 2;
+ if (codepoint < 0x0010000) return 3;
+ if (codepoint < 0x0200000) return 4;
+ if (codepoint < 0x4000000) return 5;
+ return 6;
+}
+
+static const char *
+strfkey_emit_utf8 (termo_t *tk, termo_key_t *key, char buf[])
+{
+ (void) tk;
+ uint32_t codepoint = key->code.codepoint;
+ int nbytes = utf8_seqlen (codepoint);
+ buf[nbytes] = 0;
+
+ // This is easier done backwards
+ for (int b = nbytes; b-- > 1; codepoint >>= 6)
+ buf[b] = 0x80 | (codepoint & 0x3f);
+
+ switch (nbytes)
+ {
+ case 1: buf[0] = (codepoint & 0x7f); break;
+ case 2: buf[0] = 0xc0 | (codepoint & 0x1f); break;
+ case 3: buf[0] = 0xe0 | (codepoint & 0x0f); break;
+ case 4: buf[0] = 0xf0 | (codepoint & 0x07); break;
+ case 5: buf[0] = 0xf8 | (codepoint & 0x03); break;
+ case 6: buf[0] = 0xfc | (codepoint & 0x01); break;
+ }
+ return buf;
+}
+
+size_t
+termo_strfkey_utf8 (termo_t *tk, char *buffer, size_t len,
+ termo_key_t *key, termo_format_t format)
+{
+ return termo_strfkey_generic
+ (tk, buffer, len, key, format, strfkey_emit_utf8);
+}
+
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+
+typedef termo_result_t (*strpkey_parse_fn)
+ (termo_t *, const unsigned char *, size_t, uint32_t *, size_t *);
+
+static const char *
+termo_strpkey_generic (termo_t *tk, const char *str, termo_key_t *key,
+ termo_format_t format, strpkey_parse_fn parse)
{
struct modnames *mods = &modnames[
!!(format & TERMO_FORMAT_LONGMOD) +
@@ -1609,8 +1687,8 @@ termo_strpkey (termo_t *tk,
if ((format & TERMO_FORMAT_CARETCTRL) && str[0] == '^' && str[1])
{
- str = termo_strpkey (tk,
- str + 1, key, format & ~TERMO_FORMAT_CARETCTRL);
+ str = termo_strpkey_generic (tk,
+ str + 1, key, format & ~TERMO_FORMAT_CARETCTRL, parse);
if (!str
|| key->type != TERMO_TYPE_KEY
@@ -1660,7 +1738,7 @@ termo_strpkey (termo_t *tk,
str += snbytes;
}
// Multibyte must be last
- else if (parse_multibyte (tk, (unsigned const char *) str, strlen (str),
+ else if (parse (tk, (unsigned const char *) str, strlen (str),
&key->code.codepoint, &nbytes) == TERMO_RES_KEY)
{
key->type = TERMO_TYPE_KEY;
@@ -1675,6 +1753,45 @@ termo_strpkey (termo_t *tk,
return (char *) str;
}
+const char *
+termo_strpkey (termo_t *tk,
+ const char *str, termo_key_t *key, termo_format_t format)
+{
+ return termo_strpkey_generic (tk, str, key, format, parse_multibyte);
+}
+
+static termo_result_t
+parse_utf8 (termo_t *tk, const unsigned char *bytes, size_t len,
+ uint32_t *cp, size_t *nbytep)
+{
+ (void) tk;
+ size_t nbytes = parse_utf8_fast (bytes, len, cp);
+ if (nbytes == 0)
+ return TERMO_RES_AGAIN;
+
+ // Check for overlong sequences
+ if (nbytes > utf8_seqlen (*cp))
+ *cp = UTF8_INVALID;
+
+ // Check for UTF-16 surrogates or invalid *cps
+ if ((*cp >= 0xD800 && *cp <= 0xDFFF)
+ || *cp == 0xFFFE
+ || *cp == 0xFFFF)
+ *cp = UTF8_INVALID;
+
+ *nbytep = nbytes;
+ return TERMO_RES_KEY;
+}
+
+const char *
+termo_strpkey_utf8 (termo_t *tk,
+ const char *str, termo_key_t *key, termo_format_t format)
+{
+ return termo_strpkey_generic (tk, str, key, format, parse_utf8);
+}
+
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+
int
termo_keycmp (termo_t *tk,
const termo_key_t *key1p, const termo_key_t *key2p)
diff --git a/termo.h b/termo.h
index 9df3e73..f32d5f8 100644
--- a/termo.h
+++ b/termo.h
@@ -299,6 +299,11 @@ size_t termo_strfkey (termo_t *tk, char *buffer, size_t len,
const char *termo_strpkey (termo_t *tk, const char *str,
termo_key_t *key, termo_format_t format);
+size_t termo_strfkey_utf8 (termo_t *tk, char *buffer, size_t len,
+ termo_key_t *key, termo_format_t format);
+const char *termo_strpkey_utf8 (termo_t *tk, const char *str,
+ termo_key_t *key, termo_format_t format);
+
int termo_keycmp (termo_t *tk,
const termo_key_t *key1, const termo_key_t *key2);
--
cgit v1.2.3-70-g09d2