From c4ea0e28fdc3b1be8c8f69f816acabfbe007dc79 Mon Sep 17 00:00:00 2001
From: Přemysl Janouch
Date: Sat, 2 May 2015 04:58:08 +0200
Subject: config: implement string tokenizing
---
common.c | 125 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
1 file changed, 123 insertions(+), 2 deletions(-)
diff --git a/common.c b/common.c
index 67781dc..7bfe9fe 100644
--- a/common.c
+++ b/common.c
@@ -1009,6 +1009,120 @@ config_tokenizer_error (struct config_tokenizer *self,
str_free (&description);
}
+// Consume up to two hexadecimal digits at the cursor and append the byte they
+// encode to `output`.  Characters go through tolower_ascii() first, presumably
+// to accept A-F as well.  Returns false, appending nothing, without any digit.
+static bool
+config_tokenizer_hexa_escape (struct config_tokenizer *self, struct str *output)
+{
+	unsigned char value = 0;
+	int n_digits = 0;
+	while (n_digits < 2 && self->len)
+	{
+		unsigned char ch = tolower_ascii (*self->p), nibble;
+		if (ch >= '0' && ch <= '9')
+			nibble = ch - '0';
+		else if (ch >= 'a' && ch <= 'f')
+			nibble = ch - 'a' + 10;
+		else
+			break;
+		value = (value << 4) | nibble;
+		config_tokenizer_advance (self);
+		n_digits++;
+	}
+	if (n_digits)
+		str_append_c (output, value);
+	return n_digits != 0;
+}
+
+// Decode an octal escape: consume up to three octal digits at the cursor and
+// append the byte they encode to `output`.  Returns false, appending nothing,
+// when there is no octal digit or the value exceeds one byte (\400 and up).
+static bool
+config_tokenizer_octal_escape
+	(struct config_tokenizer *self, struct str *output)
+{
+	int i;
+	// Wider than a byte on purpose: three digits reach 0777, and narrowing
+	// that into an unsigned char would silently wrap it modulo 256
+	unsigned code = 0;
+	for (i = 0; self->len && i < 3; i++)
+	{
+		unsigned char c = *self->p;
+		if (c < '0' || c > '7')
+			break;
+		code = (code << 3) | (c - '0');
+		config_tokenizer_advance (self);
+	}
+	if (!i || code > 0xFF)
+		return false;
+	str_append_c (output, code);
+	return true;
+}
+
+// Decode the escape sequence following a backslash, appending the byte it
+// stands for to `output`.  Handles \" \\ \a \b \f \n \r \t \v, \x or \X plus
+// hex digits, and octal digits; returns false after reporting any error.
+static bool
+config_tokenizer_escape_sequence
+	(struct config_tokenizer *self, struct str *output, struct error **e)
+{
+	// Single-character escapes and, at equal indexes, the bytes they denote
+	static const char names[] = "\"\\abfnrtv";
+	static const char bytes[] = "\"\\\a\b\f\n\r\t\v";
+
+	if (!self->len)
+	{
+		config_tokenizer_error (self, e, "premature end of escape sequence");
+		return false;
+	}
+
+	const unsigned char escape = *self->p;
+	if (escape == 'x' || escape == 'X')
+	{
+		// Skip the marker so that the helper starts right at the digits
+		config_tokenizer_advance (self);
+		if (config_tokenizer_hexa_escape (self, output))
+			return true;
+
+		config_tokenizer_error (self, e, "invalid hexadecimal escape");
+		return false;
+	}
+
+	for (int i = 0; names[i]; i++)
+	{
+		if (names[i] != escape)
+			continue;
+		str_append_c (output, bytes[i]);
+		config_tokenizer_advance (self);
+		return true;
+	}
+
+	// Whatever remains is either an octal number or not an escape at all
+	if (config_tokenizer_octal_escape (self, output))
+		return true;
+	config_tokenizer_error (self, e, "unknown escape sequence");
+	return false;
+}
+
+// Read through the closing quote, decoding into `output`; false means error
+static bool
+config_tokenizer_string
+	(struct config_tokenizer *self, struct str *output, struct error **e)
+{
+	for (unsigned char ch; self->len; )
+	{
+		if ((ch = config_tokenizer_advance (self)) == '"')
+			return true;
+		else if (ch != '\\')
+			str_append_c (output, ch);
+		else if (!config_tokenizer_escape_sequence (self, output, e))
+			return false;
+	}
+	config_tokenizer_error (self, e, "premature end of string");
+	return false;
+}
+
static enum config_token
config_tokenizer_next (struct config_tokenizer *self, struct error **e)
{
@@ -1033,8 +1147,15 @@ config_tokenizer_next (struct config_tokenizer *self, struct error **e)
return CONFIG_T_ABORT;
case '"':
- // TODO: string, validate as UTF-8
- break;
+ config_tokenizer_advance (self);
+ str_reset (&self->string);
+ if (!config_tokenizer_string (self, &self->string, e))
+ return CONFIG_T_ABORT;
+ if (!utf8_validate (self->string.str, self->string.len))
+ {
+ config_tokenizer_error (self, e, "not a valid UTF-8 string");
+ return CONFIG_T_ABORT;
+ }
}
bool is_word = false;
--
cgit v1.2.3-70-g09d2