From c4ea0e28fdc3b1be8c8f69f816acabfbe007dc79 Mon Sep 17 00:00:00 2001
From: Přemysl Janouch
Date: Sat, 2 May 2015 04:58:08 +0200
Subject: config: implement string tokenizing

---
 common.c | 141 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 139 insertions(+), 2 deletions(-)

diff --git a/common.c b/common.c
index 67781dc..7bfe9fe 100644
--- a/common.c
+++ b/common.c
@@ -1009,6 +1009,136 @@ config_tokenizer_error (struct config_tokenizer *self,
 	str_free (&description);
 }
 
+// Read up to two hexadecimal digits and append the byte they encode to
+// "output".  Returns false when there isn't even one digit to read.
+static bool
+config_tokenizer_hexa_escape (struct config_tokenizer *self, struct str *output)
+{
+	int i;
+	unsigned char code = 0;
+
+	for (i = 0; self->len && i < 2; i++)
+	{
+		unsigned char c = tolower_ascii (*self->p);
+		if (c >= '0' && c <= '9')
+			code = (code << 4) | (c - '0');
+		else if (c >= 'a' && c <= 'f')
+			code = (code << 4) | (c - 'a' + 10);
+		else
+			break;
+
+		config_tokenizer_advance (self);
+	}
+
+	if (!i)
+		return false;
+
+	str_append_c (output, code);
+	return true;
+}
+
+// Read up to three octal digits and append the byte they encode to "output".
+// Returns false when there is no digit to read, or when the value exceeds
+// 0xFF and thus cannot be stored in a single byte.
+static bool
+config_tokenizer_octal_escape
+	(struct config_tokenizer *self, struct str *output)
+{
+	int i;
+	unsigned code = 0;
+
+	for (i = 0; self->len && i < 3; i++)
+	{
+		unsigned char c = *self->p;
+		if (c >= '0' && c <= '7')
+			code = (code << 3) | (c - '0');
+		else
+			break;
+
+		config_tokenizer_advance (self);
+	}
+
+	// Three digits go up to 0777, do not let that silently wrap around
+	if (!i || code > 0xFF)
+		return false;
+
+	str_append_c (output, code);
+	return true;
+}
+
+// Process the part of an escape sequence that follows the backslash and
+// append the character it stands for to "output".  On failure, the problem
+// is reported through config_tokenizer_error() and false is returned.
+static bool
+config_tokenizer_escape_sequence
+	(struct config_tokenizer *self, struct str *output, struct error **e)
+{
+	if (!self->len)
+	{
+		config_tokenizer_error (self, e, "premature end of escape sequence");
+		return false;
+	}
+
+	unsigned char c;
+	switch ((c = *self->p))
+	{
+	case '"': break;
+	case '\\': break;
+	case 'a': c = '\a'; break;
+	case 'b': c = '\b'; break;
+	case 'f': c = '\f'; break;
+	case 'n': c = '\n'; break;
+	case 'r': c = '\r'; break;
+	case 't': c = '\t'; break;
+	case 'v': c = '\v'; break;
+
+	case 'x':
+	case 'X':
+		// Skip over the "x", the digits are consumed by the callee
+		config_tokenizer_advance (self);
+		if (config_tokenizer_hexa_escape (self, output))
+			return true;
+
+		config_tokenizer_error (self, e, "invalid hexadecimal escape");
+		return false;
+
+	default:
+		// Anything else is only acceptable as an octal escape
+		if (config_tokenizer_octal_escape (self, output))
+			return true;
+
+		config_tokenizer_error (self, e, "unknown escape sequence");
+		return false;
+	}
+
+	// Single-character escapes: store the translation, skip the character
+	str_append_c (output, c);
+	config_tokenizer_advance (self);
+	return true;
+}
+
+// Read the contents of a double-quoted string into "output", resolving escape
+// sequences on the way.  The caller is expected to have consumed the opening
+// quote; the closing one is consumed here.  Returns false and reports the
+// problem when an escape sequence is invalid or the input ends prematurely.
+static bool
+config_tokenizer_string
+	(struct config_tokenizer *self, struct str *output, struct error **e)
+{
+	unsigned char c;
+	while (self->len)
+	{
+		if ((c = config_tokenizer_advance (self)) == '"')
+			return true;
+		if (c != '\\')
+			str_append_c (output, c);
+		else if (!config_tokenizer_escape_sequence (self, output, e))
+			return false;
+	}
+	config_tokenizer_error (self, e, "premature end of string");
+	return false;
+}
+
 static enum config_token
 config_tokenizer_next (struct config_tokenizer *self, struct error **e)
 {
@@ -1033,8 +1163,15 @@ config_tokenizer_next (struct config_tokenizer *self, struct error **e)
 		return CONFIG_T_ABORT;
 
 	case '"':
-		// TODO: string, validate as UTF-8
-		break;
+		config_tokenizer_advance (self);
+		str_reset (&self->string);
+		if (!config_tokenizer_string (self, &self->string, e))
+			return CONFIG_T_ABORT;
+		if (!utf8_validate (self->string.str, self->string.len))
+		{
+			config_tokenizer_error (self, e, "not a valid UTF-8 string");
+			return CONFIG_T_ABORT;
+		}
 	}
 
 	bool is_word = false;
-- 
cgit v1.2.3-70-g09d2