From c5cd74d910499d7e8d83e317c7b114460857a22b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?P=C5=99emysl=20Janouch?= Date: Wed, 24 May 2017 20:03:17 +0200 Subject: Simplify the lexer --- ell.c | 113 ++++++++++++++++++++++++------------------------------------------ 1 file changed, 40 insertions(+), 73 deletions(-) diff --git a/ell.c b/ell.c index 70d6904..56e557a 100644 --- a/ell.c +++ b/ell.c @@ -23,7 +23,6 @@ #include #include #include -#include #include #include @@ -215,9 +214,7 @@ struct lexer { /// Input has to be null-terminated anyway static void lexer_init (struct lexer *self, const char *p, size_t len) { - memset (self, 0, sizeof *self); - self->p = p; - self->len = len; + *self = (struct lexer) { .p = p, .len = len }; } static void @@ -225,11 +222,6 @@ lexer_free (struct lexer *self) { free (self->string.s); } -static bool lexer_is_ignored (int c) { return strchr (" \t", c); } -static bool lexer_is_word_char (int c) { - return !lexer_is_ignored (c) && !strchr ("()[]{}\n;@#'", c); -} - static int lexer_advance (struct lexer *self) { int c = *self->p++; @@ -245,58 +237,38 @@ lexer_advance (struct lexer *self) { static bool lexer_hexa_escape (struct lexer *self, struct buffer *output) { - int i; - unsigned char code = 0; - - for (i = 0; self->len && i < 2; i++) { - unsigned char c = tolower (*self->p); - if (c >= '0' && c <= '9') - code = (code << 4) | (c - '0'); - else if (c >= 'a' && c <= 'f') - code = (code << 4) | (c - 'a' + 10); - else - break; - - lexer_advance (self); - } - - if (!i) + const char *alphabet = "0123456789abcdef", *h, *l; + if (!self->len || !(h = strchr (alphabet, tolower (lexer_advance (self)))) + || !self->len || !(l = strchr (alphabet, tolower (lexer_advance (self))))) return false; - buffer_append_c (output, code); + buffer_append_c (output, (h - alphabet) << 4 | (l - alphabet)); return true; } +enum { LEXER_STRING_QUOTE = '\'', LEXER_ESCAPE = '\\', LEXER_COMMENT = '#' }; +static bool lexer_is_whitespace (int c) { return !c || c == ' ' || c == '\t'; } + +static unsigned char lexer_escapes[256] = { + [LEXER_STRING_QUOTE] = LEXER_STRING_QUOTE, [LEXER_ESCAPE] = LEXER_ESCAPE, + ['a'] = '\a', ['b'] = '\b', ['n'] = '\n', ['r'] = '\r', ['t'] = '\t', +}; + static const char * lexer_escape_sequence (struct lexer *self, struct buffer *output) { if (!self->len) return "premature end of escape sequence"; - unsigned char c = *self->p; - switch (c) { - case '"': break; - case '\\': break; - case 'a': c = '\a'; break; - case 'b': c = '\b'; break; - case 'f': c = '\f'; break; - case 'n': c = '\n'; break; - case 'r': c = '\r'; break; - case 't': c = '\t'; break; - case 'v': c = '\v'; break; - - case 'x': - case 'X': - lexer_advance (self); + unsigned char c = lexer_advance (self); + if (c == 'x') { if (lexer_hexa_escape (self, output)) return NULL; return "invalid hexadecimal escape"; - - default: - return "unknown escape sequence"; } + if (!(c = lexer_escapes[c])) + return "unknown escape sequence"; buffer_append_c (output, c); - lexer_advance (self); return NULL; } @@ -305,9 +277,9 @@ lexer_string (struct lexer *self, struct buffer *output) { unsigned char c; const char *e = NULL; while (self->len) { - if ((c = lexer_advance (self)) == '\'') + if ((c = lexer_advance (self)) == LEXER_STRING_QUOTE) return NULL; - if (c != '\\') + if (c != LEXER_ESCAPE) buffer_append_c (output, c); else if ((e = lexer_escape_sequence (self, output))) return e; @@ -315,10 +287,15 @@ lexer_string (struct lexer *self, struct buffer *output) { return "premature end of string"; } +static enum token lexer_tokens[256] = { + ['('] = T_LPAREN, [')'] = T_RPAREN, ['['] = T_LBRACKET, [']'] = T_RBRACKET, + ['{'] = T_LBRACE, ['}'] = T_RBRACE, [';'] = T_NEWLINE, ['\n'] = T_NEWLINE, + ['@'] = T_AT, [LEXER_STRING_QUOTE] = T_STRING, +}; + static enum token lexer_next (struct lexer *self, const char **e) { - // Skip over any whitespace between tokens - while (self->len && lexer_is_ignored (*self->p)) + while (self->len && lexer_is_whitespace (*self->p)) lexer_advance (self); if (!self->len) return T_ABORT; @@ -326,36 +303,26 @@ lexer_next (struct lexer *self, const char **e) { free (self->string.s); self->string = (struct buffer) BUFFER_INITIALIZER; - switch (*self->p) { - case '(': lexer_advance (self); return T_LPAREN; - case ')': lexer_advance (self); return T_RPAREN; - case '[': lexer_advance (self); return T_LBRACKET; - case ']': lexer_advance (self); return T_RBRACKET; - case '{': lexer_advance (self); return T_LBRACE; - case '}': lexer_advance (self); return T_RBRACE; - case '\n': lexer_advance (self); return T_NEWLINE; - case ';': lexer_advance (self); return T_NEWLINE; - case '@': lexer_advance (self); return T_AT; - - case '#': - // Comments go until newline + unsigned char c = lexer_advance (self); + if (c == LEXER_COMMENT) { while (self->len) if (lexer_advance (self) == '\n') return T_NEWLINE; return T_ABORT; + } - case '\'': - lexer_advance (self); - if ((*e = lexer_string (self, &self->string))) - return T_ABORT; + enum token token = lexer_tokens[c]; + if (!token) { + buffer_append_c (&self->string, c); + while (self->len && !lexer_is_whitespace (*self->p) + && !lexer_tokens[(unsigned char) *self->p]) + buffer_append_c (&self->string, lexer_advance (self)); return T_STRING; } - - assert (lexer_is_word_char (*self->p)); - do - buffer_append_c (&self->string, lexer_advance (self)); - while (lexer_is_word_char (*self->p)); - return T_STRING; + if (token == T_STRING + && (*e = lexer_string (self, &self->string))) + return T_ABORT; + return token; } static char *lexer_errorf (struct lexer *self, const char *fmt, ...) @@ -371,7 +338,7 @@ lexer_errorf (struct lexer *self, const char *fmt, ...) { if (!description) return NULL; - char *e = format ("near line %u, column %u: %s", + char *e = format ("at or before line %u, column %u: %s", self->line + 1, self->column + 1, description); free (description); return e; -- cgit v1.2.3