aboutsummaryrefslogtreecommitdiff
path: root/ell.c
diff options
context:
space:
mode:
authorPřemysl Janouch <p.janouch@gmail.com>2017-05-24 20:03:17 +0200
committerPřemysl Janouch <p.janouch@gmail.com>2017-05-24 20:42:49 +0200
commitc5cd74d910499d7e8d83e317c7b114460857a22b (patch)
treef73dc8105ac9f1d519f2af7a173f990d4852ec23 /ell.c
parent383c9d8fd2169fe9b3d9dbc4c7b973175dde7819 (diff)
downloadell-c5cd74d910499d7e8d83e317c7b114460857a22b.tar.gz
ell-c5cd74d910499d7e8d83e317c7b114460857a22b.tar.xz
ell-c5cd74d910499d7e8d83e317c7b114460857a22b.zip
Simplify the lexer
Diffstat (limited to 'ell.c')
-rw-r--r--ell.c113
1 files changed, 40 insertions, 73 deletions
diff --git a/ell.c b/ell.c
index 70d6904..56e557a 100644
--- a/ell.c
+++ b/ell.c
@@ -23,7 +23,6 @@
#include <ctype.h>
#include <errno.h>
#include <stdarg.h>
-#include <assert.h>
#include <stdbool.h>
#include <setjmp.h>
@@ -215,9 +214,7 @@ struct lexer {
/// Input has to be null-terminated anyway
static void
lexer_init (struct lexer *self, const char *p, size_t len) {
- memset (self, 0, sizeof *self);
- self->p = p;
- self->len = len;
+ *self = (struct lexer) { .p = p, .len = len };
}
static void
@@ -225,11 +222,6 @@ lexer_free (struct lexer *self) {
free (self->string.s);
}
-static bool lexer_is_ignored (int c) { return strchr (" \t", c); }
-static bool lexer_is_word_char (int c) {
- return !lexer_is_ignored (c) && !strchr ("()[]{}\n;@#'", c);
-}
-
static int
lexer_advance (struct lexer *self) {
int c = *self->p++;
@@ -245,58 +237,38 @@ lexer_advance (struct lexer *self) {
static bool
lexer_hexa_escape (struct lexer *self, struct buffer *output) {
- int i;
- unsigned char code = 0;
-
- for (i = 0; self->len && i < 2; i++) {
- unsigned char c = tolower (*self->p);
- if (c >= '0' && c <= '9')
- code = (code << 4) | (c - '0');
- else if (c >= 'a' && c <= 'f')
- code = (code << 4) | (c - 'a' + 10);
- else
- break;
-
- lexer_advance (self);
- }
-
- if (!i)
+ const char *alphabet = "0123456789abcdef", *h, *l;
+ if (!self->len || !(h = strchr (alphabet, tolower (lexer_advance (self))))
+ || !self->len || !(l = strchr (alphabet, tolower (lexer_advance (self)))))
return false;
- buffer_append_c (output, code);
+ buffer_append_c (output, (h - alphabet) << 4 | (l - alphabet));
return true;
}
+enum { LEXER_STRING_QUOTE = '\'', LEXER_ESCAPE = '\\', LEXER_COMMENT = '#' };
+static bool lexer_is_whitespace (int c) { return !c || c == ' ' || c == '\t'; }
+
+static unsigned char lexer_escapes[256] = {
+ [LEXER_STRING_QUOTE] = LEXER_STRING_QUOTE, [LEXER_ESCAPE] = LEXER_ESCAPE,
+ ['a'] = '\a', ['b'] = '\b', ['n'] = '\n', ['r'] = '\r', ['t'] = '\t',
+};
+
static const char *
lexer_escape_sequence (struct lexer *self, struct buffer *output) {
if (!self->len)
return "premature end of escape sequence";
- unsigned char c = *self->p;
- switch (c) {
- case '"': break;
- case '\\': break;
- case 'a': c = '\a'; break;
- case 'b': c = '\b'; break;
- case 'f': c = '\f'; break;
- case 'n': c = '\n'; break;
- case 'r': c = '\r'; break;
- case 't': c = '\t'; break;
- case 'v': c = '\v'; break;
-
- case 'x':
- case 'X':
- lexer_advance (self);
+ unsigned char c = lexer_advance (self);
+ if (c == 'x') {
if (lexer_hexa_escape (self, output))
return NULL;
return "invalid hexadecimal escape";
-
- default:
- return "unknown escape sequence";
}
+ if (!(c = lexer_escapes[c]))
+ return "unknown escape sequence";
buffer_append_c (output, c);
- lexer_advance (self);
return NULL;
}
@@ -305,9 +277,9 @@ lexer_string (struct lexer *self, struct buffer *output) {
unsigned char c;
const char *e = NULL;
while (self->len) {
- if ((c = lexer_advance (self)) == '\'')
+ if ((c = lexer_advance (self)) == LEXER_STRING_QUOTE)
return NULL;
- if (c != '\\')
+ if (c != LEXER_ESCAPE)
buffer_append_c (output, c);
else if ((e = lexer_escape_sequence (self, output)))
return e;
@@ -315,10 +287,15 @@ lexer_string (struct lexer *self, struct buffer *output) {
return "premature end of string";
}
+static enum token lexer_tokens[256] = {
+ ['('] = T_LPAREN, [')'] = T_RPAREN, ['['] = T_LBRACKET, [']'] = T_RBRACKET,
+ ['{'] = T_LBRACE, ['}'] = T_RBRACE, [';'] = T_NEWLINE, ['\n'] = T_NEWLINE,
+ ['@'] = T_AT, [LEXER_STRING_QUOTE] = T_STRING,
+};
+
static enum token
lexer_next (struct lexer *self, const char **e) {
- // Skip over any whitespace between tokens
- while (self->len && lexer_is_ignored (*self->p))
+ while (self->len && lexer_is_whitespace (*self->p))
lexer_advance (self);
if (!self->len)
return T_ABORT;
@@ -326,36 +303,26 @@ lexer_next (struct lexer *self, const char **e) {
free (self->string.s);
self->string = (struct buffer) BUFFER_INITIALIZER;
- switch (*self->p) {
- case '(': lexer_advance (self); return T_LPAREN;
- case ')': lexer_advance (self); return T_RPAREN;
- case '[': lexer_advance (self); return T_LBRACKET;
- case ']': lexer_advance (self); return T_RBRACKET;
- case '{': lexer_advance (self); return T_LBRACE;
- case '}': lexer_advance (self); return T_RBRACE;
- case '\n': lexer_advance (self); return T_NEWLINE;
- case ';': lexer_advance (self); return T_NEWLINE;
- case '@': lexer_advance (self); return T_AT;
-
- case '#':
- // Comments go until newline
+ unsigned char c = lexer_advance (self);
+ if (c == LEXER_COMMENT) {
while (self->len)
if (lexer_advance (self) == '\n')
return T_NEWLINE;
return T_ABORT;
+ }
- case '\'':
- lexer_advance (self);
- if ((*e = lexer_string (self, &self->string)))
- return T_ABORT;
+ enum token token = lexer_tokens[c];
+ if (!token) {
+ buffer_append_c (&self->string, c);
+ while (self->len && !lexer_is_whitespace (*self->p)
+ && !lexer_tokens[(unsigned char) *self->p])
+ buffer_append_c (&self->string, lexer_advance (self));
return T_STRING;
}
-
- assert (lexer_is_word_char (*self->p));
- do
- buffer_append_c (&self->string, lexer_advance (self));
- while (lexer_is_word_char (*self->p));
- return T_STRING;
+ if (token == T_STRING
+ && (*e = lexer_string (self, &self->string)))
+ return T_ABORT;
+ return token;
}
static char *lexer_errorf (struct lexer *self, const char *fmt, ...)
@@ -371,7 +338,7 @@ lexer_errorf (struct lexer *self, const char *fmt, ...) {
if (!description)
return NULL;
- char *e = format ("near line %u, column %u: %s",
+ char *e = format ("at or before line %u, column %u: %s",
self->line + 1, self->column + 1, description);
free (description);
return e;