Diffstat (limited to 'tokenizer.c')
-rw-r--r--  tokenizer.c  391
1 file changed, 249 insertions, 142 deletions
diff --git a/tokenizer.c b/tokenizer.c
index 5fafd3b..7782ca3 100644
--- a/tokenizer.c
+++ b/tokenizer.c
@@ -3,28 +3,35 @@ typedef enum {
TOKEN_IDENT,
TOKEN_NUM_CONST,
TOKEN_CHAR_CONST,
+ TOKEN_STR_CONST,
TOKEN_EOF
- /* TODO: char constnats, str constants */
} TokenKind;
typedef enum {
KW_SEMICOLON,
KW_EQEQ,
+ KW_EQ,
+ KW_COLON,
+ KW_FN,
+ KW_LPAREN,
+ KW_RPAREN,
+ KW_LBRACE,
+ KW_RBRACE,
+ KW_LE,
KW_LT,
- KW_LE,
- KW_EQ,
+ KW_MINUS,
KW_COUNT
} Keyword;
/* OPTIM: Use a trie or just a function if this gets too long */
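+/* NOTE: a keyword that is a prefix of another ("=", "<") must come after the longer one, since the matching loop below takes the first hit */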
static const char *keywords[KW_COUNT] =
- {";", "==", "<", "<=", "="};
+ {";", "=", ":", "fn", "(", ")", "{", "}", "==", "<", "<=", "-"};
-#define TOKENIZER_USE_LLONG 1
+#define TOKR_USE_LLONG 1
typedef unsigned long long IntConst;
-typedef long double RealConst; /* OPTIM: Maybe only use double */
+typedef long double RealConst; /* OPTIM: Switch to double */
typedef enum {
NUM_CONST_INT,
@@ -39,28 +46,40 @@ typedef struct {
};
} NumConst;
+typedef struct {
+ char *str;
+ size_t len; /* decoded length, not counting the NUL terminator */
+} StrConst;
+
/* NOTE: LineNo is typedef'd in util/err.c */
typedef struct {
TokenKind kind;
LineNo line;
- LineNo col;
+ char *code;
union {
Keyword kw;
Identifier ident;
NumConst num;
char chr;
+ StrConst str;
};
} Token;
+arr_declaration(Tokens, Token, tokens_)
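+/* expands to a dynamic-array type, Tokens, and its helpers (tokens_create, tokens_reserve, tokens_add, tokens_clear used below) */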
+
typedef struct {
- Token *tokens;
- size_t ntokens;
- size_t cap; /* used internally */
+ Tokens tokens;
+ char *s; /* string being parsed */
+ LineNo line;
Token *token; /* token currently being processed */
} Tokenizer;
+static bool token_is_kw(Token *t, Keyword kw) {
+ return t->kind == TOKEN_KW && t->kw == kw;
+}
+
static void token_fprint(FILE *out, Token *t) {
- fprintf(out, "l%luc%lu-", (unsigned long)t->line, (unsigned long)t->col);
+ fprintf(out, "l%lu-", (unsigned long)t->line);
switch (t->kind) {
case TOKEN_KW:
fprintf(out, "keyword: %s", keywords[t->kw]);
@@ -83,129 +102,177 @@ static void token_fprint(FILE *out, Token *t) {
case TOKEN_CHAR_CONST:
fprintf(out, "char: '%c' (%d)", t->chr, t->chr);
break;
+ case TOKEN_STR_CONST:
+ fprintf(out, "str: \"%s\"", t->str.str);
+ break;
case TOKEN_EOF:
fprintf(out, "eof");
break;
}
}
-static void tokenizer_add(Tokenizer *t, Token *token, LineNo line, LineNo col) {
- if (t->ntokens >= t->cap) {
- t->cap *= 2;
- t->tokens = err_realloc(t->tokens, t->cap * sizeof(*t->tokens));
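+/* appends *token; if the caller did not set token->line/token->code, they default to the tokenizer's current position */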
+static void tokr_add(Tokenizer *t, Token *token) {
+ if (!token->line)
+ token->line = t->line;
+ if (!token->code)
+ token->code = t->s;
+ tokens_add(&t->tokens, token);
+}
+
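+/* advance t->s by one char, bumping t->line on newlines so positions stay accurate */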
+static void tokr_nextchar(Tokenizer *t) {
+ if (*(t->s) == '\n') {
+ t->line++;
+ }
+ t->s++;
+}
+
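+/* decode the escape character at t->s; returns 0 for an unrecognized escape (callers treat 0 as failure) */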
+static char tokr_esc_seq(Tokenizer *t) {
+ /* TODO: add more of these incl. \x41, \100 */
+ switch (*t->s) {
+ case '\'':
+ tokr_nextchar(t);
+ return '\'';
+ case '"':
+ tokr_nextchar(t);
+ return '"';
+ case '\\':
+ tokr_nextchar(t);
+ return '\\';
+ case 'n':
+ tokr_nextchar(t);
+ return '\n';
+ default:
+ return 0;
}
- token->line = line;
- token->col = col;
- t->tokens[t->ntokens++] = *token;
+
}
-static Tokenizer tokenize_string(char *s) { /* NOTE: May modify string. Don't even try to pass it a string literal.*/
+/* to be used during tokenization: reports the error, then skips to the start of the next line so scanning can continue */
+static void tokenization_err(Tokenizer *t, const char *fmt, ...) {
+ va_list args;
+ va_start(args, fmt);
+ err_vprint(t->line, t->s, fmt, args);
+ va_end(args);
+
+ char *end_of_line = strchr(t->s, '\n');
+ if (end_of_line) {
+ t->s = end_of_line;
+ t->s++; /* move past newline */
+ } else {
+ t->s = strchr(t->s, '\0');
+ }
+ t->line++;
+}
+
+/* to be used after tokenization */
+static void tokr_err(Tokenizer *t, const char *fmt, ...) {
+ LineNo line = t->token->line;
+ va_list args;
+ va_start(args, fmt);
+ err_vprint(line, t->token->code, fmt, args);
+ va_end(args);
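+ /* skip the rest of this line's tokens so the caller can resume at the next line */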
+ while (1) {
+ if (t->token->line != line) break;
+ if (t->token->kind == TOKEN_EOF) break;
+ t->token++;
+ }
+}
+
+static bool tokenize_string(Tokenizer *tokr, char *str) {
int has_err = 0;
Tokenizer t;
- t.cap = 256;
- t.ntokens = 0;
- t.tokens = err_malloc(t.cap * sizeof(*t.tokens));
-
- LineNo line = 1;
- LineNo col = 1;
+ tokens_create(&t.tokens);
+ tokens_reserve(&t.tokens, 256);
+ t.s = str;
+ t.line = 1;
while (1) {
- if (*s == 0) break;
- if (isspace(*s)) {
- if (*s == '\n') {
- line++;
- col = 0;
- }
- s++; col++;
+ if (*t.s == 0) break;
+ if (isspace(*t.s)) {
+ tokr_nextchar(&t);
continue;
}
- if (*s == '/') {
+ if (*t.s == '/') {
/* maybe it's a comment */
int is_comment = 1;
- s++; col++;
- switch (*s) {
+ switch (t.s[1]) {
case '/': /* single line comment */
- for (s++; *s != '\n' && *s; s++);
- line++;
- col = 1;
+ tokr_nextchar(&t);
+ for (t.s++; *t.s != '\n' && *t.s; t.s++);
+ /* the newline itself is consumed and counted by the isspace branch above */
break;
case '*': { /* multi line comment */
+ t.s += 2; /* skip both chars of the opening delimiter, so its '*' can't pair with a following '/' to close the comment */
int comment_level = 1; /* allow nested multi-line comments */
- while (*s) {
- if (*s == '\n') {
- line++;
- col = 1;
- s++;
- continue;
- }
- if (s[0] == '*' && s[1] == '/') {
- s += 2; col += 2;
+ while (*t.s) {
+ if (t.s[0] == '*' && t.s[1] == '/') {
+ t.s += 2;
comment_level--;
if (comment_level == 0) {
break;
}
- } else if (s[0] == '/' && s[1] == '*') {
- s += 2; col += 2;
+ } else if (t.s[0] == '/' && t.s[1] == '*') {
+ t.s += 2;
comment_level++;
} else {
- s++; col++;
+ tokr_nextchar(&t);
}
}
- if (*s == 0) {
- err_print(line, col, "End of file reached inside multi-line comment.");
+ if (*t.s == 0) {
+ tokenization_err(&t, "End of file reached inside multi-line comment.");
abort(); /* there won't be any further errors, of course */
}
} break;
default:
is_comment = 0;
- s--; /* go back */
break;
}
if (is_comment) continue;
}
Keyword kw;
for (kw = 0; kw < KW_COUNT; kw++) {
- if (strncmp(s, keywords[kw], strlen(keywords[kw])) == 0) {
+ if (strncmp(t.s, keywords[kw], strlen(keywords[kw])) == 0) {
break;
}
}
if (kw != KW_COUNT) {
/* it's a keyword */
- Token token;
+ Token token = {0};
token.kind = TOKEN_KW;
token.kw = kw;
- tokenizer_add(&t, &token, line, col);
- col += (LineNo)strlen(keywords[kw]);
- s += (LineNo)strlen(keywords[kw]);
+ tokr_add(&t, &token);
+ t.s += strlen(keywords[kw]);
continue;
}
/* check if it's a number */
- if (isdigit(*s)) {
+ if (isdigit(*t.s)) {
/* it's a numeric constant */
int base = 10;
RealConst decimal_pow10;
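+ /* decimal_pow10 holds the place value of the next fractional digit; set when '.' is seen */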
NumConst n;
n.kind = NUM_CONST_INT;
n.intval = 0;
- LineNo line_start = line, col_start = col;
- if (*s == '0') {
- s++; col++;
+ Token token = {0};
+ token.line = t.line;
+ token.code = t.s;
+ if (*t.s == '0') {
+ tokr_nextchar(&t);
/* octal/hexadecimal/binary (or zero) */
- char format = *s;
+ char format = *t.s;
if (isdigit(format)) /* octal */
base = 8;
else {
switch (format) {
case 'b':
base = 2;
- s++; col++;
+ tokr_nextchar(&t);
break;
case 'x':
base = 16;
- s++; col++;
+ tokr_nextchar(&t);
break;
default:
/* it's 0/0.something etc. */
@@ -215,40 +282,39 @@ static Tokenizer tokenize_string(char *s) { /* NOTE: May modify string. Don't ev
}
while (1) {
- if (*s == '.') {
+ if (*t.s == '.') {
if (n.kind == NUM_CONST_REAL) {
- err_print(line, col, "Double . in number.");
+ tokenization_err(&t, "Double . in number.");
goto err;
}
if (base != 10) {
- err_print(line, col, "Decimal point in non base 10 number.");
+ tokenization_err(&t, "Decimal point in non base 10 number.");
goto err;
}
n.kind = NUM_CONST_REAL;
decimal_pow10 = 0.1;
n.realval = (RealConst)n.intval;
- s++, col++;
+ tokr_nextchar(&t);
continue;
- } else if (*s == 'e') {
- s++; col++;
+ } else if (*t.s == 'e') {
+ tokr_nextchar(&t);
if (n.kind == NUM_CONST_INT) {
n.kind = NUM_CONST_REAL;
n.realval = (RealConst)n.intval;
}
/* TODO: check if exceeding maximum exponent */
int exponent = 0;
- if (*s == '+') {
- s++; col++;
- }
+ if (*t.s == '+')
+ tokr_nextchar(&t); /* ignore + after e */
int negative_exponent = 0;
- if (*s == '-') {
- s++; col++;
+ if (*t.s == '-') {
+ tokr_nextchar(&t);
negative_exponent = 1;
}
- for (; isdigit(*s); s++, col++) {
+ for (; isdigit(*t.s); tokr_nextchar(&t)) {
exponent *= 10;
- exponent += *s - '0';
+ exponent += *t.s - '0';
}
/* OPTIM: Slow for very large exponents (unlikely to happen) */
for (int i = 0; i < exponent; i++) {
@@ -262,19 +328,19 @@ static Tokenizer tokenize_string(char *s) { /* NOTE: May modify string. Don't ev
}
int digit = -1;
if (base == 16) {
- if (*s >= 'a' && *s <= 'f')
- digit = 10 + *s - 'a';
- else if (*s >= 'A' && *s <= 'F')
- digit = *s - 'A';
+ if (*t.s >= 'a' && *t.s <= 'f')
+ digit = 10 + *t.s - 'a';
+ else if (*t.s >= 'A' && *t.s <= 'F')
+ digit = 10 + *t.s - 'A';
}
if (digit == -1) {
- if (*s >= '0' && *s <= '9')
- digit = *s - '0';
+ if (*t.s >= '0' && *t.s <= '9')
+ digit = *t.s - '0';
}
if (digit < 0 || digit >= base) {
- if (isdigit(*s)) {
+ if (isdigit(*t.s)) {
/* something like 0b011012 */
- err_print(line, col, "Digit %d cannot appear in a base %d number.", digit, base);
+ tokenization_err(&t, "Digit %d cannot appear in a base %d number.", digit, base);
goto err;
}
/* end of numeric constant */
@@ -282,9 +348,10 @@ static Tokenizer tokenize_string(char *s) { /* NOTE: May modify string. Don't ev
}
switch (n.kind) {
case NUM_CONST_INT:
- if (n.intval > ULLONG_MAX / (IntConst)base) {
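+ /* two-part overflow check: the division test also guarantees the multiplication in the second test cannot wrap */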
+ if (n.intval > ULLONG_MAX / (IntConst)base ||
+ n.intval * (IntConst)base > ULLONG_MAX - (IntConst)digit) {
/* too big! */
- err_print(line, col, "Number too big to fit in a numeric constant.");
+ tokenization_err(&t, "Number too big to fit in a numeric constant.");
goto err;
}
n.intval *= (IntConst)base;
@@ -295,88 +362,128 @@ static Tokenizer tokenize_string(char *s) { /* NOTE: May modify string. Don't ev
decimal_pow10 /= 10;
break;
}
- s++; col++;
+ tokr_nextchar(&t);
}
- Token token;
token.kind = TOKEN_NUM_CONST;
token.num = n;
- tokenizer_add(&t, &token, line_start, col_start);
+ tokr_add(&t, &token);
continue;
}
- if (*s == '\'') {
+ if (*t.s == '\'') {
/* it's a character constant! */
- s++; col++;
+ tokr_nextchar(&t);
+ Token token = {0};
+ token.line = t.line;
+ token.code = t.s;
char c;
- if (*s == '\\') {
+ if (*t.s == '\\') {
/* escape sequence */
- s++; col++;
- /* TODO: Separate into function when string literals are added; add more of these */
- switch (*s) {
- case '\'':
- c = '\'';
- break;
- case '\\':
- c = '\\';
- break;
- case 'n':
- c = '\n';
- break;
- default:
- err_print(line, col, "Unrecognized escape character: '%c'.", *s);
+ tokr_nextchar(&t);
+ c = tokr_esc_seq(&t);
+ if (c == 0) {
+ tokenization_err(&t, "Unrecognized escape character: '\\%c'.", *t.s);
goto err;
}
} else {
- c = *s;
+ c = *t.s;
+ tokr_nextchar(&t);
}
- s++; col++;
- if (*s != '\'') {
- err_print(line, col, "End of character constant expected.");
+ if (*t.s != '\'') {
+ tokenization_err(&t, "End of character constant expected.");
goto err;
}
- s++; col++;
- Token token;
+ tokr_nextchar(&t);
token.kind = TOKEN_CHAR_CONST;
token.chr = c;
- tokenizer_add(&t, &token, line, col);
+ tokr_add(&t, &token);
+ continue;
+ }
+
+ if (*t.s == '"') {
+ /* it's a string constant! */
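+ /* two passes: first measure up to the matching quote, then allocate and decode the escapes */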
+ Token token = {0};
+ token.line = t.line;
+ token.code = t.s;
+ tokr_nextchar(&t);
+ size_t len = 0;
+ size_t backslashes = 0;
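+ /* a quote preceded by an odd number of backslashes is escaped, not a terminator */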
+ while (*t.s != '"' || backslashes % 2 == 1) {
+ if (*t.s == '\\') {
+ backslashes++;
+ } else if (*t.s == 0) {
+ /* return t to opening " so that we go to the next line */
+ t.line = token.line;
+ t.s = token.code;
+ tokenization_err(&t, "No matching \" found.");
+ goto err;
+ } else {
+ backslashes = 0;
+ }
+ len++;
+ tokr_nextchar(&t);
+ }
+ char *str = err_malloc(len + 1); /* len may overcount (it includes escape backslashes), which is safe for allocation */
+ char *strptr = str;
+ t.s = token.code;
+ t.line = token.line;
+ tokr_nextchar(&t); /* past opening " */
+ while (*t.s != '"') {
+ assert(*t.s);
+ if (*t.s == '\\') {
+ tokr_nextchar(&t);
+ char c = tokr_esc_seq(&t);
+ if (c == 0) {
+ tokenization_err(&t, "Unrecognized escape character: '\\%c'.", *t.s);
+ goto err;
+ }
+ *strptr++ = c;
+ } else {
+ *strptr++ = *t.s;
+ tokr_nextchar(&t);
+ }
+ }
+ *strptr = 0;
+ token.kind = TOKEN_STR_CONST;
+ token.str.len = (size_t)(strptr - str); /* decoded length; len includes escape backslashes */
+ token.str.str = str;
+ tokr_add(&t, &token);
+ tokr_nextchar(&t); /* move past closing " */
continue;
}
- if (isidentstart(*s)) {
+ if (isidentstart(*t.s)) {
/* it's an identifier */
- Identifier ident = ident_insert(&s);
- Token token;
+ Token token = {0};
+ token.line = t.line;
+ token.code = t.s;
+ Identifier ident = ident_insert(&t.s);
token.kind = TOKEN_IDENT;
token.ident = ident;
- tokenizer_add(&t, &token, line, col);
+ tokr_add(&t, &token);
continue;
- }
- int has_newline;
- char *end_of_line = strchr(s, '\n');
- has_newline = end_of_line != NULL;
- if (has_newline)
- *end_of_line = 0;
-
- err_print(line, col, TEXT_IMPORTANT("Unrecognized token:") "\n\there --> %s\n", s);
- if (has_newline)
- *end_of_line = '\n';
+ }
+ tokenization_err(&t, "Token not recognized");
err:
has_err = 1;
- s = strchr(s, '\n');
- if (s == NULL) break;
- s++; /* move past newline */
- col = 1;
- line++;
-
- }
- if (has_err) {
- fprintf(stderr, TEXT_IMPORTANT("Errors occured while preprocessing.\n"));
- abort();
}
- t.token = t.tokens;
- return t;
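+ /* always terminate the stream with an EOF token so consumers (e.g. tokr_err) can stop without bounds checks */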
+ Token token = {0};
+ token.kind = TOKEN_EOF;
+ tokr_add(&t, &token);
+
+ t.token = t.tokens.data;
+ *tokr = t;
+ return !has_err;
}
-static void tokenizer_free(Tokenizer *t) {
- free(t->tokens);
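+/* string constants own heap-allocated copies of their text, so free those before clearing the array */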
+static void tokr_free(Tokenizer *t) {
+ arr_foreach(t->tokens, Token, token) {
+ switch (token->kind) {
+ case TOKEN_STR_CONST:
+ free(token->str.str);
+ break;
+ default: break;
+ }
+ }
+ tokens_clear(&t->tokens);
}