diff options
Diffstat (limited to 'tokenizer.c')
-rw-r--r-- | tokenizer.c | 201 |
1 files changed, 102 insertions, 99 deletions
diff --git a/tokenizer.c b/tokenizer.c index a8932e2..73ae7d0 100644 --- a/tokenizer.c +++ b/tokenizer.c @@ -101,6 +101,7 @@ typedef struct { char *s; /* string being parsed */ LineNo line; Token *token; /* token currently being processed */ + Identifiers *idents; } Tokenizer; @@ -214,49 +215,52 @@ static void tokr_get_location(Tokenizer *tokr, Token *t) { tokr->s = t->where.code; } -static bool tokenize_string(Tokenizer *tokr, char *str) { +static void tokr_create(Tokenizer *t, Identifiers *idents) { + arr_create(&t->tokens, sizeof(Token)); + arr_reserve(&t->tokens, 256); + t->idents = idents; +} + +static bool tokenize_string(Tokenizer *t, char *str) { int has_err = 0; - Tokenizer t; - arr_create(&t.tokens, sizeof(Token)); - arr_reserve(&t.tokens, 256); - t.s = str; - t.line = 1; + t->s = str; + t->line = 1; while (1) { - if (*t.s == 0) break; - if (isspace(*t.s)) { - tokr_nextchar(&t); + if (*t->s == 0) break; + if (isspace(*t->s)) { + tokr_nextchar(t); continue; } - if (*t.s == '/') { + if (*t->s == '/') { /* maybe it's a comment */ int is_comment = 1; - switch (t.s[1]) { + switch (t->s[1]) { case '/': /* single line comment */ - tokr_nextchar(&t); - for (t.s++; *t.s != '\n' && *t.s; t.s++); - t.line++; + tokr_nextchar(t); + for (t->s++; *t->s != '\n' && *t->s; t->s++); + t->line++; break; case '*': { /* multi line comment */ - tokr_nextchar(&t); + tokr_nextchar(t); int comment_level = 1; /* allow nested multi-line comments */ - while (*t.s) { - if (t.s[0] == '*' && t.s[1] == '/') { - t.s += 2; + while (*t->s) { + if (t->s[0] == '*' && t->s[1] == '/') { + t->s += 2; comment_level--; if (comment_level == 0) { break; } - } else if (t.s[0] == '/' && t.s[1] == '*') { - t.s += 2; + } else if (t->s[0] == '/' && t->s[1] == '*') { + t->s += 2; comment_level++; } else { - tokr_nextchar(&t); + tokr_nextchar(t); } } - if (*t.s == 0) { - tokenization_err(&t, "End of file reached inside multi-line comment."); + if (*t->s == 0) { + tokenization_err(t, "End of file reached inside multi-line comment."); abort(); /* there won't be any further errors, of course */ } } break; @@ -267,12 +271,12 @@ static bool tokenize_string(Tokenizer *tokr, char *str) { if (is_comment) continue; } { - char *start_s = t.s; - Keyword kw = tokenize_kw(&t.s); + char *start_s = t->s; + Keyword kw = tokenize_kw(&t->s); if (kw != KW_COUNT) { /* it's a keyword */ - Token *token = tokr_add(&t); - token->where.line = t.line; + Token *token = tokr_add(t); + token->where.line = t->line; token->where.code = start_s; token->kind = TOKEN_KW; token->kw = kw; @@ -282,30 +286,30 @@ static bool tokenize_string(Tokenizer *tokr, char *str) { /* check if it's a number */ - if (isdigit(*t.s)) { + if (isdigit(*t->s)) { /* it's a numeric literal */ int base = 10; Floating decimal_pow10; NumLiteral n; n.kind = NUM_LITERAL_INT; n.intval = 0; - Token *token = tokr_add(&t); - tokr_put_location(&t, token); - if (*t.s == '0') { - tokr_nextchar(&t); + Token *token = tokr_add(t); + tokr_put_location(t, token); + if (*t->s == '0') { + tokr_nextchar(t); /* octal/hexadecimal/binary (or zero) */ - char format = *t.s; + char format = *t->s; if (isdigit(format)) /* octal */ base = 8; else { switch (format) { case 'b': base = 2; - tokr_nextchar(&t); + tokr_nextchar(t); break; case 'x': base = 16; - tokr_nextchar(&t); + tokr_nextchar(t); break; default: /* it's 0/0.something etc. */ @@ -315,39 +319,39 @@ static bool tokenize_string(Tokenizer *tokr, char *str) { } while (1) { - if (*t.s == '.') { + if (*t->s == '.') { if (n.kind == NUM_LITERAL_FLOAT) { - tokenization_err(&t, "Double . in number."); + tokenization_err(t, "Double . in number."); goto err; } if (base != 10) { - tokenization_err(&t, "Decimal point in non base 10 number."); + tokenization_err(t, "Decimal point in non base 10 number."); goto err; } n.kind = NUM_LITERAL_FLOAT; decimal_pow10 = 0.1; n.floatval = (Floating)n.intval; - tokr_nextchar(&t); + tokr_nextchar(t); continue; - } else if (*t.s == 'e') { - tokr_nextchar(&t); + } else if (*t->s == 'e') { + tokr_nextchar(t); if (n.kind == NUM_LITERAL_INT) { n.kind = NUM_LITERAL_FLOAT; n.floatval = (Floating)n.intval; } /* TODO: check if exceeding maximum exponent */ int exponent = 0; - if (*t.s == '+') - tokr_nextchar(&t); /* ignore + after e */ + if (*t->s == '+') + tokr_nextchar(t); /* ignore + after e */ int negative_exponent = 0; - if (*t.s == '-') { - tokr_nextchar(&t); + if (*t->s == '-') { + tokr_nextchar(t); negative_exponent = 1; } - for (; isdigit(*t.s); tokr_nextchar(&t)) { + for (; isdigit(*t->s); tokr_nextchar(t)) { exponent *= 10; - exponent += *t.s - '0'; + exponent += *t->s - '0'; } /* OPTIM: Slow for very large exponents (unlikely to happen) */ for (int i = 0; i < exponent; i++) { @@ -361,19 +365,19 @@ static bool tokenize_string(Tokenizer *tokr, char *str) { } int digit = -1; if (base == 16) { - if (*t.s >= 'a' && *t.s <= 'f') - digit = 10 + *t.s - 'a'; - else if (*t.s >= 'A' && *t.s <= 'F') - digit = *t.s - 'A'; + if (*t->s >= 'a' && *t->s <= 'f') + digit = 10 + *t->s - 'a'; + else if (*t->s >= 'A' && *t->s <= 'F') + digit = *t->s - 'A'; } if (digit == -1) { - if (*t.s >= '0' && *t.s <= '9') - digit = *t.s - '0'; + if (*t->s >= '0' && *t->s <= '9') + digit = *t->s - '0'; } if (digit < 0 || digit >= base) { - if (isdigit(*t.s)) { + if (isdigit(*t->s)) { /* something like 0b011012 */ - tokenization_err(&t, "Digit %d cannot appear in a base %d number.", digit, base); + tokenization_err(t, "Digit %d cannot appear in a base %d number.", digit, base); goto err; } /* end of numeric literal */ @@ -384,7 +388,7 @@ static bool tokenize_string(Tokenizer *tokr, char *str) { if (n.intval > ULLONG_MAX / (UInteger)base || n.intval * (UInteger)base > ULLONG_MAX - (UInteger)digit) { /* too big! */ - tokenization_err(&t, "Number too big to fit in a numeric literal."); + tokenization_err(t, "Number too big to fit in a numeric literal."); goto err; } n.intval *= (UInteger)base; @@ -395,107 +399,106 @@ static bool tokenize_string(Tokenizer *tokr, char *str) { decimal_pow10 /= 10; break; } - tokr_nextchar(&t); + tokr_nextchar(t); } token->kind = TOKEN_NUM_LITERAL; token->num = n; continue; } - if (*t.s == '\'') { + if (*t->s == '\'') { /* it's a character literal! */ - tokr_nextchar(&t); - Token *token = tokr_add(&t); - tokr_put_location(&t, token); + tokr_nextchar(t); + Token *token = tokr_add(t); + tokr_put_location(t, token); char c; - if (*t.s == '\\') { + if (*t->s == '\\') { /* escape sequence */ - tokr_nextchar(&t); - c = tokr_esc_seq(&t); + tokr_nextchar(t); + c = tokr_esc_seq(t); if (c == 0) { - tokenization_err(&t, "Unrecognized escape character: '\\%c'.", *t.s); + tokenization_err(t, "Unrecognized escape character: '\\%c'.", *t->s); goto err; } } else { - c = *t.s; - tokr_nextchar(&t); + c = *t->s; + tokr_nextchar(t); } - if (*t.s != '\'') { - tokenization_err(&t, "End of character literal expected."); + if (*t->s != '\'') { + tokenization_err(t, "End of character literal expected."); goto err; } - tokr_nextchar(&t); + tokr_nextchar(t); token->kind = TOKEN_CHAR_LITERAL; token->chr = c; continue; } - if (*t.s == '"') { + if (*t->s == '"') { /* it's a string literal! */ - Token *token = tokr_add(&t); - tokr_put_location(&t, token); - tokr_nextchar(&t); + Token *token = tokr_add(t); + tokr_put_location(t, token); + tokr_nextchar(t); size_t len = 0; size_t backslashes = 0; - while (*t.s != '"' || backslashes % 2 == 1) { - if (*t.s == '\\') { + while (*t->s != '"' || backslashes % 2 == 1) { + if (*t->s == '\\') { backslashes++; - } else if (*t.s == 0) { + } else if (*t->s == 0) { /* return t to opening " so that we go to the next line */ - tokr_get_location(&t, token); - tokenization_err(&t, "No matching \" found."); + tokr_get_location(t, token); + tokenization_err(t, "No matching \" found."); goto err; } else { backslashes = 0; } len++; - tokr_nextchar(&t); + tokr_nextchar(t); } char *strlit = malloc(len + 1); char *strptr = strlit; - tokr_get_location(&t, token); - tokr_nextchar(&t); /* past opening " */ - while (*t.s != '"') { - assert(*t.s); - if (*t.s == '\\') { - tokr_nextchar(&t); - char c = tokr_esc_seq(&t); + tokr_get_location(t, token); + tokr_nextchar(t); /* past opening " */ + while (*t->s != '"') { + assert(*t->s); + if (*t->s == '\\') { + tokr_nextchar(t); + char c = tokr_esc_seq(t); if (c == 0) { - tokenization_err(&t, "Unrecognized escape character: '\\%c'.", *t.s); + tokenization_err(t, "Unrecognized escape character: '\\%c'.", *t->s); goto err; } *strptr++ = c; } else { - *strptr++ = *t.s; - tokr_nextchar(&t); + *strptr++ = *t->s; + tokr_nextchar(t); } } *strptr = 0; token->kind = TOKEN_STR_LITERAL; token->str.len = len; token->str.str = strlit; - tokr_nextchar(&t); /* move past closing " */ + tokr_nextchar(t); /* move past closing " */ continue; } - if (isident(*t.s)) { + if (isident(*t->s)) { /* it's an identifier */ - Token *token = tokr_add(&t); - tokr_put_location(&t, token); - Identifier ident = ident_insert(&t.s); + Token *token = tokr_add(t); + tokr_put_location(t, token); + Identifier ident = ident_insert(t->idents, &t->s); token->kind = TOKEN_IDENT; token->ident = ident; continue; } - tokenization_err(&t, "Token not recognized"); + tokenization_err(t, "Token not recognized"); err: has_err = 1; } - Token *token = tokr_add(&t); + Token *token = tokr_add(t); token->kind = TOKEN_EOF; - t.token = t.tokens.data; - *tokr = t; + t->token = t->tokens.data; return !has_err; } |