diff options
Diffstat (limited to 'tokenizer.c')
-rw-r--r-- | tokenizer.c | 391 |
1 files changed, 249 insertions, 142 deletions
diff --git a/tokenizer.c b/tokenizer.c index 5fafd3b..7782ca3 100644 --- a/tokenizer.c +++ b/tokenizer.c @@ -3,28 +3,35 @@ typedef enum { TOKEN_IDENT, TOKEN_NUM_CONST, TOKEN_CHAR_CONST, + TOKEN_STR_CONST, TOKEN_EOF - /* TODO: char constnats, str constants */ } TokenKind; typedef enum { KW_SEMICOLON, + KW_EQ, + KW_COLON, + KW_FN, + KW_LPAREN, + KW_RPAREN, + KW_LBRACE, + KW_RBRACE, KW_EQEQ, KW_LT, KW_LE, - KW_EQ, + KW_MINUS, KW_COUNT } Keyword; /* OPTIM: Use a trie or just a function if this gets too long */ static const char *keywords[KW_COUNT] = - {";", "==", "<", "<=", "="}; + {";", "=", ":", "fn", "(", ")", "{", "}", "==", "<", "<=", "-"}; -#define TOKENIZER_USE_LLONG 1 +#define TOKR_USE_LLONG 1 typedef unsigned long long IntConst; -typedef long double RealConst; /* OPTIM: Maybe only use double */ +typedef long double RealConst; /* OPTIM: Switch to double */ typedef enum { NUM_CONST_INT, @@ -39,28 +46,40 @@ typedef struct { }; } NumConst; +typedef struct { + char *str; + size_t len; +} StrConst; + /* NOTE: LineNo is typedef'd in util/err.c */ typedef struct { TokenKind kind; LineNo line; - LineNo col; + char *code; union { Keyword kw; Identifier ident; NumConst num; char chr; + StrConst str; }; } Token; +arr_declaration(Tokens, Token, tokens_) + typedef struct { - Token *tokens; - size_t ntokens; - size_t cap; /* used internally */ + Tokens tokens; + char *s; /* string being parsed */ + LineNo line; Token *token; /* token currently being processed */ } Tokenizer; +static bool token_is_kw(Token *t, Keyword kw) { + return t->kind == TOKEN_KW && t->kw == kw; +} + static void token_fprint(FILE *out, Token *t) { - fprintf(out, "l%luc%lu-", (unsigned long)t->line, (unsigned long)t->col); + fprintf(out, "l%lu-", (unsigned long)t->line); switch (t->kind) { case TOKEN_KW: fprintf(out, "keyword: %s", keywords[t->kw]); @@ -83,129 +102,177 @@ static void token_fprint(FILE *out, Token *t) { case TOKEN_CHAR_CONST: fprintf(out, "char: '%c' (%d)", t->chr, t->chr); break; + case TOKEN_STR_CONST: + fprintf(out, "str: \"%s\"", t->str.str); + break; case TOKEN_EOF: fprintf(out, "eof"); break; } } -static void tokenizer_add(Tokenizer *t, Token *token, LineNo line, LineNo col) { - if (t->ntokens >= t->cap) { - t->cap *= 2; - t->tokens = err_realloc(t->tokens, t->cap * sizeof(*t->tokens)); +static void tokr_add(Tokenizer *t, Token *token) { + if (!token->line) + token->line = t->line; + if (!token->code) + token->code = t->s; + tokens_add(&t->tokens, token); +} + +static void tokr_nextchar(Tokenizer *t) { + if (*(t->s) == '\n') { + t->line++; + } + t->s++; +} + +static char tokr_esc_seq(Tokenizer *t) { + /* TODO: add more of these incl. \x41, \100 */ + switch (*t->s) { + case '\'': + tokr_nextchar(t); + return '\''; + case '"': + tokr_nextchar(t); + return '"'; + case '\\': + tokr_nextchar(t); + return '\\'; + case 'n': + tokr_nextchar(t); + return '\n'; + default: + return 0; } - token->line = line; - token->col = col; - t->tokens[t->ntokens++] = *token; + } -static Tokenizer tokenize_string(char *s) { /* NOTE: May modify string. Don't even try to pass it a string literal.*/ +/* to be used during tokenization */ +static void tokenization_err(Tokenizer *t, const char *fmt, ...) { + va_list args; + va_start(args, fmt); + err_vprint(t->line, t->s, fmt, args); + va_end(args); + + char *end_of_line = strchr(t->s, '\n'); + if (end_of_line) { + t->s = end_of_line; + t->s++; /* move past newline */ + } else { + t->s = strchr(t->s, '\0'); + } + t->line++; +} + +/* to be used after tokenization */ +static void tokr_err(Tokenizer *t, const char *fmt, ...) { + LineNo line = t->token->line; + va_list args; + va_start(args, fmt); + err_vprint(line, t->token->code, fmt, args); + va_end(args); + while (1) { + if (t->token->line != line) break; + if (t->token->kind == TOKEN_EOF) break; + t->token++; + } +} + +static bool tokenize_string(Tokenizer *tokr, char *str) { int has_err = 0; Tokenizer t; - t.cap = 256; - t.ntokens = 0; - t.tokens = err_malloc(t.cap * sizeof(*t.tokens)); - - LineNo line = 1; - LineNo col = 1; + tokens_create(&t.tokens); + tokens_reserve(&t.tokens, 256); + t.s = str; + t.line = 1; while (1) { - if (*s == 0) break; - if (isspace(*s)) { - if (*s == '\n') { - line++; - col = 0; - } - s++; col++; + if (*t.s == 0) break; + if (isspace(*t.s)) { + tokr_nextchar(&t); continue; } - if (*s == '/') { + if (*t.s == '/') { /* maybe it's a comment */ int is_comment = 1; - s++; col++; - switch (*s) { + switch (t.s[1]) { case '/': /* single line comment */ - for (s++; *s != '\n' && *s; s++); - line++; - col = 1; + tokr_nextchar(&t); + for (t.s++; *t.s != '\n' && *t.s; t.s++); + t.line++; break; case '*': { /* multi line comment */ + tokr_nextchar(&t); int comment_level = 1; /* allow nested multi-line comments */ - while (*s) { - if (*s == '\n') { - line++; - col = 1; - s++; - continue; - } - if (s[0] == '*' && s[1] == '/') { - s += 2; col += 2; + while (*t.s) { + if (t.s[0] == '*' && t.s[1] == '/') { + t.s += 2; comment_level--; if (comment_level == 0) { break; } - } else if (s[0] == '/' && s[1] == '*') { - s += 2; col += 2; + } else if (t.s[0] == '/' && t.s[1] == '*') { + t.s += 2; comment_level++; } else { - s++; col++; + tokr_nextchar(&t); } } - if (*s == 0) { - err_print(line, col, "End of file reached inside multi-line comment."); + if (*t.s == 0) { + tokenization_err(&t, "End of file reached inside multi-line comment."); abort(); /* there won't be any further errors, of course */ } } break; default: is_comment = 0; - s--; /* go back */ break; } if (is_comment) continue; } Keyword kw; for (kw = 0; kw < KW_COUNT; kw++) { - if (strncmp(s, keywords[kw], strlen(keywords[kw])) == 0) { + if (strncmp(t.s, keywords[kw], strlen(keywords[kw])) == 0) { break; } } if (kw != KW_COUNT) { /* it's a keyword */ - Token token; + Token token = {0}; token.kind = TOKEN_KW; token.kw = kw; - tokenizer_add(&t, &token, line, col); - col += (LineNo)strlen(keywords[kw]); - s += (LineNo)strlen(keywords[kw]); + tokr_add(&t, &token); + t.s += (LineNo)strlen(keywords[kw]); continue; } /* check if it's a number */ - if (isdigit(*s)) { + if (isdigit(*t.s)) { /* it's a numeric constant */ int base = 10; RealConst decimal_pow10; NumConst n; n.kind = NUM_CONST_INT; n.intval = 0; - LineNo line_start = line, col_start = col; - if (*s == '0') { - s++; col++; + Token token = {0}; + token.line = t.line; + token.code = t.s; + if (*t.s == '0') { + tokr_nextchar(&t); /* octal/hexadecimal/binary (or zero) */ - char format = *s; + char format = *t.s; if (isdigit(format)) /* octal */ base = 8; else { switch (format) { case 'b': base = 2; - s++; col++; + tokr_nextchar(&t); break; case 'x': base = 16; - s++; col++; + tokr_nextchar(&t); break; default: /* it's 0/0.something etc. */ @@ -215,40 +282,39 @@ static Tokenizer tokenize_string(char *s) { /* NOTE: May modify string. Don't ev } while (1) { - if (*s == '.') { + if (*t.s == '.') { if (n.kind == NUM_CONST_REAL) { - err_print(line, col, "Double . in number."); + tokenization_err(&t, "Double . in number."); goto err; } if (base != 10) { - err_print(line, col, "Decimal point in non base 10 number."); + tokenization_err(&t, "Decimal point in non base 10 number."); goto err; } n.kind = NUM_CONST_REAL; decimal_pow10 = 0.1; n.realval = (RealConst)n.intval; - s++, col++; + tokr_nextchar(&t); continue; - } else if (*s == 'e') { - s++; col++; + } else if (*t.s == 'e') { + tokr_nextchar(&t); if (n.kind == NUM_CONST_INT) { n.kind = NUM_CONST_REAL; n.realval = (RealConst)n.intval; } /* TODO: check if exceeding maximum exponent */ int exponent = 0; - if (*s == '+') { - s++; col++; - } + if (*t.s == '+') + tokr_nextchar(&t); /* ignore + after e */ int negative_exponent = 0; - if (*s == '-') { - s++; col++; + if (*t.s == '-') { + tokr_nextchar(&t); negative_exponent = 1; } - for (; isdigit(*s); s++, col++) { + for (; isdigit(*t.s); tokr_nextchar(&t)) { exponent *= 10; - exponent += *s - '0'; + exponent += *t.s - '0'; } /* OPTIM: Slow for very large exponents (unlikely to happen) */ for (int i = 0; i < exponent; i++) { @@ -262,19 +328,19 @@ static Tokenizer tokenize_string(char *s) { /* NOTE: May modify string. Don't ev } int digit = -1; if (base == 16) { - if (*s >= 'a' && *s <= 'f') - digit = 10 + *s - 'a'; - else if (*s >= 'A' && *s <= 'F') - digit = *s - 'A'; + if (*t.s >= 'a' && *t.s <= 'f') + digit = 10 + *t.s - 'a'; + else if (*t.s >= 'A' && *t.s <= 'F') + digit = *t.s - 'A'; } if (digit == -1) { - if (*s >= '0' && *s <= '9') - digit = *s - '0'; + if (*t.s >= '0' && *t.s <= '9') + digit = *t.s - '0'; } if (digit < 0 || digit >= base) { - if (isdigit(*s)) { + if (isdigit(*t.s)) { /* something like 0b011012 */ - err_print(line, col, "Digit %d cannot appear in a base %d number.", digit, base); + tokenization_err(&t, "Digit %d cannot appear in a base %d number.", digit, base); goto err; } /* end of numeric constant */ @@ -282,9 +348,10 @@ static Tokenizer tokenize_string(char *s) { /* NOTE: May modify string. Don't ev } switch (n.kind) { case NUM_CONST_INT: - if (n.intval > ULLONG_MAX / (IntConst)base) { + if (n.intval > ULLONG_MAX / (IntConst)base || + n.intval * (IntConst)base > ULLONG_MAX - (IntConst)digit) { /* too big! */ - err_print(line, col, "Number too big to fit in a numeric constant."); + tokenization_err(&t, "Number too big to fit in a numeric constant."); goto err; } n.intval *= (IntConst)base; @@ -295,88 +362,128 @@ static Tokenizer tokenize_string(char *s) { /* NOTE: May modify string. Don't ev decimal_pow10 /= 10; break; } - s++; col++; + tokr_nextchar(&t); } - Token token; token.kind = TOKEN_NUM_CONST; token.num = n; - tokenizer_add(&t, &token, line_start, col_start); + tokr_add(&t, &token); continue; } - if (*s == '\'') { + if (*t.s == '\'') { /* it's a character constant! */ - s++; col++; + tokr_nextchar(&t); + Token token = {0}; + token.line = t.line; + token.code = t.s; char c; - if (*s == '\\') { + if (*t.s == '\\') { /* escape sequence */ - s++; col++; - /* TODO: Separate into function when string literals are added; add more of these */ - switch (*s) { - case '\'': - c = '\''; - break; - case '\\': - c = '\\'; - break; - case 'n': - c = '\n'; - break; - default: - err_print(line, col, "Unrecognized escape character: '%c'.", *s); + tokr_nextchar(&t); + c = tokr_esc_seq(&t); + if (c == 0) { + tokenization_err(&t, "Unrecognized escape character: '\\%c'.", *t.s); goto err; } } else { - c = *s; + c = *t.s; + tokr_nextchar(&t); } - s++; col++; - if (*s != '\'') { - err_print(line, col, "End of character constant expected."); + if (*t.s != '\'') { + tokenization_err(&t, "End of character constant expected."); goto err; } - s++; col++; - Token token; + tokr_nextchar(&t); token.kind = TOKEN_CHAR_CONST; token.chr = c; - tokenizer_add(&t, &token, line, col); + tokr_add(&t, &token); + continue; + } + + if (*t.s == '"') { + /* it's a string constant! */ + Token token; + token.line = t.line; + token.code = t.s; + tokr_nextchar(&t); + size_t len = 0; + size_t backslashes = 0; + while (*t.s != '"' || backslashes % 2 == 1) { + if (*t.s == '\\') { + backslashes++; + } else if (*t.s == 0) { + /* return t to opening " so that we go to the next line */ + t.line = token.line; + t.s = token.code; + tokenization_err(&t, "No matching \" found."); + goto err; + } else { + backslashes = 0; + } + len++; + tokr_nextchar(&t); + } + char *str = malloc(len + 1); + char *strptr = str; + t.s = token.code; + t.line = token.line; + tokr_nextchar(&t); /* past opening " */ + while (*t.s != '"') { + assert(*t.s); + if (*t.s == '\\') { + tokr_nextchar(&t); + char c = tokr_esc_seq(&t); + if (c == 0) { + tokenization_err(&t, "Unrecognized escape character: '\\%c'.", *t.s); + goto err; + } + *strptr++ = c; + } else { + *strptr++ = *t.s; + tokr_nextchar(&t); + } + } + *strptr = 0; + token.kind = TOKEN_STR_CONST; + token.str.len = len; + token.str.str = str; + tokr_add(&t, &token); + tokr_nextchar(&t); /* move past closing " */ continue; } - if (isidentstart(*s)) { + if (isidentstart(*t.s)) { /* it's an identifier */ - Identifier ident = ident_insert(&s); - Token token; + Token token = {0}; + token.line = t.line; + token.code = t.s; + Identifier ident = ident_insert(&t.s); token.kind = TOKEN_IDENT; token.ident = ident; - tokenizer_add(&t, &token, line, col); + tokr_add(&t, &token); continue; - } - int has_newline; - char *end_of_line = strchr(s, '\n'); - has_newline = end_of_line != NULL; - if (has_newline) - *end_of_line = 0; - - err_print(line, col, TEXT_IMPORTANT("Unrecognized token:") "\n\there --> %s\n", s); - if (has_newline) - *end_of_line = '\n'; + } + tokenization_err(&t, "Token not recognized"); err: has_err = 1; - s = strchr(s, '\n'); - if (s == NULL) break; - s++; /* move past newline */ - col = 1; - line++; - - } - if (has_err) { - fprintf(stderr, TEXT_IMPORTANT("Errors occured while preprocessing.\n")); - abort(); } - t.token = t.tokens; - return t; + Token token = {0}; + token.kind = TOKEN_EOF; + tokr_add(&t, &token); + + t.token = t.tokens.data; + *tokr = t; + return !has_err; } -static void tokenizer_free(Tokenizer *t) { - free(t->tokens); +static void tokr_free(Tokenizer *t) { + arr_foreach(t->tokens, Token, token) { + switch (token->kind) { + case TOKEN_STR_CONST: + free(token->str.str); + break; + default: break; + } + } + tokens_clear(&t->tokens); } |