diff options
Diffstat (limited to 'tokenizer.c')
-rw-r--r-- | tokenizer.c | 200 |
1 files changed, 175 insertions, 25 deletions
diff --git a/tokenizer.c b/tokenizer.c index 768693d..ff03e4f 100644 --- a/tokenizer.c +++ b/tokenizer.c @@ -1,7 +1,9 @@ typedef enum { TOKEN_KW, TOKEN_IDENT, + TOKEN_NUM_LITERAL, TOKEN_EOF + /* TODO: char literals, str literals */ } TokenKind; typedef enum { @@ -17,6 +19,37 @@ typedef enum { static const char *keywords[KW_COUNT] = {";", "==", "<", "<=", "="}; +#define TOKENIZER_USE_LLONG 1 + +#if TOKENIZER_USE_LLONG +typedef long long LiteralInt; +typedef unsigned long long LiteralUInt; +#define LITERAL_INT_FMT "%lld" +#define LITERAL_UINT_FMT "%llu" +#else +typedef long LiteralInt; +typedef unsigned long LiteralUInt; +#define LITERAL_INT_FMT "%ld" +#define LITERAL_UINT_FMT "%lu" +#endif + +typedef double LiteralReal; + +typedef enum { + NUM_LITERAL_INT, + NUM_LITERAL_UINT, + NUM_LITERAL_REAL +} NumLiteralKind; + +typedef struct { + NumLiteralKind kind; + union { + LiteralInt intval; + LiteralUInt uintval; + LiteralReal realval; + }; +} NumLiteral; + /* NOTE: LineNo is typedef'd in util/err.c */ typedef struct { TokenKind kind; @@ -25,6 +58,7 @@ typedef struct { union { Keyword kw; Identifier ident; + NumLiteral num; }; } Token; @@ -45,6 +79,20 @@ static void token_fprint(FILE *out, Token *t) { fprintf(out, "identifier: %ld:", t->ident->id); ident_fprint(out, t->ident); break; + case TOKEN_NUM_LITERAL: + fprintf(out, "number: "); + switch (t->num.kind) { + case NUM_LITERAL_INT: + fprintf(out, LITERAL_INT_FMT, t->num.intval); + break; + case NUM_LITERAL_UINT: + fprintf(out, LITERAL_UINT_FMT, t->num.uintval); + break; + case NUM_LITERAL_REAL: + fprintf(out, "%f", t->num.realval); + break; + } + break; case TOKEN_EOF: fprintf(out, "eof"); break; @@ -52,22 +100,19 @@ static void token_fprint(FILE *out, Token *t) { } static void tokenizer_add(Tokenizer *t, Token *token, LineNo line, LineNo col) { - if (t->ntokens == t->cap) { + if (t->ntokens >= t->cap) { t->cap *= 2; - t->tokens = realloc(t->tokens, t->cap); + t->tokens = err_realloc(t->tokens, t->cap); } token->line = line; token->col = col; t->tokens[t->ntokens++] = *token; } -static Tokenizer tokenize_file(FILE *fp) { - char buf[4096]; - setvbuf(fp, buf, _IOFBF, sizeof buf); - char errbuf[256] = {0}; /* for errors */ +static Tokenizer tokenize_string(char *s) { /* NOTE: May modify string. Don't even try to pass it a literal.*/ int has_err = 0; Tokenizer t; - t.cap = 4096; + t.cap = 4096; /* TODO: test more tokens than this */ t.ntokens = 0; t.tokens = malloc(t.cap * sizeof(*t.tokens)); @@ -75,20 +120,63 @@ static Tokenizer tokenize_file(FILE *fp) { LineNo col = 1; while (1) { - int c = fpeekc(fp); - if (c == EOF) break; - if (isspace(c)) { - if (c == '\n') { + if (*s == 0) break; + if (isspace(*s)) { + if (*s == '\n') { line++; col = 0; } - fnextc(fp); - col++; + s++; col++; continue; } + + if (*s == '/') { + /* maybe it's a comment */ + int is_comment = 1; + s++; col++; + switch (*s) { + case '/': /* single line comment */ + for (s++; *s != '\n' && *s; s++); + line++; + col = 1; + break; + case '*': { /* multi line comment */ + int comment_level = 1; /* allow nested multi-line comments */ + while (*s) { + if (*s == '\n') { + line++; + col = 1; + s++; + continue; + } + if (s[0] == '*' && s[1] == '/') { + s += 2; col += 2; + comment_level--; + if (comment_level == 0) { + break; + } + } else if (s[0] == '/' && s[1] == '*') { + s += 2; col += 2; + comment_level++; + } else { + s++; col++; + } + } + if (*s == 0) { + err_print(line, col, "End of file reached inside multi-line comment."); + abort(); /* there won't be any further errors, of course */ + } + } break; + default: + is_comment = 0; + s--; /* go back */ + break; + } + if (is_comment) continue; + } Keyword kw; for (kw = 0; kw < KW_COUNT; kw++) { - if (fhasprefix(fp, keywords[kw])) { + if (strncmp(s, keywords[kw], strlen(keywords[kw])) == 0) { break; } } @@ -99,35 +187,97 @@ static Tokenizer tokenize_file(FILE *fp) { token.kw = kw; tokenizer_add(&t, &token, line, col); col += (LineNo)strlen(keywords[kw]); + s += (LineNo)strlen(keywords[kw]); continue; } - if (isident(c)) { + if (isdigit(*s)) { + /* it's a numerical constant */ + int base = 10; + LiteralInt intval = 0; + LineNo line_start = line, col_start = col; + if (*s == '0') { + s++; col++; + /* octal/hexadecimal/binary (or zero) */ + char format = *s; + if (isdigit(format)) /* octal */ + base = 8; + else { + switch (format) { + case 'b': + base = 2; + s++; col++; + break; + case 'x': + base = 16; + s++; col++; + break; + default: + /* it's 0/0.something etc. */ + break; + } + } + } + while (1) { + if (*s == '.') { + /* TODO */ + } else if (*s == 'e') { + /* TODO */ + } + int digit = -1; + if (base == 16) { + if (*s >= 'a' && *s <= 'f') + digit = 10 + *s - 'a'; + else if (*s >= 'A' && *s <= 'F') + digit = *s - 'A'; + } + if (digit == -1) { + if (*s >= '0' && *s <= '9') + digit = *s - '0'; + } + if (digit < 0 || digit >= base) { + /* end of numerical literal */ + break; + } + /* TODO: check overflow; switch to uint */ + intval *= base; + intval += digit; + s++; col++; + } + Token token; + token.kind = TOKEN_NUM_LITERAL; + token.num.kind = NUM_LITERAL_INT; + token.num.intval = intval; + tokenizer_add(&t, &token, line_start, col_start); + continue; + } + + if (isident(*s)) { /* it's an identifier */ - Identifier ident = ident_finsert(fp); + Identifier ident = ident_insert(&s); Token token; token.kind = TOKEN_IDENT; token.ident = ident; tokenizer_add(&t, &token, line, col); continue; } + + int has_newline; + char *end_of_line = strchr(s, '\n'); + has_newline = end_of_line != NULL; + if (has_newline) + *end_of_line = 0; - fgets(errbuf, sizeof errbuf, fp); - size_t len = strlen(errbuf); - int has_newline = len && errbuf[len-1] == '\n'; - if (has_newline) { - /* remove newline */ - errbuf[len-1] = 0; - } - err_print(line, col, "Unrecognized token:\n\there --> %s\n", errbuf); + err_print(line, col, "Unrecognized token:\n\there --> %s\n", s); has_err = 1; if (has_newline) { /* increment line counter because of it */ line++; col = 1; } else { - col += (LineNo)(sizeof errbuf); + col += (LineNo)strlen(s); } + s += strlen(s); } /* TODO: Check ferror/errno */ if (has_err) { |