diff options
Diffstat (limited to 'tokenizer.c')
-rw-r--r-- | tokenizer.c | 134 |
1 files changed, 90 insertions, 44 deletions
diff --git a/tokenizer.c b/tokenizer.c index ff03e4f..b2b2eb3 100644 --- a/tokenizer.c +++ b/tokenizer.c @@ -21,23 +21,12 @@ static const char *keywords[KW_COUNT] = #define TOKENIZER_USE_LLONG 1 -#if TOKENIZER_USE_LLONG -typedef long long LiteralInt; -typedef unsigned long long LiteralUInt; -#define LITERAL_INT_FMT "%lld" -#define LITERAL_UINT_FMT "%llu" -#else -typedef long LiteralInt; -typedef unsigned long LiteralUInt; -#define LITERAL_INT_FMT "%ld" -#define LITERAL_UINT_FMT "%lu" -#endif +typedef unsigned long long LiteralInt; -typedef double LiteralReal; +typedef long double LiteralReal; /* OPTIM: Maybe only use double */ typedef enum { NUM_LITERAL_INT, - NUM_LITERAL_UINT, NUM_LITERAL_REAL } NumLiteralKind; @@ -45,7 +34,6 @@ typedef struct { NumLiteralKind kind; union { LiteralInt intval; - LiteralUInt uintval; LiteralReal realval; }; } NumLiteral; @@ -83,13 +71,10 @@ static void token_fprint(FILE *out, Token *t) { fprintf(out, "number: "); switch (t->num.kind) { case NUM_LITERAL_INT: - fprintf(out, LITERAL_INT_FMT, t->num.intval); - break; - case NUM_LITERAL_UINT: - fprintf(out, LITERAL_UINT_FMT, t->num.uintval); + fprintf(out, "%llu", t->num.intval); break; case NUM_LITERAL_REAL: - fprintf(out, "%f", t->num.realval); + fprintf(out, "%g", (double)t->num.realval); break; } break; @@ -102,7 +87,7 @@ static void token_fprint(FILE *out, Token *t) { static void tokenizer_add(Tokenizer *t, Token *token, LineNo line, LineNo col) { if (t->ntokens >= t->cap) { t->cap *= 2; - t->tokens = err_realloc(t->tokens, t->cap); + t->tokens = err_realloc(t->tokens, t->cap * sizeof(*t->tokens)); } token->line = line; token->col = col; @@ -112,9 +97,9 @@ static void tokenizer_add(Tokenizer *t, Token *token, LineNo line, LineNo col) { static Tokenizer tokenize_string(char *s) { /* NOTE: May modify string. Don't even try to pass it a literal.*/ int has_err = 0; Tokenizer t; - t.cap = 4096; /* TODO: test more tokens than this */ + t.cap = 256; t.ntokens = 0; - t.tokens = malloc(t.cap * sizeof(*t.tokens)); + t.tokens = err_malloc(t.cap * sizeof(*t.tokens)); LineNo line = 1; LineNo col = 1; @@ -190,11 +175,16 @@ static Tokenizer tokenize_string(char *s) { /* NOTE: May modify string. Don't ev s += (LineNo)strlen(keywords[kw]); continue; } + + /* check if it's a number */ if (isdigit(*s)) { - /* it's a numerical constant */ + /* it's a numerical literal */ int base = 10; - LiteralInt intval = 0; + LiteralReal decimal_pow10; + NumLiteral l; + l.kind = NUM_LITERAL_INT; + l.intval = 0; LineNo line_start = line, col_start = col; if (*s == '0') { s++; col++; @@ -218,11 +208,52 @@ static Tokenizer tokenize_string(char *s) { /* NOTE: May modify string. Don't ev } } } + while (1) { if (*s == '.') { - /* TODO */ + if (l.kind == NUM_LITERAL_REAL) { + err_print(line, col, "Double . in number."); + goto err; + } + if (base != 10) { + err_print(line, col, "Decimal point in non base 10 number."); + goto err; + } + l.kind = NUM_LITERAL_REAL; + decimal_pow10 = 0.1; + l.realval = (LiteralReal)l.intval; + s++, col++; + continue; } else if (*s == 'e') { - /* TODO */ + s++; col++; + if (l.kind == NUM_LITERAL_INT) { + l.kind = NUM_LITERAL_REAL; + l.realval = (LiteralReal)l.intval; + } + /* TODO: check if exceeding maximum exponent */ + int exponent = 0; + if (*s == '+') { + s++; col++; + } + + int negative_exponent = 0; + if (*s == '-') { + s++; col++; + negative_exponent = 1; + } + for (; isdigit(*s); s++, col++) { + exponent *= 10; + exponent += *s - '0'; + } + /* OPTIM: Slow for very large exponents (unlikely to happen) */ + for (int i = 0; i < exponent; i++) { + if (negative_exponent) + l.realval /= 10; + else + l.realval *= 10; + } + + break; } int digit = -1; if (base == 16) { @@ -236,23 +267,39 @@ static Tokenizer tokenize_string(char *s) { /* NOTE: May modify string. Don't ev digit = *s - '0'; } if (digit < 0 || digit >= base) { + if (isdigit(*s)) { + /* something like 0b011012 */ + err_print(line, col, "Digit %d cannot appear in a base %d number.", digit, base); + goto err; + } /* end of numerical literal */ break; } - /* TODO: check overflow; switch to uint */ - intval *= base; - intval += digit; + switch (l.kind) { + case NUM_LITERAL_INT: + if (l.intval > ULLONG_MAX / (LiteralInt)base) { + /* too big! */ + err_print(line, col, "Number too big to fit in a numerical literal."); + goto err; + } + l.intval *= (LiteralInt)base; + l.intval += (LiteralInt)digit; + break; + case NUM_LITERAL_REAL: + l.realval += decimal_pow10 * (LiteralReal)digit; + decimal_pow10 /= 10; + break; + } s++; col++; } Token token; token.kind = TOKEN_NUM_LITERAL; - token.num.kind = NUM_LITERAL_INT; - token.num.intval = intval; + token.num = l; tokenizer_add(&t, &token, line_start, col_start); continue; } - if (isident(*s)) { + if (isidentstart(*s)) { /* it's an identifier */ Identifier ident = ident_insert(&s); Token token; @@ -261,27 +308,26 @@ static Tokenizer tokenize_string(char *s) { /* NOTE: May modify string. Don't ev tokenizer_add(&t, &token, line, col); continue; } - int has_newline; char *end_of_line = strchr(s, '\n'); has_newline = end_of_line != NULL; if (has_newline) *end_of_line = 0; - err_print(line, col, "Unrecognized token:\n\there --> %s\n", s); + err_print(line, col, TEXT_IMPORTANT("Unrecognized token:") "\n\there --> %s\n", s); + if (has_newline) + *end_of_line = '\n'; + err: has_err = 1; - if (has_newline) { - /* increment line counter because of it */ - line++; - col = 1; - } else { - col += (LineNo)strlen(s); - } - s += strlen(s); + s = strchr(s, '\n'); + if (s == NULL) break; + s++; /* move past newline */ + col = 1; + line++; + } - /* TODO: Check ferror/errno */ if (has_err) { - fprintf(stderr, "Errors occured while preprocessing.\n"); + fprintf(stderr, TEXT_IMPORTANT("Errors occured while preprocessing.\n")); abort(); } t.token = t.tokens; |