summaryrefslogtreecommitdiff
path: root/tokenizer.c
diff options
context:
space:
mode:
Diffstat (limited to 'tokenizer.c')
-rw-r--r--tokenizer.c200
1 files changed, 175 insertions, 25 deletions
diff --git a/tokenizer.c b/tokenizer.c
index 768693d..ff03e4f 100644
--- a/tokenizer.c
+++ b/tokenizer.c
@@ -1,7 +1,9 @@
typedef enum {
TOKEN_KW,
TOKEN_IDENT,
+ TOKEN_NUM_LITERAL, /* numeric literal; its value is stored in Token.num */
TOKEN_EOF
+ /* TODO: char literals, str literals */
} TokenKind;
typedef enum {
@@ -17,6 +19,37 @@ typedef enum {
static const char *keywords[KW_COUNT] =
{";", "==", "<", "<=", "="};
+#define TOKENIZER_USE_LLONG 1 /* nonzero: integer literals are (unsigned) long long; zero: (unsigned) long */
+
+#if TOKENIZER_USE_LLONG
+typedef long long LiteralInt;
+typedef unsigned long long LiteralUInt;
+#define LITERAL_INT_FMT "%lld" /* printf conversion matching LiteralInt */
+#define LITERAL_UINT_FMT "%llu" /* printf conversion matching LiteralUInt */
+#else
+typedef long LiteralInt;
+typedef unsigned long LiteralUInt;
+#define LITERAL_INT_FMT "%ld" /* printf conversion matching LiteralInt */
+#define LITERAL_UINT_FMT "%lu" /* printf conversion matching LiteralUInt */
+#endif
+
+typedef double LiteralReal; /* type used for NUM_LITERAL_REAL values */
+
+typedef enum {
+ NUM_LITERAL_INT, /* value is in NumLiteral.intval */
+ NUM_LITERAL_UINT, /* value is in NumLiteral.uintval */
+ NUM_LITERAL_REAL /* value is in NumLiteral.realval */
+} NumLiteralKind;
+
+typedef struct {
+ NumLiteralKind kind; /* selects which member of the union below is read */
+ union {
+ LiteralInt intval; /* read when kind == NUM_LITERAL_INT */
+ LiteralUInt uintval; /* read when kind == NUM_LITERAL_UINT */
+ LiteralReal realval; /* read when kind == NUM_LITERAL_REAL */
+ };
+} NumLiteral;
+
/* NOTE: LineNo is typedef'd in util/err.c */
typedef struct {
TokenKind kind;
@@ -25,6 +58,7 @@ typedef struct {
union {
Keyword kw;
Identifier ident;
+ NumLiteral num;
};
} Token;
@@ -45,6 +79,20 @@ static void token_fprint(FILE *out, Token *t) {
fprintf(out, "identifier: %ld:", t->ident->id);
ident_fprint(out, t->ident);
break;
+ case TOKEN_NUM_LITERAL:
+ fprintf(out, "number: ");
+ switch (t->num.kind) {
+ case NUM_LITERAL_INT:
+ fprintf(out, LITERAL_INT_FMT, t->num.intval);
+ break;
+ case NUM_LITERAL_UINT:
+ fprintf(out, LITERAL_UINT_FMT, t->num.uintval);
+ break;
+ case NUM_LITERAL_REAL:
+ fprintf(out, "%f", t->num.realval);
+ break;
+ }
+ break;
case TOKEN_EOF:
fprintf(out, "eof");
break;
@@ -52,22 +100,19 @@ static void token_fprint(FILE *out, Token *t) {
}
static void tokenizer_add(Tokenizer *t, Token *token, LineNo line, LineNo col) {
- if (t->ntokens == t->cap) {
+ if (t->ntokens >= t->cap) { /* array is full: double the capacity before appending */
t->cap *= 2;
- t->tokens = realloc(t->tokens, t->cap);
+ t->tokens = err_realloc(t->tokens, t->cap * sizeof(*t->tokens)); /* size is in bytes, not in tokens */
}
token->line = line;
token->col = col;
t->tokens[t->ntokens++] = *token;
}
-static Tokenizer tokenize_file(FILE *fp) {
- char buf[4096];
- setvbuf(fp, buf, _IOFBF, sizeof buf);
- char errbuf[256] = {0}; /* for errors */
+static Tokenizer tokenize_string(char *s) { /* NOTE: May modify string. Don't even try to pass it a literal.*/
int has_err = 0;
Tokenizer t;
- t.cap = 4096;
+ t.cap = 4096; /* TODO: test more tokens than this */
t.ntokens = 0;
t.tokens = malloc(t.cap * sizeof(*t.tokens));
@@ -75,20 +120,63 @@ static Tokenizer tokenize_file(FILE *fp) {
LineNo col = 1;
while (1) {
- int c = fpeekc(fp);
- if (c == EOF) break;
- if (isspace(c)) {
- if (c == '\n') {
+ if (*s == 0) break;
+ if (isspace(*s)) {
+ if (*s == '\n') {
line++;
col = 0;
}
- fnextc(fp);
- col++;
+ s++; col++;
continue;
}
+
+ if (*s == '/') {
+ /* maybe it's a comment */
+ int is_comment = 1;
+ s++; col++;
+ switch (*s) {
+ case '/': /* single line comment */
+ for (s++; *s != '\n' && *s; s++);
+ line++;
+ col = 1;
+ break;
+ case '*': { /* multi line comment */
+ int comment_level = 1; /* allow nested multi-line comments */
+ while (*s) {
+ if (*s == '\n') {
+ line++;
+ col = 1;
+ s++;
+ continue;
+ }
+ if (s[0] == '*' && s[1] == '/') {
+ s += 2; col += 2;
+ comment_level--;
+ if (comment_level == 0) {
+ break;
+ }
+ } else if (s[0] == '/' && s[1] == '*') {
+ s += 2; col += 2;
+ comment_level++;
+ } else {
+ s++; col++;
+ }
+ }
+ if (*s == 0) {
+ err_print(line, col, "End of file reached inside multi-line comment.");
+ abort(); /* there won't be any further errors, of course */
+ }
+ } break;
+ default:
+ is_comment = 0;
+ s--; /* go back */
+ break;
+ }
+ if (is_comment) continue;
+ }
Keyword kw;
for (kw = 0; kw < KW_COUNT; kw++) {
- if (fhasprefix(fp, keywords[kw])) {
+ if (strncmp(s, keywords[kw], strlen(keywords[kw])) == 0) {
break;
}
}
@@ -99,35 +187,97 @@ static Tokenizer tokenize_file(FILE *fp) {
token.kw = kw;
tokenizer_add(&t, &token, line, col);
col += (LineNo)strlen(keywords[kw]);
+ s += (LineNo)strlen(keywords[kw]);
continue;
}
- if (isident(c)) {
+ if (isdigit(*s)) {
+ /* it's a numerical constant */
+ int base = 10;
+ LiteralInt intval = 0;
+ LineNo line_start = line, col_start = col;
+ if (*s == '0') {
+ s++; col++;
+ /* octal/hexadecimal/binary (or zero) */
+ char format = *s;
+ if (isdigit(format)) /* octal */
+ base = 8;
+ else {
+ switch (format) {
+ case 'b':
+ base = 2;
+ s++; col++;
+ break;
+ case 'x':
+ base = 16;
+ s++; col++;
+ break;
+ default:
+ /* it's 0/0.something etc. */
+ break;
+ }
+ }
+ }
+ while (1) {
+ if (*s == '.') {
+ /* TODO */
+ } else if (*s == 'e') {
+ /* TODO */
+ }
+ int digit = -1;
+ if (base == 16) {
+ if (*s >= 'a' && *s <= 'f')
+ digit = 10 + *s - 'a';
+ else if (*s >= 'A' && *s <= 'F')
+ digit = 10 + *s - 'A'; /* 'A'..'F' are worth 10..15, same as lowercase */
+ }
+ if (digit == -1) {
+ if (*s >= '0' && *s <= '9')
+ digit = *s - '0';
+ }
+ if (digit < 0 || digit >= base) {
+ /* end of numerical literal */
+ break;
+ }
+ /* TODO: check overflow; switch to uint */
+ intval *= base;
+ intval += digit;
+ s++; col++;
+ }
+ Token token;
+ token.kind = TOKEN_NUM_LITERAL;
+ token.num.kind = NUM_LITERAL_INT;
+ token.num.intval = intval;
+ tokenizer_add(&t, &token, line_start, col_start);
+ continue;
+ }
+
+ if (isident(*s)) {
/* it's an identifier */
- Identifier ident = ident_finsert(fp);
+ Identifier ident = ident_insert(&s);
Token token;
token.kind = TOKEN_IDENT;
token.ident = ident;
tokenizer_add(&t, &token, line, col);
continue;
}
+
+ int has_newline;
+ char *end_of_line = strchr(s, '\n');
+ has_newline = end_of_line != NULL;
+ if (has_newline)
+ *end_of_line = 0;
- fgets(errbuf, sizeof errbuf, fp);
- size_t len = strlen(errbuf);
- int has_newline = len && errbuf[len-1] == '\n';
- if (has_newline) {
- /* remove newline */
- errbuf[len-1] = 0;
- }
- err_print(line, col, "Unrecognized token:\n\there --> %s\n", errbuf);
+ err_print(line, col, "Unrecognized token:\n\there --> %s\n", s);
has_err = 1;
if (has_newline) {
/* increment line counter because of it */
line++;
col = 1;
} else {
- col += (LineNo)(sizeof errbuf);
+ col += (LineNo)strlen(s);
}
+ s += strlen(s);
}
/* TODO: Check ferror/errno */
if (has_err) {