diff options
Diffstat (limited to 'tokenizer.c')
-rw-r--r-- | tokenizer.c | 126 |
1 files changed, 126 insertions, 0 deletions
diff --git a/tokenizer.c b/tokenizer.c new file mode 100644 index 0000000..00e9979 --- /dev/null +++ b/tokenizer.c @@ -0,0 +1,126 @@ +typedef enum { + TOKEN_KW, + TOKEN_EOF +} TokenKind; + +typedef enum { + KW_SEMICOLON, + KW_EQEQ, + KW_LT, + KW_LE, + KW_EQ, + KW_COUNT +} Keyword; + +static const char *keywords[KW_COUNT] = + {";", "==", "<", "<=", "="}; + + +/* NOTE: LineNo is typedef'd in util/err.c */ +typedef struct { + TokenKind kind; + LineNo line; + LineNo col; + union { + Keyword kw; + }; +} Token; + +typedef struct { + Token *tokens; + size_t ntokens; + size_t cap; /* used internally */ + Token *token; /* token currently being processed */ +} Tokenizer; + +static void token_fprint(FILE *out, Token *t) { + fprintf(out, "l%luc%lu-", (unsigned long)t->line, (unsigned long)t->col); + switch (t->kind) { + case TOKEN_KW: + fprintf(out, "keyword: %s", keywords[t->kw]); + break; + case TOKEN_EOF: + fprintf(out, "eof"); + break; + } +} + +static void tokenizer_add(Tokenizer *t, Token *token, LineNo line, LineNo col) { + if (t->ntokens == t->cap) { + t->cap *= 2; + t->tokens = realloc(t->tokens, t->cap); + } + token->line = line; + token->col = col; + t->tokens[t->ntokens++] = *token; +} + +static Tokenizer tokenize_file(FILE *fp) { + char buf[4096]; + setvbuf(fp, buf, _IOFBF, sizeof buf); + char errbuf[256] = {0}; /* for errors */ + int has_err = 0; + Tokenizer t; + t.cap = 4096; + t.ntokens = 0; + t.tokens = malloc(t.cap * sizeof(*t.tokens)); + + LineNo line = 1; + LineNo col = 1; + + while (1) { + int c = fpeekc(fp); + if (c == EOF) break; + if (isspace(c)) { + if (c == '\n') { + line++; + col = 0; + } + fnextc(fp); + col++; + continue; + } + Keyword kw; + for (kw = 0; kw < KW_COUNT; kw++) { + if (fhasprefix(fp, keywords[kw])) { + break; + } + } + if (kw != KW_COUNT) { + Token kw_token; + kw_token.kind = TOKEN_KW; + kw_token.kw = kw; + tokenizer_add(&t, &kw_token, line, col); + col += (LineNo)strlen(keywords[kw]); + continue; + } + + fgets(errbuf, sizeof errbuf, fp); + size_t len = strlen(errbuf); + int has_newline = len && errbuf[len-1] == '\n'; + if (has_newline) { + /* remove newline */ + errbuf[len-1] = 0; + } + err_print(line, col, "Unrecognized token:\n\there --> %s\n", errbuf); + has_err = 1; + if (has_newline) { + /* increment line counter because of it */ + line++; + col = 1; + } else { + col += (LineNo)(sizeof errbuf); + } + } + /* TODO: Check ferror/errno */ + if (has_err) { + fprintf(stderr, "Errors occured while preprocessing.\n"); + abort(); + } + t.token = t.tokens; + return t; +} + +static void tokenizer_free(Tokenizer *t) { + free(t->tokens); +} |