summary | refs | log | tree | commit | diff
path: root/tokenizer.c
diff options
context:
space:
mode:
Diffstat (limited to 'tokenizer.c')
-rw-r--r-- tokenizer.c 126
1 files changed, 126 insertions, 0 deletions
diff --git a/tokenizer.c b/tokenizer.c
new file mode 100644
index 0000000..00e9979
--- /dev/null
+++ b/tokenizer.c
@@ -0,0 +1,126 @@
+typedef enum { /* discriminator stored in Token.kind */
+ TOKEN_KW, /* one of the fixed spellings in keywords[]; Token.kw says which one */
+ TOKEN_EOF /* printed as "eof" by token_fprint; NOTE(review): tokenize_file does not append one in this version */
+} TokenKind;
+
+/*
+ * Fixed spellings recognised by the tokenizer.
+ *
+ * ORDER MATTERS: tokenize_file tries the entries from first to last and takes
+ * the first one that matches the upcoming input, so whenever one spelling is
+ * a prefix of another, the longer spelling must come first ("==" before "=",
+ * "<=" before "<"). Otherwise "<=" would be split into "<" followed by "=".
+ * KW_COUNT is not a keyword; it is the number of entries.
+ */
+typedef enum {
+    KW_SEMICOLON,
+    KW_EQEQ,
+    KW_LE,
+    KW_LT,
+    KW_EQ,
+    KW_COUNT
+} Keyword;
+
+/* Spelling of each keyword, indexed by Keyword. Designated initializers keep
+ * the strings attached to their enumerators even if the enum is reordered. */
+static const char *keywords[KW_COUNT] = {
+    [KW_SEMICOLON] = ";",
+    [KW_EQEQ] = "==",
+    [KW_LE] = "<=",
+    [KW_LT] = "<",
+    [KW_EQ] = "="
+};
+
+
+/* NOTE: LineNo is typedef'd in util/err.c */
+typedef struct { /* one lexed token together with its source position */
+ TokenKind kind; /* selects which union member (if any) is meaningful */
+ LineNo line; /* line the token starts on; tokenize_file starts counting at 1 */
+ LineNo col; /* column the token starts at; tokenize_file starts counting at 1 */
+ union { /* anonymous union: members are accessed directly, e.g. t->kw */
+ Keyword kw; /* read by token_fprint when kind == TOKEN_KW; index into keywords[] */
+ };
+} Token;
+
+typedef struct { /* growable array of tokens plus a cursor into it */
+ Token *tokens; /* heap array allocated by tokenize_file and released by tokenizer_free */
+ size_t ntokens; /* number of tokens currently stored in tokens */
+ size_t cap; /* used internally: allocated capacity of tokens, counted in Token elements */
+ Token *token; /* token currently being processed; tokenize_file leaves it at the first token */
+} Tokenizer;
+
+/*
+ * Writes a one-line description of *t to out, without a trailing newline:
+ * the position as "l<line>c<col>-", followed by "keyword: <spelling>" for a
+ * keyword token or "eof" for an end-of-file token. Any other kind gets the
+ * position prefix only.
+ */
+static void token_fprint(FILE *out, Token *t) {
+    unsigned long line_no = (unsigned long)t->line;
+    unsigned long col_no = (unsigned long)t->col;
+
+    fprintf(out, "l%luc%lu-", line_no, col_no);
+
+    if (t->kind == TOKEN_KW) {
+        const char *spelling = keywords[t->kw];
+        fprintf(out, "keyword: %s", spelling);
+    } else if (t->kind == TOKEN_EOF) {
+        fputs("eof", out);
+    }
+}
+
+/*
+ * Appends a copy of *token to t->tokens, doubling the array when it is full.
+ *
+ * line and col are written into *token before it is copied, so the caller's
+ * token is modified as well. t->cap counts Token elements, not bytes.
+ * If the array cannot be grown, a message is printed to stderr and the
+ * program aborts.
+ */
+static void tokenizer_add(Tokenizer *t, Token *token, LineNo line, LineNo col) {
+    if (t->ntokens == t->cap) {
+        /* Doubling a capacity of 0 would stay at 0, so start from 1 then. */
+        size_t new_cap = t->cap ? t->cap * 2 : 1;
+        Token *grown = NULL;
+
+        /* Only call realloc when neither the doubling nor the conversion to a
+         * byte count has wrapped around. */
+        if (new_cap > t->cap && new_cap <= (size_t)-1 / sizeof(*t->tokens)) {
+            /* realloc takes a size in bytes, hence the sizeof factor. */
+            grown = realloc(t->tokens, new_cap * sizeof(*t->tokens));
+        }
+        if (grown == NULL) {
+            fprintf(stderr, "Out of memory.\n");
+            abort();
+        }
+        t->tokens = grown;
+        t->cap = new_cap;
+    }
+    token->line = line;
+    token->col = col;
+    t->tokens[t->ntokens++] = *token;
+}
+
+/*
+ * Reads fp to the end and returns the tokens found in it.
+ *
+ * Whitespace is skipped; every other position must start with one of the
+ * entries of keywords[]. Anything else is reported through err_print, the
+ * rest of that line (at most sizeof errbuf - 1 characters) is discarded and
+ * scanning continues, so several errors can be reported in one run.
+ * Lines and columns are counted from 1.
+ *
+ * If any error was reported, a read error occurred, or memory ran out, a
+ * message is printed to stderr and the program aborts. Otherwise the returned
+ * Tokenizer owns a heap array (release it with tokenizer_free) and its token
+ * cursor points at the first token.
+ *
+ * NOTE(review): no TOKEN_EOF token is appended at the end of the array.
+ */
+static Tokenizer tokenize_file(FILE *fp) {
+    char errbuf[256] = {0}; /* for errors */
+    int has_err = 0;
+    Tokenizer t;
+    LineNo line = 1;
+    LineNo col = 1;
+
+    /* Ask stdio to allocate the 4096-byte buffer itself. A local array must
+     * not be used here: fp is still open after this function returns, and a
+     * stream buffer has to live at least as long as the stream does. */
+    setvbuf(fp, NULL, _IOFBF, 4096);
+
+    t.cap = 4096;
+    t.ntokens = 0;
+    t.token = NULL;
+    t.tokens = malloc(t.cap * sizeof(*t.tokens));
+    if (t.tokens == NULL) {
+        fprintf(stderr, "Out of memory.\n");
+        abort();
+    }
+
+    while (1) {
+        int c = fpeekc(fp);
+        if (c == EOF) break;
+        if (isspace(c)) {
+            if (c == '\n') {
+                line++;
+                col = 0; /* the col++ below brings this back to column 1 */
+            }
+            fnextc(fp);
+            col++;
+            continue;
+        }
+
+        /* The first keyword that matches wins; kw == KW_COUNT means none did. */
+        Keyword kw;
+        for (kw = 0; kw < KW_COUNT; kw++) {
+            if (fhasprefix(fp, keywords[kw])) {
+                break;
+            }
+        }
+        if (kw != KW_COUNT) {
+            Token kw_token;
+            kw_token.kind = TOKEN_KW;
+            kw_token.kw = kw;
+            tokenizer_add(&t, &kw_token, line, col);
+            col += (LineNo)strlen(keywords[kw]);
+            continue;
+        }
+
+        /* Unrecognized input: consume the rest of the line for the report. */
+        if (fgets(errbuf, sizeof errbuf, fp) == NULL) {
+            /* Nothing could be read; stop instead of reporting stale buffer
+             * contents. The ferror check below reports the failure. */
+            break;
+        }
+        size_t len = strlen(errbuf);
+        int has_newline = len && errbuf[len-1] == '\n';
+        if (has_newline) {
+            /* remove newline */
+            errbuf[len-1] = 0;
+        }
+        err_print(line, col, "Unrecognized token:\n\there --> %s\n", errbuf);
+        has_err = 1;
+        if (has_newline) {
+            /* increment line counter because of it */
+            line++;
+            col = 1;
+        } else {
+            /* Still on the same line: advance by what fgets actually consumed,
+             * which is at most sizeof errbuf - 1 characters. */
+            col += (LineNo)len;
+        }
+    }
+    if (ferror(fp)) {
+        fprintf(stderr, "Error reading input.\n");
+        has_err = 1;
+    }
+    if (has_err) {
+        fprintf(stderr, "Errors occured while preprocessing.\n");
+        abort();
+    }
+    t.token = t.tokens;
+    return t;
+}
+
+/*
+ * Releases the token array owned by *t and resets every field, so that no
+ * dangling pointer is left behind and calling this a second time is harmless.
+ * The Tokenizer struct itself is not freed.
+ */
+static void tokenizer_free(Tokenizer *t) {
+    free(t->tokens);
+    t->tokens = NULL;
+    t->token = NULL;
+    t->ntokens = 0;
+    t->cap = 0;
+}