summaryrefslogtreecommitdiff
path: root/tokenizer.c
blob: 00e997970f322ad8fc278007667ec30da601c3e9 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
/* Discriminator for Token: selects which member of its union is active. */
typedef enum {
			  TOKEN_KW,  /* keyword/operator token; Token.kw is valid */
			  TOKEN_EOF  /* end of input; carries no payload */
} TokenKind;

/* Indices into the keywords[] spelling table below — the two MUST stay
 * in the same order. NOTE(review): the tokenizer tries keywords in this
 * order, so any keyword that is a prefix of a later one (KW_LT "<"
 * before KW_LE "<=") shadows it unless matching is longest-first —
 * verify against the match loop. */
typedef enum {
			  KW_SEMICOLON,
			  KW_EQEQ,
			  KW_LT,
			  KW_LE,
			  KW_EQ,
			  KW_COUNT  /* number of keywords; not a real token */
} Keyword;

/* Spelling of each keyword, indexed by the Keyword enum above. */
static const char *keywords[KW_COUNT] =
	{";", "==", "<", "<=", "="};


/* NOTE: LineNo is typedef'd in util/err.c */
/* One lexed token, tagged with its 1-based source position. */
typedef struct {
	TokenKind kind; /* selects the active union member */
	LineNo line;    /* line where the token starts */
    LineNo col;     /* column where the token starts */
	union {
		Keyword kw; /* valid only when kind == TOKEN_KW */
	};
} Token;

/* Growable array of tokens plus a read cursor. Owns the `tokens`
 * allocation; release with tokenizer_free(). */
typedef struct {
	Token *tokens;  /* heap-allocated array of ntokens entries */
	size_t ntokens; /* number of valid tokens stored */
	size_t cap;	/* allocated capacity; used internally */
	Token *token; /* token currently being processed */
} Tokenizer;

/* Write a human-readable description of *t to `out`, prefixed with the
 * token's source position as "l<line>c<col>-". */
static void token_fprint(FILE *out, Token *t) {
	unsigned long ln = (unsigned long)t->line;
	unsigned long cl = (unsigned long)t->col;
	fprintf(out, "l%luc%lu-", ln, cl);
	if (t->kind == TOKEN_KW) {
		fprintf(out, "keyword: %s", keywords[t->kw]);
	} else if (t->kind == TOKEN_EOF) {
		fprintf(out, "eof");
	}
}

/* Append *token to t, stamping it with the given source position.
 * Doubles the array's capacity when full. Aborts on out-of-memory,
 * matching the fail-fast error policy used by tokenize_file(). */
static void tokenizer_add(Tokenizer *t, Token *token, LineNo line, LineNo col) {
	if (t->ntokens == t->cap) {
		t->cap *= 2;
		/* realloc takes a size in BYTES, not an element count — the
		 * original `realloc(t->tokens, t->cap)` shrank the buffer and
		 * overflowed the heap. Also keep the old pointer until the
		 * call succeeds so it isn't lost on failure. */
		Token *grown = realloc(t->tokens, t->cap * sizeof(*t->tokens));
		if (!grown) {
			fprintf(stderr, "Out of memory while tokenizing.\n");
			abort();
		}
		t->tokens = grown;
	}
	token->line = line;
	token->col = col;
	t->tokens[t->ntokens++] = *token;
}

static Tokenizer tokenize_file(FILE *fp) {
	char buf[4096];
	setvbuf(fp, buf, _IOFBF, sizeof buf);
	char errbuf[256] = {0}; /* for errors */
	int has_err = 0;
	Tokenizer t;
	t.cap = 4096;
	t.ntokens = 0;
	t.tokens = malloc(t.cap * sizeof(*t.tokens));

	LineNo line = 1;
	LineNo col = 1;
	
	while (1) {
		int c = fpeekc(fp);
	    if (c == EOF) break;
		if (isspace(c)) {
			if (c == '\n') {
				line++;
				col = 0;
			}
			fnextc(fp);
			col++;
	    	continue;
		}
		Keyword kw;
		for (kw = 0; kw < KW_COUNT; kw++) {
			if (fhasprefix(fp, keywords[kw])) {
				break;
			}
		}
		if (kw != KW_COUNT) {
			Token kw_token;
			kw_token.kind = TOKEN_KW;
			kw_token.kw = kw;
			tokenizer_add(&t, &kw_token, line, col);
			col += (LineNo)strlen(keywords[kw]);
			continue;
		}
		
		fgets(errbuf, sizeof errbuf, fp);
		size_t len = strlen(errbuf);
		int has_newline = len && errbuf[len-1] == '\n';
		if (has_newline) {
			/* remove newline */
			errbuf[len-1] = 0;
		}
		err_print(line, col, "Unrecognized token:\n\there --> %s\n", errbuf);
		has_err = 1;
		if (has_newline) {
			/* increment line counter because of it */
		    line++;
			col = 1;
		} else {
			col += (LineNo)(sizeof errbuf);
		}
	}
	/* TODO: Check ferror/errno */
	if (has_err) {
		fprintf(stderr, "Errors occured while preprocessing.\n");
		abort();
	}
	t.token = t.tokens;
	return t;
}

/* Release the token array owned by *t and reset it to an empty state,
 * so a stale Tokenizer can't be used after free or double-freed. */
static void tokenizer_free(Tokenizer *t) {
	free(t->tokens);
	t->tokens = NULL;
	t->token = NULL;
	t->ntokens = 0;
	t->cap = 0;
}