summaryrefslogtreecommitdiff
path: root/tokenizer.c
blob: 768693deb6beef93db77b58752d000b1dbb537b0 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
/* Classifies a Token and selects the active member of its payload union
   (see Token below; token_fprint switches on this). */
typedef enum {
			  TOKEN_KW,	/* a fixed keyword/operator from keywords[] */
			  TOKEN_IDENT,	/* an identifier interned via ident_finsert */
			  TOKEN_EOF	/* end of input */
} TokenKind;

/* Keyword table indices.  Matching in tokenize_file is first-prefix-wins,
   so a token MUST come before any token that is a prefix of it:
   "<=" before "<", "==" before "=".  (The old order had "<" before "<=",
   which made "<=" lex as KW_LT followed by KW_EQ.)  Keep this enum and
   keywords[] below in lockstep. */
typedef enum {
			  KW_SEMICOLON,
			  KW_EQEQ,
			  KW_LE,
			  KW_LT,
			  KW_EQ,
			  KW_COUNT
} Keyword;

/* OPTIM: Use a trie or just a function if this gets too long */
static const char *keywords[KW_COUNT] =
	{";", "==", "<=", "<", "="};

/* NOTE: LineNo is typedef'd in util/err.c */
/* One lexed token.  line/col are its source position (1-based, per the
   counters in tokenize_file).  The anonymous union's active member is
   selected by kind; TOKEN_EOF carries no payload. */
typedef struct {
	TokenKind kind;
	LineNo line;
    LineNo col;
	union {
		Keyword kw;       /* valid when kind == TOKEN_KW */
		Identifier ident; /* valid when kind == TOKEN_IDENT; type declared elsewhere */
	};
} Token;

/* The full token stream for one input plus a read cursor.
   tokens is heap-owned (allocated in tokenize_file, grown by
   tokenizer_add, released by tokenizer_free). */
typedef struct {
	Token *tokens;
	size_t ntokens;
	size_t cap;	/* used internally */
	Token *token; /* token currently being processed */
} Tokenizer;

/* Write a human-readable rendering of *t to out: its position prefix
   ("l<line>c<col>-") followed by the kind and payload.  Emits no
   trailing newline. */
static void token_fprint(FILE *out, Token *t) {
	unsigned long ln = (unsigned long)t->line;
	unsigned long cl = (unsigned long)t->col;
	fprintf(out, "l%luc%lu-", ln, cl);
	if (t->kind == TOKEN_KW) {
		fprintf(out, "keyword: %s", keywords[t->kw]);
	} else if (t->kind == TOKEN_IDENT) {
		fprintf(out, "identifier: %ld:", t->ident->id);
		ident_fprint(out, t->ident);
	} else if (t->kind == TOKEN_EOF) {
		fprintf(out, "eof");
	}
}

/* Append *token to t, stamping it with its source position first.
   The token is copied by value; the caller's copy may be reused.
   Grows the backing array geometrically; aborts on allocation failure
   (consistent with tokenize_file's abort-on-error policy). */
static void tokenizer_add(Tokenizer *t, Token *token, LineNo line, LineNo col) {
	if (t->ntokens == t->cap) {
		size_t new_cap = t->cap * 2;
		/* BUG FIX: the old code passed t->cap (a token count) as the
		   byte size to realloc, shrinking the array to a fraction of
		   what it held and overflowing the heap on the next writes. */
		Token *tmp = realloc(t->tokens, new_cap * sizeof *tmp);
		if (!tmp) {
			/* t->tokens is still valid here, but we have nowhere to
			   put the new token; give up. */
			fprintf(stderr, "Out of memory while tokenizing.\n");
			abort();
		}
		t->tokens = tmp;
		t->cap = new_cap;
	}
	token->line = line;
	token->col = col;
	t->tokens[t->ntokens++] = *token;
}

static Tokenizer tokenize_file(FILE *fp) {
	char buf[4096];
	setvbuf(fp, buf, _IOFBF, sizeof buf);
	char errbuf[256] = {0}; /* for errors */
	int has_err = 0;
	Tokenizer t;
	t.cap = 4096;
	t.ntokens = 0;
	t.tokens = malloc(t.cap * sizeof(*t.tokens));

	LineNo line = 1;
	LineNo col = 1;
	
	while (1) {
		int c = fpeekc(fp);
	    if (c == EOF) break;
		if (isspace(c)) {
			if (c == '\n') {
				line++;
				col = 0;
			}
			fnextc(fp);
			col++;
	    	continue;
		}
		Keyword kw;
		for (kw = 0; kw < KW_COUNT; kw++) {
			if (fhasprefix(fp, keywords[kw])) {
				break;
			}
		}
		if (kw != KW_COUNT) {
			/* it's a keyword */
			Token token;
			token.kind = TOKEN_KW;
			token.kw = kw;
			tokenizer_add(&t, &token, line, col);
			col += (LineNo)strlen(keywords[kw]);
			continue;
		}

		if (isident(c)) {
			/* it's an identifier */
			Identifier ident = ident_finsert(fp);
			Token token;
			token.kind = TOKEN_IDENT;
			token.ident = ident;
			tokenizer_add(&t, &token, line, col);			
			continue;
		}
		
		fgets(errbuf, sizeof errbuf, fp);
		size_t len = strlen(errbuf);
		int has_newline = len && errbuf[len-1] == '\n';
		if (has_newline) {
			/* remove newline */
			errbuf[len-1] = 0;
		}
		err_print(line, col, "Unrecognized token:\n\there --> %s\n", errbuf);
		has_err = 1;
		if (has_newline) {
			/* increment line counter because of it */
		    line++;
			col = 1;
		} else {
			col += (LineNo)(sizeof errbuf);
		}
	}
	/* TODO: Check ferror/errno */
	if (has_err) {
		fprintf(stderr, "Errors occured while preprocessing.\n");
		abort();
	}
	t.token = t.tokens;
	return t;
}

/* Release the token array owned by *t.  Pointers and counts are
   cleared so a stale Tokenizer cannot be double-freed or dereferenced
   after release. */
static void tokenizer_free(Tokenizer *t) {
	free(t->tokens);
	t->tokens = NULL;
	t->token = NULL;
	t->ntokens = 0;
	t->cap = 0;
}