summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- identifiers.c 33
-rw-r--r-- main.c 4
-rw-r--r-- tokenizer.c 201
-rw-r--r-- util/err.c 1
4 files changed, 124 insertions, 115 deletions
diff --git a/identifiers.c b/identifiers.c
index f491a06..a960fb7 100644
--- a/identifiers.c
+++ b/identifiers.c
@@ -1,9 +1,10 @@
+/* OPTIM: This is not ideal. There should be one dynamic array of tree nodes. */
+
typedef struct {
struct Block *scope; /* NULL for file scope */
struct Declaration *decl;
} IdentDecl;
-/* OPTIM: This is not ideal. There should be one dynamic array of tree nodes. */
typedef struct IdentTree {
/* zero value is an empty trie */
long id;
@@ -17,9 +18,13 @@ typedef struct IdentTree {
typedef IdentTree *Identifier;
-static IdentTree ident_base_tree;
-static long ident_curr_id; /* NOTE: you should eventually add something to reset this */
-static char identifier_chars[] = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_";
+/* MUST be zero-initialized before use */
+typedef struct {
+ IdentTree tree_root;
+ long curr_id;
+} Identifiers;
+
+static const char identifier_chars[] = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_";
#define NIDENTIFIER_CHARS ((int)((sizeof identifier_chars) - 1)) /* -1 for null char */
@@ -41,11 +46,14 @@ static int isident(int c) {
}
/* moves s to the char after the identifier */
-static Identifier ident_tree_insert(IdentTree *t, char **s) {
+/* inserts if does not exist. reads until non-ident char is found. */
+/* advances past identifier */
+static Identifier ident_insert(Identifiers *ids, char **s) {
+ IdentTree *t = &ids->tree_root;
while (1) {
char c = **s;
if (!isident(c)) {
- if (t->id == 0) t->id = ++ident_curr_id;
+ if (t->id == 0) t->id = ++ids->curr_id;
return t;
}
@@ -62,16 +70,11 @@ static Identifier ident_tree_insert(IdentTree *t, char **s) {
}
}
-/* inserts if does not exist. reads until non-ident char is found. */
-/* advances past identifier */
-static Identifier ident_insert(char **s) {
- return ident_tree_insert(&ident_base_tree, s);
-}
-
static void fprint_ident(FILE *out, Identifier id) {
if (id->parent == NULL) return; /* at root */
- /* OPTIM: Use malloc(id->len)???? */
+ /* OPTIM: Use malloc(id->len)???? would probably use less mem for long idents, but
+ it's on the heap */
fprint_ident(out, id->parent);
fputc(identifier_chars[id - id->parent->children /* index of self in parent */], out);
}
@@ -113,6 +116,6 @@ static void idents_free_tree(IdentTree *tree) {
free(tree->children);
}
-static void idents_free(void) {
- idents_free_tree(&ident_base_tree);
+static void idents_free(Identifiers *ids) {
+ idents_free_tree(&ids->tree_root);
}
diff --git a/main.c b/main.c
index f22b3c3..b4caeeb 100644
--- a/main.c
+++ b/main.c
@@ -36,7 +36,9 @@ int main(int argc, char **argv) {
fclose(in);
err_filename = in_filename;
+ Identifiers file_idents = {0};
Tokenizer t;
+ tokr_create(&t, &file_idents);
if (!tokenize_string(&t, contents)) {
err_fprint(TEXT_IMPORTANT("Errors occured while preprocessing.\n"));
return EXIT_FAILURE;
@@ -80,5 +82,5 @@ int main(int argc, char **argv) {
fclose(c_out);
fclose(h_out);
- idents_free();
+ idents_free(&file_idents);
}
diff --git a/tokenizer.c b/tokenizer.c
index a8932e2..73ae7d0 100644
--- a/tokenizer.c
+++ b/tokenizer.c
@@ -101,6 +101,7 @@ typedef struct {
char *s; /* string being parsed */
LineNo line;
Token *token; /* token currently being processed */
+ Identifiers *idents;
} Tokenizer;
@@ -214,49 +215,52 @@ static void tokr_get_location(Tokenizer *tokr, Token *t) {
tokr->s = t->where.code;
}
-static bool tokenize_string(Tokenizer *tokr, char *str) {
+static void tokr_create(Tokenizer *t, Identifiers *idents) {
+ arr_create(&t->tokens, sizeof(Token));
+ arr_reserve(&t->tokens, 256);
+ t->idents = idents;
+}
+
+static bool tokenize_string(Tokenizer *t, char *str) {
int has_err = 0;
- Tokenizer t;
- arr_create(&t.tokens, sizeof(Token));
- arr_reserve(&t.tokens, 256);
- t.s = str;
- t.line = 1;
+ t->s = str;
+ t->line = 1;
while (1) {
- if (*t.s == 0) break;
- if (isspace(*t.s)) {
- tokr_nextchar(&t);
+ if (*t->s == 0) break;
+ if (isspace(*t->s)) {
+ tokr_nextchar(t);
continue;
}
- if (*t.s == '/') {
+ if (*t->s == '/') {
/* maybe it's a comment */
int is_comment = 1;
- switch (t.s[1]) {
+ switch (t->s[1]) {
case '/': /* single line comment */
- tokr_nextchar(&t);
- for (t.s++; *t.s != '\n' && *t.s; t.s++);
- t.line++;
+ tokr_nextchar(t);
+ for (t->s++; *t->s != '\n' && *t->s; t->s++);
+ t->line++;
break;
case '*': { /* multi line comment */
- tokr_nextchar(&t);
+ tokr_nextchar(t);
int comment_level = 1; /* allow nested multi-line comments */
- while (*t.s) {
- if (t.s[0] == '*' && t.s[1] == '/') {
- t.s += 2;
+ while (*t->s) {
+ if (t->s[0] == '*' && t->s[1] == '/') {
+ t->s += 2;
comment_level--;
if (comment_level == 0) {
break;
}
- } else if (t.s[0] == '/' && t.s[1] == '*') {
- t.s += 2;
+ } else if (t->s[0] == '/' && t->s[1] == '*') {
+ t->s += 2;
comment_level++;
} else {
- tokr_nextchar(&t);
+ tokr_nextchar(t);
}
}
- if (*t.s == 0) {
- tokenization_err(&t, "End of file reached inside multi-line comment.");
+ if (*t->s == 0) {
+ tokenization_err(t, "End of file reached inside multi-line comment.");
abort(); /* there won't be any further errors, of course */
}
} break;
@@ -267,12 +271,12 @@ static bool tokenize_string(Tokenizer *tokr, char *str) {
if (is_comment) continue;
}
{
- char *start_s = t.s;
- Keyword kw = tokenize_kw(&t.s);
+ char *start_s = t->s;
+ Keyword kw = tokenize_kw(&t->s);
if (kw != KW_COUNT) {
/* it's a keyword */
- Token *token = tokr_add(&t);
- token->where.line = t.line;
+ Token *token = tokr_add(t);
+ token->where.line = t->line;
token->where.code = start_s;
token->kind = TOKEN_KW;
token->kw = kw;
@@ -282,30 +286,30 @@ static bool tokenize_string(Tokenizer *tokr, char *str) {
/* check if it's a number */
- if (isdigit(*t.s)) {
+ if (isdigit(*t->s)) {
/* it's a numeric literal */
int base = 10;
Floating decimal_pow10;
NumLiteral n;
n.kind = NUM_LITERAL_INT;
n.intval = 0;
- Token *token = tokr_add(&t);
- tokr_put_location(&t, token);
- if (*t.s == '0') {
- tokr_nextchar(&t);
+ Token *token = tokr_add(t);
+ tokr_put_location(t, token);
+ if (*t->s == '0') {
+ tokr_nextchar(t);
/* octal/hexadecimal/binary (or zero) */
- char format = *t.s;
+ char format = *t->s;
if (isdigit(format)) /* octal */
base = 8;
else {
switch (format) {
case 'b':
base = 2;
- tokr_nextchar(&t);
+ tokr_nextchar(t);
break;
case 'x':
base = 16;
- tokr_nextchar(&t);
+ tokr_nextchar(t);
break;
default:
/* it's 0/0.something etc. */
@@ -315,39 +319,39 @@ static bool tokenize_string(Tokenizer *tokr, char *str) {
}
while (1) {
- if (*t.s == '.') {
+ if (*t->s == '.') {
if (n.kind == NUM_LITERAL_FLOAT) {
- tokenization_err(&t, "Double . in number.");
+ tokenization_err(t, "Double . in number.");
goto err;
}
if (base != 10) {
- tokenization_err(&t, "Decimal point in non base 10 number.");
+ tokenization_err(t, "Decimal point in non base 10 number.");
goto err;
}
n.kind = NUM_LITERAL_FLOAT;
decimal_pow10 = 0.1;
n.floatval = (Floating)n.intval;
- tokr_nextchar(&t);
+ tokr_nextchar(t);
continue;
- } else if (*t.s == 'e') {
- tokr_nextchar(&t);
+ } else if (*t->s == 'e') {
+ tokr_nextchar(t);
if (n.kind == NUM_LITERAL_INT) {
n.kind = NUM_LITERAL_FLOAT;
n.floatval = (Floating)n.intval;
}
/* TODO: check if exceeding maximum exponent */
int exponent = 0;
- if (*t.s == '+')
- tokr_nextchar(&t); /* ignore + after e */
+ if (*t->s == '+')
+ tokr_nextchar(t); /* ignore + after e */
int negative_exponent = 0;
- if (*t.s == '-') {
- tokr_nextchar(&t);
+ if (*t->s == '-') {
+ tokr_nextchar(t);
negative_exponent = 1;
}
- for (; isdigit(*t.s); tokr_nextchar(&t)) {
+ for (; isdigit(*t->s); tokr_nextchar(t)) {
exponent *= 10;
- exponent += *t.s - '0';
+ exponent += *t->s - '0';
}
/* OPTIM: Slow for very large exponents (unlikely to happen) */
for (int i = 0; i < exponent; i++) {
@@ -361,19 +365,19 @@ static bool tokenize_string(Tokenizer *tokr, char *str) {
}
int digit = -1;
if (base == 16) {
- if (*t.s >= 'a' && *t.s <= 'f')
- digit = 10 + *t.s - 'a';
- else if (*t.s >= 'A' && *t.s <= 'F')
- digit = *t.s - 'A';
+ if (*t->s >= 'a' && *t->s <= 'f')
+ digit = 10 + *t->s - 'a';
+ else if (*t->s >= 'A' && *t->s <= 'F')
+ digit = *t->s - 'A';
}
if (digit == -1) {
- if (*t.s >= '0' && *t.s <= '9')
- digit = *t.s - '0';
+ if (*t->s >= '0' && *t->s <= '9')
+ digit = *t->s - '0';
}
if (digit < 0 || digit >= base) {
- if (isdigit(*t.s)) {
+ if (isdigit(*t->s)) {
/* something like 0b011012 */
- tokenization_err(&t, "Digit %d cannot appear in a base %d number.", digit, base);
+ tokenization_err(t, "Digit %d cannot appear in a base %d number.", digit, base);
goto err;
}
/* end of numeric literal */
@@ -384,7 +388,7 @@ static bool tokenize_string(Tokenizer *tokr, char *str) {
if (n.intval > ULLONG_MAX / (UInteger)base ||
n.intval * (UInteger)base > ULLONG_MAX - (UInteger)digit) {
/* too big! */
- tokenization_err(&t, "Number too big to fit in a numeric literal.");
+ tokenization_err(t, "Number too big to fit in a numeric literal.");
goto err;
}
n.intval *= (UInteger)base;
@@ -395,107 +399,106 @@ static bool tokenize_string(Tokenizer *tokr, char *str) {
decimal_pow10 /= 10;
break;
}
- tokr_nextchar(&t);
+ tokr_nextchar(t);
}
token->kind = TOKEN_NUM_LITERAL;
token->num = n;
continue;
}
- if (*t.s == '\'') {
+ if (*t->s == '\'') {
/* it's a character literal! */
- tokr_nextchar(&t);
- Token *token = tokr_add(&t);
- tokr_put_location(&t, token);
+ tokr_nextchar(t);
+ Token *token = tokr_add(t);
+ tokr_put_location(t, token);
char c;
- if (*t.s == '\\') {
+ if (*t->s == '\\') {
/* escape sequence */
- tokr_nextchar(&t);
- c = tokr_esc_seq(&t);
+ tokr_nextchar(t);
+ c = tokr_esc_seq(t);
if (c == 0) {
- tokenization_err(&t, "Unrecognized escape character: '\\%c'.", *t.s);
+ tokenization_err(t, "Unrecognized escape character: '\\%c'.", *t->s);
goto err;
}
} else {
- c = *t.s;
- tokr_nextchar(&t);
+ c = *t->s;
+ tokr_nextchar(t);
}
- if (*t.s != '\'') {
- tokenization_err(&t, "End of character literal expected.");
+ if (*t->s != '\'') {
+ tokenization_err(t, "End of character literal expected.");
goto err;
}
- tokr_nextchar(&t);
+ tokr_nextchar(t);
token->kind = TOKEN_CHAR_LITERAL;
token->chr = c;
continue;
}
- if (*t.s == '"') {
+ if (*t->s == '"') {
/* it's a string literal! */
- Token *token = tokr_add(&t);
- tokr_put_location(&t, token);
- tokr_nextchar(&t);
+ Token *token = tokr_add(t);
+ tokr_put_location(t, token);
+ tokr_nextchar(t);
size_t len = 0;
size_t backslashes = 0;
- while (*t.s != '"' || backslashes % 2 == 1) {
- if (*t.s == '\\') {
+ while (*t->s != '"' || backslashes % 2 == 1) {
+ if (*t->s == '\\') {
backslashes++;
- } else if (*t.s == 0) {
+ } else if (*t->s == 0) {
/* return t to opening " so that we go to the next line */
- tokr_get_location(&t, token);
- tokenization_err(&t, "No matching \" found.");
+ tokr_get_location(t, token);
+ tokenization_err(t, "No matching \" found.");
goto err;
} else {
backslashes = 0;
}
len++;
- tokr_nextchar(&t);
+ tokr_nextchar(t);
}
char *strlit = malloc(len + 1);
char *strptr = strlit;
- tokr_get_location(&t, token);
- tokr_nextchar(&t); /* past opening " */
- while (*t.s != '"') {
- assert(*t.s);
- if (*t.s == '\\') {
- tokr_nextchar(&t);
- char c = tokr_esc_seq(&t);
+ tokr_get_location(t, token);
+ tokr_nextchar(t); /* past opening " */
+ while (*t->s != '"') {
+ assert(*t->s);
+ if (*t->s == '\\') {
+ tokr_nextchar(t);
+ char c = tokr_esc_seq(t);
if (c == 0) {
- tokenization_err(&t, "Unrecognized escape character: '\\%c'.", *t.s);
+ tokenization_err(t, "Unrecognized escape character: '\\%c'.", *t->s);
goto err;
}
*strptr++ = c;
} else {
- *strptr++ = *t.s;
- tokr_nextchar(&t);
+ *strptr++ = *t->s;
+ tokr_nextchar(t);
}
}
*strptr = 0;
token->kind = TOKEN_STR_LITERAL;
token->str.len = len;
token->str.str = strlit;
- tokr_nextchar(&t); /* move past closing " */
+ tokr_nextchar(t); /* move past closing " */
continue;
}
- if (isident(*t.s)) {
+ if (isident(*t->s)) {
/* it's an identifier */
- Token *token = tokr_add(&t);
- tokr_put_location(&t, token);
- Identifier ident = ident_insert(&t.s);
+ Token *token = tokr_add(t);
+ tokr_put_location(t, token);
+ Identifier ident = ident_insert(t->idents, &t->s);
token->kind = TOKEN_IDENT;
token->ident = ident;
continue;
}
- tokenization_err(&t, "Token not recognized");
+ tokenization_err(t, "Token not recognized");
err:
has_err = 1;
}
- Token *token = tokr_add(&t);
+ Token *token = tokr_add(t);
token->kind = TOKEN_EOF;
- t.token = t.tokens.data;
- *tokr = t;
+ t->token = t->tokens.data;
return !has_err;
}
diff --git a/util/err.c b/util/err.c
index 5ff28ad..e53b16f 100644
--- a/util/err.c
+++ b/util/err.c
@@ -20,6 +20,7 @@ typedef struct {
} Location;
/* file name of file being processed */
+/* TODO: remove this */
static const char *err_filename;
/* Write directly to the error file */