From 3d784ecc2529504ad5d57da4f1216777e3f7788d Mon Sep 17 00:00:00 2001 From: Leo Tenenbaum Date: Sat, 31 Aug 2019 20:56:56 -0400 Subject: removed global identifier tree --- identifiers.c | 33 +++++----- main.c | 4 +- tokenizer.c | 201 +++++++++++++++++++++++++++++----------------------------- util/err.c | 1 + 4 files changed, 124 insertions(+), 115 deletions(-) diff --git a/identifiers.c b/identifiers.c index f491a06..a960fb7 100644 --- a/identifiers.c +++ b/identifiers.c @@ -1,9 +1,10 @@ +/* OPTIM: This is not ideal. There should be one dynamic array of tree nodes. */ + typedef struct { struct Block *scope; /* NULL for file scope */ struct Declaration *decl; } IdentDecl; -/* OPTIM: This is not ideal. There should be one dynamic array of tree nodes. */ typedef struct IdentTree { /* zero value is an empty trie */ long id; @@ -17,9 +18,13 @@ typedef struct IdentTree { typedef IdentTree *Identifier; -static IdentTree ident_base_tree; -static long ident_curr_id; /* NOTE: you should eventually add something to reset this */ -static char identifier_chars[] = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_"; +/* MUST be zero-initialized before use */ +typedef struct { + IdentTree tree_root; + long curr_id; +} Identifiers; + +static const char identifier_chars[] = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_"; #define NIDENTIFIER_CHARS ((int)((sizeof identifier_chars) - 1)) /* -1 for null char */ @@ -41,11 +46,14 @@ static int isident(int c) { } /* moves s to the char after the identifier */ -static Identifier ident_tree_insert(IdentTree *t, char **s) { +/* inserts if does not exist. reads until non-ident char is found. */ +/* advances past identifier */ +static Identifier ident_insert(Identifiers *ids, char **s) { + IdentTree *t = &ids->tree_root; while (1) { char c = **s; if (!isident(c)) { - if (t->id == 0) t->id = ++ident_curr_id; + if (t->id == 0) t->id = ++ids->curr_id; return t; } @@ -62,16 +70,11 @@ static Identifier ident_tree_insert(IdentTree *t, char **s) { } } -/* inserts if does not exist. reads until non-ident char is found. */ -/* advances past identifier */ -static Identifier ident_insert(char **s) { - return ident_tree_insert(&ident_base_tree, s); -} - static void fprint_ident(FILE *out, Identifier id) { if (id->parent == NULL) return; /* at root */ - /* OPTIM: Use malloc(id->len)???? */ + /* OPTIM: Use malloc(id->len)???? would probably use less mem for long idents, but + it's on the heap */ fprint_ident(out, id->parent); fputc(identifier_chars[id - id->parent->children /* index of self in parent */], out); } @@ -113,6 +116,6 @@ static void idents_free_tree(IdentTree *tree) { free(tree->children); } -static void idents_free(void) { - idents_free_tree(&ident_base_tree); +static void idents_free(Identifiers *ids) { + idents_free_tree(&ids->tree_root); } diff --git a/main.c b/main.c index f22b3c3..b4caeeb 100644 --- a/main.c +++ b/main.c @@ -36,7 +36,9 @@ int main(int argc, char **argv) { fclose(in); err_filename = in_filename; + Identifiers file_idents = {0}; Tokenizer t; + tokr_create(&t, &file_idents); if (!tokenize_string(&t, contents)) { err_fprint(TEXT_IMPORTANT("Errors occured while preprocessing.\n")); return EXIT_FAILURE; @@ -80,5 +82,5 @@ int main(int argc, char **argv) { fclose(c_out); fclose(h_out); - idents_free(); + idents_free(&file_idents); } diff --git a/tokenizer.c b/tokenizer.c index a8932e2..73ae7d0 100644 --- a/tokenizer.c +++ b/tokenizer.c @@ -101,6 +101,7 @@ typedef struct { char *s; /* string being parsed */ LineNo line; Token *token; /* token currently being processed */ + Identifiers *idents; } Tokenizer; @@ -214,49 +215,52 @@ static void tokr_get_location(Tokenizer *tokr, Token *t) { tokr->s = t->where.code; } -static bool tokenize_string(Tokenizer *tokr, char *str) { +static void tokr_create(Tokenizer *t, Identifiers *idents) { + arr_create(&t->tokens, sizeof(Token)); + arr_reserve(&t->tokens, 256); + t->idents = idents; +} + +static bool tokenize_string(Tokenizer *t, char *str) { int has_err = 0; - Tokenizer t; - arr_create(&t.tokens, sizeof(Token)); - arr_reserve(&t.tokens, 256); - t.s = str; - t.line = 1; + t->s = str; + t->line = 1; while (1) { - if (*t.s == 0) break; - if (isspace(*t.s)) { - tokr_nextchar(&t); + if (*t->s == 0) break; + if (isspace(*t->s)) { + tokr_nextchar(t); continue; } - if (*t.s == '/') { + if (*t->s == '/') { /* maybe it's a comment */ int is_comment = 1; - switch (t.s[1]) { + switch (t->s[1]) { case '/': /* single line comment */ - tokr_nextchar(&t); - for (t.s++; *t.s != '\n' && *t.s; t.s++); - t.line++; + tokr_nextchar(t); + for (t->s++; *t->s != '\n' && *t->s; t->s++); + t->line++; break; case '*': { /* multi line comment */ - tokr_nextchar(&t); + tokr_nextchar(t); int comment_level = 1; /* allow nested multi-line comments */ - while (*t.s) { - if (t.s[0] == '*' && t.s[1] == '/') { - t.s += 2; + while (*t->s) { + if (t->s[0] == '*' && t->s[1] == '/') { + t->s += 2; comment_level--; if (comment_level == 0) { break; } - } else if (t.s[0] == '/' && t.s[1] == '*') { - t.s += 2; + } else if (t->s[0] == '/' && t->s[1] == '*') { + t->s += 2; comment_level++; } else { - tokr_nextchar(&t); + tokr_nextchar(t); } } - if (*t.s == 0) { - tokenization_err(&t, "End of file reached inside multi-line comment."); + if (*t->s == 0) { + tokenization_err(t, "End of file reached inside multi-line comment."); abort(); /* there won't be any further errors, of course */ } } break; @@ -267,12 +271,12 @@ static bool tokenize_string(Tokenizer *tokr, char *str) { if (is_comment) continue; } { - char *start_s = t.s; - Keyword kw = tokenize_kw(&t.s); + char *start_s = t->s; + Keyword kw = tokenize_kw(&t->s); if (kw != KW_COUNT) { /* it's a keyword */ - Token *token = tokr_add(&t); - token->where.line = t.line; + Token *token = tokr_add(t); + token->where.line = t->line; token->where.code = start_s; token->kind = TOKEN_KW; token->kw = kw; @@ -282,30 +286,30 @@ static bool tokenize_string(Tokenizer *tokr, char *str) { /* check if it's a number */ - if (isdigit(*t.s)) { + if (isdigit(*t->s)) { /* it's a numeric literal */ int base = 10; Floating decimal_pow10; NumLiteral n; n.kind = NUM_LITERAL_INT; n.intval = 0; - Token *token = tokr_add(&t); - tokr_put_location(&t, token); - if (*t.s == '0') { - tokr_nextchar(&t); + Token *token = tokr_add(t); + tokr_put_location(t, token); + if (*t->s == '0') { + tokr_nextchar(t); /* octal/hexadecimal/binary (or zero) */ - char format = *t.s; + char format = *t->s; if (isdigit(format)) /* octal */ base = 8; else { switch (format) { case 'b': base = 2; - tokr_nextchar(&t); + tokr_nextchar(t); break; case 'x': base = 16; - tokr_nextchar(&t); + tokr_nextchar(t); break; default: /* it's 0/0.something etc. */ @@ -315,39 +319,39 @@ static bool tokenize_string(Tokenizer *tokr, char *str) { } while (1) { - if (*t.s == '.') { + if (*t->s == '.') { if (n.kind == NUM_LITERAL_FLOAT) { - tokenization_err(&t, "Double . in number."); + tokenization_err(t, "Double . in number."); goto err; } if (base != 10) { - tokenization_err(&t, "Decimal point in non base 10 number."); + tokenization_err(t, "Decimal point in non base 10 number."); goto err; } n.kind = NUM_LITERAL_FLOAT; decimal_pow10 = 0.1; n.floatval = (Floating)n.intval; - tokr_nextchar(&t); + tokr_nextchar(t); continue; - } else if (*t.s == 'e') { - tokr_nextchar(&t); + } else if (*t->s == 'e') { + tokr_nextchar(t); if (n.kind == NUM_LITERAL_INT) { n.kind = NUM_LITERAL_FLOAT; n.floatval = (Floating)n.intval; } /* TODO: check if exceeding maximum exponent */ int exponent = 0; - if (*t.s == '+') - tokr_nextchar(&t); /* ignore + after e */ + if (*t->s == '+') + tokr_nextchar(t); /* ignore + after e */ int negative_exponent = 0; - if (*t.s == '-') { - tokr_nextchar(&t); + if (*t->s == '-') { + tokr_nextchar(t); negative_exponent = 1; } - for (; isdigit(*t.s); tokr_nextchar(&t)) { + for (; isdigit(*t->s); tokr_nextchar(t)) { exponent *= 10; - exponent += *t.s - '0'; + exponent += *t->s - '0'; } /* OPTIM: Slow for very large exponents (unlikely to happen) */ for (int i = 0; i < exponent; i++) { @@ -361,19 +365,19 @@ static bool tokenize_string(Tokenizer *tokr, char *str) { } int digit = -1; if (base == 16) { - if (*t.s >= 'a' && *t.s <= 'f') - digit = 10 + *t.s - 'a'; - else if (*t.s >= 'A' && *t.s <= 'F') - digit = *t.s - 'A'; + if (*t->s >= 'a' && *t->s <= 'f') + digit = 10 + *t->s - 'a'; + else if (*t->s >= 'A' && *t->s <= 'F') + digit = *t->s - 'A'; } if (digit == -1) { - if (*t.s >= '0' && *t.s <= '9') - digit = *t.s - '0'; + if (*t->s >= '0' && *t->s <= '9') + digit = *t->s - '0'; } if (digit < 0 || digit >= base) { - if (isdigit(*t.s)) { + if (isdigit(*t->s)) { /* something like 0b011012 */ - tokenization_err(&t, "Digit %d cannot appear in a base %d number.", digit, base); + tokenization_err(t, "Digit %d cannot appear in a base %d number.", digit, base); goto err; } /* end of numeric literal */ @@ -384,7 +388,7 @@ static bool tokenize_string(Tokenizer *tokr, char *str) { if (n.intval > ULLONG_MAX / (UInteger)base || n.intval * (UInteger)base > ULLONG_MAX - (UInteger)digit) { /* too big! */ - tokenization_err(&t, "Number too big to fit in a numeric literal."); + tokenization_err(t, "Number too big to fit in a numeric literal."); goto err; } n.intval *= (UInteger)base; @@ -395,107 +399,106 @@ static bool tokenize_string(Tokenizer *tokr, char *str) { decimal_pow10 /= 10; break; } - tokr_nextchar(&t); + tokr_nextchar(t); } token->kind = TOKEN_NUM_LITERAL; token->num = n; continue; } - if (*t.s == '\'') { + if (*t->s == '\'') { /* it's a character literal! */ - tokr_nextchar(&t); - Token *token = tokr_add(&t); - tokr_put_location(&t, token); + tokr_nextchar(t); + Token *token = tokr_add(t); + tokr_put_location(t, token); char c; - if (*t.s == '\\') { + if (*t->s == '\\') { /* escape sequence */ - tokr_nextchar(&t); - c = tokr_esc_seq(&t); + tokr_nextchar(t); + c = tokr_esc_seq(t); if (c == 0) { - tokenization_err(&t, "Unrecognized escape character: '\\%c'.", *t.s); + tokenization_err(t, "Unrecognized escape character: '\\%c'.", *t->s); goto err; } } else { - c = *t.s; - tokr_nextchar(&t); + c = *t->s; + tokr_nextchar(t); } - if (*t.s != '\'') { - tokenization_err(&t, "End of character literal expected."); + if (*t->s != '\'') { + tokenization_err(t, "End of character literal expected."); goto err; } - tokr_nextchar(&t); + tokr_nextchar(t); token->kind = TOKEN_CHAR_LITERAL; token->chr = c; continue; } - if (*t.s == '"') { + if (*t->s == '"') { /* it's a string literal! */ - Token *token = tokr_add(&t); - tokr_put_location(&t, token); - tokr_nextchar(&t); + Token *token = tokr_add(t); + tokr_put_location(t, token); + tokr_nextchar(t); size_t len = 0; size_t backslashes = 0; - while (*t.s != '"' || backslashes % 2 == 1) { - if (*t.s == '\\') { + while (*t->s != '"' || backslashes % 2 == 1) { + if (*t->s == '\\') { backslashes++; - } else if (*t.s == 0) { + } else if (*t->s == 0) { /* return t to opening " so that we go to the next line */ - tokr_get_location(&t, token); - tokenization_err(&t, "No matching \" found."); + tokr_get_location(t, token); + tokenization_err(t, "No matching \" found."); goto err; } else { backslashes = 0; } len++; - tokr_nextchar(&t); + tokr_nextchar(t); } char *strlit = malloc(len + 1); char *strptr = strlit; - tokr_get_location(&t, token); - tokr_nextchar(&t); /* past opening " */ - while (*t.s != '"') { - assert(*t.s); - if (*t.s == '\\') { - tokr_nextchar(&t); - char c = tokr_esc_seq(&t); + tokr_get_location(t, token); + tokr_nextchar(t); /* past opening " */ + while (*t->s != '"') { + assert(*t->s); + if (*t->s == '\\') { + tokr_nextchar(t); + char c = tokr_esc_seq(t); if (c == 0) { - tokenization_err(&t, "Unrecognized escape character: '\\%c'.", *t.s); + tokenization_err(t, "Unrecognized escape character: '\\%c'.", *t->s); goto err; } *strptr++ = c; } else { - *strptr++ = *t.s; - tokr_nextchar(&t); + *strptr++ = *t->s; + tokr_nextchar(t); } } *strptr = 0; token->kind = TOKEN_STR_LITERAL; token->str.len = len; token->str.str = strlit; - tokr_nextchar(&t); /* move past closing " */ + tokr_nextchar(t); /* move past closing " */ continue; } - if (isident(*t.s)) { + if (isident(*t->s)) { /* it's an identifier */ - Token *token = tokr_add(&t); - tokr_put_location(&t, token); - Identifier ident = ident_insert(&t.s); + Token *token = tokr_add(t); + tokr_put_location(t, token); + Identifier ident = ident_insert(t->idents, &t->s); token->kind = TOKEN_IDENT; token->ident = ident; continue; } - tokenization_err(&t, "Token not recognized"); + tokenization_err(t, "Token not recognized"); err: has_err = 1; } - Token *token = tokr_add(&t); + Token *token = tokr_add(t); token->kind = TOKEN_EOF; - t.token = t.tokens.data; - *tokr = t; + t->token = t->tokens.data; return !has_err; } diff --git a/util/err.c b/util/err.c index 5ff28ad..e53b16f 100644 --- a/util/err.c +++ b/util/err.c @@ -20,6 +20,7 @@ typedef struct { } Location; /* file name of file being processed */ +/* TODO: remove this */ static const char *err_filename; /* Write directly to the error file */ -- cgit v1.2.3