From 95d2bbf3930c39116fe0bee78ed6feb734c38b0a Mon Sep 17 00:00:00 2001 From: Leo Tenenbaum Date: Fri, 16 Aug 2019 17:47:08 -0400 Subject: Switched to reading whole file into memory; started number literals --- identifiers.c | 9 +-- main.c | 20 ++++-- test.toc | 11 +--- tokenizer.c | 200 ++++++++++++++++++++++++++++++++++++++++++++++++++-------- util/err.c | 1 + util/files.c | 28 -------- 6 files changed, 200 insertions(+), 69 deletions(-) delete mode 100644 util/files.c diff --git a/identifiers.c b/identifiers.c index 6335ed8..e88c745 100644 --- a/identifiers.c +++ b/identifiers.c @@ -32,9 +32,10 @@ typedef IdentTree *Identifier; static IdentTree ident_base_tree; static long ident_curr_id; /* NOTE: you should eventually add something to reset this */ -static Identifier ident_tree_finsert(IdentTree *t, FILE *fp) { +/* moves s to the char after the identifier */ +static Identifier ident_tree_insert(IdentTree *t, char **s) { while (1) { - int c = fgetc(fp); + char c = *((*s)++); if (!isident(c)) { if (t->id == 0) t->id = ++ident_curr_id; return t; @@ -51,8 +52,8 @@ static Identifier ident_tree_finsert(IdentTree *t, FILE *fp) { /* inserts if does not exist. reads until non-ident char is found. */ /* advances past identifier */ -static Identifier ident_finsert(FILE *fp) { - return ident_tree_finsert(&ident_base_tree, fp); +static Identifier ident_insert(char **s) { + return ident_tree_insert(&ident_base_tree, s); } diff --git a/main.c b/main.c index 900b723..099e311 100644 --- a/main.c +++ b/main.c @@ -6,7 +6,6 @@ #include #include #include "util/err.c" -#include "util/files.c" #include "identifiers.c" #include "tokenizer.c" @@ -21,8 +20,20 @@ int main(int argc, char **argv) { fprintf(stderr, "Could not open file: %s.\n", argv[1]); return EXIT_FAILURE; } - - Tokenizer t = tokenize_file(in); + + char *contents = err_malloc(4096); /* TODO:check files with >this */ + size_t contents_cap = 4096; + size_t contents_len = 0; + while (fgets(contents + contents_len, (int)(contents_cap - contents_len), in)) { + contents_len += strlen(contents + contents_len); + if (contents_len >= contents_cap - 1024) { + contents_cap *= 2; + contents = err_realloc(contents, contents_cap); + } + } + /* TODO: check ferror */ + + Tokenizer t = tokenize_string(contents); for (size_t i = 0; i < t.ntokens; i++) { if (i) @@ -31,8 +42,9 @@ int main(int argc, char **argv) { } printf("\n"); + free(contents); tokenizer_free(&t); - + fclose(in); idents_free(); } diff --git a/test.toc b/test.toc index a0b4c4e..654c1e1 100644 --- a/test.toc +++ b/test.toc @@ -1,9 +1,4 @@ -== < -<ident->id); ident_fprint(out, t->ident); break; + case TOKEN_NUM_LITERAL: + fprintf(out, "number: "); + switch (t->num.kind) { + case NUM_LITERAL_INT: + fprintf(out, LITERAL_INT_FMT, t->num.intval); + break; + case NUM_LITERAL_UINT: + fprintf(out, LITERAL_UINT_FMT, t->num.uintval); + break; + case NUM_LITERAL_REAL: + fprintf(out, "%f", t->num.realval); + break; + } + break; case TOKEN_EOF: fprintf(out, "eof"); break; @@ -52,22 +100,19 @@ static void token_fprint(FILE *out, Token *t) { } static void tokenizer_add(Tokenizer *t, Token *token, LineNo line, LineNo col) { - if (t->ntokens == t->cap) { + if (t->ntokens >= t->cap) { t->cap *= 2; - t->tokens = realloc(t->tokens, t->cap); + t->tokens = err_realloc(t->tokens, t->cap); } token->line = line; token->col = col; t->tokens[t->ntokens++] = *token; } -static Tokenizer tokenize_file(FILE *fp) { - char buf[4096]; - setvbuf(fp, buf, _IOFBF, sizeof buf); - char errbuf[256] = {0}; /* for errors */ +static Tokenizer tokenize_string(char *s) { /* NOTE: May modify string. Don't even try to pass it a literal.*/ int has_err = 0; Tokenizer t; - t.cap = 4096; + t.cap = 4096; /* TODO: test more tokens than this */ t.ntokens = 0; t.tokens = malloc(t.cap * sizeof(*t.tokens)); @@ -75,20 +120,63 @@ static Tokenizer tokenize_file(FILE *fp) { LineNo col = 1; while (1) { - int c = fpeekc(fp); - if (c == EOF) break; - if (isspace(c)) { - if (c == '\n') { + if (*s == 0) break; + if (isspace(*s)) { + if (*s == '\n') { line++; col = 0; } - fnextc(fp); - col++; + s++; col++; continue; } + + if (*s == '/') { + /* maybe it's a comment */ + int is_comment = 1; + s++; col++; + switch (*s) { + case '/': /* single line comment */ + for (s++; *s != '\n' && *s; s++); + line++; + col = 1; + break; + case '*': { /* multi line comment */ + int comment_level = 1; /* allow nested multi-line comments */ + while (*s) { + if (*s == '\n') { + line++; + col = 1; + s++; + continue; + } + if (s[0] == '*' && s[1] == '/') { + s += 2; col += 2; + comment_level--; + if (comment_level == 0) { + break; + } + } else if (s[0] == '/' && s[1] == '*') { + s += 2; col += 2; + comment_level++; + } else { + s++; col++; + } + } + if (*s == 0) { + err_print(line, col, "End of file reached inside multi-line comment."); + abort(); /* there won't be any further errors, of course */ + } + } break; + default: + is_comment = 0; + s--; /* go back */ + break; + } + if (is_comment) continue; + } Keyword kw; for (kw = 0; kw < KW_COUNT; kw++) { - if (fhasprefix(fp, keywords[kw])) { + if (strncmp(s, keywords[kw], strlen(keywords[kw])) == 0) { break; } } @@ -99,35 +187,97 @@ static Tokenizer tokenize_file(FILE *fp) { token.kw = kw; tokenizer_add(&t, &token, line, col); col += (LineNo)strlen(keywords[kw]); + s += (LineNo)strlen(keywords[kw]); continue; } - if (isident(c)) { + if (isdigit(*s)) { + /* it's a numerical constant */ + int base = 10; + LiteralInt intval = 0; + LineNo line_start = line, col_start = col; + if (*s == '0') { + s++; col++; + /* octal/hexadecimal/binary (or zero) */ + char format = *s; + if (isdigit(format)) /* octal */ + base = 8; + else { + switch (format) { + case 'b': + base = 2; + s++; col++; + break; + case 'x': + base = 16; + s++; col++; + break; + default: + /* it's 0/0.something etc. */ + break; + } + } + } + while (1) { + if (*s == '.') { + /* TODO */ + } else if (*s == 'e') { + /* TODO */ + } + int digit = -1; + if (base == 16) { + if (*s >= 'a' && *s <= 'f') + digit = 10 + *s - 'a'; + else if (*s >= 'A' && *s <= 'F') + digit = *s - 'A'; + } + if (digit == -1) { + if (*s >= '0' && *s <= '9') + digit = *s - '0'; + } + if (digit < 0 || digit >= base) { + /* end of numerical literal */ + break; + } + /* TODO: check overflow; switch to uint */ + intval *= base; + intval += digit; + s++; col++; + } + Token token; + token.kind = TOKEN_NUM_LITERAL; + token.num.kind = NUM_LITERAL_INT; + token.num.intval = intval; + tokenizer_add(&t, &token, line_start, col_start); + continue; + } + + if (isident(*s)) { /* it's an identifier */ - Identifier ident = ident_finsert(fp); + Identifier ident = ident_insert(&s); Token token; token.kind = TOKEN_IDENT; token.ident = ident; tokenizer_add(&t, &token, line, col); continue; } + + int has_newline; + char *end_of_line = strchr(s, '\n'); + has_newline = end_of_line != NULL; + if (has_newline) + *end_of_line = 0; - fgets(errbuf, sizeof errbuf, fp); - size_t len = strlen(errbuf); - int has_newline = len && errbuf[len-1] == '\n'; - if (has_newline) { - /* remove newline */ - errbuf[len-1] = 0; - } - err_print(line, col, "Unrecognized token:\n\there --> %s\n", errbuf); + err_print(line, col, "Unrecognized token:\n\there --> %s\n", s); has_err = 1; if (has_newline) { /* increment line counter because of it */ line++; col = 1; } else { - col += (LineNo)(sizeof errbuf); + col += (LineNo)strlen(s); } + s += strlen(s); } /* TODO: Check ferror/errno */ if (has_err) { diff --git a/util/err.c b/util/err.c index 7a38017..89a1335 100644 --- a/util/err.c +++ b/util/err.c @@ -7,6 +7,7 @@ static void err_print(LineNo line, LineNo col, const char *fmt, ...) { va_start(args, fmt); vfprintf(stderr, fmt, args); va_end(args); + fprintf(stderr, "\n"); } static void *err_malloc(size_t size) { diff --git a/util/files.c b/util/files.c deleted file mode 100644 index 0afa843..0000000 --- a/util/files.c +++ /dev/null @@ -1,28 +0,0 @@ -static int fpeekc(FILE *fp) { - int c = getc(fp); - if (c == EOF) - return c; - ungetc(c, fp); - return c; -} - -#define fnextc getc /* advance to the next character */ - -/* NOTE: Advances and returns # of characters advanced iff prefix is found. */ -static int fhasprefix(FILE *fp, const char *prefix) { - assert(*prefix); - long start = ftell(fp); - if (start == -1) - return 0; - const char *p = prefix; - while (*p) { - int c = getc(fp); - if (c != *p) { - /* wrong character / EOF */ - fseek(fp, start, SEEK_SET); - return 0; - } - p++; - } - return (int)(p - prefix); /* length of prefix */ -} -- cgit v1.2.3