diff options
-rw-r--r-- | identifiers.c | 9 | ||||
-rw-r--r-- | main.c | 20 | ||||
-rw-r--r-- | test.toc | 11 | ||||
-rw-r--r-- | tokenizer.c | 200 | ||||
-rw-r--r-- | util/err.c | 1 | ||||
-rw-r--r-- | util/files.c | 28 |
6 files changed, 200 insertions, 69 deletions
diff --git a/identifiers.c b/identifiers.c index 6335ed8..e88c745 100644 --- a/identifiers.c +++ b/identifiers.c @@ -32,9 +32,10 @@ typedef IdentTree *Identifier; static IdentTree ident_base_tree; static long ident_curr_id; /* NOTE: you should eventually add something to reset this */ -static Identifier ident_tree_finsert(IdentTree *t, FILE *fp) { +/* moves s to the char after the identifier */ +static Identifier ident_tree_insert(IdentTree *t, char **s) { while (1) { - int c = fgetc(fp); + char c = *((*s)++); if (!isident(c)) { if (t->id == 0) t->id = ++ident_curr_id; return t; @@ -51,8 +52,8 @@ static Identifier ident_tree_finsert(IdentTree *t, FILE *fp) { /* inserts if does not exist. reads until non-ident char is found. */ /* advances past identifier */ -static Identifier ident_finsert(FILE *fp) { - return ident_tree_finsert(&ident_base_tree, fp); +static Identifier ident_insert(char **s) { + return ident_tree_insert(&ident_base_tree, s); } @@ -6,7 +6,6 @@ #include <string.h> #include <ctype.h> #include "util/err.c" -#include "util/files.c" #include "identifiers.c" #include "tokenizer.c" @@ -21,8 +20,20 @@ int main(int argc, char **argv) { fprintf(stderr, "Could not open file: %s.\n", argv[1]); return EXIT_FAILURE; } - - Tokenizer t = tokenize_file(in); + + char *contents = err_malloc(4096); /* TODO:check files with >this */ + size_t contents_cap = 4096; + size_t contents_len = 0; + while (fgets(contents + contents_len, (int)(contents_cap - contents_len), in)) { + contents_len += strlen(contents + contents_len); + if (contents_len >= contents_cap - 1024) { + contents_cap *= 2; + contents = err_realloc(contents, contents_cap); + } + } + /* TODO: check ferror */ + + Tokenizer t = tokenize_string(contents); for (size_t i = 0; i < t.ntokens; i++) { if (i) @@ -31,8 +42,9 @@ int main(int argc, char **argv) { } printf("\n"); + free(contents); tokenizer_free(&t); - + fclose(in); idents_free(); } @@ -1,9 +1,4 @@ -== < -<<foo<<< -bar -foo -bar -baz -bar -foo
\ No newline at end of file +0x3f3a == 0777 +/* /* /*foo*/*/ /**/*/!~~ + diff --git a/tokenizer.c b/tokenizer.c index 768693d..ff03e4f 100644 --- a/tokenizer.c +++ b/tokenizer.c @@ -1,7 +1,9 @@ typedef enum { TOKEN_KW, TOKEN_IDENT, + TOKEN_NUM_LITERAL, TOKEN_EOF + /* TODO: char literals, str literals */ } TokenKind; typedef enum { @@ -17,6 +19,37 @@ typedef enum { static const char *keywords[KW_COUNT] = {";", "==", "<", "<=", "="}; +#define TOKENIZER_USE_LLONG 1 + +#if TOKENIZER_USE_LLONG +typedef long long LiteralInt; +typedef unsigned long long LiteralUInt; +#define LITERAL_INT_FMT "%lld" +#define LITERAL_UINT_FMT "%llu" +#else +typedef long LiteralInt; +typedef unsigned long LiteralUInt; +#define LITERAL_INT_FMT "%ld" +#define LITERAL_UINT_FMT "%lu" +#endif + +typedef double LiteralReal; + +typedef enum { + NUM_LITERAL_INT, + NUM_LITERAL_UINT, + NUM_LITERAL_REAL +} NumLiteralKind; + +typedef struct { + NumLiteralKind kind; + union { + LiteralInt intval; + LiteralUInt uintval; + LiteralReal realval; + }; +} NumLiteral; + /* NOTE: LineNo is typedef'd in util/err.c */ typedef struct { TokenKind kind; @@ -25,6 +58,7 @@ typedef struct { union { Keyword kw; Identifier ident; + NumLiteral num; }; } Token; @@ -45,6 +79,20 @@ static void token_fprint(FILE *out, Token *t) { fprintf(out, "identifier: %ld:", t->ident->id); ident_fprint(out, t->ident); break; + case TOKEN_NUM_LITERAL: + fprintf(out, "number: "); + switch (t->num.kind) { + case NUM_LITERAL_INT: + fprintf(out, LITERAL_INT_FMT, t->num.intval); + break; + case NUM_LITERAL_UINT: + fprintf(out, LITERAL_UINT_FMT, t->num.uintval); + break; + case NUM_LITERAL_REAL: + fprintf(out, "%f", t->num.realval); + break; + } + break; case TOKEN_EOF: fprintf(out, "eof"); break; @@ -52,22 +100,19 @@ static void token_fprint(FILE *out, Token *t) { } static void tokenizer_add(Tokenizer *t, Token *token, LineNo line, LineNo col) { - if (t->ntokens == t->cap) { + if (t->ntokens >= t->cap) { t->cap *= 2; - t->tokens = realloc(t->tokens, t->cap); + t->tokens = err_realloc(t->tokens, t->cap); } token->line = line; token->col = col; t->tokens[t->ntokens++] = *token; } -static Tokenizer tokenize_file(FILE *fp) { - char buf[4096]; - setvbuf(fp, buf, _IOFBF, sizeof buf); - char errbuf[256] = {0}; /* for errors */ +static Tokenizer tokenize_string(char *s) { /* NOTE: May modify string. Don't even try to pass it a literal.*/ int has_err = 0; Tokenizer t; - t.cap = 4096; + t.cap = 4096; /* TODO: test more tokens than this */ t.ntokens = 0; t.tokens = malloc(t.cap * sizeof(*t.tokens)); @@ -75,20 +120,63 @@ static Tokenizer tokenize_file(FILE *fp) { LineNo col = 1; while (1) { - int c = fpeekc(fp); - if (c == EOF) break; - if (isspace(c)) { - if (c == '\n') { + if (*s == 0) break; + if (isspace(*s)) { + if (*s == '\n') { line++; col = 0; } - fnextc(fp); - col++; + s++; col++; continue; } + + if (*s == '/') { + /* maybe it's a comment */ + int is_comment = 1; + s++; col++; + switch (*s) { + case '/': /* single line comment */ + for (s++; *s != '\n' && *s; s++); + line++; + col = 1; + break; + case '*': { /* multi line comment */ + int comment_level = 1; /* allow nested multi-line comments */ + while (*s) { + if (*s == '\n') { + line++; + col = 1; + s++; + continue; + } + if (s[0] == '*' && s[1] == '/') { + s += 2; col += 2; + comment_level--; + if (comment_level == 0) { + break; + } + } else if (s[0] == '/' && s[1] == '*') { + s += 2; col += 2; + comment_level++; + } else { + s++; col++; + } + } + if (*s == 0) { + err_print(line, col, "End of file reached inside multi-line comment."); + abort(); /* there won't be any further errors, of course */ + } + } break; + default: + is_comment = 0; + s--; /* go back */ + break; + } + if (is_comment) continue; + } Keyword kw; for (kw = 0; kw < KW_COUNT; kw++) { - if (fhasprefix(fp, keywords[kw])) { + if (strncmp(s, keywords[kw], strlen(keywords[kw])) == 0) { break; } } @@ -99,35 +187,97 @@ static Tokenizer tokenize_file(FILE *fp) { token.kw = kw; tokenizer_add(&t, &token, line, col); col += (LineNo)strlen(keywords[kw]); + s += (LineNo)strlen(keywords[kw]); continue; } - if (isident(c)) { + if (isdigit(*s)) { + /* it's a numerical constant */ + int base = 10; + LiteralInt intval = 0; + LineNo line_start = line, col_start = col; + if (*s == '0') { + s++; col++; + /* octal/hexadecimal/binary (or zero) */ + char format = *s; + if (isdigit(format)) /* octal */ + base = 8; + else { + switch (format) { + case 'b': + base = 2; + s++; col++; + break; + case 'x': + base = 16; + s++; col++; + break; + default: + /* it's 0/0.something etc. */ + break; + } + } + } + while (1) { + if (*s == '.') { + /* TODO */ + } else if (*s == 'e') { + /* TODO */ + } + int digit = -1; + if (base == 16) { + if (*s >= 'a' && *s <= 'f') + digit = 10 + *s - 'a'; + else if (*s >= 'A' && *s <= 'F') + digit = *s - 'A'; + } + if (digit == -1) { + if (*s >= '0' && *s <= '9') + digit = *s - '0'; + } + if (digit < 0 || digit >= base) { + /* end of numerical literal */ + break; + } + /* TODO: check overflow; switch to uint */ + intval *= base; + intval += digit; + s++; col++; + } + Token token; + token.kind = TOKEN_NUM_LITERAL; + token.num.kind = NUM_LITERAL_INT; + token.num.intval = intval; + tokenizer_add(&t, &token, line_start, col_start); + continue; + } + + if (isident(*s)) { /* it's an identifier */ - Identifier ident = ident_finsert(fp); + Identifier ident = ident_insert(&s); Token token; token.kind = TOKEN_IDENT; token.ident = ident; tokenizer_add(&t, &token, line, col); continue; } + + int has_newline; + char *end_of_line = strchr(s, '\n'); + has_newline = end_of_line != NULL; + if (has_newline) + *end_of_line = 0; - fgets(errbuf, sizeof errbuf, fp); - size_t len = strlen(errbuf); - int has_newline = len && errbuf[len-1] == '\n'; - if (has_newline) { - /* remove newline */ - errbuf[len-1] = 0; - } - err_print(line, col, "Unrecognized token:\n\there --> %s\n", errbuf); + err_print(line, col, "Unrecognized token:\n\there --> %s\n", s); has_err = 1; if (has_newline) { /* increment line counter because of it */ line++; col = 1; } else { - col += (LineNo)(sizeof errbuf); + col += (LineNo)strlen(s); } + s += strlen(s); } /* TODO: Check ferror/errno */ if (has_err) { @@ -7,6 +7,7 @@ static void err_print(LineNo line, LineNo col, const char *fmt, ...) { va_start(args, fmt); vfprintf(stderr, fmt, args); va_end(args); + fprintf(stderr, "\n"); } static void *err_malloc(size_t size) { diff --git a/util/files.c b/util/files.c deleted file mode 100644 index 0afa843..0000000 --- a/util/files.c +++ /dev/null @@ -1,28 +0,0 @@ -static int fpeekc(FILE *fp) { - int c = getc(fp); - if (c == EOF) - return c; - ungetc(c, fp); - return c; -} - -#define fnextc getc /* advance to the next character */ - -/* NOTE: Advances and returns # of characters advanced iff prefix is found. */ -static int fhasprefix(FILE *fp, const char *prefix) { - assert(*prefix); - long start = ftell(fp); - if (start == -1) - return 0; - const char *p = prefix; - while (*p) { - int c = getc(fp); - if (c != *p) { - /* wrong character / EOF */ - fseek(fp, start, SEEK_SET); - return 0; - } - p++; - } - return (int)(p - prefix); /* length of prefix */ -} |