summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r-- identifiers.c | 9
-rw-r--r-- main.c | 20
-rw-r--r-- test.toc | 11
-rw-r--r-- tokenizer.c | 200
-rw-r--r-- util/err.c | 1
-rw-r--r-- util/files.c | 28
6 files changed, 200 insertions, 69 deletions
diff --git a/identifiers.c b/identifiers.c
index 6335ed8..e88c745 100644
--- a/identifiers.c
+++ b/identifiers.c
@@ -32,9 +32,10 @@ typedef IdentTree *Identifier;
static IdentTree ident_base_tree;
static long ident_curr_id; /* NOTE: you should eventually add something to reset this */
-static Identifier ident_tree_finsert(IdentTree *t, FILE *fp) {
+/* moves s to the char after the identifier */
+static Identifier ident_tree_insert(IdentTree *t, char **s) {
while (1) {
- int c = fgetc(fp);
+ char c = *((*s)++);
if (!isident(c)) {
if (t->id == 0) t->id = ++ident_curr_id;
return t;
@@ -51,8 +52,8 @@ static Identifier ident_tree_finsert(IdentTree *t, FILE *fp) {
/* inserts if does not exist. reads until non-ident char is found. */
/* advances past identifier */
-static Identifier ident_finsert(FILE *fp) {
- return ident_tree_finsert(&ident_base_tree, fp);
+static Identifier ident_insert(char **s) {
+ return ident_tree_insert(&ident_base_tree, s);
}
diff --git a/main.c b/main.c
index 900b723..099e311 100644
--- a/main.c
+++ b/main.c
@@ -6,7 +6,6 @@
#include <string.h>
#include <ctype.h>
#include "util/err.c"
-#include "util/files.c"
#include "identifiers.c"
#include "tokenizer.c"
@@ -21,8 +20,20 @@ int main(int argc, char **argv) {
fprintf(stderr, "Could not open file: %s.\n", argv[1]);
return EXIT_FAILURE;
}
-
- Tokenizer t = tokenize_file(in);
+
+ /* Read the whole input file into one NUL-terminated heap buffer for tokenize_string. */
+ char *contents = err_malloc(4096); /* TODO:check files with >this */
+ size_t contents_cap = 4096;
+ size_t contents_len = 0;
+ /* fgets writes nothing when it hits EOF immediately, so an empty file would
+ otherwise hand an uninitialised buffer to the tokenizer. */
+ contents[0] = '\0';
+ while (fgets(contents + contents_len, (int)(contents_cap - contents_len), in)) {
+ contents_len += strlen(contents + contents_len);
+ /* grow early so there is always at least 1024 bytes of room for the next read */
+ if (contents_len >= contents_cap - 1024) {
+ contents_cap *= 2;
+ contents = err_realloc(contents, contents_cap);
+ }
+ }
+ /* TODO: check ferror */
+
+ Tokenizer t = tokenize_string(contents);
for (size_t i = 0; i < t.ntokens; i++) {
if (i)
@@ -31,8 +42,9 @@ int main(int argc, char **argv) {
}
printf("\n");
+ free(contents);
tokenizer_free(&t);
-
+
fclose(in);
idents_free();
}
diff --git a/test.toc b/test.toc
index a0b4c4e..654c1e1 100644
--- a/test.toc
+++ b/test.toc
@@ -1,9 +1,4 @@
-== <
-<<foo<<<
-bar
-foo
-bar
-baz
-bar
-foo \ No newline at end of file
+0x3f3a == 0777
+/* /* /*foo*/*/ /**/*/!~~
+
diff --git a/tokenizer.c b/tokenizer.c
index 768693d..ff03e4f 100644
--- a/tokenizer.c
+++ b/tokenizer.c
@@ -1,7 +1,9 @@
typedef enum {
TOKEN_KW,
TOKEN_IDENT,
+ TOKEN_NUM_LITERAL,
TOKEN_EOF
+ /* TODO: char literals, str literals */
} TokenKind;
typedef enum {
@@ -17,6 +19,37 @@ typedef enum {
static const char *keywords[KW_COUNT] =
{";", "==", "<", "<=", "="};
+#define TOKENIZER_USE_LLONG 1
+
+#if TOKENIZER_USE_LLONG
+typedef long long LiteralInt;
+typedef unsigned long long LiteralUInt;
+#define LITERAL_INT_FMT "%lld"
+#define LITERAL_UINT_FMT "%llu"
+#else
+typedef long LiteralInt;
+typedef unsigned long LiteralUInt;
+#define LITERAL_INT_FMT "%ld"
+#define LITERAL_UINT_FMT "%lu"
+#endif
+
+typedef double LiteralReal;
+
+typedef enum {
+ NUM_LITERAL_INT,
+ NUM_LITERAL_UINT,
+ NUM_LITERAL_REAL
+} NumLiteralKind;
+
+typedef struct {
+ NumLiteralKind kind;
+ union {
+ LiteralInt intval;
+ LiteralUInt uintval;
+ LiteralReal realval;
+ };
+} NumLiteral;
+
/* NOTE: LineNo is typedef'd in util/err.c */
typedef struct {
TokenKind kind;
@@ -25,6 +58,7 @@ typedef struct {
union {
Keyword kw;
Identifier ident;
+ NumLiteral num;
};
} Token;
@@ -45,6 +79,20 @@ static void token_fprint(FILE *out, Token *t) {
fprintf(out, "identifier: %ld:", t->ident->id);
ident_fprint(out, t->ident);
break;
+ case TOKEN_NUM_LITERAL:
+ fprintf(out, "number: ");
+ switch (t->num.kind) {
+ case NUM_LITERAL_INT:
+ fprintf(out, LITERAL_INT_FMT, t->num.intval);
+ break;
+ case NUM_LITERAL_UINT:
+ fprintf(out, LITERAL_UINT_FMT, t->num.uintval);
+ break;
+ case NUM_LITERAL_REAL:
+ fprintf(out, "%f", t->num.realval);
+ break;
+ }
+ break;
case TOKEN_EOF:
fprintf(out, "eof");
break;
@@ -52,22 +100,19 @@ static void token_fprint(FILE *out, Token *t) {
}
+/* Appends a copy of *token to t, stamping it with line/col first; doubles the token array when it is full. */
static void tokenizer_add(Tokenizer *t, Token *token, LineNo line, LineNo col) {
- if (t->ntokens == t->cap) {
+ if (t->ntokens >= t->cap) {
t->cap *= 2;
- t->tokens = realloc(t->tokens, t->cap);
+ /* cap counts tokens, not bytes: scale by the element size, as the initial malloc does */
+ t->tokens = err_realloc(t->tokens, t->cap * sizeof(*t->tokens));
}
token->line = line;
token->col = col;
t->tokens[t->ntokens++] = *token;
}
-static Tokenizer tokenize_file(FILE *fp) {
- char buf[4096];
- setvbuf(fp, buf, _IOFBF, sizeof buf);
- char errbuf[256] = {0}; /* for errors */
+static Tokenizer tokenize_string(char *s) { /* NOTE: May modify string. Don't even try to pass it a literal.*/
int has_err = 0;
Tokenizer t;
- t.cap = 4096;
+ t.cap = 4096; /* TODO: test more tokens than this */
t.ntokens = 0;
t.tokens = malloc(t.cap * sizeof(*t.tokens));
@@ -75,20 +120,63 @@ static Tokenizer tokenize_file(FILE *fp) {
LineNo col = 1;
while (1) {
- int c = fpeekc(fp);
- if (c == EOF) break;
- if (isspace(c)) {
- if (c == '\n') {
+ if (*s == 0) break;
+ if (isspace(*s)) {
+ if (*s == '\n') {
line++;
col = 0;
}
- fnextc(fp);
- col++;
+ s++; col++;
continue;
}
+
+ /* Skip comments: "//" runs to end of line, slash-star comments may nest. */
+ if (*s == '/') {
+ /* maybe it's a comment */
+ int is_comment = 1;
+ s++; col++;
+ switch (*s) {
+ case '/': /* single line comment */
+ /* Stop ON the newline (or the terminating NUL). The whitespace branch
+ at the top of the loop consumes the newline and does the one and
+ only line++ / column reset for it. */
+ for (s++; *s != '\n' && *s; s++);
+ break;
+ case '*': { /* multi line comment */
+ int comment_level = 1; /* allow nested multi-line comments */
+ /* step over the opener's '*' so it cannot pair with a following '/'
+ and be mistaken for a closer */
+ s++; col++;
+ while (*s) {
+ if (*s == '\n') {
+ line++;
+ col = 1;
+ s++;
+ continue;
+ }
+ if (s[0] == '*' && s[1] == '/') {
+ s += 2; col += 2;
+ comment_level--;
+ if (comment_level == 0) {
+ break;
+ }
+ } else if (s[0] == '/' && s[1] == '*') {
+ s += 2; col += 2;
+ comment_level++;
+ } else {
+ s++; col++;
+ }
+ }
+ /* Test the nesting level rather than *s: a comment that closes on the
+ very last characters of the input also leaves *s == 0. */
+ if (comment_level != 0) {
+ err_print(line, col, "End of file reached inside multi-line comment.");
+ abort(); /* there won't be any further errors, of course */
+ }
+ } break;
+ default:
+ is_comment = 0;
+ s--; col--; /* go back: undo both the pointer and the column advance */
+ break;
+ }
+ if (is_comment) continue;
+ }
Keyword kw;
for (kw = 0; kw < KW_COUNT; kw++) {
- if (fhasprefix(fp, keywords[kw])) {
+ if (strncmp(s, keywords[kw], strlen(keywords[kw])) == 0) {
break;
}
}
@@ -99,35 +187,97 @@ static Tokenizer tokenize_file(FILE *fp) {
token.kw = kw;
tokenizer_add(&t, &token, line, col);
col += (LineNo)strlen(keywords[kw]);
+ s += (LineNo)strlen(keywords[kw]);
continue;
}
- if (isident(c)) {
+ /* Integer literal: decimal, or 0-prefixed octal, 0b binary, 0x hexadecimal. */
+ if (isdigit((unsigned char)*s)) { /* ctype functions need an unsigned char value */
+ /* it's a numerical constant */
+ int base = 10;
+ LiteralInt intval = 0;
+ /* the token is reported at the position of its first character */
+ LineNo line_start = line, col_start = col;
+ if (*s == '0') {
+ s++; col++;
+ /* octal/hexadecimal/binary (or zero) */
+ char format = *s;
+ if (isdigit((unsigned char)format)) /* octal */
+ base = 8;
+ else {
+ switch (format) {
+ case 'b':
+ base = 2;
+ s++; col++;
+ break;
+ case 'x':
+ base = 16;
+ s++; col++;
+ break;
+ default:
+ /* it's 0/0.something etc. */
+ break;
+ }
+ }
+ }
+ while (1) {
+ if (*s == '.') {
+ /* TODO */
+ } else if (*s == 'e') {
+ /* TODO */
+ }
+ int digit = -1;
+ if (base == 16) {
+ if (*s >= 'a' && *s <= 'f')
+ digit = 10 + *s - 'a';
+ else if (*s >= 'A' && *s <= 'F')
+ digit = 10 + *s - 'A'; /* 'A'..'F' are 10..15, same as lower case */
+ }
+ if (digit == -1) {
+ if (*s >= '0' && *s <= '9')
+ digit = *s - '0';
+ }
+ if (digit < 0 || digit >= base) {
+ /* end of numerical literal */
+ break;
+ }
+ /* TODO: check overflow; switch to uint */
+ intval *= base;
+ intval += digit;
+ s++; col++;
+ }
+ Token token;
+ token.kind = TOKEN_NUM_LITERAL;
+ token.num.kind = NUM_LITERAL_INT;
+ token.num.intval = intval;
+ tokenizer_add(&t, &token, line_start, col_start);
+ continue;
+ }
+
+ if (isident(*s)) {
/* it's an identifier */
- Identifier ident = ident_finsert(fp);
+ Identifier ident = ident_insert(&s);
Token token;
token.kind = TOKEN_IDENT;
token.ident = ident;
tokenizer_add(&t, &token, line, col);
continue;
}
+
+ int has_newline;
+ char *end_of_line = strchr(s, '\n');
+ has_newline = end_of_line != NULL;
+ if (has_newline)
+ *end_of_line = 0;
- fgets(errbuf, sizeof errbuf, fp);
- size_t len = strlen(errbuf);
- int has_newline = len && errbuf[len-1] == '\n';
- if (has_newline) {
- /* remove newline */
- errbuf[len-1] = 0;
- }
- err_print(line, col, "Unrecognized token:\n\there --> %s\n", errbuf);
+ err_print(line, col, "Unrecognized token:\n\there --> %s\n", s);
has_err = 1;
if (has_newline) {
/* increment line counter because of it */
line++;
col = 1;
} else {
- col += (LineNo)(sizeof errbuf);
+ col += (LineNo)strlen(s);
}
+ s += strlen(s);
}
/* TODO: Check ferror/errno */
if (has_err) {
diff --git a/util/err.c b/util/err.c
index 7a38017..89a1335 100644
--- a/util/err.c
+++ b/util/err.c
@@ -7,6 +7,7 @@ static void err_print(LineNo line, LineNo col, const char *fmt, ...) {
va_start(args, fmt);
vfprintf(stderr, fmt, args);
va_end(args);
+ fprintf(stderr, "\n");
}
static void *err_malloc(size_t size) {
diff --git a/util/files.c b/util/files.c
deleted file mode 100644
index 0afa843..0000000
--- a/util/files.c
+++ /dev/null
@@ -1,28 +0,0 @@
-static int fpeekc(FILE *fp) {
- int c = getc(fp);
- if (c == EOF)
- return c;
- ungetc(c, fp);
- return c;
-}
-
-#define fnextc getc /* advance to the next character */
-
-/* NOTE: Advances and returns # of characters advanced iff prefix is found. */
-static int fhasprefix(FILE *fp, const char *prefix) {
- assert(*prefix);
- long start = ftell(fp);
- if (start == -1)
- return 0;
- const char *p = prefix;
- while (*p) {
- int c = getc(fp);
- if (c != *p) {
- /* wrong character / EOF */
- fseek(fp, start, SEEK_SET);
- return 0;
- }
- p++;
- }
- return (int)(p - prefix); /* length of prefix */
-}