From 95d2bbf3930c39116fe0bee78ed6feb734c38b0a Mon Sep 17 00:00:00 2001
From: Leo Tenenbaum <pommicket@gmail.com>
Date: Fri, 16 Aug 2019 17:47:08 -0400
Subject: Switched to reading whole file into memory; started number literals

---
 identifiers.c |   9 +--
 main.c        |  20 ++++--
 test.toc      |  11 +---
 tokenizer.c   | 200 ++++++++++++++++++++++++++++++++++++++++++++++++++--------
 util/err.c    |   1 +
 util/files.c  |  28 --------
 6 files changed, 200 insertions(+), 69 deletions(-)
 delete mode 100644 util/files.c

diff --git a/identifiers.c b/identifiers.c
index 6335ed8..e88c745 100644
--- a/identifiers.c
+++ b/identifiers.c
@@ -32,9 +32,10 @@ typedef IdentTree *Identifier;
 static IdentTree ident_base_tree;
 static long ident_curr_id; /* NOTE: you should eventually add something to reset this */
 
-static Identifier ident_tree_finsert(IdentTree *t, FILE *fp) {
+/* moves s to the char after the identifier */
+static Identifier ident_tree_insert(IdentTree *t, char **s) {
 	while (1) {
-		int c = fgetc(fp);
+		char c = **s; if (isident(c)) (*s)++; /* consume identifier chars only, so *s is left on the terminating char */
 		if (!isident(c)) {
 			if (t->id == 0) t->id = ++ident_curr_id;
 			return t;
@@ -51,8 +52,8 @@ static Identifier ident_tree_finsert(IdentTree *t, FILE *fp) {
 
 /* inserts if does not exist. reads until non-ident char is found. */
 /* advances past identifier */
-static Identifier ident_finsert(FILE *fp) {
-	return ident_tree_finsert(&ident_base_tree, fp);
+static Identifier ident_insert(char **s) {
+	return ident_tree_insert(&ident_base_tree, s);
 }
 
 
diff --git a/main.c b/main.c
index 900b723..099e311 100644
--- a/main.c
+++ b/main.c
@@ -6,7 +6,6 @@
 #include <string.h>
 #include <ctype.h>
 #include "util/err.c"
-#include "util/files.c"
 #include "identifiers.c"
 #include "tokenizer.c"
 
@@ -21,8 +20,20 @@ int main(int argc, char **argv) {
 		fprintf(stderr, "Could not open file: %s.\n", argv[1]);
 		return EXIT_FAILURE;
 	}
-
-	Tokenizer t = tokenize_file(in);
+	
+	char *contents = err_malloc(4096); /* TODO:check files with >this */
+	size_t contents_cap = 4096;
+	size_t contents_len = 0;
+	contents[0] = 0; /* fgets leaves the buffer untouched on an empty file; keep it a valid string */
+	while (fgets(contents + contents_len, (int)(contents_cap - contents_len), in)) {
+		contents_len += strlen(contents + contents_len);
+		if (contents_len >= contents_cap - 1024) {
+			contents_cap *= 2;
+			contents = err_realloc(contents, contents_cap);
+		}
+	} /* TODO: check ferror */
+	
+	Tokenizer t = tokenize_string(contents);
 	
 	for (size_t i = 0; i < t.ntokens; i++) {
 		if (i)
@@ -31,8 +42,9 @@ int main(int argc, char **argv) {
 	}
 	printf("\n");
 
+	free(contents);
 	tokenizer_free(&t);
-
+	
 	fclose(in);
 	idents_free();
 }
diff --git a/test.toc b/test.toc
index a0b4c4e..654c1e1 100644
--- a/test.toc
+++ b/test.toc
@@ -1,9 +1,4 @@
    
-== <
-<<foo<<<
-bar
-foo
-bar
-baz
-bar
-foo
\ No newline at end of file
+0x3f3a == 0777
+/* /* /*foo*/*/ /**/*/!~~
+
diff --git a/tokenizer.c b/tokenizer.c
index 768693d..ff03e4f 100644
--- a/tokenizer.c
+++ b/tokenizer.c
@@ -1,7 +1,9 @@
 typedef enum {
 			  TOKEN_KW,
 			  TOKEN_IDENT,
+			  TOKEN_NUM_LITERAL,
 			  TOKEN_EOF
+			  /* TODO: char literals, str literals */
 } TokenKind;
 
 typedef enum {
@@ -17,6 +19,37 @@ typedef enum {
 static const char *keywords[KW_COUNT] =
 	{";", "==", "<", "<=", "="}; 
 
+#define TOKENIZER_USE_LLONG 1
+
+#if TOKENIZER_USE_LLONG
+typedef long long LiteralInt;
+typedef unsigned long long LiteralUInt;
+#define LITERAL_INT_FMT "%lld"
+#define LITERAL_UINT_FMT "%llu"
+#else
+typedef long LiteralInt;
+typedef unsigned long LiteralUInt;
+#define LITERAL_INT_FMT "%ld"
+#define LITERAL_UINT_FMT "%lu"
+#endif
+
+typedef double LiteralReal;
+
+typedef enum {
+			  NUM_LITERAL_INT,
+			  NUM_LITERAL_UINT,
+			  NUM_LITERAL_REAL
+} NumLiteralKind;
+
+typedef struct {
+	NumLiteralKind kind;
+	union {
+		LiteralInt intval;
+		LiteralUInt uintval;
+		LiteralReal realval;
+	};
+} NumLiteral;
+
 /* NOTE: LineNo is typedef'd in util/err.c */
 typedef struct {
 	TokenKind kind;
@@ -25,6 +58,7 @@ typedef struct {
 	union {
 		Keyword kw;
 		Identifier ident;
+		NumLiteral num;
 	};
 } Token;
 
@@ -45,6 +79,20 @@ static void token_fprint(FILE *out, Token *t) {
 		fprintf(out, "identifier: %ld:", t->ident->id);
 		ident_fprint(out, t->ident);
 		break;
+	case TOKEN_NUM_LITERAL:
+		fprintf(out, "number: ");
+		switch (t->num.kind) {
+		case NUM_LITERAL_INT:
+			fprintf(out, LITERAL_INT_FMT, t->num.intval);
+			break;
+		case NUM_LITERAL_UINT:
+			fprintf(out, LITERAL_UINT_FMT, t->num.uintval);
+			break;
+		case NUM_LITERAL_REAL:
+			fprintf(out, "%f", t->num.realval);
+			break;
+		}
+		break;
 	case TOKEN_EOF:
 		fprintf(out, "eof");
 		break;
@@ -52,22 +100,19 @@ static void token_fprint(FILE *out, Token *t) {
 }
 
 static void tokenizer_add(Tokenizer *t, Token *token, LineNo line, LineNo col) {
-	if (t->ntokens == t->cap) {
+	if (t->ntokens >= t->cap) {
 		t->cap *= 2;
-		t->tokens = realloc(t->tokens, t->cap);
+		t->tokens = err_realloc(t->tokens, t->cap * sizeof(*t->tokens)); /* cap counts tokens; realloc wants bytes */
 	}
 	token->line = line;
 	token->col = col;
 	t->tokens[t->ntokens++] = *token;
 }
 
-static Tokenizer tokenize_file(FILE *fp) {
-	char buf[4096];
-	setvbuf(fp, buf, _IOFBF, sizeof buf);
-	char errbuf[256] = {0}; /* for errors */
+static Tokenizer tokenize_string(char *s) {	/* NOTE: May modify string. Don't even try to pass it a literal.*/
 	int has_err = 0;
 	Tokenizer t;
-	t.cap = 4096;
+	t.cap = 4096; /* TODO: test more tokens than this */
 	t.ntokens = 0;
 	t.tokens = malloc(t.cap * sizeof(*t.tokens));
 
@@ -75,20 +120,63 @@ static Tokenizer tokenize_file(FILE *fp) {
 	LineNo col = 1;
 	
 	while (1) {
-		int c = fpeekc(fp);
-	    if (c == EOF) break;
-		if (isspace(c)) {
-			if (c == '\n') {
+	    if (*s == 0) break;
+		if (isspace(*s)) {
+			if (*s == '\n') {
 				line++;
 				col = 0;
 			}
-			fnextc(fp);
-			col++;
+			s++; col++;
 	    	continue;
 		}
+
+		if (*s == '/') {
+			/* maybe it's a comment */
+			int is_comment = 1;
+			s++; col++;
+			switch (*s) {
+			case '/': /* single line comment */
+				/* stop ON the newline (or NUL): the whitespace branch consumes it */
+				/* and bumps the line counter exactly once */
+				for (s++; *s != '\n' && *s; s++);
+				break;
+			case '*': { /* multi line comment */
+				int comment_level = 1; /* allow nested multi-line comments */
+				for (s++, col++; *s;) { /* skip the opening '*' so it cannot pair with a following '/' */
+					if (*s == '\n') {
+						line++;
+						col = 1;
+						s++;
+						continue;
+					}
+					if (s[0] == '*' && s[1] == '/') {
+						s += 2; col += 2;
+						comment_level--;
+						if (comment_level == 0) {
+							break;
+						}
+					} else if (s[0] == '/' && s[1] == '*') {
+						s += 2; col += 2;
+						comment_level++;
+					} else {
+						s++; col++;
+					}
+				}
+				if (comment_level != 0) { /* ran off the end while still open; a close at the very end of input is fine */
+					err_print(line, col, "End of file reached inside multi-line comment.");
+					abort(); /* there won't be any further errors, of course */
+				}
+			} break;
+			default:
+				is_comment = 0;
+				s--; col--; /* go back */
+				break;
+			}
+			if (is_comment) continue;
+		}
 		Keyword kw;
 		for (kw = 0; kw < KW_COUNT; kw++) {
-			if (fhasprefix(fp, keywords[kw])) {
+			if (strncmp(s, keywords[kw], strlen(keywords[kw])) == 0) {
 				break;
 			}
 		}
@@ -99,35 +187,97 @@ static Tokenizer tokenize_file(FILE *fp) {
 			token.kw = kw;
 			tokenizer_add(&t, &token, line, col);
 			col += (LineNo)strlen(keywords[kw]);
+			s += (LineNo)strlen(keywords[kw]);
 			continue;
 		}
 
-		if (isident(c)) {
+		if (isdigit(*s)) {
+			/* it's a numerical constant */
+			int base = 10;
+			LiteralInt intval = 0;
+			LineNo line_start = line, col_start = col;
+			if (*s == '0') {
+				s++; col++;
+				/* octal/hexadecimal/binary (or zero) */
+				char format = *s;
+				if (isdigit(format)) /* octal */
+					base = 8;
+				else {
+					switch (format) {
+					case 'b':
+						base = 2;
+						s++; col++;
+						break;
+					case 'x':
+						base = 16;
+						s++; col++;
+						break;
+					default:
+						/* it's 0/0.something etc.  */
+						break;
+					}
+				}
+			}
+			while (1) {
+				if (*s == '.') {
+					/* TODO */
+				} else if (*s == 'e') {
+					/* TODO */
+				}
+				int digit = -1;
+				if (base == 16) {
+					if (*s >= 'a' && *s <= 'f')
+						digit = 10 + *s - 'a';
+					else if (*s >= 'A' && *s <= 'F')
+						digit = 10 + *s - 'A'; /* A-F are 10-15, same as a-f */
+				}
+				if (digit == -1) {
+					if (*s >= '0' && *s <= '9')
+						digit = *s - '0';
+				}
+				if (digit < 0 || digit >= base) {
+					/* end of numerical literal */
+					break;
+				}
+				/* TODO: check overflow; switch to uint */
+				intval *= base;
+				intval += digit;
+				s++; col++;
+			}
+			Token token;
+			token.kind = TOKEN_NUM_LITERAL;
+			token.num.kind = NUM_LITERAL_INT;
+			token.num.intval = intval;
+			tokenizer_add(&t, &token, line_start, col_start);
+			continue;
+		}
+		
+		if (isident(*s)) {
 			/* it's an identifier */
-			Identifier ident = ident_finsert(fp);
+			Identifier ident = ident_insert(&s);
 			Token token;
 			token.kind = TOKEN_IDENT;
 			token.ident = ident;
 			tokenizer_add(&t, &token, line, col);			
 			continue;
 		}
+
+		int has_newline;
+		char *end_of_line = strchr(s, '\n');
+		has_newline = end_of_line != NULL;
+		if (has_newline)
+			*end_of_line = 0;
 		
-		fgets(errbuf, sizeof errbuf, fp);
-		size_t len = strlen(errbuf);
-		int has_newline = len && errbuf[len-1] == '\n';
-		if (has_newline) {
-			/* remove newline */
-			errbuf[len-1] = 0;
-		}
-		err_print(line, col, "Unrecognized token:\n\there --> %s\n", errbuf);
+		err_print(line, col, "Unrecognized token:\n\there --> %s\n", s);
 		has_err = 1;
 		if (has_newline) {
 			/* increment line counter because of it */
 		    line++;
 			col = 1;
 		} else {
-			col += (LineNo)(sizeof errbuf);
+			col += (LineNo)strlen(s);
 		}
+		s += strlen(s) + (has_newline ? 1 : 0); /* also step over the NUL that replaced '\n', else the main loop stops here */
 	}
 	/* TODO: Check ferror/errno */
 	if (has_err) {
diff --git a/util/err.c b/util/err.c
index 7a38017..89a1335 100644
--- a/util/err.c
+++ b/util/err.c
@@ -7,6 +7,7 @@ static void err_print(LineNo line, LineNo col, const char *fmt, ...) {
 	va_start(args, fmt);
 	vfprintf(stderr, fmt, args);
 	va_end(args);
+	fprintf(stderr, "\n");
 }
 
 static void *err_malloc(size_t size) {
diff --git a/util/files.c b/util/files.c
deleted file mode 100644
index 0afa843..0000000
--- a/util/files.c
+++ /dev/null
@@ -1,28 +0,0 @@
-static int fpeekc(FILE *fp) {
-	int c = getc(fp);
-	if (c == EOF)
-		return c;
-	ungetc(c, fp);
-	return c;
-}
-
-#define fnextc getc /* advance to the next character */
-
-/* NOTE: Advances and returns # of characters advanced iff prefix is found. */
-static int fhasprefix(FILE *fp, const char *prefix) {
-	assert(*prefix);
-	long start = ftell(fp);
-	if (start == -1)
-		return 0;
-	const char *p = prefix;
-	while (*p) {
-		int c = getc(fp);
-		if (c != *p) {
-			/* wrong character / EOF */
-			fseek(fp, start, SEEK_SET);
-			return 0;
-		}
-		p++;
-	}
-	return (int)(p - prefix); /* length of prefix */
-}
-- 
cgit v1.2.3