1 files changed, 175 insertions, 25 deletions
diff --git a/tokenizer.c b/tokenizer.c
index 768693d..ff03e4f 100644
--- a/tokenizer.c
+++ b/tokenizer.c
@@ -1,7 +1,9 @@
 typedef enum {
 			  TOKEN_KW,
 			  TOKEN_IDENT,
+			  TOKEN_NUM_LITERAL, /* numeric literal; its value is carried in Token.num */
 			  TOKEN_EOF
+			  /* TODO: add token kinds for character literals and string literals */
 } TokenKind;
 
 typedef enum {
@@ -17,6 +19,37 @@ typedef enum {
 static const char *keywords[KW_COUNT] =
 	{";", "==", "<", "<=", "="}; 
 
+#define TOKENIZER_USE_LLONG 1 /* nonzero: integer literals are (unsigned) long long; zero: (unsigned) long */
+
+#if TOKENIZER_USE_LLONG /* wide integer literals */
+typedef long long LiteralInt; /* signed integer literal value */
+typedef unsigned long long LiteralUInt; /* unsigned integer literal value */
+#define LITERAL_INT_FMT "%lld" /* printf format matching LiteralInt */
+#define LITERAL_UINT_FMT "%llu" /* printf format matching LiteralUInt */
+#else /* !TOKENIZER_USE_LLONG: fall back to long */
+typedef long LiteralInt; /* signed integer literal value */
+typedef unsigned long LiteralUInt; /* unsigned integer literal value */
+#define LITERAL_INT_FMT "%ld" /* printf format matching LiteralInt */
+#define LITERAL_UINT_FMT "%lu" /* printf format matching LiteralUInt */
+#endif /* TOKENIZER_USE_LLONG */
+
+typedef double LiteralReal; /* real (floating-point) literal value */
+
+typedef enum { /* tags which member of NumLiteral's union holds the value */
+			  NUM_LITERAL_INT, /* value is in NumLiteral.intval */
+			  NUM_LITERAL_UINT, /* value is in NumLiteral.uintval */
+			  NUM_LITERAL_REAL /* value is in NumLiteral.realval */
+} NumLiteralKind;
+
+typedef struct { /* a numeric literal: a kind tag plus the matching value */
+	NumLiteralKind kind; /* selects which member of the union below is valid */
+	union {
+		LiteralInt intval; /* used when kind == NUM_LITERAL_INT */
+		LiteralUInt uintval; /* used when kind == NUM_LITERAL_UINT */
+		LiteralReal realval; /* used when kind == NUM_LITERAL_REAL */
+	};
+} NumLiteral;
+
 /* NOTE: LineNo is typedef'd in util/err.c */
 typedef struct {
 	TokenKind kind;
@@ -25,6 +58,7 @@ typedef struct {
 	union {
 		Keyword kw;
 		Identifier ident;
+		NumLiteral num;
 	};
 } Token;
 
@@ -45,6 +79,20 @@ static void token_fprint(FILE *out, Token *t) {
 		fprintf(out, "identifier: %ld:", t->ident->id);
 		ident_fprint(out, t->ident);
 		break;
+	case TOKEN_NUM_LITERAL:
+		fprintf(out, "number: ");
+		switch (t->num.kind) {
+		case NUM_LITERAL_INT:
+			fprintf(out, LITERAL_INT_FMT, t->num.intval);
+			break;
+		case NUM_LITERAL_UINT:
+			fprintf(out, LITERAL_UINT_FMT, t->num.uintval);
+			break;
+		case NUM_LITERAL_REAL:
+			fprintf(out, "%f", t->num.realval);
+			break;
+		}
+		break;
 	case TOKEN_EOF:
 		fprintf(out, "eof");
 		break;
@@ -52,22 +100,19 @@ static void token_fprint(FILE *out, Token *t) {
 }
 
 static void tokenizer_add(Tokenizer *t, Token *token, LineNo line, LineNo col) {
-	if (t->ntokens == t->cap) {
+	if (t->ntokens >= t->cap) { /* array is full: double the capacity before appending */
 		t->cap *= 2;
-		t->tokens = realloc(t->tokens, t->cap);
+		t->tokens = err_realloc(t->tokens, t->cap * sizeof(*t->tokens)); /* cap counts tokens; the allocation size must be in bytes (assumes err_realloc mirrors realloc's (ptr, bytes) signature) */
 	}
 	token->line = line;
 	token->col = col;
 	t->tokens[t->ntokens++] = *token;
 }
 
-static Tokenizer tokenize_file(FILE *fp) {
-	char buf[4096];
-	setvbuf(fp, buf, _IOFBF, sizeof buf);
-	char errbuf[256] = {0}; /* for errors */
+static Tokenizer tokenize_string(char *s) {	/* NOTE: May modify string. Don't even try to pass it a literal.*/
 	int has_err = 0;
 	Tokenizer t;
-	t.cap = 4096;
+	t.cap = 4096; /* TODO: test more tokens than this */
 	t.ntokens = 0;
 	t.tokens = malloc(t.cap * sizeof(*t.tokens));
 
@@ -75,20 +120,63 @@ static Tokenizer tokenize_file(FILE *fp) {
 	LineNo col = 1;
 	
 	while (1) {
-		int c = fpeekc(fp);
-	    if (c == EOF) break;
-		if (isspace(c)) {
-			if (c == '\n') {
+	    if (*s == 0) break;
+		if (isspace(*s)) {
+			if (*s == '\n') {
 				line++;
 				col = 0;
 			}
-			fnextc(fp);
-			col++;
+			s++; col++;
 	    	continue;
 		}
+
+		if (*s == '/') {
+			/* maybe it's a comment */
+			int is_comment = 1;
+			s++; col++;
+			switch (*s) {
+			case '/': /* single line comment */
+				for (s++; *s != '\n' && *s; s++);
+				line++;
+				col = 1;
+				break;
+			case '*': { /* multi line comment */
+				int comment_level = 1; /* allow nested multi-line comments */
+			    while (*s) {
+					if (*s == '\n') {
+						line++;
+						col = 1;
+						s++;
+						continue;
+					}
+					if (s[0] == '*' && s[1] == '/') {
+						s += 2; col += 2;
+						comment_level--;
+						if (comment_level == 0) {
+							break;
+						}
+					} else if (s[0] == '/' && s[1] == '*') {
+						s += 2; col += 2;
+						comment_level++;
+					} else {
+						s++; col++;
+					}
+				}
+				if (*s == 0) {
+					err_print(line, col, "End of file reached inside multi-line comment.");
+					abort(); /* there won't be any further errors, of course */
+				}
+			} break;
+			default:
+				is_comment = 0;
+				s--; /* go back */
+				break;
+			}
+			if (is_comment) continue;
+		}
 		Keyword kw;
 		for (kw = 0; kw < KW_COUNT; kw++) {
-			if (fhasprefix(fp, keywords[kw])) {
+			if (strncmp(s, keywords[kw], strlen(keywords[kw])) == 0) {
 				break;
 			}
 		}
@@ -99,35 +187,97 @@ static Tokenizer tokenize_file(FILE *fp) {
 			token.kw = kw;
 			tokenizer_add(&t, &token, line, col);
 			col += (LineNo)strlen(keywords[kw]);
+			s += (LineNo)strlen(keywords[kw]);
 			continue;
 		}
 
-		if (isident(c)) {
+		if (isdigit(*s)) {
+			/* it's a numerical constant */
+			int base = 10;
+			LiteralInt intval = 0;
+			LineNo line_start = line, col_start = col;
+			if (*s == '0') {
+				s++; col++;
+				/* octal/hexadecimal/binary (or zero) */
+				char format = *s;
+				if (isdigit(format)) /* octal */
+					base = 8;
+				else {
+					switch (format) {
+					case 'b':
+						base = 2;
+						s++; col++;
+						break;
+					case 'x':
+						base = 16;
+						s++; col++;
+						break;
+					default:
+						/* it's 0/0.something etc.  */
+						break;
+					}
+				}
+			}
+			while (1) {
+				if (*s == '.') {
+					/* TODO */
+				} else if (*s == 'e') {
+					/* TODO */
+				}
+				int digit = -1;
+				if (base == 16) {
+					if (*s >= 'a' && *s <= 'f')
+						digit = 10 + *s - 'a';
+					else if (*s >= 'A' && *s <= 'F')
+						digit = *s - 'A';
+				}
+				if (digit == -1) {
+					if (*s >= '0' && *s <= '9')
+						digit = *s - '0';
+				}
+				if (digit < 0 || digit >= base) {
+					/* end of numerical literal */
+					break;
+				}
+				/* TODO: check overflow; switch to uint */
+				intval *= base;
+				intval += digit;
+				s++; col++;
+			}
+			Token token;
+			token.kind = TOKEN_NUM_LITERAL;
+			token.num.kind = NUM_LITERAL_INT;
+			token.num.intval = intval;
+			tokenizer_add(&t, &token, line_start, col_start);
+			continue;
+		}
+		
+		if (isident(*s)) {
 			/* it's an identifier */
-			Identifier ident = ident_finsert(fp);
+			Identifier ident = ident_insert(&s);
 			Token token;
 			token.kind = TOKEN_IDENT;
 			token.ident = ident;
 			tokenizer_add(&t, &token, line, col);			
 			continue;
 		}
+
+		int has_newline;
+		char *end_of_line = strchr(s, '\n');
+		has_newline = end_of_line != NULL;
+		if (has_newline)
+			*end_of_line = 0;
 		
-		fgets(errbuf, sizeof errbuf, fp);
-		size_t len = strlen(errbuf);
-		int has_newline = len && errbuf[len-1] == '\n';
-		if (has_newline) {
-			/* remove newline */
-			errbuf[len-1] = 0;
-		}
-		err_print(line, col, "Unrecognized token:\n\there --> %s\n", errbuf);
+		err_print(line, col, "Unrecognized token:\n\there --> %s\n", s);
 		has_err = 1;
 		if (has_newline) {
 			/* increment line counter because of it */
 		    line++;
 			col = 1;
 		} else {
-			col += (LineNo)(sizeof errbuf);
+			col += (LineNo)strlen(s);
 		}
+		s += strlen(s);
 	}
 	/* TODO: Check ferror/errno */
 	if (has_err) {