started parsing expressions

author: Leo Tenenbaum <pommicket@gmail.com> 2019-08-18 16:42:40 -0400
committer: Leo Tenenbaum <pommicket@gmail.com> 2019-08-18 16:42:40 -0400
commit: 00cb291c4bf2c64342b00152e58f2544e66ddb2c (patch)
tree: 3eaa5ef6beca9049da3fa1e51022671dd3b8f28f
parent: dc3dab7f04f852c3ca7c7850623bedad27f185dd (diff)
4 files changed, 268 insertions, 45 deletions
diff --git a/main.c b/main.c
index f9332df..7fefa70 100644
--- a/main.c
+++ b/main.c
@@ -27,14 +27,15 @@ int main(int argc, char **argv) {
 	}
 	
 	char *contents = err_malloc(4096);
-	long contents_cap = 4096;
+	long contents_cap = 4095;
 	long contents_len = 0;
 	while (fgets(contents + contents_len, (int)(contents_cap - contents_len), in)) {
 		contents_len += (long)strlen(contents + contents_len);
 		
 		if (contents_len >= (long)contents_cap - 1024) {
 			contents_cap *= 2;
-			contents = err_realloc(contents, (size_t)contents_cap);
+			/* allocate +1 so that pointers don't overflow */
+			contents = err_realloc(contents, (size_t)contents_cap + 1);
 		}
 	}
 	if (ferror(in)) {
diff --git a/parse.c b/parse.c
index 775eac3..9ab9ee1 100644
--- a/parse.c
+++ b/parse.c
@@ -1,10 +1,58 @@
+/* The category a Type falls into. Builtin types are the only category so far. */
+typedef enum {
+			  TYPE_BUILTIN
+} TypeKind;
+
+/*
+The builtin types, one for each type keyword handled by kw_to_builtin_type.
+BUILTIN_TYPE_COUNT is not a type itself: kw_to_builtin_type returns it for a
+keyword which does not name a type.
+*/
+typedef enum {
+			  BUILTIN_INT,
+			  BUILTIN_I8,
+			  BUILTIN_I16,
+			  BUILTIN_I32,
+			  BUILTIN_I64,
+			  BUILTIN_U8,
+			  BUILTIN_U16,
+			  BUILTIN_U32,
+			  BUILTIN_U64,
+			  BUILTIN_FLOAT,
+			  BUILTIN_F32,
+			  BUILTIN_F64,
+			  BUILTIN_TYPE_COUNT
+} BuiltinType;
+
+
+/* A type. kind tells which member of the union is meaningful. */
+typedef struct {
+	TypeKind kind;
+	union {
+	    BuiltinType builtin; /* which builtin type this is, when kind == TYPE_BUILTIN */
+	};
+} Type;
+
+/* The kinds of expression; only integer and floating-point constants exist so far. */
+typedef enum {
+			  EXPR_INT_CONST,
+			  EXPR_FLOAT_CONST
+} ExprKind;
+
+/* A parsed expression (filled in by parse_expr). */
+typedef struct {
+	ExprKind kind; /* which kind of expression this is */
+	Type type; /* the type given to the expression's value */
+	bool is_flexible_num:1; /* expressions like 5 or 7*8+3 can be any numerical type */
+	union {
+		FloatConst floatc; /* the value, when kind == EXPR_FLOAT_CONST */
+		IntConst intc; /* the value intended for kind == EXPR_INT_CONST */
+	};
+} Expression;
+
 typedef struct {
 	Location where;
 	Identifier var;
-	bool is_const;
-	bool has_expr;
+	Type type;
+	Expression expr;
+	bool infer_type:1;
+	bool is_const:1;
+	bool has_expr:1;
 } Declaration;
 
+/* OPTIM: Instead of using dynamic arrays, do two passes. */
+
 arr_declaration(Declarations, Declaration, decls_)
 
 typedef enum {
@@ -25,7 +73,133 @@ typedef struct {
 	Statements stmts;
 } ParsedFile;
 
-/* TODO: Add newline tokens back in; give tokens pointer to text */
+
+/*
+Maps a type keyword (KW_INT through KW_F64) to the matching BuiltinType.
+Every other keyword gives BUILTIN_TYPE_COUNT, which signals failure.
+*/
+static BuiltinType kw_to_builtin_type(Keyword kw) {
+	BuiltinType builtin = BUILTIN_TYPE_COUNT; /* kept unless kw names a type */
+	switch (kw) {
+	case KW_INT:
+		builtin = BUILTIN_INT;
+		break;
+	case KW_I8:
+		builtin = BUILTIN_I8;
+		break;
+	case KW_I16:
+		builtin = BUILTIN_I16;
+		break;
+	case KW_I32:
+		builtin = BUILTIN_I32;
+		break;
+	case KW_I64:
+		builtin = BUILTIN_I64;
+		break;
+	case KW_U8:
+		builtin = BUILTIN_U8;
+		break;
+	case KW_U16:
+		builtin = BUILTIN_U16;
+		break;
+	case KW_U32:
+		builtin = BUILTIN_U32;
+		break;
+	case KW_U64:
+		builtin = BUILTIN_U64;
+		break;
+	case KW_FLOAT:
+		builtin = BUILTIN_FLOAT;
+		break;
+	case KW_F32:
+		builtin = BUILTIN_F32;
+		break;
+	case KW_F64:
+		builtin = BUILTIN_F64;
+		break;
+	default:
+		break;
+	}
+	return builtin;
+}
+
+/*
+Parses the type starting at t->token into *type.
+On success t->token is moved past the type and true is returned.
+On failure an error is reported with tokr_err, t->token is left where it was,
+and false is returned (*type may have been partly written).
+*/
+static bool parse_type(Type *type, Tokenizer *t) {
+	if (t->token->kind != TOKEN_KW) {
+		tokr_err(t, "Unrecognized type.");
+		return false;
+	}
+	/* the only types so far are builtin ones, each named by a keyword */
+	type->kind = TYPE_BUILTIN;
+	type->builtin = kw_to_builtin_type(t->token->kw);
+	if (type->builtin == BUILTIN_TYPE_COUNT) {
+		/* a keyword, but not one which names a type */
+		tokr_err(t, "Expected type.");
+		return false;
+	}
+	t->token++;
+	return true;
+}
+
+/*
+Parses the expression made up of the tokens from t->token up to, but not
+including, end, and stores it in *e.
+end is normally the result of expr_find_end; if it is NULL (no end was found),
+false is returned right away without reporting another error.
+On success t->token is set to end and true is returned. Otherwise an error is
+reported with tokr_err and false is returned; *e has been zeroed by then.
+Only single-token numeric constants are supported so far.
+*/
+static bool parse_expr(Expression *e, Tokenizer *t, Token *end) {
+	if (end == NULL) return false;
+	memset(e, 0, sizeof *e);
+	if (end - t->token == 1) {
+		/* 1-token expression */
+		switch (t->token->kind) {
+		case TOKEN_NUM_CONST: {
+			NumConst *num = &t->token->num;
+			switch (num->kind) {
+			case NUM_CONST_FLOAT:
+				e->kind = EXPR_FLOAT_CONST;
+				e->type.kind = TYPE_BUILTIN;
+				e->type.builtin = BUILTIN_FLOAT;
+				e->floatc = num->floatval;
+				break;
+			case NUM_CONST_INT:
+				e->kind = EXPR_INT_CONST;
+				e->is_flexible_num = true;
+				e->type.kind = TYPE_BUILTIN;
+				e->type.builtin = BUILTIN_INT; /* TODO: if it's too big, use a u64 instead. */
+				/* an integer constant goes in intc; floatc shares its storage and is for floats only */
+				e->intc = num->intval;
+				break;
+			}
+		} break;
+		default:
+			tokr_err(t, "Unrecognized expression.");
+			return false;
+		}
+		t->token = end;
+		return true;
+	}
+	/* TODO */
+	tokr_err(t, "multi-token exprs not supported yet.");
+	return false;
+}
+
+/*
+What ends the expression that expr_find_end is scanning:
+EXPR_END_RPAREN_OR_COMMA - a ',' outside of any parentheses opened within the
+                           expression, or the ')' which has no matching '('
+                           within the expression
+EXPR_END_SEMICOLON       - the first ';'
+*/
+typedef enum {
+	  EXPR_END_RPAREN_OR_COMMA,
+	  EXPR_END_SEMICOLON
+} ExprEndKind;
+
+/*
+Finds the token which ends the expression starting at t->token.
+t->token itself is not moved.
+Returns that token, or NULL (after reporting an error with tokr_err) if
+TOKEN_EOF is reached first.
+For EXPR_END_RPAREN_OR_COMMA this assumes t->token is just inside parentheses
+whose '(' has already been consumed - NOTE(review): confirm against callers.
+*/
+static Token *expr_find_end(Tokenizer *t, ExprEndKind ends_with) {
+	long bracket_level = 0; /* '('s seen since t->token which are still open */
+	Token *token = t->token;
+	while (1) {
+		switch (ends_with) {
+		case EXPR_END_RPAREN_OR_COMMA:
+			if (token->kind == TOKEN_KW) {
+				if (token->kw == KW_COMMA && bracket_level == 0)
+					return token;
+				if (token->kw == KW_LPAREN)
+					bracket_level++;
+				if (token->kw == KW_RPAREN) {
+					bracket_level--;
+					/*
+					the scan starts at level 0, so the ')' which ends the
+					expression is the one that takes the level below 0
+					*/
+					if (bracket_level < 0)
+						return token;
+				}
+			}
+			break;
+		case EXPR_END_SEMICOLON:
+			if (token_is_kw(token, KW_SEMICOLON))
+				return token;
+			break;
+		}
+		if (token->kind == TOKEN_EOF) {
+			switch (ends_with) {
+			case EXPR_END_SEMICOLON:
+				tokr_err(t, "Could not find ';' at end of expression.");
+				break;
+			case EXPR_END_RPAREN_OR_COMMA:
+				if (bracket_level > 0)
+					tokr_err(t, "Mismatched parentheses."); /* FEATURE: Find out where this is */
+				else
+					tokr_err(t, "Could not find ')' or ',' at end of expression.");
+				break;
+			}
+			/* never walk past the EOF token, whatever ends_with is */
+			return NULL;
+		}
+		token++;
+	}
+}
+
 static bool parse_decls(Declarations *ds, Tokenizer *t) {
 	decls_create(ds);
 	while (1) {
@@ -43,19 +217,36 @@ static bool parse_decls(Declarations *ds, Tokenizer *t) {
 			tokr_err(t, "Expected ':' in declaration.");
 			return false;
 		}
-
-		/* TODO: type */
-
 		t->token++;
-	
+
+		if (!token_is_kw(t->token, KW_MINUS)
+			&& !token_is_kw(t->token, KW_EQ)
+			&& !token_is_kw(t->token, KW_SEMICOLON)) {
+			if (!parse_type(&decl.type, t))
+				return false;
+		} else {
+			decl.infer_type = true;
+		}
+		
 		if (token_is_kw(t->token, KW_SEMICOLON)) {
+			if (decl.infer_type) {
+				tokr_err(t, "Cannot infer type without expression.");
+				return false;
+			}
 		} else if (token_is_kw(t->token, KW_EQ)) {
 			t->token++;
+			if (!parse_expr(&decl.expr, t, expr_find_end(t, EXPR_END_SEMICOLON)))
+				return false;
 			decl.has_expr = true;
 		} else if (token_is_kw(t->token, KW_MINUS)) {
 			t->token++;
+			if (!parse_expr(&decl.expr, t, expr_find_end(t, EXPR_END_SEMICOLON)))
+				return false;
 			decl.has_expr = true;
 			decl.is_const = true;
+		} else {
+			tokr_err(t, "Expected ';', '=', or '-' in delaration.");
+			return false;
 		}
 		decls_add(ds, &decl);
 		if (token_is_kw(t->token, KW_SEMICOLON)) {
@@ -92,6 +283,13 @@ static bool parse_file(ParsedFile *f, Tokenizer *t) {
 	return ret;
 }
 
+/* Meant to print the expression e to out. Not implemented yet: it prints nothing. */
+static void expr_fprint(FILE *out, Expression *e) {
+	/* TODO: switch on e->kind and print each kind of expression */
+	(void)out;
+	(void)e;
+}
+
 static void decl_fprint(FILE *out, Declaration *d) {
 	fprintf(out, "l%lu:", (unsigned long)d->where.line);
 	ident_fprint(out, d->var);
diff --git a/test.toc b/test.toc
index 10a9cff..a178ceb 100644
--- a/test.toc
+++ b/test.toc
@@ -1,4 +1,3 @@
-P := ;
-Q := ;
-R := , foo :;
-S :, R :-;
-\ No newline at end of file
+foo:i8=3;
+foo:i8=3;
+a:float-4;
+\ No newline at end of file
diff --git a/tokenizer.c b/tokenizer.c
index 447ffbc..e65b2e3 100644
--- a/tokenizer.c
+++ b/tokenizer.c
@@ -19,31 +19,57 @@ typedef enum {
 			  KW_RBRACE,
 			  KW_EQEQ,
 			  KW_LT,
-			  KW_LE,
+			  KW_LE,			  
 			  KW_MINUS,
+			  KW_INT,
+			  KW_I8,
+			  KW_I16,
+			  KW_I32,
+			  KW_I64,
+			  KW_U8,
+			  KW_U16,
+			  KW_U32,
+			  KW_U64,
+			  KW_FLOAT,
+			  KW_F32,
+			  KW_F64,
 			  KW_COUNT
 } Keyword;
 
-/* OPTIM: Use a trie or just a function if this gets too long */
 static const char *keywords[KW_COUNT] =
-	{";", "=", ":", ",", "fn", "(", ")", "{", "}", "==", "<", "<=", "-"}; 
+	{";", "=", ":", ",", "fn", "(", ")", "{", "}", "==", "<", "<=", "-",
+	 "int", "i8", "i16", "i32", "i64", "u8", "u16", "u32", "u64", "float", "f32",
+	 "f64"};
+
+/*
+Tries to read a keyword at the start of *s.
+If there is one, *s is moved past it and the keyword is returned.
+Otherwise *s is left alone and KW_COUNT is returned.
+The longest keyword which matches wins, so "==" is read as KW_EQEQ rather than
+as KW_EQ twice, whatever the order of the keywords array.
+A keyword which starts with a letter is not matched when it is only the
+beginning of a longer word (e.g. "int" in "integer").
+*/
+/* OPTIM: don't use strncmp so much */
+static Keyword tokenize_keyword(char **s) {
+	Keyword best = KW_COUNT;
+	size_t best_len = 0;
+	for (Keyword k = 0; k < KW_COUNT; k++) {
+		const char *kw = keywords[k];
+		size_t len = strlen(kw);
+		if (len <= best_len || strncmp(*s, kw, len) != 0)
+			continue; /* no match, or no longer than what we already have */
+		if (isalpha((unsigned char)kw[0])) {
+			/*
+			(*s)[len] is safe to read: the first len chars matched kw, so
+			none of them is the null terminator.
+			NOTE(review): assumes identifiers are made of letters, digits
+			and '_' - confirm against ident_insert.
+			*/
+			unsigned char next = (unsigned char)(*s)[len];
+			if (isalnum(next) || next == '_')
+				continue; /* just the start of a longer word */
+		}
+		best = k;
+		best_len = len;
+	}
+	*s += best_len; /* best_len is 0 if nothing matched */
+	return best;
+}
 
 #define TOKR_USE_LLONG 1
 
 typedef unsigned long long IntConst;
 
-typedef long double RealConst; /* OPTIM: Switch to double */
+typedef long double FloatConst; /* OPTIM: Switch to double */
 
 typedef enum {
 			  NUM_CONST_INT,
-			  NUM_CONST_REAL
+			  NUM_CONST_FLOAT
 } NumConstKind;
 
 typedef struct {
 	NumConstKind kind;
 	union {
 		IntConst intval;
-		RealConst realval;
+		FloatConst floatval;
 	};
 } NumConst;
 
@@ -79,6 +105,8 @@ typedef struct {
 	Token *token; /* token currently being processed */
 } Tokenizer;
 
+
+
 static bool token_is_kw(Token *t, Keyword kw) {
 	return t->kind == TOKEN_KW && t->kw == kw;
 }
@@ -99,8 +127,8 @@ static void token_fprint(FILE *out, Token *t) {
 		case NUM_CONST_INT:
 			fprintf(out, "%llu", t->num.intval);
 			break;
-		case NUM_CONST_REAL:
-			fprintf(out, "%g", (double)t->num.realval);
+		case NUM_CONST_FLOAT:
+			fprintf(out, "%g", (double)t->num.floatval);
 			break;
 		}
 		break;
@@ -245,20 +273,17 @@ static bool tokenize_string(Tokenizer *tokr, char *str) {
 			}
 			if (is_comment) continue;
 		}
-		Keyword kw;
-		for (kw = 0; kw < KW_COUNT; kw++) {
-			if (strncmp(t.s, keywords[kw], strlen(keywords[kw])) == 0) {
-				break;
-			}
-		}
-		if (kw != KW_COUNT) {
-			/* it's a keyword */
+		{
 			Token token = {0};
-			token.kind = TOKEN_KW;
-			token.kw = kw;
-			tokr_add(&t, &token);
-			t.s += (LineNo)strlen(keywords[kw]);
-			continue;
+			tokr_put_location(&t, &token);
+			Keyword kw = tokenize_keyword(&t.s);
+			if (kw != KW_COUNT) {
+				/* it's a keyword */
+				token.kind = TOKEN_KW;
+				token.kw = kw;
+				tokr_add(&t, &token);
+				continue;
+			}
 		}
 		
 		/* check if it's a number */
@@ -266,7 +291,7 @@ static bool tokenize_string(Tokenizer *tokr, char *str) {
 		if (isdigit(*t.s)) {
 			/* it's a numeric constant */
 			int base = 10;
-			RealConst decimal_pow10;
+			FloatConst decimal_pow10;
 			NumConst n;
 			n.kind = NUM_CONST_INT;
 			n.intval = 0;
@@ -297,7 +322,7 @@ static bool tokenize_string(Tokenizer *tokr, char *str) {
 
 			while (1) {
 				if (*t.s == '.') {
-					if (n.kind == NUM_CONST_REAL) {
+					if (n.kind == NUM_CONST_FLOAT) {
 						tokenization_err(&t, "Double . in number.");
 						goto err;
 					}
@@ -305,16 +330,16 @@ static bool tokenize_string(Tokenizer *tokr, char *str) {
 						tokenization_err(&t, "Decimal point in non base 10 number.");
 						goto err;
 					}
-				    n.kind = NUM_CONST_REAL;
+				    n.kind = NUM_CONST_FLOAT;
 					decimal_pow10 = 0.1;
-					n.realval = (RealConst)n.intval;
+					n.floatval = (FloatConst)n.intval;
 					tokr_nextchar(&t);
 					continue;
 				} else if (*t.s == 'e') {
 					tokr_nextchar(&t);
 					if (n.kind == NUM_CONST_INT) {
-						n.kind = NUM_CONST_REAL;
-						n.realval = (RealConst)n.intval;
+						n.kind = NUM_CONST_FLOAT;
+						n.floatval = (FloatConst)n.intval;
 					}
 					/* TODO: check if exceeding maximum exponent */
 					int exponent = 0;
@@ -333,9 +358,9 @@ static bool tokenize_string(Tokenizer *tokr, char *str) {
 					/* OPTIM: Slow for very large exponents (unlikely to happen) */
 					for (int i = 0; i < exponent; i++) {
 						if (negative_exponent)
-							n.realval /= 10;
+							n.floatval /= 10;
 						else
-							n.realval *= 10;
+							n.floatval *= 10;
 					}
 						
 					break;
@@ -371,8 +396,8 @@ static bool tokenize_string(Tokenizer *tokr, char *str) {
 					n.intval *= (IntConst)base;
 					n.intval += (IntConst)digit;
 					break;
-				case NUM_CONST_REAL:
-					n.realval += decimal_pow10 * (RealConst)digit;
+				case NUM_CONST_FLOAT:
+					n.floatval += decimal_pow10 * (FloatConst)digit;
 					decimal_pow10 /= 10;
 					break;
 				}
@@ -469,7 +494,7 @@ static bool tokenize_string(Tokenizer *tokr, char *str) {
 			Identifier ident = ident_insert(&t.s);
 			token.kind = TOKEN_IDENT;
 			token.ident = ident;
-			tokr_add(&t, &token);			
+			tokr_add(&t, &token);
 			continue;
 		}		
 		tokenization_err(&t, "Token not recognized");
author	Leo Tenenbaum <pommicket@gmail.com>	2019-08-18 16:42:40 -0400
committer	Leo Tenenbaum <pommicket@gmail.com>	2019-08-18 16:42:40 -0400
commit	00cb291c4bf2c64342b00152e58f2544e66ddb2c (patch)
tree	3eaa5ef6beca9049da3fa1e51022671dd3b8f28f
parent	dc3dab7f04f852c3ca7c7850623bedad27f185dd (diff)