author     Leo Tenenbaum <pommicket@gmail.com>    2019-08-18 16:42:40 -0400
committer  Leo Tenenbaum <pommicket@gmail.com>    2019-08-18 16:42:40 -0400
commit     00cb291c4bf2c64342b00152e58f2544e66ddb2c
tree       3eaa5ef6beca9049da3fa1e51022671dd3b8f28f
parent     dc3dab7f04f852c3ca7c7850623bedad27f185dd
started parsing expressions
-rw-r--r--  main.c        5
-rw-r--r--  parse.c     212
-rw-r--r--  test.toc      7
-rw-r--r--  tokenizer.c  89
4 files changed, 268 insertions, 45 deletions
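
For reference, these are the declaration forms parse_decls accepts after this change, reconstructed from the parse.c hunks and test.toc below (the names here are illustrative; only single-token initializer expressions are parsed so far):

    x: i8 = 3;       (explicit type with an '=' initializer)
    y := 3;          ('=' directly after ':' leaves the type to be inferred)
    c: float - 4;    ('-' in place of '=' marks a constant declaration)
    z: i8;           (no initializer, so the type must be written out)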
diff --git a/main.c b/main.c
index f9332df..7fefa70 100644
--- a/main.c
+++ b/main.c
@@ -27,14 +27,15 @@ int main(int argc, char **argv) {
}
char *contents = err_malloc(4096);
- long contents_cap = 4096;
+ long contents_cap = 4095;
long contents_len = 0;
while (fgets(contents + contents_len, (int)(contents_cap - contents_len), in)) {
contents_len += (long)strlen(contents + contents_len);
if (contents_len >= (long)contents_cap - 1024) {
contents_cap *= 2;
- contents = err_realloc(contents, (size_t)contents_cap);
+ /* allocate +1 so that pointers don't overflow */
+ contents = err_realloc(contents, (size_t)contents_cap + 1);
}
}
if (ferror(in)) {
diff --git a/parse.c b/parse.c
index 775eac3..9ab9ee1 100644
--- a/parse.c
+++ b/parse.c
@@ -1,10 +1,58 @@
+typedef enum {
+ TYPE_BUILTIN
+} TypeKind;
+
+typedef enum {
+ BUILTIN_INT,
+ BUILTIN_I8,
+ BUILTIN_I16,
+ BUILTIN_I32,
+ BUILTIN_I64,
+ BUILTIN_U8,
+ BUILTIN_U16,
+ BUILTIN_U32,
+ BUILTIN_U64,
+ BUILTIN_FLOAT,
+ BUILTIN_F32,
+ BUILTIN_F64,
+ BUILTIN_TYPE_COUNT
+} BuiltinType;
+
+
+typedef struct {
+ TypeKind kind;
+ union {
+ BuiltinType builtin;
+ };
+} Type;
+
+typedef enum {
+ EXPR_INT_CONST,
+ EXPR_FLOAT_CONST
+} ExprKind;
+
+typedef struct {
+ ExprKind kind;
+ Type type;
+ bool is_flexible_num:1; /* expressions like 5 or 7*8+3 can be any numerical type */
+ union {
+ FloatConst floatc;
+ IntConst intc;
+ };
+} Expression;
+
typedef struct {
Location where;
Identifier var;
- bool is_const;
- bool has_expr;
+ Type type;
+ Expression expr;
+ bool infer_type:1;
+ bool is_const:1;
+ bool has_expr:1;
} Declaration;
+/* OPTIM: Instead of using dynamic arrays, do two passes. */
+
arr_declaration(Declarations, Declaration, decls_)
typedef enum {
@@ -25,7 +73,133 @@ typedef struct {
Statements stmts;
} ParsedFile;
-/* TODO: Add newline tokens back in; give tokens pointer to text */
+
+/* returns BUILTIN_TYPE_COUNT on failure */
+static BuiltinType kw_to_builtin_type(Keyword kw) {
+ switch (kw) {
+ case KW_INT: return BUILTIN_INT;
+ case KW_I8: return BUILTIN_I8;
+ case KW_I16: return BUILTIN_I16;
+ case KW_I32: return BUILTIN_I32;
+ case KW_I64: return BUILTIN_I64;
+ case KW_U8: return BUILTIN_U8;
+ case KW_U16: return BUILTIN_U16;
+ case KW_U32: return BUILTIN_U32;
+ case KW_U64: return BUILTIN_U64;
+ case KW_FLOAT: return BUILTIN_FLOAT;
+ case KW_F32: return BUILTIN_F32;
+ case KW_F64: return BUILTIN_F64;
+ default: return BUILTIN_TYPE_COUNT;
+ }
+}
+
+static bool parse_type(Type *type, Tokenizer *t) {
+ switch (t->token->kind) {
+ case TOKEN_KW:
+ type->kind = TYPE_BUILTIN;
+ type->builtin = kw_to_builtin_type(t->token->kw);
+ if (type->builtin == BUILTIN_TYPE_COUNT) {
+ tokr_err(t, "Expected type.");
+ return false;
+ } else {
+ t->token++;
+ return true;
+ }
+ break;
+ default: break;
+ }
+ tokr_err(t, "Unrecognized type.");
+ return false;
+}
+
+static bool parse_expr(Expression *e, Tokenizer *t, Token *end) {
+ if (end == NULL) return false;
+ memset(e, 0, sizeof *e);
+ if (end - t->token == 1) {
+ /* 1-token expression */
+ switch (t->token->kind) {
+ case TOKEN_NUM_CONST: {
+ NumConst *num = &t->token->num;
+ switch (num->kind) {
+ case NUM_CONST_FLOAT:
+ e->kind = EXPR_FLOAT_CONST;
+ e->type.kind = TYPE_BUILTIN;
+ e->type.builtin = BUILTIN_FLOAT;
+ e->floatc = num->floatval;
+ break;
+ case NUM_CONST_INT:
+ e->kind = EXPR_INT_CONST;
+ e->is_flexible_num = true;
+ e->type.kind = TYPE_BUILTIN;
+ e->type.builtin = BUILTIN_INT; /* TODO: if it's too big, use a u64 instead. */
+ e->intc = num->intval;
+ break;
+ }
+ } break;
+ default:
+ tokr_err(t, "Unrecognized expression.");
+ return false;
+ }
+ t->token = end;
+ return true;
+ }
+ /* TODO */
+ tokr_err(t, "multi-token exprs not supported yet.");
+ return false;
+}
+
+/*
+ends_with = which kind of token ends this expression?
+if it's EXPR_END_RPAREN_OR_COMMA, this will match parentheses properly.
+*/
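+/* Illustrative example: with t->token at the "3" of "3;", expr_find_end(t, EXPR_END_SEMICOLON)
+   returns a pointer to the ';' token; t->token itself is not advanced. */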
+typedef enum {
+ EXPR_END_RPAREN_OR_COMMA,
+ EXPR_END_SEMICOLON
+} ExprEndKind;
+static Token *expr_find_end(Tokenizer *t, ExprEndKind ends_with) {
+ long bracket_level = 0;
+ Token *token = t->token;
+ while (1) {
+ switch (ends_with) {
+ case EXPR_END_RPAREN_OR_COMMA:
+ if (token->kind == TOKEN_KW) {
+ if (token->kw == KW_COMMA && bracket_level == 0)
+ return token;
+ if (token->kw == KW_LPAREN)
+ bracket_level++;
+ if (token->kw == KW_RPAREN) {
+ bracket_level--;
+ if (bracket_level == 0) {
+ return token;
+ }
+ }
+ }
+ break;
+ case EXPR_END_SEMICOLON:
+ if (token_is_kw(token, KW_SEMICOLON))
+ return token;
+ break;
+ }
+ if (token->kind == TOKEN_EOF) {
+ switch (ends_with) {
+ case EXPR_END_SEMICOLON:
+ tokr_err(t, "Could not find ';' at end of expression.");
+ return NULL;
+ case EXPR_END_RPAREN_OR_COMMA:
+ if (bracket_level > 0) {
+ tokr_err(t, "Mismatched parentheses."); /* FEATURE: Find out where this is */
+ return NULL;
+ } else {
+ tokr_err(t, "Could not find ')' or ',' at end of expression.");
+ return NULL;
+ }
+ return NULL;
+ }
+ }
+ token++;
+ }
+}
+
static bool parse_decls(Declarations *ds, Tokenizer *t) {
decls_create(ds);
while (1) {
@@ -43,19 +217,36 @@ static bool parse_decls(Declarations *ds, Tokenizer *t) {
tokr_err(t, "Expected ':' in declaration.");
return false;
}
-
- /* TODO: type */
-
t->token++;
-
+
+ if (!token_is_kw(t->token, KW_MINUS)
+ && !token_is_kw(t->token, KW_EQ)
+ && !token_is_kw(t->token, KW_SEMICOLON)) {
+ if (!parse_type(&decl.type, t))
+ return false;
+ } else {
+ decl.infer_type = true;
+ }
+
if (token_is_kw(t->token, KW_SEMICOLON)) {
+ if (decl.infer_type) {
+ tokr_err(t, "Cannot infer type without expression.");
+ return false;
+ }
} else if (token_is_kw(t->token, KW_EQ)) {
t->token++;
+ if (!parse_expr(&decl.expr, t, expr_find_end(t, EXPR_END_SEMICOLON)))
+ return false;
decl.has_expr = true;
} else if (token_is_kw(t->token, KW_MINUS)) {
t->token++;
+ if (!parse_expr(&decl.expr, t, expr_find_end(t, EXPR_END_SEMICOLON)))
+ return false;
decl.has_expr = true;
decl.is_const = true;
+ } else {
+ tokr_err(t, "Expected ';', '=', or '-' in delaration.");
+ return false;
}
decls_add(ds, &decl);
if (token_is_kw(t->token, KW_SEMICOLON)) {
@@ -92,6 +283,13 @@ static bool parse_file(ParsedFile *f, Tokenizer *t) {
return ret;
}
+static void expr_fprint(FILE *out, Expression *e) {
+ /* TODO */
+/* switch (e->kind) { */
+/* case : */
+/* } */
+}
+
static void decl_fprint(FILE *out, Declaration *d) {
fprintf(out, "l%lu:", (unsigned long)d->where.line);
ident_fprint(out, d->var);
diff --git a/test.toc b/test.toc
index 10a9cff..a178ceb 100644
--- a/test.toc
+++ b/test.toc
@@ -1,4 +1,3 @@
-P := ;
-Q := ;
-R := , foo :;
-S :, R :-;
\ No newline at end of file
+foo:i8=3;
+foo:i8=3;
+a:float-4;
\ No newline at end of file
diff --git a/tokenizer.c b/tokenizer.c
index 447ffbc..e65b2e3 100644
--- a/tokenizer.c
+++ b/tokenizer.c
@@ -19,31 +19,57 @@ typedef enum {
KW_RBRACE,
KW_EQEQ,
KW_LT,
- KW_LE,
+ KW_LE,
KW_MINUS,
+ KW_INT,
+ KW_I8,
+ KW_I16,
+ KW_I32,
+ KW_I64,
+ KW_U8,
+ KW_U16,
+ KW_U32,
+ KW_U64,
+ KW_FLOAT,
+ KW_F32,
+ KW_F64,
KW_COUNT
} Keyword;
-/* OPTIM: Use a trie or just a function if this gets too long */
static const char *keywords[KW_COUNT] =
- {";", "=", ":", ",", "fn", "(", ")", "{", "}", "==", "<", "<=", "-"};
+ {";", "=", ":", ",", "fn", "(", ")", "{", "}", "==", "<", "<=", "-",
+ "int", "i8", "i16", "i32", "i64", "u8", "u16", "u32", "u64", "float", "f32",
+ "f64"};
+
+/* Returns KW_COUNT if it's not a keyword */
+/* OPTIM: don't use strncmp so much */
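+/* Illustrative example: if *s points at "i32 x", this matches "i32",
+   advances *s past it, and returns KW_I32. */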
+static Keyword tokenize_keyword(char **s) {
+ for (Keyword k = 0; k < KW_COUNT; k++) {
+ size_t len = strlen(keywords[k]);
+ if (strncmp(*s, keywords[k], len) == 0) {
+ *s += len;
+ return k;
+ }
+ }
+ return KW_COUNT;
+}
#define TOKR_USE_LLONG 1
typedef unsigned long long IntConst;
-typedef long double RealConst; /* OPTIM: Switch to double */
+typedef long double FloatConst; /* OPTIM: Switch to double */
typedef enum {
NUM_CONST_INT,
- NUM_CONST_REAL
+ NUM_CONST_FLOAT
} NumConstKind;
typedef struct {
NumConstKind kind;
union {
IntConst intval;
- RealConst realval;
+ FloatConst floatval;
};
} NumConst;
@@ -79,6 +105,8 @@ typedef struct {
Token *token; /* token currently being processed */
} Tokenizer;
+
+
static bool token_is_kw(Token *t, Keyword kw) {
return t->kind == TOKEN_KW && t->kw == kw;
}
@@ -99,8 +127,8 @@ static void token_fprint(FILE *out, Token *t) {
case NUM_CONST_INT:
fprintf(out, "%llu", t->num.intval);
break;
- case NUM_CONST_REAL:
- fprintf(out, "%g", (double)t->num.realval);
+ case NUM_CONST_FLOAT:
+ fprintf(out, "%g", (double)t->num.floatval);
break;
}
break;
@@ -245,20 +273,17 @@ static bool tokenize_string(Tokenizer *tokr, char *str) {
}
if (is_comment) continue;
}
- Keyword kw;
- for (kw = 0; kw < KW_COUNT; kw++) {
- if (strncmp(t.s, keywords[kw], strlen(keywords[kw])) == 0) {
- break;
- }
- }
- if (kw != KW_COUNT) {
- /* it's a keyword */
+ {
Token token = {0};
- token.kind = TOKEN_KW;
- token.kw = kw;
- tokr_add(&t, &token);
- t.s += (LineNo)strlen(keywords[kw]);
- continue;
+ tokr_put_location(&t, &token);
+ Keyword kw = tokenize_keyword(&t.s);
+ if (kw != KW_COUNT) {
+ /* it's a keyword */
+ token.kind = TOKEN_KW;
+ token.kw = kw;
+ tokr_add(&t, &token);
+ continue;
+ }
}
/* check if it's a number */
@@ -266,7 +291,7 @@ static bool tokenize_string(Tokenizer *tokr, char *str) {
if (isdigit(*t.s)) {
/* it's a numeric constant */
int base = 10;
- RealConst decimal_pow10;
+ FloatConst decimal_pow10;
NumConst n;
n.kind = NUM_CONST_INT;
n.intval = 0;
@@ -297,7 +322,7 @@ static bool tokenize_string(Tokenizer *tokr, char *str) {
while (1) {
if (*t.s == '.') {
- if (n.kind == NUM_CONST_REAL) {
+ if (n.kind == NUM_CONST_FLOAT) {
tokenization_err(&t, "Double . in number.");
goto err;
}
@@ -305,16 +330,16 @@ static bool tokenize_string(Tokenizer *tokr, char *str) {
tokenization_err(&t, "Decimal point in non base 10 number.");
goto err;
}
- n.kind = NUM_CONST_REAL;
+ n.kind = NUM_CONST_FLOAT;
decimal_pow10 = 0.1;
- n.realval = (RealConst)n.intval;
+ n.floatval = (FloatConst)n.intval;
tokr_nextchar(&t);
continue;
} else if (*t.s == 'e') {
tokr_nextchar(&t);
if (n.kind == NUM_CONST_INT) {
- n.kind = NUM_CONST_REAL;
- n.realval = (RealConst)n.intval;
+ n.kind = NUM_CONST_FLOAT;
+ n.floatval = (FloatConst)n.intval;
}
/* TODO: check if exceeding maximum exponent */
int exponent = 0;
@@ -333,9 +358,9 @@ static bool tokenize_string(Tokenizer *tokr, char *str) {
/* OPTIM: Slow for very large exponents (unlikely to happen) */
for (int i = 0; i < exponent; i++) {
if (negative_exponent)
- n.realval /= 10;
+ n.floatval /= 10;
else
- n.realval *= 10;
+ n.floatval *= 10;
}
break;
@@ -371,8 +396,8 @@ static bool tokenize_string(Tokenizer *tokr, char *str) {
n.intval *= (IntConst)base;
n.intval += (IntConst)digit;
break;
- case NUM_CONST_REAL:
- n.realval += decimal_pow10 * (RealConst)digit;
+ case NUM_CONST_FLOAT:
+ n.floatval += decimal_pow10 * (FloatConst)digit;
decimal_pow10 /= 10;
break;
}
@@ -469,7 +494,7 @@ static bool tokenize_string(Tokenizer *tokr, char *str) {
Identifier ident = ident_insert(&t.s);
token.kind = TOKEN_IDENT;
token.ident = ident;
- tokr_add(&t, &token);
+ tokr_add(&t, &token);
continue;
}
tokenization_err(&t, "Token not recognized");