diff options
author | Leo Tenenbaum <pommicket@gmail.com> | 2019-08-18 16:42:40 -0400 |
---|---|---|
committer | Leo Tenenbaum <pommicket@gmail.com> | 2019-08-18 16:42:40 -0400 |
commit | 00cb291c4bf2c64342b00152e58f2544e66ddb2c (patch) | |
tree | 3eaa5ef6beca9049da3fa1e51022671dd3b8f28f | |
parent | dc3dab7f04f852c3ca7c7850623bedad27f185dd (diff) |
started parsing expressions
-rw-r--r-- | main.c | 5 | ||||
-rw-r--r-- | parse.c | 212 | ||||
-rw-r--r-- | test.toc | 7 | ||||
-rw-r--r-- | tokenizer.c | 89 |
4 files changed, 268 insertions, 45 deletions
@@ -27,14 +27,15 @@ int main(int argc, char **argv) { } char *contents = err_malloc(4096); - long contents_cap = 4096; + long contents_cap = 4095; long contents_len = 0; while (fgets(contents + contents_len, (int)(contents_cap - contents_len), in)) { contents_len += (long)strlen(contents + contents_len); if (contents_len >= (long)contents_cap - 1024) { contents_cap *= 2; - contents = err_realloc(contents, (size_t)contents_cap); + /* allocate +1 so that pointers don't overflow */ + contents = err_realloc(contents, (size_t)contents_cap + 1); } } if (ferror(in)) { @@ -1,10 +1,58 @@ +typedef enum { + TYPE_BUILTIN +} TypeKind; + +typedef enum { + BUILTIN_INT, + BUILTIN_I8, + BUILTIN_I16, + BUILTIN_I32, + BUILTIN_I64, + BUILTIN_U8, + BUILTIN_U16, + BUILTIN_U32, + BUILTIN_U64, + BUILTIN_FLOAT, + BUILTIN_F32, + BUILTIN_F64, + BUILTIN_TYPE_COUNT +} BuiltinType; + + +typedef struct { + TypeKind kind; + union { + BuiltinType builtin; + }; +} Type; + +typedef enum { + EXPR_INT_CONST, + EXPR_FLOAT_CONST +} ExprKind; + +typedef struct { + ExprKind kind; + Type type; + bool is_flexible_num:1; /* expressions like 5 or 7*8+3 can be any numerical type */ + union { + FloatConst floatc; + IntConst intc; + }; +} Expression; + typedef struct { Location where; Identifier var; - bool is_const; - bool has_expr; + Type type; + Expression expr; + bool infer_type:1; + bool is_const:1; + bool has_expr:1; } Declaration; +/* OPTIM: Instead of using dynamic arrays, do two passes. */ + arr_declaration(Declarations, Declaration, decls_) typedef enum { @@ -25,7 +73,133 @@ typedef struct { Statements stmts; } ParsedFile; -/* TODO: Add newline tokens back in; give tokens pointer to text */ + +/* returns BUILTIN_TYPE_COUNT on failure */ +static BuiltinType kw_to_builtin_type(Keyword kw) { + switch (kw) { + case KW_INT: return BUILTIN_INT; + case KW_I8: return BUILTIN_I8; + case KW_I16: return BUILTIN_I16; + case KW_I32: return BUILTIN_I32; + case KW_I64: return BUILTIN_I64; + case KW_U8: return BUILTIN_U8; + case KW_U16: return BUILTIN_U16; + case KW_U32: return BUILTIN_U32; + case KW_U64: return BUILTIN_U64; + case KW_FLOAT: return BUILTIN_FLOAT; + case KW_F32: return BUILTIN_F32; + case KW_F64: return BUILTIN_F64; + default: return BUILTIN_TYPE_COUNT; + } +} + +static bool parse_type(Type *type, Tokenizer *t) { + switch (t->token->kind) { + case TOKEN_KW: + type->kind = TYPE_BUILTIN; + type->builtin = kw_to_builtin_type(t->token->kw); + if (type->builtin == BUILTIN_TYPE_COUNT) { + tokr_err(t, "Expected type."); + return false; + } else { + t->token++; + return true; + } + break; + default: break; + } + tokr_err(t, "Unrecognized type."); + return false; +} + +static bool parse_expr(Expression *e, Tokenizer *t, Token *end) { + if (end == NULL) return false; + memset(e, 0, sizeof *e); + if (end - t->token == 1) { + /* 1-token expression */ + switch (t->token->kind) { + case TOKEN_NUM_CONST: { + NumConst *num = &t->token->num; + switch (num->kind) { + case NUM_CONST_FLOAT: + e->kind = EXPR_FLOAT_CONST; + e->type.kind = TYPE_BUILTIN; + e->type.builtin = BUILTIN_FLOAT; + e->floatc = num->floatval; + break; + case NUM_CONST_INT: + e->kind = EXPR_INT_CONST; + e->is_flexible_num = true; + e->type.kind = TYPE_BUILTIN; + e->type.builtin = BUILTIN_INT; /* TODO: if it's too big, use a u64 instead. */ + e->floatc = num->intval; + break; + } + } break; + default: + tokr_err(t, "Unrecognized expression."); + return false; + } + t->token = end; + return true; + } + /* TODO */ + tokr_err(t, "multi-token exprs not supported yet."); + return false; +} + +/* +ends_with = which keyword does this expression end with? +if it's KW_RPAREN, this will match parentheses properly. +*/ +typedef enum { + EXPR_END_RPAREN_OR_COMMA, + EXPR_END_SEMICOLON +} ExprEndKind; +static Token *expr_find_end(Tokenizer *t, ExprEndKind ends_with) { + long bracket_level = 0; + Token *token = t->token; + while (1) { + switch (ends_with) { + case EXPR_END_RPAREN_OR_COMMA: + if (token->kind == TOKEN_KW) { + if (token->kw == KW_COMMA && bracket_level == 0) + return token; + if (token->kw == KW_LPAREN) + bracket_level++; + if (token->kw == KW_RPAREN) { + bracket_level--; + if (bracket_level == 0) { + return token; + } + } + } + break; + case EXPR_END_SEMICOLON: + if (token_is_kw(token, KW_SEMICOLON)) + return token; + break; + } + if (token->kind == TOKEN_EOF) { + switch (ends_with) { + case EXPR_END_SEMICOLON: + tokr_err(t, "Could not find ';' at end of expression."); + return NULL; + case EXPR_END_RPAREN_OR_COMMA: + if (bracket_level > 0) { + tokr_err(t, "Mismatched parentheses."); /* FEATURE: Find out where this is */ + return NULL; + } else { + tokr_err(t, "Could not find ')' or ',' at end of expression."); + return NULL; + } + return NULL; + } + } + token++; + } +} + static bool parse_decls(Declarations *ds, Tokenizer *t) { decls_create(ds); while (1) { @@ -43,19 +217,36 @@ static bool parse_decls(Declarations *ds, Tokenizer *t) { tokr_err(t, "Expected ':' in declaration."); return false; } - - /* TODO: type */ - t->token++; - + + if (!token_is_kw(t->token, KW_MINUS) + && !token_is_kw(t->token, KW_EQ) + && !token_is_kw(t->token, KW_SEMICOLON)) { + if (!parse_type(&decl.type, t)) + return false; + } else { + decl.infer_type = true; + } + if (token_is_kw(t->token, KW_SEMICOLON)) { + if (decl.infer_type) { + tokr_err(t, "Cannot infer type without expression."); + return false; + } } else if (token_is_kw(t->token, KW_EQ)) { t->token++; + if (!parse_expr(&decl.expr, t, expr_find_end(t, EXPR_END_SEMICOLON))) + return false; decl.has_expr = true; } else if (token_is_kw(t->token, KW_MINUS)) { t->token++; + if (!parse_expr(&decl.expr, t, expr_find_end(t, EXPR_END_SEMICOLON))) + return false; decl.has_expr = true; decl.is_const = true; + } else { + tokr_err(t, "Expected ';', '=', or '-' in delaration."); + return false; } decls_add(ds, &decl); if (token_is_kw(t->token, KW_SEMICOLON)) { @@ -92,6 +283,13 @@ static bool parse_file(ParsedFile *f, Tokenizer *t) { return ret; } +static void expr_fprint(FILE *out, Expression *e) { + /* TODO */ +/* switch (e->kind) { */ +/* case : */ +/* } */ +} + static void decl_fprint(FILE *out, Declaration *d) { fprintf(out, "l%lu:", (unsigned long)d->where.line); ident_fprint(out, d->var); @@ -1,4 +1,3 @@ -P := ; -Q := ; -R := , foo :; -S :, R :-;
\ No newline at end of file +foo:i8=3; +foo:i8=3; +a:float-4;
\ No newline at end of file diff --git a/tokenizer.c b/tokenizer.c index 447ffbc..e65b2e3 100644 --- a/tokenizer.c +++ b/tokenizer.c @@ -19,31 +19,57 @@ typedef enum { KW_RBRACE, KW_EQEQ, KW_LT, - KW_LE, + KW_LE, KW_MINUS, + KW_INT, + KW_I8, + KW_I16, + KW_I32, + KW_I64, + KW_U8, + KW_U16, + KW_U32, + KW_U64, + KW_FLOAT, + KW_F32, + KW_F64, KW_COUNT } Keyword; -/* OPTIM: Use a trie or just a function if this gets too long */ static const char *keywords[KW_COUNT] = - {";", "=", ":", ",", "fn", "(", ")", "{", "}", "==", "<", "<=", "-"}; + {";", "=", ":", ",", "fn", "(", ")", "{", "}", "==", "<", "<=", "-", + "int", "i8", "i16", "i32", "i64", "u8", "u16", "u32", "u64", "float", "f32", + "f64"}; + +/* Returns KW_COUNT if it's not a keyword */ +/* OPTIM: don't use strncmp so much */ +static Keyword tokenize_keyword(char **s) { + for (Keyword k = 0; k < KW_COUNT; k++) { + size_t len = strlen(keywords[k]); + if (strncmp(*s, keywords[k], len) == 0) { + *s += len; + return k; + } + } + return KW_COUNT; +} #define TOKR_USE_LLONG 1 typedef unsigned long long IntConst; -typedef long double RealConst; /* OPTIM: Switch to double */ +typedef long double FloatConst; /* OPTIM: Switch to double */ typedef enum { NUM_CONST_INT, - NUM_CONST_REAL + NUM_CONST_FLOAT } NumConstKind; typedef struct { NumConstKind kind; union { IntConst intval; - RealConst realval; + FloatConst floatval; }; } NumConst; @@ -79,6 +105,8 @@ typedef struct { Token *token; /* token currently being processed */ } Tokenizer; + + static bool token_is_kw(Token *t, Keyword kw) { return t->kind == TOKEN_KW && t->kw == kw; } @@ -99,8 +127,8 @@ static void token_fprint(FILE *out, Token *t) { case NUM_CONST_INT: fprintf(out, "%llu", t->num.intval); break; - case NUM_CONST_REAL: - fprintf(out, "%g", (double)t->num.realval); + case NUM_CONST_FLOAT: + fprintf(out, "%g", (double)t->num.floatval); break; } break; @@ -245,20 +273,17 @@ static bool tokenize_string(Tokenizer *tokr, char *str) { } if (is_comment) continue; } - Keyword kw; - for (kw = 0; kw < KW_COUNT; kw++) { - if (strncmp(t.s, keywords[kw], strlen(keywords[kw])) == 0) { - break; - } - } - if (kw != KW_COUNT) { - /* it's a keyword */ + { Token token = {0}; - token.kind = TOKEN_KW; - token.kw = kw; - tokr_add(&t, &token); - t.s += (LineNo)strlen(keywords[kw]); - continue; + tokr_put_location(&t, &token); + Keyword kw = tokenize_keyword(&t.s); + if (kw != KW_COUNT) { + /* it's a keyword */ + token.kind = TOKEN_KW; + token.kw = kw; + tokr_add(&t, &token); + continue; + } } /* check if it's a number */ @@ -266,7 +291,7 @@ static bool tokenize_string(Tokenizer *tokr, char *str) { if (isdigit(*t.s)) { /* it's a numeric constant */ int base = 10; - RealConst decimal_pow10; + FloatConst decimal_pow10; NumConst n; n.kind = NUM_CONST_INT; n.intval = 0; @@ -297,7 +322,7 @@ static bool tokenize_string(Tokenizer *tokr, char *str) { while (1) { if (*t.s == '.') { - if (n.kind == NUM_CONST_REAL) { + if (n.kind == NUM_CONST_FLOAT) { tokenization_err(&t, "Double . in number."); goto err; } @@ -305,16 +330,16 @@ static bool tokenize_string(Tokenizer *tokr, char *str) { tokenization_err(&t, "Decimal point in non base 10 number."); goto err; } - n.kind = NUM_CONST_REAL; + n.kind = NUM_CONST_FLOAT; decimal_pow10 = 0.1; - n.realval = (RealConst)n.intval; + n.floatval = (FloatConst)n.intval; tokr_nextchar(&t); continue; } else if (*t.s == 'e') { tokr_nextchar(&t); if (n.kind == NUM_CONST_INT) { - n.kind = NUM_CONST_REAL; - n.realval = (RealConst)n.intval; + n.kind = NUM_CONST_FLOAT; + n.floatval = (FloatConst)n.intval; } /* TODO: check if exceeding maximum exponent */ int exponent = 0; @@ -333,9 +358,9 @@ static bool tokenize_string(Tokenizer *tokr, char *str) { /* OPTIM: Slow for very large exponents (unlikely to happen) */ for (int i = 0; i < exponent; i++) { if (negative_exponent) - n.realval /= 10; + n.floatval /= 10; else - n.realval *= 10; + n.floatval *= 10; } break; @@ -371,8 +396,8 @@ static bool tokenize_string(Tokenizer *tokr, char *str) { n.intval *= (IntConst)base; n.intval += (IntConst)digit; break; - case NUM_CONST_REAL: - n.realval += decimal_pow10 * (RealConst)digit; + case NUM_CONST_FLOAT: + n.floatval += decimal_pow10 * (FloatConst)digit; decimal_pow10 /= 10; break; } @@ -469,7 +494,7 @@ static bool tokenize_string(Tokenizer *tokr, char *str) { Identifier ident = ident_insert(&t.s); token.kind = TOKEN_IDENT; token.ident = ident; - tokr_add(&t, &token); + tokr_add(&t, &token); continue; } tokenization_err(&t, "Token not recognized"); |