From d1167f03d03c2a6ab75fce410706e3098dfd3090 Mon Sep 17 00:00:00 2001 From: pommicket Date: Thu, 3 Feb 2022 22:53:38 -0500 Subject: start parsing statements (not a lot yet) --- 05/constants.b | 46 ++++++++++++++++++++ 05/main.b | 34 +++++++++++++-- 05/main.c | 115 ++++++++++++++++++++++++++------------------------ 05/parse.b | 130 +++++++++++++++++++++++++++++++++++++++++++++++++++++---- 4 files changed, 260 insertions(+), 65 deletions(-) diff --git a/05/constants.b b/05/constants.b index 983bc2a..6a0bba1 100644 --- a/05/constants.b +++ b/05/constants.b @@ -230,6 +230,52 @@ ; types willl be initialized (in main) so that this refers to the type char* #define TYPE_POINTER_TO_CHAR 20 +; STATEMENTS +; In C, note that `if', `while', etc. always have a single statement as their body: +; if (x) { y; z; w; } +; here {y; z; w;} is a single `compound' statement containing three statements. +; our statements don't directly correspond to the C89 standard's notion of statements, in particular, +; labels count as separate statements and declarations count as statements. +; each statement is stored as exactly 40 bytes +; uchar type +; uchar padding +; ushort file +; uint line +; ulong data1 +; ulong data2 +; ulong data3 +; ulong data4 +; a type of 0 indicates the end of the block. 
+; data layout for particular statements: +; - STATEMENT_EXPRESSION - data1 is a pointer to expression data; data2,3,4 are unused +; - STATEMENT_LOCAL_DECLARATION - declaring a local variable (automatic/"register" storage duration), data1 = total bytes used by all local variables so far in this function including this one; data2,3,4 unused +; - STATEMENT_LABEL - data1 is a pointer to the name of the label; data2,3,4 are unused +; - STATEMENT_BLOCK - data1 is a pointer to an array of statements; data2,3,4 are unused +; - STATEMENT_IF - data1 is a pointer to the condition, data2 is a pointer to the `if' branch statement, data3 is a pointer to the `else' branch statement, or 0 if there is none; data4 is unused +; - STATEMENT_SWITCH - data1 is a pointer to the expression, data2 is a pointer to the body statement; data3,4 are unused +; - STATEMENT_WHILE - data1 is a pointer to the condition, data2 is a pointer to the body statement; data3,4 are unused +; - STATEMENT_DO - data1 is a pointer to the body statement, data2 is a pointer to the condition; data3,4 are unused +; - STATEMENT_FOR - data1,2,3 are pointers to the first, second, and third expressions inside parentheses, data4 is a pointer to the body statement +; - STATEMENT_GOTO - data1 is a pointer to the name of the label; data2,3,4 are unused +; - STATEMENT_CONTINUE - data1,2,3,4 are unused +; - STATEMENT_BREAK - data1,2,3,4 are unused +; - STATEMENT_RETURN - data1 is a pointer to the expression, or 0 if there is none; data2,3,4 are unused +#define STATEMENT_EXPRESSION 1 +#define STATEMENT_LOCAL_DECLARATION 2 +#define STATEMENT_LABEL 3 +#define STATEMENT_BLOCK 4 +#define STATEMENT_IF 5 +#define STATEMENT_SWITCH 6 +#define STATEMENT_WHILE 7 +#define STATEMENT_DO 8 +#define STATEMENT_FOR 9 +#define STATEMENT_GOTO 0xa +#define STATEMENT_CONTINUE 0xb +#define STATEMENT_BREAK 0xc +#define STATEMENT_RETURN 0xd + + + :keyword_table byte SYMBOL_SEMICOLON byte 59 diff --git a/05/main.b b/05/main.b index 8a6ceb0..381ad59 
100644 --- a/05/main.b +++ b/05/main.b @@ -42,6 +42,16 @@ global output_file_data ; ident list of global variables. each one is stored as ; (type << 32) | address global global_variables +; ident list of functions. each entry is a pointer to a single statement - which should always be a STATEMENT_BLOCK +global function_statements +; statement_datas[0] = pointer to statement data for block-nesting depth 0 (i.e. function bodies) +; statement_datas[1] = pointer to statement data for block-nesting depth 1 (blocks inside functions) +; statement_datas[2] = pointer to statement data for block-nesting depth 2 (blocks inside blocks inside functions) +; etc. up to statement_datas[15] "* 15 nesting levels of compound statements, iteration control structures, and selection control structures" C89 § 2.2.4.1 +; these have to be separated for reasons™ +global statement_datas +global statement_datas_ends ; statement_datas_ends[i] = where the next statement at depth i gets written (starts out equal to statement_datas[i]) +global parse_stmt_depth #include util.b #include idents.b @@ -154,15 +164,32 @@ function main local tokens local ast local p + local q local i local output_fd + statement_datas = malloc(4000) + statement_datas_ends = malloc(4000) + p = statement_datas ; p walks the slots of statement_datas + q = statement_datas_ends ; q walks the matching slots of statement_datas_ends + i = 0 + :statement_datas_loop + *8p = malloc(4000000) ; supports 100,000 statements at each level + *8q = *8p ; the end pointer must start at the buffer itself, not at the address of the statement_datas slot (parse_statement writes statements through it) + p += 8 + q += 8 + i += 1 + if i < 16 goto statement_datas_loop + fill_in_powers_of_10() typedefs = ident_list_create(100000) enumerators = ident_list_create(4000000) structures = ident_list_create(4000000) - global_variables = ident_list_create(4000000) + global_variables = ident_list_create(400000) + function_statements = ident_list_create(400000) + + function_stmt_data = malloc(800000) ; should be at least 40 bytes * max # of functions dat_banned_objmacros = 255 dat_banned_fmacros = 255 @@ -197,14 +224,15 @@ function main translation_phase_4(input_filename, pptokens, processed_pptokens) free(pptokens) pptokens = processed_pptokens - print_pptokens(pptokens) - print_separator() + 
;print_pptokens(pptokens) + ;print_separator() ;print_object_macros() ;print_function_macros() tokens = malloc(16000000) p = tokenize(pptokens, tokens, input_filename, 1) print_tokens(tokens, p) + print_separator() ; NOTE: do NOT free pptokens; identifiers still reference them. parse_tokens(tokens) diff --git a/05/main.c b/05/main.c index 204da87..c5d1b72 100644 --- a/05/main.c +++ b/05/main.c @@ -1,59 +1,64 @@ -typedef struct { - int i[41]; - long double d; -} (*x___)(void); +int f(void) { +blah:blah:blah:; +} -typedef enum X { - R,S,T - } *Foo[sizeof(unsigned long)]; -typedef int A___[T]; -typedef struct A { - int x, y; - long double c; - unsigned long d; - char e[3]; - long f; -} A; - -typedef union B{ - int x; - struct { - int y; - struct {long z; } c; - } c; -}B; - -typedef int QQQ[sizeof(A)+sizeof"hello"]; -typedef int RRR[sizeof(struct B)]; - -static unsigned int x={55}; -static char *s = "hello"; -static char *t = "goodbye"; -static char u[8] = "hellothe"; -static char v[100] = "re my"; -static char w[] = "friendly"; -static char x_[] = "hi"; -typedef int A_[sizeof x_ + sizeof u]; - -static int a[5] = {1,2,3}; -static char b[6][7] = {{'a'},{'b'},{'c'},{'d'},{'e'}}; -static char __b[][7] = {{'a'},"hello",'r'}; -static int _u = sizeof __b; - -struct { - int a; - long b; -} x1[] = {0x1234567890, 1ul<<60|1ul<<3, 77}; -int y1 = 0x12345678; - -struct { - int x[2], y; -} test[] = {3, 5,0x1234,0x4321}; -typedef int Blah[sizeof((B *)0)->c.y]; -unsigned marker = 0xdeadbeef; - -typedef int (*FUNCTION)(void); -typedef int AAAA[sizeof*****((FUNCTION)0)]; +/* typedef struct { */ +/* int i[41]; */ +/* long double d; */ +/* } (*x___)(void); */ +/* */ +/* typedef enum X { */ +/* R,S,T */ +/* } *Foo[sizeof(unsigned long)]; */ +/* typedef int A___[T]; */ +/* */ +/* typedef struct A { */ +/* int x, y; */ +/* long double c; */ +/* unsigned long d; */ +/* char e[3]; */ +/* long f; */ +/* } A; */ +/* */ +/* typedef union B{ */ +/* int x; */ +/* struct { */ +/* int y; */ +/* 
struct {long z; } c; */ +/* } c; */ +/* }B; */ +/* */ +/* typedef int QQQ[sizeof(A)+sizeof"hello"]; */ +/* typedef int RRR[sizeof(struct B)]; */ +/* */ +/* static unsigned int x={55}; */ +/* static char *s = "hello"; */ +/* static char *t = "goodbye"; */ +/* static char u[8] = "hellothe"; */ +/* static char v[100] = "re my"; */ +/* static char w[] = "friendly"; */ +/* static char x_[] = "hi"; */ +/* typedef int A_[sizeof x_ + sizeof u]; */ +/* */ +/* static int a[5] = {1,2,3}; */ +/* static char b[6][7] = {{'a'},{'b'},{'c'},{'d'},{'e'}}; */ +/* static char __b[][7] = {{'a'},"hello",'r'}; */ +/* static int _u = sizeof __b; */ +/* */ +/* struct { */ +/* int a; */ +/* long b; */ +/* } x1[] = {0x1234567890, 1ul<<60|1ul<<3, 77}; */ +/* int y1 = 0x12345678; */ +/* */ +/* struct { */ +/* int x[2], y; */ +/* } test[] = {3, 5,0x1234,0x4321}; */ +/* typedef int Blah[sizeof((B *)0)->c.y]; */ +/* unsigned marker = 0xdeadbeef; */ +/* */ +/* typedef int (*FUNCTION)(void); */ +/* typedef int AAAA[sizeof*****((FUNCTION)0)]; */ /* typedef int X[sizeof(int)+4]; */ diff --git a/05/parse.b b/05/parse.b index 661a257..18765b8 100644 --- a/05/parse.b +++ b/05/parse.b @@ -35,7 +35,7 @@ function structure_is_union if offset == 0 goto return_1 ; if that's 0, it's a union or 1-element struct goto return_0 - +; parse a translation unit function parse_tokens argument tokens local token @@ -44,6 +44,7 @@ function parse_tokens local p local b local c + local n local base_type local base_type_end local name @@ -52,6 +53,7 @@ function parse_tokens local suffix local suffix_end local is_extern + local out token = tokens :parse_tokens_loop @@ -64,7 +66,7 @@ function parse_tokens b = token_is_type(token) if b != 0 goto parse_toplevel_decl - die(.str_bad_statement) + token_error(token, .str_bad_statement) :str_bad_statement string Bad statement. 
byte 0 @@ -164,12 +166,26 @@ function parse_tokens byte 0 :parse_function_definition p = types + type - ; @NOTE: remember to turn array members into pointers + ; @TODO: parameters + ; @NOTE: remember to turn array members into pointers if *1p != TYPE_FUNCTION goto lbrace_after_declaration - die(.str_fdNI) ; @TODO - :str_fdNI - string function definitions not implemented. - byte 10 + + global function_stmt_data ; initialized in main + global function_stmt_data_bytes_used + + n = function_stmt_data + function_stmt_data_bytes_used ; n = pointer to the statement record about to be written for this function's body + out = n + parse_statement(&token, &out) + if parse_stmt_depth != 0 goto stmtdepth_internal_err + function_stmt_data_bytes_used = out - function_stmt_data + + ident_list_add(function_statements, name, n) ; function_statements entries are pointers to the body statement, not byte offsets + goto parse_tokens_loop + + :stmtdepth_internal_err + token_error(token, .str_stmtdepth_internal_err) + :str_stmtdepth_internal_err + string Internal compiler error: parse_stmt_depth is not 0 after parsing function body. byte 0 :lbrace_after_declaration token_error(token, .str_lbrace_after_declaration) @@ -240,6 +256,106 @@ function parse_tokens :parse_tokens_eof return +; write type, file, and line info for statement (the first 8 bytes at out); file and line are copied from the same offsets in token +function write_statement_header + argument out + argument type + argument token + *1out = type + out += 2 ; skip the type byte and the padding byte + token += 2 + *2out = *2token ; copy file + out += 2 + token += 2 + *4out = *4token ; copy line + return 0 + +; writes statement data for the statement at *p_token to (*)*p_out +; advances *p_out by 40 bytes (the length of a statement) for each statement written: one per label, and none for an empty statement. 
+function parse_statement + argument p_token + argument p_out + local out + local token + local p + local c + local n + + + out = *8p_out ; out = where the next 40-byte statement record is written; stored back to *p_out on return + token = *8p_token ; token = first token of the statement; stored back to *p_token on return + + :stmt_label_loop + ; if second token in statement is a colon, this must be a label + p = token + 16 ; tokens are 16 bytes each (see the 16-byte skips below), so p is the second token + if *1p == SYMBOL_COLON goto stmt_label + goto stmt_label_loop_end + + :stmt_label + write_statement_header(out, STATEMENT_LABEL, token) ; writes the type, file and line fields at out + out += 8 ; out now points at data1 + token += 8 ; token now points at the 8 bytes of the identifier token that get copied as the label name + *8out = *8token ; copy label name + out += 32 ; together with the 8 above this is 40 bytes, so out is at the next statement slot + token += 24 ; skip ident name, and colon + goto stmt_label_loop + :stmt_label_loop_end + + c = *1token ; first byte of the first token after any labels, compared against token types below + if c == SYMBOL_SEMICOLON goto stmt_empty + if c == SYMBOL_LBRACE goto stmt_block + + token_error(token, .str_unrecognized_statement) + :str_unrecognized_statement + string Unrecognized statement. + byte 0 + :parse_statement_ret + *8p_token = token ; hand the updated token position back to the caller + *8p_out = out ; hand the updated output position back to the caller + return + :stmt_block + local block_p_out + ; find the appropriate statement data to use for this block's body + block_p_out = statement_datas_ends ; block_p_out will point at the end-pointer slot for the current depth + block_p_out += parse_stmt_depth < 3 ; NOTE(review): presumably < is a left shift here (8 bytes per slot); confirm against the language reference + + write_statement_header(out, STATEMENT_BLOCK, token) ; writes the type, file and line fields at out + out += 8 ; out now points at data1 + *8out = *8block_p_out ; data1 = current end of that depth's statement data, which is where this block's statements will start + out += 32 ; out is now at the next statement slot (40 bytes past the start of this one) + + parse_stmt_depth += 1 ; statements inside the block are one level deeper + if parse_stmt_depth >= 16 goto too_much_nesting + + token += 16 ; skip opening { + :parse_block_loop + if *1token == TOKEN_EOF goto parse_block_eof + if *1token == SYMBOL_RBRACE goto parse_block_loop_end + parse_statement(&token, block_p_out) ; parses one statement of the body, writing through (and advancing) the shared per-depth end pointer + goto parse_block_loop + :parse_block_loop_end + token += 16 ; skip closing } + p = *8block_p_out ; p = slot just past the block's last statement + *1p = 0 ; a type of 0 marks the end of the block; probably redundant, but whatever + *8block_p_out += 8 ; add 8 and not 1 because of alignment + parse_stmt_depth -= 1 ; back to the enclosing depth + goto parse_statement_ret + + :parse_block_eof + token_error(*8p_token, .str_parse_block_eof) ; *8p_token has not been updated yet, so the error is given the token this statement started at + :str_parse_block_eof + string End of file reached while trying to parse block. Are you missing a closing brace? + byte 0 + :too_much_nesting + token_error(token, .str_too_much_nesting) + :str_too_much_nesting + string Too many levels of nesting blocks. 
+ byte 0 + :stmt_empty + ; empty statement, e.g. while(something)-> ; <- + token += 16 ; skip semicolon; nothing is written to out for an empty statement + goto parse_statement_ret + ; parse a global variable's initializer ; e.g. int x[5] = {1+8, 2, 3, 4, 5}; ; advances *p_token to the token right after the initializer -- cgit v1.2.3