summaryrefslogtreecommitdiff
path: root/05
diff options
context:
space:
mode:
authorpommicket <pommicket@gmail.com>2022-02-03 22:53:38 -0500
committerpommicket <pommicket@gmail.com>2022-02-03 22:53:38 -0500
commitd1167f03d03c2a6ab75fce410706e3098dfd3090 (patch)
tree30a3361c6ffdfaa626672d431771264a91674b34 /05
parentfd02968c23739e4289c9b675c5d50d8b1f51153d (diff)
start parsing statements (not a lot yet)
Diffstat (limited to '05')
-rw-r--r--05/constants.b46
-rw-r--r--05/main.b34
-rw-r--r--05/main.c115
-rw-r--r--05/parse.b130
4 files changed, 260 insertions, 65 deletions
diff --git a/05/constants.b b/05/constants.b
index 983bc2a..6a0bba1 100644
--- a/05/constants.b
+++ b/05/constants.b
@@ -230,6 +230,52 @@
; types will be initialized (in main) so that this refers to the type char*
#define TYPE_POINTER_TO_CHAR 20
+; STATEMENTS
+; In C, note that `if', `while', etc. always have a single statement as their body:
+; if (x) { y; z; w; }
+; here {y; z; w;} is a single `compound' statement containing three statements.
+; our statements don't directly correspond to the C89 standard's notion of statements; in particular,
+; labels count as separate statements and declarations count as statements.
+; each statement is stored as exactly 40 bytes
+; uchar type
+; uchar padding
+; ushort file
+; uint line
+; ulong data1
+; ulong data2
+; ulong data3
+; ulong data4
+; a type of 0 indicates the end of the block.
+; data layout for particular statements:
+; - STATEMENT_EXPRESSION - data1 is a pointer to expression data; data2,3,4 are unused
+; - STATEMENT_LOCAL_DECLARATION - declaring a local variable (automatic/"register" storage duration), data1 = total bytes used by all local variables so far in this function including this one; data2,3,4 unused
+; - STATEMENT_LABEL - data1 is a pointer to the name of the label; data2,3,4 are unused
+; - STATEMENT_BLOCK - data1 is a pointer to an array of statements; data2,3,4 are unused
+; - STATEMENT_IF - data1 is a pointer to the condition, data2 is a pointer to the `if' branch statement, data3 is a pointer to the `else' branch statement, or 0 if there is none; data4 is unused
+; - STATEMENT_SWITCH - data1 is a pointer to the expression, data2 is a pointer to the body statement; data3,4 are unused
+; - STATEMENT_WHILE - data1 is a pointer to the condition, data2 is a pointer to the body statement; data3,4 are unused
+; - STATEMENT_DO - data1 is a pointer to the body statement, data2 is a pointer to the condition; data3,4 are unused
+; - STATEMENT_FOR - data1,2,3 are pointers to the first, second, and third expressions inside parentheses, data4 is a pointer to the body statement
+; - STATEMENT_GOTO - data1 is a pointer to the name of the label; data2,3,4 are unused
+; - STATEMENT_CONTINUE - data1,2,3,4 are unused
+; - STATEMENT_BREAK - data1,2,3,4 are unused
+; - STATEMENT_RETURN - data1 is a pointer to the expression, or 0 if there is none; data2,3,4 are unused
+; statement type codes; one of these is stored in the one-byte `type` field at the start of each statement.
+; no statement type is assigned the value 0, since a type of 0 marks the end of a block.
+#define STATEMENT_EXPRESSION 1
+#define STATEMENT_LOCAL_DECLARATION 2
+#define STATEMENT_LABEL 3
+#define STATEMENT_BLOCK 4
+#define STATEMENT_IF 5
+#define STATEMENT_SWITCH 6
+#define STATEMENT_WHILE 7
+#define STATEMENT_DO 8
+#define STATEMENT_FOR 9
+#define STATEMENT_GOTO 0xa
+#define STATEMENT_CONTINUE 0xb
+#define STATEMENT_BREAK 0xc
+#define STATEMENT_RETURN 0xd
+
+
+
:keyword_table
byte SYMBOL_SEMICOLON
byte 59
diff --git a/05/main.b b/05/main.b
index 8a6ceb0..381ad59 100644
--- a/05/main.b
+++ b/05/main.b
@@ -42,6 +42,16 @@ global output_file_data
; ident list of global variables. each one is stored as
; (type << 32) | address
global global_variables
+; ident list of functions. each entry is a pointer to a single statement - which should always be a STATEMENT_BLOCK
+global function_statements
+; statement_datas[0] = pointer to statement data for block-nesting depth 0 (i.e. function bodies)
+; statement_datas[1] = pointer to statement data for block-nesting depth 1 (blocks inside functions)
+; statement_datas[2] = pointer to statement data for block-nesting depth 2 (blocks inside blocks inside functions)
+; etc. up to statement_datas[15] "* 15 nesting levels of compound statements, iteration control structures, and selection control structures" C89 § 2.2.4.1
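+; each depth needs its own buffer: presumably because a block's statements are stored as one contiguous array
+; (terminated by a type of 0) and a nested block is parsed in the middle of its parent, so sharing a buffer would interleave them.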
+global statement_datas
+global statement_datas_ends
+global parse_stmt_depth
#include util.b
#include idents.b
@@ -154,15 +164,32 @@ function main
local tokens
local ast
local p
+ local q
local i
local output_fd
+ ; allocate the two per-depth pointer tables; the loop below fills in 16 entries of 8 bytes each
+ statement_datas = malloc(4000)
+ statement_datas_ends = malloc(4000)
+ p = statement_datas
+ q = statement_datas_ends
+ i = 0
+ :statement_datas_loop
+ *8p = malloc(4000000) ; supports 100,000 statements at each level
+ ; the end pointer for this depth starts at the beginning of this depth's buffer.
+ ; copy the buffer pointer itself (*8p), not the address of its table slot (p):
+ ; parse_statement writes statements through *statement_datas_ends[depth], so storing p
+ ; would make it write over the statement_datas table instead of the buffer.
+ *8q = *8p
+ p += 8
+ q += 8
+ i += 1
+ if i < 16 goto statement_datas_loop
+
fill_in_powers_of_10()
typedefs = ident_list_create(100000)
enumerators = ident_list_create(4000000)
structures = ident_list_create(4000000)
- global_variables = ident_list_create(4000000)
+ global_variables = ident_list_create(400000)
+ function_statements = ident_list_create(400000)
+
+ function_stmt_data = malloc(800000) ; should be at least 40 bytes * max # of functions
dat_banned_objmacros = 255
dat_banned_fmacros = 255
@@ -197,14 +224,15 @@ function main
translation_phase_4(input_filename, pptokens, processed_pptokens)
free(pptokens)
pptokens = processed_pptokens
- print_pptokens(pptokens)
- print_separator()
+ ;print_pptokens(pptokens)
+ ;print_separator()
;print_object_macros()
;print_function_macros()
tokens = malloc(16000000)
p = tokenize(pptokens, tokens, input_filename, 1)
print_tokens(tokens, p)
+ print_separator()
; NOTE: do NOT free pptokens; identifiers still reference them.
parse_tokens(tokens)
diff --git a/05/main.c b/05/main.c
index 204da87..c5d1b72 100644
--- a/05/main.c
+++ b/05/main.c
@@ -1,59 +1,64 @@
-typedef struct {
- int i[41];
- long double d;
-} (*x___)(void);
+/* test input for statement parsing: a function whose body is three labels followed by an empty statement */
+int f(void) {
+blah:blah:blah:;
+}
-typedef enum X {
- R,S,T
- } *Foo[sizeof(unsigned long)];
-typedef int A___[T];
-typedef struct A {
- int x, y;
- long double c;
- unsigned long d;
- char e[3];
- long f;
-} A;
-
-typedef union B{
- int x;
- struct {
- int y;
- struct {long z; } c;
- } c;
-}B;
-
-typedef int QQQ[sizeof(A)+sizeof"hello"];
-typedef int RRR[sizeof(struct B)];
-
-static unsigned int x={55};
-static char *s = "hello";
-static char *t = "goodbye";
-static char u[8] = "hellothe";
-static char v[100] = "re my";
-static char w[] = "friendly";
-static char x_[] = "hi";
-typedef int A_[sizeof x_ + sizeof u];
-
-static int a[5] = {1,2,3};
-static char b[6][7] = {{'a'},{'b'},{'c'},{'d'},{'e'}};
-static char __b[][7] = {{'a'},"hello",'r'};
-static int _u = sizeof __b;
-
-struct {
- int a;
- long b;
-} x1[] = {0x1234567890, 1ul<<60|1ul<<3, 77};
-int y1 = 0x12345678;
-
-struct {
- int x[2], y;
-} test[] = {3, 5,0x1234,0x4321};
-typedef int Blah[sizeof((B *)0)->c.y];
-unsigned marker = 0xdeadbeef;
-
-typedef int (*FUNCTION)(void);
-typedef int AAAA[sizeof*****((FUNCTION)0)];
+/* typedef struct { */
+/* int i[41]; */
+/* long double d; */
+/* } (*x___)(void); */
+/* */
+/* typedef enum X { */
+/* R,S,T */
+/* } *Foo[sizeof(unsigned long)]; */
+/* typedef int A___[T]; */
+/* */
+/* typedef struct A { */
+/* int x, y; */
+/* long double c; */
+/* unsigned long d; */
+/* char e[3]; */
+/* long f; */
+/* } A; */
+/* */
+/* typedef union B{ */
+/* int x; */
+/* struct { */
+/* int y; */
+/* struct {long z; } c; */
+/* } c; */
+/* }B; */
+/* */
+/* typedef int QQQ[sizeof(A)+sizeof"hello"]; */
+/* typedef int RRR[sizeof(struct B)]; */
+/* */
+/* static unsigned int x={55}; */
+/* static char *s = "hello"; */
+/* static char *t = "goodbye"; */
+/* static char u[8] = "hellothe"; */
+/* static char v[100] = "re my"; */
+/* static char w[] = "friendly"; */
+/* static char x_[] = "hi"; */
+/* typedef int A_[sizeof x_ + sizeof u]; */
+/* */
+/* static int a[5] = {1,2,3}; */
+/* static char b[6][7] = {{'a'},{'b'},{'c'},{'d'},{'e'}}; */
+/* static char __b[][7] = {{'a'},"hello",'r'}; */
+/* static int _u = sizeof __b; */
+/* */
+/* struct { */
+/* int a; */
+/* long b; */
+/* } x1[] = {0x1234567890, 1ul<<60|1ul<<3, 77}; */
+/* int y1 = 0x12345678; */
+/* */
+/* struct { */
+/* int x[2], y; */
+/* } test[] = {3, 5,0x1234,0x4321}; */
+/* typedef int Blah[sizeof((B *)0)->c.y]; */
+/* unsigned marker = 0xdeadbeef; */
+/* */
+/* typedef int (*FUNCTION)(void); */
+/* typedef int AAAA[sizeof*****((FUNCTION)0)]; */
/* typedef int X[sizeof(int)+4]; */
diff --git a/05/parse.b b/05/parse.b
index 661a257..18765b8 100644
--- a/05/parse.b
+++ b/05/parse.b
@@ -35,7 +35,7 @@ function structure_is_union
if offset == 0 goto return_1 ; if that's 0, it's a union or 1-element struct
goto return_0
-
+; parse a translation unit
function parse_tokens
argument tokens
local token
@@ -44,6 +44,7 @@ function parse_tokens
local p
local b
local c
+ local n
local base_type
local base_type_end
local name
@@ -52,6 +53,7 @@ function parse_tokens
local suffix
local suffix_end
local is_extern
+ local out
token = tokens
:parse_tokens_loop
@@ -64,7 +66,7 @@ function parse_tokens
b = token_is_type(token)
if b != 0 goto parse_toplevel_decl
- die(.str_bad_statement)
+ token_error(token, .str_bad_statement)
:str_bad_statement
string Bad statement.
byte 0
@@ -164,12 +166,26 @@ function parse_tokens
byte 0
:parse_function_definition
p = types + type
- ; @NOTE: remember to turn array members into pointers
+ ; @TODO: parameters
+ ; @NOTE: remember to turn array members into pointers
if *1p != TYPE_FUNCTION goto lbrace_after_declaration
- die(.str_fdNI) ; @TODO
- :str_fdNI
- string function definitions not implemented.
- byte 10
+
+ global function_stmt_data ; initialized in main
+ global function_stmt_data_bytes_used
+
+ n = function_stmt_data_bytes_used
+ out = function_stmt_data + function_stmt_data_bytes_used
+ parse_statement(&token, &out)
+ if parse_stmt_depth != 0 goto stmtdepth_internal_err
+ function_stmt_data_bytes_used = out - function_stmt_data
+
+ ident_list_add(function_statements, name, n)
+ goto parse_tokens_loop
+
+ :stmtdepth_internal_err
+ token_error(token, .str_stmtdepth_internal_err)
+ :str_stmtdepth_internal_err
+ string Internal compiler error: parse_stmt_depth is not 0 after parsing function body.
byte 0
:lbrace_after_declaration
token_error(token, .str_lbrace_after_declaration)
@@ -240,6 +256,106 @@ function parse_tokens
:parse_tokens_eof
return
+; write type, file, and line info for statement
+; arguments:
+;   out   - pointer to the start of the statement being written
+;   type  - one of the STATEMENT_* constants; stored in the first byte of the statement
+;   token - pointer to the token the statement starts at; its file and line are copied
+; bytes 2-3 of the token are copied to bytes 2-3 of the statement (ushort file), and
+; bytes 4-7 of the token to bytes 4-7 of the statement (uint line).
+; byte 1 of the statement (padding) is not written. always returns 0.
+function write_statement_header
+ ; these three values are passed in by the callers in parse_statement, so they are
+ ; declared with `argument` (as parse_statement declares its own parameters) rather than `local`
+ argument out
+ argument type
+ argument token
+ *1out = type
+ out += 2
+ token += 2
+ ; copy the 2-byte file field
+ *2out = *2token
+ out += 2
+ token += 2
+ ; copy the 4-byte line field
+ *4out = *4token
+ return 0
+
+; writes statement data for the statement at *p_token to (*)*p_out
+; arguments:
+;   p_token - pointer to the current token pointer; on return it is advanced past the statement
+;   p_out   - pointer to the output pointer; on return it is advanced past whatever was written
+; every statement written is exactly 40 bytes long.
+; NOTE(review): as written, each label prefix emits its own 40-byte STATEMENT_LABEL, a block emits one
+; 40-byte STATEMENT_BLOCK, and an empty statement emits nothing, so *p_out does not always advance by exactly 40 bytes.
+; only labels, empty statements, and blocks are handled here; anything else is reported as an error.
+function parse_statement
+ argument p_token
+ argument p_out
+ local out
+ local token
+ local p
+ local c
+ ; NOTE(review): n is declared but never used in this function
+ local n
+
+
+ ; work on local copies; they are written back at parse_statement_ret
+ out = *8p_out
+ token = *8p_token
+
+ :stmt_label_loop
+ ; if second token in statement is a colon, this must be a label
+ ; (tokens are stepped over 16 bytes at a time, so token + 16 is the next token)
+ p = token + 16
+ if *1p == SYMBOL_COLON goto stmt_label
+ goto stmt_label_loop_end
+
+ :stmt_label
+ ; emit one STATEMENT_LABEL: 8-byte header, then data1 = the 8 bytes at offset 8 of the identifier token
+ write_statement_header(out, STATEMENT_LABEL, token)
+ out += 8
+ token += 8
+ *8out = *8token ; copy label name
+ ; 8 + 32 = 40 bytes of output; data2,3,4 are left unwritten
+ out += 32
+ token += 24 ; skip ident name, and colon
+ ; loop so that several labels in a row (a: b: c:) are all consumed
+ goto stmt_label_loop
+ :stmt_label_loop_end
+
+ ; dispatch on the type byte of the first token after any labels
+ c = *1token
+ if c == SYMBOL_SEMICOLON goto stmt_empty
+ if c == SYMBOL_LBRACE goto stmt_block
+
+ ; NOTE(review): token_error presumably does not return, since string data follows the call directly - confirm
+ token_error(token, .str_unrecognized_statement)
+ :str_unrecognized_statement
+ string Unrecognized statement.
+ byte 0
+ :parse_statement_ret
+ ; common exit: publish the advanced token and output pointers to the caller
+ *8p_token = token
+ *8p_out = out
+ return
+ :stmt_block
+ local block_p_out
+ ; find the appropriate statement data to use for this block's body
+ ; block_p_out ends up pointing at the statement_datas_ends entry for the current nesting depth.
+ ; NOTE(review): `< 3` is presumably a left shift here (depth * 8, one 8-byte entry per depth) - confirm against the language's operator definitions
+ block_p_out = statement_datas_ends
+ block_p_out += parse_stmt_depth < 3
+
+ ; emit the STATEMENT_BLOCK itself: data1 = current end of this depth's statement data,
+ ; which is where the statements of this block's body are about to be written
+ write_statement_header(out, STATEMENT_BLOCK, token)
+ out += 8
+ *8out = *8block_p_out
+ out += 32
+
+ parse_stmt_depth += 1
+ if parse_stmt_depth >= 16 goto too_much_nesting
+
+ token += 16 ; skip opening {
+ :parse_block_loop
+ if *1token == TOKEN_EOF goto parse_block_eof
+ if *1token == SYMBOL_RBRACE goto parse_block_loop_end
+ ; recursive call: the body statement is appended through block_p_out, i.e. to this depth's statement data
+ parse_statement(&token, block_p_out)
+ goto parse_block_loop
+ :parse_block_loop_end
+ token += 16 ; skip closing }
+ ; terminate the block's statement array with a type byte of 0, then move the end pointer
+ ; past the terminator so the next block at this depth starts after it
+ p = *8block_p_out
+ *1p = 0 ; probably redundant, but whatever
+ *8block_p_out += 8 ; add 8 and not 1 because of alignment
+ parse_stmt_depth -= 1
+ goto parse_statement_ret
+
+ :parse_block_eof
+ ; *8p_token has not been updated yet, so the error is reported at the token where this statement began
+ token_error(*8p_token, .str_parse_block_eof)
+ :str_parse_block_eof
+ string End of file reached while trying to parse block. Are you missing a closing brace?
+ byte 0
+ :too_much_nesting
+ token_error(token, .str_too_much_nesting)
+ :str_too_much_nesting
+ string Too many levels of nesting blocks.
+ byte 0
+ :stmt_empty
+ ; empty statement, e.g. while(something)-> ; <-
+ ; nothing is written to out for an empty statement
+ token += 16 ; skip semicolon
+ goto parse_statement_ret
+
; parse a global variable's initializer
; e.g. int x[5] = {1+8, 2, 3, 4, 5};
; advances *p_token to the token right after the initializer