From 3d44eba388cadfb7e1bf5dd481f6dc94d147df4b Mon Sep 17 00:00:00 2001
From: pommicket
Date: Wed, 9 Feb 2022 22:44:27 -0500
Subject: start codegen

---
 05/codegen.b   | 195 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 05/constants.b |   7 ++-
 05/main.b      |   4 ++
 05/main.c      |   5 +-
 4 files changed, 207 insertions(+), 4 deletions(-)
 create mode 100644 05/codegen.b

(limited to '05')

diff --git a/05/codegen.b b/05/codegen.b
new file mode 100644
index 0000000..1507e6d
--- /dev/null
+++ b/05/codegen.b
@@ -0,0 +1,195 @@
+; CALLING CONVENTION:
+;  arguments are pushed onto the stack by the caller, from right to left
+;  caller must also reserve space on stack for return value
+;  so the function puts the return value at [rbp+8] (+8 for stored return address)
+; NOTE(review): generate_function's prologue stores the old rbp at [rbp], so the return address is at [rbp+8]; the return value slot is presumably at [rbp+16] or higher -- confirm
+
+global code_output ; write cursor: address where the next emitted byte(s) will be stored
+global codegen_second_pass ; = 0 on first global pass, 1 on second global pass
+global functions_addresses ; ident list of addresses
+global functions_labels ; ident list of ident lists of label addresses
+global curr_function_labels ; ident list of labels for current function (written to in 1st pass, read from in 2nd pass)
+
+; register numbers as encoded in x86-64 ModRM bytes (see emit_mov_reg)
+#define REG_RAX 0
+#define REG_RBX 3
+#define REG_RCX 1
+#define REG_RDX 2
+#define REG_RSP 4
+#define REG_RBP 5
+#define REG_RSI 6
+#define REG_RDI 7
+
+function emit_byte
+	argument byte
+	*1code_output = byte ; store 1 byte at the output cursor
+	code_output += 1
+	return
+
+function emit_bytes
+	argument bytes
+	argument count
+	memcpy(code_output, bytes, count)
+	code_output += count ; advance the cursor past the bytes just copied
+	return
+
+function emit_word
+	argument word
+	*2code_output = word ; store 2 bytes at the output cursor
+	code_output += 2
+	return
+
+function emit_dword
+	argument word
+	*4code_output = word ; store 4 bytes (the argument is named word, but 4 bytes are written)
+	code_output += 4
+	return
+
+function emit_qword
+	argument word
+	*8code_output = word ; store 8 bytes (the argument is named word, but 8 bytes are written)
+	code_output += 8
+	return
+
+; e.g. 
emit_mov_reg(REG_RAX, REG_RBX)  emits  mov rax, rbx
+function emit_mov_reg
+	argument dest
+	argument src
+	local n
+
+	;48 89 (DEST|SRC<<3|0xc0) = REX.W prefix, opcode 89 (mov r/m64, r64), ModRM with mod=11, reg=SRC, rm=DEST
+	*2code_output = 0x8948 ; stored little-endian, i.e. the bytes 48 89
+	code_output += 2
+	n = 0xc0 | dest
+	n |= src < 3 ; presumably '<' is this language's left shift, matching SRC<<3 above -- confirm
+	*1code_output = n
+	code_output += 1
+	return
+
+function emit_sub_rsp_imm32
+	argument imm32
+	;48 81 ec IMM32 = sub rsp, imm32
+	*2code_output = 0x8148
+	code_output += 2
+	*1code_output = 0xec
+	code_output += 1
+	*4code_output = imm32
+	code_output += 4
+	return
+
+function emit_mov_qword_rsp_rbp
+	; 48 89 2c 24 = mov qword [rsp], rbp
+	*4code_output = 0x242c8948
+	code_output += 4
+	return
+
+function emit_mov_rbp_qword_rsp
+	; 48 8b 2c 24 = mov rbp, qword [rsp]
+	*4code_output = 0x242c8b48
+	code_output += 4
+	return
+
+function emit_add_rsp_imm32
+	argument imm32
+	;48 81 c4 IMM32 = add rsp, imm32
+	*2code_output = 0x8148
+	code_output += 2
+	*1code_output = 0xc4
+	code_output += 1
+	*4code_output = imm32
+	code_output += 4
+	return
+
+function emit_ret
+	*1code_output = 0xc3 ; c3 = ret
+	code_output += 1
+	return
+
+; make sure you put the return value in the proper place before calling this (emits: mov rsp, rbp / mov rbp, [rsp] / add rsp, 8 / ret)
+function generate_return
+	emit_mov_reg(REG_RSP, REG_RBP)
+	emit_mov_rbp_qword_rsp()
+	emit_add_rsp_imm32(8)
+	emit_ret()
+	return
+
+function generate_statement
+	argument statement
+	; @TODO: not implemented yet; nothing is emitted for the statement
+	return
+
+function generate_function
+	argument function_name
+	argument function_statement
+	local out0
+	; NOTE(review): out0 is not used anywhere in this function
+	if codegen_second_pass != 0 goto genf_second_pass
+		curr_function_labels = ident_list_create(4000) ; ~ 200 labels per function should be plenty
+		ident_list_add(functions_labels, function_name, curr_function_labels)
+		goto genf_cont
+	:genf_second_pass
+		curr_function_labels = ident_list_lookup(functions_labels, function_name) ; reuse the list registered under this name in the first pass
+	:genf_cont
+
+	; prologue: sub rsp, 8 / mov [rsp], rbp / mov rbp, rsp
+	emit_sub_rsp_imm32(8)
+	emit_mov_qword_rsp_rbp()
+	emit_mov_reg(REG_RBP, REG_RSP)
+
+	generate_statement(function_statement)
+
+	; implicit return at end of function
+	generate_return()
+
+	return
+
+; walk function_statements, generating each function; record its address on the 1st pass, verify it on the 2nd
+function generate_functions
+	local addr
+	local c
+	local p
+	local function_name
+
+	function_name = function_statements ; start at the beginning of the list
+
+	:genfunctions_loop
+		if *1function_name == 0 goto genfunctions_loop_end
+		addr = code_output - output_file_data ; address of this function (offset from the start of the output file data)
+		if codegen_second_pass != 0 goto genfs_check_addr
+			; first pass; record address of function
+			ident_list_add(functions_addresses, function_name, addr)
+			goto genfs_cont
+		:genfs_check_addr
+			c = ident_list_lookup(functions_addresses, function_name)
+			if c != addr goto function_addr_mismatch
+			goto genfs_cont
+		:genfs_cont
+		p = memchr(function_name, 0) ; locate the 0 byte that ends the name
+		p += 1 ; p = address just past the name; NOTE(review): this address, not the 8 bytes stored there, is passed below -- confirm
+		generate_function(function_name, p)
+		function_name = p + 8 ; step over the 8 bytes after the name to reach the next entry
+		goto genfunctions_loop
+	:genfunctions_loop_end
+	return
+
+	:function_addr_mismatch
+		; address of function on 2nd pass doesn't line up with 1st pass
+		fputs(2, .str_function_addr_mismatch)
+		fputs(2, function_name)
+		exit(1)
+	:str_function_addr_mismatch
+		string Function address on first pass doesn't match 2nd pass:
+		byte 32
+		byte 0
+
+function generate_code
+	local p_func
+	code_output = output_file_data + FUNCTIONS_ADDR ; pass 1: start emitting at the functions region
+	codegen_second_pass = 0
+	generate_functions()
+	code_output = output_file_data + FUNCTIONS_ADDR ; pass 2: rewind to the same start and generate again
+	codegen_second_pass = 1
+	generate_functions()
+	; generate code at the entry point of the executable
+	; @TODO (nothing is emitted for the entry point yet; NOTE(review): p_func above is unused so far)
+	return
diff --git a/05/constants.b b/05/constants.b
index 68e3777..b719375 100644
--- a/05/constants.b
+++ b/05/constants.b
@@ -1,10 +1,13 @@
 ; this is the format of the executables we produce:
-; elf header         4MB  addresses 0x000000-0x400000  (no, it won't actually take up that much space)
-; code               4MB  addresses 0x400000-0x7fffff
+; elf header         2MB  addresses 0x000000-0x1fffff  (no, it won't actually take up that much space)
+; entry point        2MB  addresses 0x200000-0x3fffff  this is where we put the code to call main(), etc. 
(again, it won't actually take up that much space)
+; code (functions)   4MB  addresses 0x400000-0x7fffff
 ; read-only data     4MB  addresses 0x800000-0xbfffff
 ; read-write data    4MB  addresses 0xc00000-0xffffff
 ; note that file offsets and runtime addresses are the same.
 ; you should be able to change these constants without breaking anything:
+#define ENTRY_ADDR     0x200000
+#define FUNCTIONS_ADDR 0x400000
 #define RODATA_ADDR    0x800000
 #define RWDATA_ADDR    0xc00000
 #define RWDATA_END     0x1000000
diff --git a/05/main.b b/05/main.b
index aa20095..b94dc45 100644
--- a/05/main.b
+++ b/05/main.b
@@ -81,6 +81,7 @@ global function_param_has_no_name
 #include preprocess.b
 #include tokenize.b
 #include parse.b
+#include codegen.b
 
 function types_init
 	argument _types
@@ -235,6 +236,8 @@ function main
 	structure_locations = ident_list_create(2000000)
 	global_variables = ident_list_create(400000)
 	function_statements = ident_list_create(800000)
+	functions_addresses = ident_list_create(800000)
+	functions_labels = ident_list_create(800000)
 	function_types = ident_list_create(800000)
 	function_stmt_data = malloc(800000) ; should be at least 40 bytes * max # of functions
 
@@ -285,6 +288,7 @@ function main
 
 	; NOTE: do NOT free pptokens; identifiers still reference them.
 	parse_tokens(tokens)
+	generate_code()
 
 	p = output_file_data + RODATA_ADDR
 	munmap(output_file_data, RWDATA_END)
diff --git a/05/main.c b/05/main.c
index b7e07be..56af630 100644
--- a/05/main.c
+++ b/05/main.c
@@ -1,5 +1,3 @@
-#include "tests/parse_stb_truetype.h"
-
 /*
 ; @NONSTANDARD:
 ; the following does not work:
@@ -14,3 +12,6 @@ This needs to be fixed because otherwise you can't do:
 struct A { struct B *blah; }
 struct B { struct A *blah; }
 */
+
+int main(void) { /* stub: the body contains no statements */
+}
-- 
cgit v1.2.3