From 4cd2b7047c19e45dc2e664bb6666ee1f288b126c Mon Sep 17 00:00:00 2001 From: pommicket Date: Thu, 6 Jan 2022 23:29:59 -0500 Subject: 04b initial readme, guessing game, compiler fixes --- 04b/Makefile | 8 +- 04b/README.md | 240 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 04b/guessing_game | 238 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 04b/in03 | 159 +++++++++++++++++++++++++++--------- 04b/in04b | 73 +++-------------- 5 files changed, 616 insertions(+), 102 deletions(-) create mode 100644 04b/README.md create mode 100644 04b/guessing_game (limited to '04b') diff --git a/04b/Makefile b/04b/Makefile index e6187d4..ef72181 100644 --- a/04b/Makefile +++ b/04b/Makefile @@ -1,7 +1,11 @@ -all: out03 +all: out03 guessing_game.out out04b README.html out03: in03 ../03/out02 ../03/out02 %.html: %.md ../markdown ../markdown $< +out04b: in04b out03 + ./out03 +%.out: % out03 + ./out03 $< $@ clean: - rm -f out* README.html + rm -f out* README.html *.out diff --git a/04b/README.md b/04b/README.md new file mode 100644 index 0000000..f131943 --- /dev/null +++ b/04b/README.md @@ -0,0 +1,240 @@ +# stage 04 + +As usual, the source for this compiler is `in03`, an input to the [previous compiler](../03/README.md). +`in04b` contains a hello world program written in the stage 4 language. +Here is the core of the program: + +``` +main() + +function main + puts(.str_hello_world) + putc(10) ; newline + syscall(0x3c, 0) +``` + +As you can see, we can now pass arguments to functions. And let's take a look at `putc`: + +``` +function putc + argument c + local p + p = &c + syscall(1, 1, p, 1) + return +``` + +It's so simple compared to previous languages! Rather than mess around with registers, we can now +declare local (and global) variables, and use them directly. These variables will be placed on the +stack. Since arguments are also placed on the stack, +by implementing local variables we get arguments for free. There is no difference +between the `local` and `argument` keywords in this language other than spelling. +In fact, the number of agruments to a function call is not checked against +how many arguments the function has. This does make it easy to screw things up by calling a function +with the wrong number of arguments, but it also means that we can provide a variable number of arguments +to the `syscall` function. Speaking of which, if you look at the bottom of `in04b`, you'll see: + +``` +function syscall + ... + byte 0x48 + byte 0x8b + byte 0x85 + byte 0xf0 + byte 0xff + byte 0xff + byte 0xff + ... +``` + +Originally I was going to make `syscall` a built-in feature of the language, but then I realized that wasn't +necessary. +Instead, `syscall` is a function written manually in machine language. +We can take a look at its decompilation to make things clearer: + +``` +mov rax,[rbp-0x10] +mov rdi,rax +mov rax,[rbp-0x18] +mov rsi,rax +mov rax,[rbp-0x20] +mov rdx,rax +mov rax,[rbp-0x28] +mov r10,rax +mov rax,[rbp-0x30] +mov r8,rax +mov rax,[rbp-0x38] +mov r9,rax +mov rax,[rbp-0x8] +syscall +``` + +This just sets `rax`, `rdi`, `rsi`, etc. to the arguments the function was called with, +and then does a syscall. + +## functions and local variables + +In this language, function arguments are placed onto the stack from left to right +and all arguments and local variables are 8 bytes. +As a reminder, +the stack is just an area of memory which is automatically extended downwards (on x86-64, at least). +So, how do we keep track of the location of local variables in the stack? We could do something like +this: + +``` +sub rsp, 24 ; make room for 3 variables +mov [rsp], 10 ; variable1 = 10 +mov [rsp+8], 20 ; variable2 = 20 +mov [rsp+16], 30 ; variable3 = 30 +; ... +add rsp, 24 ; reset rsp +``` + +But now suppose that in the middle of the `; ...` code we want another local variable: +``` +sub rsp, 8 ; make room for another variable +``` +well, since we've changed `rsp`, `variable1` is now at `rsp+8` instead of `rsp`, +`variable2` is at `rsp+16` instead of `rsp+8`, and +`variable3` is at `rsp+24` instead of `rsp+16`. +Also, we had better make sure we increment `rsp` by `32` now instead of `24` +to put it back in the right place. +It would be annoying (but by no means impossible) to keep track of all this. +We could just declare all local variables at the start of the function, +but that makes the language more annoying to use. + +Instead, we can use the `rbp` register to keep track of what `rsp` was +at the start of the function: + +``` +; save old value of rbp +sub rsp, 8 +mov [rsp], rbp +; set rbp to initial value of rsp +mov rbp, rsp + +lea rsp, [rbp-8] ; add variable1 (this instruction sets rsp to rbp-8) +mov [rbp-8], 10 ; variable1 = 10 +lea rsp, [rbp-16] ; add variable2 +mov [rbp-16], 20 ; variable2 = 20 +lea rsp, [rbp-24] ; add variable3 +mov [rbp-24], 30 ; variable3 = 30 +; Note that variable1's address is still rbp-8; adding more variables didn't affect it. +; ... + +; restore old values of rbp and rsp +mov rsp, rbp +mov rbp, [rsp] +add rsp, 8 +``` + +This is actually the intended use of `rbp` (it *p*oints to the *b*ase of the stack frame). +Note that setting `rsp` very specifically rather than just doing `sub rsp, 8` is important: +if we skip over some code with a local variable declaration, or execute a local declaration twice, +we want `rsp` to be in the right place. +The first three and last three instructions above are called the function *prologue* and *epilogue*. +They are all the same for all functions; a prologue is generated at the start of every function, +and an epilogue is generated for every return statement. +The return value is placed in `rax`. + +## global variables + +Global variables are much simpler than local ones. The variable `:static_memory_end` in the compiler +keeps track of where to put the next global variable in memory. It is initialized at address `0x440000`, +which gives us 256KB for code (and strings). When a global variable is added, `:static_memory_end` is increased +by its size. + +## language description + +Comments begin with `;` and may be put at the end of lines +with or without code. +Blank lines are ignored. + +To make the compiler simpler, this language doesn't support fancy +expressions like `2 * (3 + 5) / 6`. There is a limited set of possible +expressions, specifically there are *terms* and *r-values*. + +But first, each program is made up of a series of statements, and +each statement is one of the following: +- `global {name}` or `global {size} {name}` - declare a global variable with the given size, or 8 bytes if none is provided. +- `local {name}` - declare a local variable +- `argument {name}` - declare a function argument. this is functionally equivalent to `local`, so it just exists for readability. +- `function {name}` - declare a function +- `:{name}` - declare a label +- `goto {label}` - jump to the specified label +- `if {term} {operator} {term} goto {label}` - +conditionally jump to the specified label. `{operator}` should be one of +`==`, `<`, `>`, `>=`, `<=`, `!=`, `[`, `]`, `[=`, `]=` +(the last four do unsigned comparisons). +- `{lvalue} = {rvalue}` - set `lvalue` to `rvalue` +- `{lvalue} += {rvalue}` - add `rvalue` to `lvalue` +- `{lvalue} -= {rvalue}` - etc. +- `{lvalue} *= {rvalue}` +- `{lvalue} /= {rvalue}` +- `{lvalue} %= {rvalue}` +- `{lvalue} &= {rvalue}` +- `{lvalue} |= {rvalue}` +- `{lvalue} ^= {rvalue}` +- `{lvalue} <= {rvalue}` - left shift `lvalue` by `rvalue` +- `{lvalue} >= {rvalue}` - right shift `lvalue` by `rvalue` +- `{function}({term}, {term}, ...)` - function call, ignoring the return value +- `return {rvalue}` +- `string {str}` - places a literal string in the code +- `byte {number}` - places a literal byte in the code + +Now let's get down into the weeds: + +A a *number* is one of: +- `{decimal number}` - e.g. `108` (note: there's no `d` prefix anymore) +- `0x{hexadecimal number}` - e.g. `0x2f` for 47 +- `'{character}` - e.g. `'a` for 97 (the character code for `a`) + +A *term* is one of: +- `{variable name}` - the value of a (local or global) variable +- `.{label name}` - the address of a label +- `{number}` + +An *lvalue* is the left-hand side of an assignment expression, +and it is one of: +- `{variable}` +- `*1{variable}` - dereference 1 byte +- `*2{variable}` - dereference 2 bytes +- `*4{variable}` - dereference 4 bytes +- `*8{variable}` - dereference 8 bytes + +An *rvalue* is an expression, which can be more complicated than a term. +rvalues are one of: +- `{term}` +- `&{variable}` - address of variable +- `*1{variable}` / `*2{variable}` / `*4{variable}` / `*8{variable}` - dereference 1, 2, 4, or 8 bytes +- `~{term}` - bitwise not +- `{function}({term}, {term}, ...)` +- `{term} + {term}` +- `{term} - {term}` +- `{term} * {term}` +- `{term} / {term}` +- `{term} % {term}` +- `{term} & {term}` +- `{term} | {term}` +- `{term} ^ {term}` +- `{term} < {term}` - left shift +- `{term} > {term}` - right shift + +That's quite a lot of stuff, and it makes for a pretty powerful +language, all things considered. To test out the language, +in addition to the hello world program, I also wrote a little +guessing game, which you can find in the file `guessing_game`. +It ended up being quite nice to write! + +## limitations + +Variables in this language do not have types. This makes it very easy to make mistakes like +treating numbers as pointers or vice versa. + +A big annoyance with this language is the lack of local label names. Due to the limited nature +of branching in this language (`if ... goto ...` stands in for `if`, `else if`, `while`, etc.), +you need to use a lot of labels, and that means their names can get quite long. But at least unlike +the 03 language, you'll get an error if you use the same label name twice! + +Overall, though, this language ended up being surprisingly powerful. With any luck, the next stage will +finally be a C compiler... diff --git a/04b/guessing_game b/04b/guessing_game new file mode 100644 index 0000000..415851d --- /dev/null +++ b/04b/guessing_game @@ -0,0 +1,238 @@ +global 0x1000 exit_code +global y +y = 4 +exit_code = main() +exit(exit_code) + +function main + local secret_number + local guess + global 32 input_line + local p_line + p_line = &input_line + secret_number = getrand(100) + fputs(1, .str_intro) + + :guess_loop + fputs(1, .str_guess) + syscall(0, 0, p_line, 30) + guess = stoi(p_line) + if guess < secret_number goto too_low + if guess > secret_number goto too_high + fputs(1, .str_got_it) + return 0 + :too_low + fputs(1, .str_too_low) + goto guess_loop + :too_high + fputs(1, .str_too_high) + goto guess_loop + +:str_intro + string I'm thinking of a number. + byte 10 + byte 0 + +:str_guess + string Guess what it is: + byte 32 + byte 0 + +:str_got_it + string You got it! + byte 10 + byte 0 + +:str_too_low + string Too low! + byte 10 + byte 0 + +:str_too_high + string Too high! + byte 10 + byte 0 + +; get a "random" number from 0 to x using the system clock +function getrand + argument x + global 16 getrand_time + local ptime + local n + + ptime = &getrand_time + syscall(228, 1, ptime) + ptime += 8 ; nanoseconds at offset 8 in struct timespec + n = *4ptime + n %= x + return n + +; returns a pointer to a null-terminated string containing the number given +function itos + global 32 itos_string + argument x + local c + local p + p = &itos_string + p += 30 + :itos_loop + c = x % 10 + c += '0 + *1p = c + x /= 10 + if x == 0 goto itos_loop_end + p -= 1 + goto itos_loop + :itos_loop_end + return p + + +; returns the number at the start of the given string +function stoi + argument s + local p + local n + local c + n = 0 + p = s + :stoi_loop + c = *1p + if c < '0 goto stoi_loop_end + if c > '9 goto stoi_loop_end + n *= 10 + n += c - '0 + p += 1 + goto stoi_loop + :stoi_loop_end + return n + + +function strlen + argument s + local c + local p + p = s + :strlen_loop + c = *1p + if c == 0 goto strlen_loop_end + p += 1 + goto strlen_loop + :strlen_loop_end + return p - s + +function fputs + argument fd + argument s + local length + length = strlen(s) + syscall(1, fd, s, length) + return + + +function fputn + argument fd + argument n + local s + s = itos(n) + fputs(fd, s) + return + +function exit + argument status_code + syscall(0x3c, status_code) + +function syscall + ; I've done some testing, and this should be okay even if + ; rbp-56 goes beyond the end of the stack. + ; mov rax, [rbp-16] + byte 0x48 + byte 0x8b + byte 0x85 + byte 0xf0 + byte 0xff + byte 0xff + byte 0xff + ; mov rdi, rax + byte 0x48 + byte 0x89 + byte 0xc7 + + ; mov rax, [rbp-24] + byte 0x48 + byte 0x8b + byte 0x85 + byte 0xe8 + byte 0xff + byte 0xff + byte 0xff + ; mov rsi, rax + byte 0x48 + byte 0x89 + byte 0xc6 + + ; mov rax, [rbp-32] + byte 0x48 + byte 0x8b + byte 0x85 + byte 0xe0 + byte 0xff + byte 0xff + byte 0xff + ; mov rdx, rax + byte 0x48 + byte 0x89 + byte 0xc2 + + ; mov rax, [rbp-40] + byte 0x48 + byte 0x8b + byte 0x85 + byte 0xd8 + byte 0xff + byte 0xff + byte 0xff + ; mov r10, rax + byte 0x49 + byte 0x89 + byte 0xc2 + + ; mov rax, [rbp-48] + byte 0x48 + byte 0x8b + byte 0x85 + byte 0xd0 + byte 0xff + byte 0xff + byte 0xff + ; mov r8, rax + byte 0x49 + byte 0x89 + byte 0xc0 + + ; mov rax, [rbp-56] + byte 0x48 + byte 0x8b + byte 0x85 + byte 0xc8 + byte 0xff + byte 0xff + byte 0xff + ; mov r9, rax + byte 0x49 + byte 0x89 + byte 0xc1 + + ; mov rax, [rbp-8] + byte 0x48 + byte 0x8b + byte 0x85 + byte 0xf8 + byte 0xff + byte 0xff + byte 0xff + + ; syscall + byte 0x0f + byte 0x05 + + return diff --git a/04b/in03 b/04b/in03 index 985ad9b..8fb9ade 100644 --- a/04b/in03 +++ b/04b/in03 @@ -4,28 +4,54 @@ D=:global_variables 8C=D ; initialize static_memory_end C=:static_memory_end -; 0x40000 = 256KB for code -D=x440000 +; 0x80000 = 512KB for code +D=x480000 8C=D ; initialize labels_end C=:labels_end D=:labels 8C=D -; open input file - J=:input_filename +I=8S +A=d2 +?I>A:argv_file_names + ; use default input/output filenames + ; open input file + J=:input_filename + I=d0 + syscall x2 + J=A + ?J<0:input_file_error + ; open output file + J=:output_filename + I=x241 + D=x1ed + syscall x2 + J=A + ?J<0:output_file_error + !:second_pass_starting_point +:argv_file_names + ; open input file + J=S + ; argv[1] is at *(rsp+16) + J+=d16 + J=8J I=d0 syscall x2 J=A ?J<0:input_file_error -; open output file - J=:output_filename + ; open output file + J=S + ; argv[2] is at *(rsp+24) + J+=d24 + J=8J I=x241 D=x1ed syscall x2 J=A ?J<0:output_file_error + :second_pass_starting_point ; write ELF header J=d4 @@ -161,15 +187,16 @@ call :string= D=A ?D!0:handle_if -; set delimiter to newline -C=xa - I=:line J=:"function" call :string= D=A ?D!0:handle_function + +; set delimiter to newline +C=xa + I=:line J=:"return\n" call :string= @@ -203,6 +230,7 @@ I=:line !:call_check_loop :call_check_loop_end +!:bad_statement !:read_line @@ -217,6 +245,7 @@ I=:line J=d4 I=:static_memory_end I=8I + I-=x400000 syscall x4d ; seek both files back to start J=d3 @@ -292,15 +321,6 @@ align !:read_line :handle_local - R=I - - ; emit sub rsp, 8 - J=d4 - I=:sub_rsp_8 - D=d7 - syscall x1 - - I=R ; skip ' ' I+=d1 @@ -333,23 +353,36 @@ align ; update :local_variables_end I=:local_variables_end 8I=J + + ; set rsp appropriately + C=:rbp_offset + J=d0 + J-=D + 4C=J + + J=d4 + I=:lea_rsp_[rbp_offset] + D=d7 + syscall x1 + + ; read the next line !:read_line - -:sub_rsp_8 + +:lea_rsp_[rbp_offset] x48 - x81 - xec - x08 - x00 - x00 - x00 + x8d + xa5 +:rbp_offset + reserve d4 align :global_start reserve d8 :global_variable_name reserve d8 +:global_variable_size + reserve d8 :handle_global ; ignore if this is the second pass C=:second_pass @@ -359,6 +392,27 @@ align ; skip ' ' I+=d1 + C=1I + D='9 + ?C>D:global_default_size + ; read specific size of global + call :read_number + D=A + C=:global_variable_size + 8C=D + ; check and skip space after number + C=1I + D=x20 + ?C!D:bad_number + I+=d1 + !:global_cont + :global_default_size + ; default size = 8 + C=:global_variable_size + D=d8 + 8C=D + :global_cont + ; store away pointer to variable name C=:global_variable_name 8C=I @@ -380,8 +434,11 @@ align C=4D 4J=C J+=d4 - ; increase static_memory_end - C+=d8 + ; increase static_memory_end by size + D=:global_variable_size + D=8D + C+=D + D=:static_memory_end 4D=C ; store null terminator 1J=0 @@ -392,6 +449,12 @@ align !:read_line :handle_function + I=:line + ; length of "function " + I+=d9 + ; make function name a label + call :add_label + ; emit prologue J=d4 I=:function_prologue @@ -450,14 +513,25 @@ align ; total length = 15 bytes :handle_label_definition + I=:line + I+=d1 + call :add_label + !:read_line + +align +:label_name + reserve d8 +; add the label in rsi to the label list (with the current pc address) +:add_label ; ignore if this is the second pass C=:second_pass C=1C - ?C!0:read_line + ?C!0:return_0 + + C=:label_name + 8C=I ; make sure label only has identifier characters - I=:line - I+=d1 :label_checking_loop C=1I D=xa @@ -470,8 +544,8 @@ align !:bad_label :label_checking_loop_end - I=:line - I+=d1 + C=:label_name + I=8C J=:labels call :ident_lookup C=A @@ -479,8 +553,8 @@ align J=:labels_end J=8J - I=:line - I+=d1 + C=:label_name + I=8C call :ident_copy R=J @@ -500,8 +574,7 @@ align C=:labels_end 8C=J - ; read the next line - !:read_line + return :handle_goto J=d4 @@ -2004,6 +2077,15 @@ align xa x0 +:bad_statement + B=:bad_statement_error_message + !:program_error + +:bad_statement_error_message + str Bad statement. + xa + x0 + :bad_jump B=:bad_jump_error_message !:program_error @@ -2205,6 +2287,7 @@ align 1J=D J-=d1 ?I!0:eputn_loop + J+=d1 D=S D-=J I=J @@ -2271,7 +2354,7 @@ align x20 :"function" str function - xa + x20 :"==" str == x20 diff --git a/04b/in04b b/04b/in04b index 5813cc4..2b85900 100644 --- a/04b/in04b +++ b/04b/in04b @@ -1,93 +1,42 @@ -; declaration: -; global -; local -; argument -; :