From b4e22170b4cbe580a1583337817f6a388c0e1f55 Mon Sep 17 00:00:00 2001 From: pommicket Date: Wed, 23 Feb 2022 23:50:49 -0800 Subject: edit readmes --- 00/README.md | 19 ++++++++----------- 01/README.md | 3 ++- 02/README.md | 2 +- 04/README.md | 2 +- 04a/README.md | 2 ++ 05/README.md | 19 +++++++++++-------- 05/main.b | 5 +++-- README.md | 54 ++++++++++++++++++++++++++---------------------------- 8 files changed, 54 insertions(+), 52 deletions(-) diff --git a/00/README.md b/00/README.md index b17060e..3ea31be 100644 --- a/00/README.md +++ b/00/README.md @@ -75,7 +75,7 @@ version of ELF) - `02 00` Object type = executable file (not a dynamic library/etc.) - `3e 00` Architecture x86-64 - `01 00 00 00` Version 1 of ELF, again -- `78 00 40 00 00 00 00 00` **Entry point of the executable** = 0x400078 +- `78 00 40 00 00 00 00 00` Entry point of the executable = 0x400078 - `40 00 00 00 00 00 00 00` Program header table offset in bytes from start of file - `00 00 00 00 00 00 00 00` Section header table offset (we're not using sections) - `00 00 00 00` Flags (not important to us) @@ -194,16 +194,13 @@ similar to our first call, with two important differences: first, we specify `0x241` as the second argument. This tells Linux that we are writing to the file (`O_WRONLY = 0x01`), that we want to create it if it doesn't exist (`O_CREAT = 0x40`), and that we want to delete any previous contents it had -(`O_TRUNC = 0x200`). Secondly, we're setting the third argument this time. It +(`O_TRUNC = 0x200`). Also, we're setting the third argument this time. It specifies the permissions our file is created with (`0o755` means user -read/write/execute, group/other read/execute). This is not very important to -the actual execution of the program, so don't worry if you don't know -about UNIX permissions. +read/write/execute, group/other read/execute). Note that the output file's descriptor will be 4. -Now we can start reading from the file. 
We're going to loop back to this part of -the code every time we want to read a new hexadecimal number from the input -file. +Now we can start reading the input file. We're going to loop back to this part of +the code every time we want to read a new digit pair. - `48 b8 03 00 00 00 00 00 00 00` `mov rax, 3` - `48 89 c7` `mov rdi, rax` @@ -246,7 +243,7 @@ reasons for this which I won't get into here. - `8a 03` `mov al, byte [rbx]` Here we put the ASCII code of the first character read from the file into `rax`. -But now we need to turn the ASCII character code into the actual numerical value +But now we need to turn the ASCII character code into the numerical value of the hex digit. - `48 89 c3` `mov rbx, rax` @@ -255,8 +252,8 @@ of the hex digit. - `0f 8c 0f 00 00 00` `jl 0x400136` This checks if the character code is greater than the character code for the -digit 9, and jumps to a different part of the code if so. This different part of -the code will handle the case of the hex digits `a` through `f`. +digit 9, and jumps to a different part of the code if so. That part +will handle the case of the hex digits `a` through `f`. - `48 b8 d0 ff ff ff ff ff ff ff` `mov rax, -48` diff --git a/01/README.md b/01/README.md index cf4fc63..4489df1 100644 --- a/01/README.md +++ b/01/README.md @@ -36,7 +36,8 @@ separated by semicolons. Any text after the command and before the semicolon is ignored (that's how we get comments), and there has to be a terminating semicolon. -So, for example, the `sy` command outputs a syscall instruction. You can see +For example, the `sy` command outputs a syscall instruction and the +`zA` command sets `rax` to 0. You can see `commands.txt` for a full list. `||` is a very important command. It outputs an ELF header for our executable. 
diff --git a/02/README.md b/02/README.md index 689e76f..9ba5063 100644 --- a/02/README.md +++ b/02/README.md @@ -106,7 +106,7 @@ And instead of figuring out the address of a piece of data, we can just use its ``` im --xy -// rax now points to the data at the label "::xy" +// rax now points to the data at the label ::xy ``` This also lets us compute the length of the hello world string automatically! diff --git a/04/README.md b/04/README.md index 6840080..3e3c292 100644 --- a/04/README.md +++ b/04/README.md @@ -270,5 +270,5 @@ of branching in this language (`if ... goto ...` stands in for `if`, `else if`, you need to use a lot of labels, and that means their names can get quite long. But at least unlike the 03 language, you'll get an error if you use the same label name twice! -Overall, though, this language ended up being surprisingly powerful. With any luck, stage `05` will +Overall, though, this language ended up being surprisingly powerful. In fact, stage `05` will finally be a C compiler... But first, it's time to make [something that's not a compiler](../04a/README.md). diff --git a/04a/README.md b/04a/README.md index 088c649..2874db3 100644 --- a/04a/README.md +++ b/04a/README.md @@ -72,3 +72,5 @@ B ``` Will be preprocessed to `A`, not `10`. + +And with that, it's finally time to write [a C compiler](../05/README.md). diff --git a/05/README.md b/05/README.md index a68cbbb..13af044 100644 --- a/05/README.md +++ b/05/README.md @@ -213,7 +213,8 @@ static unsigned char __syscall_data[] = { }; #define __syscall(no, arg1, arg2, arg3, arg4, arg5, arg6)\ - (((unsigned long (*)(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long))__syscall_data)\ + (((unsigned long (*)(unsigned long, unsigned long, unsigned long,\ + unsigned long, unsigned long, unsigned long, unsigned long))__syscall_data)\ (no, arg1, arg2, arg3, arg4, arg5, arg6)) ``` @@ -296,6 +297,8 @@ Oh wait, tcc uses it. 
Fortunately it's not critically important to tcc. - `mktime()` always fails. - The keywords `signed`, `volatile`, `register`, and `const` are all ignored, but this should almost never have an effect on a legal C program. +- Converting `unsigned long` to `double`/`float` treats the number as signed (this is incorrect +for `unsigned long` values above 2<sup>63</sup>). ## anecdotes @@ -306,7 +309,7 @@ which happened along the way: doesn't have floats turned out to be quite a fun challenge! Not all decimal numbers have a perfect floating point representation. You could round 0.1 up to ~0.1000000000000000056, or down to ~0.0999999999999999917. -This stage's C compiler should be entirely correct, up to rounding (which is all that the +This stage's C compiler should be correct up to rounding (which is all that the C standard requires). But typically C compilers will round to whichever is closest to the decimal value. Implementing this correctly @@ -343,7 +346,7 @@ executable from the gcc one. After spending a long time looking at disassemblies tcc_define_symbol(s, "__linux", NULL); # endif ``` -If the `__linux__` macro is defined (to indicate that the target OS is linux), +If the `__linux__` macro is defined (to indicate that the OS is linux), tcc will also define the `__linux__` macro in any programs it compiles. Unlike gcc, our compiler doesn't define the `__linux__` macro, so when it's used to compile tcc, tcc won't define it either, no matter how many times you compile it @@ -424,7 +427,7 @@ get to glibc: - build & install dash - build & install sed-4.2 - build & install ld, as (from binutils) -- build gcc +- build & install gcc - build & install grep-3.7 - build & install awk - build & install bash @@ -438,10 +441,10 @@ This made broken Makefiles which I spent hours editing by hand -- and is it really compiled from scratch if it's built from computer-generated source files and Makefiles? 
And although the developers at GNU -refrain from declaring variables after statements, and keep old-style function declarations -to support compilers from the 80s; they *still* manage to use gcc-specific extensions, and -not even extensions that all versions of gcc support! +refrain from declaring variables after statements, +they can't help but use bizarre gcc-specific extensions. After hours and hours of fixing compiler errors, I decided to give up. -THIS WAY LIES MADNESS. +I'll just say, as a reminder to my future self, and a warning to anyone +else who wants to compile glibc from scratch: THIS WAY LIES MADNESS. diff --git a/05/main.b b/05/main.b index 8e040d8..a71cce3 100644 --- a/05/main.b +++ b/05/main.b @@ -382,12 +382,13 @@ function main byte 0 :usage_error - puts(.str_usage_error) + putsln(.str_usage_error) exit(1) :str_usage_error string Please either specify no arguments or an input and output file. - + byte 0 + :str_default_input_filename string main.c byte 0 diff --git a/README.md b/README.md index 3a01b3c..22e73f9 100644 --- a/README.md +++ b/README.md @@ -2,12 +2,12 @@ Compilers nowadays are written in languages like C, which themselves need to be compiled. But then, you need a C compiler to compile your C compiler! Of course, -the very first C compiler was not written in C (because how would it be -compiled?). Instead, it was built up over time, starting from a basic -assembler, eventually reaching a full-scale compiler. -In this repository, we'll explore how that's done. Each directory -represents a new "stage" in the process. The first one, `00`, is a hand-written -executable, and the last one will be a C compiler. Each directory has its own +the very first C compiler was not written in C. +First, people made assemblers, then simple programming languages, +then, eventually, it was possible to make a C compiler. +In this repository, we'll explore how that's done. Each directory here +is a "stage" in the process. 
The first one, `00`, is a hand-written +executable, and the last one, `05`, is a C compiler. Each directory has its own README explaining what's going on. You can run `bootstrap.sh` to run through and test every stage. @@ -33,25 +33,24 @@ command codes. If you want to follow along with this series, you'll probably want to know about: -- what a system call is -- what memory is -- what a compiler is -- what an executable file is - number bases -- if a number is preceded by 0x, 0o, or 0b in this series, that means hexadecimal/octal/binary respectively. So 0xff = FF hexadecimal = 255 decimal. -- what a CPU is -- what a CPU architecture is -- what a CPU register is -- what the (call) stack is - bits, bytes, kilobytes, etc. - bitwise operations (not, or, and, xor, left shift, right shift) - 2's complement - ASCII, null-terminated strings - how pointers work - how floating-point numbers work +- what a compiler is +- what an executable file is +- what a system call is +- what a CPU is +- what a CPU architecture is +- what a CPU register is +- what the (call) stack is -If you're unfamiliar with x86-64 assembly, you should check out the instruction list below. +If you're unfamiliar with x86-64 assembly, you should take a look at the instruction list below. ## principles @@ -60,13 +59,13 @@ If you're unfamiliar with x86-64 assembly, you should check out the instruction Bootstrapping a compiler is not an easy task, so we're trying to make it as easy as possible. We don't even necessarily need a standard-compliant C compiler, we only need enough to compile someone else's C compiler. Specifically, we'll be -using [TCC](https://bellard.org/tcc/) since it's written (mostly) in standard C89. +using [tcc](https://bellard.org/tcc/) since it's written (mostly) in C89. - efficiency is not a concern We will create big and slow executables, and that's okay. 
It doesn't really -matter if compiling TCC takes 30 as opposed to 0.01 seconds; once the process -is finished, we'll get the same executable either way. +matter if compiling TCC takes 30 as opposed to 0.01 seconds; once +we compile it with itself, we should get the same executable either way. ## reflections on trusting trust @@ -77,7 +76,7 @@ it's possible to create a malicious C compiler which will replicate its own malicious functionalities (e.g. detecting password-checking routines to make them also accept another password the attacker knows) when used to compile other C compilers. For all we know, such a compiler was used to -compile GCC, say, and so all programs around today could be compromised. Of +compile gcc, say, and so all programs around today could be compromised. Of course, this is practically definitely not the case, but it's still an interesting experiment to try to create a fully trustable compiler. This project can't necessarily even do that though, because the Linux kernel, which @@ -143,8 +142,7 @@ ax bx cx dx sp bp si di │ mov al, [rbx] │ 8a 03 │ load 1 byte from address rbx into al │ │ mov rax, [rbp+IMM32] │ 48 8b 85 IMM32 │ load 8 bytes from address rbp+IMM32 │ │ │ │ into rax (note: IMM32 may be negative) │ -│ mov rax, [rsp+IMM32] │ 48 8b 84 24 IMM32 │ load 8 bytes from address rsp+IMM32 │ -│ │ │ into rax (note: IMM32 may be negative) │ +│ mov rax, [rsp+IMM32] │ 48 8b 84 24 IMM32 │ load 8 bytes from rsp+IMM32 into rax │ │ mov [rbp+IMM32], rax │ 48 89 85 IMM32 │ store rax in 8 bytes at rbp+IMM32 │ │ mov [rsp+IMM32], rax │ 48 89 84 24 IMM32 │ store rax in 8 bytes at rsp+IMM32 │ │ mov [rsp], rbp │ 48 89 2c 24 │ store rbp in 8 bytes at rsp │ @@ -161,19 +159,19 @@ ax bx cx dx sp bp si di │ imul rbx │ 48 f7 eb │ set rdx:rax to rax * rbx (signed) │ │ cqo │ 48 99 │ sign-extend rax to rdx:rax | │ idiv rbx │ 48 f7 fb │ divide rdx:rax by rbx (signed); put │ -│ │ │ quotient in rax, remainder in rbx │ +│ │ │ quotient in rax, remainder in rdx │ │ mul rbx │ 
48 f7 e3 │ like imul, but unsigned │ -│ div rbx │ 48 f7 f3 │ like idiv, but with unsigned division │ +│ div rbx │ 48 f7 f3 │ like idiv, but unsigned │ │ not rax │ 48 f7 d0 │ set rax to ~rax (bitwise not) │ │ and rax, rbx │ 48 21 d8 │ set rax to rax & rbx (bitwise and) │ │ or rax, rbx │ 48 09 d8 │ set rax to rax | rbx (bitwise or) │ │ xor rax, rbx │ 48 31 d8 │ set rax to rax ^ rbx (bitwise xor) │ │ shl rax, cl │ 48 d3 e0 │ set rax to rax << cl (left shift) │ │ shl rax, IMM8 │ 48 c1 e0 IMM8 │ set rax to rax << IMM8 │ -│ shr rax, cl │ 48 d3 e8 │ set rax to rax >> cl (zero-extend) │ -│ shr rax, IMM8 │ 48 c1 e8 IMM8 │ set rax to rax >> IMM8 (zero-extend) │ -│ sar rax, cl │ 48 d3 f8 │ set rax to rax >> cl (sign-extend) │ -│ sar rax, IMM8 │ 48 c1 f8 IMM8 │ set rax to rax >> IMM8 (sign-extend) │ +│ shr rax, cl │ 48 d3 e8 │ set rax to rax >> cl (unsigned) │ +│ shr rax, IMM8 │ 48 c1 e8 IMM8 │ set rax to rax >> IMM8 (unsigned) │ +│ sar rax, cl │ 48 d3 f8 │ set rax to rax >> cl (signed) │ +│ sar rax, IMM8 │ 48 c1 f8 IMM8 │ set rax to rax >> IMM8 (signed) │ │ sub rsp, IMM32 │ 48 81 ec IMM32 │ subtract IMM32 from rsp │ │ add rsp, IMM32 │ 48 81 c4 IMM32 │ add IMM32 to rsp │ │ cmp rax, rbx │ 48 39 d8 │ compare rax with rbx (see je, jl, etc.)│ @@ -226,7 +224,7 @@ The return value is placed in rax. ## license -This does not apply to TCC's or musl's source code. +This does not apply to tcc's or musl's source code. ``` This project is in the public domain. Any copyright protections from any law -- cgit v1.2.3