6 files changed, 470 insertions, 25 deletions
diff --git a/05/constants.b b/05/constants.b
index b3d23f5..80a7a0a 100644
--- a/05/constants.b
+++ b/05/constants.b
@@ -77,6 +77,7 @@
 #define TOKEN_CONSTANT_INT 3
 #define TOKEN_CONSTANT_CHAR 4
 #define TOKEN_STRING_LITERAL 5
+#define TOKEN_EOF 6
 
 ; these are stored in the "info" field of the token
 #define NUMBER_NO_SUFFIX 0
@@ -118,6 +119,104 @@
 #define KEYWORD_STATIC 51
 #define KEYWORD_WHILE 52
 
+; the format of expression headers is:
+;    uchar kind  (one of the constants below)
+;    uchar info
+;    ushort (padding)
+;    uint type (0 if expression hasn't been typed yet)
+; immediately following the header in memory are the arguments of the expression
+;    - for constant ints, the 64-bit integral value
+;    - for constant floats, the 64-bit double value (even if expression has type float)
+;    - for string literals, a 64-bit pointer to the string (for the executable, not for the compiler)
+;    - for unary operators, the operand
+;    - for binary operators, the first operand followed by the second
+;        - for the operators . and ->, the first operand is an expression and the second is just a pointer to the name of the member
+;    - for the ternary operator ? :, the first followed by the second followed by the third
+;    - for function calls, the function, followed by each of the arguments to the function — info indicates the number of arguments
+; Note that file/line number are not stored in expressions.
+#define EXPRESSION_IDENTIFIER 200
+#define EXPRESSION_CONSTANT_INT 201
+#define EXPRESSION_CONSTANT_FLOAT 202
+#define EXPRESSION_STRING_LITERAL 203
+#define EXPRESSION_SUBSCRIPT 204
+#define EXPRESSION_CALL 205
+#define EXPRESSION_DOT 206
+#define EXPRESSION_ARROW 207
+#define EXPRESSION_POST_INCREMENT 208
+#define EXPRESSION_POST_DECREMENT 209
+#define EXPRESSION_PRE_INCREMENT 210
+#define EXPRESSION_PRE_DECREMENT 211
+#define EXPRESSION_ADDRESS_OF 212
+#define EXPRESSION_DEREFERENCE 213
+; this matters for promotion. if x is a char, sizeof(+x) should be sizeof(int)
+#define EXPRESSION_UNARY_PLUS 214
+#define EXPRESSION_UNARY_MINUS 215
+#define EXPRESSION_BITWISE_NOT 216
+#define EXPRESSION_NOT 217
+#define EXPRESSION_SIZEOF 218
+#define EXPRESSION_CAST 219
+#define EXPRESSION_MUL 220
+#define EXPRESSION_DIV 221
+#define EXPRESSION_REMAINDER 222
+#define EXPRESSION_ADD 223
+#define EXPRESSION_SUB 224
+#define EXPRESSION_LSHIFT 225
+#define EXPRESSION_RSHIFT 226
+#define EXPRESSION_LT 227
+#define EXPRESSION_GT 228
+#define EXPRESSION_LEQ 229
+#define EXPRESSION_GEQ 230
+#define EXPRESSION_EQ 231
+#define EXPRESSION_NEQ 232
+#define EXPRESSION_BITWISE_AND 233
+#define EXPRESSION_BITWISE_XOR 234
+#define EXPRESSION_BITWISE_OR 235
+#define EXPRESSION_AND 236
+#define EXPRESSION_OR 237
+; e.g. x == 5 ? 6 : 7
+#define EXPRESSION_CONDITIONAL 238
+#define EXPRESSION_ASSIGN 239
+#define EXPRESSION_ASSIGN_ADD 240
+#define EXPRESSION_ASSIGN_SUB 241
+#define EXPRESSION_ASSIGN_MUL 242
+#define EXPRESSION_ASSIGN_DIV 243
+#define EXPRESSION_ASSIGN_REMAINDER 244
+#define EXPRESSION_ASSIGN_LSHIFT 245
+#define EXPRESSION_ASSIGN_RSHIFT 246
+#define EXPRESSION_ASSIGN_AND 247
+#define EXPRESSION_ASSIGN_XOR 248
+#define EXPRESSION_ASSIGN_OR 249
+#define EXPRESSION_COMMA 250
+
+; TYPES: A type is a 4-byte index into the global array `types`. Byte 0 in `types`
+; is reserved, and bytes 1-16 contain the values 1-16. Thus TYPE_INT, etc.
+; can be used as types directly.
+; The format of each type is as follows:
+;  char, unsigned char, etc.: TYPE_CHAR, TYPE_UNSIGNED_CHAR, etc. as a single byte
+;  pointer to type t: TYPE_PTR t
+;  array of n t's: TYPE_ARRAY {n as 8 bytes} t
+;  struct/union: TYPE_STRUCT/TYPE_UNION {0 for incomplete types/4-byte pointer to struct/union}
+; NOTE: we just treat function pointers as pointers to the function return type.
+#define TYPE_VOID 1
+#define TYPE_CHAR 3
+#define TYPE_UNSIGNED_CHAR 4
+#define TYPE_SHORT 5
+#define TYPE_UNSIGNED_SHORT 6
+#define TYPE_INT 7
+#define TYPE_UNSIGNED_INT 8
+#define TYPE_LONG 9
+#define TYPE_UNSIGNED_LONG 10
+#define TYPE_FLOAT 11
+; note that long double is treated the same as double.
+#define TYPE_DOUBLE 12
+#define TYPE_POINTER 13
+#define TYPE_STRUCT 14
+#define TYPE_UNION 15
+#define TYPE_ARRAY 16
+
+; types will be initialized (in main) so that this refers to the type char*
+#define TYPE_POINTER_TO_CHAR 20
+
 :keyword_table
 	byte SYMBOL_SEMICOLON
 	byte 59
@@ -538,3 +637,42 @@
 :str___STDC__
 	string __STDC__
 	byte 0
+:str_void
+	string void
+	byte 0
+:str_char
+	string char
+	byte 0
+:str_unsigned_char
+	string unsigned char
+	byte 0
+:str_short
+	string short
+	byte 0
+:str_unsigned_short
+	string unsigned short
+	byte 0
+:str_int
+	string int
+	byte 0
+:str_unsigned_int
+	string unsigned int
+	byte 0
+:str_long
+	string long
+	byte 0
+:str_unsigned_long
+	string unsigned long
+	byte 0
+:str_float
+	string float
+	byte 0
+:str_double
+	string double
+	byte 0
+:str_struct
+	string struct
+	byte 0
+:str_union
+	string union
+	byte 0
diff --git a/05/main.b b/05/main.b
index f59eb4e..7f6eaa1 100644
--- a/05/main.b
+++ b/05/main.b
@@ -41,6 +41,18 @@ function compile_error
 	fputc(2, 10)
 	exit(1)
 
+function token_error
+	argument token
+	argument message
+	local file
+	local line
+	local tok_field
+	tok_field = token + 4
+	line = *4tok_field
+	tok_field = token + 2
+	file = *2tok_field
+	compile_error(file, line, message) ; report message at the file and line recorded in the token
+
 ; accepts EITHER file index OR pointer to filename
 function compile_warning
 	argument file
@@ -71,10 +83,15 @@ function compile_warning
 ;          10^i = significand * 2^exponent
 global powers_of_10
 
+global types
+global types_end
+
 #include util.b
 #include constants.b
 #include preprocess.b
 #include tokenize.b
+#include parse.b
+
 
 function main
 	argument argv2
@@ -86,7 +103,9 @@ function main
 	local pptokens
 	local processed_pptokens
 	local tokens
-	
+	local ast
+	local p
+	local i
 	fill_in_powers_of_10()
 	
 	dat_banned_objmacros = 255
@@ -97,6 +116,23 @@ function main
 	object_macros = malloc(4000000)
 	function_macros = malloc(4000000)
 	
+	types = malloc(16000000)
+	i = 0
+	p = types
+	:fill_initial_types_loop
+		*1p = i
+		p += 1
+		i += 1
+		if i <= 16 goto fill_initial_types_loop ; bytes 0-16 of types hold their own index, so TYPE_INT etc. can be used as types directly
+	p = types + TYPE_POINTER_TO_CHAR
+	*1p = TYPE_POINTER
+	p += 1
+	*1p = TYPE_CHAR
+	p += 1
+	; types_end points just past the char* entry (presumably new types get appended at types_end, so it must not point at the TYPE_CHAR byte)
+	types_end = p
+	 
+	
 	input_filename = .str_default_input_filename
 	output_filename = .str_default_output_filename
 	if argc == 1 goto have_filenames
@@ -104,6 +140,9 @@ function main
 	input_filename = argv1
 	output_filename = argv2
 	:have_filenames
+	output_fd = open_w(output_filename)
+	rodata_end_offset = RODATA_OFFSET
+	
 	pptokens = split_into_preprocessing_tokens(input_filename)
 	;print_pptokens(pptokens)
 	;print_separator()
@@ -116,14 +155,17 @@ function main
 	;print_object_macros()
 	;print_function_macros()
 	
-	output_fd = open_w(output_filename)
-	rodata_end_offset = RODATA_OFFSET
-	
 	tokens = malloc(16000000)
-	tokenize(pptokens, tokens)
+	p = tokenize(pptokens, tokens, input_filename, 1)
 	print_tokens(tokens)
 	; NOTE: do NOT free pptokens as identifiers still reference them.
 	
+	ast = malloc(56000000)
+	p -= 16
+	parse_expression(tokens, p, ast)
+	print_expression(ast)
+	putc(10)
+	
 	exit(0)
 
 :usage_error
diff --git a/05/main.c b/05/main.c
index 663661d..67fe32d 100644
--- a/05/main.c
+++ b/05/main.c
@@ -1,5 +1 @@
-"Hello ther" "e good fellow."
-char * = "How are you"" d""o""i""ng today?\n";
-hi
-_TEST _ING _1
-5e+307
+'a'
diff --git a/05/parse.b b/05/parse.b
new file mode 100644
index 0000000..7502882
--- /dev/null
+++ b/05/parse.b
@@ -0,0 +1,216 @@
+function parse_expression
+	argument tokens
+	argument tokens_end
+	argument out
+	local in
+	local a
+	local b
+	local c
+	local p
+	local value
+	; parse the tokens in [tokens, tokens_end) as an expression, writing it (header + arguments) at out. returns a pointer to the end of what was written.
+	if tokens == tokens_end goto empty_expression
+	p = tokens + 16
+	if p == tokens_end goto single_token_expression ; tokens are 16 bytes each
+	; only single-token expressions (constants and string literals) are supported so far; anything else is reported as an error
+	goto unrecognized_expression
+	
+	:single_token_expression
+		in = tokens
+		c = *1in
+		if c == TOKEN_CONSTANT_INT goto expression_integer
+		if c == TOKEN_CONSTANT_CHAR goto expression_integer ; character constants are basically the same as integer constants
+		if c == TOKEN_CONSTANT_FLOAT goto expression_float
+		if c == TOKEN_STRING_LITERAL goto expression_string_literal
+		goto unrecognized_expression
+	
+	:expression_integer
+		*1out = EXPRESSION_CONSTANT_INT
+		p = in + 8
+		value = *8p
+		p = out + 8
+		*8p = value
+		
+		p = in + 1
+		a = int_suffix_to_type(*1p) ; what the suffix says the type should be
+		b = int_value_to_type(value) ; what the value says the type should be (if the value is too large to fit in int)
+		a = max_signed(a, b) ; take the maximum of the two types
+		; NOTE(review): meant to force an unsigned type when there is a u suffix, but since b is always int, long or unsigned long this mask looks like a no-op (a u suffix on a value needing more than 31 bits yields long) - confirm
+		a &= b | 0xfe
+		p = out + 4
+		*4p = a
+		in += 16
+		out += 16
+		return out
+	
+	:expression_float
+		*1out = EXPRESSION_CONSTANT_FLOAT
+		p = in + 8
+		value = *8p
+		p = out + 8
+		*8p = value
+		
+		p = in + 1
+		a = float_suffix_to_type(*1p) ; float for an f suffix, double otherwise
+		
+		p = out + 4
+		*4p = a
+		
+		in += 16
+		out += 16
+		return out
+		
+	:expression_string_literal
+		*1out = EXPRESSION_STRING_LITERAL
+		p = in + 8
+		value = *8p
+		p = out + 8
+		*8p = value
+		
+		; we already know this is char*
+		p = out + 4
+		*4p = TYPE_POINTER_TO_CHAR
+		
+		in += 16
+		out += 16
+		return out
+	
+	
+	:empty_expression
+		token_error(tokens, .str_empty_expression)
+	:str_empty_expression
+		string Empty expression.
+		byte 0
+	:unrecognized_expression
+		token_error(tokens, .str_unrecognized_expression)
+	:str_unrecognized_expression
+		string Unrecognized expression.
+		byte 0
+; shared return stubs: the *_to_type functions below jump to these labels to return the corresponding TYPE_ constant
+:return_type_int
+	return TYPE_INT
+:return_type_long
+	return TYPE_LONG
+:return_type_unsigned_int
+	return TYPE_UNSIGNED_INT
+:return_type_unsigned_long
+	return TYPE_UNSIGNED_LONG
+:return_type_float
+	return TYPE_FLOAT
+:return_type_double
+	return TYPE_DOUBLE
+	
+function int_suffix_to_type
+	argument suffix
+	if suffix == NUMBER_SUFFIX_UL goto return_type_unsigned_long ; ul suffix: unsigned long
+	if suffix == NUMBER_SUFFIX_U goto return_type_unsigned_int ; u suffix: unsigned int
+	if suffix == NUMBER_SUFFIX_L goto return_type_long ; l suffix: long
+	goto return_type_int
+
+function float_suffix_to_type
+	argument suffix
+	if suffix == NUMBER_SUFFIX_F goto return_type_float ; f suffix gives float; every other suffix value gives double
+	goto return_type_double
+
+; smallest integer type which can fit this value, only using unsigned if necessary (the candidates are int, long and unsigned long)
+function int_value_to_type
+	argument value
+	if value [ 0x80000000 goto return_type_int ; below 2^31 (presumably [ is an unsigned less-than - confirm)
+	if value [ 0x8000000000000000 goto return_type_long ; below 2^63
+	goto return_type_unsigned_long
+
+function print_expression
+	argument expression
+	local expr_kind
+	local p
+	; prints the expression's type in parentheses, followed by its value
+	expr_kind = *1expression
+	p = expression + 4
+	putc(40) ; open parenthesis
+	print_type(*4p)
+	putc(41) ; close parenthesis
+	p += 4
+	
+	if expr_kind == EXPRESSION_STRING_LITERAL goto print_expr_str
+	if expr_kind == EXPRESSION_CONSTANT_FLOAT goto print_expr_float
+	if expr_kind == EXPRESSION_CONSTANT_INT goto print_expr_int
+	; no other expression kind can be printed yet (0xcc is presumably an x86 breakpoint byte - confirm)
+	byte 0xcc
+	:print_expr_int
+		putn(*8p) ; the 64-bit integer value
+		return
+	:print_expr_float
+		putx64(*8p) ; the bits of the double, in hex
+		return
+	:print_expr_str
+		putc('0)
+		putc('x)
+		putx32(*8p) ; the string's address, in hex
+		return
+; print the type whose encoding starts at index `type` in the global `types` array.
+; NOTE: to make things easier, the format which this outputs isn't the same as C's: we print *int for pointer to int and [5]int for array of 5 ints
+function print_type
+	argument type
+	local c
+	:print_type_top
+	c = types + type
+	c = *1c
+	if c == TYPE_VOID goto print_type_void
+	if c == TYPE_CHAR goto print_type_char
+	if c == TYPE_UNSIGNED_CHAR goto print_type_unsigned_char
+	if c == TYPE_SHORT goto print_type_short
+	if c == TYPE_UNSIGNED_SHORT goto print_type_unsigned_short
+	if c == TYPE_INT goto print_type_int
+	if c == TYPE_UNSIGNED_INT goto print_type_unsigned_int
+	if c == TYPE_LONG goto print_type_long
+	if c == TYPE_UNSIGNED_LONG goto print_type_unsigned_long
+	if c == TYPE_FLOAT goto print_type_float
+	if c == TYPE_DOUBLE goto print_type_double
+	if c == TYPE_POINTER goto print_type_pointer ; TYPE_POINTER is followed by the pointed-to type
+	if c == TYPE_ARRAY goto print_type_array ; TYPE_ARRAY is followed by an 8-byte length, then the element type
+	if c == TYPE_STRUCT goto print_type_struct
+	if c == TYPE_UNION goto print_type_union
+	fputs(2, .str_bad_print_type)
+	exit(1)
+	:str_bad_print_type
+		string Bad type passed to print_type.
+		byte 10
+		byte 0
+	:print_type_void
+		return puts(.str_void)
+	:print_type_char
+		return puts(.str_char)
+	:print_type_unsigned_char
+		return puts(.str_unsigned_char)
+	:print_type_short
+		return puts(.str_short)
+	:print_type_unsigned_short
+		return puts(.str_unsigned_short)
+	:print_type_int
+		return puts(.str_int)
+	:print_type_unsigned_int
+		return puts(.str_unsigned_int)
+	:print_type_long
+		return puts(.str_long)
+	:print_type_unsigned_long
+		return puts(.str_unsigned_long)
+	:print_type_float
+		return puts(.str_float)
+	:print_type_double
+		return puts(.str_double)
+	:print_type_pointer
+		putc('*)
+		type += 1
+		goto print_type_top
+	:print_type_array
+		putc('[)
+		type += 1
+		c = types + type
+		putn(*8c) ; UNALIGNED read of the array length; type is an index into types, so it must be read through types + type
+		putc('])
+		type += 8
+		goto print_type_top
+	:print_type_struct
+		return puts(.str_struct)
+	:print_type_union
+		return puts(.str_union)
diff --git a/05/tokenize.b b/05/tokenize.b
index f85dccf..2fc4bc1 100644
--- a/05/tokenize.b
+++ b/05/tokenize.b
@@ -108,10 +108,16 @@ global rodata_end_offset
 ;    uint line
 ;    ulong data
 ; This corresponds to translation phases 5-6 and the first half of 7
-; IMPORTANT: this function uses pointers to pptokens, so they should NOT be freed!
+; IMPORTANT: this function uses pointers to pptokens, so it should NOT be freed!
+; Returns a pointer to the end of tokens.
 function tokenize
 	argument pptokens
 	argument out
+	; you might think we wouldn't need these arguments because the pptokens array starts with
+	; a line directive. but we also use this function to tokenize the expression of a #if,
+	; where that isn't the case.
+	argument initial_filename
+	argument initial_line_number
 	local in
 	local file
 	local line_number
@@ -129,6 +135,11 @@ function tokenize
 	local lower
 	local upper
 	
+	file_add(initial_filename)
+	file = file_get_index(initial_filename)
+	line_number = initial_line_number
+	
+	
 	in = pptokens
 	:tokenize_loop
 		c = *1in
@@ -301,10 +312,10 @@ function tokenize
 			:float_have_significand_and_exponent
 			if significand == 0 goto float_zero
 			normalize_float(&significand, &exponent)
-			putn(significand)
-			putc(32)
-			putn_signed(exponent)
-			putc(10)
+			; putn(significand)
+			; putc(32)
+			; putn_signed(exponent)
+			; putc(10)
 			; make number round to the nearest representable float roughly (this is what gcc does)
 			; this fails for 5e-100 probably because of imprecision, but mostly works
 			significand += 15
@@ -357,8 +368,15 @@ function tokenize
 				data = 0x7ff0000000000000 ; double infinity
 				goto float_have_data
 	:tokenize_loop_end
+	; EOF token
+	*1out = TOKEN_EOF
+	out += 2
+	*2out = file
+	out += 2
+	*4out = line_number
+	out += 12
 	
-	return 0
+	return out
 	:f_suffix_on_integer
 		compile_error(file, line_number, .str_f_suffix_on_integer)
 	:str_f_suffix_on_integer
@@ -581,6 +599,7 @@ function print_tokens
 		if *1p == TOKEN_CONSTANT_FLOAT goto print_token_float
 		if *1p == TOKEN_STRING_LITERAL goto print_token_string_literal
 		if *1p == TOKEN_IDENTIFIER goto print_token_identifier
+		if *1p == TOKEN_EOF goto print_token_eof
 		fputs(2, .str_print_bad_token)
 		exit(1)
 		:print_token_keyword
@@ -603,9 +622,13 @@ function print_tokens
 		:print_token_float
 			p += 8
 			puts(.str_constant_float)
-			putx(*8p)
+			putx64(*8p)
 			p += 8
+			putc(32)
 			goto print_tokens_loop
+		:print_token_eof
+			puts(.str_eof)
+			goto print_token_data
 		:print_token_info
 		p += 1
 		putc('~)
@@ -643,3 +666,6 @@ function print_tokens
 		string Unrecognized token type in print_tokens. Aborting.
 		byte 10
 		byte 0
+	:str_eof
+		string EOF
+		byte 0
diff --git a/05/util.b b/05/util.b
index 138e440..4fe72aa 100644
--- a/05/util.b
+++ b/05/util.b
@@ -44,6 +44,14 @@ function left_shift
 	:left_shift_negative
 	n = 0 - n
 	return x > n
+
+function max_signed
+	argument a
+	argument b
+	if a <= b goto maxs_return_b ; when the two are equal, b is returned
+	return a
+	:maxs_return_b
+	return b
 	
 function file_error
 	argument name
@@ -327,26 +335,45 @@ function fputn_signed
 		fputn(fd, n)
 		return
 		
-function fputx
+:hex_digits
+	string 0123456789abcdef
+
+function fputx64
 	argument fd
 	argument n
 	local m
 	local x
 	m = 60
-	:fputx_loop
+	:fputx64_loop
 		x = n > m
 		x &= 0xf
 		x += .hex_digits
 		fputc(fd, *1x)
 		m -= 4
-		if m >= 0 goto fputx_loop
+		if m >= 0 goto fputx64_loop
 	return
-:hex_digits
-	string 0123456789abcdef
-
-function putx
+function putx64
+	argument n
+	fputx64(1, n) ; write n as 16 hex digits to file descriptor 1
+	return
+	
+function fputx32
+	argument fd
+	argument n
+	local shift
+	local digit
+	shift = 28 ; start at the most significant of the 8 hex digits
+	:fputx32_loop
+		digit = n > shift
+		digit &= 0xf
+		digit += .hex_digits
+		fputc(fd, *1digit)
+		shift -= 4
+		if shift >= 0 goto fputx32_loop ; the last digit is written with a shift of 0
+	return
+function putx32
 	argument n
-	fputx(1, n)
+	fputx32(1, n)
 	return
 
 function putn