summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorpommicket <pommicket@gmail.com>2022-01-13 16:12:28 -0500
committerpommicket <pommicket@gmail.com>2022-01-13 16:12:28 -0500
commit13363eac1af870ea256b35843078fa890cea6f24 (patch)
treef00a11cca9630bbb4664c66f08af2fc1477e5dde
parente43f32b93217dc712d113d43feb8d1e7b8177422 (diff)
simple expressions
-rw-r--r--05/constants.b138
-rw-r--r--05/main.b52
-rw-r--r--05/main.c6
-rw-r--r--05/parse.b216
-rw-r--r--05/tokenize.b40
-rw-r--r--05/util.b43
6 files changed, 470 insertions, 25 deletions
diff --git a/05/constants.b b/05/constants.b
index b3d23f5..80a7a0a 100644
--- a/05/constants.b
+++ b/05/constants.b
@@ -77,6 +77,7 @@
#define TOKEN_CONSTANT_INT 3
#define TOKEN_CONSTANT_CHAR 4
#define TOKEN_STRING_LITERAL 5
+#define TOKEN_EOF 6
; these are stored in the "info" field of the token
#define NUMBER_NO_SUFFIX 0
@@ -118,6 +119,104 @@
#define KEYWORD_STATIC 51
#define KEYWORD_WHILE 52
+; the format of expression headers is:
+; uchar kind (one of the constants below)
+; uchar info
+; ushort (padding)
+; uint type (0 if expression hasn't been typed yet)
+; immediately following the header in memory are the arguments of the expression
+; - for constant ints, the 64-bit integral value
+; - for constant floats, the 64-bit double value (even if expression has type float)
+; - for string literals, a 64-bit pointer to the string (for the executable, not for the compiler)
+; - for unary operators, the operand
+; - for binary operators, the first operand followed by the second
+; - for the operators . and ->, the first operand is an expression and the second is just a pointer to the name of the member
+; - for the ternary operator ? :, the first followed by the second followed by the third
+; - for function calls, the function, followed by each of the arguments to the function — info indicates the number of arguments
+; Note that file/line number are not stored in expressions.
+#define EXPRESSION_IDENTIFIER 200
+#define EXPRESSION_CONSTANT_INT 201
+#define EXPRESSION_CONSTANT_FLOAT 202
+#define EXPRESSION_STRING_LITERAL 203
+#define EXPRESSION_SUBSCRIPT 204
+#define EXPRESSION_CALL 205
+#define EXPRESSION_DOT 206
+#define EXPRESSION_ARROW 207
+#define EXPRESSION_POST_INCREMENT 208
+#define EXPRESSION_POST_DECREMENT 209
+#define EXPRESSION_PRE_INCREMENT 210
+#define EXPRESSION_PRE_DECREMENT 211
+#define EXPRESSION_ADDRESS_OF 212
+#define EXPRESSION_DEREFERENCE 213
+; this matters for promotion. if x is a char, sizeof(+x) should be sizeof(int)
+#define EXPRESSION_UNARY_PLUS 214
+#define EXPRESSION_UNARY_MINUS 215
+#define EXPRESSION_BITWISE_NOT 216
+#define EXPRESSION_NOT 217
+#define EXPRESSION_SIZEOF 218
+#define EXPRESSION_CAST 219
+#define EXPRESSION_MUL 220
+#define EXPRESSION_DIV 221
+#define EXPRESSION_REMAINDER 222
+#define EXPRESSION_ADD 223
+#define EXPRESSION_SUB 224
+#define EXPRESSION_LSHIFT 225
+#define EXPRESSION_RSHIFT 226
+#define EXPRESSION_LT 227
+#define EXPRESSION_GT 228
+#define EXPRESSION_LEQ 229
+#define EXPRESSION_GEQ 230
+#define EXPRESSION_EQ 231
+#define EXPRESSION_NEQ 232
+#define EXPRESSION_BITWISE_AND 233
+#define EXPRESSION_BITWISE_XOR 234
+#define EXPRESSION_BITWISE_OR 235
+#define EXPRESSION_AND 236
+#define EXPRESSION_OR 237
+; e.g. x == 5 ? 6 : 7
+#define EXPRESSION_CONDITIONAL 238
+#define EXPRESSION_ASSIGN 239
+#define EXPRESSION_ASSIGN_ADD 240
+#define EXPRESSION_ASSIGN_SUB 241
+#define EXPRESSION_ASSIGN_MUL 242
+#define EXPRESSION_ASSIGN_DIV 243
+#define EXPRESSION_ASSIGN_REMAINDER 244
+#define EXPRESSION_ASSIGN_LSHIFT 245
+#define EXPRESSION_ASSIGN_RSHIFT 246
+#define EXPRESSION_ASSIGN_AND 247
+#define EXPRESSION_ASSIGN_XOR 248
+#define EXPRESSION_ASSIGN_OR 249
+#define EXPRESSION_COMMA 250
+
+; TYPES: A type is a 4-byte index into the global array `types`. Byte 0 in `types`
+; is reserved, and bytes 1-16 contain the values 1-16. Thus TYPE_INT, etc.
+; can be used as types directly.
+; The format of each type is as follows:
+; char, unsigned char, etc.: TYPE_CHAR, TYPE_UNSIGNED_CHAR, etc. as a single byte
+; pointer to type t: TYPE_POINTER t
+; array of n t's: TYPE_ARRAY {n as 8 bytes} t
+; struct/union: TYPE_STRUCT/TYPE_UNION {0 for incomplete types/4-byte pointer to struct/union}
+; NOTE: we just treat function pointers as pointers to the function return type.
+#define TYPE_VOID 1
+#define TYPE_CHAR 3
+#define TYPE_UNSIGNED_CHAR 4
+#define TYPE_SHORT 5
+#define TYPE_UNSIGNED_SHORT 6
+#define TYPE_INT 7
+#define TYPE_UNSIGNED_INT 8
+#define TYPE_LONG 9
+#define TYPE_UNSIGNED_LONG 10
+#define TYPE_FLOAT 11
+; note that long double is treated the same as double.
+#define TYPE_DOUBLE 12
+#define TYPE_POINTER 13
+#define TYPE_STRUCT 14
+#define TYPE_UNION 15
+#define TYPE_ARRAY 16
+
+; types will be initialized (in main) so that this refers to the type char*
+#define TYPE_POINTER_TO_CHAR 20
+
:keyword_table
byte SYMBOL_SEMICOLON
byte 59
@@ -538,3 +637,42 @@
:str___STDC__
string __STDC__
byte 0
+:str_void
+ string void
+ byte 0
+:str_char
+ string char
+ byte 0
+:str_unsigned_char
+ string unsigned char
+ byte 0
+:str_short
+ string short
+ byte 0
+:str_unsigned_short
+ string unsigned short
+ byte 0
+:str_int
+ string int
+ byte 0
+:str_unsigned_int
+ string unsigned int
+ byte 0
+:str_long
+ string long
+ byte 0
+:str_unsigned_long
+ string unsigned long
+ byte 0
+:str_float
+ string float
+ byte 0
+:str_double
+ string double
+ byte 0
+:str_struct
+ string struct
+ byte 0
+:str_union
+ string union
+ byte 0
diff --git a/05/main.b b/05/main.b
index f59eb4e..7f6eaa1 100644
--- a/05/main.b
+++ b/05/main.b
@@ -41,6 +41,18 @@ function compile_error
fputc(2, 10)
exit(1)
+; report a compile error at the location recorded in a token.
+; token: pointer to a token; message: passed through to compile_error unchanged.
+; the file and line number are read out of the token and handed to compile_error.
+function token_error
+ argument token
+ argument message
+ local p
+ local file
+ local line
+ ; the file is stored in the 2 bytes at offset 2 of the token
+ p = token + 2
+ file = *2p
+ ; the line number is stored in the 4 bytes at offset 4 of the token
+ p += 2
+ line = *4p
+ compile_error(file, line, message)
+
; accepts EITHER file index OR pointer to filename
function compile_warning
argument file
@@ -71,10 +83,15 @@ function compile_warning
; 10^i = significand * 2^exponent
global powers_of_10
+global types
+global types_end
+
#include util.b
#include constants.b
#include preprocess.b
#include tokenize.b
+#include parse.b
+
function main
argument argv2
@@ -86,7 +103,9 @@ function main
local pptokens
local processed_pptokens
local tokens
-
+ local ast
+ local p
+ local i
fill_in_powers_of_10()
dat_banned_objmacros = 255
@@ -97,6 +116,23 @@ function main
object_macros = malloc(4000000)
function_macros = malloc(4000000)
+ types = malloc(16000000)
+ ; set byte i of types to the value i, for i = 0..16, so that the basic
+ ; TYPE_* constants can be used directly as indices into types.
+ i = 0
+ p = types
+ :fill_initial_types_loop
+ *1p = i
+ p += 1
+ i += 1
+ if i <= 16 goto fill_initial_types_loop
+ ; the type char* lives at index TYPE_POINTER_TO_CHAR: TYPE_POINTER followed by TYPE_CHAR
+ p = types + TYPE_POINTER_TO_CHAR
+ *1p = TYPE_POINTER
+ p += 1
+ *1p = TYPE_CHAR
+ ; step past the TYPE_CHAR byte we just wrote, so that types_end points one past the
+ ; last byte in use (otherwise the next type appended would overwrite part of char*)
+ p += 1
+
+ types_end = p
+
+
input_filename = .str_default_input_filename
output_filename = .str_default_output_filename
if argc == 1 goto have_filenames
@@ -104,6 +140,9 @@ function main
input_filename = argv1
output_filename = argv2
:have_filenames
+ output_fd = open_w(output_filename)
+ rodata_end_offset = RODATA_OFFSET
+
pptokens = split_into_preprocessing_tokens(input_filename)
;print_pptokens(pptokens)
;print_separator()
@@ -116,14 +155,17 @@ function main
;print_object_macros()
;print_function_macros()
- output_fd = open_w(output_filename)
- rodata_end_offset = RODATA_OFFSET
-
tokens = malloc(16000000)
- tokenize(pptokens, tokens)
+ p = tokenize(pptokens, tokens, input_filename, 1)
print_tokens(tokens)
; NOTE: do NOT free pptokens as identifiers still reference them.
+ ast = malloc(56000000)
+ p -= 16
+ parse_expression(tokens, p, ast)
+ print_expression(ast)
+ putc(10)
+
exit(0)
:usage_error
diff --git a/05/main.c b/05/main.c
index 663661d..67fe32d 100644
--- a/05/main.c
+++ b/05/main.c
@@ -1,5 +1 @@
-"Hello ther" "e good fellow."
-char * = "How are you"" d""o""i""ng today?\n";
-hi
-_TEST _ING _1
-5e+307
+'a'
diff --git a/05/parse.b b/05/parse.b
new file mode 100644
index 0000000..7502882
--- /dev/null
+++ b/05/parse.b
@@ -0,0 +1,216 @@
+; parse the expression made up of the tokens from tokens up to (not including) tokens_end,
+; and write it to out in the expression format described in constants.b.
+; each token is 16 bytes. currently only single-token expressions are handled:
+; integer constants, character constants, floating-point constants, and string literals.
+; returns a pointer to the end of the expression written (out + 16 for all current cases).
+; an empty or unrecognized expression is reported through token_error.
+function parse_expression
+ argument tokens
+ argument tokens_end
+ argument out
+ local in
+ local a
+ local b
+ local c
+ local p
+ local value
+
+ if tokens == tokens_end goto empty_expression
+ p = tokens + 16
+ if p == tokens_end goto single_token_expression
+
+ goto unrecognized_expression
+
+ :single_token_expression
+ in = tokens
+ c = *1in
+ if c == TOKEN_CONSTANT_INT goto expression_integer
+ if c == TOKEN_CONSTANT_CHAR goto expression_integer ; character constants are basically the same as integer constants
+ if c == TOKEN_CONSTANT_FLOAT goto expression_float
+ if c == TOKEN_STRING_LITERAL goto expression_string_literal
+ ; no other token type is handled here yet
+ byte 0xcc
+
+ :expression_integer
+ *1out = EXPRESSION_CONSTANT_INT
+ ; copy the 64-bit value from the token's data field to just after the expression header
+ p = in + 8
+ value = *8p
+ p = out + 8
+ *8p = value
+
+ p = in + 1
+ a = int_suffix_to_type(*1p) ; what the suffix says the type should be
+ b = int_value_to_type(value) ; what the value says the type should be (if the value is too large to fit in int)
+ ; the integer types alternate signed (odd) / unsigned (even):
+ ; int = 7, unsigned int = 8, long = 9, unsigned long = 10
+ ; (c is no longer needed as the token type, so reuse it here)
+ c = a & 1 ; 0 if the suffix asked for an unsigned type, 1 otherwise
+ a = max_signed(a, b) ; take the maximum of the two types
+ if c == 1 goto expression_integer_have_type
+ ; the integer has a u suffix, so make sure the type is unsigned:
+ ; round a signed type up to its unsigned counterpart (7 -> 8, 9 -> 10).
+ ; types which are already unsigned are left unchanged (8 -> 8, 10 -> 10).
+ a += 1
+ a &= 0xfe
+ :expression_integer_have_type
+ p = out + 4
+ *4p = a
+ in += 16
+ out += 16
+ return out
+
+ :expression_float
+ *1out = EXPRESSION_CONSTANT_FLOAT
+ ; copy the 64-bit double value from the token
+ p = in + 8
+ value = *8p
+ p = out + 8
+ *8p = value
+
+ ; the type (float or double) is determined by the suffix alone
+ p = in + 1
+ a = float_suffix_to_type(*1p)
+
+ p = out + 4
+ *4p = a
+
+ in += 16
+ out += 16
+ return out
+
+ :expression_string_literal
+ *1out = EXPRESSION_STRING_LITERAL
+ ; copy the token's 64-bit data field
+ p = in + 8
+ value = *8p
+ p = out + 8
+ *8p = value
+
+ ; we already know this is char*
+ p = out + 4
+ *4p = TYPE_POINTER_TO_CHAR
+
+ in += 16
+ out += 16
+ return out
+
+
+ :empty_expression
+ token_error(tokens, .str_empty_expression)
+ :str_empty_expression
+ string Empty expression.
+ byte 0
+ :unrecognized_expression
+ token_error(tokens, .str_unrecognized_expression)
+ :str_unrecognized_expression
+ string Unrecognized expression.
+ byte 0
+
+; shared return points used (via goto) by int_suffix_to_type, float_suffix_to_type
+; and int_value_to_type below: each one returns the corresponding TYPE_ constant.
+:return_type_int
+ return TYPE_INT
+:return_type_long
+ return TYPE_LONG
+:return_type_unsigned_int
+ return TYPE_UNSIGNED_INT
+:return_type_unsigned_long
+ return TYPE_UNSIGNED_LONG
+:return_type_float
+ return TYPE_FLOAT
+:return_type_double
+ return TYPE_DOUBLE
+
+; the type which the suffix of an integer constant asks for:
+; NUMBER_SUFFIX_L -> long, NUMBER_SUFFIX_U -> unsigned int, NUMBER_SUFFIX_UL -> unsigned long,
+; anything else -> int.
+function int_suffix_to_type
+ argument suffix
+ if suffix == NUMBER_SUFFIX_L goto return_type_long
+ if suffix == NUMBER_SUFFIX_U goto return_type_unsigned_int
+ if suffix == NUMBER_SUFFIX_UL goto return_type_unsigned_long
+ goto return_type_int
+
+; the type of a floating-point constant with the given suffix:
+; NUMBER_SUFFIX_F -> float, anything else -> double.
+function float_suffix_to_type
+ argument suffix
+ if suffix == NUMBER_SUFFIX_F goto return_type_float
+ goto return_type_double
+
+; smallest integer type which can fit this value, only using unsigned if necessary
+; values below 0x80000000 give int, values below 0x8000000000000000 give long,
+; and anything larger gives unsigned long.
+; NOTE(review): [ is presumably the unsigned less-than comparison -- confirm against the language docs.
+function int_value_to_type
+ argument value
+ if value [ 0x80000000 goto return_type_int
+ if value [ 0x8000000000000000 goto return_type_long
+ goto return_type_unsigned_long
+
+; print an expression as "(type)value".
+; the type is read from the 4 bytes at offset 4 of the expression and printed with print_type;
+; the value is read from the 8 bytes at offset 8.
+; only constant ints, constant floats and string literals are handled.
+function print_expression
+ argument expression
+ local c
+ local p
+ p = expression + 4
+ ; 40 and 41 are the characters ( and )
+ putc(40)
+ print_type(*4p)
+ putc(41)
+ c = *1expression
+
+ if c == EXPRESSION_CONSTANT_INT goto print_expr_int
+ if c == EXPRESSION_CONSTANT_FLOAT goto print_expr_float
+ if c == EXPRESSION_STRING_LITERAL goto print_expr_str
+ ; no other expression kind is handled here yet
+ byte 0xcc
+ :print_expr_int
+ expression += 8
+ putn(*8expression)
+ return
+ :print_expr_float
+ ; floats are printed as the 64-bit value in hexadecimal (putx64)
+ expression += 8
+ putx64(*8expression)
+ return
+ :print_expr_str
+ ; string literals are printed as 0x followed by the value in hexadecimal (putx32)
+ expression += 8
+ putc('0)
+ putc('x)
+ putx32(*8expression)
+ return
+
+; print the type with the given index into the global array types.
+; NOTE: to make things easier, the format which this outputs isn't the same as C's, specifically we have
+; *int for pointer to int and [5]int for array of 5 ints
+; for structs and unions, only the word struct/union is printed.
+; if the byte at types + type is not a known TYPE_ constant, an error is printed and we exit.
+function print_type
+ argument type
+ local c
+ :print_type_top
+ ; type is an index into types, not a pointer; c is first the address of the type, then its first byte
+ c = types + type
+ c = *1c
+ if c == TYPE_VOID goto print_type_void
+ if c == TYPE_CHAR goto print_type_char
+ if c == TYPE_UNSIGNED_CHAR goto print_type_unsigned_char
+ if c == TYPE_SHORT goto print_type_short
+ if c == TYPE_UNSIGNED_SHORT goto print_type_unsigned_short
+ if c == TYPE_INT goto print_type_int
+ if c == TYPE_UNSIGNED_INT goto print_type_unsigned_int
+ if c == TYPE_LONG goto print_type_long
+ if c == TYPE_UNSIGNED_LONG goto print_type_unsigned_long
+ if c == TYPE_FLOAT goto print_type_float
+ if c == TYPE_DOUBLE goto print_type_double
+ if c == TYPE_POINTER goto print_type_pointer
+ if c == TYPE_ARRAY goto print_type_array
+ if c == TYPE_STRUCT goto print_type_struct
+ if c == TYPE_UNION goto print_type_union
+ fputs(2, .str_bad_print_type)
+ exit(1)
+ :str_bad_print_type
+ string Bad type passed to print_type.
+ byte 10
+ byte 0
+ :print_type_void
+ return puts(.str_void)
+ :print_type_char
+ return puts(.str_char)
+ :print_type_unsigned_char
+ return puts(.str_unsigned_char)
+ :print_type_short
+ return puts(.str_short)
+ :print_type_unsigned_short
+ return puts(.str_unsigned_short)
+ :print_type_int
+ return puts(.str_int)
+ :print_type_unsigned_int
+ return puts(.str_unsigned_int)
+ :print_type_long
+ return puts(.str_long)
+ :print_type_unsigned_long
+ return puts(.str_unsigned_long)
+ :print_type_float
+ return puts(.str_float)
+ :print_type_double
+ return puts(.str_double)
+ :print_type_pointer
+ ; TYPE_POINTER is followed by the type pointed to
+ putc('*)
+ type += 1
+ goto print_type_top
+ :print_type_array
+ ; TYPE_ARRAY is followed by the number of elements (8 bytes), then the element type
+ putc('[)
+ type += 1
+ ; type is an index into types, so turn it into a pointer before reading the length
+ c = types + type
+ putn(*8c) ; UNALIGNED
+ putc('])
+ type += 8
+ goto print_type_top
+ :print_type_struct
+ return puts(.str_struct)
+ :print_type_union
+ return puts(.str_union)
diff --git a/05/tokenize.b b/05/tokenize.b
index f85dccf..2fc4bc1 100644
--- a/05/tokenize.b
+++ b/05/tokenize.b
@@ -108,10 +108,16 @@ global rodata_end_offset
; uint line
; ulong data
; This corresponds to translation phases 5-6 and the first half of 7
-; IMPORTANT: this function uses pointers to pptokens, so they should NOT be freed!
+; IMPORTANT: this function uses pointers to pptokens, so it should NOT be freed!
+; Returns a pointer to the end of tokens.
function tokenize
argument pptokens
argument out
+ ; you might think we wouldn't need these arguments because the pptokens array starts with
+ ; a line directive. but we also use this function to tokenize the expression of a #if,
+ ; where that isn't the case.
+ argument initial_filename
+ argument initial_line_number
local in
local file
local line_number
@@ -129,6 +135,11 @@ function tokenize
local lower
local upper
+ file_add(initial_filename)
+ file = file_get_index(initial_filename)
+ line_number = initial_line_number
+
+
in = pptokens
:tokenize_loop
c = *1in
@@ -301,10 +312,10 @@ function tokenize
:float_have_significand_and_exponent
if significand == 0 goto float_zero
normalize_float(&significand, &exponent)
- putn(significand)
- putc(32)
- putn_signed(exponent)
- putc(10)
+ ; putn(significand)
+ ; putc(32)
+ ; putn_signed(exponent)
+ ; putc(10)
; make number round to the nearest representable float roughly (this is what gcc does)
; this fails for 5e-100 probably because of imprecision, but mostly works
significand += 15
@@ -357,8 +368,15 @@ function tokenize
data = 0x7ff0000000000000 ; double infinity
goto float_have_data
:tokenize_loop_end
+ ; EOF token
+ *1out = TOKEN_EOF
+ out += 2
+ *2out = file
+ out += 2
+ *4out = line_number
+ out += 12
- return 0
+ return out
:f_suffix_on_integer
compile_error(file, line_number, .str_f_suffix_on_integer)
:str_f_suffix_on_integer
@@ -581,6 +599,7 @@ function print_tokens
if *1p == TOKEN_CONSTANT_FLOAT goto print_token_float
if *1p == TOKEN_STRING_LITERAL goto print_token_string_literal
if *1p == TOKEN_IDENTIFIER goto print_token_identifier
+ if *1p == TOKEN_EOF goto print_token_eof
fputs(2, .str_print_bad_token)
exit(1)
:print_token_keyword
@@ -603,9 +622,13 @@ function print_tokens
:print_token_float
p += 8
puts(.str_constant_float)
- putx(*8p)
+ putx64(*8p)
p += 8
+ putc(32)
goto print_tokens_loop
+ :print_token_eof
+ puts(.str_eof)
+ goto print_token_data
:print_token_info
p += 1
putc('~)
@@ -643,3 +666,6 @@ function print_tokens
string Unrecognized token type in print_tokens. Aborting.
byte 10
byte 0
+ :str_eof
+ string EOF
+ byte 0
diff --git a/05/util.b b/05/util.b
index 138e440..4fe72aa 100644
--- a/05/util.b
+++ b/05/util.b
@@ -44,6 +44,14 @@ function left_shift
:left_shift_negative
n = 0 - n
return x > n
+
+; returns the larger of a and b (b if they are equal).
+; NOTE(review): per the name, the > comparison here is presumably signed -- confirm.
+function max_signed
+ argument a
+ argument b
+ if a > b goto maxs_return_a
+ return b
+ :maxs_return_a
+ return a
function file_error
argument name
@@ -327,26 +335,45 @@ function fputn_signed
fputn(fd, n)
return
-function fputx
+:hex_digits
+ string 0123456789abcdef
+
+function fputx64
argument fd
argument n
local m
local x
m = 60
- :fputx_loop
+ :fputx64_loop
x = n > m
x &= 0xf
x += .hex_digits
fputc(fd, *1x)
m -= 4
- if m >= 0 goto fputx_loop
+ if m >= 0 goto fputx64_loop
return
-:hex_digits
- string 0123456789abcdef
-
-function putx
+function putx64
+ argument n
+ fputx64(1, n)
+ return
+
+; write the low 32 bits of n to the file descriptor fd as 8 lowercase hexadecimal digits,
+; most significant digit first.
+function fputx32
+ argument fd
+ argument n
+ local m
+ local x
+ ; m is the shift amount for the current digit: 28, 24, ..., 0
+ m = 28
+ :fputx32_loop
+ ; extract the 4-bit digit and look up its character in hex_digits
+ x = n > m
+ x &= 0xf
+ x += .hex_digits
+ fputc(fd, *1x)
+ m -= 4
+ if m >= 0 goto fputx32_loop
+ return
+function putx32
argument n
- fputx(1, n)
+ fputx32(1, n)
return
function putn