From 13363eac1af870ea256b35843078fa890cea6f24 Mon Sep 17 00:00:00 2001 From: pommicket Date: Thu, 13 Jan 2022 16:12:28 -0500 Subject: simple expressions --- 05/constants.b | 138 ++++++++++++++++++++++++++++++++++++ 05/main.b | 52 ++++++++++++-- 05/main.c | 6 +- 05/parse.b | 216 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 05/tokenize.b | 40 +++++++++-- 05/util.b | 43 +++++++++--- 6 files changed, 470 insertions(+), 25 deletions(-) create mode 100644 05/parse.b (limited to '05') diff --git a/05/constants.b b/05/constants.b index b3d23f5..80a7a0a 100644 --- a/05/constants.b +++ b/05/constants.b @@ -77,6 +77,7 @@ #define TOKEN_CONSTANT_INT 3 #define TOKEN_CONSTANT_CHAR 4 #define TOKEN_STRING_LITERAL 5 +#define TOKEN_EOF 6 ; these are stored in the "info" field of the token #define NUMBER_NO_SUFFIX 0 @@ -118,6 +119,104 @@ #define KEYWORD_STATIC 51 #define KEYWORD_WHILE 52 +; the format of expression headers is: +; uchar kind (one of the constants below) +; uchar info +; ushort (padding) +; uint type (0 if expression hasn't been typed yet) +; immediately following the header in memory are the arguments of the expression +; - for constant ints, the 64-bit integral value +; - for constant floats, the 64-bit double value (even if expression has type float) +; - for string literals, a 64-bit pointer to the string (for the executable, not for the compiler) +; - for unary operators, the operand +; - for binary operators, the first operand followed by the second +; - for the operators . and ->, the first operand is an expression and the second is just a pointer to the name of the member +; - for the ternary operator ? :, the first followed by the second followed by the third +; - for function calls, the function, followed by each of the arguments to the function — info indicates the number of arguments +; Note that file/line number are not stored in expressions. 
+#define EXPRESSION_IDENTIFIER 200 +#define EXPRESSION_CONSTANT_INT 201 +#define EXPRESSION_CONSTANT_FLOAT 202 +#define EXPRESSION_STRING_LITERAL 203 +#define EXPRESSION_SUBSCRIPT 204 +#define EXPRESSION_CALL 205 +#define EXPRESSION_DOT 206 +#define EXPRESSION_ARROW 207 +#define EXPRESSION_POST_INCREMENT 208 +#define EXPRESSION_POST_DECREMENT 209 +#define EXPRESSION_PRE_INCREMENT 210 +#define EXPRESSION_PRE_DECREMENT 211 +#define EXPRESSION_ADDRESS_OF 212 +#define EXPRESSION_DEREFERENCE 213 +; this matters for promotion. if x is a char, sizeof(+x) should be sizeof(int) +#define EXPRESSION_UNARY_PLUS 214 +#define EXPRESSION_UNARY_MINUS 215 +#define EXPRESSION_BITWISE_NOT 216 +#define EXPRESSION_NOT 217 +#define EXPRESSION_SIZEOF 218 +#define EXPRESSION_CAST 219 +#define EXPRESSION_MUL 220 +#define EXPRESSION_DIV 221 +#define EXPRESSION_REMAINDER 222 +#define EXPRESSION_ADD 223 +#define EXPRESSION_SUB 224 +#define EXPRESSION_LSHIFT 225 +#define EXPRESSION_RSHIFT 226 +#define EXPRESSION_LT 227 +#define EXPRESSION_GT 228 +#define EXPRESSION_LEQ 229 +#define EXPRESSION_GEQ 230 +#define EXPRESSION_EQ 231 +#define EXPRESSION_NEQ 232 +#define EXPRESSION_BITWISE_AND 233 +#define EXPRESSION_BITWISE_XOR 234 +#define EXPRESSION_BITWISE_OR 235 +#define EXPRESSION_AND 236 +#define EXPRESSION_OR 237 +; e.g. x == 5 ? 6 : 7 +#define EXPRESSION_CONDITIONAL 238 +#define EXPRESSION_ASSIGN 239 +#define EXPRESSION_ASSIGN_ADD 240 +#define EXPRESSION_ASSIGN_SUB 241 +#define EXPRESSION_ASSIGN_MUL 242 +#define EXPRESSION_ASSIGN_DIV 243 +#define EXPRESSION_ASSIGN_REMAINDER 244 +#define EXPRESSION_ASSIGN_LSHIFT 245 +#define EXPRESSION_ASSIGN_RSHIFT 246 +#define EXPRESSION_ASSIGN_AND 247 +#define EXPRESSION_ASSIGN_XOR 248 +#define EXPRESSION_ASSIGN_OR 249 +#define EXPRESSION_COMMA 250 + +; TYPES: A type is a 4-byte index into the global array `types`. Byte 0 in `types` +; is reserved, and bytes 1-16 contain the values 1-16. Thus TYPE_INT, etc. +; can be used as types directly. 
+; The format of each type is as follows: +; char, unsigned char, etc.: TYPE_CHAR, TYPE_UNSIGNED_CHAR, etc. as a single byte +; pointer to type t: TYPE_POINTER t +; array of n t's: TYPE_ARRAY {n as 8 bytes} t +; struct/union: TYPE_STRUCT/TYPE_UNION {0 for incomplete types/4-byte pointer to struct/union} +; NOTE: we just treat function pointers as pointers to the function return type. +#define TYPE_VOID 1 +#define TYPE_CHAR 3 +#define TYPE_UNSIGNED_CHAR 4 +#define TYPE_SHORT 5 +#define TYPE_UNSIGNED_SHORT 6 +#define TYPE_INT 7 +#define TYPE_UNSIGNED_INT 8 +#define TYPE_LONG 9 +#define TYPE_UNSIGNED_LONG 10 +#define TYPE_FLOAT 11 +; note that long double is treated the same as double. +#define TYPE_DOUBLE 12 +#define TYPE_POINTER 13 +#define TYPE_STRUCT 14 +#define TYPE_UNION 15 +#define TYPE_ARRAY 16 + +; types will be initialized (in main) so that this refers to the type char* +#define TYPE_POINTER_TO_CHAR 20 + :keyword_table byte SYMBOL_SEMICOLON byte 59 @@ -538,3 +637,42 @@ :str___STDC__ string __STDC__ byte 0 +:str_void + string void + byte 0 +:str_char + string char + byte 0 +:str_unsigned_char + string unsigned char + byte 0 +:str_short + string short + byte 0 +:str_unsigned_short + string unsigned short + byte 0 +:str_int + string int + byte 0 +:str_unsigned_int + string unsigned int + byte 0 +:str_long + string long + byte 0 +:str_unsigned_long + string unsigned long + byte 0 +:str_float + string float + byte 0 +:str_double + string double + byte 0 +:str_struct + string struct + byte 0 +:str_union + string union + byte 0 diff --git a/05/main.b b/05/main.b index f59eb4e..7f6eaa1 100644 --- a/05/main.b +++ b/05/main.b @@ -41,6 +41,18 @@ function compile_error fputc(2, 10) exit(1) +function token_error + argument token + argument message + local p + local file + local line + p = token + 2 + file = *2p + p += 2 + line = *4p + compile_error(file, line, message) + ; accepts EITHER file index OR pointer to filename function compile_warning argument file @@ -71,10
+83,15 @@ function compile_warning ; 10^i = significand * 2^exponent global powers_of_10 +global types +global types_end + #include util.b #include constants.b #include preprocess.b #include tokenize.b +#include parse.b + function main argument argv2 @@ -86,7 +103,9 @@ function main local pptokens local processed_pptokens local tokens - + local ast + local p + local i fill_in_powers_of_10() dat_banned_objmacros = 255 @@ -97,6 +116,23 @@ function main object_macros = malloc(4000000) function_macros = malloc(4000000) + types = malloc(16000000) + i = 0 + p = types + :fill_initial_types_loop + *1p = i + p += 1 + i += 1 + if i <= 16 goto fill_initial_types_loop + p = types + TYPE_POINTER_TO_CHAR + *1p = TYPE_POINTER + p += 1 + *1p = TYPE_CHAR + + + types_end = p + + input_filename = .str_default_input_filename output_filename = .str_default_output_filename if argc == 1 goto have_filenames @@ -104,6 +140,9 @@ function main input_filename = argv1 output_filename = argv2 :have_filenames + output_fd = open_w(output_filename) + rodata_end_offset = RODATA_OFFSET + pptokens = split_into_preprocessing_tokens(input_filename) ;print_pptokens(pptokens) ;print_separator() @@ -116,14 +155,17 @@ function main ;print_object_macros() ;print_function_macros() - output_fd = open_w(output_filename) - rodata_end_offset = RODATA_OFFSET - tokens = malloc(16000000) - tokenize(pptokens, tokens) + p = tokenize(pptokens, tokens, input_filename, 1) print_tokens(tokens) ; NOTE: do NOT free pptokens as identifiers still reference them. + ast = malloc(56000000) + p -= 16 + parse_expression(tokens, p, ast) + print_expression(ast) + putc(10) + exit(0) :usage_error diff --git a/05/main.c b/05/main.c index 663661d..67fe32d 100644 --- a/05/main.c +++ b/05/main.c @@ -1,5 +1 @@ -"Hello ther" "e good fellow." 
-char * = "How are you"" d""o""i""ng today?\n"; -hi -_TEST _ING _1 -5e+307 +'a' diff --git a/05/parse.b b/05/parse.b new file mode 100644 index 0000000..7502882 --- /dev/null +++ b/05/parse.b @@ -0,0 +1,216 @@ +function parse_expression + argument tokens + argument tokens_end + argument out + local in + local a + local b + local c + local p + local value + + if tokens == tokens_end goto empty_expression + p = tokens + 16 + if p == tokens_end goto single_token_expression + + goto unrecognized_expression + + :single_token_expression + in = tokens + c = *1in + if c == TOKEN_CONSTANT_INT goto expression_integer + if c == TOKEN_CONSTANT_CHAR goto expression_integer ; character constants are basically the same as integer constants + if c == TOKEN_CONSTANT_FLOAT goto expression_float + if c == TOKEN_STRING_LITERAL goto expression_string_literal + byte 0xcc + + :expression_integer + *1out = EXPRESSION_CONSTANT_INT + p = in + 8 + value = *8p + p = out + 8 + *8p = value + + p = in + 1 + a = int_suffix_to_type(*1p) ; what the suffix says the type should be + b = int_value_to_type(value) ; what the value says the type should be (if the value is too large to fit in int) + a = max_signed(a, b) ; take the maximum of the two types + ; make sure that if the integer has a u suffix, the type will be unsigned + a &= b | 0xfe + p = out + 4 + *4p = a + in += 16 + out += 16 + return out + + :expression_float + *1out = EXPRESSION_CONSTANT_FLOAT + p = in + 8 + value = *8p + p = out + 8 + *8p = value + + p = in + 1 + a = float_suffix_to_type(*1p) + + p = out + 4 + *4p = a + + in += 16 + out += 16 + return out + + :expression_string_literal + *1out = EXPRESSION_STRING_LITERAL + p = in + 8 + value = *8p + p = out + 8 + *8p = value + + ; we already know this is char* + p = out + 4 + *4p = TYPE_POINTER_TO_CHAR + + in += 16 + out += 16 + return out + + + :empty_expression + token_error(tokens, .str_empty_expression) + :str_empty_expression + string Empty expression. 
+ byte 0 + :unrecognized_expression + token_error(tokens, .str_unrecognized_expression) + :str_unrecognized_expression + string Unrecognized expression. + byte 0 + +:return_type_int + return TYPE_INT +:return_type_long + return TYPE_LONG +:return_type_unsigned_int + return TYPE_UNSIGNED_INT +:return_type_unsigned_long + return TYPE_UNSIGNED_LONG +:return_type_float + return TYPE_FLOAT +:return_type_double + return TYPE_DOUBLE + +function int_suffix_to_type + argument suffix + if suffix == NUMBER_SUFFIX_L goto return_type_long + if suffix == NUMBER_SUFFIX_U goto return_type_unsigned_int + if suffix == NUMBER_SUFFIX_UL goto return_type_unsigned_long + goto return_type_int + +function float_suffix_to_type + argument suffix + if suffix == NUMBER_SUFFIX_F goto return_type_float + goto return_type_double + +; smallest integer type which can fit this value, only using unsigned if necessary +function int_value_to_type + argument value + if value [ 0x80000000 goto return_type_int + if value [ 0x8000000000000000 goto return_type_long + goto return_type_unsigned_long + +function print_expression + argument expression + local c + local p + p = expression + 4 + putc(40) + print_type(*4p) + putc(41) + c = *1expression + + if c == EXPRESSION_CONSTANT_INT goto print_expr_int + if c == EXPRESSION_CONSTANT_FLOAT goto print_expr_float + if c == EXPRESSION_STRING_LITERAL goto print_expr_str + byte 0xcc + :print_expr_int + expression += 8 + putn(*8expression) + return + :print_expr_float + expression += 8 + putx64(*8expression) + return + :print_expr_str + expression += 8 + putc('0) + putc('x) + putx32(*8expression) + return + +; NOTE: to make things easier, the format which this outputs isn't the same as C's, specifically we have +; *int for pointer to int and [5]int for array of 5 ints +function print_type + argument type + local c + :print_type_top + c = types + type + c = *1c + if c == TYPE_VOID goto print_type_void + if c == TYPE_CHAR goto print_type_char + if c == 
TYPE_UNSIGNED_CHAR goto print_type_unsigned_char + if c == TYPE_SHORT goto print_type_short + if c == TYPE_UNSIGNED_SHORT goto print_type_unsigned_short + if c == TYPE_INT goto print_type_int + if c == TYPE_UNSIGNED_INT goto print_type_unsigned_int + if c == TYPE_LONG goto print_type_long + if c == TYPE_UNSIGNED_LONG goto print_type_unsigned_long + if c == TYPE_FLOAT goto print_type_float + if c == TYPE_DOUBLE goto print_type_double + if c == TYPE_POINTER goto print_type_pointer + if c == TYPE_ARRAY goto print_type_array + if c == TYPE_STRUCT goto print_type_struct + if c == TYPE_UNION goto print_type_union + fputs(2, .str_bad_print_type) + exit(1) + :str_bad_print_type + string Bad type passed to print_type. + byte 10 + byte 0 + :print_type_void + return puts(.str_void) + :print_type_char + return puts(.str_char) + :print_type_unsigned_char + return puts(.str_unsigned_char) + :print_type_short + return puts(.str_short) + :print_type_unsigned_short + return puts(.str_unsigned_short) + :print_type_int + return puts(.str_int) + :print_type_unsigned_int + return puts(.str_unsigned_int) + :print_type_long + return puts(.str_long) + :print_type_unsigned_long + return puts(.str_unsigned_long) + :print_type_float + return puts(.str_float) + :print_type_double + return puts(.str_double) + :print_type_pointer + putc('*) + type += 1 + goto print_type_top + :print_type_array + putc('[) + type += 1 + putn(*8type) ; UNALIGNED + putc(']) + type += 8 + goto print_type_top + :print_type_struct + return puts(.str_struct) + :print_type_union + return puts(.str_union) diff --git a/05/tokenize.b b/05/tokenize.b index f85dccf..2fc4bc1 100644 --- a/05/tokenize.b +++ b/05/tokenize.b @@ -108,10 +108,16 @@ global rodata_end_offset ; uint line ; ulong data ; This corresponds to translation phases 5-6 and the first half of 7 -; IMPORTANT: this function uses pointers to pptokens, so they should NOT be freed! +; IMPORTANT: this function uses pointers to pptokens, so it should NOT be freed! 
+; Returns a pointer to the end of tokens. function tokenize argument pptokens argument out + ; you might think we wouldn't need these arguments because the pptokens array starts with + ; a line directive. but we also use this function to tokenize the expression of a #if, + ; where that isn't the case. + argument initial_filename + argument initial_line_number local in local file local line_number @@ -129,6 +135,11 @@ function tokenize local lower local upper + file_add(initial_filename) + file = file_get_index(initial_filename) + line_number = initial_line_number + + in = pptokens :tokenize_loop c = *1in @@ -301,10 +312,10 @@ function tokenize :float_have_significand_and_exponent if significand == 0 goto float_zero normalize_float(&significand, &exponent) - putn(significand) - putc(32) - putn_signed(exponent) - putc(10) + ; putn(significand) + ; putc(32) + ; putn_signed(exponent) + ; putc(10) ; make number round to the nearest representable float roughly (this is what gcc does) ; this fails for 5e-100 probably because of imprecision, but mostly works significand += 15 @@ -357,8 +368,15 @@ function tokenize data = 0x7ff0000000000000 ; double infinity goto float_have_data :tokenize_loop_end + ; EOF token + *1out = TOKEN_EOF + out += 2 + *2out = file + out += 2 + *4out = line_number + out += 12 - return 0 + return out :f_suffix_on_integer compile_error(file, line_number, .str_f_suffix_on_integer) :str_f_suffix_on_integer @@ -581,6 +599,7 @@ function print_tokens if *1p == TOKEN_CONSTANT_FLOAT goto print_token_float if *1p == TOKEN_STRING_LITERAL goto print_token_string_literal if *1p == TOKEN_IDENTIFIER goto print_token_identifier + if *1p == TOKEN_EOF goto print_token_eof fputs(2, .str_print_bad_token) exit(1) :print_token_keyword @@ -603,9 +622,13 @@ function print_tokens :print_token_float p += 8 puts(.str_constant_float) - putx(*8p) + putx64(*8p) p += 8 + putc(32) goto print_tokens_loop + :print_token_eof + puts(.str_eof) + goto print_token_data :print_token_info 
p += 1 putc('~) @@ -643,3 +666,6 @@ function print_tokens string Unrecognized token type in print_tokens. Aborting. byte 10 byte 0 + :str_eof + string EOF + byte 0 diff --git a/05/util.b b/05/util.b index 138e440..4fe72aa 100644 --- a/05/util.b +++ b/05/util.b @@ -44,6 +44,14 @@ function left_shift :left_shift_negative n = 0 - n return x > n + +function max_signed + argument a + argument b + if a > b goto maxs_return_a + return b + :maxs_return_a + return a function file_error argument name @@ -327,26 +335,45 @@ function fputn_signed fputn(fd, n) return -function fputx +:hex_digits + string 0123456789abcdef + +function fputx64 argument fd argument n local m local x m = 60 - :fputx_loop + :fputx64_loop x = n > m x &= 0xf x += .hex_digits fputc(fd, *1x) m -= 4 - if m >= 0 goto fputx_loop + if m >= 0 goto fputx64_loop return -:hex_digits - string 0123456789abcdef - -function putx +function putx64 + argument n + fputx64(1, n) + return + +function fputx32 + argument fd + argument n + local m + local x + m = 28 + :fputx32_loop + x = n > m + x &= 0xf + x += .hex_digits + fputc(fd, *1x) + m -= 4 + if m >= 0 goto fputx32_loop + return +function putx32 argument n - fputx(1, n) + fputx32(1, n) return function putn -- cgit v1.2.3