diff options
author | pommicket <pommicket@gmail.com> | 2022-01-11 15:55:37 -0500 |
---|---|---|
committer | pommicket <pommicket@gmail.com> | 2022-01-11 15:55:37 -0500 |
commit | f8f044a7a273074fdd08247508cc6b44a218cad3 (patch) | |
tree | e61fa4684c841be5be9eada264d81b40691d76b9 | |
parent | a245a5be96144b2212a907e0a45c972e8a2774ed (diff) |
tokenizing string literals and identifiers
-rw-r--r-- | 05/constants.b | 9 | ||||
-rw-r--r-- | 05/main.b | 9 | ||||
-rw-r--r-- | 05/main.c | 11 | ||||
-rw-r--r-- | 05/tokenize.b | 71 | ||||
-rw-r--r-- | 05/util.b | 10 |
5 files changed, 99 insertions, 11 deletions
diff --git a/05/constants.b b/05/constants.b index 966a125..b3d23f5 100644 --- a/05/constants.b +++ b/05/constants.b @@ -1,3 +1,10 @@ +; this is the format of the executables we produce: +; elf header + code 4MB addresses 0x400000-0x7fffff +; read-only data 4MB addresses 0x800000-0xbfffff +; read-write data 4MB addresses 0xc00000-0xffffff +#define RODATA_OFFSET 0x400000 +#define RODATA_ADDR 0x800000 + ; C OPERATOR PRECEDENCE ; lowest ; 1 , @@ -69,7 +76,7 @@ #define TOKEN_CONSTANT_FLOAT 2 #define TOKEN_CONSTANT_INT 3 #define TOKEN_CONSTANT_CHAR 4 -#define TOKEN_STRING 5 +#define TOKEN_STRING_LITERAL 5 ; these are stored in the "info" field of the token #define NUMBER_NO_SUFFIX 0 @@ -8,6 +8,8 @@ byte 0 byte 0 goto main +global output_fd + global object_macros_size global function_macros_size @@ -78,6 +80,7 @@ function main local processed_pptokens local tokens + dat_banned_objmacros = 255 dat_banned_fmacros = 255 @@ -104,9 +107,15 @@ function main print_separator() ;print_object_macros() ;print_function_macros() + + output_fd = open_w(output_filename) + rodata_end_offset = RODATA_OFFSET + tokens = malloc(16000000) tokenize(pptokens, tokens) print_tokens(tokens) + ; NOTE: do NOT free pptokens as identifiers still reference them. + exit(0) :usage_error @@ -1,7 +1,4 @@ -'\xfa' 'w' 'e' 'l' 'l' '\'' '\\' -sizeof(int) -0x332l -0xffffffffffffffff -0755u -double * = &; - +"Hello ther" "e good fellow." +char * = "How are you"" d""o""i""ng today?\n"; +hi +_TEST _ING _1 diff --git a/05/tokenize.b b/05/tokenize.b index 125d660..3715fc1 100644 --- a/05/tokenize.b +++ b/05/tokenize.b @@ -96,6 +96,10 @@ function get_keyword_str string @BAD_KEYWORD_ID byte 0 + +; file offset to write next piece of read-only data; initialized in main.b +global rodata_end_offset + ; turn pptokens into tokens, written to out. ; tokens are 16 bytes and have the following format: ; uchar type @@ -104,6 +108,7 @@ function get_keyword_str ; uint line ; ulong data ; This corresponds to translation phases 5-6 and the first half of 7 +; IMPORTANT: this function uses pointers to pptokens, so they should NOT be freed! function tokenize argument pptokens argument out @@ -113,6 +118,7 @@ function tokenize local b local c local n + local p local data in = pptokens @@ -122,6 +128,7 @@ function tokenize if c == 32 goto tokenize_skip_pptoken if c == 10 goto tokenize_newline if c == '' goto tokenize_constant_char + if c == '" goto tokenize_string_literal if c == 0 goto tokenize_loop_end b = get_keyword_id(in) @@ -130,7 +137,22 @@ function tokenize b = isdigit_or_dot(c) if b != 0 goto tokenize_number - byte 0xcc + ; it's an identifier. we just need to make sure it's made up of identifier characters. + p = in + b = isalpha_or_underscore(*1p) + if b == 0 goto bad_token + + :ident_check_loop + b = isalnum_or_underscore(*1p) + if b == 0 goto bad_token + p += 1 + if *1p != 0 goto ident_check_loop + ; all good. + *1out = TOKEN_IDENTIFIER + out += 2 ; no info + data = in ; data will point to the identifier name + pptoken_skip(&in) + goto token_output :tokenize_newline line_number += 1 @@ -217,7 +239,28 @@ function tokenize :tokenize_float ; @TODO byte 0xcc - + :tokenize_string_literal + n = rodata_end_offset - RODATA_OFFSET + n += RODATA_ADDR ; address of string + lseek(output_fd, rodata_end_offset, SEEK_SET) + :string_literal_loop + in += 1 ; skip opening " + :string_literal_char_loop + if *1in == '" goto string_literal_char_loop_end + c = read_c_char(&in) + if c ] 255 goto bad_char_in_string + fputc(output_fd, c) + goto string_literal_char_loop + :string_literal_char_loop_end + pptoken_skip(&in) ; skip closing " + pptoken_skip_spaces(&in) + if *1in == '" goto string_literal_loop ; string concatenation, e.g. "Hello, " "world!" + fputc(output_fd, 0) ; null terminator + rodata_end_offset = lseek(output_fd, 0, SEEK_CUR) + *1out = TOKEN_STRING_LITERAL + out += 2 ; no info + data = n + goto token_output :tokenize_loop_end return 0 @@ -236,7 +279,17 @@ function tokenize :str_bad_char_constant string Bad character constant. Note that multibyte constants are not supported. byte 0 - + :bad_char_in_string + compile_error(file, line_number, .str_bad_char_in_string) + :str_bad_char_in_string + string Bad character in string literal. + byte 0 + :bad_token + compile_error(file, line_number, .str_bad_token) + :str_bad_token + string Bad token. + byte 0 + ; return character or escaped character from *p_in, advancing accordingly ; returns -1 on bad character function read_c_char @@ -390,6 +443,8 @@ function print_tokens if *1p > 20 goto print_token_keyword if *1p == TOKEN_CONSTANT_INT goto print_token_int if *1p == TOKEN_CONSTANT_CHAR goto print_token_char + if *1p == TOKEN_STRING_LITERAL goto print_token_string_literal + if *1p == TOKEN_IDENTIFIER goto print_token_identifier fputs(2, .str_print_bad_token) exit(1) :print_token_keyword @@ -402,6 +457,13 @@ function print_tokens :print_token_char puts(.str_constant_char) goto print_token_data + :print_token_string_literal + puts(.str_string_literal) + goto print_token_data + :print_token_identifier + s = p + 8 + puts(*8s) + goto print_token_data :print_token_info p += 1 putc('~) @@ -429,6 +491,9 @@ function print_tokens :str_constant_char string character byte 0 + :str_string_literal + string string + byte 0 :str_print_bad_token string Unrecognized token type in print_tokens. Aborting. byte 10 @@ -351,6 +351,16 @@ function close syscall(3, fd) return +#define SEEK_SET 0 +#define SEEK_CUR 1 +#define SEEK_END 2 + +function lseek + argument fd + argument offset + argument whence + return syscall(8, fd, offset, whence) + function isupper argument c if c < 'A goto return_0 |