diff options
author | pommicket <pommicket@gmail.com> | 2022-01-11 14:03:13 -0500 |
---|---|---|
committer | pommicket <pommicket@gmail.com> | 2022-01-11 14:03:47 -0500 |
commit | b0837b367e812823e1723e95d3394744bb58e509 (patch) | |
tree | 9f87bdbaa4953396a576d8b496e478b872416536 | |
parent | fc96e22d4fc75e08c52e703a595d639f29eb101b (diff) |
tokenizing integer literals
-rw-r--r-- | 05/constants.b | 6 | ||||
-rw-r--r-- | 05/main.b | 1 | ||||
-rw-r--r-- | 05/main.c | 3 | ||||
-rw-r--r-- | 05/tokenize.b | 160 | ||||
-rw-r--r-- | 05/util.b | 80 |
5 files changed, 220 insertions, 30 deletions
diff --git a/05/constants.b b/05/constants.b index 82da6b2..966a125 100644 --- a/05/constants.b +++ b/05/constants.b @@ -71,6 +71,12 @@ #define TOKEN_CONSTANT_CHAR 4 #define TOKEN_STRING 5 +; these are stored in the "info" field of the token +#define NUMBER_NO_SUFFIX 0 +#define NUMBER_SUFFIX_U 1 +#define NUMBER_SUFFIX_L 2 +#define NUMBER_SUFFIX_UL 3 +#define NUMBER_SUFFIX_F 4 ; #define KEYWORD_AUTO 21 (@NONSTANDARD auto only exists in C for legacy reasons and doesn't appear in TCC's source code) #define KEYWORD_DOUBLE 22 @@ -21,7 +21,6 @@ function fprint_filename argument file if file ] 65535 goto print_filename_string file = file_get(file) - fputs(2, file) ; (fallthrough) :print_filename_string fputs(2, file) @@ -1,4 +1,7 @@ sizeof(int) +0x332l +0xffffffffffffffff +0755u double * = &; diff --git a/05/tokenize.b b/05/tokenize.b index 3a0d37d..aab3f45 100644 --- a/05/tokenize.b +++ b/05/tokenize.b @@ -98,7 +98,8 @@ function get_keyword_str ; turn pptokens into tokens, written to out. ; tokens are 16 bytes and have the following format: -; ushort type +; uchar type +; uchar info ; ushort file ; uint line ; ulong data @@ -109,16 +110,24 @@ function tokenize local file local line_number local b + local c + local n + local data + in = pptokens :tokenize_loop - if *1in == '$ goto tokenize_line_directive - if *1in == 32 goto tokenize_skip_pptoken - if *1in == 10 goto tokenize_newline - if *1in == 0 goto tokenize_loop_end + c = *1in + if c == '$ goto tokenize_line_directive + if c == 32 goto tokenize_skip_pptoken + if c == 10 goto tokenize_newline + if c == 0 goto tokenize_loop_end b = get_keyword_id(in) if b != 0 goto tokenize_keyword + b = isdigit_or_dot(c) + if b != 0 goto tokenize_number + byte 0xcc :tokenize_newline @@ -137,36 +146,157 @@ function tokenize file = file_get_index(in) pptoken_skip(&in) goto tokenize_loop - :tokenize_keyword - *2out = b ; type - out += 2 + :token_no_data + data = 0 + ; (fallthrough) + :token_output ; write token location & data (see local variable data), and continue tokenizing *2out = file out += 2 *4out = line_number out += 4 - ; no data + *8out = data out += 8 - pptoken_skip(&in) goto tokenize_loop + :tokenize_keyword + pptoken_skip(&in) + *1out = b ; type + ; no info for keywords + out += 2 + goto token_no_data + :tokenize_number + ; first, check if it's a float + b = strchr(in, '.) + if b != 0 goto tokenize_float + b = strchr(in, 'x) ; e may appear in hex integer literals, so we need to check this + if b != 0 goto tokenize_hex_integer + b = strchr(in, 'X) + if b != 0 goto tokenize_hex_integer + b = strchr(in, 'e) ; exponent + if b != 0 goto tokenize_float + b = strchr(in, 'E) ; exponent + if b != 0 goto tokenize_float + if *1in == '0 goto tokenize_octal_integer ; fun fact: in the C89 standard, 0 is considered an octal integer + ; plain ol' decimal constant + n = strtoi(&in, 10) + goto tokenize_finish_integer + :tokenize_hex_integer + if *1in != '0 goto bad_number_token + in += 1 + c = *1in + c &= 223 ; 223 = ~32 -- remove case + if c != 'X goto bad_number_token + in += 1 + n = strtoi(&in, 16) + goto tokenize_finish_integer + :tokenize_octal_integer + in += 1 ; skip 0 + n = strtoi(&in, 8) + goto tokenize_finish_integer + :tokenize_finish_integer + c = read_number_suffix(file, line_number, &in) + if c == NUMBER_SUFFIX_F goto f_suffix_on_integer + in += 1 ; move past null separator + *1out = TOKEN_CONSTANT_INT + out += 1 + *1out = c ; info = suffix + out += 1 + data = n + goto token_output + + :tokenize_float + ; @TODO + byte 0xcc + :tokenize_loop_end return 0 + :f_suffix_on_integer + compile_error(file, line_number, .str_f_suffix_on_integer) + :str_f_suffix_on_integer + string Integer with f suffix. + byte 0 + :bad_number_token + compile_error(file, line_number, .str_bad_number_token) + :str_bad_number_token + string Bad number literal. + byte 0 + +function read_number_suffix + argument file + argument line_number + argument p_s + local s + local c + local suffix + s = *8p_s + c = *1s + suffix = 0 + if c == 0 goto number_suffix_return + if c == 'u goto number_suffix_u + if c == 'l goto number_suffix_l + if c == 'f goto number_suffix_f + goto bad_number_suffix + :number_suffix_u + s += 1 + c = *1s + if c == 'l goto number_suffix_ul + if c != 0 goto bad_number_suffix + suffix = NUMBER_SUFFIX_U + goto number_suffix_return + :number_suffix_l + s += 1 + c = *1s + if c == 'u goto number_suffix_ul + if c != 0 goto bad_number_suffix + suffix = NUMBER_SUFFIX_L + goto number_suffix_return + :number_suffix_ul + s += 1 + c = *1s + if c != 0 goto bad_number_suffix + suffix = NUMBER_SUFFIX_UL + goto number_suffix_return + :number_suffix_f + s += 1 + c = *1s + if c != 0 goto bad_number_suffix + suffix = NUMBER_SUFFIX_F + goto number_suffix_return + :number_suffix_return + *8p_s = s + return suffix + + :bad_number_suffix + compile_error(file, line_number, .str_bad_number_suffix) + :str_bad_number_suffix + string Bad number suffix. + byte 0 + function print_tokens argument tokens local p local s p = tokens :print_tokens_loop - if *2p == 0 goto print_tokens_loop_end - if *2p > 20 goto print_token_keyword + if *1p == 0 goto print_tokens_loop_end + if *1p > 20 goto print_token_keyword + if *1p == TOKEN_CONSTANT_INT goto print_token_int fputs(2, .str_print_bad_token) exit(1) :print_token_keyword - s = get_keyword_str(*2p) + s = get_keyword_str(*1p) puts(s) goto print_token_data + :print_token_int + puts(.str_constant_int) + goto print_token_info + :print_token_info + p += 1 + putc('~) + putn(*1p) + p -= 1 :print_token_data p += 2 putc('@) @@ -183,6 +313,10 @@ function print_tokens :print_tokens_loop_end putc(10) return + :str_constant_int + string integer + byte 0 :str_print_bad_token string Unrecognized token type in print_tokens. Aborting. byte 10 + byte 0 @@ -58,25 +58,49 @@ function itos :itos_loop_end return p - -; returns the number at the start of the given string -function stoi - argument s - local p - local n +; returns the number in the given base at the start of the string, advancing the string past it. +function strtoi + argument p_s + argument base + local s local c + local n n = 0 - p = s - :stoi_loop - c = *1p - if c < '0 goto stoi_loop_end - if c > '9 goto stoi_loop_end - n *= 10 - n += c - '0 - p += 1 - goto stoi_loop - :stoi_loop_end + s = *8p_s + :strtoi_loop + c = *1s + if c < '0 goto strtoi_loop_end + if c <= '9 goto strtoi_decimal_digit + if c < 'A goto strtoi_loop_end + if c <= 'F goto strtoi_upper_hexdigit + if c < 'a goto strtoi_loop_end + if c <= 'f goto strtoi_lower_hexdigit + goto strtoi_loop_end + + :strtoi_decimal_digit + c -= '0 + goto strtoi_digit + :strtoi_upper_hexdigit + c += 10 - 'A + goto strtoi_digit + :strtoi_lower_hexdigit + c += 10 - 'a + goto strtoi_digit + :strtoi_digit + if c >= base goto strtoi_loop_end + n *= base + n += c + s += 1 + goto strtoi_loop + + :strtoi_loop_end + *8p_s = s return n + +; returns the decimal number at the start of the given string +function stoi + argument s + return strtoi(&s, 10) function memchr argument mem @@ -90,6 +114,19 @@ function memchr :memchr_loop_end return p +function strchr + argument str + argument c + local p + p = str + :strchr_loop + if *1p == 0 goto return_0 + if *1p == c goto strchr_loop_end + p += 1 + goto strchr_loop + :strchr_loop_end + return p + ; copy from *p_src to *p_dest until terminator is reached, setting both to point to their respective terminators function memccpy_advance argument p_dest @@ -362,6 +399,17 @@ function isalnum_or_underscore if c <= 'z goto return_1 goto return_0 +; is the given character one of: +; .0123456789 +; (these are the characters which can appear at the start of a number in C) +function isdigit_or_dot + argument c + if c < '. goto return_0 + if c == '. goto return_1 + if c < '0 goto return_0 + if c <= '9 goto return_1 + goto return_0 + function exit argument status_code syscall(0x3c, status_code) |