diff options
author | pommicket <pommicket@gmail.com> | 2022-01-08 12:15:17 -0500 |
---|---|---|
committer | pommicket <pommicket@gmail.com> | 2022-01-08 12:15:17 -0500 |
commit | ac6cb985dbb0a7a1bf6311a0e9fd53033cf7ab94 (patch) | |
tree | b933e61333861a6dd79a3ea084be9bdb1cdc4970 /05/preprocess.b | |
parent | d48816e226d5ba081dadbba4bcded92e5c0a23d1 (diff) |
preprocessing tokens
Diffstat (limited to '05/preprocess.b')
-rw-r--r-- | 05/preprocess.b | 305 |
1 files changed, 296 insertions, 9 deletions
diff --git a/05/preprocess.b b/05/preprocess.b
index 36fcbd2..ddc0c3b 100644
--- a/05/preprocess.b
+++ b/05/preprocess.b
@@ -6,10 +6,12 @@ function split_into_preprocessing_tokens
 	local file_contents
 	local pptokens
 	local p
+	local b
 	local c
 	local in
 	local out
 	local n
+	local line_number
 
 	fd = open_r(filename)
 	file_contents = malloc(2000000)
@@ -19,6 +21,7 @@ function split_into_preprocessing_tokens
 		n = syscall(0, fd, p, 4096)
 		if n == 0 goto pptokens_read_loop_end
 		p += n
+		goto pptokens_read_loop
 	:pptokens_read_loop_end
 
 	; okay we read the file. first, delete every backslash-newline sequence (phase 2)
@@ -56,20 +59,313 @@ function split_into_preprocessing_tokens
 	:backslashnewline_loop_end
 	*1out = 0
 
+	; split file into preprocessing tokens, remove comments (phase 3)
+	; we're still doing the trick with newlines, this time for ones inside comments
+	; this is needed because the following is legal C:
+	;    #include/*
+	;    */<stdio.h>
+	; and is not equivalent to:
+	;    #include
+	;    <stdio.h>
+	newlines = 1
 	in = file_contents
-
-	fputs(1, file_contents)
+	; every token (including each space and each newline) is appended to out as a null-terminated string
+	out = pptokens
+	line_number = 1
+	:pptokens_loop
+		c = *1in
+		if c == 10 goto pptokens_newline_loop
+		if c == 0 goto pptokens_loop_end
+		if c == 32 goto pptoken_space
+		if c == 9 goto pptoken_space
+		b = isdigit(c)
+		if b != 0 goto pptoken_number
+		b = isalpha_or_underscore(c)
+		if b != 0 goto pptoken_identifier
+		b = str_startswith(in, .str_comment_start)
+		if b != 0 goto pptoken_comment
+		; now we check for all the various operators and symbols in C
+
+		if c == 59 goto pptoken_single_character ; semicolon
+		if c == '( goto pptoken_single_character
+		if c == ') goto pptoken_single_character
+		if c == '[ goto pptoken_single_character
+		if c == '] goto pptoken_single_character
+		if c == '{ goto pptoken_single_character
+		if c == '} goto pptoken_single_character
+		if c == ', goto pptoken_single_character
+		if c == '~ goto pptoken_single_character
+		if c == '? goto pptoken_single_character
+		if c == ': goto pptoken_single_character
+		if c == '" goto pptoken_string_or_char_literal
+		if c == '' goto pptoken_string_or_char_literal
+		b = str_startswith(in, .str_lshift_eq)
+		if b != 0 goto pptoken_3_chars
+		b = str_startswith(in, .str_rshift_eq)
+		if b != 0 goto pptoken_3_chars
+		b = str_startswith(in, .str_eq_eq)
+		if b != 0 goto pptoken_2_chars
+		b = str_startswith(in, .str_not_eq)
+		if b != 0 goto pptoken_2_chars
+		b = str_startswith(in, .str_gt_eq)
+		if b != 0 goto pptoken_2_chars
+		b = str_startswith(in, .str_lt_eq)
+		if b != 0 goto pptoken_2_chars
+		b = str_startswith(in, .str_plus_plus)
+		if b != 0 goto pptoken_2_chars
+		b = str_startswith(in, .str_minus_minus)
+		if b != 0 goto pptoken_2_chars
+		b = str_startswith(in, .str_plus_eq)
+		if b != 0 goto pptoken_2_chars
+		b = str_startswith(in, .str_minus_eq)
+		if b != 0 goto pptoken_2_chars
+		b = str_startswith(in, .str_times_eq)
+		if b != 0 goto pptoken_2_chars
+		b = str_startswith(in, .str_div_eq)
+		if b != 0 goto pptoken_2_chars
+		b = str_startswith(in, .str_remainder_eq)
+		if b != 0 goto pptoken_2_chars
+		b = str_startswith(in, .str_and_eq)
+		if b != 0 goto pptoken_2_chars
+		b = str_startswith(in, .str_or_eq)
+		if b != 0 goto pptoken_2_chars
+		b = str_startswith(in, .str_xor_eq)
+		if b != 0 goto pptoken_2_chars
+		b = str_startswith(in, .str_and_and)
+		if b != 0 goto pptoken_2_chars
+		b = str_startswith(in, .str_or_or)
+		if b != 0 goto pptoken_2_chars
+		b = str_startswith(in, .str_lshift)
+		if b != 0 goto pptoken_2_chars
+		b = str_startswith(in, .str_rshift)
+		if b != 0 goto pptoken_2_chars
+		b = str_startswith(in, .str_arrow)
+		if b != 0 goto pptoken_2_chars
+		b = str_startswith(in, .str_dotdotdot)
+		if b != 0 goto pptoken_3_chars
+		b = str_startswith(in, .str_hash_hash)
+		if b != 0 goto pptoken_2_chars
+		if c == '+ goto pptoken_single_character
+		if c == '- goto pptoken_single_character
+		if c == '* goto pptoken_single_character
+		if c == '/ goto pptoken_single_character
+		if c == '% goto pptoken_single_character
+		if c == '& goto pptoken_single_character
+		if c == '| goto pptoken_single_character
+		if c == '^ goto pptoken_single_character
+		if c == '> goto pptoken_single_character
+		if c == '< goto pptoken_single_character
+		if c == '! goto pptoken_single_character
+		if c == '= goto pptoken_single_character
+		if c == '# goto pptoken_single_character
+		if c == '. goto pptoken_dot
+
+		goto bad_pptoken
+
+	:pptoken_comment
+		; emit a space ("Each comment is replaced by one space character.")
+		*1out = 32
+		out += 1
+		*1out = 0
+		out += 1
+		; skip the opening /* before looking for the end of the comment;
+		; otherwise the / of "/*/" would be consumed and the *​/ search would match right after it
+		in += 2
+		; skip over comment
+		:pptoken_comment_loop
+			b = str_startswith(in, .str_comment_end)
+			if b != 0 goto pptoken_comment_loop_end
+			c = *1in
+			in += 1
+			if c == 0 goto unterminated_comment
+			if c == 10 goto pptoken_comment_newline
+			goto pptoken_comment_loop
+		:pptoken_comment_loop_end
+		in += 2 ; skip */
+		goto pptokens_loop
+		:pptoken_comment_newline
+			; keep line numbers correct
+			newlines += 1
+			goto pptoken_comment_loop
+	:pptoken_dot
+		; could just be a . or could be .3 -- we need to check if *(in+1) is a digit
+		p = in + 1
+		b = isdigit(*1p)
+		if b != 0 goto pptoken_number
+		; okay it's just a dot
+		goto pptoken_single_character
+	:pptoken_string_or_char_literal
+		local delimiter
+		local backslash
+		delimiter = c
+		backslash = 0
+		*1out = c
+		out += 1
+		in += 1
+		:pptoken_strchar_loop
+			c = *1in
+			*1out = c
+			in += 1
+			out += 1
+			if c == '\ goto pptoken_strchar_backslash
+			if c == 10 goto unterminated_string
+			if c == 0 goto unterminated_string
+			b = backslash
+			backslash = 0
+			if b == 1 goto pptoken_strchar_loop ; string can't end with an odd number of backslashes
+			if c == delimiter goto pptoken_strchar_loop_end
+			goto pptoken_strchar_loop
+		:pptoken_strchar_backslash
+			backslash ^= 1
+			goto pptoken_strchar_loop
+		:pptoken_strchar_loop_end
+		*1out = 0
+		out += 1
+		goto pptokens_loop
+	:pptoken_number
+		c = *1in
+		b = is_ppnumber_char(c)
+		if b == 0 goto pptoken_number_end
+		*1out = c
+		out += 1
+		in += 1
+		if c == 'e goto pptoken_number_e
+		if c == 'E goto pptoken_number_e
+		goto pptoken_number
+	:pptoken_number_e
+		c = *1in
+		if c == '+ goto pptoken_number_sign
+		if c == '- goto pptoken_number_sign
+		goto pptoken_number
+	:pptoken_number_sign
+		; special code to handle + - immediately following e
+		*1out = c
+		in += 1
+		out += 1
+		goto pptoken_number
+	:pptoken_number_end
+		*1out = 0
+		out += 1
+		goto pptokens_loop
+	:pptoken_identifier
+		c = *1in
+		b = isalnum_or_underscore(c)
+		if b == 0 goto pptoken_identifier_end
+		*1out = c
+		in += 1
+		out += 1
+		goto pptoken_identifier
+	:pptoken_identifier_end
+		*1out = 0
+		out += 1
+		goto pptokens_loop
+	:pptoken_space
+		; space character token
+		*1out = 32
+		in += 1
+		out += 1
+		*1out = 0
+		out += 1
+		goto pptokens_loop
+	:pptoken_single_character
+		; a single character preprocessing token, like {?}
+		*1out = c
+		in += 1
+		out += 1
+		*1out = 0
+		out += 1
+		goto pptokens_loop
+	:pptoken_2_chars
+		; two-character pptoken (e.g. ##)
+		*1out = c
+		in += 1
+		out += 1
+		*1out = *1in
+		in += 1
+		out += 1
+		*1out = 0
+		out += 1
+		goto pptokens_loop
+	:pptoken_3_chars
+		; three-character pptoken (e.g. >>=)
+		*1out = c
+		in += 1
+		out += 1
+		*1out = *1in
+		in += 1
+		out += 1
+		*1out = *1in
+		in += 1
+		out += 1
+		*1out = 0
+		out += 1
+		goto pptokens_loop
+	:pptokens_newline_loop
+		if newlines == 0 goto pptokens_newline_loop_end
+		; output a newline
+		*1out = 10
+		out += 1
+		*1out = 0
+		out += 1
+		line_number += 1
+		newlines -= 1
+		goto pptokens_newline_loop
+	:pptokens_newline_loop_end
+	newlines = 1
+	in += 1
+	goto pptokens_loop
+	:pptokens_loop_end
+	; end the token list with an empty string (print_pptokens stops at the first empty string)
+	*1out = 0
 
 	free(file_contents)
 	close(fd)
-	return
+	return pptokens
 
 	:unterminated_comment
-		fputs(2, .str_unterminated_comment)
-		fputs(2, filename)
-		fputc(2, 10)
-		exit(1)
+		compile_error(filename, line_number, .str_unterminated_comment)
 	:str_unterminated_comment
-		string Unterminated comment in file
-		byte 32
+		string Unterminated comment.
+		byte 0
+	:unterminated_string
+		compile_error(filename, line_number, .str_unterminated_string)
+	:str_unterminated_string
+		string Unterminated string or character literal.
+		byte 0
+	:bad_pptoken
+		compile_error(filename, line_number, .str_bad_pptoken)
+	:str_bad_pptoken
+		string Bad preprocessing token.
 		byte 0
+
+; can the given character appear in a C89 ppnumber?
+; jumps to return_1 for . _ 0-9 A-Z a-z, and to return_0 for anything else.
+function is_ppnumber_char
+	argument c
+	if c == '. goto return_1
+	if c < '0 goto return_0
+	if c <= '9 goto return_1
+	if c < 'A goto return_0
+	if c <= 'Z goto return_1
+	if c == '_ goto return_1
+	if c < 'a goto return_0
+	if c <= 'z goto return_1
+	goto return_0
+
+; print every token in the given list as {token}, followed by a single newline.
+; the list is a sequence of null-terminated strings, ended by an empty string.
+function print_pptokens
+	argument pptokens
+	local p
+	p = pptokens
+	:print_pptokens_loop
+		if *1p == 0 goto print_pptokens_loop_end
+		putc('{)
+		puts(p)
+		putc('})
+		p += strlen(p)
+		p += 1
+		goto print_pptokens_loop
+	:print_pptokens_loop_end
+	putc(10)
+	return