diff options
author | pommicket <pommicket@gmail.com> | 2022-01-08 12:15:17 -0500 |
---|---|---|
committer | pommicket <pommicket@gmail.com> | 2022-01-08 12:15:17 -0500 |
commit | ac6cb985dbb0a7a1bf6311a0e9fd53033cf7ab94 (patch) | |
tree | b933e61333861a6dd79a3ea084be9bdb1cdc4970 | |
parent | d48816e226d5ba081dadbba4bcded92e5c0a23d1 (diff) |
preprocessing tokens
-rw-r--r-- | 05/constants.b | 76 | ||||
-rw-r--r-- | 05/main.b | 21 | ||||
-rw-r--r-- | 05/main.c | 20 | ||||
-rw-r--r-- | 05/preprocess.b | 305 | ||||
-rw-r--r-- | 05/util.b | 58 |
5 files changed, 452 insertions, 28 deletions
diff --git a/05/constants.b b/05/constants.b index 691fe65..852d714 100644 --- a/05/constants.b +++ b/05/constants.b @@ -30,3 +30,79 @@ #define KEYWORD_IF 130 #define KEYWORD_STATIC 131 #define KEYWORD_WHILE 132 + +:str_comment_start + string /* + byte 0 +:str_comment_end + string */ + byte 0 +:str_lshift_eq + string <<= + byte 0 +:str_rshift_eq + string >>= + byte 0 +:str_eq_eq + string == + byte 0 +:str_not_eq + string != + byte 0 +:str_gt_eq + string >= + byte 0 +:str_lt_eq + string <= + byte 0 +:str_plus_plus + string ++ + byte 0 +:str_minus_minus + string -- + byte 0 +:str_plus_eq + string += + byte 0 +:str_minus_eq + string -= + byte 0 +:str_times_eq + string *= + byte 0 +:str_div_eq + string /= + byte 0 +:str_remainder_eq + string %= + byte 0 +:str_and_eq + string &= + byte 0 +:str_or_eq + string |= + byte 0 +:str_xor_eq + string ^= + byte 0 +:str_and_and + string && + byte 0 +:str_or_or + string || + byte 0 +:str_lshift + string << + byte 0 +:str_rshift + string >> + byte 0 +:str_arrow + string -> + byte 0 +:str_dotdotdot + string ... + byte 0 +:str_hash_hash + string ## + byte 0 @@ -8,6 +8,23 @@ byte 0 byte 0 goto main +function compile_error + argument file + argument line + argument message + fputs(2, file) + fputc(2, ':) + fputn(2, line) + fputs(2, .str_error) + fputs(2, message) + fputc(2, 10) + exit(1) + +:str_error + string : Error: + byte 32 + byte 0 + #include util.b #include constants.b #include preprocess.b @@ -19,6 +36,7 @@ function main argument argc local input_filename local output_filename + local pptokens input_filename = .str_default_input_filename output_filename = .str_default_output_filename @@ -27,7 +45,8 @@ function main input_filename = argv1 output_filename = argv2 :have_filenames - split_into_preprocessing_tokens(input_filename) + pptokens = split_into_preprocessing_tokens(input_filename) + print_pptokens(pptokens) exit(0) :usage_error @@ -1,6 +1,14 @@ -test\ -ing/* -I am */testing -that this is working -hello \ -there. +#include <stdio.h> + +int test(int, double, ...);\ +/* here is a nice +comment it is +here +*/ +int main(void) { + printf("\"Hello, world!%c\n\"", '\''); + _X55 = Y4_C_; + a.b = c; + 5 + (.3e+5+6) & 0xff | 93 -~5; + return 0; +} diff --git a/05/preprocess.b b/05/preprocess.b index 36fcbd2..ddc0c3b 100644 --- a/05/preprocess.b +++ b/05/preprocess.b @@ -6,10 +6,12 @@ function split_into_preprocessing_tokens local file_contents local pptokens local p + local b local c local in local out local n + local line_number fd = open_r(filename) file_contents = malloc(2000000) @@ -19,6 +21,7 @@ function split_into_preprocessing_tokens n = syscall(0, fd, p, 4096) if n == 0 goto pptokens_read_loop_end p += n + goto pptokens_read_loop :pptokens_read_loop_end ; okay we read the file. first, delete every backslash-newline sequence (phase 2) @@ -56,20 +59,304 @@ function split_into_preprocessing_tokens :backslashnewline_loop_end *1out = 0 + ; split file into preprocessing tokens, remove comments (phase 3) + ; we're still doing the trick with newlines, this time for ones inside comments + ; this is needed because the following is legal C: + ; #include/* + ; */<stdio.h> + ; and is not equivalent to: + ; #include + ; <stdio.h> + newlines = 1 in = file_contents - - fputs(1, file_contents) + out = pptokens + line_number = 1 + :pptokens_loop + c = *1in + if c == 10 goto pptokens_newline_loop + if c == 0 goto pptokens_loop_end + if c == 32 goto pptoken_space + if c == 9 goto pptoken_space + b = isdigit(c) + if b != 0 goto pptoken_number + b = isalpha_or_underscore(c) + if b != 0 goto pptoken_identifier + b = str_startswith(in, .str_comment_start) + if b != 0 goto pptoken_comment + ; now we check for all the various operators and symbols in C + + if c == 59 goto pptoken_single_character ; semicolon + if c == '( goto pptoken_single_character + if c == ') goto pptoken_single_character + if c == '[ goto pptoken_single_character + if c == '] goto pptoken_single_character + if c == '{ goto pptoken_single_character + if c == '} goto pptoken_single_character + if c == ', goto pptoken_single_character + if c == '~ goto pptoken_single_character + if c == '? goto pptoken_single_character + if c == ': goto pptoken_single_character + if c == '" goto pptoken_string_or_char_literal + if c == '' goto pptoken_string_or_char_literal + b = str_startswith(in, .str_lshift_eq) + if b != 0 goto pptoken_3_chars + b = str_startswith(in, .str_rshift_eq) + if b != 0 goto pptoken_3_chars + b = str_startswith(in, .str_eq_eq) + if b != 0 goto pptoken_2_chars + b = str_startswith(in, .str_not_eq) + if b != 0 goto pptoken_2_chars + b = str_startswith(in, .str_gt_eq) + if b != 0 goto pptoken_2_chars + b = str_startswith(in, .str_lt_eq) + if b != 0 goto pptoken_2_chars + b = str_startswith(in, .str_plus_plus) + if b != 0 goto pptoken_2_chars + b = str_startswith(in, .str_minus_minus) + if b != 0 goto pptoken_2_chars + b = str_startswith(in, .str_plus_eq) + if b != 0 goto pptoken_2_chars + b = str_startswith(in, .str_minus_eq) + if b != 0 goto pptoken_2_chars + b = str_startswith(in, .str_times_eq) + if b != 0 goto pptoken_2_chars + b = str_startswith(in, .str_div_eq) + if b != 0 goto pptoken_2_chars + b = str_startswith(in, .str_remainder_eq) + if b != 0 goto pptoken_2_chars + b = str_startswith(in, .str_and_eq) + if b != 0 goto pptoken_2_chars + b = str_startswith(in, .str_or_eq) + if b != 0 goto pptoken_2_chars + b = str_startswith(in, .str_xor_eq) + if b != 0 goto pptoken_2_chars + b = str_startswith(in, .str_and_and) + if b != 0 goto pptoken_2_chars + b = str_startswith(in, .str_or_or) + if b != 0 goto pptoken_2_chars + b = str_startswith(in, .str_lshift) + if b != 0 goto pptoken_2_chars + b = str_startswith(in, .str_rshift) + if b != 0 goto pptoken_2_chars + b = str_startswith(in, .str_arrow) + if b != 0 goto pptoken_2_chars + b = str_startswith(in, .str_dotdotdot) + if b != 0 goto pptoken_3_chars + b = str_startswith(in, .str_hash_hash) + if b != 0 goto pptoken_2_chars + if c == '+ goto pptoken_single_character + if c == '- goto pptoken_single_character + if c == '* goto pptoken_single_character + if c == '/ goto pptoken_single_character + if c == '% goto pptoken_single_character + if c == '& goto pptoken_single_character + if c == '| goto pptoken_single_character + if c == '^ goto pptoken_single_character + if c == '> goto pptoken_single_character + if c == '< goto pptoken_single_character + if c == '! goto pptoken_single_character + if c == '= goto pptoken_single_character + if c == '# goto pptoken_single_character + if c == '. goto pptoken_dot + + goto bad_pptoken + + :pptoken_comment + ; emit a space ("Each comment is replaced by one space character.") + *1out = 32 + out += 1 + *1out = 0 + out += 1 + ; skip over comment + :pptoken_comment_loop + b = str_startswith(in, .str_comment_end) + if b != 0 goto pptoken_comment_loop_end + c = *1in + in += 1 + if c == 0 goto unterminated_comment + if c == 10 goto pptoken_comment_newline + goto pptoken_comment_loop + :pptoken_comment_loop_end + in += 2 ; skip */ + goto pptokens_loop + :pptoken_comment_newline + ; keep line numbers correct + newlines += 1 + goto pptoken_comment_loop + :pptoken_dot + ; could just be a . or could be .3 -- we need to check if *(in+1) is a digit + p = in + 1 + b = isdigit(*1p) + if b != 0 goto pptoken_number + ; okay it's just a dot + goto pptoken_single_character + :pptoken_string_or_char_literal + local delimiter + local backslash + delimiter = c + backslash = 0 + *1out = c + out += 1 + in += 1 + :pptoken_strchar_loop + c = *1in + *1out = c + in += 1 + out += 1 + if c == '\ goto pptoken_strchar_backslash + if c == 10 goto unterminated_string + if c == 0 goto unterminated_string + b = backslash + backslash = 0 + if b == 1 goto pptoken_strchar_loop ; string can't end with an odd number of backslashes + if c == delimiter goto pptoken_strchar_loop_end + goto pptoken_strchar_loop + :pptoken_strchar_backslash + backslash ^= 1 + goto pptoken_strchar_loop + :pptoken_strchar_loop_end + *1out = 0 + out += 1 + goto pptokens_loop + :pptoken_number + c = *1in + b = is_ppnumber_char(c) + if b == 0 goto pptoken_number_end + *1out = c + out += 1 + in += 1 + if c == 'e goto pptoken_number_e + if c == 'E goto pptoken_number_e + goto pptoken_number + :pptoken_number_e + c = *1in + if c == '+ goto pptoken_number_sign + if c == '- goto pptoken_number_sign + goto pptoken_number + :pptoken_number_sign + ; special code to handle + - immediately following e + *1out = c + in += 1 + out += 1 + goto pptoken_number + :pptoken_number_end + *1out = 0 + out += 1 + goto pptokens_loop + :pptoken_identifier + c = *1in + b = isalnum_or_underscore(c) + if b == 0 goto pptoken_identifier_end + *1out = c + in += 1 + out += 1 + goto pptoken_identifier + :pptoken_identifier_end + *1out = 0 + out += 1 + goto pptokens_loop + :pptoken_space + ; space character token + *1out = 32 + in += 1 + out += 1 + *1out = 0 + out += 1 + goto pptokens_loop + :pptoken_single_character + ; a single character preprocessing token, like {?} + *1out = c + in += 1 + out += 1 + *1out = 0 + out += 1 + goto pptokens_loop + :pptoken_2_chars + ; two-character pptoken (e.g. ##) + *1out = c + in += 1 + out += 1 + *1out = *1in + in += 1 + out += 1 + *1out = 0 + out += 1 + goto pptokens_loop + :pptoken_3_chars + ; three-character pptoken (e.g. >>=) + *1out = c + in += 1 + out += 1 + *1out = *1in + in += 1 + out += 1 + *1out = *1in + in += 1 + out += 1 + *1out = 0 + out += 1 + goto pptokens_loop + :pptokens_newline_loop + if newlines == 0 goto pptokens_newline_loop_end + ; output a newline + *1out = 10 + out += 1 + *1out = 0 + out += 1 + line_number += 1 + newlines -= 1 + goto pptokens_newline_loop + :pptokens_newline_loop_end + newlines = 1 + in += 1 + goto pptokens_loop + :pptokens_loop_end free(file_contents) close(fd) - return + return pptokens :unterminated_comment - fputs(2, .str_unterminated_comment) - fputs(2, filename) - fputc(2, 10) - exit(1) + compile_error(filename, line_number, .str_unterminated_comment) :str_unterminated_comment - string Unterminated comment in file - byte 32 + string Unterminated comment. + byte 0 + :unterminated_string + compile_error(filename, line_number, .str_unterminated_string) + :str_unterminated_string + string Unterminated string or character literal. + byte 0 + :bad_pptoken + compile_error(filename, line_number, .str_bad_pptoken) + :str_bad_pptoken + string Bad preprocessing token. byte 0 + +; can the given character appear in a C89 ppnumber? +function is_ppnumber_char + argument c + if c == '. goto return_1 + if c < '0 goto return_0 + if c <= '9 goto return_1 + if c < 'A goto return_0 + if c <= 'Z goto return_1 + if c == '_ goto return_1 + if c < 'a goto return_0 + if c <= 'z goto return_1 + goto return_0 + +function print_pptokens + argument pptokens + local p + p = pptokens + :print_pptokens_loop + if *1p == 0 goto print_pptokens_loop_end + putc('{) + puts(p) + putc('}) + p += strlen(p) + p += 1 + goto print_pptokens_loop + :print_pptokens_loop_end + putc(10) + return @@ -82,11 +82,9 @@ function memchr argument mem argument c local p - local a p = mem :memchr_loop - a = *1p - if a == c goto memchr_loop_end + if *1p == c goto memchr_loop_end p += 1 goto memchr_loop :memchr_loop_end @@ -94,12 +92,10 @@ function memchr function strlen argument s - local c local p p = s :strlen_loop - c = *1p - if c == 0 goto strlen_loop_end + if *1p == 0 goto strlen_loop_end p += 1 goto strlen_loop :strlen_loop_end @@ -165,9 +161,7 @@ function fputn function fputc argument fd argument c - local p - p = &c - syscall(1, fd, p, 1) + syscall(1, fd, &c, 1) return function putc @@ -179,10 +173,8 @@ function putc function fgetc argument fd local c - local p c = 0 - p = &c - syscall(0, fd, p, 1) + syscall(0, fd, &c, 1) return c ; read a line from fd as a null-terminated string @@ -251,6 +243,48 @@ function isupper if c <= 'Z goto return_1 goto return_0 +function islower + argument c + if c < 'a goto return_0 + if c <= 'z goto return_1 + goto return_0 + +function isdigit + argument c + if c < '0 goto return_0 + if c <= '9 goto return_1 + goto return_0 + +function isalpha + argument c + if c < 'A goto return_0 + if c <= 'Z goto return_1 + if c < 'a goto return_0 + if c <= 'z goto return_1 + goto return_0 + +; characters which can start identifiers in C +function isalpha_or_underscore + argument c + if c < 'A goto return_0 + if c <= 'Z goto return_1 + if c == '_ goto return_1 + if c < 'a goto return_0 + if c <= 'z goto return_1 + goto return_0 + +; characters which can appear in identifiers in C +function isalnum_or_underscore + argument c + if c < '0 goto return_0 + if c <= '9 goto return_1 + if c < 'A goto return_0 + if c <= 'Z goto return_1 + if c == '_ goto return_1 + if c < 'a goto return_0 + if c <= 'z goto return_1 + goto return_0 + function exit argument status_code syscall(0x3c, status_code) |