summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorpommicket <pommicket@gmail.com>2022-01-08 12:15:17 -0500
committerpommicket <pommicket@gmail.com>2022-01-08 12:15:17 -0500
commitac6cb985dbb0a7a1bf6311a0e9fd53033cf7ab94 (patch)
treeb933e61333861a6dd79a3ea084be9bdb1cdc4970
parentd48816e226d5ba081dadbba4bcded92e5c0a23d1 (diff)
preprocessing tokens
-rw-r--r--05/constants.b76
-rw-r--r--05/main.b21
-rw-r--r--05/main.c20
-rw-r--r--05/preprocess.b305
-rw-r--r--05/util.b58
5 files changed, 452 insertions, 28 deletions
diff --git a/05/constants.b b/05/constants.b
index 691fe65..852d714 100644
--- a/05/constants.b
+++ b/05/constants.b
@@ -30,3 +30,79 @@
#define KEYWORD_IF 130
#define KEYWORD_STATIC 131
#define KEYWORD_WHILE 132
+
+:str_comment_start
+ string /*
+ byte 0
+:str_comment_end
+ string */
+ byte 0
+:str_lshift_eq
+ string <<=
+ byte 0
+:str_rshift_eq
+ string >>=
+ byte 0
+:str_eq_eq
+ string ==
+ byte 0
+:str_not_eq
+ string !=
+ byte 0
+:str_gt_eq
+ string >=
+ byte 0
+:str_lt_eq
+ string <=
+ byte 0
+:str_plus_plus
+ string ++
+ byte 0
+:str_minus_minus
+ string --
+ byte 0
+:str_plus_eq
+ string +=
+ byte 0
+:str_minus_eq
+ string -=
+ byte 0
+:str_times_eq
+ string *=
+ byte 0
+:str_div_eq
+ string /=
+ byte 0
+:str_remainder_eq
+ string %=
+ byte 0
+:str_and_eq
+ string &=
+ byte 0
+:str_or_eq
+ string |=
+ byte 0
+:str_xor_eq
+ string ^=
+ byte 0
+:str_and_and
+ string &&
+ byte 0
+:str_or_or
+ string ||
+ byte 0
+:str_lshift
+ string <<
+ byte 0
+:str_rshift
+ string >>
+ byte 0
+:str_arrow
+ string ->
+ byte 0
+:str_dotdotdot
+ string ...
+ byte 0
+:str_hash_hash
+ string ##
+ byte 0
diff --git a/05/main.b b/05/main.b
index 6239f71..59a11a0 100644
--- a/05/main.b
+++ b/05/main.b
@@ -8,6 +8,23 @@ byte 0
byte 0
goto main
+; report a compilation error and terminate the program.
+; writes "<file>:<line>: Error: <message>" followed by a newline to file descriptor 2,
+; then calls exit(1), so this function does not return to its caller.
+;   file    - null-terminated name of the file being compiled
+;   line    - line number, written with fputn
+;   message - null-terminated description of the error
+function compile_error
+ argument file
+ argument line
+ argument message
+ fputs(2, file)
+ fputc(2, ':)
+ fputn(2, line)
+ ; .str_error is ": Error:" followed by a space
+ fputs(2, .str_error)
+ fputs(2, message)
+ ; 10 = newline
+ fputc(2, 10)
+ exit(1)
+
+:str_error
+ string : Error:
+ byte 32
+ byte 0
+
#include util.b
#include constants.b
#include preprocess.b
@@ -19,6 +36,7 @@ function main
argument argc
local input_filename
local output_filename
+ local pptokens
input_filename = .str_default_input_filename
output_filename = .str_default_output_filename
@@ -27,7 +45,8 @@ function main
input_filename = argv1
output_filename = argv2
:have_filenames
- split_into_preprocessing_tokens(input_filename)
+ pptokens = split_into_preprocessing_tokens(input_filename)
+ print_pptokens(pptokens)
exit(0)
:usage_error
diff --git a/05/main.c b/05/main.c
index fedd283..3a38edf 100644
--- a/05/main.c
+++ b/05/main.c
@@ -1,6 +1,14 @@
-test\
-ing/*
-I am */testing
-that this is working
-hello \
-there.
+#include <stdio.h>
+
+int test(int, double, ...);\
+/* here is a nice
+comment it is
+here
+*/
+int main(void) {
+ printf("\"Hello, world!%c\n\"", '\'');
+ _X55 = Y4_C_;
+ a.b = c;
+ 5 + (.3e+5+6) & 0xff | 93 -~5;
+ return 0;
+}
diff --git a/05/preprocess.b b/05/preprocess.b
index 36fcbd2..ddc0c3b 100644
--- a/05/preprocess.b
+++ b/05/preprocess.b
@@ -6,10 +6,12 @@ function split_into_preprocessing_tokens
local file_contents
local pptokens
local p
+ local b
local c
local in
local out
local n
+ local line_number
fd = open_r(filename)
file_contents = malloc(2000000)
@@ -19,6 +21,7 @@ function split_into_preprocessing_tokens
n = syscall(0, fd, p, 4096)
if n == 0 goto pptokens_read_loop_end
p += n
+ goto pptokens_read_loop
:pptokens_read_loop_end
; okay we read the file. first, delete every backslash-newline sequence (phase 2)
@@ -56,20 +59,304 @@ function split_into_preprocessing_tokens
:backslashnewline_loop_end
*1out = 0
+ ; split file into preprocessing tokens, remove comments (phase 3)
+ ; we're still doing the trick with newlines, this time for ones inside comments
+ ; this is needed because the following is legal C:
+ ; #include/*
+ ; */<stdio.h>
+ ; and is not equivalent to:
+ ; #include
+ ; <stdio.h>
+ newlines = 1
in = file_contents
-
- fputs(1, file_contents)
+ out = pptokens
+ line_number = 1
+ :pptokens_loop
+ c = *1in
+ if c == 10 goto pptokens_newline_loop
+ if c == 0 goto pptokens_loop_end
+ if c == 32 goto pptoken_space
+ if c == 9 goto pptoken_space
+ b = isdigit(c)
+ if b != 0 goto pptoken_number
+ b = isalpha_or_underscore(c)
+ if b != 0 goto pptoken_identifier
+ b = str_startswith(in, .str_comment_start)
+ if b != 0 goto pptoken_comment
+ ; now we check for all the various operators and symbols in C
+
+ if c == 59 goto pptoken_single_character ; semicolon
+ if c == '( goto pptoken_single_character
+ if c == ') goto pptoken_single_character
+ if c == '[ goto pptoken_single_character
+ if c == '] goto pptoken_single_character
+ if c == '{ goto pptoken_single_character
+ if c == '} goto pptoken_single_character
+ if c == ', goto pptoken_single_character
+ if c == '~ goto pptoken_single_character
+ if c == '? goto pptoken_single_character
+ if c == ': goto pptoken_single_character
+ if c == '" goto pptoken_string_or_char_literal
+ if c == '' goto pptoken_string_or_char_literal
+ b = str_startswith(in, .str_lshift_eq)
+ if b != 0 goto pptoken_3_chars
+ b = str_startswith(in, .str_rshift_eq)
+ if b != 0 goto pptoken_3_chars
+ b = str_startswith(in, .str_eq_eq)
+ if b != 0 goto pptoken_2_chars
+ b = str_startswith(in, .str_not_eq)
+ if b != 0 goto pptoken_2_chars
+ b = str_startswith(in, .str_gt_eq)
+ if b != 0 goto pptoken_2_chars
+ b = str_startswith(in, .str_lt_eq)
+ if b != 0 goto pptoken_2_chars
+ b = str_startswith(in, .str_plus_plus)
+ if b != 0 goto pptoken_2_chars
+ b = str_startswith(in, .str_minus_minus)
+ if b != 0 goto pptoken_2_chars
+ b = str_startswith(in, .str_plus_eq)
+ if b != 0 goto pptoken_2_chars
+ b = str_startswith(in, .str_minus_eq)
+ if b != 0 goto pptoken_2_chars
+ b = str_startswith(in, .str_times_eq)
+ if b != 0 goto pptoken_2_chars
+ b = str_startswith(in, .str_div_eq)
+ if b != 0 goto pptoken_2_chars
+ b = str_startswith(in, .str_remainder_eq)
+ if b != 0 goto pptoken_2_chars
+ b = str_startswith(in, .str_and_eq)
+ if b != 0 goto pptoken_2_chars
+ b = str_startswith(in, .str_or_eq)
+ if b != 0 goto pptoken_2_chars
+ b = str_startswith(in, .str_xor_eq)
+ if b != 0 goto pptoken_2_chars
+ b = str_startswith(in, .str_and_and)
+ if b != 0 goto pptoken_2_chars
+ b = str_startswith(in, .str_or_or)
+ if b != 0 goto pptoken_2_chars
+ b = str_startswith(in, .str_lshift)
+ if b != 0 goto pptoken_2_chars
+ b = str_startswith(in, .str_rshift)
+ if b != 0 goto pptoken_2_chars
+ b = str_startswith(in, .str_arrow)
+ if b != 0 goto pptoken_2_chars
+ b = str_startswith(in, .str_dotdotdot)
+ if b != 0 goto pptoken_3_chars
+ b = str_startswith(in, .str_hash_hash)
+ if b != 0 goto pptoken_2_chars
+ if c == '+ goto pptoken_single_character
+ if c == '- goto pptoken_single_character
+ if c == '* goto pptoken_single_character
+ if c == '/ goto pptoken_single_character
+ if c == '% goto pptoken_single_character
+ if c == '& goto pptoken_single_character
+ if c == '| goto pptoken_single_character
+ if c == '^ goto pptoken_single_character
+ if c == '> goto pptoken_single_character
+ if c == '< goto pptoken_single_character
+ if c == '! goto pptoken_single_character
+ if c == '= goto pptoken_single_character
+ if c == '# goto pptoken_single_character
+ if c == '. goto pptoken_dot
+
+ goto bad_pptoken
+
+ :pptoken_comment
+ ; reached when in points at the /* which opens a comment.
+ ; emit a space ("Each comment is replaced by one space character.")
+ *1out = 32
+ out += 1
+ *1out = 0
+ out += 1
+ ; skip the opening /* before searching for the terminator. otherwise the * of the
+ ; opener could be paired with a following / so that /*/ would wrongly be treated
+ ; as a complete comment.
+ in += 2
+ ; skip over comment
+ :pptoken_comment_loop
+ b = str_startswith(in, .str_comment_end)
+ if b != 0 goto pptoken_comment_loop_end
+ c = *1in
+ in += 1
+ ; hitting the null terminator means the comment was never closed
+ if c == 0 goto unterminated_comment
+ if c == 10 goto pptoken_comment_newline
+ goto pptoken_comment_loop
+ :pptoken_comment_loop_end
+ in += 2 ; skip */
+ goto pptokens_loop
+ :pptoken_comment_newline
+ ; keep line numbers correct: the newline is not emitted here, only counted, so that
+ ; pptokens_newline_loop can emit it together with the next newline outside a comment
+ newlines += 1
+ goto pptoken_comment_loop
+ :pptoken_dot
+ ; could just be a . or could be .3 -- we need to check if *(in+1) is a digit
+ p = in + 1
+ b = isdigit(*1p)
+ if b != 0 goto pptoken_number
+ ; okay it's just a dot
+ goto pptoken_single_character
+ :pptoken_string_or_char_literal
+ ; copy a string or character literal, including its opening and closing delimiter,
+ ; to the output as one null-terminated pptoken.
+ ; delimiter is the opening quote character, which must also close the literal.
+ ; backslash is 1 when the previous character was a backslash which was not itself escaped.
+ local delimiter
+ local backslash
+ delimiter = c
+ backslash = 0
+ *1out = c
+ out += 1
+ in += 1
+ :pptoken_strchar_loop
+ ; every character is copied to the output before it is examined
+ c = *1in
+ *1out = c
+ in += 1
+ out += 1
+ if c == '\ goto pptoken_strchar_backslash
+ ; a newline or the end of the file before the closing delimiter is an error
+ if c == 10 goto unterminated_string
+ if c == 0 goto unterminated_string
+ ; read the escape state for this character, and clear it for the next one
+ b = backslash
+ backslash = 0
+ if b == 1 goto pptoken_strchar_loop ; string can't end with an odd number of backslashes
+ if c == delimiter goto pptoken_strchar_loop_end
+ goto pptoken_strchar_loop
+ :pptoken_strchar_backslash
+ ; toggle rather than set, so that a pair of backslashes leaves the state cleared
+ backslash ^= 1
+ goto pptoken_strchar_loop
+ :pptoken_strchar_loop_end
+ ; null-terminate the token
+ *1out = 0
+ out += 1
+ goto pptokens_loop
+ :pptoken_number
+ ; copy a preprocessing number to the output as one null-terminated pptoken.
+ ; reached for a leading digit, or from pptoken_dot for a . followed by a digit.
+ ; characters are consumed for as long as is_ppnumber_char accepts them.
+ c = *1in
+ b = is_ppnumber_char(c)
+ if b == 0 goto pptoken_number_end
+ *1out = c
+ out += 1
+ in += 1
+ ; a sign directly after e or E is still part of the number (e.g. .3e+5)
+ if c == 'e goto pptoken_number_e
+ if c == 'E goto pptoken_number_e
+ goto pptoken_number
+ :pptoken_number_e
+ ; look at the character after the e/E without consuming it
+ c = *1in
+ if c == '+ goto pptoken_number_sign
+ if c == '- goto pptoken_number_sign
+ goto pptoken_number
+ :pptoken_number_sign
+ ; special code to handle + - immediately following e
+ *1out = c
+ in += 1
+ out += 1
+ goto pptoken_number
+ :pptoken_number_end
+ ; null-terminate the token
+ *1out = 0
+ out += 1
+ goto pptokens_loop
+ :pptoken_identifier
+ c = *1in
+ b = isalnum_or_underscore(c)
+ if b == 0 goto pptoken_identifier_end
+ *1out = c
+ in += 1
+ out += 1
+ goto pptoken_identifier
+ :pptoken_identifier_end
+ *1out = 0
+ out += 1
+ goto pptokens_loop
+ :pptoken_space
+ ; space character token
+ *1out = 32
+ in += 1
+ out += 1
+ *1out = 0
+ out += 1
+ goto pptokens_loop
+ :pptoken_single_character
+ ; a single character preprocessing token, like {?}
+ *1out = c
+ in += 1
+ out += 1
+ *1out = 0
+ out += 1
+ goto pptokens_loop
+ :pptoken_2_chars
+ ; two-character pptoken (e.g. ##)
+ *1out = c
+ in += 1
+ out += 1
+ *1out = *1in
+ in += 1
+ out += 1
+ *1out = 0
+ out += 1
+ goto pptokens_loop
+ :pptoken_3_chars
+ ; three-character pptoken (e.g. >>=)
+ *1out = c
+ in += 1
+ out += 1
+ *1out = *1in
+ in += 1
+ out += 1
+ *1out = *1in
+ in += 1
+ out += 1
+ *1out = 0
+ out += 1
+ goto pptokens_loop
+ :pptokens_newline_loop
+ ; reached when the current input character is a newline.
+ ; emit one newline token for every newline counted in newlines: this one, plus any
+ ; which were skipped inside comments (counted at pptoken_comment_newline).
+ if newlines == 0 goto pptokens_newline_loop_end
+ ; output a newline
+ *1out = 10
+ out += 1
+ *1out = 0
+ out += 1
+ line_number += 1
+ newlines -= 1
+ goto pptokens_newline_loop
+ :pptokens_newline_loop_end
+ ; reset the count for the next line, and consume the newline character itself
+ newlines = 1
+ in += 1
+ goto pptokens_loop
+ :pptokens_loop_end
free(file_contents)
close(fd)
- return
+ return pptokens
:unterminated_comment
- fputs(2, .str_unterminated_comment)
- fputs(2, filename)
- fputc(2, 10)
- exit(1)
+ compile_error(filename, line_number, .str_unterminated_comment)
:str_unterminated_comment
- string Unterminated comment in file
- byte 32
+ string Unterminated comment.
+ byte 0
+ :unterminated_string
+ compile_error(filename, line_number, .str_unterminated_string)
+ :str_unterminated_string
+ string Unterminated string or character literal.
+ byte 0
+ :bad_pptoken
+ compile_error(filename, line_number, .str_bad_pptoken)
+ :str_bad_pptoken
+ string Bad preprocessing token.
byte 0
+
+; can the given character appear in a C89 ppnumber?
+; accepts . and the digits, letters and underscore; everything else is rejected.
+; the + or - which may follow e/E is not accepted here; the pptoken_number code in
+; split_into_preprocessing_tokens handles that case itself.
+; the result is produced by jumping to return_1 (accepted) or return_0 (rejected);
+; those labels are defined outside this diff -- presumably they return 1 and 0.
+function is_ppnumber_char
+ argument c
+ if c == '. goto return_1
+ ; the range checks below rely on the character order 0-9 < A-Z < _ < a-z
+ if c < '0 goto return_0
+ if c <= '9 goto return_1
+ if c < 'A goto return_0
+ if c <= 'Z goto return_1
+ if c == '_ goto return_1
+ if c < 'a goto return_0
+ if c <= 'z goto return_1
+ goto return_0
+
+; print every preprocessing token in the list surrounded by braces, e.g. {int}{ }{x},
+; followed by a single newline.
+;   pptokens - a sequence of null-terminated strings stored one after another;
+;              an empty string (a 0 byte where a token would start) ends the list.
+function print_pptokens
+ argument pptokens
+ local p
+ p = pptokens
+ :print_pptokens_loop
+ ; an empty token marks the end of the list
+ if *1p == 0 goto print_pptokens_loop_end
+ putc('{)
+ puts(p)
+ putc('})
+ ; advance past this token and its null terminator
+ p += strlen(p)
+ p += 1
+ goto print_pptokens_loop
+ :print_pptokens_loop_end
+ putc(10)
+ return
diff --git a/05/util.b b/05/util.b
index 13fed4d..c8f2851 100644
--- a/05/util.b
+++ b/05/util.b
@@ -82,11 +82,9 @@ function memchr
argument mem
argument c
local p
- local a
p = mem
:memchr_loop
- a = *1p
- if a == c goto memchr_loop_end
+ if *1p == c goto memchr_loop_end
p += 1
goto memchr_loop
:memchr_loop_end
@@ -94,12 +92,10 @@ function memchr
function strlen
argument s
- local c
local p
p = s
:strlen_loop
- c = *1p
- if c == 0 goto strlen_loop_end
+ if *1p == 0 goto strlen_loop_end
p += 1
goto strlen_loop
:strlen_loop_end
@@ -165,9 +161,7 @@ function fputn
function fputc
argument fd
argument c
- local p
- p = &c
- syscall(1, fd, p, 1)
+ syscall(1, fd, &c, 1)
return
function putc
@@ -179,10 +173,8 @@ function putc
function fgetc
argument fd
local c
- local p
c = 0
- p = &c
- syscall(0, fd, p, 1)
+ syscall(0, fd, &c, 1)
return c
; read a line from fd as a null-terminated string
@@ -251,6 +243,48 @@ function isupper
if c <= 'Z goto return_1
goto return_0
+; is c a lowercase letter (a-z)?
+; jumps to return_1 if so and to return_0 otherwise; those labels are defined
+; outside this diff -- presumably they return 1 and 0.
+function islower
+ argument c
+ if c < 'a goto return_0
+ if c <= 'z goto return_1
+ goto return_0
+
+; is c a decimal digit (0-9)?
+; jumps to return_1 if so and to return_0 otherwise; those labels are defined
+; outside this diff -- presumably they return 1 and 0.
+function isdigit
+ argument c
+ if c < '0 goto return_0
+ if c <= '9 goto return_1
+ goto return_0
+
+; is c a letter (A-Z or a-z)?
+; jumps to return_1 if so and to return_0 otherwise; those labels are defined
+; outside this diff -- presumably they return 1 and 0.
+function isalpha
+ argument c
+ ; the uppercase range is tested first; characters between Z and a are rejected
+ if c < 'A goto return_0
+ if c <= 'Z goto return_1
+ if c < 'a goto return_0
+ if c <= 'z goto return_1
+ goto return_0
+
+; characters which can start identifiers in C
+; accepts A-Z, a-z and underscore.
+; jumps to return_1 if c is accepted and to return_0 otherwise; those labels are
+; defined outside this diff -- presumably they return 1 and 0.
+function isalpha_or_underscore
+ argument c
+ if c < 'A goto return_0
+ if c <= 'Z goto return_1
+ ; underscore lies between Z and a, so it is tested before the lowercase range
+ if c == '_ goto return_1
+ if c < 'a goto return_0
+ if c <= 'z goto return_1
+ goto return_0
+
+; characters which can appear in identifiers in C
+; accepts 0-9, A-Z, a-z and underscore.
+; jumps to return_1 if c is accepted and to return_0 otherwise; those labels are
+; defined outside this diff -- presumably they return 1 and 0.
+function isalnum_or_underscore
+ argument c
+ ; the ranges are tested in increasing character order: digits, uppercase, _, lowercase
+ if c < '0 goto return_0
+ if c <= '9 goto return_1
+ if c < 'A goto return_0
+ if c <= 'Z goto return_1
+ if c == '_ goto return_1
+ if c < 'a goto return_0
+ if c <= 'z goto return_1
+ goto return_0
+
function exit
argument status_code
syscall(0x3c, status_code)