summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: pommicket <pommicket@gmail.com> 2022-01-11 14:03:13 -0500
committer: pommicket <pommicket@gmail.com> 2022-01-11 14:03:47 -0500
commit: b0837b367e812823e1723e95d3394744bb58e509 (patch)
tree: 9f87bdbaa4953396a576d8b496e478b872416536
parent: fc96e22d4fc75e08c52e703a595d639f29eb101b (diff)
tokenizing integer literals
-rw-r--r-- 05/constants.b 6
-rw-r--r-- 05/main.b 1
-rw-r--r-- 05/main.c 3
-rw-r--r-- 05/tokenize.b 160
-rw-r--r-- 05/util.b 80
5 files changed, 220 insertions, 30 deletions
diff --git a/05/constants.b b/05/constants.b
index 82da6b2..966a125 100644
--- a/05/constants.b
+++ b/05/constants.b
@@ -71,6 +71,12 @@
#define TOKEN_CONSTANT_CHAR 4
#define TOKEN_STRING 5
+; these are stored in the "info" field of the token
+#define NUMBER_NO_SUFFIX 0
+#define NUMBER_SUFFIX_U 1
+#define NUMBER_SUFFIX_L 2
+#define NUMBER_SUFFIX_UL 3
+#define NUMBER_SUFFIX_F 4
; #define KEYWORD_AUTO 21 (@NONSTANDARD auto only exists in C for legacy reasons and doesn't appear in TCC's source code)
#define KEYWORD_DOUBLE 22
diff --git a/05/main.b b/05/main.b
index 8ca9ab5..eafbe07 100644
--- a/05/main.b
+++ b/05/main.b
@@ -21,7 +21,6 @@ function fprint_filename
argument file
if file ] 65535 goto print_filename_string
file = file_get(file)
- fputs(2, file)
; (fallthrough)
:print_filename_string
fputs(2, file)
diff --git a/05/main.c b/05/main.c
index f789480..2748783 100644
--- a/05/main.c
+++ b/05/main.c
@@ -1,4 +1,7 @@
sizeof(int)
+0x332l
+0xffffffffffffffff
+0755u
double * = &;
diff --git a/05/tokenize.b b/05/tokenize.b
index 3a0d37d..aab3f45 100644
--- a/05/tokenize.b
+++ b/05/tokenize.b
@@ -98,7 +98,8 @@ function get_keyword_str
; turn pptokens into tokens, written to out.
; tokens are 16 bytes and have the following format:
-; ushort type
+; uchar type
+; uchar info
; ushort file
; uint line
; ulong data
@@ -109,16 +110,24 @@ function tokenize
local file
local line_number
local b
+ local c
+ local n
+ local data
+
in = pptokens
:tokenize_loop
- if *1in == '$ goto tokenize_line_directive
- if *1in == 32 goto tokenize_skip_pptoken
- if *1in == 10 goto tokenize_newline
- if *1in == 0 goto tokenize_loop_end
+ c = *1in
+ if c == '$ goto tokenize_line_directive
+ if c == 32 goto tokenize_skip_pptoken
+ if c == 10 goto tokenize_newline
+ if c == 0 goto tokenize_loop_end
b = get_keyword_id(in)
if b != 0 goto tokenize_keyword
+ b = isdigit_or_dot(c)
+ if b != 0 goto tokenize_number
+
byte 0xcc
:tokenize_newline
@@ -137,36 +146,157 @@ function tokenize
file = file_get_index(in)
pptoken_skip(&in)
goto tokenize_loop
- :tokenize_keyword
- *2out = b ; type
- out += 2
+ :token_no_data
+ data = 0
+ ; (fallthrough)
+ :token_output ; write token location & data (see local variable data), and continue tokenizing
*2out = file
out += 2
*4out = line_number
out += 4
- ; no data
+ *8out = data
out += 8
- pptoken_skip(&in)
goto tokenize_loop
+ :tokenize_keyword
+ pptoken_skip(&in)
+ *1out = b ; type
+ ; no info for keywords
+ out += 2
+ goto token_no_data
+ :tokenize_number
+ ; first, check if it's a float
+ b = strchr(in, '.)
+ if b != 0 goto tokenize_float
+ b = strchr(in, 'x) ; e may appear in hex integer literals, so we need to check this
+ if b != 0 goto tokenize_hex_integer
+ b = strchr(in, 'X)
+ if b != 0 goto tokenize_hex_integer
+ b = strchr(in, 'e) ; exponent
+ if b != 0 goto tokenize_float
+ b = strchr(in, 'E) ; exponent
+ if b != 0 goto tokenize_float
+ if *1in == '0 goto tokenize_octal_integer ; fun fact: in the C89 standard, 0 is considered an octal integer
+ ; plain ol' decimal constant
+ n = strtoi(&in, 10)
+ goto tokenize_finish_integer
+ :tokenize_hex_integer
+ if *1in != '0 goto bad_number_token
+ in += 1
+ c = *1in
+ c &= 223 ; 223 = ~32 -- remove case
+ if c != 'X goto bad_number_token
+ in += 1
+ n = strtoi(&in, 16)
+ goto tokenize_finish_integer
+ :tokenize_octal_integer
+ in += 1 ; skip 0
+ n = strtoi(&in, 8)
+ goto tokenize_finish_integer
+ :tokenize_finish_integer
+ c = read_number_suffix(file, line_number, &in)
+ if c == NUMBER_SUFFIX_F goto f_suffix_on_integer
+ in += 1 ; move past null separator
+ *1out = TOKEN_CONSTANT_INT
+ out += 1
+ *1out = c ; info = suffix
+ out += 1
+ data = n
+ goto token_output
+
+ :tokenize_float
+ ; @TODO
+ byte 0xcc
+
:tokenize_loop_end
return 0
+ :f_suffix_on_integer
+ compile_error(file, line_number, .str_f_suffix_on_integer)
+ :str_f_suffix_on_integer
+ string Integer with f suffix.
+ byte 0
+ :bad_number_token
+ compile_error(file, line_number, .str_bad_number_token)
+ :str_bad_number_token
+ string Bad number literal.
+ byte 0
+
+; parse the (optional) suffix which follows the digits of a number literal.
+; file, line_number - location of the literal, only used for error reporting
+; p_s - address of a pointer to the first character after the digits.
+; on success, *p_s is advanced to the null terminator which ends the literal.
+; returns NUMBER_NO_SUFFIX, NUMBER_SUFFIX_U, NUMBER_SUFFIX_L, NUMBER_SUFFIX_UL, or NUMBER_SUFFIX_F.
+; accepted suffixes: (none), u, l, ul, lu, f -- each letter may be upper or lower case, as in C.
+; anything else is reported via compile_error ("Bad number suffix.").
+function read_number_suffix
+ argument file
+ argument line_number
+ argument p_s
+ local s
+ local c
+ local suffix
+ s = *8p_s
+ c = *1s
+ suffix = NUMBER_NO_SUFFIX
+ if c == 0 goto number_suffix_return
+ if c == 'u goto number_suffix_u
+ if c == 'U goto number_suffix_u
+ if c == 'l goto number_suffix_l
+ if c == 'L goto number_suffix_l
+ if c == 'f goto number_suffix_f
+ if c == 'F goto number_suffix_f
+ goto bad_number_suffix
+ :number_suffix_u
+ ; u may be followed by l (=> ul), otherwise the literal must end here
+ s += 1
+ c = *1s
+ if c == 'l goto number_suffix_ul
+ if c == 'L goto number_suffix_ul
+ if c != 0 goto bad_number_suffix
+ suffix = NUMBER_SUFFIX_U
+ goto number_suffix_return
+ :number_suffix_l
+ ; l may be followed by u (=> ul), otherwise the literal must end here
+ s += 1
+ c = *1s
+ if c == 'u goto number_suffix_ul
+ if c == 'U goto number_suffix_ul
+ if c != 0 goto bad_number_suffix
+ suffix = NUMBER_SUFFIX_L
+ goto number_suffix_return
+ :number_suffix_ul
+ ; both letters seen; nothing else may follow
+ s += 1
+ c = *1s
+ if c != 0 goto bad_number_suffix
+ suffix = NUMBER_SUFFIX_UL
+ goto number_suffix_return
+ :number_suffix_f
+ s += 1
+ c = *1s
+ if c != 0 goto bad_number_suffix
+ suffix = NUMBER_SUFFIX_F
+ goto number_suffix_return
+ :number_suffix_return
+ *8p_s = s ; s is at the null terminator here
+ return suffix
+
+ :bad_number_suffix
+ ; NOTE: the message data directly follows the call, so compile_error is expected not to return
+ compile_error(file, line_number, .str_bad_number_suffix)
+ :str_bad_number_suffix
+ string Bad number suffix.
+ byte 0
+
function print_tokens
argument tokens
local p
local s
p = tokens
:print_tokens_loop
- if *2p == 0 goto print_tokens_loop_end
- if *2p > 20 goto print_token_keyword
+ if *1p == 0 goto print_tokens_loop_end
+ if *1p > 20 goto print_token_keyword
+ if *1p == TOKEN_CONSTANT_INT goto print_token_int
fputs(2, .str_print_bad_token)
exit(1)
:print_token_keyword
- s = get_keyword_str(*2p)
+ s = get_keyword_str(*1p)
puts(s)
goto print_token_data
+ :print_token_int
+ puts(.str_constant_int)
+ goto print_token_info
+ :print_token_info
+ p += 1
+ putc('~)
+ putn(*1p)
+ p -= 1
:print_token_data
p += 2
putc('@)
@@ -183,6 +313,10 @@ function print_tokens
:print_tokens_loop_end
putc(10)
return
+ :str_constant_int
+ string integer
+ byte 0
:str_print_bad_token
string Unrecognized token type in print_tokens. Aborting.
byte 10
+ byte 0
diff --git a/05/util.b b/05/util.b
index 119d2d1..c5a686c 100644
--- a/05/util.b
+++ b/05/util.b
@@ -58,25 +58,49 @@ function itos
:itos_loop_end
return p
-
-; returns the number at the start of the given string
-function stoi
- argument s
- local p
- local n
+; returns the number in the given base at the start of the string, advancing the string past it.
+; p_s - address of the string pointer; on return *p_s points at the first character which was not consumed
+; base - a digit whose value is greater than or equal to base ends the number
+; the digit characters recognized are 0-9, A-F and a-f (values 0 through 15).
+; NOTE: nothing in this function checks whether n overflows.
+function strtoi
+ argument p_s
+ argument base
+ local s
local c
+ local n
n = 0
- p = s
- :stoi_loop
- c = *1p
- if c < '0 goto stoi_loop_end
- if c > '9 goto stoi_loop_end
- n *= 10
- n += c - '0
- p += 1
- goto stoi_loop
- :stoi_loop_end
+ s = *8p_s
+ :strtoi_loop
+ c = *1s
+ ; classify c by walking the ranges '0-'9, 'A-'F, 'a-'f in increasing character order;
+ ; anything which falls in a gap between the ranges (or outside them) ends the number
+ if c < '0 goto strtoi_loop_end
+ if c <= '9 goto strtoi_decimal_digit
+ if c < 'A goto strtoi_loop_end
+ if c <= 'F goto strtoi_upper_hexdigit
+ if c < 'a goto strtoi_loop_end
+ if c <= 'f goto strtoi_lower_hexdigit
+ goto strtoi_loop_end
+
+ ; each of the following turns the character c into its digit value
+ :strtoi_decimal_digit
+ c -= '0
+ goto strtoi_digit
+ :strtoi_upper_hexdigit
+ c += 10 - 'A
+ goto strtoi_digit
+ :strtoi_lower_hexdigit
+ c += 10 - 'a
+ goto strtoi_digit
+ :strtoi_digit
+ if c >= base goto strtoi_loop_end ; not a valid digit in this base (e.g. 8 in octal); leave it unconsumed
+ n *= base
+ n += c
+ s += 1
+ goto strtoi_loop
+
+ :strtoi_loop_end
+ *8p_s = s ; report how far we got to the caller
return n
+
+; returns the decimal number at the start of the given string
+; (strtoi advances the pointer whose address it is given; here that is this function's
+; own argument s, which is discarded on return)
+function stoi
+ argument s
+ return strtoi(&s, 10)
function memchr
argument mem
@@ -90,6 +114,19 @@ function memchr
:memchr_loop_end
return p
+; find the first occurrence of the character c in the null-terminated string str.
+; returns a pointer to that occurrence if there is one.
+; if the terminator is reached first, control goes to return_0 (a label defined outside
+; this function -- presumably it returns 0).
+; NOTE: the terminator test comes before the comparison with c, so searching for c = 0
+; also ends up at return_0 rather than yielding a pointer to the terminator.
+function strchr
+ argument str
+ argument c
+ local p
+ p = str
+ :strchr_loop
+ if *1p == 0 goto return_0
+ if *1p == c goto strchr_loop_end
+ p += 1
+ goto strchr_loop
+ :strchr_loop_end
+ return p
+
; copy from *p_src to *p_dest until terminator is reached, setting both to point to their respective terminators
function memccpy_advance
argument p_dest
@@ -362,6 +399,17 @@ function isalnum_or_underscore
if c <= 'z goto return_1
goto return_0
+; is the given character one of:
+; .0123456789
+; (these are the characters which can appear at the start of a number in C)
+; the result is produced by jumping to return_1 (yes) or return_0 (no), labels defined outside this function.
+function isdigit_or_dot
+ argument c
+ if c < '. goto return_0
+ if c == '. goto return_1
+ if c < '0 goto return_0 ; rejects whatever lies strictly between '. and '0 (in ASCII, just the slash)
+ if c <= '9 goto return_1
+ goto return_0
+
function exit
argument status_code
syscall(0x3c, status_code)