summary refs log tree commit diff
diff options
context:
space:
mode:
author pommicket <pommicket@gmail.com> 2022-01-11 15:55:37 -0500
committer pommicket <pommicket@gmail.com> 2022-01-11 15:55:37 -0500
commit f8f044a7a273074fdd08247508cc6b44a218cad3 (patch)
tree e61fa4684c841be5be9eada264d81b40691d76b9
parent a245a5be96144b2212a907e0a45c972e8a2774ed (diff)
tokenizing string literals and identifiers
-rw-r--r-- 05/constants.b 9
-rw-r--r-- 05/main.b 9
-rw-r--r-- 05/main.c 11
-rw-r--r-- 05/tokenize.b 71
-rw-r--r-- 05/util.b 10
5 files changed, 99 insertions, 11 deletions
diff --git a/05/constants.b b/05/constants.b
index 966a125..b3d23f5 100644
--- a/05/constants.b
+++ b/05/constants.b
@@ -1,3 +1,10 @@
+; this is the format of the executables we produce:
+; elf header + code 4MB addresses 0x400000-0x7fffff
+; read-only data 4MB addresses 0x800000-0xbfffff
+; read-write data 4MB addresses 0xc00000-0xffffff
+#define RODATA_OFFSET 0x400000
+#define RODATA_ADDR 0x800000
+
; C OPERATOR PRECEDENCE
; lowest
; 1 ,
@@ -69,7 +76,7 @@
#define TOKEN_CONSTANT_FLOAT 2
#define TOKEN_CONSTANT_INT 3
#define TOKEN_CONSTANT_CHAR 4
-#define TOKEN_STRING 5
+#define TOKEN_STRING_LITERAL 5
; these are stored in the "info" field of the token
#define NUMBER_NO_SUFFIX 0
diff --git a/05/main.b b/05/main.b
index eafbe07..fc4f7a2 100644
--- a/05/main.b
+++ b/05/main.b
@@ -8,6 +8,8 @@ byte 0
byte 0
goto main
+global output_fd
+
global object_macros_size
global function_macros_size
@@ -78,6 +80,7 @@ function main
local processed_pptokens
local tokens
+
dat_banned_objmacros = 255
dat_banned_fmacros = 255
@@ -104,9 +107,15 @@ function main
print_separator()
;print_object_macros()
;print_function_macros()
+
+ output_fd = open_w(output_filename)
+ rodata_end_offset = RODATA_OFFSET
+
tokens = malloc(16000000)
tokenize(pptokens, tokens)
print_tokens(tokens)
+ ; NOTE: do NOT free pptokens as identifiers still reference them.
+
exit(0)
:usage_error
diff --git a/05/main.c b/05/main.c
index 26cea22..1348a5b 100644
--- a/05/main.c
+++ b/05/main.c
@@ -1,7 +1,4 @@
-'\xfa' 'w' 'e' 'l' 'l' '\'' '\\'
-sizeof(int)
-0x332l
-0xffffffffffffffff
-0755u
-double * = &;
-
+"Hello ther" "e good fellow."
+char * = "How are you"" d""o""i""ng today?\n";
+hi
+_TEST _ING _1
diff --git a/05/tokenize.b b/05/tokenize.b
index 125d660..3715fc1 100644
--- a/05/tokenize.b
+++ b/05/tokenize.b
@@ -96,6 +96,10 @@ function get_keyword_str
string @BAD_KEYWORD_ID
byte 0
+
+; file offset to write next piece of read-only data; initialized in main.b
+global rodata_end_offset
+
; turn pptokens into tokens, written to out.
; tokens are 16 bytes and have the following format:
; uchar type
@@ -104,6 +108,7 @@ function get_keyword_str
; uint line
; ulong data
; This corresponds to translation phases 5-6 and the first half of 7
+; IMPORTANT: this function uses pointers to pptokens, so they should NOT be freed!
function tokenize
argument pptokens
argument out
@@ -113,6 +118,7 @@ function tokenize
local b
local c
local n
+ local p
local data
in = pptokens
@@ -122,6 +128,7 @@ function tokenize
if c == 32 goto tokenize_skip_pptoken
if c == 10 goto tokenize_newline
if c == '' goto tokenize_constant_char
+ if c == '" goto tokenize_string_literal
if c == 0 goto tokenize_loop_end
b = get_keyword_id(in)
@@ -130,7 +137,22 @@ function tokenize
b = isdigit_or_dot(c)
if b != 0 goto tokenize_number
- byte 0xcc
+ ; it's an identifier. we just need to make sure it's made up of identifier characters.
+ p = in
+ b = isalpha_or_underscore(*1p)
+ if b == 0 goto bad_token
+
+ :ident_check_loop
+ b = isalnum_or_underscore(*1p)
+ if b == 0 goto bad_token
+ p += 1
+ if *1p != 0 goto ident_check_loop
+ ; all good.
+ *1out = TOKEN_IDENTIFIER
+ out += 2 ; no info
+ data = in ; data will point to the identifier name
+ pptoken_skip(&in)
+ goto token_output
:tokenize_newline
line_number += 1
@@ -217,7 +239,28 @@ function tokenize
:tokenize_float
; @TODO
byte 0xcc
-
+ :tokenize_string_literal
+ n = rodata_end_offset - RODATA_OFFSET
+ n += RODATA_ADDR ; address of string
+ lseek(output_fd, rodata_end_offset, SEEK_SET)
+ :string_literal_loop
+ in += 1 ; skip opening "
+ :string_literal_char_loop
+ if *1in == '" goto string_literal_char_loop_end
+ c = read_c_char(&in)
+ if c ] 255 goto bad_char_in_string
+ fputc(output_fd, c)
+ goto string_literal_char_loop
+ :string_literal_char_loop_end
+ pptoken_skip(&in) ; skip closing "
+ pptoken_skip_spaces(&in)
+ if *1in == '" goto string_literal_loop ; string concatenation, e.g. "Hello, " "world!"
+ fputc(output_fd, 0) ; null terminator
+ rodata_end_offset = lseek(output_fd, 0, SEEK_CUR)
+ *1out = TOKEN_STRING_LITERAL
+ out += 2 ; no info
+ data = n
+ goto token_output
:tokenize_loop_end
return 0
@@ -236,7 +279,17 @@ function tokenize
:str_bad_char_constant
string Bad character constant. Note that multibyte constants are not supported.
byte 0
-
+ :bad_char_in_string
+ compile_error(file, line_number, .str_bad_char_in_string)
+ :str_bad_char_in_string
+ string Bad character in string literal.
+ byte 0
+ :bad_token
+ compile_error(file, line_number, .str_bad_token)
+ :str_bad_token
+ string Bad token.
+ byte 0
+
; return character or escaped character from *p_in, advancing accordingly
; returns -1 on bad character
function read_c_char
@@ -390,6 +443,8 @@ function print_tokens
if *1p > 20 goto print_token_keyword
if *1p == TOKEN_CONSTANT_INT goto print_token_int
if *1p == TOKEN_CONSTANT_CHAR goto print_token_char
+ if *1p == TOKEN_STRING_LITERAL goto print_token_string_literal
+ if *1p == TOKEN_IDENTIFIER goto print_token_identifier
fputs(2, .str_print_bad_token)
exit(1)
:print_token_keyword
@@ -402,6 +457,13 @@ function print_tokens
:print_token_char
puts(.str_constant_char)
goto print_token_data
+ :print_token_string_literal
+ puts(.str_string_literal)
+ goto print_token_data
+ :print_token_identifier
+ s = p + 8
+ puts(*8s)
+ goto print_token_data
:print_token_info
p += 1
putc('~)
@@ -429,6 +491,9 @@ function print_tokens
:str_constant_char
string character
byte 0
+ :str_string_literal
+ string string
+ byte 0
:str_print_bad_token
string Unrecognized token type in print_tokens. Aborting.
byte 10
diff --git a/05/util.b b/05/util.b
index d37e4de..09f51f7 100644
--- a/05/util.b
+++ b/05/util.b
@@ -351,6 +351,16 @@ function close
syscall(3, fd)
return
+#define SEEK_SET 0
+#define SEEK_CUR 1
+#define SEEK_END 2
+
+; reposition the file offset of fd by issuing syscall number 8 with (fd, offset, whence).
+; whence is one of SEEK_SET, SEEK_CUR, SEEK_END (defined above).
+; returns whatever the syscall returns; callers in tokenize.b treat it as the resulting file offset.
+function lseek
+ argument fd
+ argument offset
+ argument whence
+ return syscall(8, fd, offset, whence)
+
function isupper
argument c
if c < 'A goto return_0