1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
|
; list of every file name seen so far, stored back to back: each name ends with a
; 0 byte, and a single 255 byte follows the last name. initialized in main.
global file_list
; get the name of the file with the given index.
; idx is the 0-based position of the name in file_list (the numbering file_get_index uses).
; returns a pointer to the null-terminated name stored inside file_list.
; if idx is not the position of a name, an error message is written with fputs(2, ...)
; and the program exits with status 1.
function file_get
argument idx
local p
p = file_list
:file_get_loop
; the terminator is tested before idx, so that an index equal to the number of
; files is reported as an error instead of returning a pointer to the 255 byte.
if *1p == 255 goto file_uhoh
if idx == 0 goto file_got
idx -= 1
; step over the current name and its null terminator
p = memchr(p, 0)
p += 1
goto file_get_loop
:file_got
return p
:file_uhoh
fputs(2, .str_bad_file_index)
exit(1)
:str_bad_file_index
string Bad file index. This shouldn't happen.
byte 10
byte 0
; look filename up in file_list.
; returns the 0-based position of the first name equal to filename.
; if no name matches, control leaves through the return_minus1 label (defined
; outside this section), i.e. the result is -1.
function file_get_index
argument filename
local file_idx
local name_ptr
local is_match
file_idx = 0
name_ptr = file_list
:file_get_index_loop
; a 255 byte where a name would start marks the end of the list
if *1name_ptr == 255 goto return_minus1
is_match = str_equals(name_ptr, filename)
if is_match != 0 goto file_found
; no match: move past this name and its null terminator, and count it
name_ptr = memchr(name_ptr, 0)
name_ptr += 1
file_idx += 1
goto file_get_index_loop
:file_found
return file_idx
; make sure filename is in file_list, appending it if it is missing.
; when filename is already listed, nothing is changed and control leaves
; through the return_0 label (defined outside this section).
function file_add
argument filename
local found_at
local tail
found_at = file_get_index(filename)
if found_at != -1 goto return_0
; not listed yet: copy the name over the 255 terminator...
tail = memchr(file_list, 255)
tail = strcpy(tail, filename)
; ...and write a new terminator one byte past the pointer strcpy returned
; (presumably the copied name's null terminator -- TODO confirm against strcpy)
tail += 1
*1tail = 255
return
; look keyword_str up in .keyword_table.
; the table is walked as a series of entries, each an ID byte followed by a
; null-terminated string; a 255 byte in the ID position ends the table.
; returns the ID of the first entry whose string equals keyword_str, or 0 if there is none.
function get_keyword_id
argument keyword_str
local entry
local kw_id
local is_match
entry = .keyword_table
:keyword_id_loop
kw_id = *1entry
if kw_id == 255 goto no_such_keyword_str
; step from the ID byte to the entry's string
entry += 1
is_match = str_equals(keyword_str, entry)
if is_match != 0 goto got_keyword_id
; skip the string and its null terminator to reach the next entry
entry = memchr(entry, 0)
entry += 1
goto keyword_id_loop
:got_keyword_id
return kw_id
:no_such_keyword_str
return 0
; get string associated with keyword id, or "@BAD_KEYWORD_ID" if it's not a keyword.
; .keyword_table is walked as a series of entries, each an ID byte followed by a
; null-terminated string; a 255 byte in the ID position ends the table.
; the returned pointer refers to the string stored in the table itself (or to the
; fallback string below), not to a copy.
function get_keyword_str
argument keyword_id
local p
local c
p = .keyword_table
:keyword_str_loop
c = *1p
if c == 255 goto no_such_keyword_id
if c == keyword_id goto found_keyword_id
; skip the rest of this entry: memchr finds the string's null terminator,
; and the byte after it is the next entry's ID
p = memchr(p, 0)
p += 1
goto keyword_str_loop
:found_keyword_id
; the string starts right after the ID byte
return p + 1
:no_such_keyword_id
return .str_no_such_keyword_id
:str_no_such_keyword_id
string @BAD_KEYWORD_ID
byte 0
; file offset to write next piece of read-only data; initialized in main.b
global rodata_end_offset
; turn pptokens into tokens, written to out.
; tokens are 16 bytes and have the following format:
; uchar type
; uchar info
; ushort file
; uint line
; ulong data
; type is either a keyword ID (as returned by get_keyword_id) or one of the TOKEN_* constants.
; info holds the number suffix for integer and floating-point constants; for every
; other token this function skips the byte without writing it.
; file is the index of the file name in file_list, and line is the line number.
; data depends on the type:
;   identifier - pointer to the name inside pptokens
;   integer and character constants - the value
;   floating-point constant - the bits of the value in double format
;   string literal - the address computed for the string; its bytes are written to output_fd
;   keyword - 0
;   EOF - not written
; the input is read as a sequence of null-terminated pptokens; an empty pptoken ends it.
; a pptoken starting with $ is a line directive: $, the line number, a space, the file name.
; This corresponds to translation phases 5-6 and the first half of 7
; IMPORTANT: this function uses pointers to pptokens, so it should NOT be freed!
; Returns a pointer to the end of tokens.
function tokenize
argument pptokens
argument out
; you might think we wouldn't need these arguments because the pptokens array starts with
; a line directive. but we also use this function to tokenize the expression of a #if,
; where that isn't the case.
argument initial_filename
argument initial_line_number
; in: current position in pptokens. file / line_number: location recorded in each token.
local in
local file
local line_number
; b, c, n, p: scratch values. data: value for the data field of the token being written.
local b
local c
local n
local p
local data
; the remaining locals are used while converting a floating-point constant
local significand
local exponent
local new_exponent
local pow10
local integer
local fraction
; NOTE(review): lower and upper are not referenced anywhere in this function
local lower
local upper
file_add(initial_filename)
file = file_get_index(initial_filename)
line_number = initial_line_number
in = pptokens
:tokenize_loop
; decide what kind of pptoken this is from its first character
c = *1in
if c == '$ goto tokenize_line_directive
if c == 32 goto tokenize_skip_pptoken
if c == 10 goto tokenize_newline
if c == '' goto tokenize_constant_char
if c == '" goto tokenize_string_literal
if c == 0 goto tokenize_loop_end
; keywords are checked before numbers and identifiers
b = get_keyword_id(in)
if b != 0 goto tokenize_keyword
b = isdigit_or_dot(c)
if b != 0 goto tokenize_number
; it's an identifier. we just need to make sure it's made up of identifier characters.
p = in
b = isalpha_or_underscore(*1p)
if b == 0 goto bad_token
:ident_check_loop
b = isalnum_or_underscore(*1p)
if b == 0 goto bad_token
p += 1
if *1p != 0 goto ident_check_loop
; all good.
*1out = TOKEN_IDENTIFIER
out += 2 ; no info
data = in ; data will point to the identifier name
pptoken_skip(&in)
goto token_output
:tokenize_newline
; newline pptokens produce no token; they only advance the line counter
line_number += 1
pptoken_skip(&in)
goto tokenize_loop
:tokenize_skip_pptoken
pptoken_skip(&in)
goto tokenize_loop
:tokenize_line_directive
; skip the $, read the line number, then find the space that precedes the file name
in += 1
line_number = stoi(in)
in = memchr(in, 32)
in += 1
; the file name runs to the end of the pptoken; register it and switch to its index
file_add(in)
file = file_get_index(in)
pptoken_skip(&in)
goto tokenize_loop
:token_no_data
data = 0
; (fallthrough)
:token_output ; write token location & data (see local variable data), and continue tokenizing
; on entry, out has already been advanced past the type and info bytes
*2out = file
out += 2
*4out = line_number
out += 4
*8out = data
out += 8
goto tokenize_loop
:tokenize_keyword
pptoken_skip(&in)
*1out = b ; type
; no info for keywords
out += 2
goto token_no_data
:tokenize_number
; first, check if it's a float
b = strchr(in, '.)
if b != 0 goto tokenize_float
b = strchr(in, 'x) ; e may appear in hex integer literals, so we need to check this
if b != 0 goto tokenize_hex_integer
b = strchr(in, 'X)
if b != 0 goto tokenize_hex_integer
b = strchr(in, 'e) ; exponent
if b != 0 goto tokenize_float
b = strchr(in, 'E) ; exponent
if b != 0 goto tokenize_float
if *1in == '0 goto tokenize_octal_integer ; fun fact: in the C89 standard, 0 is considered an octal integer
; plain ol' decimal constant
n = strtoi(&in, 10)
goto tokenize_finish_integer
:tokenize_hex_integer
; the literal must begin with 0x or 0X
if *1in != '0 goto bad_number_token
in += 1
c = *1in
c &= 223 ; 223 = ~32 -- remove case
if c != 'X goto bad_number_token
in += 1
n = strtoi(&in, 16)
goto tokenize_finish_integer
:tokenize_octal_integer
in += 1 ; skip 0
n = strtoi(&in, 8)
goto tokenize_finish_integer
:tokenize_finish_integer
; n holds the value; whatever follows the digits must be a valid suffix
c = read_number_suffix(file, line_number, &in)
if c == NUMBER_SUFFIX_F goto f_suffix_on_integer
in += 1 ; move past null separator
*1out = TOKEN_CONSTANT_INT
out += 1
*1out = c ; info = suffix
out += 1
data = n
goto token_output
:tokenize_constant_char
; skip the opening ', read exactly one (possibly escaped) character, and require the closing '
in += 1
c = read_c_char(&in)
if *1in != '' goto bad_char_constant
; this comparison also rejects the -1 that read_c_char returns for a bad character
if c ] 255 goto bad_char_constant
pptoken_skip(&in)
*1out = TOKEN_CONSTANT_CHAR
out += 2 ; no info
data = c
goto token_output
:tokenize_string_literal
; the string is written to output_fd starting at file offset rodata_end_offset
n = rodata_end_offset - RODATA_OFFSET
n += RODATA_ADDR ; address of string
lseek(output_fd, rodata_end_offset, SEEK_SET)
:string_literal_loop
in += 1 ; skip opening "
:string_literal_char_loop
if *1in == '" goto string_literal_char_loop_end
c = read_c_char(&in)
if c ] 255 goto bad_char_in_string
fputc(output_fd, c)
goto string_literal_char_loop
:string_literal_char_loop_end
pptoken_skip(&in) ; skip closing "
pptoken_skip_spaces(&in)
if *1in == '" goto string_literal_loop ; string concatenation, e.g. "Hello, " "world!"
fputc(output_fd, 0) ; null terminator
; remember where the next piece of read-only data should go
rodata_end_offset = lseek(output_fd, 0, SEEK_CUR)
*1out = TOKEN_STRING_LITERAL
out += 2 ; no info
data = n
goto token_output
:tokenize_float
; @NONSTANDARD: this doesn't allow for floats whose integral part is >=2^64, e.g. 1000000000000000000000000.0
; the value is built up as significand * 2^exponent
significand = 0
exponent = 0
pow10 = 0
integer = strtoi(&in, 10)
fraction = 0
if *1in != '. goto float_no_fraction
in += 1
p = in
fraction = strtoi(&in, 10)
; e.g. to turn 35 into .35, multiply by 10^-2
; (p - in is minus the number of characters strtoi consumed)
pow10 = p - in
if pow10 < -400 goto bad_float
:float_no_fraction
; construct the number integer + fraction*10^pow10
; first, deal with the fractional part
significand = fraction
float_multiply_by_power_of_10(&significand, &exponent, pow10)
if integer == 0 goto float_no_integer
; now deal with the integer part
; pick a new exponent based on the position of the integer's leftmost 1 bit, shift the
; fractional significand over to that exponent, and add the integer shifted the same way
; NOTE(review): this relies on how right_shift treats negative shift amounts -- confirm
new_exponent = leftmost_1bit(integer)
new_exponent -= 58
n = new_exponent - exponent
significand = right_shift(significand, n)
exponent = new_exponent
significand += right_shift(integer, exponent)
:float_no_integer
if *1in == 'e goto float_exponent
if *1in == 'E goto float_exponent
:float_have_significand_and_exponent
if significand == 0 goto float_zero
normalize_float(&significand, &exponent)
; putn(significand)
; putc(32)
; putn_signed(exponent)
; putc(10)
; make number round to the nearest representable float roughly (this is what gcc does)
; this fails for 5e-100 probably because of imprecision, but mostly works
significand += 15
; reduce to 53-bit significand (top bit is removed to get 52)
significand >= 5
exponent += 5
exponent += 52 ; 1001010111... => 1.001010111...
; clear the leftmost 1 bit of the significand
n = leftmost_1bit(significand)
b = 1 < n
significand &= ~b
data = significand
if exponent <= -1023 goto float_zero ; this number is too small in magnitude to be represented as a double. it becomes 0
if exponent >= 1024 goto float_infinity ; number too big to be represented as a double.
exponent += 1023 ; float format
; place the biased exponent above the 52 significand bits
data |= exponent < 52
:float_have_data
*1out = TOKEN_CONSTANT_FLOAT
out += 1
; suffix
*1out = read_number_suffix(file, line_number, &in)
pptoken_skip(&in)
out += 1
goto token_output
:float_exponent
; skip the e or E, then read an optionally signed decimal exponent into pow10
; NOTE(review): unlike the fraction case above, pow10 is not range-checked here before
; it is used to index powers_of_10 -- confirm how large that table is
in += 1
if *1in == '+ goto float_exponent_plus
if *1in == '- goto float_exponent_minus
; e.g. 1e100
pow10 = strtoi(&in, 10)
:float_have_exponent
float_multiply_by_power_of_10(&significand, &exponent, pow10)
goto float_have_significand_and_exponent
:float_exponent_plus
; e.g. 1e+100
in += 1
pow10 = strtoi(&in, 10)
goto float_have_exponent
:float_exponent_minus
; e.g. 1e-100
in += 1
pow10 = strtoi(&in, 10)
pow10 = 0 - pow10
goto float_have_exponent
:float_zero
data = 0
goto float_have_data
:float_infinity
data = 0x7ff0000000000000 ; double infinity
goto float_have_data
:tokenize_loop_end
; EOF token
; type, file and line are written; the info byte and the data field are skipped
*1out = TOKEN_EOF
out += 2
*2out = file
out += 2
*4out = line_number
out += 12
return out
; error exits. each call below is followed directly by its message string, so
; compile_error is presumably not expected to return -- TODO confirm
:f_suffix_on_integer
compile_error(file, line_number, .str_f_suffix_on_integer)
:str_f_suffix_on_integer
string Integer with f suffix.
byte 0
:bad_number_token
compile_error(file, line_number, .str_bad_number_token)
:str_bad_number_token
string Bad number literal.
byte 0
:bad_char_constant
compile_error(file, line_number, .str_bad_char_constant)
:str_bad_char_constant
string Bad character constant. Note that multibyte constants are not supported.
byte 0
:bad_char_in_string
compile_error(file, line_number, .str_bad_char_in_string)
:str_bad_char_in_string
string Bad character in string literal.
byte 0
:bad_token
compile_error(file, line_number, .str_bad_token)
:str_bad_token
string Bad token.
byte 0
:bad_float
compile_error(file, line_number, .str_bad_float)
:str_bad_float
string Bad floating-point number.
byte 0
; multiply the number (*p_significand) * 2^(*p_exponent) by 10^pow10, in place.
; p_significand and p_exponent point to 8-byte values that are read and then overwritten.
; powers_of_10 is indexed by pow10 with 16 bytes per entry: an 8-byte significand
; followed by an 8-byte exponent. pow10 is not range-checked here.
; always returns 0.
function float_multiply_by_power_of_10
argument p_significand
argument p_exponent
argument pow10
local sig
local expo
local entry
local prod_lo
local prod_hi
local nshift
sig = *8p_significand
expo = *8p_exponent
; address of the table entry for 10^pow10
entry = pow10 < 4
entry += powers_of_10
full_multiply_signed(sig, *8entry, &prod_lo, &prod_hi)
; when the upper half of the product is zero, the lower half is used as it is
sig = prod_lo
if prod_hi == 0 goto fmultiply2_cont
; otherwise shift the product right far enough to move every bit of the upper half
; into the lower half, and make up for the shift in the exponent
nshift = leftmost_1bit(prod_hi)
nshift += 1
sig = prod_lo > nshift
expo += nshift
nshift = 64 - nshift
sig |= prod_hi < nshift
:fmultiply2_cont
; add the exponent stored in the second half of the table entry
entry += 8
expo += *8entry
*8p_exponent = expo
*8p_significand = sig
return 0
; read one character, or one backslash escape sequence, starting at *p_in.
; on success, *p_in is advanced past what was read and the character value is returned.
; on a bad escape sequence, control leaves through return_minus1 (i.e. -1 is
; returned) and *p_in is left unchanged.
function read_c_char
argument p_in
local s
local ch
local is_oct
local hex_start
s = *8p_in
ch = *1s
s += 1
; anything other than a backslash stands for itself
if ch != '\ goto escape_sequence_return
; read the character that follows the backslash
ch = *1s
s += 1
; for these four, the escaped character is itself the result
if ch == '' goto escape_sequence_return
if ch == '" goto escape_sequence_return
if ch == '? goto escape_sequence_return
if ch == '\ goto escape_sequence_return
if ch == 'x goto escape_sequence_hex
if ch == 'a goto escape_sequence_bell
if ch == 'b goto escape_sequence_backspace
if ch == 'f goto escape_sequence_form_feed
if ch == 'n goto escape_sequence_newline
if ch == 'r goto escape_sequence_carriage_return
if ch == 't goto escape_sequence_tab
if ch == 'v goto escape_sequence_vertical_tab
; octal: one to three digits, the first of which is already in ch
is_oct = isoctdigit(ch)
if is_oct == 0 goto return_minus1
ch -= '0
; optional second digit
is_oct = isoctdigit(*1s)
if is_oct == 0 goto escape_sequence_return
ch <= 3
ch += *1s - '0
s += 1
; optional third digit
is_oct = isoctdigit(*1s)
if is_oct == 0 goto escape_sequence_return
ch <= 3
ch += *1s - '0
s += 1
if ch ] 255 goto return_minus1 ; e.g. '\712'
goto escape_sequence_return
:escape_sequence_hex
; strtoi must consume at least one digit, and the value must fit in a byte
hex_start = s
ch = strtoi(&s, 16)
if s == hex_start goto return_minus1 ; e.g. '\xhello'
if ch ] 255 goto return_minus1 ; e.g. '\xabc'
goto escape_sequence_return
:escape_sequence_bell
ch = 7
goto escape_sequence_return
:escape_sequence_backspace
ch = 8
goto escape_sequence_return
:escape_sequence_tab
ch = 9
goto escape_sequence_return
:escape_sequence_newline
ch = 10
goto escape_sequence_return
:escape_sequence_vertical_tab
ch = 11
goto escape_sequence_return
:escape_sequence_form_feed
ch = 12
goto escape_sequence_return
:escape_sequence_carriage_return
ch = 13
:escape_sequence_return
*8p_in = s
return ch
; read the suffix of a numeric constant.
; p_s points to a pointer to the first character after the digits; the suffix must
; run up to the null byte that ends the pptoken.
; accepted suffixes (each letter in either case, as C89 allows):
;   none -> 0
;   u -> NUMBER_SUFFIX_U
;   l -> NUMBER_SUFFIX_L
;   ul or lu -> NUMBER_SUFFIX_UL
;   f -> NUMBER_SUFFIX_F
; on success the suffix value is returned and *p_s is left pointing at the null byte.
; anything else is reported through compile_error using file and line_number.
function read_number_suffix
argument file
argument line_number
argument p_s
local s
local c
local suffix
s = *8p_s
c = *1s
suffix = 0
if c == 0 goto number_suffix_return
if c == 'u goto number_suffix_u
if c == 'U goto number_suffix_u
if c == 'l goto number_suffix_l
if c == 'L goto number_suffix_l
if c == 'f goto number_suffix_f
if c == 'F goto number_suffix_f
goto bad_number_suffix
:number_suffix_u
; u may be followed by l, or end the token
s += 1
c = *1s
if c == 'l goto number_suffix_ul
if c == 'L goto number_suffix_ul
if c != 0 goto bad_number_suffix
suffix = NUMBER_SUFFIX_U
goto number_suffix_return
:number_suffix_l
; l may be followed by u, or end the token
s += 1
c = *1s
if c == 'u goto number_suffix_ul
if c == 'U goto number_suffix_ul
if c != 0 goto bad_number_suffix
suffix = NUMBER_SUFFIX_L
goto number_suffix_return
:number_suffix_ul
; nothing may follow the second letter
s += 1
c = *1s
if c != 0 goto bad_number_suffix
suffix = NUMBER_SUFFIX_UL
goto number_suffix_return
:number_suffix_f
; nothing may follow f
s += 1
c = *1s
if c != 0 goto bad_number_suffix
suffix = NUMBER_SUFFIX_F
goto number_suffix_return
:number_suffix_return
*8p_s = s
return suffix
:bad_number_suffix
; the message string follows the call directly, so compile_error is presumably
; not expected to return -- TODO confirm
compile_error(file, line_number, .str_bad_number_suffix)
:str_bad_number_suffix
string Bad number suffix.
byte 0
; print the 16-byte tokens stored from tokens up to (not including) tokens_end,
; separated by spaces and followed by a newline.
; printing also stops early at a token whose type byte is 0.
; most tokens are printed as  name@file:line=data  ; integer constants add ~info
; after the name, and floating-point constants are printed as float followed by
; the data field written with putx64.
; a type that is not recognized makes the program write a message with fputs(2, ...)
; and exit with status 1.
function print_tokens
argument tokens
argument tokens_end
local tok
local tok_type
local field
tok = tokens
:print_tokens_loop
if tok ]= tokens_end goto print_tokens_loop_end
tok_type = *1tok
if tok_type == 0 goto print_tokens_loop_end
; types above 20 are treated as keyword IDs
if tok_type > 20 goto print_token_keyword
if tok_type == TOKEN_CONSTANT_INT goto print_token_int
if tok_type == TOKEN_CONSTANT_CHAR goto print_token_char
if tok_type == TOKEN_CONSTANT_FLOAT goto print_token_float
if tok_type == TOKEN_STRING_LITERAL goto print_token_string_literal
if tok_type == TOKEN_IDENTIFIER goto print_token_identifier
if tok_type == TOKEN_EOF goto print_token_eof
fputs(2, .str_print_bad_token)
exit(1)
:print_token_keyword
field = get_keyword_str(tok_type)
puts(field)
goto print_token_data
:print_token_int
puts(.str_constant_int)
goto print_token_info
:print_token_char
puts(.str_constant_char)
goto print_token_data
:print_token_string_literal
puts(.str_string_literal)
goto print_token_data
:print_token_identifier
; the data field (offset 8) holds a pointer to the identifier's name
field = tok + 8
puts(*8field)
goto print_token_data
:print_token_float
puts(.str_constant_float)
field = tok + 8
putx64(*8field)
putc(32)
tok += 16
goto print_tokens_loop
:print_token_eof
puts(.str_eof)
goto print_token_data
:print_token_info
; the info byte is at offset 1
putc('~)
field = tok + 1
putn(*1field)
:print_token_data
; file (offset 2, 2 bytes), line (offset 4, 4 bytes), data (offset 8, 8 bytes)
putc('@)
field = tok + 2
putn(*2field)
putc(':)
field = tok + 4
putn(*4field)
putc(61)
field = tok + 8
putn(*8field)
putc(32)
tok += 16
goto print_tokens_loop
:print_tokens_loop_end
putc(10)
return
:str_constant_int
string integer
byte 0
:str_constant_float
string float
byte 0
:str_constant_char
string character
byte 0
:str_string_literal
string string
byte 0
:str_print_bad_token
string Unrecognized token type in print_tokens. Aborting.
byte 10
byte 0
:str_eof
string EOF
byte 0
|