summaryrefslogtreecommitdiff
path: root/05/tokenize.b
blob: 977b162e6e88e873178585cb3756cbc1e89ad0d0 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
; list of the names of all files seen so far. a file's index is its position in this list
; (see file_get, file_get_index and file_add).
global file_list ; initialized in main -- null-separated 255-terminated array of strings

; get the name of the file with the given index.
; walks file_list (null-separated names, terminated by a 255 byte), skipping idx names.
; returns a pointer to the name. if idx is not the index of a file in the list,
; prints an error message to file descriptor 2 and exits with status 1.
function file_get
	argument idx
	local p
	p = file_list
	:file_get_loop
		; check for the terminator before accepting p: otherwise idx == (number of files)
		; would return a pointer to the 255 terminator rather than to a file name.
		if *1p == 255 goto file_uhoh
		if idx == 0 goto file_got
		idx -= 1
		; skip over this name and its null separator
		p = memchr(p, 0)
		p += 1
		goto file_get_loop
	:file_got
	return p
	:file_uhoh
	fputs(2, .str_bad_file_index)
	exit(1)
	:str_bad_file_index
		string Bad file index. This shouldn't happen.
		byte 10
		byte 0
	
; look up filename in file_list.
; returns its index (0 for the first name in the list), or -1 if it is not in the list.
function file_get_index
	argument filename
	local s
	local n
	local b
	n = 0
	s = file_list
	:file_get_index_loop
		; a 255 byte where a name would start marks the end of the list
		if *1s == 255 goto return_minus1
		b = str_equals(s, filename)
		if b != 0 goto file_found
		; no match: step past this name and its null separator, and count it
		s = memchr(s, 0)
		s += 1
		n += 1
		goto file_get_index_loop
	:file_found
	return n
	
; append filename to file_list, unless the list already contains it
function file_add
	argument filename
	local i
	local s
	i = file_get_index(filename)
	; anything other than -1 means the name is already in the list
	if i != -1 goto return_0
	; find the 255 terminator and write the new name over it
	s = memchr(file_list, 255)
	; NOTE(review): this relies on strcpy returning a pointer to the end of the copied name
	; (its null terminator) -- confirm against the definition of strcpy
	s = strcpy(s, filename)
	; step over the null separator and put the 255 terminator back
	s += 1
	*1s = 255
	return

; look up keyword_str in .keyword_table.
; returns the ID of the matching keyword, or 0 if keyword_str is not a keyword.
function get_keyword_id
	argument keyword_str
	local s
	local n
	local b
	s = .keyword_table
	:keyword_id_loop
		; each table entry is read as an ID byte followed by a null-terminated string;
		; an ID byte of 255 ends the table
		n = *1s
		if n == 255 goto no_such_keyword_str
		s += 1
		b = str_equals(keyword_str, s)
		if b != 0 goto got_keyword_id
		; not this one: skip the string and its null terminator
		s = memchr(s, 0)
		s += 1
		goto keyword_id_loop
	:got_keyword_id
	return n
	:no_such_keyword_str
	return 0

; get the string associated with keyword_id.
; returns a pointer to the keyword's null-terminated string inside .keyword_table,
; or a pointer to the string "@BAD_KEYWORD_ID" if no keyword has that ID.
function get_keyword_str
	argument keyword_id
	local p
	local c
	p = .keyword_table
	:keyword_str_loop
		; each table entry is read as an ID byte followed by a null-terminated string;
		; an ID byte of 255 ends the table
		c = *1p
		if c == 255 goto no_such_keyword_id
		if c == keyword_id goto found_keyword_id
		; not this one: skip to just after the string's null terminator
		p = memchr(p, 0)
		p += 1
		goto keyword_str_loop
	:found_keyword_id
		; the string starts right after the ID byte
		return p + 1
	:no_such_keyword_id
		return .str_no_such_keyword_id
	:str_no_such_keyword_id
		string @BAD_KEYWORD_ID
		byte 0
	

; turn pptokens into tokens, written to out.
; tokens are 16 bytes and have the following format:
;    uchar type
;    uchar info
;    ushort file
;    uint line
;    ulong data  -- for int/float literals, the value; for string literals, the runtime address; for identifiers, the name of the identifier
; This corresponds to translation phases 5-6 and the first half of 7
; IMPORTANT: identifier tokens store pointers into pptokens, so pptokens should NOT be freed!
; As read by this function, pptokens is a sequence of null-terminated strings:
;    a pptoken starting with $ is a line directive, one starting with a space (32) is skipped,
;    one starting with a newline (10) increments the line number, and an empty pptoken ends the input.
; The last token written is always a TOKEN_EOF token.
; Returns a pointer to the end of tokens.
function tokenize
	argument pptokens
	argument out
	; you might think we wouldn't need these arguments because the pptokens array starts with
	; a line directive. but we also use this function to tokenize the expression of a #if,
	; where that isn't the case.
	argument initial_filename
	argument initial_line_number
	; in: current position in pptokens
	; file, line_number: location written into each token (file is an index into file_list)
	; b, c, n, p: scratch values
	; data: value for the data field of the token being written (see token_output)
	; significand, exponent, new_exponent, pow10, integer, fraction: used to build float literals
	; NOTE(review): lower and upper are declared but never used in this function.
	local in
	local file
	local line_number
	local b
	local c
	local n
	local p
	local data
	local significand
	local exponent
	local new_exponent
	local pow10
	local integer
	local fraction
	local lower
	local upper
	
	; start out at initial_filename:initial_line_number
	file_add(initial_filename)
	file = file_get_index(initial_filename)
	line_number = initial_line_number
	
	
	in = pptokens
	:tokenize_loop
		; dispatch on the first character of the current pptoken
		c = *1in
		if c == '$ goto tokenize_line_directive
		if c == 32 goto tokenize_skip_pptoken
		if c == 10 goto tokenize_newline
		if c == '' goto tokenize_constant_char
		if c == '" goto tokenize_string_literal
		if c == 0 goto tokenize_loop_end
		
		; keywords are checked before numbers and identifiers
		b = get_keyword_id(in)
		if b != 0 goto tokenize_keyword
		
		b = isdigit_or_dot(c)
		if b != 0 goto tokenize_number
		
		; it's an identifier. we just need to make sure it's made up of identifier characters.
		p = in
		b = isalpha_or_underscore(*1p)
		if b == 0 goto bad_token
		
		:ident_check_loop
			b = isalnum_or_underscore(*1p)
			if b == 0 goto bad_token
			p += 1
			if *1p != 0 goto ident_check_loop
		; all good.
		*1out = TOKEN_IDENTIFIER
		out += 2 ; no info
		data = in ; data will point to the identifier name
		pptoken_skip(&in)
		goto token_output
		
		:tokenize_newline
			line_number += 1
			pptoken_skip(&in)
			goto tokenize_loop
		:tokenize_skip_pptoken
			pptoken_skip(&in)
			goto tokenize_loop
		:tokenize_line_directive
			; as read here, a line directive is: $, the line number, a space, the file name
			in += 1
			line_number = stoi(in)
			in = memchr(in, 32)
			in += 1
			file_add(in)
			file = file_get_index(in)
			pptoken_skip(&in)
			goto tokenize_loop
		:token_no_data
			data = 0
			; (fallthrough)
		:token_output ; write token location & data (see local variable data), and continue tokenizing
			; the type and info bytes have already been dealt with (out was advanced by 2) by whoever jumps here
			*2out = file
			out += 2
			*4out = line_number
			out += 4
			*8out = data
			out += 8
			goto tokenize_loop
		:tokenize_keyword
			; b still holds the keyword ID from get_keyword_id, which is used directly as the token type
			pptoken_skip(&in)
			*1out = b ; type
			; no info for keywords
			out += 2
			goto token_no_data
		:tokenize_number
			; first, check if it's a float
			b = strchr(in, '.)
			if b != 0 goto tokenize_float
			b = strchr(in, 'x) ; e may appear in hex integer literals, so we need to check this
			if b != 0 goto tokenize_hex_integer
			b = strchr(in, 'X)
			if b != 0 goto tokenize_hex_integer
			b = strchr(in, 'e) ; exponent
			if b != 0 goto tokenize_float
			b = strchr(in, 'E) ; exponent
			if b != 0 goto tokenize_float
			if *1in == '0 goto tokenize_octal_integer ; fun fact: in the C89 standard, 0 is considered an octal integer
				; plain ol' decimal constant
				n = strtoi(&in, 10)
				goto tokenize_finish_integer
			:tokenize_hex_integer
				; the literal must begin with exactly 0x or 0X
				if *1in != '0 goto bad_number_token
				in += 1
				c = *1in
				c &= 223 ; 223 = ~32 -- remove case
				if c != 'X goto bad_number_token
				in += 1
				n = strtoi(&in, 16)
				goto tokenize_finish_integer
			:tokenize_octal_integer
				in += 1 ; skip 0
				n = strtoi(&in, 8)
				goto tokenize_finish_integer
			:tokenize_finish_integer
			; n holds the value, and in points just after the digits; whatever is left must be a valid suffix
			c = read_number_suffix(file, line_number, &in)
			if c == NUMBER_SUFFIX_F goto f_suffix_on_integer
			in += 1 ; move past null separator
			*1out = TOKEN_CONSTANT_INT
			out += 1
			*1out = c ; info = suffix
			out += 1
			data = n
			goto token_output
		:tokenize_constant_char
			in += 1
			c = read_c_char(&in)
			; exactly one character must be followed by the closing '
			if *1in != '' goto bad_char_constant
			; ] is an unsigned comparison, so this also catches the -1 error value from read_c_char
			if c ] 255 goto bad_char_constant
			pptoken_skip(&in)
			*1out = TOKEN_CONSTANT_CHAR
			out += 2 ; no info
			data = c
			goto token_output
		:tokenize_string_literal
			; the characters of the string are appended to output_file_data at offset rodata_end_addr;
			; the token's data is the value of rodata_end_addr from before the string was added.
			data = rodata_end_addr
			p = output_file_data + rodata_end_addr
			
			:string_literal_loop
				in += 1 ; skip opening "
				:string_literal_char_loop
					if *1in == '" goto string_literal_char_loop_end
					c = read_c_char(&in)
					if c ] 255 goto bad_char_in_string
					*1p = c
					p += 1
					goto string_literal_char_loop					
				:string_literal_char_loop_end
				pptoken_skip(&in) ; skip closing "
				pptoken_skip_spaces(&in)
				if *1in == '" goto string_literal_loop ; string concatenation, e.g. "Hello, " "world!"
			*1p = 0 ; null terminator
			p += 1
			rodata_end_addr = p - output_file_data
			
			*1out = TOKEN_STRING_LITERAL
			out += 2 ; no info
			goto token_output
		:tokenize_float
			; the number is built up as significand * 2^exponent, then packed into the bits of a double.
			; @NONSTANDARD: this doesn't allow for floats whose integral part is >=2^64, e.g. 1000000000000000000000000.0
			significand = 0
			exponent = 0
			pow10 = 0
			integer = strtoi(&in, 10)
			fraction = 0
			if *1in != '. goto float_no_fraction 
			in += 1
			p = in
			fraction = strtoi(&in, 10)
			; e.g. to turn 35 into .35, multiply by 10^-2
			; (p - in is minus the number of characters strtoi consumed)
			pow10 = p - in
			if pow10 < -400 goto bad_float
			:float_no_fraction
			; construct the number integer + fraction*10^pow10
			; first, deal with the fractional part
			significand = fraction
			float_multiply_by_power_of_10(&significand, &exponent, pow10)
			
			if integer == 0 goto float_no_integer
			; now deal with the integer part
			; pick a new exponent based on the position of the integer's top bit, shift the
			; fractional part to match that exponent, then add in the integer shifted the same way.
			new_exponent = leftmost_1bit(integer)
			new_exponent -= 58
			n = new_exponent - exponent
			significand = right_shift(significand, n)
			exponent = new_exponent
			significand += right_shift(integer, exponent)
			:float_no_integer
			if *1in == 'e goto float_exponent
			if *1in == 'E goto float_exponent
			
			:float_have_significand_and_exponent
			if significand == 0 goto float_zero
			normalize_float(&significand, &exponent)
			; putn(significand)
			; putc(32)
			; putn_signed(exponent)
			; putc(10)
			; make number round to the nearest representable float roughly (this is what gcc does)
			; this fails for 5e-100 probably because of imprecision, but mostly works
			significand += 15
			; reduce to 53-bit significand (top bit is removed to get 52)
			; (>= is a right shift here; the 5 bits shifted out are made up for by adding 5 to the exponent)
			significand >= 5
			exponent += 5
			exponent += 52  ; 1001010111... => 1.001010111... 
			; clear the leading 1 bit, which is not stored
			n = leftmost_1bit(significand)
			b = 1 < n
			significand &= ~b
			data = significand
			
			if exponent <= -1023 goto float_zero ; this number is too small in magnitude to be represented as a double. it becomes 0
			if exponent >= 1024 goto float_infinity ; number too big to be represented as a double.
			exponent += 1023 ; float format
			
			; the biased exponent goes just above the 52 significand bits
			data |= exponent < 52
			
			:float_have_data
			*1out = TOKEN_CONSTANT_FLOAT
			out += 1
			; suffix
			*1out = read_number_suffix(file, line_number, &in)
			pptoken_skip(&in)
			out += 1
			goto token_output
			:float_exponent
				; in points to the e or E. read the (possibly signed) decimal exponent into pow10
				; and multiply the number built so far by 10^pow10.
				; NOTE(review): pow10 is not range-checked here before float_multiply_by_power_of_10
				; uses it to index powers_of_10 (only the fraction length is checked, against -400) --
				; confirm the table is large enough for any exponent that can appear.
				in += 1
				if *1in == '+ goto float_exponent_plus
				if *1in == '- goto float_exponent_minus
				; e.g. 1e100
				pow10 = strtoi(&in, 10)
				:float_have_exponent
				float_multiply_by_power_of_10(&significand, &exponent, pow10)
				goto float_have_significand_and_exponent
			:float_exponent_plus
				; e.g. 1e+100
				in += 1
				pow10 = strtoi(&in, 10)
				goto float_have_exponent
			:float_exponent_minus
				; e.g. 1e-100
				in += 1
				pow10 = strtoi(&in, 10)
				pow10 = 0 - pow10
				goto float_have_exponent
			:float_zero
				data = 0
				goto float_have_data
			:float_infinity
				data = 0x7ff0000000000000 ; double infinity
				goto float_have_data
	:tokenize_loop_end
	; EOF token
	; only the type, file and line are written; the info and data fields are skipped over
	*1out = TOKEN_EOF
	out += 2
	*2out = file
	out += 2
	*4out = line_number
	out += 12
	
	return out
	; error handlers. each one is followed directly by its message data, so compile_error
	; presumably does not return -- TODO confirm
	:f_suffix_on_integer
		compile_error(file, line_number, .str_f_suffix_on_integer)
	:str_f_suffix_on_integer
		string Integer with f suffix.
		byte 0
	:bad_number_token
		compile_error(file, line_number, .str_bad_number_token)
	:str_bad_number_token
		string Bad number literal.
		byte 0
	:bad_char_constant
		compile_error(file, line_number, .str_bad_char_constant)
	:str_bad_char_constant
		string Bad character constant. Note that multibyte constants are not supported.
		byte 0
	:bad_char_in_string
		compile_error(file, line_number, .str_bad_char_in_string)
	:str_bad_char_in_string
		string Bad character in string literal.
		byte 0
	:bad_token
		compile_error(file, line_number, .str_bad_token)
	:str_bad_token
		string Bad token.
		byte 0
	:bad_float
		compile_error(file, line_number, .str_bad_float)
	:str_bad_float
		string Bad floating-point number.
		byte 0

; multiply the number (*p_significand) * 2^(*p_exponent) by 10^pow10, updating both values in place.
; powers_of_10 is indexed by pow10 (which may be negative) with 16-byte entries. judging by
; the reads below, each entry is an 8-byte value which the significand gets multiplied by,
; followed by an 8-byte value which gets added to the exponent.
; pow10 is not range-checked here; the caller must make sure it is within the table.
; always returns 0.
function float_multiply_by_power_of_10
	argument p_significand
	argument p_exponent
	argument pow10
	local significand
	local exponent
	local p
	local lower
	local upper
	local n
	significand = *8p_significand
	exponent = *8p_exponent

	; find the table entry for 10^pow10 (< is a left shift here: 16 bytes per entry)
	p = powers_of_10
	p += pow10 < 4
	; the product is returned in two 64-bit halves, lower and upper
	full_multiply_signed(significand, *8p, &lower, &upper)
	if upper == 0 goto fmultiply2_no_upper
		; the product does not fit in 64 bits: shift it right by n bits so that it does,
		; and add n to the exponent to make up for it.
		; presumably leftmost_1bit gives the index of the highest set bit, making n the
		; number of bits in use in upper -- TODO confirm
		n = leftmost_1bit(upper)
		n += 1
		significand = lower > n
		exponent += n
		; bring the bits of upper in above what is left of lower
		n = 64 - n
		significand |= upper < n
		goto fmultiply2_cont
	:fmultiply2_no_upper
		significand = lower
		goto fmultiply2_cont
	:fmultiply2_cont
	; second half of the table entry: the amount to add to the exponent
	p += 8
	exponent += *8p
	
	*8p_significand = significand
	*8p_exponent = exponent
	return 0
	
; read one character, possibly written as an escape sequence, from the string *p_in,
; and advance *p_in past it.
; returns the value of the character, or -1 if the escape sequence is not valid.
function read_c_char
	argument p_in
	local s
	local c
	local b
	s = *8p_in
	c = *1s
	s += 1
	; anything other than a backslash stands for itself
	if c != '\ goto escape_sequence_return
	; escape sequence: look at the character after the backslash
	c = *1s
	s += 1
	; \' \" \? and \\ stand for the character after the backslash, which c already holds
	if c == '' goto escape_sequence_return
	if c == '" goto escape_sequence_return
	if c == '? goto escape_sequence_return
	if c == '\ goto escape_sequence_return
	if c == 'x goto escape_sequence_hex
	if c == 'a goto escape_sequence_bell
	if c == 'b goto escape_sequence_backspace
	if c == 'f goto escape_sequence_form_feed
	if c == 'n goto escape_sequence_newline
	if c == 'r goto escape_sequence_carriage_return
	if c == 't goto escape_sequence_tab
	if c == 'v goto escape_sequence_vertical_tab
	; anything else has to be an octal escape of 1 to 3 digits; c is its first digit
	b = isoctdigit(c)
	if b == 0 goto return_minus1
	c -= '0
	; optional second digit
	b = isoctdigit(*1s)
	if b == 0 goto escape_sequence_return
	c <= 3
	c += *1s - '0
	s += 1
	; optional third digit
	b = isoctdigit(*1s)
	if b == 0 goto escape_sequence_return
	c <= 3
	c += *1s - '0
	s += 1
	if c ] 255 goto return_minus1 ; e.g. '\712'
	goto escape_sequence_return
	:escape_sequence_hex
		; b remembers where the hex digits start, to detect that there were none
		b = s
		c = strtoi(&s, 16)
		if s == b goto return_minus1 ; e.g. '\xhello'
		if c ] 255 goto return_minus1 ; e.g. '\xabc'
		goto escape_sequence_return
	:escape_sequence_bell
		c = 7
		goto escape_sequence_return
	:escape_sequence_backspace
		c = 8
		goto escape_sequence_return
	:escape_sequence_tab
		c = 9
		goto escape_sequence_return
	:escape_sequence_newline
		c = 10
		goto escape_sequence_return
	:escape_sequence_vertical_tab
		c = 11
		goto escape_sequence_return
	:escape_sequence_form_feed
		c = 12
		goto escape_sequence_return
	:escape_sequence_carriage_return
		c = 13
		goto escape_sequence_return
	:escape_sequence_return
	*8p_in = s
	return c
		
		
; read the suffix of a number literal.
; *p_s points to what is left of the literal after its digits. on return, *p_s points to
; the null terminator which ends the literal.
; returns 0 if there is no suffix, otherwise one of NUMBER_SUFFIX_U, NUMBER_SUFFIX_L,
; NUMBER_SUFFIX_UL (for ul or lu) and NUMBER_SUFFIX_F.
; as in C, each suffix letter may be lowercase or uppercase, e.g. 10UL, 10Lu, 1.5F.
; any other suffix is reported with compile_error at the location file, line_number.
function read_number_suffix
	argument file
	argument line_number
	argument p_s
	local s
	local c
	local suffix
	s = *8p_s
	c = *1s
	suffix = 0
	if c == 0 goto number_suffix_return
	if c == 'u goto number_suffix_u
	if c == 'U goto number_suffix_u
	if c == 'l goto number_suffix_l
	if c == 'L goto number_suffix_l
	if c == 'f goto number_suffix_f
	if c == 'F goto number_suffix_f
	goto bad_number_suffix
	:number_suffix_u
		; u can be followed by l, or be the whole suffix
		s += 1
		c = *1s
		if c == 'l goto number_suffix_ul
		if c == 'L goto number_suffix_ul
		if c != 0 goto bad_number_suffix
		suffix = NUMBER_SUFFIX_U
		goto number_suffix_return
	:number_suffix_l
		; l can be followed by u, or be the whole suffix
		s += 1
		c = *1s
		if c == 'u goto number_suffix_ul
		if c == 'U goto number_suffix_ul
		if c != 0 goto bad_number_suffix
		suffix = NUMBER_SUFFIX_L
		goto number_suffix_return
	:number_suffix_ul
		; nothing may follow ul / lu
		s += 1
		c = *1s
		if c != 0 goto bad_number_suffix
		suffix = NUMBER_SUFFIX_UL
		goto number_suffix_return
	:number_suffix_f
		; nothing may follow f
		s += 1
		c = *1s
		if c != 0 goto bad_number_suffix
		suffix = NUMBER_SUFFIX_F
		goto number_suffix_return
	:number_suffix_return
	*8p_s = s
	return suffix
	
	:bad_number_suffix
		compile_error(file, line_number, .str_bad_number_suffix)
	:str_bad_number_suffix
		string Bad number suffix.
		byte 0
	
; print the tokens from tokens up to (not including) tokens_end, followed by a newline.
; printing also stops early at a token whose type byte is 0.
; most tokens are printed as  name@file:line=data  followed by a space; integer constants
; also get ~info after the name, and float constants are printed as "float" followed by
; their data as printed by putx64.
; a token of unrecognized type makes the program print an error to file descriptor 2 and exit with status 1.
function print_tokens
	argument tokens
	argument tokens_end
	local p
	local s
	local c
	p = tokens
	:print_tokens_loop
		if p ]= tokens_end goto print_tokens_loop_end
		; c is the type byte of the token at p
		c = *1p
		if c == 0 goto print_tokens_loop_end
		; types above 20 are treated as keyword IDs
		if c > 20 goto print_token_keyword
		if c == TOKEN_CONSTANT_INT goto print_token_int
		if c == TOKEN_CONSTANT_CHAR goto print_token_char
		if c == TOKEN_CONSTANT_FLOAT goto print_token_float
		if c == TOKEN_STRING_LITERAL goto print_token_string_literal
		if c == TOKEN_IDENTIFIER goto print_token_identifier
		if c == TOKEN_EOF goto print_token_eof
		fputs(2, .str_print_bad_token)
		exit(1)
		:print_token_keyword
			s = get_keyword_str(c)
			puts(s)
			goto print_token_data
		:print_token_char
			puts(.str_constant_char)
			goto print_token_data
		:print_token_string_literal
			puts(.str_string_literal)
			goto print_token_data
		:print_token_identifier
			; the data field (at offset 8) holds a pointer to the identifier's name
			s = p + 8
			puts(*8s)
			goto print_token_data
		:print_token_eof
			puts(.str_eof)
			goto print_token_data
		:print_token_float
			; floats are printed without location: just the 8 bytes of data
			puts(.str_constant_float)
			s = p + 8
			putx64(*8s)
			putc(32)
			p += 16
			goto print_tokens_loop
		:print_token_int
			puts(.str_constant_int)
			; (fallthrough)
		:print_token_info
		; info byte, at offset 1
		putc('~)
		s = p + 1
		putn(*1s)
		:print_token_data
		; file (offset 2), line (offset 4) and data (offset 8)
		putc('@)
		s = p + 2
		putn(*2s)
		putc(':)
		s = p + 4
		putn(*4s)
		putc(61)
		s = p + 8
		putn(*8s)
		putc(32)
		; on to the next 16-byte token
		p += 16
		goto print_tokens_loop
	:print_tokens_loop_end
	putc(10)
	return
	:str_constant_int
		string integer
		byte 0
	:str_constant_float
		string float
		byte 0
	:str_constant_char
		string character
		byte 0
	:str_string_literal
		string string
		byte 0
	:str_print_bad_token
		string Unrecognized token type in print_tokens. Aborting.
		byte 10
		byte 0
	:str_eof
		string EOF
		byte 0

; print the single token at the given address, using print_tokens
function print_token
	argument token
	local s
	; tokens are 16 bytes, so the token ends 16 bytes after it starts
	s = token
	s += 16
	print_tokens(token, s)
	return