typedef enum {
    TOKEN_KW,
    TOKEN_IDENT,
    TOKEN_NUM_CONST,
    TOKEN_CHAR_CONST,
    TOKEN_STR_CONST,
    TOKEN_EOF
} TokenKind;

typedef enum {
    KW_SEMICOLON,
    KW_EQ,
    KW_COLON,
    KW_FN,
    KW_LPAREN,
    KW_RPAREN,
    KW_LBRACE,
    KW_RBRACE,
    KW_EQEQ,
    KW_LT,
    KW_LE,
    KW_MINUS,
    KW_COUNT
} Keyword;

/* OPTIM: Use a trie or just a function if this gets too long */
static const char *keywords[KW_COUNT] =
    {";", "=", ":", "fn", "(", ")", "{", "}", "==", "<", "<=", "-"};

#define TOKR_USE_LLONG 1
typedef unsigned long long IntConst;
typedef long double RealConst; /* OPTIM: Switch to double */

typedef enum {
    NUM_CONST_INT,
    NUM_CONST_REAL
} NumConstKind;

typedef struct {
    NumConstKind kind;
    union {
        IntConst intval;
        RealConst realval;
    };
} NumConst;

typedef struct {
    char *str;
    size_t len;
} StrConst;

/* NOTE: LineNo is typedef'd in util/err.c */
typedef struct {
    TokenKind kind;
    LineNo line;
    char *code;
    union {
        Keyword kw;
        Identifier ident;
        NumConst num;
        char chr;
        StrConst str;
    };
} Token;

arr_declaration(Tokens, Token, tokens_)

typedef struct {
    Tokens tokens;
    char *s; /* string being parsed */
    LineNo line;
    Token *token; /* token currently being processed */
} Tokenizer;

static bool token_is_kw(Token *t, Keyword kw) {
    return t->kind == TOKEN_KW && t->kw == kw;
}

static void token_fprint(FILE *out, Token *t) {
    fprintf(out, "l%lu-", (unsigned long)t->line);
    switch (t->kind) {
    case TOKEN_KW:
        fprintf(out, "keyword: %s", keywords[t->kw]);
        break;
    case TOKEN_IDENT:
        fprintf(out, "identifier: %ld:", (long)t->ident->id);
        ident_fprint(out, t->ident);
        break;
    case TOKEN_NUM_CONST:
        fprintf(out, "number: ");
        switch (t->num.kind) {
        case NUM_CONST_INT:
            fprintf(out, "%llu", t->num.intval);
            break;
        case NUM_CONST_REAL:
            fprintf(out, "%g", (double)t->num.realval);
            break;
        }
        break;
    case TOKEN_CHAR_CONST:
        fprintf(out, "char: '%c' (%d)", t->chr, t->chr);
        break;
    case TOKEN_STR_CONST:
        fprintf(out, "str: \"%s\"", t->str.str);
        break;
    case TOKEN_EOF:
        fprintf(out, "eof");
        break;
    }
}

static void tokr_add(Tokenizer *t, Token *token) {
    if (!token->line)
        token->line = t->line;
    if (!token->code)
        token->code = t->s;
    tokens_add(&t->tokens, token);
}

static void tokr_nextchar(Tokenizer *t) {
    if (*(t->s) == '\n') {
        t->line++;
    }
    t->s++;
}

/* returns the character the escape sequence stands for, or 0 if it is not recognized */
static char tokr_esc_seq(Tokenizer *t) {
    /* TODO: add more of these incl. \x41, \100 */
    switch (*t->s) {
    case '\'':
        tokr_nextchar(t);
        return '\'';
    case '"':
        tokr_nextchar(t);
        return '"';
    case '\\':
        tokr_nextchar(t);
        return '\\';
    case 'n':
        tokr_nextchar(t);
        return '\n';
    default:
        return 0;
    }
}

/* to be used during tokenization; reports the error, then skips to the next line */
static void tokenization_err(Tokenizer *t, const char *fmt, ...) {
    va_list args;
    va_start(args, fmt);
    err_vprint(t->line, t->s, fmt, args);
    va_end(args);
    char *end_of_line = strchr(t->s, '\n');
    if (end_of_line) {
        t->s = end_of_line;
        t->s++; /* move past newline */
    } else {
        t->s = strchr(t->s, '\0');
    }
    t->line++;
}

/* to be used after tokenization; reports the error, then skips past the rest of that line's tokens */
static void tokr_err(Tokenizer *t, const char *fmt, ...) {
    LineNo line = t->token->line;
    va_list args;
    va_start(args, fmt);
    err_vprint(line, t->token->code, fmt, args);
    va_end(args);
    while (1) {
        if (t->token->line != line) break;
        if (t->token->kind == TOKEN_EOF) break;
        t->token++;
    }
}
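/* Tokenizes the NUL-terminated string str. On success, *tokr holds the
   resulting tokens and true is returned; if any errors were reported,
   false is returned. token.code points into str, so str must outlive
   the tokens.

   A worked example (ours, derived from the keyword table above; not an
   original comment): the input

       fn f() { x: = 5 <= 0x1F; }

   produces the tokens

       kw "fn", ident f, kw "(", kw ")", kw "{", ident x, kw ":",
       kw "=", number 5, kw "<=", number 31, kw ";", kw "}", eof
*/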
static bool tokenize_string(Tokenizer *tokr, char *str) {
    int has_err = 0;
    Tokenizer t;
    tokens_create(&t.tokens);
    tokens_reserve(&t.tokens, 256);
    t.s = str;
    t.line = 1;

    while (1) {
        if (*t.s == 0) break;
        if (isspace(*t.s)) {
            tokr_nextchar(&t);
            continue;
        }

        if (*t.s == '/') {
            /* maybe it's a comment */
            int is_comment = 1;
            switch (t.s[1]) {
            case '/': /* single line comment */
                tokr_nextchar(&t);
                for (t.s++; *t.s != '\n' && *t.s; t.s++);
                /* leave the newline for the whitespace handler above,
                   so that t.line is only incremented once */
                break;
            case '*': { /* multi line comment */
                tokr_nextchar(&t);
                int comment_level = 1; /* allow nested multi-line comments */
                while (*t.s) {
                    if (t.s[0] == '*' && t.s[1] == '/') {
                        t.s += 2;
                        comment_level--;
                        if (comment_level == 0) {
                            break;
                        }
                    } else if (t.s[0] == '/' && t.s[1] == '*') {
                        t.s += 2;
                        comment_level++;
                    } else {
                        tokr_nextchar(&t);
                    }
                }
                if (*t.s == 0) {
                    tokenization_err(&t, "End of file reached inside multi-line comment.");
                    abort(); /* there won't be any further errors, of course */
                }
            } break;
            default:
                is_comment = 0;
                break;
            }
            if (is_comment) continue;
        }

        /* check if it's a keyword; take the longest match, so that e.g.
           "==" is not tokenized as two "="s.
           TODO: an alphabetic keyword like "fn" still matches the start
           of an identifier like "fnord". */
        Keyword kw = KW_COUNT;
        size_t kw_len = 0;
        for (Keyword k = 0; k < KW_COUNT; k++) {
            size_t len = strlen(keywords[k]);
            if (len > kw_len && strncmp(t.s, keywords[k], len) == 0) {
                kw = k;
                kw_len = len;
            }
        }
        if (kw != KW_COUNT) {
            /* it's a keyword */
            Token token = {0};
            token.kind = TOKEN_KW;
            token.kw = kw;
            tokr_add(&t, &token);
            t.s += kw_len;
            continue;
        }

        /* check if it's a number */
        if (isdigit(*t.s)) {
            /* it's a numeric constant */
            int base = 10;
            RealConst decimal_pow10 = 0; /* set when a '.' is reached */
            NumConst n;
            n.kind = NUM_CONST_INT;
            n.intval = 0;
            Token token = {0};
            token.line = t.line;
            token.code = t.s;
            if (*t.s == '0') {
                tokr_nextchar(&t);
                /* octal/hexadecimal/binary (or zero) */
                char format = *t.s;
                if (isdigit(format)) {
                    /* octal */
                    base = 8;
                } else {
                    switch (format) {
                    case 'b':
                        base = 2;
                        tokr_nextchar(&t);
                        break;
                    case 'x':
                        base = 16;
                        tokr_nextchar(&t);
                        break;
                    default:
                        /* it's 0/0.something etc. */
                        break;
                    }
                }
            }
            while (1) {
                if (*t.s == '.') {
                    if (n.kind == NUM_CONST_REAL) {
                        tokenization_err(&t, "Double . in number.");
                        goto err;
                    }
                    if (base != 10) {
                        tokenization_err(&t, "Decimal point in non base 10 number.");
                        goto err;
                    }
                    n.kind = NUM_CONST_REAL;
                    decimal_pow10 = 0.1;
                    n.realval = (RealConst)n.intval;
                    tokr_nextchar(&t);
                    continue;
                } else if (*t.s == 'e' && base != 16) {
                    /* exponent (in base 16, 'e' is a digit instead) */
                    tokr_nextchar(&t);
                    if (n.kind == NUM_CONST_INT) {
                        n.kind = NUM_CONST_REAL;
                        n.realval = (RealConst)n.intval;
                    }
                    /* TODO: check if exceeding maximum exponent */
                    int exponent = 0;
                    if (*t.s == '+')
                        tokr_nextchar(&t); /* ignore + after e */
                    int negative_exponent = 0;
                    if (*t.s == '-') {
                        tokr_nextchar(&t);
                        negative_exponent = 1;
                    }
                    for (; isdigit(*t.s); tokr_nextchar(&t)) {
                        exponent *= 10;
                        exponent += *t.s - '0';
                    }
                    /* OPTIM: Slow for very large exponents (unlikely to happen) */
                    for (int i = 0; i < exponent; i++) {
                        if (negative_exponent)
                            n.realval /= 10;
                        else
                            n.realval *= 10;
                    }
                    break;
                }
                int digit = -1;
                if (base == 16) {
                    if (*t.s >= 'a' && *t.s <= 'f')
                        digit = 10 + *t.s - 'a';
                    else if (*t.s >= 'A' && *t.s <= 'F')
                        digit = 10 + *t.s - 'A';
                }
                if (digit == -1) {
                    if (*t.s >= '0' && *t.s <= '9')
                        digit = *t.s - '0';
                }
                if (digit < 0 || digit >= base) {
                    if (isdigit(*t.s)) {
                        /* something like 0b011012 */
                        tokenization_err(&t, "Digit %d cannot appear in a base %d number.", digit, base);
                        goto err;
                    }
                    /* end of numeric constant */
                    break;
                }
                switch (n.kind) {
                case NUM_CONST_INT:
                    /* the first check guards the multiplication in the second */
                    if (n.intval > ULLONG_MAX / (IntConst)base
                        || n.intval * (IntConst)base > ULLONG_MAX - (IntConst)digit) {
                        /* too big! */
                        tokenization_err(&t, "Number too big to fit in a numeric constant.");
                        goto err;
                    }
                    n.intval *= (IntConst)base;
                    n.intval += (IntConst)digit;
                    break;
                case NUM_CONST_REAL:
                    n.realval += decimal_pow10 * (RealConst)digit;
                    decimal_pow10 /= 10;
                    break;
                }
                tokr_nextchar(&t);
            }
            token.kind = TOKEN_NUM_CONST;
            token.num = n;
            tokr_add(&t, &token);
            continue;
        }

        if (*t.s == '\'') {
            /* it's a character constant! */
            tokr_nextchar(&t);
            Token token = {0};
            token.line = t.line;
            token.code = t.s;
            char c;
            if (*t.s == '\\') {
                /* escape sequence */
                tokr_nextchar(&t);
                c = tokr_esc_seq(&t);
                if (c == 0) {
                    tokenization_err(&t, "Unrecognized escape character: '\\%c'.", *t.s);
                    goto err;
                }
            } else {
                c = *t.s;
                tokr_nextchar(&t);
            }
            if (*t.s != '\'') {
                tokenization_err(&t, "End of character constant expected.");
                goto err;
            }
            tokr_nextchar(&t);
            token.kind = TOKEN_CHAR_CONST;
            token.chr = c;
            tokr_add(&t, &token);
            continue;
        }

        if (*t.s == '"') {
            /* it's a string constant! */
            Token token = {0};
            token.line = t.line;
            token.code = t.s;
            tokr_nextchar(&t);
            /* first pass: find the closing " (a " preceded by an odd
               number of backslashes is escaped) and count the raw
               characters, which can only overestimate the unescaped
               length */
            size_t len = 0;
            size_t backslashes = 0;
            while (*t.s != '"' || backslashes % 2 == 1) {
                if (*t.s == '\\') {
                    backslashes++;
                } else if (*t.s == 0) {
                    /* return t to opening " so that we go to the next line */
                    t.line = token.line;
                    t.s = token.code;
                    tokenization_err(&t, "No matching \" found.");
                    goto err;
                } else {
                    backslashes = 0;
                }
                len++;
                tokr_nextchar(&t);
            }
            char *str = malloc(len + 1); /* (shadows the parameter str) */
            char *strptr = str;
            t.s = token.code;
            t.line = token.line;
            tokr_nextchar(&t); /* past opening " */
            /* second pass: copy the string, translating escape sequences */
            while (*t.s != '"') {
                assert(*t.s);
                if (*t.s == '\\') {
                    tokr_nextchar(&t);
                    char c = tokr_esc_seq(&t);
                    if (c == 0) {
                        tokenization_err(&t, "Unrecognized escape character: '\\%c'.", *t.s);
                        free(str);
                        goto err;
                    }
                    *strptr++ = c;
                } else {
                    *strptr++ = *t.s;
                    tokr_nextchar(&t);
                }
            }
            *strptr = 0;
            token.kind = TOKEN_STR_CONST;
            token.str.len = (size_t)(strptr - str); /* the unescaped length; len overcounts escapes */
            token.str.str = str;
            tokr_add(&t, &token);
            tokr_nextchar(&t); /* move past closing " */
            continue;
        }

        if (isidentstart(*t.s)) {
            /* it's an identifier */
            Token token = {0};
            token.line = t.line;
            token.code = t.s;
            Identifier ident = ident_insert(&t.s);
            token.kind = TOKEN_IDENT;
            token.ident = ident;
            tokr_add(&t, &token);
            continue;
        }

        tokenization_err(&t, "Token not recognized.");
    err:
        has_err = 1;
    }
    Token token = {0};
    token.kind = TOKEN_EOF;
    tokr_add(&t, &token);
    t.token = t.tokens.data;
    *tokr = t;
    return !has_err;
}
static void tokr_free(Tokenizer *t) {
    arr_foreach(t->tokens, Token, token) {
        switch (token->kind) {
        case TOKEN_STR_CONST:
            free(token->str.str);
            break;
        default:
            break;
        }
    }
    tokens_clear(&t->tokens);
}
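/* A minimal usage sketch, not part of the original file. It assumes the
   rest of the project (err_vprint, ident_insert, ident_fprint,
   isidentstart, and the arr_ / tokens_ machinery) plus the standard
   headers (stdio.h, stdlib.h, string.h, ctype.h, stdarg.h, stdbool.h,
   limits.h, assert.h) are compiled into the same translation unit, as
   the NOTE about util/err.c suggests. The TOKENIZER_TEST guard is
   hypothetical, not an original build flag. */
#ifdef TOKENIZER_TEST
int main(void) {
    char code[] = "fn f() { x: = 5 <= 0x1F; }";
    Tokenizer t;
    bool ok = tokenize_string(&t, code); /* code must outlive the tokens */
    tokr_fprint_all(stdout, &t); /* the helper sketched above */
    tokr_free(&t);
    return ok ? 0 : EXIT_FAILURE;
}
#endif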