Basic tokenization

author: Leo Tenenbaum <pommicket@gmail.com> 2019-08-16 13:51:35 -0400
committer: Leo Tenenbaum <pommicket@gmail.com> 2019-08-16 13:51:35 -0400
commit: 1617c304c270996504ac5d285e3b417e3310f97f (patch)
tree: e878ff8109ff486e8bae684d29d9f7210ec88d62
7 files changed, 207 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..b431f19
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+toc
diff --git a/build.sh b/build.sh
new file mode 100755
index 0000000..e8bd52e
--- /dev/null
+++ b/build.sh
@@ -0,0 +1,3 @@
+#!/bin/bash
+# Build the toc binary from main.c (which #includes the other .c files), with debug info and strict warnings.
+gcc -o toc main.c -g -Wall -Wextra -Wpedantic -Wconversion -std=c11 || exit 1
diff --git a/main.c b/main.c
new file mode 100644
index 0000000..f4c6280
--- /dev/null
+++ b/main.c
@@ -0,0 +1,36 @@
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdarg.h>
+#include <string.h>
+#include <ctype.h>
+#include "util/err.c"
+#include "util/files.c"
+#include "tokenizer.c"
+
+int main(int argc, char **argv) {
+	/* Tokenizes the file named by argv[1] and prints every token on one line, separated by four spaces. */
+	if (argc < 2) {
+		fprintf(stderr, "Please specify an input file.\n");
+		return EXIT_FAILURE;
+	}
+	const char *path = argv[1];
+	FILE *in = fopen(path, "r");
+	if (in == NULL) {
+		fprintf(stderr, "Could not open file: %s.\n", path);
+		return EXIT_FAILURE;
+	}
+
+	Tokenizer t = tokenize_file(in);
+	const char *sep = ""; /* nothing is printed before the first token */
+	for (size_t i = 0; i < t.ntokens; i++) {
+		printf("%s", sep);
+		token_fprint(stdout, t.tokens + i);
+		sep = "    ";
+	}
+	printf("\n");
+
+	tokenizer_free(&t);
+	fclose(in);
+}
diff --git a/test.toc b/test.toc
new file mode 100644
index 0000000..54c9345
--- /dev/null
+++ b/test.toc
@@ -0,0 +1,3 @@
+   
+== <
+<<<<<
+\ No newline at end of file
diff --git a/tokenizer.c b/tokenizer.c
new file mode 100644
index 0000000..00e9979
--- /dev/null
+++ b/tokenizer.c
@@ -0,0 +1,126 @@
+typedef enum { /* discriminates what a Token holds */
+			  TOKEN_KW, /* a keyword; Token.kw says which one */
+			  TOKEN_EOF /* printed as "eof" by token_fprint; tokenize_file does not emit it */
+} TokenKind;
+
+typedef enum { /* keyword/operator tokens; each value indexes keywords[] */
+	KW_SEMICOLON,
+	KW_EQEQ,
+	KW_LE, /* listed before KW_LT: tokenize_file takes the first match, so "<" would otherwise shadow "<=" */
+	KW_LT,
+	KW_EQ,
+	KW_COUNT /* number of keywords, not a keyword itself */
+} Keyword;
+/* Spelling of each keyword. Entries are tried in enum order, so a keyword must come before any keyword that is a prefix of it. */
+static const char *keywords[KW_COUNT] =
+	{[KW_SEMICOLON] = ";", [KW_EQEQ] = "==", [KW_LE] = "<=", [KW_LT] = "<", [KW_EQ] = "="};
+
+
+/* NOTE: LineNo is typedef'd in util/err.c */
+typedef struct { /* one token together with the source position tokenizer_add stamped on it */
+	TokenKind kind;
+	LineNo line; /* line the token starts on; tokenize_file counts from 1 */
+    LineNo col; /* column the token starts at; tokenize_file counts from 1 */
+	union {
+		Keyword kw; /* which keyword; set by tokenize_file when kind == TOKEN_KW */
+	};
+} Token;
+
+typedef struct { /* growable array of tokens, filled in by tokenize_file */
+	Token *tokens; /* heap array; released by tokenizer_free */
+	size_t ntokens; /* number of tokens stored in tokens */
+	size_t cap;	/* used internally: number of Token slots allocated for tokens */
+	Token *token; /* token currently being processed; tokenize_file sets it to tokens */
+} Tokenizer;
+
+static void token_fprint(FILE *out, Token *t) { /* writes "l<line>c<col>-<description>" to out, no newline */
+	unsigned long line = (unsigned long)t->line;
+	unsigned long col = (unsigned long)t->col;
+	fprintf(out, "l%luc%lu-", line, col);
+	if (t->kind == TOKEN_KW) {
+		/* keywords[] is indexed by the Keyword enum */
+		fprintf(out, "keyword: %s", keywords[t->kw]);
+	} else if (t->kind == TOKEN_EOF) {
+		fprintf(out, "eof");
+	}
+}
+
+static void tokenizer_add(Tokenizer *t, Token *token, LineNo line, LineNo col) { /* stamps *token with line/col and appends a copy to t->tokens */
+	if (t->ntokens == t->cap) { /* array is full: double it */
+		t->cap *= 2; /* cap counts Tokens, so the byte size passed to realloc must be scaled by the element size */
+		t->tokens = realloc(t->tokens, t->cap * sizeof(*t->tokens)); /* NOTE: a NULL result is not handled here */
+	}
+	token->line = line;
+	token->col = col;
+	t->tokens[t->ntokens++] = *token;
+}
+
+static Tokenizer tokenize_file(FILE *fp) {
+	/* Tokenizes fp up to EOF and returns the tokens (release with tokenizer_free); aborts if any error was reported. */
+	setvbuf(fp, NULL, _IOFBF, 4096); /* stdio-owned buffer: a local array would dangle once we return with fp still open */
+	char errbuf[256] = {0}; /* for errors */
+	int has_err = 0;
+	Tokenizer t;
+	t.cap = 4096;
+	t.ntokens = 0;
+	t.tokens = malloc(t.cap * sizeof(*t.tokens));
+	if (!t.tokens) abort(); /* out of memory */
+	LineNo line = 1;
+	LineNo col = 1;
+
+	while (1) {
+		int c = fpeekc(fp);
+		if (c == EOF) break;
+		if (isspace(c)) {
+			if (c == '\n') {
+				line++;
+				col = 0; /* the col++ below brings this to 1 */
+			}
+			fnextc(fp);
+			col++;
+			continue;
+		}
+		Keyword kw;
+		for (kw = 0; kw < KW_COUNT; kw++) {
+			if (fhasprefix(fp, keywords[kw])) { /* consumes the keyword on a match */
+				break;
+			}
+		}
+		if (kw != KW_COUNT) {
+			Token kw_token;
+			kw_token.kind = TOKEN_KW;
+			kw_token.kw = kw;
+			tokenizer_add(&t, &kw_token, line, col);
+			col += (LineNo)strlen(keywords[kw]);
+			continue;
+		}
+		/* nothing matched: report the rest of the line (at most sizeof errbuf - 1 characters) and skip past it */
+		if (!fgets(errbuf, sizeof errbuf, fp)) errbuf[0] = 0; /* read failed: report an empty excerpt */
+		size_t len = strlen(errbuf);
+		int has_newline = len && errbuf[len-1] == '\n';
+		if (has_newline) {
+			/* remove newline */
+			errbuf[len-1] = 0;
+		}
+		err_print(line, col, "Unrecognized token:\n\there --> %s\n", errbuf);
+		has_err = 1;
+		if (has_newline) {
+			/* increment line counter because of it */
+			line++;
+			col = 1;
+		} else {
+			col += (LineNo)len; /* advance by what fgets actually read, not by the buffer size */
+		}
+	}
+	/* TODO: Check ferror/errno */
+	if (has_err) {
+		fprintf(stderr, "Errors occured while preprocessing.\n");
+		abort();
+	}
+	t.token = t.tokens;
+	return t;
+}
+
+static void tokenizer_free(Tokenizer *t) { /* frees the token array owned by *t; *t itself is not freed or reset */
+	free(t->tokens);
+}
diff --git a/util/err.c b/util/err.c
new file mode 100644
index 0000000..62886c4
--- /dev/null
+++ b/util/err.c
@@ -0,0 +1,10 @@
+typedef uint32_t LineNo; /* a line or column number in a source file */
+
+static void err_print(LineNo line, LineNo col, const char *fmt, ...) {
+	/* Writes an "Error at line L col C:" header and then the printf-style message to stderr. TODO: Color */
+	va_list ap;
+	va_start(ap, fmt);
+	fprintf(stderr, "Error at line %lu col %lu:\n", (unsigned long)line, (unsigned long)col);
+	vfprintf(stderr, fmt, ap);
+	va_end(ap);
+}
diff --git a/util/files.c b/util/files.c
new file mode 100644
index 0000000..0afa843
--- /dev/null
+++ b/util/files.c
@@ -0,0 +1,28 @@
+static int fpeekc(FILE *fp) {
+	/* Returns the next character of fp without consuming it, or EOF if none could be read. */
+	int next = getc(fp);
+	if (next != EOF)
+		ungetc(next, fp); /* push it back so the following read sees it again */
+	return next;
+}
+
+#define fnextc getc /* advance to the next character (plain alias for getc, so it also returns that character) */
+
+/* If the next characters of fp spell prefix (must be non-empty), consumes them and returns the prefix length; otherwise seeks back to where it started and returns 0. Also returns 0 if ftell fails. */
+static int fhasprefix(FILE *fp, const char *prefix) {
+	assert(*prefix);
+	long start = ftell(fp);
+	if (start == -1)
+		return 0;
+	const char *p = prefix;
+	while (*p) {
+		int c = getc(fp);
+		if (c != (unsigned char)*p) { /* getc yields unsigned char values, while plain char may be signed */
+			/* wrong character / EOF */
+			fseek(fp, start, SEEK_SET);
+			return 0;
+		}
+		p++;
+	}
+	return (int)(p - prefix); /* length of prefix */
+}
author	Leo Tenenbaum <pommicket@gmail.com>	2019-08-16 13:51:35 -0400
committer	Leo Tenenbaum <pommicket@gmail.com>	2019-08-16 13:51:35 -0400
commit	1617c304c270996504ac5d285e3b417e3310f97f (patch)
tree	e878ff8109ff486e8bae684d29d9f7210ec88d62