summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r-- identifiers.c 75
-rw-r--r-- main.c 4
-rw-r--r-- test.toc 8
-rw-r--r-- tokenizer.c 29
-rw-r--r-- util/err.c 28
5 files changed, 136 insertions, 8 deletions
diff --git a/identifiers.c b/identifiers.c
new file mode 100644
index 0000000..6335ed8
--- /dev/null
+++ b/identifiers.c
@@ -0,0 +1,75 @@
+/*
+ * Every character that may appear in an identifier. A character's position
+ * in this table is the child slot it occupies in the identifier trie, so the
+ * order here must stay in sync with ident_char_index().
+ * The table is read-only, hence const.
+ */
+static const char identifier_chars[] = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_.";
+/* number of usable entries in identifier_chars */
+#define NIDENTIFIER_CHARS ((int)((sizeof identifier_chars) - 1)) /* -1 for null char */
+
+/*
+ * Maps a character to its position in identifier_chars:
+ *   'a'..'z' -> 0..25, 'A'..'Z' -> 26..51, '0'..'9' -> 52..61,
+ *   '_' -> 62, '.' -> 63.
+ * Any other value (EOF included) yields -1.
+ */
+static int ident_char_index(int c) {
+    if ('a' <= c && c <= 'z') {
+        return c - 'a';
+    }
+    if ('A' <= c && c <= 'Z') {
+        return 26 + (c - 'A');
+    }
+    if ('0' <= c && c <= '9') {
+        return 52 + (c - '0');
+    }
+    switch (c) {
+    case '_':
+        return 62;
+    case '.':
+        return 63;
+    default:
+        return -1;
+    }
+}
+
+/* Nonzero if c is allowed inside an identifier, zero otherwise. */
+static int isident(int c) {
+    /* OPTIM: Write separate function */
+    int slot = ident_char_index(c);
+    return slot >= 0;
+}
+
+/*
+ * One node of the identifier trie. The sequence of child slots taken from the
+ * root down to a node spells out an identifier, and a pointer to that node is
+ * what the rest of the code uses to refer to the identifier.
+ */
+typedef struct IdentTree {
+ /* zero value is an empty trie */
+ /* id given out the first time an identifier ends at this node; 0 = none assigned yet */
+ long id;
+ int len; /* length of identifier = depth in tree */
+ /* NULL until first needed; then an array of NIDENTIFIER_CHARS nodes indexed by ident_char_index() */
+ struct IdentTree *children;
+ /* node one level up; NULL for the root */
+ struct IdentTree *parent;
+} IdentTree;
+
+/* handle for an identifier: a pointer to its node in the trie */
+typedef IdentTree *Identifier;
+
+/* root of the trie used by ident_finsert; static storage, so it starts out as the empty trie */
+static IdentTree ident_base_tree;
+/* last id handed out; ident_tree_finsert pre-increments it, so the first id assigned is 1 */
+static long ident_curr_id; /* NOTE: you should eventually add something to reset this */
+
+/*
+ * Reads identifier characters from fp, walking the trie rooted at t and
+ * growing it as needed, and returns the node for the identifier that was
+ * read. The node is given a fresh id if it does not have one yet.
+ *
+ * Reading stops at the first character that is not an identifier character.
+ * That character is pushed back onto fp, so the stream is left positioned
+ * immediately after the identifier and the caller does not lose the first
+ * character of whatever follows. EOF is never pushed back.
+ */
+static Identifier ident_tree_finsert(IdentTree *t, FILE *fp) {
+    while (1) {
+        int c = fgetc(fp);
+        if (!isident(c)) {
+            /* end of identifier: leave the terminating character in the stream */
+            if (c != EOF) ungetc(c, fp);
+            if (t->id == 0) t->id = ++ident_curr_id;
+            return t;
+        }
+        if (!t->children) {
+            /* first descent through this node: allocate all child slots (zeroed = empty) */
+            t->children = err_calloc((size_t)NIDENTIFIER_CHARS, sizeof *t->children);
+            for (int i = 0; i < NIDENTIFIER_CHARS; i++) {
+                t->children[i].parent = t; /* child's parent = self */
+                t->children[i].len = t->len + 1; /* one level deeper than self */
+            }
+        }
+        t = &t->children[ident_char_index(c)];
+    }
+}
+
+/*
+ * Reads one identifier from fp into the global identifier trie, creating its
+ * node if it is not there yet, and returns that node.
+ * Reads from fp until a non-identifier character is found.
+ */
+static Identifier ident_finsert(FILE *fp) {
+    IdentTree *root = &ident_base_tree;
+    Identifier found = ident_tree_finsert(root, fp);
+    return found;
+}
+
+
+/*
+ * Writes the text of identifier id to out. The ancestors are printed first
+ * (recursively, root downwards), then this node's own character, so the
+ * output reads in the normal left-to-right order. The root prints nothing.
+ */
+static void ident_fprint(FILE *out, Identifier id) {
+    IdentTree *up = id->parent;
+    if (up != NULL) {
+        /* OPTIM: Use malloc(id->len)???? */
+        /* this node's offset within its parent's child array doubles as its character index */
+        size_t slot = (size_t)(id - up->children);
+        ident_fprint(out, up);
+        fputc(identifier_chars[slot], out);
+    }
+}
+
+/*
+ * Frees every child array in the subtree below tree. The node tree itself is
+ * not freed: it is either an element of its parent's array or a node that
+ * was not allocated here (such as the static root).
+ * On return tree->children is NULL, so the node is a valid empty trie again
+ * and calling this a second time on it is harmless.
+ */
+static void idents_free_tree(IdentTree *tree) {
+    if (!tree->children) return;
+    for (int i = 0; i < NIDENTIFIER_CHARS; i++) {
+        idents_free_tree(&tree->children[i]);
+    }
+    free(tree->children);
+    tree->children = NULL; /* don't leave a dangling pointer behind */
+}
+
+/*
+ * Releases all memory held by the global identifier trie and returns the
+ * identifier state to its initial, empty condition: the root has no
+ * children and no id, and id numbering starts over. Every Identifier
+ * obtained before this call is invalid afterwards. Safe to call repeatedly.
+ */
+static void idents_free(void) {
+    idents_free_tree(&ident_base_tree);
+    ident_base_tree.children = NULL; /* the array was freed above */
+    ident_base_tree.id = 0;
+    ident_curr_id = 0;
+}
diff --git a/main.c b/main.c
index f4c6280..900b723 100644
--- a/main.c
+++ b/main.c
@@ -7,6 +7,7 @@
#include <ctype.h>
#include "util/err.c"
#include "util/files.c"
+#include "identifiers.c"
#include "tokenizer.c"
int main(int argc, char **argv) {
@@ -31,6 +32,7 @@ int main(int argc, char **argv) {
printf("\n");
tokenizer_free(&t);
-
+
fclose(in);
+ idents_free();
}
diff --git a/test.toc b/test.toc
index 54c9345..a0b4c4e 100644
--- a/test.toc
+++ b/test.toc
@@ -1,3 +1,9 @@
== <
-<<<<<
\ No newline at end of file
+<<foo<<<
+bar
+foo
+bar
+baz
+bar
+foo
\ No newline at end of file
diff --git a/tokenizer.c b/tokenizer.c
index 00e9979..768693d 100644
--- a/tokenizer.c
+++ b/tokenizer.c
@@ -1,5 +1,6 @@
typedef enum {
TOKEN_KW,
+ TOKEN_IDENT,
TOKEN_EOF
} TokenKind;
@@ -12,9 +13,9 @@ typedef enum {
KW_COUNT
} Keyword;
+/* OPTIM: Use a trie or just a function if this gets too long */
static const char *keywords[KW_COUNT] =
- {";", "==", "<", "<=", "="};
-
+ {";", "==", "<", "<=", "="};
/* NOTE: LineNo is typedef'd in util/err.c */
typedef struct {
@@ -23,6 +24,7 @@ typedef struct {
LineNo col;
union {
Keyword kw;
+ Identifier ident;
};
} Token;
@@ -39,6 +41,10 @@ static void token_fprint(FILE *out, Token *t) {
case TOKEN_KW:
fprintf(out, "keyword: %s", keywords[t->kw]);
break;
+ case TOKEN_IDENT:
+ fprintf(out, "identifier: %ld:", t->ident->id);
+ ident_fprint(out, t->ident);
+ break;
case TOKEN_EOF:
fprintf(out, "eof");
break;
@@ -87,13 +93,24 @@ static Tokenizer tokenize_file(FILE *fp) {
}
}
if (kw != KW_COUNT) {
- Token kw_token;
- kw_token.kind = TOKEN_KW;
- kw_token.kw = kw;
- tokenizer_add(&t, &kw_token, line, col);
+ /* it's a keyword */
+ Token token;
+ token.kind = TOKEN_KW;
+ token.kw = kw;
+ tokenizer_add(&t, &token, line, col);
col += (LineNo)strlen(keywords[kw]);
continue;
}
+
+ if (isident(c)) {
+ /* it's an identifier */
+ Identifier ident = ident_finsert(fp);
+ Token token;
+ token.kind = TOKEN_IDENT;
+ token.ident = ident;
+ tokenizer_add(&t, &token, line, col);
+ continue;
+ }
fgets(errbuf, sizeof errbuf, fp);
size_t len = strlen(errbuf);
diff --git a/util/err.c b/util/err.c
index 62886c4..7a38017 100644
--- a/util/err.c
+++ b/util/err.c
@@ -8,3 +8,31 @@ static void err_print(LineNo line, LineNo col, const char *fmt, ...) {
vfprintf(stderr, fmt, args);
va_end(args);
}
+
+/*
+ * malloc() wrapper that does not return on allocation failure: it prints an
+ * out-of-memory message to stderr and aborts.
+ * A NULL result for a zero-byte request is passed through rather than
+ * treated as failure, because malloc(0) is allowed to return NULL.
+ * The returned pointer is released with free().
+ */
+static void *err_malloc(size_t size) {
+    void *ret = malloc(size);
+    if (!ret && size != 0) {
+        fprintf(stderr, "Error: Out of memory.\n");
+        abort();
+    }
+    return ret;
+}
+
+/*
+ * calloc() wrapper that does not return on allocation failure: it prints an
+ * out-of-memory message to stderr and aborts.
+ * Returns zero-filled storage for n elements of the given size. A NULL
+ * result for an empty request (n or size equal to 0) is passed through
+ * rather than treated as failure, because calloc may return NULL for it.
+ * The returned pointer is released with free().
+ */
+static void *err_calloc(size_t n, size_t size) {
+    void *ret = calloc(n, size);
+    if (!ret && n != 0 && size != 0) {
+        fprintf(stderr, "Error: Out of memory.\n");
+        abort();
+    }
+    return ret;
+}
+
+/*
+ * realloc() wrapper that does not return on allocation failure: it prints an
+ * out-of-memory message to stderr and aborts.
+ *
+ * data may be NULL, in which case this behaves like an allocation.
+ * A new_size of 0 frees data and returns NULL. This is handled here instead
+ * of being forwarded to realloc(), whose zero-size behaviour differs between
+ * implementations and whose NULL result would otherwise be mistaken for
+ * running out of memory.
+ */
+static void *err_realloc(void *data, size_t new_size) {
+    if (new_size == 0) {
+        free(data);
+        return NULL;
+    }
+    void *ret = realloc(data, new_size);
+    if (!ret) {
+        fprintf(stderr, "Error: Out of memory.\n");
+        abort();
+    }
+    return ret;
+}
+