diff options
author | Leo Tenenbaum <pommicket@gmail.com> | 2019-08-16 13:51:35 -0400 |
---|---|---|
committer | Leo Tenenbaum <pommicket@gmail.com> | 2019-08-16 13:51:35 -0400 |
commit | 1617c304c270996504ac5d285e3b417e3310f97f (patch) | |
tree | e878ff8109ff486e8bae684d29d9f7210ec88d62 |
Basic tokenization
-rw-r--r-- | .gitignore | 1 | ||||
-rwxr-xr-x | build.sh | 3 | ||||
-rw-r--r-- | main.c | 36 | ||||
-rw-r--r-- | test.toc | 3 | ||||
-rw-r--r-- | tokenizer.c | 126 | ||||
-rw-r--r-- | util/err.c | 10 | ||||
-rw-r--r-- | util/files.c | 28 |
7 files changed, 207 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..b431f19 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +toc diff --git a/build.sh b/build.sh new file mode 100755 index 0000000..e8bd52e --- /dev/null +++ b/build.sh @@ -0,0 +1,3 @@ +#!/bin/bash + +gcc -o toc main.c -g -o toc -Wall -Wextra -Wpedantic -Wconversion -std=c11 || exit 1 @@ -0,0 +1,36 @@ +#include <assert.h> +#include <stdio.h> +#include <stdlib.h> +#include <stdint.h> +#include <stdarg.h> +#include <string.h> +#include <ctype.h> +#include "util/err.c" +#include "util/files.c" +#include "tokenizer.c" + +int main(int argc, char **argv) { + if (argc < 2) { + fprintf(stderr, "Please specify an input file.\n"); + return EXIT_FAILURE; + } + + FILE *in = fopen(argv[1], "r"); + if (!in) { + fprintf(stderr, "Could not open file: %s.\n", argv[1]); + return EXIT_FAILURE; + } + + Tokenizer t = tokenize_file(in); + + for (size_t i = 0; i < t.ntokens; i++) { + if (i) + printf(" "); + token_fprint(stdout, &t.tokens[i]); + } + printf("\n"); + + tokenizer_free(&t); + + fclose(in); +} diff --git a/test.toc b/test.toc new file mode 100644 index 0000000..54c9345 --- /dev/null +++ b/test.toc @@ -0,0 +1,3 @@ + +== < +<<<<<
/*
 * Tokenizer sources added by this commit: util/err.c, util/files.c and
 * tokenizer.c.  They are listed in the order main.c includes them so that
 * every name is declared before it is used.  All three rely on the standard
 * headers main.c includes first: <assert.h>, <stdio.h>, <stdlib.h>,
 * <stdint.h>, <stdarg.h>, <string.h> and <ctype.h>.
 *
 * (The test.toc fixture that precedes these files in the commit has no
 * newline at the end of the file.)
 */

/* ---- util/err.c ---- */

typedef uint32_t LineNo;

/*
 * Prints "Error at line L col C:" followed by a printf-style message to
 * stderr.  No newline is appended after the formatted message.
 */
static void err_print(LineNo line, LineNo col, const char *fmt, ...) {
	/* TODO: Color */
	va_list args;
	fprintf(stderr, "Error at line %lu col %lu:\n", (unsigned long)line, (unsigned long)col);
	va_start(args, fmt);
	vfprintf(stderr, fmt, args);
	va_end(args);
}

/* ---- util/files.c ---- */

/*
 * Returns the next character of fp without consuming it, or EOF if none
 * could be read.
 */
static int fpeekc(FILE *fp) {
	int c = getc(fp);
	if (c == EOF)
		return c;
	ungetc(c, fp);
	return c;
}

#define fnextc getc /* advance to the next character */

/*
 * Tests whether the upcoming characters of fp spell out prefix, which must
 * be non-empty.
 *
 * On a match the stream is left just past the prefix and the prefix length
 * is returned.  Otherwise the stream is moved back to where it started and 0
 * is returned.  0 is also returned, without reading anything, when the
 * current position cannot be obtained with ftell.
 */
static int fhasprefix(FILE *fp, const char *prefix) {
	assert(*prefix);
	long start = ftell(fp);
	if (start == -1L)
		return 0;
	const char *p = prefix;
	while (*p) {
		int c = getc(fp);
		/* getc yields an unsigned char value (or EOF), so compare as such */
		if (c != (unsigned char)*p) {
			/* wrong character / EOF */
			fseek(fp, start, SEEK_SET);
			return 0;
		}
		p++;
	}
	return (int)(p - prefix); /* length of prefix */
}

/* ---- tokenizer.c ---- */

typedef enum {
	TOKEN_KW,
	TOKEN_EOF
} TokenKind;

typedef enum {
	KW_SEMICOLON,
	KW_EQEQ,
	KW_LT,
	KW_LE,
	KW_EQ,
	KW_COUNT
} Keyword;

/* Spelling of each keyword, indexed by Keyword. */
static const char *keywords[KW_COUNT] =
	{";", "==", "<", "<=", "="};

/* NOTE: LineNo is typedef'd in util/err.c */
typedef struct {
	TokenKind kind;
	LineNo line;
	LineNo col;
	union {
		Keyword kw; /* valid when kind == TOKEN_KW */
	};
} Token;

typedef struct {
	Token *tokens;
	size_t ntokens;
	size_t cap; /* used internally */
	Token *token; /* token currently being processed */
} Tokenizer;

/* Writes "l<line>c<col>-" followed by a description of the token to out. */
static void token_fprint(FILE *out, Token *t) {
	fprintf(out, "l%luc%lu-", (unsigned long)t->line, (unsigned long)t->col);
	switch (t->kind) {
	case TOKEN_KW:
		fprintf(out, "keyword: %s", keywords[t->kw]);
		break;
	case TOKEN_EOF:
		fprintf(out, "eof");
		break;
	}
}

/*
 * Appends a copy of *token to t, stamping it with the given position first
 * (token->line and token->col are overwritten).
 *
 * The array doubles when full.  Running out of memory aborts the program,
 * like the other fatal errors in this file.
 */
static void tokenizer_add(Tokenizer *t, Token *token, LineNo line, LineNo col) {
	if (t->ntokens == t->cap) {
		size_t new_cap = t->cap ? t->cap * 2 : 64;
		/* guard both the doubling and the byte-count multiplication */
		if (new_cap < t->cap || new_cap > SIZE_MAX / sizeof *t->tokens) {
			fprintf(stderr, "Out of memory.\n");
			abort();
		}
		/* the size is in bytes, so scale the element count by sizeof */
		Token *grown = realloc(t->tokens, new_cap * sizeof *grown);
		if (!grown) {
			fprintf(stderr, "Out of memory.\n");
			abort();
		}
		t->tokens = grown;
		t->cap = new_cap;
	}
	token->line = line;
	token->col = col;
	t->tokens[t->ntokens++] = *token;
}

/*
 * Consumes the longest keyword found at the current position of fp and
 * returns it, or returns KW_COUNT (consuming nothing) if no keyword matches.
 *
 * Longer spellings are tried first so that "<=" and "==" are not cut short
 * by their one-character prefixes "<" and "=".
 */
static Keyword tokenizer_match_keyword(FILE *fp) {
	size_t longest = 0;
	Keyword kw;
	for (kw = 0; kw < KW_COUNT; kw++) {
		size_t len = strlen(keywords[kw]);
		if (len > longest)
			longest = len;
	}
	for (size_t len = longest; len > 0; len--) {
		for (kw = 0; kw < KW_COUNT; kw++) {
			if (strlen(keywords[kw]) == len && fhasprefix(fp, keywords[kw]))
				return kw;
		}
	}
	return KW_COUNT;
}

/*
 * Reads fp to the end and returns the tokens found in it.
 *
 * Whitespace separates tokens and is otherwise skipped.  Lines and columns
 * are counted from 1.  When no keyword matches at some position, the rest of
 * that line (up to the size of the error buffer) is reported with err_print
 * and skipped.  After the whole input has been scanned, any such error
 * aborts the program, as does a read error or running out of memory.
 *
 * The stream's buffering is left alone: the stream stays open after this
 * function returns, so it must not be given a local array as its buffer.
 *
 * Release the result with tokenizer_free.
 */
static Tokenizer tokenize_file(FILE *fp) {
	char errbuf[256] = {0}; /* for errors */
	int has_err = 0;
	Tokenizer t;
	t.cap = 4096;
	t.ntokens = 0;
	t.token = NULL;
	t.tokens = malloc(t.cap * sizeof(*t.tokens));
	if (!t.tokens) {
		fprintf(stderr, "Out of memory.\n");
		abort();
	}

	LineNo line = 1;
	LineNo col = 1;

	while (1) {
		int c = fpeekc(fp);
		if (c == EOF) break;
		if (isspace(c)) {
			if (c == '\n') {
				line++;
				col = 0; /* the col++ below makes this column 1 */
			}
			fnextc(fp);
			col++;
			continue;
		}

		Keyword kw = tokenizer_match_keyword(fp);
		if (kw != KW_COUNT) {
			Token kw_token;
			kw_token.kind = TOKEN_KW;
			kw_token.kw = kw;
			tokenizer_add(&t, &kw_token, line, col);
			col += (LineNo)strlen(keywords[kw]);
			continue;
		}

		/* Not a keyword: report the rest of the line and skip past it. */
		has_err = 1;
		if (!fgets(errbuf, sizeof errbuf, fp)) {
			/* nothing usable was read; the ferror check below reports it */
			break;
		}
		size_t len = strlen(errbuf);
		int has_newline = len && errbuf[len-1] == '\n';
		if (has_newline) {
			/* remove newline */
			errbuf[len-1] = 0;
		}
		err_print(line, col, "Unrecognized token:\n\there --> %s\n", errbuf);
		if (has_newline) {
			/* increment line counter because of it */
			line++;
			col = 1;
		} else {
			/* still on the same line: advance by what was actually read */
			col += (LineNo)len;
		}
	}
	if (ferror(fp)) {
		fprintf(stderr, "Error reading input.\n");
		abort();
	}
	if (has_err) {
		fprintf(stderr, "Errors occured while preprocessing.\n");
		abort();
	}
	t.token = t.tokens;
	return t;
}

/* Releases the token array owned by t. */
static void tokenizer_free(Tokenizer *t) {
	free(t->tokens);
}