summary | refs | log | tree | commit | diff
path: root/pom.c
diff options
context:
space:
mode:
author pommicket <pommicket@gmail.com> 2025-09-11 21:19:17 -0400
committer pommicket <pommicket@gmail.com> 2025-09-11 21:19:17 -0400
commit 32d65bf7387c389bf5f3b3d2e94056f50480594f (patch)
tree 84f1c05315fdd016966baca2da1c792b40207394 /pom.c
parent aab7700e2bba9216a8343e8e4e0cd0096026ba1a (diff)
Check for valid utf-8
Diffstat (limited to 'pom.c')
-rw-r--r-- pom.c 123
1 files changed, 109 insertions, 14 deletions
diff --git a/pom.c b/pom.c
index de698f0..f1dd041 100644
--- a/pom.c
+++ b/pom.c
@@ -58,6 +58,25 @@ struct parser_error {
uint32_t message;
};
+// type for parser::utf8_state
+enum utf8_state {
+ UTF8_STATE_DEFAULT = 0,
+ // want 1 continuation byte
+ UTF8_STATE_1CONT = 1,
+ // want 2 continuation bytes
+ UTF8_STATE_2CONT = 2,
+ // want 3 continuation bytes
+ UTF8_STATE_3CONT = 3,
+ // want 2 continuation bytes, first one must be >=0xA0 (otherwise encoding is overlong)
+ UTF8_STATE_2CONT_GTEQ_A0 = 4,
+ // want 2 continuation bytes, first one must be <0xA0 (otherwise encodes a UTF-16 surrogate)
+ UTF8_STATE_2CONT_LT_A0 = 5,
+ // want 3 continuation bytes, first one must be >=0x90 (otherwise encoding is overlong)
+ UTF8_STATE_3CONT_GTEQ_90 = 6,
+ // want 3 continuation bytes, first one must be <0x90 (otherwise encoding produces oversized code point)
+ UTF8_STATE_3CONT_LT_90 = 7,
+};
+
struct parser {
const char *filename;
uint64_t line_number;
@@ -77,6 +96,8 @@ struct parser {
size_t count, capacity;
} error_messages;
bool eof, out_of_memory, leftover_cr;
+ // see enum utf8_state -- starting state for future calls to read_func
+ uint8_t utf8_state;
uint16_t buf_pos;
uint16_t buf_count;
char buf[4096];
@@ -252,6 +273,10 @@ parser_error(struct parser *parser, const char *fmt, ...) {
// Reads into parser->line.
static void
parser_read_line(struct parser *parser) {
+ if (parser->eof) {
+ parser->line.array[0] = 0;
+ return;
+ }
parser->line.count = 0;
while (true) {
char *line_out = parser_append(parser, line, sizeof parser->buf + 1);
@@ -266,35 +291,105 @@ parser_read_line(struct parser *parser) {
}
// ensure next append goes in the right place.
parser->line.count = line_out - parser->line.array;
+ char *buf = parser->buf;
// read more data into buf
- size_t read_count = parser->read_func(parser->userdata, parser->buf, sizeof parser->buf - 1);
+ size_t read_count = parser->read_func(parser->userdata, buf, sizeof parser->buf - 1);
parser->buf_pos = 0;
+ uint8_t utf8_state = parser->utf8_state;
if (read_count == 0) {
// EOF reached.
+ if (utf8_state) {
+ parser_error(parser, "Invalid UTF-8 (want continuation byte, got EOF).");
+ }
parser->eof = true;
*line_out = 0;
return;
}
- if (parser->leftover_cr && parser->buf[0] != '\n')
+ parser->utf8_state = utf8_state;
+ if (parser->leftover_cr && buf[0] != '\n')
parser_error(parser, "Carriage return with no newline after it.");
size_t out = 0;
uint64_t original_line_number = parser->line_number;
for (size_t in = 0; in < read_count; in++) {
- char c = parser->buf[in];
- if (c == '\r') {
- if (in == read_count - 1) {
- parser->leftover_cr = true;
- } else if (parser->buf[in + 1] != '\n') {
- parser_error(parser, "Carriage return with no newline after it.");
+ uint8_t byte = buf[in];
+ if (utf8_state == 0) {
+ if (byte < 0x80) {
+ // ASCII
+ if (byte == '\r') {
+ if (in == read_count - 1) {
+ parser->leftover_cr = true;
+ } else if (buf[in + 1] != '\n') {
+ parser_error(parser, "Carriage return with no newline after it.");
+ }
+ continue;
+ } else if (byte == '\n') {
+ parser->line_number++;
+ } else if (byte >= 0 && byte < 32 && byte != '\t') {
+ parser_error(parser, "Illegal control character (ASCII code %d)", byte);
+ continue;
+ }
+ } else if (byte < 0xC2) {
+ utf8_invalid_start_byte:
+ parser_error(parser, "Invalid UTF-8 (invalid start byte 0x%02X)", byte);
+ continue;
+ } else if (byte < 0xE0) {
+ // 2-byte sequence
+ utf8_state = UTF8_STATE_1CONT;
+ } else if (byte == 0xE0) {
+ // 3-byte sequence; must check for overlongness
+ utf8_state = UTF8_STATE_2CONT_GTEQ_A0;
+ } else if (byte == 0xED) {
+ // 3-byte sequence; must check for UTF-16 surrogate
+ utf8_state = UTF8_STATE_2CONT_LT_A0;
+ } else if (byte < 0xF0) {
+ // 3-byte sequence
+ utf8_state = UTF8_STATE_3CONT;
+ } else if (byte == 0xF0) {
+ // 4-byte sequence; must check for overlongness
+ utf8_state = UTF8_STATE_3CONT_GTEQ_90;
+ } else if (byte < 0xF4) {
+ // 4-byte sequence
+ utf8_state = UTF8_STATE_3CONT;
+ } else if (byte == 0xF4) {
+ // 4-byte sequence; must check for too-big code points
+ utf8_state = UTF8_STATE_3CONT_LT_90;
+ } else {
+ goto utf8_invalid_start_byte;
+ }
+ } else if (utf8_state == UTF8_STATE_1CONT || utf8_state == UTF8_STATE_2CONT || utf8_state == UTF8_STATE_3CONT) {
+ utf8_state -= 1;
+ if ((byte & 0xC0) != 0x80) {
+ parser_error(parser, "Invalid UTF-8 (want continuation byte, got 0x%02X)", byte);
+ continue;
+ }
+ } else if (utf8_state == UTF8_STATE_2CONT_GTEQ_A0) {
+ utf8_state = UTF8_STATE_1CONT;
+ if (byte < 0xA0 || (byte & 0xC0) != 0x80) {
+ parser_error(parser, "Invalid UTF-8 (want continuation byte >= 0xA0, got 0x%02X)", byte);
+ continue;
+ }
+ } else if (utf8_state == UTF8_STATE_2CONT_LT_A0) {
+ utf8_state = UTF8_STATE_1CONT;
+ if (byte >= 0xA0 || (byte & 0xC0) != 0x80) {
+ parser_error(parser, "Invalid UTF-8 (want continuation byte < 0xA0, got 0x%02X)", byte);
+ continue;
+ }
+ } else if (utf8_state == UTF8_STATE_3CONT_GTEQ_90) {
+ utf8_state = UTF8_STATE_2CONT;
+ if (byte < 0x90 || (byte & 0xC0) != 0x80) {
+ parser_error(parser, "Invalid UTF-8 (want continuation byte >= 0x90, got 0x%02X)", byte);
+ continue;
+ }
+ } else if (utf8_state == UTF8_STATE_3CONT_LT_90) {
+ utf8_state = UTF8_STATE_2CONT;
+ if (byte >= 0x90 || (byte & 0xC0) != 0x80) {
+ parser_error(parser, "Invalid UTF-8 (want continuation byte < 0x90, got 0x%02X)", byte);
+ continue;
}
- } else if (c == '\n') {
- parser->buf[out++] = c;
- parser->line_number++;
- } else if (c >= 0 && c < 32 && c != '\t') {
- parser_error(parser, "Illegal control character (ASCII code %d)", c);
} else {
- parser->buf[out++] = c;
+ abort(); // should be unreachable.
}
+ buf[out++] = byte;
}
parser->line_number = original_line_number;
parser->buf_count = out;