Check for valid utf-8

author: pommicket <pommicket@gmail.com> 2025-09-11 21:19:17 -0400
committer: pommicket <pommicket@gmail.com> 2025-09-11 21:19:17 -0400
commit: 32d65bf7387c389bf5f3b3d2e94056f50480594f (patch)
tree: 84f1c05315fdd016966baca2da1c792b40207394 /pom.c
parent: aab7700e2bba9216a8343e8e4e0cd0096026ba1a (diff)
1 files changed, 109 insertions, 14 deletions
diff --git a/pom.c b/pom.c
index de698f0..f1dd041 100644
--- a/pom.c
+++ b/pom.c
@@ -58,6 +58,25 @@ struct parser_error {
 	uint32_t message;
 };
 
+// type for parser::utf8_state
+enum utf8_state {
+	UTF8_STATE_DEFAULT = 0,
+	// want 1 continuation byte
+	UTF8_STATE_1CONT = 1,
+	// want 2 continuation bytes
+	UTF8_STATE_2CONT = 2,
+	// want 3 continuation bytes
+	UTF8_STATE_3CONT = 3,
+	// want 2 continuation bytes, first one must be >=0xA0 (otherwise encoding is overlong)
+	UTF8_STATE_2CONT_GTEQ_A0 = 4,
+	// want 2 continuation bytes, first one must be <0xA0 (otherwise encodes a UTF-16 surrogate)
+	UTF8_STATE_2CONT_LT_A0 = 5,
+	// want 3 continuation bytes, first one must be >=0x90 (otherwise encoding is overlong)
+	UTF8_STATE_3CONT_GTEQ_90 = 6,
+	// want 3 continuation bytes, first one must be <0x90 (otherwise encoding produces oversized code point)
+	UTF8_STATE_3CONT_LT_90 = 7,
+};
+
 struct parser {
 	const char *filename;
 	uint64_t line_number;
@@ -77,6 +96,8 @@ struct parser {
 		size_t count, capacity;
 	} error_messages;
 	bool eof, out_of_memory, leftover_cr;
+	// see enum utf8_state -- starting state for future calls to read_func
+	uint8_t utf8_state;
 	uint16_t buf_pos;
 	uint16_t buf_count;
 	char buf[4096];
@@ -252,6 +273,10 @@ parser_error(struct parser *parser, const char *fmt, ...) {
 // Reads into parser->line_buf.
 static void
 parser_read_line(struct parser *parser) {
+	if (parser->eof) {
+		parser->line.array[0] = 0;
+		return;
+	}
 	parser->line.count = 0;
 	while (true) {
 		char *line_out = parser_append(parser, line, sizeof parser->buf + 1);
@@ -266,35 +291,105 @@ parser_read_line(struct parser *parser) {
 		}
 		// ensure next append goes in the right place.
 		parser->line.count = line_out - parser->line.array;
+		char *buf = parser->buf;
 		// read more data into buf
-		size_t read_count = parser->read_func(parser->userdata, parser->buf, sizeof parser->buf - 1);
+		size_t read_count = parser->read_func(parser->userdata, buf, sizeof parser->buf - 1);
 		parser->buf_pos = 0;
+		uint8_t utf8_state = parser->utf8_state;
 		if (read_count == 0) {
 			// EOF reached.
+			if (utf8_state) {
+				parser_error(parser, "Invalid UTF-8 (want continuation byte, got EOF).");
+			}
 			parser->eof = true;
 			*line_out = 0;
 			return;
 		}
-		if (parser->leftover_cr && parser->buf[0] != '\n')
+		parser->utf8_state = utf8_state;
+		if (parser->leftover_cr && buf[0] != '\n')
 			parser_error(parser, "Carriage return with no newline after it.");
 		size_t out = 0;
 		uint64_t original_line_number = parser->line_number;
 		for (size_t in = 0; in < read_count; in++) {
-			char c = parser->buf[in];
-			if (c == '\r') {
-				if (in == read_count - 1) {
-					parser->leftover_cr = true;
-				} else if (parser->buf[in + 1] != '\n') {
-					parser_error(parser, "Carriage return with no newline after it.");
+			uint8_t byte = buf[in];
+			if (utf8_state == 0) {
+				if (byte < 0x80) {
+					// ASCII	
+					if (byte == '\r') {
+						if (in == read_count - 1) {
+							parser->leftover_cr = true;
+						} else if (buf[in + 1] != '\n') {
+							parser_error(parser, "Carriage return with no newline after it.");
+						}
+						continue;
+					} else if (byte == '\n') {
+						parser->line_number++;
+					} else if (byte >= 0 && byte < 32 && byte != '\t') {
+						parser_error(parser, "Illegal control character (ASCII code %d)", byte);
+						continue;
+					}
+				} else if (byte < 0xC2) {
+				utf8_invalid_start_byte:
+					parser_error(parser, "Invalid UTF-8 (invalid start byte 0x%02X)", byte);
+					continue;
+				} else if (byte < 0xE0) {
+					// 2-byte sequence
+					utf8_state = UTF8_STATE_1CONT;
+				} else if (byte == 0xE0) {
+					// 3-byte sequence; must check for overlongness
+					utf8_state = UTF8_STATE_2CONT_GTEQ_A0;
+				} else if (byte == 0xED) {
+					// 3-byte sequence; must check for UTF-16 surrogate
+					utf8_state = UTF8_STATE_2CONT_LT_A0;
+				} else if (byte < 0xF0) {
+					// 3-byte sequence
+					utf8_state = UTF8_STATE_3CONT;
+				} else if (byte == 0xF0) {
+					// 4-byte sequence; must check for overlongness
+					utf8_state = UTF8_STATE_3CONT_GTEQ_90;
+				} else if (byte < 0xF4) {
+					// 4-byte sequence
+					utf8_state = UTF8_STATE_3CONT;
+				} else if (byte == 0xF4) {
+					// 4-byte sequence; must check for too-big code points
+					utf8_state = UTF8_STATE_3CONT_LT_90;
+				} else {
+					goto utf8_invalid_start_byte;
+				}
+			} else if (utf8_state == UTF8_STATE_1CONT || utf8_state == UTF8_STATE_2CONT || utf8_state == UTF8_STATE_3CONT) {
+				utf8_state -= 1;
+				if ((byte & 0xC0) != 0x80) {
+					parser_error(parser, "Invalid UTF-8 (want continuation byte, got 0x%02X)", byte);
+					continue;
+				}
+			} else if (utf8_state == UTF8_STATE_2CONT_GTEQ_A0) {
+				utf8_state = UTF8_STATE_1CONT;
+				if (byte < 0xA0 || (byte & 0xC0) != 0x80) {
+					parser_error(parser, "Invalid UTF-8 (want continuation byte >= 0xA0, got 0x%02X)", byte);
+					continue;
+				}
+			} else if (utf8_state == UTF8_STATE_2CONT_LT_A0) {
+				utf8_state = UTF8_STATE_1CONT;
+				if (byte >= 0xA0 || (byte & 0xC0) != 0x80) {
+					parser_error(parser, "Invalid UTF-8 (want continuation byte < 0xA0, got 0x%02X)", byte);
+					continue;
+				}
+			} else if (utf8_state == UTF8_STATE_3CONT_GTEQ_90) {
+				utf8_state = UTF8_STATE_2CONT;
+				if (byte < 0x90 || (byte & 0xC0) != 0x80) {
+					parser_error(parser, "Invalid UTF-8 (want continuation byte >= 0x90, got 0x%02X)", byte);
+					continue;
+				}
+			} else if (utf8_state == UTF8_STATE_3CONT_LT_90) {
+				utf8_state = UTF8_STATE_2CONT;
+				if (byte >= 0x90 || (byte & 0xC0) != 0x80) {
+					parser_error(parser, "Invalid UTF-8 (want continuation byte < 0x90, got 0x%02X)", byte);
+					continue;
 				}
-			} else if (c == '\n') {
-				parser->buf[out++] = c;
-				parser->line_number++;
-			} else if (c >= 0 && c < 32 && c != '\t') {
-				parser_error(parser, "Illegal control character (ASCII code %d)", c);
 			} else {
-				parser->buf[out++] = c;
+				abort(); // should be unreachable.
 			}
+			buf[out++] = byte;
 		}
 		parser->line_number = original_line_number;
 		parser->buf_count = out;
author	pommicket <pommicket@gmail.com>	2025-09-11 21:19:17 -0400
committer	pommicket <pommicket@gmail.com>	2025-09-11 21:19:17 -0400
commit	32d65bf7387c389bf5f3b3d2e94056f50480594f (patch)
tree	84f1c05315fdd016966baca2da1c792b40207394 /pom.c
parent	aab7700e2bba9216a8343e8e4e0cd0096026ba1a (diff)