diff options
author | pommicket <pommicket@gmail.com> | 2025-09-11 21:19:17 -0400 |
---|---|---|
committer | pommicket <pommicket@gmail.com> | 2025-09-11 21:19:17 -0400 |
commit | 32d65bf7387c389bf5f3b3d2e94056f50480594f (patch) | |
tree | 84f1c05315fdd016966baca2da1c792b40207394 /pom.c | |
parent | aab7700e2bba9216a8343e8e4e0cd0096026ba1a (diff) |
Check for valid utf-8
Diffstat (limited to 'pom.c')
-rw-r--r-- | pom.c | 123 |
1 files changed, 109 insertions, 14 deletions
@@ -58,6 +58,25 @@ struct parser_error { uint32_t message; }; +// type for parser::utf8_state +enum utf8_state { + UTF8_STATE_DEFAULT = 0, + // want 1 continuation byte + UTF8_STATE_1CONT = 1, + // want 2 continuation bytes + UTF8_STATE_2CONT = 2, + // want 3 continuation bytes + UTF8_STATE_3CONT = 3, + // want 2 continuation bytes, first one must be >=0xA0 (otherwise encoding is overlong) + UTF8_STATE_2CONT_GTEQ_A0 = 4, + // want 2 continuation bytes, first one must be <0xA0 (otherwise encodes a UTF-16 surrogate) + UTF8_STATE_2CONT_LT_A0 = 5, + // want 3 continuation bytes, first one must be >=0x90 (otherwise encoding is overlong) + UTF8_STATE_3CONT_GTEQ_90 = 6, + // want 3 continuation bytes, first one must be <0x90 (otherwise encoding produces oversized code point) + UTF8_STATE_3CONT_LT_90 = 7, +}; + struct parser { const char *filename; uint64_t line_number; @@ -77,6 +96,8 @@ struct parser { size_t count, capacity; } error_messages; bool eof, out_of_memory, leftover_cr; + // see enum utf8_state -- starting state for future calls to read_func + uint8_t utf8_state; uint16_t buf_pos; uint16_t buf_count; char buf[4096]; @@ -252,6 +273,10 @@ parser_error(struct parser *parser, const char *fmt, ...) { // Reads into parser->line_buf. static void parser_read_line(struct parser *parser) { + if (parser->eof) { + parser->line.array[0] = 0; + return; + } parser->line.count = 0; while (true) { char *line_out = parser_append(parser, line, sizeof parser->buf + 1); @@ -266,35 +291,105 @@ parser_read_line(struct parser *parser) { } // ensure next append goes in the right place. parser->line.count = line_out - parser->line.array; + char *buf = parser->buf; // read more data into buf - size_t read_count = parser->read_func(parser->userdata, parser->buf, sizeof parser->buf - 1); + size_t read_count = parser->read_func(parser->userdata, buf, sizeof parser->buf - 1); parser->buf_pos = 0; + uint8_t utf8_state = parser->utf8_state; if (read_count == 0) { // EOF reached. 
+ if (utf8_state) { + parser_error(parser, "Invalid UTF-8 (want continuation byte, got EOF)."); + } parser->eof = true; *line_out = 0; return; } - if (parser->leftover_cr && parser->buf[0] != '\n') + parser->utf8_state = utf8_state; + if (parser->leftover_cr && buf[0] != '\n') parser_error(parser, "Carriage return with no newline after it."); size_t out = 0; uint64_t original_line_number = parser->line_number; for (size_t in = 0; in < read_count; in++) { - char c = parser->buf[in]; - if (c == '\r') { - if (in == read_count - 1) { - parser->leftover_cr = true; - } else if (parser->buf[in + 1] != '\n') { - parser_error(parser, "Carriage return with no newline after it."); + uint8_t byte = buf[in]; + if (utf8_state == 0) { + if (byte < 0x80) { + // ASCII + if (byte == '\r') { + if (in == read_count - 1) { + parser->leftover_cr = true; + } else if (buf[in + 1] != '\n') { + parser_error(parser, "Carriage return with no newline after it."); + } + continue; + } else if (byte == '\n') { + parser->line_number++; + } else if (byte >= 0 && byte < 32 && byte != '\t') { + parser_error(parser, "Illegal control character (ASCII code %d)", byte); + continue; + } + } else if (byte < 0xC2) { + utf8_invalid_start_byte: + parser_error(parser, "Invalid UTF-8 (invalid start byte 0x%02X)", byte); + continue; + } else if (byte < 0xE0) { + // 2-byte sequence + utf8_state = UTF8_STATE_1CONT; + } else if (byte == 0xE0) { + // 3-byte sequence; must check for overlongness + utf8_state = UTF8_STATE_2CONT_GTEQ_A0; + } else if (byte == 0xED) { + // 3-byte sequence; must check for UTF-16 surrogate + utf8_state = UTF8_STATE_2CONT_LT_A0; + } else if (byte < 0xF0) { + // 3-byte sequence + utf8_state = UTF8_STATE_3CONT; + } else if (byte == 0xF0) { + // 4-byte sequence; must check for overlongness + utf8_state = UTF8_STATE_3CONT_GTEQ_90; + } else if (byte < 0xF4) { + // 4-byte sequence + utf8_state = UTF8_STATE_3CONT; + } else if (byte == 0xF4) { + // 4-byte sequence; must check for too-big code 
points + utf8_state = UTF8_STATE_3CONT_LT_90; + } else { + goto utf8_invalid_start_byte; + } + } else if (utf8_state == UTF8_STATE_1CONT || utf8_state == UTF8_STATE_2CONT || utf8_state == UTF8_STATE_3CONT) { + utf8_state -= 1; + if ((byte & 0xC0) != 0x80) { + parser_error(parser, "Invalid UTF-8 (want continuation byte, got 0x%02X)", byte); + continue; + } + } else if (utf8_state == UTF8_STATE_2CONT_GTEQ_A0) { + utf8_state = UTF8_STATE_1CONT; + if (byte < 0xA0 || (byte & 0xC0) != 0x80) { + parser_error(parser, "Invalid UTF-8 (want continuation byte >= 0xA0, got 0x%02X)", byte); + continue; + } + } else if (utf8_state == UTF8_STATE_2CONT_LT_A0) { + utf8_state = UTF8_STATE_1CONT; + if (byte >= 0xA0 || (byte & 0xC0) != 0x80) { + parser_error(parser, "Invalid UTF-8 (want continuation byte < 0xA0, got 0x%02X)", byte); + continue; + } + } else if (utf8_state == UTF8_STATE_3CONT_GTEQ_90) { + utf8_state = UTF8_STATE_2CONT; + if (byte < 0x90 || (byte & 0xC0) != 0x80) { + parser_error(parser, "Invalid UTF-8 (want continuation byte >= 0x90, got 0x%02X)", byte); + continue; + } + } else if (utf8_state == UTF8_STATE_3CONT_LT_90) { + utf8_state = UTF8_STATE_2CONT; + if (byte >= 0x90 || (byte & 0xC0) != 0x80) { + parser_error(parser, "Invalid UTF-8 (want continuation byte < 0x90, got 0x%02X)", byte); + continue; } - } else if (c == '\n') { - parser->buf[out++] = c; - parser->line_number++; - } else if (c >= 0 && c < 32 && c != '\t') { - parser_error(parser, "Illegal control character (ASCII code %d)", c); } else { - parser->buf[out++] = c; + abort(); // should be unreachable. } + buf[out++] = byte; } parser->line_number = original_line_number; parser->buf_count = out; |