diff options
author | pommicket <pommicket@gmail.com> | 2025-09-11 21:33:13 -0400 |
---|---|---|
committer | pommicket <pommicket@gmail.com> | 2025-09-11 21:33:13 -0400 |
commit | 17cb42b2e533e4207e135752a356180573a566e7 (patch) | |
tree | 4ba90ca31e5d4b581c71f8acb7887af2f7bd62c8 /pom.c | |
parent | 32d65bf7387c389bf5f3b3d2e94056f50480594f (diff) |
skip byte order mark
Diffstat (limited to 'pom.c')
-rw-r--r-- | pom.c | 218 |
1 file changed, 118 insertions, 100 deletions
@@ -270,6 +270,120 @@ parser_error(struct parser *parser, const char *fmt, ...) { error->message = message_idx; } +// read more data into parser->buf. returns false on EOF. +static bool +parser_read_to_buf(struct parser *parser, bool skip_bom) { + if (parser->eof) return false; + char *buf = parser->buf; + size_t read_count = parser->read_func(parser->userdata, buf, sizeof parser->buf - 1); + parser->buf_pos = 0; + uint8_t utf8_state = parser->utf8_state; + if (read_count == 0) { + // EOF reached. + if (utf8_state) { + parser_error(parser, "Invalid UTF-8 (want continuation byte, got EOF)."); + } + parser->eof = true; + return false; + } + parser->utf8_state = utf8_state; + if (parser->leftover_cr && buf[0] != '\n') + parser_error(parser, "Carriage return with no newline after it."); + size_t in = 0, out = 0; + uint64_t original_line_number = parser->line_number; + if (skip_bom && read_count >= 3 + && (uint8_t)parser->buf[0] == 0xEF + && (uint8_t)parser->buf[1] == 0xBB + && (uint8_t)parser->buf[2] == 0xBF) { + // skip byte-order mark + in = 3; + } + for (; in < read_count; in++) { + uint8_t byte = buf[in]; + if (utf8_state == 0) { + if (byte < 0x80) { + // ASCII + if (byte == '\r') { + if (in == read_count - 1) { + parser->leftover_cr = true; + } else if (buf[in + 1] != '\n') { + parser_error(parser, "Carriage return with no newline after it."); + } + continue; + } else if (byte == '\n') { + parser->line_number++; + } else if (byte >= 0 && byte < 32 && byte != '\t') { + parser_error(parser, "Illegal control character (ASCII code %d)", byte); + continue; + } + } else if (byte < 0xC2) { + utf8_invalid_start_byte: + parser_error(parser, "Invalid UTF-8 (invalid start byte 0x%02X)", byte); + continue; + } else if (byte < 0xE0) { + // 2-byte sequence + utf8_state = UTF8_STATE_1CONT; + } else if (byte == 0xE0) { + // 3-byte sequence; must check for overlongness + utf8_state = UTF8_STATE_2CONT_GTEQ_A0; + } else if (byte == 0xED) { + // 3-byte sequence; must check for UTF-16 
surrogate + utf8_state = UTF8_STATE_2CONT_LT_A0; + } else if (byte < 0xF0) { + // 3-byte sequence + utf8_state = UTF8_STATE_3CONT; + } else if (byte == 0xF0) { + // 4-byte sequence; must check for overlongness + utf8_state = UTF8_STATE_3CONT_GTEQ_90; + } else if (byte < 0xF4) { + // 4-byte sequence + utf8_state = UTF8_STATE_3CONT; + } else if (byte == 0xF4) { + // 4-byte sequence; must check for too-big code points + utf8_state = UTF8_STATE_3CONT_LT_90; + } else { + goto utf8_invalid_start_byte; + } + } else if (utf8_state == UTF8_STATE_1CONT || utf8_state == UTF8_STATE_2CONT || utf8_state == UTF8_STATE_3CONT) { + utf8_state -= 1; + if ((byte & 0xC0) != 0x80) { + parser_error(parser, "Invalid UTF-8 (want continuation byte, got 0x%02X)", byte); + continue; + } + } else if (utf8_state == UTF8_STATE_2CONT_GTEQ_A0) { + utf8_state = UTF8_STATE_1CONT; + if (byte < 0xA0 || (byte & 0xC0) != 0x80) { + parser_error(parser, "Invalid UTF-8 (want continuation byte >= 0xA0, got 0x%02X)", byte); + continue; + } + } else if (utf8_state == UTF8_STATE_2CONT_LT_A0) { + utf8_state = UTF8_STATE_1CONT; + if (byte >= 0xA0 || (byte & 0xC0) != 0x80) { + parser_error(parser, "Invalid UTF-8 (want continuation byte < 0xA0, got 0x%02X)", byte); + continue; + } + } else if (utf8_state == UTF8_STATE_3CONT_GTEQ_90) { + utf8_state = UTF8_STATE_2CONT; + if (byte < 0x90 || (byte & 0xC0) != 0x80) { + parser_error(parser, "Invalid UTF-8 (want continuation byte >= 0x90, got 0x%02X)", byte); + continue; + } + } else if (utf8_state == UTF8_STATE_3CONT_LT_90) { + utf8_state = UTF8_STATE_2CONT; + if (byte >= 0x90 || (byte & 0xC0) != 0x80) { + parser_error(parser, "Invalid UTF-8 (want continuation byte < 0x90, got 0x%02X)", byte); + continue; + } + } else { + abort(); // should be unreachable. + } + buf[out++] = byte; + } + parser->line_number = original_line_number; + parser->buf_count = out; + return true; +} + // Reads into parser->line_buf. 
static void parser_read_line(struct parser *parser) { @@ -291,108 +405,11 @@ parser_read_line(struct parser *parser) { } // ensure next append goes in the right place. parser->line.count = line_out - parser->line.array; - char *buf = parser->buf; - // read more data into buf - size_t read_count = parser->read_func(parser->userdata, buf, sizeof parser->buf - 1); - parser->buf_pos = 0; - uint8_t utf8_state = parser->utf8_state; - if (read_count == 0) { - // EOF reached. - if (utf8_state) { - parser_error(parser, "Invalid UTF-8 (want continuation byte, got EOF)."); - } - parser->eof = true; + if (!parser_read_to_buf(parser, false)) { + // reached EOF *line_out = 0; return; } - parser->utf8_state = utf8_state; - if (parser->leftover_cr && buf[0] != '\n') - parser_error(parser, "Carriage return with no newline after it."); - size_t out = 0; - uint64_t original_line_number = parser->line_number; - for (size_t in = 0; in < read_count; in++) { - uint8_t byte = buf[in]; - if (utf8_state == 0) { - if (byte < 0x80) { - // ASCII - if (byte == '\r') { - if (in == read_count - 1) { - parser->leftover_cr = true; - } else if (buf[in + 1] != '\n') { - parser_error(parser, "Carriage return with no newline after it."); - } - continue; - } else if (byte == '\n') { - parser->line_number++; - } else if (byte >= 0 && byte < 32 && byte != '\t') { - parser_error(parser, "Illegal control character (ASCII code %d)", byte); - continue; - } - } else if (byte < 0xC2) { - utf8_invalid_start_byte: - parser_error(parser, "Invalid UTF-8 (invalid start byte 0x%02X)", byte); - continue; - } else if (byte < 0xE0) { - // 2-byte sequence - utf8_state = UTF8_STATE_1CONT; - } else if (byte == 0xE0) { - // 3-byte sequence; must check for overlongness - utf8_state = UTF8_STATE_2CONT_GTEQ_A0; - } else if (byte == 0xED) { - // 3-byte sequence; must check for UTF-16 surrogate - utf8_state = UTF8_STATE_2CONT_LT_A0; - } else if (byte < 0xF0) { - // 3-byte sequence - utf8_state = UTF8_STATE_3CONT; - } else if 
(byte == 0xF0) { - // 4-byte sequence; must check for overlongness - utf8_state = UTF8_STATE_3CONT_GTEQ_90; - } else if (byte < 0xF4) { - // 4-byte sequence - utf8_state = UTF8_STATE_3CONT; - } else if (byte == 0xF4) { - // 4-byte sequence; must check for too-big code points - utf8_state = UTF8_STATE_3CONT_LT_90; - } else { - goto utf8_invalid_start_byte; - } - } else if (utf8_state == UTF8_STATE_1CONT || utf8_state == UTF8_STATE_2CONT || utf8_state == UTF8_STATE_3CONT) { - utf8_state -= 1; - if ((byte & 0xC0) != 0x80) { - parser_error(parser, "Invalid UTF-8 (want continuation byte, got 0x%02X)", byte); - continue; - } - } else if (utf8_state == UTF8_STATE_2CONT_GTEQ_A0) { - utf8_state = UTF8_STATE_1CONT; - if (byte < 0xA0 || (byte & 0xC0) != 0x80) { - parser_error(parser, "Invalid UTF-8 (want continuation byte >= 0xA0, got 0x%02X)", byte); - continue; - } - } else if (utf8_state == UTF8_STATE_2CONT_LT_A0) { - utf8_state = UTF8_STATE_1CONT; - if (byte >= 0xA0 || (byte & 0xC0) != 0x80) { - parser_error(parser, "Invalid UTF-8 (want continuation byte < 0xA0, got 0x%02X)", byte); - continue; - } - } else if (utf8_state == UTF8_STATE_3CONT_GTEQ_90) { - utf8_state = UTF8_STATE_2CONT; - if (byte < 0x90 || (byte & 0xC0) != 0x80) { - parser_error(parser, "Invalid UTF-8 (want continuation byte >= 0x90, got 0x%02X)", byte); - continue; - } - } else if (utf8_state == UTF8_STATE_3CONT_LT_90) { - utf8_state = UTF8_STATE_2CONT; - if (byte >= 0x90 || (byte & 0xC0) != 0x80) { - parser_error(parser, "Invalid UTF-8 (want continuation byte < 0x90, got 0x%02X)", byte); - continue; - } - } else { - abort(); // should be unreachable. - } - buf[out++] = byte; - } - parser->line_number = original_line_number; - parser->buf_count = out; } } @@ -436,7 +453,8 @@ pom_load(const char *filename, parser->read_func = read_func; parser->userdata = userdata; parser->line_number = 1; - + // read into parser->buf, and skip initial BOM if present. 
+ parser_read_to_buf(parser, true); while (!(parser->eof || parser->out_of_memory)) parse_line(parser); |