summaryrefslogtreecommitdiff
path: root/pom.c
diff options
context:
space:
mode:
authorpommicket <pommicket@gmail.com>2025-09-11 21:33:13 -0400
committerpommicket <pommicket@gmail.com>2025-09-11 21:33:13 -0400
commit17cb42b2e533e4207e135752a356180573a566e7 (patch)
tree4ba90ca31e5d4b581c71f8acb7887af2f7bd62c8 /pom.c
parent32d65bf7387c389bf5f3b3d2e94056f50480594f (diff)
skip byte order mark
Diffstat (limited to 'pom.c')
-rw-r--r--pom.c218
1 files changed, 118 insertions, 100 deletions
diff --git a/pom.c b/pom.c
index f1dd041..78dfe68 100644
--- a/pom.c
+++ b/pom.c
@@ -270,6 +270,120 @@ parser_error(struct parser *parser, const char *fmt, ...) {
error->message = message_idx;
}
+// Read the next chunk from parser->read_func into parser->buf and filter it in place: '\r' bytes and every byte that triggers a parser_error in the loop are dropped; with skip_bom set, a leading UTF-8 byte order mark is skipped. Returns false if EOF was already recorded or the read returns 0 bytes, true otherwise.
+static bool
+parser_read_to_buf(struct parser *parser, bool skip_bom) {
+ if (parser->eof) return false; // eof is set below once a read returns 0 bytes; nothing more to read
+ char *buf = parser->buf;
+ size_t read_count = parser->read_func(parser->userdata, buf, sizeof parser->buf - 1); // asks for at most sizeof buf - 1 bytes; presumably returns the number of bytes stored -- confirm against read_func's contract
+ parser->buf_pos = 0; // presumably the consumer's offset into buf, rewound for the new chunk -- confirm
+ uint8_t utf8_state = parser->utf8_state; // local working copy of the UTF-8 validation state
+ if (read_count == 0) {
+ // EOF reached (the read produced no bytes).
+ if (utf8_state) { // nonzero state: a multi-byte sequence was still waiting for continuation bytes
+ parser_error(parser, "Invalid UTF-8 (want continuation byte, got EOF).");
+ }
+ parser->eof = true;
+ return false;
+ }
+ parser->utf8_state = utf8_state; // NOTE(review): utf8_state was copied from parser->utf8_state just above, so this store changes nothing; the value updated by the loop below is never written back in this function -- confirm whether this assignment was meant to follow the loop
+ if (parser->leftover_cr && buf[0] != '\n') // the previous chunk ended in '\r' and this one does not start with '\n'
+ parser_error(parser, "Carriage return with no newline after it."); // NOTE(review): leftover_cr is not cleared anywhere in this function -- confirm it is reset elsewhere
+ size_t in = 0, out = 0; // in: read index, out: write index; buf is compacted in place
+ uint64_t original_line_number = parser->line_number; // saved so the increments made in the loop can be undone afterwards
+ if (skip_bom && read_count >= 3 // NOTE(review): a BOM is only recognised when this single read returned at least 3 bytes
+ && (uint8_t)parser->buf[0] == 0xEF
+ && (uint8_t)parser->buf[1] == 0xBB
+ && (uint8_t)parser->buf[2] == 0xBF) {
+ // skip byte-order mark (bytes EF BB BF) by starting the scan after it
+ in = 3;
+ }
+ for (; in < read_count; in++) {
+ uint8_t byte = buf[in]; // unsigned so the range comparisons below behave for bytes >= 0x80
+ if (utf8_state == 0) { // state 0: not inside a multi-byte sequence, so this byte starts a new character
+ if (byte < 0x80) {
+ // ASCII
+ if (byte == '\r') {
+ if (in == read_count - 1) { // '\r' is the last byte of this chunk; its '\n' check is deferred to the next call
+ parser->leftover_cr = true;
+ } else if (buf[in + 1] != '\n') {
+ parser_error(parser, "Carriage return with no newline after it.");
+ }
+ continue; // '\r' is never copied to the output
+ } else if (byte == '\n') {
+ parser->line_number++; // temporary: restored to original_line_number after the loop
+ } else if (byte >= 0 && byte < 32 && byte != '\t') { // NOTE(review): byte is a uint8_t, so "byte >= 0" is always true
+ parser_error(parser, "Illegal control character (ASCII code %d)", byte);
+ continue; // drop the control character
+ }
+ } else if (byte < 0xC2) { // 0x80..0xC1: continuation bytes, plus 0xC0/0xC1 which could only begin overlong 2-byte forms
+ utf8_invalid_start_byte: // also reached via goto for start bytes above 0xF4
+ parser_error(parser, "Invalid UTF-8 (invalid start byte 0x%02X)", byte);
+ continue; // drop the invalid byte
+ } else if (byte < 0xE0) {
+ // 2-byte sequence
+ utf8_state = UTF8_STATE_1CONT;
+ } else if (byte == 0xE0) {
+ // 3-byte sequence; must check for overlongness
+ utf8_state = UTF8_STATE_2CONT_GTEQ_A0;
+ } else if (byte == 0xED) {
+ // 3-byte sequence; must check for UTF-16 surrogate
+ utf8_state = UTF8_STATE_2CONT_LT_A0;
+ } else if (byte < 0xF0) {
+ // 3-byte sequence
+ utf8_state = UTF8_STATE_3CONT; // NOTE(review): a 3-byte sequence has two continuation bytes, yet this picks the same state as the 4-byte case below; given the "-= 1" stepping this looks like it should be UTF8_STATE_2CONT -- confirm against the state definitions
+ } else if (byte == 0xF0) {
+ // 4-byte sequence; must check for overlongness
+ utf8_state = UTF8_STATE_3CONT_GTEQ_90;
+ } else if (byte < 0xF4) {
+ // 4-byte sequence
+ utf8_state = UTF8_STATE_3CONT;
+ } else if (byte == 0xF4) {
+ // 4-byte sequence; must check for too-big code points
+ utf8_state = UTF8_STATE_3CONT_LT_90;
+ } else {
+ goto utf8_invalid_start_byte;
+ }
+ } else if (utf8_state == UTF8_STATE_1CONT || utf8_state == UTF8_STATE_2CONT || utf8_state == UTF8_STATE_3CONT) { // unconstrained states: any continuation byte is acceptable
+ utf8_state -= 1; // one fewer continuation byte outstanding; the state advances even if the byte is rejected below
+ if ((byte & 0xC0) != 0x80) { // continuation bytes have the bit pattern 10xxxxxx
+ parser_error(parser, "Invalid UTF-8 (want continuation byte, got 0x%02X)", byte);
+ continue;
+ }
+ } else if (utf8_state == UTF8_STATE_2CONT_GTEQ_A0) { // entered after lead byte 0xE0
+ utf8_state = UTF8_STATE_1CONT;
+ if (byte < 0xA0 || (byte & 0xC0) != 0x80) {
+ parser_error(parser, "Invalid UTF-8 (want continuation byte >= 0xA0, got 0x%02X)", byte);
+ continue;
+ }
+ } else if (utf8_state == UTF8_STATE_2CONT_LT_A0) { // entered after lead byte 0xED
+ utf8_state = UTF8_STATE_1CONT;
+ if (byte >= 0xA0 || (byte & 0xC0) != 0x80) {
+ parser_error(parser, "Invalid UTF-8 (want continuation byte < 0xA0, got 0x%02X)", byte);
+ continue;
+ }
+ } else if (utf8_state == UTF8_STATE_3CONT_GTEQ_90) { // entered after lead byte 0xF0
+ utf8_state = UTF8_STATE_2CONT;
+ if (byte < 0x90 || (byte & 0xC0) != 0x80) {
+ parser_error(parser, "Invalid UTF-8 (want continuation byte >= 0x90, got 0x%02X)", byte);
+ continue;
+ }
+ } else if (utf8_state == UTF8_STATE_3CONT_LT_90) { // entered after lead byte 0xF4
+ utf8_state = UTF8_STATE_2CONT;
+ if (byte >= 0x90 || (byte & 0xC0) != 0x80) {
+ parser_error(parser, "Invalid UTF-8 (want continuation byte < 0x90, got 0x%02X)", byte);
+ continue;
+ }
+ } else {
+ abort(); // should be unreachable.
+ }
+ buf[out++] = byte; // keep the byte; out never exceeds in, so unread input is not overwritten
+ }
+ parser->line_number = original_line_number; // undo the increments made while scanning this chunk
+ parser->buf_count = out; // number of bytes kept in buf after filtering
+ return true; // a chunk was read (buf_count can still be 0 if every byte was dropped)
+}
+
// Reads into parser->line_buf.
static void
parser_read_line(struct parser *parser) {
@@ -291,108 +405,11 @@ parser_read_line(struct parser *parser) {
}
// ensure next append goes in the right place.
parser->line.count = line_out - parser->line.array;
- char *buf = parser->buf;
- // read more data into buf
- size_t read_count = parser->read_func(parser->userdata, buf, sizeof parser->buf - 1);
- parser->buf_pos = 0;
- uint8_t utf8_state = parser->utf8_state;
- if (read_count == 0) {
- // EOF reached.
- if (utf8_state) {
- parser_error(parser, "Invalid UTF-8 (want continuation byte, got EOF).");
- }
- parser->eof = true;
+ if (!parser_read_to_buf(parser, false)) {
+ // reached EOF
*line_out = 0;
return;
}
- parser->utf8_state = utf8_state;
- if (parser->leftover_cr && buf[0] != '\n')
- parser_error(parser, "Carriage return with no newline after it.");
- size_t out = 0;
- uint64_t original_line_number = parser->line_number;
- for (size_t in = 0; in < read_count; in++) {
- uint8_t byte = buf[in];
- if (utf8_state == 0) {
- if (byte < 0x80) {
- // ASCII
- if (byte == '\r') {
- if (in == read_count - 1) {
- parser->leftover_cr = true;
- } else if (buf[in + 1] != '\n') {
- parser_error(parser, "Carriage return with no newline after it.");
- }
- continue;
- } else if (byte == '\n') {
- parser->line_number++;
- } else if (byte >= 0 && byte < 32 && byte != '\t') {
- parser_error(parser, "Illegal control character (ASCII code %d)", byte);
- continue;
- }
- } else if (byte < 0xC2) {
- utf8_invalid_start_byte:
- parser_error(parser, "Invalid UTF-8 (invalid start byte 0x%02X)", byte);
- continue;
- } else if (byte < 0xE0) {
- // 2-byte sequence
- utf8_state = UTF8_STATE_1CONT;
- } else if (byte == 0xE0) {
- // 3-byte sequence; must check for overlongness
- utf8_state = UTF8_STATE_2CONT_GTEQ_A0;
- } else if (byte == 0xED) {
- // 3-byte sequence; must check for UTF-16 surrogate
- utf8_state = UTF8_STATE_2CONT_LT_A0;
- } else if (byte < 0xF0) {
- // 3-byte sequence
- utf8_state = UTF8_STATE_3CONT;
- } else if (byte == 0xF0) {
- // 4-byte sequence; must check for overlongness
- utf8_state = UTF8_STATE_3CONT_GTEQ_90;
- } else if (byte < 0xF4) {
- // 4-byte sequence
- utf8_state = UTF8_STATE_3CONT;
- } else if (byte == 0xF4) {
- // 4-byte sequence; must check for too-big code points
- utf8_state = UTF8_STATE_3CONT_LT_90;
- } else {
- goto utf8_invalid_start_byte;
- }
- } else if (utf8_state == UTF8_STATE_1CONT || utf8_state == UTF8_STATE_2CONT || utf8_state == UTF8_STATE_3CONT) {
- utf8_state -= 1;
- if ((byte & 0xC0) != 0x80) {
- parser_error(parser, "Invalid UTF-8 (want continuation byte, got 0x%02X)", byte);
- continue;
- }
- } else if (utf8_state == UTF8_STATE_2CONT_GTEQ_A0) {
- utf8_state = UTF8_STATE_1CONT;
- if (byte < 0xA0 || (byte & 0xC0) != 0x80) {
- parser_error(parser, "Invalid UTF-8 (want continuation byte >= 0xA0, got 0x%02X)", byte);
- continue;
- }
- } else if (utf8_state == UTF8_STATE_2CONT_LT_A0) {
- utf8_state = UTF8_STATE_1CONT;
- if (byte >= 0xA0 || (byte & 0xC0) != 0x80) {
- parser_error(parser, "Invalid UTF-8 (want continuation byte < 0xA0, got 0x%02X)", byte);
- continue;
- }
- } else if (utf8_state == UTF8_STATE_3CONT_GTEQ_90) {
- utf8_state = UTF8_STATE_2CONT;
- if (byte < 0x90 || (byte & 0xC0) != 0x80) {
- parser_error(parser, "Invalid UTF-8 (want continuation byte >= 0x90, got 0x%02X)", byte);
- continue;
- }
- } else if (utf8_state == UTF8_STATE_3CONT_LT_90) {
- utf8_state = UTF8_STATE_2CONT;
- if (byte >= 0x90 || (byte & 0xC0) != 0x80) {
- parser_error(parser, "Invalid UTF-8 (want continuation byte < 0x90, got 0x%02X)", byte);
- continue;
- }
- } else {
- abort(); // should be unreachable.
- }
- buf[out++] = byte;
- }
- parser->line_number = original_line_number;
- parser->buf_count = out;
}
}
@@ -436,7 +453,8 @@ pom_load(const char *filename,
parser->read_func = read_func;
parser->userdata = userdata;
parser->line_number = 1;
-
+ // read into parser->buf, and skip initial BOM if present.
+ parser_read_to_buf(parser, true);
while (!(parser->eof || parser->out_of_memory))
parse_line(parser);