From 135d90d050869f868a47061a1df68f22a36547de Mon Sep 17 00:00:00 2001 From: pommicket Date: Fri, 23 Dec 2022 14:03:31 -0500 Subject: improve unicode_utf8_to_utf32 to handle all types of bad UTF-8 --- unicode.h | 31 +++++++++++++++++-------------- 1 file changed, 17 insertions(+), 14 deletions(-) (limited to 'unicode.h') diff --git a/unicode.h b/unicode.h index fb9810a..6f48e85 100644 --- a/unicode.h +++ b/unicode.h @@ -11,19 +11,20 @@ static bool unicode_is_start_of_code_point(u8 byte) { // A lot like mbrtoc32. Doesn't depend on the locale though, for one thing. // *c will be filled with the next UTF-8 code point in `str`. `bytes` refers to the maximum -// number of bytes that can be read from `str`. +// number of bytes that can be read from `str` (note: this function will never read past a null +// byte, even if `bytes` indicates that it could). // Returns: -// 0 - if a null character was encountered +// 0 - if a null character was encountered or if `bytes == 0` // (size_t)-1 - on invalid UTF-8 // (size_t)-2 - on incomplete code point (str should be longer) // other - the number of bytes read from `str`. -static size_t unicode_utf8_to_utf32(char32_t *c, char const *str, size_t bytes) { +static size_t unicode_utf8_to_utf32(char32_t *c, const char *str, size_t bytes) { + *c = 0; if (bytes == 0) { - *c = 0; return 0; } // it's easier to do things with unsigned integers - u8 const *p = (u8 const *)str; + const u8 *p = (const u8 *)str; u8 first_byte = *p; @@ -33,13 +34,13 @@ static size_t unicode_utf8_to_utf32(char32_t *c, char const *str, size_t bytes) if (bytes >= 2) { ++p; u32 second_byte = *p; + if ((second_byte & 0xC0) != 0x80) return (size_t)-1; u32 value = ((u32)first_byte & 0x1F) << 6 | (second_byte & 0x3F); *c = (char32_t)value; return 2; } else { // incomplete code point - *c = 0; return (size_t)-2; } } @@ -48,8 +49,10 @@ static size_t unicode_utf8_to_utf32(char32_t *c, char const *str, size_t bytes) if (bytes >= 3) { ++p; u32 second_byte = *p; + if ((second_byte & 0xC0) != 0x80) return (size_t)-1; ++p; u32 third_byte = *p; + if ((third_byte & 0xC0) != 0x80) return (size_t)-1; u32 value = ((u32)first_byte & 0x0F) << 12 | (second_byte & 0x3F) << 6 | (third_byte & 0x3F); @@ -58,12 +61,10 @@ static size_t unicode_utf8_to_utf32(char32_t *c, char const *str, size_t bytes) return 3; } else { // reserved for UTF-16 surrogate halves - *c = 0; return (size_t)-1; } } else { // incomplete - *c = 0; return (size_t)-2; } } @@ -72,35 +73,37 @@ static size_t unicode_utf8_to_utf32(char32_t *c, char const *str, size_t bytes) if (bytes >= 4) { ++p; u32 second_byte = *p; + if ((second_byte & 0xC0) != 0x80) return (size_t)-1; ++p; u32 third_byte = *p; + if ((third_byte & 0xC0) != 0x80) return (size_t)-1; ++p; u32 fourth_byte = *p; + if ((fourth_byte & 0xC0) != 0x80) return (size_t)-1; u32 value = ((u32)first_byte & 0x07) << 18 | (second_byte & 0x3F) << 12 | (third_byte & 0x3F) << 6 | (fourth_byte & 0x3F); - if (value <= 0x10FFFF) { + if (value >= 0xD800 && value <= 0xDFFF) { + // reserved for UTF-16 surrogate halves + return (size_t)-1; + } else if (value <= 0x10FFFF) { *c = (char32_t)value; return 4; } else { - // Code points this big can't be encoded by UTF-16 and so are invalid UTF-8. - *c = 0; + // Code points this big can't be encoded by UTF-16 so are invalid UTF-8. return (size_t)-1; } } else { // incomplete - *c = 0; return (size_t)-2; } } // invalid UTF-8 - *c = 0; return (size_t)-1; } else { // ASCII character if (first_byte == 0) { - *c = 0; return 0; } *c = first_byte; -- cgit v1.2.3