improve unicode_utf8_to_utf32 to handle all types of bad UTF-8

author: pommicket <pommicket@gmail.com> 2022-12-23 14:03:31 -0500
committer: pommicket <pommicket@gmail.com> 2022-12-23 14:03:31 -0500
commit: 135d90d050869f868a47061a1df68f22a36547de (patch)
tree: 5388d8eeecedbf7b1deaadcc29030bd153c79b4a /unicode.h
parent: 806638e5ec9f43fb087e01620f8370c0d2ff47b3 (diff)
1 files changed, 17 insertions, 14 deletions
diff --git a/unicode.h b/unicode.h
index fb9810a..6f48e85 100644
--- a/unicode.h
+++ b/unicode.h
@@ -11,19 +11,20 @@ static bool unicode_is_start_of_code_point(u8 byte) {
 
 // A lot like mbrtoc32. Doesn't depend on the locale though, for one thing.
 // *c will be filled with the next UTF-8 code point in `str`. `bytes` refers to the maximum
-// number of bytes that can be read from `str`.
+// number of bytes that can be read from `str` (note: this function will never read past a null
+// byte, even if `bytes` indicates that it could).
 // Returns:
-// 0 - if a null character was encountered
+// 0 - if a null character was encountered or if `bytes == 0`
 // (size_t)-1 - on invalid UTF-8
 // (size_t)-2 - on incomplete code point (str should be longer)
 // other - the number of bytes read from `str`.
-static size_t unicode_utf8_to_utf32(char32_t *c, char const *str, size_t bytes) {
+static size_t unicode_utf8_to_utf32(char32_t *c, const char *str, size_t bytes) {
+	*c = 0;
 	if (bytes == 0) {
-		*c = 0;
 		return 0;
 	}
 	// it's easier to do things with unsigned integers
-	u8 const *p = (u8 const *)str;
+	const u8 *p = (const u8 *)str;
 
 	u8 first_byte = *p;
 	
@@ -33,13 +34,13 @@ static size_t unicode_utf8_to_utf32(char32_t *c, char const *str, size_t bytes)
 			if (bytes >= 2) {
 				++p;
 				u32 second_byte = *p;
+				if ((second_byte & 0xC0) != 0x80) return (size_t)-1;
 				u32 value = ((u32)first_byte & 0x1F) << 6
 					| (second_byte & 0x3F);
 				*c = (char32_t)value;
 				return 2;
 			} else {
 				// incomplete code point
-				*c = 0;
 				return (size_t)-2;
 			}
 		}
@@ -48,8 +49,10 @@ static size_t unicode_utf8_to_utf32(char32_t *c, char const *str, size_t bytes)
 			if (bytes >= 3) {
 				++p;
 				u32 second_byte = *p;
+				if ((second_byte & 0xC0) != 0x80) return (size_t)-1;
 				++p;
 				u32 third_byte = *p;
+				if ((third_byte & 0xC0) != 0x80) return (size_t)-1;
 				u32 value = ((u32)first_byte & 0x0F) << 12
 					| (second_byte & 0x3F) << 6
 					| (third_byte & 0x3F);
@@ -58,12 +61,10 @@ static size_t unicode_utf8_to_utf32(char32_t *c, char const *str, size_t bytes)
 					return 3;
 				} else {
 					// reserved for UTF-16 surrogate halves
-					*c = 0;
 					return (size_t)-1;
 				}
 			} else {
 				// incomplete
-				*c = 0;
 				return (size_t)-2;
 			}
 		}
@@ -72,35 +73,37 @@ static size_t unicode_utf8_to_utf32(char32_t *c, char const *str, size_t bytes)
 			if (bytes >= 4) {
 				++p;
 				u32 second_byte = *p;
+				if ((second_byte & 0xC0) != 0x80) return (size_t)-1;
 				++p;
 				u32 third_byte = *p;
+				if ((third_byte & 0xC0) != 0x80) return (size_t)-1;
 				++p;
 				u32 fourth_byte = *p;
+				if ((fourth_byte & 0xC0) != 0x80) return (size_t)-1;
 				u32 value = ((u32)first_byte & 0x07) << 18
 					| (second_byte & 0x3F) << 12
 					| (third_byte  & 0x3F) << 6
 					| (fourth_byte & 0x3F);
-				if (value <= 0x10FFFF) {
+				if (value >= 0xD800 && value <= 0xDFFF) {
+					// reserved for UTF-16 surrogate halves
+					return (size_t)-1;
+				} else if (value <= 0x10FFFF) {
 					*c = (char32_t)value;
 					return 4;
 				} else {
-					// Code points this big can't be encoded by UTF-16 and so are invalid UTF-8.
-					*c = 0;
+					// Code points above U+10FFFF can't be encoded by UTF-16, so they are invalid UTF-8.
 					return (size_t)-1;
 				}
 			} else {
 				// incomplete
-				*c = 0;
 				return (size_t)-2;
 			}
 		}
 		// invalid UTF-8
-		*c = 0;
 		return (size_t)-1;
 	} else {
 		// ASCII character
 		if (first_byte == 0) {
-			*c = 0;
 			return 0;
 		}
 		*c = first_byte;
author	pommicket <pommicket@gmail.com>	2022-12-23 14:03:31 -0500
committer	pommicket <pommicket@gmail.com>	2022-12-23 14:03:31 -0500
commit	135d90d050869f868a47061a1df68f22a36547de (patch)
tree	5388d8eeecedbf7b1deaadcc29030bd153c79b4a /unicode.h
parent	806638e5ec9f43fb087e01620f8370c0d2ff47b3 (diff)