summary | refs | log | tree | commit | diff
path: root/unicode.h
diff options
context:
space:
mode:
author pommicket <pommicket@gmail.com> 2022-12-23 14:03:31 -0500
committer pommicket <pommicket@gmail.com> 2022-12-23 14:03:31 -0500
commit 135d90d050869f868a47061a1df68f22a36547de (patch)
tree 5388d8eeecedbf7b1deaadcc29030bd153c79b4a /unicode.h
parent 806638e5ec9f43fb087e01620f8370c0d2ff47b3 (diff)
improve unicode_utf8_to_utf32 to handle all types of bad UTF-8
Diffstat (limited to 'unicode.h')
-rw-r--r-- unicode.h 31
1 files changed, 17 insertions, 14 deletions
diff --git a/unicode.h b/unicode.h
index fb9810a..6f48e85 100644
--- a/unicode.h
+++ b/unicode.h
@@ -11,19 +11,20 @@ static bool unicode_is_start_of_code_point(u8 byte) {
// A lot like mbrtoc32. Doesn't depend on the locale though, for one thing.
// *c will be filled with the next UTF-8 code point in `str`. `bytes` refers to the maximum
-// number of bytes that can be read from `str`.
+// number of bytes that can be read from `str` (note: this function will never read past a null
+// byte, even if `bytes` indicates that it could).
// Returns:
-// 0 - if a null character was encountered
+// 0 - if a null character was encountered or if `bytes == 0`
// (size_t)-1 - on invalid UTF-8
// (size_t)-2 - on incomplete code point (str should be longer)
// other - the number of bytes read from `str`.
-static size_t unicode_utf8_to_utf32(char32_t *c, char const *str, size_t bytes) {
+static size_t unicode_utf8_to_utf32(char32_t *c, const char *str, size_t bytes) {
+ *c = 0;
if (bytes == 0) {
- *c = 0;
return 0;
}
// it's easier to do things with unsigned integers
- u8 const *p = (u8 const *)str;
+ const u8 *p = (const u8 *)str;
u8 first_byte = *p;
@@ -33,13 +34,13 @@ static size_t unicode_utf8_to_utf32(char32_t *c, char const *str, size_t bytes)
if (bytes >= 2) {
++p;
u32 second_byte = *p;
+ if ((second_byte & 0xC0) != 0x80) return (size_t)-1;
u32 value = ((u32)first_byte & 0x1F) << 6
| (second_byte & 0x3F);
*c = (char32_t)value;
return 2;
} else {
// incomplete code point
- *c = 0;
return (size_t)-2;
}
}
@@ -48,8 +49,10 @@ static size_t unicode_utf8_to_utf32(char32_t *c, char const *str, size_t bytes)
if (bytes >= 3) {
++p;
u32 second_byte = *p;
+ if ((second_byte & 0xC0) != 0x80) return (size_t)-1;
++p;
u32 third_byte = *p;
+ if ((third_byte & 0xC0) != 0x80) return (size_t)-1;
u32 value = ((u32)first_byte & 0x0F) << 12
| (second_byte & 0x3F) << 6
| (third_byte & 0x3F);
@@ -58,12 +61,10 @@ static size_t unicode_utf8_to_utf32(char32_t *c, char const *str, size_t bytes)
return 3;
} else {
// reserved for UTF-16 surrogate halves
- *c = 0;
return (size_t)-1;
}
} else {
// incomplete
- *c = 0;
return (size_t)-2;
}
}
@@ -72,35 +73,37 @@ static size_t unicode_utf8_to_utf32(char32_t *c, char const *str, size_t bytes)
if (bytes >= 4) {
++p;
u32 second_byte = *p;
+ if ((second_byte & 0xC0) != 0x80) return (size_t)-1;
++p;
u32 third_byte = *p;
+ if ((third_byte & 0xC0) != 0x80) return (size_t)-1;
++p;
u32 fourth_byte = *p;
+ if ((fourth_byte & 0xC0) != 0x80) return (size_t)-1;
u32 value = ((u32)first_byte & 0x07) << 18
| (second_byte & 0x3F) << 12
| (third_byte & 0x3F) << 6
| (fourth_byte & 0x3F);
- if (value <= 0x10FFFF) {
+ if (value >= 0xD800 && value <= 0xDFFF) {
+ // reserved for UTF-16 surrogate halves
+ return (size_t)-1;
+ } else if (value <= 0x10FFFF) {
*c = (char32_t)value;
return 4;
} else {
- // Code points this big can't be encoded by UTF-16 and so are invalid UTF-8.
- *c = 0;
+ // Code points this big can't be encoded by UTF-16 so are invalid UTF-8.
return (size_t)-1;
}
} else {
// incomplete
- *c = 0;
return (size_t)-2;
}
}
// invalid UTF-8
- *c = 0;
return (size_t)-1;
} else {
// ASCII character
if (first_byte == 0) {
- *c = 0;
return 0;
}
*c = first_byte;