replaced c32rtomb, mbrtoc32 with own versions

these are nicer to use since they don't involve mbstate_t and should be faster since they don't involve locales
author: Leo Tenenbaum <pommicket@gmail.com> 2021-01-25 18:00:06 -0500
committer: Leo Tenenbaum <pommicket@gmail.com> 2021-01-25 18:00:06 -0500
commit: 795262f69900af674156bed2bcd0fdb57dbbb55e (patch)
tree: 03723b919ff498722d7985a93f9ce7f470596abb /unicode.h
parent: a56f549a266e14cdc00a98e8dc3e154f5ac6c23e (diff)
1 files changed, 137 insertions, 1 deletions
diff --git a/unicode.h b/unicode.h
index 3f76090..cb4f2bc 100644
--- a/unicode.h
+++ b/unicode.h
@@ -9,4 +9,140 @@ static bool unicode_is_start_of_code_point(u8 byte) {
 	return (byte & 0xC0) != 0x80;
 }
 
-#endif
+// A lot like mbrtoc32. Doesn't depend on the locale though, for one thing.
+// *c will be filled with the next UTF-8 code point in `str`. `bytes` refers to the maximum
+// number of bytes that can be read from `str`; no byte beyond that is ever read.
+// Returns:
+// 0 - if a null character was encountered, or if `bytes` is 0
+// (size_t)-1 - on invalid UTF-8 / incomplete code point, i.e. any of:
+//     - a first byte that can't start a code point (continuation byte, or 0xF8 and up)
+//     - a code point which needs more than `bytes` bytes
+//     - a byte after the first which isn't a continuation byte (10xxxxxx)
+//     - an overlong encoding (more bytes used than the value needs)
+//     - a UTF-16 surrogate half (0xD800-0xDFFF), or a value above 0x10FFFF
+// other - the number of bytes read from `str` (between 1 and 4).
+// *c is set to 0 whenever 0 or (size_t)-1 is returned.
+static size_t unicode_utf8_to_utf32(char32_t *c, char const *str, size_t bytes) {
+	if (bytes == 0) {
+		*c = 0;
+		return 0;
+	}
+	// it's easier to do things with unsigned integers
+	u8 const *p = (u8 const *)str;
+
+	u8 first_byte = *p;
+
+	if (!(first_byte & 0x80)) {
+		// ASCII character
+		if (first_byte == 0) {
+			*c = 0;
+			return 0;
+		}
+		*c = first_byte;
+		return 1;
+	}
+
+	// Multi-byte code point. The first byte tells us how long it is.
+	size_t length; // total number of bytes in the code point, including the first
+	u32 value; // the bits of the code point decoded so far
+	u32 min_value; // smallest value which really needs `length` bytes
+	if ((first_byte & 0xE0) == 0xC0) {
+		// two-byte code point: 110xxxxx 10xxxxxx
+		length = 2;
+		value = (u32)first_byte & 0x1F;
+		min_value = 0x80;
+	} else if ((first_byte & 0xF0) == 0xE0) {
+		// three-byte code point: 1110xxxx 10xxxxxx 10xxxxxx
+		length = 3;
+		value = (u32)first_byte & 0x0F;
+		min_value = 0x800;
+	} else if ((first_byte & 0xF8) == 0xF0) {
+		// four-byte code point: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+		length = 4;
+		value = (u32)first_byte & 0x07;
+		min_value = 0x10000;
+	} else {
+		// invalid UTF-8: continuation byte with nothing before it, or 0xF8-0xFF
+		*c = 0;
+		return (size_t)-1;
+	}
+
+	// Check this before touching p[1] and onwards so we never read past the end of `str`.
+	if (bytes < length) {
+		// incomplete code point
+		*c = 0;
+		return (size_t)-1;
+	}
+
+	// Read the continuation bytes, each of which holds 6 more bits of the code point.
+	for (size_t i = 1; i < length; ++i) {
+		u8 byte = p[i];
+		if (unicode_is_start_of_code_point(byte)) {
+			// Not a continuation byte, so the code point was cut short.
+			// This check also stops us from swallowing a null terminator.
+			*c = 0;
+			return (size_t)-1;
+		}
+		value = value << 6 | ((u32)byte & 0x3F);
+	}
+
+	if (value < min_value) {
+		// Overlong encoding: this value fits in fewer bytes. Every code point has
+		// exactly one valid encoding, so this is invalid UTF-8.
+		*c = 0;
+		return (size_t)-1;
+	}
+	if (value >= 0xD800 && value <= 0xDFFF) {
+		// reserved for UTF-16 surrogate halves
+		*c = 0;
+		return (size_t)-1;
+	}
+	if (value > 0x10FFFF) {
+		// Code points this big can't be encoded by UTF-16 and so are invalid UTF-8.
+		*c = 0;
+		return (size_t)-1;
+	}
+
+	*c = (char32_t)value;
+	return length;
+}
+
+// A lot like c32rtomb. Converts a UTF-32 code point to UTF-8. Writes at most 4 bytes to s.
+// NOTE: It is YOUR JOB to null-terminate your string if the UTF-32 isn't null-terminated!
+// Returns the number of bytes written to s (1 to 4), or (size_t)-1 on invalid UTF-32
+// (a UTF-16 surrogate half, or a value above 0x10FFFF), in which case s[0] is set to 0.
+static size_t unicode_utf32_to_utf8(char *s, char32_t c32) {
+	u8 *p = (u8 *)s;
+	if (c32 <= 0x7F) {
+		// ASCII
+		*p = (u8)c32;
+		return 1;
+	} else if (c32 <= 0x7FF) {
+		// two bytes needed
+		*p++ = 0xC0 | (u8)(c32 >> 6);
+		*p   = 0x80 | (u8)(c32 & 0x3F);
+		return 2;
+	} else if (c32 <= 0xFFFF) {
+		// three bytes needed (they hold 16 bits, so this range runs all the way up to 0xFFFF)
+		if (c32 >= 0xD800 && c32 <= 0xDFFF) {
+			// UTF-16 surrogate halves
+			*p = 0;
+			return (size_t)-1;
+		}
+		*p++ = 0xE0 | (u8)( c32 >> 12);
+		*p++ = 0x80 | (u8)((c32 >> 6) & 0x3F);
+		*p   = 0x80 | (u8)( c32       & 0x3F);
+		return 3;
+	} else if (c32 <= 0x10FFFF) {
+		*p++ = 0xF0 | (u8)( c32 >> 18);
+		*p++ = 0x80 | (u8)((c32 >> 12) & 0x3F);
+		*p++ = 0x80 | (u8)((c32 >>  6) & 0x3F);
+		*p   = 0x80 | (u8)( c32        & 0x3F);
+		return 4;
+	} else {
+		// code point too big
+		*p = 0;
+		return (size_t)-1;
+	}
+}
+#endif // UNICODE_H_
author	Leo Tenenbaum <pommicket@gmail.com>	2021-01-25 18:00:06 -0500
committer	Leo Tenenbaum <pommicket@gmail.com>	2021-01-25 18:00:06 -0500
commit	795262f69900af674156bed2bcd0fdb57dbbb55e (patch)
tree	03723b919ff498722d7985a93f9ce7f470596abb /unicode.h
parent	a56f549a266e14cdc00a98e8dc3e154f5ac6c23e (diff)