diff options
author | Leo Tenenbaum <pommicket@gmail.com> | 2021-01-25 18:00:06 -0500 |
---|---|---|
committer | Leo Tenenbaum <pommicket@gmail.com> | 2021-01-25 18:00:06 -0500 |
commit | 795262f69900af674156bed2bcd0fdb57dbbb55e (patch) | |
tree | 03723b919ff498722d7985a93f9ce7f470596abb /unicode.h | |
parent | a56f549a266e14cdc00a98e8dc3e154f5ac6c23e (diff) |
replaced c32rtomb, mbrtoc32 with own versions
these are nicer to use since they don't involve mbstate_t and should be faster since they don't involve locales
Diffstat (limited to 'unicode.h')
-rw-r--r-- | unicode.h | 138 |
1 files changed, 137 insertions, 1 deletions
// Additions to unicode.h: these two functions go after
// unicode_is_start_of_code_point() and before the header's closing include guard.

// A lot like mbrtoc32, but independent of the locale and of any mbstate_t.
// Decodes the first UTF-8 code point in `str` into *c. At most `bytes` bytes are read
// from `str`, and reading stops at the first byte that cannot be part of the sequence,
// so a NUL terminator is never read past.
// Returns:
//   0          - if `bytes` is 0 or the first byte is a NUL character (*c is set to 0)
//   (size_t)-1 - on invalid or incomplete UTF-8 (*c is set to 0). Invalid means: a stray
//                continuation byte, a bad lead byte, a missing/malformed continuation
//                byte, an overlong encoding, a UTF-16 surrogate, or a value > U+10FFFF.
//   other      - the number of bytes read from `str` (1 to 4).
static size_t unicode_utf8_to_utf32(char32_t *c, char const *str, size_t bytes) {
	// Every failure path, as well as the empty/NUL case, reports code point 0.
	*c = 0;
	if (bytes == 0) {
		return 0;
	}

	// it's easier to do things with unsigned integers
	unsigned char const *p = (unsigned char const *)str;
	unsigned char const lead = p[0];

	if (lead < 0x80) {
		// ASCII character (a NUL yields 0 bytes, matching mbrtoc32)
		if (lead == 0) {
			return 0;
		}
		*c = lead;
		return 1;
	}

	size_t length;    // total length of the sequence, as announced by the lead byte
	char32_t value;   // payload bits accumulated so far
	char32_t minimum; // smallest code point that genuinely needs `length` bytes
	if ((lead & 0xE0) == 0xC0) {
		length = 2;
		value = lead & 0x1F;
		minimum = 0x80;
	} else if ((lead & 0xF0) == 0xE0) {
		length = 3;
		value = lead & 0x0F;
		minimum = 0x800;
	} else if ((lead & 0xF8) == 0xF0) {
		length = 4;
		value = lead & 0x07;
		minimum = 0x10000;
	} else {
		// stray continuation byte (10xxxxxx) or a lead byte in 0xF8..0xFF
		return (size_t)-1;
	}

	if (bytes < length) {
		// incomplete code point
		return (size_t)-1;
	}

	for (size_t i = 1; i < length; ++i) {
		unsigned char const next = p[i];
		if ((next & 0xC0) != 0x80) {
			// Not a continuation byte. Bailing out here (rather than reading all the
			// bytes up front) also keeps us from running past a NUL terminator.
			return (size_t)-1;
		}
		value = (value << 6) | (next & 0x3F);
	}

	if (value < minimum) {
		// overlong encoding: the same code point has a shorter representation
		return (size_t)-1;
	}
	if (value >= 0xD800 && value <= 0xDFFF) {
		// reserved for UTF-16 surrogate halves
		return (size_t)-1;
	}
	if (value > 0x10FFFF) {
		// Code points this big can't be encoded by UTF-16 and so are invalid UTF-8.
		return (size_t)-1;
	}

	*c = value;
	return length;
}

// A lot like c32rtomb.
// Converts a UTF-32 code point to UTF-8. Writes at most 4 bytes to s.
// NOTE: It is YOUR JOB to null-terminate your string if the UTF-32 isn't null-terminated!
// Returns the number of bytes written to s (1 to 4), or (size_t)-1 on invalid UTF-32
// (a UTF-16 surrogate half or a value above U+10FFFF), in which case s[0] is set to 0.
static size_t unicode_utf32_to_utf8(char *s, char32_t c32) {
	unsigned char *p = (unsigned char *)s;

	if (c32 <= 0x7F) {
		// ASCII
		p[0] = (unsigned char)c32;
		return 1;
	}
	if (c32 <= 0x7FF) {
		// two bytes needed
		p[0] = (unsigned char)(0xC0 | (c32 >> 6));
		p[1] = (unsigned char)(0x80 | (c32 & 0x3F));
		return 2;
	}
	if (c32 <= 0xFFFF) {
		// Three bytes cover the rest of the Basic Multilingual Plane, up to U+FFFF.
		if (c32 >= 0xD800 && c32 <= 0xDFFF) {
			// UTF-16 surrogate halves
			p[0] = 0;
			return (size_t)-1;
		}
		p[0] = (unsigned char)(0xE0 | (c32 >> 12));
		p[1] = (unsigned char)(0x80 | ((c32 >> 6) & 0x3F));
		p[2] = (unsigned char)(0x80 | (c32 & 0x3F));
		return 3;
	}
	if (c32 <= 0x10FFFF) {
		// four bytes needed
		p[0] = (unsigned char)(0xF0 | (c32 >> 18));
		p[1] = (unsigned char)(0x80 | ((c32 >> 12) & 0x3F));
		p[2] = (unsigned char)(0x80 | ((c32 >> 6) & 0x3F));
		p[3] = (unsigned char)(0x80 | (c32 & 0x3F));
		return 4;
	}

	// code point too big
	p[0] = 0;
	return (size_t)-1;
}