diff options
Diffstat (limited to 'unicode.h')
-rw-r--r-- | unicode.h | 49 |
1 files changed, 29 insertions, 20 deletions
@@ -1,10 +1,14 @@ -// functions for dealing with UTF-8/UTF-16/UTF-32. -// this file is entirely self-contained. +/// \file +/// functions for dealing with UTF-8/UTF-16/UTF-32. +/// +/// this file is entirely self-contained. #ifndef UNICODE_H_ #define UNICODE_H_ +/// useful for "this character couldn't be rendered / is invalid UTF-8" #define UNICODE_BOX_CHARACTER 0x2610 -#define UNICODE_CODE_POINTS 0x110000 // number of Unicode code points +/// number of Unicode code points +#define UNICODE_CODE_POINTS 0x110000 #include <stddef.h> #include <stdint.h> @@ -19,15 +23,16 @@ static bool unicode_is_continuation_byte(uint8_t byte) { return (byte & 0xC0) == 0x80; } -// A lot like mbrtoc32. Doesn't depend on the locale though, for one thing. -// *c will be filled with the next UTF-8 code point in `str`. `bytes` refers to the maximum -// number of bytes that can be read from `str` (note: this function will never read past a null -// byte, even if `bytes` indicates that it could). -// Returns: -// 0 - if a null character was encountered or if `bytes == 0` -// (size_t)-1 - on invalid UTF-8 -// (size_t)-2 - on incomplete code point (str should be longer) -// other - the number of bytes read from `str`. +/// A lot like mbrtoc32. Doesn't depend on the locale though, for one thing. +/// +/// *c will be filled with the next UTF-8 code point in `str`. `bytes` refers to the maximum +/// number of bytes that can be read from `str` (note: this function will never read past a null +/// byte, even if `bytes` indicates that it could). +/// Returns:\n +/// `0` - if a null character was encountered or if `bytes == 0`\n +/// `(size_t)-1` - on invalid UTF-8\n +/// `(size_t)-2` - on incomplete code point (str should be longer)\n +/// other - the number of bytes read from `str`. static size_t unicode_utf8_to_utf32(uint32_t *c, const char *str, size_t bytes) { *c = 0; if (bytes == 0) { @@ -121,10 +126,11 @@ static size_t unicode_utf8_to_utf32(uint32_t *c, const char *str, size_t bytes) } } -// A lot like c32rtomb -// Converts a UTF-32 codepoint to a UTF-8 string. Writes at most 4 bytes to s. -// NOTE: It is YOUR JOB to null-terminate your string if the UTF-32 isn't null-terminated! -// Returns the number of bytes written to s, or (size_t)-1 on invalid UTF-32. +/// A lot like c32rtomb +/// +/// Converts a UTF-32 codepoint to a UTF-8 string. Writes at most 4 bytes to s. +/// NOTE: It is YOUR JOB to null-terminate your string if the UTF-32 isn't null-terminated! +/// Returns the number of bytes written to `s`, or `(size_t)-1` on invalid UTF-32. static size_t unicode_utf32_to_utf8(char *s, uint32_t c32) { uint8_t *p = (uint8_t *)s; if (c32 <= 0x7F) { @@ -162,7 +168,8 @@ static size_t unicode_utf32_to_utf8(char *s, uint32_t c32) { // get the number of UTF-16 codepoints needed to encode `str`. -// returns (size_t)-1 on bad UTF-8 +/// +// returns `(size_t)-1` on bad UTF-8 static size_t unicode_utf16_len(const char *str) { size_t len = 0; uint32_t c = 0; @@ -179,9 +186,11 @@ static size_t unicode_utf16_len(const char *str) { return len; } -// returns the UTF-8 offset from `str` which corresponds to a UTF-16 offset of utf16_offset (rounds down if utf16_offset is in the middle of a codepoint). -// returns strlen(str) if utf16_offset == unicode_utf16_len(str) -// returns (size_t)-1 on bad UTF-8, or if utf16_offset > unicode_utf16_len(str) +/// returns the UTF-8 offset from `str` which corresponds to a UTF-16 offset of +/// `utf16_offset` (rounds down if `utf16_offset` is in the middle of a codepoint). +/// +/// returns `strlen(str)` if `utf16_offset == unicode_utf16_len(str)` +/// returns `(size_t)-1` on bad UTF-8, or if `utf16_offset > unicode_utf16_len(str)` static size_t unicode_utf16_to_utf8_offset(const char *str, size_t utf16_offset) { size_t offset = 0; uint32_t c = 0; |