diff options
author | pommicket <pommicket@gmail.com> | 2022-12-28 11:45:07 -0500 |
---|---|---|
committer | pommicket <pommicket@gmail.com> | 2022-12-28 11:45:07 -0500 |
commit | 11df4f10197d67e5b61898bd98cdfccc1159dd26 (patch) | |
tree | 7639b03aa348183eb563f6179c2435b5ea28a106 /unicode.h | |
parent | 47adb1651d35dcc545850916c4a16b747901dba5 (diff) |
parsing signature help
Diffstat (limited to 'unicode.h')
-rw-r--r-- | unicode.h | 42 |
1 files changed, 42 insertions, 0 deletions
// get the number of UTF-16 code units needed to encode the null-terminated UTF-8 string `str`.
// characters at or above U+10000 count as 2 code units (a surrogate pair); all others count as 1.
// returns (size_t)-1 on bad UTF-8
static size_t unicode_utf16_len(const char *str) {
	size_t len = 0;
	char32_t c = 0;
	while (*str) {
		// decode the next character; the 4 is presumably the maximum number of bytes
		// the decoder may examine (a UTF-8 sequence is at most 4 bytes long).
		size_t n = unicode_utf8_to_utf32(&c, str, 4);
		if (n >= (size_t)-2) {
			// both (size_t)-1 and (size_t)-2 are treated as a decoding failure
			return (size_t)-1;
		}
		len += c >= 0x10000 ? 2 : 1;
		str += n;
	}
	return len;
}

// returns the UTF-8 (byte) offset into `str` which corresponds to a UTF-16 code unit offset of
// utf16_offset (rounds down to the start of the character if utf16_offset falls in the middle
// of a surrogate pair).
// returns strlen(str) if utf16_offset == unicode_utf16_len(str)
// returns (size_t)-1 on bad UTF-8, or if utf16_offset > unicode_utf16_len(str)
static size_t unicode_utf16_to_utf8_offset(const char *str, size_t utf16_offset) {
	size_t offset = 0; // bytes of `str` consumed so far
	char32_t c = 0;
	while (*str) {
		size_t n = unicode_utf8_to_utf32(&c, str, 4);
		if (n >= (size_t)-2) {
			// both (size_t)-1 and (size_t)-2 are treated as a decoding failure
			return (size_t)-1;
		}
		// number of UTF-16 code units this character occupies
		size_t units = c >= 0x10000 ? 2 : 1;
		if (utf16_offset < units) {
			// the target is at the start of (or inside) this character:
			// report the offset of the character's first byte.
			return offset;
		}
		utf16_offset -= units;
		offset += n;
		str += n;
	}
	if (utf16_offset == 0) {
		// the target is exactly the end of the string
		return offset;
	}
	// utf16_offset is past the end of the string.
	// spelled (size_t)-1 rather than SIZE_MAX to match the documented error value
	// and the other error returns.
	return (size_t)-1;
}