summaryrefslogtreecommitdiff
path: root/unicode.h
diff options
context:
space:
mode:
authorpommicket <pommicket@gmail.com>2022-12-28 11:45:07 -0500
committerpommicket <pommicket@gmail.com>2022-12-28 11:45:07 -0500
commit11df4f10197d67e5b61898bd98cdfccc1159dd26 (patch)
tree7639b03aa348183eb563f6179c2435b5ea28a106 /unicode.h
parent47adb1651d35dcc545850916c4a16b747901dba5 (diff)
parsing signature help
Diffstat (limited to 'unicode.h')
-rw-r--r--unicode.h42
1 files changed, 42 insertions, 0 deletions
diff --git a/unicode.h b/unicode.h
index 25ccd6f..1def401 100644
--- a/unicode.h
+++ b/unicode.h
@@ -152,4 +152,46 @@ static size_t unicode_utf32_to_utf8(char *s, char32_t c32) {
return (size_t)-1;
}
}
+
+
+// Count the UTF-16 code units needed to encode the null-terminated UTF-8 string `str`.
+// Returns (size_t)-1 if `str` is not valid UTF-8.
+static size_t unicode_utf16_len(const char *str) {
+	size_t units = 0;
+	char32_t codepoint = 0;
+	for (const char *p = str; *p != '\0'; ) {
+		const size_t consumed = unicode_utf8_to_utf32(&codepoint, p, 4);
+		// values >= (size_t)-2 are treated as decode failures
+		if (consumed >= (size_t)-2) {
+			return (size_t)-1;
+		}
+		// code points >= 0x10000 count as 2 units, everything else as 1
+		units += (codepoint >= 0x10000) ? 2 : 1;
+		p += consumed;
+	}
+	return units;
+}
+
+// Convert `utf16_offset` (counted in UTF-16 code units) into the matching UTF-8 byte offset within `str`.
+// An offset falling in the middle of a 2-unit code point is rounded down to the start of that code point;
+// an offset equal to unicode_utf16_len(str) yields strlen(str). Returns (size_t)-1 on bad UTF-8 or if the offset is past the end.
+static size_t unicode_utf16_to_utf8_offset(const char *str, size_t utf16_offset) {
+	size_t byte_pos = 0;
+	size_t remaining = utf16_offset; // UTF-16 units still to be skipped
+	char32_t codepoint = 0;
+	while (str[byte_pos] != '\0') {
+		const size_t consumed = unicode_utf8_to_utf32(&codepoint, str + byte_pos, 4);
+		if (consumed >= (size_t)-2) {
+			return (size_t)-1;
+		}
+		const size_t units = (codepoint >= 0x10000) ? 2 : 1;
+		if (remaining < units) {
+			return byte_pos; // target is at or inside this code point
+		}
+		remaining -= units;
+		byte_pos += consumed;
+	}
+	return remaining == 0 ? byte_pos : (size_t)-1; // only an exact end-of-string offset is valid
+}
+
#endif // UNICODE_H_