parsing signature help

author: pommicket <pommicket@gmail.com> 2022-12-28 11:45:07 -0500
committer: pommicket <pommicket@gmail.com> 2022-12-28 11:45:07 -0500
commit: 11df4f10197d67e5b61898bd98cdfccc1159dd26 (patch)
tree: 7639b03aa348183eb563f6179c2435b5ea28a106 /unicode.h
parent: 47adb1651d35dcc545850916c4a16b747901dba5 (diff)
1 files changed, 42 insertions, 0 deletions
diff --git a/unicode.h b/unicode.h
index 25ccd6f..1def401 100644
--- a/unicode.h
+++ b/unicode.h
@@ -152,4 +152,46 @@ static size_t unicode_utf32_to_utf8(char *s, char32_t c32) {
 		return (size_t)-1;
 	}
 }
+
+
+// Count the UTF-16 code units needed to encode the NUL-terminated UTF-8 string `str`.
+// Code points >= 0x10000 count as 2 units (a surrogate pair); all others count as 1.
+// Returns (size_t)-1 if unicode_utf8_to_utf32 returns (size_t)-1 or (size_t)-2 (bad UTF-8).
+static size_t unicode_utf16_len(const char *str) {
+	size_t units = 0;
+	char32_t codepoint = 0;
+	for (const char *p = str; *p != '\0'; ) {
+		size_t bytes = unicode_utf8_to_utf32(&codepoint, p, 4);
+		if (bytes >= (size_t)-2) {
+			return (size_t)-1;
+		}
+		// advance past this code point and tally the units it needs
+		units += (codepoint >= 0x10000) ? 2 : 1;
+		p += bytes;
+	}
+	return units;
+}
+
+// Converts `utf16_offset`, an offset in UTF-16 code units, to the matching byte offset into the
+// UTF-8 string `str`. An offset inside a surrogate pair rounds down to the start of that code point.
+// Returns strlen(str) if utf16_offset == unicode_utf16_len(str). Returns (size_t)-1 if bad UTF-8
+// is hit while scanning up to the offset, or if utf16_offset > unicode_utf16_len(str).
+static size_t unicode_utf16_to_utf8_offset(const char *str, size_t utf16_offset) {
+	size_t offset = 0;
+	char32_t c = 0;
+	while (*str) {
+		size_t n = unicode_utf8_to_utf32(&c, str, 4);
+		if (n >= (size_t)-2)
+			return (size_t)-1;
+		size_t u = c >= 0x10000 ? 2 : 1; // UTF-16 code units taken by c
+		if (utf16_offset < u)
+			return offset; // target is at the start of (or inside) this code point
+		utf16_offset -= u;
+		offset += n;
+		str += n;
+	}
+	// end of string reached: only an offset exactly at the end is valid
+	return utf16_offset == 0 ? offset : (size_t)-1;
+}
+
 #endif // UNICODE_H_
author	pommicket <pommicket@gmail.com>	2022-12-28 11:45:07 -0500
committer	pommicket <pommicket@gmail.com>	2022-12-28 11:45:07 -0500
commit	11df4f10197d67e5b61898bd98cdfccc1159dd26 (patch)
tree	7639b03aa348183eb563f6179c2435b5ea28a106 /unicode.h
parent	47adb1651d35dcc545850916c4a16b747901dba5 (diff)