summaryrefslogtreecommitdiff
path: root/string32.c
diff options
context:
space:
mode:
Diffstat (limited to 'string32.c')
-rw-r--r--string32.c50
1 files changed, 50 insertions, 0 deletions
diff --git a/string32.c b/string32.c
new file mode 100644
index 0000000..9b14e30
--- /dev/null
+++ b/string32.c
@@ -0,0 +1,50 @@
// UTF-32 string.
// Owns a heap-allocated array of `len` Unicode code points.
// The array is NOT guaranteed to be null-terminated; always use `len`.
// The empty string is represented as {0, NULL}.
typedef struct {
	size_t len;    // number of code points stored in str
	char32_t *str; // heap-allocated code points, or NULL for the empty string
} String32;

// Releases the memory owned by *s and resets it to the empty string.
// Passing NULL is a no-op, and calling this twice on the same string is safe.
void s32_free(String32 *s) {
	if (!s) {
		return;
	}
	free(s->str);
	s->str = NULL;
	s->len = 0;
}

// Decodes a single UTF-8 sequence from the bytes in [p, end) into *out.
// Requires p < end.
// Returns the number of bytes consumed (1 to 4), or 0 if the sequence is
// truncated, has a bad lead/continuation byte, is an overlong encoding,
// encodes a surrogate (U+D800..U+DFFF), or exceeds U+10FFFF.
static size_t s32_decode_utf8(unsigned char const *p, unsigned char const *end, char32_t *out) {
	unsigned char const lead = p[0];
	size_t need = 0;      // total length of the sequence, in bytes
	char32_t cp = 0;      // code point being assembled
	char32_t smallest = 0; // smallest code point that legitimately needs `need` bytes

	if (lead < 0x80) {
		*out = lead;
		return 1;
	} else if ((lead & 0xE0) == 0xC0) {
		need = 2;
		cp = (char32_t)(lead & 0x1Fu);
		smallest = 0x80;
	} else if ((lead & 0xF0) == 0xE0) {
		need = 3;
		cp = (char32_t)(lead & 0x0Fu);
		smallest = 0x800;
	} else if ((lead & 0xF8) == 0xF0) {
		need = 4;
		cp = (char32_t)(lead & 0x07u);
		smallest = 0x10000;
	} else {
		// stray continuation byte, or a lead byte that is never valid in UTF-8
		return 0;
	}

	if ((size_t)(end - p) < need) {
		return 0; // sequence is cut off by the end of the input
	}
	for (size_t i = 1; i < need; ++i) {
		if ((p[i] & 0xC0) != 0x80) {
			return 0; // not a continuation byte
		}
		cp = (cp << 6) | (char32_t)(p[i] & 0x3Fu);
	}
	if (cp < smallest || cp > 0x10FFFF || (cp >= 0xD800 && cp <= 0xDFFF)) {
		return 0; // overlong encoding, out of Unicode range, or a surrogate
	}
	*out = cp;
	return need;
}

// Converts a null-terminated UTF-8 string to UTF-32.
// The string returned should be s32_free'd.
// This will return an empty string ({0, NULL}) if utf8 is NULL or empty, if the
// allocation failed, or if the string is invalid UTF-8.
// Decoding is done by hand rather than with mbrtoc32 so that the result does
// not depend on the current locale.
String32 s32_from_utf8(char const *utf8) {
	String32 string = {0, NULL};
	if (!utf8) {
		return string;
	}
	size_t len = strlen(utf8);
	if (len == 0) {
		return string;
	}

	// every code point consumes at least one byte, so the UTF-32 string needs
	// at most as many elements as the UTF-8 string has bytes
	char32_t *widestr = calloc(len, sizeof *widestr);
	if (!widestr) {
		return string;
	}

	// work on unsigned bytes: plain char may be signed, which breaks the bit tests
	unsigned char const *p = (unsigned char const *)utf8;
	unsigned char const *end = p + len;
	size_t count = 0;
	while (p < end) {
		size_t n = s32_decode_utf8(p, end, &widestr[count]);
		if (n == 0) {
			// invalid UTF-8: discard everything and report the empty string
			free(widestr);
			return string;
		}
		++count;
		p += n;
	}

	string.str = widestr;
	string.len = count;
	return string;
}
+