replaced c32rtomb, mbrtoc32 with own versions

these are nicer to use since they don't involve mbstate_t and should be faster since they don't involve locales
author: Leo Tenenbaum <pommicket@gmail.com> 2021-01-25 18:00:06 -0500
committer: Leo Tenenbaum <pommicket@gmail.com> 2021-01-25 18:00:06 -0500
commit: 795262f69900af674156bed2bcd0fdb57dbbb55e (patch)
tree: 03723b919ff498722d7985a93f9ce7f470596abb
parent: a56f549a266e14cdc00a98e8dc3e154f5ac6c23e (diff)
8 files changed, 186 insertions, 72 deletions
diff --git a/Makefile b/Makefile
index 6dc6b3e..babddc3 100644
--- a/Makefile
+++ b/Makefile
@@ -9,7 +9,7 @@ INSTALL_BIN_DIR=/usr/bin
 ted: *.[ch] text.o
 	$(CC) main.c text.o -o ted $(DEBUG_CFLAGS) $(LIBS)
 release: *.[ch]
-	$(CC) main.c text.o -o ted $(RELEASE_CFLAGS) $(LIBS)
+	$(CC) main.c -o ted $(RELEASE_CFLAGS) $(LIBS)
 text.o: text.c text.h base.h lib/stb_truetype.h
 	$(CC) text.c -c -o $@ $(DEBUG_CFLAGS)
 clean:
diff --git a/buffer.c b/buffer.c
index 03d9b9e..41ad74c 100644
--- a/buffer.c
+++ b/buffer.c
@@ -133,7 +133,7 @@ char32_t buffer_char_at_pos(TextBuffer *buffer, BufferPos p) {
 		// invalid (col too large)
 		return 0;
 	} else {
-		return U'\n';
+		return '\n';
 	}
 }
 
@@ -200,7 +200,7 @@ size_t buffer_get_text_at_pos(TextBuffer *buffer, BufferPos pos, char32_t *text,
 			if (p) {
 				memcpy(p, line->str + index, chars_from_this_line * sizeof *p);
 				p += chars_from_this_line;
-				*p++ = U'\n';
+				*p++ = '\n';
 			}
 			chars_left -= chars_from_this_line+1;
 		}
@@ -355,7 +355,7 @@ static void buffer_edit_print(BufferEdit *edit) {
 	printf(" (" U32_FMT " chars): ", edit->prev_len);
 	for (size_t i = 0; i < edit->prev_len; ++i) {
 		char32_t c = edit->prev_text[i];
-		if (c == U'\n')
+		if (c == '\n')
 			printf("\\n");
 		else
 			printf("%lc", (wint_t)c);
@@ -547,22 +547,19 @@ Status buffer_load_file(TextBuffer *buffer, char const *filename) {
 					size_t bytes_read = fread(file_contents, 1, file_size, fp);
 					if (bytes_read == file_size) {
 						char32_t c = 0;
-						mbstate_t mbstate = {0};
 						for (u8 *p = file_contents, *end = p + file_size; p != end; ) {
 							if (*p == '\r' && p != end-1 && p[1] == '\n') {
 								// CRLF line endings
 								p += 2;
-								c = U'\n';
+								c = '\n';
 							} else {
-								size_t n = mbrtoc32(&c, (char *)p, (size_t)(end - p), &mbstate);
+								size_t n = unicode_utf8_to_utf32(&c, (char *)p, (size_t)(end - p));
 								if (n == 0) {
 									// null character
 									c = 0;
 									++p;
-								} else if (n == (size_t)(-3)) {
-									// no bytes consumed, but a character was produced
-								} else if (n == (size_t)(-2) || n == (size_t)(-1)) {
-									// incomplete character at end of file or invalid UTF-8 respectively; fail
+								} else if (n == (size_t)(-1)) {
+									// invalid UTF-8
 									success = false;
 									buffer_seterr(buffer, "Invalid UTF-8 (position: %td).", p - file_contents);
 									break;
@@ -570,7 +567,7 @@ Status buffer_load_file(TextBuffer *buffer, char const *filename) {
 									p += n;
 								}
 							}
-							if (c == U'\n') {
+							if (c == '\n') {
 								if (buffer_lines_set_min_capacity(buffer, &lines, &lines_capacity, nlines + 1))
 									++nlines;
 							} else {
@@ -634,11 +631,12 @@ bool buffer_save(TextBuffer *buffer) {
 		if (out) {
 			bool success = true;
 			for (Line *line = buffer->lines, *end = line + buffer->nlines; line != end; ++line) {
-				mbstate_t state = {0};
 				for (char32_t *p = line->str, *p_end = p + line->len; p != p_end; ++p) {
-					char utf8[MB_LEN_MAX] = {0};
-					size_t bytes = c32rtomb(utf8, *p, &state);
-					fwrite(utf8, 1, bytes, out);
+					char utf8[4] = {0};
+					size_t bytes = unicode_utf32_to_utf8(utf8, *p);
+					if (bytes != (size_t)-1) {
+						fwrite(utf8, 1, bytes, out);
+					}
 				}
 
 				if (line != end-1) {
@@ -694,7 +692,7 @@ static u32 buffer_index_to_column(TextBuffer *buffer, u32 line, u32 index) {
 	uint tab_width = buffer_settings(buffer)->tab_width;
 	for (u32 i = 0; i < index; ++i) {
 		switch (str[i]) {
-		case U'\t': {
+		case '\t': {
 			do
 				++col;
 			while (col % tab_width);
@@ -718,7 +716,7 @@ static u32 buffer_column_to_index(TextBuffer *buffer, u32 line, u32 column) {
 	uint tab_width = buffer_settings(buffer)->tab_width;
 	for (u32 i = 0; i < len; ++i) {
 		switch (str[i]) {
-		case U'\t': {
+		case '\t': {
 			do {
 				if (col == column)
 					return i;
@@ -1059,7 +1057,7 @@ i64 buffer_cursor_move_down(TextBuffer *buffer, i64 by) {
 // Is this character a "word" character?
 // This determines how buffer_pos_move_words (i.e. ctrl+left/right) works
 static bool is_word(char32_t c) {
-	return c > WCHAR_MAX || c == U'_' || iswalnum((wint_t)c);
+	return c > WCHAR_MAX || c == '_' || iswalnum((wint_t)c);
 }
 
 static bool is_space(char32_t c) {
@@ -1229,7 +1227,7 @@ BufferPos buffer_insert_text_at_pos(TextBuffer *buffer, BufferPos pos, String32
 
 	if (buffer->is_line_buffer) {
 		// remove all the newlines from str.
-		str32_remove_all_instances_of_char(&str, U'\n');
+		str32_remove_all_instances_of_char(&str, '\n');
 	}
 
 	if (str.len == 0) {
@@ -1260,7 +1258,7 @@ BufferPos buffer_insert_text_at_pos(TextBuffer *buffer, BufferPos pos, String32
 
 	// `text` could consist of multiple lines, e.g. U"line 1\nline 2",
 	// so we need to go through them one by one
-	u32 n_added_lines = (u32)str32_count_char(str, U'\n');
+	u32 n_added_lines = (u32)str32_count_char(str, '\n');
 	if (n_added_lines) {
 		if (buffer_insert_lines(buffer, line_idx + 1, n_added_lines)) {
 			line = &buffer->lines[line_idx]; // fix pointer
@@ -1279,7 +1277,7 @@ BufferPos buffer_insert_text_at_pos(TextBuffer *buffer, BufferPos pos, String32
 
 
 	while (str.len) {
-		u32 text_line_len = (u32)str32chr(str, U'\n');
+		u32 text_line_len = (u32)str32chr(str, '\n');
 		u32 old_len = line->len;
 		u32 new_len = old_len + text_line_len;
 		if (new_len > old_len) { // handles both overflow and empty text lines
@@ -1947,12 +1945,12 @@ void buffer_render(TextBuffer *buffer, float x1, float y1, float x2, float y2) {
 			char32_t c = *p;
 
 			switch (c) {
-			case U'\n': assert(0); break;
-			case U'\r': break; // for CRLF line endings
-			case U'\t': {
+			case '\n': assert(0); break;
+			case '\r': break; // for CRLF line endings
+			case '\t': {
 				uint tab_width = settings->tab_width;
 				do {
-					text_render_char(font, &text_state, U' ');
+					text_render_char(font, &text_state, ' ');
 					++column;
 				} while (column % tab_width);
 			} break;
diff --git a/main.c b/main.c
index aea8588..4bf0a92 100644
--- a/main.c
+++ b/main.c
@@ -17,8 +17,12 @@ no_warn_end
 #include <shellapi.h>
 #endif
 
-
+#include "unicode.h"
+#if DEBUG
 #include "text.h"
+#else
+#include "text.c"
+#endif
 #include "util.c"
 #define MATH_GL
 #include "math.c"
@@ -30,7 +34,6 @@ no_warn_end
 #error "Unrecognized operating system."
 #endif
 
-#include "unicode.h"
 #include "command.h"
 #include "colors.h"
 #include "ted.h"
@@ -354,10 +357,10 @@ int main(int argc, char **argv) {
 						switch (key_combo) {
 						case SDL_SCANCODE_RETURN << 3:
 							if (!was_in_line_buffer) // make sure return to submit line buffer doesn't get added to newly-active buffer
-								buffer_insert_char_at_cursor(buffer, U'\n');
+								buffer_insert_char_at_cursor(buffer, '\n');
 							break;
 						case SDL_SCANCODE_TAB << 3:
-							buffer_insert_char_at_cursor(buffer, U'\t');
+							buffer_insert_char_at_cursor(buffer, '\t');
 							break;
 						}
 					}
@@ -478,15 +481,14 @@ int main(int argc, char **argv) {
 				TextRenderState text_state = {.x = text_x1, .y = text_y1,
 					.min_x = -FLT_MAX, .max_x = FLT_MAX, .min_y = -FLT_MAX, .max_y = FLT_MAX,
 					.render = true};
-				mbstate_t mbstate = {0};
 				char *p = ted->error_shown, *end = p + strlen(p);
 
 				text_chars_begin(font);
 				while (p != end) {
 					char32_t c = 0;
-					size_t n = mbrtoc32(&c, p, (size_t)(end - p), &mbstate);
-					if (n > (size_t)-3) { ++p; continue; } // invalid UTF-8; this shouldn't happen
-					if (n != (size_t)-3) p += n;
+					size_t n = unicode_utf8_to_utf32(&c, p, (size_t)(end - p));
+					if (n == (size_t)-1) { ++p; continue; } // invalid UTF-8; this shouldn't happen
+					p += n;
 					if (text_state.x + char_width >= text_x2) {
 						text_state.x = text_x1;
 						text_state.y += char_height;
diff --git a/make.bat b/make.bat
index 0f4fcdf..dee7a0c 100644
--- a/make.bat
+++ b/make.bat
@@ -6,9 +6,8 @@ if _%VCVARS% == _ (
 
 SET CFLAGS=/nologo /W4 /wd4200 /wd4204 /wd4221 /wd4706 /D_CRT_SECURE_NO_WARNINGS /I SDL2/include SDL2/lib/x64/SDL2main.lib SDL2/lib/x64/SDL2.lib opengl32.lib shell32.lib ole32.lib
 rc /nologo ted.rc
-SET SOURCES=main.c text.c ted.res
 if _%1 == _ (
-	cl %SOURCES% /DDEBUG /DEBUG /Zi %CFLAGS% /Fe:ted
+	cl main.c text.c ted.res /DDEBUG /DEBUG /Zi %CFLAGS% /Fe:ted
 )
-if _%1 == _release cl %SOURCES% /O2 %CFLAGS% /Fe:ted
-if _%1 == _profile cl %SOURCES% /O2 /DPROFILE %CFLAGS% /Fe:ted
+if _%1 == _release cl main.c ted.res /O2 %CFLAGS% /Fe:ted
+if _%1 == _profile cl main.c ted.res /O2 /DPROFILE %CFLAGS% /Fe:ted
diff --git a/string32.c b/string32.c
index d7278d2..fc6117b 100644
--- a/string32.c
+++ b/string32.c
@@ -32,19 +32,15 @@ String32 str32_from_utf8(char const *utf8) {
 			char32_t *wide_p = widestr;
 			char const *utf8_p = utf8;
 			char const *utf8_end = utf8_p + len;
-			mbstate_t mbstate = {0};
 			while (utf8_p < utf8_end) {
 				char32_t c = 0;
-				size_t n = mbrtoc32(&c, utf8_p, (size_t)(utf8_end - utf8_p), &mbstate);
-				if (n == 0// null character. this shouldn't happen.
-					|| n == (size_t)(-2) // incomplete character
+				size_t n = unicode_utf8_to_utf32(&c, utf8_p, (size_t)(utf8_end - utf8_p));
+				if (n == 0 // null character. this shouldn't happen.
 					|| n == (size_t)(-1) // invalid UTF-8
 					) {
 					free(widestr);
 					widestr = wide_p = NULL;
 					break;
-				} else if (n == (size_t)(-3)) { // no bytes consumed, but a character was produced
-					*wide_p++ = c;
 				} else {
 					// n bytes consumed
 					*wide_p++ = c;
@@ -65,11 +61,10 @@ static char *str32_to_utf8_cstr(String32 s) {
 	char *utf8 = calloc(4 * s.len + 1, 1); // each codepoint takes up at most 4 bytes in UTF-8, + we need a terminating null byte
 	if (utf8) {
 		char *p = utf8;
-		mbstate_t mbstate; memset(&mbstate, 0, sizeof mbstate);
 		for (size_t i = 0; i < s.len; ++i) {
-			size_t bytes = c32rtomb(p, s.str[i], &mbstate);
+			size_t bytes = unicode_utf32_to_utf8(p, s.str[i]);
 			if (bytes == (size_t)-1) {
-				// invalid UTF-32 character
+				// invalid UTF-32 code point
 				free(utf8);
 				return NULL;
 			} else {
diff --git a/text.c b/text.c
index fbce4eb..094db15 100644
--- a/text.c
+++ b/text.c
@@ -1,5 +1,6 @@
 #include "base.h"
 #include "text.h"
+#include "unicode.h"
 #define STB_TRUETYPE_IMPLEMENTATION
 #define STBTT_STATIC
 no_warn_start
@@ -8,15 +9,6 @@ no_warn_end
 #include <stdlib.h>
 #include <GL/gl.h>
 
-#define UNICODE_BOX_CHARACTER 0x2610
-#define UNICODE_CODE_POINTS 0x110000 // number of Unicode code points
-
-static bool unicode_is_start_of_code_point(u8 byte) {
-	// see https://en.wikipedia.org/wiki/UTF-8#Encoding
-	// continuation bytes are of the form 10xxxxxx
-	return (byte & 0xC0) != 0x80;
-}
-
 // We split up code points into a bunch of pages, so we don't have to load all of the font at
 // once into one texture.
 #define CHAR_PAGE_SIZE 2048
@@ -249,21 +241,16 @@ void text_render_with_state(Font *font, TextRenderState *render_state, char cons
 	render_state->x = x;
 	render_state->y = y;
 	char32_t c = 0;
-	mbstate_t mbstate = {0};
 	char const *end = text + strlen(text);
 	while (text != end) {
-		size_t ret = mbrtoc32(&c, text, (size_t)(end - text), &mbstate);
+		size_t ret = unicode_utf8_to_utf32(&c, text, (size_t)(end - text));
 		if (ret == 0) break;
-		if (ret == (size_t)(-2)) { // incomplete multi-byte character
-			text_render_char(font, render_state, U'?');
-			text = end; // done reading text
-		} else if (ret == (size_t)(-1)) {
+		if (ret == (size_t)(-1)) {
 			// invalid UTF-8; skip this byte
-			text_render_char(font, render_state, U'?');
+			text_render_char(font, render_state, '?');
 			++text;
 		} else {
-			if (ret != (size_t)(-3))
-				text += ret; // character consists of `ret` bytes
+			text += ret; // character consists of `ret` bytes
 			switch (c) {
 			default:
 				text_render_char(font, render_state, c);
diff --git a/unicode.h b/unicode.h
index 3f76090..cb4f2bc 100644
--- a/unicode.h
+++ b/unicode.h
@@ -9,4 +9,140 @@ static bool unicode_is_start_of_code_point(u8 byte) {
 	return (byte & 0xC0) != 0x80;
 }
 
-#endif
+// A lot like mbrtoc32, but it doesn't depend on the locale and needs no mbstate_t.
+// Decodes the UTF-8 sequence at the start of `str` and stores its code point in *c.
+// `bytes` refers to the maximum number of bytes that can be read from `str`; `str` doesn't
+// have to be null-terminated.
+// Decoding is strict: every continuation byte must have the form 10xxxxxx, and overlong
+// encodings (e.g. 0xC0 0x8A for '\n'), UTF-16 surrogate halves, and code points above
+// 0x10FFFF are rejected.
+// Returns:
+// 0 - if `bytes` is 0 or a null character was encountered
+// (size_t)-1 - on invalid UTF-8 / incomplete code point (there is no equivalent of
+//              mbrtoc32's (size_t)-2 and (size_t)-3 results)
+// other - the number of bytes read from `str` (1 to 4).
+// *c is set to 0 unless a code point was successfully decoded.
+static size_t unicode_utf8_to_utf32(char32_t *c, char const *str, size_t bytes) {
+	// it's easier to do things with unsigned integers
+	u8 const *p = (u8 const *)str;
+	u8 first_byte;
+	size_t len; // length of the whole sequence, as announced by the first byte
+	u32 value; // bits of the code point decoded so far
+	u32 min_value; // smallest code point that actually needs `len` bytes
+
+	*c = 0; // reported on every path except a successful decode
+	if (bytes == 0) {
+		// nothing to read
+		return 0;
+	}
+
+	first_byte = *p;
+	if (first_byte == 0) {
+		// null character
+		return 0;
+	}
+	if ((first_byte & 0x80) == 0) {
+		// ASCII character
+		*c = first_byte;
+		return 1;
+	}
+
+	// The first byte announces the length of the sequence and carries the
+	// most significant bits of the code point.
+	if ((first_byte & 0xE0) == 0xC0) {
+		// two-byte code point: 110xxxxx 10xxxxxx
+		len = 2;
+		value = (u32)first_byte & 0x1F;
+		min_value = 0x80;
+	} else if ((first_byte & 0xF0) == 0xE0) {
+		// three-byte code point: 1110xxxx 10xxxxxx 10xxxxxx
+		len = 3;
+		value = (u32)first_byte & 0x0F;
+		min_value = 0x800;
+	} else if ((first_byte & 0xF8) == 0xF0) {
+		// four-byte code point: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+		len = 4;
+		value = (u32)first_byte & 0x07;
+		min_value = 0x10000;
+	} else {
+		// a stray continuation byte, or a byte that never appears in UTF-8
+		return (size_t)-1;
+	}
+
+	if (bytes < len) {
+		// incomplete code point
+		return (size_t)-1;
+	}
+
+	// Each continuation byte contributes its low 6 bits.
+	for (size_t i = 1; i < len; ++i) {
+		u32 byte = p[i];
+		if ((byte & 0xC0) != 0x80) {
+			// Not a continuation byte, so the sequence is cut short. Checking this
+			// also keeps us from swallowing the start of the next character.
+			return (size_t)-1;
+		}
+		value = value << 6 | (byte & 0x3F);
+	}
+
+	if (value < min_value) {
+		// Overlong encoding: this code point fits in a shorter sequence, so
+		// accepting it would give one character several spellings.
+		return (size_t)-1;
+	}
+	if (value >= 0xD800 && value <= 0xDFFF) {
+		// reserved for UTF-16 surrogate halves
+		return (size_t)-1;
+	}
+	if (value > 0x10FFFF) {
+		// Code points this big can't be encoded by UTF-16 and so are invalid UTF-8.
+		return (size_t)-1;
+	}
+
+	*c = (char32_t)value;
+	return len;
+}
+
+// A lot like c32rtomb, but it doesn't depend on the locale and needs no mbstate_t.
+// Converts a UTF-32 code point to UTF-8. Writes at most 4 bytes to s.
+// NOTE: It is YOUR JOB to null-terminate your string if the UTF-32 isn't null-terminated!
+// Returns the number of bytes written to s, or (size_t)-1 on invalid UTF-32 (a UTF-16
+// surrogate half or a code point above 0x10FFFF), in which case a single 0 byte is written.
+static size_t unicode_utf32_to_utf8(char *s, char32_t c32) {
+	u8 *p = (u8 *)s;
+	if (c32 <= 0x7F) {
+		// ASCII
+		*p = (u8)c32;
+		return 1;
+	} else if (c32 <= 0x7FF) {
+		// two bytes needed
+		*p++ = 0xC0 | (u8)(c32 >> 6);
+		*p   = 0x80 | (u8)(c32 & 0x3F);
+		return 2;
+	} else if (c32 <= 0xFFFF) {
+		// three bytes needed. This range has to go all the way up to 0xFFFF: anything
+		// that gets past it is given a four-byte encoding, which would be overlong.
+		if (c32 < 0xD800 || c32 > 0xDFFF) {
+			*p++ = 0xE0 | (u8)( c32 >> 12);
+			*p++ = 0x80 | (u8)((c32 >> 6) & 0x3F);
+			*p   = 0x80 | (u8)( c32       & 0x3F);
+			return 3;
+		} else {
+			// UTF-16 surrogate halves
+			*p = 0;
+			return (size_t)-1;
+		}
+	} else if (c32 <= 0x10FFFF) {
+		// four bytes needed
+		*p++ = 0xF0 | (u8)( c32 >> 18);
+		*p++ = 0x80 | (u8)((c32 >> 12) & 0x3F);
+		*p++ = 0x80 | (u8)((c32 >>  6) & 0x3F);
+		*p   = 0x80 | (u8)( c32        & 0x3F);
+		return 4;
+	} else {
+		// code point too big
+		*p = 0;
+		return (size_t)-1;
+	}
+}
+#endif // UNICODE_H_
diff --git a/util.c b/util.c
index a6dd3d9..651135e 100644
--- a/util.c
+++ b/util.c
@@ -134,17 +134,14 @@ static char *stristr(char const *haystack, char const *needle) {
 
 	for (char const *haystack_start = haystack; haystack_start + needle_bytes <= haystack_end; utf8_next_char_const(&haystack_start)) {
 		char const *p = haystack_start, *q = needle;
-		mbstate_t pstate = {0}, qstate = {0};
 		bool match = true;
 
 		// check if p matches q
 		while (q < needle_end) {
 			char32_t pchar = 0, qchar = 0;
-			size_t bytes_p = mbrtoc32(&pchar, p, (size_t)(haystack_end - p), &pstate);
-			size_t bytes_q = mbrtoc32(&qchar, q, (size_t)(needle_end - q),   &qstate);
-			if (bytes_p == (size_t)-3) bytes_p = 0;
-			if (bytes_q == (size_t)-3) bytes_q = 0;
-			if (bytes_p > (size_t)-3 || bytes_q > (size_t)-3) return NULL; // invalid UTF-8
+			size_t bytes_p = unicode_utf8_to_utf32(&pchar, p, (size_t)(haystack_end - p));
+			size_t bytes_q = unicode_utf8_to_utf32(&qchar, q, (size_t)(needle_end - q));
+			if (bytes_p == (size_t)-1 || bytes_q == (size_t)-1) return NULL; // invalid UTF-8
 			bool same = pchar == qchar;
 			if (pchar < WINT_MAX && qchar < WINT_MAX) // on Windows, there is no way of finding the lower-case version of a codepoint outside the BMP. ):
 				same = towlower((wint_t)pchar) == towlower((wint_t)qchar);
author	Leo Tenenbaum <pommicket@gmail.com>	2021-01-25 18:00:06 -0500
committer	Leo Tenenbaum <pommicket@gmail.com>	2021-01-25 18:00:06 -0500
commit	795262f69900af674156bed2bcd0fdb57dbbb55e (patch)
tree	03723b919ff498722d7985a93f9ce7f470596abb
parent	a56f549a266e14cdc00a98e8dc3e154f5ac6c23e (diff)