From 795262f69900af674156bed2bcd0fdb57dbbb55e Mon Sep 17 00:00:00 2001 From: Leo Tenenbaum Date: Mon, 25 Jan 2021 18:00:06 -0500 Subject: replaced c32rtomb, mbrtoc32 with own versions these are nicer to use since they don't involve mbstate_t and should be faster since they don't involve locales --- Makefile | 2 +- buffer.c | 48 +++++++++++---------- main.c | 18 ++++---- make.bat | 7 ++-- string32.c | 13 ++---- text.c | 23 +++-------- unicode.h | 138 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++- util.c | 9 ++-- 8 files changed, 186 insertions(+), 72 deletions(-) diff --git a/Makefile b/Makefile index 6dc6b3e..babddc3 100644 --- a/Makefile +++ b/Makefile @@ -9,7 +9,7 @@ INSTALL_BIN_DIR=/usr/bin ted: *.[ch] text.o $(CC) main.c text.o -o ted $(DEBUG_CFLAGS) $(LIBS) release: *.[ch] - $(CC) main.c text.o -o ted $(RELEASE_CFLAGS) $(LIBS) + $(CC) main.c -o ted $(RELEASE_CFLAGS) $(LIBS) text.o: text.c text.h base.h lib/stb_truetype.h $(CC) text.c -c -o $@ $(DEBUG_CFLAGS) clean: diff --git a/buffer.c b/buffer.c index 03d9b9e..41ad74c 100644 --- a/buffer.c +++ b/buffer.c @@ -133,7 +133,7 @@ char32_t buffer_char_at_pos(TextBuffer *buffer, BufferPos p) { // invalid (col too large) return 0; } else { - return U'\n'; + return '\n'; } } @@ -200,7 +200,7 @@ size_t buffer_get_text_at_pos(TextBuffer *buffer, BufferPos pos, char32_t *text, if (p) { memcpy(p, line->str + index, chars_from_this_line * sizeof *p); p += chars_from_this_line; - *p++ = U'\n'; + *p++ = '\n'; } chars_left -= chars_from_this_line+1; } @@ -355,7 +355,7 @@ static void buffer_edit_print(BufferEdit *edit) { printf(" (" U32_FMT " chars): ", edit->prev_len); for (size_t i = 0; i < edit->prev_len; ++i) { char32_t c = edit->prev_text[i]; - if (c == U'\n') + if (c == '\n') printf("\\n"); else printf("%lc", (wint_t)c); @@ -547,22 +547,19 @@ Status buffer_load_file(TextBuffer *buffer, char const *filename) { size_t bytes_read = fread(file_contents, 1, file_size, fp); if (bytes_read == file_size) { 
char32_t c = 0; - mbstate_t mbstate = {0}; for (u8 *p = file_contents, *end = p + file_size; p != end; ) { if (*p == '\r' && p != end-1 && p[1] == '\n') { // CRLF line endings p += 2; - c = U'\n'; + c = '\n'; } else { - size_t n = mbrtoc32(&c, (char *)p, (size_t)(end - p), &mbstate); + size_t n = unicode_utf8_to_utf32(&c, (char *)p, (size_t)(end - p)); if (n == 0) { // null character c = 0; ++p; - } else if (n == (size_t)(-3)) { - // no bytes consumed, but a character was produced - } else if (n == (size_t)(-2) || n == (size_t)(-1)) { - // incomplete character at end of file or invalid UTF-8 respectively; fail + } else if (n == (size_t)(-1)) { + // invalid UTF-8 success = false; buffer_seterr(buffer, "Invalid UTF-8 (position: %td).", p - file_contents); break; @@ -570,7 +567,7 @@ Status buffer_load_file(TextBuffer *buffer, char const *filename) { p += n; } } - if (c == U'\n') { + if (c == '\n') { if (buffer_lines_set_min_capacity(buffer, &lines, &lines_capacity, nlines + 1)) ++nlines; } else { @@ -634,11 +631,12 @@ bool buffer_save(TextBuffer *buffer) { if (out) { bool success = true; for (Line *line = buffer->lines, *end = line + buffer->nlines; line != end; ++line) { - mbstate_t state = {0}; for (char32_t *p = line->str, *p_end = p + line->len; p != p_end; ++p) { - char utf8[MB_LEN_MAX] = {0}; - size_t bytes = c32rtomb(utf8, *p, &state); - fwrite(utf8, 1, bytes, out); + char utf8[4] = {0}; + size_t bytes = unicode_utf32_to_utf8(utf8, *p); + if (bytes != (size_t)-1) { + fwrite(utf8, 1, bytes, out); + } } if (line != end-1) { @@ -694,7 +692,7 @@ static u32 buffer_index_to_column(TextBuffer *buffer, u32 line, u32 index) { uint tab_width = buffer_settings(buffer)->tab_width; for (u32 i = 0; i < index; ++i) { switch (str[i]) { - case U'\t': { + case '\t': { do ++col; while (col % tab_width); @@ -718,7 +716,7 @@ static u32 buffer_column_to_index(TextBuffer *buffer, u32 line, u32 column) { uint tab_width = buffer_settings(buffer)->tab_width; for (u32 i = 0; i < len; 
++i) { switch (str[i]) { - case U'\t': { + case '\t': { do { if (col == column) return i; @@ -1059,7 +1057,7 @@ i64 buffer_cursor_move_down(TextBuffer *buffer, i64 by) { // Is this character a "word" character? // This determines how buffer_pos_move_words (i.e. ctrl+left/right) works static bool is_word(char32_t c) { - return c > WCHAR_MAX || c == U'_' || iswalnum((wint_t)c); + return c > WCHAR_MAX || c == '_' || iswalnum((wint_t)c); } static bool is_space(char32_t c) { @@ -1229,7 +1227,7 @@ BufferPos buffer_insert_text_at_pos(TextBuffer *buffer, BufferPos pos, String32 if (buffer->is_line_buffer) { // remove all the newlines from str. - str32_remove_all_instances_of_char(&str, U'\n'); + str32_remove_all_instances_of_char(&str, '\n'); } if (str.len == 0) { @@ -1260,7 +1258,7 @@ BufferPos buffer_insert_text_at_pos(TextBuffer *buffer, BufferPos pos, String32 // `text` could consist of multiple lines, e.g. U"line 1\nline 2", // so we need to go through them one by one - u32 n_added_lines = (u32)str32_count_char(str, U'\n'); + u32 n_added_lines = (u32)str32_count_char(str, '\n'); if (n_added_lines) { if (buffer_insert_lines(buffer, line_idx + 1, n_added_lines)) { line = &buffer->lines[line_idx]; // fix pointer @@ -1279,7 +1277,7 @@ BufferPos buffer_insert_text_at_pos(TextBuffer *buffer, BufferPos pos, String32 while (str.len) { - u32 text_line_len = (u32)str32chr(str, U'\n'); + u32 text_line_len = (u32)str32chr(str, '\n'); u32 old_len = line->len; u32 new_len = old_len + text_line_len; if (new_len > old_len) { // handles both overflow and empty text lines @@ -1947,12 +1945,12 @@ void buffer_render(TextBuffer *buffer, float x1, float y1, float x2, float y2) { char32_t c = *p; switch (c) { - case U'\n': assert(0); break; - case U'\r': break; // for CRLF line endings - case U'\t': { + case '\n': assert(0); break; + case '\r': break; // for CRLF line endings + case '\t': { uint tab_width = settings->tab_width; do { - text_render_char(font, &text_state, U' '); + 
text_render_char(font, &text_state, ' '); ++column; } while (column % tab_width); } break; diff --git a/main.c b/main.c index aea8588..4bf0a92 100644 --- a/main.c +++ b/main.c @@ -17,8 +17,12 @@ no_warn_end #include #endif - +#include "unicode.h" +#if DEBUG #include "text.h" +#else +#include "text.c" +#endif #include "util.c" #define MATH_GL #include "math.c" @@ -30,7 +34,6 @@ no_warn_end #error "Unrecognized operating system." #endif -#include "unicode.h" #include "command.h" #include "colors.h" #include "ted.h" @@ -354,10 +357,10 @@ int main(int argc, char **argv) { switch (key_combo) { case SDL_SCANCODE_RETURN << 3: if (!was_in_line_buffer) // make sure return to submit line buffer doesn't get added to newly-active buffer - buffer_insert_char_at_cursor(buffer, U'\n'); + buffer_insert_char_at_cursor(buffer, '\n'); break; case SDL_SCANCODE_TAB << 3: - buffer_insert_char_at_cursor(buffer, U'\t'); + buffer_insert_char_at_cursor(buffer, '\t'); break; } } @@ -478,15 +481,14 @@ int main(int argc, char **argv) { TextRenderState text_state = {.x = text_x1, .y = text_y1, .min_x = -FLT_MAX, .max_x = FLT_MAX, .min_y = -FLT_MAX, .max_y = FLT_MAX, .render = true}; - mbstate_t mbstate = {0}; char *p = ted->error_shown, *end = p + strlen(p); text_chars_begin(font); while (p != end) { char32_t c = 0; - size_t n = mbrtoc32(&c, p, (size_t)(end - p), &mbstate); - if (n > (size_t)-3) { ++p; continue; } // invalid UTF-8; this shouldn't happen - if (n != (size_t)-3) p += n; + size_t n = unicode_utf8_to_utf32(&c, p, (size_t)(end - p)); + if (n == (size_t)-1) { ++p; continue; } // invalid UTF-8; this shouldn't happen + p += n; if (text_state.x + char_width >= text_x2) { text_state.x = text_x1; text_state.y += char_height; diff --git a/make.bat b/make.bat index 0f4fcdf..dee7a0c 100644 --- a/make.bat +++ b/make.bat @@ -6,9 +6,8 @@ if _%VCVARS% == _ ( SET CFLAGS=/nologo /W4 /wd4200 /wd4204 /wd4221 /wd4706 /D_CRT_SECURE_NO_WARNINGS /I SDL2/include SDL2/lib/x64/SDL2main.lib 
SDL2/lib/x64/SDL2.lib opengl32.lib shell32.lib ole32.lib rc /nologo ted.rc -SET SOURCES=main.c text.c ted.res if _%1 == _ ( - cl %SOURCES% /DDEBUG /DEBUG /Zi %CFLAGS% /Fe:ted + cl main.c text.c ted.res /DDEBUG /DEBUG /Zi %CFLAGS% /Fe:ted ) -if _%1 == _release cl %SOURCES% /O2 %CFLAGS% /Fe:ted -if _%1 == _profile cl %SOURCES% /O2 /DPROFILE %CFLAGS% /Fe:ted +if _%1 == _release cl main.c ted.res /O2 %CFLAGS% /Fe:ted +if _%1 == _profile cl main.c ted.res /O2 /DPROFILE %CFLAGS% /Fe:ted diff --git a/string32.c b/string32.c index d7278d2..fc6117b 100644 --- a/string32.c +++ b/string32.c @@ -32,19 +32,15 @@ String32 str32_from_utf8(char const *utf8) { char32_t *wide_p = widestr; char const *utf8_p = utf8; char const *utf8_end = utf8_p + len; - mbstate_t mbstate = {0}; while (utf8_p < utf8_end) { char32_t c = 0; - size_t n = mbrtoc32(&c, utf8_p, (size_t)(utf8_end - utf8_p), &mbstate); - if (n == 0// null character. this shouldn't happen. - || n == (size_t)(-2) // incomplete character + size_t n = unicode_utf8_to_utf32(&c, utf8_p, (size_t)(utf8_end - utf8_p)); + if (n == 0 // null character. this shouldn't happen. 
|| n == (size_t)(-1) // invalid UTF-8 ) { free(widestr); widestr = wide_p = NULL; break; - } else if (n == (size_t)(-3)) { // no bytes consumed, but a character was produced - *wide_p++ = c; } else { // n bytes consumed *wide_p++ = c; @@ -65,11 +61,10 @@ static char *str32_to_utf8_cstr(String32 s) { char *utf8 = calloc(4 * s.len + 1, 1); // each codepoint takes up at most 4 bytes in UTF-8, + we need a terminating null byte if (utf8) { char *p = utf8; - mbstate_t mbstate; memset(&mbstate, 0, sizeof mbstate); for (size_t i = 0; i < s.len; ++i) { - size_t bytes = c32rtomb(p, s.str[i], &mbstate); + size_t bytes = unicode_utf32_to_utf8(p, s.str[i]); if (bytes == (size_t)-1) { - // invalid UTF-32 character + // invalid UTF-32 code point free(utf8); return NULL; } else { diff --git a/text.c b/text.c index fbce4eb..094db15 100644 --- a/text.c +++ b/text.c @@ -1,5 +1,6 @@ #include "base.h" #include "text.h" +#include "unicode.h" #define STB_TRUETYPE_IMPLEMENTATION #define STBTT_STATIC no_warn_start @@ -8,15 +9,6 @@ no_warn_end #include #include -#define UNICODE_BOX_CHARACTER 0x2610 -#define UNICODE_CODE_POINTS 0x110000 // number of Unicode code points - -static bool unicode_is_start_of_code_point(u8 byte) { - // see https://en.wikipedia.org/wiki/UTF-8#Encoding - // continuation bytes are of the form 10xxxxxx - return (byte & 0xC0) != 0x80; -} - // We split up code points into a bunch of pages, so we don't have to load all of the font at // once into one texture. 
#define CHAR_PAGE_SIZE 2048 @@ -249,21 +241,16 @@ void text_render_with_state(Font *font, TextRenderState *render_state, char cons render_state->x = x; render_state->y = y; char32_t c = 0; - mbstate_t mbstate = {0}; char const *end = text + strlen(text); while (text != end) { - size_t ret = mbrtoc32(&c, text, (size_t)(end - text), &mbstate); + size_t ret = unicode_utf8_to_utf32(&c, text, (size_t)(end - text)); if (ret == 0) break; - if (ret == (size_t)(-2)) { // incomplete multi-byte character - text_render_char(font, render_state, U'?'); - text = end; // done reading text - } else if (ret == (size_t)(-1)) { + if (ret == (size_t)(-1)) { // invalid UTF-8; skip this byte - text_render_char(font, render_state, U'?'); + text_render_char(font, render_state, '?'); ++text; } else { - if (ret != (size_t)(-3)) - text += ret; // character consists of `ret` bytes + text += ret; // character consists of `ret` bytes switch (c) { default: text_render_char(font, render_state, c); diff --git a/unicode.h b/unicode.h index 3f76090..cb4f2bc 100644 --- a/unicode.h +++ b/unicode.h @@ -9,4 +9,140 @@ static bool unicode_is_start_of_code_point(u8 byte) { return (byte & 0xC0) != 0x80; } -#endif +// A lot like mbrtoc32. Doesn't depend on the locale though, for one thing. +// *c will be filled with the next UTF-8 code point in `str`. `bytes` refers to the maximum +// number of bytes that can be read from `str`. +// Returns: +// 0 - if a NULL character was encountered +// (size_t)-1 - on invalid UTF-8 / incomplete code point +// other - the number of bytes read from `str`. 
+static size_t unicode_utf8_to_utf32(char32_t *c, char const *str, size_t bytes) {
+	// Every early return below (end of input, null byte, malformed sequence) reports a
+	// zero code point, so store it once up front.
+	*c = 0;
+	if (bytes == 0) {
+		return 0;
+	}
+	// it's easier to do things with unsigned integers
+	u8 const *p = (u8 const *)str;
+	u8 first_byte = p[0];
+
+	if ((first_byte & 0x80) == 0) {
+		// ASCII character; a null byte counts as 0 bytes read
+		*c = first_byte;
+		return first_byte ? 1 : 0;
+	}
+
+	// The lead byte determines the sequence length, how many payload bits it carries,
+	// and the smallest code point that actually needs that many bytes.
+	size_t length;
+	u32 value, min_value;
+	if ((first_byte & 0xE0) == 0xC0) {
+		// 110xxxxx: two-byte code point
+		length = 2;
+		value = first_byte & 0x1Fu;
+		min_value = 0x80;
+	} else if ((first_byte & 0xF0) == 0xE0) {
+		// 1110xxxx: three-byte code point
+		length = 3;
+		value = first_byte & 0x0Fu;
+		min_value = 0x800;
+	} else if ((first_byte & 0xF8) == 0xF0) {
+		// 11110xxx: four-byte code point
+		length = 4;
+		value = first_byte & 0x07u;
+		min_value = 0x10000;
+	} else {
+		// a continuation byte with no lead byte (10xxxxxx), or a lead byte which
+		// UTF-8 doesn't use (11111xxx)
+		return (size_t)-1;
+	}
+
+	if (bytes < length) {
+		// incomplete code point
+		return (size_t)-1;
+	}
+
+	for (size_t i = 1; i < length; ++i) {
+		u8 byte = p[i];
+		if ((byte & 0xC0) != 0x80) {
+			// every byte after the first must be a continuation byte (10xxxxxx);
+			// otherwise we would swallow the start of the next character
+			return (size_t)-1;
+		}
+		value = (value << 6) | (u32)(byte & 0x3F);
+	}
+
+	if (value < min_value) {
+		// overlong encoding: this code point fits in a shorter sequence
+		return (size_t)-1;
+	}
+	if (value >= 0xD800 && value <= 0xDFFF) {
+		// reserved for UTF-16 surrogate halves
+		return (size_t)-1;
+	}
+	if (value > 0x10FFFF) {
+		// Code points this big can't be encoded by UTF-16 and so are invalid UTF-8.
+		return (size_t)-1;
+	}
+
+	*c = (char32_t)value;
+	return length;
+}
+
+// A lot like c32rtomb
+// Converts a UTF-32 codepoint to a UTF-8 string. Writes at most 4 bytes to s.
+// NOTE: It is YOUR JOB to null-terminate your string if the UTF-32 isn't null-terminated!
+// Returns the number of bytes written to s, or (size_t)-1 on invalid UTF-32.
+// Invalid means a UTF-16 surrogate half (U+D800..U+DFFF) or a value above U+10FFFF;
+// in that case a single 0 byte is written to s instead.
+static size_t unicode_utf32_to_utf8(char *s, char32_t c32) {
+	u8 *p = (u8 *)s;
+	if (c32 <= 0x7F) {
+		// ASCII
+		*p = (u8)c32;
+		return 1;
+	} else if (c32 <= 0x7FF) {
+		// two bytes needed
+		*p++ = (u8)(0xC0 | (c32 >> 6));
+		*p   = (u8)(0x80 | (c32 & 0x3F));
+		return 2;
+	} else if (c32 <= 0xFFFF) {
+		// Three bytes needed. The upper bound must be U+FFFF: anything lower would send
+		// the top of the BMP (and the surrogate range) to the four-byte branch, producing
+		// overlong sequences.
+		if (c32 < 0xD800 || c32 > 0xDFFF) {
+			*p++ = (u8)(0xE0 | (c32 >> 12));
+			*p++ = (u8)(0x80 | ((c32 >> 6) & 0x3F));
+			*p   = (u8)(0x80 | (c32 & 0x3F));
+			return 3;
+		} else {
+			// UTF-16 surrogate halves
+			*p = 0;
+			return (size_t)-1;
+		}
+	} else if (c32 <= 0x10FFFF) {
+		// four bytes needed
+		*p++ = (u8)(0xF0 | (c32 >> 18));
+		*p++ = (u8)(0x80 | ((c32 >> 12) & 0x3F));
+		*p++ = (u8)(0x80 | ((c32 >> 6) & 0x3F));
+		*p   = (u8)(0x80 | (c32 & 0x3F));
+		return 4;
+	} else {
+		// code point too big
+		*p = 0;
+		return (size_t)-1;
+	}
+}
+#endif // UNICODE_H_
diff --git a/util.c b/util.c index a6dd3d9..651135e 100644 --- a/util.c +++ b/util.c @@ -134,17 +134,14 @@ static char *stristr(char const *haystack, char const *needle) { for (char const *haystack_start = haystack; haystack_start + needle_bytes <= haystack_end; utf8_next_char_const(&haystack_start)) { char const *p = haystack_start, *q = needle; - mbstate_t pstate = {0}, qstate = {0}; bool match = true; // check if p matches q while (q < needle_end) { char32_t pchar = 0, qchar = 0; - size_t bytes_p = mbrtoc32(&pchar, p, (size_t)(haystack_end - p), &pstate); - size_t bytes_q = mbrtoc32(&qchar, q, (size_t)(needle_end - q), &qstate); - if (bytes_p == (size_t)-3) bytes_p = 0; - if (bytes_q == (size_t)-3) bytes_q = 0; - if (bytes_p > (size_t)-3 || bytes_q > (size_t)-3) return NULL; // invalid UTF-8 + size_t bytes_p = unicode_utf8_to_utf32(&pchar, p, (size_t)(haystack_end - p)); + size_t bytes_q = unicode_utf8_to_utf32(&qchar, q, (size_t)(needle_end - q)); + if (bytes_p == (size_t)-1 || bytes_q == (size_t)-1) return NULL; // invalid UTF-8 bool same = pchar == qchar; if (pchar < WINT_MAX
&& qchar < WINT_MAX) // on Windows, there is no way of finding the lower-case version of a codepoint outside the BMP. ): same = towlower((wint_t)pchar) == towlower((wint_t)qchar); -- cgit v1.2.3