summary refs log tree commit diff
diff options
context:
space:
mode:
author Leo Tenenbaum <pommicket@gmail.com> 2021-01-25 18:00:06 -0500
committer Leo Tenenbaum <pommicket@gmail.com> 2021-01-25 18:00:06 -0500
commit 795262f69900af674156bed2bcd0fdb57dbbb55e (patch)
tree 03723b919ff498722d7985a93f9ce7f470596abb
parent a56f549a266e14cdc00a98e8dc3e154f5ac6c23e (diff)
replaced c32rtomb, mbrtoc32 with own versions
these are nicer to use since they don't involve mbstate_t and should be faster since they don't involve locales
-rw-r--r-- Makefile 2
-rw-r--r-- buffer.c 48
-rw-r--r-- main.c 18
-rw-r--r-- make.bat 7
-rw-r--r-- string32.c 13
-rw-r--r-- text.c 23
-rw-r--r-- unicode.h 138
-rw-r--r-- util.c 9
8 files changed, 186 insertions, 72 deletions
diff --git a/Makefile b/Makefile
index 6dc6b3e..babddc3 100644
--- a/Makefile
+++ b/Makefile
@@ -9,7 +9,7 @@ INSTALL_BIN_DIR=/usr/bin
ted: *.[ch] text.o
$(CC) main.c text.o -o ted $(DEBUG_CFLAGS) $(LIBS)
release: *.[ch]
- $(CC) main.c text.o -o ted $(RELEASE_CFLAGS) $(LIBS)
+ $(CC) main.c -o ted $(RELEASE_CFLAGS) $(LIBS)
text.o: text.c text.h base.h lib/stb_truetype.h
$(CC) text.c -c -o $@ $(DEBUG_CFLAGS)
clean:
diff --git a/buffer.c b/buffer.c
index 03d9b9e..41ad74c 100644
--- a/buffer.c
+++ b/buffer.c
@@ -133,7 +133,7 @@ char32_t buffer_char_at_pos(TextBuffer *buffer, BufferPos p) {
// invalid (col too large)
return 0;
} else {
- return U'\n';
+ return '\n';
}
}
@@ -200,7 +200,7 @@ size_t buffer_get_text_at_pos(TextBuffer *buffer, BufferPos pos, char32_t *text,
if (p) {
memcpy(p, line->str + index, chars_from_this_line * sizeof *p);
p += chars_from_this_line;
- *p++ = U'\n';
+ *p++ = '\n';
}
chars_left -= chars_from_this_line+1;
}
@@ -355,7 +355,7 @@ static void buffer_edit_print(BufferEdit *edit) {
printf(" (" U32_FMT " chars): ", edit->prev_len);
for (size_t i = 0; i < edit->prev_len; ++i) {
char32_t c = edit->prev_text[i];
- if (c == U'\n')
+ if (c == '\n')
printf("\\n");
else
printf("%lc", (wint_t)c);
@@ -547,22 +547,19 @@ Status buffer_load_file(TextBuffer *buffer, char const *filename) {
size_t bytes_read = fread(file_contents, 1, file_size, fp);
if (bytes_read == file_size) {
char32_t c = 0;
- mbstate_t mbstate = {0};
for (u8 *p = file_contents, *end = p + file_size; p != end; ) {
if (*p == '\r' && p != end-1 && p[1] == '\n') {
// CRLF line endings
p += 2;
- c = U'\n';
+ c = '\n';
} else {
- size_t n = mbrtoc32(&c, (char *)p, (size_t)(end - p), &mbstate);
+ size_t n = unicode_utf8_to_utf32(&c, (char *)p, (size_t)(end - p));
if (n == 0) {
// null character
c = 0;
++p;
- } else if (n == (size_t)(-3)) {
- // no bytes consumed, but a character was produced
- } else if (n == (size_t)(-2) || n == (size_t)(-1)) {
- // incomplete character at end of file or invalid UTF-8 respectively; fail
+ } else if (n == (size_t)(-1)) {
+ // invalid UTF-8
success = false;
buffer_seterr(buffer, "Invalid UTF-8 (position: %td).", p - file_contents);
break;
@@ -570,7 +567,7 @@ Status buffer_load_file(TextBuffer *buffer, char const *filename) {
p += n;
}
}
- if (c == U'\n') {
+ if (c == '\n') {
if (buffer_lines_set_min_capacity(buffer, &lines, &lines_capacity, nlines + 1))
++nlines;
} else {
@@ -634,11 +631,12 @@ bool buffer_save(TextBuffer *buffer) {
if (out) {
bool success = true;
for (Line *line = buffer->lines, *end = line + buffer->nlines; line != end; ++line) {
- mbstate_t state = {0};
for (char32_t *p = line->str, *p_end = p + line->len; p != p_end; ++p) {
- char utf8[MB_LEN_MAX] = {0};
- size_t bytes = c32rtomb(utf8, *p, &state);
- fwrite(utf8, 1, bytes, out);
+ char utf8[4] = {0};
+ size_t bytes = unicode_utf32_to_utf8(utf8, *p);
+ if (bytes != (size_t)-1) {
+ fwrite(utf8, 1, bytes, out);
+ }
}
if (line != end-1) {
@@ -694,7 +692,7 @@ static u32 buffer_index_to_column(TextBuffer *buffer, u32 line, u32 index) {
uint tab_width = buffer_settings(buffer)->tab_width;
for (u32 i = 0; i < index; ++i) {
switch (str[i]) {
- case U'\t': {
+ case '\t': {
do
++col;
while (col % tab_width);
@@ -718,7 +716,7 @@ static u32 buffer_column_to_index(TextBuffer *buffer, u32 line, u32 column) {
uint tab_width = buffer_settings(buffer)->tab_width;
for (u32 i = 0; i < len; ++i) {
switch (str[i]) {
- case U'\t': {
+ case '\t': {
do {
if (col == column)
return i;
@@ -1059,7 +1057,7 @@ i64 buffer_cursor_move_down(TextBuffer *buffer, i64 by) {
// Is this character a "word" character?
// This determines how buffer_pos_move_words (i.e. ctrl+left/right) works
static bool is_word(char32_t c) {
- return c > WCHAR_MAX || c == U'_' || iswalnum((wint_t)c);
+ return c > WCHAR_MAX || c == '_' || iswalnum((wint_t)c);
}
static bool is_space(char32_t c) {
@@ -1229,7 +1227,7 @@ BufferPos buffer_insert_text_at_pos(TextBuffer *buffer, BufferPos pos, String32
if (buffer->is_line_buffer) {
// remove all the newlines from str.
- str32_remove_all_instances_of_char(&str, U'\n');
+ str32_remove_all_instances_of_char(&str, '\n');
}
if (str.len == 0) {
@@ -1260,7 +1258,7 @@ BufferPos buffer_insert_text_at_pos(TextBuffer *buffer, BufferPos pos, String32
// `text` could consist of multiple lines, e.g. U"line 1\nline 2",
// so we need to go through them one by one
- u32 n_added_lines = (u32)str32_count_char(str, U'\n');
+ u32 n_added_lines = (u32)str32_count_char(str, '\n');
if (n_added_lines) {
if (buffer_insert_lines(buffer, line_idx + 1, n_added_lines)) {
line = &buffer->lines[line_idx]; // fix pointer
@@ -1279,7 +1277,7 @@ BufferPos buffer_insert_text_at_pos(TextBuffer *buffer, BufferPos pos, String32
while (str.len) {
- u32 text_line_len = (u32)str32chr(str, U'\n');
+ u32 text_line_len = (u32)str32chr(str, '\n');
u32 old_len = line->len;
u32 new_len = old_len + text_line_len;
if (new_len > old_len) { // handles both overflow and empty text lines
@@ -1947,12 +1945,12 @@ void buffer_render(TextBuffer *buffer, float x1, float y1, float x2, float y2) {
char32_t c = *p;
switch (c) {
- case U'\n': assert(0); break;
- case U'\r': break; // for CRLF line endings
- case U'\t': {
+ case '\n': assert(0); break;
+ case '\r': break; // for CRLF line endings
+ case '\t': {
uint tab_width = settings->tab_width;
do {
- text_render_char(font, &text_state, U' ');
+ text_render_char(font, &text_state, ' ');
++column;
} while (column % tab_width);
} break;
diff --git a/main.c b/main.c
index aea8588..4bf0a92 100644
--- a/main.c
+++ b/main.c
@@ -17,8 +17,12 @@ no_warn_end
#include <shellapi.h>
#endif
-
+#include "unicode.h"
+#if DEBUG
#include "text.h"
+#else
+#include "text.c"
+#endif
#include "util.c"
#define MATH_GL
#include "math.c"
@@ -30,7 +34,6 @@ no_warn_end
#error "Unrecognized operating system."
#endif
-#include "unicode.h"
#include "command.h"
#include "colors.h"
#include "ted.h"
@@ -354,10 +357,10 @@ int main(int argc, char **argv) {
switch (key_combo) {
case SDL_SCANCODE_RETURN << 3:
if (!was_in_line_buffer) // make sure return to submit line buffer doesn't get added to newly-active buffer
- buffer_insert_char_at_cursor(buffer, U'\n');
+ buffer_insert_char_at_cursor(buffer, '\n');
break;
case SDL_SCANCODE_TAB << 3:
- buffer_insert_char_at_cursor(buffer, U'\t');
+ buffer_insert_char_at_cursor(buffer, '\t');
break;
}
}
@@ -478,15 +481,14 @@ int main(int argc, char **argv) {
TextRenderState text_state = {.x = text_x1, .y = text_y1,
.min_x = -FLT_MAX, .max_x = FLT_MAX, .min_y = -FLT_MAX, .max_y = FLT_MAX,
.render = true};
- mbstate_t mbstate = {0};
char *p = ted->error_shown, *end = p + strlen(p);
text_chars_begin(font);
while (p != end) {
char32_t c = 0;
- size_t n = mbrtoc32(&c, p, (size_t)(end - p), &mbstate);
- if (n > (size_t)-3) { ++p; continue; } // invalid UTF-8; this shouldn't happen
- if (n != (size_t)-3) p += n;
+ size_t n = unicode_utf8_to_utf32(&c, p, (size_t)(end - p));
+ if (n == (size_t)-1) { ++p; continue; } // invalid UTF-8; this shouldn't happen
+ p += n;
if (text_state.x + char_width >= text_x2) {
text_state.x = text_x1;
text_state.y += char_height;
diff --git a/make.bat b/make.bat
index 0f4fcdf..dee7a0c 100644
--- a/make.bat
+++ b/make.bat
@@ -6,9 +6,8 @@ if _%VCVARS% == _ (
SET CFLAGS=/nologo /W4 /wd4200 /wd4204 /wd4221 /wd4706 /D_CRT_SECURE_NO_WARNINGS /I SDL2/include SDL2/lib/x64/SDL2main.lib SDL2/lib/x64/SDL2.lib opengl32.lib shell32.lib ole32.lib
rc /nologo ted.rc
-SET SOURCES=main.c text.c ted.res
if _%1 == _ (
- cl %SOURCES% /DDEBUG /DEBUG /Zi %CFLAGS% /Fe:ted
+ cl main.c text.c ted.res /DDEBUG /DEBUG /Zi %CFLAGS% /Fe:ted
)
-if _%1 == _release cl %SOURCES% /O2 %CFLAGS% /Fe:ted
-if _%1 == _profile cl %SOURCES% /O2 /DPROFILE %CFLAGS% /Fe:ted
+if _%1 == _release cl main.c ted.res /O2 %CFLAGS% /Fe:ted
+if _%1 == _profile cl main.c ted.res /O2 /DPROFILE %CFLAGS% /Fe:ted
diff --git a/string32.c b/string32.c
index d7278d2..fc6117b 100644
--- a/string32.c
+++ b/string32.c
@@ -32,19 +32,15 @@ String32 str32_from_utf8(char const *utf8) {
char32_t *wide_p = widestr;
char const *utf8_p = utf8;
char const *utf8_end = utf8_p + len;
- mbstate_t mbstate = {0};
while (utf8_p < utf8_end) {
char32_t c = 0;
- size_t n = mbrtoc32(&c, utf8_p, (size_t)(utf8_end - utf8_p), &mbstate);
- if (n == 0// null character. this shouldn't happen.
- || n == (size_t)(-2) // incomplete character
+ size_t n = unicode_utf8_to_utf32(&c, utf8_p, (size_t)(utf8_end - utf8_p));
+ if (n == 0 // null character. this shouldn't happen.
|| n == (size_t)(-1) // invalid UTF-8
) {
free(widestr);
widestr = wide_p = NULL;
break;
- } else if (n == (size_t)(-3)) { // no bytes consumed, but a character was produced
- *wide_p++ = c;
} else {
// n bytes consumed
*wide_p++ = c;
@@ -65,11 +61,10 @@ static char *str32_to_utf8_cstr(String32 s) {
char *utf8 = calloc(4 * s.len + 1, 1); // each codepoint takes up at most 4 bytes in UTF-8, + we need a terminating null byte
if (utf8) {
char *p = utf8;
- mbstate_t mbstate; memset(&mbstate, 0, sizeof mbstate);
for (size_t i = 0; i < s.len; ++i) {
- size_t bytes = c32rtomb(p, s.str[i], &mbstate);
+ size_t bytes = unicode_utf32_to_utf8(p, s.str[i]);
if (bytes == (size_t)-1) {
- // invalid UTF-32 character
+ // invalid UTF-32 code point
free(utf8);
return NULL;
} else {
diff --git a/text.c b/text.c
index fbce4eb..094db15 100644
--- a/text.c
+++ b/text.c
@@ -1,5 +1,6 @@
#include "base.h"
#include "text.h"
+#include "unicode.h"
#define STB_TRUETYPE_IMPLEMENTATION
#define STBTT_STATIC
no_warn_start
@@ -8,15 +9,6 @@ no_warn_end
#include <stdlib.h>
#include <GL/gl.h>
-#define UNICODE_BOX_CHARACTER 0x2610
-#define UNICODE_CODE_POINTS 0x110000 // number of Unicode code points
-
-static bool unicode_is_start_of_code_point(u8 byte) {
- // see https://en.wikipedia.org/wiki/UTF-8#Encoding
- // continuation bytes are of the form 10xxxxxx
- return (byte & 0xC0) != 0x80;
-}
-
// We split up code points into a bunch of pages, so we don't have to load all of the font at
// once into one texture.
#define CHAR_PAGE_SIZE 2048
@@ -249,21 +241,16 @@ void text_render_with_state(Font *font, TextRenderState *render_state, char cons
render_state->x = x;
render_state->y = y;
char32_t c = 0;
- mbstate_t mbstate = {0};
char const *end = text + strlen(text);
while (text != end) {
- size_t ret = mbrtoc32(&c, text, (size_t)(end - text), &mbstate);
+ size_t ret = unicode_utf8_to_utf32(&c, text, (size_t)(end - text));
if (ret == 0) break;
- if (ret == (size_t)(-2)) { // incomplete multi-byte character
- text_render_char(font, render_state, U'?');
- text = end; // done reading text
- } else if (ret == (size_t)(-1)) {
+ if (ret == (size_t)(-1)) {
// invalid UTF-8; skip this byte
- text_render_char(font, render_state, U'?');
+ text_render_char(font, render_state, '?');
++text;
} else {
- if (ret != (size_t)(-3))
- text += ret; // character consists of `ret` bytes
+ text += ret; // character consists of `ret` bytes
switch (c) {
default:
text_render_char(font, render_state, c);
diff --git a/unicode.h b/unicode.h
index 3f76090..cb4f2bc 100644
--- a/unicode.h
+++ b/unicode.h
@@ -9,4 +9,140 @@ static bool unicode_is_start_of_code_point(u8 byte) {
return (byte & 0xC0) != 0x80;
}
-#endif
+// A lot like mbrtoc32, but stateless and independent of the locale.
+// *c will be filled with the next UTF-8 code point in `str`. `bytes` refers to the maximum
+// number of bytes that can be read from `str`.
+// Decoding is strict: every trailing byte must be a continuation byte (10xxxxxx), and
+// overlong encodings, UTF-16 surrogate halves (U+D800..U+DFFF) and values above U+10FFFF
+// are all treated as invalid.
+// Returns:
+// 0          - if a null character was encountered, or if `bytes` is 0 (*c is set to 0)
+// (size_t)-1 - on invalid UTF-8 / incomplete code point (*c is set to 0)
+// other      - the number of bytes read from `str` (1 to 4).
+static size_t unicode_utf8_to_utf32(char32_t *c, char const *str, size_t bytes) {
+	*c = 0; // the value reported on every null/error return below
+	if (bytes == 0) {
+		return 0;
+	}
+	// it's easier to do things with unsigned integers
+	u8 const *p = (u8 const *)str;
+	u8 first_byte = p[0];
+
+	if (first_byte < 0x80) {
+		// ASCII character; a 0 byte is reported through the "null character" return value
+		*c = first_byte;
+		return first_byte ? 1 : 0;
+	}
+
+	size_t length; // total number of bytes in this sequence
+	u32 value;     // payload bits gathered so far
+	u32 min_value; // smallest code point that actually needs `length` bytes
+	if ((first_byte & 0xE0) == 0xC0) {
+		// two-byte code point: 110xxxxx 10xxxxxx
+		length = 2;
+		value = (u32)first_byte & 0x1F;
+		min_value = 0x80;
+	} else if ((first_byte & 0xF0) == 0xE0) {
+		// three-byte code point: 1110xxxx 10xxxxxx 10xxxxxx
+		length = 3;
+		value = (u32)first_byte & 0x0F;
+		min_value = 0x800;
+	} else if ((first_byte & 0xF8) == 0xF0) {
+		// four-byte code point: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+		length = 4;
+		value = (u32)first_byte & 0x07;
+		min_value = 0x10000;
+	} else {
+		// stray continuation byte (10xxxxxx) or a lead byte UTF-8 never uses (11111xxx)
+		return (size_t)-1;
+	}
+
+	if (bytes < length) {
+		// incomplete code point
+		return (size_t)-1;
+	}
+
+	for (size_t i = 1; i < length; ++i) {
+		if ((p[i] & 0xC0) != 0x80) {
+			// not a continuation byte, so the sequence is cut short
+			return (size_t)-1;
+		}
+		value = value << 6 | ((u32)p[i] & 0x3F);
+	}
+
+	if (value < min_value) {
+		// overlong encoding (e.g. C0 80 for U+0000): the shortest form is the only valid one
+		return (size_t)-1;
+	}
+	if (value >= 0xD800 && value <= 0xDFFF) {
+		// reserved for UTF-16 surrogate halves
+		return (size_t)-1;
+	}
+	if (value > 0x10FFFF) {
+		// Code points this big can't be encoded by UTF-16 and so are invalid UTF-8.
+		return (size_t)-1;
+	}
+
+	*c = (char32_t)value;
+	return length;
+}
+
+// A lot like c32rtomb, but stateless and independent of the locale.
+// Converts a UTF-32 code point to a UTF-8 string. Writes at most 4 bytes to s.
+// NOTE: It is YOUR JOB to null-terminate your string if the UTF-32 isn't null-terminated!
+// Returns the number of bytes written to s (1 to 4), or (size_t)-1 on invalid UTF-32
+// (a UTF-16 surrogate half, or a value above U+10FFFF). On failure a single 0 byte is
+// stored at s[0].
+static size_t unicode_utf32_to_utf8(char *s, char32_t c32) {
+	u8 *p = (u8 *)s;
+	if (c32 <= 0x7F) {
+		// ASCII: 0xxxxxxx
+		p[0] = (u8)c32;
+		return 1;
+	}
+	if (c32 <= 0x7FF) {
+		// two bytes needed: 110xxxxx 10xxxxxx
+		p[0] = (u8)(0xC0 | (c32 >> 6));
+		p[1] = (u8)(0x80 | (c32 & 0x3F));
+		return 2;
+	}
+	if (c32 >= 0xD800 && c32 <= 0xDFFF) {
+		// UTF-16 surrogate halves are not code points and have no UTF-8 encoding
+		p[0] = 0;
+		return (size_t)-1;
+	}
+	if (c32 <= 0xFFFF) {
+		// three bytes needed: 1110xxxx 10xxxxxx 10xxxxxx
+		// The bound must be 0xFFFF (16 payload bits); a smaller bound would push
+		// U+8000..U+FFFF into the four-byte branch and produce overlong output.
+		p[0] = (u8)(0xE0 | (c32 >> 12));
+		p[1] = (u8)(0x80 | ((c32 >> 6) & 0x3F));
+		p[2] = (u8)(0x80 | (c32 & 0x3F));
+		return 3;
+	}
+	if (c32 <= 0x10FFFF) {
+		// four bytes needed: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+		p[0] = (u8)(0xF0 | (c32 >> 18));
+		p[1] = (u8)(0x80 | ((c32 >> 12) & 0x3F));
+		p[2] = (u8)(0x80 | ((c32 >> 6) & 0x3F));
+		p[3] = (u8)(0x80 | (c32 & 0x3F));
+		return 4;
+	}
+	// code point too big
+	p[0] = 0;
+	return (size_t)-1;
+}
+#endif // UNICODE_H_
diff --git a/util.c b/util.c
index a6dd3d9..651135e 100644
--- a/util.c
+++ b/util.c
@@ -134,17 +134,14 @@ static char *stristr(char const *haystack, char const *needle) {
for (char const *haystack_start = haystack; haystack_start + needle_bytes <= haystack_end; utf8_next_char_const(&haystack_start)) {
char const *p = haystack_start, *q = needle;
- mbstate_t pstate = {0}, qstate = {0};
bool match = true;
// check if p matches q
while (q < needle_end) {
char32_t pchar = 0, qchar = 0;
- size_t bytes_p = mbrtoc32(&pchar, p, (size_t)(haystack_end - p), &pstate);
- size_t bytes_q = mbrtoc32(&qchar, q, (size_t)(needle_end - q), &qstate);
- if (bytes_p == (size_t)-3) bytes_p = 0;
- if (bytes_q == (size_t)-3) bytes_q = 0;
- if (bytes_p > (size_t)-3 || bytes_q > (size_t)-3) return NULL; // invalid UTF-8
+ size_t bytes_p = unicode_utf8_to_utf32(&pchar, p, (size_t)(haystack_end - p));
+ size_t bytes_q = unicode_utf8_to_utf32(&qchar, q, (size_t)(needle_end - q));
+ if (bytes_p == (size_t)-1 || bytes_q == (size_t)-1) return NULL; // invalid UTF-8
bool same = pchar == qchar;
if (pchar < WINT_MAX && qchar < WINT_MAX) // on Windows, there is no way of finding the lower-case version of a codepoint outside the BMP. ):
same = towlower((wint_t)pchar) == towlower((wint_t)qchar);