summary | refs | log | tree | commit | diff
path: root/util.c
diff options
context:
space:
mode:
Diffstat (limited to 'util.c')
-rw-r--r-- util.c 62
1 files changed, 44 insertions, 18 deletions
diff --git a/util.c b/util.c
index 8c1239c..9dcb5ea 100644
--- a/util.c
+++ b/util.c
@@ -105,29 +105,50 @@ static void str_cpy(char *dst, size_t dst_sz, char const *src) {
dst[n] = 0;
}
+// advances *str to the start of the next UTF-8 character; does nothing if *str is already at the terminating null byte
+static void utf8_next_char_const(char const **str) {
+ if (**str) {
+ do {
+ ++*str;
+ } while (((u8)(**str) & 0xC0) == 0x80); // while we are on a continuation byte
+ }
+}
+
/*
-returns the first instance of needle in haystack, ignoring the case of the characters,
+returns the first instance of needle in haystack, ignoring the case of the characters (both are UTF-8 strings; NULL is also returned if invalid UTF-8 is encountered),
or NULL if the haystack does not contain needle
WARNING: O(strlen(haystack) * strlen(needle))
*/
static char *stristr(char const *haystack, char const *needle) {
- size_t needle_len = strlen(needle), haystack_len = strlen(haystack), i, j;
+ size_t needle_bytes = strlen(needle), haystack_bytes = strlen(haystack);
+
+ if (needle_bytes > haystack_bytes) return NULL;
- if (needle_len > haystack_len) return NULL; // a larger string can't fit in a smaller string
+ char const *haystack_end = haystack + haystack_bytes;
+ char const *needle_end = needle + needle_bytes;
- for (i = 0; i <= haystack_len - needle_len; ++i) {
- char const *p = haystack + i, *q = needle;
+ for (char const *haystack_start = haystack; haystack_start + needle_bytes <= haystack_end; utf8_next_char_const(&haystack_start)) {
+ char const *p = haystack_start, *q = needle;
+ mbstate_t pstate = {0}, qstate = {0};
bool match = true;
- for (j = 0; j < needle_len; ++j) {
- if (tolower(*p) != tolower(*q)) {
- match = false;
- break;
- }
- ++p;
- ++q;
+
+ // check if p matches q
+ while (q < needle_end) {
+ char32_t pchar = 0, qchar = 0;
+ size_t bytes_p = mbrtoc32(&pchar, p, (size_t)(haystack_end - p), &pstate);
+ size_t bytes_q = mbrtoc32(&qchar, q, (size_t)(needle_end - q), &qstate);
+ if (bytes_p == (size_t)-3) bytes_p = 0;
+ if (bytes_q == (size_t)-3) bytes_q = 0;
+ if (bytes_p > (size_t)-3 || bytes_q > (size_t)-3) return NULL; // invalid UTF-8
+ bool same = pchar == qchar;
+ if (pchar < WINT_MAX && qchar < WINT_MAX) // on Windows, there is no way of finding the lower-case version of a codepoint outside the BMP. ):
+ same = towlower((wint_t)pchar) == towlower((wint_t)qchar);
+ if (!same) match = false;
+ p += bytes_p;
+ q += bytes_q;
}
if (match)
- return (char *)haystack + i;
+ return (char *)haystack_start;
}
return NULL;
}
@@ -151,12 +172,17 @@ static bool str_satisfies(char const *s, int (*predicate)(int)) {
return true;
}
-// function to be passed into qsort for case insensitive sorting
-static int str_qsort_case_insensitive_cmp(const void *av, const void *bv) {
- char const *const *a = av, *const *b = bv;
+
+static int strcmp_case_insensitive(char const *a, char const *b) {
#if _WIN32
- return _stricmp(*a, *b);
+ return _stricmp(a, b);
#else
- return strcasecmp(*a, *b);
+ return strcasecmp(a, b);
#endif
}
+
+// function to be passed into qsort for case insensitive sorting
+static int str_qsort_case_insensitive_cmp(const void *av, const void *bv) {
+ char const *const *a = av, *const *b = bv;
+ return strcmp_case_insensitive(*a, *b);
+}