summary | refs | log | tree | commit | diff
path: root/unicode.h
diff options
context:
space:
mode:
author pommicket <pommicket@gmail.com> 2023-08-05 12:01:15 -0400
committer pommicket <pommicket@gmail.com> 2023-08-05 12:01:15 -0400
commit ef84bb759becde98318011652c6c5b8a52433359 (patch)
tree bd1383c92677f6c38c389604f24dc43ea795f171 /unicode.h
parent 2cd73992ef519eb1aaf6c83abe87a34dadf7ac31 (diff)
reject overlong UTF-8
Diffstat (limited to 'unicode.h')
-rw-r--r-- unicode.h 15
1 files changed, 8 insertions, 7 deletions
diff --git a/unicode.h b/unicode.h
index c93897d..4e8c020 100644
--- a/unicode.h
+++ b/unicode.h
@@ -52,6 +52,10 @@ static size_t unicode_utf8_to_utf32(uint32_t *c, const char *str, size_t bytes)
if ((second_byte & 0xC0) != 0x80) return (size_t)-1;
uint32_t value = ((uint32_t)first_byte & 0x1F) << 6
| (second_byte & 0x3F);
+ if (value < 128) {
+ // overlong
+ return (size_t)-1;
+ }
*c = (uint32_t)value;
return 2;
} else {
@@ -71,11 +75,11 @@ static size_t unicode_utf8_to_utf32(uint32_t *c, const char *str, size_t bytes)
uint32_t value = ((uint32_t)first_byte & 0x0F) << 12
| (second_byte & 0x3F) << 6
| (third_byte & 0x3F);
- if (value < 0xD800 || value > 0xDFFF) {
+ if ((value < 0xD800 || value > 0xDFFF) && value >= 0x800) {
*c = (uint32_t)value;
return 3;
} else {
- // reserved for UTF-16 surrogate halves
+ // overlong or UTF-16 surrogate halves
return (size_t)-1;
}
} else {
@@ -99,14 +103,11 @@ static size_t unicode_utf8_to_utf32(uint32_t *c, const char *str, size_t bytes)
| (second_byte & 0x3F) << 12
| (third_byte & 0x3F) << 6
| (fourth_byte & 0x3F);
- if (value >= 0xD800 && value <= 0xDFFF) {
- // reserved for UTF-16 surrogate halves
- return (size_t)-1;
- } else if (value <= 0x10FFFF) {
+ if (value >= 0x10000 && value <= 0x10FFFF) {
*c = (uint32_t)value;
return 4;
} else {
- // Code points this big can't be encoded by UTF-16 so are invalid UTF-8.
+ // overlong or value too large.
return (size_t)-1;
}
} else {