summary | refs | log | tree | commit | diff
path: root/unicode.h
diff options
context:
space:
mode:
Diffstat (limited to 'unicode.h')
-rw-r--r-- unicode.h 49
1 files changed, 29 insertions, 20 deletions
diff --git a/unicode.h b/unicode.h
index 2b686df..1e517c4 100644
--- a/unicode.h
+++ b/unicode.h
@@ -1,10 +1,14 @@
-// functions for dealing with UTF-8/UTF-16/UTF-32.
-// this file is entirely self-contained.
+/// \file
+/// functions for dealing with UTF-8/UTF-16/UTF-32.
+///
+/// this file is entirely self-contained.
#ifndef UNICODE_H_
#define UNICODE_H_
+/// useful for "this character couldn't be rendered / is invalid UTF-8"
#define UNICODE_BOX_CHARACTER 0x2610
-#define UNICODE_CODE_POINTS 0x110000 // number of Unicode code points
+/// number of Unicode code points
+#define UNICODE_CODE_POINTS 0x110000
#include <stddef.h>
#include <stdint.h>
@@ -19,15 +23,16 @@ static bool unicode_is_continuation_byte(uint8_t byte) {
return (byte & 0xC0) == 0x80;
}
-// A lot like mbrtoc32. Doesn't depend on the locale though, for one thing.
-// *c will be filled with the next UTF-8 code point in `str`. `bytes` refers to the maximum
-// number of bytes that can be read from `str` (note: this function will never read past a null
-// byte, even if `bytes` indicates that it could).
-// Returns:
-// 0 - if a null character was encountered or if `bytes == 0`
-// (size_t)-1 - on invalid UTF-8
-// (size_t)-2 - on incomplete code point (str should be longer)
-// other - the number of bytes read from `str`.
+/// A lot like mbrtoc32. Doesn't depend on the locale though, for one thing.
+///
+/// `*c` will be filled with the next UTF-8 code point in `str`. `bytes` refers to the maximum
+/// number of bytes that can be read from `str` (note: this function will never read past a null
+/// byte, even if `bytes` indicates that it could).
+/// Returns:\n
+/// `0` - if a null character was encountered or if `bytes == 0`\n
+/// `(size_t)-1` - on invalid UTF-8\n
+/// `(size_t)-2` - on incomplete code point (`str` should be longer)\n
+/// other - the number of bytes read from `str`.
static size_t unicode_utf8_to_utf32(uint32_t *c, const char *str, size_t bytes) {
*c = 0;
if (bytes == 0) {
@@ -121,10 +126,11 @@ static size_t unicode_utf8_to_utf32(uint32_t *c, const char *str, size_t bytes)
}
}
-// A lot like c32rtomb
-// Converts a UTF-32 codepoint to a UTF-8 string. Writes at most 4 bytes to s.
-// NOTE: It is YOUR JOB to null-terminate your string if the UTF-32 isn't null-terminated!
-// Returns the number of bytes written to s, or (size_t)-1 on invalid UTF-32.
+/// A lot like c32rtomb.
+///
+/// Converts a UTF-32 codepoint to a UTF-8 string. Writes at most 4 bytes to `s`.
+/// NOTE: It is YOUR JOB to null-terminate your string if the UTF-32 isn't null-terminated!
+/// Returns the number of bytes written to `s`, or `(size_t)-1` on invalid UTF-32.
static size_t unicode_utf32_to_utf8(char *s, uint32_t c32) {
uint8_t *p = (uint8_t *)s;
if (c32 <= 0x7F) {
@@ -162,7 +168,8 @@ static size_t unicode_utf32_to_utf8(char *s, uint32_t c32) {
// get the number of UTF-16 codepoints needed to encode `str`.
-// returns (size_t)-1 on bad UTF-8
+///
+/// returns `(size_t)-1` on bad UTF-8
static size_t unicode_utf16_len(const char *str) {
size_t len = 0;
uint32_t c = 0;
@@ -179,9 +186,11 @@ static size_t unicode_utf16_len(const char *str) {
return len;
}
-// returns the UTF-8 offset from `str` which corresponds to a UTF-16 offset of utf16_offset (rounds down if utf16_offset is in the middle of a codepoint).
-// returns strlen(str) if utf16_offset == unicode_utf16_len(str)
-// returns (size_t)-1 on bad UTF-8, or if utf16_offset > unicode_utf16_len(str)
+/// returns the UTF-8 offset from `str` which corresponds to a UTF-16 offset of
+/// `utf16_offset` (rounds down if `utf16_offset` is in the middle of a codepoint).
+///
+/// returns `strlen(str)` if `utf16_offset == unicode_utf16_len(str)`\n
+/// returns `(size_t)-1` on bad UTF-8, or if `utf16_offset > unicode_utf16_len(str)`
static size_t unicode_utf16_to_utf8_offset(const char *str, size_t utf16_offset) {
size_t offset = 0;
uint32_t c = 0;