// syntax highlighting for ted

#include "ted.h"
#include "keywords.h"

// all characters that can appear in a number
// (looked up with strchr by syntax_number_continues)
#define SYNTAX_DIGITS "0123456789.xXoObBlLuUiIabcdefABCDEF_"

// ---- syntax state constants ----
// syntax state is explained in development.md
// each language has its own set of bit flags; a highlighter reads them from
// the incoming SyntaxState and writes the updated flags back when the line is done.

// these all say "CPP" but really they're C/C++
enum {
	SYNTAX_STATE_CPP_MULTI_LINE_COMMENT = 0x1u, // are we in a multi-line comment? (delineated by /* */)
	SYNTAX_STATE_CPP_SINGLE_LINE_COMMENT = 0x2u, // if you add a \ to the end of a single-line comment, it is continued to the next line.
	SYNTAX_STATE_CPP_PREPROCESSOR = 0x4u, // similar to above
	SYNTAX_STATE_CPP_STRING = 0x8u,
	SYNTAX_STATE_CPP_RAW_STRING = 0x10u,
};

// the comment depth lives in the low SYNTAX_STATE_RUST_COMMENT_DEPTH_BITS bits;
// the string flags are the next bits up.
enum {
	SYNTAX_STATE_RUST_COMMENT_DEPTH_MASK = 0xfu, // in rust, /* */ comments can nest.
	SYNTAX_STATE_RUST_COMMENT_DEPTH_MUL = 0x1u,
	SYNTAX_STATE_RUST_COMMENT_DEPTH_BITS = 4, // number of bits we allocate for the comment depth.
	SYNTAX_STATE_RUST_STRING = 0x10u,
	SYNTAX_STATE_RUST_STRING_IS_RAW = 0x20u,
};

enum {
	SYNTAX_STATE_PYTHON_STRING = 0x01u, // multiline strings (''' and """)
	SYNTAX_STATE_PYTHON_STRING_DBL_QUOTED = 0x02u, // is this a """ string, as opposed to a ''' string?
};

enum {
	SYNTAX_STATE_TEX_DOLLAR = 0x01u, // inside math $ ... $
	SYNTAX_STATE_TEX_DOLLARDOLLAR = 0x02u, // inside math $$ ... $$
	SYNTAX_STATE_TEX_VERBATIM = 0x04u, // inside \begin{verbatim} ... \end{verbatim}
};

enum {
	SYNTAX_STATE_MARKDOWN_CODE = 0x01u, // inside ``` ``` code section
};

enum {
	SYNTAX_STATE_HTML_COMMENT = 0x01u
};

enum {
	SYNTAX_STATE_JAVASCRIPT_TEMPLATE_STRING = 0x01u,
	SYNTAX_STATE_JAVASCRIPT_MULTILINE_COMMENT = 0x02u,
};

enum {
	SYNTAX_STATE_JAVA_MULTILINE_COMMENT = 0x01u
};

enum {
	SYNTAX_STATE_GO_RAW_STRING = 0x01u, // backtick-enclosed string
	SYNTAX_STATE_GO_MULTILINE_COMMENT = 0x02u
};

enum {
	SYNTAX_STATE_TED_CFG_STRING = 0x01u, // ` or "-delimited string
	SYNTAX_STATE_TED_CFG_STRING_BACKTICK = 0x02u, // `-delimited string
};

// associates a Language with its name.
typedef struct {
	Language lang;
	const char *name;
} LanguageName;

// name of every language.
// language_from_str and language_to_str both search this table linearly.
static const LanguageName language_names[] = {
	{LANG_NONE, "None"},
	{LANG_C, "C"},
	{LANG_CPP, "C++"},
	{LANG_RUST, "Rust"},
	{LANG_PYTHON, "Python"},
	{LANG_TEX, "Tex"},
	{LANG_MARKDOWN, "Markdown"},
	{LANG_HTML, "HTML"},
	{LANG_CONFIG, "Config"},
	{LANG_JAVASCRIPT, "JavaScript"},
	{LANG_JAVA, "Java"},
	{LANG_GO, "Go"},
	{LANG_TED_CFG, "TedCfg"},
	{LANG_TYPESCRIPT, "TypeScript"},
	{LANG_JSON, "JSON"},
	{LANG_XML, "XML"},
	{LANG_GLSL, "GLSL"},
	{LANG_TEXT, "Text"},
};

// the lookups iterate up to LANG_COUNT, so the table must have exactly that many entries.
static_assert_if_possible(arr_count(language_names) == LANG_COUNT)

// returns the language this string is referring to, or LANG_NONE if it's invalid.
Language language_from_str(const char *str) { for (int i = 0; i < LANG_COUNT; ++i) { if (strcmp_case_insensitive(language_names[i].name, str) == 0) return language_names[i].lang; } return LANG_NONE; } const char *language_to_str(Language language) { for (int i = 0; i < LANG_COUNT; ++i) { if (language_names[i].lang == language) return language_names[i].name; } return "???"; } // start of single line comment for language l -- used for comment/uncomment selection const char *language_comment_start(Language l) { switch (l) { case LANG_C: case LANG_RUST: case LANG_CPP: case LANG_JAVASCRIPT: case LANG_TYPESCRIPT: case LANG_JSON: // JSON technically doesn't have comments but apparently some parsers support this so might as well have this here case LANG_JAVA: case LANG_GO: case LANG_GLSL: return "// "; case LANG_CONFIG: case LANG_TED_CFG: case LANG_PYTHON: return "# "; case LANG_TEX: return "% "; case LANG_HTML: case LANG_XML: return ""; default: return ""; } } ColorSetting syntax_char_type_to_color_setting(SyntaxCharType t) { switch (t) { case SYNTAX_NORMAL: return COLOR_TEXT; case SYNTAX_KEYWORD: return COLOR_KEYWORD; case SYNTAX_COMMENT: return COLOR_COMMENT; case SYNTAX_PREPROCESSOR: return COLOR_PREPROCESSOR; case SYNTAX_STRING: return COLOR_STRING; case SYNTAX_CHARACTER: return COLOR_CHARACTER; case SYNTAX_CONSTANT: return COLOR_CONSTANT; case SYNTAX_BUILTIN: return COLOR_BUILTIN; } return COLOR_TEXT; } static bool syntax_keyword_matches(const char32_t *text, size_t len, const char *keyword) { if (len == strlen(keyword)) { bool matches = true; const char32_t *p = text; // check if `p` starts with `keyword` for (const char *q = keyword; *q; ++p, ++q) { if (*p != (char32_t)*q) { matches = false; break; } } return matches; } else { return false; } } char32_t syntax_matching_bracket(Language lang, char32_t c) { if (lang == LANG_HTML || lang == LANG_XML) { // for most languages, this would look weird since // v cursor // if (x < 5 && y >| 6) // ^ this will be highlighted 
as a "matching bracket" // but for HTML this is nice switch (c) { case '<': return '>'; case '>': return '<'; } } switch (c) { case '(': return ')'; case ')': return '('; case '[': return ']'; case ']': return '['; case '{': return '}'; case '}': return '{'; } return 0; } bool syntax_is_opening_bracket(Language lang, char32_t c) { if (lang == LANG_HTML || lang == LANG_XML) { if (c == '<') return true; } switch (c) { case '(': case '[': case '{': return true; } return false; } // lookup the given string in the keywords table static Keyword const *syntax_keyword_lookup(const KeywordList *all_keywords, const char32_t *str, size_t len) { if (!len) return NULL; const KeywordList *list = &all_keywords[str[0] % 128]; const Keyword *keywords = list->keywords; size_t nkeywords = list->len; if (keywords) { for (size_t k = 0; k < nkeywords; ++k) { if (syntax_keyword_matches(str, len, keywords[k].str)) { return &keywords[k]; } } } return NULL; } // does i continue the number literal from i-1 static bool syntax_number_continues(Language lang, const char32_t *line, u32 line_len, u32 i) { if (line[i] == '.') { if ((i && line[i-1] == '.') || (i < line_len-1 && line[i+1] == '.')) return false; // can't have two .s in a row if (i < line_len-1 && lang == LANG_RUST && !isdigit(line[i+1]) && line[i+1] != '_') { // don't highlight 0.into() weirdly // (in Rust, only 0123456789_ can follow a decimal point) return false; } } return (line[i] < CHAR_MAX && (strchr(SYNTAX_DIGITS, (char)line[i]) || (i && line[i-1] == 'e' && (line[i] == '+' || line[i] == '-')))); } static bool is_keyword(Language lang, char32_t c) { if (c == '_' && lang == LANG_TEX) return false; if (is32_word(c)) return true; switch (lang) { case LANG_RUST: // Rust builtin macros if (c == '!') return true; break; case LANG_HTML: case LANG_XML: if (c == '-' || c == '=') return true; break; default: break; } return false; } // find how long this keyword would be (if this is a keyword) static u32 syntax_keyword_len(Language lang, 
const char32_t *line, u32 i, u32 line_len) { u32 keyword_end; for (keyword_end = i; keyword_end < line_len; ++keyword_end) { if (!is_keyword(lang, line[keyword_end])) break; } return keyword_end - i; } // highlighting for C, C++, and GLSL static void syntax_highlight_c_cpp(SyntaxState *state_ptr, const char32_t *line, u32 line_len, SyntaxCharType *char_types, Language lang) { SyntaxState state = *state_ptr; bool in_preprocessor = (state & SYNTAX_STATE_CPP_PREPROCESSOR) != 0; bool in_string = (state & SYNTAX_STATE_CPP_STRING) != 0; bool in_single_line_comment = (state & SYNTAX_STATE_CPP_SINGLE_LINE_COMMENT) != 0; bool in_multi_line_comment = (state & SYNTAX_STATE_CPP_MULTI_LINE_COMMENT) != 0; bool in_raw_string = (state & SYNTAX_STATE_CPP_RAW_STRING); bool in_char = false; bool in_number = false; bool raw_string_ending = false; int backslashes = 0; for (u32 i = 0; i < line_len; ++i) { // are there 1/2 characters left in the line? bool has_1_char = i + 1 < line_len; bool has_2_chars = i + 2 < line_len; bool dealt_with = false; char32_t c = line[i]; if (in_raw_string) { if (has_2_chars && c == ')' && line[1] == '"') { raw_string_ending = true; } if (char_types) char_types[i] = SYNTAX_STRING; if (raw_string_ending && c == '"') in_raw_string = false; dealt_with = true; } else switch (c) { case '#': if (!in_single_line_comment && !in_multi_line_comment && !in_char && !in_string) in_preprocessor = true; break; case '\\': ++backslashes; break; case '/': if (!in_multi_line_comment && !in_single_line_comment && !in_string && !in_char && has_1_char) { if (line[i + 1] == '/') in_single_line_comment = true; // // else if (line[i + 1] == '*') in_multi_line_comment = true; // /* } else if (in_multi_line_comment) { if (i && line[i - 1] == '*') { // */ in_multi_line_comment = false; if (char_types) { dealt_with = true; char_types[i] = SYNTAX_COMMENT; } } } break; case '"': if (in_string && backslashes % 2 == 0) { in_string = false; if (char_types) { dealt_with = true; char_types[i] 
= SYNTAX_STRING; } } else if (!in_multi_line_comment && !in_single_line_comment && !in_char) { in_string = true; } break; case '\'': if (in_char && backslashes % 2 == 0) { in_char = false; if (char_types) { dealt_with = true; char_types[i] = SYNTAX_CHARACTER; } } else if (!in_multi_line_comment && !in_single_line_comment && !in_string) { if (i == 0 || !is32_digit(line[i-1])) // in C++, you can use ' as a separator, e.g. 1'000'000 in_char = true; } break; case ANY_DIGIT: // a number! if (char_types && !in_single_line_comment && !in_multi_line_comment && !in_string && !in_number && !in_char) { in_number = true; if (i) { if (line[i - 1] == '.') { // support .6, for example char_types[i - 1] = SYNTAX_CONSTANT; } else if (is32_word(line[i - 1])) { // actually, this isn't a number. it's something like a*6* or u3*2*. in_number = false; } } } break; default: { if ((i && is32_word(line[i - 1])) || !is32_word(c)) break; // can't be a keyword on its own. if (!in_single_line_comment && !in_multi_line_comment && !in_string && c == 'R' && has_2_chars && line[i + 1] == '"' && line[i + 2] == '(') { // raw string in_raw_string = true; raw_string_ending = false; break; } // keywords don't matter for advancing the state if (char_types && !in_single_line_comment && !in_multi_line_comment && !in_number && !in_string && !in_preprocessor && !in_char) { u32 keyword_len = syntax_keyword_len(lang, line, i, line_len); Keyword const *keyword = NULL; switch (lang) { case LANG_CPP: keyword = syntax_keyword_lookup(syntax_all_keywords_cpp, &line[i], keyword_len); if (!keyword) keyword = syntax_keyword_lookup(syntax_all_keywords_c, &line[i], keyword_len); break; case LANG_GLSL: keyword = syntax_keyword_lookup(syntax_all_keywords_glsl, &line[i], keyword_len); break; default: assert(lang == LANG_C); keyword = syntax_keyword_lookup(syntax_all_keywords_c, &line[i], keyword_len); break; } if (keyword) { SyntaxCharType type = keyword->type; for (size_t j = 0; j < keyword_len; ++j) { char_types[i++] = 
type; } --i; // we'll increment i from the for loop dealt_with = true; break; } } } break; } if (c != '\\') backslashes = 0; if (in_number && !syntax_number_continues(lang, line, line_len, i)) { in_number = false; } if (char_types && !dealt_with) { SyntaxCharType type = SYNTAX_NORMAL; if (in_single_line_comment || in_multi_line_comment) type = SYNTAX_COMMENT; else if (in_string) type = SYNTAX_STRING; else if (in_char) type = SYNTAX_CHARACTER; else if (in_number) type = SYNTAX_CONSTANT; else if (in_preprocessor) type = SYNTAX_PREPROCESSOR; char_types[i] = type; } } *state_ptr = (SyntaxState)( ((backslashes && in_single_line_comment) * SYNTAX_STATE_CPP_SINGLE_LINE_COMMENT) | ((backslashes && in_preprocessor) * SYNTAX_STATE_CPP_PREPROCESSOR) | ((backslashes && in_string) * SYNTAX_STATE_CPP_STRING) | (in_multi_line_comment * SYNTAX_STATE_CPP_MULTI_LINE_COMMENT) | (in_raw_string * SYNTAX_STATE_CPP_RAW_STRING) ); } static void syntax_highlight_rust(SyntaxState *state, const char32_t *line, u32 line_len, SyntaxCharType *char_types) { u32 comment_depth = (((u32)*state & SYNTAX_STATE_RUST_COMMENT_DEPTH_MASK) / SYNTAX_STATE_RUST_COMMENT_DEPTH_MUL); bool in_string = (*state & SYNTAX_STATE_RUST_STRING) != 0; bool string_is_raw = (*state & SYNTAX_STATE_RUST_STRING_IS_RAW) != 0; bool in_number = false; bool in_attribute = false; int backslashes = 0; int bracket_depth = 0; for (u32 i = 0; i < line_len; ++i) { char32_t c = line[i]; bool dealt_with = false; bool has_1_char = i + 1 < line_len; bool has_2_chars = i + 2 < line_len; bool has_3_chars = i + 3 < line_len; switch (c) { case '/': if (!in_string) { if (i && line[i-1] == '*') { // */ if (comment_depth) --comment_depth; if (char_types) { char_types[i] = SYNTAX_COMMENT; dealt_with = true; } } else if (has_1_char && line[i+1] == '*') { // /* ++comment_depth; } else if (!comment_depth && has_1_char && line[i+1] == '/') { // // // just handle it all now if (char_types) { for (u32 j = i; j < line_len; ++j) char_types[j] = 
SYNTAX_COMMENT; } i = line_len - 1; dealt_with = true; break; } } break; case 'r': if (char_types && !comment_depth) { if (has_2_chars && line[i+1] == '#' && line[i+2] == '"') { // r before raw string char_types[i] = SYNTAX_STRING; dealt_with = true; } } goto keyword_check; case 'b': if (char_types && !comment_depth) { if ((has_1_char && line[i+1] == '"') || (has_3_chars && line[i+1] == 'r' && line[i+2] == '#' && line[i+3] == '"')) { // b before byte string char_types[i] = SYNTAX_STRING; dealt_with = true; } if (has_1_char && line[i+1] == '\'') { // b before byte char char_types[i] = SYNTAX_CHARACTER; dealt_with = true; } } goto keyword_check; case '"': if (!comment_depth) { if (in_string) { if (backslashes % 2 == 0) { if (!string_is_raw || (has_1_char && line[i+1] == '#')) { // end of string literal in_string = false; if (char_types) { char_types[i] = SYNTAX_STRING; dealt_with = true; if (string_is_raw && has_1_char) { // highlighting for final # ++i; char_types[i] = SYNTAX_STRING; } } string_is_raw = false; } } } else { // start of string literal in_string = true; if (i && line[i-1] == '#') string_is_raw = true; } } break; case '\'': { if (!comment_depth && !in_string && has_2_chars) { // figure out if this is a character or a lifetime u32 char_end; backslashes = line[i+1] == '\\'; for (char_end = i + 2; char_end < line_len; ++char_end) { if (line[char_end] == '\'' && backslashes % 2 == 0) { break; } if (line[char_end] == '\\') ++backslashes; else backslashes = 0; if (line[char_end] < CHAR_MAX && line[char_end - 1] != '\\' && !strchr("abcdefABCDEF0123456789", (char)line[char_end])) break; } if (char_end < line_len && line[char_end] == '\'') { // a character literal if (char_types) { for (u32 j = i; j <= char_end; ++j) char_types[j] = SYNTAX_CHARACTER; dealt_with = true; } i = char_end; } else { // a lifetime or something else } } } break; case '\\': ++backslashes; break; case ANY_DIGIT: // a number! 
if (char_types && !comment_depth && !in_string && !in_number) { in_number = true; if (i && (is32_word(line[i - 1]) || (line[i-1] == '.' && !(i >= 2 && line[i-2] == '.'))) ) { // actually, this isn't a number. it's something like a*6* or u3*2*. // also, don't highlight the 0 in tuple.0 in_number = false; } } break; case '[': if (in_attribute && !in_string && !comment_depth) { ++bracket_depth; } break; case ']': if (in_attribute && !in_string && !comment_depth) { --bracket_depth; if (bracket_depth < 0) { in_attribute = false; } } break; case '#': if (char_types && !in_string && !comment_depth) { if (i && line[i-1] == 'r') { if (has_1_char && line[i+1] == '"') { // raw string char_types[i] = SYNTAX_STRING; dealt_with = true; } else { // raw identifier } break; } if (!has_2_chars) break; if (line[i+1] == '[' || (line[i+1] == '!' && line[i+2] == '[')) { in_attribute = true; bracket_depth = 0; } } break; default: keyword_check: { if ((i && is32_word(line[i - 1])) || !is32_word(c)) break; // can't be a keyword on its own. 
if (i >= 2 && line[i-2] == 'r' && line[i-1] == '#') { // raw identifier break; } if (char_types && !in_string && !comment_depth && !in_number) { u32 keyword_len = syntax_keyword_len(LANG_RUST, line, i, line_len); Keyword const *keyword = syntax_keyword_lookup(syntax_all_keywords_rust, &line[i], keyword_len); if (keyword) { SyntaxCharType type = keyword->type; for (size_t j = 0; j < keyword_len; ++j) { char_types[i++] = type; } --i; // we'll increment i from the for loop dealt_with = true; break; } } } break; } if (c != '\\') backslashes = 0; if (in_number && !syntax_number_continues(LANG_RUST, line, line_len, i)) in_number = false; if (char_types && !dealt_with) { SyntaxCharType type = SYNTAX_NORMAL; if (comment_depth) { type = SYNTAX_COMMENT; } else if (in_string) { type = SYNTAX_STRING; } else if (in_number) { type = SYNTAX_CONSTANT; } else if (in_attribute) { type = SYNTAX_PREPROCESSOR; } char_types[i] = type; } } u32 max_comment_depth = ((u32)1<= max_comment_depth) comment_depth = max_comment_depth; *state = (SyntaxState)( (comment_depth * SYNTAX_STATE_RUST_COMMENT_DEPTH_MUL) | (in_string * SYNTAX_STATE_RUST_STRING) | (string_is_raw * SYNTAX_STATE_RUST_STRING_IS_RAW) ); } static void syntax_highlight_python(SyntaxState *state, const char32_t *line, u32 line_len, SyntaxCharType *char_types) { (void)state; bool in_string = (*state & SYNTAX_STATE_PYTHON_STRING) != 0; bool string_is_dbl_quoted = (*state & SYNTAX_STATE_PYTHON_STRING_DBL_QUOTED) != 0; bool string_is_multiline = true; bool in_number = false; uint backslashes = 0; for (u32 i = 0; i < line_len; ++i) { char32_t c = line[i]; bool dealt_with = false; switch (c) { case '#': if (!in_string) { // comment if (char_types) { for (u32 j = i; j < line_len; ++j) char_types[j] = SYNTAX_COMMENT; dealt_with = true; } i = line_len - 1; } break; case 'f': case 'r': case 'b': if (char_types && i+1 < line_len && (line[i+1] == '\'' || line[i+1] == '"')) { // format/raw/byte string // @TODO(eventually): we don't handle raw 
string highlighting correctly. char_types[i] = SYNTAX_STRING; dealt_with = true; } goto keyword_check; case '\'': case '"': { bool dbl_quoted = c == '"'; bool is_triple = i+2 < line_len && line[i+1] == c && line[i+2] == c; if (in_string) { if (!string_is_multiline || is_triple) { if (string_is_dbl_quoted == dbl_quoted && backslashes % 2 == 0) { // end of string in_string = false; if (char_types) { char_types[i] = SYNTAX_STRING; if (string_is_multiline) { // highlight all three ending quotes char_types[++i] = SYNTAX_STRING; char_types[++i] = SYNTAX_STRING; } dealt_with = true; } } } } else { // start of string string_is_dbl_quoted = dbl_quoted; in_string = true; string_is_multiline = is_triple; } } break; case ANY_DIGIT: if (char_types && !in_string && !in_number) { in_number = true; if (i) { if (line[i - 1] == '.') { // support .6, for example char_types[i - 1] = SYNTAX_CONSTANT; } else if (is32_word(line[i - 1])) { // actually, this isn't a number. it's something like a*6* or u3*2*. in_number = false; } } } break; case '\\': ++backslashes; break; default: keyword_check: if ((i && is32_word(line[i - 1])) || !is32_word(c)) break; // can't be a keyword on its own. 
if (char_types && !in_string && !in_number) { u32 keyword_len = syntax_keyword_len(LANG_PYTHON, line, i, line_len); Keyword const *keyword = syntax_keyword_lookup(syntax_all_keywords_python, &line[i], keyword_len); if (keyword) { SyntaxCharType type = keyword->type; for (size_t j = 0; j < keyword_len; ++j) { char_types[i++] = type; } --i; // we'll increment i from the for loop dealt_with = true; break; } } break; } if (c != '\\') backslashes = 0; if (in_number && !syntax_number_continues(LANG_PYTHON, line, line_len, i)) in_number = false; if (char_types && !dealt_with) { SyntaxCharType type = SYNTAX_NORMAL; if (in_string) type = SYNTAX_STRING; else if (in_number) type = SYNTAX_CONSTANT; char_types[i] = type; } } *state = 0; if (in_string && string_is_multiline) { *state |= (SyntaxState)( SYNTAX_STATE_PYTHON_STRING | (SYNTAX_STATE_PYTHON_STRING_DBL_QUOTED * string_is_dbl_quoted) ); } } static bool is_tex_ident(char32_t c) { // digits and underscores cannot appear in tex identifiers return is32_word(c) && !is32_digit(c) && c != '_'; } static void syntax_highlight_tex(SyntaxState *state, const char32_t *line, u32 line_len, SyntaxCharType *char_types) { bool dollar = (*state & SYNTAX_STATE_TEX_DOLLAR) != 0; bool dollardollar = (*state & SYNTAX_STATE_TEX_DOLLARDOLLAR) != 0; bool verbatim = (*state & SYNTAX_STATE_TEX_VERBATIM) != 0; for (u32 i = 0; i < line_len; ++i) { char32_t c = line[i]; bool has_1_char = i + 1 < line_len; if (char_types) char_types[i] = dollar || dollardollar ? SYNTAX_MATH : SYNTAX_NORMAL; switch (c) { case '\\': if (has_1_char) { if (is32_graph(line[i+1])) { if (is_tex_ident(line[i+1])) { // command, e.g. 
\begin String32 command_str = { .str = (char32_t *)line + i+1, .len = line_len - (i+1), }; bool new_verbatim = false; if (!dollar && !dollardollar) { if (!verbatim && str32_has_ascii_prefix(command_str, "begin{verbatim}")) { new_verbatim = true; } else if (verbatim && str32_has_ascii_prefix(command_str, "end{verbatim}")) { verbatim = false; } } if (!verbatim) { if (char_types) char_types[i] = SYNTAX_KEYWORD; for (++i; i < line_len; ++i) { if (is_tex_ident(line[i])) { if (char_types) char_types[i] = SYNTAX_KEYWORD; } else { --i; break; } } verbatim = new_verbatim; } } else if (!verbatim) { // something like \\, \%, etc. if (char_types) char_types[i] = SYNTAX_KEYWORD; ++i; if (char_types) char_types[i] = SYNTAX_KEYWORD; } } } break; case '%': // comment if (!verbatim) { for (; i < line_len; ++i) { if (char_types) char_types[i] = SYNTAX_COMMENT; } } break; case '&': // table/matrix/etc. separator if (char_types && !verbatim) char_types[i] = SYNTAX_BUILTIN; break; case '$': if (!verbatim) { if (!dollar && has_1_char && line[i+1] == '$') { // $$ if (dollardollar) { if (char_types) char_types[i] = SYNTAX_MATH; ++i; if (char_types) char_types[i] = SYNTAX_MATH; dollardollar = false; } else { if (char_types) char_types[i] = SYNTAX_MATH; dollardollar = true; } } else if (!dollardollar) { // single $ if (dollar) { dollar = false; } else { dollar = true; if (char_types) char_types[i] = SYNTAX_MATH; } } } break; } } *state = (SyntaxState)( (dollar * SYNTAX_STATE_TEX_DOLLAR) | (dollardollar * SYNTAX_STATE_TEX_DOLLARDOLLAR) | (verbatim * SYNTAX_STATE_TEX_VERBATIM) ); } static void syntax_highlight_markdown(SyntaxState *state, const char32_t *line, u32 line_len, SyntaxCharType *char_types) { bool multiline_code = (*state & SYNTAX_STATE_MARKDOWN_CODE) != 0; *state = (multiline_code * SYNTAX_STATE_MARKDOWN_CODE); if (line_len >= 3 && line[0] == '`' && line[1] == '`' && line[2] == '`') { if (multiline_code) { // end of multi-line code *state = 0; } else { // start of multi-line code 
multiline_code = true; *state = SYNTAX_STATE_MARKDOWN_CODE; } } if (!char_types) { return; } if (multiline_code) { static_assert_if_possible(sizeof *char_types == 1) // NOTE: memset is used extensively in this file this way memset(char_types, SYNTAX_CODE, line_len); return; } bool start_of_line = true; // is this the start of the line (not counting whitespace) int backslashes = 0; const char *format_ending = NULL; // "**" if we are inside **bold**, etc. for (u32 i = 0; i < line_len; ++i) { char32_t c = line[i]; bool next_sol = start_of_line && is32_space(c); bool has_1_char = i+1 < line_len; bool next_is_space = has_1_char && is32_space(line[i+1]); char_types[i] = SYNTAX_NORMAL; if (format_ending) { if (streq(format_ending, "`")) char_types[i] = SYNTAX_CODE; else char_types[i] = SYNTAX_STRING; } String32 remains = { .str = (char32_t *)line + i, .len = line_len - i }; if (!format_ending && str32_has_ascii_prefix(remains, "http")) { if (str32_has_ascii_prefix(remains, "http://") || str32_has_ascii_prefix(remains, "https://")) { // a link! for (; i < line_len; ++i) { if (is32_space(line[i])) break; char_types[i] = SYNTAX_LINK; } if (line[i-1] < 128 && strchr(".!,", (char)line[i-1])) { // punctuation after URLs char_types[i-1] = SYNTAX_NORMAL; } goto bottom; } } switch (c) { case '#': if (start_of_line) { memset(char_types + i, SYNTAX_STRING, line_len - i); i = line_len; } break; case '*': if (start_of_line && next_is_space) { // bullet list item char_types[i] = SYNTAX_BUILTIN; } FALLTHROUGH case '_': if (backslashes % 2 == 1) { // \* or \_ } else if (has_1_char && line[i+1] == c) { // **bold** or __bold__ const char *end = c == '*' ? "**" : "__"; if (format_ending) { if (streq(format_ending, end)) { char_types[i++] = SYNTAX_STRING; char_types[i] = SYNTAX_STRING; format_ending = NULL; } } else if (!next_is_space) { char_types[i++] = SYNTAX_STRING; char_types[i] = SYNTAX_STRING; format_ending = end; } } else { // *italics* or _italics_ const char *end = c == '*' ? 
"*" : "_"; if (format_ending) { if (streq(format_ending, end)) format_ending = NULL; } else if (!next_is_space) { char_types[i] = SYNTAX_STRING; format_ending = end; } } break; case '`': if (backslashes % 2 == 1) { // \` } else if (format_ending) { if (streq(format_ending, "`")) format_ending = NULL; } else { char_types[i] = SYNTAX_CODE; format_ending = "`"; } break; case '-': case '>': if (start_of_line && next_is_space) { // list item/blockquote char_types[i] = SYNTAX_BUILTIN; } break; case ANY_DIGIT: if (start_of_line) { size_t spn = str32_ascii_spn(remains, "0123456789"); size_t end = i + spn; if (end < line_len && line[end] == '.') { // numbered list item for (; i <= end; ++i) { char_types[i] = SYNTAX_BUILTIN; } } } break; case '[': { if (backslashes % 2 == 0) { // [URLS](like-this.com) u32 j; for (j = i+1; j < line_len; ++j) { if (line[j] == ']' && backslashes % 2 == 0) break; if (line[j] == '\\') ++backslashes; else backslashes = 0; } backslashes = 0; u32 closing_bracket = j; if (closing_bracket+2 < line_len && line[closing_bracket+1] == '(') { for (j = closing_bracket+2; j < line_len; ++j) { if (line[j] == ')' && backslashes % 2 == 0) break; if (line[j] == '\\') ++backslashes; else backslashes = 0; } u32 closing_parenthesis = j; if (closing_parenthesis < line_len) { // hooray! if (i > 0 && line[i-1] == '!') --i; // images are links, but with ! before them memset(&char_types[i], SYNTAX_LINK, closing_parenthesis+1 - i); i = closing_parenthesis; } backslashes = 0; } } } break; } bottom: if (i >= line_len) break; if (line[i] != '\\') backslashes = 0; else ++backslashes; start_of_line = next_sol; } } static bool is_html_tag_char(char32_t c) { return c == '<' || c == '/' || c == '!' 
|| c == ':' || is32_alnum(c); } // highlights XML and HTML static void syntax_highlight_xml(SyntaxState *state, const char32_t *line, u32 line_len, SyntaxCharType *char_types, Language lang) { bool comment = (*state & SYNTAX_STATE_HTML_COMMENT) != 0; bool in_sgl_string = false; // 'string' bool in_dbl_string = false; // "string" int backslashes = 0; for (u32 i = 0; i < line_len; ++i) { String32 remains = { .str = (char32_t *)line + i, .len = line_len - i }; bool has_1_char = i + 1 < line_len; if (comment) { if (str32_has_ascii_prefix(remains, "-->")) { if (char_types) memset(&char_types[i], SYNTAX_COMMENT, 3); i += 2; // (don't worry, comments can't nest in HTML) comment = false; } else { if (char_types) char_types[i] = SYNTAX_COMMENT; } } else if (!in_sgl_string && !in_dbl_string && str32_has_ascii_prefix(remains, "