typedef struct { bool multi_line_comment:1; // are we in a multi-line comment? (delineated by /* */) bool continued_single_line_comment:1; // if you add a \ to the end of a single-line comment, it is continued to the next line. bool continued_preprocessor:1; // similar to above bool continued_string:1; } SyntaxStateC; typedef union { SyntaxStateC c; } SyntaxState; ENUM_U16 { LANG_C } ENUM_U16_END(Language); ENUM_U8 { SYNTAX_NORMAL, SYNTAX_KEYWORD, SYNTAX_COMMENT, SYNTAX_PREPROCESSOR, SYNTAX_STRING, SYNTAX_CHARACTER, SYNTAX_NUMBER } ENUM_U8_END(SyntaxCharType); // NOTE: returns the color setting, not the color ColorSetting syntax_char_type_to_color(SyntaxCharType t) { switch (t) { case SYNTAX_NORMAL: return COLOR_TEXT; case SYNTAX_KEYWORD: return COLOR_KEYWORD; case SYNTAX_COMMENT: return COLOR_COMMENT; case SYNTAX_PREPROCESSOR: return COLOR_PREPROCESSOR; case SYNTAX_STRING: return COLOR_STRING; case SYNTAX_CHARACTER: return COLOR_CHARACTER; case SYNTAX_NUMBER: return COLOR_NUMBER; } } static void syntax_highlight_c(SyntaxStateC *state, char32_t *line, u32 line_len, SyntaxCharType *char_types) { (void)state; bool in_preprocessor = state->continued_preprocessor; bool in_string = state->continued_string; bool in_single_line_comment = state->continued_single_line_comment; // this kind of comment :) bool in_multi_line_comment = state->multi_line_comment; bool in_char = false; bool in_number = false; int backslashes = 0; for (u32 i = 0; i < line_len; ++i) { SyntaxCharType type = SYNTAX_NORMAL; // necessary for the final " of a string to be highlighted bool in_string_now = in_string; bool in_char_now = in_char; bool in_multi_line_comment_now = in_multi_line_comment; // are there 1/2 characters left in the line? bool has_1_char = i + 1 < line_len; bool dealt_with = false; switch (line[i]) { case '#': if (!in_single_line_comment && !in_multi_line_comment) in_preprocessor = true; break; case '\\': ++backslashes; break; case '/': if (!in_multi_line_comment && !in_single_line_comment && !in_string && has_1_char) { if (line[i + 1] == '/') in_single_line_comment = true; // // else if (line[i + 1] == '*') in_multi_line_comment = in_multi_line_comment_now = true; // /* } else if (in_multi_line_comment) { if (i && line[i - 1] == '*') { // */ in_multi_line_comment = false; } } break; case '"': if (in_string && backslashes % 2 == 0) in_string = false; else if (!in_multi_line_comment && !in_single_line_comment) in_string = in_string_now = true; break; case '\'': if (in_char && backslashes % 2 == 0) in_char = false; else if (!in_multi_line_comment && !in_single_line_comment) in_char = in_char_now = true; break; case '<': // preprocessor string, e.g. if (in_preprocessor) in_string = in_string_now = true; break; case '>': if (in_preprocessor && in_string) in_string = false; break; case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': // don't you wish C had case ranges... // a number! if (!in_single_line_comment && !in_multi_line_comment && !in_string) { in_number = true; if (i && line[i - 1] == '.') { // support .6, for example char_types[i - 1] = SYNTAX_NUMBER; } } break; default: { // split keywords by starting letter to speed this up static char const *const all_keywords[][10] = { ['a'] = {"auto"}, ['b'] = {"break", "bool"}, ['c'] = {"case", "char", "const", "continue", "char8_t", "char16_t", "char32_t"}, ['d'] = {"default", "do", "double"}, ['e'] = {"else", "enum", "extern"}, ['f'] = {"float", "for", "false"}, ['g'] = {"goto"}, ['i'] = {"if", "inline", "int", "int8_t", "int16_t", "int32_t", "int64_t"}, ['l'] = {"long"}, ['r'] = {"register", "restrict", "return"}, ['s'] = {"short", "signed", "sizeof", "static", "struct", "switch"}, ['t'] = {"typedef", "true"}, ['u'] = {"union", "unsigned", "uint8_t", "uint16_t", "uint32_t", "uint64_t"}, ['v'] = {"void", "volatile"}, ['w'] = {"while", "wchar_t", "wint_t"}, ['_'] = {"_Alignas", "_Alignof", "_Atomic", "_Bool", "_Complex", "_Generic", "_Imaginary", "_Noreturn", "_Static_assert", "_Thread_local"}, }; // keywords don't matter for advancing the state if (char_types && !in_single_line_comment && !in_multi_line_comment && !in_string && !in_preprocessor) { char const *const *keywords = line[i] < arr_count(all_keywords) ? all_keywords[line[i]] : NULL; if (keywords) { for (size_t k = 0; keywords[k]; ++k) { bool matches = true; char const *keyword = keywords[k]; size_t keyword_len = strlen(keyword); if (i + keyword_len <= line_len) { // make sure we don't catch "print" as containing the keyword "int" bool separated = (i == 0 || !is32_ident(line[i-1])) && (i + keyword_len == line_len || !is32_ident(line[i + keyword_len])); if (separated) { char32_t *p = &line[i]; // check if `p` starts with `keyword` for (char const *q = keyword; *q; ++p, ++q) { if (*p != (char32_t)*q) { matches = false; break; } } if (matches) { // it's a keyword // let's highlight all of it now for (size_t c = 0; keyword[c]; ++c) { char_types[i++] = SYNTAX_KEYWORD; } --i; // we'll increment i from the for loop dealt_with = true; break; } } } } } } } break; } if (line[i] != '\\') backslashes = 0; if (in_number && !(is32_digit(line[i]) || line[i] == '.' || line[i] == 'e' || (i && line[i-1] == 'e' && (line[i] == '+' || line[i] == '-')))) { in_number = false; } if (char_types && !dealt_with) { if (in_single_line_comment || in_multi_line_comment_now) type = SYNTAX_COMMENT; else if (in_string_now) type = SYNTAX_STRING; else if (in_char_now) type = SYNTAX_CHARACTER; else if (in_preprocessor) type = SYNTAX_PREPROCESSOR; else if (in_number) type = SYNTAX_NUMBER; char_types[i] = type; } } state->continued_single_line_comment = backslashes && in_single_line_comment; state->continued_preprocessor = backslashes && in_preprocessor; state->continued_string = backslashes && in_string; state->multi_line_comment = in_multi_line_comment; } // This is the main syntax highlighting function. It will determine which colors to use for each character. // Rather than returning colors, it returns a character type (e.g. comment) which can be converted to a color. // To highlight multiple lines, start out with a zeroed SyntaxState, and pass a pointer to it each time. // You can set char_types to NULL if you just want to advance the state, and don't care about the character types. void syntax_highlight(SyntaxState *state, Language lang, char32_t *line, u32 line_len, SyntaxCharType *char_types) { switch (lang) { case LANG_C: syntax_highlight_c(&state->c, line, line_len, char_types); break; } }