From da61efabb1b28c5500824a560c960a720a628de0 Mon Sep 17 00:00:00 2001
From: Leo Tenenbaum
Date: Mon, 19 Apr 2021 22:51:33 -0400
Subject: markdown highlighting

---
 README.md  |   4 +-
 base.h     |   7 +++
 main.c     |   1 +
 string32.c |  19 ++++++
 syntax.c   | 200 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 ted.cfg    |   1 +
 ted.h      |   8 +++
 7 files changed, 238 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 2ef6bea..04eb29b 100644
--- a/README.md
+++ b/README.md
@@ -2,8 +2,8 @@
 
 A text editor.
 
-**ted is still very new. There is no nice installer yet (if you want ted, you'll have to build it from source).
-I'll release installers after testing it a bit more to try to find any remaining bugs.**
+**ted is still very new. There is no nice installer yet (if you want ted, you'll have to build it from source).**
+**I'll release installers after testing it a bit more to try to find any remaining bugs.**
 
 
 
diff --git a/base.h b/base.h
index 94ff2bf..9e39f93 100644
--- a/base.h
+++ b/base.h
@@ -9,6 +9,13 @@
 #define _GNU_SOURCE
 #endif
 
+// FALLTHROUGH marks an intentional fall-through between switch cases; it expands to nothing where the attribute is unavailable.
+#if __GNUC__
+#define FALLTHROUGH __attribute__((fallthrough));
+#else
+#define FALLTHROUGH
+#endif
+
 #if _WIN32
 #include
 #include
diff --git a/main.c b/main.c
index 8cf3997..64fc96d 100644
--- a/main.c
+++ b/main.c
@@ -1,3 +1,4 @@
+// HTML highlighting
 #include "base.h"
 no_warn_start
 #if _WIN32
diff --git a/string32.c b/string32.c
index 9b88f0e..1b4f1ac 100644
--- a/string32.c
+++ b/string32.c
@@ -148,6 +148,25 @@ size_t str32_remove_all_instances_of_char(String32 *s, char32_t c) {
 	return ndeleted;
 }
 
+// returns the length of the longest prefix of `s` containing only
+// ASCII characters in the C-string `charset`.
+size_t str32_ascii_spn(String32 s, char const *charset) {
+	for (u32 i = 0; i < s.len; ++i) {
+		if (s.str[i] >= 128)
+			return i; // non-ASCII character in s, so that can't be in charset.
+		bool found = false;
+		for (char const *p = charset; *p; ++p) {
+			assert((char32_t)*p < 128);
+			if ((char32_t)*p == s.str[i]) {
+				found = true;
+				break;
+			}
+		}
+		if (!found) return i;
+	}
+	return s.len;
+}
+
 bool is32_space(char32_t c) {
 	return c <= WINT_MAX && iswspace((wint_t)c);
 }
diff --git a/syntax.c b/syntax.c
index fdb2d6a..0a4805d 100644
--- a/syntax.c
+++ b/syntax.c
@@ -22,6 +22,7 @@ char const *language_comment_start(Language l) {
 	case LANG_PYTHON: return "# ";
 	case LANG_TEX: return "% ";
 	case LANG_NONE:
+	case LANG_MARKDOWN:
 	case LANG_COUNT:
 		break;
 	}
@@ -657,6 +658,202 @@ static void syntax_highlight_tex(SyntaxState *state, char32_t *line, u32 line_le
 	);
 }
 
+// Highlights one line of markdown, writing a SyntaxCharType for each character to `char_types`.
+// `*state` carries whether we are inside a ``` code section over to the next line;
+// if `char_types` is NULL, only `*state` is updated.
+static void syntax_highlight_markdown(SyntaxState *state, char32_t *line, u32 line_len, SyntaxCharType *char_types) {
+	bool multiline_code = (*state & SYNTAX_STATE_MARKDOWN_CODE) != 0;
+
+	*state = (multiline_code * SYNTAX_STATE_MARKDOWN_CODE);
+
+	if (line_len >= 3 && line[0] == '`' && line[1] == '`' && line[2] == '`') {
+		if (multiline_code) {
+			// end of multi-line code
+			*state = 0;
+		} else {
+			// start of multi-line code
+			multiline_code = true;
+			*state = SYNTAX_STATE_MARKDOWN_CODE;
+		}
+	}
+
+	if (!char_types) {
+		return;
+	}
+
+	if (multiline_code) {
+		static_assert_if_possible(sizeof *char_types == 1)
+		memset(char_types, SYNTAX_CODE, line_len);
+		return;
+	}
+
+	bool start_of_line = true; // is this the start of the line (not counting whitespace)
+	int backslashes = 0;
+	char const *format_ending = NULL; // "**" if we are inside **bold**, etc.
+
+	for (u32 i = 0; i < line_len; ++i) {
+		char32_t c = line[i];
+		bool next_sol = start_of_line && is32_space(c);
+		bool has_1_char = i+1 < line_len;
+		bool next_is_space = has_1_char && is32_space(line[i+1]);
+
+		char_types[i] = SYNTAX_NORMAL;
+		if (format_ending) {
+			if (streq(format_ending, "`"))
+				char_types[i] = SYNTAX_CODE;
+			else
+				char_types[i] = SYNTAX_STRING;
+		}
+
+		String32 remains = {
+			.str = line + i,
+			.len = line_len - i
+		};
+		if (!format_ending && str32_has_ascii_prefix(remains, "http")) {
+			if (str32_has_ascii_prefix(remains, "http://")
+				|| str32_has_ascii_prefix(remains, "https://")) {
+				// a link!
+				for (; i < line_len; ++i) {
+					if (is32_space(line[i]))
+						break;
+					char_types[i] = SYNTAX_LINK;
+				}
+				if (line[i-1] < 128 && strchr(".!,", (char)line[i-1])) {
+					// punctuation after URLs
+					char_types[i-1] = SYNTAX_NORMAL;
+				}
+				if (i < line_len) {
+					// the loop's ++i will skip over this whitespace character, so set its type here
+					char_types[i] = SYNTAX_NORMAL;
+				}
+				goto bottom;
+			}
+		}
+
+		switch (c) {
+		case '#':
+			if (start_of_line) {
+				memset(char_types + i, SYNTAX_STRING, line_len - i);
+				i = line_len;
+			}
+			break;
+		case '*':
+			if (start_of_line && next_is_space) {
+				// bullet list item
+				char_types[i] = SYNTAX_BUILTIN;
+			}
+			FALLTHROUGH
+		case '_':
+			if (backslashes % 2 == 1) {
+				// \* or \_
+			} else if (has_1_char && line[i+1] == c) {
+				// **bold** or __bold__
+				char const *end = c == '*' ? "**" : "__";
+				if (format_ending) {
+					if (streq(format_ending, end)) {
+						char_types[i++] = SYNTAX_STRING;
+						char_types[i] = SYNTAX_STRING;
+						format_ending = NULL;
+					}
+				} else if (!next_is_space) {
+					char_types[i++] = SYNTAX_STRING;
+					char_types[i] = SYNTAX_STRING;
+					format_ending = end;
+				}
+			} else {
+				// *italics* or _italics_
+				char const *end = c == '*' ? "*" : "_";
+				if (format_ending) {
+					if (streq(format_ending, end))
+						format_ending = NULL;
+				} else if (!next_is_space) {
+					char_types[i] = SYNTAX_STRING;
+					format_ending = end;
+				}
+			}
+			break;
+		case '`':
+			if (backslashes % 2 == 1) {
+				// \`
+			} else if (format_ending) {
+				if (streq(format_ending, "`"))
+					format_ending = NULL;
+			} else {
+				char_types[i] = SYNTAX_CODE;
+				format_ending = "`";
+			}
+			break;
+		case '-':
+		case '>':
+			if (start_of_line && next_is_space) {
+				// list item/blockquote
+				char_types[i] = SYNTAX_BUILTIN;
+			}
+			break;
+		case ANY_DIGIT:
+			if (start_of_line) {
+				size_t spn = str32_ascii_spn(remains, "0123456789");
+				size_t end = i + spn;
+				if (end < line_len && line[end] == '.') {
+					// numbered list item
+					for (u32 j = i; j <= end; ++j) {
+						char_types[j] = SYNTAX_BUILTIN;
+					}
+					// continue from the '.', so that the character after it isn't skipped by the loop's ++i
+					i = (u32)end;
+				}
+			}
+			break;
+		case '[': {
+			if (backslashes % 2 == 0) {
+				// [URLS](like-this.com)
+				u32 j;
+				for (j = i+1; j < line_len; ++j) {
+					if (line[j] == ']' && backslashes % 2 == 0)
+						break;
+					if (line[j] == '\\')
+						++backslashes;
+					else
+						backslashes = 0;
+				}
+				backslashes = 0;
+				u32 closing_bracket = j;
+				if (closing_bracket+2 < line_len && line[closing_bracket+1] == '(') {
+					for (j = closing_bracket+2; j < line_len; ++j) {
+						if (line[j] == ')' && backslashes % 2 == 0)
+							break;
+						if (line[j] == '\\')
+							++backslashes;
+						else
+							backslashes = 0;
+					}
+					u32 closing_parenthesis = j;
+					if (closing_parenthesis < line_len) {
+						// hooray!
+						if (i > 0 && line[i-1] == '!')
+							--i; // images are links, but with ! before them
+						memset(&char_types[i], SYNTAX_LINK, closing_parenthesis+1 - i);
+						i = closing_parenthesis;
+					}
+					backslashes = 0;
+
+				}
+			}
+		} break;
+		}
+		bottom:
+		if (i >= line_len) break;
+
+		if (line[i] != '\\')
+			backslashes = 0;
+		else
+			++backslashes;
+
+		start_of_line = next_sol;
+	}
+
+}
+
 // This is the main syntax highlighting function. It will determine which colors to use for each character.
 // Rather than returning colors, it returns a character type (e.g. comment) which can be converted to a color.
 // To highlight multiple lines, start out with a zeroed SyntaxState, and pass a pointer to it each time.
@@ -682,6 +879,9 @@ void syntax_highlight(SyntaxState *state, Language lang, char32_t *line, u32 lin
 	case LANG_TEX:
 		syntax_highlight_tex(state, line, line_len, char_types);
 		break;
+	case LANG_MARKDOWN:
+		syntax_highlight_markdown(state, line, line_len, char_types);
+		break;
 	case LANG_COUNT: assert(0); break;
 	}
 }
diff --git a/ted.cfg b/ted.cfg
index 1e01cc3..1ed74cc 100644
--- a/ted.cfg
+++ b/ted.cfg
@@ -208,3 +208,4 @@ C++ = .cpp, .hpp, .C, .H, .cxx, .hxx, .cc, .hh
 Rust = .rs
 Python = .py
 Tex = .tex
+Markdown = .md
diff --git a/ted.h b/ted.h
index 66648f5..c05f8d3 100644
--- a/ted.h
+++ b/ted.h
@@ -33,6 +33,10 @@ enum {
 	SYNTAX_STATE_TEX_VERBATIM = 0x04u, // inside \begin{verbatim} ... \end{verbatim}
 };
 
+enum {
+	SYNTAX_STATE_MARKDOWN_CODE = 0x01u, // inside ``` ``` code section
+};
+
 typedef u8 SyntaxState;
 
 ENUM_U16 {
@@ -42,6 +46,7 @@ ENUM_U16 {
 	LANG_RUST,
 	LANG_PYTHON,
 	LANG_TEX,
+	LANG_MARKDOWN,
 	LANG_COUNT
 } ENUM_U16_END(Language);
 
@@ -57,6 +62,7 @@ static LanguageName const language_names[] = {
 	{LANG_RUST, "Rust"},
 	{LANG_PYTHON, "Python"},
 	{LANG_TEX, "Tex"},
+	{LANG_MARKDOWN, "Markdown"},
 };
 
 static_assert_if_possible(arr_count(language_names) == LANG_COUNT)
@@ -73,6 +79,8 @@ ENUM_U8 {
 } ENUM_U8_END(SyntaxCharType);
 
 #define SYNTAX_MATH SYNTAX_STRING // for tex
+#define SYNTAX_CODE SYNTAX_PREPROCESSOR // for markdown
+#define SYNTAX_LINK SYNTAX_CONSTANT // for markdown
 
 typedef struct {
 	float cursor_blink_time_on, cursor_blink_time_off;
-- 
cgit v1.2.3