From da61efabb1b28c5500824a560c960a720a628de0 Mon Sep 17 00:00:00 2001
From: Leo Tenenbaum <pommicket@gmail.com>
Date: Mon, 19 Apr 2021 22:51:33 -0400
Subject: markdown highlighting

---
 README.md  |   4 +-
 base.h     |   4 ++
 main.c     |   1 +
 string32.c |  19 ++++++
 syntax.c   | 191 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 ted.cfg    |   1 +
 ted.h      |   8 +++
 7 files changed, 226 insertions(+), 2 deletions(-)
diff --git a/README.md b/README.md
index 2ef6bea..04eb29b 100644
--- a/README.md
+++ b/README.md
@@ -2,8 +2,8 @@
 
 A text editor.
 
-**ted is still very new. There is no nice installer yet (if you want ted, you'll have to build it from source).
-I'll release installers after testing it a bit more to try to find any remaining bugs.**
+**ted is still very new. There is no nice installer yet (if you want ted, you'll have to build it from source).**
+**I'll release installers after testing it a bit more to try to find any remaining bugs.**
 
 <img src="ted.png">
 
diff --git a/base.h b/base.h
index 94ff2bf..9e39f93 100644
--- a/base.h
+++ b/base.h
@@ -9,6 +9,10 @@
 #define _GNU_SOURCE
 #endif
 
+#if __GNUC__ // GCC and compatible compilers
+#define FALLTHROUGH __attribute__((fallthrough)); // put before a case label to mark deliberate fall-through; the ';' is built in
+#endif // NOTE(review): no definition for other compilers is visible in this hunk - confirm one exists elsewhere
+
 #if _WIN32
 #include <windows.h>
 #include <shlobj.h>
diff --git a/main.c b/main.c
index 8cf3997..64fc96d 100644
--- a/main.c
+++ b/main.c
@@ -1,3 +1,4 @@
+// HTML highlighting
 #include "base.h"
 no_warn_start
 #if _WIN32
diff --git a/string32.c b/string32.c
index 9b88f0e..1b4f1ac 100644
--- a/string32.c
+++ b/string32.c
@@ -148,6 +148,25 @@ size_t str32_remove_all_instances_of_char(String32 *s, char32_t c) {
 	return ndeleted;
 }
 
+// Like strspn, but for a String32: returns the length of the longest prefix of `s`
+// made up only of ASCII characters which appear in the C-string `charset`.
+size_t str32_ascii_spn(String32 s, char const *charset) {
+	size_t pos = 0; // size_t, like the return type, so the index can't wrap around before reaching s.len
+	for (; pos < s.len; ++pos) {
+		if (s.str[pos] >= 128)
+			break; // non-ASCII character in s, so that can't be in charset.
+		char const *member = charset;
+		for (; *member; ++member) {
+			assert((char32_t)*member < 128); // charset itself must be pure ASCII
+			if ((char32_t)*member == s.str[pos])
+				break;
+		}
+		if (!*member)
+			break; // got to the end of charset without finding s.str[pos]
+	}
+	return pos;
+}
+
 bool is32_space(char32_t c) {
 	return c <= WINT_MAX && iswspace((wint_t)c);
 }
diff --git a/syntax.c b/syntax.c
index fdb2d6a..0a4805d 100644
--- a/syntax.c
+++ b/syntax.c
@@ -22,6 +22,7 @@ char const *language_comment_start(Language l) {
 	case LANG_PYTHON: return "# ";
 	case LANG_TEX: return "% ";
 	case LANG_NONE:
+	case LANG_MARKDOWN:
 	case LANG_COUNT:
 		break;
 	}
@@ -657,6 +658,193 @@ static void syntax_highlight_tex(SyntaxState *state, char32_t *line, u32 line_le
 	);
 }
 
+// Highlights one line of Markdown. *state remembers, from one line to the next, whether we are inside a
+// ``` code block, and is always updated; char_types (if not NULL) receives one type per character of line.
+static void syntax_highlight_markdown(SyntaxState *state, char32_t *line, u32 line_len, SyntaxCharType *char_types) {
+	bool multiline_code = (*state & SYNTAX_STATE_MARKDOWN_CODE) != 0;
+	*state = (multiline_code * SYNTAX_STATE_MARKDOWN_CODE); // keep only the code-block flag
+	
+	if (line_len >= 3 && line[0] == '`' && line[1] == '`' && line[2] == '`') {
+		if (multiline_code) {
+			// end of multi-line code
+			*state = 0;
+		} else {
+			// start of multi-line code
+			multiline_code = true;
+			*state = SYNTAX_STATE_MARKDOWN_CODE;
+		}
+	}
+
+	if (!char_types) {
+		return; // nothing to fill in; *state has already been updated
+	}
+
+	if (multiline_code) {
+		static_assert_if_possible(sizeof *char_types == 1)
+		memset(char_types, SYNTAX_CODE, line_len); // the whole line is code (this includes the ``` lines themselves)
+		return;
+	}
+	
+	bool start_of_line = true; // is this the start of the line (not counting whitespace)
+	int backslashes = 0; // length of the run of backslashes just before the current character (odd => it's escaped)
+	char const *format_ending = NULL; // "**" if we are inside **bold**, etc.
+	
+	for (u32 i = 0; i < line_len; ++i) {
+		char32_t c = line[i];
+		bool next_sol = start_of_line && is32_space(c);
+		bool has_1_char = i+1 < line_len;
+		bool next_is_space = has_1_char && is32_space(line[i+1]);
+		
+		char_types[i] = SYNTAX_NORMAL;
+		if (format_ending) {
+			if (streq(format_ending, "`"))
+				char_types[i] = SYNTAX_CODE;
+			else
+				char_types[i] = SYNTAX_STRING;
+		}
+		
+		String32 remains = {
+			.str = line + i,
+			.len = line_len - i
+		};
+		if (!format_ending && str32_has_ascii_prefix(remains, "http")) {
+			if (str32_has_ascii_prefix(remains, "http://")
+				|| str32_has_ascii_prefix(remains, "https://")) {
+				// a link!
+				for (; i < line_len; ++i) {
+					if (is32_space(line[i]))
+						break;
+					char_types[i] = SYNTAX_LINK;
+				}
+				--i; // back onto the last character of the link, so that the main loop's ++i doesn't skip what comes after it
+				if (line[i] < 128 && strchr(".!,", (char)line[i])) {
+					char_types[i] = SYNTAX_NORMAL; // punctuation after URLs
+				}
+				goto bottom;
+			}
+		}
+		
+		switch (c) {
+		case '#':
+			if (start_of_line) {
+				memset(char_types + i, SYNTAX_STRING, line_len - i);
+				i = line_len;
+			}
+			break;
+		case '*':
+			if (start_of_line && next_is_space) {
+				// bullet list item
+				char_types[i] = SYNTAX_BUILTIN;
+			}
+			FALLTHROUGH
+		case '_':
+			if (backslashes % 2 == 1) {
+				// \* or \_
+			} else if (has_1_char && line[i+1] == c) {
+				// **bold** or __bold__
+				char const *end = c == '*' ? "**" : "__";
+				if (format_ending) {
+					if (streq(format_ending, end)) {
+						char_types[i++] = SYNTAX_STRING;
+						char_types[i] = SYNTAX_STRING;
+						format_ending = NULL;
+					}
+				} else if (!next_is_space) {
+					char_types[i++] = SYNTAX_STRING;
+					char_types[i] = SYNTAX_STRING;
+					format_ending = end;
+				}
+			} else {
+				// *italics* or _italics_
+				char const *end = c == '*' ? "*" : "_";
+				if (format_ending) {
+					if (streq(format_ending, end))
+						format_ending = NULL;
+				} else if (!next_is_space) {
+					char_types[i] = SYNTAX_STRING;
+					format_ending = end;
+				}
+			}
+			break;
+		case '`':
+			if (backslashes % 2 == 1) {
+				// \`
+			} else if (format_ending) {
+				if (streq(format_ending, "`"))
+					format_ending = NULL;
+			} else {
+				char_types[i] = SYNTAX_CODE;
+				format_ending = "`";
+			}
+			break;
+		case '-':
+		case '>':
+			if (start_of_line && next_is_space) {
+				// list item/blockquote
+				char_types[i] = SYNTAX_BUILTIN;
+			}
+			break;
+		case ANY_DIGIT:
+			if (start_of_line) {
+				size_t spn = str32_ascii_spn(remains, "0123456789");
+				size_t end = i + spn;
+				if (end < line_len && line[end] == '.') {
+					// numbered list item
+					for (size_t k = i; k <= end; ++k)
+						char_types[k] = SYNTAX_BUILTIN;
+					i = (u32)end; // stay on the '.', so that the main loop's ++i doesn't skip the character after it
+				}
+			}
+			break;
+		case '[': {
+			if (backslashes % 2 == 0) {
+				// [URLS](like-this.com)
+				u32 j;
+				for (j = i+1; j < line_len; ++j) {
+					if (line[j] == ']' && backslashes % 2 == 0)
+						break;
+					if (line[j] == '\\')
+						++backslashes;
+					else
+						backslashes = 0;
+				}
+				backslashes = 0;
+				u32 closing_bracket = j;
+				if (closing_bracket+2 < line_len && line[closing_bracket+1] == '(') {
+					for (j = closing_bracket+2; j < line_len; ++j) {
+						if (line[j] == ')' && backslashes % 2 == 0)
+							break;
+						if (line[j] == '\\')
+							++backslashes;
+						else
+							backslashes = 0;
+					}
+					u32 closing_parenthesis = j;
+					if (closing_parenthesis < line_len) {
+						// hooray!
+						if (i > 0 && line[i-1] == '!')
+							--i; // images are links, but with ! before them
+						memset(&char_types[i], SYNTAX_LINK, closing_parenthesis+1 - i);
+						i = closing_parenthesis;
+					}
+					backslashes = 0;
+					
+				}
+			}
+		} break;
+		}
+	bottom:
+		if (i >= line_len) break;
+		
+		if (line[i] != '\\')
+			backslashes = 0;
+		else
+			++backslashes;
+		
+		start_of_line = next_sol;
+	}
+}
+
 // This is the main syntax highlighting function. It will determine which colors to use for each character.
 // Rather than returning colors, it returns a character type (e.g. comment) which can be converted to a color.
 // To highlight multiple lines, start out with a zeroed SyntaxState, and pass a pointer to it each time.
@@ -682,6 +870,9 @@ void syntax_highlight(SyntaxState *state, Language lang, char32_t *line, u32 lin
 	case LANG_TEX:
 		syntax_highlight_tex(state, line, line_len, char_types);
 		break;
+	case LANG_MARKDOWN:
+		syntax_highlight_markdown(state, line, line_len, char_types);
+		break;
 	case LANG_COUNT: assert(0); break;
 	}
 }
diff --git a/ted.cfg b/ted.cfg
index 1e01cc3..1ed74cc 100644
--- a/ted.cfg
+++ b/ted.cfg
@@ -208,3 +208,4 @@ C++ = .cpp, .hpp, .C, .H, .cxx, .hxx, .cc, .hh
 Rust = .rs
 Python = .py
 Tex = .tex
+Markdown = .md 
diff --git a/ted.h b/ted.h
index 66648f5..c05f8d3 100644
--- a/ted.h
+++ b/ted.h
@@ -33,6 +33,10 @@ enum {
 	SYNTAX_STATE_TEX_VERBATIM = 0x04u, // inside \begin{verbatim} ... \end{verbatim}
 };
 
+enum {
+	SYNTAX_STATE_MARKDOWN_CODE = 0x01u, // set after an opening ``` line, cleared after the closing ``` line
+};
+
 typedef u8 SyntaxState;
 
 ENUM_U16 {
@@ -42,6 +46,7 @@ ENUM_U16 {
 	LANG_RUST,
 	LANG_PYTHON,
 	LANG_TEX,
+	LANG_MARKDOWN,
 	LANG_COUNT
 } ENUM_U16_END(Language);
 
@@ -57,6 +62,7 @@ static LanguageName const language_names[] = {
 	{LANG_RUST, "Rust"},
 	{LANG_PYTHON, "Python"},
 	{LANG_TEX, "Tex"},
+	{LANG_MARKDOWN, "Markdown"},
 };
 
 static_assert_if_possible(arr_count(language_names) == LANG_COUNT)
@@ -73,6 +79,8 @@ ENUM_U8 {
 } ENUM_U8_END(SyntaxCharType);
 
 #define SYNTAX_MATH SYNTAX_STRING // for tex
+#define SYNTAX_CODE SYNTAX_PREPROCESSOR // for markdown: `inline code` and ``` code blocks
+#define SYNTAX_LINK SYNTAX_CONSTANT // for markdown: bare http(s):// URLs and [text](url) links/images
 
 typedef struct {
 	float cursor_blink_time_on, cursor_blink_time_off;
-- 
cgit v1.2.3