From e8b7d01ff68fed675d1397f4556f159f0c32246d Mon Sep 17 00:00:00 2001 From: Leo Tenenbaum Date: Tue, 2 Feb 2021 14:36:55 -0500 Subject: rust syntax highlighting --- keywords.h | 24 ++++++ keywords.py | 18 +++++ string32.c | 2 +- syntax.c | 250 +++++++++++++++++++++++++++++++++++++++++++++++++++--------- ted.cfg | 3 +- ted.h | 3 + test.rs | 8 +- 7 files changed, 270 insertions(+), 38 deletions(-) diff --git a/keywords.h b/keywords.h index c563316..4f4b1e6 100644 --- a/keywords.h +++ b/keywords.h @@ -62,3 +62,27 @@ static char const *const *const syntax_all_keywords_cpp[] = { ['a'] = syntax_keywords_cpp_a, ['b'] = syntax_keywords_cpp_b, ['c'] = syntax_keywords_cpp_c, ['d'] = syntax_keywords_cpp_d, ['e'] = syntax_keywords_cpp_e, ['f'] = syntax_keywords_cpp_f, ['m'] = syntax_keywords_cpp_m, ['n'] = syntax_keywords_cpp_n, ['o'] = syntax_keywords_cpp_o, ['p'] = syntax_keywords_cpp_p, ['r'] = syntax_keywords_cpp_r, ['s'] = syntax_keywords_cpp_s, ['t'] = syntax_keywords_cpp_t, ['u'] = syntax_keywords_cpp_u, ['v'] = syntax_keywords_cpp_v, ['x'] = syntax_keywords_cpp_x }; +static char const *const syntax_keywords_rust_S[2] = {"Self"}; +static char const *const syntax_keywords_rust_a[9] = {"abstract","as","asm!","assert!","assert_eq!","assert_ne!","async","await"}; +static char const *const syntax_keywords_rust_b[4] = {"become","box","break"}; +static char const *const syntax_keywords_rust_c[9] = {"cfg!","column!","compile_error!","concat!","concat_idents!","const","continue","crate"}; +static char const *const syntax_keywords_rust_d[7] = {"dbg!","debug_assert!","debug_assert_eq!","debug_assert_ne!","do","dyn"}; +static char const *const syntax_keywords_rust_e[7] = {"else","enum","env!","eprint!","eprintln!","extern"}; +static char const *const syntax_keywords_rust_f[9] = {"false","file!","final","fn","for","format!","format_args!","format_args_nl!"}; +static char const *const syntax_keywords_rust_g[2] = {"global_asm!"}; +static char const *const syntax_keywords_rust_i[14] = {"if","impl","in","include!","include_bytes!","include_str!","is_aarch64_feature_detected!","is_arm_feature_detected!","is_mips64_feature_detected!","is_mips_feature_detected!","is_powerpc64_feature_detected!","is_powerpc_feature_detected!","is_x86_feature_detected!"}; +static char const *const syntax_keywords_rust_l[6] = {"let","line!","llvm_asm!","log_syntax!","loop"}; +static char const *const syntax_keywords_rust_m[8] = {"macro","match","matches!","mod","module_path!","move","mut"}; +static char const *const syntax_keywords_rust_o[3] = {"option_env!","override"}; +static char const *const syntax_keywords_rust_p[6] = {"panic!","print!","println!","priv","pub"}; +static char const *const syntax_keywords_rust_r[3] = {"ref","return"}; +static char const *const syntax_keywords_rust_s[6] = {"self","static","stringify!","struct","super"}; +static char const *const syntax_keywords_rust_t[10] = {"thread_local!","todo!","trace_macros!","trait","true","try","try!","type","typeof"}; +static char const *const syntax_keywords_rust_u[7] = {"unimplemented!","union","unreachable!","unsafe","unsized","use"}; +static char const *const syntax_keywords_rust_v[3] = {"vec!","virtual"}; +static char const *const syntax_keywords_rust_w[5] = {"where","while","write!","writeln!"}; +static char const *const syntax_keywords_rust_y[2] = {"yield"}; +static char const *const *const syntax_all_keywords_rust[] = { + ['S'] = syntax_keywords_rust_S, ['a'] = syntax_keywords_rust_a, ['b'] = syntax_keywords_rust_b, ['c'] = syntax_keywords_rust_c, ['d'] = syntax_keywords_rust_d, ['e'] = syntax_keywords_rust_e, ['f'] = syntax_keywords_rust_f, ['g'] = syntax_keywords_rust_g, ['i'] = syntax_keywords_rust_i, ['l'] = syntax_keywords_rust_l, ['m'] = syntax_keywords_rust_m, ['o'] = syntax_keywords_rust_o, ['p'] = syntax_keywords_rust_p, ['r'] = syntax_keywords_rust_r, ['s'] = syntax_keywords_rust_s, ['t'] = syntax_keywords_rust_t, ['u'] = syntax_keywords_rust_u, ['v'] = syntax_keywords_rust_v, ['w'] = syntax_keywords_rust_w, ['y'] = syntax_keywords_rust_y +}; + diff --git a/keywords.py b/keywords.py index b8c5223..49c68bf 100644 --- a/keywords.py +++ b/keywords.py @@ -135,8 +135,26 @@ keywords_cpp = [ 'xor', 'xor_eq', ] assert not set(keywords_c).intersection(keywords_cpp) + +keywords_rust = [ + "as", "break", "const", "continue", "crate", "else", "enum", "extern", "false", "fn", "for", + "if", "impl", "in", "let", "loop", "match", "mod", "move", "mut", "pub", "ref", "return", + "self", "Self", "static", "struct", "super", "trait", "true", "type", "unsafe", "use", + "where", "while", "async", "await", "dyn", "abstract", "become", "box", "do", "final", + "macro", "override", "priv", "typeof", "unsized", "virtual", "yield", "try", "union", + "asm!","concat_idents!","format_args_nl!","global_asm!","is_aarch64_feature_detected!", + "is_arm_feature_detected!","is_mips64_feature_detected!","is_mips_feature_detected!", + "is_powerpc64_feature_detected!","is_powerpc_feature_detected!","llvm_asm!","log_syntax!", + "trace_macros!","assert!","assert_eq!","assert_ne!","cfg!","column!","compile_error!", + "concat!","dbg!","debug_assert!","debug_assert_eq!","debug_assert_ne!","env!","eprint!", + "eprintln!","file!","format!","format_args!","include!","include_bytes!","include_str!", + "is_x86_feature_detected!","line!","matches!","module_path!","option_env!","panic!", + "print!","println!","stringify!","thread_local!","todo!","try!","unimplemented!", + "unreachable!","vec!","write!","writeln!", +] file = open('keywords.h', 'w') output_keywords(file, keywords_c, 'c') output_keywords(file, keywords_cpp, 'cpp') +output_keywords(file, keywords_rust, 'rust') file.close() diff --git a/string32.c b/string32.c index 387d15c..a6f446a 100644 --- a/string32.c +++ b/string32.c @@ -130,4 +130,4 @@ bool is32_digit(char32_t c) { // could this character appear in a C-style identifier? bool is32_ident(char32_t c) { return c <= WINT_MAX && (iswalnum((wint_t)c) || c == '_'); -} +} \ No newline at end of file diff --git a/syntax.c b/syntax.c index 4f3864c..d75412c 100644 --- a/syntax.c +++ b/syntax.c @@ -1,5 +1,9 @@ #include "keywords.h" +// all characters that can appear in a number +#define SYNTAX_DIGITS "0123456789.xXoObBlLuUiIabcdefABCDEF_" + + // returns the language this string is referring to, or LANG_NONE if it's invalid. Language language_from_str(char const *str) { for (int i = 0; i < LANG_COUNT; ++i) { @@ -40,6 +44,25 @@ static inline bool keyword_matches(char32_t *text, size_t len, char const *keywo } } +// does i continue the number literal from i-1 +static inline bool syntax_number_continues(char32_t *line, u32 line_len, u32 i) { + if (line[i] == '.' && ((i && line[i-1] == '.') || (i < line_len-1 && line[i+1] == '.'))) + return false; // can't have two .s in a row + return line[i] < CHAR_MAX && strchr(SYNTAX_DIGITS, (char)line[i]) + || (i && line[i-1] == 'e' && (line[i] == '+' || line[i] == '-')); +} + +// find how long this keyword would be (if this is a keyword) +static inline u32 syntax_keyword_len(Language lang, char32_t *line, u32 i, u32 line_len) { + u32 keyword_end; + for (keyword_end = i; + keyword_end < line_len + && (is32_ident(line[keyword_end]) + || (lang == LANG_RUST && line[keyword_end] == '!')) // for rust builtin macros + ; ++keyword_end); + return keyword_end - i; +} + static void syntax_highlight_c_cpp(SyntaxState *state_ptr, bool cpp, char32_t *line, u32 line_len, SyntaxCharType *char_types) { SyntaxState state = *state_ptr; bool in_preprocessor = (state & SYNTAX_STATE_CPP_PREPROCESSOR) != 0; @@ -53,10 +76,6 @@ static void syntax_highlight_c_cpp(SyntaxState *state_ptr, bool cpp, char32_t *l int backslashes = 0; for (u32 i = 0; i < line_len; ++i) { - // necessary for the final " of a string to be highlighted - bool in_string_now = in_string; - bool in_char_now = in_char; - bool in_multi_line_comment_now = in_multi_line_comment; // are there 1/2 characters left in the line? bool has_1_char = i + 1 < line_len; @@ -88,25 +107,39 @@ static void syntax_highlight_c_cpp(SyntaxState *state_ptr, bool cpp, char32_t *l if (line[i + 1] == '/') in_single_line_comment = true; // // else if (line[i + 1] == '*') - in_multi_line_comment = in_multi_line_comment_now = true; // /* + in_multi_line_comment = true; // /* } else if (in_multi_line_comment) { if (i && line[i - 1] == '*') { // */ in_multi_line_comment = false; + if (char_types) { + dealt_with = true; + char_types[i] = SYNTAX_COMMENT; + } } } break; case '"': - if (in_string && backslashes % 2 == 0) + if (in_string && backslashes % 2 == 0) { in_string = false; - else if (!in_multi_line_comment && !in_single_line_comment && !in_char) - in_string = in_string_now = true; + if (char_types) { + dealt_with = true; + char_types[i] = SYNTAX_STRING; + } + } else if (!in_multi_line_comment && !in_single_line_comment && !in_char) { + in_string = true; + } break; case '\'': - if (in_char && backslashes % 2 == 0) + if (in_char && backslashes % 2 == 0) { in_char = false; - else if (!in_multi_line_comment && !in_single_line_comment && !in_string) - in_char = in_char_now = true; + if (char_types) { + dealt_with = true; + char_types[i] = SYNTAX_CHARACTER; + } + } else if (!in_multi_line_comment && !in_single_line_comment && !in_string) { + in_char = true; + } break; case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': // don't you wish C had case ranges... // a number! @@ -127,7 +160,7 @@ static void syntax_highlight_c_cpp(SyntaxState *state_ptr, bool cpp, char32_t *l if ((i && is32_ident(line[i - 1])) || !is32_ident(c)) break; // can't be a keyword on its own. - if (c == 'R' && has_2_chars && line[i + 1] == '"' && line[i + 2] == '(') { + if (!in_single_line_comment && !in_multi_line_comment && !in_string && c == 'R' && has_2_chars && line[i + 1] == '"' && line[i + 2] == '(') { // raw string in_raw_string = true; raw_string_ending = false; @@ -136,24 +169,28 @@ static void syntax_highlight_c_cpp(SyntaxState *state_ptr, bool cpp, char32_t *l // keywords don't matter for advancing the state if (char_types && !in_single_line_comment && !in_multi_line_comment && !in_string && !in_preprocessor && !in_char) { - u32 keyword_end; - // find where this keyword would end (if this is a keyword) - for (keyword_end = i; keyword_end < line_len && is32_ident(line[keyword_end]); ++keyword_end); - - u32 keyword_len = keyword_end - i; + u32 keyword_len = syntax_keyword_len(cpp ? LANG_CPP : LANG_C, line, i, line_len); char const *const *keywords = c < arr_count(syntax_all_keywords_c) ? syntax_all_keywords_c[c] : NULL; char const *keyword = NULL; - if (keywords) - for (size_t k = 0; keywords[k]; ++k) - if (keyword_matches(&line[i], keyword_len, keywords[k])) + if (keywords) { + for (size_t k = 0; keywords[k]; ++k) { + if (keyword_matches(&line[i], keyword_len, keywords[k])) { keyword = keywords[k]; + break; + } + } + } if (cpp && !keyword) { // check C++'s keywords too! keywords = c < arr_count(syntax_all_keywords_cpp) ? syntax_all_keywords_cpp[c] : NULL; - if (keywords) - for (size_t k = 0; keywords[k]; ++k) - if (keyword_matches(&line[i], keyword_len, keywords[k])) + if (keywords) { + for (size_t k = 0; keywords[k]; ++k) { + if (keyword_matches(&line[i], keyword_len, keywords[k])) { keyword = keywords[k]; + break; + } + } + } } if (keyword) { @@ -177,20 +214,18 @@ static void syntax_highlight_c_cpp(SyntaxState *state_ptr, bool cpp, char32_t *l } } break; } - if (line[i] != '\\') backslashes = 0; - if (in_number && !(is32_digit(line[i]) || line[i] == '.' - || (line[i] < CHAR_MAX && strchr("xXoObBlLuUabcdefABCDEF", (char)line[i])) - || (i && line[i-1] == 'e' && (line[i] == '+' || line[i] == '-')))) { + if (c != '\\') backslashes = 0; + if (in_number && !syntax_number_continues(line, line_len, i)) { in_number = false; } if (char_types && !dealt_with) { SyntaxCharType type = SYNTAX_NORMAL; - if (in_single_line_comment || in_multi_line_comment_now) + if (in_single_line_comment || in_multi_line_comment) type = SYNTAX_COMMENT; - else if (in_string_now) + else if (in_string) type = SYNTAX_STRING; - else if (in_char_now) + else if (in_char) type = SYNTAX_CHARACTER; else if (in_number) type = SYNTAX_CONSTANT; @@ -210,24 +245,166 @@ static void syntax_highlight_c_cpp(SyntaxState *state_ptr, bool cpp, char32_t *l } static void syntax_highlight_rust(SyntaxState *state, char32_t *line, u32 line_len, SyntaxCharType *char_types) { - u8 comment_depth = (u8)((*state & SYNTAX_STATE_RUST_COMMENT_DEPTH_MASK) / SYNTAX_STATE_RUST_COMMENT_DEPTH_MUL); + u32 comment_depth = (((u32)*state & SYNTAX_STATE_RUST_COMMENT_DEPTH_MASK) / SYNTAX_STATE_RUST_COMMENT_DEPTH_MUL); + bool in_string = (*state & SYNTAX_STATE_RUST_STRING) != 0; + bool string_is_raw = (*state & SYNTAX_STATE_RUST_STRING_IS_RAW) != 0; + bool in_number = false; + uint backslashes = 0; + for (u32 i = 0; i < line_len; ++i) { char32_t c = line[i]; bool dealt_with = false; - switch (c) { + bool has_1_char = i + 1 < line_len; + bool has_2_chars = i + 2 < line_len; + switch (c) { + case '/': + if (!in_string) { + if (i && line[i-1] == '*') { + // */ + if (comment_depth) + --comment_depth; + if (char_types) { + char_types[i] = SYNTAX_COMMENT; + dealt_with = true; + } + } else if (has_1_char && line[i+1] == '*') { + // /* + ++comment_depth; + } else if (!comment_depth && has_1_char && line[i+1] == '/') { + // // + // just handle it all now + if (char_types) { + for (u32 j = i; j < line_len; ++j) + char_types[j] = SYNTAX_COMMENT; + } + i = line_len - 1; + dealt_with = true; + break; + } + } + break; + case '"': + if (!comment_depth) { + if (in_string) { + if (backslashes % 2 == 0) { + if (!string_is_raw || (has_1_char && line[i+1] == '#')) { + // end of string literal + in_string = false; + if (char_types) { + char_types[i] = SYNTAX_STRING; + dealt_with = true; + } + string_is_raw = false; + } + } + } else { + // start of string literal + in_string = true; + if (i && line[i-1] == '#') + string_is_raw = true; + } + } + break; + case '\'': { + if (!comment_depth && !in_string && has_2_chars) { + // figure out if this is a character or a lifetime + u32 char_end; + backslashes = line[i+1] == '\\'; + for (char_end = i + 2; char_end < line_len; ++char_end) { + if (line[char_end] == '\'' && backslashes % 2 == 0) { + break; + } + if (line[char_end] < CHAR_MAX + && line[char_end - 1] != '\\' + && !strchr("abcdefABCDEF0123456789", (char)line[char_end])) + break; + } + if (char_end < line_len && line[char_end] == '\'') { + // a character literal + if (char_types) + for (u32 j = i; j <= char_end; ++j) + char_types[j] = SYNTAX_CHARACTER; + dealt_with = true; + i = char_end; + } else { + // a lifetime or something else + } + } + } break; + case '\\': + ++backslashes; + break; + case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': // don't you wish C had case ranges... + // a number! + if (char_types && !comment_depth && !in_string && !in_number) { + in_number = true; + if (i && (is32_ident(line[i - 1]) + || (line[i-1] == '.' && !(i >= 2 && line[i-2] == '.'))) + ) { + // actually, this isn't a number. it's something like a*6* or u3*2*. + // also, don't highlight the 0 in tuple.0 + in_number = false; + } + } + break; + default: { + if ((i && is32_ident(line[i - 1])) || !is32_ident(c)) + break; // can't be a keyword on its own. + + if (char_types && !in_string && !comment_depth && !in_number) { + u32 keyword_len = syntax_keyword_len(LANG_RUST, line, i, line_len); + char const *keyword = NULL; + char const *const *keywords = c < arr_count(syntax_all_keywords_rust) ? syntax_all_keywords_rust[c] : NULL; + if (keywords) { + for (size_t k = 0; keywords[k]; ++k) { + if (keyword_matches(&line[i], keyword_len, keywords[k])) { + keyword = keywords[k]; + break; + } + } + if (keyword) { + SyntaxCharType type = SYNTAX_KEYWORD; + if ((keyword_len == 4 && streq(keyword, "true")) + || (keyword_len == 5 && streq(keyword, "false")) + ) { + type = SYNTAX_CONSTANT; // these are constants, not keywords + } + for (size_t j = 0; keyword[j]; ++j) { + char_types[i++] = type; + } + --i; // we'll increment i from the for loop + dealt_with = true; + } + } + } + } break; } + if (c != '\\') backslashes = 0; + if (in_number && !syntax_number_continues(line, line_len, i)) + in_number = false; + if (char_types && !dealt_with) { SyntaxCharType type = SYNTAX_NORMAL; + if (comment_depth) { + type = SYNTAX_COMMENT; + } else if (in_string) { + type = SYNTAX_STRING; + } else if (in_number) { + type = SYNTAX_CONSTANT; + } char_types[i] = type; } + } - uint max_comment_depth = (1u<= max_comment_depth) - comment_depth = (u8)max_comment_depth; + comment_depth = max_comment_depth; *state = (SyntaxState)( - (comment_depth * SYNTAX_STATE_RUST_COMMENT_DEPTH_MUL) + (comment_depth * SYNTAX_STATE_RUST_COMMENT_DEPTH_MUL) + | (in_string * SYNTAX_STATE_RUST_STRING) + | (string_is_raw * SYNTAX_STATE_RUST_STRING_IS_RAW) ); } @@ -247,6 +424,9 @@ void syntax_highlight(SyntaxState *state, Language lang, char32_t *line, u32 lin case LANG_CPP: syntax_highlight_c_cpp(state, true, line, line_len, char_types); break; + case LANG_RUST: + syntax_highlight_rust(state, line, line_len, char_types); + break; case LANG_COUNT: assert(0); break; } } diff --git a/ted.cfg b/ted.cfg index 6ca9562..5158517 100644 --- a/ted.cfg +++ b/ted.cfg @@ -141,4 +141,5 @@ constant = #8ff [extensions] C = .c, .h -C++ = .cpp, .hpp, .C, .H, .cxx, .hxx, .cc, .hh \ No newline at end of file +C++ = .cpp, .hpp, .C, .H, .cxx, .hxx, .cc, .hh +Rust = .rs \ No newline at end of file diff --git a/ted.h b/ted.h index 5018879..97a8c64 100644 --- a/ted.h +++ b/ted.h @@ -18,6 +18,7 @@ enum { SYNTAX_STATE_RUST_COMMENT_DEPTH_MUL = 0x1u, SYNTAX_STATE_RUST_COMMENT_DEPTH_BITS = 4, // number of bits we allocate for the comment depth. SYNTAX_STATE_RUST_STRING = 0x10u, + SYNTAX_STATE_RUST_STRING_IS_RAW = 0x20u, }; typedef u8 SyntaxState; @@ -26,6 +27,7 @@ ENUM_U16 { LANG_NONE, LANG_C, LANG_CPP, + LANG_RUST, LANG_COUNT } ENUM_U16_END(Language); @@ -38,6 +40,7 @@ static LanguageName const language_names[] = { {LANG_NONE, "None"}, {LANG_C, "C"}, {LANG_CPP, "C++"}, + {LANG_RUST, "Rust"}, }; ENUM_U8 { diff --git a/test.rs b/test.rs index 56e8bf4..6c852d9 100644 --- a/test.rs +++ b/test.rs @@ -8,13 +8,19 @@ use std::io::{Result, BufRead, BufReader}; fn main() -> Result<()> { let file = File::open("test.rs")?; let mut reader = BufReader::new(file); + let mut lines = vec![]; + loop { let mut line = String::new(); if reader.read_line(&mut line)? == 0 { // reached end of file break; } - print!("{}", line); + line.pop(); + lines.push(line); + } + for line in lines { + println!("{}", line); } print!(" string -- cgit v1.2.3