use std::borrow::Cow; use std::error::Error; use std::fs::File; use std::io::{self, BufReader, prelude::*}; #[derive(Default)] struct Definitions { definitions: Vec<(Box, Section, Box)>, } // replace sequences of 2+ spaces with a single space fn compact_spaces(s: &str) -> String { let mut s: String = s.into(); // quite inefficient, but it doesn't really matter for our purposes. while s.contains(" ") { s = s.replace(" ", " ").to_owned(); } s } impl Definitions { fn add_definition(&mut self, word: &str, part_of_speech: Section, def: &str) { self.definitions.push(( compact_spaces(word).into(), part_of_speech, compact_spaces(def).into(), )) } fn sort(&mut self) { self.definitions.sort_by(|x, y| x.0.cmp(&y.0)); } fn write_to(&mut self, mut w: impl Write) -> io::Result<()> { for (title, part_of_speech, definition) in &self.definitions { writeln!(w, "{title} %{} {definition}", part_of_speech.to_str())?; } Ok(()) } } // remove HTML comments from string #[must_use] fn remove_comments(mut text: &str) -> Cow<'_, str> { if !text.contains("").map_or(text.len(), |i| i + 3); text = &text[comment_end..]; } new_str.push_str(text); Cow::Owned(new_str) } #[derive(Debug, Copy, Clone, PartialEq, Eq)] enum Section { Adjective, Noun, ProperNoun, Verb, Adverb, Interjection, Conjunction, PrepositionalPhrase, Proverb, Phrase, Suffix, Prefix, Circumfix, Infix, Interfix, Affix, Pronoun, Symbol, Preposition, PunctuationMark, DiacriticalMark, Determiner, Particle, Contraction, Letter, Number, UnknownPoS, NotDefinition, } impl Section { fn from_name(section: &str, title: &str) -> Option { use Section::*; Some(match section.trim() { // ("Adjectives" is a typo in 20250701, "Proper adjective" is basically non-existent) "Adjective" | "Proper adjective" | "Adjectives" => Adjective, "Noun" => Noun, "Proper noun" => ProperNoun, // All but Verb are not used enough to warrant their own categories "Verb" | "Verb phrase" | "Verb form" | "Participle" => Verb, "Adverbial phrase" | "Adverb" => Adverb, "Interjection" => Interjection, "Conjunction" => Conjunction, "Prepositional phrase" => PrepositionalPhrase, "Proverb" => Proverb, "Suffix" => Suffix, "Prefix" => Prefix, "Circumfix" => Circumfix, "Infix" => Infix, "Interfix" => Interfix, "Pronoun" => Pronoun, // Idiom is not used enough to warrant its own category (only appears 12 times) "Phrase" | "Idiom" => Phrase, "Symbol" | "Cuneiform sign" | "Iteration mark" => Symbol, "Preposition" => Preposition, "Punctuation mark" => PunctuationMark, "Diacritical mark" | "Diacritic" => DiacriticalMark, "Article" | "Determiner" => Determiner, "Particle" => Particle, "Contraction" => Contraction, "Letter" => Letter, "Affix" | "Combining form" | "Simulfix" => Affix, "Multiple parts of speech" | "Syllable" => UnknownPoS, // 20250701 erroneously has "Proper noun 1" and "Proper Noun" and "Proper" "Proper" | "Proper Noun" => ProperNoun, x if x.starts_with("Proper noun ") => ProperNoun, "Numeral" | "Number" => Number, "See also" | "Alternative forms" | "Further reading" | "References" | "Anagrams" | "Paronyms" | "Quotations" | "Related terms" | "Derived terms" | "Coordinate terms" | "Usage notes" | "Trivia" | "Sources" | "Citations" | "Translations" | "Attestations" | "Attestation" | "Meronyms" | "Holonyms" | "Hypernyms" | "Hyponyms" | "Antonyms" | "Parasynonyms" | "Synonyms" | "Other names" | "Homophones" | "Collocations" | "Derivations" | "Notes" | "Note" | "Description" | "Alternative spellings" | "Alternative spelling" | "Abbreviations" | "External links" | "Statistics" | "Further information" | "Descendants" | "Gallery" | "Dialects" | "Usage" | "Examples" | "Conjugation" | "Related forms" | "Symbols" | "Historical notes" | "Troponyms" | "Proper nouns" | "Common nouns" | "Sense overview" | "Initialisms" | "Comeronyms" | "Near-synonyms" | "Source" | "Links" | "Declension" | "Synonyms and related terms" | "Additional notes" | "Related vocabulary" | "Glyph origin" | "Han character" | "Derived characters" | "Related characters" | "Related symbols" | "Design" | "Forms" | "Ligature" | "Derived signs" | "Derived symbols" | "Derived glyphs" | "Production" | "Example" | "Origin" | "Bibliography" | "Formation" | "Derived Characters" | "Composition" | "Depiction" | "Derived forms" => NotDefinition, x if x.starts_with("Pronunciation") || x.starts_with("Etymology") || x.starts_with("Terms suffixed with ") || x.starts_with("Symbol origin") => { NotDefinition } // mistakes that exist in 20250701 dump "Alternate forms" | "English" | "Etymyology" | "See Also" | "Usage Notes" | "Translate" | "Etymolohy" | "Derived chracters" => NotDefinition, _ => { eprintln!("\x1b[1mUnrecognized section {section}\x1b[0m (in page {title})"); return None; } }) } fn to_str(self) -> &'static str { use Section::*; match self { Adjective => "adjective", Noun => "noun", ProperNoun => "noun.proper", Verb => "verb", Adverb => "adverb", Interjection => "interjection", Conjunction => "conjunction", PrepositionalPhrase => "phrase.prepositional", Proverb => "phrase.proverb", Phrase => "phrase", Suffix => "affix.suffix", Prefix => "affix.prefix", Circumfix => "affix.circumfix", Infix => "affix.infix", Interfix => "affix.interfix", Affix => "affix", Pronoun => "pronoun", Symbol => "symbol", PunctuationMark => "symbol.punctuation", Letter => "symbol.letter", DiacriticalMark => "symbol.diacritic", Preposition => "preposition", Determiner => "determiner", Particle => "particle", Contraction => "contraction", Number => "number", UnknownPoS => "unknown", NotDefinition => panic!(), } } } fn parse_xml( reader: impl BufRead, output: &mut Definitions, language: &str, ) -> Result<(), Box> { let mut config = xml::reader::ParserConfig::default(); config.cdata_to_characters = true; #[derive(Debug, Clone, Copy)] enum Tag { Other, Title, Text, Ns, } let mut tags = vec![]; let mut title = String::new(); let mut body = String::new(); let mut ns = 1; let mut ns_str = String::new(); for event in xml::reader::EventReader::new_with_config(reader, config) { let event = event?; use xml::reader::XmlEvent; match (&event, tags.last()) { (XmlEvent::StartElement { name, .. }, _) => { let name = name.local_name.as_str(); match name { "title" => { tags.push(Tag::Title); title.clear(); } "text" => tags.push(Tag::Text), "ns" => { tags.push(Tag::Ns); ns_str.clear(); } _ => tags.push(Tag::Other), } } (XmlEvent::Characters(s), Some(Tag::Title)) => { title.push_str(s); } (XmlEvent::Characters(s), Some(Tag::Ns)) => { ns_str.push_str(s); } (XmlEvent::Characters(s), Some(Tag::Text)) => { body.push_str(s); } (XmlEvent::EndElement { name, .. }, _) => { tags.pop(); if name.local_name == "page" { title.clear(); } else if name.local_name == "text" { let body = std::mem::take(&mut body); if ns != 0 { continue; } let Some(lang_start) = body.find(&format!("=={language}==\n")) else { continue; }; let mut curr_section = None; let lang = &body[lang_start..]; let lang_end = lang .as_bytes() .windows(4) .position(|w| w.starts_with(b"\n==") && w[3] != b'=') .unwrap_or(lang.len()); let lang = &lang[..lang_end]; for (i, w) in lang.as_bytes().windows(3).enumerate() { if w == b"\n==" && lang.get(i + 3..i + 4) == Some("=") { let mut section = &lang[i + 3..]; while let Some(s) = section.strip_prefix('=') { section = s; } let Some((section, _)) = section .split_once('\n') .and_then(|(first_line, _)| first_line.split_once('=')) else { continue; }; curr_section = Section::from_name(section, &title); continue; } if curr_section == Some(Section::NotDefinition) { continue; } if w != b"\n# " { continue; } let definition = lang[i + 3..] .split_once('\n') .map_or(&lang[i + 3..], |x| x.0); let definition = remove_comments(definition); let curr_section = curr_section.unwrap_or_else(|| { if false { eprintln!("\x1b[1mMissing part of speech\x1b[0m for {title}"); } Section::UnknownPoS }); output.add_definition(&title[..], curr_section, &definition); } } else if name.local_name == "ns" { ns = ns_str.parse().unwrap_or(1); } } _ => {} } } Ok(()) } pub fn definitions(args: Vec) -> Result<(), Box> { let mut files: Vec = vec![]; for arg in args { if arg == "-h" || arg == "--help" { println!("Usage: {} definitions [FILES]", env!("CARGO_PKG_NAME")); println!(" Extract English-language and Translingual definitions from Wiktionary"); println!( " data dump files, writing output to en-definitions.txt and trans-definitions.txt." ); println!( " Each line of the output file is of the format: Word Part_of_Speech Definition" ); println!( " Note the two spaces—this avoids ambiguity when the word contains a space." ); println!(" A single Word can have multiple Definitions."); println!(" If FILES is not specified, will use ./*wiktionary*.xml-p*"); return Ok(()); } files.push(arg.to_owned()); } let files_from_pwd = || -> Result, Box> { let mut files = vec![]; for file in std::fs::read_dir(".")? { let file = file?; let mut r#type = file.file_type()?; let name = file.file_name(); if r#type.is_symlink() { // get type of thing symlink is pointing to r#type = std::fs::metadata(file.path())?.file_type(); } if !r#type.is_file() { continue; } let Some(name) = name.to_str() else { continue; }; if name.contains("wiktionary") && name.contains(".xml-p") { files.push(name.into()); } } files.sort(); Ok(files) }; if files.is_empty() { files = files_from_pwd() .map_err(|e| format!("No files specified and couldn't list PWD ({e}). Aborting."))?; println!("No files specified on command line."); println!("These files were found in the PWD:"); for file in &files { println!(" {file}"); } print!("Proceed with these files [Y/n]? "); _ = std::io::stdout().flush(); let mut line = String::new(); let result = std::io::stdin().read_line(&mut line); let line = line.trim(); if result.is_err() || line .chars() .next() .is_some_and(|c| c.to_lowercase().to_string() != "y") { return Err("Aborted.".into()); } } for (lang, abbrev) in [("English", "en"), ("Translingual", "trans")] { let mut output = Definitions::default(); for input_filename in &files { let input = File::open(input_filename) .map_err(|e| format!("Couldn't open {input_filename}: {e}"))?; let reader = BufReader::new(input); println!("({lang}) Parsing {input_filename}..."); parse_xml(reader, &mut output, lang) .map_err(|e| format!("Couldn't parse {input_filename}: {e}"))?; } println!("Sorting {} definitions...", output.definitions.len()); output.sort(); crate::do_write(&format!("{abbrev}-definitions.txt"), |writer| { output.write_to(writer) })?; } println!("Done!"); Ok(()) }