diff options
Diffstat (limited to 'src/definitions.rs')
-rw-r--r-- | src/definitions.rs | 416 |
1 files changed, 416 insertions, 0 deletions
diff --git a/src/definitions.rs b/src/definitions.rs new file mode 100644 index 0000000..bb7eb04 --- /dev/null +++ b/src/definitions.rs @@ -0,0 +1,416 @@ +use std::borrow::Cow; +use std::error::Error; +use std::fs::File; +use std::io::{self, BufReader, prelude::*}; + +#[derive(Default)] +struct Definitions { + definitions: Vec<(Box<str>, Section, Box<str>)>, +} + +// replace sequences of 2+ spaces with a single space +fn compact_spaces(s: &str) -> String { + let mut s: String = s.into(); + // quite inefficient, but it doesn't really matter for our purposes. + while s.contains(" ") { + s = s.replace(" ", " ").to_owned(); + } + s +} + +impl Definitions { + fn add_definition(&mut self, word: &str, part_of_speech: Section, def: &str) { + self.definitions.push(( + compact_spaces(word).into(), + part_of_speech, + compact_spaces(def).into(), + )) + } + fn sort(&mut self) { + self.definitions.sort_by(|x, y| x.0.cmp(&y.0)); + } + fn write_to(&mut self, mut w: impl Write) -> io::Result<()> { + for (title, part_of_speech, definition) in &self.definitions { + writeln!(w, "{title} %{} {definition}", part_of_speech.to_str())?; + } + Ok(()) + } +} + +// remove HTML comments from string +#[must_use] +fn remove_comments(mut text: &str) -> Cow<'_, str> { + if !text.contains("<!--") { + // (by far) most common case + return Cow::Borrowed(text); + } + let mut new_str = String::new(); + while let Some(comment_start) = text.find("<!--") { + new_str.push_str(&text[..comment_start]); + text = &text[comment_start..]; + let comment_end = text.find("-->").map_or(text.len(), |i| i + 3); + text = &text[comment_end..]; + } + new_str.push_str(text); + Cow::Owned(new_str) +} + +#[derive(Debug, Copy, Clone, PartialEq, Eq)] +enum Section { + Adjective, + Noun, + ProperNoun, + Verb, + Adverb, + Interjection, + Conjunction, + PrepositionalPhrase, + Proverb, + Idiom, + Phrase, + Suffix, + Prefix, + Circumfix, + Infix, + Interfix, + Affix, + Pronoun, + Symbol, + Preposition, + PunctuationMark, + DiacriticalMark, + Determiner, + Participle, + Particle, + Contraction, + Letter, + Number, + UnknownPoS, + NotDefinition, +} + +impl Section { + fn from_name(section: &str, title: &str) -> Option<Self> { + use Section::*; + Some(match section.trim() { + // ("Adjectives" is a typo in 20250701, "Proper adjective" is basically non-existent) + "Adjective" | "Proper adjective" | "Adjectives" => Adjective, + "Noun" => Noun, + "Proper noun" => ProperNoun, + "Verb" | "Verb phrase" | "Verb form" => Verb, + "Adverbial phrase" | "Adverb" => Adverb, + "Interjection" => Interjection, + "Conjunction" => Conjunction, + "Prepositional phrase" => PrepositionalPhrase, + "Proverb" => Proverb, + "Suffix" => Suffix, + "Prefix" => Prefix, + "Circumfix" => Circumfix, + "Infix" => Infix, + "Interfix" => Interfix, + "Pronoun" => Pronoun, + "Phrase" => Phrase, + "Symbol" => Symbol, + "Preposition" => Preposition, + "Punctuation mark" => PunctuationMark, + "Diacritical mark" => DiacriticalMark, + "Article" | "Determiner" => Determiner, + "Participle" => Participle, + "Particle" => Particle, + "Contraction" => Contraction, + "Idiom" => Idiom, + "Letter" => Letter, + "Affix" | "Combining form" => Affix, + // currently at least ev (abbr. for even, ever, every) has this designation + "Multiple parts of speech" => UnknownPoS, + // 20250701 erroneously has "Proper noun 1" and "Proper Noun" and "Proper" + "Proper" | "Proper Noun" => ProperNoun, + x if x.starts_with("Proper noun ") => ProperNoun, + "Numeral" | "Number" => Number, + "See also" + | "Alternative forms" + | "Further reading" + | "References" + | "Anagrams" + | "Paronyms" + | "Quotations" + | "Related terms" + | "Derived terms" + | "Coordinate terms" + | "Usage notes" + | "Trivia" + | "Sources" + | "Citations" + | "Translations" + | "Attestations" + | "Attestation" + | "Meronyms" + | "Holonyms" + | "Hypernyms" + | "Hyponyms" + | "Antonyms" + | "Parasynonyms" + | "Synonyms" + | "Other names" + | "Homophones" + | "Collocations" + | "Derivations" + | "Notes" + | "Note" + | "Description" + | "Alternative spellings" + | "Alternative spelling" + | "Abbreviations" + | "External links" + | "Statistics" + | "Further information" + | "Descendants" + | "Gallery" + | "Dialects" + | "Usage" + | "Examples" + | "Conjugation" + | "Related forms" + | "Symbols" + | "Historical notes" + | "Troponyms" + | "Proper nouns" + | "Common nouns" + | "Sense overview" + | "Initialisms" + | "Comeronyms" + | "Near-synonyms" + | "Source" + | "Links" + | "Declension" + | "Synonyms and related terms" + | "Additional notes" + | "Related vocabulary" => NotDefinition, + x if x.starts_with("Pronunciation") || x.starts_with("Etymology") => NotDefinition, + // mistakes that exist in 20250701 dump + "Alternate forms" | "English" | "Etymyology" | "See Also" | "Usage Notes" + | "Translate" => NotDefinition, + _ => { + eprintln!("\x1b[1mUnrecognized section {section}\x1b[0m (in page {title})"); + return None; + } + }) + } + fn to_str(self) -> &'static str { + use Section::*; + match self { + Adjective => "adjective", + Noun => "noun", + ProperNoun => "noun.proper", + Verb => "verb", + Adverb => "adverb", + Interjection => "interjection", + Conjunction => "conjunction", + PrepositionalPhrase => "phrase.prepositional", + Proverb => "phrase.proverb", + Idiom => "phrase.idiom", + Phrase => "phrase", + Suffix => "affix.suffix", + Prefix => "affix.prefix", + Circumfix => "affix.circumfix", + Infix => "affix.infix", + Interfix => "affix.interfix", + Affix => "affix", + Pronoun => "pronoun", + Symbol => "symbol", + Preposition => "preposition", + PunctuationMark => "punctuation", + DiacriticalMark => "diacritic", + Determiner => "determiner", + Participle => "participle", + Particle => "particle", + Contraction => "contraction", + Letter => "letter", + Number => "number", + UnknownPoS => "unknown", + NotDefinition => panic!(), + } + } +} + +fn parse_xml(reader: impl BufRead, output: &mut Definitions) -> Result<(), Box<dyn Error>> { + let mut config = xml::reader::ParserConfig::default(); + config.cdata_to_characters = true; + #[derive(Debug, Clone, Copy)] + enum Tag { + Other, + Title, + Text, + Ns, + } + let mut tags = vec![]; + let mut title = String::new(); + let mut body = String::new(); + let mut ns = 1; + let mut ns_str = String::new(); + for event in xml::reader::EventReader::new_with_config(reader, config) { + let event = event?; + use xml::reader::XmlEvent; + match (&event, tags.last()) { + (XmlEvent::StartElement { name, .. }, _) => { + let name = name.local_name.as_str(); + match name { + "title" => { + tags.push(Tag::Title); + title.clear(); + } + "text" => tags.push(Tag::Text), + "ns" => { + tags.push(Tag::Ns); + ns_str.clear(); + } + _ => tags.push(Tag::Other), + } + } + (XmlEvent::Characters(s), Some(Tag::Title)) => { + title.push_str(s); + } + (XmlEvent::Characters(s), Some(Tag::Ns)) => { + ns_str.push_str(s); + } + (XmlEvent::Characters(s), Some(Tag::Text)) => { + body.push_str(s); + } + (XmlEvent::EndElement { name, .. }, _) => { + tags.pop(); + if name.local_name == "page" { + title.clear(); + } else if name.local_name == "text" { + if ns == 0 + && let Some(eng_start) = body.find("==English==\n") + { + let mut curr_section = None; + let eng = &body[eng_start..]; + let eng_end = eng + .as_bytes() + .windows(4) + .position(|w| w.starts_with(b"\n==") && w[3] != b'=') + .unwrap_or(eng.len()); + let eng = &eng[..eng_end]; + for (i, w) in eng.as_bytes().windows(3).enumerate() { + if w == b"\n==" && eng.get(i + 3..i + 4) == Some("=") { + let mut section = &eng[i + 3..]; + while let Some(s) = section.strip_prefix('=') { + section = s; + } + let Some((section, _)) = section + .split_once('\n') + .and_then(|(first_line, _)| first_line.split_once('=')) + else { + continue; + }; + curr_section = Section::from_name(section, &title); + continue; + } + if curr_section == Some(Section::NotDefinition) { + continue; + } + if w != b"\n# " { + continue; + } + let definition = + eng[i + 3..].split_once('\n').map_or(&eng[i + 3..], |x| x.0); + let definition = remove_comments(definition); + if curr_section.is_none() { + eprintln!("\x1b[1mMissing part of speech\x1b[0m for {title}"); + } + output.add_definition( + &title[..], + curr_section.unwrap_or(Section::UnknownPoS), + &definition, + ); + } + } + body.clear(); + } else if name.local_name == "ns" { + ns = ns_str.parse().unwrap_or(1); + } + } + _ => {} + } + } + Ok(()) +} + +pub fn definitions(args: Vec<String>) -> Result<(), Box<dyn Error>> { + let mut output = Definitions::default(); + let mut files: Vec<String> = vec![]; + for arg in args { + if arg == "-h" || arg == "--help" { + println!("Usage: {} definitions [FILES]", env!("CARGO_PKG_NAME")); + println!(" Extract English-language definitions from Wiktionary"); + println!(" data dump files, writing output to definitions.txt."); + println!(" Each line of the output file is of the format: Word Definition"); + println!( + " Note the two spaces—this avoids ambiguity when the word contains a space." + ); + println!(" A single Word can have multiple Definitions."); + println!(" If FILES is not specified, will use ./*wiktionary*.xml-p*"); + return Ok(()); + } + files.push(arg.to_owned()); + } + let files_from_pwd = || -> Result<Vec<String>, Box<dyn Error>> { + let mut files = vec![]; + for file in std::fs::read_dir(".")? { + let file = file?; + let mut r#type = file.file_type()?; + let name = file.file_name(); + if r#type.is_symlink() { + // get type of thing symlink is pointing to + r#type = std::fs::metadata(file.path())?.file_type(); + } + if !r#type.is_file() { + continue; + } + let Some(name) = name.to_str() else { + continue; + }; + if name.contains("wiktionary") && name.contains(".xml-p") { + files.push(name.into()); + } + } + files.sort(); + Ok(files) + }; + if files.is_empty() { + files = files_from_pwd() + .map_err(|e| format!("No files specified and couldn't list PWD ({e}). Aborting."))?; + println!("No files specified on command line."); + println!("These files were found in the PWD:"); + for file in &files { + println!(" {file}"); + } + print!("Proceed with these files [Y/n]? "); + _ = std::io::stdout().flush(); + let mut line = String::new(); + let result = std::io::stdin().read_line(&mut line); + let line = line.trim(); + if result.is_err() + || line + .chars() + .next() + .is_some_and(|c| c.to_lowercase().to_string() != "y") + { + return Err("Aborted.".into()); + } + } + for input_filename in &files { + let input = File::open(input_filename) + .map_err(|e| format!("Couldn't open {input_filename}: {e}"))?; + let reader = BufReader::new(input); + println!("Parsing {input_filename}..."); + parse_xml(reader, &mut output) + .map_err(|e| format!("Couldn't parse {input_filename}: {e}"))?; + } + println!("Sorting {} definitions...", output.definitions.len()); + output.sort(); + crate::do_write("definitions.txt", |writer| output.write_to(writer))?; + println!("Done!"); + Ok(()) +} |