summaryrefslogtreecommitdiff
path: root/src/definitions.rs
diff options
context:
space:
mode:
Diffstat (limited to 'src/definitions.rs')
-rw-r--r--src/definitions.rs416
1 files changed, 416 insertions, 0 deletions
diff --git a/src/definitions.rs b/src/definitions.rs
new file mode 100644
index 0000000..bb7eb04
--- /dev/null
+++ b/src/definitions.rs
@@ -0,0 +1,416 @@
+use std::borrow::Cow;
+use std::error::Error;
+use std::fs::File;
+use std::io::{self, BufReader, prelude::*};
+
+#[derive(Default)]
+struct Definitions {
+ definitions: Vec<(Box<str>, Section, Box<str>)>,
+}
+
/// Collapses every run of two or more consecutive spaces into a single space.
///
/// Only the space character (U+0020) is affected; tabs, newlines and any other
/// whitespace are copied through untouched. Leading and trailing runs are
/// collapsed like any other run, but a single space is never removed.
fn compact_spaces(s: &str) -> String {
    // The result is never longer than the input, so one allocation suffices.
    let mut compacted = String::with_capacity(s.len());
    let mut prev_was_space = false;
    for c in s.chars() {
        let is_space = c == ' ';
        // Drop a space only when it directly follows another space.
        if !(is_space && prev_was_space) {
            compacted.push(c);
        }
        prev_was_space = is_space;
    }
    compacted
}
+
+impl Definitions {
+ fn add_definition(&mut self, word: &str, part_of_speech: Section, def: &str) {
+ self.definitions.push((
+ compact_spaces(word).into(),
+ part_of_speech,
+ compact_spaces(def).into(),
+ ))
+ }
+ fn sort(&mut self) {
+ self.definitions.sort_by(|x, y| x.0.cmp(&y.0));
+ }
+ fn write_to(&mut self, mut w: impl Write) -> io::Result<()> {
+ for (title, part_of_speech, definition) in &self.definitions {
+ writeln!(w, "{title} %{} {definition}", part_of_speech.to_str())?;
+ }
+ Ok(())
+ }
+}
+
/// Strips HTML comments (`<!-- ... -->`) out of `text`.
///
/// Input without any comment is returned borrowed, with no allocation. A
/// comment that is opened but never closed swallows the rest of the string.
#[must_use]
fn remove_comments(text: &str) -> Cow<'_, str> {
    const OPEN: &str = "<!--";
    const CLOSE: &str = "-->";

    // Fast path: the overwhelming majority of inputs contain no comment.
    let Some(first_open) = text.find(OPEN) else {
        return Cow::Borrowed(text);
    };

    let mut kept = String::new();
    let mut rest = text;
    let mut next_open = Some(first_open);
    while let Some(open_at) = next_open {
        let (before, from_open) = rest.split_at(open_at);
        kept.push_str(before);
        // Resume right after the terminator, or at the very end if there is none.
        rest = match from_open.find(CLOSE) {
            Some(close_at) => &from_open[close_at + CLOSE.len()..],
            None => "",
        };
        next_open = rest.find(OPEN);
    }
    kept.push_str(rest);
    Cow::Owned(kept)
}
+
/// Classification of a section heading inside a page.
///
/// Most variants are parts of speech whose numbered lines are definitions.
/// [`Section::NotDefinition`] marks headings whose content must be skipped,
/// and [`Section::UnknownPoS`] is the fallback when no specific part of
/// speech is known.
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
enum Section {
    Adjective,
    Noun,
    ProperNoun,
    Verb,
    Adverb,
    Interjection,
    Conjunction,
    PrepositionalPhrase,
    Proverb,
    Idiom,
    Phrase,
    Suffix,
    Prefix,
    Circumfix,
    Infix,
    Interfix,
    Affix,
    Pronoun,
    Symbol,
    Preposition,
    PunctuationMark,
    DiacriticalMark,
    Determiner,
    Participle,
    Particle,
    Contraction,
    Letter,
    Number,
    /// Definitions whose part of speech is not known or not unique.
    UnknownPoS,
    /// A heading (etymology, synonyms, references, ...) that holds no definitions.
    NotDefinition,
}

impl Section {
    /// Maps the text of a section heading to a [`Section`].
    ///
    /// `section` is trimmed before matching. `title` is the name of the page
    /// the heading was found in and is only used in the diagnostic message.
    ///
    /// Returns `None`, after printing a warning to stderr, when the heading is
    /// not recognized.
    fn from_name(section: &str, title: &str) -> Option<Self> {
        use Section::*;
        Some(match section.trim() {
            // ("Adjectives" is a typo in 20250701, "Proper adjective" is basically non-existent)
            "Adjective" | "Proper adjective" | "Adjectives" => Adjective,
            "Noun" => Noun,
            "Proper noun" => ProperNoun,
            "Verb" | "Verb phrase" | "Verb form" => Verb,
            "Adverbial phrase" | "Adverb" => Adverb,
            "Interjection" => Interjection,
            "Conjunction" => Conjunction,
            "Prepositional phrase" => PrepositionalPhrase,
            "Proverb" => Proverb,
            "Suffix" => Suffix,
            "Prefix" => Prefix,
            "Circumfix" => Circumfix,
            "Infix" => Infix,
            "Interfix" => Interfix,
            "Pronoun" => Pronoun,
            "Phrase" => Phrase,
            "Symbol" => Symbol,
            "Preposition" => Preposition,
            "Punctuation mark" => PunctuationMark,
            "Diacritical mark" => DiacriticalMark,
            "Article" | "Determiner" => Determiner,
            "Participle" => Participle,
            "Particle" => Particle,
            "Contraction" => Contraction,
            "Idiom" => Idiom,
            "Letter" => Letter,
            "Affix" | "Combining form" => Affix,
            // currently at least ev (abbr. for even, ever, every) has this designation
            "Multiple parts of speech" => UnknownPoS,
            // 20250701 erroneously has "Proper noun 1" and "Proper Noun" and "Proper"
            "Proper" | "Proper Noun" => ProperNoun,
            x if x.starts_with("Proper noun ") => ProperNoun,
            "Numeral" | "Number" => Number,
            // Headings that never contain definitions.
            "See also"
            | "Alternative forms"
            | "Further reading"
            | "References"
            | "Anagrams"
            | "Paronyms"
            | "Quotations"
            | "Related terms"
            | "Derived terms"
            | "Coordinate terms"
            | "Usage notes"
            | "Trivia"
            | "Sources"
            | "Citations"
            | "Translations"
            | "Attestations"
            | "Attestation"
            | "Meronyms"
            | "Holonyms"
            | "Hypernyms"
            | "Hyponyms"
            | "Antonyms"
            | "Parasynonyms"
            | "Synonyms"
            | "Other names"
            | "Homophones"
            | "Collocations"
            | "Derivations"
            | "Notes"
            | "Note"
            | "Description"
            | "Alternative spellings"
            | "Alternative spelling"
            | "Abbreviations"
            | "External links"
            | "Statistics"
            | "Further information"
            | "Descendants"
            | "Gallery"
            | "Dialects"
            | "Usage"
            | "Examples"
            | "Conjugation"
            | "Related forms"
            | "Symbols"
            | "Historical notes"
            | "Troponyms"
            | "Proper nouns"
            | "Common nouns"
            | "Sense overview"
            | "Initialisms"
            | "Comeronyms"
            | "Near-synonyms"
            | "Source"
            | "Links"
            | "Declension"
            | "Synonyms and related terms"
            | "Additional notes"
            | "Related vocabulary" => NotDefinition,
            // Prefix match so numbered headings such as "Etymology 2" are covered too.
            x if x.starts_with("Pronunciation") || x.starts_with("Etymology") => NotDefinition,
            // mistakes that exist in 20250701 dump
            "Alternate forms" | "English" | "Etymyology" | "See Also" | "Usage Notes"
            | "Translate" => NotDefinition,
            _ => {
                eprintln!("\x1b[1mUnrecognized section {section}\x1b[0m (in page {title})");
                return None;
            }
        })
    }

    /// Returns the label written to the output for this part of speech.
    ///
    /// Related categories share a dotted prefix (`noun.proper`,
    /// `phrase.idiom`, `affix.suffix`, ...).
    ///
    /// # Panics
    /// Panics for [`Section::NotDefinition`], which has no label: content under
    /// such headings is expected to be skipped before anything is recorded.
    fn to_str(self) -> &'static str {
        use Section::*;
        match self {
            Adjective => "adjective",
            Noun => "noun",
            ProperNoun => "noun.proper",
            Verb => "verb",
            Adverb => "adverb",
            Interjection => "interjection",
            Conjunction => "conjunction",
            PrepositionalPhrase => "phrase.prepositional",
            Proverb => "phrase.proverb",
            Idiom => "phrase.idiom",
            Phrase => "phrase",
            Suffix => "affix.suffix",
            Prefix => "affix.prefix",
            Circumfix => "affix.circumfix",
            Infix => "affix.infix",
            Interfix => "affix.interfix",
            Affix => "affix",
            Pronoun => "pronoun",
            Symbol => "symbol",
            Preposition => "preposition",
            PunctuationMark => "punctuation",
            DiacriticalMark => "diacritic",
            Determiner => "determiner",
            Participle => "participle",
            Particle => "particle",
            Contraction => "contraction",
            Letter => "letter",
            Number => "number",
            UnknownPoS => "unknown",
            NotDefinition => unreachable!(
                "Section::NotDefinition has no output label; \
                 such sections must be skipped before writing"
            ),
        }
    }
}
+
+fn parse_xml(reader: impl BufRead, output: &mut Definitions) -> Result<(), Box<dyn Error>> {
+ let mut config = xml::reader::ParserConfig::default();
+ config.cdata_to_characters = true;
+ #[derive(Debug, Clone, Copy)]
+ enum Tag {
+ Other,
+ Title,
+ Text,
+ Ns,
+ }
+ let mut tags = vec![];
+ let mut title = String::new();
+ let mut body = String::new();
+ let mut ns = 1;
+ let mut ns_str = String::new();
+ for event in xml::reader::EventReader::new_with_config(reader, config) {
+ let event = event?;
+ use xml::reader::XmlEvent;
+ match (&event, tags.last()) {
+ (XmlEvent::StartElement { name, .. }, _) => {
+ let name = name.local_name.as_str();
+ match name {
+ "title" => {
+ tags.push(Tag::Title);
+ title.clear();
+ }
+ "text" => tags.push(Tag::Text),
+ "ns" => {
+ tags.push(Tag::Ns);
+ ns_str.clear();
+ }
+ _ => tags.push(Tag::Other),
+ }
+ }
+ (XmlEvent::Characters(s), Some(Tag::Title)) => {
+ title.push_str(s);
+ }
+ (XmlEvent::Characters(s), Some(Tag::Ns)) => {
+ ns_str.push_str(s);
+ }
+ (XmlEvent::Characters(s), Some(Tag::Text)) => {
+ body.push_str(s);
+ }
+ (XmlEvent::EndElement { name, .. }, _) => {
+ tags.pop();
+ if name.local_name == "page" {
+ title.clear();
+ } else if name.local_name == "text" {
+ if ns == 0
+ && let Some(eng_start) = body.find("==English==\n")
+ {
+ let mut curr_section = None;
+ let eng = &body[eng_start..];
+ let eng_end = eng
+ .as_bytes()
+ .windows(4)
+ .position(|w| w.starts_with(b"\n==") && w[3] != b'=')
+ .unwrap_or(eng.len());
+ let eng = &eng[..eng_end];
+ for (i, w) in eng.as_bytes().windows(3).enumerate() {
+ if w == b"\n==" && eng.get(i + 3..i + 4) == Some("=") {
+ let mut section = &eng[i + 3..];
+ while let Some(s) = section.strip_prefix('=') {
+ section = s;
+ }
+ let Some((section, _)) = section
+ .split_once('\n')
+ .and_then(|(first_line, _)| first_line.split_once('='))
+ else {
+ continue;
+ };
+ curr_section = Section::from_name(section, &title);
+ continue;
+ }
+ if curr_section == Some(Section::NotDefinition) {
+ continue;
+ }
+ if w != b"\n# " {
+ continue;
+ }
+ let definition =
+ eng[i + 3..].split_once('\n').map_or(&eng[i + 3..], |x| x.0);
+ let definition = remove_comments(definition);
+ if curr_section.is_none() {
+ eprintln!("\x1b[1mMissing part of speech\x1b[0m for {title}");
+ }
+ output.add_definition(
+ &title[..],
+ curr_section.unwrap_or(Section::UnknownPoS),
+ &definition,
+ );
+ }
+ }
+ body.clear();
+ } else if name.local_name == "ns" {
+ ns = ns_str.parse().unwrap_or(1);
+ }
+ }
+ _ => {}
+ }
+ }
+ Ok(())
+}
+
+pub fn definitions(args: Vec<String>) -> Result<(), Box<dyn Error>> {
+ let mut output = Definitions::default();
+ let mut files: Vec<String> = vec![];
+ for arg in args {
+ if arg == "-h" || arg == "--help" {
+ println!("Usage: {} definitions [FILES]", env!("CARGO_PKG_NAME"));
+ println!(" Extract English-language definitions from Wiktionary");
+ println!(" data dump files, writing output to definitions.txt.");
+ println!(" Each line of the output file is of the format: Word Definition");
+ println!(
+ " Note the two spaces—this avoids ambiguity when the word contains a space."
+ );
+ println!(" A single Word can have multiple Definitions.");
+ println!(" If FILES is not specified, will use ./*wiktionary*.xml-p*");
+ return Ok(());
+ }
+ files.push(arg.to_owned());
+ }
+ let files_from_pwd = || -> Result<Vec<String>, Box<dyn Error>> {
+ let mut files = vec![];
+ for file in std::fs::read_dir(".")? {
+ let file = file?;
+ let mut r#type = file.file_type()?;
+ let name = file.file_name();
+ if r#type.is_symlink() {
+ // get type of thing symlink is pointing to
+ r#type = std::fs::metadata(file.path())?.file_type();
+ }
+ if !r#type.is_file() {
+ continue;
+ }
+ let Some(name) = name.to_str() else {
+ continue;
+ };
+ if name.contains("wiktionary") && name.contains(".xml-p") {
+ files.push(name.into());
+ }
+ }
+ files.sort();
+ Ok(files)
+ };
+ if files.is_empty() {
+ files = files_from_pwd()
+ .map_err(|e| format!("No files specified and couldn't list PWD ({e}). Aborting."))?;
+ println!("No files specified on command line.");
+ println!("These files were found in the PWD:");
+ for file in &files {
+ println!(" {file}");
+ }
+ print!("Proceed with these files [Y/n]? ");
+ _ = std::io::stdout().flush();
+ let mut line = String::new();
+ let result = std::io::stdin().read_line(&mut line);
+ let line = line.trim();
+ if result.is_err()
+ || line
+ .chars()
+ .next()
+ .is_some_and(|c| c.to_lowercase().to_string() != "y")
+ {
+ return Err("Aborted.".into());
+ }
+ }
+ for input_filename in &files {
+ let input = File::open(input_filename)
+ .map_err(|e| format!("Couldn't open {input_filename}: {e}"))?;
+ let reader = BufReader::new(input);
+ println!("Parsing {input_filename}...");
+ parse_xml(reader, &mut output)
+ .map_err(|e| format!("Couldn't parse {input_filename}: {e}"))?;
+ }
+ println!("Sorting {} definitions...", output.definitions.len());
+ output.sort();
+ crate::do_write("definitions.txt", |writer| output.write_to(writer))?;
+ println!("Done!");
+ Ok(())
+}