diff options
author | pommicket <pommicket@gmail.com> | 2025-09-25 13:57:37 -0400 |
---|---|---|
committer | pommicket <pommicket@gmail.com> | 2025-09-25 13:57:37 -0400 |
commit | 1d6462d9c03c620d24d113443d24fcfce984c817 (patch) | |
tree | 0f0e3740aee3857ecb652067cbf195d95e6951d8 | |
parent | 2df6d43e44bb9852aa31764f998e28ea89a45267 (diff) |
Add parts of speech
-rw-r--r-- | src/definitions.rs | 416 | ||||
-rw-r--r-- | src/main.rs | 211 |
2 files changed, 419 insertions, 208 deletions
diff --git a/src/definitions.rs b/src/definitions.rs new file mode 100644 index 0000000..bb7eb04 --- /dev/null +++ b/src/definitions.rs @@ -0,0 +1,416 @@ +use std::borrow::Cow; +use std::error::Error; +use std::fs::File; +use std::io::{self, BufReader, prelude::*}; + +#[derive(Default)] +struct Definitions { + definitions: Vec<(Box<str>, Section, Box<str>)>, +} + +// replace sequences of 2+ spaces with a single space +fn compact_spaces(s: &str) -> String { + let mut s: String = s.into(); + // quite inefficient, but it doesn't really matter for our purposes. + while s.contains(" ") { + s = s.replace(" ", " ").to_owned(); + } + s +} + +impl Definitions { + fn add_definition(&mut self, word: &str, part_of_speech: Section, def: &str) { + self.definitions.push(( + compact_spaces(word).into(), + part_of_speech, + compact_spaces(def).into(), + )) + } + fn sort(&mut self) { + self.definitions.sort_by(|x, y| x.0.cmp(&y.0)); + } + fn write_to(&mut self, mut w: impl Write) -> io::Result<()> { + for (title, part_of_speech, definition) in &self.definitions { + writeln!(w, "{title} %{} {definition}", part_of_speech.to_str())?; + } + Ok(()) + } +} + +// remove HTML comments from string +#[must_use] +fn remove_comments(mut text: &str) -> Cow<'_, str> { + if !text.contains("<!--") { + // (by far) most common case + return Cow::Borrowed(text); + } + let mut new_str = String::new(); + while let Some(comment_start) = text.find("<!--") { + new_str.push_str(&text[..comment_start]); + text = &text[comment_start..]; + let comment_end = text.find("-->").map_or(text.len(), |i| i + 3); + text = &text[comment_end..]; + } + new_str.push_str(text); + Cow::Owned(new_str) +} + +#[derive(Debug, Copy, Clone, PartialEq, Eq)] +enum Section { + Adjective, + Noun, + ProperNoun, + Verb, + Adverb, + Interjection, + Conjunction, + PrepositionalPhrase, + Proverb, + Idiom, + Phrase, + Suffix, + Prefix, + Circumfix, + Infix, + Interfix, + Affix, + Pronoun, + Symbol, + Preposition, + PunctuationMark, + DiacriticalMark, + Determiner, + Participle, + Particle, + Contraction, + Letter, + Number, + UnknownPoS, + NotDefinition, +} + +impl Section { + fn from_name(section: &str, title: &str) -> Option<Self> { + use Section::*; + Some(match section.trim() { + // ("Adjectives" is a typo in 20250701, "Proper adjective" is basically non-existent) + "Adjective" | "Proper adjective" | "Adjectives" => Adjective, + "Noun" => Noun, + "Proper noun" => ProperNoun, + "Verb" | "Verb phrase" | "Verb form" => Verb, + "Adverbial phrase" | "Adverb" => Adverb, + "Interjection" => Interjection, + "Conjunction" => Conjunction, + "Prepositional phrase" => PrepositionalPhrase, + "Proverb" => Proverb, + "Suffix" => Suffix, + "Prefix" => Prefix, + "Circumfix" => Circumfix, + "Infix" => Infix, + "Interfix" => Interfix, + "Pronoun" => Pronoun, + "Phrase" => Phrase, + "Symbol" => Symbol, + "Preposition" => Preposition, + "Punctuation mark" => PunctuationMark, + "Diacritical mark" => DiacriticalMark, + "Article" | "Determiner" => Determiner, + "Participle" => Participle, + "Particle" => Particle, + "Contraction" => Contraction, + "Idiom" => Idiom, + "Letter" => Letter, + "Affix" | "Combining form" => Affix, + // currently at least ev (abbr. for even, ever, every) has this designation + "Multiple parts of speech" => UnknownPoS, + // 20250701 erroneously has "Proper noun 1" and "Proper Noun" and "Proper" + "Proper" | "Proper Noun" => ProperNoun, + x if x.starts_with("Proper noun ") => ProperNoun, + "Numeral" | "Number" => Number, + "See also" + | "Alternative forms" + | "Further reading" + | "References" + | "Anagrams" + | "Paronyms" + | "Quotations" + | "Related terms" + | "Derived terms" + | "Coordinate terms" + | "Usage notes" + | "Trivia" + | "Sources" + | "Citations" + | "Translations" + | "Attestations" + | "Attestation" + | "Meronyms" + | "Holonyms" + | "Hypernyms" + | "Hyponyms" + | "Antonyms" + | "Parasynonyms" + | "Synonyms" + | "Other names" + | "Homophones" + | "Collocations" + | "Derivations" + | "Notes" + | "Note" + | "Description" + | "Alternative spellings" + | "Alternative spelling" + | "Abbreviations" + | "External links" + | "Statistics" + | "Further information" + | "Descendants" + | "Gallery" + | "Dialects" + | "Usage" + | "Examples" + | "Conjugation" + | "Related forms" + | "Symbols" + | "Historical notes" + | "Troponyms" + | "Proper nouns" + | "Common nouns" + | "Sense overview" + | "Initialisms" + | "Comeronyms" + | "Near-synonyms" + | "Source" + | "Links" + | "Declension" + | "Synonyms and related terms" + | "Additional notes" + | "Related vocabulary" => NotDefinition, + x if x.starts_with("Pronunciation") || x.starts_with("Etymology") => NotDefinition, + // mistakes that exist in 20250701 dump + "Alternate forms" | "English" | "Etymyology" | "See Also" | "Usage Notes" + | "Translate" => NotDefinition, + _ => { + eprintln!("\x1b[1mUnrecognized section {section}\x1b[0m (in page {title})"); + return None; + } + }) + } + fn to_str(self) -> &'static str { + use Section::*; + match self { + Adjective => "adjective", + Noun => "noun", + ProperNoun => "noun.proper", + Verb => "verb", + Adverb => "adverb", + Interjection => "interjection", + Conjunction => "conjunction", + PrepositionalPhrase => "phrase.prepositional", + Proverb => "phrase.proverb", + Idiom => "phrase.idiom", + Phrase => "phrase", + Suffix => "affix.suffix", + Prefix => "affix.prefix", + Circumfix => "affix.circumfix", + Infix => "affix.infix", + Interfix => "affix.interfix", + Affix => "affix", + Pronoun => "pronoun", + Symbol => "symbol", + Preposition => "preposition", + PunctuationMark => "punctuation", + DiacriticalMark => "diacritic", + Determiner => "determiner", + Participle => "participle", + Particle => "particle", + Contraction => "contraction", + Letter => "letter", + Number => "number", + UnknownPoS => "unknown", + NotDefinition => panic!(), + } + } +} + +fn parse_xml(reader: impl BufRead, output: &mut Definitions) -> Result<(), Box<dyn Error>> { + let mut config = xml::reader::ParserConfig::default(); + config.cdata_to_characters = true; + #[derive(Debug, Clone, Copy)] + enum Tag { + Other, + Title, + Text, + Ns, + } + let mut tags = vec![]; + let mut title = String::new(); + let mut body = String::new(); + let mut ns = 1; + let mut ns_str = String::new(); + for event in xml::reader::EventReader::new_with_config(reader, config) { + let event = event?; + use xml::reader::XmlEvent; + match (&event, tags.last()) { + (XmlEvent::StartElement { name, .. }, _) => { + let name = name.local_name.as_str(); + match name { + "title" => { + tags.push(Tag::Title); + title.clear(); + } + "text" => tags.push(Tag::Text), + "ns" => { + tags.push(Tag::Ns); + ns_str.clear(); + } + _ => tags.push(Tag::Other), + } + } + (XmlEvent::Characters(s), Some(Tag::Title)) => { + title.push_str(s); + } + (XmlEvent::Characters(s), Some(Tag::Ns)) => { + ns_str.push_str(s); + } + (XmlEvent::Characters(s), Some(Tag::Text)) => { + body.push_str(s); + } + (XmlEvent::EndElement { name, .. }, _) => { + tags.pop(); + if name.local_name == "page" { + title.clear(); + } else if name.local_name == "text" { + if ns == 0 + && let Some(eng_start) = body.find("==English==\n") + { + let mut curr_section = None; + let eng = &body[eng_start..]; + let eng_end = eng + .as_bytes() + .windows(4) + .position(|w| w.starts_with(b"\n==") && w[3] != b'=') + .unwrap_or(eng.len()); + let eng = &eng[..eng_end]; + for (i, w) in eng.as_bytes().windows(3).enumerate() { + if w == b"\n==" && eng.get(i + 3..i + 4) == Some("=") { + let mut section = &eng[i + 3..]; + while let Some(s) = section.strip_prefix('=') { + section = s; + } + let Some((section, _)) = section + .split_once('\n') + .and_then(|(first_line, _)| first_line.split_once('=')) + else { + continue; + }; + curr_section = Section::from_name(section, &title); + continue; + } + if curr_section == Some(Section::NotDefinition) { + continue; + } + if w != b"\n# " { + continue; + } + let definition = + eng[i + 3..].split_once('\n').map_or(&eng[i + 3..], |x| x.0); + let definition = remove_comments(definition); + if curr_section.is_none() { + eprintln!("\x1b[1mMissing part of speech\x1b[0m for {title}"); + } + output.add_definition( + &title[..], + curr_section.unwrap_or(Section::UnknownPoS), + &definition, + ); + } + } + body.clear(); + } else if name.local_name == "ns" { + ns = ns_str.parse().unwrap_or(1); + } + } + _ => {} + } + } + Ok(()) +} + +pub fn definitions(args: Vec<String>) -> Result<(), Box<dyn Error>> { + let mut output = Definitions::default(); + let mut files: Vec<String> = vec![]; + for arg in args { + if arg == "-h" || arg == "--help" { + println!("Usage: {} definitions [FILES]", env!("CARGO_PKG_NAME")); + println!(" Extract English-language definitions from Wiktionary"); + println!(" data dump files, writing output to definitions.txt."); + println!(" Each line of the output file is of the format: Word Definition"); + println!( + " Note the two spaces—this avoids ambiguity when the word contains a space." + ); + println!(" A single Word can have multiple Definitions."); + println!(" If FILES is not specified, will use ./*wiktionary*.xml-p*"); + return Ok(()); + } + files.push(arg.to_owned()); + } + let files_from_pwd = || -> Result<Vec<String>, Box<dyn Error>> { + let mut files = vec![]; + for file in std::fs::read_dir(".")? { + let file = file?; + let mut r#type = file.file_type()?; + let name = file.file_name(); + if r#type.is_symlink() { + // get type of thing symlink is pointing to + r#type = std::fs::metadata(file.path())?.file_type(); + } + if !r#type.is_file() { + continue; + } + let Some(name) = name.to_str() else { + continue; + }; + if name.contains("wiktionary") && name.contains(".xml-p") { + files.push(name.into()); + } + } + files.sort(); + Ok(files) + }; + if files.is_empty() { + files = files_from_pwd() + .map_err(|e| format!("No files specified and couldn't list PWD ({e}). Aborting."))?; + println!("No files specified on command line."); + println!("These files were found in the PWD:"); + for file in &files { + println!(" {file}"); + } + print!("Proceed with these files [Y/n]? "); + _ = std::io::stdout().flush(); + let mut line = String::new(); + let result = std::io::stdin().read_line(&mut line); + let line = line.trim(); + if result.is_err() + || line + .chars() + .next() + .is_some_and(|c| c.to_lowercase().to_string() != "y") + { + return Err("Aborted.".into()); + } + } + for input_filename in &files { + let input = File::open(input_filename) + .map_err(|e| format!("Couldn't open {input_filename}: {e}"))?; + let reader = BufReader::new(input); + println!("Parsing {input_filename}..."); + parse_xml(reader, &mut output) + .map_err(|e| format!("Couldn't parse {input_filename}: {e}"))?; + } + println!("Sorting {} definitions...", output.definitions.len()); + output.sort(); + crate::do_write("definitions.txt", |writer| output.write_to(writer))?; + println!("Done!"); + Ok(()) +} diff --git a/src/main.rs b/src/main.rs index 3b9ffa6..f484ade 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,136 +1,9 @@ -use std::borrow::Cow; use std::error::Error; use std::fs::File; -use std::io::{self, BufReader, BufWriter, prelude::*}; +use std::io::BufWriter; use std::process::ExitCode; -#[derive(Default)] -struct Definitions { - definitions: Vec<(Box<str>, Box<str>)>, -} - -// replace sequences of 2+ spaces with a single space -fn compact_spaces(s: &str) -> String { - let mut s: String = s.into(); - // quite inefficient, but it doesn't really matter for our purposes. - while s.contains(" ") { - s = s.replace(" ", " ").to_owned(); - } - s -} - -impl Definitions { - fn add_definition(&mut self, word: &str, def: &str) { - self.definitions - .push((compact_spaces(word).into(), compact_spaces(def).into())) - } - fn sort(&mut self) { - self.definitions.sort_by(|x, y| x.0.cmp(&y.0)); - } - fn write_to(&mut self, mut w: impl Write) -> io::Result<()> { - for (title, definition) in &self.definitions { - writeln!(w, "{title} {definition}")?; - } - Ok(()) - } -} - -// remove HTML comments from string -#[must_use] -fn remove_comments(mut text: &str) -> Cow<str> { - if !text.contains("<!--") { - // (by far) most common case - return Cow::Borrowed(text); - } - let mut new_str = String::new(); - while let Some(comment_start) = text.find("<!--") { - new_str.push_str(&text[..comment_start]); - text = &text[comment_start..]; - let comment_end = text.find("-->").map_or(text.len(), |i| i + 3); - text = &text[comment_end..]; - } - new_str.push_str(text); - Cow::Owned(new_str) -} - -fn parse_xml(reader: impl BufRead, output: &mut Definitions) -> Result<(), Box<dyn Error>> { - let mut config = xml::reader::ParserConfig::default(); - config.cdata_to_characters = true; - #[derive(Debug, Clone, Copy)] - enum Tag { - Other, - Title, - Text, - Ns, - } - let mut tags = vec![]; - let mut title = String::new(); - let mut body = String::new(); - let mut ns = 1; - let mut ns_str = String::new(); - for event in xml::reader::EventReader::new_with_config(reader, config) { - let event = event?; - use xml::reader::XmlEvent; - match (&event, tags.last()) { - (XmlEvent::StartElement { name, .. }, _) => { - let name = name.local_name.as_str(); - match name { - "title" => { - tags.push(Tag::Title); - title.clear(); - } - "text" => tags.push(Tag::Text), - "ns" => { - tags.push(Tag::Ns); - ns_str.clear(); - } - _ => tags.push(Tag::Other), - } - } - (XmlEvent::Characters(s), Some(Tag::Title)) => { - title.push_str(s); - } - (XmlEvent::Characters(s), Some(Tag::Ns)) => { - ns_str.push_str(s); - } - (XmlEvent::Characters(s), Some(Tag::Text)) => { - body.push_str(s); - } - (XmlEvent::EndElement { name, .. }, _) => { - tags.pop(); - if name.local_name == "page" { - title.clear(); - } else if name.local_name == "text" { - if ns == 0 - && let Some(eng_start) = body.find("==English==\n") - { - let eng = &body[eng_start..]; - let eng_end = eng - .as_bytes() - .windows(4) - .position(|w| w.starts_with(b"\n==") && w[3] != b'=') - .unwrap_or(eng.len()); - let eng = &eng[..eng_end]; - for (i, w) in eng.as_bytes().windows(3).enumerate() { - if w != b"\n# " { - continue; - } - let definition = - eng[i + 3..].split_once('\n').map_or(&eng[i + 3..], |x| x.0); - let definition = remove_comments(definition); - output.add_definition(&title[..], &definition); - } - } - body.clear(); - } else if name.local_name == "ns" { - ns = ns_str.parse().unwrap_or(1); - } - } - _ => {} - } - } - Ok(()) -} +mod definitions; fn do_write<W, E: Error>(path: &str, write_func: W) -> Result<(), Box<dyn Error>> where @@ -147,84 +20,6 @@ where Ok(()) } -fn definitions(args: Vec<String>) -> Result<(), Box<dyn Error>> { - let mut output = Definitions::default(); - let mut files: Vec<String> = vec![]; - for arg in args { - if arg == "-h" || arg == "--help" { - println!("Usage: {} definitions [FILES]", env!("CARGO_PKG_NAME")); - println!(" Extract English-language definitions from Wiktionary"); - println!(" data dump files, writing output to definitions.txt."); - println!(" Each line of the output file is of the format: Word Definition"); - println!( - " Note the two spaces—this avoids ambiguity when the word contains a space." - ); - println!(" A single Word can have multiple Definitions."); - println!(" If FILES is not specified, will use ./*wiktionary*.xml-p*"); - return Ok(()); - } - files.push(arg.to_owned()); - } - let files_from_pwd = || -> Result<Vec<String>, Box<dyn Error>> { - let mut files = vec![]; - for file in std::fs::read_dir(".")? { - let file = file?; - let mut r#type = file.file_type()?; - let name = file.file_name(); - if r#type.is_symlink() { - // get type of thing symlink is pointing to - r#type = std::fs::metadata(file.path())?.file_type(); - } - if !r#type.is_file() { - continue; - } - let Some(name) = name.to_str() else { - continue; - }; - if name.contains("wiktionary") && name.contains(".xml-p") { - files.push(name.into()); - } - } - files.sort(); - Ok(files) - }; - if files.is_empty() { - files = files_from_pwd() - .map_err(|e| format!("No files specified and couldn't list PWD ({e}). Aborting."))?; - println!("No files specified on command line."); - println!("These files were found in the PWD:"); - for file in &files { - println!(" {file}"); - } - print!("Proceed with these files [Y/n]? "); - _ = std::io::stdout().flush(); - let mut line = String::new(); - let result = std::io::stdin().read_line(&mut line); - let line = line.trim(); - if result.is_err() - || line - .chars() - .next() - .is_some_and(|c| c.to_lowercase().to_string() != "y") - { - return Err("Aborted.".into()); - } - } - for input_filename in &files { - let input = File::open(input_filename) - .map_err(|e| format!("Couldn't open {input_filename}: {e}"))?; - let reader = BufReader::new(input); - println!("Parsing {input_filename}..."); - parse_xml(reader, &mut output) - .map_err(|e| format!("Couldn't parse {input_filename}: {e}"))?; - } - println!("Sorting {} definitions...", output.definitions.len()); - output.sort(); - do_write("definitions.txt", |writer| output.write_to(writer))?; - println!("Done!"); - Ok(()) -} - fn try_main() -> Result<(), Box<dyn Error>> { let mut args = std::env::args_os().skip(1); let command = args.next(); @@ -248,7 +43,7 @@ fn try_main() -> Result<(), Box<dyn Error>> { command_args.push(arg.to_owned()); } match &command.to_string_lossy()[..] { - "definitions" => definitions(command_args), + "definitions" => definitions::definitions(command_args), x => Err(format!("Unrecognized command: {x}").into()), } } |