summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--src/definitions.rs416
-rw-r--r--src/main.rs211
2 files changed, 419 insertions, 208 deletions
diff --git a/src/definitions.rs b/src/definitions.rs
new file mode 100644
index 0000000..bb7eb04
--- /dev/null
+++ b/src/definitions.rs
@@ -0,0 +1,416 @@
+use std::borrow::Cow;
+use std::error::Error;
+use std::fs::File;
+use std::io::{self, BufReader, prelude::*};
+
+#[derive(Default)]
+struct Definitions {
+ definitions: Vec<(Box<str>, Section, Box<str>)>,
+}
+
+// replace sequences of 2+ spaces with a single space
+fn compact_spaces(s: &str) -> String {
+ let mut s: String = s.into();
+ // quite inefficient, but it doesn't really matter for our purposes.
+ while s.contains(" ") {
+ s = s.replace(" ", " ").to_owned();
+ }
+ s
+}
+
+impl Definitions {
+ fn add_definition(&mut self, word: &str, part_of_speech: Section, def: &str) {
+ self.definitions.push((
+ compact_spaces(word).into(),
+ part_of_speech,
+ compact_spaces(def).into(),
+ ))
+ }
+ fn sort(&mut self) {
+ self.definitions.sort_by(|x, y| x.0.cmp(&y.0));
+ }
+ fn write_to(&mut self, mut w: impl Write) -> io::Result<()> {
+ for (title, part_of_speech, definition) in &self.definitions {
+ writeln!(w, "{title} %{} {definition}", part_of_speech.to_str())?;
+ }
+ Ok(())
+ }
+}
+
+// remove HTML comments from string
+#[must_use]
+fn remove_comments(mut text: &str) -> Cow<'_, str> {
+ if !text.contains("<!--") {
+ // (by far) most common case
+ return Cow::Borrowed(text);
+ }
+ let mut new_str = String::new();
+ while let Some(comment_start) = text.find("<!--") {
+ new_str.push_str(&text[..comment_start]);
+ text = &text[comment_start..];
+ let comment_end = text.find("-->").map_or(text.len(), |i| i + 3);
+ text = &text[comment_end..];
+ }
+ new_str.push_str(text);
+ Cow::Owned(new_str)
+}
+
+#[derive(Debug, Copy, Clone, PartialEq, Eq)]
+enum Section {
+ Adjective,
+ Noun,
+ ProperNoun,
+ Verb,
+ Adverb,
+ Interjection,
+ Conjunction,
+ PrepositionalPhrase,
+ Proverb,
+ Idiom,
+ Phrase,
+ Suffix,
+ Prefix,
+ Circumfix,
+ Infix,
+ Interfix,
+ Affix,
+ Pronoun,
+ Symbol,
+ Preposition,
+ PunctuationMark,
+ DiacriticalMark,
+ Determiner,
+ Participle,
+ Particle,
+ Contraction,
+ Letter,
+ Number,
+ UnknownPoS,
+ NotDefinition,
+}
+
+impl Section {
+ fn from_name(section: &str, title: &str) -> Option<Self> {
+ use Section::*;
+ Some(match section.trim() {
+ // ("Adjectives" is a typo in 20250701, "Proper adjective" is basically non-existent)
+ "Adjective" | "Proper adjective" | "Adjectives" => Adjective,
+ "Noun" => Noun,
+ "Proper noun" => ProperNoun,
+ "Verb" | "Verb phrase" | "Verb form" => Verb,
+ "Adverbial phrase" | "Adverb" => Adverb,
+ "Interjection" => Interjection,
+ "Conjunction" => Conjunction,
+ "Prepositional phrase" => PrepositionalPhrase,
+ "Proverb" => Proverb,
+ "Suffix" => Suffix,
+ "Prefix" => Prefix,
+ "Circumfix" => Circumfix,
+ "Infix" => Infix,
+ "Interfix" => Interfix,
+ "Pronoun" => Pronoun,
+ "Phrase" => Phrase,
+ "Symbol" => Symbol,
+ "Preposition" => Preposition,
+ "Punctuation mark" => PunctuationMark,
+ "Diacritical mark" => DiacriticalMark,
+ "Article" | "Determiner" => Determiner,
+ "Participle" => Participle,
+ "Particle" => Particle,
+ "Contraction" => Contraction,
+ "Idiom" => Idiom,
+ "Letter" => Letter,
+ "Affix" | "Combining form" => Affix,
+ // currently at least ev (abbr. for even, ever, every) has this designation
+ "Multiple parts of speech" => UnknownPoS,
+ // 20250701 erroneously has "Proper noun 1" and "Proper Noun" and "Proper"
+ "Proper" | "Proper Noun" => ProperNoun,
+ x if x.starts_with("Proper noun ") => ProperNoun,
+ "Numeral" | "Number" => Number,
+ "See also"
+ | "Alternative forms"
+ | "Further reading"
+ | "References"
+ | "Anagrams"
+ | "Paronyms"
+ | "Quotations"
+ | "Related terms"
+ | "Derived terms"
+ | "Coordinate terms"
+ | "Usage notes"
+ | "Trivia"
+ | "Sources"
+ | "Citations"
+ | "Translations"
+ | "Attestations"
+ | "Attestation"
+ | "Meronyms"
+ | "Holonyms"
+ | "Hypernyms"
+ | "Hyponyms"
+ | "Antonyms"
+ | "Parasynonyms"
+ | "Synonyms"
+ | "Other names"
+ | "Homophones"
+ | "Collocations"
+ | "Derivations"
+ | "Notes"
+ | "Note"
+ | "Description"
+ | "Alternative spellings"
+ | "Alternative spelling"
+ | "Abbreviations"
+ | "External links"
+ | "Statistics"
+ | "Further information"
+ | "Descendants"
+ | "Gallery"
+ | "Dialects"
+ | "Usage"
+ | "Examples"
+ | "Conjugation"
+ | "Related forms"
+ | "Symbols"
+ | "Historical notes"
+ | "Troponyms"
+ | "Proper nouns"
+ | "Common nouns"
+ | "Sense overview"
+ | "Initialisms"
+ | "Comeronyms"
+ | "Near-synonyms"
+ | "Source"
+ | "Links"
+ | "Declension"
+ | "Synonyms and related terms"
+ | "Additional notes"
+ | "Related vocabulary" => NotDefinition,
+ x if x.starts_with("Pronunciation") || x.starts_with("Etymology") => NotDefinition,
+ // mistakes that exist in 20250701 dump
+ "Alternate forms" | "English" | "Etymyology" | "See Also" | "Usage Notes"
+ | "Translate" => NotDefinition,
+ _ => {
+ eprintln!("\x1b[1mUnrecognized section {section}\x1b[0m (in page {title})");
+ return None;
+ }
+ })
+ }
+ fn to_str(self) -> &'static str {
+ use Section::*;
+ match self {
+ Adjective => "adjective",
+ Noun => "noun",
+ ProperNoun => "noun.proper",
+ Verb => "verb",
+ Adverb => "adverb",
+ Interjection => "interjection",
+ Conjunction => "conjunction",
+ PrepositionalPhrase => "phrase.prepositional",
+ Proverb => "phrase.proverb",
+ Idiom => "phrase.idiom",
+ Phrase => "phrase",
+ Suffix => "affix.suffix",
+ Prefix => "affix.prefix",
+ Circumfix => "affix.circumfix",
+ Infix => "affix.infix",
+ Interfix => "affix.interfix",
+ Affix => "affix",
+ Pronoun => "pronoun",
+ Symbol => "symbol",
+ Preposition => "preposition",
+ PunctuationMark => "punctuation",
+ DiacriticalMark => "diacritic",
+ Determiner => "determiner",
+ Participle => "participle",
+ Particle => "particle",
+ Contraction => "contraction",
+ Letter => "letter",
+ Number => "number",
+ UnknownPoS => "unknown",
+ NotDefinition => panic!(),
+ }
+ }
+}
+
+fn parse_xml(reader: impl BufRead, output: &mut Definitions) -> Result<(), Box<dyn Error>> {
+ let mut config = xml::reader::ParserConfig::default();
+ config.cdata_to_characters = true;
+ #[derive(Debug, Clone, Copy)]
+ enum Tag {
+ Other,
+ Title,
+ Text,
+ Ns,
+ }
+ let mut tags = vec![];
+ let mut title = String::new();
+ let mut body = String::new();
+ let mut ns = 1;
+ let mut ns_str = String::new();
+ for event in xml::reader::EventReader::new_with_config(reader, config) {
+ let event = event?;
+ use xml::reader::XmlEvent;
+ match (&event, tags.last()) {
+ (XmlEvent::StartElement { name, .. }, _) => {
+ let name = name.local_name.as_str();
+ match name {
+ "title" => {
+ tags.push(Tag::Title);
+ title.clear();
+ }
+ "text" => tags.push(Tag::Text),
+ "ns" => {
+ tags.push(Tag::Ns);
+ ns_str.clear();
+ }
+ _ => tags.push(Tag::Other),
+ }
+ }
+ (XmlEvent::Characters(s), Some(Tag::Title)) => {
+ title.push_str(s);
+ }
+ (XmlEvent::Characters(s), Some(Tag::Ns)) => {
+ ns_str.push_str(s);
+ }
+ (XmlEvent::Characters(s), Some(Tag::Text)) => {
+ body.push_str(s);
+ }
+ (XmlEvent::EndElement { name, .. }, _) => {
+ tags.pop();
+ if name.local_name == "page" {
+ title.clear();
+ } else if name.local_name == "text" {
+ if ns == 0
+ && let Some(eng_start) = body.find("==English==\n")
+ {
+ let mut curr_section = None;
+ let eng = &body[eng_start..];
+ let eng_end = eng
+ .as_bytes()
+ .windows(4)
+ .position(|w| w.starts_with(b"\n==") && w[3] != b'=')
+ .unwrap_or(eng.len());
+ let eng = &eng[..eng_end];
+ for (i, w) in eng.as_bytes().windows(3).enumerate() {
+ if w == b"\n==" && eng.get(i + 3..i + 4) == Some("=") {
+ let mut section = &eng[i + 3..];
+ while let Some(s) = section.strip_prefix('=') {
+ section = s;
+ }
+ let Some((section, _)) = section
+ .split_once('\n')
+ .and_then(|(first_line, _)| first_line.split_once('='))
+ else {
+ continue;
+ };
+ curr_section = Section::from_name(section, &title);
+ continue;
+ }
+ if curr_section == Some(Section::NotDefinition) {
+ continue;
+ }
+ if w != b"\n# " {
+ continue;
+ }
+ let definition =
+ eng[i + 3..].split_once('\n').map_or(&eng[i + 3..], |x| x.0);
+ let definition = remove_comments(definition);
+ if curr_section.is_none() {
+ eprintln!("\x1b[1mMissing part of speech\x1b[0m for {title}");
+ }
+ output.add_definition(
+ &title[..],
+ curr_section.unwrap_or(Section::UnknownPoS),
+ &definition,
+ );
+ }
+ }
+ body.clear();
+ } else if name.local_name == "ns" {
+ ns = ns_str.parse().unwrap_or(1);
+ }
+ }
+ _ => {}
+ }
+ }
+ Ok(())
+}
+
+pub fn definitions(args: Vec<String>) -> Result<(), Box<dyn Error>> {
+ let mut output = Definitions::default();
+ let mut files: Vec<String> = vec![];
+ for arg in args {
+ if arg == "-h" || arg == "--help" {
+ println!("Usage: {} definitions [FILES]", env!("CARGO_PKG_NAME"));
+ println!(" Extract English-language definitions from Wiktionary");
+ println!(" data dump files, writing output to definitions.txt.");
+ println!(" Each line of the output file is of the format: Word Definition");
+ println!(
+ " Note the two spaces—this avoids ambiguity when the word contains a space."
+ );
+ println!(" A single Word can have multiple Definitions.");
+ println!(" If FILES is not specified, will use ./*wiktionary*.xml-p*");
+ return Ok(());
+ }
+ files.push(arg.to_owned());
+ }
+ let files_from_pwd = || -> Result<Vec<String>, Box<dyn Error>> {
+ let mut files = vec![];
+ for file in std::fs::read_dir(".")? {
+ let file = file?;
+ let mut r#type = file.file_type()?;
+ let name = file.file_name();
+ if r#type.is_symlink() {
+ // get type of thing symlink is pointing to
+ r#type = std::fs::metadata(file.path())?.file_type();
+ }
+ if !r#type.is_file() {
+ continue;
+ }
+ let Some(name) = name.to_str() else {
+ continue;
+ };
+ if name.contains("wiktionary") && name.contains(".xml-p") {
+ files.push(name.into());
+ }
+ }
+ files.sort();
+ Ok(files)
+ };
+ if files.is_empty() {
+ files = files_from_pwd()
+ .map_err(|e| format!("No files specified and couldn't list PWD ({e}). Aborting."))?;
+ println!("No files specified on command line.");
+ println!("These files were found in the PWD:");
+ for file in &files {
+ println!(" {file}");
+ }
+ print!("Proceed with these files [Y/n]? ");
+ _ = std::io::stdout().flush();
+ let mut line = String::new();
+ let result = std::io::stdin().read_line(&mut line);
+ let line = line.trim();
+ if result.is_err()
+ || line
+ .chars()
+ .next()
+ .is_some_and(|c| c.to_lowercase().to_string() != "y")
+ {
+ return Err("Aborted.".into());
+ }
+ }
+ for input_filename in &files {
+ let input = File::open(input_filename)
+ .map_err(|e| format!("Couldn't open {input_filename}: {e}"))?;
+ let reader = BufReader::new(input);
+ println!("Parsing {input_filename}...");
+ parse_xml(reader, &mut output)
+ .map_err(|e| format!("Couldn't parse {input_filename}: {e}"))?;
+ }
+ println!("Sorting {} definitions...", output.definitions.len());
+ output.sort();
+ crate::do_write("definitions.txt", |writer| output.write_to(writer))?;
+ println!("Done!");
+ Ok(())
+}
diff --git a/src/main.rs b/src/main.rs
index 3b9ffa6..f484ade 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -1,136 +1,9 @@
-use std::borrow::Cow;
use std::error::Error;
use std::fs::File;
-use std::io::{self, BufReader, BufWriter, prelude::*};
+use std::io::BufWriter;
use std::process::ExitCode;
-#[derive(Default)]
-struct Definitions {
- definitions: Vec<(Box<str>, Box<str>)>,
-}
-
-// replace sequences of 2+ spaces with a single space
-fn compact_spaces(s: &str) -> String {
- let mut s: String = s.into();
- // quite inefficient, but it doesn't really matter for our purposes.
- while s.contains(" ") {
- s = s.replace(" ", " ").to_owned();
- }
- s
-}
-
-impl Definitions {
- fn add_definition(&mut self, word: &str, def: &str) {
- self.definitions
- .push((compact_spaces(word).into(), compact_spaces(def).into()))
- }
- fn sort(&mut self) {
- self.definitions.sort_by(|x, y| x.0.cmp(&y.0));
- }
- fn write_to(&mut self, mut w: impl Write) -> io::Result<()> {
- for (title, definition) in &self.definitions {
- writeln!(w, "{title} {definition}")?;
- }
- Ok(())
- }
-}
-
-// remove HTML comments from string
-#[must_use]
-fn remove_comments(mut text: &str) -> Cow<str> {
- if !text.contains("<!--") {
- // (by far) most common case
- return Cow::Borrowed(text);
- }
- let mut new_str = String::new();
- while let Some(comment_start) = text.find("<!--") {
- new_str.push_str(&text[..comment_start]);
- text = &text[comment_start..];
- let comment_end = text.find("-->").map_or(text.len(), |i| i + 3);
- text = &text[comment_end..];
- }
- new_str.push_str(text);
- Cow::Owned(new_str)
-}
-
-fn parse_xml(reader: impl BufRead, output: &mut Definitions) -> Result<(), Box<dyn Error>> {
- let mut config = xml::reader::ParserConfig::default();
- config.cdata_to_characters = true;
- #[derive(Debug, Clone, Copy)]
- enum Tag {
- Other,
- Title,
- Text,
- Ns,
- }
- let mut tags = vec![];
- let mut title = String::new();
- let mut body = String::new();
- let mut ns = 1;
- let mut ns_str = String::new();
- for event in xml::reader::EventReader::new_with_config(reader, config) {
- let event = event?;
- use xml::reader::XmlEvent;
- match (&event, tags.last()) {
- (XmlEvent::StartElement { name, .. }, _) => {
- let name = name.local_name.as_str();
- match name {
- "title" => {
- tags.push(Tag::Title);
- title.clear();
- }
- "text" => tags.push(Tag::Text),
- "ns" => {
- tags.push(Tag::Ns);
- ns_str.clear();
- }
- _ => tags.push(Tag::Other),
- }
- }
- (XmlEvent::Characters(s), Some(Tag::Title)) => {
- title.push_str(s);
- }
- (XmlEvent::Characters(s), Some(Tag::Ns)) => {
- ns_str.push_str(s);
- }
- (XmlEvent::Characters(s), Some(Tag::Text)) => {
- body.push_str(s);
- }
- (XmlEvent::EndElement { name, .. }, _) => {
- tags.pop();
- if name.local_name == "page" {
- title.clear();
- } else if name.local_name == "text" {
- if ns == 0
- && let Some(eng_start) = body.find("==English==\n")
- {
- let eng = &body[eng_start..];
- let eng_end = eng
- .as_bytes()
- .windows(4)
- .position(|w| w.starts_with(b"\n==") && w[3] != b'=')
- .unwrap_or(eng.len());
- let eng = &eng[..eng_end];
- for (i, w) in eng.as_bytes().windows(3).enumerate() {
- if w != b"\n# " {
- continue;
- }
- let definition =
- eng[i + 3..].split_once('\n').map_or(&eng[i + 3..], |x| x.0);
- let definition = remove_comments(definition);
- output.add_definition(&title[..], &definition);
- }
- }
- body.clear();
- } else if name.local_name == "ns" {
- ns = ns_str.parse().unwrap_or(1);
- }
- }
- _ => {}
- }
- }
- Ok(())
-}
+mod definitions;
fn do_write<W, E: Error>(path: &str, write_func: W) -> Result<(), Box<dyn Error>>
where
@@ -147,84 +20,6 @@ where
Ok(())
}
-fn definitions(args: Vec<String>) -> Result<(), Box<dyn Error>> {
- let mut output = Definitions::default();
- let mut files: Vec<String> = vec![];
- for arg in args {
- if arg == "-h" || arg == "--help" {
- println!("Usage: {} definitions [FILES]", env!("CARGO_PKG_NAME"));
- println!(" Extract English-language definitions from Wiktionary");
- println!(" data dump files, writing output to definitions.txt.");
- println!(" Each line of the output file is of the format: Word Definition");
- println!(
- " Note the two spaces—this avoids ambiguity when the word contains a space."
- );
- println!(" A single Word can have multiple Definitions.");
- println!(" If FILES is not specified, will use ./*wiktionary*.xml-p*");
- return Ok(());
- }
- files.push(arg.to_owned());
- }
- let files_from_pwd = || -> Result<Vec<String>, Box<dyn Error>> {
- let mut files = vec![];
- for file in std::fs::read_dir(".")? {
- let file = file?;
- let mut r#type = file.file_type()?;
- let name = file.file_name();
- if r#type.is_symlink() {
- // get type of thing symlink is pointing to
- r#type = std::fs::metadata(file.path())?.file_type();
- }
- if !r#type.is_file() {
- continue;
- }
- let Some(name) = name.to_str() else {
- continue;
- };
- if name.contains("wiktionary") && name.contains(".xml-p") {
- files.push(name.into());
- }
- }
- files.sort();
- Ok(files)
- };
- if files.is_empty() {
- files = files_from_pwd()
- .map_err(|e| format!("No files specified and couldn't list PWD ({e}). Aborting."))?;
- println!("No files specified on command line.");
- println!("These files were found in the PWD:");
- for file in &files {
- println!(" {file}");
- }
- print!("Proceed with these files [Y/n]? ");
- _ = std::io::stdout().flush();
- let mut line = String::new();
- let result = std::io::stdin().read_line(&mut line);
- let line = line.trim();
- if result.is_err()
- || line
- .chars()
- .next()
- .is_some_and(|c| c.to_lowercase().to_string() != "y")
- {
- return Err("Aborted.".into());
- }
- }
- for input_filename in &files {
- let input = File::open(input_filename)
- .map_err(|e| format!("Couldn't open {input_filename}: {e}"))?;
- let reader = BufReader::new(input);
- println!("Parsing {input_filename}...");
- parse_xml(reader, &mut output)
- .map_err(|e| format!("Couldn't parse {input_filename}: {e}"))?;
- }
- println!("Sorting {} definitions...", output.definitions.len());
- output.sort();
- do_write("definitions.txt", |writer| output.write_to(writer))?;
- println!("Done!");
- Ok(())
-}
-
fn try_main() -> Result<(), Box<dyn Error>> {
let mut args = std::env::args_os().skip(1);
let command = args.next();
@@ -248,7 +43,7 @@ fn try_main() -> Result<(), Box<dyn Error>> {
command_args.push(arg.to_owned());
}
match &command.to_string_lossy()[..] {
- "definitions" => definitions(command_args),
+ "definitions" => definitions::definitions(command_args),
x => Err(format!("Unrecognized command: {x}").into()),
}
}