Add parts of speech

author: pommicket <pommicket@gmail.com> 2025-09-25 13:57:37 -0400
committer: pommicket <pommicket@gmail.com> 2025-09-25 13:57:37 -0400
commit: 1d6462d9c03c620d24d113443d24fcfce984c817 (patch)
tree: 0f0e3740aee3857ecb652067cbf195d95e6951d8
parent: 2df6d43e44bb9852aa31764f998e28ea89a45267 (diff)
2 files changed, 419 insertions, 208 deletions
diff --git a/src/definitions.rs b/src/definitions.rs
new file mode 100644
index 0000000..bb7eb04
--- /dev/null
+++ b/src/definitions.rs
@@ -0,0 +1,416 @@
+use std::borrow::Cow;
+use std::error::Error;
+use std::fs::File;
+use std::io::{self, BufReader, prelude::*};
+
+/// Accumulates every extracted definition before it is sorted and written out.
+#[derive(Default)]
+struct Definitions {
+	/// One `(word, part of speech, definition text)` tuple per definition;
+	/// the same word may appear in several tuples.
+	definitions: Vec<(Box<str>, Section, Box<str>)>,
+}
+
+/// Replace every run of two or more spaces in `s` with a single space.
+///
+/// Only the space character (U+0020) is affected; tabs, newlines and other
+/// whitespace are copied through unchanged.
+fn compact_spaces(s: &str) -> String {
+	let mut compacted = String::with_capacity(s.len());
+	for c in s.chars() {
+		// Drop a space that directly follows a space we already kept, so each
+		// run collapses to one space in a single pass over the input.
+		if c == ' ' && compacted.ends_with(' ') {
+			continue;
+		}
+		compacted.push(c);
+	}
+	compacted
+}
+
+impl Definitions {
+	/// Record one definition of `word`, tagged with its part of speech.
+	/// Both the word and the definition go through `compact_spaces` first.
+	fn add_definition(&mut self, word: &str, part_of_speech: Section, def: &str) {
+		let word: Box<str> = compact_spaces(word).into();
+		let def: Box<str> = compact_spaces(def).into();
+		self.definitions.push((word, part_of_speech, def));
+	}
+	/// Order the entries by word. The sort is stable, so the definitions of a
+	/// single word keep the order in which they were added.
+	fn sort(&mut self) {
+		self.definitions.sort_by(|a, b| Ord::cmp(&a.0, &b.0));
+	}
+	/// Write one line per entry to `w`: the word, two spaces, `%` followed by
+	/// the part-of-speech tag, one space, then the definition.
+	/// Stops at, and returns, the first I/O error.
+	fn write_to(&mut self, mut w: impl Write) -> io::Result<()> {
+		self.definitions
+			.iter()
+			.try_for_each(|(title, part_of_speech, definition)| {
+				writeln!(w, "{title}  %{} {definition}", part_of_speech.to_str())
+			})
+	}
+}
+
+/// Strip HTML comments (`<!-- ... -->`) out of `text`.
+///
+/// Input without any `<!--` is returned borrowed, with no allocation. A
+/// comment that is never closed swallows the rest of the string.
+#[must_use]
+fn remove_comments(mut text: &str) -> Cow<'_, str> {
+	// (by far) most common case: nothing to strip
+	let Some(mut open) = text.find("<!--") else {
+		return Cow::Borrowed(text);
+	};
+	let mut kept = String::new();
+	loop {
+		kept.push_str(&text[..open]);
+		let from_open = &text[open..];
+		// The search for the terminator starts at the opening marker itself.
+		text = match from_open.find("-->") {
+			Some(close) => &from_open[close + 3..],
+			None => "",
+		};
+		match text.find("<!--") {
+			Some(next) => open = next,
+			None => break,
+		}
+	}
+	kept.push_str(text);
+	Cow::Owned(kept)
+}
+
+/// Classification of a section heading: either one of the recognised parts
+/// of speech, or a marker for a section that should be skipped.
+#[derive(Debug, Copy, Clone, PartialEq, Eq)]
+enum Section {
+	Adjective,
+	Noun,
+	ProperNoun,
+	Verb,
+	Adverb,
+	Interjection,
+	Conjunction,
+	PrepositionalPhrase,
+	Proverb,
+	Idiom,
+	Phrase,
+	Suffix,
+	Prefix,
+	Circumfix,
+	Infix,
+	Interfix,
+	Affix,
+	Pronoun,
+	Symbol,
+	Preposition,
+	PunctuationMark,
+	DiacriticalMark,
+	Determiner,
+	Participle,
+	Particle,
+	Contraction,
+	Letter,
+	Number,
+	// fallback for definitions whose part of speech could not be determined
+	UnknownPoS,
+	// a heading that holds no definitions (e.g. "Synonyms"); has no output tag
+	NotDefinition,
+}
+
+impl Section {
+	/// Map a section heading to a [`Section`].
+	///
+	/// `section` is the heading text (surrounding whitespace is ignored) and
+	/// `title` is the page it came from, used only in the diagnostic message.
+	/// Returns `None`, after printing a warning to stderr, when the heading is
+	/// not recognised.
+	fn from_name(section: &str, title: &str) -> Option<Self> {
+		use Section::*;
+		Some(match section.trim() {
+			// ("Adjectives" is a typo in 20250701, "Proper adjective" is basically non-existent)
+			"Adjective" | "Proper adjective" | "Adjectives" => Adjective,
+			"Noun" => Noun,
+			"Proper noun" => ProperNoun,
+			"Verb" | "Verb phrase" | "Verb form" => Verb,
+			"Adverbial phrase" | "Adverb" => Adverb,
+			"Interjection" => Interjection,
+			"Conjunction" => Conjunction,
+			"Prepositional phrase" => PrepositionalPhrase,
+			"Proverb" => Proverb,
+			"Suffix" => Suffix,
+			"Prefix" => Prefix,
+			"Circumfix" => Circumfix,
+			"Infix" => Infix,
+			"Interfix" => Interfix,
+			"Pronoun" => Pronoun,
+			"Phrase" => Phrase,
+			"Symbol" => Symbol,
+			"Preposition" => Preposition,
+			"Punctuation mark" => PunctuationMark,
+			"Diacritical mark" => DiacriticalMark,
+			"Article" | "Determiner" => Determiner,
+			"Participle" => Participle,
+			"Particle" => Particle,
+			"Contraction" => Contraction,
+			"Idiom" => Idiom,
+			"Letter" => Letter,
+			"Affix" | "Combining form" => Affix,
+			// currently at least ev (abbr. for even, ever, every) has this designation
+			"Multiple parts of speech" => UnknownPoS,
+			// 20250701 erroneously has "Proper noun 1" and "Proper Noun" and "Proper"
+			"Proper" | "Proper Noun" => ProperNoun,
+			x if x.starts_with("Proper noun ") => ProperNoun,
+			"Numeral" | "Number" => Number,
+			// headings that never introduce definitions
+			"See also"
+			| "Alternative forms"
+			| "Further reading"
+			| "References"
+			| "Anagrams"
+			| "Paronyms"
+			| "Quotations"
+			| "Related terms"
+			| "Derived terms"
+			| "Coordinate terms"
+			| "Usage notes"
+			| "Trivia"
+			| "Sources"
+			| "Citations"
+			| "Translations"
+			| "Attestations"
+			| "Attestation"
+			| "Meronyms"
+			| "Holonyms"
+			| "Hypernyms"
+			| "Hyponyms"
+			| "Antonyms"
+			| "Parasynonyms"
+			| "Synonyms"
+			| "Other names"
+			| "Homophones"
+			| "Collocations"
+			| "Derivations"
+			| "Notes"
+			| "Note"
+			| "Description"
+			| "Alternative spellings"
+			| "Alternative spelling"
+			| "Abbreviations"
+			| "External links"
+			| "Statistics"
+			| "Further information"
+			| "Descendants"
+			| "Gallery"
+			| "Dialects"
+			| "Usage"
+			| "Examples"
+			| "Conjugation"
+			| "Related forms"
+			| "Symbols"
+			| "Historical notes"
+			| "Troponyms"
+			| "Proper nouns"
+			| "Common nouns"
+			| "Sense overview"
+			| "Initialisms"
+			| "Comeronyms"
+			| "Near-synonyms"
+			| "Source"
+			| "Links"
+			| "Declension"
+			| "Synonyms and related terms"
+			| "Additional notes"
+			| "Related vocabulary" => NotDefinition,
+			x if x.starts_with("Pronunciation") || x.starts_with("Etymology") => NotDefinition,
+			// mistakes that exist in 20250701 dump
+			"Alternate forms" | "English" | "Etymyology" | "See Also" | "Usage Notes"
+			| "Translate" => NotDefinition,
+			_ => {
+				eprintln!("\x1b[1mUnrecognized section {section}\x1b[0m (in page {title})");
+				return None;
+			}
+		})
+	}
+	/// The tag written to the output for this part of speech. More specific
+	/// kinds are dotted under a general one (e.g. `affix.suffix`).
+	///
+	/// # Panics
+	///
+	/// Panics for [`Section::NotDefinition`], which has no tag.
+	fn to_str(self) -> &'static str {
+		use Section::*;
+		match self {
+			Adjective => "adjective",
+			Noun => "noun",
+			ProperNoun => "noun.proper",
+			Verb => "verb",
+			Adverb => "adverb",
+			Interjection => "interjection",
+			Conjunction => "conjunction",
+			PrepositionalPhrase => "phrase.prepositional",
+			Proverb => "phrase.proverb",
+			Idiom => "phrase.idiom",
+			Phrase => "phrase",
+			Suffix => "affix.suffix",
+			Prefix => "affix.prefix",
+			Circumfix => "affix.circumfix",
+			Infix => "affix.infix",
+			Interfix => "affix.interfix",
+			Affix => "affix",
+			Pronoun => "pronoun",
+			Symbol => "symbol",
+			Preposition => "preposition",
+			PunctuationMark => "punctuation",
+			DiacriticalMark => "diacritic",
+			Determiner => "determiner",
+			Participle => "participle",
+			Particle => "particle",
+			Contraction => "contraction",
+			Letter => "letter",
+			Number => "number",
+			UnknownPoS => "unknown",
+			// Reaching this arm is a bug in the caller, so say what went wrong
+			// instead of panicking without a message.
+			NotDefinition => panic!("Section::NotDefinition has no part-of-speech tag"),
+		}
+	}
+}
+
+/// Read one XML dump from `reader` and append every definition found in the
+/// English part of its pages to `output`.
+///
+/// Returns an error as soon as the XML parser reports one.
+fn parse_xml(reader: impl BufRead, output: &mut Definitions) -> Result<(), Box<dyn Error>> {
+	let mut config = xml::reader::ParserConfig::default();
+	// NOTE(review): going by the option's name, CDATA then arrives as ordinary
+	// `Characters` events; confirm against the xml crate's documentation.
+	config.cdata_to_characters = true;
+	// the kinds of element this parser tells apart
+	#[derive(Debug, Clone, Copy)]
+	enum Tag {
+		Other,
+		Title,
+		Text,
+		Ns,
+	}
+	// stack of currently open elements, innermost last
+	let mut tags = vec![];
+	let mut title = String::new();
+	let mut body = String::new();
+	// Definitions are only extracted while `ns` is 0, so starting at 1 means
+	// nothing is extracted before an <ns> element has been read.
+	// NOTE(review): 0 is presumably the dump's main (article) namespace; confirm.
+	let mut ns = 1;
+	let mut ns_str = String::new();
+	for event in xml::reader::EventReader::new_with_config(reader, config) {
+		let event = event?;
+		use xml::reader::XmlEvent;
+		match (&event, tags.last()) {
+			(XmlEvent::StartElement { name, .. }, _) => {
+				let name = name.local_name.as_str();
+				match name {
+					"title" => {
+						tags.push(Tag::Title);
+						title.clear();
+					}
+					"text" => tags.push(Tag::Text),
+					"ns" => {
+						tags.push(Tag::Ns);
+						ns_str.clear();
+					}
+					_ => tags.push(Tag::Other),
+				}
+			}
+			// character data is accumulated according to the innermost open element
+			(XmlEvent::Characters(s), Some(Tag::Title)) => {
+				title.push_str(s);
+			}
+			(XmlEvent::Characters(s), Some(Tag::Ns)) => {
+				ns_str.push_str(s);
+			}
+			(XmlEvent::Characters(s), Some(Tag::Text)) => {
+				body.push_str(s);
+			}
+			(XmlEvent::EndElement { name, .. }, _) => {
+				tags.pop();
+				if name.local_name == "page" {
+					title.clear();
+				} else if name.local_name == "text" {
+					// the whole text body has been collected; scan it if it has an English entry
+					if ns == 0
+						&& let Some(eng_start) = body.find("==English==\n")
+					{
+						let mut curr_section = None;
+						let eng = &body[eng_start..];
+						// the English entry ends at the next line that begins with `==`
+						// not followed by a third `=`
+						let eng_end = eng
+							.as_bytes()
+							.windows(4)
+							.position(|w| w.starts_with(b"\n==") && w[3] != b'=')
+							.unwrap_or(eng.len());
+						let eng = &eng[..eng_end];
+						for (i, w) in eng.as_bytes().windows(3).enumerate() {
+							// a line beginning with three or more `=` is a subsection heading
+							if w == b"\n==" && eng.get(i + 3..i + 4) == Some("=") {
+								let mut section = &eng[i + 3..];
+								while let Some(s) = section.strip_prefix('=') {
+									section = s;
+								}
+								// heading name = rest of the line up to its first `=`;
+								// a heading line without one leaves the current section as is
+								let Some((section, _)) = section
+									.split_once('\n')
+									.and_then(|(first_line, _)| first_line.split_once('='))
+								else {
+									continue;
+								};
+								curr_section = Section::from_name(section, &title);
+								continue;
+							}
+							if curr_section == Some(Section::NotDefinition) {
+								continue;
+							}
+							// only lines beginning with "# " are taken as definitions
+							if w != b"\n# " {
+								continue;
+							}
+							// the definition is the rest of that line
+							let definition =
+								eng[i + 3..].split_once('\n').map_or(&eng[i + 3..], |x| x.0);
+							let definition = remove_comments(definition);
+							if curr_section.is_none() {
+								eprintln!("\x1b[1mMissing part of speech\x1b[0m for {title}");
+							}
+							output.add_definition(
+								&title[..],
+								curr_section.unwrap_or(Section::UnknownPoS),
+								&definition,
+							);
+						}
+					}
+					body.clear();
+				} else if name.local_name == "ns" {
+					// an unparsable value falls back to 1, so the page's text is not scanned
+					ns = ns_str.parse().unwrap_or(1);
+				}
+			}
+			_ => {}
+		}
+	}
+	Ok(())
+}
+
+/// Entry point of the `definitions` command.
+///
+/// `args` are the command's own arguments: either `-h`/`--help`, or the dump
+/// files to read. With no files given, matching files in the current
+/// directory are listed and used after the user confirms. All definitions
+/// are collected, sorted by word and written to `definitions.txt`.
+///
+/// # Errors
+///
+/// Fails if the current directory cannot be listed, the user declines the
+/// prompt, a file cannot be opened or parsed, or the output cannot be written.
+pub fn definitions(args: Vec<String>) -> Result<(), Box<dyn Error>> {
+	let mut output = Definitions::default();
+	let mut files: Vec<String> = vec![];
+	for arg in args {
+		if arg == "-h" || arg == "--help" {
+			println!("Usage: {} definitions [FILES]", env!("CARGO_PKG_NAME"));
+			println!("    Extract English-language definitions from Wiktionary");
+			println!("    data dump files, writing output to definitions.txt.");
+			println!(
+				"    Each line of the output file is of the format: Word  %PartOfSpeech Definition"
+			);
+			println!("    where PartOfSpeech is a tag such as noun, verb, or phrase.idiom.");
+			println!(
+				"    Note the two spaces—this avoids ambiguity when the word contains a space."
+			);
+			println!("    A single Word can have multiple Definitions.");
+			println!("    If FILES is not specified, will use ./*wiktionary*.xml-p*");
+			return Ok(());
+		}
+		// `arg` is already an owned String; no copy needed
+		files.push(arg);
+	}
+	// Collect the regular files in the current directory whose names look like
+	// dump files, in sorted order.
+	let files_from_pwd = || -> Result<Vec<String>, Box<dyn Error>> {
+		let mut files = vec![];
+		for file in std::fs::read_dir(".")? {
+			let file = file?;
+			let mut r#type = file.file_type()?;
+			let name = file.file_name();
+			if r#type.is_symlink() {
+				// get type of thing symlink is pointing to
+				r#type = std::fs::metadata(file.path())?.file_type();
+			}
+			if !r#type.is_file() {
+				continue;
+			}
+			// names that are not valid UTF-8 are skipped
+			let Some(name) = name.to_str() else {
+				continue;
+			};
+			if name.contains("wiktionary") && name.contains(".xml-p") {
+				files.push(name.into());
+			}
+		}
+		files.sort();
+		Ok(files)
+	};
+	if files.is_empty() {
+		files = files_from_pwd()
+			.map_err(|e| format!("No files specified and couldn't list PWD ({e}). Aborting."))?;
+		println!("No files specified on command line.");
+		println!("These files were found in the PWD:");
+		for file in &files {
+			println!("  {file}");
+		}
+		print!("Proceed with these files [Y/n]? ");
+		// best effort: a failed flush only affects when the prompt appears
+		_ = std::io::stdout().flush();
+		let mut line = String::new();
+		let result = std::io::stdin().read_line(&mut line);
+		let line = line.trim();
+		// An empty answer counts as yes; anything not starting with y/Y aborts.
+		if result.is_err()
+			|| line
+				.chars()
+				.next()
+				.is_some_and(|c| c.to_lowercase().to_string() != "y")
+		{
+			return Err("Aborted.".into());
+		}
+	}
+	for input_filename in &files {
+		let input = File::open(input_filename)
+			.map_err(|e| format!("Couldn't open {input_filename}: {e}"))?;
+		let reader = BufReader::new(input);
+		println!("Parsing {input_filename}...");
+		parse_xml(reader, &mut output)
+			.map_err(|e| format!("Couldn't parse {input_filename}: {e}"))?;
+	}
+	println!("Sorting {} definitions...", output.definitions.len());
+	output.sort();
+	crate::do_write("definitions.txt", |writer| output.write_to(writer))?;
+	println!("Done!");
+	Ok(())
+}
diff --git a/src/main.rs b/src/main.rs
index 3b9ffa6..f484ade 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -1,136 +1,9 @@
-use std::borrow::Cow;
 use std::error::Error;
 use std::fs::File;
-use std::io::{self, BufReader, BufWriter, prelude::*};
+use std::io::BufWriter;
 use std::process::ExitCode;
 
-#[derive(Default)]
-struct Definitions {
-	definitions: Vec<(Box<str>, Box<str>)>,
-}
-
-// replace sequences of 2+ spaces with a single space
-fn compact_spaces(s: &str) -> String {
-	let mut s: String = s.into();
-	// quite inefficient, but it doesn't really matter for our purposes.
-	while s.contains("  ") {
-		s = s.replace("  ", " ").to_owned();
-	}
-	s
-}
-
-impl Definitions {
-	fn add_definition(&mut self, word: &str, def: &str) {
-		self.definitions
-			.push((compact_spaces(word).into(), compact_spaces(def).into()))
-	}
-	fn sort(&mut self) {
-		self.definitions.sort_by(|x, y| x.0.cmp(&y.0));
-	}
-	fn write_to(&mut self, mut w: impl Write) -> io::Result<()> {
-		for (title, definition) in &self.definitions {
-			writeln!(w, "{title}  {definition}")?;
-		}
-		Ok(())
-	}
-}
-
-// remove HTML comments from string
-#[must_use]
-fn remove_comments(mut text: &str) -> Cow<str> {
-	if !text.contains("<!--") {
-		// (by far) most common case
-		return Cow::Borrowed(text);
-	}
-	let mut new_str = String::new();
-	while let Some(comment_start) = text.find("<!--") {
-		new_str.push_str(&text[..comment_start]);
-		text = &text[comment_start..];
-		let comment_end = text.find("-->").map_or(text.len(), |i| i + 3);
-		text = &text[comment_end..];
-	}
-	new_str.push_str(text);
-	Cow::Owned(new_str)
-}
-
-fn parse_xml(reader: impl BufRead, output: &mut Definitions) -> Result<(), Box<dyn Error>> {
-	let mut config = xml::reader::ParserConfig::default();
-	config.cdata_to_characters = true;
-	#[derive(Debug, Clone, Copy)]
-	enum Tag {
-		Other,
-		Title,
-		Text,
-		Ns,
-	}
-	let mut tags = vec![];
-	let mut title = String::new();
-	let mut body = String::new();
-	let mut ns = 1;
-	let mut ns_str = String::new();
-	for event in xml::reader::EventReader::new_with_config(reader, config) {
-		let event = event?;
-		use xml::reader::XmlEvent;
-		match (&event, tags.last()) {
-			(XmlEvent::StartElement { name, .. }, _) => {
-				let name = name.local_name.as_str();
-				match name {
-					"title" => {
-						tags.push(Tag::Title);
-						title.clear();
-					}
-					"text" => tags.push(Tag::Text),
-					"ns" => {
-						tags.push(Tag::Ns);
-						ns_str.clear();
-					}
-					_ => tags.push(Tag::Other),
-				}
-			}
-			(XmlEvent::Characters(s), Some(Tag::Title)) => {
-				title.push_str(s);
-			}
-			(XmlEvent::Characters(s), Some(Tag::Ns)) => {
-				ns_str.push_str(s);
-			}
-			(XmlEvent::Characters(s), Some(Tag::Text)) => {
-				body.push_str(s);
-			}
-			(XmlEvent::EndElement { name, .. }, _) => {
-				tags.pop();
-				if name.local_name == "page" {
-					title.clear();
-				} else if name.local_name == "text" {
-					if ns == 0
-						&& let Some(eng_start) = body.find("==English==\n")
-					{
-						let eng = &body[eng_start..];
-						let eng_end = eng
-							.as_bytes()
-							.windows(4)
-							.position(|w| w.starts_with(b"\n==") && w[3] != b'=')
-							.unwrap_or(eng.len());
-						let eng = &eng[..eng_end];
-						for (i, w) in eng.as_bytes().windows(3).enumerate() {
-							if w != b"\n# " {
-								continue;
-							}
-							let definition =
-								eng[i + 3..].split_once('\n').map_or(&eng[i + 3..], |x| x.0);
-							let definition = remove_comments(definition);
-							output.add_definition(&title[..], &definition);
-						}
-					}
-					body.clear();
-				} else if name.local_name == "ns" {
-					ns = ns_str.parse().unwrap_or(1);
-				}
-			}
-			_ => {}
-		}
-	}
-	Ok(())
-}
+mod definitions;
 
 fn do_write<W, E: Error>(path: &str, write_func: W) -> Result<(), Box<dyn Error>>
 where
@@ -147,84 +20,6 @@ where
 	Ok(())
 }
 
-fn definitions(args: Vec<String>) -> Result<(), Box<dyn Error>> {
-	let mut output = Definitions::default();
-	let mut files: Vec<String> = vec![];
-	for arg in args {
-		if arg == "-h" || arg == "--help" {
-			println!("Usage: {} definitions [FILES]", env!("CARGO_PKG_NAME"));
-			println!("    Extract English-language definitions from Wiktionary");
-			println!("    data dump files, writing output to definitions.txt.");
-			println!("    Each line of the output file is of the format: Word  Definition");
-			println!(
-				"    Note the two spaces—this avoids ambiguity when the word contains a space."
-			);
-			println!("    A single Word can have multiple Definitions.");
-			println!("    If FILES is not specified, will use ./*wiktionary*.xml-p*");
-			return Ok(());
-		}
-		files.push(arg.to_owned());
-	}
-	let files_from_pwd = || -> Result<Vec<String>, Box<dyn Error>> {
-		let mut files = vec![];
-		for file in std::fs::read_dir(".")? {
-			let file = file?;
-			let mut r#type = file.file_type()?;
-			let name = file.file_name();
-			if r#type.is_symlink() {
-				// get type of thing symlink is pointing to
-				r#type = std::fs::metadata(file.path())?.file_type();
-			}
-			if !r#type.is_file() {
-				continue;
-			}
-			let Some(name) = name.to_str() else {
-				continue;
-			};
-			if name.contains("wiktionary") && name.contains(".xml-p") {
-				files.push(name.into());
-			}
-		}
-		files.sort();
-		Ok(files)
-	};
-	if files.is_empty() {
-		files = files_from_pwd()
-			.map_err(|e| format!("No files specified and couldn't list PWD ({e}). Aborting."))?;
-		println!("No files specified on command line.");
-		println!("These files were found in the PWD:");
-		for file in &files {
-			println!("  {file}");
-		}
-		print!("Proceed with these files [Y/n]? ");
-		_ = std::io::stdout().flush();
-		let mut line = String::new();
-		let result = std::io::stdin().read_line(&mut line);
-		let line = line.trim();
-		if result.is_err()
-			|| line
-				.chars()
-				.next()
-				.is_some_and(|c| c.to_lowercase().to_string() != "y")
-		{
-			return Err("Aborted.".into());
-		}
-	}
-	for input_filename in &files {
-		let input = File::open(input_filename)
-			.map_err(|e| format!("Couldn't open {input_filename}: {e}"))?;
-		let reader = BufReader::new(input);
-		println!("Parsing {input_filename}...");
-		parse_xml(reader, &mut output)
-			.map_err(|e| format!("Couldn't parse {input_filename}: {e}"))?;
-	}
-	println!("Sorting {} definitions...", output.definitions.len());
-	output.sort();
-	do_write("definitions.txt", |writer| output.write_to(writer))?;
-	println!("Done!");
-	Ok(())
-}
-
 fn try_main() -> Result<(), Box<dyn Error>> {
 	let mut args = std::env::args_os().skip(1);
 	let command = args.next();
@@ -248,7 +43,7 @@ fn try_main() -> Result<(), Box<dyn Error>> {
 		command_args.push(arg.to_owned());
 	}
 	match &command.to_string_lossy()[..] {
-		"definitions" => definitions(command_args),
+		"definitions" => definitions::definitions(command_args),
 		x => Err(format!("Unrecognized command: {x}").into()),
 	}
 }
author	pommicket <pommicket@gmail.com>	2025-09-25 13:57:37 -0400
committer	pommicket <pommicket@gmail.com>	2025-09-25 13:57:37 -0400
commit	1d6462d9c03c620d24d113443d24fcfce984c817 (patch)
tree	0f0e3740aee3857ecb652067cbf195d95e6951d8
parent	2df6d43e44bb9852aa31764f998e28ea89a45267 (diff)