1 files changed, 416 insertions, 0 deletions
diff --git a/src/definitions.rs b/src/definitions.rs
new file mode 100644
index 0000000..bb7eb04
--- /dev/null
+++ b/src/definitions.rs
@@ -0,0 +1,416 @@
+use std::borrow::Cow;
+use std::error::Error;
+use std::fs::File;
+use std::io::{self, BufReader, prelude::*};
+
+#[derive(Default)]
+struct Definitions { // accumulates every definition extracted from the parsed input
+	definitions: Vec<(Box<str>, Section, Box<str>)>, // (word, part of speech, definition text)
+}
+
+/// Collapses every run of two or more consecutive spaces in `s` into a single space.
+///
+/// Only the space character (U+0020) is affected; tabs, newlines and other
+/// whitespace are copied through unchanged. Leading and trailing runs are
+/// collapsed to one space, not removed.
+fn compact_spaces(s: &str) -> String {
+	// single pass; the result can never be longer than the input
+	let mut compacted = String::with_capacity(s.len());
+	let mut prev_was_space = false;
+	for c in s.chars() {
+		let is_space = c == ' ';
+		// drop a space that directly follows another space
+		if !(is_space && prev_was_space) {
+			compacted.push(c);
+		}
+		prev_was_space = is_space;
+	}
+	compacted
+}
+
+impl Definitions {
+	/// Records one definition of `word`, tagged with its part of speech.
+	///
+	/// Both `word` and `def` are passed through `compact_spaces` before being stored.
+	fn add_definition(&mut self, word: &str, part_of_speech: Section, def: &str) {
+		let word: Box<str> = compact_spaces(word).into();
+		let def: Box<str> = compact_spaces(def).into();
+		self.definitions.push((word, part_of_speech, def));
+	}
+
+	/// Sorts the stored definitions by word.
+	///
+	/// The sort is stable, so the definitions of any one word keep the order
+	/// in which they were added.
+	fn sort(&mut self) {
+		self.definitions.sort_by(|(a, ..), (b, ..)| a.cmp(b));
+	}
+
+	/// Writes one line per stored definition to `w`, in stored order.
+	///
+	/// Each line is the word, two spaces, `%` followed by the part-of-speech
+	/// tag from `Section::to_str`, one space, then the definition text.
+	/// Stops at, and returns, the first write error.
+	fn write_to(&mut self, mut w: impl Write) -> io::Result<()> {
+		self.definitions
+			.iter()
+			.try_for_each(|(title, part_of_speech, definition)| {
+				let tag = part_of_speech.to_str();
+				writeln!(w, "{title}  %{tag} {definition}")
+			})
+	}
+}
+
+/// Returns `text` with every HTML comment (`<!-- ... -->`) removed.
+///
+/// Text without any comment opener is returned borrowed, without allocating.
+/// A comment that is never closed swallows everything up to the end of `text`.
+#[must_use]
+fn remove_comments(mut text: &str) -> Cow<'_, str> {
+	const OPEN: &str = "<!--";
+	const CLOSE: &str = "-->";
+	// (by far) most common case: nothing to strip
+	let Some(first_open) = text.find(OPEN) else {
+		return Cow::Borrowed(text);
+	};
+	let mut stripped = String::new();
+	let mut next_open = Some(first_open);
+	while let Some(open_at) = next_open {
+		let (before, comment) = text.split_at(open_at);
+		stripped.push_str(before);
+		// the search for the closer starts at the opener itself, so `<!-->` counts as a whole comment
+		text = match comment.find(CLOSE) {
+			Some(close_at) => &comment[close_at + CLOSE.len()..],
+			None => "",
+		};
+		next_open = text.find(OPEN);
+	}
+	stripped.push_str(text);
+	Cow::Owned(stripped)
+}
+
+#[derive(Debug, Copy, Clone, PartialEq, Eq)]
+enum Section { // classification of a section heading inside a page's English entry
+	Adjective,
+	Noun,
+	ProperNoun,
+	Verb, // also used for "Verb phrase" and "Verb form" headings
+	Adverb, // also used for "Adverbial phrase" headings
+	Interjection,
+	Conjunction,
+	PrepositionalPhrase,
+	Proverb,
+	Idiom,
+	Phrase,
+	Suffix,
+	Prefix,
+	Circumfix,
+	Infix,
+	Interfix,
+	Affix, // also used for "Combining form" headings
+	Pronoun,
+	Symbol,
+	Preposition,
+	PunctuationMark,
+	DiacriticalMark,
+	Determiner, // also used for "Article" headings
+	Participle,
+	Particle,
+	Contraction,
+	Letter,
+	Number, // "Numeral" or "Number" headings
+	UnknownPoS, // "Multiple parts of speech", or a definition found with no recognized heading before it
+	NotDefinition, // heading of a section that holds no definitions (etymology, synonyms, ...); has no output tag
+}
+
+impl Section {
+	fn from_name(section: &str, title: &str) -> Option<Self> { // classify heading text `section` (trimmed first); `title` only appears in the warning
+		use Section::*;
+		Some(match section.trim() {
+			// ("Adjectives" is a typo in 20250701, "Proper adjective" is basically non-existent)
+			"Adjective" | "Proper adjective" | "Adjectives" => Adjective,
+			"Noun" => Noun,
+			"Proper noun" => ProperNoun,
+			"Verb" | "Verb phrase" | "Verb form" => Verb,
+			"Adverbial phrase" | "Adverb" => Adverb,
+			"Interjection" => Interjection,
+			"Conjunction" => Conjunction,
+			"Prepositional phrase" => PrepositionalPhrase,
+			"Proverb" => Proverb,
+			"Suffix" => Suffix,
+			"Prefix" => Prefix,
+			"Circumfix" => Circumfix,
+			"Infix" => Infix,
+			"Interfix" => Interfix,
+			"Pronoun" => Pronoun,
+			"Phrase" => Phrase,
+			"Symbol" => Symbol,
+			"Preposition" => Preposition,
+			"Punctuation mark" => PunctuationMark,
+			"Diacritical mark" => DiacriticalMark,
+			"Article" | "Determiner" => Determiner,
+			"Participle" => Participle,
+			"Particle" => Particle,
+			"Contraction" => Contraction,
+			"Idiom" => Idiom,
+			"Letter" => Letter,
+			"Affix" | "Combining form" => Affix,
+			// currently at least ev (abbr. for even, ever, every) has this designation
+			"Multiple parts of speech" => UnknownPoS,
+			// 20250701 erroneously has "Proper noun 1" and "Proper Noun" and "Proper"
+			"Proper" | "Proper Noun" => ProperNoun,
+			x if x.starts_with("Proper noun ") => ProperNoun,
+			"Numeral" | "Number" => Number,
+			"See also"
+			| "Alternative forms"
+			| "Further reading"
+			| "References"
+			| "Anagrams"
+			| "Paronyms"
+			| "Quotations"
+			| "Related terms"
+			| "Derived terms"
+			| "Coordinate terms"
+			| "Usage notes"
+			| "Trivia"
+			| "Sources"
+			| "Citations"
+			| "Translations"
+			| "Attestations"
+			| "Attestation"
+			| "Meronyms"
+			| "Holonyms"
+			| "Hypernyms"
+			| "Hyponyms"
+			| "Antonyms"
+			| "Parasynonyms"
+			| "Synonyms"
+			| "Other names"
+			| "Homophones"
+			| "Collocations"
+			| "Derivations"
+			| "Notes"
+			| "Note"
+			| "Description"
+			| "Alternative spellings"
+			| "Alternative spelling"
+			| "Abbreviations"
+			| "External links"
+			| "Statistics"
+			| "Further information"
+			| "Descendants"
+			| "Gallery"
+			| "Dialects"
+			| "Usage"
+			| "Examples"
+			| "Conjugation"
+			| "Related forms"
+			| "Symbols"
+			| "Historical notes"
+			| "Troponyms"
+			| "Proper nouns"
+			| "Common nouns"
+			| "Sense overview"
+			| "Initialisms"
+			| "Comeronyms"
+			| "Near-synonyms"
+			| "Source"
+			| "Links"
+			| "Declension"
+			| "Synonyms and related terms"
+			| "Additional notes"
+			| "Related vocabulary" => NotDefinition,
+			x if x.starts_with("Pronunciation") || x.starts_with("Etymology") => NotDefinition, // prefix match also covers suffixed variants of these headings
+			// mistakes that exist in 20250701 dump
+			"Alternate forms" | "English" | "Etymyology" | "See Also" | "Usage Notes"
+			| "Translate" => NotDefinition,
+			_ => { // unknown heading: warn on stderr (bold via ANSI escapes) and report None
+				eprintln!("\x1b[1mUnrecognized section {section}\x1b[0m (in page {title})");
+				return None;
+			}
+		})
+	}
+	fn to_str(self) -> &'static str { // tag written after `%` in the output; panics for NotDefinition
+		use Section::*;
+		match self {
+			Adjective => "adjective",
+			Noun => "noun",
+			ProperNoun => "noun.proper",
+			Verb => "verb",
+			Adverb => "adverb",
+			Interjection => "interjection",
+			Conjunction => "conjunction",
+			PrepositionalPhrase => "phrase.prepositional",
+			Proverb => "phrase.proverb",
+			Idiom => "phrase.idiom",
+			Phrase => "phrase",
+			Suffix => "affix.suffix",
+			Prefix => "affix.prefix",
+			Circumfix => "affix.circumfix",
+			Infix => "affix.infix",
+			Interfix => "affix.interfix",
+			Affix => "affix",
+			Pronoun => "pronoun",
+			Symbol => "symbol",
+			Preposition => "preposition",
+			PunctuationMark => "punctuation",
+			DiacriticalMark => "diacritic",
+			Determiner => "determiner",
+			Participle => "participle",
+			Particle => "particle",
+			Contraction => "contraction",
+			Letter => "letter",
+			Number => "number",
+			UnknownPoS => "unknown",
+			NotDefinition => panic!(), // NotDefinition is not a part of speech and has no tag
+		}
+	}
+}
+
+/// Reads one XML dump from `reader` and appends every English definition
+/// found in it to `output`.
+///
+/// Only `<text>` elements seen while the most recent `<ns>` value is 0 are
+/// examined. Within such a text, the part from `==English==` up to the next
+/// level-2 heading is scanned: deeper headings (`===Name===` and below)
+/// select the current section, and every line starting with `# ` outside a
+/// `NotDefinition` section is recorded under the current page title.
+///
+/// Returns an error as soon as the XML reader yields one.
+fn parse_xml(reader: impl BufRead, output: &mut Definitions) -> Result<(), Box<dyn Error>> {
+	use xml::reader::{EventReader, ParserConfig, XmlEvent};
+
+	// kind of each currently open element; only the innermost one is consulted
+	#[derive(Debug, Clone, Copy)]
+	enum Tag {
+		Other,
+		Title,
+		Text,
+		Ns,
+	}
+
+	// Scans the English part of one page's wikitext and records its definitions.
+	fn record_english_definitions(title: &str, body: &str, output: &mut Definitions) {
+		let Some(english_start) = body.find("==English==\n") else {
+			return;
+		};
+		let english = &body[english_start..];
+		// the English part ends just before the next level-2 heading ("\n==" not followed by '=')
+		let english_end = english
+			.as_bytes()
+			.windows(4)
+			.position(|w| w.starts_with(b"\n==") && w[3] != b'=')
+			.unwrap_or(english.len());
+		let english = &english[..english_end];
+		let mut current: Option<Section> = None;
+		for (pos, window) in english.as_bytes().windows(3).enumerate() {
+			let after = pos + 3;
+			if window == b"\n==" && english.get(after..after + 1) == Some("=") {
+				// sub-heading: skip the remaining leading '='s, then take the text
+				// up to the first '=' on that line
+				let heading = english[after..].trim_start_matches('=');
+				let name = heading
+					.split_once('\n')
+					.and_then(|(line, _)| line.split_once('='))
+					.map(|(name, _)| name);
+				if let Some(name) = name {
+					current = Section::from_name(name, title);
+				}
+			} else if window == b"\n# " && current != Some(Section::NotDefinition) {
+				// definition line: everything after "# " up to the end of the line
+				let rest = &english[after..];
+				let line = rest.split_once('\n').map_or(rest, |(line, _)| line);
+				let definition = remove_comments(line);
+				if current.is_none() {
+					eprintln!("\x1b[1mMissing part of speech\x1b[0m for {title}");
+				}
+				output.add_definition(
+					title,
+					current.unwrap_or(Section::UnknownPoS),
+					&definition,
+				);
+			}
+		}
+	}
+
+	let mut config = ParserConfig::default();
+	config.cdata_to_characters = true;
+	let mut open_tags = Vec::new();
+	let mut title = String::new();
+	let mut body = String::new();
+	let mut ns_str = String::new();
+	let mut ns = 1;
+	for event in EventReader::new_with_config(reader, config) {
+		match event? {
+			XmlEvent::StartElement { name, .. } => {
+				let tag = match name.local_name.as_str() {
+					"title" => {
+						title.clear();
+						Tag::Title
+					}
+					"text" => Tag::Text,
+					"ns" => {
+						ns_str.clear();
+						Tag::Ns
+					}
+					_ => Tag::Other,
+				};
+				open_tags.push(tag);
+			}
+			// character data is appended to whichever buffer the innermost open element owns
+			XmlEvent::Characters(chars) => match open_tags.last() {
+				Some(Tag::Title) => title.push_str(&chars),
+				Some(Tag::Ns) => ns_str.push_str(&chars),
+				Some(Tag::Text) => body.push_str(&chars),
+				Some(Tag::Other) | None => {}
+			},
+			XmlEvent::EndElement { name, .. } => {
+				open_tags.pop();
+				match name.local_name.as_str() {
+					"page" => title.clear(),
+					"text" => {
+						if ns == 0 {
+							record_english_definitions(&title, &body, output);
+						}
+						body.clear();
+					}
+					// an unparsable namespace value is treated as 1, i.e. the text is skipped
+					"ns" => ns = ns_str.parse().unwrap_or(1),
+					_ => {}
+				}
+			}
+			_ => {}
+		}
+	}
+	Ok(())
+}
+
+/// Entry point of the `definitions` subcommand.
+///
+/// `args` are the remaining command-line arguments: input file names, or
+/// `-h`/`--help` to print usage and return. When no file is named, the current
+/// directory is searched for regular files (or symlinks to them) whose name
+/// contains both `wiktionary` and `.xml-p`, and the user is asked to confirm
+/// the list. Every input is parsed with `parse_xml`; the collected definitions
+/// are sorted and handed to `crate::do_write` under the name `definitions.txt`.
+///
+/// # Errors
+///
+/// Returns an error if the directory cannot be listed, if no input file is
+/// given or found, if the user declines the prompt, if a file cannot be opened
+/// or parsed, or if `crate::do_write` fails.
+pub fn definitions(args: Vec<String>) -> Result<(), Box<dyn Error>> {
+	let mut output = Definitions::default();
+	let mut files: Vec<String> = vec![];
+	for arg in args {
+		if arg == "-h" || arg == "--help" {
+			println!("Usage: {} definitions [FILES]", env!("CARGO_PKG_NAME"));
+			println!("    Extract English-language definitions from Wiktionary");
+			println!("    data dump files, writing output to definitions.txt.");
+			println!(
+				"    Each line of the output file is of the format: Word  %PartOfSpeech Definition"
+			);
+			println!(
+				"    Note the two spaces—this avoids ambiguity when the word contains a space."
+			);
+			println!("    A single Word can have multiple Definitions.");
+			println!("    If FILES is not specified, will use ./*wiktionary*.xml-p*");
+			return Ok(());
+		}
+		// `arg` is already an owned String, so it is moved rather than cloned
+		files.push(arg);
+	}
+	// lists candidate dump files in the current directory, sorted by name
+	let files_from_pwd = || -> Result<Vec<String>, Box<dyn Error>> {
+		let mut files = vec![];
+		for file in std::fs::read_dir(".")? {
+			let file = file?;
+			let mut r#type = file.file_type()?;
+			let name = file.file_name();
+			if r#type.is_symlink() {
+				// get type of thing symlink is pointing to
+				r#type = std::fs::metadata(file.path())?.file_type();
+			}
+			if !r#type.is_file() {
+				continue;
+			}
+			// names that are not valid UTF-8 are skipped
+			let Some(name) = name.to_str() else {
+				continue;
+			};
+			if name.contains("wiktionary") && name.contains(".xml-p") {
+				files.push(name.into());
+			}
+		}
+		files.sort();
+		Ok(files)
+	};
+	if files.is_empty() {
+		files = files_from_pwd()
+			.map_err(|e| format!("No files specified and couldn't list PWD ({e}). Aborting."))?;
+		if files.is_empty() {
+			// without this check an empty definitions.txt would be written after the prompt
+			return Err(
+				"No files specified and no *wiktionary*.xml-p* files found in PWD. Aborting."
+					.into(),
+			);
+		}
+		println!("No files specified on command line.");
+		println!("These files were found in the PWD:");
+		for file in &files {
+			println!("  {file}");
+		}
+		print!("Proceed with these files [Y/n]? ");
+		_ = std::io::stdout().flush();
+		let mut line = String::new();
+		let result = std::io::stdin().read_line(&mut line);
+		let line = line.trim();
+		// an empty answer means yes; anything not starting with 'y'/'Y' aborts
+		if result.is_err()
+			|| line
+				.chars()
+				.next()
+				.is_some_and(|c| !c.eq_ignore_ascii_case(&'y'))
+		{
+			return Err("Aborted.".into());
+		}
+	}
+	for input_filename in &files {
+		let input = File::open(input_filename)
+			.map_err(|e| format!("Couldn't open {input_filename}: {e}"))?;
+		let reader = BufReader::new(input);
+		println!("Parsing {input_filename}...");
+		parse_xml(reader, &mut output)
+			.map_err(|e| format!("Couldn't parse {input_filename}: {e}"))?;
+	}
+	println!("Sorting {} definitions...", output.definitions.len());
+	output.sort();
+	crate::do_write("definitions.txt", |writer| output.write_to(writer))?;
+	println!("Done!");
+	Ok(())
+}