7 files changed, 340 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..ec0fb71
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,9 @@
+/target
+enwiktionary-*.xml-p*
+definitions.txt
+.*.tmp
+*~
+.vscode
+.vs
+*.txtree
+/taxa.txt
diff --git a/Cargo.lock b/Cargo.lock
new file mode 100644
index 0000000..213875d
--- /dev/null
+++ b/Cargo.lock
@@ -0,0 +1,16 @@
+# This file is automatically @generated by Cargo.
+# It is not intended for manual editing.
+version = 4
+
+[[package]]
+name = "wicopy"
+version = "0.1.0"
+dependencies = [
+ "xml",
+]
+
+[[package]]
+name = "xml"
+version = "1.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "72e6e0a83ae73d886ab66fc2f82b598fbbb8f373357d5f2f9f783e50e4d06435"
diff --git a/Cargo.toml b/Cargo.toml
new file mode 100644
index 0000000..2a9267e
--- /dev/null
+++ b/Cargo.toml
@@ -0,0 +1,7 @@
+[package]
+name = "wicopy"
+version = "0.1.0"
+edition = "2024"
+
+[dependencies]
+xml = "1.0.0"
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..ecd34b4
--- /dev/null
+++ b/README.md
@@ -0,0 +1,15 @@
+# wicopy
+
+Miscellaneous scripts for parsing [Wiktionary data dumps](https://dumps.wikimedia.org/enwiktionary/).
+
+Some outputs from this tool which you may find useful: <https://s.pommicket.com/wiktionary/index.html>.
+
+## Acknowledgments
+
+Thanks to the `xml` Rust crate (aka `xml-rs`): <https://crates.io/crates/xml>
+
+And of course to the many contributors to and maintainers of Wiktionary.
+
+## License
+
+Do whatever with this.
diff --git a/rustfmt.toml b/rustfmt.toml
new file mode 100644
index 0000000..218e203
--- /dev/null
+++ b/rustfmt.toml
@@ -0,0 +1 @@
+hard_tabs = true
diff --git a/src/main.rs b/src/main.rs
new file mode 100644
index 0000000..3b9ffa6
--- /dev/null
+++ b/src/main.rs
@@ -0,0 +1,268 @@
+use std::borrow::Cow;
+use std::error::Error;
+use std::fs::File;
+use std::io::{self, BufReader, BufWriter, prelude::*};
+use std::process::ExitCode;
+
+/// Accumulates every extracted `(word, definition)` pair before it is
+/// sorted and written out.
+#[derive(Default)]
+struct Definitions {
+	// (word, definition) pairs, in insertion order until `sort` is called.
+	definitions: Vec<(Box<str>, Box<str>)>,
+}
+
+/// Replaces every run of two or more ASCII spaces with a single space.
+///
+/// Only `' '` is collapsed; tabs, newlines and other whitespace are left
+/// untouched. The string is built in a single pass instead of repeatedly
+/// calling `str::replace`, which re-scanned and re-allocated the whole
+/// string on every iteration.
+fn compact_spaces(s: &str) -> String {
+	let mut out = String::with_capacity(s.len());
+	let mut prev_was_space = false;
+	for c in s.chars() {
+		let is_space = c == ' ';
+		// drop a space only when it directly follows another space
+		if !(is_space && prev_was_space) {
+			out.push(c);
+		}
+		prev_was_space = is_space;
+	}
+	out
+}
+
+impl Definitions {
+	/// Records one definition of `word`.
+	///
+	/// Runs of spaces in both strings are collapsed first, so neither field
+	/// contains the two-space separator that `write_to` puts between them.
+	fn add_definition(&mut self, word: &str, def: &str) {
+		self.definitions
+			.push((compact_spaces(word).into(), compact_spaces(def).into()));
+	}
+
+	/// Sorts the entries by word, comparing the strings byte-wise.
+	///
+	/// The sort is stable, so several definitions of the same word keep the
+	/// order in which they were added.
+	fn sort(&mut self) {
+		self.definitions.sort_by(|x, y| x.0.cmp(&y.0));
+	}
+
+	/// Writes one `word  definition` line (two-space separator) per entry to
+	/// `w`, then flushes it.
+	///
+	/// # Errors
+	/// Returns the first I/O error raised while writing or during the final
+	/// flush.
+	fn write_to(&self, mut w: impl Write) -> io::Result<()> {
+		for (title, definition) in &self.definitions {
+			writeln!(w, "{title}  {definition}")?;
+		}
+		// When `w` is a `BufWriter`, dropping it discards any flush error;
+		// flush explicitly so a failed final write reaches the caller.
+		w.flush()
+	}
+}
+
+/// Removes HTML comments (`<!-- ... -->`) from `text`.
+///
+/// Returns the input borrowed and unchanged when it contains no `<!--`,
+/// which is by far the most common case; otherwise returns a newly built
+/// string. A comment that is never closed runs to the end of the input.
+#[must_use]
+fn remove_comments(mut text: &str) -> Cow<'_, str> {
+	// The explicit `'_` makes it visible that the result may borrow `text`.
+	if !text.contains("<!--") {
+		return Cow::Borrowed(text);
+	}
+	let mut new_str = String::with_capacity(text.len());
+	while let Some(comment_start) = text.find("<!--") {
+		// keep everything before the comment...
+		new_str.push_str(&text[..comment_start]);
+		text = &text[comment_start..];
+		// ...then skip past its terminator, or to the end if there is none
+		let comment_end = text.find("-->").map_or(text.len(), |i| i + 3);
+		text = &text[comment_end..];
+	}
+	// whatever follows the last comment
+	new_str.push_str(text);
+	Cow::Owned(new_str)
+}
+
+/// Reads XML events from `reader` and appends to `output` every line that
+/// starts with `# ` inside the `==English==` section of each `<text>`
+/// element, keyed by the most recently seen `<title>`.
+///
+/// Only text seen while the last parsed `<ns>` value is 0 is considered
+/// (presumably the main article namespace of a Wiktionary dump — confirm
+/// against the dump format).
+///
+/// Returns an error as soon as the XML reader yields one.
+fn parse_xml(reader: impl BufRead, output: &mut Definitions) -> Result<(), Box<dyn Error>> {
+	let mut config = xml::reader::ParserConfig::default();
+	// NOTE(review): presumably makes CDATA arrive as `Characters` events so
+	// the arms below see it too — confirm against the `xml` crate docs.
+	config.cdata_to_characters = true;
+	// Which kind of element we are inside; only the ones we care about are
+	// distinguished.
+	#[derive(Debug, Clone, Copy)]
+	enum Tag {
+		Other,
+		Title,
+		Text,
+		Ns,
+	}
+	// stack of currently open elements (pushed on start, popped on end)
+	let mut tags = vec![];
+	let mut title = String::new();
+	let mut body = String::new();
+	// starts non-zero so nothing is extracted before an <ns> has been read
+	let mut ns = 1;
+	let mut ns_str = String::new();
+	for event in xml::reader::EventReader::new_with_config(reader, config) {
+		let event = event?;
+		use xml::reader::XmlEvent;
+		match (&event, tags.last()) {
+			(XmlEvent::StartElement { name, .. }, _) => {
+				let name = name.local_name.as_str();
+				match name {
+					"title" => {
+						tags.push(Tag::Title);
+						title.clear();
+					}
+					"text" => tags.push(Tag::Text),
+					"ns" => {
+						tags.push(Tag::Ns);
+						ns_str.clear();
+					}
+					_ => tags.push(Tag::Other),
+				}
+			}
+			// character data is accumulated according to the innermost open element
+			(XmlEvent::Characters(s), Some(Tag::Title)) => {
+				title.push_str(s);
+			}
+			(XmlEvent::Characters(s), Some(Tag::Ns)) => {
+				ns_str.push_str(s);
+			}
+			(XmlEvent::Characters(s), Some(Tag::Text)) => {
+				body.push_str(s);
+			}
+			(XmlEvent::EndElement { name, .. }, _) => {
+				tags.pop();
+				if name.local_name == "page" {
+					title.clear();
+				} else if name.local_name == "text" {
+					if ns == 0
+						&& let Some(eng_start) = body.find("==English==\n")
+					{
+						let eng = &body[eng_start..];
+						// The section ends at the first later line that begins with
+						// exactly two '=' (presumably the next language heading), or
+						// at the end of the text.
+						let eng_end = eng
+							.as_bytes()
+							.windows(4)
+							.position(|w| w.starts_with(b"\n==") && w[3] != b'=')
+							.unwrap_or(eng.len());
+						let eng = &eng[..eng_end];
+						// every line starting with "# " is taken as one definition
+						for (i, w) in eng.as_bytes().windows(3).enumerate() {
+							if w != b"\n# " {
+								continue;
+							}
+							// the definition runs up to the next newline (or the section end)
+							let definition =
+								eng[i + 3..].split_once('\n').map_or(&eng[i + 3..], |x| x.0);
+							let definition = remove_comments(definition);
+							output.add_definition(&title[..], &definition);
+						}
+					}
+					body.clear();
+				} else if name.local_name == "ns" {
+					// an unparsable value becomes 1, so the text is not extracted
+					ns = ns_str.parse().unwrap_or(1);
+				}
+			}
+			_ => {}
+		}
+	}
+	Ok(())
+}
+
+/// Writes an output file without ever exposing a partially written result.
+///
+/// `write_func` receives a buffered writer for a temporary file that sits
+/// next to `path` and is named `.<file name>.tmp`. Only when `write_func`
+/// succeeds is the temporary file renamed over `path`. The destination is
+/// not deleted beforehand: `std::fs::rename` replaces an existing file, and
+/// deleting first would lose the previous output if the rename failed.
+///
+/// # Errors
+/// Returns a descriptive error if `path` has no file-name component, if the
+/// temporary file cannot be created, if `write_func` fails, or if the rename
+/// fails. In the latter two cases the temporary file is removed
+/// (best-effort) and `path` is left untouched.
+fn do_write<W, E: Error>(path: &str, write_func: W) -> Result<(), Box<dyn Error>>
+where
+	W: FnOnce(BufWriter<File>) -> Result<(), E>,
+{
+	println!("Writing output to {path}...");
+	let target = std::path::Path::new(path);
+	let file_name = target
+		.file_name()
+		.ok_or_else(|| format!("Invalid output path: {path}"))?;
+	// Prefix only the file name with '.', so that a `path` containing
+	// directories still gets its temporary file in the same directory.
+	let mut tmp_file_name = std::ffi::OsString::from(".");
+	tmp_file_name.push(file_name);
+	tmp_file_name.push(".tmp");
+	let tmp_path = target.with_file_name(tmp_file_name);
+	let tmp_name = tmp_path.display().to_string();
+
+	let file = File::create(&tmp_path).map_err(|e| format!("Error creating {tmp_name}: {e}"))?;
+	if let Err(e) = write_func(BufWriter::new(file)) {
+		// best-effort cleanup; the write error is the one worth reporting
+		_ = std::fs::remove_file(&tmp_path);
+		return Err(format!("Error writing to {tmp_name}: {e}").into());
+	}
+	if let Err(e) = std::fs::rename(&tmp_path, target) {
+		_ = std::fs::remove_file(&tmp_path);
+		return Err(format!("Error renaming {tmp_name} => {path}: {e}").into());
+	}
+	Ok(())
+}
+
+/// Lists the regular files in the current directory whose names contain both
+/// `wiktionary` and `.xml-p`, sorted by name. Symlinks are resolved to the
+/// type of their target; names that are not valid UTF-8 are skipped.
+fn dump_files_in_pwd() -> Result<Vec<String>, Box<dyn Error>> {
+	let mut files = vec![];
+	for entry in std::fs::read_dir(".")? {
+		let entry = entry?;
+		let mut r#type = entry.file_type()?;
+		if r#type.is_symlink() {
+			// get type of thing symlink is pointing to
+			r#type = std::fs::metadata(entry.path())?.file_type();
+		}
+		if !r#type.is_file() {
+			continue;
+		}
+		let name = entry.file_name();
+		let Some(name) = name.to_str() else {
+			continue;
+		};
+		if name.contains("wiktionary") && name.contains(".xml-p") {
+			files.push(name.into());
+		}
+	}
+	files.sort();
+	Ok(files)
+}
+
+/// Implements the `definitions` command: parses each input file, sorts the
+/// collected definitions and writes them to `definitions.txt`.
+///
+/// `args` are the command-line arguments after the command name. If any of
+/// them is `-h` or `--help`, usage is printed and nothing else happens. If
+/// no files are given, matching files in the current directory are listed
+/// and used after interactive confirmation.
+///
+/// # Errors
+/// Fails if no input files can be found, the user declines the prompt, an
+/// input file cannot be opened or parsed, or the output cannot be written.
+fn definitions(args: Vec<String>) -> Result<(), Box<dyn Error>> {
+	if args.iter().any(|arg| arg == "-h" || arg == "--help") {
+		println!("Usage: {} definitions [FILES]", env!("CARGO_PKG_NAME"));
+		println!("    Extract English-language definitions from Wiktionary");
+		println!("    data dump files, writing output to definitions.txt.");
+		println!("    Each line of the output file is of the format: Word  Definition");
+		println!(
+			"    Note the two spaces—this avoids ambiguity when the word contains a space."
+		);
+		println!("    A single Word can have multiple Definitions.");
+		println!("    If FILES is not specified, will use ./*wiktionary*.xml-p*");
+		return Ok(());
+	}
+	let mut output = Definitions::default();
+	let mut files = args;
+	if files.is_empty() {
+		files = dump_files_in_pwd()
+			.map_err(|e| format!("No files specified and couldn't list PWD ({e}). Aborting."))?;
+		if files.is_empty() {
+			// Going on would replace an existing definitions.txt with an empty file.
+			return Err(
+				"No files specified and no *wiktionary*.xml-p* files found in the PWD. Aborting."
+					.into(),
+			);
+		}
+		println!("No files specified on command line.");
+		println!("These files were found in the PWD:");
+		for file in &files {
+			println!("  {file}");
+		}
+		print!("Proceed with these files [Y/n]? ");
+		_ = std::io::stdout().flush();
+		let mut line = String::new();
+		let result = std::io::stdin().read_line(&mut line);
+		// An empty answer means "yes"; anything not starting with y/Y means "no".
+		let declined = line
+			.trim()
+			.chars()
+			.next()
+			.is_some_and(|c| !c.eq_ignore_ascii_case(&'y'));
+		if result.is_err() || declined {
+			return Err("Aborted.".into());
+		}
+	}
+	for input_filename in &files {
+		let input = File::open(input_filename)
+			.map_err(|e| format!("Couldn't open {input_filename}: {e}"))?;
+		let reader = BufReader::new(input);
+		println!("Parsing {input_filename}...");
+		parse_xml(reader, &mut output)
+			.map_err(|e| format!("Couldn't parse {input_filename}: {e}"))?;
+	}
+	println!("Sorting {} definitions...", output.definitions.len());
+	output.sort();
+	do_write("definitions.txt", |writer| output.write_to(writer))?;
+	println!("Done!");
+	Ok(())
+}
+
+/// Parses the command line and dispatches to the requested command.
+///
+/// The first argument selects the command; all remaining arguments must be
+/// valid UTF-8 and are passed on to it. `-h`/`--help` in place of a command
+/// prints the list of commands and succeeds, rather than being reported as
+/// an error.
+///
+/// # Errors
+/// Fails if no command is given, an argument is not valid UTF-8, the command
+/// is unknown, or the command itself fails.
+fn try_main() -> Result<(), Box<dyn Error>> {
+	let commands = "Commands available:\n- definitions";
+	let mut args = std::env::args_os().skip(1);
+	let Some(command) = args.next() else {
+		return Err(format!("No command specified. {commands}").into());
+	};
+	if command == "-h" || command == "--help" {
+		println!("{commands}");
+		return Ok(());
+	}
+	// `into_string` hands back the original `OsString` on failure, so the
+	// offending argument can be shown without copying the valid ones.
+	let command_args = args
+		.map(|arg| {
+			arg.into_string().map_err(|bad| {
+				format!(
+					"Bad UTF-8 in argument: {}",
+					bad.to_string_lossy().escape_debug()
+				)
+			})
+		})
+		.collect::<Result<Vec<String>, String>>()?;
+	match &command.to_string_lossy()[..] {
+		"definitions" => definitions(command_args),
+		x => Err(format!("Unrecognized command: {x}").into()),
+	}
+}
+
+fn main() -> ExitCode {
+	use std::time::Instant;
+	let start_time = Instant::now();
+	if let Err(e) = try_main() {
+		eprintln!("Error: {e}");
+		return ExitCode::FAILURE;
+	}
+	println!(
+		"Time taken: {:?}",
+		Instant::now().duration_since(start_time)
+	);
+	ExitCode::SUCCESS
+}
diff --git a/src/parse_txtree.py b/src/parse_txtree.py
new file mode 100755
index 0000000..5aaf87c
--- /dev/null
+++ b/src/parse_txtree.py
@@ -0,0 +1,24 @@
+#!/usr/bin/env python3
+import regex
+
+high_taxon_regex = regex.compile(r'^[A-Z](-|\p{L})+$')
+species_regex = regex.compile(r"^(\p{Ll}|\p{Nd})(-|\p{L}|\p{Nd}|\.|')*$", flags=regex.U)
+parenthetical_regex = regex.compile(r' \((\w|\.)+\) ')
+
+with open('animalia.txtree') as animalia, open('taxa.txt', 'w') as taxa:
+	for line in animalia:
+		line = line.strip().lstrip('=').replace('†', '').replace(' [sensu lato] ', ' ')
+		line = parenthetical_regex.sub(' ', line)
+		if '[species]' in line:
+			words = line.split()
+			if len(words) < 2 or \
+				not high_taxon_regex.match(words[0]) or \
+				not species_regex.match(words[1]):
+				print('Weird line:',line)
+				continue
+			taxa.write(words[0] + ' ' + words[1] + '\n')
+		elif (i := line.find(' ')) > 0 and high_taxon_regex.match(taxon := line[:i]):
+			taxa.write(taxon + '\n')
+		else:
+			print('Weird line:', line)
+			continue