summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--.gitignore9
-rw-r--r--Cargo.lock16
-rw-r--r--Cargo.toml7
-rw-r--r--README.md15
-rw-r--r--rustfmt.toml1
-rw-r--r--src/main.rs268
-rwxr-xr-xsrc/parse_txtree.py24
7 files changed, 340 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..ec0fb71
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,9 @@
+/target
+enwiktionary-*.xml-p*
+definitions.txt
+.*.tmp
+*~
+.vscode
+.vs
+*.txtree
+/taxa.txt
diff --git a/Cargo.lock b/Cargo.lock
new file mode 100644
index 0000000..213875d
--- /dev/null
+++ b/Cargo.lock
@@ -0,0 +1,16 @@
+# This file is automatically @generated by Cargo.
+# It is not intended for manual editing.
+version = 4
+
+[[package]]
+name = "wicopy"
+version = "0.1.0"
+dependencies = [
+ "xml",
+]
+
+[[package]]
+name = "xml"
+version = "1.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "72e6e0a83ae73d886ab66fc2f82b598fbbb8f373357d5f2f9f783e50e4d06435"
diff --git a/Cargo.toml b/Cargo.toml
new file mode 100644
index 0000000..2a9267e
--- /dev/null
+++ b/Cargo.toml
@@ -0,0 +1,7 @@
+[package]
+name = "wicopy"
+version = "0.1.0"
+edition = "2024"
+
+[dependencies]
+xml = "1.0.0"
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..ecd34b4
--- /dev/null
+++ b/README.md
@@ -0,0 +1,15 @@
+# wicopy
+
+Miscellaneous scripts for parsing [Wiktionary data dumps](https://dumps.wikimedia.org/enwiktionary/).
+
+Some outputs from this tool which you may find useful: <https://s.pommicket.com/wiktionary/index.html>.
+
+## Acknowledgments
+
+Thanks to the `xml` Rust crate (aka `xml-rs`): <https://crates.io/crates/xml>
+
+And of course to the many contributors to and maintainers of Wiktionary.
+
+## License
+
+Do whatever with this.
diff --git a/rustfmt.toml b/rustfmt.toml
new file mode 100644
index 0000000..218e203
--- /dev/null
+++ b/rustfmt.toml
@@ -0,0 +1 @@
+hard_tabs = true
diff --git a/src/main.rs b/src/main.rs
new file mode 100644
index 0000000..3b9ffa6
--- /dev/null
+++ b/src/main.rs
@@ -0,0 +1,268 @@
+use std::borrow::Cow;
+use std::error::Error;
+use std::fs::File;
+use std::io::{self, BufReader, BufWriter, prelude::*};
+use std::process::ExitCode;
+
+// All definitions collected so far, stored as (word, definition) pairs.
+#[derive(Default)]
+struct Definitions {
+ // one entry per definition, so a word with several definitions appears several times
+ definitions: Vec<(Box<str>, Box<str>)>,
+}
+
+// Replace every run of 2+ consecutive spaces with a single space.
+//
+// Only the ASCII space character (U+0020) is compacted; tabs, newlines and
+// other whitespace are copied through unchanged. Done in a single pass so the
+// cost is linear in the length of `s`.
+fn compact_spaces(s: &str) -> String {
+	let mut compacted = String::with_capacity(s.len());
+	let mut prev_was_space = false;
+	for c in s.chars() {
+		let is_space = c == ' ';
+		// drop a space only when the character right before it was also a space
+		if !(is_space && prev_was_space) {
+			compacted.push(c);
+		}
+		prev_was_space = is_space;
+	}
+	compacted
+}
+
+impl Definitions {
+	// Record one definition of `word`.
+	// Runs of spaces in both the word and the definition are compacted to a
+	// single space, so the two-space separator used by `write_to` can never
+	// occur inside either field.
+	fn add_definition(&mut self, word: &str, def: &str) {
+		let entry = (compact_spaces(word).into(), compact_spaces(def).into());
+		self.definitions.push(entry);
+	}
+	// Sort the entries by word.
+	// The sort is stable, so several definitions of the same word keep the
+	// order in which they were added.
+	fn sort(&mut self) {
+		self.definitions.sort_by(|x, y| x.0.cmp(&y.0));
+	}
+	// Write every entry to `w`, one per line, as: word, TWO spaces, definition.
+	// Returns the first I/O error encountered, including one raised while
+	// flushing: a buffered writer that is merely dropped discards such errors.
+	fn write_to(&mut self, mut w: impl Write) -> io::Result<()> {
+		for (title, definition) in &self.definitions {
+			// separator is two spaces — a single space would be ambiguous for multi-word titles
+			writeln!(w, "{title}  {definition}")?;
+		}
+		w.flush()
+	}
+}
+
+// Strip HTML comments (`<!-- ... -->`) out of `text`.
+// Text without any comment is returned borrowed, with no allocation.
+// A comment that is never closed swallows everything up to the end of the text.
+#[must_use]
+fn remove_comments(mut text: &str) -> Cow<str> {
+	const OPEN: &str = "<!--";
+	const CLOSE: &str = "-->";
+	let Some(first_open) = text.find(OPEN) else {
+		// (by far) the most common case
+		return Cow::Borrowed(text);
+	};
+	let mut kept = String::new();
+	let mut next_open = Some(first_open);
+	while let Some(open_at) = next_open {
+		let (before, comment) = text.split_at(open_at);
+		kept.push_str(before);
+		// resume right after the closing marker, or at the very end if there is none
+		text = match comment.find(CLOSE) {
+			Some(close_at) => &comment[close_at + CLOSE.len()..],
+			None => "",
+		};
+		next_open = text.find(OPEN);
+	}
+	kept.push_str(text);
+	Cow::Owned(kept)
+}
+
+// Parse one XML dump from `reader` and add every English definition found to `output`.
+//
+// Only <text> elements whose most recently seen <ns> value is 0 are examined.
+// Within such a text, the section starting at "==English==\n" is scanned for
+// lines beginning with "# "; each one is added (HTML comments removed) under
+// the most recently seen <title>.
+//
+// Returns an error if the XML reader reports one.
+fn parse_xml(reader: impl BufRead, output: &mut Definitions) -> Result<(), Box<dyn Error>> {
+ let mut config = xml::reader::ParserConfig::default();
+ // NOTE(review): going by the option name, CDATA then arrives as Characters events — confirm in xml crate docs
+ config.cdata_to_characters = true;
+ // The kind of element we are inside; only these three are interesting.
+ #[derive(Debug, Clone, Copy)]
+ enum Tag {
+ Other,
+ Title,
+ Text,
+ Ns,
+ }
+ // stack of currently open elements (pushed on start, popped on end)
+ let mut tags = vec![];
+ let mut title = String::new();
+ let mut body = String::new();
+ // non-zero to begin with, so a <text> seen before any <ns> is skipped
+ let mut ns = 1;
+ let mut ns_str = String::new();
+ for event in xml::reader::EventReader::new_with_config(reader, config) {
+ let event = event?;
+ use xml::reader::XmlEvent;
+ // dispatch on the event together with the innermost open element
+ match (&event, tags.last()) {
+ (XmlEvent::StartElement { name, .. }, _) => {
+ let name = name.local_name.as_str();
+ match name {
+ "title" => {
+ tags.push(Tag::Title);
+ title.clear();
+ }
+ "text" => tags.push(Tag::Text),
+ "ns" => {
+ tags.push(Tag::Ns);
+ ns_str.clear();
+ }
+ _ => tags.push(Tag::Other),
+ }
+ }
+ // character data is accumulated, since one element's text may arrive in several events
+ (XmlEvent::Characters(s), Some(Tag::Title)) => {
+ title.push_str(s);
+ }
+ (XmlEvent::Characters(s), Some(Tag::Ns)) => {
+ ns_str.push_str(s);
+ }
+ (XmlEvent::Characters(s), Some(Tag::Text)) => {
+ body.push_str(s);
+ }
+ (XmlEvent::EndElement { name, .. }, _) => {
+ tags.pop();
+ if name.local_name == "page" {
+ title.clear();
+ } else if name.local_name == "text" {
+ if ns == 0
+ && let Some(eng_start) = body.find("==English==\n")
+ {
+ let eng = &body[eng_start..];
+ // The English section ends at the next line starting with exactly two '='
+ // ("\n==" followed by a byte other than '='); deeper headings such as
+ // "===...===" do not end it. Without such a line it runs to the end.
+ let eng_end = eng
+ .as_bytes()
+ .windows(4)
+ .position(|w| w.starts_with(b"\n==") && w[3] != b'=')
+ .unwrap_or(eng.len());
+ let eng = &eng[..eng_end];
+ // every line starting with "# " is one definition
+ for (i, w) in eng.as_bytes().windows(3).enumerate() {
+ if w != b"\n# " {
+ continue;
+ }
+ // the definition is the rest of that line (or of the section, on its last line)
+ let definition =
+ eng[i + 3..].split_once('\n').map_or(&eng[i + 3..], |x| x.0);
+ let definition = remove_comments(definition);
+ output.add_definition(&title[..], &definition);
+ }
+ }
+ // reset for the next <text>, whether or not this one was used
+ body.clear();
+ } else if name.local_name == "ns" {
+ // an unparsable namespace is treated as 1, i.e. the page is skipped
+ ns = ns_str.parse().unwrap_or(1);
+ }
+ }
+ // every other event (including text inside other elements) is ignored
+ _ => {}
+ }
+ }
+ Ok(())
+}
+
+// Write an output file without ever exposing a half-written result.
+//
+// The data is first written to a hidden temporary file `.<name>.tmp` in the
+// same directory as `path`, which is then renamed over `path`. `rename`
+// replaces an existing destination, so the previous output stays in place
+// until the new one is complete.
+//
+// `write_func` receives the buffered writer for the temporary file and does
+// the actual writing. It owns the writer, so it should flush it before
+// returning: errors raised when a `BufWriter` is flushed on drop are lost.
+//
+// Returns an error (with the file name in the message) if `path` has no file
+// name component, or if creating, writing or renaming the temporary file fails.
+fn do_write<W, E: Error>(path: &str, write_func: W) -> Result<(), Box<dyn Error>>
+where
+	W: FnOnce(BufWriter<File>) -> Result<(), E>,
+{
+	println!("Writing output to {path}...");
+	let target = std::path::Path::new(path);
+	let file_name = target
+		.file_name()
+		.ok_or_else(|| format!("Invalid output path: {path}"))?
+		.to_string_lossy();
+	// keep the temporary file next to the target so the rename stays on one filesystem
+	let tmp_path = target.with_file_name(format!(".{file_name}.tmp"));
+	let tmp_name = tmp_path.display();
+	let file = File::create(&tmp_path).map_err(|e| format!("Error creating {tmp_name}: {e}"))?;
+	write_func(BufWriter::new(file)).map_err(|e| format!("Error writing to {tmp_name}: {e}"))?;
+	std::fs::rename(&tmp_path, path)
+		.map_err(|e| format!("Error renaming {tmp_name} => {path}: {e}"))?;
+	Ok(())
+}
+
+// Print usage information for the `definitions` subcommand.
+fn print_definitions_help() {
+	println!("Usage: {} definitions [FILES]", env!("CARGO_PKG_NAME"));
+	println!(" Extract English-language definitions from Wiktionary");
+	println!(" data dump files, writing output to definitions.txt.");
+	println!(" Each line of the output file is of the format: Word  Definition");
+	println!(
+		" Note the two spaces—this avoids ambiguity when the word contains a space."
+	);
+	println!(" A single Word can have multiple Definitions.");
+	println!(" If FILES is not specified, will use ./*wiktionary*.xml-p*");
+}
+
+// List the regular files in the current directory (symlinks are followed)
+// whose names contain both "wiktionary" and ".xml-p", sorted by name.
+// Entries whose names are not valid UTF-8 are skipped.
+fn find_dump_files_in_pwd() -> Result<Vec<String>, Box<dyn Error>> {
+	let mut found = vec![];
+	for entry in std::fs::read_dir(".")? {
+		let entry = entry?;
+		let mut file_type = entry.file_type()?;
+		if file_type.is_symlink() {
+			// use the type of whatever the symlink points to
+			file_type = std::fs::metadata(entry.path())?.file_type();
+		}
+		if !file_type.is_file() {
+			continue;
+		}
+		let name = entry.file_name();
+		let Some(name) = name.to_str() else {
+			continue;
+		};
+		if name.contains("wiktionary") && name.contains(".xml-p") {
+			found.push(name.to_owned());
+		}
+	}
+	found.sort();
+	Ok(found)
+}
+
+// The `definitions` subcommand.
+//
+// `args` are the command-line arguments after the subcommand name: either
+// `-h`/`--help` (print usage and do nothing else) or the dump files to parse.
+// With no files given, dump files are looked up in the current directory and
+// the user is asked to confirm them; just pressing Enter counts as "yes".
+//
+// All files are parsed, the definitions are sorted by word, and the result is
+// written to definitions.txt.
+//
+// Errors: no usable input files, the user declining the prompt, or any
+// failure to open/parse an input or to write the output.
+fn definitions(args: Vec<String>) -> Result<(), Box<dyn Error>> {
+	if args.iter().any(|arg| arg == "-h" || arg == "--help") {
+		print_definitions_help();
+		return Ok(());
+	}
+	let mut files = args;
+	if files.is_empty() {
+		files = find_dump_files_in_pwd()
+			.map_err(|e| format!("No files specified and couldn't list PWD ({e}). Aborting."))?;
+		if files.is_empty() {
+			// don't ask the user to confirm an empty list and then clobber
+			// an existing definitions.txt with an empty file
+			return Err("No files specified and no dump files found in the PWD. Aborting.".into());
+		}
+		println!("No files specified on command line.");
+		println!("These files were found in the PWD:");
+		for file in &files {
+			println!(" {file}");
+		}
+		print!("Proceed with these files [Y/n]? ");
+		_ = std::io::stdout().flush();
+		let mut answer = String::new();
+		let read_result = std::io::stdin().read_line(&mut answer);
+		// an empty answer means yes; anything not starting with y/Y means no
+		let declined = answer
+			.trim()
+			.chars()
+			.next()
+			.is_some_and(|c| !c.eq_ignore_ascii_case(&'y'));
+		if read_result.is_err() || declined {
+			return Err("Aborted.".into());
+		}
+	}
+	let mut output = Definitions::default();
+	for input_filename in &files {
+		let input = File::open(input_filename)
+			.map_err(|e| format!("Couldn't open {input_filename}: {e}"))?;
+		println!("Parsing {input_filename}...");
+		parse_xml(BufReader::new(input), &mut output)
+			.map_err(|e| format!("Couldn't parse {input_filename}: {e}"))?;
+	}
+	println!("Sorting {} definitions...", output.definitions.len());
+	output.sort();
+	do_write("definitions.txt", |writer| output.write_to(writer))?;
+	println!("Done!");
+	Ok(())
+}
+
+// Parse the command line and run the requested subcommand.
+//
+// The first argument selects the command; a missing command, `-h` or `--help`
+// yields an error listing the available commands. All remaining arguments
+// must be valid UTF-8 and are handed to the command.
+fn try_main() -> Result<(), Box<dyn Error>> {
+	const NO_COMMAND: &str = "No command specified. Commands available:\n- definitions";
+	let mut raw_args = std::env::args_os().skip(1);
+	let command = match raw_args.next() {
+		Some(c) if c != "-h" && c != "--help" => c,
+		_ => return Err(NO_COMMAND.into()),
+	};
+	// arguments are validated before the command name is even looked at
+	let command_args = raw_args
+		.map(|raw| match raw.to_str() {
+			Some(s) => Ok(s.to_owned()),
+			None => Err(format!(
+				"Bad UTF-8 in argument: {}",
+				raw.to_string_lossy().escape_debug()
+			)),
+		})
+		.collect::<Result<Vec<String>, String>>()?;
+	match command.to_string_lossy().as_ref() {
+		"definitions" => definitions(command_args),
+		other => Err(format!("Unrecognized command: {other}").into()),
+	}
+}
+
+// Entry point: run `try_main`, then either report the elapsed time (success)
+// or print the error to stderr and exit with a failure status.
+fn main() -> ExitCode {
+	let started = std::time::Instant::now();
+	match try_main() {
+		Ok(()) => {
+			println!("Time taken: {:?}", started.elapsed());
+			ExitCode::SUCCESS
+		}
+		Err(e) => {
+			eprintln!("Error: {e}");
+			ExitCode::FAILURE
+		}
+	}
+}
diff --git a/src/parse_txtree.py b/src/parse_txtree.py
new file mode 100755
index 0000000..5aaf87c
--- /dev/null
+++ b/src/parse_txtree.py
@@ -0,0 +1,24 @@
+#!/usr/bin/env python3
+"""Extract taxon names from animalia.txtree into taxa.txt.
+
+Each input line is cleaned up first: surrounding whitespace, leading '='
+characters, '†' symbols, ' [sensu lato] ' and ' (word) ' parentheticals are
+removed. A line containing '[species]' then contributes its first two words;
+any other line contributes the text before its first space. Lines that do not
+have the expected shape are reported on stdout and skipped.
+"""
+import regex
+
+# an uppercase ASCII letter followed by one or more letters/hyphens
+high_taxon_regex = regex.compile(r'^[A-Z](-|\p{L})+$')
+# starts with a lowercase letter or a digit; may continue with letters, digits, '-', '.' or "'"
+species_regex = regex.compile(r"^(\p{Ll}|\p{Nd})(-|\p{L}|\p{Nd}|\.|')*$", flags=regex.U)
+# a parenthesized run of word characters/dots with a space on each side, e.g. ' (Foo) '
+parenthetical_regex = regex.compile(r' \((\w|\.)+\) ')
+
+# The data contains non-ASCII characters (e.g. '†'), so do not depend on the
+# platform's default encoding. NOTE(review): assumes the .txtree file is UTF-8.
+with open('animalia.txtree', encoding='utf-8') as animalia, \
+        open('taxa.txt', 'w', encoding='utf-8') as taxa:
+    for line in animalia:
+        line = line.strip().lstrip('=').replace('†', '').replace(' [sensu lato] ', ' ')
+        line = parenthetical_regex.sub(' ', line)
+        if '[species]' in line:
+            words = line.split()
+            if len(words) < 2 or \
+                    not high_taxon_regex.match(words[0]) or \
+                    not species_regex.match(words[1]):
+                print('Weird line:', line)
+                continue
+            taxa.write(words[0] + ' ' + words[1] + '\n')
+        elif (i := line.find(' ')) > 0 and high_taxon_regex.match(taxon := line[:i]):
+            taxa.write(taxon + '\n')
+        else:
+            print('Weird line:', line)