diff options
-rw-r--r-- | .gitignore | 9 | ||||
-rw-r--r-- | Cargo.lock | 16 | ||||
-rw-r--r-- | Cargo.toml | 7 | ||||
-rw-r--r-- | README.md | 15 | ||||
-rw-r--r-- | rustfmt.toml | 1 | ||||
-rw-r--r-- | src/main.rs | 268 | ||||
-rwxr-xr-x | src/parse_txtree.py | 24 |
7 files changed, 340 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..ec0fb71
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,9 @@
+/target
+enwiktionary-*.xml-p*
+definitions.txt
+.*.tmp
+*~
+.vscode
+.vs
+*.txtree
+/taxa.txt
diff --git a/Cargo.lock b/Cargo.lock
new file mode 100644
index 0000000..213875d
--- /dev/null
+++ b/Cargo.lock
@@ -0,0 +1,16 @@
+# This file is automatically @generated by Cargo.
+# It is not intended for manual editing.
+version = 4
+
+[[package]]
+name = "wicopy"
+version = "0.1.0"
+dependencies = [
+ "xml",
+]
+
+[[package]]
+name = "xml"
+version = "1.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "72e6e0a83ae73d886ab66fc2f82b598fbbb8f373357d5f2f9f783e50e4d06435"
diff --git a/Cargo.toml b/Cargo.toml
new file mode 100644
index 0000000..2a9267e
--- /dev/null
+++ b/Cargo.toml
@@ -0,0 +1,7 @@
+[package]
+name = "wicopy"
+version = "0.1.0"
+edition = "2024"
+
+[dependencies]
+xml = "1.0.0"
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..ecd34b4
--- /dev/null
+++ b/README.md
@@ -0,0 +1,15 @@
+# wicopy
+
+Various miscellaneous scripts for parsing [wiktionary data dumps](https://dumps.wikimedia.org/enwiktionary/).
+
+Some outputs from this tool which you may find useful: <https://s.pommicket.com/wiktionary/index.html>.
+
+## Acknowledgments
+
+Thanks to the `xml` Rust crate (aka `xml-rs`): <https://crates.io/crates/xml>
+
+And of course to the many contributors to and maintainers of Wiktionary.
+
+## License
+
+Do whatever with this.
diff --git a/rustfmt.toml b/rustfmt.toml
new file mode 100644
index 0000000..218e203
--- /dev/null
+++ b/rustfmt.toml
@@ -0,0 +1 @@
+hard_tabs = true
diff --git a/src/main.rs b/src/main.rs
new file mode 100644
index 0000000..3b9ffa6
--- /dev/null
+++ b/src/main.rs
@@ -0,0 +1,301 @@
+use std::borrow::Cow;
+use std::error::Error;
+use std::fs::File;
+use std::io::{self, BufReader, BufWriter, prelude::*};
+use std::path::Path;
+use std::process::ExitCode;
+
+/// Accumulated `(word, definition)` pairs extracted from the dump files.
+#[derive(Default)]
+struct Definitions {
+	definitions: Vec<(Box<str>, Box<str>)>,
+}
+
+/// Replace every run of 2+ spaces with a single space.
+///
+/// The output format separates word and definition with two spaces, so
+/// neither field may itself contain two consecutive spaces.
+fn compact_spaces(s: &str) -> String {
+	let mut out = String::with_capacity(s.len());
+	let mut prev_was_space = false;
+	for c in s.chars() {
+		let is_space = c == ' ';
+		if !(is_space && prev_was_space) {
+			out.push(c);
+		}
+		prev_was_space = is_space;
+	}
+	out
+}
+
+impl Definitions {
+	/// Record one definition of `word`, collapsing runs of spaces in both.
+	fn add_definition(&mut self, word: &str, def: &str) {
+		self.definitions
+			.push((compact_spaces(word).into(), compact_spaces(def).into()))
+	}
+	/// Sort by word. The sort is stable, so the definitions of one word
+	/// keep the order in which they were added.
+	fn sort(&mut self) {
+		self.definitions.sort_by(|x, y| x.0.cmp(&y.0));
+	}
+	/// Write one `word  definition` line (two-space separator) per entry.
+	///
+	/// Flushes `w` before returning: a buffered writer that is merely
+	/// dropped silently discards any error from its final write.
+	fn write_to(&self, mut w: impl Write) -> io::Result<()> {
+		for (title, definition) in &self.definitions {
+			writeln!(w, "{title}  {definition}")?;
+		}
+		w.flush()
+	}
+}
+
+/// Remove HTML comments (`<!-- ... -->`) from `text`.
+///
+/// An unterminated comment swallows the rest of the string.
+#[must_use]
+fn remove_comments(mut text: &str) -> Cow<'_, str> {
+	if !text.contains("<!--") {
+		// (by far) most common case
+		return Cow::Borrowed(text);
+	}
+	let mut new_str = String::with_capacity(text.len());
+	while let Some(comment_start) = text.find("<!--") {
+		new_str.push_str(&text[..comment_start]);
+		text = &text[comment_start..];
+		let comment_end = text.find("-->").map_or(text.len(), |i| i + 3);
+		text = &text[comment_end..];
+	}
+	new_str.push_str(text);
+	Cow::Owned(new_str)
+}
+
+/// Yield the text of every `# ...` definition line in the `==English==`
+/// section of a page's wikitext (nothing if there is no such section).
+fn english_definitions(body: &str) -> impl Iterator<Item = &str> {
+	let eng = body.find("==English==\n").map_or("", |start| &body[start..]);
+	// The section ends at the next line that starts with exactly two `=`.
+	let eng_end = eng
+		.as_bytes()
+		.windows(4)
+		.position(|w| w.starts_with(b"\n==") && w[3] != b'=')
+		.unwrap_or(eng.len());
+	eng[..eng_end]
+		.split('\n')
+		.skip(1) // the `==English==` heading line itself
+		.filter_map(|line| line.strip_prefix("# "))
+}
+
+/// Parse one XML dump from `reader`, adding to `output` every English
+/// definition found in pages whose `<ns>` is 0.
+fn parse_xml(reader: impl BufRead, output: &mut Definitions) -> Result<(), Box<dyn Error>> {
+	let mut config = xml::reader::ParserConfig::default();
+	config.cdata_to_characters = true;
+	#[derive(Debug, Clone, Copy)]
+	enum Tag {
+		Other,
+		Title,
+		Text,
+		Ns,
+	}
+	// stack of the elements currently open
+	let mut tags = vec![];
+	let mut title = String::new();
+	let mut body = String::new();
+	// value of the most recent <ns>; unparseable values count as nonzero
+	let mut ns: i32 = 1;
+	let mut ns_str = String::new();
+	for event in xml::reader::EventReader::new_with_config(reader, config) {
+		let event = event?;
+		use xml::reader::XmlEvent;
+		match (&event, tags.last()) {
+			(XmlEvent::StartElement { name, .. }, _) => {
+				let name = name.local_name.as_str();
+				match name {
+					"title" => {
+						tags.push(Tag::Title);
+						title.clear();
+					}
+					"text" => tags.push(Tag::Text),
+					"ns" => {
+						tags.push(Tag::Ns);
+						ns_str.clear();
+					}
+					_ => tags.push(Tag::Other),
+				}
+			}
+			(XmlEvent::Characters(s), Some(Tag::Title)) => {
+				title.push_str(s);
+			}
+			(XmlEvent::Characters(s), Some(Tag::Ns)) => {
+				ns_str.push_str(s);
+			}
+			(XmlEvent::Characters(s), Some(Tag::Text)) => {
+				body.push_str(s);
+			}
+			(XmlEvent::EndElement { name, .. }, _) => {
+				tags.pop();
+				match name.local_name.as_str() {
+					"page" => title.clear(),
+					"text" => {
+						if ns == 0 {
+							for definition in english_definitions(&body) {
+								output.add_definition(&title, &remove_comments(definition));
+							}
+						}
+						body.clear();
+					}
+					"ns" => ns = ns_str.parse().unwrap_or(1),
+					_ => {}
+				}
+			}
+			_ => {}
+		}
+	}
+	Ok(())
+}
+
+/// Write an output file by way of a temporary file in the same directory.
+///
+/// `write_func` receives a buffered writer for the temporary file; only if
+/// it succeeds is that file renamed over `path`, so a failed run leaves any
+/// existing output untouched.
+fn do_write<W, E: Error>(path: &str, write_func: W) -> Result<(), Box<dyn Error>>
+where
+	W: FnOnce(BufWriter<File>) -> Result<(), E>,
+{
+	println!("Writing output to {path}...");
+	let target = Path::new(path);
+	let file_name = target
+		.file_name()
+		.and_then(|name| name.to_str())
+		.ok_or_else(|| format!("Invalid output path: {path}"))?;
+	// `.NAME.tmp` next to the target, even when `path` has a directory part
+	let tmp_path = target.with_file_name(format!(".{file_name}.tmp"));
+	let tmp_name = tmp_path.display();
+	let file = File::create(&tmp_path).map_err(|e| format!("Error creating {tmp_name}: {e}"))?;
+	write_func(BufWriter::new(file)).map_err(|e| format!("Error writing to {tmp_name}: {e}"))?;
+	// rename replaces an existing destination; deleting it first would only
+	// open a window in which no output file exists
+	std::fs::rename(&tmp_path, path)
+		.map_err(|e| format!("Error renaming {tmp_name} => {path}: {e}"))?;
+	Ok(())
+}
+
+/// The `definitions` command: extract English definitions from the given
+/// dump files (or, after confirmation, from matching files in the current
+/// directory) and write them, sorted, to `definitions.txt`.
+fn definitions(args: Vec<String>) -> Result<(), Box<dyn Error>> {
+	let mut output = Definitions::default();
+	let mut files: Vec<String> = vec![];
+	for arg in args {
+		if arg == "-h" || arg == "--help" {
+			println!("Usage: {} definitions [FILES]", env!("CARGO_PKG_NAME"));
+			println!(" Extract English-language definitions from Wiktionary");
+			println!(" data dump files, writing output to definitions.txt.");
+			println!(" Each line of the output file is of the format: Word  Definition");
+			println!(
+				" Note the two spaces—this avoids ambiguity when the word contains a space."
+			);
+			println!(" A single Word can have multiple Definitions.");
+			println!(" If FILES is not specified, will use ./*wiktionary*.xml-p*");
+			return Ok(());
+		}
+		files.push(arg);
+	}
+	let files_from_pwd = || -> Result<Vec<String>, Box<dyn Error>> {
+		let mut files = vec![];
+		for file in std::fs::read_dir(".")? {
+			let file = file?;
+			let mut r#type = file.file_type()?;
+			let name = file.file_name();
+			if r#type.is_symlink() {
+				// get type of thing symlink is pointing to
+				r#type = std::fs::metadata(file.path())?.file_type();
+			}
+			if !r#type.is_file() {
+				continue;
+			}
+			let Some(name) = name.to_str() else {
+				continue;
+			};
+			if name.contains("wiktionary") && name.contains(".xml-p") {
+				files.push(name.into());
+			}
+		}
+		files.sort();
+		Ok(files)
+	};
+	if files.is_empty() {
+		files = files_from_pwd()
+			.map_err(|e| format!("No files specified and couldn't list PWD ({e}). Aborting."))?;
+		println!("No files specified on command line.");
+		println!("These files were found in the PWD:");
+		for file in &files {
+			println!(" {file}");
+		}
+		print!("Proceed with these files [Y/n]? ");
+		_ = std::io::stdout().flush();
+		let mut line = String::new();
+		let result = std::io::stdin().read_line(&mut line);
+		let answer = line.trim();
+		// an empty answer accepts the default (yes)
+		let declined = !answer.is_empty() && !answer.starts_with(['y', 'Y']);
+		if result.is_err() || declined {
+			return Err("Aborted.".into());
+		}
+	}
+	for input_filename in &files {
+		let input = File::open(input_filename)
+			.map_err(|e| format!("Couldn't open {input_filename}: {e}"))?;
+		let reader = BufReader::new(input);
+		println!("Parsing {input_filename}...");
+		parse_xml(reader, &mut output)
+			.map_err(|e| format!("Couldn't parse {input_filename}: {e}"))?;
+	}
+	println!("Sorting {} definitions...", output.definitions.len());
+	output.sort();
+	do_write("definitions.txt", |writer| output.write_to(writer))?;
+	println!("Done!");
+	Ok(())
+}
+
+/// Dispatch to the command named by the first command-line argument.
+fn try_main() -> Result<(), Box<dyn Error>> {
+	let mut args = std::env::args_os().skip(1);
+	let no_command = "No command specified. Commands available:\n- definitions";
+	let Some(command) = args.next() else {
+		return Err(no_command.into());
+	};
+	if command == "-h" || command == "--help" {
+		return Err(no_command.into());
+	}
+	let command_args = args
+		.map(|arg| {
+			arg.into_string().map_err(|bad| {
+				format!(
+					"Bad UTF-8 in argument: {}",
+					bad.to_string_lossy().escape_debug()
+				)
+			})
+		})
+		.collect::<Result<Vec<String>, String>>()?;
+	match &command.to_string_lossy()[..] {
+		"definitions" => definitions(command_args),
+		x => Err(format!("Unrecognized command: {x}").into()),
+	}
+}
+
+/// Run the program, report any error on stderr, and print the elapsed time
+/// on success.
+fn main() -> ExitCode {
+	use std::time::Instant;
+	let start_time = Instant::now();
+	if let Err(e) = try_main() {
+		eprintln!("Error: {e}");
+		return ExitCode::FAILURE;
+	}
+	println!("Time taken: {:?}", start_time.elapsed());
+	ExitCode::SUCCESS
+}
diff --git a/src/parse_txtree.py b/src/parse_txtree.py
new file mode 100755
index 0000000..5aaf87c
--- /dev/null
+++ b/src/parse_txtree.py
@@ -0,0 +1,24 @@
+#!/usr/bin/env python3
+import regex
+
+high_taxon_regex = regex.compile(r'^[A-Z](-|\p{L})+$')
+species_regex = regex.compile(r"^(\p{Ll}|\p{Nd})(-|\p{L}|\p{Nd}|\.|')*$", flags=regex.U)
+parenthetical_regex = regex.compile(r' \((\w|\.)+\) ')
+
+with open('animalia.txtree') as animalia, open('taxa.txt', 'w') as taxa:
+    for line in animalia:
+        line = line.strip().lstrip('=').replace('†', '').replace(' [sensu lato] ', ' ')
+        line = parenthetical_regex.sub(' ', line)
+        if '[species]' in line:
+            words = line.split()
+            if len(words) < 2 or \
+                not high_taxon_regex.match(words[0]) or \
+                not species_regex.match(words[1]):
+                print('Weird line:',line)
+                continue
+            taxa.write(words[0] + ' ' + words[1] + '\n')
+        elif (i := line.find(' ')) > 0 and high_taxon_regex.match(taxon := line[:i]):
+            taxa.write(taxon + '\n')
+        else:
+            print('Weird line:', line)
+            continue