From fa5db0d2f5cc19e578b74b3bb15f8bb9bc90b789 Mon Sep 17 00:00:00 2001 From: pommicket Date: Thu, 25 Sep 2025 23:36:05 -0400 Subject: Animalia --- .gitignore | 1 + Cargo.lock | 2 +- Cargo.toml | 2 +- README.md | 2 +- index.html | 15 ++++++++++----- src/animalia.rs | 55 +++++++++++++++++++++++++++++++++++++++++++++++++++++-- src/main.rs | 3 ++- 7 files changed, 69 insertions(+), 11 deletions(-) diff --git a/.gitignore b/.gitignore index 811af54..b2d5af0 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,7 @@ /target enwiktionary-*.xml-p* *definitions.txt* +animalia.txt* .*.tmp *~ .vscode diff --git a/Cargo.lock b/Cargo.lock index 213875d..c8519f5 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3,7 +3,7 @@ version = 4 [[package]] -name = "wicopy" +name = "wiktionary" version = "0.1.0" dependencies = [ "xml", diff --git a/Cargo.toml b/Cargo.toml index 2a9267e..8361bae 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,5 +1,5 @@ [package] -name = "wicopy" +name = "wiktionary" version = "0.1.0" edition = "2024" diff --git a/README.md b/README.md index ecd34b4..92ef4dc 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# wicopy +# wiktionary Various miscellaneous scripts for parsing [wiktionary data dumps](https://dumps.wikimedia.org/enwiktionary/). diff --git a/index.html b/index.html index 306087f..8dff7cd 100644 --- a/index.html +++ b/index.html @@ -21,11 +21,6 @@ Wiktionary's licensing, where applicable.

¹ Derived from enwiktionary-20250701 dump.

diff --git a/src/animalia.rs b/src/animalia.rs index f3b3261..e1ec8eb 100644 --- a/src/animalia.rs +++ b/src/animalia.rs @@ -1,9 +1,60 @@ use std::error::Error; - +use std::io::{self, prelude::*}; +use std::collections::HashSet; pub fn main(args: Vec) -> Result<(), Box> { if !args.is_empty() { Err("No arguments expected to 'animalia' command")?; } - + let defs_path = "en-definitions.txt"; + let taxa_path = "taxa.txt"; + let taxa_file = std::fs::File::open(taxa_path) + .map_err(|e| format!("couldn't open {taxa_path}: {e}"))?; + let mut species: HashSet = HashSet::new(); + let mut taxa: HashSet = HashSet::new(); + for line in io::BufReader::new(taxa_file).lines() { + let line = line.map_err(|e| format!("couldn't read {taxa_path}: {e}"))?; + let line = line.trim_end_matches(['\r', '\n']); + if line.contains(' ') { + species.insert(line.into()); + } else { + taxa.insert(line.into()); + } + } + let definitions = std::fs::File::open(defs_path) + .map_err(|e| format!("couldn't open {defs_path}: {e}"))?; + let levels: HashSet<&str> = [ + "kingdom", + "phylum", + "class", + "order", + "family", + "genus", + ].into_iter().collect(); + let mut animalia = vec![]; + for line in io::BufReader::new(definitions).lines() { + let line = line.map_err(|e| format!("error reading {defs_path}: {e}"))?; + let line = line.trim_end_matches(['\r', '\n']); + let (word, rest) = line.split_once(" ").expect("bad format for definitions file"); + let (_class, definition) = rest.split_once(' ').expect("bad format for definitions file"); +// println!("{word} {definition}"); + let parts: Vec<&str> = definition.split(|c: char| !c.is_alphabetic()).collect(); + for ws in parts.windows(2) { + if species.contains(&format!("{} {}",ws[0],ws[1])) || + // handles {{taxfmt|Felidae|family}} &c. + levels.contains(ws[1]) && taxa.contains(ws[0]) { + animalia.push(word.to_owned()); + } + } + } + animalia.sort_unstable(); + animalia.dedup(); + let output_path = "animalia.txt"; + let mut s = String::new(); + for animal in &animalia { + s.push_str(animal); + s.push('\n'); + } + std::fs::write(output_path, s) + .map_err(|e| format!("couldn't write {output_path}: {e}"))?; Ok(()) } diff --git a/src/main.rs b/src/main.rs index 0fcc3d6..8ca45fb 100644 --- a/src/main.rs +++ b/src/main.rs @@ -25,7 +25,8 @@ fn try_main() -> Result<(), Box> { let mut args = std::env::args_os().skip(1); let command = args.next(); let no_command = "No command specified. Commands available: -- definitions"; +- definitions +- animalia"; let Some(command) = command else { return Err(no_command.into()); }; -- cgit v1.2.3