diff options
author | pommicket <pommicket@gmail.com> | 2025-09-25 23:36:05 -0400 |
---|---|---|
committer | pommicket <pommicket@gmail.com> | 2025-09-25 23:36:40 -0400 |
commit | fa5db0d2f5cc19e578b74b3bb15f8bb9bc90b789 (patch) | |
tree | d6d33ac0170a05da616dd232b78db25a5194d308 | |
parent | af6810f0cc4559470409f476252efa13c424ab82 (diff) |
Animalia
-rw-r--r-- | .gitignore | 1 | ||||
-rw-r--r-- | Cargo.lock | 2 | ||||
-rw-r--r-- | Cargo.toml | 2 | ||||
-rw-r--r-- | README.md | 2 | ||||
-rw-r--r-- | index.html | 15 | ||||
-rw-r--r-- | src/animalia.rs | 55 | ||||
-rw-r--r-- | src/main.rs | 3 |
7 files changed, 69 insertions, 11 deletions
@@ -1,6 +1,7 @@ /target enwiktionary-*.xml-p* *definitions.txt* +animalia.txt* .*.tmp *~ .vscode @@ -3,7 +3,7 @@ version = 4 [[package]] -name = "wicopy" +name = "wiktionary" version = "0.1.0" dependencies = [ "xml", @@ -1,5 +1,5 @@ [package] -name = "wicopy" +name = "wiktionary" version = "0.1.0" edition = "2024" @@ -1,4 +1,4 @@ -# wicopy +# wiktionary Various miscellaneous scripts for parsing [wiktionary data dumps](https://dumps.wikimedia.org/enwiktionary/). @@ -22,11 +22,6 @@ </p> <ul> <li> - The Big List: <a href="/tmt/word-list.txt.xz">word-list.txt.xz (27MB compressed, 120MB uncompressed, 9,878,558 entries)</a>.¹<br> - Every English Wikipedia article title & entry in English Wiktionary; containing only ASCII a-z/A-Z/space, max 2 words.<br> - Words labelled <i>offensive</i> on Wiktionary were filtered out (overly aggressively—some totally inoffensive words were removed in the process). - </li> - <li> English definitions: <a href="/wiktionary/en-definitions.txt.xz">en-definitions.txt.xz (23MB compressed, 127MB uncompressed, 1,629,482 entries)</a> and<br>Translingual definitions: @@ -68,6 +63,16 @@ <code>DEFINITION</code> is in the wikitext format.<br> It’s possible that there are parsing errors, but I haven’t spotted any yet. </li> + <li> + All English animal terms: <a href="/wiktionary/animalia.txt.xz">animalia.txt.xz (62KB compressed, 192KB uncompressed)</a>.¹<br> + This includes both nouns referring to animals (e.g. <i>dog</i>) and animal-related adjectives (e.g. <i>canine</i>). + There could definitely be errors due to bad parsing (but I have checked a number of entries at random and they seem good). + </li> + <li> + The Big List: <a href="/tmt/word-list.txt.xz">word-list.txt.xz (27MB compressed, 120MB uncompressed, 9,878,558 entries)</a>.¹<br> + Every English Wikipedia article title & entry in English Wiktionary; containing only ASCII a-z/A-Z/space, max 2 words.<br> + Words labelled <i>offensive</i> on Wiktionary were filtered out (overly aggressively—some totally inoffensive words were removed in the process). + </li> </ul> <p>¹ Derived from <a href="https://dumps.wikimedia.org/enwiktionary/20250701/" target="_blank">enwiktionary-20250701</a> dump.</p> </body> diff --git a/src/animalia.rs b/src/animalia.rs index f3b3261..e1ec8eb 100644 --- a/src/animalia.rs +++ b/src/animalia.rs @@ -1,9 +1,60 @@ use std::error::Error; - +use std::io::{self, prelude::*}; +use std::collections::HashSet; pub fn main(args: Vec<String>) -> Result<(), Box<dyn Error>> { if !args.is_empty() { Err("No arguments expected to 'animalia' command")?; } - + let defs_path = "en-definitions.txt"; + let taxa_path = "taxa.txt"; + let taxa_file = std::fs::File::open(taxa_path) + .map_err(|e| format!("couldn't open {taxa_path}: {e}"))?; + let mut species: HashSet<String> = HashSet::new(); + let mut taxa: HashSet<String> = HashSet::new(); + for line in io::BufReader::new(taxa_file).lines() { + let line = line.map_err(|e| format!("couldn't read {taxa_path}: {e}"))?; + let line = line.trim_end_matches(['\r', '\n']); + if line.contains(' ') { + species.insert(line.into()); + } else { + taxa.insert(line.into()); + } + } + let definitions = std::fs::File::open(defs_path) + .map_err(|e| format!("couldn't open {defs_path}: {e}"))?; + let levels: HashSet<&str> = [ + "kingdom", + "phylum", + "class", + "order", + "family", + "genus", + ].into_iter().collect(); + let mut animalia = vec![]; + for line in io::BufReader::new(definitions).lines() { + let line = line.map_err(|e| format!("error reading {defs_path}: {e}"))?; + let line = line.trim_end_matches(['\r', '\n']); + let (word, rest) = line.split_once(" ").expect("bad format for definitions file"); + let (_class, definition) = rest.split_once(' ').expect("bad format for definitions file"); +// println!("{word} {definition}"); + let parts: Vec<&str> = definition.split(|c: char| !c.is_alphabetic()).collect(); + for ws in parts.windows(2) { + if species.contains(&format!("{} {}",ws[0],ws[1])) || + // handles {{taxfmt|Felidae|family}} &c. + levels.contains(ws[1]) && taxa.contains(ws[0]) { + animalia.push(word.to_owned()); + } + } + } + animalia.sort_unstable(); + animalia.dedup(); + let output_path = "animalia.txt"; + let mut s = String::new(); + for animal in &animalia { + s.push_str(animal); + s.push('\n'); + } + std::fs::write(output_path, s) + .map_err(|e| format!("couldn't write {output_path}: {e}"))?; Ok(()) } diff --git a/src/main.rs b/src/main.rs index 0fcc3d6..8ca45fb 100644 --- a/src/main.rs +++ b/src/main.rs @@ -25,7 +25,8 @@ fn try_main() -> Result<(), Box<dyn Error>> { let mut args = std::env::args_os().skip(1); let command = args.next(); let no_command = "No command specified. Commands available: -- definitions"; +- definitions +- animalia"; let Some(command) = command else { return Err(no_command.into()); }; |