summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--.gitignore1
-rw-r--r--Cargo.lock2
-rw-r--r--Cargo.toml2
-rw-r--r--README.md2
-rw-r--r--index.html15
-rw-r--r--src/animalia.rs55
-rw-r--r--src/main.rs3
7 files changed, 69 insertions, 11 deletions
diff --git a/.gitignore b/.gitignore
index 811af54..b2d5af0 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,6 +1,7 @@
/target
enwiktionary-*.xml-p*
*definitions.txt*
+animalia.txt*
.*.tmp
*~
.vscode
diff --git a/Cargo.lock b/Cargo.lock
index 213875d..c8519f5 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -3,7 +3,7 @@
version = 4
[[package]]
-name = "wicopy"
+name = "wiktionary"
version = "0.1.0"
dependencies = [
"xml",
diff --git a/Cargo.toml b/Cargo.toml
index 2a9267e..8361bae 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,5 +1,5 @@
[package]
-name = "wicopy"
+name = "wiktionary"
version = "0.1.0"
edition = "2024"
diff --git a/README.md b/README.md
index ecd34b4..92ef4dc 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,4 @@
-# wicopy
+# wiktionary
Various miscellaneous scripts for parsing [wiktionary data dumps](https://dumps.wikimedia.org/enwiktionary/).
diff --git a/index.html b/index.html
index 306087f..8dff7cd 100644
--- a/index.html
+++ b/index.html
@@ -22,11 +22,6 @@
</p>
<ul>
<li>
- The Big List: <a href="/tmt/word-list.txt.xz">word-list.txt.xz (27MB compressed, 120MB uncompressed, 9,878,558 entries)</a>.¹<br>
- Every English Wikipedia article title &amp; entry in English Wiktionary; containing only ASCII a-z/A-Z/space, max 2 words.<br>
- Words labelled <i>offensive</i> on Wiktionary were filtered out (overly aggressively—some totally inoffensive words were removed in the process).
- </li>
- <li>
English definitions:
<a href="/wiktionary/en-definitions.txt.xz">en-definitions.txt.xz (23MB compressed, 127MB uncompressed, 1,629,482 entries)</a>
and<br>Translingual definitions:
@@ -68,6 +63,16 @@
<code>DEFINITION</code> is in the wikitext format.<br>
It’s possible that there are parsing errors, but I haven’t spotted any yet.
</li>
+ <li>
+ All English animal terms: <a href="/wiktionary/animalia.txt.xz">animalia.txt.xz (62KB compressed, 192KB uncompressed)</a>.¹<br>
+ This includes both nouns referring to animals (e.g. <i>dog</i>) and animal-related adjectives (e.g. <i>canine</i>).
+ There could definitely be errors due to bad parsing (but I have checked a number of entries at random and they seem good).
+ </li>
+ <li>
+ The Big List: <a href="/tmt/word-list.txt.xz">word-list.txt.xz (27MB compressed, 120MB uncompressed, 9,878,558 entries)</a>.¹<br>
+ Every English Wikipedia article title &amp; entry in English Wiktionary; containing only ASCII a-z/A-Z/space, max 2 words.<br>
+ Words labelled <i>offensive</i> on Wiktionary were filtered out (overly aggressively—some totally inoffensive words were removed in the process).
+ </li>
</ul>
<p>¹ Derived from <a href="https://dumps.wikimedia.org/enwiktionary/20250701/" target="_blank">enwiktionary-20250701</a> dump.</p>
</body>
diff --git a/src/animalia.rs b/src/animalia.rs
index f3b3261..e1ec8eb 100644
--- a/src/animalia.rs
+++ b/src/animalia.rs
@@ -1,9 +1,60 @@
use std::error::Error;
-
+use std::io::{self, prelude::*};
+use std::collections::HashSet;
/// Entry point of the `animalia` command.
///
/// Reads taxon names from `taxa.txt` and dictionary entries from
/// `en-definitions.txt` (both in the current directory), selects every entry
/// whose definition mentions a known taxon, and writes the selected words to
/// `animalia.txt`, sorted and de-duplicated, one per line.
///
/// # Errors
///
/// Returns an error if any argument is passed, if an input file cannot be
/// opened or read, if a line of the definitions file does not have the
/// expected three fields, or if the output file cannot be written.
pub fn main(args: Vec<String>) -> Result<(), Box<dyn Error>> {
    if !args.is_empty() {
        return Err("No arguments expected to 'animalia' command".into());
    }
    let defs_path = "en-definitions.txt";
    let taxa_path = "taxa.txt";
    let output_path = "animalia.txt";

    let (species, taxa) = load_taxa(taxa_path)?;

    let definitions = std::fs::File::open(defs_path)
        .map_err(|e| format!("couldn't open {defs_path}: {e}"))?;
    let mut animalia: Vec<String> = Vec::new();
    for (index, line) in io::BufReader::new(definitions).lines().enumerate() {
        let line = line.map_err(|e| format!("error reading {defs_path}: {e}"))?;
        let line = line.trim_end_matches(['\r', '\n']);
        // A malformed line is reported as an error (with its line number)
        // rather than aborting the whole program with a panic.
        let (word, definition) = split_definition_line(line).ok_or_else(|| {
            format!(
                "bad format for definitions file {defs_path} (line {})",
                index + 1
            )
        })?;
        if mentions_known_taxon(definition, &species, &taxa) {
            animalia.push(word.to_owned());
        }
    }
    // A word can have several matching definitions; keep one copy of each.
    animalia.sort_unstable();
    animalia.dedup();

    let mut output = String::new();
    for animal in &animalia {
        output.push_str(animal);
        output.push('\n');
    }
    std::fs::write(output_path, output)
        .map_err(|e| format!("couldn't write {output_path}: {e}"))?;
    Ok(())
}

/// Loads the taxon list at `taxa_path`, returning `(species, taxa)`.
///
/// Each non-blank line is one name. Names containing a space go into the
/// first set (`species`), all other names into the second (`taxa`).
/// Surrounding whitespace is removed and blank lines are skipped, so that an
/// empty string never ends up in either set (it would otherwise match the
/// empty pieces produced when a definition is split on punctuation).
fn load_taxa(taxa_path: &str) -> Result<(HashSet<String>, HashSet<String>), Box<dyn Error>> {
    let taxa_file = std::fs::File::open(taxa_path)
        .map_err(|e| format!("couldn't open {taxa_path}: {e}"))?;
    let mut species = HashSet::new();
    let mut taxa = HashSet::new();
    for line in io::BufReader::new(taxa_file).lines() {
        let line = line.map_err(|e| format!("couldn't read {taxa_path}: {e}"))?;
        let name = line.trim();
        if name.is_empty() {
            continue;
        }
        if name.contains(' ') {
            species.insert(name.to_owned());
        } else {
            taxa.insert(name.to_owned());
        }
    }
    Ok((species, taxa))
}

/// Splits one line of the definitions file into `(word, definition)`.
///
/// The line is expected to hold three fields: the word, a middle field that
/// is ignored here, and the definition text. Returns `None` if fewer than two
/// separators are present.
// NOTE(review): the separator is the single character that appears in the
// original source; confirm it matches what the `definitions` command writes.
fn split_definition_line(line: &str) -> Option<(&str, &str)> {
    let (word, rest) = line.split_once(' ')?;
    let (_class, definition) = rest.split_once(' ')?;
    Some((word, definition))
}

/// Reports whether `definition` mentions a known taxon.
///
/// The text is split on every non-alphabetic character and each pair of
/// adjacent pieces is examined. A pair matches if the two pieces joined by a
/// space form an entry of `species`, or if the first piece is in `taxa` and
/// the second is a rank name (this handles `{{taxfmt|Felidae|family}}` &c.).
fn mentions_known_taxon(
    definition: &str,
    species: &HashSet<String>,
    taxa: &HashSet<String>,
) -> bool {
    const LEVELS: [&str; 6] = ["kingdom", "phylum", "class", "order", "family", "genus"];
    let pieces: Vec<&str> = definition.split(|c: char| !c.is_alphabetic()).collect();
    // One buffer reused for every candidate two-word name, instead of
    // allocating a fresh string per pair.
    let mut binomial = String::new();
    pieces.windows(2).any(|pair| {
        let (first, second) = (pair[0], pair[1]);
        if LEVELS.contains(&second) && taxa.contains(first) {
            return true;
        }
        binomial.clear();
        binomial.push_str(first);
        binomial.push(' ');
        binomial.push_str(second);
        species.contains(binomial.as_str())
    })
}
diff --git a/src/main.rs b/src/main.rs
index 0fcc3d6..8ca45fb 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -25,7 +25,8 @@ fn try_main() -> Result<(), Box<dyn Error>> {
let mut args = std::env::args_os().skip(1);
let command = args.next();
let no_command = "No command specified. Commands available:
-- definitions";
+- definitions
+- animalia";
let Some(command) = command else {
return Err(no_command.into());
};