From fa5db0d2f5cc19e578b74b3bb15f8bb9bc90b789 Mon Sep 17 00:00:00 2001
From: pommicket
Date: Thu, 25 Sep 2025 23:36:05 -0400
Subject: Animalia
---
.gitignore | 1 +
Cargo.lock | 2 +-
Cargo.toml | 2 +-
README.md | 2 +-
index.html | 15 ++++++++++-----
src/animalia.rs | 55 +++++++++++++++++++++++++++++++++++++++++++++++++++++--
src/main.rs | 3 ++-
7 files changed, 69 insertions(+), 11 deletions(-)
diff --git a/.gitignore b/.gitignore
index 811af54..b2d5af0 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,6 +1,7 @@
/target
enwiktionary-*.xml-p*
*definitions.txt*
+animalia.txt*
.*.tmp
*~
.vscode
diff --git a/Cargo.lock b/Cargo.lock
index 213875d..c8519f5 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -3,7 +3,7 @@
version = 4
[[package]]
-name = "wicopy"
+name = "wiktionary"
version = "0.1.0"
dependencies = [
"xml",
diff --git a/Cargo.toml b/Cargo.toml
index 2a9267e..8361bae 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,5 +1,5 @@
[package]
-name = "wicopy"
+name = "wiktionary"
version = "0.1.0"
edition = "2024"
diff --git a/README.md b/README.md
index ecd34b4..92ef4dc 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,4 @@
-# wicopy
+# wiktionary
Various miscellaneous scripts for parsing [wiktionary data dumps](https://dumps.wikimedia.org/enwiktionary/).
diff --git a/index.html b/index.html
index 306087f..8dff7cd 100644
--- a/index.html
+++ b/index.html
@@ -21,11 +21,6 @@
Wiktionary's licensing, where applicable.
- -
- The Big List: word-list.txt.xz (27MB compressed, 120MB uncompressed, 9,878,558 entries).¹
- Every English Wikipedia article title & entry in English Wiktionary; containing only ASCII a-z/A-Z/space, max 2 words.
- Words labelled offensive on Wiktionary were filtered out (overly aggressively—some totally inoffensive words were removed in the process).
-
-
English definitions:
en-definitions.txt.xz (23MB compressed, 127MB uncompressed, 1,629,482 entries)
@@ -68,6 +63,16 @@
DEFINITION
is in the wikitext format.
It’s possible that there are parsing errors, but I haven’t spotted any yet.
+ -
+ All English animal terms: animalia.txt.xz (62KB compressed, 192KB uncompressed).¹
+ This includes both nouns referring to animals (e.g. dog) and animal-related adjectives (e.g. canine).
+ There could definitely be errors due to bad parsing (but I have checked a number of entries at random and they seem good).
+
+ -
+ The Big List: word-list.txt.xz (27MB compressed, 120MB uncompressed, 9,878,558 entries).¹
+ Every English Wikipedia article title & entry in English Wiktionary; containing only ASCII a-z/A-Z/space, max 2 words.
+ Words labelled offensive on Wiktionary were filtered out (overly aggressively—some totally inoffensive words were removed in the process).
+
¹ Derived from enwiktionary-20250701 dump.