summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorpommicket <pommicket@gmail.com>2025-09-25 23:40:39 -0400
committerpommicket <pommicket@gmail.com>2025-09-25 23:40:39 -0400
commit01480d8186da5b23a977aa4d99649b0e4a961a14 (patch)
tree4f5eb17a31a4de068ef23cf7c286ce9be1f12d7e
parentfa5db0d2f5cc19e578b74b3bb15f8bb9bc90b789 (diff)
Small explainer for parse_txtree
-rwxr-xr-xsrc/parse_txtree.py8
1 files changed, 8 insertions, 0 deletions
diff --git a/src/parse_txtree.py b/src/parse_txtree.py
index 5aaf87c..1b7600d 100755
--- a/src/parse_txtree.py
+++ b/src/parse_txtree.py
@@ -1,6 +1,14 @@
#!/usr/bin/env python3
import regex
+# Parse TXTREE from Catalogue of Life
+# https://www.catalogueoflife.org/data/download
+# To use this script:
+# - Manually remove all non-animal species (not too hard since animalia is the 1st kingdom)
+# - Save that as animalia.txtree
+# - Run script, ignoring all the warnings about weird lines so long as there aren't too many.
+# - Get output in taxa.txt
+
# Patterns used to classify pieces of text from the TXTREE input.
# NOTE(review): the code that applies them is not visible in this excerpt;
# the descriptions below cover only what each pattern itself matches.

# Anchored at both ends: one ASCII uppercase letter followed by one or more
# characters that are each a hyphen or any Unicode letter (\p{L}).
# The shortest possible match is therefore two characters long.
high_taxon_regex = regex.compile(r'^[A-Z](-|\p{L})+$')
# Anchored at both ends: a lowercase letter (\p{Ll}) or decimal digit (\p{Nd}),
# followed by zero or more characters that are each a hyphen, a letter,
# a decimal digit, a period, or an apostrophe.
species_regex = regex.compile(r"^(\p{Ll}|\p{Nd})(-|\p{L}|\p{Nd}|\.|')*$", flags=regex.U)
# Unanchored: a space, "(", one or more word characters or periods, ")", and
# a trailing space. Both surrounding spaces are part of the match.
# NOTE(review): presumably targets a parenthesised name embedded between two
# other words -- confirm against the code that uses it.
parenthetical_regex = regex.compile(r' \((\w|\.)+\) ')