summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rwxr-xr-xsrc/parse_txtree.py8
1 files changed, 8 insertions, 0 deletions
diff --git a/src/parse_txtree.py b/src/parse_txtree.py
index 5aaf87c..1b7600d 100755
--- a/src/parse_txtree.py
+++ b/src/parse_txtree.py
@@ -1,6 +1,14 @@
#!/usr/bin/env python3
import regex
+# Parse TXTREE from Catalogue of Life
+# https://www.catalogueoflife.org/data/download
+# To use this script:
+# - Manually remove all non-animal species (not too hard, since Animalia is the first kingdom listed)
+# - Save that as animalia.txtree
+# - Run the script, ignoring the warnings about weird lines as long as there aren't too many.
+# - Get output in taxa.txt
+
# Matches a whole string consisting of one ASCII capital letter followed by one
# or more hyphens or Unicode letters (\p{L}).
# NOTE(review): judging by the name, intended for names above species rank — confirm against usage.
high_taxon_regex = regex.compile(r'^[A-Z](-|\p{L})+$')
# Matches a whole string that starts with a lowercase letter (\p{Ll}) or a decimal
# digit (\p{Nd}), followed by any number of hyphens, letters, decimal digits,
# periods or apostrophes.
# NOTE(review): judging by the name, intended for species-level epithets — confirm against usage.
species_regex = regex.compile(r"^(\p{Ll}|\p{Nd})(-|\p{L}|\p{Nd}|\.|')*$", flags=regex.U)
# Matches a parenthesised run of one or more word characters or periods that has a
# single space on each side, e.g. " (Abc.) ". Not anchored, so it can match
# anywhere within a longer string.
parenthetical_regex = regex.compile(r' \((\w|\.)+\) ')