Small explainer for parse_txtree

author: pommicket <pommicket@gmail.com> 2025-09-25 23:40:39 -0400
committer: pommicket <pommicket@gmail.com> 2025-09-25 23:40:39 -0400
commit: 01480d8186da5b23a977aa4d99649b0e4a961a14 (patch)
tree: 4f5eb17a31a4de068ef23cf7c286ce9be1f12d7e
parent: fa5db0d2f5cc19e578b74b3bb15f8bb9bc90b789 (diff)
1 files changed, 8 insertions, 0 deletions
diff --git a/src/parse_txtree.py b/src/parse_txtree.py
index 5aaf87c..1b7600d 100755
--- a/src/parse_txtree.py
+++ b/src/parse_txtree.py
@@ -1,6 +1,14 @@
 #!/usr/bin/env python3
 import regex
 
+# Parse TXTREE from Catalogue of Life
+#  https://www.catalogueoflife.org/data/download
+# To use this script:
+# - Manually remove all non-animal species (not too hard, since Animalia is the first kingdom listed)
+# - Save that as animalia.txtree
+# - Run this script; the warnings about weird lines can be ignored as long as there aren't too many.
+# - Get output in taxa.txt
+
 high_taxon_regex = regex.compile(r'^[A-Z](-|\p{L})+$')
 species_regex = regex.compile(r"^(\p{Ll}|\p{Nd})(-|\p{L}|\p{Nd}|\.|')*$", flags=regex.U)
 parenthetical_regex = regex.compile(r' \((\w|\.)+\) ')
author	pommicket <pommicket@gmail.com>	2025-09-25 23:40:39 -0400
committer	pommicket <pommicket@gmail.com>	2025-09-25 23:40:39 -0400
commit	01480d8186da5b23a977aa4d99649b0e4a961a14 (patch)
tree	4f5eb17a31a4de068ef23cf7c286ce9be1f12d7e
parent	fa5db0d2f5cc19e578b74b3bb15f8bb9bc90b789 (diff)