diff options
author | pommicket <pommicket@gmail.com> | 2025-09-25 23:40:39 -0400 |
---|---|---|
committer | pommicket <pommicket@gmail.com> | 2025-09-25 23:40:39 -0400 |
commit | 01480d8186da5b23a977aa4d99649b0e4a961a14 (patch) | |
tree | 4f5eb17a31a4de068ef23cf7c286ce9be1f12d7e | |
parent | fa5db0d2f5cc19e578b74b3bb15f8bb9bc90b789 (diff) |
Small explainer for parse_txtree
-rwxr-xr-x | src/parse_txtree.py | 8 |
1 files changed, 8 insertions, 0 deletions
diff --git a/src/parse_txtree.py b/src/parse_txtree.py
index 5aaf87c..1b7600d 100755
--- a/src/parse_txtree.py
+++ b/src/parse_txtree.py
@@ -1,6 +1,14 @@
 #!/usr/bin/env python3
 import regex
 
+# Parse TXTREE from Catalogue of Life
+# https://www.catalogueoflife.org/data/download
+# To use this script:
+# - Manually remove all non-animal species (not too hard since animalia is the 1st kingdom)
+# - Save that as animalia.txtree
+# - Run script, ignoring all the warnings about weird lines so long as there aren't too many.
+# - Get output in taxa.txt
+
 high_taxon_regex = regex.compile(r'^[A-Z](-|\p{L})+$')
 species_regex = regex.compile(r"^(\p{Ll}|\p{Nd})(-|\p{L}|\p{Nd}|\.|')*$", flags=regex.U)
 parenthetical_regex = regex.compile(r' \((\w|\.)+\) ')