diff options
Diffstat (limited to 'src/parse_txtree.py')
-rwxr-xr-x | src/parse_txtree.py | 8 |
1 files changed, 8 insertions, 0 deletions
diff --git a/src/parse_txtree.py b/src/parse_txtree.py
index 5aaf87c..1b7600d 100755
--- a/src/parse_txtree.py
+++ b/src/parse_txtree.py
@@ -1,6 +1,14 @@
 #!/usr/bin/env python3
 import regex
 
+# Parse TXTREE from Catalogue of Life
+# https://www.catalogueoflife.org/data/download
+# To use this script:
+# - Manually remove all non-animal species (not too hard since animalia is the 1st kingdom)
+# - Save that as animalia.txtree
+# - Run script, ignoring all the warnings about weird lines so long as there aren't too many.
+# - Get output in taxa.txt
+
 high_taxon_regex = regex.compile(r'^[A-Z](-|\p{L})+$')
 species_regex = regex.compile(r"^(\p{Ll}|\p{Nd})(-|\p{L}|\p{Nd}|\.|')*$", flags=regex.U)
 parenthetical_regex = regex.compile(r' \((\w|\.)+\) ')