#!/usr/bin/env python3
import regex

# Parse TXTREE from Catalogue of Life
# https://www.catalogueoflife.org/data/download
#
# To use this script:
# - Manually remove all non-animal species (not too hard since animalia is
#   the 1st kingdom)
# - Save that as animalia.txtree
# - Run script, ignoring all the warnings about weird lines so long as there
#   aren't too many.
# - Get output in taxa.txt
#
# Output format: one line per accepted input line. Lines tagged "[species]"
# produce "<first word> <second word>"; every other accepted line produces
# just its first word.

# First word of a line: an ASCII capital followed by one or more letters
# or hyphens.
high_taxon_regex = regex.compile(r'^[A-Z](-|\p{L})+$')

# Second word of a "[species]" line: starts with a lowercase letter or a
# digit, then any mix of letters, digits, hyphens, periods and apostrophes.
species_regex = regex.compile(
    r"^(\p{Ll}|\p{Nd})(-|\p{L}|\p{Nd}|\.|')*$", flags=regex.U)

# A space-delimited parenthesised token such as " (Foo) "; it is collapsed to
# a single space so that the word after it becomes the second word.
# NOTE(review): presumably a subgenus marker -- confirm against the data.
parenthetical_regex = regex.compile(r' \((\w|\.)+\) ')

# The input contains non-ASCII characters (e.g. the dagger stripped below),
# so the encoding is given explicitly instead of relying on the locale.
# NOTE(review): assumes the TXTREE export is UTF-8 -- confirm.
with open('animalia.txtree', encoding='utf-8') as animalia, \
        open('taxa.txt', 'w', encoding='utf-8') as taxa:
    for line in animalia:
        # Drop surrounding whitespace, then leading '=' characters, then the
        # dagger and "[sensu lato]" annotations.
        line = (line.strip()
                    .lstrip('=')
                    .replace('†', '')
                    .replace(' [sensu lato] ', ' '))
        line = parenthetical_regex.sub(' ', line)

        if '[species]' in line:
            words = line.split()
            if (len(words) < 2
                    or not high_taxon_regex.match(words[0])
                    or not species_regex.match(words[1])):
                print('Weird line:', line)
                continue
            taxa.write(words[0] + ' ' + words[1] + '\n')
        elif ((i := line.find(' ')) > 0
                and high_taxon_regex.match(taxon := line[:i])):
            # Non-species line: keep only the first word.
            taxa.write(taxon + '\n')
        else:
            # No space in the line (this includes empty lines), or the first
            # word does not match high_taxon_regex.
            print('Weird line:', line)