#!/usr/bin/env python3
"""Extract taxon names from ``animalia.txtree`` into ``taxa.txt``.

Each input line is cleaned up (surrounding whitespace, leading ``=``
characters, ``†`` marks, `` [sensu lato] `` tags and space-delimited
parentheticals are removed) and then classified:

* a line containing ``[species]`` contributes its first two words,
  written as ``"<word0> <word1>"``;
* any other line contributes the text before its first space, provided
  that text looks like a higher-taxon name.

Lines that fit neither shape are reported on stdout as ``Weird line: ...``
and produce no output in ``taxa.txt``.
"""

import regex

# An ASCII capital letter followed by one or more letters or hyphens.
high_taxon_regex = regex.compile(r'^[A-Z](-|\p{L})+$')
# A lowercase letter or decimal digit, then any mix of letters, digits,
# hyphens, periods and apostrophes.
species_regex = regex.compile(r"^(\p{Ll}|\p{Nd})(-|\p{L}|\p{Nd}|\.|')*$", flags=regex.U)
# A parenthesised run of word characters/periods with a space on each side.
parenthetical_regex = regex.compile(r' \((\w|\.)+\) ')

# The encoding is pinned to UTF-8 so that the non-ASCII '†' marker is decoded
# (and therefore removed) identically on every platform, instead of depending
# on the locale's default encoding.
# NOTE(review): assumes animalia.txtree is UTF-8 encoded -- confirm.
with open('animalia.txtree', encoding='utf-8') as animalia, \
        open('taxa.txt', 'w', encoding='utf-8') as taxa:
    for line in animalia:
        line = line.strip().lstrip('=').replace('†', '').replace(' [sensu lato] ', ' ')
        line = parenthetical_regex.sub(' ', line)

        if '[species]' in line:
            words = line.split()
            if (
                len(words) < 2
                or not high_taxon_regex.match(words[0])
                or not species_regex.match(words[1])
            ):
                print('Weird line:', line)
                continue
            taxa.write(words[0] + ' ' + words[1] + '\n')
        # `> 0` rejects both "no space at all" (-1) and "starts with a
        # space" (0), so `taxon` is always a non-empty prefix here.
        elif (i := line.find(' ')) > 0 and high_taxon_regex.match(taxon := line[:i]):
            taxa.write(taxon + '\n')
        else:
            print('Weird line:', line)