diff options
Diffstat (limited to 'src/parse_txtree.py')
-rwxr-xr-x | src/parse_txtree.py | 24 |
1 files changed, 24 insertions, 0 deletions
#!/usr/bin/env python3
"""Extract taxon names from ``animalia.txtree`` into ``taxa.txt``.

Each input line is cleaned up and then classified:

* lines containing ``[species]`` yield a two-word name (first word matched
  by ``high_taxon_regex``, second word matched by ``species_regex``);
* any other line yields its first space-delimited word, provided that word
  is matched by ``high_taxon_regex``.

Lines that fit neither shape are reported on stdout as ``Weird line: ...``
and are not written to the output file.
"""
from typing import Optional

import regex

# Capitalised single word made of letters and hyphens.
high_taxon_regex = regex.compile(r'^[A-Z](-|\p{L})+$')
# Word starting with a lowercase letter or digit, followed by letters,
# digits, hyphens, dots or apostrophes.
species_regex = regex.compile(r"^(\p{Ll}|\p{Nd})(-|\p{L}|\p{Nd}|\.|')*$", flags=regex.U)
# A space-delimited parenthesised token such as " (Foo) " or " (s.l.) ".
parenthetical_regex = regex.compile(r' \((\w|\.)+\) ')


def _clean_line(raw: str) -> str:
    """Return *raw* with surrounding whitespace and markup removed.

    Strips leading ``=`` characters, removes every dagger sign, and collapses
    `` [sensu lato] `` and parenthesised tokens to a single space.
    """
    line = raw.strip().lstrip('=')
    line = line.replace('†', '').replace(' [sensu lato] ', ' ')
    return parenthetical_regex.sub(' ', line)


def _extract_taxon(line: str) -> Optional[str]:
    """Return the taxon name found in a cleaned *line*, or ``None``.

    ``None`` means the line has an unexpected shape and should be reported.
    """
    if '[species]' in line:
        words = line.split()
        if (
            len(words) >= 2
            and high_taxon_regex.match(words[0])
            and species_regex.match(words[1])
        ):
            return words[0] + ' ' + words[1]
        # A species line that fails validation is never retried as a
        # higher taxon; it is simply rejected.
        return None

    # find() returns -1 when there is no space and 0 when the line starts
    # with one; both cases are rejected.
    first_space = line.find(' ')
    if first_space > 0 and high_taxon_regex.match(line[:first_space]):
        return line[:first_space]
    return None


# The data contains non-ASCII characters (the dagger sign, accented letters),
# so the encoding is pinned instead of depending on the platform locale.
with open('animalia.txtree', encoding='utf-8') as animalia, \
        open('taxa.txt', 'w', encoding='utf-8') as taxa:
    for raw_line in animalia:
        line = _clean_line(raw_line)
        taxon = _extract_taxon(line)
        if taxon is None:
            print('Weird line:', line)
            continue
        taxa.write(taxon + '\n')