summaryrefslogtreecommitdiff
path: root/src/parse_txtree.py
blob: 1b7600d7ca452d2ca03fdc79c25129f44383c321 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
#!/usr/bin/env python3
import regex

# Parse TXTREE from Catalogue of Life
#  https://www.catalogueoflife.org/data/download
# To use this script:
# - Manually remove all non-animal species (not too hard, since Animalia is the first kingdom listed)
# - Save the result as animalia.txtree in the working directory
# - Run the script; the "Weird line:" warnings can be ignored so long as there aren't too many
# - The output is written to taxa.txt

# Capitalised name: one ASCII capital letter followed by one or more Unicode
# letters or hyphens. Used below to validate the first word of every accepted line.
high_taxon_regex = regex.compile(r'^[A-Z](-|\p{L})+$')
# Second word of a [species] line: starts with a lowercase letter or a decimal
# digit, followed by any mix of letters, digits, hyphens, dots and apostrophes.
species_regex = regex.compile(r"^(\p{Ll}|\p{Nd})(-|\p{L}|\p{Nd}|\.|')*$", flags=regex.U)
# A parenthesised token made of word characters and dots, with a single space on
# each side (e.g. " (Abc) "); the main loop replaces each match with one space.
parenthetical_regex = regex.compile(r' \((\w|\.)+\) ')

# Convert the hand-trimmed TXTREE file into a flat list of names, one per line:
# the first two words for lines tagged [species], and the first word alone for
# every other line whose first word looks like a capitalised taxon name.
# Lines that fit neither shape are reported on stdout and skipped.
#
# Both files are opened as UTF-8 explicitly: the data contains non-ASCII
# characters (such as the '†' marker removed below), so relying on the
# platform's default encoding can raise or garble text on non-UTF-8 locales.
with open('animalia.txtree', encoding='utf-8') as animalia, \
		open('taxa.txt', 'w', encoding='utf-8') as taxa:
	for line in animalia:
		# Drop surrounding whitespace, leading '=' characters, '†' markers and
		# " [sensu lato] " annotations before looking at the words.
		line = line.strip().lstrip('=').replace('†', '').replace(' [sensu lato] ', ' ')
		# Collapse space-delimited parenthesised tokens such as " (Abc) " so
		# that the word after them becomes the next word of the line.
		line = parenthetical_regex.sub(' ', line)
		if '[species]' in line:
			words = line.split()
			# A species line must start with a capitalised name followed by
			# a word matching species_regex; anything else is skipped.
			if len(words) < 2 or \
				not high_taxon_regex.match(words[0]) or \
				not species_regex.match(words[1]):
				print('Weird line:',line)
				continue
			taxa.write(words[0] + ' ' + words[1] + '\n')
		elif (i := line.find(' ')) > 0 and high_taxon_regex.match(taxon := line[:i]):
			# Non-species line: keep only the first word (everything before
			# the first space) when it looks like a capitalised taxon name.
			taxa.write(taxon + '\n')
		else:
			print('Weird line:', line)