summary | refs | log | tree | commit | diff
path: root/src/parse_txtree.py
diff options
context:
space:
mode:
Diffstat (limited to 'src/parse_txtree.py')
-rwxr-xr-x src/parse_txtree.py 24
1 files changed, 24 insertions, 0 deletions
diff --git a/src/parse_txtree.py b/src/parse_txtree.py
new file mode 100755
index 0000000..5aaf87c
--- /dev/null
+++ b/src/parse_txtree.py
@@ -0,0 +1,24 @@
#!/usr/bin/env python3
"""Extract taxon names from ``animalia.txtree`` into ``taxa.txt``.

Every input line is first cleaned of markup.  A line tagged ``[species]``
is then reduced to its first two words; any other line is reduced to its
first word.  Lines that do not have the expected shape are reported on
stdout as ``Weird line: ...`` and nothing is written for them.

NOTE(review): the leading ``=`` characters presumably encode tree depth in
the txtree format -- confirm against the format's documentation.
"""
from typing import Optional

import regex

# First word of any accepted line: one ASCII capital followed by at least
# one more character, each of which is a letter or a hyphen.
high_taxon_regex = regex.compile(r'^[A-Z](-|\p{L})+$')
# Second word of a ``[species]`` line: begins with a lowercase letter or a
# decimal digit, then any mix of letters, digits, '-', '.' and "'".
species_regex = regex.compile(r"^(\p{Ll}|\p{Nd})(-|\p{L}|\p{Nd}|\.|')*$", flags=regex.U)
# A space-delimited "(...)" group made only of word characters and dots.
parenthetical_regex = regex.compile(r' \((\w|\.)+\) ')


def _clean(raw_line: str) -> str:
    """Return *raw_line* with surrounding whitespace and markup removed.

    Strips leading ``=`` characters, every ``†`` character, the
    `` [sensu lato] `` tag and any parenthetical matched by
    ``parenthetical_regex`` (the last two are each replaced by one space).
    """
    line = raw_line.strip().lstrip('=').replace('†', '').replace(' [sensu lato] ', ' ')
    return parenthetical_regex.sub(' ', line)


def _taxon_name(line: str) -> Optional[str]:
    """Return the name to record for a cleaned *line*, or ``None`` if malformed."""
    if '[species]' in line:
        words = line.split()
        if (len(words) >= 2
                and high_taxon_regex.match(words[0])
                and species_regex.match(words[1])):
            return words[0] + ' ' + words[1]
        return None

    # Non-species lines must contain a space after a non-empty first word;
    # an empty first word can never satisfy high_taxon_regex.
    first, space, _rest = line.partition(' ')
    if space and high_taxon_regex.match(first):
        return first
    return None


# The encoding is explicit because the data contains non-ASCII characters
# (the script itself matches '†'); the locale default would mis-decode them
# on non-UTF-8 systems and the replacements above would silently not apply.
with open('animalia.txtree', encoding='utf-8') as animalia, \
        open('taxa.txt', 'w', encoding='utf-8') as taxa:
    for raw_line in animalia:
        line = _clean(raw_line)
        name = _taxon_name(line)
        if name is None:
            print('Weird line:', line)
        else:
            taxa.write(name + '\n')