summaryrefslogtreecommitdiff
path: root/new_language_generation/perl_tools/generate_distribution.pl
diff options
context:
space:
mode:
authorJason Katz-Brown <jason@airbnb.com>2013-08-25 02:17:13 -0700
committerJason Katz-Brown <jason@airbnb.com>2013-08-25 02:17:13 -0700
commit9306cb60c32082c5403931de0823a9fd5daa196c (patch)
treeca1b6eb695fdf3f0c2294e92416b272164bae642 /new_language_generation/perl_tools/generate_distribution.pl
parent8fb2c681cecc01b46b0f4ba02d5cc177c4747b1c (diff)
Initial git commit.
Diffstat (limited to 'new_language_generation/perl_tools/generate_distribution.pl')
-rwxr-xr-xnew_language_generation/perl_tools/generate_distribution.pl57
1 files changed, 57 insertions, 0 deletions
diff --git a/new_language_generation/perl_tools/generate_distribution.pl b/new_language_generation/perl_tools/generate_distribution.pl
new file mode 100755
index 0000000..e86e04a
--- /dev/null
+++ b/new_language_generation/perl_tools/generate_distribution.pl
@@ -0,0 +1,57 @@
+#!/usr/bin/perl
+
+# USAGE: generate_distribution.pl word_list > character_list
+
+use warnings;
+use strict;
+
+use Getopt::Long;
+use Pod::Usage;
+
+# Option parsing: --help (or -?) sets $help; if GetOptions fails, pod2usage(2) is called.
+my $help = 0;
+GetOptions(
+ 'help|?' => \$help) or pod2usage(2);
+# An explicit help request calls pod2usage(1) instead.
+pod2usage(1) if $help;
+
+=pod
+
+=head1 NAME
+
+generate_distribution.pl - Generate letter distribution data
+
+=head1 SYNOPSIS
+
+One argument: the output of generate_words.pl.
+
+=cut
+
+binmode STDOUT, ':utf8';   # apply the :utf8 output layer to everything printed
+# Running total per character: filled by read_characters, printed by spit_characters.
+my %characters;
+
+# Add up per-character counts from every input file named in @ARGV.
+sub read_characters {
+    for my $arg (@ARGV) {
+        # Fail loudly: an unchecked open would just skip the file's data.
+        open(my $input, "<:encoding(utf8)", $arg) or die "Cannot open '$arg': $!\n";
+        while (my $line = <$input>) {
+            chomp $line;
+            # Each line is a ';'-separated word, whitespace, then its count.
+            my ($word, $count) = split(/\s/, $line);
+            $characters{$_} += $count for split(/;/, $word);
+        }
+        close($input);
+    }
+}
+
+# Print one "character count" line per character to STDOUT, highest count first.
+sub spit_characters {
+    # Ties are broken by character so the output is reproducible across runs.
+    my @sorted = sort { $characters{$b} <=> $characters{$a} or $a cmp $b } keys %characters;
+    print "$_ $characters{$_}\n" for @sorted;
+}
+
+read_characters();   # accumulate counts from the files in @ARGV into %characters
+spit_characters();   # print the accumulated totals, sorted by count descending