diff options
author | Jason Katz-Brown <jason@airbnb.com> | 2013-08-25 02:17:13 -0700 |
---|---|---|
committer | Jason Katz-Brown <jason@airbnb.com> | 2013-08-25 02:17:13 -0700 |
commit | 9306cb60c32082c5403931de0823a9fd5daa196c (patch) | |
tree | ca1b6eb695fdf3f0c2294e92416b272164bae642 /new_language_generation/perl_tools/generate_distribution.pl | |
parent | 8fb2c681cecc01b46b0f4ba02d5cc177c4747b1c (diff) |
Initial git commit.
Diffstat (limited to 'new_language_generation/perl_tools/generate_distribution.pl')
-rwxr-xr-x | new_language_generation/perl_tools/generate_distribution.pl | 57 |
1 files changed, 57 insertions, 0 deletions
#!/usr/bin/perl

# USAGE: generate_distribution.pl word_list > character_list

use warnings;
use strict;

use Getopt::Long;
use Pod::Usage;

my $help = 0;

GetOptions(
    'help|?' => \$help) or pod2usage(2);

pod2usage(1) if $help;

=pod

=head1 NAME

generate_distribution.pl - Generate letter distribution data

=head1 SYNOPSIS

generate_distribution.pl [--help] word_list > character_list

One argument: the output of generate_words.pl.

=head1 OPTIONS

=over 4

=item B<--help>, B<-?>

Print a usage message and exit.

=back

=head1 DESCRIPTION

Every input line is expected to hold a word and a count separated by
whitespace.  The word is split on C<;> and the count is added to the
running total of each resulting piece.  The totals are written to standard
output, one C<character total> pair per line, highest total first; pieces
with equal totals are listed in string order so the output is reproducible.

Blank lines are ignored.  Lines without a count are skipped with a warning.
An input file that cannot be opened is a fatal error.

=cut

binmode STDOUT, ':utf8';

# Running total per character, filled by read_characters() and printed by
# spit_characters().
my %characters;

# Read every file named in @ARGV and accumulate the per-character totals
# into %characters.  Dies if a file cannot be opened.
sub read_characters {
    for my $arg (@ARGV) {
        open(my $input, "<:encoding(utf8)", $arg)
            or die "Cannot open '$arg' for reading: $!\n";

        while (my $line = <$input>) {
            chomp $line;

            # split ' ' collapses runs of whitespace and ignores leading
            # whitespace, so "word  12" still yields a usable count.
            my ($word, $count) = split(' ', $line);

            # Blank (or whitespace-only) line: nothing to count.
            next unless defined $word;

            unless (defined $count) {
                warn "$arg:$.: no count on line, skipping\n";
                next;
            }

            for my $character (split(/;/, $word)) {
                $characters{$character} += $count;
            }
        }

        close($input);
    }

    return;
}

# Print "character total" lines to the currently selected output handle,
# ordered by descending total and then by character, so that ties do not
# depend on hash iteration order.
sub spit_characters {
    my @sorted_characters =
        sort { $characters{$b} <=> $characters{$a} or $a cmp $b }
        keys %characters;

    for my $character (@sorted_characters) {
        print "$character $characters{$character}\n";
    }

    return;
}

read_characters();
spit_characters();