#!/usr/bin/perl

# Recovered from a patch: commit 9306cb60c32082c5403931de0823a9fd5daa196c
# ("Initial git commit.", Jason Katz-Brown, 2013-08-25), which created
# new_language_generation/perl_tools/generate_distribution.pl.

# USAGE: generate_distribution.pl word_list > character_list

use warnings;
use strict;

use Getopt::Long;
use Pod::Usage;

my $help = 0;

GetOptions(
    'help|?' => \$help) or pod2usage(2);

pod2usage(1) if $help;

=pod

=head1 NAME

generate_distribution.pl - Generate letter distribution data

=head1 SYNOPSIS

generate_distribution.pl [--help] word_list [word_list ...] > character_list

=head1 OPTIONS

=over 4

=item B<--help>, B<-?>

Print a usage message and exit.

=back

=head1 DESCRIPTION

Every argument names a word list (the output of generate_words.pl).  Each
line of a word list holds a word and a count separated by whitespace; the
word itself is a sequence of characters separated by semicolons.

For every character the counts of all words it occurs in are added up (once
per occurrence).  The result is printed to standard output as one
"character count" line per character, highest count first; characters with
equal counts are listed in string order.

=cut

binmode STDOUT, ':utf8';

# Running total for every character seen so far, keyed by the character.
my %characters;

# Read every word list named in @ARGV and add each word's count to the total
# of every character that makes up the word.
#
# Blank lines are skipped.  A line that has a word but no count is reported
# on STDERR and counted as 0.  Dies if a file cannot be opened or closed.
sub read_characters {
    for my $arg (@ARGV) {
        open(my $input, "<:encoding(utf8)", $arg)
            or die "Cannot open '$arg': $!\n";

        while (my $line = <$input>) {
            chomp $line;

            # split ' ' ignores leading whitespace and treats any run of
            # whitespace as a single separator, so "word   5" and "word\t5"
            # both yield the count 5.
            my ($word, $count) = split ' ', $line;

            # Nothing but whitespace on this line.
            next unless defined $word;

            if (!defined $count) {
                warn "$arg:$.: no count for '$word', using 0\n";
                $count = 0;
            }

            for my $character (split /;/, $word) {
                $characters{$character} += $count;
            }
        }

        close($input) or die "Cannot close '$arg': $!\n";
    }

    return;
}

# Print one "character count" line per collected character to STDOUT,
# highest count first.  Equal counts are ordered by the character itself so
# that the output does not depend on hash ordering.
sub spit_characters {
    my @sorted_characters = sort {
        $characters{$b} <=> $characters{$a} or $a cmp $b
    } keys %characters;

    for my $character (@sorted_characters) {
        print "$character $characters{$character}\n";
    }

    return;
}

read_characters();
spit_characters();