From 9306cb60c32082c5403931de0823a9fd5daa196c Mon Sep 17 00:00:00 2001
From: Jason Katz-Brown <jason@airbnb.com>
Date: Sun, 25 Aug 2013 02:17:13 -0700
Subject: Initial git commit.

---
 .../perl_tools/generate_distribution.pl            | 57 ++++++++++++++++++++++
 1 file changed, 57 insertions(+)
 create mode 100755 new_language_generation/perl_tools/generate_distribution.pl

(limited to 'new_language_generation/perl_tools/generate_distribution.pl')

diff --git a/new_language_generation/perl_tools/generate_distribution.pl b/new_language_generation/perl_tools/generate_distribution.pl
new file mode 100755
index 0000000..e86e04a
--- /dev/null
+++ b/new_language_generation/perl_tools/generate_distribution.pl
@@ -0,0 +1,57 @@
+#!/usr/bin/perl
+
+# USAGE: generate_distribution.pl word_list [word_list ...] > character_list
+
+use warnings;
+use strict;
+
+use Getopt::Long;
+use Pod::Usage;
+
+# Command-line switches: --help / -? sets $help; a failed parse calls pod2usage(2).
+my $help = 0;
+GetOptions('help|?' => \$help) or pod2usage(2);
+if ($help) {
+	pod2usage(1);
+}
+
+=pod
+
+=head1 NAME
+
+generate_distribution.pl - Generate letter distribution data
+
+=head1 SYNOPSIS
+
+One or more arguments: files containing the output of generate_words.pl.
+
+=cut
+
+binmode STDOUT, ':utf8';  # write output through the :utf8 layer
+
+my %characters;  # character => running total of counts, filled by read_characters()
+
+# Add up per-character counts from every UTF-8 file named in @ARGV. Each line
+# is "word count"; the word is a ';'-separated list of characters.
+# Dies if a file cannot be opened, instead of silently producing partial totals.
+sub read_characters {
+	for my $arg (@ARGV) {
+		open(my $input, "<:encoding(utf8)", $arg)
+			or die "Cannot open '$arg': $!";
+		while (my $line = <$input>) {
+			chomp $line;
+			my ($word, $count) = split(/\s/, $line);
+			next unless defined $word;  # blank line: nothing to tally
+			$characters{$_} += $count for split(/;/, $word);
+		}
+	}
+}
+
+# Print one "character count" line per character to STDOUT, largest count first.
+sub spit_characters {
+	# Equal counts fall back to character order so output is stable across runs.
+	my @sorted_characters = sort { $characters{$b} <=> $characters{$a} or $a cmp $b } keys %characters;
+	print "$_ $characters{$_}\n" for @sorted_characters;
+}
+
+read_characters();  # tally counts from the files named on the command line
+spit_characters();  # print the totals, highest count first
-- 
cgit v1.2.3