summaryrefslogtreecommitdiff
path: root/new_language_generation/perl_tools/generate_alphabet.pl
diff options
context:
space:
mode:
authorJason Katz-Brown <jason@airbnb.com>2013-08-25 02:17:13 -0700
committerJason Katz-Brown <jason@airbnb.com>2013-08-25 02:17:13 -0700
commit9306cb60c32082c5403931de0823a9fd5daa196c (patch)
treeca1b6eb695fdf3f0c2294e92416b272164bae642 /new_language_generation/perl_tools/generate_alphabet.pl
parent8fb2c681cecc01b46b0f4ba02d5cc177c4747b1c (diff)
Initial git commit.
Diffstat (limited to 'new_language_generation/perl_tools/generate_alphabet.pl')
-rwxr-xr-xnew_language_generation/perl_tools/generate_alphabet.pl281
1 files changed, 281 insertions, 0 deletions
diff --git a/new_language_generation/perl_tools/generate_alphabet.pl b/new_language_generation/perl_tools/generate_alphabet.pl
new file mode 100755
index 0000000..31006a7
--- /dev/null
+++ b/new_language_generation/perl_tools/generate_alphabet.pl
@@ -0,0 +1,281 @@
+#!/usr/bin/perl
+
+use warnings;
+use strict;
+
+use Getopt::Long;
+use Pod::Usage;
+
+use Lingua::KO::Hangul::Util qw(:all);
+use POSIX qw(ceil floor);
+
+my $distribution_filename = 'undef';
+my $analysis_filename = 'undef';
+my $count_assigner_filename = 'undef';
+
+my $counts_only = 0;
+
+my $number_of_distinct_tiles = 30;
+my $number_of_blanks = 2;
+
+my $help = 0;
+
+GetOptions('distribution=s' => \$distribution_filename,
+ 'analysis=s' => \$analysis_filename,
+ 'count_assigner=s' => \$count_assigner_filename,
+ 'number_of_distinct_tiles=i' => \$number_of_distinct_tiles,
+ 'number_of_blanks=i' => \$number_of_blanks,
+ 'counts_only' => \$counts_only,
+ 'help|?' => \$help) or pod2usage(2);
+
+pod2usage(1) if $help;
+
+=pod
+
+=head1 NAME
+
+generate_alphabet.pl - Print out quackle alphabet file
+
+=head1 SYNOPSIS
+
+--distribution=<file>
+ output of generate_distribution.pl
+
+--analysis=<file>
+ output of analyse_gcgs.pl
+
+--count_assigner=<file>
+ file with one number per line. Number on nth line is how many
+ tiles are in bag of letter that appears nth least often.
+
+--number_of_distinct_tiles=[1, 30]
+ how many tiles are in the alphabet (should be same as length
+ of count assigner file).
+
+--number_of_blanks=<nonnegative integer>
+ how many scoreless blanks are in the bag.
+
+--counts_only
+ only output the counts of letters, in same format as
+ count_assigner input
+
+=cut
+
+my @count_assigner = ();
+my $has_assigned_counts = 0;
+
+my %character_counts;
+my $count_sum = 0;
+my $inverse_count_sum = 0;
+
+my %character_points_per_appearance;
+my $lowest_points_per_appearance = -1;
+my $highest_points_per_appearance = 0;
+my $maximum_value = 10;
+
+binmode STDOUT, ':utf8';
+
+sub read_count_assigner {
+ if ($count_assigner_filename eq 'undef') {
+ $has_assigned_counts = 0;
+ return;
+ }
+
+ if (!(-e $count_assigner_filename)) {
+ print STDERR "Count assigner $count_assigner_filename does not exist; ignoring.\n";
+ $has_assigned_counts = 0;
+ return;
+ }
+
+ $has_assigned_counts = 1;
+
+ open (my $input, "<:encoding(utf8)", $count_assigner_filename);
+
+ my $i = 0;
+ while (<$input>) {
+ chomp;
+ next if (/^\#/);
+
+ my ($count, @rest) = split /\s/, $_;
+ $count_assigner[$i] = $count;
+ ++$i;
+ }
+}
+
+sub read_analysis {
+ if ($analysis_filename eq 'undef') {
+ return;
+ }
+
+ if (!(-e $analysis_filename)) {
+ print STDERR "Analysis $analysis_filename does not exist; ignoring.\n";
+ return;
+ }
+
+ open (my $input, "<:encoding(utf8)", $analysis_filename);
+
+ my $i = 0;
+ while (<$input>) {
+ chomp;
+ next if (/^\#/);
+
+ my ($letter, $points_per_appearance) = split /\s/, $_;
+ next if (! defined $points_per_appearance);
+
+ next if ($letter =~ /\[/);
+ next if ($letter =~ /\]/);
+
+ $letter =~ s/\|//g;
+
+ $character_points_per_appearance{$letter} = $points_per_appearance;
+ }
+
+ calculate_bounding_points_per_appearance_values();
+}
+
+sub calculate_bounding_points_per_appearance_values {
+ my $i = 0;
+ for my $character (sort { $character_counts{$a} <=> $character_counts{$b} } keys %character_counts) {
+ if (exists $character_counts{$character} && count_of_character($i, $character) > 0) {
+ my $points_per_appearance = $character_points_per_appearance{$character};
+ if ($lowest_points_per_appearance < 0 || $points_per_appearance < $lowest_points_per_appearance) {
+ $lowest_points_per_appearance = $points_per_appearance;
+ }
+
+ if ($points_per_appearance > $highest_points_per_appearance) {
+ $highest_points_per_appearance = $points_per_appearance;
+ }
+ }
+
+ ++$i;
+ }
+}
+
+sub read_characters {
+ open (my $input, "<:encoding(utf8)", $distribution_filename);
+
+ my $i = 0;
+ while (<$input>) {
+ last if ($i >= $number_of_distinct_tiles);
+
+ chomp;
+ my ($letter, $count) = split(/\s/, $_);
+
+ $character_counts{$letter} = $count;
+ $count_sum += $count;
+ $inverse_count_sum += exp(-.0005*$count);
+ ++$i;
+ }
+}
+
+sub max {
+ return $_[0] > $_[1]? $_[0] : $_[1];
+}
+
+sub value_of_character {
+ my ($letter) = @_;
+ $letter =~ s/\|//g;
+
+ if (exists $character_points_per_appearance{$letter}) {
+ my $value_multiplier = $maximum_value / ($highest_points_per_appearance - $lowest_points_per_appearance);
+
+ return max(floor(($highest_points_per_appearance - $character_points_per_appearance{$letter}) * $value_multiplier), 1);
+ }
+
+ # This was written VERY late at night.
+ #print "computing value of $letter; count_sum=$count_sum; my count=$character_counts{$letter}\n";
+ return max(floor(187 * exp(-.0005*$character_counts{$letter}) / $inverse_count_sum), 1);
+}
+
+sub hallucinate_assigned_counts {
+ my $total_rounded_count = $number_of_blanks;
+ my $i = 0;
+ for my $character (sort { $character_counts{$a} <=> $character_counts{$b} } keys %character_counts) {
+ my $original_count = $character_counts{$character};
+ my $rounded_count;
+
+ my $truth = 100 * $original_count / $count_sum;
+ if ($truth < 0.1) {
+ $rounded_count = 0;
+ } else {
+ $rounded_count = max(floor($truth), 1);
+ }
+
+ $total_rounded_count += $rounded_count;
+ $count_assigner[$i] = $rounded_count;
+
+ ++$i;
+ }
+
+ $has_assigned_counts = 1;
+}
+
+sub count_of_character {
+ my ($index, $count) = @_;
+
+ if ($has_assigned_counts && $index < @count_assigner) {
+ return $count_assigner[$index]
+ }
+
+ return 0;
+}
+
+sub spit_characters {
+ if ($counts_only) {
+ my $i = 0;
+ for my $character (sort { $character_counts{$a} <=> $character_counts{$b} } keys %character_counts) {
+ my $count = count_of_character($i, $character_counts{$character});
+ print "$count #" . ($i + 1) . ": $character\n";
+ ++$i;
+ }
+ return;
+ }
+
+ my $total_score = 0;
+ my $total_count = $number_of_blanks;
+ print "blank 0 $number_of_blanks\n";
+
+ print "#char\tblank\tscore\tcount\tvowel\n";
+ my $i = 0;
+ for my $character (sort { $character_counts{$a} <=> $character_counts{$b} } keys %character_counts) {
+ my $blank_text = lc($character);
+
+ # TODO lower-case blanks currently broken.
+ my $disable_lower_case_blanks = 1;
+ if ($disable_lower_case_blanks || $blank_text eq $character) {
+ $blank_text = '['.$character.']';
+ $blank_text =~ s/\|//g;
+ }
+
+ my $score = value_of_character($character);
+ my $count = count_of_character($i, $character_counts{$character});
+ $total_count += $count;
+ $total_score += $score;
+
+ my $is_vowel = '0';
+
+ # TODO make work for unfilled jamo
+ if (isStandardForm($character)) {
+ my @codepoints = unpack('U*', $character);
+ if ($codepoints[0] == 0x115F) {
+ $is_vowel = '1';
+ }
+ }
+
+ ++$i;
+
+ print "$character\t$blank_text\t$score\t$count\t$is_vowel\n";
+ }
+
+ print "# Total count: $total_count\n";
+ print "# Total score: $total_score\n";
+}
+
+read_characters();
+read_count_assigner();
+if (!$has_assigned_counts) {
+ hallucinate_assigned_counts();
+}
+
+read_analysis();
+spit_characters();