From 9306cb60c32082c5403931de0823a9fd5daa196c Mon Sep 17 00:00:00 2001 From: Jason Katz-Brown Date: Sun, 25 Aug 2013 02:17:13 -0700 Subject: Initial git commit. --- new_language_generation/perl_tools/analyse_gcgs.pl | 56 ++++ .../perl_tools/generate_alphabet.pl | 281 ++++++++++++++++++ .../perl_tools/generate_distribution.pl | 57 ++++ .../perl_tools/generate_raw_words.pl | 23 ++ .../perl_tools/generate_tileset.pl | 103 +++++++ .../perl_tools/generate_words.pl | 317 +++++++++++++++++++++ 6 files changed, 837 insertions(+) create mode 100755 new_language_generation/perl_tools/analyse_gcgs.pl create mode 100755 new_language_generation/perl_tools/generate_alphabet.pl create mode 100755 new_language_generation/perl_tools/generate_distribution.pl create mode 100755 new_language_generation/perl_tools/generate_raw_words.pl create mode 100755 new_language_generation/perl_tools/generate_tileset.pl create mode 100755 new_language_generation/perl_tools/generate_words.pl (limited to 'new_language_generation') diff --git a/new_language_generation/perl_tools/analyse_gcgs.pl b/new_language_generation/perl_tools/analyse_gcgs.pl new file mode 100755 index 0000000..94e7cce --- /dev/null +++ b/new_language_generation/perl_tools/analyse_gcgs.pl @@ -0,0 +1,56 @@ +#!/usr/bin/perl + +# USAGE: analyse_gcgs.pl *.gcg > gcganalysis + +use warnings; +use strict; + +my %letter_points; +my %letter_counts; + +binmode STDOUT, ':utf8'; + +sub read_gcgs { + my $games_count = 0; + for my $arg (@ARGV) { + open (my $input, "<:encoding(utf8)", $arg); + ++$games_count; + + while (<$input>) { + chomp; + next if (!/^\>/); + + my ($player, $rack, $position, $play, $score, $cumulativescore) = split /\s+/, $_; + + next if (! defined $cumulativescore); + if ($play =~ /\[/) { + print "blank play! $play for $score\n"; + next; + } + + for my $letter (split /|/, $play) { + ++$letter_counts{$letter}; + $letter_points{$letter} += int($score); + + #print "$letter now seen " . $letter_counts{$letter} . " times, scored " . $letter_points{$letter} . " points\n"; + } + } + } + + print "# Read $games_count games.\n"; +} + +sub spit_analysis { + my %points_per_turn; + for my $letter (keys %letter_counts) { + $points_per_turn{$letter} = $letter_points{$letter} / $letter_counts{$letter}; + } + + for my $letter (sort { $points_per_turn{$b} <=> $points_per_turn{$a} } keys %points_per_turn) { + # TODO lame that this always puts out un-bar-surrounded letters + print "$letter " . $points_per_turn{$letter} . "\n"; + } +} + +read_gcgs(); +spit_analysis(); diff --git a/new_language_generation/perl_tools/generate_alphabet.pl b/new_language_generation/perl_tools/generate_alphabet.pl new file mode 100755 index 0000000..31006a7 --- /dev/null +++ b/new_language_generation/perl_tools/generate_alphabet.pl @@ -0,0 +1,281 @@ +#!/usr/bin/perl + +use warnings; +use strict; + +use Getopt::Long; +use Pod::Usage; + +use Lingua::KO::Hangul::Util qw(:all); +use POSIX qw(ceil floor); + +my $distribution_filename = 'undef'; +my $analysis_filename = 'undef'; +my $count_assigner_filename = 'undef'; + +my $counts_only = 0; + +my $number_of_distinct_tiles = 30; +my $number_of_blanks = 2; + +my $help = 0; + +GetOptions('distribution=s' => \$distribution_filename, + 'analysis=s' => \$analysis_filename, + 'count_assigner=s' => \$count_assigner_filename, + 'number_of_distinct_tiles=i' => \$number_of_distinct_tiles, + 'number_of_blanks=i' => \$number_of_blanks, + 'counts_only' => \$counts_only, + 'help|?' => \$help) or pod2usage(2); + +pod2usage(1) if $help; + +=pod + +=head1 NAME + +generate_alphabet.pl - Print out quackle alphabet file + +=head1 SYNOPSIS + +--distribution= + output of generate_distribution.pl + +--analysis= + output of analyse_gcgs.pl + +--count_assigner= + file with one number per line. Number on nth line is how many + tiles are in bag of letter that appears nth least often. + +--number_of_distinct_tiles=[1, 30] + how many tiles are in the alphabet (should be same as length + of count assigner file). + +--number_of_blanks= + how many scoreless blanks are in the bag. + +--counts_only + only output the counts of letters, in same format as + count_assigner input + +=cut + +my @count_assigner = (); +my $has_assigned_counts = 0; + +my %character_counts; +my $count_sum = 0; +my $inverse_count_sum = 0; + +my %character_points_per_appearance; +my $lowest_points_per_appearance = -1; +my $highest_points_per_appearance = 0; +my $maximum_value = 10; + +binmode STDOUT, ':utf8'; + +sub read_count_assigner { + if ($count_assigner_filename eq 'undef') { + $has_assigned_counts = 0; + return; + } + + if (!(-e $count_assigner_filename)) { + print STDERR "Count assigner $count_assigner_filename does not exist; ignoring.\n"; + $has_assigned_counts = 0; + return; + } + + $has_assigned_counts = 1; + + open (my $input, "<:encoding(utf8)", $count_assigner_filename); + + my $i = 0; + while (<$input>) { + chomp; + next if (/^\#/); + + my ($count, @rest) = split /\s/, $_; + $count_assigner[$i] = $count; + ++$i; + } +} + +sub read_analysis { + if ($analysis_filename eq 'undef') { + return; + } + + if (!(-e $analysis_filename)) { + print STDERR "Analysis $analysis_filename does not exist; ignoring.\n"; + return; + } + + open (my $input, "<:encoding(utf8)", $analysis_filename); + + my $i = 0; + while (<$input>) { + chomp; + next if (/^\#/); + + my ($letter, $points_per_appearance) = split /\s/, $_; + next if (! defined $points_per_appearance); + + next if ($letter =~ /\[/); + next if ($letter =~ /\]/); + + $letter =~ s/\|//g; + + $character_points_per_appearance{$letter} = $points_per_appearance; + } + + calculate_bounding_points_per_appearance_values(); +} + +sub calculate_bounding_points_per_appearance_values { + my $i = 0; + for my $character (sort { $character_counts{$a} <=> $character_counts{$b} } keys %character_counts) { + if (exists $character_counts{$character} && count_of_character($i, $character) > 0) { + my $points_per_appearance = $character_points_per_appearance{$character}; + if ($lowest_points_per_appearance < 0 || $points_per_appearance < $lowest_points_per_appearance) { + $lowest_points_per_appearance = $points_per_appearance; + } + + if ($points_per_appearance > $highest_points_per_appearance) { + $highest_points_per_appearance = $points_per_appearance; + } + } + + ++$i; + } +} + +sub read_characters { + open (my $input, "<:encoding(utf8)", $distribution_filename); + + my $i = 0; + while (<$input>) { + last if ($i >= $number_of_distinct_tiles); + + chomp; + my ($letter, $count) = split(/\s/, $_); + + $character_counts{$letter} = $count; + $count_sum += $count; + $inverse_count_sum += exp(-.0005*$count); + ++$i; + } +} + +sub max { + return $_[0] > $_[1]? $_[0] : $_[1]; +} + +sub value_of_character { + my ($letter) = @_; + $letter =~ s/\|//g; + + if (exists $character_points_per_appearance{$letter}) { + my $value_multiplier = $maximum_value / ($highest_points_per_appearance - $lowest_points_per_appearance); + + return max(floor(($highest_points_per_appearance - $character_points_per_appearance{$letter}) * $value_multiplier), 1); + } + + # This was written VERY late at night. + #print "computing value of $letter; count_sum=$count_sum; my count=$character_counts{$letter}\n"; + return max(floor(187 * exp(-.0005*$character_counts{$letter}) / $inverse_count_sum), 1); +} + +sub hallucinate_assigned_counts { + my $total_rounded_count = $number_of_blanks; + my $i = 0; + for my $character (sort { $character_counts{$a} <=> $character_counts{$b} } keys %character_counts) { + my $original_count = $character_counts{$character}; + my $rounded_count; + + my $truth = 100 * $original_count / $count_sum; + if ($truth < 0.1) { + $rounded_count = 0; + } else { + $rounded_count = max(floor($truth), 1); + } + + $total_rounded_count += $rounded_count; + $count_assigner[$i] = $rounded_count; + + ++$i; + } + + $has_assigned_counts = 1; +} + +sub count_of_character { + my ($index, $count) = @_; + + if ($has_assigned_counts && $index < @count_assigner) { + return $count_assigner[$index] + } + + return 0; +} + +sub spit_characters { + if ($counts_only) { + my $i = 0; + for my $character (sort { $character_counts{$a} <=> $character_counts{$b} } keys %character_counts) { + my $count = count_of_character($i, $character_counts{$character}); + print "$count #" . ($i + 1) . ": $character\n"; + ++$i; + } + return; + } + + my $total_score = 0; + my $total_count = $number_of_blanks; + print "blank 0 $number_of_blanks\n"; + + print "#char\tblank\tscore\tcount\tvowel\n"; + my $i = 0; + for my $character (sort { $character_counts{$a} <=> $character_counts{$b} } keys %character_counts) { + my $blank_text = lc($character); + + # TODO lower-case blanks currently broken. + my $disable_lower_case_blanks = 1; + if ($disable_lower_case_blanks || $blank_text eq $character) { + $blank_text = '['.$character.']'; + $blank_text =~ s/\|//g; + } + + my $score = value_of_character($character); + my $count = count_of_character($i, $character_counts{$character}); + $total_count += $count; + $total_score += $score; + + my $is_vowel = '0'; + + # TODO make work for unfilled jamo + if (isStandardForm($character)) { + my @codepoints = unpack('U*', $character); + if ($codepoints[0] == 0x115F) { + $is_vowel = '1'; + } + } + + ++$i; + + print "$character\t$blank_text\t$score\t$count\t$is_vowel\n"; + } + + print "# Total count: $total_count\n"; + print "# Total score: $total_score\n"; +} + +read_characters(); +read_count_assigner(); +if (!$has_assigned_counts) { + hallucinate_assigned_counts(); +} + +read_analysis(); +spit_characters(); diff --git a/new_language_generation/perl_tools/generate_distribution.pl b/new_language_generation/perl_tools/generate_distribution.pl new file mode 100755 index 0000000..e86e04a --- /dev/null +++ b/new_language_generation/perl_tools/generate_distribution.pl @@ -0,0 +1,57 @@ +#!/usr/bin/perl + +# USAGE: generate_distribution.pl word_list > character_list + +use warnings; +use strict; + +use Getopt::Long; +use Pod::Usage; + +my $help = 0; + +GetOptions( + 'help|?' => \$help) or pod2usage(2); + +pod2usage(1) if $help; + +=pod + +=head1 NAME + +generate_distribution.pl - Generate letter distribution data + +=head1 SYNOPSIS + +One argument: the output of generate_words.pl. + +=cut + +binmode STDOUT, ':utf8'; + +my %characters; + +sub read_characters { + for my $arg (@ARGV) { + open (my $input, "<:encoding(utf8)", $arg); + + while (<$input>) { + chomp; + my ($word, $count) = split(/\s/, $_); + + for my $character (split(/;/, $word)) { + $characters{$character} += $count; + } + } + } +} + +sub spit_characters { + my @sorted_characters = sort { $characters{$b} <=> $characters{$a} } keys %characters; + for my $character (@sorted_characters) { + print "$character $characters{$character}\n"; + } +} + +read_characters(); +spit_characters(); diff --git a/new_language_generation/perl_tools/generate_raw_words.pl b/new_language_generation/perl_tools/generate_raw_words.pl new file mode 100755 index 0000000..b32d810 --- /dev/null +++ b/new_language_generation/perl_tools/generate_raw_words.pl @@ -0,0 +1,23 @@ +#!/usr/bin/perl + +# USAGE: generate_raw_words.pl generate_words_output + +binmode STDOUT, ':utf8'; + +sub read_and_spit { + for my $arg (@ARGV) { + open (my $input, "<:encoding(utf8)", $arg); + + while (<$input>) { + chomp; + my ($word, $count) = split(/\s/, $_); + + my $clean_word = $word; + $clean_word =~ s/;//g; + + print "$clean_word\n"; + } + } +} + +read_and_spit(); diff --git a/new_language_generation/perl_tools/generate_tileset.pl b/new_language_generation/perl_tools/generate_tileset.pl new file mode 100755 index 0000000..0df2e3b --- /dev/null +++ b/new_language_generation/perl_tools/generate_tileset.pl @@ -0,0 +1,103 @@ +#!/usr/bin/perl + +use warnings; +use strict; + +use Getopt::Long; +use Pod::Usage; + +my $alphabet_filename = 'undef'; +my $help = 0; + +GetOptions('alphabet=s' => \$alphabet_filename, + 'help|?' => \$help) or pod2usage(2); + +pod2usage(1) if $help; + +=pod + +=head1 NAME + +generate_tileset.pl - Print out HTML tileset + +=head1 SYNOPSIS + +--alphabet= + output of generate_alphabet.pl + +=cut + +binmode STDOUT, ':utf8'; + +# tiles{count}{letter} = score +my %tiles; + +sub read_alphabet { + if ($alphabet_filename eq 'undef') { + return; + } + + open (my $input, "<:encoding(utf8)", $alphabet_filename); + + my $i = 0; + while (<$input>) { + chomp; + next if (/^\#/); + + my ($letter, $blank_text, $score, $count) = split /\s/, $_; + + if ($letter eq 'blank') { + $tiles{$score}{"　"} = 0; + next; + } + + next if (! defined $count); + + $letter =~ s/\|//g; + $tiles{$count}{$letter} = $score; + } +} + +sub spit_characters { + my $fontsize = "1.2cm"; + my $score_fontsize = "18pt"; + #my $color="#458B74"; + #my $color="black"; + #my $color="#EAC117"; + my $color="darkblue"; + my $width = "5in"; + my $height = "0"; + + print "\n\n\n\n\n"; + + print "\n"; + + print ""; + my $already_in_row = 0; + for my $count (sort keys %tiles) { + for my $letter (sort keys %{$tiles{$count}}) { + for my $i (1 .. $count) { + if ($already_in_row >= 10) { + print "\n"; + $already_in_row = 0; + } + + my $score = $tiles{$count}{$letter}; + my $align = 'center'; + print "\n"; + + ++$already_in_row; + } + } + } + print "\n"; + + print "

\n"; + print "$letter"; + print "_$score"; + print "

\n"; + print "\n\n"; +} + +read_alphabet(); +spit_characters(); diff --git a/new_language_generation/perl_tools/generate_words.pl b/new_language_generation/perl_tools/generate_words.pl new file mode 100755 index 0000000..858d978 --- /dev/null +++ b/new_language_generation/perl_tools/generate_words.pl @@ -0,0 +1,317 @@ +#!/usr/bin/perl + +use warnings; +use strict; +use Encode qw/encode decode/; +use Getopt::Long; +use Pod::Usage; +use Unicode::Normalize; +use Lingua::KO::Hangul::Util qw(:all); + +my $granularity = 'letter'; +my $input_encoding = 'utf8'; +my $split_at_spaces_only = 0; +my $help = 0; + +GetOptions('granularity=s' => \$granularity, + 'input_encoding=s' => \$input_encoding, + 'split_at_spaces_only' => \$split_at_spaces_only, + 'help|?' => \$help) or pod2usage(2); + +pod2usage(1) if $help; + +=pod + +=head1 NAME + +generate_words.pl - Generate a word list + +=head1 SYNOPSIS + +--granularity= + how big a letter is. + +--input_encoding= + incoding of input texts + +=cut + +my %words; + +binmode STDOUT, ':utf8'; + +sub read_words { + @ARGV = map { /\.(gz|Z)$/ ? "gzip -dc $_ |" : $_ } @ARGV; + + while () { + chomp; + my @words; + if ($split_at_spaces_only) { + @words = split(/ /, $_); + } else { + @words = split(/[\s\w\-\)\(\,\.\;\:\!\"\~\=\'\`\%\<\>\[\]\/\‘\’\?\#\@]/, $_); + } + + for my $word (@words) { + my $utf8_word; + $utf8_word = decode($input_encoding, $word); + + if (!defined $utf8_word || length($utf8_word) == 0) { + next; + } + + my $normalized_utf8_word = uc($utf8_word); + + my $expanded_word = expand_word($normalized_utf8_word); + + if (length $expanded_word > 0) { + $words{$expanded_word}++; + } + } + } +} + +sub expand_word { + my ($word) = @_; + my @ret; + + if ($granularity eq 'jamo') { + @ret = expand_jamo($word); + } elsif ($granularity eq 'letter') { + @ret = split //, $word; + } else { + die "Granularity '$granularity' is not recognized.\n"; + } + + return join ';', @ret; +} + +sub expand_jamo { + my ($word) = @_; + + if (length($word) == 0) { + return $word; + } + + my @syllables = split(//, $word); + my @cumulative_jamos; + + for my $syllable (@syllables) { + my $decomposed_string = decomposeSyllable($syllable); + + for my $jamo (split(//, $decomposed_string)) { + push @cumulative_jamos, normalize_jamo($jamo); + } + } + + return @cumulative_jamos; +} + +sub spit_words { + for my $word (sort keys %words) { + print "$word $words{$word}\n" if (length($word) > 1); + } +} + +sub execute { + read_words(); + spit_words(); +} + +# Following is Hangul-specific code and then an execute() call. +# Constants and ideas borrowed from Lingua::KO::Hangul::Util. + +use constant SBase => 0xAC00; +use constant SFinal => 0xD7A3; # SBase -1 + SCount +use constant SCount => 11172; # LCount * NCount +use constant NCount => 588; # VCount * TCount +use constant LBase => 0x1100; +use constant LFinal => 0x1112; +use constant LCount => 19; # scalar @JamoL +use constant VBase => 0x1161; +use constant VFinal => 0x1175; +use constant VCount => 21; # scalar @JamoV +use constant TBase => 0x11A7; +use constant TFinal => 0x11C2; +use constant TCount => 28; # scalar @JamoT +use constant JBase => 0x1100; +use constant JFinal => 0x11FF; +use constant JCount => 256; + +use constant JamoLIni => 0x1100; +use constant JamoLFin => 0x1159; +use constant JamoLFill => 0x115F; +use constant JamoVIni => 0x1160; +use constant JamoVFin => 0x11A2; +use constant JamoTIni => 0x11A8; +use constant JamoTFin => 0x11F9; + +my @JamoL = ( # Initial (HANGUL CHOSEONG) + "G", "GG", "N", "D", "DD", "R", "M", "B", "BB", + "S", "SS", "", "J", "JJ", "C", "K", "T", "P", "H", + ); + +my @JamoV = ( # Medial (HANGUL JUNGSEONG) + "A", "AE", "YA", "YAE", "EO", "E", "YEO", "YE", "O", + "WA", "WAE", "OE", "YO", "U", "WEO", "WE", "WI", + "YU", "EU", "YI", "I", + ); + +my @JamoT = ( # Final (HANGUL JONGSEONG) + "", "G", "GG", "GS", "N", "NJ", "NH", "D", "L", "LG", "LM", + "LB", "LS", "LT", "LP", "LH", "M", "B", "BS", + "S", "SS", "NG", "J", "C", "K", "T", "P", "H", + ); + +my @TtoL = ( + "", # "" + "G", # "G" + "G;G", # "GG" + "G;S", # "GS" + "N", # "N" + "N;J", # "NJ" + "N;H", # "NH" + "D", # "D" + "R", # "L" + "R;G", # "LG" + "R;M", # "LM" + "R;B", # "LB" + "R;S", # "LS" + "R;T", # "LT" + "R;P", # "LP" + "R;H", # "LH" + "M", # "M" + "B", # "B" + "B;S", # "BS" + "S", # "S" + "SS", # "SS" + "", # "NG" + "J", # "J" + "C", # "C" + "K", # "K" + "T", # "T" + "P", # "P" + "H", # "H" +); + +my(%CodeL, %CodeV, %CodeT); +@CodeL{@JamoL} = 0 .. LCount-1; +@CodeV{@JamoV} = 0 .. VCount-1; +@CodeT{@JamoT} = 0 .. TCount-1; + +# Normalizes jamo then inserts fillers + +sub normalize_jamo { + my ($jamo) = @_; + + my @homogenized = convert_trailing_to_leading($jamo); + + my @homogenized_filled; + for my $homogenous_jamo_codepoint (@homogenized) { + my $filled = fill_jamo($homogenous_jamo_codepoint); + push @homogenized_filled, $filled; + } + + return @homogenized_filled; +} + +# Takes a L, V, or T jamo codepoint, fills in the rest, +# and returns a unicode string. +sub fill_jamo { + my ($codepoint) = @_; + my $syllable_type = getSyllableType($codepoint); + + # technique = jamo_alone, raw_jamo, romanize, jamo_letter, or crazy + my $technique = 'jamo_letter'; + + if ($technique eq 'raw_jamo') { + return '|' . pack('U*', $codepoint) . '|'; + } + + if ($technique eq 'jamo_alone') { + if ($syllable_type eq 'T') { + return "\x{25CB}"; + } + return insertFiller(pack('U*', $codepoint)); + } + + if ($technique eq 'romanize') { + return romanize_codepoint($codepoint); + } + + my $leading; + my $vowel; + my $trailing = 0; + + if ($syllable_type eq 'L') { + $leading = $codepoint - JamoLIni; + $vowel = $CodeV{'A'}; + } elsif ($syllable_type eq 'V') { + $leading = $CodeL{''}; + $vowel = $codepoint - JamoVIni - 1; + } elsif ($syllable_type eq 'T') { + # Always the circle. + $leading = $CodeL{''}; + $vowel = $CodeV{'A'}; + $trailing = $CodeT{'NG'}; + } else { + return pack('U*', $codepoint); + } + + return pack('U*', SBase + $leading * NCount + $vowel * TCount + $trailing); +} + +sub romanize_codepoint { + my ($codepoint) = @_; + + my $roman; + + my $syllable_type = getSyllableType($codepoint); + if ($syllable_type eq 'L') { + $roman = $JamoL[$codepoint - JamoLIni]; + } elsif ($syllable_type eq 'V') { + $roman = $JamoV[$codepoint - JamoVIni - 1]; + } elsif ($syllable_type eq 'T') { + $roman = "NG"; + } else { + return pack('U*', $codepoint); + } + + return "|$roman|"; +} + +# Converts a trailing consonant to a string of leading ones. +# Returns an array of codepoints. +sub convert_trailing_to_leading { + my ($jamo) = @_; + my @ret_codepoints = unpack('U*', $jamo); + + die 'normalize_jamo called with more than one codepoint' if (@ret_codepoints > 1); + my $codepoint = $ret_codepoints[0]; + + # Elide placeholder consonant. + if (@ret_codepoints == 1 && $ret_codepoints[0] == JamoLIni + $CodeL{''}) { + return (); + } + + if (getSyllableType($codepoint) eq 'T') { + my $Tindex = $codepoint - JamoTIni + 1; + + my @leaders = split /;/, $TtoL[$Tindex]; + push @leaders, '' if (@leaders == 0); # '' gets lost otherwise + + @ret_codepoints = (); + for my $leader (@leaders) { + my $new_codepoint = JamoLIni + $CodeL{$leader}; + push @ret_codepoints, $new_codepoint; + } + } + + if (@ret_codepoints == 1 && $ret_codepoints[0] == JamoLIni + $CodeL{''}) { + $ret_codepoints[0] = JamoTIni + $CodeT{'NG'} - 1; + } + + return @ret_codepoints; +} + +execute(); -- cgit v1.2.3