summaryrefslogtreecommitdiff
path: root/new_language_generation
diff options
context:
space:
mode:
Diffstat (limited to 'new_language_generation')
-rwxr-xr-xnew_language_generation/perl_tools/analyse_gcgs.pl56
-rwxr-xr-xnew_language_generation/perl_tools/generate_alphabet.pl281
-rwxr-xr-xnew_language_generation/perl_tools/generate_distribution.pl57
-rwxr-xr-xnew_language_generation/perl_tools/generate_raw_words.pl23
-rwxr-xr-xnew_language_generation/perl_tools/generate_tileset.pl103
-rwxr-xr-xnew_language_generation/perl_tools/generate_words.pl317
6 files changed, 837 insertions, 0 deletions
diff --git a/new_language_generation/perl_tools/analyse_gcgs.pl b/new_language_generation/perl_tools/analyse_gcgs.pl
new file mode 100755
index 0000000..94e7cce
--- /dev/null
+++ b/new_language_generation/perl_tools/analyse_gcgs.pl
@@ -0,0 +1,56 @@
+#!/usr/bin/perl
+
+# USAGE: analyse_gcgs.pl *.gcg > gcganalysis
+
+use warnings;
+use strict;
+
+my %letter_points;
+my %letter_counts;
+
+binmode STDOUT, ':utf8';
+
+sub read_gcgs {
+ my $games_count = 0;
+ for my $arg (@ARGV) {
+ open (my $input, "<:encoding(utf8)", $arg);
+ ++$games_count;
+
+ while (<$input>) {
+ chomp;
+ next if (!/^\>/);
+
+ my ($player, $rack, $position, $play, $score, $cumulativescore) = split /\s+/, $_;
+
+ next if (! defined $cumulativescore);
+ if ($play =~ /\[/) {
+ print "blank play! $play for $score\n";
+ next;
+ }
+
+ for my $letter (split /|/, $play) {
+ ++$letter_counts{$letter};
+ $letter_points{$letter} += int($score);
+
+ #print "$letter now seen " . $letter_counts{$letter} . " times, scored " . $letter_points{$letter} . " points\n";
+ }
+ }
+ }
+
+ print "# Read $games_count games.\n";
+}
+
+sub spit_analysis {
+ my %points_per_turn;
+ for my $letter (keys %letter_counts) {
+ $points_per_turn{$letter} = $letter_points{$letter} / $letter_counts{$letter};
+ }
+
+ for my $letter (sort { $points_per_turn{$b} <=> $points_per_turn{$a} } keys %points_per_turn) {
+ # TODO lame that this always puts out un-bar-surrounded letters
+ print "$letter " . $points_per_turn{$letter} . "\n";
+ }
+}
+
+read_gcgs();
+spit_analysis();
diff --git a/new_language_generation/perl_tools/generate_alphabet.pl b/new_language_generation/perl_tools/generate_alphabet.pl
new file mode 100755
index 0000000..31006a7
--- /dev/null
+++ b/new_language_generation/perl_tools/generate_alphabet.pl
@@ -0,0 +1,281 @@
+#!/usr/bin/perl
+
+use warnings;
+use strict;
+
+use Getopt::Long;
+use Pod::Usage;
+
+use Lingua::KO::Hangul::Util qw(:all);
+use POSIX qw(ceil floor);
+
+my $distribution_filename = 'undef';
+my $analysis_filename = 'undef';
+my $count_assigner_filename = 'undef';
+
+my $counts_only = 0;
+
+my $number_of_distinct_tiles = 30;
+my $number_of_blanks = 2;
+
+my $help = 0;
+
+GetOptions('distribution=s' => \$distribution_filename,
+ 'analysis=s' => \$analysis_filename,
+ 'count_assigner=s' => \$count_assigner_filename,
+ 'number_of_distinct_tiles=i' => \$number_of_distinct_tiles,
+ 'number_of_blanks=i' => \$number_of_blanks,
+ 'counts_only' => \$counts_only,
+ 'help|?' => \$help) or pod2usage(2);
+
+pod2usage(1) if $help;
+
+=pod
+
+=head1 NAME
+
+generate_alphabet.pl - Print out quackle alphabet file
+
+=head1 SYNOPSIS
+
+--distribution=<file>
+ output of generate_distribution.pl
+
+--analysis=<file>
+ output of analyse_gcgs.pl
+
+--count_assigner=<file>
+ file with one number per line. Number on nth line is how many
+ tiles are in bag of letter that appears nth least often.
+
+--number_of_distinct_tiles=[1, 30]
+ how many tiles are in the alphabet (should be same as length
+ of count assigner file).
+
+--number_of_blanks=<nonnegative integer>
+ how many scoreless blanks are in the bag.
+
+--counts_only
+ only output the counts of letters, in same format as
+ count_assigner input
+
+=cut
+
+my @count_assigner = ();
+my $has_assigned_counts = 0;
+
+my %character_counts;
+my $count_sum = 0;
+my $inverse_count_sum = 0;
+
+my %character_points_per_appearance;
+my $lowest_points_per_appearance = -1;
+my $highest_points_per_appearance = 0;
+my $maximum_value = 10;
+
+binmode STDOUT, ':utf8';
+
+sub read_count_assigner {
+ if ($count_assigner_filename eq 'undef') {
+ $has_assigned_counts = 0;
+ return;
+ }
+
+ if (!(-e $count_assigner_filename)) {
+ print STDERR "Count assigner $count_assigner_filename does not exist; ignoring.\n";
+ $has_assigned_counts = 0;
+ return;
+ }
+
+ $has_assigned_counts = 1;
+
+ open (my $input, "<:encoding(utf8)", $count_assigner_filename);
+
+ my $i = 0;
+ while (<$input>) {
+ chomp;
+ next if (/^\#/);
+
+ my ($count, @rest) = split /\s/, $_;
+ $count_assigner[$i] = $count;
+ ++$i;
+ }
+}
+
+sub read_analysis {
+ if ($analysis_filename eq 'undef') {
+ return;
+ }
+
+ if (!(-e $analysis_filename)) {
+ print STDERR "Analysis $analysis_filename does not exist; ignoring.\n";
+ return;
+ }
+
+ open (my $input, "<:encoding(utf8)", $analysis_filename);
+
+ my $i = 0;
+ while (<$input>) {
+ chomp;
+ next if (/^\#/);
+
+ my ($letter, $points_per_appearance) = split /\s/, $_;
+ next if (! defined $points_per_appearance);
+
+ next if ($letter =~ /\[/);
+ next if ($letter =~ /\]/);
+
+ $letter =~ s/\|//g;
+
+ $character_points_per_appearance{$letter} = $points_per_appearance;
+ }
+
+ calculate_bounding_points_per_appearance_values();
+}
+
+sub calculate_bounding_points_per_appearance_values {
+ my $i = 0;
+ for my $character (sort { $character_counts{$a} <=> $character_counts{$b} } keys %character_counts) {
+ if (exists $character_counts{$character} && count_of_character($i, $character) > 0) {
+ my $points_per_appearance = $character_points_per_appearance{$character};
+ if ($lowest_points_per_appearance < 0 || $points_per_appearance < $lowest_points_per_appearance) {
+ $lowest_points_per_appearance = $points_per_appearance;
+ }
+
+ if ($points_per_appearance > $highest_points_per_appearance) {
+ $highest_points_per_appearance = $points_per_appearance;
+ }
+ }
+
+ ++$i;
+ }
+}
+
+sub read_characters {
+ open (my $input, "<:encoding(utf8)", $distribution_filename);
+
+ my $i = 0;
+ while (<$input>) {
+ last if ($i >= $number_of_distinct_tiles);
+
+ chomp;
+ my ($letter, $count) = split(/\s/, $_);
+
+ $character_counts{$letter} = $count;
+ $count_sum += $count;
+ $inverse_count_sum += exp(-.0005*$count);
+ ++$i;
+ }
+}
+
+sub max {
+ return $_[0] > $_[1]? $_[0] : $_[1];
+}
+
+sub value_of_character {
+ my ($letter) = @_;
+ $letter =~ s/\|//g;
+
+ if (exists $character_points_per_appearance{$letter}) {
+ my $value_multiplier = $maximum_value / ($highest_points_per_appearance - $lowest_points_per_appearance);
+
+ return max(floor(($highest_points_per_appearance - $character_points_per_appearance{$letter}) * $value_multiplier), 1);
+ }
+
+ # This was written VERY late at night.
+ #print "computing value of $letter; count_sum=$count_sum; my count=$character_counts{$letter}\n";
+ return max(floor(187 * exp(-.0005*$character_counts{$letter}) / $inverse_count_sum), 1);
+}
+
+sub hallucinate_assigned_counts {
+ my $total_rounded_count = $number_of_blanks;
+ my $i = 0;
+ for my $character (sort { $character_counts{$a} <=> $character_counts{$b} } keys %character_counts) {
+ my $original_count = $character_counts{$character};
+ my $rounded_count;
+
+ my $truth = 100 * $original_count / $count_sum;
+ if ($truth < 0.1) {
+ $rounded_count = 0;
+ } else {
+ $rounded_count = max(floor($truth), 1);
+ }
+
+ $total_rounded_count += $rounded_count;
+ $count_assigner[$i] = $rounded_count;
+
+ ++$i;
+ }
+
+ $has_assigned_counts = 1;
+}
+
+sub count_of_character {
+ my ($index, $count) = @_;
+
+ if ($has_assigned_counts && $index < @count_assigner) {
+ return $count_assigner[$index]
+ }
+
+ return 0;
+}
+
+sub spit_characters {
+ if ($counts_only) {
+ my $i = 0;
+ for my $character (sort { $character_counts{$a} <=> $character_counts{$b} } keys %character_counts) {
+ my $count = count_of_character($i, $character_counts{$character});
+ print "$count #" . ($i + 1) . ": $character\n";
+ ++$i;
+ }
+ return;
+ }
+
+ my $total_score = 0;
+ my $total_count = $number_of_blanks;
+ print "blank 0 $number_of_blanks\n";
+
+ print "#char\tblank\tscore\tcount\tvowel\n";
+ my $i = 0;
+ for my $character (sort { $character_counts{$a} <=> $character_counts{$b} } keys %character_counts) {
+ my $blank_text = lc($character);
+
+ # TODO lower-case blanks currently broken.
+ my $disable_lower_case_blanks = 1;
+ if ($disable_lower_case_blanks || $blank_text eq $character) {
+ $blank_text = '['.$character.']';
+ $blank_text =~ s/\|//g;
+ }
+
+ my $score = value_of_character($character);
+ my $count = count_of_character($i, $character_counts{$character});
+ $total_count += $count;
+ $total_score += $score;
+
+ my $is_vowel = '0';
+
+ # TODO make work for unfilled jamo
+ if (isStandardForm($character)) {
+ my @codepoints = unpack('U*', $character);
+ if ($codepoints[0] == 0x115F) {
+ $is_vowel = '1';
+ }
+ }
+
+ ++$i;
+
+ print "$character\t$blank_text\t$score\t$count\t$is_vowel\n";
+ }
+
+ print "# Total count: $total_count\n";
+ print "# Total score: $total_score\n";
+}
+
+read_characters();
+read_count_assigner();
+if (!$has_assigned_counts) {
+ hallucinate_assigned_counts();
+}
+
+read_analysis();
+spit_characters();
diff --git a/new_language_generation/perl_tools/generate_distribution.pl b/new_language_generation/perl_tools/generate_distribution.pl
new file mode 100755
index 0000000..e86e04a
--- /dev/null
+++ b/new_language_generation/perl_tools/generate_distribution.pl
@@ -0,0 +1,57 @@
+#!/usr/bin/perl
+
+# USAGE: generate_distribution.pl word_list > character_list
+
+use warnings;
+use strict;
+
+use Getopt::Long;
+use Pod::Usage;
+
+my $help = 0;
+
+GetOptions(
+ 'help|?' => \$help) or pod2usage(2);
+
+pod2usage(1) if $help;
+
+=pod
+
+=head1 NAME
+
+generate_distribution.pl - Generate letter distribution data
+
+=head1 SYNOPSIS
+
+One argument: the output of generate_words.pl.
+
+=cut
+
+binmode STDOUT, ':utf8';
+
+my %characters;
+
+sub read_characters {
+ for my $arg (@ARGV) {
+ open (my $input, "<:encoding(utf8)", $arg);
+
+ while (<$input>) {
+ chomp;
+ my ($word, $count) = split(/\s/, $_);
+
+ for my $character (split(/;/, $word)) {
+ $characters{$character} += $count;
+ }
+ }
+ }
+}
+
+sub spit_characters {
+ my @sorted_characters = sort { $characters{$b} <=> $characters{$a} } keys %characters;
+ for my $character (@sorted_characters) {
+ print "$character $characters{$character}\n";
+ }
+}
+
+read_characters();
+spit_characters();
diff --git a/new_language_generation/perl_tools/generate_raw_words.pl b/new_language_generation/perl_tools/generate_raw_words.pl
new file mode 100755
index 0000000..b32d810
--- /dev/null
+++ b/new_language_generation/perl_tools/generate_raw_words.pl
@@ -0,0 +1,23 @@
+#!/usr/bin/perl
+
+# USAGE: generate_raw_words.pl generate_words_output
+
+binmode STDOUT, ':utf8';
+
+sub read_and_spit {
+ for my $arg (@ARGV) {
+ open (my $input, "<:encoding(utf8)", $arg);
+
+ while (<$input>) {
+ chomp;
+ my ($word, $count) = split(/\s/, $_);
+
+ my $clean_word = $word;
+ $clean_word =~ s/;//g;
+
+ print "$clean_word\n";
+ }
+ }
+}
+
+read_and_spit();
diff --git a/new_language_generation/perl_tools/generate_tileset.pl b/new_language_generation/perl_tools/generate_tileset.pl
new file mode 100755
index 0000000..0df2e3b
--- /dev/null
+++ b/new_language_generation/perl_tools/generate_tileset.pl
@@ -0,0 +1,103 @@
+#!/usr/bin/perl
+
+use warnings;
+use strict;
+
+use Getopt::Long;
+use Pod::Usage;
+
+my $alphabet_filename = 'undef';
+my $help = 0;
+
+GetOptions('alphabet=s' => \$alphabet_filename,
+ 'help|?' => \$help) or pod2usage(2);
+
+pod2usage(1) if $help;
+
+=pod
+
+=head1 NAME
+
+generate_tileset.pl - Print out HTML tileset
+
+=head1 SYNOPSIS
+
+--alphabet=<file>
+ output of generate_alphabet.pl
+
+=cut
+
+binmode STDOUT, ':utf8';
+
+# tiles{count}{letter} = score
+my %tiles;
+
+sub read_alphabet {
+ if ($alphabet_filename eq 'undef') {
+ return;
+ }
+
+ open (my $input, "<:encoding(utf8)", $alphabet_filename);
+
+ my $i = 0;
+ while (<$input>) {
+ chomp;
+ next if (/^\#/);
+
+ my ($letter, $blank_text, $score, $count) = split /\s/, $_;
+
+ if ($letter eq 'blank') {
+ $tiles{$score}{"&#12288;"} = 0;
+ next;
+ }
+
+ next if (! defined $count);
+
+ $letter =~ s/\|//g;
+ $tiles{$count}{$letter} = $score;
+ }
+}
+
+sub spit_characters {
+ my $fontsize = "1.2cm";
+ my $score_fontsize = "18pt";
+ #my $color="#458B74";
+ #my $color="black";
+ #my $color="#EAC117";
+ my $color="darkblue";
+ my $width = "5in";
+ my $height = "0";
+
+ print "<html>\n<head>\n<meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\">\n</head>\n<body bgcolor=white>\n";
+
+ print "<table cellspacing=2 cellpadding=0>\n";
+
+ print "<tr>";
+ my $already_in_row = 0;
+ for my $count (sort keys %tiles) {
+ for my $letter (sort keys %{$tiles{$count}}) {
+ for my $i (1 .. $count) {
+ if ($already_in_row >= 10) {
+ print "</tr><tr>\n";
+ $already_in_row = 0;
+ }
+
+ my $score = $tiles{$count}{$letter};
+ my $align = 'center';
+ print "<td width=$width height=$height align=$align><nobr>\n";
+ print "<span style=\"font-size: $fontsize; color: $color\">$letter</span>";
+ print "<span style=\"font-size: $score_fontsize; color: $color\"><sub>$score</sub></span>";
+ print "</nobr></td>\n";
+
+ ++$already_in_row;
+ }
+ }
+ }
+ print "</tr>\n";
+
+ print "</table>\n";
+ print "</body>\n</html>\n";
+}
+
+read_alphabet();
+spit_characters();
diff --git a/new_language_generation/perl_tools/generate_words.pl b/new_language_generation/perl_tools/generate_words.pl
new file mode 100755
index 0000000..858d978
--- /dev/null
+++ b/new_language_generation/perl_tools/generate_words.pl
@@ -0,0 +1,317 @@
+#!/usr/bin/perl
+
+use warnings;
+use strict;
+use Encode qw/encode decode/;
+use Getopt::Long;
+use Pod::Usage;
+use Unicode::Normalize;
+use Lingua::KO::Hangul::Util qw(:all);
+
+my $granularity = 'letter';
+my $input_encoding = 'utf8';
+my $split_at_spaces_only = 0;
+my $help = 0;
+
+GetOptions('granularity=s' => \$granularity,
+ 'input_encoding=s' => \$input_encoding,
+ 'split_at_spaces_only' => \$split_at_spaces_only,
+ 'help|?' => \$help) or pod2usage(2);
+
+pod2usage(1) if $help;
+
+=pod
+
+=head1 NAME
+
+generate_words.pl - Generate a word list
+
+=head1 SYNOPSIS
+
+--granularity=<letter|jamo>
+ how big a letter is.
+
+--input_encoding=<utf8|euc-kr>
+ incoding of input texts
+
+=cut
+
+my %words;
+
+binmode STDOUT, ':utf8';
+
+sub read_words {
+ @ARGV = map { /\.(gz|Z)$/ ? "gzip -dc $_ |" : $_ } @ARGV;
+
+ while (<ARGV>) {
+ chomp;
+ my @words;
+ if ($split_at_spaces_only) {
+ @words = split(/ /, $_);
+ } else {
+ @words = split(/[\s\w\-\)\(\,\.\;\:\!\"\~\=\'\`\%\<\>\[\]\/\‘\’\?\#\@]/, $_);
+ }
+
+ for my $word (@words) {
+ my $utf8_word;
+ $utf8_word = decode($input_encoding, $word);
+
+ if (!defined $utf8_word || length($utf8_word) == 0) {
+ next;
+ }
+
+ my $normalized_utf8_word = uc($utf8_word);
+
+ my $expanded_word = expand_word($normalized_utf8_word);
+
+ if (length $expanded_word > 0) {
+ $words{$expanded_word}++;
+ }
+ }
+ }
+}
+
+sub expand_word {
+ my ($word) = @_;
+ my @ret;
+
+ if ($granularity eq 'jamo') {
+ @ret = expand_jamo($word);
+ } elsif ($granularity eq 'letter') {
+ @ret = split //, $word;
+ } else {
+ die "Granularity '$granularity' is not recognized.\n";
+ }
+
+ return join ';', @ret;
+}
+
+sub expand_jamo {
+ my ($word) = @_;
+
+ if (length($word) == 0) {
+ return $word;
+ }
+
+ my @syllables = split(//, $word);
+ my @cumulative_jamos;
+
+ for my $syllable (@syllables) {
+ my $decomposed_string = decomposeSyllable($syllable);
+
+ for my $jamo (split(//, $decomposed_string)) {
+ push @cumulative_jamos, normalize_jamo($jamo);
+ }
+ }
+
+ return @cumulative_jamos;
+}
+
+sub spit_words {
+ for my $word (sort keys %words) {
+ print "$word $words{$word}\n" if (length($word) > 1);
+ }
+}
+
+sub execute {
+ read_words();
+ spit_words();
+}
+
+# Following is Hangul-specific code and then an execute() call.
+# Constants and ideas borrowed from Lingua::KO::Hangul::Util.
+
+use constant SBase => 0xAC00;
+use constant SFinal => 0xD7A3; # SBase -1 + SCount
+use constant SCount => 11172; # LCount * NCount
+use constant NCount => 588; # VCount * TCount
+use constant LBase => 0x1100;
+use constant LFinal => 0x1112;
+use constant LCount => 19; # scalar @JamoL
+use constant VBase => 0x1161;
+use constant VFinal => 0x1175;
+use constant VCount => 21; # scalar @JamoV
+use constant TBase => 0x11A7;
+use constant TFinal => 0x11C2;
+use constant TCount => 28; # scalar @JamoT
+use constant JBase => 0x1100;
+use constant JFinal => 0x11FF;
+use constant JCount => 256;
+
+use constant JamoLIni => 0x1100;
+use constant JamoLFin => 0x1159;
+use constant JamoLFill => 0x115F;
+use constant JamoVIni => 0x1160;
+use constant JamoVFin => 0x11A2;
+use constant JamoTIni => 0x11A8;
+use constant JamoTFin => 0x11F9;
+
+my @JamoL = ( # Initial (HANGUL CHOSEONG)
+ "G", "GG", "N", "D", "DD", "R", "M", "B", "BB",
+ "S", "SS", "", "J", "JJ", "C", "K", "T", "P", "H",
+ );
+
+my @JamoV = ( # Medial (HANGUL JUNGSEONG)
+ "A", "AE", "YA", "YAE", "EO", "E", "YEO", "YE", "O",
+ "WA", "WAE", "OE", "YO", "U", "WEO", "WE", "WI",
+ "YU", "EU", "YI", "I",
+ );
+
+my @JamoT = ( # Final (HANGUL JONGSEONG)
+ "", "G", "GG", "GS", "N", "NJ", "NH", "D", "L", "LG", "LM",
+ "LB", "LS", "LT", "LP", "LH", "M", "B", "BS",
+ "S", "SS", "NG", "J", "C", "K", "T", "P", "H",
+ );
+
+my @TtoL = (
+ "", # ""
+ "G", # "G"
+ "G;G", # "GG"
+ "G;S", # "GS"
+ "N", # "N"
+ "N;J", # "NJ"
+ "N;H", # "NH"
+ "D", # "D"
+ "R", # "L"
+ "R;G", # "LG"
+ "R;M", # "LM"
+ "R;B", # "LB"
+ "R;S", # "LS"
+ "R;T", # "LT"
+ "R;P", # "LP"
+ "R;H", # "LH"
+ "M", # "M"
+ "B", # "B"
+ "B;S", # "BS"
+ "S", # "S"
+ "SS", # "SS"
+ "", # "NG"
+ "J", # "J"
+ "C", # "C"
+ "K", # "K"
+ "T", # "T"
+ "P", # "P"
+ "H", # "H"
+);
+
+my(%CodeL, %CodeV, %CodeT);
+@CodeL{@JamoL} = 0 .. LCount-1;
+@CodeV{@JamoV} = 0 .. VCount-1;
+@CodeT{@JamoT} = 0 .. TCount-1;
+
+# Normalizes jamo then inserts fillers
+
+sub normalize_jamo {
+ my ($jamo) = @_;
+
+ my @homogenized = convert_trailing_to_leading($jamo);
+
+ my @homogenized_filled;
+ for my $homogenous_jamo_codepoint (@homogenized) {
+ my $filled = fill_jamo($homogenous_jamo_codepoint);
+ push @homogenized_filled, $filled;
+ }
+
+ return @homogenized_filled;
+}
+
+# Takes a L, V, or T jamo codepoint, fills in the rest,
+# and returns a unicode string.
+sub fill_jamo {
+ my ($codepoint) = @_;
+ my $syllable_type = getSyllableType($codepoint);
+
+ # technique = jamo_alone, raw_jamo, romanize, jamo_letter, or crazy
+ my $technique = 'jamo_letter';
+
+ if ($technique eq 'raw_jamo') {
+ return '|' . pack('U*', $codepoint) . '|';
+ }
+
+ if ($technique eq 'jamo_alone') {
+ if ($syllable_type eq 'T') {
+ return "\x{25CB}";
+ }
+ return insertFiller(pack('U*', $codepoint));
+ }
+
+ if ($technique eq 'romanize') {
+ return romanize_codepoint($codepoint);
+ }
+
+ my $leading;
+ my $vowel;
+ my $trailing = 0;
+
+ if ($syllable_type eq 'L') {
+ $leading = $codepoint - JamoLIni;
+ $vowel = $CodeV{'A'};
+ } elsif ($syllable_type eq 'V') {
+ $leading = $CodeL{''};
+ $vowel = $codepoint - JamoVIni - 1;
+ } elsif ($syllable_type eq 'T') {
+ # Always the circle.
+ $leading = $CodeL{''};
+ $vowel = $CodeV{'A'};
+ $trailing = $CodeT{'NG'};
+ } else {
+ return pack('U*', $codepoint);
+ }
+
+ return pack('U*', SBase + $leading * NCount + $vowel * TCount + $trailing);
+}
+
+sub romanize_codepoint {
+ my ($codepoint) = @_;
+
+ my $roman;
+
+ my $syllable_type = getSyllableType($codepoint);
+ if ($syllable_type eq 'L') {
+ $roman = $JamoL[$codepoint - JamoLIni];
+ } elsif ($syllable_type eq 'V') {
+ $roman = $JamoV[$codepoint - JamoVIni - 1];
+ } elsif ($syllable_type eq 'T') {
+ $roman = "NG";
+ } else {
+ return pack('U*', $codepoint);
+ }
+
+ return "|$roman|";
+}
+
+# Converts a trailing consonant to a string of leading ones.
+# Returns an array of codepoints.
+sub convert_trailing_to_leading {
+ my ($jamo) = @_;
+ my @ret_codepoints = unpack('U*', $jamo);
+
+ die 'normalize_jamo called with more than one codepoint' if (@ret_codepoints > 1);
+ my $codepoint = $ret_codepoints[0];
+
+ # Elide placeholder consonant.
+ if (@ret_codepoints == 1 && $ret_codepoints[0] == JamoLIni + $CodeL{''}) {
+ return ();
+ }
+
+ if (getSyllableType($codepoint) eq 'T') {
+ my $Tindex = $codepoint - JamoTIni + 1;
+
+ my @leaders = split /;/, $TtoL[$Tindex];
+ push @leaders, '' if (@leaders == 0); # '' gets lost otherwise
+
+ @ret_codepoints = ();
+ for my $leader (@leaders) {
+ my $new_codepoint = JamoLIni + $CodeL{$leader};
+ push @ret_codepoints, $new_codepoint;
+ }
+ }
+
+ if (@ret_codepoints == 1 && $ret_codepoints[0] == JamoLIni + $CodeL{''}) {
+ $ret_codepoints[0] = JamoTIni + $CodeT{'NG'} - 1;
+ }
+
+ return @ret_codepoints;
+}
+
+execute();