summaryrefslogtreecommitdiff
path: root/new_language_generation/perl_tools/generate_words.pl
diff options
context:
space:
mode:
authorJason Katz-Brown <jason@airbnb.com>2013-08-25 02:17:13 -0700
committerJason Katz-Brown <jason@airbnb.com>2013-08-25 02:17:13 -0700
commit9306cb60c32082c5403931de0823a9fd5daa196c (patch)
treeca1b6eb695fdf3f0c2294e92416b272164bae642 /new_language_generation/perl_tools/generate_words.pl
parent8fb2c681cecc01b46b0f4ba02d5cc177c4747b1c (diff)
Initial git commit.
Diffstat (limited to 'new_language_generation/perl_tools/generate_words.pl')
-rwxr-xr-xnew_language_generation/perl_tools/generate_words.pl317
1 files changed, 317 insertions, 0 deletions
diff --git a/new_language_generation/perl_tools/generate_words.pl b/new_language_generation/perl_tools/generate_words.pl
new file mode 100755
index 0000000..858d978
--- /dev/null
+++ b/new_language_generation/perl_tools/generate_words.pl
@@ -0,0 +1,317 @@
+#!/usr/bin/perl
+
+use warnings;
+use strict;
+use Encode qw/encode decode/;
+use Getopt::Long;
+use Pod::Usage;
+use Unicode::Normalize;
+use Lingua::KO::Hangul::Util qw(:all);
+
+my $granularity = 'letter';
+my $input_encoding = 'utf8';
+my $split_at_spaces_only = 0;
+my $help = 0;
+
+GetOptions('granularity=s' => \$granularity,
+ 'input_encoding=s' => \$input_encoding,
+ 'split_at_spaces_only' => \$split_at_spaces_only,
+ 'help|?' => \$help) or pod2usage(2);
+
+pod2usage(1) if $help;
+
+=pod
+
+=head1 NAME
+
+generate_words.pl - Generate a word list
+
+=head1 SYNOPSIS
+
+--granularity=<letter|jamo>
+ how big a letter is.
+
+--input_encoding=<utf8|euc-kr>
+ incoding of input texts
+
+=cut
+
+my %words;
+
+binmode STDOUT, ':utf8';
+
+sub read_words {
+ @ARGV = map { /\.(gz|Z)$/ ? "gzip -dc $_ |" : $_ } @ARGV;
+
+ while (<ARGV>) {
+ chomp;
+ my @words;
+ if ($split_at_spaces_only) {
+ @words = split(/ /, $_);
+ } else {
+ @words = split(/[\s\w\-\)\(\,\.\;\:\!\"\~\=\'\`\%\<\>\[\]\/\‘\’\?\#\@]/, $_);
+ }
+
+ for my $word (@words) {
+ my $utf8_word;
+ $utf8_word = decode($input_encoding, $word);
+
+ if (!defined $utf8_word || length($utf8_word) == 0) {
+ next;
+ }
+
+ my $normalized_utf8_word = uc($utf8_word);
+
+ my $expanded_word = expand_word($normalized_utf8_word);
+
+ if (length $expanded_word > 0) {
+ $words{$expanded_word}++;
+ }
+ }
+ }
+}
+
+sub expand_word {
+ my ($word) = @_;
+ my @ret;
+
+ if ($granularity eq 'jamo') {
+ @ret = expand_jamo($word);
+ } elsif ($granularity eq 'letter') {
+ @ret = split //, $word;
+ } else {
+ die "Granularity '$granularity' is not recognized.\n";
+ }
+
+ return join ';', @ret;
+}
+
+sub expand_jamo {
+ my ($word) = @_;
+
+ if (length($word) == 0) {
+ return $word;
+ }
+
+ my @syllables = split(//, $word);
+ my @cumulative_jamos;
+
+ for my $syllable (@syllables) {
+ my $decomposed_string = decomposeSyllable($syllable);
+
+ for my $jamo (split(//, $decomposed_string)) {
+ push @cumulative_jamos, normalize_jamo($jamo);
+ }
+ }
+
+ return @cumulative_jamos;
+}
+
+sub spit_words {
+ for my $word (sort keys %words) {
+ print "$word $words{$word}\n" if (length($word) > 1);
+ }
+}
+
+sub execute {
+ read_words();
+ spit_words();
+}
+
+# Following is Hangul-specific code and then an execute() call.
+# Constants and ideas borrowed from Lingua::KO::Hangul::Util.
+
+use constant SBase => 0xAC00;
+use constant SFinal => 0xD7A3; # SBase -1 + SCount
+use constant SCount => 11172; # LCount * NCount
+use constant NCount => 588; # VCount * TCount
+use constant LBase => 0x1100;
+use constant LFinal => 0x1112;
+use constant LCount => 19; # scalar @JamoL
+use constant VBase => 0x1161;
+use constant VFinal => 0x1175;
+use constant VCount => 21; # scalar @JamoV
+use constant TBase => 0x11A7;
+use constant TFinal => 0x11C2;
+use constant TCount => 28; # scalar @JamoT
+use constant JBase => 0x1100;
+use constant JFinal => 0x11FF;
+use constant JCount => 256;
+
+use constant JamoLIni => 0x1100;
+use constant JamoLFin => 0x1159;
+use constant JamoLFill => 0x115F;
+use constant JamoVIni => 0x1160;
+use constant JamoVFin => 0x11A2;
+use constant JamoTIni => 0x11A8;
+use constant JamoTFin => 0x11F9;
+
+my @JamoL = ( # Initial (HANGUL CHOSEONG)
+ "G", "GG", "N", "D", "DD", "R", "M", "B", "BB",
+ "S", "SS", "", "J", "JJ", "C", "K", "T", "P", "H",
+ );
+
+my @JamoV = ( # Medial (HANGUL JUNGSEONG)
+ "A", "AE", "YA", "YAE", "EO", "E", "YEO", "YE", "O",
+ "WA", "WAE", "OE", "YO", "U", "WEO", "WE", "WI",
+ "YU", "EU", "YI", "I",
+ );
+
+my @JamoT = ( # Final (HANGUL JONGSEONG)
+ "", "G", "GG", "GS", "N", "NJ", "NH", "D", "L", "LG", "LM",
+ "LB", "LS", "LT", "LP", "LH", "M", "B", "BS",
+ "S", "SS", "NG", "J", "C", "K", "T", "P", "H",
+ );
+
+my @TtoL = (
+ "", # ""
+ "G", # "G"
+ "G;G", # "GG"
+ "G;S", # "GS"
+ "N", # "N"
+ "N;J", # "NJ"
+ "N;H", # "NH"
+ "D", # "D"
+ "R", # "L"
+ "R;G", # "LG"
+ "R;M", # "LM"
+ "R;B", # "LB"
+ "R;S", # "LS"
+ "R;T", # "LT"
+ "R;P", # "LP"
+ "R;H", # "LH"
+ "M", # "M"
+ "B", # "B"
+ "B;S", # "BS"
+ "S", # "S"
+ "SS", # "SS"
+ "", # "NG"
+ "J", # "J"
+ "C", # "C"
+ "K", # "K"
+ "T", # "T"
+ "P", # "P"
+ "H", # "H"
+);
+
+my(%CodeL, %CodeV, %CodeT);
+@CodeL{@JamoL} = 0 .. LCount-1;
+@CodeV{@JamoV} = 0 .. VCount-1;
+@CodeT{@JamoT} = 0 .. TCount-1;
+
+# Normalizes jamo then inserts fillers
+
+sub normalize_jamo {
+ my ($jamo) = @_;
+
+ my @homogenized = convert_trailing_to_leading($jamo);
+
+ my @homogenized_filled;
+ for my $homogenous_jamo_codepoint (@homogenized) {
+ my $filled = fill_jamo($homogenous_jamo_codepoint);
+ push @homogenized_filled, $filled;
+ }
+
+ return @homogenized_filled;
+}
+
+# Takes a L, V, or T jamo codepoint, fills in the rest,
+# and returns a unicode string.
+sub fill_jamo {
+ my ($codepoint) = @_;
+ my $syllable_type = getSyllableType($codepoint);
+
+ # technique = jamo_alone, raw_jamo, romanize, jamo_letter, or crazy
+ my $technique = 'jamo_letter';
+
+ if ($technique eq 'raw_jamo') {
+ return '|' . pack('U*', $codepoint) . '|';
+ }
+
+ if ($technique eq 'jamo_alone') {
+ if ($syllable_type eq 'T') {
+ return "\x{25CB}";
+ }
+ return insertFiller(pack('U*', $codepoint));
+ }
+
+ if ($technique eq 'romanize') {
+ return romanize_codepoint($codepoint);
+ }
+
+ my $leading;
+ my $vowel;
+ my $trailing = 0;
+
+ if ($syllable_type eq 'L') {
+ $leading = $codepoint - JamoLIni;
+ $vowel = $CodeV{'A'};
+ } elsif ($syllable_type eq 'V') {
+ $leading = $CodeL{''};
+ $vowel = $codepoint - JamoVIni - 1;
+ } elsif ($syllable_type eq 'T') {
+ # Always the circle.
+ $leading = $CodeL{''};
+ $vowel = $CodeV{'A'};
+ $trailing = $CodeT{'NG'};
+ } else {
+ return pack('U*', $codepoint);
+ }
+
+ return pack('U*', SBase + $leading * NCount + $vowel * TCount + $trailing);
+}
+
+sub romanize_codepoint {
+ my ($codepoint) = @_;
+
+ my $roman;
+
+ my $syllable_type = getSyllableType($codepoint);
+ if ($syllable_type eq 'L') {
+ $roman = $JamoL[$codepoint - JamoLIni];
+ } elsif ($syllable_type eq 'V') {
+ $roman = $JamoV[$codepoint - JamoVIni - 1];
+ } elsif ($syllable_type eq 'T') {
+ $roman = "NG";
+ } else {
+ return pack('U*', $codepoint);
+ }
+
+ return "|$roman|";
+}
+
+# Converts a trailing consonant to a string of leading ones.
+# Returns an array of codepoints.
+sub convert_trailing_to_leading {
+ my ($jamo) = @_;
+ my @ret_codepoints = unpack('U*', $jamo);
+
+ die 'normalize_jamo called with more than one codepoint' if (@ret_codepoints > 1);
+ my $codepoint = $ret_codepoints[0];
+
+ # Elide placeholder consonant.
+ if (@ret_codepoints == 1 && $ret_codepoints[0] == JamoLIni + $CodeL{''}) {
+ return ();
+ }
+
+ if (getSyllableType($codepoint) eq 'T') {
+ my $Tindex = $codepoint - JamoTIni + 1;
+
+ my @leaders = split /;/, $TtoL[$Tindex];
+ push @leaders, '' if (@leaders == 0); # '' gets lost otherwise
+
+ @ret_codepoints = ();
+ for my $leader (@leaders) {
+ my $new_codepoint = JamoLIni + $CodeL{$leader};
+ push @ret_codepoints, $new_codepoint;
+ }
+ }
+
+ if (@ret_codepoints == 1 && $ret_codepoints[0] == JamoLIni + $CodeL{''}) {
+ $ret_codepoints[0] = JamoTIni + $CodeT{'NG'} - 1;
+ }
+
+ return @ret_codepoints;
+}
+
+execute();