diff options
author | Jason Katz-Brown <jason@airbnb.com> | 2013-08-25 02:17:13 -0700 |
---|---|---|
committer | Jason Katz-Brown <jason@airbnb.com> | 2013-08-25 02:17:13 -0700 |
commit | 9306cb60c32082c5403931de0823a9fd5daa196c (patch) | |
tree | ca1b6eb695fdf3f0c2294e92416b272164bae642 /new_language_generation/perl_tools/generate_raw_words.pl | |
parent | 8fb2c681cecc01b46b0f4ba02d5cc177c4747b1c (diff) |
Initial git commit.
Diffstat (limited to 'new_language_generation/perl_tools/generate_raw_words.pl')
-rwxr-xr-x | new_language_generation/perl_tools/generate_raw_words.pl | 23 |
1 files changed, 23 insertions, 0 deletions
#!/usr/bin/perl

# USAGE: generate_raw_words.pl generate_words_output [more_files ...]
#
# For every line of every file named on the command line, print the first
# whitespace-delimited field with all ';' characters removed, one per line,
# to STDOUT (UTF-8 encoded). Anything after the first whitespace character
# on a line is ignored (presumably a count column -- confirm against the
# producer of the input file).

use strict;
use warnings;

binmode STDOUT, ':utf8';

# read_and_spit()
#
# Reads each path in @ARGV as UTF-8 text and prints the cleaned first field
# of every line to the currently selected output handle.
#
# Takes no arguments and returns nothing. A file that cannot be opened is
# reported on STDERR and skipped, so the remaining files are still processed.
sub read_and_spit {
    for my $arg (@ARGV) {
        my $input;
        unless (open($input, "<:encoding(utf8)", $arg)) {
            # Previously an unreadable file was silently treated as empty.
            warn "generate_raw_words.pl: cannot open '$arg': $!\n";
            next;
        }

        # A lexical line variable keeps the caller's $_ intact.
        while (my $line = <$input>) {
            chomp $line;

            # Split on a single whitespace character: a line that starts
            # with whitespace therefore yields an empty first field.
            my ($word) = split(/\s/, $line);

            # An empty line produces no fields at all; emit an empty word
            # for it rather than tripping an "uninitialized" warning.
            my $clean_word = defined $word ? $word : '';
            $clean_word =~ tr/;//d;

            print "$clean_word\n";
        }

        close($input)
            or warn "generate_raw_words.pl: error closing '$arg': $!\n";
    }

    return;
}

read_and_spit();