diff --git a/scripts/Makefile.gram b/scripts/Makefile.gram
new file mode 100644
index 0000000000..0be0f9caa8
--- /dev/null
+++ b/scripts/Makefile.gram
@@ -0,0 +1,13 @@
+
+main: grammar
+
+grammar:
+	@echo "Creating Grammar files..."
+	@../bin/mklm
+	@echo "Done."
+
+clean:
+	@echo "Cleaning Grammar files..."
+	@rm -f *.fsg *.words *.dic *~ \#*
+	@../bin/mklm clean
+	@echo "Done."
diff --git a/scripts/mklm b/scripts/mklm
new file mode 100755
index 0000000000..06a58afe85
--- /dev/null
+++ b/scripts/mklm
@@ -0,0 +1,95 @@
+#!/usr/bin/perl
+# mklm - builds LMs from all available sentence files.
+#
+# Processes all .sent files in the current directory.
+#
+
+use Data::Dumper;
+
+@sent = <*.sent>;
+
+if ($ARGV[0] eq "clean") {
+    foreach $file (@sent) {
+        my($base,$ext) = split(/\./, $file);
+        system("rm -rf $base");
+    }
+    exit;
+}
+
+# load the CMU pronouncing dictionary
+open(DIC,"<../conf/cmudict.0.6d") or die "can't open ../conf/cmudict.0.6d: $!\n";
+@dic = <DIC>;
+close(DIC);
+
+# build a language model for each sentence file
+foreach $file (@sent) {
+    my($base,$ext) = split(/\./, $file);
+    system("rm -rf $base");
+    system("mkdir -p $base");
+    system("../bin/quick_lm.pl -s $file -o $base/$base.lm 2>/dev/null");
+}
+
+# extract the unique word list from each sentence file
+foreach $file (@sent) {
+    my($word_file,$ext) = split(/\./, $file);
+    open(SENT,"<$file");
+    @lines = <SENT>;
+    close(SENT);
+    undef @in;
+    undef @out;
+    open(WORDS,">$word_file.words");
+    foreach $line (@lines) {
+        chomp($line);
+        $line =~ s/<(.*?)>//gi;   # strip markers such as <s> and </s>
+        $line =~ s/^\s*//;
+        $line =~ s/\s*$//;
+        @tmp = split(' ', $line);
+        foreach $tmp (@tmp) {
+            push(@in, uc($tmp));
+        }
+    }
+    #print Dumper(\@in);   # debug
+    undef %saw;
+    @saw{@in} = ();
+    @out = sort keys %saw;   # unique, sorted word list
+    foreach $line (@out) {
+        print WORDS "$line\n";
+    }
+    close(WORDS);
+}
+
+# build a per-grammar dictionary from the matching cmudict entries
+@word_files = <*.words>;
+
+foreach $file (@word_files) {
+    my($base,$ext) = split(/\./, $file);
+
+    open(WORDS,"<$file");
+    @words = <WORDS>;
+    close(WORDS);
+    unlink($file);
+    open(DIC, ">$base/$base.dic");
+    foreach $line (@dic) {
+        chomp $line;
+        # cmudict separates the word and its pronunciation with two spaces
+        if ($line =~ m/(.*)\s\s(.*)/) {
+            my $word = $1;
+            my $pron = $2;
+            $word =~ s/^\s*//;
+            $word =~ s/\s*$//;
+            $pron =~ s/^\s*//;
+            $pron =~ s/\s*$//;
+            foreach $myword (@words) {
+                chomp $myword;
+                $string = $word;
+                $string =~ s/\(\d\)//g;   # drop alternate-pronunciation suffixes like (2)
+                if ($myword eq $string) {
+                    print DIC "$word\t$pron\n";
+                }
+            }
+        }
+    }
+    close(DIC);
+}
diff --git a/scripts/quick_lm.pl b/scripts/quick_lm.pl
new file mode 100755
index 0000000000..9172acbd7e
--- /dev/null
+++ b/scripts/quick_lm.pl
@@ -0,0 +1,257 @@
+#!/usr/bin/perl
+
+# NOTE: this is by no means an efficient implementation and performance will
+# deteriorate rapidly as a function of the corpus size. Larger corpora should be
+# processed using the toolkit available at http://www.speech.cs.cmu.edu/SLM_info.html
+
+# [2feb96] (air)
+# cobbles together a language model from a set of exemplar sentences.
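+# (example invocation, with hypothetical file names:
+#    quick_lm.pl -s demo.sent -o demo.lm
+# reads exemplar sentences from demo.sent and writes the LM to demo.lm)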
+# features: 1) uniform discounting, 2) no cutoffs
+# the "+" version allows insertion of extra words into the 1gram vector
+
+# [27nov97] (air)
+# bulletproof a bit for use in conjunction with a cgi script
+
+# [20000711] (air)
+# made visible the discount parameter
+
+# [20011123] (air)
+# cleaned-up version for distribution
+
+use Getopt::Std;
+
+$VERBOSE = 1;
+
+sub handler { local($sig) = @_;
+	      print STDERR "quick_lm caught a SIG$sig -- dying\n";
+	      exit(0);
+	    }
+foreach (qw(XCPU KILL TERM STOP)) { $SIG{$_} = \&handler; }
+
+if ($#ARGV < 0) { die("usage: quick_lm -s <sentfile> -o <outfile> [-w <wordfile>] [-d discount]\n"); }
+Getopt::Std::getopts("s:w:d:o:x");
+$sentfile = $opt_s;
+$wordfile = $opt_w;
+$discount = $opt_d;
+$output   = $opt_o;
+
+$output   or die("No output file\n");
+$sentfile or die("No sentence file\n");
+
+$| = 1;  # always flush buffers
+
+if ($VERBOSE>0) { print STDERR "Language model started at ",scalar localtime(),"\n"; }
+
+open(IN,"<$sentfile") or die("can't open $sentfile!\n");
+if ($wordfile ne "") { open(WORDS,"<$wordfile"); $wflag = 1; } else { $wflag = 0; }
+
+$log10 = log(10.0);
+
+if ($discount ne "") {
+    if (($discount<=0.0) or ($discount>=1.0)) {
+	print STDERR "discount value out of range: must be 0.0 < x < 1.0! ...using 0.5\n";
+	$discount_mass = 0.5;  # just use the default
+    } else {
+	$discount_mass = $discount;
+    }
+} else {
+    # Ben and Greg's experiments show that 0.5 is a much better default choice.
+    $discount_mass = 0.5;  # Set a nominal discount...
+}
+$deflator = 1.0 - $discount_mass;
+
+# create count tables; each sentence is expected to contain at least two
+# tokens (typically bracketed by <s> and </s>)
+$sent_cnt = 0;
+while (<IN>) {
+    s/^\s*//; s/\s*$//;
+    if ( $_ eq "" ) { next; } else { $sent_cnt++; }  # skip empty lines
+    @word = split(/\s/);
+    for ($j=0;$j<($#word-1);$j++) {
+	$trigram{join(" ",$word[$j],$word[$j+1],$word[$j+2])}++;
+	$bigram{ join(" ",$word[$j],$word[$j+1])}++;
+	$unigram{$word[$j]}++;
+    }
+    # finish up the bi and uni's at the end of the sentence...
+    $bigram{join(" ",$word[$j],$word[$j+1])}++;
+    $unigram{$word[$j]}++;
+    $unigram{$word[$j+1]}++;
+}
+close(IN);
+if ($VERBOSE) { print STDERR "$sent_cnt sentences found.\n"; }
+
+# add in any extra words
+if ($wflag) {
+    $new = 0; $read_in = 0;
+    while (<WORDS>) {
+	s/^\s*//; s/\s*$//;
+	if ( $_ eq "" ) { next; } else { $read_in++; }  # skip empty lines
+	if (! $unigram{$_}) { $unigram{$_} = 1; $new++; }
+    }
+    if ($VERBOSE) { print STDERR "tried to add $read_in words; $new were new words\n"; }
+    close(WORDS);
+}
+if ( ($sent_cnt==0) && ($new==0) ) {
+    print STDERR "no input?\n";
+    exit;
+}
+
+open(LM,">$output") or die("can't open $output for output!\n");
+
+$preface  = "";
+$preface .= "Language model created by QuickLM on ".`date`;
+$preface .= "Copyright (c) 1996-2002\nCarnegie Mellon University and Alexander I. Rudnicky\n\n";
+$preface .= "This model is based on a corpus of $sent_cnt sentences and ".scalar(keys %unigram)." words\n";
+$preface .= "The (fixed) discount mass is $discount_mass\n\n";
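+# A worked example of the discounting below (illustrative numbers, not from
+# any particular corpus): with discount_mass = 0.5, a word seen 2 times in a
+# corpus of 10 tokens gets unigram probability (2/10) * (1 - 0.5) = 0.1;
+# the reserved mass is redistributed over unseen n-grams through the
+# back-off weights (alphas) computed further down.
+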
" words\n"; +$preface .= "The (fixed) discount mass is $discount_mass\n\n"; + + +# compute counts +$unisum = 0; $uni_count = 0; $bi_count = 0; $tri_count = 0; +foreach $x (keys(%unigram)) { $uni_count++; $unisum += $unigram{$x}; } +foreach $x (keys(%bigram)) { $bi_count++; } +foreach $x (keys(%trigram)) { $tri_count++; } + +print LM $preface; +print LM "\\data\\\n"; +print LM "ngram 1=$uni_count\n"; +if ( $bi_count > 0 ) { print LM "ngram 2=$bi_count\n"; } +if ( $tri_count > 0 ) { print LM "ngram 3=$tri_count\n"; } +print LM "\n"; + +# compute uni probs +foreach $x (keys(%unigram)) { + $uniprob{$x} = ($unigram{$x}/$unisum) * $deflator; +} + +# compute alphas +foreach $y (keys(%unigram)) { + $w1 = $y; + $sum_denom = 0.0; + foreach $x (keys(%bigram)) { + if ( substr($x,0,rindex($x," ")) eq $w1 ) { + $w2 = substr($x,index($x," ")+1); + $sum_denom += $uniprob{$w2}; + } + } + $alpha{$w1} = $discount_mass / (1.0 - $sum_denom); +} + +print LM "\\1-grams:\n"; +foreach $x (sort keys(%unigram)) { + printf LM "%6.4f %s %6.4f\n", log($uniprob{$x})/$log10, $x, log($alpha{$x})/$log10; +} +print LM "\n"; + +#compute bi probs +foreach $x (keys(%bigram)) { + $w1 = substr($x,0,rindex($x," ")); + $biprob{$x} = ($bigram{$x}*$deflator)/$unigram{$w1}; +} + +#compute bialphas +foreach $x (keys(%bigram)) { + $w1w2 = $x; + $sum_denom = 0.0; + foreach $y (keys(%trigram)) { + if (substr($y,0,rindex($y," ")) eq $w1w2 ) { + $w2w3 = substr($y,index($y," ")+1); + $sum_denom += $biprob{$w2w3}; + } + } + $bialpha{$w1w2} = $discount_mass / (1.0 - $sum_denom); +} + +# output the bigrams and trigrams (now that we have the alphas computed). +if ( $bi_count > 0 ) { + print LM "\\2-grams:\n"; + foreach $x (sort keys(%bigram)) { + printf LM "%6.4f %s %6.4f\n", + log($biprob{$x})/$log10, $x, log($bialpha{$x})/$log10; + } + print LM "\n"; +} + +if ($tri_count > 0 ) { + print LM "\\3-grams:\n"; + foreach $x (sort keys(%trigram)) { + $w1w2 = substr($x,0,rindex($x," ")); + printf LM "%6.4f %s\n", + log(($trigram{$x}*$deflator)/$bigram{$w1w2})/$log10, $x; + } + print LM "\n"; +} + +print LM "\\end\\\n"; +close(LM); + +if ($VERBOSE>0) { print STDERR "Language model completed at ",scalar localtime(),"\n"; } + +# +__END__ +=pod + +/* ==================================================================== + * Copyright (c) 1996-2002 Alexander I. Rudnicky and Carnegie Mellon University. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * 3. All copies, used or distributed, must preserve the original wording of + * the copyright notice included in the output file. + * + * This work was supported in part by funding from the Defense Advanced + * Research Projects Agency and the CMU Sphinx Speech Consortium. + * + * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND + * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. 
+__END__
+=pod
+
+/* ====================================================================
+ * Copyright (c) 1996-2002 Alexander I. Rudnicky and Carnegie Mellon University.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * 3. All copies, used or distributed, must preserve the original wording of
+ *    the copyright notice included in the output file.
+ *
+ * This work was supported in part by funding from the Defense Advanced
+ * Research Projects Agency and the CMU Sphinx Speech Consortium.
+ *
+ * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
+ * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
+ * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * ====================================================================
+ *
+ */
+
+
+Pretty Good Language Modeler, now with unigram vector augmentation!
+
+The Pretty Good Language Modeler is intended for quick construction of small
+language models, typically as might be needed in application development.
+Depending on the version of Perl that you are running, a practical limitation
+is a maximum vocabulary size on the order of 1000-2000 words. The limiting
+factor is the number of n-grams observed, since each n-gram is stored as a
+hash key. (So even a smaller vocabulary can become a problem if the corpus
+contains many distinct n-grams.)
+
+This package computes a standard back-off language model. It differs in one
+significant respect: the computation of the discount. We adopt a
+"proportional" (or ratio) discount in which a certain percentage of
+probability mass (typically 50%) is removed from observed n-grams and
+redistributed over unobserved n-grams.
+
+Conventionally, an absolute discount would be used; however, we have found
+that the proportional discount is robust for the extremely small languages a
+developer might prototype, as opposed to models built from a collected
+corpus. We have found that absolute and proportional discounts produce
+comparable recognition results, with perhaps a slight advantage for
+proportional discounting. A more systematic investigation of this technique
+would be desirable. In any case, it also has the virtue of being a very
+simple computation.
+
+=cut
diff --git a/src/mod/asr_tts/mod_pocketsphinx/Makefile b/src/mod/asr_tts/mod_pocketsphinx/Makefile
index 1f0a138ae0..f3bfa49921 100644
--- a/src/mod/asr_tts/mod_pocketsphinx/Makefile
+++ b/src/mod/asr_tts/mod_pocketsphinx/Makefile
@@ -47,4 +47,12 @@ $(DESTDIR)$(PREFIX)/grammar/model/communicator:
 	mkdir -p $(DESTDIR)$(PREFIX)/grammar/model/communicator
 	mv $(BASE)/libs/Communicator_semi_40.cd_semi_6000/* $(DESTDIR)$(PREFIX)/grammar/model/communicator
 
-local_install: $(DESTDIR)$(PREFIX)/grammar/model $(DESTDIR)$(PREFIX)/grammar/model/communicator
+grammar_maker:
+	@cp -f $(BASE)/scripts/mklm $(DESTDIR)$(PREFIX)/bin/mklm
+	@cp -f $(BASE)/scripts/quick_lm.pl $(DESTDIR)$(PREFIX)/bin/quick_lm.pl
+	@chmod 755 $(DESTDIR)$(PREFIX)/bin/mklm
+	@chmod 755 $(DESTDIR)$(PREFIX)/bin/quick_lm.pl
+	@cp -f $(BASE)/scripts/Makefile.gram $(DESTDIR)$(PREFIX)/grammar/Makefile
+	@cp -f $(POCKETSPHINX_DIR)/model/lm/cmudict.0.6d $(DESTDIR)$(PREFIX)/conf/cmudict.0.6d
+
+local_install: $(DESTDIR)$(PREFIX)/grammar/model $(DESTDIR)$(PREFIX)/grammar/model/communicator grammar_maker
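
A minimal smoke test of the installed grammar tools (a sketch: the pizza.sent
name and its contents are hypothetical, and the path assumes the default
install prefix):

    cd /usr/local/freeswitch/grammar
    printf '<s> a small cheese pizza </s>\n<s> a large pepperoni pizza </s>\n' > pizza.sent
    make            # runs ../bin/mklm: builds pizza/pizza.lm and pizza/pizza.dic
    make clean      # removes the generated grammar files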