When you're down in Transylvania
And the night begins to fall You might get to feeling lonely If you do give me a call Just dial Transylvania 1, 2, 3, 4, 5 Transylvania 1, 2, 3, 4, 5 Transylvania 1, 2, 3, 4, 5 (oh and remove some stuff too) /b git-svn-id: http://svn.freeswitch.org/svn/freeswitch/trunk@12345 d0543943-73ff-0310-b7d9-9358b9ac24b2
This commit is contained in:
parent
e8c947163e
commit
7ac16e8aea
96
scripts/mklm
96
scripts/mklm
|
@ -1,96 +0,0 @@
|
|||
#!/usr/bin/perl
|
||||
# mklm - builds LMs from all available sentence files.
|
||||
#
|
||||
# Processes all .sent files in the current directory.
|
||||
#
|
||||
|
||||
use Data::Dumper;
|
||||
|
||||
# Every *.sent file in the current directory is one corpus; the part of the
# file name before the first dot names the directory its model is built in.
@sent = <*.sent>;

# "clean" mode: remove the generated per-corpus directories and stop.
if (defined $ARGV[0] && $ARGV[0] eq "clean") {
    foreach my $file (@sent) {
        my ($base) = split(/\./, $file);

        # Never hand rm an empty operand.
        next unless defined $base && length $base;

        # List-form system() bypasses the shell, so a file name containing
        # spaces or shell metacharacters cannot change what gets removed.
        system("rm", "-rf", "--", $base);
    }
    exit;
}
|
||||
|
||||
|
||||
|
||||
# Slurp the pronunciation dictionary into @dic (one raw line per element).
# A missing dictionary is reported but not fatal: the language models can
# still be built, only the generated .dic files will be empty.
@dic = ();
if (open(my $dic_fh, "<", "../conf/cmudict.0.6d")) {
    @dic = <$dic_fh>;
    close($dic_fh);
}
else {
    warn "mklm: can't open ../conf/cmudict.0.6d: $!\n";
}
|
||||
|
||||
|
||||
|
||||
# For each corpus: start from an empty output directory and run quick_lm.pl
# on the sentence file, writing <base>/<base>.lm.
foreach my $file (@sent) {
    my ($base) = split(/\./, $file);
    next unless defined $base && length $base;

    # List-form system() avoids the shell, so unusual file names are passed
    # through verbatim instead of being re-parsed as shell syntax.
    system("rm", "-rf", "--", $base);
    system("mkdir", "-p", "--", $base);

    # The original command line ended in "2>/dev/null"; without a shell the
    # same effect is obtained by pointing our own STDERR at /dev/null for
    # the duration of the child process and restoring it afterwards.
    open(my $saved_stderr, ">&", \*STDERR) or die "mklm: can't dup STDERR: $!\n";
    open(STDERR, ">", "/dev/null")         or die "mklm: can't silence STDERR: $!\n";
    my $status = system("../bin/quick_lm.pl", "-s", $file, "-o", "$base/$base.lm");
    open(STDERR, ">&", $saved_stderr)      or die "mklm: can't restore STDERR: $!\n";
    close($saved_stderr);

    warn "mklm: quick_lm.pl failed for $file (status $status)\n" if $status != 0;
}
|
||||
|
||||
# For each corpus, write <base>.words: the sorted, de-duplicated, upper-cased
# vocabulary of the sentence file, one word per line.
foreach my $file (@sent) {
    my ($word_file) = split(/\./, $file);

    my @lines;
    if (open(my $sent_fh, "<", $file)) {
        @lines = <$sent_fh>;
        close($sent_fh);
    }
    else {
        warn "mklm: can't open $file: $!\n";
    }

    # Hash keys give us uniqueness for free.
    my %seen;
    foreach my $line (@lines) {
        chomp($line);
        $line =~ s/<(.*?)>//gi;    # strip anything written in angle brackets
        $line =~ s/^\s*//;
        $line =~ s/\s*$//;
        $line =~ s/_/ /g;          # underscore-joined tokens count as separate words
        $seen{ uc($_) } = 1 foreach split(' ', $line);
    }

    # NOTE: the original had "print Dumpzer \@in;" here -- a misspelled debug
    # dump that printed to a non-existent filehandle and so produced nothing.
    # It has been dropped rather than "fixed" to keep the script quiet.

    my $words_fh;
    unless (open($words_fh, ">", "$word_file.words")) {
        warn "mklm: can't write $word_file.words: $!\n";
        next;
    }
    print {$words_fh} "$_\n" foreach sort keys %seen;
    close($words_fh);
}
|
||||
|
||||
# Turn every *.words vocabulary into <base>/<base>.dic by copying the matching
# entries (including numbered alternates such as "WORD(2)") out of @dic.
@word_files = <*.words>;

foreach my $file (@word_files) {
    my ($dic) = split(/\./, $file);

    # Index the vocabulary once so each dictionary line needs a single hash
    # lookup instead of a scan over the whole word list.  The value is the
    # number of times the word was listed, so a word that appears twice in
    # the .words file is still emitted twice, as before.
    my %wanted;
    if (open(my $words_fh, "<", $file)) {
        while (my $entry = <$words_fh>) {
            chomp $entry;
            $wanted{$entry}++;
        }
        close($words_fh);
    }
    else {
        warn "mklm: can't open $file: $!\n";
    }

    # The word list is only an intermediate product; remove it.
    unlink($file);
    unlink("$dic.words");

    my $dic_fh;
    unless (open($dic_fh, ">", "$dic/$dic.dic")) {
        warn "mklm: can't write $dic/$dic.dic: $!\n";
        next;
    }

    foreach my $entry (@dic) {
        my $line = $entry;
        chomp $line;

        # Lines are split into word and pronunciation at a run of two
        # whitespace characters; anything else is skipped.
        next unless $line =~ m/(.*)\s\s(.*)/;
        my ($word, $pron) = ($1, $2);
        $word =~ s/^\s*//;
        $word =~ s/\s*$//;
        $pron =~ s/^\s*//;
        $pron =~ s/\s*$//;

        # Alternate pronunciations carry a "(N)" suffix; match on the bare word.
        (my $key = $word) =~ s/\(\d\)//g;

        my $count = $wanted{$key} or next;
        print {$dic_fh} "$word\t$pron\n" for 1 .. $count;
    }
    close($dic_fh);
}
|
|
@ -1,257 +0,0 @@
|
|||
#!/usr/bin/perl
|
||||
|
||||
# NOTE: this is by no means an efficient implementation and performance will
|
||||
# deteriorate rapidly as a function of the corpus size. Larger corpora should be
|
||||
# processed using the toolkit available at http://www.speech.cs.cmu.edu/SLM_info.html
|
||||
|
||||
# [2feb96] (air)
|
||||
# cobbles together a language model from a set of exemplar sentences.
|
||||
# features: 1) uniform discounting, 2) no cutoffs
|
||||
# the "+" version allows insertion of extra words into the 1gram vector
|
||||
|
||||
# [27nov97] (air)
|
||||
# bulletproof a bit for use in conjunction with a cgi script
|
||||
|
||||
# [20000711] (air)
|
||||
# made visible the discount parameter
|
||||
|
||||
# [20011123] (air)
|
||||
# cleaned-up version for distribution
|
||||
|
||||
use Getopt::Std;
|
||||
|
||||
# Non-zero enables the progress messages written to STDERR.
$VERBOSE = 1;

# Signal handler: announce which signal arrived on STDERR, then leave with
# exit status 0.
#   $sig - signal name as passed by perl (without the "SIG" prefix)
sub handler {
    my ($sig) = @_;
    print STDERR "quick_lm caught a SIG$sig -- dying\n";
    exit(0);
}

# Route these signals through handler().
$SIG{$_} = \&handler foreach qw(XCPU KILL TERM STOP);
|
||||
|
||||
|
||||
# Command line: -s and -o are mandatory, -w and -d optional.  getopts() leaves
# each value in the corresponding $opt_X package variable.
unless (@ARGV) {
    die("usage: quick_lm -s <sentence_file> -o <output_file> [-w <word_file>] [-d discount]\n");
}
Getopt::Std::getopts("s:w:d:o:x");
($sentfile, $wordfile, $discount, $output) = ($opt_s, $opt_w, $opt_d, $opt_o);

die("No output file\n")   unless $output;
die("No sentence file\n") unless $sentfile;
|
||||
|
||||
$| = 1;    # always flush buffers

if ($VERBOSE > 0) { print STDERR "Language model started at ", scalar localtime(), "\n"; }

# Three-argument open: the file names come straight from the command line and
# must never be interpreted as an open mode or a pipe.
open(IN, "<", $sentfile) or die("can't open $sentfile!\n");

# Optional extra vocabulary (-w).  $wflag tells the later word-list stage
# whether the WORDS handle is available.
if (defined $wordfile && $wordfile ne "") {
    open(WORDS, "<", $wordfile) or die("can't open $wordfile!\n");
    $wflag = 1;
}
else {
    $wflag = 0;
}

$log10 = log(10.0);    # divisor used to convert natural logs to log10

# Discount mass: the share of probability taken away from observed n-grams.
# Ben and Greg's experiments show that 0.5 is a way better default choice.
$discount_mass = 0.5;
if (defined $discount && $discount ne "") {
    if (($discount <= 0.0) or ($discount >= 1.0)) {
        print STDERR "discount value out of range: must be 0.0 < x < 1.0! ...using 0.5\n";
    }
    else {
        $discount_mass = $discount;
    }
}
$deflator = 1.0 - $discount_mass;
|
||||
|
||||
# create count tables
#
# Fills %unigram, %bigram and %trigram (keys are the words joined by single
# spaces) from the sentence file and counts non-empty lines in $sent_cnt.
$sent_cnt = 0;
while (my $line = <IN>) {
    $line =~ s/^\s*//;
    $line =~ s/\s*$//;
    next if $line eq "";    # skip empty lines
    $sent_cnt++;

    # Split on runs of whitespace: a doubled space or a tab must not create
    # an empty "word", because every later stage parses the n-gram keys by
    # looking for single spaces.
    my @word = split(' ', $line);

    # Count every n-gram that fits completely inside the sentence.  A
    # one-word sentence therefore contributes a unigram only, rather than a
    # bigram with a missing second word.
    for my $i (0 .. $#word) {
        $unigram{ $word[$i] }++;
        $bigram{ join(" ", @word[ $i .. $i + 1 ]) }++  if $i + 1 <= $#word;
        $trigram{ join(" ", @word[ $i .. $i + 2 ]) }++ if $i + 2 <= $#word;
    }
}
close(IN);
if ($VERBOSE) { print STDERR "$sent_cnt sentences found.\n"; }
|
||||
|
||||
# add in any words
#
# Each non-blank line of the word file is one vocabulary entry; entries not
# seen in the corpus enter %unigram with a count of 1.
if ($wflag) {
    ($new, $read_in) = (0, 0);
    while (defined(my $entry = <WORDS>)) {
        $entry =~ s/^\s*//;
        $entry =~ s/\s*$//;
        next if $entry eq "";    # skip empty lines
        $read_in++;
        unless ($unigram{$entry}) {
            $unigram{$entry} = 1;
            $new++;
        }
    }
    print STDERR "tried to add $read_in word; $new were new words\n" if $VERBOSE;
    close(WORDS);
}

# Nothing to model: neither sentences nor added words.
unless ( ($sent_cnt != 0) || ($new != 0) ) {
    print STDERR "no input?\n";
    exit;
}
|
||||
|
||||
# Open the model file.  The failure message names $output (the original
# interpolated an undefined variable and so printed no file name at all).
open(LM, ">", $output) or die("can't open $output for output: $!\n");

# Free-text header written ahead of the \data\ section.
$preface  = "";
$preface .= "Language model created by QuickLM on " . `date`;
$preface .= "Copyright (c) 1996-2002\nCarnegie Mellon University and Alexander I. Rudnicky\n\n";
$preface .= "This model based on a corpus of $sent_cnt sentences and " . scalar(keys %unigram) . " words\n";
$preface .= "The (fixed) discount mass is $discount_mass\n\n";
|
||||
|
||||
|
||||
# compute counts
#
# $uni_count/$bi_count/$tri_count are the numbers of distinct n-grams;
# $unisum is the total number of unigram occurrences.
$uni_count = keys(%unigram);
$bi_count  = keys(%bigram);
$tri_count = keys(%trigram);

$unisum = 0;
$unisum += $_ foreach values(%unigram);

# Header plus the \data\ section; orders with no entries are left out.
print LM $preface, "\\data\\\n", "ngram 1=$uni_count\n";
print LM "ngram 2=$bi_count\n"  if $bi_count > 0;
print LM "ngram 3=$tri_count\n" if $tri_count > 0;
print LM "\n";
|
||||
|
||||
# compute uni probs
# Relative frequency, scaled down by the deflator (1 - discount mass).
foreach my $w (keys(%unigram)) {
    $uniprob{$w} = ($unigram{$w} / $unisum) * $deflator;
}

# compute alphas
# For every word w1, add up the unigram probability of each word seen to
# follow it.  This is done in ONE pass over the bigram table (the original
# rescanned all bigrams for every unigram); the per-word sums are accumulated
# in the same key order, so the results are unchanged.
my %follower_mass = map { $_ => 0.0 } keys(%unigram);
foreach my $pair (keys(%bigram)) {
    my $w1 = substr($pair, 0, rindex($pair, " "));
    next unless exists $follower_mass{$w1};
    my $w2 = substr($pair, index($pair, " ") + 1);
    $follower_mass{$w1} += $uniprob{$w2};
}
foreach my $w1 (keys(%unigram)) {
    $alpha{$w1} = $discount_mass / (1.0 - $follower_mass{$w1});
}

# One line per word: log10 probability, word, log10 back-off weight.
print LM "\\1-grams:\n";
foreach my $w (sort keys(%unigram)) {
    printf LM "%6.4f %s %6.4f\n", log($uniprob{$w}) / $log10, $w, log($alpha{$w}) / $log10;
}
print LM "\n";
|
||||
print LM "\n";
|
||||
|
||||
# compute bi probs
# Discounted bigram count divided by the count of the bigram's first word.
foreach my $pair (keys(%bigram)) {
    my $w1 = substr($pair, 0, rindex($pair, " "));
    $biprob{$pair} = ($bigram{$pair} * $deflator) / $unigram{$w1};
}

# compute bialphas
# For every bigram "w1 w2", add up the bigram probability of "w2 w3" over the
# observed trigrams "w1 w2 w3".  A single pass over the trigram table replaces
# the original scan of all trigrams for every bigram; accumulation order per
# bigram is the same, so the results are unchanged.
my %continuation_mass = map { $_ => 0.0 } keys(%bigram);
foreach my $triple (keys(%trigram)) {
    my $w1w2 = substr($triple, 0, rindex($triple, " "));
    next unless exists $continuation_mass{$w1w2};
    my $w2w3 = substr($triple, index($triple, " ") + 1);
    $continuation_mass{$w1w2} += $biprob{$w2w3};
}
foreach my $pair (keys(%bigram)) {
    $bialpha{$pair} = $discount_mass / (1.0 - $continuation_mass{$pair});
}
|
||||
|
||||
# output the bigrams and trigrams (now that we have the alphas computed).

# 2-grams: log10 probability, the two words, log10 back-off weight.
if ($bi_count > 0) {
    print LM "\\2-grams:\n";
    foreach my $pair (sort keys(%bigram)) {
        my $logprob = log($biprob{$pair}) / $log10;
        my $backoff = log($bialpha{$pair}) / $log10;
        printf LM "%6.4f %s %6.4f\n", $logprob, $pair, $backoff;
    }
    print LM "\n";
}

# 3-grams: log10 of the discounted trigram count over the count of its
# leading bigram, followed by the three words.
if ($tri_count > 0) {
    print LM "\\3-grams:\n";
    foreach my $triple (sort keys(%trigram)) {
        my $history = substr($triple, 0, rindex($triple, " "));
        my $logprob = log(($trigram{$triple} * $deflator) / $bigram{$history}) / $log10;
        printf LM "%6.4f %s\n", $logprob, $triple;
    }
    print LM "\n";
}
|
||||
|
||||
# Terminate the model.  Buffered write errors (full disk, quota, ...) only
# surface when the handle is closed, so the close is checked.
print LM "\\end\\\n";
close(LM) or die("can't finish writing the language model: $!\n");

if ($VERBOSE > 0) { print STDERR "Language model completed at ", scalar localtime(), "\n"; }
|
||||
|
||||
#
|
||||
__END__
|
||||
=pod
|
||||
|
||||
/* ====================================================================
|
||||
* Copyright (c) 1996-2002 Alexander I. Rudnicky and Carnegie Mellon University.
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the
|
||||
* distribution.
|
||||
*
|
||||
* 3. All copies, used or distributed, must preserve the original wording of
|
||||
* the copyright notice included in the output file.
|
||||
*
|
||||
* This work was supported in part by funding from the Defense Advanced
|
||||
* Research Projects Agency and the CMU Sphinx Speech Consortium.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
|
||||
* ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
|
||||
* THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
|
||||
* NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
* ====================================================================
|
||||
*
|
||||
*/
|
||||
|
||||
|
||||
Pretty Good Language Modeler, now with unigram vector augmentation!
|
||||
|
||||
The Pretty Good Language Modeler is intended for quick construction of small
|
||||
language models, typically as might be needed in application development. Depending
|
||||
on the version of Perl that you are running, a practical limitation is a
|
||||
maximum vocabulary size on the order of 1000-2000 words. The limiting factor
|
||||
is the number of n-grams observed, since each n-gram is stored as a hash key.
|
||||
(So smaller vocabularies may turn out to be a problem as well.)
|
||||
|
||||
This package computes a standard back-off language model. It differs in one significant
|
||||
respect, which is the computation of the discount. We adopt a "proportional" (or ratio)
|
||||
discount in which a certain percentage of probability mass is removed (typically 50%)
|
||||
from observed n-grams and redistributed over unobserved n-grams.
|
||||
|
||||
Conventionally, an absolute discount would be used, however we have found that the
|
||||
proportional discount appears to be robust for extremely small languages, as might be
|
||||
prototyped by a developer, as opposed to based on a collected corpus. We have found that
|
||||
absolute and proportional discounts produce comparable recognition results with perhaps
|
||||
a slight advantage for proportional discounting. A more systematic investigation of
|
||||
this technique would be desirable. In any case it also has the virtue of using a very
|
||||
simple computation.
|
||||
|
||||
=cut
|
||||
|
Loading…
Reference in New Issue