rhubarb-lip-sync/rhubarb/lib/flite-1.4/tools/build_lex

183 lines
7.5 KiB
Bash

#!/bin/sh
###########################################################################
## ##
## Language Technologies Institute ##
## Carnegie Mellon University ##
## Copyright (c) 2004 ##
## All Rights Reserved. ##
## ##
## Permission is hereby granted, free of charge, to use and distribute ##
## this software and its documentation without restriction, including ##
## without limitation the rights to use, copy, modify, merge, publish, ##
## distribute, sublicense, and/or sell copies of this work, and to ##
## permit persons to whom this work is furnished to do so, subject to ##
## the following conditions: ##
## 1. The code must retain the above copyright notice, this list of ##
## conditions and the following disclaimer. ##
## 2. Any modifications must be clearly marked as such. ##
## 3. Original authors' names are not deleted. ##
## 4. The authors' names are not used to endorse or promote products ##
## derived from this software without specific prior written ##
## permission. ##
## ##
## CARNEGIE MELLON UNIVERSITY AND THE CONTRIBUTORS TO THIS WORK ##
## DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ##
## ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT ##
## SHALL CARNEGIE MELLON UNIVERSITY NOR THE CONTRIBUTORS BE LIABLE ##
## FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES ##
## WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN ##
## AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ##
## ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF ##
## THIS SOFTWARE. ##
## ##
###########################################################################
## ##
## Make the files from the Festival Released version ##
## ##
## This doesn't work yet ##
## ##
###########################################################################
# Locate the flite source tree.  A non-empty FLITEDIR inherited from the
# environment wins; otherwise fall back to two levels above the directory
# the script is started from.
# The emptiness test uses -z because "==" inside [ ] is a bash extension:
# other /bin/sh implementations (dash, for one) reject it, the test fails,
# and FLITEDIR would be left unset.
if [ -z "$FLITEDIR" ]
then
    FLITEDIR=$(pwd)/../..
fi
# Prefix of the file names read and written by the "compresslex" stage.
# NOTE(review): the "lex" and "install" stages use the literal prefix "cmu"
# rather than this value -- confirm which of the two is intended.
LEXNAME=cmu_dari
# With no arguments, run every build stage in order by re-invoking this
# script once per stage, then exit.
if [ "$#" -eq 0 ]
then
    for stage in setup lts lex compresslex install
    do
        # "$0" is quoted so that a script path containing spaces still runs.
        # Stop at the first stage that reports failure (exiting with that
        # stage's status) instead of carrying on with the remaining stages.
        "$0" "$stage" || exit
    done
    # All stages ran; the exit status is that of the final stage.
    exit
fi
# Stage "setup": unpack festlex_CMU.tar.gz, copy the build_lts scripts in
# from the FestVox tree, and create the working directories used by the
# later stages.
if [ "$1" = "setup" ]
then
    # The build_lts scripts are copied from $FESTVOXDIR; if it were unset
    # the source paths would silently become /src/lts/..., so refuse to go on.
    : "${FESTVOXDIR:?FESTVOXDIR must be set to the FestVox directory}"
    tar zxvf festlex_CMU.tar.gz
    cp -p "$FESTVOXDIR/src/lts/build_lts" festival/lib/dicts/cmu
    cp -p "$FESTVOXDIR/src/lts/build_lts_rules" festival/lib/dicts/cmu
    # -p keeps a second run from failing on directories that already exist.
    mkdir -p festival/lib/dicts/cmu/c \
        festival/lib/dicts/cmu/wfst \
        festival/lib/dicts/cmu/lts_scratch
fi
# Stage "lts": inside the unpacked dictionary directory, dump the lexicon
# entries with festival, run the build_lts steps in sequence, then run
# make_lts_wfst.scm and make_lts.scm from $FLITEDIR/tools on the resulting
# lts_scratch/lex_lts_rules.scm.
if [ "$1" = "lts" ]
then
    (
        # The subshell keeps the directory change local to this stage.
        # Bail out if the directory is missing rather than running the
        # whole build in the wrong place.
        cd festival/lib/dicts/cmu || exit 1
        festival --heap 10000000 -b allowables.scm '(dump-flat-entries-all "cmudict-0.4.out" "lts_scratch/lex_entries.out")'
        ./build_lts cummulate
        ./build_lts align
        ./build_lts build
        ./build_lts merge
        ./build_lts test
        festival --heap 10000000 -b "$FLITEDIR/tools/make_lts_wfst.scm" lts_scratch/lex_lts_rules.scm '(lts_to_rg_to_wfst lex_lts_rules "wfst/")'
        festival --heap 10000000 -b "$FLITEDIR/tools/make_lts.scm" lts_scratch/lex_lts_rules.scm '(ltsregextoC "cmu" lex_lts_rules "wfst/" "c")'
    )
fi
# Stage "lex": prune the lexicon, compile the pruned version, and pass it
# to lextoC, all inside the unpacked dictionary directory.
if [ "$1" = "lex" ]
then
    (
        # Bail out if the directory is missing rather than running festival
        # in the wrong place.
        cd festival/lib/dicts/cmu || exit 1
        # Find the words to prune from the built lexicon.
        # We prune words that aren't homographs and that the LTS rules
        # already get correct.
        # NOTE(review): the original comment also mentioned lts_test.log,
        # but nothing in this stage reads that file.
        #
        # make_lex.scm is taken from $FLITEDIR/tools, the same place the
        # last command of this stage loads it from, instead of a fixed
        # path under one user's home directory.
        festival -b cmulex.scm "$FLITEDIR/tools/make_lex.scm" '(remove_predictable_entries "cmudict-0.4.out" "pruned_lex.scm" "lex_lts_rules.scm")'
        festival --heap 10000000 -b '(lex.compile "pruned_lex.scm" "pruned_lex.out")'
        festival --heap 10000000 -b "$FLITEDIR/tools/make_lex.scm" '(lextoC "cmu" "pruned_lex.out" "c")'
    )
fi
# Stage "lex2" (experiment, not fully implemented): put Letter_Phone(s)
# together as the things to compress.  This does give a much smaller
# result, though it needs to stay below 256 symbols.
if [ "$1" = "lex2" ]
then
    (
        # Bail out if the directory is missing rather than reading and
        # writing files in the wrong place.
        cd festival/lib/dicts/cmu || exit 1
        # Use lts_test.log to find the failed entries and include only them.
        # This needs festival-1.96 or later to get the pos from lts_test.log.
        #
        # Pipeline, step by step:
        #   grep  keeps the lines that start with "failed ".
        #   sed   puts a space after every "(" and before every ")" so the
        #         parentheses become separate awk fields.
        #   awk 1 joins fields 3.. up to the first ")" into one quoted
        #         string, prints the field after that ")", skips one more
        #         field, then prints the following fields up to the next
        #         ")", dropping any "#" field.
        #         (presumably word, part of speech, phones -- confirm
        #         against the lts_test.log format)
        #   tr    deletes the parentheses and double quotes again.
        #   awk 2 prints "<field 2>_start" ("0_start" when field 2 is
        #         "nil"), then pairs the i-th character of field 1 with
        #         field i+2 as "<char>_<field>".
        # The result is written to pruned_lex2.data.
        grep "^failed " lts_test.log |
        sed 's/(/( /g;s/)/ )/g' |
        awk '{printf("( \"");
            for (i=3; $i != ")"; i++)
                printf("%s",$i);
            i++;
            printf("\" %s ( ",$i);
            i++; i++
            for ( ; $i != ")"; i++)
                if ($i != "#")
                    printf("%s ",$i);
            printf("))\n");
        }' |
        tr -d '()"' |
        awk '{if ($2 == "nil")
                printf("0_start ");
            else
                printf("%s_start ",$2);
            for (i=1; i<=length($1); i++)
                printf("%s_%s ",substr($1,i,1),$(i+2));
            printf("\n");}' >pruned_lex2.data
        # festival --heap 10000000 -b $FLITEDIR/tools/make_lex.scm '(lextoC "cmu" "pruned_lex.out" "c")' ;
    )
fi
# Stage "compresslex": compress the entries and phone strings by finding
# the best ngrams, then write the compressed lexicon data as C source
# together with its byte count.
if [ "$1" = "compresslex" ]
then
    (
        # Bail out if the directory is missing so that nothing is generated
        # in the wrong place.
        # NOTE(review): this enters ./c, whereas the "install" stage reads
        # its files from festival/lib/dicts/cmu/c -- confirm the directory.
        cd c || exit 1
        "$FLITEDIR/tools/huff_table" phones "${LEXNAME}_lex_data" "${LEXNAME}_lex_phones_huff_table.c"
        "$FLITEDIR/tools/huff_table" entries "${LEXNAME}_lex_data" "${LEXNAME}_lex_entries_huff_table.c"
        # paste joins the three huff.* files (presumably left behind by the
        # huff_table runs above -- confirm) line by line and tr strips the
        # spaces, so awk sees: $1 = compressed entry, $2 = compressed
        # phones, $3 = corpus text.
        # For every input line awk emits, as decimal byte values:
        #   - the phone codes, walking $2 backwards in 4-character steps
        #     and reading the last three characters of each group as octal
        #     digits (unenoctal);
        #   - the marker 255 followed by a C comment holding the running
        #     byte count and $3;
        #   - the entry codes, walking $1 forwards in the same 4-character
        #     groups;
        #   - a terminating 0.
        # pcount starts at 1 and grows by one per emitted byte; the END
        # rule prints it in a trailing "num_bytes" comment.
        paste huff.entries.compressed huff.phones.compressed huff.tmp.corpus |
        tr -d " " |
        awk 'BEGIN {pcount = 1;
                printf("/* index to compressed data */\n");
            }
            function unenoctal(x)
            {
                y = ((substr(x,1,1)+0)*64) + ((substr(x,2,1)+0)*8) + (substr(x,3,1)+0);
                return y;
            }
            {printf(" ");
                for (i=length($2)-3; i>0; i-=4)
                {
                    printf("%d,",unenoctal(substr($2,i+1,3)));
                    pcount++;
                }
                pcount++;
                printf(" 255, /* %d %s */ ",pcount,$3);
                for (i=1; i<length($1); i+=4)
                {
                    printf("%d,",unenoctal(substr($1,i+1,3)));
                    pcount++;
                }
                printf("0,\n");
                pcount++;
            }
            END { printf("/* num_bytes = %d */\n",pcount);}' >"${LEXNAME}_lex_data_compressed.c"
        # Pull the number (4th whitespace-separated field) out of the
        # trailing "/* num_bytes = N */" comment into its own file.
        grep "num_bytes = " "${LEXNAME}_lex_data_compressed.c" |
        awk '{print $4}' >"${LEXNAME}_lex_num_bytes_compressed.c"
    )
fi
# Stage "install": copy the generated C files from the dictionary build
# directory into the current directory, preserving their attributes.
# Each list item is "source:destination"; a destination of "." keeps the
# source file name, anything else renames the copy.
if [ "$1" = "install" ]
then
    for pair in \
        cmu_lex_data.c:. \
        cmu_lex_data_compressed.c:cmu_lex_data_raw.c \
        cmu_lex_phones_huff_table.c:. \
        cmu_lex_entries_huff_table.c:. \
        cmu_lex_entries.c:. \
        cmu_lex_num_bytes_compressed.c:cmu_lex_num_bytes.c \
        cmu_lts_model.c:. \
        cmu_lts_model.h:. \
        cmu_lts_rules.c:.
    do
        # Text before the colon is the source name, text after it the target.
        cp -p "festival/lib/dicts/cmu/c/${pair%%:*}" "${pair#*:}"
    done
fi