rhubarb-lip-sync/rhubarb/lib/flite-1.4/tools/huff_table

294 lines
9.6 KiB
Bash

#!/bin/sh
###########################################################################
## ##
## Language Technologies Institute ##
## Carnegie Mellon University ##
## Copyright (c) 2004 ##
## All Rights Reserved. ##
## ##
## Permission is hereby granted, free of charge, to use and distribute ##
## this software and its documentation without restriction, including ##
## without limitation the rights to use, copy, modify, merge, publish, ##
## distribute, sublicense, and/or sell copies of this work, and to ##
## permit persons to whom this work is furnished to do so, subject to ##
## the following conditions: ##
## 1. The code must retain the above copyright notice, this list of ##
## conditions and the following disclaimer. ##
## 2. Any modifications must be clearly marked as such. ##
## 3. Original authors' names are not deleted. ##
## 4. The authors' names are not used to endorse or promote products ##
## derived from this software without specific prior written ##
## permission. ##
## ##
## CARNEGIE MELLON UNIVERSITY AND THE CONTRIBUTORS TO THIS WORK ##
## DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ##
## ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT ##
## SHALL CARNEGIE MELLON UNIVERSITY NOR THE CONTRIBUTORS BE LIABLE ##
## FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES ##
## WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN ##
## AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ##
## ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF ##
## THIS SOFTWARE. ##
## ##
###########################################################################
## Author: Alan W Black (awb@cs.cmu.edu) ##
## Date: December 2004 ##
###########################################################################
## ##
## Make a Huffman table data ##
## done by finding the top singletons, bigrams, trigrams ... and byte ##
## coding them ##
## ##
## But this isn't full huffman coding yet ##
## ##
## Hmm, this should probably be written in something other than shell ##
###########################################################################
## There are three stages:
##   1. Find the alphabet (usually 256 things).
##      This is done by finding the top singletons, bigrams, trigrams ...
##   2. Find the frequencies (currently ignored, as codes are always 1 byte).
##   3. Compress the data, and output the lookup tables.
## There is some octal conversion in here that is probably unnecessary.
# Tidy up the per-pass tmp files (huff.tmp.corpus.NNN and their .best
# companions) whenever the script exits, for whatever reason.
trap 'rm -f huff.tmp.corpus.[0-9]*' 0
# On a signal, leave with a failure status; that exit also fires the trap
# on 0 above.  Depending on the shell, a bare "exit" after the rm reports
# the status of the rm, which makes an interrupted run look successful.
trap 'exit 1' 2 3 5 10 13 15
# Newer versions of Linux have a not very useful value for LANG by default.
# LC_ALL takes precedence over LANG when the caller has it set, so it is
# pinned as well; sort and awk below then always run in the C locale.
LANG=C
LC_ALL=C
export LANG LC_ALL
## 0 is reserved, 1 is reserved too
ALPHABET_SIZE=254
# if [ $1 = "xentries" ]
# then
# infile=$2
# outfile=$3
# # Normalize the string to be compressed
# cat $infile | awk '{print $1}' |
# tr "A-Z" "a-z" |
# awk '{for (i=1; i<=length($1); i++)
# printf("%s ",substr($1,i,1));
# printf("\n");}' >huff.tmp.corpus
# fi
# "entries" mode: $2 is the input file, $3 the output table file.
# The first whitespace-separated field of every input line goes to
# huff.entries.corpus; festival is then run on make_lex.scm with a
# utf8entries call naming huff.entries.corpus and huff.tmp.corpus
# (utf8entries itself is defined outside this file).
if [ "$1" = "entries" ]
then
    # actually works for non-utf8 entries too
    infile=$2
    outfile=$3
    # Normalize the string to be compressed
    awk '{print $1}' <"$infile" >huff.entries.corpus
    "$ESTDIR/../festival/bin/festival" -b "$FLITEDIR/tools/make_lex.scm" '(utf8entries "huff.entries.corpus" "huff.tmp.corpus")'
fi
# "phones" mode: $2 is the input file, $3 the output table file.
if [ "$1" = "phones" ]
then
    infile=$2
    outfile=$3
    # Normalize the text to be compressed: keep the second field of every
    # line and turn each backslash in it into a space, so that the
    # backslash-separated items become separate words of huff.tmp.corpus.
    awk '{print $2}' <"$infile" |
        sed 's/\\/ /g' >huff.tmp.corpus
fi
# "residual" mode: $2 is the input file, $3 the output table file.
if [ "$1" = "residual" ]
then
    # This really doesn't work: just some tests to see what's worthwhile
    infile=$2
    outfile=$3
    # Normalize the text to be compressed.
    # A line whose 5th field contains "_res_" marks the line after it as
    # data.  That data line is rewritten as its first value followed by, for
    # every later field, (previous value - this value), with 256 added when
    # the difference is negative.  sed then strips the commas (and a "};"
    # if one is left over).
    awk '{if ($5 ~ /.*_res_/)
              armed = 1;
          else if (armed == 1)
          {
              # The first value is emitted as is and seeds the running
              # previous value.  It has to be $1: the loop index is not
              # yet set at this point.
              printf("%s ", $1);
              prev = $1;
              for (i = 2; i <= NF; i++)
              {
                  delta = prev - $i;
                  if (delta < 0)
                      delta += 256;
                  printf("%s ", delta);
                  prev = $i
              }
              printf("\n");
              armed = 0
          }}' <"$infile" |
        sed 's/,//g;s/};//' >huff.tmp.corpus
fi
# "other" mode: $2 is already a corpus of whitespace-separated symbols and
# is copied to huff.tmp.corpus unchanged; $3 is the output table file.
if [ "$1" = "other" ]
then
    infile=$2
    outfile=$3
    cat "$infile" >huff.tmp.corpus
fi
# Report how many symbols (whitespace-separated words) the corpus holds.
CORPUS_SIZE=$(cat huff.tmp.corpus | wc -w)
echo "Original Corpus size = $CORPUS_SIZE"
# Find the base alphabet: one symbol per line, duplicates removed, kept in
# huff.tmp.base.alphabet; its line count is the base alphabet size.
cat huff.tmp.corpus |
    awk '{ count = split($0, word);
           for (k = 1; k <= count; k++)
               print word[k]; }' |
    sort -u >huff.tmp.base.alphabet
BASE_ALPHABET_SIZE=$(cat huff.tmp.base.alphabet | wc -l)
echo "Base alphabet = $BASE_ALPHABET_SIZE"
# Find the additional best n grams
#
# Starting from the base corpus (pass 000), every pass finds the most
# frequent pair of adjacent symbols, rewrites each occurrence "A B" as the
# single symbol "A+B" to give the corpus of the next pass, and recounts the
# distinct symbols.  Passes repeat until $ALPHABET_SIZE symbols are in use,
# or until there is no pair left to join.
PASS=000
# NPASS names the newest corpus file.  It starts out equal to PASS so that
# it is valid even when the loop body never runs.
NPASS=$PASS
cp -p huff.tmp.corpus "huff.tmp.corpus.$PASS"
PASS_ALPHABET_SIZE=$BASE_ALPHABET_SIZE
# Numeric comparison, so padding that wc may have put around a count does
# not matter; an empty count is treated as 0.
while [ "${PASS_ALPHABET_SIZE:-0}" -ne "${ALPHABET_SIZE:-0}" ]
do
    # Loop until we get to the desired alphabet size

    # Count every adjacent pair and write "A B count count" for the most
    # frequent one.  The pair is keyed with SUBSEP, which cannot occur in a
    # whitespace-separated symbol, so distinct pairs never share a key.
    # awk exits non-zero when the corpus holds no pair at all.
    if ! awk '{for (i=1; i<NF; i++)
                   freq[$i, $(i+1)]++;}
              END {best="";
                   bestscore=0;
                   for (pair in freq)
                   {
                       if (freq[pair] > bestscore)
                       {
                           bestscore = freq[pair];
                           best = pair;
                       }
                   }
                   split(best, part, SUBSEP);
                   printf("%s %s %d %d\n",part[1],part[2],bestscore,freq[best]);
                   if (bestscore == 0)
                       exit 1;
              }' <"huff.tmp.corpus.$PASS" >"huff.tmp.corpus.$PASS.best"
    then
        # Every line is down to at most one symbol (or the corpus could not
        # be read): further passes would change nothing, so stop here
        # instead of spinning forever.
        echo "$PASS nothing left to join, stopping at alphabet size $PASS_ALPHABET_SIZE" >&2
        break
    fi
    read -r best_p1 best_p2 best_rest <"huff.tmp.corpus.$PASS.best"

    # Substitute the best bigram to give next corpus
    NPASS=$(echo "$PASS" | awk '{printf("%03d",$1+1)}')
    awk -v p1="$best_p1" -v p2="$best_p2" '
        # Appending "" makes p1 and p2 plain strings, so the comparisons
        # below are string comparisons even for symbols that look numeric.
        BEGIN {p1 = p1 ""; p2 = p2 "";}
        {for (i=1; i<=NF; i++)
         {
             if (($i == p1) && ($(i+1) == p2))
             {
                 printf("%s+%s ",$i,$(i+1));
                 i++;
             }
             else
                 printf("%s ",$i);
         }
         printf("\n");}' <"huff.tmp.corpus.$PASS" >"huff.tmp.corpus.$NPASS"

    # Summary
    corpus_was=$(wc -w <"huff.tmp.corpus.$PASS" | tr -d ' ')
    CORPUS_IS=$(wc -w <"huff.tmp.corpus.$NPASS" | tr -d ' ')
    printf '%s joining %s %s %s\n' "$PASS" "$best_p1" "$best_p2" "$best_rest"
    echo " Corpus was $corpus_was"
    echo " Corpus is $CORPUS_IS"
    # Size of the joined corpus as a percentage of the original size;
    # skipped when the original size is 0.
    echo "$CORPUS_SIZE $CORPUS_IS" |
        awk '$1 > 0 {printf(" Compression %0.2f\n",100*($2*1.0)/$1)}'

    PASS=$NPASS
    # Number of distinct symbols in the new corpus
    PASS_ALPHABET_SIZE=$(awk '{for (i=1; i<=NF; i++)
                                   seen[$i] = 1;}
                              END {n = 0;
                                   for (s in seen)
                                       n++;
                                   print n;}' <"huff.tmp.corpus.$NPASS")
done
# Now build the map
# The newest pass corpus becomes huff.tmp.corpus.best.  NPASS is only set
# once the n-gram loop body has run; PASS, which the loop keeps equal to
# NPASS, also covers the case where the base alphabet already had the wanted
# size and the loop body never ran.
mv "huff.tmp.corpus.${NPASS:-$PASS}" huff.tmp.corpus.best
rm -f huff.tmp.corpus.[0-9]*
# Relative frequency of every symbol, sorted rarest first; the line number
# after the sort becomes the code of the symbol.  Each maptable line is a
# pair of awk statements:  map["sym"] = N; unmap[N] = "sym";
awk 'BEGIN {tot=0;}
     { for (i=1; i<=NF; i++)
       {
           freq[$i]++;
           tot++;
       }
     }
     END { for (sym in freq)
               printf("%f %s\n",freq[sym]/tot,sym);}' <huff.tmp.corpus.best |
    sort -n |
    awk '{printf("map[\"%s\"] = %d; unmap[%d] = \"%s\";\n",$2,NR,NR,$2)}' >huff.tmp.corpus.maptable
# The table is also kept in a variable: the later awk programs splice it
# into their BEGIN blocks.
maptable=$(cat huff.tmp.corpus.maptable)
# compress the corpus
# Every symbol of huff.tmp.corpus.best is replaced by a backslash followed
# by the three octal digits of its code in the map; line structure is kept.
awk 'BEGIN {'"$maptable"'}
     # Three octal digits of code, least significant digit peeled off first.
     function enoctal(code,    digits, k)
     {
         digits = "";
         for (k = 0; k < 3; k++)
         {
             digits = (code % 8) digits;
             code = int(code / 8);
         }
         return digits;
     }
     {
         line = "";
         for (f = 1; f <= NF; f++)
             line = line "\\" enoctal(map[$f]);
         print line;
     }' huff.tmp.corpus.best > huff.tmp.corpus.compressed
# uncompress the corpus
# Reverse of the step above: each backslash becomes a space, which leaves
# one three-digit octal code per word, and every code is looked up in unmap.
# The result goes to huff.tmp.corpus.uncompressed.
sed 's/\\/ /g' huff.tmp.corpus.compressed |
    awk 'BEGIN {'"$maptable"'}
         # Value of the first three characters of oct read as octal digits.
         function unenoctal(oct,    k, val)
         {
             val = 0;
             for (k = 1; k <= 3; k++)
                 val = val * 8 + (substr(oct, k, 1) + 0);
             return val;
         }
         {
             line = "";
             for (f = 1; f <= NF; f++)
                 line = line unmap[unenoctal($f)] " ";
             print line;
         }' > huff.tmp.corpus.uncompressed
# "phones" mode output: write one C string per symbol of the best corpus to
# $outfile, rarest symbol first, and keep the compressed corpus as
# huff.phones.compressed.
if [ "$1" = "phones" ]
then
    # The symbols are counted and sorted under their "A+B" spelling, the
    # same input the map is sorted on, and only afterwards is each "+"
    # turned into a backslash.  sort -n orders lines with equal frequency by
    # their text, so rewriting the symbols before the sort could put ties in
    # a different order than the codes of the map.  (The frequency column
    # comes from %f and never holds a "+".)
    awk '{ for (i=1; i<=NF; i++)
           {
               freq[$i]++;
               tot++;
           }
         }
         END { for (sym in freq)
                   printf("%f %s\n",freq[sym]/tot,sym);}' <huff.tmp.corpus.best |
        sort -n |
        sed 's/+/\\/g' |
        awk '{printf(" \"\\%s\" , /* %f */ \n",$2,$1);}' >"$outfile"
    mv huff.tmp.corpus.compressed huff.phones.compressed
fi
# "entries" mode output: write one C string per symbol of the best corpus to
# $outfile, rarest symbol first, and keep the compressed corpus as
# huff.entries.compressed.
if [ "$1" = "entries" ]
then
    # The symbols are counted and sorted under their "A+B" spelling, the
    # same input the map is sorted on, and only afterwards are the "+"
    # joiners removed.  sort -n orders lines with equal frequency by their
    # text, so rewriting the symbols before the sort could put ties in a
    # different order than the codes of the map.  (The frequency column
    # comes from %f and never holds a "+".)
    awk '{ for (i=1; i<=NF; i++)
           {
               freq[$i]++;
               tot++;
           }
         }
         END { for (sym in freq)
                   printf("%f %s\n",freq[sym]/tot,sym);}' <huff.tmp.corpus.best |
        sort -n |
        sed 's/+//g' |
        awk '{printf(" \"%s\" , /* %f */ \n",$2,$1);}' >"$outfile"
    mv huff.tmp.corpus.compressed huff.entries.compressed
fi
# End of script.  A bare exit returns the status of the last command run;
# the trap installed on 0 near the top of the file fires here as well.
exit