294 lines
9.6 KiB
Bash
294 lines
9.6 KiB
Bash
#!/bin/sh
|
|
###########################################################################
|
|
## ##
|
|
## Language Technologies Institute ##
|
|
## Carnegie Mellon University ##
|
|
## Copyright (c) 2004 ##
|
|
## All Rights Reserved. ##
|
|
## ##
|
|
## Permission is hereby granted, free of charge, to use and distribute ##
|
|
## this software and its documentation without restriction, including ##
|
|
## without limitation the rights to use, copy, modify, merge, publish, ##
|
|
## distribute, sublicense, and/or sell copies of this work, and to ##
|
|
## permit persons to whom this work is furnished to do so, subject to ##
|
|
## the following conditions: ##
|
|
## 1. The code must retain the above copyright notice, this list of ##
|
|
## conditions and the following disclaimer. ##
|
|
## 2. Any modifications must be clearly marked as such. ##
|
|
## 3. Original authors' names are not deleted. ##
|
|
## 4. The authors' names are not used to endorse or promote products ##
|
|
## derived from this software without specific prior written ##
|
|
## permission. ##
|
|
## ##
|
|
## CARNEGIE MELLON UNIVERSITY AND THE CONTRIBUTORS TO THIS WORK ##
|
|
## DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ##
|
|
## ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT ##
|
|
## SHALL CARNEGIE MELLON UNIVERSITY NOR THE CONTRIBUTORS BE LIABLE ##
|
|
## FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES ##
|
|
## WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN ##
|
|
## AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ##
|
|
## ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF ##
|
|
## THIS SOFTWARE. ##
|
|
## ##
|
|
###########################################################################
|
|
## Author: Alan W Black (awb@cs.cmu.edu) ##
|
|
## Date: December 2004 ##
|
|
###########################################################################
|
|
## ##
|
|
## Make a Huffman table data ##
|
|
## done by finding the top singletons, bigrams, trigrams ... and byte ##
|
|
## coding them ##
|
|
## ##
|
|
## But this isn't full huffman coding yet ##
|
|
## ##
|
|
## Hmm, this should probably be written in something other than shell ##
|
|
###########################################################################
|
|
|
|
## There are N stages
|
|
## Find the alphabet (usually 256 things)
|
|
## This is done by finding the top singletons, bigrams, trigrams ...
|
|
## Find the frequencies (currently ignored, as codes are always 1 byte)
|
|
## Compress the data, and output the lookup tables
|
|
|
|
## There is some octal conversion in here that probably is unnecessary
|
|
|
|
# Tidy up the tmp files if we get interrupted
|
|
# Tidy up the per-pass tmp files (huff.tmp.corpus.NNN and their .best
# companions) when the script exits (0) or is interrupted by a signal.
trap 'rm -f huff.tmp.corpus.[0-9]* ; exit' 0 2 3 5 10 13 15

# Newer versions of Linux have a not very useful value for LANG by default.
# LC_ALL is forced too: when the caller has it set it takes precedence over
# LANG, and sort/awk below would then still run in the caller's locale.
LANG=C
LC_ALL=C
export LANG LC_ALL

## 0 is reserved, 1 is reserved too
ALPHABET_SIZE=254
|
|
|
|
# if [ $1 = "xentries" ]
|
|
# then
|
|
# infile=$2
|
|
# outfile=$3
|
|
# # Normalize the string to be compressed
|
|
# cat $infile | awk '{print $1}' |
|
|
# tr "A-Z" "a-z" |
|
|
# awk '{for (i=1; i<=length($1); i++)
|
|
# printf("%s ",substr($1,i,1));
|
|
# printf("\n");}' >huff.tmp.corpus
|
|
# fi
|
|
|
|
# "entries" mode: $2 is the input file, $3 the output table file.
# The first whitespace-separated field of every input line is written to
# huff.entries.corpus; festival is then run on make_lex.scm with a
# utf8entries call naming huff.entries.corpus and huff.tmp.corpus
# (presumably it writes the tokenised corpus to the latter -- confirm in
# make_lex.scm).
# "$1" is quoted so that a missing mode argument is a clean mismatch rather
# than a syntax error from "[".
if [ "$1" = "entries" ]
then
    # actually works for non-utf8 entries too
    infile=$2
    outfile=$3
    # Normalize the string to be compressed
    awk '{print $1}' <"$infile" >huff.entries.corpus
    "$ESTDIR/../festival/bin/festival" -b "$FLITEDIR/tools/make_lex.scm" \
        '(utf8entries "huff.entries.corpus" "huff.tmp.corpus")'
fi
|
|
|
|
# "phones" mode: $2 is the input file, $3 the output table file.
# The second whitespace-separated field of every input line is taken and
# each backslash in it is replaced by a space, giving one token per
# backslash-introduced item in huff.tmp.corpus.
# "$1" and "$infile" are quoted so an absent argument or a path containing
# blanks does not break the test or the read.
if [ "$1" = "phones" ]
then
    infile=$2
    outfile=$3
    # Normalize the text to be compressed
    awk '{print $2}' <"$infile" |
        sed 's/\\/ /g' >huff.tmp.corpus
fi
|
|
|
|
# "residual" mode: $2 is the input file, $3 the output table file.
# This really doesn't work: just some tests to see what's worthwhile
#
# A line whose 5th field matches _res_ arms the filter (n=1); the next line
# is then delta coded: its first field is printed as is, and every later
# field is replaced by (previous field - this field), with 256 added when
# the difference is negative.  Commas and the first "};" on each output
# line are stripped afterwards.
#
# The first value is taken from $1.  The original read $i before the loop
# had set i, which yields the whole line on the first record and an empty
# field on later ones.
if [ "$1" = "residual" ]
then
    infile=$2
    outfile=$3
    # Normalize the text to be compressed
    awk '{
        if ($5 ~ /.*_res_/)
            n = 1;
        else if (n == 1)
        {
            printf("%s ", $1);
            b = $1;
            for (i = 2; i <= NF; i++)
            {
                q = b - $i;
                if (q < 0)
                    q += 256;
                printf("%s ", q);
                b = $i
            }
            printf("\n");
            n = 0
        }
    }' <"$infile" |
        sed 's/,//g;s/};//' >huff.tmp.corpus
fi
|
|
|
|
# "other" mode: $2 is used verbatim as the corpus (it must already be
# whitespace-separated tokens), $3 is recorded as the output file name.
# "$1" and "$infile" are quoted so an absent argument or a path containing
# blanks does not break the test or the copy.
if [ "$1" = "other" ]
then
    infile=$2
    outfile=$3
    cat "$infile" >huff.tmp.corpus
fi
|
|
|
|
# Report how many tokens the normalised corpus holds.
CORPUS_SIZE=$(cat huff.tmp.corpus | wc -w)
echo "Original Corpus size = $CORPUS_SIZE"

# Find the base alphabet: one distinct token per line, sorted.
cat huff.tmp.corpus |
    awk '{ for (f = 1; f <= NF; f++) print $f }' |
    sort -u >huff.tmp.base.alphabet

BASE_ALPHABET_SIZE=$(cat huff.tmp.base.alphabet | wc -l)
echo "Base alphabet = $BASE_ALPHABET_SIZE"
|
|
|
|
# Find the additional best n grams
|
|
# huff_best_bigram: read a corpus (whitespace-separated tokens, one entry
# per line) on stdin and print "first second count count" for the most
# frequent pair of adjacent tokens on the same line.  Prints "  0 0" (two
# empty names) when no line holds two tokens.  Pairs are keyed with SUBSEP
# rather than "_" so tokens that themselves contain "_" cannot collide.
huff_best_bigram()
{
    awk '{
        for (i = 1; i < NF; i++)
            freq[$i, $(i+1)]++;
    }
    END {
        best = "";
        bestscore = 0;
        for (pair in freq)
            if (freq[pair] > bestscore)
            {
                bestscore = freq[pair];
                best = pair;
            }
        split(best, part, SUBSEP);
        printf("%s %s %d %d\n", part[1], part[2], bestscore, freq[best])
    }'
}

# huff_join_bigram BESTFILE: copy the corpus on stdin to stdout, replacing
# every adjacent occurrence of the pair named in the first line of BESTFILE
# (as written by huff_best_bigram) with the single token "first+second".
# The pair is read as data inside awk, not spliced into the program text,
# and is compared as strings so numeric-looking tokens are not conflated.
huff_join_bigram()
{
    awk -v bestfile="$1" 'BEGIN {
        p1 = "";
        p2 = "";
        if ((getline line < bestfile) > 0 && split(line, part, " ") == 4)
        {
            p1 = part[1] "";
            p2 = part[2] "";
        }
    }
    {
        for (i = 1; i <= NF; i++)
        {
            if ((i < NF) && (($i "") == p1) && (($(i+1) "") == p2))
            {
                printf("%s+%s ", $i, $(i+1));
                i++;
            }
            else
                printf("%s ", $i);
        }
        printf("\n");
    }'
}

# huff_alphabet_size: print the number of distinct tokens in the corpus on
# stdin as a bare integer (no padding, unlike some wc implementations).
huff_alphabet_size()
{
    awk '{ for (i = 1; i <= NF; i++) seen[$i] = 1 }
         END { n = 0; for (s in seen) n++; print n }'
}

# Find the additional best n grams.
# PASS names the current corpus file, NPASS the one produced from it.
# NPASS starts equal to PASS so that huff.tmp.corpus.$NPASS exists even when
# the loop below does not run a single pass.
PASS=000
NPASS=$PASS
cp -p huff.tmp.corpus "huff.tmp.corpus.$PASS"

# Arithmetic expansion strips any blank padding left by wc.
PASS_ALPHABET_SIZE=$((${BASE_ALPHABET_SIZE:-0} + 0))

if [ "$PASS_ALPHABET_SIZE" -gt "$ALPHABET_SIZE" ]
then
    echo "Warning: base alphabet ($PASS_ALPHABET_SIZE) already exceeds $ALPHABET_SIZE" >&2
fi

# Loop until we get to the desired alphabet size.  "-lt" rather than "!="
# so an alphabet that starts at or above the target cannot spin forever.
while [ "$PASS_ALPHABET_SIZE" -lt "$ALPHABET_SIZE" ]
do
    huff_best_bigram <"huff.tmp.corpus.$PASS" >"huff.tmp.corpus.$PASS.best"

    # A best line with fewer than four fields means there was no pair left
    # to merge; the alphabet cannot grow any further.
    if ! awk 'NF == 4 { ok = 1 } END { exit !ok }' "huff.tmp.corpus.$PASS.best"
    then
        echo "No bigram left to join after pass $PASS (alphabet size $PASS_ALPHABET_SIZE)" >&2
        break
    fi

    # Substitute the best bigram to give next corpus
    NPASS=$(echo "$PASS" | awk '{printf("%03d",$1+1)}')
    huff_join_bigram "huff.tmp.corpus.$PASS.best" \
        <"huff.tmp.corpus.$PASS" >"huff.tmp.corpus.$NPASS"

    # Summary
    CORPUS_WAS=$(( $(wc -w <"huff.tmp.corpus.$PASS") ))
    CORPUS_IS=$(( $(wc -w <"huff.tmp.corpus.$NPASS") ))
    echo "$PASS joining $(cat "huff.tmp.corpus.$PASS.best")"
    echo " Corpus was $CORPUS_WAS"
    echo " Corpus is $CORPUS_IS"
    echo "$CORPUS_SIZE $CORPUS_IS" |
        awk '{printf(" Compression %0.2f\n",100*($2*1.0)/$1)}'

    PASS=$NPASS
    PASS_ALPHABET_SIZE=$(huff_alphabet_size <"huff.tmp.corpus.$NPASS")
done
|
|
|
|
# Now build the map
|
|
# huff_make_maptable: read the final corpus on stdin and print, for every
# distinct token, one line of awk source of the form
#     map["tok"] = N; unmap[N] = "tok";
# N is the token's rank (starting at 1) when tokens are ordered by
# ascending relative frequency, as produced by "sort -n" on the frequency
# column.
huff_make_maptable()
{
    awk '{
        for (i = 1; i <= NF; i++)
        {
            freq[$i]++;
            tot++;
        }
    }
    END {
        for (sym in freq)
            printf("%f %s\n", freq[sym] / tot, sym);
    }' |
        sort -n |
        awk '{printf("map[\"%s\"] = %d; unmap[%d] = \"%s\";\n",$2,NR,NR,$2)}'
}

# Keep the corpus from the last pass and drop the other per-pass files.
# NPASS is only assigned inside the merge loop; when that loop ran zero
# passes, fall back to $PASS, whose corpus file was created before the loop.
mv "huff.tmp.corpus.${NPASS:-$PASS}" huff.tmp.corpus.best
rm -f huff.tmp.corpus.[0-9]*

huff_make_maptable <huff.tmp.corpus.best >huff.tmp.corpus.maptable

# The table is spliced as source text into the BEGIN block of later awk
# programs.
maptable=$(cat huff.tmp.corpus.maptable)
|
|
|
|
# compress the corpus
|
|
# huff_compress: rewrite the corpus on stdin with every token replaced by a
# backslash followed by its code from $maptable as three octal digits
# (e.g. code 10 becomes \012); line structure is preserved.
# awk's own %o conversion replaces the hand-written octal arithmetic.
# A token missing from the table is written as \000, as before.
huff_compress()
{
    awk 'BEGIN {'"$maptable"'}
    {
        for (i = 1; i <= NF; i++)
            printf("\\%03o", map[$i]);
        printf("\n");
    }'
}

# compress the corpus
huff_compress <huff.tmp.corpus.best >huff.tmp.corpus.compressed
|
|
|
|
# uncompress the corpus
|
|
# uncompress the corpus
# Each backslash becomes a field separator, every three-digit octal field
# is turned back into its numeric code, and the code is looked up in the
# unmap[] table supplied by $maptable.
cat huff.tmp.corpus.compressed |
    sed 's/\\/ /g' |
    awk 'BEGIN {'"$maptable"'}
    {
        for (f = 1; f <= NF; f++)
        {
            code = (substr($f, 1, 1) * 64) + (substr($f, 2, 1) * 8) + (substr($f, 3, 1) + 0);
            printf("%s ", unmap[code]);
        }
        printf("\n");
    }' >huff.tmp.corpus.uncompressed
|
|
|
|
# "phones" mode output: every "+" that joins merged tokens in the final
# corpus is turned into a backslash, the relative frequency of each
# resulting token is computed, and the tokens are written to $outfile in
# ascending frequency order as backslash-prefixed quoted strings, each
# followed by its frequency in a comment.  The compressed corpus is kept as
# huff.phones.compressed.
# "$1" and "$outfile" are quoted so an absent mode or an output path
# containing blanks does not break the test or the redirection.
if [ "$1" = "phones" ]
then
    sed 's/+/\\/g' <huff.tmp.corpus.best |
        awk '{
            for (i = 1; i <= NF; i++)
            {
                freq[$i]++;
                tot++;
            }
        }
        END {
            for (sym in freq)
                printf("%f %s\n", freq[sym] / tot, sym);
        }' |
        sort -n |
        awk '{printf(" \"\\%s\" , /* %f */ \n",$2,$1);}' >"$outfile"
    mv huff.tmp.corpus.compressed huff.phones.compressed
fi
|
|
|
|
# "entries" mode output: the "+" joiners are removed from the final corpus
# so merged tokens read as plain strings, the relative frequency of each
# resulting token is computed, and the tokens are written to $outfile in
# ascending frequency order as quoted strings, each followed by its
# frequency in a comment.  The compressed corpus is kept as
# huff.entries.compressed.
# "$1" and "$outfile" are quoted so an absent mode or an output path
# containing blanks does not break the test or the redirection.
if [ "$1" = "entries" ]
then
    sed 's/+//g' <huff.tmp.corpus.best |
        awk '{
            for (i = 1; i <= NF; i++)
            {
                freq[$i]++;
                tot++;
            }
        }
        END {
            for (sym in freq)
                printf("%f %s\n", freq[sym] / tot, sym);
        }' |
        sort -n |
        awk '{printf(" \"%s\" , /* %f */ \n",$2,$1);}' >"$outfile"
    mv huff.tmp.corpus.compressed huff.entries.compressed
fi

exit
|
|
|
|
|
|
|