#!/bin/sh
###########################################################################
##                                                                       ##
##                  Language Technologies Institute                      ##
##                     Carnegie Mellon University                        ##
##                        Copyright (c) 2004                             ##
##                        All Rights Reserved.                           ##
##                                                                       ##
##  Permission is hereby granted, free of charge, to use and distribute  ##
##  this software and its documentation without restriction, including   ##
##  without limitation the rights to use, copy, modify, merge, publish,  ##
##  distribute, sublicense, and/or sell copies of this work, and to      ##
##  permit persons to whom this work is furnished to do so, subject to   ##
##  the following conditions:                                            ##
##   1. The code must retain the above copyright notice, this list of    ##
##      conditions and the following disclaimer.                         ##
##   2. Any modifications must be clearly marked as such.                ##
##   3. Original authors' names are not deleted.                         ##
##   4. The authors' names are not used to endorse or promote products   ##
##      derived from this software without specific prior written        ##
##      permission.                                                      ##
##                                                                       ##
##  CARNEGIE MELLON UNIVERSITY AND THE CONTRIBUTORS TO THIS WORK         ##
##  DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING      ##
##  ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT   ##
##  SHALL CARNEGIE MELLON UNIVERSITY NOR THE CONTRIBUTORS BE LIABLE      ##
##  FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES    ##
##  WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN   ##
##  AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,          ##
##  ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF       ##
##  THIS SOFTWARE.                                                       ##
##                                                                       ##
###########################################################################
##  Author: Alan W Black (awb@cs.cmu.edu)                                ##
##  Date:   December 2004                                                ##
###########################################################################
##                                                                       ##
##  Make Huffman-style table data by finding the top singletons,         ##
##  bigrams, trigrams ... and byte coding them                           ##
##                                                                       ##
##  But this isn't full Huffman coding yet                               ##
##                                                                       ##
##  Hmm, this should probably be written in something other than shell   ##
##                                                                       ##
###########################################################################
## There are N stages:
##   Find the alphabet (usually 256 things)
##     This is done by finding the top singletons, bigrams, trigrams ...
##   Find the frequencies (currently ignored, as codes are always 1 byte)
##   Compress the data, and output the lookup tables
## There is some octal conversion in here that probably is unnecessary
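##
## For example, given the toy corpus line (one whitespace-separated
## symbol per "letter"):
##    t h e t h e n
## a pass might pick "t h" as the best bigram (ties depend on awk's
## array ordering) and rewrite the line as
##    t+h e t+h e n
## and a later pass could then join "t+h e" into "t+h+e".  Each joined
## n-gram becomes one symbol of the final 254-entry alphabet, and so is
## coded in a single byte below.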
# Tidy up the tmp files if we get interrupted
trap 'rm -f huff.tmp.corpus.[0-9]* ; exit' 0 2 3 5 10 13 15

# Newer versions of Linux have a not very useful value for LANG by default
LANG=C
export LANG

## 0 is reserved, 1 is reserved too
ALPHABET_SIZE=254

# if [ "$1" = "xentries" ]
# then
#    infile=$2
#    outfile=$3
#    # Normalize the string to be compressed
#    cat $infile |
#    awk '{print $1}' |
#    tr "A-Z" "a-z" |
#    awk '{for (i=1; i<=length($1); i++)
#             printf("%s ",substr($1,i,1));
#          printf("\n");}' >huff.tmp.corpus
# fi
if [ "$1" = "entries" ]
then
   # actually works for non-utf8 entries too
   infile=$2
   outfile=$3
   # Normalize the string to be compressed
   cat $infile |
   awk '{print $1}' >huff.entries.corpus
   $ESTDIR/../festival/bin/festival -b $FLITEDIR/tools/make_lex.scm '(utf8entries "huff.entries.corpus" "huff.tmp.corpus")'
fi
if [ "$1" = "phones" ]
then
   infile=$2
   outfile=$3
   # Normalize the text to be compressed
   cat $infile |
   awk '{print $2}' |
   sed 's/\\/ /g' >huff.tmp.corpus
fi
if [ "$1" = "residual" ]
then
   # This really doesn't work: just some tests to see what's worthwhile
   infile=$2
   outfile=$3
   # Normalize the text to be compressed (delta code the byte values)
   cat $infile |
   awk '{if ($5 ~ /.*_res_/)
            n=1;
         else if (n==1)
         {
            printf("%s ",$1);
            b=$1;
            for (i=2; i<=NF; i++)
            {
               q=b-$i;
               if (q < 0) q+=256;
               printf("%s ",q);
               b=$i
            }
            printf("\n");
            n=0
         }}' |
   sed 's/,//g;s/};//' >huff.tmp.corpus
fi
if [ "$1" = "other" ]
then
   infile=$2
   outfile=$3
   cat $infile >huff.tmp.corpus
fi

CORPUS_SIZE=`cat huff.tmp.corpus | wc -w`
echo "Original Corpus size = $CORPUS_SIZE"

# Find the base alphabet
cat huff.tmp.corpus |
awk '{for (i=1; i<=NF; i++)
         printf("%s\n",$i);}' |
sort -u >huff.tmp.base.alphabet
BASE_ALPHABET_SIZE=`cat huff.tmp.base.alphabet | wc -l`
echo "Base alphabet = $BASE_ALPHABET_SIZE"

# Find the additional best n grams
PASS=000
cp -p huff.tmp.corpus huff.tmp.corpus.$PASS
PASS_ALPHABET_SIZE=$BASE_ALPHABET_SIZE
while [ $PASS_ALPHABET_SIZE != $ALPHABET_SIZE ]
do
   # Loop until we get to the desired alphabet size
   # Find the bigram that saves the most characters: count each adjacent
   # pair, then score it by frequency times joined length
   cat huff.tmp.corpus.$PASS |
   awk '{for (i=1; i<NF; i++)
            count[$i" "$(i+1)]++}
        END {n=1;
             for (c in count)
             {
                split(c,bits," ");
                part1[n]=bits[1];
                part2[n]=bits[2];
                freq[n]=count[c];
                n++;
             }
             best=1; bestscore=0;
             for (i=1; i<n; i++)
             {
                score = freq[i]*(length(part1[i])+length(part2[i]));
                if (score > bestscore)
                {
                   bestscore = score;
                   best = i;
                }
             }
             printf("%s %s %d %d\n",
                    part1[best],part2[best],bestscore,freq[best]) }' >huff.tmp.corpus.$PASS.best
   # Substitute the best bigram to give next corpus
   NPASS=`echo $PASS | awk '{printf("%03d",$1+1)}'`
   cat huff.tmp.corpus.$PASS |
   awk 'BEGIN {p1="'`cat huff.tmp.corpus.$PASS.best | awk '{print $1}'`'";
               p2="'`cat huff.tmp.corpus.$PASS.best | awk '{print $2}'`'";}
        {for (i=1; i<=NF; i++)
         {
            if (($i == p1) && ($(i+1) == p2))
            {
               printf("%s+%s ",$i,$(i+1));
               i++;
            }
            else
               printf("%s ",$i);
         }
         printf("\n");}' >huff.tmp.corpus.$NPASS
   # Summary
   echo $PASS joining `cat huff.tmp.corpus.$PASS.best`
   echo "   Corpus was "`cat huff.tmp.corpus.$PASS | wc -w`
   echo "   Corpus is  "`cat huff.tmp.corpus.$NPASS | wc -w`
   CORPUS_IS=`cat huff.tmp.corpus.$NPASS | wc -w`
   echo $CORPUS_SIZE $CORPUS_IS |
   awk '{printf("   Compression %0.2f\n",100*($2*1.0)/$1)}'
   PASS=$NPASS
   PASS_ALPHABET_SIZE=`cat huff.tmp.corpus.$NPASS |
                       awk '{for (i=1; i<=NF; i++)
                                printf("%s\n",$i);}' |
                       sort -u | wc -l`
done

# Now build the map, giving the most frequent symbol the highest code
mv huff.tmp.corpus.$NPASS huff.tmp.corpus.best
rm -f huff.tmp.corpus.[0-9]*
cat huff.tmp.corpus.best |
awk 'BEGIN {tot=0;}
     {
        for (i=1; i<=NF; i++)
        {
           freq[$i]++;
           tot++;
        }
     }
     END {for (i in freq)
             printf("%f %s\n",freq[i]/tot,i);}' |
sort -n |
awk '{printf("map[\"%s\"] = %d; unmap[%d] = \"%s\";\n",$2,NR,NR,$2)}' >huff.tmp.corpus.maptable
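# The maptable is a fragment of awk code, one pair of assignments per
# symbol in increasing frequency order; e.g., if "t+h+e" were the most
# frequent symbol (a made-up example) its last line would read
#    map["t+h+e"] = 254; unmap[254] = "t+h+e";
# It is spliced into the BEGIN blocks of the coding awk scripts below.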
\"%s\";\n",$2,NR,NR,$2)}' >huff.tmp.corpus.maptable maptable=`cat huff.tmp.corpus.maptable` # compress the corpus cat huff.tmp.corpus.best | awk 'BEGIN {'"$maptable"'} function enoctal(x) { units = x % 8; eights = ((x - units) / 8) % 8; sixtyfours = ((((x - units) / 8) - eights) / 8) % 8; return sprintf("%d%d%d", sixtyfours, eights, units); } { for (i=1; i<=NF; i++) printf("\\%s",enoctal(map[$i])); printf("\n"); }' > huff.tmp.corpus.compressed # uncompress the corpus cat huff.tmp.corpus.compressed | sed 's/\\/ /g' | awk 'BEGIN {'"$maptable"'} function unenoctal(x) { y = ((substr(x,1,1)+0)*64) + ((substr(x,2,1)+0)*8) + (substr(x,3,1)+0); return y; } { for (i=1; i<=NF; i++) printf("%s ",unmap[unenoctal($i)]); printf("\n"); }' > huff.tmp.corpus.uncompressed if [ $1 = "phones" ] then cat huff.tmp.corpus.best | sed 's/+/\\/g' | awk '{ for (i=1; i<=NF; i++) { freq[$i]++ tot++; } } END { for (i in freq) printf("%f %s\n",freq[i]/tot,i);}' | sort -n | awk '{printf(" \"\\%s\" , /* %f */ \n",$2,$1);}' >$outfile mv huff.tmp.corpus.compressed huff.phones.compressed fi if [ $1 = "entries" ] then cat huff.tmp.corpus.best | sed 's/+//g' | awk '{ for (i=1; i<=NF; i++) { freq[$i]++ tot++; } } END { for (i in freq) printf("%f %s\n",freq[i]/tot,i);}' | sort -n | awk '{printf(" \"%s\" , /* %f */ \n",$2,$1);}' >$outfile mv huff.tmp.corpus.compressed huff.entries.compressed fi exit