#!/bin/sh
###########################################################################
##                                                                       ##
##                  Language Technologies Institute                      ##
##                     Carnegie Mellon University                        ##
##                        Copyright (c) 2004                             ##
##                        All Rights Reserved.                           ##
##                                                                       ##
##  Permission is hereby granted, free of charge, to use and distribute  ##
##  this software and its documentation without restriction, including   ##
##  without limitation the rights to use, copy, modify, merge, publish,  ##
##  distribute, sublicense, and/or sell copies of this work, and to      ##
##  permit persons to whom this work is furnished to do so, subject to   ##
##  the following conditions:                                            ##
##   1. The code must retain the above copyright notice, this list of    ##
##      conditions and the following disclaimer.                         ##
##   2. Any modifications must be clearly marked as such.                ##
##   3. Original authors' names are not deleted.                         ##
##   4. The authors' names are not used to endorse or promote products   ##
##      derived from this software without specific prior written        ##
##      permission.                                                      ##
##                                                                       ##
##  CARNEGIE MELLON UNIVERSITY AND THE CONTRIBUTORS TO THIS WORK         ##
##  DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING      ##
##  ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT   ##
##  SHALL CARNEGIE MELLON UNIVERSITY NOR THE CONTRIBUTORS BE LIABLE      ##
##  FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES    ##
##  WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN   ##
##  AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,          ##
##  ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF       ##
##  THIS SOFTWARE.                                                       ##
##                                                                       ##
###########################################################################
##                                                                       ##
##  Make the files from the Festival Released version                    ##
##                                                                       ##
###########################################################################

# Locate the flite source tree.  A FLITEDIR inherited from the environment
# wins; otherwise default to two levels above the directory the script is
# run from.  The default is built from pwd so that it is an absolute path
# and remains valid after the per-stage "cd" calls further down.
# (POSIX test has no "==" operator, so the emptiness check uses -z.)
if [ -z "$FLITEDIR" ]
then
   FLITEDIR=$(pwd)/../..
fi

# With no arguments, run the complete build: re-invoke this script once per
# stage, in order.  Stages are run unconditionally (a failing stage does not
# stop the following ones), and the script exits with the status of the
# last stage.  "$0" is quoted so a script path containing spaces still works.
if [ "$#" -eq 0 ]
then
   for stage in setup lts lex compresslex install
   do
      "$0" "$stage"
   done
   exit
fi

# Stage "setup": unpack the Festival CMU lexicon distribution and prepare
# the working directories used by the later stages.
# Requires FESTVOXDIR to name a festvox tree containing src/lts/build_lts
# and src/lts/build_lts_rules.
if [ "$1" = "setup" ]
then
   # Fail early with a clear message instead of copying from "/src/lts/...".
   : "${FESTVOXDIR:?FESTVOXDIR must be set to the festvox directory}"
   tar zxvf festlex_CMU.tar.gz
   cp -p "$FESTVOXDIR/src/lts/build_lts" festival/lib/dicts/cmu
   cp -p "$FESTVOXDIR/src/lts/build_lts_rules" festival/lib/dicts/cmu
   # -p keeps the stage re-runnable when the directories already exist.
   mkdir -p festival/lib/dicts/cmu/c
   mkdir -p festival/lib/dicts/cmu/wfst
   mkdir -p festival/lib/dicts/cmu/lts_scratch
fi

# Stage "lts": build the letter-to-sound rules inside the unpacked
# dictionary directory and convert them for flite.
#   1. festival dumps the entries of cmudict-0.4.out to
#      lts_scratch/lex_entries.out.
#   2. build_lts (copied in by the setup stage) is run through its
#      cummulate, align, build, merge and test steps, in that order.
#   3. The flite tool scripts make_lts_wfst.scm and make_lts.scm are applied
#      to lts_scratch/lex_lts_rules.scm, with "wfst/" and "c" passed as
#      their output locations.
# Everything runs in a subshell so the caller's working directory is
# unchanged.
if [ "$1" = "lts" ]
then
  (# Abandon the stage if the directory is missing (setup not run yet)
   # rather than running the tools in the wrong place.
   cd festival/lib/dicts/cmu || exit 1
   festival --heap 10000000 -b allowables.scm '(dump-flat-entries-all "cmudict-0.4.out" "lts_scratch/lex_entries.out")'
   ./build_lts cummulate
   ./build_lts align
   ./build_lts build
   ./build_lts merge
   ./build_lts test
   festival --heap 10000000 -b "$FLITEDIR/tools/make_lts_wfst.scm" lts_scratch/lex_lts_rules.scm '(lts_to_rg_to_wfst lex_lts_rules "wfst/")'
   festival --heap 10000000 -b "$FLITEDIR/tools/make_lts.scm" lts_scratch/lex_lts_rules.scm '(ltsregextoC "cmu" lex_lts_rules "wfst/" "c")'
   )
fi

# Stage "lex": prune the lexicon, compile it, and convert it to C.
# Runs in a subshell so the caller's working directory is unchanged.
if [ "$1" = "lex" ]
then
   ( # Abandon the stage if the directory is missing (setup not run yet).
     cd festival/lib/dicts/cmu || exit 1
     # Find the words to prune from the built lexicon.
     # Words that are not homographs and that the LTS rules already get
     # right are pruned; see remove_predictable_entries in make_lex.scm.
     # make_lex.scm is taken from $FLITEDIR/tools, the same location the
     # lextoC step below uses, not from a fixed path in a home directory.
     festival -b cmulex.scm "$FLITEDIR/tools/make_lex.scm" '(remove_predictable_entries "cmudict-0.4.out" "pruned_lex.scm" "lex_lts_rules.scm")'
     # Compile the pruned lexicon.
     festival --heap 10000000 -b '(lex.compile "pruned_lex.scm" "pruned_lex.out")'
     # Convert the compiled lexicon to C; "c" is passed as the output location.
     festival --heap 10000000 -b "$FLITEDIR/tools/make_lex.scm" '(lextoC "cmu" "pruned_lex.out" "c")'
   )
fi

# Stage "lex2" (experimental, not part of the default build):
# write pruned_lex2.data, one line per "failed" entry of lts_test.log, in a
# Letter_Phone token form intended as input for compression.
if [ "$1" = "lex2" ]
then
   # experiment: put Letter_Phone(s) together as things to compress
   # does give much smaller result, though needs to be less than 256 symbols
   # not fully implemented
   ( # Abandon the stage if the directory is missing, so the output file is
     # never written into the wrong directory.
     cd festival/lib/dicts/cmu || exit 1
     # Use the lts_test.log to find the failed entries and only
     # include them in the list 
     # this needs festival-1.96 or later to get the pos from the lts_test.log
     #
     # Pipeline:
     #   grep/sed  - keep the "failed" lines and put spaces around every
     #               parenthesis so each one becomes its own awk field
     #   1st awk   - joins the fields from field 3 up to the first ")" into
     #               a quoted string, prints the field after that ")" (the
     #               pos, per the note above), then the remaining fields up
     #               to the next ")" with "#" tokens dropped
     #   tr        - strips the parentheses and quotes again
     #   2nd awk   - prints "<pos>_start" ("0_start" when the pos is nil),
     #               then pairs the i-th character of field 1 with field
     #               i+2 as "<char>_<field>"
     grep "^failed " lts_test.log |
     sed 's/(/( /g;s/)/ )/g' |
     awk '{printf("( \"");
           for (i=3; $i != ")"; i++)
               printf("%s",$i);
           i++;
           printf("\" %s ( ",$i);
           i++; i++
           for ( ; $i != ")"; i++)
              if ($i != "#")
                 printf("%s ",$i);
           printf("))\n");
         }' |
     tr -d '()"' |
     awk '{if ($2 == "nil") 
              printf("0_start ");
           else
              printf("%s_start ",$2);
           for (i=1; i<=length($1); i++)
              printf("%s_%s ",substr($1,i,1),$(i+2));
           printf("\n");}' >pruned_lex2.data
#     festival --heap 10000000 -b $FLITEDIR/tools/make_lex.scm '(lextoC "cmu" "pruned_lex.out" "c")' ;
   )
fi

# Stage "compresslex": compress the lexicon entries and phone strings and
# write them out as a C byte list.
# Produces, in festival/lib/dicts/cmu/c:
#   cmu_lex_phones_huff_table.c, cmu_lex_entries_huff_table.c (huff_table)
#   cmu_lex_data_compressed.c      - the byte list
#   cmu_lex_num_bytes_compressed.c - just the final byte count
if [ "$1" = "compresslex" ]
then
   # Compress the entries and phone strings by finding best ngrams 
   ( # Abandon the stage if the directory is missing (earlier stages not run).
     cd festival/lib/dicts/cmu/c || exit 1
     "$FLITEDIR/tools/huff_table" phones cmu_lex_data cmu_lex_phones_huff_table.c
     "$FLITEDIR/tools/huff_table" entries cmu_lex_data cmu_lex_entries_huff_table.c
     # Join the three huff.* files line by line; deleting the spaces leaves
     # exactly three whitespace-separated awk fields per line:
     #   $1 = compressed entry, $2 = compressed phones, $3 = corpus text.
     # $1 and $2 are read as 4-character groups whose last three characters
     # are octal digits (presumably "\ooo" escapes - confirm against
     # huff_table); unenoctal() turns those three digits into a number.
     # For each line the awk program prints the phone bytes from the last
     # group to the first, a 255 marker, a comment holding the running byte
     # count and $3, the entry bytes in order, and a terminating 0.
     # pcount starts at 1 and is advanced once per byte printed; the END
     # rule reports it as num_bytes.
     paste huff.entries.compressed huff.phones.compressed huff.tmp.corpus | 
     tr -d " " |
     awk 'BEGIN {pcount = 1;
                 printf("/* index to compressed data */\n");
                }
          function unenoctal(x)
          {
             y = ((substr(x,1,1)+0)*64) + ((substr(x,2,1)+0)*8) + (substr(x,3,1)+0);
             return y;
          }
          {printf("   ");
           for (i=length($2)-3; i>0; i-=4)
           {
              printf("%d,",unenoctal(substr($2,i+1,3)));
              pcount++;
           }
           pcount++;
           printf(" 255, /* %d %s */ ",pcount,$3);
           for (i=1; i<length($1); i+=4)
           {
              printf("%d,",unenoctal(substr($1,i+1,3)));
              pcount++;
           }
           printf("0,\n");
           pcount++;
          }
        END { printf("/* num_bytes = %d */\n",pcount);}' >cmu_lex_data_compressed.c
     # Pull the count back out of the "/* num_bytes = N */" trailer line.
     grep "num_bytes = " cmu_lex_data_compressed.c |
     awk '{print $4}' >cmu_lex_num_bytes_compressed.c
   )
fi

# Stage "install": copy the generated C sources from the build directory
# into the current directory, preserving timestamps and modes (cp -p).
# Each table row below is "<generated file> <destination>"; a destination
# of "." keeps the original name, anything else renames the file on copy.
if [ "$1" = "install" ]
then
   built=festival/lib/dicts/cmu/c
   while read -r produced target
   do
      cp -p "$built/$produced" "$target"
   done <<'EOF'
cmu_lex_data.c .
cmu_lex_data_compressed.c cmu_lex_data_raw.c
cmu_lex_phones_huff_table.c .
cmu_lex_entries_huff_table.c .
cmu_lex_entries.c .
cmu_lex_num_bytes_compressed.c cmu_lex_num_bytes.c
cmu_lts_model.c .
cmu_lts_model.h .
cmu_lts_rules.c .
EOF
fi