/* --------------------------------------------------------------- */
/* The HMM-Based Speech Synthesis System (HTS): version 1.1b */
/* HTS Working Group */
/* */
/* Department of Computer Science */
/* Nagoya Institute of Technology */
/* and */
/* Interdisciplinary Graduate School of Science and Engineering */
/* Tokyo Institute of Technology */
/* Copyright (c) 2001-2003 */
/* All Rights Reserved. */
/* */
/* Permission is hereby granted, free of charge, to use and */
/* distribute this software and its documentation without */
/* restriction, including without limitation the rights to use, */
/* copy, modify, merge, publish, distribute, sublicense, and/or */
/* sell copies of this work, and to permit persons to whom this */
/* work is furnished to do so, subject to the following conditions: */
/* */
/* 1. The code must retain the above copyright notice, this list */
/* of conditions and the following disclaimer. */
/* */
/* 2. Any modifications must be clearly marked as such. */
/* */
/* NAGOYA INSTITUTE OF TECHNOLOGY, TOKYO INSITITUTE OF TECHNOLOGY, */
/* HTS WORKING GROUP, AND THE CONTRIBUTORS TO THIS WORK DISCLAIM */
/* ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL */
/* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT */
/* SHALL NAGOYA INSTITUTE OF TECHNOLOGY, TOKYO INSITITUTE OF */
/* TECHNOLOGY, HTS WORKING GROUP, NOR THE CONTRIBUTORS BE LIABLE */
/* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY */
/* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, */
/* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTUOUS */
/* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR */
/* PERFORMANCE OF THIS SOFTWARE. */
/* */
/* --------------------------------------------------------------- */
/* This is Zen's MLSA filter as ported by Toda to festvox vc */
/* and back ported into hts/festival so we can do MLSA filtering */
/* If I took more time I could probably make this use the same */
/* as the other code in this directory -- awb@cs.cmu.edu 03JAN06 */
/* --------------------------------------------------------------- */
/* and then ported into Flite (November 2007 awb@cs.cmu.edu) */
/* with some speed optimizations */
/*********************************************************************/
/* */
/* Mel-cepstral vocoder (pulse/noise excitation & MLSA filter) */
/* 2003/12/26 by Heiga Zen */
/* */
/* Extracted from HTS and slightly modified */
/* by Tomoki Toda (tomoki@ics.nitech.ac.jp) */
/* June 2004 */
/* Integrate as a Voice Conversion module */
/* */
/*-------------------------------------------------------------------*/

#include "cst_alloc.h"
#include "cst_string.h"
#include "cst_math.h"
#include "cst_track.h"
#include "cst_wave.h"
#include "cst_audio.h"

#ifdef UNDER_CE
#define SPEED_HACK
/* This is one of those other things you shouldn't do, but it makes
   CG voices in flowm fast enough on my phone */
#define double float
#endif

#include "cst_vc.h"
#include "cst_cg.h"
#include "cst_mlsa.h"

static cst_wave *synthesis_body(const cst_track *params,
                                const cst_track *str,
                                double fs,
                                double framem,
                                cst_cg_db *cg_db,
                                cst_audio_streaming_info *asi);

/*
 * Resynthesize a waveform from a parameter track.
 *
 *   params  one row per frame: channel 0 is read as f0, the remaining
 *           channels as mel-cepstral coefficients
 *   str     per-frame mixing strengths for mixed excitation, or NULL
 *           to use plain pulse/noise excitation
 *   cg_db   supplies mlsa_alpha, mlsa_beta, gain and the mixed
 *           excitation filters
 *   asi     optional streaming callback info (may be NULL)
 *
 * The frame shift handed to synthesis_body() is taken from the first
 * two entries of params->times (scaled by 1000), or 5.0 when there is
 * only one frame.  The sampling rate is fixed at 16000 here.
 */
cst_wave *mlsa_resynthesis(const cst_track *params,
                           const cst_track *str,
                           cst_cg_db *cg_db,
                           cst_audio_streaming_info *asi)
{
    cst_wave *wave = 0;
    int sr = 16000;
    double shift;

    if (params->num_frames > 1)
        shift = 1000.0*(params->times[1]-params->times[0]);
    else
        shift = 5.0;

    wave = synthesis_body(params,str,sr,shift,cg_db,asi);

    return wave;
}

/*
 * Drive the vocoder frame by frame and collect the samples in a new
 * wave, optionally handing finished chunks to the streaming callback.
 * Returns the wave; its num_samples is the count actually written.
 */
static cst_wave *synthesis_body(const cst_track *params, /* f0 + mcep */
                                const cst_track *str,
                                double fs,      /* sampling frequency (Hz) */
                                double framem,  /* frame shift (ms) */
                                cst_cg_db *cg_db,
                                cst_audio_streaming_info *asi)
{
    long t, pos;
    int framel, i;
    double f0;
    VocoderSetup vs;
    cst_wave *wave = 0;
    double *mcep;
    int stream_mark;
    int rc = CST_AUDIO_STREAM_CONT;
    int num_mcep;

    num_mcep = params->num_channels-1;
    framel = (int)(framem * fs / 1000.0);   /* frame shift in samples */
    init_vocoder(fs, framel, num_mcep, &vs, cg_db);

    /* with mixed excitation the noise source is plus_or_minus_one() */
    if (str != NULL)
        vs.gauss = MFALSE;

    /* synthesize waveforms by MLSA filter */
    wave = new_wave();
    cst_wave_resize(wave,params->num_frames * (framel + 2),1);
#ifdef SPEED_HACK
    /* This is a SPECTACULAR hack -- basically resample the original  */
    /* files to 8KHz but label them as 16KHz and do the full build    */
    /* then after synthesis get them to be played at 8KHz -- it works */
    wave->sample_rate = 8000;
#else
    wave->sample_rate = fs;
#endif

    mcep = cst_alloc(double,num_mcep+1);

    for (t = 0, stream_mark = pos = 0;
         (rc == CST_AUDIO_STREAM_CONT) && (t < params->num_frames);
         t++)
    {
        f0 = (double)params->frames[t][0];
        /* channels 1..num_mcep become mcep[0..num_mcep-1]; the extra
           last slot mcep[num_mcep] is set to zero */
        for (i=1; i<num_mcep+1; i++)
            mcep[i-1] = params->frames[t][i];
        mcep[i-1] = 0;

        vocoder(f0, mcep, str, t, num_mcep, cg_db, &vs, wave, &pos);

        /* hand over what has been generated once enough has piled up */
        if (asi && (pos-stream_mark > asi->min_buffsize))
        {
            rc=(*asi->asc)(wave,stream_mark,pos-stream_mark,0,asi->userdata);
            stream_mark = pos;
        }
    }
    wave->num_samples = pos;

    if (asi && (rc == CST_AUDIO_STREAM_CONT))
    {   /* drain the last part of the waveform */
        (*asi->asc)(wave,stream_mark,pos-stream_mark,1,asi->userdata);
    }

    /* memory free */
    cst_free(mcep);
    free_vocoder(&vs);

    return wave;
}

/*
 * Fill in a VocoderSetup for frames of `framel` samples and cepstral
 * order `m`, and allocate its working buffers (released again by
 * free_vocoder()).
 */
static void init_vocoder(double fs, int framel, int m,
                         VocoderSetup *vs, cst_cg_db *cg_db)
{
    /* initialize global parameter */
    vs->fprd = framel;
    vs->iprd = 1;
    vs->seed = 1;
#ifdef SPEED_HACK
    /* This makes it about 25% faster and sounds basically the same */
    vs->pd = 4;
#else
    vs->pd = 5;
#endif

    vs->next =1;
    vs->gauss = MTRUE;

    /* Pade' approximants: the coefficients for order pd start at
       index pd*(pd+1)/2 (see mlsadf()) */
    vs->pade[ 0]=1.0;
    vs->pade[ 1]=1.0; vs->pade[ 2]=0.0;
    vs->pade[ 3]=1.0; vs->pade[ 4]=0.0; vs->pade[ 5]=0.0;
    vs->pade[ 6]=1.0; vs->pade[ 7]=0.0; vs->pade[ 8]=0.0; vs->pade[ 9]=0.0;
    vs->pade[10]=1.0; vs->pade[11]=0.4999273; vs->pade[12]=0.1067005;
    vs->pade[13]=0.01170221; vs->pade[14]=0.0005656279;
    vs->pade[15]=1.0; vs->pade[16]=0.4999391; vs->pade[17]=0.1107098;
    vs->pade[18]=0.01369984; vs->pade[19]=0.0009564853;
    vs->pade[20]=0.00003041721;

    vs->rate = fs;
    /* one buffer holding c, cc and cinc (m+1 each) followed by the
       filter delay lines used by mlsadf1()/mlsadf2() */
    vs->c = cst_alloc(double,3 * (m + 1) + 3 * (vs->pd + 1) + vs->pd * (m + 2));

    vs->p1 = -1;        /* negative marks "no frame seen yet" in vocoder() */
    vs->sw = 0;
    vs->x = 0x55555555;

    /* for postfiltering */
    vs->mc = NULL;
    vs->o  = 0;
    vs->d  = NULL;
    vs->irleng= 64;

    /* for MIXED EXCITATION */
    vs->ME_order = cg_db->ME_order;
    vs->ME_num = cg_db->ME_num;
    vs->hpulse = cst_alloc(double,vs->ME_order);
    vs->hnoise = cst_alloc(double,vs->ME_order);
    vs->xpulsesig = cst_alloc(double,vs->ME_order);
    vs->xnoisesig = cst_alloc(double,vs->ME_order);
    vs->h = cg_db->me_h;    /* borrowed from cg_db, not owned by vs */

    return;
}

static double plus_or_minus_one(void)
{
    /* Randomly return 1 or -1 */
    /* not sure rand() is portable */
    if (rand() > RAND_MAX/2.0)
        return 1.0;
    else
        return -1.0;
}

/* Saturate a filter output to the 16-bit sample range.  Converting an
   out-of-range floating value straight to short is undefined. */
static short mlsa_clip_sample(double x)
{
    if (x > 32767.0)
        return 32767;
    if (x < -32768.0)
        return -32768;
    return (short)x;
}

/*
 * Synthesize one frame.
 *
 *   p     f0 of the frame (0.0 selects the noise-only branch)
 *   mc    mel-cepstrum, m+1 values
 *   str   mixed excitation strengths or NULL; row t is used
 *   wav   output wave; vs->fprd samples are written starting at *pos,
 *         and *pos is advanced accordingly
 *
 * The very first call (vs->p1 < 0) only primes the filter state from
 * this frame and returns without producing any samples.
 */
static void vocoder(double p, double *mc,
                    const cst_track *str, int t,
                    int m, cst_cg_db *cg_db,
                    VocoderSetup *vs, cst_wave *wav, long *pos)
{
    double inc, x, e1, e2;
    int i, j, k;
    double xpulse = 0.0, xnoise = 0.0;
    double fxpulse, fxnoise;
    float gain=1.0;

    if (cg_db->gain != 0.0)
        gain = cg_db->gain;

    if (str != NULL)  /* MIXED-EXCITATION */
    {
        /* Copy in str's and build hpulse and hnoise for this frame */
        for (i=0; i<vs->ME_order; i++)
        {
            vs->hpulse[i] = vs->hnoise[i] = 0.0;
            for (j=0; j<vs->ME_num; j++)
            {
                vs->hpulse[i] += str->frames[t][j] * vs->h[j][i];
                vs->hnoise[i] += (1 - str->frames[t][j]) * vs->h[j][i];
            }
        }
    }

    if (p != 0.0)
        p = vs->rate / p;  /* f0 -> pitch */

    if (vs->p1 < 0) {
        if (vs->gauss && (vs->seed != 1))
            vs->next = srnd((unsigned)vs->seed);

        vs->p1   = p;
        vs->pc   = vs->p1;
        /* carve the sub-arrays out of the single vs->c allocation */
        vs->cc   = vs->c + m + 1;
        vs->cinc = vs->cc + m + 1;
        vs->d1   = vs->cinc + m + 1;

        mc2b(mc, vs->c, m, cg_db->mlsa_alpha);

        if (cg_db->mlsa_beta > 0.0 && m > 1) {
            /* postfilter: scale c[2..m], then restore the energy */
            e1 = b2en(vs->c, m, cg_db->mlsa_alpha, vs);
            vs->c[1] -= cg_db->mlsa_beta * cg_db->mlsa_alpha * mc[2];
            for (k=2;k<=m;k++)
                vs->c[k] *= (1.0 + cg_db->mlsa_beta);
            e2 = b2en(vs->c, m, cg_db->mlsa_alpha, vs);
            vs->c[0] += log(e1/e2)/2;
        }

        return;
    }

    mc2b(mc, vs->cc, m, cg_db->mlsa_alpha);
    if (cg_db->mlsa_beta>0.0 && m > 1) {
        e1 = b2en(vs->cc, m, cg_db->mlsa_alpha, vs);
        vs->cc[1] -= cg_db->mlsa_beta * cg_db->mlsa_alpha * mc[2];
        for (k = 2; k <= m; k++)
            vs->cc[k] *= (1.0 + cg_db->mlsa_beta);
        e2 = b2en(vs->cc, m, cg_db->mlsa_alpha, vs);
        vs->cc[0] += log(e1 / e2) / 2.0;
    }

    /* per-step increment so c reaches cc at the end of the frame */
    for (k=0; k<=m; k++)
        vs->cinc[k] = (vs->cc[k] - vs->c[k]) *
            (double)vs->iprd / (double)vs->fprd;

    if (vs->p1!=0.0 && p!=0.0) {
        inc = (p - vs->p1) * (double)vs->iprd / (double)vs->fprd;
    } else {
        inc = 0.0;
        vs->pc = p;
        vs->p1 = 0.0;
    }

    for (j = vs->fprd, i = (vs->iprd + 1) / 2; j--;) {
        if (vs->p1 == 0.0) {
            if (vs->gauss)
                x = (double) nrandom(vs);
            else
                x = plus_or_minus_one();

            if (str != NULL)  /* MIXED EXCITATION */
            {
                xnoise = x;
                xpulse = 0.0;
            }
        } else {
            if ((vs->pc += 1.0) >= vs->p1) {
                x = sqrt (vs->p1);
                vs->pc = vs->pc - vs->p1;
            } else
                x = 0.0;

            if (str != NULL)  /* MIXED EXCITATION */
            {
                xpulse = x;
                xnoise = plus_or_minus_one();
            }
        }

        /* MIXED EXCITATION */
        /* The real work -- apply shaping filters to pulse and noise */
        if (str != NULL)
        {
            fxpulse = fxnoise = 0.0;
            for (k=vs->ME_order-1; k>0; k--)
            {
                fxpulse += vs->hpulse[k] * vs->xpulsesig[k];
                fxnoise += vs->hnoise[k] * vs->xnoisesig[k];

                vs->xpulsesig[k] = vs->xpulsesig[k-1];
                vs->xnoisesig[k] = vs->xnoisesig[k-1];
            }
            fxpulse += vs->hpulse[0] * xpulse;
            fxnoise += vs->hnoise[0] * xnoise;
            vs->xpulsesig[0] = xpulse;
            vs->xnoisesig[0] = xnoise;

            x = fxpulse + fxnoise; /* excitation is pulse plus noise */
        }

#ifdef SPEED_HACK
        /* 8KHz voices are too quiet */
        x *= exp(vs->c[0])*2.0;
#else
        x *= exp(vs->c[0])*gain;
#endif

        x = mlsadf(x, vs->c, m, cg_db->mlsa_alpha, vs->pd, vs->d1, vs);

        wav->samples[*pos] = mlsa_clip_sample(x);
        *pos += 1;

        if (!--i) {
            vs->p1 += inc;
            for (k = 0; k <= m; k++)
                vs->c[k] += vs->cinc[k];
            i = vs->iprd;
        }
    }

    vs->p1 = p;
    memmove(vs->c,vs->cc,sizeof(double)*(m+1));

    return;
}

/* Run one sample through both MLSA filter stages.  `d` is the delay
   storage: mlsadf1() uses the first 2*(pd+1) entries, mlsadf2() the
   rest. */
static double mlsadf(double x, double *b, int m, double a, int pd, double *d,
                     VocoderSetup *vs)
{
    vs->ppade = &(vs->pade[pd*(pd+1)/2]);

    x = mlsadf1 (x, b, m, a, pd, d, vs);
    x = mlsadf2 (x, b, m, a, pd, &d[2*(pd+1)], vs);

    return(x);
}

/* First stage: uses only b[1]. */
static double mlsadf1(double x, double *b, int m, double a, int pd, double *d,
                      VocoderSetup *vs)
{
    double v, out = 0.0, *pt, aa;
    int i;

    aa = 1 - a*a;
    pt = &d[pd+1];

    for (i=pd; i>=1; i--) {
        d[i] = aa*pt[i-1] + a*d[i];
        pt[i] = d[i] * b[1];
        v = pt[i] * vs->ppade[i];

        x += (1 & i) ? v : -v;   /* odd terms added, even subtracted */
        out += v;
    }

    pt[0] = x;
    out += x;

    return(out);
}

/* Second stage: pd cascaded mlsafir() sections over b[2..m]. */
static double mlsadf2 (double x, double *b, int m, double a, int pd, double *d,
                       VocoderSetup *vs)
{
    double v, out = 0.0, *pt;
    int i;

    pt = &d[pd * (m+2)];

    for (i=pd; i>=1; i--) {
        pt[i] = mlsafir (pt[i-1], b, m, a, &d[(i-1)*(m+2)]);
        v = pt[i] * vs->ppade[i];

        x  += (1&i) ? v : -v;
        out += v;
    }

    pt[0] = x;
    out  += x;

    return(out);
}

/* One filter section; `d` holds m+2 delay values. */
static double mlsafir (double x, double *b, int m, double a, double *d)
{
    double y = 0.0;
    double aa;
    int i;

    aa = 1 - a*a;

    d[0] = x;
    d[1] = aa*d[0] + a*d[1];

    for (i=2; i<= m; i++) {
        d[i] = d[i] + a*(d[i+1]-d[i-1]);
        y += d[i]*b[i];
    }

    for (i=m+1; i>1; i--)
        d[i] = d[i-1];

    return(y);
}

/* Normally distributed random number via the polar method; values are
   produced in pairs, vs->sw says whether the second one is pending. */
static double nrandom (VocoderSetup *vs)
{
    if (vs->sw == 0) {
        vs->sw = 1;
        do {
            vs->r1 = 2.0 * rnd(&vs->next) - 1.0;
            vs->r2 = 2.0 * rnd(&vs->next) - 1.0;
            vs->s  = vs->r1 * vs->r1 + vs->r2 * vs->r2;
        } while (vs->s > 1 || vs->s == 0);

        vs->s = sqrt (-2 * log(vs->s) / vs->s);

        return(vs->r1*vs->s);
    }
    else {
        vs->sw = 0;

        return (vs->r2*vs->s);
    }
}

/* Linear congruential generator; advances *next and returns a 15-bit
   value divided by RANDMAX. */
static double rnd (unsigned long *next)
{
    double r;

    *next = *next * 1103515245L + 12345;
    r = (*next / 65536L) % 32768L;

    return(r/RANDMAX);
}

static unsigned long srnd ( unsigned long seed )
{
    return(seed);
}

/* mc2b : transform mel-cepstrum to MLSA digital filter coefficients */
static void mc2b (double *mc, double *b, int m, double a)
{
    b[m] = mc[m];

    for (m--; m>=0; m--)
        b[m] = mc[m] - a * b[m+1];

    return;
}

/* b2en : energy (sum of squares) of the first vs->irleng samples of
   the impulse response derived from filter coefficients b */
static double b2en (double *b, int m, double a, VocoderSetup *vs)
{
    double en;
    int k;

    /* NOTE(review): vs->o is never updated in this file, so the work
       buffer is reallocated on every call -- confirm whether it was
       meant to cache the allocated order. */
    if (vs->o<m) {
        if (vs->mc != NULL)
            cst_free(vs->mc);

        vs->mc = cst_alloc(double,(m + 1) + 2 * vs->irleng);
        vs->cep = vs->mc + m+1;
        vs->ir  = vs->cep + vs->irleng;
    }

    b2mc(b, vs->mc, m, a);
    freqt(vs->mc, m, vs->cep, vs->irleng-1, -a, vs);
    c2ir(vs->cep, vs->irleng, vs->ir, vs->irleng);
    en = 0.0;

    for (k=0;k<vs->irleng;k++)
        en += vs->ir[k] * vs->ir[k];

    return(en);
}

/* b2mc : transform MLSA digital filter coefficients to mel-cepstrum */
static void b2mc (double *b, double *mc, int m, double a)
{
    double d, o;

    d = mc[m] = b[m];
    for (m--; m>=0; m--) {
        o = b[m] + a * d;
        d = b[m];
        mc[m] = o;
    }

    return;
}

/* freqt : frequency transformation */
static void freqt (double *c1, int m1, double *c2, int m2, double a,
                   VocoderSetup *vs)
{
    int i, j;
    double b;

    /* vs->d and vs->g share one allocation, grown when m2 increases */
    if (vs->d==NULL) {
        vs->size = m2;
        vs->d    = cst_alloc(double,vs->size + vs->size + 2);
        vs->g    = vs->d+vs->size+1;
    }

    if (m2>vs->size) {
        cst_free(vs->d);
        vs->size = m2;
        vs->d    = cst_alloc(double,vs->size + vs->size + 2);
        vs->g    = vs->d+vs->size+1;
    }

    b = 1-a*a;
    for (i=0; i<m2+1; i++)
        vs->g[i] = 0.0;

    for (i=-m1; i<=0; i++) {
        if (0 <= m2)
            vs->g[0] = c1[-i]+a*(vs->d[0]=vs->g[0]);
        if (1 <= m2)
            vs->g[1] = b*vs->d[0]+a*(vs->d[1]=vs->g[1]);
        for (j=2; j<=m2; j++)
            vs->g[j] = vs->d[j-1]+a*((vs->d[j]=vs->g[j])-vs->g[j-1]);
    }

    memmove(c2,vs->g,sizeof(double)*(m2+1));

    return;
}

/* c2ir : The minimum phase impulse response is evaluated from the
   minimum phase cepstrum */
static void c2ir (double *c, int nc, double *h, int leng)
{
    int n, k, upl;
    double d;

    h[0] = exp(c[0]);
    for (n=1; n<leng; n++) {
        d = 0;
        upl = (n>=nc) ? nc-1 : n;
        for (k=1; k<=upl; k++)
            d += k*c[k]*h[n-k];
        h[n] = d/n;
    }

    return;
}

/* Release everything init_vocoder(), b2en() and freqt() allocated.
   vs->h is left alone: it points into cg_db. */
static void free_vocoder(VocoderSetup *vs)
{
    cst_free(vs->c);
    cst_free(vs->mc);
    cst_free(vs->d);

    vs->c = NULL;
    vs->mc = NULL;
    vs->d = NULL;
    /* the following all pointed into the buffers freed above */
    vs->ppade = NULL;
    vs->cc = NULL;
    vs->cinc = NULL;
    vs->d1 = NULL;
    vs->g = NULL;
    vs->cep = NULL;
    vs->ir = NULL;

    cst_free(vs->hpulse);
    cst_free(vs->hnoise);
    cst_free(vs->xpulsesig);
    cst_free(vs->xnoisesig);

    vs->hpulse = NULL;
    vs->hnoise = NULL;
    vs->xpulsesig = NULL;
    vs->xnoisesig = NULL;

    return;
}