627 lines
19 KiB
C
627 lines
19 KiB
C
/* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */
|
|
/* ====================================================================
|
|
* Copyright (c) 1996-2004 Carnegie Mellon University. All rights
|
|
* reserved.
|
|
*
|
|
* Redistribution and use in source and binary forms, with or without
|
|
* modification, are permitted provided that the following conditions
|
|
* are met:
|
|
*
|
|
* 1. Redistributions of source code must retain the above copyright
|
|
* notice, this list of conditions and the following disclaimer.
|
|
*
|
|
* 2. Redistributions in binary form must reproduce the above copyright
|
|
* notice, this list of conditions and the following disclaimer in
|
|
* the documentation and/or other materials provided with the
|
|
* distribution.
|
|
*
|
|
* This work was supported in part by funding from the Defense Advanced
|
|
* Research Projects Agency and the National Science Foundation of the
|
|
* United States of America, and the CMU Sphinx Speech Consortium.
|
|
*
|
|
* THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
|
|
* ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
|
|
* THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
|
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
|
|
* NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
|
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
|
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
|
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
|
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
|
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
*
|
|
* ====================================================================
|
|
*
|
|
*/
|
|
|
|
/*
|
|
* fe.h
|
|
*
|
|
* $Log: fe.h,v $
|
|
* Revision 1.11 2005/02/05 02:15:02 egouvea
|
|
* Removed fe_process(), never used
|
|
*
|
|
* Revision 1.10 2004/12/10 16:48:55 rkm
|
|
* Added continuous density acoustic model handling
|
|
*
|
|
*
|
|
*/
|
|
|
|
#if defined(_WIN32) && !defined(GNUWINCE)
|
|
#define srand48(x) srand(x)
|
|
#define lrand48() rand()
|
|
#endif
|
|
|
|
#ifndef _NEW_FE_H_
|
|
#define _NEW_FE_H_
|
|
|
|
/* Win32/WinCE DLL gunk */
|
|
#include <sphinxbase/sphinxbase_export.h>
|
|
|
|
#include <sphinxbase/cmd_ln.h>
|
|
#include <sphinxbase/fixpoint.h>
|
|
|
|
#ifdef __cplusplus
|
|
extern "C" {
|
|
#endif
|
|
#if 0
|
|
/* Fool Emacs. */
|
|
}
|
|
#endif
|
|
|
|
#ifdef WORDS_BIGENDIAN
|
|
#define NATIVE_ENDIAN "big"
|
|
#else
|
|
#define NATIVE_ENDIAN "little"
|
|
#endif
|
|
|
|
/** Default number of samples per second. */
|
|
#define DEFAULT_SAMPLING_RATE 16000
|
|
/** Default number of frames per second. */
|
|
#define DEFAULT_FRAME_RATE 100
|
|
/** Default spacing between frame starts (equal to
|
|
* DEFAULT_SAMPLING_RATE/DEFAULT_FRAME_RATE) */
|
|
#define DEFAULT_FRAME_SHIFT 160
|
|
/** Default size of each frame (410 samples @ 16000Hz). */
|
|
#define DEFAULT_WINDOW_LENGTH 0.025625
|
|
/** Default number of FFT points. */
|
|
#define DEFAULT_FFT_SIZE 512
|
|
/** Default number of MFCC coefficients in output. */
|
|
#define DEFAULT_NUM_CEPSTRA 13
|
|
/** Default number of filter bands used to generate MFCCs. */
|
|
#define DEFAULT_NUM_FILTERS 40
|
|
|
|
/** Default prespeech length */
|
|
#define DEFAULT_PRE_SPEECH 20
|
|
/** Default postspeech length */
|
|
#define DEFAULT_POST_SPEECH 50
|
|
/** Default postspeech length */
|
|
#define DEFAULT_START_SPEECH 10
|
|
|
|
/** Default lower edge of mel filter bank. */
|
|
#define DEFAULT_LOWER_FILT_FREQ 133.33334
|
|
/** Default upper edge of mel filter bank. */
|
|
#define DEFAULT_UPPER_FILT_FREQ 6855.4976
|
|
/** Default pre-emphasis filter coefficient. */
|
|
#define DEFAULT_PRE_EMPHASIS_ALPHA 0.97
|
|
/** Default type of frequency warping to use for VTLN. */
|
|
#define DEFAULT_WARP_TYPE "inverse_linear"
|
|
/** Default random number seed to use for dithering. */
|
|
#define SEED -1
|
|
|
|
#define waveform_to_cepstral_command_line_macro() \
|
|
{ "-logspec", \
|
|
ARG_BOOLEAN, \
|
|
"no", \
|
|
"Write out logspectral files instead of cepstra" }, \
|
|
\
|
|
{ "-smoothspec", \
|
|
ARG_BOOLEAN, \
|
|
"no", \
|
|
"Write out cepstral-smoothed logspectral files" }, \
|
|
\
|
|
{ "-transform", \
|
|
ARG_STRING, \
|
|
"legacy", \
|
|
"Which type of transform to use to calculate cepstra (legacy, dct, or htk)" }, \
|
|
\
|
|
{ "-alpha", \
|
|
ARG_FLOAT32, \
|
|
ARG_STRINGIFY(DEFAULT_PRE_EMPHASIS_ALPHA), \
|
|
"Preemphasis parameter" }, \
|
|
\
|
|
{ "-samprate", \
|
|
ARG_FLOAT32, \
|
|
ARG_STRINGIFY(DEFAULT_SAMPLING_RATE), \
|
|
"Sampling rate" }, \
|
|
\
|
|
{ "-frate", \
|
|
ARG_INT32, \
|
|
ARG_STRINGIFY(DEFAULT_FRAME_RATE), \
|
|
"Frame rate" }, \
|
|
\
|
|
{ "-wlen", \
|
|
ARG_FLOAT32, \
|
|
ARG_STRINGIFY(DEFAULT_WINDOW_LENGTH), \
|
|
"Hamming window length" }, \
|
|
\
|
|
{ "-nfft", \
|
|
ARG_INT32, \
|
|
ARG_STRINGIFY(DEFAULT_FFT_SIZE), \
|
|
"Size of FFT" }, \
|
|
\
|
|
{ "-nfilt", \
|
|
ARG_INT32, \
|
|
ARG_STRINGIFY(DEFAULT_NUM_FILTERS), \
|
|
"Number of filter banks" }, \
|
|
\
|
|
{ "-lowerf", \
|
|
ARG_FLOAT32, \
|
|
ARG_STRINGIFY(DEFAULT_LOWER_FILT_FREQ), \
|
|
"Lower edge of filters" }, \
|
|
\
|
|
{ "-upperf", \
|
|
ARG_FLOAT32, \
|
|
ARG_STRINGIFY(DEFAULT_UPPER_FILT_FREQ), \
|
|
"Upper edge of filters" }, \
|
|
\
|
|
{ "-unit_area", \
|
|
ARG_BOOLEAN, \
|
|
"yes", \
|
|
"Normalize mel filters to unit area" }, \
|
|
\
|
|
{ "-round_filters", \
|
|
ARG_BOOLEAN, \
|
|
"yes", \
|
|
"Round mel filter frequencies to DFT points" }, \
|
|
\
|
|
{ "-ncep", \
|
|
ARG_INT32, \
|
|
ARG_STRINGIFY(DEFAULT_NUM_CEPSTRA), \
|
|
"Number of cep coefficients" }, \
|
|
\
|
|
{ "-doublebw", \
|
|
ARG_BOOLEAN, \
|
|
"no", \
|
|
"Use double bandwidth filters (same center freq)" }, \
|
|
\
|
|
{ "-lifter", \
|
|
ARG_INT32, \
|
|
"0", \
|
|
"Length of sin-curve for liftering, or 0 for no liftering." }, \
|
|
\
|
|
{ "-vad_prespeech", \
|
|
ARG_INT32, \
|
|
ARG_STRINGIFY(DEFAULT_PRE_SPEECH), \
|
|
"Num of speech frames to keep before silence to speech." }, \
|
|
\
|
|
{ "-vad_startspeech", \
|
|
ARG_INT32, \
|
|
ARG_STRINGIFY(DEFAULT_START_SPEECH), \
|
|
"Num of speech frames to trigger vad from silence to speech." }, \
|
|
\
|
|
{ "-vad_postspeech", \
|
|
ARG_INT32, \
|
|
ARG_STRINGIFY(DEFAULT_POST_SPEECH), \
|
|
"Num of silence frames to keep after from speech to silence." }, \
|
|
\
|
|
{ "-vad_threshold", \
|
|
ARG_FLOAT32, \
|
|
"2.0", \
|
|
"Threshold for decision between noise and silence frames. Log-ratio between signal level and noise level." }, \
|
|
\
|
|
{ "-input_endian", \
|
|
ARG_STRING, \
|
|
NATIVE_ENDIAN, \
|
|
"Endianness of input data, big or little, ignored if NIST or MS Wav" }, \
|
|
\
|
|
{ "-warp_type", \
|
|
ARG_STRING, \
|
|
DEFAULT_WARP_TYPE, \
|
|
"Warping function type (or shape)" }, \
|
|
\
|
|
{ "-warp_params", \
|
|
ARG_STRING, \
|
|
NULL, \
|
|
"Parameters defining the warping function" }, \
|
|
\
|
|
{ "-dither", \
|
|
ARG_BOOLEAN, \
|
|
"no", \
|
|
"Add 1/2-bit noise" }, \
|
|
\
|
|
{ "-seed", \
|
|
ARG_INT32, \
|
|
ARG_STRINGIFY(SEED), \
|
|
"Seed for random number generator; if less than zero, pick our own" }, \
|
|
\
|
|
{ "-remove_dc", \
|
|
ARG_BOOLEAN, \
|
|
"no", \
|
|
"Remove DC offset from each frame" }, \
|
|
\
|
|
{ "-remove_noise", \
|
|
ARG_BOOLEAN, \
|
|
"yes", \
|
|
"Remove noise with spectral subtraction in mel-energies" }, \
|
|
\
|
|
{ "-remove_silence", \
|
|
ARG_BOOLEAN, \
|
|
"yes", \
|
|
"Enables VAD, removes silence frames from processing" }, \
|
|
\
|
|
{ "-verbose", \
|
|
ARG_BOOLEAN, \
|
|
"no", \
|
|
"Show input filenames" } \
|
|
|
|
|
|
#ifdef FIXED_POINT
|
|
/** MFCC computation type. */
|
|
typedef fixed32 mfcc_t;
|
|
|
|
/** Convert a floating-point value to mfcc_t. */
|
|
#define FLOAT2MFCC(x) FLOAT2FIX(x)
|
|
/** Convert a mfcc_t value to floating-point. */
|
|
#define MFCC2FLOAT(x) FIX2FLOAT(x)
|
|
/** Multiply two mfcc_t values. */
|
|
#define MFCCMUL(a,b) FIXMUL(a,b)
|
|
#define MFCCLN(x,in,out) FIXLN_ANY(x,in,out)
|
|
#else /* !FIXED_POINT */
|
|
|
|
/** MFCC computation type. */
|
|
typedef float32 mfcc_t;
|
|
/** Convert a floating-point value to mfcc_t. */
|
|
#define FLOAT2MFCC(x) (x)
|
|
/** Convert a mfcc_t value to floating-point. */
|
|
#define MFCC2FLOAT(x) (x)
|
|
/** Multiply two mfcc_t values. */
|
|
#define MFCCMUL(a,b) ((a)*(b))
|
|
#define MFCCLN(x,in,out) log(x)
|
|
#endif /* !FIXED_POINT */
|
|
|
|
/**
|
|
* Structure for the front-end computation.
|
|
*/
|
|
typedef struct fe_s fe_t;
|
|
|
|
/**
|
|
* Error codes returned by stuff.
|
|
*/
|
|
enum fe_error_e {
|
|
FE_SUCCESS = 0,
|
|
FE_OUTPUT_FILE_SUCCESS = 0,
|
|
FE_CONTROL_FILE_ERROR = -1,
|
|
FE_START_ERROR = -2,
|
|
FE_UNKNOWN_SINGLE_OR_BATCH = -3,
|
|
FE_INPUT_FILE_OPEN_ERROR = -4,
|
|
FE_INPUT_FILE_READ_ERROR = -5,
|
|
FE_MEM_ALLOC_ERROR = -6,
|
|
FE_OUTPUT_FILE_WRITE_ERROR = -7,
|
|
FE_OUTPUT_FILE_OPEN_ERROR = -8,
|
|
FE_ZERO_ENERGY_ERROR = -9,
|
|
FE_INVALID_PARAM_ERROR = -10
|
|
};
|
|
|
|
/**
|
|
* Initialize a front-end object from global command-line.
|
|
*
|
|
* This is equivalent to calling fe_init_auto_r(cmd_ln_get()).
|
|
*
|
|
* @return Newly created front-end object.
|
|
*/
|
|
SPHINXBASE_EXPORT
|
|
fe_t* fe_init_auto(void);
|
|
|
|
/**
|
|
* Get the default set of arguments for fe_init_auto_r().
|
|
*
|
|
* @return Pointer to an argument structure which can be passed to
|
|
* cmd_ln_init() in friends to create argument structures for
|
|
* fe_init_auto_r().
|
|
*/
|
|
SPHINXBASE_EXPORT
|
|
arg_t const *fe_get_args(void);
|
|
|
|
/**
|
|
* Initialize a front-end object from a command-line parse.
|
|
*
|
|
* @param config Command-line object, as returned by cmd_ln_parse_r()
|
|
* or cmd_ln_parse_file(). Ownership of this object is
|
|
* claimed by the fe_t, so you must not attempt to free
|
|
* it manually. Use cmd_ln_retain() if you wish to
|
|
* reuse it.
|
|
* @return Newly created front-end object.
|
|
*/
|
|
SPHINXBASE_EXPORT
|
|
fe_t *fe_init_auto_r(cmd_ln_t *config);
|
|
|
|
/**
|
|
* Retrieve the command-line object used to initialize this front-end.
|
|
*
|
|
* @return command-line object for this front-end. This pointer is
|
|
* retained by the fe_t, so you should not attempt to free it
|
|
* manually.
|
|
*/
|
|
SPHINXBASE_EXPORT
|
|
const cmd_ln_t *fe_get_config(fe_t *fe);
|
|
|
|
/**
|
|
* Start processing of the stream, resets processed frame counter
|
|
*/
|
|
SPHINXBASE_EXPORT
|
|
void fe_start_stream(fe_t *fe);
|
|
|
|
/**
|
|
* Start processing an utterance.
|
|
* @return 0 for success, <0 for error (see enum fe_error_e)
|
|
*/
|
|
SPHINXBASE_EXPORT
|
|
int fe_start_utt(fe_t *fe);
|
|
|
|
/**
|
|
* Get the dimensionality of the output of this front-end object.
|
|
*
|
|
* This is guaranteed to be the number of values in one frame of
|
|
* output from fe_end_utt() and fe_process_frames().
|
|
* It is usually the number of MFCC
|
|
* coefficients, but it might be the number of log-spectrum bins, if
|
|
* the <tt>-logspec</tt> or <tt>-smoothspec</tt> options to
|
|
* fe_init_auto() were true.
|
|
*
|
|
* @return Dimensionality of front-end output.
|
|
*/
|
|
SPHINXBASE_EXPORT
|
|
int fe_get_output_size(fe_t *fe);
|
|
|
|
/**
|
|
* Get the dimensionality of the input to this front-end object.
|
|
*
|
|
* This function retrieves the number of input samples consumed by one
|
|
* frame of processing. To obtain one frame of output, you must have
|
|
* at least <code>*out_frame_size</code> samples. To obtain <i>N</i>
|
|
* frames of output, you must have at least <code>(N-1) *
|
|
* *out_frame_shift + *out_frame_size</code> input samples.
|
|
*
|
|
* @param out_frame_shift Output: Number of samples between each frame start.
|
|
* @param out_frame_size Output: Number of samples in each frame.
|
|
*/
|
|
SPHINXBASE_EXPORT
|
|
void fe_get_input_size(fe_t *fe, int *out_frame_shift,
|
|
int *out_frame_size);
|
|
|
|
/**
|
|
* Get vad state for the last processed frame
|
|
*
|
|
* @return 1 if speech, 0 if silence
|
|
*/
|
|
SPHINXBASE_EXPORT
|
|
uint8 fe_get_vad_state(fe_t *fe);
|
|
|
|
/**
|
|
* Finish processing an utterance.
|
|
*
|
|
* This function also collects any remaining samples and calculates a
|
|
* final cepstral vector. If there are overflow samples remaining, it
|
|
* will pad with zeros to make a complete frame.
|
|
*
|
|
* @param fe Front-end object.
|
|
* @param out_cepvector Buffer to hold a residual cepstral vector, or NULL
|
|
* if you wish to ignore it. Must be large enough
|
|
* @param out_nframes Number of frames of residual cepstra created
|
|
* (either 0 or 1).
|
|
* @return 0 for success, <0 for error (see enum fe_error_e)
|
|
*/
|
|
SPHINXBASE_EXPORT
|
|
int fe_end_utt(fe_t *fe, mfcc_t *out_cepvector, int32 *out_nframes);
|
|
|
|
/**
|
|
* Retain ownership of a front end object.
|
|
*
|
|
* @return pointer to the retained front end.
|
|
*/
|
|
SPHINXBASE_EXPORT
|
|
fe_t *fe_retain(fe_t *fe);
|
|
|
|
/**
|
|
* Free the front end.
|
|
*
|
|
* Releases resources associated with the front-end object.
|
|
*
|
|
* @return new reference count (0 if freed completely)
|
|
*/
|
|
SPHINXBASE_EXPORT
|
|
int fe_free(fe_t *fe);
|
|
|
|
/*
|
|
* Do same as fe_process_frames, but also returns
|
|
* voiced audio. Output audio is valid till next
|
|
* fe_process_frames call.
|
|
*
|
|
* DO NOT MIX fe_process_frames calls
|
|
*
|
|
* @param voiced_spch Output: obtain voiced audio samples here
|
|
*
|
|
* @param voiced_spch_nsamps Output: shows voiced_spch length
|
|
*
|
|
* @param out_frameidx Output: index of the utterance start
|
|
*/
|
|
SPHINXBASE_EXPORT
|
|
int fe_process_frames_ext(fe_t *fe,
|
|
int16 const **inout_spch,
|
|
size_t *inout_nsamps,
|
|
mfcc_t **buf_cep,
|
|
int32 *inout_nframes,
|
|
int16 *voiced_spch,
|
|
int32 *voiced_spch_nsamps,
|
|
int32 *out_frameidx);
|
|
|
|
/**
|
|
* Process a block of samples.
|
|
*
|
|
* This function generates up to <code>*inout_nframes</code> of
|
|
* features, or as many as can be generated from
|
|
* <code>*inout_nsamps</code> samples.
|
|
*
|
|
* On exit, the <code>inout_spch</code>, <code>inout_nsamps</code>,
|
|
* and <code>inout_nframes</code> parameters are updated to point to
|
|
* the remaining sample data, the number of remaining samples, and the
|
|
* number of frames processed, respectively. This allows you to call
|
|
* this repeatedly to process a large block of audio in small (say,
|
|
* 5-frame) chunks:
|
|
*
|
|
* int16 *bigbuf, *p;
|
|
* mfcc_t **cepstra;
|
|
* int32 nsamps;
|
|
* int32 nframes = 5;
|
|
*
|
|
* cepstra = (mfcc_t **)
|
|
* ckd_calloc_2d(nframes, fe_get_output_size(fe), sizeof(**cepstra));
|
|
* p = bigbuf;
|
|
* while (nsamps) {
|
|
* nframes = 5;
|
|
* fe_process_frames(fe, &p, &nsamps, cepstra, &nframes);
|
|
* // Now do something with these frames...
|
|
* if (nframes)
|
|
* do_some_stuff(cepstra, nframes);
|
|
* }
|
|
*
|
|
* @param inout_spch Input: Pointer to pointer to speech samples
|
|
* (signed 16-bit linear PCM).
|
|
* Output: Pointer to remaining samples.
|
|
* @param inout_nsamps Input: Pointer to maximum number of samples to
|
|
* process.
|
|
* Output: Number of samples remaining in input buffer.
|
|
* @param buf_cep Two-dimensional buffer (allocated with
|
|
* ckd_calloc_2d()) which will receive frames of output
|
|
* data. If NULL, no actual processing will be done,
|
|
* and the maximum number of output frames which would
|
|
* be generated is returned in
|
|
* <code>*inout_nframes</code>.
|
|
* @param inout_nframes Input: Pointer to maximum number of frames to
|
|
* generate.
|
|
* Output: Number of frames actually generated.
|
|
* @param out_frameidx Index of the first frame returned in a stream
|
|
*
|
|
* @return 0 for success, <0 for failure (see enum fe_error_e)
|
|
*/
|
|
SPHINXBASE_EXPORT
|
|
int fe_process_frames(fe_t *fe,
|
|
int16 const **inout_spch,
|
|
size_t *inout_nsamps,
|
|
mfcc_t **buf_cep,
|
|
int32 *inout_nframes,
|
|
int32 *out_frameidx);
|
|
|
|
/**
|
|
* Process a block of samples, returning as many frames as possible.
|
|
*
|
|
* This function processes all the samples in a block of data and
|
|
* returns a newly allocated block of feature vectors. This block
|
|
* needs to be freed with fe_free_2d() after use.
|
|
*
|
|
* It is possible for there to be some left-over data which could not
|
|
* fit in a complete frame. This data can be processed with
|
|
* fe_end_utt().
|
|
*
|
|
* This function is deprecated in favor of fe_process_frames().
|
|
*
|
|
* @return 0 for success, <0 for failure (see enum fe_error_e)
|
|
*/
|
|
SPHINXBASE_EXPORT
|
|
int fe_process_utt(fe_t *fe, /**< A front end object */
|
|
int16 const *spch, /**< The speech samples */
|
|
size_t nsamps, /**< number of samples*/
|
|
mfcc_t ***cep_block, /**< Output pointer to cepstra */
|
|
int32 *nframes /**< Number of frames processed */
|
|
);
|
|
|
|
/**
|
|
* Free the output pointer returned by fe_process_utt().
|
|
**/
|
|
SPHINXBASE_EXPORT
|
|
void fe_free_2d(void *arr);
|
|
|
|
/**
|
|
* Convert a block of mfcc_t to float32 (can be done in-place)
|
|
**/
|
|
SPHINXBASE_EXPORT
|
|
int fe_mfcc_to_float(fe_t *fe,
|
|
mfcc_t **input,
|
|
float32 **output,
|
|
int32 nframes);
|
|
|
|
/**
|
|
* Convert a block of float32 to mfcc_t (can be done in-place)
|
|
**/
|
|
SPHINXBASE_EXPORT
|
|
int fe_float_to_mfcc(fe_t *fe,
|
|
float32 **input,
|
|
mfcc_t **output,
|
|
int32 nframes);
|
|
|
|
/**
|
|
* Process one frame of log spectra into MFCC using discrete cosine
|
|
* transform.
|
|
*
|
|
* This uses a variant of the DCT-II where the first frequency bin is
|
|
* scaled by 0.5. Unless somebody misunderstood the DCT-III equations
|
|
* and thought that's what they were implementing here, this is
|
|
* ostensibly done to account for the symmetry properties of the
|
|
* DCT-II versus the DFT - the first coefficient of the input is
|
|
* assumed to be repeated in the negative frequencies, which is not
|
|
* the case for the DFT. (This begs the question, why not just use
|
|
* the DCT-I, since it has the appropriate symmetry properties...)
|
|
* Moreover, this is bogus since the mel-frequency bins on which we
|
|
* are doing the DCT don't extend to the edge of the DFT anyway.
|
|
*
|
|
* This also means that the matrix used in computing this DCT can not
|
|
* be made orthogonal, and thus inverting the transform is difficult.
|
|
* Therefore if you want to do cepstral smoothing or have some other
|
|
* reason to invert your MFCCs, use fe_logspec_dct2() and its inverse
|
|
* fe_logspec_dct3() instead.
|
|
*
|
|
* Also, it normalizes by 1/nfilt rather than 2/nfilt, for some reason.
|
|
**/
|
|
SPHINXBASE_EXPORT
|
|
int fe_logspec_to_mfcc(fe_t *fe, /**< A fe structure */
|
|
const mfcc_t *fr_spec, /**< One frame of spectrum */
|
|
mfcc_t *fr_cep /**< One frame of cepstrum */
|
|
);
|
|
|
|
/**
|
|
* Convert log spectra to MFCC using DCT-II.
|
|
*
|
|
* This uses the "unitary" form of the DCT-II, i.e. with a scaling
|
|
* factor of sqrt(2/N) and a "beta" factor of sqrt(1/2) applied to the
|
|
* cos(0) basis vector (i.e. the one corresponding to the DC
|
|
* coefficient in the output).
|
|
**/
|
|
SPHINXBASE_EXPORT
|
|
int fe_logspec_dct2(fe_t *fe, /**< A fe structure */
|
|
const mfcc_t *fr_spec, /**< One frame of spectrum */
|
|
mfcc_t *fr_cep /**< One frame of cepstrum */
|
|
);
|
|
|
|
/**
|
|
* Convert MFCC to log spectra using DCT-III.
|
|
*
|
|
* This uses the "unitary" form of the DCT-III, i.e. with a scaling
|
|
* factor of sqrt(2/N) and a "beta" factor of sqrt(1/2) applied to the
|
|
* cos(0) basis vector (i.e. the one corresponding to the DC
|
|
* coefficient in the input).
|
|
**/
|
|
SPHINXBASE_EXPORT
|
|
int fe_mfcc_dct3(fe_t *fe, /**< A fe structure */
|
|
const mfcc_t *fr_cep, /**< One frame of cepstrum */
|
|
mfcc_t *fr_spec /**< One frame of spectrum */
|
|
);
|
|
|
|
#ifdef __cplusplus
|
|
}
|
|
#endif
|
|
|
|
|
|
#endif
|