rhubarb-lip-sync/rhubarb/lib/sphinxbase-rev13216/include/sphinxbase/feat.h

470 lines
20 KiB
C

/* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */
/* ====================================================================
* Copyright (c) 1999-2004 Carnegie Mellon University. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* This work was supported in part by funding from the Defense Advanced
* Research Projects Agency and the National Science Foundation of the
* United States of America, and the CMU Sphinx Speech Consortium.
*
* THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
* ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
* THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
* NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* ====================================================================
*
*/
/*
* feat.h -- Cepstral features computation.
*/
#ifndef _S3_FEAT_H_
#define _S3_FEAT_H_
#include <stdio.h>
/* Win32/WinCE DLL gunk */
#include <sphinxbase/sphinxbase_export.h>
#include <sphinxbase/prim_type.h>
#include <sphinxbase/fe.h>
#include <sphinxbase/cmn.h>
#include <sphinxbase/agc.h>
#ifdef __cplusplus
extern "C" {
#endif
#if 0
/* Fool Emacs. */
}
#endif
/** \file feat.h
* \brief compute the dynamic coefficients from the cepstral vector.
*/
#define LIVEBUFBLOCKSIZE 256 /** Blocks of 256 vectors allocated
for livemode decoder */
#define S3_MAX_FRAMES 15000 /* RAH, I believe this is still too large, but better than before */
#define cepstral_to_feature_command_line_macro() \
{ "-feat", \
ARG_STRING, \
"1s_c_d_dd", \
"Feature stream type, depends on the acoustic model" }, \
{ "-ceplen", \
ARG_INT32, \
"13", \
"Number of components in the input feature vector" }, \
{ "-cmn", \
ARG_STRING, \
"live", \
"Cepstral mean normalization scheme ('live', 'batch', or 'none')" }, \
{ "-cmninit", \
ARG_STRING, \
"40,3,-1", \
"Initial values (comma-separated) for cepstral mean when 'live' is used" }, \
{ "-varnorm", \
ARG_BOOLEAN, \
"no", \
"Variance normalize each utterance (only if CMN == current)" }, \
{ "-agc", \
ARG_STRING, \
"none", \
"Automatic gain control for c0 ('max', 'emax', 'noise', or 'none')" }, \
{ "-agcthresh", \
ARG_FLOAT32, \
"2.0", \
"Initial threshold for automatic gain control" }, \
{ "-lda", \
ARG_STRING, \
NULL, \
"File containing transformation matrix to be applied to features (single-stream features only)" }, \
{ "-ldadim", \
ARG_INT32, \
"0", \
"Dimensionality of output of feature transformation (0 to use entire matrix)" }, \
{"-svspec", \
ARG_STRING, \
NULL, \
"Subvector specification (e.g., 24,0-11/25,12-23/26-38 or 0-12/13-25/26-38)"}
/**
* \struct feat_t
* \brief Structure for describing a speech feature type
* Structure for describing a speech feature type (no. of streams and stream widths),
* as well as the computation for converting the input speech (e.g., Sphinx-II format
* MFC cepstra) into this type of feature vectors.
*/
typedef struct feat_s {
int refcount; /**< Reference count. */
char *name; /**< Printable name for this feature type */
int32 cepsize; /**< Size of input speech vector (typically, a cepstrum vector) */
int32 n_stream; /**< Number of feature streams; e.g., 4 in Sphinx-II */
uint32 *stream_len; /**< Vector length of each feature stream */
int32 window_size; /**< Number of extra frames around given input frame needed to compute
corresponding output feature (so total = window_size*2 + 1) */
int32 n_sv; /**< Number of subvectors */
uint32 *sv_len; /**< Vector length of each subvector */
int32 **subvecs; /**< Subvector specification (or NULL for none) */
mfcc_t *sv_buf; /**< Temporary copy buffer for subvector projection */
int32 sv_dim; /**< Total dimensionality of subvector (length of sv_buf) */
cmn_type_t cmn; /**< Type of CMN to be performed on each utterance */
int32 varnorm; /**< Whether variance normalization is to be performed on each utt;
Irrelevant if no CMN is performed */
agc_type_t agc; /**< Type of AGC to be performed on each utterance */
/**
* Feature computation function.
* @param fcb the feat_t describing this feature type
* @param input pointer into the input cepstra
* @param feat a 2-d array of output features (n_stream x stream_len)
* @return 0 if successful, -ve otherwise.
*
* Function for converting window of input speech vector
* (input[-window_size..window_size]) to output feature vector
* (feat[stream][]). If NULL, no conversion available, the
* speech input must be feature vector itself.
**/
void (*compute_feat)(struct feat_s *fcb, mfcc_t **input, mfcc_t **feat);
cmn_t *cmn_struct; /**< Structure that stores the temporary variables for cepstral
means normalization*/
agc_t *agc_struct; /**< Structure that stores the temporary variables for acoustic
gain control*/
mfcc_t **cepbuf; /**< Circular buffer of MFCC frames for live feature computation. */
mfcc_t **tmpcepbuf; /**< Array of pointers into cepbuf to handle border cases. */
int32 bufpos; /**< Write index in cepbuf. */
int32 curpos; /**< Read index in cepbuf. */
mfcc_t ***lda; /**< Array of linear transformations (for LDA, MLLT, or whatever) */
uint32 n_lda; /**< Number of linear transformations in lda. */
uint32 out_dim; /**< Output dimensionality */
} feat_t;
/**
* Name of feature type.
*/
#define feat_name(f) ((f)->name)
/**
* Input dimensionality of feature.
*/
#define feat_cepsize(f) ((f)->cepsize)
/**
* Size of dynamic feature window.
*/
#define feat_window_size(f) ((f)->window_size)
/**
* Number of feature streams.
*
* @deprecated Do not use this, use feat_dimension1() instead.
*/
#define feat_n_stream(f) ((f)->n_stream)
/**
* Length of feature stream i.
*
* @deprecated Do not use this, use feat_dimension2() instead.
*/
#define feat_stream_len(f,i) ((f)->stream_len[i])
/**
* Number of streams or subvectors in feature output.
*/
#define feat_dimension1(f) ((f)->n_sv ? (f)->n_sv : f->n_stream)
/**
* Dimensionality of stream/subvector i in feature output.
*/
#define feat_dimension2(f,i) ((f)->lda ? (f)->out_dim : ((f)->sv_len ? (f)->sv_len[i] : f->stream_len[i]))
/**
* Total dimensionality of feature output.
*/
#define feat_dimension(f) ((f)->out_dim)
/**
* Array with stream/subvector lengths
*/
#define feat_stream_lengths(f) ((f)->lda ? (&(f)->out_dim) : (f)->sv_len ? (f)->sv_len : f->stream_len)
/**
* Parse subvector specification string.
*
* Format of specification:
* \li '/' separated list of subvectors
* \li each subvector is a ',' separated list of subranges
* \li each subrange is a single \verbatim <number> \endverbatim or
* \verbatim <number>-<number> \endverbatim (inclusive), where
* \verbatim <number> \endverbatim is a feature vector dimension
* specifier.
*
* E.g., "24,0-11/25,12-23/26,27-38" has:
* \li 3 subvectors
* \li the 1st subvector has feature dims: 24, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, and 11.
* \li etc.
*
* @param str subvector specification string.
* @return allocated 2-D array of subvector specs (free with
* subvecs_free()). If there are N subvectors specified, subvec[N] =
* NULL; and each subvec[0]..subvec[N-1] is -1 terminated vector of
* feature dims.
*/
SPHINXBASE_EXPORT
int32 **parse_subvecs(char const *str);
/**
* Free array of subvector specs.
*/
SPHINXBASE_EXPORT
void subvecs_free(int32 **subvecs);
/**
* Allocate an array to hold several frames worth of feature vectors. The returned value
* is the mfcc_t ***data array, organized as follows:
*
* - data[0][0] = frame 0 stream 0 vector, data[0][1] = frame 0 stream 1 vector, ...
* - data[1][0] = frame 1 stream 0 vector, data[0][1] = frame 1 stream 1 vector, ...
* - data[2][0] = frame 2 stream 0 vector, data[0][1] = frame 2 stream 1 vector, ...
* - ...
*
* NOTE: For I/O convenience, the entire data area is allocated as one contiguous block.
* @return pointer to the allocated space if successful, NULL if any error.
*/
SPHINXBASE_EXPORT
mfcc_t ***feat_array_alloc(feat_t *fcb, /**< In: Descriptor from feat_init(), used
to obtain number of streams and stream sizes */
int32 nfr /**< In: Number of frames for which to allocate */
);
/**
* Realloate the array of features. Requires us to know the old size
*/
SPHINXBASE_EXPORT
mfcc_t ***feat_array_realloc(feat_t *fcb, /**< In: Descriptor from feat_init(), used
to obtain number of streams and stream sizes */
mfcc_t ***old_feat, /**< Feature array. Freed */
int32 ofr, /**< In: Previous number of frames */
int32 nfr /**< In: Number of frames for which to allocate */
);
/**
* Free a buffer allocated with feat_array_alloc()
*/
SPHINXBASE_EXPORT
void feat_array_free(mfcc_t ***feat);
/**
* Initialize feature module to use the selected type of feature stream.
* One-time only initialization at the beginning of the program. Input type
* is a string defining the kind of input->feature conversion desired:
*
* - "s2_4x": s2mfc->Sphinx-II 4-feature stream,
* - "1s_c_d_dd": s2mfc->Sphinx 3.x single feature stream,
* - "s3_1x39": s2mfc->Sphinx 3.0 single feature stream,
* - "n1,n2,n3,...": Explicit feature vector layout spec. with comma-separated
* feature stream lengths. In this case, the input data is already in the
* feature format and there is no conversion necessary.
*
* @return (feat_t *) descriptor if successful, NULL if error. Caller
* must not directly modify the contents of the returned value.
*/
SPHINXBASE_EXPORT
feat_t *feat_init(char const *type,/**< In: Type of feature stream */
cmn_type_t cmn, /**< In: Type of cepstram mean normalization to
be done before feature computation; can be
CMN_NONE (for none) */
int32 varnorm, /**< In: (boolean) Whether variance
normalization done on each utt; only
applicable if CMN also done */
agc_type_t agc, /**< In: Type of automatic gain control to be
done before feature computation */
int32 breport, /**< In: Whether to show a report for feat_t */
int32 cepsize /**< Number of components in the input vector
(or 0 for the default for this feature type,
which is usually 13) */
);
/**
* Add an LDA transformation to the feature module from a file.
* @return 0 for success or -1 if reading the LDA file failed.
**/
SPHINXBASE_EXPORT
int32 feat_read_lda(feat_t *feat, /**< In: Descriptor from feat_init() */
const char *ldafile, /**< In: File to read the LDA matrix from. */
int32 dim /**< In: Dimensionality of LDA output. */
);
/**
* Transform a block of features using the feature module's LDA transform.
**/
SPHINXBASE_EXPORT
void feat_lda_transform(feat_t *fcb, /**< In: Descriptor from feat_init() */
mfcc_t ***inout_feat, /**< Feature block to transform. */
uint32 nfr /**< In: Number of frames in inout_feat. */
);
/**
* Add a subvector specification to the feature module.
*
* The subvector splitting will be performed after dynamic feature
* computation, CMN, AGC, and any LDA transformation. The number of
* streams in the dynamic feature type must be one, as with LDA.
*
* After adding a subvector specification, the output of feature
* computation will be split into multiple subvectors, and
* feat_array_alloc() will allocate pointers accordingly. The number
* of <em>streams</em> will remain the
*
* @param fcb the feature descriptor.
* @param subvecs subvector specification. This pointer is retained
* by the feat_t and should not be freed manually.
* @return 0 for success or -1 if the subvector specification was
* invalid.
*/
SPHINXBASE_EXPORT
int feat_set_subvecs(feat_t *fcb, int32 **subvecs);
/**
* Print the given block of feature vectors to the given FILE.
*/
SPHINXBASE_EXPORT
void feat_print(feat_t *fcb, /**< In: Descriptor from feat_init() */
mfcc_t ***feat, /**< In: Feature data to be printed */
int32 nfr, /**< In: Number of frames of feature data above */
FILE *fp /**< In: Output file pointer */
);
/**
* Read a specified MFC file (or given segment within it), perform
* CMN/AGC as indicated by <code>fcb</code>, and compute feature
* vectors. Feature vectors are computed for the entire segment
* specified, by including additional surrounding or padding frames to
* accommodate the feature windows.
*
* @return Number of frames of feature vectors computed if successful;
* -1 if any error. <code>If</code> feat is NULL, then no actual
* computation will be done, and the number of frames which must be
* allocated will be returned.
*
* A note on how the file path is constructed: If the control file
* already specifies extension or absolute path, then these are not
* applied. The default extension is defined by the application.
*/
SPHINXBASE_EXPORT
int32 feat_s2mfc2feat(feat_t *fcb, /**< In: Descriptor from feat_init() */
const char *file, /**< In: File to be read */
const char *dir, /**< In: Directory prefix for file,
if needed; can be NULL */
const char *cepext,/**< In: Extension of the
cepstrum file.It cannot be
NULL */
int32 sf, int32 ef, /* Start/End frames
within file to be read. Use
0,-1 to process entire
file */
mfcc_t ***feat, /**< Out: Computed feature vectors;
caller must allocate this space */
int32 maxfr /**< In: Available space (number of frames) in
above feat array; it must be
sufficient to hold the result.
Pass -1 for no limit. */
);
/**
* Feature computation routine for live mode decoder.
*
* This function computes features for blocks of incoming data. It
* retains an internal buffer for computing deltas, which means that
* the number of output frames will not necessarily equal the number
* of input frames.
*
* <strong>It is very important</strong> to realize that the number of
* output frames can be <strong>greater than</strong> the number of
* input frames, specifically when <code>endutt</code> is true. It is
* guaranteed to never exceed <code>*inout_ncep +
* feat_window_size(fcb)</code>. You <strong>MUST</strong> have
* allocated at least that many frames in <code>ofeat</code>, or you
* will experience a buffer overflow.
*
* If beginutt and endutt are both true, CMN_CURRENT and AGC_MAX will
* be done. Otherwise only CMN_PRIOR and AGC_EMAX will be done.
*
* If beginutt is false, endutt is true, and the number of input
* frames exceeds the input size, then end-of-utterance processing
* won't actually be done. This condition can easily be checked,
* because <code>*inout_ncep</code> will equal the return value on
* exit, and will also be smaller than the value of
* <code>*inout_ncep</code> on entry.
*
* @return The number of output frames actually computed.
**/
SPHINXBASE_EXPORT
int32 feat_s2mfc2feat_live(feat_t *fcb, /**< In: Descriptor from feat_init() */
mfcc_t **uttcep, /**< In: Incoming cepstral buffer */
int32 *inout_ncep,/**< In: Size of incoming buffer.
Out: Number of incoming frames consumed. */
int32 beginutt, /**< In: Begining of utterance flag */
int32 endutt, /**< In: End of utterance flag */
mfcc_t ***ofeat /**< In: Output feature buffer. See
<strong>VERY IMPORTANT</strong> note
about the size of this buffer above. */
);
/**
* Update the normalization stats, possibly in the end of utterance
*
*/
SPHINXBASE_EXPORT
void feat_update_stats(feat_t *fcb);
/**
* Retain ownership of feat_t.
*
* @return pointer to retained feat_t.
*/
SPHINXBASE_EXPORT
feat_t *feat_retain(feat_t *f);
/**
* Release resource associated with feat_t
*
* @return new reference count (0 if freed)
*/
SPHINXBASE_EXPORT
int feat_free(feat_t *f /**< In: feat_t */
);
/**
* Report the feat_t data structure
*/
SPHINXBASE_EXPORT
void feat_report(feat_t *f /**< In: feat_t */
);
#ifdef __cplusplus
}
#endif
#endif