Performing per-utterance cepstral mean normalization

See discussion in https://sourceforge.net/p/cmusphinx/discussion/help/thread/51e2979b/
This commit is contained in:
Daniel Wolf 2016-09-18 22:02:02 +02:00
parent f4f9ffe883
commit d97c880754
3 changed files with 35 additions and 23 deletions

View File

@ -2,6 +2,7 @@
using std::function;
using std::vector;
using std::unique_ptr;
// Converts a float in the range -1..1 to a signed 16-bit int
inline int16_t floatSampleToInt16(float sample) {
@ -38,3 +39,11 @@ void process16bitAudioClip(const AudioClip& audioClip, function<void(const vecto
process16bitAudioClip(audioClip, processBuffer, capacity, progressSink);
}
// Copies all samples of the given audio clip into a newly allocated buffer of
// signed 16-bit integers. Each float sample is converted with floatSampleToInt16.
// The buffer is created with audioClip.size() elements and is owned by the caller.
unique_ptr<vector<int16_t>> copyTo16bitBuffer(const AudioClip& audioClip) {
	auto result = std::make_unique<vector<int16_t>>(static_cast<size_t>(audioClip.size()));
	// Index with size_t to match the vector's size type; an int could overflow on a very long clip
	size_t index = 0;
	for (float sample : audioClip) {
		(*result)[index++] = floatSampleToInt16(sample);
	}
	// Return the local by name: an explicit std::move here is redundant and inhibits copy elision
	return result;
}

View File

@ -7,3 +7,4 @@
void process16bitAudioClip(const AudioClip& audioClip, std::function<void(const std::vector<int16_t>&)> processBuffer, size_t bufferCapacity, ProgressSink& progressSink);
void process16bitAudioClip(const AudioClip& audioClip, std::function<void(const std::vector<int16_t>&)> processBuffer, ProgressSink& progressSink);
// Returns a newly allocated buffer holding every sample of audioClip, converted to a signed 16-bit integer.
std::unique_ptr<std::vector<int16_t>> copyTo16bitBuffer(const AudioClip& audioClip);

View File

@ -96,7 +96,7 @@ void sphinxLogCallback(void* user_data, err_lvl_t errorLevel, const char* format
logging::log(logLevel, message);
}
BoundedTimeline<string> recognizeWords(const AudioClip& inputAudioClip, ps_decoder_t& decoder, bool& decoderIsStillUsable, ProgressSink& progressSink) {
BoundedTimeline<string> recognizeWords(const AudioClip& inputAudioClip, ps_decoder_t& decoder, bool& decoderIsStillUsable) {
// Convert audio stream to the exact format PocketSphinx requires
const unique_ptr<AudioClip> audioClip = inputAudioClip.clone() | resample(sphinxSampleRate);
@ -107,12 +107,12 @@ BoundedTimeline<string> recognizeWords(const AudioClip& inputAudioClip, ps_decod
int error = ps_start_utt(&decoder);
if (error) throw runtime_error("Error starting utterance processing for word recognition.");
// Process entire sound stream
auto processBuffer = [&decoder](const vector<int16_t>& buffer) {
int searchedFrameCount = ps_process_raw(&decoder, buffer.data(), buffer.size(), false, false);
// Process entire audio clip
auto buffer = copyTo16bitBuffer(*audioClip);
const bool noRecognition = false;
const bool fullUtterance = true;
int searchedFrameCount = ps_process_raw(&decoder, buffer->data(), buffer->size(), noRecognition, fullUtterance);
if (searchedFrameCount < 0) throw runtime_error("Error analyzing raw audio data for word recognition.");
};
process16bitAudioClip(*audioClip, processBuffer, progressSink);
// End recognition
error = ps_end_utt(&decoder);
@ -154,8 +154,7 @@ s3wid_t getWordId(const string& word, dict_t& dictionary) {
optional<Timeline<Phone>> getPhoneAlignment(
const vector<s3wid_t>& wordIds,
const AudioClip& inputAudioClip,
ps_decoder_t& decoder,
ProgressSink& progressSink)
ps_decoder_t& decoder)
{
// Create alignment list
lambda_unique_ptr<ps_alignment_t> alignment(
@ -190,18 +189,17 @@ optional<Timeline<Phone>> getPhoneAlignment(
// Start search
ps_search_start(search.get());
// Process entire sound stream
auto processBuffer = [&](const vector<int16_t>& buffer) {
const int16* nextSample = buffer.data();
size_t remainingSamples = buffer.size();
while (acmod_process_raw(acousticModel, &nextSample, &remainingSamples, false) > 0) {
// Process entire audio clip
auto buffer = copyTo16bitBuffer(*audioClip);
const int16* nextSample = buffer->data();
size_t remainingSamples = buffer->size();
bool fullUtterance = true;
while (acmod_process_raw(acousticModel, &nextSample, &remainingSamples, fullUtterance) > 0) {
while (acousticModel->n_feat_frame > 0) {
ps_search_step(search.get(), acousticModel->output_frame);
acmod_advance(acousticModel);
}
}
};
process16bitAudioClip(*audioClip, processBuffer, progressSink);
// End search
error = ps_search_finish(search.get());
@ -263,6 +261,8 @@ lambda_unique_ptr<ps_decoder_t> createDecoder(optional<u32string> dialog) {
"-dither", "yes",
// Disable VAD -- we're doing that ourselves
"-remove_silence", "no",
// Perform per-utterance cepstral mean normalization
"-cmn", "batch",
nullptr),
[](cmd_ln_t* config) { cmd_ln_free_r(config); });
if (!config) throw runtime_error("Error creating configuration.");
@ -309,7 +309,8 @@ Timeline<Phone> utteranceToPhones(
const unique_ptr<AudioClip> clipSegment = audioClip.clone() | segment(utterance);
// Get words
BoundedTimeline<string> words = recognizeWords(*clipSegment, decoder, decoderIsStillUsable, wordRecognitionProgressSink);
BoundedTimeline<string> words = recognizeWords(*clipSegment, decoder, decoderIsStillUsable);
wordRecognitionProgressSink.reportProgress(1.0);
for (Timed<string> timedWord : words) {
timedWord.getTimeRange().shift(utterance.getStart());
logging::logTimedEvent("word", timedWord);
@ -326,8 +327,9 @@ Timeline<Phone> utteranceToPhones(
#if BOOST_VERSION < 105600 // Support legacy syntax
#define value_or get_value_or
#endif
Timeline<Phone> segmentPhones = getPhoneAlignment(wordIds, *clipSegment, decoder, alignmentProgressSink)
Timeline<Phone> segmentPhones = getPhoneAlignment(wordIds, *clipSegment, decoder)
.value_or(ContinuousTimeline<Phone>(clipSegment->getTruncatedRange(), Phone::Noise));
alignmentProgressSink.reportProgress(1.0);
segmentPhones.shift(utterance.getStart());
for (const auto& timedPhone : segmentPhones) {
logging::logTimedEvent("rawPhone", timedPhone);