From d97c880754c543f51303f6a32f5d36bad2d3fc5a Mon Sep 17 00:00:00 2001 From: Daniel Wolf Date: Sun, 18 Sep 2016 22:02:02 +0200 Subject: [PATCH] Performing per-utterance cepstral mean normalization See discussion in https://sourceforge.net/p/cmusphinx/discussion/help/thread/51e2979b/ --- src/audio/processing.cpp | 9 ++++++++ src/audio/processing.h | 3 ++- src/phoneExtraction.cpp | 46 +++++++++++++++++++++------------------- 3 files changed, 35 insertions(+), 23 deletions(-) diff --git a/src/audio/processing.cpp b/src/audio/processing.cpp index 855d5a4..d70aaf6 100644 --- a/src/audio/processing.cpp +++ b/src/audio/processing.cpp @@ -2,6 +2,7 @@ using std::function; using std::vector; +using std::unique_ptr; // Converts a float in the range -1..1 to a signed 16-bit int inline int16_t floatSampleToInt16(float sample) { @@ -38,3 +39,11 @@ void process16bitAudioClip(const AudioClip& audioClip, function> copyTo16bitBuffer(const AudioClip& audioClip) { + auto result = std::make_unique>(static_cast(audioClip.size())); + int index = 0; + for (float sample : audioClip) { + (*result)[index++] = floatSampleToInt16(sample); + } + return std::move(result); +} diff --git a/src/audio/processing.h b/src/audio/processing.h index 5f0c4a6..2ee1963 100644 --- a/src/audio/processing.h +++ b/src/audio/processing.h @@ -6,4 +6,5 @@ #include "ProgressBar.h" void process16bitAudioClip(const AudioClip& audioClip, std::function&)> processBuffer, size_t bufferCapacity, ProgressSink& progressSink); -void process16bitAudioClip(const AudioClip& audioClip, std::function&)> processBuffer, ProgressSink& progressSink); \ No newline at end of file +void process16bitAudioClip(const AudioClip& audioClip, std::function&)> processBuffer, ProgressSink& progressSink); +std::unique_ptr> copyTo16bitBuffer(const AudioClip& audioClip); \ No newline at end of file diff --git a/src/phoneExtraction.cpp b/src/phoneExtraction.cpp index ab358a5..9727739 100644 --- a/src/phoneExtraction.cpp +++ b/src/phoneExtraction.cpp @@ -96,7 +96,7 @@ void sphinxLogCallback(void* user_data, err_lvl_t errorLevel, const char* format logging::log(logLevel, message); } -BoundedTimeline recognizeWords(const AudioClip& inputAudioClip, ps_decoder_t& decoder, bool& decoderIsStillUsable, ProgressSink& progressSink) { +BoundedTimeline recognizeWords(const AudioClip& inputAudioClip, ps_decoder_t& decoder, bool& decoderIsStillUsable) { // Convert audio stream to the exact format PocketSphinx requires const unique_ptr audioClip = inputAudioClip.clone() | resample(sphinxSampleRate); @@ -107,12 +107,12 @@ BoundedTimeline recognizeWords(const AudioClip& inputAudioClip, ps_decod int error = ps_start_utt(&decoder); if (error) throw runtime_error("Error starting utterance processing for word recognition."); - // Process entire sound stream - auto processBuffer = [&decoder](const vector& buffer) { - int searchedFrameCount = ps_process_raw(&decoder, buffer.data(), buffer.size(), false, false); - if (searchedFrameCount < 0) throw runtime_error("Error analyzing raw audio data for word recognition."); - }; - process16bitAudioClip(*audioClip, processBuffer, progressSink); + // Process entire audio clip + auto buffer = copyTo16bitBuffer(*audioClip); + const bool noRecognition = false; + const bool fullUtterance = true; + int searchedFrameCount = ps_process_raw(&decoder, buffer->data(), buffer->size(), noRecognition, fullUtterance); + if (searchedFrameCount < 0) throw runtime_error("Error analyzing raw audio data for word recognition."); // End recognition error = ps_end_utt(&decoder); @@ -154,8 +154,7 @@ s3wid_t getWordId(const string& word, dict_t& dictionary) { optional> getPhoneAlignment( const vector& wordIds, const AudioClip& inputAudioClip, - ps_decoder_t& decoder, - ProgressSink& progressSink) + ps_decoder_t& decoder) { // Create alignment list lambda_unique_ptr alignment( @@ -190,18 +189,17 @@ optional> getPhoneAlignment( // Start search ps_search_start(search.get()); - // Process entire sound stream - auto processBuffer = [&](const vector& buffer) { - const int16* nextSample = buffer.data(); - size_t remainingSamples = buffer.size(); - while (acmod_process_raw(acousticModel, &nextSample, &remainingSamples, false) > 0) { - while (acousticModel->n_feat_frame > 0) { - ps_search_step(search.get(), acousticModel->output_frame); - acmod_advance(acousticModel); - } + // Process entire audio clip + auto buffer = copyTo16bitBuffer(*audioClip); + const int16* nextSample = buffer->data(); + size_t remainingSamples = buffer->size(); + bool fullUtterance = true; + while (acmod_process_raw(acousticModel, &nextSample, &remainingSamples, fullUtterance) > 0) { + while (acousticModel->n_feat_frame > 0) { + ps_search_step(search.get(), acousticModel->output_frame); + acmod_advance(acousticModel); } - }; - process16bitAudioClip(*audioClip, processBuffer, progressSink); + } // End search error = ps_search_finish(search.get()); @@ -263,6 +261,8 @@ lambda_unique_ptr createDecoder(optional dialog) { "-dither", "yes", // Disable VAD -- we're doing that ourselves "-remove_silence", "no", + // Perform per-utterance cepstral mean normalization + "-cmn", "batch", nullptr), [](cmd_ln_t* config) { cmd_ln_free_r(config); }); if (!config) throw runtime_error("Error creating configuration."); @@ -309,7 +309,8 @@ Timeline utteranceToPhones( const unique_ptr clipSegment = audioClip.clone() | segment(utterance); // Get words - BoundedTimeline words = recognizeWords(*clipSegment, decoder, decoderIsStillUsable, wordRecognitionProgressSink); + BoundedTimeline words = recognizeWords(*clipSegment, decoder, decoderIsStillUsable); + wordRecognitionProgressSink.reportProgress(1.0); for (Timed timedWord : words) { timedWord.getTimeRange().shift(utterance.getStart()); logging::logTimedEvent("word", timedWord); @@ -326,8 +327,9 @@ Timeline utteranceToPhones( #if BOOST_VERSION < 105600 // Support legacy syntax #define value_or get_value_or #endif - Timeline segmentPhones = getPhoneAlignment(wordIds, *clipSegment, decoder, alignmentProgressSink) + Timeline segmentPhones = getPhoneAlignment(wordIds, *clipSegment, decoder) .value_or(ContinuousTimeline(clipSegment->getTruncatedRange(), Phone::Noise)); + alignmentProgressSink.reportProgress(1.0); segmentPhones.shift(utterance.getStart()); for (const auto& timedPhone : segmentPhones) { logging::logTimedEvent("rawPhone", timedPhone);