From ed27b8470c6bcb970b9a4f2f95fa5b0efe287ee9 Mon Sep 17 00:00:00 2001 From: Daniel Wolf Date: Thu, 30 Jun 2016 20:06:38 +0200 Subject: [PATCH] Workaround for PocketSphinx bug See https://sourceforge.net/p/cmusphinx/discussion/help/thread/f1dd91c5/#7529 Also minor refactoring. --- src/phoneExtraction.cpp | 33 ++++++++++++++++++++++----------- 1 file changed, 22 insertions(+), 11 deletions(-) diff --git a/src/phoneExtraction.cpp b/src/phoneExtraction.cpp index 795176a..504adc2 100644 --- a/src/phoneExtraction.cpp +++ b/src/phoneExtraction.cpp @@ -26,6 +26,7 @@ extern "C" { #include #include #include +#include } using std::runtime_error; @@ -95,7 +96,7 @@ void sphinxLogCallback(void* user_data, err_lvl_t errorLevel, const char* format logging::log(logLevel, message); } -BoundedTimeline recognizeWords(unique_ptr audioStream, ps_decoder_t& decoder, ProgressSink& progressSink) { +BoundedTimeline recognizeWords(unique_ptr audioStream, ps_decoder_t& decoder, bool& decoderIsStillUsable, ProgressSink& progressSink) { // Convert audio stream to the exact format PocketSphinx requires audioStream = convertSampleRate(std::move(audioStream), sphinxSampleRate); @@ -117,8 +118,18 @@ BoundedTimeline recognizeWords(unique_ptr audioStream, ps_d error = ps_end_utt(&decoder); if (error) throw runtime_error("Error ending utterance processing for word recognition."); - // Collect words + // PocketSphinx can't handle an utterance with no recognized words. + // As a result, the following utterance will be garbage. + // As a workaround, we throw away the decoder in this case. + // See https://sourceforge.net/p/cmusphinx/discussion/help/thread/f1dd91c5/#7529 BoundedTimeline result(audioStream->getTruncatedRange()); + bool noWordsRecognized = reinterpret_cast(decoder.search)->bpidx == 0; + if (noWordsRecognized) { + decoderIsStillUsable = false; + return result; + } + + // Collect words for (ps_seg_t* it = ps_seg_iter(&decoder); it; it = ps_seg_next(it)) { const char* word = ps_seg_word(it); int firstFrame, lastFrame; @@ -281,6 +292,7 @@ Timeline utteranceToPhones( AudioStream& audioStream, TimeRange utterance, ps_decoder_t& decoder, + bool& decoderIsStillUsable, ProgressSink& utteranceProgressSink) { ProgressMerger utteranceProgressMerger(utteranceProgressSink); @@ -290,7 +302,7 @@ Timeline utteranceToPhones( auto streamSegment = createSegment(audioStream.clone(true), utterance); // Get words - BoundedTimeline words = recognizeWords(streamSegment->clone(true), decoder, wordRecognitionProgressSink); + BoundedTimeline words = recognizeWords(streamSegment->clone(true), decoder, decoderIsStillUsable, wordRecognitionProgressSink); for (Timed timedWord : words) { timedWord.getTimeRange().shift(utterance.getStart()); logging::logTimedEvent("word", timedWord); @@ -359,20 +371,19 @@ BoundedTimeline detectPhones( }; BoundedTimeline result(audioStream->getTruncatedRange()); - std::mutex resultMutex, audioStreamMutex; + std::mutex resultMutex; auto processUtterance = [&](Timed timedUtterance, ProgressSink& utteranceProgressSink) { logging::logTimedEvent("utterance", timedUtterance.getTimeRange(), string("")); // Detect phones for utterance auto decoder = getDecoder(); - std::unique_ptr audioStreamCopy; - { - std::lock_guard lock(audioStreamMutex); - audioStreamCopy = audioStream->clone(true); - } + auto audioStreamCopy = audioStream->clone(true); + bool decoderIsStillUsable = true; Timeline phones = - utteranceToPhones(*audioStreamCopy, timedUtterance.getTimeRange(), *decoder, utteranceProgressSink); - returnDecoder(std::move(decoder)); + utteranceToPhones(*audioStreamCopy, timedUtterance.getTimeRange(), *decoder, decoderIsStillUsable, utteranceProgressSink); + if (decoderIsStillUsable) { + returnDecoder(std::move(decoder)); + } // Copy phones to result timeline std::lock_guard lock(resultMutex);