Workaround for PocketSphinx bug

See https://sourceforge.net/p/cmusphinx/discussion/help/thread/f1dd91c5/#7529
Also minor refactoring.
This commit is contained in:
Daniel Wolf 2016-06-30 20:06:38 +02:00
parent 2c0471e79f
commit ed27b8470c
1 changed file with 22 additions and 11 deletions

View File

@ -26,6 +26,7 @@ extern "C" {
#include <ps_alignment.h> #include <ps_alignment.h>
#include <state_align_search.h> #include <state_align_search.h>
#include <pocketsphinx_internal.h> #include <pocketsphinx_internal.h>
#include <ngram_search.h>
} }
using std::runtime_error; using std::runtime_error;
@ -95,7 +96,7 @@ void sphinxLogCallback(void* user_data, err_lvl_t errorLevel, const char* format
logging::log(logLevel, message); logging::log(logLevel, message);
} }
BoundedTimeline<string> recognizeWords(unique_ptr<AudioStream> audioStream, ps_decoder_t& decoder, ProgressSink& progressSink) { BoundedTimeline<string> recognizeWords(unique_ptr<AudioStream> audioStream, ps_decoder_t& decoder, bool& decoderIsStillUsable, ProgressSink& progressSink) {
// Convert audio stream to the exact format PocketSphinx requires // Convert audio stream to the exact format PocketSphinx requires
audioStream = convertSampleRate(std::move(audioStream), sphinxSampleRate); audioStream = convertSampleRate(std::move(audioStream), sphinxSampleRate);
@ -117,8 +118,18 @@ BoundedTimeline<string> recognizeWords(unique_ptr<AudioStream> audioStream, ps_d
error = ps_end_utt(&decoder); error = ps_end_utt(&decoder);
if (error) throw runtime_error("Error ending utterance processing for word recognition."); if (error) throw runtime_error("Error ending utterance processing for word recognition.");
// Collect words // PocketSphinx can't handle an utterance with no recognized words.
// As a result, the following utterance will be garbage.
// As a workaround, we throw away the decoder in this case.
// See https://sourceforge.net/p/cmusphinx/discussion/help/thread/f1dd91c5/#7529
BoundedTimeline<string> result(audioStream->getTruncatedRange()); BoundedTimeline<string> result(audioStream->getTruncatedRange());
bool noWordsRecognized = reinterpret_cast<ngram_search_t*>(decoder.search)->bpidx == 0;
if (noWordsRecognized) {
decoderIsStillUsable = false;
return result;
}
// Collect words
for (ps_seg_t* it = ps_seg_iter(&decoder); it; it = ps_seg_next(it)) { for (ps_seg_t* it = ps_seg_iter(&decoder); it; it = ps_seg_next(it)) {
const char* word = ps_seg_word(it); const char* word = ps_seg_word(it);
int firstFrame, lastFrame; int firstFrame, lastFrame;
@ -281,6 +292,7 @@ Timeline<Phone> utteranceToPhones(
AudioStream& audioStream, AudioStream& audioStream,
TimeRange utterance, TimeRange utterance,
ps_decoder_t& decoder, ps_decoder_t& decoder,
bool& decoderIsStillUsable,
ProgressSink& utteranceProgressSink) ProgressSink& utteranceProgressSink)
{ {
ProgressMerger utteranceProgressMerger(utteranceProgressSink); ProgressMerger utteranceProgressMerger(utteranceProgressSink);
@ -290,7 +302,7 @@ Timeline<Phone> utteranceToPhones(
auto streamSegment = createSegment(audioStream.clone(true), utterance); auto streamSegment = createSegment(audioStream.clone(true), utterance);
// Get words // Get words
BoundedTimeline<string> words = recognizeWords(streamSegment->clone(true), decoder, wordRecognitionProgressSink); BoundedTimeline<string> words = recognizeWords(streamSegment->clone(true), decoder, decoderIsStillUsable, wordRecognitionProgressSink);
for (Timed<string> timedWord : words) { for (Timed<string> timedWord : words) {
timedWord.getTimeRange().shift(utterance.getStart()); timedWord.getTimeRange().shift(utterance.getStart());
logging::logTimedEvent("word", timedWord); logging::logTimedEvent("word", timedWord);
@ -359,20 +371,19 @@ BoundedTimeline<Phone> detectPhones(
}; };
BoundedTimeline<Phone> result(audioStream->getTruncatedRange()); BoundedTimeline<Phone> result(audioStream->getTruncatedRange());
std::mutex resultMutex, audioStreamMutex; std::mutex resultMutex;
auto processUtterance = [&](Timed<void> timedUtterance, ProgressSink& utteranceProgressSink) { auto processUtterance = [&](Timed<void> timedUtterance, ProgressSink& utteranceProgressSink) {
logging::logTimedEvent("utterance", timedUtterance.getTimeRange(), string("")); logging::logTimedEvent("utterance", timedUtterance.getTimeRange(), string(""));
// Detect phones for utterance // Detect phones for utterance
auto decoder = getDecoder(); auto decoder = getDecoder();
std::unique_ptr<AudioStream> audioStreamCopy; auto audioStreamCopy = audioStream->clone(true);
{ bool decoderIsStillUsable = true;
std::lock_guard<std::mutex> lock(audioStreamMutex);
audioStreamCopy = audioStream->clone(true);
}
Timeline<Phone> phones = Timeline<Phone> phones =
utteranceToPhones(*audioStreamCopy, timedUtterance.getTimeRange(), *decoder, utteranceProgressSink); utteranceToPhones(*audioStreamCopy, timedUtterance.getTimeRange(), *decoder, decoderIsStillUsable, utteranceProgressSink);
returnDecoder(std::move(decoder)); if (decoderIsStillUsable) {
returnDecoder(std::move(decoder));
}
// Copy phones to result timeline // Copy phones to result timeline
std::lock_guard<std::mutex> lock(resultMutex); std::lock_guard<std::mutex> lock(resultMutex);