Workaround for PocketSphinx bug
See https://sourceforge.net/p/cmusphinx/discussion/help/thread/f1dd91c5/#7529 for details. Also includes minor refactoring.
This commit is contained in:
parent
2c0471e79f
commit
ed27b8470c
|
@ -26,6 +26,7 @@ extern "C" {
|
||||||
#include <ps_alignment.h>
|
#include <ps_alignment.h>
|
||||||
#include <state_align_search.h>
|
#include <state_align_search.h>
|
||||||
#include <pocketsphinx_internal.h>
|
#include <pocketsphinx_internal.h>
|
||||||
|
#include <ngram_search.h>
|
||||||
}
|
}
|
||||||
|
|
||||||
using std::runtime_error;
|
using std::runtime_error;
|
||||||
|
@ -95,7 +96,7 @@ void sphinxLogCallback(void* user_data, err_lvl_t errorLevel, const char* format
|
||||||
logging::log(logLevel, message);
|
logging::log(logLevel, message);
|
||||||
}
|
}
|
||||||
|
|
||||||
BoundedTimeline<string> recognizeWords(unique_ptr<AudioStream> audioStream, ps_decoder_t& decoder, ProgressSink& progressSink) {
|
BoundedTimeline<string> recognizeWords(unique_ptr<AudioStream> audioStream, ps_decoder_t& decoder, bool& decoderIsStillUsable, ProgressSink& progressSink) {
|
||||||
// Convert audio stream to the exact format PocketSphinx requires
|
// Convert audio stream to the exact format PocketSphinx requires
|
||||||
audioStream = convertSampleRate(std::move(audioStream), sphinxSampleRate);
|
audioStream = convertSampleRate(std::move(audioStream), sphinxSampleRate);
|
||||||
|
|
||||||
|
@ -117,8 +118,18 @@ BoundedTimeline<string> recognizeWords(unique_ptr<AudioStream> audioStream, ps_d
|
||||||
error = ps_end_utt(&decoder);
|
error = ps_end_utt(&decoder);
|
||||||
if (error) throw runtime_error("Error ending utterance processing for word recognition.");
|
if (error) throw runtime_error("Error ending utterance processing for word recognition.");
|
||||||
|
|
||||||
// Collect words
|
// PocketSphinx can't handle an utterance with no recognized words.
|
||||||
|
// As a result, the following utterance will be garbage.
|
||||||
|
// As a workaround, we throw away the decoder in this case.
|
||||||
|
// See https://sourceforge.net/p/cmusphinx/discussion/help/thread/f1dd91c5/#7529
|
||||||
BoundedTimeline<string> result(audioStream->getTruncatedRange());
|
BoundedTimeline<string> result(audioStream->getTruncatedRange());
|
||||||
|
bool noWordsRecognized = reinterpret_cast<ngram_search_t*>(decoder.search)->bpidx == 0;
|
||||||
|
if (noWordsRecognized) {
|
||||||
|
decoderIsStillUsable = false;
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Collect words
|
||||||
for (ps_seg_t* it = ps_seg_iter(&decoder); it; it = ps_seg_next(it)) {
|
for (ps_seg_t* it = ps_seg_iter(&decoder); it; it = ps_seg_next(it)) {
|
||||||
const char* word = ps_seg_word(it);
|
const char* word = ps_seg_word(it);
|
||||||
int firstFrame, lastFrame;
|
int firstFrame, lastFrame;
|
||||||
|
@ -281,6 +292,7 @@ Timeline<Phone> utteranceToPhones(
|
||||||
AudioStream& audioStream,
|
AudioStream& audioStream,
|
||||||
TimeRange utterance,
|
TimeRange utterance,
|
||||||
ps_decoder_t& decoder,
|
ps_decoder_t& decoder,
|
||||||
|
bool& decoderIsStillUsable,
|
||||||
ProgressSink& utteranceProgressSink)
|
ProgressSink& utteranceProgressSink)
|
||||||
{
|
{
|
||||||
ProgressMerger utteranceProgressMerger(utteranceProgressSink);
|
ProgressMerger utteranceProgressMerger(utteranceProgressSink);
|
||||||
|
@ -290,7 +302,7 @@ Timeline<Phone> utteranceToPhones(
|
||||||
auto streamSegment = createSegment(audioStream.clone(true), utterance);
|
auto streamSegment = createSegment(audioStream.clone(true), utterance);
|
||||||
|
|
||||||
// Get words
|
// Get words
|
||||||
BoundedTimeline<string> words = recognizeWords(streamSegment->clone(true), decoder, wordRecognitionProgressSink);
|
BoundedTimeline<string> words = recognizeWords(streamSegment->clone(true), decoder, decoderIsStillUsable, wordRecognitionProgressSink);
|
||||||
for (Timed<string> timedWord : words) {
|
for (Timed<string> timedWord : words) {
|
||||||
timedWord.getTimeRange().shift(utterance.getStart());
|
timedWord.getTimeRange().shift(utterance.getStart());
|
||||||
logging::logTimedEvent("word", timedWord);
|
logging::logTimedEvent("word", timedWord);
|
||||||
|
@ -359,20 +371,19 @@ BoundedTimeline<Phone> detectPhones(
|
||||||
};
|
};
|
||||||
|
|
||||||
BoundedTimeline<Phone> result(audioStream->getTruncatedRange());
|
BoundedTimeline<Phone> result(audioStream->getTruncatedRange());
|
||||||
std::mutex resultMutex, audioStreamMutex;
|
std::mutex resultMutex;
|
||||||
auto processUtterance = [&](Timed<void> timedUtterance, ProgressSink& utteranceProgressSink) {
|
auto processUtterance = [&](Timed<void> timedUtterance, ProgressSink& utteranceProgressSink) {
|
||||||
logging::logTimedEvent("utterance", timedUtterance.getTimeRange(), string(""));
|
logging::logTimedEvent("utterance", timedUtterance.getTimeRange(), string(""));
|
||||||
|
|
||||||
// Detect phones for utterance
|
// Detect phones for utterance
|
||||||
auto decoder = getDecoder();
|
auto decoder = getDecoder();
|
||||||
std::unique_ptr<AudioStream> audioStreamCopy;
|
auto audioStreamCopy = audioStream->clone(true);
|
||||||
{
|
bool decoderIsStillUsable = true;
|
||||||
std::lock_guard<std::mutex> lock(audioStreamMutex);
|
|
||||||
audioStreamCopy = audioStream->clone(true);
|
|
||||||
}
|
|
||||||
Timeline<Phone> phones =
|
Timeline<Phone> phones =
|
||||||
utteranceToPhones(*audioStreamCopy, timedUtterance.getTimeRange(), *decoder, utteranceProgressSink);
|
utteranceToPhones(*audioStreamCopy, timedUtterance.getTimeRange(), *decoder, decoderIsStillUsable, utteranceProgressSink);
|
||||||
|
if (decoderIsStillUsable) {
|
||||||
returnDecoder(std::move(decoder));
|
returnDecoder(std::move(decoder));
|
||||||
|
}
|
||||||
|
|
||||||
// Copy phones to result timeline
|
// Copy phones to result timeline
|
||||||
std::lock_guard<std::mutex> lock(resultMutex);
|
std::lock_guard<std::mutex> lock(resultMutex);
|
||||||
|
|
Loading…
Reference in New Issue