Refactoring and better logging

Daniel Wolf 2016-09-29 10:44:34 +02:00
parent 750078618c
commit 760f6c2ce6
1 changed file with 64 additions and 46 deletions


@@ -278,9 +278,31 @@ lambda_unique_ptr<ps_decoder_t> createDecoder(optional<u32string> dialog) {
 	return decoder;
 }
 
+Timeline<void> getNoiseSounds(TimeRange utteranceTimeRange, const Timeline<Phone>& phones) {
+	Timeline<void> noiseSounds;
+
+	// Find utterance parts without recognized phones
+	noiseSounds.set(utteranceTimeRange);
+	for (const auto& timedPhone : phones) {
+		noiseSounds.clear(timedPhone.getTimeRange());
+	}
+
+	// Remove undesired elements
+	const centiseconds minSoundLength = 5_cs;
+	for (const auto& unknownSound : Timeline<void>(noiseSounds)) {
+		bool startsAtZero = unknownSound.getStart() == 0_cs;
+		bool tooShort = unknownSound.getTimeRange().getLength() < minSoundLength;
+		if (startsAtZero || tooShort) {
+			noiseSounds.clear(unknownSound.getTimeRange());
+		}
+	}
+
+	return noiseSounds;
+}
+
 Timeline<Phone> utteranceToPhones(
 	const AudioClip& audioClip,
-	TimeRange utterance,
+	TimeRange utteranceTimeRange,
 	ps_decoder_t& decoder,
 	ProgressSink& utteranceProgressSink)
 {
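The new getNoiseSounds works by painting the whole utterance range, erasing every range covered by a recognized phone, and keeping whatever is left, then discarding leftovers that start at time zero or are shorter than 5 cs. A minimal sketch of that gap-finding idea, using plain integer centiseconds in place of the project's Timeline/TimeRange types (findNoiseGaps, Range, and Cs are hypothetical stand-ins, not the real API):

// Sketch only: gap-finding with plain integers, assuming sorted,
// non-overlapping phone ranges. Rhubarb's Timeline handles this internally.
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <utility>
#include <vector>

using Cs = int64_t;              // time in centiseconds
using Range = std::pair<Cs, Cs>; // [start, end)

std::vector<Range> findNoiseGaps(Range utterance, const std::vector<Range>& phones, Cs minLength = 5) {
	std::vector<Range> gaps;

	// Walk the phone ranges and collect the uncovered stretches
	Cs cursor = utterance.first;
	for (const Range& p : phones) {
		if (p.first > cursor) gaps.push_back({cursor, p.first});
		cursor = std::max(cursor, p.second);
	}
	if (cursor < utterance.second) gaps.push_back({cursor, utterance.second});

	// Remove undesired elements: gaps at time zero or shorter than minLength
	gaps.erase(
		std::remove_if(gaps.begin(), gaps.end(), [&](const Range& g) {
			return g.first == 0 || g.second - g.first < minLength;
		}),
		gaps.end());
	return gaps;
}

int main() {
	// Utterance spans [0, 100); phones cover [10, 40) and [44, 90).
	for (const Range& g : findNoiseGaps({0, 100}, {{10, 40}, {44, 90}}))
		std::cout << g.first << ".." << g.second << "\n"; // prints: 90..100
}

The startsAtZero filter presumably keeps leading silence before the first phone from being flagged as an actual noise.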
@@ -288,18 +310,42 @@ Timeline<Phone> utteranceToPhones(
 	ProgressSink& wordRecognitionProgressSink = utteranceProgressMerger.addSink(1.0);
 	ProgressSink& alignmentProgressSink = utteranceProgressMerger.addSink(0.5);
 
-	const unique_ptr<AudioClip> clipSegment = audioClip.clone() | segment(utterance) | resample(sphinxSampleRate);
+	// Pad time range to give Pocketsphinx some breathing room
+	TimeRange paddedTimeRange = utteranceTimeRange;
+	const centiseconds padding(3);
+	paddedTimeRange.grow(padding);
+	paddedTimeRange.trim(audioClip.getTruncatedRange());
+
+	const unique_ptr<AudioClip> clipSegment = audioClip.clone() | segment(paddedTimeRange) | resample(sphinxSampleRate);
 	const auto audioBuffer = copyTo16bitBuffer(*clipSegment);
 
 	// Get words
 	BoundedTimeline<string> words = recognizeWords(audioBuffer, decoder);
 	wordRecognitionProgressSink.reportProgress(1.0);
 
+	// Log utterance text
+	string text;
+	for (auto& timedWord : words) {
+		string word = timedWord.getValue();
+
+		// Skip details
+		if (word == "<s>" || word == "</s>" || word == "<sil>") {
+			continue;
+		}
+		word = regex_replace(word, regex("\\(\\d\\)"), "");
+
+		if (text.size() > 0) {
+			text += " ";
+		}
+		text += word;
+	}
+	logging::logTimedEvent("utterance", utteranceTimeRange, text);
+
+	// Log words
 	for (Timed<string> timedWord : words) {
-		timedWord.getTimeRange().shift(utterance.getStart());
+		timedWord.getTimeRange().shift(paddedTimeRange.getStart());
 		logging::logTimedEvent("word", timedWord);
 	}
 
-	// Look up words in dictionary
+	// Convert word strings to word IDs using dictionary
 	vector<s3wid_t> wordIds;
 	for (const auto& timedWord : words) {
 		wordIds.push_back(getWordId(timedWord.getValue(), *decoder.dict));
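The new utterance-text logging skips PocketSphinx's meta tokens (<s>, </s>, <sil>) and strips the "(n)" suffix that marks alternate dictionary pronunciations (e.g. "read(2)") before joining the words into one string. A self-contained illustration of that cleanup using only standard C++ (the word list is made up for the example):

// Sketch of the utterance-text cleanup: skip meta tokens, strip "(n)"
// pronunciation suffixes, join the remaining words with spaces.
#include <iostream>
#include <regex>
#include <string>
#include <vector>

int main() {
	std::vector<std::string> words = {"<s>", "hello", "read(2)", "<sil>", "world", "</s>"};
	std::string text;
	for (std::string word : words) {
		if (word == "<s>" || word == "</s>" || word == "<sil>") continue;
		word = std::regex_replace(word, std::regex("\\(\\d\\)"), "");
		if (!text.empty()) text += " ";
		text += word;
	}
	std::cout << text << "\n"; // prints: hello read world
}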
@@ -310,39 +356,28 @@ Timeline<Phone> utteranceToPhones(
 #if BOOST_VERSION < 105600 // Support legacy syntax
 	#define value_or get_value_or
 #endif
-	Timeline<Phone> segmentPhones = getPhoneAlignment(wordIds, audioBuffer, decoder)
+	Timeline<Phone> utterancePhones = getPhoneAlignment(wordIds, audioBuffer, decoder)
 		.value_or(ContinuousTimeline<Phone>(clipSegment->getTruncatedRange(), Phone::Noise));
 	alignmentProgressSink.reportProgress(1.0);
-	segmentPhones.shift(utterance.getStart());
-	for (const auto& timedPhone : segmentPhones) {
+	utterancePhones.shift(paddedTimeRange.getStart());
+
+	// Log raw phones
+	for (const auto& timedPhone : utterancePhones) {
 		logging::logTimedEvent("rawPhone", timedPhone);
 	}
-	return segmentPhones;
-}
 
-Timeline<void> getUnknownSounds(const Timeline<void>& utterances, const Timeline<Phone>& phones) {
-	Timeline<void> unknownSounds;
-
-	// Find utterance parts without recogniced phones
-	for (const auto& timedUtterance : utterances) {
-		unknownSounds.set(timedUtterance.getTimeRange());
-	}
-	for (const auto& timedPhone : phones) {
-		unknownSounds.clear(timedPhone.getTimeRange());
-	}
+	// Guess positions of noise sounds
+	Timeline<void> noiseSounds = getNoiseSounds(utteranceTimeRange, utterancePhones);
+	for (const auto& noiseSound : noiseSounds) {
+		utterancePhones.set(noiseSound.getTimeRange(), Phone::Noise);
+	}
 
-	// Remove undesired elements
-	const centiseconds minSoundLength = 5_cs;
-	for (const auto& unknownSound : Timeline<void>(unknownSounds)) {
-		bool startsAtZero = unknownSound.getStart() == 0_cs;
-		bool tooShort = unknownSound.getTimeRange().getLength() < minSoundLength;
-		if (startsAtZero || tooShort) {
-			unknownSounds.clear(unknownSound.getTimeRange());
-		}
-	}
-
-	return unknownSounds;
+	// Log phones
+	for (const auto& timedPhone : utterancePhones) {
+		logging::logTimedEvent("phone", timedPhone);
+	}
+
+	return utterancePhones;
 }
BoundedTimeline<Phone> recognizePhones( BoundedTimeline<Phone> recognizePhones(
@@ -380,18 +415,10 @@ BoundedTimeline<Phone> recognizePhones(
 	BoundedTimeline<Phone> phones(audioClip->getTruncatedRange());
 	std::mutex resultMutex;
 	auto processUtterance = [&](Timed<void> timedUtterance, ProgressSink& utteranceProgressSink) {
-		logging::logTimedEvent("utterance", timedUtterance.getTimeRange(), string(""));
-
-		// Pad time range to give the recognizer some breathing room
-		TimeRange paddedTimeRange = timedUtterance.getTimeRange();
-		const centiseconds padding(3);
-		paddedTimeRange.grow(padding);
-		paddedTimeRange.trim(audioClip->getTruncatedRange());
-
 		// Detect phones for utterance
 		auto decoder = decoderPool.acquire();
 		Timeline<Phone> utterancePhones =
-			utteranceToPhones(*audioClip, paddedTimeRange, *decoder, utteranceProgressSink);
+			utteranceToPhones(*audioClip, timedUtterance.getTimeRange(), *decoder, utteranceProgressSink);
 
 		// Copy phones to result timeline
 		std::lock_guard<std::mutex> lock(resultMutex);
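Note that the padding did not disappear: it moved from recognizePhones into utteranceToPhones (second hunk above), so callers now pass the raw utterance range and the padding happens in one place. A minimal sketch of the grow-then-trim arithmetic on plain integers (Range and padAndClamp are hypothetical stand-ins; the real code uses TimeRange::grow and TimeRange::trim):

// Sketch only: widen a range by `padding` on each side, then clamp it
// to the clip bounds so it never reaches outside the audio.
#include <algorithm>
#include <cstdint>
#include <iostream>

struct Range { int64_t start, end; }; // centiseconds, [start, end)

Range padAndClamp(Range r, int64_t padding, Range bounds) {
	r.start -= padding;                        // grow on both sides
	r.end += padding;
	r.start = std::max(r.start, bounds.start); // trim to clip bounds
	r.end = std::min(r.end, bounds.end);
	return r;
}

int main() {
	Range padded = padAndClamp({1, 120}, 3, {0, 121});
	std::cout << padded.start << ".." << padded.end << "\n"; // prints: 0..121
}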
@@ -425,14 +452,5 @@ BoundedTimeline<Phone> recognizePhones(
 		std::throw_with_nested(runtime_error("Error performing speech recognition via Pocketsphinx."));
 	}
 
-	logging::debug("Detecting unknown sounds");
-	Timeline<void> unknownSounds = getUnknownSounds(utterances, phones);
-	for (const auto& unknownSound : unknownSounds) {
-		phones.set(unknownSound.getTimeRange(), Phone::Noise);
-	}
-
-	for (const auto& timedPhone : phones) {
-		logging::logTimedEvent("phone", timedPhone);
-	}
-
 	return phones;
 }