From 760f6c2ce67f890c62ecbdebc22d2b4bb3a013ba Mon Sep 17 00:00:00 2001 From: Daniel Wolf Date: Thu, 29 Sep 2016 10:44:34 +0200 Subject: [PATCH] Refactoring and better logging --- src/phoneRecognition.cpp | 110 +++++++++++++++++++++++---------------- 1 file changed, 64 insertions(+), 46 deletions(-) diff --git a/src/phoneRecognition.cpp b/src/phoneRecognition.cpp index af67abf..742de17 100644 --- a/src/phoneRecognition.cpp +++ b/src/phoneRecognition.cpp @@ -278,9 +278,31 @@ lambda_unique_ptr createDecoder(optional dialog) { return decoder; } +Timeline getNoiseSounds(TimeRange utteranceTimeRange, const Timeline& phones) { + Timeline noiseSounds; + + // Find utterance parts without recognized phones + noiseSounds.set(utteranceTimeRange); + for (const auto& timedPhone : phones) { + noiseSounds.clear(timedPhone.getTimeRange()); + } + + // Remove undesired elements + const centiseconds minSoundLength = 5_cs; + for (const auto& unknownSound : Timeline(noiseSounds)) { + bool startsAtZero = unknownSound.getStart() == 0_cs; + bool tooShort = unknownSound.getTimeRange().getLength() < minSoundLength; + if (startsAtZero || tooShort) { + noiseSounds.clear(unknownSound.getTimeRange()); + } + } + + return noiseSounds; +} + Timeline utteranceToPhones( const AudioClip& audioClip, - TimeRange utterance, + TimeRange utteranceTimeRange, ps_decoder_t& decoder, ProgressSink& utteranceProgressSink) { @@ -288,18 +310,42 @@ Timeline utteranceToPhones( ProgressSink& wordRecognitionProgressSink = utteranceProgressMerger.addSink(1.0); ProgressSink& alignmentProgressSink = utteranceProgressMerger.addSink(0.5); - const unique_ptr clipSegment = audioClip.clone() | segment(utterance) | resample(sphinxSampleRate); + // Pad time range to give Pocketsphinx some breathing room + TimeRange paddedTimeRange = utteranceTimeRange; + const centiseconds padding(3); + paddedTimeRange.grow(padding); + paddedTimeRange.trim(audioClip.getTruncatedRange()); + + const unique_ptr clipSegment = audioClip.clone() | 
segment(paddedTimeRange) | resample(sphinxSampleRate); const auto audioBuffer = copyTo16bitBuffer(*clipSegment); // Get words BoundedTimeline words = recognizeWords(audioBuffer, decoder); wordRecognitionProgressSink.reportProgress(1.0); + + // Log utterance text + string text; + for (auto& timedWord : words) { + string word = timedWord.getValue(); + // Skip details + if (word == "" || word == "" || word == "") { + continue; + } + word = regex_replace(word, regex("\\(\\d\\)"), ""); + if (text.size() > 0) { + text += " "; + } + text += word; + } + logging::logTimedEvent("utterance", utteranceTimeRange, text); + + // Log words for (Timed timedWord : words) { - timedWord.getTimeRange().shift(utterance.getStart()); + timedWord.getTimeRange().shift(paddedTimeRange.getStart()); logging::logTimedEvent("word", timedWord); } - // Look up words in dictionary + // Convert word strings to word IDs using dictionary vector wordIds; for (const auto& timedWord : words) { wordIds.push_back(getWordId(timedWord.getValue(), *decoder.dict)); @@ -310,39 +356,28 @@ Timeline utteranceToPhones( #if BOOST_VERSION < 105600 // Support legacy syntax #define value_or get_value_or #endif - Timeline segmentPhones = getPhoneAlignment(wordIds, audioBuffer, decoder) + Timeline utterancePhones = getPhoneAlignment(wordIds, audioBuffer, decoder) .value_or(ContinuousTimeline(clipSegment->getTruncatedRange(), Phone::Noise)); alignmentProgressSink.reportProgress(1.0); - segmentPhones.shift(utterance.getStart()); - for (const auto& timedPhone : segmentPhones) { + utterancePhones.shift(paddedTimeRange.getStart()); + + // Log raw phones + for (const auto& timedPhone : utterancePhones) { logging::logTimedEvent("rawPhone", timedPhone); } - return segmentPhones; -} - -Timeline getUnknownSounds(const Timeline& utterances, const Timeline& phones) { - Timeline unknownSounds; - - // Find utterance parts without recogniced phones - for (const auto& timedUtterance : utterances) { - 
unknownSounds.set(timedUtterance.getTimeRange()); - } - for (const auto& timedPhone : phones) { - unknownSounds.clear(timedPhone.getTimeRange()); + // Guess positions of noise sounds + Timeline noiseSounds = getNoiseSounds(utteranceTimeRange, utterancePhones); + for (const auto& noiseSound : noiseSounds) { + utterancePhones.set(noiseSound.getTimeRange(), Phone::Noise); } - // Remove undesired elements - const centiseconds minSoundLength = 5_cs; - for (const auto& unknownSound : Timeline(unknownSounds)) { - bool startsAtZero = unknownSound.getStart() == 0_cs; - bool tooShort = unknownSound.getTimeRange().getLength() < minSoundLength; - if (startsAtZero || tooShort) { - unknownSounds.clear(unknownSound.getTimeRange()); - } + // Log phones + for (const auto& timedPhone : utterancePhones) { + logging::logTimedEvent("phone", timedPhone); } - return unknownSounds; + return utterancePhones; } BoundedTimeline recognizePhones( @@ -380,18 +415,10 @@ BoundedTimeline recognizePhones( BoundedTimeline phones(audioClip->getTruncatedRange()); std::mutex resultMutex; auto processUtterance = [&](Timed timedUtterance, ProgressSink& utteranceProgressSink) { - logging::logTimedEvent("utterance", timedUtterance.getTimeRange(), string("")); - - // Pad time range to give the recognizer some breathing room - TimeRange paddedTimeRange = timedUtterance.getTimeRange(); - const centiseconds padding(3); - paddedTimeRange.grow(padding); - paddedTimeRange.trim(audioClip->getTruncatedRange()); - // Detect phones for utterance auto decoder = decoderPool.acquire(); Timeline utterancePhones = - utteranceToPhones(*audioClip, paddedTimeRange, *decoder, utteranceProgressSink); + utteranceToPhones(*audioClip, timedUtterance.getTimeRange(), *decoder, utteranceProgressSink); // Copy phones to result timeline std::lock_guard lock(resultMutex); @@ -425,14 +452,5 @@ BoundedTimeline recognizePhones( std::throw_with_nested(runtime_error("Error performing speech recognition via Pocketsphinx.")); } - 
logging::debug("Detecting unknown sounds"); - Timeline unknownSounds = getUnknownSounds(utterances, phones); - for (const auto& unknownSound : unknownSounds) { - phones.set(unknownSound.getTimeRange(), Phone::Noise); - } - for (const auto& timedPhone : phones) { - logging::logTimedEvent("phone", timedPhone); - } - return phones; }