Refactoring and better logging

2016-09-29 10:44:34 +02:00 · 2016-09-29 10:44:34 +02:00 · 760f6c2ce6
parent 750078618c
commit 760f6c2ce6
1 changed files with 64 additions and 46 deletions
--- a/src/phoneRecognition.cpp
+++ b/src/phoneRecognition.cpp
@ -278,9 +278,31 @@ lambda_unique_ptr<ps_decoder_t> createDecoder(optional<u32string> dialog) {
 	return decoder;
 }
 Timeline<void> getNoiseSounds(TimeRange utteranceTimeRange, const Timeline<Phone>& phones) {
 	Timeline<void> noiseSounds;
 	// Find utterance parts without recogniced phones
 	noiseSounds.set(utteranceTimeRange);
 	for (const auto& timedPhone : phones) {
 		noiseSounds.clear(timedPhone.getTimeRange());
 	}
 	// Remove undesired elements
 	const centiseconds minSoundLength = 5_cs;
 	for (const auto& unknownSound : Timeline<void>(noiseSounds)) {
 		bool startsAtZero = unknownSound.getStart() == 0_cs;
 		bool tooShort = unknownSound.getTimeRange().getLength() < minSoundLength;
 		if (startsAtZero || tooShort) {
 			noiseSounds.clear(unknownSound.getTimeRange());
 		}
 	}
 	return noiseSounds;
 }
 Timeline<Phone> utteranceToPhones(
 	const AudioClip& audioClip,
-	TimeRange utterance,
+	TimeRange utteranceTimeRange,
 	ps_decoder_t& decoder,
 	ProgressSink& utteranceProgressSink)
 {
@ -288,18 +310,42 @@ Timeline<Phone> utteranceToPhones(
 	ProgressSink& wordRecognitionProgressSink = utteranceProgressMerger.addSink(1.0);
 	ProgressSink& alignmentProgressSink = utteranceProgressMerger.addSink(0.5);
-	const unique_ptr<AudioClip> clipSegment = audioClip.clone() | segment(utterance) | resample(sphinxSampleRate);
+	// Pad time range to give Pocketsphinx some breathing room
 	TimeRange paddedTimeRange = utteranceTimeRange;
 	const centiseconds padding(3);
 	paddedTimeRange.grow(padding);
 	paddedTimeRange.trim(audioClip.getTruncatedRange());
 	const unique_ptr<AudioClip> clipSegment = audioClip.clone() | segment(paddedTimeRange) | resample(sphinxSampleRate);
 	const auto audioBuffer = copyTo16bitBuffer(*clipSegment);
 	// Get words
 	BoundedTimeline<string> words = recognizeWords(audioBuffer, decoder);
 	wordRecognitionProgressSink.reportProgress(1.0);
 	// Log utterance text
 	string text;
 	for (auto& timedWord : words) {
 		string word = timedWord.getValue();
 		// Skip details
 		if (word == "<s>" || word == "</s>" || word == "<sil>") {
 			continue;
 		}
 		word = regex_replace(word, regex("\\(\\d\\)"), "");
 		if (text.size() > 0) {
 			text += " ";
 		}
 		text += word;
 	}
 	logging::logTimedEvent("utterance", utteranceTimeRange, text);
 	// Log words
 	for (Timed<string> timedWord : words) {
-		timedWord.getTimeRange().shift(utterance.getStart());
+		timedWord.getTimeRange().shift(paddedTimeRange.getStart());
 		logging::logTimedEvent("word", timedWord);
 	}
-	// Look up words in dictionary
+	// Convert word strings to word IDs using dictionary
 	vector<s3wid_t> wordIds;
 	for (const auto& timedWord : words) {
 		wordIds.push_back(getWordId(timedWord.getValue(), *decoder.dict));
@ -310,39 +356,28 @@ Timeline<Phone> utteranceToPhones(
 #if BOOST_VERSION < 105600 // Support legacy syntax
 #define value_or get_value_or
 #endif
-	Timeline<Phone> segmentPhones = getPhoneAlignment(wordIds, audioBuffer, decoder)
+	Timeline<Phone> utterancePhones = getPhoneAlignment(wordIds, audioBuffer, decoder)
 		.value_or(ContinuousTimeline<Phone>(clipSegment->getTruncatedRange(), Phone::Noise));
 	alignmentProgressSink.reportProgress(1.0);
-	segmentPhones.shift(utterance.getStart());
+	utterancePhones.shift(paddedTimeRange.getStart());
-	for (const auto& timedPhone : segmentPhones) {
+
 	// Log raw phones
 	for (const auto& timedPhone : utterancePhones) {
 		logging::logTimedEvent("rawPhone", timedPhone);
 	}
-	return segmentPhones;
+	// Guess positions of noise sounds
-}
+	Timeline<void> noiseSounds = getNoiseSounds(utteranceTimeRange, utterancePhones);
-
+	for (const auto& noiseSound : noiseSounds) {
-Timeline<void> getUnknownSounds(const Timeline<void>& utterances, const Timeline<Phone>& phones) {
+		utterancePhones.set(noiseSound.getTimeRange(), Phone::Noise);
 	Timeline<void> unknownSounds;
 	// Find utterance parts without recogniced phones
 	for (const auto& timedUtterance : utterances) {
 		unknownSounds.set(timedUtterance.getTimeRange());
 	}
 	for (const auto& timedPhone : phones) {
 		unknownSounds.clear(timedPhone.getTimeRange());
 	}
-	// Remove undesired elements
+	// Log phones
-	const centiseconds minSoundLength = 5_cs;
+	for (const auto& timedPhone : utterancePhones) {
-	for (const auto& unknownSound : Timeline<void>(unknownSounds)) {
+		logging::logTimedEvent("phone", timedPhone);
 		bool startsAtZero = unknownSound.getStart() == 0_cs;
 		bool tooShort = unknownSound.getTimeRange().getLength() < minSoundLength;
 		if (startsAtZero || tooShort) {
 			unknownSounds.clear(unknownSound.getTimeRange());
 		}
 	}
-	return unknownSounds;
+	return utterancePhones;
 }
 BoundedTimeline<Phone> recognizePhones(
@ -380,18 +415,10 @@ BoundedTimeline<Phone> recognizePhones(
 	BoundedTimeline<Phone> phones(audioClip->getTruncatedRange());
 	std::mutex resultMutex;
 	auto processUtterance = [&](Timed<void> timedUtterance, ProgressSink& utteranceProgressSink) {
 		logging::logTimedEvent("utterance", timedUtterance.getTimeRange(), string(""));
 		// Pad time range to give the recognizer some breathing room
 		TimeRange paddedTimeRange = timedUtterance.getTimeRange();
 		const centiseconds padding(3);
 		paddedTimeRange.grow(padding);
 		paddedTimeRange.trim(audioClip->getTruncatedRange());
 		// Detect phones for utterance
 		auto decoder = decoderPool.acquire();
 		Timeline<Phone> utterancePhones =
-			utteranceToPhones(*audioClip, paddedTimeRange, *decoder, utteranceProgressSink);
+			utteranceToPhones(*audioClip, timedUtterance.getTimeRange(), *decoder, utteranceProgressSink);
 		// Copy phones to result timeline
 		std::lock_guard<std::mutex> lock(resultMutex);
@ -425,14 +452,5 @@ BoundedTimeline<Phone> recognizePhones(
 		std::throw_with_nested(runtime_error("Error performing speech recognition via Pocketsphinx."));
 	}
 	logging::debug("Detecting unknown sounds");
 	Timeline<void> unknownSounds = getUnknownSounds(utterances, phones);
 	for (const auto& unknownSound : unknownSounds) {
 		phones.set(unknownSound.getTimeRange(), Phone::Noise);
 	}
 	for (const auto& timedPhone : phones) {
 		logging::logTimedEvent("phone", timedPhone);
 	}
 	return phones;
 }