From 206cde46582c21afa0cf599a49c691396156065d Mon Sep 17 00:00:00 2001
From: Daniel Wolf <dwolf@dannad.de>
Date: Thu, 11 Aug 2016 10:18:03 +0200
Subject: [PATCH] Supporting noises (breathing, smacking, etc.)

---
 extras/SonyVegas/Debug Rhubarb.cs    |  1 +
 src/Phone.cpp                        | 22 +++++++++--
 src/Phone.h                          | 14 +++++--
 src/audio/voiceActivityDetection.cpp |  6 ---
 src/g2p.cpp                          |  4 +-
 src/mouthAnimation.cpp               |  8 +++-
 src/phoneExtraction.cpp              | 57 +++++++++++++++++++++++-----
 7 files changed, 87 insertions(+), 25 deletions(-)
diff --git a/extras/SonyVegas/Debug Rhubarb.cs b/extras/SonyVegas/Debug Rhubarb.cs
index 7c12cc7..c13fa76 100644
--- a/extras/SonyVegas/Debug Rhubarb.cs	
+++ b/extras/SonyVegas/Debug Rhubarb.cs	
@@ -258,6 +258,7 @@ public class Visualization {
 public enum EventType {
 	Utterance,
 	Word,
+	RawPhone,
 	Phone,
 	Shape
 }
diff --git a/src/Phone.cpp b/src/Phone.cpp
index 6d0c5a3..e325f0f 100644
--- a/src/Phone.cpp
+++ b/src/Phone.cpp
@@ -14,7 +14,6 @@ string PhoneConverter::getTypeName() {
 
 EnumConverter<Phone>::member_data PhoneConverter::getMemberData() {
 	return member_data{
-		{ Phone::Unknown,	"Unknown" },
 		{ Phone::AO,		"AO" },
 		{ Phone::AA,		"AA" },
 		{ Phone::IY,		"IY" },
@@ -30,6 +29,7 @@ EnumConverter<Phone>::member_data PhoneConverter::getMemberData() {
 		{ Phone::AW,		"AW" },
 		{ Phone::OY,		"OY" },
 		{ Phone::ER,		"ER" },
+
 		{ Phone::P,			"P" },
 		{ Phone::B,			"B" },
 		{ Phone::T,			"T" },
@@ -53,13 +53,29 @@ EnumConverter<Phone>::member_data PhoneConverter::getMemberData() {
 		{ Phone::L,			"L" },
 		{ Phone::R,			"R" },
 		{ Phone::Y,			"Y" },
-		{ Phone::W,			"W" }
+		{ Phone::W,			"W" },
+
+		{ Phone::Breath,	"Breath" },
+		{ Phone::Cough,		"Cough" },
+		{ Phone::Smack,		"Smack" },
+		{ Phone::Noise,		"Noise" }
 	};
 }
 
 optional<Phone> PhoneConverter::tryParse(const string& s) {
 	auto result = EnumConverter<Phone>::tryParse(s);
-	return result ? result : Phone::Unknown;
+	if (result) return result;
+
+	if (s == "+BREATH+") {
+		return Phone::Breath;
+	}
+	if (s == "+COUGH+") {
+		return Phone::Cough;
+	}
+	if (s == "+SMACK+") {
+		return Phone::Smack;
+	}
+	return Phone::Noise;
 }
 
 std::ostream& operator<<(std::ostream& stream, Phone value) {
diff --git a/src/Phone.h b/src/Phone.h
index ef9183c..5d37f7e 100644
--- a/src/Phone.h
+++ b/src/Phone.h
@@ -1,11 +1,9 @@
-#pragma once
+﻿#pragma once
 
 #include "EnumConverter.h"
 
 // Defines a subset of the Arpabet
 enum class Phone {
-	Unknown,
-
 	/////////
 	// Vowels
 
@@ -67,7 +65,15 @@ enum class Phone {
 
 	// ... semivowels
 	Y,		// [j] as in [y]es
-	W		// [w] as in [w]ay
+	W,		// [w] as in [w]ay
+
+	/////////////
+	// Misc.
+
+	Breath,
+	Cough,
+	Smack,
+	Noise
 };
 
 class PhoneConverter : public EnumConverter<Phone> {
diff --git a/src/audio/voiceActivityDetection.cpp b/src/audio/voiceActivityDetection.cpp
index ec7abc2..b795ed6 100644
--- a/src/audio/voiceActivityDetection.cpp
+++ b/src/audio/voiceActivityDetection.cpp
@@ -100,12 +100,6 @@ BoundedTimeline<void> detectVoiceActivity(const AudioClip& inputAudioClip, Progr
 		}
 	}
 
-	// Pad each activity to give the recognizer some breathing room
-	const centiseconds padding(3);
-	for (const auto& element : BoundedTimeline<void>(activity)) {
-		activity.set(element.getStart() - padding, element.getEnd() + padding);
-	}
-
 	logging::debugFormat("Found {} sections of voice activity: {}", activity.size(),
 		join(activity | transformed([](const Timed<void>& t) { return format("{0}-{1}", t.getStart(), t.getEnd()); }), ", "));
 
diff --git a/src/g2p.cpp b/src/g2p.cpp
index 424f6da..7014bd9 100644
--- a/src/g2p.cpp
+++ b/src/g2p.cpp
@@ -65,7 +65,7 @@ Phone charToPhone(wchar_t c) {
 		case L'l': return Phone::L;
 		case L'h': return Phone::HH;
 	}
-	return Phone::Unknown;
+	return Phone::Noise;
 }
 
 vector<Phone> wordToPhones(const std::string& word) {
@@ -91,7 +91,7 @@ vector<Phone> wordToPhones(const std::string& word) {
 	vector<Phone> result;
 	for (wchar_t c : wideWord) {
 		Phone phone = charToPhone(c);
-		if (phone == Phone::Unknown) {
+		if (phone == Phone::Noise) {
 			logging::errorFormat("G2P error determining pronunciation for '{}': Character '{}' is not a recognized phone shorthand.",
 				word, static_cast<char>(c));
 		} else {
diff --git a/src/mouthAnimation.cpp b/src/mouthAnimation.cpp
index c190586..cd5d83b 100644
--- a/src/mouthAnimation.cpp
+++ b/src/mouthAnimation.cpp
@@ -53,7 +53,6 @@ Timeline<Viseme> animate(optional<Phone> phone, centiseconds duration, centiseco
 	if (!phone)				return single({ X });
 
 	switch (*phone) {
-	case Phone::Unknown:	return single({ B });
 	case Phone::AO:			return single({ E });
 	case Phone::AA:			return single({ D });
 	case Phone::IY:			return single({ B });
@@ -69,6 +68,7 @@ Timeline<Viseme> animate(optional<Phone> phone, centiseconds duration, centiseco
 	case Phone::AW:			return diphtong({ D }, { F });
 	case Phone::OY:			return diphtong({ F }, { B });
 	case Phone::ER:			return single({ { B }, 7_cs, { E } });
+	
 	case Phone::P:
 	case Phone::B:			return bilabialStop();
 	case Phone::T:
@@ -93,6 +93,12 @@ Timeline<Viseme> animate(optional<Phone> phone, centiseconds duration, centiseco
 	case Phone::R:			return single({ { B, B, B, B, F } });
 	case Phone::Y:			return single({ B });
 	case Phone::W:			return single({ F });
+	
+	case Phone::Breath:
+	case Phone::Cough:
+	case Phone::Smack:		return single({ C });
+	case Phone::Noise:	return single({ B });
+
 	default:
 		throw std::invalid_argument("Unexpected phone.");
 	}
diff --git a/src/phoneExtraction.cpp b/src/phoneExtraction.cpp
index 4a9244a..bf48b7d 100644
--- a/src/phoneExtraction.cpp
+++ b/src/phoneExtraction.cpp
@@ -323,15 +323,39 @@ Timeline<Phone> utteranceToPhones(
 
 	// Align the words' phones with speech
 	Timeline<Phone> segmentPhones = getPhoneAlignment(wordIds, *clipSegment, decoder, alignmentProgressSink)
-		.value_or(ContinuousTimeline<Phone>(clipSegment->getTruncatedRange(), Phone::Unknown));
+		.value_or(ContinuousTimeline<Phone>(clipSegment->getTruncatedRange(), Phone::Noise));
 	segmentPhones.shift(utterance.getStart());
 	for (const auto& timedPhone : segmentPhones) {
-		logging::logTimedEvent("phone", timedPhone);
+		logging::logTimedEvent("rawPhone", timedPhone);
 	}
 
 	return segmentPhones;
 }
 
+Timeline<void> getUnknownSounds(const Timeline<void>& utterances, const Timeline<Phone>& phones) {
+	Timeline<void> unknownSounds;
+
+	// Find utterance parts without recognized phones
+	for (const auto& timedUtterance : utterances) {
+		unknownSounds.set(timedUtterance.getTimeRange());
+	}
+	for (const auto& timedPhone : phones) {
+		unknownSounds.clear(timedPhone.getTimeRange());
+	}
+
+	// Discard gaps that start at time zero or are shorter than minSoundLength
+	const centiseconds minSoundLength = 5_cs;
+	for (const auto& unknownSound : Timeline<void>(unknownSounds)) {
+		bool startsAtZero = unknownSound.getStart() == 0_cs;
+		bool tooShort = unknownSound.getTimeRange().getLength() < minSoundLength;
+		if (startsAtZero || tooShort) {
+			unknownSounds.clear(unknownSound.getTimeRange());
+		}
+	}
+
+	return unknownSounds;
+}
+
 BoundedTimeline<Phone> detectPhones(
 	const AudioClip& inputAudioClip,
 	optional<u32string> dialog,
@@ -378,24 +402,30 @@ BoundedTimeline<Phone> detectPhones(
 		decoderPool.push(std::move(decoder));
 	};
 
-	BoundedTimeline<Phone> result(audioClip->getTruncatedRange());
+	BoundedTimeline<Phone> phones(audioClip->getTruncatedRange());
 	std::mutex resultMutex;
 	auto processUtterance = [&](Timed<void> timedUtterance, ProgressSink& utteranceProgressSink) {
 		logging::logTimedEvent("utterance", timedUtterance.getTimeRange(), string(""));
 
+		// Pad time range to give the recognizer some breathing room
+		TimeRange paddedTimeRange = timedUtterance.getTimeRange();
+		const centiseconds padding(3);
+		paddedTimeRange.grow(padding);
+		paddedTimeRange.trim(audioClip->getTruncatedRange());
+
 		// Detect phones for utterance
 		auto decoder = getDecoder();
 		bool decoderIsStillUsable = true;
-		Timeline<Phone> phones =
-			utteranceToPhones(*audioClip, timedUtterance.getTimeRange(), *decoder, decoderIsStillUsable, utteranceProgressSink);
+		Timeline<Phone> utterancePhones =
+			utteranceToPhones(*audioClip, paddedTimeRange, *decoder, decoderIsStillUsable, utteranceProgressSink);
 		if (decoderIsStillUsable) {
 			returnDecoder(std::move(decoder));
 		}
 
 		// Copy phones to result timeline
 		std::lock_guard<std::mutex> lock(resultMutex);
-		for (const auto& timedPhone : phones) {
-			result.set(timedPhone);
+		for (const auto& timedPhone : utterancePhones) {
+			phones.set(timedPhone);
 		}
 	};
 
@@ -417,10 +447,19 @@ BoundedTimeline<Phone> detectPhones(
 		logging::debug("Speech recognition -- start");
 		runParallel(processUtterance, utterances, threadCount, dialogProgressSink, getUtteranceProgressWeight);
 		logging::debug("Speech recognition -- end");
-
-		return result;
 	}
 	catch (...) {
 		std::throw_with_nested(runtime_error("Error performing speech recognition via Pocketsphinx."));
 	}
+
+	logging::debug("Detecting unknown sounds");
+	Timeline<void> unknownSounds = getUnknownSounds(utterances, phones);
+	for (const auto& unknownSound : unknownSounds) {
+		phones.set(unknownSound.getTimeRange(), Phone::Noise);
+	}
+	for (const auto& timedPhone : phones) {
+		logging::logTimedEvent("phone", timedPhone);
+	}
+
+	return phones;
 }