From 206cde46582c21afa0cf599a49c691396156065d Mon Sep 17 00:00:00 2001 From: Daniel Wolf Date: Thu, 11 Aug 2016 10:18:03 +0200 Subject: [PATCH] Supporting noises (breathing, smacking, etc.) --- extras/SonyVegas/Debug Rhubarb.cs | 1 + src/Phone.cpp | 22 +++++++++-- src/Phone.h | 14 +++++-- src/audio/voiceActivityDetection.cpp | 6 --- src/g2p.cpp | 4 +- src/mouthAnimation.cpp | 8 +++- src/phoneExtraction.cpp | 57 +++++++++++++++++++++++----- 7 files changed, 87 insertions(+), 25 deletions(-) diff --git a/extras/SonyVegas/Debug Rhubarb.cs b/extras/SonyVegas/Debug Rhubarb.cs index 7c12cc7..c13fa76 100644 --- a/extras/SonyVegas/Debug Rhubarb.cs +++ b/extras/SonyVegas/Debug Rhubarb.cs @@ -258,6 +258,7 @@ public class Visualization { public enum EventType { Utterance, Word, + RawPhone, Phone, Shape } diff --git a/src/Phone.cpp b/src/Phone.cpp index 6d0c5a3..e325f0f 100644 --- a/src/Phone.cpp +++ b/src/Phone.cpp @@ -14,7 +14,6 @@ string PhoneConverter::getTypeName() { EnumConverter::member_data PhoneConverter::getMemberData() { return member_data{ - { Phone::Unknown, "Unknown" }, { Phone::AO, "AO" }, { Phone::AA, "AA" }, { Phone::IY, "IY" }, @@ -30,6 +29,7 @@ EnumConverter::member_data PhoneConverter::getMemberData() { { Phone::AW, "AW" }, { Phone::OY, "OY" }, { Phone::ER, "ER" }, + { Phone::P, "P" }, { Phone::B, "B" }, { Phone::T, "T" }, @@ -53,13 +53,29 @@ EnumConverter::member_data PhoneConverter::getMemberData() { { Phone::L, "L" }, { Phone::R, "R" }, { Phone::Y, "Y" }, - { Phone::W, "W" } + { Phone::W, "W" }, + + { Phone::Breath, "Breath" }, + { Phone::Cough, "Cough" }, + { Phone::Smack, "Smack" }, + { Phone::Noise, "Noise" } }; } optional PhoneConverter::tryParse(const string& s) { auto result = EnumConverter::tryParse(s); - return result ? 
result : Phone::Unknown; + if (result) return result; + + if (s == "+BREATH+") { + return Phone::Breath; + } + if (s == "+COUGH+") { + return Phone::Cough; + } + if (s == "+SMACK+") { + return Phone::Smack; + } + return Phone::Noise; } std::ostream& operator<<(std::ostream& stream, Phone value) { diff --git a/src/Phone.h b/src/Phone.h index ef9183c..5d37f7e 100644 --- a/src/Phone.h +++ b/src/Phone.h @@ -1,11 +1,9 @@ -#pragma once +#pragma once #include "EnumConverter.h" // Defines a subset of the Arpabet enum class Phone { - Unknown, - ///////// // Vowels @@ -67,7 +65,15 @@ enum class Phone { // ... semivowels Y, // [j] as in [y]es - W // [w] as in [w]ay + W, // [w] as in [w]ay + + ///////////// + // Misc. + + Breath, + Cough, + Smack, + Noise }; class PhoneConverter : public EnumConverter { diff --git a/src/audio/voiceActivityDetection.cpp b/src/audio/voiceActivityDetection.cpp index ec7abc2..b795ed6 100644 --- a/src/audio/voiceActivityDetection.cpp +++ b/src/audio/voiceActivityDetection.cpp @@ -100,12 +100,6 @@ BoundedTimeline detectVoiceActivity(const AudioClip& inputAudioClip, Progr } } - // Pad each activity to give the recognizer some breathing room - const centiseconds padding(3); - for (const auto& element : BoundedTimeline(activity)) { - activity.set(element.getStart() - padding, element.getEnd() + padding); - } - logging::debugFormat("Found {} sections of voice activity: {}", activity.size(), join(activity | transformed([](const Timed& t) { return format("{0}-{1}", t.getStart(), t.getEnd()); }), ", ")); diff --git a/src/g2p.cpp b/src/g2p.cpp index 424f6da..7014bd9 100644 --- a/src/g2p.cpp +++ b/src/g2p.cpp @@ -65,7 +65,7 @@ Phone charToPhone(wchar_t c) { case L'l': return Phone::L; case L'h': return Phone::HH; } - return Phone::Unknown; + return Phone::Noise; } vector wordToPhones(const std::string& word) { @@ -91,7 +91,7 @@ vector wordToPhones(const std::string& word) { vector result; for (wchar_t c : wideWord) { Phone phone = charToPhone(c); - if (phone 
== Phone::Unknown) { + if (phone == Phone::Noise) { logging::errorFormat("G2P error determining pronunciation for '{}': Character '{}' is not a recognized phone shorthand.", word, static_cast(c)); } else { diff --git a/src/mouthAnimation.cpp b/src/mouthAnimation.cpp index c190586..cd5d83b 100644 --- a/src/mouthAnimation.cpp +++ b/src/mouthAnimation.cpp @@ -53,7 +53,6 @@ Timeline animate(optional phone, centiseconds duration, centiseco if (!phone) return single({ X }); switch (*phone) { - case Phone::Unknown: return single({ B }); case Phone::AO: return single({ E }); case Phone::AA: return single({ D }); case Phone::IY: return single({ B }); @@ -69,6 +68,7 @@ Timeline animate(optional phone, centiseconds duration, centiseco case Phone::AW: return diphtong({ D }, { F }); case Phone::OY: return diphtong({ F }, { B }); case Phone::ER: return single({ { B }, 7_cs, { E } }); + case Phone::P: case Phone::B: return bilabialStop(); case Phone::T: @@ -93,6 +93,12 @@ Timeline animate(optional phone, centiseconds duration, centiseco case Phone::R: return single({ { B, B, B, B, F } }); case Phone::Y: return single({ B }); case Phone::W: return single({ F }); + + case Phone::Breath: + case Phone::Cough: + case Phone::Smack: return single({ C }); + case Phone::Noise: return single({ B }); + default: throw std::invalid_argument("Unexpected phone."); } diff --git a/src/phoneExtraction.cpp b/src/phoneExtraction.cpp index 4a9244a..bf48b7d 100644 --- a/src/phoneExtraction.cpp +++ b/src/phoneExtraction.cpp @@ -323,15 +323,39 @@ Timeline utteranceToPhones( // Align the words' phones with speech Timeline segmentPhones = getPhoneAlignment(wordIds, *clipSegment, decoder, alignmentProgressSink) - .value_or(ContinuousTimeline(clipSegment->getTruncatedRange(), Phone::Unknown)); + .value_or(ContinuousTimeline(clipSegment->getTruncatedRange(), Phone::Noise)); segmentPhones.shift(utterance.getStart()); for (const auto& timedPhone : segmentPhones) { - logging::logTimedEvent("phone", timedPhone); 
+ logging::logTimedEvent("rawPhone", timedPhone); } return segmentPhones; } +Timeline getUnknownSounds(const Timeline& utterances, const Timeline& phones) { + Timeline unknownSounds; + + // Find utterance parts without recognized phones + for (const auto& timedUtterance : utterances) { + unknownSounds.set(timedUtterance.getTimeRange()); + } + for (const auto& timedPhone : phones) { + unknownSounds.clear(timedPhone.getTimeRange()); + } + + // Remove undesired elements + const centiseconds minSoundLength = 5_cs; + for (const auto& unknownSound : Timeline(unknownSounds)) { + bool startsAtZero = unknownSound.getStart() == 0_cs; + bool tooShort = unknownSound.getTimeRange().getLength() < minSoundLength; + if (startsAtZero || tooShort) { + unknownSounds.clear(unknownSound.getTimeRange()); + } + } + + return unknownSounds; +} + BoundedTimeline detectPhones( const AudioClip& inputAudioClip, optional dialog, @@ -378,24 +402,30 @@ BoundedTimeline detectPhones( decoderPool.push(std::move(decoder)); }; - BoundedTimeline result(audioClip->getTruncatedRange()); + BoundedTimeline phones(audioClip->getTruncatedRange()); std::mutex resultMutex; auto processUtterance = [&](Timed timedUtterance, ProgressSink& utteranceProgressSink) { logging::logTimedEvent("utterance", timedUtterance.getTimeRange(), string("")); + // Pad time range to give the recognizer some breathing room + TimeRange paddedTimeRange = timedUtterance.getTimeRange(); + const centiseconds padding(3); + paddedTimeRange.grow(padding); + paddedTimeRange.trim(audioClip->getTruncatedRange()); + // Detect phones for utterance auto decoder = getDecoder(); bool decoderIsStillUsable = true; - Timeline phones = - utteranceToPhones(*audioClip, timedUtterance.getTimeRange(), *decoder, decoderIsStillUsable, utteranceProgressSink); + Timeline utterancePhones = + utteranceToPhones(*audioClip, paddedTimeRange, *decoder, decoderIsStillUsable, utteranceProgressSink); if (decoderIsStillUsable) { returnDecoder(std::move(decoder)); } // 
Copy phones to result timeline std::lock_guard lock(resultMutex); - for (const auto& timedPhone : phones) { - result.set(timedPhone); + for (const auto& timedPhone : utterancePhones) { + phones.set(timedPhone); } }; @@ -417,10 +447,19 @@ BoundedTimeline detectPhones( logging::debug("Speech recognition -- start"); runParallel(processUtterance, utterances, threadCount, dialogProgressSink, getUtteranceProgressWeight); logging::debug("Speech recognition -- end"); - - return result; } catch (...) { std::throw_with_nested(runtime_error("Error performing speech recognition via Pocketsphinx.")); } + + logging::debug("Detecting unknown sounds"); + Timeline unknownSounds = getUnknownSounds(utterances, phones); + for (const auto& unknownSound : unknownSounds) { + phones.set(unknownSound.getTimeRange(), Phone::Noise); + } + for (const auto& timedPhone : phones) { + logging::logTimedEvent("phone", timedPhone); + } + + return phones; }