Supporting noises (breathing, smacking, etc.)

This commit is contained in:
Daniel Wolf 2016-08-11 10:18:03 +02:00
parent bd1f8226ec
commit 206cde4658
7 changed files with 87 additions and 25 deletions

View File

@ -258,6 +258,7 @@ public class Visualization {
public enum EventType { public enum EventType {
Utterance, Utterance,
Word, Word,
RawPhone,
Phone, Phone,
Shape Shape
} }

View File

@ -14,7 +14,6 @@ string PhoneConverter::getTypeName() {
EnumConverter<Phone>::member_data PhoneConverter::getMemberData() { EnumConverter<Phone>::member_data PhoneConverter::getMemberData() {
return member_data{ return member_data{
{ Phone::Unknown, "Unknown" },
{ Phone::AO, "AO" }, { Phone::AO, "AO" },
{ Phone::AA, "AA" }, { Phone::AA, "AA" },
{ Phone::IY, "IY" }, { Phone::IY, "IY" },
@ -30,6 +29,7 @@ EnumConverter<Phone>::member_data PhoneConverter::getMemberData() {
{ Phone::AW, "AW" }, { Phone::AW, "AW" },
{ Phone::OY, "OY" }, { Phone::OY, "OY" },
{ Phone::ER, "ER" }, { Phone::ER, "ER" },
{ Phone::P, "P" }, { Phone::P, "P" },
{ Phone::B, "B" }, { Phone::B, "B" },
{ Phone::T, "T" }, { Phone::T, "T" },
@ -53,13 +53,29 @@ EnumConverter<Phone>::member_data PhoneConverter::getMemberData() {
{ Phone::L, "L" }, { Phone::L, "L" },
{ Phone::R, "R" }, { Phone::R, "R" },
{ Phone::Y, "Y" }, { Phone::Y, "Y" },
{ Phone::W, "W" } { Phone::W, "W" },
{ Phone::Breath, "Breath" },
{ Phone::Cough, "Cough" },
{ Phone::Smack, "Smack" },
{ Phone::Noise, "Noise" }
}; };
} }
optional<Phone> PhoneConverter::tryParse(const string& s) { optional<Phone> PhoneConverter::tryParse(const string& s) {
auto result = EnumConverter<Phone>::tryParse(s); auto result = EnumConverter<Phone>::tryParse(s);
return result ? result : Phone::Unknown; if (result) return result;
if (s == "+BREATH+") {
return Phone::Breath;
}
if (s == "+COUGH+") {
return Phone::Cough;
}
if (s == "+SMACK+") {
return Phone::Smack;
}
return Phone::Noise;
} }
std::ostream& operator<<(std::ostream& stream, Phone value) { std::ostream& operator<<(std::ostream& stream, Phone value) {

View File

@ -1,11 +1,9 @@
#pragma once #pragma once
#include "EnumConverter.h" #include "EnumConverter.h"
// Defines a subset of the Arpabet // Defines a subset of the Arpabet
enum class Phone { enum class Phone {
Unknown,
///////// /////////
// Vowels // Vowels
@ -67,7 +65,15 @@ enum class Phone {
// ... semivowels // ... semivowels
Y, // [j] as in [y]es Y, // [j] as in [y]es
W // [w] as in [w]ay W, // [w] as in [w]ay
/////////////
// Misc.
Breath,
Cough,
Smack,
Noise
}; };
class PhoneConverter : public EnumConverter<Phone> { class PhoneConverter : public EnumConverter<Phone> {

View File

@ -100,12 +100,6 @@ BoundedTimeline<void> detectVoiceActivity(const AudioClip& inputAudioClip, Progr
} }
} }
// Pad each activity to give the recognizer some breathing room
const centiseconds padding(3);
for (const auto& element : BoundedTimeline<void>(activity)) {
activity.set(element.getStart() - padding, element.getEnd() + padding);
}
logging::debugFormat("Found {} sections of voice activity: {}", activity.size(), logging::debugFormat("Found {} sections of voice activity: {}", activity.size(),
join(activity | transformed([](const Timed<void>& t) { return format("{0}-{1}", t.getStart(), t.getEnd()); }), ", ")); join(activity | transformed([](const Timed<void>& t) { return format("{0}-{1}", t.getStart(), t.getEnd()); }), ", "));

View File

@ -65,7 +65,7 @@ Phone charToPhone(wchar_t c) {
case L'l': return Phone::L; case L'l': return Phone::L;
case L'h': return Phone::HH; case L'h': return Phone::HH;
} }
return Phone::Unknown; return Phone::Noise;
} }
vector<Phone> wordToPhones(const std::string& word) { vector<Phone> wordToPhones(const std::string& word) {
@ -91,7 +91,7 @@ vector<Phone> wordToPhones(const std::string& word) {
vector<Phone> result; vector<Phone> result;
for (wchar_t c : wideWord) { for (wchar_t c : wideWord) {
Phone phone = charToPhone(c); Phone phone = charToPhone(c);
if (phone == Phone::Unknown) { if (phone == Phone::Noise) {
logging::errorFormat("G2P error determining pronunciation for '{}': Character '{}' is not a recognized phone shorthand.", logging::errorFormat("G2P error determining pronunciation for '{}': Character '{}' is not a recognized phone shorthand.",
word, static_cast<char>(c)); word, static_cast<char>(c));
} else { } else {

View File

@ -53,7 +53,6 @@ Timeline<Viseme> animate(optional<Phone> phone, centiseconds duration, centiseco
if (!phone) return single({ X }); if (!phone) return single({ X });
switch (*phone) { switch (*phone) {
case Phone::Unknown: return single({ B });
case Phone::AO: return single({ E }); case Phone::AO: return single({ E });
case Phone::AA: return single({ D }); case Phone::AA: return single({ D });
case Phone::IY: return single({ B }); case Phone::IY: return single({ B });
@ -69,6 +68,7 @@ Timeline<Viseme> animate(optional<Phone> phone, centiseconds duration, centiseco
case Phone::AW: return diphtong({ D }, { F }); case Phone::AW: return diphtong({ D }, { F });
case Phone::OY: return diphtong({ F }, { B }); case Phone::OY: return diphtong({ F }, { B });
case Phone::ER: return single({ { B }, 7_cs, { E } }); case Phone::ER: return single({ { B }, 7_cs, { E } });
case Phone::P: case Phone::P:
case Phone::B: return bilabialStop(); case Phone::B: return bilabialStop();
case Phone::T: case Phone::T:
@ -93,6 +93,12 @@ Timeline<Viseme> animate(optional<Phone> phone, centiseconds duration, centiseco
case Phone::R: return single({ { B, B, B, B, F } }); case Phone::R: return single({ { B, B, B, B, F } });
case Phone::Y: return single({ B }); case Phone::Y: return single({ B });
case Phone::W: return single({ F }); case Phone::W: return single({ F });
case Phone::Breath:
case Phone::Cough:
case Phone::Smack: return single({ C });
case Phone::Noise: return single({ B });
default: default:
throw std::invalid_argument("Unexpected phone."); throw std::invalid_argument("Unexpected phone.");
} }

View File

@ -323,15 +323,39 @@ Timeline<Phone> utteranceToPhones(
// Align the words' phones with speech // Align the words' phones with speech
Timeline<Phone> segmentPhones = getPhoneAlignment(wordIds, *clipSegment, decoder, alignmentProgressSink) Timeline<Phone> segmentPhones = getPhoneAlignment(wordIds, *clipSegment, decoder, alignmentProgressSink)
.value_or(ContinuousTimeline<Phone>(clipSegment->getTruncatedRange(), Phone::Unknown)); .value_or(ContinuousTimeline<Phone>(clipSegment->getTruncatedRange(), Phone::Noise));
segmentPhones.shift(utterance.getStart()); segmentPhones.shift(utterance.getStart());
for (const auto& timedPhone : segmentPhones) { for (const auto& timedPhone : segmentPhones) {
logging::logTimedEvent("phone", timedPhone); logging::logTimedEvent("rawPhone", timedPhone);
} }
return segmentPhones; return segmentPhones;
} }
// Returns the portions of the given utterances that are not covered by any phone.
// Uncovered portions that start at time zero, or that are shorter than 5 cs, are dropped.
Timeline<void> getUnknownSounds(const Timeline<void>& utterances, const Timeline<Phone>& phones) {
	// Mark every utterance range, then cut out whatever the phones already cover
	Timeline<void> gaps;
	for (const auto& utterance : utterances) {
		gaps.set(utterance.getTimeRange());
	}
	for (const auto& phone : phones) {
		gaps.clear(phone.getTimeRange());
	}

	// Filter out unwanted gaps. We loop over a copy because `gaps` is modified along the way.
	const centiseconds minimumLength = 5_cs;
	const Timeline<void> snapshot(gaps);
	for (const auto& gap : snapshot) {
		const auto range = gap.getTimeRange();
		if (gap.getStart() == 0_cs || range.getLength() < minimumLength) {
			gaps.clear(range);
		}
	}

	return gaps;
}
BoundedTimeline<Phone> detectPhones( BoundedTimeline<Phone> detectPhones(
const AudioClip& inputAudioClip, const AudioClip& inputAudioClip,
optional<u32string> dialog, optional<u32string> dialog,
@ -378,24 +402,30 @@ BoundedTimeline<Phone> detectPhones(
decoderPool.push(std::move(decoder)); decoderPool.push(std::move(decoder));
}; };
BoundedTimeline<Phone> result(audioClip->getTruncatedRange()); BoundedTimeline<Phone> phones(audioClip->getTruncatedRange());
std::mutex resultMutex; std::mutex resultMutex;
auto processUtterance = [&](Timed<void> timedUtterance, ProgressSink& utteranceProgressSink) { auto processUtterance = [&](Timed<void> timedUtterance, ProgressSink& utteranceProgressSink) {
logging::logTimedEvent("utterance", timedUtterance.getTimeRange(), string("")); logging::logTimedEvent("utterance", timedUtterance.getTimeRange(), string(""));
// Pad time range to give the recognizer some breathing room
TimeRange paddedTimeRange = timedUtterance.getTimeRange();
const centiseconds padding(3);
paddedTimeRange.grow(padding);
paddedTimeRange.trim(audioClip->getTruncatedRange());
// Detect phones for utterance // Detect phones for utterance
auto decoder = getDecoder(); auto decoder = getDecoder();
bool decoderIsStillUsable = true; bool decoderIsStillUsable = true;
Timeline<Phone> phones = Timeline<Phone> utterancePhones =
utteranceToPhones(*audioClip, timedUtterance.getTimeRange(), *decoder, decoderIsStillUsable, utteranceProgressSink); utteranceToPhones(*audioClip, paddedTimeRange, *decoder, decoderIsStillUsable, utteranceProgressSink);
if (decoderIsStillUsable) { if (decoderIsStillUsable) {
returnDecoder(std::move(decoder)); returnDecoder(std::move(decoder));
} }
// Copy phones to result timeline // Copy phones to result timeline
std::lock_guard<std::mutex> lock(resultMutex); std::lock_guard<std::mutex> lock(resultMutex);
for (const auto& timedPhone : phones) { for (const auto& timedPhone : utterancePhones) {
result.set(timedPhone); phones.set(timedPhone);
} }
}; };
@ -417,10 +447,19 @@ BoundedTimeline<Phone> detectPhones(
logging::debug("Speech recognition -- start"); logging::debug("Speech recognition -- start");
runParallel(processUtterance, utterances, threadCount, dialogProgressSink, getUtteranceProgressWeight); runParallel(processUtterance, utterances, threadCount, dialogProgressSink, getUtteranceProgressWeight);
logging::debug("Speech recognition -- end"); logging::debug("Speech recognition -- end");
return result;
} }
catch (...) { catch (...) {
std::throw_with_nested(runtime_error("Error performing speech recognition via Pocketsphinx.")); std::throw_with_nested(runtime_error("Error performing speech recognition via Pocketsphinx."));
} }
logging::debug("Detecting unknown sounds");
Timeline<void> unknownSounds = getUnknownSounds(utterances, phones);
for (const auto& unknownSound : unknownSounds) {
phones.set(unknownSound.getTimeRange(), Phone::Noise);
}
for (const auto& timedPhone : phones) {
logging::logTimedEvent("phone", timedPhone);
}
return phones;
} }