Supporting noises (breathing, smacking, etc.)
This commit is contained in:
parent
bd1f8226ec
commit
206cde4658
|
@ -258,6 +258,7 @@ public class Visualization {
|
||||||
public enum EventType {
|
public enum EventType {
|
||||||
Utterance,
|
Utterance,
|
||||||
Word,
|
Word,
|
||||||
|
RawPhone,
|
||||||
Phone,
|
Phone,
|
||||||
Shape
|
Shape
|
||||||
}
|
}
|
||||||
|
|
|
@ -14,7 +14,6 @@ string PhoneConverter::getTypeName() {
|
||||||
|
|
||||||
EnumConverter<Phone>::member_data PhoneConverter::getMemberData() {
|
EnumConverter<Phone>::member_data PhoneConverter::getMemberData() {
|
||||||
return member_data{
|
return member_data{
|
||||||
{ Phone::Unknown, "Unknown" },
|
|
||||||
{ Phone::AO, "AO" },
|
{ Phone::AO, "AO" },
|
||||||
{ Phone::AA, "AA" },
|
{ Phone::AA, "AA" },
|
||||||
{ Phone::IY, "IY" },
|
{ Phone::IY, "IY" },
|
||||||
|
@ -30,6 +29,7 @@ EnumConverter<Phone>::member_data PhoneConverter::getMemberData() {
|
||||||
{ Phone::AW, "AW" },
|
{ Phone::AW, "AW" },
|
||||||
{ Phone::OY, "OY" },
|
{ Phone::OY, "OY" },
|
||||||
{ Phone::ER, "ER" },
|
{ Phone::ER, "ER" },
|
||||||
|
|
||||||
{ Phone::P, "P" },
|
{ Phone::P, "P" },
|
||||||
{ Phone::B, "B" },
|
{ Phone::B, "B" },
|
||||||
{ Phone::T, "T" },
|
{ Phone::T, "T" },
|
||||||
|
@ -53,13 +53,29 @@ EnumConverter<Phone>::member_data PhoneConverter::getMemberData() {
|
||||||
{ Phone::L, "L" },
|
{ Phone::L, "L" },
|
||||||
{ Phone::R, "R" },
|
{ Phone::R, "R" },
|
||||||
{ Phone::Y, "Y" },
|
{ Phone::Y, "Y" },
|
||||||
{ Phone::W, "W" }
|
{ Phone::W, "W" },
|
||||||
|
|
||||||
|
{ Phone::Breath, "Breath" },
|
||||||
|
{ Phone::Cough, "Cough" },
|
||||||
|
{ Phone::Smack, "Smack" },
|
||||||
|
{ Phone::Noise, "Noise" }
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
optional<Phone> PhoneConverter::tryParse(const string& s) {
|
optional<Phone> PhoneConverter::tryParse(const string& s) {
|
||||||
auto result = EnumConverter<Phone>::tryParse(s);
|
auto result = EnumConverter<Phone>::tryParse(s);
|
||||||
return result ? result : Phone::Unknown;
|
if (result) return result;
|
||||||
|
|
||||||
|
if (s == "+BREATH+") {
|
||||||
|
return Phone::Breath;
|
||||||
|
}
|
||||||
|
if (s == "+COUGH+") {
|
||||||
|
return Phone::Cough;
|
||||||
|
}
|
||||||
|
if (s == "+SMACK+") {
|
||||||
|
return Phone::Smack;
|
||||||
|
}
|
||||||
|
return Phone::Noise;
|
||||||
}
|
}
|
||||||
|
|
||||||
std::ostream& operator<<(std::ostream& stream, Phone value) {
|
std::ostream& operator<<(std::ostream& stream, Phone value) {
|
||||||
|
|
14
src/Phone.h
14
src/Phone.h
|
@ -1,11 +1,9 @@
|
||||||
#pragma once
|
#pragma once
|
||||||
|
|
||||||
#include "EnumConverter.h"
|
#include "EnumConverter.h"
|
||||||
|
|
||||||
// Defines a subset of the Arpabet
|
// Defines a subset of the Arpabet
|
||||||
enum class Phone {
|
enum class Phone {
|
||||||
Unknown,
|
|
||||||
|
|
||||||
/////////
|
/////////
|
||||||
// Vowels
|
// Vowels
|
||||||
|
|
||||||
|
@ -67,7 +65,15 @@ enum class Phone {
|
||||||
|
|
||||||
// ... semivowels
|
// ... semivowels
|
||||||
Y, // [j] as in [y]es
|
Y, // [j] as in [y]es
|
||||||
W // [w] as in [w]ay
|
W, // [w] as in [w]ay
|
||||||
|
|
||||||
|
/////////////
|
||||||
|
// Misc.
|
||||||
|
|
||||||
|
Breath,
|
||||||
|
Cough,
|
||||||
|
Smack,
|
||||||
|
Noise
|
||||||
};
|
};
|
||||||
|
|
||||||
class PhoneConverter : public EnumConverter<Phone> {
|
class PhoneConverter : public EnumConverter<Phone> {
|
||||||
|
|
|
@ -100,12 +100,6 @@ BoundedTimeline<void> detectVoiceActivity(const AudioClip& inputAudioClip, Progr
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Pad each activity to give the recognizer some breathing room
|
|
||||||
const centiseconds padding(3);
|
|
||||||
for (const auto& element : BoundedTimeline<void>(activity)) {
|
|
||||||
activity.set(element.getStart() - padding, element.getEnd() + padding);
|
|
||||||
}
|
|
||||||
|
|
||||||
logging::debugFormat("Found {} sections of voice activity: {}", activity.size(),
|
logging::debugFormat("Found {} sections of voice activity: {}", activity.size(),
|
||||||
join(activity | transformed([](const Timed<void>& t) { return format("{0}-{1}", t.getStart(), t.getEnd()); }), ", "));
|
join(activity | transformed([](const Timed<void>& t) { return format("{0}-{1}", t.getStart(), t.getEnd()); }), ", "));
|
||||||
|
|
||||||
|
|
|
@ -65,7 +65,7 @@ Phone charToPhone(wchar_t c) {
|
||||||
case L'l': return Phone::L;
|
case L'l': return Phone::L;
|
||||||
case L'h': return Phone::HH;
|
case L'h': return Phone::HH;
|
||||||
}
|
}
|
||||||
return Phone::Unknown;
|
return Phone::Noise;
|
||||||
}
|
}
|
||||||
|
|
||||||
vector<Phone> wordToPhones(const std::string& word) {
|
vector<Phone> wordToPhones(const std::string& word) {
|
||||||
|
@ -91,7 +91,7 @@ vector<Phone> wordToPhones(const std::string& word) {
|
||||||
vector<Phone> result;
|
vector<Phone> result;
|
||||||
for (wchar_t c : wideWord) {
|
for (wchar_t c : wideWord) {
|
||||||
Phone phone = charToPhone(c);
|
Phone phone = charToPhone(c);
|
||||||
if (phone == Phone::Unknown) {
|
if (phone == Phone::Noise) {
|
||||||
logging::errorFormat("G2P error determining pronunciation for '{}': Character '{}' is not a recognized phone shorthand.",
|
logging::errorFormat("G2P error determining pronunciation for '{}': Character '{}' is not a recognized phone shorthand.",
|
||||||
word, static_cast<char>(c));
|
word, static_cast<char>(c));
|
||||||
} else {
|
} else {
|
||||||
|
|
|
@ -53,7 +53,6 @@ Timeline<Viseme> animate(optional<Phone> phone, centiseconds duration, centiseco
|
||||||
if (!phone) return single({ X });
|
if (!phone) return single({ X });
|
||||||
|
|
||||||
switch (*phone) {
|
switch (*phone) {
|
||||||
case Phone::Unknown: return single({ B });
|
|
||||||
case Phone::AO: return single({ E });
|
case Phone::AO: return single({ E });
|
||||||
case Phone::AA: return single({ D });
|
case Phone::AA: return single({ D });
|
||||||
case Phone::IY: return single({ B });
|
case Phone::IY: return single({ B });
|
||||||
|
@ -69,6 +68,7 @@ Timeline<Viseme> animate(optional<Phone> phone, centiseconds duration, centiseco
|
||||||
case Phone::AW: return diphtong({ D }, { F });
|
case Phone::AW: return diphtong({ D }, { F });
|
||||||
case Phone::OY: return diphtong({ F }, { B });
|
case Phone::OY: return diphtong({ F }, { B });
|
||||||
case Phone::ER: return single({ { B }, 7_cs, { E } });
|
case Phone::ER: return single({ { B }, 7_cs, { E } });
|
||||||
|
|
||||||
case Phone::P:
|
case Phone::P:
|
||||||
case Phone::B: return bilabialStop();
|
case Phone::B: return bilabialStop();
|
||||||
case Phone::T:
|
case Phone::T:
|
||||||
|
@ -93,6 +93,12 @@ Timeline<Viseme> animate(optional<Phone> phone, centiseconds duration, centiseco
|
||||||
case Phone::R: return single({ { B, B, B, B, F } });
|
case Phone::R: return single({ { B, B, B, B, F } });
|
||||||
case Phone::Y: return single({ B });
|
case Phone::Y: return single({ B });
|
||||||
case Phone::W: return single({ F });
|
case Phone::W: return single({ F });
|
||||||
|
|
||||||
|
case Phone::Breath:
|
||||||
|
case Phone::Cough:
|
||||||
|
case Phone::Smack: return single({ C });
|
||||||
|
case Phone::Noise: return single({ B });
|
||||||
|
|
||||||
default:
|
default:
|
||||||
throw std::invalid_argument("Unexpected phone.");
|
throw std::invalid_argument("Unexpected phone.");
|
||||||
}
|
}
|
||||||
|
|
|
@ -323,15 +323,39 @@ Timeline<Phone> utteranceToPhones(
|
||||||
|
|
||||||
// Align the words' phones with speech
|
// Align the words' phones with speech
|
||||||
Timeline<Phone> segmentPhones = getPhoneAlignment(wordIds, *clipSegment, decoder, alignmentProgressSink)
|
Timeline<Phone> segmentPhones = getPhoneAlignment(wordIds, *clipSegment, decoder, alignmentProgressSink)
|
||||||
.value_or(ContinuousTimeline<Phone>(clipSegment->getTruncatedRange(), Phone::Unknown));
|
.value_or(ContinuousTimeline<Phone>(clipSegment->getTruncatedRange(), Phone::Noise));
|
||||||
segmentPhones.shift(utterance.getStart());
|
segmentPhones.shift(utterance.getStart());
|
||||||
for (const auto& timedPhone : segmentPhones) {
|
for (const auto& timedPhone : segmentPhones) {
|
||||||
logging::logTimedEvent("phone", timedPhone);
|
logging::logTimedEvent("rawPhone", timedPhone);
|
||||||
}
|
}
|
||||||
|
|
||||||
return segmentPhones;
|
return segmentPhones;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Timeline<void> getUnknownSounds(const Timeline<void>& utterances, const Timeline<Phone>& phones) {
|
||||||
|
Timeline<void> unknownSounds;
|
||||||
|
|
||||||
|
// Find utterance parts without recognized phones
|
||||||
|
for (const auto& timedUtterance : utterances) {
|
||||||
|
unknownSounds.set(timedUtterance.getTimeRange());
|
||||||
|
}
|
||||||
|
for (const auto& timedPhone : phones) {
|
||||||
|
unknownSounds.clear(timedPhone.getTimeRange());
|
||||||
|
}
|
||||||
|
|
||||||
|
// Remove undesired elements
|
||||||
|
const centiseconds minSoundLength = 5_cs;
|
||||||
|
for (const auto& unknownSound : Timeline<void>(unknownSounds)) {
|
||||||
|
bool startsAtZero = unknownSound.getStart() == 0_cs;
|
||||||
|
bool tooShort = unknownSound.getTimeRange().getLength() < minSoundLength;
|
||||||
|
if (startsAtZero || tooShort) {
|
||||||
|
unknownSounds.clear(unknownSound.getTimeRange());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return unknownSounds;
|
||||||
|
}
|
||||||
|
|
||||||
BoundedTimeline<Phone> detectPhones(
|
BoundedTimeline<Phone> detectPhones(
|
||||||
const AudioClip& inputAudioClip,
|
const AudioClip& inputAudioClip,
|
||||||
optional<u32string> dialog,
|
optional<u32string> dialog,
|
||||||
|
@ -378,24 +402,30 @@ BoundedTimeline<Phone> detectPhones(
|
||||||
decoderPool.push(std::move(decoder));
|
decoderPool.push(std::move(decoder));
|
||||||
};
|
};
|
||||||
|
|
||||||
BoundedTimeline<Phone> result(audioClip->getTruncatedRange());
|
BoundedTimeline<Phone> phones(audioClip->getTruncatedRange());
|
||||||
std::mutex resultMutex;
|
std::mutex resultMutex;
|
||||||
auto processUtterance = [&](Timed<void> timedUtterance, ProgressSink& utteranceProgressSink) {
|
auto processUtterance = [&](Timed<void> timedUtterance, ProgressSink& utteranceProgressSink) {
|
||||||
logging::logTimedEvent("utterance", timedUtterance.getTimeRange(), string(""));
|
logging::logTimedEvent("utterance", timedUtterance.getTimeRange(), string(""));
|
||||||
|
|
||||||
|
// Pad time range to give the recognizer some breathing room
|
||||||
|
TimeRange paddedTimeRange = timedUtterance.getTimeRange();
|
||||||
|
const centiseconds padding(3);
|
||||||
|
paddedTimeRange.grow(padding);
|
||||||
|
paddedTimeRange.trim(audioClip->getTruncatedRange());
|
||||||
|
|
||||||
// Detect phones for utterance
|
// Detect phones for utterance
|
||||||
auto decoder = getDecoder();
|
auto decoder = getDecoder();
|
||||||
bool decoderIsStillUsable = true;
|
bool decoderIsStillUsable = true;
|
||||||
Timeline<Phone> phones =
|
Timeline<Phone> utterancePhones =
|
||||||
utteranceToPhones(*audioClip, timedUtterance.getTimeRange(), *decoder, decoderIsStillUsable, utteranceProgressSink);
|
utteranceToPhones(*audioClip, paddedTimeRange, *decoder, decoderIsStillUsable, utteranceProgressSink);
|
||||||
if (decoderIsStillUsable) {
|
if (decoderIsStillUsable) {
|
||||||
returnDecoder(std::move(decoder));
|
returnDecoder(std::move(decoder));
|
||||||
}
|
}
|
||||||
|
|
||||||
// Copy phones to result timeline
|
// Copy phones to result timeline
|
||||||
std::lock_guard<std::mutex> lock(resultMutex);
|
std::lock_guard<std::mutex> lock(resultMutex);
|
||||||
for (const auto& timedPhone : phones) {
|
for (const auto& timedPhone : utterancePhones) {
|
||||||
result.set(timedPhone);
|
phones.set(timedPhone);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -417,10 +447,19 @@ BoundedTimeline<Phone> detectPhones(
|
||||||
logging::debug("Speech recognition -- start");
|
logging::debug("Speech recognition -- start");
|
||||||
runParallel(processUtterance, utterances, threadCount, dialogProgressSink, getUtteranceProgressWeight);
|
runParallel(processUtterance, utterances, threadCount, dialogProgressSink, getUtteranceProgressWeight);
|
||||||
logging::debug("Speech recognition -- end");
|
logging::debug("Speech recognition -- end");
|
||||||
|
|
||||||
return result;
|
|
||||||
}
|
}
|
||||||
catch (...) {
|
catch (...) {
|
||||||
std::throw_with_nested(runtime_error("Error performing speech recognition via Pocketsphinx."));
|
std::throw_with_nested(runtime_error("Error performing speech recognition via Pocketsphinx."));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
logging::debug("Detecting unknown sounds");
|
||||||
|
Timeline<void> unknownSounds = getUnknownSounds(utterances, phones);
|
||||||
|
for (const auto& unknownSound : unknownSounds) {
|
||||||
|
phones.set(unknownSound.getTimeRange(), Phone::Noise);
|
||||||
|
}
|
||||||
|
for (const auto& timedPhone : phones) {
|
||||||
|
logging::logTimedEvent("phone", timedPhone);
|
||||||
|
}
|
||||||
|
|
||||||
|
return phones;
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue