rhubarb-lip-sync/rhubarb/src/recognition/PhoneticRecognizer.cpp

114 lines
3.7 KiB
C++

#include "PhoneticRecognizer.h"
#include "time/Timeline.h"
#include "audio/AudioSegment.h"
#include "audio/SampleRateConverter.h"
#include "audio/processing.h"
#include "time/timedLogging.h"
using std::runtime_error;
using std::unique_ptr;
using std::string;
using boost::optional;
static lambda_unique_ptr<ps_decoder_t> createDecoder(optional<std::string> dialog) {
UNUSED(dialog);
lambda_unique_ptr<cmd_ln_t> config(
cmd_ln_init(
nullptr, ps_args(), true,
// Set acoustic model
"-hmm", (getSphinxModelDirectory() / "acoustic-model").u8string().c_str(),
// Set phonetic language model
"-allphone", (getSphinxModelDirectory() / "en-us-phone.lm.bin").u8string().c_str(),
"-allphone_ci", "yes",
// Set language model probability weight.
// Low values (<= 0.4) can lead to fluttering animation.
// High values (>= 1.0) can lead to imprecise or freezing animation.
"-lw", "0.8",
// Add noise against zero silence
// (see http://cmusphinx.sourceforge.net/wiki/faq#qwhy_my_accuracy_is_poor)
"-dither", "yes",
// Disable VAD -- we're doing that ourselves
"-remove_silence", "no",
// Perform per-utterance cepstral mean normalization
"-cmn", "batch",
// The following settings are recommended at
// http://cmusphinx.sourceforge.net/wiki/phonemerecognition
// Set beam width applied to every frame in Viterbi search
"-beam", "1e-20",
// Set beam width applied to phone transitions
"-pbeam", "1e-20",
nullptr),
[](cmd_ln_t* config) { cmd_ln_free_r(config); });
if (!config) throw runtime_error("Error creating configuration.");
lambda_unique_ptr<ps_decoder_t> decoder(
ps_init(config.get()),
[](ps_decoder_t* recognizer) { ps_free(recognizer); });
if (!decoder) throw runtime_error("Error creating speech decoder.");
return decoder;
}
static Timeline<Phone> utteranceToPhones(
const AudioClip& audioClip,
TimeRange utteranceTimeRange,
ps_decoder_t& decoder,
ProgressSink& utteranceProgressSink
) {
// Pad time range to give PocketSphinx some breathing room
TimeRange paddedTimeRange = utteranceTimeRange;
const centiseconds padding(3);
paddedTimeRange.grow(padding);
paddedTimeRange.trim(audioClip.getTruncatedRange());
const unique_ptr<AudioClip> clipSegment = audioClip.clone()
| segment(paddedTimeRange)
| resample(sphinxSampleRate);
const auto audioBuffer = copyTo16bitBuffer(*clipSegment);
// Detect phones (returned as words)
BoundedTimeline<string> phoneStrings = recognizeWords(audioBuffer, decoder);
phoneStrings.shift(paddedTimeRange.getStart());
Timeline<Phone> utterancePhones;
for (const auto& timedPhoneString : phoneStrings) {
Phone phone = PhoneConverter::get().parse(timedPhoneString.getValue());
if (phone == Phone::AH && timedPhoneString.getDuration() < 6_cs) {
// Heuristic: < 6_cs is schwa. PocketSphinx doesn't differentiate.
phone = Phone::Schwa;
}
utterancePhones.set(timedPhoneString.getTimeRange(), phone);
}
// Log raw phones
for (const auto& timedPhone : utterancePhones) {
logTimedEvent("rawPhone", timedPhone);
}
// Guess positions of noise sounds
JoiningTimeline<void> noiseSounds = getNoiseSounds(utteranceTimeRange, utterancePhones);
for (const auto& noiseSound : noiseSounds) {
utterancePhones.set(noiseSound.getTimeRange(), Phone::Noise);
}
// Log phones
for (const auto& timedPhone : utterancePhones) {
logTimedEvent("phone", timedPhone);
}
utteranceProgressSink.reportProgress(1.0);
return utterancePhones;
}
BoundedTimeline<Phone> PhoneticRecognizer::recognizePhones(
const AudioClip& inputAudioClip,
optional<std::string> dialog,
int maxThreadCount,
ProgressSink& progressSink
) const {
return ::recognizePhones(inputAudioClip, dialog, &createDecoder, &utteranceToPhones, maxThreadCount, progressSink);
}