114 lines
3.7 KiB
C++
114 lines
3.7 KiB
C++
#include "PhoneticRecognizer.h"
|
|
#include "time/Timeline.h"
|
|
#include "audio/AudioSegment.h"
|
|
#include "audio/SampleRateConverter.h"
|
|
#include "audio/processing.h"
|
|
#include "time/timedLogging.h"
|
|
|
|
using std::runtime_error;
|
|
using std::unique_ptr;
|
|
using std::string;
|
|
using boost::optional;
|
|
|
|
static lambda_unique_ptr<ps_decoder_t> createDecoder(optional<std::string> dialog) {
|
|
UNUSED(dialog);
|
|
|
|
lambda_unique_ptr<cmd_ln_t> config(
|
|
cmd_ln_init(
|
|
nullptr, ps_args(), true,
|
|
// Set acoustic model
|
|
"-hmm", (getSphinxModelDirectory() / "acoustic-model").u8string().c_str(),
|
|
// Set phonetic language model
|
|
"-allphone", (getSphinxModelDirectory() / "en-us-phone.lm.bin").u8string().c_str(),
|
|
"-allphone_ci", "yes",
|
|
// Set language model probability weight.
|
|
// Low values (<= 0.4) can lead to fluttering animation.
|
|
// High values (>= 1.0) can lead to imprecise or freezing animation.
|
|
"-lw", "0.8",
|
|
// Add noise against zero silence
|
|
// (see http://cmusphinx.sourceforge.net/wiki/faq#qwhy_my_accuracy_is_poor)
|
|
"-dither", "yes",
|
|
// Disable VAD -- we're doing that ourselves
|
|
"-remove_silence", "no",
|
|
// Perform per-utterance cepstral mean normalization
|
|
"-cmn", "batch",
|
|
|
|
// The following settings are recommended at
|
|
// http://cmusphinx.sourceforge.net/wiki/phonemerecognition
|
|
|
|
// Set beam width applied to every frame in Viterbi search
|
|
"-beam", "1e-20",
|
|
// Set beam width applied to phone transitions
|
|
"-pbeam", "1e-20",
|
|
nullptr),
|
|
[](cmd_ln_t* config) { cmd_ln_free_r(config); });
|
|
if (!config) throw runtime_error("Error creating configuration.");
|
|
|
|
lambda_unique_ptr<ps_decoder_t> decoder(
|
|
ps_init(config.get()),
|
|
[](ps_decoder_t* recognizer) { ps_free(recognizer); });
|
|
if (!decoder) throw runtime_error("Error creating speech decoder.");
|
|
|
|
return decoder;
|
|
}
|
|
|
|
static Timeline<Phone> utteranceToPhones(
|
|
const AudioClip& audioClip,
|
|
TimeRange utteranceTimeRange,
|
|
ps_decoder_t& decoder,
|
|
ProgressSink& utteranceProgressSink
|
|
) {
|
|
// Pad time range to give PocketSphinx some breathing room
|
|
TimeRange paddedTimeRange = utteranceTimeRange;
|
|
const centiseconds padding(3);
|
|
paddedTimeRange.grow(padding);
|
|
paddedTimeRange.trim(audioClip.getTruncatedRange());
|
|
|
|
const unique_ptr<AudioClip> clipSegment = audioClip.clone()
|
|
| segment(paddedTimeRange)
|
|
| resample(sphinxSampleRate);
|
|
const auto audioBuffer = copyTo16bitBuffer(*clipSegment);
|
|
|
|
// Detect phones (returned as words)
|
|
BoundedTimeline<string> phoneStrings = recognizeWords(audioBuffer, decoder);
|
|
phoneStrings.shift(paddedTimeRange.getStart());
|
|
Timeline<Phone> utterancePhones;
|
|
for (const auto& timedPhoneString : phoneStrings) {
|
|
Phone phone = PhoneConverter::get().parse(timedPhoneString.getValue());
|
|
if (phone == Phone::AH && timedPhoneString.getDuration() < 6_cs) {
|
|
// Heuristic: < 6_cs is schwa. PocketSphinx doesn't differentiate.
|
|
phone = Phone::Schwa;
|
|
}
|
|
utterancePhones.set(timedPhoneString.getTimeRange(), phone);
|
|
}
|
|
|
|
// Log raw phones
|
|
for (const auto& timedPhone : utterancePhones) {
|
|
logTimedEvent("rawPhone", timedPhone);
|
|
}
|
|
|
|
// Guess positions of noise sounds
|
|
JoiningTimeline<void> noiseSounds = getNoiseSounds(utteranceTimeRange, utterancePhones);
|
|
for (const auto& noiseSound : noiseSounds) {
|
|
utterancePhones.set(noiseSound.getTimeRange(), Phone::Noise);
|
|
}
|
|
|
|
// Log phones
|
|
for (const auto& timedPhone : utterancePhones) {
|
|
logTimedEvent("phone", timedPhone);
|
|
}
|
|
|
|
utteranceProgressSink.reportProgress(1.0);
|
|
|
|
return utterancePhones;
|
|
}
|
|
|
|
BoundedTimeline<Phone> PhoneticRecognizer::recognizePhones(
|
|
const AudioClip& inputAudioClip,
|
|
optional<std::string> dialog,
|
|
int maxThreadCount,
|
|
ProgressSink& progressSink
|
|
) const {
|
|
return ::recognizePhones(inputAudioClip, dialog, &createDecoder, &utteranceToPhones, maxThreadCount, progressSink);
|
|
}
|