#include "PhoneticRecognizer.h" #include "time/Timeline.h" #include "audio/AudioSegment.h" #include "audio/SampleRateConverter.h" #include "audio/processing.h" #include "time/timedLogging.h" using std::runtime_error; using std::unique_ptr; using std::string; using boost::optional; static lambda_unique_ptr createDecoder(optional dialog) { UNUSED(dialog); lambda_unique_ptr config( cmd_ln_init( nullptr, ps_args(), true, // Set acoustic model "-hmm", (getSphinxModelDirectory() / "acoustic-model").u8string().c_str(), // Set phonetic language model "-allphone", (getSphinxModelDirectory() / "en-us-phone.lm.bin").u8string().c_str(), "-allphone_ci", "yes", // Set language model probability weight. // Low values (<= 0.4) can lead to fluttering animation. // High values (>= 1.0) can lead to imprecise or freezing animation. "-lw", "0.8", // Add noise against zero silence // (see http://cmusphinx.sourceforge.net/wiki/faq#qwhy_my_accuracy_is_poor) "-dither", "yes", // Disable VAD -- we're doing that ourselves "-remove_silence", "no", // Perform per-utterance cepstral mean normalization "-cmn", "batch", // The following settings are recommended at // http://cmusphinx.sourceforge.net/wiki/phonemerecognition // Set beam width applied to every frame in Viterbi search "-beam", "1e-20", // Set beam width applied to phone transitions "-pbeam", "1e-20", nullptr), [](cmd_ln_t* config) { cmd_ln_free_r(config); }); if (!config) throw runtime_error("Error creating configuration."); lambda_unique_ptr decoder( ps_init(config.get()), [](ps_decoder_t* recognizer) { ps_free(recognizer); }); if (!decoder) throw runtime_error("Error creating speech decoder."); return decoder; } static Timeline utteranceToPhones( const AudioClip& audioClip, TimeRange utteranceTimeRange, ps_decoder_t& decoder, ProgressSink& utteranceProgressSink ) { // Pad time range to give PocketSphinx some breathing room TimeRange paddedTimeRange = utteranceTimeRange; const centiseconds padding(3); paddedTimeRange.grow(padding); paddedTimeRange.trim(audioClip.getTruncatedRange()); const unique_ptr clipSegment = audioClip.clone() | segment(paddedTimeRange) | resample(sphinxSampleRate); const auto audioBuffer = copyTo16bitBuffer(*clipSegment); // Detect phones (returned as words) BoundedTimeline phoneStrings = recognizeWords(audioBuffer, decoder); phoneStrings.shift(paddedTimeRange.getStart()); Timeline utterancePhones; for (const auto& timedPhoneString : phoneStrings) { Phone phone = PhoneConverter::get().parse(timedPhoneString.getValue()); if (phone == Phone::AH && timedPhoneString.getDuration() < 6_cs) { // Heuristic: < 6_cs is schwa. PocketSphinx doesn't differentiate. phone = Phone::Schwa; } utterancePhones.set(timedPhoneString.getTimeRange(), phone); } // Log raw phones for (const auto& timedPhone : utterancePhones) { logTimedEvent("rawPhone", timedPhone); } // Guess positions of noise sounds JoiningTimeline noiseSounds = getNoiseSounds(utteranceTimeRange, utterancePhones); for (const auto& noiseSound : noiseSounds) { utterancePhones.set(noiseSound.getTimeRange(), Phone::Noise); } // Log phones for (const auto& timedPhone : utterancePhones) { logTimedEvent("phone", timedPhone); } utteranceProgressSink.reportProgress(1.0); return utterancePhones; } BoundedTimeline PhoneticRecognizer::recognizePhones( const AudioClip& inputAudioClip, optional dialog, int maxThreadCount, ProgressSink& progressSink ) const { return ::recognizePhones(inputAudioClip, dialog, &createDecoder, &utteranceToPhones, maxThreadCount, progressSink); }