2018-10-08 18:30:45 +00:00
|
|
|
#include "PocketSphinxRecognizer.h"
|
2016-02-09 21:08:11 +00:00
|
|
|
#include <regex>
|
2016-02-29 19:58:58 +00:00
|
|
|
#include <gsl_util.h>
|
2017-09-10 20:17:17 +00:00
|
|
|
#include "audio/AudioSegment.h"
|
2018-10-08 18:30:45 +00:00
|
|
|
#include "audio/SampleRateConverter.h"
|
2016-06-03 19:07:49 +00:00
|
|
|
#include "languageModels.h"
|
|
|
|
#include "tokenization.h"
|
|
|
|
#include "g2p.h"
|
2017-09-10 20:17:17 +00:00
|
|
|
#include "time/ContinuousTimeline.h"
|
|
|
|
#include "audio/processing.h"
|
|
|
|
#include "time/timedLogging.h"
|
2015-11-19 20:17:35 +00:00
|
|
|
|
2015-12-21 12:09:09 +00:00
|
|
|
extern "C" {
|
2016-01-19 21:05:06 +00:00
|
|
|
#include <state_align_search.h>
|
2015-12-21 12:09:09 +00:00
|
|
|
}
|
|
|
|
|
2015-11-18 19:59:03 +00:00
|
|
|
using std::runtime_error;
|
2016-02-09 21:08:11 +00:00
|
|
|
using std::invalid_argument;
|
2015-11-18 19:59:03 +00:00
|
|
|
using std::unique_ptr;
|
|
|
|
using std::string;
|
2016-01-19 21:05:06 +00:00
|
|
|
using std::vector;
|
2015-11-18 19:59:03 +00:00
|
|
|
using std::map;
|
2021-06-18 20:00:01 +00:00
|
|
|
using std::filesystem::path;
|
2016-02-09 21:08:11 +00:00
|
|
|
using std::regex;
|
|
|
|
using std::regex_replace;
|
2016-06-03 19:07:49 +00:00
|
|
|
using boost::optional;
|
2016-10-21 19:41:50 +00:00
|
|
|
using std::array;
|
2015-11-18 19:59:03 +00:00
|
|
|
|
2018-10-08 18:30:45 +00:00
|
|
|
// Tells whether the specified PocketSphinx dictionary contains an entry for the given word.
bool dictionaryContains(dict_t& dictionary, const string& word) {
	const s3wid_t wordId = dict_wordid(&dictionary, word.c_str());
	return wordId != BAD_S3WID;
}
|
|
|
|
|
2016-05-17 12:16:16 +00:00
|
|
|
// Looks up the ID of the given word in the specified dictionary.
// Throws invalid_argument if the word is unknown.
s3wid_t getWordId(const string& word, dict_t& dictionary) {
	const s3wid_t result = dict_wordid(&dictionary, word.c_str());
	if (result != BAD_S3WID) {
		return result;
	}
	throw invalid_argument(fmt::format("Unknown word '{}'.", word));
}
|
|
|
|
|
2016-06-03 19:07:49 +00:00
|
|
|
void addMissingDictionaryWords(const vector<string>& words, ps_decoder_t& decoder) {
|
|
|
|
map<string, string> missingPronunciations;
|
|
|
|
for (const string& word : words) {
|
2016-06-25 19:52:04 +00:00
|
|
|
if (!dictionaryContains(*decoder.dict, word)) {
|
2016-06-03 19:07:49 +00:00
|
|
|
string pronunciation;
|
|
|
|
for (Phone phone : wordToPhones(word)) {
|
|
|
|
if (pronunciation.length() > 0) pronunciation += " ";
|
|
|
|
pronunciation += PhoneConverter::get().toString(phone);
|
|
|
|
}
|
|
|
|
missingPronunciations[word] = pronunciation;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
for (auto it = missingPronunciations.begin(); it != missingPronunciations.end(); ++it) {
|
2018-10-08 18:30:45 +00:00
|
|
|
const bool isLast = it == --missingPronunciations.end();
|
2016-06-03 19:07:49 +00:00
|
|
|
logging::infoFormat("Unknown word '{}'. Guessing pronunciation '{}'.", it->first, it->second);
|
|
|
|
ps_add_word(&decoder, it->first.c_str(), it->second.c_str(), isLast);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2016-10-21 19:41:50 +00:00
|
|
|
// Loads the generic US-English language model shipped with the Sphinx model files.
// Throws runtime_error if the model file cannot be read.
lambda_unique_ptr<ngram_model_t> createDefaultLanguageModel(ps_decoder_t& decoder) {
	const path modelPath = getSphinxModelDirectory() / "en-us.lm.bin";
	ngram_model_t* const rawModel =
		ngram_model_read(decoder.config, modelPath.u8string().c_str(), NGRAM_AUTO, decoder.lmath);
	lambda_unique_ptr<ngram_model_t> result(
		rawModel,
		[](ngram_model_t* lm) { ngram_model_free(lm); });
	if (!result) {
		throw runtime_error(fmt::format("Error reading language model from {}.", modelPath.u8string()));
	}
	return result;
}
|
|
|
|
|
2019-01-02 19:00:34 +00:00
|
|
|
// Builds a language model containing exactly the words of the expected dialog,
// adding any dialog words missing from the decoder's dictionary along the way.
lambda_unique_ptr<ngram_model_t> createDialogLanguageModel(
	ps_decoder_t& decoder,
	const string& dialog
) {
	// Tokenize the dialog into normalized words, consulting the dictionary
	// to decide which tokens are already known
	const auto isKnownWord = [&](const string& word) {
		return dictionaryContains(*decoder.dict, word);
	};
	vector<string> words = tokenizeText(dialog, isKnownWord);

	// Make sure every dialog word has a dictionary entry
	addMissingDictionaryWords(words, decoder);

	// Wrap the word sequence in sentence-start/end markers, then build the model
	words.insert(words.begin(), "<s>");
	words.emplace_back("</s>");
	return createLanguageModel(words, decoder);
}
|
|
|
|
|
2019-01-02 19:00:34 +00:00
|
|
|
// Creates a language model that is strongly biased towards the words of the given
// dialog, while the default model keeps arbitrary English recognizable.
// Throws runtime_error if the combined model cannot be created.
lambda_unique_ptr<ngram_model_t> createBiasedLanguageModel(
	ps_decoder_t& decoder,
	const string& dialog
) {
	auto defaultLanguageModel = createDefaultLanguageModel(decoder);
	auto dialogLanguageModel = createDialogLanguageModel(decoder, dialog);

	// Combine both models into a weighted model set, favoring the dialog model
	constexpr int modelCount = 2;
	array<ngram_model_t*, modelCount> languageModels {
		defaultLanguageModel.get(),
		dialogLanguageModel.get()
	};
	array<const char*, modelCount> modelNames { "defaultLM", "dialogLM" };
	array<float, modelCount> modelWeights { 0.1f, 0.9f };
	// ngram_model_set_init takes non-const name strings; the names are string literals
	// we never mutate, so the const_cast is safe here.
	ngram_model_t* const rawModelSet = ngram_model_set_init(
		nullptr,
		languageModels.data(),
		const_cast<char**>(modelNames.data()),
		modelWeights.data(),
		modelCount
	);
	lambda_unique_ptr<ngram_model_t> result(
		rawModelSet,
		[](ngram_model_t* lm) { ngram_model_free(lm); });
	if (!result) {
		throw runtime_error("Error creating biased language model.");
	}

	return result;
}
|
|
|
|
|
2018-10-08 18:30:45 +00:00
|
|
|
// Creates a fully configured PocketSphinx decoder.
// If a dialog text is given, the decoder uses a language model biased towards that
// dialog; otherwise, it uses the default US-English model.
// Throws runtime_error if configuration or decoder creation fails.
static lambda_unique_ptr<ps_decoder_t> createDecoder(optional<std::string> dialog) {
	lambda_unique_ptr<cmd_ln_t> config(
		cmd_ln_init(
			nullptr, ps_args(), true,
			// Set acoustic model
			"-hmm", (getSphinxModelDirectory() / "acoustic-model").u8string().c_str(),
			// Set pronunciation dictionary
			"-dict", (getSphinxModelDirectory() / "cmudict-en-us.dict").u8string().c_str(),
			// Add noise against zero silence
			// (see http://cmusphinx.sourceforge.net/wiki/faq#qwhy_my_accuracy_is_poor)
			"-dither", "yes",
			// Disable VAD -- we're doing that ourselves
			"-remove_silence", "no",
			// Perform per-utterance cepstral mean normalization
			"-cmn", "batch",
			nullptr),
		[](cmd_ln_t* config) { cmd_ln_free_r(config); });
	if (!config) throw runtime_error("Error creating configuration.");

	// Create the decoder from the configuration above
	lambda_unique_ptr<ps_decoder_t> decoder(
		ps_init(config.get()),
		[](ps_decoder_t* recognizer) { ps_free(recognizer); });
	if (!decoder) throw runtime_error("Error creating speech decoder.");

	// Set language model
	lambda_unique_ptr<ngram_model_t> languageModel(dialog
		? createBiasedLanguageModel(*decoder, *dialog)
		: createDefaultLanguageModel(*decoder));
	// Register the model under the name "lm", then make it the active search module
	ps_set_lm(decoder.get(), "lm", languageModel.get());
	ps_set_search(decoder.get(), "lm");

	return decoder;
}
|
|
|
|
|
2018-10-08 18:30:45 +00:00
|
|
|
// Performs forced alignment of the given word IDs against the audio signal,
// returning the recognized phones with their timings.
// Returns boost::none if the word list is empty or if the alignment search fails.
optional<Timeline<Phone>> getPhoneAlignment(
	const vector<s3wid_t>& wordIds,
	const vector<int16_t>& audioBuffer,
	ps_decoder_t& decoder)
{
	if (wordIds.empty()) return boost::none;

	// Create alignment list
	lambda_unique_ptr<ps_alignment_t> alignment(
		ps_alignment_init(decoder.d2p),
		[](ps_alignment_t* alignment) { ps_alignment_free(alignment); });
	if (!alignment) throw runtime_error("Error creating alignment.");
	for (s3wid_t wordId : wordIds) {
		// Add word. Initial value for duration is ignored.
		ps_alignment_add_word(alignment.get(), wordId, 0);
	}
	// Expand the word list into phones and HMM states
	int error = ps_alignment_populate(alignment.get());
	if (error) throw runtime_error("Error populating alignment struct.");

	// Create search structure
	acmod_t* acousticModel = decoder.acmod;
	lambda_unique_ptr<ps_search_t> search(
		state_align_search_init("state_align", decoder.config, acousticModel, alignment.get()),
		[](ps_search_t* search) { ps_search_free(search); });
	if (!search) throw runtime_error("Error creating search.");

	// Start recognition
	error = acmod_start_utt(acousticModel);
	if (error) throw runtime_error("Error starting utterance processing for alignment.");

	{
		// Eventually end recognition (even if an exception is thrown below)
		auto endRecognition = gsl::finally([&]() { acmod_end_utt(acousticModel); });

		// Start search
		ps_search_start(search.get());

		// Process entire audio clip.
		// acmod_process_raw advances nextSample and decrements remainingSamples;
		// the inner loop consumes every feature frame the acoustic model produced.
		const int16* nextSample = audioBuffer.data();
		size_t remainingSamples = audioBuffer.size();
		const bool fullUtterance = true;
		while (acmod_process_raw(acousticModel, &nextSample, &remainingSamples, fullUtterance) > 0) {
			while (acousticModel->n_feat_frame > 0) {
				ps_search_step(search.get(), acousticModel->output_frame);
				acmod_advance(acousticModel);
			}
		}

		// End search
		error = ps_search_finish(search.get());
		if (error) return boost::none;
	}

	// Extract phones with timestamps.
	// ciname maps context-independent phone IDs to their names.
	char** phoneNames = decoder.dict->mdef->ciname;
	Timeline<Phone> result;
	for (
		ps_alignment_iter_t* it = ps_alignment_phones(alignment.get());
		it;
		it = ps_alignment_iter_next(it)
	) {
		// Get phone
		ps_alignment_entry_t* phoneEntry = ps_alignment_iter_get(it);
		const s3cipid_t phoneId = phoneEntry->id.pid.cipid;
		string phoneName = phoneNames[phoneId];

		// Silence is not a phone; skip it
		if (phoneName == "SIL") continue;

		// Add entry. Start and duration are in centisecond frames.
		centiseconds start(phoneEntry->start);
		centiseconds duration(phoneEntry->duration);
		Phone phone = PhoneConverter::get().parse(phoneName);
		if (phone == Phone::AH && duration < 6_cs) {
			// Heuristic: < 6_cs is schwa. PocketSphinx doesn't differentiate.
			phone = Phone::Schwa;
		}
		const Timed<Phone> timedPhone(start, start + duration, phone);
		result.set(timedPhone);
	}
	return result;
}
|
|
|
|
|
2017-03-26 20:16:13 +00:00
|
|
|
// Some words have multiple pronunciations, one of which results in better animation
// than the others. Given a word with a pronunciation suffix (e.g. "to(2)"), this
// function maps a select set of such variants back to the preferred base form;
// any other word is returned unchanged.
std::string fixPronunciation(const std::string& word) {
	static const std::map<std::string, std::string> preferredForms {
		{ "into(2)", "into" },
		{ "to(2)", "to" },
		{ "to(3)", "to" },
		{ "today(2)", "today" },
		{ "tomorrow(2)", "tomorrow" },
		{ "tonight(2)", "tonight" }
	};

	const auto match = preferredForms.find(word);
	if (match == preferredForms.end()) {
		return word;
	}
	return match->second;
}
|
|
|
|
|
2018-10-08 18:30:45 +00:00
|
|
|
// Converts a single utterance of the audio clip into a timeline of phones:
// recognizes words via PocketSphinx, then force-aligns their phones with the audio.
// Progress is reported through the given sink, weighted between the two stages.
static Timeline<Phone> utteranceToPhones(
	const AudioClip& audioClip,
	TimeRange utteranceTimeRange,
	ps_decoder_t& decoder,
	ProgressSink& utteranceProgressSink
) {
	ProgressMerger utteranceProgressMerger(utteranceProgressSink);
	ProgressSink& wordRecognitionProgressSink =
		utteranceProgressMerger.addSource("word recognition (PocketSphinx recognizer)", 1.0);
	ProgressSink& alignmentProgressSink =
		utteranceProgressMerger.addSource("alignment (PocketSphinx recognizer)", 0.5);

	// Pad time range to give PocketSphinx some breathing room
	TimeRange paddedTimeRange = utteranceTimeRange;
	const centiseconds padding(3);
	paddedTimeRange.grow(padding);
	paddedTimeRange.trim(audioClip.getTruncatedRange());

	// Cut out the utterance and convert it to the sample rate PocketSphinx expects
	const unique_ptr<AudioClip> clipSegment = audioClip.clone()
		| segment(paddedTimeRange)
		| resample(sphinxSampleRate);
	const auto audioBuffer = copyTo16bitBuffer(*clipSegment);

	// Get words
	BoundedTimeline<string> words = recognizeWords(audioBuffer, decoder);
	wordRecognitionProgressSink.reportProgress(1.0);

	// Log utterance text
	string text;
	for (auto& timedWord : words) {
		string word = timedWord.getValue();
		// Skip details
		if (word == "<s>" || word == "</s>" || word == "<sil>") {
			continue;
		}
		// Strip pronunciation-variant suffixes like "(2)" for readable logging
		word = regex_replace(word, regex("\\(\\d\\)"), "");
		if (!text.empty()) {
			text += " ";
		}
		text += word;
	}
	logTimedEvent("utterance", utteranceTimeRange, text);

	// Log words, shifted from segment-relative to clip-relative time
	for (Timed<string> timedWord : words) {
		timedWord.getTimeRange().shift(paddedTimeRange.getStart());
		logTimedEvent("word", timedWord);
	}

	// Convert word strings to word IDs using dictionary
	vector<s3wid_t> wordIds;
	for (const auto& timedWord : words) {
		const string fixedWord = fixPronunciation(timedWord.getValue());
		wordIds.push_back(getWordId(fixedWord, *decoder.dict));
	}

	// Align the words' phones with speech.
	// If alignment fails, fall back to treating the whole segment as noise.
#if BOOST_VERSION < 105600 // Support legacy syntax
	#define value_or get_value_or
#endif
	Timeline<Phone> utterancePhones = getPhoneAlignment(wordIds, audioBuffer, decoder)
		.value_or(ContinuousTimeline<Phone>(clipSegment->getTruncatedRange(), Phone::Noise));
	alignmentProgressSink.reportProgress(1.0);
	utterancePhones.shift(paddedTimeRange.getStart());

	// Log raw phones
	for (const auto& timedPhone : utterancePhones) {
		logTimedEvent("rawPhone", timedPhone);
	}

	// Guess positions of noise sounds
	JoiningTimeline<void> noiseSounds = getNoiseSounds(utteranceTimeRange, utterancePhones);
	for (const auto& noiseSound : noiseSounds) {
		utterancePhones.set(noiseSound.getTimeRange(), Phone::Noise);
	}

	// Log phones
	for (const auto& timedPhone : utterancePhones) {
		logTimedEvent("phone", timedPhone);
	}

	return utterancePhones;
}
|
|
|
|
|
2018-10-08 18:30:45 +00:00
|
|
|
// Recognizes phones by delegating to the shared phone-recognition driver,
// plugging in the PocketSphinx-specific decoder factory and utterance processor.
BoundedTimeline<Phone> PocketSphinxRecognizer::recognizePhones(
	const AudioClip& inputAudioClip,
	optional<std::string> dialog,
	int maxThreadCount,
	ProgressSink& progressSink
) const {
	return ::recognizePhones(
		inputAudioClip,
		dialog,
		&createDecoder,
		&utteranceToPhones,
		maxThreadCount,
		progressSink
	);
}
|