#include <boost/filesystem.hpp>
#include "phoneExtraction.h"
#include "audio/SampleRateConverter.h"
#include "platformTools.h"
#include "tools.h"
#include <format.h>
#include <s3types.h>
#include <regex>
#include <gsl_util.h>
#include <logging.h>
#include <audio/DCOffset.h>
#include <Timeline.h>
#include <audio/voiceActivityDetection.h>
#include "audio/AudioSegment.h"
#include "languageModels.h"
#include "tokenization.h"
#include "g2p.h"
#include "ContinuousTimeline.h"
#include "audio/processing.h"
#include "parallel.h"
#include <boost/algorithm/string/trim.hpp>
#include <algorithm>
#include <chrono>
#include <cstdarg>
#include <cstdio>
#include <iterator>
#include <mutex>
#include <stack>

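// PocketSphinx and Sphinxbase are plain C libraries; include their headers with C linkage
// so the C++ compiler doesn't mangle their symbol names.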
extern "C" {
#include <pocketsphinx.h>
#include <sphinxbase/err.h>
#include <ps_alignment.h>
#include <state_align_search.h>
#include <pocketsphinx_internal.h>
#include <ngram_search.h>
}

using std::runtime_error;
using std::invalid_argument;
using std::unique_ptr;
using std::shared_ptr;
using std::string;
using std::vector;
using std::map;
using boost::filesystem::path;
using std::function;
using std::regex;
using std::regex_replace;
using std::chrono::duration;
using boost::optional;
using std::u32string;
using std::chrono::duration_cast;

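// The bundled PocketSphinx acoustic model is trained on 16 kHz audio; every clip is
// resampled to this rate before it is handed to the decoder.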
constexpr int sphinxSampleRate = 16000;

const path& getSphinxModelDirectory() {
	static path sphinxModelDirectory(getBinDirectory() / "res/sphinx");
	return sphinxModelDirectory;
}

logging::Level convertSphinxErrorLevel(err_lvl_t errorLevel) {
	switch (errorLevel) {
	case ERR_DEBUG:
	case ERR_INFO:
	case ERR_INFOCONT:
		return logging::Level::Trace;
	case ERR_WARN:
		return logging::Level::Warn;
	case ERR_ERROR:
		return logging::Level::Error;
	case ERR_FATAL:
		return logging::Level::Fatal;
	default:
		throw invalid_argument("Unknown Sphinx log level.");
	}
}

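// Called by Sphinxbase for every log message; registered via err_set_callback in detectPhones below.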
void sphinxLogCallback(void* user_data, err_lvl_t errorLevel, const char* format, ...) {
	UNUSED(user_data);

	// Create varArgs list
	va_list args;
	va_start(args, format);
	auto _ = gsl::finally([&args]() { va_end(args); });

	// Format message, growing the buffer until it fits.
	// vsnprintf consumes the va_list, so each attempt must work on a fresh copy.
	const int initialSize = 256;
	vector<char> chars(initialSize);
	bool success = false;
	while (!success) {
		va_list argsCopy;
		va_copy(argsCopy, args);
		int charsWritten = vsnprintf(chars.data(), chars.size(), format, argsCopy);
		va_end(argsCopy);
		if (charsWritten < 0) throw runtime_error("Error formatting Pocketsphinx log message.");

		success = charsWritten < static_cast<int>(chars.size());
		if (!success) chars.resize(chars.size() * 2);
	}

	// Strip the redundant log level prefix Sphinx adds to every message
	static const regex waste("^(DEBUG|INFO|INFOCONT|WARN|ERROR|FATAL): ");
	string message = regex_replace(chars.data(), waste, "", std::regex_constants::format_first_only);
	boost::algorithm::trim(message);

	logging::Level logLevel = convertSphinxErrorLevel(errorLevel);
	logging::log(logLevel, message);
}

BoundedTimeline<string> recognizeWords(const AudioClip& inputAudioClip, ps_decoder_t& decoder, bool& decoderIsStillUsable, ProgressSink& progressSink) {
	// Convert audio stream to the exact format PocketSphinx requires
	const unique_ptr<AudioClip> audioClip = inputAudioClip.clone() | resample(sphinxSampleRate);

	// Restart timing at 0
	ps_start_stream(&decoder);

	// Start recognition
	int error = ps_start_utt(&decoder);
	if (error) throw runtime_error("Error starting utterance processing for word recognition.");

	// Process entire sound stream
	auto processBuffer = [&decoder](const vector<int16_t>& buffer) {
		int searchedFrameCount = ps_process_raw(&decoder, buffer.data(), buffer.size(), false, false);
		if (searchedFrameCount < 0) throw runtime_error("Error analyzing raw audio data for word recognition.");
	};
	process16bitAudioClip(*audioClip, processBuffer, progressSink);

	// End recognition
	error = ps_end_utt(&decoder);
	if (error) throw runtime_error("Error ending utterance processing for word recognition.");

	// PocketSphinx can't handle an utterance with no recognized words.
	// As a result, the following utterance will be garbage.
	// As a workaround, we throw away the decoder in this case.
	// See https://sourceforge.net/p/cmusphinx/discussion/help/thread/f1dd91c5/#7529
	BoundedTimeline<string> result(audioClip->getTruncatedRange());
	bool noWordsRecognized = reinterpret_cast<ngram_search_t*>(decoder.search)->bpidx == 0;
	if (noWordsRecognized) {
		decoderIsStillUsable = false;
		return result;
	}

	// Collect words
	for (ps_seg_t* it = ps_seg_iter(&decoder); it; it = ps_seg_next(it)) {
		const char* word = ps_seg_word(it);
		int firstFrame, lastFrame;
		ps_seg_frames(it, &firstFrame, &lastFrame);
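		// PocketSphinx runs at 100 frames per second by default, so one frame corresponds to one centisecond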
		result.set(centiseconds(firstFrame), centiseconds(lastFrame + 1), word);
	}

	return result;
}

s3wid_t getWordId(const string& word, dict_t& dictionary) {
	s3wid_t wordId = dict_wordid(&dictionary, word.c_str());
	if (wordId == BAD_S3WID) throw invalid_argument(fmt::format("Unknown word '{}'.", word));
	return wordId;
}

optional<Timeline<Phone>> getPhoneAlignment(
	const vector<s3wid_t>& wordIds,
	const AudioClip& inputAudioClip,
	ps_decoder_t& decoder,
	ProgressSink& progressSink)
{
	// Create alignment list
	lambda_unique_ptr<ps_alignment_t> alignment(
		ps_alignment_init(decoder.d2p),
		[](ps_alignment_t* alignment) { ps_alignment_free(alignment); });
	if (!alignment) throw runtime_error("Error creating alignment.");
	for (s3wid_t wordId : wordIds) {
		// Add word. Initial value for duration is ignored.
		ps_alignment_add_word(alignment.get(), wordId, 0);
	}
	int error = ps_alignment_populate(alignment.get());
	if (error) throw runtime_error("Error populating alignment struct.");

	// Convert audio stream to the exact format PocketSphinx requires
	const unique_ptr<AudioClip> audioClip = inputAudioClip.clone() | resample(sphinxSampleRate);

	// Create search structure
	acmod_t* acousticModel = decoder.acmod;
	lambda_unique_ptr<ps_search_t> search(
		state_align_search_init("state_align", decoder.config, acousticModel, alignment.get()),
		[](ps_search_t* search) { ps_search_free(search); });
	if (!search) throw runtime_error("Error creating search.");

	// Start recognition
	error = acmod_start_utt(acousticModel);
	if (error) throw runtime_error("Error starting utterance processing for alignment.");

	{
		// Eventually end recognition
		auto endRecognition = gsl::finally([&]() { acmod_end_utt(acousticModel); });

		// Start search
		ps_search_start(search.get());

		// Process entire sound stream
		auto processBuffer = [&](const vector<int16_t>& buffer) {
			const int16* nextSample = buffer.data();
			size_t remainingSamples = buffer.size();
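			// acmod_process_raw advances nextSample and decrements remainingSamples as it consumes audio;
			// drain all buffered feature frames before feeding in more samples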
			while (acmod_process_raw(acousticModel, &nextSample, &remainingSamples, false) > 0) {
				while (acousticModel->n_feat_frame > 0) {
					ps_search_step(search.get(), acousticModel->output_frame);
					acmod_advance(acousticModel);
				}
			}
		};
		process16bitAudioClip(*audioClip, processBuffer, progressSink);

		// End search
		error = ps_search_finish(search.get());
		if (error) return boost::none;
	}

	// Extract phones with timestamps
	char** phoneNames = decoder.dict->mdef->ciname;
	Timeline<Phone> result;
	for (ps_alignment_iter_t* it = ps_alignment_phones(alignment.get()); it; it = ps_alignment_iter_next(it)) {
		// Get phone
		ps_alignment_entry_t* phoneEntry = ps_alignment_iter_get(it);
		s3cipid_t phoneId = phoneEntry->id.pid.cipid;
		string phoneName = phoneNames[phoneId];

		if (phoneName == "SIL") continue;

		// Add entry
		centiseconds start(phoneEntry->start);
		centiseconds duration(phoneEntry->duration);
		Timed<Phone> timedPhone(start, start + duration, PhoneConverter::get().parse(phoneName));
		result.set(timedPhone);
	}
	return result;
}

bool dictionaryContains(dict_t& dictionary, const string& word) {
	return dict_wordid(&dictionary, word.c_str()) != BAD_S3WID;
}

void addMissingDictionaryWords(const vector<string>& words, ps_decoder_t& decoder) {
	map<string, string> missingPronunciations;
	for (const string& word : words) {
		if (!dictionaryContains(*decoder.dict, word)) {
			string pronunciation;
			for (Phone phone : wordToPhones(word)) {
				if (pronunciation.length() > 0) pronunciation += " ";
				pronunciation += PhoneConverter::get().toString(phone);
			}
			missingPronunciations[word] = pronunciation;
		}
	}
	for (auto it = missingPronunciations.begin(); it != missingPronunciations.end(); ++it) {
		// Tell ps_add_word to update the decoder's search module only once, after the last word
		bool isLast = std::next(it) == missingPronunciations.end();
		logging::infoFormat("Unknown word '{}'. Guessing pronunciation '{}'.", it->first, it->second);
		ps_add_word(&decoder, it->first.c_str(), it->second.c_str(), isLast);
	}
}

lambda_unique_ptr<ps_decoder_t> createDecoder(optional<u32string> dialog) {
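	// cmd_ln_init takes a null-terminated list of option/value string pairs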
	lambda_unique_ptr<cmd_ln_t> config(
		cmd_ln_init(
			nullptr, ps_args(), true,
			// Set acoustic model
			"-hmm", (getSphinxModelDirectory() / "acoustic-model").string().c_str(),
			// Set pronunciation dictionary
			"-dict", (getSphinxModelDirectory() / "cmudict-en-us.dict").string().c_str(),
			// Add noise against zero silence (see http://cmusphinx.sourceforge.net/wiki/faq#qwhy_my_accuracy_is_poor)
			"-dither", "yes",
			nullptr),
		[](cmd_ln_t* config) { cmd_ln_free_r(config); });
	if (!config) throw runtime_error("Error creating configuration.");

	lambda_unique_ptr<ps_decoder_t> decoder(
		ps_init(config.get()),
		[](ps_decoder_t* recognizer) { ps_free(recognizer); });
	if (!decoder) throw runtime_error("Error creating speech decoder.");

	// Set language model
	lambda_unique_ptr<ngram_model_t> languageModel;
	if (dialog) {
		// Create dialog-specific language model
		vector<string> words = tokenizeText(*dialog, [&](const string& word) { return dictionaryContains(*decoder->dict, word); });
		words.insert(words.begin(), "<s>");
		words.push_back("</s>");
		languageModel = createLanguageModel(words, *decoder->lmath);

		// Add any dialog-specific words to the dictionary
		addMissingDictionaryWords(words, *decoder);
	} else {
		path modelPath = getSphinxModelDirectory() / "en-us.lm.bin";
		languageModel = lambda_unique_ptr<ngram_model_t>(
			ngram_model_read(decoder->config, modelPath.string().c_str(), NGRAM_AUTO, decoder->lmath),
			[](ngram_model_t* lm) { ngram_model_free(lm); });
	}
	ps_set_lm(decoder.get(), "lm", languageModel.get());
	ps_set_search(decoder.get(), "lm");

	return decoder;
}

Timeline<Phone> utteranceToPhones(
	const AudioClip& audioClip,
	TimeRange utterance,
	ps_decoder_t& decoder,
	bool& decoderIsStillUsable,
	ProgressSink& utteranceProgressSink)
{
	ProgressMerger utteranceProgressMerger(utteranceProgressSink);
	ProgressSink& wordRecognitionProgressSink = utteranceProgressMerger.addSink(1.0);
	ProgressSink& alignmentProgressSink = utteranceProgressMerger.addSink(0.5);
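	// The weights are relative: word recognition contributes two thirds of this utterance's progress, alignment one third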

	const unique_ptr<AudioClip> clipSegment = audioClip.clone() | segment(utterance);

	// Get words
	BoundedTimeline<string> words = recognizeWords(*clipSegment, decoder, decoderIsStillUsable, wordRecognitionProgressSink);
	for (Timed<string> timedWord : words) {
		timedWord.getTimeRange().shift(utterance.getStart());
		logging::logTimedEvent("word", timedWord);
	}

	// Look up words in dictionary
	vector<s3wid_t> wordIds;
	for (const auto& timedWord : words) {
		wordIds.push_back(getWordId(timedWord.getValue(), *decoder.dict));
	}
	if (wordIds.empty()) return Timeline<Phone>();

	// Align the words' phones with speech
	Timeline<Phone> segmentPhones = getPhoneAlignment(wordIds, *clipSegment, decoder, alignmentProgressSink)
		.value_or(ContinuousTimeline<Phone>(clipSegment->getTruncatedRange(), Phone::Unknown));
	segmentPhones.shift(utterance.getStart());
	for (const auto& timedPhone : segmentPhones) {
		logging::logTimedEvent("phone", timedPhone);
	}

	return segmentPhones;
}

BoundedTimeline<Phone> detectPhones(
	const AudioClip& inputAudioClip,
	optional<u32string> dialog,
	ProgressSink& progressSink)
{
	ProgressMerger totalProgressMerger(progressSink);
	ProgressSink& voiceActivationProgressSink = totalProgressMerger.addSink(1.0);
	ProgressSink& dialogProgressSink = totalProgressMerger.addSink(15.0);

	// Make sure audio stream has no DC offset
	const unique_ptr<AudioClip> audioClip = inputAudioClip.clone() | removeDCOffset();

	// Split audio into utterances
	BoundedTimeline<void> utterances;
	try {
		utterances = detectVoiceActivity(*audioClip, voiceActivationProgressSink);
	}
	catch (...) {
		std::throw_with_nested(runtime_error("Error detecting segments of speech."));
	}

	// Discard Pocketsphinx output
	err_set_logfp(nullptr);

	// Redirect Pocketsphinx output to log
	err_set_callback(sphinxLogCallback, nullptr);

	// Prepare pool of decoders
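	// Decoders are expensive to create, so they are built lazily and reused across utterances.
	// runParallel runs at most threadCount utterances at once, so the pool never grows beyond that.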
	std::stack<lambda_unique_ptr<ps_decoder_t>> decoderPool;
	std::mutex decoderPoolMutex;
	auto getDecoder = [&] {
		std::lock_guard<std::mutex> lock(decoderPoolMutex);
		if (decoderPool.empty()) {
			decoderPool.push(createDecoder(dialog));
		}
		auto decoder = std::move(decoderPool.top());
		decoderPool.pop();
		return decoder;
	};
	auto returnDecoder = [&](lambda_unique_ptr<ps_decoder_t> decoder) {
		std::lock_guard<std::mutex> lock(decoderPoolMutex);
		decoderPool.push(std::move(decoder));
	};

	BoundedTimeline<Phone> result(audioClip->getTruncatedRange());
	std::mutex resultMutex;
	auto processUtterance = [&](Timed<void> timedUtterance, ProgressSink& utteranceProgressSink) {
		logging::logTimedEvent("utterance", timedUtterance.getTimeRange(), string(""));

		// Detect phones for utterance
		auto decoder = getDecoder();
		bool decoderIsStillUsable = true;
		Timeline<Phone> phones =
			utteranceToPhones(*audioClip, timedUtterance.getTimeRange(), *decoder, decoderIsStillUsable, utteranceProgressSink);
		if (decoderIsStillUsable) {
			returnDecoder(std::move(decoder));
		}

		// Copy phones to result timeline
		std::lock_guard<std::mutex> lock(resultMutex);
		for (const auto& timedPhone : phones) {
			result.set(timedPhone);
		}
	};

	auto getUtteranceProgressWeight = [](const Timed<void>& timedUtterance) {
		return timedUtterance.getTimeRange().getLength().count();
	};

	// Perform speech recognition
	try {
		// Determine how many parallel threads to use
		int threadCount = std::min({
			// Don't use more threads than there are CPU cores
			getProcessorCoreCount(),
			// Don't use more threads than there are utterances to be processed
			static_cast<int>(utterances.size()),
			// Don't waste time creating additional threads (and decoders!) if the recording is short
			static_cast<int>(duration_cast<std::chrono::seconds>(audioClip->getTruncatedRange().getLength()).count() / 10)
		});
		// The last heuristic yields 0 for recordings shorter than 10 seconds; always use at least one thread
		threadCount = std::max(threadCount, 1);
		logging::debug("Speech recognition -- start");
		runParallel(processUtterance, utterances, threadCount, dialogProgressSink, getUtteranceProgressWeight);
		logging::debug("Speech recognition -- end");

		return result;
	}
	catch (...) {
		std::throw_with_nested(runtime_error("Error performing speech recognition via Pocketsphinx."));
	}
}
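
// Minimal usage sketch. openWavClip and ConsoleProgressSink are hypothetical stand-ins for the
// host application's AudioClip source and ProgressSink implementation:
//
//   const unique_ptr<AudioClip> audioClip = openWavClip("dialog.wav");
//   ConsoleProgressSink progressSink;
//   BoundedTimeline<Phone> phones = detectPhones(*audioClip, boost::none, progressSink);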