2015-11-18 19:59:03 +00:00
|
|
|
#include <iostream>
|
|
|
|
#include <boost/filesystem.hpp>
|
2015-11-19 21:48:17 +00:00
|
|
|
#include <boost/algorithm/string.hpp>
|
2015-12-29 15:26:01 +00:00
|
|
|
#include "phoneExtraction.h"
|
|
|
|
#include "audioInput/SampleRateConverter.h"
|
|
|
|
#include "audioInput/ChannelDownmixer.h"
|
|
|
|
#include "platformTools.h"
|
2015-11-19 20:17:35 +00:00
|
|
|
#include "tools.h"
|
2016-01-19 21:05:06 +00:00
|
|
|
#include <format.h>
|
2016-02-09 21:08:11 +00:00
|
|
|
#include <s3types.h>
|
|
|
|
#include <regex>
|
2016-02-29 19:58:58 +00:00
|
|
|
#include <gsl_util.h>
|
2016-02-29 20:47:36 +00:00
|
|
|
#include <logging.h>
|
2015-11-19 20:17:35 +00:00
|
|
|
|
2015-12-21 12:09:09 +00:00
|
|
|
extern "C" {
|
|
|
|
#include <pocketsphinx.h>
|
|
|
|
#include <sphinxbase/err.h>
|
2016-01-19 21:05:06 +00:00
|
|
|
#include <ps_alignment.h>
|
|
|
|
#include <state_align_search.h>
|
|
|
|
#include <pocketsphinx_internal.h>
|
2015-12-21 12:09:09 +00:00
|
|
|
}
|
|
|
|
|
2015-11-18 19:59:03 +00:00
|
|
|
using std::runtime_error;
|
2016-02-09 21:08:11 +00:00
|
|
|
using std::invalid_argument;
|
2015-11-18 19:59:03 +00:00
|
|
|
using std::unique_ptr;
|
|
|
|
using std::shared_ptr;
|
|
|
|
using std::string;
|
2016-01-19 21:05:06 +00:00
|
|
|
using std::vector;
|
2015-11-18 19:59:03 +00:00
|
|
|
using std::map;
|
|
|
|
using boost::filesystem::path;
|
2016-01-08 09:53:35 +00:00
|
|
|
using std::function;
|
2016-02-09 21:08:11 +00:00
|
|
|
using std::regex;
|
|
|
|
using std::regex_replace;
|
2016-03-01 20:57:05 +00:00
|
|
|
using std::chrono::duration;
|
2015-11-18 19:59:03 +00:00
|
|
|
|
|
|
|
// Ensures that the given stream delivers a single channel at a frame rate of 16000.
// Takes ownership of `stream` and returns either that same stream or a wrapper around it
// (ChannelDownmixer and/or SampleRateConverter).
// Throws invalid_argument if the stream's frame rate is below 16000; upsampling is not attempted.
unique_ptr<AudioStream> to16kHzMono(unique_ptr<AudioStream> stream) {
    // Anything other than exactly one channel gets wrapped in a downmixer
    const bool isMono = stream->getChannelCount() == 1;
    if (!isMono) {
        stream.reset(new ChannelDownmixer(std::move(stream)));
    }

    // Reject streams that would need upsampling
    if (stream->getFrameRate() < 16000) {
        throw invalid_argument("Audio sample rate must not be below 16kHz.");
    }

    // Downsample everything that is not already at the target rate
    const bool needsResampling = stream->getFrameRate() != 16000;
    if (needsResampling) {
        stream.reset(new SampleRateConverter(std::move(stream), 16000));
    }

    return stream;
}
|
|
|
|
|
2015-11-19 17:32:14 +00:00
|
|
|
// Creates the Pocketsphinx configuration, pointing it at the model files
// located in `sphinxModelDirectory`.
// The returned pointer releases the configuration via cmd_ln_free_r.
// Throws runtime_error if the configuration could not be created.
lambda_unique_ptr<cmd_ln_t> createConfig(path sphinxModelDirectory) {
    // Resolve the model file locations up front.
    // These strings stay alive for the whole function, so their c_str() pointers are valid during cmd_ln_init.
    const string acousticModelPath = (sphinxModelDirectory / "acoustic-model").string();
    const string languageModelPath = (sphinxModelDirectory / "en-us.lm.bin").string();
    const string dictionaryPath = (sphinxModelDirectory / "cmudict-en-us.dict").string();

    lambda_unique_ptr<cmd_ln_t> config(
        cmd_ln_init(
            nullptr, ps_args(), true,
            // Acoustic model
            "-hmm", acousticModelPath.c_str(),
            // Language model
            "-lm", languageModelPath.c_str(),
            // Pronunciation dictionary
            "-dict", dictionaryPath.c_str(),
            // Add noise against zero silence (see http://cmusphinx.sourceforge.net/wiki/faq#qwhy_my_accuracy_is_poor)
            "-dither", "yes",
            // Allow for long pauses in speech
            "-vad_prespeech", "3000",
            "-vad_postspeech", "3000",
            // Terminates the argument list
            nullptr),
        [](cmd_ln_t* rawConfig) { cmd_ln_free_r(rawConfig); });

    if (!config) {
        throw runtime_error("Error creating configuration.");
    }
    return config;
}
|
|
|
|
|
2016-01-19 21:05:06 +00:00
|
|
|
// Creates a Pocketsphinx decoder from the given configuration.
// The returned pointer releases the decoder via ps_free.
// Throws runtime_error if the decoder could not be initialized.
lambda_unique_ptr<ps_decoder_t> createSpeechRecognizer(cmd_ln_t& config) {
    ps_decoder_t* const rawDecoder = ps_init(&config);
    lambda_unique_ptr<ps_decoder_t> recognizer(
        rawDecoder,
        [](ps_decoder_t* decoder) { ps_free(decoder); });

    if (!recognizer) {
        throw runtime_error("Error creating speech recognizer.");
    }
    return recognizer;
}
|
|
|
|
|
|
|
|
// Converts a float sample in the range -1..1 to a signed 16-bit int.
// -1 maps to INT16_MIN and +1 maps to INT16_MAX; values in between are scaled linearly
// and truncated toward zero. Values outside -1..1 (including infinities) are clamped.
// NaN is mapped to 0.
int16_t floatSampleToInt16(float sample) {
    // NaN is the only value that compares unequal to itself. It would pass through the
    // clamping below unchanged, and converting NaN to an integer type is undefined behavior.
    if (sample != sample) return 0;

    sample = std::max(sample, -1.0f);
    sample = std::min(sample, 1.0f);
    return static_cast<int16_t>(((sample + 1) / 2) * (INT16_MAX - INT16_MIN) + INT16_MIN);
}
|
|
|
|
|
2016-01-28 18:13:40 +00:00
|
|
|
// Reads the entire audio stream in chunks of up to 1600 samples, converts each sample to a
// signed 16-bit int, and hands every chunk to `processBuffer`.
//
// audioStream16kHzMono: Stream to read from (via getNextSample) until it reports no more samples.
// processBuffer: Called once per chunk. The final call always receives an empty buffer,
//   because the loop only ends after a chunk with no samples has been processed.
// progressSink: Receives the ratio of samples read so far to the stream's frame count after each chunk.
void processAudioStream(AudioStream& audioStream16kHzMono, function<void(const vector<int16_t>&)> processBuffer, ProgressSink& progressSink) {
    const size_t capacity = 1600; // 0.1 second capacity
    vector<int16_t> buffer;
    buffer.reserve(capacity);

    size_t sampleCount = 0;
    do {
        // Read to buffer
        buffer.clear();
        while (buffer.size() < capacity) {
            float floatSample;
            if (!audioStream16kHzMono.getNextSample(floatSample)) break;
            buffer.push_back(floatSampleToInt16(floatSample));
        }

        // Process buffer
        processBuffer(buffer);
        sampleCount += buffer.size();

        // Report progress. A stream without frames would otherwise produce 0/0 = NaN.
        // NOTE(review): reporting 1.0 for an empty stream assumes progress is a 0..1 fraction - confirm.
        const auto frameCount = audioStream16kHzMono.getFrameCount();
        const double progress = frameCount > 0
            ? static_cast<double>(sampleCount) / frameCount
            : 1.0;
        progressSink.reportProgress(progress);
    } while (!buffer.empty());
}
|
|
|
|
|
2016-02-29 20:47:36 +00:00
|
|
|
// Maps a Sphinx error level to the corresponding LogLevel.
// Debug and info levels (including info continuation lines) all map to LogLevel::Trace.
// Throws invalid_argument for any level not listed here.
LogLevel ConvertSphinxErrorLevel(err_lvl_t errorLevel) {
    const bool isInformational =
        errorLevel == ERR_DEBUG
        || errorLevel == ERR_INFO
        || errorLevel == ERR_INFOCONT;
    if (isInformational) {
        return LogLevel::Trace;
    }
    if (errorLevel == ERR_WARN) {
        return LogLevel::Warning;
    }
    if (errorLevel == ERR_ERROR) {
        return LogLevel::Error;
    }
    if (errorLevel == ERR_FATAL) {
        return LogLevel::Fatal;
    }
    throw invalid_argument("Unknown log level.");
}
|
|
|
|
|
|
|
|
void sphinxLogCallback(void* user_data, err_lvl_t errorLevel, const char* format, ...) {
|
|
|
|
UNUSED(user_data);
|
2015-11-19 21:48:17 +00:00
|
|
|
|
|
|
|
// Create varArgs list
|
|
|
|
va_list args;
|
|
|
|
va_start(args, format);
|
2016-02-29 19:58:58 +00:00
|
|
|
auto _ = gsl::finally([&args]() { va_end(args); });
|
2015-11-19 21:48:17 +00:00
|
|
|
|
|
|
|
// Format message
|
|
|
|
const int initialSize = 256;
|
2016-01-19 21:05:06 +00:00
|
|
|
vector<char> chars(initialSize);
|
2015-11-19 21:48:17 +00:00
|
|
|
bool success = false;
|
|
|
|
while (!success) {
|
|
|
|
int charsWritten = vsnprintf(chars.data(), chars.size(), format, args);
|
|
|
|
if (charsWritten < 0) throw runtime_error("Error formatting Pocketsphinx log message.");
|
|
|
|
|
|
|
|
success = charsWritten < static_cast<int>(chars.size());
|
|
|
|
if (!success) chars.resize(chars.size() * 2);
|
|
|
|
}
|
|
|
|
string message(chars.data());
|
|
|
|
boost::algorithm::trim(message);
|
|
|
|
|
2016-02-29 20:47:36 +00:00
|
|
|
LogLevel logLevel = ConvertSphinxErrorLevel(errorLevel);
|
|
|
|
LOG(logLevel) << message;
|
2015-11-19 21:48:17 +00:00
|
|
|
}
|
|
|
|
|
2016-02-09 21:08:11 +00:00
|
|
|
// Runs Pocketsphinx word recognition over the entire audio stream.
//
// audioStream: Audio to recognize; converted via to16kHzMono before processing.
// recognizer: Decoder used for the utterance.
// progressSink: Receives progress updates while the audio is being fed to the decoder.
//
// Returns the words of all recognized segments, in segment order. Each segment is also
// written to the log as a timed "word" event.
// Throws runtime_error if starting, feeding, or ending the utterance fails.
vector<string> recognizeWords(unique_ptr<AudioStream> audioStream, ps_decoder_t& recognizer, ProgressSink& progressSink) {
    // Bring the audio into the exact format PocketSphinx requires
    audioStream = to16kHzMono(std::move(audioStream));

    // Open the utterance
    if (ps_start_utt(&recognizer) != 0) {
        throw runtime_error("Error starting utterance processing for word recognition.");
    }

    // Feed the whole sound file to the decoder, chunk by chunk
    auto feedDecoder = [&recognizer](const vector<int16_t>& samples) {
        const int searchedFrameCount = ps_process_raw(&recognizer, samples.data(), samples.size(), false, false);
        if (searchedFrameCount < 0) {
            throw runtime_error("Error analyzing raw audio data for word recognition.");
        }
    };
    processAudioStream(*audioStream, feedDecoder, progressSink);

    // Close the utterance
    if (ps_end_utt(&recognizer) != 0) {
        throw runtime_error("Error ending utterance processing for word recognition.");
    }

    // Walk the segment list and collect one word per segment
    vector<string> words;
    int32_t score;
    ps_seg_t* segment = ps_seg_iter(&recognizer, &score);
    while (segment) {
        const char* word = ps_seg_word(segment);
        words.push_back(word);

        // Log the segment; the logged end is one frame past the segment's last frame
        int firstFrame, lastFrame;
        ps_seg_frames(segment, &firstFrame, &lastFrame);
        logTimedEvent("word", centiseconds(firstFrame), centiseconds(lastFrame + 1), word);

        segment = ps_seg_next(segment);
    }

    return words;
}
|
|
|
|
|
|
|
|
// Splits dialog into words, doing minimal preprocessing.
|
|
|
|
// A robust solution should use TTS logic to cope with numbers, abbreviations, unknown words etc.
|
|
|
|
vector<string> extractDialogWords(string dialog) {
|
|
|
|
// Convert to lower case
|
|
|
|
boost::algorithm::to_lower(dialog);
|
|
|
|
|
|
|
|
// Insert silences where appropriate
|
2016-02-10 20:53:58 +00:00
|
|
|
dialog = regex_replace(dialog, regex("[,;.:!?] |-"), " <sil> ");
|
2016-02-09 21:08:11 +00:00
|
|
|
|
|
|
|
// Remove all undesired characters
|
2016-02-10 20:53:58 +00:00
|
|
|
dialog = regex_replace(dialog, regex("[^a-z.'\\0-9<>]"), " ");
|
2016-02-09 21:08:11 +00:00
|
|
|
|
|
|
|
// Collapse whitespace
|
|
|
|
dialog = regex_replace(dialog, regex("\\s+"), " ");
|
|
|
|
|
|
|
|
// Trim
|
|
|
|
boost::algorithm::trim(dialog);
|
|
|
|
|
|
|
|
// Ugly hack: Remove trailing period
|
|
|
|
if (boost::algorithm::ends_with(dialog, ".")) {
|
|
|
|
dialog.pop_back();
|
2016-01-19 21:05:06 +00:00
|
|
|
}
|
|
|
|
|
2016-02-09 21:08:11 +00:00
|
|
|
// Split into words
|
|
|
|
vector<string> result;
|
|
|
|
boost::algorithm::split(result, dialog, boost::is_space());
|
|
|
|
return result;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Looks up the dictionary ID of every word in `words`.
// Returns the IDs in the same order as the input words.
// Throws invalid_argument for the first word the dictionary does not contain.
vector<s3wid_t> getWordIds(const vector<string>& words, dict_t& dictionary) {
    vector<s3wid_t> wordIds;
    for (const auto& currentWord : words) {
        const s3wid_t id = dict_wordid(&dictionary, currentWord.c_str());
        if (id == BAD_S3WID) {
            throw invalid_argument(fmt::format("Unknown word '{}'.", currentWord));
        }
        wordIds.push_back(id);
    }
    return wordIds;
}
|
|
|
|
|
2016-01-28 18:13:40 +00:00
|
|
|
// Aligns the phones of the given word sequence with the audio stream.
//
// wordIds: Dictionary IDs of the words, in spoken order.
// audioStream: Audio to align against; converted via to16kHzMono before processing.
// recognizer: Decoder whose acoustic model, dictionary, and configuration are used.
// progressSink: Receives progress updates while the audio is being processed.
//
// Returns a map with one entry at each phone's start and a Phone::None entry at each phone's
// end (overwritten when the next phone starts at that same position), plus an initial
// Phone::None entry at position 0. Each phone is also logged as a timed "phone" event.
// Throws runtime_error if any of the checked Pocketsphinx setup steps fails.
map<centiseconds, Phone> getPhoneAlignment(const vector<s3wid_t>& wordIds, unique_ptr<AudioStream> audioStream, ps_decoder_t& recognizer, ProgressSink& progressSink) {
    // Build the alignment structure and fill it with the words
    lambda_unique_ptr<ps_alignment_t> alignment(
        ps_alignment_init(recognizer.d2p),
        [](ps_alignment_t* rawAlignment) { ps_alignment_free(rawAlignment); });
    if (!alignment) {
        throw runtime_error("Error creating alignment.");
    }
    for (const s3wid_t id : wordIds) {
        // Initial value for duration is ignored.
        ps_alignment_add_word(alignment.get(), id, 0);
    }
    if (ps_alignment_populate(alignment.get()) != 0) {
        throw runtime_error("Error populating alignment struct.");
    }

    // Bring the audio into the exact format PocketSphinx requires
    audioStream = to16kHzMono(std::move(audioStream));

    // Create the state alignment search on top of the decoder's acoustic model
    acmod_t* acousticModel = recognizer.acmod;
    lambda_unique_ptr<ps_search_t> search(
        state_align_search_init("state_align", recognizer.config, acousticModel, alignment.get()),
        [](ps_search_t* rawSearch) { ps_search_free(rawSearch); });
    if (!search) {
        throw runtime_error("Error creating search.");
    }

    // Start recognition
    if (acmod_start_utt(acousticModel) != 0) {
        throw runtime_error("Error starting utterance processing for alignment.");
    }

    // Start search
    ps_search_start(search.get());

    // Feed the whole sound file through the acoustic model and
    // step the search once for every feature frame that becomes available
    auto alignChunk = [acousticModel, &search](const vector<int16_t>& samples) {
        const int16* nextSample = samples.data();
        size_t remainingSamples = samples.size();
        while (acmod_process_raw(acousticModel, &nextSample, &remainingSamples, false) > 0) {
            while (acousticModel->n_feat_frame > 0) {
                ps_search_step(search.get(), acousticModel->output_frame);
                acmod_advance(acousticModel);
            }
        }
    };
    processAudioStream(*audioStream, alignChunk, progressSink);

    // End search
    ps_search_finish(search.get());

    // End recognition
    acmod_end_utt(acousticModel);

    // Extract phones with timestamps
    char** phoneNames = recognizer.dict->mdef->ciname;
    map<centiseconds, Phone> timeline;
    timeline[centiseconds(0)] = Phone::None;
    ps_alignment_iter_t* phoneIt = ps_alignment_phones(alignment.get());
    while (phoneIt) {
        // Identify the phone
        ps_alignment_entry_t* entry = ps_alignment_iter_get(phoneIt);
        const s3cipid_t phoneId = entry->id.pid.cipid;
        char* phoneName = phoneNames[phoneId];

        // Determine where it starts and ends
        const int startFrame = entry->start;
        const int frameCount = entry->duration;
        const centiseconds start(startFrame);
        const centiseconds end(startFrame + frameCount);

        // The phone begins at `start`; `end` is marked as Phone::None
        timeline[start] = stringToPhone(phoneName);
        timeline[end] = Phone::None;

        logTimedEvent("phone", start, end, phoneName);

        phoneIt = ps_alignment_iter_next(phoneIt);
    }
    return timeline;
}
|
|
|
|
|
2016-02-09 21:08:11 +00:00
|
|
|
// Detects the phones in an audio recording via Pocketsphinx.
//
// createAudioStream: Factory for a fresh stream over the recording. It is called once for
//   phone alignment and, if no dialog is given, once more beforehand for word recognition.
// dialog: Optional text of the recording. If given, its words are used directly;
//   otherwise the words are recognized from the audio.
// progressSink: Receives the merged progress of the individual steps.
//
// Returns the phone timeline produced by getPhoneAlignment.
// Any failure is rethrown nested inside a runtime_error.
map<centiseconds, Phone> detectPhones(
    std::function<std::unique_ptr<AudioStream>(void)> createAudioStream,
    boost::optional<std::string> dialog,
    ProgressSink& progressSink)
{
    // Discard Pocketsphinx output
    err_set_logfp(nullptr);

    // Redirect Pocketsphinx output to log
    err_set_callback(sphinxLogCallback, nullptr);

    try {
        // Set up the Pocketsphinx configuration from the model files next to the binary
        path sphinxModelDirectory(getBinDirectory() / "res/sphinx");
        auto config = createConfig(sphinxModelDirectory);

        // Set up the speech recognizer
        auto recognizer = createSpeechRecognizer(*config);

        // Split overall progress between the two steps.
        // Word recognition is registered with 0.0 when a dialog is supplied, since it is skipped then.
        // NOTE(review): the addSink argument is presumably a relative weight - confirm against ProgressMerger.
        ProgressMerger progressMerger(progressSink);
        const double wordRecognitionWeight = dialog ? 0.0 : 1.0;
        ProgressSink& wordRecognitionProgressSink = progressMerger.addSink(wordRecognitionWeight);
        ProgressSink& alignmentProgressSink = progressMerger.addSink(0.5);

        // Obtain the words, either from the dialog text or from the audio itself
        vector<string> words;
        if (dialog) {
            words = extractDialogWords(*dialog);
        } else {
            words = recognizeWords(createAudioStream(), *recognizer, wordRecognitionProgressSink);
        }

        // Look up words in dictionary
        vector<s3wid_t> wordIds = getWordIds(words, *recognizer->dict);

        // Align the words' phones with speech
        return getPhoneAlignment(wordIds, createAudioStream(), *recognizer, alignmentProgressSink);
    }
    catch (...) {
        std::throw_with_nested(runtime_error("Error performing speech recognition via Pocketsphinx."));
    }
}
|