Restored dialog option, this time based on a language model
This approach should be more robust and error-tolerant.
This commit is contained in:
parent
4ed5908627
commit
0d488e8de2
|
@ -64,7 +64,7 @@ target_compile_options(cppFormat PRIVATE ${disableWarningsFlags})
|
||||||
set_target_properties(cppFormat PROPERTIES FOLDER lib)
|
set_target_properties(cppFormat PROPERTIES FOLDER lib)
|
||||||
|
|
||||||
# ... sphinxbase
|
# ... sphinxbase
|
||||||
include_directories(SYSTEM "lib/sphinxbase-5prealpha-2015-08-05/include")
|
include_directories(SYSTEM "lib/sphinxbase-5prealpha-2015-08-05/include" "lib/sphinxbase-5prealpha-2015-08-05/src")
|
||||||
FILE(GLOB_RECURSE sphinxbaseFiles "lib/sphinxbase-5prealpha-2015-08-05/src/libsphinxbase/*.c")
|
FILE(GLOB_RECURSE sphinxbaseFiles "lib/sphinxbase-5prealpha-2015-08-05/src/libsphinxbase/*.c")
|
||||||
add_library(sphinxbase ${sphinxbaseFiles})
|
add_library(sphinxbase ${sphinxbaseFiles})
|
||||||
target_compile_options(sphinxbase PRIVATE ${disableWarningsFlags})
|
target_compile_options(sphinxbase PRIVATE ${disableWarningsFlags})
|
||||||
|
@ -192,6 +192,8 @@ set(SOURCE_FILES
|
||||||
src/Exporter.cpp src/Exporter.h
|
src/Exporter.cpp src/Exporter.h
|
||||||
src/tokenization.cpp src/tokenization.h
|
src/tokenization.cpp src/tokenization.h
|
||||||
src/g2p.cpp src/g2p.h
|
src/g2p.cpp src/g2p.h
|
||||||
|
src/languageModels.cpp src/languageModels.h
|
||||||
|
src/tupleHash.h
|
||||||
)
|
)
|
||||||
add_executable(rhubarb ${SOURCE_FILES})
|
add_executable(rhubarb ${SOURCE_FILES})
|
||||||
target_link_libraries(rhubarb ${Boost_LIBRARIES} cppFormat sphinxbase pocketSphinx flite)
|
target_link_libraries(rhubarb ${Boost_LIBRARIES} cppFormat sphinxbase pocketSphinx flite)
|
||||||
|
|
|
@ -0,0 +1,183 @@
|
||||||
|
#include "languageModels.h"
|
||||||
|
#include <boost/range/adaptor/map.hpp>
|
||||||
|
#include <vector>
|
||||||
|
#include <regex>
|
||||||
|
#include <map>
|
||||||
|
#include <tuple>
|
||||||
|
#include "platformTools.h"
|
||||||
|
#include <boost/filesystem/fstream.hpp>
|
||||||
|
#include "appInfo.h"
|
||||||
|
#include <cmath>
|
||||||
|
#include <gsl_util.h>
|
||||||
|
|
||||||
|
using std::string;
|
||||||
|
using std::u32string;
|
||||||
|
using std::vector;
|
||||||
|
using std::regex;
|
||||||
|
using std::map;
|
||||||
|
using std::tuple;
|
||||||
|
using std::make_tuple;
|
||||||
|
using std::get;
|
||||||
|
using std::endl;
|
||||||
|
using boost::filesystem::path;
|
||||||
|
|
||||||
|
using unigram_t = string;
|
||||||
|
using bigram_t = tuple<string, string>;
|
||||||
|
using trigram_t = tuple<string, string, string>;
|
||||||
|
|
||||||
|
// Tallies how many times each distinct word appears in `words`.
// Returns a map from word to its number of occurrences.
map<unigram_t, int> getUnigramCounts(const vector<string>& words) {
    map<unigram_t, int> result;
    for (auto wordIt = words.begin(); wordIt != words.end(); ++wordIt) {
        // operator[] value-initializes missing entries to 0 before the increment.
        result[*wordIt] += 1;
    }
    return result;
}
|
||||||
|
|
||||||
|
// Counts how often each pair of directly adjacent words occurs in `words`.
// Returns a map from (first word, second word) to its number of occurrences;
// the map is empty if `words` has fewer than two elements.
map<bigram_t, int> getBigramCounts(const vector<string>& words) {
    map<bigram_t, int> bigramCounts;
    // Guard against short input: evaluating `words.end() - 1` on an empty
    // vector would step before begin(), which is undefined behavior.
    if (words.size() >= 2) {
        for (auto it = words.begin(); it < words.end() - 1; ++it) {
            ++bigramCounts[bigram_t(*it, *(it + 1))];
        }
    }
    return bigramCounts;
}
|
||||||
|
|
||||||
|
// Counts how often each run of three directly adjacent words occurs in `words`.
// Returns a map from (first, second, third word) to its number of occurrences;
// the map is empty if `words` has fewer than three elements.
map<trigram_t, int> getTrigramCounts(const vector<string>& words) {
    map<trigram_t, int> result;
    if (words.size() < 3) {
        return result;
    }
    for (vector<string>::size_type start = 0; start + 2 < words.size(); ++start) {
        result[trigram_t(words[start], words[start + 1], words[start + 2])] += 1;
    }
    return result;
}
|
||||||
|
|
||||||
|
// Converts unigram counts into discounted probabilities.
// Each entry is the word's count divided by the total number of words,
// multiplied by `deflator`.
map<unigram_t, double> getUnigramProbabilities(const vector<string>& words, const map<unigram_t, int>& unigramCounts, const double deflator) {
    map<unigram_t, double> result;
    for (auto entry = unigramCounts.begin(); entry != unigramCounts.end(); ++entry) {
        const double relativeFrequency = double(entry->second) / words.size();
        result[entry->first] = relativeFrequency * deflator;
    }
    return result;
}
|
||||||
|
|
||||||
|
// Converts bigram counts into discounted conditional probabilities.
// Each entry is the bigram's count divided by the count of its first word,
// multiplied by `deflator`. Throws std::out_of_range (from map::at) if a
// bigram's first word is missing from `unigramCounts`.
map<bigram_t, double> getBigramProbabilities(const map<unigram_t, int>& unigramCounts, const map<bigram_t, int>& bigramCounts, const double deflator) {
    map<bigram_t, double> result;
    for (auto entry = bigramCounts.begin(); entry != bigramCounts.end(); ++entry) {
        const bigram_t& currentBigram = entry->first;
        const int prefixCount = unigramCounts.at(get<0>(currentBigram));
        const double conditionalFrequency = double(entry->second) / prefixCount;
        result[currentBigram] = conditionalFrequency * deflator;
    }
    return result;
}
|
||||||
|
|
||||||
|
// Converts trigram counts into discounted conditional probabilities.
// Each entry is the trigram's count divided by the count of its leading
// bigram (first and second word), multiplied by `deflator`. Throws
// std::out_of_range (from map::at) if that bigram is missing from `bigramCounts`.
map<trigram_t, double> getTrigramProbabilities(const map<bigram_t, int>& bigramCounts, const map<trigram_t, int>& trigramCounts, const double deflator) {
    map<trigram_t, double> result;
    for (auto entry = trigramCounts.begin(); entry != trigramCounts.end(); ++entry) {
        const trigram_t& currentTrigram = entry->first;
        const bigram_t prefix(get<0>(currentTrigram), get<1>(currentTrigram));
        const int prefixCount = bigramCounts.at(prefix);
        const double conditionalFrequency = double(entry->second) / prefixCount;
        result[currentTrigram] = conditionalFrequency * deflator;
    }
    return result;
}
|
||||||
|
|
||||||
|
// Computes the backoff weight for every unigram.
//
// For each word in `unigramCounts`, the weight is
//   discountMass / (1 - sum of unigramProbabilities[w2] over all bigrams (word, w2)).
//
// Parameters:
//   unigramCounts        - supplies the set of words to compute weights for
//   unigramProbabilities - probability of each word; must contain the second
//                          word of every bigram (map::at throws std::out_of_range otherwise)
//   bigramCounts         - supplies the set of observed bigrams
//   discountMass         - numerator of every weight
map<unigram_t, double> getUnigramBackoffWeights(
    const map<unigram_t, int>& unigramCounts,
    const map<unigram_t, double>& unigramProbabilities,
    const map<bigram_t, int>& bigramCounts,
    const double discountMass)
{
    map<unigram_t, double> unigramBackoffWeights;
    for (const auto& unigramEntry : unigramCounts) {
        const unigram_t& unigram = unigramEntry.first;
        double denominator = 1;
        // bigramCounts is ordered lexicographically, so all bigrams starting
        // with this word form one contiguous run whose lower bound is
        // (word, ""). Scanning just that run visits the same bigrams in the
        // same order as a full scan, without touching unrelated entries.
        for (auto it = bigramCounts.lower_bound(bigram_t(unigram, string()));
            it != bigramCounts.end() && get<0>(it->first) == unigram;
            ++it)
        {
            denominator -= unigramProbabilities.at(get<1>(it->first));
        }
        unigramBackoffWeights[unigram] = discountMass / denominator;
    }
    return unigramBackoffWeights;
}
|
||||||
|
|
||||||
|
// Computes the backoff weight for every bigram.
//
// For each bigram (w1, w2) in `bigramCounts`, the weight is
//   discountMass / (1 - sum of bigramProbabilities[(w2, w3)] over all trigrams (w1, w2, w3)).
//
// Parameters:
//   bigramCounts        - supplies the set of bigrams to compute weights for
//   bigramProbabilities - probability of each bigram; must contain the trailing
//                         bigram of every trigram (map::at throws std::out_of_range otherwise)
//   trigramCounts       - supplies the set of observed trigrams
//   discountMass        - numerator of every weight
map<bigram_t, double> getBigramBackoffWeights(
    const map<bigram_t, int>& bigramCounts,
    const map<bigram_t, double>& bigramProbabilities,
    const map<trigram_t, int>& trigramCounts,
    const double discountMass)
{
    map<bigram_t, double> bigramBackoffWeights;
    for (const auto& bigramEntry : bigramCounts) {
        const bigram_t& bigram = bigramEntry.first;
        const string& first = get<0>(bigram);
        const string& second = get<1>(bigram);
        double denominator = 1;
        // trigramCounts is ordered lexicographically, so all trigrams that
        // extend this bigram form one contiguous run whose lower bound is
        // (first, second, ""). Scanning just that run visits the same trigrams
        // in the same order as a full scan, without touching unrelated entries.
        for (auto it = trigramCounts.lower_bound(trigram_t(first, second, string()));
            it != trigramCounts.end() && get<0>(it->first) == first && get<1>(it->first) == second;
            ++it)
        {
            denominator -= bigramProbabilities.at(bigram_t(get<1>(it->first), get<2>(it->first)));
        }
        bigramBackoffWeights[bigram] = discountMass / denominator;
    }
    return bigramBackoffWeights;
}
|
||||||
|
|
||||||
|
void createLanguageModelFile(const vector<string>& words, path filePath) {
|
||||||
|
const double discountMass = 0.5;
|
||||||
|
const double deflator = 1.0 - discountMass;
|
||||||
|
|
||||||
|
map<unigram_t, int> unigramCounts = getUnigramCounts(words);
|
||||||
|
map<bigram_t, int> bigramCounts = getBigramCounts(words);
|
||||||
|
map<trigram_t, int> trigramCounts = getTrigramCounts(words);
|
||||||
|
|
||||||
|
map<unigram_t, double> unigramProbabilities = getUnigramProbabilities(words, unigramCounts, deflator);
|
||||||
|
map<bigram_t, double> bigramProbabilities = getBigramProbabilities(unigramCounts, bigramCounts, deflator);
|
||||||
|
map<trigram_t, double> trigramProbabilities = getTrigramProbabilities(bigramCounts, trigramCounts, deflator);
|
||||||
|
|
||||||
|
map<unigram_t, double> unigramBackoffWeights = getUnigramBackoffWeights(unigramCounts, unigramProbabilities, bigramCounts, discountMass);
|
||||||
|
map<bigram_t, double> bigramBackoffWeights = getBigramBackoffWeights(bigramCounts, bigramProbabilities, trigramCounts, discountMass);
|
||||||
|
|
||||||
|
boost::filesystem::ofstream file(filePath);
|
||||||
|
file << "Generated by " << appName << " " << appVersion << endl << endl;
|
||||||
|
|
||||||
|
file << "\\data\\" << endl;
|
||||||
|
file << "ngram 1=" << unigramCounts.size() << endl;
|
||||||
|
file << "ngram 2=" << bigramCounts.size() << endl;
|
||||||
|
file << "ngram 3=" << trigramCounts.size() << endl << endl;
|
||||||
|
|
||||||
|
file.setf(std::ios::fixed, std::ios::floatfield);
|
||||||
|
file.precision(4);
|
||||||
|
file << "\\1-grams:" << endl;
|
||||||
|
for (const unigram_t& unigram : unigramCounts | boost::adaptors::map_keys) {
|
||||||
|
file << log10(unigramProbabilities.at(unigram))
|
||||||
|
<< " " << unigram
|
||||||
|
<< " " << log10(unigramBackoffWeights.at(unigram)) << endl;
|
||||||
|
}
|
||||||
|
file << endl;
|
||||||
|
|
||||||
|
file << "\\2-grams:" << endl;
|
||||||
|
for (const bigram_t& bigram : bigramCounts | boost::adaptors::map_keys) {
|
||||||
|
file << log10(bigramProbabilities.at(bigram))
|
||||||
|
<< " " << get<0>(bigram) << " " << get<1>(bigram)
|
||||||
|
<< " " << log10(bigramBackoffWeights.at(bigram)) << endl;
|
||||||
|
}
|
||||||
|
file << endl;
|
||||||
|
|
||||||
|
file << "\\3-grams:" << endl;
|
||||||
|
for (const trigram_t& trigram : trigramCounts | boost::adaptors::map_keys) {
|
||||||
|
file << log10(trigramProbabilities.at(trigram))
|
||||||
|
<< " " << get<0>(trigram) << " " << get<1>(trigram) << " " << get<2>(trigram) << endl;
|
||||||
|
}
|
||||||
|
file << endl;
|
||||||
|
|
||||||
|
file << "\\end\\" << endl;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Creates a language model for the given word sequence.
//
// The model is written to a temporary file via createLanguageModelFile and
// then read back with ngram_model_read in NGRAM_ARPA format. The temporary
// file is removed before this function returns, whether or not it succeeds.
//
// Parameters:
//   words   - the word sequence to build the model from
//   logMath - passed through to ngram_model_read
// Returns the result of ngram_model_read, owned by a pointer that releases it
// with ngram_model_free. NOTE(review): the result is not checked for null here;
// callers should confirm the model was loaded.
lambda_unique_ptr<ngram_model_t> createLanguageModel(const vector<string>& words, logmath_t& logMath) {
    path tempFilePath = getTempFilePath();

    // Register cleanup before the file is written so that a partially written
    // file is also removed if createLanguageModelFile throws. The error_code
    // overload keeps the cleanup best-effort and non-throwing, because it runs
    // inside a scope guard's destructor.
    auto deleteTempFile = gsl::finally([&]() {
        boost::system::error_code ignoredError;
        boost::filesystem::remove(tempFilePath, ignoredError);
    });

    createLanguageModelFile(words, tempFilePath);

    return lambda_unique_ptr<ngram_model_t>(
        ngram_model_read(nullptr, tempFilePath.string().c_str(), NGRAM_ARPA, &logMath),
        [](ngram_model_t* lm) { ngram_model_free(lm); });
}
|
|
@ -0,0 +1,6 @@
|
||||||
|
#pragma once
|
||||||
|
#include <sphinxbase/ngram_model.h>
|
||||||
|
#include <vector>
|
||||||
|
#include "tools.h"
|
||||||
|
|
||||||
|
// Creates an n-gram language model for the given word sequence.
// `logMath` is handed to the model reader. The returned pointer frees the
// model when it goes out of scope.
lambda_unique_ptr<ngram_model_t> createLanguageModel(const std::vector<std::string>& words, logmath_t& logMath);
|
24
src/main.cpp
24
src/main.cpp
|
@ -12,9 +12,12 @@
|
||||||
#include <gsl_util.h>
|
#include <gsl_util.h>
|
||||||
#include "Exporter.h"
|
#include "Exporter.h"
|
||||||
#include "ContinuousTimeline.h"
|
#include "ContinuousTimeline.h"
|
||||||
|
#include <boost/filesystem/operations.hpp>
|
||||||
|
#include "stringTools.h"
|
||||||
|
|
||||||
using std::exception;
|
using std::exception;
|
||||||
using std::string;
|
using std::string;
|
||||||
|
using std::u32string;
|
||||||
using std::vector;
|
using std::vector;
|
||||||
using std::unique_ptr;
|
using std::unique_ptr;
|
||||||
using std::make_unique;
|
using std::make_unique;
|
||||||
|
@ -75,6 +78,25 @@ void addFileSink(path path, logging::Level minLevel) {
|
||||||
logging::addSink(levelFilter);
|
logging::addSink(levelFilter);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Reads the whole file at `filePath` and returns its content converted from
// UTF-8 to UTF-32.
//
// Throws std::invalid_argument if the file does not exist. Any failure while
// opening, reading or decoding is rethrown as a std::runtime_error with the
// original exception nested inside; a decoding failure is first wrapped in its
// own std::runtime_error and then wrapped again by the outer handler.
u32string readTextFile(path filePath) {
    const bool fileExists = exists(filePath);
    if (!fileExists) {
        throw std::invalid_argument(fmt::format("File {} does not exist.", filePath));
    }

    try {
        // Enable exceptions before opening so that a failed open throws.
        boost::filesystem::ifstream stream;
        stream.exceptions(std::ifstream::failbit | std::ifstream::badbit);
        stream.open(filePath);

        // Read the raw bytes up to end of file.
        std::istreambuf_iterator<char> contentBegin(stream);
        std::istreambuf_iterator<char> contentEnd;
        string rawText(contentBegin, contentEnd);

        try {
            return utf8ToUtf32(rawText);
        } catch (...) {
            std::throw_with_nested(std::runtime_error(fmt::format("File encoding is not ASCII or UTF-8.", filePath)));
        }
    } catch (...) {
        std::throw_with_nested(std::runtime_error(fmt::format("Error reading file {0}.", filePath)));
    }
}
|
||||||
|
|
||||||
int main(int argc, char *argv[]) {
|
int main(int argc, char *argv[]) {
|
||||||
auto pausableStderrSink = addPausableStdErrSink(logging::Level::Warn);
|
auto pausableStderrSink = addPausableStdErrSink(logging::Level::Warn);
|
||||||
pausableStderrSink->pause();
|
pausableStderrSink->pause();
|
||||||
|
@ -88,6 +110,7 @@ int main(int argc, char *argv[]) {
|
||||||
tclap::ValuesConstraint<logging::Level> logLevelConstraint(logLevels);
|
tclap::ValuesConstraint<logging::Level> logLevelConstraint(logLevels);
|
||||||
tclap::ValueArg<logging::Level> logLevel("", "logLevel", "The minimum log level to log", false, logging::Level::Debug, &logLevelConstraint, cmd);
|
tclap::ValueArg<logging::Level> logLevel("", "logLevel", "The minimum log level to log", false, logging::Level::Debug, &logLevelConstraint, cmd);
|
||||||
tclap::ValueArg<string> logFileName("", "logFile", "The log file path.", false, string(), "string", cmd);
|
tclap::ValueArg<string> logFileName("", "logFile", "The log file path.", false, string(), "string", cmd);
|
||||||
|
tclap::ValueArg<string> dialogFile("d", "dialogFile", "A file containing the text of the dialog.", false, string(), "string", cmd);
|
||||||
auto exportFormats = vector<ExportFormat>(ExportFormatConverter::get().getValues());
|
auto exportFormats = vector<ExportFormat>(ExportFormatConverter::get().getValues());
|
||||||
tclap::ValuesConstraint<ExportFormat> exportFormatConstraint(exportFormats);
|
tclap::ValuesConstraint<ExportFormat> exportFormatConstraint(exportFormats);
|
||||||
tclap::ValueArg<ExportFormat> exportFormat("f", "exportFormat", "The export format.", false, ExportFormat::TSV, &exportFormatConstraint, cmd);
|
tclap::ValueArg<ExportFormat> exportFormat("f", "exportFormat", "The export format.", false, ExportFormat::TSV, &exportFormatConstraint, cmd);
|
||||||
|
@ -117,6 +140,7 @@ int main(int argc, char *argv[]) {
|
||||||
ProgressBar progressBar;
|
ProgressBar progressBar;
|
||||||
phones = detectPhones(
|
phones = detectPhones(
|
||||||
createAudioStream(inputFileName.getValue()),
|
createAudioStream(inputFileName.getValue()),
|
||||||
|
dialogFile.isSet() ? readTextFile(path(dialogFile.getValue())) : boost::optional<u32string>(),
|
||||||
progressBar);
|
progressBar);
|
||||||
}
|
}
|
||||||
std::cerr << "Done" << std::endl;
|
std::cerr << "Done" << std::endl;
|
||||||
|
|
|
@ -1,6 +1,5 @@
|
||||||
#include <iostream>
|
#include <iostream>
|
||||||
#include <boost/filesystem.hpp>
|
#include <boost/filesystem.hpp>
|
||||||
#include <boost/algorithm/string.hpp>
|
|
||||||
#include "phoneExtraction.h"
|
#include "phoneExtraction.h"
|
||||||
#include "audio/SampleRateConverter.h"
|
#include "audio/SampleRateConverter.h"
|
||||||
#include "platformTools.h"
|
#include "platformTools.h"
|
||||||
|
@ -14,6 +13,9 @@
|
||||||
#include <Timeline.h>
|
#include <Timeline.h>
|
||||||
#include <audio/voiceActivityDetection.h>
|
#include <audio/voiceActivityDetection.h>
|
||||||
#include <audio/AudioStreamSegment.h>
|
#include <audio/AudioStreamSegment.h>
|
||||||
|
#include "languageModels.h"
|
||||||
|
#include "tokenization.h"
|
||||||
|
#include "g2p.h"
|
||||||
|
|
||||||
extern "C" {
|
extern "C" {
|
||||||
#include <pocketsphinx.h>
|
#include <pocketsphinx.h>
|
||||||
|
@ -35,33 +37,34 @@ using std::function;
|
||||||
using std::regex;
|
using std::regex;
|
||||||
using std::regex_replace;
|
using std::regex_replace;
|
||||||
using std::chrono::duration;
|
using std::chrono::duration;
|
||||||
|
using boost::optional;
|
||||||
|
using std::u32string;
|
||||||
|
|
||||||
constexpr int sphinxSampleRate = 16000;
|
constexpr int sphinxSampleRate = 16000;
|
||||||
|
|
||||||
lambda_unique_ptr<cmd_ln_t> createConfig(path sphinxModelDirectory) {
|
// Returns the directory containing the Sphinx model files: the "res/sphinx"
// subdirectory of the directory returned by getBinDirectory().
// The path is computed on the first call and reused afterwards.
const path& getSphinxModelDirectory() {
    static const path modelDirectory = getBinDirectory() / "res/sphinx";
    return modelDirectory;
}
|
||||||
|
|
||||||
|
lambda_unique_ptr<ps_decoder_t> createDecoder() {
|
||||||
lambda_unique_ptr<cmd_ln_t> config(
|
lambda_unique_ptr<cmd_ln_t> config(
|
||||||
cmd_ln_init(
|
cmd_ln_init(
|
||||||
nullptr, ps_args(), true,
|
nullptr, ps_args(), true,
|
||||||
// Set acoustic model
|
// Set acoustic model
|
||||||
"-hmm", (sphinxModelDirectory / "acoustic-model").string().c_str(),
|
"-hmm", (getSphinxModelDirectory() / "acoustic-model").string().c_str(),
|
||||||
// Set language model
|
// Set pronunciation dictionary
|
||||||
"-lm", (sphinxModelDirectory / "en-us.lm.bin").string().c_str(),
|
"-dict", (getSphinxModelDirectory() / "cmudict-en-us.dict").string().c_str(),
|
||||||
// Set pronounciation dictionary
|
|
||||||
"-dict", (sphinxModelDirectory / "cmudict-en-us.dict").string().c_str(),
|
|
||||||
// Add noise against zero silence (see http://cmusphinx.sourceforge.net/wiki/faq#qwhy_my_accuracy_is_poor)
|
// Add noise against zero silence (see http://cmusphinx.sourceforge.net/wiki/faq#qwhy_my_accuracy_is_poor)
|
||||||
"-dither", "yes",
|
"-dither", "yes",
|
||||||
nullptr),
|
nullptr),
|
||||||
[](cmd_ln_t* config) { cmd_ln_free_r(config); });
|
[](cmd_ln_t* config) { cmd_ln_free_r(config); });
|
||||||
if (!config) throw runtime_error("Error creating configuration.");
|
if (!config) throw runtime_error("Error creating configuration.");
|
||||||
|
|
||||||
return config;
|
|
||||||
}
|
|
||||||
|
|
||||||
lambda_unique_ptr<ps_decoder_t> createSpeechRecognizer(cmd_ln_t& config) {
|
|
||||||
lambda_unique_ptr<ps_decoder_t> recognizer(
|
lambda_unique_ptr<ps_decoder_t> recognizer(
|
||||||
ps_init(&config),
|
ps_init(config.get()),
|
||||||
[](ps_decoder_t* recognizer) { ps_free(recognizer); });
|
[](ps_decoder_t* recognizer) { ps_free(recognizer); });
|
||||||
if (!recognizer) throw runtime_error("Error creating speech recognizer.");
|
if (!recognizer) throw runtime_error("Error creating speech decoder.");
|
||||||
|
|
||||||
return recognizer;
|
return recognizer;
|
||||||
}
|
}
|
||||||
|
@ -141,32 +144,32 @@ void sphinxLogCallback(void* user_data, err_lvl_t errorLevel, const char* format
|
||||||
logging::log(logLevel, message);
|
logging::log(logLevel, message);
|
||||||
}
|
}
|
||||||
|
|
||||||
BoundedTimeline<string> recognizeWords(unique_ptr<AudioStream> audioStream, ps_decoder_t& recognizer, ProgressSink& progressSink) {
|
BoundedTimeline<string> recognizeWords(unique_ptr<AudioStream> audioStream, ps_decoder_t& decoder, ProgressSink& progressSink) {
|
||||||
// Convert audio stream to the exact format PocketSphinx requires
|
// Convert audio stream to the exact format PocketSphinx requires
|
||||||
audioStream = convertSampleRate(std::move(audioStream), sphinxSampleRate);
|
audioStream = convertSampleRate(std::move(audioStream), sphinxSampleRate);
|
||||||
|
|
||||||
// Restart timing at 0
|
// Restart timing at 0
|
||||||
ps_start_stream(&recognizer);
|
ps_start_stream(&decoder);
|
||||||
|
|
||||||
// Start recognition
|
// Start recognition
|
||||||
int error = ps_start_utt(&recognizer);
|
int error = ps_start_utt(&decoder);
|
||||||
if (error) throw runtime_error("Error starting utterance processing for word recognition.");
|
if (error) throw runtime_error("Error starting utterance processing for word recognition.");
|
||||||
|
|
||||||
// Process entire sound file
|
// Process entire sound file
|
||||||
auto processBuffer = [&recognizer](const vector<int16_t>& buffer) {
|
auto processBuffer = [&decoder](const vector<int16_t>& buffer) {
|
||||||
int searchedFrameCount = ps_process_raw(&recognizer, buffer.data(), buffer.size(), false, false);
|
int searchedFrameCount = ps_process_raw(&decoder, buffer.data(), buffer.size(), false, false);
|
||||||
if (searchedFrameCount < 0) throw runtime_error("Error analyzing raw audio data for word recognition.");
|
if (searchedFrameCount < 0) throw runtime_error("Error analyzing raw audio data for word recognition.");
|
||||||
};
|
};
|
||||||
processAudioStream(*audioStream.get(), processBuffer, progressSink);
|
processAudioStream(*audioStream.get(), processBuffer, progressSink);
|
||||||
|
|
||||||
// End recognition
|
// End recognition
|
||||||
error = ps_end_utt(&recognizer);
|
error = ps_end_utt(&decoder);
|
||||||
if (error) throw runtime_error("Error ending utterance processing for word recognition.");
|
if (error) throw runtime_error("Error ending utterance processing for word recognition.");
|
||||||
|
|
||||||
// Collect words
|
// Collect words
|
||||||
BoundedTimeline<string> result(audioStream->getTruncatedRange());
|
BoundedTimeline<string> result(audioStream->getTruncatedRange());
|
||||||
int32_t score;
|
int32_t score;
|
||||||
for (ps_seg_t* it = ps_seg_iter(&recognizer, &score); it; it = ps_seg_next(it)) {
|
for (ps_seg_t* it = ps_seg_iter(&decoder, &score); it; it = ps_seg_next(it)) {
|
||||||
const char* word = ps_seg_word(it);
|
const char* word = ps_seg_word(it);
|
||||||
int firstFrame, lastFrame;
|
int firstFrame, lastFrame;
|
||||||
ps_seg_frames(it, &firstFrame, &lastFrame);
|
ps_seg_frames(it, &firstFrame, &lastFrame);
|
||||||
|
@ -176,35 +179,6 @@ BoundedTimeline<string> recognizeWords(unique_ptr<AudioStream> audioStream, ps_d
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Splits dialog into words, doing minimal preprocessing.
|
|
||||||
// A robust solution should use TTS logic to cope with numbers, abbreviations, unknown words etc.
|
|
||||||
vector<string> extractDialogWords(string dialog) {
|
|
||||||
// Convert to lower case
|
|
||||||
boost::algorithm::to_lower(dialog);
|
|
||||||
|
|
||||||
// Insert silences where appropriate
|
|
||||||
dialog = regex_replace(dialog, regex("[,;.:!?] |-"), " <sil> ");
|
|
||||||
|
|
||||||
// Remove all undesired characters
|
|
||||||
dialog = regex_replace(dialog, regex("[^a-z.'\\0-9<>]"), " ");
|
|
||||||
|
|
||||||
// Collapse whitespace
|
|
||||||
dialog = regex_replace(dialog, regex("\\s+"), " ");
|
|
||||||
|
|
||||||
// Trim
|
|
||||||
boost::algorithm::trim(dialog);
|
|
||||||
|
|
||||||
// Ugly hack: Remove trailing period
|
|
||||||
if (boost::algorithm::ends_with(dialog, ".")) {
|
|
||||||
dialog.pop_back();
|
|
||||||
}
|
|
||||||
|
|
||||||
// Split into words
|
|
||||||
vector<string> result;
|
|
||||||
boost::algorithm::split(result, dialog, boost::is_space());
|
|
||||||
return result;
|
|
||||||
}
|
|
||||||
|
|
||||||
s3wid_t getWordId(const string& word, dict_t& dictionary) {
|
s3wid_t getWordId(const string& word, dict_t& dictionary) {
|
||||||
s3wid_t wordId = dict_wordid(&dictionary, word.c_str());
|
s3wid_t wordId = dict_wordid(&dictionary, word.c_str());
|
||||||
if (wordId == BAD_S3WID) throw invalid_argument(fmt::format("Unknown word '{}'.", word));
|
if (wordId == BAD_S3WID) throw invalid_argument(fmt::format("Unknown word '{}'.", word));
|
||||||
|
@ -214,12 +188,12 @@ s3wid_t getWordId(const string& word, dict_t& dictionary) {
|
||||||
BoundedTimeline<Phone> getPhoneAlignment(
|
BoundedTimeline<Phone> getPhoneAlignment(
|
||||||
const vector<s3wid_t>& wordIds,
|
const vector<s3wid_t>& wordIds,
|
||||||
unique_ptr<AudioStream> audioStream,
|
unique_ptr<AudioStream> audioStream,
|
||||||
ps_decoder_t& recognizer,
|
ps_decoder_t& decoder,
|
||||||
ProgressSink& progressSink)
|
ProgressSink& progressSink)
|
||||||
{
|
{
|
||||||
// Create alignment list
|
// Create alignment list
|
||||||
lambda_unique_ptr<ps_alignment_t> alignment(
|
lambda_unique_ptr<ps_alignment_t> alignment(
|
||||||
ps_alignment_init(recognizer.d2p),
|
ps_alignment_init(decoder.d2p),
|
||||||
[](ps_alignment_t* alignment) { ps_alignment_free(alignment); });
|
[](ps_alignment_t* alignment) { ps_alignment_free(alignment); });
|
||||||
if (!alignment) throw runtime_error("Error creating alignment.");
|
if (!alignment) throw runtime_error("Error creating alignment.");
|
||||||
for (s3wid_t wordId : wordIds) {
|
for (s3wid_t wordId : wordIds) {
|
||||||
|
@ -233,9 +207,9 @@ BoundedTimeline<Phone> getPhoneAlignment(
|
||||||
audioStream = convertSampleRate(std::move(audioStream), sphinxSampleRate);
|
audioStream = convertSampleRate(std::move(audioStream), sphinxSampleRate);
|
||||||
|
|
||||||
// Create search structure
|
// Create search structure
|
||||||
acmod_t* acousticModel = recognizer.acmod;
|
acmod_t* acousticModel = decoder.acmod;
|
||||||
lambda_unique_ptr<ps_search_t> search(
|
lambda_unique_ptr<ps_search_t> search(
|
||||||
state_align_search_init("state_align", recognizer.config, acousticModel, alignment.get()),
|
state_align_search_init("state_align", decoder.config, acousticModel, alignment.get()),
|
||||||
[](ps_search_t* search) { ps_search_free(search); });
|
[](ps_search_t* search) { ps_search_free(search); });
|
||||||
if (!search) throw runtime_error("Error creating search.");
|
if (!search) throw runtime_error("Error creating search.");
|
||||||
|
|
||||||
|
@ -247,7 +221,7 @@ BoundedTimeline<Phone> getPhoneAlignment(
|
||||||
ps_search_start(search.get());
|
ps_search_start(search.get());
|
||||||
|
|
||||||
// Process entire sound file
|
// Process entire sound file
|
||||||
auto processBuffer = [&recognizer, &acousticModel, &search](const vector<int16_t>& buffer) {
|
auto processBuffer = [&decoder, &acousticModel, &search](const vector<int16_t>& buffer) {
|
||||||
const int16* nextSample = buffer.data();
|
const int16* nextSample = buffer.data();
|
||||||
size_t remainingSamples = buffer.size();
|
size_t remainingSamples = buffer.size();
|
||||||
while (acmod_process_raw(acousticModel, &nextSample, &remainingSamples, false) > 0) {
|
while (acmod_process_raw(acousticModel, &nextSample, &remainingSamples, false) > 0) {
|
||||||
|
@ -266,7 +240,7 @@ BoundedTimeline<Phone> getPhoneAlignment(
|
||||||
acmod_end_utt(acousticModel);
|
acmod_end_utt(acousticModel);
|
||||||
|
|
||||||
// Extract phones with timestamps
|
// Extract phones with timestamps
|
||||||
char** phoneNames = recognizer.dict->mdef->ciname;
|
char** phoneNames = decoder.dict->mdef->ciname;
|
||||||
BoundedTimeline<Phone> result(audioStream->getTruncatedRange());
|
BoundedTimeline<Phone> result(audioStream->getTruncatedRange());
|
||||||
for (ps_alignment_iter_t* it = ps_alignment_phones(alignment.get()); it; it = ps_alignment_iter_next(it)) {
|
for (ps_alignment_iter_t* it = ps_alignment_phones(alignment.get()); it; it = ps_alignment_iter_next(it)) {
|
||||||
// Get phone
|
// Get phone
|
||||||
|
@ -285,8 +259,28 @@ BoundedTimeline<Phone> getPhoneAlignment(
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void addMissingDictionaryWords(const vector<string>& words, ps_decoder_t& decoder) {
|
||||||
|
map<string, string> missingPronunciations;
|
||||||
|
for (const string& word : words) {
|
||||||
|
if (dict_wordid(decoder.dict, word.c_str()) == BAD_S3WID) {
|
||||||
|
string pronunciation;
|
||||||
|
for (Phone phone : wordToPhones(word)) {
|
||||||
|
if (pronunciation.length() > 0) pronunciation += " ";
|
||||||
|
pronunciation += PhoneConverter::get().toString(phone);
|
||||||
|
}
|
||||||
|
missingPronunciations[word] = pronunciation;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for (auto it = missingPronunciations.begin(); it != missingPronunciations.end(); ++it) {
|
||||||
|
bool isLast = it == --missingPronunciations.end();
|
||||||
|
logging::infoFormat("Unknown word '{}'. Guessing pronunciation '{}'.", it->first, it->second);
|
||||||
|
ps_add_word(&decoder, it->first.c_str(), it->second.c_str(), isLast);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
BoundedTimeline<Phone> detectPhones(
|
BoundedTimeline<Phone> detectPhones(
|
||||||
unique_ptr<AudioStream> audioStream,
|
unique_ptr<AudioStream> audioStream,
|
||||||
|
optional<u32string> dialog,
|
||||||
ProgressSink& progressSink)
|
ProgressSink& progressSink)
|
||||||
{
|
{
|
||||||
// Pocketsphinx doesn't like empty input
|
// Pocketsphinx doesn't like empty input
|
||||||
|
@ -305,13 +299,6 @@ BoundedTimeline<Phone> detectPhones(
|
||||||
audioStream = removeDCOffset(std::move(audioStream));
|
audioStream = removeDCOffset(std::move(audioStream));
|
||||||
|
|
||||||
try {
|
try {
|
||||||
// Create PocketSphinx configuration
|
|
||||||
path sphinxModelDirectory(getBinDirectory() / "res/sphinx");
|
|
||||||
auto config = createConfig(sphinxModelDirectory);
|
|
||||||
|
|
||||||
// Create speech recognizer
|
|
||||||
auto recognizer = createSpeechRecognizer(*config.get());
|
|
||||||
|
|
||||||
// Split audio into utterances
|
// Split audio into utterances
|
||||||
BoundedTimeline<void> utterances = detectVoiceActivity(audioStream->clone(true));
|
BoundedTimeline<void> utterances = detectVoiceActivity(audioStream->clone(true));
|
||||||
|
|
||||||
|
@ -323,6 +310,29 @@ BoundedTimeline<Phone> detectPhones(
|
||||||
}
|
}
|
||||||
auto utteranceProgressSinkIt = utteranceProgressSinks.begin();
|
auto utteranceProgressSinkIt = utteranceProgressSinks.begin();
|
||||||
|
|
||||||
|
// Create speech recognizer
|
||||||
|
auto decoder = createDecoder();
|
||||||
|
|
||||||
|
// Set language model
|
||||||
|
lambda_unique_ptr<ngram_model_t> languageModel;
|
||||||
|
if (dialog) {
|
||||||
|
// Create dialog-specific language model
|
||||||
|
vector<string> words = tokenizeText(*dialog);
|
||||||
|
words.insert(words.begin(), "<s>");
|
||||||
|
words.push_back("</s>");
|
||||||
|
languageModel = createLanguageModel(words, *decoder->lmath);
|
||||||
|
|
||||||
|
// Add any dialog-specific words to the dictionary
|
||||||
|
addMissingDictionaryWords(words, *decoder);
|
||||||
|
} else {
|
||||||
|
path modelPath = getSphinxModelDirectory() / "en-us.lm.bin";
|
||||||
|
languageModel = lambda_unique_ptr<ngram_model_t>(
|
||||||
|
ngram_model_read(decoder->config, modelPath.string().c_str(), NGRAM_AUTO, decoder->lmath),
|
||||||
|
[](ngram_model_t* lm) { ngram_model_free(lm); });
|
||||||
|
}
|
||||||
|
ps_set_lm(decoder.get(), "lm", languageModel.get());
|
||||||
|
ps_set_search(decoder.get(), "lm");
|
||||||
|
|
||||||
BoundedTimeline<Phone> result(audioStream->getTruncatedRange());
|
BoundedTimeline<Phone> result(audioStream->getTruncatedRange());
|
||||||
for (const auto& timedUtterance : utterances) {
|
for (const auto& timedUtterance : utterances) {
|
||||||
ProgressMerger utteranceProgressMerger(**utteranceProgressSinkIt++);
|
ProgressMerger utteranceProgressMerger(**utteranceProgressSinkIt++);
|
||||||
|
@ -335,7 +345,7 @@ BoundedTimeline<Phone> detectPhones(
|
||||||
auto streamSegment = createSegment(audioStream->clone(true), timeRange);
|
auto streamSegment = createSegment(audioStream->clone(true), timeRange);
|
||||||
|
|
||||||
// Get words
|
// Get words
|
||||||
BoundedTimeline<string> words = recognizeWords(streamSegment->clone(true), *recognizer.get(), wordRecognitionProgressSink);
|
BoundedTimeline<string> words = recognizeWords(streamSegment->clone(true), *decoder.get(), wordRecognitionProgressSink);
|
||||||
for (Timed<string> timedWord : words) {
|
for (Timed<string> timedWord : words) {
|
||||||
timedWord.getTimeRange().shift(timedUtterance.getStart());
|
timedWord.getTimeRange().shift(timedUtterance.getStart());
|
||||||
logging::logTimedEvent("word", timedWord);
|
logging::logTimedEvent("word", timedWord);
|
||||||
|
@ -344,12 +354,12 @@ BoundedTimeline<Phone> detectPhones(
|
||||||
// Look up words in dictionary
|
// Look up words in dictionary
|
||||||
vector<s3wid_t> wordIds;
|
vector<s3wid_t> wordIds;
|
||||||
for (const auto& timedWord : words) {
|
for (const auto& timedWord : words) {
|
||||||
wordIds.push_back(getWordId(timedWord.getValue(), *recognizer->dict));
|
wordIds.push_back(getWordId(timedWord.getValue(), *decoder->dict));
|
||||||
}
|
}
|
||||||
if (wordIds.empty()) continue;
|
if (wordIds.empty()) continue;
|
||||||
|
|
||||||
// Align the words' phones with speech
|
// Align the words' phones with speech
|
||||||
BoundedTimeline<Phone> segmentPhones = getPhoneAlignment(wordIds, std::move(streamSegment), *recognizer.get(), alignmentProgressSink);
|
BoundedTimeline<Phone> segmentPhones = getPhoneAlignment(wordIds, std::move(streamSegment), *decoder.get(), alignmentProgressSink);
|
||||||
segmentPhones.shift(timedUtterance.getStart());
|
segmentPhones.shift(timedUtterance.getStart());
|
||||||
for (const auto& timedPhone : segmentPhones) {
|
for (const auto& timedPhone : segmentPhones) {
|
||||||
logging::logTimedEvent("phone", timedPhone);
|
logging::logTimedEvent("phone", timedPhone);
|
||||||
|
|
|
@ -8,4 +8,5 @@
|
||||||
|
|
||||||
BoundedTimeline<Phone> detectPhones(
|
BoundedTimeline<Phone> detectPhones(
|
||||||
std::unique_ptr<AudioStream> audioStream,
|
std::unique_ptr<AudioStream> audioStream,
|
||||||
|
boost::optional<std::u32string> dialog,
|
||||||
ProgressSink& progressSink);
|
ProgressSink& progressSink);
|
||||||
|
|
|
@ -2,10 +2,14 @@
|
||||||
#include <boost/filesystem/path.hpp>
|
#include <boost/filesystem/path.hpp>
|
||||||
#include <boost/predef.h>
|
#include <boost/predef.h>
|
||||||
#include <format.h>
|
#include <format.h>
|
||||||
|
#include <boost/uuid/uuid.hpp>
|
||||||
|
#include <boost/uuid/uuid_generators.hpp>
|
||||||
|
#include <boost/uuid/uuid_io.hpp>
|
||||||
|
|
||||||
#include "platformTools.h"
|
#include "platformTools.h"
|
||||||
|
|
||||||
using boost::filesystem::path;
|
using boost::filesystem::path;
|
||||||
|
using std::string;
|
||||||
|
|
||||||
constexpr int InitialBufferSize = 256;
|
constexpr int InitialBufferSize = 256;
|
||||||
|
|
||||||
|
@ -129,3 +133,10 @@ path getBinPath() {
|
||||||
path getBinDirectory() {
|
path getBinDirectory() {
|
||||||
return getBinPath().parent_path();
|
return getBinPath().parent_path();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Builds a path for a temporary file: the temporary directory reported by Boost.Filesystem,
// followed by a file name that is the string form of a newly generated UUID.
// Only the path is computed here; this function does not create the file.
path getTempFilePath() {
	// Resolve the directory before touching the UUID generator
	const path directory = boost::filesystem::temp_directory_path();

	// A single generator instance is kept alive and reused by every call
	static boost::uuids::random_generator uuidGenerator;
	const string uuidText = to_string(uuidGenerator());

	return directory / uuidText;
}
|
||||||
|
|
|
@ -4,3 +4,4 @@
|
||||||
|
|
||||||
boost::filesystem::path getBinPath();
|
boost::filesystem::path getBinPath();
|
||||||
boost::filesystem::path getBinDirectory();
|
boost::filesystem::path getBinDirectory();
|
||||||
|
boost::filesystem::path getTempFilePath();
|
||||||
|
|
|
@ -1,5 +1,6 @@
|
||||||
#include "stringTools.h"
|
#include "stringTools.h"
|
||||||
#include <boost/algorithm/string/trim.hpp>
|
#include <boost/algorithm/string/trim.hpp>
|
||||||
|
#include <codecvt>
|
||||||
|
|
||||||
using std::string;
|
using std::string;
|
||||||
using std::wstring;
|
using std::wstring;
|
||||||
|
@ -106,3 +107,12 @@ string toASCII(const u32string& s) {
|
||||||
}
|
}
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Converts a UTF-8 encoded string to UTF-32.
//
// s: The UTF-8 encoded input. Embedded NUL characters are preserved.
// Returns the decoded code points, one char32_t per code point.
// Decoding errors are reported by std::wstring_convert::from_bytes, which throws
// std::range_error because no error string is supplied to the converter.
std::u32string utf8ToUtf32(const std::string& s) {
#ifdef _MSC_VER
	// Visual Studio 2015 has a bug regarding char32_t:
	// https://connect.microsoft.com/VisualStudio/feedback/details/1403302/unresolved-external-when-using-codecvt-utf8
	// Work around it by converting to uint32_t and reinterpreting the result as char32_t.
	std::wstring_convert<std::codecvt_utf8<uint32_t>, uint32_t> convert;
	const auto converted = convert.from_bytes(s);

	// Pass the length explicitly: constructing from a bare pointer would stop at the first U+0000
	return std::u32string(reinterpret_cast<const char32_t*>(converted.data()), converted.size());
#else
	// The uint32_t workaround is MSVC-specific; elsewhere, use the char32_t facet directly.
	// (libstdc++ and libc++ do not appear to implement codecvt_utf8 for uint32_t at all.)
	std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> convert;
	return convert.from_bytes(s);
#endif
}
|
||||||
|
|
|
@ -1,6 +1,5 @@
|
||||||
#pragma once
|
#pragma once
|
||||||
|
|
||||||
#include <string>
|
|
||||||
#include <vector>
|
#include <vector>
|
||||||
#include <boost/optional.hpp>
|
#include <boost/optional.hpp>
|
||||||
|
|
||||||
|
@ -15,3 +14,5 @@ std::wstring latin1ToWide(const std::string& s);
|
||||||
boost::optional<char> toASCII(char32_t ch);
|
boost::optional<char> toASCII(char32_t ch);
|
||||||
|
|
||||||
std::string toASCII(const std::u32string& s);
|
std::string toASCII(const std::u32string& s);
|
||||||
|
|
||||||
|
std::u32string utf8ToUtf32(const std::string& s);
|
|
@ -0,0 +1,40 @@
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include <cstddef>
#include <functional>
#include <tuple>
|
||||||
|
|
||||||
|
// Implementation details for hashing std::tuple.
// They are kept outside namespace std, because only template specializations may be added
// there, and in a named rather than anonymous namespace, because this is a header.
namespace tupleHashDetail {

	// Mixes the hash of value into seed.
	template <typename T>
	void hash_combine(std::size_t& seed, const T& value) {
		seed ^= std::hash<T>()(value) + 0x9e3779b9 + (seed << 6) + (seed >> 2);
	}

	// Recursive template code derived from Matthieu M.
	// Folds the first Count elements of a tuple into seed, in ascending index order.
	// Recursing on the element count (instead of the last index) keeps std::tuple<> valid.
	template <typename Tuple, std::size_t Count = std::tuple_size<Tuple>::value>
	struct HashValueImpl {
		static void apply(std::size_t& seed, const Tuple& tuple) {
			HashValueImpl<Tuple, Count - 1>::apply(seed, tuple);
			tupleHashDetail::hash_combine(seed, std::get<Count - 1>(tuple));
		}
	};

	// End of recursion: no elements left, so seed stays as it is.
	template <typename Tuple>
	struct HashValueImpl<Tuple, 0> {
		static void apply(std::size_t&, const Tuple&) {}
	};

}

namespace std {

	// Makes std::tuple usable as a key in unordered containers.
	// The hash starts at 0 and combines the element hashes from first to last;
	// an empty tuple therefore hashes to 0.
	// NOTE(review): the standard only sanctions std specializations that depend on a
	// program-defined type; this one is kept because existing callers rely on it.
	template <typename ... TT>
	struct hash<tuple<TT...>> {
		size_t operator()(const tuple<TT...>& tt) const {
			size_t seed = 0;
			tupleHashDetail::HashValueImpl<tuple<TT...>>::apply(seed, tt);
			return seed;
		}
	};

}
|
Loading…
Reference in New Issue