From 610f4900469f40888feae428df27e8efca82d2d8 Mon Sep 17 00:00:00 2001
From: Daniel Wolf
Date: Mon, 8 Oct 2018 20:30:45 +0200
Subject: [PATCH] Implement generic concept of recognizers with options
 pocketSphinx and phonetic

---
 rhubarb/CMakeLists.txt                           |  11 +-
 rhubarb/resharper.DotSettings                    |  23 +
 rhubarb/src/lib/rhubarbLib.cpp                   |  13 +-
 rhubarb/src/lib/rhubarbLib.h                     |   7 +-
 .../src/recognition/PhoneticRecognizer.cpp       | 103 +++++
 rhubarb/src/recognition/PhoneticRecognizer.h     |  14 +
 ...gnition.cpp => PocketSphinxRecognizer.cpp}    | 428 +++++------------
 rhubarb/src/recognition/PocketSphinxRecognizer.h |  14 +
 rhubarb/src/recognition/Recognizer.h             |  18 +
 rhubarb/src/recognition/phoneRecognition.h       |  12 -
 rhubarb/src/recognition/pocketSphinxTools.cpp    | 218 +++++++++
 rhubarb/src/recognition/pocketSphinxTools.h      |  39 ++
 rhubarb/src/rhubarb/RecognizerType.cpp           |  27 ++
 rhubarb/src/rhubarb/RecognizerType.h             |  20 +
 rhubarb/src/rhubarb/main.cpp                     |  25 +-
 15 files changed, 635 insertions(+), 337 deletions(-)
 create mode 100644 rhubarb/src/recognition/PhoneticRecognizer.cpp
 create mode 100644 rhubarb/src/recognition/PhoneticRecognizer.h
 rename rhubarb/src/recognition/{phoneRecognition.cpp => PocketSphinxRecognizer.cpp} (55%)
 create mode 100644 rhubarb/src/recognition/PocketSphinxRecognizer.h
 create mode 100644 rhubarb/src/recognition/Recognizer.h
 delete mode 100644 rhubarb/src/recognition/phoneRecognition.h
 create mode 100644 rhubarb/src/recognition/pocketSphinxTools.cpp
 create mode 100644 rhubarb/src/recognition/pocketSphinxTools.h
 create mode 100644 rhubarb/src/rhubarb/RecognizerType.cpp
 create mode 100644 rhubarb/src/rhubarb/RecognizerType.h

diff --git a/rhubarb/CMakeLists.txt b/rhubarb/CMakeLists.txt
index 9353edf..0fef2b7 100644
--- a/rhubarb/CMakeLists.txt
+++ b/rhubarb/CMakeLists.txt
@@ -413,8 +413,13 @@ add_library(rhubarb-recognition
 	src/recognition/g2p.h
 	src/recognition/languageModels.cpp
 	src/recognition/languageModels.h
-	src/recognition/phoneRecognition.cpp
-	src/recognition/phoneRecognition.h
+	src/recognition/PhoneticRecognizer.cpp
+	src/recognition/PhoneticRecognizer.h
+	src/recognition/PocketSphinxRecognizer.cpp
+	src/recognition/PocketSphinxRecognizer.h
+	src/recognition/pocketSphinxTools.cpp
+	src/recognition/pocketSphinxTools.h
+	src/recognition/Recognizer.h
 	src/recognition/tokenization.cpp
 	src/recognition/tokenization.h
 )
@@ -487,6 +492,8 @@ add_executable(rhubarb
 	src/rhubarb/main.cpp
 	src/rhubarb/ExportFormat.cpp
 	src/rhubarb/ExportFormat.h
+	src/rhubarb/RecognizerType.cpp
+	src/rhubarb/RecognizerType.h
 	src/rhubarb/semanticEntries.cpp
 	src/rhubarb/semanticEntries.h
 	src/rhubarb/sinks.cpp
diff --git a/rhubarb/resharper.DotSettings b/rhubarb/resharper.DotSettings
index b16b555..168efbe 100644
--- a/rhubarb/resharper.DotSettings
+++ b/rhubarb/resharper.DotSettings
@@ -1,7 +1,12 @@
 
+	HINT
+	ERROR
+	DO_NOT_SHOW
+	USE_TABS_ONLY
+	USE_TABS_ONLY
 	False
 	False
 	False
@@ -29,6 +34,7 @@
 	CHOP_ALWAYS
 	END_OF_LINE
 	END_OF_LINE
+	USE_TABS_ONLY
 	False
 	END_OF_LINE
 	END_OF_LINE
@@ -44,6 +50,14 @@
 	False
 	True
 	False
+	USE_TABS_ONLY
+	USE_TABS_ONLY
+	USE_TABS_ONLY
+	USE_TABS_ONLY
+	USE_TABS_ONLY
+	USE_TABS_ONLY
+	USE_TABS_ONLY
+	USE_TABS_ONLY
 	UseExplicitType
 	UseVarWhenEvident
 	<NamingElement Priority="10"><Descriptor Static="Indeterminate" Constexpr="Indeterminate" Const="Indeterminate" Volatile="Indeterminate" Accessibility="NOT_APPLICABLE"><type Name="class field" /><type Name="struct field" /></Descriptor><Policy Inspect="True" Prefix="" Suffix="_" Style="aaBb" /></NamingElement>
@@ -108,7 +122,16 @@
 	C:\Users\Daniel\AppData\Local\JetBrains\Transient\ReSharperPlatformVs14\v09\SolutionCaches
 	True
 	True
+	True
+	True
 	True
 	True
 	True
+	True
+	True
+	True
+	True
+	True
+	True
+	True
\ No newline at end of file
diff --git a/rhubarb/src/lib/rhubarbLib.cpp b/rhubarb/src/lib/rhubarbLib.cpp
index ffadf68..5f8460f 100644
--- a/rhubarb/src/lib/rhubarbLib.cpp
+++ b/rhubarb/src/lib/rhubarbLib.cpp
@@ -1,6 +1,5 @@
 #include "rhubarbLib.h"
 #include "core/Phone.h"
-#include "recognition/phoneRecognition.h"
 #include "tools/textFiles.h"
 #include "animation/mouthAnimation.h"
 #include "audio/audioFileReading.h"
@@ -8,27 +7,29 @@
 using boost::optional;
 using std::string;
 using boost::filesystem::path;
-using std::unique_ptr;

 JoiningContinuousTimeline<Shape> animateAudioClip(
 	const AudioClip& audioClip,
-	optional<string> dialog,
+	const optional<string>& dialog,
+	const Recognizer& recognizer,
 	const ShapeSet& targetShapeSet,
 	int maxThreadCount,
 	ProgressSink& progressSink)
 {
-	BoundedTimeline<Phone> phones = recognizePhones(audioClip, dialog, maxThreadCount, progressSink);
+	const BoundedTimeline<Phone> phones =
+		recognizer.recognizePhones(audioClip, dialog, maxThreadCount, progressSink);
 	JoiningContinuousTimeline<Shape> result = animate(phones, targetShapeSet);
 	return result;
 }

 JoiningContinuousTimeline<Shape> animateWaveFile(
 	path filePath,
-	optional<string> dialog,
+	const optional<string>& dialog,
+	const Recognizer& recognizer,
 	const ShapeSet& targetShapeSet,
 	int maxThreadCount,
 	ProgressSink& progressSink)
 {
 	const auto audioClip = createAudioFileClip(filePath);
-	return animateAudioClip(*audioClip, dialog, targetShapeSet, maxThreadCount, progressSink);
+	return animateAudioClip(*audioClip, dialog, recognizer, targetShapeSet, maxThreadCount, progressSink);
 }
diff --git a/rhubarb/src/lib/rhubarbLib.h b/rhubarb/src/lib/rhubarbLib.h
index 8663761..ca40a06 100644
--- a/rhubarb/src/lib/rhubarbLib.h
+++ b/rhubarb/src/lib/rhubarbLib.h
@@ -6,17 +6,20 @@
 #include "tools/ProgressBar.h"
 #include
 #include "animation/targetShapeSet.h"
+#include "recognition/Recognizer.h"

 JoiningContinuousTimeline<Shape> animateAudioClip(
 	const AudioClip& audioClip,
-	boost::optional<std::string> dialog,
+	const boost::optional<std::string>& dialog,
+	const Recognizer& recognizer,
 	const ShapeSet& targetShapeSet,
 	int maxThreadCount,
 	ProgressSink& progressSink);

 JoiningContinuousTimeline<Shape> animateWaveFile(
 	boost::filesystem::path filePath,
-	boost::optional<std::string> dialog,
+	const boost::optional<std::string>& dialog,
+	const Recognizer& recognizer,
 	const ShapeSet& targetShapeSet,
 	int maxThreadCount,
 	ProgressSink& progressSink);
diff --git a/rhubarb/src/recognition/PhoneticRecognizer.cpp b/rhubarb/src/recognition/PhoneticRecognizer.cpp
new file mode 100644
index 0000000..bd9c9ac
--- /dev/null
+++ b/rhubarb/src/recognition/PhoneticRecognizer.cpp
@@ -0,0 +1,103 @@
+#include "PhoneticRecognizer.h"
+#include "time/Timeline.h"
+#include "audio/AudioSegment.h"
+#include "audio/SampleRateConverter.h"
+#include "audio/processing.h"
+#include "time/timedLogging.h"
+
+using std::runtime_error;
+using std::unique_ptr;
+using std::string;
+using boost::optional;
+
+static lambda_unique_ptr<ps_decoder_t> createDecoder(optional<string> dialog) {
+	UNUSED(dialog);
+
+	lambda_unique_ptr<cmd_ln_t> config(
+		cmd_ln_init(
+			nullptr, ps_args(), true,
+			// Set acoustic model
+			"-hmm", (getSphinxModelDirectory() / "acoustic-model").string().c_str(),
+			// Set phonetic language model
+			"-allphone", (getSphinxModelDirectory() / "en-us-phone.lm.bin").string().c_str(),
+			"-allphone_ci", "yes",
+			// Set language model probability weight.
+			// Low values (<= 0.4) can lead to fluttering animation.
+			// High values (>= 1.0) can lead to imprecise or freezing animation.
+			"-lw", "0.8",
+
+			// The following settings are recommended at http://cmusphinx.sourceforge.net/wiki/phonemerecognition
+
+			// Set beam width applied to every frame in Viterbi search
+			"-beam", "1e-20",
+			// Set beam width applied to phone transitions
+			"-pbeam", "1e-20",
+			nullptr),
+		[](cmd_ln_t* config) { cmd_ln_free_r(config); });
+	if (!config) throw runtime_error("Error creating configuration.");
+
+	lambda_unique_ptr<ps_decoder_t> decoder(
+		ps_init(config.get()),
+		[](ps_decoder_t* recognizer) { ps_free(recognizer); });
+	if (!decoder) throw runtime_error("Error creating speech decoder.");
+
+	return decoder;
+}
+
+static Timeline<Phone> utteranceToPhones(
+	const AudioClip& audioClip,
+	TimeRange utteranceTimeRange,
+	ps_decoder_t& decoder,
+	ProgressSink& utteranceProgressSink
+) {
+	// Pad time range to give PocketSphinx some breathing room
+	TimeRange paddedTimeRange = utteranceTimeRange;
+	const centiseconds padding(3);
+	paddedTimeRange.grow(padding);
+	paddedTimeRange.trim(audioClip.getTruncatedRange());
+
+	const unique_ptr<AudioClip> clipSegment = audioClip.clone() | segment(paddedTimeRange) | resample(sphinxSampleRate);
+	const auto audioBuffer = copyTo16bitBuffer(*clipSegment);
+
+	// Detect phones (returned as words)
+	BoundedTimeline<string> phoneStrings = recognizeWords(audioBuffer, decoder);
+	phoneStrings.shift(paddedTimeRange.getStart());
+	Timeline<Phone> utterancePhones;
+	for (const auto& timedPhoneString : phoneStrings) {
+		Phone phone = PhoneConverter::get().parse(timedPhoneString.getValue());
+		if (phone == Phone::AH && timedPhoneString.getDuration() < 6_cs) {
+			// Heuristic: < 6_cs is schwa. PocketSphinx doesn't differentiate.
+			phone = Phone::Schwa;
+		}
+		utterancePhones.set(timedPhoneString.getTimeRange(), phone);
+	}
+
+	// Log raw phones
+	for (const auto& timedPhone : utterancePhones) {
+		logTimedEvent("rawPhone", timedPhone);
+	}
+
+	// Guess positions of noise sounds
+	JoiningTimeline<void> noiseSounds = getNoiseSounds(utteranceTimeRange, utterancePhones);
+	for (const auto& noiseSound : noiseSounds) {
+		utterancePhones.set(noiseSound.getTimeRange(), Phone::Noise);
+	}
+
+	// Log phones
+	for (const auto& timedPhone : utterancePhones) {
+		logTimedEvent("phone", timedPhone);
+	}
+
+	utteranceProgressSink.reportProgress(1.0);
+
+	return utterancePhones;
+}
+
+BoundedTimeline<Phone> PhoneticRecognizer::recognizePhones(
+	const AudioClip& inputAudioClip,
+	optional<string> dialog,
+	int maxThreadCount,
+	ProgressSink& progressSink
+) const {
+	return ::recognizePhones(inputAudioClip, dialog, &createDecoder, &utteranceToPhones, maxThreadCount, progressSink);
+}
diff --git a/rhubarb/src/recognition/PhoneticRecognizer.h b/rhubarb/src/recognition/PhoneticRecognizer.h
new file mode 100644
index 0000000..96797cf
--- /dev/null
+++ b/rhubarb/src/recognition/PhoneticRecognizer.h
@@ -0,0 +1,14 @@
+#pragma once
+
+#include "Recognizer.h"
+#include "pocketSphinxTools.h"
+
+class PhoneticRecognizer : public Recognizer {
+public:
+	BoundedTimeline<Phone> recognizePhones(
+		const AudioClip& inputAudioClip,
+		boost::optional<std::string> dialog,
+		int maxThreadCount,
+		ProgressSink& progressSink
+	) const override;
+};
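With the recognizer threaded through animateAudioClip and animateWaveFile, library callers now pick the strategy at the call site. A minimal sketch of the new calling convention -- the no-op progress sink is an assumption for illustration only; the real ProgressSink interface may have more members than reportProgress:

#include "lib/rhubarbLib.h"
#include "recognition/PhoneticRecognizer.h"

// Hypothetical no-op sink, not part of this patch.
class NoopProgressSink : public ProgressSink {
public:
	void reportProgress(double) override {}
};

JoiningContinuousTimeline<Shape> animateClip(
	const boost::filesystem::path& wavFile, const ShapeSet& targetShapeSet
) {
	const PhoneticRecognizer recognizer; // or PocketSphinxRecognizer
	NoopProgressSink progressSink;
	return animateWaveFile(wavFile, boost::none, recognizer, targetShapeSet, 4, progressSink);
}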
diff --git a/rhubarb/src/recognition/phoneRecognition.cpp b/rhubarb/src/recognition/PocketSphinxRecognizer.cpp
similarity index 55%
rename from rhubarb/src/recognition/phoneRecognition.cpp
rename to rhubarb/src/recognition/PocketSphinxRecognizer.cpp
index e45c765..b97c0b7 100644
--- a/rhubarb/src/recognition/phoneRecognition.cpp
+++ b/rhubarb/src/recognition/PocketSphinxRecognizer.cpp
@@ -1,143 +1,133 @@
-#include
-#include "phoneRecognition.h"
-#include "audio/SampleRateConverter.h"
-#include "tools/platformTools.h"
-#include "tools/tools.h"
-#include
-#include
+#include "PocketSphinxRecognizer.h"
 #include
 #include
-#include "logging/logging.h"
-#include "audio/DcOffset.h"
-#include "time/Timeline.h"
-#include "audio/voiceActivityDetection.h"
 #include "audio/AudioSegment.h"
+#include "audio/SampleRateConverter.h"
 #include "languageModels.h"
 #include "tokenization.h"
 #include "g2p.h"
 #include "time/ContinuousTimeline.h"
 #include "audio/processing.h"
-#include "tools/parallel.h"
-#include
-#include "tools/ObjectPool.h"
 #include "time/timedLogging.h"

 extern "C" {
-#include
-#include
-#include
 #include
-#include
-#include
 }

 using std::runtime_error;
 using std::invalid_argument;
 using std::unique_ptr;
-using std::shared_ptr;
 using std::string;
 using std::vector;
 using std::map;
 using boost::filesystem::path;
-using std::function;
 using std::regex;
 using std::regex_replace;
-using std::chrono::duration;
 using boost::optional;
-using std::string;
-using std::chrono::duration_cast;
 using std::array;

-constexpr int sphinxSampleRate = 16000;
-
-const path& getSphinxModelDirectory() {
-	static path sphinxModelDirectory(getBinDirectory() / "res" / "sphinx");
-	return sphinxModelDirectory;
+bool dictionaryContains(dict_t& dictionary, const string& word) {
+	return dict_wordid(&dictionary, word.c_str()) != BAD_S3WID;
 }

-logging::Level ConvertSphinxErrorLevel(err_lvl_t errorLevel) {
-	switch (errorLevel) {
-	case ERR_DEBUG:
-	case ERR_INFO:
-	case ERR_INFOCONT:
-		return logging::Level::Trace;
-	case ERR_WARN:
-		return logging::Level::Warn;
-	case ERR_ERROR:
-		return logging::Level::Error;
-	case ERR_FATAL:
-		return logging::Level::Fatal;
-	default:
-		throw invalid_argument("Unknown log level.");
+s3wid_t getWordId(const string& word, dict_t& dictionary) {
+	const s3wid_t wordId = dict_wordid(&dictionary, word.c_str());
+	if (wordId == BAD_S3WID) throw invalid_argument(fmt::format("Unknown word '{}'.", word));
+	return wordId;
+}
+
+void addMissingDictionaryWords(const vector<string>& words, ps_decoder_t& decoder) {
+	map<string, string> missingPronunciations;
+	for (const string& word : words) {
+		if (!dictionaryContains(*decoder.dict, word)) {
+			string pronunciation;
+			for (Phone phone : wordToPhones(word)) {
+				if (pronunciation.length() > 0) pronunciation += " ";
+				pronunciation += PhoneConverter::get().toString(phone);
+			}
+			missingPronunciations[word] = pronunciation;
+		}
+	}
+	for (auto it = missingPronunciations.begin(); it != missingPronunciations.end(); ++it) {
+		const bool isLast = it == --missingPronunciations.end();
+		logging::infoFormat("Unknown word '{}'. Guessing pronunciation '{}'.", it->first, it->second);
+		ps_add_word(&decoder, it->first.c_str(), it->second.c_str(), isLast);
 	}
 }

-void sphinxLogCallback(void* user_data, err_lvl_t errorLevel, const char* format, ...) {
-	UNUSED(user_data);
-
-	// Create varArgs list
-	va_list args;
-	va_start(args, format);
-	auto _ = gsl::finally([&args]() { va_end(args); });
-
-	// Format message
-	const int initialSize = 256;
-	vector<char> chars(initialSize);
-	bool success = false;
-	while (!success) {
-		int charsWritten = vsnprintf(chars.data(), chars.size(), format, args);
-		if (charsWritten < 0) throw runtime_error("Error formatting Pocketsphinx log message.");
-
-		success = charsWritten < static_cast<int>(chars.size());
-		if (!success) chars.resize(chars.size() * 2);
-	}
-	regex waste("^(DEBUG|INFO|INFOCONT|WARN|ERROR|FATAL): ");
-	string message = regex_replace(chars.data(), waste, "", std::regex_constants::format_first_only);
-	boost::algorithm::trim(message);
-
-	logging::Level logLevel = ConvertSphinxErrorLevel(errorLevel);
-	logging::log(logLevel, message);
-}
-
-BoundedTimeline<string> recognizeWords(const vector<int16_t>& audioBuffer, ps_decoder_t& decoder) {
-	// Restart timing at 0
-	ps_start_stream(&decoder);
-
-	// Start recognition
-	int error = ps_start_utt(&decoder);
-	if (error) throw runtime_error("Error starting utterance processing for word recognition.");
-
-	// Process entire audio clip
-	const bool noRecognition = false;
-	const bool fullUtterance = true;
-	int searchedFrameCount = ps_process_raw(&decoder, audioBuffer.data(), audioBuffer.size(), noRecognition, fullUtterance);
-	if (searchedFrameCount < 0) throw runtime_error("Error analyzing raw audio data for word recognition.");
-
-	// End recognition
-	error = ps_end_utt(&decoder);
-	if (error) throw runtime_error("Error ending utterance processing for word recognition.");
-
-	BoundedTimeline<string> result(TimeRange(0_cs, centiseconds(100 * audioBuffer.size() / sphinxSampleRate)));
-	bool noWordsRecognized = reinterpret_cast<ngram_search_t*>(decoder.search)->bpidx == 0;
-	if (noWordsRecognized) {
-		return result;
-	}
-
-	// Collect words
-	for (ps_seg_t* it = ps_seg_iter(&decoder); it; it = ps_seg_next(it)) {
-		const char* word = ps_seg_word(it);
-		int firstFrame, lastFrame;
-		ps_seg_frames(it, &firstFrame, &lastFrame);
-		result.set(centiseconds(firstFrame), centiseconds(lastFrame + 1), word);
+lambda_unique_ptr<ngram_model_t> createDefaultLanguageModel(ps_decoder_t& decoder) {
+	path modelPath = getSphinxModelDirectory() / "en-us.lm.bin";
+	lambda_unique_ptr<ngram_model_t> result(
+		ngram_model_read(decoder.config, modelPath.string().c_str(), NGRAM_AUTO, decoder.lmath),
+		[](ngram_model_t* lm) { ngram_model_free(lm); });
+	if (!result) {
+		throw runtime_error(fmt::format("Error reading language model from {}.", modelPath));
 	}

 	return result;
 }

-s3wid_t getWordId(const string& word, dict_t& dictionary) {
-	s3wid_t wordId = dict_wordid(&dictionary, word.c_str());
-	if (wordId == BAD_S3WID) throw invalid_argument(fmt::format("Unknown word '{}'.", word));
-	return wordId;
+lambda_unique_ptr<ngram_model_t> createDialogLanguageModel(ps_decoder_t& decoder, const string& dialog) {
+	// Split dialog into normalized words
+	vector<string> words = tokenizeText(dialog, [&](const string& word) { return dictionaryContains(*decoder.dict, word); });
+
+	// Add dialog-specific words to the dictionary
+	addMissingDictionaryWords(words, decoder);
+
+	// Create dialog-specific language model
+	words.insert(words.begin(), "<s>");
+	words.emplace_back("</s>");
+	return createLanguageModel(words, decoder);
+}
+
+lambda_unique_ptr<ngram_model_t> createBiasedLanguageModel(ps_decoder_t& decoder, const string& dialog) {
+	auto defaultLanguageModel = createDefaultLanguageModel(decoder);
+	auto dialogLanguageModel = createDialogLanguageModel(decoder, dialog);
+	constexpr int modelCount = 2;
+	array<ngram_model_t*, modelCount> languageModels{ defaultLanguageModel.get(), dialogLanguageModel.get() };
+	array<const char*, modelCount> modelNames{ "defaultLM", "dialogLM" };
+	array<float, modelCount> modelWeights{ 0.1f, 0.9f };
+	lambda_unique_ptr<ngram_model_t> result(
+		ngram_model_set_init(nullptr, languageModels.data(), const_cast<char**>(modelNames.data()), modelWeights.data(), modelCount),
+		[](ngram_model_t* lm) { ngram_model_free(lm); });
+	if (!result) {
+		throw runtime_error("Error creating biased language model.");
+	}
+
+	return result;
+}
+
+static lambda_unique_ptr<ps_decoder_t> createDecoder(optional<string> dialog) {
+	lambda_unique_ptr<cmd_ln_t> config(
+		cmd_ln_init(
+			nullptr, ps_args(), true,
+			// Set acoustic model
+			"-hmm", (getSphinxModelDirectory() / "acoustic-model").string().c_str(),
+			// Set pronunciation dictionary
+			"-dict", (getSphinxModelDirectory() / "cmudict-en-us.dict").string().c_str(),
+			// Add noise against zero silence (see http://cmusphinx.sourceforge.net/wiki/faq#qwhy_my_accuracy_is_poor)
+			"-dither", "yes",
+			// Disable VAD -- we're doing that ourselves
+			"-remove_silence", "no",
+			// Perform per-utterance cepstral mean normalization
+			"-cmn", "batch",
+			nullptr),
+		[](cmd_ln_t* config) { cmd_ln_free_r(config); });
+	if (!config) throw runtime_error("Error creating configuration.");
+
+	lambda_unique_ptr<ps_decoder_t> decoder(
+		ps_init(config.get()),
+		[](ps_decoder_t* recognizer) { ps_free(recognizer); });
+	if (!decoder) throw runtime_error("Error creating speech decoder.");
+
+	// Set language model
+	lambda_unique_ptr<ngram_model_t> languageModel(dialog
+		? createBiasedLanguageModel(*decoder, *dialog)
+		: createDefaultLanguageModel(*decoder));
+	ps_set_lm(decoder.get(), "lm", languageModel.get());
+	ps_set_search(decoder.get(), "lm");
+
+	return decoder;
 }

 optional<Timeline<Phone>> getPhoneAlignment(
@@ -178,7 +168,7 @@ optional<Timeline<Phone>> getPhoneAlignment(
 	// Process entire audio clip
 	const int16* nextSample = audioBuffer.data();
 	size_t remainingSamples = audioBuffer.size();
-	bool fullUtterance = true;
+	const bool fullUtterance = true;
 	while (acmod_process_raw(acousticModel, &nextSample, &remainingSamples, fullUtterance) > 0) {
 		while (acousticModel->n_feat_frame > 0) {
 			ps_search_step(search.get(), acousticModel->output_frame);
@@ -197,7 +187,7 @@ optional<Timeline<Phone>> getPhoneAlignment(
 	for (ps_alignment_iter_t* it = ps_alignment_phones(alignment.get()); it; it = ps_alignment_iter_next(it)) {
 		// Get phone
 		ps_alignment_entry_t* phoneEntry = ps_alignment_iter_get(it);
-		s3cipid_t phoneId = phoneEntry->id.pid.cipid;
+		const s3cipid_t phoneId = phoneEntry->id.pid.cipid;
 		string phoneName = phoneNames[phoneId];

 		if (phoneName == "SIL") continue;
@@ -207,162 +197,42 @@ optional<Timeline<Phone>> getPhoneAlignment(
 		centiseconds duration(phoneEntry->duration);
 		Phone phone = PhoneConverter::get().parse(phoneName);
 		if (phone == Phone::AH && duration < 6_cs) {
-			// Heuristic: < 6_cs is schwa. Pocketsphinx doesn't differentiate.
+			// Heuristic: < 6_cs is schwa. PocketSphinx doesn't differentiate.
 			phone = Phone::Schwa;
 		}
-		Timed<Phone> timedPhone(start, start + duration, phone);
+		const Timed<Phone> timedPhone(start, start + duration, phone);
 		result.set(timedPhone);
 	}
 	return result;
 }

-bool dictionaryContains(dict_t& dictionary, const string& word) {
-	return dict_wordid(&dictionary, word.c_str()) != BAD_S3WID;
-}
-
-void addMissingDictionaryWords(const vector<string>& words, ps_decoder_t& decoder) {
-	map<string, string> missingPronunciations;
-	for (const string& word : words) {
-		if (!dictionaryContains(*decoder.dict, word)) {
-			string pronunciation;
-			for (Phone phone : wordToPhones(word)) {
-				if (pronunciation.length() > 0) pronunciation += " ";
-				pronunciation += PhoneConverter::get().toString(phone);
-			}
-			missingPronunciations[word] = pronunciation;
-		}
-	}
-	for (auto it = missingPronunciations.begin(); it != missingPronunciations.end(); ++it) {
-		bool isLast = it == --missingPronunciations.end();
-		logging::infoFormat("Unknown word '{}'. Guessing pronunciation '{}'.", it->first, it->second);
-		ps_add_word(&decoder, it->first.c_str(), it->second.c_str(), isLast);
-	}
-}
-
-lambda_unique_ptr<ngram_model_t> createDefaultLanguageModel(ps_decoder_t& decoder) {
-	path modelPath = getSphinxModelDirectory() / "en-us.lm.bin";
-	lambda_unique_ptr<ngram_model_t> result(
-		ngram_model_read(decoder.config, modelPath.string().c_str(), NGRAM_AUTO, decoder.lmath),
-		[](ngram_model_t* lm) { ngram_model_free(lm); });
-	if (!result) {
-		throw runtime_error(fmt::format("Error reading language model from {}.", modelPath));
-	}
-
-	return std::move(result);
-}
-
-lambda_unique_ptr<ngram_model_t> createDialogLanguageModel(ps_decoder_t& decoder, const string& dialog) {
-	// Split dialog into normalized words
-	vector<string> words = tokenizeText(dialog, [&](const string& word) { return dictionaryContains(*decoder.dict, word); });
-
-	// Add dialog-specific words to the dictionary
-	addMissingDictionaryWords(words, decoder);
-
-	// Create dialog-specific language model
-	words.insert(words.begin(), "<s>");
-	words.push_back("</s>");
-	return createLanguageModel(words, decoder);
-}
-
-lambda_unique_ptr<ngram_model_t> createBiasedLanguageModel(ps_decoder_t& decoder, const string& dialog) {
-	auto defaultLanguageModel = createDefaultLanguageModel(decoder);
-	auto dialogLanguageModel = createDialogLanguageModel(decoder, dialog);
-	constexpr int modelCount = 2;
-	array<ngram_model_t*, modelCount> languageModels{ defaultLanguageModel.get(), dialogLanguageModel.get() };
-	array<const char*, modelCount> modelNames{ "defaultLM", "dialogLM" };
-	array<float, modelCount> modelWeights{ 0.1f, 0.9f };
-	lambda_unique_ptr<ngram_model_t> result(
-		ngram_model_set_init(nullptr, languageModels.data(), const_cast<char**>(modelNames.data()), modelWeights.data(), modelCount),
-		[](ngram_model_t* lm) { ngram_model_free(lm); });
-	if (!result) {
-		throw runtime_error("Error creating biased language model.");
-	}
-
-	return std::move(result);
-}
-
-lambda_unique_ptr<ps_decoder_t> createDecoder(optional<string> dialog) {
-	lambda_unique_ptr<cmd_ln_t> config(
-		cmd_ln_init(
-			nullptr, ps_args(), true,
-			// Set acoustic model
-			"-hmm", (getSphinxModelDirectory() / "acoustic-model").string().c_str(),
-			// Set pronunciation dictionary
-			"-dict", (getSphinxModelDirectory() / "cmudict-en-us.dict").string().c_str(),
-			// Add noise against zero silence (see http://cmusphinx.sourceforge.net/wiki/faq#qwhy_my_accuracy_is_poor)
-			"-dither", "yes",
-			// Disable VAD -- we're doing that ourselves
-			"-remove_silence", "no",
-			// Perform per-utterance cepstral mean normalization
-			"-cmn", "batch",
-			nullptr),
-		[](cmd_ln_t* config) { cmd_ln_free_r(config); });
-	if (!config) throw runtime_error("Error creating configuration.");
-
-	lambda_unique_ptr<ps_decoder_t> decoder(
-		ps_init(config.get()),
-		[](ps_decoder_t* recognizer) { ps_free(recognizer); });
-	if (!decoder) throw runtime_error("Error creating speech decoder.");
-
-	// Set language model
-	lambda_unique_ptr<ngram_model_t> languageModel(dialog
-		? createBiasedLanguageModel(*decoder, *dialog)
-		: createDefaultLanguageModel(*decoder));
-	ps_set_lm(decoder.get(), "lm", languageModel.get());
-	ps_set_search(decoder.get(), "lm");
-
-	return decoder;
-}
-
-JoiningTimeline<void> getNoiseSounds(TimeRange utteranceTimeRange, const Timeline<Phone>& phones) {
-	JoiningTimeline<void> noiseSounds;
-
-	// Find utterance parts without recogniced phones
-	noiseSounds.set(utteranceTimeRange);
-	for (const auto& timedPhone : phones) {
-		noiseSounds.clear(timedPhone.getTimeRange());
-	}
-
-	// Remove undesired elements
-	const centiseconds minSoundDuration = 12_cs;
-	for (const auto& unknownSound : JoiningTimeline<void>(noiseSounds)) {
-		bool startsAtZero = unknownSound.getStart() == 0_cs;
-		bool tooShort = unknownSound.getDuration() < minSoundDuration;
-		if (startsAtZero || tooShort) {
-			noiseSounds.clear(unknownSound.getTimeRange());
-		}
-	}
-
-	return noiseSounds;
-}
-
 // Some words have multiple pronunciations, one of which results in better animation than the others.
 // This function returns the optimal pronunciation for a select set of these words.
 string fixPronunciation(const string& word) {
-	const static map<string, string> replacements {
-		{"into(2)", "into"},
-		{"to(2)", "to"},
-		{"to(3)", "to"},
-		{"today(2)", "today"},
-		{"tomorrow(2)", "tomorrow"},
-		{"tonight(2)", "tonight"}
+	const static map<string, string> replacements{
+		{ "into(2)", "into" },
+		{ "to(2)", "to" },
+		{ "to(3)", "to" },
+		{ "today(2)", "today" },
+		{ "tomorrow(2)", "tomorrow" },
+		{ "tonight(2)", "tonight" }
 	};

 	const auto pair = replacements.find(word);
 	return pair != replacements.end() ? pair->second : word;
 }

-Timeline<Phone> utteranceToPhones(
+static Timeline<Phone> utteranceToPhones(
 	const AudioClip& audioClip,
 	TimeRange utteranceTimeRange,
 	ps_decoder_t& decoder,
-	ProgressSink& utteranceProgressSink)
-{
+	ProgressSink& utteranceProgressSink
+) {
 	ProgressMerger utteranceProgressMerger(utteranceProgressSink);
 	ProgressSink& wordRecognitionProgressSink = utteranceProgressMerger.addSink(1.0);
 	ProgressSink& alignmentProgressSink = utteranceProgressMerger.addSink(0.5);

-	// Pad time range to give Pocketsphinx some breathing room
+	// Pad time range to give PocketSphinx some breathing room
 	TimeRange paddedTimeRange = utteranceTimeRange;
 	const centiseconds padding(3);
 	paddedTimeRange.grow(padding);
@@ -384,7 +254,7 @@ Timeline<Phone> utteranceToPhones(
 			continue;
 		}
 		word = regex_replace(word, regex("\\(\\d\\)"), "");
-		if (text.size() > 0) {
+		if (!text.empty()) {
 			text += " ";
 		}
 		text += word;
@@ -403,7 +273,7 @@ Timeline<Phone> utteranceToPhones(
 		const string fixedWord = fixPronunciation(timedWord.getValue());
 		wordIds.push_back(getWordId(fixedWord, *decoder.dict));
 	}
-	if (wordIds.empty()) return {};
+	if (wordIds.empty()) return{};

 	// Align the words' phones with speech
 #if BOOST_VERSION < 105600 // Support legacy syntax
@@ -433,77 +303,11 @@ Timeline<Phone> utteranceToPhones(
 	return utterancePhones;
 }

-BoundedTimeline<Phone> recognizePhones(
+BoundedTimeline<Phone> PocketSphinxRecognizer::recognizePhones(
 	const AudioClip& inputAudioClip,
-	optional<string> dialog,
+	optional<string> dialog,
 	int maxThreadCount,
-	ProgressSink& progressSink)
-{
-	ProgressMerger totalProgressMerger(progressSink);
-	ProgressSink& voiceActivationProgressSink = totalProgressMerger.addSink(1.0);
-	ProgressSink& dialogProgressSink = totalProgressMerger.addSink(15);
-
-	// Make sure audio stream has no DC offset
-	const unique_ptr<AudioClip> audioClip = inputAudioClip.clone() | removeDcOffset();
-
-	// Split audio into utterances
-	JoiningBoundedTimeline<void> utterances;
-	try {
-		utterances = detectVoiceActivity(*audioClip, maxThreadCount, voiceActivationProgressSink);
-	}
-	catch (...) {
-		std::throw_with_nested(runtime_error("Error detecting segments of speech."));
-	}
-
-	// Discard Pocketsphinx output
-	err_set_logfp(nullptr);
-
-	// Redirect Pocketsphinx output to log
-	err_set_callback(sphinxLogCallback, nullptr);
-
-	// Prepare pool of decoders
-	ObjectPool<ps_decoder_t, lambda_unique_ptr<ps_decoder_t>> decoderPool(
-		[&dialog] { return createDecoder(dialog); });
-
-	BoundedTimeline<Phone> phones(audioClip->getTruncatedRange());
-	std::mutex resultMutex;
-	auto processUtterance = [&](Timed<void> timedUtterance, ProgressSink& utteranceProgressSink) {
-		// Detect phones for utterance
-		auto decoder = decoderPool.acquire();
-		Timeline<Phone> utterancePhones =
-			utteranceToPhones(*audioClip, timedUtterance.getTimeRange(), *decoder, utteranceProgressSink);
-
-		// Copy phones to result timeline
-		std::lock_guard<std::mutex> lock(resultMutex);
-		for (const auto& timedPhone : utterancePhones) {
-			phones.set(timedPhone);
-		}
-	};
-
-	auto getUtteranceProgressWeight = [](const Timed<void> timedUtterance) {
-		return timedUtterance.getDuration().count();
-	};
-
-	// Perform speech recognition
-	try {
-		// Determine how many parallel threads to use
-		int threadCount = std::min({
-			maxThreadCount,
-			// Don't use more threads than there are utterances to be processed
-			static_cast<int>(utterances.size()),
-			// Don't waste time creating additional threads (and decoders!) if the recording is short
-			static_cast<int>(duration_cast<std::chrono::seconds>(audioClip->getTruncatedRange().getDuration()).count() / 5)
-		});
-		if (threadCount < 1) {
-			threadCount = 1;
-		}
-		logging::debugFormat("Speech recognition using {} threads -- start", threadCount);
-		runParallel(processUtterance, utterances, threadCount, dialogProgressSink, getUtteranceProgressWeight);
-		logging::debug("Speech recognition -- end");
-	}
-	catch (...) {
-		std::throw_with_nested(runtime_error("Error performing speech recognition via Pocketsphinx."));
-	}
-
-	return phones;
+	ProgressSink& progressSink
+) const {
+	return ::recognizePhones(inputAudioClip, dialog, &createDecoder, &utteranceToPhones, maxThreadCount, progressSink);
 }
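Both createDecoder variants wrap raw PocketSphinx objects in lambda_unique_ptr so that each C object travels with its matching free function (cmd_ln_free_r, ps_free, ngram_model_free). The alias itself is defined elsewhere in the project; a plausible definition, stated purely as an assumption based on how it is used in this patch:

#include <functional>
#include <memory>

// Assumed shape of the project's lambda_unique_ptr alias: a unique_ptr
// whose deleter is a std::function, so it can hold a capturing lambda.
template <typename T>
using lambda_unique_ptr = std::unique_ptr<T, std::function<void(T*)>>;

Whatever the exact definition, the effect is visible above: if createDecoder throws after ps_init succeeds, the decoder's deleter still runs, so no PocketSphinx resources leak.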
diff --git a/rhubarb/src/recognition/PocketSphinxRecognizer.h b/rhubarb/src/recognition/PocketSphinxRecognizer.h
new file mode 100644
index 0000000..dc11d2d
--- /dev/null
+++ b/rhubarb/src/recognition/PocketSphinxRecognizer.h
@@ -0,0 +1,14 @@
+#pragma once
+
+#include "Recognizer.h"
+#include "pocketSphinxTools.h"
+
+class PocketSphinxRecognizer : public Recognizer {
+public:
+	BoundedTimeline<Phone> recognizePhones(
+		const AudioClip& inputAudioClip,
+		boost::optional<std::string> dialog,
+		int maxThreadCount,
+		ProgressSink& progressSink
+	) const override;
+};
diff --git a/rhubarb/src/recognition/Recognizer.h b/rhubarb/src/recognition/Recognizer.h
new file mode 100644
index 0000000..05c445d
--- /dev/null
+++ b/rhubarb/src/recognition/Recognizer.h
@@ -0,0 +1,18 @@
+#pragma once
+
+#include "audio/AudioClip.h"
+#include "core/Phone.h"
+#include "tools/ProgressBar.h"
+#include "time/BoundedTimeline.h"
+
+class Recognizer {
+public:
+	virtual ~Recognizer() = default;
+
+	virtual BoundedTimeline<Phone> recognizePhones(
+		const AudioClip& audioClip,
+		boost::optional<std::string> dialog,
+		int maxThreadCount,
+		ProgressSink& progressSink
+	) const = 0;
+};
\ No newline at end of file
diff --git a/rhubarb/src/recognition/phoneRecognition.h b/rhubarb/src/recognition/phoneRecognition.h
deleted file mode 100644
index 2e66305..0000000
--- a/rhubarb/src/recognition/phoneRecognition.h
+++ /dev/null
@@ -1,12 +0,0 @@
-#pragma once
-
-#include "audio/AudioClip.h"
-#include "core/Phone.h"
-#include "tools/ProgressBar.h"
-#include "time/BoundedTimeline.h"
-
-BoundedTimeline<Phone> recognizePhones(
-	const AudioClip& audioClip,
-	boost::optional<std::string> dialog,
-	int maxThreadCount,
-	ProgressSink& progressSink);
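With the free recognizePhones function gone, the abstract Recognizer class is now the single extension point. A hypothetical third implementation, showing the minimal contract (illustration only, not part of this patch):

#include "recognition/Recognizer.h"

// Hypothetical: a recognizer that reports no phones at all.
class SilenceRecognizer : public Recognizer {
public:
	BoundedTimeline<Phone> recognizePhones(
		const AudioClip& audioClip,
		boost::optional<std::string> dialog, // ignored here
		int maxThreadCount,                  // ignored here
		ProgressSink& progressSink
	) const override {
		progressSink.reportProgress(1.0);
		// An empty timeline spanning the clip: no phones detected.
		return BoundedTimeline<Phone>(audioClip.getTruncatedRange());
	}
};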
diff --git a/rhubarb/src/recognition/pocketSphinxTools.cpp b/rhubarb/src/recognition/pocketSphinxTools.cpp
new file mode 100644
index 0000000..87a13ea
--- /dev/null
+++ b/rhubarb/src/recognition/pocketSphinxTools.cpp
@@ -0,0 +1,218 @@
+#include "pocketSphinxTools.h"
+
+#include "tools/platformTools.h"
+#include
+#include "audio/DcOffset.h"
+#include "audio/voiceActivityDetection.h"
+#include "tools/parallel.h"
+#include "tools/ObjectPool.h"
+#include "time/timedLogging.h"
+
+extern "C" {
+#include
+#include
+#include
+}
+
+using std::runtime_error;
+using std::invalid_argument;
+using std::unique_ptr;
+using std::string;
+using std::vector;
+using boost::filesystem::path;
+using std::regex;
+using boost::optional;
+using std::chrono::duration_cast;
+
+logging::Level convertSphinxErrorLevel(err_lvl_t errorLevel) {
+	switch (errorLevel) {
+	case ERR_DEBUG:
+	case ERR_INFO:
+	case ERR_INFOCONT:
+		return logging::Level::Trace;
+	case ERR_WARN:
+		return logging::Level::Warn;
+	case ERR_ERROR:
+		return logging::Level::Error;
+	case ERR_FATAL:
+		return logging::Level::Fatal;
+	default:
+		throw invalid_argument("Unknown log level.");
+	}
+}
+
+void sphinxLogCallback(void* user_data, err_lvl_t errorLevel, const char* format, ...) {
+	UNUSED(user_data);
+
+	// Create varArgs list
+	va_list args;
+	va_start(args, format);
+	auto _ = gsl::finally([&args]() { va_end(args); });
+
+	// Format message
+	const int initialSize = 256;
+	vector<char> chars(initialSize);
+	bool success = false;
+	while (!success) {
+		const int charsWritten = vsnprintf(chars.data(), chars.size(), format, args);
+		if (charsWritten < 0) throw runtime_error("Error formatting PocketSphinx log message.");
+
+		success = charsWritten < static_cast<int>(chars.size());
+		if (!success) chars.resize(chars.size() * 2);
+	}
+	const regex waste("^(DEBUG|INFO|INFOCONT|WARN|ERROR|FATAL): ");
+	string message = std::regex_replace(chars.data(), waste, "", std::regex_constants::format_first_only);
+	boost::algorithm::trim(message);
+
+	const logging::Level logLevel = convertSphinxErrorLevel(errorLevel);
+	logging::log(logLevel, message);
+}
+
+void redirectPocketSphinxOutput() {
+	static bool redirected = false;
+	if (redirected) return;
+
+	// Discard PocketSphinx output
+	err_set_logfp(nullptr);
+
+	// Redirect PocketSphinx output to log
+	err_set_callback(sphinxLogCallback, nullptr);
+
+	redirected = true;
+}
+
+BoundedTimeline<Phone> recognizePhones(
+	const AudioClip& inputAudioClip,
+	optional<string> dialog,
+	decoderFactory createDecoder,
+	utteranceToPhonesFunction utteranceToPhones,
+	int maxThreadCount,
+	ProgressSink& progressSink
+) {
+	ProgressMerger totalProgressMerger(progressSink);
+	ProgressSink& voiceActivationProgressSink = totalProgressMerger.addSink(1.0);
+	ProgressSink& dialogProgressSink = totalProgressMerger.addSink(15);
+
+	// Make sure audio stream has no DC offset
+	const unique_ptr<AudioClip> audioClip = inputAudioClip.clone() | removeDcOffset();
+
+	// Split audio into utterances
+	JoiningBoundedTimeline<void> utterances;
+	try {
+		utterances = detectVoiceActivity(*audioClip, maxThreadCount, voiceActivationProgressSink);
+	} catch (...) {
+		std::throw_with_nested(runtime_error("Error detecting segments of speech."));
+	}
+
+	redirectPocketSphinxOutput();
+
+	// Prepare pool of decoders
+	ObjectPool<ps_decoder_t, lambda_unique_ptr<ps_decoder_t>> decoderPool(
+		[&] { return createDecoder(dialog); });
+
+	BoundedTimeline<Phone> phones(audioClip->getTruncatedRange());
+	std::mutex resultMutex;
+	const auto processUtterance = [&](Timed<void> timedUtterance, ProgressSink& utteranceProgressSink) {
+		// Detect phones for utterance
+		const auto decoder = decoderPool.acquire();
+		Timeline<Phone> utterancePhones =
+			utteranceToPhones(*audioClip, timedUtterance.getTimeRange(), *decoder, utteranceProgressSink);
+
+		// Copy phones to result timeline
+		std::lock_guard<std::mutex> lock(resultMutex);
+		for (const auto& timedPhone : utterancePhones) {
+			phones.set(timedPhone);
+		}
+	};
+
+	const auto getUtteranceProgressWeight = [](const Timed<void> timedUtterance) {
+		return timedUtterance.getDuration().count();
+	};
+
+	// Perform speech recognition
+	try {
+		// Determine how many parallel threads to use
+		int threadCount = std::min({
+			maxThreadCount,
+			// Don't use more threads than there are utterances to be processed
+			static_cast<int>(utterances.size()),
+			// Don't waste time creating additional threads (and decoders!) if the recording is short
+			static_cast<int>(duration_cast<std::chrono::seconds>(audioClip->getTruncatedRange().getDuration()).count() / 5)
+		});
+		if (threadCount < 1) {
+			threadCount = 1;
+		}
+		logging::debugFormat("Speech recognition using {} threads -- start", threadCount);
+		runParallel(processUtterance, utterances, threadCount, dialogProgressSink, getUtteranceProgressWeight);
+		logging::debug("Speech recognition -- end");
+	} catch (...) {
+		std::throw_with_nested(runtime_error("Error performing speech recognition via PocketSphinx."));
+	}
+
+	return phones;
+}
+
+const path& getSphinxModelDirectory() {
+	static path sphinxModelDirectory(getBinDirectory() / "res" / "sphinx");
+	return sphinxModelDirectory;
+}
+
+JoiningTimeline<void> getNoiseSounds(TimeRange utteranceTimeRange, const Timeline<Phone>& phones) {
+	JoiningTimeline<void> noiseSounds;
+
+	// Find utterance parts without recognized phones
+	noiseSounds.set(utteranceTimeRange);
+	for (const auto& timedPhone : phones) {
+		noiseSounds.clear(timedPhone.getTimeRange());
+	}
+
+	// Remove undesired elements
+	const centiseconds minSoundDuration = 12_cs;
+	for (const auto& unknownSound : JoiningTimeline<void>(noiseSounds)) {
+		const bool startsAtZero = unknownSound.getStart() == 0_cs;
+		const bool tooShort = unknownSound.getDuration() < minSoundDuration;
+		if (startsAtZero || tooShort) {
+			noiseSounds.clear(unknownSound.getTimeRange());
+		}
+	}
+
+	return noiseSounds;
+}
+
+BoundedTimeline<string> recognizeWords(const vector<int16_t>& audioBuffer, ps_decoder_t& decoder) {
+	// Restart timing at 0
+	ps_start_stream(&decoder);
+
+	// Start recognition
+	int error = ps_start_utt(&decoder);
+	if (error) throw runtime_error("Error starting utterance processing for word recognition.");
+
+	// Process entire audio clip
+	const bool noRecognition = false;
+	const bool fullUtterance = true;
+	const int searchedFrameCount =
+		ps_process_raw(&decoder, audioBuffer.data(), audioBuffer.size(), noRecognition, fullUtterance);
+	if (searchedFrameCount < 0) {
+		throw runtime_error("Error analyzing raw audio data for word recognition.");
+	}
+
+	// End recognition
+	error = ps_end_utt(&decoder);
+	if (error) throw runtime_error("Error ending utterance processing for word recognition.");
+
+	BoundedTimeline<string> result(TimeRange(0_cs, centiseconds(100 * audioBuffer.size() / sphinxSampleRate)));
+	const bool noWordsRecognized = reinterpret_cast<ngram_search_t*>(decoder.search)->bpidx == 0;
+	if (noWordsRecognized) {
+		return result;
+	}
+
+	// Collect words
+	for (ps_seg_t* it = ps_seg_iter(&decoder); it; it = ps_seg_next(it)) {
+		const char* word = ps_seg_word(it);
+		int firstFrame, lastFrame;
+		ps_seg_frames(it, &firstFrame, &lastFrame);
+		result.set(centiseconds(firstFrame), centiseconds(lastFrame + 1), word);
+	}
+
+	return result;
+}
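The thread-count heuristic in the shared recognizePhones caps parallelism three ways: the caller's maxThreadCount, the number of detected utterances, and one thread per five seconds of audio. For example, a 30-second clip with 12 utterances and maxThreadCount 8 runs on min(8, 12, 30 / 5) = 6 decoder threads, while a clip shorter than five seconds yields 0, which the following check bumps back up to 1. Since every worker thread acquires its own decoder from the ObjectPool, this bound also limits how many PocketSphinx decoders get created.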
diff --git a/rhubarb/src/recognition/pocketSphinxTools.h b/rhubarb/src/recognition/pocketSphinxTools.h
new file mode 100644
index 0000000..568ccbe
--- /dev/null
+++ b/rhubarb/src/recognition/pocketSphinxTools.h
@@ -0,0 +1,39 @@
+#pragma once
+
+#include "time/BoundedTimeline.h"
+#include "core/Phone.h"
+#include "audio/AudioClip.h"
+#include "tools/ProgressBar.h"
+#include
+
+extern "C" {
+#include
+}
+
+typedef std::function<lambda_unique_ptr<ps_decoder_t>(
+	boost::optional<std::string> dialog
+)> decoderFactory;
+
+typedef std::function<Timeline<Phone>(
+	const AudioClip& audioClip,
+	TimeRange utteranceTimeRange,
+	ps_decoder_t& decoder,
+	ProgressSink& utteranceProgressSink
+)> utteranceToPhonesFunction;
+
+BoundedTimeline<Phone> recognizePhones(
+	const AudioClip& inputAudioClip,
+	boost::optional<std::string> dialog,
+	decoderFactory createDecoder,
+	utteranceToPhonesFunction utteranceToPhones,
+	int maxThreadCount,
+	ProgressSink& progressSink
+);
+
+constexpr int sphinxSampleRate = 16000;
+
+const boost::filesystem::path& getSphinxModelDirectory();
+
+JoiningTimeline<void> getNoiseSounds(TimeRange utteranceTimeRange, const Timeline<Phone>& phones);
+
+BoundedTimeline<std::string> recognizeWords(const std::vector<int16_t>& audioBuffer, ps_decoder_t& decoder);
diff --git a/rhubarb/src/rhubarb/RecognizerType.cpp b/rhubarb/src/rhubarb/RecognizerType.cpp
new file mode 100644
index 0000000..86f0837
--- /dev/null
+++ b/rhubarb/src/rhubarb/RecognizerType.cpp
@@ -0,0 +1,27 @@
+#include "RecognizerType.h"
+
+using std::string;
+
+RecognizerTypeConverter& RecognizerTypeConverter::get() {
+	static RecognizerTypeConverter converter;
+	return converter;
+}
+
+string RecognizerTypeConverter::getTypeName() {
+	return "RecognizerType";
+}
+
+EnumConverter<RecognizerType>::member_data RecognizerTypeConverter::getMemberData() {
+	return member_data{
+		{ RecognizerType::PocketSphinx, "pocketSphinx" },
+		{ RecognizerType::Phonetic, "phonetic" }
+	};
+}
+
+std::ostream& operator<<(std::ostream& stream, RecognizerType value) {
+	return RecognizerTypeConverter::get().write(stream, value);
+}
+
+std::istream& operator>>(std::istream& stream, RecognizerType& value) {
+	return RecognizerTypeConverter::get().read(stream, value);
+}
diff --git a/rhubarb/src/rhubarb/RecognizerType.h b/rhubarb/src/rhubarb/RecognizerType.h
new file mode 100644
index 0000000..6f8cf12
--- /dev/null
+++ b/rhubarb/src/rhubarb/RecognizerType.h
@@ -0,0 +1,20 @@
+#pragma once
+
+#include "tools/EnumConverter.h"
+
+enum class RecognizerType {
+	PocketSphinx,
+	Phonetic
+};
+
+class RecognizerTypeConverter : public EnumConverter<RecognizerType> {
+public:
+	static RecognizerTypeConverter& get();
+protected:
+	std::string getTypeName() override;
+	member_data getMemberData() override;
+};
+
+std::ostream& operator<<(std::ostream& stream, RecognizerType value);
+
+std::istream& operator>>(std::istream& stream, RecognizerType& value);
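The stream operators are what make the TCLAP integration in main.cpp work: a ValueLike argument is parsed with operator>>. A round-trip sketch -- it assumes EnumConverter's read and write use the member data above and fail the stream on unknown names:

#include <iostream>
#include <sstream>
#include "rhubarb/RecognizerType.h"

int main() {
	std::istringstream input("phonetic");
	RecognizerType type = RecognizerType::PocketSphinx;
	input >> type;             // type is now RecognizerType::Phonetic
	std::cout << type << "\n"; // prints "phonetic"
	return 0;
}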
diff --git a/rhubarb/src/rhubarb/main.cpp b/rhubarb/src/rhubarb/main.cpp
index 104a6e8..703dd67 100644
--- a/rhubarb/src/rhubarb/main.cpp
+++ b/rhubarb/src/rhubarb/main.cpp
@@ -27,6 +27,9 @@
 #include "tools/platformTools.h"
 #include "sinks.h"
 #include "semanticEntries.h"
+#include "RecognizerType.h"
+#include "recognition/PocketSphinxRecognizer.h"
+#include "recognition/PhoneticRecognizer.h"

 using std::exception;
 using std::string;
@@ -36,9 +39,6 @@ using std::unique_ptr;
 using std::make_unique;
 using std::shared_ptr;
 using std::make_shared;
-using std::map;
-using std::chrono::duration;
-using std::chrono::duration_cast;
 using std::ofstream;
 using boost::filesystem::path;
 using boost::adaptors::transformed;
@@ -56,6 +56,10 @@ namespace TCLAP {
 	struct ArgTraits<ExportFormat> {
 		typedef ValueLike ValueCategory;
 	};
+	template<>
+	struct ArgTraits<RecognizerType> {
+		typedef ValueLike ValueCategory;
+	};
 }

 shared_ptr<logging::Sink> createFileSink(path path, logging::Level minLevel) {
@@ -66,6 +70,17 @@ shared_ptr<logging::Sink> createFileSink(path path, logging::Level minLevel) {
 	return make_shared(FileSink, minLevel);
 }

+unique_ptr<Recognizer> createRecognizer(RecognizerType recognizerType) {
+	switch (recognizerType) {
+	case RecognizerType::PocketSphinx:
+		return make_unique<PocketSphinxRecognizer>();
+	case RecognizerType::Phonetic:
+		return make_unique<PhoneticRecognizer>();
+	default:
+		throw std::runtime_error("Unknown recognizer.");
+	}
+}
+
 unique_ptr<Exporter> createExporter(ExportFormat exportFormat) {
 	switch (exportFormat) {
 	case ExportFormat::Tsv:
@@ -123,6 +138,9 @@ int main(int platformArgc, char *platformArgv[]) {
 	auto exportFormats = vector<ExportFormat>(ExportFormatConverter::get().getValues());
 	tclap::ValuesConstraint<ExportFormat> exportFormatConstraint(exportFormats);
 	tclap::ValueArg<ExportFormat> exportFormat("f", "exportFormat", "The export format.", false, ExportFormat::Tsv, &exportFormatConstraint, cmd);
+	auto recognizerTypes = vector<RecognizerType>(RecognizerTypeConverter::get().getValues());
+	tclap::ValuesConstraint<RecognizerType> recognizerConstraint(recognizerTypes);
+	tclap::ValueArg<RecognizerType> recognizerType("r", "recognizer", "The dialog recognizer.", false, RecognizerType::PocketSphinx, &recognizerConstraint, cmd);
 	tclap::UnlabeledValueArg<string> inputFileName("inputFile", "The input file. Must be a sound file in WAVE format.", true, "", "string", cmd);

 	try {
@@ -169,6 +187,7 @@ int main(int platformArgc, char *platformArgv[]) {
 		JoiningContinuousTimeline<Shape> animation = animateWaveFile(
 			inputFilePath,
 			dialogFile.isSet() ? readUtf8File(path(dialogFile.getValue())) : boost::optional<string>(),
+			*createRecognizer(recognizerType.getValue()),
 			targetShapeSet,
 			maxThreadCount.getValue(),
 			progressSink);
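With the new option wired up, recognizer selection happens on the command line; when -r is omitted, the PocketSphinx recognizer remains the default. Example invocations -- the "tsv" value string is an assumption based on ExportFormat::Tsv being the default:

rhubarb input.wav                                   # default: pocketSphinx
rhubarb -r phonetic input.wav                       # phonetic recognizer
rhubarb --recognizer pocketSphinx -f tsv input.wav  # explicit defaults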