diff --git a/rhubarb/CMakeLists.txt b/rhubarb/CMakeLists.txt
index 9353edf..0fef2b7 100644
--- a/rhubarb/CMakeLists.txt
+++ b/rhubarb/CMakeLists.txt
@@ -413,8 +413,13 @@ add_library(rhubarb-recognition
src/recognition/g2p.h
src/recognition/languageModels.cpp
src/recognition/languageModels.h
- src/recognition/phoneRecognition.cpp
- src/recognition/phoneRecognition.h
+ src/recognition/PhoneticRecognizer.cpp
+ src/recognition/PhoneticRecognizer.h
+ src/recognition/PocketSphinxRecognizer.cpp
+ src/recognition/PocketSphinxRecognizer.h
+ src/recognition/pocketSphinxTools.cpp
+ src/recognition/pocketSphinxTools.h
+ src/recognition/Recognizer.h
src/recognition/tokenization.cpp
src/recognition/tokenization.h
)
@@ -487,6 +492,8 @@ add_executable(rhubarb
src/rhubarb/main.cpp
src/rhubarb/ExportFormat.cpp
src/rhubarb/ExportFormat.h
+ src/rhubarb/RecognizerType.cpp
+ src/rhubarb/RecognizerType.h
src/rhubarb/semanticEntries.cpp
src/rhubarb/semanticEntries.h
src/rhubarb/sinks.cpp
diff --git a/rhubarb/resharper.DotSettings b/rhubarb/resharper.DotSettings
index b16b555..168efbe 100644
--- a/rhubarb/resharper.DotSettings
+++ b/rhubarb/resharper.DotSettings
@@ -1,7 +1,12 @@
+ HINT
+
ERROR
+
DO_NOT_SHOW
+
USE_TABS_ONLY
+ USE_TABS_ONLY
False
False
False
@@ -29,6 +34,7 @@
CHOP_ALWAYS
END_OF_LINE
END_OF_LINE
+ USE_TABS_ONLY
False
END_OF_LINE
END_OF_LINE
@@ -44,6 +50,14 @@
False
True
False
+ USE_TABS_ONLY
+ USE_TABS_ONLY
+ USE_TABS_ONLY
+ USE_TABS_ONLY
+ USE_TABS_ONLY
+ USE_TABS_ONLY
+ USE_TABS_ONLY
+ USE_TABS_ONLY
UseExplicitType
UseVarWhenEvident
<NamingElement Priority="10"><Descriptor Static="Indeterminate" Constexpr="Indeterminate" Const="Indeterminate" Volatile="Indeterminate" Accessibility="NOT_APPLICABLE"><type Name="class field" /><type Name="struct field" /></Descriptor><Policy Inspect="True" Prefix="" Suffix="_" Style="aaBb" /></NamingElement>
@@ -108,7 +122,16 @@
C:\Users\Daniel\AppData\Local\JetBrains\Transient\ReSharperPlatformVs14\v09\SolutionCaches
True
True
+ True
+ True
True
True
True
+ True
+ True
+ True
+ True
+ True
+ True
+ True
\ No newline at end of file
diff --git a/rhubarb/src/lib/rhubarbLib.cpp b/rhubarb/src/lib/rhubarbLib.cpp
index ffadf68..5f8460f 100644
--- a/rhubarb/src/lib/rhubarbLib.cpp
+++ b/rhubarb/src/lib/rhubarbLib.cpp
@@ -1,6 +1,5 @@
#include "rhubarbLib.h"
#include "core/Phone.h"
-#include "recognition/phoneRecognition.h"
#include "tools/textFiles.h"
#include "animation/mouthAnimation.h"
#include "audio/audioFileReading.h"
@@ -8,27 +7,29 @@
using boost::optional;
using std::string;
using boost::filesystem::path;
-using std::unique_ptr;
JoiningContinuousTimeline<Shape> animateAudioClip(
 const AudioClip& audioClip,
- optional<string> dialog,
+ const optional<string>& dialog,
+ const Recognizer& recognizer,
const ShapeSet& targetShapeSet,
int maxThreadCount,
ProgressSink& progressSink)
{
- BoundedTimeline<Phone> phones = recognizePhones(audioClip, dialog, maxThreadCount, progressSink);
+ const BoundedTimeline<Phone> phones =
+ recognizer.recognizePhones(audioClip, dialog, maxThreadCount, progressSink);
 JoiningContinuousTimeline<Shape> result = animate(phones, targetShapeSet);
return result;
}
JoiningContinuousTimeline<Shape> animateWaveFile(
 path filePath,
- optional<string> dialog,
+ const optional<string>& dialog,
+ const Recognizer& recognizer,
const ShapeSet& targetShapeSet,
int maxThreadCount,
ProgressSink& progressSink)
{
const auto audioClip = createAudioFileClip(filePath);
- return animateAudioClip(*audioClip, dialog, targetShapeSet, maxThreadCount, progressSink);
+ return animateAudioClip(*audioClip, dialog, recognizer, targetShapeSet, maxThreadCount, progressSink);
}
diff --git a/rhubarb/src/lib/rhubarbLib.h b/rhubarb/src/lib/rhubarbLib.h
index 8663761..ca40a06 100644
--- a/rhubarb/src/lib/rhubarbLib.h
+++ b/rhubarb/src/lib/rhubarbLib.h
@@ -6,17 +6,20 @@
#include "tools/ProgressBar.h"
#include <boost/filesystem.hpp>
#include "animation/targetShapeSet.h"
+#include "recognition/Recognizer.h"
JoiningContinuousTimeline<Shape> animateAudioClip(
 const AudioClip& audioClip,
- boost::optional<std::string> dialog,
+ const boost::optional<std::string>& dialog,
+ const Recognizer& recognizer,
const ShapeSet& targetShapeSet,
int maxThreadCount,
ProgressSink& progressSink);
JoiningContinuousTimeline<Shape> animateWaveFile(
 boost::filesystem::path filePath,
- boost::optional<std::string> dialog,
+ const boost::optional<std::string>& dialog,
+ const Recognizer& recognizer,
const ShapeSet& targetShapeSet,
int maxThreadCount,
ProgressSink& progressSink);
diff --git a/rhubarb/src/recognition/PhoneticRecognizer.cpp b/rhubarb/src/recognition/PhoneticRecognizer.cpp
new file mode 100644
index 0000000..bd9c9ac
--- /dev/null
+++ b/rhubarb/src/recognition/PhoneticRecognizer.cpp
@@ -0,0 +1,103 @@
+#include "PhoneticRecognizer.h"
+#include "time/Timeline.h"
+#include "audio/AudioSegment.h"
+#include "audio/SampleRateConverter.h"
+#include "audio/processing.h"
+#include "time/timedLogging.h"
+
+using std::runtime_error;
+using std::unique_ptr;
+using std::string;
+using boost::optional;
+
+static lambda_unique_ptr<ps_decoder_t> createDecoder(optional<std::string> dialog) {
+ UNUSED(dialog);
+
+ lambda_unique_ptr<cmd_ln_t> config(
+ cmd_ln_init(
+ nullptr, ps_args(), true,
+ // Set acoustic model
+ "-hmm", (getSphinxModelDirectory() / "acoustic-model").string().c_str(),
+ // Set phonetic language model
+ "-allphone", (getSphinxModelDirectory() / "en-us-phone.lm.bin").string().c_str(),
+ "-allphone_ci", "yes",
+ // Set language model probability weight.
+ // Low values (<= 0.4) can lead to fluttering animation.
+ // High values (>= 1.0) can lead to imprecise or freezing animation.
+ "-lw", "0.8",
+
+ // The following settings are recommended at http://cmusphinx.sourceforge.net/wiki/phonemerecognition
+
+ // Set beam width applied to every frame in Viterbi search
+ "-beam", "1e-20",
+ // Set beam width applied to phone transitions
+ "-pbeam", "1e-20",
+ nullptr),
+ [](cmd_ln_t* config) { cmd_ln_free_r(config); });
+ if (!config) throw runtime_error("Error creating configuration.");
+
+ lambda_unique_ptr<ps_decoder_t> decoder(
+ ps_init(config.get()),
+ [](ps_decoder_t* recognizer) { ps_free(recognizer); });
+ if (!decoder) throw runtime_error("Error creating speech decoder.");
+
+ return decoder;
+}
+
+static Timeline<Phone> utteranceToPhones(
+ const AudioClip& audioClip,
+ TimeRange utteranceTimeRange,
+ ps_decoder_t& decoder,
+ ProgressSink& utteranceProgressSink
+) {
+ // Pad time range to give PocketSphinx some breathing room
+ TimeRange paddedTimeRange = utteranceTimeRange;
+ const centiseconds padding(3);
+ paddedTimeRange.grow(padding);
+ paddedTimeRange.trim(audioClip.getTruncatedRange());
+
+ const unique_ptr<AudioClip> clipSegment = audioClip.clone() | segment(paddedTimeRange) | resample(sphinxSampleRate);
+ const auto audioBuffer = copyTo16bitBuffer(*clipSegment);
+
+ // Detect phones (returned as words)
+ BoundedTimeline<string> phoneStrings = recognizeWords(audioBuffer, decoder);
+ phoneStrings.shift(paddedTimeRange.getStart());
+ Timeline<Phone> utterancePhones;
+ for (const auto& timedPhoneString : phoneStrings) {
+ Phone phone = PhoneConverter::get().parse(timedPhoneString.getValue());
+ if (phone == Phone::AH && timedPhoneString.getDuration() < 6_cs) {
+ // Heuristic: < 6_cs is schwa. PocketSphinx doesn't differentiate.
+ phone = Phone::Schwa;
+ }
+ utterancePhones.set(timedPhoneString.getTimeRange(), phone);
+ }
+
+ // Log raw phones
+ for (const auto& timedPhone : utterancePhones) {
+ logTimedEvent("rawPhone", timedPhone);
+ }
+
+ // Guess positions of noise sounds
+ JoiningTimeline<void> noiseSounds = getNoiseSounds(utteranceTimeRange, utterancePhones);
+ for (const auto& noiseSound : noiseSounds) {
+ utterancePhones.set(noiseSound.getTimeRange(), Phone::Noise);
+ }
+
+ // Log phones
+ for (const auto& timedPhone : utterancePhones) {
+ logTimedEvent("phone", timedPhone);
+ }
+
+ utteranceProgressSink.reportProgress(1.0);
+
+ return utterancePhones;
+}
+
+BoundedTimeline<Phone> PhoneticRecognizer::recognizePhones(
+ const AudioClip& inputAudioClip,
+ optional<std::string> dialog,
+ int maxThreadCount,
+ ProgressSink& progressSink
+) const {
+ return ::recognizePhones(inputAudioClip, dialog, &createDecoder, &utteranceToPhones, maxThreadCount, progressSink);
+}
diff --git a/rhubarb/src/recognition/PhoneticRecognizer.h b/rhubarb/src/recognition/PhoneticRecognizer.h
new file mode 100644
index 0000000..96797cf
--- /dev/null
+++ b/rhubarb/src/recognition/PhoneticRecognizer.h
@@ -0,0 +1,14 @@
+#pragma once
+
+#include "Recognizer.h"
+#include "pocketSphinxTools.h"
+
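+// Recognizes phones in a single pass, without an intermediate word-recognition
+// step: the decoder is configured with a phonetic language model ("-allphone"),
+// so PocketSphinx returns phones in place of words.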
+class PhoneticRecognizer : public Recognizer {
+public:
+ BoundedTimeline<Phone> recognizePhones(
+ const AudioClip& inputAudioClip,
+ boost::optional<std::string> dialog,
+ int maxThreadCount,
+ ProgressSink& progressSink
+ ) const override;
+};
diff --git a/rhubarb/src/recognition/phoneRecognition.cpp b/rhubarb/src/recognition/PocketSphinxRecognizer.cpp
similarity index 55%
rename from rhubarb/src/recognition/phoneRecognition.cpp
rename to rhubarb/src/recognition/PocketSphinxRecognizer.cpp
index e45c765..b97c0b7 100644
--- a/rhubarb/src/recognition/phoneRecognition.cpp
+++ b/rhubarb/src/recognition/PocketSphinxRecognizer.cpp
@@ -1,143 +1,133 @@
-#include <boost/filesystem.hpp>
-#include "phoneRecognition.h"
-#include "audio/SampleRateConverter.h"
-#include "tools/platformTools.h"
-#include "tools/tools.h"
-#include <format.h>
-#include <s3types.h>
+#include "PocketSphinxRecognizer.h"
#include <regex>
#include <gsl_util.h>
-#include "logging/logging.h"
-#include "audio/DcOffset.h"
-#include "time/Timeline.h"
-#include "audio/voiceActivityDetection.h"
#include "audio/AudioSegment.h"
+#include "audio/SampleRateConverter.h"
#include "languageModels.h"
#include "tokenization.h"
#include "g2p.h"
#include "time/ContinuousTimeline.h"
#include "audio/processing.h"
-#include "tools/parallel.h"
-#include <boost/version.hpp>
-#include "tools/ObjectPool.h"
#include "time/timedLogging.h"
extern "C" {
-#include <pocketsphinx.h>
-#include <sphinxbase/err.h>
-#include <ps_alignment.h>
#include <state_align_search.h>
-#include <pocketsphinx_internal.h>
-#include <ngram_search.h>
}
using std::runtime_error;
using std::invalid_argument;
using std::unique_ptr;
-using std::shared_ptr;
using std::string;
using std::vector;
using std::map;
using boost::filesystem::path;
-using std::function;
using std::regex;
using std::regex_replace;
-using std::chrono::duration;
using boost::optional;
-using std::string;
-using std::chrono::duration_cast;
using std::array;
-constexpr int sphinxSampleRate = 16000;
-
-const path& getSphinxModelDirectory() {
- static path sphinxModelDirectory(getBinDirectory() / "res" / "sphinx");
- return sphinxModelDirectory;
+bool dictionaryContains(dict_t& dictionary, const string& word) {
+ return dict_wordid(&dictionary, word.c_str()) != BAD_S3WID;
}
-logging::Level ConvertSphinxErrorLevel(err_lvl_t errorLevel) {
- switch (errorLevel) {
- case ERR_DEBUG:
- case ERR_INFO:
- case ERR_INFOCONT:
- return logging::Level::Trace;
- case ERR_WARN:
- return logging::Level::Warn;
- case ERR_ERROR:
- return logging::Level::Error;
- case ERR_FATAL:
- return logging::Level::Fatal;
- default:
- throw invalid_argument("Unknown log level.");
+s3wid_t getWordId(const string& word, dict_t& dictionary) {
+ const s3wid_t wordId = dict_wordid(&dictionary, word.c_str());
+ if (wordId == BAD_S3WID) throw invalid_argument(fmt::format("Unknown word '{}'.", word));
+ return wordId;
+}
+
+void addMissingDictionaryWords(const vector<string>& words, ps_decoder_t& decoder) {
+ map<string, string> missingPronunciations;
+ for (const string& word : words) {
+ if (!dictionaryContains(*decoder.dict, word)) {
+ string pronunciation;
+ for (Phone phone : wordToPhones(word)) {
+ if (pronunciation.length() > 0) pronunciation += " ";
+ pronunciation += PhoneConverter::get().toString(phone);
+ }
+ missingPronunciations[word] = pronunciation;
+ }
+ }
+ for (auto it = missingPronunciations.begin(); it != missingPronunciations.end(); ++it) {
+ const bool isLast = it == --missingPronunciations.end();
+ logging::infoFormat("Unknown word '{}'. Guessing pronunciation '{}'.", it->first, it->second);
+ ps_add_word(&decoder, it->first.c_str(), it->second.c_str(), isLast);
}
}
-void sphinxLogCallback(void* user_data, err_lvl_t errorLevel, const char* format, ...) {
- UNUSED(user_data);
-
- // Create varArgs list
- va_list args;
- va_start(args, format);
- auto _ = gsl::finally([&args]() { va_end(args); });
-
- // Format message
- const int initialSize = 256;
- vector<char> chars(initialSize);
- bool success = false;
- while (!success) {
- int charsWritten = vsnprintf(chars.data(), chars.size(), format, args);
- if (charsWritten < 0) throw runtime_error("Error formatting Pocketsphinx log message.");
-
- success = charsWritten < static_cast<int>(chars.size());
- if (!success) chars.resize(chars.size() * 2);
- }
- regex waste("^(DEBUG|INFO|INFOCONT|WARN|ERROR|FATAL): ");
- string message = regex_replace(chars.data(), waste, "", std::regex_constants::format_first_only);
- boost::algorithm::trim(message);
-
- logging::Level logLevel = ConvertSphinxErrorLevel(errorLevel);
- logging::log(logLevel, message);
-}
-
-BoundedTimeline<string> recognizeWords(const vector<int16_t>& audioBuffer, ps_decoder_t& decoder) {
- // Restart timing at 0
- ps_start_stream(&decoder);
-
- // Start recognition
- int error = ps_start_utt(&decoder);
- if (error) throw runtime_error("Error starting utterance processing for word recognition.");
-
- // Process entire audio clip
- const bool noRecognition = false;
- const bool fullUtterance = true;
- int searchedFrameCount = ps_process_raw(&decoder, audioBuffer.data(), audioBuffer.size(), noRecognition, fullUtterance);
- if (searchedFrameCount < 0) throw runtime_error("Error analyzing raw audio data for word recognition.");
-
- // End recognition
- error = ps_end_utt(&decoder);
- if (error) throw runtime_error("Error ending utterance processing for word recognition.");
-
- BoundedTimeline<string> result(TimeRange(0_cs, centiseconds(100 * audioBuffer.size() / sphinxSampleRate)));
- bool noWordsRecognized = reinterpret_cast<ngram_search_t*>(decoder.search)->bpidx == 0;
- if (noWordsRecognized) {
- return result;
- }
-
- // Collect words
- for (ps_seg_t* it = ps_seg_iter(&decoder); it; it = ps_seg_next(it)) {
- const char* word = ps_seg_word(it);
- int firstFrame, lastFrame;
- ps_seg_frames(it, &firstFrame, &lastFrame);
- result.set(centiseconds(firstFrame), centiseconds(lastFrame + 1), word);
+lambda_unique_ptr<ngram_model_t> createDefaultLanguageModel(ps_decoder_t& decoder) {
+ path modelPath = getSphinxModelDirectory() / "en-us.lm.bin";
+ lambda_unique_ptr<ngram_model_t> result(
+ ngram_model_read(decoder.config, modelPath.string().c_str(), NGRAM_AUTO, decoder.lmath),
+ [](ngram_model_t* lm) { ngram_model_free(lm); });
+ if (!result) {
+ throw runtime_error(fmt::format("Error reading language model from {}.", modelPath));
+ }
+ return result;
}
-s3wid_t getWordId(const string& word, dict_t& dictionary) {
- s3wid_t wordId = dict_wordid(&dictionary, word.c_str());
- if (wordId == BAD_S3WID) throw invalid_argument(fmt::format("Unknown word '{}'.", word));
- return wordId;
+lambda_unique_ptr<ngram_model_t> createDialogLanguageModel(ps_decoder_t& decoder, const string& dialog) {
+ // Split dialog into normalized words
+ vector<string> words = tokenizeText(dialog, [&](const string& word) { return dictionaryContains(*decoder.dict, word); });
+
+ // Add dialog-specific words to the dictionary
+ addMissingDictionaryWords(words, decoder);
+
+ // Create dialog-specific language model
+ words.insert(words.begin(), "<s>");
+ words.emplace_back("</s>");
+ return createLanguageModel(words, decoder);
+}
+
+lambda_unique_ptr<ngram_model_t> createBiasedLanguageModel(ps_decoder_t& decoder, const string& dialog) {
+ auto defaultLanguageModel = createDefaultLanguageModel(decoder);
+ auto dialogLanguageModel = createDialogLanguageModel(decoder, dialog);
+ constexpr int modelCount = 2;
+ array<ngram_model_t*, modelCount> languageModels{ defaultLanguageModel.get(), dialogLanguageModel.get() };
+ array<const char*, modelCount> modelNames{ "defaultLM", "dialogLM" };
+ array<float32, modelCount> modelWeights{ 0.1f, 0.9f };
+ lambda_unique_ptr<ngram_model_t> result(
+ ngram_model_set_init(nullptr, languageModels.data(), const_cast<char**>(modelNames.data()), modelWeights.data(), modelCount),
+ [](ngram_model_t* lm) { ngram_model_free(lm); });
+ if (!result) {
+ throw runtime_error("Error creating biased language model.");
+ }
+
+ return result;
+}
+
+static lambda_unique_ptr<ps_decoder_t> createDecoder(optional<std::string> dialog) {
+ lambda_unique_ptr<cmd_ln_t> config(
+ cmd_ln_init(
+ nullptr, ps_args(), true,
+ // Set acoustic model
+ "-hmm", (getSphinxModelDirectory() / "acoustic-model").string().c_str(),
+ // Set pronunciation dictionary
+ "-dict", (getSphinxModelDirectory() / "cmudict-en-us.dict").string().c_str(),
+ // Add noise against zero silence (see http://cmusphinx.sourceforge.net/wiki/faq#qwhy_my_accuracy_is_poor)
+ "-dither", "yes",
+ // Disable VAD -- we're doing that ourselves
+ "-remove_silence", "no",
+ // Perform per-utterance cepstral mean normalization
+ "-cmn", "batch",
+ nullptr),
+ [](cmd_ln_t* config) { cmd_ln_free_r(config); });
+ if (!config) throw runtime_error("Error creating configuration.");
+
+ lambda_unique_ptr<ps_decoder_t> decoder(
+ ps_init(config.get()),
+ [](ps_decoder_t* recognizer) { ps_free(recognizer); });
+ if (!decoder) throw runtime_error("Error creating speech decoder.");
+
+ // Set language model
+ lambda_unique_ptr<ngram_model_t> languageModel(dialog
+ ? createBiasedLanguageModel(*decoder, *dialog)
+ : createDefaultLanguageModel(*decoder));
+ ps_set_lm(decoder.get(), "lm", languageModel.get());
+ ps_set_search(decoder.get(), "lm");
+
+ return decoder;
}
optional<Timeline<Phone>> getPhoneAlignment(
@@ -178,7 +168,7 @@ optional<Timeline<Phone>> getPhoneAlignment(
// Process entire audio clip
const int16* nextSample = audioBuffer.data();
size_t remainingSamples = audioBuffer.size();
- bool fullUtterance = true;
+ const bool fullUtterance = true;
while (acmod_process_raw(acousticModel, &nextSample, &remainingSamples, fullUtterance) > 0) {
while (acousticModel->n_feat_frame > 0) {
ps_search_step(search.get(), acousticModel->output_frame);
@@ -197,7 +187,7 @@ optional<Timeline<Phone>> getPhoneAlignment(
for (ps_alignment_iter_t* it = ps_alignment_phones(alignment.get()); it; it = ps_alignment_iter_next(it)) {
// Get phone
ps_alignment_entry_t* phoneEntry = ps_alignment_iter_get(it);
- s3cipid_t phoneId = phoneEntry->id.pid.cipid;
+ const s3cipid_t phoneId = phoneEntry->id.pid.cipid;
string phoneName = phoneNames[phoneId];
if (phoneName == "SIL") continue;
@@ -207,162 +197,42 @@ optional> getPhoneAlignment(
centiseconds duration(phoneEntry->duration);
Phone phone = PhoneConverter::get().parse(phoneName);
if (phone == Phone::AH && duration < 6_cs) {
- // Heuristic: < 6_cs is schwa. Pocketsphinx doesn't differentiate.
+ // Heuristic: < 6_cs is schwa. PocketSphinx doesn't differentiate.
phone = Phone::Schwa;
}
- Timed<Phone> timedPhone(start, start + duration, phone);
+ const Timed<Phone> timedPhone(start, start + duration, phone);
result.set(timedPhone);
}
return result;
}
-bool dictionaryContains(dict_t& dictionary, const string& word) {
- return dict_wordid(&dictionary, word.c_str()) != BAD_S3WID;
-}
-
-void addMissingDictionaryWords(const vector<string>& words, ps_decoder_t& decoder) {
- map<string, string> missingPronunciations;
- for (const string& word : words) {
- if (!dictionaryContains(*decoder.dict, word)) {
- string pronunciation;
- for (Phone phone : wordToPhones(word)) {
- if (pronunciation.length() > 0) pronunciation += " ";
- pronunciation += PhoneConverter::get().toString(phone);
- }
- missingPronunciations[word] = pronunciation;
- }
- }
- for (auto it = missingPronunciations.begin(); it != missingPronunciations.end(); ++it) {
- bool isLast = it == --missingPronunciations.end();
- logging::infoFormat("Unknown word '{}'. Guessing pronunciation '{}'.", it->first, it->second);
- ps_add_word(&decoder, it->first.c_str(), it->second.c_str(), isLast);
- }
-}
-
-lambda_unique_ptr<ngram_model_t> createDefaultLanguageModel(ps_decoder_t& decoder) {
- path modelPath = getSphinxModelDirectory() / "en-us.lm.bin";
- lambda_unique_ptr<ngram_model_t> result(
- ngram_model_read(decoder.config, modelPath.string().c_str(), NGRAM_AUTO, decoder.lmath),
- [](ngram_model_t* lm) { ngram_model_free(lm); });
- if (!result) {
- throw runtime_error(fmt::format("Error reading language model from {}.", modelPath));
- }
-
- return std::move(result);
-}
-
-lambda_unique_ptr<ngram_model_t> createDialogLanguageModel(ps_decoder_t& decoder, const string& dialog) {
- // Split dialog into normalized words
- vector<string> words = tokenizeText(dialog, [&](const string& word) { return dictionaryContains(*decoder.dict, word); });
-
- // Add dialog-specific words to the dictionary
- addMissingDictionaryWords(words, decoder);
-
- // Create dialog-specific language model
- words.insert(words.begin(), "<s>");
- words.push_back("</s>");
- return createLanguageModel(words, decoder);
-}
-
-lambda_unique_ptr<ngram_model_t> createBiasedLanguageModel(ps_decoder_t& decoder, const string& dialog) {
- auto defaultLanguageModel = createDefaultLanguageModel(decoder);
- auto dialogLanguageModel = createDialogLanguageModel(decoder, dialog);
- constexpr int modelCount = 2;
- array<ngram_model_t*, modelCount> languageModels{ defaultLanguageModel.get(), dialogLanguageModel.get() };
- array<char*, modelCount> modelNames{ "defaultLM", "dialogLM" };
- array<float32, modelCount> modelWeights{ 0.1f, 0.9f };
- lambda_unique_ptr<ngram_model_t> result(
- ngram_model_set_init(nullptr, languageModels.data(), modelNames.data(), modelWeights.data(), modelCount),
- [](ngram_model_t* lm) { ngram_model_free(lm); });
- if (!result) {
- throw runtime_error("Error creating biased language model.");
- }
-
- return std::move(result);
-}
-
-lambda_unique_ptr<ps_decoder_t> createDecoder(optional<string> dialog) {
- lambda_unique_ptr<cmd_ln_t> config(
- cmd_ln_init(
- nullptr, ps_args(), true,
- // Set acoustic model
- "-hmm", (getSphinxModelDirectory() / "acoustic-model").string().c_str(),
- // Set pronunciation dictionary
- "-dict", (getSphinxModelDirectory() / "cmudict-en-us.dict").string().c_str(),
- // Add noise against zero silence (see http://cmusphinx.sourceforge.net/wiki/faq#qwhy_my_accuracy_is_poor)
- "-dither", "yes",
- // Disable VAD -- we're doing that ourselves
- "-remove_silence", "no",
- // Perform per-utterance cepstral mean normalization
- "-cmn", "batch",
- nullptr),
- [](cmd_ln_t* config) { cmd_ln_free_r(config); });
- if (!config) throw runtime_error("Error creating configuration.");
-
- lambda_unique_ptr<ps_decoder_t> decoder(
- ps_init(config.get()),
- [](ps_decoder_t* recognizer) { ps_free(recognizer); });
- if (!decoder) throw runtime_error("Error creating speech decoder.");
-
- // Set language model
- lambda_unique_ptr<ngram_model_t> languageModel(dialog
- ? createBiasedLanguageModel(*decoder, *dialog)
- : createDefaultLanguageModel(*decoder));
- ps_set_lm(decoder.get(), "lm", languageModel.get());
- ps_set_search(decoder.get(), "lm");
-
- return decoder;
-}
-
-JoiningTimeline<void> getNoiseSounds(TimeRange utteranceTimeRange, const Timeline<Phone>& phones) {
- JoiningTimeline<void> noiseSounds;
-
- // Find utterance parts without recogniced phones
- noiseSounds.set(utteranceTimeRange);
- for (const auto& timedPhone : phones) {
- noiseSounds.clear(timedPhone.getTimeRange());
- }
-
- // Remove undesired elements
- const centiseconds minSoundDuration = 12_cs;
- for (const auto& unknownSound : JoiningTimeline<void>(noiseSounds)) {
- bool startsAtZero = unknownSound.getStart() == 0_cs;
- bool tooShort = unknownSound.getDuration() < minSoundDuration;
- if (startsAtZero || tooShort) {
- noiseSounds.clear(unknownSound.getTimeRange());
- }
- }
-
- return noiseSounds;
-}
-
// Some words have multiple pronunciations, one of which results in better animation than the others.
// This function returns the optimal pronunciation for a select set of these words.
string fixPronunciation(const string& word) {
- const static map<string, string> replacements {
- {"into(2)", "into"},
- {"to(2)", "to"},
- {"to(3)", "to"},
- {"today(2)", "today"},
- {"tomorrow(2)", "tomorrow"},
- {"tonight(2)", "tonight"}
+ const static map<string, string> replacements{
+ { "into(2)", "into" },
+ { "to(2)", "to" },
+ { "to(3)", "to" },
+ { "today(2)", "today" },
+ { "tomorrow(2)", "tomorrow" },
+ { "tonight(2)", "tonight" }
};
const auto pair = replacements.find(word);
return pair != replacements.end() ? pair->second : word;
}
-Timeline<Phone> utteranceToPhones(
+static Timeline<Phone> utteranceToPhones(
const AudioClip& audioClip,
TimeRange utteranceTimeRange,
ps_decoder_t& decoder,
- ProgressSink& utteranceProgressSink)
-{
+ ProgressSink& utteranceProgressSink
+) {
ProgressMerger utteranceProgressMerger(utteranceProgressSink);
ProgressSink& wordRecognitionProgressSink = utteranceProgressMerger.addSink(1.0);
ProgressSink& alignmentProgressSink = utteranceProgressMerger.addSink(0.5);
- // Pad time range to give Pocketsphinx some breathing room
+ // Pad time range to give PocketSphinx some breathing room
TimeRange paddedTimeRange = utteranceTimeRange;
const centiseconds padding(3);
paddedTimeRange.grow(padding);
@@ -384,7 +254,7 @@ Timeline<Phone> utteranceToPhones(
continue;
}
word = regex_replace(word, regex("\\(\\d\\)"), "");
- if (text.size() > 0) {
+ if (!text.empty()) {
text += " ";
}
text += word;
@@ -403,7 +273,7 @@ Timeline<Phone> utteranceToPhones(
const string fixedWord = fixPronunciation(timedWord.getValue());
wordIds.push_back(getWordId(fixedWord, *decoder.dict));
}
- if (wordIds.empty()) return {};
+ if (wordIds.empty()) return{};
// Align the words' phones with speech
#if BOOST_VERSION < 105600 // Support legacy syntax
@@ -433,77 +303,11 @@ Timeline<Phone> utteranceToPhones(
return utterancePhones;
}
-BoundedTimeline<Phone> recognizePhones(
+BoundedTimeline<Phone> PocketSphinxRecognizer::recognizePhones(
 const AudioClip& inputAudioClip,
- optional<string> dialog,
+ optional<std::string> dialog,
int maxThreadCount,
- ProgressSink& progressSink)
-{
- ProgressMerger totalProgressMerger(progressSink);
- ProgressSink& voiceActivationProgressSink = totalProgressMerger.addSink(1.0);
- ProgressSink& dialogProgressSink = totalProgressMerger.addSink(15);
-
- // Make sure audio stream has no DC offset
- const unique_ptr<AudioClip> audioClip = inputAudioClip.clone() | removeDcOffset();
-
- // Split audio into utterances
- JoiningBoundedTimeline<void> utterances;
- try {
- utterances = detectVoiceActivity(*audioClip, maxThreadCount, voiceActivationProgressSink);
- }
- catch (...) {
- std::throw_with_nested(runtime_error("Error detecting segments of speech."));
- }
-
- // Discard Pocketsphinx output
- err_set_logfp(nullptr);
-
- // Redirect Pocketsphinx output to log
- err_set_callback(sphinxLogCallback, nullptr);
-
- // Prepare pool of decoders
- ObjectPool<ps_decoder_t, lambda_unique_ptr<ps_decoder_t>> decoderPool(
- [&dialog] { return createDecoder(dialog); });
-
- BoundedTimeline<Phone> phones(audioClip->getTruncatedRange());
- std::mutex resultMutex;
- auto processUtterance = [&](Timed<void> timedUtterance, ProgressSink& utteranceProgressSink) {
- // Detect phones for utterance
- auto decoder = decoderPool.acquire();
- Timeline<Phone> utterancePhones =
- utteranceToPhones(*audioClip, timedUtterance.getTimeRange(), *decoder, utteranceProgressSink);
-
- // Copy phones to result timeline
- std::lock_guard<std::mutex> lock(resultMutex);
- for (const auto& timedPhone : utterancePhones) {
- phones.set(timedPhone);
- }
- };
-
- auto getUtteranceProgressWeight = [](const Timed<void> timedUtterance) {
- return timedUtterance.getDuration().count();
- };
-
- // Perform speech recognition
- try {
- // Determine how many parallel threads to use
- int threadCount = std::min({
- maxThreadCount,
- // Don't use more threads than there are utterances to be processed
- static_cast<int>(utterances.size()),
- // Don't waste time creating additional threads (and decoders!) if the recording is short
- static_cast<int>(duration_cast<std::chrono::seconds>(audioClip->getTruncatedRange().getDuration()).count() / 5)
- });
- if (threadCount < 1) {
- threadCount = 1;
- }
- logging::debugFormat("Speech recognition using {} threads -- start", threadCount);
- runParallel(processUtterance, utterances, threadCount, dialogProgressSink, getUtteranceProgressWeight);
- logging::debug("Speech recognition -- end");
- }
- catch (...) {
- std::throw_with_nested(runtime_error("Error performing speech recognition via Pocketsphinx."));
- }
-
- return phones;
+ ProgressSink& progressSink
+) const {
+ return ::recognizePhones(inputAudioClip, dialog, &createDecoder, &utteranceToPhones, maxThreadCount, progressSink);
}
diff --git a/rhubarb/src/recognition/PocketSphinxRecognizer.h b/rhubarb/src/recognition/PocketSphinxRecognizer.h
new file mode 100644
index 0000000..dc11d2d
--- /dev/null
+++ b/rhubarb/src/recognition/PocketSphinxRecognizer.h
@@ -0,0 +1,14 @@
+#pragma once
+
+#include "Recognizer.h"
+#include "pocketSphinxTools.h"
+
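+// Recognizes phones in two passes: it first recognizes words using a language
+// model (biased towards the expected dialog if one is given), then aligns each
+// word's phones with the audio.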
+class PocketSphinxRecognizer : public Recognizer {
+public:
+ BoundedTimeline<Phone> recognizePhones(
+ const AudioClip& inputAudioClip,
+ boost::optional<std::string> dialog,
+ int maxThreadCount,
+ ProgressSink& progressSink
+ ) const override;
+};
diff --git a/rhubarb/src/recognition/Recognizer.h b/rhubarb/src/recognition/Recognizer.h
new file mode 100644
index 0000000..05c445d
--- /dev/null
+++ b/rhubarb/src/recognition/Recognizer.h
@@ -0,0 +1,18 @@
+#pragma once
+
+#include "audio/AudioClip.h"
+#include "core/Phone.h"
+#include "tools/ProgressBar.h"
+#include "time/BoundedTimeline.h"
+
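+// Strategy interface for phone recognition: implementations turn an audio clip
+// plus optional dialog text into a timeline of phones. Callers choose an
+// implementation and pass it by reference; a minimal usage sketch
+// (cf. animateWaveFile in rhubarbLib.h):
+//
+//   PocketSphinxRecognizer recognizer;
+//   animateWaveFile(filePath, dialog, recognizer, targetShapeSet, maxThreadCount, progressSink);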
+class Recognizer {
+public:
+ virtual ~Recognizer() = default;
+
+ virtual BoundedTimeline<Phone> recognizePhones(
+ const AudioClip& audioClip,
+ boost::optional<std::string> dialog,
+ int maxThreadCount,
+ ProgressSink& progressSink
+ ) const = 0;
+};
\ No newline at end of file
diff --git a/rhubarb/src/recognition/phoneRecognition.h b/rhubarb/src/recognition/phoneRecognition.h
deleted file mode 100644
index 2e66305..0000000
--- a/rhubarb/src/recognition/phoneRecognition.h
+++ /dev/null
@@ -1,12 +0,0 @@
-#pragma once
-
-#include "audio/AudioClip.h"
-#include "core/Phone.h"
-#include "tools/ProgressBar.h"
-#include "time/BoundedTimeline.h"
-
-BoundedTimeline<Phone> recognizePhones(
- const AudioClip& audioClip,
- boost::optional<std::string> dialog,
- int maxThreadCount,
- ProgressSink& progressSink);
diff --git a/rhubarb/src/recognition/pocketSphinxTools.cpp b/rhubarb/src/recognition/pocketSphinxTools.cpp
new file mode 100644
index 0000000..87a13ea
--- /dev/null
+++ b/rhubarb/src/recognition/pocketSphinxTools.cpp
@@ -0,0 +1,218 @@
+#include "pocketSphinxTools.h"
+
+#include "tools/platformTools.h"
+#include <regex>
+#include "audio/DcOffset.h"
+#include "audio/voiceActivityDetection.h"
+#include "tools/parallel.h"
+#include "tools/ObjectPool.h"
+#include "time/timedLogging.h"
+
+extern "C" {
+#include <sphinxbase/err.h>
+#include <pocketsphinx_internal.h>
+#include <ngram_search.h>
+}
+
+using std::runtime_error;
+using std::invalid_argument;
+using std::unique_ptr;
+using std::string;
+using std::vector;
+using boost::filesystem::path;
+using std::regex;
+using boost::optional;
+using std::chrono::duration_cast;
+
+logging::Level convertSphinxErrorLevel(err_lvl_t errorLevel) {
+ switch (errorLevel) {
+ case ERR_DEBUG:
+ case ERR_INFO:
+ case ERR_INFOCONT:
+ return logging::Level::Trace;
+ case ERR_WARN:
+ return logging::Level::Warn;
+ case ERR_ERROR:
+ return logging::Level::Error;
+ case ERR_FATAL:
+ return logging::Level::Fatal;
+ default:
+ throw invalid_argument("Unknown log level.");
+ }
+}
+
+void sphinxLogCallback(void* user_data, err_lvl_t errorLevel, const char* format, ...) {
+ UNUSED(user_data);
+
+ // Create varArgs list
+ va_list args;
+ va_start(args, format);
+ auto _ = gsl::finally([&args]() { va_end(args); });
+
+ // Format message
+ const int initialSize = 256;
+ vector<char> chars(initialSize);
+ bool success = false;
+ while (!success) {
+ const int charsWritten = vsnprintf(chars.data(), chars.size(), format, args);
+ if (charsWritten < 0) throw runtime_error("Error formatting PocketSphinx log message.");
+
+ success = charsWritten < static_cast<int>(chars.size());
+ if (!success) chars.resize(chars.size() * 2);
+ }
+ const regex waste("^(DEBUG|INFO|INFOCONT|WARN|ERROR|FATAL): ");
+ string message = std::regex_replace(chars.data(), waste, "", std::regex_constants::format_first_only);
+ boost::algorithm::trim(message);
+
+ const logging::Level logLevel = convertSphinxErrorLevel(errorLevel);
+ logging::log(logLevel, message);
+}
+
+void redirectPocketSphinxOutput() {
+ static bool redirected = false;
+ if (redirected) return;
+
+ // Discard PocketSphinx output
+ err_set_logfp(nullptr);
+
+ // Redirect PocketSphinx output to log
+ err_set_callback(sphinxLogCallback, nullptr);
+
+ redirected = true;
+}
+
+BoundedTimeline<Phone> recognizePhones(
+ const AudioClip& inputAudioClip,
+ optional<std::string> dialog,
+ decoderFactory createDecoder,
+ utteranceToPhonesFunction utteranceToPhones,
+ int maxThreadCount,
+ ProgressSink& progressSink
+) {
+ ProgressMerger totalProgressMerger(progressSink);
+ ProgressSink& voiceActivationProgressSink = totalProgressMerger.addSink(1.0);
+ ProgressSink& dialogProgressSink = totalProgressMerger.addSink(15);
+
+ // Make sure audio stream has no DC offset
+ const unique_ptr<AudioClip> audioClip = inputAudioClip.clone() | removeDcOffset();
+
+ // Split audio into utterances
+ JoiningBoundedTimeline<void> utterances;
+ try {
+ utterances = detectVoiceActivity(*audioClip, maxThreadCount, voiceActivationProgressSink);
+ } catch (...) {
+ std::throw_with_nested(runtime_error("Error detecting segments of speech."));
+ }
+
+ redirectPocketSphinxOutput();
+
+ // Prepare pool of decoders
+ ObjectPool<ps_decoder_t, lambda_unique_ptr<ps_decoder_t>> decoderPool(
+ [&] { return createDecoder(dialog); });
+
+ BoundedTimeline<Phone> phones(audioClip->getTruncatedRange());
+ std::mutex resultMutex;
+ const auto processUtterance = [&](Timed<void> timedUtterance, ProgressSink& utteranceProgressSink) {
+ // Detect phones for utterance
+ const auto decoder = decoderPool.acquire();
+ Timeline<Phone> utterancePhones =
+ utteranceToPhones(*audioClip, timedUtterance.getTimeRange(), *decoder, utteranceProgressSink);
+
+ // Copy phones to result timeline
+ std::lock_guard<std::mutex> lock(resultMutex);
+ for (const auto& timedPhone : utterancePhones) {
+ phones.set(timedPhone);
+ }
+ };
+
+ const auto getUtteranceProgressWeight = [](const Timed<void> timedUtterance) {
+ return timedUtterance.getDuration().count();
+ };
+
+ // Perform speech recognition
+ try {
+ // Determine how many parallel threads to use
+ int threadCount = std::min({
+ maxThreadCount,
+ // Don't use more threads than there are utterances to be processed
+ static_cast<int>(utterances.size()),
+ // Don't waste time creating additional threads (and decoders!) if the recording is short
+ static_cast<int>(duration_cast<std::chrono::seconds>(audioClip->getTruncatedRange().getDuration()).count() / 5)
+ });
+ if (threadCount < 1) {
+ threadCount = 1;
+ }
+ logging::debugFormat("Speech recognition using {} threads -- start", threadCount);
+ runParallel(processUtterance, utterances, threadCount, dialogProgressSink, getUtteranceProgressWeight);
+ logging::debug("Speech recognition -- end");
+ } catch (...) {
+ std::throw_with_nested(runtime_error("Error performing speech recognition via PocketSphinx."));
+ }
+
+ return phones;
+}
+
+const path& getSphinxModelDirectory() {
+ static path sphinxModelDirectory(getBinDirectory() / "res" / "sphinx");
+ return sphinxModelDirectory;
+}
+
+JoiningTimeline<void> getNoiseSounds(TimeRange utteranceTimeRange, const Timeline<Phone>& phones) {
+ JoiningTimeline<void> noiseSounds;
+
+ // Find utterance parts without recognized phones
+ noiseSounds.set(utteranceTimeRange);
+ for (const auto& timedPhone : phones) {
+ noiseSounds.clear(timedPhone.getTimeRange());
+ }
+
+ // Remove undesired elements
+ const centiseconds minSoundDuration = 12_cs;
+ for (const auto& unknownSound : JoiningTimeline<void>(noiseSounds)) {
+ const bool startsAtZero = unknownSound.getStart() == 0_cs;
+ const bool tooShort = unknownSound.getDuration() < minSoundDuration;
+ if (startsAtZero || tooShort) {
+ noiseSounds.clear(unknownSound.getTimeRange());
+ }
+ }
+
+ return noiseSounds;
+}
+
+BoundedTimeline<string> recognizeWords(const vector<int16_t>& audioBuffer, ps_decoder_t& decoder) {
+ // Restart timing at 0
+ ps_start_stream(&decoder);
+
+ // Start recognition
+ int error = ps_start_utt(&decoder);
+ if (error) throw runtime_error("Error starting utterance processing for word recognition.");
+
+ // Process entire audio clip
+ const bool noRecognition = false;
+ const bool fullUtterance = true;
+ const int searchedFrameCount =
+ ps_process_raw(&decoder, audioBuffer.data(), audioBuffer.size(), noRecognition, fullUtterance);
+ if (searchedFrameCount < 0) {
+ throw runtime_error("Error analyzing raw audio data for word recognition.");
+ }
+
+ // End recognition
+ error = ps_end_utt(&decoder);
+ if (error) throw runtime_error("Error ending utterance processing for word recognition.");
+
+ BoundedTimeline<string> result(TimeRange(0_cs, centiseconds(100 * audioBuffer.size() / sphinxSampleRate)));
+ const bool noWordsRecognized = reinterpret_cast<ngram_search_t*>(decoder.search)->bpidx == 0;
+ if (noWordsRecognized) {
+ return result;
+ }
+
+ // Collect words
+ for (ps_seg_t* it = ps_seg_iter(&decoder); it; it = ps_seg_next(it)) {
+ const char* word = ps_seg_word(it);
+ int firstFrame, lastFrame;
+ ps_seg_frames(it, &firstFrame, &lastFrame);
+ result.set(centiseconds(firstFrame), centiseconds(lastFrame + 1), word);
+ }
+
+ return result;
+}
diff --git a/rhubarb/src/recognition/pocketSphinxTools.h b/rhubarb/src/recognition/pocketSphinxTools.h
new file mode 100644
index 0000000..568ccbe
--- /dev/null
+++ b/rhubarb/src/recognition/pocketSphinxTools.h
@@ -0,0 +1,39 @@
+#pragma once
+
+#include "time/BoundedTimeline.h"
+#include "core/Phone.h"
+#include "audio/AudioClip.h"
+#include "tools/ProgressBar.h"
+#include <boost/filesystem.hpp>
+
+extern "C" {
+#include <pocketsphinx.h>
+}
+
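+// Both PocketSphinx-based recognizers share the recognizePhones() driver below,
+// which handles DC-offset removal, voice activity detection, decoder pooling,
+// and multithreading. A concrete recognizer only supplies two callbacks: a
+// factory that creates a configured decoder, and a function that converts a
+// single utterance to phones.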
+typedef std::function<lambda_unique_ptr<ps_decoder_t>(
+ boost::optional<std::string> dialog
+)> decoderFactory;
+
+typedef std::function<Timeline<Phone>(
+ const AudioClip& audioClip,
+ TimeRange utteranceTimeRange,
+ ps_decoder_t& decoder,
+ ProgressSink& utteranceProgressSink
+)> utteranceToPhonesFunction;
+
+BoundedTimeline<Phone> recognizePhones(
+ const AudioClip& inputAudioClip,
+ boost::optional<std::string> dialog,
+ decoderFactory createDecoder,
+ utteranceToPhonesFunction utteranceToPhones,
+ int maxThreadCount,
+ ProgressSink& progressSink
+);
+
+constexpr int sphinxSampleRate = 16000;
+
+const boost::filesystem::path& getSphinxModelDirectory();
+
+JoiningTimeline<void> getNoiseSounds(TimeRange utteranceTimeRange, const Timeline<Phone>& phones);
+
+BoundedTimeline<std::string> recognizeWords(const std::vector<int16_t>& audioBuffer, ps_decoder_t& decoder);
diff --git a/rhubarb/src/rhubarb/RecognizerType.cpp b/rhubarb/src/rhubarb/RecognizerType.cpp
new file mode 100644
index 0000000..86f0837
--- /dev/null
+++ b/rhubarb/src/rhubarb/RecognizerType.cpp
@@ -0,0 +1,27 @@
+#include "RecognizerType.h"
+
+using std::string;
+
+RecognizerTypeConverter& RecognizerTypeConverter::get() {
+ static RecognizerTypeConverter converter;
+ return converter;
+}
+
+string RecognizerTypeConverter::getTypeName() {
+ return "RecognizerType";
+}
+
+EnumConverter<RecognizerType>::member_data RecognizerTypeConverter::getMemberData() {
+ return member_data{
+ { RecognizerType::PocketSphinx, "pocketSphinx" },
+ { RecognizerType::Phonetic, "phonetic" }
+ };
+}
+
+std::ostream& operator<<(std::ostream& stream, RecognizerType value) {
+ return RecognizerTypeConverter::get().write(stream, value);
+}
+
+std::istream& operator>>(std::istream& stream, RecognizerType& value) {
+ return RecognizerTypeConverter::get().read(stream, value);
+}
diff --git a/rhubarb/src/rhubarb/RecognizerType.h b/rhubarb/src/rhubarb/RecognizerType.h
new file mode 100644
index 0000000..6f8cf12
--- /dev/null
+++ b/rhubarb/src/rhubarb/RecognizerType.h
@@ -0,0 +1,20 @@
+#pragma once
+
+#include "tools/EnumConverter.h"
+
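+// The recognizers selectable via the command line (see the -r/--recognizer
+// option in main.cpp).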
+enum class RecognizerType {
+ PocketSphinx,
+ Phonetic
+};
+
+class RecognizerTypeConverter : public EnumConverter<RecognizerType> {
+public:
+ static RecognizerTypeConverter& get();
+protected:
+ std::string getTypeName() override;
+ member_data getMemberData() override;
+};
+
+std::ostream& operator<<(std::ostream& stream, RecognizerType value);
+
+std::istream& operator>>(std::istream& stream, RecognizerType& value);
diff --git a/rhubarb/src/rhubarb/main.cpp b/rhubarb/src/rhubarb/main.cpp
index 104a6e8..703dd67 100644
--- a/rhubarb/src/rhubarb/main.cpp
+++ b/rhubarb/src/rhubarb/main.cpp
@@ -27,6 +27,9 @@
#include "tools/platformTools.h"
#include "sinks.h"
#include "semanticEntries.h"
+#include "RecognizerType.h"
+#include "recognition/PocketSphinxRecognizer.h"
+#include "recognition/PhoneticRecognizer.h"
using std::exception;
using std::string;
@@ -36,9 +39,6 @@ using std::unique_ptr;
using std::make_unique;
using std::shared_ptr;
using std::make_shared;
-using std::map;
-using std::chrono::duration;
-using std::chrono::duration_cast;
using std::ofstream;
using boost::filesystem::path;
using boost::adaptors::transformed;
@@ -56,6 +56,10 @@ namespace TCLAP {
struct ArgTraits<ExportFormat> {
typedef ValueLike ValueCategory;
};
+ template<>
+ struct ArgTraits<RecognizerType> {
+ typedef ValueLike ValueCategory;
+ };
}
shared_ptr<logging::Sink> createFileSink(path path, logging::Level minLevel) {
@@ -66,6 +70,17 @@ shared_ptr<logging::Sink> createFileSink(path path, logging::Level minLevel) {
 return make_shared<logging::LevelFilter>(FileSink, minLevel);
}
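+// Maps the user-selected recognizer type to a concrete Recognizer implementation.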
+unique_ptr<Recognizer> createRecognizer(RecognizerType recognizerType) {
+ switch (recognizerType) {
+ case RecognizerType::PocketSphinx:
+ return make_unique<PocketSphinxRecognizer>();
+ case RecognizerType::Phonetic:
+ return make_unique<PhoneticRecognizer>();
+ default:
+ throw std::runtime_error("Unknown recognizer.");
+ }
+}
+
unique_ptr<Exporter> createExporter(ExportFormat exportFormat) {
switch (exportFormat) {
case ExportFormat::Tsv:
@@ -123,6 +138,9 @@ int main(int platformArgc, char *platformArgv[]) {
auto exportFormats = vector<ExportFormat>(ExportFormatConverter::get().getValues());
tclap::ValuesConstraint<ExportFormat> exportFormatConstraint(exportFormats);
tclap::ValueArg<ExportFormat> exportFormat("f", "exportFormat", "The export format.", false, ExportFormat::Tsv, &exportFormatConstraint, cmd);
+ auto recognizerTypes = vector<RecognizerType>(RecognizerTypeConverter::get().getValues());
+ tclap::ValuesConstraint<RecognizerType> recognizerConstraint(recognizerTypes);
+ tclap::ValueArg<RecognizerType> recognizerType("r", "recognizer", "The dialog recognizer.", false, RecognizerType::PocketSphinx, &recognizerConstraint, cmd);
tclap::UnlabeledValueArg<string> inputFileName("inputFile", "The input file. Must be a sound file in WAVE format.", true, "", "string", cmd);
try {
@@ -169,6 +187,7 @@ int main(int platformArgc, char *platformArgv[]) {
JoiningContinuousTimeline<Shape> animation = animateWaveFile(
 inputFilePath,
 dialogFile.isSet() ? readUtf8File(path(dialogFile.getValue())) : boost::optional<string>(),
+ *createRecognizer(recognizerType.getValue()),
targetShapeSet,
maxThreadCount.getValue(),
progressSink);