Implement a generic recognizer concept with the options pocketSphinx and phonetic

Daniel Wolf 2018-10-08 20:30:45 +02:00
parent 3ed38ada2f
commit 610f490046
15 changed files with 635 additions and 337 deletions

View File

@@ -413,8 +413,13 @@ add_library(rhubarb-recognition
    src/recognition/g2p.h
    src/recognition/languageModels.cpp
    src/recognition/languageModels.h
-    src/recognition/phoneRecognition.cpp
-    src/recognition/phoneRecognition.h
+    src/recognition/PhoneticRecognizer.cpp
+    src/recognition/PhoneticRecognizer.h
+    src/recognition/PocketSphinxRecognizer.cpp
+    src/recognition/PocketSphinxRecognizer.h
+    src/recognition/pocketSphinxTools.cpp
+    src/recognition/pocketSphinxTools.h
+    src/recognition/Recognizer.h
    src/recognition/tokenization.cpp
    src/recognition/tokenization.h
)
@@ -487,6 +492,8 @@ add_executable(rhubarb
    src/rhubarb/main.cpp
    src/rhubarb/ExportFormat.cpp
    src/rhubarb/ExportFormat.h
+    src/rhubarb/RecognizerType.cpp
+    src/rhubarb/RecognizerType.h
    src/rhubarb/semanticEntries.cpp
    src/rhubarb/semanticEntries.h
    src/rhubarb/sinks.cpp

View File

@@ -1,7 +1,12 @@
<wpf:ResourceDictionary xml:space="preserve" xmlns:x="http://schemas.microsoft.com/winfx/2006/xaml" xmlns:s="clr-namespace:System;assembly=mscorlib" xmlns:ss="urn:shemas-jetbrains-com:settings-storage-xaml" xmlns:wpf="http://schemas.microsoft.com/winfx/2006/xaml/presentation">
+    <s:String x:Key="/Default/CodeInspection/Highlighting/InspectionSeverities/=CppClangTidyModernizeRawStringLiteral/@EntryIndexedValue">HINT</s:String>
    <s:String x:Key="/Default/CodeInspection/Highlighting/InspectionSeverities/=CppFunctionDoesntReturnValue/@EntryIndexedValue">ERROR</s:String>
    <s:String x:Key="/Default/CodeInspection/Highlighting/InspectionSeverities/=LocalizableElement/@EntryIndexedValue">DO_NOT_SHOW</s:String>
    <s:String x:Key="/Default/CodeStyle/CodeFormatting/CommonFormatter/ALIGNMENT_TAB_FILL_STYLE/@EntryValue">USE_TABS_ONLY</s:String>
+    <s:String x:Key="/Default/CodeStyle/CodeFormatting/CppFormatting/ALIGNMENT_TAB_FILL_STYLE/@EntryValue">USE_TABS_ONLY</s:String>
    <s:Boolean x:Key="/Default/CodeStyle/CodeFormatting/CppFormatting/ALIGN_MULTILINE_ARGUMENT/@EntryValue">False</s:Boolean>
    <s:Boolean x:Key="/Default/CodeStyle/CodeFormatting/CppFormatting/ALIGN_MULTILINE_CALLS_CHAIN/@EntryValue">False</s:Boolean>
    <s:Boolean x:Key="/Default/CodeStyle/CodeFormatting/CppFormatting/ALIGN_MULTILINE_CTOR_INIT/@EntryValue">False</s:Boolean>
@@ -29,6 +34,7 @@
    <s:String x:Key="/Default/CodeStyle/CodeFormatting/CppFormatting/WRAP_ENUMERATION_STYLE/@EntryValue">CHOP_ALWAYS</s:String>
    <s:String x:Key="/Default/CodeStyle/CodeFormatting/CSharpFormat/ACCESSOR_DECLARATION_BRACES/@EntryValue">END_OF_LINE</s:String>
    <s:String x:Key="/Default/CodeStyle/CodeFormatting/CSharpFormat/ACCESSOR_OWNER_DECLARATION_BRACES/@EntryValue">END_OF_LINE</s:String>
+    <s:String x:Key="/Default/CodeStyle/CodeFormatting/CSharpFormat/ALIGNMENT_TAB_FILL_STYLE/@EntryValue">USE_TABS_ONLY</s:String>
    <s:Boolean x:Key="/Default/CodeStyle/CodeFormatting/CSharpFormat/ALIGN_MULTILINE_BINARY_EXPRESSIONS_CHAIN/@EntryValue">False</s:Boolean>
    <s:String x:Key="/Default/CodeStyle/CodeFormatting/CSharpFormat/ANONYMOUS_METHOD_DECLARATION_BRACES/@EntryValue">END_OF_LINE</s:String>
    <s:String x:Key="/Default/CodeStyle/CodeFormatting/CSharpFormat/CASE_BLOCK_BRACES/@EntryValue">END_OF_LINE</s:String>
@@ -44,6 +50,14 @@
    <s:Boolean x:Key="/Default/CodeStyle/CodeFormatting/CSharpFormat/USE_INDENT_FROM_VS/@EntryValue">False</s:Boolean>
    <s:Boolean x:Key="/Default/CodeStyle/CodeFormatting/CSharpFormat/WRAP_BEFORE_BINARY_OPSIGN/@EntryValue">True</s:Boolean>
    <s:Boolean x:Key="/Default/CodeStyle/CodeFormatting/CSharpFormat/WRAP_LINES/@EntryValue">False</s:Boolean>
+    <s:String x:Key="/Default/CodeStyle/CodeFormatting/CssFormatter/ALIGNMENT_TAB_FILL_STYLE/@EntryValue">USE_TABS_ONLY</s:String>
+    <s:String x:Key="/Default/CodeStyle/CodeFormatting/HtmlFormatter/ALIGNMENT_TAB_FILL_STYLE/@EntryValue">USE_TABS_ONLY</s:String>
+    <s:String x:Key="/Default/CodeStyle/CodeFormatting/JavaScriptCodeFormatting/ALIGNMENT_TAB_FILL_STYLE/@EntryValue">USE_TABS_ONLY</s:String>
+    <s:String x:Key="/Default/CodeStyle/CodeFormatting/ProtobufCodeFormatting/ALIGNMENT_TAB_FILL_STYLE/@EntryValue">USE_TABS_ONLY</s:String>
+    <s:String x:Key="/Default/CodeStyle/CodeFormatting/ResxFormatter/ALIGNMENT_TAB_FILL_STYLE/@EntryValue">USE_TABS_ONLY</s:String>
+    <s:String x:Key="/Default/CodeStyle/CodeFormatting/VBFormat/ALIGNMENT_TAB_FILL_STYLE/@EntryValue">USE_TABS_ONLY</s:String>
+    <s:String x:Key="/Default/CodeStyle/CodeFormatting/XmlDocFormatter/ALIGNMENT_TAB_FILL_STYLE/@EntryValue">USE_TABS_ONLY</s:String>
+    <s:String x:Key="/Default/CodeStyle/CodeFormatting/XmlFormatter/ALIGNMENT_TAB_FILL_STYLE/@EntryValue">USE_TABS_ONLY</s:String>
    <s:String x:Key="/Default/CodeStyle/CSharpVarKeywordUsage/ForBuiltInTypes/@EntryValue">UseExplicitType</s:String>
    <s:String x:Key="/Default/CodeStyle/CSharpVarKeywordUsage/ForSimpleTypes/@EntryValue">UseVarWhenEvident</s:String>
    <s:String x:Key="/Default/CodeStyle/Naming/CppNaming/Rules/=Class_0020and_0020struct_0020fields/@EntryIndexedValue">&lt;NamingElement Priority="10"&gt;&lt;Descriptor Static="Indeterminate" Constexpr="Indeterminate" Const="Indeterminate" Volatile="Indeterminate" Accessibility="NOT_APPLICABLE"&gt;&lt;type Name="class field" /&gt;&lt;type Name="struct field" /&gt;&lt;/Descriptor&gt;&lt;Policy Inspect="True" Prefix="" Suffix="_" Style="aaBb" /&gt;&lt;/NamingElement&gt;</s:String>
@@ -108,7 +122,16 @@
    <s:String x:Key="/Default/Environment/Hierarchy/PsiConfigurationSettingsKey/CustomLocation/@EntryValue">C:\Users\Daniel\AppData\Local\JetBrains\Transient\ReSharperPlatformVs14\v09\SolutionCaches</s:String>
    <s:Boolean x:Key="/Default/Environment/SettingsMigration/IsMigratorApplied/=JetBrains_002EReSharper_002EFeature_002EServices_002ECpp_002ECodeStyle_002ESettingsUpgrade_002EFunctionReturnStyleSettingsUpgrader/@EntryIndexedValue">True</s:Boolean>
    <s:Boolean x:Key="/Default/Environment/SettingsMigration/IsMigratorApplied/=JetBrains_002EReSharper_002EFeature_002EServices_002ECpp_002ECodeStyle_002ESettingsUpgrade_002ENamespaceIndentationSettingsUpgrader/@EntryIndexedValue">True</s:Boolean>
+    <s:Boolean x:Key="/Default/Environment/SettingsMigration/IsMigratorApplied/=JetBrains_002EReSharper_002EPsi_002ECSharp_002ECodeStyle_002ECSharpKeepExistingMigration/@EntryIndexedValue">True</s:Boolean>
+    <s:Boolean x:Key="/Default/Environment/SettingsMigration/IsMigratorApplied/=JetBrains_002EReSharper_002EPsi_002ECSharp_002ECodeStyle_002ECSharpPlaceEmbeddedOnSameLineMigration/@EntryIndexedValue">True</s:Boolean>
    <s:Boolean x:Key="/Default/Environment/SettingsMigration/IsMigratorApplied/=JetBrains_002EReSharper_002EPsi_002ECSharp_002ECodeStyle_002ESettingsUpgrade_002EAddAccessorOwnerDeclarationBracesMigration/@EntryIndexedValue">True</s:Boolean>
    <s:Boolean x:Key="/Default/Environment/SettingsMigration/IsMigratorApplied/=JetBrains_002EReSharper_002EPsi_002ECSharp_002ECodeStyle_002ESettingsUpgrade_002EMigrateBlankLinesAroundFieldToBlankLinesAroundProperty/@EntryIndexedValue">True</s:Boolean>
    <s:Boolean x:Key="/Default/Environment/SettingsMigration/IsMigratorApplied/=JetBrains_002EReSharper_002EPsi_002ECSharp_002ECodeStyle_002ESettingsUpgrade_002EMigrateThisQualifierSettings/@EntryIndexedValue">True</s:Boolean>
+    <s:Boolean x:Key="/Default/Environment/SettingsMigration/IsMigratorApplied/=JetBrains_002EReSharper_002EPsi_002EFormat_002ESettingsUpgrade_002EAlignmentTabFillStyleMigration/@EntryIndexedValue">True</s:Boolean>
+    <s:Boolean x:Key="/Default/UserDictionary/Words/=allphone/@EntryIndexedValue">True</s:Boolean>
+    <s:Boolean x:Key="/Default/UserDictionary/Words/=cepstral/@EntryIndexedValue">True</s:Boolean>
+    <s:Boolean x:Key="/Default/UserDictionary/Words/=cmudict/@EntryIndexedValue">True</s:Boolean>
+    <s:Boolean x:Key="/Default/UserDictionary/Words/=pbeam/@EntryIndexedValue">True</s:Boolean>
+    <s:Boolean x:Key="/Default/UserDictionary/Words/=qwhy/@EntryIndexedValue">True</s:Boolean>
+    <s:Boolean x:Key="/Default/UserDictionary/Words/=Viterbi/@EntryIndexedValue">True</s:Boolean>
</wpf:ResourceDictionary>

View File

@@ -1,6 +1,5 @@
#include "rhubarbLib.h"
#include "core/Phone.h"
-#include "recognition/phoneRecognition.h"
#include "tools/textFiles.h"
#include "animation/mouthAnimation.h"
#include "audio/audioFileReading.h"
@@ -8,27 +7,29 @@
using boost::optional;
using std::string;
using boost::filesystem::path;
-using std::unique_ptr;

JoiningContinuousTimeline<Shape> animateAudioClip(
    const AudioClip& audioClip,
-    optional<string> dialog,
+    const optional<string>& dialog,
+    const Recognizer& recognizer,
    const ShapeSet& targetShapeSet,
    int maxThreadCount,
    ProgressSink& progressSink)
{
-    BoundedTimeline<Phone> phones = recognizePhones(audioClip, dialog, maxThreadCount, progressSink);
+    const BoundedTimeline<Phone> phones =
+        recognizer.recognizePhones(audioClip, dialog, maxThreadCount, progressSink);
    JoiningContinuousTimeline<Shape> result = animate(phones, targetShapeSet);
    return result;
}

JoiningContinuousTimeline<Shape> animateWaveFile(
    path filePath,
-    optional<string> dialog,
+    const optional<string>& dialog,
+    const Recognizer& recognizer,
    const ShapeSet& targetShapeSet,
    int maxThreadCount,
    ProgressSink& progressSink)
{
    const auto audioClip = createAudioFileClip(filePath);
-    return animateAudioClip(*audioClip, dialog, targetShapeSet, maxThreadCount, progressSink);
+    return animateAudioClip(*audioClip, dialog, recognizer, targetShapeSet, maxThreadCount, progressSink);
}

View File

@@ -6,17 +6,20 @@
#include "tools/ProgressBar.h"
#include <boost/filesystem.hpp>
#include "animation/targetShapeSet.h"
+#include "recognition/Recognizer.h"

JoiningContinuousTimeline<Shape> animateAudioClip(
    const AudioClip& audioClip,
-    boost::optional<std::string> dialog,
+    const boost::optional<std::string>& dialog,
+    const Recognizer& recognizer,
    const ShapeSet& targetShapeSet,
    int maxThreadCount,
    ProgressSink& progressSink);

JoiningContinuousTimeline<Shape> animateWaveFile(
    boost::filesystem::path filePath,
-    boost::optional<std::string> dialog,
+    const boost::optional<std::string>& dialog,
+    const Recognizer& recognizer,
    const ShapeSet& targetShapeSet,
    int maxThreadCount,
    ProgressSink& progressSink);
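
Both entry points now take the recognizer by reference, so the caller decides which engine runs. A minimal sketch of a call site under these signatures; getTargetShapeSet() and consoleProgressSink are hypothetical stand-ins for a real ShapeSet and ProgressSink (main.cpp below shows the actual wiring):

    PocketSphinxRecognizer recognizer;
    const JoiningContinuousTimeline<Shape> animation = animateWaveFile(
        "dialog.wav",
        boost::optional<std::string>("Hello world"),
        recognizer,
        getTargetShapeSet(),  // hypothetical: obtain a ShapeSet from configuration
        4,                    // maxThreadCount
        consoleProgressSink); // hypothetical: any ProgressSink implementation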

View File

@ -0,0 +1,103 @@
#include "PhoneticRecognizer.h"
#include "time/Timeline.h"
#include "audio/AudioSegment.h"
#include "audio/SampleRateConverter.h"
#include "audio/processing.h"
#include "time/timedLogging.h"

using std::runtime_error;
using std::unique_ptr;
using std::string;
using boost::optional;

static lambda_unique_ptr<ps_decoder_t> createDecoder(optional<std::string> dialog) {
    UNUSED(dialog);

    lambda_unique_ptr<cmd_ln_t> config(
        cmd_ln_init(
            nullptr, ps_args(), true,
            // Set acoustic model
            "-hmm", (getSphinxModelDirectory() / "acoustic-model").string().c_str(),
            // Set phonetic language model
            "-allphone", (getSphinxModelDirectory() / "en-us-phone.lm.bin").string().c_str(),
            "-allphone_ci", "yes",
            // Set language model probability weight.
            // Low values (<= 0.4) can lead to fluttering animation.
            // High values (>= 1.0) can lead to imprecise or freezing animation.
            "-lw", "0.8",
            // The following settings are recommended at http://cmusphinx.sourceforge.net/wiki/phonemerecognition
            // Set beam width applied to every frame in Viterbi search
            "-beam", "1e-20",
            // Set beam width applied to phone transitions
            "-pbeam", "1e-20",
            nullptr),
        [](cmd_ln_t* config) { cmd_ln_free_r(config); });
    if (!config) throw runtime_error("Error creating configuration.");

    lambda_unique_ptr<ps_decoder_t> decoder(
        ps_init(config.get()),
        [](ps_decoder_t* recognizer) { ps_free(recognizer); });
    if (!decoder) throw runtime_error("Error creating speech decoder.");

    return decoder;
}

static Timeline<Phone> utteranceToPhones(
    const AudioClip& audioClip,
    TimeRange utteranceTimeRange,
    ps_decoder_t& decoder,
    ProgressSink& utteranceProgressSink
) {
    // Pad time range to give PocketSphinx some breathing room
    TimeRange paddedTimeRange = utteranceTimeRange;
    const centiseconds padding(3);
    paddedTimeRange.grow(padding);
    paddedTimeRange.trim(audioClip.getTruncatedRange());

    const unique_ptr<AudioClip> clipSegment = audioClip.clone() | segment(paddedTimeRange) | resample(sphinxSampleRate);
    const auto audioBuffer = copyTo16bitBuffer(*clipSegment);

    // Detect phones (returned as words)
    BoundedTimeline<string> phoneStrings = recognizeWords(audioBuffer, decoder);
    phoneStrings.shift(paddedTimeRange.getStart());
    Timeline<Phone> utterancePhones;
    for (const auto& timedPhoneString : phoneStrings) {
        Phone phone = PhoneConverter::get().parse(timedPhoneString.getValue());
        if (phone == Phone::AH && timedPhoneString.getDuration() < 6_cs) {
            // Heuristic: < 6_cs is schwa. PocketSphinx doesn't differentiate.
            phone = Phone::Schwa;
        }
        utterancePhones.set(timedPhoneString.getTimeRange(), phone);
    }

    // Log raw phones
    for (const auto& timedPhone : utterancePhones) {
        logTimedEvent("rawPhone", timedPhone);
    }

    // Guess positions of noise sounds
    JoiningTimeline<void> noiseSounds = getNoiseSounds(utteranceTimeRange, utterancePhones);
    for (const auto& noiseSound : noiseSounds) {
        utterancePhones.set(noiseSound.getTimeRange(), Phone::Noise);
    }

    // Log phones
    for (const auto& timedPhone : utterancePhones) {
        logTimedEvent("phone", timedPhone);
    }

    utteranceProgressSink.reportProgress(1.0);

    return utterancePhones;
}

BoundedTimeline<Phone> PhoneticRecognizer::recognizePhones(
    const AudioClip& inputAudioClip,
    optional<std::string> dialog,
    int maxThreadCount,
    ProgressSink& progressSink
) const {
    return ::recognizePhones(inputAudioClip, dialog, &createDecoder, &utteranceToPhones, maxThreadCount, progressSink);
}
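
Used directly, this recognizer turns an audio clip into a bounded timeline of timed phones. A minimal sketch, assuming an AudioClip and a ProgressSink are already in scope (both are placeholders here):

    const PhoneticRecognizer recognizer;
    const BoundedTimeline<Phone> phones =
        recognizer.recognizePhones(audioClip, boost::none, /* maxThreadCount */ 1, progressSink);
    for (const auto& timedPhone : phones) {
        // Each entry pairs a Phone value with its time range:
        // timedPhone.getValue(), timedPhone.getTimeRange()
    }

Note that the dialog argument is ignored by this recognizer (see UNUSED(dialog) above), so boost::none is a natural choice.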

View File

@ -0,0 +1,14 @@
#pragma once

#include "Recognizer.h"
#include "pocketSphinxTools.h"

class PhoneticRecognizer : public Recognizer {
public:
    BoundedTimeline<Phone> recognizePhones(
        const AudioClip& inputAudioClip,
        boost::optional<std::string> dialog,
        int maxThreadCount,
        ProgressSink& progressSink
    ) const override;
};

View File

@@ -1,145 +1,135 @@
-#include <boost/filesystem.hpp>
-#include "phoneRecognition.h"
-#include "audio/SampleRateConverter.h"
-#include "tools/platformTools.h"
-#include "tools/tools.h"
-#include <format.h>
-#include <s3types.h>
+#include "PocketSphinxRecognizer.h"
#include <regex>
#include <gsl_util.h>
-#include "logging/logging.h"
-#include "audio/DcOffset.h"
-#include "time/Timeline.h"
-#include "audio/voiceActivityDetection.h"
#include "audio/AudioSegment.h"
+#include "audio/SampleRateConverter.h"
#include "languageModels.h"
#include "tokenization.h"
#include "g2p.h"
#include "time/ContinuousTimeline.h"
#include "audio/processing.h"
-#include "tools/parallel.h"
-#include <boost/version.hpp>
-#include "tools/ObjectPool.h"
#include "time/timedLogging.h"

extern "C" {
-#include <pocketsphinx.h>
-#include <sphinxbase/err.h>
-#include <ps_alignment.h>
#include <state_align_search.h>
-#include <pocketsphinx_internal.h>
-#include <ngram_search.h>
}

using std::runtime_error;
using std::invalid_argument;
using std::unique_ptr;
-using std::shared_ptr;
using std::string;
using std::vector;
using std::map;
using boost::filesystem::path;
-using std::function;
using std::regex;
using std::regex_replace;
-using std::chrono::duration;
using boost::optional;
-using std::string;
-using std::chrono::duration_cast;
using std::array;

-constexpr int sphinxSampleRate = 16000;
-
-const path& getSphinxModelDirectory() {
-    static path sphinxModelDirectory(getBinDirectory() / "res" / "sphinx");
-    return sphinxModelDirectory;
-}
-
-logging::Level ConvertSphinxErrorLevel(err_lvl_t errorLevel) {
-    switch (errorLevel) {
-    case ERR_DEBUG:
-    case ERR_INFO:
-    case ERR_INFOCONT:
-        return logging::Level::Trace;
-    case ERR_WARN:
-        return logging::Level::Warn;
-    case ERR_ERROR:
-        return logging::Level::Error;
-    case ERR_FATAL:
-        return logging::Level::Fatal;
-    default:
-        throw invalid_argument("Unknown log level.");
-    }
-}
-
-void sphinxLogCallback(void* user_data, err_lvl_t errorLevel, const char* format, ...) {
-    UNUSED(user_data);
-
-    // Create varArgs list
-    va_list args;
-    va_start(args, format);
-    auto _ = gsl::finally([&args]() { va_end(args); });
-
-    // Format message
-    const int initialSize = 256;
-    vector<char> chars(initialSize);
-    bool success = false;
-    while (!success) {
-        int charsWritten = vsnprintf(chars.data(), chars.size(), format, args);
-        if (charsWritten < 0) throw runtime_error("Error formatting Pocketsphinx log message.");
-        success = charsWritten < static_cast<int>(chars.size());
-        if (!success) chars.resize(chars.size() * 2);
-    }
-    regex waste("^(DEBUG|INFO|INFOCONT|WARN|ERROR|FATAL): ");
-    string message = regex_replace(chars.data(), waste, "", std::regex_constants::format_first_only);
-    boost::algorithm::trim(message);
-
-    logging::Level logLevel = ConvertSphinxErrorLevel(errorLevel);
-    logging::log(logLevel, message);
-}
-
-BoundedTimeline<string> recognizeWords(const vector<int16_t>& audioBuffer, ps_decoder_t& decoder) {
-    // Restart timing at 0
-    ps_start_stream(&decoder);
-
-    // Start recognition
-    int error = ps_start_utt(&decoder);
-    if (error) throw runtime_error("Error starting utterance processing for word recognition.");
-
-    // Process entire audio clip
-    const bool noRecognition = false;
-    const bool fullUtterance = true;
-    int searchedFrameCount = ps_process_raw(&decoder, audioBuffer.data(), audioBuffer.size(), noRecognition, fullUtterance);
-    if (searchedFrameCount < 0) throw runtime_error("Error analyzing raw audio data for word recognition.");
-
-    // End recognition
-    error = ps_end_utt(&decoder);
-    if (error) throw runtime_error("Error ending utterance processing for word recognition.");
-
-    BoundedTimeline<string> result(TimeRange(0_cs, centiseconds(100 * audioBuffer.size() / sphinxSampleRate)));
-    bool noWordsRecognized = reinterpret_cast<ngram_search_t*>(decoder.search)->bpidx == 0;
-    if (noWordsRecognized) {
-        return result;
-    }
-
-    // Collect words
-    for (ps_seg_t* it = ps_seg_iter(&decoder); it; it = ps_seg_next(it)) {
-        const char* word = ps_seg_word(it);
-        int firstFrame, lastFrame;
-        ps_seg_frames(it, &firstFrame, &lastFrame);
-        result.set(centiseconds(firstFrame), centiseconds(lastFrame + 1), word);
-    }
-
-    return result;
-}
+bool dictionaryContains(dict_t& dictionary, const string& word) {
+    return dict_wordid(&dictionary, word.c_str()) != BAD_S3WID;
+}

s3wid_t getWordId(const string& word, dict_t& dictionary) {
-    s3wid_t wordId = dict_wordid(&dictionary, word.c_str());
+    const s3wid_t wordId = dict_wordid(&dictionary, word.c_str());
    if (wordId == BAD_S3WID) throw invalid_argument(fmt::format("Unknown word '{}'.", word));
    return wordId;
}

+void addMissingDictionaryWords(const vector<string>& words, ps_decoder_t& decoder) {
+    map<string, string> missingPronunciations;
+    for (const string& word : words) {
+        if (!dictionaryContains(*decoder.dict, word)) {
+            string pronunciation;
+            for (Phone phone : wordToPhones(word)) {
+                if (pronunciation.length() > 0) pronunciation += " ";
+                pronunciation += PhoneConverter::get().toString(phone);
+            }
+            missingPronunciations[word] = pronunciation;
+        }
+    }
+    for (auto it = missingPronunciations.begin(); it != missingPronunciations.end(); ++it) {
+        const bool isLast = it == --missingPronunciations.end();
+        logging::infoFormat("Unknown word '{}'. Guessing pronunciation '{}'.", it->first, it->second);
+        ps_add_word(&decoder, it->first.c_str(), it->second.c_str(), isLast);
+    }
+}
+
+lambda_unique_ptr<ngram_model_t> createDefaultLanguageModel(ps_decoder_t& decoder) {
+    path modelPath = getSphinxModelDirectory() / "en-us.lm.bin";
+    lambda_unique_ptr<ngram_model_t> result(
+        ngram_model_read(decoder.config, modelPath.string().c_str(), NGRAM_AUTO, decoder.lmath),
+        [](ngram_model_t* lm) { ngram_model_free(lm); });
+    if (!result) {
+        throw runtime_error(fmt::format("Error reading language model from {}.", modelPath));
+    }
+
+    return result;
+}
+
+lambda_unique_ptr<ngram_model_t> createDialogLanguageModel(ps_decoder_t& decoder, const string& dialog) {
+    // Split dialog into normalized words
+    vector<string> words = tokenizeText(dialog, [&](const string& word) { return dictionaryContains(*decoder.dict, word); });
+
+    // Add dialog-specific words to the dictionary
+    addMissingDictionaryWords(words, decoder);
+
+    // Create dialog-specific language model
+    words.insert(words.begin(), "<s>");
+    words.emplace_back("</s>");
+    return createLanguageModel(words, decoder);
+}
+
+lambda_unique_ptr<ngram_model_t> createBiasedLanguageModel(ps_decoder_t& decoder, const string& dialog) {
+    auto defaultLanguageModel = createDefaultLanguageModel(decoder);
+    auto dialogLanguageModel = createDialogLanguageModel(decoder, dialog);
+    constexpr int modelCount = 2;
+    array<ngram_model_t*, modelCount> languageModels{ defaultLanguageModel.get(), dialogLanguageModel.get() };
+    array<const char*, modelCount> modelNames{ "defaultLM", "dialogLM" };
+    array<float, modelCount> modelWeights{ 0.1f, 0.9f };
+    lambda_unique_ptr<ngram_model_t> result(
+        ngram_model_set_init(nullptr, languageModels.data(), const_cast<char**>(modelNames.data()), modelWeights.data(), modelCount),
+        [](ngram_model_t* lm) { ngram_model_free(lm); });
+    if (!result) {
+        throw runtime_error("Error creating biased language model.");
+    }
+
+    return result;
+}
+
+static lambda_unique_ptr<ps_decoder_t> createDecoder(optional<std::string> dialog) {
+    lambda_unique_ptr<cmd_ln_t> config(
+        cmd_ln_init(
+            nullptr, ps_args(), true,
+            // Set acoustic model
+            "-hmm", (getSphinxModelDirectory() / "acoustic-model").string().c_str(),
+            // Set pronunciation dictionary
+            "-dict", (getSphinxModelDirectory() / "cmudict-en-us.dict").string().c_str(),
+            // Add noise against zero silence (see http://cmusphinx.sourceforge.net/wiki/faq#qwhy_my_accuracy_is_poor)
+            "-dither", "yes",
+            // Disable VAD -- we're doing that ourselves
+            "-remove_silence", "no",
+            // Perform per-utterance cepstral mean normalization
+            "-cmn", "batch",
+            nullptr),
+        [](cmd_ln_t* config) { cmd_ln_free_r(config); });
+    if (!config) throw runtime_error("Error creating configuration.");
+
+    lambda_unique_ptr<ps_decoder_t> decoder(
+        ps_init(config.get()),
+        [](ps_decoder_t* recognizer) { ps_free(recognizer); });
+    if (!decoder) throw runtime_error("Error creating speech decoder.");
+
+    // Set language model
+    lambda_unique_ptr<ngram_model_t> languageModel(dialog
+        ? createBiasedLanguageModel(*decoder, *dialog)
+        : createDefaultLanguageModel(*decoder));
+    ps_set_lm(decoder.get(), "lm", languageModel.get());
+    ps_set_search(decoder.get(), "lm");
+
+    return decoder;
+}
+
optional<Timeline<Phone>> getPhoneAlignment(
    const vector<s3wid_t>& wordIds,
    const vector<int16_t>& audioBuffer,
@@ -178,7 +168,7 @@ optional<Timeline<Phone>> getPhoneAlignment(
    // Process entire audio clip
    const int16* nextSample = audioBuffer.data();
    size_t remainingSamples = audioBuffer.size();
-    bool fullUtterance = true;
+    const bool fullUtterance = true;
    while (acmod_process_raw(acousticModel, &nextSample, &remainingSamples, fullUtterance) > 0) {
        while (acousticModel->n_feat_frame > 0) {
            ps_search_step(search.get(), acousticModel->output_frame);
@@ -197,7 +187,7 @@ optional<Timeline<Phone>> getPhoneAlignment(
    for (ps_alignment_iter_t* it = ps_alignment_phones(alignment.get()); it; it = ps_alignment_iter_next(it)) {
        // Get phone
        ps_alignment_entry_t* phoneEntry = ps_alignment_iter_get(it);
-        s3cipid_t phoneId = phoneEntry->id.pid.cipid;
+        const s3cipid_t phoneId = phoneEntry->id.pid.cipid;
        string phoneName = phoneNames[phoneId];

        if (phoneName == "SIL") continue;
@@ -207,135 +197,15 @@ optional<Timeline<Phone>> getPhoneAlignment(
        centiseconds duration(phoneEntry->duration);
        Phone phone = PhoneConverter::get().parse(phoneName);
        if (phone == Phone::AH && duration < 6_cs) {
-            // Heuristic: < 6_cs is schwa. Pocketsphinx doesn't differentiate.
+            // Heuristic: < 6_cs is schwa. PocketSphinx doesn't differentiate.
            phone = Phone::Schwa;
        }
-        Timed<Phone> timedPhone(start, start + duration, phone);
+        const Timed<Phone> timedPhone(start, start + duration, phone);
        result.set(timedPhone);
    }
    return result;
}

-bool dictionaryContains(dict_t& dictionary, const string& word) {
-    return dict_wordid(&dictionary, word.c_str()) != BAD_S3WID;
-}
-
-void addMissingDictionaryWords(const vector<string>& words, ps_decoder_t& decoder) {
-    map<string, string> missingPronunciations;
-    for (const string& word : words) {
-        if (!dictionaryContains(*decoder.dict, word)) {
-            string pronunciation;
-            for (Phone phone : wordToPhones(word)) {
-                if (pronunciation.length() > 0) pronunciation += " ";
-                pronunciation += PhoneConverter::get().toString(phone);
-            }
-            missingPronunciations[word] = pronunciation;
-        }
-    }
-    for (auto it = missingPronunciations.begin(); it != missingPronunciations.end(); ++it) {
-        bool isLast = it == --missingPronunciations.end();
-        logging::infoFormat("Unknown word '{}'. Guessing pronunciation '{}'.", it->first, it->second);
-        ps_add_word(&decoder, it->first.c_str(), it->second.c_str(), isLast);
-    }
-}
-
-lambda_unique_ptr<ngram_model_t> createDefaultLanguageModel(ps_decoder_t& decoder) {
-    path modelPath = getSphinxModelDirectory() / "en-us.lm.bin";
-    lambda_unique_ptr<ngram_model_t> result(
-        ngram_model_read(decoder.config, modelPath.string().c_str(), NGRAM_AUTO, decoder.lmath),
-        [](ngram_model_t* lm) { ngram_model_free(lm); });
-    if (!result) {
-        throw runtime_error(fmt::format("Error reading language model from {}.", modelPath));
-    }
-
-    return std::move(result);
-}
-
-lambda_unique_ptr<ngram_model_t> createDialogLanguageModel(ps_decoder_t& decoder, const string& dialog) {
-    // Split dialog into normalized words
-    vector<string> words = tokenizeText(dialog, [&](const string& word) { return dictionaryContains(*decoder.dict, word); });
-
-    // Add dialog-specific words to the dictionary
-    addMissingDictionaryWords(words, decoder);
-
-    // Create dialog-specific language model
-    words.insert(words.begin(), "<s>");
-    words.push_back("</s>");
-    return createLanguageModel(words, decoder);
-}
-
-lambda_unique_ptr<ngram_model_t> createBiasedLanguageModel(ps_decoder_t& decoder, const string& dialog) {
-    auto defaultLanguageModel = createDefaultLanguageModel(decoder);
-    auto dialogLanguageModel = createDialogLanguageModel(decoder, dialog);
-    constexpr int modelCount = 2;
-    array<ngram_model_t*, modelCount> languageModels{ defaultLanguageModel.get(), dialogLanguageModel.get() };
-    array<char*, modelCount> modelNames{ "defaultLM", "dialogLM" };
-    array<float, modelCount> modelWeights{ 0.1f, 0.9f };
-    lambda_unique_ptr<ngram_model_t> result(
-        ngram_model_set_init(nullptr, languageModels.data(), modelNames.data(), modelWeights.data(), modelCount),
-        [](ngram_model_t* lm) { ngram_model_free(lm); });
-    if (!result) {
-        throw runtime_error("Error creating biased language model.");
-    }
-
-    return std::move(result);
-}
-
-lambda_unique_ptr<ps_decoder_t> createDecoder(optional<string> dialog) {
-    lambda_unique_ptr<cmd_ln_t> config(
-        cmd_ln_init(
-            nullptr, ps_args(), true,
-            // Set acoustic model
-            "-hmm", (getSphinxModelDirectory() / "acoustic-model").string().c_str(),
-            // Set pronunciation dictionary
-            "-dict", (getSphinxModelDirectory() / "cmudict-en-us.dict").string().c_str(),
-            // Add noise against zero silence (see http://cmusphinx.sourceforge.net/wiki/faq#qwhy_my_accuracy_is_poor)
-            "-dither", "yes",
-            // Disable VAD -- we're doing that ourselves
-            "-remove_silence", "no",
-            // Perform per-utterance cepstral mean normalization
-            "-cmn", "batch",
-            nullptr),
-        [](cmd_ln_t* config) { cmd_ln_free_r(config); });
-    if (!config) throw runtime_error("Error creating configuration.");
-
-    lambda_unique_ptr<ps_decoder_t> decoder(
-        ps_init(config.get()),
-        [](ps_decoder_t* recognizer) { ps_free(recognizer); });
-    if (!decoder) throw runtime_error("Error creating speech decoder.");
-
-    // Set language model
-    lambda_unique_ptr<ngram_model_t> languageModel(dialog
-        ? createBiasedLanguageModel(*decoder, *dialog)
-        : createDefaultLanguageModel(*decoder));
-    ps_set_lm(decoder.get(), "lm", languageModel.get());
-    ps_set_search(decoder.get(), "lm");
-
-    return decoder;
-}
-
-JoiningTimeline<void> getNoiseSounds(TimeRange utteranceTimeRange, const Timeline<Phone>& phones) {
-    JoiningTimeline<void> noiseSounds;
-
-    // Find utterance parts without recogniced phones
-    noiseSounds.set(utteranceTimeRange);
-    for (const auto& timedPhone : phones) {
-        noiseSounds.clear(timedPhone.getTimeRange());
-    }
-
-    // Remove undesired elements
-    const centiseconds minSoundDuration = 12_cs;
-    for (const auto& unknownSound : JoiningTimeline<void>(noiseSounds)) {
-        bool startsAtZero = unknownSound.getStart() == 0_cs;
-        bool tooShort = unknownSound.getDuration() < minSoundDuration;
-        if (startsAtZero || tooShort) {
-            noiseSounds.clear(unknownSound.getTimeRange());
-        }
-    }
-
-    return noiseSounds;
-}
-
// Some words have multiple pronunciations, one of which results in better animation than the others.
// This function returns the optimal pronunciation for a select set of these words.
string fixPronunciation(const string& word) {
@@ -352,17 +222,17 @@ string fixPronunciation(const string& word) {
    return pair != replacements.end() ? pair->second : word;
}

-Timeline<Phone> utteranceToPhones(
+static Timeline<Phone> utteranceToPhones(
    const AudioClip& audioClip,
    TimeRange utteranceTimeRange,
    ps_decoder_t& decoder,
-    ProgressSink& utteranceProgressSink)
-{
+    ProgressSink& utteranceProgressSink
+) {
    ProgressMerger utteranceProgressMerger(utteranceProgressSink);
    ProgressSink& wordRecognitionProgressSink = utteranceProgressMerger.addSink(1.0);
    ProgressSink& alignmentProgressSink = utteranceProgressMerger.addSink(0.5);

-    // Pad time range to give Pocketsphinx some breathing room
+    // Pad time range to give PocketSphinx some breathing room
    TimeRange paddedTimeRange = utteranceTimeRange;
    const centiseconds padding(3);
    paddedTimeRange.grow(padding);
@@ -384,7 +254,7 @@ Timeline<Phone> utteranceToPhones(
            continue;
        }
        word = regex_replace(word, regex("\\(\\d\\)"), "");
-        if (text.size() > 0) {
+        if (!text.empty()) {
            text += " ";
        }
        text += word;
@@ -433,77 +303,11 @@ Timeline<Phone> utteranceToPhones(
    return utterancePhones;
}

-BoundedTimeline<Phone> recognizePhones(
+BoundedTimeline<Phone> PocketSphinxRecognizer::recognizePhones(
    const AudioClip& inputAudioClip,
-    optional<string> dialog,
+    optional<std::string> dialog,
    int maxThreadCount,
-    ProgressSink& progressSink)
-{
-    ProgressMerger totalProgressMerger(progressSink);
-    ProgressSink& voiceActivationProgressSink = totalProgressMerger.addSink(1.0);
-    ProgressSink& dialogProgressSink = totalProgressMerger.addSink(15);
-
-    // Make sure audio stream has no DC offset
-    const unique_ptr<AudioClip> audioClip = inputAudioClip.clone() | removeDcOffset();
-
-    // Split audio into utterances
-    JoiningBoundedTimeline<void> utterances;
-    try {
-        utterances = detectVoiceActivity(*audioClip, maxThreadCount, voiceActivationProgressSink);
-    }
-    catch (...) {
-        std::throw_with_nested(runtime_error("Error detecting segments of speech."));
-    }
-
-    // Discard Pocketsphinx output
-    err_set_logfp(nullptr);
-
-    // Redirect Pocketsphinx output to log
-    err_set_callback(sphinxLogCallback, nullptr);
-
-    // Prepare pool of decoders
-    ObjectPool<ps_decoder_t, lambda_unique_ptr<ps_decoder_t>> decoderPool(
-        [&dialog] { return createDecoder(dialog); });
-
-    BoundedTimeline<Phone> phones(audioClip->getTruncatedRange());
-    std::mutex resultMutex;
-    auto processUtterance = [&](Timed<void> timedUtterance, ProgressSink& utteranceProgressSink) {
-        // Detect phones for utterance
-        auto decoder = decoderPool.acquire();
-        Timeline<Phone> utterancePhones =
-            utteranceToPhones(*audioClip, timedUtterance.getTimeRange(), *decoder, utteranceProgressSink);
-
-        // Copy phones to result timeline
-        std::lock_guard<std::mutex> lock(resultMutex);
-        for (const auto& timedPhone : utterancePhones) {
-            phones.set(timedPhone);
-        }
-    };
-
-    auto getUtteranceProgressWeight = [](const Timed<void> timedUtterance) {
-        return timedUtterance.getDuration().count();
-    };
-
-    // Perform speech recognition
-    try {
-        // Determine how many parallel threads to use
-        int threadCount = std::min({
-            maxThreadCount,
-            // Don't use more threads than there are utterances to be processed
-            static_cast<int>(utterances.size()),
-            // Don't waste time creating additional threads (and decoders!) if the recording is short
-            static_cast<int>(duration_cast<std::chrono::seconds>(audioClip->getTruncatedRange().getDuration()).count() / 5)
-        });
-        if (threadCount < 1) {
-            threadCount = 1;
-        }
-        logging::debugFormat("Speech recognition using {} threads -- start", threadCount);
-        runParallel(processUtterance, utterances, threadCount, dialogProgressSink, getUtteranceProgressWeight);
-        logging::debug("Speech recognition -- end");
-    }
-    catch (...) {
-        std::throw_with_nested(runtime_error("Error performing speech recognition via Pocketsphinx."));
-    }
-
-    return phones;
+    ProgressSink& progressSink
+) const {
+    return ::recognizePhones(inputAudioClip, dialog, &createDecoder, &utteranceToPhones, maxThreadCount, progressSink);
}

View File

@ -0,0 +1,14 @@
#pragma once

#include "Recognizer.h"
#include "pocketSphinxTools.h"

class PocketSphinxRecognizer : public Recognizer {
public:
    BoundedTimeline<Phone> recognizePhones(
        const AudioClip& inputAudioClip,
        boost::optional<std::string> dialog,
        int maxThreadCount,
        ProgressSink& progressSink
    ) const override;
};

View File

@ -0,0 +1,18 @@
#pragma once

#include "audio/AudioClip.h"
#include "core/Phone.h"
#include "tools/ProgressBar.h"
#include "time/BoundedTimeline.h"

class Recognizer {
public:
    virtual ~Recognizer() = default;

    virtual BoundedTimeline<Phone> recognizePhones(
        const AudioClip& audioClip,
        boost::optional<std::string> dialog,
        int maxThreadCount,
        ProgressSink& progressSink
    ) const = 0;
};
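
Any phone-recognition engine can be plugged in behind this interface. As a sketch, a hypothetical additional recognizer (not part of this commit) would only need to override recognizePhones and return a timeline covering the clip's range:

    class SilentRecognizer : public Recognizer {
    public:
        BoundedTimeline<Phone> recognizePhones(
            const AudioClip& audioClip,
            boost::optional<std::string> dialog,
            int maxThreadCount,
            ProgressSink& progressSink
        ) const override {
            // Report completion and return an empty timeline spanning the clip
            progressSink.reportProgress(1.0);
            return BoundedTimeline<Phone>(audioClip.getTruncatedRange());
        }
    };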

View File

@ -1,12 +0,0 @@
#pragma once

#include "audio/AudioClip.h"
#include "core/Phone.h"
#include "tools/ProgressBar.h"
#include "time/BoundedTimeline.h"

BoundedTimeline<Phone> recognizePhones(
    const AudioClip& audioClip,
    boost::optional<std::string> dialog,
    int maxThreadCount,
    ProgressSink& progressSink);

View File

@ -0,0 +1,218 @@
#include "pocketSphinxTools.h"
#include "tools/platformTools.h"
#include <regex>
#include "audio/DcOffset.h"
#include "audio/voiceActivityDetection.h"
#include "tools/parallel.h"
#include "tools/ObjectPool.h"
#include "time/timedLogging.h"

extern "C" {
#include <sphinxbase/err.h>
#include <pocketsphinx_internal.h>
#include <ngram_search.h>
}

using std::runtime_error;
using std::invalid_argument;
using std::unique_ptr;
using std::string;
using std::vector;
using boost::filesystem::path;
using std::regex;
using boost::optional;
using std::chrono::duration_cast;

logging::Level convertSphinxErrorLevel(err_lvl_t errorLevel) {
    switch (errorLevel) {
    case ERR_DEBUG:
    case ERR_INFO:
    case ERR_INFOCONT:
        return logging::Level::Trace;
    case ERR_WARN:
        return logging::Level::Warn;
    case ERR_ERROR:
        return logging::Level::Error;
    case ERR_FATAL:
        return logging::Level::Fatal;
    default:
        throw invalid_argument("Unknown log level.");
    }
}

void sphinxLogCallback(void* user_data, err_lvl_t errorLevel, const char* format, ...) {
    UNUSED(user_data);

    // Create varArgs list
    va_list args;
    va_start(args, format);
    auto _ = gsl::finally([&args]() { va_end(args); });

    // Format message
    const int initialSize = 256;
    vector<char> chars(initialSize);
    bool success = false;
    while (!success) {
        const int charsWritten = vsnprintf(chars.data(), chars.size(), format, args);
        if (charsWritten < 0) throw runtime_error("Error formatting PocketSphinx log message.");
        success = charsWritten < static_cast<int>(chars.size());
        if (!success) chars.resize(chars.size() * 2);
    }
    const regex waste("^(DEBUG|INFO|INFOCONT|WARN|ERROR|FATAL): ");
    string message = std::regex_replace(chars.data(), waste, "", std::regex_constants::format_first_only);
    boost::algorithm::trim(message);

    const logging::Level logLevel = convertSphinxErrorLevel(errorLevel);
    logging::log(logLevel, message);
}

void redirectPocketSphinxOutput() {
    static bool redirected = false;
    if (redirected) return;

    // Discard PocketSphinx output
    err_set_logfp(nullptr);

    // Redirect PocketSphinx output to log
    err_set_callback(sphinxLogCallback, nullptr);

    redirected = true;
}

BoundedTimeline<Phone> recognizePhones(
    const AudioClip& inputAudioClip,
    optional<std::string> dialog,
    decoderFactory createDecoder,
    utteranceToPhonesFunction utteranceToPhones,
    int maxThreadCount,
    ProgressSink& progressSink
) {
    ProgressMerger totalProgressMerger(progressSink);
    ProgressSink& voiceActivationProgressSink = totalProgressMerger.addSink(1.0);
    ProgressSink& dialogProgressSink = totalProgressMerger.addSink(15);

    // Make sure audio stream has no DC offset
    const unique_ptr<AudioClip> audioClip = inputAudioClip.clone() | removeDcOffset();

    // Split audio into utterances
    JoiningBoundedTimeline<void> utterances;
    try {
        utterances = detectVoiceActivity(*audioClip, maxThreadCount, voiceActivationProgressSink);
    } catch (...) {
        std::throw_with_nested(runtime_error("Error detecting segments of speech."));
    }

    redirectPocketSphinxOutput();

    // Prepare pool of decoders
    ObjectPool<ps_decoder_t, lambda_unique_ptr<ps_decoder_t>> decoderPool(
        [&] { return createDecoder(dialog); });

    BoundedTimeline<Phone> phones(audioClip->getTruncatedRange());
    std::mutex resultMutex;
    const auto processUtterance = [&](Timed<void> timedUtterance, ProgressSink& utteranceProgressSink) {
        // Detect phones for utterance
        const auto decoder = decoderPool.acquire();
        Timeline<Phone> utterancePhones =
            utteranceToPhones(*audioClip, timedUtterance.getTimeRange(), *decoder, utteranceProgressSink);

        // Copy phones to result timeline
        std::lock_guard<std::mutex> lock(resultMutex);
        for (const auto& timedPhone : utterancePhones) {
            phones.set(timedPhone);
        }
    };

    const auto getUtteranceProgressWeight = [](const Timed<void> timedUtterance) {
        return timedUtterance.getDuration().count();
    };

    // Perform speech recognition
    try {
        // Determine how many parallel threads to use
        int threadCount = std::min({
            maxThreadCount,
            // Don't use more threads than there are utterances to be processed
            static_cast<int>(utterances.size()),
            // Don't waste time creating additional threads (and decoders!) if the recording is short
            static_cast<int>(duration_cast<std::chrono::seconds>(audioClip->getTruncatedRange().getDuration()).count() / 5)
        });
        if (threadCount < 1) {
            threadCount = 1;
        }
        logging::debugFormat("Speech recognition using {} threads -- start", threadCount);
        runParallel(processUtterance, utterances, threadCount, dialogProgressSink, getUtteranceProgressWeight);
        logging::debug("Speech recognition -- end");
    } catch (...) {
        std::throw_with_nested(runtime_error("Error performing speech recognition via PocketSphinx."));
    }

    return phones;
}

const path& getSphinxModelDirectory() {
    static path sphinxModelDirectory(getBinDirectory() / "res" / "sphinx");
    return sphinxModelDirectory;
}

JoiningTimeline<void> getNoiseSounds(TimeRange utteranceTimeRange, const Timeline<Phone>& phones) {
    JoiningTimeline<void> noiseSounds;

    // Find utterance parts without recognized phones
    noiseSounds.set(utteranceTimeRange);
    for (const auto& timedPhone : phones) {
        noiseSounds.clear(timedPhone.getTimeRange());
    }

    // Remove undesired elements
    const centiseconds minSoundDuration = 12_cs;
    for (const auto& unknownSound : JoiningTimeline<void>(noiseSounds)) {
        const bool startsAtZero = unknownSound.getStart() == 0_cs;
        const bool tooShort = unknownSound.getDuration() < minSoundDuration;
        if (startsAtZero || tooShort) {
            noiseSounds.clear(unknownSound.getTimeRange());
        }
    }

    return noiseSounds;
}

BoundedTimeline<string> recognizeWords(const vector<int16_t>& audioBuffer, ps_decoder_t& decoder) {
    // Restart timing at 0
    ps_start_stream(&decoder);

    // Start recognition
    int error = ps_start_utt(&decoder);
    if (error) throw runtime_error("Error starting utterance processing for word recognition.");

    // Process entire audio clip
    const bool noRecognition = false;
    const bool fullUtterance = true;
    const int searchedFrameCount =
        ps_process_raw(&decoder, audioBuffer.data(), audioBuffer.size(), noRecognition, fullUtterance);
    if (searchedFrameCount < 0) {
        throw runtime_error("Error analyzing raw audio data for word recognition.");
    }

    // End recognition
    error = ps_end_utt(&decoder);
    if (error) throw runtime_error("Error ending utterance processing for word recognition.");

    BoundedTimeline<string> result(TimeRange(0_cs, centiseconds(100 * audioBuffer.size() / sphinxSampleRate)));
    const bool noWordsRecognized = reinterpret_cast<ngram_search_t*>(decoder.search)->bpidx == 0;
    if (noWordsRecognized) {
        return result;
    }

    // Collect words
    for (ps_seg_t* it = ps_seg_iter(&decoder); it; it = ps_seg_next(it)) {
        const char* word = ps_seg_word(it);
        int firstFrame, lastFrame;
        ps_seg_frames(it, &firstFrame, &lastFrame);
        result.set(centiseconds(firstFrame), centiseconds(lastFrame + 1), word);
    }

    return result;
}
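
As a worked example of the thread-count heuristic above: a 30-second recording with four detected utterances and a maxThreadCount of 8 yields min(8, 4, 30 / 5) = 4 threads, while a clip shorter than five seconds makes the duration term 0, and the final clamp raises the count back to 1.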

View File

@ -0,0 +1,39 @@
#pragma once

#include "time/BoundedTimeline.h"
#include "core/Phone.h"
#include "audio/AudioClip.h"
#include "tools/ProgressBar.h"
#include <boost/filesystem/path.hpp>

extern "C" {
#include <pocketsphinx.h>
}

typedef std::function<lambda_unique_ptr<ps_decoder_t>(
    boost::optional<std::string> dialog
)> decoderFactory;

typedef std::function<Timeline<Phone>(
    const AudioClip& audioClip,
    TimeRange utteranceTimeRange,
    ps_decoder_t& decoder,
    ProgressSink& utteranceProgressSink
)> utteranceToPhonesFunction;

BoundedTimeline<Phone> recognizePhones(
    const AudioClip& inputAudioClip,
    boost::optional<std::string> dialog,
    decoderFactory createDecoder,
    utteranceToPhonesFunction utteranceToPhones,
    int maxThreadCount,
    ProgressSink& progressSink
);

constexpr int sphinxSampleRate = 16000;

const boost::filesystem::path& getSphinxModelDirectory();

JoiningTimeline<void> getNoiseSounds(TimeRange utteranceTimeRange, const Timeline<Phone>& phones);

BoundedTimeline<std::string> recognizeWords(const std::vector<int16_t>& audioBuffer, ps_decoder_t& decoder);

View File

@ -0,0 +1,27 @@
#include "RecognizerType.h"

using std::string;

RecognizerTypeConverter& RecognizerTypeConverter::get() {
    static RecognizerTypeConverter converter;
    return converter;
}

string RecognizerTypeConverter::getTypeName() {
    return "RecognizerType";
}

EnumConverter<RecognizerType>::member_data RecognizerTypeConverter::getMemberData() {
    return member_data{
        { RecognizerType::PocketSphinx, "pocketSphinx" },
        { RecognizerType::Phonetic, "phonetic" }
    };
}

std::ostream& operator<<(std::ostream& stream, RecognizerType value) {
    return RecognizerTypeConverter::get().write(stream, value);
}

std::istream& operator>>(std::istream& stream, RecognizerType& value) {
    return RecognizerTypeConverter::get().read(stream, value);
}

View File

@ -0,0 +1,20 @@
#pragma once

#include "tools/EnumConverter.h"

enum class RecognizerType {
    PocketSphinx,
    Phonetic
};

class RecognizerTypeConverter : public EnumConverter<RecognizerType> {
public:
    static RecognizerTypeConverter& get();

protected:
    std::string getTypeName() override;
    member_data getMemberData() override;
};

std::ostream& operator<<(std::ostream& stream, RecognizerType value);
std::istream& operator>>(std::istream& stream, RecognizerType& value);
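
These stream operators let TCLAP treat RecognizerType as a ValueLike argument (see main.cpp below). A small sketch of the round trip, assuming EnumConverter's read and write use the names from getMemberData():

    #include <iostream>
    #include <sstream>
    #include "RecognizerType.h"

    int main() {
        std::istringstream input("phonetic");
        RecognizerType type;
        input >> type;              // type == RecognizerType::Phonetic
        std::cout << type << "\n";  // prints "phonetic"
    }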

View File

@@ -27,6 +27,9 @@
#include "tools/platformTools.h"
#include "sinks.h"
#include "semanticEntries.h"
+#include "RecognizerType.h"
+#include "recognition/PocketSphinxRecognizer.h"
+#include "recognition/PhoneticRecognizer.h"

using std::exception;
using std::string;
@@ -36,9 +39,6 @@ using std::unique_ptr;
using std::make_unique;
using std::shared_ptr;
using std::make_shared;
-using std::map;
-using std::chrono::duration;
-using std::chrono::duration_cast;
using std::ofstream;
using boost::filesystem::path;
using boost::adaptors::transformed;
@@ -56,6 +56,10 @@ namespace TCLAP {
    struct ArgTraits<ExportFormat> {
        typedef ValueLike ValueCategory;
    };
+    template<>
+    struct ArgTraits<RecognizerType> {
+        typedef ValueLike ValueCategory;
+    };
}

shared_ptr<logging::Sink> createFileSink(path path, logging::Level minLevel) {
@@ -66,6 +70,17 @@ shared_ptr<logging::Sink> createFileSink(path path, logging::Level minLevel) {
    return make_shared<logging::LevelFilter>(FileSink, minLevel);
}

+unique_ptr<Recognizer> createRecognizer(RecognizerType recognizerType) {
+    switch (recognizerType) {
+    case RecognizerType::PocketSphinx:
+        return make_unique<PocketSphinxRecognizer>();
+    case RecognizerType::Phonetic:
+        return make_unique<PhoneticRecognizer>();
+    default:
+        throw std::runtime_error("Unknown recognizer.");
+    }
+}
+
unique_ptr<Exporter> createExporter(ExportFormat exportFormat) {
    switch (exportFormat) {
    case ExportFormat::Tsv:
@@ -123,6 +138,9 @@ int main(int platformArgc, char *platformArgv[]) {
    auto exportFormats = vector<ExportFormat>(ExportFormatConverter::get().getValues());
    tclap::ValuesConstraint<ExportFormat> exportFormatConstraint(exportFormats);
    tclap::ValueArg<ExportFormat> exportFormat("f", "exportFormat", "The export format.", false, ExportFormat::Tsv, &exportFormatConstraint, cmd);
+    auto recognizerTypes = vector<RecognizerType>(RecognizerTypeConverter::get().getValues());
+    tclap::ValuesConstraint<RecognizerType> recognizerConstraint(recognizerTypes);
+    tclap::ValueArg<RecognizerType> recognizerType("r", "recognizer", "The dialog recognizer.", false, RecognizerType::PocketSphinx, &recognizerConstraint, cmd);
    tclap::UnlabeledValueArg<string> inputFileName("inputFile", "The input file. Must be a sound file in WAVE format.", true, "", "string", cmd);

    try {
@@ -169,6 +187,7 @@ int main(int platformArgc, char *platformArgv[]) {
    JoiningContinuousTimeline<Shape> animation = animateWaveFile(
        inputFilePath,
        dialogFile.isSet() ? readUtf8File(path(dialogFile.getValue())) : boost::optional<string>(),
+        *createRecognizer(recognizerType.getValue()),
        targetShapeSet,
        maxThreadCount.getValue(),
        progressSink);
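
With these changes in place, the recognizer is selectable on the command line via the new -r / --recognizer option (for example, rhubarb -r phonetic input.wav); pocketSphinx remains the default.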