Implement generic concept of recognizers with options pocketSphinx and phonetic
This commit is contained in:
parent 3ed38ada2f
commit 610f490046
CMakeLists.txt
@@ -413,8 +413,13 @@ add_library(rhubarb-recognition
    src/recognition/g2p.h
    src/recognition/languageModels.cpp
    src/recognition/languageModels.h
    src/recognition/phoneRecognition.cpp
    src/recognition/phoneRecognition.h
    src/recognition/PhoneticRecognizer.cpp
    src/recognition/PhoneticRecognizer.h
    src/recognition/PocketSphinxRecognizer.cpp
    src/recognition/PocketSphinxRecognizer.h
    src/recognition/pocketSphinxTools.cpp
    src/recognition/pocketSphinxTools.h
    src/recognition/Recognizer.h
    src/recognition/tokenization.cpp
    src/recognition/tokenization.h
)
@@ -487,6 +492,8 @@ add_executable(rhubarb
    src/rhubarb/main.cpp
    src/rhubarb/ExportFormat.cpp
    src/rhubarb/ExportFormat.h
    src/rhubarb/RecognizerType.cpp
    src/rhubarb/RecognizerType.h
    src/rhubarb/semanticEntries.cpp
    src/rhubarb/semanticEntries.h
    src/rhubarb/sinks.cpp
@@ -1,7 +1,12 @@
<wpf:ResourceDictionary xml:space="preserve" xmlns:x="http://schemas.microsoft.com/winfx/2006/xaml" xmlns:s="clr-namespace:System;assembly=mscorlib" xmlns:ss="urn:shemas-jetbrains-com:settings-storage-xaml" xmlns:wpf="http://schemas.microsoft.com/winfx/2006/xaml/presentation">
    <s:String x:Key="/Default/CodeInspection/Highlighting/InspectionSeverities/=CppClangTidyModernizeRawStringLiteral/@EntryIndexedValue">HINT</s:String>

    <s:String x:Key="/Default/CodeInspection/Highlighting/InspectionSeverities/=CppFunctionDoesntReturnValue/@EntryIndexedValue">ERROR</s:String>

    <s:String x:Key="/Default/CodeInspection/Highlighting/InspectionSeverities/=LocalizableElement/@EntryIndexedValue">DO_NOT_SHOW</s:String>

    <s:String x:Key="/Default/CodeStyle/CodeFormatting/CommonFormatter/ALIGNMENT_TAB_FILL_STYLE/@EntryValue">USE_TABS_ONLY</s:String>
    <s:String x:Key="/Default/CodeStyle/CodeFormatting/CppFormatting/ALIGNMENT_TAB_FILL_STYLE/@EntryValue">USE_TABS_ONLY</s:String>
    <s:Boolean x:Key="/Default/CodeStyle/CodeFormatting/CppFormatting/ALIGN_MULTILINE_ARGUMENT/@EntryValue">False</s:Boolean>
    <s:Boolean x:Key="/Default/CodeStyle/CodeFormatting/CppFormatting/ALIGN_MULTILINE_CALLS_CHAIN/@EntryValue">False</s:Boolean>
    <s:Boolean x:Key="/Default/CodeStyle/CodeFormatting/CppFormatting/ALIGN_MULTILINE_CTOR_INIT/@EntryValue">False</s:Boolean>
@@ -29,6 +34,7 @@
    <s:String x:Key="/Default/CodeStyle/CodeFormatting/CppFormatting/WRAP_ENUMERATION_STYLE/@EntryValue">CHOP_ALWAYS</s:String>
    <s:String x:Key="/Default/CodeStyle/CodeFormatting/CSharpFormat/ACCESSOR_DECLARATION_BRACES/@EntryValue">END_OF_LINE</s:String>
    <s:String x:Key="/Default/CodeStyle/CodeFormatting/CSharpFormat/ACCESSOR_OWNER_DECLARATION_BRACES/@EntryValue">END_OF_LINE</s:String>
    <s:String x:Key="/Default/CodeStyle/CodeFormatting/CSharpFormat/ALIGNMENT_TAB_FILL_STYLE/@EntryValue">USE_TABS_ONLY</s:String>
    <s:Boolean x:Key="/Default/CodeStyle/CodeFormatting/CSharpFormat/ALIGN_MULTILINE_BINARY_EXPRESSIONS_CHAIN/@EntryValue">False</s:Boolean>
    <s:String x:Key="/Default/CodeStyle/CodeFormatting/CSharpFormat/ANONYMOUS_METHOD_DECLARATION_BRACES/@EntryValue">END_OF_LINE</s:String>
    <s:String x:Key="/Default/CodeStyle/CodeFormatting/CSharpFormat/CASE_BLOCK_BRACES/@EntryValue">END_OF_LINE</s:String>
@@ -44,6 +50,14 @@
    <s:Boolean x:Key="/Default/CodeStyle/CodeFormatting/CSharpFormat/USE_INDENT_FROM_VS/@EntryValue">False</s:Boolean>
    <s:Boolean x:Key="/Default/CodeStyle/CodeFormatting/CSharpFormat/WRAP_BEFORE_BINARY_OPSIGN/@EntryValue">True</s:Boolean>
    <s:Boolean x:Key="/Default/CodeStyle/CodeFormatting/CSharpFormat/WRAP_LINES/@EntryValue">False</s:Boolean>
    <s:String x:Key="/Default/CodeStyle/CodeFormatting/CssFormatter/ALIGNMENT_TAB_FILL_STYLE/@EntryValue">USE_TABS_ONLY</s:String>
    <s:String x:Key="/Default/CodeStyle/CodeFormatting/HtmlFormatter/ALIGNMENT_TAB_FILL_STYLE/@EntryValue">USE_TABS_ONLY</s:String>
    <s:String x:Key="/Default/CodeStyle/CodeFormatting/JavaScriptCodeFormatting/ALIGNMENT_TAB_FILL_STYLE/@EntryValue">USE_TABS_ONLY</s:String>
    <s:String x:Key="/Default/CodeStyle/CodeFormatting/ProtobufCodeFormatting/ALIGNMENT_TAB_FILL_STYLE/@EntryValue">USE_TABS_ONLY</s:String>
    <s:String x:Key="/Default/CodeStyle/CodeFormatting/ResxFormatter/ALIGNMENT_TAB_FILL_STYLE/@EntryValue">USE_TABS_ONLY</s:String>
    <s:String x:Key="/Default/CodeStyle/CodeFormatting/VBFormat/ALIGNMENT_TAB_FILL_STYLE/@EntryValue">USE_TABS_ONLY</s:String>
    <s:String x:Key="/Default/CodeStyle/CodeFormatting/XmlDocFormatter/ALIGNMENT_TAB_FILL_STYLE/@EntryValue">USE_TABS_ONLY</s:String>
    <s:String x:Key="/Default/CodeStyle/CodeFormatting/XmlFormatter/ALIGNMENT_TAB_FILL_STYLE/@EntryValue">USE_TABS_ONLY</s:String>
    <s:String x:Key="/Default/CodeStyle/CSharpVarKeywordUsage/ForBuiltInTypes/@EntryValue">UseExplicitType</s:String>
    <s:String x:Key="/Default/CodeStyle/CSharpVarKeywordUsage/ForSimpleTypes/@EntryValue">UseVarWhenEvident</s:String>
    <s:String x:Key="/Default/CodeStyle/Naming/CppNaming/Rules/=Class_0020and_0020struct_0020fields/@EntryIndexedValue"><NamingElement Priority="10"><Descriptor Static="Indeterminate" Constexpr="Indeterminate" Const="Indeterminate" Volatile="Indeterminate" Accessibility="NOT_APPLICABLE"><type Name="class field" /><type Name="struct field" /></Descriptor><Policy Inspect="True" Prefix="" Suffix="_" Style="aaBb" /></NamingElement></s:String>
@@ -108,7 +122,16 @@
    <s:String x:Key="/Default/Environment/Hierarchy/PsiConfigurationSettingsKey/CustomLocation/@EntryValue">C:\Users\Daniel\AppData\Local\JetBrains\Transient\ReSharperPlatformVs14\v09\SolutionCaches</s:String>
    <s:Boolean x:Key="/Default/Environment/SettingsMigration/IsMigratorApplied/=JetBrains_002EReSharper_002EFeature_002EServices_002ECpp_002ECodeStyle_002ESettingsUpgrade_002EFunctionReturnStyleSettingsUpgrader/@EntryIndexedValue">True</s:Boolean>
    <s:Boolean x:Key="/Default/Environment/SettingsMigration/IsMigratorApplied/=JetBrains_002EReSharper_002EFeature_002EServices_002ECpp_002ECodeStyle_002ESettingsUpgrade_002ENamespaceIndentationSettingsUpgrader/@EntryIndexedValue">True</s:Boolean>
    <s:Boolean x:Key="/Default/Environment/SettingsMigration/IsMigratorApplied/=JetBrains_002EReSharper_002EPsi_002ECSharp_002ECodeStyle_002ECSharpKeepExistingMigration/@EntryIndexedValue">True</s:Boolean>
    <s:Boolean x:Key="/Default/Environment/SettingsMigration/IsMigratorApplied/=JetBrains_002EReSharper_002EPsi_002ECSharp_002ECodeStyle_002ECSharpPlaceEmbeddedOnSameLineMigration/@EntryIndexedValue">True</s:Boolean>
    <s:Boolean x:Key="/Default/Environment/SettingsMigration/IsMigratorApplied/=JetBrains_002EReSharper_002EPsi_002ECSharp_002ECodeStyle_002ESettingsUpgrade_002EAddAccessorOwnerDeclarationBracesMigration/@EntryIndexedValue">True</s:Boolean>
    <s:Boolean x:Key="/Default/Environment/SettingsMigration/IsMigratorApplied/=JetBrains_002EReSharper_002EPsi_002ECSharp_002ECodeStyle_002ESettingsUpgrade_002EMigrateBlankLinesAroundFieldToBlankLinesAroundProperty/@EntryIndexedValue">True</s:Boolean>
    <s:Boolean x:Key="/Default/Environment/SettingsMigration/IsMigratorApplied/=JetBrains_002EReSharper_002EPsi_002ECSharp_002ECodeStyle_002ESettingsUpgrade_002EMigrateThisQualifierSettings/@EntryIndexedValue">True</s:Boolean>
    <s:Boolean x:Key="/Default/Environment/SettingsMigration/IsMigratorApplied/=JetBrains_002EReSharper_002EPsi_002EFormat_002ESettingsUpgrade_002EAlignmentTabFillStyleMigration/@EntryIndexedValue">True</s:Boolean>
    <s:Boolean x:Key="/Default/UserDictionary/Words/=allphone/@EntryIndexedValue">True</s:Boolean>
    <s:Boolean x:Key="/Default/UserDictionary/Words/=cepstral/@EntryIndexedValue">True</s:Boolean>
    <s:Boolean x:Key="/Default/UserDictionary/Words/=cmudict/@EntryIndexedValue">True</s:Boolean>
    <s:Boolean x:Key="/Default/UserDictionary/Words/=pbeam/@EntryIndexedValue">True</s:Boolean>
    <s:Boolean x:Key="/Default/UserDictionary/Words/=qwhy/@EntryIndexedValue">True</s:Boolean>
    <s:Boolean x:Key="/Default/UserDictionary/Words/=Viterbi/@EntryIndexedValue">True</s:Boolean>
</wpf:ResourceDictionary>
rhubarbLib.cpp
@@ -1,6 +1,5 @@
#include "rhubarbLib.h"
#include "core/Phone.h"
#include "recognition/phoneRecognition.h"
#include "tools/textFiles.h"
#include "animation/mouthAnimation.h"
#include "audio/audioFileReading.h"
@@ -8,27 +7,29 @@
using boost::optional;
using std::string;
using boost::filesystem::path;
using std::unique_ptr;

JoiningContinuousTimeline<Shape> animateAudioClip(
    const AudioClip& audioClip,
    optional<string> dialog,
    const optional<string>& dialog,
    const Recognizer& recognizer,
    const ShapeSet& targetShapeSet,
    int maxThreadCount,
    ProgressSink& progressSink)
{
    BoundedTimeline<Phone> phones = recognizePhones(audioClip, dialog, maxThreadCount, progressSink);
    const BoundedTimeline<Phone> phones =
        recognizer.recognizePhones(audioClip, dialog, maxThreadCount, progressSink);
    JoiningContinuousTimeline<Shape> result = animate(phones, targetShapeSet);
    return result;
}

JoiningContinuousTimeline<Shape> animateWaveFile(
    path filePath,
    optional<string> dialog,
    const optional<string>& dialog,
    const Recognizer& recognizer,
    const ShapeSet& targetShapeSet,
    int maxThreadCount,
    ProgressSink& progressSink)
{
    const auto audioClip = createAudioFileClip(filePath);
    return animateAudioClip(*audioClip, dialog, targetShapeSet, maxThreadCount, progressSink);
    return animateAudioClip(*audioClip, dialog, recognizer, targetShapeSet, maxThreadCount, progressSink);
}
rhubarbLib.h
@@ -6,17 +6,20 @@
#include "tools/ProgressBar.h"
#include <boost/filesystem.hpp>
#include "animation/targetShapeSet.h"
#include "recognition/Recognizer.h"

JoiningContinuousTimeline<Shape> animateAudioClip(
    const AudioClip& audioClip,
    boost::optional<std::string> dialog,
    const boost::optional<std::string>& dialog,
    const Recognizer& recognizer,
    const ShapeSet& targetShapeSet,
    int maxThreadCount,
    ProgressSink& progressSink);

JoiningContinuousTimeline<Shape> animateWaveFile(
    boost::filesystem::path filePath,
    boost::optional<std::string> dialog,
    const boost::optional<std::string>& dialog,
    const Recognizer& recognizer,
    const ShapeSet& targetShapeSet,
    int maxThreadCount,
    ProgressSink& progressSink);
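A minimal usage sketch of the updated library entry point (not part of this commit): the recognizer is now chosen by the caller instead of being hard-wired into the library. The file name, dialog text, thread count, shape set, and progress sink below are placeholder assumptions; PocketSphinxRecognizer is defined later in this commit.

    #include "rhubarbLib.h"
    #include "recognition/PocketSphinxRecognizer.h"

    // Hypothetical call site; targetShapeSet and progressSink are assumed
    // to be supplied by the surrounding application.
    JoiningContinuousTimeline<Shape> animateExample(
        const ShapeSet& targetShapeSet, ProgressSink& progressSink)
    {
        const PocketSphinxRecognizer recognizer;
        return animateWaveFile(
            "dialog.wav",                                  // placeholder path
            boost::optional<std::string>("Hello world!"),  // optional dialog text
            recognizer,
            targetShapeSet,
            4,                                             // max thread count
            progressSink);
    }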
src/recognition/PhoneticRecognizer.cpp (new file)
@@ -0,0 +1,103 @@
#include "PhoneticRecognizer.h"
#include "time/Timeline.h"
#include "audio/AudioSegment.h"
#include "audio/SampleRateConverter.h"
#include "audio/processing.h"
#include "time/timedLogging.h"

using std::runtime_error;
using std::unique_ptr;
using std::string;
using boost::optional;

static lambda_unique_ptr<ps_decoder_t> createDecoder(optional<std::string> dialog) {
    UNUSED(dialog);

    lambda_unique_ptr<cmd_ln_t> config(
        cmd_ln_init(
            nullptr, ps_args(), true,
            // Set acoustic model
            "-hmm", (getSphinxModelDirectory() / "acoustic-model").string().c_str(),
            // Set phonetic language model
            "-allphone", (getSphinxModelDirectory() / "en-us-phone.lm.bin").string().c_str(),
            "-allphone_ci", "yes",
            // Set language model probability weight.
            // Low values (<= 0.4) can lead to fluttering animation.
            // High values (>= 1.0) can lead to imprecise or freezing animation.
            "-lw", "0.8",

            // The following settings are recommended at http://cmusphinx.sourceforge.net/wiki/phonemerecognition

            // Set beam width applied to every frame in Viterbi search
            "-beam", "1e-20",
            // Set beam width applied to phone transitions
            "-pbeam", "1e-20",
            nullptr),
        [](cmd_ln_t* config) { cmd_ln_free_r(config); });
    if (!config) throw runtime_error("Error creating configuration.");

    lambda_unique_ptr<ps_decoder_t> decoder(
        ps_init(config.get()),
        [](ps_decoder_t* recognizer) { ps_free(recognizer); });
    if (!decoder) throw runtime_error("Error creating speech decoder.");

    return decoder;
}

static Timeline<Phone> utteranceToPhones(
    const AudioClip& audioClip,
    TimeRange utteranceTimeRange,
    ps_decoder_t& decoder,
    ProgressSink& utteranceProgressSink
) {
    // Pad time range to give PocketSphinx some breathing room
    TimeRange paddedTimeRange = utteranceTimeRange;
    const centiseconds padding(3);
    paddedTimeRange.grow(padding);
    paddedTimeRange.trim(audioClip.getTruncatedRange());

    const unique_ptr<AudioClip> clipSegment = audioClip.clone() | segment(paddedTimeRange) | resample(sphinxSampleRate);
    const auto audioBuffer = copyTo16bitBuffer(*clipSegment);

    // Detect phones (returned as words)
    BoundedTimeline<string> phoneStrings = recognizeWords(audioBuffer, decoder);
    phoneStrings.shift(paddedTimeRange.getStart());
    Timeline<Phone> utterancePhones;
    for (const auto& timedPhoneString : phoneStrings) {
        Phone phone = PhoneConverter::get().parse(timedPhoneString.getValue());
        if (phone == Phone::AH && timedPhoneString.getDuration() < 6_cs) {
            // Heuristic: < 6_cs is schwa. PocketSphinx doesn't differentiate.
            phone = Phone::Schwa;
        }
        utterancePhones.set(timedPhoneString.getTimeRange(), phone);
    }

    // Log raw phones
    for (const auto& timedPhone : utterancePhones) {
        logTimedEvent("rawPhone", timedPhone);
    }

    // Guess positions of noise sounds
    JoiningTimeline<void> noiseSounds = getNoiseSounds(utteranceTimeRange, utterancePhones);
    for (const auto& noiseSound : noiseSounds) {
        utterancePhones.set(noiseSound.getTimeRange(), Phone::Noise);
    }

    // Log phones
    for (const auto& timedPhone : utterancePhones) {
        logTimedEvent("phone", timedPhone);
    }

    utteranceProgressSink.reportProgress(1.0);

    return utterancePhones;
}

BoundedTimeline<Phone> PhoneticRecognizer::recognizePhones(
    const AudioClip& inputAudioClip,
    optional<std::string> dialog,
    int maxThreadCount,
    ProgressSink& progressSink
) const {
    return ::recognizePhones(inputAudioClip, dialog, &createDecoder, &utteranceToPhones, maxThreadCount, progressSink);
}
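The `-lw` comment above is the central tuning note of this recognizer: too low a language-model weight causes fluttering animation, too high causes imprecise or freezing animation. A hedged sketch (not part of this commit) of how that knob could be made injectable for tuning experiments, reusing this file's existing includes and helpers; the function name and default are assumptions:

    // Hypothetical variant of createDecoder above with the language-model
    // weight parameterized; all other settings are unchanged.
    static lambda_unique_ptr<ps_decoder_t> createDecoderWithWeight(double languageModelWeight = 0.8) {
        const std::string weight = std::to_string(languageModelWeight);
        lambda_unique_ptr<cmd_ln_t> config(
            cmd_ln_init(
                nullptr, ps_args(), true,
                "-hmm", (getSphinxModelDirectory() / "acoustic-model").string().c_str(),
                "-allphone", (getSphinxModelDirectory() / "en-us-phone.lm.bin").string().c_str(),
                "-allphone_ci", "yes",
                "-lw", weight.c_str(),  // injected instead of the hard-coded "0.8"
                "-beam", "1e-20",
                "-pbeam", "1e-20",
                nullptr),
            [](cmd_ln_t* config) { cmd_ln_free_r(config); });
        if (!config) throw std::runtime_error("Error creating configuration.");

        lambda_unique_ptr<ps_decoder_t> decoder(
            ps_init(config.get()),
            [](ps_decoder_t* recognizer) { ps_free(recognizer); });
        if (!decoder) throw std::runtime_error("Error creating speech decoder.");
        return decoder;
    }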
src/recognition/PhoneticRecognizer.h (new file)
@@ -0,0 +1,14 @@
#pragma once

#include "Recognizer.h"
#include "pocketSphinxTools.h"

class PhoneticRecognizer : public Recognizer {
public:
    BoundedTimeline<Phone> recognizePhones(
        const AudioClip& inputAudioClip,
        boost::optional<std::string> dialog,
        int maxThreadCount,
        ProgressSink& progressSink
    ) const override;
};
src/recognition/phoneRecognition.cpp → src/recognition/PocketSphinxRecognizer.cpp
@@ -1,143 +1,133 @@
#include <boost/filesystem.hpp>
#include "phoneRecognition.h"
#include "audio/SampleRateConverter.h"
#include "tools/platformTools.h"
#include "tools/tools.h"
#include <format.h>
#include <s3types.h>
#include "PocketSphinxRecognizer.h"
#include <regex>
#include <gsl_util.h>
#include "logging/logging.h"
#include "audio/DcOffset.h"
#include "time/Timeline.h"
#include "audio/voiceActivityDetection.h"
#include "audio/AudioSegment.h"
#include "audio/SampleRateConverter.h"
#include "languageModels.h"
#include "tokenization.h"
#include "g2p.h"
#include "time/ContinuousTimeline.h"
#include "audio/processing.h"
#include "tools/parallel.h"
#include <boost/version.hpp>
#include "tools/ObjectPool.h"
#include "time/timedLogging.h"

extern "C" {
#include <pocketsphinx.h>
#include <sphinxbase/err.h>
#include <ps_alignment.h>
#include <state_align_search.h>
#include <pocketsphinx_internal.h>
#include <ngram_search.h>
}

using std::runtime_error;
using std::invalid_argument;
using std::unique_ptr;
using std::shared_ptr;
using std::string;
using std::vector;
using std::map;
using boost::filesystem::path;
using std::function;
using std::regex;
using std::regex_replace;
using std::chrono::duration;
using boost::optional;
using std::string;
using std::chrono::duration_cast;
using std::array;

constexpr int sphinxSampleRate = 16000;

const path& getSphinxModelDirectory() {
    static path sphinxModelDirectory(getBinDirectory() / "res" / "sphinx");
    return sphinxModelDirectory;
bool dictionaryContains(dict_t& dictionary, const string& word) {
    return dict_wordid(&dictionary, word.c_str()) != BAD_S3WID;
}

logging::Level ConvertSphinxErrorLevel(err_lvl_t errorLevel) {
    switch (errorLevel) {
    case ERR_DEBUG:
    case ERR_INFO:
    case ERR_INFOCONT:
        return logging::Level::Trace;
    case ERR_WARN:
        return logging::Level::Warn;
    case ERR_ERROR:
        return logging::Level::Error;
    case ERR_FATAL:
        return logging::Level::Fatal;
    default:
        throw invalid_argument("Unknown log level.");
s3wid_t getWordId(const string& word, dict_t& dictionary) {
    const s3wid_t wordId = dict_wordid(&dictionary, word.c_str());
    if (wordId == BAD_S3WID) throw invalid_argument(fmt::format("Unknown word '{}'.", word));
    return wordId;
}

void addMissingDictionaryWords(const vector<string>& words, ps_decoder_t& decoder) {
    map<string, string> missingPronunciations;
    for (const string& word : words) {
        if (!dictionaryContains(*decoder.dict, word)) {
            string pronunciation;
            for (Phone phone : wordToPhones(word)) {
                if (pronunciation.length() > 0) pronunciation += " ";
                pronunciation += PhoneConverter::get().toString(phone);
            }
            missingPronunciations[word] = pronunciation;
        }
    }
    for (auto it = missingPronunciations.begin(); it != missingPronunciations.end(); ++it) {
        const bool isLast = it == --missingPronunciations.end();
        logging::infoFormat("Unknown word '{}'. Guessing pronunciation '{}'.", it->first, it->second);
        ps_add_word(&decoder, it->first.c_str(), it->second.c_str(), isLast);
    }
}

void sphinxLogCallback(void* user_data, err_lvl_t errorLevel, const char* format, ...) {
    UNUSED(user_data);

    // Create varArgs list
    va_list args;
    va_start(args, format);
    auto _ = gsl::finally([&args]() { va_end(args); });

    // Format message
    const int initialSize = 256;
    vector<char> chars(initialSize);
    bool success = false;
    while (!success) {
        int charsWritten = vsnprintf(chars.data(), chars.size(), format, args);
        if (charsWritten < 0) throw runtime_error("Error formatting Pocketsphinx log message.");

        success = charsWritten < static_cast<int>(chars.size());
        if (!success) chars.resize(chars.size() * 2);
    }
    regex waste("^(DEBUG|INFO|INFOCONT|WARN|ERROR|FATAL): ");
    string message = regex_replace(chars.data(), waste, "", std::regex_constants::format_first_only);
    boost::algorithm::trim(message);

    logging::Level logLevel = ConvertSphinxErrorLevel(errorLevel);
    logging::log(logLevel, message);
}

BoundedTimeline<string> recognizeWords(const vector<int16_t>& audioBuffer, ps_decoder_t& decoder) {
    // Restart timing at 0
    ps_start_stream(&decoder);

    // Start recognition
    int error = ps_start_utt(&decoder);
    if (error) throw runtime_error("Error starting utterance processing for word recognition.");

    // Process entire audio clip
    const bool noRecognition = false;
    const bool fullUtterance = true;
    int searchedFrameCount = ps_process_raw(&decoder, audioBuffer.data(), audioBuffer.size(), noRecognition, fullUtterance);
    if (searchedFrameCount < 0) throw runtime_error("Error analyzing raw audio data for word recognition.");

    // End recognition
    error = ps_end_utt(&decoder);
    if (error) throw runtime_error("Error ending utterance processing for word recognition.");

    BoundedTimeline<string> result(TimeRange(0_cs, centiseconds(100 * audioBuffer.size() / sphinxSampleRate)));
    bool noWordsRecognized = reinterpret_cast<ngram_search_t*>(decoder.search)->bpidx == 0;
    if (noWordsRecognized) {
        return result;
    }

    // Collect words
    for (ps_seg_t* it = ps_seg_iter(&decoder); it; it = ps_seg_next(it)) {
        const char* word = ps_seg_word(it);
        int firstFrame, lastFrame;
        ps_seg_frames(it, &firstFrame, &lastFrame);
        result.set(centiseconds(firstFrame), centiseconds(lastFrame + 1), word);
lambda_unique_ptr<ngram_model_t> createDefaultLanguageModel(ps_decoder_t& decoder) {
    path modelPath = getSphinxModelDirectory() / "en-us.lm.bin";
    lambda_unique_ptr<ngram_model_t> result(
        ngram_model_read(decoder.config, modelPath.string().c_str(), NGRAM_AUTO, decoder.lmath),
        [](ngram_model_t* lm) { ngram_model_free(lm); });
    if (!result) {
        throw runtime_error(fmt::format("Error reading language model from {}.", modelPath));
    }

    return result;
}

s3wid_t getWordId(const string& word, dict_t& dictionary) {
    s3wid_t wordId = dict_wordid(&dictionary, word.c_str());
    if (wordId == BAD_S3WID) throw invalid_argument(fmt::format("Unknown word '{}'.", word));
    return wordId;
lambda_unique_ptr<ngram_model_t> createDialogLanguageModel(ps_decoder_t& decoder, const string& dialog) {
    // Split dialog into normalized words
    vector<string> words = tokenizeText(dialog, [&](const string& word) { return dictionaryContains(*decoder.dict, word); });

    // Add dialog-specific words to the dictionary
    addMissingDictionaryWords(words, decoder);

    // Create dialog-specific language model
    words.insert(words.begin(), "<s>");
    words.emplace_back("</s>");
    return createLanguageModel(words, decoder);
}

lambda_unique_ptr<ngram_model_t> createBiasedLanguageModel(ps_decoder_t& decoder, const string& dialog) {
    auto defaultLanguageModel = createDefaultLanguageModel(decoder);
    auto dialogLanguageModel = createDialogLanguageModel(decoder, dialog);
    constexpr int modelCount = 2;
    array<ngram_model_t*, modelCount> languageModels{ defaultLanguageModel.get(), dialogLanguageModel.get() };
    array<const char*, modelCount> modelNames{ "defaultLM", "dialogLM" };
    array<float, modelCount> modelWeights{ 0.1f, 0.9f };
    lambda_unique_ptr<ngram_model_t> result(
        ngram_model_set_init(nullptr, languageModels.data(), const_cast<char**>(modelNames.data()), modelWeights.data(), modelCount),
        [](ngram_model_t* lm) { ngram_model_free(lm); });
    if (!result) {
        throw runtime_error("Error creating biased language model.");
    }

    return result;
}

static lambda_unique_ptr<ps_decoder_t> createDecoder(optional<std::string> dialog) {
    lambda_unique_ptr<cmd_ln_t> config(
        cmd_ln_init(
            nullptr, ps_args(), true,
            // Set acoustic model
            "-hmm", (getSphinxModelDirectory() / "acoustic-model").string().c_str(),
            // Set pronunciation dictionary
            "-dict", (getSphinxModelDirectory() / "cmudict-en-us.dict").string().c_str(),
            // Add noise against zero silence (see http://cmusphinx.sourceforge.net/wiki/faq#qwhy_my_accuracy_is_poor)
            "-dither", "yes",
            // Disable VAD -- we're doing that ourselves
            "-remove_silence", "no",
            // Perform per-utterance cepstral mean normalization
            "-cmn", "batch",
            nullptr),
        [](cmd_ln_t* config) { cmd_ln_free_r(config); });
    if (!config) throw runtime_error("Error creating configuration.");

    lambda_unique_ptr<ps_decoder_t> decoder(
        ps_init(config.get()),
        [](ps_decoder_t* recognizer) { ps_free(recognizer); });
    if (!decoder) throw runtime_error("Error creating speech decoder.");

    // Set language model
    lambda_unique_ptr<ngram_model_t> languageModel(dialog
        ? createBiasedLanguageModel(*decoder, *dialog)
        : createDefaultLanguageModel(*decoder));
    ps_set_lm(decoder.get(), "lm", languageModel.get());
    ps_set_search(decoder.get(), "lm");

    return decoder;
}

optional<Timeline<Phone>> getPhoneAlignment(
@@ -178,7 +168,7 @@ optional<Timeline<Phone>> getPhoneAlignment(
    // Process entire audio clip
    const int16* nextSample = audioBuffer.data();
    size_t remainingSamples = audioBuffer.size();
    bool fullUtterance = true;
    const bool fullUtterance = true;
    while (acmod_process_raw(acousticModel, &nextSample, &remainingSamples, fullUtterance) > 0) {
        while (acousticModel->n_feat_frame > 0) {
            ps_search_step(search.get(), acousticModel->output_frame);
@@ -197,7 +187,7 @@ optional<Timeline<Phone>> getPhoneAlignment(
    for (ps_alignment_iter_t* it = ps_alignment_phones(alignment.get()); it; it = ps_alignment_iter_next(it)) {
        // Get phone
        ps_alignment_entry_t* phoneEntry = ps_alignment_iter_get(it);
        s3cipid_t phoneId = phoneEntry->id.pid.cipid;
        const s3cipid_t phoneId = phoneEntry->id.pid.cipid;
        string phoneName = phoneNames[phoneId];

        if (phoneName == "SIL") continue;
@@ -207,162 +197,42 @@ optional<Timeline<Phone>> getPhoneAlignment(
        centiseconds duration(phoneEntry->duration);
        Phone phone = PhoneConverter::get().parse(phoneName);
        if (phone == Phone::AH && duration < 6_cs) {
            // Heuristic: < 6_cs is schwa. Pocketsphinx doesn't differentiate.
            // Heuristic: < 6_cs is schwa. PocketSphinx doesn't differentiate.
            phone = Phone::Schwa;
        }
        Timed<Phone> timedPhone(start, start + duration, phone);
        const Timed<Phone> timedPhone(start, start + duration, phone);
        result.set(timedPhone);
    }
    return result;
}

bool dictionaryContains(dict_t& dictionary, const string& word) {
    return dict_wordid(&dictionary, word.c_str()) != BAD_S3WID;
}

void addMissingDictionaryWords(const vector<string>& words, ps_decoder_t& decoder) {
    map<string, string> missingPronunciations;
    for (const string& word : words) {
        if (!dictionaryContains(*decoder.dict, word)) {
            string pronunciation;
            for (Phone phone : wordToPhones(word)) {
                if (pronunciation.length() > 0) pronunciation += " ";
                pronunciation += PhoneConverter::get().toString(phone);
            }
            missingPronunciations[word] = pronunciation;
        }
    }
    for (auto it = missingPronunciations.begin(); it != missingPronunciations.end(); ++it) {
        bool isLast = it == --missingPronunciations.end();
        logging::infoFormat("Unknown word '{}'. Guessing pronunciation '{}'.", it->first, it->second);
        ps_add_word(&decoder, it->first.c_str(), it->second.c_str(), isLast);
    }
}

lambda_unique_ptr<ngram_model_t> createDefaultLanguageModel(ps_decoder_t& decoder) {
    path modelPath = getSphinxModelDirectory() / "en-us.lm.bin";
    lambda_unique_ptr<ngram_model_t> result(
        ngram_model_read(decoder.config, modelPath.string().c_str(), NGRAM_AUTO, decoder.lmath),
        [](ngram_model_t* lm) { ngram_model_free(lm); });
    if (!result) {
        throw runtime_error(fmt::format("Error reading language model from {}.", modelPath));
    }

    return std::move(result);
}

lambda_unique_ptr<ngram_model_t> createDialogLanguageModel(ps_decoder_t& decoder, const string& dialog) {
    // Split dialog into normalized words
    vector<string> words = tokenizeText(dialog, [&](const string& word) { return dictionaryContains(*decoder.dict, word); });

    // Add dialog-specific words to the dictionary
    addMissingDictionaryWords(words, decoder);

    // Create dialog-specific language model
    words.insert(words.begin(), "<s>");
    words.push_back("</s>");
    return createLanguageModel(words, decoder);
}

lambda_unique_ptr<ngram_model_t> createBiasedLanguageModel(ps_decoder_t& decoder, const string& dialog) {
    auto defaultLanguageModel = createDefaultLanguageModel(decoder);
    auto dialogLanguageModel = createDialogLanguageModel(decoder, dialog);
    constexpr int modelCount = 2;
    array<ngram_model_t*, modelCount> languageModels{ defaultLanguageModel.get(), dialogLanguageModel.get() };
    array<char*, modelCount> modelNames{ "defaultLM", "dialogLM" };
    array<float, modelCount> modelWeights{ 0.1f, 0.9f };
    lambda_unique_ptr<ngram_model_t> result(
        ngram_model_set_init(nullptr, languageModels.data(), modelNames.data(), modelWeights.data(), modelCount),
        [](ngram_model_t* lm) { ngram_model_free(lm); });
    if (!result) {
        throw runtime_error("Error creating biased language model.");
    }

    return std::move(result);
}

lambda_unique_ptr<ps_decoder_t> createDecoder(optional<string> dialog) {
    lambda_unique_ptr<cmd_ln_t> config(
        cmd_ln_init(
            nullptr, ps_args(), true,
            // Set acoustic model
            "-hmm", (getSphinxModelDirectory() / "acoustic-model").string().c_str(),
            // Set pronunciation dictionary
            "-dict", (getSphinxModelDirectory() / "cmudict-en-us.dict").string().c_str(),
            // Add noise against zero silence (see http://cmusphinx.sourceforge.net/wiki/faq#qwhy_my_accuracy_is_poor)
            "-dither", "yes",
            // Disable VAD -- we're doing that ourselves
            "-remove_silence", "no",
            // Perform per-utterance cepstral mean normalization
            "-cmn", "batch",
            nullptr),
        [](cmd_ln_t* config) { cmd_ln_free_r(config); });
    if (!config) throw runtime_error("Error creating configuration.");

    lambda_unique_ptr<ps_decoder_t> decoder(
        ps_init(config.get()),
        [](ps_decoder_t* recognizer) { ps_free(recognizer); });
    if (!decoder) throw runtime_error("Error creating speech decoder.");

    // Set language model
    lambda_unique_ptr<ngram_model_t> languageModel(dialog
        ? createBiasedLanguageModel(*decoder, *dialog)
        : createDefaultLanguageModel(*decoder));
    ps_set_lm(decoder.get(), "lm", languageModel.get());
    ps_set_search(decoder.get(), "lm");

    return decoder;
}

JoiningTimeline<void> getNoiseSounds(TimeRange utteranceTimeRange, const Timeline<Phone>& phones) {
    JoiningTimeline<void> noiseSounds;

    // Find utterance parts without recogniced phones
    noiseSounds.set(utteranceTimeRange);
    for (const auto& timedPhone : phones) {
        noiseSounds.clear(timedPhone.getTimeRange());
    }

    // Remove undesired elements
    const centiseconds minSoundDuration = 12_cs;
    for (const auto& unknownSound : JoiningTimeline<void>(noiseSounds)) {
        bool startsAtZero = unknownSound.getStart() == 0_cs;
        bool tooShort = unknownSound.getDuration() < minSoundDuration;
        if (startsAtZero || tooShort) {
            noiseSounds.clear(unknownSound.getTimeRange());
        }
    }

    return noiseSounds;
}

// Some words have multiple pronunciations, one of which results in better animation than the others.
// This function returns the optimal pronunciation for a select set of these words.
string fixPronunciation(const string& word) {
    const static map<string, string> replacements {
        {"into(2)", "into"},
        {"to(2)", "to"},
        {"to(3)", "to"},
        {"today(2)", "today"},
        {"tomorrow(2)", "tomorrow"},
        {"tonight(2)", "tonight"}
    const static map<string, string> replacements{
        { "into(2)", "into" },
        { "to(2)", "to" },
        { "to(3)", "to" },
        { "today(2)", "today" },
        { "tomorrow(2)", "tomorrow" },
        { "tonight(2)", "tonight" }
    };

    const auto pair = replacements.find(word);
    return pair != replacements.end() ? pair->second : word;
}

Timeline<Phone> utteranceToPhones(
static Timeline<Phone> utteranceToPhones(
    const AudioClip& audioClip,
    TimeRange utteranceTimeRange,
    ps_decoder_t& decoder,
    ProgressSink& utteranceProgressSink)
{
    ProgressSink& utteranceProgressSink
) {
    ProgressMerger utteranceProgressMerger(utteranceProgressSink);
    ProgressSink& wordRecognitionProgressSink = utteranceProgressMerger.addSink(1.0);
    ProgressSink& alignmentProgressSink = utteranceProgressMerger.addSink(0.5);

    // Pad time range to give Pocketsphinx some breathing room
    // Pad time range to give PocketSphinx some breathing room
    TimeRange paddedTimeRange = utteranceTimeRange;
    const centiseconds padding(3);
    paddedTimeRange.grow(padding);
@@ -384,7 +254,7 @@ Timeline<Phone> utteranceToPhones(
            continue;
        }
        word = regex_replace(word, regex("\\(\\d\\)"), "");
        if (text.size() > 0) {
        if (!text.empty()) {
            text += " ";
        }
        text += word;
@@ -403,7 +273,7 @@ Timeline<Phone> utteranceToPhones(
        const string fixedWord = fixPronunciation(timedWord.getValue());
        wordIds.push_back(getWordId(fixedWord, *decoder.dict));
    }
    if (wordIds.empty()) return {};
    if (wordIds.empty()) return{};

    // Align the words' phones with speech
#if BOOST_VERSION < 105600 // Support legacy syntax
@@ -433,77 +303,11 @@ Timeline<Phone> utteranceToPhones(
    return utterancePhones;
}

BoundedTimeline<Phone> recognizePhones(
BoundedTimeline<Phone> PocketSphinxRecognizer::recognizePhones(
    const AudioClip& inputAudioClip,
    optional<string> dialog,
    optional<std::string> dialog,
    int maxThreadCount,
    ProgressSink& progressSink)
{
    ProgressMerger totalProgressMerger(progressSink);
    ProgressSink& voiceActivationProgressSink = totalProgressMerger.addSink(1.0);
    ProgressSink& dialogProgressSink = totalProgressMerger.addSink(15);

    // Make sure audio stream has no DC offset
    const unique_ptr<AudioClip> audioClip = inputAudioClip.clone() | removeDcOffset();

    // Split audio into utterances
    JoiningBoundedTimeline<void> utterances;
    try {
        utterances = detectVoiceActivity(*audioClip, maxThreadCount, voiceActivationProgressSink);
    }
    catch (...) {
        std::throw_with_nested(runtime_error("Error detecting segments of speech."));
    }

    // Discard Pocketsphinx output
    err_set_logfp(nullptr);

    // Redirect Pocketsphinx output to log
    err_set_callback(sphinxLogCallback, nullptr);

    // Prepare pool of decoders
    ObjectPool<ps_decoder_t, lambda_unique_ptr<ps_decoder_t>> decoderPool(
        [&dialog] { return createDecoder(dialog); });

    BoundedTimeline<Phone> phones(audioClip->getTruncatedRange());
    std::mutex resultMutex;
    auto processUtterance = [&](Timed<void> timedUtterance, ProgressSink& utteranceProgressSink) {
        // Detect phones for utterance
        auto decoder = decoderPool.acquire();
        Timeline<Phone> utterancePhones =
            utteranceToPhones(*audioClip, timedUtterance.getTimeRange(), *decoder, utteranceProgressSink);

        // Copy phones to result timeline
        std::lock_guard<std::mutex> lock(resultMutex);
        for (const auto& timedPhone : utterancePhones) {
            phones.set(timedPhone);
        }
    };

    auto getUtteranceProgressWeight = [](const Timed<void> timedUtterance) {
        return timedUtterance.getDuration().count();
    };

    // Perform speech recognition
    try {
        // Determine how many parallel threads to use
        int threadCount = std::min({
            maxThreadCount,
            // Don't use more threads than there are utterances to be processed
            static_cast<int>(utterances.size()),
            // Don't waste time creating additional threads (and decoders!) if the recording is short
            static_cast<int>(duration_cast<std::chrono::seconds>(audioClip->getTruncatedRange().getDuration()).count() / 5)
        });
        if (threadCount < 1) {
            threadCount = 1;
        }
        logging::debugFormat("Speech recognition using {} threads -- start", threadCount);
        runParallel(processUtterance, utterances, threadCount, dialogProgressSink, getUtteranceProgressWeight);
        logging::debug("Speech recognition -- end");
    }
    catch (...) {
        std::throw_with_nested(runtime_error("Error performing speech recognition via Pocketsphinx."));
    }

    return phones;
    ProgressSink& progressSink
) const {
    return ::recognizePhones(inputAudioClip, dialog, &createDecoder, &utteranceToPhones, maxThreadCount, progressSink);
}
src/recognition/PocketSphinxRecognizer.h (new file)
@@ -0,0 +1,14 @@
#pragma once

#include "Recognizer.h"
#include "pocketSphinxTools.h"

class PocketSphinxRecognizer : public Recognizer {
public:
    BoundedTimeline<Phone> recognizePhones(
        const AudioClip& inputAudioClip,
        boost::optional<std::string> dialog,
        int maxThreadCount,
        ProgressSink& progressSink
    ) const override;
};
src/recognition/Recognizer.h (new file)
@@ -0,0 +1,18 @@
#pragma once

#include "audio/AudioClip.h"
#include "core/Phone.h"
#include "tools/ProgressBar.h"
#include "time/BoundedTimeline.h"

class Recognizer {
public:
    virtual ~Recognizer() = default;

    virtual BoundedTimeline<Phone> recognizePhones(
        const AudioClip& audioClip,
        boost::optional<std::string> dialog,
        int maxThreadCount,
        ProgressSink& progressSink
    ) const = 0;
};
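A minimal sketch (not part of this commit) of what this abstraction buys: call sites depend only on the virtual interface, so the two recognizers below can be swapped at runtime. The flag and the recognition inputs are assumed to come from the surrounding application code.

    #include "recognition/Recognizer.h"
    #include "recognition/PhoneticRecognizer.h"
    #include "recognition/PocketSphinxRecognizer.h"
    #include <memory>

    // usePhonetic, audioClip, dialog, maxThreadCount, and progressSink are
    // placeholder inputs for illustration.
    BoundedTimeline<Phone> recognizeWithEither(
        bool usePhonetic, const AudioClip& audioClip,
        boost::optional<std::string> dialog, int maxThreadCount, ProgressSink& progressSink)
    {
        const std::unique_ptr<Recognizer> recognizer = usePhonetic
            ? std::unique_ptr<Recognizer>(std::make_unique<PhoneticRecognizer>())
            : std::unique_ptr<Recognizer>(std::make_unique<PocketSphinxRecognizer>());
        // Dispatches to whichever concrete recognizer was constructed
        return recognizer->recognizePhones(audioClip, dialog, maxThreadCount, progressSink);
    }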
src/recognition/phoneRecognition.h (deleted)
@@ -1,12 +0,0 @@
#pragma once

#include "audio/AudioClip.h"
#include "core/Phone.h"
#include "tools/ProgressBar.h"
#include "time/BoundedTimeline.h"

BoundedTimeline<Phone> recognizePhones(
    const AudioClip& audioClip,
    boost::optional<std::string> dialog,
    int maxThreadCount,
    ProgressSink& progressSink);
src/recognition/pocketSphinxTools.cpp (new file)
@@ -0,0 +1,218 @@
#include "pocketSphinxTools.h"

#include "tools/platformTools.h"
#include <regex>
#include "audio/DcOffset.h"
#include "audio/voiceActivityDetection.h"
#include "tools/parallel.h"
#include "tools/ObjectPool.h"
#include "time/timedLogging.h"

extern "C" {
#include <sphinxbase/err.h>
#include <pocketsphinx_internal.h>
#include <ngram_search.h>
}

using std::runtime_error;
using std::invalid_argument;
using std::unique_ptr;
using std::string;
using std::vector;
using boost::filesystem::path;
using std::regex;
using boost::optional;
using std::chrono::duration_cast;

logging::Level convertSphinxErrorLevel(err_lvl_t errorLevel) {
    switch (errorLevel) {
    case ERR_DEBUG:
    case ERR_INFO:
    case ERR_INFOCONT:
        return logging::Level::Trace;
    case ERR_WARN:
        return logging::Level::Warn;
    case ERR_ERROR:
        return logging::Level::Error;
    case ERR_FATAL:
        return logging::Level::Fatal;
    default:
        throw invalid_argument("Unknown log level.");
    }
}

void sphinxLogCallback(void* user_data, err_lvl_t errorLevel, const char* format, ...) {
    UNUSED(user_data);

    // Create varArgs list
    va_list args;
    va_start(args, format);
    auto _ = gsl::finally([&args]() { va_end(args); });

    // Format message
    const int initialSize = 256;
    vector<char> chars(initialSize);
    bool success = false;
    while (!success) {
        const int charsWritten = vsnprintf(chars.data(), chars.size(), format, args);
        if (charsWritten < 0) throw runtime_error("Error formatting PocketSphinx log message.");

        success = charsWritten < static_cast<int>(chars.size());
        if (!success) chars.resize(chars.size() * 2);
    }
    const regex waste("^(DEBUG|INFO|INFOCONT|WARN|ERROR|FATAL): ");
    string message = std::regex_replace(chars.data(), waste, "", std::regex_constants::format_first_only);
    boost::algorithm::trim(message);

    const logging::Level logLevel = convertSphinxErrorLevel(errorLevel);
    logging::log(logLevel, message);
}

void redirectPocketSphinxOutput() {
    static bool redirected = false;
    if (redirected) return;

    // Discard PocketSphinx output
    err_set_logfp(nullptr);

    // Redirect PocketSphinx output to log
    err_set_callback(sphinxLogCallback, nullptr);

    redirected = true;
}

BoundedTimeline<Phone> recognizePhones(
    const AudioClip& inputAudioClip,
    optional<std::string> dialog,
    decoderFactory createDecoder,
    utteranceToPhonesFunction utteranceToPhones,
    int maxThreadCount,
    ProgressSink& progressSink
) {
    ProgressMerger totalProgressMerger(progressSink);
    ProgressSink& voiceActivationProgressSink = totalProgressMerger.addSink(1.0);
    ProgressSink& dialogProgressSink = totalProgressMerger.addSink(15);

    // Make sure audio stream has no DC offset
    const unique_ptr<AudioClip> audioClip = inputAudioClip.clone() | removeDcOffset();

    // Split audio into utterances
    JoiningBoundedTimeline<void> utterances;
    try {
        utterances = detectVoiceActivity(*audioClip, maxThreadCount, voiceActivationProgressSink);
    } catch (...) {
        std::throw_with_nested(runtime_error("Error detecting segments of speech."));
    }

    redirectPocketSphinxOutput();

    // Prepare pool of decoders
    ObjectPool<ps_decoder_t, lambda_unique_ptr<ps_decoder_t>> decoderPool(
        [&] { return createDecoder(dialog); });

    BoundedTimeline<Phone> phones(audioClip->getTruncatedRange());
    std::mutex resultMutex;
    const auto processUtterance = [&](Timed<void> timedUtterance, ProgressSink& utteranceProgressSink) {
        // Detect phones for utterance
        const auto decoder = decoderPool.acquire();
        Timeline<Phone> utterancePhones =
            utteranceToPhones(*audioClip, timedUtterance.getTimeRange(), *decoder, utteranceProgressSink);

        // Copy phones to result timeline
        std::lock_guard<std::mutex> lock(resultMutex);
        for (const auto& timedPhone : utterancePhones) {
            phones.set(timedPhone);
        }
    };

    const auto getUtteranceProgressWeight = [](const Timed<void> timedUtterance) {
        return timedUtterance.getDuration().count();
    };

    // Perform speech recognition
    try {
        // Determine how many parallel threads to use
        int threadCount = std::min({
            maxThreadCount,
            // Don't use more threads than there are utterances to be processed
            static_cast<int>(utterances.size()),
            // Don't waste time creating additional threads (and decoders!) if the recording is short
            static_cast<int>(duration_cast<std::chrono::seconds>(audioClip->getTruncatedRange().getDuration()).count() / 5)
        });
        if (threadCount < 1) {
            threadCount = 1;
        }
        logging::debugFormat("Speech recognition using {} threads -- start", threadCount);
        runParallel(processUtterance, utterances, threadCount, dialogProgressSink, getUtteranceProgressWeight);
        logging::debug("Speech recognition -- end");
    } catch (...) {
        std::throw_with_nested(runtime_error("Error performing speech recognition via PocketSphinx."));
    }

    return phones;
}

const path& getSphinxModelDirectory() {
    static path sphinxModelDirectory(getBinDirectory() / "res" / "sphinx");
    return sphinxModelDirectory;
}

JoiningTimeline<void> getNoiseSounds(TimeRange utteranceTimeRange, const Timeline<Phone>& phones) {
    JoiningTimeline<void> noiseSounds;

    // Find utterance parts without recognized phones
    noiseSounds.set(utteranceTimeRange);
    for (const auto& timedPhone : phones) {
        noiseSounds.clear(timedPhone.getTimeRange());
    }

    // Remove undesired elements
    const centiseconds minSoundDuration = 12_cs;
    for (const auto& unknownSound : JoiningTimeline<void>(noiseSounds)) {
        const bool startsAtZero = unknownSound.getStart() == 0_cs;
        const bool tooShort = unknownSound.getDuration() < minSoundDuration;
        if (startsAtZero || tooShort) {
            noiseSounds.clear(unknownSound.getTimeRange());
        }
    }

    return noiseSounds;
}

BoundedTimeline<string> recognizeWords(const vector<int16_t>& audioBuffer, ps_decoder_t& decoder) {
    // Restart timing at 0
    ps_start_stream(&decoder);

    // Start recognition
    int error = ps_start_utt(&decoder);
    if (error) throw runtime_error("Error starting utterance processing for word recognition.");

    // Process entire audio clip
    const bool noRecognition = false;
    const bool fullUtterance = true;
    const int searchedFrameCount =
        ps_process_raw(&decoder, audioBuffer.data(), audioBuffer.size(), noRecognition, fullUtterance);
    if (searchedFrameCount < 0) {
        throw runtime_error("Error analyzing raw audio data for word recognition.");
    }

    // End recognition
    error = ps_end_utt(&decoder);
    if (error) throw runtime_error("Error ending utterance processing for word recognition.");

    BoundedTimeline<string> result(TimeRange(0_cs, centiseconds(100 * audioBuffer.size() / sphinxSampleRate)));
    const bool noWordsRecognized = reinterpret_cast<ngram_search_t*>(decoder.search)->bpidx == 0;
    if (noWordsRecognized) {
        return result;
    }

    // Collect words
    for (ps_seg_t* it = ps_seg_iter(&decoder); it; it = ps_seg_next(it)) {
        const char* word = ps_seg_word(it);
        int firstFrame, lastFrame;
        ps_seg_frames(it, &firstFrame, &lastFrame);
        result.set(centiseconds(firstFrame), centiseconds(lastFrame + 1), word);
    }

    return result;
}
src/recognition/pocketSphinxTools.h (new file)
@@ -0,0 +1,39 @@
#pragma once

#include "time/BoundedTimeline.h"
#include "core/Phone.h"
#include "audio/AudioClip.h"
#include "tools/ProgressBar.h"
#include <boost/filesystem/path.hpp>

extern "C" {
#include <pocketsphinx.h>
}

typedef std::function<lambda_unique_ptr<ps_decoder_t>(
    boost::optional<std::string> dialog
)> decoderFactory;

typedef std::function<Timeline<Phone>(
    const AudioClip& audioClip,
    TimeRange utteranceTimeRange,
    ps_decoder_t& decoder,
    ProgressSink& utteranceProgressSink
)> utteranceToPhonesFunction;

BoundedTimeline<Phone> recognizePhones(
    const AudioClip& inputAudioClip,
    boost::optional<std::string> dialog,
    decoderFactory createDecoder,
    utteranceToPhonesFunction utteranceToPhones,
    int maxThreadCount,
    ProgressSink& progressSink
);

constexpr int sphinxSampleRate = 16000;

const boost::filesystem::path& getSphinxModelDirectory();

JoiningTimeline<void> getNoiseSounds(TimeRange utteranceTimeRange, const Timeline<Phone>& phones);

BoundedTimeline<std::string> recognizeWords(const std::vector<int16_t>& audioBuffer, ps_decoder_t& decoder);
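The two typedefs above are the extension points of the shared recognizePhones: it handles DC-offset removal, voice-activity detection, decoder pooling, and threading, while each recognizer supplies only a decoder factory and a per-utterance phone function. A hedged sketch of how a hypothetical third recognizer would plug into the same template; createMyDecoder and myUtteranceToPhones are invented names matching the typedef signatures:

    #include "Recognizer.h"
    #include "pocketSphinxTools.h"

    // Hypothetical extension points; signatures match decoderFactory and
    // utteranceToPhonesFunction above.
    lambda_unique_ptr<ps_decoder_t> createMyDecoder(boost::optional<std::string> dialog);
    Timeline<Phone> myUtteranceToPhones(
        const AudioClip& audioClip, TimeRange utteranceTimeRange,
        ps_decoder_t& decoder, ProgressSink& utteranceProgressSink);

    class MyRecognizer : public Recognizer {
    public:
        BoundedTimeline<Phone> recognizePhones(
            const AudioClip& inputAudioClip,
            boost::optional<std::string> dialog,
            int maxThreadCount,
            ProgressSink& progressSink
        ) const override {
            // Reuse the shared pipeline; only the two callbacks differ.
            return ::recognizePhones(
                inputAudioClip, dialog,
                &createMyDecoder, &myUtteranceToPhones,
                maxThreadCount, progressSink);
        }
    };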
src/rhubarb/RecognizerType.cpp (new file)
@@ -0,0 +1,27 @@
#include "RecognizerType.h"

using std::string;

RecognizerTypeConverter& RecognizerTypeConverter::get() {
    static RecognizerTypeConverter converter;
    return converter;
}

string RecognizerTypeConverter::getTypeName() {
    return "RecognizerType";
}

EnumConverter<RecognizerType>::member_data RecognizerTypeConverter::getMemberData() {
    return member_data{
        { RecognizerType::PocketSphinx, "pocketSphinx" },
        { RecognizerType::Phonetic, "phonetic" }
    };
}

std::ostream& operator<<(std::ostream& stream, RecognizerType value) {
    return RecognizerTypeConverter::get().write(stream, value);
}

std::istream& operator>>(std::istream& stream, RecognizerType& value) {
    return RecognizerTypeConverter::get().read(stream, value);
}
src/rhubarb/RecognizerType.h (new file)
@@ -0,0 +1,20 @@
#pragma once

#include "tools/EnumConverter.h"

enum class RecognizerType {
    PocketSphinx,
    Phonetic
};

class RecognizerTypeConverter : public EnumConverter<RecognizerType> {
public:
    static RecognizerTypeConverter& get();
protected:
    std::string getTypeName() override;
    member_data getMemberData() override;
};

std::ostream& operator<<(std::ostream& stream, RecognizerType value);

std::istream& operator>>(std::istream& stream, RecognizerType& value);
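A small sketch of what this EnumConverter machinery provides: the stream operators are what later let TCLAP treat RecognizerType as a ValueLike argument (see the main.cpp changes below). The string values follow getMemberData in RecognizerType.cpp above; the demo function itself is invented for illustration.

    #include "RecognizerType.h"
    #include <sstream>
    #include <iostream>

    void demoRecognizerTypeStreaming() {
        std::istringstream input("phonetic");
        RecognizerType type;
        input >> type;               // parses to RecognizerType::Phonetic
        std::cout << type << "\n";   // prints "phonetic"
    }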
src/rhubarb/main.cpp
@@ -27,6 +27,9 @@
#include "tools/platformTools.h"
#include "sinks.h"
#include "semanticEntries.h"
#include "RecognizerType.h"
#include "recognition/PocketSphinxRecognizer.h"
#include "recognition/PhoneticRecognizer.h"

using std::exception;
using std::string;
@@ -36,9 +39,6 @@ using std::unique_ptr;
using std::make_unique;
using std::shared_ptr;
using std::make_shared;
using std::map;
using std::chrono::duration;
using std::chrono::duration_cast;
using std::ofstream;
using boost::filesystem::path;
using boost::adaptors::transformed;
@@ -56,6 +56,10 @@ namespace TCLAP {
    struct ArgTraits<ExportFormat> {
        typedef ValueLike ValueCategory;
    };
    template<>
    struct ArgTraits<RecognizerType> {
        typedef ValueLike ValueCategory;
    };
}

shared_ptr<logging::Sink> createFileSink(path path, logging::Level minLevel) {
@@ -66,6 +70,17 @@ shared_ptr<logging::Sink> createFileSink(path path, logging::Level minLevel) {
    return make_shared<logging::LevelFilter>(FileSink, minLevel);
}

unique_ptr<Recognizer> createRecognizer(RecognizerType recognizerType) {
    switch (recognizerType) {
    case RecognizerType::PocketSphinx:
        return make_unique<PocketSphinxRecognizer>();
    case RecognizerType::Phonetic:
        return make_unique<PhoneticRecognizer>();
    default:
        throw std::runtime_error("Unknown recognizer.");
    }
}

unique_ptr<Exporter> createExporter(ExportFormat exportFormat) {
    switch (exportFormat) {
    case ExportFormat::Tsv:
@@ -123,6 +138,9 @@ int main(int platformArgc, char *platformArgv[]) {
    auto exportFormats = vector<ExportFormat>(ExportFormatConverter::get().getValues());
    tclap::ValuesConstraint<ExportFormat> exportFormatConstraint(exportFormats);
    tclap::ValueArg<ExportFormat> exportFormat("f", "exportFormat", "The export format.", false, ExportFormat::Tsv, &exportFormatConstraint, cmd);
    auto recognizerTypes = vector<RecognizerType>(RecognizerTypeConverter::get().getValues());
    tclap::ValuesConstraint<RecognizerType> recognizerConstraint(recognizerTypes);
    tclap::ValueArg<RecognizerType> recognizerType("r", "recognizer", "The dialog recognizer.", false, RecognizerType::PocketSphinx, &recognizerConstraint, cmd);
    tclap::UnlabeledValueArg<string> inputFileName("inputFile", "The input file. Must be a sound file in WAVE format.", true, "", "string", cmd);

    try {
@@ -169,6 +187,7 @@ int main(int platformArgc, char *platformArgv[]) {
    JoiningContinuousTimeline<Shape> animation = animateWaveFile(
        inputFilePath,
        dialogFile.isSet() ? readUtf8File(path(dialogFile.getValue())) : boost::optional<string>(),
        *createRecognizer(recognizerType.getValue()),
        targetShapeSet,
        maxThreadCount.getValue(),
        progressSink);
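Taken together, the main.cpp changes add a -r/--recognizer command-line option with pocketSphinx as the default, so a hypothetical invocation after this commit looks like: rhubarb -r phonetic -f tsv input.wav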