From 3ed38ada2f5f1632524a7d6d95ecfe711f9b51ab Mon Sep 17 00:00:00 2001 From: Daniel Wolf Date: Mon, 8 Oct 2018 20:24:25 +0200 Subject: [PATCH 1/6] Fix path separator --- rhubarb/src/recognition/phoneRecognition.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rhubarb/src/recognition/phoneRecognition.cpp b/rhubarb/src/recognition/phoneRecognition.cpp index 5139596..e45c765 100644 --- a/rhubarb/src/recognition/phoneRecognition.cpp +++ b/rhubarb/src/recognition/phoneRecognition.cpp @@ -51,7 +51,7 @@ using std::array; constexpr int sphinxSampleRate = 16000; const path& getSphinxModelDirectory() { - static path sphinxModelDirectory(getBinDirectory() / "res/sphinx"); + static path sphinxModelDirectory(getBinDirectory() / "res" / "sphinx"); return sphinxModelDirectory; } From 610f4900469f40888feae428df27e8efca82d2d8 Mon Sep 17 00:00:00 2001 From: Daniel Wolf Date: Mon, 8 Oct 2018 20:30:45 +0200 Subject: [PATCH 2/6] Implement generic concept of recognizers with options pocketSphinx and phonetic --- rhubarb/CMakeLists.txt | 11 +- rhubarb/resharper.DotSettings | 23 + rhubarb/src/lib/rhubarbLib.cpp | 13 +- rhubarb/src/lib/rhubarbLib.h | 7 +- .../src/recognition/PhoneticRecognizer.cpp | 103 +++++ rhubarb/src/recognition/PhoneticRecognizer.h | 14 + ...gnition.cpp => PocketSphinxRecognizer.cpp} | 428 +++++------------- .../src/recognition/PocketSphinxRecognizer.h | 14 + rhubarb/src/recognition/Recognizer.h | 18 + rhubarb/src/recognition/phoneRecognition.h | 12 - rhubarb/src/recognition/pocketSphinxTools.cpp | 218 +++++++++ rhubarb/src/recognition/pocketSphinxTools.h | 39 ++ rhubarb/src/rhubarb/RecognizerType.cpp | 27 ++ rhubarb/src/rhubarb/RecognizerType.h | 20 + rhubarb/src/rhubarb/main.cpp | 25 +- 15 files changed, 635 insertions(+), 337 deletions(-) create mode 100644 rhubarb/src/recognition/PhoneticRecognizer.cpp create mode 100644 rhubarb/src/recognition/PhoneticRecognizer.h rename rhubarb/src/recognition/{phoneRecognition.cpp => PocketSphinxRecognizer.cpp} (55%) create mode 100644 rhubarb/src/recognition/PocketSphinxRecognizer.h create mode 100644 rhubarb/src/recognition/Recognizer.h delete mode 100644 rhubarb/src/recognition/phoneRecognition.h create mode 100644 rhubarb/src/recognition/pocketSphinxTools.cpp create mode 100644 rhubarb/src/recognition/pocketSphinxTools.h create mode 100644 rhubarb/src/rhubarb/RecognizerType.cpp create mode 100644 rhubarb/src/rhubarb/RecognizerType.h diff --git a/rhubarb/CMakeLists.txt b/rhubarb/CMakeLists.txt index 9353edf..0fef2b7 100644 --- a/rhubarb/CMakeLists.txt +++ b/rhubarb/CMakeLists.txt @@ -413,8 +413,13 @@ add_library(rhubarb-recognition src/recognition/g2p.h src/recognition/languageModels.cpp src/recognition/languageModels.h - src/recognition/phoneRecognition.cpp - src/recognition/phoneRecognition.h + src/recognition/PhoneticRecognizer.cpp + src/recognition/PhoneticRecognizer.h + src/recognition/PocketSphinxRecognizer.cpp + src/recognition/PocketSphinxRecognizer.h + src/recognition/pocketSphinxTools.cpp + src/recognition/pocketSphinxTools.h + src/recognition/Recognizer.h src/recognition/tokenization.cpp src/recognition/tokenization.h ) @@ -487,6 +492,8 @@ add_executable(rhubarb src/rhubarb/main.cpp src/rhubarb/ExportFormat.cpp src/rhubarb/ExportFormat.h + src/rhubarb/RecognizerType.cpp + src/rhubarb/RecognizerType.h src/rhubarb/semanticEntries.cpp src/rhubarb/semanticEntries.h src/rhubarb/sinks.cpp diff --git a/rhubarb/resharper.DotSettings b/rhubarb/resharper.DotSettings index b16b555..168efbe 100644 --- 
a/rhubarb/resharper.DotSettings +++ b/rhubarb/resharper.DotSettings @@ -1,7 +1,12 @@  + HINT + ERROR + DO_NOT_SHOW + USE_TABS_ONLY + USE_TABS_ONLY False False False @@ -29,6 +34,7 @@ CHOP_ALWAYS END_OF_LINE END_OF_LINE + USE_TABS_ONLY False END_OF_LINE END_OF_LINE @@ -44,6 +50,14 @@ False True False + USE_TABS_ONLY + USE_TABS_ONLY + USE_TABS_ONLY + USE_TABS_ONLY + USE_TABS_ONLY + USE_TABS_ONLY + USE_TABS_ONLY + USE_TABS_ONLY UseExplicitType UseVarWhenEvident <NamingElement Priority="10"><Descriptor Static="Indeterminate" Constexpr="Indeterminate" Const="Indeterminate" Volatile="Indeterminate" Accessibility="NOT_APPLICABLE"><type Name="class field" /><type Name="struct field" /></Descriptor><Policy Inspect="True" Prefix="" Suffix="_" Style="aaBb" /></NamingElement> @@ -108,7 +122,16 @@ C:\Users\Daniel\AppData\Local\JetBrains\Transient\ReSharperPlatformVs14\v09\SolutionCaches True True + True + True True True True + True + True + True + True + True + True + True \ No newline at end of file diff --git a/rhubarb/src/lib/rhubarbLib.cpp b/rhubarb/src/lib/rhubarbLib.cpp index ffadf68..5f8460f 100644 --- a/rhubarb/src/lib/rhubarbLib.cpp +++ b/rhubarb/src/lib/rhubarbLib.cpp @@ -1,6 +1,5 @@ #include "rhubarbLib.h" #include "core/Phone.h" -#include "recognition/phoneRecognition.h" #include "tools/textFiles.h" #include "animation/mouthAnimation.h" #include "audio/audioFileReading.h" @@ -8,27 +7,29 @@ using boost::optional; using std::string; using boost::filesystem::path; -using std::unique_ptr; JoiningContinuousTimeline animateAudioClip( const AudioClip& audioClip, - optional dialog, + const optional& dialog, + const Recognizer& recognizer, const ShapeSet& targetShapeSet, int maxThreadCount, ProgressSink& progressSink) { - BoundedTimeline phones = recognizePhones(audioClip, dialog, maxThreadCount, progressSink); + const BoundedTimeline phones = + recognizer.recognizePhones(audioClip, dialog, maxThreadCount, progressSink); JoiningContinuousTimeline result = animate(phones, targetShapeSet); return result; } JoiningContinuousTimeline animateWaveFile( path filePath, - optional dialog, + const optional& dialog, + const Recognizer& recognizer, const ShapeSet& targetShapeSet, int maxThreadCount, ProgressSink& progressSink) { const auto audioClip = createAudioFileClip(filePath); - return animateAudioClip(*audioClip, dialog, targetShapeSet, maxThreadCount, progressSink); + return animateAudioClip(*audioClip, dialog, recognizer, targetShapeSet, maxThreadCount, progressSink); } diff --git a/rhubarb/src/lib/rhubarbLib.h b/rhubarb/src/lib/rhubarbLib.h index 8663761..ca40a06 100644 --- a/rhubarb/src/lib/rhubarbLib.h +++ b/rhubarb/src/lib/rhubarbLib.h @@ -6,17 +6,20 @@ #include "tools/ProgressBar.h" #include #include "animation/targetShapeSet.h" +#include "recognition/Recognizer.h" JoiningContinuousTimeline animateAudioClip( const AudioClip& audioClip, - boost::optional dialog, + const boost::optional& dialog, + const Recognizer& recognizer, const ShapeSet& targetShapeSet, int maxThreadCount, ProgressSink& progressSink); JoiningContinuousTimeline animateWaveFile( boost::filesystem::path filePath, - boost::optional dialog, + const boost::optional& dialog, + const Recognizer& recognizer, const ShapeSet& targetShapeSet, int maxThreadCount, ProgressSink& progressSink); diff --git a/rhubarb/src/recognition/PhoneticRecognizer.cpp b/rhubarb/src/recognition/PhoneticRecognizer.cpp new file mode 100644 index 0000000..bd9c9ac --- /dev/null +++ b/rhubarb/src/recognition/PhoneticRecognizer.cpp @@ -0,0 +1,103 @@ +#include 
"PhoneticRecognizer.h" +#include "time/Timeline.h" +#include "audio/AudioSegment.h" +#include "audio/SampleRateConverter.h" +#include "audio/processing.h" +#include "time/timedLogging.h" + +using std::runtime_error; +using std::unique_ptr; +using std::string; +using boost::optional; + +static lambda_unique_ptr createDecoder(optional dialog) { + UNUSED(dialog); + + lambda_unique_ptr config( + cmd_ln_init( + nullptr, ps_args(), true, + // Set acoustic model + "-hmm", (getSphinxModelDirectory() / "acoustic-model").string().c_str(), + // Set phonetic language model + "-allphone", (getSphinxModelDirectory() / "en-us-phone.lm.bin").string().c_str(), + "-allphone_ci", "yes", + // Set language model probability weight. + // Low values (<= 0.4) can lead to fluttering animation. + // High values (>= 1.0) can lead to imprecise or freezing animation. + "-lw", "0.8", + + // The following settings are recommended at http://cmusphinx.sourceforge.net/wiki/phonemerecognition + + // Set beam width applied to every frame in Viterbi search + "-beam", "1e-20", + // Set beam width applied to phone transitions + "-pbeam", "1e-20", + nullptr), + [](cmd_ln_t* config) { cmd_ln_free_r(config); }); + if (!config) throw runtime_error("Error creating configuration."); + + lambda_unique_ptr decoder( + ps_init(config.get()), + [](ps_decoder_t* recognizer) { ps_free(recognizer); }); + if (!decoder) throw runtime_error("Error creating speech decoder."); + + return decoder; +} + +static Timeline utteranceToPhones( + const AudioClip& audioClip, + TimeRange utteranceTimeRange, + ps_decoder_t& decoder, + ProgressSink& utteranceProgressSink +) { + // Pad time range to give PocketSphinx some breathing room + TimeRange paddedTimeRange = utteranceTimeRange; + const centiseconds padding(3); + paddedTimeRange.grow(padding); + paddedTimeRange.trim(audioClip.getTruncatedRange()); + + const unique_ptr clipSegment = audioClip.clone() | segment(paddedTimeRange) | resample(sphinxSampleRate); + const auto audioBuffer = copyTo16bitBuffer(*clipSegment); + + // Detect phones (returned as words) + BoundedTimeline phoneStrings = recognizeWords(audioBuffer, decoder); + phoneStrings.shift(paddedTimeRange.getStart()); + Timeline utterancePhones; + for (const auto& timedPhoneString : phoneStrings) { + Phone phone = PhoneConverter::get().parse(timedPhoneString.getValue()); + if (phone == Phone::AH && timedPhoneString.getDuration() < 6_cs) { + // Heuristic: < 6_cs is schwa. PocketSphinx doesn't differentiate. 
+ phone = Phone::Schwa; + } + utterancePhones.set(timedPhoneString.getTimeRange(), phone); + } + + // Log raw phones + for (const auto& timedPhone : utterancePhones) { + logTimedEvent("rawPhone", timedPhone); + } + + // Guess positions of noise sounds + JoiningTimeline noiseSounds = getNoiseSounds(utteranceTimeRange, utterancePhones); + for (const auto& noiseSound : noiseSounds) { + utterancePhones.set(noiseSound.getTimeRange(), Phone::Noise); + } + + // Log phones + for (const auto& timedPhone : utterancePhones) { + logTimedEvent("phone", timedPhone); + } + + utteranceProgressSink.reportProgress(1.0); + + return utterancePhones; +} + +BoundedTimeline PhoneticRecognizer::recognizePhones( + const AudioClip& inputAudioClip, + optional dialog, + int maxThreadCount, + ProgressSink& progressSink +) const { + return ::recognizePhones(inputAudioClip, dialog, &createDecoder, &utteranceToPhones, maxThreadCount, progressSink); +} diff --git a/rhubarb/src/recognition/PhoneticRecognizer.h b/rhubarb/src/recognition/PhoneticRecognizer.h new file mode 100644 index 0000000..96797cf --- /dev/null +++ b/rhubarb/src/recognition/PhoneticRecognizer.h @@ -0,0 +1,14 @@ +#pragma once + +#include "Recognizer.h" +#include "pocketSphinxTools.h" + +class PhoneticRecognizer : public Recognizer { +public: + BoundedTimeline recognizePhones( + const AudioClip& inputAudioClip, + boost::optional dialog, + int maxThreadCount, + ProgressSink& progressSink + ) const override; +}; diff --git a/rhubarb/src/recognition/phoneRecognition.cpp b/rhubarb/src/recognition/PocketSphinxRecognizer.cpp similarity index 55% rename from rhubarb/src/recognition/phoneRecognition.cpp rename to rhubarb/src/recognition/PocketSphinxRecognizer.cpp index e45c765..b97c0b7 100644 --- a/rhubarb/src/recognition/phoneRecognition.cpp +++ b/rhubarb/src/recognition/PocketSphinxRecognizer.cpp @@ -1,143 +1,133 @@ -#include -#include "phoneRecognition.h" -#include "audio/SampleRateConverter.h" -#include "tools/platformTools.h" -#include "tools/tools.h" -#include -#include +#include "PocketSphinxRecognizer.h" #include #include -#include "logging/logging.h" -#include "audio/DcOffset.h" -#include "time/Timeline.h" -#include "audio/voiceActivityDetection.h" #include "audio/AudioSegment.h" +#include "audio/SampleRateConverter.h" #include "languageModels.h" #include "tokenization.h" #include "g2p.h" #include "time/ContinuousTimeline.h" #include "audio/processing.h" -#include "tools/parallel.h" -#include -#include "tools/ObjectPool.h" #include "time/timedLogging.h" extern "C" { -#include -#include -#include #include -#include -#include } using std::runtime_error; using std::invalid_argument; using std::unique_ptr; -using std::shared_ptr; using std::string; using std::vector; using std::map; using boost::filesystem::path; -using std::function; using std::regex; using std::regex_replace; -using std::chrono::duration; using boost::optional; -using std::string; -using std::chrono::duration_cast; using std::array; -constexpr int sphinxSampleRate = 16000; - -const path& getSphinxModelDirectory() { - static path sphinxModelDirectory(getBinDirectory() / "res" / "sphinx"); - return sphinxModelDirectory; +bool dictionaryContains(dict_t& dictionary, const string& word) { + return dict_wordid(&dictionary, word.c_str()) != BAD_S3WID; } -logging::Level ConvertSphinxErrorLevel(err_lvl_t errorLevel) { - switch (errorLevel) { - case ERR_DEBUG: - case ERR_INFO: - case ERR_INFOCONT: - return logging::Level::Trace; - case ERR_WARN: - return logging::Level::Warn; - case ERR_ERROR: - 
return logging::Level::Error; - case ERR_FATAL: - return logging::Level::Fatal; - default: - throw invalid_argument("Unknown log level."); +s3wid_t getWordId(const string& word, dict_t& dictionary) { + const s3wid_t wordId = dict_wordid(&dictionary, word.c_str()); + if (wordId == BAD_S3WID) throw invalid_argument(fmt::format("Unknown word '{}'.", word)); + return wordId; +} + +void addMissingDictionaryWords(const vector& words, ps_decoder_t& decoder) { + map missingPronunciations; + for (const string& word : words) { + if (!dictionaryContains(*decoder.dict, word)) { + string pronunciation; + for (Phone phone : wordToPhones(word)) { + if (pronunciation.length() > 0) pronunciation += " "; + pronunciation += PhoneConverter::get().toString(phone); + } + missingPronunciations[word] = pronunciation; + } + } + for (auto it = missingPronunciations.begin(); it != missingPronunciations.end(); ++it) { + const bool isLast = it == --missingPronunciations.end(); + logging::infoFormat("Unknown word '{}'. Guessing pronunciation '{}'.", it->first, it->second); + ps_add_word(&decoder, it->first.c_str(), it->second.c_str(), isLast); } } -void sphinxLogCallback(void* user_data, err_lvl_t errorLevel, const char* format, ...) { - UNUSED(user_data); - - // Create varArgs list - va_list args; - va_start(args, format); - auto _ = gsl::finally([&args]() { va_end(args); }); - - // Format message - const int initialSize = 256; - vector chars(initialSize); - bool success = false; - while (!success) { - int charsWritten = vsnprintf(chars.data(), chars.size(), format, args); - if (charsWritten < 0) throw runtime_error("Error formatting Pocketsphinx log message."); - - success = charsWritten < static_cast(chars.size()); - if (!success) chars.resize(chars.size() * 2); - } - regex waste("^(DEBUG|INFO|INFOCONT|WARN|ERROR|FATAL): "); - string message = regex_replace(chars.data(), waste, "", std::regex_constants::format_first_only); - boost::algorithm::trim(message); - - logging::Level logLevel = ConvertSphinxErrorLevel(errorLevel); - logging::log(logLevel, message); -} - -BoundedTimeline recognizeWords(const vector& audioBuffer, ps_decoder_t& decoder) { - // Restart timing at 0 - ps_start_stream(&decoder); - - // Start recognition - int error = ps_start_utt(&decoder); - if (error) throw runtime_error("Error starting utterance processing for word recognition."); - - // Process entire audio clip - const bool noRecognition = false; - const bool fullUtterance = true; - int searchedFrameCount = ps_process_raw(&decoder, audioBuffer.data(), audioBuffer.size(), noRecognition, fullUtterance); - if (searchedFrameCount < 0) throw runtime_error("Error analyzing raw audio data for word recognition."); - - // End recognition - error = ps_end_utt(&decoder); - if (error) throw runtime_error("Error ending utterance processing for word recognition."); - - BoundedTimeline result(TimeRange(0_cs, centiseconds(100 * audioBuffer.size() / sphinxSampleRate))); - bool noWordsRecognized = reinterpret_cast(decoder.search)->bpidx == 0; - if (noWordsRecognized) { - return result; - } - - // Collect words - for (ps_seg_t* it = ps_seg_iter(&decoder); it; it = ps_seg_next(it)) { - const char* word = ps_seg_word(it); - int firstFrame, lastFrame; - ps_seg_frames(it, &firstFrame, &lastFrame); - result.set(centiseconds(firstFrame), centiseconds(lastFrame + 1), word); +lambda_unique_ptr createDefaultLanguageModel(ps_decoder_t& decoder) { + path modelPath = getSphinxModelDirectory() / "en-us.lm.bin"; + lambda_unique_ptr result( + ngram_model_read(decoder.config, 
modelPath.string().c_str(), NGRAM_AUTO, decoder.lmath), + [](ngram_model_t* lm) { ngram_model_free(lm); }); + if (!result) { + throw runtime_error(fmt::format("Error reading language model from {}.", modelPath)); } return result; } -s3wid_t getWordId(const string& word, dict_t& dictionary) { - s3wid_t wordId = dict_wordid(&dictionary, word.c_str()); - if (wordId == BAD_S3WID) throw invalid_argument(fmt::format("Unknown word '{}'.", word)); - return wordId; +lambda_unique_ptr createDialogLanguageModel(ps_decoder_t& decoder, const string& dialog) { + // Split dialog into normalized words + vector words = tokenizeText(dialog, [&](const string& word) { return dictionaryContains(*decoder.dict, word); }); + + // Add dialog-specific words to the dictionary + addMissingDictionaryWords(words, decoder); + + // Create dialog-specific language model + words.insert(words.begin(), ""); + words.emplace_back(""); + return createLanguageModel(words, decoder); +} + +lambda_unique_ptr createBiasedLanguageModel(ps_decoder_t& decoder, const string& dialog) { + auto defaultLanguageModel = createDefaultLanguageModel(decoder); + auto dialogLanguageModel = createDialogLanguageModel(decoder, dialog); + constexpr int modelCount = 2; + array languageModels{ defaultLanguageModel.get(), dialogLanguageModel.get() }; + array modelNames{ "defaultLM", "dialogLM" }; + array modelWeights{ 0.1f, 0.9f }; + lambda_unique_ptr result( + ngram_model_set_init(nullptr, languageModels.data(), const_cast(modelNames.data()), modelWeights.data(), modelCount), + [](ngram_model_t* lm) { ngram_model_free(lm); }); + if (!result) { + throw runtime_error("Error creating biased language model."); + } + + return result; +} + +static lambda_unique_ptr createDecoder(optional dialog) { + lambda_unique_ptr config( + cmd_ln_init( + nullptr, ps_args(), true, + // Set acoustic model + "-hmm", (getSphinxModelDirectory() / "acoustic-model").string().c_str(), + // Set pronunciation dictionary + "-dict", (getSphinxModelDirectory() / "cmudict-en-us.dict").string().c_str(), + // Add noise against zero silence (see http://cmusphinx.sourceforge.net/wiki/faq#qwhy_my_accuracy_is_poor) + "-dither", "yes", + // Disable VAD -- we're doing that ourselves + "-remove_silence", "no", + // Perform per-utterance cepstral mean normalization + "-cmn", "batch", + nullptr), + [](cmd_ln_t* config) { cmd_ln_free_r(config); }); + if (!config) throw runtime_error("Error creating configuration."); + + lambda_unique_ptr decoder( + ps_init(config.get()), + [](ps_decoder_t* recognizer) { ps_free(recognizer); }); + if (!decoder) throw runtime_error("Error creating speech decoder."); + + // Set language model + lambda_unique_ptr languageModel(dialog + ? 
createBiasedLanguageModel(*decoder, *dialog) + : createDefaultLanguageModel(*decoder)); + ps_set_lm(decoder.get(), "lm", languageModel.get()); + ps_set_search(decoder.get(), "lm"); + + return decoder; } optional> getPhoneAlignment( @@ -178,7 +168,7 @@ optional> getPhoneAlignment( // Process entire audio clip const int16* nextSample = audioBuffer.data(); size_t remainingSamples = audioBuffer.size(); - bool fullUtterance = true; + const bool fullUtterance = true; while (acmod_process_raw(acousticModel, &nextSample, &remainingSamples, fullUtterance) > 0) { while (acousticModel->n_feat_frame > 0) { ps_search_step(search.get(), acousticModel->output_frame); @@ -197,7 +187,7 @@ optional> getPhoneAlignment( for (ps_alignment_iter_t* it = ps_alignment_phones(alignment.get()); it; it = ps_alignment_iter_next(it)) { // Get phone ps_alignment_entry_t* phoneEntry = ps_alignment_iter_get(it); - s3cipid_t phoneId = phoneEntry->id.pid.cipid; + const s3cipid_t phoneId = phoneEntry->id.pid.cipid; string phoneName = phoneNames[phoneId]; if (phoneName == "SIL") continue; @@ -207,162 +197,42 @@ optional> getPhoneAlignment( centiseconds duration(phoneEntry->duration); Phone phone = PhoneConverter::get().parse(phoneName); if (phone == Phone::AH && duration < 6_cs) { - // Heuristic: < 6_cs is schwa. Pocketsphinx doesn't differentiate. + // Heuristic: < 6_cs is schwa. PocketSphinx doesn't differentiate. phone = Phone::Schwa; } - Timed timedPhone(start, start + duration, phone); + const Timed timedPhone(start, start + duration, phone); result.set(timedPhone); } return result; } -bool dictionaryContains(dict_t& dictionary, const string& word) { - return dict_wordid(&dictionary, word.c_str()) != BAD_S3WID; -} - -void addMissingDictionaryWords(const vector& words, ps_decoder_t& decoder) { - map missingPronunciations; - for (const string& word : words) { - if (!dictionaryContains(*decoder.dict, word)) { - string pronunciation; - for (Phone phone : wordToPhones(word)) { - if (pronunciation.length() > 0) pronunciation += " "; - pronunciation += PhoneConverter::get().toString(phone); - } - missingPronunciations[word] = pronunciation; - } - } - for (auto it = missingPronunciations.begin(); it != missingPronunciations.end(); ++it) { - bool isLast = it == --missingPronunciations.end(); - logging::infoFormat("Unknown word '{}'. 
Guessing pronunciation '{}'.", it->first, it->second); - ps_add_word(&decoder, it->first.c_str(), it->second.c_str(), isLast); - } -} - -lambda_unique_ptr createDefaultLanguageModel(ps_decoder_t& decoder) { - path modelPath = getSphinxModelDirectory() / "en-us.lm.bin"; - lambda_unique_ptr result( - ngram_model_read(decoder.config, modelPath.string().c_str(), NGRAM_AUTO, decoder.lmath), - [](ngram_model_t* lm) { ngram_model_free(lm); }); - if (!result) { - throw runtime_error(fmt::format("Error reading language model from {}.", modelPath)); - } - - return std::move(result); -} - -lambda_unique_ptr createDialogLanguageModel(ps_decoder_t& decoder, const string& dialog) { - // Split dialog into normalized words - vector words = tokenizeText(dialog, [&](const string& word) { return dictionaryContains(*decoder.dict, word); }); - - // Add dialog-specific words to the dictionary - addMissingDictionaryWords(words, decoder); - - // Create dialog-specific language model - words.insert(words.begin(), ""); - words.push_back(""); - return createLanguageModel(words, decoder); -} - -lambda_unique_ptr createBiasedLanguageModel(ps_decoder_t& decoder, const string& dialog) { - auto defaultLanguageModel = createDefaultLanguageModel(decoder); - auto dialogLanguageModel = createDialogLanguageModel(decoder, dialog); - constexpr int modelCount = 2; - array languageModels{ defaultLanguageModel.get(), dialogLanguageModel.get() }; - array modelNames{ "defaultLM", "dialogLM" }; - array modelWeights{ 0.1f, 0.9f }; - lambda_unique_ptr result( - ngram_model_set_init(nullptr, languageModels.data(), modelNames.data(), modelWeights.data(), modelCount), - [](ngram_model_t* lm) { ngram_model_free(lm); }); - if (!result) { - throw runtime_error("Error creating biased language model."); - } - - return std::move(result); -} - -lambda_unique_ptr createDecoder(optional dialog) { - lambda_unique_ptr config( - cmd_ln_init( - nullptr, ps_args(), true, - // Set acoustic model - "-hmm", (getSphinxModelDirectory() / "acoustic-model").string().c_str(), - // Set pronunciation dictionary - "-dict", (getSphinxModelDirectory() / "cmudict-en-us.dict").string().c_str(), - // Add noise against zero silence (see http://cmusphinx.sourceforge.net/wiki/faq#qwhy_my_accuracy_is_poor) - "-dither", "yes", - // Disable VAD -- we're doing that ourselves - "-remove_silence", "no", - // Perform per-utterance cepstral mean normalization - "-cmn", "batch", - nullptr), - [](cmd_ln_t* config) { cmd_ln_free_r(config); }); - if (!config) throw runtime_error("Error creating configuration."); - - lambda_unique_ptr decoder( - ps_init(config.get()), - [](ps_decoder_t* recognizer) { ps_free(recognizer); }); - if (!decoder) throw runtime_error("Error creating speech decoder."); - - // Set language model - lambda_unique_ptr languageModel(dialog - ? 
createBiasedLanguageModel(*decoder, *dialog) - : createDefaultLanguageModel(*decoder)); - ps_set_lm(decoder.get(), "lm", languageModel.get()); - ps_set_search(decoder.get(), "lm"); - - return decoder; -} - -JoiningTimeline getNoiseSounds(TimeRange utteranceTimeRange, const Timeline& phones) { - JoiningTimeline noiseSounds; - - // Find utterance parts without recogniced phones - noiseSounds.set(utteranceTimeRange); - for (const auto& timedPhone : phones) { - noiseSounds.clear(timedPhone.getTimeRange()); - } - - // Remove undesired elements - const centiseconds minSoundDuration = 12_cs; - for (const auto& unknownSound : JoiningTimeline(noiseSounds)) { - bool startsAtZero = unknownSound.getStart() == 0_cs; - bool tooShort = unknownSound.getDuration() < minSoundDuration; - if (startsAtZero || tooShort) { - noiseSounds.clear(unknownSound.getTimeRange()); - } - } - - return noiseSounds; -} - // Some words have multiple pronunciations, one of which results in better animation than the others. // This function returns the optimal pronunciation for a select set of these words. string fixPronunciation(const string& word) { - const static map replacements { - {"into(2)", "into"}, - {"to(2)", "to"}, - {"to(3)", "to"}, - {"today(2)", "today"}, - {"tomorrow(2)", "tomorrow"}, - {"tonight(2)", "tonight"} + const static map replacements{ + { "into(2)", "into" }, + { "to(2)", "to" }, + { "to(3)", "to" }, + { "today(2)", "today" }, + { "tomorrow(2)", "tomorrow" }, + { "tonight(2)", "tonight" } }; const auto pair = replacements.find(word); return pair != replacements.end() ? pair->second : word; } -Timeline utteranceToPhones( +static Timeline utteranceToPhones( const AudioClip& audioClip, TimeRange utteranceTimeRange, ps_decoder_t& decoder, - ProgressSink& utteranceProgressSink) -{ + ProgressSink& utteranceProgressSink +) { ProgressMerger utteranceProgressMerger(utteranceProgressSink); ProgressSink& wordRecognitionProgressSink = utteranceProgressMerger.addSink(1.0); ProgressSink& alignmentProgressSink = utteranceProgressMerger.addSink(0.5); - // Pad time range to give Pocketsphinx some breathing room + // Pad time range to give PocketSphinx some breathing room TimeRange paddedTimeRange = utteranceTimeRange; const centiseconds padding(3); paddedTimeRange.grow(padding); @@ -384,7 +254,7 @@ Timeline utteranceToPhones( continue; } word = regex_replace(word, regex("\\(\\d\\)"), ""); - if (text.size() > 0) { + if (!text.empty()) { text += " "; } text += word; @@ -403,7 +273,7 @@ Timeline utteranceToPhones( const string fixedWord = fixPronunciation(timedWord.getValue()); wordIds.push_back(getWordId(fixedWord, *decoder.dict)); } - if (wordIds.empty()) return {}; + if (wordIds.empty()) return{}; // Align the words' phones with speech #if BOOST_VERSION < 105600 // Support legacy syntax @@ -433,77 +303,11 @@ Timeline utteranceToPhones( return utterancePhones; } -BoundedTimeline recognizePhones( +BoundedTimeline PocketSphinxRecognizer::recognizePhones( const AudioClip& inputAudioClip, - optional dialog, + optional dialog, int maxThreadCount, - ProgressSink& progressSink) -{ - ProgressMerger totalProgressMerger(progressSink); - ProgressSink& voiceActivationProgressSink = totalProgressMerger.addSink(1.0); - ProgressSink& dialogProgressSink = totalProgressMerger.addSink(15); - - // Make sure audio stream has no DC offset - const unique_ptr audioClip = inputAudioClip.clone() | removeDcOffset(); - - // Split audio into utterances - JoiningBoundedTimeline utterances; - try { - utterances = detectVoiceActivity(*audioClip, 
maxThreadCount, voiceActivationProgressSink); - } - catch (...) { - std::throw_with_nested(runtime_error("Error detecting segments of speech.")); - } - - // Discard Pocketsphinx output - err_set_logfp(nullptr); - - // Redirect Pocketsphinx output to log - err_set_callback(sphinxLogCallback, nullptr); - - // Prepare pool of decoders - ObjectPool> decoderPool( - [&dialog] { return createDecoder(dialog); }); - - BoundedTimeline phones(audioClip->getTruncatedRange()); - std::mutex resultMutex; - auto processUtterance = [&](Timed timedUtterance, ProgressSink& utteranceProgressSink) { - // Detect phones for utterance - auto decoder = decoderPool.acquire(); - Timeline utterancePhones = - utteranceToPhones(*audioClip, timedUtterance.getTimeRange(), *decoder, utteranceProgressSink); - - // Copy phones to result timeline - std::lock_guard lock(resultMutex); - for (const auto& timedPhone : utterancePhones) { - phones.set(timedPhone); - } - }; - - auto getUtteranceProgressWeight = [](const Timed timedUtterance) { - return timedUtterance.getDuration().count(); - }; - - // Perform speech recognition - try { - // Determine how many parallel threads to use - int threadCount = std::min({ - maxThreadCount, - // Don't use more threads than there are utterances to be processed - static_cast(utterances.size()), - // Don't waste time creating additional threads (and decoders!) if the recording is short - static_cast(duration_cast(audioClip->getTruncatedRange().getDuration()).count() / 5) - }); - if (threadCount < 1) { - threadCount = 1; - } - logging::debugFormat("Speech recognition using {} threads -- start", threadCount); - runParallel(processUtterance, utterances, threadCount, dialogProgressSink, getUtteranceProgressWeight); - logging::debug("Speech recognition -- end"); - } - catch (...) 
{
-		std::throw_with_nested(runtime_error("Error performing speech recognition via Pocketsphinx."));
-	}
-
-	return phones;
+	ProgressSink& progressSink
+) const {
+	return ::recognizePhones(inputAudioClip, dialog, &createDecoder, &utteranceToPhones, maxThreadCount, progressSink);
 }

diff --git a/rhubarb/src/recognition/PocketSphinxRecognizer.h b/rhubarb/src/recognition/PocketSphinxRecognizer.h
new file mode 100644
index 0000000..dc11d2d
--- /dev/null
+++ b/rhubarb/src/recognition/PocketSphinxRecognizer.h
@@ -0,0 +1,14 @@
+#pragma once
+
+#include "Recognizer.h"
+#include "pocketSphinxTools.h"
+
+class PocketSphinxRecognizer : public Recognizer {
+public:
+	BoundedTimeline recognizePhones(
+		const AudioClip& inputAudioClip,
+		boost::optional dialog,
+		int maxThreadCount,
+		ProgressSink& progressSink
+	) const override;
+};

diff --git a/rhubarb/src/recognition/Recognizer.h b/rhubarb/src/recognition/Recognizer.h
new file mode 100644
index 0000000..05c445d
--- /dev/null
+++ b/rhubarb/src/recognition/Recognizer.h
@@ -0,0 +1,18 @@
+#pragma once
+
+#include "audio/AudioClip.h"
+#include "core/Phone.h"
+#include "tools/ProgressBar.h"
+#include "time/BoundedTimeline.h"
+
+class Recognizer {
+public:
+	virtual ~Recognizer() = default;
+
+	virtual BoundedTimeline recognizePhones(
+		const AudioClip& audioClip,
+		boost::optional dialog,
+		int maxThreadCount,
+		ProgressSink& progressSink
+	) const = 0;
+};
\ No newline at end of file

diff --git a/rhubarb/src/recognition/phoneRecognition.h b/rhubarb/src/recognition/phoneRecognition.h
deleted file mode 100644
index 2e66305..0000000
--- a/rhubarb/src/recognition/phoneRecognition.h
+++ /dev/null
@@ -1,12 +0,0 @@
-#pragma once
-
-#include "audio/AudioClip.h"
-#include "core/Phone.h"
-#include "tools/ProgressBar.h"
-#include "time/BoundedTimeline.h"
-
-BoundedTimeline recognizePhones(
-	const AudioClip& audioClip,
-	boost::optional dialog,
-	int maxThreadCount,
-	ProgressSink& progressSink);

diff --git a/rhubarb/src/recognition/pocketSphinxTools.cpp b/rhubarb/src/recognition/pocketSphinxTools.cpp
new file mode 100644
index 0000000..87a13ea
--- /dev/null
+++ b/rhubarb/src/recognition/pocketSphinxTools.cpp
@@ -0,0 +1,218 @@
+#include "pocketSphinxTools.h"
+
+#include "tools/platformTools.h"
+#include
+#include "audio/DcOffset.h"
+#include "audio/voiceActivityDetection.h"
+#include "tools/parallel.h"
+#include "tools/ObjectPool.h"
+#include "time/timedLogging.h"
+
+extern "C" {
+#include
+#include
+#include
+}
+
+using std::runtime_error;
+using std::invalid_argument;
+using std::unique_ptr;
+using std::string;
+using std::vector;
+using boost::filesystem::path;
+using std::regex;
+using boost::optional;
+using std::chrono::duration_cast;
+
+logging::Level convertSphinxErrorLevel(err_lvl_t errorLevel) {
+	switch (errorLevel) {
+		case ERR_DEBUG:
+		case ERR_INFO:
+		case ERR_INFOCONT:
+			return logging::Level::Trace;
+		case ERR_WARN:
+			return logging::Level::Warn;
+		case ERR_ERROR:
+			return logging::Level::Error;
+		case ERR_FATAL:
+			return logging::Level::Fatal;
+		default:
+			throw invalid_argument("Unknown log level.");
+	}
+}
+
+void sphinxLogCallback(void* user_data, err_lvl_t errorLevel, const char* format, ...)
{ + UNUSED(user_data); + + // Create varArgs list + va_list args; + va_start(args, format); + auto _ = gsl::finally([&args]() { va_end(args); }); + + // Format message + const int initialSize = 256; + vector chars(initialSize); + bool success = false; + while (!success) { + const int charsWritten = vsnprintf(chars.data(), chars.size(), format, args); + if (charsWritten < 0) throw runtime_error("Error formatting PocketSphinx log message."); + + success = charsWritten < static_cast(chars.size()); + if (!success) chars.resize(chars.size() * 2); + } + const regex waste("^(DEBUG|INFO|INFOCONT|WARN|ERROR|FATAL): "); + string message = std::regex_replace(chars.data(), waste, "", std::regex_constants::format_first_only); + boost::algorithm::trim(message); + + const logging::Level logLevel = convertSphinxErrorLevel(errorLevel); + logging::log(logLevel, message); +} + +void redirectPocketSphinxOutput() { + static bool redirected = false; + if (redirected) return; + + // Discard PocketSphinx output + err_set_logfp(nullptr); + + // Redirect PocketSphinx output to log + err_set_callback(sphinxLogCallback, nullptr); + + redirected = true; +} + +BoundedTimeline recognizePhones( + const AudioClip& inputAudioClip, + optional dialog, + decoderFactory createDecoder, + utteranceToPhonesFunction utteranceToPhones, + int maxThreadCount, + ProgressSink& progressSink +) { + ProgressMerger totalProgressMerger(progressSink); + ProgressSink& voiceActivationProgressSink = totalProgressMerger.addSink(1.0); + ProgressSink& dialogProgressSink = totalProgressMerger.addSink(15); + + // Make sure audio stream has no DC offset + const unique_ptr audioClip = inputAudioClip.clone() | removeDcOffset(); + + // Split audio into utterances + JoiningBoundedTimeline utterances; + try { + utterances = detectVoiceActivity(*audioClip, maxThreadCount, voiceActivationProgressSink); + } catch (...) { + std::throw_with_nested(runtime_error("Error detecting segments of speech.")); + } + + redirectPocketSphinxOutput(); + + // Prepare pool of decoders + ObjectPool> decoderPool( + [&] { return createDecoder(dialog); }); + + BoundedTimeline phones(audioClip->getTruncatedRange()); + std::mutex resultMutex; + const auto processUtterance = [&](Timed timedUtterance, ProgressSink& utteranceProgressSink) { + // Detect phones for utterance + const auto decoder = decoderPool.acquire(); + Timeline utterancePhones = + utteranceToPhones(*audioClip, timedUtterance.getTimeRange(), *decoder, utteranceProgressSink); + + // Copy phones to result timeline + std::lock_guard lock(resultMutex); + for (const auto& timedPhone : utterancePhones) { + phones.set(timedPhone); + } + }; + + const auto getUtteranceProgressWeight = [](const Timed timedUtterance) { + return timedUtterance.getDuration().count(); + }; + + // Perform speech recognition + try { + // Determine how many parallel threads to use + int threadCount = std::min({ + maxThreadCount, + // Don't use more threads than there are utterances to be processed + static_cast(utterances.size()), + // Don't waste time creating additional threads (and decoders!) if the recording is short + static_cast(duration_cast(audioClip->getTruncatedRange().getDuration()).count() / 5) + }); + if (threadCount < 1) { + threadCount = 1; + } + logging::debugFormat("Speech recognition using {} threads -- start", threadCount); + runParallel(processUtterance, utterances, threadCount, dialogProgressSink, getUtteranceProgressWeight); + logging::debug("Speech recognition -- end"); + } catch (...) 
{ + std::throw_with_nested(runtime_error("Error performing speech recognition via PocketSphinx.")); + } + + return phones; +} + +const path& getSphinxModelDirectory() { + static path sphinxModelDirectory(getBinDirectory() / "res" / "sphinx"); + return sphinxModelDirectory; +} + +JoiningTimeline getNoiseSounds(TimeRange utteranceTimeRange, const Timeline& phones) { + JoiningTimeline noiseSounds; + + // Find utterance parts without recognized phones + noiseSounds.set(utteranceTimeRange); + for (const auto& timedPhone : phones) { + noiseSounds.clear(timedPhone.getTimeRange()); + } + + // Remove undesired elements + const centiseconds minSoundDuration = 12_cs; + for (const auto& unknownSound : JoiningTimeline(noiseSounds)) { + const bool startsAtZero = unknownSound.getStart() == 0_cs; + const bool tooShort = unknownSound.getDuration() < minSoundDuration; + if (startsAtZero || tooShort) { + noiseSounds.clear(unknownSound.getTimeRange()); + } + } + + return noiseSounds; +} + +BoundedTimeline recognizeWords(const vector& audioBuffer, ps_decoder_t& decoder) { + // Restart timing at 0 + ps_start_stream(&decoder); + + // Start recognition + int error = ps_start_utt(&decoder); + if (error) throw runtime_error("Error starting utterance processing for word recognition."); + + // Process entire audio clip + const bool noRecognition = false; + const bool fullUtterance = true; + const int searchedFrameCount = + ps_process_raw(&decoder, audioBuffer.data(), audioBuffer.size(), noRecognition, fullUtterance); + if (searchedFrameCount < 0) { + throw runtime_error("Error analyzing raw audio data for word recognition."); + } + + // End recognition + error = ps_end_utt(&decoder); + if (error) throw runtime_error("Error ending utterance processing for word recognition."); + + BoundedTimeline result(TimeRange(0_cs, centiseconds(100 * audioBuffer.size() / sphinxSampleRate))); + const bool noWordsRecognized = reinterpret_cast(decoder.search)->bpidx == 0; + if (noWordsRecognized) { + return result; + } + + // Collect words + for (ps_seg_t* it = ps_seg_iter(&decoder); it; it = ps_seg_next(it)) { + const char* word = ps_seg_word(it); + int firstFrame, lastFrame; + ps_seg_frames(it, &firstFrame, &lastFrame); + result.set(centiseconds(firstFrame), centiseconds(lastFrame + 1), word); + } + + return result; +} diff --git a/rhubarb/src/recognition/pocketSphinxTools.h b/rhubarb/src/recognition/pocketSphinxTools.h new file mode 100644 index 0000000..568ccbe --- /dev/null +++ b/rhubarb/src/recognition/pocketSphinxTools.h @@ -0,0 +1,39 @@ +#pragma once + +#include "time/BoundedTimeline.h" +#include "core/Phone.h" +#include "audio/AudioClip.h" +#include "tools/ProgressBar.h" +#include + +extern "C" { +#include +} + +typedef std::function( + boost::optional dialog +)> decoderFactory; + +typedef std::function( + const AudioClip& audioClip, + TimeRange utteranceTimeRange, + ps_decoder_t& decoder, + ProgressSink& utteranceProgressSink +)> utteranceToPhonesFunction; + +BoundedTimeline recognizePhones( + const AudioClip& inputAudioClip, + boost::optional dialog, + decoderFactory createDecoder, + utteranceToPhonesFunction utteranceToPhones, + int maxThreadCount, + ProgressSink& progressSink +); + +constexpr int sphinxSampleRate = 16000; + +const boost::filesystem::path& getSphinxModelDirectory(); + +JoiningTimeline getNoiseSounds(TimeRange utteranceTimeRange, const Timeline& phones); + +BoundedTimeline recognizeWords(const std::vector& audioBuffer, ps_decoder_t& decoder); diff --git a/rhubarb/src/rhubarb/RecognizerType.cpp 
b/rhubarb/src/rhubarb/RecognizerType.cpp new file mode 100644 index 0000000..86f0837 --- /dev/null +++ b/rhubarb/src/rhubarb/RecognizerType.cpp @@ -0,0 +1,27 @@ +#include "RecognizerType.h" + +using std::string; + +RecognizerTypeConverter& RecognizerTypeConverter::get() { + static RecognizerTypeConverter converter; + return converter; +} + +string RecognizerTypeConverter::getTypeName() { + return "RecognizerType"; +} + +EnumConverter::member_data RecognizerTypeConverter::getMemberData() { + return member_data{ + { RecognizerType::PocketSphinx, "pocketSphinx" }, + { RecognizerType::Phonetic, "phonetic" } + }; +} + +std::ostream& operator<<(std::ostream& stream, RecognizerType value) { + return RecognizerTypeConverter::get().write(stream, value); +} + +std::istream& operator>>(std::istream& stream, RecognizerType& value) { + return RecognizerTypeConverter::get().read(stream, value); +} diff --git a/rhubarb/src/rhubarb/RecognizerType.h b/rhubarb/src/rhubarb/RecognizerType.h new file mode 100644 index 0000000..6f8cf12 --- /dev/null +++ b/rhubarb/src/rhubarb/RecognizerType.h @@ -0,0 +1,20 @@ +#pragma once + +#include "tools/EnumConverter.h" + +enum class RecognizerType { + PocketSphinx, + Phonetic +}; + +class RecognizerTypeConverter : public EnumConverter { +public: + static RecognizerTypeConverter& get(); +protected: + std::string getTypeName() override; + member_data getMemberData() override; +}; + +std::ostream& operator<<(std::ostream& stream, RecognizerType value); + +std::istream& operator>>(std::istream& stream, RecognizerType& value); diff --git a/rhubarb/src/rhubarb/main.cpp b/rhubarb/src/rhubarb/main.cpp index 104a6e8..703dd67 100644 --- a/rhubarb/src/rhubarb/main.cpp +++ b/rhubarb/src/rhubarb/main.cpp @@ -27,6 +27,9 @@ #include "tools/platformTools.h" #include "sinks.h" #include "semanticEntries.h" +#include "RecognizerType.h" +#include "recognition/PocketSphinxRecognizer.h" +#include "recognition/PhoneticRecognizer.h" using std::exception; using std::string; @@ -36,9 +39,6 @@ using std::unique_ptr; using std::make_unique; using std::shared_ptr; using std::make_shared; -using std::map; -using std::chrono::duration; -using std::chrono::duration_cast; using std::ofstream; using boost::filesystem::path; using boost::adaptors::transformed; @@ -56,6 +56,10 @@ namespace TCLAP { struct ArgTraits { typedef ValueLike ValueCategory; }; + template<> + struct ArgTraits { + typedef ValueLike ValueCategory; + }; } shared_ptr createFileSink(path path, logging::Level minLevel) { @@ -66,6 +70,17 @@ shared_ptr createFileSink(path path, logging::Level minLevel) { return make_shared(FileSink, minLevel); } +unique_ptr createRecognizer(RecognizerType recognizerType) { + switch (recognizerType) { + case RecognizerType::PocketSphinx: + return make_unique(); + case RecognizerType::Phonetic: + return make_unique(); + default: + throw std::runtime_error("Unknown recognizer."); + } +} + unique_ptr createExporter(ExportFormat exportFormat) { switch (exportFormat) { case ExportFormat::Tsv: @@ -123,6 +138,9 @@ int main(int platformArgc, char *platformArgv[]) { auto exportFormats = vector(ExportFormatConverter::get().getValues()); tclap::ValuesConstraint exportFormatConstraint(exportFormats); tclap::ValueArg exportFormat("f", "exportFormat", "The export format.", false, ExportFormat::Tsv, &exportFormatConstraint, cmd); + auto recognizerTypes = vector(RecognizerTypeConverter::get().getValues()); + tclap::ValuesConstraint recognizerConstraint(recognizerTypes); + tclap::ValueArg recognizerType("r", "recognizer", "The 
dialog recognizer.", false, RecognizerType::PocketSphinx, &recognizerConstraint, cmd); tclap::UnlabeledValueArg inputFileName("inputFile", "The input file. Must be a sound file in WAVE format.", true, "", "string", cmd); try { @@ -169,6 +187,7 @@ int main(int platformArgc, char *platformArgv[]) { JoiningContinuousTimeline animation = animateWaveFile( inputFilePath, dialogFile.isSet() ? readUtf8File(path(dialogFile.getValue())) : boost::optional(), + *createRecognizer(recognizerType.getValue()), targetShapeSet, maxThreadCount.getValue(), progressSink); From 7ebe9b53e8d297e451b9529c7c65aa53d53f72b2 Mon Sep 17 00:00:00 2001 From: Daniel Wolf Date: Tue, 1 Jan 2019 21:51:22 +0100 Subject: [PATCH 3/6] Internal version 1.9.0-pre.1 --- appInfo.cmake | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/appInfo.cmake b/appInfo.cmake index 4f7dc9d..ed17020 100644 --- a/appInfo.cmake +++ b/appInfo.cmake @@ -2,7 +2,7 @@ cmake_minimum_required(VERSION 3.2) set(appName "Rhubarb Lip Sync") set(appVersionMajor 1) -set(appVersionMinor 8) +set(appVersionMinor 9) set(appVersionPatch 0) -set(appVersionSuffix "") +set(appVersionSuffix "-pre.1") set(appVersion "${appVersionMajor}.${appVersionMinor}.${appVersionPatch}${appVersionSuffix}") From 3bf7a00d42ec8fa447df9cce5eab736fed75f960 Mon Sep 17 00:00:00 2001 From: Daniel Wolf Date: Tue, 1 Jan 2019 21:52:34 +0100 Subject: [PATCH 4/6] Add recognizer support to After Effects integration --- extras/AdobeAfterEffects/Rhubarb Lip Sync.jsx | 37 ++++++++++++++++--- 1 file changed, 31 insertions(+), 6 deletions(-) diff --git a/extras/AdobeAfterEffects/Rhubarb Lip Sync.jsx b/extras/AdobeAfterEffects/Rhubarb Lip Sync.jsx index d8dbfcd..a8c8152 100644 --- a/extras/AdobeAfterEffects/Rhubarb Lip Sync.jsx +++ b/extras/AdobeAfterEffects/Rhubarb Lip Sync.jsx @@ -323,6 +323,12 @@ function createDialogWindow() { + 'your After Effects project.' }) }), + recognizer: Group({ + label: StaticText({ text: 'Recognizer:' }), + value: DropDownList({ + helpTip: 'The dialog recognizer.' 
+ }) + }), dialogText: Group({ label: StaticText({ text: 'Dialog text (optional):' }), value: EditText({ @@ -384,6 +390,7 @@ function createDialogWindow() { var controls = { audioFile: window.settings.audioFile.value, dialogText: window.settings.dialogText.value, + recognizer: window.settings.recognizer.value, mouthComp: window.settings.mouthComp.value, targetFolder: window.settings.targetFolder.value, frameRate: window.settings.frameRate.value, @@ -402,6 +409,16 @@ function createDialogWindow() { listItem.projectItem = projectItem; }); + // Add recognizer options + const recognizerOptions = [ + { text: 'PocketSphinx (use for English recordings)', value: 'pocketSphinx' }, + { text: 'Phonetic (use for non-English recordings)', value: 'phonetic' } + ]; + recognizerOptions.forEach(function(option) { + var listItem = controls.recognizer.add('item', option.text); + listItem.value = option.value; + }); + // Add mouth composition options var comps = toArrayBase1(app.project.items).filter(function (item) { return item instanceof CompItem; @@ -425,6 +442,7 @@ function createDialogWindow() { var settings = readSettingsFile(); selectByTextOrFirst(controls.audioFile, settings.audioFile); controls.dialogText.text = settings.dialogText || ''; + selectByTextOrFirst(controls.recognizer, settings.recognizer); selectByTextOrFirst(controls.mouthComp, settings.mouthComp); extendedMouthShapeNames.forEach(function(shapeName) { controls['mouthShape' + shapeName].value = @@ -484,6 +502,7 @@ function createDialogWindow() { // Store settings var settings = { audioFile: (controls.audioFile.selection || {}).text, + recognizer: (controls.recognizer.selection || {}).text, dialogText: controls.dialogText.text, mouthComp: (controls.mouthComp.selection || {}).text, extendedMouthShapes: {}, @@ -543,7 +562,7 @@ function createDialogWindow() { // Check for correct Rhubarb version var version = exec(rhubarbPath + ' --version') || ''; - var match = version.match(/Rhubarb Lip Sync version ((\d+)\.(\d+).(\d+))/); + var match = version.match(/Rhubarb Lip Sync version ((\d+)\.(\d+).(\d+)(-[0-9A-Za-z-.]+)?)/); if (!match) { var instructions = osIsWindows ? 'Make sure your PATH environment variable contains the ' + appName + ' ' @@ -555,13 +574,16 @@ function createDialogWindow() { var versionString = match[1]; var major = Number(match[2]); var minor = Number(match[3]); - if (major != 1 || minor < 6) { - return 'This script requires ' + appName + ' 1.6.0 or a later 1.x version. ' + var requiredMajor = 1; + var minRequiredMinor = 9; + if (major != requiredMajor || minor < minRequiredMinor) { + return 'This script requires ' + appName + ' ' + requiredMajor + '.' + minRequiredMinor + + '.0 or a later ' + requiredMajor + '.x version. 
' + 'Your installed version is ' + versionString + ', which is not compatible.'; } } - function generateMouthCues(audioFileFootage, dialogText, mouthComp, extendedMouthShapeNames, + function generateMouthCues(audioFileFootage, recognizer, dialogText, mouthComp, extendedMouthShapeNames, targetProjectFolder, frameRate) { var basePath = Folder.temp.fsName + '/' + createGuid(); @@ -575,6 +597,7 @@ function createDialogWindow() { // Create command line var commandLine = rhubarbPath + ' --dialogFile ' + cliEscape(dialogFile.fsName) + + ' --recognizer ' + recognizer + ' --exportFormat json' + ' --extendedShapes ' + cliEscape(extendedMouthShapeNames.join('')) + ' --logFile ' + cliEscape(logFile.fsName) @@ -660,11 +683,11 @@ function createDialogWindow() { } } - function animate(audioFileFootage, dialogText, mouthComp, extendedMouthShapeNames, + function animate(audioFileFootage, recognizer, dialogText, mouthComp, extendedMouthShapeNames, targetProjectFolder, frameRate) { try { - var mouthCues = generateMouthCues(audioFileFootage, dialogText, mouthComp, + var mouthCues = generateMouthCues(audioFileFootage, recognizer, dialogText, mouthComp, extendedMouthShapeNames, targetProjectFolder, frameRate); app.beginUndoGroup(appName + ': Animation'); @@ -680,6 +703,7 @@ function createDialogWindow() { // Handle changes update(); controls.audioFile.onChange = update; + controls.recognizer.onChange = update; controls.dialogText.onChanging = update; controls.mouthComp.onChange = update; extendedMouthShapeNames.forEach(function(shapeName) { @@ -700,6 +724,7 @@ function createDialogWindow() { window.close(); animate( controls.audioFile.selection.projectItem, + controls.recognizer.selection.value, controls.dialogText.text || '', controls.mouthComp.selection.projectItem, extendedMouthShapeNames.filter(function(shapeName) { From bfc98a1c8105cbeeb9ea8def9192b95b0eab0b34 Mon Sep 17 00:00:00 2001 From: Daniel Wolf Date: Tue, 1 Jan 2019 22:47:18 +0100 Subject: [PATCH 5/6] Add recognizer support to Spine integration --- .../rhubarb_for_spine/AudioFileModel.kt | 3 ++- .../rhubarb_for_spine/MainModel.kt | 15 ++++++++++++++- .../rhubarb_for_spine/MainView.kt | 15 +++++++++++++++ .../rhubarb_for_spine/RhubarbTask.kt | 3 ++- 4 files changed, 33 insertions(+), 3 deletions(-) diff --git a/extras/EsotericSoftwareSpine/src/main/kotlin/com/rhubarb_lip_sync/rhubarb_for_spine/AudioFileModel.kt b/extras/EsotericSoftwareSpine/src/main/kotlin/com/rhubarb_lip_sync/rhubarb_for_spine/AudioFileModel.kt index e9f41c6..632782c 100644 --- a/extras/EsotericSoftwareSpine/src/main/kotlin/com/rhubarb_lip_sync/rhubarb_for_spine/AudioFileModel.kt +++ b/extras/EsotericSoftwareSpine/src/main/kotlin/com/rhubarb_lip_sync/rhubarb_for_spine/AudioFileModel.kt @@ -141,11 +141,12 @@ class AudioFileModel( private fun startAnimation() { val wrapperTask = Runnable { + val recognizer = parentModel.parentModel.recognizer.value val extendedMouthShapes = parentModel.mouthShapes.filter { it.isExtended }.toSet() val reportProgress: (Double?) 
-> Unit = { progress -> runAndWait { this@AudioFileModel.animationProgress = progress } } - val rhubarbTask = RhubarbTask(audioFilePath, dialog, extendedMouthShapes, reportProgress) + val rhubarbTask = RhubarbTask(audioFilePath, recognizer, dialog, extendedMouthShapes, reportProgress) try { try { val result = rhubarbTask.call() diff --git a/extras/EsotericSoftwareSpine/src/main/kotlin/com/rhubarb_lip_sync/rhubarb_for_spine/MainModel.kt b/extras/EsotericSoftwareSpine/src/main/kotlin/com/rhubarb_lip_sync/rhubarb_for_spine/MainModel.kt index 9010146..6378aad 100644 --- a/extras/EsotericSoftwareSpine/src/main/kotlin/com/rhubarb_lip_sync/rhubarb_for_spine/MainModel.kt +++ b/extras/EsotericSoftwareSpine/src/main/kotlin/com/rhubarb_lip_sync/rhubarb_for_spine/MainModel.kt @@ -2,6 +2,8 @@ package com.rhubarb_lip_sync.rhubarb_for_spine import javafx.beans.property.SimpleObjectProperty import javafx.beans.property.SimpleStringProperty +import javafx.collections.FXCollections +import javafx.collections.ObservableList import tornadofx.FX import tornadofx.getValue import tornadofx.setValue @@ -40,6 +42,15 @@ class MainModel(private val executor: ExecutorService) { var animationFileModel by animationFileModelProperty private set + val recognizersProperty = SimpleObjectProperty>(FXCollections.observableArrayList( + Recognizer("pocketSphinx", "PocketSphinx (use for English recordings)"), + Recognizer("phonetic", "Phonetic (use for non-English recordings)") + )) + private var recognizers: ObservableList by recognizersProperty + + val recognizerProperty = SimpleObjectProperty(recognizers[0]) + var recognizer: Recognizer by recognizerProperty + val animationPrefixProperty = SimpleStringProperty("say_") var animationPrefix: String by animationPrefixProperty @@ -47,4 +58,6 @@ class MainModel(private val executor: ExecutorService) { var animationSuffix: String by animationSuffixProperty private fun getDefaultPathString() = FX.application.parameters.raw.firstOrNull() -} \ No newline at end of file +} + +class Recognizer(val value: String, val description: String) diff --git a/extras/EsotericSoftwareSpine/src/main/kotlin/com/rhubarb_lip_sync/rhubarb_for_spine/MainView.kt b/extras/EsotericSoftwareSpine/src/main/kotlin/com/rhubarb_lip_sync/rhubarb_for_spine/MainView.kt index 13e2316..7a67e91 100644 --- a/extras/EsotericSoftwareSpine/src/main/kotlin/com/rhubarb_lip_sync/rhubarb_for_spine/MainView.kt +++ b/extras/EsotericSoftwareSpine/src/main/kotlin/com/rhubarb_lip_sync/rhubarb_for_spine/MainView.kt @@ -17,6 +17,7 @@ import javafx.scene.text.Font import javafx.scene.text.FontWeight import javafx.scene.text.Text import javafx.stage.FileChooser +import javafx.util.StringConverter import tornadofx.* import java.io.File import java.util.concurrent.Executors @@ -83,6 +84,20 @@ class MainView : View() { } } } + field("Dialog recognizer") { + combobox { + itemsProperty().bind(mainModel.recognizersProperty) + this.converter = object : StringConverter() { + override fun toString(recognizer: Recognizer?): String { + return recognizer?.description ?: "" + } + override fun fromString(string: String?): Recognizer { + throw NotImplementedError() + } + } + valueProperty().bindBidirectional(mainModel.recognizerProperty) + } + } field("Animation naming") { textfield { maxWidth = 100.0 diff --git a/extras/EsotericSoftwareSpine/src/main/kotlin/com/rhubarb_lip_sync/rhubarb_for_spine/RhubarbTask.kt b/extras/EsotericSoftwareSpine/src/main/kotlin/com/rhubarb_lip_sync/rhubarb_for_spine/RhubarbTask.kt index 0268003..0694e79 100644 --- 
a/extras/EsotericSoftwareSpine/src/main/kotlin/com/rhubarb_lip_sync/rhubarb_for_spine/RhubarbTask.kt
+++ b/extras/EsotericSoftwareSpine/src/main/kotlin/com/rhubarb_lip_sync/rhubarb_for_spine/RhubarbTask.kt
@@ -14,6 +14,7 @@ import java.util.concurrent.Callable
 
 class RhubarbTask(
 	val audioFilePath: Path,
+	val recognizer: String,
 	val dialog: String?,
 	val extendedMouthShapes: Set,
 	val reportProgress: (Double?) -> Unit
@@ -89,6 +90,7 @@ class RhubarbTask(
 		return mutableListOf(
 			rhubarbBinFilePath.toString(),
 			"--machineReadable",
+			"--recognizer", recognizer,
 			"--exportFormat", "json",
 			"--extendedShapes", extendedMouthShapesString
 		).apply {
@@ -100,7 +102,6 @@ class RhubarbTask(
 		}.apply {
 			add(audioFilePath.toString())
 		}
-
 	}
 
 	private val guiBinDirectory: Path by lazy {

From d029458c7021e5a0a4fd7a43d2258f28c75f243f Mon Sep 17 00:00:00 2001
From: Daniel Wolf
Date: Tue, 1 Jan 2019 23:11:19 +0100
Subject: [PATCH 6/6] Document phonetic recognizer

---
 CHANGELOG.md |  4 ++++
 README.adoc  | 18 ++++++++++++++++++
 2 files changed, 22 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 516cbe2..c466bc4 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,9 @@
 # Version history
 
+## Unreleased
+
+* **Added** basic support for non-English recordings through phonetic recognition ([issue #45](https://github.com/DanielSWolf/rhubarb-lip-sync/issues/45)).
+
 ## Version 1.8.0
 
 * **Added** support for Ogg Vorbis (.ogg) file format ([issue #40](https://github.com/DanielSWolf/rhubarb-lip-sync/issues/40)).

diff --git a/README.adoc b/README.adoc
index fb08c03..e0f4646 100644
--- a/README.adoc
+++ b/README.adoc
@@ -123,6 +123,11 @@ The following command-line options are the most common:
 
 | _<input file>_
 | The audio file to be analyzed. This must be the last command-line argument. Supported file formats are WAVE (.wav) and Ogg Vorbis (.ogg).
 
+| `-r` _<recognizer>_, `--recognizer` _<recognizer>_
+| Specifies how Rhubarb Lip Sync recognizes speech within the recording. Options: `pocketSphinx` (use for English recordings), `phonetic` (use for non-English recordings). For details, see <<recognizers>>.
+
+_Default value: ``pocketSphinx``_
+
 | `-f` _<format>_, `--exportFormat` _<format>_
 | The export format. Options: `tsv` (tab-separated values, see <<tsv>>), `xml` (see <<xml>>), `json` (see <<json>>).
 
@@ -192,6 +197,19 @@ Note that for short audio files, Rhubarb Lip Sync may choose to use fewer threads.
 
 _Default value: as many threads as your CPU has cores_
 |===
 
+[[recognizers]]
+== Recognizers
+
+The first step in processing an audio file is determining what is being said. More specifically, Rhubarb Lip Sync uses speech recognition to figure out what sound is being said at what point in time. You can choose between two recognizers:
+
+=== PocketSphinx
+
+PocketSphinx is an open-source speech recognition library that generally gives good results. This is the default recognizer. The downside is that PocketSphinx only recognizes English dialog. So if your recordings are in a language other than English, this is not a good choice.
+
+=== Phonetic
+
+Rhubarb Lip Sync also comes with a phonetic recognizer. _Phonetic_ means that this recognizer won't try to understand entire (English) words and phrases. Instead, it will recognize individual sounds and syllables. The results are usually less precise than those from the PocketSphinx recognizer. The advantage is that this recognizer is language-independent. Use it if your recordings are not in English.
+
 [[outputFormats]]
 == Output formats
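Once built, the new `--recognizer` option can be tried directly from the command line. Both invocations below use only flags that appear in the patches above; the audio and dialog file names are illustrative:

	rhubarb --recognizer phonetic -f json some-recording.wav
	rhubarb --recognizer pocketSphinx --dialogFile dialog.txt -f json some-recording.wav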
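The recognizer support added in patch 2 follows a classic strategy pattern: main.cpp parses a RecognizerType from the command line, createRecognizer() maps it to a concrete class, and animateWaveFile() receives only a const reference to the abstract Recognizer interface. Below is a minimal, self-contained C++ sketch of that dispatch. The signatures are deliberately simplified: the real recognizePhones() returns a BoundedTimeline<Phone> and also takes dialog, thread-count, and progress-sink parameters, and everything here not named in the patches is invented for illustration.

	#include <iostream>
	#include <memory>
	#include <stdexcept>
	#include <string>

	// Simplified stand-in for the interface in Recognizer.h. The real method
	// returns a BoundedTimeline<Phone> and takes audio, dialog, thread-count,
	// and progress-sink arguments.
	class Recognizer {
	public:
		virtual ~Recognizer() = default;
		virtual std::string recognizePhones(const std::string& audioFile) const = 0;
	};

	class PocketSphinxRecognizer : public Recognizer {
	public:
		std::string recognizePhones(const std::string& audioFile) const override {
			return "word-level recognition of " + audioFile;
		}
	};

	class PhoneticRecognizer : public Recognizer {
	public:
		std::string recognizePhones(const std::string& audioFile) const override {
			return "phone-level recognition of " + audioFile;
		}
	};

	enum class RecognizerType { PocketSphinx, Phonetic };

	// Mirrors createRecognizer() in main.cpp: the parsed CLI value selects the
	// strategy once; the rest of the pipeline sees only the base class.
	std::unique_ptr<Recognizer> createRecognizer(RecognizerType type) {
		switch (type) {
			case RecognizerType::PocketSphinx: return std::make_unique<PocketSphinxRecognizer>();
			case RecognizerType::Phonetic: return std::make_unique<PhoneticRecognizer>();
			default: throw std::runtime_error("Unknown recognizer.");
		}
	}

	int main() {
		const auto recognizer = createRecognizer(RecognizerType::Phonetic);
		std::cout << recognizer->recognizePhones("some-recording.wav") << "\n";
	}

Because the pipeline depends only on the base class, adding a third recognizer later means one new subclass, one new RecognizerType member, and one new case in the factory, with no changes to the animation code.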