From f3d4cfbb31932031c6d82fc179e95704bae3e82d Mon Sep 17 00:00:00 2001
From: Daniel Wolf <dwolf@dannad.de>
Date: Fri, 18 Jan 2019 22:29:26 +0100
Subject: [PATCH 1/5] Fix gaps in phonetic recognition

Randomly, entire utterances yielded no phones with the phonetic recognizer.
The cause was a check for empty utterances that made sense for word
recognition, but not for phonetic recognition.
---
 rhubarb/src/recognition/pocketSphinxTools.cpp | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)
diff --git a/rhubarb/src/recognition/pocketSphinxTools.cpp b/rhubarb/src/recognition/pocketSphinxTools.cpp
index cdb91b5..d571cc5 100644
--- a/rhubarb/src/recognition/pocketSphinxTools.cpp
+++ b/rhubarb/src/recognition/pocketSphinxTools.cpp
@@ -219,9 +219,18 @@ BoundedTimeline<string> recognizeWords(const vector<int16_t>& audioBuffer, ps_de
 	BoundedTimeline<string> result(
 		TimeRange(0_cs, centiseconds(100 * audioBuffer.size() / sphinxSampleRate))
 	);
-	const bool noWordsRecognized = reinterpret_cast<ngram_search_t*>(decoder.search)->bpidx == 0;
-	if (noWordsRecognized) {
-		return result;
+	const bool phonetic = cmd_ln_boolean_r(decoder.config, "-allphone_ci");
+	if (!phonetic) {
+		// If the decoder is in word mode (as opposed to phonetic recognition), it expects each
+		// utterance to contain speech. If it doesn't, ps_seg_word() logs the annoying error
+		// "Couldn't find <s> in first frame".
+		// Not every utterance does contain speech, however. In this case, we exit early to prevent
+		// the log output.
+		// We *don't* do that in phonetic mode because here, the same code would omit valid phones.
+		const bool noWordsRecognized = reinterpret_cast<ngram_search_t*>(decoder.search)->bpidx == 0;
+		if (noWordsRecognized) {
+			return result;
+		}
 	}
 
 	// Collect words

From 21392d32cca653fde3f0f724d4aa7c7f6e750b27 Mon Sep 17 00:00:00 2001
From: Daniel Wolf <dwolf@dannad.de>
Date: Mon, 21 Jan 2019 22:26:39 +0100
Subject: [PATCH 2/5] Improve read performance for WAVE files

Fixes #58
---
 CHANGELOG.md                         | 1 +
 rhubarb/src/audio/WaveFileReader.cpp | 4 +++-
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index e5bd11e..b074763 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -3,6 +3,7 @@
 ## Unreleased
 
 * **Added** basic support for non-English recordings through phonetic recognition ([issue #45](https://github.com/DanielSWolf/rhubarb-lip-sync/issues/45)).
+* **Improved** processing speed for WAVE files ([issue #58](https://github.com/DanielSWolf/rhubarb-lip-sync/issues/58)).
 * **Fixed** a bug that resulted in unwanted mouth movement at beginning of a recording ([issue #53](https://github.com/DanielSWolf/rhubarb-lip-sync/issues/53)).
 * **Fixed** a bug that garbled special characters in the output file path ([issue #54](https://github.com/DanielSWolf/rhubarb-lip-sync/issues/54)).
 * **Fixed** a bug that prevented the progress bar from reaching 100% ([issue #48](https://github.com/DanielSWolf/rhubarb-lip-sync/issues/48)).
diff --git a/rhubarb/src/audio/WaveFileReader.cpp b/rhubarb/src/audio/WaveFileReader.cpp
index e8aa872..5e4ae58 100644
--- a/rhubarb/src/audio/WaveFileReader.cpp
+++ b/rhubarb/src/audio/WaveFileReader.cpp
@@ -192,7 +192,9 @@ SampleReader WaveFileReader::createUnsafeSampleReader() const {
 		](size_type index) mutable {
 		const std::streampos newFilePos = formatInfo.dataOffset
 			+ static_cast<std::streamoff>(index * formatInfo.bytesPerFrame);
-		file->seekg(newFilePos);
+		if (newFilePos != filePos) {
+			file->seekg(newFilePos);
+		}
 		const value_type result =
 			readSample(*file, formatInfo.sampleFormat, formatInfo.channelCount);
 		filePos = newFilePos + static_cast<std::streamoff>(formatInfo.bytesPerFrame);

From c8217885698efa5ac92ac00ffa3c5f6d56f165c0 Mon Sep 17 00:00:00 2001
From: Daniel Wolf <dwolf@dannad.de>
Date: Mon, 21 Jan 2019 22:32:31 +0100
Subject: [PATCH 3/5] Discard very short segments of voice activity

This prevents short flickers from false VAD positives.
This fixes a regression recently introduced in 2bbad258c0.
---
 rhubarb/src/audio/voiceActivityDetection.cpp | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/rhubarb/src/audio/voiceActivityDetection.cpp b/rhubarb/src/audio/voiceActivityDetection.cpp
index 2e5ae17..d6f96f5 100644
--- a/rhubarb/src/audio/voiceActivityDetection.cpp
+++ b/rhubarb/src/audio/voiceActivityDetection.cpp
@@ -75,6 +75,14 @@ JoiningBoundedTimeline<void> detectVoiceActivity(
 		}
 	}
 
+	// Discard very short segments of activity
+	const centiseconds minSegmentLength(5);
+	for (const auto& segment : activity) {
+		if (segment.getDuration() < minSegmentLength) {
+			activity.clear(segment.getTimeRange());
+		}
+	}
+
 	logging::debugFormat(
 		"Found {} sections of voice activity: {}",
 		activity.size(),

From 06b4855d6d0161197f6c5cd8d1fd682d69db45d7 Mon Sep 17 00:00:00 2001
From: Daniel Wolf <dwolf@dannad.de>
Date: Mon, 21 Jan 2019 22:33:34 +0100
Subject: [PATCH 4/5] Set helpful configuration settings for the phonetic
 recognizer

Copied from the PocketSphinx recognizer
---
 rhubarb/src/recognition/PhoneticRecognizer.cpp | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/rhubarb/src/recognition/PhoneticRecognizer.cpp b/rhubarb/src/recognition/PhoneticRecognizer.cpp
index dff1956..262ba48 100644
--- a/rhubarb/src/recognition/PhoneticRecognizer.cpp
+++ b/rhubarb/src/recognition/PhoneticRecognizer.cpp
@@ -25,6 +25,13 @@ static lambda_unique_ptr<ps_decoder_t> createDecoder(optional<std::string> dialo
 			// Low values (<= 0.4) can lead to fluttering animation.
 			// High values (>= 1.0) can lead to imprecise or freezing animation.
 			"-lw", "0.8",
+			// Add noise against zero silence
+			// (see http://cmusphinx.sourceforge.net/wiki/faq#qwhy_my_accuracy_is_poor)
+			"-dither", "yes",
+			// Disable VAD -- we're doing that ourselves
+			"-remove_silence", "no",
+			// Perform per-utterance cepstral mean normalization
+			"-cmn", "batch",
 
 			// The following settings are recommended at
 			// http://cmusphinx.sourceforge.net/wiki/phonemerecognition

From 8d958d09fb5a4ba3360df65e58e0807275fe027f Mon Sep 17 00:00:00 2001
From: Daniel Wolf <dwolf@dannad.de>
Date: Mon, 21 Jan 2019 22:48:16 +0100
Subject: [PATCH 5/5] Always export absolute audio file path in XML and JSON
 format

Fixes #59
---
 CHANGELOG.md                           | 1 +
 rhubarb/src/exporters/JsonExporter.cpp | 3 ++-
 rhubarb/src/exporters/XmlExporter.cpp  | 3 ++-
 3 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index b074763..a213908 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,7 @@
 * **Fixed** a bug that resulted in unwanted mouth movement at beginning of a recording ([issue #53](https://github.com/DanielSWolf/rhubarb-lip-sync/issues/53)).
 * **Fixed** a bug that garbled special characters in the output file path ([issue #54](https://github.com/DanielSWolf/rhubarb-lip-sync/issues/54)).
 * **Fixed** a bug that prevented the progress bar from reaching 100% ([issue #48](https://github.com/DanielSWolf/rhubarb-lip-sync/issues/48)).
+* **Fixed** file paths in exported XML and JSON files ([issue #59](https://github.com/DanielSWolf/rhubarb-lip-sync/issues/59)).
 
 ## Version 1.8.0
 
diff --git a/rhubarb/src/exporters/JsonExporter.cpp b/rhubarb/src/exporters/JsonExporter.cpp
index 208ee9b..1cd3c50 100644
--- a/rhubarb/src/exporters/JsonExporter.cpp
+++ b/rhubarb/src/exporters/JsonExporter.cpp
@@ -1,6 +1,7 @@
 #include "JsonExporter.h"
 #include "exporterTools.h"
 #include "tools/stringTools.h"
+#include <boost/filesystem.hpp>
 
 using std::string;
 
@@ -10,7 +11,7 @@ void JsonExporter::exportAnimation(const ExporterInput& input, std::ostream& out
 	// the formatting.
 	outputStream << "{\n";
 	outputStream << "  \"metadata\": {\n";
-	outputStream << "    \"soundFile\": \"" << escapeJsonString(input.inputFilePath.string()) << "\",\n";
+	outputStream << "    \"soundFile\": \"" << escapeJsonString(absolute(input.inputFilePath).string()) << "\",\n";
 	outputStream << "    \"duration\": " << formatDuration(input.animation.getRange().getDuration()) << "\n";
 	outputStream << "  },\n";
 	outputStream << "  \"mouthCues\": [\n";
diff --git a/rhubarb/src/exporters/XmlExporter.cpp b/rhubarb/src/exporters/XmlExporter.cpp
index 479b401..763f473 100644
--- a/rhubarb/src/exporters/XmlExporter.cpp
+++ b/rhubarb/src/exporters/XmlExporter.cpp
@@ -3,6 +3,7 @@
 #include <boost/property_tree/xml_parser.hpp>
 #include <boost/version.hpp>
 #include "exporterTools.h"
+#include <boost/filesystem.hpp>
 
 using std::string;
 using boost::property_tree::ptree;
@@ -11,7 +12,7 @@ void XmlExporter::exportAnimation(const ExporterInput& input, std::ostream& outp
 	ptree tree;
 
 	// Add metadata
-	tree.put("rhubarbResult.metadata.soundFile", input.inputFilePath.string());
+	tree.put("rhubarbResult.metadata.soundFile", absolute(input.inputFilePath).string());
 	tree.put(
 		"rhubarbResult.metadata.duration",
 		formatDuration(input.animation.getRange().getDuration())