From f3d4cfbb31932031c6d82fc179e95704bae3e82d Mon Sep 17 00:00:00 2001 From: Daniel Wolf Date: Fri, 18 Jan 2019 22:29:26 +0100 Subject: [PATCH 1/5] Fix gaps in phonetic recognition Randomly, entire utterances yielded no phones with the phonetic recognizer. The cause was a check for empty utterances that made sense for word recognition, but not for phonetic recognition. --- rhubarb/src/recognition/pocketSphinxTools.cpp | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/rhubarb/src/recognition/pocketSphinxTools.cpp b/rhubarb/src/recognition/pocketSphinxTools.cpp index cdb91b5..d571cc5 100644 --- a/rhubarb/src/recognition/pocketSphinxTools.cpp +++ b/rhubarb/src/recognition/pocketSphinxTools.cpp @@ -219,9 +219,18 @@ BoundedTimeline recognizeWords(const vector& audioBuffer, ps_de BoundedTimeline result( TimeRange(0_cs, centiseconds(100 * audioBuffer.size() / sphinxSampleRate)) ); - const bool noWordsRecognized = reinterpret_cast(decoder.search)->bpidx == 0; - if (noWordsRecognized) { - return result; + const bool phonetic = cmd_ln_boolean_r(decoder.config, "-allphone_ci"); + if (!phonetic) { + // If the decoder is in word mode (as opposed to phonetic recognition), it expects each + // utterance to contain speech. If it doesn't, ps_seg_word() logs the annoying error + // "Couldn't find in first frame". + // Not every utterance does contain speech, however. In this case, we exit early to prevent + // the log output. + // We *don't* do that in phonetic mode because here, the same code would omit valid phones. 
+ const bool noWordsRecognized = reinterpret_cast(decoder.search)->bpidx == 0; + if (noWordsRecognized) { + return result; + } } // Collect words From 21392d32cca653fde3f0f724d4aa7c7f6e750b27 Mon Sep 17 00:00:00 2001 From: Daniel Wolf Date: Mon, 21 Jan 2019 22:26:39 +0100 Subject: [PATCH 2/5] Improve read performance for WAVE files Fixes #58 --- CHANGELOG.md | 1 + rhubarb/src/audio/WaveFileReader.cpp | 4 +++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e5bd11e..b074763 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,7 @@ ## Unreleased * **Added** basic support for non-English recordings through phonetic recognition ([issue #45](https://github.com/DanielSWolf/rhubarb-lip-sync/issues/45)). +* **Improved** processing speed for WAVE files ([issue #58](https://github.com/DanielSWolf/rhubarb-lip-sync/issues/58)). * **Fixed** a bug that resulted in unwanted mouth movement at beginning of a recording ([issue #53](https://github.com/DanielSWolf/rhubarb-lip-sync/issues/53)). * **Fixed** a bug that garbled special characters in the output file path ([issue #54](https://github.com/DanielSWolf/rhubarb-lip-sync/issues/54)). * **Fixed** a bug that prevented the progress bar from reaching 100% ([issue #48](https://github.com/DanielSWolf/rhubarb-lip-sync/issues/48)). 
diff --git a/rhubarb/src/audio/WaveFileReader.cpp b/rhubarb/src/audio/WaveFileReader.cpp index e8aa872..5e4ae58 100644 --- a/rhubarb/src/audio/WaveFileReader.cpp +++ b/rhubarb/src/audio/WaveFileReader.cpp @@ -192,7 +192,9 @@ SampleReader WaveFileReader::createUnsafeSampleReader() const { ](size_type index) mutable { const std::streampos newFilePos = formatInfo.dataOffset + static_cast(index * formatInfo.bytesPerFrame); - file->seekg(newFilePos); + if (newFilePos != filePos) { + file->seekg(newFilePos); + } const value_type result = readSample(*file, formatInfo.sampleFormat, formatInfo.channelCount); filePos = newFilePos + static_cast(formatInfo.bytesPerFrame); From c8217885698efa5ac92ac00ffa3c5f6d56f165c0 Mon Sep 17 00:00:00 2001 From: Daniel Wolf Date: Mon, 21 Jan 2019 22:32:31 +0100 Subject: [PATCH 3/5] Discard very short segments of voice activity This prevents short flickers from false VAD positives. This fixes a regression recently introduced in 2bbad258c0. --- rhubarb/src/audio/voiceActivityDetection.cpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/rhubarb/src/audio/voiceActivityDetection.cpp b/rhubarb/src/audio/voiceActivityDetection.cpp index 2e5ae17..d6f96f5 100644 --- a/rhubarb/src/audio/voiceActivityDetection.cpp +++ b/rhubarb/src/audio/voiceActivityDetection.cpp @@ -75,6 +75,14 @@ JoiningBoundedTimeline detectVoiceActivity( } } + // Discard very short segments of activity + const centiseconds minSegmentLength(5); + for (const auto& segment : activity) { + if (segment.getDuration() < minSegmentLength) { + activity.clear(segment.getTimeRange()); + } + } + logging::debugFormat( "Found {} sections of voice activity: {}", activity.size(), From 06b4855d6d0161197f6c5cd8d1fd682d69db45d7 Mon Sep 17 00:00:00 2001 From: Daniel Wolf Date: Mon, 21 Jan 2019 22:33:34 +0100 Subject: [PATCH 4/5] Set helpful configuration settings for the phonetic recognizer Copied from the PocketSphinx recognizer --- rhubarb/src/recognition/PhoneticRecognizer.cpp | 7 
+++++++ 1 file changed, 7 insertions(+) diff --git a/rhubarb/src/recognition/PhoneticRecognizer.cpp b/rhubarb/src/recognition/PhoneticRecognizer.cpp index dff1956..262ba48 100644 --- a/rhubarb/src/recognition/PhoneticRecognizer.cpp +++ b/rhubarb/src/recognition/PhoneticRecognizer.cpp @@ -25,6 +25,13 @@ static lambda_unique_ptr createDecoder(optional dialo // Low values (<= 0.4) can lead to fluttering animation. // High values (>= 1.0) can lead to imprecise or freezing animation. "-lw", "0.8", + // Add noise against zero silence + // (see http://cmusphinx.sourceforge.net/wiki/faq#qwhy_my_accuracy_is_poor) + "-dither", "yes", + // Disable VAD -- we're doing that ourselves + "-remove_silence", "no", + // Perform per-utterance cepstral mean normalization + "-cmn", "batch", // The following settings are recommended at // http://cmusphinx.sourceforge.net/wiki/phonemerecognition From 8d958d09fb5a4ba3360df65e58e0807275fe027f Mon Sep 17 00:00:00 2001 From: Daniel Wolf Date: Mon, 21 Jan 2019 22:48:16 +0100 Subject: [PATCH 5/5] Always export absolute audio file path in XML and JSON format Fixes #59 --- CHANGELOG.md | 1 + rhubarb/src/exporters/JsonExporter.cpp | 3 ++- rhubarb/src/exporters/XmlExporter.cpp | 3 ++- 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b074763..a213908 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,7 @@ * **Fixed** a bug that resulted in unwanted mouth movement at beginning of a recording ([issue #53](https://github.com/DanielSWolf/rhubarb-lip-sync/issues/53)). * **Fixed** a bug that garbled special characters in the output file path ([issue #54](https://github.com/DanielSWolf/rhubarb-lip-sync/issues/54)). * **Fixed** a bug that prevented the progress bar from reaching 100% ([issue #48](https://github.com/DanielSWolf/rhubarb-lip-sync/issues/48)). +* **Fixed** file paths in exported XML and JSON files ([issue #59](https://github.com/DanielSWolf/rhubarb-lip-sync/issues/59)). 
## Version 1.8.0 diff --git a/rhubarb/src/exporters/JsonExporter.cpp b/rhubarb/src/exporters/JsonExporter.cpp index 208ee9b..1cd3c50 100644 --- a/rhubarb/src/exporters/JsonExporter.cpp +++ b/rhubarb/src/exporters/JsonExporter.cpp @@ -1,6 +1,7 @@ #include "JsonExporter.h" #include "exporterTools.h" #include "tools/stringTools.h" +#include using std::string; @@ -10,7 +11,7 @@ void JsonExporter::exportAnimation(const ExporterInput& input, std::ostream& out // the formatting. outputStream << "{\n"; outputStream << " \"metadata\": {\n"; - outputStream << " \"soundFile\": \"" << escapeJsonString(input.inputFilePath.string()) << "\",\n"; + outputStream << " \"soundFile\": \"" << escapeJsonString(absolute(input.inputFilePath).string()) << "\",\n"; outputStream << " \"duration\": " << formatDuration(input.animation.getRange().getDuration()) << "\n"; outputStream << " },\n"; outputStream << " \"mouthCues\": [\n"; diff --git a/rhubarb/src/exporters/XmlExporter.cpp b/rhubarb/src/exporters/XmlExporter.cpp index 479b401..763f473 100644 --- a/rhubarb/src/exporters/XmlExporter.cpp +++ b/rhubarb/src/exporters/XmlExporter.cpp @@ -3,6 +3,7 @@ #include #include #include "exporterTools.h" +#include using std::string; using boost::property_tree::ptree; @@ -11,7 +12,7 @@ void XmlExporter::exportAnimation(const ExporterInput& input, std::ostream& outp ptree tree; // Add metadata - tree.put("rhubarbResult.metadata.soundFile", input.inputFilePath.string()); + tree.put("rhubarbResult.metadata.soundFile", absolute(input.inputFilePath).string()); tree.put( "rhubarbResult.metadata.duration", formatDuration(input.animation.getRange().getDuration())