diff --git a/CHANGELOG.md b/CHANGELOG.md index e5bd11e..a213908 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,9 +3,11 @@ ## Unreleased * **Added** basic support for non-English recordings through phonetic recognition ([issue #45](https://github.com/DanielSWolf/rhubarb-lip-sync/issues/45)). +* **Improved** processing speed for WAVE files ([issue #58](https://github.com/DanielSWolf/rhubarb-lip-sync/issues/58)). * **Fixed** a bug that resulted in unwanted mouth movement at beginning of a recording ([issue #53](https://github.com/DanielSWolf/rhubarb-lip-sync/issues/53)). * **Fixed** a bug that garbled special characters in the output file path ([issue #54](https://github.com/DanielSWolf/rhubarb-lip-sync/issues/54)). * **Fixed** a bug that prevented the progress bar from reaching 100% ([issue #48](https://github.com/DanielSWolf/rhubarb-lip-sync/issues/48)). +* **Fixed** file paths in exported XML and JSON files ([issue #59](https://github.com/DanielSWolf/rhubarb-lip-sync/issues/59)). ## Version 1.8.0 diff --git a/rhubarb/src/audio/WaveFileReader.cpp b/rhubarb/src/audio/WaveFileReader.cpp index e8aa872..5e4ae58 100644 --- a/rhubarb/src/audio/WaveFileReader.cpp +++ b/rhubarb/src/audio/WaveFileReader.cpp @@ -192,7 +192,9 @@ SampleReader WaveFileReader::createUnsafeSampleReader() const { ](size_type index) mutable { const std::streampos newFilePos = formatInfo.dataOffset + static_cast(index * formatInfo.bytesPerFrame); - file->seekg(newFilePos); + if (newFilePos != filePos) { + file->seekg(newFilePos); + } const value_type result = readSample(*file, formatInfo.sampleFormat, formatInfo.channelCount); filePos = newFilePos + static_cast(formatInfo.bytesPerFrame); diff --git a/rhubarb/src/audio/voiceActivityDetection.cpp b/rhubarb/src/audio/voiceActivityDetection.cpp index 2e5ae17..d6f96f5 100644 --- a/rhubarb/src/audio/voiceActivityDetection.cpp +++ b/rhubarb/src/audio/voiceActivityDetection.cpp @@ -75,6 +75,14 @@ JoiningBoundedTimeline detectVoiceActivity( } } + // 
Discard very short segments of activity + const centiseconds minSegmentLength(5); + for (const auto& segment : activity) { + if (segment.getDuration() < minSegmentLength) { + activity.clear(segment.getTimeRange()); + } + } + logging::debugFormat( "Found {} sections of voice activity: {}", activity.size(), diff --git a/rhubarb/src/exporters/JsonExporter.cpp b/rhubarb/src/exporters/JsonExporter.cpp index 208ee9b..1cd3c50 100644 --- a/rhubarb/src/exporters/JsonExporter.cpp +++ b/rhubarb/src/exporters/JsonExporter.cpp @@ -1,6 +1,7 @@ #include "JsonExporter.h" #include "exporterTools.h" #include "tools/stringTools.h" +#include using std::string; @@ -10,7 +11,7 @@ void JsonExporter::exportAnimation(const ExporterInput& input, std::ostream& out // the formatting. outputStream << "{\n"; outputStream << " \"metadata\": {\n"; - outputStream << " \"soundFile\": \"" << escapeJsonString(input.inputFilePath.string()) << "\",\n"; + outputStream << " \"soundFile\": \"" << escapeJsonString(absolute(input.inputFilePath).string()) << "\",\n"; outputStream << " \"duration\": " << formatDuration(input.animation.getRange().getDuration()) << "\n"; outputStream << " },\n"; outputStream << " \"mouthCues\": [\n"; diff --git a/rhubarb/src/exporters/XmlExporter.cpp b/rhubarb/src/exporters/XmlExporter.cpp index 479b401..763f473 100644 --- a/rhubarb/src/exporters/XmlExporter.cpp +++ b/rhubarb/src/exporters/XmlExporter.cpp @@ -3,6 +3,7 @@ #include #include #include "exporterTools.h" +#include using std::string; using boost::property_tree::ptree; @@ -11,7 +12,7 @@ void XmlExporter::exportAnimation(const ExporterInput& input, std::ostream& outp ptree tree; // Add metadata - tree.put("rhubarbResult.metadata.soundFile", input.inputFilePath.string()); + tree.put("rhubarbResult.metadata.soundFile", absolute(input.inputFilePath).string()); tree.put( "rhubarbResult.metadata.duration", formatDuration(input.animation.getRange().getDuration()) diff --git a/rhubarb/src/recognition/PhoneticRecognizer.cpp 
b/rhubarb/src/recognition/PhoneticRecognizer.cpp index dff1956..262ba48 100644 --- a/rhubarb/src/recognition/PhoneticRecognizer.cpp +++ b/rhubarb/src/recognition/PhoneticRecognizer.cpp @@ -25,6 +25,13 @@ static lambda_unique_ptr createDecoder(optional dialo // Low values (<= 0.4) can lead to fluttering animation. // High values (>= 1.0) can lead to imprecise or freezing animation. "-lw", "0.8", + // Add noise against zero silence + // (see http://cmusphinx.sourceforge.net/wiki/faq#qwhy_my_accuracy_is_poor) + "-dither", "yes", + // Disable VAD -- we're doing that ourselves + "-remove_silence", "no", + // Perform per-utterance cepstral mean normalization + "-cmn", "batch", // The following settings are recommended at // http://cmusphinx.sourceforge.net/wiki/phonemerecognition diff --git a/rhubarb/src/recognition/pocketSphinxTools.cpp b/rhubarb/src/recognition/pocketSphinxTools.cpp index cdb91b5..d571cc5 100644 --- a/rhubarb/src/recognition/pocketSphinxTools.cpp +++ b/rhubarb/src/recognition/pocketSphinxTools.cpp @@ -219,9 +219,18 @@ BoundedTimeline recognizeWords(const vector& audioBuffer, ps_de BoundedTimeline result( TimeRange(0_cs, centiseconds(100 * audioBuffer.size() / sphinxSampleRate)) ); - const bool noWordsRecognized = reinterpret_cast(decoder.search)->bpidx == 0; - if (noWordsRecognized) { - return result; + const bool phonetic = cmd_ln_boolean_r(decoder.config, "-allphone_ci"); + if (!phonetic) { + // If the decoder is in word mode (as opposed to phonetic recognition), it expects each + // utterance to contain speech. If it doesn't, ps_seg_word() logs the annoying error + // "Couldn't find <s> in first frame". + // Not every utterance does contain speech, however. In this case, we exit early to prevent + // the log output. + // We *don't* do that in phonetic mode because here, the same code would omit valid phones. + const bool noWordsRecognized = reinterpret_cast(decoder.search)->bpidx == 0; + if (noWordsRecognized) { + return result; + } } // Collect words