Merge pull request #61 from DanielSWolf/bugfix/misc
Multiple small fixes and improvements
This commit is contained in:
commit
f55bcebf73
|
@ -3,9 +3,11 @@
|
||||||
## Unreleased
|
## Unreleased
|
||||||
|
|
||||||
* **Added** basic support for non-English recordings through phonetic recognition ([issue #45](https://github.com/DanielSWolf/rhubarb-lip-sync/issues/45)).
|
* **Added** basic support for non-English recordings through phonetic recognition ([issue #45](https://github.com/DanielSWolf/rhubarb-lip-sync/issues/45)).
|
||||||
|
* **Improved** processing speed for WAVE files ([issue #58](https://github.com/DanielSWolf/rhubarb-lip-sync/issues/58)).
|
||||||
* **Fixed** a bug that resulted in unwanted mouth movement at the beginning of a recording ([issue #53](https://github.com/DanielSWolf/rhubarb-lip-sync/issues/53)).
|
* **Fixed** a bug that resulted in unwanted mouth movement at the beginning of a recording ([issue #53](https://github.com/DanielSWolf/rhubarb-lip-sync/issues/53)).
|
||||||
* **Fixed** a bug that garbled special characters in the output file path ([issue #54](https://github.com/DanielSWolf/rhubarb-lip-sync/issues/54)).
|
* **Fixed** a bug that garbled special characters in the output file path ([issue #54](https://github.com/DanielSWolf/rhubarb-lip-sync/issues/54)).
|
||||||
* **Fixed** a bug that prevented the progress bar from reaching 100% ([issue #48](https://github.com/DanielSWolf/rhubarb-lip-sync/issues/48)).
|
* **Fixed** a bug that prevented the progress bar from reaching 100% ([issue #48](https://github.com/DanielSWolf/rhubarb-lip-sync/issues/48)).
|
||||||
|
* **Fixed** file paths in exported XML and JSON files ([issue #59](https://github.com/DanielSWolf/rhubarb-lip-sync/issues/59)).
|
||||||
|
|
||||||
## Version 1.8.0
|
## Version 1.8.0
|
||||||
|
|
||||||
|
|
|
@ -192,7 +192,9 @@ SampleReader WaveFileReader::createUnsafeSampleReader() const {
|
||||||
](size_type index) mutable {
|
](size_type index) mutable {
|
||||||
const std::streampos newFilePos = formatInfo.dataOffset
|
const std::streampos newFilePos = formatInfo.dataOffset
|
||||||
+ static_cast<std::streamoff>(index * formatInfo.bytesPerFrame);
|
+ static_cast<std::streamoff>(index * formatInfo.bytesPerFrame);
|
||||||
file->seekg(newFilePos);
|
if (newFilePos != filePos) {
|
||||||
|
file->seekg(newFilePos);
|
||||||
|
}
|
||||||
const value_type result =
|
const value_type result =
|
||||||
readSample(*file, formatInfo.sampleFormat, formatInfo.channelCount);
|
readSample(*file, formatInfo.sampleFormat, formatInfo.channelCount);
|
||||||
filePos = newFilePos + static_cast<std::streamoff>(formatInfo.bytesPerFrame);
|
filePos = newFilePos + static_cast<std::streamoff>(formatInfo.bytesPerFrame);
|
||||||
|
|
|
@ -75,6 +75,14 @@ JoiningBoundedTimeline<void> detectVoiceActivity(
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Discard very short segments of activity
|
||||||
|
const centiseconds minSegmentLength(5);
|
||||||
|
for (const auto& segment : activity) {
|
||||||
|
if (segment.getDuration() < minSegmentLength) {
|
||||||
|
activity.clear(segment.getTimeRange());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
logging::debugFormat(
|
logging::debugFormat(
|
||||||
"Found {} sections of voice activity: {}",
|
"Found {} sections of voice activity: {}",
|
||||||
activity.size(),
|
activity.size(),
|
||||||
|
|
|
@ -1,6 +1,7 @@
|
||||||
#include "JsonExporter.h"
|
#include "JsonExporter.h"
|
||||||
#include "exporterTools.h"
|
#include "exporterTools.h"
|
||||||
#include "tools/stringTools.h"
|
#include "tools/stringTools.h"
|
||||||
|
#include <boost/filesystem.hpp>
|
||||||
|
|
||||||
using std::string;
|
using std::string;
|
||||||
|
|
||||||
|
@ -10,7 +11,7 @@ void JsonExporter::exportAnimation(const ExporterInput& input, std::ostream& out
|
||||||
// the formatting.
|
// the formatting.
|
||||||
outputStream << "{\n";
|
outputStream << "{\n";
|
||||||
outputStream << " \"metadata\": {\n";
|
outputStream << " \"metadata\": {\n";
|
||||||
outputStream << " \"soundFile\": \"" << escapeJsonString(input.inputFilePath.string()) << "\",\n";
|
outputStream << " \"soundFile\": \"" << escapeJsonString(absolute(input.inputFilePath).string()) << "\",\n";
|
||||||
outputStream << " \"duration\": " << formatDuration(input.animation.getRange().getDuration()) << "\n";
|
outputStream << " \"duration\": " << formatDuration(input.animation.getRange().getDuration()) << "\n";
|
||||||
outputStream << " },\n";
|
outputStream << " },\n";
|
||||||
outputStream << " \"mouthCues\": [\n";
|
outputStream << " \"mouthCues\": [\n";
|
||||||
|
|
|
@ -3,6 +3,7 @@
|
||||||
#include <boost/property_tree/xml_parser.hpp>
|
#include <boost/property_tree/xml_parser.hpp>
|
||||||
#include <boost/version.hpp>
|
#include <boost/version.hpp>
|
||||||
#include "exporterTools.h"
|
#include "exporterTools.h"
|
||||||
|
#include <boost/filesystem.hpp>
|
||||||
|
|
||||||
using std::string;
|
using std::string;
|
||||||
using boost::property_tree::ptree;
|
using boost::property_tree::ptree;
|
||||||
|
@ -11,7 +12,7 @@ void XmlExporter::exportAnimation(const ExporterInput& input, std::ostream& outp
|
||||||
ptree tree;
|
ptree tree;
|
||||||
|
|
||||||
// Add metadata
|
// Add metadata
|
||||||
tree.put("rhubarbResult.metadata.soundFile", input.inputFilePath.string());
|
tree.put("rhubarbResult.metadata.soundFile", absolute(input.inputFilePath).string());
|
||||||
tree.put(
|
tree.put(
|
||||||
"rhubarbResult.metadata.duration",
|
"rhubarbResult.metadata.duration",
|
||||||
formatDuration(input.animation.getRange().getDuration())
|
formatDuration(input.animation.getRange().getDuration())
|
||||||
|
|
|
@ -25,6 +25,13 @@ static lambda_unique_ptr<ps_decoder_t> createDecoder(optional<std::string> dialo
|
||||||
// Low values (<= 0.4) can lead to fluttering animation.
|
// Low values (<= 0.4) can lead to fluttering animation.
|
||||||
// High values (>= 1.0) can lead to imprecise or freezing animation.
|
// High values (>= 1.0) can lead to imprecise or freezing animation.
|
||||||
"-lw", "0.8",
|
"-lw", "0.8",
|
||||||
|
// Add noise against zero silence
|
||||||
|
// (see http://cmusphinx.sourceforge.net/wiki/faq#qwhy_my_accuracy_is_poor)
|
||||||
|
"-dither", "yes",
|
||||||
|
// Disable VAD -- we're doing that ourselves
|
||||||
|
"-remove_silence", "no",
|
||||||
|
// Perform per-utterance cepstral mean normalization
|
||||||
|
"-cmn", "batch",
|
||||||
|
|
||||||
// The following settings are recommended at
|
// The following settings are recommended at
|
||||||
// http://cmusphinx.sourceforge.net/wiki/phonemerecognition
|
// http://cmusphinx.sourceforge.net/wiki/phonemerecognition
|
||||||
|
|
|
@ -219,9 +219,18 @@ BoundedTimeline<string> recognizeWords(const vector<int16_t>& audioBuffer, ps_de
|
||||||
BoundedTimeline<string> result(
|
BoundedTimeline<string> result(
|
||||||
TimeRange(0_cs, centiseconds(100 * audioBuffer.size() / sphinxSampleRate))
|
TimeRange(0_cs, centiseconds(100 * audioBuffer.size() / sphinxSampleRate))
|
||||||
);
|
);
|
||||||
const bool noWordsRecognized = reinterpret_cast<ngram_search_t*>(decoder.search)->bpidx == 0;
|
const bool phonetic = cmd_ln_boolean_r(decoder.config, "-allphone_ci");
|
||||||
if (noWordsRecognized) {
|
if (!phonetic) {
|
||||||
return result;
|
// If the decoder is in word mode (as opposed to phonetic recognition), it expects each
|
||||||
|
// utterance to contain speech. If it doesn't, ps_seg_word() logs the annoying error
|
||||||
|
// "Couldn't find <s> in first frame".
|
||||||
|
// Not every utterance does contain speech, however. In this case, we exit early to prevent
|
||||||
|
// the log output.
|
||||||
|
// We *don't* do that in phonetic mode because here, the same code would omit valid phones.
|
||||||
|
const bool noWordsRecognized = reinterpret_cast<ngram_search_t*>(decoder.search)->bpidx == 0;
|
||||||
|
if (noWordsRecognized) {
|
||||||
|
return result;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Collect words
|
// Collect words
|
||||||
|
|
Loading…
Reference in New Issue