diff --git a/CHANGELOG.md b/CHANGELOG.md
index d4e388b..bf8adfc 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -3,6 +3,7 @@
 ## Unreleased
 
 * **Added** basic support for non-English recordings through phonetic recognition ([issue #45](https://github.com/DanielSWolf/rhubarb-lip-sync/issues/45)).
+* **Fixed** a bug that resulted in unwanted mouth movement at the beginning of a recording ([issue #53](https://github.com/DanielSWolf/rhubarb-lip-sync/issues/53)).
 * **Fixed** a bug that prevented the progress bar from reaching 100% ([issue #48](https://github.com/DanielSWolf/rhubarb-lip-sync/issues/48)).
 
 ## Version 1.8.0
diff --git a/rhubarb/resharper.DotSettings b/rhubarb/resharper.DotSettings
index 103065f..b695093 100644
--- a/rhubarb/resharper.DotSettings
+++ b/rhubarb/resharper.DotSettings
@@ -212,6 +212,7 @@
 	True
 	True
 	True
+	True
 	True
 	True
 	True
diff --git a/rhubarb/src/audio/voiceActivityDetection.cpp b/rhubarb/src/audio/voiceActivityDetection.cpp
index 990ae88..2e5ae17 100644
--- a/rhubarb/src/audio/voiceActivityDetection.cpp
+++ b/rhubarb/src/audio/voiceActivityDetection.cpp
@@ -8,7 +8,7 @@
 #include "processing.h"
 #include <gsl_util.h>
 #include "tools/parallel.h"
-#include "AudioSegment.h"
+#include <webrtc/common_audio/vad/vad_core.h>
 
 using std::vector;
 using boost::adaptors::transformed;
@@ -16,124 +16,65 @@ using fmt::format;
 using std::runtime_error;
 using std::unique_ptr;
 
-JoiningBoundedTimeline<void> webRtcDetectVoiceActivity(
-	const AudioClip& audioClip,
+JoiningBoundedTimeline<void> detectVoiceActivity(
+	const AudioClip& inputAudioClip,
 	ProgressSink& progressSink
 ) {
+	// Prepare audio for VAD
+	constexpr int webRtcSamplingRate = 8000;
+	const unique_ptr<AudioClip> audioClip = inputAudioClip.clone()
+		| resample(webRtcSamplingRate)
+		| removeDcOffset();
+
 	VadInst* vadHandle = WebRtcVad_Create();
 	if (!vadHandle) throw runtime_error("Error creating WebRTC VAD handle.");
 
 	auto freeHandle = gsl::finally([&]() {
 		WebRtcVad_Free(vadHandle);
 	});
 
 	int error = WebRtcVad_Init(vadHandle);
-	if (error) throw runtime_error("Error initializing WebRTC VAD handle.");
+	if (error) throw runtime_error("Error initializing WebRTC VAD.");
 
 	const int aggressiveness = 2; // 0..3. The higher, the more is cut off.
 	error = WebRtcVad_set_mode(vadHandle, aggressiveness);
 	if (error) throw runtime_error("Error setting WebRTC VAD aggressiveness.");
 
-	ProgressMerger progressMerger(progressSink);
-	ProgressSink& pass1ProgressSink = progressMerger.addSource("VAD pass 1", 1.0);
-	ProgressSink& pass2ProgressSink = progressMerger.addSource("VAD pass 2", 0.3);
-
 	// Detect activity
-	JoiningBoundedTimeline<void> activity(audioClip.getTruncatedRange());
+	JoiningBoundedTimeline<void> activity(audioClip->getTruncatedRange());
 	centiseconds time = 0_cs;
-	const size_t bufferCapacity = audioClip.getSampleRate() / 100;
+	const size_t frameSize = webRtcSamplingRate / 100;
 	const auto processBuffer = [&](const vector<int16_t>& buffer) {
 		// WebRTC is picky regarding buffer size
-		if (buffer.size() < bufferCapacity) return;
+		if (buffer.size() < frameSize) return;
 
 		const int result = WebRtcVad_Process(
 			vadHandle,
-			audioClip.getSampleRate(),
+			webRtcSamplingRate,
 			buffer.data(),
 			buffer.size()
-		) == 1;
+		);
 		if (result == -1) throw runtime_error("Error processing audio buffer using WebRTC VAD.");
-		const bool isActive = result != 0;
+
+		// Ignore the result of WebRtcVad_Process; instead, directly interpret the internal VAD
+		// flag. The result of WebRtcVad_Process stays 1 for a number of frames after the last
+		// detected activity.
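+		// Note: VadInstT is WebRTC's internal VAD state (declared in vad_core.h above), so
+		// reading its vad flag relies on implementation details rather than the public VAD API.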
+		const bool isActive = reinterpret_cast<VadInstT*>(vadHandle)->vad == 1;
+
 		if (isActive) {
 			activity.set(time, time + 1_cs);
 		}
+
 		time += 1_cs;
 	};
-	process16bitAudioClip(audioClip, processBuffer, bufferCapacity, pass1ProgressSink);
-
-	// WebRTC adapts to the audio. This means results may not be correct at the very beginning.
-	// It sometimes returns false activity at the very beginning, mistaking the background noise for
-	// speech.
-	// So we delete the first recognized utterance and re-process the corresponding audio segment.
-	if (!activity.empty()) {
-		TimeRange firstActivity = activity.begin()->getTimeRange();
-		activity.clear(firstActivity);
-		const unique_ptr<AudioClip> streamStart = audioClip.clone()
-			| segment(TimeRange(0_cs, firstActivity.getEnd()));
-		time = 0_cs;
-		process16bitAudioClip(*streamStart, processBuffer, bufferCapacity, pass2ProgressSink);
-	}
-
-	return activity;
-}
-
-JoiningBoundedTimeline<void> detectVoiceActivity(
-	const AudioClip& inputAudioClip,
-	int maxThreadCount,
-	ProgressSink& progressSink
-) {
-	// Prepare audio for VAD
-	const unique_ptr<AudioClip> audioClip = inputAudioClip.clone()
-		| resample(16000)
-		| removeDcOffset();
-
-	JoiningBoundedTimeline<void> activity(audioClip->getTruncatedRange());
-	std::mutex activityMutex;
-
-	// Split audio into segments and perform parallel VAD
-	const int segmentCount = maxThreadCount;
-	const centiseconds audioDuration = audioClip->getTruncatedRange().getDuration();
-	vector<TimeRange> audioSegments;
-	for (int i = 0; i < segmentCount; ++i) {
-		TimeRange segmentRange = TimeRange(
-			i * audioDuration / segmentCount,
-			(i + 1) * audioDuration / segmentCount
-		);
-		audioSegments.push_back(segmentRange);
-	}
-	runParallel(
-		"VAD",
-		[&](const TimeRange& segmentRange, ProgressSink& segmentProgressSink) {
-			const unique_ptr<AudioClip> audioSegment = audioClip->clone() | segment(segmentRange);
-			JoiningBoundedTimeline<void> activitySegment =
-				webRtcDetectVoiceActivity(*audioSegment, segmentProgressSink);
-
-			std::lock_guard<std::mutex> lock(activityMutex);
-			for (auto activityRange : activitySegment) {
-				activityRange.getTimeRange().shift(segmentRange.getStart());
-				activity.set(activityRange);
-			}
-		},
-		audioSegments,
-		segmentCount,
-		progressSink
-	);
+	process16bitAudioClip(*audioClip, processBuffer, frameSize, progressSink);
 
 	// Fill small gaps in activity
-	const centiseconds maxGap(5);
+	const centiseconds maxGap(10);
 	for (const auto& pair : getPairs(activity)) {
 		if (pair.second.getStart() - pair.first.getEnd() <= maxGap) {
 			activity.set(pair.first.getEnd(), pair.second.getStart());
 		}
 	}
 
-	// Shorten activities. WebRTC adds a bit of buffer at the end.
-	const centiseconds tail(5);
-	for (const auto& utterance : JoiningBoundedTimeline<void>(activity)) {
-		if (utterance.getDuration() > tail && utterance.getEnd() < audioDuration) {
-			activity.clear(utterance.getEnd() - tail, utterance.getEnd());
-		}
-	}
-
 	logging::debugFormat(
 		"Found {} sections of voice activity: {}",
 		activity.size(),
diff --git a/rhubarb/src/audio/voiceActivityDetection.h b/rhubarb/src/audio/voiceActivityDetection.h
index 02658e0..4e85f9f 100644
--- a/rhubarb/src/audio/voiceActivityDetection.h
+++ b/rhubarb/src/audio/voiceActivityDetection.h
@@ -5,6 +5,5 @@
 
 JoiningBoundedTimeline<void> detectVoiceActivity(
 	const AudioClip& audioClip,
-	int maxThreadCount,
 	ProgressSink& progressSink
 );
diff --git a/rhubarb/src/recognition/pocketSphinxTools.cpp b/rhubarb/src/recognition/pocketSphinxTools.cpp
index 4e9c117..cdb91b5 100644
--- a/rhubarb/src/recognition/pocketSphinxTools.cpp
+++ b/rhubarb/src/recognition/pocketSphinxTools.cpp
@@ -102,7 +102,7 @@ BoundedTimeline<Phone> recognizePhones(
 	// Split audio into utterances
 	JoiningBoundedTimeline<void> utterances;
 	try {
-		utterances = detectVoiceActivity(*audioClip, maxThreadCount, voiceActivationProgressSink);
+		utterances = detectVoiceActivity(*audioClip, voiceActivationProgressSink);
 	} catch (...) {
 		std::throw_with_nested(runtime_error("Error detecting segments of speech."));
 	}
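Reviewer note (not part of the patch): with the two-pass re-processing, the parallel segmentation, and the tail-shortening loop all removed, the only post-processing that remains is the gap-filling step, whose `maxGap` grows from 5 cs to 10 cs. The sketch below illustrates that merge logic in isolation; `Interval` and `fillSmallGaps` are hypothetical stand-ins for the project's `JoiningBoundedTimeline`/`getPairs` machinery, using plain integer centiseconds.

```cpp
#include <cstdint>
#include <iostream>
#include <vector>

// One detected stretch of voice activity, in centiseconds.
struct Interval {
	int64_t start;
	int64_t end;
};

// Merges adjacent intervals whose gap is at most maxGap; assumes the input
// is sorted and non-overlapping, as the activity timeline is.
std::vector<Interval> fillSmallGaps(const std::vector<Interval>& activity, int64_t maxGap) {
	std::vector<Interval> result;
	for (const Interval& current : activity) {
		if (!result.empty() && current.start - result.back().end <= maxGap) {
			result.back().end = current.end; // bridge the short pause
		} else {
			result.push_back(current);
		}
	}
	return result;
}

int main() {
	// Gaps of 8 cs and 50 cs; with maxGap = 10 cs, only the first gap is filled.
	for (const Interval& i : fillSmallGaps({ { 0, 100 }, { 108, 200 }, { 250, 300 } }, 10)) {
		std::cout << i.start << "-" << i.end << "\n"; // prints 0-200, then 250-300
	}
	return 0;
}
```

The larger `maxGap` presumably compensates for the raw per-frame `vad` flag dropping back to 0 sooner than the hangover-smoothed return value of `WebRtcVad_Process`, so short pauses within an utterance still merge into a single section of activity.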