From a723942f2218c8acd60c7f2bdadef9f799a13e80 Mon Sep 17 00:00:00 2001
From: Daniel Wolf
Date: Fri, 4 Jan 2019 20:23:57 +0100
Subject: [PATCH 1/2] Convert audio to 8kHz before feeding it to WebRTC for
 VAD

This prevents false positives at the beginning of the audio stream.
Fixes issue #53.
---
 CHANGELOG.md                                 | 1 +
 rhubarb/src/audio/voiceActivityDetection.cpp | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index d4e388b..bf8adfc 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -3,6 +3,7 @@
 ## Unreleased
 
 * **Added** basic support for non-English recordings through phonetic recognition ([issue #45](https://github.com/DanielSWolf/rhubarb-lip-sync/issues/45)).
+* **Fixed** a bug that resulted in unwanted mouth movement at the beginning of a recording ([issue #53](https://github.com/DanielSWolf/rhubarb-lip-sync/issues/53)).
 * **Fixed** a bug that prevented the progress bar from reaching 100% ([issue #48](https://github.com/DanielSWolf/rhubarb-lip-sync/issues/48)).
 
 ## Version 1.8.0

diff --git a/rhubarb/src/audio/voiceActivityDetection.cpp b/rhubarb/src/audio/voiceActivityDetection.cpp
index 990ae88..8de372c 100644
--- a/rhubarb/src/audio/voiceActivityDetection.cpp
+++ b/rhubarb/src/audio/voiceActivityDetection.cpp
@@ -83,7 +83,7 @@ JoiningBoundedTimeline<void> detectVoiceActivity(
 ) {
 	// Prepare audio for VAD
 	const unique_ptr<AudioClip> audioClip = inputAudioClip.clone()
-		| resample(16000)
+		| resample(8000) // Convert to the internal sampling rate of WebRTC
 		| removeDcOffset();
 
 	JoiningBoundedTimeline<void> activity(audioClip->getTruncatedRange());
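Note (editorial illustration, not part of the patch series): WebRTC's VAD
accepts only 8, 16, 32, or 48 kHz input in frames of 10, 20, or 30 ms, and
it downsamples everything to 8 kHz internally. Resampling to 8 kHz up front
therefore hands the detector exactly the signal it will analyze, rather than
letting it adapt to data it later discards. The sketch below mirrors the API
calls used in voiceActivityDetection.cpp on raw samples; the include path
assumes rhubarb's bundled WebRTC sources, and detectFrames is a hypothetical
helper, not part of the codebase.

    #include <cstdint>
    #include <stdexcept>
    #include <vector>
    #include <webrtc/common_audio/vad/include/webrtc_vad.h>

    // Classify each 10 ms frame of 8 kHz mono audio as speech/non-speech.
    std::vector<bool> detectFrames(const std::vector<int16_t>& samples) {
        constexpr int samplingRate = 8000;               // the rate WebRTC uses internally
        constexpr size_t frameSize = samplingRate / 100; // 10 ms = 80 samples

        VadInst* handle = WebRtcVad_Create();
        if (!handle) throw std::runtime_error("Error creating WebRTC VAD handle.");
        // Error handling is simplified here; the patched code releases the
        // handle on every path via gsl::finally.
        if (WebRtcVad_Init(handle)) throw std::runtime_error("Error initializing WebRTC VAD.");
        WebRtcVad_set_mode(handle, 2); // aggressiveness 0..3; higher cuts off more

        std::vector<bool> frameIsSpeech;
        for (size_t i = 0; i + frameSize <= samples.size(); i += frameSize) {
            // Returns 1 for speech, 0 for silence, -1 on error.
            const int result = WebRtcVad_Process(handle, samplingRate, &samples[i], frameSize);
            if (result == -1) throw std::runtime_error("Error processing audio buffer.");
            frameIsSpeech.push_back(result == 1);
        }

        WebRtcVad_Free(handle);
        return frameIsSpeech;
    }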
From 2bbad258c0675ce466222e7174904a12501b6203 Mon Sep 17 00:00:00 2001
From: Daniel Wolf
Date: Fri, 4 Jan 2019 20:28:57 +0100
Subject: [PATCH 2/2] Do not use multithreading for VAD

WebRTC adapts to the audio signal. If we slice the audio clip into multiple
shorter clips, then perform VAD on them in parallel, the result may not be
as good.
---
 rhubarb/resharper.DotSettings                 |   1 +
 rhubarb/src/audio/voiceActivityDetection.cpp  | 105 ++++--------------
 rhubarb/src/audio/voiceActivityDetection.h    |   1 -
 rhubarb/src/recognition/pocketSphinxTools.cpp |   2 +-
 4 files changed, 25 insertions(+), 84 deletions(-)

diff --git a/rhubarb/resharper.DotSettings b/rhubarb/resharper.DotSettings
index 103065f..b695093 100644
--- a/rhubarb/resharper.DotSettings
+++ b/rhubarb/resharper.DotSettings
@@ -212,6 +212,7 @@
 	True
 	True
 	True
+	True
 	True
 	True
 	True

diff --git a/rhubarb/src/audio/voiceActivityDetection.cpp b/rhubarb/src/audio/voiceActivityDetection.cpp
index 8de372c..2e5ae17 100644
--- a/rhubarb/src/audio/voiceActivityDetection.cpp
+++ b/rhubarb/src/audio/voiceActivityDetection.cpp
@@ -8,7 +8,7 @@
 #include "processing.h"
 #include <gsl_util.h>
 #include "tools/parallel.h"
-#include "AudioSegment.h"
+#include <webrtc/common_audio/vad/vad_core.h>
 
 using std::vector;
 using boost::adaptors::transformed;
@@ -16,124 +16,65 @@ using fmt::format;
 using std::runtime_error;
 using std::unique_ptr;
 
-JoiningBoundedTimeline<void> webRtcDetectVoiceActivity(
-	const AudioClip& audioClip,
+JoiningBoundedTimeline<void> detectVoiceActivity(
+	const AudioClip& inputAudioClip,
 	ProgressSink& progressSink
 ) {
+	// Prepare audio for VAD
+	constexpr int webRtcSamplingRate = 8000;
+	const unique_ptr<AudioClip> audioClip = inputAudioClip.clone()
+		| resample(webRtcSamplingRate)
+		| removeDcOffset();
+
 	VadInst* vadHandle = WebRtcVad_Create();
 	if (!vadHandle) throw runtime_error("Error creating WebRTC VAD handle.");
 	auto freeHandle = gsl::finally([&]() { WebRtcVad_Free(vadHandle); });
 
 	int error = WebRtcVad_Init(vadHandle);
-	if (error) throw runtime_error("Error initializing WebRTC VAD handle.");
+	if (error) throw runtime_error("Error initializing WebRTC VAD.");
 
 	const int aggressiveness = 2; // 0..3. The higher, the more is cut off.
 	error = WebRtcVad_set_mode(vadHandle, aggressiveness);
 	if (error) throw runtime_error("Error setting WebRTC VAD aggressiveness.");
 
-	ProgressMerger progressMerger(progressSink);
-	ProgressSink& pass1ProgressSink = progressMerger.addSource("VAD pass 1", 1.0);
-	ProgressSink& pass2ProgressSink = progressMerger.addSource("VAD pass 2", 0.3);
-
 	// Detect activity
-	JoiningBoundedTimeline<void> activity(audioClip.getTruncatedRange());
+	JoiningBoundedTimeline<void> activity(audioClip->getTruncatedRange());
 	centiseconds time = 0_cs;
-	const size_t bufferCapacity = audioClip.getSampleRate() / 100;
+	const size_t frameSize = webRtcSamplingRate / 100;
 	const auto processBuffer = [&](const vector<int16_t>& buffer) {
 		// WebRTC is picky regarding buffer size
-		if (buffer.size() < bufferCapacity) return;
+		if (buffer.size() < frameSize) return;
 
 		const int result = WebRtcVad_Process(
 			vadHandle,
-			audioClip.getSampleRate(),
+			webRtcSamplingRate,
 			buffer.data(),
 			buffer.size()
-		) == 1;
+		);
 		if (result == -1) throw runtime_error("Error processing audio buffer using WebRTC VAD.");
-		const bool isActive = result != 0;
+
+		// Ignore the result of WebRtcVad_Process; instead, directly interpret the internal VAD
+		// flag. The result of WebRtcVad_Process stays 1 for a number of frames after the last
+		// detected activity.
+		const bool isActive = reinterpret_cast<VadInstT*>(vadHandle)->vad == 1;
+
 		if (isActive) {
 			activity.set(time, time + 1_cs);
 		}
+
 		time += 1_cs;
 	};
-	process16bitAudioClip(audioClip, processBuffer, bufferCapacity, pass1ProgressSink);
-
-	// WebRTC adapts to the audio. This means results may not be correct at the very beginning.
-	// It sometimes returns false activity at the very beginning, mistaking the background noise for
-	// speech.
-	// So we delete the first recognized utterance and re-process the corresponding audio segment.
-	if (!activity.empty()) {
-		TimeRange firstActivity = activity.begin()->getTimeRange();
-		activity.clear(firstActivity);
-		const unique_ptr<AudioClip> streamStart = audioClip.clone()
-			| segment(TimeRange(0_cs, firstActivity.getEnd()));
-		time = 0_cs;
-		process16bitAudioClip(*streamStart, processBuffer, bufferCapacity, pass2ProgressSink);
-	}
-
-	return activity;
-}
-
-JoiningBoundedTimeline<void> detectVoiceActivity(
-	const AudioClip& inputAudioClip,
-	int maxThreadCount,
-	ProgressSink& progressSink
-) {
-	// Prepare audio for VAD
-	const unique_ptr<AudioClip> audioClip = inputAudioClip.clone()
-		| resample(8000) // Convert to the internal sampling rate of WebRTC
-		| removeDcOffset();
-
-	JoiningBoundedTimeline<void> activity(audioClip->getTruncatedRange());
-	std::mutex activityMutex;
-
-	// Split audio into segments and perform parallel VAD
-	const int segmentCount = maxThreadCount;
-	const centiseconds audioDuration = audioClip->getTruncatedRange().getDuration();
-	vector<TimeRange> audioSegments;
-	for (int i = 0; i < segmentCount; ++i) {
-		TimeRange segmentRange = TimeRange(
-			i * audioDuration / segmentCount,
-			(i + 1) * audioDuration / segmentCount
-		);
-		audioSegments.push_back(segmentRange);
-	}
-	runParallel(
-		"VAD",
-		[&](const TimeRange& segmentRange, ProgressSink& segmentProgressSink) {
-			const unique_ptr<AudioClip> audioSegment = audioClip->clone() | segment(segmentRange);
-			JoiningBoundedTimeline<void> activitySegment =
-				webRtcDetectVoiceActivity(*audioSegment, segmentProgressSink);
-
-			std::lock_guard<std::mutex> lock(activityMutex);
-			for (auto activityRange : activitySegment) {
-				activityRange.getTimeRange().shift(segmentRange.getStart());
-				activity.set(activityRange);
-			}
-		},
-		audioSegments,
-		segmentCount,
-		progressSink
-	);
+	process16bitAudioClip(*audioClip, processBuffer, frameSize, progressSink);
 
 	// Fill small gaps in activity
-	const centiseconds maxGap(5);
+	const centiseconds maxGap(10);
 	for (const auto& pair : getPairs(activity)) {
 		if (pair.second.getStart() - pair.first.getEnd() <= maxGap) {
 			activity.set(pair.first.getEnd(), pair.second.getStart());
 		}
 	}
 
-	// Shorten activities. WebRTC adds a bit of buffer at the end.
-	const centiseconds tail(5);
-	for (const auto& utterance : JoiningBoundedTimeline<void>(activity)) {
-		if (utterance.getDuration() > tail && utterance.getEnd() < audioDuration) {
-			activity.clear(utterance.getEnd() - tail, utterance.getEnd());
-		}
-	}
-
 	logging::debugFormat(
 		"Found {} sections of voice activity: {}",
 		activity.size(),

diff --git a/rhubarb/src/audio/voiceActivityDetection.h b/rhubarb/src/audio/voiceActivityDetection.h
index 02658e0..4e85f9f 100644
--- a/rhubarb/src/audio/voiceActivityDetection.h
+++ b/rhubarb/src/audio/voiceActivityDetection.h
@@ -5,6 +5,5 @@
 
 JoiningBoundedTimeline<void> detectVoiceActivity(
 	const AudioClip& audioClip,
-	int maxThreadCount,
 	ProgressSink& progressSink
 );

diff --git a/rhubarb/src/recognition/pocketSphinxTools.cpp b/rhubarb/src/recognition/pocketSphinxTools.cpp
index 4e9c117..cdb91b5 100644
--- a/rhubarb/src/recognition/pocketSphinxTools.cpp
+++ b/rhubarb/src/recognition/pocketSphinxTools.cpp
@@ -102,7 +102,7 @@ BoundedTimeline<Phone> recognizePhones(
 	// Split audio into utterances
 	JoiningBoundedTimeline<void> utterances;
 	try {
-		utterances = detectVoiceActivity(*audioClip, maxThreadCount, voiceActivationProgressSink);
+		utterances = detectVoiceActivity(*audioClip, voiceActivationProgressSink);
 	} catch (...) {
 		std::throw_with_nested(runtime_error("Error detecting segments of speech."));
 	}
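Note (editorial illustration, not part of the patch series): with the
parallel version gone, the clip is processed in one sequential pass, so
WebRTC's adaptive noise model sees the entire recording instead of
restarting on each slice. Reading the internal `vad` flag ends an utterance
as soon as speech stops, since WebRtcVad_Process keeps reporting activity
for a hang-over period, which is presumably why the gap-filling threshold
grows from 5 to 10 centiseconds. The self-contained sketch below shows the
same gap-filling logic on plain data; Range and fillSmallGaps are
hypothetical stand-ins for rhubarb's TimeRange and Timeline types.

    #include <algorithm>
    #include <vector>

    struct Range {
        int startCs; // start time in centiseconds
        int endCs;   // end time in centiseconds
    };

    // Merge adjacent ranges separated by at most maxGapCs, so that short
    // pauses within an utterance do not split it in two.
    // Assumes the input is sorted by start time and non-overlapping.
    std::vector<Range> fillSmallGaps(const std::vector<Range>& ranges, int maxGapCs) {
        std::vector<Range> result;
        for (const Range& range : ranges) {
            if (!result.empty() && range.startCs - result.back().endCs <= maxGapCs) {
                result.back().endCs = std::max(result.back().endCs, range.endCs);
            } else {
                result.push_back(range);
            }
        }
        return result;
    }

    // Example: fillSmallGaps({{0, 50}, {55, 90}, {120, 150}}, 10) yields
    // {{0, 90}, {120, 150}}: the 5 cs pause is bridged, the 30 cs pause is kept.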