#include "voiceActivityDetection.h" #include "DcOffset.h" #include "SampleRateConverter.h" #include "logging/logging.h" #include "tools/pairs.h" #include #include #include "processing.h" #include #include "tools/parallel.h" #include "AudioSegment.h" using std::vector; using boost::adaptors::transformed; using fmt::format; using std::runtime_error; using std::unique_ptr; JoiningBoundedTimeline webRtcDetectVoiceActivity( const AudioClip& audioClip, ProgressSink& progressSink ) { VadInst* vadHandle = WebRtcVad_Create(); if (!vadHandle) throw runtime_error("Error creating WebRTC VAD handle."); auto freeHandle = gsl::finally([&]() { WebRtcVad_Free(vadHandle); }); int error = WebRtcVad_Init(vadHandle); if (error) throw runtime_error("Error initializing WebRTC VAD handle."); const int aggressiveness = 2; // 0..3. The higher, the more is cut off. error = WebRtcVad_set_mode(vadHandle, aggressiveness); if (error) throw runtime_error("Error setting WebRTC VAD aggressiveness."); ProgressMerger progressMerger(progressSink); ProgressSink& pass1ProgressSink = progressMerger.addSink(1.0); ProgressSink& pass2ProgressSink = progressMerger.addSink(0.3); // Detect activity JoiningBoundedTimeline activity(audioClip.getTruncatedRange()); centiseconds time = 0_cs; const size_t bufferCapacity = audioClip.getSampleRate() / 100; const auto processBuffer = [&](const vector& buffer) { // WebRTC is picky regarding buffer size if (buffer.size() < bufferCapacity) return; const int result = WebRtcVad_Process( vadHandle, audioClip.getSampleRate(), buffer.data(), buffer.size() ) == 1; if (result == -1) throw runtime_error("Error processing audio buffer using WebRTC VAD."); const bool isActive = result != 0; if (isActive) { activity.set(time, time + 1_cs); } time += 1_cs; }; process16bitAudioClip(audioClip, processBuffer, bufferCapacity, pass1ProgressSink); // WebRTC adapts to the audio. This means results may not be correct at the very beginning. // It sometimes returns false activity at the very beginning, mistaking the background noise for // speech. // So we delete the first recognized utterance and re-process the corresponding audio segment. 
	if (!activity.empty()) {
		TimeRange firstActivity = activity.begin()->getTimeRange();
		activity.clear(firstActivity);
		const unique_ptr<AudioClip> streamStart = audioClip.clone()
			| segment(TimeRange(0_cs, firstActivity.getEnd()));
		time = 0_cs;
		process16bitAudioClip(*streamStart, processBuffer, bufferCapacity, pass2ProgressSink);
	}

	return activity;
}

JoiningBoundedTimeline<void> detectVoiceActivity(
	const AudioClip& inputAudioClip,
	int maxThreadCount,
	ProgressSink& progressSink
) {
	// Prepare audio for VAD: resample to 16 kHz (a rate WebRTC supports) and
	// remove any DC offset
	const unique_ptr<AudioClip> audioClip = inputAudioClip.clone()
		| resample(16000)
		| removeDcOffset();

	JoiningBoundedTimeline<void> activity(audioClip->getTruncatedRange());
	std::mutex activityMutex;

	// Split audio into segments and perform parallel VAD
	const int segmentCount = maxThreadCount;
	const centiseconds audioDuration = audioClip->getTruncatedRange().getDuration();
	vector<TimeRange> audioSegments;
	for (int i = 0; i < segmentCount; ++i) {
		TimeRange segmentRange = TimeRange(
			i * audioDuration / segmentCount,
			(i + 1) * audioDuration / segmentCount
		);
		audioSegments.push_back(segmentRange);
	}
	runParallel(
		[&](const TimeRange& segmentRange, ProgressSink& segmentProgressSink) {
			const unique_ptr<AudioClip> audioSegment = audioClip->clone() | segment(segmentRange);
			JoiningBoundedTimeline<void> activitySegment =
				webRtcDetectVoiceActivity(*audioSegment, segmentProgressSink);

			// Segment results are relative to the segment start; shift them back
			// into the timeline of the full clip before merging
			std::lock_guard<std::mutex> lock(activityMutex);
			for (auto activityRange : activitySegment) {
				activityRange.getTimeRange().shift(segmentRange.getStart());
				activity.set(activityRange);
			}
		},
		audioSegments,
		segmentCount,
		progressSink
	);

	// Fill small gaps in activity
	const centiseconds maxGap(5);
	for (const auto& pair : getPairs(activity)) {
		if (pair.second.getStart() - pair.first.getEnd() <= maxGap) {
			activity.set(pair.first.getEnd(), pair.second.getStart());
		}
	}

	// Shorten activities. WebRTC adds a bit of buffer at the end.
	// Iterate over a copy so that modifying `activity` doesn't invalidate the loop.
	const centiseconds tail(5);
	for (const auto& utterance : JoiningBoundedTimeline<void>(activity)) {
		if (utterance.getDuration() > tail && utterance.getEnd() < audioDuration) {
			activity.clear(utterance.getEnd() - tail, utterance.getEnd());
		}
	}

	logging::debugFormat(
		"Found {} sections of voice activity: {}",
		activity.size(),
		join(activity | transformed([](const Timed<void>& t) {
			return format("{0}-{1}", t.getStart(), t.getEnd());
		}), ", ")
	);

	return activity;
}
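
// ---------------------------------------------------------------------------
// Usage sketch (illustrative only, not part of this translation unit). It
// assumes a WAV-backed AudioClip implementation and a no-op ProgressSink;
// the names WaveFileReader and NullProgressSink are assumptions here, not
// guaranteed by this file.
//
//   #include "WaveFileReader.h"  // hypothetical AudioClip implementation
//   #include <iostream>
//
//   void printVoiceActivity(const std::string& wavPath) {
//       const WaveFileReader audioClip(wavPath);
//       NullProgressSink progressSink;  // hypothetical no-op sink
//       const JoiningBoundedTimeline<void> activity =
//           detectVoiceActivity(audioClip, /* maxThreadCount */ 4, progressSink);
//       for (const auto& utterance : activity) {
//           std::cout << utterance.getStart() << "-" << utterance.getEnd() << "\n";
//       }
//   }
// ---------------------------------------------------------------------------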