From 43465523122d70cdc234236b9df438f5522d5111 Mon Sep 17 00:00:00 2001 From: Daniel Wolf Date: Tue, 14 Jun 2016 20:12:12 +0200 Subject: [PATCH] Improved speed of voice activity detection ... by factor 2 by removing second pass. Also added voice activity detection to progress calculation. --- src/audio/voiceActivityDetection.cpp | 34 ++++++++++++++++++++++++---- src/audio/voiceActivityDetection.h | 3 ++- src/phoneExtraction.cpp | 23 ++++++++----------- 3 files changed, 41 insertions(+), 19 deletions(-) diff --git a/src/audio/voiceActivityDetection.cpp b/src/audio/voiceActivityDetection.cpp index efe8173..0b94da9 100644 --- a/src/audio/voiceActivityDetection.cpp +++ b/src/audio/voiceActivityDetection.cpp @@ -5,6 +5,7 @@ #include #include #include +#include using std::numeric_limits; using std::vector; @@ -13,7 +14,7 @@ using boost::adaptors::transformed; using fmt::format; float getRMS(AudioStream& audioStream, int maxSampleCount = numeric_limits::max()) { - double sum = 0; + double sum = 0; // Use double to prevent rounding errors with large number of summands int sampleCount; for (sampleCount = 0; sampleCount < maxSampleCount && !audioStream.endOfStream(); sampleCount++) { sum += std::pow(static_cast(audioStream.readSample()), 2); @@ -21,7 +22,17 @@ float getRMS(AudioStream& audioStream, int maxSampleCount = numeric_limits: return sampleCount > 0 ? static_cast(std::sqrt(sum / sampleCount)) : 0.0f; } -BoundedTimeline detectVoiceActivity(std::unique_ptr audioStream) { +float getRMS(const vector& rmsSegments) { + if (rmsSegments.empty()) return 0; + + double sum = 0; // Use double to prevent rounding errors with large number of summands + for (float rmsSegment : rmsSegments) { + sum += rmsSegment; + } + return static_cast(std::sqrt(sum / rmsSegments.size())); +} + +BoundedTimeline detectVoiceActivity(std::unique_ptr audioStream, ProgressSink& progressSink) { // Make sure audio stream has no DC offset audioStream = removeDCOffset(std::move(audioStream)); @@ -30,12 +41,25 @@ BoundedTimeline detectVoiceActivity(std::unique_ptr audioStre constexpr int sampleRate = 2 * maxFrequency; audioStream = convertSampleRate(std::move(audioStream), sampleRate); + // Collect RMS data + vector rmsSegments; + logging::debug("RMS calculation -- start"); + int64_t centisecondCount = (audioStream->getSampleCount() - audioStream->getSampleIndex()) / 100; + for (int cs = 0; cs < centisecondCount; ++cs) { + rmsSegments.push_back(getRMS(*audioStream, sampleRate / 100)); + progressSink.reportProgress(static_cast(cs) / centisecondCount); + } + logging::debug("RMS calculation -- end"); + + const float rms = getRMS(rmsSegments); + logging::debugFormat("RMS value: {0:.5f}", rms); + // Detect activity - const float rms = getRMS(*audioStream->clone(true)); const float cutoff = rms / 25; + logging::debugFormat("RMS cutoff for voice activity detection: {0:.5f}", cutoff); BoundedTimeline activity(audioStream->getTruncatedRange()); - for (centiseconds time = centiseconds::zero(); !audioStream->endOfStream(); ++time) { - float currentRMS = getRMS(*audioStream, sampleRate / 100); + for (centiseconds time = centiseconds::zero(); static_cast(time.count()) < rmsSegments.size(); ++time) { + float currentRMS = rmsSegments[time.count()]; bool active = currentRMS > cutoff; if (active) { activity.set(time, time + centiseconds(1)); diff --git a/src/audio/voiceActivityDetection.h b/src/audio/voiceActivityDetection.h index fba0b40..aa6bc2a 100644 --- a/src/audio/voiceActivityDetection.h +++ b/src/audio/voiceActivityDetection.h @@ -2,5 +2,6 @@ #include #include "AudioStream.h" #include +#include -BoundedTimeline detectVoiceActivity(std::unique_ptr audioStream); +BoundedTimeline detectVoiceActivity(std::unique_ptr audioStream, ProgressSink& progressSink); diff --git a/src/phoneExtraction.cpp b/src/phoneExtraction.cpp index 91b56d6..dee3139 100644 --- a/src/phoneExtraction.cpp +++ b/src/phoneExtraction.cpp @@ -77,7 +77,7 @@ int16_t floatSampleToInt16(float sample) { } void processAudioStream(AudioStream& audioStream16kHz, function&)> processBuffer, ProgressSink& progressSink) { - // Process entire sound file + // Process entire sound stream vector buffer; const int capacity = 1600; // 0.1 second capacity buffer.reserve(capacity); @@ -155,7 +155,7 @@ BoundedTimeline recognizeWords(unique_ptr audioStream, ps_d int error = ps_start_utt(&decoder); if (error) throw runtime_error("Error starting utterance processing for word recognition."); - // Process entire sound file + // Process entire sound stream auto processBuffer = [&decoder](const vector& buffer) { int searchedFrameCount = ps_process_raw(&decoder, buffer.data(), buffer.size(), false, false); if (searchedFrameCount < 0) throw runtime_error("Error analyzing raw audio data for word recognition."); @@ -220,8 +220,8 @@ BoundedTimeline getPhoneAlignment( // Start search ps_search_start(search.get()); - // Process entire sound file - auto processBuffer = [&decoder, &acousticModel, &search](const vector& buffer) { + // Process entire sound stream + auto processBuffer = [&](const vector& buffer) { const int16* nextSample = buffer.data(); size_t remainingSamples = buffer.size(); while (acmod_process_raw(acousticModel, &nextSample, &remainingSamples, false) > 0) { @@ -283,12 +283,6 @@ BoundedTimeline detectPhones( optional dialog, ProgressSink& progressSink) { - // Pocketsphinx doesn't like empty input - TimeRange audioRange = audioStream->getTruncatedRange(); - if (audioRange.empty()) { - return BoundedTimeline(audioRange); - } - // Discard Pocketsphinx output err_set_logfp(nullptr); @@ -298,12 +292,15 @@ BoundedTimeline detectPhones( // Make sure audio stream has no DC offset audioStream = removeDCOffset(std::move(audioStream)); + ProgressMerger totalProgressMerger(progressSink); + ProgressSink& voiceActivationProgressSink = totalProgressMerger.addSink(1.0); + ProgressSink& dialogProgressSink = totalProgressMerger.addSink(15); + try { // Split audio into utterances - BoundedTimeline utterances = detectVoiceActivity(audioStream->clone(true)); - + BoundedTimeline utterances = detectVoiceActivity(audioStream->clone(true), voiceActivationProgressSink); // For progress reporting: weigh utterances by length - ProgressMerger dialogProgressMerger(progressSink); + ProgressMerger dialogProgressMerger(dialogProgressSink); vector utteranceProgressSinks; for (const auto& timedUtterance : utterances) { utteranceProgressSinks.push_back(&dialogProgressMerger.addSink(timedUtterance.getTimeRange().getLength().count()));