From 43465523122d70cdc234236b9df438f5522d5111 Mon Sep 17 00:00:00 2001
From: Daniel Wolf <dwolf@dannad.de>
Date: Tue, 14 Jun 2016 20:12:12 +0200
Subject: [PATCH] Improved speed of voice activity detection

... by a factor of 2 by removing the second pass over the audio stream.
Also included voice activity detection in the progress calculation.
---
 src/audio/voiceActivityDetection.cpp | 34 ++++++++++++++++++++++++----
 src/audio/voiceActivityDetection.h   |  3 ++-
 src/phoneExtraction.cpp              | 23 ++++++++-----------
 3 files changed, 41 insertions(+), 19 deletions(-)
diff --git a/src/audio/voiceActivityDetection.cpp b/src/audio/voiceActivityDetection.cpp
index efe8173..0b94da9 100644
--- a/src/audio/voiceActivityDetection.cpp
+++ b/src/audio/voiceActivityDetection.cpp
@@ -5,6 +5,7 @@
 #include <logging.h>
 #include <pairs.h>
 #include <boost/range/adaptor/transformed.hpp>
+#include <stringTools.h>
 
 using std::numeric_limits;
 using std::vector;
@@ -13,7 +14,7 @@ using boost::adaptors::transformed;
 using fmt::format;
 
 float getRMS(AudioStream& audioStream, int maxSampleCount = numeric_limits<int>::max()) {
-	double sum = 0;
+	double sum = 0; // Use double to prevent rounding errors with large number of summands
 	int sampleCount;
 	for (sampleCount = 0; sampleCount < maxSampleCount && !audioStream.endOfStream(); sampleCount++) {
 		sum += std::pow(static_cast<double>(audioStream.readSample()), 2);
@@ -21,7 +22,17 @@ float getRMS(AudioStream& audioStream, int maxSampleCount = numeric_limits<int>:
 	return sampleCount > 0 ? static_cast<float>(std::sqrt(sum / sampleCount)) : 0.0f;
 }
 
-BoundedTimeline<void> detectVoiceActivity(std::unique_ptr<AudioStream> audioStream) {
+// Combines per-segment RMS values into one overall RMS (assumes equally long segments); 0 if empty.
+float getRMS(const vector<float>& rmsSegments) {
+	if (rmsSegments.empty()) return 0;
+	double squareSum = 0; // Use double to prevent rounding errors with large number of summands
+	for (float rmsSegment : rmsSegments) {
+		squareSum += static_cast<double>(rmsSegment) * rmsSegment; // average the squares, not the roots
+	}
+	return static_cast<float>(std::sqrt(squareSum / rmsSegments.size()));
+}
+
+BoundedTimeline<void> detectVoiceActivity(std::unique_ptr<AudioStream> audioStream, ProgressSink& progressSink) {
 	// Make sure audio stream has no DC offset
 	audioStream = removeDCOffset(std::move(audioStream));
 
@@ -30,12 +41,25 @@ BoundedTimeline<void> detectVoiceActivity(std::unique_ptr<AudioStream> audioStre
 	constexpr int sampleRate = 2 * maxFrequency;
 	audioStream = convertSampleRate(std::move(audioStream), sampleRate);
 
+	// Collect RMS data
+	vector<float> rmsSegments;
+	logging::debug("RMS calculation -- start");
+	int64_t centisecondCount = (audioStream->getSampleCount() - audioStream->getSampleIndex()) / 100;
+	for (int cs = 0; cs < centisecondCount; ++cs) {
+		rmsSegments.push_back(getRMS(*audioStream, sampleRate / 100));
+		progressSink.reportProgress(static_cast<double>(cs) / centisecondCount);
+	}
+	logging::debug("RMS calculation -- end");
+
+	const float rms = getRMS(rmsSegments);
+	logging::debugFormat("RMS value: {0:.5f}", rms);
+
 	// Detect activity
-	const float rms = getRMS(*audioStream->clone(true));
 	const float cutoff = rms / 25;
+	logging::debugFormat("RMS cutoff for voice activity detection: {0:.5f}", cutoff);
 	BoundedTimeline<void> activity(audioStream->getTruncatedRange());
-	for (centiseconds time = centiseconds::zero(); !audioStream->endOfStream(); ++time) {
-		float currentRMS = getRMS(*audioStream, sampleRate / 100);
+	for (centiseconds time = centiseconds::zero(); static_cast<size_t>(time.count()) < rmsSegments.size(); ++time) {
+		float currentRMS = rmsSegments[time.count()];
 		bool active = currentRMS > cutoff;
 		if (active) {
 			activity.set(time, time + centiseconds(1));
diff --git a/src/audio/voiceActivityDetection.h b/src/audio/voiceActivityDetection.h
index fba0b40..aa6bc2a 100644
--- a/src/audio/voiceActivityDetection.h
+++ b/src/audio/voiceActivityDetection.h
@@ -2,5 +2,6 @@
 #include <memory>
 #include "AudioStream.h"
 #include <BoundedTimeline.h>
+#include <ProgressBar.h>
 
-BoundedTimeline<void> detectVoiceActivity(std::unique_ptr<AudioStream> audioStream);
+BoundedTimeline<void> detectVoiceActivity(std::unique_ptr<AudioStream> audioStream, ProgressSink& progressSink);
diff --git a/src/phoneExtraction.cpp b/src/phoneExtraction.cpp
index 91b56d6..dee3139 100644
--- a/src/phoneExtraction.cpp
+++ b/src/phoneExtraction.cpp
@@ -77,7 +77,7 @@ int16_t floatSampleToInt16(float sample) {
 }
 
 void processAudioStream(AudioStream& audioStream16kHz, function<void(const vector<int16_t>&)> processBuffer, ProgressSink& progressSink) {
-	// Process entire sound file
+	// Process entire sound stream
 	vector<int16_t> buffer;
 	const int capacity = 1600; // 0.1 second capacity
 	buffer.reserve(capacity);
@@ -155,7 +155,7 @@ BoundedTimeline<string> recognizeWords(unique_ptr<AudioStream> audioStream, ps_d
 	int error = ps_start_utt(&decoder);
 	if (error) throw runtime_error("Error starting utterance processing for word recognition.");
 
-	// Process entire sound file
+	// Process entire sound stream
 	auto processBuffer = [&decoder](const vector<int16_t>& buffer) {
 		int searchedFrameCount = ps_process_raw(&decoder, buffer.data(), buffer.size(), false, false);
 		if (searchedFrameCount < 0) throw runtime_error("Error analyzing raw audio data for word recognition.");
@@ -220,8 +220,8 @@ BoundedTimeline<Phone> getPhoneAlignment(
 	// Start search
 	ps_search_start(search.get());
 
-	// Process entire sound file
-	auto processBuffer = [&decoder, &acousticModel, &search](const vector<int16_t>& buffer) {
+	// Process entire sound stream
+	auto processBuffer = [&](const vector<int16_t>& buffer) {
 		const int16* nextSample = buffer.data();
 		size_t remainingSamples = buffer.size();
 		while (acmod_process_raw(acousticModel, &nextSample, &remainingSamples, false) > 0) {
@@ -283,12 +283,6 @@ BoundedTimeline<Phone> detectPhones(
 	optional<u32string> dialog,
 	ProgressSink& progressSink)
 {
-	// Pocketsphinx doesn't like empty input
-	TimeRange audioRange = audioStream->getTruncatedRange();
-	if (audioRange.empty()) {
-		return BoundedTimeline<Phone>(audioRange);
-	}
-
 	// Discard Pocketsphinx output
 	err_set_logfp(nullptr);
 
@@ -298,12 +292,15 @@ BoundedTimeline<Phone> detectPhones(
 	// Make sure audio stream has no DC offset
 	audioStream = removeDCOffset(std::move(audioStream));
 
+	ProgressMerger totalProgressMerger(progressSink);
+	ProgressSink& voiceActivationProgressSink = totalProgressMerger.addSink(1.0);
+	ProgressSink& dialogProgressSink = totalProgressMerger.addSink(15);
+
 	try {
 		// Split audio into utterances
-		BoundedTimeline<void> utterances = detectVoiceActivity(audioStream->clone(true));
-
+		BoundedTimeline<void> utterances = detectVoiceActivity(audioStream->clone(true), voiceActivationProgressSink);
 		// For progress reporting: weigh utterances by length
-		ProgressMerger dialogProgressMerger(progressSink);
+		ProgressMerger dialogProgressMerger(dialogProgressSink);
 		vector<ProgressSink*> utteranceProgressSinks;
 		for (const auto& timedUtterance : utterances) {
 			utteranceProgressSinks.push_back(&dialogProgressMerger.addSink(timedUtterance.getTimeRange().getLength().count()));