Implemented voice activity detection

2016-03-15 22:52:31 +01:00 · 2016-03-15 22:52:31 +01:00 · 8c1e24e9c8
parent 425f47491c
commit 8c1e24e9c8
7 changed files with 116 additions and 0 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -110,6 +110,7 @@ set(SOURCE_FILES
 	src/audio/DCOffset.cpp
 	src/audio/SampleRateConverter.cpp
 	src/audio/UnboundedStream.cpp
+	src/audio/voiceActivityDetection.cpp
 	src/audio/WaveFileReader.cpp
 	src/audio/waveFileWriting.cpp
 	src/stringTools.cpp
@ -117,6 +118,8 @@ set(SOURCE_FILES
 	src/TablePrinter.cpp
 	src/ProgressBar.cpp
 	src/logging.cpp
+	src/Timed.cpp
+	src/TimeSegment.cpp
 )
 add_executable(rhubarb ${SOURCE_FILES})
 target_link_libraries(rhubarb ${Boost_LIBRARIES} cppFormat sphinxbase pocketSphinx)
--- a/src/TimeSegment.cpp
+++ b/src/TimeSegment.cpp
@ -0,0 +1,18 @@
+#include "TimeSegment.h"
+
+// Creates a time segment with the given start and end time.
+// The values are stored as passed; no ordering check (start <= end) is performed.
+TimeSegment::TimeSegment(centiseconds start, centiseconds end) :
+	start(start),
+	end(end)
+{}
+
+// Returns the start time that was passed to the constructor.
+centiseconds TimeSegment::getStart() const {
+	return start;
+}
+
+// Returns the end time that was passed to the constructor.
+centiseconds TimeSegment::getEnd() const {
+	return end;
+}
+
+// Returns the duration of the segment, i.e. its end time minus its start time.
+centiseconds TimeSegment::getLength() const {
+	const centiseconds length = getEnd() - getStart();
+	return length;
+}
--- a/src/TimeSegment.h
+++ b/src/TimeSegment.h
@ -0,0 +1,13 @@
+#pragma once
+#include "centiseconds.h"
+
+// A time range described by a start and an end time.
+// Instances are read-only through their public interface, but remain copyable and assignable
+// so that they can be stored in and rearranged within standard containers.
+class TimeSegment {
+public:
+	// Creates a segment from the given start and end time.
+	TimeSegment(centiseconds start, centiseconds end);
+
+	// Returns the start time of the segment.
+	centiseconds getStart() const;
+
+	// Returns the end time of the segment.
+	centiseconds getEnd() const;
+
+	// Returns the duration of the segment (end time minus start time).
+	centiseconds getLength() const;
+
+private:
+	// Deliberately not const: const data members would delete the implicit assignment operators,
+	// which breaks container operations such as sorting or erasing elements.
+	// There are no setters, so the values still cannot be modified individually.
+	centiseconds start;
+	centiseconds end;
+};
--- a/src/Timed.cpp
+++ b/src/Timed.cpp
@ -0,0 +1 @@
+#include "Timed.h"
--- a/src/Timed.h
+++ b/src/Timed.h
@ -0,0 +1,19 @@
+#pragma once
+
+#include <TimeSegment.h>
+#include <utility>
+
+// A time segment that additionally carries a value of type TValue.
+template<typename TValue>
+class Timed : public TimeSegment {
+public:
+	// Creates a timed value covering the range from start to end.
+	// The value is taken by value and moved into place, so passing an rvalue avoids a copy.
+	Timed(centiseconds start, centiseconds end, TValue value) :
+		TimeSegment(start, end),
+		value(std::move(value))
+	{}
+
+	// Returns a read-only reference to the stored value.
+	const TValue& getValue() const {
+		return value;
+	}
+
+private:
+	// Deliberately not const: a const member would force the implicit move constructor to copy
+	// the value and would delete the implicit assignment operators.
+	// Read-only access is still enforced through getValue().
+	TValue value;
+};
--- a/src/audio/voiceActivityDetection.cpp
+++ b/src/audio/voiceActivityDetection.cpp
@ -0,0 +1,55 @@
+#include "voiceActivityDetection.h"
+#include <audio/DCOffset.h>
+#include <audio/SampleRateConverter.h>
+#include <boost/optional/optional.hpp>
+#include <logging.h>
+
+using std::numeric_limits;
+using std::vector;
+using boost::optional;
+
+// Computes the root mean square of samples read from the given audio stream.
+// Reads at most maxSampleCount samples via readSample(), stopping early when the stream
+// reports endOfStream(). By default, reads until the end of the stream.
+// Returns 0 if no sample was read.
+float getRMS(AudioStream& audioStream, int maxSampleCount = numeric_limits<int>::max()) {
+	double sum = 0;
+	int sampleCount;
+	for (sampleCount = 0; sampleCount < maxSampleCount && !audioStream.endOfStream(); sampleCount++) {
+		// Square by plain multiplication: std::pow is a general-purpose function
+		// and needlessly expensive for a fixed exponent of 2 in a per-sample loop.
+		const double sample = static_cast<double>(audioStream.readSample());
+		sum += sample * sample;
+	}
+	return sampleCount > 0 ? static_cast<float>(std::sqrt(sum / sampleCount)) : 0.0f;
+}
+
+// Detects the time segments of the given audio stream that contain activity.
+//
+// The stream is first freed of its DC offset and resampled to a low sample rate. It is then
+// analyzed in windows of one centisecond each: a window counts as active if its RMS exceeds
+// 1/50 of the RMS of the entire stream. Consecutive active windows are merged into one segment;
+// inactive gaps of up to maxGap do not split a segment.
+//
+// Takes ownership of the stream and consumes it.
+// Returns the detected segments in chronological order. Every segment is also logged as an
+// "utterance" timed event.
+vector<TimeSegment> detectVoiceActivity(std::unique_ptr<AudioStream> audioStream) {
+	// Make sure audio stream has no DC offset
+	audioStream = removeDCOffset(std::move(audioStream));
+
+	// Resample to remove noise
+	constexpr int maxFrequency = 1000;
+	constexpr int sampleRate = 2 * maxFrequency;
+	audioStream = convertSampleRate(std::move(audioStream), sampleRate);
+
+	// Number of samples that make up one centisecond at the reduced sample rate
+	constexpr int samplesPerCentisecond = sampleRate / 100;
+
+	// Measure the overall loudness on a clone so that the original stream is not consumed.
+	// NOTE(review): clone(true) presumably yields a clone positioned at the start - confirm.
+	const float rms = getRMS(*audioStream->clone(true));
+	// A window is considered active if it is louder than this fraction of the overall RMS
+	const float cutoff = rms / 50;
+	// Inactive stretches up to this length do not terminate the current segment
+	const centiseconds maxGap(10);
+
+	vector<TimeSegment> result;
+	optional<centiseconds> segmentStart, segmentEnd;
+
+	// Stores and logs the segment currently being tracked, then clears the tracking state.
+	// Must only be called while a segment is being tracked (both optionals are set).
+	auto finishSegment = [&]() {
+		result.push_back(TimeSegment(segmentStart.value(), segmentEnd.value()));
+		logTimedEvent("utterance", segmentStart.value(), segmentEnd.value(), "");
+		segmentStart.reset();
+		segmentEnd.reset();
+	};
+
+	for (centiseconds time = centiseconds(0); !audioStream->endOfStream(); ++time) {
+		const float currentPower = getRMS(*audioStream, samplesPerCentisecond);
+		const bool active = currentPower > cutoff;
+		if (active) {
+			if (!segmentStart) {
+				segmentStart = time;
+			}
+			// The segment extends to the end of the current window
+			segmentEnd = time + centiseconds(1);
+		} else if (segmentEnd && time > segmentEnd.value() + maxGap) {
+			finishSegment();
+		}
+	}
+
+	// The stream may end while a segment is still open. Treat it like any other segment,
+	// including the log entry.
+	if (segmentEnd) {
+		finishSegment();
+	}
+
+	return result;
+}
--- a/src/audio/voiceActivityDetection.h
+++ b/src/audio/voiceActivityDetection.h
@ -0,0 +1,7 @@
+#pragma once
+#include <vector>
+#include <TimeSegment.h>
+#include <memory>
+#include "AudioStream.h"
+
+// Detects the time segments of the given audio stream that contain activity.
+// Takes ownership of the stream. A stretch of audio counts as active if its RMS is above
+// a cutoff derived from the RMS of the entire stream; short inactive gaps do not split a segment.
+// Returns the detected segments in chronological order.
+std::vector<TimeSegment> detectVoiceActivity(std::unique_ptr<AudioStream> audioStream);