rhubarb-lip-sync/src/audio/voiceActivityDetection.cpp

#include "voiceActivityDetection.h"
#include <audio/DCOffset.h>
#include <audio/SampleRateConverter.h>
#include <boost/optional/optional.hpp>
#include <logging.h>

using std::numeric_limits;
using std::vector;
using boost::optional;

// Computes the root mean square of the samples read from audioStream.
//
// Reads samples until either maxSampleCount samples have been consumed or the
// stream reports endOfStream(), whichever comes first. The stream is advanced
// by the number of samples read.
//
// Returns the RMS of the samples that were read, or 0 if no sample was read
// (stream already at its end, or maxSampleCount <= 0).
float getRMS(AudioStream& audioStream, int maxSampleCount = numeric_limits<int>::max()) {
	// Accumulate in double so that long streams do not lose precision.
	double squareSum = 0.0;
	int sampleCount = 0;
	while (sampleCount < maxSampleCount && !audioStream.endOfStream()) {
		const double sample = static_cast<double>(audioStream.readSample());
		// Plain multiplication instead of std::pow(sample, 2): same value, no
		// general-purpose power call per sample.
		squareSum += sample * sample;
		++sampleCount;
	}

	// Guard against division by zero when nothing could be read.
	if (sampleCount <= 0) {
		return 0.0f;
	}
	return static_cast<float>(std::sqrt(squareSum / sampleCount));
}

// Detects the time ranges of an audio stream that contain activity.
//
// The stream is first passed through removeDCOffset and resampled to a low
// sample rate. It is then read in frames of one centisecond each. A frame counts
// as active when its RMS exceeds 1/50 of the RMS of the whole stream.
// Consecutive active frames are merged into one range; a range is only closed
// once the silence following it extends more than maxGap beyond its end, so
// short pauses do not split an utterance.
//
// Takes ownership of audioStream and consumes it.
// Returns the detected ranges in chronological order. Every returned range is
// also reported through logTimedEvent as an "utterance".
vector<TimeRange> detectVoiceActivity(std::unique_ptr<AudioStream> audioStream) {
	// Make sure audio stream has no DC offset
	audioStream = removeDCOffset(std::move(audioStream));

	// Resample to remove noise
	constexpr int maxFrequency = 1000;
	constexpr int sampleRate = 2 * maxFrequency;
	audioStream = convertSampleRate(std::move(audioStream), sampleRate);

	// Number of samples that make up one centisecond at the reduced sample rate
	constexpr int samplesPerCentisecond = sampleRate / 100;

	// Overall loudness, measured on a clone so that audioStream itself is not read here.
	// NOTE(review): clone(true) presumably yields a clone positioned at the start -- confirm.
	const float rms = getRMS(*audioStream->clone(true));
	const float cutoff = rms / 50;
	const centiseconds maxGap(10);

	vector<TimeRange> result;
	optional<centiseconds> segmentStart, segmentEnd;

	// Stores and logs the currently open segment, then clears it.
	// Used for segments closed inside the loop and for the one still open at the
	// end of the stream, so that every returned range is logged.
	auto closeSegment = [&]() {
		result.push_back(TimeRange(segmentStart.value(), segmentEnd.value()));
		logTimedEvent("utterance", segmentStart.value(), segmentEnd.value(), "");
		segmentStart.reset();
		segmentEnd.reset();
	};

	for (centiseconds time = centiseconds(0); !audioStream->endOfStream(); ++time) {
		const float currentPower = getRMS(*audioStream, samplesPerCentisecond);
		const bool active = currentPower > cutoff;
		if (active) {
			if (!segmentStart) {
				segmentStart = time;
			}
			// The segment extends to the end of the current frame
			segmentEnd = time + centiseconds(1);
		} else if (segmentEnd && time > segmentEnd.value() + maxGap) {
			// Silence has lasted longer than the tolerated gap: the utterance is over
			closeSegment();
		}
	}

	// The stream ended while an utterance was still open
	if (segmentEnd) {
		closeSegment();
	}

	return result;
}
Implemented voice activity detection 2016-03-15 21:52:31 +00:00			`#include "voiceActivityDetection.h"`
			`#include <audio/DCOffset.h>`
			`#include <audio/SampleRateConverter.h>`
			`#include <boost/optional/optional.hpp>`
			`#include <logging.h>`

			`using std::numeric_limits;`
			`using std::vector;`
			`using boost::optional;`

			`float getRMS(AudioStream& audioStream, int maxSampleCount = numeric_limits<int>::max()) {`
			`double sum = 0;`
			`int sampleCount;`
			`for (sampleCount = 0; sampleCount < maxSampleCount && !audioStream.endOfStream(); sampleCount++) {`
			`sum += std::pow(static_cast<double>(audioStream.readSample()), 2);`
			`}`
			`return sampleCount > 0 ? static_cast<float>(std::sqrt(sum / sampleCount)) : 0.0f;`
			`}`

Renamed TimeSegment to TimeRange 2016-03-28 18:25:11 +00:00			`vector<TimeRange> detectVoiceActivity(std::unique_ptr<AudioStream> audioStream) {`
Implemented voice activity detection 2016-03-15 21:52:31 +00:00			`// Make sure audio stream has no DC offset`
			`audioStream = removeDCOffset(std::move(audioStream));`

			`// Resample to remove noise`
			`constexpr int maxFrequency = 1000;`
			`constexpr int sampleRate = 2 * maxFrequency;`
			`audioStream = convertSampleRate(std::move(audioStream), sampleRate);`

			`float rms = getRMS(*audioStream->clone(true));`
			`float cutoff = rms / 50;`
			`centiseconds maxGap(10);`

Renamed TimeSegment to TimeRange 2016-03-28 18:25:11 +00:00			`vector<TimeRange> result;`
Implemented voice activity detection 2016-03-15 21:52:31 +00:00			`optional<centiseconds> segmentStart, segmentEnd;`
			`for (centiseconds time = centiseconds(0); !audioStream->endOfStream(); ++time) {`
			`float currentPower = getRMS(*audioStream, sampleRate / 100);`
			`bool active = currentPower > cutoff;`
			`if (active) {`
			`if (!segmentStart) {`
			`segmentStart = time;`
			`}`
			`segmentEnd = time + centiseconds(1);`
			`} else if (segmentEnd && time > segmentEnd.value() + maxGap) {`
Renamed TimeSegment to TimeRange 2016-03-28 18:25:11 +00:00			`result.push_back(TimeRange(segmentStart.value(), segmentEnd.value()));`
Implemented voice activity detection 2016-03-15 21:52:31 +00:00			`logTimedEvent("utterance", segmentStart.value(), segmentEnd.value(), "");`
			`segmentStart.reset();`
			`segmentEnd.reset();`
			`}`
			`}`
			`if (segmentEnd) {`
Renamed TimeSegment to TimeRange 2016-03-28 18:25:11 +00:00			`result.push_back(TimeRange(segmentStart.value(), segmentEnd.value()));`
Implemented voice activity detection 2016-03-15 21:52:31 +00:00			`}`

			`return result;`
			`}`