2016-03-15 21:52:31 +00:00
|
|
|
#include "voiceActivityDetection.h"
|
|
|
|
#include <audio/DCOffset.h>
|
|
|
|
#include <audio/SampleRateConverter.h>
|
|
|
|
#include <boost/optional/optional.hpp>
|
|
|
|
#include <logging.h>
|
2016-05-02 18:31:59 +00:00
|
|
|
#include <pairs.h>
|
2016-03-15 21:52:31 +00:00
|
|
|
|
|
|
|
using std::numeric_limits;
|
|
|
|
using std::vector;
|
|
|
|
using boost::optional;
|
|
|
|
|
|
|
|
// Computes the root mean square (RMS) of samples read from the given stream.
//
// Samples are pulled one at a time through audioStream.readSample() until
// either maxSampleCount samples have been read or audioStream.endOfStream()
// reports true, whichever comes first. The stream is taken by non-const
// reference and read in place, so the caller's stream is presumably left
// positioned after the samples consumed here (the windowed loop in
// detectVoiceActivity relies on that).
//
// audioStream:    stream to read samples from.
// maxSampleCount: upper bound on the number of samples to read; by default
//                 effectively unlimited, i.e. read until the end of the stream.
//
// Returns sqrt(sum of squared samples / number of samples read), or 0 if no
// sample could be read at all.
float getRMS(AudioStream& audioStream, int maxSampleCount = numeric_limits<int>::max()) {
	// Accumulate in double precision to limit rounding error over long streams
	double sumOfSquares = 0;
	int sampleCount = 0;
	while (sampleCount < maxSampleCount && !audioStream.endOfStream()) {
		const double sample = static_cast<double>(audioStream.readSample());
		// Plain multiplication instead of std::pow(sample, 2): this runs once per
		// sample, and squaring does not need a general-purpose power function.
		sumOfSquares += sample * sample;
		++sampleCount;
	}

	// Guard against division by zero when the stream was already exhausted
	if (sampleCount == 0) {
		return 0.0f;
	}
	return static_cast<float>(std::sqrt(sumOfSquares / sampleCount));
}
|
|
|
|
|
2016-05-02 18:31:59 +00:00
|
|
|
// Determines which parts of the given audio stream contain activity.
//
// Takes ownership of audioStream. The stream is first passed through
// removeDCOffset() and convertSampleRate(); then its RMS is measured in
// consecutive fixed-size windows and compared against a cutoff derived from
// the RMS of the whole stream. The resulting active ranges are padded, ranges
// separated by small gaps are joined, and every final range is logged as an
// "utterance" event.
//
// Returns a timeline, bounded by the stream's getTruncatedRange(), whose
// elements are the detected activity ranges.
BoundedTimeline<void> detectVoiceActivity(std::unique_ptr<AudioStream> audioStream) {
	// Strip any DC offset from the stream before measuring levels
	audioStream = removeDCOffset(std::move(audioStream));

	// Downsample to twice the highest frequency of interest to remove noise
	constexpr int maxFrequency = 1000;
	constexpr int sampleRate = 2 * maxFrequency;
	audioStream = convertSampleRate(std::move(audioStream), sampleRate);

	// The activity threshold is a fixed fraction of the overall RMS. The overall
	// RMS is measured on a clone so the original stream is left untouched for
	// the windowed pass below.
	// NOTE(review): clone(true) presumably yields a clone positioned at the
	// start of the stream - confirm against AudioStream.
	const float overallRMS = getRMS(*audioStream->clone(true));
	const float cutoff = overallRMS / 50;

	// Walk the stream in windows of sampleRate / 100 samples; each window
	// corresponds to one centisecond step on the timeline.
	constexpr int samplesPerWindow = sampleRate / 100;
	BoundedTimeline<void> activity(audioStream->getTruncatedRange());
	centiseconds time = centiseconds::zero();
	while (!audioStream->endOfStream()) {
		const float windowRMS = getRMS(*audioStream, samplesPerWindow);
		if (windowRMS > cutoff) {
			activity.set(time, time + centiseconds(1));
		}
		++time;
	}

	// Widen every active range on both sides to prevent cropping.
	// Iterate over a snapshot, since the loop body modifies the timeline itself.
	const centiseconds padding(3);
	BoundedTimeline<void> unpaddedActivity(activity);
	for (const auto& range : unpaddedActivity) {
		const auto paddedStart = range.getStart() - padding;
		const auto paddedEnd = range.getEnd() + padding;
		activity.set(paddedStart, paddedEnd);
	}

	// Close gaps between neighboring ranges that are no longer than maxGap.
	// NOTE(review): getPairs() presumably yields adjacent elements as
	// (first, second) pairs - confirm against pairs.h.
	const centiseconds maxGap(5);
	for (const auto& neighbors : getPairs(activity)) {
		const auto gapStart = neighbors.first.getEnd();
		const auto gapEnd = neighbors.second.getStart();
		if (!(gapEnd - gapStart <= maxGap)) {
			continue;
		}
		activity.set(gapStart, gapEnd);
	}

	// Report every resulting range to the log
	for (const auto& utterance : activity) {
		logging::logTimedEvent("utterance", utterance.getTimeRange(), std::string());
	}

	return activity;
}
|