Implemented voice activity detection
This commit is contained in:
parent
425f47491c
commit
8c1e24e9c8
|
@ -110,6 +110,7 @@ set(SOURCE_FILES
|
|||
src/audio/DCOffset.cpp
|
||||
src/audio/SampleRateConverter.cpp
|
||||
src/audio/UnboundedStream.cpp
|
||||
src/audio/voiceActivityDetection.cpp
|
||||
src/audio/WaveFileReader.cpp
|
||||
src/audio/waveFileWriting.cpp
|
||||
src/stringTools.cpp
|
||||
|
@ -117,6 +118,8 @@ set(SOURCE_FILES
|
|||
src/TablePrinter.cpp
|
||||
src/ProgressBar.cpp
|
||||
src/logging.cpp
|
||||
src/Timed.cpp
|
||||
src/TimeSegment.cpp
|
||||
)
|
||||
# Build the rhubarb executable from all files listed in SOURCE_FILES.
add_executable(rhubarb ${SOURCE_FILES})
|
||||
# Link rhubarb against the libraries in Boost_LIBRARIES plus cppFormat, sphinxbase and pocketSphinx.
target_link_libraries(rhubarb ${Boost_LIBRARIES} cppFormat sphinxbase pocketSphinx)
|
||||
|
|
|
@ -0,0 +1,18 @@
|
|||
#include "TimeSegment.h"
|
||||
|
||||
// Creates a segment from its two boundary times.
// The values are stored exactly as given; no ordering check is performed.
TimeSegment::TimeSegment(centiseconds start, centiseconds end)
	: start(start), end(end)
{
}
|
||||
|
||||
// Returns the start time that was passed to the constructor.
centiseconds TimeSegment::getStart() const {
	return this->start;
}
|
||||
|
||||
// Returns the end time that was passed to the constructor.
centiseconds TimeSegment::getEnd() const {
	return this->end;
}
|
||||
|
||||
// Returns the duration of the segment, i.e. end time minus start time.
centiseconds TimeSegment::getLength() const {
	const centiseconds length = this->end - this->start;
	return length;
}
|
|
@ -0,0 +1,13 @@
|
|||
#pragma once
|
||||
#include "centiseconds.h"
|
||||
|
||||
class TimeSegment {
|
||||
public:
|
||||
TimeSegment(centiseconds start, centiseconds end);
|
||||
centiseconds getStart() const;
|
||||
centiseconds getEnd() const;
|
||||
centiseconds getLength() const;
|
||||
|
||||
private:
|
||||
const centiseconds start, end;
|
||||
};
|
|
@ -0,0 +1 @@
|
|||
#include "Timed.h"
|
|
@ -0,0 +1,19 @@
|
|||
#pragma once
|
||||
|
||||
#include <TimeSegment.h>
|
||||
|
||||
template<typename TValue>
|
||||
class Timed : public TimeSegment {
|
||||
public:
|
||||
Timed(centiseconds start, centiseconds end, TValue value) :
|
||||
TimeSegment(start, end),
|
||||
value(value)
|
||||
{}
|
||||
|
||||
const TValue& getValue() const {
|
||||
return value;
|
||||
}
|
||||
|
||||
private:
|
||||
const TValue value;
|
||||
};
|
|
@ -0,0 +1,55 @@
|
|||
#include "voiceActivityDetection.h"
|
||||
#include <audio/DCOffset.h>
|
||||
#include <audio/SampleRateConverter.h>
|
||||
#include <boost/optional/optional.hpp>
|
||||
#include <logging.h>
|
||||
|
||||
using std::numeric_limits;
|
||||
using std::vector;
|
||||
using boost::optional;
|
||||
|
||||
// Computes the root mean square of the next samples of an audio stream.
//
// Reads (and thereby consumes) samples from audioStream until either
// maxSampleCount samples have been read or the stream reports its end.
// By default, the entire remainder of the stream is read.
//
// Returns the RMS of the samples read, or 0 if no sample could be read.
float getRMS(AudioStream& audioStream, int maxSampleCount = numeric_limits<int>::max()) {
	// Accumulate in double to limit rounding error over long streams
	double sumOfSquares = 0;
	int sampleCount = 0;
	while (sampleCount < maxSampleCount && !audioStream.endOfStream()) {
		const double sample = static_cast<double>(audioStream.readSample());
		// Plain multiplication instead of std::pow(sample, 2): this runs once per sample
		sumOfSquares += sample * sample;
		++sampleCount;
	}

	// Avoid dividing by zero when the stream was already exhausted
	if (sampleCount == 0) {
		return 0.0f;
	}
	return static_cast<float>(std::sqrt(sumOfSquares / sampleCount));
}
|
||||
|
||||
// Detects the stretches of an audio stream that contain voice activity.
//
// Takes ownership of audioStream and consumes it. The stream is analyzed in
// frames of one centisecond each. A frame counts as active if its RMS exceeds
// 1/50 of the RMS of the whole stream. Active frames that are separated by no
// more than a short pause (maxGap) are merged into a single segment.
//
// Returns the detected segments in chronological order. Every returned segment
// is also logged as an "utterance" timed event.
vector<TimeSegment> detectVoiceActivity(std::unique_ptr<AudioStream> audioStream) {
	// Make sure audio stream has no DC offset
	audioStream = removeDCOffset(std::move(audioStream));

	// Resample to remove noise
	constexpr int maxFrequency = 1000;
	constexpr int sampleRate = 2 * maxFrequency;
	audioStream = convertSampleRate(std::move(audioStream), sampleRate);

	// Determine the overall level on a clone so that audioStream itself is
	// still unread for the frame-wise pass below.
	// NOTE(review): assumes clone(true) yields an independent stream positioned
	// at the beginning - confirm against AudioStream.
	const float rms = getRMS(*audioStream->clone(true));
	const float cutoff = rms / 50;
	const centiseconds maxGap(10);
	// Number of samples per one-centisecond frame
	constexpr int samplesPerFrame = sampleRate / 100;

	vector<TimeSegment> result;
	optional<centiseconds> segmentStart, segmentEnd;

	// Emits the currently open segment: adds it to the result, logs it, and
	// clears the tracking state. Must only be called while a segment is open.
	auto closeSegment = [&]() {
		result.push_back(TimeSegment(segmentStart.value(), segmentEnd.value()));
		logTimedEvent("utterance", segmentStart.value(), segmentEnd.value(), "");
		segmentStart.reset();
		segmentEnd.reset();
	};

	for (centiseconds time = centiseconds(0); !audioStream->endOfStream(); ++time) {
		const float currentPower = getRMS(*audioStream, samplesPerFrame);
		const bool active = currentPower > cutoff;
		if (active) {
			if (!segmentStart) {
				segmentStart = time;
			}
			// The segment extends to the end of the current frame
			segmentEnd = time + centiseconds(1);
		} else if (segmentEnd && time > segmentEnd.value() + maxGap) {
			// The pause has become too long to bridge: the segment is complete
			closeSegment();
		}
	}

	// Emit a segment that is still open at the end of the stream. It is logged
	// like every other segment; previously it was returned but never logged.
	if (segmentEnd) {
		closeSegment();
	}

	return result;
}
|
|
@ -0,0 +1,7 @@
|
|||
#pragma once
|
||||
#include <vector>
|
||||
#include <TimeSegment.h>
|
||||
#include <memory>
|
||||
#include "AudioStream.h"
|
||||
|
||||
// Detects the time segments of an audio stream that contain voice activity.
// Takes ownership of audioStream and reads it to its end.
// Returns the detected segments in chronological order.
std::vector<TimeSegment> detectVoiceActivity(std::unique_ptr<AudioStream> audioStream);
|
Loading…
Reference in New Issue