Improved speed of voice activity detection

... by a factor of 2 by removing the second pass.
Also added voice activity detection to progress calculation.
This commit is contained in:
Daniel Wolf 2016-06-14 20:12:12 +02:00
parent c4b054176c
commit 4346552312
3 changed files with 41 additions and 19 deletions

View File

@ -5,6 +5,7 @@
#include <logging.h>
#include <pairs.h>
#include <boost/range/adaptor/transformed.hpp>
#include <stringTools.h>
using std::numeric_limits;
using std::vector;
@ -13,7 +14,7 @@ using boost::adaptors::transformed;
using fmt::format;
float getRMS(AudioStream& audioStream, int maxSampleCount = numeric_limits<int>::max()) {
double sum = 0;
double sum = 0; // Use double to prevent rounding errors with large number of summands
int sampleCount;
for (sampleCount = 0; sampleCount < maxSampleCount && !audioStream.endOfStream(); sampleCount++) {
sum += std::pow(static_cast<double>(audioStream.readSample()), 2);
@ -21,7 +22,17 @@ float getRMS(AudioStream& audioStream, int maxSampleCount = numeric_limits<int>:
return sampleCount > 0 ? static_cast<float>(std::sqrt(sum / sampleCount)) : 0.0f;
}
BoundedTimeline<void> detectVoiceActivity(std::unique_ptr<AudioStream> audioStream) {
// Combines per-segment RMS values into a single RMS value.
//
// Each element of rmsSegments is the RMS of one segment of the signal. The
// combined value is the square root of the mean of the *squared* segment
// values. Summing the unsquared values (as was done before) yields
// sqrt(mean(rms)), which is not an RMS and is on a different scale than the
// per-segment values it is later compared against.
//
// NOTE(review): the result equals the RMS of the whole signal only if every
// segment covers the same number of samples -- confirm against the caller.
//
// Returns 0 for an empty input.
float getRMS(const std::vector<float>& rmsSegments) {
	if (rmsSegments.empty()) return 0.0f;

	double squareSum = 0; // Use double to prevent rounding errors with large number of summands
	for (const float rmsSegment : rmsSegments) {
		const double value = rmsSegment;
		squareSum += value * value;
	}
	return static_cast<float>(std::sqrt(squareSum / rmsSegments.size()));
}
BoundedTimeline<void> detectVoiceActivity(std::unique_ptr<AudioStream> audioStream, ProgressSink& progressSink) {
// Make sure audio stream has no DC offset
audioStream = removeDCOffset(std::move(audioStream));
@ -30,12 +41,25 @@ BoundedTimeline<void> detectVoiceActivity(std::unique_ptr<AudioStream> audioStre
constexpr int sampleRate = 2 * maxFrequency;
audioStream = convertSampleRate(std::move(audioStream), sampleRate);
// Collect RMS data
vector<float> rmsSegments;
logging::debug("RMS calculation -- start");
int64_t centisecondCount = (audioStream->getSampleCount() - audioStream->getSampleIndex()) / 100;
for (int cs = 0; cs < centisecondCount; ++cs) {
rmsSegments.push_back(getRMS(*audioStream, sampleRate / 100));
progressSink.reportProgress(static_cast<double>(cs) / centisecondCount);
}
logging::debug("RMS calculation -- end");
const float rms = getRMS(rmsSegments);
logging::debugFormat("RMS value: {0:.5f}", rms);
// Detect activity
const float rms = getRMS(*audioStream->clone(true));
const float cutoff = rms / 25;
logging::debugFormat("RMS cutoff for voice activity detection: {0:.5f}", cutoff);
BoundedTimeline<void> activity(audioStream->getTruncatedRange());
for (centiseconds time = centiseconds::zero(); !audioStream->endOfStream(); ++time) {
float currentRMS = getRMS(*audioStream, sampleRate / 100);
for (centiseconds time = centiseconds::zero(); static_cast<size_t>(time.count()) < rmsSegments.size(); ++time) {
float currentRMS = rmsSegments[time.count()];
bool active = currentRMS > cutoff;
if (active) {
activity.set(time, time + centiseconds(1));

View File

@ -2,5 +2,6 @@
#include <memory>
#include "AudioStream.h"
#include <BoundedTimeline.h>
#include <ProgressBar.h>
BoundedTimeline<void> detectVoiceActivity(std::unique_ptr<AudioStream> audioStream);
BoundedTimeline<void> detectVoiceActivity(std::unique_ptr<AudioStream> audioStream, ProgressSink& progressSink);

View File

@ -77,7 +77,7 @@ int16_t floatSampleToInt16(float sample) {
}
void processAudioStream(AudioStream& audioStream16kHz, function<void(const vector<int16_t>&)> processBuffer, ProgressSink& progressSink) {
// Process entire sound file
// Process entire sound stream
vector<int16_t> buffer;
const int capacity = 1600; // 0.1 second capacity
buffer.reserve(capacity);
@ -155,7 +155,7 @@ BoundedTimeline<string> recognizeWords(unique_ptr<AudioStream> audioStream, ps_d
int error = ps_start_utt(&decoder);
if (error) throw runtime_error("Error starting utterance processing for word recognition.");
// Process entire sound file
// Process entire sound stream
auto processBuffer = [&decoder](const vector<int16_t>& buffer) {
int searchedFrameCount = ps_process_raw(&decoder, buffer.data(), buffer.size(), false, false);
if (searchedFrameCount < 0) throw runtime_error("Error analyzing raw audio data for word recognition.");
@ -220,8 +220,8 @@ BoundedTimeline<Phone> getPhoneAlignment(
// Start search
ps_search_start(search.get());
// Process entire sound file
auto processBuffer = [&decoder, &acousticModel, &search](const vector<int16_t>& buffer) {
// Process entire sound stream
auto processBuffer = [&](const vector<int16_t>& buffer) {
const int16* nextSample = buffer.data();
size_t remainingSamples = buffer.size();
while (acmod_process_raw(acousticModel, &nextSample, &remainingSamples, false) > 0) {
@ -283,12 +283,6 @@ BoundedTimeline<Phone> detectPhones(
optional<u32string> dialog,
ProgressSink& progressSink)
{
// Pocketsphinx doesn't like empty input
TimeRange audioRange = audioStream->getTruncatedRange();
if (audioRange.empty()) {
return BoundedTimeline<Phone>(audioRange);
}
// Discard Pocketsphinx output
err_set_logfp(nullptr);
@ -298,12 +292,15 @@ BoundedTimeline<Phone> detectPhones(
// Make sure audio stream has no DC offset
audioStream = removeDCOffset(std::move(audioStream));
ProgressMerger totalProgressMerger(progressSink);
ProgressSink& voiceActivationProgressSink = totalProgressMerger.addSink(1.0);
ProgressSink& dialogProgressSink = totalProgressMerger.addSink(15);
try {
// Split audio into utterances
BoundedTimeline<void> utterances = detectVoiceActivity(audioStream->clone(true));
BoundedTimeline<void> utterances = detectVoiceActivity(audioStream->clone(true), voiceActivationProgressSink);
// For progress reporting: weigh utterances by length
ProgressMerger dialogProgressMerger(progressSink);
ProgressMerger dialogProgressMerger(dialogProgressSink);
vector<ProgressSink*> utteranceProgressSinks;
for (const auto& timedUtterance : utterances) {
utteranceProgressSinks.push_back(&dialogProgressMerger.addSink(timedUtterance.getTimeRange().getLength().count()));