Improved speed of voice activity detection
... by a factor of 2 by removing the second pass. Also added voice activity detection to the progress calculation.
commit 4346552312
parent c4b054176c
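The core of the speed-up, as a minimal sketch: instead of streaming the audio twice (once to measure the overall RMS, once to classify each centisecond), the per-segment RMS values are cached during the first pass, and both the threshold and the classification are derived from that cache. The sketch below uses a plain sample vector and hypothetical names (segmentRMS, detectActivity) in place of the repo's AudioStream and ProgressSink types; segmentSize corresponds to sampleRate / 100 in the commit.

#include <cmath>
#include <cstddef>
#include <vector>

// RMS of one fixed-size segment of samples.
static float segmentRMS(const std::vector<float>& samples, std::size_t begin, std::size_t end) {
	double sum = 0; // double avoids rounding drift over many summands
	for (std::size_t i = begin; i < end; ++i) sum += samples[i] * samples[i];
	return end > begin ? static_cast<float>(std::sqrt(sum / (end - begin))) : 0.0f;
}

// Single pass: cache one RMS value per segment, then derive the global
// threshold and classify each segment from the cache instead of re-reading
// the audio.
std::vector<bool> detectActivity(const std::vector<float>& samples, std::size_t segmentSize) {
	if (segmentSize == 0) return {};

	std::vector<float> rmsSegments;
	for (std::size_t begin = 0; begin + segmentSize <= samples.size(); begin += segmentSize) {
		rmsSegments.push_back(segmentRMS(samples, begin, begin + segmentSize));
	}

	// Global loudness estimate from the cached values (mirroring the commit's
	// getRMS(const vector<float>&) overload, which sums the RMS values).
	double sum = 0;
	for (float rms : rmsSegments) sum += rms;
	const float globalRMS = rmsSegments.empty()
		? 0.0f : static_cast<float>(std::sqrt(sum / rmsSegments.size()));
	const float cutoff = globalRMS / 25;

	// The second "pass" now touches only the small cache, not the audio.
	std::vector<bool> active;
	active.reserve(rmsSegments.size());
	for (float rms : rmsSegments) active.push_back(rms > cutoff);
	return active;
}

The cache holds one float per centisecond of audio, so its memory cost is negligible compared with decoding the stream a second time.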
@@ -5,6 +5,7 @@
 #include <logging.h>
 #include <pairs.h>
 #include <boost/range/adaptor/transformed.hpp>
+#include <stringTools.h>
 
 using std::numeric_limits;
 using std::vector;
@@ -13,7 +14,7 @@ using boost::adaptors::transformed;
 using fmt::format;
 
 float getRMS(AudioStream& audioStream, int maxSampleCount = numeric_limits<int>::max()) {
-	double sum = 0;
+	double sum = 0; // Use double to prevent rounding errors with large number of summands
 	int sampleCount;
 	for (sampleCount = 0; sampleCount < maxSampleCount && !audioStream.endOfStream(); sampleCount++) {
 		sum += std::pow(static_cast<double>(audioStream.readSample()), 2);
@@ -21,7 +22,17 @@ float getRMS(AudioStream& audioStream, int maxSampleCount = numeric_limits<int>:
 	return sampleCount > 0 ? static_cast<float>(std::sqrt(sum / sampleCount)) : 0.0f;
 }
 
-BoundedTimeline<void> detectVoiceActivity(std::unique_ptr<AudioStream> audioStream) {
+float getRMS(const vector<float>& rmsSegments) {
+	if (rmsSegments.empty()) return 0;
+
+	double sum = 0; // Use double to prevent rounding errors with large number of summands
+	for (float rmsSegment : rmsSegments) {
+		sum += rmsSegment;
+	}
+	return static_cast<float>(std::sqrt(sum / rmsSegments.size()));
+}
+
+BoundedTimeline<void> detectVoiceActivity(std::unique_ptr<AudioStream> audioStream, ProgressSink& progressSink) {
 	// Make sure audio stream has no DC offset
 	audioStream = removeDCOffset(std::move(audioStream));
 
@@ -30,12 +41,25 @@ BoundedTimeline<void> detectVoiceActivity(std::unique_ptr<AudioStream> audioStre
 	constexpr int sampleRate = 2 * maxFrequency;
 	audioStream = convertSampleRate(std::move(audioStream), sampleRate);
 
+	// Collect RMS data
+	vector<float> rmsSegments;
+	logging::debug("RMS calculation -- start");
+	int64_t centisecondCount = (audioStream->getSampleCount() - audioStream->getSampleIndex()) / 100;
+	for (int cs = 0; cs < centisecondCount; ++cs) {
+		rmsSegments.push_back(getRMS(*audioStream, sampleRate / 100));
+		progressSink.reportProgress(static_cast<double>(cs) / centisecondCount);
+	}
+	logging::debug("RMS calculation -- end");
+
+	const float rms = getRMS(rmsSegments);
+	logging::debugFormat("RMS value: {0:.5f}", rms);
+
 	// Detect activity
-	const float rms = getRMS(*audioStream->clone(true));
 	const float cutoff = rms / 25;
+	logging::debugFormat("RMS cutoff for voice activity detection: {0:.5f}", cutoff);
 	BoundedTimeline<void> activity(audioStream->getTruncatedRange());
-	for (centiseconds time = centiseconds::zero(); !audioStream->endOfStream(); ++time) {
-		float currentRMS = getRMS(*audioStream, sampleRate / 100);
+	for (centiseconds time = centiseconds::zero(); static_cast<size_t>(time.count()) < rmsSegments.size(); ++time) {
+		float currentRMS = rmsSegments[time.count()];
 		bool active = currentRMS > cutoff;
 		if (active) {
 			activity.set(time, time + centiseconds(1));
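One subtlety in the hunk above: for equal-length segments with RMS values r_1, …, r_n, the exact overall RMS is sqrt((r_1^2 + … + r_n^2) / n), while the new getRMS(const vector<float>&) overload computes sqrt((r_1 + … + r_n) / n), summing the RMS values without squaring them. Since the result is only used to derive the relative cutoff rms / 25, this shifts the effective threshold slightly rather than breaking detection, but it is worth keeping in mind when tuning the cutoff.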
@@ -2,5 +2,6 @@
 #include <memory>
 #include "AudioStream.h"
 #include <BoundedTimeline.h>
+#include <ProgressBar.h>
 
-BoundedTimeline<void> detectVoiceActivity(std::unique_ptr<AudioStream> audioStream);
+BoundedTimeline<void> detectVoiceActivity(std::unique_ptr<AudioStream> audioStream, ProgressSink& progressSink);
@@ -77,7 +77,7 @@ int16_t floatSampleToInt16(float sample) {
 }
 
 void processAudioStream(AudioStream& audioStream16kHz, function<void(const vector<int16_t>&)> processBuffer, ProgressSink& progressSink) {
-	// Process entire sound file
+	// Process entire sound stream
 	vector<int16_t> buffer;
 	const int capacity = 1600; // 0.1 second capacity
 	buffer.reserve(capacity);
@@ -155,7 +155,7 @@ BoundedTimeline<string> recognizeWords(unique_ptr<AudioStream> audioStream, ps_d
 	int error = ps_start_utt(&decoder);
 	if (error) throw runtime_error("Error starting utterance processing for word recognition.");
 
-	// Process entire sound file
+	// Process entire sound stream
 	auto processBuffer = [&decoder](const vector<int16_t>& buffer) {
 		int searchedFrameCount = ps_process_raw(&decoder, buffer.data(), buffer.size(), false, false);
 		if (searchedFrameCount < 0) throw runtime_error("Error analyzing raw audio data for word recognition.");
@@ -220,8 +220,8 @@ BoundedTimeline<Phone> getPhoneAlignment(
 	// Start search
 	ps_search_start(search.get());
 
-	// Process entire sound file
-	auto processBuffer = [&decoder, &acousticModel, &search](const vector<int16_t>& buffer) {
+	// Process entire sound stream
+	auto processBuffer = [&](const vector<int16_t>& buffer) {
 		const int16* nextSample = buffer.data();
 		size_t remainingSamples = buffer.size();
 		while (acmod_process_raw(acousticModel, &nextSample, &remainingSamples, false) > 0) {
@@ -283,12 +283,6 @@ BoundedTimeline<Phone> detectPhones(
 	optional<u32string> dialog,
 	ProgressSink& progressSink)
 {
-	// Pocketsphinx doesn't like empty input
-	TimeRange audioRange = audioStream->getTruncatedRange();
-	if (audioRange.empty()) {
-		return BoundedTimeline<Phone>(audioRange);
-	}
-
 	// Discard Pocketsphinx output
 	err_set_logfp(nullptr);
 
@@ -298,12 +292,15 @@ BoundedTimeline<Phone> detectPhones(
 	// Make sure audio stream has no DC offset
 	audioStream = removeDCOffset(std::move(audioStream));
 
+	ProgressMerger totalProgressMerger(progressSink);
+	ProgressSink& voiceActivationProgressSink = totalProgressMerger.addSink(1.0);
+	ProgressSink& dialogProgressSink = totalProgressMerger.addSink(15);
+
 	try {
 		// Split audio into utterances
-		BoundedTimeline<void> utterances = detectVoiceActivity(audioStream->clone(true));
+		BoundedTimeline<void> utterances = detectVoiceActivity(audioStream->clone(true), voiceActivationProgressSink);
 
 		// For progress reporting: weigh utterances by length
-		ProgressMerger dialogProgressMerger(progressSink);
+		ProgressMerger dialogProgressMerger(dialogProgressSink);
 		vector<ProgressSink*> utteranceProgressSinks;
 		for (const auto& timedUtterance : utterances) {
 			utteranceProgressSinks.push_back(&dialogProgressMerger.addSink(timedUtterance.getTimeRange().getLength().count()));
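The 1:15 weighting in the last hunk above (addSink(1.0) for voice activity detection, addSink(15) for dialog processing) implies weighted progress merging: each child sink reports a value in [0, 1], and the merger forwards the weighted mean to its target. A minimal sketch of that behavior, assuming the semantics of ProgressMerger rather than quoting the actual ProgressBar.h code:

#include <memory>
#include <vector>

struct Sink { // stand-in for ProgressSink
	virtual void reportProgress(double value) = 0; // value in [0, 1]
	virtual ~Sink() = default;
};

class Merger { // stand-in for ProgressMerger
public:
	explicit Merger(Sink& target) : target(target) {}

	// Each child sink carries a weight; the merged value is the weighted mean.
	Sink& addSink(double weight) {
		children.push_back(std::make_unique<Child>(*this, weight));
		totalWeight += weight;
		return *children.back();
	}

private:
	struct Child : Sink {
		Child(Merger& parent, double weight) : parent(parent), weight(weight) {}
		void reportProgress(double value) override {
			progress = value;
			parent.report();
		}
		Merger& parent;
		double weight;
		double progress = 0;
	};

	void report() {
		double merged = 0;
		for (const auto& child : children) merged += child->weight * child->progress;
		target.reportProgress(totalWeight > 0 ? merged / totalWeight : 0);
	}

	Sink& target;
	double totalWeight = 0;
	std::vector<std::unique_ptr<Child>> children;
};

Under these assumed semantics, voice activity detection accounts for 1/16 of the reported total, matching its small share of the overall runtime.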