Improved speed of voice activity detection
... by a factor of 2 by removing the second pass. Also added voice activity detection to the progress calculation.
This commit is contained in:
parent
c4b054176c
commit
4346552312
|
@ -5,6 +5,7 @@
|
|||
#include <logging.h>
|
||||
#include <pairs.h>
|
||||
#include <boost/range/adaptor/transformed.hpp>
|
||||
#include <stringTools.h>
|
||||
|
||||
using std::numeric_limits;
|
||||
using std::vector;
|
||||
|
@ -13,7 +14,7 @@ using boost::adaptors::transformed;
|
|||
using fmt::format;
|
||||
|
||||
float getRMS(AudioStream& audioStream, int maxSampleCount = numeric_limits<int>::max()) {
|
||||
double sum = 0;
|
||||
double sum = 0; // Use double to prevent rounding errors with large number of summands
|
||||
int sampleCount;
|
||||
for (sampleCount = 0; sampleCount < maxSampleCount && !audioStream.endOfStream(); sampleCount++) {
|
||||
sum += std::pow(static_cast<double>(audioStream.readSample()), 2);
|
||||
|
@ -21,7 +22,17 @@ float getRMS(AudioStream& audioStream, int maxSampleCount = numeric_limits<int>:
|
|||
return sampleCount > 0 ? static_cast<float>(std::sqrt(sum / sampleCount)) : 0.0f;
|
||||
}
|
||||
|
||||
BoundedTimeline<void> detectVoiceActivity(std::unique_ptr<AudioStream> audioStream) {
|
||||
float getRMS(const vector<float>& rmsSegments) {
|
||||
if (rmsSegments.empty()) return 0;
|
||||
|
||||
double sum = 0; // Use double to prevent rounding errors with large number of summands
|
||||
for (float rmsSegment : rmsSegments) {
|
||||
sum += rmsSegment;
|
||||
}
|
||||
return static_cast<float>(std::sqrt(sum / rmsSegments.size()));
|
||||
}
|
||||
|
||||
BoundedTimeline<void> detectVoiceActivity(std::unique_ptr<AudioStream> audioStream, ProgressSink& progressSink) {
|
||||
// Make sure audio stream has no DC offset
|
||||
audioStream = removeDCOffset(std::move(audioStream));
|
||||
|
||||
|
@ -30,12 +41,25 @@ BoundedTimeline<void> detectVoiceActivity(std::unique_ptr<AudioStream> audioStre
|
|||
constexpr int sampleRate = 2 * maxFrequency;
|
||||
audioStream = convertSampleRate(std::move(audioStream), sampleRate);
|
||||
|
||||
// Collect RMS data
|
||||
vector<float> rmsSegments;
|
||||
logging::debug("RMS calculation -- start");
|
||||
int64_t centisecondCount = (audioStream->getSampleCount() - audioStream->getSampleIndex()) / 100;
|
||||
for (int cs = 0; cs < centisecondCount; ++cs) {
|
||||
rmsSegments.push_back(getRMS(*audioStream, sampleRate / 100));
|
||||
progressSink.reportProgress(static_cast<double>(cs) / centisecondCount);
|
||||
}
|
||||
logging::debug("RMS calculation -- end");
|
||||
|
||||
const float rms = getRMS(rmsSegments);
|
||||
logging::debugFormat("RMS value: {0:.5f}", rms);
|
||||
|
||||
// Detect activity
|
||||
const float rms = getRMS(*audioStream->clone(true));
|
||||
const float cutoff = rms / 25;
|
||||
logging::debugFormat("RMS cutoff for voice activity detection: {0:.5f}", cutoff);
|
||||
BoundedTimeline<void> activity(audioStream->getTruncatedRange());
|
||||
for (centiseconds time = centiseconds::zero(); !audioStream->endOfStream(); ++time) {
|
||||
float currentRMS = getRMS(*audioStream, sampleRate / 100);
|
||||
for (centiseconds time = centiseconds::zero(); static_cast<size_t>(time.count()) < rmsSegments.size(); ++time) {
|
||||
float currentRMS = rmsSegments[time.count()];
|
||||
bool active = currentRMS > cutoff;
|
||||
if (active) {
|
||||
activity.set(time, time + centiseconds(1));
|
||||
|
|
|
@ -2,5 +2,6 @@
|
|||
#include <memory>
|
||||
#include "AudioStream.h"
|
||||
#include <BoundedTimeline.h>
|
||||
#include <ProgressBar.h>
|
||||
|
||||
BoundedTimeline<void> detectVoiceActivity(std::unique_ptr<AudioStream> audioStream);
|
||||
BoundedTimeline<void> detectVoiceActivity(std::unique_ptr<AudioStream> audioStream, ProgressSink& progressSink);
|
||||
|
|
|
@ -77,7 +77,7 @@ int16_t floatSampleToInt16(float sample) {
|
|||
}
|
||||
|
||||
void processAudioStream(AudioStream& audioStream16kHz, function<void(const vector<int16_t>&)> processBuffer, ProgressSink& progressSink) {
|
||||
// Process entire sound file
|
||||
// Process entire sound stream
|
||||
vector<int16_t> buffer;
|
||||
const int capacity = 1600; // 0.1 second capacity
|
||||
buffer.reserve(capacity);
|
||||
|
@ -155,7 +155,7 @@ BoundedTimeline<string> recognizeWords(unique_ptr<AudioStream> audioStream, ps_d
|
|||
int error = ps_start_utt(&decoder);
|
||||
if (error) throw runtime_error("Error starting utterance processing for word recognition.");
|
||||
|
||||
// Process entire sound file
|
||||
// Process entire sound stream
|
||||
auto processBuffer = [&decoder](const vector<int16_t>& buffer) {
|
||||
int searchedFrameCount = ps_process_raw(&decoder, buffer.data(), buffer.size(), false, false);
|
||||
if (searchedFrameCount < 0) throw runtime_error("Error analyzing raw audio data for word recognition.");
|
||||
|
@ -220,8 +220,8 @@ BoundedTimeline<Phone> getPhoneAlignment(
|
|||
// Start search
|
||||
ps_search_start(search.get());
|
||||
|
||||
// Process entire sound file
|
||||
auto processBuffer = [&decoder, &acousticModel, &search](const vector<int16_t>& buffer) {
|
||||
// Process entire sound stream
|
||||
auto processBuffer = [&](const vector<int16_t>& buffer) {
|
||||
const int16* nextSample = buffer.data();
|
||||
size_t remainingSamples = buffer.size();
|
||||
while (acmod_process_raw(acousticModel, &nextSample, &remainingSamples, false) > 0) {
|
||||
|
@ -283,12 +283,6 @@ BoundedTimeline<Phone> detectPhones(
|
|||
optional<u32string> dialog,
|
||||
ProgressSink& progressSink)
|
||||
{
|
||||
// Pocketsphinx doesn't like empty input
|
||||
TimeRange audioRange = audioStream->getTruncatedRange();
|
||||
if (audioRange.empty()) {
|
||||
return BoundedTimeline<Phone>(audioRange);
|
||||
}
|
||||
|
||||
// Discard Pocketsphinx output
|
||||
err_set_logfp(nullptr);
|
||||
|
||||
|
@ -298,12 +292,15 @@ BoundedTimeline<Phone> detectPhones(
|
|||
// Make sure audio stream has no DC offset
|
||||
audioStream = removeDCOffset(std::move(audioStream));
|
||||
|
||||
ProgressMerger totalProgressMerger(progressSink);
|
||||
ProgressSink& voiceActivationProgressSink = totalProgressMerger.addSink(1.0);
|
||||
ProgressSink& dialogProgressSink = totalProgressMerger.addSink(15);
|
||||
|
||||
try {
|
||||
// Split audio into utterances
|
||||
BoundedTimeline<void> utterances = detectVoiceActivity(audioStream->clone(true));
|
||||
|
||||
BoundedTimeline<void> utterances = detectVoiceActivity(audioStream->clone(true), voiceActivationProgressSink);
|
||||
// For progress reporting: weigh utterances by length
|
||||
ProgressMerger dialogProgressMerger(progressSink);
|
||||
ProgressMerger dialogProgressMerger(dialogProgressSink);
|
||||
vector<ProgressSink*> utteranceProgressSinks;
|
||||
for (const auto& timedUtterance : utterances) {
|
||||
utteranceProgressSinks.push_back(&dialogProgressMerger.addSink(timedUtterance.getTimeRange().getLength().count()));
|
||||
|
|
Loading…
Reference in New Issue