Improved VAD quality via dry run
This commit is contained in:
parent
6de7ba020a
commit
8fa494fb77
|
@ -14,6 +14,7 @@ using std::vector;
|
||||||
using boost::adaptors::transformed;
|
using boost::adaptors::transformed;
|
||||||
using fmt::format;
|
using fmt::format;
|
||||||
using std::runtime_error;
|
using std::runtime_error;
|
||||||
|
using std::unique_ptr;
|
||||||
|
|
||||||
BoundedTimeline<void> webRtcDetectVoiceActivity(AudioStream& audioStream, ProgressSink& progressSink) {
|
BoundedTimeline<void> webRtcDetectVoiceActivity(AudioStream& audioStream, ProgressSink& progressSink) {
|
||||||
VadInst* vadHandle = WebRtcVad_Create();
|
VadInst* vadHandle = WebRtcVad_Create();
|
||||||
|
@ -45,7 +46,18 @@ BoundedTimeline<void> webRtcDetectVoiceActivity(AudioStream& audioStream, Progre
|
||||||
}
|
}
|
||||||
time += centiseconds(1);
|
time += centiseconds(1);
|
||||||
};
|
};
|
||||||
process16bitAudioStream(audioStream, processBuffer, bufferCapacity, progressSink);
|
process16bitAudioStream(*audioStream.clone(true), processBuffer, bufferCapacity, progressSink);
|
||||||
|
|
||||||
|
// WebRTC adapts to the audio. This means results may not be correct at the very beginning.
|
||||||
|
// It sometimes returns false activity at the very beginning, mistaking the background noise for speech.
|
||||||
|
// So we delete the first recognized utterance and re-process the corresponding audio segment.
|
||||||
|
if (!activity.empty()) {
|
||||||
|
TimeRange firstActivity = activity.begin()->getTimeRange();
|
||||||
|
activity.clear(firstActivity);
|
||||||
|
unique_ptr<AudioStream> streamStart = createSegment(audioStream.clone(true), TimeRange(centiseconds::zero(), firstActivity.getEnd()));
|
||||||
|
time = centiseconds::zero();
|
||||||
|
process16bitAudioStream(*streamStart, processBuffer, bufferCapacity, progressSink);
|
||||||
|
}
|
||||||
|
|
||||||
return activity;
|
return activity;
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue