From c67e9161854068e1f11cbc09535a988ff318722f Mon Sep 17 00:00:00 2001
From: Daniel Wolf
Date: Tue, 17 May 2016 14:16:16 +0200
Subject: [PATCH] Splitting audio into utterances before processing

Advantages:
* No problems with long silences (PocketSphinx doesn't like them)
* Potential for parallelization
* Potential for improved phone timing accuracy
---
 src/audio/voiceActivityDetection.cpp |  7 +--
 src/phoneExtraction.cpp              | 85 +++++++++++++++++++---------
 2 files changed, 59 insertions(+), 33 deletions(-)

diff --git a/src/audio/voiceActivityDetection.cpp b/src/audio/voiceActivityDetection.cpp
index 19f7ecd..9519985 100644
--- a/src/audio/voiceActivityDetection.cpp
+++ b/src/audio/voiceActivityDetection.cpp
@@ -29,7 +29,7 @@ BoundedTimeline detectVoiceActivity(std::unique_ptr audioStre
 
 	// Detect activity
 	const float rms = getRMS(*audioStream->clone(true));
-	const float cutoff = rms / 50;
+	const float cutoff = rms / 25;
 	BoundedTimeline activity(audioStream->getTruncatedRange());
 	for (centiseconds time = centiseconds::zero(); !audioStream->endOfStream(); ++time) {
 		float currentRMS = getRMS(*audioStream, sampleRate / 100);
@@ -53,10 +53,5 @@ BoundedTimeline detectVoiceActivity(std::unique_ptr audioStre
 		}
 	}
 
-	// Log
-	for (const auto& utterance : activity) {
-		logging::logTimedEvent("utterance", utterance.getTimeRange(), std::string());
-	}
-
 	return activity;
 }
diff --git a/src/phoneExtraction.cpp b/src/phoneExtraction.cpp
index dab5ccb..0709c3e 100644
--- a/src/phoneExtraction.cpp
+++ b/src/phoneExtraction.cpp
@@ -12,6 +12,8 @@
 #include
 #include