Performing per-utterance cepstral mean normalization
See discussion in https://sourceforge.net/p/cmusphinx/discussion/help/thread/51e2979b/
This commit is contained in:
parent
f4f9ffe883
commit
d97c880754
|
@ -2,6 +2,7 @@
|
||||||
|
|
||||||
using std::function;
|
using std::function;
|
||||||
using std::vector;
|
using std::vector;
|
||||||
|
using std::unique_ptr;
|
||||||
|
|
||||||
// Converts a float in the range -1..1 to a signed 16-bit int
|
// Converts a float in the range -1..1 to a signed 16-bit int
|
||||||
inline int16_t floatSampleToInt16(float sample) {
|
inline int16_t floatSampleToInt16(float sample) {
|
||||||
|
@ -38,3 +39,11 @@ void process16bitAudioClip(const AudioClip& audioClip, function<void(const vecto
|
||||||
process16bitAudioClip(audioClip, processBuffer, capacity, progressSink);
|
process16bitAudioClip(audioClip, processBuffer, capacity, progressSink);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
unique_ptr<vector<int16_t>> copyTo16bitBuffer(const AudioClip& audioClip) {
|
||||||
|
auto result = std::make_unique<vector<int16_t>>(static_cast<size_t>(audioClip.size()));
|
||||||
|
int index = 0;
|
||||||
|
for (float sample : audioClip) {
|
||||||
|
(*result)[index++] = floatSampleToInt16(sample);
|
||||||
|
}
|
||||||
|
return std::move(result);
|
||||||
|
}
|
||||||
|
|
|
@ -6,4 +6,5 @@
|
||||||
#include "ProgressBar.h"
|
#include "ProgressBar.h"
|
||||||
|
|
||||||
void process16bitAudioClip(const AudioClip& audioClip, std::function<void(const std::vector<int16_t>&)> processBuffer, size_t bufferCapacity, ProgressSink& progressSink);
|
void process16bitAudioClip(const AudioClip& audioClip, std::function<void(const std::vector<int16_t>&)> processBuffer, size_t bufferCapacity, ProgressSink& progressSink);
|
||||||
void process16bitAudioClip(const AudioClip& audioClip, std::function<void(const std::vector<int16_t>&)> processBuffer, ProgressSink& progressSink);
|
void process16bitAudioClip(const AudioClip& audioClip, std::function<void(const std::vector<int16_t>&)> processBuffer, ProgressSink& progressSink);
|
||||||
|
std::unique_ptr<std::vector<int16_t>> copyTo16bitBuffer(const AudioClip& audioClip);
|
|
@ -96,7 +96,7 @@ void sphinxLogCallback(void* user_data, err_lvl_t errorLevel, const char* format
|
||||||
logging::log(logLevel, message);
|
logging::log(logLevel, message);
|
||||||
}
|
}
|
||||||
|
|
||||||
BoundedTimeline<string> recognizeWords(const AudioClip& inputAudioClip, ps_decoder_t& decoder, bool& decoderIsStillUsable, ProgressSink& progressSink) {
|
BoundedTimeline<string> recognizeWords(const AudioClip& inputAudioClip, ps_decoder_t& decoder, bool& decoderIsStillUsable) {
|
||||||
// Convert audio stream to the exact format PocketSphinx requires
|
// Convert audio stream to the exact format PocketSphinx requires
|
||||||
const unique_ptr<AudioClip> audioClip = inputAudioClip.clone() | resample(sphinxSampleRate);
|
const unique_ptr<AudioClip> audioClip = inputAudioClip.clone() | resample(sphinxSampleRate);
|
||||||
|
|
||||||
|
@ -107,12 +107,12 @@ BoundedTimeline<string> recognizeWords(const AudioClip& inputAudioClip, ps_decod
|
||||||
int error = ps_start_utt(&decoder);
|
int error = ps_start_utt(&decoder);
|
||||||
if (error) throw runtime_error("Error starting utterance processing for word recognition.");
|
if (error) throw runtime_error("Error starting utterance processing for word recognition.");
|
||||||
|
|
||||||
// Process entire sound stream
|
// Process entire audio clip
|
||||||
auto processBuffer = [&decoder](const vector<int16_t>& buffer) {
|
auto buffer = copyTo16bitBuffer(*audioClip);
|
||||||
int searchedFrameCount = ps_process_raw(&decoder, buffer.data(), buffer.size(), false, false);
|
const bool noRecognition = false;
|
||||||
if (searchedFrameCount < 0) throw runtime_error("Error analyzing raw audio data for word recognition.");
|
const bool fullUtterance = true;
|
||||||
};
|
int searchedFrameCount = ps_process_raw(&decoder, buffer->data(), buffer->size(), noRecognition, fullUtterance);
|
||||||
process16bitAudioClip(*audioClip, processBuffer, progressSink);
|
if (searchedFrameCount < 0) throw runtime_error("Error analyzing raw audio data for word recognition.");
|
||||||
|
|
||||||
// End recognition
|
// End recognition
|
||||||
error = ps_end_utt(&decoder);
|
error = ps_end_utt(&decoder);
|
||||||
|
@ -154,8 +154,7 @@ s3wid_t getWordId(const string& word, dict_t& dictionary) {
|
||||||
optional<Timeline<Phone>> getPhoneAlignment(
|
optional<Timeline<Phone>> getPhoneAlignment(
|
||||||
const vector<s3wid_t>& wordIds,
|
const vector<s3wid_t>& wordIds,
|
||||||
const AudioClip& inputAudioClip,
|
const AudioClip& inputAudioClip,
|
||||||
ps_decoder_t& decoder,
|
ps_decoder_t& decoder)
|
||||||
ProgressSink& progressSink)
|
|
||||||
{
|
{
|
||||||
// Create alignment list
|
// Create alignment list
|
||||||
lambda_unique_ptr<ps_alignment_t> alignment(
|
lambda_unique_ptr<ps_alignment_t> alignment(
|
||||||
|
@ -190,18 +189,17 @@ optional<Timeline<Phone>> getPhoneAlignment(
|
||||||
// Start search
|
// Start search
|
||||||
ps_search_start(search.get());
|
ps_search_start(search.get());
|
||||||
|
|
||||||
// Process entire sound stream
|
// Process entire audio clip
|
||||||
auto processBuffer = [&](const vector<int16_t>& buffer) {
|
auto buffer = copyTo16bitBuffer(*audioClip);
|
||||||
const int16* nextSample = buffer.data();
|
const int16* nextSample = buffer->data();
|
||||||
size_t remainingSamples = buffer.size();
|
size_t remainingSamples = buffer->size();
|
||||||
while (acmod_process_raw(acousticModel, &nextSample, &remainingSamples, false) > 0) {
|
bool fullUtterance = true;
|
||||||
while (acousticModel->n_feat_frame > 0) {
|
while (acmod_process_raw(acousticModel, &nextSample, &remainingSamples, fullUtterance) > 0) {
|
||||||
ps_search_step(search.get(), acousticModel->output_frame);
|
while (acousticModel->n_feat_frame > 0) {
|
||||||
acmod_advance(acousticModel);
|
ps_search_step(search.get(), acousticModel->output_frame);
|
||||||
}
|
acmod_advance(acousticModel);
|
||||||
}
|
}
|
||||||
};
|
}
|
||||||
process16bitAudioClip(*audioClip, processBuffer, progressSink);
|
|
||||||
|
|
||||||
// End search
|
// End search
|
||||||
error = ps_search_finish(search.get());
|
error = ps_search_finish(search.get());
|
||||||
|
@ -263,6 +261,8 @@ lambda_unique_ptr<ps_decoder_t> createDecoder(optional<u32string> dialog) {
|
||||||
"-dither", "yes",
|
"-dither", "yes",
|
||||||
// Disable VAD -- we're doing that ourselves
|
// Disable VAD -- we're doing that ourselves
|
||||||
"-remove_silence", "no",
|
"-remove_silence", "no",
|
||||||
|
// Perform per-utterance cepstral mean normalization
|
||||||
|
"-cmn", "batch",
|
||||||
nullptr),
|
nullptr),
|
||||||
[](cmd_ln_t* config) { cmd_ln_free_r(config); });
|
[](cmd_ln_t* config) { cmd_ln_free_r(config); });
|
||||||
if (!config) throw runtime_error("Error creating configuration.");
|
if (!config) throw runtime_error("Error creating configuration.");
|
||||||
|
@ -309,7 +309,8 @@ Timeline<Phone> utteranceToPhones(
|
||||||
const unique_ptr<AudioClip> clipSegment = audioClip.clone() | segment(utterance);
|
const unique_ptr<AudioClip> clipSegment = audioClip.clone() | segment(utterance);
|
||||||
|
|
||||||
// Get words
|
// Get words
|
||||||
BoundedTimeline<string> words = recognizeWords(*clipSegment, decoder, decoderIsStillUsable, wordRecognitionProgressSink);
|
BoundedTimeline<string> words = recognizeWords(*clipSegment, decoder, decoderIsStillUsable);
|
||||||
|
wordRecognitionProgressSink.reportProgress(1.0);
|
||||||
for (Timed<string> timedWord : words) {
|
for (Timed<string> timedWord : words) {
|
||||||
timedWord.getTimeRange().shift(utterance.getStart());
|
timedWord.getTimeRange().shift(utterance.getStart());
|
||||||
logging::logTimedEvent("word", timedWord);
|
logging::logTimedEvent("word", timedWord);
|
||||||
|
@ -326,8 +327,9 @@ Timeline<Phone> utteranceToPhones(
|
||||||
#if BOOST_VERSION < 105600 // Support legacy syntax
|
#if BOOST_VERSION < 105600 // Support legacy syntax
|
||||||
#define value_or get_value_or
|
#define value_or get_value_or
|
||||||
#endif
|
#endif
|
||||||
Timeline<Phone> segmentPhones = getPhoneAlignment(wordIds, *clipSegment, decoder, alignmentProgressSink)
|
Timeline<Phone> segmentPhones = getPhoneAlignment(wordIds, *clipSegment, decoder)
|
||||||
.value_or(ContinuousTimeline<Phone>(clipSegment->getTruncatedRange(), Phone::Noise));
|
.value_or(ContinuousTimeline<Phone>(clipSegment->getTruncatedRange(), Phone::Noise));
|
||||||
|
alignmentProgressSink.reportProgress(1.0);
|
||||||
segmentPhones.shift(utterance.getStart());
|
segmentPhones.shift(utterance.getStart());
|
||||||
for (const auto& timedPhone : segmentPhones) {
|
for (const auto& timedPhone : segmentPhones) {
|
||||||
logging::logTimedEvent("rawPhone", timedPhone);
|
logging::logTimedEvent("rawPhone", timedPhone);
|
||||||
|
|
Loading…
Reference in New Issue