Gracefully handling failed audio alignment
This commit is contained in:
parent
944c374415
commit
0e00e58d91
|
@ -16,6 +16,7 @@
|
||||||
#include "languageModels.h"
|
#include "languageModels.h"
|
||||||
#include "tokenization.h"
|
#include "tokenization.h"
|
||||||
#include "g2p.h"
|
#include "g2p.h"
|
||||||
|
#include "ContinuousTimeline.h"
|
||||||
|
|
||||||
extern "C" {
|
extern "C" {
|
||||||
#include <pocketsphinx.h>
|
#include <pocketsphinx.h>
|
||||||
|
@ -184,7 +185,7 @@ s3wid_t getWordId(const string& word, dict_t& dictionary) {
|
||||||
return wordId;
|
return wordId;
|
||||||
}
|
}
|
||||||
|
|
||||||
BoundedTimeline<Phone> getPhoneAlignment(
|
optional<BoundedTimeline<Phone>> getPhoneAlignment(
|
||||||
const vector<s3wid_t>& wordIds,
|
const vector<s3wid_t>& wordIds,
|
||||||
unique_ptr<AudioStream> audioStream,
|
unique_ptr<AudioStream> audioStream,
|
||||||
ps_decoder_t& decoder,
|
ps_decoder_t& decoder,
|
||||||
|
@ -216,27 +217,30 @@ BoundedTimeline<Phone> getPhoneAlignment(
|
||||||
error = acmod_start_utt(acousticModel);
|
error = acmod_start_utt(acousticModel);
|
||||||
if (error) throw runtime_error("Error starting utterance processing for alignment.");
|
if (error) throw runtime_error("Error starting utterance processing for alignment.");
|
||||||
|
|
||||||
// Start search
|
{
|
||||||
ps_search_start(search.get());
|
// Eventually end recognition
|
||||||
|
auto endRecognition = gsl::finally([&]() { acmod_end_utt(acousticModel); });
|
||||||
|
|
||||||
// Process entire sound stream
|
// Start search
|
||||||
auto processBuffer = [&](const vector<int16_t>& buffer) {
|
ps_search_start(search.get());
|
||||||
const int16* nextSample = buffer.data();
|
|
||||||
size_t remainingSamples = buffer.size();
|
// Process entire sound stream
|
||||||
while (acmod_process_raw(acousticModel, &nextSample, &remainingSamples, false) > 0) {
|
auto processBuffer = [&](const vector<int16_t>& buffer) {
|
||||||
while (acousticModel->n_feat_frame > 0) {
|
const int16* nextSample = buffer.data();
|
||||||
ps_search_step(search.get(), acousticModel->output_frame);
|
size_t remainingSamples = buffer.size();
|
||||||
acmod_advance(acousticModel);
|
while (acmod_process_raw(acousticModel, &nextSample, &remainingSamples, false) > 0) {
|
||||||
|
while (acousticModel->n_feat_frame > 0) {
|
||||||
|
ps_search_step(search.get(), acousticModel->output_frame);
|
||||||
|
acmod_advance(acousticModel);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
};
|
||||||
};
|
processAudioStream(*audioStream.get(), processBuffer, progressSink);
|
||||||
processAudioStream(*audioStream.get(), processBuffer, progressSink);
|
|
||||||
|
|
||||||
// End search
|
// End search
|
||||||
ps_search_finish(search.get());
|
error = ps_search_finish(search.get());
|
||||||
|
if (error) return boost::none;
|
||||||
// End recognition
|
}
|
||||||
acmod_end_utt(acousticModel);
|
|
||||||
|
|
||||||
// Extract phones with timestamps
|
// Extract phones with timestamps
|
||||||
char** phoneNames = decoder.dict->mdef->ciname;
|
char** phoneNames = decoder.dict->mdef->ciname;
|
||||||
|
@ -356,7 +360,8 @@ BoundedTimeline<Phone> detectPhones(
|
||||||
if (wordIds.empty()) continue;
|
if (wordIds.empty()) continue;
|
||||||
|
|
||||||
// Align the words' phones with speech
|
// Align the words' phones with speech
|
||||||
BoundedTimeline<Phone> segmentPhones = getPhoneAlignment(wordIds, std::move(streamSegment), *decoder.get(), alignmentProgressSink);
|
BoundedTimeline<Phone> segmentPhones = getPhoneAlignment(wordIds, std::move(streamSegment), *decoder.get(), alignmentProgressSink)
|
||||||
|
.value_or(ContinuousTimeline<Phone>(streamSegment->getTruncatedRange(), Phone::Unknown));
|
||||||
segmentPhones.shift(timedUtterance.getStart());
|
segmentPhones.shift(timedUtterance.getStart());
|
||||||
for (const auto& timedPhone : segmentPhones) {
|
for (const auto& timedPhone : segmentPhones) {
|
||||||
logging::logTimedEvent("phone", timedPhone);
|
logging::logTimedEvent("phone", timedPhone);
|
||||||
|
|
Loading…
Reference in New Issue