Sharing audio buffer between operations

Daniel Wolf 2016-09-26 13:11:01 +02:00
parent de05f69507
commit 750078618c
3 changed files with 14 additions and 25 deletions

View File

@@ -39,11 +39,11 @@ void process16bitAudioClip(const AudioClip& audioClip, function<void(const vecto
     process16bitAudioClip(audioClip, processBuffer, capacity, progressSink);
 }
 
-unique_ptr<vector<int16_t>> copyTo16bitBuffer(const AudioClip& audioClip) {
-    auto result = std::make_unique<vector<int16_t>>(static_cast<size_t>(audioClip.size()));
+vector<int16_t> copyTo16bitBuffer(const AudioClip& audioClip) {
+    vector<int16_t> result(static_cast<size_t>(audioClip.size()));
     int index = 0;
     for (float sample : audioClip) {
-        (*result)[index++] = floatSampleToInt16(sample);
+        result[index++] = floatSampleToInt16(sample);
     }
     return std::move(result);
 }
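
A note on the rewritten copyTo16bitBuffer: returning the vector by value is cheap, because C++11 either moves the local or elides the copy entirely (NRVO). The surviving "return std::move(result);" is therefore redundant, and even counterproductive, since std::move on a returned local disables NRVO. A minimal sketch (illustration only, not part of this commit):

    // Illustration (not from this commit): a local vector returned by value
    // is moved automatically, or the copy is elided altogether (NRVO).
    std::vector<int16_t> makeBuffer(std::size_t sampleCount) {
        std::vector<int16_t> result(sampleCount);
        // ... fill result ...
        return result; // no std::move needed; std::move would inhibit NRVO
    }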

View File

@@ -7,4 +7,4 @@
 void process16bitAudioClip(const AudioClip& audioClip, std::function<void(const std::vector<int16_t>&)> processBuffer, size_t bufferCapacity, ProgressSink& progressSink);
 void process16bitAudioClip(const AudioClip& audioClip, std::function<void(const std::vector<int16_t>&)> processBuffer, ProgressSink& progressSink);
-std::unique_ptr<std::vector<int16_t>> copyTo16bitBuffer(const AudioClip& audioClip);
+std::vector<int16_t> copyTo16bitBuffer(const AudioClip& audioClip);

View File

@@ -97,10 +97,7 @@ void sphinxLogCallback(void* user_data, err_lvl_t errorLevel, const char* format
     logging::log(logLevel, message);
 }
 
-BoundedTimeline<string> recognizeWords(const AudioClip& inputAudioClip, ps_decoder_t& decoder) {
-    // Convert audio stream to the exact format PocketSphinx requires
-    const unique_ptr<AudioClip> audioClip = inputAudioClip.clone() | resample(sphinxSampleRate);
-
+BoundedTimeline<string> recognizeWords(const vector<int16_t>& audioBuffer, ps_decoder_t& decoder) {
     // Restart timing at 0
     ps_start_stream(&decoder);
@@ -109,21 +106,16 @@ BoundedTimeline<string> recognizeWords(const AudioClip& inputAudioClip, ps_decod
     if (error) throw runtime_error("Error starting utterance processing for word recognition.");
 
     // Process entire audio clip
-    auto buffer = copyTo16bitBuffer(*audioClip);
     const bool noRecognition = false;
     const bool fullUtterance = true;
-    int searchedFrameCount = ps_process_raw(&decoder, buffer->data(), buffer->size(), noRecognition, fullUtterance);
+    int searchedFrameCount = ps_process_raw(&decoder, audioBuffer.data(), audioBuffer.size(), noRecognition, fullUtterance);
     if (searchedFrameCount < 0) throw runtime_error("Error analyzing raw audio data for word recognition.");
 
     // End recognition
     error = ps_end_utt(&decoder);
     if (error) throw runtime_error("Error ending utterance processing for word recognition.");
 
-    // PocketSphinx can't handle an utterance with no recognized words.
-    // As a result, the following utterance will be garbage.
-    // As a workaround, we throw away the decoder in this case.
-    // See https://sourceforge.net/p/cmusphinx/discussion/help/thread/f1dd91c5/#7529
-    BoundedTimeline<string> result(audioClip->getTruncatedRange());
+    BoundedTimeline<string> result(TimeRange(0_cs, centiseconds(100 * audioBuffer.size() / sphinxSampleRate)));
     bool noWordsRecognized = reinterpret_cast<ngram_search_t*>(decoder.search)->bpidx == 0;
     if (noWordsRecognized) {
         return result;
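
The result range is now computed directly from the buffer instead of audioClip->getTruncatedRange(): sample count divided by sample rate gives seconds, and the factor 100 converts to centiseconds; multiplying before dividing keeps centisecond precision in integer arithmetic. A worked example, assuming sphinxSampleRate is 16000 Hz (the constant's value does not appear in this diff):

    // Illustration: duration of the shared buffer in centiseconds.
    // 48000 samples at an assumed 16000 Hz: 100 * 48000 / 16000 = 300 cs = 3 s
    const std::size_t sampleCount = 48000; // stands in for audioBuffer.size()
    const int sampleRate = 16000;          // assumed value of sphinxSampleRate
    const long long durationCs = 100LL * sampleCount / sampleRate; // 300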
@@ -148,7 +140,7 @@ s3wid_t getWordId(const string& word, dict_t& dictionary) {
 optional<Timeline<Phone>> getPhoneAlignment(
     const vector<s3wid_t>& wordIds,
-    const AudioClip& inputAudioClip,
+    const vector<int16_t>& audioBuffer,
     ps_decoder_t& decoder)
 {
     // Create alignment list
@@ -163,9 +155,6 @@ optional<Timeline<Phone>> getPhoneAlignment(
     int error = ps_alignment_populate(alignment.get());
     if (error) throw runtime_error("Error populating alignment struct.");
 
-    // Convert audio stream to the exact format PocketSphinx requires
-    const unique_ptr<AudioClip> audioClip = inputAudioClip.clone() | resample(sphinxSampleRate);
-
     // Create search structure
     acmod_t* acousticModel = decoder.acmod;
     lambda_unique_ptr<ps_search_t> search(
@@ -185,9 +174,8 @@ optional<Timeline<Phone>> getPhoneAlignment(
     ps_search_start(search.get());
 
     // Process entire audio clip
-    auto buffer = copyTo16bitBuffer(*audioClip);
-    const int16* nextSample = buffer->data();
-    size_t remainingSamples = buffer->size();
+    const int16* nextSample = audioBuffer.data();
+    size_t remainingSamples = audioBuffer.size();
     bool fullUtterance = true;
     while (acmod_process_raw(acousticModel, &nextSample, &remainingSamples, fullUtterance) > 0) {
         while (acousticModel->n_feat_frame > 0) {
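
acmod_process_raw receives the sample pointer and the count by address and advances both as it consumes audio, which is why the loop needs its own cursor variables while the shared audioBuffer itself is only read. A self-contained sketch of this consuming-API pattern, with a hypothetical consumeSamples standing in for acmod_process_raw:

    #include <cstdint>
    #include <vector>

    // Hypothetical stand-in for acmod_process_raw: consumes up to 512 samples
    // per call and advances the caller's cursor and count accordingly.
    int consumeSamples(const int16_t** cursor, std::size_t* remaining) {
        const std::size_t n = *remaining < 512 ? *remaining : 512;
        *cursor += n;
        *remaining -= n;
        return static_cast<int>(n); // 0 once the buffer is drained
    }

    void drain(const std::vector<int16_t>& audioBuffer) {
        const int16_t* nextSample = audioBuffer.data(); // buffer is never modified
        std::size_t remainingSamples = audioBuffer.size();
        while (consumeSamples(&nextSample, &remainingSamples) > 0) {
            // frames produced from the consumed samples would be scored here
        }
    }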
@@ -300,10 +288,11 @@ Timeline<Phone> utteranceToPhones(
     ProgressSink& wordRecognitionProgressSink = utteranceProgressMerger.addSink(1.0);
     ProgressSink& alignmentProgressSink = utteranceProgressMerger.addSink(0.5);
 
-    const unique_ptr<AudioClip> clipSegment = audioClip.clone() | segment(utterance);
+    const unique_ptr<AudioClip> clipSegment = audioClip.clone() | segment(utterance) | resample(sphinxSampleRate);
+    const auto audioBuffer = copyTo16bitBuffer(*clipSegment);
 
     // Get words
-    BoundedTimeline<string> words = recognizeWords(*clipSegment, decoder);
+    BoundedTimeline<string> words = recognizeWords(audioBuffer, decoder);
     wordRecognitionProgressSink.reportProgress(1.0);
     for (Timed<string> timedWord : words) {
         timedWord.getTimeRange().shift(utterance.getStart());
@@ -321,7 +310,7 @@ Timeline<Phone> utteranceToPhones(
 #if BOOST_VERSION < 105600 // Support legacy syntax
 #define value_or get_value_or
 #endif
-    Timeline<Phone> segmentPhones = getPhoneAlignment(wordIds, *clipSegment, decoder)
+    Timeline<Phone> segmentPhones = getPhoneAlignment(wordIds, audioBuffer, decoder)
         .value_or(ContinuousTimeline<Phone>(clipSegment->getTruncatedRange(), Phone::Noise));
     alignmentProgressSink.reportProgress(1.0);
     segmentPhones.shift(utterance.getStart());
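
Taken together, each utterance is now resampled and converted to 16 bits exactly once, and both recognition passes read the same immutable buffer. The per-utterance flow after this commit, pieced together from the hunks above (a recap, not new code):

    const unique_ptr<AudioClip> clipSegment =
        audioClip.clone() | segment(utterance) | resample(sphinxSampleRate);
    const auto audioBuffer = copyTo16bitBuffer(*clipSegment);               // one conversion
    BoundedTimeline<string> words = recognizeWords(audioBuffer, decoder);   // first read
    optional<Timeline<Phone>> phones =
        getPhoneAlignment(wordIds, audioBuffer, decoder);                   // second read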