Performing per-utterance cepstral mean normalization

See discussion in https://sourceforge.net/p/cmusphinx/discussion/help/thread/51e2979b/
2016-09-18 22:02:02 +02:00 · 2016-09-18 22:02:02 +02:00 · d97c880754
parent f4f9ffe883
commit d97c880754
3 changed files with 35 additions and 23 deletions
--- a/src/audio/processing.cpp
+++ b/src/audio/processing.cpp
@ -2,6 +2,7 @@

 using std::function;
 using std::vector;
+using std::unique_ptr;

 // Converts a float in the range -1..1 to a signed 16-bit int
 inline int16_t floatSampleToInt16(float sample) {
@ -38,3 +39,11 @@ void process16bitAudioClip(const AudioClip& audioClip, function<void(const vecto
 	process16bitAudioClip(audioClip, processBuffer, capacity, progressSink);
 }

+unique_ptr<vector<int16_t>> copyTo16bitBuffer(const AudioClip& audioClip) {
+	auto result = std::make_unique<vector<int16_t>>(static_cast<size_t>(audioClip.size()));
+	int index = 0;
+	for (float sample : audioClip) {
+		(*result)[index++] = floatSampleToInt16(sample);
+	}
+	return std::move(result);
+}
--- a/src/audio/processing.h
+++ b/src/audio/processing.h
@ -7,3 +7,4 @@

 void process16bitAudioClip(const AudioClip& audioClip, std::function<void(const std::vector<int16_t>&)> processBuffer, size_t bufferCapacity, ProgressSink& progressSink);
 void process16bitAudioClip(const AudioClip& audioClip, std::function<void(const std::vector<int16_t>&)> processBuffer, ProgressSink& progressSink);
+std::unique_ptr<std::vector<int16_t>> copyTo16bitBuffer(const AudioClip& audioClip);
--- a/src/phoneExtraction.cpp
+++ b/src/phoneExtraction.cpp
@ -96,7 +96,7 @@ void sphinxLogCallback(void* user_data, err_lvl_t errorLevel, const char* format
 	logging::log(logLevel, message);
 }

-BoundedTimeline<string> recognizeWords(const AudioClip& inputAudioClip, ps_decoder_t& decoder, bool& decoderIsStillUsable, ProgressSink& progressSink) {
+BoundedTimeline<string> recognizeWords(const AudioClip& inputAudioClip, ps_decoder_t& decoder, bool& decoderIsStillUsable) {
 	// Convert audio stream to the exact format PocketSphinx requires
 	const unique_ptr<AudioClip> audioClip = inputAudioClip.clone() | resample(sphinxSampleRate);

@ -107,12 +107,12 @@ BoundedTimeline<string> recognizeWords(const AudioClip& inputAudioClip, ps_decod
 	int error = ps_start_utt(&decoder);
 	if (error) throw runtime_error("Error starting utterance processing for word recognition.");

-	// Process entire sound stream
-	auto processBuffer = [&decoder](const vector<int16_t>& buffer) {
-		int searchedFrameCount = ps_process_raw(&decoder, buffer.data(), buffer.size(), false, false);
+	// Process entire audio clip
+	auto buffer = copyTo16bitBuffer(*audioClip);
+	const bool noRecognition = false;
+	const bool fullUtterance = true;
+	int searchedFrameCount = ps_process_raw(&decoder, buffer->data(), buffer->size(), noRecognition, fullUtterance);
 	if (searchedFrameCount < 0) throw runtime_error("Error analyzing raw audio data for word recognition.");
-	};
-	process16bitAudioClip(*audioClip, processBuffer, progressSink);

 	// End recognition
 	error = ps_end_utt(&decoder);
@ -154,8 +154,7 @@ s3wid_t getWordId(const string& word, dict_t& dictionary) {
 optional<Timeline<Phone>> getPhoneAlignment(
 	const vector<s3wid_t>& wordIds,
 	const AudioClip& inputAudioClip,
-	ps_decoder_t& decoder,
-	ProgressSink& progressSink)
+	ps_decoder_t& decoder)
 {
 	// Create alignment list
 	lambda_unique_ptr<ps_alignment_t> alignment(
@ -190,18 +189,17 @@ optional<Timeline<Phone>> getPhoneAlignment(
 		// Start search
 		ps_search_start(search.get());

-		// Process entire sound stream
-		auto processBuffer = [&](const vector<int16_t>& buffer) {
-			const int16* nextSample = buffer.data();
-			size_t remainingSamples = buffer.size();
-			while (acmod_process_raw(acousticModel, &nextSample, &remainingSamples, false) > 0) {
+		// Process entire audio clip
+		auto buffer = copyTo16bitBuffer(*audioClip);
+		const int16* nextSample = buffer->data();
+		size_t remainingSamples = buffer->size();
+		bool fullUtterance = true;
+		while (acmod_process_raw(acousticModel, &nextSample, &remainingSamples, fullUtterance) > 0) {
 			while (acousticModel->n_feat_frame > 0) {
 				ps_search_step(search.get(), acousticModel->output_frame);
 				acmod_advance(acousticModel);
 			}
 		}
-		};
-		process16bitAudioClip(*audioClip, processBuffer, progressSink);

 		// End search
 		error = ps_search_finish(search.get());
@ -263,6 +261,8 @@ lambda_unique_ptr<ps_decoder_t> createDecoder(optional<u32string> dialog) {
 			"-dither", "yes",
 			// Disable VAD -- we're doing that ourselves
 			"-remove_silence", "no",
+			// Perform per-utterance cepstral mean normalization
+			"-cmn", "batch",
 			nullptr),
 		[](cmd_ln_t* config) { cmd_ln_free_r(config); });
 	if (!config) throw runtime_error("Error creating configuration.");
@ -309,7 +309,8 @@ Timeline<Phone> utteranceToPhones(
 	const unique_ptr<AudioClip> clipSegment = audioClip.clone() | segment(utterance);

 	// Get words
-	BoundedTimeline<string> words = recognizeWords(*clipSegment, decoder, decoderIsStillUsable, wordRecognitionProgressSink);
+	BoundedTimeline<string> words = recognizeWords(*clipSegment, decoder, decoderIsStillUsable);
+	wordRecognitionProgressSink.reportProgress(1.0);
 	for (Timed<string> timedWord : words) {
 		timedWord.getTimeRange().shift(utterance.getStart());
 		logging::logTimedEvent("word", timedWord);
@ -326,8 +327,9 @@ Timeline<Phone> utteranceToPhones(
 #if BOOST_VERSION < 105600 // Support legacy syntax
 #define value_or get_value_or
 #endif
-	Timeline<Phone> segmentPhones = getPhoneAlignment(wordIds, *clipSegment, decoder, alignmentProgressSink)
+	Timeline<Phone> segmentPhones = getPhoneAlignment(wordIds, *clipSegment, decoder)
 		.value_or(ContinuousTimeline<Phone>(clipSegment->getTruncatedRange(), Phone::Noise));
+	alignmentProgressSink.reportProgress(1.0);
 	segmentPhones.shift(utterance.getStart());
 	for (const auto& timedPhone : segmentPhones) {
 		logging::logTimedEvent("rawPhone", timedPhone);