rhubarb-lip-sync/src/phone_extraction.cpp

#include <pocketsphinx.h>
#include <cmath>
#include <iostream>
#include <boost/filesystem.hpp>
#include "phone_extraction.h"
#include "audio_input/SampleRateConverter.h"
#include "audio_input/ChannelDownmixer.h"
#include "platform_tools.h"
using std::runtime_error;
using std::unique_ptr;
using std::shared_ptr;
using std::string;
using std::map;
using boost::filesystem::path;

// A std::unique_ptr whose deleter is a type-erased callable (std::function<void(T*)>).
// This allows a lambda to be passed as the deleter without spelling out its type,
// as done below for the PocketSphinx C handles.
template<typename T>
using lambda_unique_ptr = std::unique_ptr<T,std::function<void(T*)>>;

// Wraps the given audio stream so that it delivers a single channel at a
// frame rate of 16000. Wrappers are only added where the stream's current
// format differs; a stream that already matches is returned unchanged.
// Throws std::runtime_error if the frame rate is below 16000.
unique_ptr<AudioStream> to16kHzMono(unique_ptr<AudioStream> stream) {
	const int targetFrameRate = 16000;

	// Any channel count other than 1 gets routed through a downmixer
	const bool needsDownmix = stream->getChannelCount() != 1;
	if (needsDownmix) {
		stream = unique_ptr<AudioStream>(new ChannelDownmixer(std::move(stream)));
	}

	// Upsampling is not supported, so reject rates below the target
	if (stream->getFrameRate() < targetFrameRate) {
		throw runtime_error("Audio sample rate must not be below 16kHz.");
	}

	// Any remaining mismatch means the rate is too high: convert down
	const bool needsResampling = stream->getFrameRate() != targetFrameRate;
	if (needsResampling) {
		stream = unique_ptr<AudioStream>(new SampleRateConverter(std::move(stream), targetFrameRate));
	}

	return stream;
}

// Creates the PocketSphinx configuration used for phone recognition.
// The acoustic model and the phonetic language model are looked up inside
// sphinxModelDirectory. The returned pointer releases the configuration via
// cmd_ln_free_r. Throws std::runtime_error if cmd_ln_init returns null.
lambda_unique_ptr<cmd_ln_t> createConfig(path sphinxModelDirectory) {
	// Hold the path strings in named locals so they stay valid during cmd_ln_init
	const string acousticModelPath = (sphinxModelDirectory / "acoustic_model").string();
	const string phoneticModelPath = (sphinxModelDirectory / "en-us-phone.lm.bin").string();

	cmd_ln_t* rawConfig = cmd_ln_init(
		nullptr, ps_args(), true,
		// Acoustic model
		"-hmm", acousticModelPath.c_str(),
		// Phonetic language model
		"-allphone", phoneticModelPath.c_str(),
		"-allphone_ci", "yes",
		// Remaining settings follow http://cmusphinx.sourceforge.net/wiki/phonemerecognition
		// Beam width applied to every frame in Viterbi search
		"-beam", "1e-20",
		// Beam width applied to phone transitions
		"-pbeam", "1e-20",
		// Language model probability weight
		"-lw", "2.0",
		nullptr);
	if (rawConfig == nullptr) throw runtime_error("Error creating configuration.");

	auto freeConfig = [](cmd_ln_t* configToFree) { cmd_ln_free_r(configToFree); };
	return lambda_unique_ptr<cmd_ln_t>(rawConfig, freeConfig);
}

// Creates a PocketSphinx decoder from the given configuration.
// The returned pointer releases the decoder via ps_free.
// Throws std::runtime_error if ps_init returns null.
lambda_unique_ptr<ps_decoder_t> createPhoneRecognizer(cmd_ln_t& config) {
	ps_decoder_t* rawRecognizer = ps_init(&config);
	if (rawRecognizer == nullptr) throw runtime_error("Error creating speech recognizer.");

	auto freeRecognizer = [](ps_decoder_t* recognizerToFree) { ps_free(recognizerToFree); };
	return lambda_unique_ptr<ps_decoder_t>(rawRecognizer, freeRecognizer);
}

// Converts a float sample to a signed 16-bit int.
// The nominal input range -1..1 is mapped linearly onto INT16_MIN..INT16_MAX;
// values outside that range (including infinities) are clamped first.
// The scaled value is truncated toward zero by the final cast.
// NaN is converted to 0.
int16_t floatSampleToInt16(float sample) {
	// std::max/std::min leave NaN untouched, and casting NaN to an integer type
	// is undefined behavior, so NaN has to be handled before the conversion.
	if (std::isnan(sample)) return 0;

	sample = std::max(sample, -1.0f);
	sample = std::min(sample, 1.0f);
	return static_cast<int16_t>(((sample + 1) / 2) * (INT16_MAX - INT16_MIN) + INT16_MIN);
}

// Feeds the complete audio stream to the recognizer as a single utterance.
// Samples are read from the stream, converted to 16-bit integers and passed
// to PocketSphinx in chunks of up to 1600 samples.
// Throws std::runtime_error if starting the utterance, analyzing a chunk,
// or ending the utterance reports an error.
void processAudioStream(AudioStream& audioStream16kHzMono, ps_decoder_t& recognizer) {
	// Start recognition
	int error = ps_start_utt(&recognizer);
	if (error) throw runtime_error("Error starting utterance processing.");

	// Process entire sound file
	// Same type as buffer.size() to avoid a signed/unsigned comparison
	const std::vector<int16_t>::size_type capacity = 1600; // 0.1 second capacity at 16 kHz
	std::vector<int16_t> buffer;
	buffer.reserve(capacity);
	while (true) {
		// Read to buffer
		buffer.clear();
		float sample;
		while (buffer.size() < capacity && audioStream16kHzMono.getNextSample(sample)) {
			buffer.push_back(floatSampleToInt16(sample));
		}

		// An empty chunk means the stream is exhausted; there is nothing left to analyze
		if (buffer.empty()) break;

		// Analyze buffer
		int searchedFrameCount = ps_process_raw(&recognizer, buffer.data(), buffer.size(), false, false);
		if (searchedFrameCount < 0) throw runtime_error("Error analyzing raw audio data.");
	}

	// End recognition
	error = ps_end_utt(&recognizer);
	if (error) throw runtime_error("Error ending utterance processing.");
}

// Collects the recognizer's segmentation into a map from start time to phone.
// Each segment's start frame is used directly as its time in centiseconds.
// A final Phone::None entry is added one frame past the end of the last
// segment. If the recognizer yields no segments at all, the result consists
// solely of a Phone::None entry at time 0.
map<centiseconds, Phone> getPhones(ps_decoder_t& recognizer) {
	map<centiseconds, Phone> result;
	int32 score;
	// Starts at -1 so that the trailing dummy entry is placed at 0 when the loop
	// below never runs. Leaving this uninitialized would be undefined behavior
	// in that case.
	int endFrame = -1;
	for (ps_seg_t* segmentationIter = ps_seg_iter(&recognizer, &score);
		segmentationIter;
		segmentationIter = ps_seg_next(segmentationIter))
	{
		// Get phone
		char const* phone = ps_seg_word(segmentationIter);

		// Get timing
		int startFrame;
		ps_seg_frames(segmentationIter, &startFrame, &endFrame);

		result[centiseconds(startFrame)] = stringToPhone(phone);
	}

	// Add dummy entry past the last phone
	result[centiseconds(endFrame + 1)] = Phone::None;
	return result;
}

// Runs PocketSphinx phone recognition on the given audio stream and returns
// the detected phones keyed by their start time.
// The Sphinx model files are looked up in "res/sphinx" next to the parent
// of the binary directory. Any exception raised along the way is rethrown
// nested inside a std::runtime_error.
map<centiseconds, Phone> detectPhones(unique_ptr<AudioStream> audioStream) {
	try {
		// Set up the PocketSphinx configuration
		const path sphinxModelDirectory(getBinDirectory().parent_path() / "res/sphinx");
		lambda_unique_ptr<cmd_ln_t> config = createConfig(sphinxModelDirectory);

		// Set up the phone recognizer
		lambda_unique_ptr<ps_decoder_t> recognizer = createPhoneRecognizer(*config);

		// Bring the audio into the format the recognizer is fed with
		audioStream = to16kHzMono(std::move(audioStream));

		// Run recognition over the whole stream
		processAudioStream(*audioStream, *recognizer);

		// Gather the recognized phones
		return getPhones(*recognizer);
	} catch (...) {
		std::throw_with_nested(runtime_error("Error detecting phones via Pocketsphinx."));
	}
}
Refactoring - Moved phone recognition code to phone_extraction.cpp - Introduced type centiseconds - Code reorganization 2015-11-18 19:59:03 +00:00			`#include <pocketsphinx.h>`
			`#include <iostream>`
			`#include <boost/filesystem.hpp>`
			`#include "phone_extraction.h"`
			`#include "audio_input/SampleRateConverter.h"`
			`#include "audio_input/ChannelDownmixer.h"`
			`#include "platform_tools.h"`
			`using std::runtime_error;`
			`using std::unique_ptr;`
			`using std::shared_ptr;`
			`using std::string;`
			`using std::map;`
			`using boost::filesystem::path;`

Improved error handling Plus some refactoring 2015-11-19 17:32:14 +00:00			`template<typename T>`
			`using lambda_unique_ptr = std::unique_ptr<T,std::function<void(T*)>>;`

Refactoring - Moved phone recognition code to phone_extraction.cpp - Introduced type centiseconds - Code reorganization 2015-11-18 19:59:03 +00:00			`unique_ptr<AudioStream> to16kHzMono(unique_ptr<AudioStream> stream) {`
			`// Downmix, if required`
			`if (stream->getChannelCount() != 1) {`
			`stream.reset(new ChannelDownmixer(std::move(stream)));`
			`}`

			`// Downsample, if required`
			`if (stream->getFrameRate() < 16000) {`
Improved error handling Plus some refactoring 2015-11-19 17:32:14 +00:00			`throw runtime_error("Audio sample rate must not be below 16kHz.");`
Refactoring - Moved phone recognition code to phone_extraction.cpp - Introduced type centiseconds - Code reorganization 2015-11-18 19:59:03 +00:00			`}`
			`if (stream->getFrameRate() != 16000) {`
			`stream.reset(new SampleRateConverter(std::move(stream), 16000));`
			`}`

			`return stream;`
			`}`

Improved error handling Plus some refactoring 2015-11-19 17:32:14 +00:00			`lambda_unique_ptr<cmd_ln_t> createConfig(path sphinxModelDirectory) {`
			`lambda_unique_ptr<cmd_ln_t> config(`
Refactoring - Moved phone recognition code to phone_extraction.cpp - Introduced type centiseconds - Code reorganization 2015-11-18 19:59:03 +00:00			`cmd_ln_init(`
			`nullptr, ps_args(), true,`
			`// Set acoustic model`
Improved error handling Plus some refactoring 2015-11-19 17:32:14 +00:00			`"-hmm", (sphinxModelDirectory / "acoustic_model").string().c_str(),`
Refactoring - Moved phone recognition code to phone_extraction.cpp - Introduced type centiseconds - Code reorganization 2015-11-18 19:59:03 +00:00			`// Set phonetic language model`
Improved error handling Plus some refactoring 2015-11-19 17:32:14 +00:00			`"-allphone", (sphinxModelDirectory / "en-us-phone.lm.bin").string().c_str(),`
Refactoring - Moved phone recognition code to phone_extraction.cpp - Introduced type centiseconds - Code reorganization 2015-11-18 19:59:03 +00:00			`"-allphone_ci", "yes",`
			`// The following settings are taken from http://cmusphinx.sourceforge.net/wiki/phonemerecognition`
			`// Set beam width applied to every frame in Viterbi search`
			`"-beam", "1e-20",`
			`// Set beam width applied to phone transitions`
			`"-pbeam", "1e-20",`
			`// Set language model probability weight`
			`"-lw", "2.0",`
			`nullptr),`
			`[](cmd_ln_t* config) { cmd_ln_free_r(config); });`
			`if (!config) throw runtime_error("Error creating configuration.");`

Improved error handling Plus some refactoring 2015-11-19 17:32:14 +00:00			`return config;`
			`}`

			`lambda_unique_ptr<ps_decoder_t> createPhoneRecognizer(cmd_ln_t& config) {`
			`lambda_unique_ptr<ps_decoder_t> recognizer(`
			`ps_init(&config),`
Refactoring - Moved phone recognition code to phone_extraction.cpp - Introduced type centiseconds - Code reorganization 2015-11-18 19:59:03 +00:00			`[](ps_decoder_t* recognizer) { ps_free(recognizer); });`
			`if (!recognizer) throw runtime_error("Error creating speech recognizer.");`

Improved error handling Plus some refactoring 2015-11-19 17:32:14 +00:00			`return recognizer;`
			`}`

			`// Converts a float in the range -1..1 to a signed 16-bit int`
			`int16_t floatSampleToInt16(float sample) {`
			`sample = std::max(sample, -1.0f);`
			`sample = std::min(sample, 1.0f);`
			`return static_cast<int16_t>(((sample + 1) / 2) * (INT16_MAX - INT16_MIN) + INT16_MIN);`
			`}`

			`void processAudioStream(AudioStream& audioStream16kHzMono, ps_decoder_t& recognizer) {`
Refactoring - Moved phone recognition code to phone_extraction.cpp - Introduced type centiseconds - Code reorganization 2015-11-18 19:59:03 +00:00			`// Start recognition`
Improved error handling Plus some refactoring 2015-11-19 17:32:14 +00:00			`int error = ps_start_utt(&recognizer);`
Refactoring - Moved phone recognition code to phone_extraction.cpp - Introduced type centiseconds - Code reorganization 2015-11-18 19:59:03 +00:00			`if (error) throw runtime_error("Error starting utterance processing.");`

			`// Process entire sound file`
			`std::vector<int16_t> buffer;`
			`const int capacity = 1600; // 0.1 second capacity`
			`buffer.reserve(capacity);`
			`int sampleCount = 0;`
			`do {`
			`// Read to buffer`
			`buffer.clear();`
			`while (buffer.size() < capacity) {`
			`float sample;`
Improved error handling Plus some refactoring 2015-11-19 17:32:14 +00:00			`if (!audioStream16kHzMono.getNextSample(sample)) break;`
Refactoring - Moved phone recognition code to phone_extraction.cpp - Introduced type centiseconds - Code reorganization 2015-11-18 19:59:03 +00:00			`buffer.push_back(floatSampleToInt16(sample));`
			`}`

			`// Analyze buffer`
Improved error handling Plus some refactoring 2015-11-19 17:32:14 +00:00			`int searchedFrameCount = ps_process_raw(&recognizer, buffer.data(), buffer.size(), false, false);`
			`if (searchedFrameCount < 0) throw runtime_error("Error analyzing raw audio data.");`
Refactoring - Moved phone recognition code to phone_extraction.cpp - Introduced type centiseconds - Code reorganization 2015-11-18 19:59:03 +00:00
			`sampleCount += buffer.size();`
			`} while (buffer.size());`
Improved error handling Plus some refactoring 2015-11-19 17:32:14 +00:00			`error = ps_end_utt(&recognizer);`
Refactoring - Moved phone recognition code to phone_extraction.cpp - Introduced type centiseconds - Code reorganization 2015-11-18 19:59:03 +00:00			`if (error) throw runtime_error("Error ending utterance processing.");`

Improved error handling Plus some refactoring 2015-11-19 17:32:14 +00:00			`}`

			`map<centiseconds, Phone> getPhones(ps_decoder_t& recognizer) {`
Refactoring - Moved phone recognition code to phone_extraction.cpp - Introduced type centiseconds - Code reorganization 2015-11-18 19:59:03 +00:00			`map<centiseconds, Phone> result;`
			`ps_seg_t *segmentationIter;`
			`int32 score;`
			`int endFrame;`
Improved error handling Plus some refactoring 2015-11-19 17:32:14 +00:00			`for (segmentationIter = ps_seg_iter(&recognizer, &score); segmentationIter; segmentationIter = ps_seg_next(segmentationIter)) {`
Refactoring - Moved phone recognition code to phone_extraction.cpp - Introduced type centiseconds - Code reorganization 2015-11-18 19:59:03 +00:00			`// Get phone`
			`char const *phone = ps_seg_word(segmentationIter);`

			`// Get timing`
			`int startFrame;`
			`ps_seg_frames(segmentationIter, &startFrame, &endFrame);`

			`result[centiseconds(startFrame)] = stringToPhone(phone);`
			`}`
			`// Add dummy entry past the last phone`
			`result[centiseconds(endFrame + 1)] = Phone::None;`
			`return result;`
Improved error handling Plus some refactoring 2015-11-19 17:32:14 +00:00			`};`

			`map<centiseconds, Phone> detectPhones(unique_ptr<AudioStream> audioStream) {`
			`try {`
			`// Create PocketSphinx configuration`
			`path sphinxModelDirectory(getBinDirectory().parent_path() / "res/sphinx");`
			`auto config = createConfig(sphinxModelDirectory);`

			`// Create phone recognizer`
			`auto recognizer = createPhoneRecognizer(*config.get());`

			`// Convert audio stream to the exact format PocketSphinx requires`
			`audioStream = to16kHzMono(std::move(audioStream));`

			`// Process data`
			`processAudioStream(audioStream.get(), recognizer.get());`

			`// Collect results into map`
			`return getPhones(*recognizer.get());`
			`} catch (...) {`
			`std::throw_with_nested(runtime_error("Error detecting phones via Pocketsphinx."));`
			`}`
Refactoring - Moved phone recognition code to phone_extraction.cpp - Introduced type centiseconds - Code reorganization 2015-11-18 19:59:03 +00:00			`}`