Improved error handling

Plus some refactoring
This commit is contained in:
Daniel Wolf 2015-11-19 18:32:14 +01:00
parent f2f6f75932
commit 132adb1083
2 changed files with 90 additions and 36 deletions

View File

@ -2,16 +2,42 @@
#include "audio_input/WaveFileReader.h" #include "audio_input/WaveFileReader.h"
#include "phone_extraction.h" #include "phone_extraction.h"
int main(int argc, char *argv[]) { using std::exception;
// Create audio stream using std::string;
std::unique_ptr<AudioStream> audioStream( using std::unique_ptr;
new WaveFileReader(R"(C:\Users\Daniel\Desktop\audio-test\test 16000Hz 1ch 16bit.wav)"));
std::map<centiseconds, Phone> phones = detectPhones(std::move(audioStream)); string getMessage(const exception& e) {
string result(e.what());
try {
std::rethrow_if_nested(e);
} catch(const exception& innerException) {
result += "\n" + getMessage(innerException);
} catch(...) {}
for (auto& pair : phones) { return result;
std::cout << pair.first << ": " << phoneToString(pair.second) << "\n"; }
// Creates an audio stream by constructing a WaveFileReader for the given file name.
// Any exception thrown while constructing the reader is rethrown nested inside a
// std::runtime_error carrying the message "Could not open sound file.".
unique_ptr<AudioStream> createAudioStream(string fileName) {
try {
return unique_ptr<AudioStream>(new WaveFileReader(fileName));
} catch (...) {
// Keep the original exception attached as the nested cause
std::throw_with_nested(std::runtime_error("Could not open sound file.") );
} }
}
return 0; int main(int argc, char *argv[]) {
try {
unique_ptr<AudioStream> audioStream = createAudioStream(R"(C:\Users\Daniel\Desktop\audio-test\test 16000Hz 1ch 16bit.wav)");
std::map<centiseconds, Phone> phones = detectPhones(std::move(audioStream));
for (auto &pair : phones) {
std::cout << pair.first << ": " << phoneToString(pair.second) << "\n";
}
return 0;
} catch (const exception& e) {
std::cout << "An error occurred. " << getMessage(e);
return 1;
}
} }

View File

@ -12,6 +12,9 @@ using std::string;
using std::map; using std::map;
using boost::filesystem::path; using boost::filesystem::path;
// A std::unique_ptr whose deleter is a std::function<void(T*)>, so that a
// lambda can be supplied as the deleter when the pointer is constructed.
template<typename T>
using lambda_unique_ptr = std::unique_ptr<T,std::function<void(T*)>>;
unique_ptr<AudioStream> to16kHzMono(unique_ptr<AudioStream> stream) { unique_ptr<AudioStream> to16kHzMono(unique_ptr<AudioStream> stream) {
// Downmix, if required // Downmix, if required
if (stream->getChannelCount() != 1) { if (stream->getChannelCount() != 1) {
@ -20,7 +23,7 @@ unique_ptr<AudioStream> to16kHzMono(unique_ptr<AudioStream> stream) {
// Downsample, if required // Downsample, if required
if (stream->getFrameRate() < 16000) { if (stream->getFrameRate() < 16000) {
throw runtime_error("Sample rate must not be below 16kHz."); throw runtime_error("Audio sample rate must not be below 16kHz.");
} }
if (stream->getFrameRate() != 16000) { if (stream->getFrameRate() != 16000) {
stream.reset(new SampleRateConverter(std::move(stream), 16000)); stream.reset(new SampleRateConverter(std::move(stream), 16000));
@ -29,27 +32,14 @@ unique_ptr<AudioStream> to16kHzMono(unique_ptr<AudioStream> stream) {
return stream; return stream;
} }
// Converts a float in the range -1..1 to a signed 16-bit int lambda_unique_ptr<cmd_ln_t> createConfig(path sphinxModelDirectory) {
int16_t floatSampleToInt16(float sample) { lambda_unique_ptr<cmd_ln_t> config(
sample = std::max(sample, -1.0f);
sample = std::min(sample, 1.0f);
return static_cast<int16_t>(((sample + 1) / 2) * (INT16_MAX - INT16_MIN) + INT16_MIN);
}
map<centiseconds, Phone> detectPhones(unique_ptr<AudioStream> audioStream) {
// Convert audio stream to the exact format PocketSphinx requires
audioStream = to16kHzMono(std::move(audioStream));
// Create PocketSphinx configuration
path binDirectory(getBinDirectory());
path resDirectory(binDirectory.parent_path() / "res");
shared_ptr<cmd_ln_t> config(
cmd_ln_init( cmd_ln_init(
nullptr, ps_args(), true, nullptr, ps_args(), true,
// Set acoustic model // Set acoustic model
"-hmm", (resDirectory / "sphinx/acoustic_model").string().c_str(), "-hmm", (sphinxModelDirectory / "acoustic_model").string().c_str(),
// Set phonetic language model // Set phonetic language model
"-allphone", (resDirectory / "sphinx/en-us-phone.lm.bin").string().c_str(), "-allphone", (sphinxModelDirectory / "en-us-phone.lm.bin").string().c_str(),
"-allphone_ci", "yes", "-allphone_ci", "yes",
// The following settings are taken from http://cmusphinx.sourceforge.net/wiki/phonemerecognition // The following settings are taken from http://cmusphinx.sourceforge.net/wiki/phonemerecognition
// Set beam width applied to every frame in Viterbi search // Set beam width applied to every frame in Viterbi search
@ -62,14 +52,28 @@ map<centiseconds, Phone> detectPhones(unique_ptr<AudioStream> audioStream) {
[](cmd_ln_t* config) { cmd_ln_free_r(config); }); [](cmd_ln_t* config) { cmd_ln_free_r(config); });
if (!config) throw runtime_error("Error creating configuration."); if (!config) throw runtime_error("Error creating configuration.");
// Create phone recognizer return config;
shared_ptr<ps_decoder_t> recognizer( }
ps_init(config.get()),
// Creates a ps_decoder_t from the given configuration by calling ps_init.
// The returned pointer frees the decoder with ps_free when it is destroyed.
// Throws runtime_error if ps_init yields a null pointer.
lambda_unique_ptr<ps_decoder_t> createPhoneRecognizer(cmd_ln_t& config) {
lambda_unique_ptr<ps_decoder_t> recognizer(
ps_init(&config),
[](ps_decoder_t* recognizer) { ps_free(recognizer); }); [](ps_decoder_t* recognizer) { ps_free(recognizer); });
if (!recognizer) throw runtime_error("Error creating speech recognizer."); if (!recognizer) throw runtime_error("Error creating speech recognizer.");
return recognizer;
}
// Converts a float sample to a signed 16-bit int.
// The input is first clamped to the range -1..1 and then mapped linearly so
// that -1 becomes INT16_MIN and 1 becomes INT16_MAX; the cast truncates any
// fractional part toward zero.
// NaN is mapped to 0. It would otherwise pass through std::max/std::min
// unchanged, and converting NaN to an integer type is undefined behavior.
int16_t floatSampleToInt16(float sample) {
	// NaN is the only value that compares unequal to itself
	if (sample != sample) return 0;
	sample = std::max(sample, -1.0f);
	sample = std::min(sample, 1.0f);
	return static_cast<int16_t>(((sample + 1) / 2) * (INT16_MAX - INT16_MIN) + INT16_MIN);
}
void processAudioStream(AudioStream& audioStream16kHzMono, ps_decoder_t& recognizer) {
// Start recognition // Start recognition
int error = ps_start_utt(recognizer.get()); int error = ps_start_utt(&recognizer);
if (error) throw runtime_error("Error starting utterance processing."); if (error) throw runtime_error("Error starting utterance processing.");
// Process entire sound file // Process entire sound file
@ -82,25 +86,27 @@ map<centiseconds, Phone> detectPhones(unique_ptr<AudioStream> audioStream) {
buffer.clear(); buffer.clear();
while (buffer.size() < capacity) { while (buffer.size() < capacity) {
float sample; float sample;
if (!audioStream->getNextSample(sample)) break; if (!audioStream16kHzMono.getNextSample(sample)) break;
buffer.push_back(floatSampleToInt16(sample)); buffer.push_back(floatSampleToInt16(sample));
} }
// Analyze buffer // Analyze buffer
int searchedFrameCount = ps_process_raw(recognizer.get(), buffer.data(), buffer.size(), false, false); int searchedFrameCount = ps_process_raw(&recognizer, buffer.data(), buffer.size(), false, false);
if (searchedFrameCount < 0) throw runtime_error("Error decoding raw audio data."); if (searchedFrameCount < 0) throw runtime_error("Error analyzing raw audio data.");
sampleCount += buffer.size(); sampleCount += buffer.size();
} while (buffer.size()); } while (buffer.size());
error = ps_end_utt(recognizer.get()); error = ps_end_utt(&recognizer);
if (error) throw runtime_error("Error ending utterance processing."); if (error) throw runtime_error("Error ending utterance processing.");
// Collect results into map }
map<centiseconds, Phone> getPhones(ps_decoder_t& recognizer) {
map<centiseconds, Phone> result; map<centiseconds, Phone> result;
ps_seg_t *segmentationIter; ps_seg_t *segmentationIter;
int32 score; int32 score;
int endFrame; int endFrame;
for (segmentationIter = ps_seg_iter(recognizer.get(), &score); segmentationIter; segmentationIter = ps_seg_next(segmentationIter)) { for (segmentationIter = ps_seg_iter(&recognizer, &score); segmentationIter; segmentationIter = ps_seg_next(segmentationIter)) {
// Get phone // Get phone
char const *phone = ps_seg_word(segmentationIter); char const *phone = ps_seg_word(segmentationIter);
@ -113,4 +119,26 @@ map<centiseconds, Phone> detectPhones(unique_ptr<AudioStream> audioStream) {
// Add dummy entry past the last phone // Add dummy entry past the last phone
result[centiseconds(endFrame + 1)] = Phone::None; result[centiseconds(endFrame + 1)] = Phone::None;
return result; return result;
};
// Detects the phones in the given audio stream.
// Builds a configuration and a recognizer from the "res/sphinx" directory next
// to the parent of the bin directory, converts the stream with to16kHzMono,
// feeds it through processAudioStream and returns the map built by getPhones.
// Any exception thrown along the way is rethrown nested inside a
// runtime_error carrying the message "Error detecting phones via Pocketsphinx.".
map<centiseconds, Phone> detectPhones(unique_ptr<AudioStream> audioStream) {
try {
// Create PocketSphinx configuration
path sphinxModelDirectory(getBinDirectory().parent_path() / "res/sphinx");
auto config = createConfig(sphinxModelDirectory);
// Create phone recognizer
auto recognizer = createPhoneRecognizer(*config.get());
// Convert audio stream to the exact format PocketSphinx requires
audioStream = to16kHzMono(std::move(audioStream));
// Process data
processAudioStream(*audioStream.get(), *recognizer.get());
// Collect results into map
return getPhones(*recognizer.get());
} catch (...) {
// Keep the original exception attached as the nested cause
std::throw_with_nested(runtime_error("Error detecting phones via Pocketsphinx."));
}
} }