// rhubarb-lip-sync/src/main.cpp
#include <pocketsphinx.h>

#include <algorithm>
#include <chrono>
#include <cstdint>
#include <fstream>
#include <iostream>
#include <memory>
#include <stdexcept>
#include <vector>

#include "audio_input/16kHzMonoStream.h"

using std::runtime_error;
using std::shared_ptr;
using std::unique_ptr;

#define MODELDIR "X:/dev/projects/LipSync/lib/pocketsphinx-5prealpha-2015-08-05/model"
// Converts a float in the range -1..1 to a signed 16-bit int
int16_t floatSampleToInt16(float sample) {
	// Clamp out-of-range input to [-1, 1] before scaling.
	const float clamped = std::min(1.0f, std::max(-1.0f, sample));
	// Map [-1, 1] linearly onto [INT16_MIN, INT16_MAX]; the cast truncates toward zero.
	const float scaled = ((clamped + 1) / 2) * (INT16_MAX - INT16_MIN) + INT16_MIN;
	return static_cast<int16_t>(scaled);
}
int main(int argc, char *argv[]) {
shared_ptr<cmd_ln_t> config(
cmd_ln_init(
nullptr, ps_args(), true,
// Set acoustic model
"-hmm", MODELDIR "/en-us/en-us",
// Set phonetic language model
"-allphone", MODELDIR "/en-us/en-us-phone.lm.bin",
2015-10-15 19:07:11 +00:00
"-allphone_ci", "yes",
2015-09-10 19:31:25 +00:00
// The following settings are Voodoo to me.
// I copied them from http://cmusphinx.sourceforge.net/wiki/phonemerecognition
// Set beam width applied to every frame in Viterbi search
"-beam", "1e-20",
// Set beam width applied to phone transitions
"-pbeam", "1e-20",
// Set language model probability weight
"-lw", "2.0",
nullptr),
[](cmd_ln_t* config) { cmd_ln_free_r(config); });
if (!config) throw runtime_error("Error creating configuration.");
shared_ptr<ps_decoder_t> recognizer(
ps_init(config.get()),
[](ps_decoder_t* recognizer) { ps_free(recognizer); });
if (!recognizer) throw runtime_error("Error creating speech recognizer.");
2015-09-28 19:19:39 +00:00
unique_ptr<AudioStream> audioStream =
create16kHzMonoStream(R"(C:\Users\Daniel\Desktop\audio-test\test 16000Hz 1ch 16bit.wav)");
2015-09-10 19:31:25 +00:00
int error = ps_start_utt(recognizer.get());
if (error) throw runtime_error("Error starting utterance processing.");
2015-09-28 19:19:39 +00:00
auto start = std::chrono::steady_clock::now();
std::vector<int16_t> buffer;
2015-10-31 19:41:17 +00:00
const int capacity = 1600; // 0.1 second capacity
buffer.reserve(capacity);
2015-09-28 19:19:39 +00:00
int sampleCount = 0;
do {
// Read to buffer
buffer.clear();
while (buffer.size() < capacity) {
float sample;
if (!audioStream->getNextSample(sample)) break;
buffer.push_back(floatSampleToInt16(sample));
}
// Analyze buffer
int searchedFrameCount = ps_process_raw(recognizer.get(), buffer.data(), buffer.size(), false, false);
2015-09-10 19:31:25 +00:00
if (searchedFrameCount < 0) throw runtime_error("Error decoding raw audio data.");
2015-09-28 19:19:39 +00:00
sampleCount += buffer.size();
std::cout << sampleCount / 16000.0 << "s\n";
} while (buffer.size());
2015-09-10 19:31:25 +00:00
error = ps_end_utt(recognizer.get());
if (error) throw runtime_error("Error ending utterance processing.");
2015-09-28 19:19:39 +00:00
auto end = std::chrono::steady_clock::now();
std::cout << std::chrono::duration_cast<std::chrono::duration<double>>(end - start).count() << "\n";
2015-09-10 19:31:25 +00:00
ps_seg_t *segmentationIter;
int32 score;
for (segmentationIter = ps_seg_iter(recognizer.get(), &score); segmentationIter; segmentationIter = ps_seg_next(segmentationIter)) {
// Get phoneme
char const *phoneme = ps_seg_word(segmentationIter);
// Get timing
int startFrame, endFrame;
ps_seg_frames(segmentationIter, &startFrame, &endFrame);
printf(">>> %-5s %-5d %-5d\n", phoneme, startFrame, endFrame);
}
return 0;
2015-09-10 18:05:05 +00:00
}