diff --git a/src/main.cpp b/src/main.cpp index 53871d6..daeb639 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -2,12 +2,24 @@ #include #include #include +#include +#include +#include +#include "audio_input/16kHzMonoStream.h" using std::runtime_error; using std::shared_ptr; +using std::unique_ptr; #define MODELDIR "X:/dev/projects/LipSync/lib/pocketsphinx/model" +// Converts a float in the range -1..1 to a signed 16-bit int +int16_t floatSampleToInt16(float sample) { + sample = std::max(sample, -1.0f); + sample = std::min(sample, 1.0f); + return static_cast(((sample + 1) / 2) * (INT16_MAX - INT16_MIN) + INT16_MIN); +} + int main(int argc, char *argv[]) { shared_ptr config( cmd_ln_init( @@ -33,23 +45,41 @@ int main(int argc, char *argv[]) { [](ps_decoder_t* recognizer) { ps_free(recognizer); }); if (!recognizer) throw runtime_error("Error creating speech recognizer."); - shared_ptr file( - fopen("X:/dev/projects/LipSync/lib/pocketsphinx/test/data/goforward.raw", "rb"), - [](FILE* file) { fclose(file); }); - if (!file) throw runtime_error("Error opening sound file."); + unique_ptr audioStream = + create16kHzMonoStream(R"(C:\Users\Daniel\Desktop\audio-test\test 16000Hz 1ch 16bit.wav)"); int error = ps_start_utt(recognizer.get()); if (error) throw runtime_error("Error starting utterance processing."); - int16 buffer[512]; - while (!feof(file.get())) { - size_t sampleCount = fread(buffer, 2, 512, file.get()); - int searchedFrameCount = ps_process_raw(recognizer.get(), buffer, sampleCount, false, false); + auto start = std::chrono::steady_clock::now(); + + std::vector buffer; + const int capacity = 1600; + buffer.reserve(capacity); // 0.1 second capacity + int sampleCount = 0; + do { + // Read to buffer + buffer.clear(); + while (buffer.size() < capacity) { + float sample; + if (!audioStream->getNextSample(sample)) break; + buffer.push_back(floatSampleToInt16(sample)); + } + + // Analyze buffer + int searchedFrameCount = ps_process_raw(recognizer.get(), buffer.data(), buffer.size(), false, false); if (searchedFrameCount < 0) throw runtime_error("Error decoding raw audio data."); - } + + sampleCount += buffer.size(); + + std::cout << sampleCount / 16000.0 << "s\n"; + } while (buffer.size()); error = ps_end_utt(recognizer.get()); if (error) throw runtime_error("Error ending utterance processing."); + auto end = std::chrono::steady_clock::now(); + std::cout << std::chrono::duration_cast>(end - start).count() << "\n"; + ps_seg_t *segmentationIter; int32 score; for (segmentationIter = ps_seg_iter(recognizer.get(), &score); segmentationIter; segmentationIter = ps_seg_next(segmentationIter)) {