Refactoring
- Moved phone recognition code to phone_extraction.cpp - Introduced type centiseconds - Code reorganization
This commit is contained in:
parent
9fbae36e70
commit
f2f6f75932
|
@ -126,19 +126,25 @@
|
|||
<sourceFolder url="file://$MODULE_DIR$/lib/sphinxbase-5prealpha-2015-08-05/src/libsphinxbase/util/slamch.c" isTestSource="false" />
|
||||
<sourceFolder url="file://$MODULE_DIR$/lib/sphinxbase-5prealpha-2015-08-05/src/libsphinxbase/util/slapack_lite.c" isTestSource="false" />
|
||||
<sourceFolder url="file://$MODULE_DIR$/lib/sphinxbase-5prealpha-2015-08-05/src/libsphinxbase/util/strfuncs.c" isTestSource="false" />
|
||||
<sourceFolder url="file://$MODULE_DIR$/src/audio_input/16kHzMonoStream.cpp" isTestSource="false" />
|
||||
<sourceFolder url="file://$MODULE_DIR$/src/audio_input/16kHzMonoStream.h" isTestSource="false" />
|
||||
<sourceFolder url="file://$MODULE_DIR$/src/audio_input/AudioStream.h" isTestSource="false" />
|
||||
<sourceFolder url="file://$MODULE_DIR$/src/audio_input/ChannelDownmixer.cpp" isTestSource="false" />
|
||||
<sourceFolder url="file://$MODULE_DIR$/src/audio_input/ChannelDownmixer.h" isTestSource="false" />
|
||||
<sourceFolder url="file://$MODULE_DIR$/src/audio_input/IOTools.h" isTestSource="false" />
|
||||
<sourceFolder url="file://$MODULE_DIR$/src/audio_input/io_tools.h" isTestSource="false" />
|
||||
<sourceFolder url="file://$MODULE_DIR$/src/audio_input/SampleRateConverter.cpp" isTestSource="false" />
|
||||
<sourceFolder url="file://$MODULE_DIR$/src/audio_input/SampleRateConverter.h" isTestSource="false" />
|
||||
<sourceFolder url="file://$MODULE_DIR$/src/audio_input/wave_file_writing.cpp" isTestSource="false" />
|
||||
<sourceFolder url="file://$MODULE_DIR$/src/audio_input/wave_file_writing.h" isTestSource="false" />
|
||||
<sourceFolder url="file://$MODULE_DIR$/src/audio_input/WaveFileReader.cpp" isTestSource="false" />
|
||||
<sourceFolder url="file://$MODULE_DIR$/src/audio_input/WaveFileReader.h" isTestSource="false" />
|
||||
<sourceFolder url="file://$MODULE_DIR$/src/audio_input/WaveFileWriter.cpp" isTestSource="false" />
|
||||
<sourceFolder url="file://$MODULE_DIR$/src/audio_input/WaveFileWriter.h" isTestSource="false" />
|
||||
<sourceFolder url="file://$MODULE_DIR$/src/centiseconds.cpp" isTestSource="false" />
|
||||
<sourceFolder url="file://$MODULE_DIR$/src/centiseconds.h" isTestSource="false" />
|
||||
<sourceFolder url="file://$MODULE_DIR$/src/main.cpp" isTestSource="false" />
|
||||
<sourceFolder url="file://$MODULE_DIR$/src/Phone.cpp" isTestSource="false" />
|
||||
<sourceFolder url="file://$MODULE_DIR$/src/Phone.h" isTestSource="false" />
|
||||
<sourceFolder url="file://$MODULE_DIR$/src/phone_extraction.cpp" isTestSource="false" />
|
||||
<sourceFolder url="file://$MODULE_DIR$/src/phone_extraction.h" isTestSource="false" />
|
||||
<sourceFolder url="file://$MODULE_DIR$/src/platform_tools.h" isTestSource="false" />
|
||||
<sourceFolder url="file://$MODULE_DIR$/src/platform_tools_win.cpp" isTestSource="false" />
|
||||
</content>
|
||||
<orderEntry type="inheritedJdk" />
|
||||
<orderEntry type="sourceFolder" forTests="false" />
|
||||
|
@ -169,8 +175,8 @@
|
|||
<excluded>
|
||||
<root url="file://$MODULE_DIR$/lib/cppformat/format.cc" />
|
||||
<root url="file://$MODULE_DIR$/lib/cppformat/posix.cc" />
|
||||
<root url="file://$MODULE_DIR$/lib/cppformat/posix.h" />
|
||||
<root url="file://$MODULE_DIR$/lib/cppformat/format.h" />
|
||||
<root url="file://$MODULE_DIR$/lib/cppformat/posix.h" />
|
||||
</excluded>
|
||||
</library>
|
||||
</orderEntry>
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
cmake_minimum_required(VERSION 3.3)
|
||||
project(LipSync)
|
||||
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wall")
|
||||
|
||||
set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib")
|
||||
set(CMAKE_LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib")
|
||||
|
@ -11,10 +11,15 @@ set(CMAKE_RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/bin")
|
|||
set(Boost_USE_STATIC_LIBS ON) # Use static libs
|
||||
set(Boost_USE_MULTITHREADED ON) # Enable multithreading support
|
||||
set(Boost_USE_STATIC_RUNTIME ON) # Use static C++ runtime
|
||||
find_package(Boost REQUIRED COMPONENTS filesystem locale )
|
||||
find_package(Boost REQUIRED COMPONENTS filesystem locale system)
|
||||
include_directories(${Boost_INCLUDE_DIRS})
|
||||
|
||||
set(SOURCE_FILES src/main.cpp src/audio_input/WaveFileReader.cpp src/audio_input/WaveFileReader.h src/audio_input/ChannelDownmixer.cpp src/audio_input/ChannelDownmixer.h src/audio_input/AudioStream.h src/audio_input/SampleRateConverter.cpp src/audio_input/SampleRateConverter.h src/audio_input/16kHzMonoStream.cpp src/audio_input/16kHzMonoStream.h src/audio_input/WaveFileWriter.cpp src/audio_input/WaveFileWriter.h src/audio_input/IOTools.h)
|
||||
set(SOURCE_FILES src/main.cpp src/audio_input/WaveFileReader.cpp src/audio_input/WaveFileReader.h src/audio_input/ChannelDownmixer.cpp src/audio_input/ChannelDownmixer.h src/audio_input/AudioStream.h src/audio_input/SampleRateConverter.cpp src/audio_input/SampleRateConverter.h src/audio_input/wave_file_writing.cpp src/audio_input/wave_file_writing.h src/audio_input/io_tools.h src/platform_tools.h src/phone_extraction.cpp src/phone_extraction.h src/Phone.cpp src/Phone.h src/centiseconds.cpp src/centiseconds.h)
|
||||
if(WIN32)
|
||||
set(SOURCE_FILES "${SOURCE_FILES};src/platform_tools_win.cpp")
|
||||
else()
|
||||
message(FATAL_ERROR "Target platform not supported.")
|
||||
endif()
|
||||
|
||||
include_directories("lib/sphinxbase-5prealpha-2015-08-05/include" "lib/pocketsphinx-5prealpha-2015-08-05/include" "lib/cppformat")
|
||||
FILE(GLOB_RECURSE SPHINX_BASE "lib/sphinxbase-5prealpha-2015-08-05/src/libsphinxbase/*.c")
|
||||
|
@ -44,5 +49,5 @@ endfunction()
|
|||
|
||||
# Copy resource files
|
||||
set(modelDir "${CMAKE_SOURCE_DIR}/lib/pocketsphinx-5prealpha-2015-08-05/model")
|
||||
copy_after_build("${modelDir}/en-us/en-us-phone.lm.bin" "res/sphinx/acoustic_model")
|
||||
copy_after_build("${modelDir}/en-us/en-us/*" "res/sphinx/language_model")
|
||||
copy_after_build("${modelDir}/en-us/en-us-phone.lm.bin" "res/sphinx")
|
||||
copy_after_build("${modelDir}/en-us/en-us/*" "res/sphinx/acoustic_model")
|
||||
|
|
|
@ -0,0 +1,36 @@
|
|||
#include <boost/bimap.hpp>
|
||||
#include "Phone.h"
|
||||
|
||||
using std::string;
|
||||
|
||||
template <typename L, typename R>
|
||||
boost::bimap<L, R>
|
||||
makeBimap(std::initializer_list<typename boost::bimap<L, R>::value_type> list) {
|
||||
return boost::bimap<L, R>(list.begin(), list.end());
|
||||
}
|
||||
|
||||
// Bidirectional lookup table between Arpabet phone names and Phone values.
// It is an implementation detail of the conversion functions below and is
// never modified after construction, so it gets internal linkage and is const.
static const boost::bimap<string, Phone> phonesByName = makeBimap<string, Phone>({
    { "None", Phone::None },
    { "Unknown", Phone::Unknown },
    { "AO", Phone::AO }, { "AA", Phone::AA }, { "IY", Phone::IY }, { "UW", Phone::UW },
    { "EH", Phone::EH }, { "IH", Phone::IH }, { "UH", Phone::UH }, { "AH", Phone::AH },
    { "AE", Phone::AE }, { "EY", Phone::EY }, { "AY", Phone::AY }, { "OW", Phone::OW },
    { "AW", Phone::AW }, { "OY", Phone::OY }, { "ER", Phone::ER }, { "P", Phone::P },
    { "B", Phone::B }, { "T", Phone::T }, { "D", Phone::D }, { "K", Phone::K },
    { "G", Phone::G }, { "CH", Phone::CH }, { "JH", Phone::JH }, { "F", Phone::F },
    { "V", Phone::V }, { "TH", Phone::TH }, { "DH", Phone::DH }, { "S", Phone::S },
    { "Z", Phone::Z }, { "SH", Phone::SH }, { "ZH", Phone::ZH }, { "HH", Phone::HH },
    { "M", Phone::M }, { "N", Phone::N }, { "NG", Phone::NG }, { "L", Phone::L },
    { "R", Phone::R }, { "Y", Phone::Y }, { "W", Phone::W },
});
|
||||
|
||||
// Looks up the Phone value registered under the given name.
// Returns Phone::Unknown if the name has no entry in phonesByName.
Phone stringToPhone(const string& s) {
    const auto match = phonesByName.left.find(s);
    if (match == phonesByName.left.end()) {
        return Phone::Unknown;
    }
    return match->second;
}
|
||||
|
||||
// Looks up the name registered for the given Phone value.
// A value without an entry in phonesByName is reported under the name of
// Phone::Unknown.
string phoneToString(Phone phone) {
    const auto match = phonesByName.right.find(phone);
    if (match != phonesByName.right.end()) {
        return match->second;
    }
    return phoneToString(Phone::Unknown);
}
|
||||
|
|
@ -0,0 +1,78 @@
|
|||
#ifndef LIPSYNC_PHONE_H
|
||||
#define LIPSYNC_PHONE_H
|
||||
|
||||
// Defines a subset of the Arpabet, plus the two non-phone values None and Unknown.
// The comment next to each phone gives its IPA symbol(s) and example words, with the
// letters that produce the sound enclosed in square brackets.
|
||||
enum class Phone {
|
||||
None, // No phone; detectPhones stores it as the dummy entry past the last phone
|
||||
Unknown, // Fallback used by stringToPhone/phoneToString when no table entry matches
|
||||
|
||||
/////////
|
||||
// Vowels
|
||||
|
||||
// ... monophthongs
|
||||
AO, // [ɔ] as in [o]ff, f[a]ll, fr[o]st
|
||||
AA, // [ɑ] as in f[a]ther
|
||||
IY, // [i] as in b[ee], sh[e]
|
||||
UW, // [u] as in y[ou], n[ew], f[oo]d
|
||||
EH, // [ɛ] as in r[e]d, m[e]n
|
||||
IH, // [ɪ] as in b[i]g, w[i]n
|
||||
UH, // [ʊ] as in sh[ou]ld, c[ou]ld
|
||||
AH, // [ʌ, ə] as in b[u]t, s[u]n, [a]lone, disc[u]s
|
||||
AE, // [æ] as in [a]t, b[a]t
|
||||
|
||||
// ... diphthongs
|
||||
EY, // [eɪ] as in s[ay], [ei]ght
|
||||
AY, // [aɪ] as in m[y], wh[y], r[i]de
|
||||
OW, // [oʊ] as in sh[ow], c[oa]t
|
||||
AW, // [aʊ] as in h[ow], n[ow]
|
||||
OY, // [ɔɪ] as in b[oy], t[oy]
|
||||
|
||||
// ... r-colored
|
||||
ER, // [ɝ] as in h[er], b[ir]d, h[ur]t
|
||||
|
||||
/////////////
|
||||
// Consonants
|
||||
|
||||
// ... stops
|
||||
P, // [p] as in [p]ay
|
||||
B, // [b] as in [b]uy
|
||||
T, // [t] as in [t]ake
|
||||
D, // [d] as in [d]ay
|
||||
K, // [k] as in [k]ey
|
||||
G, // [g] as in [g]o
|
||||
|
||||
// ... affricates
|
||||
CH, // [tʃ] as in [ch]air
|
||||
JH, // [dʒ] as in [j]ust
|
||||
|
||||
// ... fricatives
|
||||
F, // [f] as in [f]or
|
||||
V, // [v] as in [v]ery
|
||||
TH, // [θ] as in [th]anks
|
||||
DH, // [ð] as in [th]at
|
||||
S, // [s] as in [s]ay
|
||||
Z, // [z] as in [z]oo
|
||||
SH, // [ʃ] as in [sh]ow
|
||||
ZH, // [ʒ] as in mea[s]ure, plea[s]ure
|
||||
HH, // [h] as in [h]ouse
|
||||
|
||||
// ... nasals
|
||||
M, // [m] as in [m]an
|
||||
N, // [n] as in [n]o
|
||||
NG, // [ŋ] as in si[ng]
|
||||
|
||||
// ... liquids
|
||||
L, // [ɫ] as in [l]ate
|
||||
R, // [r, ɹ] as in [r]un
|
||||
|
||||
// ... semivowels
|
||||
Y, // [j] as in [y]es
|
||||
W // [w] as in [w]ay
|
||||
};
|
||||
|
||||
Phone stringToPhone(const std::string& s);
|
||||
|
||||
std::string phoneToString(Phone phone);
|
||||
|
||||
|
||||
#endif //LIPSYNC_PHONE_H
|
|
@ -1,26 +0,0 @@
|
|||
#include "16kHzMonoStream.h"
|
||||
#include "WaveFileReader.h"
|
||||
#include "ChannelDownmixer.h"
|
||||
#include "SampleRateConverter.h"
|
||||
|
||||
using std::runtime_error;
|
||||
|
||||
std::unique_ptr<AudioStream> create16kHzMonoStream(std::string fileName) {
|
||||
// Create audio stream
|
||||
std::unique_ptr<AudioStream> stream(new WaveFileReader(fileName));
|
||||
|
||||
// Downmix, if required
|
||||
if (stream->getChannelCount() != 1) {
|
||||
stream.reset(new ChannelDownmixer(std::move(stream)));
|
||||
}
|
||||
|
||||
// Downsample, if required
|
||||
if (stream->getFrameRate() < 16000) {
|
||||
throw runtime_error("Sample rate must not be below 16kHz.");
|
||||
}
|
||||
if (stream->getFrameRate() != 16000) {
|
||||
stream.reset(new SampleRateConverter(std::move(stream), 16000));
|
||||
}
|
||||
|
||||
return stream;
|
||||
}
|
|
@ -1,10 +0,0 @@
|
|||
#ifndef LIPSYNC_WAVEFILEREADER16KHZMONO_H
|
||||
#define LIPSYNC_WAVEFILEREADER16KHZMONO_H
|
||||
|
||||
#include "AudioStream.h"
|
||||
#include <memory>
|
||||
#include <string>
|
||||
|
||||
std::unique_ptr<AudioStream> create16kHzMonoStream(std::string fileName);
|
||||
|
||||
#endif //LIPSYNC_WAVEFILEREADER16KHZMONO_H
|
|
@ -1,6 +1,6 @@
|
|||
#include <format.h>
|
||||
#include "WaveFileReader.h"
|
||||
#include "IOTools.h"
|
||||
#include "io_tools.h"
|
||||
|
||||
using std::runtime_error;
|
||||
using fmt::format;
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
#include <fstream>
|
||||
#include "WaveFileWriter.h"
|
||||
#include "IOTools.h"
|
||||
#include "wave_file_writing.h"
|
||||
#include "io_tools.h"
|
||||
|
||||
using namespace little_endian;
|
||||
|
|
@ -0,0 +1,9 @@
|
|||
#include <ratio>
#include <chrono>
#include <ostream>
// The header is named centiseconds.h (all lower case). The include must match the
// file name exactly, or the build fails on case-sensitive file systems.
#include "centiseconds.h"

// Writes a centiseconds value to the stream as its tick count immediately followed
// by the suffix "cs" and returns the stream to allow chaining.
std::ostream& operator <<(std::ostream& stream, const centiseconds cs) {
    return stream << cs.count() << "cs";
}
|
||||
|
|
@ -0,0 +1,8 @@
|
|||
#ifndef LIPSYNC_CENTISECONDS_H
#define LIPSYNC_CENTISECONDS_H

// Include everything this header uses, so that it compiles regardless of what the
// including file happened to include before it.
#include <chrono>
#include <iosfwd>
#include <ratio>

// Duration type that counts whole hundredths of a second in an int.
typedef std::chrono::duration<int, std::centi> centiseconds;

// Stream insertion operator for centiseconds values (implemented out of line).
std::ostream& operator <<(std::ostream& stream, const centiseconds cs);

#endif //LIPSYNC_CENTISECONDS_H
|
97
src/main.cpp
97
src/main.cpp
|
@ -1,97 +1,16 @@
|
|||
#include <pocketsphinx.h>
|
||||
#include <stdexcept>
|
||||
#include <fstream>
|
||||
#include <memory>
|
||||
#include <vector>
|
||||
#include <iostream>
|
||||
#include <chrono>
|
||||
#include "audio_input/16kHzMonoStream.h"
|
||||
|
||||
using std::runtime_error;
|
||||
using std::shared_ptr;
|
||||
using std::unique_ptr;
|
||||
|
||||
#define MODELDIR "X:/dev/projects/LipSync/lib/pocketsphinx-5prealpha-2015-08-05/model"
|
||||
|
||||
// Converts a float in the range -1..1 to a signed 16-bit int
|
||||
int16_t floatSampleToInt16(float sample) {
|
||||
sample = std::max(sample, -1.0f);
|
||||
sample = std::min(sample, 1.0f);
|
||||
return static_cast<int16_t>(((sample + 1) / 2) * (INT16_MAX - INT16_MIN) + INT16_MIN);
|
||||
}
|
||||
#include "audio_input/WaveFileReader.h"
|
||||
#include "phone_extraction.h"
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
shared_ptr<cmd_ln_t> config(
|
||||
cmd_ln_init(
|
||||
nullptr, ps_args(), true,
|
||||
// Set acoustic model
|
||||
"-hmm", MODELDIR "/en-us/en-us",
|
||||
// Set phonetic language model
|
||||
"-allphone", MODELDIR "/en-us/en-us-phone.lm.bin",
|
||||
"-allphone_ci", "yes",
|
||||
// The following settings are Voodoo to me.
|
||||
// I copied them from http://cmusphinx.sourceforge.net/wiki/phonemerecognition
|
||||
// Set beam width applied to every frame in Viterbi search
|
||||
"-beam", "1e-20",
|
||||
// Set beam width applied to phone transitions
|
||||
"-pbeam", "1e-20",
|
||||
// Set language model probability weight
|
||||
"-lw", "2.0",
|
||||
nullptr),
|
||||
[](cmd_ln_t* config) { cmd_ln_free_r(config); });
|
||||
if (!config) throw runtime_error("Error creating configuration.");
|
||||
// Create audio stream
|
||||
std::unique_ptr<AudioStream> audioStream(
|
||||
new WaveFileReader(R"(C:\Users\Daniel\Desktop\audio-test\test 16000Hz 1ch 16bit.wav)"));
|
||||
|
||||
shared_ptr<ps_decoder_t> recognizer(
|
||||
ps_init(config.get()),
|
||||
[](ps_decoder_t* recognizer) { ps_free(recognizer); });
|
||||
if (!recognizer) throw runtime_error("Error creating speech recognizer.");
|
||||
std::map<centiseconds, Phone> phones = detectPhones(std::move(audioStream));
|
||||
|
||||
unique_ptr<AudioStream> audioStream =
|
||||
create16kHzMonoStream(R"(C:\Users\Daniel\Desktop\audio-test\test 16000Hz 1ch 16bit.wav)");
|
||||
|
||||
int error = ps_start_utt(recognizer.get());
|
||||
if (error) throw runtime_error("Error starting utterance processing.");
|
||||
|
||||
auto start = std::chrono::steady_clock::now();
|
||||
|
||||
std::vector<int16_t> buffer;
|
||||
const int capacity = 1600; // 0.1 second capacity
|
||||
buffer.reserve(capacity);
|
||||
int sampleCount = 0;
|
||||
do {
|
||||
// Read to buffer
|
||||
buffer.clear();
|
||||
while (buffer.size() < capacity) {
|
||||
float sample;
|
||||
if (!audioStream->getNextSample(sample)) break;
|
||||
buffer.push_back(floatSampleToInt16(sample));
|
||||
}
|
||||
|
||||
// Analyze buffer
|
||||
int searchedFrameCount = ps_process_raw(recognizer.get(), buffer.data(), buffer.size(), false, false);
|
||||
if (searchedFrameCount < 0) throw runtime_error("Error decoding raw audio data.");
|
||||
|
||||
sampleCount += buffer.size();
|
||||
|
||||
std::cout << sampleCount / 16000.0 << "s\n";
|
||||
} while (buffer.size());
|
||||
error = ps_end_utt(recognizer.get());
|
||||
if (error) throw runtime_error("Error ending utterance processing.");
|
||||
|
||||
auto end = std::chrono::steady_clock::now();
|
||||
std::cout << std::chrono::duration_cast<std::chrono::duration<double>>(end - start).count() << "\n";
|
||||
|
||||
ps_seg_t *segmentationIter;
|
||||
int32 score;
|
||||
for (segmentationIter = ps_seg_iter(recognizer.get(), &score); segmentationIter; segmentationIter = ps_seg_next(segmentationIter)) {
|
||||
// Get phoneme
|
||||
char const *phoneme = ps_seg_word(segmentationIter);
|
||||
|
||||
// Get timing
|
||||
int startFrame, endFrame;
|
||||
ps_seg_frames(segmentationIter, &startFrame, &endFrame);
|
||||
|
||||
printf(">>> %-5s %-5d %-5d\n", phoneme, startFrame, endFrame);
|
||||
for (auto& pair : phones) {
|
||||
std::cout << pair.first << ": " << phoneToString(pair.second) << "\n";
|
||||
}
|
||||
|
||||
return 0;
|
||||
|
|
|
@ -0,0 +1,116 @@
|
|||
#include <pocketsphinx.h>
|
||||
#include <iostream>
|
||||
#include <boost/filesystem.hpp>
|
||||
#include "phone_extraction.h"
|
||||
#include "audio_input/SampleRateConverter.h"
|
||||
#include "audio_input/ChannelDownmixer.h"
|
||||
#include "platform_tools.h"
|
||||
using std::runtime_error;
|
||||
using std::unique_ptr;
|
||||
using std::shared_ptr;
|
||||
using std::string;
|
||||
using std::map;
|
||||
using boost::filesystem::path;
|
||||
|
||||
// Takes ownership of an audio stream and wraps it as needed so that the returned
// stream reports one channel and a frame rate of 16000.
// Throws std::runtime_error if the stream's frame rate is below 16000.
unique_ptr<AudioStream> to16kHzMono(unique_ptr<AudioStream> stream) {
    // Anything other than a single channel gets wrapped in a ChannelDownmixer
    const bool needsDownmix = stream->getChannelCount() != 1;
    if (needsDownmix) {
        stream.reset(new ChannelDownmixer(std::move(stream)));
    }

    // Frame rates below the target are rejected rather than converted
    if (stream->getFrameRate() < 16000) {
        throw runtime_error("Sample rate must not be below 16kHz.");
    }

    // Already at the target rate: no converter needed
    if (stream->getFrameRate() == 16000) {
        return stream;
    }

    stream.reset(new SampleRateConverter(std::move(stream), 16000));
    return stream;
}
|
||||
|
||||
// Converts a float sample to a signed 16-bit int.
//
// The input is clamped to the range -1..1 and then mapped linearly so that -1 becomes
// INT16_MIN and 1 becomes INT16_MAX. Fractional results are truncated toward zero.
// A NaN sample is converted to 0: NaN passes through std::max/std::min unchanged, and
// converting NaN to an integer type is undefined behavior.
int16_t floatSampleToInt16(float sample) {
    // NaN is the only value that compares unequal to itself
    if (sample != sample) {
        return 0;
    }

    sample = std::max(sample, -1.0f);
    sample = std::min(sample, 1.0f);
    return static_cast<int16_t>(((sample + 1) / 2) * (INT16_MAX - INT16_MIN) + INT16_MIN);
}
|
||||
|
||||
// Runs PocketSphinx phone recognition over an audio stream.
//
// The stream is first passed through to16kHzMono(), then fed to the recognizer in
// chunks of up to 1600 samples until the stream is exhausted.
//
// Returns a map with one entry per recognized segment, keyed by the segment's start
// frame and holding the phone that stringToPhone() yields for the segment's name.
// A final Phone::None entry is added one frame past the end of the last segment; if
// no segments were recognized at all, that entry is placed at 0.
// NOTE(review): frame numbers are used directly as centiseconds, which assumes the
// recognizer works with 10 ms frames - confirm against the PocketSphinx configuration.
//
// Throws std::runtime_error if the recognizer cannot be configured, created, started,
// fed or finished, and propagates any error raised by to16kHzMono().
map<centiseconds, Phone> detectPhones(unique_ptr<AudioStream> audioStream) {
    // Convert audio stream to the exact format PocketSphinx requires
    audioStream = to16kHzMono(std::move(audioStream));

    // Create PocketSphinx configuration.
    // Resource files are looked up in the "res" directory next to the bin directory.
    path binDirectory(getBinDirectory());
    path resDirectory(binDirectory.parent_path() / "res");
    shared_ptr<cmd_ln_t> config(
        cmd_ln_init(
            nullptr, ps_args(), true,
            // Set acoustic model
            "-hmm", (resDirectory / "sphinx/acoustic_model").string().c_str(),
            // Set phonetic language model
            "-allphone", (resDirectory / "sphinx/en-us-phone.lm.bin").string().c_str(),
            "-allphone_ci", "yes",
            // The following settings are taken from http://cmusphinx.sourceforge.net/wiki/phonemerecognition
            // Set beam width applied to every frame in Viterbi search
            "-beam", "1e-20",
            // Set beam width applied to phone transitions
            "-pbeam", "1e-20",
            // Set language model probability weight
            "-lw", "2.0",
            nullptr),
        [](cmd_ln_t* config) { cmd_ln_free_r(config); });
    if (!config) throw runtime_error("Error creating configuration.");

    // Create phone recognizer
    shared_ptr<ps_decoder_t> recognizer(
        ps_init(config.get()),
        [](ps_decoder_t* recognizer) { ps_free(recognizer); });
    if (!recognizer) throw runtime_error("Error creating speech recognizer.");

    // Start recognition
    int error = ps_start_utt(recognizer.get());
    if (error) throw runtime_error("Error starting utterance processing.");

    // Process entire sound file
    std::vector<int16_t> buffer;
    const std::size_t capacity = 1600; // 0.1 second capacity
    buffer.reserve(capacity);
    do {
        // Read to buffer
        buffer.clear();
        while (buffer.size() < capacity) {
            float sample;
            if (!audioStream->getNextSample(sample)) break;
            buffer.push_back(floatSampleToInt16(sample));
        }

        // Analyze buffer
        int searchedFrameCount = ps_process_raw(recognizer.get(), buffer.data(), buffer.size(), false, false);
        if (searchedFrameCount < 0) throw runtime_error("Error decoding raw audio data.");
    } while (!buffer.empty()); // An empty read means the stream is exhausted
    error = ps_end_utt(recognizer.get());
    if (error) throw runtime_error("Error ending utterance processing.");

    // Collect results into map
    map<centiseconds, Phone> result;
    int32 score;
    // Starts at -1 so that the dummy entry below lands on 0 if there are no segments.
    // (Reading it uninitialized in that case would be undefined behavior.)
    int lastEndFrame = -1;
    for (ps_seg_t* segment = ps_seg_iter(recognizer.get(), &score); segment; segment = ps_seg_next(segment)) {
        // Get phone
        char const* phone = ps_seg_word(segment);

        // Get timing
        int startFrame;
        ps_seg_frames(segment, &startFrame, &lastEndFrame);

        // Constructing a std::string from a null pointer is undefined, so guard against it
        result[centiseconds(startFrame)] = phone ? stringToPhone(phone) : Phone::Unknown;
    }

    // Add dummy entry past the last phone
    result[centiseconds(lastEndFrame + 1)] = Phone::None;
    return result;
}
|
|
@ -0,0 +1,14 @@
|
|||
#ifndef LIPSYNC_PHONE_EXTRACTION_H
|
||||
#define LIPSYNC_PHONE_EXTRACTION_H
|
||||
|
||||
#include <map>
|
||||
#include <chrono>
|
||||
#include <ratio>
|
||||
#include <memory>
|
||||
#include "audio_input/AudioStream.h"
|
||||
#include "Phone.h"
|
||||
#include "centiseconds.h"
|
||||
|
||||
std::map<centiseconds, Phone> detectPhones(std::unique_ptr<AudioStream> audioStream);
|
||||
|
||||
#endif //LIPSYNC_PHONE_EXTRACTION_H
|
|
@ -0,0 +1,8 @@
|
|||
#ifndef LIPSYNC_PLATFORM_TOOLS_H
|
||||
#define LIPSYNC_PLATFORM_TOOLS_H
|
||||
|
||||
#include <boost/filesystem.hpp>
|
||||
|
||||
boost::filesystem::path getBinDirectory();
|
||||
|
||||
#endif //LIPSYNC_PLATFORM_TOOLS_H
|
|
@ -0,0 +1,24 @@
|
|||
#include "platform_tools.h"
|
||||
|
||||
#include <Windows.h>
|
||||
|
||||
// Returns the directory that contains the running executable.
// Throws std::runtime_error if the executable's path cannot be determined.
boost::filesystem::path getBinDirectory() {
    // Start with a buffer of MAX_PATH characters
    std::vector<wchar_t> pathBuffer(MAX_PATH);

    // As long as the function returns the buffer size, it is indicating that the
    // buffer was too small. Keep doubling the buffer size until the path fits.
    DWORD length;
    for (;;) {
        length = GetModuleFileNameW(nullptr, pathBuffer.data(), pathBuffer.size());
        if (length != pathBuffer.size()) break;
        pathBuffer.resize(pathBuffer.size() * 2);
    }

    // A return value of 0 means that something went wrong
    if (length == 0) {
        throw std::runtime_error("Could not determine path of bin directory.");
    }

    const boost::filesystem::path executableFile(pathBuffer.data());
    return executableFile.parent_path();
}
|
Loading…
Reference in New Issue