Refactoring
- Moved phone recognition code to phone_extraction.cpp - Introduced type centiseconds - Code reorganization
This commit is contained in:
parent
9fbae36e70
commit
f2f6f75932
|
@ -126,19 +126,25 @@
|
||||||
<sourceFolder url="file://$MODULE_DIR$/lib/sphinxbase-5prealpha-2015-08-05/src/libsphinxbase/util/slamch.c" isTestSource="false" />
|
<sourceFolder url="file://$MODULE_DIR$/lib/sphinxbase-5prealpha-2015-08-05/src/libsphinxbase/util/slamch.c" isTestSource="false" />
|
||||||
<sourceFolder url="file://$MODULE_DIR$/lib/sphinxbase-5prealpha-2015-08-05/src/libsphinxbase/util/slapack_lite.c" isTestSource="false" />
|
<sourceFolder url="file://$MODULE_DIR$/lib/sphinxbase-5prealpha-2015-08-05/src/libsphinxbase/util/slapack_lite.c" isTestSource="false" />
|
||||||
<sourceFolder url="file://$MODULE_DIR$/lib/sphinxbase-5prealpha-2015-08-05/src/libsphinxbase/util/strfuncs.c" isTestSource="false" />
|
<sourceFolder url="file://$MODULE_DIR$/lib/sphinxbase-5prealpha-2015-08-05/src/libsphinxbase/util/strfuncs.c" isTestSource="false" />
|
||||||
<sourceFolder url="file://$MODULE_DIR$/src/audio_input/16kHzMonoStream.cpp" isTestSource="false" />
|
|
||||||
<sourceFolder url="file://$MODULE_DIR$/src/audio_input/16kHzMonoStream.h" isTestSource="false" />
|
|
||||||
<sourceFolder url="file://$MODULE_DIR$/src/audio_input/AudioStream.h" isTestSource="false" />
|
<sourceFolder url="file://$MODULE_DIR$/src/audio_input/AudioStream.h" isTestSource="false" />
|
||||||
<sourceFolder url="file://$MODULE_DIR$/src/audio_input/ChannelDownmixer.cpp" isTestSource="false" />
|
<sourceFolder url="file://$MODULE_DIR$/src/audio_input/ChannelDownmixer.cpp" isTestSource="false" />
|
||||||
<sourceFolder url="file://$MODULE_DIR$/src/audio_input/ChannelDownmixer.h" isTestSource="false" />
|
<sourceFolder url="file://$MODULE_DIR$/src/audio_input/ChannelDownmixer.h" isTestSource="false" />
|
||||||
<sourceFolder url="file://$MODULE_DIR$/src/audio_input/IOTools.h" isTestSource="false" />
|
<sourceFolder url="file://$MODULE_DIR$/src/audio_input/io_tools.h" isTestSource="false" />
|
||||||
<sourceFolder url="file://$MODULE_DIR$/src/audio_input/SampleRateConverter.cpp" isTestSource="false" />
|
<sourceFolder url="file://$MODULE_DIR$/src/audio_input/SampleRateConverter.cpp" isTestSource="false" />
|
||||||
<sourceFolder url="file://$MODULE_DIR$/src/audio_input/SampleRateConverter.h" isTestSource="false" />
|
<sourceFolder url="file://$MODULE_DIR$/src/audio_input/SampleRateConverter.h" isTestSource="false" />
|
||||||
|
<sourceFolder url="file://$MODULE_DIR$/src/audio_input/wave_file_writing.cpp" isTestSource="false" />
|
||||||
|
<sourceFolder url="file://$MODULE_DIR$/src/audio_input/wave_file_writing.h" isTestSource="false" />
|
||||||
<sourceFolder url="file://$MODULE_DIR$/src/audio_input/WaveFileReader.cpp" isTestSource="false" />
|
<sourceFolder url="file://$MODULE_DIR$/src/audio_input/WaveFileReader.cpp" isTestSource="false" />
|
||||||
<sourceFolder url="file://$MODULE_DIR$/src/audio_input/WaveFileReader.h" isTestSource="false" />
|
<sourceFolder url="file://$MODULE_DIR$/src/audio_input/WaveFileReader.h" isTestSource="false" />
|
||||||
<sourceFolder url="file://$MODULE_DIR$/src/audio_input/WaveFileWriter.cpp" isTestSource="false" />
|
<sourceFolder url="file://$MODULE_DIR$/src/centiseconds.cpp" isTestSource="false" />
|
||||||
<sourceFolder url="file://$MODULE_DIR$/src/audio_input/WaveFileWriter.h" isTestSource="false" />
|
<sourceFolder url="file://$MODULE_DIR$/src/centiseconds.h" isTestSource="false" />
|
||||||
<sourceFolder url="file://$MODULE_DIR$/src/main.cpp" isTestSource="false" />
|
<sourceFolder url="file://$MODULE_DIR$/src/main.cpp" isTestSource="false" />
|
||||||
|
<sourceFolder url="file://$MODULE_DIR$/src/Phone.cpp" isTestSource="false" />
|
||||||
|
<sourceFolder url="file://$MODULE_DIR$/src/Phone.h" isTestSource="false" />
|
||||||
|
<sourceFolder url="file://$MODULE_DIR$/src/phone_extraction.cpp" isTestSource="false" />
|
||||||
|
<sourceFolder url="file://$MODULE_DIR$/src/phone_extraction.h" isTestSource="false" />
|
||||||
|
<sourceFolder url="file://$MODULE_DIR$/src/platform_tools.h" isTestSource="false" />
|
||||||
|
<sourceFolder url="file://$MODULE_DIR$/src/platform_tools_win.cpp" isTestSource="false" />
|
||||||
</content>
|
</content>
|
||||||
<orderEntry type="inheritedJdk" />
|
<orderEntry type="inheritedJdk" />
|
||||||
<orderEntry type="sourceFolder" forTests="false" />
|
<orderEntry type="sourceFolder" forTests="false" />
|
||||||
|
@ -169,8 +175,8 @@
|
||||||
<excluded>
|
<excluded>
|
||||||
<root url="file://$MODULE_DIR$/lib/cppformat/format.cc" />
|
<root url="file://$MODULE_DIR$/lib/cppformat/format.cc" />
|
||||||
<root url="file://$MODULE_DIR$/lib/cppformat/posix.cc" />
|
<root url="file://$MODULE_DIR$/lib/cppformat/posix.cc" />
|
||||||
<root url="file://$MODULE_DIR$/lib/cppformat/posix.h" />
|
|
||||||
<root url="file://$MODULE_DIR$/lib/cppformat/format.h" />
|
<root url="file://$MODULE_DIR$/lib/cppformat/format.h" />
|
||||||
|
<root url="file://$MODULE_DIR$/lib/cppformat/posix.h" />
|
||||||
</excluded>
|
</excluded>
|
||||||
</library>
|
</library>
|
||||||
</orderEntry>
|
</orderEntry>
|
||||||
|
|
|
@ -1,7 +1,7 @@
|
||||||
cmake_minimum_required(VERSION 3.3)
|
cmake_minimum_required(VERSION 3.3)
|
||||||
project(LipSync)
|
project(LipSync)
|
||||||
|
|
||||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
|
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wall")
|
||||||
|
|
||||||
set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib")
|
set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib")
|
||||||
set(CMAKE_LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib")
|
set(CMAKE_LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib")
|
||||||
|
@ -11,10 +11,15 @@ set(CMAKE_RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/bin")
|
||||||
set(Boost_USE_STATIC_LIBS ON) # Use static libs
|
set(Boost_USE_STATIC_LIBS ON) # Use static libs
|
||||||
set(Boost_USE_MULTITHREADED ON) # Enable multithreading support
|
set(Boost_USE_MULTITHREADED ON) # Enable multithreading support
|
||||||
set(Boost_USE_STATIC_RUNTIME ON) # Use static C++ runtime
|
set(Boost_USE_STATIC_RUNTIME ON) # Use static C++ runtime
|
||||||
find_package(Boost REQUIRED COMPONENTS filesystem locale )
|
find_package(Boost REQUIRED COMPONENTS filesystem locale system)
|
||||||
include_directories(${Boost_INCLUDE_DIRS})
|
include_directories(${Boost_INCLUDE_DIRS})
|
||||||
|
|
||||||
set(SOURCE_FILES src/main.cpp src/audio_input/WaveFileReader.cpp src/audio_input/WaveFileReader.h src/audio_input/ChannelDownmixer.cpp src/audio_input/ChannelDownmixer.h src/audio_input/AudioStream.h src/audio_input/SampleRateConverter.cpp src/audio_input/SampleRateConverter.h src/audio_input/16kHzMonoStream.cpp src/audio_input/16kHzMonoStream.h src/audio_input/WaveFileWriter.cpp src/audio_input/WaveFileWriter.h src/audio_input/IOTools.h)
|
set(SOURCE_FILES src/main.cpp src/audio_input/WaveFileReader.cpp src/audio_input/WaveFileReader.h src/audio_input/ChannelDownmixer.cpp src/audio_input/ChannelDownmixer.h src/audio_input/AudioStream.h src/audio_input/SampleRateConverter.cpp src/audio_input/SampleRateConverter.h src/audio_input/wave_file_writing.cpp src/audio_input/wave_file_writing.h src/audio_input/io_tools.h src/platform_tools.h src/phone_extraction.cpp src/phone_extraction.h src/Phone.cpp src/Phone.h src/centiseconds.cpp src/centiseconds.h)
|
||||||
|
if(WIN32)
|
||||||
|
set(SOURCE_FILES "${SOURCE_FILES};src/platform_tools_win.cpp")
|
||||||
|
else()
|
||||||
|
message(FATAL_ERROR "Target platform not supported.")
|
||||||
|
endif()
|
||||||
|
|
||||||
include_directories("lib/sphinxbase-5prealpha-2015-08-05/include" "lib/pocketsphinx-5prealpha-2015-08-05/include" "lib/cppformat")
|
include_directories("lib/sphinxbase-5prealpha-2015-08-05/include" "lib/pocketsphinx-5prealpha-2015-08-05/include" "lib/cppformat")
|
||||||
FILE(GLOB_RECURSE SPHINX_BASE "lib/sphinxbase-5prealpha-2015-08-05/src/libsphinxbase/*.c")
|
FILE(GLOB_RECURSE SPHINX_BASE "lib/sphinxbase-5prealpha-2015-08-05/src/libsphinxbase/*.c")
|
||||||
|
@ -44,5 +49,5 @@ endfunction()
|
||||||
|
|
||||||
# Copy resource files
|
# Copy resource files
|
||||||
set(modelDir "${CMAKE_SOURCE_DIR}/lib/pocketsphinx-5prealpha-2015-08-05/model")
|
set(modelDir "${CMAKE_SOURCE_DIR}/lib/pocketsphinx-5prealpha-2015-08-05/model")
|
||||||
copy_after_build("${modelDir}/en-us/en-us-phone.lm.bin" "res/sphinx/acoustic_model")
|
copy_after_build("${modelDir}/en-us/en-us-phone.lm.bin" "res/sphinx")
|
||||||
copy_after_build("${modelDir}/en-us/en-us/*" "res/sphinx/language_model")
|
copy_after_build("${modelDir}/en-us/en-us/*" "res/sphinx/acoustic_model")
|
||||||
|
|
|
@ -0,0 +1,36 @@
|
||||||
|
#include <boost/bimap.hpp>
|
||||||
|
#include "Phone.h"
|
||||||
|
|
||||||
|
using std::string;
|
||||||
|
|
||||||
|
template <typename L, typename R>
|
||||||
|
boost::bimap<L, R>
|
||||||
|
makeBimap(std::initializer_list<typename boost::bimap<L, R>::value_type> list) {
|
||||||
|
return boost::bimap<L, R>(list.begin(), list.end());
|
||||||
|
}
|
||||||
|
|
||||||
|
// Bidirectional lookup table between phone names and Phone values.
// The left view is keyed by name, the right view by Phone.
// The table is an implementation detail of this file and is never modified
// after construction, so it is declared static const.
static const boost::bimap<string, Phone> phonesByName = makeBimap<string, Phone>({
	{ "None", Phone::None },
	{ "Unknown", Phone::Unknown },
	{ "AO", Phone::AO }, { "AA", Phone::AA }, { "IY", Phone::IY }, { "UW", Phone::UW },
	{ "EH", Phone::EH }, { "IH", Phone::IH }, { "UH", Phone::UH }, { "AH", Phone::AH },
	{ "AE", Phone::AE }, { "EY", Phone::EY }, { "AY", Phone::AY }, { "OW", Phone::OW },
	{ "AW", Phone::AW }, { "OY", Phone::OY }, { "ER", Phone::ER }, { "P", Phone::P },
	{ "B", Phone::B }, { "T", Phone::T }, { "D", Phone::D }, { "K", Phone::K },
	{ "G", Phone::G }, { "CH", Phone::CH }, { "JH", Phone::JH }, { "F", Phone::F },
	{ "V", Phone::V }, { "TH", Phone::TH }, { "DH", Phone::DH }, { "S", Phone::S },
	{ "Z", Phone::Z }, { "SH", Phone::SH }, { "ZH", Phone::ZH }, { "HH", Phone::HH },
	{ "M", Phone::M }, { "N", Phone::N }, { "NG", Phone::NG }, { "L", Phone::L },
	{ "R", Phone::R }, { "Y", Phone::Y }, { "W", Phone::W },
});
|
||||||
|
|
||||||
|
// Returns the Phone registered under the given name in phonesByName,
// or Phone::Unknown if the name is not in the table.
Phone stringToPhone(const string& s) {
	const auto match = phonesByName.left.find(s);
	if (match == phonesByName.left.end()) {
		return Phone::Unknown;
	}
	return match->second;
}
|
||||||
|
|
||||||
|
// Returns the name registered for the given Phone in phonesByName.
// A value without an entry is reported under the name of Phone::Unknown.
string phoneToString(Phone phone) {
	const auto match = phonesByName.right.find(phone);
	if (match != phonesByName.right.end()) {
		return match->second;
	}
	return phoneToString(Phone::Unknown);
}
|
||||||
|
|
|
@ -0,0 +1,78 @@
|
||||||
|
#ifndef LIPSYNC_PHONE_H
#define LIPSYNC_PHONE_H

// Required for the std::string used in the conversion functions below,
// so that this header can be included on its own.
#include <string>

// Defines a subset of the Arpabet
enum class Phone {
	None,
	Unknown,

	/////////
	// Vowels

	// ... monophthongs
	AO, // [ɔ] as in [o]ff, f[a]ll, fr[o]st
	AA, // [ɑ] as in f[a]ther
	IY, // [i] as in b[ee], sh[e]
	UW, // [u] as in y[ou], n[ew], f[oo]d
	EH, // [ɛ] as in r[e]d, m[e]n
	IH, // [ɪ] as in b[i]g, w[i]n
	UH, // [ʊ] as in sh[ou]ld, c[ou]ld
	AH, // [ʌ, ə] as in b[u]t, s[u]n, [a]lone, disc[u]s
	AE, // [æ] as in [a]t, b[a]t

	// ... diphthongs
	EY, // [eɪ] as in s[ay], [ei]ght
	AY, // [aɪ] as in m[y], wh[y], r[i]de
	OW, // [oʊ] as in sh[ow], c[oa]t
	AW, // [aʊ] as in h[ow], n[ow]
	OY, // [ɔɪ] as in b[oy], t[oy]

	// ... r-colored
	ER, // [ɝ] as in h[er], b[ir]d, h[ur]t

	/////////////
	// Consonants

	// ... stops
	P, // [p] as in [p]ay
	B, // [b] as in [b]uy
	T, // [t] as in [t]ake
	D, // [d] as in [d]ay
	K, // [k] as in [k]ey
	G, // [g] as in [g]o

	// ... affricates
	CH, // [tʃ] as in [ch]air
	JH, // [dʒ] as in [j]ust

	// ... fricatives
	F, // [f] as in [f]or
	V, // [v] as in [v]ery
	TH, // [θ] as in [th]anks
	DH, // [ð] as in [th]at
	S, // [s] as in [s]ay
	Z, // [z] as in [z]oo
	SH, // [ʃ] as in [sh]ow
	ZH, // [ʒ] as in mea[s]ure, plea[s]ure
	HH, // [h] as in [h]ouse

	// ... nasals
	M, // [m] as in [m]an
	N, // [n] as in [n]o
	NG, // [ŋ] as in si[ng]

	// ... liquids
	L, // [ɫ] as in [l]ate
	R, // [r, ɹ] as in [r]un

	// ... semivowels
	Y, // [j] as in [y]es
	W // [w] as in [w]ay
};

// Converts a phone name such as "AO" to the corresponding Phone value.
Phone stringToPhone(const std::string& s);

// Converts a Phone value to its name.
std::string phoneToString(Phone phone);

#endif //LIPSYNC_PHONE_H
|
|
@ -1,26 +0,0 @@
|
||||||
#include "16kHzMonoStream.h"
|
|
||||||
#include "WaveFileReader.h"
|
|
||||||
#include "ChannelDownmixer.h"
|
|
||||||
#include "SampleRateConverter.h"
|
|
||||||
|
|
||||||
using std::runtime_error;
|
|
||||||
|
|
||||||
std::unique_ptr<AudioStream> create16kHzMonoStream(std::string fileName) {
|
|
||||||
// Create audio stream
|
|
||||||
std::unique_ptr<AudioStream> stream(new WaveFileReader(fileName));
|
|
||||||
|
|
||||||
// Downmix, if required
|
|
||||||
if (stream->getChannelCount() != 1) {
|
|
||||||
stream.reset(new ChannelDownmixer(std::move(stream)));
|
|
||||||
}
|
|
||||||
|
|
||||||
// Downsample, if required
|
|
||||||
if (stream->getFrameRate() < 16000) {
|
|
||||||
throw runtime_error("Sample rate must not be below 16kHz.");
|
|
||||||
}
|
|
||||||
if (stream->getFrameRate() != 16000) {
|
|
||||||
stream.reset(new SampleRateConverter(std::move(stream), 16000));
|
|
||||||
}
|
|
||||||
|
|
||||||
return stream;
|
|
||||||
}
|
|
|
@ -1,10 +0,0 @@
|
||||||
#ifndef LIPSYNC_WAVEFILEREADER16KHZMONO_H
|
|
||||||
#define LIPSYNC_WAVEFILEREADER16KHZMONO_H
|
|
||||||
|
|
||||||
#include "AudioStream.h"
|
|
||||||
#include <memory>
|
|
||||||
#include <string>
|
|
||||||
|
|
||||||
std::unique_ptr<AudioStream> create16kHzMonoStream(std::string fileName);
|
|
||||||
|
|
||||||
#endif //LIPSYNC_WAVEFILEREADER16KHZMONO_H
|
|
|
@ -1,6 +1,6 @@
|
||||||
#include <format.h>
|
#include <format.h>
|
||||||
#include "WaveFileReader.h"
|
#include "WaveFileReader.h"
|
||||||
#include "IOTools.h"
|
#include "io_tools.h"
|
||||||
|
|
||||||
using std::runtime_error;
|
using std::runtime_error;
|
||||||
using fmt::format;
|
using fmt::format;
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
#include <fstream>
|
#include <fstream>
|
||||||
#include "WaveFileWriter.h"
|
#include "wave_file_writing.h"
|
||||||
#include "IOTools.h"
|
#include "io_tools.h"
|
||||||
|
|
||||||
using namespace little_endian;
|
using namespace little_endian;
|
||||||
|
|
|
@ -0,0 +1,9 @@
|
||||||
|
#include <ratio>
#include <chrono>
#include <ostream>
// The header is named in lower case (centiseconds.h); the include must match
// exactly so the file also builds on case-sensitive file systems.
#include "centiseconds.h"

// Writes a centiseconds value to the stream as its tick count followed by the
// unit suffix "cs". Returns the stream to allow chaining.
std::ostream& operator <<(std::ostream& stream, const centiseconds cs) {
	return stream << cs.count() << "cs";
}
|
||||||
|
|
|
@ -0,0 +1,8 @@
|
||||||
|
#ifndef LIPSYNC_CENTISECONDS_H
#define LIPSYNC_CENTISECONDS_H

// Everything this header uses is included here so that it does not depend on
// the include order of the files that use it.
#include <chrono>
#include <ostream>
#include <ratio>

// Duration with a resolution of 1/100 of a second, counted in an int.
using centiseconds = std::chrono::duration<int, std::centi>;

// Stream output for centiseconds values.
std::ostream& operator <<(std::ostream& stream, const centiseconds cs);

#endif //LIPSYNC_CENTISECONDS_H
|
97
src/main.cpp
97
src/main.cpp
|
@ -1,97 +1,16 @@
|
||||||
#include <pocketsphinx.h>
|
|
||||||
#include <stdexcept>
|
|
||||||
#include <fstream>
|
|
||||||
#include <memory>
|
|
||||||
#include <vector>
|
|
||||||
#include <iostream>
|
#include <iostream>
|
||||||
#include <chrono>
|
#include "audio_input/WaveFileReader.h"
|
||||||
#include "audio_input/16kHzMonoStream.h"
|
#include "phone_extraction.h"
|
||||||
|
|
||||||
using std::runtime_error;
|
|
||||||
using std::shared_ptr;
|
|
||||||
using std::unique_ptr;
|
|
||||||
|
|
||||||
#define MODELDIR "X:/dev/projects/LipSync/lib/pocketsphinx-5prealpha-2015-08-05/model"
|
|
||||||
|
|
||||||
// Converts a float in the range -1..1 to a signed 16-bit int
|
|
||||||
int16_t floatSampleToInt16(float sample) {
|
|
||||||
sample = std::max(sample, -1.0f);
|
|
||||||
sample = std::min(sample, 1.0f);
|
|
||||||
return static_cast<int16_t>(((sample + 1) / 2) * (INT16_MAX - INT16_MIN) + INT16_MIN);
|
|
||||||
}
|
|
||||||
|
|
||||||
int main(int argc, char *argv[]) {
|
int main(int argc, char *argv[]) {
|
||||||
shared_ptr<cmd_ln_t> config(
|
// Create audio stream
|
||||||
cmd_ln_init(
|
std::unique_ptr<AudioStream> audioStream(
|
||||||
nullptr, ps_args(), true,
|
new WaveFileReader(R"(C:\Users\Daniel\Desktop\audio-test\test 16000Hz 1ch 16bit.wav)"));
|
||||||
// Set acoustic model
|
|
||||||
"-hmm", MODELDIR "/en-us/en-us",
|
|
||||||
// Set phonetic language model
|
|
||||||
"-allphone", MODELDIR "/en-us/en-us-phone.lm.bin",
|
|
||||||
"-allphone_ci", "yes",
|
|
||||||
// The following settings are Voodoo to me.
|
|
||||||
// I copied them from http://cmusphinx.sourceforge.net/wiki/phonemerecognition
|
|
||||||
// Set beam width applied to every frame in Viterbi search
|
|
||||||
"-beam", "1e-20",
|
|
||||||
// Set beam width applied to phone transitions
|
|
||||||
"-pbeam", "1e-20",
|
|
||||||
// Set language model probability weight
|
|
||||||
"-lw", "2.0",
|
|
||||||
nullptr),
|
|
||||||
[](cmd_ln_t* config) { cmd_ln_free_r(config); });
|
|
||||||
if (!config) throw runtime_error("Error creating configuration.");
|
|
||||||
|
|
||||||
shared_ptr<ps_decoder_t> recognizer(
|
std::map<centiseconds, Phone> phones = detectPhones(std::move(audioStream));
|
||||||
ps_init(config.get()),
|
|
||||||
[](ps_decoder_t* recognizer) { ps_free(recognizer); });
|
|
||||||
if (!recognizer) throw runtime_error("Error creating speech recognizer.");
|
|
||||||
|
|
||||||
unique_ptr<AudioStream> audioStream =
|
for (auto& pair : phones) {
|
||||||
create16kHzMonoStream(R"(C:\Users\Daniel\Desktop\audio-test\test 16000Hz 1ch 16bit.wav)");
|
std::cout << pair.first << ": " << phoneToString(pair.second) << "\n";
|
||||||
|
|
||||||
int error = ps_start_utt(recognizer.get());
|
|
||||||
if (error) throw runtime_error("Error starting utterance processing.");
|
|
||||||
|
|
||||||
auto start = std::chrono::steady_clock::now();
|
|
||||||
|
|
||||||
std::vector<int16_t> buffer;
|
|
||||||
const int capacity = 1600; // 0.1 second capacity
|
|
||||||
buffer.reserve(capacity);
|
|
||||||
int sampleCount = 0;
|
|
||||||
do {
|
|
||||||
// Read to buffer
|
|
||||||
buffer.clear();
|
|
||||||
while (buffer.size() < capacity) {
|
|
||||||
float sample;
|
|
||||||
if (!audioStream->getNextSample(sample)) break;
|
|
||||||
buffer.push_back(floatSampleToInt16(sample));
|
|
||||||
}
|
|
||||||
|
|
||||||
// Analyze buffer
|
|
||||||
int searchedFrameCount = ps_process_raw(recognizer.get(), buffer.data(), buffer.size(), false, false);
|
|
||||||
if (searchedFrameCount < 0) throw runtime_error("Error decoding raw audio data.");
|
|
||||||
|
|
||||||
sampleCount += buffer.size();
|
|
||||||
|
|
||||||
std::cout << sampleCount / 16000.0 << "s\n";
|
|
||||||
} while (buffer.size());
|
|
||||||
error = ps_end_utt(recognizer.get());
|
|
||||||
if (error) throw runtime_error("Error ending utterance processing.");
|
|
||||||
|
|
||||||
auto end = std::chrono::steady_clock::now();
|
|
||||||
std::cout << std::chrono::duration_cast<std::chrono::duration<double>>(end - start).count() << "\n";
|
|
||||||
|
|
||||||
ps_seg_t *segmentationIter;
|
|
||||||
int32 score;
|
|
||||||
for (segmentationIter = ps_seg_iter(recognizer.get(), &score); segmentationIter; segmentationIter = ps_seg_next(segmentationIter)) {
|
|
||||||
// Get phoneme
|
|
||||||
char const *phoneme = ps_seg_word(segmentationIter);
|
|
||||||
|
|
||||||
// Get timing
|
|
||||||
int startFrame, endFrame;
|
|
||||||
ps_seg_frames(segmentationIter, &startFrame, &endFrame);
|
|
||||||
|
|
||||||
printf(">>> %-5s %-5d %-5d\n", phoneme, startFrame, endFrame);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
|
|
|
@ -0,0 +1,116 @@
|
||||||
|
#include <pocketsphinx.h>
|
||||||
|
#include <iostream>
|
||||||
|
#include <boost/filesystem.hpp>
|
||||||
|
#include "phone_extraction.h"
|
||||||
|
#include "audio_input/SampleRateConverter.h"
|
||||||
|
#include "audio_input/ChannelDownmixer.h"
|
||||||
|
#include "platform_tools.h"
|
||||||
|
using std::runtime_error;
|
||||||
|
using std::unique_ptr;
|
||||||
|
using std::shared_ptr;
|
||||||
|
using std::string;
|
||||||
|
using std::map;
|
||||||
|
using boost::filesystem::path;
|
||||||
|
|
||||||
|
// Wraps the given stream in adapters as needed so that the returned stream
// has a channel count of 1 and a frame rate of 16000.
// Throws runtime_error if the frame rate is below 16000, since that case is
// rejected rather than converted.
unique_ptr<AudioStream> to16kHzMono(unique_ptr<AudioStream> stream) {
	const int targetFrameRate = 16000;

	// More than one channel: put a downmixer in front of the stream
	const bool isSingleChannel = stream->getChannelCount() == 1;
	if (!isSingleChannel) {
		stream = unique_ptr<AudioStream>(new ChannelDownmixer(std::move(stream)));
	}

	// Frame rates below the target are rejected
	if (stream->getFrameRate() < targetFrameRate) {
		throw runtime_error("Sample rate must not be below 16kHz.");
	}

	// Any other frame rate: put a sample rate converter in front of the stream
	if (stream->getFrameRate() != targetFrameRate) {
		stream = unique_ptr<AudioStream>(new SampleRateConverter(std::move(stream), targetFrameRate));
	}

	return stream;
}
|
||||||
|
|
||||||
|
// Maps a float sample to a signed 16-bit integer sample.
// Input below -1 or above 1 is clamped to that range first; -1 then maps to
// INT16_MIN and 1 maps to INT16_MAX.
int16_t floatSampleToInt16(float sample) {
	// Clamp to -1..1
	if (sample < -1.0f) {
		sample = -1.0f;
	}
	if (1.0f < sample) {
		sample = 1.0f;
	}

	// Shift to 0..1, then scale to the full 16-bit range
	const float unitValue = (sample + 1) / 2;
	const float scaledValue = unitValue * (INT16_MAX - INT16_MIN) + INT16_MIN;
	return static_cast<int16_t>(scaledValue);
}
|
||||||
|
|
||||||
|
// Runs PocketSphinx phone recognition over the given audio stream.
//
// The stream is first converted to 16 kHz mono. The result maps the start of
// each recognized phone to that phone. An additional Phone::None entry one
// frame past the end of the last phone terminates the sequence; if nothing was
// recognized, the result consists only of a Phone::None entry at time zero.
//
// Frame indices reported by PocketSphinx are used directly as centiseconds.
// NOTE(review): this presumes PocketSphinx's default rate of 100 frames per
// second -- confirm if the "-frate" setting is ever changed.
//
// Throws runtime_error if the configuration or recognizer cannot be created
// or if PocketSphinx reports an error while decoding.
map<centiseconds, Phone> detectPhones(unique_ptr<AudioStream> audioStream) {
	// Convert audio stream to the exact format PocketSphinx requires
	audioStream = to16kHzMono(std::move(audioStream));

	// Locate the resource files relative to the executable. The path strings
	// are held in named variables so that the pointers handed to cmd_ln_init
	// do not depend on the lifetime of temporaries.
	const path binDirectory(getBinDirectory());
	const path resDirectory(binDirectory.parent_path() / "res");
	const string acousticModelPath = (resDirectory / "sphinx/acoustic_model").string();
	const string languageModelPath = (resDirectory / "sphinx/en-us-phone.lm.bin").string();

	// Create PocketSphinx configuration
	shared_ptr<cmd_ln_t> config(
		cmd_ln_init(
			nullptr, ps_args(), true,
			// Set acoustic model
			"-hmm", acousticModelPath.c_str(),
			// Set phonetic language model
			"-allphone", languageModelPath.c_str(),
			"-allphone_ci", "yes",
			// The following settings are taken from http://cmusphinx.sourceforge.net/wiki/phonemerecognition
			// Set beam width applied to every frame in Viterbi search
			"-beam", "1e-20",
			// Set beam width applied to phone transitions
			"-pbeam", "1e-20",
			// Set language model probability weight
			"-lw", "2.0",
			nullptr),
		[](cmd_ln_t* config) { cmd_ln_free_r(config); });
	if (!config) throw runtime_error("Error creating configuration.");

	// Create phone recognizer
	shared_ptr<ps_decoder_t> recognizer(
		ps_init(config.get()),
		[](ps_decoder_t* recognizer) { ps_free(recognizer); });
	if (!recognizer) throw runtime_error("Error creating speech recognizer.");

	// Start recognition
	int error = ps_start_utt(recognizer.get());
	if (error) throw runtime_error("Error starting utterance processing.");

	// Process entire sound file in chunks
	std::vector<int16_t> buffer;
	const size_t capacity = 1600; // 0.1 second capacity
	buffer.reserve(capacity);
	do {
		// Read to buffer
		buffer.clear();
		while (buffer.size() < capacity) {
			float sample;
			if (!audioStream->getNextSample(sample)) break;
			buffer.push_back(floatSampleToInt16(sample));
		}

		// Analyze buffer
		int searchedFrameCount = ps_process_raw(recognizer.get(), buffer.data(), buffer.size(), false, false);
		if (searchedFrameCount < 0) throw runtime_error("Error decoding raw audio data.");
	} while (!buffer.empty());

	error = ps_end_utt(recognizer.get());
	if (error) throw runtime_error("Error ending utterance processing.");

	// Collect results into map
	map<centiseconds, Phone> result;
	int32 score;
	// Initialized so that the terminating entry lands at time zero if the
	// recognizer returns no segments at all.
	int endFrame = -1;
	for (ps_seg_t* segmentationIter = ps_seg_iter(recognizer.get(), &score); segmentationIter; segmentationIter = ps_seg_next(segmentationIter)) {
		// Get phone
		char const *phone = ps_seg_word(segmentationIter);

		// Get timing
		int startFrame;
		ps_seg_frames(segmentationIter, &startFrame, &endFrame);

		result[centiseconds(startFrame)] = stringToPhone(phone);
	}

	// Add dummy entry past the last phone
	result[centiseconds(endFrame + 1)] = Phone::None;

	return result;
}
|
|
@ -0,0 +1,14 @@
|
||||||
|
#ifndef LIPSYNC_PHONE_EXTRACTION_H
|
||||||
|
#define LIPSYNC_PHONE_EXTRACTION_H
|
||||||
|
|
||||||
|
#include <map>
|
||||||
|
#include <chrono>
|
||||||
|
#include <ratio>
|
||||||
|
#include <memory>
|
||||||
|
#include "audio_input/AudioStream.h"
|
||||||
|
#include "Phone.h"
|
||||||
|
#include "centiseconds.h"
|
||||||
|
|
||||||
|
std::map<centiseconds, Phone> detectPhones(std::unique_ptr<AudioStream> audioStream);
|
||||||
|
|
||||||
|
#endif //LIPSYNC_PHONE_EXTRACTION_H
|
|
@ -0,0 +1,8 @@
|
||||||
|
#ifndef LIPSYNC_PLATFORM_TOOLS_H
|
||||||
|
#define LIPSYNC_PLATFORM_TOOLS_H
|
||||||
|
|
||||||
|
#include <boost/filesystem.hpp>
|
||||||
|
|
||||||
|
boost::filesystem::path getBinDirectory();
|
||||||
|
|
||||||
|
#endif //LIPSYNC_PLATFORM_TOOLS_H
|
|
@ -0,0 +1,24 @@
|
||||||
|
#include "platform_tools.h"
|
||||||
|
|
||||||
|
#include <Windows.h>
|
||||||
|
|
||||||
|
// Returns the directory that contains the running executable.
// Throws std::runtime_error if the executable path cannot be determined.
boost::filesystem::path getBinDirectory() {
	// Start with a buffer of MAX_PATH characters
	std::vector<wchar_t> pathBuffer(MAX_PATH);

	DWORD length;
	for (;;) {
		length = GetModuleFileNameW(0, pathBuffer.data(), pathBuffer.size());

		// A return value equal to the buffer size indicates that the buffer
		// was too small. Any other value ends the search.
		if (length != pathBuffer.size()) break;

		// Double the buffer size and try again
		pathBuffer.resize(pathBuffer.size() * 2);
	}

	// A return value of 0 means that the call failed
	if (length == 0) {
		throw std::runtime_error("Could not determine path of bin directory.");
	}

	const boost::filesystem::path executableFile(pathBuffer.data());
	return executableFile.parent_path();
}
|
Loading…
Reference in New Issue