Refactoring

- Moved phone recognition code to phone_extraction.cpp
- Introduced type centiseconds
- Code reorganization
This commit is contained in:
Daniel Wolf 2015-11-18 20:59:03 +01:00
parent 9fbae36e70
commit f2f6f75932
17 changed files with 326 additions and 139 deletions

View File

@ -126,19 +126,25 @@
<sourceFolder url="file://$MODULE_DIR$/lib/sphinxbase-5prealpha-2015-08-05/src/libsphinxbase/util/slamch.c" isTestSource="false" />
<sourceFolder url="file://$MODULE_DIR$/lib/sphinxbase-5prealpha-2015-08-05/src/libsphinxbase/util/slapack_lite.c" isTestSource="false" />
<sourceFolder url="file://$MODULE_DIR$/lib/sphinxbase-5prealpha-2015-08-05/src/libsphinxbase/util/strfuncs.c" isTestSource="false" />
<sourceFolder url="file://$MODULE_DIR$/src/audio_input/16kHzMonoStream.cpp" isTestSource="false" />
<sourceFolder url="file://$MODULE_DIR$/src/audio_input/16kHzMonoStream.h" isTestSource="false" />
<sourceFolder url="file://$MODULE_DIR$/src/audio_input/AudioStream.h" isTestSource="false" />
<sourceFolder url="file://$MODULE_DIR$/src/audio_input/ChannelDownmixer.cpp" isTestSource="false" />
<sourceFolder url="file://$MODULE_DIR$/src/audio_input/ChannelDownmixer.h" isTestSource="false" />
<sourceFolder url="file://$MODULE_DIR$/src/audio_input/IOTools.h" isTestSource="false" />
<sourceFolder url="file://$MODULE_DIR$/src/audio_input/io_tools.h" isTestSource="false" />
<sourceFolder url="file://$MODULE_DIR$/src/audio_input/SampleRateConverter.cpp" isTestSource="false" />
<sourceFolder url="file://$MODULE_DIR$/src/audio_input/SampleRateConverter.h" isTestSource="false" />
<sourceFolder url="file://$MODULE_DIR$/src/audio_input/wave_file_writing.cpp" isTestSource="false" />
<sourceFolder url="file://$MODULE_DIR$/src/audio_input/wave_file_writing.h" isTestSource="false" />
<sourceFolder url="file://$MODULE_DIR$/src/audio_input/WaveFileReader.cpp" isTestSource="false" />
<sourceFolder url="file://$MODULE_DIR$/src/audio_input/WaveFileReader.h" isTestSource="false" />
<sourceFolder url="file://$MODULE_DIR$/src/audio_input/WaveFileWriter.cpp" isTestSource="false" />
<sourceFolder url="file://$MODULE_DIR$/src/audio_input/WaveFileWriter.h" isTestSource="false" />
<sourceFolder url="file://$MODULE_DIR$/src/centiseconds.cpp" isTestSource="false" />
<sourceFolder url="file://$MODULE_DIR$/src/centiseconds.h" isTestSource="false" />
<sourceFolder url="file://$MODULE_DIR$/src/main.cpp" isTestSource="false" />
<sourceFolder url="file://$MODULE_DIR$/src/Phone.cpp" isTestSource="false" />
<sourceFolder url="file://$MODULE_DIR$/src/Phone.h" isTestSource="false" />
<sourceFolder url="file://$MODULE_DIR$/src/phone_extraction.cpp" isTestSource="false" />
<sourceFolder url="file://$MODULE_DIR$/src/phone_extraction.h" isTestSource="false" />
<sourceFolder url="file://$MODULE_DIR$/src/platform_tools.h" isTestSource="false" />
<sourceFolder url="file://$MODULE_DIR$/src/platform_tools_win.cpp" isTestSource="false" />
</content>
<orderEntry type="inheritedJdk" />
<orderEntry type="sourceFolder" forTests="false" />
@ -169,8 +175,8 @@
<excluded>
<root url="file://$MODULE_DIR$/lib/cppformat/format.cc" />
<root url="file://$MODULE_DIR$/lib/cppformat/posix.cc" />
<root url="file://$MODULE_DIR$/lib/cppformat/posix.h" />
<root url="file://$MODULE_DIR$/lib/cppformat/format.h" />
<root url="file://$MODULE_DIR$/lib/cppformat/posix.h" />
</excluded>
</library>
</orderEntry>

View File

@ -1,7 +1,7 @@
cmake_minimum_required(VERSION 3.3)
project(LipSync)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wall")
set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib")
set(CMAKE_LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib")
@ -11,10 +11,15 @@ set(CMAKE_RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/bin")
set(Boost_USE_STATIC_LIBS ON) # Use static libs
set(Boost_USE_MULTITHREADED ON) # Enable multithreading support
set(Boost_USE_STATIC_RUNTIME ON) # Use static C++ runtime
find_package(Boost REQUIRED COMPONENTS filesystem locale )
find_package(Boost REQUIRED COMPONENTS filesystem locale system)
include_directories(${Boost_INCLUDE_DIRS})
set(SOURCE_FILES src/main.cpp src/audio_input/WaveFileReader.cpp src/audio_input/WaveFileReader.h src/audio_input/ChannelDownmixer.cpp src/audio_input/ChannelDownmixer.h src/audio_input/AudioStream.h src/audio_input/SampleRateConverter.cpp src/audio_input/SampleRateConverter.h src/audio_input/16kHzMonoStream.cpp src/audio_input/16kHzMonoStream.h src/audio_input/WaveFileWriter.cpp src/audio_input/WaveFileWriter.h src/audio_input/IOTools.h)
set(SOURCE_FILES src/main.cpp src/audio_input/WaveFileReader.cpp src/audio_input/WaveFileReader.h src/audio_input/ChannelDownmixer.cpp src/audio_input/ChannelDownmixer.h src/audio_input/AudioStream.h src/audio_input/SampleRateConverter.cpp src/audio_input/SampleRateConverter.h src/audio_input/wave_file_writing.cpp src/audio_input/wave_file_writing.h src/audio_input/io_tools.h src/platform_tools.h src/phone_extraction.cpp src/phone_extraction.h src/Phone.cpp src/Phone.h src/centiseconds.cpp src/centiseconds.h)
# Add the platform-specific implementation of src/platform_tools.h.
# Only a Windows implementation exists; configuring for any other target aborts.
if(NOT WIN32)
    message(FATAL_ERROR "Target platform not supported.")
endif()
list(APPEND SOURCE_FILES src/platform_tools_win.cpp)
include_directories("lib/sphinxbase-5prealpha-2015-08-05/include" "lib/pocketsphinx-5prealpha-2015-08-05/include" "lib/cppformat")
FILE(GLOB_RECURSE SPHINX_BASE "lib/sphinxbase-5prealpha-2015-08-05/src/libsphinxbase/*.c")
@ -44,5 +49,5 @@ endfunction()
# Copy resource files
set(modelDir "${CMAKE_SOURCE_DIR}/lib/pocketsphinx-5prealpha-2015-08-05/model")
copy_after_build("${modelDir}/en-us/en-us-phone.lm.bin" "res/sphinx/acoustic_model")
copy_after_build("${modelDir}/en-us/en-us/*" "res/sphinx/language_model")
copy_after_build("${modelDir}/en-us/en-us-phone.lm.bin" "res/sphinx")
copy_after_build("${modelDir}/en-us/en-us/*" "res/sphinx/acoustic_model")

36
src/Phone.cpp Normal file
View File

@ -0,0 +1,36 @@
#include <boost/bimap.hpp>
#include "Phone.h"
using std::string;
// Builds a boost::bimap from a brace-enclosed list of left/right pairs.
// boost::bimap is constructed here from an iterator range over the list.
template <typename L, typename R>
boost::bimap<L, R> makeBimap(std::initializer_list<typename boost::bimap<L, R>::value_type> list) {
	typedef boost::bimap<L, R> Bimap;
	Bimap result(list.begin(), list.end());
	return result;
}
// Bidirectional lookup table between Arpabet phone names and Phone values.
// Left view: name -> Phone (used by stringToPhone).
// Right view: Phone -> name (used by phoneToString).
// The table is only ever read, so it is const; at namespace scope this also
// gives it internal linkage, keeping it private to this translation unit.
const boost::bimap<string, Phone> phonesByName = makeBimap<string, Phone>({
	{ "None", Phone::None },
	{ "Unknown", Phone::Unknown },
	{ "AO", Phone::AO }, { "AA", Phone::AA }, { "IY", Phone::IY }, { "UW", Phone::UW },
	{ "EH", Phone::EH }, { "IH", Phone::IH }, { "UH", Phone::UH }, { "AH", Phone::AH },
	{ "AE", Phone::AE }, { "EY", Phone::EY }, { "AY", Phone::AY }, { "OW", Phone::OW },
	{ "AW", Phone::AW }, { "OY", Phone::OY }, { "ER", Phone::ER }, { "P", Phone::P },
	{ "B", Phone::B }, { "T", Phone::T }, { "D", Phone::D }, { "K", Phone::K },
	{ "G", Phone::G }, { "CH", Phone::CH }, { "JH", Phone::JH }, { "F", Phone::F },
	{ "V", Phone::V }, { "TH", Phone::TH }, { "DH", Phone::DH }, { "S", Phone::S },
	{ "Z", Phone::Z }, { "SH", Phone::SH }, { "ZH", Phone::ZH }, { "HH", Phone::HH },
	{ "M", Phone::M }, { "N", Phone::N }, { "NG", Phone::NG }, { "L", Phone::L },
	{ "R", Phone::R }, { "Y", Phone::Y }, { "W", Phone::W },
});
// Looks up the Phone whose name equals s.
// Names that are not in the lookup table map to Phone::Unknown.
Phone stringToPhone(const string& s) {
	const auto match = phonesByName.left.find(s);
	if (match == phonesByName.left.end()) {
		return Phone::Unknown;
	}
	return match->second;
}
// Returns the name of the given phone.
// A value that is missing from the lookup table is reported under the name
// registered for Phone::Unknown.
string phoneToString(Phone phone) {
	const auto match = phonesByName.right.find(phone);
	if (match != phonesByName.right.end()) {
		return match->second;
	}
	return phoneToString(Phone::Unknown);
}

78
src/Phone.h Normal file
View File

@ -0,0 +1,78 @@
#ifndef LIPSYNC_PHONE_H
#define LIPSYNC_PHONE_H

// Needed for the std::string used in the function declarations below,
// so that this header can be included on its own.
#include <string>

// Defines a subset of the Arpabet
enum class Phone {
	None,
	Unknown,

	/////////
	// Vowels

	// ... monophthongs
	AO, // [ɔ] as in [o]ff, f[a]ll, fr[o]st
	AA, // [ɑ] as in f[a]ther
	IY, // [i] as in b[ee], sh[e]
	UW, // [u] as in y[ou], n[ew], f[oo]d
	EH, // [ɛ] as in r[e]d, m[e]n
	IH, // [ɪ] as in b[i]g, w[i]n
	UH, // [ʊ] as in sh[ou]ld, c[ou]ld
	AH, // [ʌ, ə] as in b[u]t, s[u]n, [a]lone, disc[u]s
	AE, // [æ] as in [a]t, b[a]t

	// ... diphthongs
	EY, // [eɪ] as in s[ay], [ei]ght
	AY, // [aɪ] as in m[y], wh[y], r[i]de
	OW, // [oʊ] as in sh[ow], c[oa]t
	AW, // [aʊ] as in h[ow], n[ow]
	OY, // [ɔɪ] as in b[oy], t[oy]

	// ... r-colored
	ER, // [ɝ] as in h[er], b[ir]d, h[ur]t

	/////////////
	// Consonants

	// ... stops
	P, // [p] as in [p]ay
	B, // [b] as in [b]uy
	T, // [t] as in [t]ake
	D, // [d] as in [d]ay
	K, // [k] as in [k]ey
	G, // [g] as in [g]o

	// ... affricates
	CH, // [tʃ] as in [ch]air
	JH, // [dʒ] as in [j]ust

	// ... fricatives
	F, // [f] as in [f]or
	V, // [v] as in [v]ery
	TH, // [θ] as in [th]anks
	DH, // [ð] as in [th]at
	S, // [s] as in [s]ay
	Z, // [z] as in [z]oo
	SH, // [ʃ] as in [sh]ow
	ZH, // [ʒ] as in mea[s]ure, plea[s]ure
	HH, // [h] as in [h]ouse

	// ... nasals
	M, // [m] as in [m]an
	N, // [n] as in [n]o
	NG, // [ŋ] as in si[ng]

	// ... liquids
	L, // [ɫ] as in [l]ate
	R, // [r, ɹ] as in [r]un

	// ... semivowels
	Y, // [j] as in [y]es
	W // [w] as in [w]ay
};

// Converts a phone name (e.g. "AO") to the matching Phone value.
// Returns Phone::Unknown for names that are not recognized.
Phone stringToPhone(const std::string& s);

// Converts a Phone value to its name (e.g. Phone::AO -> "AO").
// Values without a registered name yield the name of Phone::Unknown.
std::string phoneToString(Phone phone);

#endif //LIPSYNC_PHONE_H

View File

@ -1,26 +0,0 @@
#include "16kHzMonoStream.h"
#include "WaveFileReader.h"
#include "ChannelDownmixer.h"
#include "SampleRateConverter.h"
using std::runtime_error;
// Opens the given WAVE file and wraps it so that the returned stream is
// single-channel with a frame rate of 16000.
// Throws std::runtime_error if the frame rate is below 16000.
std::unique_ptr<AudioStream> create16kHzMonoStream(std::string fileName) {
	std::unique_ptr<AudioStream> result(new WaveFileReader(fileName));

	// Anything other than exactly one channel gets downmixed
	const bool needsDownmix = result->getChannelCount() != 1;
	if (needsDownmix) {
		std::unique_ptr<AudioStream> downmixed(new ChannelDownmixer(std::move(result)));
		result = std::move(downmixed);
	}

	// Upsampling is not supported, so reject streams that are too slow
	if (result->getFrameRate() < 16000) {
		throw runtime_error("Sample rate must not be below 16kHz.");
	}

	// Anything above the target rate gets converted down
	const bool needsResample = result->getFrameRate() != 16000;
	if (needsResample) {
		std::unique_ptr<AudioStream> resampled(new SampleRateConverter(std::move(result), 16000));
		result = std::move(resampled);
	}

	return result;
}

View File

@ -1,10 +0,0 @@
#ifndef LIPSYNC_WAVEFILEREADER16KHZMONO_H
#define LIPSYNC_WAVEFILEREADER16KHZMONO_H
#include "AudioStream.h"
#include <memory>
#include <string>
// Creates an audio stream for the WAVE file with the given name.
// The stream is downmixed if it does not have exactly one channel and is
// converted to a frame rate of 16000 if its frame rate differs.
// Throws std::runtime_error if the file's frame rate is below 16000.
std::unique_ptr<AudioStream> create16kHzMonoStream(std::string fileName);
#endif //LIPSYNC_WAVEFILEREADER16KHZMONO_H

View File

@ -1,6 +1,6 @@
#include <format.h>
#include "WaveFileReader.h"
#include "IOTools.h"
#include "io_tools.h"
using std::runtime_error;
using fmt::format;

View File

@ -1,6 +1,6 @@
#include <fstream>
#include "WaveFileWriter.h"
#include "IOTools.h"
#include "wave_file_writing.h"
#include "io_tools.h"
using namespace little_endian;

9
src/centiseconds.cpp Normal file
View File

@ -0,0 +1,9 @@
#include <ratio>
#include <chrono>
#include <ostream>
// The header file is named in lower case; spelling it exactly keeps the
// include working on case-sensitive file systems.
#include "centiseconds.h"

// Writes a centiseconds value as its tick count followed by the suffix "cs",
// e.g. "42cs". Returns the stream to allow chaining.
std::ostream& operator <<(std::ostream& stream, const centiseconds cs) {
	return stream << cs.count() << "cs";
}

8
src/centiseconds.h Normal file
View File

@ -0,0 +1,8 @@
#ifndef LIPSYNC_CENTISECONDS_H
#define LIPSYNC_CENTISECONDS_H

// Included here so that this header compiles regardless of what the
// including file has pulled in before it.
#include <chrono>
#include <ostream>
#include <ratio>

// Duration with a tick period of 1/100 second, stored as an int.
typedef std::chrono::duration<int, std::centi> centiseconds;

// Stream output for centiseconds values.
std::ostream& operator <<(std::ostream& stream, const centiseconds cs);

#endif //LIPSYNC_CENTISECONDS_H

View File

@ -1,97 +1,16 @@
#include <pocketsphinx.h>
#include <stdexcept>
#include <fstream>
#include <memory>
#include <vector>
#include <iostream>
#include <chrono>
#include "audio_input/16kHzMonoStream.h"
using std::runtime_error;
using std::shared_ptr;
using std::unique_ptr;
#define MODELDIR "X:/dev/projects/LipSync/lib/pocketsphinx-5prealpha-2015-08-05/model"
// Maps a float sample to a signed 16-bit integer sample.
// Input outside of -1..1 is clamped to that range first; -1 maps to INT16_MIN
// and 1 maps to INT16_MAX.
int16_t floatSampleToInt16(float sample) {
	const float clamped = std::min(std::max(sample, -1.0f), 1.0f);
	// Shift from -1..1 to 0..1, then stretch over the full int16_t range
	const float unit = (clamped + 1) / 2;
	return static_cast<int16_t>(unit * (INT16_MAX - INT16_MIN) + INT16_MIN);
}
#include "audio_input/WaveFileReader.h"
#include "phone_extraction.h"
int main(int argc, char *argv[]) {
shared_ptr<cmd_ln_t> config(
cmd_ln_init(
nullptr, ps_args(), true,
// Set acoustic model
"-hmm", MODELDIR "/en-us/en-us",
// Set phonetic language model
"-allphone", MODELDIR "/en-us/en-us-phone.lm.bin",
"-allphone_ci", "yes",
// The following settings are Voodoo to me.
// I copied them from http://cmusphinx.sourceforge.net/wiki/phonemerecognition
// Set beam width applied to every frame in Viterbi search
"-beam", "1e-20",
// Set beam width applied to phone transitions
"-pbeam", "1e-20",
// Set language model probability weight
"-lw", "2.0",
nullptr),
[](cmd_ln_t* config) { cmd_ln_free_r(config); });
if (!config) throw runtime_error("Error creating configuration.");
// Create audio stream
std::unique_ptr<AudioStream> audioStream(
new WaveFileReader(R"(C:\Users\Daniel\Desktop\audio-test\test 16000Hz 1ch 16bit.wav)"));
shared_ptr<ps_decoder_t> recognizer(
ps_init(config.get()),
[](ps_decoder_t* recognizer) { ps_free(recognizer); });
if (!recognizer) throw runtime_error("Error creating speech recognizer.");
std::map<centiseconds, Phone> phones = detectPhones(std::move(audioStream));
unique_ptr<AudioStream> audioStream =
create16kHzMonoStream(R"(C:\Users\Daniel\Desktop\audio-test\test 16000Hz 1ch 16bit.wav)");
int error = ps_start_utt(recognizer.get());
if (error) throw runtime_error("Error starting utterance processing.");
auto start = std::chrono::steady_clock::now();
std::vector<int16_t> buffer;
const int capacity = 1600; // 0.1 second capacity
buffer.reserve(capacity);
int sampleCount = 0;
do {
// Read to buffer
buffer.clear();
while (buffer.size() < capacity) {
float sample;
if (!audioStream->getNextSample(sample)) break;
buffer.push_back(floatSampleToInt16(sample));
}
// Analyze buffer
int searchedFrameCount = ps_process_raw(recognizer.get(), buffer.data(), buffer.size(), false, false);
if (searchedFrameCount < 0) throw runtime_error("Error decoding raw audio data.");
sampleCount += buffer.size();
std::cout << sampleCount / 16000.0 << "s\n";
} while (buffer.size());
error = ps_end_utt(recognizer.get());
if (error) throw runtime_error("Error ending utterance processing.");
auto end = std::chrono::steady_clock::now();
std::cout << std::chrono::duration_cast<std::chrono::duration<double>>(end - start).count() << "\n";
ps_seg_t *segmentationIter;
int32 score;
for (segmentationIter = ps_seg_iter(recognizer.get(), &score); segmentationIter; segmentationIter = ps_seg_next(segmentationIter)) {
// Get phoneme
char const *phoneme = ps_seg_word(segmentationIter);
// Get timing
int startFrame, endFrame;
ps_seg_frames(segmentationIter, &startFrame, &endFrame);
printf(">>> %-5s %-5d %-5d\n", phoneme, startFrame, endFrame);
for (auto& pair : phones) {
std::cout << pair.first << ": " << phoneToString(pair.second) << "\n";
}
return 0;

116
src/phone_extraction.cpp Normal file
View File

@ -0,0 +1,116 @@
#include <pocketsphinx.h>
#include <iostream>
#include <boost/filesystem.hpp>
#include "phone_extraction.h"
#include "audio_input/SampleRateConverter.h"
#include "audio_input/ChannelDownmixer.h"
#include "platform_tools.h"
using std::runtime_error;
using std::unique_ptr;
using std::shared_ptr;
using std::string;
using std::map;
using boost::filesystem::path;
// Wraps the given stream so that the result is single-channel with a frame
// rate of 16000. Streams that already match are returned unchanged.
// Throws runtime_error if the frame rate is below 16000.
unique_ptr<AudioStream> to16kHzMono(unique_ptr<AudioStream> stream) {
	// Downmix unless the stream already has exactly one channel
	const bool isMono = stream->getChannelCount() == 1;
	if (!isMono) {
		unique_ptr<AudioStream> downmixed(new ChannelDownmixer(std::move(stream)));
		stream = std::move(downmixed);
	}

	// Only downsampling is supported
	if (stream->getFrameRate() < 16000) {
		throw runtime_error("Sample rate must not be below 16kHz.");
	}

	// Convert unless the stream already has the target rate
	const bool hasTargetRate = stream->getFrameRate() == 16000;
	if (!hasTargetRate) {
		unique_ptr<AudioStream> resampled(new SampleRateConverter(std::move(stream), 16000));
		stream = std::move(resampled);
	}

	return stream;
}
// Converts a float sample to a signed 16-bit integer sample.
// Values below -1 are treated as -1, values above 1 as 1. The range -1..1 is
// mapped linearly onto INT16_MIN..INT16_MAX.
int16_t floatSampleToInt16(float sample) {
	if (sample < -1.0f) {
		sample = -1.0f;
	}
	if (1.0f < sample) {
		sample = 1.0f;
	}
	const float normalized = (sample + 1) / 2;
	return static_cast<int16_t>(normalized * (INT16_MAX - INT16_MIN) + INT16_MIN);
}
// Runs PocketSphinx phone recognition over the entire audio stream.
//
// The stream is first converted to 16 kHz mono. The PocketSphinx models are
// loaded from "<parent of bin directory>/res/sphinx".
//
// Returns a map from the start of each recognized segment to its phone.
// Segment frame indices are used directly as centisecond values.
// NOTE(review): presumably this relies on PocketSphinx's default frame rate of
// 100 frames per second - confirm.
// The map always ends with a Phone::None entry one frame past the last
// segment; if nothing was recognized, it consists of a single Phone::None
// entry at 0.
//
// Throws runtime_error if the stream cannot be converted or if any
// PocketSphinx call reports an error.
map<centiseconds, Phone> detectPhones(unique_ptr<AudioStream> audioStream) {
	// Convert audio stream to the exact format PocketSphinx requires
	audioStream = to16kHzMono(std::move(audioStream));

	// Create PocketSphinx configuration
	path binDirectory(getBinDirectory());
	path resDirectory(binDirectory.parent_path() / "res");
	// The temporary strings below live until the end of the full expression,
	// so the c_str() pointers stay valid for the duration of cmd_ln_init.
	shared_ptr<cmd_ln_t> config(
		cmd_ln_init(
			nullptr, ps_args(), true,
			// Set acoustic model
			"-hmm", (resDirectory / "sphinx/acoustic_model").string().c_str(),
			// Set phonetic language model
			"-allphone", (resDirectory / "sphinx/en-us-phone.lm.bin").string().c_str(),
			"-allphone_ci", "yes",
			// The following settings are taken from http://cmusphinx.sourceforge.net/wiki/phonemerecognition
			// Set beam width applied to every frame in Viterbi search
			"-beam", "1e-20",
			// Set beam width applied to phone transitions
			"-pbeam", "1e-20",
			// Set language model probability weight
			"-lw", "2.0",
			nullptr),
		[](cmd_ln_t* config) { cmd_ln_free_r(config); });
	if (!config) throw runtime_error("Error creating configuration.");

	// Create phone recognizer
	shared_ptr<ps_decoder_t> recognizer(
		ps_init(config.get()),
		[](ps_decoder_t* recognizer) { ps_free(recognizer); });
	if (!recognizer) throw runtime_error("Error creating speech recognizer.");

	// Start recognition
	int error = ps_start_utt(recognizer.get());
	if (error) throw runtime_error("Error starting utterance processing.");

	// Process entire sound file in chunks.
	// The capacity is unsigned to match buffer.size() in the comparison below.
	std::vector<int16_t> buffer;
	const size_t capacity = 1600; // 0.1 second capacity
	buffer.reserve(capacity);
	do {
		// Read to buffer
		buffer.clear();
		while (buffer.size() < capacity) {
			float sample;
			if (!audioStream->getNextSample(sample)) break;
			buffer.push_back(floatSampleToInt16(sample));
		}

		// Analyze buffer
		int searchedFrameCount = ps_process_raw(recognizer.get(), buffer.data(), buffer.size(), false, false);
		if (searchedFrameCount < 0) throw runtime_error("Error decoding raw audio data.");
	} while (!buffer.empty());

	error = ps_end_utt(recognizer.get());
	if (error) throw runtime_error("Error ending utterance processing.");

	// Collect results into map
	map<centiseconds, Phone> result;
	int32 score;
	// Starts at -1 so that an empty segmentation yields a well-defined
	// terminating entry at 0 instead of reading an uninitialized value.
	int endFrame = -1;
	for (ps_seg_t* segmentationIter = ps_seg_iter(recognizer.get(), &score);
		segmentationIter;
		segmentationIter = ps_seg_next(segmentationIter))
	{
		// Get phone
		char const *phone = ps_seg_word(segmentationIter);

		// Get timing
		int startFrame;
		ps_seg_frames(segmentationIter, &startFrame, &endFrame);

		result[centiseconds(startFrame)] = stringToPhone(phone);
	}

	// Add dummy entry past the last phone
	result[centiseconds(endFrame + 1)] = Phone::None;

	return result;
}

14
src/phone_extraction.h Normal file
View File

@ -0,0 +1,14 @@
#ifndef LIPSYNC_PHONE_EXTRACTION_H
#define LIPSYNC_PHONE_EXTRACTION_H
#include <map>
#include <chrono>
#include <ratio>
#include <memory>
#include "audio_input/AudioStream.h"
#include "Phone.h"
#include "centiseconds.h"
// Recognizes the phones in the given audio stream using PocketSphinx.
// Takes ownership of the stream; it is converted to 16 kHz mono internally.
// Returns a map from the start of each recognized segment to its phone,
// followed by a Phone::None entry one frame past the last segment.
// Throws std::runtime_error if the stream's frame rate is below 16 kHz or if
// recognition fails.
std::map<centiseconds, Phone> detectPhones(std::unique_ptr<AudioStream> audioStream);
#endif //LIPSYNC_PHONE_EXTRACTION_H

8
src/platform_tools.h Normal file
View File

@ -0,0 +1,8 @@
#ifndef LIPSYNC_PLATFORM_TOOLS_H
#define LIPSYNC_PLATFORM_TOOLS_H
#include <boost/filesystem.hpp>
// Returns the directory that contains the running executable.
// Implemented per platform (see platform_tools_win.cpp); the Windows
// implementation throws std::runtime_error if the path cannot be determined.
boost::filesystem::path getBinDirectory();
#endif //LIPSYNC_PLATFORM_TOOLS_H

View File

@ -0,0 +1,24 @@
#include "platform_tools.h"
#include <Windows.h>
// Windows implementation: queries the path of the running executable and
// returns its parent directory.
// Throws std::runtime_error if the path cannot be determined.
boost::filesystem::path getBinDirectory() {
	// Start with room for MAX_PATH characters
	std::vector<wchar_t> pathBuffer(MAX_PATH);

	DWORD length;
	for (;;) {
		length = GetModuleFileNameW(0, pathBuffer.data(), pathBuffer.size());

		// A return value equal to the buffer size signals that the buffer was
		// too small. Any other value (including the error value 0) ends the loop.
		if (length != pathBuffer.size()) break;

		// Double the buffer and retry
		pathBuffer.resize(pathBuffer.size() * 2);
	}

	// If the function returned 0, something went wrong
	if (length == 0) {
		throw std::runtime_error("Could not determine path of bin directory.");
	}

	const boost::filesystem::path executablePath(pathBuffer.data());
	return executablePath.parent_path();
}