Refactoring

- Moved phone recognition code to phone_extraction.cpp - Introduced type centiseconds - Code reorganization
2015-11-18 20:59:03 +01:00 · 2015-11-18 20:59:03 +01:00 · f2f6f75932
parent 9fbae36e70
commit f2f6f75932
17 changed files with 326 additions and 139 deletions
--- a/.idea/LipSync.iml
+++ b/.idea/LipSync.iml
@ -126,19 +126,25 @@
      <sourceFolder url="file://$MODULE_DIR$/lib/sphinxbase-5prealpha-2015-08-05/src/libsphinxbase/util/slamch.c" isTestSource="false" />
      <sourceFolder url="file://$MODULE_DIR$/lib/sphinxbase-5prealpha-2015-08-05/src/libsphinxbase/util/slapack_lite.c" isTestSource="false" />
      <sourceFolder url="file://$MODULE_DIR$/lib/sphinxbase-5prealpha-2015-08-05/src/libsphinxbase/util/strfuncs.c" isTestSource="false" />
-      <sourceFolder url="file://$MODULE_DIR$/src/audio_input/16kHzMonoStream.cpp" isTestSource="false" />
-      <sourceFolder url="file://$MODULE_DIR$/src/audio_input/16kHzMonoStream.h" isTestSource="false" />
      <sourceFolder url="file://$MODULE_DIR$/src/audio_input/AudioStream.h" isTestSource="false" />
      <sourceFolder url="file://$MODULE_DIR$/src/audio_input/ChannelDownmixer.cpp" isTestSource="false" />
      <sourceFolder url="file://$MODULE_DIR$/src/audio_input/ChannelDownmixer.h" isTestSource="false" />
-      <sourceFolder url="file://$MODULE_DIR$/src/audio_input/IOTools.h" isTestSource="false" />
+      <sourceFolder url="file://$MODULE_DIR$/src/audio_input/io_tools.h" isTestSource="false" />
      <sourceFolder url="file://$MODULE_DIR$/src/audio_input/SampleRateConverter.cpp" isTestSource="false" />
      <sourceFolder url="file://$MODULE_DIR$/src/audio_input/SampleRateConverter.h" isTestSource="false" />
+      <sourceFolder url="file://$MODULE_DIR$/src/audio_input/wave_file_writing.cpp" isTestSource="false" />
+      <sourceFolder url="file://$MODULE_DIR$/src/audio_input/wave_file_writing.h" isTestSource="false" />
      <sourceFolder url="file://$MODULE_DIR$/src/audio_input/WaveFileReader.cpp" isTestSource="false" />
      <sourceFolder url="file://$MODULE_DIR$/src/audio_input/WaveFileReader.h" isTestSource="false" />
-      <sourceFolder url="file://$MODULE_DIR$/src/audio_input/WaveFileWriter.cpp" isTestSource="false" />
-      <sourceFolder url="file://$MODULE_DIR$/src/audio_input/WaveFileWriter.h" isTestSource="false" />
+      <sourceFolder url="file://$MODULE_DIR$/src/centiseconds.cpp" isTestSource="false" />
+      <sourceFolder url="file://$MODULE_DIR$/src/centiseconds.h" isTestSource="false" />
      <sourceFolder url="file://$MODULE_DIR$/src/main.cpp" isTestSource="false" />
+      <sourceFolder url="file://$MODULE_DIR$/src/Phone.cpp" isTestSource="false" />
+      <sourceFolder url="file://$MODULE_DIR$/src/Phone.h" isTestSource="false" />
+      <sourceFolder url="file://$MODULE_DIR$/src/phone_extraction.cpp" isTestSource="false" />
+      <sourceFolder url="file://$MODULE_DIR$/src/phone_extraction.h" isTestSource="false" />
+      <sourceFolder url="file://$MODULE_DIR$/src/platform_tools.h" isTestSource="false" />
+      <sourceFolder url="file://$MODULE_DIR$/src/platform_tools_win.cpp" isTestSource="false" />
    </content>
    <orderEntry type="inheritedJdk" />
    <orderEntry type="sourceFolder" forTests="false" />
@ -169,8 +175,8 @@
        <excluded>
          <root url="file://$MODULE_DIR$/lib/cppformat/format.cc" />
          <root url="file://$MODULE_DIR$/lib/cppformat/posix.cc" />
-          <root url="file://$MODULE_DIR$/lib/cppformat/posix.h" />
          <root url="file://$MODULE_DIR$/lib/cppformat/format.h" />
+          <root url="file://$MODULE_DIR$/lib/cppformat/posix.h" />
        </excluded>
      </library>
    </orderEntry>
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -1,7 +1,7 @@
 cmake_minimum_required(VERSION 3.3)
 project(LipSync)

-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wall")

 set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib")
 set(CMAKE_LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib")
@ -11,10 +11,15 @@ set(CMAKE_RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/bin")
 set(Boost_USE_STATIC_LIBS ON) # Use static libs
 set(Boost_USE_MULTITHREADED ON) # Enable multithreading support
 set(Boost_USE_STATIC_RUNTIME ON) # Use static C++ runtime
-find_package(Boost REQUIRED COMPONENTS filesystem locale )
+find_package(Boost REQUIRED COMPONENTS filesystem locale system)
 include_directories(${Boost_INCLUDE_DIRS})

-set(SOURCE_FILES src/main.cpp src/audio_input/WaveFileReader.cpp src/audio_input/WaveFileReader.h src/audio_input/ChannelDownmixer.cpp src/audio_input/ChannelDownmixer.h src/audio_input/AudioStream.h src/audio_input/SampleRateConverter.cpp src/audio_input/SampleRateConverter.h src/audio_input/16kHzMonoStream.cpp src/audio_input/16kHzMonoStream.h src/audio_input/WaveFileWriter.cpp src/audio_input/WaveFileWriter.h src/audio_input/IOTools.h)
+set(SOURCE_FILES src/main.cpp src/audio_input/WaveFileReader.cpp src/audio_input/WaveFileReader.h src/audio_input/ChannelDownmixer.cpp src/audio_input/ChannelDownmixer.h src/audio_input/AudioStream.h src/audio_input/SampleRateConverter.cpp src/audio_input/SampleRateConverter.h src/audio_input/wave_file_writing.cpp src/audio_input/wave_file_writing.h src/audio_input/io_tools.h src/platform_tools.h src/phone_extraction.cpp src/phone_extraction.h src/Phone.cpp src/Phone.h src/centiseconds.cpp src/centiseconds.h)
+if(WIN32)
+	set(SOURCE_FILES "${SOURCE_FILES};src/platform_tools_win.cpp")
+else()
+	message(FATAL_ERROR "Target platform not supported.")
+endif()

 include_directories("lib/sphinxbase-5prealpha-2015-08-05/include" "lib/pocketsphinx-5prealpha-2015-08-05/include" "lib/cppformat")
 FILE(GLOB_RECURSE SPHINX_BASE "lib/sphinxbase-5prealpha-2015-08-05/src/libsphinxbase/*.c")
@ -44,5 +49,5 @@ endfunction()

 # Copy resource files
 set(modelDir "${CMAKE_SOURCE_DIR}/lib/pocketsphinx-5prealpha-2015-08-05/model")
-copy_after_build("${modelDir}/en-us/en-us-phone.lm.bin" "res/sphinx/acoustic_model")
-copy_after_build("${modelDir}/en-us/en-us/*" "res/sphinx/language_model")
+copy_after_build("${modelDir}/en-us/en-us-phone.lm.bin" "res/sphinx")
+copy_after_build("${modelDir}/en-us/en-us/*" "res/sphinx/acoustic_model")
--- a/src/Phone.cpp
+++ b/src/Phone.cpp
@ -0,0 +1,36 @@
+#include <boost/bimap.hpp>
+#include "Phone.h"
+
+using std::string;
+
+// Builds a boost::bimap from a brace-enclosed list of (left, right) relations.
+template <typename L, typename R>
+boost::bimap<L, R>
+makeBimap(std::initializer_list<typename boost::bimap<L, R>::value_type> list) {
+	typedef boost::bimap<L, R> Bimap;
+	const auto first = list.begin();
+	const auto last = list.end();
+	return Bimap(first, last);
+}
+
+// Two-way lookup table between phone names and Phone values
+boost::bimap<string, Phone> phonesByName = makeBimap<string, Phone>({
+	// Special values
+	{ "None", Phone::None }, { "Unknown", Phone::Unknown },
+
+	// Vowels
+	{ "AO", Phone::AO }, { "AA", Phone::AA }, { "IY", Phone::IY }, { "UW", Phone::UW }, { "EH", Phone::EH },
+	{ "IH", Phone::IH }, { "UH", Phone::UH }, { "AH", Phone::AH }, { "AE", Phone::AE }, { "EY", Phone::EY },
+	{ "AY", Phone::AY }, { "OW", Phone::OW }, { "AW", Phone::AW }, { "OY", Phone::OY }, { "ER", Phone::ER },
+
+	// Consonants
+	{ "P", Phone::P }, { "B", Phone::B }, { "T", Phone::T }, { "D", Phone::D }, { "K", Phone::K }, { "G", Phone::G },
+	{ "CH", Phone::CH }, { "JH", Phone::JH },
+	{ "F", Phone::F }, { "V", Phone::V }, { "TH", Phone::TH }, { "DH", Phone::DH }, { "S", Phone::S },
+	{ "Z", Phone::Z }, { "SH", Phone::SH }, { "ZH", Phone::ZH }, { "HH", Phone::HH },
+	{ "M", Phone::M }, { "N", Phone::N }, { "NG", Phone::NG },
+	{ "L", Phone::L }, { "R", Phone::R },
+	{ "Y", Phone::Y }, { "W", Phone::W },
+});
+
+// Looks up the phone with the given name.
+// Names that are not in the table yield Phone::Unknown.
+Phone stringToPhone(const string& s) {
+	const auto match = phonesByName.left.find(s);
+	if (match == phonesByName.left.end()) {
+		return Phone::Unknown;
+	}
+	return match->second;
+}
+
+// Returns the name of the given phone.
+// Values that are not in the table yield "Unknown".
+string phoneToString(Phone phone) {
+	auto it = phonesByName.right.find(phone);
+	if (it != phonesByName.right.end()) {
+		return it->second;
+	}
+
+	// Return the fallback name directly instead of looking up Phone::Unknown
+	// recursively: the recursion would never terminate if that entry were
+	// ever missing from the table.
+	return "Unknown";
+}
+
--- a/src/Phone.h
+++ b/src/Phone.h
@ -0,0 +1,78 @@
+#ifndef LIPSYNC_PHONE_H
+#define LIPSYNC_PHONE_H
+
+// Defines a subset of the Arpabet.
+// Each phone is annotated with its IPA symbol(s) and example words;
+// square brackets mark the letters that produce the sound.
+enum class Phone {
+	None,		// No phone
+	Unknown,	// Phone whose name was not recognized
+
+	/////////
+	// Vowels
+
+	// ... monophthongs
+	AO,		// [ɔ] as in [o]ff, f[a]ll, fr[o]st
+	AA,		// [ɑ] as in f[a]ther
+	IY,		// [i] as in b[ee], sh[e]
+	UW,		// [u] as in y[ou], n[ew], f[oo]d
+	EH,		// [ɛ] as in r[e]d, m[e]n
+	IH,		// [ɪ] as in b[i]g, w[i]n
+	UH,		// [ʊ] as in sh[ou]ld, c[ou]ld
+	AH,		// [ʌ, ə] as in b[u]t, s[u]n, [a]lone, disc[u]s
+	AE,		// [æ] as in [a]t, b[a]t
+
+	// ... diphthongs
+	EY,		// [eɪ] as in s[ay], [ei]ght
+	AY,		// [aɪ] as in m[y], wh[y], r[i]de
+	OW,		// [oʊ] as in sh[ow], c[oa]t
+	AW,		// [aʊ] as in h[ow], n[ow]
+	OY,		// [ɔɪ] as in b[oy], t[oy]
+
+	// ... r-colored
+	ER,		// [ɝ] as in h[er], b[ir]d, h[ur]t
+
+	/////////////
+	// Consonants
+
+	// ... stops
+	P,		// [p] as in [p]ay
+	B,		// [b] as in [b]uy
+	T,		// [t] as in [t]ake
+	D,		// [d] as in [d]ay
+	K,		// [k] as in [k]ey
+	G,		// [g] as in [g]o
+
+	// ... affricates
+	CH,		// [tʃ] as in [ch]air
+	JH,		// [dʒ] as in [j]ust
+
+	// ... fricatives
+	F,		// [f] as in [f]or
+	V,		// [v] as in [v]ery
+	TH,		// [θ] as in [th]anks
+	DH,		// [ð] as in [th]at
+	S,		// [s] as in [s]ay
+	Z,		// [z] as in [z]oo
+	SH,		// [ʃ] as in [sh]ow
+	ZH,		// [ʒ] as in mea[s]ure, plea[s]ure
+	HH,		// [h] as in [h]ouse
+
+	// ... nasals
+	M,		// [m] as in [m]an
+	N,		// [n] as in [n]o
+	NG,		// [ŋ] as in si[ng]
+
+	// ... liquids
+	L,		// [ɫ] as in [l]ate
+	R,		// [r, ɹ] as in [r]un
+
+	// ... semivowels
+	Y,		// [j] as in [y]es
+	W		// [w] as in [w]ay
+};
+
+// Converts a phone name (e.g. "AO") to the corresponding Phone value.
+// Returns Phone::Unknown if the name is not a known phone name.
+Phone stringToPhone(const std::string& s);
+
+// Converts a Phone value to its name (e.g. "AO").
+std::string phoneToString(Phone phone);
+
+
+#endif //LIPSYNC_PHONE_H
--- a/src/audio_input/16kHzMonoStream.cpp
+++ b/src/audio_input/16kHzMonoStream.cpp
@ -1,26 +0,0 @@
-#include "16kHzMonoStream.h"
-#include "WaveFileReader.h"
-#include "ChannelDownmixer.h"
-#include "SampleRateConverter.h"
-
-using std::runtime_error;
-
-std::unique_ptr<AudioStream> create16kHzMonoStream(std::string fileName) {
-	// Create audio stream
-	std::unique_ptr<AudioStream> stream(new WaveFileReader(fileName));
-
-	// Downmix, if required
-	if (stream->getChannelCount() != 1) {
-		stream.reset(new ChannelDownmixer(std::move(stream)));
-	}
-
-	// Downsample, if required
-	if (stream->getFrameRate() < 16000) {
-		throw runtime_error("Sample rate must not be below 16kHz.");
-	}
-	if (stream->getFrameRate() != 16000) {
-		stream.reset(new SampleRateConverter(std::move(stream), 16000));
-	}
-
-	return stream;
-}
--- a/src/audio_input/16kHzMonoStream.h
+++ b/src/audio_input/16kHzMonoStream.h
@ -1,10 +0,0 @@
-#ifndef LIPSYNC_WAVEFILEREADER16KHZMONO_H
-#define LIPSYNC_WAVEFILEREADER16KHZMONO_H
-
-#include "AudioStream.h"
-#include <memory>
-#include <string>
-
-std::unique_ptr<AudioStream> create16kHzMonoStream(std::string fileName);
-
-#endif //LIPSYNC_WAVEFILEREADER16KHZMONO_H
--- a/src/audio_input/WaveFileReader.cpp
+++ b/src/audio_input/WaveFileReader.cpp
@ -1,6 +1,6 @@
 #include <format.h>
 #include "WaveFileReader.h"
-#include "IOTools.h"
+#include "io_tools.h"

 using std::runtime_error;
 using fmt::format;
--- a/src/audio_input/io_tools.h
+++ b/src/audio_input/io_tools.h
--- a/src/audio_input/wave_file_writing.cpp
+++ b/src/audio_input/wave_file_writing.cpp
@ -1,6 +1,6 @@
 #include <fstream>
-#include "WaveFileWriter.h"
-#include "IOTools.h"
+#include "wave_file_writing.h"
+#include "io_tools.h"

 using namespace little_endian;

--- a/src/audio_input/wave_file_writing.h
+++ b/src/audio_input/wave_file_writing.h
--- a/src/centiseconds.cpp
+++ b/src/centiseconds.cpp
@ -0,0 +1,9 @@
+#include <ratio>
+#include <chrono>
+#include <ostream>
+#include "Centiseconds.h"
+
+// Writes a duration as its tick count immediately followed by the unit "cs"
+std::ostream& operator <<(std::ostream& stream, const centiseconds cs) {
+	stream << cs.count();
+	stream << "cs";
+	return stream;
+}
+
--- a/src/centiseconds.h
+++ b/src/centiseconds.h
@ -0,0 +1,8 @@
+#ifndef LIPSYNC_CENTISECONDS_H
+#define LIPSYNC_CENTISECONDS_H
+
+// Duration type with integer ticks of one hundredth of a second
+using centiseconds = std::chrono::duration<int, std::centi>;
+
+// Writes the tick count of cs followed by "cs" to the stream and returns the stream.
+std::ostream& operator <<(std::ostream& stream, const centiseconds cs);
+
+#endif //LIPSYNC_CENTISECONDS_H
--- a/src/main.cpp
+++ b/src/main.cpp
@ -1,97 +1,16 @@
-#include <pocketsphinx.h>
-#include <stdexcept>
-#include <fstream>
-#include <memory>
-#include <vector>
 #include <iostream>
-#include <chrono>
-#include "audio_input/16kHzMonoStream.h"
-
-using std::runtime_error;
-using std::shared_ptr;
-using std::unique_ptr;
-
-#define MODELDIR "X:/dev/projects/LipSync/lib/pocketsphinx-5prealpha-2015-08-05/model"
-
-// Converts a float in the range -1..1 to a signed 16-bit int
-int16_t floatSampleToInt16(float sample) {
-	sample = std::max(sample, -1.0f);
-	sample = std::min(sample, 1.0f);
-	return static_cast<int16_t>(((sample + 1) / 2) * (INT16_MAX - INT16_MIN) + INT16_MIN);
-}
+#include "audio_input/WaveFileReader.h"
+#include "phone_extraction.h"

 int main(int argc, char *argv[]) {
-	shared_ptr<cmd_ln_t> config(
-		cmd_ln_init(
-			nullptr, ps_args(), true,
-			// Set acoustic model
-			"-hmm", MODELDIR "/en-us/en-us",
-			// Set phonetic language model
-			"-allphone", MODELDIR "/en-us/en-us-phone.lm.bin",
-			"-allphone_ci", "yes",
-			// The following settings are Voodoo to me.
-			// I copied them from http://cmusphinx.sourceforge.net/wiki/phonemerecognition
-			// Set beam width applied to every frame in Viterbi search
-			"-beam", "1e-20",
-			// Set beam width applied to phone transitions
-			"-pbeam", "1e-20",
-			// Set language model probability weight
-			"-lw", "2.0",
-			nullptr),
-		[](cmd_ln_t* config) { cmd_ln_free_r(config); });
-	if (!config) throw runtime_error("Error creating configuration.");
+	// Create audio stream
+	std::unique_ptr<AudioStream> audioStream(
+		new WaveFileReader(R"(C:\Users\Daniel\Desktop\audio-test\test 16000Hz 1ch 16bit.wav)"));

-	shared_ptr<ps_decoder_t> recognizer(
-		ps_init(config.get()),
-		[](ps_decoder_t* recognizer) { ps_free(recognizer); });
-	if (!recognizer) throw runtime_error("Error creating speech recognizer.");
+	std::map<centiseconds, Phone> phones = detectPhones(std::move(audioStream));

-	unique_ptr<AudioStream> audioStream =
-		create16kHzMonoStream(R"(C:\Users\Daniel\Desktop\audio-test\test 16000Hz 1ch 16bit.wav)");
-
-	int error = ps_start_utt(recognizer.get());
-	if (error) throw runtime_error("Error starting utterance processing.");
-
-	auto start = std::chrono::steady_clock::now();
-
-	std::vector<int16_t> buffer;
-	const int capacity = 1600; // 0.1 second capacity
-	buffer.reserve(capacity);
-	int sampleCount = 0;
-	do {
-		// Read to buffer
-		buffer.clear();
-		while (buffer.size() < capacity) {
-			float sample;
-			if (!audioStream->getNextSample(sample)) break;
-			buffer.push_back(floatSampleToInt16(sample));
-		}
-
-		// Analyze buffer
-		int searchedFrameCount = ps_process_raw(recognizer.get(), buffer.data(), buffer.size(), false, false);
-		if (searchedFrameCount < 0) throw runtime_error("Error decoding raw audio data.");
-
-		sampleCount += buffer.size();
-
-		std::cout << sampleCount / 16000.0 << "s\n";
-	} while (buffer.size());
-	error = ps_end_utt(recognizer.get());
-	if (error) throw runtime_error("Error ending utterance processing.");
-
-	auto end = std::chrono::steady_clock::now();
-	std::cout << std::chrono::duration_cast<std::chrono::duration<double>>(end - start).count() << "\n";
-
-	ps_seg_t *segmentationIter;
-	int32 score;
-	for (segmentationIter = ps_seg_iter(recognizer.get(), &score); segmentationIter; segmentationIter = ps_seg_next(segmentationIter)) {
-		// Get phoneme
-		char const *phoneme = ps_seg_word(segmentationIter);
-
-		// Get timing
-		int startFrame, endFrame;
-		ps_seg_frames(segmentationIter, &startFrame, &endFrame);
-
-		printf(">>> %-5s %-5d %-5d\n", phoneme, startFrame, endFrame);
+	for (auto& pair : phones) {
+		std::cout << pair.first << ": " << phoneToString(pair.second) << "\n";
 	}

 	return 0;
--- a/src/phone_extraction.cpp
+++ b/src/phone_extraction.cpp
@ -0,0 +1,116 @@
+#include <pocketsphinx.h>
+#include <iostream>
+#include <boost/filesystem.hpp>
+#include "phone_extraction.h"
+#include "audio_input/SampleRateConverter.h"
+#include "audio_input/ChannelDownmixer.h"
+#include "platform_tools.h"
+using std::runtime_error;
+using std::unique_ptr;
+using std::shared_ptr;
+using std::string;
+using std::map;
+using boost::filesystem::path;
+
+// Wraps the given stream as needed so that the result has one channel and a
+// frame rate of 16 kHz. Throws runtime_error if the frame rate is below 16 kHz.
+unique_ptr<AudioStream> to16kHzMono(unique_ptr<AudioStream> stream) {
+	const int targetFrameRate = 16000;
+
+	// Reduce multi-channel audio to a single channel
+	const bool isMono = stream->getChannelCount() == 1;
+	if (!isMono) {
+		stream = unique_ptr<AudioStream>(new ChannelDownmixer(std::move(stream)));
+	}
+
+	// Only downsampling is supported
+	if (stream->getFrameRate() < targetFrameRate) {
+		throw runtime_error("Sample rate must not be below 16kHz.");
+	}
+	const bool needsResampling = stream->getFrameRate() != targetFrameRate;
+	if (needsResampling) {
+		stream = unique_ptr<AudioStream>(new SampleRateConverter(std::move(stream), targetFrameRate));
+	}
+
+	return stream;
+}
+
+// Converts a float sample in the range -1..1 to a signed 16-bit int.
+// Values outside that range are clamped. NaN is converted to 0, because
+// casting NaN to an integer type is undefined behavior.
+int16_t floatSampleToInt16(float sample) {
+	// Only NaN compares unequal to itself
+	if (sample != sample) return 0;
+
+	// Clamp to -1..1
+	if (sample < -1.0f) sample = -1.0f;
+	if (sample > 1.0f) sample = 1.0f;
+
+	// Map -1..1 linearly onto INT16_MIN..INT16_MAX
+	return static_cast<int16_t>(((sample + 1) / 2) * (INT16_MAX - INT16_MIN) + INT16_MIN);
+}
+
+// Detects the phones in the given audio stream using PocketSphinx.
+//
+// The stream is converted to 16 kHz mono before it is analyzed.
+//
+// Returns a map from the start position of each detected phone to that phone,
+// followed by a dummy Phone::None entry one frame past the end of the last
+// phone. If no phones were detected, the map consists of a single Phone::None
+// entry at position 0.
+//
+// Throws runtime_error if the frame rate of the stream is below 16 kHz or if
+// PocketSphinx reports an error.
+map<centiseconds, Phone> detectPhones(unique_ptr<AudioStream> audioStream) {
+	// Convert audio stream to the exact format PocketSphinx requires
+	audioStream = to16kHzMono(std::move(audioStream));
+
+	// Create PocketSphinx configuration.
+	// The model files are expected in the "res" directory next to the bin directory.
+	path binDirectory(getBinDirectory());
+	path resDirectory(binDirectory.parent_path() / "res");
+	shared_ptr<cmd_ln_t> config(
+		cmd_ln_init(
+			nullptr, ps_args(), true,
+			// Set acoustic model
+			"-hmm", (resDirectory / "sphinx/acoustic_model").string().c_str(),
+			// Set phonetic language model
+			"-allphone", (resDirectory / "sphinx/en-us-phone.lm.bin").string().c_str(),
+			"-allphone_ci", "yes",
+			// The following settings are taken from http://cmusphinx.sourceforge.net/wiki/phonemerecognition
+			// Set beam width applied to every frame in Viterbi search
+			"-beam", "1e-20",
+			// Set beam width applied to phone transitions
+			"-pbeam", "1e-20",
+			// Set language model probability weight
+			"-lw", "2.0",
+			nullptr),
+		[](cmd_ln_t* config) { cmd_ln_free_r(config); });
+	if (!config) throw runtime_error("Error creating configuration.");
+
+	// Create phone recognizer
+	shared_ptr<ps_decoder_t> recognizer(
+		ps_init(config.get()),
+		[](ps_decoder_t* recognizer) { ps_free(recognizer); });
+	if (!recognizer) throw runtime_error("Error creating speech recognizer.");
+
+	// Start recognition
+	int error = ps_start_utt(recognizer.get());
+	if (error) throw runtime_error("Error starting utterance processing.");
+
+	// Process entire sound file in chunks
+	std::vector<int16_t> buffer;
+	const size_t capacity = 1600; // 0.1 second capacity at 16 kHz
+	buffer.reserve(capacity);
+	do {
+		// Read to buffer
+		buffer.clear();
+		while (buffer.size() < capacity) {
+			float sample;
+			if (!audioStream->getNextSample(sample)) break;
+			buffer.push_back(floatSampleToInt16(sample));
+		}
+
+		// Analyze buffer
+		int searchedFrameCount = ps_process_raw(recognizer.get(), buffer.data(), buffer.size(), false, false);
+		if (searchedFrameCount < 0) throw runtime_error("Error decoding raw audio data.");
+	} while (!buffer.empty()); // An empty buffer means the stream is exhausted
+	error = ps_end_utt(recognizer.get());
+	if (error) throw runtime_error("Error ending utterance processing.");
+
+	// Collect results into map
+	map<centiseconds, Phone> result;
+	int32 score;
+	// Stays at -1 if there are no segments, so that the dummy entry below ends
+	// up at position 0 instead of being computed from an uninitialized value
+	int endFrame = -1;
+	for (ps_seg_t* segmentationIter = ps_seg_iter(recognizer.get(), &score); segmentationIter; segmentationIter = ps_seg_next(segmentationIter)) {
+		// Get phone
+		char const *phone = ps_seg_word(segmentationIter);
+
+		// Get timing
+		int startFrame;
+		ps_seg_frames(segmentationIter, &startFrame, &endFrame);
+
+		// NOTE(review): frame indices are used as centiseconds directly; this
+		// presumably relies on PocketSphinx using 10 ms frames -- confirm
+		result[centiseconds(startFrame)] = stringToPhone(phone);
+	}
+	// Add dummy entry past the last phone
+	result[centiseconds(endFrame + 1)] = Phone::None;
+	return result;
+}
--- a/src/phone_extraction.h
+++ b/src/phone_extraction.h
@ -0,0 +1,14 @@
+#ifndef LIPSYNC_PHONE_EXTRACTION_H
+#define LIPSYNC_PHONE_EXTRACTION_H
+
+#include <map>
+#include <chrono>
+#include <ratio>
+#include <memory>
+#include "audio_input/AudioStream.h"
+#include "Phone.h"
+#include "centiseconds.h"
+
+// Detects the phones in the given audio stream.
+// Returns a map from the start position of each phone to that phone, followed
+// by a dummy Phone::None entry past the last phone.
+// Throws std::runtime_error if the audio cannot be processed.
+std::map<centiseconds, Phone> detectPhones(std::unique_ptr<AudioStream> audioStream);
+
+#endif //LIPSYNC_PHONE_EXTRACTION_H
--- a/src/platform_tools.h
+++ b/src/platform_tools.h
@ -0,0 +1,8 @@
+#ifndef LIPSYNC_PLATFORM_TOOLS_H
+#define LIPSYNC_PLATFORM_TOOLS_H
+
+#include <boost/filesystem.hpp>
+
+// Returns the directory that contains the running executable.
+// The implementation is platform-specific (see platform_tools_win.cpp).
+boost::filesystem::path getBinDirectory();
+
+#endif //LIPSYNC_PLATFORM_TOOLS_H
--- a/src/platform_tools_win.cpp
+++ b/src/platform_tools_win.cpp
@ -0,0 +1,24 @@
+#include "platform_tools.h"
+
+#include <Windows.h>
+
+// Returns the directory that contains the running executable.
+// Throws std::runtime_error if the executable path cannot be determined.
+boost::filesystem::path getBinDirectory() {
+	// Start with a buffer of MAX_PATH characters
+	std::vector<wchar_t> buffer(MAX_PATH);
+
+	DWORD length;
+	for (;;) {
+		length = GetModuleFileNameW(0, buffer.data(), buffer.size());
+
+		// As long as the function returns the buffer size, it is indicating
+		// that the buffer was too small. Double the buffer size and try again.
+		if (length != buffer.size()) break;
+		buffer.resize(buffer.size() * 2);
+	}
+
+	// A return value of 0 indicates failure
+	if (length == 0) {
+		throw std::runtime_error("Could not determine path of bin directory.");
+	}
+
+	const boost::filesystem::path executableFile(buffer.data());
+	return executableFile.parent_path();
+}