Simplified code using Timeline<T>
commit 04c828506d
parent 83291aa96c
@@ -6,12 +6,12 @@
 template<typename TValue>
 class Timed : public TimeRange {
 public:
-    Timed(time_type start, time_type end, TValue value) :
+    Timed(time_type start, time_type end, const TValue& value) :
         TimeRange(start, end),
         value(value)
     {}
 
-    Timed(TimeRange timeRange, TValue value) :
+    Timed(const TimeRange& timeRange, const TValue& value) :
         TimeRange(timeRange),
         value(value)
     {}

@@ -1,5 +1,9 @@
 #include "AudioStream.h"
 
+TimeRange AudioStream::getTruncatedRange() {
+    return TimeRange(centiseconds::zero(), centiseconds(100 * getSampleCount() / getSampleRate()));
+}
+
 bool AudioStream::endOfStream() {
     return getSampleIndex() >= getSampleCount();
 }

@@ -1,6 +1,7 @@
 #pragma once
 
 #include <memory>
+#include "TimeRange.h"
 
 // A mono stream of floating-point samples.
 class AudioStream {
@@ -9,6 +10,7 @@ public:
     virtual std::unique_ptr<AudioStream> clone(bool reset) = 0;
     virtual int getSampleRate() = 0;
    virtual int getSampleCount() = 0;
+    TimeRange getTruncatedRange();
     virtual int getSampleIndex() = 0;
     virtual void seek(int sampleIndex) = 0;
     bool endOfStream();

@@ -17,7 +17,7 @@ float getRMS(AudioStream& audioStream, int maxSampleCount = numeric_limits<int>:
     return sampleCount > 0 ? static_cast<float>(std::sqrt(sum / sampleCount)) : 0.0f;
 }
 
-vector<TimeRange> detectVoiceActivity(std::unique_ptr<AudioStream> audioStream) {
+Timeline<bool> detectVoiceActivity(std::unique_ptr<AudioStream> audioStream) {
     // Make sure audio stream has no DC offset
     audioStream = removeDCOffset(std::move(audioStream));
 
@@ -26,30 +26,30 @@ vector<TimeRange> detectVoiceActivity(std::unique_ptr<AudioStream> audioStream)
     constexpr int sampleRate = 2 * maxFrequency;
     audioStream = convertSampleRate(std::move(audioStream), sampleRate);
 
-    float rms = getRMS(*audioStream->clone(true));
-    float cutoff = rms / 50;
-    centiseconds maxGap(10);
-
-    vector<TimeRange> result;
-    optional<centiseconds> segmentStart, segmentEnd;
-    for (centiseconds time = centiseconds(0); !audioStream->endOfStream(); ++time) {
-        float currentPower = getRMS(*audioStream, sampleRate / 100);
-        bool active = currentPower > cutoff;
+    // Detect activity
+    const float rms = getRMS(*audioStream->clone(true));
+    const float cutoff = rms / 50;
+    Timeline<bool> activity(audioStream->getTruncatedRange());
+    for (centiseconds time = centiseconds::zero(); !audioStream->endOfStream(); ++time) {
+        float currentRMS = getRMS(*audioStream, sampleRate / 100);
+        bool active = currentRMS > cutoff;
         if (active) {
-            if (!segmentStart) {
-                segmentStart = time;
-            }
-            segmentEnd = time + centiseconds(1);
-        } else if (segmentEnd && time > segmentEnd.value() + maxGap) {
-            result.push_back(TimeRange(segmentStart.value(), segmentEnd.value()));
-            logTimedEvent("utterance", segmentStart.value(), segmentEnd.value(), "");
-            segmentStart.reset();
-            segmentEnd.reset();
+            activity[time] = true;
         }
     }
-    if (segmentEnd) {
-        result.push_back(TimeRange(segmentStart.value(), segmentEnd.value()));
+
+    // Fill small gaps in activity
+    const centiseconds maxGap(10);
+    for (const auto& element : Timeline<bool>(activity)) {
+        if (!element.getValue() && element.getLength() <= maxGap) {
+            activity.set(static_cast<TimeRange>(element), true);
+        }
     }
 
-    return result;
+    // Log
+    for (const auto& element : activity) {
+        logTimedEvent("utterance", static_cast<TimeRange>(element), std::string());
+    }
+
+    return activity;
 }

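The Timeline<T> class itself is not part of this commit, so the following is a minimal, self-contained sketch of the idea the new detectVoiceActivity relies on: a run-length map from time to value, a set operation that overwrites a range, and iteration over the resulting runs (used above both to fill short inactive gaps and to log utterances). Everything in the sketch is invented for illustration; ToyTimeline, forEachRun, and the integer ticks standing in for centiseconds do not match the project's actual API.

// Sketch only: a toy run-length timeline, not the project's Timeline<T>.
#include <iostream>
#include <iterator>
#include <map>

template <typename TValue>
class ToyTimeline {
public:
    // The whole range [start, end) starts out as a single default-valued run.
    ToyTimeline(int start, int end, TValue defaultValue) : start_(start), end_(end) {
        runs_[start] = defaultValue;
    }

    // Overwrite [rangeStart, rangeEnd) with value, keeping later runs intact.
    void set(int rangeStart, int rangeEnd, TValue value) {
        if (rangeStart >= rangeEnd) return;
        TValue resumeValue = valueAt(rangeEnd);  // value that continues after the range
        runs_.erase(runs_.lower_bound(rangeStart), runs_.lower_bound(rangeEnd));
        runs_[rangeStart] = value;
        if (rangeEnd < end_) runs_[rangeEnd] = resumeValue;
    }

    // Value at a given time; assumes time >= start of the timeline.
    TValue valueAt(int time) const {
        return std::prev(runs_.upper_bound(time))->second;
    }

    // Call f(runStart, runEnd, value) for each run, in order.
    template <typename F>
    void forEachRun(F f) const {
        for (auto it = runs_.begin(); it != runs_.end(); ++it) {
            auto next = std::next(it);
            int runEnd = next == runs_.end() ? end_ : next->first;
            f(it->first, runEnd, it->second);
        }
    }

private:
    int start_, end_;
    std::map<int, TValue> runs_;  // run start -> value (adjacent equal runs are not merged)
};

int main() {
    // Voice-activity-style usage: mark active ticks, then fill short inactive gaps.
    ToyTimeline<bool> activity(0, 20, false);
    activity.set(2, 5, true);
    activity.set(7, 12, true);  // the gap 5..7 is only 2 ticks long

    const int maxGap = 3;
    ToyTimeline<bool> snapshot = activity;  // iterate a copy while mutating the original
    snapshot.forEachRun([&](int start, int end, bool value) {
        if (!value && end - start <= maxGap) activity.set(start, end, true);
    });

    activity.forEachRun([](int start, int end, bool value) {
        std::cout << start << "-" << end << ": " << std::boolalpha << value << "\n";
    });
}

Note that the gap-filling pass in the diff iterates over a copy (Timeline<bool>(activity)) while mutating the original; the sketch mirrors that so the runs being visited are never invalidated mid-loop.
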
@@ -1,7 +1,6 @@
 #pragma once
-#include <vector>
-#include <TimeRange.h>
 #include <memory>
 #include "AudioStream.h"
+#include <Timeline.h>
 
-std::vector<TimeRange> detectVoiceActivity(std::unique_ptr<AudioStream> audioStream);
+Timeline<bool> detectVoiceActivity(std::unique_ptr<AudioStream> audioStream);

@@ -7,8 +7,7 @@
 #include <boost/log/utility/setup/common_attributes.hpp>
 // ReSharper disable once CppUnusedIncludeDirective
 #include <boost/log/support/date_time.hpp>
-#include <centiseconds.h>
-#include "tools.h"
+#include <Timed.h>
 
 using std::string;
 using std::lock_guard;
@@ -122,7 +121,3 @@ void addFileSink(const boost::filesystem::path& logFilePath, LogLevel minLogLeve
     sink->set_filter(severity >= minLogLevel);
     boost::log::core::get()->add_sink(sink);
 }
-
-void logTimedEvent(const string& eventName, centiseconds start, centiseconds end, const string& value) {
-    LOG_DEBUG << "##" << eventName << "[" << formatDuration(start) << "-" << formatDuration(end) << "]: " << value;
-}

@@ -10,7 +10,9 @@
 #include <tuple>
 #include "centiseconds.h"
 #include <boost/filesystem.hpp>
+#include "tools.h"
 #include "enumTools.h"
+#include "Timed.h"
 
 enum class LogLevel {
     Trace,
@@ -66,4 +68,19 @@ boost::shared_ptr<PausableBackendAdapter> addPausableStderrSink(LogLevel minLogL
 
 void addFileSink(const boost::filesystem::path& logFilePath, LogLevel minLogLevel);
 
-void logTimedEvent(const std::string& eventName, centiseconds start, centiseconds end, const std::string& value);
+template<typename TValue>
+void logTimedEvent(const std::string& eventName, const Timed<TValue> timedValue) {
+    LOG_DEBUG
+        << "##" << eventName << "[" << formatDuration(timedValue.getStart()) << "-" << formatDuration(timedValue.getEnd()) << "]: "
+        << timedValue.getValue();
+}
+
+template<typename TValue>
+void logTimedEvent(const std::string& eventName, const TimeRange& timeRange, const TValue& value) {
+    logTimedEvent(eventName, Timed<TValue>(timeRange, value));
+}
+
+template<typename TValue>
+void logTimedEvent(const std::string& eventName, centiseconds start, centiseconds end, const TValue& value) {
+    logTimedEvent(eventName, Timed<TValue>(start, end, value));
+}

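The templated logTimedEvent overloads added above all funnel into one formatting statement. Below is a stand-alone sketch of that pattern with stand-ins for the project's pieces: std::cout instead of Boost.Log's LOG_DEBUG, a std::chrono alias for centiseconds, and a simplified formatDuration.

// Stand-alone sketch of the templated timed-event logging pattern; not the project's code.
#include <chrono>
#include <iomanip>
#include <iostream>
#include <sstream>
#include <string>

using centiseconds = std::chrono::duration<long long, std::centi>;  // stand-in for the project's type

// Stand-in for the project's formatDuration(): seconds with two decimals.
std::string formatDuration(centiseconds d) {
    std::ostringstream stream;
    stream << std::fixed << std::setprecision(2) << d.count() / 100.0;
    return stream.str();
}

// One template handles any printable value type.
template<typename TValue>
void logTimedEvent(const std::string& eventName, centiseconds start, centiseconds end, const TValue& value) {
    std::cout << "##" << eventName << "[" << formatDuration(start) << "-"
              << formatDuration(end) << "]: " << value << "\n";
}

int main() {
    logTimedEvent("utterance", centiseconds(150), centiseconds(420), std::string("hello"));
    logTimedEvent("shape", centiseconds(10), centiseconds(35), 'B');
    // Prints e.g.: ##utterance[1.50-4.20]: hello
}

With the value type templated, a string-only overload is no longer needed, which matches the removal of the non-template declaration and definition in the hunks above.
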
src/main.cpp (29 changed lines)

@@ -13,6 +13,7 @@
 #include "logging.h"
 #include <gsl_util.h>
 #include <tools.h>
+#include <Timeline.h>
 
 using std::exception;
 using std::string;
@@ -39,34 +40,30 @@ string getMessage(const exception& e) {
 
 unique_ptr<AudioStream> createAudioStream(path filePath) {
     try {
-        return unique_ptr<AudioStream>(new WaveFileReader(filePath));
+        return std::make_unique<WaveFileReader>(filePath);
     } catch (...) {
         std::throw_with_nested(std::runtime_error("Could not open sound file.") );
     }
 }
 
-ptree createXmlTree(const path& filePath, const map<centiseconds, Phone>& phones, const map<centiseconds, Shape>& shapes) {
+ptree createXmlTree(const path& filePath, const Timeline<Phone>& phones, const Timeline<Shape>& shapes) {
     ptree tree;
 
     // Add sound file path
     tree.add("rhubarbResult.info.soundFile", filePath.string());
 
     // Add phones
-    for (auto it = phones.cbegin(), itNext = ++phones.cbegin(); itNext != phones.cend(); ++it, ++itNext) {
-        auto pair = *it;
-        auto nextPair = *itNext;
-        ptree& phoneElement = tree.add("rhubarbResult.phones.phone", pair.second);
-        phoneElement.add("<xmlattr>.start", formatDuration(pair.first));
-        phoneElement.add("<xmlattr>.duration", formatDuration(nextPair.first - pair.first));
+    for (auto& timedPhone : phones) {
+        ptree& phoneElement = tree.add("rhubarbResult.phones.phone", timedPhone.getValue());
+        phoneElement.add("<xmlattr>.start", formatDuration(timedPhone.getStart()));
+        phoneElement.add("<xmlattr>.duration", formatDuration(timedPhone.getLength()));
     }
 
     // Add mouth cues
-    for (auto it = shapes.cbegin(), itNext = ++shapes.cbegin(); itNext != shapes.cend(); ++it, ++itNext) {
-        auto pair = *it;
-        auto nextPair = *itNext;
-        ptree& mouthCueElement = tree.add("rhubarbResult.mouthCues.mouthCue", pair.second);
-        mouthCueElement.add("<xmlattr>.start", formatDuration(pair.first));
-        mouthCueElement.add("<xmlattr>.duration", formatDuration(nextPair.first - pair.first));
+    for (auto& timedShape : shapes) {
+        ptree& mouthCueElement = tree.add("rhubarbResult.mouthCues.mouthCue", timedShape.getValue());
+        mouthCueElement.add("<xmlattr>.start", formatDuration(timedShape.getStart()));
+        mouthCueElement.add("<xmlattr>.duration", formatDuration(timedShape.getLength()));
     }
 
     return tree;
@@ -115,7 +112,7 @@ int main(int argc, char *argv[]) {
         const int columnWidth = 30;
         std::cerr << std::left;
         std::cerr << std::setw(columnWidth) << "Analyzing input file";
-        map<centiseconds, Phone> phones;
+        Timeline<Phone> phones{};
         {
             ProgressBar progressBar;
             phones = detectPhones(
@@ -127,7 +124,7 @@ int main(int argc, char *argv[]) {
 
         // Generate mouth shapes
         std::cerr << std::setw(columnWidth) << "Generating mouth shapes";
-        map<centiseconds, Shape> shapes = animate(phones);
+        Timeline<Shape> shapes = animate(phones);
         std::cerr << "Done" << std::endl;
 
         std::cerr << std::endl;

@@ -67,20 +67,12 @@ Shape getShape(Phone phone) {
     }
 }
 
-map<centiseconds, Shape> animate(const map<centiseconds, Phone> &phones) {
-    map<centiseconds, Shape> shapes;
-    Shape lastShape = Shape::Invalid;
-    for (auto it = phones.cbegin(); it != phones.cend(); ++it) {
-        Shape shape = getShape(it->second);
-        if (shape != lastShape || next(it) == phones.cend()) {
-            shapes[it->first] = shape;
-            lastShape = shape;
-        }
-    }
-
-    for (auto it = shapes.cbegin(); it != shapes.cend(); ++it) {
-        if (next(it) == shapes.cend()) break;
-        logTimedEvent("shape", it->first, next(it)->first, enumToString(it->second));
+Timeline<Shape> animate(const Timeline<Phone> &phones) {
+    Timeline<Shape> shapes(phones.getRange());
+    for (auto& timedPhone : phones) {
+        Timed<Shape> timedShape(static_cast<TimeRange>(timedPhone), getShape(timedPhone.getValue()));
+        shapes.set(timedShape);
+        logTimedEvent("shape", timedShape);
     }
 
     return shapes;

@@ -1,8 +1,7 @@
 #pragma once
 
-#include <map>
 #include "Phone.h"
-#include "centiseconds.h"
 #include "Shape.h"
+#include "Timeline.h"
 
-std::map<centiseconds, Shape> animate(const std::map<centiseconds, Phone>& phones);
+Timeline<Shape> animate(const Timeline<Phone>& phones);

@@ -11,6 +11,7 @@
 #include <gsl_util.h>
 #include <logging.h>
 #include <audio/DCOffset.h>
+#include <Timeline.h>
 
 extern "C" {
 #include <pocketsphinx.h>
@@ -213,7 +214,7 @@ vector<s3wid_t> getWordIds(const vector<string>& words, dict_t& dictionary) {
     return result;
 }
 
-map<centiseconds, Phone> getPhoneAlignment(const vector<s3wid_t>& wordIds, unique_ptr<AudioStream> audioStream, ps_decoder_t& recognizer, ProgressSink& progressSink) {
+Timeline<Phone> getPhoneAlignment(const vector<s3wid_t>& wordIds, unique_ptr<AudioStream> audioStream, ps_decoder_t& recognizer, ProgressSink& progressSink) {
     // Create alignment list
     lambda_unique_ptr<ps_alignment_t> alignment(
         ps_alignment_init(recognizer.d2p),
@@ -264,30 +265,25 @@ map<centiseconds, Phone> getPhoneAlignment(const vector<s3wid_t>& wordIds, uniqu
 
     // Extract phones with timestamps
     char** phoneNames = recognizer.dict->mdef->ciname;
-    map<centiseconds, Phone> result;
-    result[centiseconds(0)] = Phone::None;
+    Timeline<Phone> result(audioStream->getTruncatedRange());
     for (ps_alignment_iter_t* it = ps_alignment_phones(alignment.get()); it; it = ps_alignment_iter_next(it)) {
         // Get phone
         ps_alignment_entry_t* phoneEntry = ps_alignment_iter_get(it);
         s3cipid_t phoneId = phoneEntry->id.pid.cipid;
         char* phoneName = phoneNames[phoneId];
 
-        // Get timing
-        int startFrame = phoneEntry->start;
-        int duration = phoneEntry->duration;
+        // Add entry
+        centiseconds start(phoneEntry->start);
+        centiseconds duration(phoneEntry->duration);
+        Timed<Phone> timedPhone(start, start + duration, parseEnum<Phone>(phoneName));
+        result.set(timedPhone);
 
-        // Add map entries
-        centiseconds start(startFrame);
-        result[start] = parseEnum<Phone>(phoneName);
-        centiseconds end(startFrame + duration);
-        result[end] = Phone::None;
-
-        logTimedEvent("phone", start, end, phoneName);
+        logTimedEvent("phone", timedPhone);
     }
     return result;
 }
 
-map<centiseconds, Phone> detectPhones(
+Timeline<Phone> detectPhones(
     unique_ptr<AudioStream> audioStream,
     boost::optional<std::string> dialog,
     ProgressSink& progressSink)
@@ -322,7 +318,7 @@ map<centiseconds, Phone> detectPhones(
     vector<s3wid_t> wordIds = getWordIds(words, *recognizer->dict);
 
     // Align the word's phones with speech
-    map<centiseconds, Phone> result = getPhoneAlignment(wordIds, std::move(audioStream), *recognizer.get(), alignmentProgressSink);
+    Timeline<Phone> result = getPhoneAlignment(wordIds, std::move(audioStream), *recognizer.get(), alignmentProgressSink);
     return result;
 }
 catch (...) {

@@ -1,14 +1,13 @@
 #pragma once
 
-#include <map>
 #include <memory>
 #include "audio/AudioStream.h"
 #include "Phone.h"
-#include "centiseconds.h"
 #include "progressBar.h"
 #include <boost/optional/optional.hpp>
+#include "Timeline.h"
 
-std::map<centiseconds, Phone> detectPhones(
+Timeline<Phone> detectPhones(
     std::unique_ptr<AudioStream> audioStream,
     boost::optional<std::string> dialog,
     ProgressSink& progressSink);