Simplified code using Timeline<T>
This commit is contained in:
parent
83291aa96c
commit
04c828506d
|
@ -6,12 +6,12 @@
|
||||||
template<typename TValue>
|
template<typename TValue>
|
||||||
class Timed : public TimeRange {
|
class Timed : public TimeRange {
|
||||||
public:
|
public:
|
||||||
Timed(time_type start, time_type end, TValue value) :
|
Timed(time_type start, time_type end, const TValue& value) :
|
||||||
TimeRange(start, end),
|
TimeRange(start, end),
|
||||||
value(value)
|
value(value)
|
||||||
{}
|
{}
|
||||||
|
|
||||||
Timed(TimeRange timeRange, TValue value) :
|
Timed(const TimeRange& timeRange, const TValue& value) :
|
||||||
TimeRange(timeRange),
|
TimeRange(timeRange),
|
||||||
value(value)
|
value(value)
|
||||||
{}
|
{}
|
||||||
|
|
|
@ -1,5 +1,9 @@
|
||||||
#include "AudioStream.h"
|
#include "AudioStream.h"
|
||||||
|
|
||||||
|
// Returns the time range spanned by this stream: it starts at zero and ends at the
// stream's length expressed in whole centiseconds (the fractional part is truncated
// by the integer division).
TimeRange AudioStream::getTruncatedRange() {
	// Widen before multiplying. getSampleCount() returns int, so the original
	// `100 * getSampleCount()` overflowed as soon as the stream held more than
	// INT_MAX / 100 samples (roughly eight minutes at 44.1 kHz).
	const long long sampleCount = getSampleCount();
	const long long lengthInCentiseconds = 100 * sampleCount / getSampleRate();

	// NOTE(review): the cast keeps the constructor argument an int, exactly as the
	// original expression supplied it — confirm the tick type in centiseconds.h.
	return TimeRange(centiseconds::zero(), centiseconds(static_cast<int>(lengthInCentiseconds)));
}
|
||||||
|
|
||||||
bool AudioStream::endOfStream() {
|
bool AudioStream::endOfStream() {
|
||||||
return getSampleIndex() >= getSampleCount();
|
return getSampleIndex() >= getSampleCount();
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,6 +1,7 @@
|
||||||
#pragma once
|
#pragma once
|
||||||
|
|
||||||
#include <memory>
|
#include <memory>
|
||||||
|
#include "TimeRange.h"
|
||||||
|
|
||||||
// A mono stream of floating-point samples.
|
// A mono stream of floating-point samples.
|
||||||
class AudioStream {
|
class AudioStream {
|
||||||
|
@ -9,6 +10,7 @@ public:
|
||||||
virtual std::unique_ptr<AudioStream> clone(bool reset) = 0;
|
virtual std::unique_ptr<AudioStream> clone(bool reset) = 0;
|
||||||
virtual int getSampleRate() = 0;
|
virtual int getSampleRate() = 0;
|
||||||
virtual int getSampleCount() = 0;
|
virtual int getSampleCount() = 0;
|
||||||
|
TimeRange getTruncatedRange();
|
||||||
virtual int getSampleIndex() = 0;
|
virtual int getSampleIndex() = 0;
|
||||||
virtual void seek(int sampleIndex) = 0;
|
virtual void seek(int sampleIndex) = 0;
|
||||||
bool endOfStream();
|
bool endOfStream();
|
||||||
|
|
|
@ -17,7 +17,7 @@ float getRMS(AudioStream& audioStream, int maxSampleCount = numeric_limits<int>:
|
||||||
return sampleCount > 0 ? static_cast<float>(std::sqrt(sum / sampleCount)) : 0.0f;
|
return sampleCount > 0 ? static_cast<float>(std::sqrt(sum / sampleCount)) : 0.0f;
|
||||||
}
|
}
|
||||||
|
|
||||||
vector<TimeRange> detectVoiceActivity(std::unique_ptr<AudioStream> audioStream) {
|
Timeline<bool> detectVoiceActivity(std::unique_ptr<AudioStream> audioStream) {
|
||||||
// Make sure audio stream has no DC offset
|
// Make sure audio stream has no DC offset
|
||||||
audioStream = removeDCOffset(std::move(audioStream));
|
audioStream = removeDCOffset(std::move(audioStream));
|
||||||
|
|
||||||
|
@ -26,30 +26,30 @@ vector<TimeRange> detectVoiceActivity(std::unique_ptr<AudioStream> audioStream)
|
||||||
constexpr int sampleRate = 2 * maxFrequency;
|
constexpr int sampleRate = 2 * maxFrequency;
|
||||||
audioStream = convertSampleRate(std::move(audioStream), sampleRate);
|
audioStream = convertSampleRate(std::move(audioStream), sampleRate);
|
||||||
|
|
||||||
float rms = getRMS(*audioStream->clone(true));
|
// Detect activity
|
||||||
float cutoff = rms / 50;
|
const float rms = getRMS(*audioStream->clone(true));
|
||||||
centiseconds maxGap(10);
|
const float cutoff = rms / 50;
|
||||||
|
Timeline<bool> activity(audioStream->getTruncatedRange());
|
||||||
vector<TimeRange> result;
|
for (centiseconds time = centiseconds::zero(); !audioStream->endOfStream(); ++time) {
|
||||||
optional<centiseconds> segmentStart, segmentEnd;
|
float currentRMS = getRMS(*audioStream, sampleRate / 100);
|
||||||
for (centiseconds time = centiseconds(0); !audioStream->endOfStream(); ++time) {
|
bool active = currentRMS > cutoff;
|
||||||
float currentPower = getRMS(*audioStream, sampleRate / 100);
|
|
||||||
bool active = currentPower > cutoff;
|
|
||||||
if (active) {
|
if (active) {
|
||||||
if (!segmentStart) {
|
activity[time] = true;
|
||||||
segmentStart = time;
|
|
||||||
}
|
|
||||||
segmentEnd = time + centiseconds(1);
|
|
||||||
} else if (segmentEnd && time > segmentEnd.value() + maxGap) {
|
|
||||||
result.push_back(TimeRange(segmentStart.value(), segmentEnd.value()));
|
|
||||||
logTimedEvent("utterance", segmentStart.value(), segmentEnd.value(), "");
|
|
||||||
segmentStart.reset();
|
|
||||||
segmentEnd.reset();
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (segmentEnd) {
|
|
||||||
result.push_back(TimeRange(segmentStart.value(), segmentEnd.value()));
|
// Fill small gaps in activity
|
||||||
|
const centiseconds maxGap(10);
|
||||||
|
for (const auto& element : Timeline<bool>(activity)) {
|
||||||
|
if (!element.getValue() && element.getLength() <= maxGap) {
|
||||||
|
activity.set(static_cast<TimeRange>(element), true);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return result;
|
// Log
|
||||||
|
for (const auto& element : activity) {
|
||||||
|
logTimedEvent("utterance", static_cast<TimeRange>(element), std::string());
|
||||||
|
}
|
||||||
|
|
||||||
|
return activity;
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,7 +1,6 @@
|
||||||
#pragma once
|
#pragma once
|
||||||
#include <vector>
|
|
||||||
#include <TimeRange.h>
|
|
||||||
#include <memory>
|
#include <memory>
|
||||||
#include "AudioStream.h"
|
#include "AudioStream.h"
|
||||||
|
#include <Timeline.h>
|
||||||
|
|
||||||
std::vector<TimeRange> detectVoiceActivity(std::unique_ptr<AudioStream> audioStream);
|
Timeline<bool> detectVoiceActivity(std::unique_ptr<AudioStream> audioStream);
|
||||||
|
|
|
@ -7,8 +7,7 @@
|
||||||
#include <boost/log/utility/setup/common_attributes.hpp>
|
#include <boost/log/utility/setup/common_attributes.hpp>
|
||||||
// ReSharper disable once CppUnusedIncludeDirective
|
// ReSharper disable once CppUnusedIncludeDirective
|
||||||
#include <boost/log/support/date_time.hpp>
|
#include <boost/log/support/date_time.hpp>
|
||||||
#include <centiseconds.h>
|
#include <Timed.h>
|
||||||
#include "tools.h"
|
|
||||||
|
|
||||||
using std::string;
|
using std::string;
|
||||||
using std::lock_guard;
|
using std::lock_guard;
|
||||||
|
@ -122,7 +121,3 @@ void addFileSink(const boost::filesystem::path& logFilePath, LogLevel minLogLeve
|
||||||
sink->set_filter(severity >= minLogLevel);
|
sink->set_filter(severity >= minLogLevel);
|
||||||
boost::log::core::get()->add_sink(sink);
|
boost::log::core::get()->add_sink(sink);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Emits, through LOG_DEBUG, one line of the form `##<eventName>[<start>-<end>]: <value>`.
// Both time points are rendered with formatDuration before being written.
// (This is the pre-commit, non-template version shown on the removed side of the diff.)
void logTimedEvent(const string& eventName, centiseconds start, centiseconds end, const string& value) {
|
|
||||||
LOG_DEBUG << "##" << eventName << "[" << formatDuration(start) << "-" << formatDuration(end) << "]: " << value;
|
|
||||||
}
|
|
||||||
|
|
|
@ -10,7 +10,9 @@
|
||||||
#include <tuple>
|
#include <tuple>
|
||||||
#include "centiseconds.h"
|
#include "centiseconds.h"
|
||||||
#include <boost/filesystem.hpp>
|
#include <boost/filesystem.hpp>
|
||||||
|
#include "tools.h"
|
||||||
#include "enumTools.h"
|
#include "enumTools.h"
|
||||||
|
#include "Timed.h"
|
||||||
|
|
||||||
enum class LogLevel {
|
enum class LogLevel {
|
||||||
Trace,
|
Trace,
|
||||||
|
@ -66,4 +68,19 @@ boost::shared_ptr<PausableBackendAdapter> addPausableStderrSink(LogLevel minLogL
|
||||||
|
|
||||||
void addFileSink(const boost::filesystem::path& logFilePath, LogLevel minLogLevel);
|
void addFileSink(const boost::filesystem::path& logFilePath, LogLevel minLogLevel);
|
||||||
|
|
||||||
void logTimedEvent(const std::string& eventName, centiseconds start, centiseconds end, const std::string& value);
|
template<typename TValue>
|
||||||
|
void logTimedEvent(const std::string& eventName, const Timed<TValue> timedValue) {
|
||||||
|
LOG_DEBUG
|
||||||
|
<< "##" << eventName << "[" << formatDuration(timedValue.getStart()) << "-" << formatDuration(timedValue.getEnd()) << "]: "
|
||||||
|
<< timedValue.getValue();
|
||||||
|
}
|
||||||
|
|
||||||
|
template<typename TValue>
|
||||||
|
void logTimedEvent(const std::string& eventName, const TimeRange& timeRange, const TValue& value) {
|
||||||
|
logTimedEvent(eventName, Timed<TValue>(timeRange, value));
|
||||||
|
}
|
||||||
|
|
||||||
|
template<typename TValue>
|
||||||
|
void logTimedEvent(const std::string& eventName, centiseconds start, centiseconds end, const TValue& value) {
|
||||||
|
logTimedEvent(eventName, Timed<TValue>(start, end, value));
|
||||||
|
}
|
||||||
|
|
29
src/main.cpp
29
src/main.cpp
|
@ -13,6 +13,7 @@
|
||||||
#include "logging.h"
|
#include "logging.h"
|
||||||
#include <gsl_util.h>
|
#include <gsl_util.h>
|
||||||
#include <tools.h>
|
#include <tools.h>
|
||||||
|
#include <Timeline.h>
|
||||||
|
|
||||||
using std::exception;
|
using std::exception;
|
||||||
using std::string;
|
using std::string;
|
||||||
|
@ -39,34 +40,30 @@ string getMessage(const exception& e) {
|
||||||
|
|
||||||
// Opens the sound file at filePath with a WaveFileReader and returns it as an
// owning pointer to AudioStream. Anything thrown while constructing the reader is
// caught and rethrown nested inside std::runtime_error("Could not open sound file."),
// so the original cause stays reachable through the nested exception.
//
// This span is a side-by-side diff: each statement appears twice, first as the
// pre-commit line and then as the post-commit line. The only difference between the
// two sides is the return statement (explicit `new` versus std::make_unique).
unique_ptr<AudioStream> createAudioStream(path filePath) {
|
unique_ptr<AudioStream> createAudioStream(path filePath) {
|
||||||
try {
|
try {
|
||||||
return unique_ptr<AudioStream>(new WaveFileReader(filePath));
|
return std::make_unique<WaveFileReader>(filePath);
|
||||||
} catch (...) {
|
} catch (...) {
|
||||||
std::throw_with_nested(std::runtime_error("Could not open sound file.") );
|
std::throw_with_nested(std::runtime_error("Could not open sound file.") );
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
ptree createXmlTree(const path& filePath, const map<centiseconds, Phone>& phones, const map<centiseconds, Shape>& shapes) {
|
ptree createXmlTree(const path& filePath, const Timeline<Phone>& phones, const Timeline<Shape>& shapes) {
|
||||||
ptree tree;
|
ptree tree;
|
||||||
|
|
||||||
// Add sound file path
|
// Add sound file path
|
||||||
tree.add("rhubarbResult.info.soundFile", filePath.string());
|
tree.add("rhubarbResult.info.soundFile", filePath.string());
|
||||||
|
|
||||||
// Add phones
|
// Add phones
|
||||||
for (auto it = phones.cbegin(), itNext = ++phones.cbegin(); itNext != phones.cend(); ++it, ++itNext) {
|
for (auto& timedPhone : phones) {
|
||||||
auto pair = *it;
|
ptree& phoneElement = tree.add("rhubarbResult.phones.phone", timedPhone.getValue());
|
||||||
auto nextPair = *itNext;
|
phoneElement.add("<xmlattr>.start", formatDuration(timedPhone.getStart()));
|
||||||
ptree& phoneElement = tree.add("rhubarbResult.phones.phone", pair.second);
|
phoneElement.add("<xmlattr>.duration", formatDuration(timedPhone.getLength()));
|
||||||
phoneElement.add("<xmlattr>.start", formatDuration(pair.first));
|
|
||||||
phoneElement.add("<xmlattr>.duration", formatDuration(nextPair.first - pair.first));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Add mouth cues
|
// Add mouth cues
|
||||||
for (auto it = shapes.cbegin(), itNext = ++shapes.cbegin(); itNext != shapes.cend(); ++it, ++itNext) {
|
for (auto& timedShape : shapes) {
|
||||||
auto pair = *it;
|
ptree& mouthCueElement = tree.add("rhubarbResult.mouthCues.mouthCue", timedShape.getValue());
|
||||||
auto nextPair = *itNext;
|
mouthCueElement.add("<xmlattr>.start", formatDuration(timedShape.getStart()));
|
||||||
ptree& mouthCueElement = tree.add("rhubarbResult.mouthCues.mouthCue", pair.second);
|
mouthCueElement.add("<xmlattr>.duration", formatDuration(timedShape.getLength()));
|
||||||
mouthCueElement.add("<xmlattr>.start", formatDuration(pair.first));
|
|
||||||
mouthCueElement.add("<xmlattr>.duration", formatDuration(nextPair.first - pair.first));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return tree;
|
return tree;
|
||||||
|
@ -115,7 +112,7 @@ int main(int argc, char *argv[]) {
|
||||||
const int columnWidth = 30;
|
const int columnWidth = 30;
|
||||||
std::cerr << std::left;
|
std::cerr << std::left;
|
||||||
std::cerr << std::setw(columnWidth) << "Analyzing input file";
|
std::cerr << std::setw(columnWidth) << "Analyzing input file";
|
||||||
map<centiseconds, Phone> phones;
|
Timeline<Phone> phones{};
|
||||||
{
|
{
|
||||||
ProgressBar progressBar;
|
ProgressBar progressBar;
|
||||||
phones = detectPhones(
|
phones = detectPhones(
|
||||||
|
@ -127,7 +124,7 @@ int main(int argc, char *argv[]) {
|
||||||
|
|
||||||
// Generate mouth shapes
|
// Generate mouth shapes
|
||||||
std::cerr << std::setw(columnWidth) << "Generating mouth shapes";
|
std::cerr << std::setw(columnWidth) << "Generating mouth shapes";
|
||||||
map<centiseconds, Shape> shapes = animate(phones);
|
Timeline<Shape> shapes = animate(phones);
|
||||||
std::cerr << "Done" << std::endl;
|
std::cerr << "Done" << std::endl;
|
||||||
|
|
||||||
std::cerr << std::endl;
|
std::cerr << std::endl;
|
||||||
|
|
|
@ -67,20 +67,12 @@ Shape getShape(Phone phone) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
map<centiseconds, Shape> animate(const map<centiseconds, Phone> &phones) {
|
Timeline<Shape> animate(const Timeline<Phone> &phones) {
|
||||||
map<centiseconds, Shape> shapes;
|
Timeline<Shape> shapes(phones.getRange());
|
||||||
Shape lastShape = Shape::Invalid;
|
for (auto& timedPhone : phones) {
|
||||||
for (auto it = phones.cbegin(); it != phones.cend(); ++it) {
|
Timed<Shape> timedShape(static_cast<TimeRange>(timedPhone), getShape(timedPhone.getValue()));
|
||||||
Shape shape = getShape(it->second);
|
shapes.set(timedShape);
|
||||||
if (shape != lastShape || next(it) == phones.cend()) {
|
logTimedEvent("shape", timedShape);
|
||||||
shapes[it->first] = shape;
|
|
||||||
lastShape = shape;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
for (auto it = shapes.cbegin(); it != shapes.cend(); ++it) {
|
|
||||||
if (next(it) == shapes.cend()) break;
|
|
||||||
logTimedEvent("shape", it->first, next(it)->first, enumToString(it->second));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return shapes;
|
return shapes;
|
||||||
|
|
|
@ -1,8 +1,7 @@
|
||||||
#pragma once
|
#pragma once
|
||||||
|
|
||||||
#include <map>
|
|
||||||
#include "Phone.h"
|
#include "Phone.h"
|
||||||
#include "centiseconds.h"
|
|
||||||
#include "Shape.h"
|
#include "Shape.h"
|
||||||
|
#include "Timeline.h"
|
||||||
|
|
||||||
std::map<centiseconds, Shape> animate(const std::map<centiseconds, Phone>& phones);
|
Timeline<Shape> animate(const Timeline<Phone>& phones);
|
||||||
|
|
|
@ -11,6 +11,7 @@
|
||||||
#include <gsl_util.h>
|
#include <gsl_util.h>
|
||||||
#include <logging.h>
|
#include <logging.h>
|
||||||
#include <audio/DCOffset.h>
|
#include <audio/DCOffset.h>
|
||||||
|
#include <Timeline.h>
|
||||||
|
|
||||||
extern "C" {
|
extern "C" {
|
||||||
#include <pocketsphinx.h>
|
#include <pocketsphinx.h>
|
||||||
|
@ -213,7 +214,7 @@ vector<s3wid_t> getWordIds(const vector<string>& words, dict_t& dictionary) {
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
map<centiseconds, Phone> getPhoneAlignment(const vector<s3wid_t>& wordIds, unique_ptr<AudioStream> audioStream, ps_decoder_t& recognizer, ProgressSink& progressSink) {
|
Timeline<Phone> getPhoneAlignment(const vector<s3wid_t>& wordIds, unique_ptr<AudioStream> audioStream, ps_decoder_t& recognizer, ProgressSink& progressSink) {
|
||||||
// Create alignment list
|
// Create alignment list
|
||||||
lambda_unique_ptr<ps_alignment_t> alignment(
|
lambda_unique_ptr<ps_alignment_t> alignment(
|
||||||
ps_alignment_init(recognizer.d2p),
|
ps_alignment_init(recognizer.d2p),
|
||||||
|
@ -264,30 +265,25 @@ map<centiseconds, Phone> getPhoneAlignment(const vector<s3wid_t>& wordIds, uniqu
|
||||||
|
|
||||||
// Extract phones with timestamps
|
// Extract phones with timestamps
|
||||||
char** phoneNames = recognizer.dict->mdef->ciname;
|
char** phoneNames = recognizer.dict->mdef->ciname;
|
||||||
map<centiseconds, Phone> result;
|
Timeline<Phone> result(audioStream->getTruncatedRange());
|
||||||
result[centiseconds(0)] = Phone::None;
|
|
||||||
for (ps_alignment_iter_t* it = ps_alignment_phones(alignment.get()); it; it = ps_alignment_iter_next(it)) {
|
for (ps_alignment_iter_t* it = ps_alignment_phones(alignment.get()); it; it = ps_alignment_iter_next(it)) {
|
||||||
// Get phone
|
// Get phone
|
||||||
ps_alignment_entry_t* phoneEntry = ps_alignment_iter_get(it);
|
ps_alignment_entry_t* phoneEntry = ps_alignment_iter_get(it);
|
||||||
s3cipid_t phoneId = phoneEntry->id.pid.cipid;
|
s3cipid_t phoneId = phoneEntry->id.pid.cipid;
|
||||||
char* phoneName = phoneNames[phoneId];
|
char* phoneName = phoneNames[phoneId];
|
||||||
|
|
||||||
// Get timing
|
// Add entry
|
||||||
int startFrame = phoneEntry->start;
|
centiseconds start(phoneEntry->start);
|
||||||
int duration = phoneEntry->duration;
|
centiseconds duration(phoneEntry->duration);
|
||||||
|
Timed<Phone> timedPhone(start, start + duration, parseEnum<Phone>(phoneName));
|
||||||
|
result.set(timedPhone);
|
||||||
|
|
||||||
// Add map entries
|
logTimedEvent("phone", timedPhone);
|
||||||
centiseconds start(startFrame);
|
|
||||||
result[start] = parseEnum<Phone>(phoneName);
|
|
||||||
centiseconds end(startFrame + duration);
|
|
||||||
result[end] = Phone::None;
|
|
||||||
|
|
||||||
logTimedEvent("phone", start, end, phoneName);
|
|
||||||
}
|
}
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
map<centiseconds, Phone> detectPhones(
|
Timeline<Phone> detectPhones(
|
||||||
unique_ptr<AudioStream> audioStream,
|
unique_ptr<AudioStream> audioStream,
|
||||||
boost::optional<std::string> dialog,
|
boost::optional<std::string> dialog,
|
||||||
ProgressSink& progressSink)
|
ProgressSink& progressSink)
|
||||||
|
@ -322,7 +318,7 @@ map<centiseconds, Phone> detectPhones(
|
||||||
vector<s3wid_t> wordIds = getWordIds(words, *recognizer->dict);
|
vector<s3wid_t> wordIds = getWordIds(words, *recognizer->dict);
|
||||||
|
|
||||||
// Align the word's phones with speech
|
// Align the word's phones with speech
|
||||||
map<centiseconds, Phone> result = getPhoneAlignment(wordIds, std::move(audioStream), *recognizer.get(), alignmentProgressSink);
|
Timeline<Phone> result = getPhoneAlignment(wordIds, std::move(audioStream), *recognizer.get(), alignmentProgressSink);
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
catch (...) {
|
catch (...) {
|
||||||
|
|
|
@ -1,14 +1,13 @@
|
||||||
#pragma once
|
#pragma once
|
||||||
|
|
||||||
#include <map>
|
|
||||||
#include <memory>
|
#include <memory>
|
||||||
#include "audio/AudioStream.h"
|
#include "audio/AudioStream.h"
|
||||||
#include "Phone.h"
|
#include "Phone.h"
|
||||||
#include "centiseconds.h"
|
|
||||||
#include "progressBar.h"
|
#include "progressBar.h"
|
||||||
#include <boost/optional/optional.hpp>
|
#include <boost/optional/optional.hpp>
|
||||||
|
#include "Timeline.h"
|
||||||
|
|
||||||
std::map<centiseconds, Phone> detectPhones(
|
Timeline<Phone> detectPhones(
|
||||||
std::unique_ptr<AudioStream> audioStream,
|
std::unique_ptr<AudioStream> audioStream,
|
||||||
boost::optional<std::string> dialog,
|
boost::optional<std::string> dialog,
|
||||||
ProgressSink& progressSink);
|
ProgressSink& progressSink);
|
||||||
|
|
Loading…
Reference in New Issue