Add hacky --alignmentFile option
This commit is contained in:
parent
364a5d4fe4
commit
ab42b861f0
|
@ -11,13 +11,14 @@ using std::filesystem::path;
|
||||||
// Runs phone recognition on an audio clip and converts the recognized phones
// into a timeline of shapes limited to targetShapeSet.
//
// dialog and alignedPhones are optional and are handed to the recognizer
// unchanged; how they are used depends on the concrete Recognizer.
JoiningContinuousTimeline<Shape> animateAudioClip(
	const AudioClip& audioClip,
	const optional<string>& dialog,
	const optional<BoundedTimeline<Phone>>& alignedPhones,
	const Recognizer& recognizer,
	const ShapeSet& targetShapeSet,
	int maxThreadCount,
	ProgressSink& progressSink)
{
	// Step 1: audio -> phones
	const BoundedTimeline<Phone> recognizedPhones = recognizer.recognizePhones(
		audioClip, dialog, alignedPhones, maxThreadCount, progressSink
	);

	// Step 2: phones -> shapes
	return animate(recognizedPhones, targetShapeSet);
}
|
||||||
|
@ -25,11 +26,12 @@ JoiningContinuousTimeline<Shape> animateAudioClip(
|
||||||
// File-based convenience wrapper around animateAudioClip: creates an audio
// clip for filePath via createAudioFileClip and forwards every other argument
// unchanged.
JoiningContinuousTimeline<Shape> animateWaveFile(
	path filePath,
	const optional<string>& dialog,
	const optional<BoundedTimeline<Phone>>& alignedPhones,
	const Recognizer& recognizer,
	const ShapeSet& targetShapeSet,
	int maxThreadCount,
	ProgressSink& progressSink)
{
	const auto fileClip = createAudioFileClip(filePath);
	return animateAudioClip(
		*fileClip,
		dialog,
		alignedPhones,
		recognizer,
		targetShapeSet,
		maxThreadCount,
		progressSink
	);
}
|
||||||
|
|
|
@ -11,6 +11,7 @@
|
||||||
JoiningContinuousTimeline<Shape> animateAudioClip(
|
JoiningContinuousTimeline<Shape> animateAudioClip(
|
||||||
const AudioClip& audioClip,
|
const AudioClip& audioClip,
|
||||||
const boost::optional<std::string>& dialog,
|
const boost::optional<std::string>& dialog,
|
||||||
|
const boost::optional<BoundedTimeline<Phone>>& alignedPhones,
|
||||||
const Recognizer& recognizer,
|
const Recognizer& recognizer,
|
||||||
const ShapeSet& targetShapeSet,
|
const ShapeSet& targetShapeSet,
|
||||||
int maxThreadCount,
|
int maxThreadCount,
|
||||||
|
@ -19,6 +20,7 @@ JoiningContinuousTimeline<Shape> animateAudioClip(
|
||||||
JoiningContinuousTimeline<Shape> animateWaveFile(
|
JoiningContinuousTimeline<Shape> animateWaveFile(
|
||||||
std::filesystem::path filePath,
|
std::filesystem::path filePath,
|
||||||
const boost::optional<std::string>& dialog,
|
const boost::optional<std::string>& dialog,
|
||||||
|
const boost::optional<BoundedTimeline<Phone>>& alignedPhones,
|
||||||
const Recognizer& recognizer,
|
const Recognizer& recognizer,
|
||||||
const ShapeSet& targetShapeSet,
|
const ShapeSet& targetShapeSet,
|
||||||
int maxThreadCount,
|
int maxThreadCount,
|
||||||
|
|
|
@ -106,8 +106,12 @@ static Timeline<Phone> utteranceToPhones(
|
||||||
// Recognizes phones in inputAudioClip by delegating to the shared
// ::recognizePhones routine, supplying this recognizer's createDecoder and
// utteranceToPhones callbacks.
//
// Pre-aligned phones are not supported by this recognizer: if alignedPhones is
// set, std::invalid_argument is thrown before any recognition is attempted.
BoundedTimeline<Phone> PhoneticRecognizer::recognizePhones(
	const AudioClip& inputAudioClip,
	optional<std::string> dialog,
	optional<BoundedTimeline<Phone>> alignedPhones,
	int maxThreadCount,
	ProgressSink& progressSink
) const {
	// Reject unsupported input up front
	if (alignedPhones) {
		throw std::invalid_argument(
			"Phonetic recognizer doesn't support specifying aligned phones in this POC."
		);
	}

	return ::recognizePhones(
		inputAudioClip,
		dialog,
		&createDecoder,
		&utteranceToPhones,
		maxThreadCount,
		progressSink
	);
}
|
||||||
|
|
|
@ -8,6 +8,7 @@ public:
|
||||||
// Overrides the base class's recognizePhones.
// dialog and alignedPhones are optional inputs; alignedPhones carries a
// timeline of phones supplied by the caller.
// NOTE(review): the enclosing class is outside this hunk -- presumably
// PhoneticRecognizer; confirm.
BoundedTimeline<Phone> recognizePhones(
	const AudioClip& inputAudioClip,
	boost::optional<std::string> dialog, boost::optional<BoundedTimeline<Phone>> alignedPhones,
	int maxThreadCount, ProgressSink& progressSink
) const override;
|
||||||
|
|
|
@ -6,8 +6,10 @@
|
||||||
#include "languageModels.h"
|
#include "languageModels.h"
|
||||||
#include "tokenization.h"
|
#include "tokenization.h"
|
||||||
#include "g2p.h"
|
#include "g2p.h"
|
||||||
|
#include "audio/DcOffset.h"
|
||||||
#include "time/ContinuousTimeline.h"
|
#include "time/ContinuousTimeline.h"
|
||||||
#include "audio/processing.h"
|
#include "audio/processing.h"
|
||||||
|
#include "audio/voiceActivityDetection.h"
|
||||||
#include "time/timedLogging.h"
|
#include "time/timedLogging.h"
|
||||||
|
|
||||||
extern "C" {
|
extern "C" {
|
||||||
|
@ -334,9 +336,41 @@ static Timeline<Phone> utteranceToPhones(
|
||||||
// Recognizes phones in inputAudioClip.
//
// Without alignedPhones, this delegates to the shared ::recognizePhones
// routine, supplying this recognizer's createDecoder and utteranceToPhones
// callbacks.
//
// With alignedPhones, the shared routine is bypassed. The audio has its DC
// offset removed and is split into utterances by voice activity detection. For
// each utterance, the aligned phones inside the utterance's time range are
// copied, every range reported by getNoiseSounds is overwritten with
// Phone::Noise, and the outcome is written into a result timeline bounded by
// the range of all utterances. dialog and maxThreadCount are not used on this
// path.
//
// Throws a runtime_error (with the original exception nested) if voice
// activity detection fails.
BoundedTimeline<Phone> PocketSphinxRecognizer::recognizePhones(
	const AudioClip& inputAudioClip,
	optional<std::string> dialog,
	optional<BoundedTimeline<Phone>> alignedPhones,
	int maxThreadCount,
	ProgressSink& progressSink
) const {
	// Regular path: nothing pre-aligned, so run actual recognition
	if (!alignedPhones) {
		return ::recognizePhones(
			inputAudioClip, dialog, &createDecoder, &utteranceToPhones, maxThreadCount, progressSink);
	}

	// Voice activity detection expects audio without DC offset
	const unique_ptr<AudioClip> offsetFreeClip = inputAudioClip.clone() | removeDcOffset();

	// Determine the segments of speech
	JoiningBoundedTimeline<void> utterances;
	try {
		utterances = detectVoiceActivity(*offsetFreeClip, progressSink);
	} catch (...) {
		std::throw_with_nested(runtime_error("Error detecting segments of speech."));
	}

	BoundedTimeline<Phone> result(utterances.getRange());
	for (auto& utterance : utterances) {
		// Restrict the supplied phones to the current utterance
		BoundedTimeline<Phone> utterancePhones(utterance.getTimeRange(), *alignedPhones);

		// Mark the guessed noise ranges
		const JoiningTimeline<void> noiseRanges =
			getNoiseSounds(utterancePhones.getRange(), utterancePhones);
		for (const auto& noiseRange : noiseRanges) {
			utterancePhones.set(noiseRange.getTimeRange(), Phone::Noise);
		}

		// Merge this utterance into the overall result
		for (const auto& timedPhone : utterancePhones) {
			result.set(timedPhone);
		}
	}

	return result;
}
|
||||||
|
|
|
@ -8,6 +8,7 @@ public:
|
||||||
// Overrides the base class's recognizePhones.
// dialog and alignedPhones are optional inputs; alignedPhones carries a
// timeline of phones supplied by the caller.
// NOTE(review): the enclosing class is outside this hunk -- presumably
// PocketSphinxRecognizer; confirm.
BoundedTimeline<Phone> recognizePhones(
	const AudioClip& inputAudioClip,
	boost::optional<std::string> dialog, boost::optional<BoundedTimeline<Phone>> alignedPhones,
	int maxThreadCount, ProgressSink& progressSink
) const override;
|
||||||
|
|
|
@ -12,6 +12,7 @@ public:
|
||||||
// Pure virtual entry point that every recognizer must implement: takes an
// audio clip and returns a bounded timeline of phones.
//
// dialog and alignedPhones are optional inputs. maxThreadCount and
// progressSink are passed in by the caller; their exact use is up to the
// implementation.
virtual BoundedTimeline<Phone> recognizePhones(
	const AudioClip& audioClip,
	boost::optional<std::string> dialog, boost::optional<BoundedTimeline<Phone>> alignedPhones,
	int maxThreadCount, ProgressSink& progressSink
) const = 0;
|
||||||
|
|
|
@ -115,6 +115,33 @@ ShapeSet getTargetShapeSet(const string& extendedShapesString) {
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Reads pre-calculated phone alignment data from a text file.
//
// The file is parsed as a whitespace-separated sequence of records, each made
// up of a start time, an end time, and a phone (read through Phone's stream
// extraction operator). Times are read as floating-point numbers and scaled by
// 100 to obtain centiseconds -- presumably the file stores seconds; confirm
// against the tool that produces it.
//
// Returns a timeline bounded by the range covered by the records read.
//
// Throws std::invalid_argument if filePath does not exist. Every other
// failure -- the file cannot be opened, a record is incomplete, or a field is
// malformed -- surfaces as a std::runtime_error naming the file, with the
// underlying cause nested inside it.
BoundedTimeline<Phone> readAlignmentFile(const path& filePath) {
	if (!exists(filePath)) {
		throw std::invalid_argument(fmt::format("File {} does not exist.", filePath.u8string()));
	}

	try {
		std::ifstream file;
		// Let a failing open() throw, then go back to manual state checks: the
		// read loop below relies on failbit to detect the end of the data.
		file.exceptions(std::ifstream::failbit | std::ifstream::badbit);
		file.open(filePath);
		file.exceptions(std::ifstream::goodbit);

		// Round to the nearest centisecond. Plain truncation would turn a value
		// like 0.29 (whose product with 100 is 28.99...) into 28 instead of 29.
		const auto toCentiseconds = [](double value) {
			const double scaled = value * 100;
			return centiseconds(static_cast<int>(scaled < 0 ? scaled - 0.5 : scaled + 0.5));
		};

		Timeline<Phone> result;
		double start;
		// Test the stream *after* each extraction, so that the failed read at the
		// end of the file never reaches result.set() with indeterminate values.
		while (file >> start) {
			double end;
			Phone phone;
			file >> end >> phone;
			if (!file) {
				throw std::runtime_error("Incomplete or malformed alignment record.");
			}
			result.set(toCentiseconds(start), toCentiseconds(end), phone);
		}

		// The loop must have ended because the input was exhausted. Anything
		// else means a start time could not be parsed or the stream broke.
		if (file.bad() || !file.eof()) {
			throw std::runtime_error("Malformed start time in alignment data.");
		}

		return BoundedTimeline<Phone>(result.getRange(), result);
	} catch (...) {
		std::throw_with_nested(std::runtime_error(fmt::format("Error reading file {0}.", filePath.u8string())));
	}
}
|
||||||
|
|
||||||
int main(int platformArgc, char* platformArgv[]) {
|
int main(int platformArgc, char* platformArgv[]) {
|
||||||
// Set up default logging so early errors are printed to stdout
|
// Set up default logging so early errors are printed to stdout
|
||||||
const logging::Level defaultMinStderrLevel = logging::Level::Error;
|
const logging::Level defaultMinStderrLevel = logging::Level::Error;
|
||||||
|
@ -174,6 +201,11 @@ int main(int platformArgc, char* platformArgv[]) {
|
||||||
false, "GHX", "string", cmd
|
false, "GHX", "string", cmd
|
||||||
);
|
);
|
||||||
|
|
||||||
|
tclap::ValueArg<string> alignmentFile(
|
||||||
|
"", "alignmentFile", "A .tsv file containing pre-calculated phoneme alignment data.",
|
||||||
|
false, string(), "string", cmd
|
||||||
|
);
|
||||||
|
|
||||||
tclap::ValueArg<string> dialogFile(
|
tclap::ValueArg<string> dialogFile(
|
||||||
"d", "dialogFile", "A file containing the text of the dialog.",
|
"d", "dialogFile", "A file containing the text of the dialog.",
|
||||||
false, string(), "string", cmd
|
false, string(), "string", cmd
|
||||||
|
@ -263,6 +295,9 @@ int main(int platformArgc, char* platformArgv[]) {
|
||||||
dialogFile.isSet()
|
dialogFile.isSet()
|
||||||
? readUtf8File(u8path(dialogFile.getValue()))
|
? readUtf8File(u8path(dialogFile.getValue()))
|
||||||
: boost::optional<string>(),
|
: boost::optional<string>(),
|
||||||
|
alignmentFile.isSet()
|
||||||
|
? readAlignmentFile(u8path(alignmentFile.getValue()))
|
||||||
|
: boost::optional<BoundedTimeline<Phone>>(),
|
||||||
*createRecognizer(recognizerType.getValue()),
|
*createRecognizer(recognizerType.getValue()),
|
||||||
targetShapeSet,
|
targetShapeSet,
|
||||||
maxThreadCount.getValue(),
|
maxThreadCount.getValue(),
|
||||||
|
|
Loading…
Reference in New Issue