Implement a generic recognizer concept with the options pocketSphinx and phonetic

Daniel Wolf 2018-10-08 20:30:45 +02:00
parent 3ed38ada2f
commit 610f490046
15 changed files with 635 additions and 337 deletions

View File

@@ -413,8 +413,13 @@ add_library(rhubarb-recognition
    src/recognition/g2p.h
    src/recognition/languageModels.cpp
    src/recognition/languageModels.h
-    src/recognition/phoneRecognition.cpp
-    src/recognition/phoneRecognition.h
+    src/recognition/PhoneticRecognizer.cpp
+    src/recognition/PhoneticRecognizer.h
+    src/recognition/PocketSphinxRecognizer.cpp
+    src/recognition/PocketSphinxRecognizer.h
+    src/recognition/pocketSphinxTools.cpp
+    src/recognition/pocketSphinxTools.h
+    src/recognition/Recognizer.h
    src/recognition/tokenization.cpp
    src/recognition/tokenization.h
)
@@ -487,6 +492,8 @@ add_executable(rhubarb
    src/rhubarb/main.cpp
    src/rhubarb/ExportFormat.cpp
    src/rhubarb/ExportFormat.h
+    src/rhubarb/RecognizerType.cpp
+    src/rhubarb/RecognizerType.h
    src/rhubarb/semanticEntries.cpp
    src/rhubarb/semanticEntries.h
    src/rhubarb/sinks.cpp

View File

@@ -1,7 +1,12 @@
<wpf:ResourceDictionary xml:space="preserve" xmlns:x="http://schemas.microsoft.com/winfx/2006/xaml" xmlns:s="clr-namespace:System;assembly=mscorlib" xmlns:ss="urn:shemas-jetbrains-com:settings-storage-xaml" xmlns:wpf="http://schemas.microsoft.com/winfx/2006/xaml/presentation">
+    <s:String x:Key="/Default/CodeInspection/Highlighting/InspectionSeverities/=CppClangTidyModernizeRawStringLiteral/@EntryIndexedValue">HINT</s:String>
    <s:String x:Key="/Default/CodeInspection/Highlighting/InspectionSeverities/=CppFunctionDoesntReturnValue/@EntryIndexedValue">ERROR</s:String>
    <s:String x:Key="/Default/CodeInspection/Highlighting/InspectionSeverities/=LocalizableElement/@EntryIndexedValue">DO_NOT_SHOW</s:String>
    <s:String x:Key="/Default/CodeStyle/CodeFormatting/CommonFormatter/ALIGNMENT_TAB_FILL_STYLE/@EntryValue">USE_TABS_ONLY</s:String>
+    <s:String x:Key="/Default/CodeStyle/CodeFormatting/CppFormatting/ALIGNMENT_TAB_FILL_STYLE/@EntryValue">USE_TABS_ONLY</s:String>
    <s:Boolean x:Key="/Default/CodeStyle/CodeFormatting/CppFormatting/ALIGN_MULTILINE_ARGUMENT/@EntryValue">False</s:Boolean>
    <s:Boolean x:Key="/Default/CodeStyle/CodeFormatting/CppFormatting/ALIGN_MULTILINE_CALLS_CHAIN/@EntryValue">False</s:Boolean>
    <s:Boolean x:Key="/Default/CodeStyle/CodeFormatting/CppFormatting/ALIGN_MULTILINE_CTOR_INIT/@EntryValue">False</s:Boolean>
@@ -29,6 +34,7 @@
    <s:String x:Key="/Default/CodeStyle/CodeFormatting/CppFormatting/WRAP_ENUMERATION_STYLE/@EntryValue">CHOP_ALWAYS</s:String>
    <s:String x:Key="/Default/CodeStyle/CodeFormatting/CSharpFormat/ACCESSOR_DECLARATION_BRACES/@EntryValue">END_OF_LINE</s:String>
    <s:String x:Key="/Default/CodeStyle/CodeFormatting/CSharpFormat/ACCESSOR_OWNER_DECLARATION_BRACES/@EntryValue">END_OF_LINE</s:String>
+    <s:String x:Key="/Default/CodeStyle/CodeFormatting/CSharpFormat/ALIGNMENT_TAB_FILL_STYLE/@EntryValue">USE_TABS_ONLY</s:String>
    <s:Boolean x:Key="/Default/CodeStyle/CodeFormatting/CSharpFormat/ALIGN_MULTILINE_BINARY_EXPRESSIONS_CHAIN/@EntryValue">False</s:Boolean>
    <s:String x:Key="/Default/CodeStyle/CodeFormatting/CSharpFormat/ANONYMOUS_METHOD_DECLARATION_BRACES/@EntryValue">END_OF_LINE</s:String>
    <s:String x:Key="/Default/CodeStyle/CodeFormatting/CSharpFormat/CASE_BLOCK_BRACES/@EntryValue">END_OF_LINE</s:String>
@@ -44,6 +50,14 @@
    <s:Boolean x:Key="/Default/CodeStyle/CodeFormatting/CSharpFormat/USE_INDENT_FROM_VS/@EntryValue">False</s:Boolean>
    <s:Boolean x:Key="/Default/CodeStyle/CodeFormatting/CSharpFormat/WRAP_BEFORE_BINARY_OPSIGN/@EntryValue">True</s:Boolean>
    <s:Boolean x:Key="/Default/CodeStyle/CodeFormatting/CSharpFormat/WRAP_LINES/@EntryValue">False</s:Boolean>
+    <s:String x:Key="/Default/CodeStyle/CodeFormatting/CssFormatter/ALIGNMENT_TAB_FILL_STYLE/@EntryValue">USE_TABS_ONLY</s:String>
+    <s:String x:Key="/Default/CodeStyle/CodeFormatting/HtmlFormatter/ALIGNMENT_TAB_FILL_STYLE/@EntryValue">USE_TABS_ONLY</s:String>
+    <s:String x:Key="/Default/CodeStyle/CodeFormatting/JavaScriptCodeFormatting/ALIGNMENT_TAB_FILL_STYLE/@EntryValue">USE_TABS_ONLY</s:String>
+    <s:String x:Key="/Default/CodeStyle/CodeFormatting/ProtobufCodeFormatting/ALIGNMENT_TAB_FILL_STYLE/@EntryValue">USE_TABS_ONLY</s:String>
+    <s:String x:Key="/Default/CodeStyle/CodeFormatting/ResxFormatter/ALIGNMENT_TAB_FILL_STYLE/@EntryValue">USE_TABS_ONLY</s:String>
+    <s:String x:Key="/Default/CodeStyle/CodeFormatting/VBFormat/ALIGNMENT_TAB_FILL_STYLE/@EntryValue">USE_TABS_ONLY</s:String>
+    <s:String x:Key="/Default/CodeStyle/CodeFormatting/XmlDocFormatter/ALIGNMENT_TAB_FILL_STYLE/@EntryValue">USE_TABS_ONLY</s:String>
+    <s:String x:Key="/Default/CodeStyle/CodeFormatting/XmlFormatter/ALIGNMENT_TAB_FILL_STYLE/@EntryValue">USE_TABS_ONLY</s:String>
    <s:String x:Key="/Default/CodeStyle/CSharpVarKeywordUsage/ForBuiltInTypes/@EntryValue">UseExplicitType</s:String>
    <s:String x:Key="/Default/CodeStyle/CSharpVarKeywordUsage/ForSimpleTypes/@EntryValue">UseVarWhenEvident</s:String>
    <s:String x:Key="/Default/CodeStyle/Naming/CppNaming/Rules/=Class_0020and_0020struct_0020fields/@EntryIndexedValue">&lt;NamingElement Priority="10"&gt;&lt;Descriptor Static="Indeterminate" Constexpr="Indeterminate" Const="Indeterminate" Volatile="Indeterminate" Accessibility="NOT_APPLICABLE"&gt;&lt;type Name="class field" /&gt;&lt;type Name="struct field" /&gt;&lt;/Descriptor&gt;&lt;Policy Inspect="True" Prefix="" Suffix="_" Style="aaBb" /&gt;&lt;/NamingElement&gt;</s:String>
@@ -108,7 +122,16 @@
    <s:String x:Key="/Default/Environment/Hierarchy/PsiConfigurationSettingsKey/CustomLocation/@EntryValue">C:\Users\Daniel\AppData\Local\JetBrains\Transient\ReSharperPlatformVs14\v09\SolutionCaches</s:String>
    <s:Boolean x:Key="/Default/Environment/SettingsMigration/IsMigratorApplied/=JetBrains_002EReSharper_002EFeature_002EServices_002ECpp_002ECodeStyle_002ESettingsUpgrade_002EFunctionReturnStyleSettingsUpgrader/@EntryIndexedValue">True</s:Boolean>
    <s:Boolean x:Key="/Default/Environment/SettingsMigration/IsMigratorApplied/=JetBrains_002EReSharper_002EFeature_002EServices_002ECpp_002ECodeStyle_002ESettingsUpgrade_002ENamespaceIndentationSettingsUpgrader/@EntryIndexedValue">True</s:Boolean>
+    <s:Boolean x:Key="/Default/Environment/SettingsMigration/IsMigratorApplied/=JetBrains_002EReSharper_002EPsi_002ECSharp_002ECodeStyle_002ECSharpKeepExistingMigration/@EntryIndexedValue">True</s:Boolean>
+    <s:Boolean x:Key="/Default/Environment/SettingsMigration/IsMigratorApplied/=JetBrains_002EReSharper_002EPsi_002ECSharp_002ECodeStyle_002ECSharpPlaceEmbeddedOnSameLineMigration/@EntryIndexedValue">True</s:Boolean>
    <s:Boolean x:Key="/Default/Environment/SettingsMigration/IsMigratorApplied/=JetBrains_002EReSharper_002EPsi_002ECSharp_002ECodeStyle_002ESettingsUpgrade_002EAddAccessorOwnerDeclarationBracesMigration/@EntryIndexedValue">True</s:Boolean>
    <s:Boolean x:Key="/Default/Environment/SettingsMigration/IsMigratorApplied/=JetBrains_002EReSharper_002EPsi_002ECSharp_002ECodeStyle_002ESettingsUpgrade_002EMigrateBlankLinesAroundFieldToBlankLinesAroundProperty/@EntryIndexedValue">True</s:Boolean>
    <s:Boolean x:Key="/Default/Environment/SettingsMigration/IsMigratorApplied/=JetBrains_002EReSharper_002EPsi_002ECSharp_002ECodeStyle_002ESettingsUpgrade_002EMigrateThisQualifierSettings/@EntryIndexedValue">True</s:Boolean>
+    <s:Boolean x:Key="/Default/Environment/SettingsMigration/IsMigratorApplied/=JetBrains_002EReSharper_002EPsi_002EFormat_002ESettingsUpgrade_002EAlignmentTabFillStyleMigration/@EntryIndexedValue">True</s:Boolean>
+    <s:Boolean x:Key="/Default/UserDictionary/Words/=allphone/@EntryIndexedValue">True</s:Boolean>
+    <s:Boolean x:Key="/Default/UserDictionary/Words/=cepstral/@EntryIndexedValue">True</s:Boolean>
+    <s:Boolean x:Key="/Default/UserDictionary/Words/=cmudict/@EntryIndexedValue">True</s:Boolean>
+    <s:Boolean x:Key="/Default/UserDictionary/Words/=pbeam/@EntryIndexedValue">True</s:Boolean>
+    <s:Boolean x:Key="/Default/UserDictionary/Words/=qwhy/@EntryIndexedValue">True</s:Boolean>
+    <s:Boolean x:Key="/Default/UserDictionary/Words/=Viterbi/@EntryIndexedValue">True</s:Boolean>
</wpf:ResourceDictionary>

View File

@@ -1,6 +1,5 @@
#include "rhubarbLib.h"
#include "core/Phone.h"
-#include "recognition/phoneRecognition.h"
#include "tools/textFiles.h"
#include "animation/mouthAnimation.h"
#include "audio/audioFileReading.h"
@@ -8,27 +7,29 @@
using boost::optional;
using std::string;
using boost::filesystem::path;
-using std::unique_ptr;

JoiningContinuousTimeline<Shape> animateAudioClip(
    const AudioClip& audioClip,
-    optional<string> dialog,
+    const optional<string>& dialog,
+    const Recognizer& recognizer,
    const ShapeSet& targetShapeSet,
    int maxThreadCount,
    ProgressSink& progressSink)
{
-    BoundedTimeline<Phone> phones = recognizePhones(audioClip, dialog, maxThreadCount, progressSink);
+    const BoundedTimeline<Phone> phones =
+        recognizer.recognizePhones(audioClip, dialog, maxThreadCount, progressSink);
    JoiningContinuousTimeline<Shape> result = animate(phones, targetShapeSet);
    return result;
}

JoiningContinuousTimeline<Shape> animateWaveFile(
    path filePath,
-    optional<string> dialog,
+    const optional<string>& dialog,
+    const Recognizer& recognizer,
    const ShapeSet& targetShapeSet,
    int maxThreadCount,
    ProgressSink& progressSink)
{
    const auto audioClip = createAudioFileClip(filePath);
-    return animateAudioClip(*audioClip, dialog, targetShapeSet, maxThreadCount, progressSink);
+    return animateAudioClip(*audioClip, dialog, recognizer, targetShapeSet, maxThreadCount, progressSink);
}

View File

@@ -6,17 +6,20 @@
#include "tools/ProgressBar.h"
#include <boost/filesystem.hpp>
#include "animation/targetShapeSet.h"
+#include "recognition/Recognizer.h"

JoiningContinuousTimeline<Shape> animateAudioClip(
    const AudioClip& audioClip,
-    boost::optional<std::string> dialog,
+    const boost::optional<std::string>& dialog,
+    const Recognizer& recognizer,
    const ShapeSet& targetShapeSet,
    int maxThreadCount,
    ProgressSink& progressSink);

JoiningContinuousTimeline<Shape> animateWaveFile(
    boost::filesystem::path filePath,
-    boost::optional<std::string> dialog,
+    const boost::optional<std::string>& dialog,
+    const Recognizer& recognizer,
    const ShapeSet& targetShapeSet,
    int maxThreadCount,
    ProgressSink& progressSink);
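
Both entry points now take the recognizer by reference, so the caller decides which engine runs. A minimal sketch of a call site under these signatures; getTargetShapeSet() and consoleProgressSink are hypothetical stand-ins for a real ShapeSet and ProgressSink (main.cpp below shows the actual wiring):

    PocketSphinxRecognizer recognizer;
    const JoiningContinuousTimeline<Shape> animation = animateWaveFile(
        "dialog.wav",
        boost::optional<std::string>("Hello world"),
        recognizer,
        getTargetShapeSet(),  // hypothetical: obtain a ShapeSet from configuration
        4,                    // maxThreadCount
        consoleProgressSink); // hypothetical: any ProgressSink implementation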

View File

@ -0,0 +1,103 @@
#include "PhoneticRecognizer.h"
#include "time/Timeline.h"
#include "audio/AudioSegment.h"
#include "audio/SampleRateConverter.h"
#include "audio/processing.h"
#include "time/timedLogging.h"

using std::runtime_error;
using std::unique_ptr;
using std::string;
using boost::optional;

static lambda_unique_ptr<ps_decoder_t> createDecoder(optional<std::string> dialog) {
    UNUSED(dialog);

    lambda_unique_ptr<cmd_ln_t> config(
        cmd_ln_init(
            nullptr, ps_args(), true,
            // Set acoustic model
            "-hmm", (getSphinxModelDirectory() / "acoustic-model").string().c_str(),
            // Set phonetic language model
            "-allphone", (getSphinxModelDirectory() / "en-us-phone.lm.bin").string().c_str(),
            "-allphone_ci", "yes",
            // Set language model probability weight.
            // Low values (<= 0.4) can lead to fluttering animation.
            // High values (>= 1.0) can lead to imprecise or freezing animation.
            "-lw", "0.8",
            // The following settings are recommended at http://cmusphinx.sourceforge.net/wiki/phonemerecognition
            // Set beam width applied to every frame in Viterbi search
            "-beam", "1e-20",
            // Set beam width applied to phone transitions
            "-pbeam", "1e-20",
            nullptr),
        [](cmd_ln_t* config) { cmd_ln_free_r(config); });
    if (!config) throw runtime_error("Error creating configuration.");

    lambda_unique_ptr<ps_decoder_t> decoder(
        ps_init(config.get()),
        [](ps_decoder_t* recognizer) { ps_free(recognizer); });
    if (!decoder) throw runtime_error("Error creating speech decoder.");

    return decoder;
}

static Timeline<Phone> utteranceToPhones(
    const AudioClip& audioClip,
    TimeRange utteranceTimeRange,
    ps_decoder_t& decoder,
    ProgressSink& utteranceProgressSink
) {
    // Pad time range to give PocketSphinx some breathing room
    TimeRange paddedTimeRange = utteranceTimeRange;
    const centiseconds padding(3);
    paddedTimeRange.grow(padding);
    paddedTimeRange.trim(audioClip.getTruncatedRange());

    const unique_ptr<AudioClip> clipSegment = audioClip.clone() | segment(paddedTimeRange) | resample(sphinxSampleRate);
    const auto audioBuffer = copyTo16bitBuffer(*clipSegment);

    // Detect phones (returned as words)
    BoundedTimeline<string> phoneStrings = recognizeWords(audioBuffer, decoder);
    phoneStrings.shift(paddedTimeRange.getStart());
    Timeline<Phone> utterancePhones;
    for (const auto& timedPhoneString : phoneStrings) {
        Phone phone = PhoneConverter::get().parse(timedPhoneString.getValue());
        if (phone == Phone::AH && timedPhoneString.getDuration() < 6_cs) {
            // Heuristic: < 6_cs is schwa. PocketSphinx doesn't differentiate.
            phone = Phone::Schwa;
        }
        utterancePhones.set(timedPhoneString.getTimeRange(), phone);
    }

    // Log raw phones
    for (const auto& timedPhone : utterancePhones) {
        logTimedEvent("rawPhone", timedPhone);
    }

    // Guess positions of noise sounds
    JoiningTimeline<void> noiseSounds = getNoiseSounds(utteranceTimeRange, utterancePhones);
    for (const auto& noiseSound : noiseSounds) {
        utterancePhones.set(noiseSound.getTimeRange(), Phone::Noise);
    }

    // Log phones
    for (const auto& timedPhone : utterancePhones) {
        logTimedEvent("phone", timedPhone);
    }

    utteranceProgressSink.reportProgress(1.0);

    return utterancePhones;
}

BoundedTimeline<Phone> PhoneticRecognizer::recognizePhones(
    const AudioClip& inputAudioClip,
    optional<std::string> dialog,
    int maxThreadCount,
    ProgressSink& progressSink
) const {
    return ::recognizePhones(inputAudioClip, dialog, &createDecoder, &utteranceToPhones, maxThreadCount, progressSink);
}
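
Used directly, this recognizer turns an audio clip into a bounded timeline of timed phones. A minimal sketch, assuming an AudioClip and a ProgressSink are already in scope (both are placeholders here):

    const PhoneticRecognizer recognizer;
    const BoundedTimeline<Phone> phones =
        recognizer.recognizePhones(audioClip, boost::none, /* maxThreadCount */ 1, progressSink);
    for (const auto& timedPhone : phones) {
        // Each entry pairs a Phone value with its time range:
        // timedPhone.getValue(), timedPhone.getTimeRange()
    }

Note that the dialog argument is ignored by this recognizer (see UNUSED(dialog) above), so boost::none is a natural choice.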

View File

@ -0,0 +1,14 @@
#pragma once

#include "Recognizer.h"
#include "pocketSphinxTools.h"

class PhoneticRecognizer : public Recognizer {
public:
    BoundedTimeline<Phone> recognizePhones(
        const AudioClip& inputAudioClip,
        boost::optional<std::string> dialog,
        int maxThreadCount,
        ProgressSink& progressSink
    ) const override;
};

View File

@@ -1,145 +1,135 @@
-#include <boost/filesystem.hpp>
-#include "phoneRecognition.h"
-#include "audio/SampleRateConverter.h"
-#include "tools/platformTools.h"
-#include "tools/tools.h"
-#include <format.h>
-#include <s3types.h>
+#include "PocketSphinxRecognizer.h"
#include <regex>
#include <gsl_util.h>
-#include "logging/logging.h"
-#include "audio/DcOffset.h"
-#include "time/Timeline.h"
-#include "audio/voiceActivityDetection.h"
#include "audio/AudioSegment.h"
+#include "audio/SampleRateConverter.h"
#include "languageModels.h"
#include "tokenization.h"
#include "g2p.h"
#include "time/ContinuousTimeline.h"
#include "audio/processing.h"
-#include "tools/parallel.h"
-#include <boost/version.hpp>
-#include "tools/ObjectPool.h"
#include "time/timedLogging.h"

extern "C" {
-#include <pocketsphinx.h>
-#include <sphinxbase/err.h>
-#include <ps_alignment.h>
#include <state_align_search.h>
-#include <pocketsphinx_internal.h>
-#include <ngram_search.h>
}

using std::runtime_error;
using std::invalid_argument;
using std::unique_ptr;
-using std::shared_ptr;
using std::string;
using std::vector;
using std::map;
using boost::filesystem::path;
-using std::function;
using std::regex;
using std::regex_replace;
-using std::chrono::duration;
using boost::optional;
-using std::string;
-using std::chrono::duration_cast;
using std::array;

-constexpr int sphinxSampleRate = 16000;
-
-const path& getSphinxModelDirectory() {
-    static path sphinxModelDirectory(getBinDirectory() / "res" / "sphinx");
-    return sphinxModelDirectory;
-}
-
-logging::Level ConvertSphinxErrorLevel(err_lvl_t errorLevel) {
-    switch (errorLevel) {
-    case ERR_DEBUG:
-    case ERR_INFO:
-    case ERR_INFOCONT:
-        return logging::Level::Trace;
-    case ERR_WARN:
-        return logging::Level::Warn;
-    case ERR_ERROR:
-        return logging::Level::Error;
-    case ERR_FATAL:
-        return logging::Level::Fatal;
-    default:
-        throw invalid_argument("Unknown log level.");
-    }
-}
-
-void sphinxLogCallback(void* user_data, err_lvl_t errorLevel, const char* format, ...) {
-    UNUSED(user_data);
-
-    // Create varArgs list
-    va_list args;
-    va_start(args, format);
-    auto _ = gsl::finally([&args]() { va_end(args); });
-
-    // Format message
-    const int initialSize = 256;
-    vector<char> chars(initialSize);
-    bool success = false;
-    while (!success) {
-        int charsWritten = vsnprintf(chars.data(), chars.size(), format, args);
-        if (charsWritten < 0) throw runtime_error("Error formatting Pocketsphinx log message.");
-        success = charsWritten < static_cast<int>(chars.size());
-        if (!success) chars.resize(chars.size() * 2);
-    }
-    regex waste("^(DEBUG|INFO|INFOCONT|WARN|ERROR|FATAL): ");
-    string message = regex_replace(chars.data(), waste, "", std::regex_constants::format_first_only);
-    boost::algorithm::trim(message);
-
-    logging::Level logLevel = ConvertSphinxErrorLevel(errorLevel);
-    logging::log(logLevel, message);
-}
-
-BoundedTimeline<string> recognizeWords(const vector<int16_t>& audioBuffer, ps_decoder_t& decoder) {
-    // Restart timing at 0
-    ps_start_stream(&decoder);
-
-    // Start recognition
-    int error = ps_start_utt(&decoder);
-    if (error) throw runtime_error("Error starting utterance processing for word recognition.");
-
-    // Process entire audio clip
-    const bool noRecognition = false;
-    const bool fullUtterance = true;
-    int searchedFrameCount = ps_process_raw(&decoder, audioBuffer.data(), audioBuffer.size(), noRecognition, fullUtterance);
-    if (searchedFrameCount < 0) throw runtime_error("Error analyzing raw audio data for word recognition.");
-
-    // End recognition
-    error = ps_end_utt(&decoder);
-    if (error) throw runtime_error("Error ending utterance processing for word recognition.");
-
-    BoundedTimeline<string> result(TimeRange(0_cs, centiseconds(100 * audioBuffer.size() / sphinxSampleRate)));
-    bool noWordsRecognized = reinterpret_cast<ngram_search_t*>(decoder.search)->bpidx == 0;
-    if (noWordsRecognized) {
-        return result;
-    }
-
-    // Collect words
-    for (ps_seg_t* it = ps_seg_iter(&decoder); it; it = ps_seg_next(it)) {
-        const char* word = ps_seg_word(it);
-        int firstFrame, lastFrame;
-        ps_seg_frames(it, &firstFrame, &lastFrame);
-        result.set(centiseconds(firstFrame), centiseconds(lastFrame + 1), word);
-    }
-
-    return result;
-}
+bool dictionaryContains(dict_t& dictionary, const string& word) {
+    return dict_wordid(&dictionary, word.c_str()) != BAD_S3WID;
+}

s3wid_t getWordId(const string& word, dict_t& dictionary) {
-    s3wid_t wordId = dict_wordid(&dictionary, word.c_str());
+    const s3wid_t wordId = dict_wordid(&dictionary, word.c_str());
    if (wordId == BAD_S3WID) throw invalid_argument(fmt::format("Unknown word '{}'.", word));
    return wordId;
}

+void addMissingDictionaryWords(const vector<string>& words, ps_decoder_t& decoder) {
+    map<string, string> missingPronunciations;
+    for (const string& word : words) {
+        if (!dictionaryContains(*decoder.dict, word)) {
+            string pronunciation;
+            for (Phone phone : wordToPhones(word)) {
+                if (pronunciation.length() > 0) pronunciation += " ";
+                pronunciation += PhoneConverter::get().toString(phone);
+            }
+            missingPronunciations[word] = pronunciation;
+        }
+    }
+    for (auto it = missingPronunciations.begin(); it != missingPronunciations.end(); ++it) {
+        const bool isLast = it == --missingPronunciations.end();
+        logging::infoFormat("Unknown word '{}'. Guessing pronunciation '{}'.", it->first, it->second);
+        ps_add_word(&decoder, it->first.c_str(), it->second.c_str(), isLast);
+    }
+}
+
+lambda_unique_ptr<ngram_model_t> createDefaultLanguageModel(ps_decoder_t& decoder) {
+    path modelPath = getSphinxModelDirectory() / "en-us.lm.bin";
+    lambda_unique_ptr<ngram_model_t> result(
+        ngram_model_read(decoder.config, modelPath.string().c_str(), NGRAM_AUTO, decoder.lmath),
+        [](ngram_model_t* lm) { ngram_model_free(lm); });
+    if (!result) {
+        throw runtime_error(fmt::format("Error reading language model from {}.", modelPath));
+    }
+
+    return result;
+}
+
+lambda_unique_ptr<ngram_model_t> createDialogLanguageModel(ps_decoder_t& decoder, const string& dialog) {
+    // Split dialog into normalized words
+    vector<string> words = tokenizeText(dialog, [&](const string& word) { return dictionaryContains(*decoder.dict, word); });
+
+    // Add dialog-specific words to the dictionary
+    addMissingDictionaryWords(words, decoder);
+
+    // Create dialog-specific language model
+    words.insert(words.begin(), "<s>");
+    words.emplace_back("</s>");
+    return createLanguageModel(words, decoder);
+}
+
+lambda_unique_ptr<ngram_model_t> createBiasedLanguageModel(ps_decoder_t& decoder, const string& dialog) {
+    auto defaultLanguageModel = createDefaultLanguageModel(decoder);
+    auto dialogLanguageModel = createDialogLanguageModel(decoder, dialog);
+    constexpr int modelCount = 2;
+    array<ngram_model_t*, modelCount> languageModels{ defaultLanguageModel.get(), dialogLanguageModel.get() };
+    array<const char*, modelCount> modelNames{ "defaultLM", "dialogLM" };
+    array<float, modelCount> modelWeights{ 0.1f, 0.9f };
+    lambda_unique_ptr<ngram_model_t> result(
+        ngram_model_set_init(nullptr, languageModels.data(), const_cast<char**>(modelNames.data()), modelWeights.data(), modelCount),
+        [](ngram_model_t* lm) { ngram_model_free(lm); });
+    if (!result) {
+        throw runtime_error("Error creating biased language model.");
+    }
+
+    return result;
+}
+
+static lambda_unique_ptr<ps_decoder_t> createDecoder(optional<std::string> dialog) {
+    lambda_unique_ptr<cmd_ln_t> config(
+        cmd_ln_init(
+            nullptr, ps_args(), true,
+            // Set acoustic model
+            "-hmm", (getSphinxModelDirectory() / "acoustic-model").string().c_str(),
+            // Set pronunciation dictionary
+            "-dict", (getSphinxModelDirectory() / "cmudict-en-us.dict").string().c_str(),
+            // Add noise against zero silence (see http://cmusphinx.sourceforge.net/wiki/faq#qwhy_my_accuracy_is_poor)
+            "-dither", "yes",
+            // Disable VAD -- we're doing that ourselves
+            "-remove_silence", "no",
+            // Perform per-utterance cepstral mean normalization
+            "-cmn", "batch",
+            nullptr),
+        [](cmd_ln_t* config) { cmd_ln_free_r(config); });
+    if (!config) throw runtime_error("Error creating configuration.");
+
+    lambda_unique_ptr<ps_decoder_t> decoder(
+        ps_init(config.get()),
+        [](ps_decoder_t* recognizer) { ps_free(recognizer); });
+    if (!decoder) throw runtime_error("Error creating speech decoder.");
+
+    // Set language model
+    lambda_unique_ptr<ngram_model_t> languageModel(dialog
+        ? createBiasedLanguageModel(*decoder, *dialog)
+        : createDefaultLanguageModel(*decoder));
+    ps_set_lm(decoder.get(), "lm", languageModel.get());
+    ps_set_search(decoder.get(), "lm");
+
+    return decoder;
+}
+
optional<Timeline<Phone>> getPhoneAlignment(
    const vector<s3wid_t>& wordIds,
    const vector<int16_t>& audioBuffer,
@@ -178,7 +168,7 @@ optional<Timeline<Phone>> getPhoneAlignment(
    // Process entire audio clip
    const int16* nextSample = audioBuffer.data();
    size_t remainingSamples = audioBuffer.size();
-    bool fullUtterance = true;
+    const bool fullUtterance = true;
    while (acmod_process_raw(acousticModel, &nextSample, &remainingSamples, fullUtterance) > 0) {
        while (acousticModel->n_feat_frame > 0) {
            ps_search_step(search.get(), acousticModel->output_frame);
@@ -197,7 +187,7 @@ optional<Timeline<Phone>> getPhoneAlignment(
    for (ps_alignment_iter_t* it = ps_alignment_phones(alignment.get()); it; it = ps_alignment_iter_next(it)) {
        // Get phone
        ps_alignment_entry_t* phoneEntry = ps_alignment_iter_get(it);
-        s3cipid_t phoneId = phoneEntry->id.pid.cipid;
+        const s3cipid_t phoneId = phoneEntry->id.pid.cipid;
        string phoneName = phoneNames[phoneId];

        if (phoneName == "SIL") continue;
@@ -207,135 +197,15 @@ optional<Timeline<Phone>> getPhoneAlignment(
        centiseconds duration(phoneEntry->duration);
        Phone phone = PhoneConverter::get().parse(phoneName);
        if (phone == Phone::AH && duration < 6_cs) {
-            // Heuristic: < 6_cs is schwa. Pocketsphinx doesn't differentiate.
+            // Heuristic: < 6_cs is schwa. PocketSphinx doesn't differentiate.
            phone = Phone::Schwa;
        }
-        Timed<Phone> timedPhone(start, start + duration, phone);
+        const Timed<Phone> timedPhone(start, start + duration, phone);
        result.set(timedPhone);
    }
    return result;
}

-bool dictionaryContains(dict_t& dictionary, const string& word) {
-    return dict_wordid(&dictionary, word.c_str()) != BAD_S3WID;
-}
-
-void addMissingDictionaryWords(const vector<string>& words, ps_decoder_t& decoder) {
-    map<string, string> missingPronunciations;
-    for (const string& word : words) {
-        if (!dictionaryContains(*decoder.dict, word)) {
-            string pronunciation;
-            for (Phone phone : wordToPhones(word)) {
-                if (pronunciation.length() > 0) pronunciation += " ";
-                pronunciation += PhoneConverter::get().toString(phone);
-            }
-            missingPronunciations[word] = pronunciation;
-        }
-    }
-    for (auto it = missingPronunciations.begin(); it != missingPronunciations.end(); ++it) {
-        bool isLast = it == --missingPronunciations.end();
-        logging::infoFormat("Unknown word '{}'. Guessing pronunciation '{}'.", it->first, it->second);
-        ps_add_word(&decoder, it->first.c_str(), it->second.c_str(), isLast);
-    }
-}
-
-lambda_unique_ptr<ngram_model_t> createDefaultLanguageModel(ps_decoder_t& decoder) {
-    path modelPath = getSphinxModelDirectory() / "en-us.lm.bin";
-    lambda_unique_ptr<ngram_model_t> result(
-        ngram_model_read(decoder.config, modelPath.string().c_str(), NGRAM_AUTO, decoder.lmath),
-        [](ngram_model_t* lm) { ngram_model_free(lm); });
-    if (!result) {
-        throw runtime_error(fmt::format("Error reading language model from {}.", modelPath));
-    }
-
-    return std::move(result);
-}
-
-lambda_unique_ptr<ngram_model_t> createDialogLanguageModel(ps_decoder_t& decoder, const string& dialog) {
-    // Split dialog into normalized words
-    vector<string> words = tokenizeText(dialog, [&](const string& word) { return dictionaryContains(*decoder.dict, word); });
-
-    // Add dialog-specific words to the dictionary
-    addMissingDictionaryWords(words, decoder);
-
-    // Create dialog-specific language model
-    words.insert(words.begin(), "<s>");
-    words.push_back("</s>");
-    return createLanguageModel(words, decoder);
-}
-
-lambda_unique_ptr<ngram_model_t> createBiasedLanguageModel(ps_decoder_t& decoder, const string& dialog) {
-    auto defaultLanguageModel = createDefaultLanguageModel(decoder);
-    auto dialogLanguageModel = createDialogLanguageModel(decoder, dialog);
-    constexpr int modelCount = 2;
-    array<ngram_model_t*, modelCount> languageModels{ defaultLanguageModel.get(), dialogLanguageModel.get() };
-    array<char*, modelCount> modelNames{ "defaultLM", "dialogLM" };
-    array<float, modelCount> modelWeights{ 0.1f, 0.9f };
-    lambda_unique_ptr<ngram_model_t> result(
-        ngram_model_set_init(nullptr, languageModels.data(), modelNames.data(), modelWeights.data(), modelCount),
-        [](ngram_model_t* lm) { ngram_model_free(lm); });
-    if (!result) {
-        throw runtime_error("Error creating biased language model.");
-    }
-
-    return std::move(result);
-}
-
-lambda_unique_ptr<ps_decoder_t> createDecoder(optional<string> dialog) {
-    lambda_unique_ptr<cmd_ln_t> config(
-        cmd_ln_init(
-            nullptr, ps_args(), true,
-            // Set acoustic model
-            "-hmm", (getSphinxModelDirectory() / "acoustic-model").string().c_str(),
-            // Set pronunciation dictionary
-            "-dict", (getSphinxModelDirectory() / "cmudict-en-us.dict").string().c_str(),
-            // Add noise against zero silence (see http://cmusphinx.sourceforge.net/wiki/faq#qwhy_my_accuracy_is_poor)
-            "-dither", "yes",
-            // Disable VAD -- we're doing that ourselves
-            "-remove_silence", "no",
-            // Perform per-utterance cepstral mean normalization
-            "-cmn", "batch",
-            nullptr),
-        [](cmd_ln_t* config) { cmd_ln_free_r(config); });
-    if (!config) throw runtime_error("Error creating configuration.");
-
-    lambda_unique_ptr<ps_decoder_t> decoder(
-        ps_init(config.get()),
-        [](ps_decoder_t* recognizer) { ps_free(recognizer); });
-    if (!decoder) throw runtime_error("Error creating speech decoder.");
-
-    // Set language model
-    lambda_unique_ptr<ngram_model_t> languageModel(dialog
-        ? createBiasedLanguageModel(*decoder, *dialog)
-        : createDefaultLanguageModel(*decoder));
-    ps_set_lm(decoder.get(), "lm", languageModel.get());
-    ps_set_search(decoder.get(), "lm");
-
-    return decoder;
-}
-
-JoiningTimeline<void> getNoiseSounds(TimeRange utteranceTimeRange, const Timeline<Phone>& phones) {
-    JoiningTimeline<void> noiseSounds;
-
-    // Find utterance parts without recogniced phones
-    noiseSounds.set(utteranceTimeRange);
-    for (const auto& timedPhone : phones) {
-        noiseSounds.clear(timedPhone.getTimeRange());
-    }
-
-    // Remove undesired elements
-    const centiseconds minSoundDuration = 12_cs;
-    for (const auto& unknownSound : JoiningTimeline<void>(noiseSounds)) {
-        bool startsAtZero = unknownSound.getStart() == 0_cs;
-        bool tooShort = unknownSound.getDuration() < minSoundDuration;
-        if (startsAtZero || tooShort) {
-            noiseSounds.clear(unknownSound.getTimeRange());
-        }
-    }
-
-    return noiseSounds;
-}
-
// Some words have multiple pronunciations, one of which results in better animation than the others.
// This function returns the optimal pronunciation for a select set of these words.
string fixPronunciation(const string& word) {
@@ -352,17 +222,17 @@ string fixPronunciation(const string& word) {
    return pair != replacements.end() ? pair->second : word;
}

-Timeline<Phone> utteranceToPhones(
+static Timeline<Phone> utteranceToPhones(
    const AudioClip& audioClip,
    TimeRange utteranceTimeRange,
    ps_decoder_t& decoder,
-    ProgressSink& utteranceProgressSink)
-{
+    ProgressSink& utteranceProgressSink
+) {
    ProgressMerger utteranceProgressMerger(utteranceProgressSink);
    ProgressSink& wordRecognitionProgressSink = utteranceProgressMerger.addSink(1.0);
    ProgressSink& alignmentProgressSink = utteranceProgressMerger.addSink(0.5);

-    // Pad time range to give Pocketsphinx some breathing room
+    // Pad time range to give PocketSphinx some breathing room
    TimeRange paddedTimeRange = utteranceTimeRange;
    const centiseconds padding(3);
    paddedTimeRange.grow(padding);
@@ -384,7 +254,7 @@ Timeline<Phone> utteranceToPhones(
            continue;
        }
        word = regex_replace(word, regex("\\(\\d\\)"), "");
-        if (text.size() > 0) {
+        if (!text.empty()) {
            text += " ";
        }
        text += word;
@@ -433,77 +303,11 @@ Timeline<Phone> utteranceToPhones(
    return utterancePhones;
}

-BoundedTimeline<Phone> recognizePhones(
+BoundedTimeline<Phone> PocketSphinxRecognizer::recognizePhones(
    const AudioClip& inputAudioClip,
-    optional<string> dialog,
+    optional<std::string> dialog,
    int maxThreadCount,
-    ProgressSink& progressSink)
-{
-    ProgressMerger totalProgressMerger(progressSink);
-    ProgressSink& voiceActivationProgressSink = totalProgressMerger.addSink(1.0);
-    ProgressSink& dialogProgressSink = totalProgressMerger.addSink(15);
-
-    // Make sure audio stream has no DC offset
-    const unique_ptr<AudioClip> audioClip = inputAudioClip.clone() | removeDcOffset();
-
-    // Split audio into utterances
-    JoiningBoundedTimeline<void> utterances;
-    try {
-        utterances = detectVoiceActivity(*audioClip, maxThreadCount, voiceActivationProgressSink);
-    }
-    catch (...) {
-        std::throw_with_nested(runtime_error("Error detecting segments of speech."));
-    }
-
-    // Discard Pocketsphinx output
-    err_set_logfp(nullptr);
-
-    // Redirect Pocketsphinx output to log
-    err_set_callback(sphinxLogCallback, nullptr);
-
-    // Prepare pool of decoders
-    ObjectPool<ps_decoder_t, lambda_unique_ptr<ps_decoder_t>> decoderPool(
-        [&dialog] { return createDecoder(dialog); });
-
-    BoundedTimeline<Phone> phones(audioClip->getTruncatedRange());
-    std::mutex resultMutex;
-    auto processUtterance = [&](Timed<void> timedUtterance, ProgressSink& utteranceProgressSink) {
-        // Detect phones for utterance
-        auto decoder = decoderPool.acquire();
-        Timeline<Phone> utterancePhones =
-            utteranceToPhones(*audioClip, timedUtterance.getTimeRange(), *decoder, utteranceProgressSink);
-
-        // Copy phones to result timeline
-        std::lock_guard<std::mutex> lock(resultMutex);
-        for (const auto& timedPhone : utterancePhones) {
-            phones.set(timedPhone);
-        }
-    };
-
-    auto getUtteranceProgressWeight = [](const Timed<void> timedUtterance) {
-        return timedUtterance.getDuration().count();
-    };
-
-    // Perform speech recognition
-    try {
-        // Determine how many parallel threads to use
-        int threadCount = std::min({
-            maxThreadCount,
-            // Don't use more threads than there are utterances to be processed
-            static_cast<int>(utterances.size()),
-            // Don't waste time creating additional threads (and decoders!) if the recording is short
-            static_cast<int>(duration_cast<std::chrono::seconds>(audioClip->getTruncatedRange().getDuration()).count() / 5)
-        });
-        if (threadCount < 1) {
-            threadCount = 1;
-        }
-        logging::debugFormat("Speech recognition using {} threads -- start", threadCount);
-        runParallel(processUtterance, utterances, threadCount, dialogProgressSink, getUtteranceProgressWeight);
-        logging::debug("Speech recognition -- end");
-    }
-    catch (...) {
-        std::throw_with_nested(runtime_error("Error performing speech recognition via Pocketsphinx."));
-    }
-
-    return phones;
+    ProgressSink& progressSink
+) const {
+    return ::recognizePhones(inputAudioClip, dialog, &createDecoder, &utteranceToPhones, maxThreadCount, progressSink);
}

View File

@ -0,0 +1,14 @@
#pragma once

#include "Recognizer.h"
#include "pocketSphinxTools.h"

class PocketSphinxRecognizer : public Recognizer {
public:
    BoundedTimeline<Phone> recognizePhones(
        const AudioClip& inputAudioClip,
        boost::optional<std::string> dialog,
        int maxThreadCount,
        ProgressSink& progressSink
    ) const override;
};

View File

@ -0,0 +1,18 @@
#pragma once

#include "audio/AudioClip.h"
#include "core/Phone.h"
#include "tools/ProgressBar.h"
#include "time/BoundedTimeline.h"

class Recognizer {
public:
    virtual ~Recognizer() = default;

    virtual BoundedTimeline<Phone> recognizePhones(
        const AudioClip& audioClip,
        boost::optional<std::string> dialog,
        int maxThreadCount,
        ProgressSink& progressSink
    ) const = 0;
};
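
Any phone-recognition engine can be plugged in behind this interface. As a sketch, a hypothetical additional recognizer (not part of this commit) would only need to override recognizePhones and return a timeline covering the clip's range:

    class SilentRecognizer : public Recognizer {
    public:
        BoundedTimeline<Phone> recognizePhones(
            const AudioClip& audioClip,
            boost::optional<std::string> dialog,
            int maxThreadCount,
            ProgressSink& progressSink
        ) const override {
            // Report completion and return an empty timeline spanning the clip
            progressSink.reportProgress(1.0);
            return BoundedTimeline<Phone>(audioClip.getTruncatedRange());
        }
    };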

View File

@ -1,12 +0,0 @@
#pragma once

#include "audio/AudioClip.h"
#include "core/Phone.h"
#include "tools/ProgressBar.h"
#include "time/BoundedTimeline.h"

BoundedTimeline<Phone> recognizePhones(
    const AudioClip& audioClip,
    boost::optional<std::string> dialog,
    int maxThreadCount,
    ProgressSink& progressSink);

View File

@ -0,0 +1,218 @@
#include "pocketSphinxTools.h"
#include "tools/platformTools.h"
#include <regex>
#include "audio/DcOffset.h"
#include "audio/voiceActivityDetection.h"
#include "tools/parallel.h"
#include "tools/ObjectPool.h"
#include "time/timedLogging.h"

extern "C" {
#include <sphinxbase/err.h>
#include <pocketsphinx_internal.h>
#include <ngram_search.h>
}

using std::runtime_error;
using std::invalid_argument;
using std::unique_ptr;
using std::string;
using std::vector;
using boost::filesystem::path;
using std::regex;
using boost::optional;
using std::chrono::duration_cast;

logging::Level convertSphinxErrorLevel(err_lvl_t errorLevel) {
    switch (errorLevel) {
    case ERR_DEBUG:
    case ERR_INFO:
    case ERR_INFOCONT:
        return logging::Level::Trace;
    case ERR_WARN:
        return logging::Level::Warn;
    case ERR_ERROR:
        return logging::Level::Error;
    case ERR_FATAL:
        return logging::Level::Fatal;
    default:
        throw invalid_argument("Unknown log level.");
    }
}

void sphinxLogCallback(void* user_data, err_lvl_t errorLevel, const char* format, ...) {
    UNUSED(user_data);

    // Create varArgs list
    va_list args;
    va_start(args, format);
    auto _ = gsl::finally([&args]() { va_end(args); });

    // Format message
    const int initialSize = 256;
    vector<char> chars(initialSize);
    bool success = false;
    while (!success) {
        const int charsWritten = vsnprintf(chars.data(), chars.size(), format, args);
        if (charsWritten < 0) throw runtime_error("Error formatting PocketSphinx log message.");
        success = charsWritten < static_cast<int>(chars.size());
        if (!success) chars.resize(chars.size() * 2);
    }
    const regex waste("^(DEBUG|INFO|INFOCONT|WARN|ERROR|FATAL): ");
    string message = std::regex_replace(chars.data(), waste, "", std::regex_constants::format_first_only);
    boost::algorithm::trim(message);

    const logging::Level logLevel = convertSphinxErrorLevel(errorLevel);
    logging::log(logLevel, message);
}

void redirectPocketSphinxOutput() {
    static bool redirected = false;
    if (redirected) return;

    // Discard PocketSphinx output
    err_set_logfp(nullptr);

    // Redirect PocketSphinx output to log
    err_set_callback(sphinxLogCallback, nullptr);

    redirected = true;
}

BoundedTimeline<Phone> recognizePhones(
    const AudioClip& inputAudioClip,
    optional<std::string> dialog,
    decoderFactory createDecoder,
    utteranceToPhonesFunction utteranceToPhones,
    int maxThreadCount,
    ProgressSink& progressSink
) {
    ProgressMerger totalProgressMerger(progressSink);
    ProgressSink& voiceActivationProgressSink = totalProgressMerger.addSink(1.0);
    ProgressSink& dialogProgressSink = totalProgressMerger.addSink(15);

    // Make sure audio stream has no DC offset
    const unique_ptr<AudioClip> audioClip = inputAudioClip.clone() | removeDcOffset();

    // Split audio into utterances
    JoiningBoundedTimeline<void> utterances;
    try {
        utterances = detectVoiceActivity(*audioClip, maxThreadCount, voiceActivationProgressSink);
    } catch (...) {
        std::throw_with_nested(runtime_error("Error detecting segments of speech."));
    }

    redirectPocketSphinxOutput();

    // Prepare pool of decoders
    ObjectPool<ps_decoder_t, lambda_unique_ptr<ps_decoder_t>> decoderPool(
        [&] { return createDecoder(dialog); });

    BoundedTimeline<Phone> phones(audioClip->getTruncatedRange());
    std::mutex resultMutex;
    const auto processUtterance = [&](Timed<void> timedUtterance, ProgressSink& utteranceProgressSink) {
        // Detect phones for utterance
        const auto decoder = decoderPool.acquire();
        Timeline<Phone> utterancePhones =
            utteranceToPhones(*audioClip, timedUtterance.getTimeRange(), *decoder, utteranceProgressSink);

        // Copy phones to result timeline
        std::lock_guard<std::mutex> lock(resultMutex);
        for (const auto& timedPhone : utterancePhones) {
            phones.set(timedPhone);
        }
    };

    const auto getUtteranceProgressWeight = [](const Timed<void> timedUtterance) {
        return timedUtterance.getDuration().count();
    };

    // Perform speech recognition
    try {
        // Determine how many parallel threads to use
        int threadCount = std::min({
            maxThreadCount,
            // Don't use more threads than there are utterances to be processed
            static_cast<int>(utterances.size()),
            // Don't waste time creating additional threads (and decoders!) if the recording is short
            static_cast<int>(duration_cast<std::chrono::seconds>(audioClip->getTruncatedRange().getDuration()).count() / 5)
        });
        if (threadCount < 1) {
            threadCount = 1;
        }
        logging::debugFormat("Speech recognition using {} threads -- start", threadCount);
        runParallel(processUtterance, utterances, threadCount, dialogProgressSink, getUtteranceProgressWeight);
        logging::debug("Speech recognition -- end");
    } catch (...) {
        std::throw_with_nested(runtime_error("Error performing speech recognition via PocketSphinx."));
    }

    return phones;
}

const path& getSphinxModelDirectory() {
    static path sphinxModelDirectory(getBinDirectory() / "res" / "sphinx");
    return sphinxModelDirectory;
}

JoiningTimeline<void> getNoiseSounds(TimeRange utteranceTimeRange, const Timeline<Phone>& phones) {
    JoiningTimeline<void> noiseSounds;

    // Find utterance parts without recognized phones
    noiseSounds.set(utteranceTimeRange);
    for (const auto& timedPhone : phones) {
        noiseSounds.clear(timedPhone.getTimeRange());
    }

    // Remove undesired elements
    const centiseconds minSoundDuration = 12_cs;
    for (const auto& unknownSound : JoiningTimeline<void>(noiseSounds)) {
        const bool startsAtZero = unknownSound.getStart() == 0_cs;
        const bool tooShort = unknownSound.getDuration() < minSoundDuration;
        if (startsAtZero || tooShort) {
            noiseSounds.clear(unknownSound.getTimeRange());
        }
    }

    return noiseSounds;
}

BoundedTimeline<string> recognizeWords(const vector<int16_t>& audioBuffer, ps_decoder_t& decoder) {
    // Restart timing at 0
    ps_start_stream(&decoder);

    // Start recognition
    int error = ps_start_utt(&decoder);
    if (error) throw runtime_error("Error starting utterance processing for word recognition.");

    // Process entire audio clip
    const bool noRecognition = false;
    const bool fullUtterance = true;
    const int searchedFrameCount =
        ps_process_raw(&decoder, audioBuffer.data(), audioBuffer.size(), noRecognition, fullUtterance);
    if (searchedFrameCount < 0) {
        throw runtime_error("Error analyzing raw audio data for word recognition.");
    }

    // End recognition
    error = ps_end_utt(&decoder);
    if (error) throw runtime_error("Error ending utterance processing for word recognition.");

    BoundedTimeline<string> result(TimeRange(0_cs, centiseconds(100 * audioBuffer.size() / sphinxSampleRate)));
    const bool noWordsRecognized = reinterpret_cast<ngram_search_t*>(decoder.search)->bpidx == 0;
    if (noWordsRecognized) {
        return result;
    }

    // Collect words
    for (ps_seg_t* it = ps_seg_iter(&decoder); it; it = ps_seg_next(it)) {
        const char* word = ps_seg_word(it);
        int firstFrame, lastFrame;
        ps_seg_frames(it, &firstFrame, &lastFrame);
        result.set(centiseconds(firstFrame), centiseconds(lastFrame + 1), word);
    }

    return result;
}
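
As a worked example of the thread-count heuristic above: a 30-second recording with four detected utterances and a maxThreadCount of 8 yields min(8, 4, 30 / 5) = 4 threads, while a clip shorter than five seconds makes the duration term 0, and the final clamp raises the count back to 1.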

View File

@ -0,0 +1,39 @@
#pragma once

#include "time/BoundedTimeline.h"
#include "core/Phone.h"
#include "audio/AudioClip.h"
#include "tools/ProgressBar.h"
#include <boost/filesystem/path.hpp>

extern "C" {
#include <pocketsphinx.h>
}

typedef std::function<lambda_unique_ptr<ps_decoder_t>(
    boost::optional<std::string> dialog
)> decoderFactory;

typedef std::function<Timeline<Phone>(
    const AudioClip& audioClip,
    TimeRange utteranceTimeRange,
    ps_decoder_t& decoder,
    ProgressSink& utteranceProgressSink
)> utteranceToPhonesFunction;

BoundedTimeline<Phone> recognizePhones(
    const AudioClip& inputAudioClip,
    boost::optional<std::string> dialog,
    decoderFactory createDecoder,
    utteranceToPhonesFunction utteranceToPhones,
    int maxThreadCount,
    ProgressSink& progressSink
);

constexpr int sphinxSampleRate = 16000;

const boost::filesystem::path& getSphinxModelDirectory();

JoiningTimeline<void> getNoiseSounds(TimeRange utteranceTimeRange, const Timeline<Phone>& phones);

BoundedTimeline<std::string> recognizeWords(const std::vector<int16_t>& audioBuffer, ps_decoder_t& decoder);

View File

@ -0,0 +1,27 @@
#include "RecognizerType.h"

using std::string;

RecognizerTypeConverter& RecognizerTypeConverter::get() {
    static RecognizerTypeConverter converter;
    return converter;
}

string RecognizerTypeConverter::getTypeName() {
    return "RecognizerType";
}

EnumConverter<RecognizerType>::member_data RecognizerTypeConverter::getMemberData() {
    return member_data{
        { RecognizerType::PocketSphinx, "pocketSphinx" },
        { RecognizerType::Phonetic, "phonetic" }
    };
}

std::ostream& operator<<(std::ostream& stream, RecognizerType value) {
    return RecognizerTypeConverter::get().write(stream, value);
}

std::istream& operator>>(std::istream& stream, RecognizerType& value) {
    return RecognizerTypeConverter::get().read(stream, value);
}

View File

@ -0,0 +1,20 @@
#pragma once

#include "tools/EnumConverter.h"

enum class RecognizerType {
    PocketSphinx,
    Phonetic
};

class RecognizerTypeConverter : public EnumConverter<RecognizerType> {
public:
    static RecognizerTypeConverter& get();

protected:
    std::string getTypeName() override;
    member_data getMemberData() override;
};

std::ostream& operator<<(std::ostream& stream, RecognizerType value);
std::istream& operator>>(std::istream& stream, RecognizerType& value);
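
These stream operators let TCLAP treat RecognizerType as a ValueLike argument (see main.cpp below). A small sketch of the round trip, assuming EnumConverter's read and write use the names from getMemberData():

    #include <iostream>
    #include <sstream>
    #include "RecognizerType.h"

    int main() {
        std::istringstream input("phonetic");
        RecognizerType type;
        input >> type;              // type == RecognizerType::Phonetic
        std::cout << type << "\n";  // prints "phonetic"
    }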

View File

@@ -27,6 +27,9 @@
#include "tools/platformTools.h"
#include "sinks.h"
#include "semanticEntries.h"
+#include "RecognizerType.h"
+#include "recognition/PocketSphinxRecognizer.h"
+#include "recognition/PhoneticRecognizer.h"

using std::exception;
using std::string;
@@ -36,9 +39,6 @@ using std::unique_ptr;
using std::make_unique;
using std::shared_ptr;
using std::make_shared;
-using std::map;
-using std::chrono::duration;
-using std::chrono::duration_cast;
using std::ofstream;
using boost::filesystem::path;
using boost::adaptors::transformed;
@@ -56,6 +56,10 @@ namespace TCLAP {
    struct ArgTraits<ExportFormat> {
        typedef ValueLike ValueCategory;
    };
+    template<>
+    struct ArgTraits<RecognizerType> {
+        typedef ValueLike ValueCategory;
+    };
}

shared_ptr<logging::Sink> createFileSink(path path, logging::Level minLevel) {
@@ -66,6 +70,17 @@ shared_ptr<logging::Sink> createFileSink(path path, logging::Level minLevel) {
    return make_shared<logging::LevelFilter>(FileSink, minLevel);
}

+unique_ptr<Recognizer> createRecognizer(RecognizerType recognizerType) {
+    switch (recognizerType) {
+    case RecognizerType::PocketSphinx:
+        return make_unique<PocketSphinxRecognizer>();
+    case RecognizerType::Phonetic:
+        return make_unique<PhoneticRecognizer>();
+    default:
+        throw std::runtime_error("Unknown recognizer.");
+    }
+}
+
unique_ptr<Exporter> createExporter(ExportFormat exportFormat) {
    switch (exportFormat) {
    case ExportFormat::Tsv:
@@ -123,6 +138,9 @@ int main(int platformArgc, char *platformArgv[]) {
    auto exportFormats = vector<ExportFormat>(ExportFormatConverter::get().getValues());
    tclap::ValuesConstraint<ExportFormat> exportFormatConstraint(exportFormats);
    tclap::ValueArg<ExportFormat> exportFormat("f", "exportFormat", "The export format.", false, ExportFormat::Tsv, &exportFormatConstraint, cmd);
+    auto recognizerTypes = vector<RecognizerType>(RecognizerTypeConverter::get().getValues());
+    tclap::ValuesConstraint<RecognizerType> recognizerConstraint(recognizerTypes);
+    tclap::ValueArg<RecognizerType> recognizerType("r", "recognizer", "The dialog recognizer.", false, RecognizerType::PocketSphinx, &recognizerConstraint, cmd);
    tclap::UnlabeledValueArg<string> inputFileName("inputFile", "The input file. Must be a sound file in WAVE format.", true, "", "string", cmd);

    try {
@@ -169,6 +187,7 @@ int main(int platformArgc, char *platformArgv[]) {
    JoiningContinuousTimeline<Shape> animation = animateWaveFile(
        inputFilePath,
        dialogFile.isSet() ? readUtf8File(path(dialogFile.getValue())) : boost::optional<string>(),
+        *createRecognizer(recognizerType.getValue()),
        targetShapeSet,
        maxThreadCount.getValue(),
        progressSink);
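
With these changes in place, the recognizer is selectable on the command line via the new -r / --recognizer option (for example, rhubarb -r phonetic input.wav); pocketSphinx remains the default.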