Implement generic concept of recognizers with options pocketSphinx and phonetic
This commit is contained in:
parent
3ed38ada2f
commit
610f490046
|
@ -413,8 +413,13 @@ add_library(rhubarb-recognition
|
||||||
src/recognition/g2p.h
|
src/recognition/g2p.h
|
||||||
src/recognition/languageModels.cpp
|
src/recognition/languageModels.cpp
|
||||||
src/recognition/languageModels.h
|
src/recognition/languageModels.h
|
||||||
src/recognition/phoneRecognition.cpp
|
src/recognition/PhoneticRecognizer.cpp
|
||||||
src/recognition/phoneRecognition.h
|
src/recognition/PhoneticRecognizer.h
|
||||||
|
src/recognition/PocketSphinxRecognizer.cpp
|
||||||
|
src/recognition/PocketSphinxRecognizer.h
|
||||||
|
src/recognition/pocketSphinxTools.cpp
|
||||||
|
src/recognition/pocketSphinxTools.h
|
||||||
|
src/recognition/Recognizer.h
|
||||||
src/recognition/tokenization.cpp
|
src/recognition/tokenization.cpp
|
||||||
src/recognition/tokenization.h
|
src/recognition/tokenization.h
|
||||||
)
|
)
|
||||||
|
@ -487,6 +492,8 @@ add_executable(rhubarb
|
||||||
src/rhubarb/main.cpp
|
src/rhubarb/main.cpp
|
||||||
src/rhubarb/ExportFormat.cpp
|
src/rhubarb/ExportFormat.cpp
|
||||||
src/rhubarb/ExportFormat.h
|
src/rhubarb/ExportFormat.h
|
||||||
|
src/rhubarb/RecognizerType.cpp
|
||||||
|
src/rhubarb/RecognizerType.h
|
||||||
src/rhubarb/semanticEntries.cpp
|
src/rhubarb/semanticEntries.cpp
|
||||||
src/rhubarb/semanticEntries.h
|
src/rhubarb/semanticEntries.h
|
||||||
src/rhubarb/sinks.cpp
|
src/rhubarb/sinks.cpp
|
||||||
|
|
|
@ -1,7 +1,12 @@
|
||||||
<wpf:ResourceDictionary xml:space="preserve" xmlns:x="http://schemas.microsoft.com/winfx/2006/xaml" xmlns:s="clr-namespace:System;assembly=mscorlib" xmlns:ss="urn:shemas-jetbrains-com:settings-storage-xaml" xmlns:wpf="http://schemas.microsoft.com/winfx/2006/xaml/presentation">
|
<wpf:ResourceDictionary xml:space="preserve" xmlns:x="http://schemas.microsoft.com/winfx/2006/xaml" xmlns:s="clr-namespace:System;assembly=mscorlib" xmlns:ss="urn:shemas-jetbrains-com:settings-storage-xaml" xmlns:wpf="http://schemas.microsoft.com/winfx/2006/xaml/presentation">
|
||||||
|
<s:String x:Key="/Default/CodeInspection/Highlighting/InspectionSeverities/=CppClangTidyModernizeRawStringLiteral/@EntryIndexedValue">HINT</s:String>
|
||||||
|
|
||||||
<s:String x:Key="/Default/CodeInspection/Highlighting/InspectionSeverities/=CppFunctionDoesntReturnValue/@EntryIndexedValue">ERROR</s:String>
|
<s:String x:Key="/Default/CodeInspection/Highlighting/InspectionSeverities/=CppFunctionDoesntReturnValue/@EntryIndexedValue">ERROR</s:String>
|
||||||
|
|
||||||
<s:String x:Key="/Default/CodeInspection/Highlighting/InspectionSeverities/=LocalizableElement/@EntryIndexedValue">DO_NOT_SHOW</s:String>
|
<s:String x:Key="/Default/CodeInspection/Highlighting/InspectionSeverities/=LocalizableElement/@EntryIndexedValue">DO_NOT_SHOW</s:String>
|
||||||
|
|
||||||
<s:String x:Key="/Default/CodeStyle/CodeFormatting/CommonFormatter/ALIGNMENT_TAB_FILL_STYLE/@EntryValue">USE_TABS_ONLY</s:String>
|
<s:String x:Key="/Default/CodeStyle/CodeFormatting/CommonFormatter/ALIGNMENT_TAB_FILL_STYLE/@EntryValue">USE_TABS_ONLY</s:String>
|
||||||
|
<s:String x:Key="/Default/CodeStyle/CodeFormatting/CppFormatting/ALIGNMENT_TAB_FILL_STYLE/@EntryValue">USE_TABS_ONLY</s:String>
|
||||||
<s:Boolean x:Key="/Default/CodeStyle/CodeFormatting/CppFormatting/ALIGN_MULTILINE_ARGUMENT/@EntryValue">False</s:Boolean>
|
<s:Boolean x:Key="/Default/CodeStyle/CodeFormatting/CppFormatting/ALIGN_MULTILINE_ARGUMENT/@EntryValue">False</s:Boolean>
|
||||||
<s:Boolean x:Key="/Default/CodeStyle/CodeFormatting/CppFormatting/ALIGN_MULTILINE_CALLS_CHAIN/@EntryValue">False</s:Boolean>
|
<s:Boolean x:Key="/Default/CodeStyle/CodeFormatting/CppFormatting/ALIGN_MULTILINE_CALLS_CHAIN/@EntryValue">False</s:Boolean>
|
||||||
<s:Boolean x:Key="/Default/CodeStyle/CodeFormatting/CppFormatting/ALIGN_MULTILINE_CTOR_INIT/@EntryValue">False</s:Boolean>
|
<s:Boolean x:Key="/Default/CodeStyle/CodeFormatting/CppFormatting/ALIGN_MULTILINE_CTOR_INIT/@EntryValue">False</s:Boolean>
|
||||||
|
@ -29,6 +34,7 @@
|
||||||
<s:String x:Key="/Default/CodeStyle/CodeFormatting/CppFormatting/WRAP_ENUMERATION_STYLE/@EntryValue">CHOP_ALWAYS</s:String>
|
<s:String x:Key="/Default/CodeStyle/CodeFormatting/CppFormatting/WRAP_ENUMERATION_STYLE/@EntryValue">CHOP_ALWAYS</s:String>
|
||||||
<s:String x:Key="/Default/CodeStyle/CodeFormatting/CSharpFormat/ACCESSOR_DECLARATION_BRACES/@EntryValue">END_OF_LINE</s:String>
|
<s:String x:Key="/Default/CodeStyle/CodeFormatting/CSharpFormat/ACCESSOR_DECLARATION_BRACES/@EntryValue">END_OF_LINE</s:String>
|
||||||
<s:String x:Key="/Default/CodeStyle/CodeFormatting/CSharpFormat/ACCESSOR_OWNER_DECLARATION_BRACES/@EntryValue">END_OF_LINE</s:String>
|
<s:String x:Key="/Default/CodeStyle/CodeFormatting/CSharpFormat/ACCESSOR_OWNER_DECLARATION_BRACES/@EntryValue">END_OF_LINE</s:String>
|
||||||
|
<s:String x:Key="/Default/CodeStyle/CodeFormatting/CSharpFormat/ALIGNMENT_TAB_FILL_STYLE/@EntryValue">USE_TABS_ONLY</s:String>
|
||||||
<s:Boolean x:Key="/Default/CodeStyle/CodeFormatting/CSharpFormat/ALIGN_MULTILINE_BINARY_EXPRESSIONS_CHAIN/@EntryValue">False</s:Boolean>
|
<s:Boolean x:Key="/Default/CodeStyle/CodeFormatting/CSharpFormat/ALIGN_MULTILINE_BINARY_EXPRESSIONS_CHAIN/@EntryValue">False</s:Boolean>
|
||||||
<s:String x:Key="/Default/CodeStyle/CodeFormatting/CSharpFormat/ANONYMOUS_METHOD_DECLARATION_BRACES/@EntryValue">END_OF_LINE</s:String>
|
<s:String x:Key="/Default/CodeStyle/CodeFormatting/CSharpFormat/ANONYMOUS_METHOD_DECLARATION_BRACES/@EntryValue">END_OF_LINE</s:String>
|
||||||
<s:String x:Key="/Default/CodeStyle/CodeFormatting/CSharpFormat/CASE_BLOCK_BRACES/@EntryValue">END_OF_LINE</s:String>
|
<s:String x:Key="/Default/CodeStyle/CodeFormatting/CSharpFormat/CASE_BLOCK_BRACES/@EntryValue">END_OF_LINE</s:String>
|
||||||
|
@ -44,6 +50,14 @@
|
||||||
<s:Boolean x:Key="/Default/CodeStyle/CodeFormatting/CSharpFormat/USE_INDENT_FROM_VS/@EntryValue">False</s:Boolean>
|
<s:Boolean x:Key="/Default/CodeStyle/CodeFormatting/CSharpFormat/USE_INDENT_FROM_VS/@EntryValue">False</s:Boolean>
|
||||||
<s:Boolean x:Key="/Default/CodeStyle/CodeFormatting/CSharpFormat/WRAP_BEFORE_BINARY_OPSIGN/@EntryValue">True</s:Boolean>
|
<s:Boolean x:Key="/Default/CodeStyle/CodeFormatting/CSharpFormat/WRAP_BEFORE_BINARY_OPSIGN/@EntryValue">True</s:Boolean>
|
||||||
<s:Boolean x:Key="/Default/CodeStyle/CodeFormatting/CSharpFormat/WRAP_LINES/@EntryValue">False</s:Boolean>
|
<s:Boolean x:Key="/Default/CodeStyle/CodeFormatting/CSharpFormat/WRAP_LINES/@EntryValue">False</s:Boolean>
|
||||||
|
<s:String x:Key="/Default/CodeStyle/CodeFormatting/CssFormatter/ALIGNMENT_TAB_FILL_STYLE/@EntryValue">USE_TABS_ONLY</s:String>
|
||||||
|
<s:String x:Key="/Default/CodeStyle/CodeFormatting/HtmlFormatter/ALIGNMENT_TAB_FILL_STYLE/@EntryValue">USE_TABS_ONLY</s:String>
|
||||||
|
<s:String x:Key="/Default/CodeStyle/CodeFormatting/JavaScriptCodeFormatting/ALIGNMENT_TAB_FILL_STYLE/@EntryValue">USE_TABS_ONLY</s:String>
|
||||||
|
<s:String x:Key="/Default/CodeStyle/CodeFormatting/ProtobufCodeFormatting/ALIGNMENT_TAB_FILL_STYLE/@EntryValue">USE_TABS_ONLY</s:String>
|
||||||
|
<s:String x:Key="/Default/CodeStyle/CodeFormatting/ResxFormatter/ALIGNMENT_TAB_FILL_STYLE/@EntryValue">USE_TABS_ONLY</s:String>
|
||||||
|
<s:String x:Key="/Default/CodeStyle/CodeFormatting/VBFormat/ALIGNMENT_TAB_FILL_STYLE/@EntryValue">USE_TABS_ONLY</s:String>
|
||||||
|
<s:String x:Key="/Default/CodeStyle/CodeFormatting/XmlDocFormatter/ALIGNMENT_TAB_FILL_STYLE/@EntryValue">USE_TABS_ONLY</s:String>
|
||||||
|
<s:String x:Key="/Default/CodeStyle/CodeFormatting/XmlFormatter/ALIGNMENT_TAB_FILL_STYLE/@EntryValue">USE_TABS_ONLY</s:String>
|
||||||
<s:String x:Key="/Default/CodeStyle/CSharpVarKeywordUsage/ForBuiltInTypes/@EntryValue">UseExplicitType</s:String>
|
<s:String x:Key="/Default/CodeStyle/CSharpVarKeywordUsage/ForBuiltInTypes/@EntryValue">UseExplicitType</s:String>
|
||||||
<s:String x:Key="/Default/CodeStyle/CSharpVarKeywordUsage/ForSimpleTypes/@EntryValue">UseVarWhenEvident</s:String>
|
<s:String x:Key="/Default/CodeStyle/CSharpVarKeywordUsage/ForSimpleTypes/@EntryValue">UseVarWhenEvident</s:String>
|
||||||
<s:String x:Key="/Default/CodeStyle/Naming/CppNaming/Rules/=Class_0020and_0020struct_0020fields/@EntryIndexedValue"><NamingElement Priority="10"><Descriptor Static="Indeterminate" Constexpr="Indeterminate" Const="Indeterminate" Volatile="Indeterminate" Accessibility="NOT_APPLICABLE"><type Name="class field" /><type Name="struct field" /></Descriptor><Policy Inspect="True" Prefix="" Suffix="_" Style="aaBb" /></NamingElement></s:String>
|
<s:String x:Key="/Default/CodeStyle/Naming/CppNaming/Rules/=Class_0020and_0020struct_0020fields/@EntryIndexedValue"><NamingElement Priority="10"><Descriptor Static="Indeterminate" Constexpr="Indeterminate" Const="Indeterminate" Volatile="Indeterminate" Accessibility="NOT_APPLICABLE"><type Name="class field" /><type Name="struct field" /></Descriptor><Policy Inspect="True" Prefix="" Suffix="_" Style="aaBb" /></NamingElement></s:String>
|
||||||
|
@ -108,7 +122,16 @@
|
||||||
<s:String x:Key="/Default/Environment/Hierarchy/PsiConfigurationSettingsKey/CustomLocation/@EntryValue">C:\Users\Daniel\AppData\Local\JetBrains\Transient\ReSharperPlatformVs14\v09\SolutionCaches</s:String>
|
<s:String x:Key="/Default/Environment/Hierarchy/PsiConfigurationSettingsKey/CustomLocation/@EntryValue">C:\Users\Daniel\AppData\Local\JetBrains\Transient\ReSharperPlatformVs14\v09\SolutionCaches</s:String>
|
||||||
<s:Boolean x:Key="/Default/Environment/SettingsMigration/IsMigratorApplied/=JetBrains_002EReSharper_002EFeature_002EServices_002ECpp_002ECodeStyle_002ESettingsUpgrade_002EFunctionReturnStyleSettingsUpgrader/@EntryIndexedValue">True</s:Boolean>
|
<s:Boolean x:Key="/Default/Environment/SettingsMigration/IsMigratorApplied/=JetBrains_002EReSharper_002EFeature_002EServices_002ECpp_002ECodeStyle_002ESettingsUpgrade_002EFunctionReturnStyleSettingsUpgrader/@EntryIndexedValue">True</s:Boolean>
|
||||||
<s:Boolean x:Key="/Default/Environment/SettingsMigration/IsMigratorApplied/=JetBrains_002EReSharper_002EFeature_002EServices_002ECpp_002ECodeStyle_002ESettingsUpgrade_002ENamespaceIndentationSettingsUpgrader/@EntryIndexedValue">True</s:Boolean>
|
<s:Boolean x:Key="/Default/Environment/SettingsMigration/IsMigratorApplied/=JetBrains_002EReSharper_002EFeature_002EServices_002ECpp_002ECodeStyle_002ESettingsUpgrade_002ENamespaceIndentationSettingsUpgrader/@EntryIndexedValue">True</s:Boolean>
|
||||||
|
<s:Boolean x:Key="/Default/Environment/SettingsMigration/IsMigratorApplied/=JetBrains_002EReSharper_002EPsi_002ECSharp_002ECodeStyle_002ECSharpKeepExistingMigration/@EntryIndexedValue">True</s:Boolean>
|
||||||
|
<s:Boolean x:Key="/Default/Environment/SettingsMigration/IsMigratorApplied/=JetBrains_002EReSharper_002EPsi_002ECSharp_002ECodeStyle_002ECSharpPlaceEmbeddedOnSameLineMigration/@EntryIndexedValue">True</s:Boolean>
|
||||||
<s:Boolean x:Key="/Default/Environment/SettingsMigration/IsMigratorApplied/=JetBrains_002EReSharper_002EPsi_002ECSharp_002ECodeStyle_002ESettingsUpgrade_002EAddAccessorOwnerDeclarationBracesMigration/@EntryIndexedValue">True</s:Boolean>
|
<s:Boolean x:Key="/Default/Environment/SettingsMigration/IsMigratorApplied/=JetBrains_002EReSharper_002EPsi_002ECSharp_002ECodeStyle_002ESettingsUpgrade_002EAddAccessorOwnerDeclarationBracesMigration/@EntryIndexedValue">True</s:Boolean>
|
||||||
<s:Boolean x:Key="/Default/Environment/SettingsMigration/IsMigratorApplied/=JetBrains_002EReSharper_002EPsi_002ECSharp_002ECodeStyle_002ESettingsUpgrade_002EMigrateBlankLinesAroundFieldToBlankLinesAroundProperty/@EntryIndexedValue">True</s:Boolean>
|
<s:Boolean x:Key="/Default/Environment/SettingsMigration/IsMigratorApplied/=JetBrains_002EReSharper_002EPsi_002ECSharp_002ECodeStyle_002ESettingsUpgrade_002EMigrateBlankLinesAroundFieldToBlankLinesAroundProperty/@EntryIndexedValue">True</s:Boolean>
|
||||||
<s:Boolean x:Key="/Default/Environment/SettingsMigration/IsMigratorApplied/=JetBrains_002EReSharper_002EPsi_002ECSharp_002ECodeStyle_002ESettingsUpgrade_002EMigrateThisQualifierSettings/@EntryIndexedValue">True</s:Boolean>
|
<s:Boolean x:Key="/Default/Environment/SettingsMigration/IsMigratorApplied/=JetBrains_002EReSharper_002EPsi_002ECSharp_002ECodeStyle_002ESettingsUpgrade_002EMigrateThisQualifierSettings/@EntryIndexedValue">True</s:Boolean>
|
||||||
|
<s:Boolean x:Key="/Default/Environment/SettingsMigration/IsMigratorApplied/=JetBrains_002EReSharper_002EPsi_002EFormat_002ESettingsUpgrade_002EAlignmentTabFillStyleMigration/@EntryIndexedValue">True</s:Boolean>
|
||||||
|
<s:Boolean x:Key="/Default/UserDictionary/Words/=allphone/@EntryIndexedValue">True</s:Boolean>
|
||||||
|
<s:Boolean x:Key="/Default/UserDictionary/Words/=cepstral/@EntryIndexedValue">True</s:Boolean>
|
||||||
|
<s:Boolean x:Key="/Default/UserDictionary/Words/=cmudict/@EntryIndexedValue">True</s:Boolean>
|
||||||
|
<s:Boolean x:Key="/Default/UserDictionary/Words/=pbeam/@EntryIndexedValue">True</s:Boolean>
|
||||||
|
<s:Boolean x:Key="/Default/UserDictionary/Words/=qwhy/@EntryIndexedValue">True</s:Boolean>
|
||||||
|
<s:Boolean x:Key="/Default/UserDictionary/Words/=Viterbi/@EntryIndexedValue">True</s:Boolean>
|
||||||
</wpf:ResourceDictionary>
|
</wpf:ResourceDictionary>
|
|
@ -1,6 +1,5 @@
|
||||||
#include "rhubarbLib.h"
|
#include "rhubarbLib.h"
|
||||||
#include "core/Phone.h"
|
#include "core/Phone.h"
|
||||||
#include "recognition/phoneRecognition.h"
|
|
||||||
#include "tools/textFiles.h"
|
#include "tools/textFiles.h"
|
||||||
#include "animation/mouthAnimation.h"
|
#include "animation/mouthAnimation.h"
|
||||||
#include "audio/audioFileReading.h"
|
#include "audio/audioFileReading.h"
|
||||||
|
@ -8,27 +7,29 @@
|
||||||
using boost::optional;
|
using boost::optional;
|
||||||
using std::string;
|
using std::string;
|
||||||
using boost::filesystem::path;
|
using boost::filesystem::path;
|
||||||
using std::unique_ptr;
|
|
||||||
|
|
||||||
JoiningContinuousTimeline<Shape> animateAudioClip(
|
JoiningContinuousTimeline<Shape> animateAudioClip(
|
||||||
const AudioClip& audioClip,
|
const AudioClip& audioClip,
|
||||||
optional<string> dialog,
|
const optional<string>& dialog,
|
||||||
|
const Recognizer& recognizer,
|
||||||
const ShapeSet& targetShapeSet,
|
const ShapeSet& targetShapeSet,
|
||||||
int maxThreadCount,
|
int maxThreadCount,
|
||||||
ProgressSink& progressSink)
|
ProgressSink& progressSink)
|
||||||
{
|
{
|
||||||
BoundedTimeline<Phone> phones = recognizePhones(audioClip, dialog, maxThreadCount, progressSink);
|
const BoundedTimeline<Phone> phones =
|
||||||
|
recognizer.recognizePhones(audioClip, dialog, maxThreadCount, progressSink);
|
||||||
JoiningContinuousTimeline<Shape> result = animate(phones, targetShapeSet);
|
JoiningContinuousTimeline<Shape> result = animate(phones, targetShapeSet);
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
JoiningContinuousTimeline<Shape> animateWaveFile(
|
JoiningContinuousTimeline<Shape> animateWaveFile(
|
||||||
path filePath,
|
path filePath,
|
||||||
optional<string> dialog,
|
const optional<string>& dialog,
|
||||||
|
const Recognizer& recognizer,
|
||||||
const ShapeSet& targetShapeSet,
|
const ShapeSet& targetShapeSet,
|
||||||
int maxThreadCount,
|
int maxThreadCount,
|
||||||
ProgressSink& progressSink)
|
ProgressSink& progressSink)
|
||||||
{
|
{
|
||||||
const auto audioClip = createAudioFileClip(filePath);
|
const auto audioClip = createAudioFileClip(filePath);
|
||||||
return animateAudioClip(*audioClip, dialog, targetShapeSet, maxThreadCount, progressSink);
|
return animateAudioClip(*audioClip, dialog, recognizer, targetShapeSet, maxThreadCount, progressSink);
|
||||||
}
|
}
|
||||||
|
|
|
@ -6,17 +6,20 @@
|
||||||
#include "tools/ProgressBar.h"
|
#include "tools/ProgressBar.h"
|
||||||
#include <boost/filesystem.hpp>
|
#include <boost/filesystem.hpp>
|
||||||
#include "animation/targetShapeSet.h"
|
#include "animation/targetShapeSet.h"
|
||||||
|
#include "recognition/Recognizer.h"
|
||||||
|
|
||||||
JoiningContinuousTimeline<Shape> animateAudioClip(
|
JoiningContinuousTimeline<Shape> animateAudioClip(
|
||||||
const AudioClip& audioClip,
|
const AudioClip& audioClip,
|
||||||
boost::optional<std::string> dialog,
|
const boost::optional<std::string>& dialog,
|
||||||
|
const Recognizer& recognizer,
|
||||||
const ShapeSet& targetShapeSet,
|
const ShapeSet& targetShapeSet,
|
||||||
int maxThreadCount,
|
int maxThreadCount,
|
||||||
ProgressSink& progressSink);
|
ProgressSink& progressSink);
|
||||||
|
|
||||||
JoiningContinuousTimeline<Shape> animateWaveFile(
|
JoiningContinuousTimeline<Shape> animateWaveFile(
|
||||||
boost::filesystem::path filePath,
|
boost::filesystem::path filePath,
|
||||||
boost::optional<std::string> dialog,
|
const boost::optional<std::string>& dialog,
|
||||||
|
const Recognizer& recognizer,
|
||||||
const ShapeSet& targetShapeSet,
|
const ShapeSet& targetShapeSet,
|
||||||
int maxThreadCount,
|
int maxThreadCount,
|
||||||
ProgressSink& progressSink);
|
ProgressSink& progressSink);
|
||||||
|
|
|
@ -0,0 +1,103 @@
|
||||||
|
#include "PhoneticRecognizer.h"
|
||||||
|
#include "time/Timeline.h"
|
||||||
|
#include "audio/AudioSegment.h"
|
||||||
|
#include "audio/SampleRateConverter.h"
|
||||||
|
#include "audio/processing.h"
|
||||||
|
#include "time/timedLogging.h"
|
||||||
|
|
||||||
|
using std::runtime_error;
|
||||||
|
using std::unique_ptr;
|
||||||
|
using std::string;
|
||||||
|
using boost::optional;
|
||||||
|
|
||||||
|
// Creates a PocketSphinx decoder set up with the acoustic model and the
// phonetic ("-allphone") language model found in the Sphinx model directory.
//
// dialog: accepted but not used by this function.
// Returns the owning decoder handle; its deleter calls ps_free.
// Throws std::runtime_error if the configuration or the decoder cannot be created.
static lambda_unique_ptr<ps_decoder_t> createDecoder(optional<std::string> dialog) {
	UNUSED(dialog);

	// Resolve the model locations up front so the argument list below stays readable
	const string acousticModelPath = (getSphinxModelDirectory() / "acoustic-model").string();
	const string phoneticModelPath = (getSphinxModelDirectory() / "en-us-phone.lm.bin").string();

	lambda_unique_ptr<cmd_ln_t> config(
		cmd_ln_init(
			nullptr, ps_args(), true,
			// Set acoustic model
			"-hmm", acousticModelPath.c_str(),
			// Set phonetic language model
			"-allphone", phoneticModelPath.c_str(),
			"-allphone_ci", "yes",
			// Set language model probability weight.
			// Low values (<= 0.4) can lead to fluttering animation.
			// High values (>= 1.0) can lead to imprecise or freezing animation.
			"-lw", "0.8",

			// The following settings are recommended at http://cmusphinx.sourceforge.net/wiki/phonemerecognition

			// Set beam width applied to every frame in Viterbi search
			"-beam", "1e-20",
			// Set beam width applied to phone transitions
			"-pbeam", "1e-20",
			nullptr),
		[](cmd_ln_t* rawConfig) { cmd_ln_free_r(rawConfig); });
	if (!config) {
		throw runtime_error("Error creating configuration.");
	}

	lambda_unique_ptr<ps_decoder_t> decoder(
		ps_init(config.get()),
		[](ps_decoder_t* rawDecoder) { ps_free(rawDecoder); });
	if (!decoder) {
		throw runtime_error("Error creating speech decoder.");
	}

	return decoder;
}
|
||||||
|
|
||||||
|
// Recognizes the phones spoken within one utterance of an audio clip.
//
// audioClip: the clip containing the utterance.
// utteranceTimeRange: the time range of the utterance within the clip.
// decoder: the PocketSphinx decoder used for recognition.
// utteranceProgressSink: receives a single progress report of 1.0 once the work is done.
// Returns the recognized phones, with guessed noise sounds marked as Phone::Noise.
static Timeline<Phone> utteranceToPhones(
	const AudioClip& audioClip,
	TimeRange utteranceTimeRange,
	ps_decoder_t& decoder,
	ProgressSink& utteranceProgressSink
) {
	// Pad the time range to give PocketSphinx some breathing room,
	// then trim it to the clip's truncated range
	const centiseconds padding(3);
	TimeRange paddedTimeRange = utteranceTimeRange;
	paddedTimeRange.grow(padding);
	paddedTimeRange.trim(audioClip.getTruncatedRange());

	// Cut out the padded segment and resample it to the rate used for Sphinx
	const unique_ptr<AudioClip> clipSegment =
		audioClip.clone() | segment(paddedTimeRange) | resample(sphinxSampleRate);
	const auto audioBuffer = copyTo16bitBuffer(*clipSegment);

	// Detect phones (the decoder returns them as words), then shift the
	// result by the start of the padded range
	BoundedTimeline<string> phoneStrings = recognizeWords(audioBuffer, decoder);
	phoneStrings.shift(paddedTimeRange.getStart());

	Timeline<Phone> utterancePhones;
	for (const auto& timedPhoneString : phoneStrings) {
		const Phone parsedPhone = PhoneConverter::get().parse(timedPhoneString.getValue());
		// Heuristic: < 6_cs is schwa. PocketSphinx doesn't differentiate.
		const bool isSchwa = parsedPhone == Phone::AH && timedPhoneString.getDuration() < 6_cs;
		utterancePhones.set(timedPhoneString.getTimeRange(), isSchwa ? Phone::Schwa : parsedPhone);
	}

	// Log the phones as recognized, before noise sounds are filled in
	for (const auto& rawPhone : utterancePhones) {
		logTimedEvent("rawPhone", rawPhone);
	}

	// Guess positions of noise sounds and mark them in the timeline
	JoiningTimeline<void> noiseSounds = getNoiseSounds(utteranceTimeRange, utterancePhones);
	for (const auto& noiseSound : noiseSounds) {
		utterancePhones.set(noiseSound.getTimeRange(), Phone::Noise);
	}

	// Log the final phones
	for (const auto& finalPhone : utterancePhones) {
		logTimedEvent("phone", finalPhone);
	}

	utteranceProgressSink.reportProgress(1.0);

	return utterancePhones;
}
|
||||||
|
|
||||||
|
// Recognizes the phones in the given audio clip by forwarding to the free
// function ::recognizePhones, supplying this file's decoder factory
// (createDecoder) and per-utterance recognition routine (utteranceToPhones).
BoundedTimeline<Phone> PhoneticRecognizer::recognizePhones(
	const AudioClip& inputAudioClip,
	optional<std::string> dialog,
	int maxThreadCount,
	ProgressSink& progressSink
) const {
	return ::recognizePhones(
		inputAudioClip,
		dialog,
		&createDecoder,
		&utteranceToPhones,
		maxThreadCount,
		progressSink);
}
|
|
@ -0,0 +1,14 @@
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include "Recognizer.h"
|
||||||
|
#include "pocketSphinxTools.h"
|
||||||
|
|
||||||
|
class PhoneticRecognizer : public Recognizer {
|
||||||
|
public:
|
||||||
|
BoundedTimeline<Phone> recognizePhones(
|
||||||
|
const AudioClip& inputAudioClip,
|
||||||
|
boost::optional<std::string> dialog,
|
||||||
|
int maxThreadCount,
|
||||||
|
ProgressSink& progressSink
|
||||||
|
) const override;
|
||||||
|
};
|
|
@ -1,143 +1,133 @@
|
||||||
#include <boost/filesystem.hpp>
|
#include "PocketSphinxRecognizer.h"
|
||||||
#include "phoneRecognition.h"
|
|
||||||
#include "audio/SampleRateConverter.h"
|
|
||||||
#include "tools/platformTools.h"
|
|
||||||
#include "tools/tools.h"
|
|
||||||
#include <format.h>
|
|
||||||
#include <s3types.h>
|
|
||||||
#include <regex>
|
#include <regex>
|
||||||
#include <gsl_util.h>
|
#include <gsl_util.h>
|
||||||
#include "logging/logging.h"
|
|
||||||
#include "audio/DcOffset.h"
|
|
||||||
#include "time/Timeline.h"
|
|
||||||
#include "audio/voiceActivityDetection.h"
|
|
||||||
#include "audio/AudioSegment.h"
|
#include "audio/AudioSegment.h"
|
||||||
|
#include "audio/SampleRateConverter.h"
|
||||||
#include "languageModels.h"
|
#include "languageModels.h"
|
||||||
#include "tokenization.h"
|
#include "tokenization.h"
|
||||||
#include "g2p.h"
|
#include "g2p.h"
|
||||||
#include "time/ContinuousTimeline.h"
|
#include "time/ContinuousTimeline.h"
|
||||||
#include "audio/processing.h"
|
#include "audio/processing.h"
|
||||||
#include "tools/parallel.h"
|
|
||||||
#include <boost/version.hpp>
|
|
||||||
#include "tools/ObjectPool.h"
|
|
||||||
#include "time/timedLogging.h"
|
#include "time/timedLogging.h"
|
||||||
|
|
||||||
extern "C" {
|
extern "C" {
|
||||||
#include <pocketsphinx.h>
|
|
||||||
#include <sphinxbase/err.h>
|
|
||||||
#include <ps_alignment.h>
|
|
||||||
#include <state_align_search.h>
|
#include <state_align_search.h>
|
||||||
#include <pocketsphinx_internal.h>
|
|
||||||
#include <ngram_search.h>
|
|
||||||
}
|
}
|
||||||
|
|
||||||
using std::runtime_error;
|
using std::runtime_error;
|
||||||
using std::invalid_argument;
|
using std::invalid_argument;
|
||||||
using std::unique_ptr;
|
using std::unique_ptr;
|
||||||
using std::shared_ptr;
|
|
||||||
using std::string;
|
using std::string;
|
||||||
using std::vector;
|
using std::vector;
|
||||||
using std::map;
|
using std::map;
|
||||||
using boost::filesystem::path;
|
using boost::filesystem::path;
|
||||||
using std::function;
|
|
||||||
using std::regex;
|
using std::regex;
|
||||||
using std::regex_replace;
|
using std::regex_replace;
|
||||||
using std::chrono::duration;
|
|
||||||
using boost::optional;
|
using boost::optional;
|
||||||
using std::string;
|
|
||||||
using std::chrono::duration_cast;
|
|
||||||
using std::array;
|
using std::array;
|
||||||
|
|
||||||
constexpr int sphinxSampleRate = 16000;
|
bool dictionaryContains(dict_t& dictionary, const string& word) {
|
||||||
|
return dict_wordid(&dictionary, word.c_str()) != BAD_S3WID;
|
||||||
const path& getSphinxModelDirectory() {
|
|
||||||
static path sphinxModelDirectory(getBinDirectory() / "res" / "sphinx");
|
|
||||||
return sphinxModelDirectory;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
logging::Level ConvertSphinxErrorLevel(err_lvl_t errorLevel) {
|
s3wid_t getWordId(const string& word, dict_t& dictionary) {
|
||||||
switch (errorLevel) {
|
const s3wid_t wordId = dict_wordid(&dictionary, word.c_str());
|
||||||
case ERR_DEBUG:
|
if (wordId == BAD_S3WID) throw invalid_argument(fmt::format("Unknown word '{}'.", word));
|
||||||
case ERR_INFO:
|
return wordId;
|
||||||
case ERR_INFOCONT:
|
}
|
||||||
return logging::Level::Trace;
|
|
||||||
case ERR_WARN:
|
void addMissingDictionaryWords(const vector<string>& words, ps_decoder_t& decoder) {
|
||||||
return logging::Level::Warn;
|
map<string, string> missingPronunciations;
|
||||||
case ERR_ERROR:
|
for (const string& word : words) {
|
||||||
return logging::Level::Error;
|
if (!dictionaryContains(*decoder.dict, word)) {
|
||||||
case ERR_FATAL:
|
string pronunciation;
|
||||||
return logging::Level::Fatal;
|
for (Phone phone : wordToPhones(word)) {
|
||||||
default:
|
if (pronunciation.length() > 0) pronunciation += " ";
|
||||||
throw invalid_argument("Unknown log level.");
|
pronunciation += PhoneConverter::get().toString(phone);
|
||||||
|
}
|
||||||
|
missingPronunciations[word] = pronunciation;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for (auto it = missingPronunciations.begin(); it != missingPronunciations.end(); ++it) {
|
||||||
|
const bool isLast = it == --missingPronunciations.end();
|
||||||
|
logging::infoFormat("Unknown word '{}'. Guessing pronunciation '{}'.", it->first, it->second);
|
||||||
|
ps_add_word(&decoder, it->first.c_str(), it->second.c_str(), isLast);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void sphinxLogCallback(void* user_data, err_lvl_t errorLevel, const char* format, ...) {
|
lambda_unique_ptr<ngram_model_t> createDefaultLanguageModel(ps_decoder_t& decoder) {
|
||||||
UNUSED(user_data);
|
path modelPath = getSphinxModelDirectory() / "en-us.lm.bin";
|
||||||
|
lambda_unique_ptr<ngram_model_t> result(
|
||||||
// Create varArgs list
|
ngram_model_read(decoder.config, modelPath.string().c_str(), NGRAM_AUTO, decoder.lmath),
|
||||||
va_list args;
|
[](ngram_model_t* lm) { ngram_model_free(lm); });
|
||||||
va_start(args, format);
|
if (!result) {
|
||||||
auto _ = gsl::finally([&args]() { va_end(args); });
|
throw runtime_error(fmt::format("Error reading language model from {}.", modelPath));
|
||||||
|
|
||||||
// Format message
|
|
||||||
const int initialSize = 256;
|
|
||||||
vector<char> chars(initialSize);
|
|
||||||
bool success = false;
|
|
||||||
while (!success) {
|
|
||||||
int charsWritten = vsnprintf(chars.data(), chars.size(), format, args);
|
|
||||||
if (charsWritten < 0) throw runtime_error("Error formatting Pocketsphinx log message.");
|
|
||||||
|
|
||||||
success = charsWritten < static_cast<int>(chars.size());
|
|
||||||
if (!success) chars.resize(chars.size() * 2);
|
|
||||||
}
|
|
||||||
regex waste("^(DEBUG|INFO|INFOCONT|WARN|ERROR|FATAL): ");
|
|
||||||
string message = regex_replace(chars.data(), waste, "", std::regex_constants::format_first_only);
|
|
||||||
boost::algorithm::trim(message);
|
|
||||||
|
|
||||||
logging::Level logLevel = ConvertSphinxErrorLevel(errorLevel);
|
|
||||||
logging::log(logLevel, message);
|
|
||||||
}
|
|
||||||
|
|
||||||
BoundedTimeline<string> recognizeWords(const vector<int16_t>& audioBuffer, ps_decoder_t& decoder) {
|
|
||||||
// Restart timing at 0
|
|
||||||
ps_start_stream(&decoder);
|
|
||||||
|
|
||||||
// Start recognition
|
|
||||||
int error = ps_start_utt(&decoder);
|
|
||||||
if (error) throw runtime_error("Error starting utterance processing for word recognition.");
|
|
||||||
|
|
||||||
// Process entire audio clip
|
|
||||||
const bool noRecognition = false;
|
|
||||||
const bool fullUtterance = true;
|
|
||||||
int searchedFrameCount = ps_process_raw(&decoder, audioBuffer.data(), audioBuffer.size(), noRecognition, fullUtterance);
|
|
||||||
if (searchedFrameCount < 0) throw runtime_error("Error analyzing raw audio data for word recognition.");
|
|
||||||
|
|
||||||
// End recognition
|
|
||||||
error = ps_end_utt(&decoder);
|
|
||||||
if (error) throw runtime_error("Error ending utterance processing for word recognition.");
|
|
||||||
|
|
||||||
BoundedTimeline<string> result(TimeRange(0_cs, centiseconds(100 * audioBuffer.size() / sphinxSampleRate)));
|
|
||||||
bool noWordsRecognized = reinterpret_cast<ngram_search_t*>(decoder.search)->bpidx == 0;
|
|
||||||
if (noWordsRecognized) {
|
|
||||||
return result;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Collect words
|
|
||||||
for (ps_seg_t* it = ps_seg_iter(&decoder); it; it = ps_seg_next(it)) {
|
|
||||||
const char* word = ps_seg_word(it);
|
|
||||||
int firstFrame, lastFrame;
|
|
||||||
ps_seg_frames(it, &firstFrame, &lastFrame);
|
|
||||||
result.set(centiseconds(firstFrame), centiseconds(lastFrame + 1), word);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
s3wid_t getWordId(const string& word, dict_t& dictionary) {
|
lambda_unique_ptr<ngram_model_t> createDialogLanguageModel(ps_decoder_t& decoder, const string& dialog) {
|
||||||
s3wid_t wordId = dict_wordid(&dictionary, word.c_str());
|
// Split dialog into normalized words
|
||||||
if (wordId == BAD_S3WID) throw invalid_argument(fmt::format("Unknown word '{}'.", word));
|
vector<string> words = tokenizeText(dialog, [&](const string& word) { return dictionaryContains(*decoder.dict, word); });
|
||||||
return wordId;
|
|
||||||
|
// Add dialog-specific words to the dictionary
|
||||||
|
addMissingDictionaryWords(words, decoder);
|
||||||
|
|
||||||
|
// Create dialog-specific language model
|
||||||
|
words.insert(words.begin(), "<s>");
|
||||||
|
words.emplace_back("</s>");
|
||||||
|
return createLanguageModel(words, decoder);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Creates a language model set that combines the default language model with a
// model built from the given dialog, weighted 0.1 (default) to 0.9 (dialog).
//
// decoder: the decoder passed to the default and dialog model factories.
// dialog: the dialog text the dialog-specific model is built from.
// Returns the owning handle to the combined model; its deleter calls ngram_model_free.
// Throws std::runtime_error if the combined model cannot be created.
lambda_unique_ptr<ngram_model_t> createBiasedLanguageModel(ps_decoder_t& decoder, const string& dialog) {
	// Keep this order: the default model is created before the dialog model
	auto defaultModel = createDefaultLanguageModel(decoder);
	auto dialogModel = createDialogLanguageModel(decoder, dialog);

	constexpr int modelCount = 2;
	array<ngram_model_t*, modelCount> models{ defaultModel.get(), dialogModel.get() };
	array<const char*, modelCount> names{ "defaultLM", "dialogLM" };
	array<float, modelCount> weights{ 0.1f, 0.9f };

	// The C API takes the names as char**, hence the const_cast
	lambda_unique_ptr<ngram_model_t> biasedModel(
		ngram_model_set_init(
			nullptr,
			models.data(),
			const_cast<char**>(names.data()),
			weights.data(),
			modelCount),
		[](ngram_model_t* rawModel) { ngram_model_free(rawModel); });
	if (!biasedModel) {
		throw runtime_error("Error creating biased language model.");
	}

	return biasedModel;
}
|
||||||
|
|
||||||
|
// Creates a PocketSphinx decoder that uses the bundled acoustic model and dictionary.
// If a dialog text is given, a language model biased towards that text is used;
// otherwise, the default language model is used.
// Throws runtime_error if the configuration or decoder cannot be created or if the
// language model cannot be installed as the active search.
static lambda_unique_ptr<ps_decoder_t> createDecoder(optional<std::string> dialog) {
	lambda_unique_ptr<cmd_ln_t> config(
		cmd_ln_init(
			nullptr, ps_args(), true,
			// Set acoustic model
			"-hmm", (getSphinxModelDirectory() / "acoustic-model").string().c_str(),
			// Set pronunciation dictionary
			"-dict", (getSphinxModelDirectory() / "cmudict-en-us.dict").string().c_str(),
			// Add noise against zero silence (see http://cmusphinx.sourceforge.net/wiki/faq#qwhy_my_accuracy_is_poor)
			"-dither", "yes",
			// Disable VAD -- we're doing that ourselves
			"-remove_silence", "no",
			// Perform per-utterance cepstral mean normalization
			"-cmn", "batch",
			nullptr),
		[](cmd_ln_t* config) { cmd_ln_free_r(config); });
	if (!config) throw runtime_error("Error creating configuration.");

	lambda_unique_ptr<ps_decoder_t> decoder(
		ps_init(config.get()),
		[](ps_decoder_t* recognizer) { ps_free(recognizer); });
	if (!decoder) throw runtime_error("Error creating speech decoder.");

	// Set language model.
	// NOTE(review): languageModel is freed when this function returns; presumably the decoder
	// keeps its own reference after ps_set_lm -- confirm against the PocketSphinx version in use.
	lambda_unique_ptr<ngram_model_t> languageModel(dialog
		? createBiasedLanguageModel(*decoder, *dialog)
		: createDefaultLanguageModel(*decoder));

	// Both calls signal failure through a non-zero return value. Fail loudly instead of
	// handing out a decoder without a usable search.
	if (ps_set_lm(decoder.get(), "lm", languageModel.get()) != 0) {
		throw runtime_error("Error setting language model.");
	}
	if (ps_set_search(decoder.get(), "lm") != 0) {
		throw runtime_error("Error activating language model search.");
	}

	return decoder;
}
|
||||||
|
|
||||||
optional<Timeline<Phone>> getPhoneAlignment(
|
optional<Timeline<Phone>> getPhoneAlignment(
|
||||||
|
@ -178,7 +168,7 @@ optional<Timeline<Phone>> getPhoneAlignment(
|
||||||
// Process entire audio clip
|
// Process entire audio clip
|
||||||
const int16* nextSample = audioBuffer.data();
|
const int16* nextSample = audioBuffer.data();
|
||||||
size_t remainingSamples = audioBuffer.size();
|
size_t remainingSamples = audioBuffer.size();
|
||||||
bool fullUtterance = true;
|
const bool fullUtterance = true;
|
||||||
while (acmod_process_raw(acousticModel, &nextSample, &remainingSamples, fullUtterance) > 0) {
|
while (acmod_process_raw(acousticModel, &nextSample, &remainingSamples, fullUtterance) > 0) {
|
||||||
while (acousticModel->n_feat_frame > 0) {
|
while (acousticModel->n_feat_frame > 0) {
|
||||||
ps_search_step(search.get(), acousticModel->output_frame);
|
ps_search_step(search.get(), acousticModel->output_frame);
|
||||||
|
@ -197,7 +187,7 @@ optional<Timeline<Phone>> getPhoneAlignment(
|
||||||
for (ps_alignment_iter_t* it = ps_alignment_phones(alignment.get()); it; it = ps_alignment_iter_next(it)) {
|
for (ps_alignment_iter_t* it = ps_alignment_phones(alignment.get()); it; it = ps_alignment_iter_next(it)) {
|
||||||
// Get phone
|
// Get phone
|
||||||
ps_alignment_entry_t* phoneEntry = ps_alignment_iter_get(it);
|
ps_alignment_entry_t* phoneEntry = ps_alignment_iter_get(it);
|
||||||
s3cipid_t phoneId = phoneEntry->id.pid.cipid;
|
const s3cipid_t phoneId = phoneEntry->id.pid.cipid;
|
||||||
string phoneName = phoneNames[phoneId];
|
string phoneName = phoneNames[phoneId];
|
||||||
|
|
||||||
if (phoneName == "SIL") continue;
|
if (phoneName == "SIL") continue;
|
||||||
|
@ -207,162 +197,42 @@ optional<Timeline<Phone>> getPhoneAlignment(
|
||||||
centiseconds duration(phoneEntry->duration);
|
centiseconds duration(phoneEntry->duration);
|
||||||
Phone phone = PhoneConverter::get().parse(phoneName);
|
Phone phone = PhoneConverter::get().parse(phoneName);
|
||||||
if (phone == Phone::AH && duration < 6_cs) {
|
if (phone == Phone::AH && duration < 6_cs) {
|
||||||
// Heuristic: < 6_cs is schwa. Pocketsphinx doesn't differentiate.
|
// Heuristic: < 6_cs is schwa. PocketSphinx doesn't differentiate.
|
||||||
phone = Phone::Schwa;
|
phone = Phone::Schwa;
|
||||||
}
|
}
|
||||||
Timed<Phone> timedPhone(start, start + duration, phone);
|
const Timed<Phone> timedPhone(start, start + duration, phone);
|
||||||
result.set(timedPhone);
|
result.set(timedPhone);
|
||||||
}
|
}
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Returns true if the given word has an entry in the specified dictionary.
bool dictionaryContains(dict_t& dictionary, const string& word) {
	const s3wid_t wordId = dict_wordid(&dictionary, word.c_str());
	return wordId != BAD_S3WID;
}
|
|
||||||
|
|
||||||
void addMissingDictionaryWords(const vector<string>& words, ps_decoder_t& decoder) {
|
|
||||||
map<string, string> missingPronunciations;
|
|
||||||
for (const string& word : words) {
|
|
||||||
if (!dictionaryContains(*decoder.dict, word)) {
|
|
||||||
string pronunciation;
|
|
||||||
for (Phone phone : wordToPhones(word)) {
|
|
||||||
if (pronunciation.length() > 0) pronunciation += " ";
|
|
||||||
pronunciation += PhoneConverter::get().toString(phone);
|
|
||||||
}
|
|
||||||
missingPronunciations[word] = pronunciation;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
for (auto it = missingPronunciations.begin(); it != missingPronunciations.end(); ++it) {
|
|
||||||
bool isLast = it == --missingPronunciations.end();
|
|
||||||
logging::infoFormat("Unknown word '{}'. Guessing pronunciation '{}'.", it->first, it->second);
|
|
||||||
ps_add_word(&decoder, it->first.c_str(), it->second.c_str(), isLast);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Reads the default language model ("en-us.lm.bin") from the Sphinx model directory.
// Throws runtime_error if the model cannot be read.
lambda_unique_ptr<ngram_model_t> createDefaultLanguageModel(ps_decoder_t& decoder) {
	path modelPath = getSphinxModelDirectory() / "en-us.lm.bin";
	lambda_unique_ptr<ngram_model_t> result(
		ngram_model_read(decoder.config, modelPath.string().c_str(), NGRAM_AUTO, decoder.lmath),
		[](ngram_model_t* lm) { ngram_model_free(lm); });
	if (!result) {
		throw runtime_error(fmt::format("Error reading language model from {}.", modelPath));
	}

	// Return the local by name: an explicit std::move would only prevent copy elision
	return result;
}
|
|
||||||
|
|
||||||
// Creates a language model from the words of the given dialog text.
// Words unknown to the decoder's dictionary are added to it along the way.
lambda_unique_ptr<ngram_model_t> createDialogLanguageModel(ps_decoder_t& decoder, const string& dialog) {
	// Split dialog into normalized words
	const auto isKnownWord = [&](const string& word) { return dictionaryContains(*decoder.dict, word); };
	vector<string> words = tokenizeText(dialog, isKnownWord);

	// Add dialog-specific words to the dictionary
	addMissingDictionaryWords(words, decoder);

	// Create dialog-specific language model from the words, enclosed in <s> and </s> markers
	words.insert(words.begin(), "<s>");
	words.emplace_back("</s>");
	return createLanguageModel(words, decoder);
}
|
|
||||||
|
|
||||||
// Builds a language model set that combines the default language model (weight 0.1)
// with a model created from the given dialog text (weight 0.9).
// Throws runtime_error if the model set cannot be created.
lambda_unique_ptr<ngram_model_t> createBiasedLanguageModel(ps_decoder_t& decoder, const string& dialog) {
	auto defaultLanguageModel = createDefaultLanguageModel(decoder);
	auto dialogLanguageModel = createDialogLanguageModel(decoder, dialog);
	constexpr int modelCount = 2;
	array<ngram_model_t*, modelCount> languageModels{ defaultLanguageModel.get(), dialogLanguageModel.get() };
	// String literals are const; store them as const char* and cast only at the C API boundary
	array<const char*, modelCount> modelNames{ "defaultLM", "dialogLM" };
	array<float, modelCount> modelWeights{ 0.1f, 0.9f };
	lambda_unique_ptr<ngram_model_t> result(
		ngram_model_set_init(nullptr, languageModels.data(), const_cast<char**>(modelNames.data()), modelWeights.data(), modelCount),
		[](ngram_model_t* lm) { ngram_model_free(lm); });
	if (!result) {
		throw runtime_error("Error creating biased language model.");
	}

	// Return the local by name: an explicit std::move would only prevent copy elision
	return result;
}
|
|
||||||
|
|
||||||
// Creates a PocketSphinx decoder that uses the bundled acoustic model and dictionary.
// If a dialog text is given, a language model biased towards that text is used;
// otherwise, the default language model is used.
// Throws runtime_error if the configuration or decoder cannot be created or if the
// language model cannot be installed as the active search.
lambda_unique_ptr<ps_decoder_t> createDecoder(optional<string> dialog) {
	lambda_unique_ptr<cmd_ln_t> config(
		cmd_ln_init(
			nullptr, ps_args(), true,
			// Set acoustic model
			"-hmm", (getSphinxModelDirectory() / "acoustic-model").string().c_str(),
			// Set pronunciation dictionary
			"-dict", (getSphinxModelDirectory() / "cmudict-en-us.dict").string().c_str(),
			// Add noise against zero silence (see http://cmusphinx.sourceforge.net/wiki/faq#qwhy_my_accuracy_is_poor)
			"-dither", "yes",
			// Disable VAD -- we're doing that ourselves
			"-remove_silence", "no",
			// Perform per-utterance cepstral mean normalization
			"-cmn", "batch",
			nullptr),
		[](cmd_ln_t* config) { cmd_ln_free_r(config); });
	if (!config) throw runtime_error("Error creating configuration.");

	lambda_unique_ptr<ps_decoder_t> decoder(
		ps_init(config.get()),
		[](ps_decoder_t* recognizer) { ps_free(recognizer); });
	if (!decoder) throw runtime_error("Error creating speech decoder.");

	// Set language model
	lambda_unique_ptr<ngram_model_t> languageModel(dialog
		? createBiasedLanguageModel(*decoder, *dialog)
		: createDefaultLanguageModel(*decoder));

	// Both calls signal failure through a non-zero return value. Fail loudly instead of
	// handing out a decoder without a usable search.
	if (ps_set_lm(decoder.get(), "lm", languageModel.get()) != 0) {
		throw runtime_error("Error setting language model.");
	}
	if (ps_set_search(decoder.get(), "lm") != 0) {
		throw runtime_error("Error activating language model search.");
	}

	return decoder;
}
|
|
||||||
|
|
||||||
// Returns those parts of the utterance time range that are not covered by any phone,
// excluding parts that start at time zero and parts shorter than 12 cs.
JoiningTimeline<void> getNoiseSounds(TimeRange utteranceTimeRange, const Timeline<Phone>& phones) {
	// Find utterance parts without recognized phones:
	// start with the whole utterance, then cut out every phone
	JoiningTimeline<void> noiseSounds;
	noiseSounds.set(utteranceTimeRange);
	for (const auto& timedPhone : phones) {
		noiseSounds.clear(timedPhone.getTimeRange());
	}

	// Remove undesired elements.
	// Iterate over a snapshot, since noiseSounds is modified within the loop.
	const centiseconds minSoundDuration = 12_cs;
	JoiningTimeline<void> snapshot(noiseSounds);
	for (const auto& candidate : snapshot) {
		const bool keep = candidate.getStart() != 0_cs && candidate.getDuration() >= minSoundDuration;
		if (keep) continue;

		noiseSounds.clear(candidate.getTimeRange());
	}

	return noiseSounds;
}
|
|
||||||
|
|
||||||
// Some words have multiple pronunciations, one of which results in better animation than the others.
|
// Some words have multiple pronunciations, one of which results in better animation than the others.
|
||||||
// This function returns the optimal pronunciation for a select set of these words.
|
// This function returns the optimal pronunciation for a select set of these words.
|
||||||
// Maps a fixed set of pronunciation variants (such as "to(2)") to the preferred variant of the same word.
// Any other word is returned unchanged.
string fixPronunciation(const string& word) {
	const static map<string, string> preferredVariants{
		{ "into(2)", "into" },
		{ "to(2)", "to" },
		{ "to(3)", "to" },
		{ "today(2)", "today" },
		{ "tomorrow(2)", "tomorrow" },
		{ "tonight(2)", "tonight" }
	};

	const auto match = preferredVariants.find(word);
	if (match == preferredVariants.end()) {
		return word;
	}
	return match->second;
}
|
||||||
|
|
||||||
Timeline<Phone> utteranceToPhones(
|
static Timeline<Phone> utteranceToPhones(
|
||||||
const AudioClip& audioClip,
|
const AudioClip& audioClip,
|
||||||
TimeRange utteranceTimeRange,
|
TimeRange utteranceTimeRange,
|
||||||
ps_decoder_t& decoder,
|
ps_decoder_t& decoder,
|
||||||
ProgressSink& utteranceProgressSink)
|
ProgressSink& utteranceProgressSink
|
||||||
{
|
) {
|
||||||
ProgressMerger utteranceProgressMerger(utteranceProgressSink);
|
ProgressMerger utteranceProgressMerger(utteranceProgressSink);
|
||||||
ProgressSink& wordRecognitionProgressSink = utteranceProgressMerger.addSink(1.0);
|
ProgressSink& wordRecognitionProgressSink = utteranceProgressMerger.addSink(1.0);
|
||||||
ProgressSink& alignmentProgressSink = utteranceProgressMerger.addSink(0.5);
|
ProgressSink& alignmentProgressSink = utteranceProgressMerger.addSink(0.5);
|
||||||
|
|
||||||
// Pad time range to give Pocketsphinx some breathing room
|
// Pad time range to give PocketSphinx some breathing room
|
||||||
TimeRange paddedTimeRange = utteranceTimeRange;
|
TimeRange paddedTimeRange = utteranceTimeRange;
|
||||||
const centiseconds padding(3);
|
const centiseconds padding(3);
|
||||||
paddedTimeRange.grow(padding);
|
paddedTimeRange.grow(padding);
|
||||||
|
@ -384,7 +254,7 @@ Timeline<Phone> utteranceToPhones(
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
word = regex_replace(word, regex("\\(\\d\\)"), "");
|
word = regex_replace(word, regex("\\(\\d\\)"), "");
|
||||||
if (text.size() > 0) {
|
if (!text.empty()) {
|
||||||
text += " ";
|
text += " ";
|
||||||
}
|
}
|
||||||
text += word;
|
text += word;
|
||||||
|
@ -403,7 +273,7 @@ Timeline<Phone> utteranceToPhones(
|
||||||
const string fixedWord = fixPronunciation(timedWord.getValue());
|
const string fixedWord = fixPronunciation(timedWord.getValue());
|
||||||
wordIds.push_back(getWordId(fixedWord, *decoder.dict));
|
wordIds.push_back(getWordId(fixedWord, *decoder.dict));
|
||||||
}
|
}
|
||||||
if (wordIds.empty()) return {};
|
if (wordIds.empty()) return{};
|
||||||
|
|
||||||
// Align the words' phones with speech
|
// Align the words' phones with speech
|
||||||
#if BOOST_VERSION < 105600 // Support legacy syntax
|
#if BOOST_VERSION < 105600 // Support legacy syntax
|
||||||
|
@ -433,77 +303,11 @@ Timeline<Phone> utteranceToPhones(
|
||||||
return utterancePhones;
|
return utterancePhones;
|
||||||
}
|
}
|
||||||
|
|
||||||
BoundedTimeline<Phone> recognizePhones(
|
BoundedTimeline<Phone> PocketSphinxRecognizer::recognizePhones(
|
||||||
const AudioClip& inputAudioClip,
|
const AudioClip& inputAudioClip,
|
||||||
optional<string> dialog,
|
optional<std::string> dialog,
|
||||||
int maxThreadCount,
|
int maxThreadCount,
|
||||||
ProgressSink& progressSink)
|
ProgressSink& progressSink
|
||||||
{
|
) const {
|
||||||
ProgressMerger totalProgressMerger(progressSink);
|
return ::recognizePhones(inputAudioClip, dialog, &createDecoder, &utteranceToPhones, maxThreadCount, progressSink);
|
||||||
ProgressSink& voiceActivationProgressSink = totalProgressMerger.addSink(1.0);
|
|
||||||
ProgressSink& dialogProgressSink = totalProgressMerger.addSink(15);
|
|
||||||
|
|
||||||
// Make sure audio stream has no DC offset
|
|
||||||
const unique_ptr<AudioClip> audioClip = inputAudioClip.clone() | removeDcOffset();
|
|
||||||
|
|
||||||
// Split audio into utterances
|
|
||||||
JoiningBoundedTimeline<void> utterances;
|
|
||||||
try {
|
|
||||||
utterances = detectVoiceActivity(*audioClip, maxThreadCount, voiceActivationProgressSink);
|
|
||||||
}
|
|
||||||
catch (...) {
|
|
||||||
std::throw_with_nested(runtime_error("Error detecting segments of speech."));
|
|
||||||
}
|
|
||||||
|
|
||||||
// Discard Pocketsphinx output
|
|
||||||
err_set_logfp(nullptr);
|
|
||||||
|
|
||||||
// Redirect Pocketsphinx output to log
|
|
||||||
err_set_callback(sphinxLogCallback, nullptr);
|
|
||||||
|
|
||||||
// Prepare pool of decoders
|
|
||||||
ObjectPool<ps_decoder_t, lambda_unique_ptr<ps_decoder_t>> decoderPool(
|
|
||||||
[&dialog] { return createDecoder(dialog); });
|
|
||||||
|
|
||||||
BoundedTimeline<Phone> phones(audioClip->getTruncatedRange());
|
|
||||||
std::mutex resultMutex;
|
|
||||||
auto processUtterance = [&](Timed<void> timedUtterance, ProgressSink& utteranceProgressSink) {
|
|
||||||
// Detect phones for utterance
|
|
||||||
auto decoder = decoderPool.acquire();
|
|
||||||
Timeline<Phone> utterancePhones =
|
|
||||||
utteranceToPhones(*audioClip, timedUtterance.getTimeRange(), *decoder, utteranceProgressSink);
|
|
||||||
|
|
||||||
// Copy phones to result timeline
|
|
||||||
std::lock_guard<std::mutex> lock(resultMutex);
|
|
||||||
for (const auto& timedPhone : utterancePhones) {
|
|
||||||
phones.set(timedPhone);
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
auto getUtteranceProgressWeight = [](const Timed<void> timedUtterance) {
|
|
||||||
return timedUtterance.getDuration().count();
|
|
||||||
};
|
|
||||||
|
|
||||||
// Perform speech recognition
|
|
||||||
try {
|
|
||||||
// Determine how many parallel threads to use
|
|
||||||
int threadCount = std::min({
|
|
||||||
maxThreadCount,
|
|
||||||
// Don't use more threads than there are utterances to be processed
|
|
||||||
static_cast<int>(utterances.size()),
|
|
||||||
// Don't waste time creating additional threads (and decoders!) if the recording is short
|
|
||||||
static_cast<int>(duration_cast<std::chrono::seconds>(audioClip->getTruncatedRange().getDuration()).count() / 5)
|
|
||||||
});
|
|
||||||
if (threadCount < 1) {
|
|
||||||
threadCount = 1;
|
|
||||||
}
|
|
||||||
logging::debugFormat("Speech recognition using {} threads -- start", threadCount);
|
|
||||||
runParallel(processUtterance, utterances, threadCount, dialogProgressSink, getUtteranceProgressWeight);
|
|
||||||
logging::debug("Speech recognition -- end");
|
|
||||||
}
|
|
||||||
catch (...) {
|
|
||||||
std::throw_with_nested(runtime_error("Error performing speech recognition via Pocketsphinx."));
|
|
||||||
}
|
|
||||||
|
|
||||||
return phones;
|
|
||||||
}
|
}
|
|
@ -0,0 +1,14 @@
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include "Recognizer.h"
|
||||||
|
#include "pocketSphinxTools.h"
|
||||||
|
|
||||||
|
class PocketSphinxRecognizer : public Recognizer {
|
||||||
|
public:
|
||||||
|
BoundedTimeline<Phone> recognizePhones(
|
||||||
|
const AudioClip& inputAudioClip,
|
||||||
|
boost::optional<std::string> dialog,
|
||||||
|
int maxThreadCount,
|
||||||
|
ProgressSink& progressSink
|
||||||
|
) const override;
|
||||||
|
};
|
|
@ -0,0 +1,18 @@
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include "audio/AudioClip.h"
|
||||||
|
#include "core/Phone.h"
|
||||||
|
#include "tools/ProgressBar.h"
|
||||||
|
#include "time/BoundedTimeline.h"
|
||||||
|
|
||||||
|
class Recognizer {
|
||||||
|
public:
|
||||||
|
virtual ~Recognizer() = default;
|
||||||
|
|
||||||
|
virtual BoundedTimeline<Phone>recognizePhones(
|
||||||
|
const AudioClip& audioClip,
|
||||||
|
boost::optional<std::string> dialog,
|
||||||
|
int maxThreadCount,
|
||||||
|
ProgressSink& progressSink
|
||||||
|
) const = 0;
|
||||||
|
};
|
|
@ -1,12 +0,0 @@
|
||||||
#pragma once
|
|
||||||
|
|
||||||
#include "audio/AudioClip.h"
|
|
||||||
#include "core/Phone.h"
|
|
||||||
#include "tools/ProgressBar.h"
|
|
||||||
#include "time/BoundedTimeline.h"
|
|
||||||
|
|
||||||
BoundedTimeline<Phone> recognizePhones(
|
|
||||||
const AudioClip& audioClip,
|
|
||||||
boost::optional<std::string> dialog,
|
|
||||||
int maxThreadCount,
|
|
||||||
ProgressSink& progressSink);
|
|
|
@ -0,0 +1,218 @@
|
||||||
|
#include "pocketSphinxTools.h"
|
||||||
|
|
||||||
|
#include "tools/platformTools.h"
|
||||||
|
#include <regex>
|
||||||
|
#include "audio/DcOffset.h"
|
||||||
|
#include "audio/voiceActivityDetection.h"
|
||||||
|
#include "tools/parallel.h"
|
||||||
|
#include "tools/ObjectPool.h"
|
||||||
|
#include "time/timedLogging.h"
|
||||||
|
|
||||||
|
extern "C" {
|
||||||
|
#include <sphinxbase/err.h>
|
||||||
|
#include <pocketsphinx_internal.h>
|
||||||
|
#include <ngram_search.h>
|
||||||
|
}
|
||||||
|
|
||||||
|
using std::runtime_error;
|
||||||
|
using std::invalid_argument;
|
||||||
|
using std::unique_ptr;
|
||||||
|
using std::string;
|
||||||
|
using std::vector;
|
||||||
|
using boost::filesystem::path;
|
||||||
|
using std::regex;
|
||||||
|
using boost::optional;
|
||||||
|
using std::chrono::duration_cast;
|
||||||
|
|
||||||
|
// Translates a Sphinx error level into the corresponding logging level.
// Debug and info levels are all mapped to Trace.
// Throws invalid_argument for any other value.
logging::Level convertSphinxErrorLevel(err_lvl_t errorLevel) {
	const bool isInformational =
		errorLevel == ERR_DEBUG || errorLevel == ERR_INFO || errorLevel == ERR_INFOCONT;
	if (isInformational) return logging::Level::Trace;
	if (errorLevel == ERR_WARN) return logging::Level::Warn;
	if (errorLevel == ERR_ERROR) return logging::Level::Error;
	if (errorLevel == ERR_FATAL) return logging::Level::Fatal;

	throw invalid_argument("Unknown log level.");
}
|
||||||
|
|
||||||
|
void sphinxLogCallback(void* user_data, err_lvl_t errorLevel, const char* format, ...) {
|
||||||
|
UNUSED(user_data);
|
||||||
|
|
||||||
|
// Create varArgs list
|
||||||
|
va_list args;
|
||||||
|
va_start(args, format);
|
||||||
|
auto _ = gsl::finally([&args]() { va_end(args); });
|
||||||
|
|
||||||
|
// Format message
|
||||||
|
const int initialSize = 256;
|
||||||
|
vector<char> chars(initialSize);
|
||||||
|
bool success = false;
|
||||||
|
while (!success) {
|
||||||
|
const int charsWritten = vsnprintf(chars.data(), chars.size(), format, args);
|
||||||
|
if (charsWritten < 0) throw runtime_error("Error formatting PocketSphinx log message.");
|
||||||
|
|
||||||
|
success = charsWritten < static_cast<int>(chars.size());
|
||||||
|
if (!success) chars.resize(chars.size() * 2);
|
||||||
|
}
|
||||||
|
const regex waste("^(DEBUG|INFO|INFOCONT|WARN|ERROR|FATAL): ");
|
||||||
|
string message = std::regex_replace(chars.data(), waste, "", std::regex_constants::format_first_only);
|
||||||
|
boost::algorithm::trim(message);
|
||||||
|
|
||||||
|
const logging::Level logLevel = convertSphinxErrorLevel(errorLevel);
|
||||||
|
logging::log(logLevel, message);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Routes PocketSphinx output to sphinxLogCallback instead of a log file.
// Only the first call has an effect; later calls return immediately.
void redirectPocketSphinxOutput() {
	static bool redirected = false;
	if (!redirected) {
		// Discard PocketSphinx output
		err_set_logfp(nullptr);

		// Redirect PocketSphinx output to log
		err_set_callback(sphinxLogCallback, nullptr);

		redirected = true;
	}
}
|
||||||
|
|
||||||
|
// Recognizes the phones in an audio clip.
// The clip is stripped of its DC offset and split into utterances via voice activity detection.
// Each utterance is then converted to phones by utteranceToPhones, using decoders that are
// created on demand by createDecoder. Utterances are processed in parallel.
// Errors are re-thrown nested in a runtime_error describing the failed stage.
BoundedTimeline<Phone> recognizePhones(
	const AudioClip& inputAudioClip,
	optional<std::string> dialog,
	decoderFactory createDecoder,
	utteranceToPhonesFunction utteranceToPhones,
	int maxThreadCount,
	ProgressSink& progressSink
) {
	// Progress weights: 1 for voice activity detection, 15 for the recognition itself
	ProgressMerger progressMerger(progressSink);
	ProgressSink& voiceActivityProgress = progressMerger.addSink(1.0);
	ProgressSink& recognitionProgress = progressMerger.addSink(15);

	// Make sure audio stream has no DC offset
	const unique_ptr<AudioClip> audioClip = inputAudioClip.clone() | removeDcOffset();

	// Split audio into utterances
	JoiningBoundedTimeline<void> utterances;
	try {
		utterances = detectVoiceActivity(*audioClip, maxThreadCount, voiceActivityProgress);
	} catch (...) {
		std::throw_with_nested(runtime_error("Error detecting segments of speech."));
	}

	redirectPocketSphinxOutput();

	// Prepare pool of decoders
	ObjectPool<ps_decoder_t, lambda_unique_ptr<ps_decoder_t>> decoderPool(
		[&] { return createDecoder(dialog); });

	BoundedTimeline<Phone> phones(audioClip->getTruncatedRange());
	std::mutex phonesMutex;
	const auto recognizeUtterance = [&](Timed<void> utterance, ProgressSink& utteranceProgress) {
		// Detect phones for utterance
		const auto decoder = decoderPool.acquire();
		Timeline<Phone> recognizedPhones =
			utteranceToPhones(*audioClip, utterance.getTimeRange(), *decoder, utteranceProgress);

		// Copy phones to the shared result timeline, guarded by the mutex
		std::lock_guard<std::mutex> lock(phonesMutex);
		for (const auto& timedPhone : recognizedPhones) {
			phones.set(timedPhone);
		}
	};

	// Longer utterances account for a larger share of the progress
	const auto weightByDuration = [](const Timed<void> utterance) {
		return utterance.getDuration().count();
	};

	// Perform speech recognition
	try {
		// Determine how many parallel threads to use, but always use at least one
		const auto clipSeconds =
			duration_cast<std::chrono::seconds>(audioClip->getTruncatedRange().getDuration()).count();
		const int threadCount = std::max(1, std::min({
			maxThreadCount,
			// Don't use more threads than there are utterances to be processed
			static_cast<int>(utterances.size()),
			// Don't waste time creating additional threads (and decoders!) if the recording is short
			static_cast<int>(clipSeconds / 5)
		}));

		logging::debugFormat("Speech recognition using {} threads -- start", threadCount);
		runParallel(recognizeUtterance, utterances, threadCount, recognitionProgress, weightByDuration);
		logging::debug("Speech recognition -- end");
	} catch (...) {
		std::throw_with_nested(runtime_error("Error performing speech recognition via PocketSphinx."));
	}

	return phones;
}
|
||||||
|
|
||||||
|
// Returns the directory "res/sphinx" below the binary directory.
// The path is computed once, on first use.
const path& getSphinxModelDirectory() {
	static path modelDirectory = getBinDirectory() / "res" / "sphinx";
	return modelDirectory;
}
|
||||||
|
|
||||||
|
// Returns those parts of the utterance time range that are not covered by any phone,
// excluding parts that start at time zero and parts shorter than 12 cs.
JoiningTimeline<void> getNoiseSounds(TimeRange utteranceTimeRange, const Timeline<Phone>& phones) {
	// Find utterance parts without recognized phones:
	// start with the whole utterance, then cut out every phone
	JoiningTimeline<void> noiseSounds;
	noiseSounds.set(utteranceTimeRange);
	for (const auto& timedPhone : phones) {
		noiseSounds.clear(timedPhone.getTimeRange());
	}

	// Remove undesired elements.
	// Iterate over a snapshot, since noiseSounds is modified within the loop.
	const centiseconds minSoundDuration = 12_cs;
	JoiningTimeline<void> snapshot(noiseSounds);
	for (const auto& candidate : snapshot) {
		const bool keep = candidate.getStart() != 0_cs && candidate.getDuration() >= minSoundDuration;
		if (keep) continue;

		noiseSounds.clear(candidate.getTimeRange());
	}

	return noiseSounds;
}
|
||||||
|
|
||||||
|
// Runs word recognition on the given audio samples and returns the recognized words with their timing.
// The result timeline spans the duration of the buffer, computed from sphinxSampleRate.
// Throws runtime_error if PocketSphinx reports an error while starting, processing, or ending the utterance.
BoundedTimeline<string> recognizeWords(const vector<int16_t>& audioBuffer, ps_decoder_t& decoder) {
	// Restart timing at 0
	ps_start_stream(&decoder);

	// Start recognition
	if (ps_start_utt(&decoder)) {
		throw runtime_error("Error starting utterance processing for word recognition.");
	}

	// Process entire audio clip
	const bool noRecognition = false;
	const bool fullUtterance = true;
	const int searchedFrameCount =
		ps_process_raw(&decoder, audioBuffer.data(), audioBuffer.size(), noRecognition, fullUtterance);
	if (searchedFrameCount < 0) {
		throw runtime_error("Error analyzing raw audio data for word recognition.");
	}

	// End recognition
	if (ps_end_utt(&decoder)) {
		throw runtime_error("Error ending utterance processing for word recognition.");
	}

	BoundedTimeline<string> words(TimeRange(0_cs, centiseconds(100 * audioBuffer.size() / sphinxSampleRate)));

	// Collect words, unless the search's bpidx is zero, which is treated as "no words recognized"
	const bool anyWordsRecognized = reinterpret_cast<ngram_search_t*>(decoder.search)->bpidx != 0;
	if (anyWordsRecognized) {
		for (ps_seg_t* segment = ps_seg_iter(&decoder); segment; segment = ps_seg_next(segment)) {
			const char* word = ps_seg_word(segment);
			int firstFrame, lastFrame;
			ps_seg_frames(segment, &firstFrame, &lastFrame);
			// lastFrame is inclusive, while the timeline expects an exclusive end
			words.set(centiseconds(firstFrame), centiseconds(lastFrame + 1), word);
		}
	}

	return words;
}
|
|
@ -0,0 +1,39 @@
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include "time/BoundedTimeline.h"
|
||||||
|
#include "core/Phone.h"
|
||||||
|
#include "audio/AudioClip.h"
|
||||||
|
#include "tools/ProgressBar.h"
|
||||||
|
#include <boost/filesystem/path.hpp>
|
||||||
|
|
||||||
|
extern "C" {
|
||||||
|
#include <pocketsphinx.h>
|
||||||
|
}
|
||||||
|
|
||||||
|
// Callable that creates a decoder for an optional dialog text.
using decoderFactory = std::function<lambda_unique_ptr<ps_decoder_t>(
	boost::optional<std::string> dialog
)>;
|
||||||
|
|
||||||
|
// Callable that converts one utterance of an audio clip into a timeline of phones, using the given decoder.
using utteranceToPhonesFunction = std::function<Timeline<Phone>(
	const AudioClip& audioClip,
	TimeRange utteranceTimeRange,
	ps_decoder_t& decoder,
	ProgressSink& utteranceProgressSink
)>;
|
||||||
|
|
||||||
|
BoundedTimeline<Phone> recognizePhones(
|
||||||
|
const AudioClip& inputAudioClip,
|
||||||
|
boost::optional<std::string> dialog,
|
||||||
|
decoderFactory createDecoder,
|
||||||
|
utteranceToPhonesFunction utteranceToPhones,
|
||||||
|
int maxThreadCount,
|
||||||
|
ProgressSink& progressSink
|
||||||
|
);
|
||||||
|
|
||||||
|
constexpr int sphinxSampleRate = 16000;
|
||||||
|
|
||||||
|
const boost::filesystem::path& getSphinxModelDirectory();
|
||||||
|
|
||||||
|
JoiningTimeline<void> getNoiseSounds(TimeRange utteranceTimeRange, const Timeline<Phone>& phones);
|
||||||
|
|
||||||
|
BoundedTimeline<std::string> recognizeWords(const std::vector<int16_t>& audioBuffer, ps_decoder_t& decoder);
|
|
@ -0,0 +1,27 @@
|
||||||
|
#include "RecognizerType.h"
|
||||||
|
|
||||||
|
using std::string;
|
||||||
|
|
||||||
|
// Returns the shared converter instance, which is created on first use.
RecognizerTypeConverter& RecognizerTypeConverter::get() {
	static RecognizerTypeConverter instance;
	return instance;
}
|
||||||
|
|
||||||
|
// Returns the name of the enum type handled by this converter.
string RecognizerTypeConverter::getTypeName() {
	const string typeName = "RecognizerType";
	return typeName;
}
|
||||||
|
|
||||||
|
// Returns the list of enum values together with their string representations.
EnumConverter<RecognizerType>::member_data RecognizerTypeConverter::getMemberData() {
	member_data members{
		{ RecognizerType::PocketSphinx, "pocketSphinx" },
		{ RecognizerType::Phonetic, "phonetic" }
	};
	return members;
}
|
||||||
|
|
||||||
|
std::ostream& operator<<(std::ostream& stream, RecognizerType value) {
|
||||||
|
return RecognizerTypeConverter::get().write(stream, value);
|
||||||
|
}
|
||||||
|
|
||||||
|
std::istream& operator>>(std::istream& stream, RecognizerType& value) {
|
||||||
|
return RecognizerTypeConverter::get().read(stream, value);
|
||||||
|
}
|
|
@ -0,0 +1,20 @@
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include "tools/EnumConverter.h"
|
||||||
|
|
||||||
|
// The available kinds of recognizers.
enum class RecognizerType {
	PocketSphinx,
	Phonetic
};
|
||||||
|
|
||||||
|
// EnumConverter specialization for RecognizerType.
class RecognizerTypeConverter : public EnumConverter<RecognizerType> {
public:
	// Returns the shared converter instance
	static RecognizerTypeConverter& get();

protected:
	// Returns the name of the enum type
	std::string getTypeName() override;

	// Returns the enum values together with their string representations
	member_data getMemberData() override;
};
|
||||||
|
|
||||||
|
std::ostream& operator<<(std::ostream& stream, RecognizerType value);
|
||||||
|
|
||||||
|
std::istream& operator>>(std::istream& stream, RecognizerType& value);
|
|
@ -27,6 +27,9 @@
|
||||||
#include "tools/platformTools.h"
|
#include "tools/platformTools.h"
|
||||||
#include "sinks.h"
|
#include "sinks.h"
|
||||||
#include "semanticEntries.h"
|
#include "semanticEntries.h"
|
||||||
|
#include "RecognizerType.h"
|
||||||
|
#include "recognition/PocketSphinxRecognizer.h"
|
||||||
|
#include "recognition/PhoneticRecognizer.h"
|
||||||
|
|
||||||
using std::exception;
|
using std::exception;
|
||||||
using std::string;
|
using std::string;
|
||||||
|
@ -36,9 +39,6 @@ using std::unique_ptr;
|
||||||
using std::make_unique;
|
using std::make_unique;
|
||||||
using std::shared_ptr;
|
using std::shared_ptr;
|
||||||
using std::make_shared;
|
using std::make_shared;
|
||||||
using std::map;
|
|
||||||
using std::chrono::duration;
|
|
||||||
using std::chrono::duration_cast;
|
|
||||||
using std::ofstream;
|
using std::ofstream;
|
||||||
using boost::filesystem::path;
|
using boost::filesystem::path;
|
||||||
using boost::adaptors::transformed;
|
using boost::adaptors::transformed;
|
||||||
|
@ -56,6 +56,10 @@ namespace TCLAP {
|
||||||
struct ArgTraits<ExportFormat> {
|
struct ArgTraits<ExportFormat> {
|
||||||
typedef ValueLike ValueCategory;
|
typedef ValueLike ValueCategory;
|
||||||
};
|
};
|
||||||
|
template<>
|
||||||
|
struct ArgTraits<RecognizerType> {
|
||||||
|
typedef ValueLike ValueCategory;
|
||||||
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
shared_ptr<logging::Sink> createFileSink(path path, logging::Level minLevel) {
|
shared_ptr<logging::Sink> createFileSink(path path, logging::Level minLevel) {
|
||||||
|
@ -66,6 +70,17 @@ shared_ptr<logging::Sink> createFileSink(path path, logging::Level minLevel) {
|
||||||
return make_shared<logging::LevelFilter>(FileSink, minLevel);
|
return make_shared<logging::LevelFilter>(FileSink, minLevel);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Creates the recognizer implementation that corresponds to recognizerType.
//
// Returns a newly allocated PocketSphinxRecognizer or PhoneticRecognizer,
// owned by the caller through the returned unique_ptr.
// Throws std::runtime_error("Unknown recognizer.") if the value matches
// neither known enumerator.
unique_ptr<Recognizer> createRecognizer(RecognizerType recognizerType) {
	if (recognizerType == RecognizerType::PocketSphinx) {
		return make_unique<PocketSphinxRecognizer>();
	}
	if (recognizerType == RecognizerType::Phonetic) {
		return make_unique<PhoneticRecognizer>();
	}
	// Reached only for a value outside the declared enumerators.
	throw std::runtime_error("Unknown recognizer.");
}
|
||||||
|
|
||||||
unique_ptr<Exporter> createExporter(ExportFormat exportFormat) {
|
unique_ptr<Exporter> createExporter(ExportFormat exportFormat) {
|
||||||
switch (exportFormat) {
|
switch (exportFormat) {
|
||||||
case ExportFormat::Tsv:
|
case ExportFormat::Tsv:
|
||||||
|
@ -123,6 +138,9 @@ int main(int platformArgc, char *platformArgv[]) {
|
||||||
auto exportFormats = vector<ExportFormat>(ExportFormatConverter::get().getValues());
|
auto exportFormats = vector<ExportFormat>(ExportFormatConverter::get().getValues());
|
||||||
tclap::ValuesConstraint<ExportFormat> exportFormatConstraint(exportFormats);
|
tclap::ValuesConstraint<ExportFormat> exportFormatConstraint(exportFormats);
|
||||||
tclap::ValueArg<ExportFormat> exportFormat("f", "exportFormat", "The export format.", false, ExportFormat::Tsv, &exportFormatConstraint, cmd);
|
tclap::ValueArg<ExportFormat> exportFormat("f", "exportFormat", "The export format.", false, ExportFormat::Tsv, &exportFormatConstraint, cmd);
|
||||||
|
auto recognizerTypes = vector<RecognizerType>(RecognizerTypeConverter::get().getValues());
|
||||||
|
tclap::ValuesConstraint<RecognizerType> recognizerConstraint(recognizerTypes);
|
||||||
|
tclap::ValueArg<RecognizerType> recognizerType("r", "recognizer", "The dialog recognizer.", false, RecognizerType::PocketSphinx, &recognizerConstraint, cmd);
|
||||||
tclap::UnlabeledValueArg<string> inputFileName("inputFile", "The input file. Must be a sound file in WAVE format.", true, "", "string", cmd);
|
tclap::UnlabeledValueArg<string> inputFileName("inputFile", "The input file. Must be a sound file in WAVE format.", true, "", "string", cmd);
|
||||||
|
|
||||||
try {
|
try {
|
||||||
|
@ -169,6 +187,7 @@ int main(int platformArgc, char *platformArgv[]) {
|
||||||
JoiningContinuousTimeline<Shape> animation = animateWaveFile(
|
JoiningContinuousTimeline<Shape> animation = animateWaveFile(
|
||||||
inputFilePath,
|
inputFilePath,
|
||||||
dialogFile.isSet() ? readUtf8File(path(dialogFile.getValue())) : boost::optional<string>(),
|
dialogFile.isSet() ? readUtf8File(path(dialogFile.getValue())) : boost::optional<string>(),
|
||||||
|
*createRecognizer(recognizerType.getValue()),
|
||||||
targetShapeSet,
|
targetShapeSet,
|
||||||
maxThreadCount.getValue(),
|
maxThreadCount.getValue(),
|
||||||
progressSink);
|
progressSink);
|
||||||
|
|
Loading…
Reference in New Issue