Implement generic concept of recognizers with options pocketSphinx and phonetic
This commit is contained in:
parent 3ed38ada2f
commit 610f490046
CMakeLists.txt
@@ -413,8 +413,13 @@ add_library(rhubarb-recognition
    src/recognition/g2p.h
    src/recognition/languageModels.cpp
    src/recognition/languageModels.h
    src/recognition/phoneRecognition.cpp
    src/recognition/phoneRecognition.h
    src/recognition/PhoneticRecognizer.cpp
    src/recognition/PhoneticRecognizer.h
    src/recognition/PocketSphinxRecognizer.cpp
    src/recognition/PocketSphinxRecognizer.h
    src/recognition/pocketSphinxTools.cpp
    src/recognition/pocketSphinxTools.h
    src/recognition/Recognizer.h
    src/recognition/tokenization.cpp
    src/recognition/tokenization.h
)
@@ -487,6 +492,8 @@ add_executable(rhubarb
    src/rhubarb/main.cpp
    src/rhubarb/ExportFormat.cpp
    src/rhubarb/ExportFormat.h
    src/rhubarb/RecognizerType.cpp
    src/rhubarb/RecognizerType.h
    src/rhubarb/semanticEntries.cpp
    src/rhubarb/semanticEntries.h
    src/rhubarb/sinks.cpp
@@ -1,7 +1,12 @@
<wpf:ResourceDictionary xml:space="preserve" xmlns:x="http://schemas.microsoft.com/winfx/2006/xaml" xmlns:s="clr-namespace:System;assembly=mscorlib" xmlns:ss="urn:shemas-jetbrains-com:settings-storage-xaml" xmlns:wpf="http://schemas.microsoft.com/winfx/2006/xaml/presentation">
    <s:String x:Key="/Default/CodeInspection/Highlighting/InspectionSeverities/=CppClangTidyModernizeRawStringLiteral/@EntryIndexedValue">HINT</s:String>

    <s:String x:Key="/Default/CodeInspection/Highlighting/InspectionSeverities/=CppFunctionDoesntReturnValue/@EntryIndexedValue">ERROR</s:String>

    <s:String x:Key="/Default/CodeInspection/Highlighting/InspectionSeverities/=LocalizableElement/@EntryIndexedValue">DO_NOT_SHOW</s:String>

    <s:String x:Key="/Default/CodeStyle/CodeFormatting/CommonFormatter/ALIGNMENT_TAB_FILL_STYLE/@EntryValue">USE_TABS_ONLY</s:String>
    <s:String x:Key="/Default/CodeStyle/CodeFormatting/CppFormatting/ALIGNMENT_TAB_FILL_STYLE/@EntryValue">USE_TABS_ONLY</s:String>
    <s:Boolean x:Key="/Default/CodeStyle/CodeFormatting/CppFormatting/ALIGN_MULTILINE_ARGUMENT/@EntryValue">False</s:Boolean>
    <s:Boolean x:Key="/Default/CodeStyle/CodeFormatting/CppFormatting/ALIGN_MULTILINE_CALLS_CHAIN/@EntryValue">False</s:Boolean>
    <s:Boolean x:Key="/Default/CodeStyle/CodeFormatting/CppFormatting/ALIGN_MULTILINE_CTOR_INIT/@EntryValue">False</s:Boolean>
@@ -29,6 +34,7 @@
    <s:String x:Key="/Default/CodeStyle/CodeFormatting/CppFormatting/WRAP_ENUMERATION_STYLE/@EntryValue">CHOP_ALWAYS</s:String>
    <s:String x:Key="/Default/CodeStyle/CodeFormatting/CSharpFormat/ACCESSOR_DECLARATION_BRACES/@EntryValue">END_OF_LINE</s:String>
    <s:String x:Key="/Default/CodeStyle/CodeFormatting/CSharpFormat/ACCESSOR_OWNER_DECLARATION_BRACES/@EntryValue">END_OF_LINE</s:String>
    <s:String x:Key="/Default/CodeStyle/CodeFormatting/CSharpFormat/ALIGNMENT_TAB_FILL_STYLE/@EntryValue">USE_TABS_ONLY</s:String>
    <s:Boolean x:Key="/Default/CodeStyle/CodeFormatting/CSharpFormat/ALIGN_MULTILINE_BINARY_EXPRESSIONS_CHAIN/@EntryValue">False</s:Boolean>
    <s:String x:Key="/Default/CodeStyle/CodeFormatting/CSharpFormat/ANONYMOUS_METHOD_DECLARATION_BRACES/@EntryValue">END_OF_LINE</s:String>
    <s:String x:Key="/Default/CodeStyle/CodeFormatting/CSharpFormat/CASE_BLOCK_BRACES/@EntryValue">END_OF_LINE</s:String>
@@ -44,6 +50,14 @@
    <s:Boolean x:Key="/Default/CodeStyle/CodeFormatting/CSharpFormat/USE_INDENT_FROM_VS/@EntryValue">False</s:Boolean>
    <s:Boolean x:Key="/Default/CodeStyle/CodeFormatting/CSharpFormat/WRAP_BEFORE_BINARY_OPSIGN/@EntryValue">True</s:Boolean>
    <s:Boolean x:Key="/Default/CodeStyle/CodeFormatting/CSharpFormat/WRAP_LINES/@EntryValue">False</s:Boolean>
    <s:String x:Key="/Default/CodeStyle/CodeFormatting/CssFormatter/ALIGNMENT_TAB_FILL_STYLE/@EntryValue">USE_TABS_ONLY</s:String>
    <s:String x:Key="/Default/CodeStyle/CodeFormatting/HtmlFormatter/ALIGNMENT_TAB_FILL_STYLE/@EntryValue">USE_TABS_ONLY</s:String>
    <s:String x:Key="/Default/CodeStyle/CodeFormatting/JavaScriptCodeFormatting/ALIGNMENT_TAB_FILL_STYLE/@EntryValue">USE_TABS_ONLY</s:String>
    <s:String x:Key="/Default/CodeStyle/CodeFormatting/ProtobufCodeFormatting/ALIGNMENT_TAB_FILL_STYLE/@EntryValue">USE_TABS_ONLY</s:String>
    <s:String x:Key="/Default/CodeStyle/CodeFormatting/ResxFormatter/ALIGNMENT_TAB_FILL_STYLE/@EntryValue">USE_TABS_ONLY</s:String>
    <s:String x:Key="/Default/CodeStyle/CodeFormatting/VBFormat/ALIGNMENT_TAB_FILL_STYLE/@EntryValue">USE_TABS_ONLY</s:String>
    <s:String x:Key="/Default/CodeStyle/CodeFormatting/XmlDocFormatter/ALIGNMENT_TAB_FILL_STYLE/@EntryValue">USE_TABS_ONLY</s:String>
    <s:String x:Key="/Default/CodeStyle/CodeFormatting/XmlFormatter/ALIGNMENT_TAB_FILL_STYLE/@EntryValue">USE_TABS_ONLY</s:String>
    <s:String x:Key="/Default/CodeStyle/CSharpVarKeywordUsage/ForBuiltInTypes/@EntryValue">UseExplicitType</s:String>
    <s:String x:Key="/Default/CodeStyle/CSharpVarKeywordUsage/ForSimpleTypes/@EntryValue">UseVarWhenEvident</s:String>
    <s:String x:Key="/Default/CodeStyle/Naming/CppNaming/Rules/=Class_0020and_0020struct_0020fields/@EntryIndexedValue"><NamingElement Priority="10"><Descriptor Static="Indeterminate" Constexpr="Indeterminate" Const="Indeterminate" Volatile="Indeterminate" Accessibility="NOT_APPLICABLE"><type Name="class field" /><type Name="struct field" /></Descriptor><Policy Inspect="True" Prefix="" Suffix="_" Style="aaBb" /></NamingElement></s:String>
@@ -108,7 +122,16 @@
    <s:String x:Key="/Default/Environment/Hierarchy/PsiConfigurationSettingsKey/CustomLocation/@EntryValue">C:\Users\Daniel\AppData\Local\JetBrains\Transient\ReSharperPlatformVs14\v09\SolutionCaches</s:String>
    <s:Boolean x:Key="/Default/Environment/SettingsMigration/IsMigratorApplied/=JetBrains_002EReSharper_002EFeature_002EServices_002ECpp_002ECodeStyle_002ESettingsUpgrade_002EFunctionReturnStyleSettingsUpgrader/@EntryIndexedValue">True</s:Boolean>
    <s:Boolean x:Key="/Default/Environment/SettingsMigration/IsMigratorApplied/=JetBrains_002EReSharper_002EFeature_002EServices_002ECpp_002ECodeStyle_002ESettingsUpgrade_002ENamespaceIndentationSettingsUpgrader/@EntryIndexedValue">True</s:Boolean>
    <s:Boolean x:Key="/Default/Environment/SettingsMigration/IsMigratorApplied/=JetBrains_002EReSharper_002EPsi_002ECSharp_002ECodeStyle_002ECSharpKeepExistingMigration/@EntryIndexedValue">True</s:Boolean>
    <s:Boolean x:Key="/Default/Environment/SettingsMigration/IsMigratorApplied/=JetBrains_002EReSharper_002EPsi_002ECSharp_002ECodeStyle_002ECSharpPlaceEmbeddedOnSameLineMigration/@EntryIndexedValue">True</s:Boolean>
    <s:Boolean x:Key="/Default/Environment/SettingsMigration/IsMigratorApplied/=JetBrains_002EReSharper_002EPsi_002ECSharp_002ECodeStyle_002ESettingsUpgrade_002EAddAccessorOwnerDeclarationBracesMigration/@EntryIndexedValue">True</s:Boolean>
    <s:Boolean x:Key="/Default/Environment/SettingsMigration/IsMigratorApplied/=JetBrains_002EReSharper_002EPsi_002ECSharp_002ECodeStyle_002ESettingsUpgrade_002EMigrateBlankLinesAroundFieldToBlankLinesAroundProperty/@EntryIndexedValue">True</s:Boolean>
    <s:Boolean x:Key="/Default/Environment/SettingsMigration/IsMigratorApplied/=JetBrains_002EReSharper_002EPsi_002ECSharp_002ECodeStyle_002ESettingsUpgrade_002EMigrateThisQualifierSettings/@EntryIndexedValue">True</s:Boolean>
    <s:Boolean x:Key="/Default/Environment/SettingsMigration/IsMigratorApplied/=JetBrains_002EReSharper_002EPsi_002EFormat_002ESettingsUpgrade_002EAlignmentTabFillStyleMigration/@EntryIndexedValue">True</s:Boolean>
    <s:Boolean x:Key="/Default/UserDictionary/Words/=allphone/@EntryIndexedValue">True</s:Boolean>
    <s:Boolean x:Key="/Default/UserDictionary/Words/=cepstral/@EntryIndexedValue">True</s:Boolean>
    <s:Boolean x:Key="/Default/UserDictionary/Words/=cmudict/@EntryIndexedValue">True</s:Boolean>
    <s:Boolean x:Key="/Default/UserDictionary/Words/=pbeam/@EntryIndexedValue">True</s:Boolean>
    <s:Boolean x:Key="/Default/UserDictionary/Words/=qwhy/@EntryIndexedValue">True</s:Boolean>
    <s:Boolean x:Key="/Default/UserDictionary/Words/=Viterbi/@EntryIndexedValue">True</s:Boolean>
</wpf:ResourceDictionary>
rhubarbLib.cpp
@@ -1,6 +1,5 @@
#include "rhubarbLib.h"
#include "core/Phone.h"
#include "recognition/phoneRecognition.h"
#include "tools/textFiles.h"
#include "animation/mouthAnimation.h"
#include "audio/audioFileReading.h"
@@ -8,27 +7,29 @@
using boost::optional;
using std::string;
using boost::filesystem::path;
using std::unique_ptr;

JoiningContinuousTimeline<Shape> animateAudioClip(
    const AudioClip& audioClip,
    optional<string> dialog,
    const optional<string>& dialog,
    const Recognizer& recognizer,
    const ShapeSet& targetShapeSet,
    int maxThreadCount,
    ProgressSink& progressSink)
{
    BoundedTimeline<Phone> phones = recognizePhones(audioClip, dialog, maxThreadCount, progressSink);
    const BoundedTimeline<Phone> phones =
        recognizer.recognizePhones(audioClip, dialog, maxThreadCount, progressSink);
    JoiningContinuousTimeline<Shape> result = animate(phones, targetShapeSet);
    return result;
}

JoiningContinuousTimeline<Shape> animateWaveFile(
    path filePath,
    optional<string> dialog,
    const optional<string>& dialog,
    const Recognizer& recognizer,
    const ShapeSet& targetShapeSet,
    int maxThreadCount,
    ProgressSink& progressSink)
{
    const auto audioClip = createAudioFileClip(filePath);
    return animateAudioClip(*audioClip, dialog, targetShapeSet, maxThreadCount, progressSink);
    return animateAudioClip(*audioClip, dialog, recognizer, targetShapeSet, maxThreadCount, progressSink);
}
rhubarbLib.h
@@ -6,17 +6,20 @@
#include "tools/ProgressBar.h"
#include <boost/filesystem.hpp>
#include "animation/targetShapeSet.h"
#include "recognition/Recognizer.h"

JoiningContinuousTimeline<Shape> animateAudioClip(
    const AudioClip& audioClip,
    boost::optional<std::string> dialog,
    const boost::optional<std::string>& dialog,
    const Recognizer& recognizer,
    const ShapeSet& targetShapeSet,
    int maxThreadCount,
    ProgressSink& progressSink);

JoiningContinuousTimeline<Shape> animateWaveFile(
    boost::filesystem::path filePath,
    boost::optional<std::string> dialog,
    const boost::optional<std::string>& dialog,
    const Recognizer& recognizer,
    const ShapeSet& targetShapeSet,
    int maxThreadCount,
    ProgressSink& progressSink);
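A minimal usage sketch of the updated library entry point (not part of this commit): the recognizer is now chosen by the caller instead of being hard-wired into the library. The file name, dialog text, thread count, shape set, and progress sink below are placeholder assumptions; PocketSphinxRecognizer is defined later in this commit.

    #include "rhubarbLib.h"
    #include "recognition/PocketSphinxRecognizer.h"

    // Hypothetical call site; targetShapeSet and progressSink are assumed
    // to be supplied by the surrounding application.
    JoiningContinuousTimeline<Shape> animateExample(
        const ShapeSet& targetShapeSet, ProgressSink& progressSink)
    {
        const PocketSphinxRecognizer recognizer;
        return animateWaveFile(
            "dialog.wav",                                  // placeholder path
            boost::optional<std::string>("Hello world!"),  // optional dialog text
            recognizer,
            targetShapeSet,
            4,                                             // max thread count
            progressSink);
    }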
src/recognition/PhoneticRecognizer.cpp (new file)
@@ -0,0 +1,103 @@
#include "PhoneticRecognizer.h"
#include "time/Timeline.h"
#include "audio/AudioSegment.h"
#include "audio/SampleRateConverter.h"
#include "audio/processing.h"
#include "time/timedLogging.h"

using std::runtime_error;
using std::unique_ptr;
using std::string;
using boost::optional;

static lambda_unique_ptr<ps_decoder_t> createDecoder(optional<std::string> dialog) {
    UNUSED(dialog);

    lambda_unique_ptr<cmd_ln_t> config(
        cmd_ln_init(
            nullptr, ps_args(), true,
            // Set acoustic model
            "-hmm", (getSphinxModelDirectory() / "acoustic-model").string().c_str(),
            // Set phonetic language model
            "-allphone", (getSphinxModelDirectory() / "en-us-phone.lm.bin").string().c_str(),
            "-allphone_ci", "yes",
            // Set language model probability weight.
            // Low values (<= 0.4) can lead to fluttering animation.
            // High values (>= 1.0) can lead to imprecise or freezing animation.
            "-lw", "0.8",

            // The following settings are recommended at http://cmusphinx.sourceforge.net/wiki/phonemerecognition

            // Set beam width applied to every frame in Viterbi search
            "-beam", "1e-20",
            // Set beam width applied to phone transitions
            "-pbeam", "1e-20",
            nullptr),
        [](cmd_ln_t* config) { cmd_ln_free_r(config); });
    if (!config) throw runtime_error("Error creating configuration.");

    lambda_unique_ptr<ps_decoder_t> decoder(
        ps_init(config.get()),
        [](ps_decoder_t* recognizer) { ps_free(recognizer); });
    if (!decoder) throw runtime_error("Error creating speech decoder.");

    return decoder;
}

static Timeline<Phone> utteranceToPhones(
    const AudioClip& audioClip,
    TimeRange utteranceTimeRange,
    ps_decoder_t& decoder,
    ProgressSink& utteranceProgressSink
) {
    // Pad time range to give PocketSphinx some breathing room
    TimeRange paddedTimeRange = utteranceTimeRange;
    const centiseconds padding(3);
    paddedTimeRange.grow(padding);
    paddedTimeRange.trim(audioClip.getTruncatedRange());

    const unique_ptr<AudioClip> clipSegment = audioClip.clone() | segment(paddedTimeRange) | resample(sphinxSampleRate);
    const auto audioBuffer = copyTo16bitBuffer(*clipSegment);

    // Detect phones (returned as words)
    BoundedTimeline<string> phoneStrings = recognizeWords(audioBuffer, decoder);
    phoneStrings.shift(paddedTimeRange.getStart());
    Timeline<Phone> utterancePhones;
    for (const auto& timedPhoneString : phoneStrings) {
        Phone phone = PhoneConverter::get().parse(timedPhoneString.getValue());
        if (phone == Phone::AH && timedPhoneString.getDuration() < 6_cs) {
            // Heuristic: < 6_cs is schwa. PocketSphinx doesn't differentiate.
            phone = Phone::Schwa;
        }
        utterancePhones.set(timedPhoneString.getTimeRange(), phone);
    }

    // Log raw phones
    for (const auto& timedPhone : utterancePhones) {
        logTimedEvent("rawPhone", timedPhone);
    }

    // Guess positions of noise sounds
    JoiningTimeline<void> noiseSounds = getNoiseSounds(utteranceTimeRange, utterancePhones);
    for (const auto& noiseSound : noiseSounds) {
        utterancePhones.set(noiseSound.getTimeRange(), Phone::Noise);
    }

    // Log phones
    for (const auto& timedPhone : utterancePhones) {
        logTimedEvent("phone", timedPhone);
    }

    utteranceProgressSink.reportProgress(1.0);

    return utterancePhones;
}

BoundedTimeline<Phone> PhoneticRecognizer::recognizePhones(
    const AudioClip& inputAudioClip,
    optional<std::string> dialog,
    int maxThreadCount,
    ProgressSink& progressSink
) const {
    return ::recognizePhones(inputAudioClip, dialog, &createDecoder, &utteranceToPhones, maxThreadCount, progressSink);
}
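The `-lw` comment above is the central tuning note of this recognizer: too low a language-model weight causes fluttering animation, too high causes imprecise or freezing animation. A hedged sketch (not part of this commit) of how that knob could be made injectable for tuning experiments, reusing this file's existing includes and helpers; the function name and default are assumptions:

    // Hypothetical variant of createDecoder above with the language-model
    // weight parameterized; all other settings are unchanged.
    static lambda_unique_ptr<ps_decoder_t> createDecoderWithWeight(double languageModelWeight = 0.8) {
        const std::string weight = std::to_string(languageModelWeight);
        lambda_unique_ptr<cmd_ln_t> config(
            cmd_ln_init(
                nullptr, ps_args(), true,
                "-hmm", (getSphinxModelDirectory() / "acoustic-model").string().c_str(),
                "-allphone", (getSphinxModelDirectory() / "en-us-phone.lm.bin").string().c_str(),
                "-allphone_ci", "yes",
                "-lw", weight.c_str(),  // injected instead of the hard-coded "0.8"
                "-beam", "1e-20",
                "-pbeam", "1e-20",
                nullptr),
            [](cmd_ln_t* config) { cmd_ln_free_r(config); });
        if (!config) throw std::runtime_error("Error creating configuration.");

        lambda_unique_ptr<ps_decoder_t> decoder(
            ps_init(config.get()),
            [](ps_decoder_t* recognizer) { ps_free(recognizer); });
        if (!decoder) throw std::runtime_error("Error creating speech decoder.");
        return decoder;
    }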
src/recognition/PhoneticRecognizer.h (new file)
@@ -0,0 +1,14 @@
#pragma once

#include "Recognizer.h"
#include "pocketSphinxTools.h"

class PhoneticRecognizer : public Recognizer {
public:
    BoundedTimeline<Phone> recognizePhones(
        const AudioClip& inputAudioClip,
        boost::optional<std::string> dialog,
        int maxThreadCount,
        ProgressSink& progressSink
    ) const override;
};
src/recognition/phoneRecognition.cpp → src/recognition/PocketSphinxRecognizer.cpp
@@ -1,143 +1,133 @@
#include <boost/filesystem.hpp>
#include "phoneRecognition.h"
#include "audio/SampleRateConverter.h"
#include "tools/platformTools.h"
#include "tools/tools.h"
#include <format.h>
#include <s3types.h>
#include "PocketSphinxRecognizer.h"
#include <regex>
#include <gsl_util.h>
#include "logging/logging.h"
#include "audio/DcOffset.h"
#include "time/Timeline.h"
#include "audio/voiceActivityDetection.h"
#include "audio/AudioSegment.h"
#include "audio/SampleRateConverter.h"
#include "languageModels.h"
#include "tokenization.h"
#include "g2p.h"
#include "time/ContinuousTimeline.h"
#include "audio/processing.h"
#include "tools/parallel.h"
#include <boost/version.hpp>
#include "tools/ObjectPool.h"
#include "time/timedLogging.h"

extern "C" {
#include <pocketsphinx.h>
#include <sphinxbase/err.h>
#include <ps_alignment.h>
#include <state_align_search.h>
#include <pocketsphinx_internal.h>
#include <ngram_search.h>
}

using std::runtime_error;
using std::invalid_argument;
using std::unique_ptr;
using std::shared_ptr;
using std::string;
using std::vector;
using std::map;
using boost::filesystem::path;
using std::function;
using std::regex;
using std::regex_replace;
using std::chrono::duration;
using boost::optional;
using std::string;
using std::chrono::duration_cast;
using std::array;

constexpr int sphinxSampleRate = 16000;

const path& getSphinxModelDirectory() {
    static path sphinxModelDirectory(getBinDirectory() / "res" / "sphinx");
    return sphinxModelDirectory;
bool dictionaryContains(dict_t& dictionary, const string& word) {
    return dict_wordid(&dictionary, word.c_str()) != BAD_S3WID;
}

logging::Level ConvertSphinxErrorLevel(err_lvl_t errorLevel) {
    switch (errorLevel) {
    case ERR_DEBUG:
    case ERR_INFO:
    case ERR_INFOCONT:
        return logging::Level::Trace;
    case ERR_WARN:
        return logging::Level::Warn;
    case ERR_ERROR:
        return logging::Level::Error;
    case ERR_FATAL:
        return logging::Level::Fatal;
    default:
        throw invalid_argument("Unknown log level.");
s3wid_t getWordId(const string& word, dict_t& dictionary) {
    const s3wid_t wordId = dict_wordid(&dictionary, word.c_str());
    if (wordId == BAD_S3WID) throw invalid_argument(fmt::format("Unknown word '{}'.", word));
    return wordId;
}

void addMissingDictionaryWords(const vector<string>& words, ps_decoder_t& decoder) {
    map<string, string> missingPronunciations;
    for (const string& word : words) {
        if (!dictionaryContains(*decoder.dict, word)) {
            string pronunciation;
            for (Phone phone : wordToPhones(word)) {
                if (pronunciation.length() > 0) pronunciation += " ";
                pronunciation += PhoneConverter::get().toString(phone);
            }
            missingPronunciations[word] = pronunciation;
        }
    }
    for (auto it = missingPronunciations.begin(); it != missingPronunciations.end(); ++it) {
        const bool isLast = it == --missingPronunciations.end();
        logging::infoFormat("Unknown word '{}'. Guessing pronunciation '{}'.", it->first, it->second);
        ps_add_word(&decoder, it->first.c_str(), it->second.c_str(), isLast);
    }
}

void sphinxLogCallback(void* user_data, err_lvl_t errorLevel, const char* format, ...) {
    UNUSED(user_data);

    // Create varArgs list
    va_list args;
    va_start(args, format);
    auto _ = gsl::finally([&args]() { va_end(args); });

    // Format message
    const int initialSize = 256;
    vector<char> chars(initialSize);
    bool success = false;
    while (!success) {
        int charsWritten = vsnprintf(chars.data(), chars.size(), format, args);
        if (charsWritten < 0) throw runtime_error("Error formatting Pocketsphinx log message.");

        success = charsWritten < static_cast<int>(chars.size());
        if (!success) chars.resize(chars.size() * 2);
    }
    regex waste("^(DEBUG|INFO|INFOCONT|WARN|ERROR|FATAL): ");
    string message = regex_replace(chars.data(), waste, "", std::regex_constants::format_first_only);
    boost::algorithm::trim(message);

    logging::Level logLevel = ConvertSphinxErrorLevel(errorLevel);
    logging::log(logLevel, message);
}

BoundedTimeline<string> recognizeWords(const vector<int16_t>& audioBuffer, ps_decoder_t& decoder) {
    // Restart timing at 0
    ps_start_stream(&decoder);

    // Start recognition
    int error = ps_start_utt(&decoder);
    if (error) throw runtime_error("Error starting utterance processing for word recognition.");

    // Process entire audio clip
    const bool noRecognition = false;
    const bool fullUtterance = true;
    int searchedFrameCount = ps_process_raw(&decoder, audioBuffer.data(), audioBuffer.size(), noRecognition, fullUtterance);
    if (searchedFrameCount < 0) throw runtime_error("Error analyzing raw audio data for word recognition.");

    // End recognition
    error = ps_end_utt(&decoder);
    if (error) throw runtime_error("Error ending utterance processing for word recognition.");

    BoundedTimeline<string> result(TimeRange(0_cs, centiseconds(100 * audioBuffer.size() / sphinxSampleRate)));
    bool noWordsRecognized = reinterpret_cast<ngram_search_t*>(decoder.search)->bpidx == 0;
    if (noWordsRecognized) {
        return result;
    }

    // Collect words
    for (ps_seg_t* it = ps_seg_iter(&decoder); it; it = ps_seg_next(it)) {
        const char* word = ps_seg_word(it);
        int firstFrame, lastFrame;
        ps_seg_frames(it, &firstFrame, &lastFrame);
        result.set(centiseconds(firstFrame), centiseconds(lastFrame + 1), word);
lambda_unique_ptr<ngram_model_t> createDefaultLanguageModel(ps_decoder_t& decoder) {
    path modelPath = getSphinxModelDirectory() / "en-us.lm.bin";
    lambda_unique_ptr<ngram_model_t> result(
        ngram_model_read(decoder.config, modelPath.string().c_str(), NGRAM_AUTO, decoder.lmath),
        [](ngram_model_t* lm) { ngram_model_free(lm); });
    if (!result) {
        throw runtime_error(fmt::format("Error reading language model from {}.", modelPath));
    }

    return result;
}

s3wid_t getWordId(const string& word, dict_t& dictionary) {
    s3wid_t wordId = dict_wordid(&dictionary, word.c_str());
    if (wordId == BAD_S3WID) throw invalid_argument(fmt::format("Unknown word '{}'.", word));
    return wordId;
lambda_unique_ptr<ngram_model_t> createDialogLanguageModel(ps_decoder_t& decoder, const string& dialog) {
    // Split dialog into normalized words
    vector<string> words = tokenizeText(dialog, [&](const string& word) { return dictionaryContains(*decoder.dict, word); });

    // Add dialog-specific words to the dictionary
    addMissingDictionaryWords(words, decoder);

    // Create dialog-specific language model
    words.insert(words.begin(), "<s>");
    words.emplace_back("</s>");
    return createLanguageModel(words, decoder);
}

lambda_unique_ptr<ngram_model_t> createBiasedLanguageModel(ps_decoder_t& decoder, const string& dialog) {
    auto defaultLanguageModel = createDefaultLanguageModel(decoder);
    auto dialogLanguageModel = createDialogLanguageModel(decoder, dialog);
    constexpr int modelCount = 2;
    array<ngram_model_t*, modelCount> languageModels{ defaultLanguageModel.get(), dialogLanguageModel.get() };
    array<const char*, modelCount> modelNames{ "defaultLM", "dialogLM" };
    array<float, modelCount> modelWeights{ 0.1f, 0.9f };
    lambda_unique_ptr<ngram_model_t> result(
        ngram_model_set_init(nullptr, languageModels.data(), const_cast<char**>(modelNames.data()), modelWeights.data(), modelCount),
        [](ngram_model_t* lm) { ngram_model_free(lm); });
    if (!result) {
        throw runtime_error("Error creating biased language model.");
    }

    return result;
}

static lambda_unique_ptr<ps_decoder_t> createDecoder(optional<std::string> dialog) {
    lambda_unique_ptr<cmd_ln_t> config(
        cmd_ln_init(
            nullptr, ps_args(), true,
            // Set acoustic model
            "-hmm", (getSphinxModelDirectory() / "acoustic-model").string().c_str(),
            // Set pronunciation dictionary
            "-dict", (getSphinxModelDirectory() / "cmudict-en-us.dict").string().c_str(),
            // Add noise against zero silence (see http://cmusphinx.sourceforge.net/wiki/faq#qwhy_my_accuracy_is_poor)
            "-dither", "yes",
            // Disable VAD -- we're doing that ourselves
            "-remove_silence", "no",
            // Perform per-utterance cepstral mean normalization
            "-cmn", "batch",
            nullptr),
        [](cmd_ln_t* config) { cmd_ln_free_r(config); });
    if (!config) throw runtime_error("Error creating configuration.");

    lambda_unique_ptr<ps_decoder_t> decoder(
        ps_init(config.get()),
        [](ps_decoder_t* recognizer) { ps_free(recognizer); });
    if (!decoder) throw runtime_error("Error creating speech decoder.");

    // Set language model
    lambda_unique_ptr<ngram_model_t> languageModel(dialog
        ? createBiasedLanguageModel(*decoder, *dialog)
        : createDefaultLanguageModel(*decoder));
    ps_set_lm(decoder.get(), "lm", languageModel.get());
    ps_set_search(decoder.get(), "lm");

    return decoder;
}

optional<Timeline<Phone>> getPhoneAlignment(
@@ -178,7 +168,7 @@ optional<Timeline<Phone>> getPhoneAlignment(
    // Process entire audio clip
    const int16* nextSample = audioBuffer.data();
    size_t remainingSamples = audioBuffer.size();
    bool fullUtterance = true;
    const bool fullUtterance = true;
    while (acmod_process_raw(acousticModel, &nextSample, &remainingSamples, fullUtterance) > 0) {
        while (acousticModel->n_feat_frame > 0) {
            ps_search_step(search.get(), acousticModel->output_frame);
@@ -197,7 +187,7 @@ optional<Timeline<Phone>> getPhoneAlignment(
    for (ps_alignment_iter_t* it = ps_alignment_phones(alignment.get()); it; it = ps_alignment_iter_next(it)) {
        // Get phone
        ps_alignment_entry_t* phoneEntry = ps_alignment_iter_get(it);
        s3cipid_t phoneId = phoneEntry->id.pid.cipid;
        const s3cipid_t phoneId = phoneEntry->id.pid.cipid;
        string phoneName = phoneNames[phoneId];

        if (phoneName == "SIL") continue;
@@ -207,162 +197,42 @@ optional<Timeline<Phone>> getPhoneAlignment(
        centiseconds duration(phoneEntry->duration);
        Phone phone = PhoneConverter::get().parse(phoneName);
        if (phone == Phone::AH && duration < 6_cs) {
            // Heuristic: < 6_cs is schwa. Pocketsphinx doesn't differentiate.
            // Heuristic: < 6_cs is schwa. PocketSphinx doesn't differentiate.
            phone = Phone::Schwa;
        }
        Timed<Phone> timedPhone(start, start + duration, phone);
        const Timed<Phone> timedPhone(start, start + duration, phone);
        result.set(timedPhone);
    }
    return result;
}

bool dictionaryContains(dict_t& dictionary, const string& word) {
    return dict_wordid(&dictionary, word.c_str()) != BAD_S3WID;
}

void addMissingDictionaryWords(const vector<string>& words, ps_decoder_t& decoder) {
    map<string, string> missingPronunciations;
    for (const string& word : words) {
        if (!dictionaryContains(*decoder.dict, word)) {
            string pronunciation;
            for (Phone phone : wordToPhones(word)) {
                if (pronunciation.length() > 0) pronunciation += " ";
                pronunciation += PhoneConverter::get().toString(phone);
            }
            missingPronunciations[word] = pronunciation;
        }
    }
    for (auto it = missingPronunciations.begin(); it != missingPronunciations.end(); ++it) {
        bool isLast = it == --missingPronunciations.end();
        logging::infoFormat("Unknown word '{}'. Guessing pronunciation '{}'.", it->first, it->second);
        ps_add_word(&decoder, it->first.c_str(), it->second.c_str(), isLast);
    }
}

lambda_unique_ptr<ngram_model_t> createDefaultLanguageModel(ps_decoder_t& decoder) {
    path modelPath = getSphinxModelDirectory() / "en-us.lm.bin";
    lambda_unique_ptr<ngram_model_t> result(
        ngram_model_read(decoder.config, modelPath.string().c_str(), NGRAM_AUTO, decoder.lmath),
        [](ngram_model_t* lm) { ngram_model_free(lm); });
    if (!result) {
        throw runtime_error(fmt::format("Error reading language model from {}.", modelPath));
    }

    return std::move(result);
}

lambda_unique_ptr<ngram_model_t> createDialogLanguageModel(ps_decoder_t& decoder, const string& dialog) {
    // Split dialog into normalized words
    vector<string> words = tokenizeText(dialog, [&](const string& word) { return dictionaryContains(*decoder.dict, word); });

    // Add dialog-specific words to the dictionary
    addMissingDictionaryWords(words, decoder);

    // Create dialog-specific language model
    words.insert(words.begin(), "<s>");
    words.push_back("</s>");
    return createLanguageModel(words, decoder);
}

lambda_unique_ptr<ngram_model_t> createBiasedLanguageModel(ps_decoder_t& decoder, const string& dialog) {
    auto defaultLanguageModel = createDefaultLanguageModel(decoder);
    auto dialogLanguageModel = createDialogLanguageModel(decoder, dialog);
    constexpr int modelCount = 2;
    array<ngram_model_t*, modelCount> languageModels{ defaultLanguageModel.get(), dialogLanguageModel.get() };
    array<char*, modelCount> modelNames{ "defaultLM", "dialogLM" };
    array<float, modelCount> modelWeights{ 0.1f, 0.9f };
    lambda_unique_ptr<ngram_model_t> result(
        ngram_model_set_init(nullptr, languageModels.data(), modelNames.data(), modelWeights.data(), modelCount),
        [](ngram_model_t* lm) { ngram_model_free(lm); });
    if (!result) {
        throw runtime_error("Error creating biased language model.");
    }

    return std::move(result);
}

lambda_unique_ptr<ps_decoder_t> createDecoder(optional<string> dialog) {
    lambda_unique_ptr<cmd_ln_t> config(
        cmd_ln_init(
            nullptr, ps_args(), true,
            // Set acoustic model
            "-hmm", (getSphinxModelDirectory() / "acoustic-model").string().c_str(),
            // Set pronunciation dictionary
            "-dict", (getSphinxModelDirectory() / "cmudict-en-us.dict").string().c_str(),
            // Add noise against zero silence (see http://cmusphinx.sourceforge.net/wiki/faq#qwhy_my_accuracy_is_poor)
            "-dither", "yes",
            // Disable VAD -- we're doing that ourselves
            "-remove_silence", "no",
            // Perform per-utterance cepstral mean normalization
            "-cmn", "batch",
            nullptr),
        [](cmd_ln_t* config) { cmd_ln_free_r(config); });
    if (!config) throw runtime_error("Error creating configuration.");

    lambda_unique_ptr<ps_decoder_t> decoder(
        ps_init(config.get()),
        [](ps_decoder_t* recognizer) { ps_free(recognizer); });
    if (!decoder) throw runtime_error("Error creating speech decoder.");

    // Set language model
    lambda_unique_ptr<ngram_model_t> languageModel(dialog
        ? createBiasedLanguageModel(*decoder, *dialog)
        : createDefaultLanguageModel(*decoder));
    ps_set_lm(decoder.get(), "lm", languageModel.get());
    ps_set_search(decoder.get(), "lm");

    return decoder;
}

JoiningTimeline<void> getNoiseSounds(TimeRange utteranceTimeRange, const Timeline<Phone>& phones) {
    JoiningTimeline<void> noiseSounds;

    // Find utterance parts without recogniced phones
    noiseSounds.set(utteranceTimeRange);
    for (const auto& timedPhone : phones) {
        noiseSounds.clear(timedPhone.getTimeRange());
    }

    // Remove undesired elements
    const centiseconds minSoundDuration = 12_cs;
    for (const auto& unknownSound : JoiningTimeline<void>(noiseSounds)) {
        bool startsAtZero = unknownSound.getStart() == 0_cs;
        bool tooShort = unknownSound.getDuration() < minSoundDuration;
        if (startsAtZero || tooShort) {
            noiseSounds.clear(unknownSound.getTimeRange());
        }
    }

    return noiseSounds;
}

// Some words have multiple pronunciations, one of which results in better animation than the others.
// This function returns the optimal pronunciation for a select set of these words.
string fixPronunciation(const string& word) {
    const static map<string, string> replacements {
        {"into(2)", "into"},
        {"to(2)", "to"},
        {"to(3)", "to"},
        {"today(2)", "today"},
        {"tomorrow(2)", "tomorrow"},
        {"tonight(2)", "tonight"}
    const static map<string, string> replacements{
        { "into(2)", "into" },
        { "to(2)", "to" },
        { "to(3)", "to" },
        { "today(2)", "today" },
        { "tomorrow(2)", "tomorrow" },
        { "tonight(2)", "tonight" }
    };

    const auto pair = replacements.find(word);
    return pair != replacements.end() ? pair->second : word;
}

Timeline<Phone> utteranceToPhones(
static Timeline<Phone> utteranceToPhones(
    const AudioClip& audioClip,
    TimeRange utteranceTimeRange,
    ps_decoder_t& decoder,
    ProgressSink& utteranceProgressSink)
{
    ProgressSink& utteranceProgressSink
) {
    ProgressMerger utteranceProgressMerger(utteranceProgressSink);
    ProgressSink& wordRecognitionProgressSink = utteranceProgressMerger.addSink(1.0);
    ProgressSink& alignmentProgressSink = utteranceProgressMerger.addSink(0.5);

    // Pad time range to give Pocketsphinx some breathing room
    // Pad time range to give PocketSphinx some breathing room
    TimeRange paddedTimeRange = utteranceTimeRange;
    const centiseconds padding(3);
    paddedTimeRange.grow(padding);
@@ -384,7 +254,7 @@ Timeline<Phone> utteranceToPhones(
            continue;
        }
        word = regex_replace(word, regex("\\(\\d\\)"), "");
        if (text.size() > 0) {
        if (!text.empty()) {
            text += " ";
        }
        text += word;
@@ -403,7 +273,7 @@ Timeline<Phone> utteranceToPhones(
        const string fixedWord = fixPronunciation(timedWord.getValue());
        wordIds.push_back(getWordId(fixedWord, *decoder.dict));
    }
    if (wordIds.empty()) return {};
    if (wordIds.empty()) return{};

    // Align the words' phones with speech
#if BOOST_VERSION < 105600 // Support legacy syntax
@@ -433,77 +303,11 @@ Timeline<Phone> utteranceToPhones(
    return utterancePhones;
}

BoundedTimeline<Phone> recognizePhones(
BoundedTimeline<Phone> PocketSphinxRecognizer::recognizePhones(
    const AudioClip& inputAudioClip,
    optional<string> dialog,
    optional<std::string> dialog,
    int maxThreadCount,
    ProgressSink& progressSink)
{
    ProgressMerger totalProgressMerger(progressSink);
    ProgressSink& voiceActivationProgressSink = totalProgressMerger.addSink(1.0);
    ProgressSink& dialogProgressSink = totalProgressMerger.addSink(15);

    // Make sure audio stream has no DC offset
    const unique_ptr<AudioClip> audioClip = inputAudioClip.clone() | removeDcOffset();

    // Split audio into utterances
    JoiningBoundedTimeline<void> utterances;
    try {
        utterances = detectVoiceActivity(*audioClip, maxThreadCount, voiceActivationProgressSink);
    }
    catch (...) {
        std::throw_with_nested(runtime_error("Error detecting segments of speech."));
    }

    // Discard Pocketsphinx output
    err_set_logfp(nullptr);

    // Redirect Pocketsphinx output to log
    err_set_callback(sphinxLogCallback, nullptr);

    // Prepare pool of decoders
    ObjectPool<ps_decoder_t, lambda_unique_ptr<ps_decoder_t>> decoderPool(
        [&dialog] { return createDecoder(dialog); });

    BoundedTimeline<Phone> phones(audioClip->getTruncatedRange());
    std::mutex resultMutex;
    auto processUtterance = [&](Timed<void> timedUtterance, ProgressSink& utteranceProgressSink) {
        // Detect phones for utterance
        auto decoder = decoderPool.acquire();
        Timeline<Phone> utterancePhones =
            utteranceToPhones(*audioClip, timedUtterance.getTimeRange(), *decoder, utteranceProgressSink);

        // Copy phones to result timeline
        std::lock_guard<std::mutex> lock(resultMutex);
        for (const auto& timedPhone : utterancePhones) {
            phones.set(timedPhone);
        }
    };

    auto getUtteranceProgressWeight = [](const Timed<void> timedUtterance) {
        return timedUtterance.getDuration().count();
    };

    // Perform speech recognition
    try {
        // Determine how many parallel threads to use
        int threadCount = std::min({
            maxThreadCount,
            // Don't use more threads than there are utterances to be processed
            static_cast<int>(utterances.size()),
            // Don't waste time creating additional threads (and decoders!) if the recording is short
            static_cast<int>(duration_cast<std::chrono::seconds>(audioClip->getTruncatedRange().getDuration()).count() / 5)
        });
        if (threadCount < 1) {
            threadCount = 1;
        }
        logging::debugFormat("Speech recognition using {} threads -- start", threadCount);
        runParallel(processUtterance, utterances, threadCount, dialogProgressSink, getUtteranceProgressWeight);
        logging::debug("Speech recognition -- end");
    }
    catch (...) {
        std::throw_with_nested(runtime_error("Error performing speech recognition via Pocketsphinx."));
    }

    return phones;
    ProgressSink& progressSink
) const {
    return ::recognizePhones(inputAudioClip, dialog, &createDecoder, &utteranceToPhones, maxThreadCount, progressSink);
}
src/recognition/PocketSphinxRecognizer.h (new file)
@@ -0,0 +1,14 @@
#pragma once

#include "Recognizer.h"
#include "pocketSphinxTools.h"

class PocketSphinxRecognizer : public Recognizer {
public:
    BoundedTimeline<Phone> recognizePhones(
        const AudioClip& inputAudioClip,
        boost::optional<std::string> dialog,
        int maxThreadCount,
        ProgressSink& progressSink
    ) const override;
};
src/recognition/Recognizer.h (new file)
@@ -0,0 +1,18 @@
#pragma once

#include "audio/AudioClip.h"
#include "core/Phone.h"
#include "tools/ProgressBar.h"
#include "time/BoundedTimeline.h"

class Recognizer {
public:
    virtual ~Recognizer() = default;

    virtual BoundedTimeline<Phone> recognizePhones(
        const AudioClip& audioClip,
        boost::optional<std::string> dialog,
        int maxThreadCount,
        ProgressSink& progressSink
    ) const = 0;
};
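A minimal sketch (not part of this commit) of what this abstraction buys: call sites depend only on the virtual interface, so the two recognizers below can be swapped at runtime. The flag and the recognition inputs are assumed to come from the surrounding application code.

    #include "recognition/Recognizer.h"
    #include "recognition/PhoneticRecognizer.h"
    #include "recognition/PocketSphinxRecognizer.h"
    #include <memory>

    // usePhonetic, audioClip, dialog, maxThreadCount, and progressSink are
    // placeholder inputs for illustration.
    BoundedTimeline<Phone> recognizeWithEither(
        bool usePhonetic, const AudioClip& audioClip,
        boost::optional<std::string> dialog, int maxThreadCount, ProgressSink& progressSink)
    {
        const std::unique_ptr<Recognizer> recognizer = usePhonetic
            ? std::unique_ptr<Recognizer>(std::make_unique<PhoneticRecognizer>())
            : std::unique_ptr<Recognizer>(std::make_unique<PocketSphinxRecognizer>());
        // Dispatches to whichever concrete recognizer was constructed
        return recognizer->recognizePhones(audioClip, dialog, maxThreadCount, progressSink);
    }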
src/recognition/phoneRecognition.h (deleted)
@@ -1,12 +0,0 @@
#pragma once

#include "audio/AudioClip.h"
#include "core/Phone.h"
#include "tools/ProgressBar.h"
#include "time/BoundedTimeline.h"

BoundedTimeline<Phone> recognizePhones(
    const AudioClip& audioClip,
    boost::optional<std::string> dialog,
    int maxThreadCount,
    ProgressSink& progressSink);
src/recognition/pocketSphinxTools.cpp (new file)
@@ -0,0 +1,218 @@
#include "pocketSphinxTools.h"

#include "tools/platformTools.h"
#include <regex>
#include "audio/DcOffset.h"
#include "audio/voiceActivityDetection.h"
#include "tools/parallel.h"
#include "tools/ObjectPool.h"
#include "time/timedLogging.h"

extern "C" {
#include <sphinxbase/err.h>
#include <pocketsphinx_internal.h>
#include <ngram_search.h>
}

using std::runtime_error;
using std::invalid_argument;
using std::unique_ptr;
using std::string;
using std::vector;
using boost::filesystem::path;
using std::regex;
using boost::optional;
using std::chrono::duration_cast;

logging::Level convertSphinxErrorLevel(err_lvl_t errorLevel) {
    switch (errorLevel) {
    case ERR_DEBUG:
    case ERR_INFO:
    case ERR_INFOCONT:
        return logging::Level::Trace;
    case ERR_WARN:
        return logging::Level::Warn;
    case ERR_ERROR:
        return logging::Level::Error;
    case ERR_FATAL:
        return logging::Level::Fatal;
    default:
        throw invalid_argument("Unknown log level.");
    }
}

void sphinxLogCallback(void* user_data, err_lvl_t errorLevel, const char* format, ...) {
    UNUSED(user_data);

    // Create varArgs list
    va_list args;
    va_start(args, format);
    auto _ = gsl::finally([&args]() { va_end(args); });

    // Format message
    const int initialSize = 256;
    vector<char> chars(initialSize);
    bool success = false;
    while (!success) {
        const int charsWritten = vsnprintf(chars.data(), chars.size(), format, args);
        if (charsWritten < 0) throw runtime_error("Error formatting PocketSphinx log message.");

        success = charsWritten < static_cast<int>(chars.size());
        if (!success) chars.resize(chars.size() * 2);
    }
    const regex waste("^(DEBUG|INFO|INFOCONT|WARN|ERROR|FATAL): ");
    string message = std::regex_replace(chars.data(), waste, "", std::regex_constants::format_first_only);
    boost::algorithm::trim(message);

    const logging::Level logLevel = convertSphinxErrorLevel(errorLevel);
    logging::log(logLevel, message);
}

void redirectPocketSphinxOutput() {
    static bool redirected = false;
    if (redirected) return;

    // Discard PocketSphinx output
    err_set_logfp(nullptr);

    // Redirect PocketSphinx output to log
    err_set_callback(sphinxLogCallback, nullptr);

    redirected = true;
}

BoundedTimeline<Phone> recognizePhones(
    const AudioClip& inputAudioClip,
    optional<std::string> dialog,
    decoderFactory createDecoder,
    utteranceToPhonesFunction utteranceToPhones,
    int maxThreadCount,
    ProgressSink& progressSink
) {
    ProgressMerger totalProgressMerger(progressSink);
    ProgressSink& voiceActivationProgressSink = totalProgressMerger.addSink(1.0);
    ProgressSink& dialogProgressSink = totalProgressMerger.addSink(15);

    // Make sure audio stream has no DC offset
    const unique_ptr<AudioClip> audioClip = inputAudioClip.clone() | removeDcOffset();

    // Split audio into utterances
    JoiningBoundedTimeline<void> utterances;
    try {
        utterances = detectVoiceActivity(*audioClip, maxThreadCount, voiceActivationProgressSink);
    } catch (...) {
        std::throw_with_nested(runtime_error("Error detecting segments of speech."));
    }

    redirectPocketSphinxOutput();

    // Prepare pool of decoders
    ObjectPool<ps_decoder_t, lambda_unique_ptr<ps_decoder_t>> decoderPool(
        [&] { return createDecoder(dialog); });

    BoundedTimeline<Phone> phones(audioClip->getTruncatedRange());
    std::mutex resultMutex;
    const auto processUtterance = [&](Timed<void> timedUtterance, ProgressSink& utteranceProgressSink) {
        // Detect phones for utterance
        const auto decoder = decoderPool.acquire();
        Timeline<Phone> utterancePhones =
            utteranceToPhones(*audioClip, timedUtterance.getTimeRange(), *decoder, utteranceProgressSink);

        // Copy phones to result timeline
        std::lock_guard<std::mutex> lock(resultMutex);
        for (const auto& timedPhone : utterancePhones) {
            phones.set(timedPhone);
        }
    };

    const auto getUtteranceProgressWeight = [](const Timed<void> timedUtterance) {
        return timedUtterance.getDuration().count();
    };

    // Perform speech recognition
    try {
        // Determine how many parallel threads to use
        int threadCount = std::min({
            maxThreadCount,
            // Don't use more threads than there are utterances to be processed
            static_cast<int>(utterances.size()),
            // Don't waste time creating additional threads (and decoders!) if the recording is short
            static_cast<int>(duration_cast<std::chrono::seconds>(audioClip->getTruncatedRange().getDuration()).count() / 5)
        });
        if (threadCount < 1) {
            threadCount = 1;
        }
        logging::debugFormat("Speech recognition using {} threads -- start", threadCount);
        runParallel(processUtterance, utterances, threadCount, dialogProgressSink, getUtteranceProgressWeight);
        logging::debug("Speech recognition -- end");
    } catch (...) {
        std::throw_with_nested(runtime_error("Error performing speech recognition via PocketSphinx."));
    }

    return phones;
}

const path& getSphinxModelDirectory() {
    static path sphinxModelDirectory(getBinDirectory() / "res" / "sphinx");
    return sphinxModelDirectory;
}

JoiningTimeline<void> getNoiseSounds(TimeRange utteranceTimeRange, const Timeline<Phone>& phones) {
    JoiningTimeline<void> noiseSounds;

    // Find utterance parts without recognized phones
    noiseSounds.set(utteranceTimeRange);
    for (const auto& timedPhone : phones) {
        noiseSounds.clear(timedPhone.getTimeRange());
    }

    // Remove undesired elements
    const centiseconds minSoundDuration = 12_cs;
    for (const auto& unknownSound : JoiningTimeline<void>(noiseSounds)) {
        const bool startsAtZero = unknownSound.getStart() == 0_cs;
        const bool tooShort = unknownSound.getDuration() < minSoundDuration;
        if (startsAtZero || tooShort) {
            noiseSounds.clear(unknownSound.getTimeRange());
        }
    }

    return noiseSounds;
}

BoundedTimeline<string> recognizeWords(const vector<int16_t>& audioBuffer, ps_decoder_t& decoder) {
    // Restart timing at 0
    ps_start_stream(&decoder);

    // Start recognition
    int error = ps_start_utt(&decoder);
    if (error) throw runtime_error("Error starting utterance processing for word recognition.");

    // Process entire audio clip
    const bool noRecognition = false;
    const bool fullUtterance = true;
    const int searchedFrameCount =
        ps_process_raw(&decoder, audioBuffer.data(), audioBuffer.size(), noRecognition, fullUtterance);
    if (searchedFrameCount < 0) {
        throw runtime_error("Error analyzing raw audio data for word recognition.");
    }

    // End recognition
    error = ps_end_utt(&decoder);
    if (error) throw runtime_error("Error ending utterance processing for word recognition.");

    BoundedTimeline<string> result(TimeRange(0_cs, centiseconds(100 * audioBuffer.size() / sphinxSampleRate)));
    const bool noWordsRecognized = reinterpret_cast<ngram_search_t*>(decoder.search)->bpidx == 0;
    if (noWordsRecognized) {
        return result;
    }

    // Collect words
    for (ps_seg_t* it = ps_seg_iter(&decoder); it; it = ps_seg_next(it)) {
        const char* word = ps_seg_word(it);
        int firstFrame, lastFrame;
        ps_seg_frames(it, &firstFrame, &lastFrame);
        result.set(centiseconds(firstFrame), centiseconds(lastFrame + 1), word);
    }

    return result;
}
src/recognition/pocketSphinxTools.h (new file)
@@ -0,0 +1,39 @@
#pragma once

#include "time/BoundedTimeline.h"
#include "core/Phone.h"
#include "audio/AudioClip.h"
#include "tools/ProgressBar.h"
#include <boost/filesystem/path.hpp>

extern "C" {
#include <pocketsphinx.h>
}

typedef std::function<lambda_unique_ptr<ps_decoder_t>(
    boost::optional<std::string> dialog
)> decoderFactory;

typedef std::function<Timeline<Phone>(
    const AudioClip& audioClip,
    TimeRange utteranceTimeRange,
    ps_decoder_t& decoder,
    ProgressSink& utteranceProgressSink
)> utteranceToPhonesFunction;

BoundedTimeline<Phone> recognizePhones(
    const AudioClip& inputAudioClip,
    boost::optional<std::string> dialog,
    decoderFactory createDecoder,
    utteranceToPhonesFunction utteranceToPhones,
    int maxThreadCount,
    ProgressSink& progressSink
);

constexpr int sphinxSampleRate = 16000;

const boost::filesystem::path& getSphinxModelDirectory();

JoiningTimeline<void> getNoiseSounds(TimeRange utteranceTimeRange, const Timeline<Phone>& phones);

BoundedTimeline<std::string> recognizeWords(const std::vector<int16_t>& audioBuffer, ps_decoder_t& decoder);
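The two typedefs above are the extension points of the shared recognizePhones: it handles DC-offset removal, voice-activity detection, decoder pooling, and threading, while each recognizer supplies only a decoder factory and a per-utterance phone function. A hedged sketch of how a hypothetical third recognizer would plug into the same template; createMyDecoder and myUtteranceToPhones are invented names matching the typedef signatures:

    #include "Recognizer.h"
    #include "pocketSphinxTools.h"

    // Hypothetical extension points; signatures match decoderFactory and
    // utteranceToPhonesFunction above.
    lambda_unique_ptr<ps_decoder_t> createMyDecoder(boost::optional<std::string> dialog);
    Timeline<Phone> myUtteranceToPhones(
        const AudioClip& audioClip, TimeRange utteranceTimeRange,
        ps_decoder_t& decoder, ProgressSink& utteranceProgressSink);

    class MyRecognizer : public Recognizer {
    public:
        BoundedTimeline<Phone> recognizePhones(
            const AudioClip& inputAudioClip,
            boost::optional<std::string> dialog,
            int maxThreadCount,
            ProgressSink& progressSink
        ) const override {
            // Reuse the shared pipeline; only the two callbacks differ.
            return ::recognizePhones(
                inputAudioClip, dialog,
                &createMyDecoder, &myUtteranceToPhones,
                maxThreadCount, progressSink);
        }
    };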
src/rhubarb/RecognizerType.cpp (new file)
@@ -0,0 +1,27 @@
#include "RecognizerType.h"

using std::string;

RecognizerTypeConverter& RecognizerTypeConverter::get() {
    static RecognizerTypeConverter converter;
    return converter;
}

string RecognizerTypeConverter::getTypeName() {
    return "RecognizerType";
}

EnumConverter<RecognizerType>::member_data RecognizerTypeConverter::getMemberData() {
    return member_data{
        { RecognizerType::PocketSphinx, "pocketSphinx" },
        { RecognizerType::Phonetic, "phonetic" }
    };
}

std::ostream& operator<<(std::ostream& stream, RecognizerType value) {
    return RecognizerTypeConverter::get().write(stream, value);
}

std::istream& operator>>(std::istream& stream, RecognizerType& value) {
    return RecognizerTypeConverter::get().read(stream, value);
}
src/rhubarb/RecognizerType.h (new file)
@@ -0,0 +1,20 @@
#pragma once

#include "tools/EnumConverter.h"

enum class RecognizerType {
    PocketSphinx,
    Phonetic
};

class RecognizerTypeConverter : public EnumConverter<RecognizerType> {
public:
    static RecognizerTypeConverter& get();
protected:
    std::string getTypeName() override;
    member_data getMemberData() override;
};

std::ostream& operator<<(std::ostream& stream, RecognizerType value);

std::istream& operator>>(std::istream& stream, RecognizerType& value);
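A small sketch of what this EnumConverter machinery provides: the stream operators are what later let TCLAP treat RecognizerType as a ValueLike argument (see the main.cpp changes below). The string values follow getMemberData in RecognizerType.cpp above; the demo function itself is invented for illustration.

    #include "RecognizerType.h"
    #include <sstream>
    #include <iostream>

    void demoRecognizerTypeStreaming() {
        std::istringstream input("phonetic");
        RecognizerType type;
        input >> type;               // parses to RecognizerType::Phonetic
        std::cout << type << "\n";   // prints "phonetic"
    }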
src/rhubarb/main.cpp
@@ -27,6 +27,9 @@
#include "tools/platformTools.h"
#include "sinks.h"
#include "semanticEntries.h"
#include "RecognizerType.h"
#include "recognition/PocketSphinxRecognizer.h"
#include "recognition/PhoneticRecognizer.h"

using std::exception;
using std::string;
@@ -36,9 +39,6 @@ using std::unique_ptr;
using std::make_unique;
using std::shared_ptr;
using std::make_shared;
using std::map;
using std::chrono::duration;
using std::chrono::duration_cast;
using std::ofstream;
using boost::filesystem::path;
using boost::adaptors::transformed;
@@ -56,6 +56,10 @@ namespace TCLAP {
    struct ArgTraits<ExportFormat> {
        typedef ValueLike ValueCategory;
    };
    template<>
    struct ArgTraits<RecognizerType> {
        typedef ValueLike ValueCategory;
    };
}

shared_ptr<logging::Sink> createFileSink(path path, logging::Level minLevel) {
@@ -66,6 +70,17 @@ shared_ptr<logging::Sink> createFileSink(path path, logging::Level minLevel) {
    return make_shared<logging::LevelFilter>(FileSink, minLevel);
}

unique_ptr<Recognizer> createRecognizer(RecognizerType recognizerType) {
    switch (recognizerType) {
    case RecognizerType::PocketSphinx:
        return make_unique<PocketSphinxRecognizer>();
    case RecognizerType::Phonetic:
        return make_unique<PhoneticRecognizer>();
    default:
        throw std::runtime_error("Unknown recognizer.");
    }
}

unique_ptr<Exporter> createExporter(ExportFormat exportFormat) {
    switch (exportFormat) {
    case ExportFormat::Tsv:
@@ -123,6 +138,9 @@ int main(int platformArgc, char *platformArgv[]) {
    auto exportFormats = vector<ExportFormat>(ExportFormatConverter::get().getValues());
    tclap::ValuesConstraint<ExportFormat> exportFormatConstraint(exportFormats);
    tclap::ValueArg<ExportFormat> exportFormat("f", "exportFormat", "The export format.", false, ExportFormat::Tsv, &exportFormatConstraint, cmd);
    auto recognizerTypes = vector<RecognizerType>(RecognizerTypeConverter::get().getValues());
    tclap::ValuesConstraint<RecognizerType> recognizerConstraint(recognizerTypes);
    tclap::ValueArg<RecognizerType> recognizerType("r", "recognizer", "The dialog recognizer.", false, RecognizerType::PocketSphinx, &recognizerConstraint, cmd);
    tclap::UnlabeledValueArg<string> inputFileName("inputFile", "The input file. Must be a sound file in WAVE format.", true, "", "string", cmd);

    try {
@@ -169,6 +187,7 @@ int main(int platformArgc, char *platformArgv[]) {
    JoiningContinuousTimeline<Shape> animation = animateWaveFile(
        inputFilePath,
        dialogFile.isSet() ? readUtf8File(path(dialogFile.getValue())) : boost::optional<string>(),
        *createRecognizer(recognizerType.getValue()),
        targetShapeSet,
        maxThreadCount.getValue(),
        progressSink);
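Taken together, the main.cpp changes add a -r/--recognizer command-line option with pocketSphinx as the default, so a hypothetical invocation after this commit looks like: rhubarb -r phonetic -f tsv input.wav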