Merge pull request #47 from DanielSWolf/feature/phonetic-recognition

Phonetic recognition
This commit is contained in:
Daniel Wolf 2019-01-02 11:36:08 +01:00 committed by GitHub
commit c078e6186e
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
23 changed files with 723 additions and 348 deletions

View File

@ -1,5 +1,9 @@
# Version history
## Unreleased
* **Added** basic support for non-English recordings through phonetic recognition ([issue #45](https://github.com/DanielSWolf/rhubarb-lip-sync/issues/45)).
## Version 1.8.0
* **Added** support for Ogg Vorbis (.ogg) file format ([issue #40](https://github.com/DanielSWolf/rhubarb-lip-sync/issues/40)).

View File

@ -123,6 +123,11 @@ The following command-line options are the most common:
| _<input file>_
| The audio file to be analyzed. This must be the last command-line argument. Supported file formats are WAVE (.wav) and Ogg Vorbis (.ogg).
| `-r` _<recognizer>_, `--recognizer` _<recognizer>_
| Specifies how Rhubarb Lip Sync recognizes speech within the recording. Options: `pocketSphinx` (use for English recordings), `phonetic` (use for non-English recordings). For details, see <<recognizers>>.
_Default value: ``pocketSphinx``_
| `-f` _<format>_, `--exportFormat` _<format>_
| The export format. Options: `tsv` (tab-separated values, see <<tsv,details>>), `xml` (see <<xml,details>>), `json` (see <<json,details>>).
@ -192,6 +197,19 @@ Note that for short audio files, Rhubarb Lip Sync may choose to use fewer thread
_Default value: as many threads as your CPU has cores_
|===
[[recognizers]]
== Recognizers
The first step in processing an audio file is determining what is being said. More specifically, Rhubarb Lip Sync uses speech recognition to figure out what sound is being said at what point in time. You can choose between two recognizers:
=== PocketSphinx
PocketSphinx is an open-source speech recognition library that generally gives good results. This is the default recognizer. The downside is that PocketSphinx only recognizes English dialog. So if your recordings are in a language other than English, this is not a good choice.
=== Phonetic
Rhubarb Lip Sync also comes with a phonetic recognizer. _Phonetic_ means that this recognizer won't try to understand entire (English) words and phrases. Instead, it will recognize individual sounds and syllables. The results are usually less precise than those from the PocketSphinx recognizer. The advantage is that this recognizer is language-independent. Use it if your recordings are not in English.
[[outputFormats]]
== Output formats

View File

@ -2,7 +2,7 @@ cmake_minimum_required(VERSION 3.2)
set(appName "Rhubarb Lip Sync")
set(appVersionMajor 1)
set(appVersionMinor 8)
set(appVersionMinor 9)
set(appVersionPatch 0)
set(appVersionSuffix "")
set(appVersionSuffix "-pre.1")
set(appVersion "${appVersionMajor}.${appVersionMinor}.${appVersionPatch}${appVersionSuffix}")

View File

@ -323,6 +323,12 @@ function createDialogWindow() {
+ 'your After Effects project.'
})
}),
recognizer: Group({
label: StaticText({ text: 'Recognizer:' }),
value: DropDownList({
helpTip: 'The dialog recognizer.'
})
}),
dialogText: Group({
label: StaticText({ text: 'Dialog text (optional):' }),
value: EditText({
@ -384,6 +390,7 @@ function createDialogWindow() {
var controls = {
audioFile: window.settings.audioFile.value,
dialogText: window.settings.dialogText.value,
recognizer: window.settings.recognizer.value,
mouthComp: window.settings.mouthComp.value,
targetFolder: window.settings.targetFolder.value,
frameRate: window.settings.frameRate.value,
@ -402,6 +409,16 @@ function createDialogWindow() {
listItem.projectItem = projectItem;
});
// Add recognizer options
const recognizerOptions = [
{ text: 'PocketSphinx (use for English recordings)', value: 'pocketSphinx' },
{ text: 'Phonetic (use for non-English recordings)', value: 'phonetic' }
];
recognizerOptions.forEach(function(option) {
var listItem = controls.recognizer.add('item', option.text);
listItem.value = option.value;
});
// Add mouth composition options
var comps = toArrayBase1(app.project.items).filter(function (item) {
return item instanceof CompItem;
@ -425,6 +442,7 @@ function createDialogWindow() {
var settings = readSettingsFile();
selectByTextOrFirst(controls.audioFile, settings.audioFile);
controls.dialogText.text = settings.dialogText || '';
selectByTextOrFirst(controls.recognizer, settings.recognizer);
selectByTextOrFirst(controls.mouthComp, settings.mouthComp);
extendedMouthShapeNames.forEach(function(shapeName) {
controls['mouthShape' + shapeName].value =
@ -484,6 +502,7 @@ function createDialogWindow() {
// Store settings
var settings = {
audioFile: (controls.audioFile.selection || {}).text,
recognizer: (controls.recognizer.selection || {}).text,
dialogText: controls.dialogText.text,
mouthComp: (controls.mouthComp.selection || {}).text,
extendedMouthShapes: {},
@ -543,7 +562,7 @@ function createDialogWindow() {
// Check for correct Rhubarb version
var version = exec(rhubarbPath + ' --version') || '';
var match = version.match(/Rhubarb Lip Sync version ((\d+)\.(\d+).(\d+))/);
var match = version.match(/Rhubarb Lip Sync version ((\d+)\.(\d+).(\d+)(-[0-9A-Za-z-.]+)?)/);
if (!match) {
var instructions = osIsWindows
? 'Make sure your PATH environment variable contains the ' + appName + ' '
@ -555,13 +574,16 @@ function createDialogWindow() {
var versionString = match[1];
var major = Number(match[2]);
var minor = Number(match[3]);
if (major != 1 || minor < 6) {
return 'This script requires ' + appName + ' 1.6.0 or a later 1.x version. '
var requiredMajor = 1;
var minRequiredMinor = 9;
if (major != requiredMajor || minor < minRequiredMinor) {
return 'This script requires ' + appName + ' ' + requiredMajor + '.' + minRequiredMinor
+ '.0 or a later ' + requiredMajor + '.x version. '
+ 'Your installed version is ' + versionString + ', which is not compatible.';
}
}
function generateMouthCues(audioFileFootage, dialogText, mouthComp, extendedMouthShapeNames,
function generateMouthCues(audioFileFootage, recognizer, dialogText, mouthComp, extendedMouthShapeNames,
targetProjectFolder, frameRate)
{
var basePath = Folder.temp.fsName + '/' + createGuid();
@ -575,6 +597,7 @@ function createDialogWindow() {
// Create command line
var commandLine = rhubarbPath
+ ' --dialogFile ' + cliEscape(dialogFile.fsName)
+ ' --recognizer ' + recognizer
+ ' --exportFormat json'
+ ' --extendedShapes ' + cliEscape(extendedMouthShapeNames.join(''))
+ ' --logFile ' + cliEscape(logFile.fsName)
@ -660,11 +683,11 @@ function createDialogWindow() {
}
}
function animate(audioFileFootage, dialogText, mouthComp, extendedMouthShapeNames,
function animate(audioFileFootage, recognizer, dialogText, mouthComp, extendedMouthShapeNames,
targetProjectFolder, frameRate)
{
try {
var mouthCues = generateMouthCues(audioFileFootage, dialogText, mouthComp,
var mouthCues = generateMouthCues(audioFileFootage, recognizer, dialogText, mouthComp,
extendedMouthShapeNames, targetProjectFolder, frameRate);
app.beginUndoGroup(appName + ': Animation');
@ -680,6 +703,7 @@ function createDialogWindow() {
// Handle changes
update();
controls.audioFile.onChange = update;
controls.recognizer.onChange = update;
controls.dialogText.onChanging = update;
controls.mouthComp.onChange = update;
extendedMouthShapeNames.forEach(function(shapeName) {
@ -700,6 +724,7 @@ function createDialogWindow() {
window.close();
animate(
controls.audioFile.selection.projectItem,
controls.recognizer.selection.value,
controls.dialogText.text || '',
controls.mouthComp.selection.projectItem,
extendedMouthShapeNames.filter(function(shapeName) {

View File

@ -141,11 +141,12 @@ class AudioFileModel(
private fun startAnimation() {
val wrapperTask = Runnable {
val recognizer = parentModel.parentModel.recognizer.value
val extendedMouthShapes = parentModel.mouthShapes.filter { it.isExtended }.toSet()
val reportProgress: (Double?) -> Unit = {
progress -> runAndWait { this@AudioFileModel.animationProgress = progress }
}
val rhubarbTask = RhubarbTask(audioFilePath, dialog, extendedMouthShapes, reportProgress)
val rhubarbTask = RhubarbTask(audioFilePath, recognizer, dialog, extendedMouthShapes, reportProgress)
try {
try {
val result = rhubarbTask.call()

View File

@ -2,6 +2,8 @@ package com.rhubarb_lip_sync.rhubarb_for_spine
import javafx.beans.property.SimpleObjectProperty
import javafx.beans.property.SimpleStringProperty
import javafx.collections.FXCollections
import javafx.collections.ObservableList
import tornadofx.FX
import tornadofx.getValue
import tornadofx.setValue
@ -40,6 +42,15 @@ class MainModel(private val executor: ExecutorService) {
var animationFileModel by animationFileModelProperty
private set
val recognizersProperty = SimpleObjectProperty<ObservableList<Recognizer>>(FXCollections.observableArrayList(
Recognizer("pocketSphinx", "PocketSphinx (use for English recordings)"),
Recognizer("phonetic", "Phonetic (use for non-English recordings)")
))
private var recognizers: ObservableList<Recognizer> by recognizersProperty
val recognizerProperty = SimpleObjectProperty<Recognizer>(recognizers[0])
var recognizer: Recognizer by recognizerProperty
val animationPrefixProperty = SimpleStringProperty("say_")
var animationPrefix: String by animationPrefixProperty
@ -48,3 +59,5 @@ class MainModel(private val executor: ExecutorService) {
private fun getDefaultPathString() = FX.application.parameters.raw.firstOrNull()
}
class Recognizer(val value: String, val description: String)

View File

@ -17,6 +17,7 @@ import javafx.scene.text.Font
import javafx.scene.text.FontWeight
import javafx.scene.text.Text
import javafx.stage.FileChooser
import javafx.util.StringConverter
import tornadofx.*
import java.io.File
import java.util.concurrent.Executors
@ -83,6 +84,20 @@ class MainView : View() {
}
}
}
field("Dialog recognizer") {
combobox<Recognizer> {
itemsProperty().bind(mainModel.recognizersProperty)
this.converter = object : StringConverter<Recognizer>() {
override fun toString(recognizer: Recognizer?): String {
return recognizer?.description ?: ""
}
override fun fromString(string: String?): Recognizer {
throw NotImplementedError()
}
}
valueProperty().bindBidirectional(mainModel.recognizerProperty)
}
}
field("Animation naming") {
textfield {
maxWidth = 100.0

View File

@ -14,6 +14,7 @@ import java.util.concurrent.Callable
class RhubarbTask(
val audioFilePath: Path,
val recognizer: String,
val dialog: String?,
val extendedMouthShapes: Set<MouthShape>,
val reportProgress: (Double?) -> Unit
@ -89,6 +90,7 @@ class RhubarbTask(
return mutableListOf(
rhubarbBinFilePath.toString(),
"--machineReadable",
"--recognizer", recognizer,
"--exportFormat", "json",
"--extendedShapes", extendedMouthShapesString
).apply {
@ -100,7 +102,6 @@ class RhubarbTask(
}.apply {
add(audioFilePath.toString())
}
}
private val guiBinDirectory: Path by lazy {

View File

@ -413,8 +413,13 @@ add_library(rhubarb-recognition
src/recognition/g2p.h
src/recognition/languageModels.cpp
src/recognition/languageModels.h
src/recognition/phoneRecognition.cpp
src/recognition/phoneRecognition.h
src/recognition/PhoneticRecognizer.cpp
src/recognition/PhoneticRecognizer.h
src/recognition/PocketSphinxRecognizer.cpp
src/recognition/PocketSphinxRecognizer.h
src/recognition/pocketSphinxTools.cpp
src/recognition/pocketSphinxTools.h
src/recognition/Recognizer.h
src/recognition/tokenization.cpp
src/recognition/tokenization.h
)
@ -487,6 +492,8 @@ add_executable(rhubarb
src/rhubarb/main.cpp
src/rhubarb/ExportFormat.cpp
src/rhubarb/ExportFormat.h
src/rhubarb/RecognizerType.cpp
src/rhubarb/RecognizerType.h
src/rhubarb/semanticEntries.cpp
src/rhubarb/semanticEntries.h
src/rhubarb/sinks.cpp

View File

@ -1,7 +1,12 @@
<wpf:ResourceDictionary xml:space="preserve" xmlns:x="http://schemas.microsoft.com/winfx/2006/xaml" xmlns:s="clr-namespace:System;assembly=mscorlib" xmlns:ss="urn:shemas-jetbrains-com:settings-storage-xaml" xmlns:wpf="http://schemas.microsoft.com/winfx/2006/xaml/presentation">
<s:String x:Key="/Default/CodeInspection/Highlighting/InspectionSeverities/=CppClangTidyModernizeRawStringLiteral/@EntryIndexedValue">HINT</s:String>
<s:String x:Key="/Default/CodeInspection/Highlighting/InspectionSeverities/=CppFunctionDoesntReturnValue/@EntryIndexedValue">ERROR</s:String>
<s:String x:Key="/Default/CodeInspection/Highlighting/InspectionSeverities/=LocalizableElement/@EntryIndexedValue">DO_NOT_SHOW</s:String>
<s:String x:Key="/Default/CodeStyle/CodeFormatting/CommonFormatter/ALIGNMENT_TAB_FILL_STYLE/@EntryValue">USE_TABS_ONLY</s:String>
<s:String x:Key="/Default/CodeStyle/CodeFormatting/CppFormatting/ALIGNMENT_TAB_FILL_STYLE/@EntryValue">USE_TABS_ONLY</s:String>
<s:Boolean x:Key="/Default/CodeStyle/CodeFormatting/CppFormatting/ALIGN_MULTILINE_ARGUMENT/@EntryValue">False</s:Boolean>
<s:Boolean x:Key="/Default/CodeStyle/CodeFormatting/CppFormatting/ALIGN_MULTILINE_CALLS_CHAIN/@EntryValue">False</s:Boolean>
<s:Boolean x:Key="/Default/CodeStyle/CodeFormatting/CppFormatting/ALIGN_MULTILINE_CTOR_INIT/@EntryValue">False</s:Boolean>
@ -29,6 +34,7 @@
<s:String x:Key="/Default/CodeStyle/CodeFormatting/CppFormatting/WRAP_ENUMERATION_STYLE/@EntryValue">CHOP_ALWAYS</s:String>
<s:String x:Key="/Default/CodeStyle/CodeFormatting/CSharpFormat/ACCESSOR_DECLARATION_BRACES/@EntryValue">END_OF_LINE</s:String>
<s:String x:Key="/Default/CodeStyle/CodeFormatting/CSharpFormat/ACCESSOR_OWNER_DECLARATION_BRACES/@EntryValue">END_OF_LINE</s:String>
<s:String x:Key="/Default/CodeStyle/CodeFormatting/CSharpFormat/ALIGNMENT_TAB_FILL_STYLE/@EntryValue">USE_TABS_ONLY</s:String>
<s:Boolean x:Key="/Default/CodeStyle/CodeFormatting/CSharpFormat/ALIGN_MULTILINE_BINARY_EXPRESSIONS_CHAIN/@EntryValue">False</s:Boolean>
<s:String x:Key="/Default/CodeStyle/CodeFormatting/CSharpFormat/ANONYMOUS_METHOD_DECLARATION_BRACES/@EntryValue">END_OF_LINE</s:String>
<s:String x:Key="/Default/CodeStyle/CodeFormatting/CSharpFormat/CASE_BLOCK_BRACES/@EntryValue">END_OF_LINE</s:String>
@ -44,6 +50,14 @@
<s:Boolean x:Key="/Default/CodeStyle/CodeFormatting/CSharpFormat/USE_INDENT_FROM_VS/@EntryValue">False</s:Boolean>
<s:Boolean x:Key="/Default/CodeStyle/CodeFormatting/CSharpFormat/WRAP_BEFORE_BINARY_OPSIGN/@EntryValue">True</s:Boolean>
<s:Boolean x:Key="/Default/CodeStyle/CodeFormatting/CSharpFormat/WRAP_LINES/@EntryValue">False</s:Boolean>
<s:String x:Key="/Default/CodeStyle/CodeFormatting/CssFormatter/ALIGNMENT_TAB_FILL_STYLE/@EntryValue">USE_TABS_ONLY</s:String>
<s:String x:Key="/Default/CodeStyle/CodeFormatting/HtmlFormatter/ALIGNMENT_TAB_FILL_STYLE/@EntryValue">USE_TABS_ONLY</s:String>
<s:String x:Key="/Default/CodeStyle/CodeFormatting/JavaScriptCodeFormatting/ALIGNMENT_TAB_FILL_STYLE/@EntryValue">USE_TABS_ONLY</s:String>
<s:String x:Key="/Default/CodeStyle/CodeFormatting/ProtobufCodeFormatting/ALIGNMENT_TAB_FILL_STYLE/@EntryValue">USE_TABS_ONLY</s:String>
<s:String x:Key="/Default/CodeStyle/CodeFormatting/ResxFormatter/ALIGNMENT_TAB_FILL_STYLE/@EntryValue">USE_TABS_ONLY</s:String>
<s:String x:Key="/Default/CodeStyle/CodeFormatting/VBFormat/ALIGNMENT_TAB_FILL_STYLE/@EntryValue">USE_TABS_ONLY</s:String>
<s:String x:Key="/Default/CodeStyle/CodeFormatting/XmlDocFormatter/ALIGNMENT_TAB_FILL_STYLE/@EntryValue">USE_TABS_ONLY</s:String>
<s:String x:Key="/Default/CodeStyle/CodeFormatting/XmlFormatter/ALIGNMENT_TAB_FILL_STYLE/@EntryValue">USE_TABS_ONLY</s:String>
<s:String x:Key="/Default/CodeStyle/CSharpVarKeywordUsage/ForBuiltInTypes/@EntryValue">UseExplicitType</s:String>
<s:String x:Key="/Default/CodeStyle/CSharpVarKeywordUsage/ForSimpleTypes/@EntryValue">UseVarWhenEvident</s:String>
<s:String x:Key="/Default/CodeStyle/Naming/CppNaming/Rules/=Class_0020and_0020struct_0020fields/@EntryIndexedValue">&lt;NamingElement Priority="10"&gt;&lt;Descriptor Static="Indeterminate" Constexpr="Indeterminate" Const="Indeterminate" Volatile="Indeterminate" Accessibility="NOT_APPLICABLE"&gt;&lt;type Name="class field" /&gt;&lt;type Name="struct field" /&gt;&lt;/Descriptor&gt;&lt;Policy Inspect="True" Prefix="" Suffix="_" Style="aaBb" /&gt;&lt;/NamingElement&gt;</s:String>
@ -108,7 +122,16 @@
<s:String x:Key="/Default/Environment/Hierarchy/PsiConfigurationSettingsKey/CustomLocation/@EntryValue">C:\Users\Daniel\AppData\Local\JetBrains\Transient\ReSharperPlatformVs14\v09\SolutionCaches</s:String>
<s:Boolean x:Key="/Default/Environment/SettingsMigration/IsMigratorApplied/=JetBrains_002EReSharper_002EFeature_002EServices_002ECpp_002ECodeStyle_002ESettingsUpgrade_002EFunctionReturnStyleSettingsUpgrader/@EntryIndexedValue">True</s:Boolean>
<s:Boolean x:Key="/Default/Environment/SettingsMigration/IsMigratorApplied/=JetBrains_002EReSharper_002EFeature_002EServices_002ECpp_002ECodeStyle_002ESettingsUpgrade_002ENamespaceIndentationSettingsUpgrader/@EntryIndexedValue">True</s:Boolean>
<s:Boolean x:Key="/Default/Environment/SettingsMigration/IsMigratorApplied/=JetBrains_002EReSharper_002EPsi_002ECSharp_002ECodeStyle_002ECSharpKeepExistingMigration/@EntryIndexedValue">True</s:Boolean>
<s:Boolean x:Key="/Default/Environment/SettingsMigration/IsMigratorApplied/=JetBrains_002EReSharper_002EPsi_002ECSharp_002ECodeStyle_002ECSharpPlaceEmbeddedOnSameLineMigration/@EntryIndexedValue">True</s:Boolean>
<s:Boolean x:Key="/Default/Environment/SettingsMigration/IsMigratorApplied/=JetBrains_002EReSharper_002EPsi_002ECSharp_002ECodeStyle_002ESettingsUpgrade_002EAddAccessorOwnerDeclarationBracesMigration/@EntryIndexedValue">True</s:Boolean>
<s:Boolean x:Key="/Default/Environment/SettingsMigration/IsMigratorApplied/=JetBrains_002EReSharper_002EPsi_002ECSharp_002ECodeStyle_002ESettingsUpgrade_002EMigrateBlankLinesAroundFieldToBlankLinesAroundProperty/@EntryIndexedValue">True</s:Boolean>
<s:Boolean x:Key="/Default/Environment/SettingsMigration/IsMigratorApplied/=JetBrains_002EReSharper_002EPsi_002ECSharp_002ECodeStyle_002ESettingsUpgrade_002EMigrateThisQualifierSettings/@EntryIndexedValue">True</s:Boolean>
<s:Boolean x:Key="/Default/Environment/SettingsMigration/IsMigratorApplied/=JetBrains_002EReSharper_002EPsi_002EFormat_002ESettingsUpgrade_002EAlignmentTabFillStyleMigration/@EntryIndexedValue">True</s:Boolean>
<s:Boolean x:Key="/Default/UserDictionary/Words/=allphone/@EntryIndexedValue">True</s:Boolean>
<s:Boolean x:Key="/Default/UserDictionary/Words/=cepstral/@EntryIndexedValue">True</s:Boolean>
<s:Boolean x:Key="/Default/UserDictionary/Words/=cmudict/@EntryIndexedValue">True</s:Boolean>
<s:Boolean x:Key="/Default/UserDictionary/Words/=pbeam/@EntryIndexedValue">True</s:Boolean>
<s:Boolean x:Key="/Default/UserDictionary/Words/=qwhy/@EntryIndexedValue">True</s:Boolean>
<s:Boolean x:Key="/Default/UserDictionary/Words/=Viterbi/@EntryIndexedValue">True</s:Boolean>
</wpf:ResourceDictionary>

View File

@ -1,6 +1,5 @@
#include "rhubarbLib.h"
#include "core/Phone.h"
#include "recognition/phoneRecognition.h"
#include "tools/textFiles.h"
#include "animation/mouthAnimation.h"
#include "audio/audioFileReading.h"
@ -8,27 +7,29 @@
using boost::optional;
using std::string;
using boost::filesystem::path;
using std::unique_ptr;
JoiningContinuousTimeline<Shape> animateAudioClip(
const AudioClip& audioClip,
optional<string> dialog,
const optional<string>& dialog,
const Recognizer& recognizer,
const ShapeSet& targetShapeSet,
int maxThreadCount,
ProgressSink& progressSink)
{
BoundedTimeline<Phone> phones = recognizePhones(audioClip, dialog, maxThreadCount, progressSink);
const BoundedTimeline<Phone> phones =
recognizer.recognizePhones(audioClip, dialog, maxThreadCount, progressSink);
JoiningContinuousTimeline<Shape> result = animate(phones, targetShapeSet);
return result;
}
JoiningContinuousTimeline<Shape> animateWaveFile(
path filePath,
optional<string> dialog,
const optional<string>& dialog,
const Recognizer& recognizer,
const ShapeSet& targetShapeSet,
int maxThreadCount,
ProgressSink& progressSink)
{
const auto audioClip = createAudioFileClip(filePath);
return animateAudioClip(*audioClip, dialog, targetShapeSet, maxThreadCount, progressSink);
return animateAudioClip(*audioClip, dialog, recognizer, targetShapeSet, maxThreadCount, progressSink);
}

View File

@ -6,17 +6,20 @@
#include "tools/ProgressBar.h"
#include <boost/filesystem.hpp>
#include "animation/targetShapeSet.h"
#include "recognition/Recognizer.h"
JoiningContinuousTimeline<Shape> animateAudioClip(
const AudioClip& audioClip,
boost::optional<std::string> dialog,
const boost::optional<std::string>& dialog,
const Recognizer& recognizer,
const ShapeSet& targetShapeSet,
int maxThreadCount,
ProgressSink& progressSink);
JoiningContinuousTimeline<Shape> animateWaveFile(
boost::filesystem::path filePath,
boost::optional<std::string> dialog,
const boost::optional<std::string>& dialog,
const Recognizer& recognizer,
const ShapeSet& targetShapeSet,
int maxThreadCount,
ProgressSink& progressSink);

View File

@ -0,0 +1,103 @@
#include "PhoneticRecognizer.h"
#include "time/Timeline.h"
#include "audio/AudioSegment.h"
#include "audio/SampleRateConverter.h"
#include "audio/processing.h"
#include "time/timedLogging.h"
using std::runtime_error;
using std::unique_ptr;
using std::string;
using boost::optional;
// Creates a PocketSphinx decoder configured for phonetic (phoneme-level)
// recognition, using the bundled English acoustic model together with a
// phonetic ("allphone") language model instead of a word dictionary.
// The dialog text is deliberately ignored: phonetic recognition does not
// use dialog, which is what makes it language-independent.
// Throws std::runtime_error if the configuration or decoder cannot be created.
static lambda_unique_ptr<ps_decoder_t> createDecoder(optional<std::string> dialog) {
// Parameter exists only to match the decoder-factory signature expected by
// ::recognizePhones (see PhoneticRecognizer::recognizePhones below).
UNUSED(dialog);
lambda_unique_ptr<cmd_ln_t> config(
cmd_ln_init(
nullptr, ps_args(), true,
// Set acoustic model
"-hmm", (getSphinxModelDirectory() / "acoustic-model").string().c_str(),
// Set phonetic language model
"-allphone", (getSphinxModelDirectory() / "en-us-phone.lm.bin").string().c_str(),
"-allphone_ci", "yes",
// Set language model probability weight.
// Low values (<= 0.4) can lead to fluttering animation.
// High values (>= 1.0) can lead to imprecise or freezing animation.
"-lw", "0.8",
// The following settings are recommended at http://cmusphinx.sourceforge.net/wiki/phonemerecognition
// Set beam width applied to every frame in Viterbi search
"-beam", "1e-20",
// Set beam width applied to phone transitions
"-pbeam", "1e-20",
nullptr),
[](cmd_ln_t* config) { cmd_ln_free_r(config); });
if (!config) throw runtime_error("Error creating configuration.");
// RAII wrapper ensures ps_free is called even if a later step throws.
lambda_unique_ptr<ps_decoder_t> decoder(
ps_init(config.get()),
[](ps_decoder_t* recognizer) { ps_free(recognizer); });
if (!decoder) throw runtime_error("Error creating speech decoder.");
return decoder;
}
// Performs phonetic recognition on a single utterance within the audio clip.
//
// audioClip             The full recording; only the given range is analyzed.
// utteranceTimeRange    The utterance's position within the clip.
// decoder               A PocketSphinx decoder created by createDecoder.
// utteranceProgressSink Receives 1.0 once this utterance is fully processed.
//
// Returns the recognized phones positioned on the clip's timeline
// (not relative to the utterance).
static Timeline<Phone> utteranceToPhones(
const AudioClip& audioClip,
TimeRange utteranceTimeRange,
ps_decoder_t& decoder,
ProgressSink& utteranceProgressSink
) {
// Pad time range to give PocketSphinx some breathing room
TimeRange paddedTimeRange = utteranceTimeRange;
const centiseconds padding(3);
paddedTimeRange.grow(padding);
paddedTimeRange.trim(audioClip.getTruncatedRange());
// Cut out the utterance, resample it to the rate PocketSphinx expects,
// and convert to a 16-bit sample buffer.
const unique_ptr<AudioClip> clipSegment = audioClip.clone() | segment(paddedTimeRange) | resample(sphinxSampleRate);
const auto audioBuffer = copyTo16bitBuffer(*clipSegment);
// Detect phones (returned as words)
BoundedTimeline<string> phoneStrings = recognizeWords(audioBuffer, decoder);
// Recognition results are relative to the segment start; shift them back
// onto the clip's timeline.
phoneStrings.shift(paddedTimeRange.getStart());
Timeline<Phone> utterancePhones;
for (const auto& timedPhoneString : phoneStrings) {
Phone phone = PhoneConverter::get().parse(timedPhoneString.getValue());
if (phone == Phone::AH && timedPhoneString.getDuration() < 6_cs) {
// Heuristic: a very short AH (< 6 cs) is treated as schwa, since
// PocketSphinx doesn't differentiate between the two.
phone = Phone::Schwa;
}
utterancePhones.set(timedPhoneString.getTimeRange(), phone);
}
// Log raw phones
for (const auto& timedPhone : utterancePhones) {
logTimedEvent("rawPhone", timedPhone);
}
// Guess positions of noise sounds
JoiningTimeline<void> noiseSounds = getNoiseSounds(utteranceTimeRange, utterancePhones);
for (const auto& noiseSound : noiseSounds) {
utterancePhones.set(noiseSound.getTimeRange(), Phone::Noise);
}
// Log phones (after schwa/noise post-processing)
for (const auto& timedPhone : utterancePhones) {
logTimedEvent("phone", timedPhone);
}
utteranceProgressSink.reportProgress(1.0);
return utterancePhones;
}
// Recognizes the phones in the entire audio clip by delegating to the shared
// PocketSphinx pipeline (::recognizePhones), parameterized with the phonetic
// decoder factory and per-utterance handler defined above.
// Note: the dialog argument is forwarded but ignored by createDecoder;
// it is part of the Recognizer interface.
BoundedTimeline<Phone> PhoneticRecognizer::recognizePhones(
const AudioClip& inputAudioClip,
optional<std::string> dialog,
int maxThreadCount,
ProgressSink& progressSink
) const {
return ::recognizePhones(inputAudioClip, dialog, &createDecoder, &utteranceToPhones, maxThreadCount, progressSink);
}

View File

@ -0,0 +1,14 @@
#pragma once
#include "Recognizer.h"
#include "pocketSphinxTools.h"
// Language-independent speech recognizer that detects individual speech
// sounds (phones) rather than English words. Intended for non-English
// recordings, where word-based recognition is not applicable.
class PhoneticRecognizer : public Recognizer {
public:
// Recognizes the phones in the given audio clip.
// The dialog text is accepted for interface compatibility with
// Recognizer, but is not used by this implementation.
BoundedTimeline<Phone> recognizePhones(
const AudioClip& inputAudioClip,
boost::optional<std::string> dialog,
int maxThreadCount,
ProgressSink& progressSink
) const override;
};

View File

@ -1,143 +1,133 @@
#include <boost/filesystem.hpp>
#include "phoneRecognition.h"
#include "audio/SampleRateConverter.h"
#include "tools/platformTools.h"
#include "tools/tools.h"
#include <format.h>
#include <s3types.h>
#include "PocketSphinxRecognizer.h"
#include <regex>
#include <gsl_util.h>
#include "logging/logging.h"
#include "audio/DcOffset.h"
#include "time/Timeline.h"
#include "audio/voiceActivityDetection.h"
#include "audio/AudioSegment.h"
#include "audio/SampleRateConverter.h"
#include "languageModels.h"
#include "tokenization.h"
#include "g2p.h"
#include "time/ContinuousTimeline.h"
#include "audio/processing.h"
#include "tools/parallel.h"
#include <boost/version.hpp>
#include "tools/ObjectPool.h"
#include "time/timedLogging.h"
extern "C" {
#include <pocketsphinx.h>
#include <sphinxbase/err.h>
#include <ps_alignment.h>
#include <state_align_search.h>
#include <pocketsphinx_internal.h>
#include <ngram_search.h>
}
using std::runtime_error;
using std::invalid_argument;
using std::unique_ptr;
using std::shared_ptr;
using std::string;
using std::vector;
using std::map;
using boost::filesystem::path;
using std::function;
using std::regex;
using std::regex_replace;
using std::chrono::duration;
using boost::optional;
using std::string;
using std::chrono::duration_cast;
using std::array;
constexpr int sphinxSampleRate = 16000;
const path& getSphinxModelDirectory() {
static path sphinxModelDirectory(getBinDirectory() / "res/sphinx");
return sphinxModelDirectory;
bool dictionaryContains(dict_t& dictionary, const string& word) {
return dict_wordid(&dictionary, word.c_str()) != BAD_S3WID;
}
logging::Level ConvertSphinxErrorLevel(err_lvl_t errorLevel) {
switch (errorLevel) {
case ERR_DEBUG:
case ERR_INFO:
case ERR_INFOCONT:
return logging::Level::Trace;
case ERR_WARN:
return logging::Level::Warn;
case ERR_ERROR:
return logging::Level::Error;
case ERR_FATAL:
return logging::Level::Fatal;
default:
throw invalid_argument("Unknown log level.");
s3wid_t getWordId(const string& word, dict_t& dictionary) {
const s3wid_t wordId = dict_wordid(&dictionary, word.c_str());
if (wordId == BAD_S3WID) throw invalid_argument(fmt::format("Unknown word '{}'.", word));
return wordId;
}
void addMissingDictionaryWords(const vector<string>& words, ps_decoder_t& decoder) {
map<string, string> missingPronunciations;
for (const string& word : words) {
if (!dictionaryContains(*decoder.dict, word)) {
string pronunciation;
for (Phone phone : wordToPhones(word)) {
if (pronunciation.length() > 0) pronunciation += " ";
pronunciation += PhoneConverter::get().toString(phone);
}
missingPronunciations[word] = pronunciation;
}
}
for (auto it = missingPronunciations.begin(); it != missingPronunciations.end(); ++it) {
const bool isLast = it == --missingPronunciations.end();
logging::infoFormat("Unknown word '{}'. Guessing pronunciation '{}'.", it->first, it->second);
ps_add_word(&decoder, it->first.c_str(), it->second.c_str(), isLast);
}
}
void sphinxLogCallback(void* user_data, err_lvl_t errorLevel, const char* format, ...) {
UNUSED(user_data);
// Create varArgs list
va_list args;
va_start(args, format);
auto _ = gsl::finally([&args]() { va_end(args); });
// Format message
const int initialSize = 256;
vector<char> chars(initialSize);
bool success = false;
while (!success) {
int charsWritten = vsnprintf(chars.data(), chars.size(), format, args);
if (charsWritten < 0) throw runtime_error("Error formatting Pocketsphinx log message.");
success = charsWritten < static_cast<int>(chars.size());
if (!success) chars.resize(chars.size() * 2);
}
regex waste("^(DEBUG|INFO|INFOCONT|WARN|ERROR|FATAL): ");
string message = regex_replace(chars.data(), waste, "", std::regex_constants::format_first_only);
boost::algorithm::trim(message);
logging::Level logLevel = ConvertSphinxErrorLevel(errorLevel);
logging::log(logLevel, message);
}
BoundedTimeline<string> recognizeWords(const vector<int16_t>& audioBuffer, ps_decoder_t& decoder) {
// Restart timing at 0
ps_start_stream(&decoder);
// Start recognition
int error = ps_start_utt(&decoder);
if (error) throw runtime_error("Error starting utterance processing for word recognition.");
// Process entire audio clip
const bool noRecognition = false;
const bool fullUtterance = true;
int searchedFrameCount = ps_process_raw(&decoder, audioBuffer.data(), audioBuffer.size(), noRecognition, fullUtterance);
if (searchedFrameCount < 0) throw runtime_error("Error analyzing raw audio data for word recognition.");
// End recognition
error = ps_end_utt(&decoder);
if (error) throw runtime_error("Error ending utterance processing for word recognition.");
BoundedTimeline<string> result(TimeRange(0_cs, centiseconds(100 * audioBuffer.size() / sphinxSampleRate)));
bool noWordsRecognized = reinterpret_cast<ngram_search_t*>(decoder.search)->bpidx == 0;
if (noWordsRecognized) {
return result;
}
// Collect words
for (ps_seg_t* it = ps_seg_iter(&decoder); it; it = ps_seg_next(it)) {
const char* word = ps_seg_word(it);
int firstFrame, lastFrame;
ps_seg_frames(it, &firstFrame, &lastFrame);
result.set(centiseconds(firstFrame), centiseconds(lastFrame + 1), word);
lambda_unique_ptr<ngram_model_t> createDefaultLanguageModel(ps_decoder_t& decoder) {
path modelPath = getSphinxModelDirectory() / "en-us.lm.bin";
lambda_unique_ptr<ngram_model_t> result(
ngram_model_read(decoder.config, modelPath.string().c_str(), NGRAM_AUTO, decoder.lmath),
[](ngram_model_t* lm) { ngram_model_free(lm); });
if (!result) {
throw runtime_error(fmt::format("Error reading language model from {}.", modelPath));
}
return result;
}
s3wid_t getWordId(const string& word, dict_t& dictionary) {
s3wid_t wordId = dict_wordid(&dictionary, word.c_str());
if (wordId == BAD_S3WID) throw invalid_argument(fmt::format("Unknown word '{}'.", word));
return wordId;
lambda_unique_ptr<ngram_model_t> createDialogLanguageModel(ps_decoder_t& decoder, const string& dialog) {
// Split dialog into normalized words
vector<string> words = tokenizeText(dialog, [&](const string& word) { return dictionaryContains(*decoder.dict, word); });
// Add dialog-specific words to the dictionary
addMissingDictionaryWords(words, decoder);
// Create dialog-specific language model
words.insert(words.begin(), "<s>");
words.emplace_back("</s>");
return createLanguageModel(words, decoder);
}
lambda_unique_ptr<ngram_model_t> createBiasedLanguageModel(ps_decoder_t& decoder, const string& dialog) {
auto defaultLanguageModel = createDefaultLanguageModel(decoder);
auto dialogLanguageModel = createDialogLanguageModel(decoder, dialog);
constexpr int modelCount = 2;
array<ngram_model_t*, modelCount> languageModels{ defaultLanguageModel.get(), dialogLanguageModel.get() };
array<const char*, modelCount> modelNames{ "defaultLM", "dialogLM" };
array<float, modelCount> modelWeights{ 0.1f, 0.9f };
lambda_unique_ptr<ngram_model_t> result(
ngram_model_set_init(nullptr, languageModels.data(), const_cast<char**>(modelNames.data()), modelWeights.data(), modelCount),
[](ngram_model_t* lm) { ngram_model_free(lm); });
if (!result) {
throw runtime_error("Error creating biased language model.");
}
return result;
}
// Creates and configures a PocketSphinx decoder using the bundled acoustic
// model and US-English dictionary. If a dialog transcript is given, the
// language model is biased towards it; otherwise the stock English model
// is used. Throws std::runtime_error on any initialization failure.
static lambda_unique_ptr<ps_decoder_t> createDecoder(optional<std::string> dialog) {
lambda_unique_ptr<cmd_ln_t> config(
cmd_ln_init(
nullptr, ps_args(), true,
// Set acoustic model
"-hmm", (getSphinxModelDirectory() / "acoustic-model").string().c_str(),
// Set pronunciation dictionary
"-dict", (getSphinxModelDirectory() / "cmudict-en-us.dict").string().c_str(),
// Add noise against zero silence (see http://cmusphinx.sourceforge.net/wiki/faq#qwhy_my_accuracy_is_poor)
"-dither", "yes",
// Disable VAD -- we're doing that ourselves
"-remove_silence", "no",
// Perform per-utterance cepstral mean normalization
"-cmn", "batch",
nullptr),
[](cmd_ln_t* config) { cmd_ln_free_r(config); });
if (!config) throw runtime_error("Error creating configuration.");
lambda_unique_ptr<ps_decoder_t> decoder(
ps_init(config.get()),
[](ps_decoder_t* recognizer) { ps_free(recognizer); });
if (!decoder) throw runtime_error("Error creating speech decoder.");
// Set language model
// NOTE(review): languageModel is freed when this function returns; assumed
// ps_set_lm retains its own reference -- confirm PocketSphinx ownership rules.
lambda_unique_ptr<ngram_model_t> languageModel(dialog
? createBiasedLanguageModel(*decoder, *dialog)
: createDefaultLanguageModel(*decoder));
ps_set_lm(decoder.get(), "lm", languageModel.get());
ps_set_search(decoder.get(), "lm");
return decoder;
}
optional<Timeline<Phone>> getPhoneAlignment(
@ -178,7 +168,7 @@ optional<Timeline<Phone>> getPhoneAlignment(
// Process entire audio clip
const int16* nextSample = audioBuffer.data();
size_t remainingSamples = audioBuffer.size();
bool fullUtterance = true;
const bool fullUtterance = true;
while (acmod_process_raw(acousticModel, &nextSample, &remainingSamples, fullUtterance) > 0) {
while (acousticModel->n_feat_frame > 0) {
ps_search_step(search.get(), acousticModel->output_frame);
@ -197,7 +187,7 @@ optional<Timeline<Phone>> getPhoneAlignment(
for (ps_alignment_iter_t* it = ps_alignment_phones(alignment.get()); it; it = ps_alignment_iter_next(it)) {
// Get phone
ps_alignment_entry_t* phoneEntry = ps_alignment_iter_get(it);
s3cipid_t phoneId = phoneEntry->id.pid.cipid;
const s3cipid_t phoneId = phoneEntry->id.pid.cipid;
string phoneName = phoneNames[phoneId];
if (phoneName == "SIL") continue;
@ -207,162 +197,42 @@ optional<Timeline<Phone>> getPhoneAlignment(
centiseconds duration(phoneEntry->duration);
Phone phone = PhoneConverter::get().parse(phoneName);
if (phone == Phone::AH && duration < 6_cs) {
// Heuristic: < 6_cs is schwa. Pocketsphinx doesn't differentiate.
// Heuristic: < 6_cs is schwa. PocketSphinx doesn't differentiate.
phone = Phone::Schwa;
}
Timed<Phone> timedPhone(start, start + duration, phone);
const Timed<Phone> timedPhone(start, start + duration, phone);
result.set(timedPhone);
}
return result;
}
// Tells whether the given word has an entry in the pronunciation dictionary.
bool dictionaryContains(dict_t& dictionary, const string& word) {
	const s3wid_t wordId = dict_wordid(&dictionary, word.c_str());
	return wordId != BAD_S3WID;
}
void addMissingDictionaryWords(const vector<string>& words, ps_decoder_t& decoder) {
map<string, string> missingPronunciations;
for (const string& word : words) {
if (!dictionaryContains(*decoder.dict, word)) {
string pronunciation;
for (Phone phone : wordToPhones(word)) {
if (pronunciation.length() > 0) pronunciation += " ";
pronunciation += PhoneConverter::get().toString(phone);
}
missingPronunciations[word] = pronunciation;
}
}
for (auto it = missingPronunciations.begin(); it != missingPronunciations.end(); ++it) {
bool isLast = it == --missingPronunciations.end();
logging::infoFormat("Unknown word '{}'. Guessing pronunciation '{}'.", it->first, it->second);
ps_add_word(&decoder, it->first.c_str(), it->second.c_str(), isLast);
}
}
// Loads the default US-English language model shipped with the Sphinx
// model files.
// Throws std::runtime_error if the model file cannot be read.
lambda_unique_ptr<ngram_model_t> createDefaultLanguageModel(ps_decoder_t& decoder) {
	path modelPath = getSphinxModelDirectory() / "en-us.lm.bin";
	lambda_unique_ptr<ngram_model_t> result(
		ngram_model_read(decoder.config, modelPath.string().c_str(), NGRAM_AUTO, decoder.lmath),
		[](ngram_model_t* lm) { ngram_model_free(lm); });
	if (!result) {
		throw runtime_error(fmt::format("Error reading language model from {}.", modelPath));
	}
	// Return by name: the local is implicitly treated as an rvalue here, so
	// wrapping it in std::move would only inhibit copy elision
	// (-Wpessimizing-move).
	return result;
}
// Builds a language model containing exactly the words of the given dialog.
lambda_unique_ptr<ngram_model_t> createDialogLanguageModel(ps_decoder_t& decoder, const string& dialog) {
	// Tokenize the dialog into normalized words, keeping unknown words so
	// their pronunciations can be guessed below.
	vector<string> sentence = tokenizeText(
		dialog,
		[&decoder](const string& word) { return dictionaryContains(*decoder.dict, word); });
	// Make sure every word has a dictionary entry.
	addMissingDictionaryWords(sentence, decoder);
	// Wrap the sequence in the sentence delimiters the LM tools expect.
	sentence.insert(sentence.begin(), "<s>");
	sentence.emplace_back("</s>");
	return createLanguageModel(sentence, decoder);
}
// Creates a language model biased towards the given dialog: a weighted
// interpolation of the default English model (10%) and a dialog-specific
// model (90%).
lambda_unique_ptr<ngram_model_t> createBiasedLanguageModel(ps_decoder_t& decoder, const string& dialog) {
	auto defaultLanguageModel = createDefaultLanguageModel(decoder);
	auto dialogLanguageModel = createDialogLanguageModel(decoder, dialog);
	constexpr int modelCount = 2;
	array<ngram_model_t*, modelCount> languageModels{ defaultLanguageModel.get(), dialogLanguageModel.get() };
	// ISO C++ forbids binding string literals to char*; keep the names const
	// and cast only at the C API boundary, which does not modify them.
	array<const char*, modelCount> modelNames{ "defaultLM", "dialogLM" };
	array<float, modelCount> modelWeights{ 0.1f, 0.9f };
	lambda_unique_ptr<ngram_model_t> result(
		ngram_model_set_init(nullptr, languageModels.data(), const_cast<char**>(modelNames.data()), modelWeights.data(), modelCount),
		[](ngram_model_t* lm) { ngram_model_free(lm); });
	if (!result) {
		throw runtime_error("Error creating biased language model.");
	}
	// Plain return enables copy elision; std::move here would pessimize.
	return result;
}
// Creates and configures a PocketSphinx decoder using the bundled acoustic
// model and US-English dictionary. If a dialog transcript is given, the
// language model is biased towards it; otherwise the stock English model
// is used. Throws std::runtime_error on any initialization failure.
lambda_unique_ptr<ps_decoder_t> createDecoder(optional<string> dialog) {
lambda_unique_ptr<cmd_ln_t> config(
cmd_ln_init(
nullptr, ps_args(), true,
// Set acoustic model
"-hmm", (getSphinxModelDirectory() / "acoustic-model").string().c_str(),
// Set pronunciation dictionary
"-dict", (getSphinxModelDirectory() / "cmudict-en-us.dict").string().c_str(),
// Add noise against zero silence (see http://cmusphinx.sourceforge.net/wiki/faq#qwhy_my_accuracy_is_poor)
"-dither", "yes",
// Disable VAD -- we're doing that ourselves
"-remove_silence", "no",
// Perform per-utterance cepstral mean normalization
"-cmn", "batch",
nullptr),
[](cmd_ln_t* config) { cmd_ln_free_r(config); });
if (!config) throw runtime_error("Error creating configuration.");
lambda_unique_ptr<ps_decoder_t> decoder(
ps_init(config.get()),
[](ps_decoder_t* recognizer) { ps_free(recognizer); });
if (!decoder) throw runtime_error("Error creating speech decoder.");
// Set language model
// NOTE(review): languageModel is freed when this function returns; assumed
// ps_set_lm retains its own reference -- confirm PocketSphinx ownership rules.
lambda_unique_ptr<ngram_model_t> languageModel(dialog
? createBiasedLanguageModel(*decoder, *dialog)
: createDefaultLanguageModel(*decoder));
ps_set_lm(decoder.get(), "lm", languageModel.get());
ps_set_search(decoder.get(), "lm");
return decoder;
}
// Determines the parts of the utterance that contain audible noise but no
// recognized phones. Sounds starting at the very beginning of the range or
// shorter than 12 cs are discarded as likely artifacts.
JoiningTimeline<void> getNoiseSounds(TimeRange utteranceTimeRange, const Timeline<Phone>& phones) {
JoiningTimeline<void> noiseSounds;
// Find utterance parts without recognized phones
noiseSounds.set(utteranceTimeRange);
for (const auto& timedPhone : phones) {
noiseSounds.clear(timedPhone.getTimeRange());
}
// Remove undesired elements (iterating over a copy, since noiseSounds is
// modified inside the loop)
const centiseconds minSoundDuration = 12_cs;
for (const auto& unknownSound : JoiningTimeline<void>(noiseSounds)) {
bool startsAtZero = unknownSound.getStart() == 0_cs;
bool tooShort = unknownSound.getDuration() < minSoundDuration;
if (startsAtZero || tooShort) {
noiseSounds.clear(unknownSound.getTimeRange());
}
}
return noiseSounds;
}
// Some words have multiple pronunciations, one of which results in better animation than the others.
// This function returns the optimal pronunciation for a select set of these words.
// (The diffed source contained two interleaved initializer lists for the
// replacement map; this is the single, de-duplicated definition.)
std::string fixPronunciation(const std::string& word) {
	static const std::map<std::string, std::string> replacements{
		{ "into(2)", "into" },
		{ "to(2)", "to" },
		{ "to(3)", "to" },
		{ "today(2)", "today" },
		{ "tomorrow(2)", "tomorrow" },
		{ "tonight(2)", "tonight" }
	};
	const auto match = replacements.find(word);
	return match != replacements.end() ? match->second : word;
}
Timeline<Phone> utteranceToPhones(
static Timeline<Phone> utteranceToPhones(
const AudioClip& audioClip,
TimeRange utteranceTimeRange,
ps_decoder_t& decoder,
ProgressSink& utteranceProgressSink)
{
ProgressSink& utteranceProgressSink
) {
ProgressMerger utteranceProgressMerger(utteranceProgressSink);
ProgressSink& wordRecognitionProgressSink = utteranceProgressMerger.addSink(1.0);
ProgressSink& alignmentProgressSink = utteranceProgressMerger.addSink(0.5);
// Pad time range to give Pocketsphinx some breathing room
// Pad time range to give PocketSphinx some breathing room
TimeRange paddedTimeRange = utteranceTimeRange;
const centiseconds padding(3);
paddedTimeRange.grow(padding);
@ -384,7 +254,7 @@ Timeline<Phone> utteranceToPhones(
continue;
}
word = regex_replace(word, regex("\\(\\d\\)"), "");
if (text.size() > 0) {
if (!text.empty()) {
text += " ";
}
text += word;
@ -403,7 +273,7 @@ Timeline<Phone> utteranceToPhones(
const string fixedWord = fixPronunciation(timedWord.getValue());
wordIds.push_back(getWordId(fixedWord, *decoder.dict));
}
if (wordIds.empty()) return {};
if (wordIds.empty()) return{};
// Align the words' phones with speech
#if BOOST_VERSION < 105600 // Support legacy syntax
@ -433,77 +303,11 @@ Timeline<Phone> utteranceToPhones(
return utterancePhones;
}
BoundedTimeline<Phone> recognizePhones(
BoundedTimeline<Phone> PocketSphinxRecognizer::recognizePhones(
const AudioClip& inputAudioClip,
optional<string> dialog,
optional<std::string> dialog,
int maxThreadCount,
ProgressSink& progressSink)
{
ProgressMerger totalProgressMerger(progressSink);
ProgressSink& voiceActivationProgressSink = totalProgressMerger.addSink(1.0);
ProgressSink& dialogProgressSink = totalProgressMerger.addSink(15);
// Make sure audio stream has no DC offset
const unique_ptr<AudioClip> audioClip = inputAudioClip.clone() | removeDcOffset();
// Split audio into utterances
JoiningBoundedTimeline<void> utterances;
try {
utterances = detectVoiceActivity(*audioClip, maxThreadCount, voiceActivationProgressSink);
}
catch (...) {
std::throw_with_nested(runtime_error("Error detecting segments of speech."));
}
// Discard Pocketsphinx output
err_set_logfp(nullptr);
// Redirect Pocketsphinx output to log
err_set_callback(sphinxLogCallback, nullptr);
// Prepare pool of decoders
ObjectPool<ps_decoder_t, lambda_unique_ptr<ps_decoder_t>> decoderPool(
[&dialog] { return createDecoder(dialog); });
BoundedTimeline<Phone> phones(audioClip->getTruncatedRange());
std::mutex resultMutex;
auto processUtterance = [&](Timed<void> timedUtterance, ProgressSink& utteranceProgressSink) {
// Detect phones for utterance
auto decoder = decoderPool.acquire();
Timeline<Phone> utterancePhones =
utteranceToPhones(*audioClip, timedUtterance.getTimeRange(), *decoder, utteranceProgressSink);
// Copy phones to result timeline
std::lock_guard<std::mutex> lock(resultMutex);
for (const auto& timedPhone : utterancePhones) {
phones.set(timedPhone);
}
};
auto getUtteranceProgressWeight = [](const Timed<void> timedUtterance) {
return timedUtterance.getDuration().count();
};
// Perform speech recognition
try {
// Determine how many parallel threads to use
int threadCount = std::min({
maxThreadCount,
// Don't use more threads than there are utterances to be processed
static_cast<int>(utterances.size()),
// Don't waste time creating additional threads (and decoders!) if the recording is short
static_cast<int>(duration_cast<std::chrono::seconds>(audioClip->getTruncatedRange().getDuration()).count() / 5)
});
if (threadCount < 1) {
threadCount = 1;
}
logging::debugFormat("Speech recognition using {} threads -- start", threadCount);
runParallel(processUtterance, utterances, threadCount, dialogProgressSink, getUtteranceProgressWeight);
logging::debug("Speech recognition -- end");
}
catch (...) {
std::throw_with_nested(runtime_error("Error performing speech recognition via Pocketsphinx."));
}
return phones;
ProgressSink& progressSink
) const {
return ::recognizePhones(inputAudioClip, dialog, &createDecoder, &utteranceToPhones, maxThreadCount, progressSink);
}

View File

@ -0,0 +1,14 @@
#pragma once
#include "Recognizer.h"
#include "pocketSphinxTools.h"
// Recognizer backed by PocketSphinx word recognition with the bundled
// US-English models.
class PocketSphinxRecognizer : public Recognizer {
public:
// Recognizes the phones spoken in the given clip.
// dialog: optional transcript used to bias the language model.
BoundedTimeline<Phone> recognizePhones(
const AudioClip& inputAudioClip,
boost::optional<std::string> dialog,
int maxThreadCount,
ProgressSink& progressSink
) const override;
};

View File

@ -0,0 +1,18 @@
#pragma once
#include "audio/AudioClip.h"
#include "core/Phone.h"
#include "tools/ProgressBar.h"
#include "time/BoundedTimeline.h"
// Interface for speech recognizers that turn an audio clip into a timeline
// of phones.
class Recognizer {
public:
	virtual ~Recognizer() = default;
	// Recognizes the phones spoken in the given clip.
	// dialog: optional transcript of the recording, used to bias recognition.
	// maxThreadCount: upper bound on worker threads.
	// progressSink: receives progress updates during recognition.
	virtual BoundedTimeline<Phone> recognizePhones(
		const AudioClip& audioClip,
		boost::optional<std::string> dialog,
		int maxThreadCount,
		ProgressSink& progressSink
	) const = 0;
};

View File

@ -1,12 +0,0 @@
#pragma once
#include "audio/AudioClip.h"
#include "core/Phone.h"
#include "tools/ProgressBar.h"
#include "time/BoundedTimeline.h"
// Free-function entry point for phone recognition: determines the phones
// spoken in the given clip, optionally biased by a dialog transcript,
// using up to maxThreadCount worker threads.
BoundedTimeline<Phone> recognizePhones(
const AudioClip& audioClip,
boost::optional<std::string> dialog,
int maxThreadCount,
ProgressSink& progressSink);

View File

@ -0,0 +1,218 @@
#include "pocketSphinxTools.h"
#include "tools/platformTools.h"
#include <regex>
#include "audio/DcOffset.h"
#include "audio/voiceActivityDetection.h"
#include "tools/parallel.h"
#include "tools/ObjectPool.h"
#include "time/timedLogging.h"
extern "C" {
#include <sphinxbase/err.h>
#include <pocketsphinx_internal.h>
#include <ngram_search.h>
}
using std::runtime_error;
using std::invalid_argument;
using std::unique_ptr;
using std::string;
using std::vector;
using boost::filesystem::path;
using std::regex;
using boost::optional;
using std::chrono::duration_cast;
// Maps a PocketSphinx log level onto the corresponding level of our own
// logging framework. Throws std::invalid_argument for unexpected values.
logging::Level convertSphinxErrorLevel(err_lvl_t errorLevel) {
	if (errorLevel == ERR_DEBUG || errorLevel == ERR_INFO || errorLevel == ERR_INFOCONT) {
		return logging::Level::Trace;
	}
	if (errorLevel == ERR_WARN) {
		return logging::Level::Warn;
	}
	if (errorLevel == ERR_ERROR) {
		return logging::Level::Error;
	}
	if (errorLevel == ERR_FATAL) {
		return logging::Level::Fatal;
	}
	throw invalid_argument("Unknown log level.");
}
void sphinxLogCallback(void* user_data, err_lvl_t errorLevel, const char* format, ...) {
UNUSED(user_data);
// Create varArgs list
va_list args;
va_start(args, format);
auto _ = gsl::finally([&args]() { va_end(args); });
// Format message
const int initialSize = 256;
vector<char> chars(initialSize);
bool success = false;
while (!success) {
const int charsWritten = vsnprintf(chars.data(), chars.size(), format, args);
if (charsWritten < 0) throw runtime_error("Error formatting PocketSphinx log message.");
success = charsWritten < static_cast<int>(chars.size());
if (!success) chars.resize(chars.size() * 2);
}
const regex waste("^(DEBUG|INFO|INFOCONT|WARN|ERROR|FATAL): ");
string message = std::regex_replace(chars.data(), waste, "", std::regex_constants::format_first_only);
boost::algorithm::trim(message);
const logging::Level logLevel = convertSphinxErrorLevel(errorLevel);
logging::log(logLevel, message);
}
// Routes all PocketSphinx logging into our own logging framework instead of
// stderr. Idempotent: only the first call has an effect.
// NOTE(review): the guard flag is not synchronized; assumed this is first
// called before worker threads start -- confirm call sites.
void redirectPocketSphinxOutput() {
static bool redirected = false;
if (redirected) return;
// Discard PocketSphinx output
err_set_logfp(nullptr);
// Redirect PocketSphinx output to log
err_set_callback(sphinxLogCallback, nullptr);
redirected = true;
}
// Shared recognition driver for PocketSphinx-based recognizers: detects
// utterances via voice-activity detection, processes them in parallel
// (each worker borrowing a decoder from a pool), and merges the per-utterance
// phone timelines into one result.
BoundedTimeline<Phone> recognizePhones(
const AudioClip& inputAudioClip,
optional<std::string> dialog,
decoderFactory createDecoder,
utteranceToPhonesFunction utteranceToPhones,
int maxThreadCount,
ProgressSink& progressSink
) {
ProgressMerger totalProgressMerger(progressSink);
// Progress weights: VAD counts as 1 unit, recognition as 15.
ProgressSink& voiceActivationProgressSink = totalProgressMerger.addSink(1.0);
ProgressSink& dialogProgressSink = totalProgressMerger.addSink(15);
// Make sure audio stream has no DC offset
const unique_ptr<AudioClip> audioClip = inputAudioClip.clone() | removeDcOffset();
// Split audio into utterances
JoiningBoundedTimeline<void> utterances;
try {
utterances = detectVoiceActivity(*audioClip, maxThreadCount, voiceActivationProgressSink);
} catch (...) {
std::throw_with_nested(runtime_error("Error detecting segments of speech."));
}
redirectPocketSphinxOutput();
// Prepare pool of decoders: decoders are created lazily and reused across
// utterances, one per worker thread at most.
ObjectPool<ps_decoder_t, lambda_unique_ptr<ps_decoder_t>> decoderPool(
[&] { return createDecoder(dialog); });
BoundedTimeline<Phone> phones(audioClip->getTruncatedRange());
std::mutex resultMutex;
const auto processUtterance = [&](Timed<void> timedUtterance, ProgressSink& utteranceProgressSink) {
// Detect phones for utterance
const auto decoder = decoderPool.acquire();
Timeline<Phone> utterancePhones =
utteranceToPhones(*audioClip, timedUtterance.getTimeRange(), *decoder, utteranceProgressSink);
// Copy phones to result timeline; the mutex guards against concurrent
// writes from multiple worker threads.
std::lock_guard<std::mutex> lock(resultMutex);
for (const auto& timedPhone : utterancePhones) {
phones.set(timedPhone);
}
};
// Weight progress by utterance duration so long utterances count more.
const auto getUtteranceProgressWeight = [](const Timed<void> timedUtterance) {
return timedUtterance.getDuration().count();
};
// Perform speech recognition
try {
// Determine how many parallel threads to use
int threadCount = std::min({
maxThreadCount,
// Don't use more threads than there are utterances to be processed
static_cast<int>(utterances.size()),
// Don't waste time creating additional threads (and decoders!) if the recording is short
static_cast<int>(duration_cast<std::chrono::seconds>(audioClip->getTruncatedRange().getDuration()).count() / 5)
});
if (threadCount < 1) {
threadCount = 1;
}
logging::debugFormat("Speech recognition using {} threads -- start", threadCount);
runParallel(processUtterance, utterances, threadCount, dialogProgressSink, getUtteranceProgressWeight);
logging::debug("Speech recognition -- end");
} catch (...) {
std::throw_with_nested(runtime_error("Error performing speech recognition via PocketSphinx."));
}
return phones;
}
// Returns the directory containing the Sphinx model files, resolved once
// relative to the executable and cached for subsequent calls.
const path& getSphinxModelDirectory() {
static path sphinxModelDirectory(getBinDirectory() / "res" / "sphinx");
return sphinxModelDirectory;
}
// Determines the parts of the utterance that contain audible noise but no
// recognized phones. Sounds starting at the very beginning of the range or
// shorter than 12 cs are discarded as likely artifacts.
JoiningTimeline<void> getNoiseSounds(TimeRange utteranceTimeRange, const Timeline<Phone>& phones) {
	// Start with the whole utterance, then punch out every recognized phone.
	JoiningTimeline<void> noiseSounds;
	noiseSounds.set(utteranceTimeRange);
	for (const auto& timedPhone : phones) {
		noiseSounds.clear(timedPhone.getTimeRange());
	}
	// Drop unwanted candidates. Iterate over a snapshot because noiseSounds
	// is modified within the loop.
	const centiseconds minSoundDuration = 12_cs;
	const JoiningTimeline<void> candidates(noiseSounds);
	for (const auto& candidate : candidates) {
		const bool beginsAtClipStart = candidate.getStart() == 0_cs;
		const bool isTooShort = candidate.getDuration() < minSoundDuration;
		if (beginsAtClipStart || isTooShort) {
			noiseSounds.clear(candidate.getTimeRange());
		}
	}
	return noiseSounds;
}
// Runs full-utterance word recognition over the given audio buffer and
// returns the recognized words with their timing in centiseconds.
// Throws std::runtime_error if any PocketSphinx call fails.
BoundedTimeline<string> recognizeWords(const vector<int16_t>& audioBuffer, ps_decoder_t& decoder) {
// Restart timing at 0
ps_start_stream(&decoder);
// Start recognition
int error = ps_start_utt(&decoder);
if (error) throw runtime_error("Error starting utterance processing for word recognition.");
// Process entire audio clip
const bool noRecognition = false;
const bool fullUtterance = true;
const int searchedFrameCount =
ps_process_raw(&decoder, audioBuffer.data(), audioBuffer.size(), noRecognition, fullUtterance);
if (searchedFrameCount < 0) {
throw runtime_error("Error analyzing raw audio data for word recognition.");
}
// End recognition
error = ps_end_utt(&decoder);
if (error) throw runtime_error("Error ending utterance processing for word recognition.");
// Result range: buffer length in centiseconds (samples / sampleRate * 100).
BoundedTimeline<string> result(TimeRange(0_cs, centiseconds(100 * audioBuffer.size() / sphinxSampleRate)));
// Peek into the search internals to detect an empty result before iterating.
// NOTE(review): this cast relies on the active search being an ngram search
// -- confirm the decoder is always configured that way when called.
const bool noWordsRecognized = reinterpret_cast<ngram_search_t*>(decoder.search)->bpidx == 0;
if (noWordsRecognized) {
return result;
}
// Collect words; frame indices are used directly as centiseconds, with an
// exclusive end bound (hence lastFrame + 1).
for (ps_seg_t* it = ps_seg_iter(&decoder); it; it = ps_seg_next(it)) {
const char* word = ps_seg_word(it);
int firstFrame, lastFrame;
ps_seg_frames(it, &firstFrame, &lastFrame);
result.set(centiseconds(firstFrame), centiseconds(lastFrame + 1), word);
}
return result;
}

View File

@ -0,0 +1,39 @@
#pragma once
#include "time/BoundedTimeline.h"
#include "core/Phone.h"
#include "audio/AudioClip.h"
#include "tools/ProgressBar.h"
#include <boost/filesystem/path.hpp>
#include <cstdint>
#include <functional>
#include <string>
#include <vector>
extern "C" {
#include <pocketsphinx.h>
}
// Creates a PocketSphinx decoder, optionally biased towards the given dialog.
using decoderFactory = std::function<lambda_unique_ptr<ps_decoder_t>(
	boost::optional<std::string> dialog
)>;
// Converts one utterance of the audio clip into a timeline of phones.
using utteranceToPhonesFunction = std::function<Timeline<Phone>(
	const AudioClip& audioClip,
	TimeRange utteranceTimeRange,
	ps_decoder_t& decoder,
	ProgressSink& utteranceProgressSink
)>;
// Shared driver for PocketSphinx-based recognizers: splits the clip into
// utterances and processes them in parallel using the supplied callbacks.
BoundedTimeline<Phone> recognizePhones(
	const AudioClip& inputAudioClip,
	boost::optional<std::string> dialog,
	decoderFactory createDecoder,
	utteranceToPhonesFunction utteranceToPhones,
	int maxThreadCount,
	ProgressSink& progressSink
);
// Sample rate (Hz) at which audio is fed to PocketSphinx.
constexpr int sphinxSampleRate = 16000;
// Directory containing the bundled Sphinx model files.
const boost::filesystem::path& getSphinxModelDirectory();
// Time ranges with audible noise but no recognized phones.
JoiningTimeline<void> getNoiseSounds(TimeRange utteranceTimeRange, const Timeline<Phone>& phones);
// Full-utterance word recognition over a raw audio buffer.
BoundedTimeline<std::string> recognizeWords(const std::vector<int16_t>& audioBuffer, ps_decoder_t& decoder);

View File

@ -0,0 +1,27 @@
#include "RecognizerType.h"
using std::string;
// Singleton accessor for the RecognizerType <-> string converter.
RecognizerTypeConverter& RecognizerTypeConverter::get() {
static RecognizerTypeConverter converter;
return converter;
}
// Human-readable type name reported to the EnumConverter base class.
string RecognizerTypeConverter::getTypeName() {
return "RecognizerType";
}
// Maps each enum value to its command-line string representation.
EnumConverter<RecognizerType>::member_data RecognizerTypeConverter::getMemberData() {
return member_data{
{ RecognizerType::PocketSphinx, "pocketSphinx" },
{ RecognizerType::Phonetic, "phonetic" }
};
}
// Stream output: writes the string form of the enum value.
std::ostream& operator<<(std::ostream& stream, RecognizerType value) {
return RecognizerTypeConverter::get().write(stream, value);
}
// Stream input: parses an enum value from its string form.
std::istream& operator>>(std::istream& stream, RecognizerType& value) {
return RecognizerTypeConverter::get().read(stream, value);
}

View File

@ -0,0 +1,20 @@
#pragma once
#include "tools/EnumConverter.h"
// The speech recognizer implementations selectable via the command line.
enum class RecognizerType {
PocketSphinx,
Phonetic
};
// Converts RecognizerType values to/from their command-line string names.
class RecognizerTypeConverter : public EnumConverter<RecognizerType> {
public:
static RecognizerTypeConverter& get();
protected:
std::string getTypeName() override;
member_data getMemberData() override;
};
std::ostream& operator<<(std::ostream& stream, RecognizerType value);
std::istream& operator>>(std::istream& stream, RecognizerType& value);

View File

@ -27,6 +27,9 @@
#include "tools/platformTools.h"
#include "sinks.h"
#include "semanticEntries.h"
#include "RecognizerType.h"
#include "recognition/PocketSphinxRecognizer.h"
#include "recognition/PhoneticRecognizer.h"
using std::exception;
using std::string;
@ -36,9 +39,6 @@ using std::unique_ptr;
using std::make_unique;
using std::shared_ptr;
using std::make_shared;
using std::map;
using std::chrono::duration;
using std::chrono::duration_cast;
using std::ofstream;
using boost::filesystem::path;
using boost::adaptors::transformed;
@ -56,6 +56,10 @@ namespace TCLAP {
struct ArgTraits<ExportFormat> {
typedef ValueLike ValueCategory;
};
template<>
struct ArgTraits<RecognizerType> {
typedef ValueLike ValueCategory;
};
}
shared_ptr<logging::Sink> createFileSink(path path, logging::Level minLevel) {
@ -66,6 +70,17 @@ shared_ptr<logging::Sink> createFileSink(path path, logging::Level minLevel) {
return make_shared<logging::LevelFilter>(FileSink, minLevel);
}
// Instantiates the recognizer implementation matching the given type.
// Throws std::runtime_error for unmapped enum values.
unique_ptr<Recognizer> createRecognizer(RecognizerType recognizerType) {
	if (recognizerType == RecognizerType::PocketSphinx) {
		return make_unique<PocketSphinxRecognizer>();
	}
	if (recognizerType == RecognizerType::Phonetic) {
		return make_unique<PhoneticRecognizer>();
	}
	throw std::runtime_error("Unknown recognizer.");
}
unique_ptr<Exporter> createExporter(ExportFormat exportFormat) {
switch (exportFormat) {
case ExportFormat::Tsv:
@ -123,6 +138,9 @@ int main(int platformArgc, char *platformArgv[]) {
auto exportFormats = vector<ExportFormat>(ExportFormatConverter::get().getValues());
tclap::ValuesConstraint<ExportFormat> exportFormatConstraint(exportFormats);
tclap::ValueArg<ExportFormat> exportFormat("f", "exportFormat", "The export format.", false, ExportFormat::Tsv, &exportFormatConstraint, cmd);
auto recognizerTypes = vector<RecognizerType>(RecognizerTypeConverter::get().getValues());
tclap::ValuesConstraint<RecognizerType> recognizerConstraint(recognizerTypes);
tclap::ValueArg<RecognizerType> recognizerType("r", "recognizer", "The dialog recognizer.", false, RecognizerType::PocketSphinx, &recognizerConstraint, cmd);
tclap::UnlabeledValueArg<string> inputFileName("inputFile", "The input file. Must be a sound file in WAVE format.", true, "", "string", cmd);
try {
@ -169,6 +187,7 @@ int main(int platformArgc, char *platformArgv[]) {
JoiningContinuousTimeline<Shape> animation = animateWaveFile(
inputFilePath,
dialogFile.isSet() ? readUtf8File(path(dialogFile.getValue())) : boost::optional<string>(),
*createRecognizer(recognizerType.getValue()),
targetShapeSet,
maxThreadCount.getValue(),
progressSink);