Merge pull request #47 from DanielSWolf/feature/phonetic-recognition
Phonetic recognition
This commit is contained in:
commit
c078e6186e
|
@ -1,5 +1,9 @@
|
|||
# Version history
|
||||
|
||||
## Unreleased
|
||||
|
||||
* **Added** basic support for non-English recordings through phonetic recognition ([issue #45](https://github.com/DanielSWolf/rhubarb-lip-sync/issues/45)).
|
||||
|
||||
## Version 1.8.0
|
||||
|
||||
* **Added** support for Ogg Vorbis (.ogg) file format ([issue #40](https://github.com/DanielSWolf/rhubarb-lip-sync/issues/40)).
|
||||
|
|
18
README.adoc
18
README.adoc
|
@ -123,6 +123,11 @@ The following command-line options are the most common:
|
|||
| _<input file>_
|
||||
| The audio file to be analyzed. This must be the last command-line argument. Supported file formats are WAVE (.wav) and Ogg Vorbis (.ogg).
|
||||
|
||||
| `-r` _<recognizer>_, `--recognizer` _<recognizer>_
|
||||
| Specifies how Rhubarb Lip Sync recognizes speech within the recording. Options: `pocketSphinx` (use for English recordings), `phonetic` (use for non-English recordings). For details, see <<recognizers>>.
|
||||
|
||||
_Default value: ``pocketSphinx``_
|
||||
|
||||
| `-f` _<format>_, `--exportFormat` _<format>_
|
||||
| The export format. Options: `tsv` (tab-separated values, see <<tsv,details>>), `xml` (see <<xml,details>>), `json` (see <<json,details>>).
|
||||
|
||||
|
@ -192,6 +197,19 @@ Note that for short audio files, Rhubarb Lip Sync may choose to use fewer thread
|
|||
_Default value: as many threads as your CPU has cores_
|
||||
|===
|
||||
|
||||
[[recognizers]]
|
||||
== Recognizers
|
||||
|
||||
The first step in processing an audio file is determining what is being said. More specifically, Rhubarb Lip Sync uses speech recognition to figure out what sound is being said at what point in time. You can choose between two recognizers:
|
||||
|
||||
=== PocketSphinx
|
||||
|
||||
PocketSphinx is an open-source speech recognition library that generally gives good results. This is the default recognizer. The downside is that PocketSphinx only recognizes English dialog. So if your recordings are in a language other than English, this is not a good choice.
|
||||
|
||||
=== Phonetic
|
||||
|
||||
Rhubarb Lip Sync also comes with a phonetic recognizer. _Phonetic_ means that this recognizer won't try to understand entire (English) words and phrases. Instead, it will recognize individual sounds and syllables. The results are usually less precise than those from the PocketSphinx recognizer. The advantage is that this recognizer is language-independent. Use it if your recordings are not in English.
|
||||
|
||||
[[outputFormats]]
|
||||
== Output formats
|
||||
|
||||
|
|
|
@ -2,7 +2,7 @@ cmake_minimum_required(VERSION 3.2)
|
|||
|
||||
set(appName "Rhubarb Lip Sync")
|
||||
set(appVersionMajor 1)
|
||||
set(appVersionMinor 8)
|
||||
set(appVersionMinor 9)
|
||||
set(appVersionPatch 0)
|
||||
set(appVersionSuffix "")
|
||||
set(appVersionSuffix "-pre.1")
|
||||
set(appVersion "${appVersionMajor}.${appVersionMinor}.${appVersionPatch}${appVersionSuffix}")
|
||||
|
|
|
@ -323,6 +323,12 @@ function createDialogWindow() {
|
|||
+ 'your After Effects project.'
|
||||
})
|
||||
}),
|
||||
recognizer: Group({
|
||||
label: StaticText({ text: 'Recognizer:' }),
|
||||
value: DropDownList({
|
||||
helpTip: 'The dialog recognizer.'
|
||||
})
|
||||
}),
|
||||
dialogText: Group({
|
||||
label: StaticText({ text: 'Dialog text (optional):' }),
|
||||
value: EditText({
|
||||
|
@ -384,6 +390,7 @@ function createDialogWindow() {
|
|||
var controls = {
|
||||
audioFile: window.settings.audioFile.value,
|
||||
dialogText: window.settings.dialogText.value,
|
||||
recognizer: window.settings.recognizer.value,
|
||||
mouthComp: window.settings.mouthComp.value,
|
||||
targetFolder: window.settings.targetFolder.value,
|
||||
frameRate: window.settings.frameRate.value,
|
||||
|
@ -402,6 +409,16 @@ function createDialogWindow() {
|
|||
listItem.projectItem = projectItem;
|
||||
});
|
||||
|
||||
// Add recognizer options
|
||||
const recognizerOptions = [
|
||||
{ text: 'PocketSphinx (use for English recordings)', value: 'pocketSphinx' },
|
||||
{ text: 'Phonetic (use for non-English recordings)', value: 'phonetic' }
|
||||
];
|
||||
recognizerOptions.forEach(function(option) {
|
||||
var listItem = controls.recognizer.add('item', option.text);
|
||||
listItem.value = option.value;
|
||||
});
|
||||
|
||||
// Add mouth composition options
|
||||
var comps = toArrayBase1(app.project.items).filter(function (item) {
|
||||
return item instanceof CompItem;
|
||||
|
@ -425,6 +442,7 @@ function createDialogWindow() {
|
|||
var settings = readSettingsFile();
|
||||
selectByTextOrFirst(controls.audioFile, settings.audioFile);
|
||||
controls.dialogText.text = settings.dialogText || '';
|
||||
selectByTextOrFirst(controls.recognizer, settings.recognizer);
|
||||
selectByTextOrFirst(controls.mouthComp, settings.mouthComp);
|
||||
extendedMouthShapeNames.forEach(function(shapeName) {
|
||||
controls['mouthShape' + shapeName].value =
|
||||
|
@ -484,6 +502,7 @@ function createDialogWindow() {
|
|||
// Store settings
|
||||
var settings = {
|
||||
audioFile: (controls.audioFile.selection || {}).text,
|
||||
recognizer: (controls.recognizer.selection || {}).text,
|
||||
dialogText: controls.dialogText.text,
|
||||
mouthComp: (controls.mouthComp.selection || {}).text,
|
||||
extendedMouthShapes: {},
|
||||
|
@ -543,7 +562,7 @@ function createDialogWindow() {
|
|||
|
||||
// Check for correct Rhubarb version
|
||||
var version = exec(rhubarbPath + ' --version') || '';
|
||||
var match = version.match(/Rhubarb Lip Sync version ((\d+)\.(\d+).(\d+))/);
|
||||
var match = version.match(/Rhubarb Lip Sync version ((\d+)\.(\d+).(\d+)(-[0-9A-Za-z-.]+)?)/);
|
||||
if (!match) {
|
||||
var instructions = osIsWindows
|
||||
? 'Make sure your PATH environment variable contains the ' + appName + ' '
|
||||
|
@ -555,13 +574,16 @@ function createDialogWindow() {
|
|||
var versionString = match[1];
|
||||
var major = Number(match[2]);
|
||||
var minor = Number(match[3]);
|
||||
if (major != 1 || minor < 6) {
|
||||
return 'This script requires ' + appName + ' 1.6.0 or a later 1.x version. '
|
||||
var requiredMajor = 1;
|
||||
var minRequiredMinor = 9;
|
||||
if (major != requiredMajor || minor < minRequiredMinor) {
|
||||
return 'This script requires ' + appName + ' ' + requiredMajor + '.' + minRequiredMinor
|
||||
+ '.0 or a later ' + requiredMajor + '.x version. '
|
||||
+ 'Your installed version is ' + versionString + ', which is not compatible.';
|
||||
}
|
||||
}
|
||||
|
||||
function generateMouthCues(audioFileFootage, dialogText, mouthComp, extendedMouthShapeNames,
|
||||
function generateMouthCues(audioFileFootage, recognizer, dialogText, mouthComp, extendedMouthShapeNames,
|
||||
targetProjectFolder, frameRate)
|
||||
{
|
||||
var basePath = Folder.temp.fsName + '/' + createGuid();
|
||||
|
@ -575,6 +597,7 @@ function createDialogWindow() {
|
|||
// Create command line
|
||||
var commandLine = rhubarbPath
|
||||
+ ' --dialogFile ' + cliEscape(dialogFile.fsName)
|
||||
+ ' --recognizer ' + recognizer
|
||||
+ ' --exportFormat json'
|
||||
+ ' --extendedShapes ' + cliEscape(extendedMouthShapeNames.join(''))
|
||||
+ ' --logFile ' + cliEscape(logFile.fsName)
|
||||
|
@ -660,11 +683,11 @@ function createDialogWindow() {
|
|||
}
|
||||
}
|
||||
|
||||
function animate(audioFileFootage, dialogText, mouthComp, extendedMouthShapeNames,
|
||||
function animate(audioFileFootage, recognizer, dialogText, mouthComp, extendedMouthShapeNames,
|
||||
targetProjectFolder, frameRate)
|
||||
{
|
||||
try {
|
||||
var mouthCues = generateMouthCues(audioFileFootage, dialogText, mouthComp,
|
||||
var mouthCues = generateMouthCues(audioFileFootage, recognizer, dialogText, mouthComp,
|
||||
extendedMouthShapeNames, targetProjectFolder, frameRate);
|
||||
|
||||
app.beginUndoGroup(appName + ': Animation');
|
||||
|
@ -680,6 +703,7 @@ function createDialogWindow() {
|
|||
// Handle changes
|
||||
update();
|
||||
controls.audioFile.onChange = update;
|
||||
controls.recognizer.onChange = update;
|
||||
controls.dialogText.onChanging = update;
|
||||
controls.mouthComp.onChange = update;
|
||||
extendedMouthShapeNames.forEach(function(shapeName) {
|
||||
|
@ -700,6 +724,7 @@ function createDialogWindow() {
|
|||
window.close();
|
||||
animate(
|
||||
controls.audioFile.selection.projectItem,
|
||||
controls.recognizer.selection.value,
|
||||
controls.dialogText.text || '',
|
||||
controls.mouthComp.selection.projectItem,
|
||||
extendedMouthShapeNames.filter(function(shapeName) {
|
||||
|
|
|
@ -141,11 +141,12 @@ class AudioFileModel(
|
|||
|
||||
private fun startAnimation() {
|
||||
val wrapperTask = Runnable {
|
||||
val recognizer = parentModel.parentModel.recognizer.value
|
||||
val extendedMouthShapes = parentModel.mouthShapes.filter { it.isExtended }.toSet()
|
||||
val reportProgress: (Double?) -> Unit = {
|
||||
progress -> runAndWait { this@AudioFileModel.animationProgress = progress }
|
||||
}
|
||||
val rhubarbTask = RhubarbTask(audioFilePath, dialog, extendedMouthShapes, reportProgress)
|
||||
val rhubarbTask = RhubarbTask(audioFilePath, recognizer, dialog, extendedMouthShapes, reportProgress)
|
||||
try {
|
||||
try {
|
||||
val result = rhubarbTask.call()
|
||||
|
|
|
@ -2,6 +2,8 @@ package com.rhubarb_lip_sync.rhubarb_for_spine
|
|||
|
||||
import javafx.beans.property.SimpleObjectProperty
|
||||
import javafx.beans.property.SimpleStringProperty
|
||||
import javafx.collections.FXCollections
|
||||
import javafx.collections.ObservableList
|
||||
import tornadofx.FX
|
||||
import tornadofx.getValue
|
||||
import tornadofx.setValue
|
||||
|
@ -40,6 +42,15 @@ class MainModel(private val executor: ExecutorService) {
|
|||
var animationFileModel by animationFileModelProperty
|
||||
private set
|
||||
|
||||
val recognizersProperty = SimpleObjectProperty<ObservableList<Recognizer>>(FXCollections.observableArrayList(
|
||||
Recognizer("pocketSphinx", "PocketSphinx (use for English recordings)"),
|
||||
Recognizer("phonetic", "Phonetic (use for non-English recordings)")
|
||||
))
|
||||
private var recognizers: ObservableList<Recognizer> by recognizersProperty
|
||||
|
||||
val recognizerProperty = SimpleObjectProperty<Recognizer>(recognizers[0])
|
||||
var recognizer: Recognizer by recognizerProperty
|
||||
|
||||
val animationPrefixProperty = SimpleStringProperty("say_")
|
||||
var animationPrefix: String by animationPrefixProperty
|
||||
|
||||
|
@ -48,3 +59,5 @@ class MainModel(private val executor: ExecutorService) {
|
|||
|
||||
private fun getDefaultPathString() = FX.application.parameters.raw.firstOrNull()
|
||||
}
|
||||
|
||||
class Recognizer(val value: String, val description: String)
|
||||
|
|
|
@ -17,6 +17,7 @@ import javafx.scene.text.Font
|
|||
import javafx.scene.text.FontWeight
|
||||
import javafx.scene.text.Text
|
||||
import javafx.stage.FileChooser
|
||||
import javafx.util.StringConverter
|
||||
import tornadofx.*
|
||||
import java.io.File
|
||||
import java.util.concurrent.Executors
|
||||
|
@ -83,6 +84,20 @@ class MainView : View() {
|
|||
}
|
||||
}
|
||||
}
|
||||
field("Dialog recognizer") {
|
||||
combobox<Recognizer> {
|
||||
itemsProperty().bind(mainModel.recognizersProperty)
|
||||
this.converter = object : StringConverter<Recognizer>() {
|
||||
override fun toString(recognizer: Recognizer?): String {
|
||||
return recognizer?.description ?: ""
|
||||
}
|
||||
override fun fromString(string: String?): Recognizer {
|
||||
throw NotImplementedError()
|
||||
}
|
||||
}
|
||||
valueProperty().bindBidirectional(mainModel.recognizerProperty)
|
||||
}
|
||||
}
|
||||
field("Animation naming") {
|
||||
textfield {
|
||||
maxWidth = 100.0
|
||||
|
|
|
@ -14,6 +14,7 @@ import java.util.concurrent.Callable
|
|||
|
||||
class RhubarbTask(
|
||||
val audioFilePath: Path,
|
||||
val recognizer: String,
|
||||
val dialog: String?,
|
||||
val extendedMouthShapes: Set<MouthShape>,
|
||||
val reportProgress: (Double?) -> Unit
|
||||
|
@ -89,6 +90,7 @@ class RhubarbTask(
|
|||
return mutableListOf(
|
||||
rhubarbBinFilePath.toString(),
|
||||
"--machineReadable",
|
||||
"--recognizer", recognizer,
|
||||
"--exportFormat", "json",
|
||||
"--extendedShapes", extendedMouthShapesString
|
||||
).apply {
|
||||
|
@ -100,7 +102,6 @@ class RhubarbTask(
|
|||
}.apply {
|
||||
add(audioFilePath.toString())
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
private val guiBinDirectory: Path by lazy {
|
||||
|
|
|
@ -413,8 +413,13 @@ add_library(rhubarb-recognition
|
|||
src/recognition/g2p.h
|
||||
src/recognition/languageModels.cpp
|
||||
src/recognition/languageModels.h
|
||||
src/recognition/phoneRecognition.cpp
|
||||
src/recognition/phoneRecognition.h
|
||||
src/recognition/PhoneticRecognizer.cpp
|
||||
src/recognition/PhoneticRecognizer.h
|
||||
src/recognition/PocketSphinxRecognizer.cpp
|
||||
src/recognition/PocketSphinxRecognizer.h
|
||||
src/recognition/pocketSphinxTools.cpp
|
||||
src/recognition/pocketSphinxTools.h
|
||||
src/recognition/Recognizer.h
|
||||
src/recognition/tokenization.cpp
|
||||
src/recognition/tokenization.h
|
||||
)
|
||||
|
@ -487,6 +492,8 @@ add_executable(rhubarb
|
|||
src/rhubarb/main.cpp
|
||||
src/rhubarb/ExportFormat.cpp
|
||||
src/rhubarb/ExportFormat.h
|
||||
src/rhubarb/RecognizerType.cpp
|
||||
src/rhubarb/RecognizerType.h
|
||||
src/rhubarb/semanticEntries.cpp
|
||||
src/rhubarb/semanticEntries.h
|
||||
src/rhubarb/sinks.cpp
|
||||
|
|
|
@ -1,7 +1,12 @@
|
|||
<wpf:ResourceDictionary xml:space="preserve" xmlns:x="http://schemas.microsoft.com/winfx/2006/xaml" xmlns:s="clr-namespace:System;assembly=mscorlib" xmlns:ss="urn:shemas-jetbrains-com:settings-storage-xaml" xmlns:wpf="http://schemas.microsoft.com/winfx/2006/xaml/presentation">
|
||||
<s:String x:Key="/Default/CodeInspection/Highlighting/InspectionSeverities/=CppClangTidyModernizeRawStringLiteral/@EntryIndexedValue">HINT</s:String>
|
||||
|
||||
<s:String x:Key="/Default/CodeInspection/Highlighting/InspectionSeverities/=CppFunctionDoesntReturnValue/@EntryIndexedValue">ERROR</s:String>
|
||||
|
||||
<s:String x:Key="/Default/CodeInspection/Highlighting/InspectionSeverities/=LocalizableElement/@EntryIndexedValue">DO_NOT_SHOW</s:String>
|
||||
|
||||
<s:String x:Key="/Default/CodeStyle/CodeFormatting/CommonFormatter/ALIGNMENT_TAB_FILL_STYLE/@EntryValue">USE_TABS_ONLY</s:String>
|
||||
<s:String x:Key="/Default/CodeStyle/CodeFormatting/CppFormatting/ALIGNMENT_TAB_FILL_STYLE/@EntryValue">USE_TABS_ONLY</s:String>
|
||||
<s:Boolean x:Key="/Default/CodeStyle/CodeFormatting/CppFormatting/ALIGN_MULTILINE_ARGUMENT/@EntryValue">False</s:Boolean>
|
||||
<s:Boolean x:Key="/Default/CodeStyle/CodeFormatting/CppFormatting/ALIGN_MULTILINE_CALLS_CHAIN/@EntryValue">False</s:Boolean>
|
||||
<s:Boolean x:Key="/Default/CodeStyle/CodeFormatting/CppFormatting/ALIGN_MULTILINE_CTOR_INIT/@EntryValue">False</s:Boolean>
|
||||
|
@ -29,6 +34,7 @@
|
|||
<s:String x:Key="/Default/CodeStyle/CodeFormatting/CppFormatting/WRAP_ENUMERATION_STYLE/@EntryValue">CHOP_ALWAYS</s:String>
|
||||
<s:String x:Key="/Default/CodeStyle/CodeFormatting/CSharpFormat/ACCESSOR_DECLARATION_BRACES/@EntryValue">END_OF_LINE</s:String>
|
||||
<s:String x:Key="/Default/CodeStyle/CodeFormatting/CSharpFormat/ACCESSOR_OWNER_DECLARATION_BRACES/@EntryValue">END_OF_LINE</s:String>
|
||||
<s:String x:Key="/Default/CodeStyle/CodeFormatting/CSharpFormat/ALIGNMENT_TAB_FILL_STYLE/@EntryValue">USE_TABS_ONLY</s:String>
|
||||
<s:Boolean x:Key="/Default/CodeStyle/CodeFormatting/CSharpFormat/ALIGN_MULTILINE_BINARY_EXPRESSIONS_CHAIN/@EntryValue">False</s:Boolean>
|
||||
<s:String x:Key="/Default/CodeStyle/CodeFormatting/CSharpFormat/ANONYMOUS_METHOD_DECLARATION_BRACES/@EntryValue">END_OF_LINE</s:String>
|
||||
<s:String x:Key="/Default/CodeStyle/CodeFormatting/CSharpFormat/CASE_BLOCK_BRACES/@EntryValue">END_OF_LINE</s:String>
|
||||
|
@ -44,6 +50,14 @@
|
|||
<s:Boolean x:Key="/Default/CodeStyle/CodeFormatting/CSharpFormat/USE_INDENT_FROM_VS/@EntryValue">False</s:Boolean>
|
||||
<s:Boolean x:Key="/Default/CodeStyle/CodeFormatting/CSharpFormat/WRAP_BEFORE_BINARY_OPSIGN/@EntryValue">True</s:Boolean>
|
||||
<s:Boolean x:Key="/Default/CodeStyle/CodeFormatting/CSharpFormat/WRAP_LINES/@EntryValue">False</s:Boolean>
|
||||
<s:String x:Key="/Default/CodeStyle/CodeFormatting/CssFormatter/ALIGNMENT_TAB_FILL_STYLE/@EntryValue">USE_TABS_ONLY</s:String>
|
||||
<s:String x:Key="/Default/CodeStyle/CodeFormatting/HtmlFormatter/ALIGNMENT_TAB_FILL_STYLE/@EntryValue">USE_TABS_ONLY</s:String>
|
||||
<s:String x:Key="/Default/CodeStyle/CodeFormatting/JavaScriptCodeFormatting/ALIGNMENT_TAB_FILL_STYLE/@EntryValue">USE_TABS_ONLY</s:String>
|
||||
<s:String x:Key="/Default/CodeStyle/CodeFormatting/ProtobufCodeFormatting/ALIGNMENT_TAB_FILL_STYLE/@EntryValue">USE_TABS_ONLY</s:String>
|
||||
<s:String x:Key="/Default/CodeStyle/CodeFormatting/ResxFormatter/ALIGNMENT_TAB_FILL_STYLE/@EntryValue">USE_TABS_ONLY</s:String>
|
||||
<s:String x:Key="/Default/CodeStyle/CodeFormatting/VBFormat/ALIGNMENT_TAB_FILL_STYLE/@EntryValue">USE_TABS_ONLY</s:String>
|
||||
<s:String x:Key="/Default/CodeStyle/CodeFormatting/XmlDocFormatter/ALIGNMENT_TAB_FILL_STYLE/@EntryValue">USE_TABS_ONLY</s:String>
|
||||
<s:String x:Key="/Default/CodeStyle/CodeFormatting/XmlFormatter/ALIGNMENT_TAB_FILL_STYLE/@EntryValue">USE_TABS_ONLY</s:String>
|
||||
<s:String x:Key="/Default/CodeStyle/CSharpVarKeywordUsage/ForBuiltInTypes/@EntryValue">UseExplicitType</s:String>
|
||||
<s:String x:Key="/Default/CodeStyle/CSharpVarKeywordUsage/ForSimpleTypes/@EntryValue">UseVarWhenEvident</s:String>
|
||||
<s:String x:Key="/Default/CodeStyle/Naming/CppNaming/Rules/=Class_0020and_0020struct_0020fields/@EntryIndexedValue"><NamingElement Priority="10"><Descriptor Static="Indeterminate" Constexpr="Indeterminate" Const="Indeterminate" Volatile="Indeterminate" Accessibility="NOT_APPLICABLE"><type Name="class field" /><type Name="struct field" /></Descriptor><Policy Inspect="True" Prefix="" Suffix="_" Style="aaBb" /></NamingElement></s:String>
|
||||
|
@ -108,7 +122,16 @@
|
|||
<s:String x:Key="/Default/Environment/Hierarchy/PsiConfigurationSettingsKey/CustomLocation/@EntryValue">C:\Users\Daniel\AppData\Local\JetBrains\Transient\ReSharperPlatformVs14\v09\SolutionCaches</s:String>
|
||||
<s:Boolean x:Key="/Default/Environment/SettingsMigration/IsMigratorApplied/=JetBrains_002EReSharper_002EFeature_002EServices_002ECpp_002ECodeStyle_002ESettingsUpgrade_002EFunctionReturnStyleSettingsUpgrader/@EntryIndexedValue">True</s:Boolean>
|
||||
<s:Boolean x:Key="/Default/Environment/SettingsMigration/IsMigratorApplied/=JetBrains_002EReSharper_002EFeature_002EServices_002ECpp_002ECodeStyle_002ESettingsUpgrade_002ENamespaceIndentationSettingsUpgrader/@EntryIndexedValue">True</s:Boolean>
|
||||
<s:Boolean x:Key="/Default/Environment/SettingsMigration/IsMigratorApplied/=JetBrains_002EReSharper_002EPsi_002ECSharp_002ECodeStyle_002ECSharpKeepExistingMigration/@EntryIndexedValue">True</s:Boolean>
|
||||
<s:Boolean x:Key="/Default/Environment/SettingsMigration/IsMigratorApplied/=JetBrains_002EReSharper_002EPsi_002ECSharp_002ECodeStyle_002ECSharpPlaceEmbeddedOnSameLineMigration/@EntryIndexedValue">True</s:Boolean>
|
||||
<s:Boolean x:Key="/Default/Environment/SettingsMigration/IsMigratorApplied/=JetBrains_002EReSharper_002EPsi_002ECSharp_002ECodeStyle_002ESettingsUpgrade_002EAddAccessorOwnerDeclarationBracesMigration/@EntryIndexedValue">True</s:Boolean>
|
||||
<s:Boolean x:Key="/Default/Environment/SettingsMigration/IsMigratorApplied/=JetBrains_002EReSharper_002EPsi_002ECSharp_002ECodeStyle_002ESettingsUpgrade_002EMigrateBlankLinesAroundFieldToBlankLinesAroundProperty/@EntryIndexedValue">True</s:Boolean>
|
||||
<s:Boolean x:Key="/Default/Environment/SettingsMigration/IsMigratorApplied/=JetBrains_002EReSharper_002EPsi_002ECSharp_002ECodeStyle_002ESettingsUpgrade_002EMigrateThisQualifierSettings/@EntryIndexedValue">True</s:Boolean>
|
||||
<s:Boolean x:Key="/Default/Environment/SettingsMigration/IsMigratorApplied/=JetBrains_002EReSharper_002EPsi_002EFormat_002ESettingsUpgrade_002EAlignmentTabFillStyleMigration/@EntryIndexedValue">True</s:Boolean>
|
||||
<s:Boolean x:Key="/Default/UserDictionary/Words/=allphone/@EntryIndexedValue">True</s:Boolean>
|
||||
<s:Boolean x:Key="/Default/UserDictionary/Words/=cepstral/@EntryIndexedValue">True</s:Boolean>
|
||||
<s:Boolean x:Key="/Default/UserDictionary/Words/=cmudict/@EntryIndexedValue">True</s:Boolean>
|
||||
<s:Boolean x:Key="/Default/UserDictionary/Words/=pbeam/@EntryIndexedValue">True</s:Boolean>
|
||||
<s:Boolean x:Key="/Default/UserDictionary/Words/=qwhy/@EntryIndexedValue">True</s:Boolean>
|
||||
<s:Boolean x:Key="/Default/UserDictionary/Words/=Viterbi/@EntryIndexedValue">True</s:Boolean>
|
||||
</wpf:ResourceDictionary>
|
|
@ -1,6 +1,5 @@
|
|||
#include "rhubarbLib.h"
|
||||
#include "core/Phone.h"
|
||||
#include "recognition/phoneRecognition.h"
|
||||
#include "tools/textFiles.h"
|
||||
#include "animation/mouthAnimation.h"
|
||||
#include "audio/audioFileReading.h"
|
||||
|
@ -8,27 +7,29 @@
|
|||
using boost::optional;
|
||||
using std::string;
|
||||
using boost::filesystem::path;
|
||||
using std::unique_ptr;
|
||||
|
||||
JoiningContinuousTimeline<Shape> animateAudioClip(
|
||||
const AudioClip& audioClip,
|
||||
optional<string> dialog,
|
||||
const optional<string>& dialog,
|
||||
const Recognizer& recognizer,
|
||||
const ShapeSet& targetShapeSet,
|
||||
int maxThreadCount,
|
||||
ProgressSink& progressSink)
|
||||
{
|
||||
BoundedTimeline<Phone> phones = recognizePhones(audioClip, dialog, maxThreadCount, progressSink);
|
||||
const BoundedTimeline<Phone> phones =
|
||||
recognizer.recognizePhones(audioClip, dialog, maxThreadCount, progressSink);
|
||||
JoiningContinuousTimeline<Shape> result = animate(phones, targetShapeSet);
|
||||
return result;
|
||||
}
|
||||
|
||||
JoiningContinuousTimeline<Shape> animateWaveFile(
|
||||
path filePath,
|
||||
optional<string> dialog,
|
||||
const optional<string>& dialog,
|
||||
const Recognizer& recognizer,
|
||||
const ShapeSet& targetShapeSet,
|
||||
int maxThreadCount,
|
||||
ProgressSink& progressSink)
|
||||
{
|
||||
const auto audioClip = createAudioFileClip(filePath);
|
||||
return animateAudioClip(*audioClip, dialog, targetShapeSet, maxThreadCount, progressSink);
|
||||
return animateAudioClip(*audioClip, dialog, recognizer, targetShapeSet, maxThreadCount, progressSink);
|
||||
}
|
||||
|
|
|
@ -6,17 +6,20 @@
|
|||
#include "tools/ProgressBar.h"
|
||||
#include <boost/filesystem.hpp>
|
||||
#include "animation/targetShapeSet.h"
|
||||
#include "recognition/Recognizer.h"
|
||||
|
||||
JoiningContinuousTimeline<Shape> animateAudioClip(
|
||||
const AudioClip& audioClip,
|
||||
boost::optional<std::string> dialog,
|
||||
const boost::optional<std::string>& dialog,
|
||||
const Recognizer& recognizer,
|
||||
const ShapeSet& targetShapeSet,
|
||||
int maxThreadCount,
|
||||
ProgressSink& progressSink);
|
||||
|
||||
JoiningContinuousTimeline<Shape> animateWaveFile(
|
||||
boost::filesystem::path filePath,
|
||||
boost::optional<std::string> dialog,
|
||||
const boost::optional<std::string>& dialog,
|
||||
const Recognizer& recognizer,
|
||||
const ShapeSet& targetShapeSet,
|
||||
int maxThreadCount,
|
||||
ProgressSink& progressSink);
|
||||
|
|
|
@ -0,0 +1,103 @@
|
|||
#include "PhoneticRecognizer.h"
|
||||
#include "time/Timeline.h"
|
||||
#include "audio/AudioSegment.h"
|
||||
#include "audio/SampleRateConverter.h"
|
||||
#include "audio/processing.h"
|
||||
#include "time/timedLogging.h"
|
||||
|
||||
using std::runtime_error;
|
||||
using std::unique_ptr;
|
||||
using std::string;
|
||||
using boost::optional;
|
||||
|
||||
static lambda_unique_ptr<ps_decoder_t> createDecoder(optional<std::string> dialog) {
|
||||
UNUSED(dialog);
|
||||
|
||||
lambda_unique_ptr<cmd_ln_t> config(
|
||||
cmd_ln_init(
|
||||
nullptr, ps_args(), true,
|
||||
// Set acoustic model
|
||||
"-hmm", (getSphinxModelDirectory() / "acoustic-model").string().c_str(),
|
||||
// Set phonetic language model
|
||||
"-allphone", (getSphinxModelDirectory() / "en-us-phone.lm.bin").string().c_str(),
|
||||
"-allphone_ci", "yes",
|
||||
// Set language model probability weight.
|
||||
// Low values (<= 0.4) can lead to fluttering animation.
|
||||
// High values (>= 1.0) can lead to imprecise or freezing animation.
|
||||
"-lw", "0.8",
|
||||
|
||||
// The following settings are recommended at http://cmusphinx.sourceforge.net/wiki/phonemerecognition
|
||||
|
||||
// Set beam width applied to every frame in Viterbi search
|
||||
"-beam", "1e-20",
|
||||
// Set beam width applied to phone transitions
|
||||
"-pbeam", "1e-20",
|
||||
nullptr),
|
||||
[](cmd_ln_t* config) { cmd_ln_free_r(config); });
|
||||
if (!config) throw runtime_error("Error creating configuration.");
|
||||
|
||||
lambda_unique_ptr<ps_decoder_t> decoder(
|
||||
ps_init(config.get()),
|
||||
[](ps_decoder_t* recognizer) { ps_free(recognizer); });
|
||||
if (!decoder) throw runtime_error("Error creating speech decoder.");
|
||||
|
||||
return decoder;
|
||||
}
|
||||
|
||||
static Timeline<Phone> utteranceToPhones(
|
||||
const AudioClip& audioClip,
|
||||
TimeRange utteranceTimeRange,
|
||||
ps_decoder_t& decoder,
|
||||
ProgressSink& utteranceProgressSink
|
||||
) {
|
||||
// Pad time range to give PocketSphinx some breathing room
|
||||
TimeRange paddedTimeRange = utteranceTimeRange;
|
||||
const centiseconds padding(3);
|
||||
paddedTimeRange.grow(padding);
|
||||
paddedTimeRange.trim(audioClip.getTruncatedRange());
|
||||
|
||||
const unique_ptr<AudioClip> clipSegment = audioClip.clone() | segment(paddedTimeRange) | resample(sphinxSampleRate);
|
||||
const auto audioBuffer = copyTo16bitBuffer(*clipSegment);
|
||||
|
||||
// Detect phones (returned as words)
|
||||
BoundedTimeline<string> phoneStrings = recognizeWords(audioBuffer, decoder);
|
||||
phoneStrings.shift(paddedTimeRange.getStart());
|
||||
Timeline<Phone> utterancePhones;
|
||||
for (const auto& timedPhoneString : phoneStrings) {
|
||||
Phone phone = PhoneConverter::get().parse(timedPhoneString.getValue());
|
||||
if (phone == Phone::AH && timedPhoneString.getDuration() < 6_cs) {
|
||||
// Heuristic: < 6_cs is schwa. PocketSphinx doesn't differentiate.
|
||||
phone = Phone::Schwa;
|
||||
}
|
||||
utterancePhones.set(timedPhoneString.getTimeRange(), phone);
|
||||
}
|
||||
|
||||
// Log raw phones
|
||||
for (const auto& timedPhone : utterancePhones) {
|
||||
logTimedEvent("rawPhone", timedPhone);
|
||||
}
|
||||
|
||||
// Guess positions of noise sounds
|
||||
JoiningTimeline<void> noiseSounds = getNoiseSounds(utteranceTimeRange, utterancePhones);
|
||||
for (const auto& noiseSound : noiseSounds) {
|
||||
utterancePhones.set(noiseSound.getTimeRange(), Phone::Noise);
|
||||
}
|
||||
|
||||
// Log phones
|
||||
for (const auto& timedPhone : utterancePhones) {
|
||||
logTimedEvent("phone", timedPhone);
|
||||
}
|
||||
|
||||
utteranceProgressSink.reportProgress(1.0);
|
||||
|
||||
return utterancePhones;
|
||||
}
|
||||
|
||||
BoundedTimeline<Phone> PhoneticRecognizer::recognizePhones(
|
||||
const AudioClip& inputAudioClip,
|
||||
optional<std::string> dialog,
|
||||
int maxThreadCount,
|
||||
ProgressSink& progressSink
|
||||
) const {
|
||||
return ::recognizePhones(inputAudioClip, dialog, &createDecoder, &utteranceToPhones, maxThreadCount, progressSink);
|
||||
}
|
|
@ -0,0 +1,14 @@
|
|||
#pragma once
|
||||
|
||||
#include "Recognizer.h"
|
||||
#include "pocketSphinxTools.h"
|
||||
|
||||
class PhoneticRecognizer : public Recognizer {
|
||||
public:
|
||||
BoundedTimeline<Phone> recognizePhones(
|
||||
const AudioClip& inputAudioClip,
|
||||
boost::optional<std::string> dialog,
|
||||
int maxThreadCount,
|
||||
ProgressSink& progressSink
|
||||
) const override;
|
||||
};
|
|
@ -1,143 +1,133 @@
|
|||
#include <boost/filesystem.hpp>
|
||||
#include "phoneRecognition.h"
|
||||
#include "audio/SampleRateConverter.h"
|
||||
#include "tools/platformTools.h"
|
||||
#include "tools/tools.h"
|
||||
#include <format.h>
|
||||
#include <s3types.h>
|
||||
#include "PocketSphinxRecognizer.h"
|
||||
#include <regex>
|
||||
#include <gsl_util.h>
|
||||
#include "logging/logging.h"
|
||||
#include "audio/DcOffset.h"
|
||||
#include "time/Timeline.h"
|
||||
#include "audio/voiceActivityDetection.h"
|
||||
#include "audio/AudioSegment.h"
|
||||
#include "audio/SampleRateConverter.h"
|
||||
#include "languageModels.h"
|
||||
#include "tokenization.h"
|
||||
#include "g2p.h"
|
||||
#include "time/ContinuousTimeline.h"
|
||||
#include "audio/processing.h"
|
||||
#include "tools/parallel.h"
|
||||
#include <boost/version.hpp>
|
||||
#include "tools/ObjectPool.h"
|
||||
#include "time/timedLogging.h"
|
||||
|
||||
extern "C" {
|
||||
#include <pocketsphinx.h>
|
||||
#include <sphinxbase/err.h>
|
||||
#include <ps_alignment.h>
|
||||
#include <state_align_search.h>
|
||||
#include <pocketsphinx_internal.h>
|
||||
#include <ngram_search.h>
|
||||
}
|
||||
|
||||
using std::runtime_error;
|
||||
using std::invalid_argument;
|
||||
using std::unique_ptr;
|
||||
using std::shared_ptr;
|
||||
using std::string;
|
||||
using std::vector;
|
||||
using std::map;
|
||||
using boost::filesystem::path;
|
||||
using std::function;
|
||||
using std::regex;
|
||||
using std::regex_replace;
|
||||
using std::chrono::duration;
|
||||
using boost::optional;
|
||||
using std::string;
|
||||
using std::chrono::duration_cast;
|
||||
using std::array;
|
||||
|
||||
constexpr int sphinxSampleRate = 16000;
|
||||
|
||||
const path& getSphinxModelDirectory() {
|
||||
static path sphinxModelDirectory(getBinDirectory() / "res/sphinx");
|
||||
return sphinxModelDirectory;
|
||||
bool dictionaryContains(dict_t& dictionary, const string& word) {
|
||||
return dict_wordid(&dictionary, word.c_str()) != BAD_S3WID;
|
||||
}
|
||||
|
||||
logging::Level ConvertSphinxErrorLevel(err_lvl_t errorLevel) {
|
||||
switch (errorLevel) {
|
||||
case ERR_DEBUG:
|
||||
case ERR_INFO:
|
||||
case ERR_INFOCONT:
|
||||
return logging::Level::Trace;
|
||||
case ERR_WARN:
|
||||
return logging::Level::Warn;
|
||||
case ERR_ERROR:
|
||||
return logging::Level::Error;
|
||||
case ERR_FATAL:
|
||||
return logging::Level::Fatal;
|
||||
default:
|
||||
throw invalid_argument("Unknown log level.");
|
||||
s3wid_t getWordId(const string& word, dict_t& dictionary) {
|
||||
const s3wid_t wordId = dict_wordid(&dictionary, word.c_str());
|
||||
if (wordId == BAD_S3WID) throw invalid_argument(fmt::format("Unknown word '{}'.", word));
|
||||
return wordId;
|
||||
}
|
||||
|
||||
void addMissingDictionaryWords(const vector<string>& words, ps_decoder_t& decoder) {
|
||||
map<string, string> missingPronunciations;
|
||||
for (const string& word : words) {
|
||||
if (!dictionaryContains(*decoder.dict, word)) {
|
||||
string pronunciation;
|
||||
for (Phone phone : wordToPhones(word)) {
|
||||
if (pronunciation.length() > 0) pronunciation += " ";
|
||||
pronunciation += PhoneConverter::get().toString(phone);
|
||||
}
|
||||
missingPronunciations[word] = pronunciation;
|
||||
}
|
||||
}
|
||||
for (auto it = missingPronunciations.begin(); it != missingPronunciations.end(); ++it) {
|
||||
const bool isLast = it == --missingPronunciations.end();
|
||||
logging::infoFormat("Unknown word '{}'. Guessing pronunciation '{}'.", it->first, it->second);
|
||||
ps_add_word(&decoder, it->first.c_str(), it->second.c_str(), isLast);
|
||||
}
|
||||
}
|
||||
|
||||
void sphinxLogCallback(void* user_data, err_lvl_t errorLevel, const char* format, ...) {
|
||||
UNUSED(user_data);
|
||||
|
||||
// Create varArgs list
|
||||
va_list args;
|
||||
va_start(args, format);
|
||||
auto _ = gsl::finally([&args]() { va_end(args); });
|
||||
|
||||
// Format message
|
||||
const int initialSize = 256;
|
||||
vector<char> chars(initialSize);
|
||||
bool success = false;
|
||||
while (!success) {
|
||||
int charsWritten = vsnprintf(chars.data(), chars.size(), format, args);
|
||||
if (charsWritten < 0) throw runtime_error("Error formatting Pocketsphinx log message.");
|
||||
|
||||
success = charsWritten < static_cast<int>(chars.size());
|
||||
if (!success) chars.resize(chars.size() * 2);
|
||||
}
|
||||
regex waste("^(DEBUG|INFO|INFOCONT|WARN|ERROR|FATAL): ");
|
||||
string message = regex_replace(chars.data(), waste, "", std::regex_constants::format_first_only);
|
||||
boost::algorithm::trim(message);
|
||||
|
||||
logging::Level logLevel = ConvertSphinxErrorLevel(errorLevel);
|
||||
logging::log(logLevel, message);
|
||||
}
|
||||
|
||||
BoundedTimeline<string> recognizeWords(const vector<int16_t>& audioBuffer, ps_decoder_t& decoder) {
|
||||
// Restart timing at 0
|
||||
ps_start_stream(&decoder);
|
||||
|
||||
// Start recognition
|
||||
int error = ps_start_utt(&decoder);
|
||||
if (error) throw runtime_error("Error starting utterance processing for word recognition.");
|
||||
|
||||
// Process entire audio clip
|
||||
const bool noRecognition = false;
|
||||
const bool fullUtterance = true;
|
||||
int searchedFrameCount = ps_process_raw(&decoder, audioBuffer.data(), audioBuffer.size(), noRecognition, fullUtterance);
|
||||
if (searchedFrameCount < 0) throw runtime_error("Error analyzing raw audio data for word recognition.");
|
||||
|
||||
// End recognition
|
||||
error = ps_end_utt(&decoder);
|
||||
if (error) throw runtime_error("Error ending utterance processing for word recognition.");
|
||||
|
||||
BoundedTimeline<string> result(TimeRange(0_cs, centiseconds(100 * audioBuffer.size() / sphinxSampleRate)));
|
||||
bool noWordsRecognized = reinterpret_cast<ngram_search_t*>(decoder.search)->bpidx == 0;
|
||||
if (noWordsRecognized) {
|
||||
return result;
|
||||
}
|
||||
|
||||
// Collect words
|
||||
for (ps_seg_t* it = ps_seg_iter(&decoder); it; it = ps_seg_next(it)) {
|
||||
const char* word = ps_seg_word(it);
|
||||
int firstFrame, lastFrame;
|
||||
ps_seg_frames(it, &firstFrame, &lastFrame);
|
||||
result.set(centiseconds(firstFrame), centiseconds(lastFrame + 1), word);
|
||||
lambda_unique_ptr<ngram_model_t> createDefaultLanguageModel(ps_decoder_t& decoder) {
|
||||
path modelPath = getSphinxModelDirectory() / "en-us.lm.bin";
|
||||
lambda_unique_ptr<ngram_model_t> result(
|
||||
ngram_model_read(decoder.config, modelPath.string().c_str(), NGRAM_AUTO, decoder.lmath),
|
||||
[](ngram_model_t* lm) { ngram_model_free(lm); });
|
||||
if (!result) {
|
||||
throw runtime_error(fmt::format("Error reading language model from {}.", modelPath));
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
s3wid_t getWordId(const string& word, dict_t& dictionary) {
|
||||
s3wid_t wordId = dict_wordid(&dictionary, word.c_str());
|
||||
if (wordId == BAD_S3WID) throw invalid_argument(fmt::format("Unknown word '{}'.", word));
|
||||
return wordId;
|
||||
lambda_unique_ptr<ngram_model_t> createDialogLanguageModel(ps_decoder_t& decoder, const string& dialog) {
|
||||
// Split dialog into normalized words
|
||||
vector<string> words = tokenizeText(dialog, [&](const string& word) { return dictionaryContains(*decoder.dict, word); });
|
||||
|
||||
// Add dialog-specific words to the dictionary
|
||||
addMissingDictionaryWords(words, decoder);
|
||||
|
||||
// Create dialog-specific language model
|
||||
words.insert(words.begin(), "<s>");
|
||||
words.emplace_back("</s>");
|
||||
return createLanguageModel(words, decoder);
|
||||
}
|
||||
|
||||
lambda_unique_ptr<ngram_model_t> createBiasedLanguageModel(ps_decoder_t& decoder, const string& dialog) {
|
||||
auto defaultLanguageModel = createDefaultLanguageModel(decoder);
|
||||
auto dialogLanguageModel = createDialogLanguageModel(decoder, dialog);
|
||||
constexpr int modelCount = 2;
|
||||
array<ngram_model_t*, modelCount> languageModels{ defaultLanguageModel.get(), dialogLanguageModel.get() };
|
||||
array<const char*, modelCount> modelNames{ "defaultLM", "dialogLM" };
|
||||
array<float, modelCount> modelWeights{ 0.1f, 0.9f };
|
||||
lambda_unique_ptr<ngram_model_t> result(
|
||||
ngram_model_set_init(nullptr, languageModels.data(), const_cast<char**>(modelNames.data()), modelWeights.data(), modelCount),
|
||||
[](ngram_model_t* lm) { ngram_model_free(lm); });
|
||||
if (!result) {
|
||||
throw runtime_error("Error creating biased language model.");
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
static lambda_unique_ptr<ps_decoder_t> createDecoder(optional<std::string> dialog) {
|
||||
lambda_unique_ptr<cmd_ln_t> config(
|
||||
cmd_ln_init(
|
||||
nullptr, ps_args(), true,
|
||||
// Set acoustic model
|
||||
"-hmm", (getSphinxModelDirectory() / "acoustic-model").string().c_str(),
|
||||
// Set pronunciation dictionary
|
||||
"-dict", (getSphinxModelDirectory() / "cmudict-en-us.dict").string().c_str(),
|
||||
// Add noise against zero silence (see http://cmusphinx.sourceforge.net/wiki/faq#qwhy_my_accuracy_is_poor)
|
||||
"-dither", "yes",
|
||||
// Disable VAD -- we're doing that ourselves
|
||||
"-remove_silence", "no",
|
||||
// Perform per-utterance cepstral mean normalization
|
||||
"-cmn", "batch",
|
||||
nullptr),
|
||||
[](cmd_ln_t* config) { cmd_ln_free_r(config); });
|
||||
if (!config) throw runtime_error("Error creating configuration.");
|
||||
|
||||
lambda_unique_ptr<ps_decoder_t> decoder(
|
||||
ps_init(config.get()),
|
||||
[](ps_decoder_t* recognizer) { ps_free(recognizer); });
|
||||
if (!decoder) throw runtime_error("Error creating speech decoder.");
|
||||
|
||||
// Set language model
|
||||
lambda_unique_ptr<ngram_model_t> languageModel(dialog
|
||||
? createBiasedLanguageModel(*decoder, *dialog)
|
||||
: createDefaultLanguageModel(*decoder));
|
||||
ps_set_lm(decoder.get(), "lm", languageModel.get());
|
||||
ps_set_search(decoder.get(), "lm");
|
||||
|
||||
return decoder;
|
||||
}
|
||||
|
||||
optional<Timeline<Phone>> getPhoneAlignment(
|
||||
|
@ -178,7 +168,7 @@ optional<Timeline<Phone>> getPhoneAlignment(
|
|||
// Process entire audio clip
|
||||
const int16* nextSample = audioBuffer.data();
|
||||
size_t remainingSamples = audioBuffer.size();
|
||||
bool fullUtterance = true;
|
||||
const bool fullUtterance = true;
|
||||
while (acmod_process_raw(acousticModel, &nextSample, &remainingSamples, fullUtterance) > 0) {
|
||||
while (acousticModel->n_feat_frame > 0) {
|
||||
ps_search_step(search.get(), acousticModel->output_frame);
|
||||
|
@ -197,7 +187,7 @@ optional<Timeline<Phone>> getPhoneAlignment(
|
|||
for (ps_alignment_iter_t* it = ps_alignment_phones(alignment.get()); it; it = ps_alignment_iter_next(it)) {
|
||||
// Get phone
|
||||
ps_alignment_entry_t* phoneEntry = ps_alignment_iter_get(it);
|
||||
s3cipid_t phoneId = phoneEntry->id.pid.cipid;
|
||||
const s3cipid_t phoneId = phoneEntry->id.pid.cipid;
|
||||
string phoneName = phoneNames[phoneId];
|
||||
|
||||
if (phoneName == "SIL") continue;
|
||||
|
@ -207,162 +197,42 @@ optional<Timeline<Phone>> getPhoneAlignment(
|
|||
centiseconds duration(phoneEntry->duration);
|
||||
Phone phone = PhoneConverter::get().parse(phoneName);
|
||||
if (phone == Phone::AH && duration < 6_cs) {
|
||||
// Heuristic: < 6_cs is schwa. Pocketsphinx doesn't differentiate.
|
||||
// Heuristic: < 6_cs is schwa. PocketSphinx doesn't differentiate.
|
||||
phone = Phone::Schwa;
|
||||
}
|
||||
Timed<Phone> timedPhone(start, start + duration, phone);
|
||||
const Timed<Phone> timedPhone(start, start + duration, phone);
|
||||
result.set(timedPhone);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
bool dictionaryContains(dict_t& dictionary, const string& word) {
|
||||
return dict_wordid(&dictionary, word.c_str()) != BAD_S3WID;
|
||||
}
|
||||
|
||||
void addMissingDictionaryWords(const vector<string>& words, ps_decoder_t& decoder) {
|
||||
map<string, string> missingPronunciations;
|
||||
for (const string& word : words) {
|
||||
if (!dictionaryContains(*decoder.dict, word)) {
|
||||
string pronunciation;
|
||||
for (Phone phone : wordToPhones(word)) {
|
||||
if (pronunciation.length() > 0) pronunciation += " ";
|
||||
pronunciation += PhoneConverter::get().toString(phone);
|
||||
}
|
||||
missingPronunciations[word] = pronunciation;
|
||||
}
|
||||
}
|
||||
for (auto it = missingPronunciations.begin(); it != missingPronunciations.end(); ++it) {
|
||||
bool isLast = it == --missingPronunciations.end();
|
||||
logging::infoFormat("Unknown word '{}'. Guessing pronunciation '{}'.", it->first, it->second);
|
||||
ps_add_word(&decoder, it->first.c_str(), it->second.c_str(), isLast);
|
||||
}
|
||||
}
|
||||
|
||||
lambda_unique_ptr<ngram_model_t> createDefaultLanguageModel(ps_decoder_t& decoder) {
|
||||
path modelPath = getSphinxModelDirectory() / "en-us.lm.bin";
|
||||
lambda_unique_ptr<ngram_model_t> result(
|
||||
ngram_model_read(decoder.config, modelPath.string().c_str(), NGRAM_AUTO, decoder.lmath),
|
||||
[](ngram_model_t* lm) { ngram_model_free(lm); });
|
||||
if (!result) {
|
||||
throw runtime_error(fmt::format("Error reading language model from {}.", modelPath));
|
||||
}
|
||||
|
||||
return std::move(result);
|
||||
}
|
||||
|
||||
lambda_unique_ptr<ngram_model_t> createDialogLanguageModel(ps_decoder_t& decoder, const string& dialog) {
|
||||
// Split dialog into normalized words
|
||||
vector<string> words = tokenizeText(dialog, [&](const string& word) { return dictionaryContains(*decoder.dict, word); });
|
||||
|
||||
// Add dialog-specific words to the dictionary
|
||||
addMissingDictionaryWords(words, decoder);
|
||||
|
||||
// Create dialog-specific language model
|
||||
words.insert(words.begin(), "<s>");
|
||||
words.push_back("</s>");
|
||||
return createLanguageModel(words, decoder);
|
||||
}
|
||||
|
||||
lambda_unique_ptr<ngram_model_t> createBiasedLanguageModel(ps_decoder_t& decoder, const string& dialog) {
|
||||
auto defaultLanguageModel = createDefaultLanguageModel(decoder);
|
||||
auto dialogLanguageModel = createDialogLanguageModel(decoder, dialog);
|
||||
constexpr int modelCount = 2;
|
||||
array<ngram_model_t*, modelCount> languageModels{ defaultLanguageModel.get(), dialogLanguageModel.get() };
|
||||
array<char*, modelCount> modelNames{ "defaultLM", "dialogLM" };
|
||||
array<float, modelCount> modelWeights{ 0.1f, 0.9f };
|
||||
lambda_unique_ptr<ngram_model_t> result(
|
||||
ngram_model_set_init(nullptr, languageModels.data(), modelNames.data(), modelWeights.data(), modelCount),
|
||||
[](ngram_model_t* lm) { ngram_model_free(lm); });
|
||||
if (!result) {
|
||||
throw runtime_error("Error creating biased language model.");
|
||||
}
|
||||
|
||||
return std::move(result);
|
||||
}
|
||||
|
||||
lambda_unique_ptr<ps_decoder_t> createDecoder(optional<string> dialog) {
|
||||
lambda_unique_ptr<cmd_ln_t> config(
|
||||
cmd_ln_init(
|
||||
nullptr, ps_args(), true,
|
||||
// Set acoustic model
|
||||
"-hmm", (getSphinxModelDirectory() / "acoustic-model").string().c_str(),
|
||||
// Set pronunciation dictionary
|
||||
"-dict", (getSphinxModelDirectory() / "cmudict-en-us.dict").string().c_str(),
|
||||
// Add noise against zero silence (see http://cmusphinx.sourceforge.net/wiki/faq#qwhy_my_accuracy_is_poor)
|
||||
"-dither", "yes",
|
||||
// Disable VAD -- we're doing that ourselves
|
||||
"-remove_silence", "no",
|
||||
// Perform per-utterance cepstral mean normalization
|
||||
"-cmn", "batch",
|
||||
nullptr),
|
||||
[](cmd_ln_t* config) { cmd_ln_free_r(config); });
|
||||
if (!config) throw runtime_error("Error creating configuration.");
|
||||
|
||||
lambda_unique_ptr<ps_decoder_t> decoder(
|
||||
ps_init(config.get()),
|
||||
[](ps_decoder_t* recognizer) { ps_free(recognizer); });
|
||||
if (!decoder) throw runtime_error("Error creating speech decoder.");
|
||||
|
||||
// Set language model
|
||||
lambda_unique_ptr<ngram_model_t> languageModel(dialog
|
||||
? createBiasedLanguageModel(*decoder, *dialog)
|
||||
: createDefaultLanguageModel(*decoder));
|
||||
ps_set_lm(decoder.get(), "lm", languageModel.get());
|
||||
ps_set_search(decoder.get(), "lm");
|
||||
|
||||
return decoder;
|
||||
}
|
||||
|
||||
JoiningTimeline<void> getNoiseSounds(TimeRange utteranceTimeRange, const Timeline<Phone>& phones) {
|
||||
JoiningTimeline<void> noiseSounds;
|
||||
|
||||
// Find utterance parts without recogniced phones
|
||||
noiseSounds.set(utteranceTimeRange);
|
||||
for (const auto& timedPhone : phones) {
|
||||
noiseSounds.clear(timedPhone.getTimeRange());
|
||||
}
|
||||
|
||||
// Remove undesired elements
|
||||
const centiseconds minSoundDuration = 12_cs;
|
||||
for (const auto& unknownSound : JoiningTimeline<void>(noiseSounds)) {
|
||||
bool startsAtZero = unknownSound.getStart() == 0_cs;
|
||||
bool tooShort = unknownSound.getDuration() < minSoundDuration;
|
||||
if (startsAtZero || tooShort) {
|
||||
noiseSounds.clear(unknownSound.getTimeRange());
|
||||
}
|
||||
}
|
||||
|
||||
return noiseSounds;
|
||||
}
|
||||
|
||||
// Some words have multiple pronunciations, one of which results in better animation than the others.
|
||||
// This function returns the optimal pronunciation for a select set of these words.
|
||||
string fixPronunciation(const string& word) {
|
||||
const static map<string, string> replacements {
|
||||
{"into(2)", "into"},
|
||||
{"to(2)", "to"},
|
||||
{"to(3)", "to"},
|
||||
{"today(2)", "today"},
|
||||
{"tomorrow(2)", "tomorrow"},
|
||||
{"tonight(2)", "tonight"}
|
||||
const static map<string, string> replacements{
|
||||
{ "into(2)", "into" },
|
||||
{ "to(2)", "to" },
|
||||
{ "to(3)", "to" },
|
||||
{ "today(2)", "today" },
|
||||
{ "tomorrow(2)", "tomorrow" },
|
||||
{ "tonight(2)", "tonight" }
|
||||
};
|
||||
|
||||
const auto pair = replacements.find(word);
|
||||
return pair != replacements.end() ? pair->second : word;
|
||||
}
|
||||
|
||||
Timeline<Phone> utteranceToPhones(
|
||||
static Timeline<Phone> utteranceToPhones(
|
||||
const AudioClip& audioClip,
|
||||
TimeRange utteranceTimeRange,
|
||||
ps_decoder_t& decoder,
|
||||
ProgressSink& utteranceProgressSink)
|
||||
{
|
||||
ProgressSink& utteranceProgressSink
|
||||
) {
|
||||
ProgressMerger utteranceProgressMerger(utteranceProgressSink);
|
||||
ProgressSink& wordRecognitionProgressSink = utteranceProgressMerger.addSink(1.0);
|
||||
ProgressSink& alignmentProgressSink = utteranceProgressMerger.addSink(0.5);
|
||||
|
||||
// Pad time range to give Pocketsphinx some breathing room
|
||||
// Pad time range to give PocketSphinx some breathing room
|
||||
TimeRange paddedTimeRange = utteranceTimeRange;
|
||||
const centiseconds padding(3);
|
||||
paddedTimeRange.grow(padding);
|
||||
|
@ -384,7 +254,7 @@ Timeline<Phone> utteranceToPhones(
|
|||
continue;
|
||||
}
|
||||
word = regex_replace(word, regex("\\(\\d\\)"), "");
|
||||
if (text.size() > 0) {
|
||||
if (!text.empty()) {
|
||||
text += " ";
|
||||
}
|
||||
text += word;
|
||||
|
@ -403,7 +273,7 @@ Timeline<Phone> utteranceToPhones(
|
|||
const string fixedWord = fixPronunciation(timedWord.getValue());
|
||||
wordIds.push_back(getWordId(fixedWord, *decoder.dict));
|
||||
}
|
||||
if (wordIds.empty()) return {};
|
||||
if (wordIds.empty()) return{};
|
||||
|
||||
// Align the words' phones with speech
|
||||
#if BOOST_VERSION < 105600 // Support legacy syntax
|
||||
|
@ -433,77 +303,11 @@ Timeline<Phone> utteranceToPhones(
|
|||
return utterancePhones;
|
||||
}
|
||||
|
||||
BoundedTimeline<Phone> recognizePhones(
|
||||
BoundedTimeline<Phone> PocketSphinxRecognizer::recognizePhones(
|
||||
const AudioClip& inputAudioClip,
|
||||
optional<string> dialog,
|
||||
optional<std::string> dialog,
|
||||
int maxThreadCount,
|
||||
ProgressSink& progressSink)
|
||||
{
|
||||
ProgressMerger totalProgressMerger(progressSink);
|
||||
ProgressSink& voiceActivationProgressSink = totalProgressMerger.addSink(1.0);
|
||||
ProgressSink& dialogProgressSink = totalProgressMerger.addSink(15);
|
||||
|
||||
// Make sure audio stream has no DC offset
|
||||
const unique_ptr<AudioClip> audioClip = inputAudioClip.clone() | removeDcOffset();
|
||||
|
||||
// Split audio into utterances
|
||||
JoiningBoundedTimeline<void> utterances;
|
||||
try {
|
||||
utterances = detectVoiceActivity(*audioClip, maxThreadCount, voiceActivationProgressSink);
|
||||
}
|
||||
catch (...) {
|
||||
std::throw_with_nested(runtime_error("Error detecting segments of speech."));
|
||||
}
|
||||
|
||||
// Discard Pocketsphinx output
|
||||
err_set_logfp(nullptr);
|
||||
|
||||
// Redirect Pocketsphinx output to log
|
||||
err_set_callback(sphinxLogCallback, nullptr);
|
||||
|
||||
// Prepare pool of decoders
|
||||
ObjectPool<ps_decoder_t, lambda_unique_ptr<ps_decoder_t>> decoderPool(
|
||||
[&dialog] { return createDecoder(dialog); });
|
||||
|
||||
BoundedTimeline<Phone> phones(audioClip->getTruncatedRange());
|
||||
std::mutex resultMutex;
|
||||
auto processUtterance = [&](Timed<void> timedUtterance, ProgressSink& utteranceProgressSink) {
|
||||
// Detect phones for utterance
|
||||
auto decoder = decoderPool.acquire();
|
||||
Timeline<Phone> utterancePhones =
|
||||
utteranceToPhones(*audioClip, timedUtterance.getTimeRange(), *decoder, utteranceProgressSink);
|
||||
|
||||
// Copy phones to result timeline
|
||||
std::lock_guard<std::mutex> lock(resultMutex);
|
||||
for (const auto& timedPhone : utterancePhones) {
|
||||
phones.set(timedPhone);
|
||||
}
|
||||
};
|
||||
|
||||
auto getUtteranceProgressWeight = [](const Timed<void> timedUtterance) {
|
||||
return timedUtterance.getDuration().count();
|
||||
};
|
||||
|
||||
// Perform speech recognition
|
||||
try {
|
||||
// Determine how many parallel threads to use
|
||||
int threadCount = std::min({
|
||||
maxThreadCount,
|
||||
// Don't use more threads than there are utterances to be processed
|
||||
static_cast<int>(utterances.size()),
|
||||
// Don't waste time creating additional threads (and decoders!) if the recording is short
|
||||
static_cast<int>(duration_cast<std::chrono::seconds>(audioClip->getTruncatedRange().getDuration()).count() / 5)
|
||||
});
|
||||
if (threadCount < 1) {
|
||||
threadCount = 1;
|
||||
}
|
||||
logging::debugFormat("Speech recognition using {} threads -- start", threadCount);
|
||||
runParallel(processUtterance, utterances, threadCount, dialogProgressSink, getUtteranceProgressWeight);
|
||||
logging::debug("Speech recognition -- end");
|
||||
}
|
||||
catch (...) {
|
||||
std::throw_with_nested(runtime_error("Error performing speech recognition via Pocketsphinx."));
|
||||
}
|
||||
|
||||
return phones;
|
||||
ProgressSink& progressSink
|
||||
) const {
|
||||
return ::recognizePhones(inputAudioClip, dialog, &createDecoder, &utteranceToPhones, maxThreadCount, progressSink);
|
||||
}
|
|
@ -0,0 +1,14 @@
|
|||
#pragma once
|
||||
|
||||
#include "Recognizer.h"
|
||||
#include "pocketSphinxTools.h"
|
||||
|
||||
class PocketSphinxRecognizer : public Recognizer {
|
||||
public:
|
||||
BoundedTimeline<Phone> recognizePhones(
|
||||
const AudioClip& inputAudioClip,
|
||||
boost::optional<std::string> dialog,
|
||||
int maxThreadCount,
|
||||
ProgressSink& progressSink
|
||||
) const override;
|
||||
};
|
|
@ -0,0 +1,18 @@
|
|||
#pragma once
|
||||
|
||||
#include "audio/AudioClip.h"
|
||||
#include "core/Phone.h"
|
||||
#include "tools/ProgressBar.h"
|
||||
#include "time/BoundedTimeline.h"
|
||||
|
||||
class Recognizer {
|
||||
public:
|
||||
virtual ~Recognizer() = default;
|
||||
|
||||
virtual BoundedTimeline<Phone>recognizePhones(
|
||||
const AudioClip& audioClip,
|
||||
boost::optional<std::string> dialog,
|
||||
int maxThreadCount,
|
||||
ProgressSink& progressSink
|
||||
) const = 0;
|
||||
};
|
|
@ -1,12 +0,0 @@
|
|||
#pragma once
|
||||
|
||||
#include "audio/AudioClip.h"
|
||||
#include "core/Phone.h"
|
||||
#include "tools/ProgressBar.h"
|
||||
#include "time/BoundedTimeline.h"
|
||||
|
||||
BoundedTimeline<Phone> recognizePhones(
|
||||
const AudioClip& audioClip,
|
||||
boost::optional<std::string> dialog,
|
||||
int maxThreadCount,
|
||||
ProgressSink& progressSink);
|
|
@ -0,0 +1,218 @@
|
|||
#include "pocketSphinxTools.h"
|
||||
|
||||
#include "tools/platformTools.h"
|
||||
#include <regex>
|
||||
#include "audio/DcOffset.h"
|
||||
#include "audio/voiceActivityDetection.h"
|
||||
#include "tools/parallel.h"
|
||||
#include "tools/ObjectPool.h"
|
||||
#include "time/timedLogging.h"
|
||||
|
||||
extern "C" {
|
||||
#include <sphinxbase/err.h>
|
||||
#include <pocketsphinx_internal.h>
|
||||
#include <ngram_search.h>
|
||||
}
|
||||
|
||||
using std::runtime_error;
|
||||
using std::invalid_argument;
|
||||
using std::unique_ptr;
|
||||
using std::string;
|
||||
using std::vector;
|
||||
using boost::filesystem::path;
|
||||
using std::regex;
|
||||
using boost::optional;
|
||||
using std::chrono::duration_cast;
|
||||
|
||||
logging::Level convertSphinxErrorLevel(err_lvl_t errorLevel) {
|
||||
switch (errorLevel) {
|
||||
case ERR_DEBUG:
|
||||
case ERR_INFO:
|
||||
case ERR_INFOCONT:
|
||||
return logging::Level::Trace;
|
||||
case ERR_WARN:
|
||||
return logging::Level::Warn;
|
||||
case ERR_ERROR:
|
||||
return logging::Level::Error;
|
||||
case ERR_FATAL:
|
||||
return logging::Level::Fatal;
|
||||
default:
|
||||
throw invalid_argument("Unknown log level.");
|
||||
}
|
||||
}
|
||||
|
||||
void sphinxLogCallback(void* user_data, err_lvl_t errorLevel, const char* format, ...) {
|
||||
UNUSED(user_data);
|
||||
|
||||
// Create varArgs list
|
||||
va_list args;
|
||||
va_start(args, format);
|
||||
auto _ = gsl::finally([&args]() { va_end(args); });
|
||||
|
||||
// Format message
|
||||
const int initialSize = 256;
|
||||
vector<char> chars(initialSize);
|
||||
bool success = false;
|
||||
while (!success) {
|
||||
const int charsWritten = vsnprintf(chars.data(), chars.size(), format, args);
|
||||
if (charsWritten < 0) throw runtime_error("Error formatting PocketSphinx log message.");
|
||||
|
||||
success = charsWritten < static_cast<int>(chars.size());
|
||||
if (!success) chars.resize(chars.size() * 2);
|
||||
}
|
||||
const regex waste("^(DEBUG|INFO|INFOCONT|WARN|ERROR|FATAL): ");
|
||||
string message = std::regex_replace(chars.data(), waste, "", std::regex_constants::format_first_only);
|
||||
boost::algorithm::trim(message);
|
||||
|
||||
const logging::Level logLevel = convertSphinxErrorLevel(errorLevel);
|
||||
logging::log(logLevel, message);
|
||||
}
|
||||
|
||||
void redirectPocketSphinxOutput() {
|
||||
static bool redirected = false;
|
||||
if (redirected) return;
|
||||
|
||||
// Discard PocketSphinx output
|
||||
err_set_logfp(nullptr);
|
||||
|
||||
// Redirect PocketSphinx output to log
|
||||
err_set_callback(sphinxLogCallback, nullptr);
|
||||
|
||||
redirected = true;
|
||||
}
|
||||
|
||||
BoundedTimeline<Phone> recognizePhones(
|
||||
const AudioClip& inputAudioClip,
|
||||
optional<std::string> dialog,
|
||||
decoderFactory createDecoder,
|
||||
utteranceToPhonesFunction utteranceToPhones,
|
||||
int maxThreadCount,
|
||||
ProgressSink& progressSink
|
||||
) {
|
||||
ProgressMerger totalProgressMerger(progressSink);
|
||||
ProgressSink& voiceActivationProgressSink = totalProgressMerger.addSink(1.0);
|
||||
ProgressSink& dialogProgressSink = totalProgressMerger.addSink(15);
|
||||
|
||||
// Make sure audio stream has no DC offset
|
||||
const unique_ptr<AudioClip> audioClip = inputAudioClip.clone() | removeDcOffset();
|
||||
|
||||
// Split audio into utterances
|
||||
JoiningBoundedTimeline<void> utterances;
|
||||
try {
|
||||
utterances = detectVoiceActivity(*audioClip, maxThreadCount, voiceActivationProgressSink);
|
||||
} catch (...) {
|
||||
std::throw_with_nested(runtime_error("Error detecting segments of speech."));
|
||||
}
|
||||
|
||||
redirectPocketSphinxOutput();
|
||||
|
||||
// Prepare pool of decoders
|
||||
ObjectPool<ps_decoder_t, lambda_unique_ptr<ps_decoder_t>> decoderPool(
|
||||
[&] { return createDecoder(dialog); });
|
||||
|
||||
BoundedTimeline<Phone> phones(audioClip->getTruncatedRange());
|
||||
std::mutex resultMutex;
|
||||
const auto processUtterance = [&](Timed<void> timedUtterance, ProgressSink& utteranceProgressSink) {
|
||||
// Detect phones for utterance
|
||||
const auto decoder = decoderPool.acquire();
|
||||
Timeline<Phone> utterancePhones =
|
||||
utteranceToPhones(*audioClip, timedUtterance.getTimeRange(), *decoder, utteranceProgressSink);
|
||||
|
||||
// Copy phones to result timeline
|
||||
std::lock_guard<std::mutex> lock(resultMutex);
|
||||
for (const auto& timedPhone : utterancePhones) {
|
||||
phones.set(timedPhone);
|
||||
}
|
||||
};
|
||||
|
||||
const auto getUtteranceProgressWeight = [](const Timed<void> timedUtterance) {
|
||||
return timedUtterance.getDuration().count();
|
||||
};
|
||||
|
||||
// Perform speech recognition
|
||||
try {
|
||||
// Determine how many parallel threads to use
|
||||
int threadCount = std::min({
|
||||
maxThreadCount,
|
||||
// Don't use more threads than there are utterances to be processed
|
||||
static_cast<int>(utterances.size()),
|
||||
// Don't waste time creating additional threads (and decoders!) if the recording is short
|
||||
static_cast<int>(duration_cast<std::chrono::seconds>(audioClip->getTruncatedRange().getDuration()).count() / 5)
|
||||
});
|
||||
if (threadCount < 1) {
|
||||
threadCount = 1;
|
||||
}
|
||||
logging::debugFormat("Speech recognition using {} threads -- start", threadCount);
|
||||
runParallel(processUtterance, utterances, threadCount, dialogProgressSink, getUtteranceProgressWeight);
|
||||
logging::debug("Speech recognition -- end");
|
||||
} catch (...) {
|
||||
std::throw_with_nested(runtime_error("Error performing speech recognition via PocketSphinx."));
|
||||
}
|
||||
|
||||
return phones;
|
||||
}
|
||||
|
||||
const path& getSphinxModelDirectory() {
|
||||
static path sphinxModelDirectory(getBinDirectory() / "res" / "sphinx");
|
||||
return sphinxModelDirectory;
|
||||
}
|
||||
|
||||
JoiningTimeline<void> getNoiseSounds(TimeRange utteranceTimeRange, const Timeline<Phone>& phones) {
|
||||
JoiningTimeline<void> noiseSounds;
|
||||
|
||||
// Find utterance parts without recognized phones
|
||||
noiseSounds.set(utteranceTimeRange);
|
||||
for (const auto& timedPhone : phones) {
|
||||
noiseSounds.clear(timedPhone.getTimeRange());
|
||||
}
|
||||
|
||||
// Remove undesired elements
|
||||
const centiseconds minSoundDuration = 12_cs;
|
||||
for (const auto& unknownSound : JoiningTimeline<void>(noiseSounds)) {
|
||||
const bool startsAtZero = unknownSound.getStart() == 0_cs;
|
||||
const bool tooShort = unknownSound.getDuration() < minSoundDuration;
|
||||
if (startsAtZero || tooShort) {
|
||||
noiseSounds.clear(unknownSound.getTimeRange());
|
||||
}
|
||||
}
|
||||
|
||||
return noiseSounds;
|
||||
}
|
||||
|
||||
BoundedTimeline<string> recognizeWords(const vector<int16_t>& audioBuffer, ps_decoder_t& decoder) {
|
||||
// Restart timing at 0
|
||||
ps_start_stream(&decoder);
|
||||
|
||||
// Start recognition
|
||||
int error = ps_start_utt(&decoder);
|
||||
if (error) throw runtime_error("Error starting utterance processing for word recognition.");
|
||||
|
||||
// Process entire audio clip
|
||||
const bool noRecognition = false;
|
||||
const bool fullUtterance = true;
|
||||
const int searchedFrameCount =
|
||||
ps_process_raw(&decoder, audioBuffer.data(), audioBuffer.size(), noRecognition, fullUtterance);
|
||||
if (searchedFrameCount < 0) {
|
||||
throw runtime_error("Error analyzing raw audio data for word recognition.");
|
||||
}
|
||||
|
||||
// End recognition
|
||||
error = ps_end_utt(&decoder);
|
||||
if (error) throw runtime_error("Error ending utterance processing for word recognition.");
|
||||
|
||||
BoundedTimeline<string> result(TimeRange(0_cs, centiseconds(100 * audioBuffer.size() / sphinxSampleRate)));
|
||||
const bool noWordsRecognized = reinterpret_cast<ngram_search_t*>(decoder.search)->bpidx == 0;
|
||||
if (noWordsRecognized) {
|
||||
return result;
|
||||
}
|
||||
|
||||
// Collect words
|
||||
for (ps_seg_t* it = ps_seg_iter(&decoder); it; it = ps_seg_next(it)) {
|
||||
const char* word = ps_seg_word(it);
|
||||
int firstFrame, lastFrame;
|
||||
ps_seg_frames(it, &firstFrame, &lastFrame);
|
||||
result.set(centiseconds(firstFrame), centiseconds(lastFrame + 1), word);
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
|
@ -0,0 +1,39 @@
|
|||
#pragma once
|
||||
|
||||
#include "time/BoundedTimeline.h"
|
||||
#include "core/Phone.h"
|
||||
#include "audio/AudioClip.h"
|
||||
#include "tools/ProgressBar.h"
|
||||
#include <boost/filesystem/path.hpp>
|
||||
|
||||
extern "C" {
|
||||
#include <pocketsphinx.h>
|
||||
}
|
||||
|
||||
typedef std::function<lambda_unique_ptr<ps_decoder_t>(
|
||||
boost::optional<std::string> dialog
|
||||
)> decoderFactory;
|
||||
|
||||
typedef std::function<Timeline<Phone>(
|
||||
const AudioClip& audioClip,
|
||||
TimeRange utteranceTimeRange,
|
||||
ps_decoder_t& decoder,
|
||||
ProgressSink& utteranceProgressSink
|
||||
)> utteranceToPhonesFunction;
|
||||
|
||||
BoundedTimeline<Phone> recognizePhones(
|
||||
const AudioClip& inputAudioClip,
|
||||
boost::optional<std::string> dialog,
|
||||
decoderFactory createDecoder,
|
||||
utteranceToPhonesFunction utteranceToPhones,
|
||||
int maxThreadCount,
|
||||
ProgressSink& progressSink
|
||||
);
|
||||
|
||||
constexpr int sphinxSampleRate = 16000;
|
||||
|
||||
const boost::filesystem::path& getSphinxModelDirectory();
|
||||
|
||||
JoiningTimeline<void> getNoiseSounds(TimeRange utteranceTimeRange, const Timeline<Phone>& phones);
|
||||
|
||||
BoundedTimeline<std::string> recognizeWords(const std::vector<int16_t>& audioBuffer, ps_decoder_t& decoder);
|
|
@ -0,0 +1,27 @@
|
|||
#include "RecognizerType.h"
|
||||
|
||||
using std::string;
|
||||
|
||||
RecognizerTypeConverter& RecognizerTypeConverter::get() {
|
||||
static RecognizerTypeConverter converter;
|
||||
return converter;
|
||||
}
|
||||
|
||||
string RecognizerTypeConverter::getTypeName() {
|
||||
return "RecognizerType";
|
||||
}
|
||||
|
||||
EnumConverter<RecognizerType>::member_data RecognizerTypeConverter::getMemberData() {
|
||||
return member_data{
|
||||
{ RecognizerType::PocketSphinx, "pocketSphinx" },
|
||||
{ RecognizerType::Phonetic, "phonetic" }
|
||||
};
|
||||
}
|
||||
|
||||
std::ostream& operator<<(std::ostream& stream, RecognizerType value) {
|
||||
return RecognizerTypeConverter::get().write(stream, value);
|
||||
}
|
||||
|
||||
std::istream& operator>>(std::istream& stream, RecognizerType& value) {
|
||||
return RecognizerTypeConverter::get().read(stream, value);
|
||||
}
|
|
@ -0,0 +1,20 @@
|
|||
#pragma once
|
||||
|
||||
#include "tools/EnumConverter.h"
|
||||
|
||||
enum class RecognizerType {
|
||||
PocketSphinx,
|
||||
Phonetic
|
||||
};
|
||||
|
||||
class RecognizerTypeConverter : public EnumConverter<RecognizerType> {
|
||||
public:
|
||||
static RecognizerTypeConverter& get();
|
||||
protected:
|
||||
std::string getTypeName() override;
|
||||
member_data getMemberData() override;
|
||||
};
|
||||
|
||||
std::ostream& operator<<(std::ostream& stream, RecognizerType value);
|
||||
|
||||
std::istream& operator>>(std::istream& stream, RecognizerType& value);
|
|
@ -27,6 +27,9 @@
|
|||
#include "tools/platformTools.h"
|
||||
#include "sinks.h"
|
||||
#include "semanticEntries.h"
|
||||
#include "RecognizerType.h"
|
||||
#include "recognition/PocketSphinxRecognizer.h"
|
||||
#include "recognition/PhoneticRecognizer.h"
|
||||
|
||||
using std::exception;
|
||||
using std::string;
|
||||
|
@ -36,9 +39,6 @@ using std::unique_ptr;
|
|||
using std::make_unique;
|
||||
using std::shared_ptr;
|
||||
using std::make_shared;
|
||||
using std::map;
|
||||
using std::chrono::duration;
|
||||
using std::chrono::duration_cast;
|
||||
using std::ofstream;
|
||||
using boost::filesystem::path;
|
||||
using boost::adaptors::transformed;
|
||||
|
@ -56,6 +56,10 @@ namespace TCLAP {
|
|||
struct ArgTraits<ExportFormat> {
|
||||
typedef ValueLike ValueCategory;
|
||||
};
|
||||
// Tells TCLAP to parse RecognizerType command-line arguments like a value type,
// i.e. via the operator>> overload declared in RecognizerType.h.
// (typedef kept, matching the ArgTraits<ExportFormat> specialization above.)
template<>
struct ArgTraits<RecognizerType> {
	typedef ValueLike ValueCategory;
};
|
||||
}
|
||||
|
||||
shared_ptr<logging::Sink> createFileSink(path path, logging::Level minLevel) {
|
||||
|
@ -66,6 +70,17 @@ shared_ptr<logging::Sink> createFileSink(path path, logging::Level minLevel) {
|
|||
return make_shared<logging::LevelFilter>(FileSink, minLevel);
|
||||
}
|
||||
|
||||
// Factory: instantiates the speech recognizer matching the given type.
// Throws std::runtime_error if the enum value is not handled here.
unique_ptr<Recognizer> createRecognizer(RecognizerType recognizerType) {
	if (recognizerType == RecognizerType::PocketSphinx) {
		return make_unique<PocketSphinxRecognizer>();
	}
	if (recognizerType == RecognizerType::Phonetic) {
		return make_unique<PhoneticRecognizer>();
	}
	throw std::runtime_error("Unknown recognizer.");
}
|
||||
|
||||
unique_ptr<Exporter> createExporter(ExportFormat exportFormat) {
|
||||
switch (exportFormat) {
|
||||
case ExportFormat::Tsv:
|
||||
|
@ -123,6 +138,9 @@ int main(int platformArgc, char *platformArgv[]) {
|
|||
auto exportFormats = vector<ExportFormat>(ExportFormatConverter::get().getValues());
|
||||
tclap::ValuesConstraint<ExportFormat> exportFormatConstraint(exportFormats);
|
||||
tclap::ValueArg<ExportFormat> exportFormat("f", "exportFormat", "The export format.", false, ExportFormat::Tsv, &exportFormatConstraint, cmd);
|
||||
auto recognizerTypes = vector<RecognizerType>(RecognizerTypeConverter::get().getValues());
|
||||
tclap::ValuesConstraint<RecognizerType> recognizerConstraint(recognizerTypes);
|
||||
tclap::ValueArg<RecognizerType> recognizerType("r", "recognizer", "The dialog recognizer.", false, RecognizerType::PocketSphinx, &recognizerConstraint, cmd);
|
||||
tclap::UnlabeledValueArg<string> inputFileName("inputFile", "The input file. Must be a sound file in WAVE format.", true, "", "string", cmd);
|
||||
|
||||
try {
|
||||
|
@ -169,6 +187,7 @@ int main(int platformArgc, char *platformArgv[]) {
|
|||
JoiningContinuousTimeline<Shape> animation = animateWaveFile(
|
||||
inputFilePath,
|
||||
dialogFile.isSet() ? readUtf8File(path(dialogFile.getValue())) : boost::optional<string>(),
|
||||
*createRecognizer(recognizerType.getValue()),
|
||||
targetShapeSet,
|
||||
maxThreadCount.getValue(),
|
||||
progressSink);
|
||||
|
|
Loading…
Reference in New Issue