diff --git a/CHANGELOG.md b/CHANGELOG.md
index 516cbe2..c466bc4 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,9 @@
# Version history
+## Unreleased
+
+* **Added** basic support for non-English recordings through phonetic recognition ([issue #45](https://github.com/DanielSWolf/rhubarb-lip-sync/issues/45)).
+
## Version 1.8.0
* **Added** support for Ogg Vorbis (.ogg) file format ([issue #40](https://github.com/DanielSWolf/rhubarb-lip-sync/issues/40)).
diff --git a/README.adoc b/README.adoc
index fb08c03..e0f4646 100644
--- a/README.adoc
+++ b/README.adoc
@@ -123,6 +123,11 @@ The following command-line options are the most common:
| _<input file>_
| The audio file to be analyzed. This must be the last command-line argument. Supported file formats are WAVE (.wav) and Ogg Vorbis (.ogg).
+| `-r` _<recognizer>_, `--recognizer` _<recognizer>_
+| Specifies how Rhubarb Lip Sync recognizes speech within the recording. Options: `pocketSphinx` (use for English recordings), `phonetic` (use for non-English recordings). For details, see <<recognizers>>.
+
+_Default value: ``pocketSphinx``_
+
| `-f` _<format>_, `--exportFormat` _<format>_
| The export format. Options: `tsv` (tab-separated values, see <<tsv>>), `xml` (see <<xml>>), `json` (see <<json>>).
@@ -192,6 +197,19 @@ Note that for short audio files, Rhubarb Lip Sync may choose to use fewer thread
_Default value: as many threads as your CPU has cores_
|===
+[[recognizers]]
+== Recognizers
+
+The first step in processing an audio file is determining what is being said. More specifically, Rhubarb Lip Sync uses speech recognition to figure out which sound is spoken at which point in time. You can choose between two recognizers:
+
+=== PocketSphinx
+
+PocketSphinx is an open-source speech recognition library that generally gives good results. This is the default recognizer. The downside is that PocketSphinx only recognizes English dialog, so it is a poor choice for recordings in any other language.
+
+=== Phonetic
+
+Rhubarb Lip Sync also comes with a phonetic recognizer. _Phonetic_ means that this recognizer won't try to understand entire (English) words and phrases. Instead, it will recognize individual sounds and syllables. The results are usually less precise than those from the PocketSphinx recognizer. The advantage is that this recognizer is language-independent. Use it if your recordings are not in English.
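+
+For example, the following command animates a non-English recording using the phonetic recognizer (a minimal sketch; the file name is a placeholder):
+
+----
+rhubarb --recognizer phonetic --exportFormat json dialog-german.ogg
+----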
+
[[outputFormats]]
== Output formats
diff --git a/appInfo.cmake b/appInfo.cmake
index 4f7dc9d..ed17020 100644
--- a/appInfo.cmake
+++ b/appInfo.cmake
@@ -2,7 +2,7 @@ cmake_minimum_required(VERSION 3.2)
set(appName "Rhubarb Lip Sync")
set(appVersionMajor 1)
-set(appVersionMinor 8)
+set(appVersionMinor 9)
set(appVersionPatch 0)
-set(appVersionSuffix "")
+set(appVersionSuffix "-pre.1")
set(appVersion "${appVersionMajor}.${appVersionMinor}.${appVersionPatch}${appVersionSuffix}")
diff --git a/extras/AdobeAfterEffects/Rhubarb Lip Sync.jsx b/extras/AdobeAfterEffects/Rhubarb Lip Sync.jsx
index d8dbfcd..a8c8152 100644
--- a/extras/AdobeAfterEffects/Rhubarb Lip Sync.jsx
+++ b/extras/AdobeAfterEffects/Rhubarb Lip Sync.jsx
@@ -323,6 +323,12 @@ function createDialogWindow() {
+ 'your After Effects project.'
})
}),
+ recognizer: Group({
+ label: StaticText({ text: 'Recognizer:' }),
+ value: DropDownList({
+ helpTip: 'The dialog recognizer.'
+ })
+ }),
dialogText: Group({
label: StaticText({ text: 'Dialog text (optional):' }),
value: EditText({
@@ -384,6 +390,7 @@ function createDialogWindow() {
var controls = {
audioFile: window.settings.audioFile.value,
dialogText: window.settings.dialogText.value,
+ recognizer: window.settings.recognizer.value,
mouthComp: window.settings.mouthComp.value,
targetFolder: window.settings.targetFolder.value,
frameRate: window.settings.frameRate.value,
@@ -402,6 +409,16 @@ function createDialogWindow() {
listItem.projectItem = projectItem;
});
+ // Add recognizer options
+ const recognizerOptions = [
+ { text: 'PocketSphinx (use for English recordings)', value: 'pocketSphinx' },
+ { text: 'Phonetic (use for non-English recordings)', value: 'phonetic' }
+ ];
+ recognizerOptions.forEach(function(option) {
+ var listItem = controls.recognizer.add('item', option.text);
+ listItem.value = option.value;
+ });
+
// Add mouth composition options
var comps = toArrayBase1(app.project.items).filter(function (item) {
return item instanceof CompItem;
@@ -425,6 +442,7 @@ function createDialogWindow() {
var settings = readSettingsFile();
selectByTextOrFirst(controls.audioFile, settings.audioFile);
controls.dialogText.text = settings.dialogText || '';
+ selectByTextOrFirst(controls.recognizer, settings.recognizer);
selectByTextOrFirst(controls.mouthComp, settings.mouthComp);
extendedMouthShapeNames.forEach(function(shapeName) {
controls['mouthShape' + shapeName].value =
@@ -484,6 +502,7 @@ function createDialogWindow() {
// Store settings
var settings = {
audioFile: (controls.audioFile.selection || {}).text,
+ recognizer: (controls.recognizer.selection || {}).text,
dialogText: controls.dialogText.text,
mouthComp: (controls.mouthComp.selection || {}).text,
extendedMouthShapes: {},
@@ -543,7 +562,7 @@ function createDialogWindow() {
// Check for correct Rhubarb version
var version = exec(rhubarbPath + ' --version') || '';
- var match = version.match(/Rhubarb Lip Sync version ((\d+)\.(\d+).(\d+))/);
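+	// Also match an optional pre-release suffix such as "-pre.1"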
+ var match = version.match(/Rhubarb Lip Sync version ((\d+)\.(\d+).(\d+)(-[0-9A-Za-z-.]+)?)/);
if (!match) {
var instructions = osIsWindows
? 'Make sure your PATH environment variable contains the ' + appName + ' '
@@ -555,13 +574,16 @@ function createDialogWindow() {
var versionString = match[1];
var major = Number(match[2]);
var minor = Number(match[3]);
- if (major != 1 || minor < 6) {
- return 'This script requires ' + appName + ' 1.6.0 or a later 1.x version. '
+ var requiredMajor = 1;
+ var minRequiredMinor = 9;
+ if (major != requiredMajor || minor < minRequiredMinor) {
+ return 'This script requires ' + appName + ' ' + requiredMajor + '.' + minRequiredMinor
+ + '.0 or a later ' + requiredMajor + '.x version. '
+ 'Your installed version is ' + versionString + ', which is not compatible.';
}
}
- function generateMouthCues(audioFileFootage, dialogText, mouthComp, extendedMouthShapeNames,
+ function generateMouthCues(audioFileFootage, recognizer, dialogText, mouthComp, extendedMouthShapeNames,
targetProjectFolder, frameRate)
{
var basePath = Folder.temp.fsName + '/' + createGuid();
@@ -575,6 +597,7 @@ function createDialogWindow() {
// Create command line
var commandLine = rhubarbPath
+ ' --dialogFile ' + cliEscape(dialogFile.fsName)
+ + ' --recognizer ' + recognizer
+ ' --exportFormat json'
+ ' --extendedShapes ' + cliEscape(extendedMouthShapeNames.join(''))
+ ' --logFile ' + cliEscape(logFile.fsName)
@@ -660,11 +683,11 @@ function createDialogWindow() {
}
}
- function animate(audioFileFootage, dialogText, mouthComp, extendedMouthShapeNames,
+ function animate(audioFileFootage, recognizer, dialogText, mouthComp, extendedMouthShapeNames,
targetProjectFolder, frameRate)
{
try {
- var mouthCues = generateMouthCues(audioFileFootage, dialogText, mouthComp,
+ var mouthCues = generateMouthCues(audioFileFootage, recognizer, dialogText, mouthComp,
extendedMouthShapeNames, targetProjectFolder, frameRate);
app.beginUndoGroup(appName + ': Animation');
@@ -680,6 +703,7 @@ function createDialogWindow() {
// Handle changes
update();
controls.audioFile.onChange = update;
+ controls.recognizer.onChange = update;
controls.dialogText.onChanging = update;
controls.mouthComp.onChange = update;
extendedMouthShapeNames.forEach(function(shapeName) {
@@ -700,6 +724,7 @@ function createDialogWindow() {
window.close();
animate(
controls.audioFile.selection.projectItem,
+ controls.recognizer.selection.value,
controls.dialogText.text || '',
controls.mouthComp.selection.projectItem,
extendedMouthShapeNames.filter(function(shapeName) {
diff --git a/extras/EsotericSoftwareSpine/src/main/kotlin/com/rhubarb_lip_sync/rhubarb_for_spine/AudioFileModel.kt b/extras/EsotericSoftwareSpine/src/main/kotlin/com/rhubarb_lip_sync/rhubarb_for_spine/AudioFileModel.kt
index e9f41c6..632782c 100644
--- a/extras/EsotericSoftwareSpine/src/main/kotlin/com/rhubarb_lip_sync/rhubarb_for_spine/AudioFileModel.kt
+++ b/extras/EsotericSoftwareSpine/src/main/kotlin/com/rhubarb_lip_sync/rhubarb_for_spine/AudioFileModel.kt
@@ -141,11 +141,12 @@ class AudioFileModel(
private fun startAnimation() {
val wrapperTask = Runnable {
+ val recognizer = parentModel.parentModel.recognizer.value
val extendedMouthShapes = parentModel.mouthShapes.filter { it.isExtended }.toSet()
val reportProgress: (Double?) -> Unit = {
progress -> runAndWait { this@AudioFileModel.animationProgress = progress }
}
- val rhubarbTask = RhubarbTask(audioFilePath, dialog, extendedMouthShapes, reportProgress)
+ val rhubarbTask = RhubarbTask(audioFilePath, recognizer, dialog, extendedMouthShapes, reportProgress)
try {
try {
val result = rhubarbTask.call()
diff --git a/extras/EsotericSoftwareSpine/src/main/kotlin/com/rhubarb_lip_sync/rhubarb_for_spine/MainModel.kt b/extras/EsotericSoftwareSpine/src/main/kotlin/com/rhubarb_lip_sync/rhubarb_for_spine/MainModel.kt
index 9010146..6378aad 100644
--- a/extras/EsotericSoftwareSpine/src/main/kotlin/com/rhubarb_lip_sync/rhubarb_for_spine/MainModel.kt
+++ b/extras/EsotericSoftwareSpine/src/main/kotlin/com/rhubarb_lip_sync/rhubarb_for_spine/MainModel.kt
@@ -2,6 +2,8 @@ package com.rhubarb_lip_sync.rhubarb_for_spine
import javafx.beans.property.SimpleObjectProperty
import javafx.beans.property.SimpleStringProperty
+import javafx.collections.FXCollections
+import javafx.collections.ObservableList
import tornadofx.FX
import tornadofx.getValue
import tornadofx.setValue
@@ -40,6 +42,15 @@ class MainModel(private val executor: ExecutorService) {
var animationFileModel by animationFileModelProperty
private set
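+
+	// The dialog recognizers the user can choose from; the first entry is the default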
+	val recognizersProperty = SimpleObjectProperty<ObservableList<Recognizer>>(FXCollections.observableArrayList(
+ Recognizer("pocketSphinx", "PocketSphinx (use for English recordings)"),
+ Recognizer("phonetic", "Phonetic (use for non-English recordings)")
+ ))
+	private var recognizers: ObservableList<Recognizer> by recognizersProperty
+
+ val recognizerProperty = SimpleObjectProperty(recognizers[0])
+ var recognizer: Recognizer by recognizerProperty
+
val animationPrefixProperty = SimpleStringProperty("say_")
var animationPrefix: String by animationPrefixProperty
@@ -47,4 +58,6 @@ class MainModel(private val executor: ExecutorService) {
var animationSuffix: String by animationSuffixProperty
private fun getDefaultPathString() = FX.application.parameters.raw.firstOrNull()
-}
\ No newline at end of file
+}
+
+class Recognizer(val value: String, val description: String)
diff --git a/extras/EsotericSoftwareSpine/src/main/kotlin/com/rhubarb_lip_sync/rhubarb_for_spine/MainView.kt b/extras/EsotericSoftwareSpine/src/main/kotlin/com/rhubarb_lip_sync/rhubarb_for_spine/MainView.kt
index 13e2316..7a67e91 100644
--- a/extras/EsotericSoftwareSpine/src/main/kotlin/com/rhubarb_lip_sync/rhubarb_for_spine/MainView.kt
+++ b/extras/EsotericSoftwareSpine/src/main/kotlin/com/rhubarb_lip_sync/rhubarb_for_spine/MainView.kt
@@ -17,6 +17,7 @@ import javafx.scene.text.Font
import javafx.scene.text.FontWeight
import javafx.scene.text.Text
import javafx.stage.FileChooser
+import javafx.util.StringConverter
import tornadofx.*
import java.io.File
import java.util.concurrent.Executors
@@ -83,6 +84,20 @@ class MainView : View() {
}
}
}
+ field("Dialog recognizer") {
+			combobox<Recognizer> {
+				itemsProperty().bind(mainModel.recognizersProperty)
+				this.converter = object : StringConverter<Recognizer>() {
+ override fun toString(recognizer: Recognizer?): String {
+ return recognizer?.description ?: ""
+ }
+ override fun fromString(string: String?): Recognizer {
+ throw NotImplementedError()
+ }
+ }
+ valueProperty().bindBidirectional(mainModel.recognizerProperty)
+ }
+ }
field("Animation naming") {
textfield {
maxWidth = 100.0
diff --git a/extras/EsotericSoftwareSpine/src/main/kotlin/com/rhubarb_lip_sync/rhubarb_for_spine/RhubarbTask.kt b/extras/EsotericSoftwareSpine/src/main/kotlin/com/rhubarb_lip_sync/rhubarb_for_spine/RhubarbTask.kt
index 0268003..0694e79 100644
--- a/extras/EsotericSoftwareSpine/src/main/kotlin/com/rhubarb_lip_sync/rhubarb_for_spine/RhubarbTask.kt
+++ b/extras/EsotericSoftwareSpine/src/main/kotlin/com/rhubarb_lip_sync/rhubarb_for_spine/RhubarbTask.kt
@@ -14,6 +14,7 @@ import java.util.concurrent.Callable
class RhubarbTask(
val audioFilePath: Path,
+ val recognizer: String,
val dialog: String?,
	val extendedMouthShapes: Set<MouthShape>,
val reportProgress: (Double?) -> Unit
@@ -89,6 +90,7 @@ class RhubarbTask(
return mutableListOf(
rhubarbBinFilePath.toString(),
"--machineReadable",
+ "--recognizer", recognizer,
"--exportFormat", "json",
"--extendedShapes", extendedMouthShapesString
).apply {
@@ -100,7 +102,6 @@ class RhubarbTask(
}.apply {
add(audioFilePath.toString())
}
-
}
private val guiBinDirectory: Path by lazy {
diff --git a/rhubarb/CMakeLists.txt b/rhubarb/CMakeLists.txt
index 9353edf..0fef2b7 100644
--- a/rhubarb/CMakeLists.txt
+++ b/rhubarb/CMakeLists.txt
@@ -413,8 +413,13 @@ add_library(rhubarb-recognition
src/recognition/g2p.h
src/recognition/languageModels.cpp
src/recognition/languageModels.h
- src/recognition/phoneRecognition.cpp
- src/recognition/phoneRecognition.h
+ src/recognition/PhoneticRecognizer.cpp
+ src/recognition/PhoneticRecognizer.h
+ src/recognition/PocketSphinxRecognizer.cpp
+ src/recognition/PocketSphinxRecognizer.h
+ src/recognition/pocketSphinxTools.cpp
+ src/recognition/pocketSphinxTools.h
+ src/recognition/Recognizer.h
src/recognition/tokenization.cpp
src/recognition/tokenization.h
)
@@ -487,6 +492,8 @@ add_executable(rhubarb
src/rhubarb/main.cpp
src/rhubarb/ExportFormat.cpp
src/rhubarb/ExportFormat.h
+ src/rhubarb/RecognizerType.cpp
+ src/rhubarb/RecognizerType.h
src/rhubarb/semanticEntries.cpp
src/rhubarb/semanticEntries.h
src/rhubarb/sinks.cpp
diff --git a/rhubarb/resharper.DotSettings b/rhubarb/resharper.DotSettings
index b16b555..168efbe 100644
--- a/rhubarb/resharper.DotSettings
+++ b/rhubarb/resharper.DotSettings
@@ -1,7 +1,12 @@
+ HINT
+
ERROR
+
DO_NOT_SHOW
+
USE_TABS_ONLY
+ USE_TABS_ONLY
False
False
False
@@ -29,6 +34,7 @@
CHOP_ALWAYS
END_OF_LINE
END_OF_LINE
+ USE_TABS_ONLY
False
END_OF_LINE
END_OF_LINE
@@ -44,6 +50,14 @@
False
True
False
+ USE_TABS_ONLY
+ USE_TABS_ONLY
+ USE_TABS_ONLY
+ USE_TABS_ONLY
+ USE_TABS_ONLY
+ USE_TABS_ONLY
+ USE_TABS_ONLY
+ USE_TABS_ONLY
UseExplicitType
UseVarWhenEvident
<NamingElement Priority="10"><Descriptor Static="Indeterminate" Constexpr="Indeterminate" Const="Indeterminate" Volatile="Indeterminate" Accessibility="NOT_APPLICABLE"><type Name="class field" /><type Name="struct field" /></Descriptor><Policy Inspect="True" Prefix="" Suffix="_" Style="aaBb" /></NamingElement>
@@ -108,7 +122,16 @@
C:\Users\Daniel\AppData\Local\JetBrains\Transient\ReSharperPlatformVs14\v09\SolutionCaches
True
True
+ True
+ True
True
True
True
+ True
+ True
+ True
+ True
+ True
+ True
+ True
\ No newline at end of file
diff --git a/rhubarb/src/lib/rhubarbLib.cpp b/rhubarb/src/lib/rhubarbLib.cpp
index ffadf68..5f8460f 100644
--- a/rhubarb/src/lib/rhubarbLib.cpp
+++ b/rhubarb/src/lib/rhubarbLib.cpp
@@ -1,6 +1,5 @@
#include "rhubarbLib.h"
#include "core/Phone.h"
-#include "recognition/phoneRecognition.h"
#include "tools/textFiles.h"
#include "animation/mouthAnimation.h"
#include "audio/audioFileReading.h"
@@ -8,27 +7,29 @@
using boost::optional;
using std::string;
using boost::filesystem::path;
-using std::unique_ptr;
JoiningContinuousTimeline<Shape> animateAudioClip(
	const AudioClip& audioClip,
-	optional<string> dialog,
+	const optional<string>& dialog,
+ const Recognizer& recognizer,
const ShapeSet& targetShapeSet,
int maxThreadCount,
ProgressSink& progressSink)
{
-	BoundedTimeline<Phone> phones = recognizePhones(audioClip, dialog, maxThreadCount, progressSink);
+	const BoundedTimeline<Phone> phones =
+		recognizer.recognizePhones(audioClip, dialog, maxThreadCount, progressSink);
	JoiningContinuousTimeline<Shape> result = animate(phones, targetShapeSet);
return result;
}
JoiningContinuousTimeline<Shape> animateWaveFile(
	path filePath,
-	optional<string> dialog,
+	const optional<string>& dialog,
+ const Recognizer& recognizer,
const ShapeSet& targetShapeSet,
int maxThreadCount,
ProgressSink& progressSink)
{
const auto audioClip = createAudioFileClip(filePath);
- return animateAudioClip(*audioClip, dialog, targetShapeSet, maxThreadCount, progressSink);
+ return animateAudioClip(*audioClip, dialog, recognizer, targetShapeSet, maxThreadCount, progressSink);
}
diff --git a/rhubarb/src/lib/rhubarbLib.h b/rhubarb/src/lib/rhubarbLib.h
index 8663761..ca40a06 100644
--- a/rhubarb/src/lib/rhubarbLib.h
+++ b/rhubarb/src/lib/rhubarbLib.h
@@ -6,17 +6,20 @@
#include "tools/ProgressBar.h"
#include
#include "animation/targetShapeSet.h"
+#include "recognition/Recognizer.h"
JoiningContinuousTimeline<Shape> animateAudioClip(
	const AudioClip& audioClip,
-	boost::optional<std::string> dialog,
+	const boost::optional<std::string>& dialog,
+ const Recognizer& recognizer,
const ShapeSet& targetShapeSet,
int maxThreadCount,
ProgressSink& progressSink);
JoiningContinuousTimeline<Shape> animateWaveFile(
	boost::filesystem::path filePath,
-	boost::optional<std::string> dialog,
+	const boost::optional<std::string>& dialog,
+ const Recognizer& recognizer,
const ShapeSet& targetShapeSet,
int maxThreadCount,
ProgressSink& progressSink);
diff --git a/rhubarb/src/recognition/PhoneticRecognizer.cpp b/rhubarb/src/recognition/PhoneticRecognizer.cpp
new file mode 100644
index 0000000..bd9c9ac
--- /dev/null
+++ b/rhubarb/src/recognition/PhoneticRecognizer.cpp
@@ -0,0 +1,103 @@
+#include "PhoneticRecognizer.h"
+#include "time/Timeline.h"
+#include "audio/AudioSegment.h"
+#include "audio/SampleRateConverter.h"
+#include "audio/processing.h"
+#include "time/timedLogging.h"
+
+using std::runtime_error;
+using std::unique_ptr;
+using std::string;
+using boost::optional;
+
+static lambda_unique_ptr<ps_decoder_t> createDecoder(optional<string> dialog) {
+ UNUSED(dialog);
+
+	lambda_unique_ptr<cmd_ln_t> config(
+ cmd_ln_init(
+ nullptr, ps_args(), true,
+ // Set acoustic model
+ "-hmm", (getSphinxModelDirectory() / "acoustic-model").string().c_str(),
+ // Set phonetic language model
+ "-allphone", (getSphinxModelDirectory() / "en-us-phone.lm.bin").string().c_str(),
+ "-allphone_ci", "yes",
+ // Set language model probability weight.
+ // Low values (<= 0.4) can lead to fluttering animation.
+ // High values (>= 1.0) can lead to imprecise or freezing animation.
+ "-lw", "0.8",
+
+ // The following settings are recommended at http://cmusphinx.sourceforge.net/wiki/phonemerecognition
+
+ // Set beam width applied to every frame in Viterbi search
+ "-beam", "1e-20",
+ // Set beam width applied to phone transitions
+ "-pbeam", "1e-20",
+ nullptr),
+ [](cmd_ln_t* config) { cmd_ln_free_r(config); });
+ if (!config) throw runtime_error("Error creating configuration.");
+
+	lambda_unique_ptr<ps_decoder_t> decoder(
+ ps_init(config.get()),
+ [](ps_decoder_t* recognizer) { ps_free(recognizer); });
+ if (!decoder) throw runtime_error("Error creating speech decoder.");
+
+ return decoder;
+}
+
+static Timeline<Phone> utteranceToPhones(
+ const AudioClip& audioClip,
+ TimeRange utteranceTimeRange,
+ ps_decoder_t& decoder,
+ ProgressSink& utteranceProgressSink
+) {
+ // Pad time range to give PocketSphinx some breathing room
+ TimeRange paddedTimeRange = utteranceTimeRange;
+ const centiseconds padding(3);
+ paddedTimeRange.grow(padding);
+ paddedTimeRange.trim(audioClip.getTruncatedRange());
+
+	const unique_ptr<AudioClip> clipSegment = audioClip.clone() | segment(paddedTimeRange) | resample(sphinxSampleRate);
+ const auto audioBuffer = copyTo16bitBuffer(*clipSegment);
+
+ // Detect phones (returned as words)
+	BoundedTimeline<string> phoneStrings = recognizeWords(audioBuffer, decoder);
+ phoneStrings.shift(paddedTimeRange.getStart());
+	Timeline<Phone> utterancePhones;
+ for (const auto& timedPhoneString : phoneStrings) {
+ Phone phone = PhoneConverter::get().parse(timedPhoneString.getValue());
+ if (phone == Phone::AH && timedPhoneString.getDuration() < 6_cs) {
+ // Heuristic: < 6_cs is schwa. PocketSphinx doesn't differentiate.
+ phone = Phone::Schwa;
+ }
+ utterancePhones.set(timedPhoneString.getTimeRange(), phone);
+ }
+
+ // Log raw phones
+ for (const auto& timedPhone : utterancePhones) {
+ logTimedEvent("rawPhone", timedPhone);
+ }
+
+ // Guess positions of noise sounds
+	JoiningTimeline<void> noiseSounds = getNoiseSounds(utteranceTimeRange, utterancePhones);
+ for (const auto& noiseSound : noiseSounds) {
+ utterancePhones.set(noiseSound.getTimeRange(), Phone::Noise);
+ }
+
+ // Log phones
+ for (const auto& timedPhone : utterancePhones) {
+ logTimedEvent("phone", timedPhone);
+ }
+
+ utteranceProgressSink.reportProgress(1.0);
+
+ return utterancePhones;
+}
+
+BoundedTimeline<Phone> PhoneticRecognizer::recognizePhones(
+	const AudioClip& inputAudioClip,
+	optional<string> dialog,
+ int maxThreadCount,
+ ProgressSink& progressSink
+) const {
+ return ::recognizePhones(inputAudioClip, dialog, &createDecoder, &utteranceToPhones, maxThreadCount, progressSink);
+}
diff --git a/rhubarb/src/recognition/PhoneticRecognizer.h b/rhubarb/src/recognition/PhoneticRecognizer.h
new file mode 100644
index 0000000..96797cf
--- /dev/null
+++ b/rhubarb/src/recognition/PhoneticRecognizer.h
@@ -0,0 +1,14 @@
+#pragma once
+
+#include "Recognizer.h"
+#include "pocketSphinxTools.h"
+
+class PhoneticRecognizer : public Recognizer {
+public:
+	BoundedTimeline<Phone> recognizePhones(
+		const AudioClip& inputAudioClip,
+		boost::optional<std::string> dialog,
+ int maxThreadCount,
+ ProgressSink& progressSink
+ ) const override;
+};
diff --git a/rhubarb/src/recognition/phoneRecognition.cpp b/rhubarb/src/recognition/PocketSphinxRecognizer.cpp
similarity index 55%
rename from rhubarb/src/recognition/phoneRecognition.cpp
rename to rhubarb/src/recognition/PocketSphinxRecognizer.cpp
index 5139596..b97c0b7 100644
--- a/rhubarb/src/recognition/phoneRecognition.cpp
+++ b/rhubarb/src/recognition/PocketSphinxRecognizer.cpp
@@ -1,143 +1,133 @@
-#include
-#include "phoneRecognition.h"
-#include "audio/SampleRateConverter.h"
-#include "tools/platformTools.h"
-#include "tools/tools.h"
-#include
-#include
+#include "PocketSphinxRecognizer.h"
#include
#include
-#include "logging/logging.h"
-#include "audio/DcOffset.h"
-#include "time/Timeline.h"
-#include "audio/voiceActivityDetection.h"
#include "audio/AudioSegment.h"
+#include "audio/SampleRateConverter.h"
#include "languageModels.h"
#include "tokenization.h"
#include "g2p.h"
#include "time/ContinuousTimeline.h"
#include "audio/processing.h"
-#include "tools/parallel.h"
-#include
-#include "tools/ObjectPool.h"
#include "time/timedLogging.h"
extern "C" {
-#include
-#include
-#include
#include
-#include
-#include
}
using std::runtime_error;
using std::invalid_argument;
using std::unique_ptr;
-using std::shared_ptr;
using std::string;
using std::vector;
using std::map;
using boost::filesystem::path;
-using std::function;
using std::regex;
using std::regex_replace;
-using std::chrono::duration;
using boost::optional;
-using std::string;
-using std::chrono::duration_cast;
using std::array;
-constexpr int sphinxSampleRate = 16000;
-
-const path& getSphinxModelDirectory() {
- static path sphinxModelDirectory(getBinDirectory() / "res/sphinx");
- return sphinxModelDirectory;
+bool dictionaryContains(dict_t& dictionary, const string& word) {
+ return dict_wordid(&dictionary, word.c_str()) != BAD_S3WID;
}
-logging::Level ConvertSphinxErrorLevel(err_lvl_t errorLevel) {
- switch (errorLevel) {
- case ERR_DEBUG:
- case ERR_INFO:
- case ERR_INFOCONT:
- return logging::Level::Trace;
- case ERR_WARN:
- return logging::Level::Warn;
- case ERR_ERROR:
- return logging::Level::Error;
- case ERR_FATAL:
- return logging::Level::Fatal;
- default:
- throw invalid_argument("Unknown log level.");
+s3wid_t getWordId(const string& word, dict_t& dictionary) {
+ const s3wid_t wordId = dict_wordid(&dictionary, word.c_str());
+ if (wordId == BAD_S3WID) throw invalid_argument(fmt::format("Unknown word '{}'.", word));
+ return wordId;
+}
+
+void addMissingDictionaryWords(const vector<string>& words, ps_decoder_t& decoder) {
+	map<string, string> missingPronunciations;
+ for (const string& word : words) {
+ if (!dictionaryContains(*decoder.dict, word)) {
+ string pronunciation;
+ for (Phone phone : wordToPhones(word)) {
+ if (pronunciation.length() > 0) pronunciation += " ";
+ pronunciation += PhoneConverter::get().toString(phone);
+ }
+ missingPronunciations[word] = pronunciation;
+ }
+ }
+ for (auto it = missingPronunciations.begin(); it != missingPronunciations.end(); ++it) {
+ const bool isLast = it == --missingPronunciations.end();
+ logging::infoFormat("Unknown word '{}'. Guessing pronunciation '{}'.", it->first, it->second);
+ ps_add_word(&decoder, it->first.c_str(), it->second.c_str(), isLast);
}
}
-void sphinxLogCallback(void* user_data, err_lvl_t errorLevel, const char* format, ...) {
- UNUSED(user_data);
-
- // Create varArgs list
- va_list args;
- va_start(args, format);
- auto _ = gsl::finally([&args]() { va_end(args); });
-
- // Format message
- const int initialSize = 256;
-	vector<char> chars(initialSize);
- bool success = false;
- while (!success) {
- int charsWritten = vsnprintf(chars.data(), chars.size(), format, args);
- if (charsWritten < 0) throw runtime_error("Error formatting Pocketsphinx log message.");
-
-		success = charsWritten < static_cast<int>(chars.size());
- if (!success) chars.resize(chars.size() * 2);
- }
- regex waste("^(DEBUG|INFO|INFOCONT|WARN|ERROR|FATAL): ");
- string message = regex_replace(chars.data(), waste, "", std::regex_constants::format_first_only);
- boost::algorithm::trim(message);
-
- logging::Level logLevel = ConvertSphinxErrorLevel(errorLevel);
- logging::log(logLevel, message);
-}
-
-BoundedTimeline<string> recognizeWords(const vector<int16_t>& audioBuffer, ps_decoder_t& decoder) {
- // Restart timing at 0
- ps_start_stream(&decoder);
-
- // Start recognition
- int error = ps_start_utt(&decoder);
- if (error) throw runtime_error("Error starting utterance processing for word recognition.");
-
- // Process entire audio clip
- const bool noRecognition = false;
- const bool fullUtterance = true;
- int searchedFrameCount = ps_process_raw(&decoder, audioBuffer.data(), audioBuffer.size(), noRecognition, fullUtterance);
- if (searchedFrameCount < 0) throw runtime_error("Error analyzing raw audio data for word recognition.");
-
- // End recognition
- error = ps_end_utt(&decoder);
- if (error) throw runtime_error("Error ending utterance processing for word recognition.");
-
-	BoundedTimeline<string> result(TimeRange(0_cs, centiseconds(100 * audioBuffer.size() / sphinxSampleRate)));
-	bool noWordsRecognized = reinterpret_cast<ngram_search_t*>(decoder.search)->bpidx == 0;
- if (noWordsRecognized) {
- return result;
- }
-
- // Collect words
- for (ps_seg_t* it = ps_seg_iter(&decoder); it; it = ps_seg_next(it)) {
- const char* word = ps_seg_word(it);
- int firstFrame, lastFrame;
- ps_seg_frames(it, &firstFrame, &lastFrame);
- result.set(centiseconds(firstFrame), centiseconds(lastFrame + 1), word);
+lambda_unique_ptr<ngram_model_t> createDefaultLanguageModel(ps_decoder_t& decoder) {
+	path modelPath = getSphinxModelDirectory() / "en-us.lm.bin";
+	lambda_unique_ptr<ngram_model_t> result(
+ ngram_model_read(decoder.config, modelPath.string().c_str(), NGRAM_AUTO, decoder.lmath),
+ [](ngram_model_t* lm) { ngram_model_free(lm); });
+ if (!result) {
+ throw runtime_error(fmt::format("Error reading language model from {}.", modelPath));
}
return result;
}
-s3wid_t getWordId(const string& word, dict_t& dictionary) {
- s3wid_t wordId = dict_wordid(&dictionary, word.c_str());
- if (wordId == BAD_S3WID) throw invalid_argument(fmt::format("Unknown word '{}'.", word));
- return wordId;
+lambda_unique_ptr<ngram_model_t> createDialogLanguageModel(ps_decoder_t& decoder, const string& dialog) {
+ // Split dialog into normalized words
+	vector<string> words = tokenizeText(dialog, [&](const string& word) { return dictionaryContains(*decoder.dict, word); });
+
+ // Add dialog-specific words to the dictionary
+ addMissingDictionaryWords(words, decoder);
+
+ // Create dialog-specific language model
+ words.insert(words.begin(), "");
+ words.emplace_back("");
+ return createLanguageModel(words, decoder);
+}
+
+lambda_unique_ptr<ngram_model_t> createBiasedLanguageModel(ps_decoder_t& decoder, const string& dialog) {
+	auto defaultLanguageModel = createDefaultLanguageModel(decoder);
+	auto dialogLanguageModel = createDialogLanguageModel(decoder, dialog);
+	constexpr int modelCount = 2;
+	array<ngram_model_t*, modelCount> languageModels{ defaultLanguageModel.get(), dialogLanguageModel.get() };
+	array<const char*, modelCount> modelNames{ "defaultLM", "dialogLM" };
+	array<float32, modelCount> modelWeights{ 0.1f, 0.9f };
+	lambda_unique_ptr<ngram_model_t> result(
+		ngram_model_set_init(nullptr, languageModels.data(), const_cast<char**>(modelNames.data()), modelWeights.data(), modelCount),
+ [](ngram_model_t* lm) { ngram_model_free(lm); });
+ if (!result) {
+ throw runtime_error("Error creating biased language model.");
+ }
+
+ return result;
+}
+
+static lambda_unique_ptr<ps_decoder_t> createDecoder(optional<string> dialog) {
+	lambda_unique_ptr<cmd_ln_t> config(
+ cmd_ln_init(
+ nullptr, ps_args(), true,
+ // Set acoustic model
+ "-hmm", (getSphinxModelDirectory() / "acoustic-model").string().c_str(),
+ // Set pronunciation dictionary
+ "-dict", (getSphinxModelDirectory() / "cmudict-en-us.dict").string().c_str(),
+ // Add noise against zero silence (see http://cmusphinx.sourceforge.net/wiki/faq#qwhy_my_accuracy_is_poor)
+ "-dither", "yes",
+ // Disable VAD -- we're doing that ourselves
+ "-remove_silence", "no",
+ // Perform per-utterance cepstral mean normalization
+ "-cmn", "batch",
+ nullptr),
+ [](cmd_ln_t* config) { cmd_ln_free_r(config); });
+ if (!config) throw runtime_error("Error creating configuration.");
+
+	lambda_unique_ptr<ps_decoder_t> decoder(
+ ps_init(config.get()),
+ [](ps_decoder_t* recognizer) { ps_free(recognizer); });
+ if (!decoder) throw runtime_error("Error creating speech decoder.");
+
+ // Set language model
+	lambda_unique_ptr<ngram_model_t> languageModel(dialog
+ ? createBiasedLanguageModel(*decoder, *dialog)
+ : createDefaultLanguageModel(*decoder));
+ ps_set_lm(decoder.get(), "lm", languageModel.get());
+ ps_set_search(decoder.get(), "lm");
+
+ return decoder;
}
optional<Timeline<Phone>> getPhoneAlignment(
@@ -178,7 +168,7 @@ optional<Timeline<Phone>> getPhoneAlignment(
// Process entire audio clip
const int16* nextSample = audioBuffer.data();
size_t remainingSamples = audioBuffer.size();
- bool fullUtterance = true;
+ const bool fullUtterance = true;
while (acmod_process_raw(acousticModel, &nextSample, &remainingSamples, fullUtterance) > 0) {
while (acousticModel->n_feat_frame > 0) {
ps_search_step(search.get(), acousticModel->output_frame);
@@ -197,7 +187,7 @@ optional<Timeline<Phone>> getPhoneAlignment(
for (ps_alignment_iter_t* it = ps_alignment_phones(alignment.get()); it; it = ps_alignment_iter_next(it)) {
// Get phone
ps_alignment_entry_t* phoneEntry = ps_alignment_iter_get(it);
- s3cipid_t phoneId = phoneEntry->id.pid.cipid;
+ const s3cipid_t phoneId = phoneEntry->id.pid.cipid;
string phoneName = phoneNames[phoneId];
if (phoneName == "SIL") continue;
@@ -207,162 +197,42 @@ optional<Timeline<Phone>> getPhoneAlignment(
centiseconds duration(phoneEntry->duration);
Phone phone = PhoneConverter::get().parse(phoneName);
if (phone == Phone::AH && duration < 6_cs) {
- // Heuristic: < 6_cs is schwa. Pocketsphinx doesn't differentiate.
+ // Heuristic: < 6_cs is schwa. PocketSphinx doesn't differentiate.
phone = Phone::Schwa;
}
-	Timed<Phone> timedPhone(start, start + duration, phone);
+	const Timed<Phone> timedPhone(start, start + duration, phone);
result.set(timedPhone);
}
return result;
}
-bool dictionaryContains(dict_t& dictionary, const string& word) {
- return dict_wordid(&dictionary, word.c_str()) != BAD_S3WID;
-}
-
-void addMissingDictionaryWords(const vector<string>& words, ps_decoder_t& decoder) {
-	map<string, string> missingPronunciations;
- for (const string& word : words) {
- if (!dictionaryContains(*decoder.dict, word)) {
- string pronunciation;
- for (Phone phone : wordToPhones(word)) {
- if (pronunciation.length() > 0) pronunciation += " ";
- pronunciation += PhoneConverter::get().toString(phone);
- }
- missingPronunciations[word] = pronunciation;
- }
- }
- for (auto it = missingPronunciations.begin(); it != missingPronunciations.end(); ++it) {
- bool isLast = it == --missingPronunciations.end();
- logging::infoFormat("Unknown word '{}'. Guessing pronunciation '{}'.", it->first, it->second);
- ps_add_word(&decoder, it->first.c_str(), it->second.c_str(), isLast);
- }
-}
-
-lambda_unique_ptr<ngram_model_t> createDefaultLanguageModel(ps_decoder_t& decoder) {
-	path modelPath = getSphinxModelDirectory() / "en-us.lm.bin";
-	lambda_unique_ptr<ngram_model_t> result(
- ngram_model_read(decoder.config, modelPath.string().c_str(), NGRAM_AUTO, decoder.lmath),
- [](ngram_model_t* lm) { ngram_model_free(lm); });
- if (!result) {
- throw runtime_error(fmt::format("Error reading language model from {}.", modelPath));
- }
-
- return std::move(result);
-}
-
-lambda_unique_ptr<ngram_model_t> createDialogLanguageModel(ps_decoder_t& decoder, const string& dialog) {
-	// Split dialog into normalized words
-	vector<string> words = tokenizeText(dialog, [&](const string& word) { return dictionaryContains(*decoder.dict, word); });
-
- // Add dialog-specific words to the dictionary
- addMissingDictionaryWords(words, decoder);
-
- // Create dialog-specific language model
- words.insert(words.begin(), "");
- words.push_back("");
- return createLanguageModel(words, decoder);
-}
-
-lambda_unique_ptr<ngram_model_t> createBiasedLanguageModel(ps_decoder_t& decoder, const string& dialog) {
-	auto defaultLanguageModel = createDefaultLanguageModel(decoder);
-	auto dialogLanguageModel = createDialogLanguageModel(decoder, dialog);
-	constexpr int modelCount = 2;
-	array<ngram_model_t*, modelCount> languageModels{ defaultLanguageModel.get(), dialogLanguageModel.get() };
-	array<const char*, modelCount> modelNames{ "defaultLM", "dialogLM" };
-	array<float32, modelCount> modelWeights{ 0.1f, 0.9f };
-	lambda_unique_ptr<ngram_model_t> result(
-		ngram_model_set_init(nullptr, languageModels.data(), modelNames.data(), modelWeights.data(), modelCount),
- [](ngram_model_t* lm) { ngram_model_free(lm); });
- if (!result) {
- throw runtime_error("Error creating biased language model.");
- }
-
- return std::move(result);
-}
-
-lambda_unique_ptr<ps_decoder_t> createDecoder(optional<string> dialog) {
-	lambda_unique_ptr<cmd_ln_t> config(
- cmd_ln_init(
- nullptr, ps_args(), true,
- // Set acoustic model
- "-hmm", (getSphinxModelDirectory() / "acoustic-model").string().c_str(),
- // Set pronunciation dictionary
- "-dict", (getSphinxModelDirectory() / "cmudict-en-us.dict").string().c_str(),
- // Add noise against zero silence (see http://cmusphinx.sourceforge.net/wiki/faq#qwhy_my_accuracy_is_poor)
- "-dither", "yes",
- // Disable VAD -- we're doing that ourselves
- "-remove_silence", "no",
- // Perform per-utterance cepstral mean normalization
- "-cmn", "batch",
- nullptr),
- [](cmd_ln_t* config) { cmd_ln_free_r(config); });
- if (!config) throw runtime_error("Error creating configuration.");
-
-	lambda_unique_ptr<ps_decoder_t> decoder(
- ps_init(config.get()),
- [](ps_decoder_t* recognizer) { ps_free(recognizer); });
- if (!decoder) throw runtime_error("Error creating speech decoder.");
-
- // Set language model
-	lambda_unique_ptr<ngram_model_t> languageModel(dialog
- ? createBiasedLanguageModel(*decoder, *dialog)
- : createDefaultLanguageModel(*decoder));
- ps_set_lm(decoder.get(), "lm", languageModel.get());
- ps_set_search(decoder.get(), "lm");
-
- return decoder;
-}
-
-JoiningTimeline<void> getNoiseSounds(TimeRange utteranceTimeRange, const Timeline<Phone>& phones) {
-	JoiningTimeline<void> noiseSounds;
-
- // Find utterance parts without recogniced phones
- noiseSounds.set(utteranceTimeRange);
- for (const auto& timedPhone : phones) {
- noiseSounds.clear(timedPhone.getTimeRange());
- }
-
- // Remove undesired elements
- const centiseconds minSoundDuration = 12_cs;
-	for (const auto& unknownSound : JoiningTimeline<void>(noiseSounds)) {
- bool startsAtZero = unknownSound.getStart() == 0_cs;
- bool tooShort = unknownSound.getDuration() < minSoundDuration;
- if (startsAtZero || tooShort) {
- noiseSounds.clear(unknownSound.getTimeRange());
- }
- }
-
- return noiseSounds;
-}
-
// Some words have multiple pronunciations, one of which results in better animation than the others.
// This function returns the optimal pronunciation for a select set of these words.
string fixPronunciation(const string& word) {
-	const static map<string, string> replacements {
- {"into(2)", "into"},
- {"to(2)", "to"},
- {"to(3)", "to"},
- {"today(2)", "today"},
- {"tomorrow(2)", "tomorrow"},
- {"tonight(2)", "tonight"}
+	const static map<string, string> replacements{
+ { "into(2)", "into" },
+ { "to(2)", "to" },
+ { "to(3)", "to" },
+ { "today(2)", "today" },
+ { "tomorrow(2)", "tomorrow" },
+ { "tonight(2)", "tonight" }
};
const auto pair = replacements.find(word);
return pair != replacements.end() ? pair->second : word;
}
-Timeline<Phone> utteranceToPhones(
+static Timeline<Phone> utteranceToPhones(
const AudioClip& audioClip,
TimeRange utteranceTimeRange,
ps_decoder_t& decoder,
- ProgressSink& utteranceProgressSink)
-{
+ ProgressSink& utteranceProgressSink
+) {
ProgressMerger utteranceProgressMerger(utteranceProgressSink);
ProgressSink& wordRecognitionProgressSink = utteranceProgressMerger.addSink(1.0);
ProgressSink& alignmentProgressSink = utteranceProgressMerger.addSink(0.5);
- // Pad time range to give Pocketsphinx some breathing room
+ // Pad time range to give PocketSphinx some breathing room
TimeRange paddedTimeRange = utteranceTimeRange;
const centiseconds padding(3);
paddedTimeRange.grow(padding);
@@ -384,7 +254,7 @@ Timeline<Phone> utteranceToPhones(
continue;
}
word = regex_replace(word, regex("\\(\\d\\)"), "");
- if (text.size() > 0) {
+ if (!text.empty()) {
text += " ";
}
text += word;
@@ -403,7 +273,7 @@ Timeline<Phone> utteranceToPhones(
const string fixedWord = fixPronunciation(timedWord.getValue());
wordIds.push_back(getWordId(fixedWord, *decoder.dict));
}
- if (wordIds.empty()) return {};
+ if (wordIds.empty()) return{};
// Align the words' phones with speech
#if BOOST_VERSION < 105600 // Support legacy syntax
@@ -433,77 +303,11 @@ Timeline<Phone> utteranceToPhones(
return utterancePhones;
}
-BoundedTimeline<Phone> recognizePhones(
+BoundedTimeline<Phone> PocketSphinxRecognizer::recognizePhones(
	const AudioClip& inputAudioClip,
-	optional<string> dialog,
+	optional<string> dialog,
int maxThreadCount,
- ProgressSink& progressSink)
-{
- ProgressMerger totalProgressMerger(progressSink);
- ProgressSink& voiceActivationProgressSink = totalProgressMerger.addSink(1.0);
- ProgressSink& dialogProgressSink = totalProgressMerger.addSink(15);
-
- // Make sure audio stream has no DC offset
-	const unique_ptr<AudioClip> audioClip = inputAudioClip.clone() | removeDcOffset();
-
- // Split audio into utterances
-	JoiningBoundedTimeline<void> utterances;
- try {
- utterances = detectVoiceActivity(*audioClip, maxThreadCount, voiceActivationProgressSink);
- }
- catch (...) {
- std::throw_with_nested(runtime_error("Error detecting segments of speech."));
- }
-
- // Discard Pocketsphinx output
- err_set_logfp(nullptr);
-
- // Redirect Pocketsphinx output to log
- err_set_callback(sphinxLogCallback, nullptr);
-
- // Prepare pool of decoders
-	ObjectPool<ps_decoder_t, lambda_unique_ptr<ps_decoder_t>> decoderPool(
- [&dialog] { return createDecoder(dialog); });
-
-	BoundedTimeline<Phone> phones(audioClip->getTruncatedRange());
-	std::mutex resultMutex;
-	auto processUtterance = [&](Timed<void> timedUtterance, ProgressSink& utteranceProgressSink) {
- // Detect phones for utterance
- auto decoder = decoderPool.acquire();
-		Timeline<Phone> utterancePhones =
- utteranceToPhones(*audioClip, timedUtterance.getTimeRange(), *decoder, utteranceProgressSink);
-
- // Copy phones to result timeline
-		std::lock_guard<std::mutex> lock(resultMutex);
- for (const auto& timedPhone : utterancePhones) {
- phones.set(timedPhone);
- }
- };
-
-	auto getUtteranceProgressWeight = [](const Timed<void> timedUtterance) {
- return timedUtterance.getDuration().count();
- };
-
- // Perform speech recognition
- try {
- // Determine how many parallel threads to use
- int threadCount = std::min({
- maxThreadCount,
- // Don't use more threads than there are utterances to be processed
-			static_cast<int>(utterances.size()),
- // Don't waste time creating additional threads (and decoders!) if the recording is short
-			static_cast<int>(duration_cast<std::chrono::seconds>(audioClip->getTruncatedRange().getDuration()).count() / 5)
- });
- if (threadCount < 1) {
- threadCount = 1;
- }
- logging::debugFormat("Speech recognition using {} threads -- start", threadCount);
- runParallel(processUtterance, utterances, threadCount, dialogProgressSink, getUtteranceProgressWeight);
- logging::debug("Speech recognition -- end");
- }
- catch (...) {
- std::throw_with_nested(runtime_error("Error performing speech recognition via Pocketsphinx."));
- }
-
- return phones;
+ ProgressSink& progressSink
+) const {
+ return ::recognizePhones(inputAudioClip, dialog, &createDecoder, &utteranceToPhones, maxThreadCount, progressSink);
}
diff --git a/rhubarb/src/recognition/PocketSphinxRecognizer.h b/rhubarb/src/recognition/PocketSphinxRecognizer.h
new file mode 100644
index 0000000..dc11d2d
--- /dev/null
+++ b/rhubarb/src/recognition/PocketSphinxRecognizer.h
@@ -0,0 +1,14 @@
+#pragma once
+
+#include "Recognizer.h"
+#include "pocketSphinxTools.h"
+
+class PocketSphinxRecognizer : public Recognizer {
+public:
+	BoundedTimeline<Phone> recognizePhones(
+		const AudioClip& inputAudioClip,
+		boost::optional<std::string> dialog,
+ int maxThreadCount,
+ ProgressSink& progressSink
+ ) const override;
+};
diff --git a/rhubarb/src/recognition/Recognizer.h b/rhubarb/src/recognition/Recognizer.h
new file mode 100644
index 0000000..05c445d
--- /dev/null
+++ b/rhubarb/src/recognition/Recognizer.h
@@ -0,0 +1,18 @@
+#pragma once
+
+#include "audio/AudioClip.h"
+#include "core/Phone.h"
+#include "tools/ProgressBar.h"
+#include "time/BoundedTimeline.h"
+
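+// Interface for dialog recognizers: converts an audio clip and optional dialog text into a timeline of phones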
+class Recognizer {
+public:
+ virtual ~Recognizer() = default;
+
+	virtual BoundedTimeline<Phone> recognizePhones(
+		const AudioClip& audioClip,
+		boost::optional<std::string> dialog,
+ int maxThreadCount,
+ ProgressSink& progressSink
+ ) const = 0;
+};
\ No newline at end of file
diff --git a/rhubarb/src/recognition/phoneRecognition.h b/rhubarb/src/recognition/phoneRecognition.h
deleted file mode 100644
index 2e66305..0000000
--- a/rhubarb/src/recognition/phoneRecognition.h
+++ /dev/null
@@ -1,12 +0,0 @@
-#pragma once
-
-#include "audio/AudioClip.h"
-#include "core/Phone.h"
-#include "tools/ProgressBar.h"
-#include "time/BoundedTimeline.h"
-
-BoundedTimeline<Phone> recognizePhones(
-	const AudioClip& audioClip,
-	boost::optional<std::string> dialog,
- int maxThreadCount,
- ProgressSink& progressSink);
diff --git a/rhubarb/src/recognition/pocketSphinxTools.cpp b/rhubarb/src/recognition/pocketSphinxTools.cpp
new file mode 100644
index 0000000..87a13ea
--- /dev/null
+++ b/rhubarb/src/recognition/pocketSphinxTools.cpp
@@ -0,0 +1,218 @@
+#include "pocketSphinxTools.h"
+
+#include "tools/platformTools.h"
+#include
+#include "audio/DcOffset.h"
+#include "audio/voiceActivityDetection.h"
+#include "tools/parallel.h"
+#include "tools/ObjectPool.h"
+#include "time/timedLogging.h"
+
+extern "C" {
+#include
+#include
+#include
+}
+
+using std::runtime_error;
+using std::invalid_argument;
+using std::unique_ptr;
+using std::string;
+using std::vector;
+using boost::filesystem::path;
+using std::regex;
+using boost::optional;
+using std::chrono::duration_cast;
+
+logging::Level convertSphinxErrorLevel(err_lvl_t errorLevel) {
+ switch (errorLevel) {
+ case ERR_DEBUG:
+ case ERR_INFO:
+ case ERR_INFOCONT:
+ return logging::Level::Trace;
+ case ERR_WARN:
+ return logging::Level::Warn;
+ case ERR_ERROR:
+ return logging::Level::Error;
+ case ERR_FATAL:
+ return logging::Level::Fatal;
+ default:
+ throw invalid_argument("Unknown log level.");
+ }
+}
+
+void sphinxLogCallback(void* user_data, err_lvl_t errorLevel, const char* format, ...) {
+ UNUSED(user_data);
+
+ // Create varArgs list
+ va_list args;
+ va_start(args, format);
+ auto _ = gsl::finally([&args]() { va_end(args); });
+
+ // Format message
+ const int initialSize = 256;
+	vector<char> chars(initialSize);
+ bool success = false;
+ while (!success) {
+ const int charsWritten = vsnprintf(chars.data(), chars.size(), format, args);
+ if (charsWritten < 0) throw runtime_error("Error formatting PocketSphinx log message.");
+
+		success = charsWritten < static_cast<int>(chars.size());
+ if (!success) chars.resize(chars.size() * 2);
+ }
+ const regex waste("^(DEBUG|INFO|INFOCONT|WARN|ERROR|FATAL): ");
+ string message = std::regex_replace(chars.data(), waste, "", std::regex_constants::format_first_only);
+ boost::algorithm::trim(message);
+
+ const logging::Level logLevel = convertSphinxErrorLevel(errorLevel);
+ logging::log(logLevel, message);
+}
+
+void redirectPocketSphinxOutput() {
+ static bool redirected = false;
+ if (redirected) return;
+
+ // Discard PocketSphinx output
+ err_set_logfp(nullptr);
+
+ // Redirect PocketSphinx output to log
+ err_set_callback(sphinxLogCallback, nullptr);
+
+ redirected = true;
+}
+
+BoundedTimeline<Phone> recognizePhones(
+	const AudioClip& inputAudioClip,
+	optional<string> dialog,
+ decoderFactory createDecoder,
+ utteranceToPhonesFunction utteranceToPhones,
+ int maxThreadCount,
+ ProgressSink& progressSink
+) {
+ ProgressMerger totalProgressMerger(progressSink);
+ ProgressSink& voiceActivationProgressSink = totalProgressMerger.addSink(1.0);
+ ProgressSink& dialogProgressSink = totalProgressMerger.addSink(15);
+
+ // Make sure audio stream has no DC offset
+	const unique_ptr<AudioClip> audioClip = inputAudioClip.clone() | removeDcOffset();
+
+ // Split audio into utterances
+	JoiningBoundedTimeline<void> utterances;
+ try {
+ utterances = detectVoiceActivity(*audioClip, maxThreadCount, voiceActivationProgressSink);
+ } catch (...) {
+ std::throw_with_nested(runtime_error("Error detecting segments of speech."));
+ }
+
+ redirectPocketSphinxOutput();
+
+ // Prepare pool of decoders
+	ObjectPool<ps_decoder_t, lambda_unique_ptr<ps_decoder_t>> decoderPool(
+ [&] { return createDecoder(dialog); });
+
+	BoundedTimeline<Phone> phones(audioClip->getTruncatedRange());
+	std::mutex resultMutex;
+	const auto processUtterance = [&](Timed<void> timedUtterance, ProgressSink& utteranceProgressSink) {
+ // Detect phones for utterance
+ const auto decoder = decoderPool.acquire();
+		Timeline<Phone> utterancePhones =
+ utteranceToPhones(*audioClip, timedUtterance.getTimeRange(), *decoder, utteranceProgressSink);
+
+ // Copy phones to result timeline
+		std::lock_guard<std::mutex> lock(resultMutex);
+ for (const auto& timedPhone : utterancePhones) {
+ phones.set(timedPhone);
+ }
+ };
+
+	const auto getUtteranceProgressWeight = [](const Timed<void> timedUtterance) {
+ return timedUtterance.getDuration().count();
+ };
+
+ // Perform speech recognition
+ try {
+ // Determine how many parallel threads to use
+ int threadCount = std::min({
+ maxThreadCount,
+ // Don't use more threads than there are utterances to be processed
+			static_cast<int>(utterances.size()),
+ // Don't waste time creating additional threads (and decoders!) if the recording is short
+			static_cast<int>(duration_cast<std::chrono::seconds>(audioClip->getTruncatedRange().getDuration()).count() / 5)
+ });
+ if (threadCount < 1) {
+ threadCount = 1;
+ }
+ logging::debugFormat("Speech recognition using {} threads -- start", threadCount);
+ runParallel(processUtterance, utterances, threadCount, dialogProgressSink, getUtteranceProgressWeight);
+ logging::debug("Speech recognition -- end");
+ } catch (...) {
+ std::throw_with_nested(runtime_error("Error performing speech recognition via PocketSphinx."));
+ }
+
+ return phones;
+}
+
+const path& getSphinxModelDirectory() {
+ static path sphinxModelDirectory(getBinDirectory() / "res" / "sphinx");
+ return sphinxModelDirectory;
+}
+
+JoiningTimeline<void> getNoiseSounds(TimeRange utteranceTimeRange, const Timeline<Phone>& phones) {
+	JoiningTimeline<void> noiseSounds;
+
+ // Find utterance parts without recognized phones
+ noiseSounds.set(utteranceTimeRange);
+ for (const auto& timedPhone : phones) {
+ noiseSounds.clear(timedPhone.getTimeRange());
+ }
+
+ // Remove undesired elements
+ const centiseconds minSoundDuration = 12_cs;
+	for (const auto& unknownSound : JoiningTimeline<void>(noiseSounds)) {
+ const bool startsAtZero = unknownSound.getStart() == 0_cs;
+ const bool tooShort = unknownSound.getDuration() < minSoundDuration;
+ if (startsAtZero || tooShort) {
+ noiseSounds.clear(unknownSound.getTimeRange());
+ }
+ }
+
+ return noiseSounds;
+}
+
+BoundedTimeline<string> recognizeWords(const vector<int16_t>& audioBuffer, ps_decoder_t& decoder) {
+ // Restart timing at 0
+ ps_start_stream(&decoder);
+
+ // Start recognition
+ int error = ps_start_utt(&decoder);
+ if (error) throw runtime_error("Error starting utterance processing for word recognition.");
+
+ // Process entire audio clip
+ const bool noRecognition = false;
+ const bool fullUtterance = true;
+ const int searchedFrameCount =
+ ps_process_raw(&decoder, audioBuffer.data(), audioBuffer.size(), noRecognition, fullUtterance);
+ if (searchedFrameCount < 0) {
+ throw runtime_error("Error analyzing raw audio data for word recognition.");
+ }
+
+ // End recognition
+ error = ps_end_utt(&decoder);
+ if (error) throw runtime_error("Error ending utterance processing for word recognition.");
+
+	BoundedTimeline<string> result(TimeRange(0_cs, centiseconds(100 * audioBuffer.size() / sphinxSampleRate)));
+	const bool noWordsRecognized = reinterpret_cast<ngram_search_t*>(decoder.search)->bpidx == 0;
+ if (noWordsRecognized) {
+ return result;
+ }
+
+ // Collect words
+ for (ps_seg_t* it = ps_seg_iter(&decoder); it; it = ps_seg_next(it)) {
+ const char* word = ps_seg_word(it);
+ int firstFrame, lastFrame;
+ ps_seg_frames(it, &firstFrame, &lastFrame);
+ result.set(centiseconds(firstFrame), centiseconds(lastFrame + 1), word);
+ }
+
+ return result;
+}
diff --git a/rhubarb/src/recognition/pocketSphinxTools.h b/rhubarb/src/recognition/pocketSphinxTools.h
new file mode 100644
index 0000000..568ccbe
--- /dev/null
+++ b/rhubarb/src/recognition/pocketSphinxTools.h
@@ -0,0 +1,39 @@
+#pragma once
+
+#include "time/BoundedTimeline.h"
+#include "core/Phone.h"
+#include "audio/AudioClip.h"
+#include "tools/ProgressBar.h"
+#include
+
+extern "C" {
+#include
+}
+
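+// Factory function creating a PocketSphinx decoder, optionally tailored to the given dialog text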
+typedef std::function<lambda_unique_ptr<ps_decoder_t>(
+	boost::optional<std::string> dialog
+)> decoderFactory;
+
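+// Function converting one detected utterance within the audio clip into a timeline of phones, using the given decoder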
+typedef std::function<Timeline<Phone>(
+	const AudioClip& audioClip,
+	TimeRange utteranceTimeRange,
+	ps_decoder_t& decoder,
+	ProgressSink& utteranceProgressSink
+)> utteranceToPhonesFunction;
+
+BoundedTimeline<Phone> recognizePhones(
+	const AudioClip& inputAudioClip,
+	boost::optional<std::string> dialog,
+ decoderFactory createDecoder,
+ utteranceToPhonesFunction utteranceToPhones,
+ int maxThreadCount,
+ ProgressSink& progressSink
+);
+
+constexpr int sphinxSampleRate = 16000;
+
+const boost::filesystem::path& getSphinxModelDirectory();
+
+JoiningTimeline<void> getNoiseSounds(TimeRange utteranceTimeRange, const Timeline<Phone>& phones);
+
+BoundedTimeline<std::string> recognizeWords(const std::vector<int16_t>& audioBuffer, ps_decoder_t& decoder);
diff --git a/rhubarb/src/rhubarb/RecognizerType.cpp b/rhubarb/src/rhubarb/RecognizerType.cpp
new file mode 100644
index 0000000..86f0837
--- /dev/null
+++ b/rhubarb/src/rhubarb/RecognizerType.cpp
@@ -0,0 +1,27 @@
+#include "RecognizerType.h"
+
+using std::string;
+
+RecognizerTypeConverter& RecognizerTypeConverter::get() {
+ static RecognizerTypeConverter converter;
+ return converter;
+}
+
+string RecognizerTypeConverter::getTypeName() {
+ return "RecognizerType";
+}
+
+EnumConverter<RecognizerType>::member_data RecognizerTypeConverter::getMemberData() {
+ return member_data{
+ { RecognizerType::PocketSphinx, "pocketSphinx" },
+ { RecognizerType::Phonetic, "phonetic" }
+ };
+}
+
+std::ostream& operator<<(std::ostream& stream, RecognizerType value) {
+ return RecognizerTypeConverter::get().write(stream, value);
+}
+
+std::istream& operator>>(std::istream& stream, RecognizerType& value) {
+ return RecognizerTypeConverter::get().read(stream, value);
+}
diff --git a/rhubarb/src/rhubarb/RecognizerType.h b/rhubarb/src/rhubarb/RecognizerType.h
new file mode 100644
index 0000000..6f8cf12
--- /dev/null
+++ b/rhubarb/src/rhubarb/RecognizerType.h
@@ -0,0 +1,20 @@
+#pragma once
+
+#include "tools/EnumConverter.h"
+
+enum class RecognizerType {
+ PocketSphinx,
+ Phonetic
+};
+
+class RecognizerTypeConverter : public EnumConverter<RecognizerType> {
+public:
+ static RecognizerTypeConverter& get();
+protected:
+ std::string getTypeName() override;
+ member_data getMemberData() override;
+};
+
+std::ostream& operator<<(std::ostream& stream, RecognizerType value);
+
+std::istream& operator>>(std::istream& stream, RecognizerType& value);
diff --git a/rhubarb/src/rhubarb/main.cpp b/rhubarb/src/rhubarb/main.cpp
index 104a6e8..703dd67 100644
--- a/rhubarb/src/rhubarb/main.cpp
+++ b/rhubarb/src/rhubarb/main.cpp
@@ -27,6 +27,9 @@
#include "tools/platformTools.h"
#include "sinks.h"
#include "semanticEntries.h"
+#include "RecognizerType.h"
+#include "recognition/PocketSphinxRecognizer.h"
+#include "recognition/PhoneticRecognizer.h"
using std::exception;
using std::string;
@@ -36,9 +39,6 @@ using std::unique_ptr;
using std::make_unique;
using std::shared_ptr;
using std::make_shared;
-using std::map;
-using std::chrono::duration;
-using std::chrono::duration_cast;
using std::ofstream;
using boost::filesystem::path;
using boost::adaptors::transformed;
@@ -56,6 +56,10 @@ namespace TCLAP {
	struct ArgTraits<ExportFormat> {
typedef ValueLike ValueCategory;
};
+	template<>
+	struct ArgTraits<RecognizerType> {
+		typedef ValueLike ValueCategory;
+	};
}
shared_ptr<logging::Sink> createFileSink(path path, logging::Level minLevel) {
@@ -66,6 +70,17 @@ shared_ptr<logging::Sink> createFileSink(path path, logging::Level minLevel) {
	return make_shared<logging::LevelFilter>(FileSink, minLevel);
}
+unique_ptr<Recognizer> createRecognizer(RecognizerType recognizerType) {
+ switch (recognizerType) {
+ case RecognizerType::PocketSphinx:
+		return make_unique<PocketSphinxRecognizer>();
+ case RecognizerType::Phonetic:
+		return make_unique<PhoneticRecognizer>();
+ default:
+ throw std::runtime_error("Unknown recognizer.");
+ }
+}
+
unique_ptr<Exporter> createExporter(ExportFormat exportFormat) {
switch (exportFormat) {
case ExportFormat::Tsv:
@@ -123,6 +138,9 @@ int main(int platformArgc, char *platformArgv[]) {
	auto exportFormats = vector<ExportFormat>(ExportFormatConverter::get().getValues());
	tclap::ValuesConstraint<ExportFormat> exportFormatConstraint(exportFormats);
	tclap::ValueArg<ExportFormat> exportFormat("f", "exportFormat", "The export format.", false, ExportFormat::Tsv, &exportFormatConstraint, cmd);
+	auto recognizerTypes = vector<RecognizerType>(RecognizerTypeConverter::get().getValues());
+	tclap::ValuesConstraint<RecognizerType> recognizerConstraint(recognizerTypes);
+	tclap::ValueArg<RecognizerType> recognizerType("r", "recognizer", "The dialog recognizer.", false, RecognizerType::PocketSphinx, &recognizerConstraint, cmd);
	tclap::UnlabeledValueArg<string> inputFileName("inputFile", "The input file. Must be a sound file in WAVE format.", true, "", "string", cmd);
try {
@@ -169,6 +187,7 @@ int main(int platformArgc, char *platformArgv[]) {
	JoiningContinuousTimeline<Shape> animation = animateWaveFile(
		inputFilePath,
		dialogFile.isSet() ? readUtf8File(path(dialogFile.getValue())) : boost::optional<string>(),
+ *createRecognizer(recognizerType.getValue()),
targetShapeSet,
maxThreadCount.getValue(),
progressSink);