rhubarb-lip-sync/src/recognition/tokenization.cpp

#include "tokenization.h"
#include "tools.h"
#include "stringTools.h"
#include <regex>

extern "C" {
#include <cst_utt_utils.h>
#include <lang/usenglish/usenglish.h>
#include <lang/cmulex/cmu_lex.h>
}

using std::runtime_error;
using std::u32string;
using std::string;
using std::vector;
using std::regex;
using std::pair;
using boost::optional;
using std::function;

lambda_unique_ptr<cst_voice> createDummyVoice() {
	lambda_unique_ptr<cst_voice> voice(new_voice(), [](cst_voice* voice) { delete_voice(voice); });
	voice->name = "dummy_voice";
	usenglish_init(voice.get());
	cst_lexicon *lexicon = cmu_lex_init();
	feat_set(voice->features, "lexicon", lexicon_val(lexicon));
	return voice;
}

static const cst_synth_module synth_method_normalize[] = {
	{ "tokenizer_func", default_tokenization },		// split text into tokens
	{ "textanalysis_func", default_textanalysis },	// transform tokens into words
	{ nullptr, nullptr }
};

vector<string> tokenizeViaFlite(const string& text) {
	// Create utterance object with text
	lambda_unique_ptr<cst_utterance> utterance(new_utterance(), [](cst_utterance* utterance) { delete_utterance(utterance); });
	utt_set_input_text(utterance.get(), text.c_str());
	lambda_unique_ptr<cst_voice> voice = createDummyVoice();
	utt_init(utterance.get(), voice.get());

	// Perform tokenization and text normalization
	if (!apply_synth_method(utterance.get(), synth_method_normalize)) {
		throw runtime_error("Error normalizing text using Flite.");
	}

	vector<string> result;
	for (cst_item* item = relation_head(utt_relation(utterance.get(), "Word")); item; item = item_next(item)) {
		const char* word = item_feat_string(item, "name");
		result.push_back(word);
	}
	return result;
}

optional<string> findSimilarDictionaryWord(const string& word, function<bool(const string&)> dictionaryContains) {
	for (bool addPeriod : { false, true }) {
		for (int apostropheIndex = -1; apostropheIndex <= static_cast<int>(word.size()); ++apostropheIndex) {
			string modified = word;
			if (apostropheIndex != -1) {
				modified.insert(apostropheIndex, "'");
			}
			if (addPeriod) {
				modified += ".";
			}

			if (dictionaryContains(modified)) {
				return modified;
			}
		}
	}

	return boost::none;
}

vector<string> tokenizeText(const u32string& text, function<bool(const string&)> dictionaryContains) {
	vector<string> words = tokenizeViaFlite(toASCII(text));

	// Join words separated by apostophes
	for (int i = words.size() - 1; i > 0; --i) {
		if (words[i].size() > 0 && words[i][0] == '\'') {
			words[i - 1].append(words[i]);
			words.erase(words.begin() + i);
		}
	}

	// Turn some symbols into words, remove the rest
	const static vector<pair<regex, string>> replacements {
		{ regex("&"), "and" },
		{ regex("\\*"), "times" },
		{ regex("\\+"), "plus" },
		{ regex("="), "equals" },
		{ regex("@"), "at" },
		{ regex("[^a-z']"), "" }
	};
	for (size_t i = 0; i < words.size(); ++i) {
		for (const auto& replacement : replacements) {
			words[i] = regex_replace(words[i], replacement.first, replacement.second);
		}
	}

	// Remove empty words
	words.erase(std::remove_if(words.begin(), words.end(), [](const string& s) { return s.empty(); }), words.end());

	// Try to replace words that are not in the dictionary with similar ones that are
	for (size_t i = 0; i < words.size(); ++i) {
		if (!dictionaryContains(words[i])) {
			optional<string> modifiedWord = findSimilarDictionaryWord(words[i], dictionaryContains);
			if (modifiedWord) {
				words[i] = *modifiedWord;
			}
		}
	}

	return words;
}