rhubarb-lip-sync/rhubarb/tests/tokenizationTests.cpp

#include <gmock/gmock.h>
#include "recognition/tokenization.h"
#include <regex>
#include <unordered_set>
#include <utf8.h>

using namespace testing;
using std::string;
using std::u32string;
using std::vector;
using std::regex;

// Predicate that answers true for every string, regardless of its content.
// The tests in this file pass it as the second argument of tokenizeText.
bool returnTrue(const string& /* ignored */) {
	constexpr bool alwaysAccept = true;
	return alwaysAccept;
}

// Empty and whitespace-only input is expected to yield no tokens; a plain
// sentence is expected to come back as lowercase words without the period.
TEST(tokenizeText, simpleCases) {
	const auto fromEmpty = tokenizeText("", returnTrue);
	EXPECT_THAT(fromEmpty, IsEmpty());

	const auto fromWhitespace = tokenizeText("  \t\n\r\n ", returnTrue);
	EXPECT_THAT(fromWhitespace, IsEmpty());

	const auto fromSentence = tokenizeText("Wit is educated insolence.", returnTrue);
	EXPECT_THAT(fromSentence, ElementsAre("wit", "is", "educated", "insolence"));
}

// Numeric input is expected to be spelled out as words: a Roman-numeral
// regnal number, a plain number, a dollar amount, and a year.
TEST(tokenizeText, numbers) {
	// Regnal number and two-digit number
	const auto regnal = tokenizeText("Henry V died at 36.", returnTrue);
	EXPECT_THAT(regnal, ElementsAre("henry", "the", "fifth", "died", "at", "thirty", "six"));

	// Currency amount with dollars and cents
	const auto currency = tokenizeText("I spent $4.50 on gum.", returnTrue);
	EXPECT_THAT(
		currency,
		ElementsAre("i", "spent", "four", "dollars", "fifty", "cents", "on", "gum")
	);

	// Four-digit year
	const auto year = tokenizeText("I was born in 1982.", returnTrue);
	EXPECT_THAT(year, ElementsAre("i", "was", "born", "in", "nineteen", "eighty", "two"));
}

// The predicate accepts only "prof."; the expected tokens keep that
// abbreviation as-is, while the two occurrences of "Dr." are expected to
// come back as "doctor" and "drive" respectively.
TEST(tokenizeText, abbreviations) {
	const auto acceptsOnlyProf = [](const string& candidate) {
		return candidate == "prof.";
	};

	const auto tokens = tokenizeText("Prof. Foo lives on Dr. Dolittle Dr.", acceptsOnlyProf);
	EXPECT_THAT(
		tokens,
		ElementsAre("prof.", "foo", "lives", "on", "doctor", "dolittle", "drive")
	);
}

// Leading apostrophes are expected to be dropped ("'Tis" -> "tis"), while
// apostrophes inside a word ("he'd", "wish'd", "wouldn't") are expected to
// be kept. The predicate accepts only "wouldn't".
TEST(tokenizeText, apostrophes) {
	const auto acceptsOnlyWouldnt = [](const string& candidate) {
		return candidate == "wouldn't";
	};
	const vector<string> expectedTokens{
		"tis", "said", "he'd", "wish'd", "for", "a", "bus", "cause", "he", "wouldn't", "walk"
	};

	const auto tokens = tokenizeText(
		"'Tis said he'd wish'd for a 'bus 'cause he wouldn't walk.",
		acceptsOnlyWouldnt
	);
	EXPECT_THAT(tokens, ElementsAreArray(expectedTokens));
}

// Digits and the operators +, * and = are expected to be spelled out as
// words. The input deliberately starts with an apostrophe.
TEST(tokenizeText, math) {
	const auto tokens = tokenizeText("'1+2*3=7", returnTrue);
	EXPECT_THAT(
		tokens,
		ElementsAre("one", "plus", "two", "times", "three", "equals", "seven")
	);
}

// Accented Latin letters are expected to be reduced to their unaccented
// forms, and the CJK character is expected to produce no token at all.
TEST(tokenizeText, unicodeCharacters) {
	const auto tokens = tokenizeText(
		"A naïve man called 晨 had piña colada and crème brûlée.",
		returnTrue
	);
	EXPECT_THAT(
		tokens,
		ElementsAre("a", "naive", "man", "called", "had", "pina", "colada", "and", "creme", "brulee")
	);
}

// Verifies that every returned token consists solely of the letters a-z and the apostrophe
TEST(tokenizeText, wordsUseLimitedCharacters) {
	// Build the input: a sentence with accented and CJK characters, followed by
	// every code point from 0 through 1000, each one preceded by a space
	string text = "A naïve man called 晨 was having piña colada and crème brûlée.";
	constexpr char32_t lastCodePoint = 1000;
	for (char32_t codePoint = 0; codePoint <= lastCodePoint; ++codePoint) {
		text += " ";
		utf8::append(codePoint, back_inserter(text));
	}

	// A token passes only if it matches this pattern in its entirety
	const regex allowedWord("^[a-z']+$");
	const auto tokens = tokenizeText(text, returnTrue);
	for (const string& token : tokens) {
		// Stream the offending token so a failure shows which one was rejected
		EXPECT_TRUE(std::regex_match(token, allowedWord)) << token;
	}
}
-												Implemented text tokenization using Flite

											
										
										
											2016-06-02 16:16:00 +00:00
+								#include <gmock/gmock.h>
-												Refactoring: Made imports more specific

											
										
										
											2017-09-10 20:17:17 +00:00
+								#include "recognition/tokenization.h"
-												Implemented text tokenization using Flite

											
										
										
											2016-06-02 16:16:00 +00:00
+								#include <regex>
-												Improved tokenization by taking dictionary into account

											
										
										
											2016-06-25 19:52:04 +00:00
+								#include <unordered_set>
-												Made entire application Unicode-aware

All 8-bit strings are UTF-8-encoded now.

											
										
										
											2017-08-15 18:47:05 +00:00
+								#include <utf8.h>
-												Implemented text tokenization using Flite

											
										
										
											2016-06-02 16:16:00 +00:00
 								using namespace testing;
 								using std::string;
 								using std::u32string;
 								using std::vector;
 								using std::regex;
-												Improved tokenization by taking dictionary into account

											
										
										
											2016-06-25 19:52:04 +00:00
+								bool returnTrue(const string&) {
 									return true;
 								}
-												Implemented text tokenization using Flite

											
										
										
											2016-06-02 16:16:00 +00:00
+								TEST(tokenizeText, simpleCases) {
-												Made entire application Unicode-aware

All 8-bit strings are UTF-8-encoded now.

											
										
										
											2017-08-15 18:47:05 +00:00
+									EXPECT_THAT(tokenizeText("", returnTrue), IsEmpty());
 									EXPECT_THAT(tokenizeText("  \t\n\r\n ", returnTrue), IsEmpty());
-												Implemented text tokenization using Flite

											
										
										
											2016-06-02 16:16:00 +00:00
+									EXPECT_THAT(
-												Made entire application Unicode-aware

All 8-bit strings are UTF-8-encoded now.

											
										
										
											2017-08-15 18:47:05 +00:00
+										tokenizeText("Wit is educated insolence.", returnTrue),
-												Implemented text tokenization using Flite

											
										
										
											2016-06-02 16:16:00 +00:00
+										ElementsAre("wit", "is", "educated", "insolence")
 									);
 								}
 								TEST(tokenizeText, numbers) {
 									EXPECT_THAT(
-												Made entire application Unicode-aware

All 8-bit strings are UTF-8-encoded now.

											
										
										
											2017-08-15 18:47:05 +00:00
+										tokenizeText("Henry V died at 36.", returnTrue),
-												Implemented text tokenization using Flite

											
										
										
											2016-06-02 16:16:00 +00:00
+										ElementsAre("henry", "the", "fifth", "died", "at", "thirty", "six")
 									);
 									EXPECT_THAT(
-												Made entire application Unicode-aware

All 8-bit strings are UTF-8-encoded now.

											
										
										
											2017-08-15 18:47:05 +00:00
+										tokenizeText("I spent $4.50 on gum.", returnTrue),
-												Implemented text tokenization using Flite

											
										
										
											2016-06-02 16:16:00 +00:00
+										ElementsAre("i", "spent", "four", "dollars", "fifty", "cents", "on", "gum")
 									);
 									EXPECT_THAT(
-												Made entire application Unicode-aware

All 8-bit strings are UTF-8-encoded now.

											
										
										
											2017-08-15 18:47:05 +00:00
+										tokenizeText("I was born in 1982.", returnTrue),
-												Implemented text tokenization using Flite

											
										
										
											2016-06-02 16:16:00 +00:00
+										ElementsAre("i", "was", "born", "in", "nineteen", "eighty", "two")
 									);
 								}
 								TEST(tokenizeText, abbreviations) {
 									EXPECT_THAT(
-												Code cleanup

* Fix linter warnings
* Unify code formatting
* Fix typos

											
										
										
											2019-01-02 19:00:34 +00:00
+										tokenizeText(
 											"Prof. Foo lives on Dr. Dolittle Dr.",
 											[](const string& word) { return word == "prof."; }
 										),
-												Improved tokenization by taking dictionary into account

											
										
										
											2016-06-25 19:52:04 +00:00
+										ElementsAre("prof.", "foo", "lives", "on", "doctor", "dolittle", "drive")
-												Implemented text tokenization using Flite

											
										
										
											2016-06-02 16:16:00 +00:00
+									);
 								}
 								TEST(tokenizeText, apostrophes) {
 									EXPECT_THAT(
-												Code cleanup

* Fix linter warnings
* Unify code formatting
* Fix typos

											
										
										
											2019-01-02 19:00:34 +00:00
+										tokenizeText(
 											"'Tis said he'd wish'd for a 'bus 'cause he wouldn't walk.",
 											[](const string& word) { return word == "wouldn't"; }
 										),
 										ElementsAreArray(
 											vector<string>{ "tis", "said", "he'd", "wish'd", "for", "a", "bus", "cause", "he", "wouldn't", "walk" })
-												Implemented text tokenization using Flite

											
										
										
											2016-06-02 16:16:00 +00:00
+									);
 								}
 								TEST(tokenizeText, math) {
 									EXPECT_THAT(
-												Made entire application Unicode-aware

All 8-bit strings are UTF-8-encoded now.

											
										
										
											2017-08-15 18:47:05 +00:00
+										tokenizeText("'1+2*3=7", returnTrue),
-												Implemented text tokenization using Flite

											
										
										
											2016-06-02 16:16:00 +00:00
+										ElementsAre("one", "plus", "two", "times", "three", "equals", "seven")
 									);
 								}
-												Made entire application Unicode-aware

All 8-bit strings are UTF-8-encoded now.

											
										
										
											2017-08-15 18:47:05 +00:00
+								TEST(tokenizeText, unicodeCharacters) {
 									EXPECT_THAT(
 										tokenizeText("A naïve man called 晨 had piña colada and crème brûlée.", returnTrue),
 										ElementsAre("a", "naive", "man", "called", "had", "pina", "colada", "and", "creme", "brulee")
 									);
 								}
-												Implemented text tokenization using Flite

											
										
										
											2016-06-02 16:16:00 +00:00
+								// Checks that each word contains only the characters a-z and the apostrophe
 								TEST(tokenizeText, wordsUseLimitedCharacters) {
 									// Create string containing lots of undesirable characters
-												Made entire application Unicode-aware

All 8-bit strings are UTF-8-encoded now.

											
										
										
											2017-08-15 18:47:05 +00:00
+									string input = "A naïve man called 晨 was having piña colada and crème brûlée.";
-												Implemented text tokenization using Flite

											
										
										
											2016-06-02 16:16:00 +00:00
+									for (char32_t c = 0; c <= 1000; ++c) {
-												Made entire application Unicode-aware

All 8-bit strings are UTF-8-encoded now.

											
										
										
											2017-08-15 18:47:05 +00:00
+										input.append(" ");
 										utf8::append(c, back_inserter(input));
-												Implemented text tokenization using Flite

											
										
										
											2016-06-02 16:16:00 +00:00
+									}
-												Code cleanup

* Fix linter warnings
* Unify code formatting
* Fix typos

											
										
										
											2019-01-02 19:00:34 +00:00
+									const regex legal("^[a-z']+$");
-												Improved tokenization by taking dictionary into account

											
										
										
											2016-06-25 19:52:04 +00:00
+									auto words = tokenizeText(input, returnTrue);
-												Implemented text tokenization using Flite

											
										
										
											2016-06-02 16:16:00 +00:00
+									for (const string& word : words) {
 										EXPECT_TRUE(std::regex_match(word, legal)) << word;
 									}
 								}