#include <gmock/gmock.h>
#include "tokenization.h"
#include <regex>
#include <unordered_set>
#include <utf8.h>

using namespace testing;
using std::string;
using std::u32string;
using std::vector;
using std::regex;

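// Callback passed to tokenizeText as its second argument; this stub accepts every word.
// Presumably the callback tells the tokenizer whether a word is already a known dictionary
// word; compare the abbreviations test below, where only "prof." is accepted.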
bool returnTrue(const string&) {
	return true;
}

TEST(tokenizeText, simpleCases) {
	EXPECT_THAT(tokenizeText("", returnTrue), IsEmpty());
	EXPECT_THAT(tokenizeText(" \t\n\r\n ", returnTrue), IsEmpty());
	EXPECT_THAT(
		tokenizeText("Wit is educated insolence.", returnTrue),
		ElementsAre("wit", "is", "educated", "insolence")
	);
}

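// Numerals are expected to be spelled out: Roman numerals after a name ("V" -> "the fifth"),
// dollar amounts, and years.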
TEST(tokenizeText, numbers) {
	EXPECT_THAT(
		tokenizeText("Henry V died at 36.", returnTrue),
		ElementsAre("henry", "the", "fifth", "died", "at", "thirty", "six")
	);
	EXPECT_THAT(
		tokenizeText("I spent $4.50 on gum.", returnTrue),
		ElementsAre("i", "spent", "four", "dollars", "fifty", "cents", "on", "gum")
	);
	EXPECT_THAT(
		tokenizeText("I was born in 1982.", returnTrue),
		ElementsAre("i", "was", "born", "in", "nineteen", "eighty", "two")
	);
}

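// Abbreviations accepted by the callback ("prof.") are kept verbatim; the rest ("Dr.")
// are expanded according to context ("doctor", "drive").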
TEST(tokenizeText, abbreviations) {
	EXPECT_THAT(
		tokenizeText("Prof. Foo lives on Dr. Dolittle Dr.", [](const string& word) { return word == "prof."; }),
		ElementsAre("prof.", "foo", "lives", "on", "doctor", "dolittle", "drive")
	);
}

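// Leading apostrophes ('Tis, 'bus, 'cause) are stripped; word-internal apostrophes
// (he'd, wish'd, wouldn't) are kept.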
TEST(tokenizeText, apostrophes) {
	EXPECT_THAT(
		tokenizeText("'Tis said he'd wish'd for a 'bus 'cause he wouldn't walk.", [](const string& word) { return word == "wouldn't"; }),
		ElementsAreArray(vector<string>{ "tis", "said", "he'd", "wish'd", "for", "a", "bus", "cause", "he", "wouldn't", "walk" })
	);
}

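// Mathematical symbols are verbalized.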
TEST(tokenizeText, math) {
	EXPECT_THAT(
		tokenizeText("'1+2*3=7", returnTrue),
		ElementsAre("one", "plus", "two", "times", "three", "equals", "seven")
	);
}

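// Accented Latin letters are reduced to plain ASCII; characters from other scripts (晨)
// are dropped.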
TEST(tokenizeText, unicodeCharacters) {
	EXPECT_THAT(
		tokenizeText("A naïve man called 晨 had piña colada and crème brûlée.", returnTrue),
		ElementsAre("a", "naive", "man", "called", "had", "pina", "colada", "and", "creme", "brulee")
	);
}

// Checks that each word contains only the characters a-z and the apostrophe
TEST(tokenizeText, wordsUseLimitedCharacters) {
	// Create string containing lots of undesirable characters
	string input = "A naïve man called 晨 was having piña colada and crème brûlée.";
	for (char32_t c = 0; c <= 1000; ++c) {
		input.append(" ");
		utf8::append(c, back_inserter(input));
	}

	regex legal("^[a-z']+$");
	auto words = tokenizeText(input, returnTrue);
	for (const string& word : words) {
		EXPECT_TRUE(std::regex_match(word, legal)) << word;
	}
}