Made entire application Unicode-aware
All 8-bit strings are UTF-8-encoded now.
This commit is contained in:
parent
7789d43e47
commit
5f451feb00
|
@ -424,6 +424,8 @@ target_include_directories(rhubarb-tools PUBLIC "src/tools")
|
||||||
target_link_libraries(rhubarb-tools
|
target_link_libraries(rhubarb-tools
|
||||||
cppFormat
|
cppFormat
|
||||||
whereami
|
whereami
|
||||||
|
utfcpp
|
||||||
|
utf8proc
|
||||||
)
|
)
|
||||||
|
|
||||||
# Define Rhubarb executable
|
# Define Rhubarb executable
|
||||||
|
|
|
@ -6,13 +6,13 @@
|
||||||
#include "WaveFileReader.h"
|
#include "WaveFileReader.h"
|
||||||
|
|
||||||
using boost::optional;
|
using boost::optional;
|
||||||
using std::u32string;
|
using std::string;
|
||||||
using boost::filesystem::path;
|
using boost::filesystem::path;
|
||||||
using std::unique_ptr;
|
using std::unique_ptr;
|
||||||
|
|
||||||
JoiningContinuousTimeline<Shape> animateAudioClip(
|
JoiningContinuousTimeline<Shape> animateAudioClip(
|
||||||
const AudioClip& audioClip,
|
const AudioClip& audioClip,
|
||||||
optional<u32string> dialog,
|
optional<string> dialog,
|
||||||
const ShapeSet& targetShapeSet,
|
const ShapeSet& targetShapeSet,
|
||||||
int maxThreadCount,
|
int maxThreadCount,
|
||||||
ProgressSink& progressSink)
|
ProgressSink& progressSink)
|
||||||
|
@ -32,7 +32,7 @@ unique_ptr<AudioClip> createWaveAudioClip(path filePath) {
|
||||||
|
|
||||||
JoiningContinuousTimeline<Shape> animateWaveFile(
|
JoiningContinuousTimeline<Shape> animateWaveFile(
|
||||||
path filePath,
|
path filePath,
|
||||||
optional<u32string> dialog,
|
optional<string> dialog,
|
||||||
const ShapeSet& targetShapeSet,
|
const ShapeSet& targetShapeSet,
|
||||||
int maxThreadCount,
|
int maxThreadCount,
|
||||||
ProgressSink& progressSink)
|
ProgressSink& progressSink)
|
||||||
|
|
|
@ -9,14 +9,14 @@
|
||||||
|
|
||||||
JoiningContinuousTimeline<Shape> animateAudioClip(
|
JoiningContinuousTimeline<Shape> animateAudioClip(
|
||||||
const AudioClip& audioClip,
|
const AudioClip& audioClip,
|
||||||
boost::optional<std::u32string> dialog,
|
boost::optional<std::string> dialog,
|
||||||
const ShapeSet& targetShapeSet,
|
const ShapeSet& targetShapeSet,
|
||||||
int maxThreadCount,
|
int maxThreadCount,
|
||||||
ProgressSink& progressSink);
|
ProgressSink& progressSink);
|
||||||
|
|
||||||
JoiningContinuousTimeline<Shape> animateWaveFile(
|
JoiningContinuousTimeline<Shape> animateWaveFile(
|
||||||
boost::filesystem::path filePath,
|
boost::filesystem::path filePath,
|
||||||
boost::optional<std::u32string> dialog,
|
boost::optional<std::string> dialog,
|
||||||
const ShapeSet& targetShapeSet,
|
const ShapeSet& targetShapeSet,
|
||||||
int maxThreadCount,
|
int maxThreadCount,
|
||||||
ProgressSink& progressSink);
|
ProgressSink& progressSink);
|
||||||
|
|
22
src/main.cpp
22
src/main.cpp
|
@ -24,10 +24,11 @@
|
||||||
#include <boost/iostreams/device/null.hpp>
|
#include <boost/iostreams/device/null.hpp>
|
||||||
#include "targetShapeSet.h"
|
#include "targetShapeSet.h"
|
||||||
#include <boost/utility/in_place_factory.hpp>
|
#include <boost/utility/in_place_factory.hpp>
|
||||||
|
#include "platformTools.h"
|
||||||
|
|
||||||
using std::exception;
|
using std::exception;
|
||||||
using std::string;
|
using std::string;
|
||||||
using std::u32string;
|
using std::string;
|
||||||
using std::vector;
|
using std::vector;
|
||||||
using std::unique_ptr;
|
using std::unique_ptr;
|
||||||
using std::make_unique;
|
using std::make_unique;
|
||||||
|
@ -97,7 +98,14 @@ ShapeSet getTargetShapeSet(const string& extendedShapesString) {
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
int main(int argc, char *argv[]) {
|
int main(int platformArgc, char *platformArgv[]) {
|
||||||
|
// Use UTF-8 throughout
|
||||||
|
useUtf8ForConsole();
|
||||||
|
useUtf8ForBoostFilesystem();
|
||||||
|
|
||||||
|
// Convert command-line arguments to UTF-8
|
||||||
|
const vector<string> args = argsToUtf8(platformArgc, platformArgv);
|
||||||
|
|
||||||
auto pausableStderrSink = addPausableStdErrSink(logging::Level::Warn);
|
auto pausableStderrSink = addPausableStdErrSink(logging::Level::Warn);
|
||||||
pausableStderrSink->pause();
|
pausableStderrSink->pause();
|
||||||
|
|
||||||
|
@ -130,7 +138,11 @@ int main(int argc, char *argv[]) {
|
||||||
});
|
});
|
||||||
|
|
||||||
// Parse command line
|
// Parse command line
|
||||||
cmd.parse(argc, argv);
|
{
|
||||||
|
// TCLAP mutates the function argument! Pass a copy.
|
||||||
|
vector<string> argsCopy(args);
|
||||||
|
cmd.parse(argsCopy);
|
||||||
|
}
|
||||||
if (quietMode.getValue()) {
|
if (quietMode.getValue()) {
|
||||||
infoStream = &nullStream;
|
infoStream = &nullStream;
|
||||||
}
|
}
|
||||||
|
@ -146,7 +158,7 @@ int main(int argc, char *argv[]) {
|
||||||
}
|
}
|
||||||
|
|
||||||
logging::infoFormat("Application startup. Command line: {}", join(
|
logging::infoFormat("Application startup. Command line: {}", join(
|
||||||
vector<char*>(argv, argv + argc) | transformed([](char* arg) { return fmt::format("\"{}\"", arg); }), " "));
|
args | transformed([](string arg) { return fmt::format("\"{}\"", arg); }), " "));
|
||||||
|
|
||||||
try {
|
try {
|
||||||
*infoStream << fmt::format("Generating lip sync data for {}.", inputFilePath) << std::endl;
|
*infoStream << fmt::format("Generating lip sync data for {}.", inputFilePath) << std::endl;
|
||||||
|
@ -158,7 +170,7 @@ int main(int argc, char *argv[]) {
|
||||||
// Animate the recording
|
// Animate the recording
|
||||||
animation = animateWaveFile(
|
animation = animateWaveFile(
|
||||||
inputFilePath,
|
inputFilePath,
|
||||||
dialogFile.isSet() ? readUtf8File(path(dialogFile.getValue())) : boost::optional<u32string>(),
|
dialogFile.isSet() ? readUtf8File(path(dialogFile.getValue())) : boost::optional<string>(),
|
||||||
targetShapeSet,
|
targetShapeSet,
|
||||||
maxThreadCount.getValue(),
|
maxThreadCount.getValue(),
|
||||||
progressBar);
|
progressBar);
|
||||||
|
|
|
@ -11,7 +11,6 @@
|
||||||
#include <gsl_util.h>
|
#include <gsl_util.h>
|
||||||
|
|
||||||
using std::string;
|
using std::string;
|
||||||
using std::u32string;
|
|
||||||
using std::vector;
|
using std::vector;
|
||||||
using std::regex;
|
using std::regex;
|
||||||
using std::map;
|
using std::map;
|
||||||
|
|
|
@ -44,7 +44,7 @@ using std::regex;
|
||||||
using std::regex_replace;
|
using std::regex_replace;
|
||||||
using std::chrono::duration;
|
using std::chrono::duration;
|
||||||
using boost::optional;
|
using boost::optional;
|
||||||
using std::u32string;
|
using std::string;
|
||||||
using std::chrono::duration_cast;
|
using std::chrono::duration_cast;
|
||||||
using std::array;
|
using std::array;
|
||||||
|
|
||||||
|
@ -251,7 +251,7 @@ lambda_unique_ptr<ngram_model_t> createDefaultLanguageModel(ps_decoder_t& decode
|
||||||
return std::move(result);
|
return std::move(result);
|
||||||
}
|
}
|
||||||
|
|
||||||
lambda_unique_ptr<ngram_model_t> createDialogLanguageModel(ps_decoder_t& decoder, const u32string& dialog) {
|
lambda_unique_ptr<ngram_model_t> createDialogLanguageModel(ps_decoder_t& decoder, const string& dialog) {
|
||||||
// Split dialog into normalized words
|
// Split dialog into normalized words
|
||||||
vector<string> words = tokenizeText(dialog, [&](const string& word) { return dictionaryContains(*decoder.dict, word); });
|
vector<string> words = tokenizeText(dialog, [&](const string& word) { return dictionaryContains(*decoder.dict, word); });
|
||||||
|
|
||||||
|
@ -264,7 +264,7 @@ lambda_unique_ptr<ngram_model_t> createDialogLanguageModel(ps_decoder_t& decoder
|
||||||
return createLanguageModel(words, decoder);
|
return createLanguageModel(words, decoder);
|
||||||
}
|
}
|
||||||
|
|
||||||
lambda_unique_ptr<ngram_model_t> createBiasedLanguageModel(ps_decoder_t& decoder, const u32string& dialog) {
|
lambda_unique_ptr<ngram_model_t> createBiasedLanguageModel(ps_decoder_t& decoder, const string& dialog) {
|
||||||
auto defaultLanguageModel = createDefaultLanguageModel(decoder);
|
auto defaultLanguageModel = createDefaultLanguageModel(decoder);
|
||||||
auto dialogLanguageModel = createDialogLanguageModel(decoder, dialog);
|
auto dialogLanguageModel = createDialogLanguageModel(decoder, dialog);
|
||||||
constexpr int modelCount = 2;
|
constexpr int modelCount = 2;
|
||||||
|
@ -281,7 +281,7 @@ lambda_unique_ptr<ngram_model_t> createBiasedLanguageModel(ps_decoder_t& decoder
|
||||||
return std::move(result);
|
return std::move(result);
|
||||||
}
|
}
|
||||||
|
|
||||||
lambda_unique_ptr<ps_decoder_t> createDecoder(optional<u32string> dialog) {
|
lambda_unique_ptr<ps_decoder_t> createDecoder(optional<string> dialog) {
|
||||||
lambda_unique_ptr<cmd_ln_t> config(
|
lambda_unique_ptr<cmd_ln_t> config(
|
||||||
cmd_ln_init(
|
cmd_ln_init(
|
||||||
nullptr, ps_args(), true,
|
nullptr, ps_args(), true,
|
||||||
|
@ -435,7 +435,7 @@ Timeline<Phone> utteranceToPhones(
|
||||||
|
|
||||||
BoundedTimeline<Phone> recognizePhones(
|
BoundedTimeline<Phone> recognizePhones(
|
||||||
const AudioClip& inputAudioClip,
|
const AudioClip& inputAudioClip,
|
||||||
optional<u32string> dialog,
|
optional<string> dialog,
|
||||||
int maxThreadCount,
|
int maxThreadCount,
|
||||||
ProgressSink& progressSink)
|
ProgressSink& progressSink)
|
||||||
{
|
{
|
||||||
|
|
|
@ -7,6 +7,6 @@
|
||||||
|
|
||||||
BoundedTimeline<Phone> recognizePhones(
|
BoundedTimeline<Phone> recognizePhones(
|
||||||
const AudioClip& audioClip,
|
const AudioClip& audioClip,
|
||||||
boost::optional<std::u32string> dialog,
|
boost::optional<std::string> dialog,
|
||||||
int maxThreadCount,
|
int maxThreadCount,
|
||||||
ProgressSink& progressSink);
|
ProgressSink& progressSink);
|
||||||
|
|
|
@ -10,7 +10,6 @@ extern "C" {
|
||||||
}
|
}
|
||||||
|
|
||||||
using std::runtime_error;
|
using std::runtime_error;
|
||||||
using std::u32string;
|
|
||||||
using std::string;
|
using std::string;
|
||||||
using std::vector;
|
using std::vector;
|
||||||
using std::regex;
|
using std::regex;
|
||||||
|
@ -34,9 +33,12 @@ static const cst_synth_module synth_method_normalize[] = {
|
||||||
};
|
};
|
||||||
|
|
||||||
vector<string> tokenizeViaFlite(const string& text) {
|
vector<string> tokenizeViaFlite(const string& text) {
|
||||||
|
// Convert text to ASCII
|
||||||
|
const string asciiText = utf8ToAscii(text);
|
||||||
|
|
||||||
// Create utterance object with text
|
// Create utterance object with text
|
||||||
lambda_unique_ptr<cst_utterance> utterance(new_utterance(), [](cst_utterance* utterance) { delete_utterance(utterance); });
|
lambda_unique_ptr<cst_utterance> utterance(new_utterance(), [](cst_utterance* utterance) { delete_utterance(utterance); });
|
||||||
utt_set_input_text(utterance.get(), text.c_str());
|
utt_set_input_text(utterance.get(), asciiText.c_str());
|
||||||
lambda_unique_ptr<cst_voice> voice = createDummyVoice();
|
lambda_unique_ptr<cst_voice> voice = createDummyVoice();
|
||||||
utt_init(utterance.get(), voice.get());
|
utt_init(utterance.get(), voice.get());
|
||||||
|
|
||||||
|
@ -73,8 +75,8 @@ optional<string> findSimilarDictionaryWord(const string& word, function<bool(con
|
||||||
return boost::none;
|
return boost::none;
|
||||||
}
|
}
|
||||||
|
|
||||||
vector<string> tokenizeText(const u32string& text, function<bool(const string&)> dictionaryContains) {
|
vector<string> tokenizeText(const string& text, function<bool(const string&)> dictionaryContains) {
|
||||||
vector<string> words = tokenizeViaFlite(toAscii(text));
|
vector<string> words = tokenizeViaFlite(text);
|
||||||
|
|
||||||
// Join words separated by apostrophes
|
// Join words separated by apostrophes
|
||||||
for (int i = words.size() - 1; i > 0; --i) {
|
for (int i = words.size() - 1; i > 0; --i) {
|
||||||
|
|
|
@ -4,4 +4,4 @@
|
||||||
#include <functional>
|
#include <functional>
|
||||||
#include <string>
|
#include <string>
|
||||||
|
|
||||||
std::vector<std::string> tokenizeText(const std::u32string& text, std::function<bool(const std::string&)> dictionaryContains);
|
std::vector<std::string> tokenizeText(const std::string& text, std::function<bool(const std::string&)> dictionaryContains);
|
||||||
|
|
|
@ -1,94 +0,0 @@
|
||||||
// Generated by asciiCases.rb; don't modify by hand!
|
|
||||||
|
|
||||||
case U'À': case U'Á': case U'Â': case U'Ã': case U'Ä': case U'Å': case U'Ā': case U'Ă': case U'Ą': case U'Ǎ': case U'Ǟ': case U'Ǡ': case U'Ǻ': case U'Ȁ': case U'Ȃ': case U'Ȧ': case U'Ⱥ':
|
|
||||||
return 'A';
|
|
||||||
case U'Ɓ': case U'Ƃ': case U'Ƀ':
|
|
||||||
return 'B';
|
|
||||||
case U'Ç': case U'Ć': case U'Ĉ': case U'Ċ': case U'Č': case U'Ƈ': case U'Ȼ':
|
|
||||||
return 'C';
|
|
||||||
case U'Ď': case U'Đ': case U'Ɗ': case U'Ƌ':
|
|
||||||
return 'D';
|
|
||||||
case U'È': case U'É': case U'Ê': case U'Ë': case U'Ē': case U'Ĕ': case U'Ė': case U'Ę': case U'Ě': case U'Ȅ': case U'Ȇ': case U'Ȩ': case U'Ɇ':
|
|
||||||
return 'E';
|
|
||||||
case U'Ƒ':
|
|
||||||
return 'F';
|
|
||||||
case U'Ĝ': case U'Ğ': case U'Ġ': case U'Ģ': case U'Ɠ': case U'Ǥ': case U'Ǧ': case U'Ǵ':
|
|
||||||
return 'G';
|
|
||||||
case U'Ĥ': case U'Ħ': case U'Ȟ':
|
|
||||||
return 'H';
|
|
||||||
case U'Ì': case U'Í': case U'Î': case U'Ï': case U'Ĩ': case U'Ī': case U'Ĭ': case U'Į': case U'İ': case U'Ɨ': case U'Ǐ': case U'Ȉ': case U'Ȋ':
|
|
||||||
return 'I';
|
|
||||||
case U'Ĵ': case U'Ɉ':
|
|
||||||
return 'J';
|
|
||||||
case U'Ķ': case U'Ƙ': case U'Ǩ':
|
|
||||||
return 'K';
|
|
||||||
case U'Ĺ': case U'Ļ': case U'Ľ': case U'Ŀ': case U'Ł': case U'Ƚ':
|
|
||||||
return 'L';
|
|
||||||
case U'Ñ': case U'Ń': case U'Ņ': case U'Ň': case U'Ɲ': case U'Ǹ': case U'Ƞ':
|
|
||||||
return 'N';
|
|
||||||
case U'Ò': case U'Ó': case U'Ô': case U'Õ': case U'Ö': case U'Ø': case U'Ō': case U'Ŏ': case U'Ő': case U'Ɵ': case U'Ơ': case U'Ǒ': case U'Ǫ': case U'Ǭ': case U'Ǿ': case U'Ȍ': case U'Ȏ': case U'Ȫ': case U'Ȭ': case U'Ȯ': case U'Ȱ':
|
|
||||||
return 'O';
|
|
||||||
case U'Ƥ':
|
|
||||||
return 'P';
|
|
||||||
case U'Ŕ': case U'Ŗ': case U'Ř': case U'Ȑ': case U'Ȓ': case U'Ɍ':
|
|
||||||
return 'R';
|
|
||||||
case U'Ś': case U'Ŝ': case U'Ş': case U'Š': case U'Ș':
|
|
||||||
return 'S';
|
|
||||||
case U'Ţ': case U'Ť': case U'Ŧ': case U'Ƭ': case U'Ʈ': case U'Ț': case U'Ⱦ':
|
|
||||||
return 'T';
|
|
||||||
case U'Ù': case U'Ú': case U'Û': case U'Ü': case U'Ũ': case U'Ū': case U'Ŭ': case U'Ů': case U'Ű': case U'Ų': case U'Ư': case U'Ǔ': case U'Ǖ': case U'Ǘ': case U'Ǚ': case U'Ǜ': case U'Ȕ': case U'Ȗ': case U'Ʉ':
|
|
||||||
return 'U';
|
|
||||||
case U'Ʋ':
|
|
||||||
return 'V';
|
|
||||||
case U'Ŵ':
|
|
||||||
return 'W';
|
|
||||||
case U'Ý': case U'Ŷ': case U'Ÿ': case U'Ƴ': case U'Ȳ': case U'Ɏ':
|
|
||||||
return 'Y';
|
|
||||||
case U'Ź': case U'Ż': case U'Ž': case U'Ƶ': case U'Ȥ':
|
|
||||||
return 'Z';
|
|
||||||
case U'à': case U'á': case U'â': case U'ã': case U'ä': case U'å': case U'ā': case U'ă': case U'ą': case U'ǎ': case U'ǟ': case U'ǡ': case U'ǻ': case U'ȁ': case U'ȃ': case U'ȧ':
|
|
||||||
return 'a';
|
|
||||||
case U'ƀ': case U'ƃ':
|
|
||||||
return 'b';
|
|
||||||
case U'ç': case U'ć': case U'ĉ': case U'ċ': case U'č': case U'ƈ': case U'ȼ':
|
|
||||||
return 'c';
|
|
||||||
case U'ď': case U'đ': case U'ƌ': case U'ȡ':
|
|
||||||
return 'd';
|
|
||||||
case U'è': case U'é': case U'ê': case U'ë': case U'ē': case U'ĕ': case U'ė': case U'ę': case U'ě': case U'ȅ': case U'ȇ': case U'ȩ': case U'ɇ':
|
|
||||||
return 'e';
|
|
||||||
case U'ƒ':
|
|
||||||
return 'f';
|
|
||||||
case U'ĝ': case U'ğ': case U'ġ': case U'ģ': case U'ǥ': case U'ǧ': case U'ǵ':
|
|
||||||
return 'g';
|
|
||||||
case U'ĥ': case U'ħ': case U'ȟ':
|
|
||||||
return 'h';
|
|
||||||
case U'ì': case U'í': case U'î': case U'ï': case U'ĩ': case U'ī': case U'ĭ': case U'į': case U'ǐ': case U'ȉ': case U'ȋ':
|
|
||||||
return 'i';
|
|
||||||
case U'ĵ': case U'ǰ': case U'ɉ':
|
|
||||||
return 'j';
|
|
||||||
case U'ķ': case U'ƙ': case U'ǩ':
|
|
||||||
return 'k';
|
|
||||||
case U'ĺ': case U'ļ': case U'ľ': case U'ŀ': case U'ł': case U'ƚ': case U'ȴ':
|
|
||||||
return 'l';
|
|
||||||
case U'ñ': case U'ń': case U'ņ': case U'ň': case U'ʼn': case U'ƞ': case U'ǹ': case U'ȵ':
|
|
||||||
return 'n';
|
|
||||||
case U'ò': case U'ó': case U'ô': case U'õ': case U'ö': case U'ø': case U'ō': case U'ŏ': case U'ő': case U'ơ': case U'ǒ': case U'ǫ': case U'ǭ': case U'ǿ': case U'ȍ': case U'ȏ': case U'ȫ': case U'ȭ': case U'ȯ': case U'ȱ':
|
|
||||||
return 'o';
|
|
||||||
case U'ƥ':
|
|
||||||
return 'p';
|
|
||||||
case U'ɋ':
|
|
||||||
return 'q';
|
|
||||||
case U'ŕ': case U'ŗ': case U'ř': case U'ȑ': case U'ȓ': case U'ɍ':
|
|
||||||
return 'r';
|
|
||||||
case U'ś': case U'ŝ': case U'ş': case U'š': case U'ș': case U'ȿ':
|
|
||||||
return 's';
|
|
||||||
case U'ţ': case U'ť': case U'ŧ': case U'ƫ': case U'ƭ': case U'ț': case U'ȶ':
|
|
||||||
return 't';
|
|
||||||
case U'ù': case U'ú': case U'û': case U'ü': case U'ũ': case U'ū': case U'ŭ': case U'ů': case U'ű': case U'ų': case U'ư': case U'ǔ': case U'ǖ': case U'ǘ': case U'ǚ': case U'ǜ': case U'ȕ': case U'ȗ':
|
|
||||||
return 'u';
|
|
||||||
case U'ŵ':
|
|
||||||
return 'w';
|
|
||||||
case U'ý': case U'ÿ': case U'ŷ': case U'ƴ': case U'ȳ': case U'ɏ':
|
|
||||||
return 'y';
|
|
||||||
case U'ź': case U'ż': case U'ž': case U'ƶ': case U'ȥ': case U'ɀ':
|
|
||||||
return 'z';
|
|
|
@ -1,33 +0,0 @@
|
||||||
# Generates asciiCases.cpp: a list of C++ `case` labels that map accented Latin
# letters (code points 0x80..0x24f of the Unicode character database) to the
# plain ASCII letter named in their Unicode character name.

require 'open-uri'
require 'csv'

# Create mapping from ASCII characters to related Unicode characters
mapping = Hash.new { |hash, key| hash[key] = [] }
url = 'http://www.unicode.org/Public/8.0.0/ucd/UnicodeData.txt'
headers = [:code, :name, :category]

# Open the URL through the URI object: the Kernel#open patch that open-uri used
# to install for URL strings no longer exists in Ruby 3.0+. The block form also
# closes the download handle when parsing is done.
URI.parse(url).open do |data|
  CSV.new(data, :col_sep => ';', :headers => headers).each do |row|
    code = row[:code].hex
    next if code < 0x80
    # Stop at the first code point past 0x24f; rows after it are not read
    break if code > 0x24f

    char = [code].pack('U')
    name = row[:name]
    # Accept "LATIN CAPITAL/SMALL LETTER X ..." but reject names that mention a
    # second LETTER (those describe ligatures or digraphs, not a single base letter)
    match = /^LATIN (CAPITAL|SMALL) LETTER ([A-Z])\b(?!.*\bLETTER\b)/.match(name)
    next unless match

    baseChar = match[2]
    baseChar = baseChar.downcase if match[1] == 'SMALL'
    mapping[baseChar] << char
  end
end
mapping = mapping.sort.to_h

# Generate asciiCases.cpp
File.open('asciiCases.cpp', 'w') do |file|
  file.print "// Generated by #{__FILE__}; don't modify by hand!\n\n"
  mapping.each do |asciiChar, unicodeChars|
    file.print unicodeChars.map { |c| "case U'#{c}':" }.join(' '), "\n"
    file.print "\treturn '#{asciiChar}';\n"
  end
end
|
|
|
@ -6,9 +6,21 @@
|
||||||
#include <boost/uuid/uuid_io.hpp>
|
#include <boost/uuid/uuid_io.hpp>
|
||||||
#include "platformTools.h"
|
#include "platformTools.h"
|
||||||
#include <whereami.h>
|
#include <whereami.h>
|
||||||
|
#include <utf8.h>
|
||||||
|
#include <gsl_util.h>
|
||||||
|
#include "tools.h"
|
||||||
|
#include <boost/filesystem/detail/utf8_codecvt_facet.hpp>
|
||||||
|
#include <iostream>
|
||||||
|
|
||||||
|
#ifdef _WIN32
|
||||||
|
#include <Windows.h>
|
||||||
|
#include <io.h>
|
||||||
|
#include <fcntl.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
using boost::filesystem::path;
|
using boost::filesystem::path;
|
||||||
using std::string;
|
using std::string;
|
||||||
|
using std::vector;
|
||||||
|
|
||||||
path getBinPath() {
|
path getBinPath() {
|
||||||
static const path binPath = [] {
|
static const path binPath = [] {
|
||||||
|
@ -69,3 +81,73 @@ std::string errorNumberToString(int errorNumber) {
|
||||||
#endif
|
#endif
|
||||||
return message;
|
return message;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Returns the program's command-line arguments as UTF-8-encoded strings.
//
// On Windows the argc/argv pair is ignored and the arguments are re-read from the
// UTF-16 command line of the process. On other platforms argv is assumed to be
// UTF-8 already and is copied unchanged.
//
// Throws std::runtime_error if Windows cannot split the command line.
std::vector<std::string> argsToUtf8(int argc, char* argv[]) {
#ifdef _WIN32
	// On Windows, there is no way to convert the single-byte argument strings to Unicode.
	// We'll just ignore them.
	UNUSED(argc);
	UNUSED(argv);

	// Get command-line arguments as UTF-16 strings
	static_assert(sizeof(wchar_t) == sizeof(char16_t), "Expected wchar_t to be a 16-bit type.");
	int argumentCount = 0;
	char16_t** args = reinterpret_cast<char16_t**>(CommandLineToArgvW(GetCommandLineW(), &argumentCount));
	if (!args) {
		throw std::runtime_error("Error splitting the UTF-16 command line arguments.");
	}
	auto freeArgs = gsl::finally([&]() { LocalFree(args); });

	// Convert UTF-16 strings to UTF-8.
	// Iterate over the count reported for this very array rather than over argc: the two
	// are produced by different parsers, and indexing args with argc would read out of
	// bounds if they ever disagreed.
	std::vector<std::string> result;
	result.reserve(static_cast<size_t>(argumentCount));
	for (int i = 0; i < argumentCount; ++i) {
		const std::u16string utf16String(args[i]);
		result.emplace_back();
		utf8::utf16to8(utf16String.begin(), utf16String.end(), back_inserter(result.back()));
	}
	return result;
#else
	// On Unix systems, command-line args are already in UTF-8 format. Just convert them to strings.
	return std::vector<std::string>(argv, argv + argc);
#endif
}
|
||||||
|
|
||||||
|
// Stream buffer that collects output in memory and passes it on to a C stdio stream
// in a single write each time the owning stream is flushed.
class ConsoleBuffer : public std::stringbuf {
public:
	// file: the stdio stream that receives the buffered output. It is not closed by this class.
	explicit ConsoleBuffer(FILE* file)
		: file(file) {}

	// Writes everything buffered since the last sync to the stdio stream and flushes it.
	// Returns 0 on success and -1 if the data could not be written or flushed completely.
	int sync() override {
		const std::string pending = str();
		str("");

		// fwrite with an explicit length, so that embedded NUL bytes do not cut the output short
		const size_t writtenCount = fwrite(pending.data(), 1, pending.size(), file);
		const bool writtenCompletely = writtenCount == pending.size();

		// A flush request on the C++ stream should reach the device, not stop in the stdio buffer
		const bool flushed = fflush(file) == 0;

		return (writtenCompletely && flushed) ? 0 : -1;
	}

private:
	FILE* file;
};
|
||||||
|
|
||||||
|
// Prepares console output for UTF-8 text. Does nothing outside Windows, where consoles are
// assumed to expect UTF-8-encoded data already.
void useUtf8ForConsole() {
#ifdef _WIN32
	// Tell the console that the bytes it receives are UTF-8
	SetConsoleOutputCP(CP_UTF8);

	// Replace the default stream buffers of std::cout and std::cerr, which may chop up UTF-8
	// byte sequences.
	// See https://stackoverflow.com/questions/45575863/how-to-print-utf-8-strings-to-stdcout-on-windows
	// The replacement buffers are heap-allocated and are not deleted by this function.
	ConsoleBuffer* const stdoutBuffer = new ConsoleBuffer(stdout);
	std::cout.rdbuf(stdoutBuffer);
	ConsoleBuffer* const stderrBuffer = new ConsoleBuffer(stderr);
	std::cerr.rdbuf(stderrBuffer);
#endif
}
|
||||||
|
|
||||||
|
void useUtf8ForBoostFilesystem() {
|
||||||
|
std::locale globalLocale = std::locale();
|
||||||
|
std::locale utf8Locale(globalLocale, new boost::filesystem::detail::utf8_codecvt_facet);
|
||||||
|
path::imbue(utf8Locale);
|
||||||
|
}
|
||||||
|
|
|
@ -10,3 +10,8 @@ boost::filesystem::path getTempFilePath();
|
||||||
|
|
||||||
std::tm getLocalTime(const time_t& time);
|
std::tm getLocalTime(const time_t& time);
|
||||||
std::string errorNumberToString(int errorNumber);
|
std::string errorNumberToString(int errorNumber);
|
||||||
|
|
||||||
|
std::vector<std::string> argsToUtf8(int argc, char *argv[]);
|
||||||
|
|
||||||
|
void useUtf8ForConsole();
|
||||||
|
void useUtf8ForBoostFilesystem();
|
|
@ -1,12 +1,16 @@
|
||||||
#include "stringTools.h"
|
#include "stringTools.h"
|
||||||
#include <boost/algorithm/string/trim.hpp>
|
#include <boost/algorithm/string/trim.hpp>
|
||||||
#include <codecvt>
|
#include <utf8.h>
|
||||||
|
#include <utf8proc.h>
|
||||||
|
#include <regex>
|
||||||
|
|
||||||
using std::string;
|
using std::string;
|
||||||
using std::wstring;
|
using std::wstring;
|
||||||
using std::u32string;
|
using std::u32string;
|
||||||
using std::vector;
|
using std::vector;
|
||||||
using boost::optional;
|
using boost::optional;
|
||||||
|
using std::regex;
|
||||||
|
using std::regex_replace;
|
||||||
|
|
||||||
vector<string> splitIntoLines(const string& s) {
|
vector<string> splitIntoLines(const string& s) {
|
||||||
vector<string> lines;
|
vector<string> lines;
|
||||||
|
@ -83,6 +87,10 @@ vector<string> wrapString(const string& s, int lineLength, int hangingIndent) {
|
||||||
return lines;
|
return lines;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool isValidUtf8(const string& s) {
|
||||||
|
return utf8::is_valid(s.begin(), s.end());
|
||||||
|
}
|
||||||
|
|
||||||
wstring latin1ToWide(const string& s) {
|
wstring latin1ToWide(const string& s) {
|
||||||
wstring result;
|
wstring result;
|
||||||
for (unsigned char c : s) {
|
for (unsigned char c : s) {
|
||||||
|
@ -91,40 +99,61 @@ wstring latin1ToWide(const string& s) {
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
optional<char> toAscii(char32_t ch) {
|
string utf8ToAscii(const string s) {
|
||||||
switch (ch) {
|
// Normalize string, simplifying it as much as possible
|
||||||
#include "asciiCases.cpp"
|
const NormalizationOptions options = NormalizationOptions::CompatibilityMode
|
||||||
default:
|
| NormalizationOptions::Decompose
|
||||||
return ch < 0x80 ? static_cast<char>(ch) : optional<char>();
|
| NormalizationOptions::SimplifyLineBreaks
|
||||||
}
|
| NormalizationOptions::SimplifyWhiteSpace
|
||||||
}
|
| NormalizationOptions::StripCharacterMarkings
|
||||||
|
| NormalizationOptions::StripIgnorableCharacters;
|
||||||
|
string simplified = normalizeUnicode(s, options);
|
||||||
|
|
||||||
string toAscii(const u32string& s) {
|
// Replace common Unicode characters with ASCII equivalents
|
||||||
string result;
|
static const vector<std::pair<regex, string>> replacements{
|
||||||
for (char32_t ch : s) {
|
{regex("«|»|“|”|„|‟"), "\""},
|
||||||
optional<char> ascii = toAscii(ch);
|
{regex("‘|’|‚|‛|‹|›"), "'"},
|
||||||
if (ascii) result.append(1, *ascii);
|
{regex("‐|‑|‒|⁃|⁻|₋|−|➖|–|—|―|﹘|﹣|-"), "-"},
|
||||||
|
{regex("…|⋯"), "..."},
|
||||||
|
{regex("•"), "*"},
|
||||||
|
{regex("†|+"), "+"},
|
||||||
|
{regex("⁄|∕|⧸|/|/"), "/"},
|
||||||
|
{regex("×"), "x"},
|
||||||
|
};
|
||||||
|
for (const auto& replacement : replacements) {
|
||||||
|
simplified = regex_replace(simplified, replacement.first, replacement.second);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Skip all non-ASCII code points, including multi-byte characters
|
||||||
|
string result;
|
||||||
|
for (char c : simplified) {
|
||||||
|
const bool isAscii = (c & 0x80) == 0;
|
||||||
|
if (isAscii) {
|
||||||
|
result.append(1, c);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
string toAscii(const wstring& s) {
|
// Applies utf8proc normalization to a UTF-8 string and returns the result as a new string.
//
// s:       the UTF-8 input.
// options: combination of NormalizationOptions flags, passed to utf8proc_map unchanged.
//
// Throws std::invalid_argument if utf8proc reports UTF8PROC_ERROR_INVALIDOPTS and
// std::runtime_error for every other utf8proc error.
string normalizeUnicode(const string s, NormalizationOptions options) {
	// utf8proc_map allocates the output buffer; it must be released with free()
	char* result = nullptr;
	const utf8proc_ssize_t byteCount = utf8proc_map(
		reinterpret_cast<const uint8_t*>(s.data()),
		s.length(),
		reinterpret_cast<uint8_t**>(&result),
		static_cast<utf8proc_option_t>(options));

	// A negative return value is an error code, not a length
	if (byteCount < 0) {
		const utf8proc_ssize_t errorCode = byteCount;
		const string message = string("Error normalizing string: ") + utf8proc_errmsg(errorCode);
		if (errorCode == UTF8PROC_ERROR_INVALIDOPTS) {
			throw std::invalid_argument(message);
		}
		throw std::runtime_error(message);
	}

	// Copy the buffer into a string, releasing it even if the copy fails
	string resultString;
	try {
		resultString.assign(result, byteCount);
	} catch (...) {
		free(result);
		throw;
	}
	free(result);
	return resultString;
}
|
||||||
|
|
|
@ -3,6 +3,7 @@
|
||||||
#include <vector>
|
#include <vector>
|
||||||
#include <boost/optional.hpp>
|
#include <boost/optional.hpp>
|
||||||
#include <boost/lexical_cast.hpp>
|
#include <boost/lexical_cast.hpp>
|
||||||
|
#include <utf8proc.h>
|
||||||
|
|
||||||
std::vector<std::string> splitIntoLines(const std::string& s);
|
std::vector<std::string> splitIntoLines(const std::string& s);
|
||||||
|
|
||||||
|
@ -10,15 +11,31 @@ std::vector<std::string> wrapSingleLineString(const std::string& s, int lineLeng
|
||||||
|
|
||||||
std::vector<std::string> wrapString(const std::string& s, int lineLength, int hangingIndent = 0);
|
std::vector<std::string> wrapString(const std::string& s, int lineLength, int hangingIndent = 0);
|
||||||
|
|
||||||
|
bool isValidUtf8(const std::string& s);
|
||||||
|
|
||||||
std::wstring latin1ToWide(const std::string& s);
|
std::wstring latin1ToWide(const std::string& s);
|
||||||
|
|
||||||
boost::optional<char> toAscii(char32_t ch);
|
boost::optional<char> toAscii(char32_t ch);
|
||||||
|
|
||||||
std::string toAscii(const std::u32string& s);
|
std::string utf8ToAscii(const std::string s);
|
||||||
|
|
||||||
std::string toAscii(const std::wstring& s);
|
enum class NormalizationOptions : int {
|
||||||
|
CompatibilityMode = UTF8PROC_COMPAT,
|
||||||
|
Compose = UTF8PROC_COMPOSE,
|
||||||
|
Decompose = UTF8PROC_DECOMPOSE,
|
||||||
|
StripIgnorableCharacters = UTF8PROC_IGNORE,
|
||||||
|
ThrowOnUnassignedCodepoints = UTF8PROC_REJECTNA,
|
||||||
|
SimplifyLineBreaks = UTF8PROC_NLF2LS,
|
||||||
|
SimplifyWhiteSpace = UTF8PROC_STRIPCC,
|
||||||
|
StripCharacterMarkings = UTF8PROC_STRIPMARK
|
||||||
|
};
|
||||||
|
|
||||||
std::u32string utf8ToUtf32(const std::string& s);
|
// Combines two sets of normalization flags into one by OR-ing their integer values.
constexpr NormalizationOptions operator|(NormalizationOptions a, NormalizationOptions b) {
	using Bits = int;
	return static_cast<NormalizationOptions>(static_cast<Bits>(a) | static_cast<Bits>(b));
}
|
||||||
|
|
||||||
|
std::string normalizeUnicode(const std::string s, NormalizationOptions options);
|
||||||
|
|
||||||
template<typename T>
|
template<typename T>
|
||||||
std::string join(T range, const std::string separator) {
|
std::string join(T range, const std::string separator) {
|
||||||
|
|
|
@ -5,10 +5,9 @@
|
||||||
#include "stringTools.h"
|
#include "stringTools.h"
|
||||||
|
|
||||||
using std::string;
|
using std::string;
|
||||||
using std::u32string;
|
|
||||||
using boost::filesystem::path;
|
using boost::filesystem::path;
|
||||||
|
|
||||||
u32string readUtf8File(path filePath) {
|
string readUtf8File(path filePath) {
|
||||||
if (!exists(filePath)) {
|
if (!exists(filePath)) {
|
||||||
throw std::invalid_argument(fmt::format("File {} does not exist.", filePath));
|
throw std::invalid_argument(fmt::format("File {} does not exist.", filePath));
|
||||||
}
|
}
|
||||||
|
@ -16,12 +15,12 @@ u32string readUtf8File(path filePath) {
|
||||||
boost::filesystem::ifstream file;
|
boost::filesystem::ifstream file;
|
||||||
file.exceptions(std::ifstream::failbit | std::ifstream::badbit);
|
file.exceptions(std::ifstream::failbit | std::ifstream::badbit);
|
||||||
file.open(filePath);
|
file.open(filePath);
|
||||||
string utf8Text((std::istreambuf_iterator<char>(file)), std::istreambuf_iterator<char>());
|
string text((std::istreambuf_iterator<char>(file)), std::istreambuf_iterator<char>());
|
||||||
try {
|
if (!isValidUtf8(text)) {
|
||||||
return utf8ToUtf32(utf8Text);
|
throw std::runtime_error("File encoding is not ASCII or UTF-8.");
|
||||||
} catch (...) {
|
|
||||||
std::throw_with_nested(std::runtime_error(fmt::format("File encoding is not ASCII or UTF-8.", filePath)));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
return text;
|
||||||
} catch (...) {
|
} catch (...) {
|
||||||
std::throw_with_nested(std::runtime_error(fmt::format("Error reading file {0}.", filePath)));
|
std::throw_with_nested(std::runtime_error(fmt::format("Error reading file {0}.", filePath)));
|
||||||
}
|
}
|
||||||
|
|
|
@ -2,4 +2,4 @@
|
||||||
|
|
||||||
#include <boost/filesystem/path.hpp>
|
#include <boost/filesystem/path.hpp>
|
||||||
|
|
||||||
std::u32string readUtf8File(boost::filesystem::path filePath);
|
std::string readUtf8File(boost::filesystem::path filePath);
|
|
@ -81,11 +81,15 @@ TEST(latin1ToWide, basic) {
|
||||||
EXPECT_EQ(pangramWide, latin1ToWide(pangramLatin1));
|
EXPECT_EQ(pangramWide, latin1ToWide(pangramLatin1));
|
||||||
}
|
}
|
||||||
|
|
||||||
// toAscii
|
// utf8ToAscii
|
||||||
|
|
||||||
TEST(toAscii, string) {
|
TEST(utf8ToAscii, string) {
|
||||||
EXPECT_EQ(
|
EXPECT_EQ(
|
||||||
"A naive man called was having pina colada and creme brulee.",
|
"A naive man called was having pina colada and creme brulee.",
|
||||||
toAscii(U"A naïve man called 晨 was having piña colada and crème brûlée."));
|
utf8ToAscii("A naïve man called 晨 was having piña colada and crème brûlée."));
|
||||||
EXPECT_EQ(string(""), toAscii(U""));
|
EXPECT_EQ(string(""), utf8ToAscii(""));
|
||||||
|
EXPECT_EQ(string("- - - - - - - - - -"), utf8ToAscii("- ‐ ‑ ‒ – — ― ﹘ ﹣ -"));
|
||||||
|
EXPECT_EQ(string("' ' ' ' \" \" \" \" \" \""), utf8ToAscii("‘ ’ ‚ ‛ “ ” „ ‟ « »"));
|
||||||
|
EXPECT_EQ(string("1 2 3"), utf8ToAscii("¹ ² ³"));
|
||||||
|
EXPECT_EQ(string("1/4 1/2 3/4"), utf8ToAscii("¼ ½ ¾"));
|
||||||
}
|
}
|
||||||
|
|
|
@ -2,6 +2,7 @@
|
||||||
#include "tokenization.h"
|
#include "tokenization.h"
|
||||||
#include <regex>
|
#include <regex>
|
||||||
#include <unordered_set>
|
#include <unordered_set>
|
||||||
|
#include <utf8.h>
|
||||||
|
|
||||||
using namespace testing;
|
using namespace testing;
|
||||||
using std::string;
|
using std::string;
|
||||||
|
@ -14,57 +15,64 @@ bool returnTrue(const string&) {
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST(tokenizeText, simpleCases) {
|
TEST(tokenizeText, simpleCases) {
|
||||||
EXPECT_THAT(tokenizeText(U"", returnTrue), IsEmpty());
|
EXPECT_THAT(tokenizeText("", returnTrue), IsEmpty());
|
||||||
EXPECT_THAT(tokenizeText(U" \t\n\r\n ", returnTrue), IsEmpty());
|
EXPECT_THAT(tokenizeText(" \t\n\r\n ", returnTrue), IsEmpty());
|
||||||
EXPECT_THAT(
|
EXPECT_THAT(
|
||||||
tokenizeText(U"Wit is educated insolence.", returnTrue),
|
tokenizeText("Wit is educated insolence.", returnTrue),
|
||||||
ElementsAre("wit", "is", "educated", "insolence")
|
ElementsAre("wit", "is", "educated", "insolence")
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST(tokenizeText, numbers) {
|
TEST(tokenizeText, numbers) {
|
||||||
EXPECT_THAT(
|
EXPECT_THAT(
|
||||||
tokenizeText(U"Henry V died at 36.", returnTrue),
|
tokenizeText("Henry V died at 36.", returnTrue),
|
||||||
ElementsAre("henry", "the", "fifth", "died", "at", "thirty", "six")
|
ElementsAre("henry", "the", "fifth", "died", "at", "thirty", "six")
|
||||||
);
|
);
|
||||||
EXPECT_THAT(
|
EXPECT_THAT(
|
||||||
tokenizeText(U"I spent $4.50 on gum.", returnTrue),
|
tokenizeText("I spent $4.50 on gum.", returnTrue),
|
||||||
ElementsAre("i", "spent", "four", "dollars", "fifty", "cents", "on", "gum")
|
ElementsAre("i", "spent", "four", "dollars", "fifty", "cents", "on", "gum")
|
||||||
);
|
);
|
||||||
EXPECT_THAT(
|
EXPECT_THAT(
|
||||||
tokenizeText(U"I was born in 1982.", returnTrue),
|
tokenizeText("I was born in 1982.", returnTrue),
|
||||||
ElementsAre("i", "was", "born", "in", "nineteen", "eighty", "two")
|
ElementsAre("i", "was", "born", "in", "nineteen", "eighty", "two")
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST(tokenizeText, abbreviations) {
|
TEST(tokenizeText, abbreviations) {
|
||||||
EXPECT_THAT(
|
EXPECT_THAT(
|
||||||
tokenizeText(U"Prof. Foo lives on Dr. Dolittle Dr.", [](const string& word) { return word == "prof."; }),
|
tokenizeText("Prof. Foo lives on Dr. Dolittle Dr.", [](const string& word) { return word == "prof."; }),
|
||||||
ElementsAre("prof.", "foo", "lives", "on", "doctor", "dolittle", "drive")
|
ElementsAre("prof.", "foo", "lives", "on", "doctor", "dolittle", "drive")
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST(tokenizeText, apostrophes) {
|
TEST(tokenizeText, apostrophes) {
|
||||||
EXPECT_THAT(
|
EXPECT_THAT(
|
||||||
tokenizeText(U"'Tis said he'd wish'd for a 'bus 'cause he wouldn't walk.", [](const string& word) { return word == "wouldn't"; }),
|
tokenizeText("'Tis said he'd wish'd for a 'bus 'cause he wouldn't walk.", [](const string& word) { return word == "wouldn't"; }),
|
||||||
ElementsAreArray(vector<string>{ "tis", "said", "he'd", "wish'd", "for", "a", "bus", "cause", "he", "wouldn't", "walk" })
|
ElementsAreArray(vector<string>{ "tis", "said", "he'd", "wish'd", "for", "a", "bus", "cause", "he", "wouldn't", "walk" })
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST(tokenizeText, math) {
|
TEST(tokenizeText, math) {
|
||||||
EXPECT_THAT(
|
EXPECT_THAT(
|
||||||
tokenizeText(U"'1+2*3=7", returnTrue),
|
tokenizeText("'1+2*3=7", returnTrue),
|
||||||
ElementsAre("one", "plus", "two", "times", "three", "equals", "seven")
|
ElementsAre("one", "plus", "two", "times", "three", "equals", "seven")
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
TEST(tokenizeText, unicodeCharacters) {
|
||||||
|
EXPECT_THAT(
|
||||||
|
tokenizeText("A naïve man called 晨 had piña colada and crème brûlée.", returnTrue),
|
||||||
|
ElementsAre("a", "naive", "man", "called", "had", "pina", "colada", "and", "creme", "brulee")
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
// Checks that each word contains only the characters a-z and the apostrophe
|
// Checks that each word contains only the characters a-z and the apostrophe
|
||||||
TEST(tokenizeText, wordsUseLimitedCharacters) {
|
TEST(tokenizeText, wordsUseLimitedCharacters) {
|
||||||
// Create string containing lots of undesirable characters
|
// Create string containing lots of undesirable characters
|
||||||
u32string input = U"A naïve man called 晨 was having piña colada and crème brûlée.";
|
string input = "A naïve man called 晨 was having piña colada and crème brûlée.";
|
||||||
for (char32_t c = 0; c <= 1000; ++c) {
|
for (char32_t c = 0; c <= 1000; ++c) {
|
||||||
input.append(U" ");
|
input.append(" ");
|
||||||
input.append(1, c);
|
utf8::append(c, back_inserter(input));
|
||||||
}
|
}
|
||||||
|
|
||||||
regex legal("^[a-z']+$");
|
regex legal("^[a-z']+$");
|
||||||
|
|
Loading…
Reference in New Issue