From 5f451feb00f0fc79e5e88e2c33f5139c0af8c107 Mon Sep 17 00:00:00 2001 From: Daniel Wolf Date: Tue, 15 Aug 2017 20:47:05 +0200 Subject: [PATCH] Made entire application Unicode-aware All 8-bit strings are UTF-8-encoded now. --- CMakeLists.txt | 2 + src/lib/rhubarbLib.cpp | 6 +- src/lib/rhubarbLib.h | 4 +- src/main.cpp | 22 +++++-- src/recognition/languageModels.cpp | 1 - src/recognition/phoneRecognition.cpp | 10 +-- src/recognition/phoneRecognition.h | 2 +- src/recognition/tokenization.cpp | 10 +-- src/recognition/tokenization.h | 2 +- src/tools/asciiCases.cpp | 94 ---------------------------- src/tools/asciiCases.rb | 33 ---------- src/tools/platformTools.cpp | 82 ++++++++++++++++++++++++ src/tools/platformTools.h | 5 ++ src/tools/stringTools.cpp | 93 +++++++++++++++++---------- src/tools/stringTools.h | 23 ++++++- src/tools/textFiles.cpp | 13 ++-- src/tools/textFiles.h | 2 +- tests/stringToolsTests.cpp | 12 ++-- tests/tokenizationTests.cpp | 32 ++++++---- 19 files changed, 240 insertions(+), 208 deletions(-) delete mode 100644 src/tools/asciiCases.cpp delete mode 100644 src/tools/asciiCases.rb diff --git a/CMakeLists.txt b/CMakeLists.txt index 7567dd0..8f6589b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -424,6 +424,8 @@ target_include_directories(rhubarb-tools PUBLIC "src/tools") target_link_libraries(rhubarb-tools cppFormat whereami + utfcpp + utf8proc ) # Define Rhubarb executable diff --git a/src/lib/rhubarbLib.cpp b/src/lib/rhubarbLib.cpp index c643d15..29c25bc 100644 --- a/src/lib/rhubarbLib.cpp +++ b/src/lib/rhubarbLib.cpp @@ -6,13 +6,13 @@ #include "WaveFileReader.h" using boost::optional; -using std::u32string; +using std::string; using boost::filesystem::path; using std::unique_ptr; JoiningContinuousTimeline animateAudioClip( const AudioClip& audioClip, - optional dialog, + optional dialog, const ShapeSet& targetShapeSet, int maxThreadCount, ProgressSink& progressSink) @@ -32,7 +32,7 @@ unique_ptr createWaveAudioClip(path filePath) { JoiningContinuousTimeline animateWaveFile( path filePath, - optional dialog, + optional dialog, const ShapeSet& targetShapeSet, int maxThreadCount, ProgressSink& progressSink) diff --git a/src/lib/rhubarbLib.h b/src/lib/rhubarbLib.h index a8d588f..b4457c9 100644 --- a/src/lib/rhubarbLib.h +++ b/src/lib/rhubarbLib.h @@ -9,14 +9,14 @@ JoiningContinuousTimeline animateAudioClip( const AudioClip& audioClip, - boost::optional dialog, + boost::optional dialog, const ShapeSet& targetShapeSet, int maxThreadCount, ProgressSink& progressSink); JoiningContinuousTimeline animateWaveFile( boost::filesystem::path filePath, - boost::optional dialog, + boost::optional dialog, const ShapeSet& targetShapeSet, int maxThreadCount, ProgressSink& progressSink); diff --git a/src/main.cpp b/src/main.cpp index 860502b..e709181 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -24,10 +24,11 @@ #include #include "targetShapeSet.h" #include +#include "platformTools.h" using std::exception; using std::string; -using std::u32string; +using std::string; using std::vector; using std::unique_ptr; using std::make_unique; @@ -97,7 +98,14 @@ ShapeSet getTargetShapeSet(const string& extendedShapesString) { return result; } -int main(int argc, char *argv[]) { +int main(int platformArgc, char *platformArgv[]) { + // Use UTF-8 throughout + useUtf8ForConsole(); + useUtf8ForBoostFilesystem(); + + // Convert command-line arguments to UTF-8 + const vector args = argsToUtf8(platformArgc, platformArgv); + auto pausableStderrSink = addPausableStdErrSink(logging::Level::Warn); pausableStderrSink->pause(); @@ -130,7 +138,11 @@ int main(int argc, char *argv[]) { }); // Parse command line - cmd.parse(argc, argv); + { + // TCLAP mutates the function argument! Pass a copy. + vector argsCopy(args); + cmd.parse(argsCopy); + } if (quietMode.getValue()) { infoStream = &nullStream; } @@ -146,7 +158,7 @@ int main(int argc, char *argv[]) { } logging::infoFormat("Application startup. Command line: {}", join( - vector(argv, argv + argc) | transformed([](char* arg) { return fmt::format("\"{}\"", arg); }), " ")); + args | transformed([](string arg) { return fmt::format("\"{}\"", arg); }), " ")); try { *infoStream << fmt::format("Generating lip sync data for {}.", inputFilePath) << std::endl; @@ -158,7 +170,7 @@ int main(int argc, char *argv[]) { // Animate the recording animation = animateWaveFile( inputFilePath, - dialogFile.isSet() ? readUtf8File(path(dialogFile.getValue())) : boost::optional(), + dialogFile.isSet() ? readUtf8File(path(dialogFile.getValue())) : boost::optional(), targetShapeSet, maxThreadCount.getValue(), progressBar); diff --git a/src/recognition/languageModels.cpp b/src/recognition/languageModels.cpp index 093c2e6..c7b4a2c 100644 --- a/src/recognition/languageModels.cpp +++ b/src/recognition/languageModels.cpp @@ -11,7 +11,6 @@ #include using std::string; -using std::u32string; using std::vector; using std::regex; using std::map; diff --git a/src/recognition/phoneRecognition.cpp b/src/recognition/phoneRecognition.cpp index fc07170..f56201f 100644 --- a/src/recognition/phoneRecognition.cpp +++ b/src/recognition/phoneRecognition.cpp @@ -44,7 +44,7 @@ using std::regex; using std::regex_replace; using std::chrono::duration; using boost::optional; -using std::u32string; +using std::string; using std::chrono::duration_cast; using std::array; @@ -251,7 +251,7 @@ lambda_unique_ptr createDefaultLanguageModel(ps_decoder_t& decode return std::move(result); } -lambda_unique_ptr createDialogLanguageModel(ps_decoder_t& decoder, const u32string& dialog) { +lambda_unique_ptr createDialogLanguageModel(ps_decoder_t& decoder, const string& dialog) { // Split dialog into normalized words vector words = tokenizeText(dialog, [&](const string& word) { return dictionaryContains(*decoder.dict, word); }); @@ -264,7 +264,7 @@ lambda_unique_ptr createDialogLanguageModel(ps_decoder_t& decoder return createLanguageModel(words, decoder); } -lambda_unique_ptr createBiasedLanguageModel(ps_decoder_t& decoder, const u32string& dialog) { +lambda_unique_ptr createBiasedLanguageModel(ps_decoder_t& decoder, const string& dialog) { auto defaultLanguageModel = createDefaultLanguageModel(decoder); auto dialogLanguageModel = createDialogLanguageModel(decoder, dialog); constexpr int modelCount = 2; @@ -281,7 +281,7 @@ lambda_unique_ptr createBiasedLanguageModel(ps_decoder_t& decoder return std::move(result); } -lambda_unique_ptr createDecoder(optional dialog) { +lambda_unique_ptr createDecoder(optional dialog) { lambda_unique_ptr config( cmd_ln_init( nullptr, ps_args(), true, @@ -435,7 +435,7 @@ Timeline utteranceToPhones( BoundedTimeline recognizePhones( const AudioClip& inputAudioClip, - optional dialog, + optional dialog, int maxThreadCount, ProgressSink& progressSink) { diff --git a/src/recognition/phoneRecognition.h b/src/recognition/phoneRecognition.h index 8deb5d8..61e94ee 100644 --- a/src/recognition/phoneRecognition.h +++ b/src/recognition/phoneRecognition.h @@ -7,6 +7,6 @@ BoundedTimeline recognizePhones( const AudioClip& audioClip, - boost::optional dialog, + boost::optional dialog, int maxThreadCount, ProgressSink& progressSink); diff --git a/src/recognition/tokenization.cpp b/src/recognition/tokenization.cpp index e2b9000..2fb7741 100644 --- a/src/recognition/tokenization.cpp +++ b/src/recognition/tokenization.cpp @@ -10,7 +10,6 @@ extern "C" { } using std::runtime_error; -using std::u32string; using std::string; using std::vector; using std::regex; @@ -34,9 +33,12 @@ static const cst_synth_module synth_method_normalize[] = { }; vector tokenizeViaFlite(const string& text) { + // Convert text to ASCII + const string asciiText = utf8ToAscii(text); + // Create utterance object with text lambda_unique_ptr utterance(new_utterance(), [](cst_utterance* utterance) { delete_utterance(utterance); }); - utt_set_input_text(utterance.get(), text.c_str()); + utt_set_input_text(utterance.get(), asciiText.c_str()); lambda_unique_ptr voice = createDummyVoice(); utt_init(utterance.get(), voice.get()); @@ -73,8 +75,8 @@ optional findSimilarDictionaryWord(const string& word, function tokenizeText(const u32string& text, function dictionaryContains) { - vector words = tokenizeViaFlite(toAscii(text)); +vector tokenizeText(const string& text, function dictionaryContains) { + vector words = tokenizeViaFlite(text); // Join words separated by apostophes for (int i = words.size() - 1; i > 0; --i) { diff --git a/src/recognition/tokenization.h b/src/recognition/tokenization.h index 16e48fc..c990501 100644 --- a/src/recognition/tokenization.h +++ b/src/recognition/tokenization.h @@ -4,4 +4,4 @@ #include #include -std::vector tokenizeText(const std::u32string& text, std::function dictionaryContains); +std::vector tokenizeText(const std::string& text, std::function dictionaryContains); diff --git a/src/tools/asciiCases.cpp b/src/tools/asciiCases.cpp deleted file mode 100644 index d7cc2ff..0000000 --- a/src/tools/asciiCases.cpp +++ /dev/null @@ -1,94 +0,0 @@ -// Generated by asciiCases.rb; don't modify by hand! - -case U'À': case U'Á': case U'Â': case U'Ã': case U'Ä': case U'Å': case U'Ā': case U'Ă': case U'Ą': case U'Ǎ': case U'Ǟ': case U'Ǡ': case U'Ǻ': case U'Ȁ': case U'Ȃ': case U'Ȧ': case U'Ⱥ': - return 'A'; -case U'Ɓ': case U'Ƃ': case U'Ƀ': - return 'B'; -case U'Ç': case U'Ć': case U'Ĉ': case U'Ċ': case U'Č': case U'Ƈ': case U'Ȼ': - return 'C'; -case U'Ď': case U'Đ': case U'Ɗ': case U'Ƌ': - return 'D'; -case U'È': case U'É': case U'Ê': case U'Ë': case U'Ē': case U'Ĕ': case U'Ė': case U'Ę': case U'Ě': case U'Ȅ': case U'Ȇ': case U'Ȩ': case U'Ɇ': - return 'E'; -case U'Ƒ': - return 'F'; -case U'Ĝ': case U'Ğ': case U'Ġ': case U'Ģ': case U'Ɠ': case U'Ǥ': case U'Ǧ': case U'Ǵ': - return 'G'; -case U'Ĥ': case U'Ħ': case U'Ȟ': - return 'H'; -case U'Ì': case U'Í': case U'Î': case U'Ï': case U'Ĩ': case U'Ī': case U'Ĭ': case U'Į': case U'İ': case U'Ɨ': case U'Ǐ': case U'Ȉ': case U'Ȋ': - return 'I'; -case U'Ĵ': case U'Ɉ': - return 'J'; -case U'Ķ': case U'Ƙ': case U'Ǩ': - return 'K'; -case U'Ĺ': case U'Ļ': case U'Ľ': case U'Ŀ': case U'Ł': case U'Ƚ': - return 'L'; -case U'Ñ': case U'Ń': case U'Ņ': case U'Ň': case U'Ɲ': case U'Ǹ': case U'Ƞ': - return 'N'; -case U'Ò': case U'Ó': case U'Ô': case U'Õ': case U'Ö': case U'Ø': case U'Ō': case U'Ŏ': case U'Ő': case U'Ɵ': case U'Ơ': case U'Ǒ': case U'Ǫ': case U'Ǭ': case U'Ǿ': case U'Ȍ': case U'Ȏ': case U'Ȫ': case U'Ȭ': case U'Ȯ': case U'Ȱ': - return 'O'; -case U'Ƥ': - return 'P'; -case U'Ŕ': case U'Ŗ': case U'Ř': case U'Ȑ': case U'Ȓ': case U'Ɍ': - return 'R'; -case U'Ś': case U'Ŝ': case U'Ş': case U'Š': case U'Ș': - return 'S'; -case U'Ţ': case U'Ť': case U'Ŧ': case U'Ƭ': case U'Ʈ': case U'Ț': case U'Ⱦ': - return 'T'; -case U'Ù': case U'Ú': case U'Û': case U'Ü': case U'Ũ': case U'Ū': case U'Ŭ': case U'Ů': case U'Ű': case U'Ų': case U'Ư': case U'Ǔ': case U'Ǖ': case U'Ǘ': case U'Ǚ': case U'Ǜ': case U'Ȕ': case U'Ȗ': case U'Ʉ': - return 'U'; -case U'Ʋ': - return 'V'; -case U'Ŵ': - return 'W'; -case U'Ý': case U'Ŷ': case U'Ÿ': case U'Ƴ': case U'Ȳ': case U'Ɏ': - return 'Y'; -case U'Ź': case U'Ż': case U'Ž': case U'Ƶ': case U'Ȥ': - return 'Z'; -case U'à': case U'á': case U'â': case U'ã': case U'ä': case U'å': case U'ā': case U'ă': case U'ą': case U'ǎ': case U'ǟ': case U'ǡ': case U'ǻ': case U'ȁ': case U'ȃ': case U'ȧ': - return 'a'; -case U'ƀ': case U'ƃ': - return 'b'; -case U'ç': case U'ć': case U'ĉ': case U'ċ': case U'č': case U'ƈ': case U'ȼ': - return 'c'; -case U'ď': case U'đ': case U'ƌ': case U'ȡ': - return 'd'; -case U'è': case U'é': case U'ê': case U'ë': case U'ē': case U'ĕ': case U'ė': case U'ę': case U'ě': case U'ȅ': case U'ȇ': case U'ȩ': case U'ɇ': - return 'e'; -case U'ƒ': - return 'f'; -case U'ĝ': case U'ğ': case U'ġ': case U'ģ': case U'ǥ': case U'ǧ': case U'ǵ': - return 'g'; -case U'ĥ': case U'ħ': case U'ȟ': - return 'h'; -case U'ì': case U'í': case U'î': case U'ï': case U'ĩ': case U'ī': case U'ĭ': case U'į': case U'ǐ': case U'ȉ': case U'ȋ': - return 'i'; -case U'ĵ': case U'ǰ': case U'ɉ': - return 'j'; -case U'ķ': case U'ƙ': case U'ǩ': - return 'k'; -case U'ĺ': case U'ļ': case U'ľ': case U'ŀ': case U'ł': case U'ƚ': case U'ȴ': - return 'l'; -case U'ñ': case U'ń': case U'ņ': case U'ň': case U'ʼn': case U'ƞ': case U'ǹ': case U'ȵ': - return 'n'; -case U'ò': case U'ó': case U'ô': case U'õ': case U'ö': case U'ø': case U'ō': case U'ŏ': case U'ő': case U'ơ': case U'ǒ': case U'ǫ': case U'ǭ': case U'ǿ': case U'ȍ': case U'ȏ': case U'ȫ': case U'ȭ': case U'ȯ': case U'ȱ': - return 'o'; -case U'ƥ': - return 'p'; -case U'ɋ': - return 'q'; -case U'ŕ': case U'ŗ': case U'ř': case U'ȑ': case U'ȓ': case U'ɍ': - return 'r'; -case U'ś': case U'ŝ': case U'ş': case U'š': case U'ș': case U'ȿ': - return 's'; -case U'ţ': case U'ť': case U'ŧ': case U'ƫ': case U'ƭ': case U'ț': case U'ȶ': - return 't'; -case U'ù': case U'ú': case U'û': case U'ü': case U'ũ': case U'ū': case U'ŭ': case U'ů': case U'ű': case U'ų': case U'ư': case U'ǔ': case U'ǖ': case U'ǘ': case U'ǚ': case U'ǜ': case U'ȕ': case U'ȗ': - return 'u'; -case U'ŵ': - return 'w'; -case U'ý': case U'ÿ': case U'ŷ': case U'ƴ': case U'ȳ': case U'ɏ': - return 'y'; -case U'ź': case U'ż': case U'ž': case U'ƶ': case U'ȥ': case U'ɀ': - return 'z'; diff --git a/src/tools/asciiCases.rb b/src/tools/asciiCases.rb deleted file mode 100644 index ef584e3..0000000 --- a/src/tools/asciiCases.rb +++ /dev/null @@ -1,33 +0,0 @@ -require 'open-uri' -require 'csv' - -# Create mapping from ASCII characters to related Unicode characters -mapping = Hash.new{ |hash, key| hash[key] = [] } -url = 'http://www.unicode.org/Public/8.0.0/ucd/UnicodeData.txt' -headers = [:code, :name, :category] -CSV.new(open(url), :col_sep => ';', :headers => headers).each do |row| - code = row[:code].hex - next if code < 0x80 - break if code > 0x24f - - char = [code].pack('U') - name = row[:name] - match = /^LATIN (CAPITAL|SMALL) LETTER ([A-Z])\b(?!.*\bLETTER\b)/.match(name) - if match - baseChar = match[2] - if match[1] == 'SMALL' - baseChar = (baseChar.ord + 0x20).chr - end - mapping[baseChar] << char - end -end -mapping = mapping.sort.to_h - -# Generate asciiCases.cpp -File.open('asciiCases.cpp', 'w') do |file| - file.print "// Generated by #{__FILE__}; don't modify by hand!\n\n" - mapping.each do |asciiChar, unicodeChars| - file.print unicodeChars.map { |c| "case U'#{c}':" }.join(' '), "\n" - file.print "\treturn '#{asciiChar}';\n" - end -end diff --git a/src/tools/platformTools.cpp b/src/tools/platformTools.cpp index 2fcc9fe..116ff97 100644 --- a/src/tools/platformTools.cpp +++ b/src/tools/platformTools.cpp @@ -6,9 +6,21 @@ #include #include "platformTools.h" #include +#include +#include +#include "tools.h" +#include +#include + +#ifdef _WIN32 + #include + #include + #include +#endif using boost::filesystem::path; using std::string; +using std::vector; path getBinPath() { static const path binPath = [] { @@ -69,3 +81,73 @@ std::string errorNumberToString(int errorNumber) { #endif return message; } + +vector argsToUtf8(int argc, char* argv[]) { +#ifdef _WIN32 + // On Windows, there is no way to convert the single-byte argument strings to Unicode. + // We'll just ignore them. + UNUSED(argc); + UNUSED(argv); + + // Get command-line arguments as UTF16 strings + int argumentCount; + static_assert(sizeof(wchar_t) == sizeof(char16_t), "Expected wchar_t to be a 16-bit type."); + char16_t** args = reinterpret_cast(CommandLineToArgvW(GetCommandLineW(), &argumentCount)); + if (!args) { + throw std::runtime_error("Error splitting the UTF-16 command line arguments."); + } + auto freeArgs = gsl::finally([&]() { LocalFree(args); }); + assert(argumentCount == argc); + + // Convert UTF16 strings to UTF8 + vector result; + for (int i = 0; i < argc; ++i) { + std::u16string utf16String(args[i]); + string utf8String; + utf8::utf16to8(utf16String.begin(), utf16String.end(), back_inserter(utf8String)); + result.push_back(utf8String); + } + return result; +#else + // On Unix systems, command-line args are already in UTF-8 format. Just convert them to strings. + vector result; + for (int i = 0; i < argc; ++i) { + result.push_back(string(argv[i])); + } + return result; +#endif +} + +class ConsoleBuffer : public std::stringbuf { +public: + explicit ConsoleBuffer(FILE* file) + : file(file) {} + + int sync() override { + fputs(str().c_str(), file); + str(""); + return 0; + } + +private: + FILE* file; +}; + +void useUtf8ForConsole() { +// Unix systems already expect UTF-8-encoded data +#ifdef _WIN32 + // Set console code page to UTF-8 so the console knows how to interpret string data + SetConsoleOutputCP(CP_UTF8); + + // Prevent default stream buffer from chopping up UTF-8 byte sequences. + // See https://stackoverflow.com/questions/45575863/how-to-print-utf-8-strings-to-stdcout-on-windows + std::cout.rdbuf(new ConsoleBuffer(stdout)); + std::cerr.rdbuf(new ConsoleBuffer(stderr)); +#endif +} + +void useUtf8ForBoostFilesystem() { + std::locale globalLocale = std::locale(); + std::locale utf8Locale(globalLocale, new boost::filesystem::detail::utf8_codecvt_facet); + path::imbue(utf8Locale); +} diff --git a/src/tools/platformTools.h b/src/tools/platformTools.h index d5b3529..2fec1db 100644 --- a/src/tools/platformTools.h +++ b/src/tools/platformTools.h @@ -10,3 +10,8 @@ boost::filesystem::path getTempFilePath(); std::tm getLocalTime(const time_t& time); std::string errorNumberToString(int errorNumber); + +std::vector argsToUtf8(int argc, char *argv[]); + +void useUtf8ForConsole(); +void useUtf8ForBoostFilesystem(); \ No newline at end of file diff --git a/src/tools/stringTools.cpp b/src/tools/stringTools.cpp index ab0b9e6..5d27bbe 100644 --- a/src/tools/stringTools.cpp +++ b/src/tools/stringTools.cpp @@ -1,12 +1,16 @@ -#include "stringTools.h" +#include "stringTools.h" #include -#include +#include +#include +#include using std::string; using std::wstring; using std::u32string; using std::vector; using boost::optional; +using std::regex; +using std::regex_replace; vector splitIntoLines(const string& s) { vector lines; @@ -83,6 +87,10 @@ vector wrapString(const string& s, int lineLength, int hangingIndent) { return lines; } +bool isValidUtf8(const string& s) { + return utf8::is_valid(s.begin(), s.end()); +} + wstring latin1ToWide(const string& s) { wstring result; for (unsigned char c : s) { @@ -91,40 +99,61 @@ wstring latin1ToWide(const string& s) { return result; } -optional toAscii(char32_t ch) { - switch (ch) { -#include "asciiCases.cpp" - default: - return ch < 0x80 ? static_cast(ch) : optional(); - } -} +string utf8ToAscii(const string s) { + // Normalize string, simplifying it as much as possible + const NormalizationOptions options = NormalizationOptions::CompatibilityMode + | NormalizationOptions::Decompose + | NormalizationOptions::SimplifyLineBreaks + | NormalizationOptions::SimplifyWhiteSpace + | NormalizationOptions::StripCharacterMarkings + | NormalizationOptions::StripIgnorableCharacters; + string simplified = normalizeUnicode(s, options); -string toAscii(const u32string& s) { - string result; - for (char32_t ch : s) { - optional ascii = toAscii(ch); - if (ascii) result.append(1, *ascii); + // Replace common Unicode characters with ASCII equivalents + static const vector> replacements{ + {regex("«|»|“|”|„|‟"), "\""}, + {regex("‘|’|‚|‛|‹|›"), "'"}, + {regex("‐|‑|‒|⁃|⁻|₋|−|➖|–|—|―|﹘|﹣|-"), "-"}, + {regex("…|⋯"), "..."}, + {regex("•"), "*"}, + {regex("†|+"), "+"}, + {regex("⁄|∕|⧸|/|/"), "/"}, + {regex("×"), "x"}, + }; + for (const auto& replacement : replacements) { + simplified = regex_replace(simplified, replacement.first, replacement.second); } + + // Skip all non-ASCII code points, including multi-byte characters + string result; + for (char c : simplified) { + const bool isAscii = (c & 0x80) == 0; + if (isAscii) { + result.append(1, c); + } + } + return result; } -string toAscii(const wstring& s) { - string result; - for (wchar_t ch : s) { - optional ascii = toAscii(ch); - if (ascii) result.append(1, *ascii); - } - return result; -} +string normalizeUnicode(const string s, NormalizationOptions options) { + char* result; + const utf8proc_ssize_t charCount = utf8proc_map( + reinterpret_cast(s.data()), + s.length(), + reinterpret_cast(&result), + static_cast(options)); -u32string utf8ToUtf32(const string& s) { -#if defined(_MSC_VER) && _MSC_VER <= 1900 - // Workaround for Visual Studio 2015 - // See https://connect.microsoft.com/VisualStudio/feedback/details/1403302/unresolved-external-when-using-codecvt-utf8 - std::wstring_convert, uint32_t> convert; - return u32string(reinterpret_cast(convert.from_bytes(s).c_str())); -#else - std::wstring_convert, char32_t> convert; - return convert.from_bytes(s); -#endif + if (charCount < 0) { + const utf8proc_ssize_t errorCode = charCount; + const string message = string("Error normalizing string: ") + utf8proc_errmsg(errorCode); + if (errorCode == UTF8PROC_ERROR_INVALIDOPTS) { + throw std::invalid_argument(message); + } + throw std::runtime_error(message); + } + + string resultString(result, charCount); + free(result); + return resultString; } diff --git a/src/tools/stringTools.h b/src/tools/stringTools.h index bab3854..2964de1 100644 --- a/src/tools/stringTools.h +++ b/src/tools/stringTools.h @@ -3,6 +3,7 @@ #include #include #include +#include std::vector splitIntoLines(const std::string& s); @@ -10,15 +11,31 @@ std::vector wrapSingleLineString(const std::string& s, int lineLeng std::vector wrapString(const std::string& s, int lineLength, int hangingIndent = 0); +bool isValidUtf8(const std::string& s); + std::wstring latin1ToWide(const std::string& s); boost::optional toAscii(char32_t ch); -std::string toAscii(const std::u32string& s); +std::string utf8ToAscii(const std::string s); -std::string toAscii(const std::wstring& s); +enum class NormalizationOptions : int { + CompatibilityMode = UTF8PROC_COMPAT, + Compose = UTF8PROC_COMPOSE, + Decompose = UTF8PROC_DECOMPOSE, + StripIgnorableCharacters = UTF8PROC_IGNORE, + ThrowOnUnassignedCodepoints = UTF8PROC_REJECTNA, + SimplifyLineBreaks = UTF8PROC_NLF2LS, + SimplifyWhiteSpace = UTF8PROC_STRIPCC, + StripCharacterMarkings = UTF8PROC_STRIPMARK +}; -std::u32string utf8ToUtf32(const std::string& s); +constexpr NormalizationOptions +operator|(NormalizationOptions a, NormalizationOptions b) { + return static_cast(static_cast(a) | static_cast(b)); +} + +std::string normalizeUnicode(const std::string s, NormalizationOptions options); template std::string join(T range, const std::string separator) { diff --git a/src/tools/textFiles.cpp b/src/tools/textFiles.cpp index fd0ac6f..d266f65 100644 --- a/src/tools/textFiles.cpp +++ b/src/tools/textFiles.cpp @@ -5,10 +5,9 @@ #include "stringTools.h" using std::string; -using std::u32string; using boost::filesystem::path; -u32string readUtf8File(path filePath) { +string readUtf8File(path filePath) { if (!exists(filePath)) { throw std::invalid_argument(fmt::format("File {} does not exist.", filePath)); } @@ -16,12 +15,12 @@ u32string readUtf8File(path filePath) { boost::filesystem::ifstream file; file.exceptions(std::ifstream::failbit | std::ifstream::badbit); file.open(filePath); - string utf8Text((std::istreambuf_iterator(file)), std::istreambuf_iterator()); - try { - return utf8ToUtf32(utf8Text); - } catch (...) { - std::throw_with_nested(std::runtime_error(fmt::format("File encoding is not ASCII or UTF-8.", filePath))); + string text((std::istreambuf_iterator(file)), std::istreambuf_iterator()); + if (!isValidUtf8(text)) { + throw std::runtime_error("File encoding is not ASCII or UTF-8."); } + + return text; } catch (...) { std::throw_with_nested(std::runtime_error(fmt::format("Error reading file {0}.", filePath))); } diff --git a/src/tools/textFiles.h b/src/tools/textFiles.h index 3c72a6c..adfe96b 100644 --- a/src/tools/textFiles.h +++ b/src/tools/textFiles.h @@ -2,4 +2,4 @@ #include -std::u32string readUtf8File(boost::filesystem::path filePath); \ No newline at end of file +std::string readUtf8File(boost::filesystem::path filePath); \ No newline at end of file diff --git a/tests/stringToolsTests.cpp b/tests/stringToolsTests.cpp index 3ecbd92..7f58971 100644 --- a/tests/stringToolsTests.cpp +++ b/tests/stringToolsTests.cpp @@ -81,11 +81,15 @@ TEST(latin1ToWide, basic) { EXPECT_EQ(pangramWide, latin1ToWide(pangramLatin1)); } -// toAscii +// utf8ToAscii -TEST(toAscii, string) { +TEST(utf8ToAscii, string) { EXPECT_EQ( "A naive man called was having pina colada and creme brulee.", - toAscii(U"A naïve man called 晨 was having piña colada and crème brûlée.")); - EXPECT_EQ(string(""), toAscii(U"")); + utf8ToAscii("A naïve man called 晨 was having piña colada and crème brûlée.")); + EXPECT_EQ(string(""), utf8ToAscii("")); + EXPECT_EQ(string("- - - - - - - - - -"), utf8ToAscii("- ‐ ‑ ‒ – — ― ﹘ ﹣ -")); + EXPECT_EQ(string("' ' ' ' \" \" \" \" \" \""), utf8ToAscii("‘ ’ ‚ ‛ “ ” „ ‟ « »")); + EXPECT_EQ(string("1 2 3"), utf8ToAscii("¹ ² ³")); + EXPECT_EQ(string("1/4 1/2 3/4"), utf8ToAscii("¼ ½ ¾")); } diff --git a/tests/tokenizationTests.cpp b/tests/tokenizationTests.cpp index dcd3941..577c6bb 100644 --- a/tests/tokenizationTests.cpp +++ b/tests/tokenizationTests.cpp @@ -2,6 +2,7 @@ #include "tokenization.h" #include #include +#include using namespace testing; using std::string; @@ -14,57 +15,64 @@ bool returnTrue(const string&) { } TEST(tokenizeText, simpleCases) { - EXPECT_THAT(tokenizeText(U"", returnTrue), IsEmpty()); - EXPECT_THAT(tokenizeText(U" \t\n\r\n ", returnTrue), IsEmpty()); + EXPECT_THAT(tokenizeText("", returnTrue), IsEmpty()); + EXPECT_THAT(tokenizeText(" \t\n\r\n ", returnTrue), IsEmpty()); EXPECT_THAT( - tokenizeText(U"Wit is educated insolence.", returnTrue), + tokenizeText("Wit is educated insolence.", returnTrue), ElementsAre("wit", "is", "educated", "insolence") ); } TEST(tokenizeText, numbers) { EXPECT_THAT( - tokenizeText(U"Henry V died at 36.", returnTrue), + tokenizeText("Henry V died at 36.", returnTrue), ElementsAre("henry", "the", "fifth", "died", "at", "thirty", "six") ); EXPECT_THAT( - tokenizeText(U"I spent $4.50 on gum.", returnTrue), + tokenizeText("I spent $4.50 on gum.", returnTrue), ElementsAre("i", "spent", "four", "dollars", "fifty", "cents", "on", "gum") ); EXPECT_THAT( - tokenizeText(U"I was born in 1982.", returnTrue), + tokenizeText("I was born in 1982.", returnTrue), ElementsAre("i", "was", "born", "in", "nineteen", "eighty", "two") ); } TEST(tokenizeText, abbreviations) { EXPECT_THAT( - tokenizeText(U"Prof. Foo lives on Dr. Dolittle Dr.", [](const string& word) { return word == "prof."; }), + tokenizeText("Prof. Foo lives on Dr. Dolittle Dr.", [](const string& word) { return word == "prof."; }), ElementsAre("prof.", "foo", "lives", "on", "doctor", "dolittle", "drive") ); } TEST(tokenizeText, apostrophes) { EXPECT_THAT( - tokenizeText(U"'Tis said he'd wish'd for a 'bus 'cause he wouldn't walk.", [](const string& word) { return word == "wouldn't"; }), + tokenizeText("'Tis said he'd wish'd for a 'bus 'cause he wouldn't walk.", [](const string& word) { return word == "wouldn't"; }), ElementsAreArray(vector{ "tis", "said", "he'd", "wish'd", "for", "a", "bus", "cause", "he", "wouldn't", "walk" }) ); } TEST(tokenizeText, math) { EXPECT_THAT( - tokenizeText(U"'1+2*3=7", returnTrue), + tokenizeText("'1+2*3=7", returnTrue), ElementsAre("one", "plus", "two", "times", "three", "equals", "seven") ); } +TEST(tokenizeText, unicodeCharacters) { + EXPECT_THAT( + tokenizeText("A naïve man called 晨 had piña colada and crème brûlée.", returnTrue), + ElementsAre("a", "naive", "man", "called", "had", "pina", "colada", "and", "creme", "brulee") + ); +} + // Checks that each word contains only the characters a-z and the apostrophe TEST(tokenizeText, wordsUseLimitedCharacters) { // Create string containing lots of undesirable characters - u32string input = U"A naïve man called 晨 was having piña colada and crème brûlée."; + string input = "A naïve man called 晨 was having piña colada and crème brûlée."; for (char32_t c = 0; c <= 1000; ++c) { - input.append(U" "); - input.append(1, c); + input.append(" "); + utf8::append(c, back_inserter(input)); } regex legal("^[a-z']+$");