#include "stringTools.h"

#include <cstddef>
#include <cstdlib>
#include <iterator>
#include <memory>
#include <regex>
#include <stdexcept>
#include <string>
#include <utility>
#include <vector>

// NOTE(review): the targets of the third-party includes were not readable in the reviewed
// source; the headers below are inferred from the names used in this file
// (boost::algorithm::trim_*, utf8::is_valid, utf8proc_map). Confirm against the build.
#include <boost/algorithm/string/trim.hpp>
#include <utf8.h>
#include <utf8proc.h>

using std::string;
using std::wstring;
using std::u32string;
using std::vector;
using boost::optional;
using std::regex;
using std::regex_replace;

/// Splits `s` at every '\n' character.
///
/// Each resulting line has '\r' characters trimmed from both ends. The result always contains
/// at least one element: an empty input yields a single empty line, and a trailing '\n' yields
/// a trailing empty line.
vector<string> splitIntoLines(const string& s) {
    vector<string> lines;

    string::size_type lineBegin = 0;
    while (true) {
        const string::size_type lineEnd = s.find('\n', lineBegin);
        const bool isLastLine = lineEnd == string::npos;

        string line = s.substr(lineBegin, isLastLine ? string::npos : lineEnd - lineBegin);
        // Trim \r characters
        boost::algorithm::trim_if(line, [](char c) { return c == '\r'; });
        lines.push_back(std::move(line));

        if (isLastLine) break;
        lineBegin = lineEnd + 1;
    }

    return lines;
}

/// Wraps a single line of text so that no resulting line exceeds `lineLength` characters.
///
/// Lines are broken at spaces or before '|' characters; a word that does not fit on a line is
/// split mid-word. Every line after the first is prefixed with `hangingIndent` spaces, and that
/// indent counts towards `lineLength`.
///
/// @throws std::invalid_argument if `lineLength <= 0`, `hangingIndent < 0`,
///         `hangingIndent >= lineLength`, or `s` contains a tab or a line break.
vector<string> wrapSingleLineString(const string& s, int lineLength, int hangingIndent) {
    if (lineLength <= 0) throw std::invalid_argument("lineLength must be > 0.");
    if (hangingIndent < 0) throw std::invalid_argument("hangingIndent must be >= 0.");
    if (hangingIndent >= lineLength) throw std::invalid_argument("hangingIndent must be < lineLength.");
    if (s.find('\t') != std::string::npos) throw std::invalid_argument("s must not contain tabs.");
    if (s.find('\n') != std::string::npos) throw std::invalid_argument("s must not contain line breaks.");

    vector<string> lines;

    // Indices are used instead of raw pointers: the loop deliberately steps one position past
    // the end of the string, which is fine for an index but undefined for a pointer.
    const std::size_t end = s.size();
    std::size_t p = 0;
    std::size_t lineBegin = 0;
    std::size_t lineEnd = 0;

    // Iterate over input string
    while (p <= end) {
        // If we're at a word boundary: update lineEnd
        if (p == end || s[p] == ' ' || s[p] == '|') {
            lineEnd = p;
        }

        // If we've hit lineLength or the end of the string: add a new result line
        const int currentIndent = lines.empty() ? 0 : hangingIndent;
        const std::size_t availableWidth = static_cast<std::size_t>(lineLength - currentIndent);
        if (p == end || p - lineBegin == availableWidth) {
            if (lineEnd == lineBegin) {
                // The line contains a single word, which is too long. Split mid-word.
                lineEnd = p;
            }

            // Add trimmed line to list
            string line = s.substr(lineBegin, lineEnd - lineBegin);
            boost::algorithm::trim_right(line);
            lines.push_back(string(currentIndent, ' ') + line);

            // Resume after the last line, skipping spaces
            p = lineEnd;
            while (p != end && s[p] == ' ') ++p;
            lineBegin = lineEnd = p;
        }

        ++p;
    }

    return lines;
}

/// Wraps a possibly multi-line text.
///
/// The text is split into paragraphs via splitIntoLines(); each paragraph is wrapped
/// independently via wrapSingleLineString(), and the results are concatenated in order.
///
/// @throws std::invalid_argument under the same conditions as wrapSingleLineString().
vector<string> wrapString(const string& s, int lineLength, int hangingIndent) {
    vector<string> lines;
    for (const string& paragraph : splitIntoLines(s)) {
        vector<string> paragraphLines = wrapSingleLineString(paragraph, lineLength, hangingIndent);
        lines.insert(
            lines.end(),
            std::make_move_iterator(paragraphLines.begin()),
            std::make_move_iterator(paragraphLines.end()));
    }

    return lines;
}

/// Returns whether `s` is a valid UTF-8 byte sequence, as judged by utf8::is_valid.
bool isValidUtf8(const string& s) {
    return utf8::is_valid(s.begin(), s.end());
}

/// Widens each byte of `s` to one wide character of the same numeric value.
wstring latin1ToWide(const string& s) {
    wstring result;
    result.reserve(s.size());
    // Iterate as unsigned char so that bytes >= 0x80 are not sign-extended when widened.
    for (unsigned char c : s) {
        result.append(1, c);
    }
    return result;
}

/// Converts a UTF-8 string to pure ASCII.
///
/// The string is first normalized via normalizeUnicode(); then a fixed set of typographic
/// characters is replaced by ASCII look-alikes; finally every remaining non-ASCII byte is
/// dropped.
///
/// @throws whatever normalizeUnicode() throws for input it cannot process.
string utf8ToAscii(const string s) {
    // Normalize string, simplifying it as much as possible
    const NormalizationOptions options = NormalizationOptions::CompatibilityMode
        | NormalizationOptions::Decompose
        | NormalizationOptions::SimplifyLineBreaks
        | NormalizationOptions::SimplifyWhiteSpace
        | NormalizationOptions::StripCharacterMarkings
        | NormalizationOptions::StripIgnorableCharacters;
    string simplified = normalizeUnicode(s, options);

    // Replace common Unicode characters with ASCII equivalents
    static const vector<std::pair<regex, string>> replacements{
        {regex("«|»|“|”|„|‟"), "\""},
        {regex("‘|’|‚|‛|‹|›"), "'"},
        {regex("‐|‑|‒|⁃|⁻|₋|−|➖|–|—|―|﹘|﹣|-"), "-"},
        {regex("…|⋯"), "..."},
        {regex("•"), "*"},
        // A bare '+' is a quantifier in the default (ECMAScript) grammar and makes the
        // pattern invalid, so it must be escaped to be matched literally.
        {regex("†|\\+"), "+"},
        {regex("⁄|∕|⧸|/|/"), "/"},
        {regex("×"), "x"},
    };
    for (const auto& replacement : replacements) {
        simplified = regex_replace(simplified, replacement.first, replacement.second);
    }

    // Skip all non-ASCII code points, including multi-byte characters
    string result;
    result.reserve(simplified.size());
    for (char c : simplified) {
        const bool isAscii = (c & 0x80) == 0;
        if (isAscii) {
            result.append(1, c);
        }
    }

    return result;
}

/// Normalizes the UTF-8 string `s` by passing it to utf8proc_map with the given options.
///
/// @throws std::invalid_argument if utf8proc reports UTF8PROC_ERROR_INVALIDOPTS.
/// @throws std::runtime_error for any other utf8proc error.
string normalizeUnicode(const string s, NormalizationOptions options) {
    char* result = nullptr;
    const utf8proc_ssize_t charCount = utf8proc_map(
        reinterpret_cast<const utf8proc_uint8_t*>(s.data()),
        s.length(),
        reinterpret_cast<utf8proc_uint8_t**>(&result),
        static_cast<utf8proc_option_t>(options));

    // Take ownership immediately so the buffer is released with free() on every exit path,
    // including an exception while building the result string.
    const std::unique_ptr<char, decltype(&std::free)> buffer(result, &std::free);

    if (charCount < 0) {
        const utf8proc_ssize_t errorCode = charCount;
        const string message = string("Error normalizing string: ") + utf8proc_errmsg(errorCode);
        if (errorCode == UTF8PROC_ERROR_INVALIDOPTS) {
            throw std::invalid_argument(message);
        }
        throw std::runtime_error(message);
    }

    return string(buffer.get(), static_cast<std::size_t>(charCount));
}