rhubarb-lip-sync/src/tools/stringTools.cpp

160 lines
4.6 KiB
C++
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#include "stringTools.h"
#include <cstdlib>
#include <iterator>
#include <memory>
#include <regex>
#include <stdexcept>
#include <boost/algorithm/string/trim.hpp>
#include <utf8.h>
#include <utf8proc.h>
using std::string;
using std::wstring;
using std::u32string;
using std::vector;
using boost::optional;
using std::regex;
using std::regex_replace;
// Splits s into lines at every '\n' character.
//
// Any '\r' characters at the beginning or end of a line are removed, so "\r\n" line
// endings are handled as well. The result always contains at least one element: an
// empty input yields a single empty line, and a trailing '\n' yields a final empty line.
vector<string> splitIntoLines(const string& s) {
    vector<string> result;
    string::size_type lineStart = 0;
    for (;;) {
        // Locate the end of the current line; npos means this is the last line
        const string::size_type newlinePos = s.find('\n', lineStart);
        const bool isLastLine = newlinePos == string::npos;
        const string::size_type lineSize = isLastLine ? string::npos : newlinePos - lineStart;

        // Store the line, then strip \r characters from both of its ends in place
        result.push_back(s.substr(lineStart, lineSize));
        boost::algorithm::trim_if(result.back(), [](char ch) { return ch == '\r'; });

        if (isLastLine) {
            break;
        }
        // Continue right after the \n we just consumed
        lineStart = newlinePos + 1;
    }
    return result;
}
// Word-wraps a single line of text.
//
// s             The text to wrap. Must not contain tabs or line breaks.
// lineLength    Maximum length of each resulting line, including indentation. Must be > 0.
// hangingIndent Number of spaces prepended to every line except the first.
//               Must be >= 0 and < lineLength.
//
// Lines are broken at spaces and at '|' characters. A word that does not fit into a line
// on its own is split mid-word. Trailing whitespace is removed from every line, and the
// spaces at a break position are not carried over to the next line.
// Throws std::invalid_argument if any of the above requirements is violated.
vector<string> wrapSingleLineString(const string& s, int lineLength, int hangingIndent) {
    if (lineLength <= 0) {
        throw std::invalid_argument("lineLength must be > 0.");
    }
    if (hangingIndent < 0) {
        throw std::invalid_argument("hangingIndent must be >= 0.");
    }
    if (hangingIndent >= lineLength) {
        throw std::invalid_argument("hangingIndent must be < lineLength.");
    }
    if (s.find('\t') != string::npos) {
        throw std::invalid_argument("s must not contain tabs.");
    }
    if (s.find('\n') != string::npos) {
        throw std::invalid_argument("s must not contain line breaks.");
    }

    const string::size_type length = s.size();
    vector<string> wrapped;
    string::size_type lineStart = 0; // First character of the line being assembled
    string::size_type breakPos = 0;  // Most recent position at which the line may be broken

    // Visit every character position plus the position just past the last character
    for (string::size_type pos = 0; pos <= length; ++pos) {
        const bool atEnd = pos == length;

        // Remember word boundaries as candidate break positions
        if (atEnd || s[pos] == ' ' || s[pos] == '|') {
            breakPos = pos;
        }

        // Only the first line is not indented
        const int indent = wrapped.empty() ? 0 : hangingIndent;
        const string::size_type capacity = static_cast<string::size_type>(lineLength - indent);

        // Nothing to emit until the line is full or the input is exhausted
        if (!atEnd && pos - lineStart != capacity) {
            continue;
        }

        if (breakPos == lineStart) {
            // No word boundary within this line: the word is too long, so split it here
            breakPos = pos;
        }

        // Emit the indented line without trailing whitespace
        string line = s.substr(lineStart, breakPos - lineStart);
        boost::algorithm::trim_right(line);
        wrapped.push_back(string(indent, ' ') + line);

        // Start the next line at the break position, skipping any spaces there
        pos = breakPos;
        while (pos != length && s[pos] == ' ') {
            ++pos;
        }
        lineStart = pos;
        breakPos = pos;
    }
    return wrapped;
}
// Word-wraps a text that may consist of multiple lines.
//
// s is split into lines using splitIntoLines(); each of these lines is then wrapped
// independently using wrapSingleLineString() with the given lineLength and hangingIndent.
// Returns all resulting lines in order. Exceptions thrown by wrapSingleLineString()
// propagate to the caller.
vector<string> wrapString(const string& s, int lineLength, int hangingIndent) {
    vector<string> lines;
    // Bind by const reference: the paragraphs are only read, so there is no need to copy them
    for (const string& paragraph : splitIntoLines(s)) {
        vector<string> paragraphLines = wrapSingleLineString(paragraph, lineLength, hangingIndent);
        // paragraphLines is discarded after this iteration, so move its strings instead of copying
        lines.insert(
            lines.end(),
            std::make_move_iterator(paragraphLines.begin()),
            std::make_move_iterator(paragraphLines.end()));
    }
    return lines;
}
// Indicates whether s is valid UTF-8.
// Returns the result of utf8::is_valid applied to the entire string, from its first to its
// last byte.
bool isValidUtf8(const string& s) {
return utf8::is_valid(s.begin(), s.end());
}
// Converts a string to a wide string by widening every byte individually.
//
// Each char of s is interpreted as an unsigned value (0..255) and becomes the wide
// character with that same numeric value, so the result has exactly as many elements
// as s has bytes.
wstring latin1ToWide(const string& s) {
    wstring wide;
    wide.reserve(s.size());
    for (const char narrow : s) {
        // Go through unsigned char so that bytes >= 0x80 are not sign-extended
        const unsigned char byteValue = static_cast<unsigned char>(narrow);
        wide.push_back(static_cast<wchar_t>(byteValue));
    }
    return wide;
}
// Converts UTF-8 text to plain ASCII.
//
// The input is first passed through normalizeUnicode() with the options listed below.
// Then a fixed table of typographic characters is replaced with ASCII look-alikes.
// Finally, every byte that has its high bit set is dropped, so the result contains
// ASCII characters only. Exceptions thrown by normalizeUnicode() propagate to the caller.
//
// The patterns are spelled as explicit UTF-8 byte escapes rather than as literal
// characters. This keeps them independent of the source file encoding and prevents tools
// that treat "confusable" characters specially from silently altering or stripping them.
//
// NOTE(review): the entries marked (*) were empty alternatives in the reviewed copy of
// this file. An empty alternative matches the empty string, which makes regex_replace
// insert the replacement at every position. They were restored based on their position in
// the list and on the replacement target -- please confirm against version control.
string utf8ToAscii(const string s) {
    // Normalize string, simplifying it as much as possible
    const NormalizationOptions options = NormalizationOptions::CompatibilityMode
        | NormalizationOptions::Decompose
        | NormalizationOptions::SimplifyLineBreaks
        | NormalizationOptions::SimplifyWhiteSpace
        | NormalizationOptions::StripCharacterMarkings
        | NormalizationOptions::StripIgnorableCharacters;
    string simplified = normalizeUnicode(s, options);

    // Replace common Unicode characters with ASCII equivalents.
    // The regexes are compiled once, on first use.
    static const vector<std::pair<regex, string>> replacements{
        {regex(
            "\xC2\xAB"          // U+00AB LEFT-POINTING DOUBLE ANGLE QUOTATION MARK
            "|\xC2\xBB"         // U+00BB RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK
            "|\xE2\x80\x9C"     // U+201C LEFT DOUBLE QUOTATION MARK
            "|\xE2\x80\x9D"     // U+201D RIGHT DOUBLE QUOTATION MARK
            "|\xE2\x80\x9E"     // U+201E DOUBLE LOW-9 QUOTATION MARK
            "|\xE2\x80\x9F"     // U+201F DOUBLE HIGH-REVERSED-9 QUOTATION MARK
        ), "\""},
        {regex(
            "\xE2\x80\x98"      // U+2018 LEFT SINGLE QUOTATION MARK (*)
            "|\xE2\x80\x99"     // U+2019 RIGHT SINGLE QUOTATION MARK (*)
            "|\xE2\x80\x9A"     // U+201A SINGLE LOW-9 QUOTATION MARK (*)
            "|\xE2\x80\x9B"     // U+201B SINGLE HIGH-REVERSED-9 QUOTATION MARK (*)
            "|\xE2\x80\xB9"     // U+2039 SINGLE LEFT-POINTING ANGLE QUOTATION MARK (*)
            "|\xE2\x80\xBA"     // U+203A SINGLE RIGHT-POINTING ANGLE QUOTATION MARK (*)
        ), "'"},
        {regex(
            "\xE2\x80\x90"      // U+2010 HYPHEN (*)
            "|\xE2\x80\x91"     // U+2011 NON-BREAKING HYPHEN (*)
            "|\xE2\x80\x92"     // U+2012 FIGURE DASH (*)
            "|\xE2\x81\x83"     // U+2043 HYPHEN BULLET (*)
            "|\xE2\x81\xBB"     // U+207B SUPERSCRIPT MINUS
            "|\xE2\x82\x8B"     // U+208B SUBSCRIPT MINUS
            "|\xE2\x88\x92"     // U+2212 MINUS SIGN (*)
            "|\xE2\x9E\x96"     // U+2796 HEAVY MINUS SIGN (*)
            "|\xE2\x80\x93"     // U+2013 EN DASH (*)
            "|\xE2\x80\x94"     // U+2014 EM DASH
            "|\xE2\x80\x95"     // U+2015 HORIZONTAL BAR
            "|\xEF\xB9\x98"     // U+FE58 SMALL EM DASH (*)
            "|\xEF\xB9\xA3"     // U+FE63 SMALL HYPHEN-MINUS
            "|\xEF\xBC\x8D"     // U+FF0D FULLWIDTH HYPHEN-MINUS (*)
        ), "-"},
        {regex(
            "\xE2\x80\xA6"      // U+2026 HORIZONTAL ELLIPSIS
            "|\xE2\x8B\xAF"     // U+22EF MIDLINE HORIZONTAL ELLIPSIS
        ), "..."},
        {regex(
            "\xE2\x80\xA2"      // U+2022 BULLET (*)
        ), "*"},
        {regex(
            "\xE2\x80\xA0"      // U+2020 DAGGER
            "|\xEF\xBC\x8B"     // U+FF0B FULLWIDTH PLUS SIGN (*)
        ), "+"},
        {regex(
            "\xE2\x81\x84"      // U+2044 FRACTION SLASH (*)
            "|\xE2\x88\x95"     // U+2215 DIVISION SLASH (*)
            "|\xE2\xA7\xB8"     // U+29F8 BIG SOLIDUS (*)
            "|\xEF\xBC\x8F"     // U+FF0F FULLWIDTH SOLIDUS (*)
            "|/"                // Plain ASCII slash, as in the original pattern
        ), "/"},
        {regex(
            "\xC3\x97"          // U+00D7 MULTIPLICATION SIGN
        ), "x"},
    };
    for (const auto& replacement : replacements) {
        simplified = regex_replace(simplified, replacement.first, replacement.second);
    }

    // Skip all non-ASCII code points, including multi-byte characters.
    // Every byte of a multi-byte UTF-8 sequence has its high bit set, so filtering
    // byte by byte removes such sequences completely.
    string result;
    result.reserve(simplified.size());
    for (const char c : simplified) {
        const bool isAscii = (c & 0x80) == 0;
        if (isAscii) {
            result.push_back(c);
        }
    }
    return result;
}
string normalizeUnicode(const string s, NormalizationOptions options) {
char* result;
const utf8proc_ssize_t charCount = utf8proc_map(
reinterpret_cast<const uint8_t*>(s.data()),
s.length(),
reinterpret_cast<uint8_t**>(&result),
static_cast<utf8proc_option_t>(options));
if (charCount < 0) {
const utf8proc_ssize_t errorCode = charCount;
const string message = string("Error normalizing string: ") + utf8proc_errmsg(errorCode);
if (errorCode == UTF8PROC_ERROR_INVALIDOPTS) {
throw std::invalid_argument(message);
}
throw std::runtime_error(message);
}
string resultString(result, charCount);
free(result);
return resultString;
}