Implemented simple conversion from Unicode string to ASCII
This commit is contained in:
parent
f1563919e1
commit
d4b9a8e0c6
|
@ -39,6 +39,9 @@ elseif("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC")
|
||||||
# Disable warning C4458: declaration of '...' hides class member
|
# Disable warning C4458: declaration of '...' hides class member
|
||||||
# I'm doing that on purpose.
|
# I'm doing that on purpose.
|
||||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4458")
|
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4458")
|
||||||
|
|
||||||
|
# Assume UTF-8 encoding for source files and encode string constants in UTF-8
|
||||||
|
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /utf-8")
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
# Enable project folders
|
# Enable project folders
|
||||||
|
@ -122,6 +125,7 @@ set(SOURCE_FILES
|
||||||
src/ContinuousTimeline.h
|
src/ContinuousTimeline.h
|
||||||
src/pairs.h
|
src/pairs.h
|
||||||
src/Exporter.cpp src/Exporter.h
|
src/Exporter.cpp src/Exporter.h
|
||||||
|
src/ascii.cpp src/ascii.h
|
||||||
)
|
)
|
||||||
add_executable(rhubarb ${SOURCE_FILES})
|
add_executable(rhubarb ${SOURCE_FILES})
|
||||||
target_link_libraries(rhubarb ${Boost_LIBRARIES} cppFormat sphinxbase pocketSphinx)
|
target_link_libraries(rhubarb ${Boost_LIBRARIES} cppFormat sphinxbase pocketSphinx)
|
||||||
|
@ -135,11 +139,13 @@ set(TEST_FILES
|
||||||
tests/BoundedTimelineTests.cpp
|
tests/BoundedTimelineTests.cpp
|
||||||
tests/ContinuousTimelineTests.cpp
|
tests/ContinuousTimelineTests.cpp
|
||||||
tests/pairsTests.cpp
|
tests/pairsTests.cpp
|
||||||
|
tests/asciiTests.cpp
|
||||||
src/stringTools.cpp src/stringTools.h
|
src/stringTools.cpp src/stringTools.h
|
||||||
src/Timeline.h
|
src/Timeline.h
|
||||||
src/TimeRange.cpp src/TimeRange.h
|
src/TimeRange.cpp src/TimeRange.h
|
||||||
src/centiseconds.cpp src/centiseconds.h
|
src/centiseconds.cpp src/centiseconds.h
|
||||||
src/pairs.h
|
src/pairs.h
|
||||||
|
src/ascii.cpp src/ascii.h
|
||||||
)
|
)
|
||||||
add_executable(runTests ${TEST_FILES})
|
add_executable(runTests ${TEST_FILES})
|
||||||
target_link_libraries(runTests gtest gmock gmock_main)
|
target_link_libraries(runTests gtest gmock gmock_main)
|
||||||
|
|
|
@ -0,0 +1,22 @@
|
||||||
|
#include "ascii.h"
|
||||||
|
|
||||||
|
using std::string;
|
||||||
|
using std::u32string;
|
||||||
|
using boost::optional;
|
||||||
|
|
||||||
|
// Maps one Unicode code point to a plain ASCII character.
// The case labels pulled in from asciiCases.cpp return the ASCII base letter
// for the code points they list. Any other code point below 0x80 is returned
// unchanged; everything else yields an empty optional.
optional<char> toASCII(char32_t ch) {
	switch (ch) {
#include "asciiCases.cpp"
	default:
		if (ch < 0x80) {
			return static_cast<char>(ch);
		}
		return optional<char>();
	}
}
|
||||||
|
|
||||||
|
// Converts a whole Unicode string to ASCII, one code point at a time.
// Code points for which the single-character overload returns an empty
// optional are left out of the result.
string toASCII(const u32string& s) {
	string result;
	for (const char32_t codePoint : s) {
		const optional<char> converted = toASCII(codePoint);
		if (!converted) {
			continue;
		}
		result.push_back(*converted);
	}
	return result;
}
|
|
@ -0,0 +1,7 @@
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include <string>
|
||||||
|
#include <boost/optional.hpp>
|
||||||
|
|
||||||
|
// Returns the ASCII counterpart of a single Unicode code point, or an empty
// optional if the code point has none.
boost::optional<char> toASCII(char32_t ch);
|
||||||
|
// Converts a Unicode string to ASCII. Characters for which the
// single-character overload yields no result are omitted from the output.
std::string toASCII(const std::u32string& s);
|
|
@ -0,0 +1,94 @@
|
||||||
|
// Generated by asciiCases.rb; don't modify by hand!
|
||||||
|
|
||||||
|
case U'À': case U'Á': case U'Â': case U'Ã': case U'Ä': case U'Å': case U'Ā': case U'Ă': case U'Ą': case U'Ǎ': case U'Ǟ': case U'Ǡ': case U'Ǻ': case U'Ȁ': case U'Ȃ': case U'Ȧ': case U'Ⱥ':
|
||||||
|
return 'A';
|
||||||
|
case U'Ɓ': case U'Ƃ': case U'Ƀ':
|
||||||
|
return 'B';
|
||||||
|
case U'Ç': case U'Ć': case U'Ĉ': case U'Ċ': case U'Č': case U'Ƈ': case U'Ȼ':
|
||||||
|
return 'C';
|
||||||
|
case U'Ď': case U'Đ': case U'Ɗ': case U'Ƌ':
|
||||||
|
return 'D';
|
||||||
|
case U'È': case U'É': case U'Ê': case U'Ë': case U'Ē': case U'Ĕ': case U'Ė': case U'Ę': case U'Ě': case U'Ȅ': case U'Ȇ': case U'Ȩ': case U'Ɇ':
|
||||||
|
return 'E';
|
||||||
|
case U'Ƒ':
|
||||||
|
return 'F';
|
||||||
|
case U'Ĝ': case U'Ğ': case U'Ġ': case U'Ģ': case U'Ɠ': case U'Ǥ': case U'Ǧ': case U'Ǵ':
|
||||||
|
return 'G';
|
||||||
|
case U'Ĥ': case U'Ħ': case U'Ȟ':
|
||||||
|
return 'H';
|
||||||
|
case U'Ì': case U'Í': case U'Î': case U'Ï': case U'Ĩ': case U'Ī': case U'Ĭ': case U'Į': case U'İ': case U'Ɨ': case U'Ǐ': case U'Ȉ': case U'Ȋ':
|
||||||
|
return 'I';
|
||||||
|
case U'Ĵ': case U'Ɉ':
|
||||||
|
return 'J';
|
||||||
|
case U'Ķ': case U'Ƙ': case U'Ǩ':
|
||||||
|
return 'K';
|
||||||
|
case U'Ĺ': case U'Ļ': case U'Ľ': case U'Ŀ': case U'Ł': case U'Ƚ':
|
||||||
|
return 'L';
|
||||||
|
case U'Ñ': case U'Ń': case U'Ņ': case U'Ň': case U'Ɲ': case U'Ǹ': case U'Ƞ':
|
||||||
|
return 'N';
|
||||||
|
case U'Ò': case U'Ó': case U'Ô': case U'Õ': case U'Ö': case U'Ø': case U'Ō': case U'Ŏ': case U'Ő': case U'Ɵ': case U'Ơ': case U'Ǒ': case U'Ǫ': case U'Ǭ': case U'Ǿ': case U'Ȍ': case U'Ȏ': case U'Ȫ': case U'Ȭ': case U'Ȯ': case U'Ȱ':
|
||||||
|
return 'O';
|
||||||
|
case U'Ƥ':
|
||||||
|
return 'P';
|
||||||
|
case U'Ŕ': case U'Ŗ': case U'Ř': case U'Ȑ': case U'Ȓ': case U'Ɍ':
|
||||||
|
return 'R';
|
||||||
|
case U'Ś': case U'Ŝ': case U'Ş': case U'Š': case U'Ș':
|
||||||
|
return 'S';
|
||||||
|
case U'Ţ': case U'Ť': case U'Ŧ': case U'Ƭ': case U'Ʈ': case U'Ț': case U'Ⱦ':
|
||||||
|
return 'T';
|
||||||
|
case U'Ù': case U'Ú': case U'Û': case U'Ü': case U'Ũ': case U'Ū': case U'Ŭ': case U'Ů': case U'Ű': case U'Ų': case U'Ư': case U'Ǔ': case U'Ǖ': case U'Ǘ': case U'Ǚ': case U'Ǜ': case U'Ȕ': case U'Ȗ': case U'Ʉ':
|
||||||
|
return 'U';
|
||||||
|
case U'Ʋ':
|
||||||
|
return 'V';
|
||||||
|
case U'Ŵ':
|
||||||
|
return 'W';
|
||||||
|
case U'Ý': case U'Ŷ': case U'Ÿ': case U'Ƴ': case U'Ȳ': case U'Ɏ':
|
||||||
|
return 'Y';
|
||||||
|
case U'Ź': case U'Ż': case U'Ž': case U'Ƶ': case U'Ȥ':
|
||||||
|
return 'Z';
|
||||||
|
case U'à': case U'á': case U'â': case U'ã': case U'ä': case U'å': case U'ā': case U'ă': case U'ą': case U'ǎ': case U'ǟ': case U'ǡ': case U'ǻ': case U'ȁ': case U'ȃ': case U'ȧ':
|
||||||
|
return 'a';
|
||||||
|
case U'ƀ': case U'ƃ':
|
||||||
|
return 'b';
|
||||||
|
case U'ç': case U'ć': case U'ĉ': case U'ċ': case U'č': case U'ƈ': case U'ȼ':
|
||||||
|
return 'c';
|
||||||
|
case U'ď': case U'đ': case U'ƌ': case U'ȡ':
|
||||||
|
return 'd';
|
||||||
|
case U'è': case U'é': case U'ê': case U'ë': case U'ē': case U'ĕ': case U'ė': case U'ę': case U'ě': case U'ȅ': case U'ȇ': case U'ȩ': case U'ɇ':
|
||||||
|
return 'e';
|
||||||
|
case U'ƒ':
|
||||||
|
return 'f';
|
||||||
|
case U'ĝ': case U'ğ': case U'ġ': case U'ģ': case U'ǥ': case U'ǧ': case U'ǵ':
|
||||||
|
return 'g';
|
||||||
|
case U'ĥ': case U'ħ': case U'ȟ':
|
||||||
|
return 'h';
|
||||||
|
case U'ì': case U'í': case U'î': case U'ï': case U'ĩ': case U'ī': case U'ĭ': case U'į': case U'ǐ': case U'ȉ': case U'ȋ':
|
||||||
|
return 'i';
|
||||||
|
case U'ĵ': case U'ǰ': case U'ɉ':
|
||||||
|
return 'j';
|
||||||
|
case U'ķ': case U'ƙ': case U'ǩ':
|
||||||
|
return 'k';
|
||||||
|
case U'ĺ': case U'ļ': case U'ľ': case U'ŀ': case U'ł': case U'ƚ': case U'ȴ':
|
||||||
|
return 'l';
|
||||||
|
// U+0149 is written as an escape: spelled as two code points (U+02BC
// followed by 'n') it is no longer a valid single-character char32_t literal.
case U'ñ': case U'ń': case U'ņ': case U'ň': case U'\u0149': case U'ƞ': case U'ǹ': case U'ȵ':
	return 'n';
|
||||||
|
case U'ò': case U'ó': case U'ô': case U'õ': case U'ö': case U'ø': case U'ō': case U'ŏ': case U'ő': case U'ơ': case U'ǒ': case U'ǫ': case U'ǭ': case U'ǿ': case U'ȍ': case U'ȏ': case U'ȫ': case U'ȭ': case U'ȯ': case U'ȱ':
|
||||||
|
return 'o';
|
||||||
|
case U'ƥ':
|
||||||
|
return 'p';
|
||||||
|
case U'ɋ':
|
||||||
|
return 'q';
|
||||||
|
case U'ŕ': case U'ŗ': case U'ř': case U'ȑ': case U'ȓ': case U'ɍ':
|
||||||
|
return 'r';
|
||||||
|
case U'ś': case U'ŝ': case U'ş': case U'š': case U'ș': case U'ȿ':
|
||||||
|
return 's';
|
||||||
|
case U'ţ': case U'ť': case U'ŧ': case U'ƫ': case U'ƭ': case U'ț': case U'ȶ':
|
||||||
|
return 't';
|
||||||
|
case U'ù': case U'ú': case U'û': case U'ü': case U'ũ': case U'ū': case U'ŭ': case U'ů': case U'ű': case U'ų': case U'ư': case U'ǔ': case U'ǖ': case U'ǘ': case U'ǚ': case U'ǜ': case U'ȕ': case U'ȗ':
|
||||||
|
return 'u';
|
||||||
|
case U'ŵ':
|
||||||
|
return 'w';
|
||||||
|
case U'ý': case U'ÿ': case U'ŷ': case U'ƴ': case U'ȳ': case U'ɏ':
|
||||||
|
return 'y';
|
||||||
|
case U'ź': case U'ż': case U'ž': case U'ƶ': case U'ȥ': case U'ɀ':
|
||||||
|
return 'z';
|
|
@ -0,0 +1,33 @@
|
||||||
|
require 'open-uri'
|
||||||
|
require 'csv'
|
||||||
|
|
||||||
|
# Create mapping from ASCII characters to related Unicode characters.
# Keys are ASCII letters; each value is the list of non-ASCII characters whose
# Unicode name starts with "LATIN CAPITAL/SMALL LETTER <that letter>".
mapping = Hash.new { |hash, key| hash[key] = [] }
url = 'http://www.unicode.org/Public/8.0.0/ucd/UnicodeData.txt'
headers = [:code, :name, :category]

# Open the URL through its URI object rather than Kernel#open: Ruby 3.0
# removed open-uri's Kernel#open patch, so a bare open(url) would look for a
# local file of that name. The block form also closes the download handle
# once parsing is finished.
URI.parse(url).open do |data|
  CSV.new(data, :col_sep => ';', :headers => headers).each do |row|
    code = row[:code].hex
    # Code points below 0x80 already are ASCII and need no mapping.
    next if code < 0x80
    # Stop at the first code point above 0x24f. This assumes the data file
    # lists code points in ascending order.
    break if code > 0x24f

    char = [code].pack('U')
    name = row[:name]
    # Capture the base letter, rejecting any name that mentions a second
    # LETTER further on.
    match = /^LATIN (CAPITAL|SMALL) LETTER ([A-Z])\b(?!.*\bLETTER\b)/.match(name)
    next unless match

    baseChar = match[2]
    baseChar = baseChar.downcase if match[1] == 'SMALL'
    mapping[baseChar] << char
  end
end
# Order the groups by their ASCII character so the generated file is stable.
mapping = mapping.sort.to_h

# Generate asciiCases.cpp: one line of case labels per ASCII character,
# followed by the matching return statement.
File.open('asciiCases.cpp', 'w') do |file|
  file.print "// Generated by #{__FILE__}; don't modify by hand!\n\n"
  mapping.each do |asciiChar, unicodeChars|
    file.print unicodeChars.map { |c| "case U'#{c}':" }.join(' '), "\n"
    file.print "\treturn '#{asciiChar}';\n"
  end
end
|
|
@ -0,0 +1,13 @@
|
||||||
|
#include <gmock/gmock.h>
|
||||||
|
#include "ascii.h"
|
||||||
|
|
||||||
|
using namespace testing;
|
||||||
|
using std::string;
|
||||||
|
|
||||||
|
// Letters with an ASCII base letter are replaced by it. A character without
// one (here: 晨) is dropped, so the spaces on either side of it both remain
// and the expected text contains two consecutive spaces at that position.
TEST(toASCII, string) {
	EXPECT_EQ(
		"A naive man called  was having pina colada and creme brulee.",
		toASCII(U"A naïve man called 晨 was having piña colada and crème brûlée."));
	// An empty input produces an empty output.
	EXPECT_EQ(string(""), toASCII(U""));
}
|
||||||
|
|
Loading…
Reference in New Issue