Merge pull request #49 from DanielSWolf/feature/#38-upgrade-utf8proc
Upgrade to utf8proc 2.2.0
This commit is contained in:
commit
238687e33b
|
@ -225,11 +225,12 @@ set_target_properties(utfcpp PROPERTIES FOLDER lib)
|
||||||
|
|
||||||
# ... utf8proc
|
# ... utf8proc
|
||||||
add_library(utf8proc
|
add_library(utf8proc
|
||||||
lib/utf8proc-2a2f97e1/utf8proc.c
|
lib/utf8proc-2.2.0/utf8proc.c
|
||||||
lib/utf8proc-2a2f97e1/utf8proc.h
|
lib/utf8proc-2.2.0/utf8proc.h
|
||||||
)
|
)
|
||||||
target_include_directories(utf8proc SYSTEM PUBLIC "lib/utf8proc-2a2f97e1")
|
target_include_directories(utf8proc SYSTEM PUBLIC "lib/utf8proc-2.2.0")
|
||||||
target_compile_options(utf8proc PRIVATE ${disableWarningsFlags})
|
target_compile_options(utf8proc PRIVATE ${disableWarningsFlags})
|
||||||
|
target_compile_definitions(utf8proc PUBLIC UTF8PROC_STATIC=1) # Compile as static lib
|
||||||
set_target_properties(utf8proc PROPERTIES FOLDER lib)
|
set_target_properties(utf8proc PROPERTIES FOLDER lib)
|
||||||
|
|
||||||
# ... Ogg
|
# ... Ogg
|
||||||
|
|
|
@ -21,6 +21,7 @@ test/normtest
|
||||||
test/graphemetest
|
test/graphemetest
|
||||||
test/printproperty
|
test/printproperty
|
||||||
test/charwidth
|
test/charwidth
|
||||||
|
test/misc
|
||||||
test/valid
|
test/valid
|
||||||
test/iterate
|
test/iterate
|
||||||
test/case
|
test/case
|
|
@ -0,0 +1,51 @@
|
||||||
|
cmake_minimum_required (VERSION 2.8.12)
|
||||||
|
|
||||||
|
include (utils.cmake)
|
||||||
|
|
||||||
|
disallow_intree_builds()
|
||||||
|
|
||||||
|
project (utf8proc C)
|
||||||
|
|
||||||
|
# This is the ABI version number, which may differ from the
|
||||||
|
# API version number (defined in utf8proc.h).
|
||||||
|
# Be sure to also update these in Makefile and MANIFEST!
|
||||||
|
set(SO_MAJOR 2)
|
||||||
|
set(SO_MINOR 2)
|
||||||
|
set(SO_PATCH 0)
|
||||||
|
|
||||||
|
if (NOT MSVC)
|
||||||
|
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -O2 -std=c99 -pedantic -Wall")
|
||||||
|
endif ()
|
||||||
|
|
||||||
|
add_library (utf8proc
|
||||||
|
utf8proc.c
|
||||||
|
utf8proc.h
|
||||||
|
)
|
||||||
|
|
||||||
|
if (BUILD_SHARED_LIBS)
|
||||||
|
# Building shared library
|
||||||
|
else()
|
||||||
|
# Building static library
|
||||||
|
target_compile_definitions(utf8proc PUBLIC "UTF8PROC_STATIC")
|
||||||
|
if (MSVC)
|
||||||
|
set_target_properties(utf8proc PROPERTIES OUTPUT_NAME "utf8proc_static")
|
||||||
|
endif()
|
||||||
|
endif()
|
||||||
|
|
||||||
|
target_compile_definitions(utf8proc PRIVATE "UTF8PROC_EXPORTS")
|
||||||
|
|
||||||
|
set_target_properties (utf8proc PROPERTIES
|
||||||
|
POSITION_INDEPENDENT_CODE ON
|
||||||
|
VERSION "${SO_MAJOR}.${SO_MINOR}.${SO_PATCH}"
|
||||||
|
SOVERSION ${SO_MAJOR}
|
||||||
|
)
|
||||||
|
|
||||||
|
install(TARGETS utf8proc
|
||||||
|
RUNTIME DESTINATION bin
|
||||||
|
LIBRARY DESTINATION lib
|
||||||
|
ARCHIVE DESTINATION lib)
|
||||||
|
|
||||||
|
install(
|
||||||
|
FILES
|
||||||
|
"${PROJECT_SOURCE_DIR}/utf8proc.h"
|
||||||
|
DESTINATION include)
|
|
@ -7,7 +7,7 @@ whose copyright and license statements are reproduced below, all new
|
||||||
work on the utf8proc library is licensed under the [MIT "expat"
|
work on the utf8proc library is licensed under the [MIT "expat"
|
||||||
license](http://opensource.org/licenses/MIT):
|
license](http://opensource.org/licenses/MIT):
|
||||||
|
|
||||||
*Copyright © 2014-2015 by Steven G. Johnson, Jiahao Chen, Tony Kelman, Jonas Fonseca, and other contributors listed in the git history.*
|
*Copyright © 2014-2018 by Steven G. Johnson, Jiahao Chen, Tony Kelman, Jonas Fonseca, and other contributors listed in the git history.*
|
||||||
|
|
||||||
Permission is hereby granted, free of charge, to any person obtaining a
|
Permission is hereby granted, free of charge, to any person obtaining a
|
||||||
copy of this software and associated documentation files (the "Software"),
|
copy of this software and associated documentation files (the "Software"),
|
|
@ -0,0 +1,7 @@
|
||||||
|
include/
|
||||||
|
include/utf8proc.h
|
||||||
|
lib/
|
||||||
|
lib/libutf8proc.a
|
||||||
|
lib/libutf8proc.so -> libutf8proc.so.2.2.0
|
||||||
|
lib/libutf8proc.so.2 -> libutf8proc.so.2.2.0
|
||||||
|
lib/libutf8proc.so.2.2.0
|
|
@ -11,7 +11,7 @@ CFLAGS ?= -O2
|
||||||
PICFLAG = -fPIC
|
PICFLAG = -fPIC
|
||||||
C99FLAG = -std=c99
|
C99FLAG = -std=c99
|
||||||
WCFLAGS = -Wall -pedantic
|
WCFLAGS = -Wall -pedantic
|
||||||
UCFLAGS = $(CFLAGS) $(PICFLAG) $(C99FLAG) $(WCFLAGS) -DUTF8PROC_EXPORTS
|
UCFLAGS = $(CFLAGS) $(PICFLAG) $(C99FLAG) $(WCFLAGS) -DUTF8PROC_EXPORTS $(UTF8PROC_DEFINES)
|
||||||
|
|
||||||
# shared-library version MAJOR.MINOR.PATCH ... this may be *different*
|
# shared-library version MAJOR.MINOR.PATCH ... this may be *different*
|
||||||
# from the utf8proc version number because it indicates ABI compatibility,
|
# from the utf8proc version number because it indicates ABI compatibility,
|
||||||
|
@ -20,7 +20,7 @@ UCFLAGS = $(CFLAGS) $(PICFLAG) $(C99FLAG) $(WCFLAGS) -DUTF8PROC_EXPORTS
|
||||||
# The API version number is defined in utf8proc.h.
|
# The API version number is defined in utf8proc.h.
|
||||||
# Be sure to also update these ABI versions in MANIFEST and CMakeLists.txt!
|
# Be sure to also update these ABI versions in MANIFEST and CMakeLists.txt!
|
||||||
MAJOR=2
|
MAJOR=2
|
||||||
MINOR=1
|
MINOR=2
|
||||||
PATCH=0
|
PATCH=0
|
||||||
|
|
||||||
OS := $(shell uname)
|
OS := $(shell uname)
|
||||||
|
@ -48,7 +48,7 @@ clean:
|
||||||
ifneq ($(OS),Darwin)
|
ifneq ($(OS),Darwin)
|
||||||
rm -f libutf8proc.so.$(MAJOR)
|
rm -f libutf8proc.so.$(MAJOR)
|
||||||
endif
|
endif
|
||||||
rm -f test/tests.o test/normtest test/graphemetest test/printproperty test/charwidth test/valid test/iterate test/case test/custom
|
rm -f test/tests.o test/normtest test/graphemetest test/printproperty test/charwidth test/valid test/iterate test/case test/custom test/misc
|
||||||
rm -rf MANIFEST.new tmp
|
rm -rf MANIFEST.new tmp
|
||||||
$(MAKE) -C bench clean
|
$(MAKE) -C bench clean
|
||||||
$(MAKE) -C data clean
|
$(MAKE) -C data clean
|
||||||
|
@ -81,7 +81,7 @@ libutf8proc.so: libutf8proc.so.$(MAJOR).$(MINOR).$(PATCH)
|
||||||
ln -f -s libutf8proc.so.$(MAJOR).$(MINOR).$(PATCH) $@.$(MAJOR)
|
ln -f -s libutf8proc.so.$(MAJOR).$(MINOR).$(PATCH) $@.$(MAJOR)
|
||||||
|
|
||||||
libutf8proc.$(MAJOR).dylib: utf8proc.o
|
libutf8proc.$(MAJOR).dylib: utf8proc.o
|
||||||
$(CC) -dynamiclib -o $@ $^ -install_name $(libdir)/$@ -Wl,-compatibility_version -Wl,$(MAJOR) -Wl,-current_version -Wl,$(MAJOR).$(MINOR).$(PATCH)
|
$(CC) $(LDFLAGS) -dynamiclib -o $@ $^ -install_name $(libdir)/$@ -Wl,-compatibility_version -Wl,$(MAJOR) -Wl,-current_version -Wl,$(MAJOR).$(MINOR).$(PATCH)
|
||||||
|
|
||||||
libutf8proc.dylib: libutf8proc.$(MAJOR).dylib
|
libutf8proc.dylib: libutf8proc.$(MAJOR).dylib
|
||||||
ln -f -s libutf8proc.$(MAJOR).dylib $@
|
ln -f -s libutf8proc.$(MAJOR).dylib $@
|
||||||
|
@ -138,11 +138,15 @@ test/case: test/case.c test/tests.o utf8proc.o utf8proc.h test/tests.h
|
||||||
test/custom: test/custom.c test/tests.o utf8proc.o utf8proc.h test/tests.h
|
test/custom: test/custom.c test/tests.o utf8proc.o utf8proc.h test/tests.h
|
||||||
$(CC) $(UCFLAGS) test/custom.c test/tests.o utf8proc.o -o $@
|
$(CC) $(UCFLAGS) test/custom.c test/tests.o utf8proc.o -o $@
|
||||||
|
|
||||||
check: test/normtest data/NormalizationTest.txt test/graphemetest data/GraphemeBreakTest.txt test/printproperty test/case test/custom test/charwidth test/valid test/iterate bench/bench.c bench/util.c bench/util.h utf8proc.o
|
test/misc: test/misc.c test/tests.o utf8proc.o utf8proc.h test/tests.h
|
||||||
|
$(CC) $(UCFLAGS) test/misc.c test/tests.o utf8proc.o -o $@
|
||||||
|
|
||||||
|
check: test/normtest data/NormalizationTest.txt test/graphemetest data/GraphemeBreakTest.txt test/printproperty test/case test/custom test/charwidth test/misc test/valid test/iterate bench/bench.c bench/util.c bench/util.h utf8proc.o
|
||||||
$(MAKE) -C bench
|
$(MAKE) -C bench
|
||||||
test/normtest data/NormalizationTest.txt
|
test/normtest data/NormalizationTest.txt
|
||||||
test/graphemetest data/GraphemeBreakTest.txt
|
test/graphemetest data/GraphemeBreakTest.txt
|
||||||
test/charwidth
|
test/charwidth
|
||||||
|
test/misc
|
||||||
test/valid
|
test/valid
|
||||||
test/iterate
|
test/iterate
|
||||||
test/case
|
test/case
|
|
@ -1,5 +1,35 @@
|
||||||
# utf8proc release history #
|
# utf8proc release history #
|
||||||
|
|
||||||
|
## Version 2.2 ##
|
||||||
|
|
||||||
|
2018-07-24
|
||||||
|
|
||||||
|
- Unicode 11 support ([#132] and [#140]).
|
||||||
|
|
||||||
|
- `utf8proc_NFKC_Casefold` convenience function for `NFKC_Casefold`
|
||||||
|
normalization ([#133]).
|
||||||
|
|
||||||
|
- `UTF8PROC_STRIPNA` option to strip unassigned codepoints ([#133]).
|
||||||
|
|
||||||
|
- Support building static libraries on Windows (callers need to
|
||||||
|
`#define UTF8PROC_STATIC`) ([#123]).
|
||||||
|
|
||||||
|
- `cmake` fix to avoid defining `UTF8PROC_EXPORTS` globally ([#121]).
|
||||||
|
|
||||||
|
- `toupper` of ß (U+00df) now yields ẞ (U+1E9E) ([#134]), similar to musl;
|
||||||
|
case-folding still yields the standard "ss" mapping.
|
||||||
|
|
||||||
|
- `utf8proc_charwidth` now returns `1` for U+00AD (soft hyphen) and
|
||||||
|
for unassigned/PUA codepoints ([#135]).
|
||||||
|
|
||||||
|
## Version 2.1.1 ##
|
||||||
|
|
||||||
|
2018-04-27
|
||||||
|
|
||||||
|
- Fixed composition bug ([#128]).
|
||||||
|
|
||||||
|
- Minor build fixes ([#94], [#99], [#113], [#125]).
|
||||||
|
|
||||||
## Version 2.1 ##
|
## Version 2.1 ##
|
||||||
|
|
||||||
2016-12-26:
|
2016-12-26:
|
||||||
|
@ -297,7 +327,19 @@ Release of version 1.0.1
|
||||||
[#78]: https://github.com/JuliaLang/utf8proc/issues/78
|
[#78]: https://github.com/JuliaLang/utf8proc/issues/78
|
||||||
[#79]: https://github.com/JuliaLang/utf8proc/issues/79
|
[#79]: https://github.com/JuliaLang/utf8proc/issues/79
|
||||||
[#80]: https://github.com/JuliaLang/utf8proc/issues/80
|
[#80]: https://github.com/JuliaLang/utf8proc/issues/80
|
||||||
[#84]: https://github.com/JuliaLang/utf8proc/pull/84
|
[#84]: https://github.com/JuliaLang/utf8proc/issues/84
|
||||||
[#88]: https://github.com/JuliaLang/utf8proc/pull/88
|
[#88]: https://github.com/JuliaLang/utf8proc/issues/88
|
||||||
[#89]: https://github.com/JuliaLang/utf8proc/pull/89
|
[#89]: https://github.com/JuliaLang/utf8proc/issues/89
|
||||||
[#90]: https://github.com/JuliaLang/utf8proc/issues/90
|
[#90]: https://github.com/JuliaLang/utf8proc/issues/90
|
||||||
|
[#94]: https://github.com/JuliaLang/utf8proc/issues/94
|
||||||
|
[#99]: https://github.com/JuliaLang/utf8proc/issues/99
|
||||||
|
[#113]: https://github.com/JuliaLang/utf8proc/issues/113
|
||||||
|
[#121]: https://github.com/JuliaLang/utf8proc/issues/121
|
||||||
|
[#123]: https://github.com/JuliaLang/utf8proc/issues/123
|
||||||
|
[#125]: https://github.com/JuliaLang/utf8proc/issues/125
|
||||||
|
[#128]: https://github.com/JuliaLang/utf8proc/issues/128
|
||||||
|
[#132]: https://github.com/JuliaLang/utf8proc/issues/132
|
||||||
|
[#133]: https://github.com/JuliaLang/utf8proc/issues/133
|
||||||
|
[#134]: https://github.com/JuliaLang/utf8proc/issues/134
|
||||||
|
[#135]: https://github.com/JuliaLang/utf8proc/issues/135
|
||||||
|
[#140]: https://github.com/JuliaLang/utf8proc/issues/140
|
|
@ -1,7 +1,6 @@
|
||||||
# utf8proc
|
# utf8proc
|
||||||
[![Travis CI Status](https://travis-ci.org/JuliaLang/utf8proc.png)](https://travis-ci.org/JuliaLang/utf8proc)
|
[![Travis CI Status](https://travis-ci.org/JuliaStrings/utf8proc.png)](https://travis-ci.org/JuliaStrings/utf8proc)
|
||||||
[![AppVeyor Status](https://ci.appveyor.com/api/projects/status/aou20lfkyhj8xbwq/branch/master?svg=true)](https://ci.appveyor.com/project/tkelman/utf8proc/branch/master)
|
[![AppVeyor status](https://ci.appveyor.com/api/projects/status/ivaa0v6ikxrmm5r6?svg=true)](https://ci.appveyor.com/project/StevenGJohnson/utf8proc)
|
||||||
|
|
||||||
|
|
||||||
[utf8proc](http://julialang.org/utf8proc/) is a small, clean C
|
[utf8proc](http://julialang.org/utf8proc/) is a small, clean C
|
||||||
library that provides Unicode normalization, case-folding, and other
|
library that provides Unicode normalization, case-folding, and other
|
||||||
|
@ -40,7 +39,7 @@ The C library is found in this directory after successful compilation
|
||||||
and is named `libutf8proc.a` (for the static library) and
|
and is named `libutf8proc.a` (for the static library) and
|
||||||
`libutf8proc.so` (for the dynamic library).
|
`libutf8proc.so` (for the dynamic library).
|
||||||
|
|
||||||
The Unicode version supported is 9.0.0.
|
The Unicode version supported is 11.0.0.
|
||||||
|
|
||||||
For Unicode normalizations, the following options are used:
|
For Unicode normalizations, the following options are used:
|
||||||
|
|
|
@ -16,11 +16,11 @@ CURLFLAGS = --retry 5 --location
|
||||||
|
|
||||||
.DELETE_ON_ERROR:
|
.DELETE_ON_ERROR:
|
||||||
|
|
||||||
utf8proc_data.c.new: data_generator.rb UnicodeData.txt GraphemeBreakProperty.txt DerivedCoreProperties.txt CompositionExclusions.txt CaseFolding.txt CharWidths.txt
|
utf8proc_data.c.new: data_generator.rb UnicodeData.txt GraphemeBreakProperty.txt DerivedCoreProperties.txt CompositionExclusions.txt CaseFolding.txt CharWidths.txt emoji-data.txt
|
||||||
$(RUBY) data_generator.rb < UnicodeData.txt > $@
|
$(RUBY) data_generator.rb < UnicodeData.txt > $@
|
||||||
|
|
||||||
# GNU Unifont version for font metric calculations:
|
# GNU Unifont version for font metric calculations:
|
||||||
UNIFONT_VERSION=9.0.04
|
UNIFONT_VERSION=11.0.01
|
||||||
|
|
||||||
unifont.ttf:
|
unifont.ttf:
|
||||||
$(CURL) $(CURLFLAGS) -o $@ $(URLCACHE)https://mirrors.kernel.org/gnu/unifont/unifont-$(UNIFONT_VERSION)/unifont-$(UNIFONT_VERSION).ttf
|
$(CURL) $(CURLFLAGS) -o $@ $(URLCACHE)https://mirrors.kernel.org/gnu/unifont/unifont-$(UNIFONT_VERSION)/unifont-$(UNIFONT_VERSION).ttf
|
||||||
|
@ -35,7 +35,7 @@ CharWidths.txt: charwidths.jl unifont.sfd unifont_upper.sfd EastAsianWidth.txt
|
||||||
$(JULIA) charwidths.jl > $@
|
$(JULIA) charwidths.jl > $@
|
||||||
|
|
||||||
# Unicode data version
|
# Unicode data version
|
||||||
UNICODE_VERSION=9.0.0
|
UNICODE_VERSION=11.0.0
|
||||||
|
|
||||||
UnicodeData.txt:
|
UnicodeData.txt:
|
||||||
$(CURL) $(CURLFLAGS) -o $@ -O http://www.unicode.org/Public/$(UNICODE_VERSION)/ucd/UnicodeData.txt
|
$(CURL) $(CURLFLAGS) -o $@ -O http://www.unicode.org/Public/$(UNICODE_VERSION)/ucd/UnicodeData.txt
|
||||||
|
@ -61,6 +61,9 @@ NormalizationTest.txt:
|
||||||
GraphemeBreakTest.txt:
|
GraphemeBreakTest.txt:
|
||||||
$(CURL) $(CURLFLAGS) $(URLCACHE)http://www.unicode.org/Public/$(UNICODE_VERSION)/ucd/auxiliary/GraphemeBreakTest.txt | $(PERL) -pe 's,÷,/,g;s,×,+,g' > $@
|
$(CURL) $(CURLFLAGS) $(URLCACHE)http://www.unicode.org/Public/$(UNICODE_VERSION)/ucd/auxiliary/GraphemeBreakTest.txt | $(PERL) -pe 's,÷,/,g;s,×,+,g' > $@
|
||||||
|
|
||||||
|
emoji-data.txt:
|
||||||
|
$(CURL) $(CURLFLAGS) -o $@ -O $(URLCACHE)http://unicode.org/Public/emoji/`echo $(UNICODE_VERSION) | cut -d. -f1-2`/emoji-data.txt
|
||||||
|
|
||||||
clean:
|
clean:
|
||||||
rm -f UnicodeData.txt EastAsianWidth.txt GraphemeBreakProperty.txt DerivedCoreProperties.txt CompositionExclusions.txt CaseFolding.txt NormalizationTest.txt GraphemeBreakTest.txt CharWidths.txt unifont*.ttf unifont*.sfd
|
rm -f UnicodeData.txt EastAsianWidth.txt GraphemeBreakProperty.txt DerivedCoreProperties.txt CompositionExclusions.txt CaseFolding.txt NormalizationTest.txt GraphemeBreakTest.txt CharWidths.txt unifont*.ttf unifont*.sfd emoji-data.txt
|
||||||
rm -f utf8proc_data.c.new
|
rm -f utf8proc_data.c.new
|
|
@ -7,17 +7,6 @@
|
||||||
# Requires Julia (obviously) and FontForge.
|
# Requires Julia (obviously) and FontForge.
|
||||||
|
|
||||||
#############################################################################
|
#############################################################################
|
||||||
# Julia 0.3/0.4 compatibility (taken from Compat package)
|
|
||||||
if VERSION < v"0.4.0-dev+1387"
|
|
||||||
typealias AbstractString String
|
|
||||||
end
|
|
||||||
if VERSION < v"0.4.0-dev+1419"
|
|
||||||
const UInt32 = Uint32
|
|
||||||
end
|
|
||||||
if VERSION < v"0.4.0-dev+3874"
|
|
||||||
Base.parse{T<:Integer}(::Type{T}, s::AbstractString) = parseint(T, s)
|
|
||||||
end
|
|
||||||
|
|
||||||
CharWidths = Dict{Int,Int}()
|
CharWidths = Dict{Int,Int}()
|
||||||
|
|
||||||
#############################################################################
|
#############################################################################
|
||||||
|
@ -31,12 +20,12 @@ import Base.UTF8proc
|
||||||
|
|
||||||
#############################################################################
|
#############################################################################
|
||||||
# Use a default width of 1 for all character categories that are
|
# Use a default width of 1 for all character categories that are
|
||||||
# letter/symbol/number-like. This can be overriden by Unifont or UAX 11
|
# letter/symbol/number-like, as well as for unassigned/private-use chars.
|
||||||
|
# This can be overridden by Unifont or UAX 11
|
||||||
# below, but provides a useful nonzero fallback for new codepoints when
|
# below, but provides a useful nonzero fallback for new codepoints when
|
||||||
# a new Unicode version has been released but Unifont hasn't been updated yet.
|
# a new Unicode version has been released but Unifont hasn't been updated yet.
|
||||||
|
|
||||||
zerowidth = Set{Int}() # categories that may contain zero-width chars
|
zerowidth = Set{Int}() # categories that may contain zero-width chars
|
||||||
push!(zerowidth, UTF8proc.UTF8PROC_CATEGORY_CN)
|
|
||||||
push!(zerowidth, UTF8proc.UTF8PROC_CATEGORY_MN)
|
push!(zerowidth, UTF8proc.UTF8PROC_CATEGORY_MN)
|
||||||
push!(zerowidth, UTF8proc.UTF8PROC_CATEGORY_MC)
|
push!(zerowidth, UTF8proc.UTF8PROC_CATEGORY_MC)
|
||||||
push!(zerowidth, UTF8proc.UTF8PROC_CATEGORY_ME)
|
push!(zerowidth, UTF8proc.UTF8PROC_CATEGORY_ME)
|
||||||
|
@ -47,7 +36,6 @@ push!(zerowidth, UTF8proc.UTF8PROC_CATEGORY_ZP)
|
||||||
push!(zerowidth, UTF8proc.UTF8PROC_CATEGORY_CC)
|
push!(zerowidth, UTF8proc.UTF8PROC_CATEGORY_CC)
|
||||||
push!(zerowidth, UTF8proc.UTF8PROC_CATEGORY_CF)
|
push!(zerowidth, UTF8proc.UTF8PROC_CATEGORY_CF)
|
||||||
push!(zerowidth, UTF8proc.UTF8PROC_CATEGORY_CS)
|
push!(zerowidth, UTF8proc.UTF8PROC_CATEGORY_CS)
|
||||||
push!(zerowidth, UTF8proc.UTF8PROC_CATEGORY_CO)
|
|
||||||
for c in 0x0000:0x110000
|
for c in 0x0000:0x110000
|
||||||
if catcode(c) ∉ zerowidth
|
if catcode(c) ∉ zerowidth
|
||||||
CharWidths[c] = 1
|
CharWidths[c] = 1
|
||||||
|
@ -97,7 +85,7 @@ CharWidths=parsesfd("unifont_upper.sfd", CharWidths)
|
||||||
|
|
||||||
for line in readlines(open("EastAsianWidth.txt"))
|
for line in readlines(open("EastAsianWidth.txt"))
|
||||||
#Strip comments
|
#Strip comments
|
||||||
line[1] == '#' && continue
|
(isempty(line) || line[1] == '#') && continue
|
||||||
precomment = split(line, '#')[1]
|
precomment = split(line, '#')[1]
|
||||||
#Parse code point range and width code
|
#Parse code point range and width code
|
||||||
tokens = split(precomment, ';')
|
tokens = split(precomment, ';')
|
||||||
|
@ -113,7 +101,7 @@ for line in readlines(open("EastAsianWidth.txt"))
|
||||||
for c in charstart:charend
|
for c in charstart:charend
|
||||||
if width=="W" || width=="F" # wide or full
|
if width=="W" || width=="F" # wide or full
|
||||||
CharWidths[c]=2
|
CharWidths[c]=2
|
||||||
elseif width=="Na"|| width=="H" # narrow or half
|
elseif width=="Na"|| width=="H"
|
||||||
CharWidths[c]=1
|
CharWidths[c]=1
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
@ -126,9 +114,11 @@ end
|
||||||
for c in keys(CharWidths)
|
for c in keys(CharWidths)
|
||||||
cat = catcode(c)
|
cat = catcode(c)
|
||||||
|
|
||||||
# make sure format control character (category Cf) have width 0,
|
    # make sure format control characters (category Cf) have width 0
|
||||||
# except for the Arabic characters 0x06xx (see unicode std 6.2, sec. 8.2)
|
# (some of these, like U+0601, can have a width in some cases
|
||||||
if cat==UTF8proc.UTF8PROC_CATEGORY_CF && c ∉ [0x0601,0x0602,0x0603,0x06dd]
|
# but normally act like prepended combining marks. U+fff9 etc
|
||||||
|
# are also odd, but have zero width in typical terminal contexts)
|
||||||
|
if cat==UTF8proc.UTF8PROC_CATEGORY_CF
|
||||||
CharWidths[c]=0
|
CharWidths[c]=0
|
||||||
end
|
end
|
||||||
|
|
||||||
|
@ -139,11 +129,12 @@ for c in keys(CharWidths)
|
||||||
CharWidths[c]=0
|
CharWidths[c]=0
|
||||||
end
|
end
|
||||||
|
|
||||||
# We also assign width of zero to unassigned and private-use
|
# We also assign width of one to unassigned and private-use
|
||||||
# codepoints (Unifont includes ConScript Unicode Registry PUA fonts,
|
# codepoints (Unifont includes ConScript Unicode Registry PUA fonts,
|
||||||
# but since these are nonstandard it seems questionable to recognize them).
|
# but since these are nonstandard it seems questionable to use Unifont metrics;
|
||||||
|
# if they are printed as the replacement character U+FFFD they will have width 1).
|
||||||
if cat==UTF8proc.UTF8PROC_CATEGORY_CO || cat==UTF8proc.UTF8PROC_CATEGORY_CN
|
if cat==UTF8proc.UTF8PROC_CATEGORY_CO || cat==UTF8proc.UTF8PROC_CATEGORY_CN
|
||||||
CharWidths[c]=0
|
CharWidths[c]=1
|
||||||
end
|
end
|
||||||
|
|
||||||
# for some reason, Unifont has width-2 glyphs for ASCII control chars
|
# for some reason, Unifont has width-2 glyphs for ASCII control chars
|
||||||
|
@ -152,6 +143,9 @@ for c in keys(CharWidths)
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
#Soft hyphen is typically printed as a hyphen (-) in terminals.
|
||||||
|
CharWidths[0x00ad]=1
|
||||||
|
|
||||||
#By definition, should have zero width (on the same line)
|
#By definition, should have zero width (on the same line)
|
||||||
#0x002028 '
' category: Zl name: LINE SEPARATOR/
|
#0x002028 '
' category: Zl name: LINE SEPARATOR/
|
||||||
#0x002029 '
' category: Zp name: PARAGRAPH SEPARATOR/
|
#0x002029 '
' category: Zp name: PARAGRAPH SEPARATOR/
|
||||||
|
@ -169,8 +163,8 @@ CharWidths[0x2001]=2
|
||||||
CharWidths[0x2003]=2
|
CharWidths[0x2003]=2
|
||||||
|
|
||||||
#############################################################################
|
#############################################################################
|
||||||
# Output (to a file or pipe) for processing by data_generator.rb
|
# Output (to a file or pipe) for processing by data_generator.rb,
|
||||||
# ... don't bother to output zero widths since that will be the default.
|
# encoded as a sequence of intervals.
|
||||||
|
|
||||||
firstc = 0x000000
|
firstc = 0x000000
|
||||||
lastv = 0
|
lastv = 0
|
|
@ -6,6 +6,8 @@
|
||||||
# production use.
|
# production use.
|
||||||
|
|
||||||
|
|
||||||
|
# Copyright (c) 2018 Steven G. Johnson, Tony Kelman, Keno Fischer,
|
||||||
|
# Benito van der Zander, Michaël Meyer, and other contributors.
|
||||||
# Copyright (c) 2009 Public Software Group e. V., Berlin, Germany
|
# Copyright (c) 2009 Public Software Group e. V., Berlin, Germany
|
||||||
#
|
#
|
||||||
# Permission is hereby granted, free of charge, to any person obtaining a
|
# Permission is hereby granted, free of charge, to any person obtaining a
|
||||||
|
@ -85,6 +87,19 @@ $grapheme_boundclass_list.each_line do |entry|
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
$emoji_data_list = File.read("emoji-data.txt")
|
||||||
|
$emoji_data_list.each_line do |entry|
|
||||||
|
if entry =~ /^([0-9A-F]+)\.\.([0-9A-F]+)\s*;\s*Extended_Pictographic\W/
|
||||||
|
$1.hex.upto($2.hex) { |e2| $grapheme_boundclass[e2] = "UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC" }
|
||||||
|
elsif entry =~ /^([0-9A-F]+)\s*;\s*Extended_Pictographic\W/
|
||||||
|
$grapheme_boundclass[$1.hex] = "UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC"
|
||||||
|
elsif entry =~ /^([0-9A-F]+)\.\.([0-9A-F]+)\s*;\s*Emoji_Modifier\W/
|
||||||
|
$1.hex.upto($2.hex) { |e2| $grapheme_boundclass[e2] = "UTF8PROC_BOUNDCLASS_EXTEND" }
|
||||||
|
elsif entry =~ /^([0-9A-F]+)\s*;\s*Emoji_Modifier\W/
|
||||||
|
$grapheme_boundclass[$1.hex] = "UTF8PROC_BOUNDCLASS_EXTEND"
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
$charwidth_list = File.read("CharWidths.txt")
|
$charwidth_list = File.read("CharWidths.txt")
|
||||||
$charwidth = Hash.new(0)
|
$charwidth = Hash.new(0)
|
||||||
$charwidth_list.each_line do |entry|
|
$charwidth_list.each_line do |entry|
|
||||||
|
@ -104,7 +119,7 @@ $excl_version = $excl_version.chomp.split("\n").collect { |e| e.hex }
|
||||||
$case_folding_string = File.open("CaseFolding.txt", :encoding => 'utf-8').read
|
$case_folding_string = File.open("CaseFolding.txt", :encoding => 'utf-8').read
|
||||||
$case_folding = {}
|
$case_folding = {}
|
||||||
$case_folding_string.chomp.split("\n").each do |line|
|
$case_folding_string.chomp.split("\n").each do |line|
|
||||||
next unless line =~ /([0-9A-F]+); [CFS]; ([0-9A-F ]+);/i
|
next unless line =~ /([0-9A-F]+); [CF]; ([0-9A-F ]+);/i
|
||||||
$case_folding[$1.hex] = $2.split(" ").collect { |e| e.hex }
|
$case_folding[$1.hex] = $2.split(" ").collect { |e| e.hex }
|
||||||
end
|
end
|
||||||
|
|
||||||
|
@ -137,13 +152,13 @@ def cpary2utf16encoded(array)
|
||||||
end
|
end
|
||||||
def cpary2c(array)
|
def cpary2c(array)
|
||||||
return "UINT16_MAX" if array.nil? || array.length == 0
|
return "UINT16_MAX" if array.nil? || array.length == 0
|
||||||
lencode = array.length - 1 #no sequence has len 0, so we encode len 1 as 0, len 2 as 1, ...
|
lencode = array.length - 1 #no sequence has len 0, so we encode len 1 as 0, len 2 as 1, ...
|
||||||
array = cpary2utf16encoded(array)
|
array = cpary2utf16encoded(array)
|
||||||
if lencode >= 7 #we have only 3 bits for the length (which is already cutting it close. might need to change it to 2 bits in future Unicode versions)
|
if lencode >= 7 #we have only 3 bits for the length (which is already cutting it close. might need to change it to 2 bits in future Unicode versions)
|
||||||
array = [lencode] + array
|
array = [lencode] + array
|
||||||
lencode = 7
|
lencode = 7
|
||||||
end
|
end
|
||||||
idx = pushary(array)
|
idx = pushary(array)
|
||||||
raise "Array index out of bound" if idx > 0x1FFF
|
raise "Array index out of bound" if idx > 0x1FFF
|
||||||
return "#{idx | (lencode << 13)}"
|
return "#{idx | (lencode << 13)}"
|
||||||
end
|
end
|
||||||
|
@ -188,9 +203,10 @@ class UnicodeChar
|
||||||
@decomp_mapping = ($8=='') ? nil :
|
@decomp_mapping = ($8=='') ? nil :
|
||||||
$8.split.collect { |element| element.hex }
|
$8.split.collect { |element| element.hex }
|
||||||
@bidi_mirrored = ($13=='Y') ? true : false
|
@bidi_mirrored = ($13=='Y') ? true : false
|
||||||
@uppercase_mapping = ($16=='') ? nil : $16.hex
|
# issue #130: use nonstandard uppercase ß -> ẞ
|
||||||
|
@uppercase_mapping = ($16=='') ? (code==0x00df ? 0x1e9e : nil) : $16.hex
|
||||||
@lowercase_mapping = ($17=='') ? nil : $17.hex
|
@lowercase_mapping = ($17=='') ? nil : $17.hex
|
||||||
@titlecase_mapping = ($18=='') ? nil : $18.hex
|
@titlecase_mapping = ($18=='') ? (code==0x00df ? 0x1e9e : nil) : $18.hex
|
||||||
end
|
end
|
||||||
def case_folding
|
def case_folding
|
||||||
$case_folding[code]
|
$case_folding[code]
|
||||||
|
@ -260,17 +276,17 @@ chars.each do |char|
|
||||||
end
|
end
|
||||||
unless comb2nd_indicies[dm1]
|
unless comb2nd_indicies[dm1]
|
||||||
comb2nd_indicies_sorted_keys << dm1
|
comb2nd_indicies_sorted_keys << dm1
|
||||||
comb2nd_indicies[dm1] = comb2nd_indicies.keys.length
|
comb2nd_indicies[dm1] = comb2nd_indicies.keys.length
|
||||||
end
|
end
|
||||||
comb_array[comb1st_indicies[dm0]] ||= []
|
comb_array[comb1st_indicies[dm0]] ||= []
|
||||||
raise "Duplicate canonical mapping: #{char.code} #{dm0} #{dm1}" if comb_array[comb1st_indicies[dm0]][comb2nd_indicies[dm1]]
|
raise "Duplicate canonical mapping: #{char.code} #{dm0} #{dm1}" if comb_array[comb1st_indicies[dm0]][comb2nd_indicies[dm1]]
|
||||||
comb_array[comb1st_indicies[dm0]][comb2nd_indicies[dm1]] = char.code
|
comb_array[comb1st_indicies[dm0]][comb2nd_indicies[dm1]] = char.code
|
||||||
|
|
||||||
comb2nd_indicies_nonbasic[dm1] = true if char.code > 0xFFFF
|
comb2nd_indicies_nonbasic[dm1] = true if char.code > 0xFFFF
|
||||||
end
|
end
|
||||||
char.c_decomp_mapping = cpary2c(char.decomp_mapping)
|
char.c_decomp_mapping = cpary2c(char.decomp_mapping)
|
||||||
char.c_case_folding = cpary2c(char.case_folding)
|
char.c_case_folding = cpary2c(char.case_folding)
|
||||||
end
|
end
|
||||||
|
|
||||||
comb_indicies = {}
|
comb_indicies = {}
|
||||||
cumoffset = 0
|
cumoffset = 0
|
||||||
|
@ -281,7 +297,7 @@ comb1st_indicies.each do |dm0, index|
|
||||||
last = nil
|
last = nil
|
||||||
offset = 0
|
offset = 0
|
||||||
comb2nd_indicies_sorted_keys.each_with_index do |dm1, b|
|
comb2nd_indicies_sorted_keys.each_with_index do |dm1, b|
|
||||||
if comb_array[index][b]
|
if comb_array[index][b]
|
||||||
first = offset unless first
|
first = offset unless first
|
||||||
last = offset
|
last = offset
|
||||||
last += 1 if comb2nd_indicies_nonbasic[dm1]
|
last += 1 if comb2nd_indicies_nonbasic[dm1]
|
||||||
|
@ -377,7 +393,7 @@ end
|
||||||
$stdout << "};\n\n"
|
$stdout << "};\n\n"
|
||||||
|
|
||||||
$stdout << "static const utf8proc_property_t utf8proc_properties[] = {\n"
|
$stdout << "static const utf8proc_property_t utf8proc_properties[] = {\n"
|
||||||
$stdout << " {0, 0, 0, 0, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, false,false,false,false, 0, 0, UTF8PROC_BOUNDCLASS_OTHER},\n"
|
$stdout << " {0, 0, 0, 0, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, false,false,false,false, 1, 0, UTF8PROC_BOUNDCLASS_OTHER},\n"
|
||||||
properties.each { |line|
|
properties.each { |line|
|
||||||
$stdout << line
|
$stdout << line
|
||||||
}
|
}
|
||||||
|
@ -391,7 +407,7 @@ comb1st_indicies.keys.each_index do |a|
|
||||||
offset = 0
|
offset = 0
|
||||||
$stdout << comb1st_indicies_firstoffsets[a] << ", " << comb1st_indicies_lastoffsets[a] << ", "
|
$stdout << comb1st_indicies_firstoffsets[a] << ", " << comb1st_indicies_lastoffsets[a] << ", "
|
||||||
comb2nd_indicies_sorted_keys.each_with_index do |dm1, b|
|
comb2nd_indicies_sorted_keys.each_with_index do |dm1, b|
|
||||||
break if offset > comb1st_indicies_lastoffsets[a]
|
break if offset > comb1st_indicies_lastoffsets[a]
|
||||||
if offset >= comb1st_indicies_firstoffsets[a]
|
if offset >= comb1st_indicies_firstoffsets[a]
|
||||||
i += 1
|
i += 1
|
||||||
if i == 8
|
if i == 8
|
||||||
|
@ -403,9 +419,8 @@ comb1st_indicies.keys.each_index do |a|
|
||||||
$stdout << (v & 0xFFFF) << ", "
|
$stdout << (v & 0xFFFF) << ", "
|
||||||
end
|
end
|
||||||
offset += 1
|
offset += 1
|
||||||
offset += 1 if comb2nd_indicies_nonbasic[dm1]
|
offset += 1 if comb2nd_indicies_nonbasic[dm1]
|
||||||
end
|
end
|
||||||
$stdout << "\n"
|
$stdout << "\n"
|
||||||
end
|
end
|
||||||
$stdout << "};\n\n"
|
$stdout << "};\n\n"
|
||||||
|
|
|
@ -13,13 +13,22 @@ int main(int argc, char **argv)
|
||||||
for (c = 0; c <= 0x110000; ++c) {
|
for (c = 0; c <= 0x110000; ++c) {
|
||||||
utf8proc_int32_t l = utf8proc_tolower(c);
|
utf8proc_int32_t l = utf8proc_tolower(c);
|
||||||
utf8proc_int32_t u = utf8proc_toupper(c);
|
utf8proc_int32_t u = utf8proc_toupper(c);
|
||||||
|
utf8proc_int32_t t = utf8proc_totitle(c);
|
||||||
|
|
||||||
check(l == c || utf8proc_codepoint_valid(l), "invalid tolower");
|
check(l == c || utf8proc_codepoint_valid(l), "invalid tolower");
|
||||||
check(u == c || utf8proc_codepoint_valid(u), "invalid toupper");
|
check(u == c || utf8proc_codepoint_valid(u), "invalid toupper");
|
||||||
|
check(t == c || utf8proc_codepoint_valid(t), "invalid totitle");
|
||||||
|
|
||||||
|
if (utf8proc_codepoint_valid(c) && (l == u) != (l == t) &&
|
||||||
|
/* Unicode 11: Georgian Mkhedruli chars have uppercase but no titlecase. */
|
||||||
|
!(((c >= 0x10d0 && c <= 0x10fa) || c >= (0x10fd && c <= 0x10ff)) && l != u)) {
|
||||||
|
fprintf(stderr, "unexpected titlecase %x for lowercase %x / uppercase %x\n", t, l, c);
|
||||||
|
++error;
|
||||||
|
}
|
||||||
|
|
||||||
if (sizeof(wint_t) > 2 || c < (1<<16)) {
|
if (sizeof(wint_t) > 2 || c < (1<<16)) {
|
||||||
wint_t l0 = towlower(c), u0 = towupper(c);
|
wint_t l0 = towlower(c), u0 = towupper(c);
|
||||||
|
|
||||||
/* OS unicode tables may be out of date. But if they
|
/* OS unicode tables may be out of date. But if they
|
||||||
do have a lower/uppercase mapping, hopefully it
|
do have a lower/uppercase mapping, hopefully it
|
||||||
is correct? */
|
is correct? */
|
||||||
|
@ -44,6 +53,20 @@ int main(int argc, char **argv)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
check(!error, "utf8proc case conversion FAILED %d tests.", error);
|
check(!error, "utf8proc case conversion FAILED %d tests.", error);
|
||||||
|
|
||||||
|
/* issue #130 */
|
||||||
|
check(utf8proc_toupper(0x00df) == 0x1e9e &&
|
||||||
|
utf8proc_totitle(0x00df) == 0x1e9e &&
|
||||||
|
utf8proc_tolower(0x00df) == 0x00df &&
|
||||||
|
utf8proc_tolower(0x1e9e) == 0x00df &&
|
||||||
|
utf8proc_toupper(0x1e9e) == 0x1e9e,
|
||||||
|
"incorrect 0x00df/0x1e9e case conversions");
|
||||||
|
utf8proc_uint8_t str_00df[] = {0xc3, 0x9f, 0x00};
|
||||||
|
utf8proc_uint8_t str_1e9e[] = {0xe1, 0xba, 0x9e, 0x00};
|
||||||
|
check(!strcmp((char*)utf8proc_NFKC_Casefold(str_00df), "ss") &&
|
||||||
|
!strcmp((char*)utf8proc_NFKC_Casefold(str_1e9e), "ss"),
|
||||||
|
"incorrect 0x00df/0x1e9e casefold normalization");
|
||||||
|
|
||||||
printf("More up-to-date than OS unicode tables for %d tests.\n", better);
|
printf("More up-to-date than OS unicode tables for %d tests.\n", better);
|
||||||
printf("utf8proc case conversion tests SUCCEEDED.\n");
|
printf("utf8proc case conversion tests SUCCEEDED.\n");
|
||||||
return 0;
|
return 0;
|
|
@ -0,0 +1,77 @@
|
||||||
|
#include "tests.h"
|
||||||
|
#include <ctype.h>
|
||||||
|
#include <wchar.h>
|
||||||
|
|
||||||
|
static int my_unassigned(int c) {
|
||||||
|
int cat = utf8proc_get_property(c)->category;
|
||||||
|
return (cat == UTF8PROC_CATEGORY_CN) || (cat == UTF8PROC_CATEGORY_CO);
|
||||||
|
}
|
||||||
|
|
||||||
|
static int my_isprint(int c) {
|
||||||
|
int cat = utf8proc_get_property(c)->category;
|
||||||
|
return (UTF8PROC_CATEGORY_LU <= cat && cat <= UTF8PROC_CATEGORY_ZS) ||
|
||||||
|
(c == 0x0601 || c == 0x0602 || c == 0x0603 || c == 0x06dd || c == 0x00ad) ||
|
||||||
|
(cat == UTF8PROC_CATEGORY_CN) || (cat == UTF8PROC_CATEGORY_CO);
|
||||||
|
}
|
||||||
|
|
||||||
|
int main(int argc, char **argv)
|
||||||
|
{
|
||||||
|
int c, error = 0, updates = 0;
|
||||||
|
|
||||||
|
(void) argc; /* unused */
|
||||||
|
(void) argv; /* unused */
|
||||||
|
|
||||||
|
/* some simple sanity tests of the character widths */
|
||||||
|
for (c = 0; c <= 0x110000; ++c) {
|
||||||
|
int cat = utf8proc_get_property(c)->category;
|
||||||
|
int w = utf8proc_charwidth(c);
|
||||||
|
if ((cat == UTF8PROC_CATEGORY_MN || cat == UTF8PROC_CATEGORY_ME) && w > 0) {
|
||||||
|
fprintf(stderr, "nonzero width %d for combining char %x\n", w, c);
|
||||||
|
error += 1;
|
||||||
|
}
|
||||||
|
if (w == 0 &&
|
||||||
|
((cat >= UTF8PROC_CATEGORY_LU && cat <= UTF8PROC_CATEGORY_LO) ||
|
||||||
|
(cat >= UTF8PROC_CATEGORY_ND && cat <= UTF8PROC_CATEGORY_SC) ||
|
||||||
|
(cat >= UTF8PROC_CATEGORY_SO && cat <= UTF8PROC_CATEGORY_ZS))) {
|
||||||
|
fprintf(stderr, "zero width for symbol-like char %x\n", c);
|
||||||
|
error += 1;
|
||||||
|
}
|
||||||
|
if (c <= 127 && ((!isprint(c) && w > 0) || (isprint(c) && wcwidth(c) != w))) {
|
||||||
|
fprintf(stderr, "wcwidth %d mismatch %d for %s ASCII %x\n",
|
||||||
|
wcwidth(c), w,
|
||||||
|
isprint(c) ? "printable" : "non-printable", c);
|
||||||
|
error += 1;
|
||||||
|
}
|
||||||
|
if (!my_isprint(c) && w > 0) {
|
||||||
|
fprintf(stderr, "non-printing %x had width %d\n", c, w);
|
||||||
|
error += 1;
|
||||||
|
}
|
||||||
|
if (my_unassigned(c) && w != 1) {
|
||||||
|
fprintf(stderr, "unexpected width %d for unassigned char %x\n", w, c);
|
||||||
|
error += 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
check(!error, "utf8proc_charwidth FAILED %d tests.", error);
|
||||||
|
|
||||||
|
check(utf8proc_charwidth(0x00ad) == 1, "incorrect width for U+00AD (soft hyphen)");
|
||||||
|
check(utf8proc_charwidth(0xe000) == 1, "incorrect width for U+e000 (PUA)");
|
||||||
|
|
||||||
|
/* print some other information by compariing with system wcwidth */
|
||||||
|
printf("Mismatches with system wcwidth (not necessarily errors):\n");
|
||||||
|
for (c = 0; c <= 0x110000; ++c) {
|
||||||
|
int w = utf8proc_charwidth(c);
|
||||||
|
int wc = wcwidth(c);
|
||||||
|
if (sizeof(wchar_t) == 2 && c >= (1<<16)) continue;
|
||||||
|
/* lots of these errors for out-of-date system unicode tables */
|
||||||
|
if (wc == -1 && my_isprint(c) && !my_unassigned(c) && w > 0)
|
||||||
|
updates += 1;
|
||||||
|
if (wc == -1 && !my_isprint(c) && w > 0)
|
||||||
|
printf(" wcwidth(%x) = -1 for non-printable width-%d char\n", c, w);
|
||||||
|
if (wc >= 0 && wc != w)
|
||||||
|
printf(" wcwidth(%x) = %d != charwidth %d\n", c, wc, w);
|
||||||
|
}
|
||||||
|
printf(" ... (positive widths for %d chars unknown to wcwidth) ...\n", updates);
|
||||||
|
printf("Character-width tests SUCCEEDED.\n");
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
|
@ -23,5 +23,6 @@ int main(void)
|
||||||
check(strlen((char*) output) == 6, "incorrect output length");
|
check(strlen((char*) output) == 6, "incorrect output length");
|
||||||
check(!memcmp(correct, output, 7), "incorrect output data");
|
check(!memcmp(correct, output, 7), "incorrect output data");
|
||||||
free(output);
|
free(output);
|
||||||
|
printf("map_custom tests SUCCEEDED.\n");
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
|
@ -0,0 +1,46 @@
|
||||||
|
/* Miscellaneous tests, e.g. regression tests */
|
||||||
|
|
||||||
|
#include "tests.h"
|
||||||
|
|
||||||
|
static void issue128(void) /* #128 */
|
||||||
|
{
|
||||||
|
utf8proc_uint8_t input[] = {0x72, 0xcc, 0x87, 0xcc, 0xa3, 0x00}; /* "r\u0307\u0323" */
|
||||||
|
utf8proc_uint8_t nfc[] = {0xe1, 0xb9, 0x9b, 0xcc, 0x87, 0x00}; /* "\u1E5B\u0307" */
|
||||||
|
utf8proc_uint8_t nfd[] = {0x72, 0xcc, 0xa3, 0xcc, 0x87, 0x00}; /* "r\u0323\u0307" */
|
||||||
|
utf8proc_uint8_t *nfc_out, *nfd_out;
|
||||||
|
nfc_out = utf8proc_NFC(input);
|
||||||
|
printf("NFC \"%s\" -> \"%s\" vs. \"%s\"\n", (char*)input, (char*)nfc_out, (char*)nfc);
|
||||||
|
check(strlen((char*) nfc_out) == 5, "incorrect nfc length");
|
||||||
|
check(!memcmp(nfc, nfc_out, 6), "incorrect nfc data");
|
||||||
|
nfd_out = utf8proc_NFD(input);
|
||||||
|
printf("NFD \"%s\" -> \"%s\" vs. \"%s\"\n", (char*)input, (char*)nfd_out, (char*)nfd);
|
||||||
|
check(strlen((char*) nfd_out) == 5, "incorrect nfd length");
|
||||||
|
check(!memcmp(nfd, nfd_out, 6), "incorrect nfd data");
|
||||||
|
free(nfd_out); free(nfc_out);
|
||||||
|
}
|
||||||
|
|
||||||
|
static void issue102(void) /* #128 */
|
||||||
|
{
|
||||||
|
utf8proc_uint8_t input[] = {0x58, 0xe2, 0x81, 0xa5, 0x45, 0xcc, 0x80, 0xc2, 0xad, 0xe1, 0xb4, 0xac, 0x00}; /* "X\u2065E\u0300\u00ad\u1d2c" */
|
||||||
|
utf8proc_uint8_t stripna[] = {0x78, 0xc3, 0xa8, 0x61, 0x00}; /* "x\u00e8a" */
|
||||||
|
utf8proc_uint8_t correct[] = {0x78, 0xe2, 0x81, 0xa5, 0xc3, 0xa8, 0x61, 0x00}; /* "x\u2065\u00e8a" */
|
||||||
|
utf8proc_uint8_t *output;
|
||||||
|
utf8proc_map(input, 0, &output, UTF8PROC_NULLTERM | UTF8PROC_STABLE |
|
||||||
|
UTF8PROC_COMPOSE | UTF8PROC_COMPAT | UTF8PROC_CASEFOLD | UTF8PROC_IGNORE | UTF8PROC_STRIPNA);
|
||||||
|
printf("NFKC_Casefold \"%s\" -> \"%s\" vs. \"%s\"\n", (char*)input, (char*)output, (char*)stripna);
|
||||||
|
check(strlen((char*) output) == 4, "incorrect NFKC_Casefold+stripna length");
|
||||||
|
check(!memcmp(stripna, output, 5), "incorrect NFKC_Casefold+stripna data");
|
||||||
|
free(output);
|
||||||
|
output = utf8proc_NFKC_Casefold(input);
|
||||||
|
printf("NFKC_Casefold \"%s\" -> \"%s\" vs. \"%s\"\n", (char*)input, (char*)output, (char*)correct);
|
||||||
|
check(strlen((char*) output) == 7, "incorrect NFKC_Casefold length");
|
||||||
|
check(!memcmp(correct, output, 8), "incorrect NFKC_Casefold data");
|
||||||
|
}
|
||||||
|
|
||||||
|
int main(void)
|
||||||
|
{
|
||||||
|
issue128();
|
||||||
|
issue102();
|
||||||
|
printf("Misc tests SUCCEEDED.\n");
|
||||||
|
return 0;
|
||||||
|
}
|
|
@ -0,0 +1,60 @@
|
||||||
|
/* simple test program to print out the utf8proc properties for a codepoint */
|
||||||
|
|
||||||
|
#include "tests.h"
|
||||||
|
|
||||||
|
int main(int argc, char **argv)
|
||||||
|
{
|
||||||
|
int i;
|
||||||
|
|
||||||
|
for (i = 1; i < argc; ++i) {
|
||||||
|
utf8proc_uint8_t cstr[16], *map;
|
||||||
|
unsigned int c;
|
||||||
|
if (!strcmp(argv[i], "-V")) {
|
||||||
|
printf("utf8proc version %s\n", utf8proc_version());
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
check(sscanf(argv[i],"%x",&c) == 1, "invalid hex input %s", argv[i]);
|
||||||
|
const utf8proc_property_t *p = utf8proc_get_property(c);
|
||||||
|
|
||||||
|
if (utf8proc_codepoint_valid(c))
|
||||||
|
cstr[utf8proc_encode_char(c, cstr)] = 0;
|
||||||
|
else
|
||||||
|
strcat((char*)cstr, "N/A");
|
||||||
|
utf8proc_map(cstr, 0, &map, UTF8PROC_NULLTERM | UTF8PROC_CASEFOLD);
|
||||||
|
|
||||||
|
printf("U+%s: %s\n"
|
||||||
|
" category = %s\n"
|
||||||
|
" combining_class = %d\n"
|
||||||
|
" bidi_class = %d\n"
|
||||||
|
" decomp_type = %d\n"
|
||||||
|
" uppercase_mapping = %x\n"
|
||||||
|
" lowercase_mapping = %x\n"
|
||||||
|
" titlecase_mapping = %x\n"
|
||||||
|
" casefold = %s\n"
|
||||||
|
" comb_index = %d\n"
|
||||||
|
" bidi_mirrored = %d\n"
|
||||||
|
" comp_exclusion = %d\n"
|
||||||
|
" ignorable = %d\n"
|
||||||
|
" control_boundary = %d\n"
|
||||||
|
" boundclass = %d\n"
|
||||||
|
" charwidth = %d\n",
|
||||||
|
argv[i], (char*) cstr,
|
||||||
|
utf8proc_category_string(c),
|
||||||
|
p->combining_class,
|
||||||
|
p->bidi_class,
|
||||||
|
p->decomp_type,
|
||||||
|
utf8proc_toupper(c),
|
||||||
|
utf8proc_tolower(c),
|
||||||
|
utf8proc_totitle(c),
|
||||||
|
(char *) map,
|
||||||
|
p->comb_index,
|
||||||
|
p->bidi_mirrored,
|
||||||
|
p->comp_exclusion,
|
||||||
|
p->ignorable,
|
||||||
|
p->control_boundary,
|
||||||
|
p->boundclass,
|
||||||
|
utf8proc_charwidth(c));
|
||||||
|
free(map);
|
||||||
|
}
|
||||||
|
return 0;
|
||||||
|
}
|
|
@ -1,6 +1,6 @@
|
||||||
/* -*- mode: c; c-basic-offset: 2; tab-width: 2; indent-tabs-mode: nil -*- */
|
/* -*- mode: c; c-basic-offset: 2; tab-width: 2; indent-tabs-mode: nil -*- */
|
||||||
/*
|
/*
|
||||||
* Copyright (c) 2015 Steven G. Johnson, Jiahao Chen, Peter Colberg, Tony Kelman, Scott P. Jones, and other contributors.
|
* Copyright (c) 2018 Steven G. Johnson, Jiahao Chen, Peter Colberg, Tony Kelman, Scott P. Jones, and other contributors.
|
||||||
* Copyright (c) 2009 Public Software Group e. V., Berlin, Germany
|
* Copyright (c) 2009 Public Software Group e. V., Berlin, Germany
|
||||||
*
|
*
|
||||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||||
|
@ -42,6 +42,14 @@
|
||||||
|
|
||||||
|
|
||||||
#include "utf8proc.h"
|
#include "utf8proc.h"
|
||||||
|
|
||||||
|
#ifndef SSIZE_MAX
|
||||||
|
#define SSIZE_MAX ((size_t)SIZE_MAX/2)
|
||||||
|
#endif
|
||||||
|
#ifndef UINT16_MAX
|
||||||
|
# define UINT16_MAX 65535U
|
||||||
|
#endif
|
||||||
|
|
||||||
#include "utf8proc_data.c"
|
#include "utf8proc_data.c"
|
||||||
|
|
||||||
|
|
||||||
|
@ -271,12 +279,8 @@ static utf8proc_bool grapheme_break_simple(int lbc, int tbc) {
|
||||||
tbc == UTF8PROC_BOUNDCLASS_ZWJ || // ---
|
tbc == UTF8PROC_BOUNDCLASS_ZWJ || // ---
|
||||||
tbc == UTF8PROC_BOUNDCLASS_SPACINGMARK || // GB9a
|
tbc == UTF8PROC_BOUNDCLASS_SPACINGMARK || // GB9a
|
||||||
lbc == UTF8PROC_BOUNDCLASS_PREPEND) ? false : // GB9b
|
lbc == UTF8PROC_BOUNDCLASS_PREPEND) ? false : // GB9b
|
||||||
((lbc == UTF8PROC_BOUNDCLASS_E_BASE || // GB10 (requires additional handling below)
|
(lbc == UTF8PROC_BOUNDCLASS_E_ZWG && // GB11 (requires additional handling below)
|
||||||
lbc == UTF8PROC_BOUNDCLASS_E_BASE_GAZ) && // ----
|
tbc == UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC) ? false : // ----
|
||||||
tbc == UTF8PROC_BOUNDCLASS_E_MODIFIER) ? false : // ----
|
|
||||||
(lbc == UTF8PROC_BOUNDCLASS_ZWJ && // GB11
|
|
||||||
(tbc == UTF8PROC_BOUNDCLASS_GLUE_AFTER_ZWJ || // ----
|
|
||||||
tbc == UTF8PROC_BOUNDCLASS_E_BASE_GAZ)) ? false : // ----
|
|
||||||
(lbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR && // GB12/13 (requires additional handling below)
|
(lbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR && // GB12/13 (requires additional handling below)
|
||||||
tbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR) ? false : // ----
|
tbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR) ? false : // ----
|
||||||
true; // GB999
|
true; // GB999
|
||||||
|
@ -284,9 +288,8 @@ static utf8proc_bool grapheme_break_simple(int lbc, int tbc) {
|
||||||
|
|
||||||
static utf8proc_bool grapheme_break_extended(int lbc, int tbc, utf8proc_int32_t *state)
|
static utf8proc_bool grapheme_break_extended(int lbc, int tbc, utf8proc_int32_t *state)
|
||||||
{
|
{
|
||||||
int lbc_override = lbc;
|
int lbc_override = ((state && *state != UTF8PROC_BOUNDCLASS_START)
|
||||||
if (state && *state != UTF8PROC_BOUNDCLASS_START)
|
? *state : lbc);
|
||||||
lbc_override = *state;
|
|
||||||
utf8proc_bool break_permitted = grapheme_break_simple(lbc_override, tbc);
|
utf8proc_bool break_permitted = grapheme_break_simple(lbc_override, tbc);
|
||||||
if (state) {
|
if (state) {
|
||||||
// Special support for GB 12/13 made possible by GB999. After two RI
|
// Special support for GB 12/13 made possible by GB999. After two RI
|
||||||
|
@ -296,12 +299,15 @@ static utf8proc_bool grapheme_break_extended(int lbc, int tbc, utf8proc_int32_t
|
||||||
// forbidden by a different rule such as GB9).
|
// forbidden by a different rule such as GB9).
|
||||||
if (*state == tbc && tbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR)
|
if (*state == tbc && tbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR)
|
||||||
*state = UTF8PROC_BOUNDCLASS_OTHER;
|
*state = UTF8PROC_BOUNDCLASS_OTHER;
|
||||||
// Special support for GB10. Fold any EXTEND codepoints into the previous
|
// Special support for GB11 (emoji extend* zwj / emoji)
|
||||||
// boundclass if we're dealing with an emoji base boundclass.
|
else if (*state == UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC) {
|
||||||
else if ((*state == UTF8PROC_BOUNDCLASS_E_BASE ||
|
if (tbc == UTF8PROC_BOUNDCLASS_EXTEND) // fold EXTEND codepoints into emoji
|
||||||
*state == UTF8PROC_BOUNDCLASS_E_BASE_GAZ) &&
|
*state = UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC;
|
||||||
tbc == UTF8PROC_BOUNDCLASS_EXTEND)
|
else if (tbc == UTF8PROC_BOUNDCLASS_ZWJ)
|
||||||
*state = UTF8PROC_BOUNDCLASS_E_BASE;
|
*state = UTF8PROC_BOUNDCLASS_E_ZWG; // state to record emoji+zwg combo
|
||||||
|
else
|
||||||
|
*state = tbc;
|
||||||
|
}
|
||||||
else
|
else
|
||||||
*state = tbc;
|
*state = tbc;
|
||||||
}
|
}
|
||||||
|
@ -424,6 +430,9 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_char(utf8proc_int32_t uc,
|
||||||
if (options & UTF8PROC_IGNORE) {
|
if (options & UTF8PROC_IGNORE) {
|
||||||
if (property->ignorable) return 0;
|
if (property->ignorable) return 0;
|
||||||
}
|
}
|
||||||
|
if (options & UTF8PROC_STRIPNA) {
|
||||||
|
if (!category) return 0;
|
||||||
|
}
|
||||||
if (options & UTF8PROC_LUMP) {
|
if (options & UTF8PROC_LUMP) {
|
||||||
if (category == UTF8PROC_CATEGORY_ZS) utf8proc_decompose_lump(0x0020);
|
if (category == UTF8PROC_CATEGORY_ZS) utf8proc_decompose_lump(0x0020);
|
||||||
if (uc == 0x2018 || uc == 0x2019 || uc == 0x02BC || uc == 0x02C8)
|
if (uc == 0x2018 || uc == 0x2019 || uc == 0x02BC || uc == 0x02C8)
|
||||||
|
@ -632,9 +641,9 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_normalize_utf32(utf8proc_int32_t *b
|
||||||
current_property->comb_index != UINT16_MAX &&
|
current_property->comb_index != UINT16_MAX &&
|
||||||
current_property->comb_index >= 0x8000) {
|
current_property->comb_index >= 0x8000) {
|
||||||
int sidx = starter_property->comb_index;
|
int sidx = starter_property->comb_index;
|
||||||
int idx = (current_property->comb_index & 0x3FFF) - utf8proc_combinations[sidx];
|
int idx = current_property->comb_index & 0x3FFF;
|
||||||
if (idx >= 0 && idx <= utf8proc_combinations[sidx + 1] ) {
|
if (idx >= utf8proc_combinations[sidx] && idx <= utf8proc_combinations[sidx + 1] ) {
|
||||||
idx += sidx + 2;
|
idx += sidx + 2 - utf8proc_combinations[sidx];
|
||||||
if (current_property->comb_index & 0x4000) {
|
if (current_property->comb_index & 0x4000) {
|
||||||
composition = (utf8proc_combinations[idx] << 16) | utf8proc_combinations[idx+1];
|
composition = (utf8proc_combinations[idx] << 16) | utf8proc_combinations[idx+1];
|
||||||
} else
|
} else
|
||||||
|
@ -753,3 +762,10 @@ UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKC(const utf8proc_uint8_t *str)
|
||||||
UTF8PROC_COMPOSE | UTF8PROC_COMPAT);
|
UTF8PROC_COMPOSE | UTF8PROC_COMPAT);
|
||||||
return retval;
|
return retval;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Shortcut for utf8proc_map() on a NUL-terminated string with the
   NFKC_Casefold flag set (STABLE, COMPOSE, COMPAT, CASEFOLD, IGNORE).
   Returns the buffer that utf8proc_map() stored through its output pointer.
   The return code of utf8proc_map() is not inspected here.
   NOTE(review): callers in the tests release the result with free(). */
UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKC_Casefold(const utf8proc_uint8_t *str) {
  const utf8proc_option_t flags = (utf8proc_option_t)(
    UTF8PROC_NULLTERM | UTF8PROC_STABLE | UTF8PROC_COMPOSE |
    UTF8PROC_COMPAT | UTF8PROC_CASEFOLD | UTF8PROC_IGNORE);
  utf8proc_uint8_t *folded;

  utf8proc_map(str, 0, &folded, flags);
  return folded;
}
|
|
@ -1,5 +1,5 @@
|
||||||
/*
|
/*
|
||||||
* Copyright (c) 2015 Steven G. Johnson, Jiahao Chen, Peter Colberg, Tony Kelman, Scott P. Jones, and other contributors.
|
* Copyright (c) 2018 Steven G. Johnson, Jiahao Chen, Peter Colberg, Tony Kelman, Scott P. Jones, and other contributors.
|
||||||
* Copyright (c) 2009 Public Software Group e. V., Berlin, Germany
|
* Copyright (c) 2009 Public Software Group e. V., Berlin, Germany
|
||||||
*
|
*
|
||||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||||
|
@ -71,7 +71,7 @@
|
||||||
/** The MAJOR version number (increased when backwards API compatibility is broken). */
|
/** The MAJOR version number (increased when backwards API compatibility is broken). */
|
||||||
#define UTF8PROC_VERSION_MAJOR 2
|
#define UTF8PROC_VERSION_MAJOR 2
|
||||||
/** The MINOR version number (increased when new functionality is added in a backwards-compatible manner). */
|
/** The MINOR version number (increased when new functionality is added in a backwards-compatible manner). */
|
||||||
#define UTF8PROC_VERSION_MINOR 1
|
#define UTF8PROC_VERSION_MINOR 2
|
||||||
/** The PATCH version (increased for fixes that do not change the API). */
|
/** The PATCH version (increased for fixes that do not change the API). */
|
||||||
#define UTF8PROC_VERSION_PATCH 0
|
#define UTF8PROC_VERSION_PATCH 0
|
||||||
/** @} */
|
/** @} */
|
||||||
|
@ -120,20 +120,26 @@ typedef bool utf8proc_bool;
|
||||||
#endif
|
#endif
|
||||||
#include <limits.h>
|
#include <limits.h>
|
||||||
|
|
||||||
#define UTF8PROC_DLLEXPORT
|
/* UTF8PROC_DLLEXPORT decorates every public declaration.
 *   - UTF8PROC_STATIC defined: expands to nothing.
 *   - _WIN32: dllexport when UTF8PROC_EXPORTS is defined, otherwise dllimport.
 *   - GCC-compatible compilers (__GNUC__ >= 4): default symbol visibility.
 *   - anything else: expands to nothing.
 */
#if defined(UTF8PROC_STATIC)
#  define UTF8PROC_DLLEXPORT
#elif defined(_WIN32)
#  if defined(UTF8PROC_EXPORTS)
#    define UTF8PROC_DLLEXPORT __declspec(dllexport)
#  else
#    define UTF8PROC_DLLEXPORT __declspec(dllimport)
#  endif
#elif defined(__GNUC__) && (__GNUC__ >= 4)
#  define UTF8PROC_DLLEXPORT __attribute__ ((visibility("default")))
#else
#  define UTF8PROC_DLLEXPORT
#endif
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
extern "C" {
|
extern "C" {
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifndef SSIZE_MAX
|
|
||||||
#define SSIZE_MAX ((size_t)SIZE_MAX/2)
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#ifndef UINT16_MAX
|
|
||||||
# define UINT16_MAX 65535U
|
|
||||||
#endif
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Option flags used by several functions in the library.
|
* Option flags used by several functions in the library.
|
||||||
*/
|
*/
|
||||||
|
@ -199,6 +205,10 @@ typedef enum {
|
||||||
* @ref UTF8PROC_DECOMPOSE
|
* @ref UTF8PROC_DECOMPOSE
|
||||||
*/
|
*/
|
||||||
UTF8PROC_STRIPMARK = (1<<13),
|
UTF8PROC_STRIPMARK = (1<<13),
|
||||||
|
/**
|
||||||
|
* Strip unassigned codepoints.
|
||||||
|
*/
|
||||||
|
UTF8PROC_STRIPNA = (1<<14),
|
||||||
} utf8proc_option_t;
|
} utf8proc_option_t;
|
||||||
|
|
||||||
/** @name Error codes
|
/** @name Error codes
|
||||||
|
@ -364,10 +374,18 @@ typedef enum {
|
||||||
UTF8PROC_BOUNDCLASS_SPACINGMARK = 12, /**< Spacingmark */
|
UTF8PROC_BOUNDCLASS_SPACINGMARK = 12, /**< Spacingmark */
|
||||||
UTF8PROC_BOUNDCLASS_PREPEND = 13, /**< Prepend */
|
UTF8PROC_BOUNDCLASS_PREPEND = 13, /**< Prepend */
|
||||||
UTF8PROC_BOUNDCLASS_ZWJ = 14, /**< Zero Width Joiner */
|
UTF8PROC_BOUNDCLASS_ZWJ = 14, /**< Zero Width Joiner */
|
||||||
|
|
||||||
|
/* the following are no longer used in Unicode 11, but we keep
|
||||||
|
the constants here for backward compatibility */
|
||||||
UTF8PROC_BOUNDCLASS_E_BASE = 15, /**< Emoji Base */
|
UTF8PROC_BOUNDCLASS_E_BASE = 15, /**< Emoji Base */
|
||||||
UTF8PROC_BOUNDCLASS_E_MODIFIER = 16, /**< Emoji Modifier */
|
UTF8PROC_BOUNDCLASS_E_MODIFIER = 16, /**< Emoji Modifier */
|
||||||
UTF8PROC_BOUNDCLASS_GLUE_AFTER_ZWJ = 17, /**< Glue_After_ZWJ */
|
UTF8PROC_BOUNDCLASS_GLUE_AFTER_ZWJ = 17, /**< Glue_After_ZWJ */
|
||||||
UTF8PROC_BOUNDCLASS_E_BASE_GAZ = 18, /**< E_BASE + GLUE_AFTER_ZWJ */
|
UTF8PROC_BOUNDCLASS_E_BASE_GAZ = 18, /**< E_BASE + GLUE_AFTER_ZWJ */
|
||||||
|
|
||||||
|
/* the Extended_Pictographic property is used in the Unicode 11
|
||||||
|
grapheme-boundary rules, so we store it in the boundclass field */
|
||||||
|
UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC = 19,
|
||||||
|
UTF8PROC_BOUNDCLASS_E_ZWG = 20, /* UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC + ZWJ */
|
||||||
} utf8proc_boundclass_t;
|
} utf8proc_boundclass_t;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -455,6 +473,7 @@ UTF8PROC_DLLEXPORT const utf8proc_property_t *utf8proc_get_property(utf8proc_int
|
||||||
* - @ref UTF8PROC_CHARBOUND - insert 0xFF bytes before each grapheme cluster
|
* - @ref UTF8PROC_CHARBOUND - insert 0xFF bytes before each grapheme cluster
|
||||||
* - @ref UTF8PROC_LUMP - lump certain different codepoints together
|
* - @ref UTF8PROC_LUMP - lump certain different codepoints together
|
||||||
* - @ref UTF8PROC_STRIPMARK - remove all character marks
|
* - @ref UTF8PROC_STRIPMARK - remove all character marks
|
||||||
|
* - @ref UTF8PROC_STRIPNA - remove unassigned codepoints
|
||||||
* @param last_boundclass
|
* @param last_boundclass
|
||||||
* Pointer to an integer variable containing
|
* Pointer to an integer variable containing
|
||||||
* the previous codepoint's boundary class if the @ref UTF8PROC_CHARBOUND
|
* the previous codepoint's boundary class if the @ref UTF8PROC_CHARBOUND
|
||||||
|
@ -566,6 +585,8 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_reencode(utf8proc_int32_t *buffer,
|
||||||
* Given a pair of consecutive codepoints, return whether a grapheme break is
|
* Given a pair of consecutive codepoints, return whether a grapheme break is
|
||||||
* permitted between them (as defined by the extended grapheme clusters in UAX#29).
|
* permitted between them (as defined by the extended grapheme clusters in UAX#29).
|
||||||
*
|
*
|
||||||
|
* @param codepoint1 The first codepoint.
|
||||||
|
* @param codepoint2 The second codepoint, occurring consecutively after `codepoint1`.
|
||||||
* @param state Beginning with Version 29 (Unicode 9.0.0), this algorithm requires
|
* @param state Beginning with Version 29 (Unicode 9.0.0), this algorithm requires
|
||||||
* state to break graphemes. This state can be passed in as a pointer
|
* state to break graphemes. This state can be passed in as a pointer
|
||||||
* in the `state` argument and should initially be set to 0. If the
|
* in the `state` argument and should initially be set to 0. If the
|
||||||
|
@ -641,7 +662,7 @@ UTF8PROC_DLLEXPORT const char *utf8proc_category_string(utf8proc_int32_t codepoi
|
||||||
* contain NULL characters with the string if `str` contained NULL
|
* contain NULL characters with the string if `str` contained NULL
|
||||||
* characters). Other flags in the `options` field are passed to the
|
* characters). Other flags in the `options` field are passed to the
|
||||||
* functions defined above, and regarded as described. See also
|
* functions defined above, and regarded as described. See also
|
||||||
* @ref utfproc_map_custom to supply a custom codepoint transformation.
|
* @ref utf8proc_map_custom to supply a custom codepoint transformation.
|
||||||
*
|
*
|
||||||
* In case of success the length of the new string is returned,
|
* In case of success the length of the new string is returned,
|
||||||
* otherwise a negative error code is returned.
|
* otherwise a negative error code is returned.
|
||||||
|
@ -666,8 +687,8 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_map_custom(
|
||||||
|
|
||||||
/** @name Unicode normalization
|
/** @name Unicode normalization
|
||||||
*
|
*
|
||||||
* Returns a pointer to newly allocated memory of a NFD, NFC, NFKD or NFKC
|
* Returns a pointer to newly allocated memory of a NFD, NFC, NFKD, NFKC or
|
||||||
* normalized version of the null-terminated string `str`. These
|
* NFKC_Casefold normalized version of the null-terminated string `str`. These
|
||||||
* are shortcuts to calling @ref utf8proc_map with @ref UTF8PROC_NULLTERM
|
* are shortcuts to calling @ref utf8proc_map with @ref UTF8PROC_NULLTERM
|
||||||
* combined with @ref UTF8PROC_STABLE and flags indicating the normalization.
|
* combined with @ref UTF8PROC_STABLE and flags indicating the normalization.
|
||||||
*/
|
*/
|
||||||
|
@ -680,6 +701,11 @@ UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFC(const utf8proc_uint8_t *str);
|
||||||
UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKD(const utf8proc_uint8_t *str);
|
UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKD(const utf8proc_uint8_t *str);
|
||||||
/** NFKC normalization (@ref UTF8PROC_COMPOSE and @ref UTF8PROC_COMPAT). */
|
/** NFKC normalization (@ref UTF8PROC_COMPOSE and @ref UTF8PROC_COMPAT). */
|
||||||
UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKC(const utf8proc_uint8_t *str);
|
UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKC(const utf8proc_uint8_t *str);
|
||||||
|
/**
|
||||||
|
* NFKC_Casefold normalization (@ref UTF8PROC_COMPOSE and @ref UTF8PROC_COMPAT
|
||||||
|
* and @ref UTF8PROC_CASEFOLD and @ref UTF8PROC_IGNORE).
|
||||||
|
**/
|
||||||
|
UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKC_Casefold(const utf8proc_uint8_t *str);
|
||||||
/** @} */
|
/** @} */
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
File diff suppressed because it is too large
Load Diff
|
@ -1,33 +0,0 @@
|
||||||
cmake_minimum_required (VERSION 2.8)
|
|
||||||
|
|
||||||
include (utils.cmake)
|
|
||||||
|
|
||||||
disallow_intree_builds()
|
|
||||||
|
|
||||||
project (utf8proc C)
|
|
||||||
|
|
||||||
# This is the ABI version number, which may differ from the
|
|
||||||
# API version number (defined in utf8proc.h).
|
|
||||||
# Be sure to also update these in Makefile and MANIFEST!
|
|
||||||
set(SO_MAJOR 2)
|
|
||||||
set(SO_MINOR 1)
|
|
||||||
set(SO_PATCH 0)
|
|
||||||
|
|
||||||
add_definitions (
|
|
||||||
-DUTF8PROC_EXPORTS
|
|
||||||
)
|
|
||||||
|
|
||||||
if (NOT MSVC)
|
|
||||||
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -O2 -std=c99 -pedantic -Wall")
|
|
||||||
endif ()
|
|
||||||
|
|
||||||
add_library (utf8proc
|
|
||||||
utf8proc.c
|
|
||||||
utf8proc.h
|
|
||||||
)
|
|
||||||
|
|
||||||
set_target_properties (utf8proc PROPERTIES
|
|
||||||
POSITION_INDEPENDENT_CODE ON
|
|
||||||
VERSION "${SO_MAJOR}.${SO_MINOR}.${SO_PATCH}"
|
|
||||||
SOVERSION ${SO_MAJOR}
|
|
||||||
)
|
|
|
@ -1,7 +0,0 @@
|
||||||
include/
|
|
||||||
include/utf8proc.h
|
|
||||||
lib/
|
|
||||||
lib/libutf8proc.a
|
|
||||||
lib/libutf8proc.so -> libutf8proc.so.2.1.0
|
|
||||||
lib/libutf8proc.so.2 -> libutf8proc.so.2.1.0
|
|
||||||
lib/libutf8proc.so.2.1.0
|
|
|
@ -1,71 +0,0 @@
|
||||||
#include "tests.h"
|
|
||||||
#include <ctype.h>
|
|
||||||
#include <wchar.h>
|
|
||||||
|
|
||||||
static int my_isprint(int c) {
|
|
||||||
int cat = utf8proc_get_property(c)->category;
|
|
||||||
return (UTF8PROC_CATEGORY_LU <= cat && cat <= UTF8PROC_CATEGORY_ZS) ||
|
|
||||||
(c == 0x0601 || c == 0x0602 || c == 0x0603 || c == 0x06dd);
|
|
||||||
}
|
|
||||||
|
|
||||||
int main(int argc, char **argv)
|
|
||||||
{
|
|
||||||
int c, error = 0, updates = 0;
|
|
||||||
|
|
||||||
(void) argc; /* unused */
|
|
||||||
(void) argv; /* unused */
|
|
||||||
|
|
||||||
/* some simple sanity tests of the character widths */
|
|
||||||
for (c = 0; c <= 0x110000; ++c) {
|
|
||||||
int cat = utf8proc_get_property(c)->category;
|
|
||||||
int w = utf8proc_charwidth(c);
|
|
||||||
if ((cat == UTF8PROC_CATEGORY_MN || cat == UTF8PROC_CATEGORY_ME) &&
|
|
||||||
w > 0) {
|
|
||||||
fprintf(stderr, "nonzero width %d for combining char %x\n", w, c);
|
|
||||||
error = 1;
|
|
||||||
}
|
|
||||||
if (w == 0 &&
|
|
||||||
((cat >= UTF8PROC_CATEGORY_LU && cat <= UTF8PROC_CATEGORY_LO) ||
|
|
||||||
(cat >= UTF8PROC_CATEGORY_ND && cat <= UTF8PROC_CATEGORY_SC) ||
|
|
||||||
(cat >= UTF8PROC_CATEGORY_SO && cat <= UTF8PROC_CATEGORY_ZS))) {
|
|
||||||
fprintf(stderr, "zero width for symbol-like char %x\n", c);
|
|
||||||
error = 1;
|
|
||||||
}
|
|
||||||
if (c <= 127 && ((!isprint(c) && w > 0) ||
|
|
||||||
(isprint(c) && wcwidth(c) != w))) {
|
|
||||||
fprintf(stderr, "wcwidth %d mismatch %d for %s ASCII %x\n",
|
|
||||||
wcwidth(c), w,
|
|
||||||
isprint(c) ? "printable" : "non-printable", c);
|
|
||||||
error = 1;
|
|
||||||
}
|
|
||||||
if (!my_isprint(c) && w > 0) {
|
|
||||||
fprintf(stderr, "non-printing %x had width %d\n", c, w);
|
|
||||||
error = 1;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
check(!error, "utf8proc_charwidth FAILED tests.");
|
|
||||||
|
|
||||||
/* print some other information by compariing with system wcwidth */
|
|
||||||
printf("Mismatches with system wcwidth (not necessarily errors):\n");
|
|
||||||
for (c = 0; c <= 0x110000; ++c) {
|
|
||||||
int w = utf8proc_charwidth(c);
|
|
||||||
int wc = wcwidth(c);
|
|
||||||
if (sizeof(wchar_t) == 2 && c >= (1<<16)) continue;
|
|
||||||
/* lots of these errors for out-of-date system unicode tables */
|
|
||||||
if (wc == -1 && my_isprint(c) && w > 0) {
|
|
||||||
updates += 1;
|
|
||||||
#if 0
|
|
||||||
printf(" wcwidth(%x) = -1 for printable char\n", c);
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
if (wc == -1 && !my_isprint(c) && w > 0)
|
|
||||||
printf(" wcwidth(%x) = -1 for non-printable width-%d char\n", c, w);
|
|
||||||
if (wc >= 0 && wc != w)
|
|
||||||
printf(" wcwidth(%x) = %d != charwidth %d\n", c, wc, w);
|
|
||||||
}
|
|
||||||
printf(" ... (positive widths for %d chars unknown to wcwidth) ...\n",
|
|
||||||
updates);
|
|
||||||
printf("Character-width tests SUCCEEDED.\n");
|
|
||||||
|
|
||||||
return 0;
|
|
||||||
}
|
|
|
@ -1,49 +0,0 @@
|
||||||
/* simple test program to print out the utf8proc properties for a codepoint */
|
|
||||||
|
|
||||||
#include "tests.h"
|
|
||||||
|
|
||||||
int main(int argc, char **argv)
|
|
||||||
{
|
|
||||||
int i;
|
|
||||||
|
|
||||||
for (i = 1; i < argc; ++i) {
|
|
||||||
unsigned int c;
|
|
||||||
if (!strcmp(argv[i], "-V")) {
|
|
||||||
printf("utf8proc version %s\n", utf8proc_version());
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
check(sscanf(argv[i],"%x",&c) == 1, "invalid hex input %s", argv[i]);
|
|
||||||
const utf8proc_property_t *p = utf8proc_get_property(c);
|
|
||||||
printf("U+%s:\n"
|
|
||||||
" category = %s\n"
|
|
||||||
" combining_class = %d\n"
|
|
||||||
" bidi_class = %d\n"
|
|
||||||
" decomp_type = %d\n"
|
|
||||||
" uppercase_mapping = %x\n"
|
|
||||||
" lowercase_mapping = %x\n"
|
|
||||||
" titlecase_mapping = %x\n"
|
|
||||||
" comb_index = %d\n"
|
|
||||||
" bidi_mirrored = %d\n"
|
|
||||||
" comp_exclusion = %d\n"
|
|
||||||
" ignorable = %d\n"
|
|
||||||
" control_boundary = %d\n"
|
|
||||||
" boundclass = %d\n"
|
|
||||||
" charwidth = %d\n",
|
|
||||||
argv[i],
|
|
||||||
utf8proc_category_string(c),
|
|
||||||
p->combining_class,
|
|
||||||
p->bidi_class,
|
|
||||||
p->decomp_type,
|
|
||||||
utf8proc_toupper(c),
|
|
||||||
utf8proc_tolower(c),
|
|
||||||
utf8proc_totitle(c),
|
|
||||||
p->comb_index,
|
|
||||||
p->bidi_mirrored,
|
|
||||||
p->comp_exclusion,
|
|
||||||
p->ignorable,
|
|
||||||
p->control_boundary,
|
|
||||||
p->boundclass,
|
|
||||||
utf8proc_charwidth(c));
|
|
||||||
}
|
|
||||||
return 0;
|
|
||||||
}
|
|
|
@ -1,23 +0,0 @@
|
||||||
diff --git a/lib/utf8proc-2a2f97e1/utf8proc.h b/lib/utf8proc-2a2f97e1/utf8proc.h
|
|
||||||
index 64155a1..2fca528 100644
|
|
||||||
--- a/lib/utf8proc-2a2f97e1/utf8proc.h
|
|
||||||
+++ b/lib/utf8proc-2a2f97e1/utf8proc.h
|
|
||||||
@@ -120,17 +120,7 @@ typedef bool utf8proc_bool;
|
|
||||||
#endif
|
|
||||||
#include <limits.h>
|
|
||||||
|
|
||||||
-#ifdef _WIN32
|
|
||||||
-# ifdef UTF8PROC_EXPORTS
|
|
||||||
-# define UTF8PROC_DLLEXPORT __declspec(dllexport)
|
|
||||||
-# else
|
|
||||||
-# define UTF8PROC_DLLEXPORT __declspec(dllimport)
|
|
||||||
-# endif
|
|
||||||
-#elif __GNUC__ >= 4
|
|
||||||
-# define UTF8PROC_DLLEXPORT __attribute__ ((visibility("default")))
|
|
||||||
-#else
|
|
||||||
-# define UTF8PROC_DLLEXPORT
|
|
||||||
-#endif
|
|
||||||
+#define UTF8PROC_DLLEXPORT
|
|
||||||
|
|
||||||
#ifdef __cplusplus
|
|
||||||
extern "C" {
|
|
Loading…
Reference in New Issue