Merge pull request #49 from DanielSWolf/feature/#38-upgrade-utf8proc

Upgrade to utf8proc 2.2.0
2019-01-02 16:21:00 +01:00 · 2019-01-02 16:21:00 +01:00 · 238687e33b
parent f4418ff25a cdc9c5b1c4
commit 238687e33b
41 changed files with 10731 additions and 10063 deletions
--- a/rhubarb/CMakeLists.txt
+++ b/rhubarb/CMakeLists.txt
@ -225,11 +225,12 @@ set_target_properties(utfcpp PROPERTIES FOLDER lib)

 # ... utf8proc
 add_library(utf8proc
-	lib/utf8proc-2a2f97e1/utf8proc.c
-	lib/utf8proc-2a2f97e1/utf8proc.h
+	lib/utf8proc-2.2.0/utf8proc.c
+	lib/utf8proc-2.2.0/utf8proc.h
 )
-target_include_directories(utf8proc SYSTEM PUBLIC "lib/utf8proc-2a2f97e1")
+target_include_directories(utf8proc SYSTEM PUBLIC "lib/utf8proc-2.2.0")
 target_compile_options(utf8proc PRIVATE ${disableWarningsFlags})
+target_compile_definitions(utf8proc PUBLIC UTF8PROC_STATIC=1) # Compile as static lib
 set_target_properties(utf8proc PROPERTIES FOLDER lib)

 # ... Ogg
--- a/rhubarb/lib/utf8proc-2a2f97e1/.gitignore
+++ b/rhubarb/lib/utf8proc-2a2f97e1/.gitignore
@ -21,6 +21,7 @@ test/normtest
 test/graphemetest
 test/printproperty
 test/charwidth
+test/misc
 test/valid
 test/iterate
 test/case
--- a/rhubarb/lib/utf8proc-2a2f97e1/.travis.yml
+++ b/rhubarb/lib/utf8proc-2a2f97e1/.travis.yml
--- a/rhubarb/lib/utf8proc-2.2.0/CMakeLists.txt
+++ b/rhubarb/lib/utf8proc-2.2.0/CMakeLists.txt
@ -0,0 +1,51 @@
+cmake_minimum_required (VERSION 2.8.12)
+
+# utils.cmake is expected to provide disallow_intree_builds(), called below.
+include (utils.cmake)
+disallow_intree_builds()
+
+project (utf8proc C)
+
+# ABI (shared-object) version. It is tracked separately from the API
+# version number defined in utf8proc.h, so the two may differ.
+# Keep the Makefile and MANIFEST in sync when changing these!
+set(SO_MAJOR 2)
+set(SO_MINOR 2)
+set(SO_PATCH 0)
+
+# Optimisation, C99 and warning flags for every compiler except MSVC.
+if (NOT MSVC)
+  set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -O2 -std=c99 -pedantic -Wall")
+endif ()
+
+add_library (utf8proc utf8proc.c utf8proc.h)
+
+if (NOT BUILD_SHARED_LIBS)
+  # Static build: PUBLIC so that consumers of the target are compiled
+  # with UTF8PROC_STATIC as well.
+  target_compile_definitions(utf8proc PUBLIC "UTF8PROC_STATIC")
+  if (MSVC)
+    set_target_properties(utf8proc PROPERTIES OUTPUT_NAME "utf8proc_static")
+  endif ()
+endif ()
+
+# PRIVATE: only set while compiling the library itself, never for consumers.
+target_compile_definitions(utf8proc PRIVATE "UTF8PROC_EXPORTS")
+
+set_property(TARGET utf8proc PROPERTY POSITION_INDEPENDENT_CODE ON)
+set_property(TARGET utf8proc PROPERTY VERSION "${SO_MAJOR}.${SO_MINOR}.${SO_PATCH}")
+set_property(TARGET utf8proc PROPERTY SOVERSION ${SO_MAJOR})
+
+# Install the library artifacts and the single public header.
+install(TARGETS utf8proc
+  RUNTIME DESTINATION bin
+  LIBRARY DESTINATION lib
+  ARCHIVE DESTINATION lib)
+install(FILES "${PROJECT_SOURCE_DIR}/utf8proc.h" DESTINATION include)
--- a/rhubarb/lib/utf8proc-2a2f97e1/Doxyfile
+++ b/rhubarb/lib/utf8proc-2a2f97e1/Doxyfile
--- a/rhubarb/lib/utf8proc-2a2f97e1/LICENSE.md
+++ b/rhubarb/lib/utf8proc-2a2f97e1/LICENSE.md
@ -7,7 +7,7 @@ whose copyright and license statements are reproduced below, all new
 work on the utf8proc library is licensed under the [MIT "expat"
 license](http://opensource.org/licenses/MIT):

-*Copyright &copy; 2014-2015 by Steven G. Johnson, Jiahao Chen, Tony Kelman, Jonas Fonseca, and other contributors listed in the git history.*
+*Copyright &copy; 2014-2018 by Steven G. Johnson, Jiahao Chen, Tony Kelman, Jonas Fonseca, and other contributors listed in the git history.*

 Permission is hereby granted, free of charge, to any person obtaining a
 copy of this software and associated documentation files (the "Software"),
--- a/rhubarb/lib/utf8proc-2.2.0/MANIFEST
+++ b/rhubarb/lib/utf8proc-2.2.0/MANIFEST
@ -0,0 +1,7 @@
+include/
+include/utf8proc.h
+lib/
+lib/libutf8proc.a
+lib/libutf8proc.so -> libutf8proc.so.2.2.0
+lib/libutf8proc.so.2 -> libutf8proc.so.2.2.0
+lib/libutf8proc.so.2.2.0
--- a/rhubarb/lib/utf8proc-2a2f97e1/Makefile
+++ b/rhubarb/lib/utf8proc-2a2f97e1/Makefile
@ -11,7 +11,7 @@ CFLAGS ?= -O2
 PICFLAG = -fPIC
 C99FLAG = -std=c99
 WCFLAGS = -Wall -pedantic
-UCFLAGS = $(CFLAGS) $(PICFLAG) $(C99FLAG) $(WCFLAGS) -DUTF8PROC_EXPORTS
+UCFLAGS = $(CFLAGS) $(PICFLAG) $(C99FLAG) $(WCFLAGS) -DUTF8PROC_EXPORTS $(UTF8PROC_DEFINES)

 # shared-library version MAJOR.MINOR.PATCH ... this may be *different*
 # from the utf8proc version number because it indicates ABI compatibility,
@ -20,7 +20,7 @@ UCFLAGS = $(CFLAGS) $(PICFLAG) $(C99FLAG) $(WCFLAGS) -DUTF8PROC_EXPORTS
 # The API version number is defined in utf8proc.h.
 # Be sure to also update these ABI versions in MANIFEST and CMakeLists.txt!
 MAJOR=2
-MINOR=1
+MINOR=2
 PATCH=0

 OS := $(shell uname)
@ -48,7 +48,7 @@ clean:
 ifneq ($(OS),Darwin)
 	rm -f libutf8proc.so.$(MAJOR)
 endif
-	rm -f test/tests.o test/normtest test/graphemetest test/printproperty test/charwidth test/valid test/iterate test/case test/custom
+	rm -f test/tests.o test/normtest test/graphemetest test/printproperty test/charwidth test/valid test/iterate test/case test/custom test/misc
 	rm -rf MANIFEST.new tmp
 	$(MAKE) -C bench clean
 	$(MAKE) -C data clean
@ -81,7 +81,7 @@ libutf8proc.so: libutf8proc.so.$(MAJOR).$(MINOR).$(PATCH)
 	ln -f -s libutf8proc.so.$(MAJOR).$(MINOR).$(PATCH) $@.$(MAJOR)

 libutf8proc.$(MAJOR).dylib: utf8proc.o
-	$(CC) -dynamiclib -o $@ $^ -install_name $(libdir)/$@ -Wl,-compatibility_version -Wl,$(MAJOR) -Wl,-current_version -Wl,$(MAJOR).$(MINOR).$(PATCH)
+	$(CC) $(LDFLAGS) -dynamiclib -o $@ $^ -install_name $(libdir)/$@ -Wl,-compatibility_version -Wl,$(MAJOR) -Wl,-current_version -Wl,$(MAJOR).$(MINOR).$(PATCH)

 libutf8proc.dylib: libutf8proc.$(MAJOR).dylib
 	ln -f -s libutf8proc.$(MAJOR).dylib $@
@ -138,11 +138,15 @@ test/case: test/case.c test/tests.o utf8proc.o utf8proc.h test/tests.h
 test/custom: test/custom.c test/tests.o utf8proc.o utf8proc.h test/tests.h
 	$(CC) $(UCFLAGS) test/custom.c test/tests.o utf8proc.o -o $@

-check: test/normtest data/NormalizationTest.txt test/graphemetest data/GraphemeBreakTest.txt test/printproperty test/case test/custom test/charwidth test/valid test/iterate bench/bench.c bench/util.c bench/util.h utf8proc.o
+test/misc: test/misc.c test/tests.o utf8proc.o utf8proc.h test/tests.h
+	$(CC) $(UCFLAGS) test/misc.c test/tests.o utf8proc.o -o $@
+
+check: test/normtest data/NormalizationTest.txt test/graphemetest data/GraphemeBreakTest.txt test/printproperty test/case test/custom test/charwidth test/misc test/valid test/iterate bench/bench.c bench/util.c bench/util.h utf8proc.o
 	$(MAKE) -C bench
 	test/normtest data/NormalizationTest.txt
 	test/graphemetest data/GraphemeBreakTest.txt
 	test/charwidth
+	test/misc
 	test/valid
 	test/iterate
 	test/case
--- a/rhubarb/lib/utf8proc-2a2f97e1/NEWS.md
+++ b/rhubarb/lib/utf8proc-2a2f97e1/NEWS.md
@ -1,5 +1,35 @@
 # utf8proc release history #

+## Version 2.2 ##
+
+2018-07-24
+
+- Unicode 11 support ([#132] and [#140]).
+
+- `utf8proc_NFKC_Casefold` convenience function for `NFKC_Casefold`
+  normalization ([#133]).
+
+- `UTF8PROC_STRIPNA` option to strip unassigned codepoints ([#133]).
+
+- Support building static libraries on Windows (callers need to
+  `#define UTF8PROC_STATIC`) ([#123]).
+
+- `cmake` fix to avoid defining `UTF8PROC_EXPORTS` globally ([#121]).
+
+- `toupper` of ß (U+00df) now yields ẞ (U+1E9E) ([#134]), similar to musl;
+  case-folding still yields the standard "ss" mapping.
+
+- `utf8proc_charwidth` now returns `1` for U+00AD (soft hyphen) and
+  for unassigned/PUA codepoints ([#135]).
+
+## Version 2.1.1 ##
+
+2018-04-27
+
+- Fixed composition bug ([#128]).
+
+- Minor build fixes ([#94], [#99], [#113], [#125]).
+
 ## Version 2.1 ##

 2016-12-26:
@ -297,7 +327,19 @@ Release of version 1.0.1
 [#78]: https://github.com/JuliaLang/utf8proc/issues/78
 [#79]: https://github.com/JuliaLang/utf8proc/issues/79
 [#80]: https://github.com/JuliaLang/utf8proc/issues/80
-[#84]: https://github.com/JuliaLang/utf8proc/pull/84
-[#88]: https://github.com/JuliaLang/utf8proc/pull/88
-[#89]: https://github.com/JuliaLang/utf8proc/pull/89
+[#84]: https://github.com/JuliaLang/utf8proc/issues/84
+[#88]: https://github.com/JuliaLang/utf8proc/issues/88
+[#89]: https://github.com/JuliaLang/utf8proc/issues/89
 [#90]: https://github.com/JuliaLang/utf8proc/issues/90
+[#94]: https://github.com/JuliaLang/utf8proc/issues/94
+[#99]: https://github.com/JuliaLang/utf8proc/issues/99
+[#113]: https://github.com/JuliaLang/utf8proc/issues/113
+[#121]: https://github.com/JuliaLang/utf8proc/issues/121
+[#123]: https://github.com/JuliaLang/utf8proc/issues/123
+[#125]: https://github.com/JuliaLang/utf8proc/issues/125
+[#128]: https://github.com/JuliaLang/utf8proc/issues/128
+[#132]: https://github.com/JuliaLang/utf8proc/issues/132
+[#133]: https://github.com/JuliaLang/utf8proc/issues/133
+[#134]: https://github.com/JuliaLang/utf8proc/issues/134
+[#135]: https://github.com/JuliaLang/utf8proc/issues/135
+[#140]: https://github.com/JuliaLang/utf8proc/issues/140
--- a/rhubarb/lib/utf8proc-2a2f97e1/README.md
+++ b/rhubarb/lib/utf8proc-2a2f97e1/README.md
@ -1,7 +1,6 @@
 # utf8proc
-[![Travis CI Status](https://travis-ci.org/JuliaLang/utf8proc.png)](https://travis-ci.org/JuliaLang/utf8proc)
-[![AppVeyor Status](https://ci.appveyor.com/api/projects/status/aou20lfkyhj8xbwq/branch/master?svg=true)](https://ci.appveyor.com/project/tkelman/utf8proc/branch/master)
-
+[![Travis CI Status](https://travis-ci.org/JuliaStrings/utf8proc.png)](https://travis-ci.org/JuliaStrings/utf8proc)
+[![AppVeyor status](https://ci.appveyor.com/api/projects/status/ivaa0v6ikxrmm5r6?svg=true)](https://ci.appveyor.com/project/StevenGJohnson/utf8proc)

 [utf8proc](http://julialang.org/utf8proc/) is a small, clean C
 library that provides Unicode normalization, case-folding, and other
@ -40,7 +39,7 @@ The C library is found in this directory after successful compilation
 and is named `libutf8proc.a` (for the static library) and
 `libutf8proc.so` (for the dynamic library).

-The Unicode version supported is 9.0.0.
+The Unicode version supported is 11.0.0.

 For Unicode normalizations, the following options are used:

--- a/rhubarb/lib/utf8proc-2a2f97e1/appveyor.yml
+++ b/rhubarb/lib/utf8proc-2a2f97e1/appveyor.yml
--- a/rhubarb/lib/utf8proc-2a2f97e1/bench/Makefile
+++ b/rhubarb/lib/utf8proc-2a2f97e1/bench/Makefile
--- a/rhubarb/lib/utf8proc-2a2f97e1/bench/bench.c
+++ b/rhubarb/lib/utf8proc-2a2f97e1/bench/bench.c
--- a/rhubarb/lib/utf8proc-2a2f97e1/bench/icu.c
+++ b/rhubarb/lib/utf8proc-2a2f97e1/bench/icu.c
--- a/rhubarb/lib/utf8proc-2a2f97e1/bench/unistring.c
+++ b/rhubarb/lib/utf8proc-2a2f97e1/bench/unistring.c
--- a/rhubarb/lib/utf8proc-2a2f97e1/bench/util.c
+++ b/rhubarb/lib/utf8proc-2a2f97e1/bench/util.c
--- a/rhubarb/lib/utf8proc-2a2f97e1/bench/util.h
+++ b/rhubarb/lib/utf8proc-2a2f97e1/bench/util.h
--- a/rhubarb/lib/utf8proc-2a2f97e1/data/Makefile
+++ b/rhubarb/lib/utf8proc-2a2f97e1/data/Makefile
@ -16,11 +16,11 @@ CURLFLAGS = --retry 5 --location

 .DELETE_ON_ERROR:

-utf8proc_data.c.new: data_generator.rb UnicodeData.txt GraphemeBreakProperty.txt DerivedCoreProperties.txt CompositionExclusions.txt CaseFolding.txt CharWidths.txt
+utf8proc_data.c.new: data_generator.rb UnicodeData.txt GraphemeBreakProperty.txt DerivedCoreProperties.txt CompositionExclusions.txt CaseFolding.txt CharWidths.txt emoji-data.txt
 	$(RUBY) data_generator.rb < UnicodeData.txt > $@

 # GNU Unifont version for font metric calculations:
-UNIFONT_VERSION=9.0.04
+UNIFONT_VERSION=11.0.01

 unifont.ttf:
 	$(CURL) $(CURLFLAGS) -o $@ $(URLCACHE)https://mirrors.kernel.org/gnu/unifont/unifont-$(UNIFONT_VERSION)/unifont-$(UNIFONT_VERSION).ttf
@ -35,7 +35,7 @@ CharWidths.txt: charwidths.jl unifont.sfd unifont_upper.sfd EastAsianWidth.txt
 	$(JULIA) charwidths.jl > $@

 # Unicode data version
-UNICODE_VERSION=9.0.0
+UNICODE_VERSION=11.0.0

 UnicodeData.txt:
 	$(CURL) $(CURLFLAGS) -o $@ -O http://www.unicode.org/Public/$(UNICODE_VERSION)/ucd/UnicodeData.txt
@ -61,6 +61,9 @@ NormalizationTest.txt:
 GraphemeBreakTest.txt:
 	$(CURL) $(CURLFLAGS) $(URLCACHE)http://www.unicode.org/Public/$(UNICODE_VERSION)/ucd/auxiliary/GraphemeBreakTest.txt | $(PERL) -pe 's,÷,/,g;s,×,+,g' > $@

+emoji-data.txt:
+	$(CURL) $(CURLFLAGS) -o $@ -O $(URLCACHE)http://unicode.org/Public/emoji/`echo $(UNICODE_VERSION) | cut -d. -f1-2`/emoji-data.txt
+
 clean:
-	rm -f UnicodeData.txt EastAsianWidth.txt GraphemeBreakProperty.txt DerivedCoreProperties.txt CompositionExclusions.txt CaseFolding.txt NormalizationTest.txt GraphemeBreakTest.txt CharWidths.txt unifont*.ttf unifont*.sfd
+	rm -f UnicodeData.txt EastAsianWidth.txt GraphemeBreakProperty.txt DerivedCoreProperties.txt CompositionExclusions.txt CaseFolding.txt NormalizationTest.txt GraphemeBreakTest.txt CharWidths.txt unifont*.ttf unifont*.sfd emoji-data.txt
 	rm -f utf8proc_data.c.new
--- a/rhubarb/lib/utf8proc-2a2f97e1/data/charwidths.jl
+++ b/rhubarb/lib/utf8proc-2a2f97e1/data/charwidths.jl
@ -7,17 +7,6 @@
 # Requires Julia (obviously) and FontForge.

 #############################################################################
-# Julia 0.3/0.4 compatibility (taken from Compat package)
-if VERSION < v"0.4.0-dev+1387"
-    typealias AbstractString String
-end
-if VERSION < v"0.4.0-dev+1419"
-    const UInt32 = Uint32
-end
-if VERSION < v"0.4.0-dev+3874"
-    Base.parse{T<:Integer}(::Type{T}, s::AbstractString) = parseint(T, s)
-end
-
 CharWidths = Dict{Int,Int}()

 #############################################################################
@ -31,12 +20,12 @@ import Base.UTF8proc

 #############################################################################
 # Use a default width of 1 for all character categories that are
-# letter/symbol/number-like.  This can be overriden by Unifont or UAX 11
+# letter/symbol/number-like, as well as for unassigned/private-use chars.
+# This can be overridden by Unifont or UAX 11
 # below, but provides a useful nonzero fallback for new codepoints when
 # a new Unicode version has been released but Unifont hasn't been updated yet.

 zerowidth = Set{Int}() # categories that may contain zero-width chars
-push!(zerowidth, UTF8proc.UTF8PROC_CATEGORY_CN)
 push!(zerowidth, UTF8proc.UTF8PROC_CATEGORY_MN)
 push!(zerowidth, UTF8proc.UTF8PROC_CATEGORY_MC)
 push!(zerowidth, UTF8proc.UTF8PROC_CATEGORY_ME)
@ -47,7 +36,6 @@ push!(zerowidth, UTF8proc.UTF8PROC_CATEGORY_ZP)
 push!(zerowidth, UTF8proc.UTF8PROC_CATEGORY_CC)
 push!(zerowidth, UTF8proc.UTF8PROC_CATEGORY_CF)
 push!(zerowidth, UTF8proc.UTF8PROC_CATEGORY_CS)
-push!(zerowidth, UTF8proc.UTF8PROC_CATEGORY_CO)
 for c in 0x0000:0x110000
    if catcode(c) ∉ zerowidth
        CharWidths[c] = 1
@ -97,7 +85,7 @@ CharWidths=parsesfd("unifont_upper.sfd", CharWidths)

 for line in readlines(open("EastAsianWidth.txt"))
    #Strip comments
-    line[1] == '#' && continue
+    (isempty(line) || line[1] == '#') && continue
    precomment = split(line, '#')[1]
    #Parse code point range and width code
    tokens = split(precomment, ';')
@ -113,7 +101,7 @@ for line in readlines(open("EastAsianWidth.txt"))
    for c in charstart:charend
        if width=="W" || width=="F" # wide or full
            CharWidths[c]=2
-        elseif width=="Na"|| width=="H" # narrow or half
+        elseif width=="Na"|| width=="H"
            CharWidths[c]=1
        end
    end
@ -126,9 +114,11 @@ end
 for c in keys(CharWidths)
    cat = catcode(c)

-    # make sure format control character (category Cf) have width 0,
-    # except for the Arabic characters 0x06xx (see unicode std 6.2, sec. 8.2)
-    if cat==UTF8proc.UTF8PROC_CATEGORY_CF && c ∉ [0x0601,0x0602,0x0603,0x06dd]
+    # make sure format control characters (category Cf) have width 0
+    # (some of these, like U+0601, can have a width in some cases
+    #  but normally act like prepended combining marks.  U+fff9 etc
+    #  are also odd, but have zero width in typical terminal contexts)
+    if cat==UTF8proc.UTF8PROC_CATEGORY_CF
        CharWidths[c]=0
    end

@ -139,11 +129,12 @@ for c in keys(CharWidths)
        CharWidths[c]=0
    end

-    # We also assign width of zero to unassigned and private-use
+    # We also assign width of one to unassigned and private-use
    # codepoints (Unifont includes ConScript Unicode Registry PUA fonts,
-    # but since these are nonstandard it seems questionable to recognize them).
+    # but since these are nonstandard it seems questionable to use Unifont metrics;
+    # if they are printed as the replacement character U+FFFD they will have width 1).
    if cat==UTF8proc.UTF8PROC_CATEGORY_CO || cat==UTF8proc.UTF8PROC_CATEGORY_CN
-        CharWidths[c]=0
+        CharWidths[c]=1
    end

    # for some reason, Unifont has width-2 glyphs for ASCII control chars
@ -152,6 +143,9 @@ for c in keys(CharWidths)
    end
 end

+#Soft hyphen is typically printed as a hyphen (-) in terminals.
+CharWidths[0x00ad]=1
+
 #By definition, should have zero width (on the same line)
 #0x002028 ' ' category: Zl name: LINE SEPARATOR/
 #0x002029 ' ' category: Zp name: PARAGRAPH SEPARATOR/
@ -169,8 +163,8 @@ CharWidths[0x2001]=2
 CharWidths[0x2003]=2

 #############################################################################
-# Output (to a file or pipe) for processing by data_generator.rb
-# ... don't bother to output zero widths since that will be the default.
+# Output (to a file or pipe) for processing by data_generator.rb,
+# encoded as a sequence of intervals.

 firstc = 0x000000
 lastv = 0
--- a/rhubarb/lib/utf8proc-2a2f97e1/data/data_generator.rb
+++ b/rhubarb/lib/utf8proc-2a2f97e1/data/data_generator.rb
@ -6,6 +6,8 @@
 #  production use.


+#  Copyright (c) 2018 Steven G. Johnson, Tony Kelman, Keno Fischer,
+#                Benito van der Zander, Michaël Meyer, and other contributors.
 #  Copyright (c) 2009 Public Software Group e. V., Berlin, Germany
 #
 #  Permission is hereby granted, free of charge, to any person obtaining a
@ -85,6 +87,19 @@ $grapheme_boundclass_list.each_line do |entry|
  end
 end

+$emoji_data_list = File.read("emoji-data.txt")
+$emoji_data_list.each_line do |entry|
+  if entry =~ /^([0-9A-F]+)\.\.([0-9A-F]+)\s*;\s*Extended_Pictographic\W/
+    $1.hex.upto($2.hex) { |e2| $grapheme_boundclass[e2] = "UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC" }
+  elsif entry =~ /^([0-9A-F]+)\s*;\s*Extended_Pictographic\W/
+    $grapheme_boundclass[$1.hex] = "UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC"
+  elsif entry =~ /^([0-9A-F]+)\.\.([0-9A-F]+)\s*;\s*Emoji_Modifier\W/
+    $1.hex.upto($2.hex) { |e2| $grapheme_boundclass[e2] = "UTF8PROC_BOUNDCLASS_EXTEND" }
+  elsif entry =~ /^([0-9A-F]+)\s*;\s*Emoji_Modifier\W/
+    $grapheme_boundclass[$1.hex] = "UTF8PROC_BOUNDCLASS_EXTEND"
+  end
+end
+
 $charwidth_list = File.read("CharWidths.txt")
 $charwidth = Hash.new(0)
 $charwidth_list.each_line do |entry|
@ -104,7 +119,7 @@ $excl_version = $excl_version.chomp.split("\n").collect { |e| e.hex }
 $case_folding_string = File.open("CaseFolding.txt", :encoding => 'utf-8').read
 $case_folding = {}
 $case_folding_string.chomp.split("\n").each do |line|
-  next unless line =~ /([0-9A-F]+); [CFS]; ([0-9A-F ]+);/i
+  next unless line =~ /([0-9A-F]+); [CF]; ([0-9A-F ]+);/i
  $case_folding[$1.hex] = $2.split(" ").collect { |e| e.hex }
 end

@ -137,13 +152,13 @@ def cpary2utf16encoded(array)
 end
 def cpary2c(array)
  return "UINT16_MAX" if array.nil? || array.length == 0
-  lencode = array.length - 1 #no sequence has len 0, so we encode len 1 as 0, len 2 as 1, ... 
+  lencode = array.length - 1 #no sequence has len 0, so we encode len 1 as 0, len 2 as 1, ...
  array = cpary2utf16encoded(array)
  if lencode >= 7 #we have only 3 bits for the length (which is already cutting it close. might need to change it to 2 bits in future Unicode versions)
-    array = [lencode] + array 
+    array = [lencode] + array
    lencode = 7
-  end  
-  idx = pushary(array) 
+  end
+  idx = pushary(array)
  raise "Array index out of bound" if idx > 0x1FFF
  return "#{idx | (lencode << 13)}"
 end
@ -188,9 +203,10 @@ class UnicodeChar
    @decomp_mapping    = ($8=='') ? nil :
                         $8.split.collect { |element| element.hex }
    @bidi_mirrored     = ($13=='Y') ? true : false
-    @uppercase_mapping = ($16=='') ? nil : $16.hex
+    # issue #130: use nonstandard uppercase ß -> ẞ
+    @uppercase_mapping = ($16=='') ? (code==0x00df ? 0x1e9e : nil) : $16.hex
    @lowercase_mapping = ($17=='') ? nil : $17.hex
-    @titlecase_mapping = ($18=='') ? nil : $18.hex
+    @titlecase_mapping = ($18=='') ? (code==0x00df ? 0x1e9e : nil) : $18.hex
  end
  def case_folding
    $case_folding[code]
@ -260,17 +276,17 @@ chars.each do |char|
    end
    unless comb2nd_indicies[dm1]
      comb2nd_indicies_sorted_keys << dm1
-      comb2nd_indicies[dm1] = comb2nd_indicies.keys.length 
+      comb2nd_indicies[dm1] = comb2nd_indicies.keys.length
    end
    comb_array[comb1st_indicies[dm0]] ||= []
    raise "Duplicate canonical mapping: #{char.code} #{dm0} #{dm1}" if comb_array[comb1st_indicies[dm0]][comb2nd_indicies[dm1]]
    comb_array[comb1st_indicies[dm0]][comb2nd_indicies[dm1]] = char.code
-    
+
    comb2nd_indicies_nonbasic[dm1] = true if char.code > 0xFFFF
  end
  char.c_decomp_mapping = cpary2c(char.decomp_mapping)
  char.c_case_folding = cpary2c(char.case_folding)
-end 
+end

 comb_indicies = {}
 cumoffset = 0
@ -281,7 +297,7 @@ comb1st_indicies.each do |dm0, index|
  last = nil
  offset = 0
  comb2nd_indicies_sorted_keys.each_with_index do |dm1, b|
-    if comb_array[index][b] 
+    if comb_array[index][b]
      first = offset unless first
      last = offset
      last += 1 if comb2nd_indicies_nonbasic[dm1]
@ -377,7 +393,7 @@ end
 $stdout << "};\n\n"

 $stdout << "static const utf8proc_property_t utf8proc_properties[] = {\n"
-$stdout << "  {0, 0, 0, 0, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX,  false,false,false,false, 0, 0, UTF8PROC_BOUNDCLASS_OTHER},\n"
+$stdout << "  {0, 0, 0, 0, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX,  false,false,false,false, 1, 0, UTF8PROC_BOUNDCLASS_OTHER},\n"
 properties.each { |line|
  $stdout << line
 }
@ -391,7 +407,7 @@ comb1st_indicies.keys.each_index do |a|
  offset = 0
  $stdout << comb1st_indicies_firstoffsets[a] << ", " << comb1st_indicies_lastoffsets[a] << ", "
  comb2nd_indicies_sorted_keys.each_with_index do |dm1, b|
-    break if offset > comb1st_indicies_lastoffsets[a] 
+    break if offset > comb1st_indicies_lastoffsets[a]
    if offset >= comb1st_indicies_firstoffsets[a]
      i += 1
      if i == 8
@ -403,9 +419,8 @@ comb1st_indicies.keys.each_index do |a|
      $stdout << (v & 0xFFFF) << ", "
    end
    offset += 1
-    offset += 1 if comb2nd_indicies_nonbasic[dm1]    
+    offset += 1 if comb2nd_indicies_nonbasic[dm1]
  end
  $stdout  << "\n"
 end
 $stdout << "};\n\n"
-
--- a/rhubarb/lib/utf8proc-2a2f97e1/lump.md
+++ b/rhubarb/lib/utf8proc-2a2f97e1/lump.md
--- a/rhubarb/lib/utf8proc-2a2f97e1/test/case.c
+++ b/rhubarb/lib/utf8proc-2a2f97e1/test/case.c
@ -13,13 +13,22 @@ int main(int argc, char **argv)
     for (c = 0; c <= 0x110000; ++c) {
          utf8proc_int32_t l = utf8proc_tolower(c);
          utf8proc_int32_t u = utf8proc_toupper(c);
+          utf8proc_int32_t t = utf8proc_totitle(c);

          check(l == c || utf8proc_codepoint_valid(l), "invalid tolower");
          check(u == c || utf8proc_codepoint_valid(u), "invalid toupper");
+          check(t == c || utf8proc_codepoint_valid(t), "invalid totitle");
+
+          /* "lowercase == uppercase" and "lowercase == titlecase" are
+             expected to agree for every valid codepoint, except for the
+             Georgian ranges exempted below. */
+          if (utf8proc_codepoint_valid(c) && (l == u) != (l == t) &&
+              /* Unicode 11: Georgian Mkhedruli chars have uppercase but no titlecase.
+                 Both ranges must be parenthesized as full comparisons; the
+                 previous "c >= (0x10fd && c <= 0x10ff)" compared c against
+                 0 or 1 and so exempted nearly every codepoint with l != u. */
+              !(((c >= 0x10d0 && c <= 0x10fa) || (c >= 0x10fd && c <= 0x10ff)) && l != u)) {
+               fprintf(stderr, "unexpected titlecase %x for lowercase %x / uppercase %x\n", t, l, c);
+               ++error;
+          }

          if (sizeof(wint_t) > 2 || c < (1<<16)) {
               wint_t l0 = towlower(c), u0 = towupper(c);
-               
+
               /* OS unicode tables may be out of date.  But if they
                  do have a lower/uppercase mapping, hopefully it
                  is correct? */
@ -44,6 +53,20 @@ int main(int argc, char **argv)
          }
     }
     check(!error, "utf8proc case conversion FAILED %d tests.", error);
+
+     /* issue #130 */
+     check(utf8proc_toupper(0x00df) == 0x1e9e &&
+           utf8proc_totitle(0x00df) == 0x1e9e &&
+           utf8proc_tolower(0x00df) == 0x00df &&
+           utf8proc_tolower(0x1e9e) == 0x00df &&
+           utf8proc_toupper(0x1e9e) == 0x1e9e,
+           "incorrect 0x00df/0x1e9e case conversions");
+     utf8proc_uint8_t str_00df[] = {0xc3, 0x9f, 0x00};
+     utf8proc_uint8_t str_1e9e[] = {0xe1, 0xba, 0x9e, 0x00};
+     check(!strcmp((char*)utf8proc_NFKC_Casefold(str_00df), "ss") &&
+           !strcmp((char*)utf8proc_NFKC_Casefold(str_1e9e), "ss"),
+           "incorrect 0x00df/0x1e9e casefold normalization");
+
     printf("More up-to-date than OS unicode tables for %d tests.\n", better);
     printf("utf8proc case conversion tests SUCCEEDED.\n");
     return 0;
--- a/rhubarb/lib/utf8proc-2.2.0/test/charwidth.c
+++ b/rhubarb/lib/utf8proc-2.2.0/test/charwidth.c
@ -0,0 +1,77 @@
+#include "tests.h"
+#include <ctype.h>
+#include <wchar.h>
+
+/* Returns 1 if the category of c is Cn or Co (the codepoints the width
+   tests below treat as "unassigned"), and 0 otherwise. */
+static int my_unassigned(int c) {
+    switch (utf8proc_get_property(c)->category) {
+        case UTF8PROC_CATEGORY_CN:
+        case UTF8PROC_CATEGORY_CO:
+            return 1;
+        default:
+            return 0;
+    }
+}
+
+/* Test-local notion of a "printable" codepoint: any category from LU
+   through ZS, a few special-cased codepoints (U+00AD, U+0601..U+0603,
+   U+06DD), and everything in category Cn or Co.  Returns 1 or 0. */
+static int my_isprint(int c) {
+    const int cat = utf8proc_get_property(c)->category;
+
+    if (cat >= UTF8PROC_CATEGORY_LU && cat <= UTF8PROC_CATEGORY_ZS)
+        return 1;
+
+    switch (c) {
+        case 0x00ad:
+        case 0x0601:
+        case 0x0602:
+        case 0x0603:
+        case 0x06dd:
+            return 1;
+        default:
+            break;
+    }
+
+    return cat == UTF8PROC_CATEGORY_CN || cat == UTF8PROC_CATEGORY_CO;
+}
+
+/* Character-width test driver.  Pass 1 counts hard failures of
+   utf8proc_charwidth and reports them through check(); pass 2 only
+   prints differences from the system wcwidth for information. */
+int main(int argc, char **argv)
+{
+    int cp, failures = 0, newer = 0;
+
+    (void) argc; /* unused */
+    (void) argv; /* unused */
+
+    /* Pass 1: simple sanity tests of the character widths */
+    for (cp = 0; cp <= 0x110000; ++cp) {
+        const int cat = utf8proc_get_property(cp)->category;
+        const int width = utf8proc_charwidth(cp);
+        const int combining =
+            cat == UTF8PROC_CATEGORY_MN || cat == UTF8PROC_CATEGORY_ME;
+        const int symbol_like =
+            (cat >= UTF8PROC_CATEGORY_LU && cat <= UTF8PROC_CATEGORY_LO) ||
+            (cat >= UTF8PROC_CATEGORY_ND && cat <= UTF8PROC_CATEGORY_SC) ||
+            (cat >= UTF8PROC_CATEGORY_SO && cat <= UTF8PROC_CATEGORY_ZS);
+
+        if (combining && width > 0) {
+            fprintf(stderr, "nonzero width %d for combining char %x\n", width, cp);
+            ++failures;
+        }
+        if (symbol_like && width == 0) {
+            fprintf(stderr, "zero width for symbol-like char %x\n", cp);
+            ++failures;
+        }
+        /* isprint() is only consulted for the ASCII range */
+        if (cp <= 127) {
+            const int printable = isprint(cp) != 0;
+            /* printable ASCII must agree with wcwidth; the rest must be width 0 */
+            if (printable ? (wcwidth(cp) != width) : (width > 0)) {
+                fprintf(stderr, "wcwidth %d mismatch %d for %s ASCII %x\n",
+                wcwidth(cp), width,
+                printable ? "printable" : "non-printable", cp);
+                ++failures;
+            }
+        }
+        if (width > 0 && !my_isprint(cp)) {
+            fprintf(stderr, "non-printing %x had width %d\n", cp, width);
+            ++failures;
+        }
+        if (width != 1 && my_unassigned(cp)) {
+            fprintf(stderr, "unexpected width %d for unassigned char %x\n", width, cp);
+            ++failures;
+        }
+    }
+    check(!failures, "utf8proc_charwidth FAILED %d tests.", failures);
+
+    check(utf8proc_charwidth(0x00ad) == 1, "incorrect width for U+00AD (soft hyphen)");
+    check(utf8proc_charwidth(0xe000) == 1, "incorrect width for U+e000 (PUA)");
+
+    /* Pass 2: print some other information by comparing with system wcwidth */
+    printf("Mismatches with system wcwidth (not necessarily errors):\n");
+    for (cp = 0; cp <= 0x110000; ++cp) {
+        int width, sys;
+
+        /* a 16-bit wchar_t cannot represent codepoints beyond the BMP */
+        if (sizeof(wchar_t) == 2 && cp >= (1<<16)) continue;
+
+        width = utf8proc_charwidth(cp);
+        sys = wcwidth(cp);
+
+        if (sys == -1 && width > 0) {
+            if (!my_isprint(cp))
+                printf("  wcwidth(%x) = -1 for non-printable width-%d char\n", cp, width);
+            else if (!my_unassigned(cp))
+                ++newer; /* lots of these for out-of-date system unicode tables */
+        }
+        if (sys >= 0 && sys != width)
+            printf("  wcwidth(%x) = %d != charwidth %d\n", cp, sys, width);
+    }
+    printf("   ... (positive widths for %d chars unknown to wcwidth) ...\n", newer);
+    printf("Character-width tests SUCCEEDED.\n");
+
+    return 0;
+}
--- a/rhubarb/lib/utf8proc-2a2f97e1/test/custom.c
+++ b/rhubarb/lib/utf8proc-2a2f97e1/test/custom.c
@ -23,5 +23,6 @@ int main(void)
    check(strlen((char*) output) == 6, "incorrect output length");
    check(!memcmp(correct, output, 7), "incorrect output data");
    free(output);
+    printf("map_custom tests SUCCEEDED.\n");
    return 0;
 }
--- a/rhubarb/lib/utf8proc-2a2f97e1/test/graphemetest.c
+++ b/rhubarb/lib/utf8proc-2a2f97e1/test/graphemetest.c
--- a/rhubarb/lib/utf8proc-2a2f97e1/test/iterate.c
+++ b/rhubarb/lib/utf8proc-2a2f97e1/test/iterate.c
--- a/rhubarb/lib/utf8proc-2.2.0/test/misc.c
+++ b/rhubarb/lib/utf8proc-2.2.0/test/misc.c
@ -0,0 +1,46 @@
+/* Miscellaneous tests, e.g. regression tests */
+
+#include "tests.h"
+
+/* Regression test for issue #128: NFC and NFD normalization of
+   "r\u0307\u0323" must produce the expected byte sequences. */
+static void issue128(void) /* #128 */
+{
+    /* "r\u0307\u0323" */
+    utf8proc_uint8_t src[] = {0x72, 0xcc, 0x87, 0xcc, 0xa3, 0x00};
+    /* expected NFC: "\u1E5B\u0307" */
+    utf8proc_uint8_t want_nfc[] = {0xe1, 0xb9, 0x9b, 0xcc, 0x87, 0x00};
+    /* expected NFD: "r\u0323\u0307" */
+    utf8proc_uint8_t want_nfd[] = {0x72, 0xcc, 0xa3, 0xcc, 0x87, 0x00};
+    utf8proc_uint8_t *got;
+
+    got = utf8proc_NFC(src);
+    printf("NFC \"%s\" -> \"%s\" vs. \"%s\"\n", (char*)src, (char*)got, (char*)want_nfc);
+    /* sizeof counts the terminating NUL, strlen does not */
+    check(strlen((char*) got) == sizeof(want_nfc) - 1, "incorrect nfc length");
+    check(!memcmp(want_nfc, got, sizeof(want_nfc)), "incorrect nfc data");
+    free(got);
+
+    got = utf8proc_NFD(src);
+    printf("NFD \"%s\" -> \"%s\" vs. \"%s\"\n", (char*)src, (char*)got, (char*)want_nfd);
+    check(strlen((char*) got) == sizeof(want_nfd) - 1, "incorrect nfd length");
+    check(!memcmp(want_nfd, got, sizeof(want_nfd)), "incorrect nfd data");
+    free(got);
+}
+
+/* Regression test for issue #102: NFKC_Casefold of
+   "X\u2065E\u0300\u00ad\u1d2c", once through utf8proc_map with
+   UTF8PROC_STRIPNA added and once through utf8proc_NFKC_Casefold.
+   The expected outputs differ only in U+2065, which the STRIPNA
+   variant drops. */
+static void issue102(void) /* #102 */
+{
+    utf8proc_uint8_t input[] = {0x58, 0xe2, 0x81, 0xa5, 0x45, 0xcc, 0x80, 0xc2, 0xad, 0xe1, 0xb4, 0xac, 0x00}; /* "X\u2065E\u0300\u00ad\u1d2c" */
+    utf8proc_uint8_t stripna[] = {0x78, 0xc3, 0xa8, 0x61, 0x00}; /* "x\u00e8a" */
+    utf8proc_uint8_t correct[] = {0x78, 0xe2, 0x81, 0xa5, 0xc3, 0xa8, 0x61, 0x00}; /* "x\u2065\u00e8a" */
+    utf8proc_uint8_t *output;
+    utf8proc_map(input, 0, &output, UTF8PROC_NULLTERM | UTF8PROC_STABLE |
+        UTF8PROC_COMPOSE | UTF8PROC_COMPAT | UTF8PROC_CASEFOLD | UTF8PROC_IGNORE | UTF8PROC_STRIPNA);
+    printf("NFKC_Casefold \"%s\" -> \"%s\" vs. \"%s\"\n", (char*)input, (char*)output, (char*)stripna);
+    check(strlen((char*) output) == 4, "incorrect NFKC_Casefold+stripna length");
+    check(!memcmp(stripna, output, 5), "incorrect NFKC_Casefold+stripna data");
+    free(output);
+    output = utf8proc_NFKC_Casefold(input);
+    printf("NFKC_Casefold \"%s\" -> \"%s\" vs. \"%s\"\n", (char*)input, (char*)output, (char*)correct);
+    check(strlen((char*) output) == 7, "incorrect NFKC_Casefold length");
+    check(!memcmp(correct, output, 8), "incorrect NFKC_Casefold data");
+    /* the second result was previously never freed; release it like the
+       first one */
+    free(output);
+}
+
+/* Runs the regression tests in order and prints a success line at the end. */
+int main(void)
+{
+    issue128();
+    issue102();
+    puts("Misc tests SUCCEEDED.");
+    return EXIT_SUCCESS;
+}
--- a/rhubarb/lib/utf8proc-2a2f97e1/test/normtest.c
+++ b/rhubarb/lib/utf8proc-2a2f97e1/test/normtest.c
--- a/rhubarb/lib/utf8proc-2.2.0/test/printproperty.c
+++ b/rhubarb/lib/utf8proc-2.2.0/test/printproperty.c
@ -0,0 +1,60 @@
+/* simple test program to print out the utf8proc properties for a codepoint */
+
+#include "tests.h"
+
+int main(int argc, char **argv)
+{
+    int i;
+
+    for (i = 1; i < argc; ++i) { /* each argument is a hex codepoint, or -V to print the library version */
+        utf8proc_uint8_t cstr[16], *map;
+        unsigned int c;
+        if (!strcmp(argv[i], "-V")) {
+            printf("utf8proc version %s\n", utf8proc_version());
+            continue;
+        }
+        check(sscanf(argv[i],"%x",&c) == 1, "invalid hex input %s", argv[i]);
+        const utf8proc_property_t *p = utf8proc_get_property(c);
+
+        if (utf8proc_codepoint_valid(c))
+            cstr[utf8proc_encode_char(c, cstr)] = 0; /* NUL-terminate right after the encoded bytes */
+        else
+            strcpy((char*)cstr, "N/A"); /* cstr is uninitialized here, so it must be copied into, not appended to */
+        utf8proc_map(cstr, 0, &map, UTF8PROC_NULLTERM | UTF8PROC_CASEFOLD);
+
+        printf("U+%s: %s\n"
+            "  category = %s\n"
+            "  combining_class = %d\n"
+            "  bidi_class = %d\n"
+            "  decomp_type = %d\n"
+            "  uppercase_mapping = %x\n"
+            "  lowercase_mapping = %x\n"
+            "  titlecase_mapping = %x\n"
+            "  casefold = %s\n"
+            "  comb_index = %d\n"
+            "  bidi_mirrored = %d\n"
+            "  comp_exclusion = %d\n"
+            "  ignorable = %d\n"
+            "  control_boundary = %d\n"
+            "  boundclass = %d\n"
+            "  charwidth = %d\n",
+        argv[i], (char*) cstr,
+        utf8proc_category_string(c),
+        p->combining_class,
+        p->bidi_class,
+        p->decomp_type,
+        utf8proc_toupper(c),
+        utf8proc_tolower(c),
+        utf8proc_totitle(c),
+        (char *) map,
+        p->comb_index,
+        p->bidi_mirrored,
+        p->comp_exclusion,
+        p->ignorable,
+        p->control_boundary,
+        p->boundclass,
+        utf8proc_charwidth(c));
+        free(map); /* the case-folded string from utf8proc_map is heap-allocated per iteration */
+    }
+    return 0;
+}
--- a/rhubarb/lib/utf8proc-2a2f97e1/test/tests.c
+++ b/rhubarb/lib/utf8proc-2a2f97e1/test/tests.c
--- a/rhubarb/lib/utf8proc-2a2f97e1/test/tests.h
+++ b/rhubarb/lib/utf8proc-2a2f97e1/test/tests.h
--- a/rhubarb/lib/utf8proc-2a2f97e1/test/valid.c
+++ b/rhubarb/lib/utf8proc-2a2f97e1/test/valid.c
--- a/rhubarb/lib/utf8proc-2a2f97e1/utf8proc.c
+++ b/rhubarb/lib/utf8proc-2a2f97e1/utf8proc.c
@ -1,6 +1,6 @@
 /* -*- mode: c; c-basic-offset: 2; tab-width: 2; indent-tabs-mode: nil -*- */
 /*
- *  Copyright (c) 2015 Steven G. Johnson, Jiahao Chen, Peter Colberg, Tony Kelman, Scott P. Jones, and other contributors.
+ *  Copyright (c) 2018 Steven G. Johnson, Jiahao Chen, Peter Colberg, Tony Kelman, Scott P. Jones, and other contributors.
 *  Copyright (c) 2009 Public Software Group e. V., Berlin, Germany
 *
 *  Permission is hereby granted, free of charge, to any person obtaining a
@ -42,6 +42,14 @@


 #include "utf8proc.h"
+
+#ifndef SSIZE_MAX
+#define SSIZE_MAX ((size_t)SIZE_MAX/2)
+#endif
+#ifndef UINT16_MAX
+#  define UINT16_MAX 65535U
+#endif
+
 #include "utf8proc_data.c"


@ -271,12 +279,8 @@ static utf8proc_bool grapheme_break_simple(int lbc, int tbc) {
     tbc == UTF8PROC_BOUNDCLASS_ZWJ ||                // ---
     tbc == UTF8PROC_BOUNDCLASS_SPACINGMARK ||        // GB9a
     lbc == UTF8PROC_BOUNDCLASS_PREPEND) ? false :    // GB9b
-    ((lbc == UTF8PROC_BOUNDCLASS_E_BASE ||            // GB10 (requires additional handling below)
-      lbc == UTF8PROC_BOUNDCLASS_E_BASE_GAZ) &&       // ----
-     tbc == UTF8PROC_BOUNDCLASS_E_MODIFIER) ? false : // ----
-    (lbc == UTF8PROC_BOUNDCLASS_ZWJ &&                         // GB11
-     (tbc == UTF8PROC_BOUNDCLASS_GLUE_AFTER_ZWJ ||             // ----
-      tbc == UTF8PROC_BOUNDCLASS_E_BASE_GAZ)) ? false :        // ----
+    (lbc == UTF8PROC_BOUNDCLASS_E_ZWG &&              // GB11 (requires additional handling below)
+     tbc == UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC) ? false : // ----
    (lbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR &&          // GB12/13 (requires additional handling below)
     tbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR) ? false :  // ----
    true; // GB999
@ -284,9 +288,8 @@ static utf8proc_bool grapheme_break_simple(int lbc, int tbc) {

 static utf8proc_bool grapheme_break_extended(int lbc, int tbc, utf8proc_int32_t *state)
 {
-  int lbc_override = lbc;
-  if (state && *state != UTF8PROC_BOUNDCLASS_START)
-    lbc_override = *state;
+  int lbc_override = ((state && *state != UTF8PROC_BOUNDCLASS_START)
+                      ? *state : lbc);
  utf8proc_bool break_permitted = grapheme_break_simple(lbc_override, tbc);
  if (state) {
    // Special support for GB 12/13 made possible by GB999. After two RI
@ -296,12 +299,15 @@ static utf8proc_bool grapheme_break_extended(int lbc, int tbc, utf8proc_int32_t
    // forbidden by a different rule such as GB9).
    if (*state == tbc && tbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR)
      *state = UTF8PROC_BOUNDCLASS_OTHER;
-    // Special support for GB10. Fold any EXTEND codepoints into the previous
-    // boundclass if we're dealing with an emoji base boundclass.
-    else if ((*state == UTF8PROC_BOUNDCLASS_E_BASE      ||
-              *state == UTF8PROC_BOUNDCLASS_E_BASE_GAZ) &&
-             tbc == UTF8PROC_BOUNDCLASS_EXTEND)
-      *state = UTF8PROC_BOUNDCLASS_E_BASE;
+    // Special support for GB11 (emoji extend* zwj / emoji)
+    else if (*state == UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC) {
+      if (tbc == UTF8PROC_BOUNDCLASS_EXTEND) // fold EXTEND codepoints into emoji
+        *state = UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC;
+      else if (tbc == UTF8PROC_BOUNDCLASS_ZWJ)
+        *state = UTF8PROC_BOUNDCLASS_E_ZWG; // state to record emoji+zwg combo
+      else
+        *state = tbc;
+    }
    else
      *state = tbc;
  }
@ -424,6 +430,9 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_char(utf8proc_int32_t uc,
  if (options & UTF8PROC_IGNORE) {
    if (property->ignorable) return 0;
  }
+  if (options & UTF8PROC_STRIPNA) {
+    if (!category) return 0;
+  }
  if (options & UTF8PROC_LUMP) {
    if (category == UTF8PROC_CATEGORY_ZS) utf8proc_decompose_lump(0x0020);
    if (uc == 0x2018 || uc == 0x2019 || uc == 0x02BC || uc == 0x02C8)
@ -632,9 +641,9 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_normalize_utf32(utf8proc_int32_t *b
            current_property->comb_index != UINT16_MAX &&
            current_property->comb_index >= 0x8000) {
          int sidx = starter_property->comb_index;
-          int idx = (current_property->comb_index & 0x3FFF) - utf8proc_combinations[sidx];
-          if (idx >= 0 && idx <= utf8proc_combinations[sidx + 1] ) {
-            idx += sidx + 2;
+          int idx = current_property->comb_index & 0x3FFF;
+          if (idx >= utf8proc_combinations[sidx] && idx <= utf8proc_combinations[sidx + 1] ) {
+            idx += sidx + 2 - utf8proc_combinations[sidx];
            if (current_property->comb_index & 0x4000) {
              composition = (utf8proc_combinations[idx] << 16) | utf8proc_combinations[idx+1];
            } else
@ -753,3 +762,10 @@ UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKC(const utf8proc_uint8_t *str)
    UTF8PROC_COMPOSE | UTF8PROC_COMPAT);
  return retval;
 }
+
+UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKC_Casefold(const utf8proc_uint8_t *str) {
+  utf8proc_uint8_t *retval; // receives the newly allocated result from utf8proc_map
+  utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE | // str is treated as NUL-terminated
+    UTF8PROC_COMPOSE | UTF8PROC_COMPAT | UTF8PROC_CASEFOLD | UTF8PROC_IGNORE); // NFKC flags plus case folding and removal of ignorable codepoints
+  return retval; // utf8proc_map's status is discarded here; NOTE(review): confirm retval is NULL on failure
+}
--- a/rhubarb/lib/utf8proc-2a2f97e1/utf8proc.h
+++ b/rhubarb/lib/utf8proc-2a2f97e1/utf8proc.h
@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015 Steven G. Johnson, Jiahao Chen, Peter Colberg, Tony Kelman, Scott P. Jones, and other contributors.
+ * Copyright (c) 2018 Steven G. Johnson, Jiahao Chen, Peter Colberg, Tony Kelman, Scott P. Jones, and other contributors.
 * Copyright (c) 2009 Public Software Group e. V., Berlin, Germany
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
@ -71,7 +71,7 @@
 /** The MAJOR version number (increased when backwards API compatibility is broken). */
 #define UTF8PROC_VERSION_MAJOR 2
 /** The MINOR version number (increased when new functionality is added in a backwards-compatible manner). */
-#define UTF8PROC_VERSION_MINOR 1
+#define UTF8PROC_VERSION_MINOR 2
 /** The PATCH version (increased for fixes that do not change the API). */
 #define UTF8PROC_VERSION_PATCH 0
 /** @} */
@ -120,20 +120,26 @@ typedef bool utf8proc_bool;
 #endif
 #include <limits.h>

-#define UTF8PROC_DLLEXPORT
+#ifdef UTF8PROC_STATIC
+#  define UTF8PROC_DLLEXPORT
+#else
+#  ifdef _WIN32
+#    ifdef UTF8PROC_EXPORTS
+#      define UTF8PROC_DLLEXPORT __declspec(dllexport)
+#    else
+#      define UTF8PROC_DLLEXPORT __declspec(dllimport)
+#    endif
+#  elif __GNUC__ >= 4
+#    define UTF8PROC_DLLEXPORT __attribute__ ((visibility("default")))
+#  else
+#    define UTF8PROC_DLLEXPORT
+#  endif
+#endif

 #ifdef __cplusplus
 extern "C" {
 #endif

-#ifndef SSIZE_MAX
-#define SSIZE_MAX ((size_t)SIZE_MAX/2)
-#endif
-
-#ifndef UINT16_MAX
-#  define UINT16_MAX 65535U
-#endif
-
 /**
 * Option flags used by several functions in the library.
 */
@ -199,6 +205,10 @@ typedef enum {
   *       @ref UTF8PROC_DECOMPOSE
   */
  UTF8PROC_STRIPMARK = (1<<13),
+  /**
+   * Strip unassigned codepoints.
+   */
+  UTF8PROC_STRIPNA    = (1<<14),
 } utf8proc_option_t;

 /** @name Error codes
@ -364,10 +374,18 @@ typedef enum {
  UTF8PROC_BOUNDCLASS_SPACINGMARK        = 12, /**< Spacingmark */
  UTF8PROC_BOUNDCLASS_PREPEND            = 13, /**< Prepend */
  UTF8PROC_BOUNDCLASS_ZWJ                = 14, /**< Zero Width Joiner */
+
+  /* the following are no longer used in Unicode 11, but we keep
+     the constants here for backward compatibility */
  UTF8PROC_BOUNDCLASS_E_BASE             = 15, /**< Emoji Base */
  UTF8PROC_BOUNDCLASS_E_MODIFIER         = 16, /**< Emoji Modifier */
  UTF8PROC_BOUNDCLASS_GLUE_AFTER_ZWJ     = 17, /**< Glue_After_ZWJ */
  UTF8PROC_BOUNDCLASS_E_BASE_GAZ         = 18, /**< E_BASE + GLUE_AFTER_ZJW */
+
+  /* the Extended_Pictographic property is used in the Unicode 11
+     grapheme-boundary rules, so we store it in the boundclass field */
+  UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC = 19,
+  UTF8PROC_BOUNDCLASS_E_ZWG = 20, /* UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC + ZWJ */
 } utf8proc_boundclass_t;

 /**
@ -455,6 +473,7 @@ UTF8PROC_DLLEXPORT const utf8proc_property_t *utf8proc_get_property(utf8proc_int
 * - @ref UTF8PROC_CHARBOUND - insert 0xFF bytes before each grapheme cluster
 * - @ref UTF8PROC_LUMP      - lump certain different codepoints together
 * - @ref UTF8PROC_STRIPMARK - remove all character marks
+ * - @ref UTF8PROC_STRIPNA   - remove unassigned codepoints
 * @param last_boundclass
 * Pointer to an integer variable containing
 * the previous codepoint's boundary class if the @ref UTF8PROC_CHARBOUND
@ -566,6 +585,8 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_reencode(utf8proc_int32_t *buffer,
 * Given a pair of consecutive codepoints, return whether a grapheme break is
 * permitted between them (as defined by the extended grapheme clusters in UAX#29).
 *
+ * @param codepoint1 The first codepoint.
+ * @param codepoint2 The second codepoint, occurring consecutively after `codepoint1`.
 * @param state Beginning with Version 29 (Unicode 9.0.0), this algorithm requires
 *              state to break graphemes. This state can be passed in as a pointer
 *              in the `state` argument and should initially be set to 0. If the
@ -641,7 +662,7 @@ UTF8PROC_DLLEXPORT const char *utf8proc_category_string(utf8proc_int32_t codepoi
 * contain NULL characters with the string if `str` contained NULL
 * characters). Other flags in the `options` field are passed to the
 * functions defined above, and regarded as described.  See also
- * @ref utfproc_map_custom to supply a custom codepoint transformation.
+ * @ref utf8proc_map_custom to supply a custom codepoint transformation.
 *
 * In case of success the length of the new string is returned,
 * otherwise a negative error code is returned.
@ -666,8 +687,8 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_map_custom(

 /** @name Unicode normalization
 *
- * Returns a pointer to newly allocated memory of a NFD, NFC, NFKD or NFKC
- * normalized version of the null-terminated string `str`.  These
+ * Returns a pointer to newly allocated memory of a NFD, NFC, NFKD, NFKC or
+ * NFKC_Casefold normalized version of the null-terminated string `str`.  These
 * are shortcuts to calling @ref utf8proc_map with @ref UTF8PROC_NULLTERM
 * combined with @ref UTF8PROC_STABLE and flags indicating the normalization.
 */
@ -680,6 +701,11 @@ UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFC(const utf8proc_uint8_t *str);
 UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKD(const utf8proc_uint8_t *str);
 /** NFKC normalization (@ref UTF8PROC_COMPOSE and @ref UTF8PROC_COMPAT). */
 UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKC(const utf8proc_uint8_t *str);
+/**
+ * NFKC_Casefold normalization (@ref UTF8PROC_COMPOSE and @ref UTF8PROC_COMPAT
+ * and @ref UTF8PROC_CASEFOLD and @ref UTF8PROC_IGNORE).
+ **/
+UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKC_Casefold(const utf8proc_uint8_t *str);
 /** @} */

 #ifdef __cplusplus
--- a/rhubarb/lib/utf8proc-2a2f97e1/utf8proc_data.c
+++ b/rhubarb/lib/utf8proc-2a2f97e1/utf8proc_data.c
--- a/rhubarb/lib/utf8proc-2a2f97e1/utils.cmake
+++ b/rhubarb/lib/utf8proc-2a2f97e1/utils.cmake
--- a/rhubarb/lib/utf8proc-2a2f97e1/CMakeLists.txt
+++ b/rhubarb/lib/utf8proc-2a2f97e1/CMakeLists.txt
@ -1,33 +0,0 @@
-cmake_minimum_required (VERSION 2.8)
-
-include (utils.cmake)
-
-disallow_intree_builds()
-
-project (utf8proc C)
-
-# This is the ABI version number, which may differ from the
-# API version number (defined in utf8proc.h).
-# Be sure to also update these in Makefile and MANIFEST!
-set(SO_MAJOR 2)
-set(SO_MINOR 1)
-set(SO_PATCH 0)
-
-add_definitions (
-  -DUTF8PROC_EXPORTS
-)
-
-if (NOT MSVC)
-  set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -O2 -std=c99 -pedantic -Wall")
-endif ()
-
-add_library (utf8proc
-  utf8proc.c
-  utf8proc.h
-)
-
-set_target_properties (utf8proc PROPERTIES
-  POSITION_INDEPENDENT_CODE ON
-  VERSION "${SO_MAJOR}.${SO_MINOR}.${SO_PATCH}"
-  SOVERSION ${SO_MAJOR}
-)
--- a/rhubarb/lib/utf8proc-2a2f97e1/MANIFEST
+++ b/rhubarb/lib/utf8proc-2a2f97e1/MANIFEST
@ -1,7 +0,0 @@
-include/
-include/utf8proc.h
-lib/
-lib/libutf8proc.a
-lib/libutf8proc.so -> libutf8proc.so.2.1.0
-lib/libutf8proc.so.2 -> libutf8proc.so.2.1.0
-lib/libutf8proc.so.2.1.0
--- a/rhubarb/lib/utf8proc-2a2f97e1/test/charwidth.c
+++ b/rhubarb/lib/utf8proc-2a2f97e1/test/charwidth.c
@ -1,71 +0,0 @@
-#include "tests.h"
-#include <ctype.h>
-#include <wchar.h>
-
-static int my_isprint(int c) {
-     int cat = utf8proc_get_property(c)->category;
-     return (UTF8PROC_CATEGORY_LU <= cat && cat <= UTF8PROC_CATEGORY_ZS) ||
-          (c == 0x0601 || c == 0x0602 || c == 0x0603 || c == 0x06dd);
-}
-
-int main(int argc, char **argv)
-{
-     int c, error = 0, updates = 0;
-
-     (void) argc; /* unused */
-     (void) argv; /* unused */
-
-     /* some simple sanity tests of the character widths */
-     for (c = 0; c <= 0x110000; ++c) {
-          int cat = utf8proc_get_property(c)->category;
-          int w = utf8proc_charwidth(c);
-          if ((cat == UTF8PROC_CATEGORY_MN || cat == UTF8PROC_CATEGORY_ME) &&
-              w > 0) {
-               fprintf(stderr, "nonzero width %d for combining char %x\n", w, c);
-               error = 1;
-          }
-          if (w == 0 &&
-			  ((cat >= UTF8PROC_CATEGORY_LU && cat <= UTF8PROC_CATEGORY_LO) ||
-			   (cat >= UTF8PROC_CATEGORY_ND && cat <= UTF8PROC_CATEGORY_SC) ||
-			   (cat >= UTF8PROC_CATEGORY_SO && cat <= UTF8PROC_CATEGORY_ZS))) {
-               fprintf(stderr, "zero width for symbol-like char %x\n", c);
-               error = 1;
-          }
-          if (c <= 127 && ((!isprint(c) && w > 0) ||
-                           (isprint(c) && wcwidth(c) != w))) {
-               fprintf(stderr, "wcwidth %d mismatch %d for %s ASCII %x\n",
-                       wcwidth(c), w, 
-                       isprint(c) ? "printable" : "non-printable", c);
-               error = 1;
-          }
-          if (!my_isprint(c) && w > 0) {
-               fprintf(stderr, "non-printing %x had width %d\n", c, w);
-               error = 1;
-          }
-     }
-     check(!error, "utf8proc_charwidth FAILED tests.");
-
-     /* print some other information by compariing with system wcwidth */
-     printf("Mismatches with system wcwidth (not necessarily errors):\n");
-     for (c = 0; c <= 0x110000; ++c) {
-          int w = utf8proc_charwidth(c);
-          int wc = wcwidth(c);
-          if (sizeof(wchar_t) == 2 && c >= (1<<16)) continue;
-          /* lots of these errors for out-of-date system unicode tables */
-          if (wc == -1 && my_isprint(c) && w > 0) {
-			   updates += 1;
-#if 0
-               printf("  wcwidth(%x) = -1 for printable char\n", c);
-#endif
-		  }
-          if (wc == -1 && !my_isprint(c) && w > 0)
-               printf("  wcwidth(%x) = -1 for non-printable width-%d char\n", c, w);
-          if (wc >= 0 && wc != w)
-               printf("  wcwidth(%x) = %d != charwidth %d\n", c, wc, w);
-     }
-	 printf("   ... (positive widths for %d chars unknown to wcwidth) ...\n",
-			updates);
-     printf("Character-width tests SUCCEEDED.\n");
-
-     return 0;
-}
--- a/rhubarb/lib/utf8proc-2a2f97e1/test/printproperty.c
+++ b/rhubarb/lib/utf8proc-2a2f97e1/test/printproperty.c
@ -1,49 +0,0 @@
-/* simple test program to print out the utf8proc properties for a codepoint */
-
-#include "tests.h"
-
-int main(int argc, char **argv)
-{
-     int i;
-
-     for (i = 1; i < argc; ++i) {
-          unsigned int c;
-          if (!strcmp(argv[i], "-V")) {
-               printf("utf8proc version %s\n", utf8proc_version());
-               continue;
-          }
-          check(sscanf(argv[i],"%x",&c) == 1, "invalid hex input %s", argv[i]);
-          const utf8proc_property_t *p = utf8proc_get_property(c);
-          printf("U+%s:\n"
-                 "  category = %s\n"
-                 "  combining_class = %d\n"
-                 "  bidi_class = %d\n"
-                 "  decomp_type = %d\n"
-                 "  uppercase_mapping = %x\n"
-                 "  lowercase_mapping = %x\n"
-                 "  titlecase_mapping = %x\n"
-                 "  comb_index = %d\n"
-                 "  bidi_mirrored = %d\n"
-                 "  comp_exclusion = %d\n"
-                 "  ignorable = %d\n"
-                 "  control_boundary = %d\n"
-                 "  boundclass = %d\n"
-                 "  charwidth = %d\n",
-                 argv[i],
-                 utf8proc_category_string(c),
-                 p->combining_class,
-                 p->bidi_class,
-                 p->decomp_type,
-                 utf8proc_toupper(c),
-                 utf8proc_tolower(c),
-                 utf8proc_totitle(c),
-                 p->comb_index,
-                 p->bidi_mirrored,
-                 p->comp_exclusion,
-                 p->ignorable,
-                 p->control_boundary,
-                 p->boundclass,
-                 utf8proc_charwidth(c));
-     }
-     return 0;
-}
--- a/rhubarb/lib/utf8proc.patch
+++ b/rhubarb/lib/utf8proc.patch
@ -1,23 +0,0 @@
-diff --git a/lib/utf8proc-2a2f97e1/utf8proc.h b/lib/utf8proc-2a2f97e1/utf8proc.h
-index 64155a1..2fca528 100644
--- a/lib/utf8proc-2a2f97e1/utf8proc.h
-+++ b/lib/utf8proc-2a2f97e1/utf8proc.h
-@@ -120,17 +120,7 @@ typedef bool utf8proc_bool;
- #endif
- #include <limits.h>
- 
-#ifdef _WIN32
-#  ifdef UTF8PROC_EXPORTS
-#    define UTF8PROC_DLLEXPORT __declspec(dllexport)
-#  else
-#    define UTF8PROC_DLLEXPORT __declspec(dllimport)
-#  endif
-#elif __GNUC__ >= 4
-#  define UTF8PROC_DLLEXPORT __attribute__ ((visibility("default")))
-#else
-#  define UTF8PROC_DLLEXPORT
-#endif
-+#define UTF8PROC_DLLEXPORT
- 
- #ifdef __cplusplus
- extern "C" {