@@ -24,10 +24,39 @@ rangify = fn [head | tail] ->
2424 [ { first , last } | acc ]
2525end
2626
27- acc = { [ ] , [ ] , [ ] , % { } , % { } }
28-
29- { codes , cased_letters , non_breakable , decompositions , combining_classes } =
30- Enum . reduce ( File . stream! ( data_path ) , acc , fn line , { cacc , lacc , wacc , dacc , kacc } ->
27+ # A character is case ignorable if:
28+ #
29+ # Word_Break(C) = MidLetter or MidNumLet or Single_Quote, or
30+ # General_Category(C) = Nonspacing_Mark (Mn), Enclosing_Mark (Me), Format (Cf),
31+ # Modifier_Letter (Lm), or Modifier_Symbol (Sk).
32+ #
33+ # Word breaks are defined below based on TR29 (http://unicode.org/reports/tr29/).
34+ # The categories are computed later.
# Seed list of case-ignorable code points chosen by their Word_Break property
# (MidLetter, MidNumLet or Single_Quote). The entries are kept in their
# original order. Code points that qualify through their general category are
# prepended to this list while the data file is reduced.
case_ignorable = [
  0x0027, 0x002E, 0x2018, 0x2019,
  0x2024, 0xFE52, 0xFF07, 0xFF0E,
  0x00B7, 0x0387, 0x05F4, 0x2027,
  0x003A, 0xFE13, 0xFE55, 0xFF1A
]

# Matchers for the general category field, compiled once up front so the
# reduction over the data file can reuse them.
cased_letter_categories = :binary.compile_pattern(["Ll", "Lt", "Lu"])
case_ignorable_categories = :binary.compile_pattern(["Mn", "Me", "Cf", "Lm", "Sk"])

# Initial accumulator for the reduction, in slot order: codes, cased letters,
# case-ignorable code points (pre-seeded), non-breakable code points,
# decompositions and combining classes.
acc = {[], [], case_ignorable, [], %{}, %{}}
57+
58+ { codes , cased_letters , case_ignorable , non_breakable , decompositions , combining_classes } =
59+ Enum . reduce ( File . stream! ( data_path ) , acc , fn line , { cacc , lacc , iacc , wacc , dacc , kacc } ->
3160 [
3261 codepoint ,
3362 _name ,
@@ -55,10 +84,19 @@ acc = {[], [], [], %{}, %{}}
5584 cacc
5685 end
5786
58- lacc =
59- case category do
60- << "L" , l >> <> _ when l in 'ltu' -> [ String . to_integer ( codepoint , 16 ) | lacc ]
61- _ -> lacc
# Classify the code point by its general category.
#
# `cased_letter_categories` and `case_ignorable_categories` are the patterns
# compiled once before the reduction starts and captured by this closure.
# They are deliberately not recompiled here, since this code runs once per
# line of the data file and the patterns never change.
{lacc, iacc} =
  cond do
    # Category starts with Ll, Lt or Lu: record as a cased letter.
    match?({0, _}, :binary.match(category, cased_letter_categories)) ->
      {[String.to_integer(codepoint, 16) | lacc], iacc}

    # Category starts with Mn, Me, Cf, Lm or Sk: record as case ignorable.
    match?({0, _}, :binary.match(category, case_ignorable_categories)) ->
      {lacc, [String.to_integer(codepoint, 16) | iacc]}

    # Any other category leaves both accumulators untouched.
    true ->
      {lacc, iacc}
  end
63101
64102 wacc =
@@ -88,7 +126,7 @@ acc = {[], [], [], %{}, %{}}
88126 { n , "" } -> Map . put ( kacc , String . to_integer ( codepoint , 16 ) , n )
89127 end
90128
91- { cacc , lacc , wacc , dacc , kacc }
129+ { cacc , lacc , iacc , wacc , dacc , kacc }
92130 end )
93131
94132defmodule String.Casing do
@@ -136,10 +174,24 @@ defmodule String.Casing do
136174
137175 # Sigma handling
138176
139- defp cased_letter_binary? ( << codepoint :: utf8 , _ :: bits >> ) , do: cased_letter? ( codepoint )
# Walks the binary one UTF-8 code point at a time, skipping every
# case-ignorable code point, and reports whether the first code point that
# is not case ignorable is a cased letter.
defp cased_letter_binary?(<<codepoint::utf8, rest::bits>>) do
  cond do
    # Case-ignorable code points do not decide the answer; keep scanning.
    case_ignorable?(codepoint) -> cased_letter_binary?(rest)
    true -> cased_letter?(codepoint)
  end
end

# Nothing left to inspect (or the input does not start with a UTF-8 code point).
defp cased_letter_binary?(_), do: false
141186
142- defp cased_letter_list? ( [ << codepoint :: utf8 >> | _ ] ) , do: cased_letter? ( codepoint )
# List counterpart of the binary scan: each element is expected to be a
# binary holding exactly one UTF-8 code point. Case-ignorable elements are
# skipped and the first remaining element decides the result.
defp cased_letter_list?([<<codepoint::utf8>> | rest]) do
  cond do
    # Case-ignorable code points do not decide the answer; keep scanning.
    case_ignorable?(codepoint) -> cased_letter_list?(rest)
    true -> cased_letter?(codepoint)
  end
end

# Empty list, or a head that is not a single-code-point binary.
defp cased_letter_list?(_), do: false
144196
145197 for { first , last } <- rangify . ( cased_letters ) do
@@ -154,6 +206,18 @@ defmodule String.Casing do
154206
155207 defp cased_letter? ( _ ) , do: false
156208
# Generates one `case_ignorable?/1` clause per range produced by `rangify`:
# a literal match for single-code-point ranges and a guarded clause for
# wider ranges.
for range <- rangify.(case_ignorable) do
  case range do
    {single, single} ->
      defp case_ignorable?(unquote(single)), do: true

    {low, high} ->
      defp case_ignorable?(codepoint)
           when codepoint >= unquote(low) and codepoint <= unquote(high),
           do: true
  end
end

# Anything not covered by a generated clause is not case ignorable.
defp case_ignorable?(_), do: false
220+
157221 # Upcase
158222
159223 for { codepoint , upper , _lower , _title } <- codes , upper && upper != codepoint do
0 commit comments