Skip to content

Commit fd1d15a

Browse files
author
José Valim
committed
Consider case ignorable characters on Greek downcasing, closes #7149
Note that this does not impact the runtime cost of other downcasing operations. The final beam file grew by only 8 KB.
1 parent 56fcd2a commit fd1d15a

File tree

2 files changed

+79
-11
lines changed

2 files changed

+79
-11
lines changed

lib/elixir/test/elixir/string_test.exs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -171,10 +171,14 @@ defmodule StringTest do
171171
assert String.downcase("Σ") == "σ"
172172
assert String.downcase("ΣΣ") == "σσ"
173173
assert String.downcase("Σ ΣΣ") == "σ σσ"
174+
assert String.downcase("ΜΕΣ'ΑΠΟ") == "μεσ'απο"
175+
assert String.downcase("ΑΣ'ΤΟΥΣ") == "ασ'τουσ"
174176

175177
assert String.downcase("Σ", :greek) == "σ"
176178
assert String.downcase("Σ ΣΣ", :greek) == "σ σς"
177179
assert String.downcase("Σ ΣΑΣ Σ", :greek) == "σ σας σ"
180+
assert String.downcase("ΜΕΣ'ΑΠΟ", :greek) == "μεσ'απο"
181+
assert String.downcase("ΑΣ'ΤΟΥΣ", :greek) == "ασ'τους"
178182
end
179183

180184
test "downcase/1 with ascii" do

lib/elixir/unicode/properties.ex

Lines changed: 75 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -24,10 +24,39 @@ rangify = fn [head | tail] ->
2424
[{first, last} | acc]
2525
end
2626

27-
acc = {[], [], [], %{}, %{}}
28-
29-
{codes, cased_letters, non_breakable, decompositions, combining_classes} =
30-
Enum.reduce(File.stream!(data_path), acc, fn line, {cacc, lacc, wacc, dacc, kacc} ->
27+
# A character is case ignorable if:
28+
#
29+
# Word_Break(C) = MidLetter or MidNumLet or Single_Quote, or
30+
# General_Category(C) = Nonspacing_Mark (Mn), Enclosing_Mark (Me), Format (Cf),
31+
# Modifier_Letter (Lm), or Modifier_Symbol (Sk).
32+
#
33+
# Word breaks are defined below based on TR29 (http://unicode.org/reports/tr29/).
34+
# The categories are computed later.
35+
case_ignorable = [
36+
0x0027,
37+
0x002E,
38+
0x2018,
39+
0x2019,
40+
0x2024,
41+
0xFE52,
42+
0xFF07,
43+
0xFF0E,
44+
0x00B7,
45+
0x0387,
46+
0x05F4,
47+
0x2027,
48+
0x003A,
49+
0xFE13,
50+
0xFE55,
51+
0xFF1A
52+
]
53+
54+
acc = {[], [], case_ignorable, [], %{}, %{}}
55+
cased_letter_categories = :binary.compile_pattern(["Ll", "Lt", "Lu"])
56+
case_ignorable_categories = :binary.compile_pattern(["Mn", "Me", "Cf", "Lm", "Sk"])
57+
58+
{codes, cased_letters, case_ignorable, non_breakable, decompositions, combining_classes} =
59+
Enum.reduce(File.stream!(data_path), acc, fn line, {cacc, lacc, iacc, wacc, dacc, kacc} ->
3160
[
3261
codepoint,
3362
_name,
@@ -55,10 +84,19 @@ acc = {[], [], [], %{}, %{}}
5584
cacc
5685
end
5786

58-
lacc =
59-
case category do
60-
<<"L", l>> <> _ when l in 'ltu' -> [String.to_integer(codepoint, 16) | lacc]
61-
_ -> lacc
87+
cased_letter_categories = :binary.compile_pattern(["Ll", "Lt", "Lu"])
88+
case_ignorable_categories = :binary.compile_pattern(["Mn", "Me", "Cf", "Lm", "Sk"])
89+
90+
{lacc, iacc} =
91+
cond do
92+
match?({0, _}, :binary.match(category, cased_letter_categories)) ->
93+
{[String.to_integer(codepoint, 16) | lacc], iacc}
94+
95+
match?({0, _}, :binary.match(category, case_ignorable_categories)) ->
96+
{lacc, [String.to_integer(codepoint, 16) | iacc]}
97+
98+
true ->
99+
{lacc, iacc}
62100
end
63101

64102
wacc =
@@ -88,7 +126,7 @@ acc = {[], [], [], %{}, %{}}
88126
{n, ""} -> Map.put(kacc, String.to_integer(codepoint, 16), n)
89127
end
90128

91-
{cacc, lacc, wacc, dacc, kacc}
129+
{cacc, lacc, iacc, wacc, dacc, kacc}
92130
end)
93131

94132
defmodule String.Casing do
@@ -136,10 +174,24 @@ defmodule String.Casing do
136174

137175
# Sigma handling
138176

139-
defp cased_letter_binary?(<<codepoint::utf8, _::bits>>), do: cased_letter?(codepoint)
177+
defp cased_letter_binary?(<<codepoint::utf8, rest::bits>>) do
178+
if case_ignorable?(codepoint) do
179+
cased_letter_binary?(rest)
180+
else
181+
cased_letter?(codepoint)
182+
end
183+
end
184+
140185
defp cased_letter_binary?(_), do: false
141186

142-
defp cased_letter_list?([<<codepoint::utf8>> | _]), do: cased_letter?(codepoint)
187+
defp cased_letter_list?([<<codepoint::utf8>> | rest]) do
188+
if case_ignorable?(codepoint) do
189+
cased_letter_list?(rest)
190+
else
191+
cased_letter?(codepoint)
192+
end
193+
end
194+
143195
defp cased_letter_list?(_), do: false
144196

145197
for {first, last} <- rangify.(cased_letters) do
@@ -154,6 +206,18 @@ defmodule String.Casing do
154206

155207
defp cased_letter?(_), do: false
156208

209+
for {first, last} <- rangify.(case_ignorable) do
210+
if first == last do
211+
defp case_ignorable?(unquote(first)), do: true
212+
else
213+
defp case_ignorable?(codepoint)
214+
when codepoint >= unquote(first) and codepoint <= unquote(last),
215+
do: true
216+
end
217+
end
218+
219+
defp case_ignorable?(_), do: false
220+
157221
# Upcase
158222

159223
for {codepoint, upper, _lower, _title} <- codes, upper && upper != codepoint do

0 commit comments

Comments
 (0)