From 294e8a7e467665ebd4073190738aeaf677786ea1 Mon Sep 17 00:00:00 2001 From: arnaudroger Date: Tue, 10 Oct 2017 15:28:13 +0100 Subject: [PATCH 1/2] use a open addressing map to store folding paths --- java/com/google/re2j/Inst.java | 50 ++++++++---- java/com/google/re2j/Unicode.java | 95 ++++++++++++++++++++++ javatests/com/google/re2j/RunesTest.java | 48 +++++++++++ javatests/com/google/re2j/UnicodeTest.java | 21 +++++ 4 files changed, 196 insertions(+), 18 deletions(-) create mode 100644 javatests/com/google/re2j/RunesTest.java diff --git a/java/com/google/re2j/Inst.java b/java/com/google/re2j/Inst.java index 8a76b349..37725d9d 100644 --- a/java/com/google/re2j/Inst.java +++ b/java/com/google/re2j/Inst.java @@ -13,6 +13,8 @@ */ class Inst { + private static final int RUNES_LINER_SEARCH_SIZE = 10; + enum Op { ALT, ALT_MATCH, @@ -57,25 +59,37 @@ boolean matchRune(int r) { // Special case: single-rune slice is from literal string, not char // class. if (runes.length == 1) { - int r0 = runes[0]; - if (r == r0) { - return true; - } - if ((arg & RE2.FOLD_CASE) != 0) { - for (int r1 = Unicode.simpleFold(r0); - r1 != r0; - r1 = Unicode.simpleFold(r1)) { - if (r == r1) { - return true; - } + return singleMatchRune(r); + } + + return multiMatchRune(r); + } + + private boolean singleMatchRune(int r) { + int r0 = runes[0]; + if (r == r0) { + return true; + } + + if ((arg & RE2.FOLD_CASE) != 0) { + int[] folds = Unicode.optimizedFoldOrbit(r0); + + if (folds == null) { + return (r == Unicode.optimizedFoldNonOrbit(r0)); + } else { + for(int i = 0; i < folds.length; i++) { + if (folds[i] == r) return true; } } - return false; } + return false; + } - // Peek at the first few pairs. + private boolean multiMatchRune(int r) { + // Peek at the first 5 pairs. // Should handle ASCII well. 
- for (int j = 0; j < runes.length && j <= 8; j += 2) { + int length = Math.min(runes.length, RUNES_LINER_SEARCH_SIZE); + for (int j = 0; j < length; j += 2) { if (r < runes[j]) { return false; } @@ -84,12 +98,12 @@ boolean matchRune(int r) { } } - // Otherwise binary search. - for (int lo = 0, hi = runes.length / 2; lo < hi; ) { + // Otherwise binary search on rest of the array + for (int lo = 0, hi = (runes.length - RUNES_LINER_SEARCH_SIZE) / 2; lo < hi; ) { int m = lo + (hi - lo) / 2; - int c = runes[2 * m]; + int c = runes[2 * m + RUNES_LINER_SEARCH_SIZE]; if (c <= r) { - if (r <= runes[2 * m + 1]) { + if (r <= runes[2 * m + 1 + RUNES_LINER_SEARCH_SIZE]) { return true; } lo = m + 1; diff --git a/java/com/google/re2j/Unicode.java b/java/com/google/re2j/Unicode.java index 2f12a8ee..a9bed699 100644 --- a/java/com/google/re2j/Unicode.java +++ b/java/com/google/re2j/Unicode.java @@ -7,6 +7,8 @@ package com.google.re2j; +import java.util.Arrays; + /** * Utilities for dealing with Unicode better than Java does. 
* @@ -227,6 +229,99 @@ static int simpleFold(int r) { return toUpper(r); } + static int[] optimizedFoldOrbit(int r) { + if (r >= ORBIT_MIN_VALUE) { + return FOLD_MAP.get(r); + } else return null; + } + + static int optimizedFoldNonOrbit(int r) { + int l = toLower(r); + if (l != r) { + return l; + } + return toUpper(r); + } + + private static final FoldMap FOLD_MAP = new FoldMap(UnicodeTables.CASE_ORBIT.length * 4); + private static final int ORBIT_MIN_VALUE = UnicodeTables.CASE_ORBIT[0][0]; + + static { + for(int i = 0; i < UnicodeTables.CASE_ORBIT.length; i++) { + int r0 = UnicodeTables.CASE_ORBIT[i][0]; + int[] folds = new int[0]; + int r1 = r0; + while((r1 = simpleFold(r1)) != r0) { + folds = Arrays.copyOf(folds, folds.length + 1); + folds[folds.length - 1] = r1; + } + FOLD_MAP.put(r0, folds); + } + } + + /* + associate a rune to it's unfolded case equivalent + */ + private static class FoldMap { + private static final int MAX_DISTANCE = 4; + private final int[] keys; + private final int[][] values; + private final int mask; + private final int maxDistance; + + FoldMap(int size) { + int s = findNextPositivePowerOfTwo(size); + keys = new int[s]; + Arrays.fill(keys, -1); + values = new int[s][]; + mask = s - 1; + maxDistance = Math.min(keys.length, MAX_DISTANCE); + } + + public void put(int k, int[] fold) { + if (k == -1) throw new IllegalArgumentException("-1 is the empty marker"); + + int hash = hashCode(k); + int i = 0; + do { + int index = (hash + i) & mask; + if (keys[index] == -1) { // empty slot + values[index] = fold; + keys[index] = k; + return; + } + i++; + } while (i < maxDistance); + + throw new IllegalStateException("Map is full"); + } + + public int[] get(int k) { + int hash = hashCode(k); + + int i = 0; + do { + int index = (hash + i) & mask; + int key = keys[index]; + if (key == -1) return null; // empty slot + if (key == k) return values[index]; + i++; + } while (i < maxDistance); + + return null; + } + private static final int INT_PHI = 
0x9E3779B9; + + private int hashCode(int k) { + final int h = k * INT_PHI; + return h ^ (h >> 16); + } + + public static int findNextPositivePowerOfTwo(final int value) { + return 1 << (32 - Integer.numberOfLeadingZeros(value - 1)); + } + } + private Unicode() {} // uninstantiable } diff --git a/javatests/com/google/re2j/RunesTest.java b/javatests/com/google/re2j/RunesTest.java new file mode 100644 index 00000000..129d6643 --- /dev/null +++ b/javatests/com/google/re2j/RunesTest.java @@ -0,0 +1,48 @@ +package com.google.re2j; + +import org.junit.Test; + +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; + +public class RunesTest { + + @Test + public void testRunes() { + RE2 compile = RE2.compile("[0-13-46-78-9b-ce-fh-i]"); + assertTrue(compile.match("0")); + assertTrue(compile.match("1")); + assertTrue(compile.match("3")); + assertTrue(compile.match("4")); + assertTrue(compile.match("6")); + assertTrue(compile.match("7")); + assertTrue(compile.match("8")); + assertTrue(compile.match("9")); + assertTrue(compile.match("b")); + assertTrue(compile.match("c")); + assertTrue(compile.match("e")); + assertTrue(compile.match("f")); + assertTrue(compile.match("h")); + assertTrue(compile.match("i")); + + assertFalse(compile.match("2")); + assertFalse(compile.match("5")); + assertFalse(compile.match("a")); + assertFalse(compile.match("d")); + assertFalse(compile.match("g")); + assertFalse(compile.match("j")); + } + @Test + public void testRunesWithFold() { + Pattern pattern = Pattern.compile("ak", Pattern.CASE_INSENSITIVE); + String upperCase = "AK"; + String withKelvin = "AK"; + assertFalse(withKelvin.equals(upperCase));// check we use the kelvin sign + + assertTrue(pattern.matches("ak")); + assertTrue(pattern.matches(upperCase)); + assertTrue(pattern.matches("aK")); + assertTrue(pattern.matches("Ak")); + assertTrue(pattern.matches(withKelvin)); + } +} diff --git a/javatests/com/google/re2j/UnicodeTest.java 
b/javatests/com/google/re2j/UnicodeTest.java index 0ae11b29..9f51a850 100644 --- a/javatests/com/google/re2j/UnicodeTest.java +++ b/javatests/com/google/re2j/UnicodeTest.java @@ -4,6 +4,7 @@ package com.google.re2j; +import static org.junit.Assert.assertEquals; import static org.junit.Assert.fail; import org.junit.Test; @@ -38,4 +39,24 @@ public void testFoldConstants() { // int toLower(int r); // int simpleFold(int r); + @Test + public void testOptimizedFold() { + // check that the new optimized fold alg will have the same result as using simple fold + for (int i = 0; i <= Unicode.MAX_RUNE; i++) { + int[] orbit = Unicode.optimizedFoldOrbit(i); + if (orbit != null) { + int r = Unicode.simpleFold(i); + int j = 0; + while (r != i) { + assertEquals(r, orbit[j]); + j++; + r = Unicode.simpleFold(r); + } + } else { + int r = Unicode.simpleFold(i); + assertEquals(r, Unicode.optimizedFoldNonOrbit(i)); // first fold map the same + assertEquals(i, Unicode.simpleFold(r)); // second fold always go back to first + } + } + } } From cecfbead608f8134cf4b3765a2a60657dc9923e8 Mon Sep 17 00:00:00 2001 From: arnaudroger Date: Wed, 11 Oct 2017 14:22:22 +0100 Subject: [PATCH 2/2] fix comments and use Character for toLowerCase --- java/com/google/re2j/Inst.java | 16 ++-- java/com/google/re2j/Unicode.java | 90 +++++++++++----------- javatests/com/google/re2j/RunesTest.java | 2 +- javatests/com/google/re2j/UnicodeTest.java | 34 +++++--- 4 files changed, 78 insertions(+), 64 deletions(-) diff --git a/java/com/google/re2j/Inst.java b/java/com/google/re2j/Inst.java index 37725d9d..11b77362 100644 --- a/java/com/google/re2j/Inst.java +++ b/java/com/google/re2j/Inst.java @@ -13,8 +13,6 @@ */ class Inst { - private static final int RUNES_LINER_SEARCH_SIZE = 10; - enum Op { ALT, ALT_MATCH, @@ -75,7 +73,7 @@ private boolean singleMatchRune(int r) { int[] folds = Unicode.optimizedFoldOrbit(r0); if (folds == null) { - return (r == Unicode.optimizedFoldNonOrbit(r0)); + return 
Unicode.areEqualsCaseInsensitive(r, r0); } else { for(int i = 0; i < folds.length; i++) { if (folds[i] == r) return true; @@ -85,10 +83,11 @@ private boolean singleMatchRune(int r) { return false; } + private boolean multiMatchRune(int r) { // Peek at the first 5 pairs. // Should handle ASCII well. - int length = Math.min(runes.length, RUNES_LINER_SEARCH_SIZE); + int length = Math.min(runes.length, 10); for (int j = 0; j < length; j += 2) { if (r < runes[j]) { return false; @@ -98,12 +97,13 @@ private boolean multiMatchRune(int r) { } } - // Otherwise binary search on rest of the array - for (int lo = 0, hi = (runes.length - RUNES_LINER_SEARCH_SIZE) / 2; lo < hi; ) { + // Otherwise binary search over the whole array. + // Invariant: lo, hi, m are pair indices; pair m is runes[2 * m] and runes[2 * m + 1]. + for (int lo = 0, hi = runes.length / 2; lo < hi; ) { int m = lo + (hi - lo) / 2; - int c = runes[2 * m + RUNES_LINER_SEARCH_SIZE]; + int c = runes[2 * m]; if (c <= r) { - if (r <= runes[2 * m + 1 + RUNES_LINER_SEARCH_SIZE]) { + if (r <= runes[2 * m + 1]) { return true; } lo = m + 1; diff --git a/java/com/google/re2j/Unicode.java b/java/com/google/re2j/Unicode.java index a9bed699..d114a278 100644 --- a/java/com/google/re2j/Unicode.java +++ b/java/com/google/re2j/Unicode.java @@ -232,92 +232,94 @@ static int simpleFold(int r) { static int[] optimizedFoldOrbit(int r) { if (r >= ORBIT_MIN_VALUE) { return FOLD_MAP.get(r); - } else return null; - } - - static int optimizedFoldNonOrbit(int r) { - int l = toLower(r); - if (l != r) { - return l; } - return toUpper(r); + return null; } - private static final FoldMap FOLD_MAP = new FoldMap(UnicodeTables.CASE_ORBIT.length * 4); + private static final FoldMap FOLD_MAP = new FoldMap(); private static final int ORBIT_MIN_VALUE = UnicodeTables.CASE_ORBIT[0][0]; static { - for(int i = 0; i < UnicodeTables.CASE_ORBIT.length; i++) { - int r0 = UnicodeTables.CASE_ORBIT[i][0]; - int[] folds = new int[0]; + for(int[] orbit : UnicodeTables.CASE_ORBIT) { + int r0 = orbit[0]; + int[] folds = new int[3]; + int 
foldsSize = 0; int r1 = r0; while((r1 = simpleFold(r1)) != r0) { - folds = Arrays.copyOf(folds, folds.length + 1); - folds[folds.length - 1] = r1; + if (foldsSize >= folds.length) { + folds = Arrays.copyOf(folds, folds.length * 2); // copyOf returns a new array; keep it or folds never grows + } + folds[foldsSize] = r1; + foldsSize++; } - FOLD_MAP.put(r0, folds); + FOLD_MAP.put(r0, Arrays.copyOf(folds, foldsSize)); } } + public static boolean areEqualsCaseInsensitive(int r0, int r1) { + return r0 == toUpper(r1) || r0 == toLower(r1); + } + /* - associate a rune to it's unfolded case equivalent + A FoldMap maps a rune to an array of runes that are equivalent to it in a case-insensitive pattern. */ private static class FoldMap { - private static final int MAX_DISTANCE = 4; + private static final int MAX_LINEAR_PROBING = 4; private final int[] keys; private final int[][] values; private final int mask; - private final int maxDistance; - FoldMap(int size) { - int s = findNextPositivePowerOfTwo(size); + FoldMap() { + int s = findNextPositivePowerOfTwo(UnicodeTables.CASE_ORBIT.length * 4); // load factor of 0.25 to have a max linear probing of MAX_LINEAR_PROBING keys = new int[s]; - Arrays.fill(keys, -1); + Arrays.fill(keys,-1); values = new int[s][]; mask = s - 1; - maxDistance = Math.min(keys.length, MAX_DISTANCE); } - public void put(int k, int[] fold) { + void put(int k, int[] fold) { if (k == -1) throw new IllegalArgumentException("-1 is the empty marker"); - int hash = hashCode(k); - int i = 0; + int index = hashCode(k); + int maxIndex = index + MAX_LINEAR_PROBING; + do { - int index = (hash + i) & mask; - if (keys[index] == -1) { // empty slot - values[index] = fold; - keys[index] = k; + int slot = index & mask; + if (keys[slot] == -1) { // empty slot + values[slot] = fold; + keys[slot] = k; return; } - i++; - } while (i < maxDistance); + index++; + } while (index < maxIndex); throw new IllegalStateException("Map is full"); } - public int[] get(int k) { - int hash = hashCode(k); - - int i = 0; + int[] get(int k) { + int index = 
hashCode(k); + int maxIndex = index + MAX_LINEAR_PROBING; do { - int index = (hash + i) & mask; - int key = keys[index]; - if (key == -1) return null; // empty slot - if (key == k) return values[index]; - i++; - } while (i < maxDistance); + int slot = index & mask; + int key = keys[slot]; + if (key == -1) { + return null; // empty slot + } + if (key == k) { + return values[slot]; + } + index++; + } while (index < maxIndex); return null; } - private static final int INT_PHI = 0x9E3779B9; private int hashCode(int k) { - final int h = k * INT_PHI; + final int h = k * 0x9E3779B9; return h ^ (h >> 16); } - public static int findNextPositivePowerOfTwo(final int value) { + private int findNextPositivePowerOfTwo(final int value) { return 1 << (32 - Integer.numberOfLeadingZeros(value - 1)); } } diff --git a/javatests/com/google/re2j/RunesTest.java b/javatests/com/google/re2j/RunesTest.java index 129d6643..2d75eb90 100644 --- a/javatests/com/google/re2j/RunesTest.java +++ b/javatests/com/google/re2j/RunesTest.java @@ -37,7 +37,7 @@ public void testRunesWithFold() { Pattern pattern = Pattern.compile("ak", Pattern.CASE_INSENSITIVE); String upperCase = "AK"; String withKelvin = "AK"; - assertFalse(withKelvin.equals(upperCase));// check we use the kelvin sign + assertFalse(withKelvin.equals(upperCase)); // check we use the kelvin sign assertTrue(pattern.matches("ak")); assertTrue(pattern.matches(upperCase)); diff --git a/javatests/com/google/re2j/UnicodeTest.java b/javatests/com/google/re2j/UnicodeTest.java index 9f51a850..e043b1bd 100644 --- a/javatests/com/google/re2j/UnicodeTest.java +++ b/javatests/com/google/re2j/UnicodeTest.java @@ -5,6 +5,7 @@ package com.google.re2j; import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; import static org.junit.Assert.fail; import org.junit.Test; @@ -41,22 +42,33 @@ public void testFoldConstants() { @Test public void testOptimizedFold() { - // check that the new optimized fold alg will have the same 
result as using simple fold + // Check that the new optimized fold algorithm gives the same result as using simple fold. for (int i = 0; i <= Unicode.MAX_RUNE; i++) { int[] orbit = Unicode.optimizedFoldOrbit(i); if (orbit != null) { - int r = Unicode.simpleFold(i); - int j = 0; - while (r != i) { - assertEquals(r, orbit[j]); - j++; - r = Unicode.simpleFold(r); - } + testOptimizedFoldOrbitCase(i, orbit); } else { - int r = Unicode.simpleFold(i); - assertEquals(r, Unicode.optimizedFoldNonOrbit(i)); // first fold map the same - assertEquals(i, Unicode.simpleFold(r)); // second fold always go back to first + testOptimizedFoldNonOrbitCase(i); } } } + + private void testOptimizedFoldNonOrbitCase(int i) { + int r = Unicode.simpleFold(i); + assertEquals(i, Unicode.simpleFold(r)); // second fold always go back to first + assertTrue(Unicode.areEqualsCaseInsensitive(i, i)); + assertTrue(Unicode.areEqualsCaseInsensitive(i, r)); + assertTrue(Unicode.areEqualsCaseInsensitive(r, i)); + assertTrue(Unicode.areEqualsCaseInsensitive(r, r)); + } + + private void testOptimizedFoldOrbitCase(int i, int[] orbit) { + int r = Unicode.simpleFold(i); + int j = 0; + while (r != i) { + assertEquals(r, orbit[j]); + j++; + r = Unicode.simpleFold(r); + } + } }