diff --git a/Cargo.toml b/Cargo.toml index d33839f..e3909d6 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "python-daachorse" -version = "0.2.2" +version = "0.3.0" edition = "2021" authors = [ "Koichi Akabe ", @@ -16,7 +16,7 @@ name = "daachorse" crate-type = ["cdylib"] [dependencies] -daachorse = "2.1.1" # MIT or Apache-2.0 +daachorse = "3.0.0" # MIT or Apache-2.0 pyo3 = { version = "0.28.3", features = ["extension-module"] } # Apache-2.0 [profile.release] diff --git a/pyproject.toml b/pyproject.toml index 4a3b5f3..8cefed2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "maturin" [project] name = "daachorse" -version = "0.2.2" +version = "0.3.0" description = "🐎 A fast implementation of the Aho-Corasick algorithm using the compact double-array data structure" readme = "README.md" classifiers = [ diff --git a/src/lib.rs b/src/lib.rs index ce7a6f5..81fd786 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -45,6 +45,18 @@ impl DoubleArrayAhoCorasick { /// Returns a list of non-overlapping matches in the given haystack. /// + /// According to the ``match_kind`` option you specified in the construction, the behavior is + /// changed for multiple possible matches, as follows. + /// + /// * If you set ``MATCH_KIND_STANDARD`` (default), the automaton searches from the beginning of + /// the input string, yielding a value immediately when a pattern is found. + /// * If you set ``MATCH_KIND_LEFTMOST_LONGEST``, the automaton reports matches corresponding to + /// the longest pattern. + /// * If you set ``MATCH_KIND_LEFTMOST_FIRST``, the automaton reports matches corresponding to + /// the pattern earlier registered to the automaton. + /// + /// The next search resumes from the end of the previously found pattern. + /// /// Example 1: Standard semantics /// >>> import daachorse /// >>> patterns = [b'bcd', b'ab', b'a'] @@ -55,14 +67,20 @@ impl DoubleArrayAhoCorasick { /// Example 2: Leftmost longest semantics /// >>> import daachorse /// >>> patterns = [b'ab', b'a', b'abcd'] - /// >>> pma = daachorse.DoubleArrayAhoCorasick(patterns, daachorse.MATCH_KIND_LEFTMOST_LONGEST) + /// >>> pma = daachorse.DoubleArrayAhoCorasick( + /// ... patterns, + /// ... daachorse.MATCH_KIND_LEFTMOST_LONGEST + /// ... ) /// >>> pma.find(b'abcd') /// [(0, 4, 2)] /// /// Example 3: Leftmost first semantics /// >>> import daachorse /// >>> patterns = [b'ab', b'a', b'abcd'] - /// >>> pma = daachorse.DoubleArrayAhoCorasick(patterns, daachorse.MATCH_KIND_LEFTMOST_FIRST) + /// >>> pma = daachorse.DoubleArrayAhoCorasick( + /// ... patterns, + /// ... daachorse.MATCH_KIND_LEFTMOST_FIRST + /// ... ) /// >>> pma.find(b'abcd') /// [(0, 2, 0)] /// @@ -90,6 +108,13 @@ impl DoubleArrayAhoCorasick { /// Returns a list of overlapping matches in the given haystack. /// + /// The automaton follows the standard behavior of the Aho-Corasick algorithm. It searches from + /// the beginning of the input string, and upon reaching a given position, it yields the + /// patterns ending at that position in descending order of length. + /// + /// If the pattern set contains duplicate patterns, they are yielded in the order they were + /// registered. + /// /// Examples: /// >>> import daachorse /// >>> patterns = [b'bcd', b'ab', b'a'] @@ -116,13 +141,11 @@ impl DoubleArrayAhoCorasick { })) } - /// Returns a list of overlapping matches without suffixes in the given haystack iterator. - /// - /// The Aho-Corasick algorithm reads through the haystack from left to right and reports - /// matches when it reaches the end of each pattern. In the overlapping match, more than one - /// pattern can be returned per report. + /// Returns a list of overlapping matches without suffixes in the given haystack. /// - /// This function returns the first match on each report. + /// The behavior of the automaton is similar to ``find_overlapping()``, except that upon + /// reaching a given position, it yields only the single longest pattern ending at that + /// position. /// /// Examples: /// >>> import daachorse @@ -252,6 +275,18 @@ impl CharwiseDoubleArrayAhoCorasick { /// Returns a list of non-overlapping matches in the given haystack. /// + /// According to the ``match_kind`` option you specified in the construction, the behavior is + /// changed for multiple possible matches, as follows. + /// + /// * If you set ``MATCH_KIND_STANDARD`` (default), the automaton searches from the beginning of + /// the input string, yielding a value immediately when a pattern is found. + /// * If you set ``MATCH_KIND_LEFTMOST_LONGEST``, the automaton reports matches corresponding to + /// the longest pattern. + /// * If you set ``MATCH_KIND_LEFTMOST_FIRST``, the automaton reports matches corresponding to + /// the pattern earlier registered to the automaton. + /// + /// The next search resumes from the end of the previously found pattern. + /// /// Example 1: Standard semantics /// >>> import daachorse /// >>> patterns = ['bcd', 'ab', 'a'] @@ -262,14 +297,20 @@ impl CharwiseDoubleArrayAhoCorasick { /// Example 2: Leftmost longest semantics /// >>> import daachorse /// >>> patterns = ['ab', 'a', 'abcd'] - /// >>> pma = daachorse.CharwiseDoubleArrayAhoCorasick(patterns, daachorse.MATCH_KIND_LEFTMOST_LONGEST) + /// >>> pma = daachorse.CharwiseDoubleArrayAhoCorasick( + /// ... patterns, + /// ... daachorse.MATCH_KIND_LEFTMOST_LONGEST + /// ... ) /// >>> pma.find('abcd') /// [(0, 4, 2)] /// /// Example 3: Leftmost first semantics /// >>> import daachorse /// >>> patterns = ['ab', 'a', 'abcd'] - /// >>> pma = daachorse.CharwiseDoubleArrayAhoCorasick(patterns, daachorse.MATCH_KIND_LEFTMOST_FIRST) + /// >>> pma = daachorse.CharwiseDoubleArrayAhoCorasick( + /// ... patterns, + /// ... daachorse.MATCH_KIND_LEFTMOST_FIRST + /// ... ) /// >>> pma.find('abcd') /// [(0, 2, 0)] /// @@ -322,6 +363,13 @@ impl CharwiseDoubleArrayAhoCorasick { /// Returns a list of overlapping matches in the given haystack. /// + /// The automaton follows the standard behavior of the Aho-Corasick algorithm. It searches from + /// the beginning of the input string, and upon reaching a given position, it yields the + /// patterns ending at that position in descending order of length. + /// + /// If the pattern set contains duplicate patterns, they are yielded in the order they were + /// registered. + /// /// Examples: /// >>> import daachorse /// >>> patterns = ['bcd', 'ab', 'a'] @@ -364,13 +412,11 @@ impl CharwiseDoubleArrayAhoCorasick { })) } - /// Returns a list of overlapping matches without suffixes in the given haystack iterator. - /// - /// The Aho-Corasick algorithm reads through the haystack from left to right and reports - /// matches when it reaches the end of each pattern. In the overlapping match, more than one - /// pattern can be returned per report. + /// Returns a list of overlapping matches without suffixes in the given haystack. /// - /// This function returns the first match on each report. + /// The behavior of the automaton is similar to ``find_overlapping()``, except that upon + /// reaching a given position, it yields only the single longest pattern ending at that + /// position. /// /// Examples: /// >>> import daachorse