From b99e906d37424e1f72dc010641cc9df858aaccff Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 23 Jan 2026 18:01:03 +0000 Subject: [PATCH 1/6] Initial plan From 6dc42ceee8774dd32a99aaffa42baef0fe49fc52 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 23 Jan 2026 18:05:34 +0000 Subject: [PATCH 2/6] Implement terminal reuse optimization for build_cfg() Co-authored-by: Ki-Seki <60967965+Ki-Seki@users.noreply.github.com> --- src/gimkit/dsls.py | 26 +++++++++++----- tests/test_dsls.py | 74 +++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 88 insertions(+), 12 deletions(-) diff --git a/src/gimkit/dsls.py b/src/gimkit/dsls.py index 4540116..2361130 100644 --- a/src/gimkit/dsls.py +++ b/src/gimkit/dsls.py @@ -80,24 +80,34 @@ def build_cfg(query: Query) -> str: # 3. Define whitespace rule (named REGEX to match examples, usually can also be called WS) lines.append(r"REGEX: /\s*/") - # 4. Generate specific rules for each tag + # 4. Collect unique patterns and create a mapping for terminal reuse + # This optimization avoids creating duplicate terminal rules for tags with the same regex + pattern_to_terminal: dict[str, str] = {} + terminal_definitions: list[str] = [] + for i, tag in enumerate(query.tags): # Note: When used with suffix, using greedy match /(?s:.*)/ instead of /(?s:.)*?/ is correct and legal. pattern = f"/{tag.regex}/" if tag.regex else "/(?s:.*)/" + # Get or create a shared terminal for this pattern + if pattern not in pattern_to_terminal: + # Create a new terminal name for this unique pattern + terminal_name = f"T_{len(pattern_to_terminal)}" + pattern_to_terminal[pattern] = terminal_name + terminal_definitions.append(f"{terminal_name}: {pattern}") + + terminal_name = pattern_to_terminal[pattern] + # Rule m_i (logical layer): # - capture: tells the engine to capture this part. # - suffix: specifies the ending tag, the engine stops and consumes it when encountered. # Note: Here we reference the TAG_END constant (i.e., "<|/MASKED|>") - lines.append(f'm_{i}[capture, suffix="{TAG_END}"]: M_{i}') - - # Rule M_i (regex layer): - # Define the actual matching pattern for this tag. - lines.append(f"M_{i}: {pattern}") + lines.append(f'm_{i}[capture, suffix="{TAG_END}"]: {terminal_name}') - # TODO: There may be many tags with "/(?s:.*)/" pattern, which can be inefficient. + # 5. Add all unique terminal definitions + lines.extend(terminal_definitions) - # 5. Assemble final string + # 6. Assemble final string grammar = "\n".join(lines) + "\n" is_error, msgs = validate_grammar_spec(get_grammar_spec(grammar)) diff --git a/tests/test_dsls.py b/tests/test_dsls.py index 10190e7..f6e245f 100644 --- a/tests/test_dsls.py +++ b/tests/test_dsls.py @@ -14,8 +14,8 @@ def test_build_cfg(): "%llguidance {}\n" 'start: "<|GIM_RESPONSE|>" REGEX "<|MASKED id=\\"m_0\\"|>" m_0 REGEX "<|/GIM_RESPONSE|>"\n' "REGEX: /\\s*/\n" - 'm_0[capture, suffix="<|/MASKED|>"]: M_0\n' - "M_0: /(?s:.*)/\n" + 'm_0[capture, suffix="<|/MASKED|>"]: T_0\n' + "T_0: /(?s:.*)/\n" ) assert build_cfg(query) == grm @@ -25,8 +25,8 @@ def test_build_cfg(): "%llguidance {}\n" 'start: "<|GIM_RESPONSE|>" REGEX "<|MASKED id=\\"m_0\\"|>" m_0 REGEX "<|/GIM_RESPONSE|>"\n' "REGEX: /\\s*/\n" - 'm_0[capture, suffix="<|/MASKED|>"]: M_0\n' - "M_0: /\\w+\\.com/\n" + 'm_0[capture, suffix="<|/MASKED|>"]: T_0\n' + "T_0: /\\w+\\.com/\n" ) assert build_cfg(query_with_regex) == whole_grammar_regex @@ -60,3 +60,69 @@ def test_build_json_schema(): "additionalProperties": False, } assert schema == expected_schema + + +def test_build_cfg_terminal_reuse(): + """Test that tags with the same regex pattern reuse the same terminal.""" + # Test multiple tags with default pattern (no regex) + query = Query( + "Name: ", + MaskedTag(id=0), + ", Age: ", + MaskedTag(id=1), + ", City: ", + MaskedTag(id=2), + ) + grammar = build_cfg(query) + + # All three tags should reference the same terminal T_0 + assert 'm_0[capture, suffix="<|/MASKED|>"]: T_0' in grammar + assert 'm_1[capture, suffix="<|/MASKED|>"]: T_0' in grammar + assert 'm_2[capture, suffix="<|/MASKED|>"]: T_0' in grammar + + # Should only have one terminal definition for the default pattern + assert grammar.count("T_0: /(?s:.*)/") == 1 + + # Should not have M_0, M_1, M_2 (old naming scheme) + assert "M_0:" not in grammar + assert "M_1:" not in grammar + assert "M_2:" not in grammar + + +def test_build_cfg_mixed_terminals(): + """Test that different regex patterns create different terminals but still reuse when possible.""" + query = Query( + "Email: ", + MaskedTag(id=0, regex=r"\w+@\w+\.com"), + ", Name: ", + MaskedTag(id=1), # default pattern + ", Age: ", + MaskedTag(id=2), # default pattern + ", Website: ", + MaskedTag(id=3, regex=r"\w+\.com"), + ", Backup Email: ", + MaskedTag(id=4, regex=r"\w+@\w+\.com"), # same as id=0 + ) + grammar = build_cfg(query) + + # Email pattern should be T_0 + assert 'm_0[capture, suffix="<|/MASKED|>"]: T_0' in grammar + assert "T_0: /\\w+@\\w+\\.com/" in grammar + + # Default pattern should be T_1 + assert 'm_1[capture, suffix="<|/MASKED|>"]: T_1' in grammar + assert 'm_2[capture, suffix="<|/MASKED|>"]: T_1' in grammar + assert "T_1: /(?s:.*)/" in grammar + + # Website pattern should be T_2 + assert 'm_3[capture, suffix="<|/MASKED|>"]: T_2' in grammar + assert "T_2: /\\w+\\.com/" in grammar + + # Backup email should reuse T_0 (same pattern as first email) + assert 'm_4[capture, suffix="<|/MASKED|>"]: T_0' in grammar + + # Verify we have exactly 3 unique terminals + assert grammar.count("T_0:") == 1 + assert grammar.count("T_1:") == 1 + assert grammar.count("T_2:") == 1 + assert "T_3:" not in grammar # Should not exist since we reused T_0 From 4c08e421670e78a0d72f6cea6d87abdcae8191ba Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 23 Jan 2026 18:06:55 +0000 Subject: [PATCH 3/6] Address code review: rename variable for clarity Co-authored-by: Ki-Seki <60967965+Ki-Seki@users.noreply.github.com> --- src/gimkit/dsls.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/gimkit/dsls.py b/src/gimkit/dsls.py index 2361130..81f6a0a 100644 --- a/src/gimkit/dsls.py +++ b/src/gimkit/dsls.py @@ -82,7 +82,7 @@ def build_cfg(query: Query) -> str: # 4. Collect unique patterns and create a mapping for terminal reuse # This optimization avoids creating duplicate terminal rules for tags with the same regex - pattern_to_terminal: dict[str, str] = {} + unique_pattern_terminals: dict[str, str] = {} terminal_definitions: list[str] = [] for i, tag in enumerate(query.tags): @@ -90,13 +90,13 @@ def build_cfg(query: Query) -> str: pattern = f"/{tag.regex}/" if tag.regex else "/(?s:.*)/" # Get or create a shared terminal for this pattern - if pattern not in pattern_to_terminal: + if pattern not in unique_pattern_terminals: # Create a new terminal name for this unique pattern - terminal_name = f"T_{len(pattern_to_terminal)}" - pattern_to_terminal[pattern] = terminal_name + terminal_name = f"T_{len(unique_pattern_terminals)}" + unique_pattern_terminals[pattern] = terminal_name terminal_definitions.append(f"{terminal_name}: {pattern}") - terminal_name = pattern_to_terminal[pattern] + terminal_name = unique_pattern_terminals[pattern] # Rule m_i (logical layer): # - capture: tells the engine to capture this part. From 1dcc04bb2f41f81fae53abcfa0940587490732de Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 23 Jan 2026 18:14:30 +0000 Subject: [PATCH 4/6] Add 5 new test cases for terminal reuse optimization - test_build_cfg_single_tag_no_reuse: Edge case with single tag - test_build_cfg_all_unique_patterns: All tags with different patterns - test_build_cfg_many_tags_same_pattern: High reuse scenario with 10 tags - test_build_cfg_complex_regex_patterns: Complex regex with special chars - test_build_cfg_terminal_reuse_validates: Validation check for reused terminals Co-authored-by: Ki-Seki <60967965+Ki-Seki@users.noreply.github.com> --- tests/test_dsls.py | 119 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 119 insertions(+) diff --git a/tests/test_dsls.py b/tests/test_dsls.py index f6e245f..8d69448 100644 --- a/tests/test_dsls.py +++ b/tests/test_dsls.py @@ -126,3 +126,122 @@ def test_build_cfg_mixed_terminals(): assert grammar.count("T_1:") == 1 assert grammar.count("T_2:") == 1 assert "T_3:" not in grammar # Should not exist since we reused T_0 + + +def test_build_cfg_single_tag_no_reuse(): + """Test edge case: single tag should create exactly one terminal.""" + query = Query("Value: ", MaskedTag(id=0)) + grammar = build_cfg(query) + + # Should have exactly one terminal T_0 + assert 'm_0[capture, suffix="<|/MASKED|>"]: T_0' in grammar + assert "T_0: /(?s:.*)/\n" in grammar + assert grammar.count("T_0:") == 1 + assert "T_1:" not in grammar + + +def test_build_cfg_all_unique_patterns(): + """Test edge case: all tags with different patterns should create unique terminals.""" + query = Query( + "Pattern1: ", + MaskedTag(id=0, regex=r"[A-Z]+"), + ", Pattern2: ", + MaskedTag(id=1, regex=r"[0-9]+"), + ", Pattern3: ", + MaskedTag(id=2, regex=r"[a-z]+"), + ) + grammar = build_cfg(query) + + # Each tag should have its own terminal + assert 'm_0[capture, suffix="<|/MASKED|>"]: T_0' in grammar + assert 'm_1[capture, suffix="<|/MASKED|>"]: T_1' in grammar + assert 'm_2[capture, suffix="<|/MASKED|>"]: T_2' in grammar + + # Three unique terminal definitions + assert "T_0: /[A-Z]+/" in grammar + assert "T_1: /[0-9]+/" in grammar + assert "T_2: /[a-z]+/" in grammar + + # No additional terminals + assert "T_3:" not in grammar + + +def test_build_cfg_many_tags_same_pattern(): + """Test high reuse scenario: many tags sharing the same pattern.""" + # Create 10 tags all using the default pattern + parts = [] + for i in range(10): + parts.extend([f"Field{i}: ", MaskedTag(id=i)]) + if i < 9: + parts.append(", ") + + query = Query(*parts) + grammar = build_cfg(query) + + # All 10 tags should reference the same terminal T_0 + for i in range(10): + assert f'm_{i}[capture, suffix="<|/MASKED|>"]: T_0' in grammar + + # Only one terminal definition for the shared pattern + assert grammar.count("T_0: /(?s:.*)/") == 1 + + # No other terminals should exist + assert "T_1:" not in grammar + + # Verify efficiency: 10 tags but only 1 terminal definition + terminal_count = grammar.count(": /") + assert terminal_count == 2 # 1 for REGEX (whitespace), 1 for T_0 + + +def test_build_cfg_complex_regex_patterns(): + """Test terminal reuse with complex regex patterns containing special characters.""" + # Test with various complex patterns including those from real-world use cases + query = Query( + "Date: ", + MaskedTag(id=0, regex=r"\d{4}-\d{2}-\d{2}"), + ", AnotherDate: ", + MaskedTag(id=1, regex=r"\d{4}-\d{2}-\d{2}"), # same as id=0 + ", Time: ", + MaskedTag(id=2, regex=r"\d{2}:\d{2}:\d{2}"), + ", AnotherTime: ", + MaskedTag(id=3, regex=r"\d{2}:\d{2}:\d{2}"), # same as id=2 + ) + grammar = build_cfg(query) + + # Date pattern should be reused + assert 'm_0[capture, suffix="<|/MASKED|>"]: T_0' in grammar + assert 'm_1[capture, suffix="<|/MASKED|>"]: T_0' in grammar + assert "T_0: /\\d{4}-\\d{2}-\\d{2}/" in grammar + + # Time pattern should be reused + assert 'm_2[capture, suffix="<|/MASKED|>"]: T_1' in grammar + assert 'm_3[capture, suffix="<|/MASKED|>"]: T_1' in grammar + assert "T_1: /\\d{2}:\\d{2}:\\d{2}/" in grammar + + # Only 2 unique terminals for 4 tags + assert grammar.count("T_0:") == 1 + assert grammar.count("T_1:") == 1 + assert "T_2:" not in grammar + + +def test_build_cfg_terminal_reuse_validates(): + """Test that grammars with terminal reuse still pass validation.""" + from gimkit.dsls import get_grammar_spec, validate_grammar_spec + + # Create a query with multiple tags sharing patterns + query = Query( + "A: ", + MaskedTag(id=0), + ", B: ", + MaskedTag(id=1), + ", C: ", + MaskedTag(id=2, regex=r"\w+"), + ) + grammar = build_cfg(query) + + # The grammar should be valid + grammar_spec = get_grammar_spec(grammar) + is_error, msgs = validate_grammar_spec(grammar_spec) + + assert not is_error, f"Grammar validation failed: {msgs}" + assert isinstance(msgs, list) From b5d8385d006d16c4c35155a20caa88fa63e65b90 Mon Sep 17 00:00:00 2001 From: Shichao Song <60967965+Ki-Seki@users.noreply.github.com> Date: Sat, 24 Jan 2026 02:23:08 +0800 Subject: [PATCH 5/6] docs: update docstring --- src/gimkit/dsls.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gimkit/dsls.py b/src/gimkit/dsls.py index 81f6a0a..6720f40 100644 --- a/src/gimkit/dsls.py +++ b/src/gimkit/dsls.py @@ -47,7 +47,7 @@ def build_cfg(query: Query) -> str: ```python query = '<|GIM_QUERY|>The capital of <|MASKED desc="single word" regex="中国|法国"|><|/MASKED|> is Beijing<|MASKED desc="punctuation mark" regex="\\."|><|/MASKED|><|/GIM_QUERY|>' print(repr(build_cfg(Query(query)))) - >>> '%llguidance {}\nstart: "<|GIM_RESPONSE|>" REGEX "<|MASKED id=\\"m_0\\"|>" m_0 REGEX "<|MASKED id=\\"m_1\\"|>" m_1 REGEX "<|/GIM_RESPONSE|>"\nREGEX: /\\s*/\nm_0[capture, suffix="<|/MASKED|>"]: M_0\nM_0: /中国|法国/\nm_1[capture, suffix="<|/MASKED|>"]: M_1\nM_1: /\\./\n' + >>> '%llguidance {}\nstart: "<|GIM_RESPONSE|>" REGEX "<|MASKED id=\\"m_0\\"|>" m_0 REGEX "<|MASKED id=\\"m_1\\"|>" m_1 REGEX "<|/GIM_RESPONSE|>"\nREGEX: /\\s*/\nm_0[capture, suffix="<|/MASKED|>"]: T_0\nm_1[capture, suffix="<|/MASKED|>"]: T_1\nT_0: /中国|法国/\nT_1: /\\./\n' ``` """ num_tags = len(query.tags) From 4b19946972e61cab0ea3de14b11840523e7633e6 Mon Sep 17 00:00:00 2001 From: Shichao Song <60967965+Ki-Seki@users.noreply.github.com> Date: Sat, 24 Jan 2026 02:23:40 +0800 Subject: [PATCH 6/6] test: simplify test cases --- tests/test_dsls.py | 208 +++++---------------------------------------- 1 file changed, 23 insertions(+), 185 deletions(-) diff --git a/tests/test_dsls.py b/tests/test_dsls.py index 8d69448..ee085bf 100644 --- a/tests/test_dsls.py +++ b/tests/test_dsls.py @@ -37,6 +37,29 @@ def test_build_cfg(): ): build_cfg(Query(MaskedTag(regex="[[]]"))) + # Test with various complex patterns including repeated regexes + query = Query( + "Date: ", + MaskedTag(id=0, regex=r"\d{4}-\d{2}-\d{2}"), + ", AnotherDate: ", + MaskedTag(id=1, regex=r"\d{4}-\d{2}-\d{2}"), # same as id=0 + ", Time: ", + MaskedTag(id=2, regex=r"\d{2}:\d{2}:\d{2}"), + ", AnotherTime: ", + MaskedTag(id=3, regex=r"\d{2}:\d{2}:\d{2}"), # same as id=2 + ) + assert build_cfg(query) == ( + "%llguidance {}\n" + 'start: "<|GIM_RESPONSE|>" REGEX "<|MASKED id=\\"m_0\\"|>" m_0 REGEX "<|MASKED id=\\"m_1\\"|>" m_1 REGEX "<|MASKED id=\\"m_2\\"|>" m_2 REGEX "<|MASKED id=\\"m_3\\"|>" m_3 REGEX "<|/GIM_RESPONSE|>"\n' + "REGEX: /\\s*/\n" + 'm_0[capture, suffix="<|/MASKED|>"]: T_0\n' + 'm_1[capture, suffix="<|/MASKED|>"]: T_0\n' + 'm_2[capture, suffix="<|/MASKED|>"]: T_1\n' + 'm_3[capture, suffix="<|/MASKED|>"]: T_1\n' + "T_0: /\\d{4}-\\d{2}-\\d{2}/\n" + "T_1: /\\d{2}:\\d{2}:\\d{2}/\n" + ) + def test_build_json_schema(): query = Query( @@ -60,188 +83,3 @@ def test_build_json_schema(): "additionalProperties": False, } assert schema == expected_schema - - -def test_build_cfg_terminal_reuse(): - """Test that tags with the same regex pattern reuse the same terminal.""" - # Test multiple tags with default pattern (no regex) - query = Query( - "Name: ", - MaskedTag(id=0), - ", Age: ", - MaskedTag(id=1), - ", City: ", - MaskedTag(id=2), - ) - grammar = build_cfg(query) - - # All three tags should reference the same terminal T_0 - assert 'm_0[capture, suffix="<|/MASKED|>"]: T_0' in grammar - assert 'm_1[capture, suffix="<|/MASKED|>"]: T_0' in grammar - assert 'm_2[capture, suffix="<|/MASKED|>"]: T_0' in grammar - - # Should only have one terminal definition for the default pattern - assert grammar.count("T_0: /(?s:.*)/") == 1 - - # Should not have M_0, M_1, M_2 (old naming scheme) - assert "M_0:" not in grammar - assert "M_1:" not in grammar - assert "M_2:" not in grammar - - -def test_build_cfg_mixed_terminals(): - """Test that different regex patterns create different terminals but still reuse when possible.""" - query = Query( - "Email: ", - MaskedTag(id=0, regex=r"\w+@\w+\.com"), - ", Name: ", - MaskedTag(id=1), # default pattern - ", Age: ", - MaskedTag(id=2), # default pattern - ", Website: ", - MaskedTag(id=3, regex=r"\w+\.com"), - ", Backup Email: ", - MaskedTag(id=4, regex=r"\w+@\w+\.com"), # same as id=0 - ) - grammar = build_cfg(query) - - # Email pattern should be T_0 - assert 'm_0[capture, suffix="<|/MASKED|>"]: T_0' in grammar - assert "T_0: /\\w+@\\w+\\.com/" in grammar - - # Default pattern should be T_1 - assert 'm_1[capture, suffix="<|/MASKED|>"]: T_1' in grammar - assert 'm_2[capture, suffix="<|/MASKED|>"]: T_1' in grammar - assert "T_1: /(?s:.*)/" in grammar - - # Website pattern should be T_2 - assert 'm_3[capture, suffix="<|/MASKED|>"]: T_2' in grammar - assert "T_2: /\\w+\\.com/" in grammar - - # Backup email should reuse T_0 (same pattern as first email) - assert 'm_4[capture, suffix="<|/MASKED|>"]: T_0' in grammar - - # Verify we have exactly 3 unique terminals - assert grammar.count("T_0:") == 1 - assert grammar.count("T_1:") == 1 - assert grammar.count("T_2:") == 1 - assert "T_3:" not in grammar # Should not exist since we reused T_0 - - -def test_build_cfg_single_tag_no_reuse(): - """Test edge case: single tag should create exactly one terminal.""" - query = Query("Value: ", MaskedTag(id=0)) - grammar = build_cfg(query) - - # Should have exactly one terminal T_0 - assert 'm_0[capture, suffix="<|/MASKED|>"]: T_0' in grammar - assert "T_0: /(?s:.*)/\n" in grammar - assert grammar.count("T_0:") == 1 - assert "T_1:" not in grammar - - -def test_build_cfg_all_unique_patterns(): - """Test edge case: all tags with different patterns should create unique terminals.""" - query = Query( - "Pattern1: ", - MaskedTag(id=0, regex=r"[A-Z]+"), - ", Pattern2: ", - MaskedTag(id=1, regex=r"[0-9]+"), - ", Pattern3: ", - MaskedTag(id=2, regex=r"[a-z]+"), - ) - grammar = build_cfg(query) - - # Each tag should have its own terminal - assert 'm_0[capture, suffix="<|/MASKED|>"]: T_0' in grammar - assert 'm_1[capture, suffix="<|/MASKED|>"]: T_1' in grammar - assert 'm_2[capture, suffix="<|/MASKED|>"]: T_2' in grammar - - # Three unique terminal definitions - assert "T_0: /[A-Z]+/" in grammar - assert "T_1: /[0-9]+/" in grammar - assert "T_2: /[a-z]+/" in grammar - - # No additional terminals - assert "T_3:" not in grammar - - -def test_build_cfg_many_tags_same_pattern(): - """Test high reuse scenario: many tags sharing the same pattern.""" - # Create 10 tags all using the default pattern - parts = [] - for i in range(10): - parts.extend([f"Field{i}: ", MaskedTag(id=i)]) - if i < 9: - parts.append(", ") - - query = Query(*parts) - grammar = build_cfg(query) - - # All 10 tags should reference the same terminal T_0 - for i in range(10): - assert f'm_{i}[capture, suffix="<|/MASKED|>"]: T_0' in grammar - - # Only one terminal definition for the shared pattern - assert grammar.count("T_0: /(?s:.*)/") == 1 - - # No other terminals should exist - assert "T_1:" not in grammar - - # Verify efficiency: 10 tags but only 1 terminal definition - terminal_count = grammar.count(": /") - assert terminal_count == 2 # 1 for REGEX (whitespace), 1 for T_0 - - -def test_build_cfg_complex_regex_patterns(): - """Test terminal reuse with complex regex patterns containing special characters.""" - # Test with various complex patterns including those from real-world use cases - query = Query( - "Date: ", - MaskedTag(id=0, regex=r"\d{4}-\d{2}-\d{2}"), - ", AnotherDate: ", - MaskedTag(id=1, regex=r"\d{4}-\d{2}-\d{2}"), # same as id=0 - ", Time: ", - MaskedTag(id=2, regex=r"\d{2}:\d{2}:\d{2}"), - ", AnotherTime: ", - MaskedTag(id=3, regex=r"\d{2}:\d{2}:\d{2}"), # same as id=2 - ) - grammar = build_cfg(query) - - # Date pattern should be reused - assert 'm_0[capture, suffix="<|/MASKED|>"]: T_0' in grammar - assert 'm_1[capture, suffix="<|/MASKED|>"]: T_0' in grammar - assert "T_0: /\\d{4}-\\d{2}-\\d{2}/" in grammar - - # Time pattern should be reused - assert 'm_2[capture, suffix="<|/MASKED|>"]: T_1' in grammar - assert 'm_3[capture, suffix="<|/MASKED|>"]: T_1' in grammar - assert "T_1: /\\d{2}:\\d{2}:\\d{2}/" in grammar - - # Only 2 unique terminals for 4 tags - assert grammar.count("T_0:") == 1 - assert grammar.count("T_1:") == 1 - assert "T_2:" not in grammar - - -def test_build_cfg_terminal_reuse_validates(): - """Test that grammars with terminal reuse still pass validation.""" - from gimkit.dsls import get_grammar_spec, validate_grammar_spec - - # Create a query with multiple tags sharing patterns - query = Query( - "A: ", - MaskedTag(id=0), - ", B: ", - MaskedTag(id=1), - ", C: ", - MaskedTag(id=2, regex=r"\w+"), - ) - grammar = build_cfg(query) - - # The grammar should be valid - grammar_spec = get_grammar_spec(grammar) - is_error, msgs = validate_grammar_spec(grammar_spec) - - assert not is_error, f"Grammar validation failed: {msgs}" - assert isinstance(msgs, list)