From 1a2822e9823951aaf0743d2991761912a76ea56f Mon Sep 17 00:00:00 2001
From: Shichao Song <60967965+Ki-Seki@users.noreply.github.com>
Date: Sat, 24 Jan 2026 00:47:01 +0800
Subject: [PATCH 1/6] fix(grammar): refactor CFG generation to use llguidance
 suffix syntax

Refactor the `build_cfg` function to generate a flattened grammar structure compatible with llguidance.

Key changes:
- Flatten the `start` rule so that it directly sequences the opening-tag literals, the per-tag content rules (`m_i`), and the whitespace terminals.
- Implement `[suffix="..."]` and `[capture]` attributes on lowercase rules (`m_i`) to correctly consume closing tags.
- Separate regex definitions into uppercase terminals (`M_i`), enabling safe use of greedy matching `/(?s:.*)/`.
- Add the required `%llguidance {}` header.

This resolves grammar validation errors and prevents greedy regexes from swallowing closing tags or producing duplicate closing tags.
---
 src/gimkit/dsls.py | 89 +++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 77 insertions(+), 12 deletions(-)

diff --git a/src/gimkit/dsls.py b/src/gimkit/dsls.py
index 925ca7f..6fe1340 100644
--- a/src/gimkit/dsls.py
+++ b/src/gimkit/dsls.py
@@ -35,21 +35,86 @@ def validate_grammar_spec(grammar_spec: str) -> tuple[bool, list[str]]:
 
 def build_cfg(query: Query) -> str:
     """Build an LLGuidance context-free grammar (CFG) string based on the query object.
-
-    LLGuidance syntax reference: https://github.com/guidance-ai/llguidance/blob/main/docs/syntax.md
+    
+    Constructs a flattened grammar structure compatible with LLGuidance's suffix/capture logic.
+
+    Ref: 
+    - https://github.com/guidance-ai/llguidance/blob/main/docs/syntax.md: Incomplete doc of llguidance grammar syntax
+    - https://github.com/guidance-ai/guidance/blob/main/guidance/_ast.py: LarkSerializer implementation
+    - https://github.com/guidance-ai/llguidance: Source code
+    
+    Example:
+    ```python
+    print(build_cfg(query))
+    %llguidance {}
+
+    start: "<|GIM_RESPONSE|>" REGEX "<|MASKED id=\"m_0\"|>" m_0 REGEX "<|MASKED id=\"m_1\"|>" m_1 REGEX "<|MASKED id=\"m_2\"|>" m_2 REGEX "<|MASKED id=\"m_3\"|>" m_3 REGEX "<|MASKED id=\"m_4\"|>" m_4 REGEX "<|MASKED id=\"m_5\"|>" m_5 REGEX "<|MASKED id=\"m_6\"|>" m_6 REGEX "<|/GIM_RESPONSE|>"
+    REGEX: /\s*/
+    m_0[capture, suffix="<|/MASKED|>"]: M_0
+    M_0: /CO₂|二氧化碳/
+    m_1[capture, suffix="<|/MASKED|>"]: M_1
+    M_1: /(?s:.*)/
+    m_2[capture, suffix="<|/MASKED|>"]: M_2
+    M_2: /(?s:.*)/
+    m_3[capture, suffix="<|/MASKED|>"]: M_3
+    M_3: /(?s:.*)/
+    m_4[capture, suffix="<|/MASKED|>"]: M_4
+    M_4: /(?s:.*)/
+    m_5[capture, suffix="<|/MASKED|>"]: M_5
+    M_5: /(?s:.*)/
+    m_6[capture, suffix="<|/MASKED|>"]: M_6
+    M_6: /(?s:.*)/
+    ```
     """
     num_tags = len(query.tags)
-    grammar_first_line = f'''start: "{RESPONSE_PREFIX}" {" ".join(f"tag{i}" for i in range(num_tags))} "{RESPONSE_SUFFIX}"'''
-
-    grammar_rest_lines = []
+    
+    # 1. 头部声明
+    lines = ["%llguidance {}"]
+
+    # 2. 构建 start 规则
+    # 目标格式: start: "PREFIX" REGEX "OPEN_TAG_0" m_0 REGEX "OPEN_TAG_1" m_1 ... REGEX "SUFFIX"
+    start_parts = [f'"{RESPONSE_PREFIX}"']
+    
+    for i in range(num_tags):
+        # 添加空白符规则引用
+        start_parts.append("REGEX")
+        
+        # 添加开始标签的字面量，例如: "<|MASKED id=\"m_0\"|>"
+        # 注意转义: id=\"m_{i}\"
+        open_tag_str = f'"{TAG_OPEN_LEFT} id=\\"m_{i}\\"{TAG_OPEN_RIGHT}"'
+        start_parts.append(open_tag_str)
+        
+        # 添加内容规则引用 (小写 m_i)
+        start_parts.append(f"m_{i}")
+
+    # 添加结尾的空白符和后缀
+    start_parts.append("REGEX")
+    start_parts.append(f'"{RESPONSE_SUFFIX}"')
+    
+    lines.append(f"start: {' '.join(start_parts)}")
+
+    # 3. 定义空白符规则 (命名为 REGEX 以匹配你的合法示例，通常也可以叫 WS)
+    lines.append(r"REGEX: /\s*/")
+
+    # 4. 生成每个 tag 的具体规则
     for i, tag in enumerate(query.tags):
-        # `/(?s:.)*?/` is a non-greedy match for any character including newlines
-        content_pattern = f"/{tag.regex}/" if tag.regex else "/(?s:.)*?/"
-        grammar_rest_lines.append(
-            f'tag{i}: "{TAG_OPEN_LEFT} id=\\"m_{i}\\"{TAG_OPEN_RIGHT}" {content_pattern} "{TAG_END}"'
-        )
-
-    grammar = grammar_first_line + "\n" + "\n".join(grammar_rest_lines)
+        # 注意：配合 suffix 使用时，使用贪婪匹配 /(?s:.*)/ 而不是 /(?s:.)*?/ 是正确且合法的。
+        pattern = f"/{tag.regex}/" if tag.regex else "/(?s:.*)/"
+        
+        # 规则 m_i (逻辑层):
+        # - capture: 告诉引擎捕获这个部分。
+        # - suffix: 指定结束标签，引擎遇到它会停止并消费它。
+        # 注意：这里引用 TAG_END 常量 (即 "<|/MASKED|>")
+        lines.append(f'm_{i}[capture, suffix="{TAG_END}"]: M_{i}')
+        
+        # 规则 M_i (正则层):
+        # 定义实际的匹配模式
+        lines.append(f'M_{i}: {pattern}')
+        
+        # TODO: "/(?s:.*)/" 的 tags 可能有很多个, 可以将具有相同 pattern 的规则合并以优化效率
+
+    # 5. 组合最终字符串
+    grammar = "\n".join(lines) + "\n"
 
     is_error, msgs = validate_grammar_spec(get_grammar_spec(grammar))
     if is_error:

From 96ca5a5e3f66fa85cf64420f68edd2d1d9dc2f13 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Fri, 23 Jan 2026 16:51:27 +0000
Subject: [PATCH 2/6] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 src/gimkit/dsls.py | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/src/gimkit/dsls.py b/src/gimkit/dsls.py
index 6fe1340..ba2dd5d 100644
--- a/src/gimkit/dsls.py
+++ b/src/gimkit/dsls.py
@@ -35,14 +35,14 @@ def validate_grammar_spec(grammar_spec: str) -> tuple[bool, list[str]]:
 
 def build_cfg(query: Query) -> str:
     """Build an LLGuidance context-free grammar (CFG) string based on the query object.
-    
+
     Constructs a flattened grammar structure compatible with LLGuidance's suffix/capture logic.
 
-    Ref: 
+    Ref:
     - https://github.com/guidance-ai/llguidance/blob/main/docs/syntax.md: Incomplete doc of llguidance grammar syntax
     - https://github.com/guidance-ai/guidance/blob/main/guidance/_ast.py: LarkSerializer implementation
     - https://github.com/guidance-ai/llguidance: Source code
-    
+
     Example:
     ```python
     print(build_cfg(query))
@@ -67,30 +67,30 @@ def build_cfg(query: Query) -> str:
     ```
     """
     num_tags = len(query.tags)
-    
+
     # 1. 头部声明
     lines = ["%llguidance {}"]
 
     # 2. 构建 start 规则
     # 目标格式: start: "PREFIX" REGEX "OPEN_TAG_0" m_0 REGEX "OPEN_TAG_1" m_1 ... REGEX "SUFFIX"
     start_parts = [f'"{RESPONSE_PREFIX}"']
-    
+
     for i in range(num_tags):
         # 添加空白符规则引用
         start_parts.append("REGEX")
-        
+
         # 添加开始标签的字面量，例如: "<|MASKED id=\"m_0\"|>"
         # 注意转义: id=\"m_{i}\"
         open_tag_str = f'"{TAG_OPEN_LEFT} id=\\"m_{i}\\"{TAG_OPEN_RIGHT}"'
         start_parts.append(open_tag_str)
-        
+
         # 添加内容规则引用 (小写 m_i)
         start_parts.append(f"m_{i}")
 
     # 添加结尾的空白符和后缀
     start_parts.append("REGEX")
     start_parts.append(f'"{RESPONSE_SUFFIX}"')
-    
+
     lines.append(f"start: {' '.join(start_parts)}")
 
     # 3. 定义空白符规则 (命名为 REGEX 以匹配你的合法示例，通常也可以叫 WS)
@@ -100,17 +100,17 @@ def build_cfg(query: Query) -> str:
     for i, tag in enumerate(query.tags):
         # 注意：配合 suffix 使用时，使用贪婪匹配 /(?s:.*)/ 而不是 /(?s:.)*?/ 是正确且合法的。
         pattern = f"/{tag.regex}/" if tag.regex else "/(?s:.*)/"
-        
+
         # 规则 m_i (逻辑层):
         # - capture: 告诉引擎捕获这个部分。
         # - suffix: 指定结束标签，引擎遇到它会停止并消费它。
         # 注意：这里引用 TAG_END 常量 (即 "<|/MASKED|>")
         lines.append(f'm_{i}[capture, suffix="{TAG_END}"]: M_{i}')
-        
+
         # 规则 M_i (正则层):
         # 定义实际的匹配模式
-        lines.append(f'M_{i}: {pattern}')
-        
+        lines.append(f"M_{i}: {pattern}")
+
         # TODO: "/(?s:.*)/" 的 tags 可能有很多个, 可以将具有相同 pattern 的规则合并以优化效率
 
     # 5. 组合最终字符串

From 8fae664c0efdc933bdd031415d398f5819d5c5cd Mon Sep 17 00:00:00 2001
From: Shichao Song <60967965+Ki-Seki@users.noreply.github.com>
Date: Sat, 24 Jan 2026 01:04:03 +0800
Subject: [PATCH 3/6] docs: spell out "documentation" in build_cfg docstring reference

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 src/gimkit/dsls.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/gimkit/dsls.py b/src/gimkit/dsls.py
index ba2dd5d..22ba7b0 100644
--- a/src/gimkit/dsls.py
+++ b/src/gimkit/dsls.py
@@ -39,7 +39,7 @@ def build_cfg(query: Query) -> str:
     Constructs a flattened grammar structure compatible with LLGuidance's suffix/capture logic.
 
     Ref:
-    - https://github.com/guidance-ai/llguidance/blob/main/docs/syntax.md: Incomplete doc of llguidance grammar syntax
+    - https://github.com/guidance-ai/llguidance/blob/main/docs/syntax.md: Incomplete documentation of llguidance grammar syntax
     - https://github.com/guidance-ai/guidance/blob/main/guidance/_ast.py: LarkSerializer implementation
     - https://github.com/guidance-ai/llguidance: Source code
 

From 3b3fcc844f9b223dbcce6b3eb3dc08eb133704a8 Mon Sep 17 00:00:00 2001
From: Shichao Song <60967965+Ki-Seki@users.noreply.github.com>
Date: Sat, 24 Jan 2026 01:30:29 +0800
Subject: [PATCH 4/6] docs: enhance documentation and some comments

---
 src/gimkit/dsls.py | 62 +++++++++++++++++-----------------------------
 1 file changed, 23 insertions(+), 39 deletions(-)

diff --git a/src/gimkit/dsls.py b/src/gimkit/dsls.py
index 22ba7b0..4540116 100644
--- a/src/gimkit/dsls.py
+++ b/src/gimkit/dsls.py
@@ -43,77 +43,61 @@ def build_cfg(query: Query) -> str:
     - https://github.com/guidance-ai/guidance/blob/main/guidance/_ast.py: LarkSerializer implementation
     - https://github.com/guidance-ai/llguidance: Source code
 
-    Example:
+    Real-World Example:
     ```python
-    print(build_cfg(query))
-    %llguidance {}
-
-    start: "<|GIM_RESPONSE|>" REGEX "<|MASKED id=\"m_0\"|>" m_0 REGEX "<|MASKED id=\"m_1\"|>" m_1 REGEX "<|MASKED id=\"m_2\"|>" m_2 REGEX "<|MASKED id=\"m_3\"|>" m_3 REGEX "<|MASKED id=\"m_4\"|>" m_4 REGEX "<|MASKED id=\"m_5\"|>" m_5 REGEX "<|MASKED id=\"m_6\"|>" m_6 REGEX "<|/GIM_RESPONSE|>"
-    REGEX: /\s*/
-    m_0[capture, suffix="<|/MASKED|>"]: M_0
-    M_0: /CO₂|二氧化碳/
-    m_1[capture, suffix="<|/MASKED|>"]: M_1
-    M_1: /(?s:.*)/
-    m_2[capture, suffix="<|/MASKED|>"]: M_2
-    M_2: /(?s:.*)/
-    m_3[capture, suffix="<|/MASKED|>"]: M_3
-    M_3: /(?s:.*)/
-    m_4[capture, suffix="<|/MASKED|>"]: M_4
-    M_4: /(?s:.*)/
-    m_5[capture, suffix="<|/MASKED|>"]: M_5
-    M_5: /(?s:.*)/
-    m_6[capture, suffix="<|/MASKED|>"]: M_6
-    M_6: /(?s:.*)/
+    query = '<|GIM_QUERY|>The capital of <|MASKED desc="single word" regex="中国|法国"|><|/MASKED|> is Beijing<|MASKED desc="punctuation mark" regex="\\."|><|/MASKED|><|/GIM_QUERY|>'
+    print(repr(build_cfg(Query(query))))
+    >>> '%llguidance {}\nstart: "<|GIM_RESPONSE|>" REGEX "<|MASKED id=\\"m_0\\"|>" m_0 REGEX "<|MASKED id=\\"m_1\\"|>" m_1 REGEX "<|/GIM_RESPONSE|>"\nREGEX: /\\s*/\nm_0[capture, suffix="<|/MASKED|>"]: M_0\nM_0: /中国|法国/\nm_1[capture, suffix="<|/MASKED|>"]: M_1\nM_1: /\\./\n'
     ```
     """
     num_tags = len(query.tags)
 
-    # 1. 头部声明
+    # 1. Header declaration
     lines = ["%llguidance {}"]
 
-    # 2. 构建 start 规则
-    # 目标格式: start: "PREFIX" REGEX "OPEN_TAG_0" m_0 REGEX "OPEN_TAG_1" m_1 ... REGEX "SUFFIX"
+    # 2. Build start rule
+    # Target format: start: "PREFIX" REGEX "OPEN_TAG_0" m_0 REGEX "OPEN_TAG_1" m_1 ... REGEX "SUFFIX"
     start_parts = [f'"{RESPONSE_PREFIX}"']
 
     for i in range(num_tags):
-        # 添加空白符规则引用
+        # Add whitespace rule reference
         start_parts.append("REGEX")
 
-        # 添加开始标签的字面量，例如: "<|MASKED id=\"m_0\"|>"
-        # 注意转义: id=\"m_{i}\"
+        # Add opening tag literal, e.g.: "<|MASKED id=\"m_0\"|>"
+        # Note escaping: id=\"m_{i}\"
         open_tag_str = f'"{TAG_OPEN_LEFT} id=\\"m_{i}\\"{TAG_OPEN_RIGHT}"'
         start_parts.append(open_tag_str)
 
-        # 添加内容规则引用 (小写 m_i)
+        # Add content rule reference (lowercase m_i)
         start_parts.append(f"m_{i}")
 
-    # 添加结尾的空白符和后缀
+    # Add trailing whitespace and suffix
     start_parts.append("REGEX")
     start_parts.append(f'"{RESPONSE_SUFFIX}"')
 
     lines.append(f"start: {' '.join(start_parts)}")
 
-    # 3. 定义空白符规则 (命名为 REGEX 以匹配你的合法示例，通常也可以叫 WS)
+    # 3. Define whitespace rule (named REGEX to match examples, usually can also be called WS)
     lines.append(r"REGEX: /\s*/")
 
-    # 4. 生成每个 tag 的具体规则
+    # 4. Generate specific rules for each tag
     for i, tag in enumerate(query.tags):
-        # 注意：配合 suffix 使用时，使用贪婪匹配 /(?s:.*)/ 而不是 /(?s:.)*?/ 是正确且合法的。
+        # Note: When used with suffix, using greedy match /(?s:.*)/ instead of /(?s:.)*?/ is correct and legal.
         pattern = f"/{tag.regex}/" if tag.regex else "/(?s:.*)/"
 
-        # 规则 m_i (逻辑层):
-        # - capture: 告诉引擎捕获这个部分。
-        # - suffix: 指定结束标签，引擎遇到它会停止并消费它。
-        # 注意：这里引用 TAG_END 常量 (即 "<|/MASKED|>")
+        # Rule m_i (logical layer):
+        # - capture: tells the engine to capture this part.
+        # - suffix: specifies the ending tag, the engine stops and consumes it when encountered.
+        # Note: Here we reference the TAG_END constant (i.e., "<|/MASKED|>")
         lines.append(f'm_{i}[capture, suffix="{TAG_END}"]: M_{i}')
 
-        # 规则 M_i (正则层):
-        # 定义实际的匹配模式
+        # Rule M_i (regex layer):
+        # Define the actual matching pattern for this tag.
         lines.append(f"M_{i}: {pattern}")
 
-        # TODO: "/(?s:.*)/" 的 tags 可能有很多个, 可以将具有相同 pattern 的规则合并以优化效率
+        # TODO: There may be many tags with "/(?s:.*)/" pattern, which can be inefficient.
 
-    # 5. 组合最终字符串
+    # 5. Assemble final string
     grammar = "\n".join(lines) + "\n"
 
     is_error, msgs = validate_grammar_spec(get_grammar_spec(grammar))

From 1f91290801b4e83c5cd3905148707b43b6ce96c9 Mon Sep 17 00:00:00 2001
From: Shichao Song <60967965+Ki-Seki@users.noreply.github.com>
Date: Sat, 24 Jan 2026 01:46:46 +0800
Subject: [PATCH 5/6] test: update corresponding test

---
 tests/test_dsls.py | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/tests/test_dsls.py b/tests/test_dsls.py
index fe3ffd1..aeee735 100644
--- a/tests/test_dsls.py
+++ b/tests/test_dsls.py
@@ -11,16 +11,22 @@
 def test_build_cfg():
     query = Query('Hello, <|MASKED id="m_0"|>world<|/MASKED|>!')
     grm = (
-        'start: "<|GIM_RESPONSE|>" tag0 "<|/GIM_RESPONSE|>"\n'
-        'tag0: "<|MASKED id=\\"m_0\\"|>" /(?s:.)*?/ "<|/MASKED|>"'
+        '%llguidance {}\n'
+        'start: "<|GIM_RESPONSE|>" REGEX "<|MASKED id=\\"m_0\\"|>" m_0 REGEX "<|/GIM_RESPONSE|>"\n'
+        'REGEX: /\\s*/\n'
+        'm_0[capture, suffix="<|/MASKED|>"]: M_0\n'
+        'M_0: /(?s:.*)/\n'
     )
     assert build_cfg(query) == grm
 
     # Test with regex
-    query_with_regex = Query("Hello, ", MaskedTag(id=0, regex="[A-Za-z]{5}"), "!")
+    query_with_regex = Query("Hello, ", MaskedTag(id=0, regex=r"\w+\.com"), "!")
     whole_grammar_regex = (
-        'start: "<|GIM_RESPONSE|>" tag0 "<|/GIM_RESPONSE|>"\n'
-        'tag0: "<|MASKED id=\\"m_0\\"|>" /[A-Za-z]{5}/ "<|/MASKED|>"'
+        '%llguidance {}\n'
+        'start: "<|GIM_RESPONSE|>" REGEX "<|MASKED id=\\"m_0\\"|>" m_0 REGEX "<|/GIM_RESPONSE|>"\n'
+        'REGEX: /\\s*/\n'
+        'm_0[capture, suffix="<|/MASKED|>"]: M_0\n'
+        'M_0: /\\w+\\.com/\n'
     )
     assert build_cfg(query_with_regex) == whole_grammar_regex
 

From 09c9c56e11a1a7f36d98b234854130468e7eae32 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Fri, 23 Jan 2026 17:47:26 +0000
Subject: [PATCH 6/6] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 tests/test_dsls.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/tests/test_dsls.py b/tests/test_dsls.py
index aeee735..10190e7 100644
--- a/tests/test_dsls.py
+++ b/tests/test_dsls.py
@@ -11,22 +11,22 @@
 def test_build_cfg():
     query = Query('Hello, <|MASKED id="m_0"|>world<|/MASKED|>!')
     grm = (
-        '%llguidance {}\n'
+        "%llguidance {}\n"
         'start: "<|GIM_RESPONSE|>" REGEX "<|MASKED id=\\"m_0\\"|>" m_0 REGEX "<|/GIM_RESPONSE|>"\n'
-        'REGEX: /\\s*/\n'
+        "REGEX: /\\s*/\n"
         'm_0[capture, suffix="<|/MASKED|>"]: M_0\n'
-        'M_0: /(?s:.*)/\n'
+        "M_0: /(?s:.*)/\n"
     )
     assert build_cfg(query) == grm
 
     # Test with regex
     query_with_regex = Query("Hello, ", MaskedTag(id=0, regex=r"\w+\.com"), "!")
     whole_grammar_regex = (
-        '%llguidance {}\n'
+        "%llguidance {}\n"
         'start: "<|GIM_RESPONSE|>" REGEX "<|MASKED id=\\"m_0\\"|>" m_0 REGEX "<|/GIM_RESPONSE|>"\n'
-        'REGEX: /\\s*/\n'
+        "REGEX: /\\s*/\n"
         'm_0[capture, suffix="<|/MASKED|>"]: M_0\n'
-        'M_0: /\\w+\\.com/\n'
+        "M_0: /\\w+\\.com/\n"
     )
     assert build_cfg(query_with_regex) == whole_grammar_regex