From 1a2822e9823951aaf0743d2991761912a76ea56f Mon Sep 17 00:00:00 2001 From: Shichao Song <60967965+Ki-Seki@users.noreply.github.com> Date: Sat, 24 Jan 2026 00:47:01 +0800 Subject: [PATCH 1/6] fix(grammar): refactor CFG generation to use llguidance suffix syntax Refactor the `build_cfg` function to generate a flattened grammar structure compatible with llguidance. Key changes: - Flatten the `start` rule to sequence tags and whitespace directly. - Implement `[suffix="..."]` and `[capture]` attributes on lowercase rules (`m_i`) to correctly consume closing tags. - Separate regex definitions into uppercase terminals (`M_i`), enabling safe use of greedy matching `/(?s:.*)/`. - Add the required `%llguidance {}` header. This resolves validation errors and prevents issues where greedy regexes swallowed closing tags or generated duplicate closures. --- src/gimkit/dsls.py | 89 +++++++++++++++++++++++++++++++++++++++------- 1 file changed, 77 insertions(+), 12 deletions(-) diff --git a/src/gimkit/dsls.py b/src/gimkit/dsls.py index 925ca7f..6fe1340 100644 --- a/src/gimkit/dsls.py +++ b/src/gimkit/dsls.py @@ -35,21 +35,86 @@ def validate_grammar_spec(grammar_spec: str) -> tuple[bool, list[str]]: def build_cfg(query: Query) -> str: """Build an LLGuidance context-free grammar (CFG) string based on the query object. - - LLGuidance syntax reference: https://github.com/guidance-ai/llguidance/blob/main/docs/syntax.md + + Constructs a flattened grammar structure compatible with LLGuidance's suffix/capture logic. + + Ref: + - https://github.com/guidance-ai/llguidance/blob/main/docs/syntax.md: Incomplete doc of llguidance grammar syntax + - https://github.com/guidance-ai/guidance/blob/main/guidance/_ast.py: LarkSerializer implementation + - https://github.com/guidance-ai/llguidance: Source code + + Example: + ```python + print(build_cfg(query)) + %llguidance {} + + start: "<|GIM_RESPONSE|>" REGEX "<|MASKED id=\"m_0\"|>" m_0 REGEX "<|MASKED id=\"m_1\"|>" m_1 REGEX "<|MASKED id=\"m_2\"|>" m_2 REGEX "<|MASKED id=\"m_3\"|>" m_3 REGEX "<|MASKED id=\"m_4\"|>" m_4 REGEX "<|MASKED id=\"m_5\"|>" m_5 REGEX "<|MASKED id=\"m_6\"|>" m_6 REGEX "<|/GIM_RESPONSE|>" + REGEX: /\s*/ + m_0[capture, suffix="<|/MASKED|>"]: M_0 + M_0: /CO₂|二氧化碳/ + m_1[capture, suffix="<|/MASKED|>"]: M_1 + M_1: /(?s:.*)/ + m_2[capture, suffix="<|/MASKED|>"]: M_2 + M_2: /(?s:.*)/ + m_3[capture, suffix="<|/MASKED|>"]: M_3 + M_3: /(?s:.*)/ + m_4[capture, suffix="<|/MASKED|>"]: M_4 + M_4: /(?s:.*)/ + m_5[capture, suffix="<|/MASKED|>"]: M_5 + M_5: /(?s:.*)/ + m_6[capture, suffix="<|/MASKED|>"]: M_6 + M_6: /(?s:.*)/ + ``` """ num_tags = len(query.tags) - grammar_first_line = f'''start: "{RESPONSE_PREFIX}" {" ".join(f"tag{i}" for i in range(num_tags))} "{RESPONSE_SUFFIX}"''' - - grammar_rest_lines = [] + + # 1. 头部声明 + lines = ["%llguidance {}"] + + # 2. 构建 start 规则 + # 目标格式: start: "PREFIX" REGEX "OPEN_TAG_0" m_0 REGEX "OPEN_TAG_1" m_1 ... REGEX "SUFFIX" + start_parts = [f'"{RESPONSE_PREFIX}"'] + + for i in range(num_tags): + # 添加空白符规则引用 + start_parts.append("REGEX") + + # 添加开始标签的字面量,例如: "<|MASKED id=\"m_0\"|>" + # 注意转义: id=\"m_{i}\" + open_tag_str = f'"{TAG_OPEN_LEFT} id=\\"m_{i}\\"{TAG_OPEN_RIGHT}"' + start_parts.append(open_tag_str) + + # 添加内容规则引用 (小写 m_i) + start_parts.append(f"m_{i}") + + # 添加结尾的空白符和后缀 + start_parts.append("REGEX") + start_parts.append(f'"{RESPONSE_SUFFIX}"') + + lines.append(f"start: {' '.join(start_parts)}") + + # 3. 定义空白符规则 (命名为 REGEX 以匹配你的合法示例,通常也可以叫 WS) + lines.append(r"REGEX: /\s*/") + + # 4. 生成每个 tag 的具体规则 for i, tag in enumerate(query.tags): - # `/(?s:.)*?/` is a non-greedy match for any character including newlines - content_pattern = f"/{tag.regex}/" if tag.regex else "/(?s:.)*?/" - grammar_rest_lines.append( - f'tag{i}: "{TAG_OPEN_LEFT} id=\\"m_{i}\\"{TAG_OPEN_RIGHT}" {content_pattern} "{TAG_END}"' - ) - - grammar = grammar_first_line + "\n" + "\n".join(grammar_rest_lines) + # 注意:配合 suffix 使用时,使用贪婪匹配 /(?s:.*)/ 而不是 /(?s:.)*?/ 是正确且合法的。 + pattern = f"/{tag.regex}/" if tag.regex else "/(?s:.*)/" + + # 规则 m_i (逻辑层): + # - capture: 告诉引擎捕获这个部分。 + # - suffix: 指定结束标签,引擎遇到它会停止并消费它。 + # 注意:这里引用 TAG_END 常量 (即 "<|/MASKED|>") + lines.append(f'm_{i}[capture, suffix="{TAG_END}"]: M_{i}') + + # 规则 M_i (正则层): + # 定义实际的匹配模式 + lines.append(f'M_{i}: {pattern}') + + # TODO: "/(?s:.*)/" 的 tags 可能有很多个, 可以将具有相同 pattern 的规则合并以优化效率 + + # 5. 组合最终字符串 + grammar = "\n".join(lines) + "\n" is_error, msgs = validate_grammar_spec(get_grammar_spec(grammar)) if is_error: From 96ca5a5e3f66fa85cf64420f68edd2d1d9dc2f13 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 23 Jan 2026 16:51:27 +0000 Subject: [PATCH 2/6] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/gimkit/dsls.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/src/gimkit/dsls.py b/src/gimkit/dsls.py index 6fe1340..ba2dd5d 100644 --- a/src/gimkit/dsls.py +++ b/src/gimkit/dsls.py @@ -35,14 +35,14 @@ def validate_grammar_spec(grammar_spec: str) -> tuple[bool, list[str]]: def build_cfg(query: Query) -> str: """Build an LLGuidance context-free grammar (CFG) string based on the query object. - + Constructs a flattened grammar structure compatible with LLGuidance's suffix/capture logic. - Ref: + Ref: - https://github.com/guidance-ai/llguidance/blob/main/docs/syntax.md: Incomplete doc of llguidance grammar syntax - https://github.com/guidance-ai/guidance/blob/main/guidance/_ast.py: LarkSerializer implementation - https://github.com/guidance-ai/llguidance: Source code - + Example: ```python print(build_cfg(query)) @@ -67,30 +67,30 @@ def build_cfg(query: Query) -> str: ``` """ num_tags = len(query.tags) - + # 1. 头部声明 lines = ["%llguidance {}"] # 2. 构建 start 规则 # 目标格式: start: "PREFIX" REGEX "OPEN_TAG_0" m_0 REGEX "OPEN_TAG_1" m_1 ... REGEX "SUFFIX" start_parts = [f'"{RESPONSE_PREFIX}"'] - + for i in range(num_tags): # 添加空白符规则引用 start_parts.append("REGEX") - + # 添加开始标签的字面量,例如: "<|MASKED id=\"m_0\"|>" # 注意转义: id=\"m_{i}\" open_tag_str = f'"{TAG_OPEN_LEFT} id=\\"m_{i}\\"{TAG_OPEN_RIGHT}"' start_parts.append(open_tag_str) - + # 添加内容规则引用 (小写 m_i) start_parts.append(f"m_{i}") # 添加结尾的空白符和后缀 start_parts.append("REGEX") start_parts.append(f'"{RESPONSE_SUFFIX}"') - + lines.append(f"start: {' '.join(start_parts)}") # 3. 定义空白符规则 (命名为 REGEX 以匹配你的合法示例,通常也可以叫 WS) @@ -100,17 +100,17 @@ def build_cfg(query: Query) -> str: for i, tag in enumerate(query.tags): # 注意:配合 suffix 使用时,使用贪婪匹配 /(?s:.*)/ 而不是 /(?s:.)*?/ 是正确且合法的。 pattern = f"/{tag.regex}/" if tag.regex else "/(?s:.*)/" - + # 规则 m_i (逻辑层): # - capture: 告诉引擎捕获这个部分。 # - suffix: 指定结束标签,引擎遇到它会停止并消费它。 # 注意:这里引用 TAG_END 常量 (即 "<|/MASKED|>") lines.append(f'm_{i}[capture, suffix="{TAG_END}"]: M_{i}') - + # 规则 M_i (正则层): # 定义实际的匹配模式 - lines.append(f'M_{i}: {pattern}') - + lines.append(f"M_{i}: {pattern}") + # TODO: "/(?s:.*)/" 的 tags 可能有很多个, 可以将具有相同 pattern 的规则合并以优化效率 # 5. 组合最终字符串 From 8fae664c0efdc933bdd031415d398f5819d5c5cd Mon Sep 17 00:00:00 2001 From: Shichao Song <60967965+Ki-Seki@users.noreply.github.com> Date: Sat, 24 Jan 2026 01:04:03 +0800 Subject: [PATCH 3/6] Update src/gimkit/dsls.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- src/gimkit/dsls.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gimkit/dsls.py b/src/gimkit/dsls.py index ba2dd5d..22ba7b0 100644 --- a/src/gimkit/dsls.py +++ b/src/gimkit/dsls.py @@ -39,7 +39,7 @@ def build_cfg(query: Query) -> str: Constructs a flattened grammar structure compatible with LLGuidance's suffix/capture logic. Ref: - - https://github.com/guidance-ai/llguidance/blob/main/docs/syntax.md: Incomplete doc of llguidance grammar syntax + - https://github.com/guidance-ai/llguidance/blob/main/docs/syntax.md: Incomplete documentation of llguidance grammar syntax - https://github.com/guidance-ai/guidance/blob/main/guidance/_ast.py: LarkSerializer implementation - https://github.com/guidance-ai/llguidance: Source code From 3b3fcc844f9b223dbcce6b3eb3dc08eb133704a8 Mon Sep 17 00:00:00 2001 From: Shichao Song <60967965+Ki-Seki@users.noreply.github.com> Date: Sat, 24 Jan 2026 01:30:29 +0800 Subject: [PATCH 4/6] docs: enhance documentation and some comments --- src/gimkit/dsls.py | 62 +++++++++++++++++----------------------------- 1 file changed, 23 insertions(+), 39 deletions(-) diff --git a/src/gimkit/dsls.py b/src/gimkit/dsls.py index 22ba7b0..4540116 100644 --- a/src/gimkit/dsls.py +++ b/src/gimkit/dsls.py @@ -43,77 +43,61 @@ def build_cfg(query: Query) -> str: - https://github.com/guidance-ai/guidance/blob/main/guidance/_ast.py: LarkSerializer implementation - https://github.com/guidance-ai/llguidance: Source code - Example: + Real-World Example: ```python - print(build_cfg(query)) - %llguidance {} - - start: "<|GIM_RESPONSE|>" REGEX "<|MASKED id=\"m_0\"|>" m_0 REGEX "<|MASKED id=\"m_1\"|>" m_1 REGEX "<|MASKED id=\"m_2\"|>" m_2 REGEX "<|MASKED id=\"m_3\"|>" m_3 REGEX "<|MASKED id=\"m_4\"|>" m_4 REGEX "<|MASKED id=\"m_5\"|>" m_5 REGEX "<|MASKED id=\"m_6\"|>" m_6 REGEX "<|/GIM_RESPONSE|>" - REGEX: /\s*/ - m_0[capture, suffix="<|/MASKED|>"]: M_0 - M_0: /CO₂|二氧化碳/ - m_1[capture, suffix="<|/MASKED|>"]: M_1 - M_1: /(?s:.*)/ - m_2[capture, suffix="<|/MASKED|>"]: M_2 - M_2: /(?s:.*)/ - m_3[capture, suffix="<|/MASKED|>"]: M_3 - M_3: /(?s:.*)/ - m_4[capture, suffix="<|/MASKED|>"]: M_4 - M_4: /(?s:.*)/ - m_5[capture, suffix="<|/MASKED|>"]: M_5 - M_5: /(?s:.*)/ - m_6[capture, suffix="<|/MASKED|>"]: M_6 - M_6: /(?s:.*)/ + query = '<|GIM_QUERY|>The capital of <|MASKED desc="single word" regex="中国|法国"|><|/MASKED|> is Beijing<|MASKED desc="punctuation mark" regex="\\."|><|/MASKED|><|/GIM_QUERY|>' + print(repr(build_cfg(Query(query)))) + >>> '%llguidance {}\nstart: "<|GIM_RESPONSE|>" REGEX "<|MASKED id=\\"m_0\\"|>" m_0 REGEX "<|MASKED id=\\"m_1\\"|>" m_1 REGEX "<|/GIM_RESPONSE|>"\nREGEX: /\\s*/\nm_0[capture, suffix="<|/MASKED|>"]: M_0\nM_0: /中国|法国/\nm_1[capture, suffix="<|/MASKED|>"]: M_1\nM_1: /\\./\n' ``` """ num_tags = len(query.tags) - # 1. 头部声明 + # 1. Header declaration lines = ["%llguidance {}"] - # 2. 构建 start 规则 - # 目标格式: start: "PREFIX" REGEX "OPEN_TAG_0" m_0 REGEX "OPEN_TAG_1" m_1 ... REGEX "SUFFIX" + # 2. Build start rule + # Target format: start: "PREFIX" REGEX "OPEN_TAG_0" m_0 REGEX "OPEN_TAG_1" m_1 ... REGEX "SUFFIX" start_parts = [f'"{RESPONSE_PREFIX}"'] for i in range(num_tags): - # 添加空白符规则引用 + # Add whitespace rule reference start_parts.append("REGEX") - # 添加开始标签的字面量,例如: "<|MASKED id=\"m_0\"|>" - # 注意转义: id=\"m_{i}\" + # Add opening tag literal, e.g.: "<|MASKED id=\"m_0\"|>" + # Note escaping: id=\"m_{i}\" open_tag_str = f'"{TAG_OPEN_LEFT} id=\\"m_{i}\\"{TAG_OPEN_RIGHT}"' start_parts.append(open_tag_str) - # 添加内容规则引用 (小写 m_i) + # Add content rule reference (lowercase m_i) start_parts.append(f"m_{i}") - # 添加结尾的空白符和后缀 + # Add trailing whitespace and suffix start_parts.append("REGEX") start_parts.append(f'"{RESPONSE_SUFFIX}"') lines.append(f"start: {' '.join(start_parts)}") - # 3. 定义空白符规则 (命名为 REGEX 以匹配你的合法示例,通常也可以叫 WS) + # 3. Define whitespace rule (named REGEX to match examples, usually can also be called WS) lines.append(r"REGEX: /\s*/") - # 4. 生成每个 tag 的具体规则 + # 4. Generate specific rules for each tag for i, tag in enumerate(query.tags): - # 注意:配合 suffix 使用时,使用贪婪匹配 /(?s:.*)/ 而不是 /(?s:.)*?/ 是正确且合法的。 + # Note: When used with suffix, using greedy match /(?s:.*)/ instead of /(?s:.)*?/ is correct and legal. pattern = f"/{tag.regex}/" if tag.regex else "/(?s:.*)/" - # 规则 m_i (逻辑层): - # - capture: 告诉引擎捕获这个部分。 - # - suffix: 指定结束标签,引擎遇到它会停止并消费它。 - # 注意:这里引用 TAG_END 常量 (即 "<|/MASKED|>") + # Rule m_i (logical layer): + # - capture: tells the engine to capture this part. + # - suffix: specifies the ending tag, the engine stops and consumes it when encountered. + # Note: Here we reference the TAG_END constant (i.e., "<|/MASKED|>") lines.append(f'm_{i}[capture, suffix="{TAG_END}"]: M_{i}') - # 规则 M_i (正则层): - # 定义实际的匹配模式 + # Rule M_i (regex layer): + # Define the actual matching pattern for this tag. lines.append(f"M_{i}: {pattern}") - # TODO: "/(?s:.*)/" 的 tags 可能有很多个, 可以将具有相同 pattern 的规则合并以优化效率 + # TODO: There may be many tags with "/(?s:.*)/" pattern, which can be inefficient. - # 5. 组合最终字符串 + # 5. Assemble final string grammar = "\n".join(lines) + "\n" is_error, msgs = validate_grammar_spec(get_grammar_spec(grammar)) From 1f91290801b4e83c5cd3905148707b43b6ce96c9 Mon Sep 17 00:00:00 2001 From: Shichao Song <60967965+Ki-Seki@users.noreply.github.com> Date: Sat, 24 Jan 2026 01:46:46 +0800 Subject: [PATCH 5/6] test: update corresponding test --- tests/test_dsls.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/tests/test_dsls.py b/tests/test_dsls.py index fe3ffd1..aeee735 100644 --- a/tests/test_dsls.py +++ b/tests/test_dsls.py @@ -11,16 +11,22 @@ def test_build_cfg(): query = Query('Hello, <|MASKED id="m_0"|>world<|/MASKED|>!') grm = ( - 'start: "<|GIM_RESPONSE|>" tag0 "<|/GIM_RESPONSE|>"\n' - 'tag0: "<|MASKED id=\\"m_0\\"|>" /(?s:.)*?/ "<|/MASKED|>"' + '%llguidance {}\n' + 'start: "<|GIM_RESPONSE|>" REGEX "<|MASKED id=\\"m_0\\"|>" m_0 REGEX "<|/GIM_RESPONSE|>"\n' + 'REGEX: /\\s*/\n' + 'm_0[capture, suffix="<|/MASKED|>"]: M_0\n' + 'M_0: /(?s:.*)/\n' ) assert build_cfg(query) == grm # Test with regex - query_with_regex = Query("Hello, ", MaskedTag(id=0, regex="[A-Za-z]{5}"), "!") + query_with_regex = Query("Hello, ", MaskedTag(id=0, regex=r"\w+\.com"), "!") whole_grammar_regex = ( - 'start: "<|GIM_RESPONSE|>" tag0 "<|/GIM_RESPONSE|>"\n' - 'tag0: "<|MASKED id=\\"m_0\\"|>" /[A-Za-z]{5}/ "<|/MASKED|>"' + '%llguidance {}\n' + 'start: "<|GIM_RESPONSE|>" REGEX "<|MASKED id=\\"m_0\\"|>" m_0 REGEX "<|/GIM_RESPONSE|>"\n' + 'REGEX: /\\s*/\n' + 'm_0[capture, suffix="<|/MASKED|>"]: M_0\n' + 'M_0: /\\w+\\.com/\n' ) assert build_cfg(query_with_regex) == whole_grammar_regex From 09c9c56e11a1a7f36d98b234854130468e7eae32 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 23 Jan 2026 17:47:26 +0000 Subject: [PATCH 6/6] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tests/test_dsls.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/test_dsls.py b/tests/test_dsls.py index aeee735..10190e7 100644 --- a/tests/test_dsls.py +++ b/tests/test_dsls.py @@ -11,22 +11,22 @@ def test_build_cfg(): query = Query('Hello, <|MASKED id="m_0"|>world<|/MASKED|>!') grm = ( - '%llguidance {}\n' + "%llguidance {}\n" 'start: "<|GIM_RESPONSE|>" REGEX "<|MASKED id=\\"m_0\\"|>" m_0 REGEX "<|/GIM_RESPONSE|>"\n' - 'REGEX: /\\s*/\n' + "REGEX: /\\s*/\n" 'm_0[capture, suffix="<|/MASKED|>"]: M_0\n' - 'M_0: /(?s:.*)/\n' + "M_0: /(?s:.*)/\n" ) assert build_cfg(query) == grm # Test with regex query_with_regex = Query("Hello, ", MaskedTag(id=0, regex=r"\w+\.com"), "!") whole_grammar_regex = ( - '%llguidance {}\n' + "%llguidance {}\n" 'start: "<|GIM_RESPONSE|>" REGEX "<|MASKED id=\\"m_0\\"|>" m_0 REGEX "<|/GIM_RESPONSE|>"\n' - 'REGEX: /\\s*/\n' + "REGEX: /\\s*/\n" 'm_0[capture, suffix="<|/MASKED|>"]: M_0\n' - 'M_0: /\\w+\\.com/\n' + "M_0: /\\w+\\.com/\n" ) assert build_cfg(query_with_regex) == whole_grammar_regex