From d79cd9203fb54370b129d4ddf95c604e576476e1 Mon Sep 17 00:00:00 2001
From: Theo Ephraim <theo@dmno.dev>
Date: Thu, 11 Jun 2026 23:10:48 -0700
Subject: [PATCH] feat(indent): three opt-in extensions for non-YAML
 indentation languages
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Indentation languages that nest tag lines (Pug-like) rather than key/value
scalars need three behaviors the indent mode currently hardcodes for YAML.
Each is an opt-in IndentConfig field; leaving it unset keeps the existing YAML
behavior (so flowColonSeparator defaults to true) — a grammar declaring none
tokenizes byte-identically (all existing gates unchanged).

- commentExcept: an exception string after the comment introducer makes the
  line fall through to tokenization ('//' lines vanish, '//!' doc-comment
  lines lex as real structural tokens).
- rawBlock: verbatim capture introduced from the END of a line (tag:mode
  filters / content modes) — the mirror image of blockScalar's leading | / >.
  The introducer must be glued to the line content (no top-level whitespace)
  or sit at the line lead.
- flowColonSeparator: false disables the YAML flow ':' key-separator
  carve-out, for grammars with ':name'-shaped tokens (bound-attribute
  shorthand) that legally follow quoted values / flow closes.

Specified as engine behavior over toy grammars in test/indent-extensions.ts
(21 checks, registered as a core gate).

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
---
 src/gen-lexer.ts          |  71 +++++++++++++-
 src/types.ts              |  17 ++++
 test/check.ts             |   1 +
 test/indent-extensions.ts | 196 ++++++++++++++++++++++++++++++++++++++
 4 files changed, 283 insertions(+), 2 deletions(-)
 create mode 100644 test/indent-extensions.ts

diff --git a/src/gen-lexer.ts b/src/gen-lexer.ts
index de52e5e..9aac269 100644
--- a/src/gen-lexer.ts
+++ b/src/gen-lexer.ts
@@ -304,6 +304,7 @@ export function createLexer(grammar: CstGrammar, intern?: LexerIntern) {
   const kNewlineModeTok = kOf(newline?.token ?? null);
   const kIndentTok = kOf(indent?.indentToken ?? null), kDedentTok = kOf(indent?.dedentToken ?? null), kIndentNewlineTok = kOf(indent?.newlineToken ?? null);
   const kBlockScalarTok = kOf(indent?.blockScalar?.token ?? null);
+  const kRawBlockTok = kOf(indent?.rawBlock?.token ?? null);
   const kPlainCont = kOf(plainContinuationTokenName);
   const tColon = puLitOf.get(':') ?? 0;
 
@@ -343,6 +344,14 @@ export function createLexer(grammar: CstGrammar, intern?: LexerIntern) {
   // denoted, so the "newline-or-EOF" alternation is unchanged.
   const blockScalarSig = /[|>](?:[1-9][+-]?|[+-][1-9]?|[+-]|)[ \t]*(?:(?<=[ \t])#[^\n]*)?(?:\r?\n|$)/y;
   if (indent?.blockScalar) indentTokenNames.add(indent.blockScalar.token);
+  // Raw content blocks: a line-TRAILING introducer (e.g. Pug-style `tag:mode` at end of a line)
+  // whose SIGNATURE must match from the introducer char through end-of-line. Sticky, like
+  // blockScalarSig. `introChar` is the first char of the signature's match (a cheap pre-filter).
+  const rawBlockSig = indent?.rawBlock
+    ? new RegExp(indent.rawBlock.signature ?? ':(?:[A-Za-z][A-Za-z0-9-]*)?[ \\t]*(?:\\r?\\n|$)', 'y')
+    : null;
+  const rawBlockChar = indent?.rawBlock?.introChar ?? ':';
+  if (indent?.rawBlock) indentTokenNames.add(indent.rawBlock.token);
   // Col-0 strings (`---`/`...`) that always end a block scalar — a document boundary outranks
   // indentation — and, when one heads the introducer's line, mark a document-ROOT scalar.
   const blockScalarDocMarkers = indent?.blockScalar?.documentMarkers ?? [];
@@ -684,7 +693,10 @@ export function createLexer(grammar: CstGrammar, intern?: LexerIntern) {
             throw new Error(`Tab character used in indentation at offset ${p}`);
           }
         }
-        if (lineComment && source.startsWith(lineComment, p)) {             // comment-only line — ignored
+        if (lineComment && source.startsWith(lineComment, p)
+            // commentExcept: a comment introducer immediately followed by this string is NOT a
+            // comment line (e.g. `//` strip-comments vs `//!` doc-comments) — fall through to tokens.
+            && !(indent?.commentExcept && source.startsWith(indent.commentExcept, p + lineComment.length))) {
           let e = p; while (e < source.length && source[e] !== '\n') e++;
           pos = e; pendingComment = true; continue;                         // next iteration consumes the newline
         }
@@ -904,6 +916,58 @@ export function createLexer(grammar: CstGrammar, intern?: LexerIntern) {
         continue;
       }
 
+      // ── Raw content block: a line-TRAILING `:mode` introducer (per
+      // indent.rawBlock.signature, matched at this position through end of line) captures all
+      // following lines more indented than the introducer's line as ONE verbatim token (blank
+      // lines included). The introducer must be GLUED to preceding line content (`script:`,
+      // `article:md`) or sit at the line lead (`:md` — implicit element). The analogue of the
+      // YAML block scalar above, but introduced at line END rather than by a leading `|`/`>`. ──
+      if (indent?.rawBlock && flowDepth === 0 && rawBlockSig && source[pos] === rawBlockChar
+          && ((rawBlockSig.lastIndex = pos), rawBlockSig.test(source))) {   // sticky regex: lastIndex pins the match to `pos`
+        let lineBegin = pos; while (lineBegin > 0 && source[lineBegin - 1] !== '\n') lineBegin--;   // rewind to the start of this line
+        const beforeText = source.slice(lineBegin, pos);                    // everything on the line before the introducer
+        // GLUED means: the introducer follows the line's tag-head/attrs with NO top-level
+        // whitespace anywhere before it (whitespace inside balanced parens/quotes is fine —
+        // `div(a="1" b):md`). A trailing colon after inline TEXT (`label Size:`) has a
+        // top-level space, so it stays text and never opens a raw block.
+        const glued = beforeText.length > 0 && /\S/.test(beforeText) && (() => {   // requires some non-blank text before the introducer
+          let depth = 0, quote = '';                                        // paren nesting depth / currently open quote char ('' = none)
+          const lead = beforeText.match(/^[ \t]*/)![0].length;   // leading indentation is fine
+          for (let i = lead; i < beforeText.length; i++) {
+            const ch = beforeText[i];
+            if (quote) { if (ch === quote) quote = ''; continue; }          // inside a quote: only the same quote char closes it (no escape handling)
+            if (ch === '"' || ch === "'" || ch === '`') quote = ch;
+            else if (ch === '(') depth++;
+            else if (ch === ')') depth = Math.max(0, depth - 1);            // a stray `)` never drives depth negative
+            else if ((ch === ' ' || ch === '\t') && depth === 0) return false;   // top-level whitespace → not glued
+          }
+          return true;                                                      // no top-level whitespace seen (an unclosed quote/paren also ends up here)
+        })();
+        const atLead = /^[ \t]*$/.test(beforeText);                         // nothing but spaces/tabs (or nothing at all) before the introducer
+        if (glued || atLead) {
+          const startPos = pos;                                             // the token starts AT the introducer char
+          const parent = indentStack[indentStack.length - 1];               // top of the indent stack, used as the introducer line's indentation
+          let p = pos; while (p < source.length && source[p] !== '\n') p++; if (p < source.length) p++;  // skip the header line
+          while (p < source.length) {
+            let q = p, c = 0;                                               // q scans the candidate line; c counts its leading spaces
+            while (q < source.length && source[q] === ' ') { q++; c++; }    // only ' ' is counted — a tab stops the count
+            if (q >= source.length) { p = q; break; }                       // ran off the end on spaces alone — they join the block
+            if (source[q] === '\n' || source[q] === '\r') {                 // blank line — part of the block
+              p = q + 1; if (source[q] === '\r' && source[p] === '\n') p++; // step over '\n', '\r', or '\r\n'
+              continue;
+            }
+            if (c > parent) {                                               // content line — more indented than the introducer's line
+              let e = q; while (e < source.length && source[e] !== '\n') e++; p = e < source.length ? e + 1 : e;   // take the line through its '\n' (or to EOF)
+            }
+            else break;                                                     // dedent → the raw block ends
+          }
+          push(mkNamed(indent.rawBlock.token, source.slice(startPos, p), startPos, kRawBlockTok));   // ONE token: introducer through the last captured line
+          pos = p;                                                          // resume at the first line that was NOT captured …
+          lineStart = true;                                                 // … flagged as a line start
+          continue;
+        }                                                                   // neither glued nor at lead: fall through to the tokenization steps below
+      }
+
       // Close an interpolation hole (interpClose at baseline depth) → resume the template span.
       if (templateStack.length > 0 && source.startsWith(tplInterpClose, pos)) {
         const depth = templateStack[templateStack.length - 1];
@@ -982,7 +1046,10 @@ export function createLexer(grammar: CstGrammar, intern?: LexerIntern) {
       // the separator — emit it as the `:` punctuation literal here. Gated on flow (block-context `:`
       // separators are handled by the KEY-position lookaheads). yaml-test-suite 5MUD / 5T43 / 9MMW
       // / C2DT / K3WX (quoted key) and the flow-collection-key cohort.
-      if (indent && flowDepth > 0 && source[pos] === ':') {
+      // flowColonSeparator: false disables the YAML `"key":value` / `}: value` flow
+      // separator carve-out, for indentation grammars with `:name`-shaped tokens that
+      // may legally follow a quoted value or a flow-close delimiter.
+      if (indent && indent.flowColonSeparator !== false && flowDepth > 0 && source[pos] === ':') {
         const prevTok = tokens[tokens.length - 1];
         if (prevTok && (stringTokenNames.has(prevTok.type) || (prevTok.type === '' && flowCloseSet.has(prevTok.text)))) {
           push(mkPu(':', pos, tColon));
diff --git a/src/types.ts b/src/types.ts
index fa335ae..0106fef 100644
--- a/src/types.ts
+++ b/src/types.ts
@@ -326,6 +326,23 @@ export interface IndentConfig {
   // control sigil, not content; absent → the block-scalar token's own scope (introducer reads as the
   // body string). The body always keeps the token scope; only the introducer capture is re-scoped.
   blockScalar?: { introducers: string[]; token: string; documentMarkers?: string[]; indicatorScope?: string };
+  // Set false to disable the YAML flow `:` key-separator carve-out (a `:` glued after a quoted
+  // scalar / flow-close is forced punctuation). Indentation grammars with `:name`-shaped tokens
+  // (bound-attribute shorthand) need those to survive after values. Default true (YAML behavior).
+  flowColonSeparator?: boolean;
+  // A comment introducer immediately followed by this string is NOT a comment line — it falls
+  // through to ordinary tokenization (e.g. comment '//' + commentExcept '!' → `//!` doc-comment
+  // lines lex as real tokens and stay visible to the indent stack, while `//` lines vanish).
+  commentExcept?: string;
+  // Raw content blocks: a line-TRAILING introducer (`tag:mode` at end of line, or a bare `:mode`
+  // at the line lead) captures all following more-indented lines as ONE verbatim token — the
+  // analogue of `blockScalar` for languages whose raw regions are introduced from the END of a
+  // line (Pug-style filters/content modes) rather than by a leading `|`/`>`. `signature` is a
+  // sticky-regex SOURCE matched at the introducer char through end-of-line (default
+  // `:(?:[A-Za-z][A-Za-z0-9-]*)?[ \t]*(?:\r?\n|$)`); `introChar` is its first char (a cheap
+  // pre-filter, default ':' — it MUST be set when `signature` starts with another char). The introducer
+  // must be GLUED to the line's content (no whitespace before it outside balanced parens/quotes) or sit at line lead.
+  rawBlock?: { token: string; signature?: string; introChar?: string };
   // Compact-notation indicators (YAML `-` / `?`): a block entry indicator whose nested node begins
   // INLINE on the same line (`- item: a`, `? - x`). The node's true indentation is then the column
   // of its first char AFTER the indicator, not the indicator's own column — so a following SIBLING
diff --git a/test/check.ts b/test/check.ts
index 8754566..bdd8470 100644
--- a/test/check.ts
+++ b/test/check.ts
@@ -38,6 +38,7 @@ const GATES: Gate[] = [
   { group: 'vue', name: 'directives', args: ['test/vue-directives.ts'] },
   { group: 'vue', name: 'embed-boundary', args: ['test/vue-embed-boundary.ts'] },
   { group: 'vue', name: 'interp-expr', args: ['test/vue-interp-expr.ts'] },
+  { group: 'core', name: 'indent-extensions', args: ['test/indent-extensions.ts'] },
   { group: 'yaml', name: 'issue12-regressions', args: ['test/yaml-issue12-regressions.ts'] },
   { group: 'yaml', name: 'depth-witnesses', args: ['test/yaml-depth-witnesses.ts'] },
   { group: 'yaml', name: 'depth-sites', args: ['test/depth-sites.ts'] },
diff --git a/test/indent-extensions.ts b/test/indent-extensions.ts
new file mode 100644
index 0000000..dc6526e
--- /dev/null
+++ b/test/indent-extensions.ts
@@ -0,0 +1,196 @@
+// Indent-mode extensions for non-YAML indentation languages, specified as
+// engine behavior over TOY grammars (token names and introducer characters
+// deliberately unlike any real language — the behaviors are grammar DATA).
+//
+// Three opt-in IndentConfig fields, each motivated by a Pug-like indentation
+// language (one that nests HTML-ish tag lines rather than key/value scalars):
+//
+//   1. `commentExcept`   — two-tier comments: `--` lines vanish (invisible to
+//                          the indent stack, like YAML `#`), but `--!` lines
+//                          are REAL tokens (doc comments that ship to output).
+//   2. `rawBlock`        — verbatim capture introduced from the END of a line
+//                          (`tag:mode` filters/content modes, Pug-style); the
+//                          mirror image of YAML's leading `|`/`>` blockScalar.
+//   3. `flowColonSeparator: false` — languages with `:name`-shaped tokens
+//                          (bound-attribute shorthand) need a `:` after a
+//                          quoted value / flow-close to stay a token start,
+//                          not YAML's forced `key: value` separator punct.
+//
+// All three default OFF — a grammar declaring none (YAML) tokenizes
+// byte-identically, which the yaml gates already enforce.
+import { token, rule, defineGrammar, alt, many, many1, opt, seq, oneOf, noneOf, range, star, plus, never } from '../src/api.ts';
+import type { IndentConfig } from '../src/types.ts';
+import { createLexer } from '../src/gen-lexer.ts';
+
+let ok = 0, fail = 0;   // running pass / fail tallies, reported by the summary at the end of the file
+const check = (label: string, cond: boolean): void => { if (cond) ok++; else { fail++; console.log('  ✗', label); } };
+
+type Tok = { type: string; text: string };
+const types = (toks: Tok[]): string[] => toks.map(({ type, text }) => `${type || 'punct'}:${text}`);   // untyped (punctuation) tokens print as `punct:<text>`
+const lexed = (g: ReturnType<typeof defineGrammar>, src: string): Tok[] => createLexer(g as any).tokenize(src) as any;
+
+// Toy tokens shared by every section below
+const lower = range('a', 'z');
+const Word = token(plus(lower), { identifier: true });
+const Str = token(seq('"', star(noneOf('"')), '"'), { string: true });
+
+// ─────────────────────────────────────────────────────────────────────────────
+// 1. commentExcept — exception string after the comment introducer
+// ─────────────────────────────────────────────────────────────────────────────
+{
+  const Indent = token(never(), {});    // structural tokens: referenced by name from the IndentConfig below;
+  const Dedent = token(never(), {});    // `never()` presumably matches no source text, so only the lexer's
+  const Newline = token(never(), {});   // indent mode can produce them — TODO confirm against src/api.ts
+  // `--! …` doc comment: a REAL token. Declared before the skip token (`--` is its prefix).
+  const DocNote = token(seq('--!', star(noneOf('\n'))), {});
+  const Strip = token(seq('--', star(noneOf('\n'))), { skip: true });   // `--` through end of line, marked skip
+  const Line = rule(() => [[alt(Word, DocNote)], [Word]]);
+  const Lines = rule(() => [[Line, many(Newline, Line)]]);
+  const Doc = rule(() => [[opt(Lines), opt(Indent), opt(Lines), opt(Dedent)]]);
+
+  const mk = (indent: IndentConfig) => defineGrammar({   // same toy grammar, parameterized only by its IndentConfig
+    name: 'tiny', tokens: { Indent, Dedent, Newline, DocNote, Strip, Word }, rules: { Line, Lines, Doc }, entry: Doc, indent,
+  });
+  const base: IndentConfig = { indentToken: 'Indent', dedentToken: 'Dedent', newlineToken: 'Newline', comment: '--' };
+  const gDefault = mk(base);                             // option absent
+  const gExcept = mk({ ...base, commentExcept: '!' });   // `--` immediately followed by `!` is exempt from comment stripping
+
+  // Comment-only lines stay invisible to the indent stack (exercised here with the option ON only).
+  check('commentExcept: plain comment lines remain invisible',
+    types(lexed(gExcept, 'aaa\n-- note\nbbb')).join(' ') === types(lexed(gExcept, 'aaa\nbbb')).join(' '));
+  check('commentExcept: a DEEPER comment-only line emits no Indent',
+    !types(lexed(gExcept, 'aaa\n    -- deep note\nbbb')).some(t => t.startsWith('Indent')));
+
+  // The exception: `--!` lines fall through to tokenization and are REAL tokens.
+  check('commentExcept: introducer+exception lines tokenize (DocNote token present)',
+    lexed(gExcept, 'aaa\n--! ship me\nbbb').some(t => t.type === 'DocNote' && t.text === '--! ship me'));
+  check('commentExcept: doc-comment lines are STRUCTURAL (sibling Newline separation intact)',
+    types(lexed(gExcept, 'aaa\n--! ship me\nbbb')).join(' ') ===
+    'Word:aaa Newline: DocNote:--! ship me Newline: Word:bbb');
+
+  // Default behavior unchanged: without the option, `--!` is swallowed like any `--` line.
+  check('commentExcept: absent → introducer+exception lines are still swallowed (back-compat)',
+    !lexed(gDefault, 'aaa\n--! gone\nbbb').some(t => t.type === 'DocNote'));
+
+  // The exception is position-sensitive: it must IMMEDIATELY follow the introducer.
+  check('commentExcept: `-- !` (space before the exception) is still a comment',
+    !lexed(gExcept, 'aaa\n-- ! still a comment\nbbb').some(t => t.type === 'DocNote'));
+}
+
+// ─────────────────────────────────────────────────────────────────────────────
+// 2. rawBlock — line-TRAILING introducer captures the indented body verbatim
+// ─────────────────────────────────────────────────────────────────────────────
+{
+  const Indent = token(never(), {});
+  const Dedent = token(never(), {});
+  const Newline = token(never(), {});
+  const RawBody = token(never(), {});   // named by `rawBlock.token` below; produced by the lexer's raw-block capture
+  const Line = rule(() => [[Word, opt('(', many(alt(Word, Str)), ')'), opt(RawBody)], [RawBody]]);
+  const Lines = rule(() => [[Line, many(Newline, Line)]]);
+  const Doc = rule(() => [[opt(Lines), opt(Indent), opt(Lines), opt(Dedent)]]);
+
+  const mk = (indent: IndentConfig) => defineGrammar({   // same toy grammar, parameterized only by its IndentConfig
+    name: 'tinyraw', tokens: { Indent, Dedent, Newline, RawBody, Word, Str }, rules: { Line, Lines, Doc }, entry: Doc, indent,
+  });
+  const base: IndentConfig = {
+    indentToken: 'Indent', dedentToken: 'Dedent', newlineToken: 'Newline',
+    flowOpen: ['('], flowClose: [')'],   // parens are the flow delimiters (used by the "inside parens" check below)
+  };
+  const g = mk({ ...base, rawBlock: { token: 'RawBody' } });   // default signature and introducer char
+  const gOff = mk(base);                                       // rawBlock absent
+
+  // Core capture: `word:` at end of line takes all MORE-indented lines as ONE token.
+  const t1 = lexed(g, 'thing:\n  raw one\n  raw two\nnext');
+  check('rawBlock: trailing `:` captures the indented body as ONE token',
+    t1.some(t => t.type === 'RawBody' && t.text === ':\n  raw one\n  raw two\n'));   // text starts at the introducer, ends after the last body line's newline
+  check('rawBlock: capture ends at dedent — the sibling lexes normally',
+    t1.some(t => t.type === 'Word' && t.text === 'next'));
+
+  // Named mode: `word:mode` — the mode word is part of the token (introducer line).
+  check('rawBlock: named mode `thing:md` is captured with the introducer',
+    lexed(g, 'thing:md\n  body').some(t => t.type === 'RawBody' && t.text.startsWith(':md\n')));
+
+  // Line-lead form: a bare `:mode` at the start of a line.
+  check('rawBlock: bare `:mode` at line lead opens a block',
+    lexed(g, ':md\n  body').some(t => t.type === 'RawBody'));
+
+  // Blank lines belong to the block; capture still ends at the dedent.
+  check('rawBlock: interior blank lines are part of the body',
+    lexed(g, 'thing:\n  one\n\n  two\nnext').some(t => t.type === 'RawBody' && t.text.includes('one\n\n  two')));
+
+  // GLUE rule: top-level whitespace before the introducer means it is NOT an
+  // introducer (`label size:` is text ending in a colon, not a raw block) …
+  check('rawBlock: top-level space before `:` does not open a block', (() => {
+    try { return !lexed(g, 'label size:\n  child').some(t => t.type === 'RawBody'); }
+    catch { return true; }   // a lex error on the stray `:` is also "did not capture"
+  })());
+  // … but whitespace INSIDE balanced parens/quotes does not break the glue.
+  check('rawBlock: whitespace inside parens keeps the introducer glued',
+    lexed(g, 'thing(aa "b b" cc):\n  body').some(t => t.type === 'RawBody'));
+
+  // Mid-line content after the introducer breaks the signature (must run to EOL).
+  check('rawBlock: `:` with trailing content on the line is not an introducer', (() => {
+    try { return !lexed(g, 'thing: not a block\n  child').some(t => t.type === 'RawBody'); }
+    catch { return true; }   // a lex error on the stray `:` is also "did not capture"
+  })());
+
+  // Inside flow, the introducer char is inert.
+  check('rawBlock: introducer inside parens is inert', (() => {
+    try { return !lexed(g, 'thing(aa:\n  bb)').some(t => t.type === 'RawBody'); }
+    catch { return true; }   // a lex error is also "did not open a raw block"
+  })());
+
+  // Default off: without the config, nothing captures.
+  check('rawBlock: absent → no capture (back-compat)', (() => {
+    try { return !lexed(gOff, 'thing:\n  raw').some(t => t.type === 'RawBody'); }
+    catch { return true; }   // any lex error here also counts as "no capture"
+  })());
+
+  // The introducer is grammar DATA: a custom signature/char works identically.
+  // `introChar` and `signature` are overridden TOGETHER (the char is described as the signature's first char).
+  const gCustom = mk({ ...base, rawBlock: { token: 'RawBody', introChar: '=', signature: '=(?:[a-z]+)?[ \\t]*(?:\\r?\\n|$)' } });
+  check('rawBlock: custom introducer char/signature is honored (data-driven)',
+    lexed(gCustom, 'thing=md\n  body').some(t => t.type === 'RawBody' && t.text.startsWith('=md')));
+}
+
+// ─────────────────────────────────────────────────────────────────────────────
+// 3. flowColonSeparator: false — `:name` tokens survive after values in flow
+// ─────────────────────────────────────────────────────────────────────────────
+{
+  const Indent = token(never(), {});
+  const Dedent = token(never(), {});
+  const Newline = token(never(), {});
+  // Bound-attribute-style token: `:name` (a colon followed by one or more lowercase letters)
+  const BoundName = token(seq(':', plus(lower)), {});
+  const Item = rule(() => [Word, Str, BoundName, ['(', many(Item), ')']]);   // recursive: parens may nest
+  const Line = rule(() => [[Word, '(', many(Item), ')']]);
+  const Doc = rule(() => [[opt(Line)]]);
+
+  const mk = (indent: IndentConfig) => defineGrammar({   // same toy grammar, parameterized only by its IndentConfig
+    name: 'tinyflow', tokens: { Indent, Dedent, Newline, Word, Str, BoundName }, rules: { Item, Line, Doc }, entry: Doc, indent,
+  });
+  const base: IndentConfig = {
+    indentToken: 'Indent', dedentToken: 'Dedent', newlineToken: 'Newline',
+    flowOpen: ['('], flowClose: [')'],   // parens are the flow delimiters, so everything inside `tag( … )` is "in flow"
+  };
+  const gYaml = mk(base);                                       // default: YAML behavior
+  const gOff = mk({ ...base, flowColonSeparator: false });      // carve-out disabled
+
+  // Default (YAML): a `:` after a quoted value in flow is forced separator punctuation.
+  check('flowColonSeparator default: `:` after a string is separator punct (YAML behavior)',
+    lexed(gYaml, 'tag("v" :k)').some(t => t.type === '' && t.text === ':'));   // punctuation tokens carry an empty `type`
+
+  // Disabled: the same `:` starts the BoundName token.
+  check('flowColonSeparator false: `:name` after a string lexes as one token',
+    lexed(gOff, 'tag("v" :k)').some(t => t.type === 'BoundName' && t.text === ':k'));
+
+  // Same carve-out after a flow-CLOSE delimiter, nested so flow depth stays > 0.
+  check('flowColonSeparator false: `:name` after `)` (still in flow) lexes as one token',
+    lexed(gOff, 'tag((aa) :k)').some(t => t.type === 'BoundName' && t.text === ':k'));
+  check('flowColonSeparator default: `:` after `)` splits (YAML behavior preserved)',
+    !lexed(gYaml, 'tag((aa) :k)').some(t => t.type === 'BoundName'));
+}
+
+const allPassed = fail === 0;   // the gate passes only when no check failed
+const summary = allPassed ? `\n${ok}/${ok} indent-extension checks pass — commentExcept / rawBlock / flowColonSeparator behave as specified` : `\n${fail} FAILED`;
+console.log(summary);
+process.exit(allPassed ? 0 : 1);