From d79cd9203fb54370b129d4ddf95c604e576476e1 Mon Sep 17 00:00:00 2001 From: Theo Ephraim Date: Thu, 11 Jun 2026 23:10:48 -0700 Subject: [PATCH] feat(indent): three opt-in extensions for non-YAML indentation languages MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Indentation languages that nest tag lines (Pug-like) rather than key/value scalars need three behaviors the indent mode currently hardcodes for YAML. Each is an opt-in IndentConfig field, default off — a grammar declaring none tokenizes byte-identically (all existing gates unchanged). - commentExcept: an exception string after the comment introducer makes the line fall through to tokenization ('//' lines vanish, '//!' doc-comment lines lex as real structural tokens). - rawBlock: verbatim capture introduced from the END of a line (tag:mode filters / content modes) — the mirror image of blockScalar's leading | / >. The introducer must be glued to the line content (no top-level whitespace) or sit at the line lead. - flowColonSeparator: false disables the YAML flow ':' key-separator carve-out, for grammars with ':name'-shaped tokens (bound-attribute shorthand) that legally follow quoted values / flow closes. Specified as engine behavior over toy grammars in test/indent-extensions.ts (21 checks, registered as a core gate). Co-Authored-By: Claude Fable 5 --- src/gen-lexer.ts | 71 +++++++++++++- src/types.ts | 17 ++++ test/check.ts | 1 + test/indent-extensions.ts | 196 ++++++++++++++++++++++++++++++++++++++ 4 files changed, 283 insertions(+), 2 deletions(-) create mode 100644 test/indent-extensions.ts diff --git a/src/gen-lexer.ts b/src/gen-lexer.ts index de52e5e..9aac269 100644 --- a/src/gen-lexer.ts +++ b/src/gen-lexer.ts @@ -304,6 +304,7 @@ export function createLexer(grammar: CstGrammar, intern?: LexerIntern) { const kNewlineModeTok = kOf(newline?.token ?? null); const kIndentTok = kOf(indent?.indentToken ?? null), kDedentTok = kOf(indent?.dedentToken ?? 
null), kIndentNewlineTok = kOf(indent?.newlineToken ?? null); const kBlockScalarTok = kOf(indent?.blockScalar?.token ?? null); + const kRawBlockTok = kOf(indent?.rawBlock?.token ?? null); const kPlainCont = kOf(plainContinuationTokenName); const tColon = puLitOf.get(':') ?? 0; @@ -343,6 +344,14 @@ export function createLexer(grammar: CstGrammar, intern?: LexerIntern) { // denoted, so the "newline-or-EOF" alternation is unchanged. const blockScalarSig = /[|>](?:[1-9][+-]?|[+-][1-9]?|[+-]|)[ \t]*(?:(?<=[ \t])#[^\n]*)?(?:\r?\n|$)/y; if (indent?.blockScalar) indentTokenNames.add(indent.blockScalar.token); + // Raw content blocks: a line-TRAILING introducer (e.g. Pug-style `tag:mode` at end of a line) + // whose SIGNATURE must match from the introducer char through end-of-line. Sticky, like + // blockScalarSig. `introChar` is the first char of the signature's match (a cheap pre-filter). + const rawBlockSig = indent?.rawBlock + ? new RegExp(indent.rawBlock.signature ?? ':(?:[A-Za-z][A-Za-z0-9-]*)?[ \\t]*(?:\\r?\\n|$)', 'y') + : null; + const rawBlockChar = indent?.rawBlock?.introChar ?? ':'; + if (indent?.rawBlock) indentTokenNames.add(indent.rawBlock.token); // Col-0 strings (`---`/`...`) that always end a block scalar — a document boundary outranks // indentation — and, when one heads the introducer's line, mark a document-ROOT scalar. const blockScalarDocMarkers = indent?.blockScalar?.documentMarkers ?? []; @@ -684,7 +693,10 @@ export function createLexer(grammar: CstGrammar, intern?: LexerIntern) { throw new Error(`Tab character used in indentation at offset ${p}`); } } - if (lineComment && source.startsWith(lineComment, p)) { // comment-only line — ignored + if (lineComment && source.startsWith(lineComment, p) + // commentExcept: a comment introducer immediately followed by this string is NOT a + // comment line (e.g. `//` strip-comments vs `//!` doc-comments) — fall through to tokens. 
+ && !(indent?.commentExcept && source.startsWith(indent.commentExcept, p + lineComment.length))) { let e = p; while (e < source.length && source[e] !== '\n') e++; pos = e; pendingComment = true; continue; // next iteration consumes the newline } @@ -904,6 +916,58 @@ export function createLexer(grammar: CstGrammar, intern?: LexerIntern) { continue; } + // ── Raw content block: a line-TRAILING `:mode` introducer (per + // indent.rawBlock.signature, matched at this position through end of line) captures all + // following lines more indented than the introducer's line as ONE verbatim token (blank + // lines included). The introducer must be GLUED to preceding line content (`script:`, + // `article:md`) or sit at the line lead (`:md` — implicit element). The analogue of the + // YAML block scalar above, but introduced at line END rather than by a leading `|`/`>`. ── + if (indent?.rawBlock && flowDepth === 0 && rawBlockSig && source[pos] === rawBlockChar + && ((rawBlockSig.lastIndex = pos), rawBlockSig.test(source))) { + let lineBegin = pos; while (lineBegin > 0 && source[lineBegin - 1] !== '\n') lineBegin--; + const beforeText = source.slice(lineBegin, pos); + // GLUED means: the introducer follows the line's tag-head/attrs with NO top-level + // whitespace anywhere before it (whitespace inside balanced parens/quotes is fine — + // `div(a="1" b):md`). A trailing colon after inline TEXT (`label Size:`) has a + // top-level space, so it stays text and never opens a raw block. 
+ const glued = beforeText.length > 0 && /\S/.test(beforeText) && (() => { + let depth = 0, quote = ''; + const lead = beforeText.match(/^[ \t]*/)![0].length; // leading indentation is fine + for (let i = lead; i < beforeText.length; i++) { + const ch = beforeText[i]; + if (quote) { if (ch === quote) quote = ''; continue; } + if (ch === '"' || ch === "'" || ch === '`') quote = ch; + else if (ch === '(') depth++; + else if (ch === ')') depth = Math.max(0, depth - 1); + else if ((ch === ' ' || ch === '\t') && depth === 0) return false; + } + return true; + })(); + const atLead = /^[ \t]*$/.test(beforeText); + if (glued || atLead) { + const startPos = pos; + const parent = indentStack[indentStack.length - 1]; + let p = pos; while (p < source.length && source[p] !== '\n') p++; if (p < source.length) p++; // skip the header line + while (p < source.length) { + let q = p, c = 0; + while (q < source.length && source[q] === ' ') { q++; c++; } + if (q >= source.length) { p = q; break; } + if (source[q] === '\n' || source[q] === '\r') { // blank line — part of the block + p = q + 1; if (source[q] === '\r' && source[p] === '\n') p++; + continue; + } + if (c > parent) { // content line — more indented than the introducer's line + let e = q; while (e < source.length && source[e] !== '\n') e++; p = e < source.length ? e + 1 : e; + } + else break; // dedent → the raw block ends + } + push(mkNamed(indent.rawBlock.token, source.slice(startPos, p), startPos, kRawBlockTok)); + pos = p; + lineStart = true; + continue; + } + } + // Close an interpolation hole (interpClose at baseline depth) → resume the template span. if (templateStack.length > 0 && source.startsWith(tplInterpClose, pos)) { const depth = templateStack[templateStack.length - 1]; @@ -982,7 +1046,10 @@ export function createLexer(grammar: CstGrammar, intern?: LexerIntern) { // the separator — emit it as the `:` punctuation literal here. 
Gated on flow (block-context `:` // separators are handled by the KEY-position lookaheads). yaml-test-suite 5MUD / 5T43 / 9MMW // / C2DT / K3WX (quoted key) and the flow-collection-key cohort. - if (indent && flowDepth > 0 && source[pos] === ':') { + // flowColonSeparator: false disables the YAML `"key":value` / `}: value` flow + // separator carve-out, for indentation grammars with `:name`-shaped tokens that + // may legally follow a quoted value or a flow-close delimiter. + if (indent && indent.flowColonSeparator !== false && flowDepth > 0 && source[pos] === ':') { const prevTok = tokens[tokens.length - 1]; if (prevTok && (stringTokenNames.has(prevTok.type) || (prevTok.type === '' && flowCloseSet.has(prevTok.text)))) { push(mkPu(':', pos, tColon)); diff --git a/src/types.ts b/src/types.ts index fa335ae..0106fef 100644 --- a/src/types.ts +++ b/src/types.ts @@ -326,6 +326,23 @@ export interface IndentConfig { // control sigil, not content; absent → the block-scalar token's own scope (introducer reads as the // body string). The body always keeps the token scope; only the introducer capture is re-scoped. blockScalar?: { introducers: string[]; token: string; documentMarkers?: string[]; indicatorScope?: string }; + // Set false to disable the YAML flow `:` key-separator carve-out (a `:` glued after a quoted + // scalar / flow-close is forced punctuation). Indentation grammars with `:name`-shaped tokens + // (bound-attribute shorthand) need those to survive after values. Default true (YAML behavior). + flowColonSeparator?: boolean; + // A comment introducer immediately followed by this string is NOT a comment line — it falls + // through to ordinary tokenization (e.g. comment '//' + commentExcept '!' → `//!` doc-comment + // lines lex as real tokens and stay visible to the indent stack, while `//` lines vanish). 
+ commentExcept?: string; + // Raw content blocks: a line-TRAILING introducer (`tag:mode` at end of line, or a bare `:mode` + // at the line lead) captures all following more-indented lines as ONE verbatim token — the + // analogue of `blockScalar` for languages whose raw regions are introduced from the END of a + // line (Pug-style filters/content modes) rather than by a leading `|`/`>`. `signature` is a + // sticky-regex SOURCE matched at the introducer char through end-of-line (default + // `:(?:[A-Za-z][A-Za-z0-9-]*)?[ \t]*(?:\r?\n|$)`); `introChar` is its first char (a cheap + // pre-filter, default ':'). The introducer must be GLUED to the line's content (no top-level + // whitespace before it — whitespace inside balanced parens/quotes is fine) or sit at line lead. + rawBlock?: { token: string; signature?: string; introChar?: string }; // Compact-notation indicators (YAML `-` / `?`): a block entry indicator whose nested node begins // INLINE on the same line (`- item: a`, `? - x`). 
The node's true indentation is then the column // of its first char AFTER the indicator, not the indicator's own column — so a following SIBLING diff --git a/test/check.ts b/test/check.ts index 8754566..bdd8470 100644 --- a/test/check.ts +++ b/test/check.ts @@ -38,6 +38,7 @@ const GATES: Gate[] = [ { group: 'vue', name: 'directives', args: ['test/vue-directives.ts'] }, { group: 'vue', name: 'embed-boundary', args: ['test/vue-embed-boundary.ts'] }, { group: 'vue', name: 'interp-expr', args: ['test/vue-interp-expr.ts'] }, + { group: 'core', name: 'indent-extensions', args: ['test/indent-extensions.ts'] }, { group: 'yaml', name: 'issue12-regressions', args: ['test/yaml-issue12-regressions.ts'] }, { group: 'yaml', name: 'depth-witnesses', args: ['test/yaml-depth-witnesses.ts'] }, { group: 'yaml', name: 'depth-sites', args: ['test/depth-sites.ts'] }, diff --git a/test/indent-extensions.ts b/test/indent-extensions.ts new file mode 100644 index 0000000..dc6526e --- /dev/null +++ b/test/indent-extensions.ts @@ -0,0 +1,196 @@ +// Indent-mode extensions for non-YAML indentation languages, specified as +// engine behavior over TOY grammars (token names and introducer characters +// deliberately unlike any real language — the behaviors are grammar DATA). +// +// Three opt-in IndentConfig fields, each motivated by a Pug-like indentation +// language (one that nests HTML-ish tag lines rather than key/value scalars): +// +// 1. `commentExcept` — two-tier comments: `--` lines vanish (invisible to +// the indent stack, like YAML `#`), but `--!` lines +// are REAL tokens (doc comments that ship to output). +// 2. `rawBlock` — verbatim capture introduced from the END of a line +// (`tag:mode` filters/content modes, Pug-style); the +// mirror image of YAML's leading `|`/`>` blockScalar. +// 3. 
`flowColonSeparator: false` — languages with `:name`-shaped tokens +// (bound-attribute shorthand) need a `:` after a +// quoted value / flow-close to stay a token start, +// not YAML's forced `key: value` separator punct. +// +// All three default OFF — a grammar declaring none (YAML) tokenizes +// byte-identically, which the yaml gates already enforce. +import { token, rule, defineGrammar, alt, many, many1, opt, seq, oneOf, noneOf, range, star, plus, never } from '../src/api.ts'; +import type { IndentConfig } from '../src/types.ts'; +import { createLexer } from '../src/gen-lexer.ts'; + +let ok = 0, fail = 0; +const check = (label: string, cond: boolean) => { cond ? ok++ : (fail++, console.log(' ✗', label)); }; + +type Tok = { type: string; text: string }; +const types = (toks: Tok[]) => toks.map(t => `${t.type || 'punct'}:${t.text}`); +const lexed = (g: ReturnType<typeof defineGrammar>, src: string): Tok[] => createLexer(g as any).tokenize(src) as any; + +// Shared toy tokens +const lower = range('a', 'z'); +const Word = token(plus(lower), { identifier: true }); +const Str = token(seq('"', star(noneOf('"')), '"'), { string: true }); + +// ───────────────────────────────────────────────────────────────────────────── +// 1. commentExcept — exception string after the comment introducer +// ───────────────────────────────────────────────────────────────────────────── +{ + const Indent = token(never(), {}); + const Dedent = token(never(), {}); + const Newline = token(never(), {}); + // `--! …` doc comment: a REAL token. Declared before the skip token (`--` is its prefix). 
+ const DocNote = token(seq('--!', star(noneOf('\n'))), {}); + const Strip = token(seq('--', star(noneOf('\n'))), { skip: true }); + const Line = rule(() => [[alt(Word, DocNote)], [Word]]); + const Lines = rule(() => [[Line, many(Newline, Line)]]); + const Doc = rule(() => [[opt(Lines), opt(Indent), opt(Lines), opt(Dedent)]]); + + const mk = (indent: IndentConfig) => defineGrammar({ + name: 'tiny', tokens: { Indent, Dedent, Newline, DocNote, Strip, Word }, rules: { Line, Lines, Doc }, entry: Doc, indent, + }); + const base: IndentConfig = { indentToken: 'Indent', dedentToken: 'Dedent', newlineToken: 'Newline', comment: '--' }; + const gDefault = mk(base); + const gExcept = mk({ ...base, commentExcept: '!' }); + + // Comment-only lines stay invisible to the indent stack — with or without the option. + check('commentExcept: plain comment lines remain invisible', + types(lexed(gExcept, 'aaa\n-- note\nbbb')).join(' ') === types(lexed(gExcept, 'aaa\nbbb')).join(' ')); + check('commentExcept: a DEEPER comment-only line emits no Indent', + !types(lexed(gExcept, 'aaa\n -- deep note\nbbb')).some(t => t.startsWith('Indent'))); + + // The exception: `--!` lines fall through to tokenization and are REAL tokens. + check('commentExcept: introducer+exception lines tokenize (DocNote token present)', + lexed(gExcept, 'aaa\n--! ship me\nbbb').some(t => t.type === 'DocNote' && t.text === '--! ship me')); + check('commentExcept: doc-comment lines are STRUCTURAL (sibling Newline separation intact)', + types(lexed(gExcept, 'aaa\n--! ship me\nbbb')).join(' ') === + 'Word:aaa Newline: DocNote:--! ship me Newline: Word:bbb'); + + // Default behavior unchanged: without the option, `--!` is swallowed like any `--` line. + check('commentExcept: absent → introducer+exception lines are still swallowed (back-compat)', + !lexed(gDefault, 'aaa\n--! gone\nbbb').some(t => t.type === 'DocNote')); + + // The exception is position-sensitive: it must IMMEDIATELY follow the introducer. 
+ check('commentExcept: `-- !` (space before the exception) is still a comment', + !lexed(gExcept, 'aaa\n-- ! still a comment\nbbb').some(t => t.type === 'DocNote')); +} + +// ───────────────────────────────────────────────────────────────────────────── +// 2. rawBlock — line-TRAILING introducer captures the indented body verbatim +// ───────────────────────────────────────────────────────────────────────────── +{ + const Indent = token(never(), {}); + const Dedent = token(never(), {}); + const Newline = token(never(), {}); + const RawBody = token(never(), {}); + const Line = rule(() => [[Word, opt('(', many(alt(Word, Str)), ')'), opt(RawBody)], [RawBody]]); + const Lines = rule(() => [[Line, many(Newline, Line)]]); + const Doc = rule(() => [[opt(Lines), opt(Indent), opt(Lines), opt(Dedent)]]); + + const mk = (indent: IndentConfig) => defineGrammar({ + name: 'tinyraw', tokens: { Indent, Dedent, Newline, RawBody, Word, Str }, rules: { Line, Lines, Doc }, entry: Doc, indent, + }); + const base: IndentConfig = { + indentToken: 'Indent', dedentToken: 'Dedent', newlineToken: 'Newline', + flowOpen: ['('], flowClose: [')'], + }; + const g = mk({ ...base, rawBlock: { token: 'RawBody' } }); + const gOff = mk(base); + + // Core capture: `word:` at end of line takes all MORE-indented lines as ONE token. + const t1 = lexed(g, 'thing:\n raw one\n raw two\nnext'); + check('rawBlock: trailing `:` captures the indented body as ONE token', + t1.some(t => t.type === 'RawBody' && t.text === ':\n raw one\n raw two\n')); + check('rawBlock: capture ends at dedent — the sibling lexes normally', + t1.some(t => t.type === 'Word' && t.text === 'next')); + + // Named mode: `word:mode` — the mode word is part of the token (introducer line). + check('rawBlock: named mode `thing:md` is captured with the introducer', + lexed(g, 'thing:md\n body').some(t => t.type === 'RawBody' && t.text.startsWith(':md\n'))); + + // Line-lead form: a bare `:mode` at the start of a line. 
+ check('rawBlock: bare `:mode` at line lead opens a block', + lexed(g, ':md\n body').some(t => t.type === 'RawBody')); + + // Blank lines belong to the block; capture still ends at the dedent. + check('rawBlock: interior blank lines are part of the body', + lexed(g, 'thing:\n one\n\n two\nnext').some(t => t.type === 'RawBody' && t.text.includes('one\n\n two'))); + + // GLUE rule: top-level whitespace before the introducer means it is NOT an + // introducer (`label size:` is text ending in a colon, not a raw block) … + check('rawBlock: top-level space before `:` does not open a block', (() => { + try { return !lexed(g, 'label size:\n child').some(t => t.type === 'RawBody'); } + catch { return true; } // a lex error on the stray `:` is also "did not capture" + })()); + // … but whitespace INSIDE balanced parens/quotes does not break the glue. + check('rawBlock: whitespace inside parens keeps the introducer glued', + lexed(g, 'thing(aa "b b" cc):\n body').some(t => t.type === 'RawBody')); + + // Mid-line content after the introducer breaks the signature (must run to EOL). + check('rawBlock: `:` with trailing content on the line is not an introducer', (() => { + try { return !lexed(g, 'thing: not a block\n child').some(t => t.type === 'RawBody'); } + catch { return true; } // a lex error on the stray `:` is also "did not capture" + })()); + + // Inside flow, the introducer char is inert. + check('rawBlock: introducer inside parens is inert', (() => { + try { return !lexed(g, 'thing(aa:\n bb)').some(t => t.type === 'RawBody'); } + catch { return true; } // a lex error is also "did not open a raw block" + })()); + + // Default off: without the config, nothing captures. + check('rawBlock: absent → no capture (back-compat)', (() => { + try { return !lexed(gOff, 'thing:\n raw').some(t => t.type === 'RawBody'); } + catch { return true; } + })()); + + // The introducer is grammar DATA: a custom signature/char works identically. 
+ const gCustom = mk({ ...base, rawBlock: { token: 'RawBody', introChar: '=', signature: '=(?:[a-z]+)?[ \\t]*(?:\\r?\\n|$)' } }); + check('rawBlock: custom introducer char/signature is honored (data-driven)', + lexed(gCustom, 'thing=md\n body').some(t => t.type === 'RawBody' && t.text.startsWith('=md'))); +} + +// ───────────────────────────────────────────────────────────────────────────── +// 3. flowColonSeparator: false — `:name` tokens survive after values in flow +// ───────────────────────────────────────────────────────────────────────────── +{ + const Indent = token(never(), {}); + const Dedent = token(never(), {}); + const Newline = token(never(), {}); + // Bound-attribute-style token: `:name` + const BoundName = token(seq(':', plus(lower)), {}); + const Item = rule(() => [Word, Str, BoundName, ['(', many(Item), ')']]); + const Line = rule(() => [[Word, '(', many(Item), ')']]); + const Doc = rule(() => [[opt(Line)]]); + + const mk = (indent: IndentConfig) => defineGrammar({ + name: 'tinyflow', tokens: { Indent, Dedent, Newline, Word, Str, BoundName }, rules: { Item, Line, Doc }, entry: Doc, indent, + }); + const base: IndentConfig = { + indentToken: 'Indent', dedentToken: 'Dedent', newlineToken: 'Newline', + flowOpen: ['('], flowClose: [')'], + }; + const gYaml = mk(base); // default: YAML behavior + const gOff = mk({ ...base, flowColonSeparator: false }); + + // Default (YAML): a `:` after a quoted value in flow is forced separator punctuation. + check('flowColonSeparator default: `:` after a string is separator punct (YAML behavior)', + lexed(gYaml, 'tag("v" :k)').some(t => t.type === '' && t.text === ':')); + + // Disabled: the same `:` starts the BoundName token. + check('flowColonSeparator false: `:name` after a string lexes as one token', + lexed(gOff, 'tag("v" :k)').some(t => t.type === 'BoundName' && t.text === ':k')); + + // Same carve-out after a flow-CLOSE delimiter, nested so flow depth stays > 0. 
+ check('flowColonSeparator false: `:name` after `)` (still in flow) lexes as one token', + lexed(gOff, 'tag((aa) :k)').some(t => t.type === 'BoundName' && t.text === ':k')); + check('flowColonSeparator default: `:` after `)` splits (YAML behavior preserved)', + !lexed(gYaml, 'tag((aa) :k)').some(t => t.type === 'BoundName')); +} + +console.log(fail === 0 + ? `\n${ok}/${ok} indent-extension checks pass — commentExcept / rawBlock / flowColonSeparator behave as specified` + : `\n${fail} FAILED`); +process.exit(fail === 0 ? 0 : 1);