From b397356a9293b1fed7d56586b658ba37db61d67a Mon Sep 17 00:00:00 2001 From: Johnson Chu Date: Thu, 11 Jun 2026 17:53:39 +0800 Subject: [PATCH 01/23] WIP issue #39: total parse/edit with cst.errors - one residual equivalence class parse/edit on the handle API never crash on input: the STRICT pass runs first (valid path byte-identical, full PEG arm exploration - gated by test/recovery.ts section 1 and the untouched parity suite), and only a strict reject re-parses under the recovery machinery: - Repetition recovery at spine-shaped loops (ref / alt-of-refs elements; deep-FIRST hooks measured 273-error cascades from arm probing and were reverted): a failing element absorbs tokens into an $error row up to the element FIRST set / the enclosing seq's follower literal / EOF. - BAR DISCIPLINE keeps recovery equivalence-safe and arm-blind: fires only where parsing is STUCK AT a strict-proven fail point (pos <= bar <= maxPos <= bar+2, stateless so losing arms cannot consume bars); failures past the bars abort the attempt and mint the next bar (32-attempt cap degrades to deterministic free-fire). The runParse safety net obeys the same discipline. - The lexer recovers under the same flag (error tokens + structured diagnostics; window truncation keeps the LEX_RETRY regrow path). - Diagnostics are DERIVED, not collected: $error rows found by descending the structurally-propagated rowRM spine (per-pass candidate lists double-counted under stateless re-adoption); lexer diagnostics live as structured entries formatted at settle time (stored message strings would embed stale offsets), maintained by the window splice and shifted by surgery. - Recovered streams break two strict-era invariants, both fixed: windowed relexing must anchor BELOW the earliest lexer diagnostic before the damage (a dangling quote pairs with a later edit - backward coupling; forward coupling is already guarded by resync equality), and rows built during a recovering pass may under-record their probe watermark when any arm fired recovery (recFires stamping refuses them to strict adoption; relocate-path surgery also normalizes copied prefix rels - an end-relative value below the remapped rowNF boundary would drift on every later length update). - '>' splits disable adoption for the rest of the parse (the frozen damage mapping is invalid after a mid-parse token-index shift). Gates: incremental-verify reworked to total semantics (every step compares tree+errors against a fresh recovering handle, 128 steps 0 mismatch), multi-doc reworked (60 interleaved steps incl. broken text, contract 9/9), 31/31 suite, strict parity 0 mismatches. KNOWN RESIDUAL (test/recovery.ts, not yet registered): typing-through- invalid session diverges at 1 of 20 keystrokes - a strict pass-1 edit ADOPTING over a post-recovery tree drops one Pratt wrap layer vs a fresh strict parse (single-keystroke repro in the gate; suspected adoption interplay with LED chains on recovering-built substrate). --- src/emit-lexer.ts | 25 +- src/emit-parser.ts | 598 +++++++++++++++++++++++++++++++------ test/incremental-verify.ts | 66 ++-- test/multi-doc.ts | 130 ++++---- test/recovery.ts | 120 ++++++++ 5 files changed, 718 insertions(+), 221 deletions(-) create mode 100644 test/recovery.ts diff --git a/src/emit-lexer.ts b/src/emit-lexer.ts index bf2ce1d..3fa7f60 100644 --- a/src/emit-lexer.ts +++ b/src/emit-lexer.ts @@ -108,6 +108,7 @@ export function emitLexer(grammar: CstGrammar, st: LexerSymtab): string | null { emit(`// resync: suffix-zone equality makes a cut token's END mismatch the old one)`); emit(`const LEX_RETRY = { retry: true };`); emit(`let lexWindowMore = false;`); + emit(`let lexSrcBase = 0;`); emit(`const LX_UNI_IDENT = /[$_\\p{ID_Start}][$\\u200c\\u200d\\p{ID_Continue}]*/uy;`); emit(`const LX_UNI_CONT = /[$\\u200c\\u200d\\p{ID_Continue}]+/uy;`); emit(`const LX_UNI_FULL = /^[$_\\p{ID_Start}][$\\u200c\\u200d\\p{ID_Continue}]*/u;`); @@ -175,6 +176,7 @@ export function emitLexer(grammar: CstGrammar, st: LexerSymtab): string | null { emit(`}`); if (templateToken) { emit(`function lexTplSpan(source, pos, validateEscapes) {`); + emit(` const tplFrom = pos;`); emit(` while (pos < source.length) {`); emit(` if (${startsWithExpr('source', 'pos', tplInterpOpen)}) return { endsWithInterp: true, end: pos + ${tplInterpOpen.length} };`); emit(` if (source.charCodeAt(pos) === 92) {`); @@ -182,7 +184,11 @@ export function emitLexer(grammar: CstGrammar, st: LexerSymtab): string | null { emit(` if (validateEscapes) {`); emit(` LX_TPL_ESC.lastIndex = pos;`); emit(` const m = LX_TPL_ESC.exec(source);`); - emit(` if (!m) { if (lexWindowMore) throw LEX_RETRY; throw new Error('Invalid escape sequence in template at offset ' + pos); }`); + emit(` if (!m) {`); + emit(` if (lexWindowMore) throw LEX_RETRY;`); + emit(` if (recovering) { docLex.push({ offset: pos + lexSrcBase, end: pos + lexSrcBase + 1, kind: 1, ch: '' }); pos += 1; continue; }`); + emit(` throw new Error('Invalid escape sequence in template at offset ' + pos);`); + emit(` }`); emit(` pos += m[0].length;`); emit(` } else { pos += 2; }`); } else { @@ -194,6 +200,10 @@ export function emitLexer(grammar: CstGrammar, st: LexerSymtab): string | null { emit(` pos++;`); emit(` }`); emit(` if (lexWindowMore) throw LEX_RETRY;`); + emit(` if (recovering) {`); + emit(` docLex.push({ offset: tplFrom + lexSrcBase, end: source.length + lexSrcBase, kind: 2, ch: '' });`); + emit(` return { endsWithInterp: false, end: source.length };`); + emit(` }`); emit(` throw new Error('Unterminated template literal at offset ' + pos);`); emit(`}`); } @@ -223,6 +233,7 @@ export function emitLexer(grammar: CstGrammar, st: LexerSymtab): string | null { emit(`function lexCore(source, startPos, pvK, pvT, wndPtr0, wndMinOff, wndDelta, wndCs, initParens, srcBase, hasMore) {`); emit(` if (srcBase === undefined) srcBase = 0;`); emit(` lexWindowMore = hasMore === true;`); + emit(` lexSrcBase = srcBase;`); emit(` const n = source.length;`); emit(` let pos = startPos;`); emit(` let pendingNl = false;`); @@ -370,7 +381,11 @@ export function emitLexer(grammar: CstGrammar, st: LexerSymtab): string | null { emit(`${ind} if (m !== null) {`); if (m.identLike) { const plen = (identPrefixByName.get(m.name) ?? '').length; - emit(`${ind} if (!lexIdentValid(m[0], ${plen})) { if (lexWindowMore) throw LEX_RETRY; throw new Error("Invalid identifier escape at offset " + pos + ": '" + m[0] + "'"); }`); + emit(`${ind} if (!lexIdentValid(m[0], ${plen})) {`); + emit(`${ind} if (lexWindowMore) throw LEX_RETRY;`); + emit(`${ind} if (!recovering) throw new Error("Invalid identifier escape at offset " + pos + ": '" + m[0] + "'");`); + emit(`${ind} docLex.push({ offset: pos + lexSrcBase, end: pos + lexSrcBase + m[0].length, kind: 3, ch: m[0] });`); + emit(`${ind} }`); } if (m.skip) { emit(`${ind} if (m[0].includes('\\n')) pendingNl = true;`); @@ -515,6 +530,12 @@ export function emitLexer(grammar: CstGrammar, st: LexerSymtab): string | null { emit(` }`); } emit(` if (lexWindowMore) throw LEX_RETRY;`); + emit(` if (recovering) {`); + emit(` docLex.push({ offset: pos + srcBase, end: pos + srcBase + 1, kind: 0, ch: source[pos] });`); + emit(` tkPush(${st.KIND_NAMED_FALLBACK}, 0, pos, pos + 1);`); + emit(` pos += 1;`); + emit(` continue;`); + emit(` }`); emit(` throw new Error("Unexpected character at offset " + pos + ": '" + source[pos] + "'");`); emit(` }`); emit(` if (wndHit >= 0) { tokN--; return wndHit; }`); diff --git a/src/emit-parser.ts b/src/emit-parser.ts index 4498f64..a5fe226 100644 --- a/src/emit-parser.ts +++ b/src/emit-parser.ts @@ -604,6 +604,7 @@ function analyze(grammar: CstGrammar) { // is >= NAMED_MIN (behaves as "a named token" for the keyword-by-text branch) yet // collides with NO real token-name kind (so matchToken(name) never false-matches it). const KIND_NAMED_FALLBACK = nextKind; + typeKind.set('$error', KIND_NAMED_FALLBACK); const symtab = { KIND_PUNCT, KIND_TEMPLATE_HEAD, KIND_NAMED_MIN, KIND_NAMED_FALLBACK, typeKind, kwLitKind, puLitKind, classifyKey, @@ -614,6 +615,7 @@ function analyze(grammar: CstGrammar) { prattRules, leftRecSet, ruleByName, prattClassified, leftRecClassified, maxBp, templateTokenName, templateTokenNames, firstTokenOf, altDeepFirst, altNullable, altSecond, ledMeta, contMeta, nullableRules, firstSets, symtab, qualKeys, + exprFirst, exprNullable, }; } @@ -715,7 +717,7 @@ class Emitter { // The run-extension target of a repetition: when the body unwraps to a plain ref of // a rule that routes through parseRuleEntry (pratt / left-rec / spine), its rule id; // else -1 (the loop gets no extension hook — adoption stays element-by-element). - quantRunRuleId(body: RuleExpr): number { + quantRunInfo(body: RuleExpr): { rid: number; name: string } | null { const a = this.a; let expr = body; while (true) { @@ -726,10 +728,52 @@ class Emitter { } break; } - if (expr.type !== 'ref' || !a.ruleByName.has(expr.name)) return -1; + if (expr.type !== 'ref' || !a.ruleByName.has(expr.name)) return null; const name = expr.name; - if (!(a.prattRules.has(name) || a.leftRecSet.has(name) || this.spineSet().has(name))) return -1; - return a.grammar.rules.findIndex(r => r.name === name); + if (!(a.prattRules.has(name) || a.leftRecSet.has(name) || this.spineSet().has(name))) return null; + const rid = a.grammar.rules.findIndex(r => r.name === name); + return rid >= 0 ? { rid, name } : null; + } + quantRunRuleId(body: RuleExpr): number { + const info = this.quantRunInfo(body); + return info === null ? -1 : info.rid; + } + // Recovery hooks stay at SPINE-SHAPED repetitions (a plain rule ref or an + // alt of rule refs — statement/member lists): hooking expression-internal + // repetitions lets a bar-armed absorption fire inside longest-match arm probing, + // which distorts arm selection and cascades (measured: 273 errors for one broken + // identifier). An unhooked inner failure escalates to the nearest hooked list, + // which absorbs at statement granularity. + quantRecoverFirst(body: RuleExpr): Set | null { + const a = this.a; + const unwrap = (x: RuleExpr): RuleExpr => { + while (true) { + if (x.type === 'group' && !(x.suppress && x.suppress.length)) { x = x.body; continue; } + if (x.type === 'seq') { + const real = x.items.filter(it => it.type !== 'op' && it.type !== 'prefix' && it.type !== 'postfix'); + if (real.length === 1) { x = real[0]; continue; } + } + return x; + } + }; + const expr = unwrap(body); + const refFirst = (x: RuleExpr): Set | null => { + if (x.type !== 'ref' || !a.ruleByName.has(x.name)) return null; + if (a.nullableRules.has(x.name)) return null; + const fs = a.firstSets.get(x.name); + return fs && fs.size > 0 ? fs : null; + }; + if (expr.type === 'ref') return refFirst(expr); + if (expr.type === 'alt') { + const u = new Set(); + for (const item of expr.items) { + const fs = refFirst(unwrap(item)); + if (fs === null) return null; + for (const k of fs) u.add(k); + } + return u.size > 0 ? u : null; + } + return null; } /** @@ -832,9 +876,15 @@ class Emitter { // flattened inline too — its failure restores to the SAME save point (the whole // matcher fn's _save), exactly like matchSeq's single saved/restore. const parts: string[] = []; - for (const item of expr.items) { + for (let i = 0; i < expr.items.length; i++) { + const item = expr.items[i]; if (item.type === 'op' || item.type === 'prefix' || item.type === 'postfix') continue; + if (item.type === 'quantifier') { + const nx = expr.items[i + 1]; + this.quantFollowT = nx !== undefined && nx.type === 'literal' ? this.litT(nx.value) : -1; + } parts.push(this.matchInto(item, onFail)); + this.quantFollowT = -1; } return parts.join('\n'); } @@ -851,7 +901,11 @@ class Emitter { return lines.join('\n'); } case 'quantifier': - return this.matchQuantifierInto(expr.body, expr.kind, onFail); + { + const closerT = this.quantFollowT; + this.quantFollowT = -1; + return this.matchQuantifierInto(expr.body, expr.kind, onFail, closerT); + } case 'group': { // A suppress-carrying group stages the LED-connector exclusion for the next // parseRule, then matches its body (same as matchExpr 'group'). @@ -890,7 +944,9 @@ class Emitter { // Quantifier: body is matched via a helper fn (pushes + boolean), so the loop here // uses `return`/`break` only against ITS OWN while — no nested-loop hazard. - private matchQuantifierInto(body: RuleExpr, kind: '*' | '+' | '?', onFail: string): string { + private quantFollowT = -1; + litT(value: string): number { return -1; } // bound by emitParser to the punct-literal table + private matchQuantifierInto(body: RuleExpr, kind: '*' | '+' | '?', onFail: string, closerT = -1): string { const fn = this.matchFn(body); if (kind === '?') { // Try once; on failure the helper restored pos/scn itself. @@ -901,14 +957,26 @@ class Emitter { // rule machinery once per element. Only loops over a parseRuleEntry-routed rule // get the hook, and runExtend re-checks rid + generation, so an inner rule's // adoption can never feed elements into an outer loop. - const runId = this.quantRunRuleId(body); + // + // The same loops are the RECOVERY sync points: in recovering mode (second pass, + // entered only after the strict parse rejected) a failing element absorbs tokens + // into an $error node up to the element's FIRST set / a closer / EOF and the + // loop continues — strict-mode behavior is byte-identical (the hook is gated on + // `recovering`, and a SUCCEEDING rule parses identically in both modes). + const runInfo = this.quantRunInfo(body); + const runId = runInfo === null ? -1 : runInfo.rid; const ext = runId >= 0 ? `\n if (adoptRunPos === pos) runExtend(${runId});` : ''; + const recFirst = this.quantRecoverFirst(body); + const csFn = recFirst !== null ? this.membershipFn(recFirst) : 'null'; + const fail = recFirst !== null + ? `if (!${fn}()) { if (!recovering || !recoverSkip(${csFn}, ${closerT})) break; continue; }` + : `if (!${fn}()) break;`; if (kind === '*') { const before = this.id(), bsn = this.id(); return [ `while (true) {`, ` const ${before} = pos; const ${bsn} = scn;`, - ` if (!${fn}()) break;`, + ` ${fail}`, ` if (pos === ${before} && scn === ${bsn}) break;` + ext, `}`, ].join('\n'); @@ -919,7 +987,7 @@ class Emitter { `if (!${fn}()) { ${onFail} }`, `while (true) {`, ` const ${before} = pos; const ${bsn} = scn;`, - ` if (!${fn}()) break;`, + ` ${fail}`, ` if (pos === ${before} && scn === ${bsn}) break;` + ext, `}`, ].join('\n'); @@ -1214,6 +1282,7 @@ class Emitter { export function emitParser(grammar: CstGrammar): string { const a = analyze(grammar); const e = new Emitter(a); + e.litT = (v: string) => a.symtab.puLitKind.get(v) ?? -1; const entry = findEntryRule(grammar); // Grammar-lite for the lexer: ONLY what createLexer reads (tokens, precs, the @@ -1320,8 +1389,11 @@ export function emitParser(grammar: CstGrammar): string { e.emit(`const ENTRY = ${J(entry)};`); // Rule-name table: rowRule stores the index; '$template' takes the slot after the // declared rules (parseTemplateExpr's synthetic node). - e.emit(`const RULE_NAMES = ${J([...grammar.rules.map(r => r.name), '$template'])};`); + e.emit(`const RULE_NAMES = ${J([...grammar.rules.map(r => r.name), '$template', '$error'])};`); e.emit(`const RID_TEMPLATE = ${grammar.rules.length};`); + e.emit(`const RID_ERROR = ${grammar.rules.length + 1};`); + // (recovery sync closers are threaded per-loop from the enclosing seq — see + // quantFollowT; a global closer table froze top-level recovery at any ']'.) e.emit(`const prattRuleNames = new Set(${J([...a.prattRules])});`); // The expression rule the template-interpolation fallback (findExprRule) picks: // first pratt rule that isn't Type, in declaration order. Bake the resolved name. @@ -1527,6 +1599,12 @@ let rowKC = new Uint8Array(8192); // eagerly). rowNF = first kid index (absolute, like rowStart) that may hold an // end-relative value; batch parses never flip, so the decode branch never fires. let rowNF = new Int32Array(8192).fill(0x7fffffff); +// recovery-made bit: the row was memoized during a RECOVERING parse while recovery +// candidates were being created under it — its subtree may contain $error rows, so +// a STRICT pass must not adopt it (an adopted error region would let a strict pass +// 'succeed' over broken text and wipe its diagnostics). Recovering passes adopt +// these rows freely. +let rowRM = new Uint8Array(8192); function ktr(p, k) { const v = kidTokRel[k]; return v < 0 ? v + rowTokLen[p] + 1 : v; } function kcr(p, k) { const v = kidRel[k]; return v < 0 ? v + rowLen[p] + 1 : v; } // transient BUILD coordinates (absolute), valid for rows completed in the current @@ -1561,6 +1639,7 @@ function growRows() { const ok = new Uint8Array(rowCap); ok.set(rowOK); rowOK = ok; const kc = new Uint8Array(rowCap); kc.set(rowKC); rowKC = kc; const nf = new Int32Array(rowCap).fill(0x7fffffff); nf.set(rowNF.subarray(0, nodeN)); rowNF = nf; + const rm = new Uint8Array(rowCap); rm.set(rowRM.subarray(0, nodeN)); rowRM = rm; const ac = new Int32Array(rowCap); ac.set(absChar); absChar = ac; const at = new Int32Array(rowCap); at.set(absTok); absTok = at; } @@ -1619,6 +1698,16 @@ function finishNode(rid, mark) { rowOK[id] = 0; rowKC[id] = 0; rowNF[id] = 0x7fffffff; + rowRM[id] = 0; + // recovery-made propagation: STRUCTURAL — a row contains an error iff a kid is an + // $error row or itself recovery-made. Batch parses never enter the branch. + if (recovering) { + const ke = rowStart[id] + rowCount[id]; + for (let i2 = rowStart[id]; i2 < ke; i2++) { + const e2 = kids[i2]; + if (e2 >= 0 && (rowRM[e2] !== 0 || rowRule[e2] === RID_ERROR)) { rowRM[id] = 1; break; } + } + } absChar[id] = myOff; absTok[id] = myTok; scn = mark; return id; @@ -1655,6 +1744,16 @@ function finishWrap(rid, lhsId, mark) { rowOK[id] = 0; rowKC[id] = 0; rowNF[id] = 0x7fffffff; + rowRM[id] = 0; + // recovery-made propagation: STRUCTURAL — a row contains an error iff a kid is an + // $error row or itself recovery-made. Batch parses never enter the branch. + if (recovering) { + const ke = rowStart[id] + rowCount[id]; + for (let i2 = rowStart[id]; i2 < ke; i2++) { + const e2 = kids[i2]; + if (e2 >= 0 && (rowRM[e2] !== 0 || rowRule[e2] === RID_ERROR)) { rowRM[id] = 1; break; } + } + } absChar[id] = myOff; absTok[id] = myTok; scn = mark; return id; @@ -1726,6 +1825,12 @@ function matchPuLitGT(pu) { ${e.soa ? '' : 'const restText = tkText[pos].slice(1);'} if (tokN === tkCap) growTok(); parenCachePos = -1; + // token indices shift past this point: the OLD-TREE adoption mapping + // (adoptDmg*/adoptDelta, frozen at edit start) is no longer valid — turn + // adoption off for the remainder of this parse (the '>' split is rare; the + // memo generation bump below already isolates the memo) + adoptRoot = -1; + adoptRunPos = -1; tkK.copyWithin(pos + 1, pos, tokN); tkT.copyWithin(pos + 1, pos, tokN); tkOff.copyWithin(pos + 1, pos, tokN); @@ -2201,6 +2306,7 @@ function parseRuleEntry(idx, rid, name, core) { suppressNext = null; const capped = parseLimit >= 0; const start = pos; + const rf0 = recFires; // Capture the arrays together: a '>'-splice inside core() detaches them via // fill(undefined), and the store below must then write into the DETACHED arrays // (i.e. be discarded), exactly like the old per-rule Map did. @@ -2296,7 +2402,10 @@ function parseRuleEntry(idx, rid, name, core) { mx[start] = maxPos; mg[start] = memoGenCur; // the TRUE probe watermark — the +2 read slack (stop token, // SECOND-token dispatch) is applied at INVALIDATION time - if (result >= 0) rowOK[result] = 1; + if (result >= 0) { + rowOK[result] = 1; + if (recovering && recFires !== rf0) rowRM[result] = 1; + } } if (result >= 0) { scPush(result); return true; } @@ -2439,11 +2548,30 @@ function runParse(entryRule) { return er; } if (!RULES[entry]()) { - const hasTok = pos < cap; - throw new Error('Parse error at offset ' + (hasTok ? toff(pos) : 0) + ': unexpected ' + (hasTok ? "'" + tokTextAt(pos) + "'" : 'end of input') + farthest(pos)); + if (!recovering || !recoverArmed()) { + const hasTok = pos < cap; + throw new Error('Parse error at offset ' + (hasTok ? toff(pos) : 0) + ': unexpected ' + (hasTok ? "'" + tokTextAt(pos) + "'" : 'end of input') + farthest(pos)); + } + const mark = scn; + const from = pos; + while (pos < tokN) { scPush(~(pos << 2)); pos++; } + if (pos > maxPos) maxPos = pos; + docDiags.push({ offset: from < tokN ? toff(from) : 0, end: tokN > 0 ? tend(tokN - 1) : 0, message: 'no parse' }); + scPush(finishNode(RID_ERROR, mark)); } if (pos < tokN) { - throw new Error('Parse error at offset ' + toff(pos) + ": unexpected '" + tokTextAt(pos) + "' after successful parse" + farthest(pos)); + if (!recovering || !recoverArmed()) { + throw new Error('Parse error at offset ' + toff(pos) + ": unexpected '" + tokTextAt(pos) + "' after successful parse" + farthest(pos)); + } + // absorb the unconsumed tail and WRAP [root, tail] — only non-repetition entry + // rules can get here (a rep entry absorbs at its own level) + const mark = scn; + const from = pos; + while (pos < tokN) { scPush(~(pos << 2)); pos++; } + if (pos > maxPos) maxPos = pos; + docDiags.push({ offset: toff(from), end: tend(tokN - 1), message: "unexpected '" + tokTextAt(from) + "' after successful parse" }); + scPush(finishNode(RID_ERROR, mark)); + scPush(finishNode(RID_ERROR, 0)); } const rootId = sc[--scn]; rootCharBase = absChar[rootId]; rootTokBase = absTok[rootId]; @@ -2453,14 +2581,7 @@ function runParse(entryRule) { // Source of the last COMPLETED parse — the token columns, arena and memo describe it. // null whenever the module state is not a coherent snapshot (no parse yet, or the last // attempt threw), so parseEdited falls back to a full parse. -// Coherent-edit-base flag: false after a rejected attempt (the next edit falls -// back to a full re-parse of the document text). -let lastOk = false; -// Pieces snapshot of the LIVE tree's text (survives a rejected edit): the reject -// path re-lexes it so the handle keeps reading the previous tree. The document -// pieces above advance on EVERY edit, accepted or rejected — the editor's buffer -// applied the change regardless, and later coordinates are against it. -let treePieces = null; + // the LAST parse root's absolute coordinates (the descent origin — see visit/toObject) let rootCharBase = 0; let rootTokBase = 0; @@ -2532,6 +2653,7 @@ function adoptSeek(q, rid) { let xid = e, xb = cb; for (;;) { if (rowOK[xid] !== 0 && rowRule[xid] === rid + && (recovering || rowRM[xid] === 0) && (q + rowExt[xid] + 2 <= adoptDmgStart || q >= adoptDmgOldEnd)) { return xid; } @@ -2548,6 +2670,136 @@ function adoptSeek(q, rid) { adoptPath.push(id); adoptBase.push(base); } } +// ── Error recovery (the TOTAL second pass) ── +// parse/edit never crash on input: the strict pass runs first (valid inputs take it +// exclusively — byte-identical trees, full PEG alternative exploration), and only a +// strict REJECT re-parses with the recovering flag set. Failing elements absorb +// tokens into $error rows (their leaves keep the CST text-tiling invariant); what +// went wrong lands in docDiags — the cst.errors field. +let recovering = false; +// cst.errors — a VIEW rebuilt per parse/edit from two sources (array identity is +// stable; contents are spliced in place): +// docLex: STRUCTURED lexer diagnostics (kind + position), persistent across edits +// (shifted like any suffix span; the damage window's re-lex replaces its range). +// Messages are FORMATTED at settle time with the CURRENT offset — a stored +// message string would embed a stale offset after shifts. +// parser diagnostics: derived from the TREE — fresh $error rows via the surviving +// recovery candidates, ADOPTED ones by walking the rowRM-marked subtrees that +// adoption reused this pass (a recovering pass adopts error regions wholesale, +// so per-pass collection alone would silently drop their diagnostics). docPar +// keeps the formatted result for the paths that do not re-parse (surgery). +let docDiags = []; +let docLex = []; +let docPar = []; + +function lexMsg(g) { + if (g.kind === 0) return "Unexpected character at offset " + g.offset + ": '" + g.ch + "'"; + if (g.kind === 1) return 'Invalid escape sequence in template at offset ' + g.offset; + if (g.kind === 2) return 'Unterminated template literal at offset ' + g.offset; + return "Invalid identifier escape at offset " + g.offset + ": '" + g.ch + "'"; +} +// ── Recovery BARS: the discipline that keeps recovery equivalence-safe ── +// A repetition element fails constantly during ORDINARY parsing (a statement list +// legitimately ends at 'case'; a losing longest-match arm fails mid-probe). Letting +// recovery fire at any failure absorbs valid text and RESCUES losing arms — and the +// incremental side, which adopts strictly-parsed rows instead of re-probing them, +// would diverge from a fresh recovering parse. Recovery therefore only fires at +// positions a STRICT pass has proven to fail: each attempt runs strictly except at +// the ordered bar list (fire when probing reaches the bar, then disarm); a failure +// past the last bar aborts the attempt, appends the new farthest-fail bar, and the +// pass re-runs (adoption keeps re-runs cheap). Bars are text-determined, so fresh +// and incremental recovering parses are byte-identical by construction. +let recoverBars = []; +let recoverFree = false; // iteration-cap fallback: fire at any failure (still deterministic) +// Monotone count of recovery FIRES (winning or losing arms alike): a rule whose +// parse window saw any fire may have probed LESS than a strict parse would (the +// fire ends a losing arm's exploration early), so its stored watermark cannot be +// trusted by a STRICT adoption — rowRM marks it (structural error containment is +// propagated separately at finishNode). +let recFires = 0; + +// Collect $error rows under an adopted recovery-made subtree: offset/end from the +// row spans, the message re-derived from the first absorbed token — byte-identical +// to what recoverSkip emitted when the row was built. +// Collect every $error row in the FINAL tree by descending only the recovery-made +// spine (rowRM propagates structurally at finishNode): O(error paths), no global +// walk, no per-candidate bookkeeping — losing-arm rows are simply unreachable. +function collectErrRows(id, charBase, tokBase) { + if (rowRule[id] === RID_ERROR) { + if (rowCount[id] > 0) { + const fe = kids[rowStart[id]]; + const ft = tokBase + ((~fe) >>> 2); + docPar.push({ offset: charBase, end: charBase + rowLen[id], message: "unexpected '" + docText(toff(ft), tend(ft)) + "'" }); + } + return; + } + const cs = rowStart[id], n = rowCount[id]; + for (let i = 0; i < n; i++) { + const e = kids[cs + i]; + if (e >= 0 && (rowRM[e] !== 0 || rowRule[e] === RID_ERROR)) { + collectErrRows(e, charBase + kcr(id, cs + i), tokBase + ktr(id, cs + i)); + } + } +} +// Rebuild the cst.errors view: formatted lexer diagnostics + tree-derived parser +// diagnostics (fresh survivors + adopted rowRM subtrees), ordered by offset. +function settleDiags() { + docPar.length = 0; + if (lastRoot >= 0 && (rowRM[lastRoot] !== 0 || rowRule[lastRoot] === RID_ERROR)) { + collectErrRows(lastRoot, rootCharBase, rootTokBase); + } + rebuildDiagView(); +} +function rebuildDiagView() { + docDiags.length = 0; + for (let i = 0; i < docLex.length; i++) { + const g = docLex[i]; + docDiags.push({ offset: g.offset, end: g.end, message: lexMsg(g) }); + } + for (let i = 0; i < docPar.length; i++) docDiags.push(docPar[i]); + docDiags.sort((x, y) => x.offset - y.offset); +} +// Armed iff some bar lies in [pos, maxPos]: the failing element started at/before a +// proven fail point and probing reached it. STATELESS — a losing longest-match arm +// may fire and be discarded without consuming anything (backtrack-safe), legitimate +// repetition ends PAST a bar stay silent (pos > bar), and the runParse safety net +// obeys the same discipline (an ungated net would absorb on the FIRST bar-less +// attempt and pre-empt the whole iteration). +function recoverArmed() { + if (recoverFree) return true; + for (let i = 0; i < recoverBars.length; i++) { + const b = recoverBars[i]; + // armed iff parsing is STUCK AT the bar right now: the failing element starts + // at/before it and the farthest probe sits ON it (+2 read slack). maxPos is + // globally monotone, so without the upper window every loop at pos <= bar + // would arm once anything ever probed past the bar (measured: a fire at + // pos=214 absorbing 8000 tokens). Once a fire absorbs past the bar, maxPos + // leaves the window and lower loops stay silent. + if (pos <= b && b <= maxPos && maxPos <= b + 2) return true; + if (b > maxPos) break; + } + return false; +} +function recoverSkip(canStart, closerT) { + if (!recoverArmed()) return false; + if (pos >= cap) return false; + if (closerT >= 0 && tkK[pos] === K_PUNCT && tkT[pos] === closerT) return false; + const mark = scn; + const from = pos; + // the offending token is consumed unconditionally (it may well be IN the + // element's FIRST set — the element failed past it), then run to a sync point + scPush(~(pos << 2)); pos++; + while (pos < cap + && !(closerT >= 0 && tkK[pos] === K_PUNCT && tkT[pos] === closerT) + && !(canStart !== null && canStart(pos))) { + scPush(~(pos << 2)); pos++; + } + if (pos > maxPos) maxPos = pos; + recFires++; + scPush(finishNode(RID_ERROR, mark)); + return true; +} + // Run-extension: a repetition whose element was just ADOPTED bulk-adopts the // following OLD SIBLINGS in one tight loop — whole-statement reuse without // re-entering parseRuleEntry/adoptSeek once per element. Soundness: each member @@ -2572,6 +2824,7 @@ function runExtend(rid) { if (e < 0) break; if (pb + ktr(P, i) !== oq) break; if (rowRule[e] !== rid || rowOK[e] === 0) break; + if (!recovering && rowRM[e] !== 0) break; const tl = rowTokLen[e]; if (tl === 0) break; const ex = rowExt[e]; @@ -2730,8 +2983,19 @@ function trySurgery(dmgA, dmgB, tokD, chrD) { const ks = kidN; for (let k = 0; k < Da; k++) { kids[ks + k] = kids[csD + k]; - kidRel[ks + k] = kidRel[csD + k]; - kidTokRel[ks + k] = kidTokRel[csD + k]; + // NORMALIZE prefix rels to absolute while copying: the boundary remap below + // puts rowNF at the suffix start, so an end-relative value surviving in the + // copied prefix would never flip down again — its decode would drift by every + // later length update (lengths are still the OLD ones here, so the decode + // bias matches the encoding) + const vtr = kidTokRel[csD + k]; + if (vtr < 0) { + kidTokRel[ks + k] = vtr + rowTokLen[D] + 1; + kidRel[ks + k] = kidRel[csD + k] + rowLen[D] + 1; + } else { + kidRel[ks + k] = kidRel[csD + k]; + kidTokRel[ks + k] = vtr; + } } for (let k = 0; k < f; k++) { const id = sc[k]; @@ -2915,12 +3179,13 @@ function makeDoc() { rowStart: new Int32Array(8192), rowCount: new Int32Array(8192), rowExt: new Int32Array(8192), rowOK: new Uint8Array(8192), rowKC: new Uint8Array(8192), rowNF: new Int32Array(8192).fill(0x7fffffff), + rowRM: new Uint8Array(8192), absChar: new Int32Array(8192), absTok: new Int32Array(8192), rowCap: 8192, nodeN: 0, kids: new Int32Array(16384), kidRel: new Int32Array(16384), kidTokRel: new Int32Array(16384), kidCap: 16384, kidN: 0, memoNode: [], memoEnd: [], memoExt: [], memoGen: [], memoGenCur: 0, - lastOk: false, treePieces: null, + docDiags: [], docLex: [], docPar: [], docPieces: null, docPieceOff: null, docLen: 0, docFlat: null, docCur: 0, rootCharBase: 0, rootTokBase: 0, lastRoot: -1, lastRootTok: 0, ${e.soa ? ' parenCachePos: -1, parenCacheStack: [],' : ''} @@ -2933,12 +3198,12 @@ function saveDoc(d) { d.tkDp = tkDp; d.tkPd = tkPd; d.tkCap = tkCap; d.tokN = tokN; d.srcLenP1 = srcLenP1; d.negFrom = negFrom; d.rowRule = rowRule; d.rowLen = rowLen; d.rowTokLen = rowTokLen; d.rowStart = rowStart; - d.rowCount = rowCount; d.rowExt = rowExt; d.rowOK = rowOK; d.rowKC = rowKC; d.rowNF = rowNF; + d.rowCount = rowCount; d.rowExt = rowExt; d.rowOK = rowOK; d.rowKC = rowKC; d.rowNF = rowNF; d.rowRM = rowRM; d.absChar = absChar; d.absTok = absTok; d.rowCap = rowCap; d.nodeN = nodeN; d.kids = kids; d.kidRel = kidRel; d.kidTokRel = kidTokRel; d.kidCap = kidCap; d.kidN = kidN; d.memoNode = memoNode; d.memoEnd = memoEnd; d.memoExt = memoExt; d.memoGen = memoGen; d.memoGenCur = memoGenCur; - d.lastOk = lastOk; d.treePieces = treePieces; + d.docDiags = docDiags; d.docLex = docLex; d.docPar = docPar; d.docPieces = docPieces; d.docPieceOff = docPieceOff; d.docLen = docLen; d.docFlat = docFlat; d.docCur = docCur; d.rootCharBase = rootCharBase; d.rootTokBase = rootTokBase; d.lastRoot = lastRoot; d.lastRootTok = lastRootTok; @@ -2951,12 +3216,12 @@ function loadDoc(d) { tkDp = d.tkDp; tkPd = d.tkPd; tkCap = d.tkCap; tokN = d.tokN; srcLenP1 = d.srcLenP1; negFrom = d.negFrom; rowRule = d.rowRule; rowLen = d.rowLen; rowTokLen = d.rowTokLen; rowStart = d.rowStart; - rowCount = d.rowCount; rowExt = d.rowExt; rowOK = d.rowOK; rowKC = d.rowKC; rowNF = d.rowNF; + rowCount = d.rowCount; rowExt = d.rowExt; rowOK = d.rowOK; rowKC = d.rowKC; rowNF = d.rowNF; rowRM = d.rowRM; absChar = d.absChar; absTok = d.absTok; rowCap = d.rowCap; nodeN = d.nodeN; kids = d.kids; kidRel = d.kidRel; kidTokRel = d.kidTokRel; kidCap = d.kidCap; kidN = d.kidN; memoNode = d.memoNode; memoEnd = d.memoEnd; memoExt = d.memoExt; memoGen = d.memoGen; memoGenCur = d.memoGenCur; - lastOk = d.lastOk; treePieces = d.treePieces; + docDiags = d.docDiags; docLex = d.docLex; docPar = d.docPar; docPieces = d.docPieces; docPieceOff = d.docPieceOff; docLen = d.docLen; docFlat = d.docFlat; docCur = d.docCur; rootCharBase = d.rootCharBase; rootTokBase = d.rootTokBase; lastRoot = d.lastRoot; lastRootTok = d.lastRootTok; @@ -2987,7 +3252,6 @@ function swapBuffers() { ${e.soa ? '' : 'let altText = [];'} function parseCore(source, entryRule) { - lastOk = false; adoptRoot = -1; adoptRunPos = -1; lexInto(source); @@ -3003,11 +3267,27 @@ function parseCore(source, entryRule) { const root = runParse(entryRule); lastRoot = root; lastRootTok = rootTokBase; - lastOk = true; - treePieces = docPieces.slice(); return root; } +// In-place diagnostic shift for a LOCALLY-strict edit (surgery): diags before the +// damage stay, diags at/after the old damage end ride the char delta, overlapping +// ones drop (their region re-parsed strictly). Splices in place — cst.errors IS +// this array. +// Parser-diag shift for the LOCALLY-strict paths (surgery / strict success): the +// LEXER list is maintained by the window block (which already dropped the re-lexed +// range and shifted the suffix — shifting here would double-apply the delta). +function shiftDiags(a, b, delta) { + let w = 0; + for (let i = 0; i < docPar.length; i++) { + const g = docPar[i]; + if (g.end <= a) docPar[w++] = g; + else if (g.offset >= b) { g.offset += delta; g.end += delta; docPar[w++] = g; } + } + docPar.length = w; + rebuildDiagView(); +} + // ── Incremental re-parse ── // No edit protocol: the caller hands the NEW source; the damage window is DERIVED by // diffing the old and new token columns (longest identical prefix; longest suffix @@ -3023,30 +3303,30 @@ function parseCore(source, entryRule) { // until then. Lexing is FULL-FILE by design: the lexer carries cross-token state // (template nesting, regex context, markup modes), full lexing is a small share of a // parse, and the diff is what localizes the damage — not the lexer. -function editCore(entryRule, edits) { - try { - return editCoreRun(entryRule, edits); - } catch (e) { - // REJECTED edit: the splice (and any '>' splits of the failed attempt) already - // rewrote the token columns to the rejected text, and the append-mode fallback - // may have grown the arena — but the live tree's ROWS are untouched. Re-lexing - // the live tree's source restores every read path (leaf spans, visit, next - // edit's restart anchors); O(n) on the reject path only. - if (treePieces !== null) { - // restore the token columns to the LIVE TREE's text — but the DOCUMENT text - // must stay on the rejected content (lexInto/tokenize resets the doc layer - // as a side effect, so save it around the re-lex) - const kP = docPieces, kO = docPieceOff, kL = docLen, kF = docFlat; - lexInto(treePieces.join('')); - docPieces = kP; docPieceOff = kO; docLen = kL; docFlat = kF; docCur = 0; - lastOk = false; - } - throw e; - } +// Last-resort totality net: a layer without recovery support threw — the handle +// API still never crashes. Zero-width $error root + the thrown message as the +// diagnostic; the next successful parse/edit resumes normal service. +function totalNet(e) { + docDiags.length = 0; + docLex.length = 0; + docPar.length = 0; + docDiags.push({ offset: 0, end: 0, message: String(e && e.message ? e.message : e) }); + scn = 0; + const root = finishNode(RID_ERROR, 0); + lastRoot = root; + lastRootTok = 0; + rootCharBase = 0; + rootTokBase = 0; + return root; +} +function apiMisuse(msg) { + const e = new Error(msg); + e.apiMisuse = true; + return e; } -function editCoreRun(entryRule, edits) { +function editCore(entryRule, edits) { if (edits === undefined || edits.length === 0) { - throw new Error('edit() requires the changes: [{ start, end, text }] (LSP-style - each edit in the coordinates of the document AFTER the preceding edits in the array)'); + throw apiMisuse('edit() requires the changes: [{ start, end, text }] (LSP-style - each edit in the coordinates of the document AFTER the preceding edits in the array)'); } // The engine owns the document text: the new source is BUILT from the changes, // so "the ranges do not match the text" is unrepresentable. Each edit is applied @@ -3055,7 +3335,7 @@ function editCoreRun(entryRule, edits) { // coordinates, the old end recovered through the total delta. V8 cons strings // make the slice+concat construction cheap; the flat-string cost, where a read // path needs one, is the same the caller would have paid building the text. - if (docPieces === null) throw new Error('edit() before parse(): no document'); + if (docPieces === null) throw apiMisuse('edit() before parse(): no document'); const oldLen = docLen; { let dS = 0x7fffffff; @@ -3064,7 +3344,7 @@ function editCoreRun(entryRule, edits) { const ed = edits[i]; const start = ed.start, end = ed.end, text = ed.text; if (!(start >= 0 && start <= end && end <= docLen) || typeof text !== 'string') { - throw new Error('edit() change #' + i + ' out of range: [' + start + ', ' + end + ') of ' + docLen); + throw apiMisuse('edit() change #' + i + ' out of range: [' + start + ', ' + end + ') of ' + docLen); } applyChange(start, end, text); const newEnd = start + text.length; @@ -3076,29 +3356,7 @@ function editCoreRun(entryRule, edits) { editDmgS = dS; editDmgE = dE; } - if (!lastOk) { - // No coherent edit base (a previous attempt rejected): full re-parse in APPEND - // mode — parseCore would reset the arena and destroy the live tree the handle - // still exposes if THIS parse rejects too. parse() is the only compaction point. - const whole = flattenDoc(); - lexInto(whole); - if (memoEnd.length !== MEMO_RULES) { - memoNode = new Array(MEMO_RULES); - memoEnd = new Array(MEMO_RULES); - memoExt = new Array(MEMO_RULES); - memoGen = new Array(MEMO_RULES); - } - memoGenCur++; - adoptRoot = -1; - adoptRunPos = -1; - const root = runParse(entryRule); - lastRoot = root; - lastRootTok = rootTokBase; - lastOk = true; - treePieces = docPieces.slice(); - return root; - } - lastOk = false; + ${e.soa ? String.raw` // ── M1: WINDOWED re-lex ── // Damage envelope from the composed changes: prefix coordinates are shared, the // old end comes back through the total delta. @@ -3110,7 +3368,16 @@ ${e.soa ? String.raw` // ── M1: WINDOWED re-lex ── // Restart anchor: the last token B ending at/before the damage whose recorded // depths are zero and whose shape carries no cross-token lexer flag (')' control- // head, postfix-ambiguous op). B = -1 restarts at the file head — always sound. - const B = findRestart(cs); + // + // RECOVERED streams add a constraint a strict stream never has: a lexer + // diagnostic marks a point whose tokenization can COUPLE BACKWARD to a later + // edit (a dangling quote pairs with a newly typed one, re-lexing everything + // between), so the window must start below the EARLIEST such point before the + // damage. Forward coupling needs no guard — the resync equality only accepts + // exact re-agreement with the old stream. + let anchorCs = cs; + for (let i = 0; i < docLex.length; i++) if (docLex[i].offset < anchorCs) anchorCs = docLex[i].offset; + const B = findRestart(anchorCs); const initParens = reconstructParensCached(B); const oN = tokN; // first old token at/after the damage end — the resync search floor @@ -3133,16 +3400,23 @@ ${e.soa ? String.raw` // ── M1: WINDOWED re-lex ── // an absolute bias; -2 = ran off the window end before resyncing — re-materialize // a larger window and retry (the common case fits the first one). let R0; + const preLexN = docLex.length; // persisted lexer diags; the window's own + // emissions land after this index { let wHi = ceNew + 4096; for (;;) { if (wHi > docLen) wHi = docLen; const windowStr = docText(startOff, wHi); + docLex.length = preLexN; // an aborted attempt re-lexes: drop its pushes tokN = 0; try { R0 = lexCore(windowStr, 0, B >= 0 ? altK[B] : -1, B >= 0 ? altT[B] : 0, r0, ceNew, charDelta, cs, initParens.slice(), startOff, wHi < docLen); } catch (e2) { - if (e2 !== LEX_RETRY) throw e2; + if (e2 !== LEX_RETRY) { + if (recovering) throw e2; // a recovering lexer never throws — a bug + recovering = true; // lex error: the rest of this edit runs in + continue; // the recovering pass (parse included) + } R0 = -2; } if (R0 !== -2) break; @@ -3153,6 +3427,26 @@ ${e.soa ? String.raw` // ── M1: WINDOWED re-lex ── const R = R0 >= 0 ? R0 : oN; swapBuffers(); // live = OLD stream again; window sits in the alt buffers tokN = oN; + // Persisted lexer diagnostics (AFTER the swap-back — toff must decode the OLD + // columns, not the spare window set): entries inside the re-lexed range are + // superseded by the window's own emissions (queued at [preLexN..)); suffix + // entries ride the char delta; prefix entries are untouched. + { + const wndLo = startOff; + const wndHiOld = R < oN ? toff(R) : oldLen; + let w2 = 0; + for (let i = 0; i < preLexN; i++) { + const g = docLex[i]; + if (g.end <= wndLo) docLex[w2++] = g; + else if (g.offset >= wndHiOld) { g.offset += charDelta; g.end += charDelta; docLex[w2++] = g; } + } + // window emissions sit at [preLexN..) in CURRENT coordinates — never shifted; + // compact them down after the kept prefix + if (w2 < preLexN) { + for (let i = preLexN; i < docLex.length; i++) docLex[w2++] = docLex[i]; + docLex.length = w2; + } + } // EOF-relative maintenance: move the negative-zone boundary to THIS edit's suffix // start R. Tokens dropping out of the suffix ([negFrom, R)) flip back to absolute // (they sit at/before the damage now — EOF-unstable); tokens entering it @@ -3245,26 +3539,98 @@ ${e.soa ? String.raw` // ── M1: WINDOWED re-lex ── adoptPath.length = 0; adoptBase.length = 0; adoptRunPos = -1; - const sroot = trySurgery(p, dOldEnd, tokenDelta, charDelta); + const sroot = recovering ? -1 : trySurgery(p, dOldEnd, tokenDelta, charDelta); if (sroot >= 0) { adoptRoot = -1; rootCharBase = toff(adoptRootTok); rootTokBase = adoptRootTok; lastRoot = sroot; lastRootTok = adoptRootTok; - lastOk = true; - treePieces = docPieces.slice(); + shiftDiags(cs, ceOld, charDelta); return sroot; } - const root = runParse(entryRule); + let root; + { + // recovering may already be true here (the window relex recovered a lex error + // and pushed its diagnostics): the first attempt then runs with EMPTY bars — + // strict at the repetition level — and a parse failure flows into the same bar + // iteration. Lex diagnostics are re-seeded into every attempt (the window was + // lexed once; only the parse re-runs). + const lexRecovered = recovering; + const lexSnap = docLex.slice(); + try { + root = runParse(entryRule); + if (!lexRecovered) { + // a strict full pass proves the document free of PARSE errors; persisted + // lexer diagnostics (e.g. an invalid escape outside the damage — its token + // is valid) survive with their shifted positions + docPar.length = 0; + rebuildDiagView(); + } else { + lastRoot = root; + lastRootTok = rootTokBase; + settleDiags(); + } + recovering = false; + } catch (e) { + // total edit: re-run the SAME spliced stream under the bar discipline — + // adoption applies on every attempt (rows that parse strictly are mode- + // neutral), so re-runs stay O(damage)-ish + recovering = true; + const bars = []; + let done = false; + try { + for (let attempt = 0; attempt < 32 && !done; attempt++) { + try { + docLex.length = 0; + for (let i = 0; i < lexSnap.length; i++) docLex.push(lexSnap[i]); + recoverBars = bars; + memoGenCur++; + adoptPath.length = 0; + adoptBase.length = 0; + adoptRunPos = -1; + scn = 0; + root = runParse(entryRule); + done = true; + } catch (e2) { + let b = maxPos; + if (bars.length > 0 && b <= bars[bars.length - 1]) b = bars[bars.length - 1] + 1; + bars.push(b); + } + } + if (!done) { + recoverFree = true; + try { + docLex.length = 0; + for (let i = 0; i < lexSnap.length; i++) docLex.push(lexSnap[i]); + memoGenCur++; + adoptPath.length = 0; + adoptBase.length = 0; + adoptRunPos = -1; + scn = 0; + root = runParse(entryRule); + } catch (e3) { + root = totalNet(e3); + } finally { + recoverFree = false; + } + } + } finally { + recovering = false; + recoverBars = []; + } + lastRoot = root; + lastRootTok = rootTokBase; + settleDiags(); + } + } adoptRoot = -1; lastRoot = root; lastRootTok = rootTokBase; - lastOk = true; - treePieces = docPieces.slice(); return root; } + export { tokenize }; // ── Module-level API: the DEFAULT document (one shared session; tokenize and the // raw tree/tokenAt views read the ACTIVE doc — they are gate/debug surfaces) ── @@ -3295,14 +3661,62 @@ export function createParser() { parse(source, entryRule) { activate(d); entryUsed = entryRule; - gen++; // re-opening resets the arena: old handles die even if THIS parse rejects - const root = parseCore(source, entryRule); - return { d, gen, root }; + gen++; // re-opening resets the arena: old handles die regardless of outcome + docDiags.length = 0; + docLex.length = 0; + docPar.length = 0; + let root; + try { + root = parseCore(source, entryRule); + } catch (e) { + // total parse: the strict pass rejected — iterate recovery under the bar + // discipline (see recoverBars); the iteration cap degrades to free-fire, + // and a recovery-blind layer (fallback lexers) degrades to the zero-width + // $error root. Never a crash. + recovering = true; + const bars = []; + let done = false; + try { + for (let attempt = 0; attempt < 32 && !done; attempt++) { + try { + docLex.length = 0; + recoverBars = bars; + root = parseCore(source, entryRule); + done = true; + } catch (e2) { + let b = maxPos; + if (bars.length > 0 && b <= bars[bars.length - 1]) b = bars[bars.length - 1] + 1; + bars.push(b); + } + } + if (!done) { + recoverFree = true; + try { + docLex.length = 0; + root = parseCore(source, entryRule); + } catch (e3) { + root = totalNet(e3); + } finally { + recoverFree = false; + } + } + } finally { + recovering = false; + recoverBars = []; + } + settleDiags(); + } + return { d, gen, root, errors: docDiags }; }, edit(cst, edits) { chk(cst); activate(d); - cst.root = editCore(entryUsed, edits); + try { + cst.root = editCore(entryUsed, edits); + } catch (e) { + if (e instanceof RangeError || (e && e.apiMisuse)) throw e; + cst.root = totalNet(e); + } }, visit(cst, fns) { chk(cst); activate(d); return visitCore(cst.root, fns); }, tree: view, diff --git a/test/incremental-verify.ts b/test/incremental-verify.ts index 0178d84..bc22810 100644 --- a/test/incremental-verify.ts +++ b/test/incremental-verify.ts @@ -14,7 +14,7 @@ const grammar = (await import('../typescript.ts')).default; const emPath = '/tmp/emitted-incremental.mjs'; writeFileSync(emPath, emitParser(grammar)); type Edit = { start: number; end: number; text: string }; -type Cst = { root: number }; +type Cst = { root: number; errors: { offset: number; end: number; message: string }[] }; type Parser = { parse(s: string): Cst; edit(cst: Cst, edits: Edit[]): void; @@ -28,7 +28,7 @@ type Em = { createParser(): Parser; }; const session = ((await import(emPath + '?session=' + process.pid)) as Em).createParser(); -const fresh = (await import(emPath + '?fresh=' + process.pid)) as Em; +const freshP = ((await import(emPath + '?fresh=' + process.pid)) as Em).createParser(); // Deterministic LCG so failures replay. let seedState = 0x2F6E2B1; @@ -107,26 +107,20 @@ const GLUE: Array<[string, string]> = [ ['f(a, b);\ng(c);\n', 'f(a, bc);\ng(c);\n'], ]; -let steps = 0, equal = 0, bothReject = 0, mismatch = 0; +let steps = 0, equal = 0, withErrors = 0, mismatch = 0; let tInc = 0, tFresh = 0; const failures: string[] = []; for (const [base, edited] of GLUE) { steps++; const c0 = session.parse(base); - let fe: string | null = null, ie: string | null = null; - let fr = -1; - try { fr = fresh.parse(edited); } catch (e) { fe = (e as Error).message; } - try { session.edit(c0, [diffChange(base, edited)]); } catch (e) { ie = (e as Error).message; } - if (fe !== null || ie !== null) { - if ((fe === null) !== (ie === null)) { mismatch++; if (failures.length < 5) failures.push(`glue «${edited.slice(0, 30)}»: fresh ${fe ? 'reject' : 'accept'} / incremental ${ie ? 'reject' : 'accept'}`); } - else bothReject++; - continue; - } - const a = JSON.stringify(objectify(fresh.tree, (fns) => fresh.visit(fr, fns))); - const b = JSON.stringify(objectify(session.tree, (fns) => session.visit(c0, fns))); + session.edit(c0, [diffChange(base, edited)]); + const fc = freshP.parse(edited); + if (fc.errors.length > 0) withErrors++; + const a = JSON.stringify(objectify(freshP.tree, (fns) => freshP.visit(fc, fns))) + JSON.stringify(fc.errors); + const b = JSON.stringify(objectify(session.tree, (fns) => session.visit(c0, fns))) + JSON.stringify(c0.errors); if (a === b) equal++; - else { mismatch++; if (failures.length < 5) failures.push(`glue «${edited.slice(0, 30)}»: tree diverges`); } + else { mismatch++; if (failures.length < 5) failures.push(`glue «${edited.slice(0, 30)}»: tree/errors diverge`); } } for (const f of FILES) { @@ -135,55 +129,31 @@ for (const f of FILES) { for (let k = 0; k < STEPS; k++) { const { next, edit } = mutate(text); steps++; - let freshRoot = -1, freshErr: string | null = null; + // parse/edit are TOTAL: syntax-breaking steps produce error trees compared + // exactly like valid ones (tree AND the errors field, byte-identical) const tf0 = performance.now(); - try { freshRoot = fresh.parse(next); } catch (e) { freshErr = (e as Error).message; } + const fc = freshP.parse(next); const tf1 = performance.now(); - let incErr: string | null = null; const ti0 = performance.now(); - try { session.edit(cst, [edit]); } catch (e) { incErr = (e as Error).message; } + session.edit(cst, [edit]); const ti1 = performance.now(); - if (freshErr !== null || incErr !== null) { - if ((freshErr === null) !== (incErr === null)) { - mismatch++; - if (failures.length < 5) failures.push(`${f.split('/').pop()} step ${k}: fresh ${freshErr ? 'reject' : 'accept'} / incremental ${incErr ? 'reject' : 'accept'}\n fresh: ${freshErr ?? '-'}\n inc: ${incErr ?? '-'}`); - } else bothReject++; - // REJECTED text: the handle stays on the previous tree, but the DOCUMENT - // advances (editor-buffer model — the buffer applied the change regardless, - // and the engine's docSrc tracks it). Model the editor's UNDO: revert via a - // diff edit in the rejected text's coordinates; it must be accepted and - // byte-identical to a fresh parse of the restored text. - try { - session.edit(cst, [diffChange(next, text)]); - const rfr = fresh.parse(text); - const ra = JSON.stringify(objectify(fresh.tree, (fns) => fresh.visit(rfr, fns))); - const rb = JSON.stringify(objectify(session.tree, (fns) => session.visit(cst, fns))); - if (ra !== rb) { - mismatch++; - if (failures.length < 5) failures.push(`${f.split('/').pop()} step ${k}: REVERT tree diverges`); - } - } catch (e2) { - mismatch++; - if (failures.length < 5) failures.push(`${f.split('/').pop()} step ${k}: revert rejected: ${(e2 as Error).message.slice(0, 50)}`); - } - continue; - } + if (fc.errors.length > 0) withErrors++; tFresh += tf1 - tf0; tInc += ti1 - ti0; - const a = JSON.stringify(objectify(fresh.tree, (fns) => fresh.visit(freshRoot, fns))); - const b = JSON.stringify(objectify(session.tree, (fns) => session.visit(cst, fns))); + const a = JSON.stringify(objectify(freshP.tree, (fns) => freshP.visit(fc, fns))) + JSON.stringify(fc.errors); + const b = JSON.stringify(objectify(session.tree, (fns) => session.visit(cst, fns))) + JSON.stringify(cst.errors); if (a === b) equal++; else { mismatch++; if (failures.length < 5) { let i = 0; while (i < a.length && i < b.length && a[i] === b[i]) i++; - failures.push(`${f.split('/').pop()} step ${k}: tree diverges @${i}\n fresh: …${a.slice(Math.max(0, i - 50), i + 50)}…\n inc: …${b.slice(Math.max(0, i - 50), i + 50)}…`); + failures.push(`${f.split('/').pop()} step ${k}: tree/errors diverge @${i}\n fresh: …${a.slice(Math.max(0, i - 50), i + 50)}…\n inc: …${b.slice(Math.max(0, i - 50), i + 50)}…`); } } text = next; } } -console.log(`incremental ≡ fresh: ${equal} equal · ${bothReject} both-reject · ${mismatch} MISMATCH (${steps} steps over ${FILES.length} files)`); +console.log(`incremental ≡ fresh: ${equal} equal (${withErrors} recovered with errors) · ${mismatch} MISMATCH (${steps} steps over ${FILES.length} files)`); if (tInc > 0) console.log(`time: incremental ${tInc.toFixed(1)}ms vs fresh ${tFresh.toFixed(1)}ms → ${(tFresh / tInc).toFixed(2)}× faster on accepted edits`); for (const s of failures) console.log(' ✗ ' + s); if (mismatch > 0) { diff --git a/test/multi-doc.ts b/test/multi-doc.ts index d980cbb..5abe09d 100644 --- a/test/multi-doc.ts +++ b/test/multi-doc.ts @@ -1,22 +1,22 @@ -// Gate: DOCUMENTS ARE ISOLATED. The handle API (createParser → parse/edit with -// explicit tree handles) keeps one document's state per parser instance behind a -// lazily-swapped register set — a missed swap field shows up as cross-document -// corruption. Two instances edit two different sources interleaved (plus the -// module-level default-doc API mixed in between); every edited tree must be -// byte-identical (toObject) to a fresh parse of the same text. Also pins the -// handle contract: stale and foreign handles throw instead of silently reading -// an in-place-mutated tree, and a REJECTED edit leaves the old handle valid. +// Gate: DOCUMENTS ARE ISOLATED and the handle API is TOTAL. Each parser instance +// keeps one document's state behind a lazily-swapped register set — a missed swap +// field shows up as cross-document corruption. Two instances edit two different +// sources interleaved (with the module-level default-doc API mixed in between); +// every edited tree AND its errors field must be byte-identical to a fresh handle +// parse of the same text — syntax-breaking edits included (parse/edit never throw +// on input; the strict→recovering two-pass produces the error tree). Also pins the +// handle contract: in-place edits, API misuse throws, re-opening invalidates. // // node test/multi-doc.ts -import { objectify } from './emitted-obj.ts'; import { writeFileSync } from 'node:fs'; import { emitParser } from '../src/emit-parser.ts'; +import { objectify } from './emitted-obj.ts'; const grammar = (await import('../typescript.ts')).default; const emPath = '/tmp/emitted-multidoc.mjs'; writeFileSync(emPath, emitParser(grammar)); type Edit = { start: number; end: number; text: string }; -type Cst = { root: number }; +type Cst = { root: number; errors: { offset: number; end: number; message: string }[] }; type Parser = { parse(s: string): Cst; edit(cst: Cst, edits: Edit[]): void; visit(cst: Cst, fns: object): void; tree: import('./emitted-obj.ts').TreeView }; type Em = { parse(s: string): number; createParser(): Parser }; const em = (await import(emPath + '?v=' + process.pid)) as Em; @@ -53,115 +53,87 @@ function mutate(text: string): { next: string; edit: Edit } { } } -function diffChange(a: string, b: string): Edit { - const minL = Math.min(a.length, b.length); - let s = 0; - while (s < minL && a.charCodeAt(s) === b.charCodeAt(s)) s++; - let e = 0; - while (e < minL - s && a.charCodeAt(a.length - 1 - e) === b.charCodeAt(b.length - 1 - e)) e++; - return { start: s, end: a.length - e, text: b.slice(s, b.length - e) }; -} - const p1 = em.createParser(); const p2 = em.createParser(); const f = em.createParser(); -let cstA = p1.parse(textA); -let cstB = p2.parse(textB); +const cstA = p1.parse(textA); +const cstB = p2.parse(textB); -let steps = 0, equal = 0, bothReject = 0, mismatch = 0, reverts = 0; +let steps = 0, equal = 0, withErrors = 0, mismatch = 0; const failures: string[] = []; for (let k = 0; k < 60; k++) { const onA = (k & 1) === 0; const text = onA ? textA : textB; const { next, edit } = mutate(text); steps++; - let fe: string | null = null, ie: string | null = null; - let fc: Cst | null = null; - try { fc = f.parse(next); } catch (e) { fe = (e as Error).message; } - try { (onA ? p1 : p2).edit(onA ? cstA : cstB, [edit]); } catch (e) { ie = (e as Error).message; } - if (fe !== null || ie !== null) { - if ((fe === null) !== (ie === null)) { mismatch++; if (failures.length < 5) failures.push(`step ${k} (${onA ? 'A' : 'B'}): fresh ${fe ? 'reject' : 'accept'} / edit ${ie ? 'reject' : 'accept'}`); } - else bothReject++; - // the DOCUMENT advances on reject (editor-buffer model): later coordinates - // are against the rejected text. Model the editor's UNDO: revert to the last - // good text via a diff edit in the rejected text's coordinates — it must be - // ACCEPTED and byte-identical to a fresh parse (the post-reject recovery path - // gets exercised every time a mutation breaks the document). - const good = onA ? textA : textB; - const rv = diffChange(next, good); - try { - (onA ? p1 : p2).edit(onA ? cstA : cstB, [rv]); - const fb = f.parse(good); - const ra = JSON.stringify(objectify(f.tree, (fns) => f.visit(fb, fns))); - const qq = onA ? p1 : p2; - const rb = JSON.stringify(objectify(qq.tree, (fns) => qq.visit(onA ? cstA : cstB, fns))); - if (ra === rb) reverts++; - else { mismatch++; if (failures.length < 5) failures.push(`step ${k} (${onA ? 'A' : 'B'}): REVERT tree diverges`); } - } catch (e2) { - mismatch++; - if (failures.length < 5) failures.push(`step ${k} (${onA ? 'A' : 'B'}): revert rejected: ${(e2 as Error).message.slice(0, 50)}`); - } - continue; - } + // parse/edit are TOTAL: syntax-breaking steps produce error trees compared + // exactly like valid ones (tree AND the errors field, byte-identical) + const fc = f.parse(next); + (onA ? p1 : p2).edit(onA ? cstA : cstB, [edit]); + if (fc.errors.length > 0) withErrors++; // mix the module-level default doc in between: it must not disturb either instance if (k % 5 === 0) em.parse('const mix = ' + k + ';'); - const a = JSON.stringify(objectify(f.tree, (fns) => f.visit(fc!, fns))); + const a = JSON.stringify(objectify(f.tree, (fns) => f.visit(fc, fns))) + JSON.stringify(fc.errors); const q = onA ? p1 : p2; - const b = JSON.stringify(objectify(q.tree, (fns) => q.visit(onA ? cstA : cstB, fns))); + const b = JSON.stringify(objectify(q.tree, (fns) => q.visit(onA ? cstA : cstB, fns))) + JSON.stringify((onA ? cstA : cstB).errors); if (a === b) equal++; else { mismatch++; if (failures.length < 5) { let i = 0; while (i < a.length && a[i] === b[i]) i++; - failures.push(`step ${k} (${onA ? 'A' : 'B'}): tree diverges @${i}`); + failures.push(`step ${k} (${onA ? 'A' : 'B'}): tree/errors diverge @${i}`); } } if (onA) textA = next; else textB = next; } -// handle contract: edit mutates the handle IN PLACE (no return — no clone illusion); -// only parse() re-opening the document invalidates old handles; rejects keep the tree. +// handle contract: edit mutates the handle IN PLACE and is TOTAL — invalid text +// produces an error tree plus cst.errors, never a throw; API MISUSE (no changes, +// foreign handles, out-of-range coordinates) still throws; re-opening via parse() +// invalidates prior handles regardless of outcome. let contract = 0; { const p = em.createParser(); const c1 = p.parse('const a = 1;'); const obj = (h: Cst) => JSON.stringify(objectify(p.tree, (fns) => p.visit(h, fns))); - const before = obj(c1); + if (c1.errors.length === 0) contract++; + else failures.push('valid parse reported errors'); p.edit(c1, [{ start: 7, end: 7, text: 'b' }]); // 'const a = 1;' -> 'const ab = 1;' const after = obj(c1); - if (after !== before && after.includes('"end":8')) contract++; // same handle, new tree + if (after.includes('"end":8') && c1.errors.length === 0) contract++; // same handle, new tree else failures.push('in-place edit did not update the handle'); try { p2.edit(c1, [{ start: 0, end: 1, text: 'q' }]); failures.push('foreign handle did not throw'); } catch { contract++; } - let rejected = false; - try { p.edit(c1, [{ start: 6, end: 8, text: ']' }]); } catch { rejected = true; } // 'const ab…' -> 'const ] = 1;' - if (rejected && obj(c1) === after) contract++; // reject keeps the tree - else failures.push('reject-then-read flow broke'); - // coordinates after a REJECT are against the editor's buffer (the rejected text): - // fixing the same spot in those coordinates must recover the session - let recovered = false; - try { p.edit(c1, [{ start: 6, end: 7, text: 'ab' }]); recovered = true; } catch { /* must not throw */ } - if (recovered && obj(c1).includes('"end":13')) contract++; // 'const ] = 1;' -> 'const ab = 1;' - else failures.push('post-reject coordinates did not track the document text'); - const c2 = p.parse('let q = 1;'); - try { obj(c1); failures.push('re-opened document: old handle did not throw'); } catch { contract++; } - // missing ranges: ONE usage only — edit() without ranges must throw, not - // silently fall back to O(file) diff scans + // an INVALID edit is total: error tree + diagnostics, handle stays live + p.edit(c1, [{ start: 6, end: 8, text: ']' }]); // 'const ab…' -> 'const ] = 1;' + if (c1.errors.length > 0 && obj(c1) !== after) contract++; + else failures.push('invalid edit did not surface errors'); + // fixing it in the editor's coordinates drains the errors + p.edit(c1, [{ start: 6, end: 7, text: 'ab' }]); // -> 'const ab = 1;' + if (c1.errors.length === 0 && obj(c1) === after) contract++; + else failures.push('fixing edit did not drain errors'); + // misuse still throws let needsRanges = false; - try { (p as unknown as { edit(c: Cst): void }).edit(c2); } catch { needsRanges = true; } + try { (p as unknown as { edit(c: Cst): void }).edit(c1); } catch { needsRanges = true; } if (needsRanges) contract++; else failures.push('edit() without changes did not throw'); - // a REJECTING parse() resets the arena too — it must invalidate prior handles - try { p.parse('const ] = ;'); } catch { /* expected reject */ } + let oob = false; + try { p.edit(c1, [{ start: 5, end: 99999, text: '' }]); } catch { oob = true; } + if (oob) contract++; + else failures.push('out-of-range change did not throw'); + // a REJECTING-grammar parse() is total too, and re-opening kills old handles + const c2 = p.parse('const ] = ;'); + if (c2.errors.length > 0) contract++; + else failures.push('invalid parse() reported no errors'); let dead = false; - try { obj(c2); } catch { dead = true; } + try { obj(c1); } catch { dead = true; } if (dead) contract++; - else failures.push('rejecting parse() left the old handle readable over a reset arena'); + else failures.push('re-opened document: old handle did not throw'); } -console.log(`multi-doc: ${equal} equal · ${bothReject} both-reject (${reverts} reverts verified) · ${mismatch} MISMATCH (${steps} interleaved steps) · contract ${contract}/7`); +console.log(`multi-doc: ${equal} equal (${withErrors} recovered with errors) · ${mismatch} MISMATCH (${steps} interleaved steps) · contract ${contract}/9`); for (const s of failures) console.log(' ✗ ' + s); -if (mismatch > 0 || contract !== 7 || failures.length > 0) { +if (mismatch > 0 || contract !== 9 || failures.length > 0) { console.error('✗ document isolation / handle contract violated'); process.exit(1); } -console.log('✓ documents are isolated; handles enforce the in-place-edit contract'); +console.log('✓ documents are isolated; the total in-place handle contract holds'); diff --git a/test/recovery.ts b/test/recovery.ts new file mode 100644 index 0000000..6e378c6 --- /dev/null +++ b/test/recovery.ts @@ -0,0 +1,120 @@ +// Gate: TOTAL PARSING (issue #39). The handle API never crashes on input — every +// text produces a tree plus cst.errors — under three hard invariants: +// +// 1. VALID texts parse byte-identically to the STRICT module-level parse with an +// empty errors field (the strict pass runs first and exclusively; recovery +// cannot perturb the valid path). +// 2. INVALID texts never throw, report errors exactly when strict rejects, parse +// deterministically (same input twice → identical tree + errors), and every +// diagnostic span stays inside the document. +// 3. A TYPING session through transiently-invalid states (the editor reality: +// char-by-char insertion makes most intermediate states invalid) keeps every +// intermediate edit byte-identical to a fresh handle parse — tree and errors. +// +// node test/recovery.ts +import { existsSync, readFileSync, writeFileSync } from 'node:fs'; +import { emitParser } from '../src/emit-parser.ts'; +import { objectify } from './emitted-obj.ts'; + +const grammar = (await import('../typescript.ts')).default; +const emPath = '/tmp/emitted-recovery.mjs'; +writeFileSync(emPath, emitParser(grammar)); +type Edit = { start: number; end: number; text: string }; +type Diag = { offset: number; end: number; message: string }; +type Cst = { root: number; errors: Diag[] }; +type Parser = { parse(s: string): Cst; edit(cst: Cst, edits: Edit[]): void; visit(cst: Cst, fns: object): void; tree: import('./emitted-obj.ts').TreeView }; +type Em = { + parse(s: string): number; + visit(entry: number, fns: object): void; + tree: import('./emitted-obj.ts').TreeView; + createParser(): Parser; +}; +const em = (await import(emPath + '?v=' + process.pid)) as Em; +const p = em.createParser(); +const q = em.createParser(); + +let fails = 0; +const bad = (msg: string) => { fails++; if (fails < 12) console.log(' ✗ ' + msg); }; +const objH = (pp: Parser, c: Cst) => JSON.stringify(objectify(pp.tree, (fns) => pp.visit(c, fns))); + +// ── 1. valid corpus: recovery-capable parse ≡ strict parse, errors empty ── +const VALID: string[] = [ + 'const a = 1;\n', + 'function f(a: number): string { return `${a}`; }\nclass C { m(x: T): T { return x; } }\n', + 'const x = a < b ? c : d;\nfor (const k of ks) { if (k) break; }\n', +]; +for (const f of [ + '/tmp/ts-repo/tests/cases/conformance/fixSignatureCaching.ts', + '/tmp/ts-repo/tests/cases/conformance/parser/ecmascript5/parserRealSource7.ts', +]) if (existsSync(f)) VALID.push(readFileSync(f, 'utf-8')); +let validN = 0; +for (const text of VALID) { + const c = p.parse(text); + const strictRoot = em.parse(text); + const a = objH(p, c); + const b = JSON.stringify(objectify(em.tree, (fns) => em.visit(strictRoot, fns))); + if (a !== b) bad(`valid text: handle tree ≠ strict tree (${text.slice(0, 30)}…)`); + else if (c.errors.length !== 0) bad(`valid text reported ${c.errors.length} errors`); + else validN++; +} + +// ── 2. invalid corpus: total, error-reporting, deterministic, spans in bounds ── +const INVALID: string[] = [ + 'const ] = ;', + 'const a = 1; ]] const b = 2;\n', + 'function f( { return 1; }\n', + 'class C { m( { } \n const after = 1;\n', + 'const s = "unterminated\nconst t = 2;\n', + 'const u = `tpl ${ x ;\n', + 'const v = 1; \\ const w = 2;\n', + 'if (a { b(); }\nconst tail = 3;\n', + '@@@@\n', + '}{)(\n', +]; +let invalidN = 0; +for (const text of INVALID) { + let strictRejects = false; + try { em.parse(text); } catch { strictRejects = true; } + let c: Cst; + try { c = p.parse(text); } catch (e) { bad(`THROWS on «${text.slice(0, 24)}»: ${(e as Error).message.slice(0, 40)}`); continue; } + if (strictRejects !== c.errors.length > 0) { bad(`errors(${c.errors.length}) vs strict ${strictRejects ? 'reject' : 'accept'} on «${text.slice(0, 24)}»`); continue; } + for (const g of c.errors) { + if (!(g.offset >= 0 && g.offset <= g.end && g.end <= text.length && g.message.length > 0)) { + bad(`malformed diagnostic ${JSON.stringify(g)} on «${text.slice(0, 24)}»`); + } + } + const first = objH(p, c) + JSON.stringify(c.errors); + const c2 = p.parse(text); + const second = objH(p, c2) + JSON.stringify(c2.errors); + if (first !== second) { bad(`nondeterministic parse on «${text.slice(0, 24)}»`); continue; } + invalidN++; +} + +// ── 3. typing through invalid states: every keystroke ≡ fresh, tree AND errors ── +const BASE = 'function g(a) {\n return a + 1;\n}\nconst tail = g(2);\n'; +const TYPED = 'const x = f(1, "s");'; +let typedOk = 0; +{ + const at = BASE.indexOf('}\n') + 2; // between the function and the tail stmt + const c = p.parse(BASE); + let text = BASE; + for (let i = 0; i < TYPED.length; i++) { + const ch = TYPED[i]; + const pos = at + i; + p.edit(c, [{ start: pos, end: pos, text: ch }]); + text = text.slice(0, pos) + ch + text.slice(pos); + const fc = q.parse(text); + const a = objH(p, c) + JSON.stringify(c.errors); + const b = objH(q, fc) + JSON.stringify(fc.errors); + if (a !== b) { bad(`keystroke ${i} («${TYPED.slice(0, i + 1)}»): edit ≠ fresh`); break; } + typedOk++; + } + if (c.errors.length !== 0) bad('completed statement still reports errors'); +} + +console.log(`recovery: valid ${validN}/${VALID.length} ≡ strict+clean · invalid ${invalidN}/${INVALID.length} total+deterministic · typing ${typedOk}/${TYPED.length} keystrokes ≡ fresh`); +if (fails > 0) { + console.error('✗ total-parsing contract violated'); + process.exit(1); +} +console.log('✓ parse/edit are total: valid path byte-identical, errors field exact, typing sessions equivalent'); From dc1056801f2273d7a1cb38aa42e8977214ff6ace Mon Sep 17 00:00:00 2001 From: Johnson Chu Date: Thu, 11 Jun 2026 18:16:02 +0800 Subject: [PATCH 02/23] Total parse/edit complete: a latent Pratt watermark hole closed, equivalence gated The residual typing-session divergence traced to a watermark contract violation that PREDATES recovery and was latent in strict incremental parsing: a Pratt rule's winning row is finishNode'd BEFORE its failed LED extension arms run (the NUD/shorter candidate survives the longest match), so rowExt under-records the rule's true probe extent. The memo watermark (maxPos at parseRuleEntry exit) was always correct - but the memo dies with its generation, and ADOPTION reads the row. An edit landing inside a failed arm's reads then kept a stale row alive ('const x = f' adopted with ext=4 while typing ')' at token 20 turns the failed call arm into a successful one). Strict sessions never caught it because the texts that exercise it (unclosed calls) REJECT, and the reject was the firewall; total parsing keeps such trees alive. Fix: write the rule-level watermark back to the row at memo-store time (rowExt[result] = max(rowExt, maxPos - start)). This subsumes the recFires mode stamp (removed - rowRM is purely structural again for the diagnostics walk), restoring broad strict adoption over recovered substrates: broken-state keystrokes on 9MB dropped from ~1.6s to the ~0.3s bar-iteration cost (valid-state keystrokes stay at 0.05ms). test/recovery.ts now fully green and REGISTERED (32/32): valid corpus byte-identical to strict with empty errors, invalid corpus total and deterministic, the char-by-char typing session 20/20 keystrokes equivalent to fresh parses (tree AND errors). The interpreter gains parseTotal/edit parity (no recovery machinery: degrades to a zero-width $error root with the strict diagnostic). incremental-verify 128 steps 0 mismatch, multi-doc 60 steps contract 9/9, strict parity 0 mismatches, lexer streams byte-identical, batch in band (11.2x), agnostic 9/9. --- src/emit-parser.ts | 12 +++++++++++- src/gen-parser.ts | 22 ++++++++++++++++++---- test/check.ts | 1 + 3 files changed, 30 insertions(+), 5 deletions(-) diff --git a/src/emit-parser.ts b/src/emit-parser.ts index a5fe226..3e1bbd1 100644 --- a/src/emit-parser.ts +++ b/src/emit-parser.ts @@ -2404,7 +2404,17 @@ function parseRuleEntry(idx, rid, name, core) { // SECOND-token dispatch) is applied at INVALIDATION time if (result >= 0) { rowOK[result] = 1; - if (recovering && recFires !== rf0) rowRM[result] = 1; + // The row's OWN watermark freezes at finishNode — for a Pratt rule that is + // BEFORE the failed LED extension arms run (the NUD/shorter row survives the + // longest-match), so rowExt under-records the rule's true probe extent and a + // later edit inside a failed arm's reads would not invalidate an adoption. + // The memo watermark (maxPos at exit) is the truth — write it back to the + // row, where adoption can see it after the memo generation dies. (This also + // covers recovering-built rows: a fire that cut a losing arm short is still + // bounded by the recorded probes, so no mode stamp is needed for adoption — + // rowRM stays purely structural for the diagnostics walk.) + const re = maxPos - start; + if (re > rowExt[result]) rowExt[result] = re; } } diff --git a/src/gen-parser.ts b/src/gen-parser.ts index 66b09c2..4a2091f 100644 --- a/src/gen-parser.ts +++ b/src/gen-parser.ts @@ -1484,13 +1484,27 @@ export function createParser(grammar: CstGrammar) { // API parity with the emitted engine's handle surface: edit() re-parses and // updates the SAME tree object in place (the handle is the document's tree — - // edit returns nothing, exactly like the emitted engine; no reuse here). - const edit = (cst: { rule: string; children: unknown[]; offset: number; end: number }, source: string): void => { - const next = parse(source) as typeof cst; + // edit returns nothing, exactly like the emitted engine; no reuse here), and + // both are TOTAL: input errors land in the errors field, never a throw. The + // interpreter has no recovery machinery, so an invalid text degrades to a + // zero-width $error root plus the strict diagnostic. + type Cst = { rule: string; children: unknown[]; offset: number; end: number; errors?: { offset: number; end: number; message: string }[] }; + const parseTotal = (source: string): Cst => { + try { + const t = parse(source) as Cst; + t.errors = []; + return t; + } catch (e) { + return { rule: '$error', children: [], offset: 0, end: 0, errors: [{ offset: 0, end: 0, message: (e as Error).message }] }; + } + }; + const edit = (cst: Cst, source: string): void => { + const next = parseTotal(source); cst.rule = next.rule; cst.children = next.children; cst.offset = next.offset; cst.end = next.end; + cst.errors = next.errors; }; - return { parse, edit, tokenize, profCounts }; + return { parse, parseTotal, edit, tokenize, profCounts }; } // ── Helpers ── diff --git a/test/check.ts b/test/check.ts index 8754566..8850085 100644 --- a/test/check.ts +++ b/test/check.ts @@ -23,6 +23,7 @@ const GATES: Gate[] = [ { group: 'core', name: 'cst-match-totality', args: ['test/cst-match-totality.ts'] }, { group: 'core', name: 'incremental-verify', args: ['test/incremental-verify.ts'] }, { group: 'core', name: 'multi-doc', args: ['test/multi-doc.ts'] }, + { group: 'core', name: 'recovery', args: ['test/recovery.ts'] }, { group: 'core', name: 'issue-cases', args: ['test/test-issues.ts'] }, { group: 'conformance', name: 'js', args: ['test/js-conformance.ts'] }, { group: 'conformance', name: 'tsx', args: ['test/tsx-conformance.ts'] }, From e4fc2f3cbfd299047f8257b5e12329640f6c9a85 Mon Sep 17 00:00:00 2001 From: Johnson Chu Date: Thu, 11 Jun 2026 19:04:35 +0800 Subject: [PATCH 03/23] Gate the expression-splitting ';' injection class MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The seeded mutation lists never inserted a bare ';' — splitting an existing expression's structure (f(a;, b) / (a +; b) / obj.m(;1).n) was covered only by the general machinery, not exercised. Both gates' INSERT pools gain ';' and the glue list gains three explicit break-then-compare pairs; verified break ≡ fresh and restore ≡ original byte-identically (tree and errors) before pinning. Observation for the conformance backlog: several of these broken shapes parse with ZERO errors - the strict grammar itself accepts them (over-accept surface, identical on both engines), not a recovery artifact. --- test/incremental-verify.ts | 6 +++++- test/multi-doc.ts | 2 +- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/test/incremental-verify.ts b/test/incremental-verify.ts index bc22810..6d7b85b 100644 --- a/test/incremental-verify.ts +++ b/test/incremental-verify.ts @@ -35,7 +35,7 @@ let seedState = 0x2F6E2B1; const rand = () => ((seedState = (seedState * 48271) % 0x7fffffff) / 0x7fffffff); const randInt = (n: number) => Math.floor(rand() * n); -const INSERTS = ['x', '_v', '42', ' + y', '.m', '()', ' /*c*/ ', '"s"', 'await ', '!', '?']; +const INSERTS = ['x', '_v', '42', ' + y', '.m', '()', ' /*c*/ ', '"s"', 'await ', '!', '?', ';', '; ']; const STMTS = ['const q9 = 1;\n', 'function g9(a) { return a; }\n', 'if (x9) { y9(); }\n', '// note\n', 'type T9 = string | number;\n']; // Mutations return the edit RANGE too, so half the steps can exercise the edits @@ -105,6 +105,10 @@ const GLUE: Array<[string, string]> = [ ['const t = a + b;\n', 'const t = a ++ b;\n'], ['const u = x(z);\n', 'const u = x>(z);\n'], ['f(a, b);\ng(c);\n', 'f(a, bc);\ng(c);\n'], + // expression-splitting ';' injections (structure breaks, not appended garbage) + ['const x = a + b;\n', 'const x = a; + b;\n'], + ['const y = (a + b) * c;\n', 'const y = (a +; b) * c;\n'], + ['const z = obj.m(1).n;\n', 'const z = obj.m(;1).n;\n'], ]; let steps = 0, equal = 0, withErrors = 0, mismatch = 0; diff --git a/test/multi-doc.ts b/test/multi-doc.ts index 5abe09d..f5af760 100644 --- a/test/multi-doc.ts +++ b/test/multi-doc.ts @@ -33,7 +33,7 @@ let textB = `(function () {\n${mk('beta', 300)}})();\n`; let seed = 0x51C0FFEE; const rand = () => ((seed = (seed * 48271) % 0x7fffffff) / 0x7fffffff); const randInt = (n: number) => Math.floor(rand() * n); -const INS = ['x', '1', ' + q', '.m', '(/*c*/)', '"s"']; +const INS = ['x', '1', ' + q', '.m', '(/*c*/)', '"s"', ';']; function mutate(text: string): { next: string; edit: Edit } { switch (randInt(3)) { case 0: { From 05c6284926a1111ad4c514e237956d2401b4144b Mon Sep 17 00:00:00 2001 From: Johnson Chu Date: Thu, 11 Jun 2026 19:40:04 +0800 Subject: [PATCH 04/23] =?UTF-8?q?Cross-grammar=20incremental=20gate:=20all?= =?UTF-8?q?=207=20grammars,=20edit=20=E2=89=A1=20fresh=20+=20self-consiste?= =?UTF-8?q?ncy?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The incremental/recovery gates were TypeScript-only while every grammar shares the emitted runtime - the non-TS incremental behavior (markup lexer modes, the fallback-lexer path, other token algebras) was ungated. test/incremental-grammars.ts closes that: generative inputs (grammar-gen) per grammar x seeded char-level edit sessions, each step checking (1) edited tree + errors byte-identical to a fresh handle parse, (2) tree self-consistency - every span inside its ancestors (the engine-internal invariant an external compare misses when both sides share a corruption; the aggressiveChecks idea), and (3) totality. It immediately found three real holes, all fixed: - totalNet pushed its diagnostic into the VIEW layer, which the next settle rebuild wiped on exactly one side (now a kind-4 source entry formatted at settle - verbatim engine message). - the fallback-lexer full-relex path never cleared persisted docLex, so a totality-net diagnostic outlived the edit that fixed the text. - the window resync retracts the duplicated token push (tokN--) but left the lexer diagnostic emitted FOR that token: the persisted entry survives via the suffix shift AND the window's copy stayed - the same character double-reported. Retraction now pops the window's own entries at/after the retracted token (lexDiagBase floor). 672/672 steps across typescript/javascript/typescriptreact/ javascriptreact/yaml/html/vue (489 exercising recovery). 33/33 suite, lexer streams byte-identical, parser parity 0 mismatches, batch in band. --- src/emit-lexer.ts | 8 +- src/emit-parser.ts | 13 ++- test/check.ts | 1 + test/incremental-grammars.ts | 154 +++++++++++++++++++++++++++++++++++ 4 files changed, 171 insertions(+), 5 deletions(-) create mode 100644 test/incremental-grammars.ts diff --git a/src/emit-lexer.ts b/src/emit-lexer.ts index 3fa7f60..c336b37 100644 --- a/src/emit-lexer.ts +++ b/src/emit-lexer.ts @@ -109,6 +109,7 @@ export function emitLexer(grammar: CstGrammar, st: LexerSymtab): string | null { emit(`const LEX_RETRY = { retry: true };`); emit(`let lexWindowMore = false;`); emit(`let lexSrcBase = 0;`); + emit(`let lexDiagBase = 0; // docLex floor for the current window (its own emissions sit above)`); emit(`const LX_UNI_IDENT = /[$_\\p{ID_Start}][$\\u200c\\u200d\\p{ID_Continue}]*/uy;`); emit(`const LX_UNI_CONT = /[$\\u200c\\u200d\\p{ID_Continue}]+/uy;`); emit(`const LX_UNI_FULL = /^[$_\\p{ID_Start}][$\\u200c\\u200d\\p{ID_Continue}]*/u;`); @@ -288,7 +289,10 @@ export function emitLexer(grammar: CstGrammar, st: LexerSymtab): string | null { emit(` return LX_DIVK[k] !== 0 || LX_DIVT[t] !== 0;`); emit(` }`); emit(` while (pos < n) {`); - emit(` if (wndHit >= 0) { tokN--; return wndHit; }`); + emit(` // resync retracts the duplicated token push — and any lexer diagnostics + // emitted FOR it (the old stream's persisted entry survives via the shift; + // keeping the window's copy too double-reports the same character)`); + emit(` if (wndHit >= 0) { tokN--; while (docLex.length > lexDiagBase && docLex[docLex.length - 1].offset >= tkOff[tokN]) docLex.length--; return wndHit; }`); emit(` const cc = source.charCodeAt(pos);`); emit(` // whitespace: ASCII \\s run by char loop; a non-ASCII candidate falls back to the regex`); emit(` if (cc === 32 || (cc >= 9 && cc <= 13)) {`); @@ -538,7 +542,7 @@ export function emitLexer(grammar: CstGrammar, st: LexerSymtab): string | null { emit(` }`); emit(` throw new Error("Unexpected character at offset " + pos + ": '" + source[pos] + "'");`); emit(` }`); - emit(` if (wndHit >= 0) { tokN--; return wndHit; }`); + emit(` if (wndHit >= 0) { tokN--; while (docLex.length > lexDiagBase && docLex[docLex.length - 1].offset >= tkOff[tokN]) docLex.length--; return wndHit; }`); emit(` return hasMore ? -2 : -1;`); emit(`}`); emit(`// Windowed-relex restart anchor: the last token B ending at/before the damage`); diff --git a/src/emit-parser.ts b/src/emit-parser.ts index 3e1bbd1..14451a9 100644 --- a/src/emit-parser.ts +++ b/src/emit-parser.ts @@ -2706,7 +2706,8 @@ function lexMsg(g) { if (g.kind === 0) return "Unexpected character at offset " + g.offset + ": '" + g.ch + "'"; if (g.kind === 1) return 'Invalid escape sequence in template at offset ' + g.offset; if (g.kind === 2) return 'Unterminated template literal at offset ' + g.offset; - return "Invalid identifier escape at offset " + g.offset + ": '" + g.ch + "'"; + if (g.kind === 3) return "Invalid identifier escape at offset " + g.offset + ": '" + g.ch + "'"; + return g.ch; // kind 4: a verbatim engine message (the totality net) } // ── Recovery BARS: the discipline that keeps recovery equivalence-safe ── // A repetition element fails constantly during ORDINARY parsing (a statement list @@ -3317,10 +3318,12 @@ function shiftDiags(a, b, delta) { // API still never crashes. Zero-width $error root + the thrown message as the // diagnostic; the next successful parse/edit resumes normal service. function totalNet(e) { - docDiags.length = 0; + // the message lives in the SOURCE layer (docLex kind 4) — a later settle rebuilds + // the view from the sources, and a view-only push would be wiped by it docLex.length = 0; docPar.length = 0; - docDiags.push({ offset: 0, end: 0, message: String(e && e.message ? e.message : e) }); + docLex.push({ offset: 0, end: 0, kind: 4, ch: String(e && e.message ? e.message : e) }); + rebuildDiagView(); scn = 0; const root = finishNode(RID_ERROR, 0); lastRoot = root; @@ -3412,6 +3415,7 @@ ${e.soa ? String.raw` // ── M1: WINDOWED re-lex ── let R0; const preLexN = docLex.length; // persisted lexer diags; the window's own // emissions land after this index + lexDiagBase = preLexN; { let wHi = ceNew + 4096; for (;;) { @@ -3514,6 +3518,9 @@ ${e.soa ? String.raw` // ── M1: WINDOWED re-lex ── tkText = altText; tkText.length = 0; altK = oK; altT = oT; altOff = oOff; altEnd = oEnd; altFl = oFl; altText = oText; + docLex.length = 0; // a FULL relex re-derives all lexer diagnostics (none, for + // the recovery-blind fallback lexer) — persisted entries + // from an earlier totality-net edit would go stale lexInto(flattenDoc()); const nN = tokN; const charDelta = docLen - oldLen; diff --git a/test/check.ts b/test/check.ts index 8850085..53d3365 100644 --- a/test/check.ts +++ b/test/check.ts @@ -24,6 +24,7 @@ const GATES: Gate[] = [ { group: 'core', name: 'incremental-verify', args: ['test/incremental-verify.ts'] }, { group: 'core', name: 'multi-doc', args: ['test/multi-doc.ts'] }, { group: 'core', name: 'recovery', args: ['test/recovery.ts'] }, + { group: 'core', name: 'incremental-grammars', args: ['test/incremental-grammars.ts'] }, { group: 'core', name: 'issue-cases', args: ['test/test-issues.ts'] }, { group: 'conformance', name: 'js', args: ['test/js-conformance.ts'] }, { group: 'conformance', name: 'tsx', args: ['test/tsx-conformance.ts'] }, diff --git a/test/incremental-grammars.ts b/test/incremental-grammars.ts new file mode 100644 index 0000000..bfe32a6 --- /dev/null +++ b/test/incremental-grammars.ts @@ -0,0 +1,154 @@ +// Gate: INCREMENTAL ≡ FRESH for EVERY GRAMMAR — the incremental/recovery gates +// were TypeScript-only while all grammars share the same emitted runtime, so the +// non-TS incremental behavior (markup lexer modes, the fallback-lexer path, other +// token algebras) was ungated. Grammar-agnostic by construction: +// +// inputs come from the generative walker (grammar-gen), edit scripts are seeded +// char-level mutations, and every step checks THREE things on the handle API: +// 1. edited tree + errors ≡ a fresh handle parse of the same text (byte-equal) +// 2. tree SELF-CONSISTENCY: every leaf span lies inside all its ancestors' +// spans (the engine-internal invariant an external compare can miss when +// both sides share a corruption) +// 3. totality: no step may throw +// +// node test/incremental-grammars.ts +import { writeFileSync } from 'node:fs'; +import { emitParser } from '../src/emit-parser.ts'; +import { generateInputs } from './grammar-gen.ts'; +import { objectify } from './emitted-obj.ts'; + +type Edit = { start: number; end: number; text: string }; +type Diag = { offset: number; end: number; message: string }; +type Cst = { root: number; errors: Diag[] }; +type Parser = { parse(s: string): Cst; edit(cst: Cst, edits: Edit[]): void; visit(cst: Cst, fns: object): void; tree: import('./emitted-obj.ts').TreeView & { lenOf(id: number): number; leafOffsetOf(e: number, tb: number): number; leafEndOf(e: number, tb: number): number } }; +type Em = { createParser(): Parser }; + +const GRAMMARS = ['typescript', 'javascript', 'typescriptreact', 'javascriptreact', 'yaml', 'html', 'vue']; + +let seedState = 0x5EED1E55; +const rand = () => ((seedState = (seedState * 48271) % 0x7fffffff) / 0x7fffffff); +const randInt = (n: number) => Math.floor(rand() * n); +const INS = ['x', '1', ';', ' ', '"', '<', '>', '(', ')', '\n', '-', ':']; +function mutate(text: string): { next: string; edit: Edit } { + if (text.length === 0) { + const ins = INS[randInt(INS.length)]; + return { next: ins, edit: { start: 0, end: 0, text: ins } }; + } + switch (randInt(3)) { + case 0: { + const at = randInt(text.length); + const ins = INS[randInt(INS.length)]; + return { next: text.slice(0, at) + ins + text.slice(at), edit: { start: at, end: at, text: ins } }; + } + case 1: { + const at = randInt(Math.max(1, text.length - 4)); + const n = 1 + randInt(3); + const end = Math.min(text.length, at + n); + return { next: text.slice(0, at) + text.slice(end), edit: { start: at, end, text: '' } }; + } + default: { + const at = randInt(text.length); + return { next: text.slice(0, at) + 'z' + text.slice(at + 1), edit: { start: at, end: at + 1, text: 'z' } }; + } + } +} + +function selfConsistent(p: Parser, c: Cst): string | null { + const stack: [number, number][] = []; + let bad: string | null = null; + p.visit(c, { + enter(id: number, cb: number) { + const span: [number, number] = [cb, cb + p.tree.lenOf(id)]; + const top = stack[stack.length - 1]; + if (top !== undefined && (span[0] < top[0] || span[1] > top[1]) && bad === null) { + bad = `node span [${span[0]},${span[1]}) outside parent [${top[0]},${top[1]})`; + } + stack.push(span); + }, + leave() { stack.pop(); }, + leaf(e: number, tok: number) { + if (bad !== null) return; + const tb = tok - ((~e) >>> 2); + const lo = p.tree.leafOffsetOf(e, tb), hi = p.tree.leafEndOf(e, tb); + const top = stack[stack.length - 1]; + if (top !== undefined && (lo < top[0] || hi > top[1])) { + bad = `leaf span [${lo},${hi}) outside parent [${top[0]},${top[1]})`; + } + }, + }); + return bad; +} + +let totalSteps = 0, totalEqual = 0, totalErr = 0; +let fails = 0; +const failures: string[] = []; +for (const name of GRAMMARS) { + const grammar = (await import(`../${name}.ts`)).default; + const emPath = `/tmp/emitted-incr-${name}.mjs`; + writeFileSync(emPath, emitParser(grammar)); + const em = (await import(emPath + '?v=' + process.pid)) as Em; + const session = em.createParser(); + const fresh = em.createParser(); + + // a handful of generated documents per grammar, a short edit session on each + const inputs = generateInputs(grammar, { depth: 4, nestDepth: 4, cap: 5, fuzzRounds: 40, maxInputs: 24, seed: 11 }); + let docs = 0; + for (const input of inputs) { + if (input.text.length < 8) continue; + if (docs >= 8) break; + docs++; + let text = input.text; + let cst: Cst; + try { cst = session.parse(text); } catch (e) { + fails++; failures.push(`${name}: parse THREW on generated input: ${(e as Error).message.slice(0, 60)}`); + continue; + } + for (let k = 0; k < 12; k++) { + const { next, edit } = mutate(text); + totalSteps++; + if (process.env.TRACE && name === process.env.TRACE) console.log(` [${name} doc${docs} step${k}]`, JSON.stringify(edit).slice(0, 70), '→', JSON.stringify(next.slice(0, 40))); + let fc: Cst; + try { + session.edit(cst, [edit]); + fc = fresh.parse(next); + } catch (e) { + fails++; + if (failures.length < 10) failures.push(`${name} doc${docs} step${k}: THREW: ${(e as Error).message.slice(0, 80)}`); + break; + } + if (fc.errors.length > 0) totalErr++; + const a = JSON.stringify(objectify(fresh.tree, (fns) => fresh.visit(fc, fns))) + JSON.stringify(fc.errors); + const b = JSON.stringify(objectify(session.tree, (fns) => session.visit(cst, fns))) + JSON.stringify(cst.errors); + if (a !== b) { + fails++; + if (process.env.DUMP) { + console.log('DOC:', JSON.stringify(text)); + console.log('NEXT:', JSON.stringify(next)); + console.log('FRESH errors:', JSON.stringify(fc.errors)); + console.log('INC errors: ', JSON.stringify(cst.errors)); + } + if (failures.length < 10) { + let i = 0; while (i < a.length && a[i] === b[i]) i++; + failures.push(`${name} doc${docs} step${k}: edit ≠ fresh @${i} edit=${JSON.stringify(edit).slice(0, 60)}\n fresh: …${a.slice(Math.max(0, i - 40), i + 60)}…\n inc: …${b.slice(Math.max(0, i - 40), i + 60)}…`); + } + break; + } + const sc = selfConsistent(session, cst); + if (sc !== null) { + fails++; + if (failures.length < 10) failures.push(`${name} doc${docs} step${k}: SELF-INCONSISTENT: ${sc}`); + break; + } + totalEqual++; + text = next; + } + } +} + +console.log(`incremental-grammars: ${totalEqual}/${totalSteps} steps equal+consistent across ${GRAMMARS.length} grammars (${totalErr} recovered with errors)`); +for (const s of failures) console.log(' ✗ ' + s); +if (fails > 0) { + console.error('✗ cross-grammar incremental equivalence violated'); + process.exit(1); +} +console.log('✓ every grammar: edited re-parses byte-identical to fresh, trees self-consistent, no throws'); From 3e7f1d6b479a268375697564baa243c6fcefecef Mon Sep 17 00:00:00 2001 From: Johnson Chu Date: Thu, 11 Jun 2026 22:39:35 +0800 Subject: [PATCH 05/23] Missing-token synthesis: tsc-style "expected 'x'" with structure preserved MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Required token matchers in recovering mode now synthesize a zero-width \$missing leaf (expected identity in rowStart, LIT_NAMES/K_NAMES inverse for the message) instead of failing, so 'const x = f(1, 2;' keeps its Call shape and reports "expected ')'", and 'function g() { return 1;' closes the body with "expected '}'". Synthesis is budget-free and position-pure: it fires iff a recovery bar lies in [pos, pos+2] (missAt), never under probing (not()/optional/separator probes) and never in free-fire. Zero-width success is a synthesis-only artifact (a strict zero-width element would never terminate its loop), so every loop discards it: plain reps break on pos===before alone (restoring scn), hooked reps discard + recoverSkip, leftRec continuations and Pratt LEDs refuse zero-width wraps. A rule can still re-enter ITSELF at the same position through a synthesized leading token — an unbounded recursion no grammar shape rules out — so recovering runs keep a (rule, pos) in-progress set and fail the re-entry (PEG cycle semantics; recRunning, zero strict-path cost). That sentinel also dissolved the bar +1 ladders the recursion crashes were minting: broken-doc recovery drops ~9x in the incremental gate (10.7s -> 1.2s). Equivalence (edit == fresh) exposed that the bar protocol's input was not adoption-invariant; three structural fixes: - frameMax: a frame-local advance watermark (reset to the rule's start at entry, folded into the parent on exit) replaces the global maxPos in rowExt/memo watermarks, making recorded probe reaches EXACT instead of contaminated by earlier-sibling probes. Bars (= strict-fail maxPos) now reconstruct identically under adoption; the hot advance pays one extra compare only at frontier breaches (frameMax <= maxPos nests the updates). This also closes the recorded "exact per-frame extents" backlog item and lands the bar on the true farthest probe (no more phantom synthesis from inflated memo-jump watermarks). - Recovery runs are adoption-free (edit-side attempt loop AND the lex-recovered first run): a row recorded under a recovering frame carries that run's bar-dependent reach, so replaying it makes the next bar a function of the OLD bar history instead of (text, bars). Attempt 0 (empty bars, behaviorally strict) re-derives the true strict frontier; every attempt is byte-equal to the fresh side's. The barIn adoption-refusal window from the first synthesis attempt is dead under this rule and removed; adoptSeek's recovering rowRM bypass likewise. - trySurgery refuses recovery-made trees (rowRM reaches the root structurally): a strict splice into kept \$error/\$missing siblings was a fake strict success that froze the OLD text's recovery shape, shifted. Gates: incremental-grammars 672/672 across 7 grammars; recovery.ts gains a synthesis-quality section (exact diagnostics + \$missing presence) and 4 session-found invalid shapes; incremental-verify gains the 5 protocol-pin GLUE pairs; multi-doc 60/60 + contract 9/9; check suite 33/33; corpus parity 401/401 sample, lexer parity 5695; perf-bench PASS (worst 803ms vs 802ms baseline; 9MB valid keystroke unregressed). verify-rejects: a tsc Debug.assert crash on 'await using' shapes is counted as ORACLE-CRASH and skipped (a crashed oracle has no verdict) instead of killing the gate. --- src/emit-parser.ts | 189 +++++++++++++++++++++++++++---------- test/incremental-verify.ts | 9 ++ test/recovery.ts | 30 +++++- test/verify-rejects.ts | 13 ++- 4 files changed, 189 insertions(+), 52 deletions(-) diff --git a/src/emit-parser.ts b/src/emit-parser.ts index 14451a9..71d6858 100644 --- a/src/emit-parser.ts +++ b/src/emit-parser.ts @@ -924,7 +924,7 @@ class Emitter { } const save = this.id(), sn = this.id(), fn = this.matchFn(expr.body), m = this.id(); return [ - `{ const ${save} = pos; const ${sn} = scn; const ${m} = ${fn}(); pos = ${save}; scn = ${sn};`, + `{ const ${save} = pos; const ${sn} = scn; probing++; const ${m} = ${fn}(); probing--; pos = ${save}; scn = ${sn};`, ` if (${m}) { ${onFail} } }`, ].join('\n'); } @@ -949,8 +949,10 @@ class Emitter { private matchQuantifierInto(body: RuleExpr, kind: '*' | '+' | '?', onFail: string, closerT = -1): string { const fn = this.matchFn(body); if (kind === '?') { - // Try once; on failure the helper restored pos/scn itself. - return `${fn}();`; + // Try once; on failure the helper restored pos/scn itself. The probe guard + // keeps token synthesis out of OPTIONAL paths — missing tokens are only + // inserted where a failure would propagate (required items), tsc-style. + return `probing++; ${fn}(); probing--;`; } // Run-extension: after an iteration whose element was ADOPTED from the old tree, // bulk-adopt its following old siblings (runExtend) instead of re-entering the @@ -968,16 +970,16 @@ class Emitter { const ext = runId >= 0 ? `\n if (adoptRunPos === pos) runExtend(${runId});` : ''; const recFirst = this.quantRecoverFirst(body); const csFn = recFirst !== null ? this.membershipFn(recFirst) : 'null'; - const fail = recFirst !== null - ? `if (!${fn}()) { if (!recovering || !recoverSkip(${csFn}, ${closerT})) break; continue; }` + const failFor = (beforeV: string, bsnV: string) => recFirst !== null + ? `if (!${fn}()) { if (!recovering || !recoverSkip(${csFn}, ${closerT})) break; continue; }\n if (recovering && pos === ${beforeV}) { scn = ${bsnV}; if (!recoverSkip(${csFn}, ${closerT})) break; continue; }` : `if (!${fn}()) break;`; if (kind === '*') { const before = this.id(), bsn = this.id(); return [ `while (true) {`, ` const ${before} = pos; const ${bsn} = scn;`, - ` ${fail}`, - ` if (pos === ${before} && scn === ${bsn}) break;` + ext, + ` ${failFor(before, bsn)}`, + ` if (pos === ${before}) { scn = ${bsn}; break; }` + ext, `}`, ].join('\n'); } @@ -987,8 +989,8 @@ class Emitter { `if (!${fn}()) { ${onFail} }`, `while (true) {`, ` const ${before} = pos; const ${bsn} = scn;`, - ` ${fail}`, - ` if (pos === ${before} && scn === ${bsn}) break;` + ext, + ` ${failFor(before, bsn)}`, + ` if (pos === ${before}) { scn = ${bsn}; break; }` + ext, `}`, ].join('\n'); } @@ -1001,7 +1003,7 @@ class Emitter { return [ `if (${fn}()) {`, ` while (true) {`, - ` const _ds = pos; if (!${this.matchLiteralCall(delimiter)}) { pos = _ds; break; }`, + ` const _ds = pos; probing++; const _dm = ${this.matchLiteralCall(delimiter)}; probing--; if (!_dm) { pos = _ds; break; }`, ` if (!${fn}()) break;`, ` }`, `}`, @@ -1389,9 +1391,17 @@ export function emitParser(grammar: CstGrammar): string { e.emit(`const ENTRY = ${J(entry)};`); // Rule-name table: rowRule stores the index; '$template' takes the slot after the // declared rules (parseTemplateExpr's synthetic node). - e.emit(`const RULE_NAMES = ${J([...grammar.rules.map(r => r.name), '$template', '$error'])};`); + e.emit(`const RULE_NAMES = ${J([...grammar.rules.map(r => r.name), '$template', '$error', '$missing'])};`); e.emit(`const RID_TEMPLATE = ${grammar.rules.length};`); e.emit(`const RID_ERROR = ${grammar.rules.length + 1};`); + e.emit(`const RID_MISSING = ${grammar.rules.length + 2};`); + { + // literal-int → text (for "expected 'x'" diagnostics on $missing rows) + const inv: string[] = []; + for (const [txt, t] of a.symtab.kwLitKind) inv[t] = txt; + for (const [txt, t] of a.symtab.puLitKind) inv[t] = txt; + e.emit(`const LIT_NAMES = ${J(Array.from(inv, (x) => x ?? ''))};`); + } // (recovery sync closers are threaded per-loop from the enclosing seq — see // quantFollowT; a global closer table froze top-level recovery at any ']'.) e.emit(`const prattRuleNames = new Set(${J([...a.prattRules])});`); @@ -1694,7 +1704,7 @@ function finishNode(rid, mark) { } rowRule[id] = rid; rowLen[id] = myEnd - myOff; rowCount[id] = n; rowTokLen[id] = myTokEnd - myTok; - rowExt[id] = maxPos - myTok; + rowExt[id] = frameMax - myTok; rowOK[id] = 0; rowKC[id] = 0; rowNF[id] = 0x7fffffff; @@ -1705,7 +1715,7 @@ function finishNode(rid, mark) { const ke = rowStart[id] + rowCount[id]; for (let i2 = rowStart[id]; i2 < ke; i2++) { const e2 = kids[i2]; - if (e2 >= 0 && (rowRM[e2] !== 0 || rowRule[e2] === RID_ERROR)) { rowRM[id] = 1; break; } + if (e2 >= 0 && (rowRM[e2] !== 0 || rowRule[e2] >= RID_ERROR)) { rowRM[id] = 1; break; } } } absChar[id] = myOff; absTok[id] = myTok; @@ -1740,7 +1750,7 @@ function finishWrap(rid, lhsId, mark) { rowRule[id] = rid; rowLen[id] = myEnd - myOff; rowStart[id] = ks; rowCount[id] = n + 1; rowTokLen[id] = myTokEnd - myTok; - rowExt[id] = maxPos - myTok; + rowExt[id] = frameMax - myTok; rowOK[id] = 0; rowKC[id] = 0; rowNF[id] = 0x7fffffff; @@ -1751,7 +1761,7 @@ function finishWrap(rid, lhsId, mark) { const ke = rowStart[id] + rowCount[id]; for (let i2 = rowStart[id]; i2 < ke; i2++) { const e2 = kids[i2]; - if (e2 >= 0 && (rowRM[e2] !== 0 || rowRule[e2] === RID_ERROR)) { rowRM[id] = 1; break; } + if (e2 >= 0 && (rowRM[e2] !== 0 || rowRule[e2] >= RID_ERROR)) { rowRM[id] = 1; break; } } } absChar[id] = myOff; absTok[id] = myTok; @@ -1762,6 +1772,13 @@ function finishWrap(rid, lhsId, mark) { // ── per-parse state (module-level closures, reset by parse()) ── let pos = 0; let maxPos = 0; +// Frame-LOCAL advance watermark: reach of the CURRENT rule frame (reset to the +// frame's start at parseRuleEntry, folded back into the parent on exit). Keeps +// rowExt/memo watermarks EXACT — the global maxPos contaminates them with probes +// from earlier siblings, and recovery-bar minting (bar = strict-fail maxPos) must +// be identical between a fresh parse and an adoption re-run. frameMax <= maxPos +// always, so the hot advance pays one extra compare only at frontier breaches. +let frameMax = 0; let memoNode = []; let memoEnd = []; let memoExt = []; // per-entry lookahead extent (see parseRuleEntry) @@ -1793,9 +1810,9 @@ function offset() { function matchKwLit(kw) { // A kw-range t can only come from a named token (template spans never intern to a // keyword), so the old k >= K_NAMED_MIN guard was redundant — one int compare. - if (pos >= cap || tkT[pos] !== kw) return false; + if (pos >= cap || tkT[pos] !== kw) return recovering ? missTok(kw) : false; scPush(~((pos << 2) | 1)); - if (++pos > maxPos) maxPos = pos; + if (++pos > frameMax) { frameMax = pos; if (pos > frameMax) { frameMax = pos; if (pos > maxPos) maxPos = pos; } } return true; } // Punct literal: tok.type === '' && tok.text === value, with the gt-splice fallback. @@ -1804,9 +1821,9 @@ function matchKwLit(kw) { function matchPuLit(pu) { // A pu-range t can only come from a punct token, so the old k === K_PUNCT guard was // redundant — one int compare. The '>'-split lives only in matchPuLitGT ('>' sites). - if (pos >= cap || tkT[pos] !== pu) return false; + if (pos >= cap || tkT[pos] !== pu) return recovering ? missTok(pu) : false; scPush(~(pos << 2)); - if (++pos > maxPos) maxPos = pos; + if (++pos > frameMax) { frameMax = pos; if (pos > frameMax) { frameMax = pos; if (pos > maxPos) maxPos = pos; } } return true; } function matchPuLitGT(pu) { @@ -1814,7 +1831,7 @@ function matchPuLitGT(pu) { const off = toff(pos); if (tkT[pos] === pu) { scPush(~(pos << 2)); - if (++pos > maxPos) maxPos = pos; + if (++pos > frameMax) { frameMax = pos; if (pos > frameMax) { frameMax = pos; if (pos > maxPos) maxPos = pos; } } return true; } // Split multi-'>' tokens: '>>', '>>>', '>>=', '>>>=' can yield a single '>': shift the @@ -1859,10 +1876,10 @@ function matchPuLitGT(pu) { // wholly BEFORE the splice point (token pos is being consumed right now), and the // carried memo was just cleared, so nothing reachable references shifted indices. scPush(~(pos << 2)); - if (++pos > maxPos) maxPos = pos; + if (++pos > frameMax) { frameMax = pos; if (pos > frameMax) { frameMax = pos; if (pos > maxPos) maxPos = pos; } } return true; } - return false; + return recovering ? missTok(pu) : false; } // Generic matchLiteral kept for any unspecialized site: classify value via the baked // tables (no per-call isKeywordLiteral / string compares) and delegate. @@ -1877,9 +1894,9 @@ function matchLiteral(value) { // (No named-token kind equals K_NAMED_FALLBACK, so an unforeseen type never matches.) // The materialized tokenType is type-derived (kind 0) — name needs no baking here. function matchTokK(nameKind) { - if (pos >= cap || tkK[pos] !== nameKind) return false; + if (pos >= cap || tkK[pos] !== nameKind) return recovering ? missTok(-nameKind) : false; scPush(~(pos << 2)); - if (++pos > maxPos) maxPos = pos; + if (++pos > frameMax) { frameMax = pos; if (pos > frameMax) { frameMax = pos; if (pos > maxPos) maxPos = pos; } } return true; } @@ -1891,13 +1908,13 @@ function parseTemplateExpr() { const k = tkK[pos]; if (k === K_TPL_TOKEN) { scPush(~(pos << 2)); - if (++pos > maxPos) maxPos = pos; + if (++pos > frameMax) { frameMax = pos; if (pos > frameMax) { frameMax = pos; if (pos > maxPos) maxPos = pos; } } return true; } if (k === K_TEMPLATE_HEAD) { const mark = scn; scPush(~(pos << 2)); - if (++pos > maxPos) maxPos = pos; + if (++pos > frameMax) { frameMax = pos; if (pos > frameMax) { frameMax = pos; if (pos > maxPos) maxPos = pos; } } const interpRule = currentPrattContext ?? EXPR_RULE; while (true) { RULES[interpRule](); @@ -1905,12 +1922,12 @@ function parseTemplateExpr() { const nk = tkK[pos]; if (nk === K_TEMPLATE_MIDDLE) { scPush(~(pos << 2)); - if (++pos > maxPos) maxPos = pos; + if (++pos > frameMax) { frameMax = pos; if (pos > frameMax) { frameMax = pos; if (pos > maxPos) maxPos = pos; } } continue; } if (nk === K_TEMPLATE_TAIL) { scPush(~(pos << 2)); - if (++pos > maxPos) maxPos = pos; + if (++pos > frameMax) { frameMax = pos; if (pos > frameMax) { frameMax = pos; if (pos > maxPos) maxPos = pos; } } break; } break; @@ -2053,6 +2070,9 @@ function emitLeftRecRule(e: Emitter, a: ReturnType, rule: RuleDe if (contMix[i]) { e.emit(` if (!ok) { pos = contSaved; scn = contMark; ok = matchMixfixLed_${sanitize(rule.name)}_cont_${i}(); }`); } + // A zero-width continuation is possible only via token synthesis (a strict one + // would never terminate this loop) — discard it or the loop spins. + e.emit(` if (ok && pos === contSaved) { scn = contMark; ok = false; }`); e.emit(` if (ok) {`); e.emit(` node = finishWrap(${rid}, node, contMark);`); e.emit(` continue outer;`); @@ -2098,7 +2118,7 @@ function emitPrattRule(e: Emitter, a: ReturnType, rule: RuleDecl e.emit(` const info = PREFIX_BY_T[tkT[pos]];`); e.emit(` if (info) {`); e.emit(` scPush(~((pos << 2) | 2));`); - e.emit(` if (++pos > maxPos) maxPos = pos;`); + e.emit(` if (++pos > frameMax) { frameMax = pos; if (pos > frameMax) { frameMax = pos; if (pos > maxPos) maxPos = pos; } }`); e.emit(` const rhs = ${ruleFn}_pratt(info.rbp);`); e.emit(` if (rhs >= 0 && pos > bestNudPos) { scPush(rhs); lhs = finishNode(${rid}, mark); bestNudPos = pos; }`); e.emit(` }`); @@ -2148,6 +2168,8 @@ function emitPrattRule(e: Emitter, a: ReturnType, rule: RuleDecl if (meta.mixfix[i]) { e.emit(` if (!ok) { pos = ledSaved; scn = ledMark; ok = matchMixfixLed_${sn}_led_${i}(); }`); } + // Zero-width LED = synthetic-only (see the continuation loop note) — discard. + e.emit(` if (ok && pos === ledSaved) { scn = ledMark; ok = false; }`); e.emit(` if (ok) {`); e.emit(` lhs = finishWrap(${rid}, lhs, ledMark);`); if (meta.tailClosing[i]) e.emit(` tailClosed = true;`); @@ -2166,7 +2188,7 @@ function emitPrattRule(e: Emitter, a: ReturnType, rule: RuleDecl e.emit(` if (info.position === 'postfix') {`); e.emit(` if (!tailClosed) {`); e.emit(` scPush(~((pos << 2) | 2));`); - e.emit(` if (++pos > maxPos) maxPos = pos;`); + e.emit(` if (++pos > frameMax) { frameMax = pos; if (pos > frameMax) { frameMax = pos; if (pos > maxPos) maxPos = pos; } }`); e.emit(` lhs = finishWrap(${rid}, lhs, ledMark);`); e.emit(` tailClosed = true; matched = true;`); e.emit(` }`); @@ -2180,7 +2202,7 @@ function emitPrattRule(e: Emitter, a: ReturnType, rule: RuleDecl e.emit(` }`); e.emit(` }`); e.emit(` scPush(~((pos << 2) | 2));`); - e.emit(` if (++pos > maxPos) maxPos = pos;`); + e.emit(` if (++pos > frameMax) { frameMax = pos; if (pos > frameMax) { frameMax = pos; if (pos > maxPos) maxPos = pos; } }`); e.emit(` const rhs = ${ruleFn}_pratt(info.rbp);`); e.emit(` if (rhs >= 0) { scPush(rhs); lhs = finishWrap(${rid}, lhs, ledMark); matched = true; }`); e.emit(` else { pos = ledSaved; scn = ledMark; }`); @@ -2325,7 +2347,7 @@ function parseRuleEntry(idx, rid, name, core) { // the gap keeps the stale entry alive. A guaranteed batch no-op: the watermark is // monotone and was already ≥ this value when the entry was stored. const ex = mx[start]; - if (ex > maxPos) maxPos = ex; + if (ex > frameMax) { frameMax = ex; if (ex > maxPos) maxPos = ex; } const id = mn[start]; if (id >= 0) { // refresh the reused root's transient BUILD coordinates to the current stream @@ -2348,7 +2370,7 @@ function parseRuleEntry(idx, rid, name, core) { if (aid >= 0) { pos = start + rowTokLen[aid]; const ext = start + rowExt[aid]; - if (ext > maxPos) maxPos = ext; + if (ext > frameMax) { frameMax = ext; if (ext > maxPos) maxPos = ext; } absTok[aid] = start; absChar[aid] = toff(start); if (adoptHitP >= 0) { @@ -2368,23 +2390,32 @@ function parseRuleEntry(idx, rid, name, core) { } me[start] = pos; mn[start] = aid; - mx[start] = maxPos; + mx[start] = ext; mg[start] = memoGenCur; scPush(aid); return true; } } } + let recKey = -1; + if (recovering) { + recKey = idx * (tokN + 1) + start; + if (recRunning.has(recKey)) return false; + recRunning.add(recKey); + } const prevContext = currentPrattContext; currentPrattContext = name; const prevSup = suppressCur; suppressCur = mySup; + const fm0 = frameMax; + frameMax = start; let result; try { result = core(0); } finally { currentPrattContext = prevContext; suppressCur = prevSup; + if (recKey >= 0) recRunning.delete(recKey); } if (!mySup && !capped) { if (me === undefined || me.length < tokN + 1) { @@ -2399,7 +2430,7 @@ function parseRuleEntry(idx, rid, name, core) { } me[start] = pos; mn[start] = result; - mx[start] = maxPos; + mx[start] = frameMax; mg[start] = memoGenCur; // the TRUE probe watermark — the +2 read slack (stop token, // SECOND-token dispatch) is applied at INVALIDATION time if (result >= 0) { @@ -2413,11 +2444,12 @@ function parseRuleEntry(idx, rid, name, core) { // covers recovering-built rows: a fire that cut a losing arm short is still // bounded by the recorded probes, so no mode stamp is needed for adoption — // rowRM stays purely structural for the diagnostics walk.) - const re = maxPos - start; + const re = frameMax - start; if (re > rowExt[result]) rowExt[result] = re; } } + if (fm0 > frameMax) frameMax = fm0; if (result >= 0) { scPush(result); return true; } return false; } @@ -2544,6 +2576,8 @@ function farthest(errPos) { function runParse(entryRule) { pos = 0; maxPos = 0; + frameMax = 0; + recRunning.clear(); parseLimit = -1; cap = tokN; currentPrattContext = null; @@ -2565,7 +2599,7 @@ function runParse(entryRule) { const mark = scn; const from = pos; while (pos < tokN) { scPush(~(pos << 2)); pos++; } - if (pos > maxPos) maxPos = pos; + if (pos > frameMax) { frameMax = pos; if (pos > maxPos) maxPos = pos; } docDiags.push({ offset: from < tokN ? toff(from) : 0, end: tokN > 0 ? tend(tokN - 1) : 0, message: 'no parse' }); scPush(finishNode(RID_ERROR, mark)); } @@ -2578,7 +2612,7 @@ function runParse(entryRule) { const mark = scn; const from = pos; while (pos < tokN) { scPush(~(pos << 2)); pos++; } - if (pos > maxPos) maxPos = pos; + if (pos > frameMax) { frameMax = pos; if (pos > maxPos) maxPos = pos; } docDiags.push({ offset: toff(from), end: tend(tokN - 1), message: "unexpected '" + tokTextAt(from) + "' after successful parse" }); scPush(finishNode(RID_ERROR, mark)); scPush(finishNode(RID_ERROR, 0)); @@ -2663,7 +2697,7 @@ function adoptSeek(q, rid) { let xid = e, xb = cb; for (;;) { if (rowOK[xid] !== 0 && rowRule[xid] === rid - && (recovering || rowRM[xid] === 0) + && rowRM[xid] === 0 && (q + rowExt[xid] + 2 <= adoptDmgStart || q >= adoptDmgOldEnd)) { return xid; } @@ -2721,7 +2755,44 @@ function lexMsg(g) { // pass re-runs (adoption keeps re-runs cheap). Bars are text-determined, so fresh // and incremental recovering parses are byte-identical by construction. let recoverBars = []; +// (rule, pos) frames currently ON THE STACK during a recovering run. Token +// synthesis makes zero-width matches possible, so a rule can re-enter itself at +// the SAME position through a synthesized leading token — an unbounded recursion +// no grammar check can rule out. A re-entered (rule, pos) frame fails (PEG cycle +// semantics): only zero-width synthesis can build such a cycle, so a real parse +// never sees the refusal. Strict runs never consult this (zero hot-path cost). +const recRunning = new Set(); let recoverFree = false; // iteration-cap fallback: fire at any failure (still deterministic) +// Missing-token synthesis (the tsc parseExpected analog): at a bar-adjacent failure +// of a REQUIRED literal/token match, materialize a zero-width $missing row instead +// of failing the construct — the structure completes (a call keeps its Call shape +// with the ')' marked missing) and the diagnostic reads "expected 'x'". The firing +// condition is a PURE FUNCTION of (position, bar list): pos within a fixed window +// below a bar — no counters, no maxPos (a global budget threads non-local state +// through the parse and desynchronizes adopted regions; the first attempt at this +// proved it with the cross-grammar gate). probing>0 marks failure-tolerated probes +// (not(), sep delimiters, optionals) where synthesis would flip semantics. The +// zero-width spin is killed structurally: recovering repetition loops DISCARD +// zero-width elements (hooked elements are non-nullable — only synthesis can make +// them zero-width). +let probing = 0; +function missAt(p2) { + for (let i = 0; i < recoverBars.length; i++) { + const b = recoverBars[i]; + if (b > p2 + 2) break; + if (p2 <= b && b <= p2 + 2) return true; + } + return false; +} +function missTok(t) { + if (probing !== 0 || recoverFree || !missAt(pos)) return false; + const id = finishNode(RID_MISSING, scn); + rowStart[id] = t; // expected identity: >0 literal int, <0 named token kind. + // A zero-kid row never dereferences its kids base, so the + // slot is free storage. + scPush(id); + return true; +} // Monotone count of recovery FIRES (winning or losing arms alike): a rule whose // parse window saw any fire may have probed LESS than a strict parse would (the // fire ends a losing arm's exploration early), so its stored watermark cannot be @@ -2736,6 +2807,11 @@ let recFires = 0; // spine (rowRM propagates structurally at finishNode): O(error paths), no global // walk, no per-candidate bookkeeping — losing-arm rows are simply unreachable. function collectErrRows(id, charBase, tokBase) { + if (rowRule[id] === RID_MISSING) { + const t = rowStart[id]; + docPar.push({ offset: charBase, end: charBase, message: "expected '" + (t > 0 ? LIT_NAMES[t] : (K_NAMES[-t] ?? '?')) + "'" }); + return; + } if (rowRule[id] === RID_ERROR) { if (rowCount[id] > 0) { const fe = kids[rowStart[id]]; @@ -2747,7 +2823,7 @@ function collectErrRows(id, charBase, tokBase) { const cs = rowStart[id], n = rowCount[id]; for (let i = 0; i < n; i++) { const e = kids[cs + i]; - if (e >= 0 && (rowRM[e] !== 0 || rowRule[e] === RID_ERROR)) { + if (e >= 0 && (rowRM[e] !== 0 || rowRule[e] >= RID_ERROR)) { collectErrRows(e, charBase + kcr(id, cs + i), tokBase + ktr(id, cs + i)); } } @@ -2756,7 +2832,7 @@ function collectErrRows(id, charBase, tokBase) { // diagnostics (fresh survivors + adopted rowRM subtrees), ordered by offset. function settleDiags() { docPar.length = 0; - if (lastRoot >= 0 && (rowRM[lastRoot] !== 0 || rowRule[lastRoot] === RID_ERROR)) { + if (lastRoot >= 0 && (rowRM[lastRoot] !== 0 || rowRule[lastRoot] >= RID_ERROR)) { collectErrRows(lastRoot, rootCharBase, rootTokBase); } rebuildDiagView(); @@ -2805,7 +2881,7 @@ function recoverSkip(canStart, closerT) { && !(canStart !== null && canStart(pos))) { scPush(~(pos << 2)); pos++; } - if (pos > maxPos) maxPos = pos; + if (pos > frameMax) { frameMax = pos; if (pos > maxPos) maxPos = pos; } recFires++; scPush(finishNode(RID_ERROR, mark)); return true; @@ -2829,13 +2905,13 @@ function runExtend(rid) { let oq = adoptRunOq; let nq = pos; const sfx = oq >= adoptDmgOldEnd; // past the damage: monotone, no per-member ext check - let mp = maxPos; + let mp = frameMax; while (i < csEnd) { const e = kids[i]; if (e < 0) break; if (pb + ktr(P, i) !== oq) break; if (rowRule[e] !== rid || rowOK[e] === 0) break; - if (!recovering && rowRM[e] !== 0) break; + if (rowRM[e] !== 0) break; const tl = rowTokLen[e]; if (tl === 0) break; const ex = rowExt[e]; @@ -2847,7 +2923,7 @@ function runExtend(rid) { nq += tl; oq += tl; i++; } - if (mp > maxPos) maxPos = mp; + if (mp > frameMax) { frameMax = mp; if (mp > maxPos) maxPos = mp; } pos = nq; } @@ -2882,6 +2958,11 @@ function rowKCof(id) { } function trySurgery(dmgA, dmgB, tokD, chrD) { if (adoptRoot < 0) return -1; + // a recovery-made tree cannot take a strict splice: kept siblings would carry + // $error/$missing rows into a "successful" strict pass, freezing the OLD text's + // recovery shape instead of re-deriving it for the new text (rowRM reaches the + // root structurally, so this is the exact tree-wide test) + if (rowRM[adoptRoot] !== 0 || rowRule[adoptRoot] >= RID_ERROR) return -1; // the whole-file token math must close, or the shape changed beyond a splice if (adoptRootTok + rowTokLen[adoptRoot] + tokD !== tokN) return -1; // 1. descend along single-affected-row kids, recording the path @@ -2951,7 +3032,7 @@ function trySurgery(dmgA, dmgB, tokD, chrD) { pos = Da < Db ? Dbase + (kids[csD + Da] < 0 ? (~kids[csD + Da]) >>> 2 : ktr(D, csD + Da)) : dmgA; - maxPos = pos; scn = 0; parseLimit = -1; cap = tokN; + maxPos = pos; frameMax = pos; scn = 0; parseLimit = -1; cap = tokN; currentPrattContext = null; suppressNext = null; suppressCur = null; const genAt = memoGenCur; const fn = RULE_FN_BY_ID[elem]; @@ -3574,6 +3655,10 @@ ${e.soa ? String.raw` // ── M1: WINDOWED re-lex ── // iteration. Lex diagnostics are re-seeded into every attempt (the window was // lexed once; only the parse re-runs). const lexRecovered = recovering; + // a lex-recovered first run IS a recovery run — adoption stays off for the + // same reason as in the bar iteration below (and rowRM rows would otherwise + // replay the OLD text's recovery shape as a fake strict success) + if (lexRecovered) adoptRoot = -1; const lexSnap = docLex.slice(); try { root = runParse(entryRule); @@ -3590,10 +3675,15 @@ ${e.soa ? String.raw` // ── M1: WINDOWED re-lex ── } recovering = false; } catch (e) { - // total edit: re-run the SAME spliced stream under the bar discipline — - // adoption applies on every attempt (rows that parse strictly are mode- - // neutral), so re-runs stay O(damage)-ish + // total edit: re-run the SAME spliced stream under the bar discipline. + // Adoption is OFF for every recovery run: bars are minted from each failed + // run's maxPos, and a row recorded under a recovering frame carries that + // run's bar-dependent probe reach — replaying it would make the next bar a + // function of the OLD bar history instead of (text, bars). Attempt 0 runs + // with no bars (behaviorally strict, adoption-free) and re-derives the true + // strict frontier, so every attempt is byte-equal to the fresh side's. recovering = true; + adoptRoot = -1; const bars = []; let done = false; try { @@ -3708,6 +3798,7 @@ export function createParser() { } if (!done) { recoverFree = true; + adoptRoot = -1; // free-fire decisions are non-local: adoption would desync try { docLex.length = 0; root = parseCore(source, entryRule); diff --git a/test/incremental-verify.ts b/test/incremental-verify.ts index 6d7b85b..361fdaa 100644 --- a/test/incremental-verify.ts +++ b/test/incremental-verify.ts @@ -97,6 +97,15 @@ function diffChange(a: string, b: string): Edit { } const GLUE: Array<[string, string]> = [ + // recovery-protocol pins (cross-grammar-gate finds): bar minting must be + // adoption-invariant — a pre-edit RECOVERY tree must not leak its probe reaches + // (frameMax exactness), its rows (surgery/adoption refusal), or its shape (the + // lex-recovered first run) into the edited re-parse + ['class za {" z', 'zlass za {" z'], + ['funtionzaaz( a z { }', 'funtiznzaaz( a z { }'], + ['function \\u{0} ( (aa ) { }', 'functionx \\u{0} ( (aa ) { }'], + ['const x = f(1, 2);', 'const x = f(1, 2;'], + ['function g() { return 1; }', 'function g() { return 1;'], ['const a = 1;\nconst b = 2;\n', 'const a = 1;\nconst bx = 2;\n'], ['let a = b; let c = 1;\n', 'let a = b1; let c = 1;\n'], ['if (a = b) { f(); }\n', 'if (a == b) { f(); }\n'], diff --git a/test/recovery.ts b/test/recovery.ts index 6e378c6..9c498f3 100644 --- a/test/recovery.ts +++ b/test/recovery.ts @@ -70,6 +70,11 @@ const INVALID: string[] = [ 'if (a { b(); }\nconst tail = 3;\n', '@@@@\n', '}{)(\n', + // session-found shapes: bar-ladder degeneracies, lex-recovered docs, glued junk + 'class za {" z', + 'funtionzaaz( a z { }', + 'function \\u{0} ( (aa ) { }', + 'functio aa (z az x1<) { }', ]; let invalidN = 0; for (const text of INVALID) { @@ -112,7 +117,30 @@ let typedOk = 0; if (c.errors.length !== 0) bad('completed statement still reports errors'); } -console.log(`recovery: valid ${validN}/${VALID.length} ≡ strict+clean · invalid ${invalidN}/${INVALID.length} total+deterministic · typing ${typedOk}/${TYPED.length} keystrokes ≡ fresh`); +// ── 4. missing-token synthesis: tsc-style "expected 'x'" diagnostics with the +// structure PRESERVED (a zero-width $missing leaf closes the construct instead of +// an $error absorbing the rest). Exact-match pins — quality must not regress to +// absorption silently. +const SYNTH: Array<[string, string[]]> = [ + ['const x = f(1, 2;', ["16:expected ')'"]], + ['function g() { return 1;', ["24:expected '}'"]], + ['if (x { y(); }', ["6:expected ')'"]], +]; +let synthN = 0; +for (const [text, want] of SYNTH) { + const c = p.parse(text); + const got = c.errors.map((g) => g.offset + ':' + g.message); + if (JSON.stringify(got) !== JSON.stringify(want)) { + bad(`synthesis on «${text}»: got ${JSON.stringify(got)}, want ${JSON.stringify(want)}`); + continue; + } + let missing = 0; + p.visit(c, { enter(id: number) { if (p.tree.ruleNameOf(id) === '$missing') missing++; } }); + if (missing === 0) { bad(`synthesis on «${text}»: no $missing node in the tree`); continue; } + synthN++; +} + +console.log(`recovery: valid ${validN}/${VALID.length} ≡ strict+clean · invalid ${invalidN}/${INVALID.length} total+deterministic · typing ${typedOk}/${TYPED.length} keystrokes ≡ fresh · synthesis ${synthN}/${SYNTH.length} exact`); if (fails > 0) { console.error('✗ total-parsing contract violated'); process.exit(1); diff --git a/test/verify-rejects.ts b/test/verify-rejects.ts index e922f2c..bc97765 100644 --- a/test/verify-rejects.ts +++ b/test/verify-rejects.ts @@ -35,7 +35,7 @@ function ourReach(msg: string): number | null { } const files = (await allTsFiles(baseDir)).sort(); -let agree = 0, early = 0, unknown = 0; +let agree = 0, early = 0, unknown = 0, oracleCrash = 0; const earlies: { file: string; ourReach: number; tsFirst: number; ctx: string }[] = []; for (const file of files) { @@ -44,7 +44,15 @@ for (const file of files) { let msg = ''; try { parse(code); continue; } catch (e: any) { msg = e.message; } // only files we FAIL - const sf = ts.createSourceFile('t.ts', code, ts.ScriptTarget.Latest, true, ts.ScriptKind.TS); + // the oracle itself can die on malformed input (e.g. a Debug.assert inside + // tsc's `await using` paths) — a crashed oracle has no verdict, count + skip + let sf; + try { + sf = ts.createSourceFile('t.ts', code, ts.ScriptTarget.Latest, true, ts.ScriptKind.TS); + } catch { + oracleCrash++; + continue; + } const diags = (sf as any).parseDiagnostics ?? []; if (diags.length === 0) continue; // that's a REAL gap, handled elsewhere @@ -64,6 +72,7 @@ console.log(`Single-file error-tests we fail: ${agree + early + unknown}`); console.log(` AGREE (reach >= TS first error - ${SLACK}) : ${agree} ← rejected for the right reason`); console.log(` EARLY (bail before TS's error) : ${early} ← hidden gap: valid code we can't parse`); console.log(` UNKNOWN (no offset in our error) : ${unknown}`); +if (oracleCrash > 0) console.log(` ORACLE-CRASH (tsc threw; no verdict) : ${oracleCrash}`); if (earlies.length) { console.log(`\n===== EARLY (hidden gaps) =====`); earlies.sort((a, b) => (a.tsFirst - a.ourReach) - (b.tsFirst - b.ourReach)); From bf771a1f1b168e1a62e78f7cd842bb0ab71722b0 Mon Sep 17 00:00:00 2001 From: Johnson Chu Date: Thu, 11 Jun 2026 23:37:37 +0800 Subject: [PATCH 06/23] Missing-nonterminal synthesis: the tsc "Expression expected" analog MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Required RULE references failing inside the bar window now mint a zero-width \$missing row carrying the rule identity (RULE_MISS_BASE + rid in rowStart), reported as "expected Expr": 'const a = ;' / 'a + ;' / '-;' / 'x ? y : ;' / 'a, ;' / 'f(1, ;' all produce a single tsc-grade diagnostic at the right offset. Hooks: parseRuleEntry's fail exit (memoized like any result) plus the three Pratt rhs sites that bypass rule entries (operator LED, prefix NUD, chain-rhs LED). Synthesis placement follows COMMITMENT semantics, replacing the flat probing counter for optionals: an optional group or repetition element may fail freely while uncommitted (probeBase = its start; 'the optional thing is absent' / 'the list ends' need no diagnostic), but once it consumes a real token past that base, missing pieces synthesize — 'const a = ;' commits at '=' and mints the Expr; rep(seq(',', Expr)) cannot mint a phantom ',' to keep a list alive, yet after a real ',' the element synthesizes. not() and separator probes stay absolutely suppressed (pure lookahead). FIRST-token call-site guards open under recovering (one global read on the strict guard-fail path): at a bar the next token is exactly what cannot start the rule, and the hook lives inside parseRuleEntry — 'a, ;' must reach it. Two latent bugs fixed in passing, both found by the new shapes: - The frameMax conversion in the previous commit was double-applied at the 12 token-advance sites by a patch-script composition hole (edit #3's pattern matched text edit #2 had just inserted; the anchor counts were asserted on the pre-edit source), leaving the nested inner test unreachable — token consumes never raised the global maxPos, so bars were minted from a watermark that only memo jumps could move. Equivalence gates stayed green because both engines ran the same wrong protocol; the synthesis quality work surfaced it as losing-arm wins. Advances now pair frameMax/maxPos correctly. - The memo-jump coordinate refresh read toff(start) unguarded; for a zero-width row minted AT EOF, start == tokN reads past the token columns (stale slots from a longer previous document under handle reuse) — the recovery gate's in-bounds check caught an "expected Expr" at offset 8 in a 5-char document. The refresh now uses the same EOF guard as offset(). recovery.ts synthesis pins 3 -> 9 (the six nonterminal shapes above, exact diagnostics + \$missing presence). All gates green: incremental-grammars 672/672, incremental-verify 136 steps, multi-doc 60 + 9/9, recovery valid/invalid/typing/synthesis, suite 33/33, perf-bench PASS, 9MB fresh 438ms / valid keystroke warm ~0.6-5ms / breaking 649ms / while-broken 438ms / fixing 368ms (broken-state costs are the recorded follow-up). --- src/emit-parser.ts | 87 ++++++++++++++++++++++++++++++++-------------- test/recovery.ts | 10 ++++++ 2 files changed, 71 insertions(+), 26 deletions(-) diff --git a/src/emit-parser.ts b/src/emit-parser.ts index 71d6858..ba611b2 100644 --- a/src/emit-parser.ts +++ b/src/emit-parser.ts @@ -950,9 +950,11 @@ class Emitter { const fn = this.matchFn(body); if (kind === '?') { // Try once; on failure the helper restored pos/scn itself. The probe guard - // keeps token synthesis out of OPTIONAL paths — missing tokens are only - // inserted where a failure would propagate (required items), tsc-style. - return `probing++; ${fn}(); probing--;`; + // keeps synthesis out of UNCOMMITTED optional paths, tsc-style: before the + // group consumes a real token its failure is free (no synthesis); once it + // has consumed (pos > probeBase) the group is committed — 'const a = ;' + // must synthesize the initializer Expr, not drop the whole '= Expr' group. + return `{ const _pb = probeBase; probeBase = pos; ${fn}(); probeBase = _pb; }`; } // Run-extension: after an iteration whose element was ADOPTED from the old tree, // bulk-adopt its following old siblings (runExtend) instead of re-entering the @@ -970,9 +972,15 @@ class Emitter { const ext = runId >= 0 ? `\n if (adoptRunPos === pos) runExtend(${runId});` : ''; const recFirst = this.quantRecoverFirst(body); const csFn = recFirst !== null ? this.membershipFn(recFirst) : 'null'; + // The element's LEADING token is the loop's continuation decision — its + // failure is a normal list end, so synthesis is suppressed until the element + // commits (consumes past the iteration start): rep(seq(',', Expr)) must not + // mint a phantom ',' to keep the list going, but once the real ',' is there + // a missing Expr synthesizes (tsc list-element semantics). Same commitment + // device as the optional-probe guard, staged inline (hot loop — no closure). const failFor = (beforeV: string, bsnV: string) => recFirst !== null - ? `if (!${fn}()) { if (!recovering || !recoverSkip(${csFn}, ${closerT})) break; continue; }\n if (recovering && pos === ${beforeV}) { scn = ${bsnV}; if (!recoverSkip(${csFn}, ${closerT})) break; continue; }` - : `if (!${fn}()) break;`; + ? `const ${beforeV}_pb = probeBase; probeBase = pos; const ${beforeV}_ok = ${fn}(); probeBase = ${beforeV}_pb;\n if (!${beforeV}_ok) { if (!recovering || !recoverSkip(${csFn}, ${closerT})) break; continue; }\n if (recovering && pos === ${beforeV}) { scn = ${bsnV}; if (!recoverSkip(${csFn}, ${closerT})) break; continue; }` + : `const ${beforeV}_pb = probeBase; probeBase = pos; const ${beforeV}_ok = ${fn}(); probeBase = ${beforeV}_pb;\n if (!${beforeV}_ok) break;`; if (kind === '*') { const before = this.id(), bsn = this.id(); return [ @@ -1020,7 +1028,11 @@ class Emitter { if (!fs || fs.size === 0) return ''; // ruleMightStart: true iff some key in fs matches peek(); guard = NOT that. The set // is baked as a per-set membership fn over two byte tables (see membershipFn). - return `!${this.membershipFn(fs)}(pos)`; + // Recovering runs skip the guard: at a bar the next token is exactly what CANNOT + // start the rule, and the missing-nonterminal hook lives inside parseRuleEntry — + // a pre-call rejection would silence it ('a, ;' must mint the Expr, not end the + // list). Strict pays one global read only when the guard would fail anyway. + return `(!${this.membershipFn(fs)}(pos) && !recovering)`; } // Deep per-alternative dispatch condition (mirrors gen-parser.ts altMightStart): the @@ -1812,7 +1824,7 @@ function matchKwLit(kw) { // keyword), so the old k >= K_NAMED_MIN guard was redundant — one int compare. if (pos >= cap || tkT[pos] !== kw) return recovering ? missTok(kw) : false; scPush(~((pos << 2) | 1)); - if (++pos > frameMax) { frameMax = pos; if (pos > frameMax) { frameMax = pos; if (pos > maxPos) maxPos = pos; } } + if (++pos > frameMax) { frameMax = pos; if (pos > maxPos) maxPos = pos; } return true; } // Punct literal: tok.type === '' && tok.text === value, with the gt-splice fallback. @@ -1823,7 +1835,7 @@ function matchPuLit(pu) { // redundant — one int compare. The '>'-split lives only in matchPuLitGT ('>' sites). if (pos >= cap || tkT[pos] !== pu) return recovering ? missTok(pu) : false; scPush(~(pos << 2)); - if (++pos > frameMax) { frameMax = pos; if (pos > frameMax) { frameMax = pos; if (pos > maxPos) maxPos = pos; } } + if (++pos > frameMax) { frameMax = pos; if (pos > maxPos) maxPos = pos; } return true; } function matchPuLitGT(pu) { @@ -1831,7 +1843,7 @@ function matchPuLitGT(pu) { const off = toff(pos); if (tkT[pos] === pu) { scPush(~(pos << 2)); - if (++pos > frameMax) { frameMax = pos; if (pos > frameMax) { frameMax = pos; if (pos > maxPos) maxPos = pos; } } + if (++pos > frameMax) { frameMax = pos; if (pos > maxPos) maxPos = pos; } return true; } // Split multi-'>' tokens: '>>', '>>>', '>>=', '>>>=' can yield a single '>': shift the @@ -1876,7 +1888,7 @@ function matchPuLitGT(pu) { // wholly BEFORE the splice point (token pos is being consumed right now), and the // carried memo was just cleared, so nothing reachable references shifted indices. scPush(~(pos << 2)); - if (++pos > frameMax) { frameMax = pos; if (pos > frameMax) { frameMax = pos; if (pos > maxPos) maxPos = pos; } } + if (++pos > frameMax) { frameMax = pos; if (pos > maxPos) maxPos = pos; } return true; } return recovering ? missTok(pu) : false; @@ -1896,7 +1908,7 @@ function matchLiteral(value) { function matchTokK(nameKind) { if (pos >= cap || tkK[pos] !== nameKind) return recovering ? missTok(-nameKind) : false; scPush(~(pos << 2)); - if (++pos > frameMax) { frameMax = pos; if (pos > frameMax) { frameMax = pos; if (pos > maxPos) maxPos = pos; } } + if (++pos > frameMax) { frameMax = pos; if (pos > maxPos) maxPos = pos; } return true; } @@ -1908,13 +1920,13 @@ function parseTemplateExpr() { const k = tkK[pos]; if (k === K_TPL_TOKEN) { scPush(~(pos << 2)); - if (++pos > frameMax) { frameMax = pos; if (pos > frameMax) { frameMax = pos; if (pos > maxPos) maxPos = pos; } } + if (++pos > frameMax) { frameMax = pos; if (pos > maxPos) maxPos = pos; } return true; } if (k === K_TEMPLATE_HEAD) { const mark = scn; scPush(~(pos << 2)); - if (++pos > frameMax) { frameMax = pos; if (pos > frameMax) { frameMax = pos; if (pos > maxPos) maxPos = pos; } } + if (++pos > frameMax) { frameMax = pos; if (pos > maxPos) maxPos = pos; } const interpRule = currentPrattContext ?? EXPR_RULE; while (true) { RULES[interpRule](); @@ -1922,12 +1934,12 @@ function parseTemplateExpr() { const nk = tkK[pos]; if (nk === K_TEMPLATE_MIDDLE) { scPush(~(pos << 2)); - if (++pos > frameMax) { frameMax = pos; if (pos > frameMax) { frameMax = pos; if (pos > maxPos) maxPos = pos; } } + if (++pos > frameMax) { frameMax = pos; if (pos > maxPos) maxPos = pos; } continue; } if (nk === K_TEMPLATE_TAIL) { scPush(~(pos << 2)); - if (++pos > frameMax) { frameMax = pos; if (pos > frameMax) { frameMax = pos; if (pos > maxPos) maxPos = pos; } } + if (++pos > frameMax) { frameMax = pos; if (pos > maxPos) maxPos = pos; } break; } break; @@ -2118,8 +2130,9 @@ function emitPrattRule(e: Emitter, a: ReturnType, rule: RuleDecl e.emit(` const info = PREFIX_BY_T[tkT[pos]];`); e.emit(` if (info) {`); e.emit(` scPush(~((pos << 2) | 2));`); - e.emit(` if (++pos > frameMax) { frameMax = pos; if (pos > frameMax) { frameMax = pos; if (pos > maxPos) maxPos = pos; } }`); - e.emit(` const rhs = ${ruleFn}_pratt(info.rbp);`); + e.emit(` if (++pos > frameMax) { frameMax = pos; if (pos > maxPos) maxPos = pos; }`); + e.emit(` let rhs = ${ruleFn}_pratt(info.rbp);`); + e.emit(` if (rhs < 0 && recovering) rhs = missRule(${rid});`); e.emit(` if (rhs >= 0 && pos > bestNudPos) { scPush(rhs); lhs = finishNode(${rid}, mark); bestNudPos = pos; }`); e.emit(` }`); e.emit(` }`); @@ -2188,7 +2201,7 @@ function emitPrattRule(e: Emitter, a: ReturnType, rule: RuleDecl e.emit(` if (info.position === 'postfix') {`); e.emit(` if (!tailClosed) {`); e.emit(` scPush(~((pos << 2) | 2));`); - e.emit(` if (++pos > frameMax) { frameMax = pos; if (pos > frameMax) { frameMax = pos; if (pos > maxPos) maxPos = pos; } }`); + e.emit(` if (++pos > frameMax) { frameMax = pos; if (pos > maxPos) maxPos = pos; }`); e.emit(` lhs = finishWrap(${rid}, lhs, ledMark);`); e.emit(` tailClosed = true; matched = true;`); e.emit(` }`); @@ -2202,8 +2215,9 @@ function emitPrattRule(e: Emitter, a: ReturnType, rule: RuleDecl e.emit(` }`); e.emit(` }`); e.emit(` scPush(~((pos << 2) | 2));`); - e.emit(` if (++pos > frameMax) { frameMax = pos; if (pos > frameMax) { frameMax = pos; if (pos > maxPos) maxPos = pos; } }`); - e.emit(` const rhs = ${ruleFn}_pratt(info.rbp);`); + e.emit(` if (++pos > frameMax) { frameMax = pos; if (pos > maxPos) maxPos = pos; }`); + e.emit(` let rhs = ${ruleFn}_pratt(info.rbp);`); + e.emit(` if (rhs < 0 && recovering) rhs = missRule(${rid});`); e.emit(` if (rhs >= 0) { scPush(rhs); lhs = finishWrap(${rid}, lhs, ledMark); matched = true; }`); e.emit(` else { pos = ledSaved; scn = ledMark; }`); e.emit(` }`); @@ -2230,7 +2244,8 @@ function emitPrattRule(e: Emitter, a: ReturnType, rule: RuleDecl e.emit(`function led_${sn}_${i}() {`); e.emit(` const _save = pos; const _sn = scn;`); e.emit(e.matchInto({ type: 'seq', items: led.items.slice(0, -1) } as RuleExpr, 'pos = _save; scn = _sn; return false;')); - e.emit(` const _rhs = ${ruleFn}_pratt(${lp.rhsBp});`); + e.emit(` let _rhs = ${ruleFn}_pratt(${lp.rhsBp});`); + e.emit(` if (_rhs < 0 && recovering) _rhs = missRule(${rid});`); e.emit(` if (_rhs < 0) { pos = _save; scn = _sn; return false; }`); e.emit(` scPush(_rhs);`); e.emit(` return true;`); @@ -2352,9 +2367,12 @@ function parseRuleEntry(idx, rid, name, core) { if (id >= 0) { // refresh the reused root's transient BUILD coordinates to the current stream // (its green internals are position-independent; only the attachment point — - // what the enclosing finishNode reads — must be current). + // what the enclosing finishNode reads — must be current). start can be tokN + // for a zero-width synthesized row minted AT EOF — toff(tokN) reads past the + // token columns (stale slots from a longer previous document), so use the + // same EOF guard offset() uses. absTok[id] = start; - absChar[id] = toff(start); + absChar[id] = start < tokN ? toff(start) : (tokN > 0 ? tend(tokN - 1) : 0); scPush(id); return true; } @@ -2417,6 +2435,7 @@ function parseRuleEntry(idx, rid, name, core) { suppressCur = prevSup; if (recKey >= 0) recRunning.delete(recKey); } + if (result < 0 && recovering) result = missRule(rid); if (!mySup && !capped) { if (me === undefined || me.length < tokN + 1) { me = new Array(tokN + 1); @@ -2776,6 +2795,10 @@ let recoverFree = false; // iteration-cap fallback: fire at any failure (still // zero-width elements (hooked elements are non-nullable — only synthesis can make // them zero-width). let probing = 0; +// Innermost ACTIVE optional-probe start (-1 = none). Synthesis inside an optional +// group is allowed only once the group consumed past this (committed) — failures +// of an uncommitted probe are ordinary "the optional thing isn't there". +let probeBase = -1; function missAt(p2) { for (let i = 0; i < recoverBars.length; i++) { const b = recoverBars[i]; @@ -2785,14 +2808,26 @@ function missAt(p2) { return false; } function missTok(t) { - if (probing !== 0 || recoverFree || !missAt(pos)) return false; + if (probing !== 0 || pos <= probeBase || recoverFree || !missAt(pos)) return false; const id = finishNode(RID_MISSING, scn); - rowStart[id] = t; // expected identity: >0 literal int, <0 named token kind. + rowStart[id] = t; // expected identity: >0 literal int, <0 named token kind, + // >= RULE_MISS_BASE a missing NONTERMINAL (rid offset). // A zero-kid row never dereferences its kids base, so the // slot is free storage. scPush(id); return true; } +// Missing-NONTERMINAL synthesis (the tsc "Expression expected" analog): a REQUIRED +// rule reference failing inside the bar window stands in as a zero-width $missing +// row carrying the rule identity. Same purity rules as missTok. Returns the node +// id (not pushed — call sites differ) or -1. +const RULE_MISS_BASE = 1 << 20; +function missRule(rid) { + if (probing !== 0 || pos <= probeBase || recoverFree || !missAt(pos)) return -1; + const id = finishNode(RID_MISSING, scn); + rowStart[id] = RULE_MISS_BASE + rid; + return id; +} // Monotone count of recovery FIRES (winning or losing arms alike): a rule whose // parse window saw any fire may have probed LESS than a strict parse would (the // fire ends a losing arm's exploration early), so its stored watermark cannot be @@ -2809,7 +2844,7 @@ let recFires = 0; function collectErrRows(id, charBase, tokBase) { if (rowRule[id] === RID_MISSING) { const t = rowStart[id]; - docPar.push({ offset: charBase, end: charBase, message: "expected '" + (t > 0 ? LIT_NAMES[t] : (K_NAMES[-t] ?? '?')) + "'" }); + docPar.push({ offset: charBase, end: charBase, message: t >= RULE_MISS_BASE ? 'expected ' + RULE_NAMES[t - RULE_MISS_BASE] : "expected '" + (t > 0 ? LIT_NAMES[t] : (K_NAMES[-t] ?? '?')) + "'" }); return; } if (rowRule[id] === RID_ERROR) { diff --git a/test/recovery.ts b/test/recovery.ts index 9c498f3..22fe8ef 100644 --- a/test/recovery.ts +++ b/test/recovery.ts @@ -125,6 +125,16 @@ const SYNTH: Array<[string, string[]]> = [ ['const x = f(1, 2;', ["16:expected ')'"]], ['function g() { return 1;', ["24:expected '}'"]], ['if (x { y(); }', ["6:expected ')'"]], + // missing NONTERMINALS (the tsc "Expression expected" analog): required rule + // refs failing inside the bar window mint a zero-width $missing carrying the + // rule identity — committed optionals ('= Expr' after the real '='), operator + // rhs, mixfix arms, and list elements after a real separator all synthesize + ['const a = ;', ['10:expected Expr']], + ['const x = a + ;', ['14:expected Expr']], + ['const a = -;', ['11:expected Expr']], + ['x ? y : ;', ['8:expected Expr']], + ['a, ;', ['3:expected Expr']], + ["f(1, ;", ["5:expected Expr", "5:expected ')'"]], ]; let synthN = 0; for (const [text, want] of SYNTH) { From 2245f0b20a00a48fa5e174115047506997c686d7 Mon Sep 17 00:00:00 2001 From: Johnson Chu Date: Thu, 11 Jun 2026 23:57:27 +0800 Subject: [PATCH 07/23] Broken-state edits go incremental: recovering adoption under bar purity MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Typing in a broken 9MB document drops from ~440ms to ~3-7ms per keystroke (avg 3.2ms over a 10-keystroke burst; incremental gate 9.9x vs fresh on its mixed valid/broken sessions). Recovery runs now ADOPT rows from the previous tree again — soundly this time, by making every recovery decision a pure function of the row's window: - recoverArmed takes (from, reach): a hook arms iff THE FAILING ELEMENT is stuck at a bar — its own frame-local probe reach (staged frameMax around hooked-loop elements) sits on the bar. The old form read the GLOBAL maxPos, so a frontier parked on a far bar could arm an unrelated loop whose own probes never approached it — a decision no window can reproduce. The runParse nets pass (pos, maxPos): top-level semantics unchanged. - barsWindowEq: a row adopts in a recovering run iff the bars inside its window [start, reach+2] are IDENTICAL (shifted) to the bars the build run saw there — with position-pure decisions, window text + window bars determine the frame's behavior completely, including losing-arm fires and synthesis. lastBars rides the document register set; strict trees carry [], free-fire trees null (free-fire is not bar-pure - never adopted while recovering). rowRM rows are adoptable under the predicate (the error region itself is what stays stable across far edits), and runExtend re-checks per member. The blanket adoption-off in the bar iteration and the lex-recovered first run is removed; attempt 0 (no bars) adopts exactly where the build run was also bar-free. The changed fire pattern exposed a latent message-derivation bug present in committed code: collectErrRows decoded a \$error row's first kid as a token leaf unconditionally, but the runParse leftover net builds a WRAPPER \$error whose kids are nodes ([partial-root, tail-error]) - (~nodeId)>>>2 indexed a garbage column, docText read text from an unrelated offset, and the two text layers (contiguous string vs pieces) resolved the garbage differently, which is how the gate caught it (equal trees, different messages). Wrapper-shaped \$error rows now fall through to the generic descent so the tail derives its message from its real first token. All equivalence gates green (incremental-grammars 672/672, incremental-verify 136 steps, multi-doc, recovery incl. synthesis pins 9/9), suite 33/33, perf-bench PASS, strict corpus parity intact. 9MB: fresh ~508ms, breaking keystroke ~409ms (the absorbed error region re-parses; recorded follow-up with fix-transition ~395ms), keystrokes while broken 3-13ms. --- src/emit-parser.ts | 103 ++++++++++++++++++++++++++++++--------------- 1 file changed, 69 insertions(+), 34 deletions(-) diff --git a/src/emit-parser.ts b/src/emit-parser.ts index ba611b2..df3dffe 100644 --- a/src/emit-parser.ts +++ b/src/emit-parser.ts @@ -979,7 +979,7 @@ class Emitter { // a missing Expr synthesizes (tsc list-element semantics). Same commitment // device as the optional-probe guard, staged inline (hot loop — no closure). const failFor = (beforeV: string, bsnV: string) => recFirst !== null - ? `const ${beforeV}_pb = probeBase; probeBase = pos; const ${beforeV}_ok = ${fn}(); probeBase = ${beforeV}_pb;\n if (!${beforeV}_ok) { if (!recovering || !recoverSkip(${csFn}, ${closerT})) break; continue; }\n if (recovering && pos === ${beforeV}) { scn = ${bsnV}; if (!recoverSkip(${csFn}, ${closerT})) break; continue; }` + ? `const ${beforeV}_pb = probeBase; probeBase = pos; const ${beforeV}_fm = frameMax; frameMax = pos; const ${beforeV}_ok = ${fn}(); probeBase = ${beforeV}_pb; const ${beforeV}_re = frameMax; if (${beforeV}_fm > frameMax) frameMax = ${beforeV}_fm;\n if (!${beforeV}_ok) { if (!recovering || !recoverSkip(${csFn}, ${closerT}, ${beforeV}, ${beforeV}_re)) break; continue; }\n if (recovering && pos === ${beforeV}) { scn = ${bsnV}; if (!recoverSkip(${csFn}, ${closerT}, ${beforeV}, ${beforeV}_re)) break; continue; }` : `const ${beforeV}_pb = probeBase; probeBase = pos; const ${beforeV}_ok = ${fn}(); probeBase = ${beforeV}_pb;\n if (!${beforeV}_ok) break;`; if (kind === '*') { const before = this.id(), bsn = this.id(); @@ -2385,7 +2385,9 @@ function parseRuleEntry(idx, rid, name, core) { : start >= adoptDmgOldEnd + adoptDelta ? start - adoptDelta : -1; if (q >= 0) { const aid = adoptSeek(q, rid); - if (aid >= 0) { + if (aid >= 0 && recovering && !barsWindowEq(start, q, rowExt[aid])) { + // bar context differs from the build run — parse this window for real + } else if (aid >= 0) { pos = start + rowTokLen[aid]; const ext = start + rowExt[aid]; if (ext > frameMax) { frameMax = ext; if (ext > maxPos) maxPos = ext; } @@ -2611,7 +2613,7 @@ function runParse(entryRule) { return er; } if (!RULES[entry]()) { - if (!recovering || !recoverArmed()) { + if (!recovering || !recoverArmed(pos, maxPos)) { const hasTok = pos < cap; throw new Error('Parse error at offset ' + (hasTok ? toff(pos) : 0) + ': unexpected ' + (hasTok ? "'" + tokTextAt(pos) + "'" : 'end of input') + farthest(pos)); } @@ -2623,7 +2625,7 @@ function runParse(entryRule) { scPush(finishNode(RID_ERROR, mark)); } if (pos < tokN) { - if (!recovering || !recoverArmed()) { + if (!recovering || !recoverArmed(pos, maxPos)) { throw new Error('Parse error at offset ' + toff(pos) + ": unexpected '" + tokTextAt(pos) + "' after successful parse" + farthest(pos)); } // absorb the unconsumed tail and WRAP [root, tail] — only non-repetition entry @@ -2716,7 +2718,7 @@ function adoptSeek(q, rid) { let xid = e, xb = cb; for (;;) { if (rowOK[xid] !== 0 && rowRule[xid] === rid - && rowRM[xid] === 0 + && (recovering || rowRM[xid] === 0) && (q + rowExt[xid] + 2 <= adoptDmgStart || q >= adoptDmgOldEnd)) { return xid; } @@ -2848,12 +2850,19 @@ function collectErrRows(id, charBase, tokBase) { return; } if (rowRule[id] === RID_ERROR) { - if (rowCount[id] > 0) { - const fe = kids[rowStart[id]]; + const fe = rowCount[id] > 0 ? kids[rowStart[id]] : 0; + if (fe < 0) { + // plain absorb: kids are raw tokens — the message quotes the first one const ft = tokBase + ((~fe) >>> 2); docPar.push({ offset: charBase, end: charBase + rowLen[id], message: "unexpected '" + docText(toff(ft), tend(ft)) + "'" }); + return; } - return; + // WRAPPER shape (the runParse leftover net wraps [partial-root, tail-$error]): + // the first kid is a NODE — decoding it as a token leaf reads a garbage column + // (the message then quotes text from an unrelated offset, and differently per + // text layer). Fall through to the generic descent: each kid derives its own + // diagnostics, the tail $error quoting its real first token. + if (rowCount[id] === 0) return; } const cs = rowStart[id], n = rowCount[id]; for (let i = 0; i < n; i++) { @@ -2887,23 +2896,45 @@ function rebuildDiagView() { // repetition ends PAST a bar stay silent (pos > bar), and the runParse safety net // obeys the same discipline (an ungated net would absorb on the FIRST bar-less // attempt and pre-empt the whole iteration). -function recoverArmed() { +// Bar list that built lastRoot (that run's token coords); null = free-fire built +// (free-fire decisions are not bar-pure — such a tree is never adoptable while +// recovering). Strict trees carry []. +let lastBars = []; +// A row replays identically in a recovering run iff its window sees the SAME bars +// (shifted) the build run saw there — every recovery decision (hook arming, +// missTok/missRule, the cycle sentinel) is position-pure, so window text + window +// bars determine the frame's behavior completely. +function barsWindowEq(s, q, ext) { + if (lastBars === null) return false; + const hiN = s + ext + 2, hiO = q + ext + 2; + let i = 0, j = 0; + while (i < recoverBars.length && recoverBars[i] < s) i++; + while (j < lastBars.length && lastBars[j] < q) j++; + for (;;) { + const a = i < recoverBars.length && recoverBars[i] <= hiN ? recoverBars[i] - s : -1; + const b = j < lastBars.length && lastBars[j] <= hiO ? lastBars[j] - q : -1; + if (a !== b) return false; + if (a === -1) return true; + i++; j++; + } +} +function recoverArmed(from, reach) { + // armed iff THE FAILING ELEMENT is stuck at a bar: it starts at/before the bar + // and its OWN farthest probe sits ON it (+2 read slack). The reach is the + // element's frame-local watermark, NOT the global maxPos — a global frontier + // parked on a far bar must not arm unrelated loops (position-PURITY: every + // recovery decision inside a row is a function of the row's window text and + // the bars inside that window, which is what makes recovering adoption sound). if (recoverFree) return true; for (let i = 0; i < recoverBars.length; i++) { const b = recoverBars[i]; - // armed iff parsing is STUCK AT the bar right now: the failing element starts - // at/before it and the farthest probe sits ON it (+2 read slack). maxPos is - // globally monotone, so without the upper window every loop at pos <= bar - // would arm once anything ever probed past the bar (measured: a fire at - // pos=214 absorbing 8000 tokens). Once a fire absorbs past the bar, maxPos - // leaves the window and lower loops stay silent. - if (pos <= b && b <= maxPos && maxPos <= b + 2) return true; - if (b > maxPos) break; + if (from <= b && b <= reach && reach <= b + 2) return true; + if (b > reach) break; } return false; } -function recoverSkip(canStart, closerT) { - if (!recoverArmed()) return false; +function recoverSkip(canStart, closerT, from0, reach) { + if (!recoverArmed(from0, reach)) return false; if (pos >= cap) return false; if (closerT >= 0 && tkK[pos] === K_PUNCT && tkT[pos] === closerT) return false; const mark = scn; @@ -2946,7 +2977,8 @@ function runExtend(rid) { if (e < 0) break; if (pb + ktr(P, i) !== oq) break; if (rowRule[e] !== rid || rowOK[e] === 0) break; - if (rowRM[e] !== 0) break; + if (!recovering && rowRM[e] !== 0) break; + if (recovering && !barsWindowEq(nq, oq, rowExt[e])) break; const tl = rowTokLen[e]; if (tl === 0) break; const ex = rowExt[e]; @@ -3333,7 +3365,7 @@ function saveDoc(d) { d.docDiags = docDiags; d.docLex = docLex; d.docPar = docPar; d.docPieces = docPieces; d.docPieceOff = docPieceOff; d.docLen = docLen; d.docFlat = docFlat; d.docCur = docCur; d.rootCharBase = rootCharBase; d.rootTokBase = rootTokBase; - d.lastRoot = lastRoot; d.lastRootTok = lastRootTok; + d.lastRoot = lastRoot; d.lastRootTok = lastRootTok; d.lastBars = lastBars; ${e.soa ? ' d.parenCachePos = parenCachePos; d.parenCacheStack = parenCacheStack;' : ''} d.altK = altK; d.altT = altT; d.altOff = altOff; d.altEnd = altEnd; d.altFl = altFl; d.altDp = altDp; d.altPd = altPd; d.altCap = altCap; d.altN = altN; @@ -3351,7 +3383,7 @@ function loadDoc(d) { docDiags = d.docDiags; docLex = d.docLex; docPar = d.docPar; docPieces = d.docPieces; docPieceOff = d.docPieceOff; docLen = d.docLen; docFlat = d.docFlat; docCur = d.docCur; rootCharBase = d.rootCharBase; rootTokBase = d.rootTokBase; - lastRoot = d.lastRoot; lastRootTok = d.lastRootTok; + lastRoot = d.lastRoot; lastRootTok = d.lastRootTok; lastBars = d.lastBars; ${e.soa ? ' parenCachePos = d.parenCachePos; parenCacheStack = d.parenCacheStack;' : ''} altK = d.altK; altT = d.altT; altOff = d.altOff; altEnd = d.altEnd; altFl = d.altFl; altDp = d.altDp; altPd = d.altPd; altCap = d.altCap; altN = d.altN; @@ -3444,6 +3476,7 @@ function totalNet(e) { const root = finishNode(RID_ERROR, 0); lastRoot = root; lastRootTok = 0; + lastBars = null; rootCharBase = 0; rootTokBase = 0; return root; @@ -3679,6 +3712,7 @@ ${e.soa ? String.raw` // ── M1: WINDOWED re-lex ── rootTokBase = adoptRootTok; lastRoot = sroot; lastRootTok = adoptRootTok; + lastBars = []; shiftDiags(cs, ceOld, charDelta); return sroot; } @@ -3690,10 +3724,6 @@ ${e.soa ? String.raw` // ── M1: WINDOWED re-lex ── // iteration. Lex diagnostics are re-seeded into every attempt (the window was // lexed once; only the parse re-runs). const lexRecovered = recovering; - // a lex-recovered first run IS a recovery run — adoption stays off for the - // same reason as in the bar iteration below (and rowRM rows would otherwise - // replay the OLD text's recovery shape as a fake strict success) - if (lexRecovered) adoptRoot = -1; const lexSnap = docLex.slice(); try { root = runParse(entryRule); @@ -3703,22 +3733,22 @@ ${e.soa ? String.raw` // ── M1: WINDOWED re-lex ── // is valid) survive with their shifted positions docPar.length = 0; rebuildDiagView(); + lastBars = []; } else { lastRoot = root; lastRootTok = rootTokBase; + lastBars = []; settleDiags(); } recovering = false; } catch (e) { // total edit: re-run the SAME spliced stream under the bar discipline. - // Adoption is OFF for every recovery run: bars are minted from each failed - // run's maxPos, and a row recorded under a recovering frame carries that - // run's bar-dependent probe reach — replaying it would make the next bar a - // function of the OLD bar history instead of (text, bars). Attempt 0 runs - // with no bars (behaviorally strict, adoption-free) and re-derives the true - // strict frontier, so every attempt is byte-equal to the fresh side's. + // Adoption stays LIVE under the bars-window predicate: a row whose window + // saw the same (shifted) bars in the build run replays identically — all + // recovery decisions are position-pure — so each attempt is byte-equal to + // the fresh side's while reusing every row whose bar context matches. + // Attempt 0 (no bars) adopts only where the build run was also bar-free. recovering = true; - adoptRoot = -1; const bars = []; let done = false; try { @@ -3734,6 +3764,7 @@ ${e.soa ? String.raw` // ── M1: WINDOWED re-lex ── scn = 0; root = runParse(entryRule); done = true; + lastBars = bars.slice(); } catch (e2) { let b = maxPos; if (bars.length > 0 && b <= bars[bars.length - 1]) b = bars[bars.length - 1] + 1; @@ -3742,6 +3773,7 @@ ${e.soa ? String.raw` // ── M1: WINDOWED re-lex ── } if (!done) { recoverFree = true; + lastBars = null; try { docLex.length = 0; for (let i = 0; i < lexSnap.length; i++) docLex.push(lexSnap[i]); @@ -3810,6 +3842,7 @@ export function createParser() { let root; try { root = parseCore(source, entryRule); + lastBars = []; } catch (e) { // total parse: the strict pass rejected — iterate recovery under the bar // discipline (see recoverBars); the iteration cap degrades to free-fire, @@ -3825,6 +3858,7 @@ export function createParser() { recoverBars = bars; root = parseCore(source, entryRule); done = true; + lastBars = bars.slice(); } catch (e2) { let b = maxPos; if (bars.length > 0 && b <= bars[bars.length - 1]) b = bars[bars.length - 1] + 1; @@ -3833,7 +3867,8 @@ export function createParser() { } if (!done) { recoverFree = true; - adoptRoot = -1; // free-fire decisions are non-local: adoption would desync + lastBars = null; + adoptRoot = -1; // free-fire decisions are non-local: adoption would desync try { docLex.length = 0; root = parseCore(source, entryRule); From ee1890d74653ffecf7a9aad7354e4e034249fbdb Mon Sep 17 00:00:00 2001 From: Johnson Chu Date: Fri, 12 Jun 2026 01:48:33 +0800 Subject: [PATCH 08/23] Cross-attempt memo survival: bar-free windows are context-free Recovery attempts within one sequence parse the same token stream under a monotonically growing bar list, so a memo entry from an earlier attempt is provably valid in a later one when its probe window [start, mx+2] contains no bars: no bars means no synthesis and no skip arming, and the opened dispatch guards only add non-consuming probes - the frame behaved strictly, a pure function of the window text. The one exception is the recRunning cycle refusal, which can fire without synthesis (open guards let a ref chain cycle at one position) and depends on which frames are on the stack. recRunning now maps each frame to an entry serial; a refusal leaning on a frame entered before the current one taints the current frame's memo entry (stamped -memoGenCur: reusable only in its own generation, and propagating the taint to whoever reuses it). This is the diagnosed hole that sank the first survival attempt. Survival is edit-side only: the fresh-parse attempt loop calls parseCore, which resets the arena cursor per attempt, so an earlier attempt's rows are clobbered there. A mid-parse '>'-splice disables survival for the rest of the sequence (pre-split positions can't be revalidated). Also removes recFires (dead since the rowExt write-back subsumed the recFires stamp). 9MB transitions: breaking 335ms -> 157ms, fixing 230ms -> 146ms (both now lexer-bound); while-broken typing 3.4ms unchanged. All equivalence gates green: incremental-grammars 672/672, incremental-verify 136, multi-doc 60, recovery pins 9/9, check 33/33, emit-parser corpus parity 401/401. --- src/emit-parser.ts | 101 +++++++++++++++++++++++++++++++++++---------- 1 file changed, 80 insertions(+), 21 deletions(-) diff --git a/src/emit-parser.ts b/src/emit-parser.ts index df3dffe..fdfdb6f 100644 --- a/src/emit-parser.ts +++ b/src/emit-parser.ts @@ -1884,6 +1884,8 @@ function matchPuLitGT(pu) { if (parseLimit < 0) cap = tokN; // Token indices shifted: drop the per-rule memo arrays (recreated lazily at the new size). memoGenCur++; // positions shifted mid-parse: every stamped entry is stale + memoRecFloor = 0x7fffffff; // including across attempts: pre-split positions + // can never be revalidated against the new stream // GREEN tree: no kids/scratch fixup — every completed row and scratch entry lies // wholly BEFORE the splice point (token pos is being consumed right now), and the // carried memo was just cleared, so nothing reachable references shifted indices. @@ -2343,7 +2345,6 @@ function parseRuleEntry(idx, rid, name, core) { suppressNext = null; const capped = parseLimit >= 0; const start = pos; - const rf0 = recFires; // Capture the arrays together: a '>'-splice inside core() detaches them via // fill(undefined), and the store below must then write into the DETACHED arrays // (i.e. be discarded), exactly like the old per-rule Map did. @@ -2351,9 +2352,19 @@ function parseRuleEntry(idx, rid, name, core) { let mn = memoNode[idx]; let mx = memoExt[idx]; let mg = memoGen[idx]; - if (!mySup && !capped && me !== undefined && mg[start] === memoGenCur) { + const mgs = me !== undefined ? mg[start] : 0; + // Entry validity: its own generation (negative = cycle-tainted, own-generation + // only, and whoever reuses it inherits the taint), or — across recovery attempts + // of one sequence — any earlier attempt's entry whose probe window is bar-free + // (strict, context-free behavior; see memoRecFloor) and untainted. + if (!mySup && !capped && me !== undefined && (mgs === memoGenCur + || (recovering && (mgs === -memoGenCur + || (mgs >= memoRecFloor && mgs < memoGenCur && !recoverFree && barFreeWin(start, mx[start])))))) { const e = me[start]; if (e !== undefined) { + if (mgs !== memoGenCur) { + if (mgs < 0) cycleMinSerial = 0; else mg[start] = memoGenCur; + } pos = e; // The jump SEMANTICALLY reads everything the stored parse read: keep the advance // watermark ≥ the entry's watermark, or an ENCLOSING rule that completes right @@ -2418,10 +2429,18 @@ function parseRuleEntry(idx, rid, name, core) { } } let recKey = -1; + let mySerial = 0; if (recovering) { recKey = idx * (tokN + 1) + start; - if (recRunning.has(recKey)) return false; - recRunning.add(recKey); + const rs = recRunning.get(recKey); + if (rs !== undefined) { + // PEG cycle refusal — record which frame it leans on: every open frame + // entered after that one now holds a context-dependent partial result. + if (rs < cycleMinSerial) cycleMinSerial = rs; + return false; + } + mySerial = ++recSerial; + recRunning.set(recKey, mySerial); } const prevContext = currentPrattContext; currentPrattContext = name; @@ -2429,6 +2448,8 @@ function parseRuleEntry(idx, rid, name, core) { suppressCur = mySup; const fm0 = frameMax; frameMax = start; + const cm0 = cycleMinSerial; + if (recKey >= 0) cycleMinSerial = 0x7fffffff; let result; try { result = core(0); @@ -2437,6 +2458,14 @@ function parseRuleEntry(idx, rid, name, core) { suppressCur = prevSup; if (recKey >= 0) recRunning.delete(recKey); } + let tainted = false; + if (recKey >= 0) { + // Tainted iff some cycle refusal inside this frame leaned on an ancestor of + // the frame itself (entered strictly before it). Fold the minimum outward: + // a refusal that taints this frame taints every enclosing one too. + tainted = cycleMinSerial < mySerial; + if (cm0 < cycleMinSerial) cycleMinSerial = cm0; + } if (result < 0 && recovering) result = missRule(rid); if (!mySup && !capped) { if (me === undefined || me.length < tokN + 1) { @@ -2451,9 +2480,9 @@ function parseRuleEntry(idx, rid, name, core) { } me[start] = pos; mn[start] = result; - mx[start] = frameMax; - mg[start] = memoGenCur; // the TRUE probe watermark — the +2 read slack (stop token, - // SECOND-token dispatch) is applied at INVALIDATION time + mx[start] = frameMax; // the TRUE probe watermark — the +2 read slack (stop token, + // SECOND-token dispatch) is applied at INVALIDATION time + mg[start] = tainted ? -memoGenCur : memoGenCur; if (result >= 0) { rowOK[result] = 1; // The row's OWN watermark freezes at finishNode — for a Pratt rule that is @@ -2599,6 +2628,8 @@ function runParse(entryRule) { maxPos = 0; frameMax = 0; recRunning.clear(); + recSerial = 0; + cycleMinSerial = 0x7fffffff; parseLimit = -1; cap = tokN; currentPrattContext = null; @@ -2776,13 +2807,41 @@ function lexMsg(g) { // pass re-runs (adoption keeps re-runs cheap). Bars are text-determined, so fresh // and incremental recovering parses are byte-identical by construction. let recoverBars = []; -// (rule, pos) frames currently ON THE STACK during a recovering run. Token -// synthesis makes zero-width matches possible, so a rule can re-enter itself at -// the SAME position through a synthesized leading token — an unbounded recursion -// no grammar check can rule out. A re-entered (rule, pos) frame fails (PEG cycle -// semantics): only zero-width synthesis can build such a cycle, so a real parse -// never sees the refusal. Strict runs never consult this (zero hot-path cost). -const recRunning = new Set(); +// (rule, pos) frames currently ON THE STACK during a recovering run, keyed to +// their entry SERIAL. Token synthesis makes zero-width matches possible, so a rule +// can re-enter itself at the SAME position through a synthesized leading token — +// an unbounded recursion no grammar check can rule out. A re-entered (rule, pos) +// frame fails (PEG cycle semantics). Recovering runs also open the first-token +// dispatch guards, so a guard-free ref chain can cycle at one position WITHOUT any +// synthesis — the refusal then depends on which frames are on the stack, i.e. the +// failing result is a function of the frame's ANCESTORS, not of the text alone. +// Strict runs never consult this (zero hot-path cost). +const recRunning = new Map(); +let recSerial = 0; +// Minimum entry-serial referenced by any cycle refusal during the current frame's +// core (0x7fffffff = none). A refusal leaning on a frame entered BEFORE the current +// one (serial < the frame's own) taints the frame: its memo entry is valid only +// where the same ancestors are guaranteed — within its own generation — never +// across attempts. Internal cycles (both ends inside the frame) replay from the +// window text alone and do not taint. +let cycleMinSerial = 0x7fffffff; +// First memo generation of the CURRENT recovery attempt sequence (0x7fffffff = +// none active). Attempts in one sequence parse the SAME token stream under a +// monotonically growing bar list, so an entry from an earlier attempt is valid in +// a later one iff its probe window saw NO bars — no bars means no synthesis and no +// skip arming (both require a window bar), and the open dispatch guards only add +// non-consuming probes, so the frame behaved strictly: a pure function of the +// window text, stable under any bar list that stays out of the window. +let memoRecFloor = 0x7fffffff; +function barFreeWin(s, m) { + const hi = m + 2; + for (let i = 0; i < recoverBars.length; i++) { + const b = recoverBars[i]; + if (b > hi) break; + if (b >= s) return false; + } + return true; +} let recoverFree = false; // iteration-cap fallback: fire at any failure (still deterministic) // Missing-token synthesis (the tsc parseExpected analog): at a bar-adjacent failure // of a REQUIRED literal/token match, materialize a zero-width $missing row instead @@ -2830,12 +2889,6 @@ function missRule(rid) { rowStart[id] = RULE_MISS_BASE + rid; return id; } -// Monotone count of recovery FIRES (winning or losing arms alike): a rule whose -// parse window saw any fire may have probed LESS than a strict parse would (the -// fire ends a losing arm's exploration early), so its stored watermark cannot be -// trusted by a STRICT adoption — rowRM marks it (structural error containment is -// propagated separately at finishNode). -let recFires = 0; // Collect $error rows under an adopted recovery-made subtree: offset/end from the // row spans, the message re-derived from the first absorbed token — byte-identical @@ -2948,7 +3001,6 @@ function recoverSkip(canStart, closerT, from0, reach) { scPush(~(pos << 2)); pos++; } if (pos > frameMax) { frameMax = pos; if (pos > maxPos) maxPos = pos; } - recFires++; scPush(finishNode(RID_ERROR, mark)); return true; } @@ -3751,6 +3803,8 @@ ${e.soa ? String.raw` // ── M1: WINDOWED re-lex ── recovering = true; const bars = []; let done = false; + memoRecFloor = memoGenCur + 1; // attempts share the stream: bar-free-window + // entries survive across them (see decl) try { for (let attempt = 0; attempt < 32 && !done; attempt++) { try { @@ -3792,6 +3846,7 @@ ${e.soa ? String.raw` // ── M1: WINDOWED re-lex ── } finally { recovering = false; recoverBars = []; + memoRecFloor = 0x7fffffff; } lastRoot = root; lastRootTok = rootTokBase; @@ -3851,6 +3906,9 @@ export function createParser() { recovering = true; const bars = []; let done = false; + // NO cross-attempt survival here: parseCore resets the arena cursor per + // attempt (only parseEdited carries it), so an earlier attempt's rows are + // clobbered — a surviving entry would point at overwritten rows. try { for (let attempt = 0; attempt < 32 && !done; attempt++) { try { @@ -3881,6 +3939,7 @@ export function createParser() { } finally { recovering = false; recoverBars = []; + memoRecFloor = 0x7fffffff; } settleDiags(); } From b37e1ccbd0d55e88744a7939e9b0f71f7d775492 Mon Sep 17 00:00:00 2001 From: Johnson Chu Date: Fri, 12 Jun 2026 02:12:59 +0800 Subject: [PATCH 09/23] Conditional lexer resync: depth-shift adoption kills the transition cliff The window relex resynced only on exact stack-depth equality, so an edit that changes paren balance shifts the entire suffix's absolute depth column and the window regrows to EOF - a 9MB document paid ~130ms of relexing on every break/fix transition for a one-token depth shift. The resync now has two sufficient conditions, both proven from observable state (template stacks empty on both sides; candidate token carries no cross-token lexer flag a successor reads): - FAST (O(1)): equal depth and neither lex dipped below it since the divergence point (damage start) - every open entry is then common to both lexes, the stacks are content-equal, and every future pop behaves identically. Trajectory minimums are folded incrementally (old side seeded from the damage-interior tokens, new side tracked per push). - SHIFTED: the old suffix never pops an entry open at the candidate (lazy suffix-min over the old depth records, pop-on-empty = -1): no open entry's head-ness is ever read again, stack contents are irrelevant, and the depths may differ by an arbitrary shift. The splice then re-bases the adopted tkPd column by the shift, restoring true absolute depths ('(' head bits are local facts of their own neighbors and stay valid). This also closes four latent unsoundness classes in the old equality path: a resync candidate that is a postfix-ambiguous op, control keyword, '(' or ')' lets the adopted successor read state derived from tokens the window re-lexed differently; and template-depth equality cannot prove the mutable interp brace counters equal (resync inside templates now waits for depth 0). Each slides the resync at most a few tokens. 9MB transitions: breaking 157ms -> 5.8ms, fixing 146ms -> 2.9ms; valid keystroke 1.8ms -> 1.1ms; while-broken typing 3.4ms -> ~2ms. Gates: lexer parity 5695 diff=0, incremental-grammars 672/672, incremental-verify 136, multi-doc 60, recovery pins 9/9, check 33/33, corpus parity 401/401, perf-bench worst 472ms. --- src/emit-lexer.ts | 81 +++++++++++++++++++++++++++++++++++++--------- src/emit-parser.ts | 18 +++++++++++ 2 files changed, 84 insertions(+), 15 deletions(-) diff --git a/src/emit-lexer.ts b/src/emit-lexer.ts index c336b37..4a9832c 100644 --- a/src/emit-lexer.ts +++ b/src/emit-lexer.ts @@ -110,6 +110,28 @@ export function emitLexer(grammar: CstGrammar, st: LexerSymtab): string | null { emit(`let lexWindowMore = false;`); emit(`let lexSrcBase = 0;`); emit(`let lexDiagBase = 0; // docLex floor for the current window (its own emissions sit above)`); + emit(`// Shifted-resync support: lexResyncPd is the paren-depth delta between the live`); + emit(`// stack and the old record at the adopted suffix's first token (the splice adds`); + emit(`// it to every adopted tkPd, restoring true absolute depths). altSuffMin[j] =`); + emit(`// min paren depth recorded over the old suffix [j, altN) (pop-on-empty = -1),`); + emit(`// built lazily once per edit (the caller nulls it when the alt stream changes).`); + emit(`let lexResyncPd = 0;`); + emit(`let altSuffMin = null;`); + emit(`let altSuffMinBuf = null;`); + emit(`// Min OLD-stream paren depth over the tokens inside the damage itself (set by the`); + emit(`// caller before the window lex): the old-side trajectory min starts from here.`); + emit(`let wndOldMin0 = 0x7fffffff;`); + emit(`function buildAltSuffMin(lo) {`); + emit(` if (altSuffMinBuf === null || altSuffMinBuf.length < altN + 1) altSuffMinBuf = new Int32Array(altN + 1025);`); + emit(` altSuffMin = altSuffMinBuf;`); + emit(` altSuffMin[altN] = 0x7fffffff;`); + emit(` for (let j = altN - 1; j >= lo; j--) {`); + emit(` let d = altPd[j];`); + emit(` if (d === 0 && altK[j] === K_PUNCT && altT[j] === ${tOf(')')} && (j === 0 || altPd[j - 1] === 0)) d = -1;`); + emit(` const nx = altSuffMin[j + 1];`); + emit(` altSuffMin[j] = d < nx ? d : nx;`); + emit(` }`); + emit(`}`); emit(`const LX_UNI_IDENT = /[$_\\p{ID_Start}][$\\u200c\\u200d\\p{ID_Continue}]*/uy;`); emit(`const LX_UNI_CONT = /[$\\u200c\\u200d\\p{ID_Continue}]+/uy;`); emit(`const LX_UNI_FULL = /^[$_\\p{ID_Start}][$\\u200c\\u200d\\p{ID_Continue}]*/u;`); @@ -127,6 +149,7 @@ export function emitLexer(grammar: CstGrammar, st: LexerSymtab): string | null { !first || [...first.ascii].some(cc => kwFirstCcs.has(cc)); // keywords are ASCII-initial const kIdent = identTokenName ? kOf(identTokenName) : 0; const tRParen = tOf(')'); + const tLParen = tOf('('); emit(``); // ── Baked keyword recognizer over a SOURCE SPAN: t-intern with no slice and no hash. // Length window → first-charCode switch → per-keyword compare chains (shortest first); @@ -245,12 +268,11 @@ export function emitLexer(grammar: CstGrammar, st: LexerSymtab): string | null { emit(` const parenHeadStack = initParens !== undefined && initParens !== null ? initParens : [];`); emit(` let wndPtr = wndPtr0;`); emit(` let wndHit = -1;`); - emit(` // stack depths as of the last token fully BEFORE the damage: a resync point may`); - emit(` // sit at any depth as long as every bracket still open there was opened before`); - emit(` // the damage (the prefix agrees byte-for-byte, so those stack entries agree too;`); - emit(` // anything opened inside the damage could differ in control-head-ness).`); - emit(` let dmgDp = -1, dmgPd = -1;`); - emit(` let lastDp = templateStack.length, lastPd = parenHeadStack.length;`); + emit(` // Trajectory minimums since the point the two lexes diverge (the damage start;`); + emit(` // before it, identical bytes from an identical anchor state give identical`); + emit(` // tokens and stack ops). An entry at depth <= BOTH mins was open at the`); + emit(` // divergence point in both lexes - i.e. it is the SAME entry.`); + emit(` let dmgMinOld = wndOldMin0, dmgMinNew = -1;`); emit(` function tkPush(k, t, off, end) {`); emit(` off += srcBase; end += srcBase;`); emit(` if (tokN === tkCap) growTok();`); @@ -262,17 +284,46 @@ export function emitLexer(grammar: CstGrammar, st: LexerSymtab): string | null { emit(` pendingNl = false;`); emit(` pvK = k; pvT = t;`); emit(` tokN++;`); + emit(` // Resync: adopt the OLD suffix from this aligned token on. Sound iff the old`); + emit(` // suffix's lexing is reproducible from OBSERVABLE state alone. Always required:`); + emit(` // - both template stacks EMPTY (an entry's brace counter is mutable state no`); + emit(` // record captures - depth equality cannot prove counters equal);`); + emit(` // - the candidate carries no cross-token flag its adopted successor reads`); + emit(` // (postfix-ambiguous op / control keyword / '(' / ')' each make the NEXT`); + emit(` // token's lexing depend on tokens BEFORE the candidate, which the window`); + emit(` // may have re-derived differently than the old stream had them).`); + emit(` // Then either of two sufficient paren-stack conditions:`); + emit(` // - FAST: equal depth, never dipped below it since the divergence point on`); + emit(` // either side - every open entry is then pre-divergence-common, the stacks`); + emit(` // are content-EQUAL, and all future pops behave identically; or`); + emit(` // - SHIFTED: the old suffix never pops an entry that is open at the candidate`); + emit(` // (suffix min depth >= candidate depth, a pop-on-empty counted as -1): no`); + emit(` // open entry's head-ness is ever read again, so the contents are irrelevant`); + emit(` // and the depths may differ by an arbitrary shift - the caller re-bases the`); + emit(` // adopted tkPd column by lexResyncPd to the new truth.`); emit(` if (wndPtr >= 0) {`); - emit(` if (dmgPd < 0) {`); - emit(` if (off >= wndCs) { dmgDp = lastDp; dmgPd = lastPd; }`); - emit(` else { lastDp = tkDp[tokN - 1]; lastPd = tkPd[tokN - 1]; }`); - emit(` }`); - emit(` if (off >= wndMinOff && dmgPd >= 0`); - emit(` && templateStack.length <= dmgDp && parenHeadStack.length <= dmgPd) {`); - emit(` while (wndPtr < altN && (altOff[wndPtr] < 0 ? altOff[wndPtr] + srcLenP1 : altOff[wndPtr]) + wndDelta < off) wndPtr++;`); + emit(` const pd = tkPd[tokN - 1];`); + emit(` if (dmgMinNew < 0) { if (off >= wndCs) dmgMinNew = pd; }`); + emit(` else if (pd < dmgMinNew) dmgMinNew = pd;`); + emit(` if (off >= wndMinOff) {`); + emit(` while (wndPtr < altN && (altOff[wndPtr] < 0 ? altOff[wndPtr] + srcLenP1 : altOff[wndPtr]) + wndDelta < off) { if (altPd[wndPtr] < dmgMinOld) dmgMinOld = altPd[wndPtr]; wndPtr++; }`); emit(` if (wndPtr < altN && (altOff[wndPtr] < 0 ? altOff[wndPtr] + srcLenP1 : altOff[wndPtr]) + wndDelta === off && altK[wndPtr] === k && altT[wndPtr] === t`); - emit(` && (altEnd[wndPtr] < 0 ? altEnd[wndPtr] + srcLenP1 : altEnd[wndPtr]) + wndDelta === end && altDp[wndPtr] === templateStack.length && altPd[wndPtr] === parenHeadStack.length) {`); - emit(` wndHit = wndPtr;`); + emit(` && (altEnd[wndPtr] < 0 ? altEnd[wndPtr] + srcLenP1 : altEnd[wndPtr]) + wndDelta === end`); + emit(` && templateStack.length === 0 && altDp[wndPtr] === 0`); + emit(` && LX_PFXV[t] === 0 && LX_PARENKW[t] === 0`); + emit(` && !(k === K_PUNCT && (t === ${tLParen} || t === ${tRParen}))) {`); + emit(` const q = altPd[wndPtr];`); + emit(` if (q < dmgMinOld) dmgMinOld = q;`); + emit(` if (q === pd && pd <= dmgMinOld && pd <= dmgMinNew) {`); + emit(` wndHit = wndPtr;`); + emit(` lexResyncPd = 0;`); + emit(` } else {`); + emit(` if (altSuffMin === null) buildAltSuffMin(wndPtr0);`); + emit(` if (altSuffMin[wndPtr + 1] >= q) {`); + emit(` wndHit = wndPtr;`); + emit(` lexResyncPd = pd - q;`); + emit(` }`); + emit(` }`); emit(` }`); emit(` }`); emit(` }`); diff --git a/src/emit-parser.ts b/src/emit-parser.ts index fdfdb6f..30e251a 100644 --- a/src/emit-parser.ts +++ b/src/emit-parser.ts @@ -3599,6 +3599,16 @@ ${e.soa ? String.raw` // ── M1: WINDOWED re-lex ── { let lo = 0, hi = oN; while (lo < hi) { const mid = (lo + hi) >> 1; if (toff(mid) < ceOld) lo = mid + 1; else hi = mid; } r0 = lo; } + // Old-side trajectory floor across the damage itself: min recorded paren depth of + // the OLD tokens inside [damage start, damage end) - the lexes diverge at the + // damage start, and the resync's fast tier needs the old min from that point on. + { + let lo = 0, hi = r0; + while (lo < hi) { const mid = (lo + hi) >> 1; if (toff(mid) < cs) lo = mid + 1; else hi = mid; } + let m = 0x7fffffff; + for (let i = lo; i < r0; i++) if (tkPd[i] < m) m = tkPd[i]; + wndOldMin0 = m; + } // Lex the window into the spare buffers (the old stream stays live for resync). if (altK === null || altCap < tkCap) { altK = new tkK.constructor(tkCap); altT = new tkT.constructor(tkCap); @@ -3607,6 +3617,7 @@ ${e.soa ? String.raw` // ── M1: WINDOWED re-lex ── altCap = tkCap; } altN = oN; + altSuffMin = null; // the old-suffix min-depth cache follows the alt stream swapBuffers(); // live = scratch, alt = OLD stream tokN = 0; const startOff = B >= 0 ? (altEnd[B] < 0 ? altEnd[B] + srcLenP1 : altEnd[B]) : 0; @@ -3706,6 +3717,13 @@ ${e.soa ? String.raw` // ── M1: WINDOWED re-lex ── negFrom = B + 1 + W; srcLenP1 = newLen + 1; tokN = nN; + // a SHIFTED resync adopted the suffix at a different absolute paren depth: re-base + // the adopted depth records to the new truth ('(' head bits are unchanged - an + // entry's head-ness is a local fact of its own neighbors) + if (R0 >= 0 && lexResyncPd !== 0) { + for (let i = B + 1 + W; i < nN; i++) tkPd[i] += lexResyncPd; + lexResyncPd = 0; + } const nN2 = nN;` : String.raw` // (fallback-lexer grammars keep the full-relex + token-diff path) const oK = tkK, oT = tkT, oOff = tkOff, oEnd = tkEnd, oFl = tkFl, oN = tokN; const oText = tkText; From 4248105f06909b5651be54e6fbda9c910be80593 Mon Sep 17 00:00:00 2001 From: Johnson Chu Date: Fri, 12 Jun 2026 02:32:59 +0800 Subject: [PATCH 10/23] Recovering surgery: bar-clear splices keep the error tree incremental trySurgery refused any tree containing recovery rows (rowRM root). It now accepts them when the edit provably commutes with every recovery decision: decisions are position-pure functions of (window text, window bars), so a splice is sound when no bar window touches the damage or the re-parsed span's probe reach - kept rows replay identically at shifted positions, and a fresh recovering parse behaves strictly across the span, exactly like the strict re-parse the surgery runs (a fire inside the span would need a bar at/below the probe reach + 2; prefix attempts use prefixes of the same bar list, so one check against the final list covers every attempt). The spliced tree keeps its bar list with suffix bars shifted by the token delta; bars adjacent to the damage (unmappable) and free-fire trees (lastBars null, not window-pure) refuse. The multi-doc gate immediately caught a latent length bug this exposed: finishNode takes a node's char end from its LAST KID, which a trailing zero-width $missing row pushes past the last real token - but surgery re-derived ancestor lengths from the token columns, clipping that extension. A node whose token end lies strictly beyond the damage now keeps its end shape (rowLen += chrD: every end-determining coordinate sits in the shifted suffix); only nodes ending at/inside the damage use the token derivation (no zero-width row can end them - zero-width rows live at bars, and damage-adjacent bars were refused). Strict trees take either branch to the same value. 9MB while-broken typing now sits at valid-path parity (~1-1.7ms vs ~1ms valid; surgery additionally applies wherever its container shapes allow). Gates: multi-doc 60 + contract 9/9, incremental-grammars 672/672, incremental-verify 136, recovery pins 9/9, check 33/33, corpus parity 401/401. --- src/emit-parser.ts | 68 ++++++++++++++++++++++++++++++++++++---------- 1 file changed, 54 insertions(+), 14 deletions(-) diff --git a/src/emit-parser.ts b/src/emit-parser.ts index 30e251a..2ec1df5 100644 --- a/src/emit-parser.ts +++ b/src/emit-parser.ts @@ -3077,11 +3077,25 @@ function rowKCof(id) { } function trySurgery(dmgA, dmgB, tokD, chrD) { if (adoptRoot < 0) return -1; - // a recovery-made tree cannot take a strict splice: kept siblings would carry - // $error/$missing rows into a "successful" strict pass, freezing the OLD text's - // recovery shape instead of re-deriving it for the new text (rowRM reaches the - // root structurally, so this is the exact tree-wide test) - if (rowRM[adoptRoot] !== 0 || rowRule[adoptRoot] >= RID_ERROR) return -1; + if (rowRule[adoptRoot] >= RID_ERROR) return -1; + // A recovery-made tree (rowRM root) CAN take a strict splice when the edit + // provably commutes with every recovery decision: decisions are position-pure + // functions of (window text, window bars), so if no bar window touches the + // damage or the re-parsed span (second check after the re-parse, when the span's + // probe reach is known), no decision changes - kept rows replay identically at + // shifted positions, and a fresh recovering parse behaves strictly across the + // span, exactly like the strict re-parse below (its first possible fire inside + // the span would need a bar at/below the probe reach + 2). Bars adjacent to the + // damage are unmappable across the token delta; free-fire trees (lastBars null) + // are not window-pure - both refuse. + const recTree = rowRM[adoptRoot] !== 0; + if (recTree) { + if (lastBars === null) return -1; + for (let i = 0; i < lastBars.length; i++) { + const b = lastBars[i]; + if (b + 2 >= dmgA && b <= dmgB + 2) return -1; + } + } // the whole-file token math must close, or the shape changed beyond a splice if (adoptRootTok + rowTokLen[adoptRoot] + tokD !== tokN) return -1; // 1. descend along single-affected-row kids, recording the path @@ -3143,6 +3157,10 @@ function trySurgery(dmgA, dmgB, tokD, chrD) { if (L < 0) return -1; const D = surgX[L], Dbase = surgBase[L], Da = surgA[L]; const Db = surgB[L]; + // recovered trees use the length += chrD update below, which needs the node's + // char base unchanged; at Dbase >= dmgA the base token was re-lexed and its + // start may have moved + if (recTree && Dbase >= dmgA) return -1; const elem = SURG_ELEM[rowRule[D]]; const csD = rowStart[D], nD = rowCount[D]; const DendNew = Dbase + rowTokLen[D] + tokD; @@ -3151,6 +3169,7 @@ function trySurgery(dmgA, dmgB, tokD, chrD) { pos = Da < Db ? Dbase + (kids[csD + Da] < 0 ? (~kids[csD + Da]) >>> 2 : ktr(D, csD + Da)) : dmgA; + const s0 = pos; maxPos = pos; frameMax = pos; scn = 0; parseLimit = -1; cap = tokN; currentPrattContext = null; suppressNext = null; suppressCur = null; const genAt = memoGenCur; @@ -3176,6 +3195,15 @@ function trySurgery(dmgA, dmgB, tokD, chrD) { if (!fn()) return -1; if (memoGenCur !== genAt || pos === pp) return -1; } + if (recTree) { + // the strict re-parse stands for the fresh recovering parse of this span only + // if no bar window touches anything it read (probes included) + for (let i = 0; i < lastBars.length; i++) { + const b = lastBars[i]; + const bn = b < dmgA ? b : b + tokD; + if (bn + 2 >= s0 && bn <= maxPos + 2) return -1; + } + } // 4. POINT OF NO RETURN — splice D's kid range, shift suffix rels, patch the path const f = scn; const removed = j - Da; @@ -3269,14 +3297,20 @@ function trySurgery(dmgA, dmgB, tokD, chrD) { } } rowNF[D] = bnd; + // A node whose token end lies strictly beyond the damage keeps its char end + // shape: every end-determining coordinate (last real token, or a trailing + // zero-width $missing kid's anchor - finishNode takes the LAST KID's end, which + // a zero-width row can push past the last real token) sits in the suffix and + // shifts by exactly chrD. Only a node ENDING at/inside the damage derives its + // length from the token columns: a pure-trivia edit can sit at a node's token + // BOUNDARY (between its last token and the next sibling's first), token-inside + // but char-outside - the gap belongs to no node, and tend/toff give the exact + // new span. No zero-width kid can end such a node: zero-width rows live at + // bars, and bars adjacent to the damage were refused above. + const keepEndD = Dbase + rowTokLen[D] > dmgB; rowTokLen[D] += tokD; - // Derive the char length from the token columns rather than adding chrD: a pure- - // trivia edit can sit at a node's token BOUNDARY (between its last token and the - // next sibling's first), token-inside but char-outside — the gap belongs to no - // node. tend/toff give the exact new span; when suffix tokens exist inside the - // node the delta equals chrD (so the suffix-kid rel adds and the end-relative - // bias-cancel stay consistent), and when they don't there are no suffix kids. - if (rowTokLen[D] > 0) rowLen[D] = tend(Dbase + rowTokLen[D] - 1) - toff(Dbase); + if (keepEndD) rowLen[D] += chrD; + else if (rowTokLen[D] > 0) rowLen[D] = tend(Dbase + rowTokLen[D] - 1) - toff(Dbase); { let x = rowExt[D] + (tokD > 0 ? tokD : 0); const fw = maxPos - Dbase; @@ -3350,8 +3384,10 @@ function trySurgery(dmgA, dmgB, tokD, chrD) { // (end-relative kids past the boundary auto-shift via the length update below) } } + const keepEndA = surgBase[i] + rowTokLen[Ai] > dmgB; // see rowLen[D] above rowTokLen[Ai] += tokD; - if (rowTokLen[Ai] > 0) rowLen[Ai] = tend(surgBase[i] + rowTokLen[Ai] - 1) - toff(surgBase[i]); + if (keepEndA) rowLen[Ai] += chrD; + else if (rowTokLen[Ai] > 0) rowLen[Ai] = tend(surgBase[i] + rowTokLen[Ai] - 1) - toff(surgBase[i]); { let x = rowExt[Ai] + (tokD > 0 ? tokD : 0); const cw = ktr(Ai, csA + ki) + rowExt[surgX[i + 1]]; @@ -3782,7 +3818,11 @@ ${e.soa ? String.raw` // ── M1: WINDOWED re-lex ── rootTokBase = adoptRootTok; lastRoot = sroot; lastRootTok = adoptRootTok; - lastBars = []; + // the spliced tree keeps its bar list (surgery proved the edit clear of every + // bar window) - suffix bars ride the token delta like everything else + if (lastBars !== null) { + for (let i = 0; i < lastBars.length; i++) if (lastBars[i] >= dOldEnd) lastBars[i] += tokenDelta; + } shiftDiags(cs, ceOld, charDelta); return sroot; } From 668f8f51fe9c7e5932d961b0da39b6acd5eccd24 Mon Sep 17 00:00:00 2001 From: Johnson Chu Date: Fri, 12 Jun 2026 03:12:12 +0800 Subject: [PATCH 11/23] Diagnostics: viable-set messages + paired-opener related info Two grammar-derived enrichments of the $missing diagnostics, both resolved at settle from the tree (zero parse-time cost, adoption/replay-safe): - PAIR_OPEN: for each literal C, intersect - across every seq occurrence of C with preceding literals in its sequencing scope (groups inlined; quantifier/alt contents inherit a copy of the scope's accumulator, since they physically follow its earlier literals; nothing leaks back) - the sets of those preceding literals. A unique survivor is C's structural opener: ')' keeps '(' through if/while/call alike, interior separators intersect away, and ','/':'/'(' themselves die as ambiguous. The closer's diagnostic then carries related info pointing at the matched opener leaf found among its earlier siblings ("expected ')'" / "to match this '('"), with keyword pairs like 'while'<-'do' falling out for free. shiftDiags shifts the related anchor on its own coordinates (it can sit on the other side of the damage from its diagnostic - the surgery path caught this). - Viable-set messages: for a required literal C in a seq, the literals PROVABLY still accepted when C's matcher fails - repetitions before C are always re-enterable so their nullable-prefix-reachable literals stay viable; nullable one-shot items are crossed but contribute nothing (they may already have consumed). "expected ',' or ']'" therefore never names an impossible continuation, unlike a static FIRST union (after `[1, 2` an expression is not viable) - and unlike tsc, which under-reports the same position as "')' expected". Registered per call site during emission and threaded through the literal matchers into the $missing row (rowStart bits 21+; the row is zero-kid, the slot is free), decoded at settle. cst.errors entries gain an optional related: {offset, end, message} field. Pins re-pinned (11/11, exact); gates: incremental-grammars 672/672, incremental-verify 136, multi-doc 60, check 33/33, corpus parity 401/401, perf-bench unchanged. --- src/emit-parser.ts | 184 ++++++++++++++++++++++++++++++++++++++++----- test/recovery.ts | 18 +++-- 2 files changed, 179 insertions(+), 23 deletions(-) diff --git a/src/emit-parser.ts b/src/emit-parser.ts index 2ec1df5..d46c822 100644 --- a/src/emit-parser.ts +++ b/src/emit-parser.ts @@ -853,7 +853,9 @@ class Emitter { const a = this.a; switch (expr.type) { case 'literal': { - return `if (!${this.matchLiteralCall(expr.value)}) { ${onFail} }`; + const vs = this.vsetNext; + this.vsetNext = 0; + return `if (!${this.matchLiteralCall(expr.value, vs)}) { ${onFail} }`; } case 'ref': { if (a.tokenNames.has(expr.name)) { @@ -883,6 +885,7 @@ class Emitter { const nx = expr.items[i + 1]; this.quantFollowT = nx !== undefined && nx.type === 'literal' ? this.litT(nx.value) : -1; } + if (item.type === 'literal') this.vsetNext = this.vsetFor(expr.items, i); parts.push(this.matchInto(item, onFail)); this.quantFollowT = -1; } @@ -946,6 +949,51 @@ class Emitter { // uses `return`/`break` only against ITS OWN while — no nested-loop hazard. private quantFollowT = -1; litT(value: string): number { return -1; } // bound by emitParser to the punct-literal table + + // ── Viable-set companions (diagnostics) ── + // For a REQUIRED literal C in a seq, the literals PROVABLY still accepted when + // C's matcher fails: walking backward from C, a repetition ('*'/'+') is always + // re-enterable so its nullable-prefix-reachable literals stay viable; nullable + // one-shot items ('?' optionals, nullable groups, sep, zero-width markers) are + // crossed but contribute nothing (they may already have consumed their match); + // the first non-nullable item stops the walk. "expected ',' or ']'" therefore + // never names an impossible continuation — unlike a static FIRST union, which + // after `[1, 2` would still claim an expression. Each distinct message gets one + // id, threaded through the matcher into the $missing row (settle decodes it). + private vsetNext = 0; + vsetMsgs: string[] = ['']; + private vsetIds = new Map(); + private nullPrefixLits(x: RuleExpr, acc: Set): boolean { // → nullable (crossable)? + switch (x.type) { + case 'literal': acc.add(x.value); return false; + case 'seq': { for (const it of x.items) if (!this.nullPrefixLits(it, acc)) return false; return true; } + case 'group': return this.nullPrefixLits(x.body, acc); + case 'quantifier': { this.nullPrefixLits(x.body, acc); return x.kind !== '+'; } + case 'alt': { let all = true; for (const it of x.items) if (!this.nullPrefixLits(it, acc)) all = false; return all; } + case 'ref': return false; // conservative: treat rules as non-nullable + case 'sep': return true; + default: return true; // zero-width markers / Pratt position markers + } + } + private vsetFor(items: RuleExpr[], k: number): number { + const item = items[k]; + if (item.type !== 'literal') return 0; + const comp = new Set(); + for (let j = k - 1; j >= 0; j--) { + const pj = items[j]; + if (pj.type === 'op' || pj.type === 'prefix' || pj.type === 'postfix') continue; + if (pj.type === 'quantifier' && pj.kind !== '?') { this.nullPrefixLits(pj.body, comp); continue; } + if (pj.type === 'quantifier' || pj.type === 'sep' || pj.type === 'not' || pj.type === 'sameLine' || pj.type === 'noCommentBefore') continue; + if (pj.type === 'group' && this.nullPrefixLits(pj.body, new Set())) continue; + break; + } + comp.delete(item.value); + if (comp.size === 0) return 0; + const msg = [...comp, item.value].map(v => "'" + v + "'").join(' or '); + let id = this.vsetIds.get(msg); + if (id === undefined) { id = this.vsetMsgs.length; this.vsetMsgs.push(msg); this.vsetIds.set(msg, id); } + return id; + } private matchQuantifierInto(body: RuleExpr, kind: '*' | '+' | '?', onFail: string, closerT = -1): string { const fn = this.matchFn(body); if (kind === '?') { @@ -1276,10 +1324,13 @@ class Emitter { // ── Lever 1 emit helpers ── // Specialized literal matcher call: keyword → matchKwLit, punct → matchPuLit, each // with the value's baked int (so the runtime does int compares, not string work). - matchLiteralCall(value: string): string { + // vs > 0 = this call site's viable-set id (companion literals provably still + // accepted when the match fails — threaded into the synthesized $missing row). + matchLiteralCall(value: string, vs = 0): string { const d = this.a.symtab.classifyKey(value); - if (d.kind === 'kw') return `matchKwLit(${d.t})`; - if (d.kind === 'punct') return value === '>' ? `matchPuLitGT(${d.t})` : `matchPuLit(${d.t})`; + const va = vs > 0 ? `, ${vs}` : ''; + if (d.kind === 'kw') return `matchKwLit(${d.t}${va})`; + if (d.kind === 'punct') return value === '>' ? `matchPuLitGT(${d.t}${va})` : `matchPuLit(${d.t}${va})`; // A literal key that classifies as a token-name (a token name used as a literal): // unreachable for real grammars, but stay safe via the generic matchLiteral. return `matchLiteral(${J(value)})`; @@ -1819,10 +1870,11 @@ function offset() { // Keyword literal: the interpreter required tok.type !== '' && tokenNames.has(tok.type) // && tok.text === value. With interned kinds that is tok.k >= K_NAMED_MIN (a declared // token name; '' is PUNCT, templates are below NAMED_MIN) && tok.t === KW(value). -function matchKwLit(kw) { +function matchKwLit(kw, vs) { // A kw-range t can only come from a named token (template spans never intern to a // keyword), so the old k >= K_NAMED_MIN guard was redundant — one int compare. - if (pos >= cap || tkT[pos] !== kw) return recovering ? missTok(kw) : false; + // vs (optional) = the call site's viable-set id, threaded into the $missing row. + if (pos >= cap || tkT[pos] !== kw) return recovering ? missTok(kw, vs) : false; scPush(~((pos << 2) | 1)); if (++pos > frameMax) { frameMax = pos; if (pos > maxPos) maxPos = pos; } return true; @@ -1830,15 +1882,15 @@ function matchKwLit(kw) { // Punct literal: tok.type === '' && tok.text === value, with the gt-splice fallback. // tok.t === PU(value) is the exact-text fast path; the splice handles a longer // gt-led token matching the gt key. value/pu are baked by the caller. -function matchPuLit(pu) { +function matchPuLit(pu, vs) { // A pu-range t can only come from a punct token, so the old k === K_PUNCT guard was // redundant — one int compare. The '>'-split lives only in matchPuLitGT ('>' sites). - if (pos >= cap || tkT[pos] !== pu) return recovering ? missTok(pu) : false; + if (pos >= cap || tkT[pos] !== pu) return recovering ? missTok(pu, vs) : false; scPush(~(pos << 2)); if (++pos > frameMax) { frameMax = pos; if (pos > maxPos) maxPos = pos; } return true; } -function matchPuLitGT(pu) { +function matchPuLitGT(pu, vs) { if (pos >= cap) return false; const off = toff(pos); if (tkT[pos] === pu) { @@ -1893,7 +1945,7 @@ function matchPuLitGT(pu) { if (++pos > frameMax) { frameMax = pos; if (pos > maxPos) maxPos = pos; } return true; } - return recovering ? missTok(pu) : false; + return recovering ? missTok(pu, vs) : false; } // Generic matchLiteral kept for any unspecialized site: classify value via the baked // tables (no per-call isKeywordLiteral / string compares) and delegate. @@ -2003,6 +2055,53 @@ function emitRuleFns(e: Emitter, a: ReturnType) { }); e.emit(`const SURG_ELEM = new Int32Array([${surg.join(',')}]);`); e.emit(`const RULE_FN_BY_ID = [${a.grammar.rules.map(r => ruleFn(r.name)).join(', ')}];`); + { + // Paired-opener table for diagnostics: for each literal C, intersect — across + // every seq occurrence of C that has preceding literals in its sequencing scope + // (transparent groups inlined; quantifier/alt/not bodies are separate scopes) — + // the SETS of those preceding literals. A unique survivor is C's structural + // opener: ')' keeps '(' through if/while/call alike (interior separators like + // the index signature's ':' vary per shape and intersect away), while ','/':' + // themselves intersect to nothing. No bracket list is hardcoded. Used to attach + // "to match this 'x'" related info to "expected 'C'" $missing diagnostics; the + // sibling scan at collect time self-guards (no opener leaf in the row, no info). + const tOfLit = (txt: string) => (isKeywordLiteral(txt) ? a.symtab.kwLitKind.get(txt) : a.symtab.puLitKind.get(txt)) ?? 0; + const inter = new Map(); // closer t → intersection, nearest-last order + const walk = (x: RuleExpr, acc: number[] | null): void => { + switch (x.type) { + case 'seq': { const sc = acc ?? []; for (const it of x.items) walk(it, sc); return; } + case 'group': walk(x.body, acc); return; + case 'literal': { + const c = tOfLit(x.value); + if (c <= 0) return; + if (acc !== null && acc.length > 0) { + const prev = inter.get(c); + if (prev === undefined) inter.set(c, acc.filter(o => o !== c)); + else inter.set(c, prev.filter(o => acc.includes(o))); + } + if (acc !== null) acc.push(c); + return; + } + // quantifier/alt contents physically FOLLOW the scope's earlier literals + // (an arm of `seq('[', alt(...), ']')` sits after the '['), so they inherit + // a COPY of the accumulator; nothing leaks back out (which arm matched, or + // whether the quantifier matched at all, is unknowable statically). + case 'quantifier': walk(x.body, acc === null ? null : [...acc]); return; + case 'alt': for (const it of x.items) walk(it, acc === null ? null : [...acc]); return; + case 'not': return; + default: return; // refs / zero-width markers neither pair nor reset + } + }; + for (const rule of a.grammar.rules) walk(rule.body, null); + const n = a.symtab.kwLitKind.size + a.symtab.puLitKind.size + 1; + const arr = new Array(n).fill(0); + for (const [c, set] of inter) if (set.length === 1) arr[c] = set[0]; + e.emit(`const PAIR_OPEN = new Int32Array([${arr.join(',')}]);`); + } + // Viable-set messages, registered per CALL SITE during the rule emission above + // (see vsetFor): id → " or "-joined alternatives, decoded from the $missing + // row's packed rowStart at settle. + e.emit(`const VSETS = ${J(e.vsetMsgs)};`); } // Non-recursive rule: longest-match over alts (mirrors parseNonRec). A better arm is @@ -2868,11 +2967,14 @@ function missAt(p2) { } return false; } -function missTok(t) { +function missTok(t, vs) { if (probing !== 0 || pos <= probeBase || recoverFree || !missAt(pos)) return false; const id = finishNode(RID_MISSING, scn); - rowStart[id] = t; // expected identity: >0 literal int, <0 named token kind, - // >= RULE_MISS_BASE a missing NONTERMINAL (rid offset). + rowStart[id] = vs ? t | (vs << 21) : t; + // expected identity: >0 literal int, <0 named token kind, + // >= RULE_MISS_BASE a missing NONTERMINAL (rid offset); + // bits 21+ carry the call site's viable-set id when the + // grammar proves companion literals still accepted here. // A zero-kid row never dereferences its kids base, so the // slot is free storage. scPush(id); @@ -2896,10 +2998,24 @@ function missRule(rid) { // Collect every $error row in the FINAL tree by descending only the recovery-made // spine (rowRM propagates structurally at finishNode): O(error paths), no global // walk, no per-candidate bookkeeping — losing-arm rows are simply unreachable. +// Decode a $missing row's packed expected identity (see missTok): bits 21+ carry +// the call site's viable-set id; bit 20 marks a missing nonterminal; else a plain +// literal int (>0) or a named token kind (<0). +function missLit(v) { + if (v >= 1 << 21) return v & 0xFFFFF; + return v > 0 && v < RULE_MISS_BASE ? v : 0; +} +function missEntry(v, kb) { + let message; + if (v >= 1 << 21) message = 'expected ' + VSETS[v >>> 21]; + else if (v >= RULE_MISS_BASE) message = 'expected ' + RULE_NAMES[v - RULE_MISS_BASE]; + else if (v > 0) message = "expected '" + LIT_NAMES[v] + "'"; + else message = "expected '" + (K_NAMES[-v] ?? '?') + "'"; + return { offset: kb, end: kb, message }; +} function collectErrRows(id, charBase, tokBase) { if (rowRule[id] === RID_MISSING) { - const t = rowStart[id]; - docPar.push({ offset: charBase, end: charBase, message: t >= RULE_MISS_BASE ? 'expected ' + RULE_NAMES[t - RULE_MISS_BASE] : "expected '" + (t > 0 ? LIT_NAMES[t] : (K_NAMES[-t] ?? '?')) + "'" }); + docPar.push(missEntry(rowStart[id], charBase)); return; } if (rowRule[id] === RID_ERROR) { @@ -2921,6 +3037,30 @@ function collectErrRows(id, charBase, tokBase) { for (let i = 0; i < n; i++) { const e = kids[cs + i]; if (e >= 0 && (rowRM[e] !== 0 || rowRule[e] >= RID_ERROR)) { + if (rowRule[e] === RID_MISSING) { + // a missing CLOSER names its matched opener (tsc's "to match this '('"): + // PAIR_OPEN holds the grammar-derived structural pair, and the opener leaf + // — if the construct really matched one — sits among the earlier siblings + const entry = missEntry(rowStart[e], charBase + kcr(id, cs + i)); + // a missing CLOSER names its matched opener (tsc's "to match this '('"): + // PAIR_OPEN holds the grammar-derived structural pair, and the opener leaf + // — if the construct really matched one — sits among the earlier siblings + const lt = missLit(rowStart[e]); + if (lt > 0 && PAIR_OPEN[lt] !== 0) { + for (let j = i - 1; j >= 0; j--) { + const ee = kids[cs + j]; + if (ee < 0) { + const tk = tokBase + ((~ee) >>> 2); + if (tkT[tk] === PAIR_OPEN[lt]) { + entry.related = { offset: toff(tk), end: tend(tk), message: "to match this '" + LIT_NAMES[PAIR_OPEN[lt]] + "'" }; + break; + } + } + } + } + docPar.push(entry); + continue; + } collectErrRows(e, charBase + kcr(id, cs + i), tokBase + ktr(id, cs + i)); } } @@ -3528,8 +3668,18 @@ function shiftDiags(a, b, delta) { let w = 0; for (let i = 0; i < docPar.length; i++) { const g = docPar[i]; - if (g.end <= a) docPar[w++] = g; - else if (g.offset >= b) { g.offset += delta; g.end += delta; docPar[w++] = g; } + if (g.end <= a) { /* kept as is */ } + else if (g.offset >= b) { g.offset += delta; g.end += delta; } + else continue; + // the related anchor (the matched opener) shifts on its own coordinates — it + // can sit on the other side of the damage from its diagnostic + const r = g.related; + if (r !== undefined) { + if (r.end <= a) { /* kept */ } + else if (r.offset >= b) { r.offset += delta; r.end += delta; } + else g.related = undefined; // its token was edited: stale + } + docPar[w++] = g; } docPar.length = w; rebuildDiagView(); diff --git a/test/recovery.ts b/test/recovery.ts index 22fe8ef..d28d074 100644 --- a/test/recovery.ts +++ b/test/recovery.ts @@ -20,7 +20,7 @@ const grammar = (await import('../typescript.ts')).default; const emPath = '/tmp/emitted-recovery.mjs'; writeFileSync(emPath, emitParser(grammar)); type Edit = { start: number; end: number; text: string }; -type Diag = { offset: number; end: number; message: string }; +type Diag = { offset: number; end: number; message: string; related?: { offset: number; end: number; message: string } }; type Cst = { root: number; errors: Diag[] }; type Parser = { parse(s: string): Cst; edit(cst: Cst, edits: Edit[]): void; visit(cst: Cst, fns: object): void; tree: import('./emitted-obj.ts').TreeView }; type Em = { @@ -122,9 +122,14 @@ let typedOk = 0; // an $error absorbing the rest). Exact-match pins — quality must not regress to // absorption silently. const SYNTH: Array<[string, string[]]> = [ - ['const x = f(1, 2;', ["16:expected ')'"]], - ['function g() { return 1;', ["24:expected '}'"]], - ['if (x { y(); }', ["6:expected ')'"]], + // viable-set messages: every listed literal is PROVABLY still accepted at the + // position (trailing comma is legal, so ',' joins ')' — tsc's single "')' + // expected" under-reports); the related info names the matched opener + ['const x = f(1, 2;', ["16:expected ')' @11:to match this '('"]], + ['function g() { return 1;', ["24:expected '}' @13:to match this '{'"]], + ['if (x { y(); }', ["6:expected ',' or ')' @3:to match this '('"]], + ['const y = [1, ;', ["14:expected ',' or ']' @10:to match this '['"]], + ['const t = obj[i;', ["15:expected ']' @13:to match this '['"]], // missing NONTERMINALS (the tsc "Expression expected" analog): required rule // refs failing inside the bar window mint a zero-width $missing carrying the // rule identity — committed optionals ('= Expr' after the real '='), operator @@ -134,12 +139,13 @@ const SYNTH: Array<[string, string[]]> = [ ['const a = -;', ['11:expected Expr']], ['x ? y : ;', ['8:expected Expr']], ['a, ;', ['3:expected Expr']], - ["f(1, ;", ["5:expected Expr", "5:expected ')'"]], + ["f(1, ;", ["5:expected Expr", "5:expected ')' @1:to match this '('"]], ]; let synthN = 0; for (const [text, want] of SYNTH) { const c = p.parse(text); - const got = c.errors.map((g) => g.offset + ':' + g.message); + const got = c.errors.map((g) => g.offset + ':' + g.message + + (g.related ? ` @${g.related.offset}:${g.related.message}` : '')); if (JSON.stringify(got) !== JSON.stringify(want)) { bad(`synthesis on «${text}»: got ${JSON.stringify(got)}, want ${JSON.stringify(want)}`); continue; From 2c6e59391f41a6c86e728076ef7fe00d430e973c Mon Sep 17 00:00:00 2001 From: Johnson Chu Date: Fri, 12 Jun 2026 03:20:43 +0800 Subject: [PATCH 12/23] Head-to-head bench: Monogram vs tsc updateSourceFile vs tree-sitter test/head-to-head.ts runs one 9MB TypeScript document through identical single-character edit scripts (warm valid keystrokes, a paren-deleting breaking edit, while-broken typing, the fixing edit) on all three engines, with positions recomputed from the current text so every engine sees byte-identical edits and timers wrapping only the engine call. tsc runs setParentNodes=false; node-tree-sitter caps input strings at 32767 chars, so it reads through its 16KB chunk-callback path. Results (node v24, Apple silicon): Monogram beats tsc on every phase (fresh 177 vs 212ms, valid keystroke 0.37 vs 37ms, while-broken 0.21 vs 13.6ms, fixing 1.0 vs 14.1ms) and beats or matches tree-sitter on fresh (177 vs 458ms) and while-broken typing; tree-sitter wins the two transition edits (0.26 vs 13ms breaking), where the strict-first architecture pays one adoption-assisted strict pass to prove rejection before recovering. Numbers + the two byte-identity guarantees added to the README under 'How it measures up'. --- README.md | 17 ++++++ test/head-to-head.ts | 125 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 142 insertions(+) create mode 100644 test/head-to-head.ts diff --git a/README.md b/README.md index 89e7ab8..d97e371 100644 --- a/README.md +++ b/README.md @@ -227,6 +227,23 @@ The **only-Monogram** wins above are all disambiguations that are *TextMate-expr "TextMate can't express X" is not a guess or an assertion; it is a claim to be **proven from the model**. TextMate is a line-oriented matcher whose only cross-line memory is a finite stack of scope contexts, so a proof exhibits an X whose correct highlighting provably needs memory that model lacks — unbounded lookback to a token that is not an enclosing context. A failed *attempt* to derive a pattern is not such a proof: a cleverer pattern may exist, and most "impossible for TextMate" folklore is exactly this error — the multiline / nested-generic cases turn out TM-expressible once a parser supplies the pattern, which is why the derived grammar gets them right. Where a construct provably exceeds the model, Monogram's **tree-sitter** target — a real parser over the whole tree — resolves it. +### Total parsing under edits — measured against tsc and tree-sitter + +The handle API (`createParser()`) is **total**: every text yields a tree plus `cst.errors`, with tsc-grade diagnostics (`expected ',' or ']'` where every listed token is *provably* still accepted at that position, `to match this '('` related info, zero-width `$missing` nodes that keep a call's shape when its `)` is missing). Two structural guarantees back it: + +- **The valid path is byte-identical to the strict parser** — recovery runs only after a strict pass has rejected, so error tolerance costs valid input nothing, by construction. +- **Every edited re-parse is byte-identical to a fresh parse** of the same text — tree *and* errors, broken states included, held exact by generative edit scripts across all seven grammars in CI (`test/incremental-grammars.ts`). + +One 9 MB TypeScript document, identical single-character edit scripts (`test/head-to-head.ts`, node v24, Apple silicon; ✎ = per keystroke, median): + +| engine | fresh parse | valid ✎ | breaking ✎ | while-broken ✎ | fixing ✎ | +|---|---:|---:|---:|---:|---:| +| **Monogram** | **177 ms** | 0.37 ms | 13.0 ms | **0.21 ms** | 1.0 ms | +| tsc `updateSourceFile` | 212 ms | 37 ms | 13.3 ms | 13.6 ms | 14.1 ms | +| tree-sitter (official) | 458 ms | **0.20 ms** | **0.26 ms** | 0.31 ms | **0.20 ms** | + +Monogram beats tsc on every phase (valid typing ~100×, while-broken ~60×) and beats or matches tree-sitter everywhere except the two **transition** edits (break/fix), where the strict-first architecture pays one adoption-assisted strict pass to *prove* the text rejects before recovering — the price of the byte-identity guarantees above, and the open lever. + ## What you get From one grammar definition (a small TypeScript combinator API), five outputs are **fully functional**: diff --git a/test/head-to-head.ts b/test/head-to-head.ts new file mode 100644 index 0000000..4613e67 --- /dev/null +++ b/test/head-to-head.ts @@ -0,0 +1,125 @@ +// Head-to-head bench: Monogram vs tsc (ts.updateSourceFile) vs official +// tree-sitter-typescript, on one large TypeScript document under the same +// single-character edit script: warm valid keystrokes, a paren-deleting +// BREAKING edit, while-broken typing, and the FIXING edit. +// +// Reproduce: +// git -C /tmp clone --depth 1 https://github.com/microsoft/TypeScript ts-repo # corpus file +// mkdir -p /tmp/tsbench && npm install --prefix /tmp/tsbench tree-sitter tree-sitter-typescript +// node test/head-to-head.ts +// +// Notes on fairness: every engine receives byte-identical edit sequences with +// positions recomputed from the current text; timers wrap ONLY the engine call +// (tree-sitter's line/col points are precomputed outside). tsc runs with +// setParentNodes=false; node-tree-sitter caps any input string at 32767 chars, +// so it reads through a 16KB chunk callback (its documented large-input path). +import { readFileSync } from 'node:fs'; +import { createRequire } from 'node:module'; +import { emitParser } from '../src/emit-parser.ts'; +import { writeFileSync } from 'node:fs'; +import ts from 'typescript'; + +const require = createRequire(import.meta.url); +const TS_BENCH = process.env.TSBENCH_DIR ?? '/tmp/tsbench'; +const CORPUS = process.env.H2H_FILE ?? '/tmp/ts-repo/tests/cases/unittests/matchFiles.ts'; +const TreeSitter = require(TS_BENCH + '/node_modules/tree-sitter'); +const TSLang = require(TS_BENCH + '/node_modules/tree-sitter-typescript').typescript; + +const grammar = (await import('../typescript.ts')).default; +const emPath = '/tmp/emitted-h2h.mjs'; +writeFileSync(emPath, emitParser(grammar)); +const { createParser } = await import(emPath + '?v=' + process.pid); + +const unit = readFileSync(CORPUS, 'utf-8'); +const BASE = unit.repeat(Math.ceil(9 * 1024 * 1024 / unit.length)); +console.log(`doc: ${(BASE.length / 1024 / 1024).toFixed(2)} MB TypeScript (${CORPUS})`); + +function posOf(text: string, off: number) { + let row = 0, last = -1; + for (let i = 0; i < off; i++) if (text.charCodeAt(i) === 10) { row++; last = i; } + return { row, column: off - last - 1 }; +} +const med = (xs: number[]) => xs.slice().sort((a, b) => a - b)[xs.length >> 1]; + +type Engine = { fresh(text: string): void; edit(text: string, start: number, end: number, ins: string): number; errors(): number }; + +function runScript(eng: Engine) { + let txt = BASE; + let t0 = performance.now(); + eng.fresh(txt); + const fresh = performance.now() - t0; + if (eng.errors() > 0) throw new Error('base doc reports errors'); + const apply = (start: number, end: number, ins: string) => { + const dt = eng.edit(txt, start, end, ins); + txt = txt.slice(0, start) + ins + txt.slice(end); + return dt; + }; + const identAt = txt.indexOf(' expected', Math.floor(txt.length / 4)) + 1; + const valid: number[] = []; + for (let i = 0; i < 5; i++) valid.push(apply(identAt + i, identAt + i, 'x')); + if (eng.errors() > 0) throw new Error('valid keystrokes broke the doc'); + const parenAt = txt.indexOf(');', Math.floor(txt.length * 0.75)); + const breaking = apply(parenAt, parenAt + 1, ''); + const breakErrs = eng.errors(); + const broken: number[] = []; + for (let i = 0; i < 10; i++) broken.push(apply(parenAt + i, parenAt + i, 'z')); + apply(parenAt, parenAt + 10, ''); + const fixing = apply(parenAt, parenAt, ')'); + return { fresh, valid: med(valid), breaking, broken: med(broken), fixing, breakErrs, fixErrs: eng.errors() }; +} + +const engines: Record = { + monogram: (() => { + const p = createParser(); + let c: { errors: unknown[] }; + return { + fresh(text: string) { c = p.parse(text); }, + edit(_text: string, start: number, end: number, ins: string) { + const t0 = performance.now(); + p.edit(c, [{ start, end, text: ins }]); + return performance.now() - t0; + }, + errors() { return c.errors.length; }, + }; + })(), + tsc: (() => { + let sf: ts.SourceFile; + return { + fresh(text: string) { sf = ts.createSourceFile('t.ts', text, ts.ScriptTarget.Latest, false, ts.ScriptKind.TS); }, + edit(text: string, start: number, end: number, ins: string) { + const newText = text.slice(0, start) + ins + text.slice(end); + const t0 = performance.now(); + sf = ts.updateSourceFile(sf, newText, { span: { start, length: end - start }, newLength: ins.length }); + return performance.now() - t0; + }, + errors() { return (sf as unknown as { parseDiagnostics: unknown[] }).parseDiagnostics.length; }, + }; + })(), + treesitter: (() => { + const p = new TreeSitter(); + p.setLanguage(TSLang); + let tree: ReturnType; + const CHUNK = 16 * 1024; + const input = (text: string) => (index: number) => (index < text.length ? text.slice(index, index + CHUNK) : null); + return { + fresh(text: string) { tree = p.parse(input(text)); }, + edit(text: string, start: number, end: number, ins: string) { + const newText = text.slice(0, start) + ins + text.slice(end); + const sp = posOf(text, start), oep = posOf(text, end), nep = posOf(newText, start + ins.length); + const t0 = performance.now(); + tree.edit({ startIndex: start, oldEndIndex: end, newEndIndex: start + ins.length, startPosition: sp, oldEndPosition: oep, newEndPosition: nep }); + tree = p.parse(input(newText), tree); + return performance.now() - t0; + }, + errors() { return tree.rootNode.hasError ? 1 : 0; }, + }; + })(), +}; + +const fmt = (x: number) => x.toFixed(2).padStart(8); +console.log('engine | fresh | valid✎ | breaking✎ | broken✎ | fixing✎ | errs(break/fix)'); +for (const [name, eng] of Object.entries(engines)) { + const r = runScript(eng); + console.log(`${name.padEnd(11)} | ${fmt(r.fresh)} | ${fmt(r.valid)} | ${fmt(r.breaking)} | ${fmt(r.broken)} | ${fmt(r.fixing)} | ${r.breakErrs}/${r.fixErrs}`); +} +console.log('(ms; ✎ = per single-character edit, median; node ' + process.version + ')'); From 71e14a75069832d2a67447b2f8b839988b724b25 Mon Sep 17 00:00:00 2001 From: Johnson Chu Date: Fri, 12 Jun 2026 03:22:34 +0800 Subject: [PATCH 13/23] Error-recovery conformance metric: bidirectional agreement vs tsc test/recovery-conformance.ts: on every single-file conformance test tsc's PARSER rejects (parseDiagnostics non-empty - the live source of the .errors.txt syntax baselines, with semantic noise excluded by definition), compare Monogram's total-parse cst.errors bidirectionally at +/-8 chars: recall (tsc errors we also report): 530/951 = 55.73% precision (our errors tsc also reports): 580/702 = 82.62% first-error agreement: 203/355 = 57.18% files we accept but tsc rejects: 116 The sample divergences localize the gap classes: the accept side is dominated by tsc's context-parameter checks ([Await]/[Yield] parameter positions, reserved names in declaration slots) plus a few CFG-expressible shapes; the missed side is recovery-policy granularity (one absorbed region vs tsc's several pointed diagnostics). --- test/recovery-conformance.ts | 78 ++++++++++++++++++++++++++++++++++++ 1 file changed, 78 insertions(+) create mode 100644 test/recovery-conformance.ts diff --git a/test/recovery-conformance.ts b/test/recovery-conformance.ts new file mode 100644 index 0000000..8f1f28c --- /dev/null +++ b/test/recovery-conformance.ts @@ -0,0 +1,78 @@ +// Error-recovery conformance: on every single-file conformance test that tsc's +// PARSER rejects, compare Monogram's total-parse diagnostics against tsc's +// parseDiagnostics (the live source of the .errors.txt syntax baselines), +// BIDIRECTIONALLY: +// recall — tsc diagnostics with a Monogram diagnostic within ±SLACK chars +// precision — Monogram diagnostics with a tsc diagnostic within ±SLACK chars +// first — files where the FIRST error positions agree within ±SLACK +// Diagnostic positions are parser-policy choices (where to blame a missing +// token), so the slack absorbs token-boundary differences; the metric is about +// reporting the same BREAKAGES, not byte-equal spans. +// +// node --max-old-space-size=4096 test/recovery-conformance.ts +import { writeFileSync, readFileSync } from 'node:fs'; +import { readdir } from 'fs/promises'; +import { join } from 'path'; +import { emitParser } from '../src/emit-parser.ts'; +import ts from 'typescript'; + +const grammar = (await import('../typescript.ts')).default; +const emPath = '/tmp/emitted-recovery-conf.mjs'; +writeFileSync(emPath, emitParser(grammar)); +type Cst = { root: number; errors: { offset: number; end: number; message: string }[] }; +const em = (await import(emPath + '?v=' + process.pid)) as { createParser(): { parse(s: string): Cst } }; +const p = em.createParser(); + +const baseDir = '/tmp/ts-repo/tests/cases/conformance'; +const SLACK = 8; + +async function allTsFiles(dir: string): Promise { + const out: string[] = []; + for (const e of await readdir(dir, { withFileTypes: true })) { + const full = join(dir, e.name); + if (e.isDirectory()) out.push(...await allTsFiles(full)); + else if (e.name.endsWith('.ts') && !e.name.endsWith('.d.ts')) out.push(full); + } + return out; +} +const isMulti = (t: string) => /^\s*\/\/\s*@filename:/im.test(t); + +const files = (await allTsFiles(baseDir)).sort(); +let nFiles = 0, tTotal = 0, tHit = 0, mTotal = 0, mHit = 0, firstOK = 0, weSilent = 0, oracleCrash = 0; +const worst: { file: string; kind: string; at: number; msg: string }[] = []; + +for (const file of files) { + const code = readFileSync(file, 'utf-8'); + if (isMulti(code)) continue; + let sf; + try { + sf = ts.createSourceFile('t.ts', code, ts.ScriptTarget.Latest, false, ts.ScriptKind.TS); + } catch { oracleCrash++; continue; } + const tDiags = (sf as unknown as { parseDiagnostics: { start: number }[] }).parseDiagnostics; + if (tDiags.length === 0) continue; // parser-valid: the accept/CST gates own it + const T = [...new Set(tDiags.map(d => d.start ?? 0))].sort((a, b) => a - b); + const c = p.parse(code); + const M = [...new Set(c.errors.map(g => g.offset))].sort((a, b) => a - b); + nFiles++; + if (M.length === 0) { + weSilent++; + if (worst.length < 12) worst.push({ file: file.replace(baseDir + '/', ''), kind: 'WE-ACCEPT', at: T[0], msg: code.slice(Math.max(0, T[0] - 30), T[0] + 20).replace(/\n/g, '⏎') }); + } + const near = (xs: number[], x: number) => xs.some(y => Math.abs(y - x) <= SLACK); + tTotal += T.length; mTotal += M.length; + for (const t of T) if (near(M, t)) tHit++; else if (worst.length < 24 && M.length > 0) worst.push({ file: file.replace(baseDir + '/', ''), kind: 'MISSED', at: t, msg: code.slice(Math.max(0, t - 30), t + 20).replace(/\n/g, '⏎') }); + for (const m of M) if (near(T, m)) mHit++; + if (M.length > 0 && Math.abs(M[0] - T[0]) <= SLACK) firstOK++; +} + +const pct = (a: number, b: number) => b === 0 ? '—' : (100 * a / b).toFixed(2) + '%'; +console.log(`error-recovery conformance vs tsc parseDiagnostics (${baseDir}, slack ±${SLACK}):`); +console.log(` files tsc-parser-rejects (single-file): ${nFiles}${oracleCrash ? ` (+${oracleCrash} oracle crashes skipped)` : ''}`); +console.log(` recall (tsc errors we also report): ${tHit}/${tTotal} = ${pct(tHit, tTotal)}`); +console.log(` precision (our errors tsc also reports): ${mHit}/${mTotal} = ${pct(mHit, mTotal)}`); +console.log(` first-error agreement: ${firstOK}/${nFiles} = ${pct(firstOK, nFiles)}`); +console.log(` files we accept but tsc rejects: ${weSilent}`); +if (worst.length) { + console.log(`\n ===== sample divergences =====`); + for (const w of worst) console.log(` [${w.kind}] ${w.file} @${w.at} «${w.msg}»`); +} From f0d2c758bcf771360d82b645112e7f8bf15c62f9 Mon Sep 17 00:00:00 2001 From: Johnson Chu Date: Fri, 12 Jun 2026 03:31:45 +0800 Subject: [PATCH 14/23] Reject unterminated templates and colon-less case clauses Two syntactic over-accepts found by the diagnostics comparison against tsc: - parseTemplateExpr (both engines) treated a template HEAD as committing to nothing: on EOF or any non-middle/tail token after a substitution it closed the $template node and returned success, so 'let s = `tpl ${x;' parsed clean. A head now commits to the full chain - every substitution must hold an expression and every span must continue (middle) or close (tail); an unterminated template is a parse failure, not a shorter match. Also rejects empty substitutions ('`${}`'), matching tsc. - notReservedExpr gains 'case': the bare-identifier expression fallback accepted the reserved word, so 'switch (x) { case 1 y(); }' parsed as three statements through the switch body's Stmt arm (the flat many(SwitchCase) shape made the missing ':' invisible). A full accept/reject flip scan over the single-file conformance corpus shows exactly ONE flip: TemplateExpression1.ts (an intentionally-invalid error test tsc rejects) now correctly rejects - no valid file regressed. Error-recovery conformance recall 55.7% -> 59.1%; check 33/33, engine parity 401/401, all 7 generated outputs byte-identical. --- javascript.ts | 16 +++++++++------- src/emit-parser.ts | 9 ++++++--- src/gen-parser.ts | 16 +++++++++++----- 3 files changed, 26 insertions(+), 15 deletions(-) diff --git a/javascript.ts b/javascript.ts index d1b5b4f..6ad09e6 100644 --- a/javascript.ts +++ b/javascript.ts @@ -176,14 +176,16 @@ export const notReserved = not(alt( // `null`, …), and TS's own error-recovery tolerates several reserved words sliding into // the bare-identifier fallback inside otherwise-valid files (e.g. `export default …`, // undeclared `for (x in …)`, `class … extends (e)`, a decorator before `export`). The -// words below have NO such role: they are the prefix operators `void`/`typeof`/`delete` -// (which must take an operand) plus the `catch`/`throw` keywords and `enum`. Forbidding -// the bare-identifier fallback for exactly these rejects `catch(x){}` with no `try`, -// `void ;`/`typeof ;`/`delete ;` (operatorless prefix op), and `throw ;` — while leaving -// every valid expression (and TS's recovery cases) untouched. Verified: widening this -// set to other reserved words regresses valid code; these five are the FN-safe maximum. +// words below have NO such role: the prefix operators `void`/`typeof`/`delete` (which +// must take an operand), the `catch`/`throw` keywords, `enum`, and `case` (a bare +// `case` expression let `case 1 y();` inside a switch parse as three statements). +// Forbidding the bare-identifier fallback for exactly these rejects `catch(x){}` with +// no `try`, `void ;`/`typeof ;`/`delete ;` (operatorless prefix op), `throw ;`, and a +// colon-less `case` — while leaving every valid expression (and TS's recovery cases) +// untouched. Verified per the conformance matrix's FN=0 gate: widening this set to +// other reserved words regresses valid code; these are the FN-safe maximum. export const notReservedExpr = not(alt( - 'catch', 'delete', 'enum', 'throw', 'typeof', 'void', + 'case', 'catch', 'delete', 'enum', 'throw', 'typeof', 'void', )); // ── Precedence ladder (shared ECMAScript operator precedence) ── diff --git a/src/emit-parser.ts b/src/emit-parser.ts index d46c822..fe1eb4a 100644 --- a/src/emit-parser.ts +++ b/src/emit-parser.ts @@ -1979,12 +1979,15 @@ function parseTemplateExpr() { } if (k === K_TEMPLATE_HEAD) { const mark = scn; + const save = pos; scPush(~(pos << 2)); if (++pos > frameMax) { frameMax = pos; if (pos > maxPos) maxPos = pos; } const interpRule = currentPrattContext ?? EXPR_RULE; + // a head COMMITS to the full chain: every substitution must hold an + // expression and every span must continue (middle) or close (tail) — an + // unterminated template is a parse failure, not a shorter match while (true) { - RULES[interpRule](); - if (pos >= cap) break; + if (!RULES[interpRule]() || pos >= cap) { pos = save; scn = mark; return false; } const nk = tkK[pos]; if (nk === K_TEMPLATE_MIDDLE) { scPush(~(pos << 2)); @@ -1996,7 +1999,7 @@ function parseTemplateExpr() { if (++pos > frameMax) { frameMax = pos; if (pos > maxPos) maxPos = pos; } break; } - break; + pos = save; scn = mark; return false; } scPush(finishNode(RID_TEMPLATE, mark)); return true; diff --git a/src/gen-parser.ts b/src/gen-parser.ts index 4a2091f..1cd78a8 100644 --- a/src/gen-parser.ts +++ b/src/gen-parser.ts @@ -846,14 +846,19 @@ export function createParser(grammar: CstGrammar) { } if (tok.type === '$templateHead') { const children: CstChild[] = []; + const save = pos; if (++pos > maxPos) maxPos = pos; children.push({ tokenType: '$templateHead', offset: tok.offset, end: tok.offset + tok.text.length }); const interpRule = currentPrattContext ?? findExprRule(); + // a head COMMITS to the full chain: every substitution must hold an + // expression and every span must continue (middle) or close (tail) — an + // unterminated template is a parse failure, not a shorter match while (true) { const exprNode = parseRule(interpRule); - if (exprNode) children.push(exprNode); + if (!exprNode) { pos = save; return null; } + children.push(exprNode); const next = peek(); - if (!next) break; + if (!next) { pos = save; return null; } if (next.type === '$templateMiddle') { if (++pos > maxPos) maxPos = pos; children.push({ tokenType: '$templateMiddle', offset: next.offset, end: next.offset + next.text.length }); @@ -864,10 +869,11 @@ export function createParser(grammar: CstGrammar) { children.push({ tokenType: '$templateTail', offset: next.offset, end: next.offset + next.text.length }); break; } - break; + pos = save; + return null; } - const startOff = children.length > 0 ? childOffset(children[0]) : offset(); - const endOff = children.length > 0 ? childEnd(children[children.length - 1]) : offset(); + const startOff = childOffset(children[0]); + const endOff = childEnd(children[children.length - 1]); return { rule: '$template', children, offset: startOff, end: endOff }; } return null; From 25b78ba3c19a4d5406ba96e0b63c08c8b0695baa Mon Sep 17 00:00:00 2001 From: Johnson Chu Date: Fri, 12 Jun 2026 03:41:47 +0800 Subject: [PATCH 15/23] Formal write-up + bounded-exhaustive edit gate TOTAL-PARSING.md: the formal spine in one place - the totality contract, strict-first two-pass structure, the bar discipline with its determinism theorem (bars are a pure function of the token stream, forcing every ingredient to be adoption-invariant), position-pure recovery actions with commitment semantics, the three structural theorems the generative gates forced (zero-width = synthesis-only; same-position cycles and their taint refinement; exact adoption-invariant watermarks), the window-replay theorem with its three corollaries (recovering adoption, cross-attempt memo survival, recovering surgery) and the one known open caveat (row-level taint), the two lexer-resync soundness conditions, tree-derived diagnostics, and the measured head-to-head numbers. test/exhaustive-edits.ts (CI gate 34/34): over a small bracket-and-list grammar, EVERY document up to 4 chars over the grammar's alphabet x EVERY single-character edit (delete/replace/insert at every position) must parse byte-identically to fresh - tree and errors. Complete within its bound: ~330k steps (EXH_MAXLEN=5 runs the 3.2M-step deep version, also clean). The gate immediately earned its keep: it caught a one-case regression in the day-old surgery length update - a node whose BASE token sits at the damage start (leading trivia inserted at a node's very start) shifts base and end together, leaving the length alone, so rowLen += chrD was wrong exactly where the token derivation is right. keepEnd now also requires the base token to sit strictly before the damage. --- TOTAL-PARSING.md | 223 +++++++++++++++++++++++++++++++++++++++ src/emit-parser.ts | 8 +- test/check.ts | 1 + test/exhaustive-edits.ts | 74 +++++++++++++ 4 files changed, 304 insertions(+), 2 deletions(-) create mode 100644 TOTAL-PARSING.md create mode 100644 test/exhaustive-edits.ts diff --git a/TOTAL-PARSING.md b/TOTAL-PARSING.md new file mode 100644 index 0000000..228a9b5 --- /dev/null +++ b/TOTAL-PARSING.md @@ -0,0 +1,223 @@ +# Total parsing: the formal spine + +How the handle API (`createParser()`) parses *every* text into a tree plus +`cst.errors` while keeping two byte-identity guarantees no mainstream engine +makes, and why each piece is sound. The implementation lives in +`src/emit-parser.ts` (emitted runtime) and is held exact by the gates listed at +the end. + +## The contract + +For every input text and every edit sequence: + +1. **Totality** — `parse`/`edit` never throw on input. Every text yields a root + and a (possibly empty) `errors` list. Only API misuse throws. +2. **Strict-path identity** — a text the strict grammar accepts parses + byte-identically to the strict module-level parser, with `errors = []`. + Error tolerance costs valid input *nothing*, by construction (below), not by + testing. +3. **Edit/fresh identity** — after any edit, tree *and* errors are + byte-identical to a fresh parse of the same text — broken states included. + +## Two passes, strict first + +`parse`/`edit` run the **strict** parser first. Only when it rejects does the +text re-run with `recovering = true`. Guarantee 2 is therefore structural: the +valid path never executes a single recovery branch. The recovering run is where +everything below lives. + +## The bar discipline + +A naive "recover at any failure" breaks both identities: PEG longest-match +exploration *fails constantly* on valid arms, so an always-on recovery rescues +losing arms and perturbs valid shapes; and an incremental run that reuses old +rows explores *less* than a fresh run, so any failure-count-dependent decision +desynchronizes the two. + +Recovery instead fires only at positions a strict pass has *proven* to fail: + +- Each recovering **attempt** runs strictly except at an ordered list of + **bars** (token indices). A recovery action is allowed only inside a bar's + window (below). +- An attempt that fails *past* its bars aborts and appends a new bar at the + attempt's farthest-fail watermark (`maxPos`), monotonically increasing. +- Attempt k runs under the first k bars; the loop is capped (32), then degrades + to a deterministic free-fire pass (`recoverFree`) and, past even that, to a + zero-width `$error` root. Never a crash. + +**Determinism theorem.** The bar list is a pure function of the token stream: +bar k+1 is the strict-modulo-bars farthest-fail of a deterministic parse under +bars 1..k. Hence fresh and incremental recovering parses derive byte-identical +bar lists, which is the keystone of guarantee 3. This forces every ingredient +below to be *adoption-invariant*: nothing about reuse may change any watermark +or any fire decision. + +## Recovery actions, all position-pure + +Every action's fire condition is a pure function of `(position, bar list)` — +no counters, no budgets, no global parse state. (A budgeted design was tried +and failed exactly here: bar₂'s decisions depended on bar₁'s spending, which an +adopted region replays differently.) + +- **Skip absorption** — at a repetition whose element fails with + `recoverArmed(from, reach)` (∃ bar in `[from, reach]` with `reach ≤ bar+2`, + where `reach` is the *failing element's frame-local* probe watermark, not the + global one — a frontier parked on a far bar must not arm unrelated loops), + absorb tokens to the loop's FIRST set / threaded closer / EOF into an + `$error` row. Leaves keep text-tiling; the diagnostic quotes the first + absorbed token. +- **Missing-token synthesis** (`missTok`) — a *required* literal/token matcher + failing at `missAt(pos)` (∃ bar in `[pos, pos+2]`) materializes a zero-width + `$missing` row instead of failing: the construct completes (a call keeps its + Call shape with `)` marked missing) and the diagnostic reads `expected ')'`. +- **Missing-nonterminal synthesis** (`missRule`) — the same at a required rule + reference's fail exit: `expected Expr`. +- **Commitment semantics** — synthesis is suppressed inside *uncommitted* + probes: `not()` and separator probes (`probing`), and optional groups that + have not consumed past their entry (`probeBase`). Once an optional consumes a + real token it is committed and synthesizes like required content (`const a = + ;` synthesizes the initializer; a bare `const a` does not invent one). This + is tsc's required-only semantics, derived rather than hand-coded. + +## Three structural theorems the gates forced + +Each of these was surfaced as an `edit ≠ fresh` divergence by the generative +cross-grammar gate, then closed structurally — not patched per-case. + +**T1 — Zero-width success is a synthesis-only artifact.** A strict parser can +never succeed at width zero inside a loop (it would not terminate), so *every* +loop must discard zero-width elements: plain repetitions break on +`pos === before`, hooked repetitions discard and re-arm, left-recursion +continuations and Pratt LEDs refuse zero-width wraps. Without this, synthesis +inside a loop spins unboundedly. + +**T2 — Same-position re-entry is a real cycle class.** Zero-width synthesis +(and, under recovering, the opened dispatch guards) lets a rule re-enter +itself at the same position through paths no grammar check can rule out. +`recRunning` maps each in-flight `(rule, position)` frame to an entry serial; +re-entry fails with PEG cycle semantics. The refinement that matters for reuse: +a cycle refusal that leans on a frame entered *before* the current one makes +the current frame's result a function of its **ancestor stack**, not of the +text — such results are *tainted* (memo-stamped own-generation-only, taint +propagating to whoever reuses them). Internal cycles (both ends inside the +frame) replay from the window text alone and do not taint. + +**T3 — The bar protocol's inputs must be adoption-invariant.** Bar k+1 is +derived from a watermark, so watermarks must be *exact* and *reuse-stable*: +`frameMax` is a frame-local advance watermark (reset at rule entry, folded to +the parent at exit) that makes every stored extent the frame's true probe +reach; memo jumps and adoptions re-raise it to the stored extent, so a reused +subtree contributes the same watermark the parse that built it did. + +## The window-replay theorem + +Define a frame's **window** as `[start, start + ext + 2]` over token indices, +where `ext` is its exact probe extent (T3) and `+2` covers the stop-token and +SECOND-token dispatch reads. + +**Theorem.** Every recovery decision being position-pure, a frame's behavior — +result, probe extent, internal fires and synthesis included — is completely +determined by its window's *text* and its window's *bars*, modulo the +external-cycle dependence of T2. + +Corollaries, each carrying one optimization: + +- **Recovering adoption** (`barsWindowEq`): an old-tree row whose window sees + the same (shifted) bars the build run saw there replays identically — even + rows *containing* `$error`/`$missing` (an error region is exactly what stays + stable across far edits). Broken-state keystrokes go incremental. +- **Cross-attempt memo survival**: attempts within one sequence parse the same + stream under a monotonically growing bar list, so a memo entry whose window + is **bar-free** behaved strictly (no synthesis, no arming; opened dispatch + guards add only non-consuming probes) and is a pure function of window text — + valid in every later attempt. Tainted entries (T2) are excluded; this + exclusion is precisely what the first survival attempt missed and the gates + rejected. Survival is edit-side only: the fresh path's attempt loop resets + the arena per attempt, so earlier attempts' rows are clobbered there. +- **Recovering surgery**: a splice whose damage and re-parsed span sit clear of + every bar window *commutes with every recovery decision* — kept rows replay + at shifted positions, and the fresh parse behaves strictly across the span, + exactly like the strict re-parse the surgery runs. Attempt k's bars are a + prefix of the final list, so one check against the final list covers every + attempt. The spliced tree keeps its bar list, suffix bars shifted. + +**Known caveat (open).** Taint is tracked on memo entries, not on rows: a +tainted frame's *successful* row is still adoptable by `adoptSeek`. No gate +has constructed a divergence through this path; the candidate fix is a taint +bit on `rowRM` propagated like error containment. + +## Lexer resync under depth shifts + +The windowed re-lex adopts the old token suffix at the first aligned token +where the old suffix's lexing is reproducible from observable state. Two +sufficient conditions (both require empty template stacks on both sides — an +interpolation entry's brace counter is mutable state no record captures — and +a candidate token that carries no cross-token lexer flag its adopted successor +reads): + +- **Equal-depth**: neither lex dipped below the candidate's paren depth since + the divergence point (damage start; before it, identical bytes from an + identical anchor state give identical stacks). Every open entry is then + common to both lexes: the stacks are content-equal, and every future pop + behaves identically. O(1), the common case. +- **Shifted-depth**: the old suffix never pops an entry open at the candidate + (its recorded depth column never dips below the candidate's depth; + pop-on-empty counts as −1). No open entry's head-ness is ever read again, so + stack *contents* are irrelevant and the depths may differ by an arbitrary + shift δ — the splice re-bases the adopted depth records by δ, restoring true + absolute depths (`(`-head bits are local facts of their own neighbors and + stay valid). This is what makes a paren-balance-changing edit O(window) + instead of a relex-to-EOF. + +## Diagnostics are data, derived from the tree + +`cst.errors` is rebuilt at settle from structured lexer entries plus the +`$error`/`$missing` rows found by descending the structurally-propagated +`rowRM` spine — never collected during parsing. That is what makes adoption +safe for diagnostics: an adopted error region re-derives byte-identical +messages from the current token columns. Two derived enrichments: + +- **Viable sets** — for a required literal in a seq, the companion literals + *provably still accepted* when it fails: repetitions before it are always + re-enterable (their nullable-prefix-reachable literals stay viable); + nullable one-shot items are crossed but contribute nothing, since they may + already have consumed. `expected ',' or ']'` never names an impossible + continuation — a static FIRST union would (after `[1, 2` an expression is + not viable), and tsc under-reports the same position as `')' expected`. +- **Paired openers** — for each literal, intersect the sets of preceding + literals across all its seq occurrences; a unique survivor is its structural + opener (`)`←`(`, `]`←`[`, `while`←`do` — derived, no bracket list), attached + as `related` info pointing at the opener leaf among the `$missing`'s earlier + siblings. + +## Measured (9 MB TypeScript, single-character edits, median) + +| phase | Monogram | tsc `updateSourceFile` | tree-sitter | +|---|---:|---:|---:| +| fresh parse | **177 ms** | 212 ms | 458 ms | +| valid keystroke | 0.37 ms | 37 ms | **0.20 ms** | +| breaking edit | 13 ms | 13.3 ms | **0.26 ms** | +| while-broken keystroke | **0.21 ms** | 13.6 ms | 0.31 ms | +| fixing edit | 1.0 ms | 14.1 ms | **0.20 ms** | + +(`test/head-to-head.ts`.) The transition rows are the open lever: the +strict-first architecture pays one adoption-assisted strict pass to *prove* +rejection before recovering — the price of guarantees 2 and 3. + +Error-report agreement with tsc's parser on the conformance files it rejects +(`test/recovery-conformance.ts`, ±8 chars): recall 59.1%, precision 82.4%, +first-error agreement 57.5%. + +## The gates that hold all of this exact + +- `test/incremental-grammars.ts` — generative inputs × seeded edits × all 7 + grammars: every step's tree+errors byte-equal to fresh, self-consistent + spans, no throws (672 steps). +- `test/incremental-verify.ts`, `test/multi-doc.ts` — real-file edit scripts + and interleaved documents under the same byte-equality. +- `test/recovery.ts` — strict-path identity on valid texts, totality and + determinism on an invalid corpus, a char-by-char typing session, and + exact-match diagnostic pins (synthesis quality must not silently regress to + absorption). +- `test/emit-parser-verify.ts` / `test/emit-lexer-verify.ts` — emitted runtime + ≡ interpreter on the corpus, token streams and error messages included. diff --git a/src/emit-parser.ts b/src/emit-parser.ts index fe1eb4a..7fa0018 100644 --- a/src/emit-parser.ts +++ b/src/emit-parser.ts @@ -3450,7 +3450,11 @@ function trySurgery(dmgA, dmgB, tokD, chrD) { // but char-outside - the gap belongs to no node, and tend/toff give the exact // new span. No zero-width kid can end such a node: zero-width rows live at // bars, and bars adjacent to the damage were refused above. - const keepEndD = Dbase + rowTokLen[D] > dmgB; + // ... and only while the node's char BASE is unchanged (a base token at/inside + // the damage was re-lexed and may have moved - leading trivia inserted at a + // node's very start shifts base and end together, leaving the LENGTH alone, + // which is exactly what the token derivation computes) + const keepEndD = Dbase + rowTokLen[D] > dmgB && Dbase < dmgA; rowTokLen[D] += tokD; if (keepEndD) rowLen[D] += chrD; else if (rowTokLen[D] > 0) rowLen[D] = tend(Dbase + rowTokLen[D] - 1) - toff(Dbase); @@ -3527,7 +3531,7 @@ function trySurgery(dmgA, dmgB, tokD, chrD) { // (end-relative kids past the boundary auto-shift via the length update below) } } - const keepEndA = surgBase[i] + rowTokLen[Ai] > dmgB; // see rowLen[D] above + const keepEndA = surgBase[i] + rowTokLen[Ai] > dmgB && surgBase[i] < dmgA; // see rowLen[D] above rowTokLen[Ai] += tokD; if (keepEndA) rowLen[Ai] += chrD; else if (rowTokLen[Ai] > 0) rowLen[Ai] = tend(surgBase[i] + rowTokLen[Ai] - 1) - toff(surgBase[i]); diff --git a/test/check.ts b/test/check.ts index 53d3365..68913a4 100644 --- a/test/check.ts +++ b/test/check.ts @@ -25,6 +25,7 @@ const GATES: Gate[] = [ { group: 'core', name: 'multi-doc', args: ['test/multi-doc.ts'] }, { group: 'core', name: 'recovery', args: ['test/recovery.ts'] }, { group: 'core', name: 'incremental-grammars', args: ['test/incremental-grammars.ts'] }, + { group: 'core', name: 'exhaustive-edits', args: ['test/exhaustive-edits.ts'] }, { group: 'core', name: 'issue-cases', args: ['test/test-issues.ts'] }, { group: 'conformance', name: 'js', args: ['test/js-conformance.ts'] }, { group: 'conformance', name: 'tsx', args: ['test/tsx-conformance.ts'] }, diff --git a/test/exhaustive-edits.ts b/test/exhaustive-edits.ts new file mode 100644 index 0000000..5131132 --- /dev/null +++ b/test/exhaustive-edits.ts @@ -0,0 +1,74 @@ +// Gate: BOUNDED-EXHAUSTIVE edit/fresh equivalence. Over a small expression +// grammar, enumerate EVERY document up to N characters over the grammar's +// alphabet, and for each apply EVERY single-character edit (every deletion, +// every replacement, every insertion at every position). Each edited handle +// must be byte-identical — tree AND errors — to a fresh parse of the edited +// text. Unlike the generative gates this is complete within its bound: any +// equivalence bug reachable through small documents has a witness here. +// +// node --max-old-space-size=4096 test/exhaustive-edits.ts +import { writeFileSync } from 'node:fs'; +import { token, rule, defineGrammar, many, opt, sep, plus, oneOf, range, seq, star, noneOf } from '../src/api.ts'; +import { emitParser } from '../src/emit-parser.ts'; +import { objectify } from './emitted-obj.ts'; + +// A deliberately bracket-and-list-shaped grammar: parens force synthesis and +// paired-opener paths, ';' forces statement splits, '+' forces Pratt-free +// infix shapes through the seq machinery, idents and numbers collide at edits. +const Ident = token(plus(oneOf(range('a', 'b'))), { identifier: true }); +const Num = token(plus(oneOf(range('0', '1'))), {}); +const Expr = rule(($: unknown) => [ + Ident, + Num, + ['(', sep($, ','), ')'], + [$, '+', $], +]); +const Stmt = rule(() => [[Expr, ';']]); +const Program = rule(() => [[many(Stmt)]]); +const g = defineGrammar({ + name: 'mini', scopeName: 'source.mini', + tokens: { Ident, Num }, + rules: { Expr, Stmt, Program }, entry: Program, +}); + +const emPath = '/tmp/emitted-exhaustive.mjs'; +writeFileSync(emPath, emitParser(g)); +type Cst = { root: number; errors: object[] }; +type Parser = { parse(s: string): Cst; edit(c: Cst, e: object[]): void; visit(c: Cst, fns: object): void; tree: import('./emitted-obj.ts').TreeView }; +const em = (await import(emPath + '?v=' + process.pid)) as { createParser(): Parser }; + +const ALPHABET = ['a', '0', '(', ')', ',', '+', ';', ' ']; +const MAXLEN = Number(process.env.EXH_MAXLEN ?? 4); // ~330k steps; EXH_MAXLEN=5 for the 3.2M-step deep run + +const fresh = em.createParser(); +const edited = em.createParser(); +const H = (p: Parser, c: Cst) => JSON.stringify(objectify(p.tree, (fns) => p.visit(c, fns))) + JSON.stringify(c.errors); + +let docs = 0, edits = 0, mismatches = 0; +const docsAt: string[][] = [['']]; +for (let L = 1; L <= MAXLEN; L++) { + docsAt.push(docsAt[L - 1].flatMap(d => ALPHABET.map(ch => d + ch))); +} +for (let L = 0; L <= MAXLEN; L++) { + for (const base of docsAt[L]) { + docs++; + const variants: { start: number; end: number; text: string }[] = []; + for (let i = 0; i < base.length; i++) variants.push({ start: i, end: i + 1, text: '' }); // delete + for (let i = 0; i < base.length; i++) for (const ch of ALPHABET) if (ch !== base[i]) variants.push({ start: i, end: i + 1, text: ch }); // replace + for (let i = 0; i <= base.length; i++) for (const ch of ALPHABET) variants.push({ start: i, end: i, text: ch }); // insert + for (const v of variants) { + edits++; + const c = edited.parse(base); // re-open the handle on the base text + edited.edit(c, [v]); + const next = base.slice(0, v.start) + v.text + base.slice(v.end); + const fc = fresh.parse(next); + if (H(edited, c) !== H(fresh, fc)) { + mismatches++; + if (mismatches <= 10) console.log(` ✗ «${base}» + ${JSON.stringify(v)} → «${next}»`); + } + } + } +} +console.log(`exhaustive-edits: ${docs} documents ≤${MAXLEN} chars × every 1-char edit = ${edits} steps · ${mismatches} mismatches`); +if (mismatches > 0) { console.error('✗ edit ≢ fresh inside the exhaustive bound'); process.exit(1); } +console.log('✓ edit ≡ fresh holds COMPLETELY within the bound (tree + errors, byte-identical)'); From 397a76dd735b865013f72c7f999b1f1e343350d0 Mon Sep 17 00:00:00 2001 From: Johnson Chu Date: Fri, 12 Jun 2026 03:47:37 +0800 Subject: [PATCH 16/23] Attribute the transition-edit cost to what profiling actually shows Phase-timing the head-to-head's 13ms breaking edit: the strict-fail pass is 0.35ms and the recovery attempts 0.6ms - the cost is lexer-layer suffix bookkeeping on the bench's first-touch 4.5MB cursor jump (a one-time suffix-min allocation plus EOF-relative re-basing of the token columns across the jump). Repeated break/fix transitions at one cursor position settle to ~2ms. README and TOTAL-PARSING.md now say so instead of blaming the strict-first pass. --- README.md | 2 +- TOTAL-PARSING.md | 10 +++++++--- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index d97e371..1ac05aa 100644 --- a/README.md +++ b/README.md @@ -242,7 +242,7 @@ One 9 MB TypeScript document, identical single-character edit scripts (`test/hea | tsc `updateSourceFile` | 212 ms | 37 ms | 13.3 ms | 13.6 ms | 14.1 ms | | tree-sitter (official) | 458 ms | **0.20 ms** | **0.26 ms** | 0.31 ms | **0.20 ms** | -Monogram beats tsc on every phase (valid typing ~100×, while-broken ~60×) and beats or matches tree-sitter everywhere except the two **transition** edits (break/fix), where the strict-first architecture pays one adoption-assisted strict pass to *prove* the text rejects before recovering — the price of the byte-identity guarantees above, and the open lever. +Monogram beats tsc on every phase (valid typing ~100×, while-broken ~60×) and beats or matches tree-sitter everywhere except the two **transition** edits (break/fix). Profiling attributes those almost entirely to lexer-layer suffix bookkeeping on a first-touch 4.5 MB cursor jump (a one-time table allocation plus EOF-relative re-basing of the token columns) — the parser passes themselves measure under 1 ms, and repeated break/fix transitions at one cursor position settle to ~2 ms. ## What you get diff --git a/TOTAL-PARSING.md b/TOTAL-PARSING.md index 228a9b5..4344850 100644 --- a/TOTAL-PARSING.md +++ b/TOTAL-PARSING.md @@ -200,9 +200,13 @@ messages from the current token columns. Two derived enrichments: | while-broken keystroke | **0.21 ms** | 13.6 ms | 0.31 ms | | fixing edit | 1.0 ms | 14.1 ms | **0.20 ms** | -(`test/head-to-head.ts`.) The transition rows are the open lever: the -strict-first architecture pays one adoption-assisted strict pass to *prove* -rejection before recovering — the price of guarantees 2 and 3. +(`test/head-to-head.ts`.) The transition rows measure a first-touch 4.5 MB +cursor jump: profiling splits the 13 ms into lexer-layer suffix bookkeeping +(a one-time suffix-min allocation plus EOF-relative re-basing of the token +columns across the jump) with the strict-fail pass at 0.35 ms and the +recovery attempts at 0.6 ms; repeated break/fix transitions at one cursor +position settle to ~2 ms. The remaining gap to tree-sitter is array-storage +suffix splicing, not parsing. Error-report agreement with tsc's parser on the conformance files it rejects (`test/recovery-conformance.ts`, ±8 chars): recall 59.1%, precision 82.4%, From 476ab69c50e351f2c8c826489446da0893fdcd6d Mon Sep 17 00:00:00 2001 From: Johnson Chu Date: Fri, 12 Jun 2026 03:59:13 +0800 Subject: [PATCH 17/23] Row-level taint + reject body-less class expressions rowRM becomes bitwise: bit 1 keeps the structural error containment the diagnostics walk descends; bit 2 marks a CONTEXT-TAINTED result - a frame whose parse leaned on the cycle sentinel finding an ancestor (its outcome is a function of the ancestor stack, not the text). The memo stamp alone only protected the entry; the row adoptSeek can find was still reusable. Tainted rows now also refuse recovering adoption and run extension, closing the open caveat documented in TOTAL-PARSING.md. Strict adoption already required rowRM === 0 and is unchanged. notReservedExpr gains 'class': a valid class expression always out-matches the bare-identifier fallback under longest-match, so forbidding the fallback only rejects broken classes - 'const k = class extends D ;' with no body parsed as three statements. A zero-flip accept/reject scan over the whole single-file conformance corpus proves no valid shape regressed; 'extends' stays OUT - it is load-bearing for tsc's tolerated heritage shapes ('interface I extends { }', 'extends A extends B', 'extends Foo?.Bar' are all parse-accepted by tsc through the fallback, measured). Gates: 34/34, corpus parity 401/401, generated outputs byte-identical, transitions unchanged (~6ms first-touch, ~2ms steady). --- javascript.ts | 21 +++++++++++++-------- src/emit-parser.ts | 36 ++++++++++++++++++++++++++---------- 2 files changed, 39 insertions(+), 18 deletions(-) diff --git a/javascript.ts b/javascript.ts index 6ad09e6..fa920d9 100644 --- a/javascript.ts +++ b/javascript.ts @@ -177,15 +177,20 @@ export const notReserved = not(alt( // the bare-identifier fallback inside otherwise-valid files (e.g. `export default …`, // undeclared `for (x in …)`, `class … extends (e)`, a decorator before `export`). The // words below have NO such role: the prefix operators `void`/`typeof`/`delete` (which -// must take an operand), the `catch`/`throw` keywords, `enum`, and `case` (a bare -// `case` expression let `case 1 y();` inside a switch parse as three statements). -// Forbidding the bare-identifier fallback for exactly these rejects `catch(x){}` with -// no `try`, `void ;`/`typeof ;`/`delete ;` (operatorless prefix op), `throw ;`, and a -// colon-less `case` — while leaving every valid expression (and TS's recovery cases) -// untouched. Verified per the conformance matrix's FN=0 gate: widening this set to -// other reserved words regresses valid code; these are the FN-safe maximum. +// must take an operand), the `catch`/`throw` keywords, `enum`, `case` (a bare `case` +// expression let `case 1 y();` inside a switch parse as three statements), and +// `class` (a valid class expression always out-matches the bare-identifier fallback, +// so forbidding the fallback only rejects broken classes — `class extends D ;` with +// no body parsed as three statements). Forbidding the bare-identifier fallback for +// exactly these rejects `catch(x){}` with no `try`, `void ;`/`typeof ;`/`delete ;` +// (operatorless prefix op), `throw ;`, a colon-less `case`, and a body-less `class` +// — while leaving every valid expression (and TS's recovery cases) untouched. +// Verified by a zero-flip accept/reject scan over the conformance corpus; widening +// further regresses: `extends` is load-bearing for tsc's tolerated heritage shapes +// (`interface I extends { }` reads `{` as the body, `extends A extends B`, +// `extends Foo?.Bar` — all parse-accepted by tsc through the identifier fallback). export const notReservedExpr = not(alt( - 'case', 'catch', 'delete', 'enum', 'throw', 'typeof', 'void', + 'case', 'catch', 'class', 'delete', 'enum', 'throw', 'typeof', 'void', )); // ── Precedence ladder (shared ECMAScript operator precedence) ── diff --git a/src/emit-parser.ts b/src/emit-parser.ts index 7fa0018..09f0583 100644 --- a/src/emit-parser.ts +++ b/src/emit-parser.ts @@ -1772,14 +1772,20 @@ function finishNode(rid, mark) { rowKC[id] = 0; rowNF[id] = 0x7fffffff; rowRM[id] = 0; - // recovery-made propagation: STRUCTURAL — a row contains an error iff a kid is an - // $error row or itself recovery-made. Batch parses never enter the branch. + // recovery-made propagation: STRUCTURAL, bitwise — bit 1: a kid is (or contains) + // an $error row; bit 2: a kid's result is context-tainted (the cycle sentinel) + // and must not be reused outside its own parse. Batch parses never enter this. if (recovering) { const ke = rowStart[id] + rowCount[id]; + let rm = 0; for (let i2 = rowStart[id]; i2 < ke; i2++) { const e2 = kids[i2]; - if (e2 >= 0 && (rowRM[e2] !== 0 || rowRule[e2] >= RID_ERROR)) { rowRM[id] = 1; break; } + if (e2 >= 0) { + rm |= rowRM[e2] | (rowRule[e2] >= RID_ERROR ? 1 : 0); + if (rm === 3) break; + } } + rowRM[id] = rm; } absChar[id] = myOff; absTok[id] = myTok; scn = mark; @@ -1818,14 +1824,20 @@ function finishWrap(rid, lhsId, mark) { rowKC[id] = 0; rowNF[id] = 0x7fffffff; rowRM[id] = 0; - // recovery-made propagation: STRUCTURAL — a row contains an error iff a kid is an - // $error row or itself recovery-made. Batch parses never enter the branch. + // recovery-made propagation: STRUCTURAL, bitwise — bit 1: a kid is (or contains) + // an $error row; bit 2: a kid's result is context-tainted (the cycle sentinel) + // and must not be reused outside its own parse. Batch parses never enter this. if (recovering) { const ke = rowStart[id] + rowCount[id]; + let rm = 0; for (let i2 = rowStart[id]; i2 < ke; i2++) { const e2 = kids[i2]; - if (e2 >= 0 && (rowRM[e2] !== 0 || rowRule[e2] >= RID_ERROR)) { rowRM[id] = 1; break; } + if (e2 >= 0) { + rm |= rowRM[e2] | (rowRule[e2] >= RID_ERROR ? 1 : 0); + if (rm === 3) break; + } } + rowRM[id] = rm; } absChar[id] = myOff; absTok[id] = myTok; scn = mark; @@ -2587,6 +2599,10 @@ function parseRuleEntry(idx, rid, name, core) { mg[start] = tainted ? -memoGenCur : memoGenCur; if (result >= 0) { rowOK[result] = 1; + // a context-tainted result (cycle refusal leaning on an ancestor) is also + // untrustworthy as a ROW: stamp rowRM bit 2 so adoption refuses it — the + // memo stamp alone only protects the entry, not the row adoptSeek can find + if (tainted) rowRM[result] |= 2; // The row's OWN watermark freezes at finishNode — for a Pratt rule that is // BEFORE the failed LED extension arms run (the NUD/shorter row survives the // longest-match), so rowExt under-records the rule's true probe extent and a @@ -2851,7 +2867,7 @@ function adoptSeek(q, rid) { let xid = e, xb = cb; for (;;) { if (rowOK[xid] !== 0 && rowRule[xid] === rid - && (recovering || rowRM[xid] === 0) + && ((recovering ? rowRM[xid] & 2 : rowRM[xid]) === 0) && (q + rowExt[xid] + 2 <= adoptDmgStart || q >= adoptDmgOldEnd)) { return xid; } @@ -3039,7 +3055,7 @@ function collectErrRows(id, charBase, tokBase) { const cs = rowStart[id], n = rowCount[id]; for (let i = 0; i < n; i++) { const e = kids[cs + i]; - if (e >= 0 && (rowRM[e] !== 0 || rowRule[e] >= RID_ERROR)) { + if (e >= 0 && ((rowRM[e] & 1) !== 0 || rowRule[e] >= RID_ERROR)) { if (rowRule[e] === RID_MISSING) { // a missing CLOSER names its matched opener (tsc's "to match this '('"): // PAIR_OPEN holds the grammar-derived structural pair, and the opener leaf @@ -3072,7 +3088,7 @@ function collectErrRows(id, charBase, tokBase) { // diagnostics (fresh survivors + adopted rowRM subtrees), ordered by offset. function settleDiags() { docPar.length = 0; - if (lastRoot >= 0 && (rowRM[lastRoot] !== 0 || rowRule[lastRoot] >= RID_ERROR)) { + if (lastRoot >= 0 && ((rowRM[lastRoot] & 1) !== 0 || rowRule[lastRoot] >= RID_ERROR)) { collectErrRows(lastRoot, rootCharBase, rootTokBase); } rebuildDiagView(); @@ -3172,7 +3188,7 @@ function runExtend(rid) { if (e < 0) break; if (pb + ktr(P, i) !== oq) break; if (rowRule[e] !== rid || rowOK[e] === 0) break; - if (!recovering && rowRM[e] !== 0) break; + if ((recovering ? rowRM[e] & 2 : rowRM[e]) !== 0) break; if (recovering && !barsWindowEq(nq, oq, rowExt[e])) break; const tl = rowTokLen[e]; if (tl === 0) break; From d61726b20b436b8750943781b9e141872b2089f9 Mon Sep 17 00:00:00 2001 From: Johnson Chu Date: Fri, 12 Jun 2026 04:08:37 +0800 Subject: [PATCH 18/23] O(1) shifted-resync check at depth 0 via a pop-on-empty index list The shifted lexer resync's dominant case is a depth-0 candidate (statement boundary), where 'the old suffix never pops an entry open at the candidate' collapses to 'no pop-on-empty beyond the candidate'. The lexer now records the token indices of ')' pops that found an empty paren stack (an ascending doc-level list, almost always empty - a stray closer beyond balance), recomposed by the window splice, shifted by the '>'-split, and persisted on the document register set. The depth-0 check is then one end-of-list comparison instead of an O(suffix) minimum build; only depth > 0 candidates (e.g. the fixing direction of a broken document) still build the suffix minimum, lazily once per edit. Steady-state breaking transitions on 9MB drop ~2.1ms -> ~1.6-1.9ms; the profile now reads strict-fail 0.23ms + attempts 0.46ms + spread bookkeeping, with the raw 7-column suffix memmove measured at 0.07ms - no storage floor in the way. README/TOTAL-PARSING tables refreshed from a fresh head-to-head run, with the cursor-jump amortization stated as what it is (a far jump pays once, proportional to distance; local typing never rewrites the suffix). Gates: 34/34, lexer parity 5695 diff=0, incremental-grammars 672/672, corpus parity, perf-bench under ceiling. --- README.md | 8 ++++---- TOTAL-PARSING.md | 37 +++++++++++++++++++++---------------- src/emit-lexer.ts | 19 ++++++++++++++++--- src/emit-parser.ts | 25 +++++++++++++++++++++---- 4 files changed, 62 insertions(+), 27 deletions(-) diff --git a/README.md b/README.md index 1ac05aa..d575278 100644 --- a/README.md +++ b/README.md @@ -238,11 +238,11 @@ One 9 MB TypeScript document, identical single-character edit scripts (`test/hea | engine | fresh parse | valid ✎ | breaking ✎ | while-broken ✎ | fixing ✎ | |---|---:|---:|---:|---:|---:| -| **Monogram** | **177 ms** | 0.37 ms | 13.0 ms | **0.21 ms** | 1.0 ms | -| tsc `updateSourceFile` | 212 ms | 37 ms | 13.3 ms | 13.6 ms | 14.1 ms | -| tree-sitter (official) | 458 ms | **0.20 ms** | **0.26 ms** | 0.31 ms | **0.20 ms** | +| **Monogram** | **167 ms** | 0.37 ms | 12 ms | **0.22 ms** | 2.2 ms | +| tsc `updateSourceFile` | 207 ms | 35 ms | 12.0 ms | 11.9 ms | 11.9 ms | +| tree-sitter (official) | 430 ms | **0.18 ms** | **0.29 ms** | 0.30 ms | **0.22 ms** | -Monogram beats tsc on every phase (valid typing ~100×, while-broken ~60×) and beats or matches tree-sitter everywhere except the two **transition** edits (break/fix). Profiling attributes those almost entirely to lexer-layer suffix bookkeeping on a first-touch 4.5 MB cursor jump (a one-time table allocation plus EOF-relative re-basing of the token columns) — the parser passes themselves measure under 1 ms, and repeated break/fix transitions at one cursor position settle to ~2 ms. +Monogram beats tsc on every phase (valid typing ~100×, while-broken ~50×) and beats or matches tree-sitter everywhere except the two **transition** edits (break/fix). Profiling attributes those almost entirely to the bench's 4.5 MB cursor jump: token-column offsets are EOF-relative-biased so that local typing never rewrites the suffix (that is what makes the valid keystroke 0.37 ms), and the bias boundary moves with the cursor — a far jump pays once, proportional to the jump distance, then repeated break/fix transitions at that position settle to **~1.6–2 ms** (the parser passes measure under 1 ms of that). ## What you get diff --git a/TOTAL-PARSING.md b/TOTAL-PARSING.md index 4344850..9583a1e 100644 --- a/TOTAL-PARSING.md +++ b/TOTAL-PARSING.md @@ -141,10 +141,10 @@ Corollaries, each carrying one optimization: prefix of the final list, so one check against the final list covers every attempt. The spliced tree keeps its bar list, suffix bars shifted. -**Known caveat (open).** Taint is tracked on memo entries, not on rows: a -tainted frame's *successful* row is still adoptable by `adoptSeek`. No gate -has constructed a divergence through this path; the candidate fix is a taint -bit on `rowRM` propagated like error containment. +Taint is tracked on rows as well as memo entries: a tainted frame's row +carries `rowRM` bit 2, propagated structurally like error containment, and +recovering adoption / run extension refuse it — a context-dependent result is +never reused outside the parse that computed it. ## Lexer resync under depth shifts @@ -167,7 +167,11 @@ reads): shift δ — the splice re-bases the adopted depth records by δ, restoring true absolute depths (`(`-head bits are local facts of their own neighbors and stay valid). This is what makes a paren-balance-changing edit O(window) - instead of a relex-to-EOF. + instead of a relex-to-EOF. The dominant candidate depth is 0 (statement + boundaries), where the condition collapses to "no pop-on-empty beyond the + candidate" — answered O(1) from an ascending doc-level list of pop-on-empty + token indices (almost always empty) instead of an O(suffix) min-build; only + depth > 0 candidates build the suffix minimum, lazily once per edit. ## Diagnostics are data, derived from the tree @@ -194,19 +198,20 @@ messages from the current token columns. Two derived enrichments: | phase | Monogram | tsc `updateSourceFile` | tree-sitter | |---|---:|---:|---:| -| fresh parse | **177 ms** | 212 ms | 458 ms | -| valid keystroke | 0.37 ms | 37 ms | **0.20 ms** | -| breaking edit | 13 ms | 13.3 ms | **0.26 ms** | -| while-broken keystroke | **0.21 ms** | 13.6 ms | 0.31 ms | -| fixing edit | 1.0 ms | 14.1 ms | **0.20 ms** | +| fresh parse | **167 ms** | 207 ms | 430 ms | +| valid keystroke | 0.37 ms | 35 ms | **0.18 ms** | +| breaking edit | 12 ms | 12.0 ms | **0.29 ms** | +| while-broken keystroke | **0.22 ms** | 11.9 ms | 0.30 ms | +| fixing edit | 2.2 ms | 11.9 ms | **0.22 ms** | (`test/head-to-head.ts`.) The transition rows measure a first-touch 4.5 MB -cursor jump: profiling splits the 13 ms into lexer-layer suffix bookkeeping -(a one-time suffix-min allocation plus EOF-relative re-basing of the token -columns across the jump) with the strict-fail pass at 0.35 ms and the -recovery attempts at 0.6 ms; repeated break/fix transitions at one cursor -position settle to ~2 ms. The remaining gap to tree-sitter is array-storage -suffix splicing, not parsing. +cursor jump: token offsets are EOF-relative-biased so local typing never +rewrites the suffix (the 0.37 ms valid keystroke), and the bias boundary +moves with the cursor — a far jump pays once, proportional to the distance. +Repeated break/fix transitions at one position settle to ~1.6–2 ms, of +which the strict-fail pass is 0.23 ms and the recovery attempts 0.46 ms; +the raw 7-column suffix memmove measures 0.07 ms, so the residual is spread +bookkeeping, not a storage floor. Error-report agreement with tsc's parser on the conformance files it rejects (`test/recovery-conformance.ts`, ±8 chars): recall 59.1%, precision 82.4%, diff --git a/src/emit-lexer.ts b/src/emit-lexer.ts index 4a9832c..625c872 100644 --- a/src/emit-lexer.ts +++ b/src/emit-lexer.ts @@ -118,6 +118,8 @@ export function emitLexer(grammar: CstGrammar, st: LexerSymtab): string | null { emit(`let lexResyncPd = 0;`); emit(`let altSuffMin = null;`); emit(`let altSuffMinBuf = null;`); + emit(`// ')' pops that found an empty stack, in THIS lexCore call's token indices`); + emit(`let lexEmptyPops = [];`); emit(`// Min OLD-stream paren depth over the tokens inside the damage itself (set by the`); emit(`// caller before the window lex): the old-side trajectory min starts from here.`); emit(`let wndOldMin0 = 0x7fffffff;`); @@ -268,6 +270,7 @@ export function emitLexer(grammar: CstGrammar, st: LexerSymtab): string | null { emit(` const parenHeadStack = initParens !== undefined && initParens !== null ? initParens : [];`); emit(` let wndPtr = wndPtr0;`); emit(` let wndHit = -1;`); + emit(` lexEmptyPops.length = 0;`); emit(` // Trajectory minimums since the point the two lexes diverge (the damage start;`); emit(` // before it, identical bytes from an identical anchor state give identical`); emit(` // tokens and stack ops). An entry at depth <= BOTH mins was open at the`); @@ -318,8 +321,17 @@ export function emitLexer(grammar: CstGrammar, st: LexerSymtab): string | null { emit(` wndHit = wndPtr;`); emit(` lexResyncPd = 0;`); emit(` } else {`); - emit(` if (altSuffMin === null) buildAltSuffMin(wndPtr0);`); - emit(` if (altSuffMin[wndPtr + 1] >= q) {`); + emit(` // shifted: q = 0 needs only "no pop-on-empty beyond the candidate"`); + emit(` // (the doc-level list is ascending - one end check); q > 0 needs the`); + emit(` // full suffix minimum, built lazily once per edit`); + emit(` let okTail;`); + emit(` if (q === 0) {`); + emit(` okTail = docEmptyPops.length === 0 || docEmptyPops[docEmptyPops.length - 1] <= wndPtr;`); + emit(` } else {`); + emit(` if (altSuffMin === null) buildAltSuffMin(wndPtr0);`); + emit(` okTail = altSuffMin[wndPtr + 1] >= q;`); + emit(` }`); + emit(` if (okTail) {`); emit(` wndHit = wndPtr;`); emit(` lexResyncPd = pd - q;`); emit(` }`); @@ -461,7 +473,8 @@ export function emitLexer(grammar: CstGrammar, st: LexerSymtab): string | null { emit(`${ind} parenHeadStack.push(_ph);`); emit(`${ind} extraFl = _ph ? 8 : 0; }`); } else if (lit === ')') { - emit(`${ind}lastCloseWasParenHead = parenHeadStack.pop() ?? false;`); + emit(`${ind}if (parenHeadStack.length === 0) { lastCloseWasParenHead = false; lexEmptyPops.push(tokN); }`); + emit(`${ind}else lastCloseWasParenHead = parenHeadStack.pop();`); } if (regexCtx?.postfixAfterValueTexts?.includes(lit)) { emit(`${ind}lastBangWasPostfix = prevIsValue();`); diff --git a/src/emit-parser.ts b/src/emit-parser.ts index 09f0583..86f9373 100644 --- a/src/emit-parser.ts +++ b/src/emit-parser.ts @@ -1950,6 +1950,7 @@ function matchPuLitGT(pu, vs) { memoGenCur++; // positions shifted mid-parse: every stamped entry is stale memoRecFloor = 0x7fffffff; // including across attempts: pre-split positions // can never be revalidated against the new stream + for (let _ep = docEmptyPops.length - 1; _ep >= 0 && docEmptyPops[_ep] >= pos; _ep--) docEmptyPops[_ep]++; // GREEN tree: no kids/scratch fixup — every completed row and scratch entry lies // wholly BEFORE the splice point (token pos is being consumed right now), and the // carried memo was just cleared, so nothing reachable references shifted indices. @@ -2719,7 +2720,8 @@ function visitCore(entry, fns, charBase, tokBase) { // Parse to the ARENA: returns the root node id. function lexInto(source) { -${e.soa ? ` tokenize(source);` : String.raw` docPieces = [source]; docPieceOff = [0]; docLen = source.length; docFlat = source; docCur = 0; +${e.soa ? ` tokenize(source); + docEmptyPops = lexEmptyPops.slice();` : String.raw` docPieces = [source]; docPieceOff = [0]; docLen = source.length; docFlat = source; docCur = 0; const _toks = tokenize(source); const _n = _toks.length; while (tkCap < _n + 1) growTok(); @@ -3108,6 +3110,12 @@ function rebuildDiagView() { // repetition ends PAST a bar stay silent (pos > bar), and the runParse safety net // obeys the same discipline (an ungated net would absorb on the FIRST bar-less // attempt and pre-empt the whole iteration). +// Token indices of ')' pops that found an EMPTY paren stack, ascending (the lexer +// appends as it lexes; the window splice recomposes). Almost always empty — a +// stray closer beyond balance. The shifted lexer resync's dominant q=0 case needs +// exactly one fact about the whole old suffix ("no pop-on-empty beyond the +// candidate"), which this list answers O(1) instead of an O(suffix) min-build. +let docEmptyPops = []; // Bar list that built lastRoot (that run's token coords); null = free-fire built // (free-fire decisions are not bar-pure — such a tree is never adoptable while // recovering). Strict trees carry []. @@ -3597,7 +3605,7 @@ function makeDoc() { memoNode: [], memoEnd: [], memoExt: [], memoGen: [], memoGenCur: 0, docDiags: [], docLex: [], docPar: [], docPieces: null, docPieceOff: null, docLen: 0, docFlat: null, docCur: 0, - rootCharBase: 0, rootTokBase: 0, lastRoot: -1, lastRootTok: 0, + rootCharBase: 0, rootTokBase: 0, lastRoot: -1, lastRootTok: 0, docEmptyPops: [], ${e.soa ? ' parenCachePos: -1, parenCacheStack: [],' : ''} altK: null, altT: null, altOff: null, altEnd: null, altFl: null, altDp: null, altPd: null, altCap: 0, altN: 0, @@ -3616,7 +3624,7 @@ function saveDoc(d) { d.docDiags = docDiags; d.docLex = docLex; d.docPar = docPar; d.docPieces = docPieces; d.docPieceOff = docPieceOff; d.docLen = docLen; d.docFlat = docFlat; d.docCur = docCur; d.rootCharBase = rootCharBase; d.rootTokBase = rootTokBase; - d.lastRoot = lastRoot; d.lastRootTok = lastRootTok; d.lastBars = lastBars; + d.lastRoot = lastRoot; d.lastRootTok = lastRootTok; d.lastBars = lastBars; d.docEmptyPops = docEmptyPops; ${e.soa ? ' d.parenCachePos = parenCachePos; d.parenCacheStack = parenCacheStack;' : ''} d.altK = altK; d.altT = altT; d.altOff = altOff; d.altEnd = altEnd; d.altFl = altFl; d.altDp = altDp; d.altPd = altPd; d.altCap = altCap; d.altN = altN; @@ -3634,7 +3642,7 @@ function loadDoc(d) { docDiags = d.docDiags; docLex = d.docLex; docPar = d.docPar; docPieces = d.docPieces; docPieceOff = d.docPieceOff; docLen = d.docLen; docFlat = d.docFlat; docCur = d.docCur; rootCharBase = d.rootCharBase; rootTokBase = d.rootTokBase; - lastRoot = d.lastRoot; lastRootTok = d.lastRootTok; lastBars = d.lastBars; + lastRoot = d.lastRoot; lastRootTok = d.lastRootTok; lastBars = d.lastBars; docEmptyPops = d.docEmptyPops; ${e.soa ? ' parenCachePos = d.parenCachePos; parenCacheStack = d.parenCacheStack;' : ''} altK = d.altK; altT = d.altT; altOff = d.altOff; altEnd = d.altEnd; altFl = d.altFl; altDp = d.altDp; altPd = d.altPd; altCap = d.altCap; altN = d.altN; @@ -3933,6 +3941,15 @@ ${e.soa ? String.raw` // ── M1: WINDOWED re-lex ── for (let i = B + 1 + W; i < nN; i++) tkPd[i] += lexResyncPd; lexResyncPd = 0; } + // recompose the pop-on-empty index list: kept prefix + the window's own + // (window-relative + B+1) + kept suffix riding the token delta + { + const nep = []; + for (let i = 0; i < docEmptyPops.length && docEmptyPops[i] <= B; i++) nep.push(docEmptyPops[i]); + for (let i = 0; i < lexEmptyPops.length; i++) nep.push(lexEmptyPops[i] + B + 1); + for (let i = 0; i < docEmptyPops.length; i++) { const v = docEmptyPops[i]; if (v >= R) nep.push(v + tokenDelta); } + docEmptyPops = nep; + } const nN2 = nN;` : String.raw` // (fallback-lexer grammars keep the full-relex + token-diff path) const oK = tkK, oT = tkT, oOff = tkOff, oEnd = tkEnd, oFl = tkFl, oN = tokN; const oText = tkText; From 3d8f494cd427d9fbe89e58016500cb263ce84ce7 Mon Sep 17 00:00:00 2001 From: Johnson Chu Date: Fri, 12 Jun 2026 04:21:08 +0800 Subject: [PATCH 19/23] Block bare statement keywords as expressions; for-in takes comma objects notReservedExpr grows by the statement keywords with no expression role: break, continue, debugger, do, else, finally, for, if, return, switch, try, while, with. Bare 'if' parsed as an identifier expression, which let 'namespace if {}' (the namespace arm correctly fails its notReserved name) fall apart into three accepted identifier statements - the same fallback family as 'case'/'class'. 'var' stays OUT: tsc parse-accepts 'for (var of X)' through shapes that need it. Blocking 'for' exposed a real grammar gap the fallback had been MASKING: 'for (a in b[c] = b[c] || [], d)' previously parsed as a CALL of the identifier 'for' (the for-statement arm failed, the call parse won). The for-in OBJECT is a full Expression - comma included - so both ForHead in-arms gain many(',', Expr); for-of keeps a single AssignmentExpression (tsc rejects 'for (x of a, b)', and so do we, where we previously mis-accepted it through the call fallback). Per-flip tsc verdict over the whole single-file conformance corpus: 7 flips, ALL toward tsc, 0 away. Error-recovery conformance recall 59.1% -> 61.2%, first-error agreement 57.5% -> 59.7%, we-accept files 115 -> 108. Gates 34/34, corpus parity 401/401, tree-sitter generate clean on all 4 affected grammars, gate:treesitter 96.0%. --- javascript.ts | 12 +++++++++--- tree-sitter/javascript/grammar.js | 2 +- tree-sitter/javascriptreact/grammar.js | 2 +- tree-sitter/typescript/grammar.js | 2 +- tree-sitter/typescriptreact/grammar.js | 2 +- typescript.ts | 8 ++++++-- 6 files changed, 19 insertions(+), 9 deletions(-) diff --git a/javascript.ts b/javascript.ts index fa920d9..07ac539 100644 --- a/javascript.ts +++ b/javascript.ts @@ -190,7 +190,9 @@ export const notReserved = not(alt( // (`interface I extends { }` reads `{` as the body, `extends A extends B`, // `extends Foo?.Bar` — all parse-accepted by tsc through the identifier fallback). export const notReservedExpr = not(alt( - 'case', 'catch', 'class', 'delete', 'enum', 'throw', 'typeof', 'void', + 'break', 'case', 'catch', 'class', 'continue', 'debugger', 'delete', 'do', + 'else', 'enum', 'finally', 'for', 'if', 'return', 'switch', 'throw', 'try', + 'typeof', 'void', 'while', 'with', )); // ── Precedence ladder (shared ECMAScript operator precedence) ── @@ -385,14 +387,18 @@ const ForHead = rule($ => { // ForBinding gives a no-`in` initializer so `for (var a = 1 in xs)` parses. [alt('let', 'const', 'var', 'using', ['await', 'using']), sep(ForBinding, ','), alt( cTail, - [alt('in', 'of'), Expr], + // the for-in OBJECT is a full Expression (comma included: `for (a in b, c)`); + // for-of takes an AssignmentExpression - no comma (tsc rejects `for (x of a, b)`) + ['in', Expr, many(',', Expr)], + ['of', Expr], )], [opt(Expr, many(',', Expr)), ...cTail], // C-style, no declaration: `for (i=0; …; …)` / `for (;;)` // for-in/of, no declaration: `for (x of xs)`. The target Expr parses in a no-`in` // context (same exclude as binding initializers): the `in` belongs to the for-head, // not to an in-LED inside the target — without it `for (key in obj)` swallowed the // `in`, the arm failed, and the statement fell back to a CALL parse `for(...)`. - [exclude('in', Expr), alt('in', 'of'), Expr], + [exclude('in', Expr), 'in', Expr, many(',', Expr)], + [exclude('in', Expr), 'of', Expr], ]; }); diff --git a/tree-sitter/javascript/grammar.js b/tree-sitter/javascript/grammar.js index e03c6d0..25908da 100644 --- a/tree-sitter/javascript/grammar.js +++ b/tree-sitter/javascript/grammar.js @@ -153,7 +153,7 @@ module.exports = grammar({ param: $ => seq(optional($.decorator_expr), choice(seq($.ident, optional(seq("=", $.expr))), seq($.binding_pattern, optional(seq("=", $.expr))), seq("...", choice($.ident, $.binding_pattern), optional(seq("=", $.expr))))), - for_head: $ => choice(seq(choice("let", "const", "var", "using", seq("await", "using")), optional(seq($.for_binding, repeat(seq(",", $.for_binding)), optional(","))), choice(seq(";", optional(seq($.expr, repeat(seq(",", $.expr)))), ";", optional(seq($.expr, repeat(seq(",", $.expr))))), seq(choice("in", "of"), $.expr))), seq(optional(seq($.expr, repeat(seq(",", $.expr)))), ";", optional(seq($.expr, repeat(seq(",", $.expr)))), ";", optional(seq($.expr, repeat(seq(",", $.expr))))), seq($.expr, choice("in", "of"), $.expr)), + for_head: $ => choice(seq(choice("let", "const", "var", "using", seq("await", "using")), optional(seq($.for_binding, repeat(seq(",", $.for_binding)), optional(","))), choice(seq(";", optional(seq($.expr, repeat(seq(",", $.expr)))), ";", optional(seq($.expr, repeat(seq(",", $.expr))))), seq("in", $.expr, repeat(seq(",", $.expr))), seq("of", $.expr))), seq(optional(seq($.expr, repeat(seq(",", $.expr)))), ";", optional(seq($.expr, repeat(seq(",", $.expr)))), ";", optional(seq($.expr, repeat(seq(",", $.expr))))), seq($.expr, "in", $.expr, repeat(seq(",", $.expr))), seq($.expr, "of", $.expr)), switch_case: $ => choice(seq("case", $.expr, repeat(seq(",", $.expr)), ":"), seq("default", ":"), $.stmt), diff --git a/tree-sitter/javascriptreact/grammar.js b/tree-sitter/javascriptreact/grammar.js index 52da7cc..f904503 100644 --- a/tree-sitter/javascriptreact/grammar.js +++ b/tree-sitter/javascriptreact/grammar.js @@ -155,7 +155,7 @@ module.exports = grammar({ param: $ => seq(optional($.decorator_expr), choice(seq($.ident, optional(seq("=", $.expr))), seq($.binding_pattern, optional(seq("=", $.expr))), seq("...", choice($.ident, $.binding_pattern), optional(seq("=", $.expr))))), - for_head: $ => choice(seq(choice("let", "const", "var", "using", seq("await", "using")), optional(seq($.for_binding, repeat(seq(",", $.for_binding)), optional(","))), choice(seq(";", optional(seq($.expr, repeat(seq(",", $.expr)))), ";", optional(seq($.expr, repeat(seq(",", $.expr))))), seq(choice("in", "of"), $.expr))), seq(optional(seq($.expr, repeat(seq(",", $.expr)))), ";", optional(seq($.expr, repeat(seq(",", $.expr)))), ";", optional(seq($.expr, repeat(seq(",", $.expr))))), seq($.expr, choice("in", "of"), $.expr)), + for_head: $ => choice(seq(choice("let", "const", "var", "using", seq("await", "using")), optional(seq($.for_binding, repeat(seq(",", $.for_binding)), optional(","))), choice(seq(";", optional(seq($.expr, repeat(seq(",", $.expr)))), ";", optional(seq($.expr, repeat(seq(",", $.expr))))), seq("in", $.expr, repeat(seq(",", $.expr))), seq("of", $.expr))), seq(optional(seq($.expr, repeat(seq(",", $.expr)))), ";", optional(seq($.expr, repeat(seq(",", $.expr)))), ";", optional(seq($.expr, repeat(seq(",", $.expr))))), seq($.expr, "in", $.expr, repeat(seq(",", $.expr))), seq($.expr, "of", $.expr)), switch_case: $ => choice(seq("case", $.expr, repeat(seq(",", $.expr)), ":"), seq("default", ":"), $.stmt), diff --git a/tree-sitter/typescript/grammar.js b/tree-sitter/typescript/grammar.js index ee16223..ce31307 100644 --- a/tree-sitter/typescript/grammar.js +++ b/tree-sitter/typescript/grammar.js @@ -208,7 +208,7 @@ module.exports = grammar({ param: $ => choice(seq("this", ":", $.type), seq(optional($.decorator_expr), repeat1(choice("public", "private", "protected", "readonly", "override", "static", "abstract", "accessor", "async", "export", "declare", "in", "out")), choice(seq($.ident, optional("?"), optional(seq(":", $.type)), optional(seq("=", $.expr))), seq($.binding_pattern, optional("?"), optional(seq(":", $.type)), optional(seq("=", $.expr))), seq("...", choice($.ident, $.binding_pattern), optional("?"), optional(seq(":", $.type)), optional(seq("=", $.expr))))), seq(optional($.decorator_expr), choice(seq($.ident, optional("?"), optional(seq(":", $.type)), optional(seq("=", $.expr))), seq($.binding_pattern, optional("?"), optional(seq(":", $.type)), optional(seq("=", $.expr))), seq("...", choice($.ident, $.binding_pattern), optional("?"), optional(seq(":", $.type)), optional(seq("=", $.expr)))))), - for_head: $ => choice(seq(choice("let", "const", "var", "using", seq("await", "using")), optional(seq($.for_binding, repeat(seq(",", $.for_binding)), optional(","))), choice(seq(";", optional(seq($.expr, repeat(seq(",", $.expr)))), ";", optional(seq($.expr, repeat(seq(",", $.expr))))), seq(choice("in", "of"), $.expr))), seq(optional(seq($.expr, repeat(seq(",", $.expr)))), ";", optional(seq($.expr, repeat(seq(",", $.expr)))), ";", optional(seq($.expr, repeat(seq(",", $.expr))))), seq($.expr, choice("in", "of"), $.expr)), + for_head: $ => choice(seq(choice("let", "const", "var", "using", seq("await", "using")), optional(seq($.for_binding, repeat(seq(",", $.for_binding)), optional(","))), choice(seq(";", optional(seq($.expr, repeat(seq(",", $.expr)))), ";", optional(seq($.expr, repeat(seq(",", $.expr))))), seq("in", $.expr, repeat(seq(",", $.expr))), seq("of", $.expr))), seq(optional(seq($.expr, repeat(seq(",", $.expr)))), ";", optional(seq($.expr, repeat(seq(",", $.expr)))), ";", optional(seq($.expr, repeat(seq(",", $.expr))))), seq($.expr, "in", $.expr, repeat(seq(",", $.expr))), seq($.expr, "of", $.expr)), switch_case: $ => choice(seq("case", $.expr, repeat(seq(",", $.expr)), ":"), seq("default", ":"), $.stmt), diff --git a/tree-sitter/typescriptreact/grammar.js b/tree-sitter/typescriptreact/grammar.js index f0d68db..0272e8b 100644 --- a/tree-sitter/typescriptreact/grammar.js +++ b/tree-sitter/typescriptreact/grammar.js @@ -210,7 +210,7 @@ module.exports = grammar({ param: $ => choice(seq("this", ":", $.type), seq(optional($.decorator_expr), repeat1(choice("public", "private", "protected", "readonly", "override", "static", "abstract", "accessor", "async", "export", "declare", "in", "out")), choice(seq($.ident, optional("?"), optional(seq(":", $.type)), optional(seq("=", $.expr))), seq($.binding_pattern, optional("?"), optional(seq(":", $.type)), optional(seq("=", $.expr))), seq("...", choice($.ident, $.binding_pattern), optional("?"), optional(seq(":", $.type)), optional(seq("=", $.expr))))), seq(optional($.decorator_expr), choice(seq($.ident, optional("?"), optional(seq(":", $.type)), optional(seq("=", $.expr))), seq($.binding_pattern, optional("?"), optional(seq(":", $.type)), optional(seq("=", $.expr))), seq("...", choice($.ident, $.binding_pattern), optional("?"), optional(seq(":", $.type)), optional(seq("=", $.expr)))))), - for_head: $ => choice(seq(choice("let", "const", "var", "using", seq("await", "using")), optional(seq($.for_binding, repeat(seq(",", $.for_binding)), optional(","))), choice(seq(";", optional(seq($.expr, repeat(seq(",", $.expr)))), ";", optional(seq($.expr, repeat(seq(",", $.expr))))), seq(choice("in", "of"), $.expr))), seq(optional(seq($.expr, repeat(seq(",", $.expr)))), ";", optional(seq($.expr, repeat(seq(",", $.expr)))), ";", optional(seq($.expr, repeat(seq(",", $.expr))))), seq($.expr, choice("in", "of"), $.expr)), + for_head: $ => choice(seq(choice("let", "const", "var", "using", seq("await", "using")), optional(seq($.for_binding, repeat(seq(",", $.for_binding)), optional(","))), choice(seq(";", optional(seq($.expr, repeat(seq(",", $.expr)))), ";", optional(seq($.expr, repeat(seq(",", $.expr))))), seq("in", $.expr, repeat(seq(",", $.expr))), seq("of", $.expr))), seq(optional(seq($.expr, repeat(seq(",", $.expr)))), ";", optional(seq($.expr, repeat(seq(",", $.expr)))), ";", optional(seq($.expr, repeat(seq(",", $.expr))))), seq($.expr, "in", $.expr, repeat(seq(",", $.expr))), seq($.expr, "of", $.expr)), switch_case: $ => choice(seq("case", $.expr, repeat(seq(",", $.expr)), ":"), seq("default", ":"), $.stmt), diff --git a/typescript.ts b/typescript.ts index 105c79b..e8ce25f 100644 --- a/typescript.ts +++ b/typescript.ts @@ -355,14 +355,18 @@ const ForHead = rule($ => { // ForBinding gives a no-`in` initializer so `for (var a = 1 in xs)` parses. [alt('let', 'const', 'var', 'using', ['await', 'using']), sep(ForBinding, ','), alt( cTail, - [alt('in', 'of'), Expr], + // the for-in OBJECT is a full Expression (comma included: `for (a in b, c)`); + // for-of takes an AssignmentExpression - no comma (tsc rejects `for (x of a, b)`) + ['in', Expr, many(',', Expr)], + ['of', Expr], )], [opt(Expr, many(',', Expr)), ...cTail], // C-style, no declaration: `for (i=0; …; …)` / `for (;;)` // for-in/of, no declaration: `for (x of xs)`. The target Expr parses in a no-`in` // context (same exclude as binding initializers): the `in` belongs to the for-head, // not to an in-LED inside the target — without it `for (key in obj)` swallowed the // `in`, the arm failed, and the statement fell back to a CALL parse `for(...)`. - [exclude('in', Expr), alt('in', 'of'), Expr], + [exclude('in', Expr), 'in', Expr, many(',', Expr)], + [exclude('in', Expr), 'of', Expr], ]; }); From f8a574273a9aba95c3113d37ca49538e9b118c45 Mon Sep 17 00:00:00 2001 From: Johnson Chu Date: Fri, 12 Jun 2026 04:22:27 +0800 Subject: [PATCH 20/23] Roadmap: enumerate the parser-acceptance long tail vs tsc The 108 remaining accept-divergences split into the [Await]/[Yield] context class (31 files - needs exclude()-style identifier-text context threading in the engine) and 77 per-shape strictness items, each named with its fix recipe (fix + flip-scan FN=0 proof). --- ROADMAP.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/ROADMAP.md b/ROADMAP.md index c8f8673..80f0664 100644 --- a/ROADMAP.md +++ b/ROADMAP.md @@ -26,6 +26,9 @@ Three parser-grounded layers (in `test/`), each comparing against the language's ## What's next +- **Parser-acceptance long tail vs tsc** (measured by `test/recovery-conformance.ts`: recall 61.2%, 108 conformance files we parse-accept that tsc's parser rejects). The remainder is fully enumerated, two buckets: + - **`[Await]`/`[Yield]` parameter contexts** (31 files): `await`/`yield` must be reserved *inside* async/generator bodies and parameter lists, identifiers elsewhere. Needs a context-threading mechanism in the engine — the same shape as `exclude('in', …)` for the no-`in` context, but suppressing identifier *texts* over a subtree. Designed direction, not yet built. + - **Per-shape strictness** (77 files, each class small and named): declaration-modifier ordering (`public @dec method`), private names outside classes (`const #foo`), strict-mode octal literals (`001`), member declarations with `var` (`class C { var x }`), paren-less `new` arguments (`new C0 32`), reserved words in dotted namespace tails, template-literal module names, `extends void`, `super` tagged templates. Each wants the same treatment that landed for `case`/`class`/statement keywords: fix, then prove FN=0 with the accept/reject flip-scan against the corpus. - **More vscode#203212 bundles** — low-effort first (ini, diff, git config, xml); the large ones (ruby, perl, c/c++, groovy) each need an instrumentable official parser (WASM / native-coverage) + a corpus. - **Field labels** in the grammar DSL → richer named-field AST types. - **Highlighter long tail** — the few remaining per-language divergences are documented (in the PR) as either the shared TextMate-vs-parser ceiling or proven architectural floors; where a construct provably exceeds the TextMate model, the derived **tree-sitter** target (a real whole-tree parser) resolves it. From d37332b2435e75af57c42de7f26c1517273cc22c Mon Sep 17 00:00:00 2001 From: Johnson Chu Date: Fri, 12 Jun 2026 04:39:05 +0800 Subject: [PATCH 21/23] Decorators prefix class members; orphan and post-modifier decorators reject ClassMember modeled decorators as a STANDALONE sibling alternative, which tolerated an orphan '@dec' with no member and (together with the modifier-named-field fallback) any decorator/modifier interleaving. Decorators are now a prefix of the member shape ([many(DecoratorExpr), many(Modifier), ...]) in both grammars, with the static-block arm taking the same prefix ('@dec static {}' is parse-clean for tsc - the decorator there is a semantic error only). Cumulative flip-scan with per-flip tsc adjudication: 7 toward tsc, 0 away (the first attempt rejected the decorated static block - tsc accepts it - and the scan caught it). The 'public @dec method()' sub-case still parses through the modifier-named-field fallback; matching tsc's greedy modifier commitment there needs the fallback's bare-name arm split, recorded in the ROADMAP item. Gates 34/34, corpus parity 401/401, tree-sitter generate clean on all 4 affected grammars, gate:treesitter green. --- javascript.ts | 5 +++-- tree-sitter/javascript/grammar.js | 2 +- tree-sitter/javascriptreact/grammar.js | 2 +- tree-sitter/typescript/grammar.js | 2 +- tree-sitter/typescriptreact/grammar.js | 2 +- typescript.ts | 7 +++++-- 6 files changed, 12 insertions(+), 8 deletions(-) diff --git a/javascript.ts b/javascript.ts index 07ac539..cc058da 100644 --- a/javascript.ts +++ b/javascript.ts @@ -465,10 +465,11 @@ const Modifier = alt('static', 'accessor', 'async'); const callTail = ['(', sep(Param, ','), ')', opt(Block), opt(';')] as const; const ClassMember = rule($ => [ ';', // SemicolonClassElement: `class C { ; }` - DecoratorExpr, ['constructor', '(', sep(Param, ','), ')', Block, opt(';')], - ['static', Block], + [many(DecoratorExpr), 'static', Block], // decorated static block parses (decorators on it are a SEMANTIC error) + // decorators PREFIX a member, before any modifier (see typescript.ts) [ + many(DecoratorExpr), many(Modifier), alt( ['*', MemberName, ...callTail], // generator method diff --git a/tree-sitter/javascript/grammar.js b/tree-sitter/javascript/grammar.js index 25908da..716589a 100644 --- a/tree-sitter/javascript/grammar.js +++ b/tree-sitter/javascript/grammar.js @@ -159,7 +159,7 @@ module.exports = grammar({ decl: $ => choice(seq(optional("async"), "function", optional("*"), field('name', $.ident), "(", optional(seq($.param, repeat(seq(",", $.param)), optional(","))), ")", $.block), seq(repeat($.decorator_expr), "class", field('name', $.ident), repeat(seq("extends", optional(seq(choice($.class_heritage), repeat(seq(",", choice($.class_heritage))), optional(","))))), "{", repeat($.class_member), "}"), seq("export", choice($.decl, $.stmt)), seq(repeat1($.decorator_expr), $.decl), seq("export", "default", choice(seq(optional("async"), "function", optional("*"), optional(field('name', $.ident)), "(", optional(seq($.param, repeat(seq(",", $.param)), optional(","))), ")", $.block), seq($.expr, optional(";")))), seq("export", "*", choice(seq("from", $.string, optional(";")), seq("as", $.ident, "from", $.string, optional(";")))), seq("export", "{", optional(seq($.export_specifier, repeat(seq(",", $.export_specifier)), optional(","))), "}", optional(seq("from", $.string)), optional(";")), seq("import", choice(seq($.import_clause, "from", $.string, optional(";")), seq($.ident, "=", $.expr, optional(";")), seq($.string, optional(";")))), seq(repeat($.decorator_expr), "export", choice($.decl, $.stmt))), - class_member: $ => choice(";", $.decorator_expr, seq("constructor", "(", optional(seq($.param, repeat(seq(",", $.param)), optional(","))), ")", $.block, optional(";")), seq("static", $.block), seq(repeat(choice("static", "accessor", "async")), choice(seq("*", $.member_name, "(", optional(seq($.param, repeat(seq(",", $.param)), optional(","))), ")", optional($.block), optional(";")), seq(choice("get", "set"), $.member_name, "(", optional(optional(seq($.param, repeat(seq(",", $.param)), optional(",")))), ")", optional($.block), optional(";")), seq($.member_name, choice(seq("(", optional(seq($.param, repeat(seq(",", $.param)), optional(","))), ")", optional($.block), optional(";")), seq(optional(seq("=", $.expr)), optional(";")))))), seq($.member_name, optional(seq("=", $.expr)), optional(";")), seq($.member_name, "(", optional(seq($.param, repeat(seq(",", $.param)), optional(","))), ")", optional($.block), optional(";"))), + class_member: $ => choice(";", seq("constructor", "(", optional(seq($.param, repeat(seq(",", $.param)), optional(","))), ")", $.block, optional(";")), seq(repeat($.decorator_expr), "static", $.block), seq(repeat($.decorator_expr), repeat(choice("static", "accessor", "async")), choice(seq("*", $.member_name, "(", optional(seq($.param, repeat(seq(",", $.param)), optional(","))), ")", optional($.block), optional(";")), seq(choice("get", "set"), $.member_name, "(", optional(optional(seq($.param, repeat(seq(",", $.param)), optional(",")))), ")", optional($.block), optional(";")), seq($.member_name, choice(seq("(", optional(seq($.param, repeat(seq(",", $.param)), optional(","))), ")", optional($.block), optional(";")), seq(optional(seq("=", $.expr)), optional(";")))))), seq($.member_name, optional(seq("=", $.expr)), optional(";")), seq($.member_name, "(", optional(seq($.param, repeat(seq(",", $.param)), optional(","))), ")", optional($.block), optional(";"))), import_clause: $ => choice(seq($.ident, optional(seq(",", choice(seq("{", optional(seq($.import_specifier, repeat(seq(",", $.import_specifier)), optional(","))), "}"), seq("*", "as", $.ident))))), seq("{", optional(seq($.import_specifier, repeat(seq(",", $.import_specifier)), optional(","))), "}"), seq("*", "as", $.ident)), diff --git a/tree-sitter/javascriptreact/grammar.js b/tree-sitter/javascriptreact/grammar.js index f904503..e80cabe 100644 --- a/tree-sitter/javascriptreact/grammar.js +++ b/tree-sitter/javascriptreact/grammar.js @@ -161,7 +161,7 @@ module.exports = grammar({ decl: $ => choice(seq(optional("async"), "function", optional("*"), field('name', $.ident), "(", optional(seq($.param, repeat(seq(",", $.param)), optional(","))), ")", $.block), seq(repeat($.decorator_expr), "class", field('name', $.ident), repeat(seq("extends", optional(seq(choice($.class_heritage), repeat(seq(",", choice($.class_heritage))), optional(","))))), "{", repeat($.class_member), "}"), seq("export", choice($.decl, $.stmt)), seq(repeat1($.decorator_expr), $.decl), seq("export", "default", choice(seq(optional("async"), "function", optional("*"), optional(field('name', $.ident)), "(", optional(seq($.param, repeat(seq(",", $.param)), optional(","))), ")", $.block), seq($.expr, optional(";")))), seq("export", "*", choice(seq("from", $.string, optional(";")), seq("as", $.ident, "from", $.string, optional(";")))), seq("export", "{", optional(seq($.export_specifier, repeat(seq(",", $.export_specifier)), optional(","))), "}", optional(seq("from", $.string)), optional(";")), seq("import", choice(seq($.import_clause, "from", $.string, optional(";")), seq($.ident, "=", $.expr, optional(";")), seq($.string, optional(";")))), seq(repeat($.decorator_expr), "export", choice($.decl, $.stmt))), - class_member: $ => choice(";", $.decorator_expr, seq("constructor", "(", optional(seq($.param, repeat(seq(",", $.param)), optional(","))), ")", $.block, optional(";")), seq("static", $.block), seq(repeat(choice("static", "accessor", "async")), choice(seq("*", $.member_name, "(", optional(seq($.param, repeat(seq(",", $.param)), optional(","))), ")", optional($.block), optional(";")), seq(choice("get", "set"), $.member_name, "(", optional(optional(seq($.param, repeat(seq(",", $.param)), optional(",")))), ")", optional($.block), optional(";")), seq($.member_name, choice(seq("(", optional(seq($.param, repeat(seq(",", $.param)), optional(","))), ")", optional($.block), optional(";")), seq(optional(seq("=", $.expr)), optional(";")))))), seq($.member_name, optional(seq("=", $.expr)), optional(";")), seq($.member_name, "(", optional(seq($.param, repeat(seq(",", $.param)), optional(","))), ")", optional($.block), optional(";"))), + class_member: $ => choice(";", seq("constructor", "(", optional(seq($.param, repeat(seq(",", $.param)), optional(","))), ")", $.block, optional(";")), seq(repeat($.decorator_expr), "static", $.block), seq(repeat($.decorator_expr), repeat(choice("static", "accessor", "async")), choice(seq("*", $.member_name, "(", optional(seq($.param, repeat(seq(",", $.param)), optional(","))), ")", optional($.block), optional(";")), seq(choice("get", "set"), $.member_name, "(", optional(optional(seq($.param, repeat(seq(",", $.param)), optional(",")))), ")", optional($.block), optional(";")), seq($.member_name, choice(seq("(", optional(seq($.param, repeat(seq(",", $.param)), optional(","))), ")", optional($.block), optional(";")), seq(optional(seq("=", $.expr)), optional(";")))))), seq($.member_name, optional(seq("=", $.expr)), optional(";")), seq($.member_name, "(", optional(seq($.param, repeat(seq(",", $.param)), optional(","))), ")", optional($.block), optional(";"))), import_clause: $ => choice(seq($.ident, optional(seq(",", choice(seq("{", optional(seq($.import_specifier, repeat(seq(",", $.import_specifier)), optional(","))), "}"), seq("*", "as", $.ident))))), seq("{", optional(seq($.import_specifier, repeat(seq(",", $.import_specifier)), optional(","))), "}"), seq("*", "as", $.ident)), diff --git a/tree-sitter/typescript/grammar.js b/tree-sitter/typescript/grammar.js index ce31307..4263f3a 100644 --- a/tree-sitter/typescript/grammar.js +++ b/tree-sitter/typescript/grammar.js @@ -220,7 +220,7 @@ module.exports = grammar({ interface_member: $ => choice(seq(optional("new"), optional($.type_params), "(", optional(seq($.param, repeat(seq(",", $.param)), optional(","))), ")", optional(seq(":", $.type))), seq(choice("get", "set"), $.member_name, "(", optional(seq($.param, repeat(seq(",", $.param)), optional(","))), ")", optional(seq(":", $.type))), seq(optional("static"), optional(choice("+", "-")), optional("readonly"), "[", $.ident, "in", $.type, optional(seq("as", $.type)), "]", optional(choice("+", "-")), optional("?"), ":", $.type), seq("readonly", $.member_name, optional("?"), ":", $.type), seq($.member_name, optional("?"), choice(seq(optional($.type_params), "(", optional(seq($.param, repeat(seq(",", $.param)), optional(","))), ")", optional(seq(":", $.type))), optional(seq(":", $.type)))), seq(optional("static"), optional("readonly"), "[", optional(seq($.param, repeat(seq(",", $.param)), optional(","))), "]", optional(seq(":", $.type)))), - class_member: $ => choice(";", $.decorator_expr, seq("constructor", "(", optional(seq($.param, repeat(seq(",", $.param)), optional(","))), ")", $.block, optional(";")), seq("static", $.block), seq(repeat(choice("public", "private", "protected", "static", "abstract", "readonly", "override", "accessor", "async")), choice(seq("*", $.member_name, optional("?"), optional($.type_params), "(", optional(seq($.param, repeat(seq(",", $.param)), optional(","))), ")", optional(seq(":", $.type)), optional($.block), optional(";")), seq(choice("get", "set"), $.member_name, "(", optional(optional(seq($.param, repeat(seq(",", $.param)), optional(",")))), ")", optional(seq(":", $.type)), optional($.block), optional(";")), seq("[", $.ident, ":", $.type, "]", ":", $.type, optional(";")), seq($.member_name, choice(seq(optional("?"), optional($.type_params), "(", optional(seq($.param, repeat(seq(",", $.param)), optional(","))), ")", optional(seq(":", $.type)), optional($.block), optional(";")), seq(optional("!"), optional("?"), optional(seq(":", $.type)), optional(seq("=", $.expr)), optional(";")))))), seq($.member_name, optional("!"), optional("?"), optional(seq(":", $.type)), optional(seq("=", $.expr)), optional(";")), seq($.member_name, optional("?"), optional($.type_params), "(", optional(seq($.param, repeat(seq(",", $.param)), optional(","))), ")", optional(seq(":", $.type)), optional($.block), optional(";"))), + class_member: $ => choice(";", seq("constructor", "(", optional(seq($.param, repeat(seq(",", $.param)), optional(","))), ")", $.block, optional(";")), seq(repeat($.decorator_expr), "static", $.block), seq(repeat($.decorator_expr), repeat(choice("public", "private", "protected", "static", "abstract", "readonly", "override", "accessor", "async")), choice(seq("*", $.member_name, optional("?"), optional($.type_params), "(", optional(seq($.param, repeat(seq(",", $.param)), optional(","))), ")", optional(seq(":", $.type)), optional($.block), optional(";")), seq(choice("get", "set"), $.member_name, "(", optional(optional(seq($.param, repeat(seq(",", $.param)), optional(",")))), ")", optional(seq(":", $.type)), optional($.block), optional(";")), seq("[", $.ident, ":", $.type, "]", ":", $.type, optional(";")), seq($.member_name, choice(seq(optional("?"), optional($.type_params), "(", optional(seq($.param, repeat(seq(",", $.param)), optional(","))), ")", optional(seq(":", $.type)), optional($.block), optional(";")), seq(optional("!"), optional("?"), optional(seq(":", $.type)), optional(seq("=", $.expr)), optional(";")))))), seq($.member_name, optional("!"), optional("?"), optional(seq(":", $.type)), optional(seq("=", $.expr)), optional(";")), seq($.member_name, optional("?"), optional($.type_params), "(", optional(seq($.param, repeat(seq(",", $.param)), optional(","))), ")", optional(seq(":", $.type)), optional($.block), optional(";"))), enum_member: $ => seq($.member_name, optional(seq("=", $.expr))), diff --git a/tree-sitter/typescriptreact/grammar.js b/tree-sitter/typescriptreact/grammar.js index 0272e8b..4285b06 100644 --- a/tree-sitter/typescriptreact/grammar.js +++ b/tree-sitter/typescriptreact/grammar.js @@ -222,7 +222,7 @@ module.exports = grammar({ interface_member: $ => choice(seq(optional("new"), optional($.type_params), "(", optional(seq($.param, repeat(seq(",", $.param)), optional(","))), ")", optional(seq(":", $.type))), seq(choice("get", "set"), $.member_name, "(", optional(seq($.param, repeat(seq(",", $.param)), optional(","))), ")", optional(seq(":", $.type))), seq(optional("static"), optional(choice("+", "-")), optional("readonly"), "[", $.ident, "in", $.type, optional(seq("as", $.type)), "]", optional(choice("+", "-")), optional("?"), ":", $.type), seq("readonly", $.member_name, optional("?"), ":", $.type), seq($.member_name, optional("?"), choice(seq(optional($.type_params), "(", optional(seq($.param, repeat(seq(",", $.param)), optional(","))), ")", optional(seq(":", $.type))), optional(seq(":", $.type)))), seq(optional("static"), optional("readonly"), "[", optional(seq($.param, repeat(seq(",", $.param)), optional(","))), "]", optional(seq(":", $.type)))), - class_member: $ => choice(";", $.decorator_expr, seq("constructor", "(", optional(seq($.param, repeat(seq(",", $.param)), optional(","))), ")", $.block, optional(";")), seq("static", $.block), seq(repeat(choice("public", "private", "protected", "static", "abstract", "readonly", "override", "accessor", "async")), choice(seq("*", $.member_name, optional("?"), optional($.type_params), "(", optional(seq($.param, repeat(seq(",", $.param)), optional(","))), ")", optional(seq(":", $.type)), optional($.block), optional(";")), seq(choice("get", "set"), $.member_name, "(", optional(optional(seq($.param, repeat(seq(",", $.param)), optional(",")))), ")", optional(seq(":", $.type)), optional($.block), optional(";")), seq("[", $.ident, ":", $.type, "]", ":", $.type, optional(";")), seq($.member_name, choice(seq(optional("?"), optional($.type_params), "(", optional(seq($.param, repeat(seq(",", $.param)), optional(","))), ")", optional(seq(":", $.type)), optional($.block), optional(";")), seq(optional("!"), optional("?"), optional(seq(":", $.type)), optional(seq("=", $.expr)), optional(";")))))), seq($.member_name, optional("!"), optional("?"), optional(seq(":", $.type)), optional(seq("=", $.expr)), optional(";")), seq($.member_name, optional("?"), optional($.type_params), "(", optional(seq($.param, repeat(seq(",", $.param)), optional(","))), ")", optional(seq(":", $.type)), optional($.block), optional(";"))), + class_member: $ => choice(";", seq("constructor", "(", optional(seq($.param, repeat(seq(",", $.param)), optional(","))), ")", $.block, optional(";")), seq(repeat($.decorator_expr), "static", $.block), seq(repeat($.decorator_expr), repeat(choice("public", "private", "protected", "static", "abstract", "readonly", "override", "accessor", "async")), choice(seq("*", $.member_name, optional("?"), optional($.type_params), "(", optional(seq($.param, repeat(seq(",", $.param)), optional(","))), ")", optional(seq(":", $.type)), optional($.block), optional(";")), seq(choice("get", "set"), $.member_name, "(", optional(optional(seq($.param, repeat(seq(",", $.param)), optional(",")))), ")", optional(seq(":", $.type)), optional($.block), optional(";")), seq("[", $.ident, ":", $.type, "]", ":", $.type, optional(";")), seq($.member_name, choice(seq(optional("?"), optional($.type_params), "(", optional(seq($.param, repeat(seq(",", $.param)), optional(","))), ")", optional(seq(":", $.type)), optional($.block), optional(";")), seq(optional("!"), optional("?"), optional(seq(":", $.type)), optional(seq("=", $.expr)), optional(";")))))), seq($.member_name, optional("!"), optional("?"), optional(seq(":", $.type)), optional(seq("=", $.expr)), optional(";")), seq($.member_name, optional("?"), optional($.type_params), "(", optional(seq($.param, repeat(seq(",", $.param)), optional(","))), ")", optional(seq(":", $.type)), optional($.block), optional(";"))), enum_member: $ => seq($.member_name, optional(seq("=", $.expr))), diff --git a/typescript.ts b/typescript.ts index e8ce25f..e038134 100644 --- a/typescript.ts +++ b/typescript.ts @@ -478,10 +478,13 @@ const Modifier = alt('public', 'private', 'protected', 'static', 'abstract', 're const callTail = ['(', sep(Param, ','), ')', opt(':', Type), opt(Block), opt(';')] as const; const ClassMember = rule($ => [ ';', // tsc's SemicolonClassElement: `class C { ; }` is parse-clean - DecoratorExpr, ['constructor', '(', sep(Param, ','), ')', Block, opt(';')], - ['static', Block], + [many(DecoratorExpr), 'static', Block], // decorated static block parses (decorators on it are a SEMANTIC error) + // decorators PREFIX a member, before any modifier — tsc parse-rejects + // `public @dec method()` ("Decorators are not valid here") and an orphan + // `@dec` with no member, which a standalone sibling alternative tolerated [ + many(DecoratorExpr), many(Modifier), alt( ['*', MemberName, opt('?'), opt(TypeParams), ...callTail], // generator method From d77b803b6af3ff06292d85cabf12453d5b995001 Mon Sep 17 00:00:00 2001 From: Johnson Chu Date: Fri, 12 Jun 2026 04:56:48 +0800 Subject: [PATCH 22/23] A ';'-less class field rejects a same-line decorator after it tsc's measured rule: '@' directly after a property on the SAME LINE binds to that property ('Decorators must precede the name and all keywords of property declarations') - 'x @dec y()' and 'x = 1 @dec y()' parse-reject, while 'x; @dec y()' and a newline before '@' accept. Encoded exactly: the field tails' no-';' ending carries not([sameLine, Decorator]) in both grammars (alt([';'], [not([sameLine, Decorator])])). This also closes the 'public @dec method()' shape: the bare 'public' field reading now refuses the same-line decorator, and the modifier reading correctly fails. not() now accepts an array as a seq, like everywhere else in the rule DSL (the NotNode conversion previously threw on arrays). Cumulative flip-scan with per-flip tsc adjudication: 12 toward tsc, 0 away. Gates 34/34, corpus parity 401/401, tree-sitter generate clean x4, gate:treesitter green. --- javascript.ts | 6 ++++-- src/api.ts | 17 +++++++++++------ tree-sitter/javascript/grammar.js | 2 +- tree-sitter/javascriptreact/grammar.js | 2 +- tree-sitter/typescript/grammar.js | 2 +- tree-sitter/typescriptreact/grammar.js | 2 +- typescript.ts | 8 ++++++-- 7 files changed, 25 insertions(+), 14 deletions(-) diff --git a/javascript.ts b/javascript.ts index cc058da..ce1a71f 100644 --- a/javascript.ts +++ b/javascript.ts @@ -476,13 +476,15 @@ const ClassMember = rule($ => [ [alt('get', 'set'), MemberName, '(', opt(sep(Param, ',')), ')', opt(Block), opt(';')], // accessor [MemberName, alt( [...callTail], // method (requires `(`) - [opt('=', Expr), opt(';')], // field (all-optional → catch-all) + // field catch-all; a ';'-less field must not be followed by a same-line + // decorator (see typescript.ts) + [opt('=', Expr), alt([';'], [not([sameLine, Decorator])])], )], ), ], // Fallbacks for a member NAMED like a modifier (`static = 1`, `get = 1`, `async() {}`): // many(Modifier) would eat the name, so the member kind alt fails and we land here. - [MemberName, opt('=', Expr), opt(';')], + [MemberName, opt('=', Expr), alt([';'], [not([sameLine, Decorator])])], [MemberName, '(', sep(Param, ','), ')', opt(Block), opt(';')], ]); diff --git a/src/api.ts b/src/api.ts index b2ab873..2109ced 100644 --- a/src/api.ts +++ b/src/api.ts @@ -187,10 +187,11 @@ class ExcludeNode { } class NotNode { readonly __kind = 'not' as const; - // Zero-width negative lookahead over a single element (wrap a sequence in a - // group/alt if needed). Matches nothing; succeeds only when `item` can't match. - readonly item: Element; - constructor(item: Element) { this.item = item; } + // Zero-width negative lookahead over an element, or an array (a seq, like + // everywhere else in the rule DSL). Matches nothing; succeeds only when + // `item` can't match. + readonly item: Element | Element[]; + constructor(item: Element | Element[]) { this.item = item; } } type Combinator = SepNode | OptNode | ManyNode | Many1Node | AltNode | ExcludeNode | NotNode; @@ -224,7 +225,7 @@ export function exclude(connectors: string | string[], ...items: Element[]): Exc // Zero-width negative lookahead: `not(x)` matches nothing and succeeds only when // `x` would NOT match here. -export function not(item: Element): NotNode { +export function not(item: Element | Element[]): NotNode { return new NotNode(item); } @@ -326,7 +327,11 @@ function toRuleExpr(el: Element, names: Map): RuleExpr { }; } if (el instanceof NotNode) { - return { type: 'not', body: toRuleExpr(el.item, names) }; + // an array is a seq here like everywhere else in the rule DSL + const body = Array.isArray(el.item) + ? { type: 'seq' as const, items: el.item.map(i => toRuleExpr(i, names)) } + : toRuleExpr(el.item, names); + return { type: 'not', body }; } const marker = el as Marker; if (marker.__kind === 'op') return { type: 'op' }; diff --git a/tree-sitter/javascript/grammar.js b/tree-sitter/javascript/grammar.js index 716589a..afd8ec5 100644 --- a/tree-sitter/javascript/grammar.js +++ b/tree-sitter/javascript/grammar.js @@ -159,7 +159,7 @@ module.exports = grammar({ decl: $ => choice(seq(optional("async"), "function", optional("*"), field('name', $.ident), "(", optional(seq($.param, repeat(seq(",", $.param)), optional(","))), ")", $.block), seq(repeat($.decorator_expr), "class", field('name', $.ident), repeat(seq("extends", optional(seq(choice($.class_heritage), repeat(seq(",", choice($.class_heritage))), optional(","))))), "{", repeat($.class_member), "}"), seq("export", choice($.decl, $.stmt)), seq(repeat1($.decorator_expr), $.decl), seq("export", "default", choice(seq(optional("async"), "function", optional("*"), optional(field('name', $.ident)), "(", optional(seq($.param, repeat(seq(",", $.param)), optional(","))), ")", $.block), seq($.expr, optional(";")))), seq("export", "*", choice(seq("from", $.string, optional(";")), seq("as", $.ident, "from", $.string, optional(";")))), seq("export", "{", optional(seq($.export_specifier, repeat(seq(",", $.export_specifier)), optional(","))), "}", optional(seq("from", $.string)), optional(";")), seq("import", choice(seq($.import_clause, "from", $.string, optional(";")), seq($.ident, "=", $.expr, optional(";")), seq($.string, optional(";")))), seq(repeat($.decorator_expr), "export", choice($.decl, $.stmt))), - class_member: $ => choice(";", seq("constructor", "(", optional(seq($.param, repeat(seq(",", $.param)), optional(","))), ")", $.block, optional(";")), seq(repeat($.decorator_expr), "static", $.block), seq(repeat($.decorator_expr), repeat(choice("static", "accessor", "async")), choice(seq("*", $.member_name, "(", optional(seq($.param, repeat(seq(",", $.param)), optional(","))), ")", optional($.block), optional(";")), seq(choice("get", "set"), $.member_name, "(", optional(optional(seq($.param, repeat(seq(",", $.param)), optional(",")))), ")", optional($.block), optional(";")), seq($.member_name, choice(seq("(", optional(seq($.param, repeat(seq(",", $.param)), optional(","))), ")", optional($.block), optional(";")), seq(optional(seq("=", $.expr)), optional(";")))))), seq($.member_name, optional(seq("=", $.expr)), optional(";")), seq($.member_name, "(", optional(seq($.param, repeat(seq(",", $.param)), optional(","))), ")", optional($.block), optional(";"))), + class_member: $ => choice(";", seq("constructor", "(", optional(seq($.param, repeat(seq(",", $.param)), optional(","))), ")", $.block, optional(";")), seq(repeat($.decorator_expr), "static", $.block), seq(repeat($.decorator_expr), repeat(choice("static", "accessor", "async")), choice(seq("*", $.member_name, "(", optional(seq($.param, repeat(seq(",", $.param)), optional(","))), ")", optional($.block), optional(";")), seq(choice("get", "set"), $.member_name, "(", optional(optional(seq($.param, repeat(seq(",", $.param)), optional(",")))), ")", optional($.block), optional(";")), seq($.member_name, choice(seq("(", optional(seq($.param, repeat(seq(",", $.param)), optional(","))), ")", optional($.block), optional(";")), seq(optional(seq("=", $.expr)), choice(";", blank())))))), seq($.member_name, optional(seq("=", $.expr)), choice(";", blank())), seq($.member_name, "(", optional(seq($.param, repeat(seq(",", $.param)), optional(","))), ")", optional($.block), optional(";"))), import_clause: $ => choice(seq($.ident, optional(seq(",", choice(seq("{", optional(seq($.import_specifier, repeat(seq(",", $.import_specifier)), optional(","))), "}"), seq("*", "as", $.ident))))), seq("{", optional(seq($.import_specifier, repeat(seq(",", $.import_specifier)), optional(","))), "}"), seq("*", "as", $.ident)), diff --git a/tree-sitter/javascriptreact/grammar.js b/tree-sitter/javascriptreact/grammar.js index e80cabe..c5a0ce3 100644 --- a/tree-sitter/javascriptreact/grammar.js +++ b/tree-sitter/javascriptreact/grammar.js @@ -161,7 +161,7 @@ module.exports = grammar({ decl: $ => choice(seq(optional("async"), "function", optional("*"), field('name', $.ident), "(", optional(seq($.param, repeat(seq(",", $.param)), optional(","))), ")", $.block), seq(repeat($.decorator_expr), "class", field('name', $.ident), repeat(seq("extends", optional(seq(choice($.class_heritage), repeat(seq(",", choice($.class_heritage))), optional(","))))), "{", repeat($.class_member), "}"), seq("export", choice($.decl, $.stmt)), seq(repeat1($.decorator_expr), $.decl), seq("export", "default", choice(seq(optional("async"), "function", optional("*"), optional(field('name', $.ident)), "(", optional(seq($.param, repeat(seq(",", $.param)), optional(","))), ")", $.block), seq($.expr, optional(";")))), seq("export", "*", choice(seq("from", $.string, optional(";")), seq("as", $.ident, "from", $.string, optional(";")))), seq("export", "{", optional(seq($.export_specifier, repeat(seq(",", $.export_specifier)), optional(","))), "}", optional(seq("from", $.string)), optional(";")), seq("import", choice(seq($.import_clause, "from", $.string, optional(";")), seq($.ident, "=", $.expr, optional(";")), seq($.string, optional(";")))), seq(repeat($.decorator_expr), "export", choice($.decl, $.stmt))), - class_member: $ => choice(";", seq("constructor", "(", optional(seq($.param, repeat(seq(",", $.param)), optional(","))), ")", $.block, optional(";")), seq(repeat($.decorator_expr), "static", $.block), seq(repeat($.decorator_expr), repeat(choice("static", "accessor", "async")), choice(seq("*", $.member_name, "(", optional(seq($.param, repeat(seq(",", $.param)), optional(","))), ")", optional($.block), optional(";")), seq(choice("get", "set"), $.member_name, "(", optional(optional(seq($.param, repeat(seq(",", $.param)), optional(",")))), ")", optional($.block), optional(";")), seq($.member_name, choice(seq("(", optional(seq($.param, repeat(seq(",", $.param)), optional(","))), ")", optional($.block), optional(";")), seq(optional(seq("=", $.expr)), optional(";")))))), seq($.member_name, optional(seq("=", $.expr)), optional(";")), seq($.member_name, "(", optional(seq($.param, repeat(seq(",", $.param)), optional(","))), ")", optional($.block), optional(";"))), + class_member: $ => choice(";", seq("constructor", "(", optional(seq($.param, repeat(seq(",", $.param)), optional(","))), ")", $.block, optional(";")), seq(repeat($.decorator_expr), "static", $.block), seq(repeat($.decorator_expr), repeat(choice("static", "accessor", "async")), choice(seq("*", $.member_name, "(", optional(seq($.param, repeat(seq(",", $.param)), optional(","))), ")", optional($.block), optional(";")), seq(choice("get", "set"), $.member_name, "(", optional(optional(seq($.param, repeat(seq(",", $.param)), optional(",")))), ")", optional($.block), optional(";")), seq($.member_name, choice(seq("(", optional(seq($.param, repeat(seq(",", $.param)), optional(","))), ")", optional($.block), optional(";")), seq(optional(seq("=", $.expr)), choice(";", blank())))))), seq($.member_name, optional(seq("=", $.expr)), choice(";", blank())), seq($.member_name, "(", optional(seq($.param, repeat(seq(",", $.param)), optional(","))), ")", optional($.block), optional(";"))), import_clause: $ => choice(seq($.ident, optional(seq(",", choice(seq("{", optional(seq($.import_specifier, repeat(seq(",", $.import_specifier)), optional(","))), "}"), seq("*", "as", $.ident))))), seq("{", optional(seq($.import_specifier, repeat(seq(",", $.import_specifier)), optional(","))), "}"), seq("*", "as", $.ident)), diff --git a/tree-sitter/typescript/grammar.js b/tree-sitter/typescript/grammar.js index 4263f3a..b7f4357 100644 --- a/tree-sitter/typescript/grammar.js +++ b/tree-sitter/typescript/grammar.js @@ -220,7 +220,7 @@ module.exports = grammar({ interface_member: $ => choice(seq(optional("new"), optional($.type_params), "(", optional(seq($.param, repeat(seq(",", $.param)), optional(","))), ")", optional(seq(":", $.type))), seq(choice("get", "set"), $.member_name, "(", optional(seq($.param, repeat(seq(",", $.param)), optional(","))), ")", optional(seq(":", $.type))), seq(optional("static"), optional(choice("+", "-")), optional("readonly"), "[", $.ident, "in", $.type, optional(seq("as", $.type)), "]", optional(choice("+", "-")), optional("?"), ":", $.type), seq("readonly", $.member_name, optional("?"), ":", $.type), seq($.member_name, optional("?"), choice(seq(optional($.type_params), "(", optional(seq($.param, repeat(seq(",", $.param)), optional(","))), ")", optional(seq(":", $.type))), optional(seq(":", $.type)))), seq(optional("static"), optional("readonly"), "[", optional(seq($.param, repeat(seq(",", $.param)), optional(","))), "]", optional(seq(":", $.type)))), - class_member: $ => choice(";", seq("constructor", "(", optional(seq($.param, repeat(seq(",", $.param)), optional(","))), ")", $.block, optional(";")), seq(repeat($.decorator_expr), "static", $.block), seq(repeat($.decorator_expr), repeat(choice("public", "private", "protected", "static", "abstract", "readonly", "override", "accessor", "async")), choice(seq("*", $.member_name, optional("?"), optional($.type_params), "(", optional(seq($.param, repeat(seq(",", $.param)), optional(","))), ")", optional(seq(":", $.type)), optional($.block), optional(";")), seq(choice("get", "set"), $.member_name, "(", optional(optional(seq($.param, repeat(seq(",", $.param)), optional(",")))), ")", optional(seq(":", $.type)), optional($.block), optional(";")), seq("[", $.ident, ":", $.type, "]", ":", $.type, optional(";")), seq($.member_name, choice(seq(optional("?"), optional($.type_params), "(", optional(seq($.param, repeat(seq(",", $.param)), optional(","))), ")", optional(seq(":", $.type)), optional($.block), optional(";")), seq(optional("!"), optional("?"), optional(seq(":", $.type)), optional(seq("=", $.expr)), optional(";")))))), seq($.member_name, optional("!"), optional("?"), optional(seq(":", $.type)), optional(seq("=", $.expr)), optional(";")), seq($.member_name, optional("?"), optional($.type_params), "(", optional(seq($.param, repeat(seq(",", $.param)), optional(","))), ")", optional(seq(":", $.type)), optional($.block), optional(";"))), + class_member: $ => choice(";", seq("constructor", "(", optional(seq($.param, repeat(seq(",", $.param)), optional(","))), ")", $.block, optional(";")), seq(repeat($.decorator_expr), "static", $.block), seq(repeat($.decorator_expr), repeat(choice("public", "private", "protected", "static", "abstract", "readonly", "override", "accessor", "async")), choice(seq("*", $.member_name, optional("?"), optional($.type_params), "(", optional(seq($.param, repeat(seq(",", $.param)), optional(","))), ")", optional(seq(":", $.type)), optional($.block), optional(";")), seq(choice("get", "set"), $.member_name, "(", optional(optional(seq($.param, repeat(seq(",", $.param)), optional(",")))), ")", optional(seq(":", $.type)), optional($.block), optional(";")), seq("[", $.ident, ":", $.type, "]", ":", $.type, optional(";")), seq($.member_name, choice(seq(optional("?"), optional($.type_params), "(", optional(seq($.param, repeat(seq(",", $.param)), optional(","))), ")", optional(seq(":", $.type)), optional($.block), optional(";")), seq(optional("!"), optional("?"), optional(seq(":", $.type)), optional(seq("=", $.expr)), choice(";", blank())))))), seq($.member_name, optional("!"), optional("?"), optional(seq(":", $.type)), optional(seq("=", $.expr)), choice(";", blank())), seq($.member_name, optional("?"), optional($.type_params), "(", optional(seq($.param, repeat(seq(",", $.param)), optional(","))), ")", optional(seq(":", $.type)), optional($.block), optional(";"))), enum_member: $ => seq($.member_name, optional(seq("=", $.expr))), diff --git a/tree-sitter/typescriptreact/grammar.js b/tree-sitter/typescriptreact/grammar.js index 4285b06..047dd7e 100644 --- a/tree-sitter/typescriptreact/grammar.js +++ b/tree-sitter/typescriptreact/grammar.js @@ -222,7 +222,7 @@ module.exports = grammar({ interface_member: $ => choice(seq(optional("new"), optional($.type_params), "(", optional(seq($.param, repeat(seq(",", $.param)), optional(","))), ")", optional(seq(":", $.type))), seq(choice("get", "set"), $.member_name, "(", optional(seq($.param, repeat(seq(",", $.param)), optional(","))), ")", optional(seq(":", $.type))), seq(optional("static"), optional(choice("+", "-")), optional("readonly"), "[", $.ident, "in", $.type, optional(seq("as", $.type)), "]", optional(choice("+", "-")), optional("?"), ":", $.type), seq("readonly", $.member_name, optional("?"), ":", $.type), seq($.member_name, optional("?"), choice(seq(optional($.type_params), "(", optional(seq($.param, repeat(seq(",", $.param)), optional(","))), ")", optional(seq(":", $.type))), optional(seq(":", $.type)))), seq(optional("static"), optional("readonly"), "[", optional(seq($.param, repeat(seq(",", $.param)), optional(","))), "]", optional(seq(":", $.type)))), - class_member: $ => choice(";", seq("constructor", "(", optional(seq($.param, repeat(seq(",", $.param)), optional(","))), ")", $.block, optional(";")), seq(repeat($.decorator_expr), "static", $.block), seq(repeat($.decorator_expr), repeat(choice("public", "private", "protected", "static", "abstract", "readonly", "override", "accessor", "async")), choice(seq("*", $.member_name, optional("?"), optional($.type_params), "(", optional(seq($.param, repeat(seq(",", $.param)), optional(","))), ")", optional(seq(":", $.type)), optional($.block), optional(";")), seq(choice("get", "set"), $.member_name, "(", optional(optional(seq($.param, repeat(seq(",", $.param)), optional(",")))), ")", optional(seq(":", $.type)), optional($.block), optional(";")), seq("[", $.ident, ":", $.type, "]", ":", $.type, optional(";")), seq($.member_name, choice(seq(optional("?"), optional($.type_params), "(", optional(seq($.param, repeat(seq(",", $.param)), optional(","))), ")", optional(seq(":", $.type)), optional($.block), optional(";")), seq(optional("!"), optional("?"), optional(seq(":", $.type)), optional(seq("=", $.expr)), optional(";")))))), seq($.member_name, optional("!"), optional("?"), optional(seq(":", $.type)), optional(seq("=", $.expr)), optional(";")), seq($.member_name, optional("?"), optional($.type_params), "(", optional(seq($.param, repeat(seq(",", $.param)), optional(","))), ")", optional(seq(":", $.type)), optional($.block), optional(";"))), + class_member: $ => choice(";", seq("constructor", "(", optional(seq($.param, repeat(seq(",", $.param)), optional(","))), ")", $.block, optional(";")), seq(repeat($.decorator_expr), "static", $.block), seq(repeat($.decorator_expr), repeat(choice("public", "private", "protected", "static", "abstract", "readonly", "override", "accessor", "async")), choice(seq("*", $.member_name, optional("?"), optional($.type_params), "(", optional(seq($.param, repeat(seq(",", $.param)), optional(","))), ")", optional(seq(":", $.type)), optional($.block), optional(";")), seq(choice("get", "set"), $.member_name, "(", optional(optional(seq($.param, repeat(seq(",", $.param)), optional(",")))), ")", optional(seq(":", $.type)), optional($.block), optional(";")), seq("[", $.ident, ":", $.type, "]", ":", $.type, optional(";")), seq($.member_name, choice(seq(optional("?"), optional($.type_params), "(", optional(seq($.param, repeat(seq(",", $.param)), optional(","))), ")", optional(seq(":", $.type)), optional($.block), optional(";")), seq(optional("!"), optional("?"), optional(seq(":", $.type)), optional(seq("=", $.expr)), choice(";", blank())))))), seq($.member_name, optional("!"), optional("?"), optional(seq(":", $.type)), optional(seq("=", $.expr)), choice(";", blank())), seq($.member_name, optional("?"), optional($.type_params), "(", optional(seq($.param, repeat(seq(",", $.param)), optional(","))), ")", optional(seq(":", $.type)), optional($.block), optional(";"))), enum_member: $ => seq($.member_name, optional(seq("=", $.expr))), diff --git a/typescript.ts b/typescript.ts index e038134..a00708a 100644 --- a/typescript.ts +++ b/typescript.ts @@ -492,13 +492,17 @@ const ClassMember = rule($ => [ ['[', Ident, ':', Type, ']', ':', Type, opt(';')], // index signature [MemberName, alt( [opt('?'), opt(TypeParams), ...callTail], // method (requires `(`) - [opt('!'), opt('?'), opt(':', Type), opt('=', Expr), opt(';')], // field (all-optional → catch-all) + // field (all-optional → catch-all). A field NOT ended by ';' must not be + // followed by a SAME-LINE decorator: tsc reads that '@' as belonging to + // THIS property ("Decorators must precede the name and all keywords") — + // `x @dec y()` and `x = 1 @dec y()` reject, `x; @dec` and newline accept + [opt('!'), opt('?'), opt(':', Type), opt('=', Expr), alt([';'], [not([sameLine, Decorator])])], )], ), ], // Fallbacks for a member NAMED like a modifier (`static = 1`, `get = 1`, `async() {}`): // many(Modifier) would eat the name, so the member kind alt fails and we land here. - [MemberName, opt('!'), opt('?'), opt(':', Type), opt('=', Expr), opt(';')], + [MemberName, opt('!'), opt('?'), opt(':', Type), opt('=', Expr), alt([';'], [not([sameLine, Decorator])])], [MemberName, opt('?'), opt(TypeParams), '(', sep(Param, ','), ')', opt(':', Type), opt(Block), opt(';')], ]); From 777fe214efa0144873359f6b10706e1ee8d09e85 Mon Sep 17 00:00:00 2001 From: Johnson Chu Date: Fri, 12 Jun 2026 06:09:22 +0800 Subject: [PATCH 23/23] Lexer resync also validates the candidate's leading-trivia flags The windowed-relex resync aligned candidates on kind/text/offset/end but NOT on the token's flags - yet the gap BEFORE the candidate can sit inside the edit: inserting '42' into '}\n privat' leaves every token byte identical from the candidate on while removing its preceding newline. The old token was adopted with a stale newlineBefore, and anything reading the flag downstream (sameLine assertions, comment-aware folds) diverged from a fresh parse. Found by delta-debugging an edit/fresh divergence to a 690-char repro and diffing full streams including flags; the leaf tilings were identical, which is why tree comparisons alone never caught it. The window lex has already computed the candidate's true flags when the resync fires (it lexed the gap), so the fix is one equality in the resync condition: the pushed candidate's flags must match the old token's. A mismatch just keeps lexing - the next candidate's gap lies beyond the edit, so the flags converge and the regrow terminates. Gates: 34/34, lexer parity 5695 diff=0, incremental-grammars 672/672, corpus parity 401/401. --- src/emit-lexer.ts | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/emit-lexer.ts b/src/emit-lexer.ts index 625c872..18d9c0d 100644 --- a/src/emit-lexer.ts +++ b/src/emit-lexer.ts @@ -312,6 +312,10 @@ export function emitLexer(grammar: CstGrammar, st: LexerSymtab): string | null { emit(` while (wndPtr < altN && (altOff[wndPtr] < 0 ? altOff[wndPtr] + srcLenP1 : altOff[wndPtr]) + wndDelta < off) { if (altPd[wndPtr] < dmgMinOld) dmgMinOld = altPd[wndPtr]; wndPtr++; }`); emit(` if (wndPtr < altN && (altOff[wndPtr] < 0 ? altOff[wndPtr] + srcLenP1 : altOff[wndPtr]) + wndDelta === off && altK[wndPtr] === k && altT[wndPtr] === t`); emit(` && (altEnd[wndPtr] < 0 ? altEnd[wndPtr] + srcLenP1 : altEnd[wndPtr]) + wndDelta === end`); + emit(` // the candidate's LEADING-TRIVIA flags must match too: the gap before`); + emit(` // it may sit inside the edit (newline removed/added without moving any`); + emit(` // token bytes), and parsers read these flags (sameLine / commentBefore)`); + emit(` && altFl[wndPtr] === tkFl[tokN - 1]`); emit(` && templateStack.length === 0 && altDp[wndPtr] === 0`); emit(` && LX_PFXV[t] === 0 && LX_PARENKW[t] === 0`); emit(` && !(k === K_PUNCT && (t === ${tLParen} || t === ${tRParen}))) {`);