From 4b5020a17caaad651b34a2a1d854977345c5e9a7 Mon Sep 17 00:00:00 2001 From: Johnson Chu Date: Thu, 11 Jun 2026 02:46:15 +0800 Subject: [PATCH 01/15] Incremental re-parse v1: parseEdited() with memo carry-over and arena reuse MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit parseEdited(newSource) re-parses after an edit reusing everything the edit provably did not touch. No edit protocol: the damage window is DERIVED by diffing the old and new token columns (longest identical prefix; longest suffix identical modulo the char delta) — the caller just hands the new text. Reuse flows through the carried memo. Soundness rests on three pieces: - Every memo entry records its lookahead WATERMARK (memoExt): the farthest token the stored parse may have read — a PEG parse probes beyond its end (failed longer arms, not() lookaheads, SECOND-token dispatch). It comes for free from the global advance watermark at frame exit; the fixed read slack (stop token + SECOND probe, +2) is applied at invalidation time, so the stored value stays the true watermark. - A memo HIT bumps the watermark to the entry's own: the jump semantically reads everything the stored parse read, or an enclosing rule completing right after a reused subtree would record a watermark smaller than what its result depends on (including the child's over-probing failed arms), and a later edit in the gap would keep the stale entry alive. Guaranteed batch no-op by monotonicity — the 18,805-file byte-identical gate and the exact reject-message gate both stay green. - Prefix entries survive when watermark+slack stays inside the prefix; suffix entries shift by the token delta; the damage window drops. The old arena is re-based in place (suffix rows by charDelta, reused leaf entries by tokenDelta; damage-spanning rows are unreachable garbage), and new rows append after the old — a full parse() compacts. 
The ENTRY rule's repetition units (Stmt/Decl for TS — derived from the grammar shape, no language names) now memoize through parseRuleEntry like pratt/ left-rec rules, so whole untouched statements reuse, not just expression subtrees. Token columns double-buffer across edits (ping-pong, zero steady-state allocation; the in-place memo variant for token-count changes measured SLOWER than sparse rebuild — undefined writes materialize holes — and was reverted). Gate (test/incremental-verify.ts, #30 in the chain): scripted edit sessions over the bench files — inserts, deletes, statement insertions, syntax-breaking edits — every accepted re-parse must be byte-identical (toObject) to a fresh parse; rejects must reject on both sides. 120 steps, 0 mismatches. Measured: mixed-session 1.4-1.5x, single-keystroke ~3x, pure-reuse floor ~5.6x; the remaining cost is full-file relex + diff bookkeeping (windowed relex and the green {rel,len} re-base are the named follow-ups), the reused parse itself is ~1% of the profile. --- src/emit-parser.ts | 234 +++++++++++++++++++++++++++++++++---- test/check.ts | 1 + test/incremental-verify.ts | 113 ++++++++++++++++++ 3 files changed, 327 insertions(+), 21 deletions(-) create mode 100644 test/incremental-verify.ts diff --git a/src/emit-parser.ts b/src/emit-parser.ts index 8d8f521..b7a5194 100644 --- a/src/emit-parser.ts +++ b/src/emit-parser.ts @@ -1431,6 +1431,7 @@ let pos = 0; let maxPos = 0; let memoNode = []; let memoEnd = []; +let memoExt = []; // per-entry lookahead extent (see parseRuleEntry) let parseLimit = -1; // cap = the exclusive lookahead bound: min(parseLimit-or-∞, tokN), maintained at the // parseLimit set/restore sites and the one token-stream mutation (the '>' splice). @@ -1498,6 +1499,7 @@ function matchPuLitGT(pu) { // Token indices shifted: drop the per-rule memo arrays (recreated lazily at the new size). 
memoNode.fill(undefined); memoEnd.fill(undefined); + memoExt.fill(undefined); // Leaf entries reference tokens BY INDEX, so the splice's +1 shift must be applied // to every committed/scratch entry past the split point. (Object trees were immune — // leaves copied their spans; the arena trades that copy for this rare O(kidN) pass. @@ -1578,10 +1580,30 @@ function parseTemplateExpr() { // Emit the per-rule parse functions + the RULES dispatch table. function emitRuleFns(e: Emitter, a: ReturnType) { const ruleFn = (name: string) => `R_${sanitize(name)}`; + // SPINE rules — the entry rule's repetition units (the rules its body references + // directly): the natural reuse granularity for incremental re-parsing, so they get + // memoized through parseRuleEntry like pratt/left-rec rules. Without this only + // expression/type subtrees reuse and every statement re-walks on each edit. + // Derived from the grammar shape — no language names. + const spine = new Set(); + { + const entryRule = a.grammar.rules[a.grammar.rules.length - 1]; + const walk = (x: RuleExpr): void => { + switch (x.type) { + case 'ref': if (a.ruleByName.has(x.name)) spine.add(x.name); return; + case 'seq': case 'alt': x.items.forEach(walk); return; + case 'quantifier': case 'group': walk(x.body); return; + case 'sep': walk(x.element); return; + default: return; + } + }; + walk(entryRule.body); + spine.delete(entryRule.name); + } for (const rule of a.grammar.rules) { if (a.prattRules.has(rule.name)) emitPrattRule(e, a, rule); else if (a.leftRecSet.has(rule.name)) emitLeftRecRule(e, a, rule); - else emitNonRecRule(e, a, rule); + else emitNonRecRule(e, a, rule, spine.has(rule.name) && !a.prattRules.has(rule.name) && !a.leftRecSet.has(rule.name)); } // Dispatch table (string rule name → fn), for parseTemplateExpr's dynamic interp rule. 
e.emit(`const RULES = {`); @@ -1593,11 +1615,19 @@ function emitRuleFns(e: Emitter, a: ReturnType) { // committed to the arena IMMEDIATELY (finishNode also truncates scratch back to mark); // a not-better arm's children are dropped by the next arm's scn reset (a beaten // committed candidate stays as an arena hole — the measured 3-5% discard class). -function emitNonRecRule(e: Emitter, a: ReturnType, rule: RuleDecl) { +function emitNonRecRule(e: Emitter, a: ReturnType, rule: RuleDecl, memoized = false) { const ruleFn = `R_${sanitize(rule.name)}`; const rid = a.grammar.rules.indexOf(rule); const alts = rule.body.type === 'alt' ? rule.body.items : [rule.body]; - e.emit(`function ${ruleFn}() {`); + // A memoized (spine) rule splits into the public wrapper (parseRuleEntry owns the + // push+boolean contract and the memo) and an id-returning core, exactly like the + // pratt/left-rec rules. + if (memoized) { + e.emit(`function ${ruleFn}() { return parseRuleEntry(${e.memoIndex(rule.name)}, ${J(rule.name)}, ${ruleFn}_core); }`); + e.emit(`function ${ruleFn}_core(_minBp) {`); + } else { + e.emit(`function ${ruleFn}() {`); + } e.emit(` const saved = pos; const mark = scn;`); e.emit(` let bestId = -1; let bestPos = saved;`); const dispatch = e.altMaskDispatch(alts, '_am'); @@ -1612,8 +1642,13 @@ function emitNonRecRule(e: Emitter, a: ReturnType, rule: RuleDec e.emit(` }`); e.emit(` }`); }); - e.emit(` if (bestId >= 0) { pos = bestPos; scn = mark; scPush(bestId); return true; }`); - e.emit(` pos = saved; scn = mark; return false;`); + if (memoized) { + e.emit(` if (bestId >= 0) { pos = bestPos; scn = mark; return bestId; }`); + e.emit(` pos = saved; scn = mark; return -1;`); + } else { + e.emit(` if (bestId >= 0) { pos = bestPos; scn = mark; scPush(bestId); return true; }`); + e.emit(` pos = saved; scn = mark; return false;`); + } e.emit(`}`); // Arm matchers. 
alts.forEach((alt, i) => emitArm(e, a, rule.name, i, alt)); @@ -1890,24 +1925,42 @@ function emitMixfixLed(e: Emitter, a: ReturnType, fnName: string function emitDriver(e: Emitter, a: ReturnType, entry: string) { e.emit(String.raw` // parseRule for a pratt/left-rec rule: memo + context + suppress, then the core. -// The memo is a pair of per-rule arrays indexed by start pos (lazily sized to the token -// count, undefined-holed): a lookup is two array loads, a store allocates nothing — no -// Map hashing and no {node, end} wrapper per store. The core returns a node ID (or -1); +// The memo is per-rule arrays indexed by start pos (lazily sized to the token count, +// undefined-holed): a lookup is two array loads, a store allocates nothing — no Map +// hashing and no {node, end} wrapper per store. The core returns a node ID (or -1); // this wrapper owns the public arena contract (push the id, return a boolean). +// +// memoExt records each entry's LOOKAHEAD EXTENT — the farthest token index the parse +// may have READ (not merely consumed) — which is what incremental invalidation must +// intersect with an edit's damage window: a PEG parse probes beyond its end (failed +// longer arms, not() lookaheads, SECOND-token dispatch). The extent comes for free +// from the global advance watermark: maxPos at frame exit, +2 covering the stop-token +// and SECOND-token reads past it. Left-to-right parsing keeps the watermark near the +// current frontier, so the value is tight on the dominant flow and only OVER- +// invalidates (soundly) near big-backtrack clusters. function parseRuleEntry(idx, name, core) { const mySup = suppressNext; suppressNext = null; const capped = parseLimit >= 0; const start = pos; - // Capture the pair together: a '>'-splice inside core() detaches both via fill(undefined), - // and the store below must then write into the DETACHED pair (i.e. be discarded), exactly - // like the old per-rule Map did. 
+ // Capture the arrays together: a '>'-splice inside core() detaches them via + // fill(undefined), and the store below must then write into the DETACHED arrays + // (i.e. be discarded), exactly like the old per-rule Map did. let me = memoEnd[idx]; let mn = memoNode[idx]; + let mx = memoExt[idx]; if (!mySup && !capped && me !== undefined) { const e = me[start]; if (e !== undefined) { pos = e; + // The jump SEMANTICALLY reads everything the stored parse read: keep the advance + // watermark ≥ the entry's watermark, or an ENCLOSING rule that completes right + // after a reused subtree stores a watermark smaller than what its result depends + // on (including the child's own over-probing failed arms), and a later edit in + // the gap keeps the stale entry alive. A guaranteed batch no-op: the watermark is + // monotone and was already ≥ this value when the entry was stored. + const ex = mx[start]; + if (ex > maxPos) maxPos = ex; const id = mn[start]; if (id >= 0) { scPush(id); return true; } return false; @@ -1928,11 +1981,16 @@ function parseRuleEntry(idx, name, core) { if (me === undefined) { me = new Array(tokN + 1); mn = new Array(tokN + 1); + mx = new Array(tokN + 1); memoEnd[idx] = me; memoNode[idx] = mn; + memoExt[idx] = mx; } me[start] = pos; mn[start] = result; + mx[start] = maxPos; // the TRUE probe watermark — the +2 read slack (stop token, + // SECOND-token dispatch) is applied at INVALIDATION time + } if (result >= 0) { scPush(result); return true; } return false; @@ -2031,7 +2089,7 @@ export function toObject(id) { } // Parse to the ARENA: returns the root node id. -export function parse(source, entryRule) { +function lexInto(source) { ${e.soa ? ` tokenize(source);` : String.raw` src = source; const _toks = tokenize(source); const _n = _toks.length; @@ -2044,19 +2102,24 @@ ${e.soa ? 
` tokenize(source);` : String.raw` src = source; tkText[_i] = _t.text; } tokN = _n;`} +} + +function farthest(errPos) { + if (maxPos <= errPos || maxPos >= tokN) return ''; + return ' [farthest: offset ' + tkOff[maxPos] + " near '" + tokTextAt(maxPos).slice(0, 20) + "']"; +} + +// Run the entry rule over the CURRENT token stream (shared by parse / parseEdited — +// everything per-parse EXCEPT the memo and the arena cursor, which parseEdited carries). +function runParse(entryRule) { pos = 0; maxPos = 0; - memoNode = new Array(MEMO_RULES); - memoEnd = new Array(MEMO_RULES); parseLimit = -1; cap = tokN; currentPrattContext = null; suppressNext = null; suppressCur = null; - nodeN = 0; - kidN = 0; scn = 0; - const entry = entryRule ?? ENTRY; if (tokN === 0) { const rid = RULE_NAMES.indexOf(entry); @@ -2070,14 +2133,143 @@ ${e.soa ? ` tokenize(source);` : String.raw` src = source; throw new Error('Parse error at offset ' + tkOff[pos] + ": unexpected '" + tokTextAt(pos) + "' after successful parse" + farthest(pos)); } return sc[--scn]; +} + +// Source of the last COMPLETED parse — the token columns, arena and memo describe it. +// null whenever the module state is not a coherent snapshot (no parse yet, or the last +// attempt threw), so parseEdited falls back to a full parse. +let lastSrc = null; +// The spare token-column buffer set (parseEdited ping-pongs between the live set and +// this one, so steady-state edits never allocate columns). +let altK = null, altT = null, altOff = null, altEnd = null, altFl = null; +${e.soa ? 
'' : 'let altText = [];'} + +export function parse(source, entryRule) { + lastSrc = null; + lexInto(source); + memoNode = new Array(MEMO_RULES); + memoEnd = new Array(MEMO_RULES); + memoExt = new Array(MEMO_RULES); + nodeN = 0; + kidN = 0; + const root = runParse(entryRule); + lastSrc = source; + return root; +} - function farthest(errPos) { - if (maxPos <= errPos || maxPos >= tokN) return ''; - return ' [farthest: offset ' + tkOff[maxPos] + " near '" + tokTextAt(maxPos).slice(0, 20) + "']"; +// ── Incremental re-parse ── +// No edit protocol: the caller hands the NEW source; the damage window is DERIVED by +// diffing the old and new token columns (longest identical prefix; longest suffix +// identical modulo the character delta). Reuse then flows through the carried memo: +// - prefix entries survive when their lookahead extent never reached the damage; +// - suffix entries survive shifted by the token delta (their reads are wholly inside +// the suffix, which is identical modulo position); +// - damaged-region entries are dropped and re-parsed. +// The old arena is re-based in place (rows starting at/after the suffix shift by the +// char delta; reused leaf entries by the token delta; rows STARTING inside the damage +// are unreachable garbage — their values no longer matter), and new rows append after +// the old ones. A full parse() compacts (resets the arena); long edit sessions grow +// until then. Lexing is FULL-FILE by design: the lexer carries cross-token state +// (template nesting, regex context, markup modes), full lexing is a small share of a +// parse, and the diff is what localizes the damage — not the lexer. +export function parseEdited(source, entryRule) { + if (lastSrc === null) return parse(source, entryRule); + const oSrc = lastSrc; + lastSrc = null; + // Stash the old columns BY REFERENCE and lex into the spare buffer set (ping-pong + // double buffer — steady-state edits allocate nothing and keep the pages warm). 
+ const oK = tkK, oT = tkT, oOff = tkOff, oEnd = tkEnd, oFl = tkFl, oN = tokN; +${e.soa ? '' : ' const oText = tkText;'} + if (altK === null || altK.length !== tkCap) { + altK = new tkK.constructor(tkCap); altT = new tkT.constructor(tkCap); + altOff = new Int32Array(tkCap); altEnd = new Int32Array(tkCap); altFl = new Uint8Array(tkCap); + } + tkK = altK; tkT = altT; tkOff = altOff; tkEnd = altEnd; tkFl = altFl; +${e.soa ? '' : ' tkText = altText; tkText.length = 0;'} + altK = oK; altT = oT; altOff = oOff; altEnd = oEnd; altFl = oFl; +${e.soa ? '' : ' altText = oText;'} + lexInto(source); + if (tkCap !== oK.length) { + // the new lex outgrew the buffer (growTok reallocated): drop the stale spare + altK = null; + } + const nN = tokN; + const charDelta = source.length - oSrc.length; + const minN = oN < nN ? oN : nN; + // Longest identical prefix (positions included — the prefix is unshifted). + let p = 0; + while (p < minN && oK[p] === tkK[p] && oT[p] === tkT[p] && oFl[p] === tkFl[p] + && oOff[p] === tkOff[p] && oEnd[p] === tkEnd[p]${e.soa ? '' : ' && oText[p] === tkText[p]'}) p++; + // Longest identical suffix modulo charDelta (disjoint from the prefix). + let s = 0; + while (s < minN - p) { + const i = oN - 1 - s, j = nN - 1 - s; + if (oK[i] === tkK[j] && oT[i] === tkT[j] && oFl[i] === tkFl[j] + && oOff[i] + charDelta === tkOff[j] && oEnd[i] + charDelta === tkEnd[j]${e.soa ? '' : ' && oText[i] === tkText[j]'}) s++; + else break; + } + const dOldEnd = oN - s; // damaged OLD tokens: [p, dOldEnd) + const tokenDelta = nN - oN; + // Re-base the old arena in place: rows starting at/after the first suffix token's OLD + // offset shift by charDelta; reused leaf entries past the damage shift by tokenDelta. + // (A reusable subtree lies entirely on one side of the damage, so the start-threshold + // classifies it correctly; damage-spanning rows are garbage either way.) 
+ if (s > 0 && (charDelta !== 0 || tokenDelta !== 0)) { + const charThresh = oOff[dOldEnd]; + if (charDelta !== 0) { + for (let i = 0; i < nodeN; i++) if (rowOff[i] >= charThresh) rowOff[i] += charDelta; + } + if (tokenDelta !== 0) { + const eShift = tokenDelta << 2; + for (let i = 0; i < kidN; i++) { + const e = kids[i]; + if (e < 0 && ((~e) >>> 2) >= dOldEnd) kids[i] = e - eShift; + } + } + } + // Carry the memo across: prefix entries whose lookahead never reached the damage stay + // at their index; suffix entries move by tokenDelta (ids reference the re-based rows). + // tokenDelta === 0 (the common keystroke: editing within a token) mutates IN PLACE — + // no per-rule array allocation; only the damage window and the prefix entries whose + // extent crossed into it are cleared. + for (let r = 0; r < MEMO_RULES; r++) { + const me = memoEnd[r]; + if (me === undefined) continue; + const mn = memoNode[r], mx = memoExt[r]; + // prefix entries whose lookahead may have crossed into the damage die in place + // (mx is the advance watermark; reads run up to two tokens past it: the stop + // token and the SECOND-token dispatch probe) + for (let i = 0; i < p; i++) { + if (me[i] !== undefined && mx[i] + 2 > p) { me[i] = undefined; mn[i] = undefined; mx[i] = undefined; } + } + if (tokenDelta === 0) { + for (let i = p; i < dOldEnd; i++) { + if (me[i] !== undefined) { me[i] = undefined; mn[i] = undefined; mx[i] = undefined; } + } + continue; + } + // token count changed: rebuild the rule's arrays sparsely (measured FASTER than an + // in-place direction-aware shift — writing undefined through the holes materializes + // them; fresh holey arrays skip that entirely). + const nme = new Array(nN + 1), nmn = new Array(nN + 1), nmx = new Array(nN + 1); + const pCap = p < nN + 1 ? 
p : nN + 1; + for (let i = 0; i < pCap; i++) { + if (me[i] !== undefined) { nme[i] = me[i]; nmn[i] = mn[i]; nmx[i] = mx[i]; } + } + for (let i = dOldEnd; i <= oN; i++) { + if (me[i] !== undefined) { + const j = i + tokenDelta; + nme[j] = me[i] + tokenDelta; nmn[j] = mn[i]; nmx[j] = mx[i] + tokenDelta; + } + } + memoEnd[r] = nme; memoNode[r] = nmn; memoExt[r] = nmx; } + const root = runParse(entryRule); + lastSrc = source; + return root; } export { tokenize }; -export function createParser() { return { parse, tree, visit, toObject, tokenize }; } +export function createParser() { return { parse, parseEdited, tree, visit, toObject, tokenize }; } `); } diff --git a/test/check.ts b/test/check.ts index 8b2c81e..a0f18e4 100644 --- a/test/check.ts +++ b/test/check.ts @@ -21,6 +21,7 @@ const GATES: Gate[] = [ { group: 'core', name: 'cst-text-invariant', args: ['test/cst-text-invariant.ts'] }, { group: 'conformance', name: 'ts-ast-structure', args: ['test/ts-ast-verify.ts'] }, { group: 'core', name: 'cst-match-totality', args: ['test/cst-match-totality.ts'] }, + { group: 'core', name: 'incremental-verify', args: ['test/incremental-verify.ts'] }, { group: 'core', name: 'issue-cases', args: ['test/test-issues.ts'] }, { group: 'conformance', name: 'js', args: ['test/js-conformance.ts'] }, { group: 'conformance', name: 'tsx', args: ['test/tsx-conformance.ts'] }, diff --git a/test/incremental-verify.ts b/test/incremental-verify.ts new file mode 100644 index 0000000..2ed248d --- /dev/null +++ b/test/incremental-verify.ts @@ -0,0 +1,113 @@ +// Gate: INCREMENTAL ≡ FRESH. parseEdited(newSource) must produce a tree byte-identical +// (via toObject) to a from-scratch parse of the same text, across scripted edit +// sessions over real files — inserts, deletions, replacements, statement insertions, +// edits inside strings/comments, and syntax-breaking edits (both sides must reject; +// the session self-heals on the next good text). 
Also reports the incremental speedup +// and the arena growth, so reuse is MEASURED, not assumed. +// +// node test/incremental-verify.ts +import { existsSync, readFileSync, writeFileSync } from 'node:fs'; +import { emitParser } from '../src/emit-parser.ts'; + +const grammar = (await import('../typescript.ts')).default; +const emPath = '/tmp/emitted-incremental.mjs'; +writeFileSync(emPath, emitParser(grammar)); +type Em = { + parse(s: string): number; + parseEdited(s: string): number; + toObject(id: number): unknown; +}; +const session = (await import(emPath + '?session=' + process.pid)) as Em; +const fresh = (await import(emPath + '?fresh=' + process.pid)) as Em; + +// Deterministic LCG so failures replay. +let seedState = 0x2F6E2B1; +const rand = () => ((seedState = (seedState * 48271) % 0x7fffffff) / 0x7fffffff); +const randInt = (n: number) => Math.floor(rand() * n); + +const INSERTS = ['x', '_v', '42', ' + y', '.m', '()', ' /*c*/ ', '"s"', 'await ', '!', '?']; +const STMTS = ['const q9 = 1;\n', 'function g9(a) { return a; }\n', 'if (x9) { y9(); }\n', '// note\n', 'type T9 = string | number;\n']; + +function mutate(text: string): string { + switch (randInt(5)) { + case 0: { // insert a small fragment at a random position + const at = randInt(text.length); + return text.slice(0, at) + INSERTS[randInt(INSERTS.length)] + text.slice(at); + } + case 1: { // delete a small span + const at = randInt(Math.max(1, text.length - 8)); + return text.slice(0, at) + text.slice(at + 1 + randInt(6)); + } + case 2: { // replace a character + const at = randInt(Math.max(1, text.length - 1)); + return text.slice(0, at) + 'z' + text.slice(at + 1); + } + case 3: { // insert a whole statement at a line boundary + const lines = text.split('\n'); + const at = randInt(lines.length); + lines.splice(at, 0, STMTS[randInt(STMTS.length)].trimEnd()); + return lines.join('\n'); + } + default: { // append at the end (the pure-prefix reuse case) + return text + '\n' + 
STMTS[randInt(STMTS.length)]; + } + } +} + +const FILES = [ + '/tmp/ts-repo/tests/cases/conformance/parser/ecmascript5/RealWorld/parserharness.ts', + '/tmp/ts-repo/tests/cases/conformance/fixSignatureCaching.ts', + '/tmp/ts-repo/tests/cases/conformance/parser/ecmascript5/parserRealSource7.ts', + '/tmp/ts-repo/tests/cases/conformance/parser/ecmascript5/RealWorld/parserindenter.ts', +].filter(existsSync); +const STEPS = 30; + +let steps = 0, equal = 0, bothReject = 0, mismatch = 0; +let tInc = 0, tFresh = 0; +const failures: string[] = []; + +for (const f of FILES) { + let text = readFileSync(f, 'utf-8'); + session.parse(text); // open the session + for (let k = 0; k < STEPS; k++) { + const next = mutate(text); + steps++; + let freshRoot = -1, freshErr: string | null = null; + const tf0 = performance.now(); + try { freshRoot = fresh.parse(next); } catch (e) { freshErr = (e as Error).message; } + const tf1 = performance.now(); + let incRoot = -1, incErr: string | null = null; + const ti0 = performance.now(); + try { incRoot = session.parseEdited(next); } catch (e) { incErr = (e as Error).message; } + const ti1 = performance.now(); + if (freshErr !== null || incErr !== null) { + if ((freshErr === null) !== (incErr === null)) { + mismatch++; + if (failures.length < 5) failures.push(`${f.split('/').pop()} step ${k}: fresh ${freshErr ? 'reject' : 'accept'} / incremental ${incErr ? 'reject' : 'accept'}\n fresh: ${freshErr ?? '-'}\n inc: ${incErr ?? 
'-'}`); + } else bothReject++; + // rejected text: do not advance the session text (the session reset itself) + continue; + } + tFresh += tf1 - tf0; tInc += ti1 - ti0; + const a = JSON.stringify(fresh.toObject(freshRoot)); + const b = JSON.stringify(session.toObject(incRoot)); + if (a === b) equal++; + else { + mismatch++; + if (failures.length < 5) { + let i = 0; while (i < a.length && i < b.length && a[i] === b[i]) i++; + failures.push(`${f.split('/').pop()} step ${k}: tree diverges @${i}\n fresh: …${a.slice(Math.max(0, i - 50), i + 50)}…\n inc: …${b.slice(Math.max(0, i - 50), i + 50)}…`); + } + } + text = next; + } +} + +console.log(`incremental ≡ fresh: ${equal} equal · ${bothReject} both-reject · ${mismatch} MISMATCH (${steps} steps over ${FILES.length} files)`); +if (tInc > 0) console.log(`time: incremental ${tInc.toFixed(1)}ms vs fresh ${tFresh.toFixed(1)}ms → ${(tFresh / tInc).toFixed(2)}× faster on accepted edits`); +for (const s of failures) console.log(' ✗ ' + s); +if (mismatch > 0) { + console.error('✗ incremental parse diverges from a fresh parse'); + process.exit(1); +} +console.log('✓ every edited re-parse is byte-identical to a fresh parse'); From 909b835bd14d8eca0b5769745fb23d535fc05ef5 Mon Sep 17 00:00:00 2001 From: Johnson Chu Date: Thu, 11 Jun 2026 03:22:44 +0800 Subject: [PATCH 02/15] Windowed re-lexing: lex O(damage) with depth-recorded restart/resync (M1) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The lexer core is parameterized (lexCore): start anywhere with the previous token's (k, t) as the regex-context seed and empty template/paren stacks. Every token records its two stack depths (tkDp/tkPd columns); the restart anchor is the last token before the damage with both records zero and no live cross-token flag (a control-head ')' or postfix-ambiguous operator), walking back to the file head in the worst case — always sound. 
The window lexes into the spare buffer set (the old stream stays live), and RESYNC fires at the first token at/past the damage end that aligns with an old token (same k/t, spans shifted by the char delta) at EQUAL stack depths where every still-open bracket was opened BEFORE the damage — the byte-equal prefix guarantees those stack entries agree, while anything opened inside the damage may differ in control-head-ness and must not span the join. The depth-tolerant condition matters: an all-wrapping IIFE (typescript.js) keeps paren depth >= 1 everywhere, and a depth-0-only resync degraded 9MB edits to ~1.2x; with it they reach 2.6x. The splice is copyWithin + a suffix span shift; the damage window is derived from a char-level prefix/suffix compare of the two sources (no edit protocol needed). The true token prefix is recovered by comparing the window's leading tokens against the old stream before the splice, so the memo carry keeps everything the re-lex merely re-derived. Fallback-lexer grammars keep the full-relex path; tokenize() is unchanged for batch (the lexer-equality gate runs the full streams). Numbers: 81KB keystroke 3.5x -> 3.3x parse-side with lex now O(damage); mixed sessions ~1.5-1.65x; 9MB keystroke 2.6x. Remaining per-edit O(n) is the M3/M4 bookkeeping (memo prefix scans, arena re-base loops, suffix span shift) — the green {rel,len} re-base and old-tree cursor adoption kill those next. 30/30 gates; emit≡interp 18,802 byte-identical; reject messages and token streams exact. 
--- src/emit-lexer.ts | 66 ++++++++++++++++++--- src/emit-parser.ts | 145 ++++++++++++++++++++++++++++++++++----------- 2 files changed, 169 insertions(+), 42 deletions(-) diff --git a/src/emit-lexer.ts b/src/emit-lexer.ts index 1a41ac6..9dd85d0 100644 --- a/src/emit-lexer.ts +++ b/src/emit-lexer.ts @@ -199,32 +199,70 @@ export function emitLexer(grammar: CstGrammar, st: LexerSymtab): string | null { emit(`function tokenize(source) {`); emit(` src = source;`); emit(` tokN = 0;`); + emit(` lexCore(source, 0, -1, 0, -1, 0, 0);`); + emit(` return tokN;`); + emit(`}`); + emit(`// The lexer core, parameterized for WINDOWED re-lexing: start at startPos with`); + emit(`// the previous token's (k, t) as the regex-context seed (-1 = none / file start)`); + emit(`// and EMPTY template/paren stacks (the caller restarts only at depth-0 safe`); + emit(`// points). In window mode (wndPtr0 >= 0) the OLD stream sits in the alt buffers;`); + emit(`// after each token pushed at/past wndMinOff, resync fires when it aligns with an`); + emit(`// old token (same k/t, offsets shifted by wndDelta, both depth records 0) while`); + emit(`// the window's own stacks are empty — returns that OLD index (the duplicate push`); + emit(`// is retracted), or -1 when lexing ran to EOF.`); + emit(`function lexCore(source, startPos, pvK, pvT, wndPtr0, wndMinOff, wndDelta, wndCs) {`); emit(` const n = source.length;`); - emit(` let pos = 0;`); + emit(` let pos = startPos;`); emit(` let pendingNl = false;`); emit(` let lastBangWasPostfix = false;`); emit(` let lastCloseWasParenHead = false;`); emit(` const templateStack = [];`); emit(` const parenHeadStack = [];`); + emit(` let wndPtr = wndPtr0;`); + emit(` let wndHit = -1;`); + emit(` // stack depths as of the last token fully BEFORE the damage: a resync point may`); + emit(` // sit at any depth as long as every bracket still open there was opened before`); + emit(` // the damage (the prefix agrees byte-for-byte, so those stack entries agree too;`); + 
emit(` // anything opened inside the damage could differ in control-head-ness).`); + emit(` let dmgDp = -1, dmgPd = -1;`); + emit(` let lastDp = 0, lastPd = 0;`); emit(` function tkPush(k, t, off, end) {`); emit(` if (tokN === tkCap) growTok();`); emit(` tkK[tokN] = k; tkT[tokN] = t; tkOff[tokN] = off; tkEnd[tokN] = end;`); emit(` tkFl[tokN] = pendingNl ? 1 : 0;`); + emit(` tkDp[tokN] = templateStack.length;`); + emit(` tkPd[tokN] = parenHeadStack.length;`); emit(` pendingNl = false;`); + emit(` pvK = k; pvT = t;`); emit(` tokN++;`); + emit(` if (wndPtr >= 0) {`); + emit(` if (dmgPd < 0) {`); + emit(` if (off >= wndCs) { dmgDp = lastDp; dmgPd = lastPd; }`); + emit(` else { lastDp = tkDp[tokN - 1]; lastPd = tkPd[tokN - 1]; }`); + emit(` }`); + emit(` if (off >= wndMinOff && dmgPd >= 0`); + emit(` && templateStack.length <= dmgDp && parenHeadStack.length <= dmgPd) {`); + emit(` while (wndPtr < altN && altOff[wndPtr] + wndDelta < off) wndPtr++;`); + emit(` if (wndPtr < altN && altOff[wndPtr] + wndDelta === off && altK[wndPtr] === k && altT[wndPtr] === t`); + emit(` && altEnd[wndPtr] + wndDelta === end && altDp[wndPtr] === templateStack.length && altPd[wndPtr] === parenHeadStack.length) {`); + emit(` wndHit = wndPtr;`); + emit(` }`); + emit(` }`); + emit(` }`); emit(` }`); emit(` // prevIsValue, baked: postfix-ambiguous op → its recorded position; an expression-`); emit(` // head keyword or a control-head ')' is NOT a value; else division-prev type/text.`); emit(` function prevIsValue() {`); - emit(` if (tokN === 0) return false;`); - emit(` const i = tokN - 1;`); - emit(` const t = tkT[i];`); + emit(` const k = tokN > 0 ? tkK[tokN - 1] : pvK;`); + emit(` if (k < 0) return false;`); + emit(` const t = tokN > 0 ? 
tkT[tokN - 1] : pvT;`); emit(` if (LX_PFXV[t] !== 0) return lastBangWasPostfix;`); - emit(` if (tkK[i] === ${kIdent} && LX_EXPRKW[t] !== 0) return false;`); + emit(` if (k === ${kIdent} && LX_EXPRKW[t] !== 0) return false;`); emit(` if (t === ${tRParen} && lastCloseWasParenHead) return false;`); - emit(` return LX_DIVK[tkK[i]] !== 0 || LX_DIVT[t] !== 0;`); + emit(` return LX_DIVK[k] !== 0 || LX_DIVT[t] !== 0;`); emit(` }`); emit(` while (pos < n) {`); + emit(` if (wndHit >= 0) { tokN--; return wndHit; }`); emit(` const cc = source.charCodeAt(pos);`); emit(` // whitespace: ASCII \\s run by char loop; a non-ASCII candidate falls back to the regex`); emit(` if (cc === 32 || (cc >= 9 && cc <= 13)) {`); @@ -461,7 +499,21 @@ export function emitLexer(grammar: CstGrammar, st: LexerSymtab): string | null { } emit(` throw new Error("Unexpected character at offset " + pos + ": '" + source[pos] + "'");`); emit(` }`); - emit(` return tokN;`); + emit(` if (wndHit >= 0) { tokN--; return wndHit; }`); + emit(` return -1;`); + emit(`}`); + emit(`// Windowed-relex restart anchor: the last token B ending at/before the damage`); + emit(`// whose recorded stack depths are zero and whose shape leaves no cross-token`); + emit(`// lexer flag live (a control-head ')' or a postfix-ambiguous operator would`); + emit(`// make the next token's regex-context depend on unrecoverable state). 
-1 = file`); + emit(`// head (always sound, degrades to a full re-lex).`); + emit(`function findRestart(cs) {`); + emit(` let lo = 0, hi = tokN;`); + emit(` while (lo < hi) { const mid = (lo + hi) >> 1; if (tkEnd[mid] <= cs) lo = mid + 1; else hi = mid; }`); + emit(` for (let b = lo - 1; b >= 0; b--) {`); + emit(` if (tkDp[b] === 0 && tkPd[b] === 0 && LX_PFXV[tkT[b]] === 0 && !(tkK[b] === 1 && tkT[b] === ${tRParen})) return b;`); + emit(` }`); + emit(` return -1;`); emit(`}`); return out.join('\n'); } diff --git a/src/emit-parser.ts b/src/emit-parser.ts index b7a5194..9a0ddac 100644 --- a/src/emit-parser.ts +++ b/src/emit-parser.ts @@ -1329,6 +1329,12 @@ let tkT = new ${T_ARR}(4096); let tkOff = new Int32Array(4096); let tkEnd = new Int32Array(4096); let tkFl = new Uint8Array(4096); +// lexer-state depth records per token (windowed relex restart/resync safety): +// tkDp = template-interp stack depth, tkPd = paren-head stack depth, both AS RECORDED +// at the token's push (the convention per token kind is fixed by the lexer's code +// path; determinism is what the predicates rely on, depth-0 is the safe state). +let tkDp = new Uint8Array(4096); +let tkPd = new Uint16Array(4096); let tkCap = 4096; let tokN = 0; let src = ''; @@ -1340,6 +1346,8 @@ function growTok() { const o = new Int32Array(tkCap); o.set(tkOff); tkOff = o; const e2 = new Int32Array(tkCap); e2.set(tkEnd); tkEnd = e2; const f = new Uint8Array(tkCap); f.set(tkFl); tkFl = f; + const d = new Uint8Array(tkCap); d.set(tkDp); tkDp = d; + const q = new Uint16Array(tkCap); q.set(tkPd); tkPd = q; } // ── CST arena: nodes are rows in parallel columns; leaves are TOKEN REFERENCES ── @@ -1489,6 +1497,8 @@ function matchPuLitGT(pu) { tkT.copyWithin(pos + 1, pos, tokN); tkOff.copyWithin(pos + 1, pos, tokN); tkEnd.copyWithin(pos + 1, pos, tokN); + tkDp.copyWithin(pos + 1, pos, tokN); + tkPd.copyWithin(pos + 1, pos, tokN); tkFl.copyWithin(pos + 1, pos, tokN); ${e.soa ? 
'' : "tkText.splice(pos, 1, '>', restText);"} tkT[pos] = pu; tkEnd[pos] = off + 1; tkFl[pos] = 0; @@ -2099,6 +2109,7 @@ ${e.soa ? ` tokenize(source);` : String.raw` src = source; const _t = _toks[_i]; tkK[_i] = _t.k; tkT[_i] = _t.t; tkOff[_i] = _t.offset; tkEnd[_i] = _t.offset + _t.text.length; tkFl[_i] = (_t.newlineBefore ? 1 : 0) | (_t.commentBefore ? 2 : 0) | (_t.multilineFlowBefore ? 4 : 0); + tkDp[_i] = 0; tkPd[_i] = 0; tkText[_i] = _t.text; } tokN = _n;`} @@ -2141,7 +2152,20 @@ function runParse(entryRule) { let lastSrc = null; // The spare token-column buffer set (parseEdited ping-pongs between the live set and // this one, so steady-state edits never allocate columns). -let altK = null, altT = null, altOff = null, altEnd = null, altFl = null; +let altK = null, altT = null, altOff = null, altEnd = null, altFl = null, altDp = null, altPd = null; +let altCap = 0; +let altN = 0; // old-stream token count while a window lex runs (lexCore's resync bound) +function swapBuffers() { + let x; + x = tkK; tkK = altK; altK = x; + x = tkT; tkT = altT; altT = x; + x = tkOff; tkOff = altOff; altOff = x; + x = tkEnd; tkEnd = altEnd; altEnd = x; + x = tkFl; tkFl = altFl; altFl = x; + x = tkDp; tkDp = altDp; altDp = x; + x = tkPd; tkPd = altPd; altPd = x; + x = tkCap; tkCap = altCap; altCap = x; +} ${e.soa ? '' : 'let altText = [];'} export function parse(source, entryRule) { @@ -2176,46 +2200,106 @@ export function parseEdited(source, entryRule) { if (lastSrc === null) return parse(source, entryRule); const oSrc = lastSrc; lastSrc = null; - // Stash the old columns BY REFERENCE and lex into the spare buffer set (ping-pong - // double buffer — steady-state edits allocate nothing and keep the pages warm). +${e.soa ? String.raw` // ── M1: WINDOWED re-lex ── + // Char-level envelope (cheapest possible without an edit protocol). + const oldLen = oSrc.length, newLen = source.length; + const minL = oldLen < newLen ? 
oldLen : newLen; + let cs = 0; + while (cs < minL && oSrc.charCodeAt(cs) === source.charCodeAt(cs)) cs++; + let ce = 0; + while (ce < minL - cs && oSrc.charCodeAt(oldLen - 1 - ce) === source.charCodeAt(newLen - 1 - ce)) ce++; + const ceOld = oldLen - ce, ceNew = newLen - ce; + const charDelta = newLen - oldLen; + // Restart anchor: the last token B ending at/before the damage whose recorded + // depths are zero and whose shape carries no cross-token lexer flag (')' control- + // head, postfix-ambiguous op). B = -1 restarts at the file head — always sound. + const B = findRestart(cs); + const oN = tokN; + // first old token at/after the damage end — the resync search floor + let r0 = oN; + { let lo = 0, hi = oN; + while (lo < hi) { const mid = (lo + hi) >> 1; if (tkOff[mid] < ceOld) lo = mid + 1; else hi = mid; } + r0 = lo; } + // Lex the window into the spare buffers (the old stream stays live for resync). + if (altK === null || altCap < tkCap) { + altK = new tkK.constructor(tkCap); altT = new tkT.constructor(tkCap); + altOff = new Int32Array(tkCap); altEnd = new Int32Array(tkCap); altFl = new Uint8Array(tkCap); + altDp = new Uint8Array(tkCap); altPd = new Uint16Array(tkCap); + altCap = tkCap; + } + altN = oN; + swapBuffers(); // live = scratch, alt = OLD stream + src = source; + tokN = 0; + const startOff = B >= 0 ? altEnd[B] : 0; + const R0 = lexCore(source, startOff, B >= 0 ? altK[B] : -1, B >= 0 ? altT[B] : 0, r0, ceNew, charDelta, cs); + const W = tokN; + const R = R0 >= 0 ? R0 : oN; + swapBuffers(); // live = OLD stream again; window sits in the alt buffers + tokN = oN; + // TRUE token prefix p: the window re-derives [B+1 .. p) byte-identically; only past + // p is real damage (compared BEFORE the splice clobbers the old slots). 
+ let p = B + 1; + { let i = 0; + while (i < W && p < R && altK[i] === tkK[p] && altT[i] === tkT[p] && altOff[i] === tkOff[p] + && altEnd[i] === tkEnd[p] && altFl[i] === tkFl[p]) { i++; p++; } + } + const dOldEnd = R; + const tokenDelta = (B + 1 + W) - R; + const charThresh = R < oN ? tkOff[R] : 0x7fffffff; + // ── splice: old[0..B] + window[0..W) + old[R..oN), then shift the suffix spans ── + const nN = B + 1 + W + (oN - R); + while (tkCap < nN + 1) growTok(); + tkK.copyWithin(B + 1 + W, R, oN); tkT.copyWithin(B + 1 + W, R, oN); + tkOff.copyWithin(B + 1 + W, R, oN); tkEnd.copyWithin(B + 1 + W, R, oN); + tkFl.copyWithin(B + 1 + W, R, oN); tkDp.copyWithin(B + 1 + W, R, oN); tkPd.copyWithin(B + 1 + W, R, oN); + if (W > 0) { + tkK.set(altK.subarray(0, W), B + 1); tkT.set(altT.subarray(0, W), B + 1); + tkOff.set(altOff.subarray(0, W), B + 1); tkEnd.set(altEnd.subarray(0, W), B + 1); + tkFl.set(altFl.subarray(0, W), B + 1); tkDp.set(altDp.subarray(0, W), B + 1); tkPd.set(altPd.subarray(0, W), B + 1); + } + if (charDelta !== 0) { + for (let i = B + 1 + W; i < nN; i++) { tkOff[i] += charDelta; tkEnd[i] += charDelta; } + } + tokN = nN; + const nN2 = nN; + const oN2 = oN;` : String.raw` // (fallback-lexer grammars keep the full-relex + token-diff path) const oK = tkK, oT = tkT, oOff = tkOff, oEnd = tkEnd, oFl = tkFl, oN = tokN; -${e.soa ? '' : ' const oText = tkText;'} + const oText = tkText; if (altK === null || altK.length !== tkCap) { altK = new tkK.constructor(tkCap); altT = new tkT.constructor(tkCap); altOff = new Int32Array(tkCap); altEnd = new Int32Array(tkCap); altFl = new Uint8Array(tkCap); + altDp = new Uint8Array(tkCap); altPd = new Uint16Array(tkCap); } tkK = altK; tkT = altT; tkOff = altOff; tkEnd = altEnd; tkFl = altFl; -${e.soa ? 
'' : ' tkText = altText; tkText.length = 0;'} + { const _d = tkDp; tkDp = altDp; altDp = _d; const _q = tkPd; tkPd = altPd; altPd = _q; } + tkText = altText; tkText.length = 0; altK = oK; altT = oT; altOff = oOff; altEnd = oEnd; altFl = oFl; -${e.soa ? '' : ' altText = oText;'} + altText = oText; lexInto(source); - if (tkCap !== oK.length) { - // the new lex outgrew the buffer (growTok reallocated): drop the stale spare - altK = null; - } const nN = tokN; const charDelta = source.length - oSrc.length; const minN = oN < nN ? oN : nN; - // Longest identical prefix (positions included — the prefix is unshifted). let p = 0; while (p < minN && oK[p] === tkK[p] && oT[p] === tkT[p] && oFl[p] === tkFl[p] - && oOff[p] === tkOff[p] && oEnd[p] === tkEnd[p]${e.soa ? '' : ' && oText[p] === tkText[p]'}) p++; - // Longest identical suffix modulo charDelta (disjoint from the prefix). + && oOff[p] === tkOff[p] && oEnd[p] === tkEnd[p] && oText[p] === tkText[p]) p++; let s = 0; while (s < minN - p) { const i = oN - 1 - s, j = nN - 1 - s; if (oK[i] === tkK[j] && oT[i] === tkT[j] && oFl[i] === tkFl[j] - && oOff[i] + charDelta === tkOff[j] && oEnd[i] + charDelta === tkEnd[j]${e.soa ? '' : ' && oText[i] === tkText[j]'}) s++; + && oOff[i] + charDelta === tkOff[j] && oEnd[i] + charDelta === tkEnd[j] && oText[i] === tkText[j]) s++; else break; } - const dOldEnd = oN - s; // damaged OLD tokens: [p, dOldEnd) + const dOldEnd = oN - s; const tokenDelta = nN - oN; - // Re-base the old arena in place: rows starting at/after the first suffix token's OLD - // offset shift by charDelta; reused leaf entries past the damage shift by tokenDelta. - // (A reusable subtree lies entirely on one side of the damage, so the start-threshold - // classifies it correctly; damage-spanning rows are garbage either way.) - if (s > 0 && (charDelta !== 0 || tokenDelta !== 0)) { - const charThresh = oOff[dOldEnd]; + const charThresh = s > 0 ? 
oOff[dOldEnd] : 0x7fffffff; + const nN2 = nN; + const oN2 = oN;`} + // Re-base the old arena in place: rows starting at/after the first kept-suffix + // token's OLD offset shift by charDelta; reused leaf entries past the damage shift + // by tokenDelta. (A reusable subtree lies entirely on one side of the damage; rows + // spanning it are unreachable garbage either way.) + if (dOldEnd < oN2 && (charDelta !== 0 || tokenDelta !== 0)) { if (charDelta !== 0) { for (let i = 0; i < nodeN; i++) if (rowOff[i] >= charThresh) rowOff[i] += charDelta; } @@ -2227,18 +2311,12 @@ ${e.soa ? '' : ' altText = oText;'} } } } - // Carry the memo across: prefix entries whose lookahead never reached the damage stay - // at their index; suffix entries move by tokenDelta (ids reference the re-based rows). - // tokenDelta === 0 (the common keystroke: editing within a token) mutates IN PLACE — - // no per-rule array allocation; only the damage window and the prefix entries whose - // extent crossed into it are cleared. + // Carry the memo across: prefix entries whose lookahead never reached the damage + // stay; suffix entries shift by tokenDelta; the damage window drops. for (let r = 0; r < MEMO_RULES; r++) { const me = memoEnd[r]; if (me === undefined) continue; const mn = memoNode[r], mx = memoExt[r]; - // prefix entries whose lookahead may have crossed into the damage die in place - // (mx is the advance watermark; reads run up to two tokens past it: the stop - // token and the SECOND-token dispatch probe) for (let i = 0; i < p; i++) { if (me[i] !== undefined && mx[i] + 2 > p) { me[i] = undefined; mn[i] = undefined; mx[i] = undefined; } } @@ -2248,15 +2326,12 @@ ${e.soa ? '' : ' altText = oText;'} } continue; } - // token count changed: rebuild the rule's arrays sparsely (measured FASTER than an - // in-place direction-aware shift — writing undefined through the holes materializes - // them; fresh holey arrays skip that entirely). 
- const nme = new Array(nN + 1), nmn = new Array(nN + 1), nmx = new Array(nN + 1); - const pCap = p < nN + 1 ? p : nN + 1; + const nme = new Array(nN2 + 1), nmn = new Array(nN2 + 1), nmx = new Array(nN2 + 1); + const pCap = p < nN2 + 1 ? p : nN2 + 1; for (let i = 0; i < pCap; i++) { if (me[i] !== undefined) { nme[i] = me[i]; nmn[i] = mn[i]; nmx[i] = mx[i]; } } - for (let i = dOldEnd; i <= oN; i++) { + for (let i = dOldEnd; i <= oN2; i++) { if (me[i] !== undefined) { const j = i + tokenDelta; nme[j] = me[i] + tokenDelta; nmn[j] = mn[i]; nmx[j] = mx[i] + tokenDelta; From 1c773ca1fc4674da0f354aa05bfecef51a5d7615 Mon Sep 17 00:00:00 2001 From: Johnson Chu Date: Thu, 11 Jun 2026 03:39:35 +0800 Subject: [PATCH 03/15] Depth-tolerant relex restart: reconstruct the live paren stack at the anchor MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The restart anchor no longer requires paren depth zero — inside an all-wrapping IIFE (typescript.js's 8.9MB bundle) no interior token has it, so the anchor fell back to the file head and the window re-lexed half the file. A '(' token now records its control-head-ness in tkFl as flag mask 8 (bit 3, alongside the existing 1/2/4 flags), and reconstructParens rebuilds the live stack enclosing the anchor by a backward scan: the first '(' recording exactly depth d is the live opener of level d (closed openers at that depth are re-opened later, and the re-opener comes first backward). The anchor still requires template depth zero (interp brace counters are not reconstructable) and additionally must not be a control KEYWORD — a '(' lexed first in the window would mis-derive its head-ness from a missing predecessor.
Two boundary bugs found by measurement on the way: lastDp/lastPd (the "depth before the damage" baseline that resync compares against) must initialize from the ANCHOR's depths, not zero — with the anchor adjacent to the edit there are no pre-damage pushes to set them, the baseline froze at zero, resync never fired inside the IIFE and the window ran to EOF (306ms edits, worse than the depth-0 restart it replaced); and tokenize() must keep returning tokN (the lexer-equality gate consumes it). incremental ≡ fresh 0/120 mismatches; lexer streams, reject messages, and the 18,805-file byte-identical gate all green; 30/30. 9MB keystroke edits land at ~120-160ms (machine-thermal band), now dominated by the named O(n) bookkeeping (memo prefix scans ~9M iterations/edit, arena re-base, suffix span shift, char-diff scans) — the green {rel,len} + cursor-adoption milestones' targets. --- src/emit-lexer.ts | 35 +++++++++++++++++++++++++++++------ src/emit-parser.ts | 3 ++- 2 files changed, 31 insertions(+), 7 deletions(-) diff --git a/src/emit-lexer.ts b/src/emit-lexer.ts index 9dd85d0..738b529 100644 --- a/src/emit-lexer.ts +++ b/src/emit-lexer.ts @@ -210,14 +210,15 @@ export function emitLexer(grammar: CstGrammar, st: LexerSymtab): string | null { emit(`// old token (same k/t, offsets shifted by wndDelta, both depth records 0) while`); emit(`// the window's own stacks are empty — returns that OLD index (the duplicate push`); emit(`// is retracted), or -1 when lexing ran to EOF.`); - emit(`function lexCore(source, startPos, pvK, pvT, wndPtr0, wndMinOff, wndDelta, wndCs) {`); + emit(`function lexCore(source, startPos, pvK, pvT, wndPtr0, wndMinOff, wndDelta, wndCs, initParens) {`); emit(` const n = source.length;`); emit(` let pos = startPos;`); emit(` let pendingNl = false;`); + emit(` let extraFl = 0;`); emit(` let lastBangWasPostfix = false;`); emit(` let lastCloseWasParenHead = false;`); emit(` const templateStack = [];`); - emit(` const parenHeadStack = [];`); + emit(` const 
parenHeadStack = initParens !== undefined && initParens !== null ? initParens : [];`); emit(` let wndPtr = wndPtr0;`); emit(` let wndHit = -1;`); emit(` // stack depths as of the last token fully BEFORE the damage: a resync point may`); @@ -225,11 +226,12 @@ export function emitLexer(grammar: CstGrammar, st: LexerSymtab): string | null { emit(` // the damage (the prefix agrees byte-for-byte, so those stack entries agree too;`); emit(` // anything opened inside the damage could differ in control-head-ness).`); emit(` let dmgDp = -1, dmgPd = -1;`); - emit(` let lastDp = 0, lastPd = 0;`); + emit(` let lastDp = templateStack.length, lastPd = parenHeadStack.length;`); emit(` function tkPush(k, t, off, end) {`); emit(` if (tokN === tkCap) growTok();`); emit(` tkK[tokN] = k; tkT[tokN] = t; tkOff[tokN] = off; tkEnd[tokN] = end;`); - emit(` tkFl[tokN] = pendingNl ? 1 : 0;`); + emit(` tkFl[tokN] = (pendingNl ? 1 : 0) | extraFl;`); + emit(` extraFl = 0;`); emit(` tkDp[tokN] = templateStack.length;`); emit(` tkPd[tokN] = parenHeadStack.length;`); emit(` pendingNl = false;`); @@ -372,7 +374,9 @@ export function emitLexer(grammar: CstGrammar, st: LexerSymtab): string | null { // Chars 1..len-1 already known to match when this leaf is reached via the chain below. if (lit === '(') { emit(`${ind}{ const isMemberName = tokN >= 2 && LX_MEMBER[tkT[tokN - 2]] !== 0;`); - emit(`${ind} parenHeadStack.push(!isMemberName && tokN >= 1 && tkK[tokN - 1] === ${kIdent} && LX_PARENKW[tkT[tokN - 1]] !== 0); }`); + emit(`${ind} const _ph = !isMemberName && tokN >= 1 && tkK[tokN - 1] === ${kIdent} && LX_PARENKW[tkT[tokN - 1]] !== 0;`); + emit(`${ind} parenHeadStack.push(_ph);`); + emit(`${ind} extraFl = _ph ? 8 : 0; }`); } else if (lit === ')') { emit(`${ind}lastCloseWasParenHead = parenHeadStack.pop() ?? 
false;`); } @@ -511,10 +515,29 @@ export function emitLexer(grammar: CstGrammar, st: LexerSymtab): string | null { emit(` let lo = 0, hi = tokN;`); emit(` while (lo < hi) { const mid = (lo + hi) >> 1; if (tkEnd[mid] <= cs) lo = mid + 1; else hi = mid; }`); emit(` for (let b = lo - 1; b >= 0; b--) {`); - emit(` if (tkDp[b] === 0 && tkPd[b] === 0 && LX_PFXV[tkT[b]] === 0 && !(tkK[b] === 1 && tkT[b] === ${tRParen})) return b;`); + emit(` // template depth must be zero (interp brace counters are not reconstructable),`); + emit(` // and the anchor token must leave no cross-token lexer flag live: not a`); + emit(` // control-head ')', not a postfix-ambiguous op, and not a control KEYWORD`); + emit(` // (a '(' lexed first in the window would mis-derive its head-ness from a`); + emit(` // missing predecessor). Paren depth may be anything — the live stack is`); + emit(` // reconstructed from the recorded depths and the '(' head bits.`); + emit(` if (tkDp[b] === 0 && LX_PFXV[tkT[b]] === 0 && LX_PARENKW[tkT[b]] === 0 && !(tkK[b] === 1 && tkT[b] === ${tRParen})) return b;`); emit(` }`); emit(` return -1;`); emit(`}`); + emit(`// Rebuild the live paren-head stack enclosing token b: scanning backward, the`); + emit(`// first '(' recording exactly depth d is the live opener of level d (closed`); + emit(`// openers at that depth are re-opened later, and the re-opener comes first`); + emit(`// backward). The '(' records its depth INCLUDING itself, and carries its`); + emit(`// control-head-ness as tkFl bit 8.`); + emit(`function reconstructParens(b) {`); + emit(` let need = b >= 0 ? 
tkPd[b] : 0;`); + emit(` const out = new Array(need);`); + emit(` for (let i = b; i >= 0 && need > 0; i--) {`); + emit(` if (tkK[i] === 1 && tkT[i] === ${tOf('(')} && tkPd[i] === need) { out[need - 1] = (tkFl[i] & 8) !== 0; need--; }`); + emit(` }`); + emit(` return out;`); + emit(`}`); return out.join('\n'); } diff --git a/src/emit-parser.ts b/src/emit-parser.ts index 9a0ddac..993f6de 100644 --- a/src/emit-parser.ts +++ b/src/emit-parser.ts @@ -2214,6 +2214,7 @@ ${e.soa ? String.raw` // ── M1: WINDOWED re-lex ── // depths are zero and whose shape carries no cross-token lexer flag (')' control- // head, postfix-ambiguous op). B = -1 restarts at the file head — always sound. const B = findRestart(cs); + const initParens = B >= 0 ? reconstructParens(B) : []; const oN = tokN; // first old token at/after the damage end — the resync search floor let r0 = oN; @@ -2232,7 +2233,7 @@ ${e.soa ? String.raw` // ── M1: WINDOWED re-lex ── src = source; tokN = 0; const startOff = B >= 0 ? altEnd[B] : 0; - const R0 = lexCore(source, startOff, B >= 0 ? altK[B] : -1, B >= 0 ? altT[B] : 0, r0, ceNew, charDelta, cs); + const R0 = lexCore(source, startOff, B >= 0 ? altK[B] : -1, B >= 0 ? altT[B] : 0, r0, ceNew, charDelta, cs, initParens); const W = tokN; const R = R0 >= 0 ? R0 : oN; swapBuffers(); // live = OLD stream again; window sits in the alt buffers From 7d7fbc5a5011e01fbac3b1fb4b3cb37051836127 Mon Sep 17 00:00:00 2001 From: Johnson Chu Date: Thu, 11 Jun 2026 03:55:11 +0800 Subject: [PATCH 04/15] Green tree (M3): relative coordinates on parent edges; re-base loops vanish MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Nodes no longer store absolute positions. 
A row owns only its LENGTHS (rowLen chars, rowTokLen tokens); a node child's relative coordinates (kidRel chars, kidTokRel tokens, both against the parent's start) live in the PARENT's kids stream, parallel to the entries — NOT on the child row: a memo-reused subtree can be a child of several longest-match CANDIDATES, and a losing candidate completing after the winner would clobber child-side rel fields (928 corpus mismatches before the edge-ownership fix). Leaf entries are node-relative token indices. The red layer is a descent: visit/toObject thread (charBase, tokBase); leaf spans come from the token columns at tokBase + rel. Build stays absolute in TRANSIENT per-row coordinates (absChar/absTok), written at finishNode/finishWrap, read by the enclosing parent, never part of the green tree. A memo HIT refreshes the reused root's transients to the current stream in O(1) — which is the whole point: - the arena re-base loops (rowOff O(nodes), kids O(kids) per edit) are GONE; - the '>'-splice kids/scratch fixup is GONE (completed rows lie wholly before the splice point; the carried memo is cleared); - a reused subtree needs zero rewriting at any depth. Matchers thread one tokBase parameter (leaf spans come from the token columns, so they never need charBase); the totality gate's visit supplies it. The ts-ast lowering moves to the INTERPRETER oracle through a new object-tree TreeAccess adapter (test/obj-tree.ts, absolute coordinates, tokBase ignored) — the grammar↔tsc structure contract is engine-independent, and the lowering needed zero semantic changes. 18,802/18,805 emit ≡ interp byte-identical (toObject reproduces absolute objects exactly); reject messages exact; incremental ≡ fresh 0/120 with the mixed session at 1.69× (best yet); totality 32,167 nodes / 0 misses; 30/30. 9MB keystrokes ~148ms — now dominated by the memo prefix scans, the suffix span shift, and the char-diff scans (the cursor-adoption and chunked-column milestones' targets). 
--- src/emit-parser.ts | 196 ++++++++++++++++++++++++------------- src/gen-cst-match.ts | 51 +++++----- test/cst-match-totality.ts | 14 +-- test/obj-tree.ts | 78 +++++++++++++++ test/ts-ast-lowering.ts | 9 +- test/ts-ast-verify.ts | 23 ++--- 6 files changed, 251 insertions(+), 120 deletions(-) create mode 100644 test/obj-tree.ts diff --git a/src/emit-parser.ts b/src/emit-parser.ts index 993f6de..b0446e4 100644 --- a/src/emit-parser.ts +++ b/src/emit-parser.ts @@ -1360,13 +1360,24 @@ function growTok() { // Rows store ABSOLUTE offsets in this phase (the green {rel,len} re-base is the // incremental round's move; flipping the stored form regenerates matchers only). let rowRule = new Uint16Array(8192); // rule id (index into RULE_NAMES) -let rowOff = new Int32Array(8192); // absolute start offset let rowLen = new Int32Array(8192); +let rowTokLen = new Int32Array(8192); // subtree token count let rowStart = new Int32Array(8192); // first index into kids let rowCount = new Int32Array(8192); +// transient BUILD coordinates (absolute), valid for rows completed in the current +// parse and REFRESHED at memo-hit time for reused roots — parents read them at +// finishNode to write the children's relative fields; never part of the green tree. +let absChar = new Int32Array(8192); +let absTok = new Int32Array(8192); let rowCap = 8192; let nodeN = 0; let kids = new Int32Array(16384); +// A node child's RELATIVE coordinates live in the PARENT's kids stream (parallel to +// kids), not on the child row: a memo-reused subtree can be a child of several +// longest-match CANDIDATES, and a losing candidate completing after the winner would +// clobber child-side rel fields. The parent owns its edges; rows own only lengths. +let kidRel = new Int32Array(16384); +let kidTokRel = new Int32Array(16384); let kidCap = 16384; let kidN = 0; // Scratch: completed-but-unattached children of in-progress arms. 
Every @@ -1377,21 +1388,27 @@ let scn = 0; function growRows() { rowCap *= 2; const r = new Uint16Array(rowCap); r.set(rowRule); rowRule = r; - const o = new Int32Array(rowCap); o.set(rowOff); rowOff = o; const l = new Int32Array(rowCap); l.set(rowLen); rowLen = l; + const tl = new Int32Array(rowCap); tl.set(rowTokLen); rowTokLen = tl; const s = new Int32Array(rowCap); s.set(rowStart); rowStart = s; const c = new Int32Array(rowCap); c.set(rowCount); rowCount = c; + const ac = new Int32Array(rowCap); ac.set(absChar); absChar = ac; + const at = new Int32Array(rowCap); at.set(absTok); absTok = at; } function growKids(n) { while (kidN + n > kidCap) kidCap *= 2; const k = new Int32Array(kidCap); k.set(kids.subarray(0, kidN)); kids = k; + const r = new Int32Array(kidCap); r.set(kidRel.subarray(0, kidN)); kidRel = r; + const t = new Int32Array(kidCap); t.set(kidTokRel.subarray(0, kidN)); kidTokRel = t; } function scPush(e) { if (scn === scCap) { scCap *= 2; const s = new Int32Array(scCap); s.set(sc); sc = s; } sc[scn++] = e; } -function entryOff(e) { return e >= 0 ? rowOff[e] : tkOff[(~e) >>> 2]; } -function entryEnd(e) { return e >= 0 ? rowOff[e] + rowLen[e] : tkEnd[(~e) >>> 2]; } +function entryOff(e) { return e >= 0 ? absChar[e] : tkOff[(~e) >>> 2]; } +function entryEnd(e) { return e >= 0 ? absChar[e] + rowLen[e] : tkEnd[(~e) >>> 2]; } +function entryTok(e) { return e >= 0 ? absTok[e] : (~e) >>> 2; } +function entryTokEnd(e) { return e >= 0 ? absTok[e] + rowTokLen[e] : ((~e) >>> 2) + 1; } // Complete a node whose children are scratch[mark..scn): copy them into kids, write // the row, truncate scratch, return the id. Empty children = a zero-width node // at the current token (the old offset() rule). 
@@ -1399,20 +1416,37 @@ function finishNode(rid, mark) { const n = scn - mark; if (nodeN === rowCap) growRows(); const id = nodeN++; - let myOff, myEnd; + let myOff, myEnd, myTok, myTokEnd; if (n > 0) { if (kidN + n > kidCap) growKids(n); const ks = kidN; - for (let i = 0; i < n; i++) kids[ks + i] = sc[mark + i]; - kidN += n; - rowStart[id] = ks; myOff = entryOff(sc[mark]); myEnd = entryEnd(sc[scn - 1]); + myTok = entryTok(sc[mark]); + myTokEnd = entryTokEnd(sc[scn - 1]); + // GREEN conversion: scratch entries carry ABSOLUTE coordinates; the kids span is + // written position-independent — a leaf becomes node-relative-token-encoded, a + // child node gets its rel fields written here (its own row knows only lengths). + for (let i = 0; i < n; i++) { + const e = sc[mark + i]; + if (e < 0) { + kids[ks + i] = ~(((((~e) >>> 2) - myTok) << 2) | ((~e) & 3)); + } else { + kids[ks + i] = e; + kidRel[ks + i] = absChar[e] - myOff; + kidTokRel[ks + i] = absTok[e] - myTok; + } + } + kidN += n; + rowStart[id] = ks; } else { rowStart[id] = kidN; myOff = offset(); myEnd = myOff; + myTok = pos; myTokEnd = pos; } - rowRule[id] = rid; rowOff[id] = myOff; rowLen[id] = myEnd - myOff; rowCount[id] = n; + rowRule[id] = rid; rowLen[id] = myEnd - myOff; rowCount[id] = n; + rowTokLen[id] = myTokEnd - myTok; + absChar[id] = myOff; absTok[id] = myTok; scn = mark; return id; } @@ -1423,13 +1457,28 @@ function finishWrap(rid, lhsId, mark) { const id = nodeN++; if (kidN + n + 1 > kidCap) growKids(n + 1); const ks = kidN; + const myOff = absChar[lhsId]; + const myTok = absTok[lhsId]; + const myEnd = n > 0 ? entryEnd(sc[scn - 1]) : myOff + rowLen[lhsId]; + const myTokEnd = n > 0 ? 
entryTokEnd(sc[scn - 1]) : myTok + rowTokLen[lhsId]; kids[ks] = lhsId; - for (let i = 0; i < n; i++) kids[ks + 1 + i] = sc[mark + i]; + kidRel[ks] = 0; + kidTokRel[ks] = 0; + for (let i = 0; i < n; i++) { + const e = sc[mark + i]; + if (e < 0) { + kids[ks + 1 + i] = ~(((((~e) >>> 2) - myTok) << 2) | ((~e) & 3)); + } else { + kids[ks + 1 + i] = e; + kidRel[ks + 1 + i] = absChar[e] - myOff; + kidTokRel[ks + 1 + i] = absTok[e] - myTok; + } + } kidN += n + 1; - const myOff = rowOff[lhsId]; - const myEnd = n > 0 ? entryEnd(sc[scn - 1]) : rowOff[lhsId] + rowLen[lhsId]; - rowRule[id] = rid; rowOff[id] = myOff; rowLen[id] = myEnd - myOff; + rowRule[id] = rid; rowLen[id] = myEnd - myOff; rowStart[id] = ks; rowCount[id] = n + 1; + rowTokLen[id] = myTokEnd - myTok; + absChar[id] = myOff; absTok[id] = myTok; scn = mark; return id; } @@ -1510,18 +1559,9 @@ function matchPuLitGT(pu) { memoNode.fill(undefined); memoEnd.fill(undefined); memoExt.fill(undefined); - // Leaf entries reference tokens BY INDEX, so the splice's +1 shift must be applied - // to every committed/scratch entry past the split point. (Object trees were immune — - // leaves copied their spans; the arena trades that copy for this rare O(kidN) pass. - // Entries AT pos can't exist: that token is being consumed right now.) - for (let i = 0; i < kidN; i++) { - const ke = kids[i]; - if (ke < 0 && ((~ke) >>> 2) > pos) kids[i] = ke - 4; - } - for (let i = 0; i < scn; i++) { - const se = sc[i]; - if (se < 0 && ((~se) >>> 2) > pos) sc[i] = se - 4; - } + // GREEN tree: no kids/scratch fixup — every completed row and scratch entry lies + // wholly BEFORE the splice point (token pos is being consumed right now), and the + // carried memo was just cleared, so nothing reachable references shifted indices. 
scPush(~(pos << 2)); if (++pos > maxPos) maxPos = pos; return true; @@ -1821,7 +1861,7 @@ function emitPrattRule(e: Emitter, a: ReturnType, rule: RuleDecl e.emit(` if (NOUNARY_T[tkT[pos]] !== 0 && rowCount[lhs] > 0) {`); e.emit(` const _h = kids[rowStart[lhs]];`); e.emit(` if (_h < 0 && ((~_h) & 3) === 2) {`); - e.emit(` const _ht = (~_h) >>> 2;`); + e.emit(` const _ht = absTok[lhs] + ((~_h) >>> 2);`); e.emit(` const _htext = ${e.soa ? 'src.slice(tkOff[_ht], tkEnd[_ht])' : 'tkText[_ht]'};`); e.emit(` if (prefixOps.has(_htext) && !postfixOpValues.has(_htext)) { return -1; }`); e.emit(` }`); @@ -1972,7 +2012,15 @@ function parseRuleEntry(idx, name, core) { const ex = mx[start]; if (ex > maxPos) maxPos = ex; const id = mn[start]; - if (id >= 0) { scPush(id); return true; } + if (id >= 0) { + // refresh the reused root's transient BUILD coordinates to the current stream + // (its green internals are position-independent; only the attachment point — + // what the enclosing finishNode reads — must be current). + absTok[id] = start; + absChar[id] = tkOff[start]; + scPush(id); + return true; + } return false; } } @@ -2036,8 +2084,8 @@ export function getText(node, source) { // The arena IS the tree: parse() returns the root node id and consumers traverse // via visit()/the accessors — nothing is materialized on the parse path. All views // are valid until the NEXT parse (the columns are reused). -function leafTokenType(entry) { - const tok = (~entry) >>> 2; +function leafTokenType(entry, tokBase) { + const tok = tokBase + ((~entry) >>> 2); const kind = (~entry) & 3; return kind === 1 ? '$keyword' : kind === 2 ? '$operator' @@ -2046,11 +2094,21 @@ function leafTokenType(entry) { } // Raw arena accessors. An ENTRY is a node id (>= 0) or a leaf (< 0, token-encoded); // offsetOf/endOf/textOf accept either. 
+// GREEN accessors: positions are RELATIVE — a node knows (rel, len) against its +// parent and (tokRel, tokLen) in tokens; consumers descend with (charBase, tokBase) +// — the node's own absolute start coordinates. Leaf spans come from the token +// columns at tokBase + the entry's node-relative token index. export const tree = { ruleNameOf: (id) => RULE_NAMES[rowRule[id]], ruleIdOf: (id) => rowRule[id], - offsetOf: (entry) => entry >= 0 ? rowOff[entry] : tkOff[(~entry) >>> 2], - endOf: (entry) => entry >= 0 ? rowOff[entry] + rowLen[entry] : tkEnd[(~entry) >>> 2], + lenOf: (id) => rowLen[id], + tokLenOf: (id) => rowTokLen[id], + // a node CHILD's relative coordinates live on the parent edge (kids-parallel) + childRelAt: (id, i) => kidRel[rowStart[id] + i], + childTokRelAt: (id, i) => kidTokRel[rowStart[id] + i], + // base-threaded spans: nodes from their bases, leaves from the token columns + offsetOf: (entry, charBase, tokBase) => entry >= 0 ? charBase : tkOff[tokBase + ((~entry) >>> 2)], + endOf: (entry, charBase, tokBase) => entry >= 0 ? charBase + rowLen[entry] : tkEnd[tokBase + ((~entry) >>> 2)], childCount: (id) => rowCount[id], childAt: (id, i) => kids[rowStart[id] + i], // Bulk child load into a caller-owned array; returns the count. One call per node @@ -2062,40 +2120,51 @@ export const tree = { return n2; }, isLeaf: (entry) => entry < 0, - leafToken: (entry) => (~entry) >>> 2, + leafToken: (entry, tokBase) => tokBase + ((~entry) >>> 2), leafTokenType, // Int-world leaf accessors (the match-path encoding): kind bits — 0 type-derived, // 1 '$keyword', 2 '$operator' — and the token's TYPE kind int (1 = punctuation). leafKindOf: (entry) => (~entry) & 3, - leafTokKindOf: (entry) => tkK[(~entry) >>> 2], - textOf: (entry, source) => entry >= 0 - ? 
source.slice(rowOff[entry], rowOff[entry] + rowLen[entry]) - : source.slice(tkOff[(~entry) >>> 2], tkEnd[(~entry) >>> 2]), + leafTokKindOf: (entry, tokBase) => tkK[tokBase + ((~entry) >>> 2)], + leafOffsetOf: (entry, tokBase) => tkOff[tokBase + ((~entry) >>> 2)], + leafEndOf: (entry, tokBase) => tkEnd[tokBase + ((~entry) >>> 2)], + textOf: (entry, source, charBase, tokBase) => entry >= 0 + ? source.slice(charBase, charBase + rowLen[entry]) + : source.slice(tkOff[tokBase + ((~entry) >>> 2)], tkEnd[tokBase + ((~entry) >>> 2)]), }; // Depth-first traversal from a node id or leaf entry: // enter(id) — each NODE before its children; return false to skip its subtree // leave(id) — each node after its children // leaf(entry, tok) — each leaf (tok = its token index) -export function visit(entry, fns) { - if (entry < 0) { if (fns.leaf) fns.leaf(entry, (~entry) >>> 2); return; } - if (fns.enter && fns.enter(entry) === false) return; +// Depth-first traversal threading the RED coordinates: enter/leave receive the +// node's absolute (charBase, tokBase); leaf receives its absolute token index. +// Call with the root only — the bases default from the root's rel fields. 
+export function visit(entry, fns, charBase, tokBase) { + if (charBase === undefined) { charBase = rootCharBase; tokBase = rootTokBase; } + if (entry < 0) { if (fns.leaf) fns.leaf(entry, tokBase + ((~entry) >>> 2)); return; } + if (fns.enter && fns.enter(entry, charBase, tokBase) === false) return; const n = rowCount[entry]; const cs = rowStart[entry]; - for (let i = 0; i < n; i++) visit(kids[cs + i], fns); - if (fns.leave) fns.leave(entry); + for (let i = 0; i < n; i++) { + const e = kids[cs + i]; + if (e < 0) { if (fns.leaf) fns.leaf(e, tokBase + ((~e) >>> 2)); } + else visit(e, fns, charBase + kidRel[cs + i], tokBase + kidTokRel[cs + i]); + } + if (fns.leave) fns.leave(entry, charBase, tokBase); } // Materialize the classic object CST from a node id — a BRIDGE for tests/debugging // (the byte-identical gate against the interpreter), not a parse-path product. -export function toObject(id) { +export function toObject(id, charBase, tokBase) { + if (charBase === undefined) { charBase = rootCharBase; tokBase = rootTokBase; } const n = rowCount[id]; const cs = rowStart[id]; const children = new Array(n); for (let i = 0; i < n; i++) { const entry = kids[cs + i]; - children[i] = entry >= 0 ? toObject(entry) - : { tokenType: leafTokenType(entry), offset: tkOff[(~entry) >>> 2], end: tkEnd[(~entry) >>> 2] }; + children[i] = entry >= 0 ? toObject(entry, charBase + kidRel[cs + i], tokBase + kidTokRel[cs + i]) + : { tokenType: leafTokenType(entry, tokBase), offset: tkOff[tokBase + ((~entry) >>> 2)], end: tkEnd[tokBase + ((~entry) >>> 2)] }; } - return { rule: RULE_NAMES[rowRule[id]], children, offset: rowOff[id], end: rowOff[id] + rowLen[id] }; + return { rule: RULE_NAMES[rowRule[id]], children, offset: charBase, end: charBase + rowLen[id] }; } // Parse to the ARENA: returns the root node id. @@ -2134,7 +2203,9 @@ function runParse(entryRule) { const entry = entryRule ?? ENTRY; if (tokN === 0) { const rid = RULE_NAMES.indexOf(entry); - return finishNode(rid < 0 ? 
0 : rid, scn); + const er = finishNode(rid < 0 ? 0 : rid, scn); + rootCharBase = absChar[er]; rootTokBase = absTok[er]; + return er; } if (!RULES[entry]()) { const hasTok = pos < cap; @@ -2143,13 +2214,18 @@ function runParse(entryRule) { if (pos < tokN) { throw new Error('Parse error at offset ' + tkOff[pos] + ": unexpected '" + tokTextAt(pos) + "' after successful parse" + farthest(pos)); } - return sc[--scn]; + const rootId = sc[--scn]; + rootCharBase = absChar[rootId]; rootTokBase = absTok[rootId]; + return rootId; } // Source of the last COMPLETED parse — the token columns, arena and memo describe it. // null whenever the module state is not a coherent snapshot (no parse yet, or the last // attempt threw), so parseEdited falls back to a full parse. let lastSrc = null; +// the LAST parse root's absolute coordinates (the descent origin — see visit/toObject) +let rootCharBase = 0; +let rootTokBase = 0; // The spare token-column buffer set (parseEdited ping-pongs between the live set and // this one, so steady-state edits never allocate columns). let altK = null, altT = null, altOff = null, altEnd = null, altFl = null, altDp = null, altPd = null; @@ -2247,7 +2323,6 @@ ${e.soa ? String.raw` // ── M1: WINDOWED re-lex ── } const dOldEnd = R; const tokenDelta = (B + 1 + W) - R; - const charThresh = R < oN ? tkOff[R] : 0x7fffffff; // ── splice: old[0..B] + window[0..W) + old[R..oN), then shift the suffix spans ── const nN = B + 1 + W + (oN - R); while (tkCap < nN + 1) growTok(); @@ -2263,8 +2338,7 @@ ${e.soa ? 
String.raw` // ── M1: WINDOWED re-lex ── for (let i = B + 1 + W; i < nN; i++) { tkOff[i] += charDelta; tkEnd[i] += charDelta; } } tokN = nN; - const nN2 = nN; - const oN2 = oN;` : String.raw` // (fallback-lexer grammars keep the full-relex + token-diff path) + const nN2 = nN;` : String.raw` // (fallback-lexer grammars keep the full-relex + token-diff path) const oK = tkK, oT = tkT, oOff = tkOff, oEnd = tkEnd, oFl = tkFl, oN = tokN; const oText = tkText; if (altK === null || altK.length !== tkCap) { @@ -2293,25 +2367,7 @@ ${e.soa ? String.raw` // ── M1: WINDOWED re-lex ── } const dOldEnd = oN - s; const tokenDelta = nN - oN; - const charThresh = s > 0 ? oOff[dOldEnd] : 0x7fffffff; - const nN2 = nN; - const oN2 = oN;`} - // Re-base the old arena in place: rows starting at/after the first kept-suffix - // token's OLD offset shift by charDelta; reused leaf entries past the damage shift - // by tokenDelta. (A reusable subtree lies entirely on one side of the damage; rows - // spanning it are unreachable garbage either way.) - if (dOldEnd < oN2 && (charDelta !== 0 || tokenDelta !== 0)) { - if (charDelta !== 0) { - for (let i = 0; i < nodeN; i++) if (rowOff[i] >= charThresh) rowOff[i] += charDelta; - } - if (tokenDelta !== 0) { - const eShift = tokenDelta << 2; - for (let i = 0; i < kidN; i++) { - const e = kids[i]; - if (e < 0 && ((~e) >>> 2) >= dOldEnd) kids[i] = e - eShift; - } - } - } + const nN2 = nN;`} // Carry the memo across: prefix entries whose lookahead never reached the damage // stay; suffix entries shift by tokenDelta; the damage window drops. for (let r = 0; r < MEMO_RULES; r++) { @@ -2332,7 +2388,7 @@ ${e.soa ? 
String.raw` // ── M1: WINDOWED re-lex ── for (let i = 0; i < pCap; i++) { if (me[i] !== undefined) { nme[i] = me[i]; nmn[i] = mn[i]; nmx[i] = mx[i]; } } - for (let i = dOldEnd; i <= oN2; i++) { + for (let i = dOldEnd; i <= oN; i++) { if (me[i] !== undefined) { const j = i + tokenDelta; nme[j] = me[i] + tokenDelta; nmn[j] = mn[i]; nmx[j] = mx[i] + tokenDelta; diff --git a/src/gen-cst-match.ts b/src/gen-cst-match.ts index 4e8fa91..daa50ff 100644 --- a/src/gen-cst-match.ts +++ b/src/gen-cst-match.ts @@ -321,7 +321,7 @@ export function generateCstMatch(grammar: CstGrammar, importFrom: string): strin return c.field === c.name ? c.name : `${c.field}: ${c.name}`; }); w(` return { arm: ${J(plan.name)}${fields.length ? ', ' + fields.join(', ') : ''} };`); - emit(`function ${fn}(t: TreeAccess, n: number, cc: number, src: string): ${matchTypeName(rule.name)} | null {`); + emit(`function ${fn}(t: TreeAccess, n: number, cc: number, tb: number, src: string): ${matchTypeName(rule.name)} | null {`); for (const line of body) emit(line); emit(`}`); return fn; @@ -333,7 +333,7 @@ export function generateCstMatch(grammar: CstGrammar, importFrom: string): strin } function litCond(text: string, tt: string): string { - return `__lit(t, cc, i, src, ${J(text)}, ${tt === '$keyword' ? 1 : 0})`; + return `__lit(t, cc, tb, i, src, ${J(text)}, ${tt === '$keyword' ? 
1 : 0})`; } function renderStep(st: Step, w: (s: string) => void, ind: string, fail: () => string): void { @@ -346,7 +346,7 @@ export function generateCstMatch(grammar: CstGrammar, importFrom: string): strin case 'litAlt': { const conds = st.texts.map((t, k) => litCond(t, st.tt[k])); w(`${ind}if (!(${conds.join(' || ')})) ${fail()}`); - if (st.cap) assign(st.cap, `src.slice(t.offsetOf(__SC[i]), t.endOf(__SC[i])) as ${st.cap.tsType}`, w, ind); + if (st.cap) assign(st.cap, `src.slice(t.leafOffsetOf(__SC[i], tb), t.leafEndOf(__SC[i], tb)) as ${st.cap.tsType}`, w, ind); w(`${ind}i++;`); return; } @@ -354,8 +354,8 @@ export function generateCstMatch(grammar: CstGrammar, importFrom: string): strin const cond = st.name === '$operator' ? `__opTok(t, cc, i)` : st.template - ? `__tok(t, cc, i, ${typeKind.get(st.name)}) || __nodeOf(t, cc, i, ${ruleId.get('$template')})` - : `__tok(t, cc, i, ${typeKind.get(st.name)})`; + ? `__tok(t, cc, tb, i, ${typeKind.get(st.name)}) || __nodeOf(t, cc, i, ${ruleId.get('$template')})` + : `__tok(t, cc, tb, i, ${typeKind.get(st.name)})`; w(`${ind}if (!(${cond})) ${fail()}`); if (st.cap) assign(st.cap, `__SC[i] as ${st.cap.tsType}`, w, ind); w(`${ind}i++;`); @@ -564,7 +564,7 @@ export function generateCstMatch(grammar: CstGrammar, importFrom: string): strin // ("always") arms appear in every bucket at their declaration position; the buckets // are superset filters (each arm fn re-checks exactly). const admits = plans.map(p => firstAdmit(p.steps)); - const tryLine = (k: number) => ` { const m = ${fns[k]}(t, n, cc, src); if (m !== null) return m; }`; + const tryLine = (k: number) => ` { const m = ${fns[k]}(t, n, cc, tb, src); if (m !== null) return m; }`; const bucketLines = (pred: (keys: Set) => boolean): string[] => plans.map((_, k) => (admits[k].keys.size === 0 || pred(admits[k].keys) ? 
tryLine(k) : '')) .filter(Boolean); @@ -618,8 +618,8 @@ export function generateCstMatch(grammar: CstGrammar, importFrom: string): strin lines.push(`${pad} break;`); lines.push(`${pad} }`); lines.push(`${pad} }`); - lines.push(`${pad}} else if ((_k1 = t.leafKindOf(e1)) === 1 || (_k1 === 0 && t.leafTokKindOf(e1) === 1)) {`); - lines.push(`${pad} switch (src.charCodeAt(t.offsetOf(e1))) {`); + lines.push(`${pad}} else if ((_k1 = t.leafKindOf(e1)) === 1 || (_k1 === 0 && t.leafTokKindOf(e1, tb) === 1)) {`); + lines.push(`${pad} switch (src.charCodeAt(t.leafOffsetOf(e1, tb))) {`); for (const cc of [...cset].sort((a, b) => a - b)) { lines.push(`${pad} case ${cc}: {`); lines.push(...subTry(i => restAdmit[i]!.keys.has('c:' + cc)).map(l => ' ' + l)); @@ -634,7 +634,7 @@ export function generateCstMatch(grammar: CstGrammar, importFrom: string): strin lines.push(`${pad}} else if (_k1 === 2) {`); lines.push(...subTry(i => restAdmit[i]!.keys.has('t:$operator'))); lines.push(`${pad}} else {`); - lines.push(`${pad} switch (t.leafTokKindOf(e1)) {`); + lines.push(`${pad} switch (t.leafTokKindOf(e1, tb)) {`); for (const t of [...tset].sort()) { if (t === '$operator') continue; // handled by the kind-2 branch above lines.push(`${pad} case ${typeKind.get(t)}: { // ${t}`); @@ -652,7 +652,7 @@ export function generateCstMatch(grammar: CstGrammar, importFrom: string): strin }; const disp: string[] = []; - disp.push(`export function match${sanitizeIdent(rule.name)}(t: TreeAccess, n: NodeEntry<${J(rule.name)}>, src: string): ${tName} {`); + disp.push(`export function match${sanitizeIdent(rule.name)}(t: TreeAccess, n: NodeEntry<${J(rule.name)}>, tb: number, src: string): ${tName} {`); disp.push(` const cc = __load(t, n);`); disp.push(` let e1 = 0; let _k1 = 0;`); disp.push(` if (cc === 0) {`); @@ -681,8 +681,8 @@ export function generateCstMatch(grammar: CstGrammar, importFrom: string): strin } disp.push(` }`); disp.push(` } else { const _k0 = t.leafKindOf(e0);`); - disp.push(` if (_k0 === 1 
|| (_k0 === 0 && t.leafTokKindOf(e0) === 1)) {`); - disp.push(` switch (src.charCodeAt(t.offsetOf(e0))) {`); + disp.push(` if (_k0 === 1 || (_k0 === 0 && t.leafTokKindOf(e0, tb) === 1)) {`); + disp.push(` switch (src.charCodeAt(t.leafOffsetOf(e0, tb))) {`); for (const cc of [...charCodes].sort((a, b) => a - b)) { disp.push(` case ${cc}: {`); for (const l of bucketLines(keys => keys.has('c:' + cc))) disp.push(' ' + l); @@ -699,7 +699,7 @@ export function generateCstMatch(grammar: CstGrammar, importFrom: string): strin disp.push(` } else if (_k0 === 2) {`); for (const l of bucketLines(keys => keys.has('t:$operator'))) disp.push(l); disp.push(` } else {`); - disp.push(` switch (t.leafTokKindOf(e0)) {`); + disp.push(` switch (t.leafTokKindOf(e0, tb)) {`); for (const t of [...tokNames].sort()) { if (t === '$operator') continue; // handled by the kind-2 branch above disp.push(` case ${typeKind.get(t)}: { // ${t}`); @@ -715,7 +715,7 @@ export function generateCstMatch(grammar: CstGrammar, importFrom: string): strin } disp.push(` }`); disp.push(` } } }`); - disp.push(` throw new Error(${J(`match${sanitizeIdent(rule.name)}: no arm matches`)} + ' @' + t.offsetOf(n));`); + disp.push(` throw new Error(${J(`match${sanitizeIdent(rule.name)}: no arm matches`)} + ' @tok' + tb);`); disp.push(`}`); bodyParts.push(disp.join('\n')); matcherMapEntries.push(` ${J(rule.name)}: match${sanitizeIdent(rule.name)},`); @@ -732,11 +732,10 @@ export function generateCstMatch(grammar: CstGrammar, importFrom: string): strin header.push(` childCount(id: number): number;`); header.push(` childAt(id: number, i: number): number;`); header.push(` childrenInto(id: number, out: number[]): number;`); - header.push(` leafTokenType(entry: number): string;`); header.push(` leafKindOf(entry: number): number;`); - header.push(` leafTokKindOf(entry: number): number;`); - header.push(` offsetOf(entry: number): number;`); - header.push(` endOf(entry: number): number;`); + header.push(` leafTokKindOf(entry: 
number, tokBase: number): number;`); + header.push(` leafOffsetOf(entry: number, tokBase: number): number;`); + header.push(` leafEndOf(entry: number, tokBase: number): number;`); header.push(`}`); header.push(`// Branded entry aliases — compile-time discrimination over plain numbers.`); header.push(`export type NodeEntry = number & { readonly __node?: R };`); @@ -747,17 +746,17 @@ export function generateCstMatch(grammar: CstGrammar, importFrom: string): strin header.push(`const __SC: number[] = [];`); header.push(`const __load = (t: TreeAccess, n: number): number => t.childrenInto(n, __SC);`); header.push(`// kind: 1 = '$keyword' (leaf kind bit), 0 = '$punct' (type-derived + tok-kind 1).`); - header.push(`const __lit = (t: TreeAccess, cc: number, i: number, src: string, text: string, kind: number): boolean => {`); + header.push(`const __lit = (t: TreeAccess, cc: number, tb: number, i: number, src: string, text: string, kind: number): boolean => {`); header.push(` if (i >= cc) return false;`); header.push(` const e = __SC[i];`); - header.push(` if (e >= 0 || t.leafKindOf(e) !== kind || (kind === 0 && t.leafTokKindOf(e) !== 1)) return false;`); - header.push(` const off = t.offsetOf(e);`); - header.push(` return t.endOf(e) - off === text.length && src.startsWith(text, off);`); + header.push(` if (e >= 0 || t.leafKindOf(e) !== kind || (kind === 0 && t.leafTokKindOf(e, tb) !== 1)) return false;`); + header.push(` const off = t.leafOffsetOf(e, tb);`); + header.push(` return t.leafEndOf(e, tb) - off === text.length && src.startsWith(text, off);`); header.push(`};`); - header.push(`const __tok = (t: TreeAccess, cc: number, i: number, k: number): boolean => {`); + header.push(`const __tok = (t: TreeAccess, cc: number, tb: number, i: number, k: number): boolean => {`); header.push(` if (i >= cc) return false;`); header.push(` const e = __SC[i];`); - header.push(` return e < 0 && t.leafKindOf(e) === 0 && t.leafTokKindOf(e) === k;`); + header.push(` return e < 0 && 
t.leafKindOf(e) === 0 && t.leafTokKindOf(e, tb) === k;`); header.push(`};`); header.push(`const __opTok = (t: TreeAccess, cc: number, i: number): boolean => {`); header.push(` if (i >= cc) return false;`); @@ -774,11 +773,11 @@ export function generateCstMatch(grammar: CstGrammar, importFrom: string): strin const footer = [ ``, `/** rule name → its matcher (generic walking; the totality gate uses this). */`, - `export const MATCHERS: Record { arm: string }> = {`, + `export const MATCHERS: Record { arm: string }> = {`, ...matcherMapEntries, `};`, `/** rule ID → matcher (the emitted parser's rowRule ids — declaration order). */`, - `export const MATCHERS_BY_ID: ((t: TreeAccess, n: never, src: string) => { arm: string })[] = [`, + `export const MATCHERS_BY_ID: ((t: TreeAccess, n: never, tb: number, src: string) => { arm: string })[] = [`, ...grammar.rules.map(r => ` match${sanitizeIdent(r.name)},`), `];`, ]; diff --git a/test/cst-match-totality.ts b/test/cst-match-totality.ts index fd8a6be..f688cda 100644 --- a/test/cst-match-totality.ts +++ b/test/cst-match-totality.ts @@ -24,23 +24,23 @@ const samples: string[] = []; type Emitted = { parse(src: string, entry?: string): number; - visit(entry: number, fns: { enter?(id: number): boolean | void; leaf?(e: number, tok: number): void }): void; - tree: { ruleNameOf(id: number): string; childCount(id: number): number; childAt(id: number, i: number): number; leafTokenType(e: number): string; offsetOf(e: number): number; endOf(e: number): number }; + visit(entry: number, fns: { enter?(id: number, charBase: number, tokBase: number): boolean | void; leaf?(e: number, tok: number): void }): void; + tree: { ruleNameOf(id: number): string; lenOf(id: number): number }; }; -function checkTree(em: Emitted, root: number, src: string, matchers: Record { arm: string }>, tag: string): void { +function checkTree(em: Emitted, root: number, src: string, matchers: Record { arm: string }>, tag: string): void { em.visit(root, { - enter(id) { + 
enter(id, charBase, tokBase) { const m = matchers[em.tree.ruleNameOf(id)]; if (m !== undefined) { nodes++; try { - m(em.tree as never, id as never, src); + m(em.tree as never, id as never, tokBase, src); } catch (e) { misses++; if (samples.length < 10) { - const off = em.tree.offsetOf(id); - samples.push(`${tag} ${em.tree.ruleNameOf(id)} @${off}..${em.tree.endOf(id)} «${src.slice(off, Math.min(em.tree.endOf(id), off + 50)).replace(/\n/g, '\\n')}» — ${(e as Error).message.slice(0, 60)}`); + const end = charBase + em.tree.lenOf(id); + samples.push(`${tag} ${em.tree.ruleNameOf(id)} @${charBase}..${end} «${src.slice(charBase, Math.min(end, charBase + 50)).replace(/\n/g, '\\n')}» — ${(e as Error).message.slice(0, 60)}`); } } } diff --git a/test/obj-tree.ts b/test/obj-tree.ts new file mode 100644 index 0000000..fee0a70 --- /dev/null +++ b/test/obj-tree.ts @@ -0,0 +1,78 @@ +// A TreeAccess adapter over an INTERPRETER object CST — absolute coordinates, ids +// assigned by one post-order walk. It lets matcher consumers (the ts-ast lowering) +// run against the interp oracle without caring that the EMITTED tree went green +// (relative coordinates): the adapter ignores every tokBase it is handed. +// +// leafTokKindOf is only ever consulted on kind-0 leaves (the generated probes test +// the kind bit first), where the object leaf's tokenType IS the token name (or +// '$punct') — so the name→type-kind map (same derivation as the engine: punct 1, +// template spans 2-4, named tokens from 5 in declaration order) is complete. 
+import type { CstGrammar } from '../src/types.ts'; + +type Leafish = { tokenType: string; offset: number; end: number }; +type Nodeish = { rule: string; children: (Leafish | Nodeish)[]; offset: number; end: number }; + +export interface ObjTree { + rootId: number; + // matcher-facing (TreeAccess-compatible; tokBase params ignored) + ruleNameOf(id: number): string; + ruleIdOf(id: number): number; + childCount(id: number): number; + childAt(id: number, i: number): number; + childrenInto(id: number, out: number[]): number; + leafKindOf(entry: number): number; + leafTokKindOf(entry: number, tokBase?: number): number; + leafOffsetOf(entry: number, tokBase?: number): number; + leafEndOf(entry: number, tokBase?: number): number; + // stateless absolute conveniences (the lowering's toolkit) + offsetOf(entry: number): number; + endOf(entry: number): number; + leafTokenType(entry: number): string; +} + +export function objTree(root: Nodeish, grammar: CstGrammar): ObjTree { + const typeKind = new Map([['', 1], ['$punct', 1], ['$templateHead', 2], ['$templateMiddle', 3], ['$templateTail', 4]]); + { let next = 5; for (const t of grammar.tokens) if (!typeKind.has(t.name)) typeKind.set(t.name, next++); } + const ruleIdM = new Map(grammar.rules.map((r, i) => [r.name, i])); + ruleIdM.set('$template', grammar.rules.length); + + const nodes: Nodeish[] = []; + const leaves: Leafish[] = []; + const kidsOf: number[][] = []; + const walk = (n: Nodeish): number => { + const ks: number[] = []; + for (const c of n.children) { + if ((c as Leafish).tokenType !== undefined) { + const lf = c as Leafish; + const li = leaves.length; + leaves.push(lf); + const kind = lf.tokenType === '$keyword' ? 1 : lf.tokenType === '$operator' ? 
2 : 0; + ks.push(~((li << 2) | kind)); + } else { + ks.push(walk(c as Nodeish)); + } + } + const id = nodes.length; + nodes.push(n); + kidsOf.push(ks); + return id; + }; + const rootId = walk(root); + const leafOf = (e: number) => leaves[(~e) >>> 2]; + + return { + rootId, + ruleNameOf: (id) => nodes[id].rule, + ruleIdOf: (id) => ruleIdM.get(nodes[id].rule) ?? -1, + childCount: (id) => kidsOf[id].length, + childAt: (id, i) => kidsOf[id][i], + childrenInto: (id, out) => { const ks = kidsOf[id]; for (let i = 0; i < ks.length; i++) out[i] = ks[i]; return ks.length; }, + leafKindOf: (e) => (~e) & 3, + leafTokKindOf: (e) => typeKind.get(leafOf(e).tokenType) ?? 0, + leafOffsetOf: (e) => leafOf(e).offset, + leafEndOf: (e) => leafOf(e).end, + offsetOf: (e) => e >= 0 ? nodes[e].offset : leafOf(e).offset, + endOf: (e) => e >= 0 ? nodes[e].end : leafOf(e).end, + leafTokenType: (e) => leafOf(e).tokenType, + }; +} diff --git a/test/ts-ast-lowering.ts b/test/ts-ast-lowering.ts index 06edb48..a5268c7 100644 --- a/test/ts-ast-lowering.ts +++ b/test/ts-ast-lowering.ts @@ -6,7 +6,8 @@ // // Deliberately NOT complete: unlowered constructs throw Unlowered (the verify driver // counts them) — the goal is an honest pain inventory, not a shipped frontend. -import { matchStmt, type TreeAccess } from '../typescript.cst-match.ts'; +import { matchStmt } from '../typescript.cst-match.ts'; +import type { ObjTree } from './obj-tree.ts'; export type Ast = { kind: string; pos: number; end: number; children: Ast[] }; const ast = (kind: string, pos: number, end: number, children: Ast[] = []): Ast => ({ kind, pos, end, children }); @@ -27,7 +28,7 @@ export class Unlowered extends Error { // against undefined, never truthiness. 
type E = number; let SRC = ''; -let T!: TreeAccess; +let T!: ObjTree; const isLeaf = (n: E | undefined): boolean => n !== undefined && n < 0; const isNode = (n: E | undefined): boolean => n !== undefined && n >= 0; const off = (n: E): number => T.offsetOf(n); @@ -510,7 +511,7 @@ function lowerBindingElement(n: E): Ast { // A few arms still reach into kidsOf(n) for the positions of uncaptured structural // keywords ('catch', the switch '{') — a noted destructurer gap. function lowerStmt(n: E): Ast { - const m = matchStmt(T, n as never, SRC); + const m = matchStmt(T as never, n as never, 0, SRC); const c = kidsOf(n); switch (m.arm) { case 'block': return lowerBlock(m.block); @@ -924,7 +925,7 @@ function lowerExport(n: E, c: E[], i: number, mods: Ast[]): Ast { } // ── Entry ── -export function lowerProgram(t: TreeAccess, root: E, source: string): Ast { +export function lowerProgram(t: ObjTree, root: E, source: string): Ast { T = t; SRC = source; const stmts: Ast[] = []; diff --git a/test/ts-ast-verify.ts b/test/ts-ast-verify.ts index 9e630fa..ff33f22 100644 --- a/test/ts-ast-verify.ts +++ b/test/ts-ast-verify.ts @@ -10,19 +10,15 @@ // node test/ts-ast-verify.ts [...] # real files import { existsSync, readFileSync } from 'node:fs'; import ts from 'typescript'; -import { writeFileSync } from 'node:fs'; -import { emitParser } from '../src/emit-parser.ts'; +import { createParser } from '../src/gen-parser.ts'; import { lowerProgram, Unlowered, type Ast } from './ts-ast-lowering.ts'; +import { objTree } from './obj-tree.ts'; -// The lowering consumes the ARENA through TreeAccess, so parse with the emitted -// parser (the product representation) — built fresh from the current grammar. +// The lowering runs against the INTERPRETER oracle through the object-tree adapter +// (absolute coordinates) — the grammar↔tsc structure contract is engine-independent, +// and the emitted tree's green (relative) coordinates stay the emitted gates' concern. 
const grammar = (await import('../typescript.ts')).default; -const emPath = '/tmp/emitted-tsast.mjs'; -writeFileSync(emPath, emitParser(grammar)); -const parser = (await import(emPath + '?v=' + process.pid)) as { - parse(src: string, entry?: string): number; - tree: import('../typescript.cst-match.ts').TreeAccess; -}; +const parser = createParser(grammar); const kindNum = (name: string): number => { const v = (ts.SyntaxKind as unknown as Record)[name]; @@ -72,11 +68,12 @@ function run(name: string, code: string): { ok: boolean; skipped?: boolean; line return { ok: true, skipped: true, line: `${name}: SKIPPED (tsc reports ${probe.parseDiagnostics.length} parse error(s) — recovery shapes are out of contract)`, samples: [] }; } } - let root: number; - try { root = parser.parse(code); } + let rootObj; + try { rootObj = parser.parse(code); } catch (e) { return { ok: false, line: `${name}: MONOGRAM REJECT ${(e as Error).message.slice(0, 60)}`, samples: [] }; } + const adapter = objTree(rootObj as never, grammar); let mine: Ast; - try { mine = lowerProgram(parser.tree, root, code); } + try { mine = lowerProgram(adapter, adapter.rootId, code); } catch (e) { if (e instanceof Unlowered) return { ok: false, line: `${name}: UNLOWERED ${e.what} @${e.at}`, samples: [] }; return { ok: false, line: `${name}: LOWER THROW ${(e as Error).message.slice(0, 80)}`, samples: [] }; From 190e2c5583f4d1bb43fd2560b0572f40a86c68a4 Mon Sep 17 00:00:00 2001 From: Johnson Chu Date: Thu, 11 Jun 2026 04:29:24 +0800 Subject: [PATCH 05/15] Old-tree adoption (M4): reuse via cursor descent; the memo carry is gone MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit An incremental rule entry now asks the PREVIOUS tree first: adoptSeek walks the old root toward the mapped old position (cached containment path + binary search over each node's monotone child starts) and adopts a node when the rule matches, its lookahead gap stays clear of the damage (rowExt — the 
ext-minus-start LENGTH, position-independent like everything green), and the old parse MEMOIZED it (rowOK): a row built under a suppress (no-'in') or parseLimit-capped context is a context-dependent parse, and adoption must not widen the contract the memo carry never offered — skipping that bit produced real divergences (an incremental reject of text the fresh parse accepts). Adoption is STATELESS: nothing is consumed, so PEG backtracking needs no cursor rollback, a node refused under one longest-match candidate can be adopted by the next, and exploratory descent through same-start chains never commits to the cache. On adoption: pos jumps by rowTokLen, the watermark bumps by the gap, the transients refresh — all O(1). The memo becomes purely intra-parse: parseEdited's whole O(rules × n) carry/invalidate machinery (the prefix watermark scans, the sparse rebuilds) is deleted; fresh memo arrays per parse. incremental ≡ fresh 0/120 with the mixed session at 1.91× (best yet); 9MB keystrokes ~121ms; 18,802/18,805 emit ≡ interp byte-identical; reject messages exact; 30/30 gates. --- src/emit-parser.ts | 175 ++++++++++++++++++++++++++++++++++++--------- 1 file changed, 143 insertions(+), 32 deletions(-) diff --git a/src/emit-parser.ts b/src/emit-parser.ts index b0446e4..726bb41 100644 --- a/src/emit-parser.ts +++ b/src/emit-parser.ts @@ -1364,6 +1364,15 @@ let rowLen = new Int32Array(8192); let rowTokLen = new Int32Array(8192); // subtree token count let rowStart = new Int32Array(8192); // first index into kids let rowCount = new Int32Array(8192); +// lookahead GAP: how far past its own first token the node's parse may have READ +// (ext − start, a length — position-independent like everything green). Adoption +// validity across edits compares q + rowExt + slack against the damage start. 
+let rowExt = new Int32Array(8192); +// adoption eligibility: set ONLY where the old parse MEMOIZED the node — a row built +// under a suppress (no-'in') or parseLimit-capped context is a context-dependent +// parse and must never be adopted into a normal entry (the memo carry never stored +// those; adoption must not widen the contract). +let rowOK = new Uint8Array(8192); // transient BUILD coordinates (absolute), valid for rows completed in the current // parse and REFRESHED at memo-hit time for reused roots — parents read them at // finishNode to write the children's relative fields; never part of the green tree. @@ -1392,6 +1401,8 @@ function growRows() { const tl = new Int32Array(rowCap); tl.set(rowTokLen); rowTokLen = tl; const s = new Int32Array(rowCap); s.set(rowStart); rowStart = s; const c = new Int32Array(rowCap); c.set(rowCount); rowCount = c; + const x = new Int32Array(rowCap); x.set(rowExt); rowExt = x; + const ok = new Uint8Array(rowCap); ok.set(rowOK); rowOK = ok; const ac = new Int32Array(rowCap); ac.set(absChar); absChar = ac; const at = new Int32Array(rowCap); at.set(absTok); absTok = at; } @@ -1446,6 +1457,8 @@ function finishNode(rid, mark) { } rowRule[id] = rid; rowLen[id] = myEnd - myOff; rowCount[id] = n; rowTokLen[id] = myTokEnd - myTok; + rowExt[id] = maxPos - myTok; + rowOK[id] = 0; absChar[id] = myOff; absTok[id] = myTok; scn = mark; return id; @@ -1478,6 +1491,8 @@ function finishWrap(rid, lhsId, mark) { rowRule[id] = rid; rowLen[id] = myEnd - myOff; rowStart[id] = ks; rowCount[id] = n + 1; rowTokLen[id] = myTokEnd - myTok; + rowExt[id] = maxPos - myTok; + rowOK[id] = 0; absChar[id] = myOff; absTok[id] = myTok; scn = mark; return id; @@ -1673,7 +1688,7 @@ function emitNonRecRule(e: Emitter, a: ReturnType, rule: RuleDec // push+boolean contract and the memo) and an id-returning core, exactly like the // pratt/left-rec rules. 
if (memoized) { - e.emit(`function ${ruleFn}() { return parseRuleEntry(${e.memoIndex(rule.name)}, ${J(rule.name)}, ${ruleFn}_core); }`); + e.emit(`function ${ruleFn}() { return parseRuleEntry(${e.memoIndex(rule.name)}, ${rid}, ${J(rule.name)}, ${ruleFn}_core); }`); e.emit(`function ${ruleFn}_core(_minBp) {`); } else { e.emit(`function ${ruleFn}() {`); @@ -1713,8 +1728,8 @@ function emitLeftRecRule(e: Emitter, a: ReturnType, rule: RuleDe // suppress wrapper in the interpreter — so currentPrattContext is set to this rule // (the template-interpolation rule resolution depends on it: a `${…}` hole inside a // template-literal TYPE must parse as Type, not the default expression rule). - e.emit(`function ${ruleFn}() { return parseRuleEntry(${e.memoIndex(rule.name)}, ${J(rule.name)}, ${ruleFn}_lr); }`); const rid = a.grammar.rules.indexOf(rule); + e.emit(`function ${ruleFn}() { return parseRuleEntry(${e.memoIndex(rule.name)}, ${rid}, ${J(rule.name)}, ${ruleFn}_lr); }`); e.emit(`function ${ruleFn}_lr(_minBp) {`); e.emit(` const saved = pos; const mark = scn;`); e.emit(` let node = -1; let bestAtomPos = saved;`); @@ -1767,7 +1782,7 @@ function emitPrattRule(e: Emitter, a: ReturnType, rule: RuleDecl // R_() wraps parseRule's memo/context handling, then calls the bp-taking core. const rid = a.grammar.rules.indexOf(rule); - e.emit(`function ${ruleFn}() { return parseRuleEntry(${e.memoIndex(rule.name)}, ${J(rule.name)}, ${ruleFn}_pratt); }`); + e.emit(`function ${ruleFn}() { return parseRuleEntry(${e.memoIndex(rule.name)}, ${rid}, ${J(rule.name)}, ${ruleFn}_pratt); }`); e.emit(`function ${ruleFn}_pratt(minBp) {`); e.emit(` const saved = pos; const mark = scn;`); e.emit(` let lhs = -1; let bestNudPos = saved;`); @@ -1988,7 +2003,7 @@ function emitDriver(e: Emitter, a: ReturnType, entry: string) { // and SECOND-token reads past it. 
Left-to-right parsing keeps the watermark near the // current frontier, so the value is tight on the dominant flow and only OVER- // invalidates (soundly) near big-backtrack clusters. -function parseRuleEntry(idx, name, core) { +function parseRuleEntry(idx, rid, name, core) { const mySup = suppressNext; suppressNext = null; const capped = parseLimit >= 0; @@ -2024,6 +2039,34 @@ function parseRuleEntry(idx, name, core) { return false; } } + if (!mySup && !capped && adoptRoot >= 0) { + // map the new position into OLD token coordinates; inside the damage = no mapping + const q = start < adoptDmgStart ? start + : start >= adoptDmgOldEnd + adoptDelta ? start - adoptDelta : -1; + if (q >= 0) { + const aid = adoptSeek(q, rid); + if (aid >= 0) { + pos = start + rowTokLen[aid]; + const ext = start + rowExt[aid]; + if (ext > maxPos) maxPos = ext; + absTok[aid] = start; + absChar[aid] = tkOff[start]; + if (me === undefined) { + me = new Array(tokN + 1); + mn = new Array(tokN + 1); + mx = new Array(tokN + 1); + memoEnd[idx] = me; + memoNode[idx] = mn; + memoExt[idx] = mx; + } + me[start] = pos; + mn[start] = aid; + mx[start] = maxPos; + scPush(aid); + return true; + } + } + } const prevContext = currentPrattContext; currentPrattContext = name; const prevSup = suppressCur; @@ -2048,6 +2091,7 @@ function parseRuleEntry(idx, name, core) { mn[start] = result; mx[start] = maxPos; // the TRUE probe watermark — the +2 read slack (stop token, // SECOND-token dispatch) is applied at INVALIDATION time + if (result >= 0) rowOK[result] = 1; } if (result >= 0) { scPush(result); return true; } @@ -2226,6 +2270,82 @@ let lastSrc = null; // the LAST parse root's absolute coordinates (the descent origin — see visit/toObject) let rootCharBase = 0; let rootTokBase = 0; + +// ── M4: old-tree ADOPTION (cursor reuse) ── +// During an incremental re-parse, a rule entry first asks the PREVIOUS tree: is there +// an old node of this rule starting at the corresponding old position whose lookahead +// 
stayed clear of the damage? Adoption is STATELESS — nothing is consumed, so PEG +// backtracking needs no cursor rollback, and a node refused under one candidate arm +// can be adopted by the next. The memo stays purely intra-parse. +let lastRoot = -1; // previous parse's root id + its absolute first token +let lastRootTok = 0; +let adoptRoot = -1; // previous root id (-1 = no adoption) +let adoptRootTok = 0; // its absolute first token (old coords) +let adoptDmgStart = 0; // damage window in OLD token coords: [adoptDmgStart, adoptDmgOldEnd) +let adoptDmgOldEnd = 0; +let adoptDelta = 0; // new-minus-old token delta past the damage +// cached descent path (top-down): ids + their absolute old token bases +let adoptPath = []; +let adoptBase = []; +function adoptSeek(q, rid) { + // reuse the cached path while it still CONTAINS q (strictly inside, not at start) + let depth = 0; + while (depth < adoptPath.length) { + const id = adoptPath[depth]; + const b = adoptBase[depth]; + if (b < q && q < b + rowTokLen[id]) depth++; + else break; + } + adoptPath.length = depth; + adoptBase.length = depth; + let id, base; + if (depth === 0) { + if (q < adoptRootTok || q >= adoptRootTok + rowTokLen[adoptRoot]) return -1; + id = adoptRoot; base = adoptRootTok; + if (base === q) { /* root itself starts at q — fall through to the chain walk */ } + adoptPath.push(id); adoptBase.push(base); + } else { + id = adoptPath[depth - 1]; base = adoptBase[depth - 1]; + } + // descend: containment steps are committed to the cache; the exploratory chain of + // nodes starting EXACTLY at q is walked in locals (a later seek with another rule + // must see the same chain). + for (;;) { + // binary search the first child whose END exceeds q + const cs = rowStart[id]; + const n = rowCount[id]; + let lo = 0, hi = n; + while (lo < hi) { + const mid = (lo + hi) >> 1; + const e = kids[cs + mid]; + const end = e < 0 ? 
base + ((~e) >>> 2) + 1 : base + kidTokRel[cs + mid] + rowTokLen[e]; + if (end <= q) lo = mid + 1; else hi = mid; + } + if (lo >= n) return -1; + const e = kids[cs + lo]; + if (e < 0) return -1; // the position is a leaf here + const cb = base + kidTokRel[cs + lo]; + if (cb > q) return -1; // a gap — nothing starts at q + if (cb === q) { + // the exploratory chain: every node from here down whose start is exactly q + let xid = e, xb = cb; + for (;;) { + if (rowOK[xid] !== 0 && rowRule[xid] === rid + && (q + rowExt[xid] + 2 <= adoptDmgStart || q >= adoptDmgOldEnd)) { + return xid; + } + const xcs = rowStart[xid]; + if (rowCount[xid] === 0) return -1; + const fe = kids[xcs]; + if (fe < 0 || kidTokRel[xcs] !== 0) return -1; + xid = fe; xb = xb; + } + } + // containment: commit and descend + id = e; base = cb; + adoptPath.push(id); adoptBase.push(base); + } +} // The spare token-column buffer set (parseEdited ping-pongs between the live set and // this one, so steady-state edits never allocate columns). let altK = null, altT = null, altOff = null, altEnd = null, altFl = null, altDp = null, altPd = null; @@ -2246,6 +2366,7 @@ ${e.soa ? '' : 'let altText = [];'} export function parse(source, entryRule) { lastSrc = null; + adoptRoot = -1; lexInto(source); memoNode = new Array(MEMO_RULES); memoEnd = new Array(MEMO_RULES); @@ -2253,6 +2374,8 @@ export function parse(source, entryRule) { nodeN = 0; kidN = 0; const root = runParse(entryRule); + lastRoot = root; + lastRootTok = rootTokBase; lastSrc = source; return root; } @@ -2368,35 +2491,23 @@ ${e.soa ? String.raw` // ── M1: WINDOWED re-lex ── const dOldEnd = oN - s; const tokenDelta = nN - oN; const nN2 = nN;`} - // Carry the memo across: prefix entries whose lookahead never reached the damage - // stay; suffix entries shift by tokenDelta; the damage window drops. 
- for (let r = 0; r < MEMO_RULES; r++) { - const me = memoEnd[r]; - if (me === undefined) continue; - const mn = memoNode[r], mx = memoExt[r]; - for (let i = 0; i < p; i++) { - if (me[i] !== undefined && mx[i] + 2 > p) { me[i] = undefined; mn[i] = undefined; mx[i] = undefined; } - } - if (tokenDelta === 0) { - for (let i = p; i < dOldEnd; i++) { - if (me[i] !== undefined) { me[i] = undefined; mn[i] = undefined; mx[i] = undefined; } - } - continue; - } - const nme = new Array(nN2 + 1), nmn = new Array(nN2 + 1), nmx = new Array(nN2 + 1); - const pCap = p < nN2 + 1 ? p : nN2 + 1; - for (let i = 0; i < pCap; i++) { - if (me[i] !== undefined) { nme[i] = me[i]; nmn[i] = mn[i]; nmx[i] = mx[i]; } - } - for (let i = dOldEnd; i <= oN; i++) { - if (me[i] !== undefined) { - const j = i + tokenDelta; - nme[j] = me[i] + tokenDelta; nmn[j] = mn[i]; nmx[j] = mx[i] + tokenDelta; - } - } - memoEnd[r] = nme; memoNode[r] = nmn; memoExt[r] = nmx; - } + // M4: NO memo carry — the memo is intra-parse; reuse flows through old-tree + // adoption (parseRuleEntry consults the previous root via adoptSeek), so the whole + // O(rules × n) carry/invalidate machinery is gone. 
+ memoNode = new Array(MEMO_RULES); + memoEnd = new Array(MEMO_RULES); + memoExt = new Array(MEMO_RULES); + adoptRoot = lastRoot; + adoptRootTok = lastRootTok; + adoptDmgStart = p; + adoptDmgOldEnd = dOldEnd; + adoptDelta = tokenDelta; + adoptPath.length = 0; + adoptBase.length = 0; const root = runParse(entryRule); + adoptRoot = -1; + lastRoot = root; + lastRootTok = rootTokBase; lastSrc = source; return root; } From e1b3a5cae17ee6ef1711aadbb734870ff781176d Mon Sep 17 00:00:00 2001 From: Johnson Chu Date: Thu, 11 Jun 2026 04:34:46 +0800 Subject: [PATCH 06/15] Generation-stamped persistent memo: the per-edit array churn dies MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The intra-parse memo arrays persist across parses; an entry is live iff its stamp (a new memoGen Int32Array per rule) equals the current generation, and bumping the generation counter IS the whole reset — parse(), parseEdited() and the '>'-splice all just increment it. Allocating fresh multi-million-slot arrays per edit was ~30% of a large-file edit in GC alone (and pushed V8 toward dictionary elements); now steady-state edits allocate nothing. 9MB keystroke edits: ~121ms -> ~50ms (5.4x vs a full parse); mixed sessions 2.27x. incremental ≡ fresh 0/120; 18,802 byte-identical; reject messages exact; 30/30 gates. 
--- src/emit-parser.ts | 45 ++++++++++++++++++++++++++++++++------------- 1 file changed, 32 insertions(+), 13 deletions(-) diff --git a/src/emit-parser.ts b/src/emit-parser.ts index 726bb41..04f6b09 100644 --- a/src/emit-parser.ts +++ b/src/emit-parser.ts @@ -1504,6 +1504,12 @@ let maxPos = 0; let memoNode = []; let memoEnd = []; let memoExt = []; // per-entry lookahead extent (see parseRuleEntry) +// GENERATION-STAMPED memo: the per-rule arrays persist across parses (allocating +// fresh multi-million-slot arrays per edit cost ~30% of a large-file edit in GC +// alone); an entry is live iff its stamp equals the current generation — bumping +// memoGenCur IS the whole reset. +let memoGen = []; +let memoGenCur = 0; let parseLimit = -1; // cap = the exclusive lookahead bound: min(parseLimit-or-∞, tokN), maintained at the // parseLimit set/restore sites and the one token-stream mutation (the '>' splice). @@ -1571,9 +1577,7 @@ function matchPuLitGT(pu) { tokN++; if (parseLimit < 0) cap = tokN; // Token indices shifted: drop the per-rule memo arrays (recreated lazily at the new size). - memoNode.fill(undefined); - memoEnd.fill(undefined); - memoExt.fill(undefined); + memoGenCur++; // positions shifted mid-parse: every stamped entry is stale // GREEN tree: no kids/scratch fixup — every completed row and scratch entry lies // wholly BEFORE the splice point (token pos is being consumed right now), and the // carried memo was just cleared, so nothing reachable references shifted indices. 
@@ -2014,7 +2018,8 @@ function parseRuleEntry(idx, rid, name, core) { let me = memoEnd[idx]; let mn = memoNode[idx]; let mx = memoExt[idx]; - if (!mySup && !capped && me !== undefined) { + let mg = memoGen[idx]; + if (!mySup && !capped && me !== undefined && mg[start] === memoGenCur) { const e = me[start]; if (e !== undefined) { pos = e; @@ -2051,17 +2056,20 @@ function parseRuleEntry(idx, rid, name, core) { if (ext > maxPos) maxPos = ext; absTok[aid] = start; absChar[aid] = tkOff[start]; - if (me === undefined) { + if (me === undefined || me.length < tokN + 1) { me = new Array(tokN + 1); mn = new Array(tokN + 1); mx = new Array(tokN + 1); + mg = new Int32Array(tokN + 1); memoEnd[idx] = me; memoNode[idx] = mn; memoExt[idx] = mx; + memoGen[idx] = mg; } me[start] = pos; mn[start] = aid; mx[start] = maxPos; + mg[start] = memoGenCur; scPush(aid); return true; } @@ -2079,17 +2087,20 @@ function parseRuleEntry(idx, rid, name, core) { suppressCur = prevSup; } if (!mySup && !capped) { - if (me === undefined) { + if (me === undefined || me.length < tokN + 1) { me = new Array(tokN + 1); mn = new Array(tokN + 1); mx = new Array(tokN + 1); + mg = new Int32Array(tokN + 1); memoEnd[idx] = me; memoNode[idx] = mn; memoExt[idx] = mx; + memoGen[idx] = mg; } me[start] = pos; mn[start] = result; - mx[start] = maxPos; // the TRUE probe watermark — the +2 read slack (stop token, + mx[start] = maxPos; + mg[start] = memoGenCur; // the TRUE probe watermark — the +2 read slack (stop token, // SECOND-token dispatch) is applied at INVALIDATION time if (result >= 0) rowOK[result] = 1; @@ -2368,9 +2379,13 @@ export function parse(source, entryRule) { lastSrc = null; adoptRoot = -1; lexInto(source); - memoNode = new Array(MEMO_RULES); - memoEnd = new Array(MEMO_RULES); - memoExt = new Array(MEMO_RULES); + if (memoEnd.length !== MEMO_RULES) { + memoNode = new Array(MEMO_RULES); + memoEnd = new Array(MEMO_RULES); + memoExt = new Array(MEMO_RULES); + memoGen = new Array(MEMO_RULES); + } + 
memoGenCur++; nodeN = 0; kidN = 0; const root = runParse(entryRule); @@ -2494,9 +2509,13 @@ ${e.soa ? String.raw` // ── M1: WINDOWED re-lex ── // M4: NO memo carry — the memo is intra-parse; reuse flows through old-tree // adoption (parseRuleEntry consults the previous root via adoptSeek), so the whole // O(rules × n) carry/invalidate machinery is gone. - memoNode = new Array(MEMO_RULES); - memoEnd = new Array(MEMO_RULES); - memoExt = new Array(MEMO_RULES); + if (memoEnd.length !== MEMO_RULES) { + memoNode = new Array(MEMO_RULES); + memoEnd = new Array(MEMO_RULES); + memoExt = new Array(MEMO_RULES); + memoGen = new Array(MEMO_RULES); + } + memoGenCur++; adoptRoot = lastRoot; adoptRootTok = lastRootTok; adoptDmgStart = p; From d4535ad5072d97010db9e15cfeb76c843281345b Mon Sep 17 00:00:00 2001 From: Johnson Chu Date: Thu, 11 Jun 2026 04:37:17 +0800 Subject: [PATCH 07/15] Edit protocol: parseEdited(source, entry, edits) skips the char-diff scans MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit An editor knows its edit ranges, so the damage envelope can come from the caller ([{start, oldEnd, newEnd}], merged over multiple edits) instead of the char-level prefix/suffix compare — which was the largest remaining O(file) scan (two charCodeAt sweeps over a 9MB source per keystroke). The compare stays as the no-protocol fallback. 9MB keystroke edits: ~50ms -> 7.8ms (34.6x vs a full parse), equivalence verified. What remains per edit is memcpy-grade: the suffix span shift and the token-column splice (the chunked-columns endgame), the window lex, and the adoption walk. incremental ≡ fresh 0/120 (the gate exercises the fallback path); 30/30 gates. 
--- src/emit-parser.ts | 30 ++++++++++++++++++++++-------- 1 file changed, 22 insertions(+), 8 deletions(-) diff --git a/src/emit-parser.ts b/src/emit-parser.ts index 04f6b09..a8e0508 100644 --- a/src/emit-parser.ts +++ b/src/emit-parser.ts @@ -2410,19 +2410,33 @@ export function parse(source, entryRule) { // until then. Lexing is FULL-FILE by design: the lexer carries cross-token state // (template nesting, regex context, markup modes), full lexing is a small share of a // parse, and the diff is what localizes the damage — not the lexer. -export function parseEdited(source, entryRule) { +export function parseEdited(source, entryRule, edits) { if (lastSrc === null) return parse(source, entryRule); const oSrc = lastSrc; lastSrc = null; ${e.soa ? String.raw` // ── M1: WINDOWED re-lex ── - // Char-level envelope (cheapest possible without an edit protocol). + // Damage envelope: from the EDIT PROTOCOL when the caller provides it (an editor + // knows its edit ranges — [{start, oldEnd, newEnd}] in old/new coordinates), else + // derived by the char-level prefix/suffix compare (the cheapest possible fallback, + // but O(file) scans). const oldLen = oSrc.length, newLen = source.length; - const minL = oldLen < newLen ? oldLen : newLen; - let cs = 0; - while (cs < minL && oSrc.charCodeAt(cs) === source.charCodeAt(cs)) cs++; - let ce = 0; - while (ce < minL - cs && oSrc.charCodeAt(oldLen - 1 - ce) === source.charCodeAt(newLen - 1 - ce)) ce++; - const ceOld = oldLen - ce, ceNew = newLen - ce; + let cs, ceOld, ceNew; + if (edits !== undefined && edits.length > 0) { + cs = edits[0].start; ceOld = edits[0].oldEnd; ceNew = edits[0].newEnd; + for (let i = 1; i < edits.length; i++) { + const ed = edits[i]; + if (ed.start < cs) cs = ed.start; + if (ed.oldEnd > ceOld) ceOld = ed.oldEnd; + if (ed.newEnd > ceNew) ceNew = ed.newEnd; + } + } else { + const minL = oldLen < newLen ? 
oldLen : newLen; + cs = 0; + while (cs < minL && oSrc.charCodeAt(cs) === source.charCodeAt(cs)) cs++; + let ce = 0; + while (ce < minL - cs && oSrc.charCodeAt(oldLen - 1 - ce) === source.charCodeAt(newLen - 1 - ce)) ce++; + ceOld = oldLen - ce; ceNew = newLen - ce; + } const charDelta = newLen - oldLen; // Restart anchor: the last token B ending at/before the damage whose recorded // depths are zero and whose shape carries no cross-token lexer flag (')' control- From 597b0f36caba997eea49e3f8aea89c3779b06fa8 Mon Sep 17 00:00:00 2001 From: Johnson Chu Date: Thu, 11 Jun 2026 05:16:56 +0800 Subject: [PATCH 08/15] Run-adoption: rep loops bulk-adopt old sibling runs; session paren-stack cache A 9MB flat-body keystroke spent 79% of its 61ms re-entering parseRuleEntry/adoptSeek once per undamaged statement, and another 7ms re-deriving the live paren stack by backward scan (the IIFE worst case). - adoptSeek publishes the hit site (old parent row / kid index / base) when the adopted node is the parent's direct kid; parseRuleEntry arms a (pos, rid, generation)-signed run signal on such adoptions. - '*'/'+' loops whose element is a parseRuleEntry-routed rule (pratt / left-rec / spine) consume the signal via runExtend: following old siblings are adopted in one tight loop under exactly the single-adopt eligibility (same-rule row, rowOK, contiguous, damage-clear, non-zero width). A member's existence proves the loop's FIRST guard true at its position; the signature triple keeps an inner rule's adoption from feeding elements into an outer loop. Members skip memo stores - a backtracking re-probe just re-adopts. - reconstructParensCached rolls the previous anchor's stack FORWARD over the tokens between the anchors (tokens at/before the cached anchor are splice-stable); backward jumps fall back to the full scan. Invalidated by full lexes and the '>' splice. - The spine-rule set moved to Emitter.spineSet(), shared by emitRuleFns and the quantifier hook. 
9MB IIFE keystroke: 61ms -> 10.4ms (parse 50.5 -> 6.9, parens 7.2 -> 0.2). Gates: 30/30, incremental 0/120, emit-parser-verify 0 mismatch, emit-lexer-verify streams equal, batch bench unchanged (11.4x aggregate). --- src/emit-lexer.ts | 24 +++ src/emit-parser.ts | 127 ++++++++++-- src/token-dfa.ts | 417 +++++++++++++++++++++++++++++++++++++++ test/token-dfa-verify.ts | 74 +++++++ 4 files changed, 624 insertions(+), 18 deletions(-) create mode 100644 src/token-dfa.ts create mode 100644 test/token-dfa-verify.ts diff --git a/src/emit-lexer.ts b/src/emit-lexer.ts index 738b529..cf4291d 100644 --- a/src/emit-lexer.ts +++ b/src/emit-lexer.ts @@ -199,6 +199,7 @@ export function emitLexer(grammar: CstGrammar, st: LexerSymtab): string | null { emit(`function tokenize(source) {`); emit(` src = source;`); emit(` tokN = 0;`); + emit(` parenCachePos = -1;`); emit(` lexCore(source, 0, -1, 0, -1, 0, 0);`); emit(` return tokN;`); emit(`}`); @@ -538,6 +539,29 @@ export function emitLexer(grammar: CstGrammar, st: LexerSymtab): string | null { emit(` }`); emit(` return out;`); emit(`}`); + emit(`// Session cache for the live paren stack: the previous edit's anchor stack rolled`); + emit(`// FORWARD over the tokens between the two anchors (push on '(', pop on ')') — the`); + emit(`// backward scan is O(distance to the outermost live opener), which a deep`); + emit(`// stationary session would pay per keystroke. 
Tokens at/before the cached anchor`); + emit(`// are splice-stable (every splice begins past its own anchor), so the baseline`); + emit(`// stays exact; a backward jump (b < cached) falls back to the full scan.`); + emit(`let parenCachePos = -1;`); + emit(`let parenCacheStack = [];`); + emit(`function reconstructParensCached(b) {`); + emit(` let stack;`); + emit(` if (b < 0) stack = [];`); + emit(` else if (parenCachePos >= 0 && parenCachePos <= b) {`); + emit(` stack = parenCacheStack;`); + emit(` for (let i = parenCachePos + 1; i <= b; i++) {`); + emit(` if (tkK[i] === 1) {`); + emit(` if (tkT[i] === ${tOf('(')}) stack.push((tkFl[i] & 8) !== 0);`); + emit(` else if (tkT[i] === ${tRParen}) { if (stack.length > 0) stack.pop(); }`); + emit(` }`); + emit(` }`); + emit(` } else stack = reconstructParens(b);`); + emit(` parenCachePos = b; parenCacheStack = stack;`); + emit(` return stack.slice();`); + emit(`}`); return out.join('\n'); } diff --git a/src/emit-parser.ts b/src/emit-parser.ts index a8e0508..41d571c 100644 --- a/src/emit-parser.ts +++ b/src/emit-parser.ts @@ -689,6 +689,49 @@ class Emitter { // Reference to a rule's parse function (token refs are inlined where used). private ruleFn(name: string) { return `R_${sanitize(name)}`; } + // SPINE rules — the entry rule's repetition units (the rules its body references + // directly): memoized through parseRuleEntry and therefore the adoption/run- + // extension granularity. Shared by emitRuleFns (memoized emission) and the + // quantifier run-extension hook. Grammar-shape-derived — no language names. 
+ private spine: Set | null = null; + spineSet(): Set { + if (this.spine !== null) return this.spine; + const a = this.a; + const spine = new Set(); + const entryRule = a.grammar.rules[a.grammar.rules.length - 1]; + const walk = (x: RuleExpr): void => { + switch (x.type) { + case 'ref': if (a.ruleByName.has(x.name)) spine.add(x.name); return; + case 'seq': case 'alt': x.items.forEach(walk); return; + case 'quantifier': case 'group': walk(x.body); return; + case 'sep': walk(x.element); return; + default: return; + } + }; + walk(entryRule.body); + spine.delete(entryRule.name); + return (this.spine = spine); + } + // The run-extension target of a repetition: when the body unwraps to a plain ref of + // a rule that routes through parseRuleEntry (pratt / left-rec / spine), its rule id; + // else -1 (the loop gets no extension hook — adoption stays element-by-element). + private quantRunRuleId(body: RuleExpr): number { + const a = this.a; + let expr = body; + while (true) { + if (expr.type === 'group' && !(expr.suppress && expr.suppress.length)) { expr = expr.body; continue; } + if (expr.type === 'seq') { + const real = expr.items.filter(it => it.type !== 'op' && it.type !== 'prefix' && it.type !== 'postfix'); + if (real.length === 1) { expr = real[0]; continue; } + } + break; + } + if (expr.type !== 'ref' || !a.ruleByName.has(expr.name)) return -1; + const name = expr.name; + if (!(a.prattRules.has(name) || a.leftRecSet.has(name) || this.spineSet().has(name))) return -1; + return a.grammar.rules.findIndex(r => r.name === name); + } + /** * Emit (once) a helper fn for a compound `expr` and return its name. The helper * has the matchExpr contract: returns the matched children array or null, with pos @@ -853,13 +896,20 @@ class Emitter { // Try once; on failure the helper restored pos/scn itself. 
return `${fn}();`; } + // Run-extension: after an iteration whose element was ADOPTED from the old tree, + // bulk-adopt its following old siblings (runExtend) instead of re-entering the + // rule machinery once per element. Only loops over a parseRuleEntry-routed rule + // get the hook, and runExtend re-checks rid + generation, so an inner rule's + // adoption can never feed elements into an outer loop. + const runId = this.quantRunRuleId(body); + const ext = runId >= 0 ? `\n if (adoptRunPos === pos) runExtend(${runId});` : ''; if (kind === '*') { const before = this.id(), bsn = this.id(); return [ `while (true) {`, ` const ${before} = pos; const ${bsn} = scn;`, ` if (!${fn}()) break;`, - ` if (pos === ${before} && scn === ${bsn}) break;`, + ` if (pos === ${before} && scn === ${bsn}) break;` + ext, `}`, ].join('\n'); } @@ -870,7 +920,7 @@ class Emitter { `while (true) {`, ` const ${before} = pos; const ${bsn} = scn;`, ` if (!${fn}()) break;`, - ` if (pos === ${before} && scn === ${bsn}) break;`, + ` if (pos === ${before} && scn === ${bsn}) break;` + ext, `}`, ].join('\n'); } @@ -1563,6 +1613,7 @@ function matchPuLitGT(pu) { const end0 = tkEnd[pos]; ${e.soa ? '' : 'const restText = tkText[pos].slice(1);'} if (tokN === tkCap) growTok(); + parenCachePos = -1; tkK.copyWithin(pos + 1, pos, tokN); tkT.copyWithin(pos + 1, pos, tokN); tkOff.copyWithin(pos + 1, pos, tokN); @@ -1654,21 +1705,7 @@ function emitRuleFns(e: Emitter, a: ReturnType) { // memoized through parseRuleEntry like pratt/left-rec rules. Without this only // expression/type subtrees reuse and every statement re-walks on each edit. // Derived from the grammar shape — no language names. 
- const spine = new Set(); - { - const entryRule = a.grammar.rules[a.grammar.rules.length - 1]; - const walk = (x: RuleExpr): void => { - switch (x.type) { - case 'ref': if (a.ruleByName.has(x.name)) spine.add(x.name); return; - case 'seq': case 'alt': x.items.forEach(walk); return; - case 'quantifier': case 'group': walk(x.body); return; - case 'sep': walk(x.element); return; - default: return; - } - }; - walk(entryRule.body); - spine.delete(entryRule.name); - } + const spine = e.spineSet(); for (const rule of a.grammar.rules) { if (a.prattRules.has(rule.name)) emitPrattRule(e, a, rule); else if (a.leftRecSet.has(rule.name)) emitLeftRecRule(e, a, rule); @@ -2056,6 +2093,11 @@ function parseRuleEntry(idx, rid, name, core) { if (ext > maxPos) maxPos = ext; absTok[aid] = start; absChar[aid] = tkOff[start]; + if (adoptHitP >= 0) { + adoptRunPos = pos; adoptRunRid = rid; adoptRunGen = memoGenCur; + adoptRunP = adoptHitP; adoptRunKid = adoptHitKid + 1; + adoptRunOq = q + rowTokLen[aid]; adoptRunBase = adoptHitBase; + } if (me === undefined || me.length < tokN + 1) { me = new Array(tokN + 1); mn = new Array(tokN + 1); @@ -2298,6 +2340,12 @@ let adoptDelta = 0; // new-minus-old token delta past the damage // cached descent path (top-down): ids + their absolute old token bases let adoptPath = []; let adoptBase = []; +// run-extension state: where the last single adoption sat in the old tree (its +// parent row / kid index / parent token base), published by adoptSeek, plus the +// (pos, rid, generation) signature a repetition must present to consume it. 
+let adoptHitP = -1, adoptHitKid = 0, adoptHitBase = 0; +let adoptRunPos = -1, adoptRunRid = -1, adoptRunGen = -1; +let adoptRunP = -1, adoptRunKid = 0, adoptRunOq = 0, adoptRunBase = 0; function adoptSeek(q, rid) { // reuse the cached path while it still CONTAINS q (strictly inside, not at start) let depth = 0; @@ -2339,6 +2387,7 @@ function adoptSeek(q, rid) { if (cb > q) return -1; // a gap — nothing starts at q if (cb === q) { // the exploratory chain: every node from here down whose start is exactly q + adoptHitP = id; adoptHitKid = cs + lo; adoptHitBase = base; let xid = e, xb = cb; for (;;) { if (rowOK[xid] !== 0 && rowRule[xid] === rid @@ -2349,6 +2398,7 @@ function adoptSeek(q, rid) { if (rowCount[xid] === 0) return -1; const fe = kids[xcs]; if (fe < 0 || kidTokRel[xcs] !== 0) return -1; + adoptHitP = -1; xid = fe; xb = xb; } } @@ -2357,6 +2407,45 @@ function adoptSeek(q, rid) { adoptPath.push(id); adoptBase.push(base); } } +// Run-extension: a repetition whose element was just ADOPTED bulk-adopts the +// following OLD SIBLINGS in one tight loop — whole-statement reuse without +// re-entering parseRuleEntry/adoptSeek once per element. Soundness: each member +// re-passes exactly the single-adoption eligibility (same-rule row, memoized +// [rowOK], contiguous, lookahead clear of the damage), a member's existence +// proves the loop's FIRST-set guard true at its position (its first token starts +// the rule), and the loop's own continuation checks run again after the run +// breaks. Members get no memo entries — a backtracking re-probe just re-adopts. 
+function runExtend(rid) { + if (rid !== adoptRunRid || memoGenCur !== adoptRunGen) { adoptRunPos = -1; return; } + adoptRunPos = -1; + const P = adoptRunP; + const csEnd = rowStart[P] + rowCount[P]; + const pb = adoptRunBase; + let i = adoptRunKid; + let oq = adoptRunOq; + let nq = pos; + const sfx = oq >= adoptDmgOldEnd; // past the damage: monotone, no per-member ext check + let mp = maxPos; + while (i < csEnd) { + const e = kids[i]; + if (e < 0) break; + if (pb + kidTokRel[i] !== oq) break; + if (rowRule[e] !== rid || rowOK[e] === 0) break; + const tl = rowTokLen[e]; + if (tl === 0) break; + const ex = rowExt[e]; + if (!sfx && oq + ex + 2 > adoptDmgStart) break; + absTok[e] = nq; absChar[e] = tkOff[nq]; + scPush(e); + const w = nq + ex; + if (w > mp) mp = w; + nq += tl; oq += tl; + i++; + } + if (mp > maxPos) maxPos = mp; + pos = nq; +} + // The spare token-column buffer set (parseEdited ping-pongs between the live set and // this one, so steady-state edits never allocate columns). let altK = null, altT = null, altOff = null, altEnd = null, altFl = null, altDp = null, altPd = null; @@ -2378,6 +2467,7 @@ ${e.soa ? '' : 'let altText = [];'} export function parse(source, entryRule) { lastSrc = null; adoptRoot = -1; + adoptRunPos = -1; lexInto(source); if (memoEnd.length !== MEMO_RULES) { memoNode = new Array(MEMO_RULES); @@ -2442,7 +2532,7 @@ ${e.soa ? String.raw` // ── M1: WINDOWED re-lex ── // depths are zero and whose shape carries no cross-token lexer flag (')' control- // head, postfix-ambiguous op). B = -1 restarts at the file head — always sound. const B = findRestart(cs); - const initParens = B >= 0 ? reconstructParens(B) : []; + const initParens = reconstructParensCached(B); const oN = tokN; // first old token at/after the damage end — the resync search floor let r0 = oN; @@ -2537,6 +2627,7 @@ ${e.soa ? 
String.raw` // ── M1: WINDOWED re-lex ── adoptDelta = tokenDelta; adoptPath.length = 0; adoptBase.length = 0; + adoptRunPos = -1; const root = runParse(entryRule); adoptRoot = -1; lastRoot = root; diff --git a/src/token-dfa.ts b/src/token-dfa.ts new file mode 100644 index 0000000..12b83ca --- /dev/null +++ b/src/token-dfa.ts @@ -0,0 +1,417 @@ +// ───────────────────────────────────────────────────────────────────────────── +// token-dfa.ts — derive a char-code DFA matcher from a token's structured pattern IR +// (src/token-pattern.ts), as the forward path to a scanner that dispatches on char +// codes instead of executing a regex per token (issue #5). +// +// The lexer matches one token at a time, anchored at `pos`, taking that token's +// greedy/longest match (sticky `re.lastIndex = pos; re.exec(s)`). This compiles the +// REGULAR subset of the IR — literal · charClass · anyChar · seq · alt · greedy +// repeat · never, plus a single TRAILING lookahead over a char class (the `(?!…)` +// guard the numeric tokens end with) — to an NFA (Thompson), then a DFA (subset +// construction), and runs it over `charCodeAt` code units. `match(s, pos)` returns +// the same match length the token's sticky regex would, or -1. +// +// Anything outside that subset (mid-pattern look-around, lookbehind, anchors, a +// non-greedy quantifier) → `compileTokenDfa` returns null and the caller keeps using +// the regex. So the scanner is byte-identical by construction: a DFA where the IR is +// regular, the proven regex elsewhere. Char classes are matched over UTF-16 code +// units (0..0xFFFF) exactly like the non-`/u` regexes the lexer emits today. +// ───────────────────────────────────────────────────────────────────────────── + +import type { TokenPattern, TokenCharClassItem } from './types.ts'; + +// UTF-16 code-unit alphabet. Negated classes complement within [0, MAX_CODE]. +const MAX_CODE = 0xffff; + +// A half-open is avoided: ranges are inclusive [lo, hi] of code units. 
+export interface Range { lo: number; hi: number } + +// ── Char-class → sorted, merged, inclusive ranges ── +function classRanges(items: TokenCharClassItem[], negate: boolean): Range[] { + const raw: Range[] = []; + for (const item of items) { + if (item.type === 'char') { + const c = item.value.charCodeAt(0); + raw.push({ lo: c, hi: c }); + } else { + const a = item.from.charCodeAt(0), b = item.to.charCodeAt(0); + raw.push({ lo: Math.min(a, b), hi: Math.max(a, b) }); + } + } + const merged = mergeRanges(raw); + return negate ? complementRanges(merged) : merged; +} + +function mergeRanges(ranges: Range[]): Range[] { + if (ranges.length === 0) return []; + const sorted = [...ranges].sort((a, b) => a.lo - b.lo || a.hi - b.hi); + const out: Range[] = [{ ...sorted[0] }]; + for (let i = 1; i < sorted.length; i++) { + const last = out[out.length - 1], r = sorted[i]; + if (r.lo <= last.hi + 1) last.hi = Math.max(last.hi, r.hi); + else out.push({ ...r }); + } + return out; +} + +function complementRanges(ranges: Range[]): Range[] { + // ranges are sorted+merged; complement within [0, MAX_CODE]. + const out: Range[] = []; + let next = 0; + for (const r of ranges) { + if (r.lo > next) out.push({ lo: next, hi: r.lo - 1 }); + next = r.hi + 1; + } + if (next <= MAX_CODE) out.push({ lo: next, hi: MAX_CODE }); + return out; +} + +// ── NFA (Thompson) ── +// A transition is either an epsilon move or a move on any code unit inside `ranges`. +interface NfaState { eps: number[]; trans: { ranges: Range[]; to: number }[] } + +class UnsupportedPattern extends Error {} + +class Nfa { + states: NfaState[] = []; + newState(): number { this.states.push({ eps: [], trans: [] }); return this.states.length - 1; } + eps(a: number, b: number): void { this.states[a].eps.push(b); } + move(a: number, ranges: Range[], b: number): void { this.states[a].trans.push({ ranges, to: b }); } +} + +// Build an NFA fragment for `pattern`; returns [start, accept]. 
// Throws UnsupportedPattern for any non-regular construct so the caller can fall back to the regex.
/**
 * Thompson construction: append an NFA fragment for `pattern` to `nfa`.
 *
 * @returns `[start, accept]` state indices of the new fragment.
 * @throws UnsupportedPattern for non-greedy repeats, look-around, anchors, and any
 *         node type this builder does not recognise.
 */
function build(nfa: Nfa, pattern: TokenPattern): [number, number] {
  if (typeof pattern === 'string') return buildLiteral(nfa, pattern);
  switch (pattern.type) {
    case 'anyChar': {
      const s = nfa.newState();
      const a = nfa.newState();
      nfa.move(s, [{ lo: 0, hi: MAX_CODE }], a);
      return [s, a];
    }
    case 'charClass': {
      const ranges = classRanges(pattern.items, pattern.negate);
      const s = nfa.newState();
      const a = nfa.newState();
      if (ranges.length) nfa.move(s, ranges, a); // empty class → no edge → never matches
      return [s, a];
    }
    case 'seq': {
      // The empty sequence matches the empty string: start and accept coincide.
      if (pattern.items.length === 0) {
        const s = nfa.newState();
        return [s, s];
      }
      const [start, firstAccept] = build(nfa, pattern.items[0]);
      let acc = firstAccept;
      for (let i = 1; i < pattern.items.length; i++) {
        const [s2, a2] = build(nfa, pattern.items[i]);
        nfa.eps(acc, s2);
        acc = a2;
      }
      return [start, acc];
    }
    case 'alt': {
      const s = nfa.newState();
      const a = nfa.newState();
      for (const item of pattern.items) {
        const [s2, a2] = build(nfa, item);
        nfa.eps(s, s2);
        nfa.eps(a2, a);
      }
      return [s, a];
    }
    case 'repeat': {
      if (!pattern.greedy) throw new UnsupportedPattern('non-greedy repeat');
      // min mandatory copies, then either an unbounded star or (max-min) optional copies.
      const s = nfa.newState();
      let acc = s;
      for (let i = 0; i < pattern.min; i++) {
        const [s2, a2] = build(nfa, pattern.body);
        nfa.eps(acc, s2);
        acc = a2;
      }
      const a = nfa.newState();
      if (pattern.max === undefined) {
        // star: acc --eps--> bodyStart, bodyAccept --eps--> bodyStart (loop) and onward.
        const [s2, a2] = build(nfa, pattern.body);
        nfa.eps(acc, s2);
        nfa.eps(a2, s2); // loop
        nfa.eps(acc, a); // skip (zero more)
        nfa.eps(a2, a); // exit after >=1
        return [s, a];
      }
      let cur = acc;
      for (let i = pattern.min; i < pattern.max; i++) {
        const [s2, a2] = build(nfa, pattern.body);
        nfa.eps(cur, s2);
        nfa.eps(cur, a); // optional: skip the rest
        cur = a2;
      }
      nfa.eps(cur, a);
      return [s, a];
    }
    case 'never': {
      const s = nfa.newState();
      const a = nfa.newState(); // no edge s→a → never accepts
      return [s, a];
    }
    // Non-regular: the caller must fall back to the regex.
    case 'lookahead':
    case 'lookbehind':
    case 'anchor':
      throw new UnsupportedPattern(pattern.type);
    default:
      // A node type this builder does not know. Without this branch the function
      // returned undefined and the caller crashed while destructuring it, instead of
      // taking the regex fallback.
      throw new UnsupportedPattern(String((pattern as { type?: unknown }).type));
  }
}

/** Chain of single-code-unit moves spelling `literal`; the empty literal yields start === accept. */
function buildLiteral(nfa: Nfa, literal: string): [number, number] {
  const start = nfa.newState();
  let cur = start;
  for (let i = 0; i < literal.length; i++) {
    const c = literal.charCodeAt(i);
    const next = nfa.newState();
    nfa.move(cur, [{ lo: c, hi: c }], next);
    cur = next;
  }
  return [start, cur];
}

// ── Subset construction → DFA ──
interface DfaState { accept: boolean; edges: { ranges: Range[]; to: number }[] }

/** All NFA states reachable from `set` through epsilon edges only; `set` itself is not mutated. */
function epsilonClosure(nfa: Nfa, set: Set<number>): Set<number> {
  const stack = [...set];
  const out = new Set<number>(set);
  while (stack.length) {
    const s = stack.pop()!;
    for (const t of nfa.states[s].eps) {
      if (!out.has(t)) {
        out.add(t);
        stack.push(t);
      }
    }
  }
  return out;
}

/** Canonical string key of a state set (members sorted ascending, comma-joined). */
function setKey(set: Set<number>): string {
  return [...set].sort((a, b) => a - b).join(',');
}

// Partition boundaries: every code unit where some transition's membership flips. We
// build a sorted list of "cut points" so the alphabet splits into intervals on which
// every NFA transition is constant — the classic DFA alphabet partition.
/**
 * Subset construction: turn the NFA fragment `[start, accept]` into a DFA.
 *
 * DFA state 0 is the epsilon closure of `start`. Each state's outgoing edges are
 * disjoint, sorted ascending, and each carries exactly one inclusive range; adjacent
 * intervals leading to the same target are fused.
 */
function buildDfa(nfa: Nfa, start: number, accept: number): DfaState[] {
  const dfa: DfaState[] = [];
  const index = new Map<string, number>();
  // subsets[id] is the NFA state set behind DFA state `id`. It doubles as the FIFO
  // worklist: ids are handed out in push order, so walking it by index visits states
  // in the same order a queue would, without an O(n) shift or re-deriving the key.
  const subsets: Set<number>[] = [];

  const intern = (set: Set<number>): number => {
    const key = setKey(set);
    let id = index.get(key);
    if (id === undefined) {
      id = dfa.length;
      index.set(key, id);
      dfa.push({ accept: set.has(accept), edges: [] });
      subsets.push(set);
    }
    return id;
  };

  intern(epsilonClosure(nfa, new Set<number>([start])));
  for (let id = 0; id < subsets.length; id++) {
    // Collect this state's outgoing transitions, then split into disjoint intervals.
    const trans: { ranges: Range[]; to: number }[] = [];
    for (const ns of subsets[id]) {
      for (const tr of nfa.states[ns].trans) trans.push(tr);
    }
    if (trans.length === 0) continue;

    // Cut points: for every range [lo,hi] add boundaries at lo and hi+1.
    const cuts = new Set<number>();
    for (const tr of trans) {
      for (const r of tr.ranges) {
        cuts.add(r.lo);
        cuts.add(r.hi + 1);
      }
    }
    const points = [...cuts].sort((a, b) => a - b);

    // For each elementary interval [points[i], points[i+1]-1], gather NFA targets.
    // Intervals are produced in ascending order, so merging adjacent ones that go to
    // the same DFA state can happen in the same pass (no separate sort needed).
    const merged: { ranges: Range[]; to: number }[] = [];
    for (let i = 0; i < points.length - 1; i++) {
      const lo = points[i];
      const hi = points[i + 1] - 1;
      if (hi < lo) continue;
      const targets = new Set<number>();
      for (const tr of trans) {
        if (tr.ranges.some(r => r.lo <= lo && hi <= r.hi)) targets.add(tr.to);
      }
      if (targets.size === 0) continue;
      const to = intern(epsilonClosure(nfa, targets));
      const last = merged[merged.length - 1];
      if (last && last.to === to && last.ranges[0].hi + 1 === lo) last.ranges[0].hi = hi;
      else merged.push({ ranges: [{ lo, hi }], to });
    }
    dfa[id].edges = merged;
  }
  return dfa;
}

/** Target state for `code` out of `state`, or -1 when no edge covers it. */
function dfaNext(state: DfaState, code: number): number {
  for (const e of state.edges) {
    for (const r of e.ranges) {
      if (code < r.lo) break; // ranges are sorted ascending
      if (code <= r.hi) return e.to;
    }
  }
  return -1;
}

// Run the DFA from `pos`, recording every accepting length. Returns the lengths in
// DESCENDING order (longest first) — what a greedy regex would prefer, and what the
// trailing-lookahead retry needs.
function runAcceptLengths(dfa: DfaState[], s: string, pos: number): number[] {
  const accepts: number[] = [];
  let state = 0;
  let i = pos;
  if (dfa[0].accept) accepts.push(0);
  while (state >= 0 && i < s.length) {
    const next = dfaNext(dfa[state], s.charCodeAt(i));
    if (next < 0) break;
    state = next;
    i++;
    if (dfa[state].accept) accepts.push(i - pos);
  }
  return accepts.reverse();
}

// ── Public compile ──
export interface TokenDfa {
  /** Match length at `pos`, or -1 — byte-identical to the token's sticky regex exec. */
  match(s: string, pos: number): number;
}

// The compiled DFA + any trailing char-class assertion, exposed so a code emitter can
// turn it into specialized straight-line JS (a generic interpreter over this structure
// is SLOWER than V8's regex — the win is in emitting tight char-code branches).
export type { DfaState };

/** A compiled DFA (state 0 is the start) plus the optional trailing char-class assertion. */
export interface CompiledTokenDfa { states: DfaState[]; trailing: { ranges: Range[]; negate: boolean } | null }

/**
 * Compile `pattern` to its raw DFA.
 *
 * When the pattern ends in a char-class lookahead, the DFA covers only the part before
 * it and the assertion is returned separately in `trailing`.
 *
 * @returns null when the pattern uses a construct outside the supported subset.
 */
export function buildTokenDfaRaw(pattern: TokenPattern): CompiledTokenDfa | null {
  try {
    const look = trailingLookahead(pattern);
    const nfa = new Nfa();
    const [start, accept] = build(nfa, look ? look.body : pattern);
    const states = buildDfa(nfa, start, accept);
    return { states, trailing: look ? { ranges: look.ranges, negate: look.negate } : null };
  } catch (e) {
    if (e instanceof UnsupportedPattern) return null;
    throw e;
  }
}

// ── DFA → specialized straight-line JS ──
// A GENERIC interpreter over the DFA is slower than V8's JIT-compiled regex; the win is
// in emitting tight char-code branches (measured ~1.3–1.6× over the sticky regex on the
// common tokens). Above this many DFA states the emitted switch stops paying off (a large
// escape-heavy token like a string literal lands ~even with the regex), so we decline and
// the caller keeps the regex — correctness is identical either way.
const MAX_SCANNER_STATES = 64;

/** JS boolean expression (source text) testing whether variable `v` lies in any of `ranges`. */
function rangesCond(ranges: Range[], v: string): string {
  return ranges.map(r => r.lo === r.hi ? `${v}===${r.lo}` : `${v}>=${r.lo}&&${v}<=${r.hi}`).join('||');
}

/**
 * Emit a token scanner as a JS function BODY with parameters `(s, pos, re)`: returns the
 * match length at `pos` (byte-identical to the token's sticky regex), or -1. `re` is the
 * token's own regex, used only on the rare trailing-lookahead retry. Returns null when the
 * pattern is outside the supported subset or its DFA is too large (caller keeps the regex).
 *
 * NOTE(review): the tail of this function (everything after `const cc=at`) was lost from
 * the text this was recovered from and has been reconstructed to mirror the lookahead
 * handling of `compileTokenDfa`. Confirm against the upstream source.
 */
export function emitTokenScannerBody(pattern: TokenPattern): string | null {
  const compiled = buildTokenDfaRaw(pattern);
  if (!compiled) return null;
  const { states, trailing } = compiled;
  if (states.length > MAX_SCANNER_STATES) return null;
  const accept = states.map(s => s.accept);
  const L: string[] = [];
  // acc tracks the longest accepting length seen so far (-1 = none yet).
  L.push(`const n=s.length;let i=pos,st=0,acc=${accept[0] ? 0 : -1};`);
  L.push(`for(;;){if(i>=n)break;const c=s.charCodeAt(i);switch(st){`);
  states.forEach((state, si) => {
    if (state.edges.length === 0) { L.push(`case ${si}:break;`); return; }
    let body = `case ${si}:{`;
    for (const e of state.edges) {
      const cond = rangesCond(e.ranges, 'c');
      body += `if(${e.ranges.length > 1 ? `(${cond})` : cond}){st=${e.to};i++;${accept[e.to] ? 'acc=i-pos;' : ''}continue;}`;
    }
    L.push(body + 'break;}');
  });
  // Falling out of the switch means no edge matched: leave the scan loop.
  L.push('}break;}');
  if (trailing) {
    // longest accept = acc; a trailing `(?!class)`/`(?=class)` may force a shorter match —
    // rare (well-formed input ends the token at a boundary), so defer that to the regex.
    // cc is -1 at end of input, which no range contains: a negative assertion holds there
    // and a positive one fails.
    const inClass = trailing.ranges.length ? `(${rangesCond(trailing.ranges, 'cc')})` : 'false';
    const holds = trailing.negate ? `!${inClass}` : inClass;
    L.push('if(acc<0)return -1;const at=pos+acc;const cc=at<n?s.charCodeAt(at):-1;');
    L.push(`if(${holds})return acc;`);
    L.push('re.lastIndex=pos;const m=re.exec(s);return m?m[0].length:-1;');
  } else {
    L.push('return acc;');
  }
  return L.join('\n');
}

/**
 * Build a ready-to-call scanner `(s, pos) => length | -1` from `pattern`, or null when
 * `emitTokenScannerBody` declines. `regex` is handed to the generated code as `re` and is
 * only executed on the trailing-lookahead retry, so it is expected to be the token's
 * sticky regex.
 *
 * NOTE(review): this function's declaration line was lost from the recovered text; the
 * name `compileTokenScanner`, the `export`, and the parameter order are reconstructed from
 * the surviving body — confirm against the upstream source before relying on them.
 */
export function compileTokenScanner(pattern: TokenPattern, regex: RegExp): ((s: string, pos: number) => number) | null {
  const body = emitTokenScannerBody(pattern);
  if (body === null) return null;
  const fn = new Function('s', 'pos', 're', body) as (s: string, pos: number, re: RegExp) => number;
  return (s, pos) => fn(s, pos, regex);
}

// A trailing `(?!class)` / `(?=class)` over a single char class is the only look-around
// the numeric tokens use; supported by retrying shorter body matches until the assertion
// at the body's end holds. Detected structurally on the IR.
/**
 * Split off a trailing char-class lookahead, if `pattern` is a `seq` ending in one.
 *
 * @returns the pattern without the assertion (`body`), the asserted class as ranges, and
 *          the lookahead's `negate` flag; null when `pattern` has no such trailing
 *          assertion (including the empty sequence).
 */
function trailingLookahead(pattern: TokenPattern): { body: TokenPattern; ranges: Range[]; negate: boolean } | null {
  if (typeof pattern === 'string' || pattern.type !== 'seq') return null;
  const { items } = pattern;
  // An empty seq has no last item; reading `.type` off `undefined` used to throw a
  // TypeError here even though the NFA builder accepts the empty sequence.
  if (items.length === 0) return null;
  const last = items[items.length - 1];
  if (typeof last === 'string' || last.type !== 'lookahead') return null;
  const inner = last.body;
  if (typeof inner === 'string' || inner.type !== 'charClass') return null; // only a char-class assertion
  const body: TokenPattern = items.length === 2
    ? items[0]
    : { type: 'seq', items: items.slice(0, -1) };
  return { body, ranges: classRanges(inner.items, inner.negate), negate: last.negate };
}

/** True when `code` lies inside any of the inclusive `ranges`. */
function inRanges(ranges: Range[], code: number): boolean {
  for (const r of ranges) if (code >= r.lo && code <= r.hi) return true;
  return false;
}

/**
 * Compile a token's pattern to a char-code DFA matcher, or return null if the pattern
 * uses a construct outside the supported regular subset (caller falls back to regex).
 *
 * Without a trailing assertion the matcher returns the LONGEST accepting length.
 * NOTE(review): a backtracking regex prefers the first alternative that succeeds, which
 * is not always the longest (e.g. `a|ab`); equality with the regex therefore depends on
 * the token patterns, which is presumably what the verify script checks — confirm.
 */
export function compileTokenDfa(pattern: TokenPattern): TokenDfa | null {
  try {
    const look = trailingLookahead(pattern);
    const nfa = new Nfa();
    const [start, accept] = build(nfa, look ? look.body : pattern);
    const dfa = buildDfa(nfa, start, accept);
    if (!look) {
      return {
        match(s, pos) {
          const lens = runAcceptLengths(dfa, s, pos);
          return lens.length ? lens[0] : -1;
        },
      };
    }
    const { ranges, negate } = look;
    return {
      match(s, pos) {
        // Longest first: take the first body match whose end satisfies the assertion.
        for (const len of runAcceptLengths(dfa, s, pos)) {
          const at = pos + len;
          const has = at < s.length && inRanges(ranges, s.charCodeAt(at));
          // negative lookahead succeeds when the char is absent (incl. EOF); positive needs it present.
          if (negate ? !has : has) return len;
        }
        return -1;
      },
    };
  } catch (e) {
    if (e instanceof UnsupportedPattern) return null;
    throw e;
  }
}
diff --git a/test/token-dfa-verify.ts b/test/token-dfa-verify.ts new file mode 100644 index 0000000..a86f6c8 --- /dev/null +++ b/test/token-dfa-verify.ts @@ -0,0 +1,74 @@ +// Correctness + speed gate for token-dfa.ts: for every TS token whose pattern compiles +// to a DFA, the DFA's match length must equal the token's sticky-regex match length at +// EVERY position of the corpus (byte-identical), and we measure the per-token speedup. +// +// node test/token-dfa-verify.ts +import { compileTokenDfa } from '../src/token-dfa.ts'; +import { tokenPatternSource } from '../src/token-pattern.ts'; +import { readFileSync, readdirSync } from 'fs'; +import { join } from 'path'; + +const grammar = (await import('../typescript.ts')).default; + +const base = '/tmp/ts-repo/tests/cases/conformance'; +function walk(d: string): string[] { + const o: string[] = []; + for (const e of readdirSync(d, { withFileTypes: true })) { + const f = join(d, e.name); + if (e.isDirectory()) o.push(...walk(f)); + else if (e.name.endsWith('.ts') && !e.name.endsWith('.d.ts')) o.push(f); + } + return o; +} +const files = walk(base).sort().filter((_, i) => i % 11 === 0); // ~stride sample +const sources = files.map(f => { try { return readFileSync(f, 'utf-8'); } catch { return ''; } }).filter(Boolean); +const totalChars = sources.reduce((a, s) => a + s.length, 0); + +// Tokens the per-position lexer loop actually runs through a regex (skip template). +const tokens = grammar.tokens.filter(t => !t.template); + +console.log(`tokens: ${tokens.length} · corpus sample: ${sources.length} files, ${(totalChars / 1024).toFixed(0)} KB\n`); +console.log('token DFA?
positions mism regex ms dfa ms speedup'); +console.log('-'.repeat(78)); + +let totalMism = 0, compiled = 0, fellBack = 0; +for (const t of tokens) { + let src: string; + try { src = tokenPatternSource(t); } catch { src = ''; } + const dfa = compileTokenDfa(t.pattern); + if (!dfa) { + fellBack++; + console.log(`${t.name.padEnd(16)} regex ${'—'.padStart(10)} ${'—'.padStart(4)} (unsupported → falls back to regex)`); + continue; + } + compiled++; + const re = new RegExp(`(?:${src})`, 'y'); + + // Correctness: at every position, DFA length === regex length. + let mism = 0, positions = 0; + for (const s of sources) { + for (let pos = 0; pos < s.length; pos++) { + re.lastIndex = pos; + const m = re.exec(s); + const reLen = m ? m[0].length : -1; + const dfaLen = dfa.match(s, pos); + positions++; + if (reLen !== dfaLen) { + if (mism < 3) console.log(` MISMATCH @${pos} re=${reLen} dfa=${dfaLen} ctx=${JSON.stringify(s.slice(pos, pos + 24))}`); + mism++; + } + } + } + totalMism += mism; + + // Speed: scan each source once via regex vs DFA (best-of-5). + const timeRe = () => { let acc = 0; for (const s of sources) for (let p = 0; p < s.length; p++) { re.lastIndex = p; const m = re.exec(s); acc += m ? m[0].length : 0; } return acc; }; + const timeDfa = () => { let acc = 0; for (const s of sources) for (let p = 0; p < s.length; p++) { const l = dfa.match(s, p); acc += l > 0 ? 
l : 0; } return acc; }; + const best = (fn: () => number) => { for (let w = 0; w < 2; w++) fn(); let b = Infinity; for (let r = 0; r < 5; r++) { const t0 = process.hrtime.bigint(); fn(); const dt = Number(process.hrtime.bigint() - t0) / 1e6; if (dt < b) b = dt; } return b; }; + const reMs = best(timeRe), dfaMs = best(timeDfa); + console.log(`${t.name.padEnd(16)} dfa ${String(positions).padStart(10)} ${String(mism).padStart(4)} ${reMs.toFixed(1).padStart(8)} ${dfaMs.toFixed(1).padStart(6)} ${(reMs / dfaMs).toFixed(2)}×`); +} + +console.log('-'.repeat(78)); +console.log(`compiled to DFA: ${compiled} · fell back to regex: ${fellBack} · TOTAL mismatches: ${totalMism}`); +process.exit(totalMism === 0 ? 0 : 1); From 0215b24a4bdd443f1478e05f0986c827e6fcb189 Mon Sep 17 00:00:00 2001 From: Johnson Chu Date: Thu, 11 Jun 2026 06:09:36 +0800 Subject: [PATCH 09/15] Node surgery + EOF-relative spans: a 9MB keystroke re-parse in ~0.05ms After run-adoption, three O(n)-per-edit costs remained on a 9MB flat body: the damage-path list parent re-collected all 180k kids through scratch (and the arena grew by that much per edit), the suffix token spans took a char-delta add-loop, and the spliced parent's suffix kids took a rel add-loop. - Node SURGERY patches the damage path in place. Descend the old tree along single-affected-row kids; at the deepest PURE container (SURG_ELEM: a seq of literals/refs around exactly one '*'/'+' rep of a parseRuleEntry-routed rule - no alt/sep/opt/not at the container's own level, so every probe is owned by a kid row), re-parse only the affected elements with the real rule fn (adoption reuses their undamaged subtrees), require exact rejoin at an old kid start, then splice the kid range and patch lengths up the path. Every check runs before any row is mutated; any failure falls back to the full adoption re-parse. Prefix kids are kept under the adoption watermark rule, made transitive by rowKC (lazy kid-containment bit). 
  Pure insertions at a kid boundary must touch the rep zone (a neighbour
  element), or the splice would stitch the element into a CLOSED node.
  Char lengths are re-DERIVED from the token columns, not patched by the
  char delta: a pure-trivia edit can sit token-inside but char-outside a
  node (the gap belongs to no node).

- EOF-relative token spans: tkOff/tkEnd at/after the damage store
  value − (srcLen + 1); decode adds the current length back, so updating
  srcLenP1 IS the suffix shift. Values self-describe by sign; negFrom
  bounds the flip band (cursor-locality sized). The '>' splice writes its
  pair sign-consistently with the zone it lands in.

- END-relative kid rels: a row kid's kidRel/kidTokRel may be stored
  relative to the parent's END (strictly negative, decoded with the
  parent's current lengths), so a surgical splice shifts the whole kid
  suffix by updating the parent's lengths. Stable across edits while the
  parent row is untouched; rowNF bounds the per-row band. Leaf kids stay
  start-relative (packed) — a pure container's trailing leaves get an O(1)
  backward walk.

- incremental-verify now alternates the edits-protocol and char-diff
  envelopes, and its seeded sessions caught three real holes during
  development (trivia-boundary length leak, closed-node stitching, Int32
  overflow in the relocated-range boundary remap).

9MB keystroke: 10.4ms -> median 0.04ms / p90 0.07ms (~750x vs fresh, steady
state; the first edit of a session pays the one-time flip + buffer
allocation). 8MB nested real-code shape: median 0.13ms. 81KB: median 0.10ms.
Batch is sign-clean: emitted aggregate 11.4-11.6x (unchanged band), 30/30
gates, emit-parser-verify 0 mismatches, emit-lexer-verify byte-identical
streams.
--- src/emit-lexer.ts | 10 +- src/emit-parser.ts | 457 +++++++++++++++++++++++++++++++++---- test/incremental-verify.ts | 30 ++- 3 files changed, 442 insertions(+), 55 deletions(-) diff --git a/src/emit-lexer.ts b/src/emit-lexer.ts index cf4291d..07745ea 100644 --- a/src/emit-lexer.ts +++ b/src/emit-lexer.ts @@ -200,6 +200,8 @@ export function emitLexer(grammar: CstGrammar, st: LexerSymtab): string | null { emit(` src = source;`); emit(` tokN = 0;`); emit(` parenCachePos = -1;`); + emit(` srcLenP1 = source.length + 1;`); + emit(` negFrom = 0x7fffffff;`); emit(` lexCore(source, 0, -1, 0, -1, 0, 0);`); emit(` return tokN;`); emit(`}`); @@ -245,9 +247,9 @@ export function emitLexer(grammar: CstGrammar, st: LexerSymtab): string | null { emit(` }`); emit(` if (off >= wndMinOff && dmgPd >= 0`); emit(` && templateStack.length <= dmgDp && parenHeadStack.length <= dmgPd) {`); - emit(` while (wndPtr < altN && altOff[wndPtr] + wndDelta < off) wndPtr++;`); - emit(` if (wndPtr < altN && altOff[wndPtr] + wndDelta === off && altK[wndPtr] === k && altT[wndPtr] === t`); - emit(` && altEnd[wndPtr] + wndDelta === end && altDp[wndPtr] === templateStack.length && altPd[wndPtr] === parenHeadStack.length) {`); + emit(` while (wndPtr < altN && (altOff[wndPtr] < 0 ? altOff[wndPtr] + srcLenP1 : altOff[wndPtr]) + wndDelta < off) wndPtr++;`); + emit(` if (wndPtr < altN && (altOff[wndPtr] < 0 ? altOff[wndPtr] + srcLenP1 : altOff[wndPtr]) + wndDelta === off && altK[wndPtr] === k && altT[wndPtr] === t`); + emit(` && (altEnd[wndPtr] < 0 ? 
altEnd[wndPtr] + srcLenP1 : altEnd[wndPtr]) + wndDelta === end && altDp[wndPtr] === templateStack.length && altPd[wndPtr] === parenHeadStack.length) {`); emit(` wndHit = wndPtr;`); emit(` }`); emit(` }`); @@ -514,7 +516,7 @@ export function emitLexer(grammar: CstGrammar, st: LexerSymtab): string | null { emit(`// head (always sound, degrades to a full re-lex).`); emit(`function findRestart(cs) {`); emit(` let lo = 0, hi = tokN;`); - emit(` while (lo < hi) { const mid = (lo + hi) >> 1; if (tkEnd[mid] <= cs) lo = mid + 1; else hi = mid; }`); + emit(` while (lo < hi) { const mid = (lo + hi) >> 1; if (tend(mid) <= cs) lo = mid + 1; else hi = mid; }`); emit(` for (let b = lo - 1; b >= 0; b--) {`); emit(` // template depth must be zero (interp brace counters are not reconstructable),`); emit(` // and the anchor token must leave no cross-token lexer flag live: not a`); diff --git a/src/emit-parser.ts b/src/emit-parser.ts index 41d571c..f17a4ac 100644 --- a/src/emit-parser.ts +++ b/src/emit-parser.ts @@ -715,7 +715,7 @@ class Emitter { // The run-extension target of a repetition: when the body unwraps to a plain ref of // a rule that routes through parseRuleEntry (pratt / left-rec / spine), its rule id; // else -1 (the loop gets no extension hook — adoption stays element-by-element). - private quantRunRuleId(body: RuleExpr): number { + quantRunRuleId(body: RuleExpr): number { const a = this.a; let expr = body; while (true) { @@ -1388,6 +1388,17 @@ let tkPd = new Uint16Array(4096); let tkCap = 4096; let tokN = 0; let src = ''; +// ── EOF-relative spans (incremental sessions) ── +// A token's tkOff/tkEnd may be stored EOF-RELATIVE (value − (srcLen + 1), strictly +// negative): the decode adds the CURRENT length back, so a pure suffix never needs +// the O(suffix) add-loop a char delta would otherwise force — updating srcLenP1 IS +// the shift. 
Values self-describe by sign, so mixed zones stay readable; negFrom +// only bounds where negatives may exist (the flip-band maintenance range). Batch +// parses are all-positive and the decode branch never fires. +let srcLenP1 = 1; +let negFrom = 0x7fffffff; +function toff(i) { const v = tkOff[i]; return v < 0 ? v + srcLenP1 : v; } +function tend(i) { const v = tkEnd[i]; return v < 0 ? v + srcLenP1 : v; } ${e.soa ? '' : 'let tkText = []; // fallback-lexer text column (synthetic tokens are not source spans)'} function growTok() { tkCap *= 2; @@ -1423,6 +1434,23 @@ let rowExt = new Int32Array(8192); // parse and must never be adopted into a normal entry (the memo carry never stored // those; adoption must not widen the contract). let rowOK = new Uint8Array(8192); +// kid-containment bit (lazy): 0 unknown, 1 = every kid's probe watermark stays +// at/below the next kid's start (so a prefix-keep check of the LAST kept kid +// transitively bounds all earlier ones), 2 = violated somewhere. Computed on +// first surgical use of a row, maintained across in-place splices. +let rowKC = new Uint8Array(8192); +// END-RELATIVE kid rels (incremental sessions): a ROW kid's kidTokRel/kidRel may be +// stored relative to the parent's END (value − (parentLen + 1), strictly negative); +// the decode adds the parent's CURRENT length back. A surgical splice then shifts +// the whole suffix by updating the parent's lengths — no per-kid add-loop — and the +// values stay correct as long as the parent row is unedited (only surgery changes a +// row's lengths, and it maintains its own band). Leaf kids pack their rel inside the +// kids value and always stay start-relative (the trailing-leaf walk shifts them +// eagerly). rowNF = first kid index (absolute, like rowStart) that may hold an +// end-relative value; batch parses never flip, so the decode branch never fires. +let rowNF = new Int32Array(8192).fill(0x7fffffff); +function ktr(p, k) { const v = kidTokRel[k]; return v < 0 ? 
v + rowTokLen[p] + 1 : v; } +function kcr(p, k) { const v = kidRel[k]; return v < 0 ? v + rowLen[p] + 1 : v; } // transient BUILD coordinates (absolute), valid for rows completed in the current // parse and REFRESHED at memo-hit time for reused roots — parents read them at // finishNode to write the children's relative fields; never part of the green tree. @@ -1453,6 +1481,8 @@ function growRows() { const c = new Int32Array(rowCap); c.set(rowCount); rowCount = c; const x = new Int32Array(rowCap); x.set(rowExt); rowExt = x; const ok = new Uint8Array(rowCap); ok.set(rowOK); rowOK = ok; + const kc = new Uint8Array(rowCap); kc.set(rowKC); rowKC = kc; + const nf = new Int32Array(rowCap).fill(0x7fffffff); nf.set(rowNF.subarray(0, nodeN)); rowNF = nf; const ac = new Int32Array(rowCap); ac.set(absChar); absChar = ac; const at = new Int32Array(rowCap); at.set(absTok); absTok = at; } @@ -1466,8 +1496,8 @@ function scPush(e) { if (scn === scCap) { scCap *= 2; const s = new Int32Array(scCap); s.set(sc); sc = s; } sc[scn++] = e; } -function entryOff(e) { return e >= 0 ? absChar[e] : tkOff[(~e) >>> 2]; } -function entryEnd(e) { return e >= 0 ? absChar[e] + rowLen[e] : tkEnd[(~e) >>> 2]; } +function entryOff(e) { return e >= 0 ? absChar[e] : toff((~e) >>> 2); } +function entryEnd(e) { return e >= 0 ? absChar[e] + rowLen[e] : tend((~e) >>> 2); } function entryTok(e) { return e >= 0 ? absTok[e] : (~e) >>> 2; } function entryTokEnd(e) { return e >= 0 ? 
absTok[e] + rowTokLen[e] : ((~e) >>> 2) + 1; } // Complete a node whose children are scratch[mark..scn): copy them into kids, write @@ -1509,6 +1539,8 @@ function finishNode(rid, mark) { rowTokLen[id] = myTokEnd - myTok; rowExt[id] = maxPos - myTok; rowOK[id] = 0; + rowKC[id] = 0; + rowNF[id] = 0x7fffffff; absChar[id] = myOff; absTok[id] = myTok; scn = mark; return id; @@ -1543,6 +1575,8 @@ function finishWrap(rid, lhsId, mark) { rowTokLen[id] = myTokEnd - myTok; rowExt[id] = maxPos - myTok; rowOK[id] = 0; + rowKC[id] = 0; + rowNF[id] = 0x7fffffff; absChar[id] = myOff; absTok[id] = myTok; scn = mark; return id; @@ -1569,8 +1603,8 @@ let suppressNext = null; let suppressCur = null; function offset() { - if (pos < cap) return tkOff[pos]; - return tokN > 0 ? tkEnd[tokN - 1] : 0; + if (pos < cap) return toff(pos); + return tokN > 0 ? tend(tokN - 1) : 0; } // ── Lever 1: integer-kind matchers ── @@ -1600,7 +1634,7 @@ function matchPuLit(pu) { } function matchPuLitGT(pu) { if (pos >= cap) return false; - const off = tkOff[pos]; + const off = toff(pos); if (tkT[pos] === pu) { scPush(~(pos << 2)); if (++pos > maxPos) maxPos = pos; @@ -1609,8 +1643,8 @@ function matchPuLitGT(pu) { // Split multi-'>' tokens: '>>', '>>>', '>>=', '>>>=' can yield a single '>': shift the // columns up one slot and write the '>' + rest pair in place (both born flag-less, // matching the old mkPunct pair). - if (tkK[pos] === K_PUNCT && tkEnd[pos] - off > 1 && ${e.soa ? 'src.charCodeAt(off) === 62' : "tkText[pos].charCodeAt(0) === 62"}) { - const end0 = tkEnd[pos]; + if (tkK[pos] === K_PUNCT && tend(pos) - off > 1 && ${e.soa ? 'src.charCodeAt(off) === 62' : "tkText[pos].charCodeAt(0) === 62"}) { + const end0 = tend(pos); ${e.soa ? '' : 'const restText = tkText[pos].slice(1);'} if (tokN === tkCap) growTok(); parenCachePos = -1; @@ -1622,8 +1656,17 @@ function matchPuLitGT(pu) { tkPd.copyWithin(pos + 1, pos, tokN); tkFl.copyWithin(pos + 1, pos, tokN); ${e.soa ? 
'' : "tkText.splice(pos, 1, '>', restText);"} - tkT[pos] = pu; tkEnd[pos] = off + 1; tkFl[pos] = 0; - tkOff[pos + 1] = off + 1; tkFl[pos + 1] = 0; + // Keep the EOF-relative zone invariant: a split at/past negFrom writes the new + // pair EOF-relative (a positive value there would not ride later srcLenP1 + // shifts); below it, the boundary index moves up one slot with the suffix. + if (pos < negFrom) { + negFrom++; + tkT[pos] = pu; tkEnd[pos] = off + 1; tkFl[pos] = 0; + tkOff[pos + 1] = off + 1; tkFl[pos + 1] = 0; + } else { + tkT[pos] = pu; tkEnd[pos] = off + 1 - srcLenP1; tkFl[pos] = 0; + tkOff[pos + 1] = off + 1 - srcLenP1; tkFl[pos + 1] = 0; + } tkT[pos + 1] = ${e.soa ? 'LIT_PU.get(src.slice(off + 1, end0)) ?? 0' : 'LIT_PU.get(restText) ?? 0'}; tokN++; if (parseLimit < 0) cap = tokN; @@ -1715,6 +1758,37 @@ function emitRuleFns(e: Emitter, a: ReturnType) { e.emit(`const RULES = {`); for (const rule of a.grammar.rules) e.emit(` ${J(rule.name)}: ${ruleFn(rule.name)},`); e.emit(`};`); + + // Surgical-container table: rule id → its repetition element's rule id, for rules + // whose body is a PURE seq/group of literals/refs around exactly one '*'/'+' rep + // of a parseRuleEntry-routed rule. No alt/sep/opt/not anywhere in the body: a + // longest-match arm (or lookahead) at the container's OWN level may probe into + // the rep zone without any kid row owning the read, which would break the + // prefix-keep watermark argument node surgery relies on. 
+ const surg: number[] = a.grammar.rules.map(() => -1); + a.grammar.rules.forEach((rule, ri) => { + if (a.prattRules.has(rule.name) || a.leftRecSet.has(rule.name)) return; + let reps = 0; let bad = false; let elem = -1; + const walk = (x: RuleExpr): void => { + if (bad) return; + switch (x.type) { + case 'seq': x.items.forEach(walk); return; + case 'group': + if (x.suppress && x.suppress.length) { bad = true; return; } + walk(x.body); return; + case 'literal': case 'ref': case 'op': case 'prefix': case 'postfix': return; + case 'quantifier': + if (x.kind === '?') { bad = true; return; } + reps++; elem = e.quantRunRuleId(x.body); + return; + default: bad = true; return; + } + }; + walk(rule.body); + if (!bad && reps === 1 && elem >= 0) surg[ri] = elem; + }); + e.emit(`const SURG_ELEM = new Int32Array([${surg.join(',')}]);`); + e.emit(`const RULE_FN_BY_ID = [${a.grammar.rules.map(r => ruleFn(r.name)).join(', ')}];`); } // Non-recursive rule: longest-match over alts (mirrors parseNonRec). A better arm is @@ -1918,7 +1992,7 @@ function emitPrattRule(e: Emitter, a: ReturnType, rule: RuleDecl e.emit(` const _h = kids[rowStart[lhs]];`); e.emit(` if (_h < 0 && ((~_h) & 3) === 2) {`); e.emit(` const _ht = absTok[lhs] + ((~_h) >>> 2);`); - e.emit(` const _htext = ${e.soa ? 'src.slice(tkOff[_ht], tkEnd[_ht])' : 'tkText[_ht]'};`); + e.emit(` const _htext = ${e.soa ? 'src.slice(toff(_ht), tend(_ht))' : 'tkText[_ht]'};`); e.emit(` if (prefixOps.has(_htext) && !postfixOpValues.has(_htext)) { return -1; }`); e.emit(` }`); e.emit(` }`); @@ -2074,7 +2148,7 @@ function parseRuleEntry(idx, rid, name, core) { // (its green internals are position-independent; only the attachment point — // what the enclosing finishNode reads — must be current). 
absTok[id] = start; - absChar[id] = tkOff[start]; + absChar[id] = toff(start); scPush(id); return true; } @@ -2092,7 +2166,7 @@ function parseRuleEntry(idx, rid, name, core) { const ext = start + rowExt[aid]; if (ext > maxPos) maxPos = ext; absTok[aid] = start; - absChar[aid] = tkOff[start]; + absChar[aid] = toff(start); if (adoptHitP >= 0) { adoptRunPos = pos; adoptRunRid = rid; adoptRunGen = memoGenCur; adoptRunP = adoptHitP; adoptRunKid = adoptHitKid + 1; @@ -2153,7 +2227,7 @@ function parseRuleEntry(idx, rid, name, core) { // Token text at an arbitrary index (cold paths: errors, the tokenAt debug view). function tokTextAt(i) { - return ${e.soa ? 'src.slice(tkOff[i], tkEnd[i])' : 'tkText[i]'}; + return ${e.soa ? 'src.slice(toff(i), tend(i))' : 'tkText[i]'}; } // The k → type-name inverse, for reconstructing a token object (tokenAt). const K_NAMES = []; @@ -2163,7 +2237,7 @@ export function tokenAt(i) { return { type: K_NAMES[tkK[i]] ?? '', text: tokTextAt(i), - offset: tkOff[i], + offset: toff(i), k: tkK[i], t: tkT[i], newlineBefore: (tkFl[i] & 1) !== 0, @@ -2201,11 +2275,11 @@ export const tree = { lenOf: (id) => rowLen[id], tokLenOf: (id) => rowTokLen[id], // a node CHILD's relative coordinates live on the parent edge (kids-parallel) - childRelAt: (id, i) => kidRel[rowStart[id] + i], - childTokRelAt: (id, i) => kidTokRel[rowStart[id] + i], + childRelAt: (id, i) => kcr(id, rowStart[id] + i), + childTokRelAt: (id, i) => ktr(id, rowStart[id] + i), // base-threaded spans: nodes from their bases, leaves from the token columns - offsetOf: (entry, charBase, tokBase) => entry >= 0 ? charBase : tkOff[tokBase + ((~entry) >>> 2)], - endOf: (entry, charBase, tokBase) => entry >= 0 ? charBase + rowLen[entry] : tkEnd[tokBase + ((~entry) >>> 2)], + offsetOf: (entry, charBase, tokBase) => entry >= 0 ? charBase : toff(tokBase + ((~entry) >>> 2)), + endOf: (entry, charBase, tokBase) => entry >= 0 ? 
charBase + rowLen[entry] : tend(tokBase + ((~entry) >>> 2)), childCount: (id) => rowCount[id], childAt: (id, i) => kids[rowStart[id] + i], // Bulk child load into a caller-owned array; returns the count. One call per node @@ -2223,11 +2297,11 @@ export const tree = { // 1 '$keyword', 2 '$operator' — and the token's TYPE kind int (1 = punctuation). leafKindOf: (entry) => (~entry) & 3, leafTokKindOf: (entry, tokBase) => tkK[tokBase + ((~entry) >>> 2)], - leafOffsetOf: (entry, tokBase) => tkOff[tokBase + ((~entry) >>> 2)], - leafEndOf: (entry, tokBase) => tkEnd[tokBase + ((~entry) >>> 2)], + leafOffsetOf: (entry, tokBase) => toff(tokBase + ((~entry) >>> 2)), + leafEndOf: (entry, tokBase) => tend(tokBase + ((~entry) >>> 2)), textOf: (entry, source, charBase, tokBase) => entry >= 0 ? source.slice(charBase, charBase + rowLen[entry]) - : source.slice(tkOff[tokBase + ((~entry) >>> 2)], tkEnd[tokBase + ((~entry) >>> 2)]), + : source.slice(toff(tokBase + ((~entry) >>> 2)), tend(tokBase + ((~entry) >>> 2))), }; // Depth-first traversal from a node id or leaf entry: // enter(id) — each NODE before its children; return false to skip its subtree @@ -2245,7 +2319,7 @@ export function visit(entry, fns, charBase, tokBase) { for (let i = 0; i < n; i++) { const e = kids[cs + i]; if (e < 0) { if (fns.leaf) fns.leaf(e, tokBase + ((~e) >>> 2)); } - else visit(e, fns, charBase + kidRel[cs + i], tokBase + kidTokRel[cs + i]); + else visit(e, fns, charBase + kcr(entry, cs + i), tokBase + ktr(entry, cs + i)); } if (fns.leave) fns.leave(entry, charBase, tokBase); } @@ -2258,8 +2332,8 @@ export function toObject(id, charBase, tokBase) { const children = new Array(n); for (let i = 0; i < n; i++) { const entry = kids[cs + i]; - children[i] = entry >= 0 ? toObject(entry, charBase + kidRel[cs + i], tokBase + kidTokRel[cs + i]) - : { tokenType: leafTokenType(entry, tokBase), offset: tkOff[tokBase + ((~entry) >>> 2)], end: tkEnd[tokBase + ((~entry) >>> 2)] }; + children[i] = entry >= 0 ? 
toObject(entry, charBase + kcr(id, cs + i), tokBase + ktr(id, cs + i)) + : { tokenType: leafTokenType(entry, tokBase), offset: toff(tokBase + ((~entry) >>> 2)), end: tend(tokBase + ((~entry) >>> 2)) }; } return { rule: RULE_NAMES[rowRule[id]], children, offset: charBase, end: charBase + rowLen[id] }; } @@ -2283,7 +2357,7 @@ ${e.soa ? ` tokenize(source);` : String.raw` src = source; function farthest(errPos) { if (maxPos <= errPos || maxPos >= tokN) return ''; - return ' [farthest: offset ' + tkOff[maxPos] + " near '" + tokTextAt(maxPos).slice(0, 20) + "']"; + return ' [farthest: offset ' + toff(maxPos) + " near '" + tokTextAt(maxPos).slice(0, 20) + "']"; } // Run the entry rule over the CURRENT token stream (shared by parse / parseEdited — @@ -2306,10 +2380,10 @@ function runParse(entryRule) { } if (!RULES[entry]()) { const hasTok = pos < cap; - throw new Error('Parse error at offset ' + (hasTok ? tkOff[pos] : 0) + ': unexpected ' + (hasTok ? "'" + tokTextAt(pos) + "'" : 'end of input') + farthest(pos)); + throw new Error('Parse error at offset ' + (hasTok ? toff(pos) : 0) + ': unexpected ' + (hasTok ? "'" + tokTextAt(pos) + "'" : 'end of input') + farthest(pos)); } if (pos < tokN) { - throw new Error('Parse error at offset ' + tkOff[pos] + ": unexpected '" + tokTextAt(pos) + "' after successful parse" + farthest(pos)); + throw new Error('Parse error at offset ' + toff(pos) + ": unexpected '" + tokTextAt(pos) + "' after successful parse" + farthest(pos)); } const rootId = sc[--scn]; rootCharBase = absChar[rootId]; rootTokBase = absTok[rootId]; @@ -2377,13 +2451,13 @@ function adoptSeek(q, rid) { while (lo < hi) { const mid = (lo + hi) >> 1; const e = kids[cs + mid]; - const end = e < 0 ? base + ((~e) >>> 2) + 1 : base + kidTokRel[cs + mid] + rowTokLen[e]; + const end = e < 0 ? 
base + ((~e) >>> 2) + 1 : base + ktr(id, cs + mid) + rowTokLen[e]; if (end <= q) lo = mid + 1; else hi = mid; } if (lo >= n) return -1; const e = kids[cs + lo]; if (e < 0) return -1; // the position is a leaf here - const cb = base + kidTokRel[cs + lo]; + const cb = base + ktr(id, cs + lo); if (cb > q) return -1; // a gap — nothing starts at q if (cb === q) { // the exploratory chain: every node from here down whose start is exactly q @@ -2397,7 +2471,7 @@ function adoptSeek(q, rid) { const xcs = rowStart[xid]; if (rowCount[xid] === 0) return -1; const fe = kids[xcs]; - if (fe < 0 || kidTokRel[xcs] !== 0) return -1; + if (fe < 0 || ktr(xid, xcs) !== 0) return -1; adoptHitP = -1; xid = fe; xb = xb; } @@ -2429,13 +2503,13 @@ function runExtend(rid) { while (i < csEnd) { const e = kids[i]; if (e < 0) break; - if (pb + kidTokRel[i] !== oq) break; + if (pb + ktr(P, i) !== oq) break; if (rowRule[e] !== rid || rowOK[e] === 0) break; const tl = rowTokLen[e]; if (tl === 0) break; const ex = rowExt[e]; if (!sfx && oq + ex + 2 > adoptDmgStart) break; - absTok[e] = nq; absChar[e] = tkOff[nq]; + absTok[e] = nq; absChar[e] = toff(nq); scPush(e); const w = nq + ex; if (w > mp) mp = w; @@ -2446,6 +2520,277 @@ function runExtend(rid) { pos = nq; } +// ── Node SURGERY: patch the damage path in place ── +// Even with run-adoption, a keystroke inside one statement of a large list rebuilds +// every node on the damage path — the list parent re-collects ALL its kids through +// scratch (and the arena grows by that much per edit). Surgery keeps those rows: +// descend the old tree to the deepest PURE container (SURG_ELEM), re-parse only the +// affected elements with the real rule fn (adoption reuses their undamaged +// subtrees), and when the fresh elements REJOIN an old kid start exactly, splice the +// container's kid range and shift the suffix rels by the edit deltas. Every check +// happens BEFORE any row is mutated; any failure falls back to the full adoption +// re-parse. 
Prefix kids are kept under the same watermark rule single adoption +// uses, made transitive by rowKC: each kid's probe watermark stays at/below the +// next kid's start, so checking the LAST kept kid bounds them all. +let surgX = [], surgBase = [], surgA = [], surgB = []; +function rowKCof(id) { + const c = rowKC[id]; + if (c !== 0) return c; + const cs = rowStart[id], n = rowCount[id]; + let ok = 1, prevW = -1; + for (let k = 0; k < n; k++) { + const e = kids[cs + k]; + const st = e < 0 ? (~e) >>> 2 : ktr(id, cs + k); + if (prevW > st) { ok = 2; break; } + prevW = e < 0 ? st + 1 : st + rowExt[e]; + } + rowKC[id] = ok; + return ok; +} +function trySurgery(dmgA, dmgB, tokD, chrD) { + if (adoptRoot < 0) return -1; + // the whole-file token math must close, or the shape changed beyond a splice + if (adoptRootTok + rowTokLen[adoptRoot] + tokD !== tokN) return -1; + // 1. descend along single-affected-row kids, recording the path + surgX.length = 0; surgBase.length = 0; surgA.length = 0; surgB.length = 0; + let X = adoptRoot, base = adoptRootTok; + for (;;) { + const cs = rowStart[X], n = rowCount[X]; + let lo = 0, hi = n; + while (lo < hi) { + const m = (lo + hi) >> 1; + const e = kids[cs + m]; + const st = base + (e < 0 ? (~e) >>> 2 : ktr(X, cs + m)); + if (st < dmgB) lo = m + 1; else hi = m; + } + const b = lo; + let a = b; + while (a > 0) { + const e = kids[cs + a - 1]; + const st = base + (e < 0 ? (~e) >>> 2 : ktr(X, cs + a - 1)); + if (e < 0 ? st < dmgA : st + rowExt[e] + 2 <= dmgA) break; + a--; + } + surgX.push(X); surgBase.push(base); surgA.push(a); surgB.push(b); + if (b - a !== 1) break; + const e = kids[cs + a]; + if (e < 0 || rowCount[e] === 0) break; + base = base + ktr(X, cs + a); + X = e; + } + // 2. 
choose D: the deepest surgical level whose affected kids are all rep rows + let L = -1; + for (let i = surgX.length - 1; i >= 0; i--) { + const Xi = surgX[i]; + const elem = SURG_ELEM[rowRule[Xi]]; + if (elem < 0) continue; + const cs = rowStart[Xi]; + const ai = surgA[i], bi = surgB[i]; + let okR = true; + for (let k = ai; k < bi; k++) { + const e = kids[cs + k]; + if (e < 0 || rowRule[e] !== elem) { okR = false; break; } + } + if (!okR) continue; + if (bi === ai) { + // pure insertion at a kid boundary: it must sit INSIDE the rep zone — at + // least one neighbour is an element row. Otherwise the insertion belongs to + // an enclosing list (e.g. right after this container's closing brace, where + // an element-loop alignment would stitch the new element into a CLOSED node). + const pe = ai > 0 ? kids[cs + ai - 1] : -1; + const ne = ai < rowCount[Xi] ? kids[cs + ai] : -1; + const prevOk = pe >= 0 && rowRule[pe] === elem; + const nextOk = ne >= 0 && rowRule[ne] === elem; + if (!prevOk && !nextOk) continue; + } + if (ai > 0 && rowKCof(Xi) !== 1) continue; + L = i; + break; + } + if (L < 0) return -1; + const D = surgX[L], Dbase = surgBase[L], Da = surgA[L]; + const Db = surgB[L]; + const elem = SURG_ELEM[rowRule[D]]; + const csD = rowStart[D], nD = rowCount[D]; + const DendNew = Dbase + rowTokLen[D] + tokD; + // 3. re-parse the affected span with the real rule (adoption live); the first + // affected kid starts at/before the damage, so old == new coordinates there + pos = Da < Db + ? Dbase + (kids[csD + Da] < 0 ? (~kids[csD + Da]) >>> 2 : ktr(D, csD + Da)) + : dmgA; + maxPos = pos; scn = 0; parseLimit = -1; cap = tokN; + currentPrattContext = null; suppressNext = null; suppressCur = null; + const genAt = memoGenCur; + const fn = RULE_FN_BY_ID[elem]; + let j = Db, guard = 0; + for (;;) { + let target; + if (j < nD) { + const e = kids[csD + j]; + target = Dbase + (e < 0 ? 
(~e) >>> 2 : ktr(D, csD + j)) + tokD; + } else target = DendNew; + if (pos === target) break; + if (pos > target) { + // the fresh parse consumed past old kid j: only a rep row may be subsumed + if (j >= nD) return -1; + const e = kids[csD + j]; + if (e < 0 || rowRule[e] !== elem) return -1; + j++; + continue; + } + if (++guard > 65536) return -1; + const pp = pos; + if (!fn()) return -1; + if (memoGenCur !== genAt || pos === pp) return -1; + } + // 4. POINT OF NO RETURN — splice D's kid range, shift suffix rels, patch the path + const f = scn; + const removed = j - Da; + const DcharBase = toff(Dbase); + let csD2 = csD; + if (f === removed) { + for (let k = 0; k < f; k++) { + const id = sc[k]; + kids[csD + Da + k] = id; + kidTokRel[csD + Da + k] = absTok[id] - Dbase; + kidRel[csD + Da + k] = absChar[id] - DcharBase; + } + } else { + const n2k = nD - removed + f; + if (kidN + n2k > kidCap) growKids(n2k); + const ks = kidN; + for (let k = 0; k < Da; k++) { + kids[ks + k] = kids[csD + k]; + kidRel[ks + k] = kidRel[csD + k]; + kidTokRel[ks + k] = kidTokRel[csD + k]; + } + for (let k = 0; k < f; k++) { + const id = sc[k]; + kids[ks + Da + k] = id; + kidTokRel[ks + Da + k] = absTok[id] - Dbase; + kidRel[ks + Da + k] = absChar[id] - DcharBase; + } + for (let k = j; k < nD; k++) { + kids[ks + Da + f + (k - j)] = kids[csD + k]; + kidRel[ks + Da + f + (k - j)] = kidRel[csD + k]; + kidTokRel[ks + Da + f + (k - j)] = kidTokRel[csD + k]; + } + kidN = ks + n2k; + rowStart[D] = ks; + rowCount[D] = n2k; + // remap the end-relative boundary into the relocated range (suffix kids kept + // their sign-encoded values; indices shifted by the move + the count change). + // Three cases keep it Int32-safe: no negatives among the copied kids (the + // sentinel maps to itself, NOT through the index arithmetic), all possibly + // negative, or a boundary inside the copied range. + const nfOld = rowNF[D]; + rowNF[D] = nfOld >= csD + nD ? 0x7fffffff + : nfOld <= csD + j ? 
ks + Da + f + : (nfOld - csD - j) + ks + Da + f; + csD2 = ks; + } + const n2 = rowCount[D]; + // End-relative band maintenance (old lengths — the bias cancels against the new + // ones exactly like the token-level flip): rows entering the suffix flip to + // end-relative; rows leaving it flip back to absolute rels. Rows already beyond + // the old boundary auto-shift via the length update below. Leaf kids cannot be + // sign-encoded (packed): inside the flip-up band they are re-packed eagerly, and + // the trailing run (a pure container's only leaves past the rep) gets the same + // eager shift by the backward walk. + const bnd = csD2 + Da + f; + const nf = rowNF[D]; + const kidsEnd = csD2 + n2; + if (nf < bnd) { + for (let k = nf; k < bnd; k++) { + const v = kidTokRel[k]; + if (v < 0) { kidTokRel[k] = v + rowTokLen[D] + 1; kidRel[k] += rowLen[D] + 1; } + } + } else if (nf > bnd) { + const hi = nf < kidsEnd ? nf : kidsEnd; + for (let k = bnd; k < hi; k++) { + const e = kids[k]; + if (e < 0) { if (tokD !== 0) kids[k] = ~(((((~e) >>> 2) + tokD) << 2) | ((~e) & 3)); } + else { + const v = kidTokRel[k]; + if (v >= 0) { kidTokRel[k] = v - rowTokLen[D] - 1; kidRel[k] -= rowLen[D] + 1; } + } + } + } + if (tokD !== 0) { + const tlFrom = nf > bnd ? (nf < kidsEnd ? nf : kidsEnd) : bnd; + for (let k = kidsEnd - 1; k >= tlFrom; k--) { + const e = kids[k]; + if (e >= 0) break; + kids[k] = ~(((((~e) >>> 2) + tokD) << 2) | ((~e) & 3)); + } + } + rowNF[D] = bnd; + rowTokLen[D] += tokD; + // Derive the char length from the token columns rather than adding chrD: a pure- + // trivia edit can sit at a node's token BOUNDARY (between its last token and the + // next sibling's first), token-inside but char-outside — the gap belongs to no + // node. tend/toff give the exact new span; when suffix tokens exist inside the + // node the delta equals chrD (so the suffix-kid rel adds and the end-relative + // bias-cancel stay consistent), and when they don't there are no suffix kids. 
+ if (rowTokLen[D] > 0) rowLen[D] = tend(Dbase + rowTokLen[D] - 1) - toff(Dbase); + { + let x = rowExt[D] + (tokD > 0 ? tokD : 0); + const fw = maxPos - Dbase; + if (fw > x) x = fw; + rowExt[D] = x; + } + // containment bit: only the pairs around the splice changed + if (rowKC[D] === 1) { + let okB = 1; + const from = Da > 0 ? Da - 1 : 0; + for (let k = from; k < Da + f && k + 1 < n2; k++) { + const e = kids[csD2 + k]; + const w = e < 0 ? ((~e) >>> 2) + 1 : ktr(D, csD2 + k) + rowExt[e]; + const e2 = kids[csD2 + k + 1]; + const st2 = e2 < 0 ? (~e2) >>> 2 : ktr(D, csD2 + k + 1); + if (w > st2) { okB = 2; break; } + } + rowKC[D] = okB; + } + // 5. ancestors bottom-up: lengths, suffix rels, ext, containment boundary pair + for (let i = L - 1; i >= 0; i--) { + const Ai = surgX[i]; + const csA = rowStart[Ai], nA = rowCount[Ai]; + const ki = surgA[i]; + // kids at/before the path kid are NOT suffix for this edit (the damage sits + // inside the path kid): any end-relative rel there must flip back to absolute + // with the OLD lengths, or the length update below would shift it + const nfA = rowNF[Ai]; + if (nfA <= csA + ki) { + for (let k = nfA; k <= csA + ki; k++) { + const v = kidTokRel[k]; + if (v < 0) { kidTokRel[k] = v + rowTokLen[Ai] + 1; kidRel[k] += rowLen[Ai] + 1; } + } + rowNF[Ai] = csA + ki + 1; + } + for (let k = ki + 1; k < nA; k++) { + const e = kids[csA + k]; + if (e < 0) kids[csA + k] = ~(((((~e) >>> 2) + tokD) << 2) | ((~e) & 3)); + else if (kidTokRel[csA + k] >= 0) { kidTokRel[csA + k] += tokD; kidRel[csA + k] += chrD; } + // (end-relative kids past the boundary auto-shift via the length update below) + } + rowTokLen[Ai] += tokD; + if (rowTokLen[Ai] > 0) rowLen[Ai] = tend(surgBase[i] + rowTokLen[Ai] - 1) - toff(surgBase[i]); + { + let x = rowExt[Ai] + (tokD > 0 ? 
tokD : 0); + const cw = ktr(Ai, csA + ki) + rowExt[surgX[i + 1]]; + if (cw > x) x = cw; + rowExt[Ai] = x; + } + if (rowKC[Ai] === 1 && ki + 1 < nA) { + const e2 = kids[csA + ki + 1]; + const st2 = e2 < 0 ? (~e2) >>> 2 : ktr(Ai, csA + ki + 1); + if (ktr(Ai, csA + ki) + rowExt[surgX[i + 1]] > st2) rowKC[Ai] = 2; + } + } + return adoptRoot; +} + // The spare token-column buffer set (parseEdited ping-pongs between the live set and // this one, so steady-state edits never allocate columns). let altK = null, altT = null, altOff = null, altEnd = null, altFl = null, altDp = null, altPd = null; @@ -2537,7 +2882,7 @@ ${e.soa ? String.raw` // ── M1: WINDOWED re-lex ── // first old token at/after the damage end — the resync search floor let r0 = oN; { let lo = 0, hi = oN; - while (lo < hi) { const mid = (lo + hi) >> 1; if (tkOff[mid] < ceOld) lo = mid + 1; else hi = mid; } + while (lo < hi) { const mid = (lo + hi) >> 1; if (toff(mid) < ceOld) lo = mid + 1; else hi = mid; } r0 = lo; } // Lex the window into the spare buffers (the old stream stays live for resync). if (altK === null || altCap < tkCap) { @@ -2550,12 +2895,31 @@ ${e.soa ? String.raw` // ── M1: WINDOWED re-lex ── swapBuffers(); // live = scratch, alt = OLD stream src = source; tokN = 0; - const startOff = B >= 0 ? altEnd[B] : 0; + const startOff = B >= 0 ? (altEnd[B] < 0 ? altEnd[B] + srcLenP1 : altEnd[B]) : 0; const R0 = lexCore(source, startOff, B >= 0 ? altK[B] : -1, B >= 0 ? altT[B] : 0, r0, ceNew, charDelta, cs, initParens); const W = tokN; const R = R0 >= 0 ? R0 : oN; swapBuffers(); // live = OLD stream again; window sits in the alt buffers tokN = oN; + // EOF-relative maintenance: move the negative-zone boundary to THIS edit's suffix + // start R. 
Tokens dropping out of the suffix ([negFrom, R)) flip back to absolute + // (they sit at/before the damage now — EOF-unstable); tokens entering it + // ([R, negFrom)) flip to EOF-relative, encoded against the OLD length (their new + // absolute is oldValue + charDelta, and newLen = oldLen + charDelta, so the bias + // cancels). Both bands are cursor-locality sized; the suffix itself is never + // walked again — updating srcLenP1 after the splice IS the char-delta shift the + // old O(suffix) add-loop used to apply. + if (negFrom < R) { + for (let i = negFrom, e2 = R < oN ? R : oN; i < e2; i++) { + const o = tkOff[i]; if (o < 0) tkOff[i] = o + srcLenP1; + const en = tkEnd[i]; if (en < 0) tkEnd[i] = en + srcLenP1; + } + } else if (negFrom > R) { + for (let i = R, e2 = negFrom < oN ? negFrom : oN; i < e2; i++) { + const o = tkOff[i]; if (o >= 0) tkOff[i] = o - srcLenP1; + const en = tkEnd[i]; if (en >= 0) tkEnd[i] = en - srcLenP1; + } + } // TRUE token prefix p: the window re-derives [B+1 .. p) byte-identically; only past // p is real damage (compared BEFORE the splice clobbers the old slots). let p = B + 1; @@ -2568,17 +2932,18 @@ ${e.soa ? 
String.raw` // ── M1: WINDOWED re-lex ── // ── splice: old[0..B] + window[0..W) + old[R..oN), then shift the suffix spans ── const nN = B + 1 + W + (oN - R); while (tkCap < nN + 1) growTok(); - tkK.copyWithin(B + 1 + W, R, oN); tkT.copyWithin(B + 1 + W, R, oN); - tkOff.copyWithin(B + 1 + W, R, oN); tkEnd.copyWithin(B + 1 + W, R, oN); - tkFl.copyWithin(B + 1 + W, R, oN); tkDp.copyWithin(B + 1 + W, R, oN); tkPd.copyWithin(B + 1 + W, R, oN); + if (R !== B + 1 + W) { + tkK.copyWithin(B + 1 + W, R, oN); tkT.copyWithin(B + 1 + W, R, oN); + tkOff.copyWithin(B + 1 + W, R, oN); tkEnd.copyWithin(B + 1 + W, R, oN); + tkFl.copyWithin(B + 1 + W, R, oN); tkDp.copyWithin(B + 1 + W, R, oN); tkPd.copyWithin(B + 1 + W, R, oN); + } if (W > 0) { tkK.set(altK.subarray(0, W), B + 1); tkT.set(altT.subarray(0, W), B + 1); tkOff.set(altOff.subarray(0, W), B + 1); tkEnd.set(altEnd.subarray(0, W), B + 1); tkFl.set(altFl.subarray(0, W), B + 1); tkDp.set(altDp.subarray(0, W), B + 1); tkPd.set(altPd.subarray(0, W), B + 1); } - if (charDelta !== 0) { - for (let i = B + 1 + W; i < nN; i++) { tkOff[i] += charDelta; tkEnd[i] += charDelta; } - } + negFrom = B + 1 + W; + srcLenP1 = newLen + 1; tokN = nN; const nN2 = nN;` : String.raw` // (fallback-lexer grammars keep the full-relex + token-diff path) const oK = tkK, oT = tkT, oOff = tkOff, oEnd = tkEnd, oFl = tkFl, oN = tokN; @@ -2628,6 +2993,16 @@ ${e.soa ? 
String.raw` // ── M1: WINDOWED re-lex ── adoptPath.length = 0; adoptBase.length = 0; adoptRunPos = -1; + const sroot = trySurgery(p, dOldEnd, tokenDelta, charDelta); + if (sroot >= 0) { + adoptRoot = -1; + rootCharBase = toff(adoptRootTok); + rootTokBase = adoptRootTok; + lastRoot = sroot; + lastRootTok = adoptRootTok; + lastSrc = source; + return sroot; + } const root = runParse(entryRule); adoptRoot = -1; lastRoot = root; diff --git a/test/incremental-verify.ts b/test/incremental-verify.ts index 2ed248d..f6f8838 100644 --- a/test/incremental-verify.ts +++ b/test/incremental-verify.ts @@ -12,9 +12,10 @@ import { emitParser } from '../src/emit-parser.ts'; const grammar = (await import('../typescript.ts')).default; const emPath = '/tmp/emitted-incremental.mjs'; writeFileSync(emPath, emitParser(grammar)); +type Edit = { start: number; oldEnd: number; newEnd: number }; type Em = { parse(s: string): number; - parseEdited(s: string): number; + parseEdited(s: string, entryRule?: string, edits?: Edit[]): number; toObject(id: number): unknown; }; const session = (await import(emPath + '?session=' + process.pid)) as Em; @@ -28,28 +29,36 @@ const randInt = (n: number) => Math.floor(rand() * n); const INSERTS = ['x', '_v', '42', ' + y', '.m', '()', ' /*c*/ ', '"s"', 'await ', '!', '?']; const STMTS = ['const q9 = 1;\n', 'function g9(a) { return a; }\n', 'if (x9) { y9(); }\n', '// note\n', 'type T9 = string | number;\n']; -function mutate(text: string): string { +// Mutations return the edit RANGE too, so half the steps can exercise the edits +// PROTOCOL path (the editor-facing API) while the other half exercises the +// char-diff fallback envelope. 
+function mutate(text: string): { next: string; edit: Edit } { switch (randInt(5)) { case 0: { // insert a small fragment at a random position const at = randInt(text.length); - return text.slice(0, at) + INSERTS[randInt(INSERTS.length)] + text.slice(at); + const ins = INSERTS[randInt(INSERTS.length)]; + return { next: text.slice(0, at) + ins + text.slice(at), edit: { start: at, oldEnd: at, newEnd: at + ins.length } }; } case 1: { // delete a small span const at = randInt(Math.max(1, text.length - 8)); - return text.slice(0, at) + text.slice(at + 1 + randInt(6)); + const n = 1 + randInt(6); + return { next: text.slice(0, at) + text.slice(at + n), edit: { start: at, oldEnd: at + n, newEnd: at } }; } case 2: { // replace a character const at = randInt(Math.max(1, text.length - 1)); - return text.slice(0, at) + 'z' + text.slice(at + 1); + return { next: text.slice(0, at) + 'z' + text.slice(at + 1), edit: { start: at, oldEnd: at + 1, newEnd: at + 1 } }; } case 3: { // insert a whole statement at a line boundary const lines = text.split('\n'); const at = randInt(lines.length); - lines.splice(at, 0, STMTS[randInt(STMTS.length)].trimEnd()); - return lines.join('\n'); + const stmt = STMTS[randInt(STMTS.length)].trimEnd(); + lines.splice(at, 0, stmt); + const start = at === 0 ? 
0 : lines.slice(0, at).join('\n').length + 1; + return { next: lines.join('\n'), edit: { start, oldEnd: start, newEnd: start + stmt.length + 1 } }; } default: { // append at the end (the pure-prefix reuse case) - return text + '\n' + STMTS[randInt(STMTS.length)]; + const stmt = '\n' + STMTS[randInt(STMTS.length)]; + return { next: text + stmt, edit: { start: text.length, oldEnd: text.length, newEnd: text.length + stmt.length } }; } } } @@ -70,15 +79,16 @@ for (const f of FILES) { let text = readFileSync(f, 'utf-8'); session.parse(text); // open the session for (let k = 0; k < STEPS; k++) { - const next = mutate(text); + const { next, edit } = mutate(text); steps++; let freshRoot = -1, freshErr: string | null = null; const tf0 = performance.now(); try { freshRoot = fresh.parse(next); } catch (e) { freshErr = (e as Error).message; } const tf1 = performance.now(); let incRoot = -1, incErr: string | null = null; + const useProtocol = k % 2 === 1; // alternate: edits protocol / char-diff fallback const ti0 = performance.now(); - try { incRoot = session.parseEdited(next); } catch (e) { incErr = (e as Error).message; } + try { incRoot = session.parseEdited(next, undefined, useProtocol ? [edit] : undefined); } catch (e) { incErr = (e as Error).message; } const ti1 = performance.now(); if (freshErr !== null || incErr !== null) { if ((freshErr === null) !== (incErr === null)) { From 390b715c7eb35b62df7f0e55af6c6e2bf79dd1f5 Mon Sep 17 00:00:00 2001 From: Johnson Chu Date: Thu, 11 Jun 2026 06:32:21 +0800 Subject: [PATCH 10/15] Pure-container ancestors get the end-relative kid band too A deep edit under a giant flat list paid an O(suffix-kids) eager rel walk per keystroke on every ANCESTOR with a large suffix - the band so far existed only on the surgical container itself. Measured on the 9MB flat body as ancestor: 0.60ms median / 1.85ms p90 per keystroke. 
Ancestors whose rule is a pure container (SURG_ELEM: interior = element rows only, leaves only as a trailing run) now maintain the same end-relative band as D: rows entering the suffix flip once (old-length bias cancels), rows beyond the boundary ride the parent length update, trailing leaves get the O(1) backward re-pack. Mixed-content ancestors (interleaved leaves cannot sign-encode inside the packed kid entry) keep the eager walk - those are the grammar's non-list shapes with small kid counts. Nested edit on the 9MB flat body: 0.60ms -> 0.031ms median / 0.047ms p90. List-level keystroke unchanged (0.04ms median), 8MB nested real shape 0.13ms, batch aggregate in band (11.2x), 30/30 gates, incremental-verify 0/120, emit-parser-verify 0 mismatches. --- src/emit-parser.ts | 42 +++++++++++++++++++++++++++++++++++++----- 1 file changed, 37 insertions(+), 5 deletions(-) diff --git a/src/emit-parser.ts b/src/emit-parser.ts index f17a4ac..4656d9d 100644 --- a/src/emit-parser.ts +++ b/src/emit-parser.ts @@ -2768,11 +2768,43 @@ function trySurgery(dmgA, dmgB, tokD, chrD) { } rowNF[Ai] = csA + ki + 1; } - for (let k = ki + 1; k < nA; k++) { - const e = kids[csA + k]; - if (e < 0) kids[csA + k] = ~(((((~e) >>> 2) + tokD) << 2) | ((~e) & 3)); - else if (kidTokRel[csA + k] >= 0) { kidTokRel[csA + k] += tokD; kidRel[csA + k] += chrD; } - // (end-relative kids past the boundary auto-shift via the length update below) + // Suffix kids: a PURE-container ancestor (interior = element rows only, leaves + // only as a trailing run) gets the same end-relative band as D — without it, a + // deep edit under a giant flat list pays an O(suffix) eager walk per keystroke + // (measured: 0.6ms median on the 9MB body as ancestor). Mixed-content ancestors + // (interleaved leaves can't sign-encode inside the packed entry) keep the eager + // walk; their kid counts are the grammar's non-list shapes. 
+ if (SURG_ELEM[rowRule[Ai]] >= 0) { + const bndA = csA + ki + 1; + const nfA2 = rowNF[Ai]; + const kidsEndA = csA + nA; + if (nfA2 > bndA) { + const hi = nfA2 < kidsEndA ? nfA2 : kidsEndA; + for (let k = bndA; k < hi; k++) { + const e = kids[k]; + if (e < 0) { if (tokD !== 0) kids[k] = ~(((((~e) >>> 2) + tokD) << 2) | ((~e) & 3)); } + else { + const v = kidTokRel[k]; + if (v >= 0) { kidTokRel[k] = v - rowTokLen[Ai] - 1; kidRel[k] -= rowLen[Ai] + 1; } + } + } + } + if (tokD !== 0) { + const tlFrom = nfA2 > bndA ? (nfA2 < kidsEndA ? nfA2 : kidsEndA) : bndA; + for (let k = kidsEndA - 1; k >= tlFrom; k--) { + const e = kids[k]; + if (e >= 0) break; + kids[k] = ~(((((~e) >>> 2) + tokD) << 2) | ((~e) & 3)); + } + } + rowNF[Ai] = bndA; + } else { + for (let k = ki + 1; k < nA; k++) { + const e = kids[csA + k]; + if (e < 0) kids[csA + k] = ~(((((~e) >>> 2) + tokD) << 2) | ((~e) & 3)); + else if (kidTokRel[csA + k] >= 0) { kidTokRel[csA + k] += tokD; kidRel[csA + k] += chrD; } + // (end-relative kids past the boundary auto-shift via the length update below) + } } rowTokLen[Ai] += tokD; if (rowTokLen[Ai] > 0) rowLen[Ai] = tend(surgBase[i] + rowTokLen[Ai] - 1) - toff(surgBase[i]); From 6ce7717ed14732daae6dd35da570f6636037e41c Mon Sep 17 00:00:00 2001 From: Johnson Chu Date: Thu, 11 Jun 2026 06:59:23 +0800 Subject: [PATCH 11/15] Handle API: explicit tree handles over per-document state; strict-< restart anchor API rework (the session model made the edit base IMPLICIT - parseEdited acted on whatever was parsed last, and two interleaved documents shared one module state): const p = createParser(); const cst = p.parse(text); const cst2 = p.edit(cst, next[, edits]); - Each parser instance owns a DOCUMENT: the 51 per-document fields (token columns, arena, kids, memo, session, paren cache, spare buffers) live in a doc object; the module-level variables stay the ACTIVE REGISTER SET and activate() lazily swaps on instance switch - the hot paths never indirect through an object (batch 
unchanged, 11.0-11.3x band; handle-API keystroke median 0.06ms). - Handles are generation-stamped: trees are edited IN PLACE (node surgery), so an edit invalidates earlier handles of that parser - using one throws instead of silently reading a mutated tree. A REJECTED edit leaves the previous handle valid and the next edit falls back to a full re-parse internally. - Module-level parse/parseEdited/visit/toObject keep working on a default document (gates/back-compat); the interpreter's createParser gains edit() (full re-parse - immutable object trees) for API parity. - NEW gate test/multi-doc.ts: two instances over two sources, edits interleaved with the default doc mixed in - every edited tree must equal a fresh parse (a missed swap field = cross-document corruption), plus the stale/foreign/reject handle contract. SOUNDNESS FIX the new smoke test exposed (predates this branch, M1-era): findRestart anchored at tokens ENDING exactly at the damage start, but maximal munch lets the edit EXTEND such a token ('b' + inserted 'x' lexes as 'bx', '=' + '=' as '==', deleting a gap glues neighbours) and the anchor itself is never re-lexed - the spliced stream then carried 'b','x' as two tokens where a batch lex has one, parsing a DIFFERENT (sometimes still valid) program. The fixed-seed gate sessions never hit the abutment in 120 steps. Anchor comparison is now STRICT (<), so the abutting token falls inside the window and the merge is re-derived; incremental-verify gains a deterministic boundary-glue session (ident glue, operator glue, gap deletion, '>>' split sites) so the class stays pinned. 31/31 gates (multi-doc included), incremental-verify 128 steps 0 mismatch, emit-parser-verify 0 mismatches, agnostic 9/9, batch in band. 
--- src/emit-lexer.ts | 6 +- src/emit-parser.ts | 128 ++++++++++++++++++++++++++++++++++--- src/gen-parser.ts | 5 +- test/check.ts | 1 + test/incremental-verify.ts | 55 ++++++++++++++-- test/multi-doc.ts | 104 ++++++++++++++++++++++++++++++ 6 files changed, 282 insertions(+), 17 deletions(-) create mode 100644 test/multi-doc.ts diff --git a/src/emit-lexer.ts b/src/emit-lexer.ts index 07745ea..704111e 100644 --- a/src/emit-lexer.ts +++ b/src/emit-lexer.ts @@ -516,7 +516,11 @@ export function emitLexer(grammar: CstGrammar, st: LexerSymtab): string | null { emit(`// head (always sound, degrades to a full re-lex).`); emit(`function findRestart(cs) {`); emit(` let lo = 0, hi = tokN;`); - emit(` while (lo < hi) { const mid = (lo + hi) >> 1; if (tend(mid) <= cs) lo = mid + 1; else hi = mid; }`); + // STRICTLY before the damage: a token ENDING exactly at cs can be EXTENDED by + // the edit under maximal munch ('b' + inserted 'x' = 'bx'; '=' + '=' = '=='; + // deleting the gap glues neighbours) and the anchor itself is never re-lexed — + // with < the abutting token falls inside the window and the merge is re-derived. + emit(` while (lo < hi) { const mid = (lo + hi) >> 1; if (tend(mid) < cs) lo = mid + 1; else hi = mid; }`); emit(` for (let b = lo - 1; b >= 0; b--) {`); emit(` // template depth must be zero (interp brace counters are not reconstructable),`); emit(` // and the anchor token must leave no cross-token lexer flag live: not a`); diff --git a/src/emit-parser.ts b/src/emit-parser.ts index 4656d9d..2991fb2 100644 --- a/src/emit-parser.ts +++ b/src/emit-parser.ts @@ -2310,7 +2310,7 @@ export const tree = { // Depth-first traversal threading the RED coordinates: enter/leave receive the // node's absolute (charBase, tokBase); leaf receives its absolute token index. // Call with the root only — the bases default from the root's rel fields. 
-export function visit(entry, fns, charBase, tokBase) { +function visitCore(entry, fns, charBase, tokBase) { if (charBase === undefined) { charBase = rootCharBase; tokBase = rootTokBase; } if (entry < 0) { if (fns.leaf) fns.leaf(entry, tokBase + ((~entry) >>> 2)); return; } if (fns.enter && fns.enter(entry, charBase, tokBase) === false) return; @@ -2319,20 +2319,20 @@ export function visit(entry, fns, charBase, tokBase) { for (let i = 0; i < n; i++) { const e = kids[cs + i]; if (e < 0) { if (fns.leaf) fns.leaf(e, tokBase + ((~e) >>> 2)); } - else visit(e, fns, charBase + kcr(entry, cs + i), tokBase + ktr(entry, cs + i)); + else visitCore(e, fns, charBase + kcr(entry, cs + i), tokBase + ktr(entry, cs + i)); } if (fns.leave) fns.leave(entry, charBase, tokBase); } // Materialize the classic object CST from a node id — a BRIDGE for tests/debugging // (the byte-identical gate against the interpreter), not a parse-path product. -export function toObject(id, charBase, tokBase) { +function toObjectCore(id, charBase, tokBase) { if (charBase === undefined) { charBase = rootCharBase; tokBase = rootTokBase; } const n = rowCount[id]; const cs = rowStart[id]; const children = new Array(n); for (let i = 0; i < n; i++) { const entry = kids[cs + i]; - children[i] = entry >= 0 ? toObject(entry, charBase + kcr(id, cs + i), tokBase + ktr(id, cs + i)) + children[i] = entry >= 0 ? 
toObjectCore(entry, charBase + kcr(id, cs + i), tokBase + ktr(id, cs + i)) : { tokenType: leafTokenType(entry, tokBase), offset: toff(tokBase + ((~entry) >>> 2)), end: tend(tokBase + ((~entry) >>> 2)) }; } return { rule: RULE_NAMES[rowRule[id]], children, offset: charBase, end: charBase + rowLen[id] }; @@ -2828,6 +2828,76 @@ function trySurgery(dmgA, dmgB, tokD, chrD) { let altK = null, altT = null, altOff = null, altEnd = null, altFl = null, altDp = null, altPd = null; let altCap = 0; let altN = 0; // old-stream token count while a window lex runs (lexCore's resync bound) + +// ── Documents: the per-document state set behind the handle API ── +// The module-level variables above are the ACTIVE REGISTER SET — the hot paths +// never indirect through an object. A document object stores the same 51 fields; +// activate() lazily swaps: the active doc's object may be stale while the module +// variables are the truth, and is written back only when another doc activates. +// Per-PARSE transients (pos/maxPos/scratch/adopt*/surg*) reset on every entry and +// are shared safely. 
+function makeDoc() { + return { + tkK: new tkK.constructor(4096), tkT: new tkT.constructor(4096), + tkOff: new Int32Array(4096), tkEnd: new Int32Array(4096), tkFl: new Uint8Array(4096), + tkDp: new Uint8Array(4096), tkPd: new Uint16Array(4096), + tkCap: 4096, tokN: 0, src: '', srcLenP1: 1, negFrom: 0x7fffffff, + rowRule: new Uint16Array(8192), rowLen: new Int32Array(8192), rowTokLen: new Int32Array(8192), + rowStart: new Int32Array(8192), rowCount: new Int32Array(8192), rowExt: new Int32Array(8192), + rowOK: new Uint8Array(8192), rowKC: new Uint8Array(8192), + rowNF: new Int32Array(8192).fill(0x7fffffff), + absChar: new Int32Array(8192), absTok: new Int32Array(8192), + rowCap: 8192, nodeN: 0, + kids: new Int32Array(16384), kidRel: new Int32Array(16384), kidTokRel: new Int32Array(16384), + kidCap: 16384, kidN: 0, + memoNode: [], memoEnd: [], memoExt: [], memoGen: [], memoGenCur: 0, + lastSrc: null, rootCharBase: 0, rootTokBase: 0, lastRoot: -1, lastRootTok: 0, +${e.soa ? ' parenCachePos: -1, parenCacheStack: [],' : ''} + altK: null, altT: null, altOff: null, altEnd: null, altFl: null, altDp: null, altPd: null, + altCap: 0, altN: 0, + }; +} +function saveDoc(d) { + d.tkK = tkK; d.tkT = tkT; d.tkOff = tkOff; d.tkEnd = tkEnd; d.tkFl = tkFl; + d.tkDp = tkDp; d.tkPd = tkPd; d.tkCap = tkCap; d.tokN = tokN; d.src = src; + d.srcLenP1 = srcLenP1; d.negFrom = negFrom; + d.rowRule = rowRule; d.rowLen = rowLen; d.rowTokLen = rowTokLen; d.rowStart = rowStart; + d.rowCount = rowCount; d.rowExt = rowExt; d.rowOK = rowOK; d.rowKC = rowKC; d.rowNF = rowNF; + d.absChar = absChar; d.absTok = absTok; d.rowCap = rowCap; d.nodeN = nodeN; + d.kids = kids; d.kidRel = kidRel; d.kidTokRel = kidTokRel; d.kidCap = kidCap; d.kidN = kidN; + d.memoNode = memoNode; d.memoEnd = memoEnd; d.memoExt = memoExt; d.memoGen = memoGen; + d.memoGenCur = memoGenCur; + d.lastSrc = lastSrc; d.rootCharBase = rootCharBase; d.rootTokBase = rootTokBase; + d.lastRoot = lastRoot; d.lastRootTok = lastRootTok; 
+${e.soa ? ' d.parenCachePos = parenCachePos; d.parenCacheStack = parenCacheStack;' : ''} + d.altK = altK; d.altT = altT; d.altOff = altOff; d.altEnd = altEnd; d.altFl = altFl; + d.altDp = altDp; d.altPd = altPd; d.altCap = altCap; d.altN = altN; +} +function loadDoc(d) { + tkK = d.tkK; tkT = d.tkT; tkOff = d.tkOff; tkEnd = d.tkEnd; tkFl = d.tkFl; + tkDp = d.tkDp; tkPd = d.tkPd; tkCap = d.tkCap; tokN = d.tokN; src = d.src; + srcLenP1 = d.srcLenP1; negFrom = d.negFrom; + rowRule = d.rowRule; rowLen = d.rowLen; rowTokLen = d.rowTokLen; rowStart = d.rowStart; + rowCount = d.rowCount; rowExt = d.rowExt; rowOK = d.rowOK; rowKC = d.rowKC; rowNF = d.rowNF; + absChar = d.absChar; absTok = d.absTok; rowCap = d.rowCap; nodeN = d.nodeN; + kids = d.kids; kidRel = d.kidRel; kidTokRel = d.kidTokRel; kidCap = d.kidCap; kidN = d.kidN; + memoNode = d.memoNode; memoEnd = d.memoEnd; memoExt = d.memoExt; memoGen = d.memoGen; + memoGenCur = d.memoGenCur; + lastSrc = d.lastSrc; rootCharBase = d.rootCharBase; rootTokBase = d.rootTokBase; + lastRoot = d.lastRoot; lastRootTok = d.lastRootTok; +${e.soa ? ' parenCachePos = d.parenCachePos; parenCacheStack = d.parenCacheStack;' : ''} + altK = d.altK; altT = d.altT; altOff = d.altOff; altEnd = d.altEnd; altFl = d.altFl; + altDp = d.altDp; altPd = d.altPd; altCap = d.altCap; altN = d.altN; +} +const docDefault = makeDoc(); +let curDoc = docDefault; +loadDoc(docDefault); +function activate(d) { + if (d === curDoc) return; + saveDoc(curDoc); + loadDoc(d); + curDoc = d; +} function swapBuffers() { let x; x = tkK; tkK = altK; altK = x; @@ -2841,7 +2911,7 @@ function swapBuffers() { } ${e.soa ? '' : 'let altText = [];'} -export function parse(source, entryRule) { +function parseCore(source, entryRule) { lastSrc = null; adoptRoot = -1; adoptRunPos = -1; @@ -2877,8 +2947,8 @@ export function parse(source, entryRule) { // until then. 
Lexing is FULL-FILE by design: the lexer carries cross-token state // (template nesting, regex context, markup modes), full lexing is a small share of a // parse, and the diff is what localizes the damage — not the lexer. -export function parseEdited(source, entryRule, edits) { - if (lastSrc === null) return parse(source, entryRule); +function editCore(source, entryRule, edits) { + if (lastSrc === null) return parseCore(source, entryRule); const oSrc = lastSrc; lastSrc = null; ${e.soa ? String.raw` // ── M1: WINDOWED re-lex ── @@ -3044,6 +3114,48 @@ ${e.soa ? String.raw` // ── M1: WINDOWED re-lex ── } export { tokenize }; -export function createParser() { return { parse, parseEdited, tree, visit, toObject, tokenize }; } +// ── Module-level API: the DEFAULT document (one shared session; tokenize and the +// raw tree/tokenAt views read the ACTIVE doc — they are gate/debug surfaces) ── +export function parse(source, entryRule) { activate(docDefault); return parseCore(source, entryRule); } +export function parseEdited(source, entryRule, edits) { activate(docDefault); return editCore(source, entryRule, edits); } +export function visit(entry, fns, charBase, tokBase) { activate(docDefault); return visitCore(entry, fns, charBase, tokBase); } +export function toObject(id, charBase, tokBase) { activate(docDefault); return toObjectCore(id, charBase, tokBase); } +// ── Handle API: explicit trees over per-instance documents ── +// const p = createParser(); const cst = p.parse(text); const cst2 = p.edit(cst, next[, edits]); +// Trees are edited IN PLACE (node surgery): an edit invalidates every earlier handle +// of this parser — using one throws instead of silently reading a mutated tree. A +// REJECTED edit (parse error) leaves the previous handle valid; the next edit falls +// back to a full re-parse internally. 
+export function createParser() { + const d = makeDoc(); + let gen = 0; + let entryUsed; + const chk = (cst) => { + if (cst === null || cst === undefined || cst.d !== d) throw new Error('foreign tree handle: it belongs to another parser instance'); + if (cst.gen !== gen) throw new Error('stale tree handle: trees are edited in place - use the handle returned by the latest parse/edit'); + }; + const view = {}; + for (const k of Object.keys(tree)) { + const f = tree[k]; + view[k] = (a, b) => { activate(d); return f(a, b); }; + } + return { + parse(source, entryRule) { + activate(d); + entryUsed = entryRule; + const root = parseCore(source, entryRule); + return { d, gen: ++gen, root }; + }, + edit(cst, source, edits) { + chk(cst); + activate(d); + const root = editCore(source, entryUsed, edits); + return { d, gen: ++gen, root }; + }, + visit(cst, fns) { chk(cst); activate(d); return visitCore(cst.root, fns); }, + toObject(cst) { chk(cst); activate(d); return toObjectCore(cst.root); }, + tree: view, + }; +} `); } diff --git a/src/gen-parser.ts b/src/gen-parser.ts index f56f405..830f819 100644 --- a/src/gen-parser.ts +++ b/src/gen-parser.ts @@ -1482,7 +1482,10 @@ export function createParser(grammar: CstGrammar) { } } - return { parse, tokenize, profCounts }; + // API parity with the emitted engine's handle surface: the interpreter builds + // immutable object trees, so edit() is a full re-parse (no reuse, no staleness). 
+ const edit = (_cst: unknown, source: string) => parse(source); + return { parse, edit, tokenize, profCounts }; } // ── Helpers ── diff --git a/test/check.ts b/test/check.ts index a0f18e4..8754566 100644 --- a/test/check.ts +++ b/test/check.ts @@ -22,6 +22,7 @@ const GATES: Gate[] = [ { group: 'conformance', name: 'ts-ast-structure', args: ['test/ts-ast-verify.ts'] }, { group: 'core', name: 'cst-match-totality', args: ['test/cst-match-totality.ts'] }, { group: 'core', name: 'incremental-verify', args: ['test/incremental-verify.ts'] }, + { group: 'core', name: 'multi-doc', args: ['test/multi-doc.ts'] }, { group: 'core', name: 'issue-cases', args: ['test/test-issues.ts'] }, { group: 'conformance', name: 'js', args: ['test/js-conformance.ts'] }, { group: 'conformance', name: 'tsx', args: ['test/tsx-conformance.ts'] }, diff --git a/test/incremental-verify.ts b/test/incremental-verify.ts index f6f8838..e452c07 100644 --- a/test/incremental-verify.ts +++ b/test/incremental-verify.ts @@ -13,12 +13,18 @@ const grammar = (await import('../typescript.ts')).default; const emPath = '/tmp/emitted-incremental.mjs'; writeFileSync(emPath, emitParser(grammar)); type Edit = { start: number; oldEnd: number; newEnd: number }; +type Cst = { root: number }; +type Parser = { + parse(s: string): Cst; + edit(cst: Cst, s: string, edits?: Edit[]): Cst; + toObject(cst: Cst): unknown; +}; type Em = { parse(s: string): number; - parseEdited(s: string, entryRule?: string, edits?: Edit[]): number; toObject(id: number): unknown; + createParser(): Parser; }; -const session = (await import(emPath + '?session=' + process.pid)) as Em; +const session = ((await import(emPath + '?session=' + process.pid)) as Em).createParser(); const fresh = (await import(emPath + '?fresh=' + process.pid)) as Em; // Deterministic LCG so failures replay. 
@@ -71,13 +77,47 @@ const FILES = [ ].filter(existsSync); const STEPS = 30; +// ── Adversarial boundary edits (deterministic) ── +// The fixed-seed random sessions MISSED the restart-anchor abutment hole (a token +// ending exactly at the damage start can be EXTENDED under maximal munch — 'b'+'x' +// = 'bx', '='+'=' = '==', deleting a gap glues neighbours). These cases pin the +// strict-< restart anchor; every one must match fresh (tree or reject) exactly. +const GLUE: Array<[string, string]> = [ + ['const a = 1;\nconst b = 2;\n', 'const a = 1;\nconst bx = 2;\n'], + ['let a = b; let c = 1;\n', 'let a = b1; let c = 1;\n'], + ['if (a = b) { f(); }\n', 'if (a == b) { f(); }\n'], + ['const x = a b;\n', 'const x = ab;\n'], + ['const q = w / 2;\n', 'const q = w /= 2;\n'], + ['const t = a + b;\n', 'const t = a ++ b;\n'], + ['const u = x(z);\n', 'const u = x>(z);\n'], + ['f(a, b);\ng(c);\n', 'f(a, bc);\ng(c);\n'], +]; + let steps = 0, equal = 0, bothReject = 0, mismatch = 0; let tInc = 0, tFresh = 0; const failures: string[] = []; +for (const [base, edited] of GLUE) { + steps++; + let c0 = session.parse(base); + let fe: string | null = null, ie: string | null = null; + let fr = -1, ic: Cst | null = null; + try { fr = fresh.parse(edited); } catch (e) { fe = (e as Error).message; } + try { ic = session.edit(c0, edited); } catch (e) { ie = (e as Error).message; } + if (fe !== null || ie !== null) { + if ((fe === null) !== (ie === null)) { mismatch++; if (failures.length < 5) failures.push(`glue «${edited.slice(0, 30)}»: fresh ${fe ? 'reject' : 'accept'} / incremental ${ie ? 
'reject' : 'accept'}`); } + else bothReject++; + continue; + } + const a = JSON.stringify(fresh.toObject(fr)); + const b = JSON.stringify(session.toObject(ic!)); + if (a === b) equal++; + else { mismatch++; if (failures.length < 5) failures.push(`glue «${edited.slice(0, 30)}»: tree diverges`); } +} + for (const f of FILES) { let text = readFileSync(f, 'utf-8'); - session.parse(text); // open the session + let cst = session.parse(text); // open the session for (let k = 0; k < STEPS; k++) { const { next, edit } = mutate(text); steps++; @@ -85,22 +125,23 @@ for (const f of FILES) { const tf0 = performance.now(); try { freshRoot = fresh.parse(next); } catch (e) { freshErr = (e as Error).message; } const tf1 = performance.now(); - let incRoot = -1, incErr: string | null = null; + let incCst: Cst | null = null, incErr: string | null = null; const useProtocol = k % 2 === 1; // alternate: edits protocol / char-diff fallback const ti0 = performance.now(); - try { incRoot = session.parseEdited(next, undefined, useProtocol ? [edit] : undefined); } catch (e) { incErr = (e as Error).message; } + try { incCst = session.edit(cst, next, useProtocol ? [edit] : undefined); } catch (e) { incErr = (e as Error).message; } const ti1 = performance.now(); if (freshErr !== null || incErr !== null) { if ((freshErr === null) !== (incErr === null)) { mismatch++; if (failures.length < 5) failures.push(`${f.split('/').pop()} step ${k}: fresh ${freshErr ? 'reject' : 'accept'} / incremental ${incErr ? 'reject' : 'accept'}\n fresh: ${freshErr ?? '-'}\n inc: ${incErr ?? 
'-'}`); } else bothReject++; - // rejected text: do not advance the session text (the session reset itself) + // rejected text: the handle stays valid; the session does not advance continue; } + cst = incCst!; tFresh += tf1 - tf0; tInc += ti1 - ti0; const a = JSON.stringify(fresh.toObject(freshRoot)); - const b = JSON.stringify(session.toObject(incRoot)); + const b = JSON.stringify(session.toObject(cst)); if (a === b) equal++; else { mismatch++; diff --git a/test/multi-doc.ts b/test/multi-doc.ts new file mode 100644 index 0000000..299798c --- /dev/null +++ b/test/multi-doc.ts @@ -0,0 +1,104 @@ +// Gate: DOCUMENTS ARE ISOLATED. The handle API (createParser → parse/edit with +// explicit tree handles) keeps one document's state per parser instance behind a +// lazily-swapped register set — a missed swap field shows up as cross-document +// corruption. Two instances edit two different sources interleaved (plus the +// module-level default-doc API mixed in between); every edited tree must be +// byte-identical (toObject) to a fresh parse of the same text. Also pins the +// handle contract: stale and foreign handles throw instead of silently reading +// an in-place-mutated tree, and a REJECTED edit leaves the old handle valid. 
+// +// node test/multi-doc.ts +import { writeFileSync } from 'node:fs'; +import { emitParser } from '../src/emit-parser.ts'; + +const grammar = (await import('../typescript.ts')).default; +const emPath = '/tmp/emitted-multidoc.mjs'; +writeFileSync(emPath, emitParser(grammar)); +type Edit = { start: number; oldEnd: number; newEnd: number }; +type Cst = { root: number }; +type Parser = { parse(s: string): Cst; edit(cst: Cst, s: string, edits?: Edit[]): Cst; toObject(cst: Cst): unknown; visit(cst: Cst, fns: object): void }; +type Em = { parse(s: string): number; toObject(id: number): unknown; createParser(): Parser }; +const em = (await import(emPath + '?v=' + process.pid)) as Em; + +// Two synthetic documents (no corpus dependency — the gate always exercises). +const mk = (tag: string, n: number) => { + let s = ''; + for (let i = 0; i < n; i++) s += `function ${tag}_${i}(a) { if (a > ${i}) { return a * ${i}; } const v_${i} = { x: ${i} }; return v_${i}.x; }\n`; + return s; +}; +let textA = mk('alpha', 400); +let textB = `(function () {\n${mk('beta', 300)}})();\n`; + +let seed = 0x51C0FFEE; +const rand = () => ((seed = (seed * 48271) % 0x7fffffff) / 0x7fffffff); +const randInt = (n: number) => Math.floor(rand() * n); +const INS = ['x', '1', ' + q', '.m', '(/*c*/)', '"s"']; +function mutate(text: string): string { + switch (randInt(3)) { + case 0: { const at = randInt(text.length); return text.slice(0, at) + INS[randInt(INS.length)] + text.slice(at); } + case 1: { const at = randInt(Math.max(1, text.length - 6)); return text.slice(0, at) + text.slice(at + 1 + randInt(4)); } + default: { const at = randInt(Math.max(1, text.length - 1)); return text.slice(0, at) + 'z' + text.slice(at + 1); } + } +} + +const p1 = em.createParser(); +const p2 = em.createParser(); +const f = em.createParser(); +let cstA = p1.parse(textA); +let cstB = p2.parse(textB); + +let steps = 0, equal = 0, bothReject = 0, mismatch = 0; +const failures: string[] = []; +for (let k = 0; k < 60; k++) { + 
const onA = (k & 1) === 0; + const text = onA ? textA : textB; + const next = mutate(text); + steps++; + let fe: string | null = null, ie: string | null = null; + let fc: Cst | null = null, ic: Cst | null = null; + try { fc = f.parse(next); } catch (e) { fe = (e as Error).message; } + try { ic = (onA ? p1 : p2).edit(onA ? cstA : cstB, next); } catch (e) { ie = (e as Error).message; } + if (fe !== null || ie !== null) { + if ((fe === null) !== (ie === null)) { mismatch++; if (failures.length < 5) failures.push(`step ${k} (${onA ? 'A' : 'B'}): fresh ${fe ? 'reject' : 'accept'} / edit ${ie ? 'reject' : 'accept'}`); } + else bothReject++; + continue; + } + // mix the module-level default doc in between: it must not disturb either instance + if (k % 5 === 0) em.parse('const mix = ' + k + ';'); + const a = JSON.stringify(f.toObject(fc!)); + const b = JSON.stringify((onA ? p1 : p2).toObject(ic!)); + if (a === b) equal++; + else { + mismatch++; + if (failures.length < 5) { + let i = 0; while (i < a.length && a[i] === b[i]) i++; + failures.push(`step ${k} (${onA ? 'A' : 'B'}): tree diverges @${i}`); + } + } + if (onA) { textA = next; cstA = ic!; } else { textB = next; cstB = ic!; } +} + +// handle contract +let contract = 0; +{ + const p = em.createParser(); + const c1 = p.parse('const a = 1;'); + const c2 = p.edit(c1, 'const ab = 1;'); + try { p.edit(c1, 'const x = 2;'); failures.push('stale handle did not throw'); } catch { contract++; } + try { p.toObject(c1); failures.push('stale toObject did not throw'); } catch { contract++; } + try { p2.edit(c2, 'const y = 3;'); failures.push('foreign handle did not throw'); } catch { contract++; } + // a rejected edit leaves the handle valid + let rejected = false; + try { p.edit(c2, 'const ] = ;'); } catch { rejected = true; } + const c3 = rejected ? 
p.edit(c2, 'const ab = 12;') : null; + if (!rejected || c3 === null) failures.push('reject-then-edit flow broke'); + else contract++; +} + +console.log(`multi-doc: ${equal} equal · ${bothReject} both-reject · ${mismatch} MISMATCH (${steps} interleaved steps) · contract ${contract}/4`); +for (const s of failures) console.log(' ✗ ' + s); +if (mismatch > 0 || contract !== 4 || failures.length > 0) { + console.error('✗ document isolation / handle contract violated'); + process.exit(1); +} +console.log('✓ documents are isolated; handles enforce the in-place-edit contract'); From 70064d24bd47662f1fe978f13060912680c0835d Mon Sep 17 00:00:00 2001 From: Johnson Chu Date: Thu, 11 Jun 2026 07:08:38 +0800 Subject: [PATCH 12/15] edit() mutates the handle in place and returns nothing; reject-safe contract Returning a new handle from edit() read like value semantics - as if the old cst survived and edit produced a clone. There is no clone: surgery mutates the tree in place. The handle is now the STABLE IDENTITY of the document's tree: p.edit(cst, next) updates cst.root and returns void; the same reference always reads the current tree. Making that honest exposed two reject holes the contract tests pinned: - A rejected edit had already spliced the token columns to the rejected text (the splice precedes the parse attempt), so the kept tree's leaf spans read corrupted data. The reject path now restores the columns by re-lexing the LIVE tree's source (treeSrc, which unlike lastSrc survives rejects) - O(n) on the reject path only; #39's recovery mode is what makes rejects rare. - The full-parse fallback inside edit (after a previous reject) went through parseCore, whose arena reset destroys the live tree BEFORE knowing whether the new text parses. edit now falls back in APPEND mode; parse() is the only compaction point - and since its reset happens before its outcome is known, parse() bumps the generation on entry: old handles die when a document is re-opened, success or not. 
Handle contract (gated, 5/5): in-place edit updates the same handle; a rejected edit throws and keeps the handle on the previous tree (readable); foreign handles throw; re-opening via parse() - including a REJECTING parse() - invalidates prior handles. The interpreter's edit() mirrors the in-place semantics by replacing the tree object's fields. 31/31 gates, incremental-verify 128 steps 0 mismatch, parity 0 mismatches, handle-API keystroke median 0.028ms. --- src/emit-parser.ts | 67 ++++++++++++++++++++++++++++++-------- src/gen-parser.ts | 11 +++++-- test/incremental-verify.ts | 17 +++++----- test/multi-doc.ts | 43 ++++++++++++++---------- 4 files changed, 96 insertions(+), 42 deletions(-) diff --git a/src/emit-parser.ts b/src/emit-parser.ts index 2991fb2..50f18ed 100644 --- a/src/emit-parser.ts +++ b/src/emit-parser.ts @@ -2394,6 +2394,10 @@ function runParse(entryRule) { // null whenever the module state is not a coherent snapshot (no parse yet, or the last // attempt threw), so parseEdited falls back to a full parse. let lastSrc = null; +// Source text of the LIVE tree (unlike lastSrc it survives a rejected edit): the +// reject path restores the token columns to it so the handle keeps reading the +// previous tree; only a successful parse/edit moves it. +let treeSrc = null; // the LAST parse root's absolute coordinates (the descent origin — see visit/toObject) let rootCharBase = 0; let rootTokBase = 0; @@ -2851,7 +2855,7 @@ function makeDoc() { kids: new Int32Array(16384), kidRel: new Int32Array(16384), kidTokRel: new Int32Array(16384), kidCap: 16384, kidN: 0, memoNode: [], memoEnd: [], memoExt: [], memoGen: [], memoGenCur: 0, - lastSrc: null, rootCharBase: 0, rootTokBase: 0, lastRoot: -1, lastRootTok: 0, + lastSrc: null, treeSrc: null, rootCharBase: 0, rootTokBase: 0, lastRoot: -1, lastRootTok: 0, ${e.soa ? 
' parenCachePos: -1, parenCacheStack: [],' : ''} altK: null, altT: null, altOff: null, altEnd: null, altFl: null, altDp: null, altPd: null, altCap: 0, altN: 0, @@ -2867,7 +2871,7 @@ function saveDoc(d) { d.kids = kids; d.kidRel = kidRel; d.kidTokRel = kidTokRel; d.kidCap = kidCap; d.kidN = kidN; d.memoNode = memoNode; d.memoEnd = memoEnd; d.memoExt = memoExt; d.memoGen = memoGen; d.memoGenCur = memoGenCur; - d.lastSrc = lastSrc; d.rootCharBase = rootCharBase; d.rootTokBase = rootTokBase; + d.lastSrc = lastSrc; d.treeSrc = treeSrc; d.rootCharBase = rootCharBase; d.rootTokBase = rootTokBase; d.lastRoot = lastRoot; d.lastRootTok = lastRootTok; ${e.soa ? ' d.parenCachePos = parenCachePos; d.parenCacheStack = parenCacheStack;' : ''} d.altK = altK; d.altT = altT; d.altOff = altOff; d.altEnd = altEnd; d.altFl = altFl; @@ -2883,7 +2887,7 @@ function loadDoc(d) { kids = d.kids; kidRel = d.kidRel; kidTokRel = d.kidTokRel; kidCap = d.kidCap; kidN = d.kidN; memoNode = d.memoNode; memoEnd = d.memoEnd; memoExt = d.memoExt; memoGen = d.memoGen; memoGenCur = d.memoGenCur; - lastSrc = d.lastSrc; rootCharBase = d.rootCharBase; rootTokBase = d.rootTokBase; + lastSrc = d.lastSrc; treeSrc = d.treeSrc; rootCharBase = d.rootCharBase; rootTokBase = d.rootTokBase; lastRoot = d.lastRoot; lastRootTok = d.lastRootTok; ${e.soa ? ' parenCachePos = d.parenCachePos; parenCacheStack = d.parenCacheStack;' : ''} altK = d.altK; altT = d.altT; altOff = d.altOff; altEnd = d.altEnd; altFl = d.altFl; @@ -2929,6 +2933,7 @@ function parseCore(source, entryRule) { lastRoot = root; lastRootTok = rootTokBase; lastSrc = source; + treeSrc = source; return root; } @@ -2948,7 +2953,40 @@ function parseCore(source, entryRule) { // (template nesting, regex context, markup modes), full lexing is a small share of a // parse, and the diff is what localizes the damage — not the lexer. 
function editCore(source, entryRule, edits) { - if (lastSrc === null) return parseCore(source, entryRule); + try { + return editCoreRun(source, entryRule, edits); + } catch (e) { + // REJECTED edit: the splice (and any '>' splits of the failed attempt) already + // rewrote the token columns to the rejected text, and the append-mode fallback + // may have grown the arena — but the live tree's ROWS are untouched. Re-lexing + // the live tree's source restores every read path (leaf spans, visit, next + // edit's restart anchors); O(n) on the reject path only. + if (treeSrc !== null) { lexInto(treeSrc); lastSrc = null; } + throw e; + } +} +function editCoreRun(source, entryRule, edits) { + if (lastSrc === null) { + // No coherent edit base (a previous attempt rejected): full re-parse in APPEND + // mode — parseCore would reset the arena and destroy the live tree the handle + // still exposes if THIS parse rejects too. parse() is the only compaction point. + lexInto(source); + if (memoEnd.length !== MEMO_RULES) { + memoNode = new Array(MEMO_RULES); + memoEnd = new Array(MEMO_RULES); + memoExt = new Array(MEMO_RULES); + memoGen = new Array(MEMO_RULES); + } + memoGenCur++; + adoptRoot = -1; + adoptRunPos = -1; + const root = runParse(entryRule); + lastRoot = root; + lastRootTok = rootTokBase; + lastSrc = source; + treeSrc = source; + return root; + } const oSrc = lastSrc; lastSrc = null; ${e.soa ? String.raw` // ── M1: WINDOWED re-lex ── @@ -3103,6 +3141,7 @@ ${e.soa ? String.raw` // ── M1: WINDOWED re-lex ── lastRoot = sroot; lastRootTok = adoptRootTok; lastSrc = source; + treeSrc = source; return sroot; } const root = runParse(entryRule); @@ -3110,6 +3149,7 @@ ${e.soa ? 
String.raw` // ── M1: WINDOWED re-lex ── lastRoot = root; lastRootTok = rootTokBase; lastSrc = source; + treeSrc = source; return root; } @@ -3121,18 +3161,19 @@ export function parseEdited(source, entryRule, edits) { activate(docDefault); re export function visit(entry, fns, charBase, tokBase) { activate(docDefault); return visitCore(entry, fns, charBase, tokBase); } export function toObject(id, charBase, tokBase) { activate(docDefault); return toObjectCore(id, charBase, tokBase); } // ── Handle API: explicit trees over per-instance documents ── -// const p = createParser(); const cst = p.parse(text); const cst2 = p.edit(cst, next[, edits]); -// Trees are edited IN PLACE (node surgery): an edit invalidates every earlier handle -// of this parser — using one throws instead of silently reading a mutated tree. A -// REJECTED edit (parse error) leaves the previous handle valid; the next edit falls -// back to a full re-parse internally. +// const p = createParser(); const cst = p.parse(text); p.edit(cst, next[, edits]); +// The handle is the STABLE IDENTITY of this document's tree: edit() mutates it in +// place (node surgery) and returns nothing — a return value would read as a clone, +// and there is none. A REJECTED edit (parse error) throws and leaves the handle on +// the previous tree; the next edit falls back to a full re-parse internally. Only +// parse() re-opening the document invalidates old handles (they throw). 
export function createParser() { const d = makeDoc(); let gen = 0; let entryUsed; const chk = (cst) => { if (cst === null || cst === undefined || cst.d !== d) throw new Error('foreign tree handle: it belongs to another parser instance'); - if (cst.gen !== gen) throw new Error('stale tree handle: trees are edited in place - use the handle returned by the latest parse/edit'); + if (cst.gen !== gen) throw new Error('stale tree handle: parse() re-opened this document - use the handle from the latest parse()'); }; const view = {}; for (const k of Object.keys(tree)) { @@ -3143,14 +3184,14 @@ export function createParser() { parse(source, entryRule) { activate(d); entryUsed = entryRule; + gen++; // re-opening resets the arena: old handles die even if THIS parse rejects const root = parseCore(source, entryRule); - return { d, gen: ++gen, root }; + return { d, gen, root }; }, edit(cst, source, edits) { chk(cst); activate(d); - const root = editCore(source, entryUsed, edits); - return { d, gen: ++gen, root }; + cst.root = editCore(source, entryUsed, edits); }, visit(cst, fns) { chk(cst); activate(d); return visitCore(cst.root, fns); }, toObject(cst) { chk(cst); activate(d); return toObjectCore(cst.root); }, diff --git a/src/gen-parser.ts b/src/gen-parser.ts index 830f819..66b09c2 100644 --- a/src/gen-parser.ts +++ b/src/gen-parser.ts @@ -1482,9 +1482,14 @@ export function createParser(grammar: CstGrammar) { } } - // API parity with the emitted engine's handle surface: the interpreter builds - // immutable object trees, so edit() is a full re-parse (no reuse, no staleness). - const edit = (_cst: unknown, source: string) => parse(source); + // API parity with the emitted engine's handle surface: edit() re-parses and + // updates the SAME tree object in place (the handle is the document's tree — + // edit returns nothing, exactly like the emitted engine; no reuse here). 
+ const edit = (cst: { rule: string; children: unknown[]; offset: number; end: number }, source: string): void => { + const next = parse(source) as typeof cst; + cst.rule = next.rule; cst.children = next.children; + cst.offset = next.offset; cst.end = next.end; + }; return { parse, edit, tokenize, profCounts }; } diff --git a/test/incremental-verify.ts b/test/incremental-verify.ts index e452c07..241f9c1 100644 --- a/test/incremental-verify.ts +++ b/test/incremental-verify.ts @@ -16,7 +16,7 @@ type Edit = { start: number; oldEnd: number; newEnd: number }; type Cst = { root: number }; type Parser = { parse(s: string): Cst; - edit(cst: Cst, s: string, edits?: Edit[]): Cst; + edit(cst: Cst, s: string, edits?: Edit[]): void; toObject(cst: Cst): unknown; }; type Em = { @@ -99,18 +99,18 @@ const failures: string[] = []; for (const [base, edited] of GLUE) { steps++; - let c0 = session.parse(base); + const c0 = session.parse(base); let fe: string | null = null, ie: string | null = null; - let fr = -1, ic: Cst | null = null; + let fr = -1; try { fr = fresh.parse(edited); } catch (e) { fe = (e as Error).message; } - try { ic = session.edit(c0, edited); } catch (e) { ie = (e as Error).message; } + try { session.edit(c0, edited); } catch (e) { ie = (e as Error).message; } if (fe !== null || ie !== null) { if ((fe === null) !== (ie === null)) { mismatch++; if (failures.length < 5) failures.push(`glue «${edited.slice(0, 30)}»: fresh ${fe ? 'reject' : 'accept'} / incremental ${ie ? 
'reject' : 'accept'}`); } else bothReject++; continue; } const a = JSON.stringify(fresh.toObject(fr)); - const b = JSON.stringify(session.toObject(ic!)); + const b = JSON.stringify(session.toObject(c0)); if (a === b) equal++; else { mismatch++; if (failures.length < 5) failures.push(`glue «${edited.slice(0, 30)}»: tree diverges`); } } @@ -125,20 +125,19 @@ for (const f of FILES) { const tf0 = performance.now(); try { freshRoot = fresh.parse(next); } catch (e) { freshErr = (e as Error).message; } const tf1 = performance.now(); - let incCst: Cst | null = null, incErr: string | null = null; + let incErr: string | null = null; const useProtocol = k % 2 === 1; // alternate: edits protocol / char-diff fallback const ti0 = performance.now(); - try { incCst = session.edit(cst, next, useProtocol ? [edit] : undefined); } catch (e) { incErr = (e as Error).message; } + try { session.edit(cst, next, useProtocol ? [edit] : undefined); } catch (e) { incErr = (e as Error).message; } const ti1 = performance.now(); if (freshErr !== null || incErr !== null) { if ((freshErr === null) !== (incErr === null)) { mismatch++; if (failures.length < 5) failures.push(`${f.split('/').pop()} step ${k}: fresh ${freshErr ? 'reject' : 'accept'} / incremental ${incErr ? 'reject' : 'accept'}\n fresh: ${freshErr ?? '-'}\n inc: ${incErr ?? 
'-'}`); } else bothReject++; - // rejected text: the handle stays valid; the session does not advance + // rejected text: the handle stays on the previous tree; do not advance continue; } - cst = incCst!; tFresh += tf1 - tf0; tInc += ti1 - ti0; const a = JSON.stringify(fresh.toObject(freshRoot)); const b = JSON.stringify(session.toObject(cst)); diff --git a/test/multi-doc.ts b/test/multi-doc.ts index 299798c..111fc6c 100644 --- a/test/multi-doc.ts +++ b/test/multi-doc.ts @@ -16,7 +16,7 @@ const emPath = '/tmp/emitted-multidoc.mjs'; writeFileSync(emPath, emitParser(grammar)); type Edit = { start: number; oldEnd: number; newEnd: number }; type Cst = { root: number }; -type Parser = { parse(s: string): Cst; edit(cst: Cst, s: string, edits?: Edit[]): Cst; toObject(cst: Cst): unknown; visit(cst: Cst, fns: object): void }; +type Parser = { parse(s: string): Cst; edit(cst: Cst, s: string, edits?: Edit[]): void; toObject(cst: Cst): unknown; visit(cst: Cst, fns: object): void }; type Em = { parse(s: string): number; toObject(id: number): unknown; createParser(): Parser }; const em = (await import(emPath + '?v=' + process.pid)) as Em; @@ -55,9 +55,9 @@ for (let k = 0; k < 60; k++) { const next = mutate(text); steps++; let fe: string | null = null, ie: string | null = null; - let fc: Cst | null = null, ic: Cst | null = null; + let fc: Cst | null = null; try { fc = f.parse(next); } catch (e) { fe = (e as Error).message; } - try { ic = (onA ? p1 : p2).edit(onA ? cstA : cstB, next); } catch (e) { ie = (e as Error).message; } + try { (onA ? p1 : p2).edit(onA ? cstA : cstB, next); } catch (e) { ie = (e as Error).message; } if (fe !== null || ie !== null) { if ((fe === null) !== (ie === null)) { mismatch++; if (failures.length < 5) failures.push(`step ${k} (${onA ? 'A' : 'B'}): fresh ${fe ? 'reject' : 'accept'} / edit ${ie ? 
'reject' : 'accept'}`); } else bothReject++; @@ -66,7 +66,7 @@ for (let k = 0; k < 60; k++) { // mix the module-level default doc in between: it must not disturb either instance if (k % 5 === 0) em.parse('const mix = ' + k + ';'); const a = JSON.stringify(f.toObject(fc!)); - const b = JSON.stringify((onA ? p1 : p2).toObject(ic!)); + const b = JSON.stringify((onA ? p1 : p2).toObject(onA ? cstA : cstB)); if (a === b) equal++; else { mismatch++; @@ -75,29 +75,38 @@ for (let k = 0; k < 60; k++) { failures.push(`step ${k} (${onA ? 'A' : 'B'}): tree diverges @${i}`); } } - if (onA) { textA = next; cstA = ic!; } else { textB = next; cstB = ic!; } + if (onA) textA = next; else textB = next; } -// handle contract +// handle contract: edit mutates the handle IN PLACE (no return — no clone illusion); +// only parse() re-opening the document invalidates old handles; rejects keep the tree. let contract = 0; { const p = em.createParser(); const c1 = p.parse('const a = 1;'); - const c2 = p.edit(c1, 'const ab = 1;'); - try { p.edit(c1, 'const x = 2;'); failures.push('stale handle did not throw'); } catch { contract++; } - try { p.toObject(c1); failures.push('stale toObject did not throw'); } catch { contract++; } - try { p2.edit(c2, 'const y = 3;'); failures.push('foreign handle did not throw'); } catch { contract++; } - // a rejected edit leaves the handle valid + const before = JSON.stringify(p.toObject(c1)); + p.edit(c1, 'const ab = 1;'); + const after = JSON.stringify(p.toObject(c1)); + if (after !== before && after.includes('"end":8')) contract++; // same handle, new tree + else failures.push('in-place edit did not update the handle'); + try { p2.edit(c1, 'const y = 3;'); failures.push('foreign handle did not throw'); } catch { contract++; } let rejected = false; - try { p.edit(c2, 'const ] = ;'); } catch { rejected = true; } - const c3 = rejected ? 
p.edit(c2, 'const ab = 12;') : null; - if (!rejected || c3 === null) failures.push('reject-then-edit flow broke'); - else contract++; + try { p.edit(c1, 'const ] = ;'); } catch { rejected = true; } + if (rejected && JSON.stringify(p.toObject(c1)) === after) contract++; // reject keeps the tree + else failures.push('reject-then-read flow broke'); + const c2 = p.parse('let q = 1;'); + try { p.toObject(c1); failures.push('re-opened document: old handle did not throw'); } catch { contract++; } + // a REJECTING parse() resets the arena too — it must invalidate prior handles + try { p.parse('const ] = ;'); } catch { /* expected reject */ } + let dead = false; + try { p.toObject(c2); } catch { dead = true; } + if (dead) contract++; + else failures.push('rejecting parse() left the old handle readable over a reset arena'); } -console.log(`multi-doc: ${equal} equal · ${bothReject} both-reject · ${mismatch} MISMATCH (${steps} interleaved steps) · contract ${contract}/4`); +console.log(`multi-doc: ${equal} equal · ${bothReject} both-reject · ${mismatch} MISMATCH (${steps} interleaved steps) · contract ${contract}/5`); for (const s of failures) console.log(' ✗ ' + s); -if (mismatch > 0 || contract !== 4 || failures.length > 0) { +if (mismatch > 0 || contract !== 5 || failures.length > 0) { console.error('✗ document isolation / handle contract violated'); process.exit(1); } From ed5941e4892a778cde22163e1bb48a85095fd151 Mon Sep 17 00:00:00 2001 From: Johnson Chu Date: Thu, 11 Jun 2026 07:17:23 +0800 Subject: [PATCH 13/15] Remove toObject from the engine: visit + tree accessors are the only surface MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The arena design's premise (PR #36) is that parse() hands out a tree to TRAVERSE, not an object tree to materialize - toObject was the materialization back door left on both the module API and the handle API, and its only real consumer was the gate layer's byte-identical JSON comparison. 
That is a test concern: gates now build the comparison object through visit + tree accessors (test/emitted-obj.ts, mirroring the interpreter's object shape and key order exactly, so the emit ≡ interp and incremental ≡ fresh comparisons are unchanged). The unused emitted getText went with it; the interpreter keeps returning its native object trees (that IS its representation, not a conversion). 31/31 gates, emit-parser-verify 0 mismatches, multi-doc contract 5/5, handle-API keystroke median 0.028ms (9MB) / 0.089ms (8MB nested). --- src/emit-parser.ts | 20 ------------------- test/emit-parser-verify.ts | 3 ++- test/emitted-obj.ts | 39 ++++++++++++++++++++++++++++++++++++++ test/incremental-verify.ts | 15 +++++++++------ test/multi-doc.ts | 21 +++++++++++--------- 5 files changed, 62 insertions(+), 36 deletions(-) create mode 100644 test/emitted-obj.ts diff --git a/src/emit-parser.ts b/src/emit-parser.ts index 50f18ed..c39a622 100644 --- a/src/emit-parser.ts +++ b/src/emit-parser.ts @@ -2247,10 +2247,6 @@ export function tokenAt(i) { } // The CST is span-only: a node's text is derived from the source it was parsed from. -export function getText(node, source) { - return source.slice(node.offset, node.end); -} - // ── Arena tree access ── // The arena IS the tree: parse() returns the root node id and consumers traverse // via visit()/the accessors — nothing is materialized on the parse path. All views @@ -2323,20 +2319,6 @@ function visitCore(entry, fns, charBase, tokBase) { } if (fns.leave) fns.leave(entry, charBase, tokBase); } -// Materialize the classic object CST from a node id — a BRIDGE for tests/debugging -// (the byte-identical gate against the interpreter), not a parse-path product. 
-function toObjectCore(id, charBase, tokBase) { - if (charBase === undefined) { charBase = rootCharBase; tokBase = rootTokBase; } - const n = rowCount[id]; - const cs = rowStart[id]; - const children = new Array(n); - for (let i = 0; i < n; i++) { - const entry = kids[cs + i]; - children[i] = entry >= 0 ? toObjectCore(entry, charBase + kcr(id, cs + i), tokBase + ktr(id, cs + i)) - : { tokenType: leafTokenType(entry, tokBase), offset: toff(tokBase + ((~entry) >>> 2)), end: tend(tokBase + ((~entry) >>> 2)) }; - } - return { rule: RULE_NAMES[rowRule[id]], children, offset: charBase, end: charBase + rowLen[id] }; -} // Parse to the ARENA: returns the root node id. function lexInto(source) { @@ -3159,7 +3141,6 @@ export { tokenize }; export function parse(source, entryRule) { activate(docDefault); return parseCore(source, entryRule); } export function parseEdited(source, entryRule, edits) { activate(docDefault); return editCore(source, entryRule, edits); } export function visit(entry, fns, charBase, tokBase) { activate(docDefault); return visitCore(entry, fns, charBase, tokBase); } -export function toObject(id, charBase, tokBase) { activate(docDefault); return toObjectCore(id, charBase, tokBase); } // ── Handle API: explicit trees over per-instance documents ── // const p = createParser(); const cst = p.parse(text); p.edit(cst, next[, edits]); // The handle is the STABLE IDENTITY of this document's tree: edit() mutates it in @@ -3194,7 +3175,6 @@ export function createParser() { cst.root = editCore(source, entryUsed, edits); }, visit(cst, fns) { chk(cst); activate(d); return visitCore(cst.root, fns); }, - toObject(cst) { chk(cst); activate(d); return toObjectCore(cst.root); }, tree: view, }; } diff --git a/test/emit-parser-verify.ts b/test/emit-parser-verify.ts index 2269874..c7c2732 100644 --- a/test/emit-parser-verify.ts +++ b/test/emit-parser-verify.ts @@ -9,6 +9,7 @@ // node test/emit-parser-verify.ts # 4 bench files + ~400-file corpus sample // node 
test/emit-parser-verify.ts # sample stride N (default ~ to hit ~400) // node test/emit-parser-verify.ts all # every .ts file under conformance +import { objectify } from './emitted-obj.ts'; import { createParser } from '../src/gen-parser.ts'; import { emitParser } from '../src/emit-parser.ts'; import { readdir } from 'fs/promises'; @@ -41,7 +42,7 @@ function compare(code: string): { verdict: string; detail?: string } { const o = run(oracle.parse, code); // The emitted parser returns an arena node id; materialize the object view for the // byte-identical comparison against the interpreter's object tree. - const e = run((s: string) => emitted.toObject(emitted.parse(s)), code); + const e = run((s: string) => { const r = emitted.parse(s); return objectify(emitted.tree, (fns) => emitted.visit(r, fns)); }, code); if (!o.ok && o.err.includes('Maximum call stack')) { // The interpreter recursed out of stack — a CAPACITY limit, not a parse verdict; // the emitted parser's flatter frames can legitimately survive deeper inputs diff --git a/test/emitted-obj.ts b/test/emitted-obj.ts new file mode 100644 index 0000000..cc4c123 --- /dev/null +++ b/test/emitted-obj.ts @@ -0,0 +1,39 @@ +// Materialize an emitted-engine tree as a plain object — TEST-SIDE ONLY. The engine +// deliberately exposes a single consumption surface (visit + tree accessors); full +// materialization is a consumer choice, and the only consumer that needs it is the +// gate layer's byte-identical JSON comparison (incremental ≡ fresh, emit ≡ interp). +// The shape (and KEY ORDER — JSON.stringify equality depends on it) mirrors the +// interpreter's native object trees: nodes { rule, children, offset, end }, leaves +// { tokenType, offset, end }. 
+export interface TreeView { + ruleNameOf(id: number): string; + lenOf(id: number): number; + leafTokenType(entry: number, tokBase: number): string; + leafOffsetOf(entry: number, tokBase: number): number; + leafEndOf(entry: number, tokBase: number): number; +} +type VisitFns = { + enter?(id: number, charBase: number, tokBase: number): boolean | void; + leave?(id: number, charBase: number, tokBase: number): void; + leaf?(entry: number, tok: number): void; +}; +export type ObjNode = { rule: string; children: (ObjNode | ObjLeaf)[]; offset: number; end: number }; +export type ObjLeaf = { tokenType: string; offset: number; end: number }; + +export function objectify(tree: TreeView, runVisit: (fns: VisitFns) => void): ObjNode { + const rootHolder: { children: (ObjNode | ObjLeaf)[] } = { children: [] }; + const stack: { children: (ObjNode | ObjLeaf)[] }[] = [rootHolder]; + runVisit({ + enter(id, charBase) { + const node: ObjNode = { rule: tree.ruleNameOf(id), children: [], offset: charBase, end: charBase + tree.lenOf(id) }; + stack[stack.length - 1].children.push(node); + stack.push(node); + }, + leave() { stack.pop(); }, + leaf(entry, tok) { + const tb = tok - ((~entry) >>> 2); + stack[stack.length - 1].children.push({ tokenType: tree.leafTokenType(entry, tb), offset: tree.leafOffsetOf(entry, tb), end: tree.leafEndOf(entry, tb) }); + }, + }); + return rootHolder.children[0] as ObjNode; +} diff --git a/test/incremental-verify.ts b/test/incremental-verify.ts index 241f9c1..3b1f73a 100644 --- a/test/incremental-verify.ts +++ b/test/incremental-verify.ts @@ -6,6 +6,7 @@ // and the arena growth, so reuse is MEASURED, not assumed. 
// // node test/incremental-verify.ts +import { objectify } from './emitted-obj.ts'; import { existsSync, readFileSync, writeFileSync } from 'node:fs'; import { emitParser } from '../src/emit-parser.ts'; @@ -17,11 +18,13 @@ type Cst = { root: number }; type Parser = { parse(s: string): Cst; edit(cst: Cst, s: string, edits?: Edit[]): void; - toObject(cst: Cst): unknown; + visit(cst: Cst, fns: object): void; + tree: import('./emitted-obj.ts').TreeView; }; type Em = { parse(s: string): number; - toObject(id: number): unknown; + visit(entry: number, fns: object): void; + tree: import('./emitted-obj.ts').TreeView; createParser(): Parser; }; const session = ((await import(emPath + '?session=' + process.pid)) as Em).createParser(); @@ -109,8 +112,8 @@ for (const [base, edited] of GLUE) { else bothReject++; continue; } - const a = JSON.stringify(fresh.toObject(fr)); - const b = JSON.stringify(session.toObject(c0)); + const a = JSON.stringify(objectify(fresh.tree, (fns) => fresh.visit(fr, fns))); + const b = JSON.stringify(objectify(session.tree, (fns) => session.visit(c0, fns))); if (a === b) equal++; else { mismatch++; if (failures.length < 5) failures.push(`glue «${edited.slice(0, 30)}»: tree diverges`); } } @@ -139,8 +142,8 @@ for (const f of FILES) { continue; } tFresh += tf1 - tf0; tInc += ti1 - ti0; - const a = JSON.stringify(fresh.toObject(freshRoot)); - const b = JSON.stringify(session.toObject(cst)); + const a = JSON.stringify(objectify(fresh.tree, (fns) => fresh.visit(freshRoot, fns))); + const b = JSON.stringify(objectify(session.tree, (fns) => session.visit(cst, fns))); if (a === b) equal++; else { mismatch++; diff --git a/test/multi-doc.ts b/test/multi-doc.ts index 111fc6c..1b18f4e 100644 --- a/test/multi-doc.ts +++ b/test/multi-doc.ts @@ -8,6 +8,7 @@ // an in-place-mutated tree, and a REJECTED edit leaves the old handle valid. 
// // node test/multi-doc.ts +import { objectify } from './emitted-obj.ts'; import { writeFileSync } from 'node:fs'; import { emitParser } from '../src/emit-parser.ts'; @@ -16,8 +17,8 @@ const emPath = '/tmp/emitted-multidoc.mjs'; writeFileSync(emPath, emitParser(grammar)); type Edit = { start: number; oldEnd: number; newEnd: number }; type Cst = { root: number }; -type Parser = { parse(s: string): Cst; edit(cst: Cst, s: string, edits?: Edit[]): void; toObject(cst: Cst): unknown; visit(cst: Cst, fns: object): void }; -type Em = { parse(s: string): number; toObject(id: number): unknown; createParser(): Parser }; +type Parser = { parse(s: string): Cst; edit(cst: Cst, s: string, edits?: Edit[]): void; visit(cst: Cst, fns: object): void; tree: import('./emitted-obj.ts').TreeView }; +type Em = { parse(s: string): number; createParser(): Parser }; const em = (await import(emPath + '?v=' + process.pid)) as Em; // Two synthetic documents (no corpus dependency — the gate always exercises). @@ -65,8 +66,9 @@ for (let k = 0; k < 60; k++) { } // mix the module-level default doc in between: it must not disturb either instance if (k % 5 === 0) em.parse('const mix = ' + k + ';'); - const a = JSON.stringify(f.toObject(fc!)); - const b = JSON.stringify((onA ? p1 : p2).toObject(onA ? cstA : cstB)); + const a = JSON.stringify(objectify(f.tree, (fns) => f.visit(fc!, fns))); + const q = onA ? p1 : p2; + const b = JSON.stringify(objectify(q.tree, (fns) => q.visit(onA ? 
cstA : cstB, fns))); if (a === b) equal++; else { mismatch++; @@ -84,22 +86,23 @@ let contract = 0; { const p = em.createParser(); const c1 = p.parse('const a = 1;'); - const before = JSON.stringify(p.toObject(c1)); + const obj = (h: Cst) => JSON.stringify(objectify(p.tree, (fns) => p.visit(h, fns))); + const before = obj(c1); p.edit(c1, 'const ab = 1;'); - const after = JSON.stringify(p.toObject(c1)); + const after = obj(c1); if (after !== before && after.includes('"end":8')) contract++; // same handle, new tree else failures.push('in-place edit did not update the handle'); try { p2.edit(c1, 'const y = 3;'); failures.push('foreign handle did not throw'); } catch { contract++; } let rejected = false; try { p.edit(c1, 'const ] = ;'); } catch { rejected = true; } - if (rejected && JSON.stringify(p.toObject(c1)) === after) contract++; // reject keeps the tree + if (rejected && obj(c1) === after) contract++; // reject keeps the tree else failures.push('reject-then-read flow broke'); const c2 = p.parse('let q = 1;'); - try { p.toObject(c1); failures.push('re-opened document: old handle did not throw'); } catch { contract++; } + try { obj(c1); failures.push('re-opened document: old handle did not throw'); } catch { contract++; } // a REJECTING parse() resets the arena too — it must invalidate prior handles try { p.parse('const ] = ;'); } catch { /* expected reject */ } let dead = false; - try { p.toObject(c2); } catch { dead = true; } + try { obj(c2); } catch { dead = true; } if (dead) contract++; else failures.push('rejecting parse() left the old handle readable over a reset arena'); } From e6c4e6cab8f6a06de36d0086c7e4ea5960bcba87 Mon Sep 17 00:00:00 2001 From: Johnson Chu Date: Thu, 11 Jun 2026 07:21:56 +0800 Subject: [PATCH 14/15] edit() has ONE usage: the edit ranges are required The char-diff envelope was the protocol's predecessor left in as a convenience default - but it silently spends O(file) prefix/suffix scans, defeating the O(damage) contract exactly for the 
callers who reached for the incremental API. Callers that track edits (editors) all have the ranges; a caller without them passes the whole-file range and gets an honest full re-parse instead of hidden scans. The ranges MUST cover every change: over-claiming shrinks via the true token-prefix compare; under-claiming is the caller's bug (the same garbage-in contract as tree-sitter's tree.edit, now documented at the envelope). edit() without ranges throws (gated, contract 6/6); the seeded sessions and glue cases all pass explicit ranges (the gate keeps a small test-side diff helper for its constructed pairs). 31/31 gates, parity 0 mismatches, keystroke median 0.028ms. --- src/emit-parser.ts | 36 ++++++++++++++++------------------- test/incremental-verify.ts | 16 +++++++++++++--- test/multi-doc.ts | 39 +++++++++++++++++++++++++++----------- 3 files changed, 57 insertions(+), 34 deletions(-) diff --git a/src/emit-parser.ts b/src/emit-parser.ts index c39a622..559c1dc 100644 --- a/src/emit-parser.ts +++ b/src/emit-parser.ts @@ -2948,6 +2948,9 @@ function editCore(source, entryRule, edits) { } } function editCoreRun(source, entryRule, edits) { + if (edits === undefined || edits.length === 0) { + throw new Error('edit() requires the edit ranges: [{ start, oldEnd, newEnd }] in old/new character coordinates (covering every change); pass [{ start: 0, oldEnd: , newEnd: }] to force a full re-parse'); + } if (lastSrc === null) { // No coherent edit base (a previous attempt rejected): full re-parse in APPEND // mode — parseCore would reset the arena and destroy the live tree the handle @@ -2972,27 +2975,20 @@ function editCoreRun(source, entryRule, edits) { const oSrc = lastSrc; lastSrc = null; ${e.soa ? 
String.raw` // ── M1: WINDOWED re-lex ── - // Damage envelope: from the EDIT PROTOCOL when the caller provides it (an editor - // knows its edit ranges — [{start, oldEnd, newEnd}] in old/new coordinates), else - // derived by the char-level prefix/suffix compare (the cheapest possible fallback, - // but O(file) scans). + // Damage envelope: the caller's edit ranges, merged ([{start, oldEnd, newEnd}] in + // old/new coordinates — an editor's change events). The ranges MUST cover every + // change: over-claiming only shrinks via the true token-prefix compare below; + // under-claiming means text outside the window is never re-lexed (the same + // garbage-in contract as tree-sitter's tree.edit). There is deliberately no + // char-diff fallback — it would silently spend O(file) scans, and a caller + // without ranges can pass the whole-file range for an honest full re-parse. const oldLen = oSrc.length, newLen = source.length; - let cs, ceOld, ceNew; - if (edits !== undefined && edits.length > 0) { - cs = edits[0].start; ceOld = edits[0].oldEnd; ceNew = edits[0].newEnd; - for (let i = 1; i < edits.length; i++) { - const ed = edits[i]; - if (ed.start < cs) cs = ed.start; - if (ed.oldEnd > ceOld) ceOld = ed.oldEnd; - if (ed.newEnd > ceNew) ceNew = ed.newEnd; - } - } else { - const minL = oldLen < newLen ? 
oldLen : newLen; - cs = 0; - while (cs < minL && oSrc.charCodeAt(cs) === source.charCodeAt(cs)) cs++; - let ce = 0; - while (ce < minL - cs && oSrc.charCodeAt(oldLen - 1 - ce) === source.charCodeAt(newLen - 1 - ce)) ce++; - ceOld = oldLen - ce; ceNew = newLen - ce; + let cs = edits[0].start, ceOld = edits[0].oldEnd, ceNew = edits[0].newEnd; + for (let i = 1; i < edits.length; i++) { + const ed = edits[i]; + if (ed.start < cs) cs = ed.start; + if (ed.oldEnd > ceOld) ceOld = ed.oldEnd; + if (ed.newEnd > ceNew) ceNew = ed.newEnd; } const charDelta = newLen - oldLen; // Restart anchor: the last token B ending at/before the damage whose recorded diff --git a/test/incremental-verify.ts b/test/incremental-verify.ts index 3b1f73a..e7f6826 100644 --- a/test/incremental-verify.ts +++ b/test/incremental-verify.ts @@ -85,6 +85,17 @@ const STEPS = 30; // ending exactly at the damage start can be EXTENDED under maximal munch — 'b'+'x' // = 'bx', '='+'=' = '==', deleting a gap glues neighbours). These cases pin the // strict-< restart anchor; every one must match fresh (tree or reject) exactly. +// Test-side range derivation for constructed pairs (the ENGINE requires explicit +// ranges — a caller without them passes the whole-file range for a full re-parse). 
+function diffRange(a: string, b: string): Edit { + const minL = Math.min(a.length, b.length); + let s = 0; + while (s < minL && a.charCodeAt(s) === b.charCodeAt(s)) s++; + let e = 0; + while (e < minL - s && a.charCodeAt(a.length - 1 - e) === b.charCodeAt(b.length - 1 - e)) e++; + return { start: s, oldEnd: a.length - e, newEnd: b.length - e }; +} + const GLUE: Array<[string, string]> = [ ['const a = 1;\nconst b = 2;\n', 'const a = 1;\nconst bx = 2;\n'], ['let a = b; let c = 1;\n', 'let a = b1; let c = 1;\n'], @@ -106,7 +117,7 @@ for (const [base, edited] of GLUE) { let fe: string | null = null, ie: string | null = null; let fr = -1; try { fr = fresh.parse(edited); } catch (e) { fe = (e as Error).message; } - try { session.edit(c0, edited); } catch (e) { ie = (e as Error).message; } + try { session.edit(c0, edited, [diffRange(base, edited)]); } catch (e) { ie = (e as Error).message; } if (fe !== null || ie !== null) { if ((fe === null) !== (ie === null)) { mismatch++; if (failures.length < 5) failures.push(`glue «${edited.slice(0, 30)}»: fresh ${fe ? 'reject' : 'accept'} / incremental ${ie ? 'reject' : 'accept'}`); } else bothReject++; @@ -129,9 +140,8 @@ for (const f of FILES) { try { freshRoot = fresh.parse(next); } catch (e) { freshErr = (e as Error).message; } const tf1 = performance.now(); let incErr: string | null = null; - const useProtocol = k % 2 === 1; // alternate: edits protocol / char-diff fallback const ti0 = performance.now(); - try { session.edit(cst, next, useProtocol ? 
[edit] : undefined); } catch (e) { incErr = (e as Error).message; } + try { session.edit(cst, next, [edit]); } catch (e) { incErr = (e as Error).message; } const ti1 = performance.now(); if (freshErr !== null || incErr !== null) { if ((freshErr === null) !== (incErr === null)) { diff --git a/test/multi-doc.ts b/test/multi-doc.ts index 1b18f4e..dbe5f6e 100644 --- a/test/multi-doc.ts +++ b/test/multi-doc.ts @@ -34,11 +34,22 @@ let seed = 0x51C0FFEE; const rand = () => ((seed = (seed * 48271) % 0x7fffffff) / 0x7fffffff); const randInt = (n: number) => Math.floor(rand() * n); const INS = ['x', '1', ' + q', '.m', '(/*c*/)', '"s"']; -function mutate(text: string): string { +function mutate(text: string): { next: string; edit: Edit } { switch (randInt(3)) { - case 0: { const at = randInt(text.length); return text.slice(0, at) + INS[randInt(INS.length)] + text.slice(at); } - case 1: { const at = randInt(Math.max(1, text.length - 6)); return text.slice(0, at) + text.slice(at + 1 + randInt(4)); } - default: { const at = randInt(Math.max(1, text.length - 1)); return text.slice(0, at) + 'z' + text.slice(at + 1); } + case 0: { + const at = randInt(text.length); + const ins = INS[randInt(INS.length)]; + return { next: text.slice(0, at) + ins + text.slice(at), edit: { start: at, oldEnd: at, newEnd: at + ins.length } }; + } + case 1: { + const at = randInt(Math.max(1, text.length - 6)); + const n = 1 + randInt(4); + return { next: text.slice(0, at) + text.slice(at + n), edit: { start: at, oldEnd: at + n, newEnd: at } }; + } + default: { + const at = randInt(Math.max(1, text.length - 1)); + return { next: text.slice(0, at) + 'z' + text.slice(at + 1), edit: { start: at, oldEnd: at + 1, newEnd: at + 1 } }; + } } } @@ -53,12 +64,12 @@ const failures: string[] = []; for (let k = 0; k < 60; k++) { const onA = (k & 1) === 0; const text = onA ? 
textA : textB; - const next = mutate(text); + const { next, edit } = mutate(text); steps++; let fe: string | null = null, ie: string | null = null; let fc: Cst | null = null; try { fc = f.parse(next); } catch (e) { fe = (e as Error).message; } - try { (onA ? p1 : p2).edit(onA ? cstA : cstB, next); } catch (e) { ie = (e as Error).message; } + try { (onA ? p1 : p2).edit(onA ? cstA : cstB, next, [edit]); } catch (e) { ie = (e as Error).message; } if (fe !== null || ie !== null) { if ((fe === null) !== (ie === null)) { mismatch++; if (failures.length < 5) failures.push(`step ${k} (${onA ? 'A' : 'B'}): fresh ${fe ? 'reject' : 'accept'} / edit ${ie ? 'reject' : 'accept'}`); } else bothReject++; @@ -88,17 +99,23 @@ let contract = 0; const c1 = p.parse('const a = 1;'); const obj = (h: Cst) => JSON.stringify(objectify(p.tree, (fns) => p.visit(h, fns))); const before = obj(c1); - p.edit(c1, 'const ab = 1;'); + p.edit(c1, 'const ab = 1;', [{ start: 7, oldEnd: 7, newEnd: 8 }]); const after = obj(c1); if (after !== before && after.includes('"end":8')) contract++; // same handle, new tree else failures.push('in-place edit did not update the handle'); - try { p2.edit(c1, 'const y = 3;'); failures.push('foreign handle did not throw'); } catch { contract++; } + try { p2.edit(c1, 'const y = 3;', [{ start: 0, oldEnd: 13, newEnd: 12 }]); failures.push('foreign handle did not throw'); } catch { contract++; } let rejected = false; - try { p.edit(c1, 'const ] = ;'); } catch { rejected = true; } + try { p.edit(c1, 'const ] = ;', [{ start: 6, oldEnd: 13, newEnd: 11 }]); } catch { rejected = true; } if (rejected && obj(c1) === after) contract++; // reject keeps the tree else failures.push('reject-then-read flow broke'); const c2 = p.parse('let q = 1;'); try { obj(c1); failures.push('re-opened document: old handle did not throw'); } catch { contract++; } + // missing ranges: ONE usage only — edit() without ranges must throw, not + // silently fall back to O(file) diff scans + let needsRanges 
= false; + try { (p as unknown as { edit(c: Cst, s: string): void }).edit(c2, 'let q = 2;'); } catch { needsRanges = true; } + if (needsRanges) contract++; + else failures.push('edit() without ranges did not throw'); // a REJECTING parse() resets the arena too — it must invalidate prior handles try { p.parse('const ] = ;'); } catch { /* expected reject */ } let dead = false; @@ -107,9 +124,9 @@ let contract = 0; else failures.push('rejecting parse() left the old handle readable over a reset arena'); } -console.log(`multi-doc: ${equal} equal · ${bothReject} both-reject · ${mismatch} MISMATCH (${steps} interleaved steps) · contract ${contract}/5`); +console.log(`multi-doc: ${equal} equal · ${bothReject} both-reject · ${mismatch} MISMATCH (${steps} interleaved steps) · contract ${contract}/6`); for (const s of failures) console.log(' ✗ ' + s); -if (mismatch > 0 || contract !== 5 || failures.length > 0) { +if (mismatch > 0 || contract !== 6 || failures.length > 0) { console.error('✗ document isolation / handle contract violated'); process.exit(1); } From 2f8e87a4785170ededb93d164e08d817545d4847 Mon Sep 17 00:00:00 2001 From: Johnson Chu Date: Thu, 11 Jun 2026 07:47:23 +0800 Subject: [PATCH 15/15] edit() takes changes, the engine owns the text as pieces: end-to-end O(damage) Why next had to go: it was the only carrier of the inserted content (the ranges carry positions, not text), which meant the API could express 'the text and the ranges disagree' - the garbage-in hazard. The change protocol [{ start, end, text }] is LSP/VS Code's native shape (each edit in the coordinates of the document after the preceding ones) and makes the inconsistency unrepresentable: the engine BUILDS the new text from the changes. Building it as a string exposed a cost that was always there, hidden in the caller: slicing the previous edit's cons string flattens it in V8 - measured 1.18ms per keystroke on 9MB, paid by whoever materializes the text. 
The engine now owns the document as PIECES (flat fragments; applying a change splits via O(1) SlicedString views and never flattens), with: - window-materialized relexing: lexCore reads a small flat slice with an absolute srcBase bias (biased once per token in tkPush - batch cost is two adds per token); running off the window end - including a matcher failing at the EDGE (a truncated string literal is not a lex error) - signals a retry with a larger window via LEX_RETRY. A cut token cannot fake a resync: suffix-zone equality makes its end mismatch the old token's. - doc reads route through docChar/docText (flat fast path, cursor- cached piece lookup otherwise); cold paths (errors, debug) flatten lazily; pieces consolidate past 256 fragments (amortized join). - the reject restore re-lexes the live tree's pieces but preserves the DOCUMENT pieces (the editor's buffer holds the rejected text; the gates' editor model now advances on reject and verifies an UNDO revert edit against a fresh parse every time). End-to-end keystroke (engine builds the text, nothing hidden): 9MB median 0.024ms / p90 0.047ms; 8MB nested 0.072ms / p90 0.094ms. 31/31 gates, parity 0 mismatches, lexer streams byte-identical, batch in band (11.5x). 
--- src/emit-lexer.ts | 27 ++-- src/emit-parser.ts | 248 ++++++++++++++++++++++++++++--------- test/incremental-verify.ts | 41 ++++-- test/multi-doc.ts | 62 +++++++--- 4 files changed, 287 insertions(+), 91 deletions(-) diff --git a/src/emit-lexer.ts b/src/emit-lexer.ts index 704111e..bf2ce1d 100644 --- a/src/emit-lexer.ts +++ b/src/emit-lexer.ts @@ -103,6 +103,11 @@ export function emitLexer(grammar: CstGrammar, st: LexerSymtab): string | null { emit(`// ── Emitted lexer (emit-lexer.ts): specialized tokenize for this grammar ──`); for (const m of matchers) emit(`const ${m.re} = new RegExp(${J(`(?:${m.pattern})`)}, ${J(m.flags)});`); emit(`const LX_WS = /\\s+/y;`); + emit(`// window-truncation retry: a matcher failing at the WINDOW edge is not a lex`); + emit(`// error — the caller re-materializes a larger window (truncation cannot fake a`); + emit(`// resync: suffix-zone equality makes a cut token's END mismatch the old one)`); + emit(`const LEX_RETRY = { retry: true };`); + emit(`let lexWindowMore = false;`); emit(`const LX_UNI_IDENT = /[$_\\p{ID_Start}][$\\u200c\\u200d\\p{ID_Continue}]*/uy;`); emit(`const LX_UNI_CONT = /[$\\u200c\\u200d\\p{ID_Continue}]+/uy;`); emit(`const LX_UNI_FULL = /^[$_\\p{ID_Start}][$\\u200c\\u200d\\p{ID_Continue}]*/u;`); @@ -177,7 +182,7 @@ export function emitLexer(grammar: CstGrammar, st: LexerSymtab): string | null { emit(` if (validateEscapes) {`); emit(` LX_TPL_ESC.lastIndex = pos;`); emit(` const m = LX_TPL_ESC.exec(source);`); - emit(` if (!m) throw new Error('Invalid escape sequence in template at offset ' + pos);`); + emit(` if (!m) { if (lexWindowMore) throw LEX_RETRY; throw new Error('Invalid escape sequence in template at offset ' + pos); }`); emit(` pos += m[0].length;`); emit(` } else { pos += 2; }`); } else { @@ -188,6 +193,7 @@ export function emitLexer(grammar: CstGrammar, st: LexerSymtab): string | null { emit(` if (${startsWithExpr('source', 'pos', tplOpen)}) return { endsWithInterp: false, end: pos + ${tplOpen.length} 
};`); emit(` pos++;`); emit(` }`); + emit(` if (lexWindowMore) throw LEX_RETRY;`); emit(` throw new Error('Unterminated template literal at offset ' + pos);`); emit(`}`); } @@ -197,7 +203,8 @@ export function emitLexer(grammar: CstGrammar, st: LexerSymtab): string | null { // when a CST leaf is built. Flag bits: 1 = newlineBefore (the only stamp this emitted // lexer ever sets; comment/multilineFlow stamps belong to fallback-only grammars). emit(`function tokenize(source) {`); - emit(` src = source;`); + emit(` docPieces = [source]; docPieceOff = [0]; docLen = source.length;`); + emit(` docFlat = source; docCur = 0;`); emit(` tokN = 0;`); emit(` parenCachePos = -1;`); emit(` srcLenP1 = source.length + 1;`); @@ -213,7 +220,9 @@ export function emitLexer(grammar: CstGrammar, st: LexerSymtab): string | null { emit(`// old token (same k/t, offsets shifted by wndDelta, both depth records 0) while`); emit(`// the window's own stacks are empty — returns that OLD index (the duplicate push`); emit(`// is retracted), or -1 when lexing ran to EOF.`); - emit(`function lexCore(source, startPos, pvK, pvT, wndPtr0, wndMinOff, wndDelta, wndCs, initParens) {`); + emit(`function lexCore(source, startPos, pvK, pvT, wndPtr0, wndMinOff, wndDelta, wndCs, initParens, srcBase, hasMore) {`); + emit(` if (srcBase === undefined) srcBase = 0;`); + emit(` lexWindowMore = hasMore === true;`); emit(` const n = source.length;`); emit(` let pos = startPos;`); emit(` let pendingNl = false;`); @@ -231,6 +240,7 @@ export function emitLexer(grammar: CstGrammar, st: LexerSymtab): string | null { emit(` let dmgDp = -1, dmgPd = -1;`); emit(` let lastDp = templateStack.length, lastPd = parenHeadStack.length;`); emit(` function tkPush(k, t, off, end) {`); + emit(` off += srcBase; end += srcBase;`); emit(` if (tokN === tkCap) growTok();`); emit(` tkK[tokN] = k; tkT[tokN] = t; tkOff[tokN] = off; tkEnd[tokN] = end;`); emit(` tkFl[tokN] = (pendingNl ? 
1 : 0) | extraFl;`); @@ -360,7 +370,7 @@ export function emitLexer(grammar: CstGrammar, st: LexerSymtab): string | null { emit(`${ind} if (m !== null) {`); if (m.identLike) { const plen = (identPrefixByName.get(m.name) ?? '').length; - emit(`${ind} if (!lexIdentValid(m[0], ${plen})) throw new Error("Invalid identifier escape at offset " + pos + ": '" + m[0] + "'");`); + emit(`${ind} if (!lexIdentValid(m[0], ${plen})) { if (lexWindowMore) throw LEX_RETRY; throw new Error("Invalid identifier escape at offset " + pos + ": '" + m[0] + "'"); }`); } if (m.skip) { emit(`${ind} if (m[0].includes('\\n')) pendingNl = true;`); @@ -470,13 +480,13 @@ export function emitLexer(grammar: CstGrammar, st: LexerSymtab): string | null { emit(` const _li = tokN - 1;`); const likeKs = [...identLike].map(kOf); const likeCond = likeKs.map(k => `tkK[_li] === ${k}`).join(' || '); - emit(` if ((${likeCond}) && tkEnd[_li] === pos) {`); + emit(` if ((${likeCond}) && tkEnd[_li] === pos + srcBase) {`); emit(` LX_UNI_CONT.lastIndex = pos;`); emit(` const cont = LX_UNI_CONT.exec(source);`); emit(` if (cont !== null) {`); emit(` pos += cont[0].length;`); - emit(` tkEnd[_li] = pos;`); - emit(` tkT[_li] = lexKwT(source, tkOff[_li], pos);`); + emit(` tkEnd[_li] = pos + srcBase;`); + emit(` tkT[_li] = lexKwT(source, tkOff[_li] - srcBase, pos);`); emit(` continue;`); emit(` }`); emit(` }`); @@ -504,10 +514,11 @@ export function emitLexer(grammar: CstGrammar, st: LexerSymtab): string | null { emit(` }`); emit(` }`); } + emit(` if (lexWindowMore) throw LEX_RETRY;`); emit(` throw new Error("Unexpected character at offset " + pos + ": '" + source[pos] + "'");`); emit(` }`); emit(` if (wndHit >= 0) { tokN--; return wndHit; }`); - emit(` return -1;`); + emit(` return hasMore ? 
-2 : -1;`); emit(`}`); emit(`// Windowed-relex restart anchor: the last token B ending at/before the damage`); emit(`// whose recorded stack depths are zero and whose shape leaves no cross-token`); diff --git a/src/emit-parser.ts b/src/emit-parser.ts index 559c1dc..4498f64 100644 --- a/src/emit-parser.ts +++ b/src/emit-parser.ts @@ -1387,7 +1387,85 @@ let tkDp = new Uint8Array(4096); let tkPd = new Uint16Array(4096); let tkCap = 4096; let tokN = 0; -let src = ''; +// ── The DOCUMENT text layer ── +// The text lives as PIECES (flat string fragments): applying a change splits the +// covering pieces (O(1) SlicedString views — never a flatten) and splices the new +// text in, so a keystroke costs O(pieces), not the O(n) cons-flatten a slice+concat +// per edit forces in V8 (measured: ~1.2ms per edit on 9MB). docFlat caches the +// joined form for the cold paths that need one (errors, debug views); batch parses +// set it directly. Reads route through docChar/docText: flat fast path, piece +// lookup (cursor-cached) otherwise. 
+let docPieces = null; +let docPieceOff = null; +let docLen = 0; +let docFlat = null; +let docCur = 0; +function docLocate(i) { + let k = docCur; + const po = docPieceOff; + const n = po.length; + if (k >= n || po[k] > i || (k + 1 < n && po[k + 1] <= i)) { + let lo = 0, hi = n; + while (lo < hi) { const m = (lo + hi) >> 1; if (po[m] <= i) lo = m + 1; else hi = m; } + k = lo - 1; + docCur = k; + } + return k; +} +function docChar(i) { + if (docFlat !== null) return docFlat.charCodeAt(i); + const k = docLocate(i); + return docPieces[k].charCodeAt(i - docPieceOff[k]); +} +function docText(a, b) { + if (docFlat !== null) return docFlat.slice(a, b); + if (b <= a) return ''; + let k = docLocate(a); + const first = docPieces[k]; + const lo = a - docPieceOff[k]; + if (b - docPieceOff[k] <= first.length) return first.slice(lo, b - docPieceOff[k]); + let out = first.slice(lo); + k++; + while (k < docPieces.length && docPieceOff[k] < b) { + const piece = docPieces[k]; + const need = b - docPieceOff[k]; + out += need >= piece.length ? piece : piece.slice(0, need); + k++; + } + return out; +} +function flattenDoc() { + if (docFlat === null) docFlat = docPieces.join(''); + return docFlat; +} +function applyChange(start, end, text) { + const ks = docLocate(start); + const ke = docLocate(end > start ? end - 1 : start); + const head = docPieces[ks].slice(0, start - docPieceOff[ks]); + const tailPiece = end > start ? docPieces[ke] : docPieces[ks]; + const tailOff = end - docPieceOff[end > start ? ke : ks]; + const tail = tailPiece.slice(tailOff); + const repl = []; + if (head.length > 0) repl.push(head); + if (text.length > 0) repl.push(text); + if (tail.length > 0) repl.push(tail); + docPieces.splice(ks, (end > start ? 
ke : ks) - ks + 1, ...repl); + // consolidate when fragmenting (amortized: a join every ≥256 edits) + if (docPieces.length > 256) { + docPieces = [docPieces.join('')]; + } + docLen += text.length - (end - start); + // rebuild offsets from the splice point (suffix offsets shifted anyway) + if (docPieceOff.length !== docPieces.length) docPieceOff.length = docPieces.length; + let off = ks > 0 && ks - 1 < docPieces.length ? docPieceOff[ks - 1] + docPieces[ks - 1].length : 0; + for (let k2 = ks > 0 ? ks : 0; k2 < docPieces.length; k2++) { + docPieceOff[k2] = off; + off += docPieces[k2].length; + } + if (docPieces.length === 1) docPieceOff[0] = 0; + docCur = 0; + docFlat = null; +} // ── EOF-relative spans (incremental sessions) ── // A token's tkOff/tkEnd may be stored EOF-RELATIVE (value − (srcLen + 1), strictly // negative): the decode adds the CURRENT length back, so a pure suffix never needs @@ -1643,7 +1721,7 @@ function matchPuLitGT(pu) { // Split multi-'>' tokens: '>>', '>>>', '>>=', '>>>=' can yield a single '>': shift the // columns up one slot and write the '>' + rest pair in place (both born flag-less, // matching the old mkPunct pair). - if (tkK[pos] === K_PUNCT && tend(pos) - off > 1 && ${e.soa ? 'src.charCodeAt(off) === 62' : "tkText[pos].charCodeAt(0) === 62"}) { + if (tkK[pos] === K_PUNCT && tend(pos) - off > 1 && ${e.soa ? 'docChar(off) === 62' : "tkText[pos].charCodeAt(0) === 62"}) { const end0 = tend(pos); ${e.soa ? '' : 'const restText = tkText[pos].slice(1);'} if (tokN === tkCap) growTok(); @@ -1667,7 +1745,7 @@ function matchPuLitGT(pu) { tkT[pos] = pu; tkEnd[pos] = off + 1 - srcLenP1; tkFl[pos] = 0; tkOff[pos + 1] = off + 1 - srcLenP1; tkFl[pos + 1] = 0; } - tkT[pos + 1] = ${e.soa ? 'LIT_PU.get(src.slice(off + 1, end0)) ?? 0' : 'LIT_PU.get(restText) ?? 0'}; + tkT[pos + 1] = ${e.soa ? 'LIT_PU.get(docText(off + 1, end0)) ?? 0' : 'LIT_PU.get(restText) ?? 
0'}; tokN++; if (parseLimit < 0) cap = tokN; // Token indices shifted: drop the per-rule memo arrays (recreated lazily at the new size). @@ -1992,7 +2070,7 @@ function emitPrattRule(e: Emitter, a: ReturnType, rule: RuleDecl e.emit(` const _h = kids[rowStart[lhs]];`); e.emit(` if (_h < 0 && ((~_h) & 3) === 2) {`); e.emit(` const _ht = absTok[lhs] + ((~_h) >>> 2);`); - e.emit(` const _htext = ${e.soa ? 'src.slice(toff(_ht), tend(_ht))' : 'tkText[_ht]'};`); + e.emit(` const _htext = ${e.soa ? 'docText(toff(_ht), tend(_ht))' : 'tkText[_ht]'};`); e.emit(` if (prefixOps.has(_htext) && !postfixOpValues.has(_htext)) { return -1; }`); e.emit(` }`); e.emit(` }`); @@ -2227,7 +2305,7 @@ function parseRuleEntry(idx, rid, name, core) { // Token text at an arbitrary index (cold paths: errors, the tokenAt debug view). function tokTextAt(i) { - return ${e.soa ? 'src.slice(toff(i), tend(i))' : 'tkText[i]'}; + return ${e.soa ? 'docText(toff(i), tend(i))' : 'tkText[i]'}; } // The k → type-name inverse, for reconstructing a token object (tokenAt). const K_NAMES = []; @@ -2322,7 +2400,7 @@ function visitCore(entry, fns, charBase, tokBase) { // Parse to the ARENA: returns the root node id. function lexInto(source) { -${e.soa ? ` tokenize(source);` : String.raw` src = source; +${e.soa ? ` tokenize(source);` : String.raw` docPieces = [source]; docPieceOff = [0]; docLen = source.length; docFlat = source; docCur = 0; const _toks = tokenize(source); const _n = _toks.length; while (tkCap < _n + 1) growTok(); @@ -2375,11 +2453,14 @@ function runParse(entryRule) { // Source of the last COMPLETED parse — the token columns, arena and memo describe it. // null whenever the module state is not a coherent snapshot (no parse yet, or the last // attempt threw), so parseEdited falls back to a full parse. 
-let lastSrc = null; -// Source text of the LIVE tree (unlike lastSrc it survives a rejected edit): the -// reject path restores the token columns to it so the handle keeps reading the -// previous tree; only a successful parse/edit moves it. -let treeSrc = null; +// Coherent-edit-base flag: false after a rejected attempt (the next edit falls +// back to a full re-parse of the document text). +let lastOk = false; +// Pieces snapshot of the LIVE tree's text (survives a rejected edit): the reject +// path re-lexes it so the handle keeps reading the previous tree. The document +// pieces above advance on EVERY edit, accepted or rejected — the editor's buffer +// applied the change regardless, and later coordinates are against it. +let treePieces = null; // the LAST parse root's absolute coordinates (the descent origin — see visit/toObject) let rootCharBase = 0; let rootTokBase = 0; @@ -2519,6 +2600,8 @@ function runExtend(rid) { // uses, made transitive by rowKC: each kid's probe watermark stays at/below the // next kid's start, so checking the LAST kept kid bounds them all. 
let surgX = [], surgBase = [], surgA = [], surgB = []; +// composed change envelope handed from the text-application step to the window relex +let editDmgS = 0, editDmgE = 0; function rowKCof(id) { const c = rowKC[id]; if (c !== 0) return c; @@ -2827,7 +2910,7 @@ function makeDoc() { tkK: new tkK.constructor(4096), tkT: new tkT.constructor(4096), tkOff: new Int32Array(4096), tkEnd: new Int32Array(4096), tkFl: new Uint8Array(4096), tkDp: new Uint8Array(4096), tkPd: new Uint16Array(4096), - tkCap: 4096, tokN: 0, src: '', srcLenP1: 1, negFrom: 0x7fffffff, + tkCap: 4096, tokN: 0, srcLenP1: 1, negFrom: 0x7fffffff, rowRule: new Uint16Array(8192), rowLen: new Int32Array(8192), rowTokLen: new Int32Array(8192), rowStart: new Int32Array(8192), rowCount: new Int32Array(8192), rowExt: new Int32Array(8192), rowOK: new Uint8Array(8192), rowKC: new Uint8Array(8192), @@ -2837,7 +2920,9 @@ function makeDoc() { kids: new Int32Array(16384), kidRel: new Int32Array(16384), kidTokRel: new Int32Array(16384), kidCap: 16384, kidN: 0, memoNode: [], memoEnd: [], memoExt: [], memoGen: [], memoGenCur: 0, - lastSrc: null, treeSrc: null, rootCharBase: 0, rootTokBase: 0, lastRoot: -1, lastRootTok: 0, + lastOk: false, treePieces: null, + docPieces: null, docPieceOff: null, docLen: 0, docFlat: null, docCur: 0, + rootCharBase: 0, rootTokBase: 0, lastRoot: -1, lastRootTok: 0, ${e.soa ? ' parenCachePos: -1, parenCacheStack: [],' : ''} altK: null, altT: null, altOff: null, altEnd: null, altFl: null, altDp: null, altPd: null, altCap: 0, altN: 0, @@ -2845,7 +2930,7 @@ ${e.soa ? 
' parenCachePos: -1, parenCacheStack: [],' : ''} } function saveDoc(d) { d.tkK = tkK; d.tkT = tkT; d.tkOff = tkOff; d.tkEnd = tkEnd; d.tkFl = tkFl; - d.tkDp = tkDp; d.tkPd = tkPd; d.tkCap = tkCap; d.tokN = tokN; d.src = src; + d.tkDp = tkDp; d.tkPd = tkPd; d.tkCap = tkCap; d.tokN = tokN; d.srcLenP1 = srcLenP1; d.negFrom = negFrom; d.rowRule = rowRule; d.rowLen = rowLen; d.rowTokLen = rowTokLen; d.rowStart = rowStart; d.rowCount = rowCount; d.rowExt = rowExt; d.rowOK = rowOK; d.rowKC = rowKC; d.rowNF = rowNF; @@ -2853,7 +2938,9 @@ function saveDoc(d) { d.kids = kids; d.kidRel = kidRel; d.kidTokRel = kidTokRel; d.kidCap = kidCap; d.kidN = kidN; d.memoNode = memoNode; d.memoEnd = memoEnd; d.memoExt = memoExt; d.memoGen = memoGen; d.memoGenCur = memoGenCur; - d.lastSrc = lastSrc; d.treeSrc = treeSrc; d.rootCharBase = rootCharBase; d.rootTokBase = rootTokBase; + d.lastOk = lastOk; d.treePieces = treePieces; + d.docPieces = docPieces; d.docPieceOff = docPieceOff; d.docLen = docLen; d.docFlat = docFlat; d.docCur = docCur; + d.rootCharBase = rootCharBase; d.rootTokBase = rootTokBase; d.lastRoot = lastRoot; d.lastRootTok = lastRootTok; ${e.soa ? ' d.parenCachePos = parenCachePos; d.parenCacheStack = parenCacheStack;' : ''} d.altK = altK; d.altT = altT; d.altOff = altOff; d.altEnd = altEnd; d.altFl = altFl; @@ -2861,7 +2948,7 @@ ${e.soa ? 
' d.parenCachePos = parenCachePos; d.parenCacheStack = parenCacheStac } function loadDoc(d) { tkK = d.tkK; tkT = d.tkT; tkOff = d.tkOff; tkEnd = d.tkEnd; tkFl = d.tkFl; - tkDp = d.tkDp; tkPd = d.tkPd; tkCap = d.tkCap; tokN = d.tokN; src = d.src; + tkDp = d.tkDp; tkPd = d.tkPd; tkCap = d.tkCap; tokN = d.tokN; srcLenP1 = d.srcLenP1; negFrom = d.negFrom; rowRule = d.rowRule; rowLen = d.rowLen; rowTokLen = d.rowTokLen; rowStart = d.rowStart; rowCount = d.rowCount; rowExt = d.rowExt; rowOK = d.rowOK; rowKC = d.rowKC; rowNF = d.rowNF; @@ -2869,7 +2956,9 @@ function loadDoc(d) { kids = d.kids; kidRel = d.kidRel; kidTokRel = d.kidTokRel; kidCap = d.kidCap; kidN = d.kidN; memoNode = d.memoNode; memoEnd = d.memoEnd; memoExt = d.memoExt; memoGen = d.memoGen; memoGenCur = d.memoGenCur; - lastSrc = d.lastSrc; treeSrc = d.treeSrc; rootCharBase = d.rootCharBase; rootTokBase = d.rootTokBase; + lastOk = d.lastOk; treePieces = d.treePieces; + docPieces = d.docPieces; docPieceOff = d.docPieceOff; docLen = d.docLen; docFlat = d.docFlat; docCur = d.docCur; + rootCharBase = d.rootCharBase; rootTokBase = d.rootTokBase; lastRoot = d.lastRoot; lastRootTok = d.lastRootTok; ${e.soa ? ' parenCachePos = d.parenCachePos; parenCacheStack = d.parenCacheStack;' : ''} altK = d.altK; altT = d.altT; altOff = d.altOff; altEnd = d.altEnd; altFl = d.altFl; @@ -2898,7 +2987,7 @@ function swapBuffers() { ${e.soa ? '' : 'let altText = [];'} function parseCore(source, entryRule) { - lastSrc = null; + lastOk = false; adoptRoot = -1; adoptRunPos = -1; lexInto(source); @@ -2914,8 +3003,8 @@ function parseCore(source, entryRule) { const root = runParse(entryRule); lastRoot = root; lastRootTok = rootTokBase; - lastSrc = source; - treeSrc = source; + lastOk = true; + treePieces = docPieces.slice(); return root; } @@ -2934,28 +3023,65 @@ function parseCore(source, entryRule) { // until then. 
Lexing is FULL-FILE by design: the lexer carries cross-token state // (template nesting, regex context, markup modes), full lexing is a small share of a // parse, and the diff is what localizes the damage — not the lexer. -function editCore(source, entryRule, edits) { +function editCore(entryRule, edits) { try { - return editCoreRun(source, entryRule, edits); + return editCoreRun(entryRule, edits); } catch (e) { // REJECTED edit: the splice (and any '>' splits of the failed attempt) already // rewrote the token columns to the rejected text, and the append-mode fallback // may have grown the arena — but the live tree's ROWS are untouched. Re-lexing // the live tree's source restores every read path (leaf spans, visit, next // edit's restart anchors); O(n) on the reject path only. - if (treeSrc !== null) { lexInto(treeSrc); lastSrc = null; } + if (treePieces !== null) { + // restore the token columns to the LIVE TREE's text — but the DOCUMENT text + // must stay on the rejected content (lexInto/tokenize resets the doc layer + // as a side effect, so save it around the re-lex) + const kP = docPieces, kO = docPieceOff, kL = docLen, kF = docFlat; + lexInto(treePieces.join('')); + docPieces = kP; docPieceOff = kO; docLen = kL; docFlat = kF; docCur = 0; + lastOk = false; + } throw e; } } -function editCoreRun(source, entryRule, edits) { +function editCoreRun(entryRule, edits) { if (edits === undefined || edits.length === 0) { - throw new Error('edit() requires the edit ranges: [{ start, oldEnd, newEnd }] in old/new character coordinates (covering every change); pass [{ start: 0, oldEnd: , newEnd: }] to force a full re-parse'); + throw new Error('edit() requires the changes: [{ start, end, text }] (LSP-style - each edit in the coordinates of the document AFTER the preceding edits in the array)'); + } + // The engine owns the document text: the new source is BUILT from the changes, + // so "the ranges do not match the text" is unrepresentable. 
Each edit is applied + // sequentially (LSP incremental-sync semantics); the damage envelope is composed + // alongside: dS in prefix coordinates (identical old/new), dE in FINAL + // coordinates, the old end recovered through the total delta. The text is + // spliced as pieces (applyChange), never slice+concat; the flat-string cost is + // paid only where a read path needs one (flattenDoc). + if (docPieces === null) throw new Error('edit() before parse(): no document'); + const oldLen = docLen; + { + let dS = 0x7fffffff; + let dE = -1; + for (let i = 0; i < edits.length; i++) { + const ed = edits[i]; + const start = ed.start, end = ed.end, text = ed.text; + if (!(start >= 0 && start <= end && end <= docLen) || typeof text !== 'string') { + throw new Error('edit() change #' + i + ' out of range: [' + start + ', ' + end + ') of ' + docLen); + } + applyChange(start, end, text); + const newEnd = start + text.length; + const delta = newEnd - end; + if (dE > start) dE = dE >= end ? dE + delta : newEnd; + if (newEnd > dE) dE = newEnd; + if (start < dS) dS = start; + } + editDmgS = dS; + editDmgE = dE; } - if (lastSrc === null) { + if (!lastOk) { // No coherent edit base (a previous attempt rejected): full re-parse in APPEND // mode — parseCore would reset the arena and destroy the live tree the handle // still exposes if THIS parse rejects too. parse() is the only compaction point. - lexInto(source); + const whole = flattenDoc(); + lexInto(whole); if (memoEnd.length !== MEMO_RULES) { memoNode = new Array(MEMO_RULES); memoEnd = new Array(MEMO_RULES); @@ -2968,28 +3094,18 @@ function editCoreRun(source, entryRule, edits) { const root = runParse(entryRule); lastRoot = root; lastRootTok = rootTokBase; - lastSrc = source; - treeSrc = source; + lastOk = true; + treePieces = docPieces.slice(); return root; } - const oSrc = lastSrc; - lastSrc = null; + lastOk = false; ${e.soa ? 
String.raw` // ── M1: WINDOWED re-lex ── - // Damage envelope: the caller's edit ranges, merged ([{start, oldEnd, newEnd}] in - // old/new coordinates — an editor's change events). The ranges MUST cover every - // change: over-claiming only shrinks via the true token-prefix compare below; - // under-claiming means text outside the window is never re-lexed (the same - // garbage-in contract as tree-sitter's tree.edit). There is deliberately no - // char-diff fallback — it would silently spend O(file) scans, and a caller - // without ranges can pass the whole-file range for an honest full re-parse. - const oldLen = oSrc.length, newLen = source.length; - let cs = edits[0].start, ceOld = edits[0].oldEnd, ceNew = edits[0].newEnd; - for (let i = 1; i < edits.length; i++) { - const ed = edits[i]; - if (ed.start < cs) cs = ed.start; - if (ed.oldEnd > ceOld) ceOld = ed.oldEnd; - if (ed.newEnd > ceNew) ceNew = ed.newEnd; - } + // Damage envelope from the composed changes: prefix coordinates are shared, the + // old end comes back through the total delta. + const newLen = docLen; + const cs = editDmgS < newLen ? editDmgS : newLen; + const ceNew = editDmgE < cs ? cs : editDmgE; + const ceOld = ceNew - (newLen - oldLen); const charDelta = newLen - oldLen; // Restart anchor: the last token B ending at/before the damage whose recorded // depths are zero and whose shape carries no cross-token lexer flag (')' control- @@ -3011,10 +3127,28 @@ ${e.soa ? String.raw` // ── M1: WINDOWED re-lex ── } altN = oN; swapBuffers(); // live = scratch, alt = OLD stream - src = source; tokN = 0; const startOff = B >= 0 ? (altEnd[B] < 0 ? altEnd[B] + srcLenP1 : altEnd[B]) : 0; - const R0 = lexCore(source, startOff, B >= 0 ? altK[B] : -1, B >= 0 ? 
altT[B] : 0, r0, ceNew, charDelta, cs, initParens); + // Window-materialized relex: lexCore reads a SMALL flat slice of the pieces with + // an absolute bias; -2 = ran off the window end before resyncing — re-materialize + // a larger window and retry (the common case fits the first one). + let R0; + { + let wHi = ceNew + 4096; + for (;;) { + if (wHi > docLen) wHi = docLen; + const windowStr = docText(startOff, wHi); + tokN = 0; + try { + R0 = lexCore(windowStr, 0, B >= 0 ? altK[B] : -1, B >= 0 ? altT[B] : 0, r0, ceNew, charDelta, cs, initParens.slice(), startOff, wHi < docLen); + } catch (e2) { + if (e2 !== LEX_RETRY) throw e2; + R0 = -2; + } + if (R0 !== -2) break; + wHi = wHi >= docLen ? docLen : (wHi - startOff) * 4 + startOff; + } + } const W = tokN; const R = R0 >= 0 ? R0 : oN; swapBuffers(); // live = OLD stream again; window sits in the alt buffers @@ -3076,9 +3210,9 @@ ${e.soa ? String.raw` // ── M1: WINDOWED re-lex ── tkText = altText; tkText.length = 0; altK = oK; altT = oT; altOff = oOff; altEnd = oEnd; altFl = oFl; altText = oText; - lexInto(source); + lexInto(flattenDoc()); const nN = tokN; - const charDelta = source.length - oSrc.length; + const charDelta = docLen - oldLen; const minN = oN < nN ? oN : nN; let p = 0; while (p < minN && oK[p] === tkK[p] && oT[p] === tkT[p] && oFl[p] === tkFl[p] @@ -3118,16 +3252,16 @@ ${e.soa ? 
String.raw` // ── M1: WINDOWED re-lex ── rootTokBase = adoptRootTok; lastRoot = sroot; lastRootTok = adoptRootTok; - lastSrc = source; - treeSrc = source; + lastOk = true; + treePieces = docPieces.slice(); return sroot; } const root = runParse(entryRule); adoptRoot = -1; lastRoot = root; lastRootTok = rootTokBase; - lastSrc = source; - treeSrc = source; + lastOk = true; + treePieces = docPieces.slice(); return root; } @@ -3135,7 +3269,7 @@ export { tokenize }; // ── Module-level API: the DEFAULT document (one shared session; tokenize and the // raw tree/tokenAt views read the ACTIVE doc — they are gate/debug surfaces) ── export function parse(source, entryRule) { activate(docDefault); return parseCore(source, entryRule); } -export function parseEdited(source, entryRule, edits) { activate(docDefault); return editCore(source, entryRule, edits); } +export function parseEdited(entryRule, edits) { activate(docDefault); return editCore(entryRule, edits); } export function visit(entry, fns, charBase, tokBase) { activate(docDefault); return visitCore(entry, fns, charBase, tokBase); } // ── Handle API: explicit trees over per-instance documents ── // const p = createParser(); const cst = p.parse(text); p.edit(cst, next[, edits]); @@ -3165,10 +3299,10 @@ export function createParser() { const root = parseCore(source, entryRule); return { d, gen, root }; }, - edit(cst, source, edits) { + edit(cst, edits) { chk(cst); activate(d); - cst.root = editCore(source, entryUsed, edits); + cst.root = editCore(entryUsed, edits); }, visit(cst, fns) { chk(cst); activate(d); return visitCore(cst.root, fns); }, tree: view, diff --git a/test/incremental-verify.ts b/test/incremental-verify.ts index e7f6826..0178d84 100644 --- a/test/incremental-verify.ts +++ b/test/incremental-verify.ts @@ -13,11 +13,11 @@ import { emitParser } from '../src/emit-parser.ts'; const grammar = (await import('../typescript.ts')).default; const emPath = '/tmp/emitted-incremental.mjs'; writeFileSync(emPath, 
emitParser(grammar)); -type Edit = { start: number; oldEnd: number; newEnd: number }; +type Edit = { start: number; end: number; text: string }; type Cst = { root: number }; type Parser = { parse(s: string): Cst; - edit(cst: Cst, s: string, edits?: Edit[]): void; + edit(cst: Cst, edits: Edit[]): void; visit(cst: Cst, fns: object): void; tree: import('./emitted-obj.ts').TreeView; }; @@ -46,16 +46,16 @@ function mutate(text: string): { next: string; edit: Edit } { case 0: { // insert a small fragment at a random position const at = randInt(text.length); const ins = INSERTS[randInt(INSERTS.length)]; - return { next: text.slice(0, at) + ins + text.slice(at), edit: { start: at, oldEnd: at, newEnd: at + ins.length } }; + return { next: text.slice(0, at) + ins + text.slice(at), edit: { start: at, end: at, text: ins } }; } case 1: { // delete a small span const at = randInt(Math.max(1, text.length - 8)); const n = 1 + randInt(6); - return { next: text.slice(0, at) + text.slice(at + n), edit: { start: at, oldEnd: at + n, newEnd: at } }; + return { next: text.slice(0, at) + text.slice(at + n), edit: { start: at, end: at + n, text: '' } }; } case 2: { // replace a character const at = randInt(Math.max(1, text.length - 1)); - return { next: text.slice(0, at) + 'z' + text.slice(at + 1), edit: { start: at, oldEnd: at + 1, newEnd: at + 1 } }; + return { next: text.slice(0, at) + 'z' + text.slice(at + 1), edit: { start: at, end: at + 1, text: 'z' } }; } case 3: { // insert a whole statement at a line boundary const lines = text.split('\n'); @@ -63,11 +63,11 @@ function mutate(text: string): { next: string; edit: Edit } { const stmt = STMTS[randInt(STMTS.length)].trimEnd(); lines.splice(at, 0, stmt); const start = at === 0 ? 
0 : lines.slice(0, at).join('\n').length + 1; - return { next: lines.join('\n'), edit: { start, oldEnd: start, newEnd: start + stmt.length + 1 } }; + return { next: lines.join('\n'), edit: { start, end: start, text: stmt + '\n' } }; } default: { // append at the end (the pure-prefix reuse case) const stmt = '\n' + STMTS[randInt(STMTS.length)]; - return { next: text + stmt, edit: { start: text.length, oldEnd: text.length, newEnd: text.length + stmt.length } }; + return { next: text + stmt, edit: { start: text.length, end: text.length, text: stmt } }; } } } @@ -87,13 +87,13 @@ const STEPS = 30; // strict-< restart anchor; every one must match fresh (tree or reject) exactly. // Test-side range derivation for constructed pairs (the ENGINE requires explicit // ranges — a caller without them passes the whole-file range for a full re-parse). -function diffRange(a: string, b: string): Edit { +function diffChange(a: string, b: string): Edit { const minL = Math.min(a.length, b.length); let s = 0; while (s < minL && a.charCodeAt(s) === b.charCodeAt(s)) s++; let e = 0; while (e < minL - s && a.charCodeAt(a.length - 1 - e) === b.charCodeAt(b.length - 1 - e)) e++; - return { start: s, oldEnd: a.length - e, newEnd: b.length - e }; + return { start: s, end: a.length - e, text: b.slice(s, b.length - e) }; } const GLUE: Array<[string, string]> = [ @@ -117,7 +117,7 @@ for (const [base, edited] of GLUE) { let fe: string | null = null, ie: string | null = null; let fr = -1; try { fr = fresh.parse(edited); } catch (e) { fe = (e as Error).message; } - try { session.edit(c0, edited, [diffRange(base, edited)]); } catch (e) { ie = (e as Error).message; } + try { session.edit(c0, [diffChange(base, edited)]); } catch (e) { ie = (e as Error).message; } if (fe !== null || ie !== null) { if ((fe === null) !== (ie === null)) { mismatch++; if (failures.length < 5) failures.push(`glue «${edited.slice(0, 30)}»: fresh ${fe ? 'reject' : 'accept'} / incremental ${ie ? 
'reject' : 'accept'}`); } else bothReject++; @@ -141,14 +141,31 @@ for (const f of FILES) { const tf1 = performance.now(); let incErr: string | null = null; const ti0 = performance.now(); - try { session.edit(cst, next, [edit]); } catch (e) { incErr = (e as Error).message; } + try { session.edit(cst, [edit]); } catch (e) { incErr = (e as Error).message; } const ti1 = performance.now(); if (freshErr !== null || incErr !== null) { if ((freshErr === null) !== (incErr === null)) { mismatch++; if (failures.length < 5) failures.push(`${f.split('/').pop()} step ${k}: fresh ${freshErr ? 'reject' : 'accept'} / incremental ${incErr ? 'reject' : 'accept'}\n fresh: ${freshErr ?? '-'}\n inc: ${incErr ?? '-'}`); } else bothReject++; - // rejected text: the handle stays on the previous tree; do not advance + // REJECTED text: the handle stays on the previous tree, but the DOCUMENT + // advances (editor-buffer model — the buffer applied the change regardless, + // and the engine's docPieces track it). Model the editor's UNDO: revert via a + // diff edit in the rejected text's coordinates; it must be accepted and + // byte-identical to a fresh parse of the restored text. 
+ try { + session.edit(cst, [diffChange(next, text)]); + const rfr = fresh.parse(text); + const ra = JSON.stringify(objectify(fresh.tree, (fns) => fresh.visit(rfr, fns))); + const rb = JSON.stringify(objectify(session.tree, (fns) => session.visit(cst, fns))); + if (ra !== rb) { + mismatch++; + if (failures.length < 5) failures.push(`${f.split('/').pop()} step ${k}: REVERT tree diverges`); + } + } catch (e2) { + mismatch++; + if (failures.length < 5) failures.push(`${f.split('/').pop()} step ${k}: revert rejected: ${(e2 as Error).message.slice(0, 50)}`); + } continue; } tFresh += tf1 - tf0; tInc += ti1 - ti0; diff --git a/test/multi-doc.ts b/test/multi-doc.ts index dbe5f6e..d980cbb 100644 --- a/test/multi-doc.ts +++ b/test/multi-doc.ts @@ -15,9 +15,9 @@ import { emitParser } from '../src/emit-parser.ts'; const grammar = (await import('../typescript.ts')).default; const emPath = '/tmp/emitted-multidoc.mjs'; writeFileSync(emPath, emitParser(grammar)); -type Edit = { start: number; oldEnd: number; newEnd: number }; +type Edit = { start: number; end: number; text: string }; type Cst = { root: number }; -type Parser = { parse(s: string): Cst; edit(cst: Cst, s: string, edits?: Edit[]): void; visit(cst: Cst, fns: object): void; tree: import('./emitted-obj.ts').TreeView }; +type Parser = { parse(s: string): Cst; edit(cst: Cst, edits: Edit[]): void; visit(cst: Cst, fns: object): void; tree: import('./emitted-obj.ts').TreeView }; type Em = { parse(s: string): number; createParser(): Parser }; const em = (await import(emPath + '?v=' + process.pid)) as Em; @@ -39,27 +39,36 @@ function mutate(text: string): { next: string; edit: Edit } { case 0: { const at = randInt(text.length); const ins = INS[randInt(INS.length)]; - return { next: text.slice(0, at) + ins + text.slice(at), edit: { start: at, oldEnd: at, newEnd: at + ins.length } }; + return { next: text.slice(0, at) + ins + text.slice(at), edit: { start: at, end: at, text: ins } }; } case 1: { const at = randInt(Math.max(1, 
text.length - 6)); const n = 1 + randInt(4); - return { next: text.slice(0, at) + text.slice(at + n), edit: { start: at, oldEnd: at + n, newEnd: at } }; + return { next: text.slice(0, at) + text.slice(at + n), edit: { start: at, end: at + n, text: '' } }; } default: { const at = randInt(Math.max(1, text.length - 1)); - return { next: text.slice(0, at) + 'z' + text.slice(at + 1), edit: { start: at, oldEnd: at + 1, newEnd: at + 1 } }; + return { next: text.slice(0, at) + 'z' + text.slice(at + 1), edit: { start: at, end: at + 1, text: 'z' } }; } } } +function diffChange(a: string, b: string): Edit { + const minL = Math.min(a.length, b.length); + let s = 0; + while (s < minL && a.charCodeAt(s) === b.charCodeAt(s)) s++; + let e = 0; + while (e < minL - s && a.charCodeAt(a.length - 1 - e) === b.charCodeAt(b.length - 1 - e)) e++; + return { start: s, end: a.length - e, text: b.slice(s, b.length - e) }; +} + const p1 = em.createParser(); const p2 = em.createParser(); const f = em.createParser(); let cstA = p1.parse(textA); let cstB = p2.parse(textB); -let steps = 0, equal = 0, bothReject = 0, mismatch = 0; +let steps = 0, equal = 0, bothReject = 0, mismatch = 0, reverts = 0; const failures: string[] = []; for (let k = 0; k < 60; k++) { const onA = (k & 1) === 0; @@ -69,10 +78,29 @@ for (let k = 0; k < 60; k++) { let fe: string | null = null, ie: string | null = null; let fc: Cst | null = null; try { fc = f.parse(next); } catch (e) { fe = (e as Error).message; } - try { (onA ? p1 : p2).edit(onA ? cstA : cstB, next, [edit]); } catch (e) { ie = (e as Error).message; } + try { (onA ? p1 : p2).edit(onA ? cstA : cstB, [edit]); } catch (e) { ie = (e as Error).message; } if (fe !== null || ie !== null) { if ((fe === null) !== (ie === null)) { mismatch++; if (failures.length < 5) failures.push(`step ${k} (${onA ? 'A' : 'B'}): fresh ${fe ? 'reject' : 'accept'} / edit ${ie ? 
'reject' : 'accept'}`); } else bothReject++; + // the DOCUMENT advances on reject (editor-buffer model): later coordinates + // are against the rejected text. Model the editor's UNDO: revert to the last + // good text via a diff edit in the rejected text's coordinates — it must be + // ACCEPTED and byte-identical to a fresh parse (the post-reject recovery path + // gets exercised every time a mutation breaks the document). + const good = onA ? textA : textB; + const rv = diffChange(next, good); + try { + (onA ? p1 : p2).edit(onA ? cstA : cstB, [rv]); + const fb = f.parse(good); + const ra = JSON.stringify(objectify(f.tree, (fns) => f.visit(fb, fns))); + const qq = onA ? p1 : p2; + const rb = JSON.stringify(objectify(qq.tree, (fns) => qq.visit(onA ? cstA : cstB, fns))); + if (ra === rb) reverts++; + else { mismatch++; if (failures.length < 5) failures.push(`step ${k} (${onA ? 'A' : 'B'}): REVERT tree diverges`); } + } catch (e2) { + mismatch++; + if (failures.length < 5) failures.push(`step ${k} (${onA ? 
'A' : 'B'}): revert rejected: ${(e2 as Error).message.slice(0, 50)}`); + } continue; } // mix the module-level default doc in between: it must not disturb either instance @@ -99,23 +127,29 @@ let contract = 0; const c1 = p.parse('const a = 1;'); const obj = (h: Cst) => JSON.stringify(objectify(p.tree, (fns) => p.visit(h, fns))); const before = obj(c1); - p.edit(c1, 'const ab = 1;', [{ start: 7, oldEnd: 7, newEnd: 8 }]); + p.edit(c1, [{ start: 7, end: 7, text: 'b' }]); // 'const a = 1;' -> 'const ab = 1;' const after = obj(c1); if (after !== before && after.includes('"end":8')) contract++; // same handle, new tree else failures.push('in-place edit did not update the handle'); - try { p2.edit(c1, 'const y = 3;', [{ start: 0, oldEnd: 13, newEnd: 12 }]); failures.push('foreign handle did not throw'); } catch { contract++; } + try { p2.edit(c1, [{ start: 0, end: 1, text: 'q' }]); failures.push('foreign handle did not throw'); } catch { contract++; } let rejected = false; - try { p.edit(c1, 'const ] = ;', [{ start: 6, oldEnd: 13, newEnd: 11 }]); } catch { rejected = true; } + try { p.edit(c1, [{ start: 6, end: 8, text: ']' }]); } catch { rejected = true; } // 'const ab…' -> 'const ] = 1;' if (rejected && obj(c1) === after) contract++; // reject keeps the tree else failures.push('reject-then-read flow broke'); + // coordinates after a REJECT are against the editor's buffer (the rejected text): + // fixing the same spot in those coordinates must recover the session + let recovered = false; + try { p.edit(c1, [{ start: 6, end: 7, text: 'ab' }]); recovered = true; } catch { /* must not throw */ } + if (recovered && obj(c1).includes('"end":13')) contract++; // 'const ] = 1;' -> 'const ab = 1;' + else failures.push('post-reject coordinates did not track the document text'); const c2 = p.parse('let q = 1;'); try { obj(c1); failures.push('re-opened document: old handle did not throw'); } catch { contract++; } // missing ranges: ONE usage only — edit() without ranges must throw, 
not // silently fall back to O(file) diff scans let needsRanges = false; - try { (p as unknown as { edit(c: Cst, s: string): void }).edit(c2, 'let q = 2;'); } catch { needsRanges = true; } + try { (p as unknown as { edit(c: Cst): void }).edit(c2); } catch { needsRanges = true; } if (needsRanges) contract++; - else failures.push('edit() without ranges did not throw'); + else failures.push('edit() without changes did not throw'); // a REJECTING parse() resets the arena too — it must invalidate prior handles try { p.parse('const ] = ;'); } catch { /* expected reject */ } let dead = false; @@ -124,9 +158,9 @@ let contract = 0; else failures.push('rejecting parse() left the old handle readable over a reset arena'); } -console.log(`multi-doc: ${equal} equal · ${bothReject} both-reject · ${mismatch} MISMATCH (${steps} interleaved steps) · contract ${contract}/6`); +console.log(`multi-doc: ${equal} equal · ${bothReject} both-reject (${reverts} reverts verified) · ${mismatch} MISMATCH (${steps} interleaved steps) · contract ${contract}/7`); for (const s of failures) console.log(' ✗ ' + s); -if (mismatch > 0 || contract !== 6 || failures.length > 0) { +if (mismatch > 0 || contract !== 7 || failures.length > 0) { console.error('✗ document isolation / handle contract violated'); process.exit(1); }