diff --git a/src/emit-lexer.ts b/src/emit-lexer.ts index 1a41ac6..bf2ce1d 100644 --- a/src/emit-lexer.ts +++ b/src/emit-lexer.ts @@ -103,6 +103,11 @@ export function emitLexer(grammar: CstGrammar, st: LexerSymtab): string | null { emit(`// ── Emitted lexer (emit-lexer.ts): specialized tokenize for this grammar ──`); for (const m of matchers) emit(`const ${m.re} = new RegExp(${J(`(?:${m.pattern})`)}, ${J(m.flags)});`); emit(`const LX_WS = /\\s+/y;`); + emit(`// window-truncation retry: a matcher failing at the WINDOW edge is not a lex`); + emit(`// error — the caller re-materializes a larger window (truncation cannot fake a`); + emit(`// resync: suffix-zone equality makes a cut token's END mismatch the old one)`); + emit(`const LEX_RETRY = { retry: true };`); + emit(`let lexWindowMore = false;`); emit(`const LX_UNI_IDENT = /[$_\\p{ID_Start}][$\\u200c\\u200d\\p{ID_Continue}]*/uy;`); emit(`const LX_UNI_CONT = /[$\\u200c\\u200d\\p{ID_Continue}]+/uy;`); emit(`const LX_UNI_FULL = /^[$_\\p{ID_Start}][$\\u200c\\u200d\\p{ID_Continue}]*/u;`); @@ -177,7 +182,7 @@ export function emitLexer(grammar: CstGrammar, st: LexerSymtab): string | null { emit(` if (validateEscapes) {`); emit(` LX_TPL_ESC.lastIndex = pos;`); emit(` const m = LX_TPL_ESC.exec(source);`); - emit(` if (!m) throw new Error('Invalid escape sequence in template at offset ' + pos);`); + emit(` if (!m) { if (lexWindowMore) throw LEX_RETRY; throw new Error('Invalid escape sequence in template at offset ' + pos); }`); emit(` pos += m[0].length;`); emit(` } else { pos += 2; }`); } else { @@ -188,6 +193,7 @@ export function emitLexer(grammar: CstGrammar, st: LexerSymtab): string | null { emit(` if (${startsWithExpr('source', 'pos', tplOpen)}) return { endsWithInterp: false, end: pos + ${tplOpen.length} };`); emit(` pos++;`); emit(` }`); + emit(` if (lexWindowMore) throw LEX_RETRY;`); emit(` throw new Error('Unterminated template literal at offset ' + pos);`); emit(`}`); } @@ -197,34 +203,81 @@ export function 
emitLexer(grammar: CstGrammar, st: LexerSymtab): string | null { // when a CST leaf is built. Flag bits: 1 = newlineBefore (the only stamp this emitted // lexer ever sets; comment/multilineFlow stamps belong to fallback-only grammars). emit(`function tokenize(source) {`); - emit(` src = source;`); + emit(` docPieces = [source]; docPieceOff = [0]; docLen = source.length;`); + emit(` docFlat = source; docCur = 0;`); emit(` tokN = 0;`); + emit(` parenCachePos = -1;`); + emit(` srcLenP1 = source.length + 1;`); + emit(` negFrom = 0x7fffffff;`); + emit(` lexCore(source, 0, -1, 0, -1, 0, 0);`); + emit(` return tokN;`); + emit(`}`); + emit(`// The lexer core, parameterized for WINDOWED re-lexing: start at startPos with`); + emit(`// the previous token's (k, t) as the regex-context seed (-1 = none / file start)`); + emit(`// and EMPTY template/paren stacks (the caller restarts only at depth-0 safe`); + emit(`// points). In window mode (wndPtr0 >= 0) the OLD stream sits in the alt buffers;`); + emit(`// after each token pushed at/past wndMinOff, resync fires when it aligns with an`); + emit(`// old token (same k/t, offsets shifted by wndDelta, both depth records 0) while`); + emit(`// the window's own stacks are empty — returns that OLD index (the duplicate push`); + emit(`// is retracted), or -1 when lexing ran to EOF.`); + emit(`function lexCore(source, startPos, pvK, pvT, wndPtr0, wndMinOff, wndDelta, wndCs, initParens, srcBase, hasMore) {`); + emit(` if (srcBase === undefined) srcBase = 0;`); + emit(` lexWindowMore = hasMore === true;`); emit(` const n = source.length;`); - emit(` let pos = 0;`); + emit(` let pos = startPos;`); emit(` let pendingNl = false;`); + emit(` let extraFl = 0;`); emit(` let lastBangWasPostfix = false;`); emit(` let lastCloseWasParenHead = false;`); emit(` const templateStack = [];`); - emit(` const parenHeadStack = [];`); + emit(` const parenHeadStack = initParens !== undefined && initParens !== null ? 
initParens : [];`); + emit(` let wndPtr = wndPtr0;`); + emit(` let wndHit = -1;`); + emit(` // stack depths as of the last token fully BEFORE the damage: a resync point may`); + emit(` // sit at any depth as long as every bracket still open there was opened before`); + emit(` // the damage (the prefix agrees byte-for-byte, so those stack entries agree too;`); + emit(` // anything opened inside the damage could differ in control-head-ness).`); + emit(` let dmgDp = -1, dmgPd = -1;`); + emit(` let lastDp = templateStack.length, lastPd = parenHeadStack.length;`); emit(` function tkPush(k, t, off, end) {`); + emit(` off += srcBase; end += srcBase;`); emit(` if (tokN === tkCap) growTok();`); emit(` tkK[tokN] = k; tkT[tokN] = t; tkOff[tokN] = off; tkEnd[tokN] = end;`); - emit(` tkFl[tokN] = pendingNl ? 1 : 0;`); + emit(` tkFl[tokN] = (pendingNl ? 1 : 0) | extraFl;`); + emit(` extraFl = 0;`); + emit(` tkDp[tokN] = templateStack.length;`); + emit(` tkPd[tokN] = parenHeadStack.length;`); emit(` pendingNl = false;`); + emit(` pvK = k; pvT = t;`); emit(` tokN++;`); + emit(` if (wndPtr >= 0) {`); + emit(` if (dmgPd < 0) {`); + emit(` if (off >= wndCs) { dmgDp = lastDp; dmgPd = lastPd; }`); + emit(` else { lastDp = tkDp[tokN - 1]; lastPd = tkPd[tokN - 1]; }`); + emit(` }`); + emit(` if (off >= wndMinOff && dmgPd >= 0`); + emit(` && templateStack.length <= dmgDp && parenHeadStack.length <= dmgPd) {`); + emit(` while (wndPtr < altN && (altOff[wndPtr] < 0 ? altOff[wndPtr] + srcLenP1 : altOff[wndPtr]) + wndDelta < off) wndPtr++;`); + emit(` if (wndPtr < altN && (altOff[wndPtr] < 0 ? altOff[wndPtr] + srcLenP1 : altOff[wndPtr]) + wndDelta === off && altK[wndPtr] === k && altT[wndPtr] === t`); + emit(` && (altEnd[wndPtr] < 0 ? 
altEnd[wndPtr] + srcLenP1 : altEnd[wndPtr]) + wndDelta === end && altDp[wndPtr] === templateStack.length && altPd[wndPtr] === parenHeadStack.length) {`); + emit(` wndHit = wndPtr;`); + emit(` }`); + emit(` }`); + emit(` }`); emit(` }`); emit(` // prevIsValue, baked: postfix-ambiguous op → its recorded position; an expression-`); emit(` // head keyword or a control-head ')' is NOT a value; else division-prev type/text.`); emit(` function prevIsValue() {`); - emit(` if (tokN === 0) return false;`); - emit(` const i = tokN - 1;`); - emit(` const t = tkT[i];`); + emit(` const k = tokN > 0 ? tkK[tokN - 1] : pvK;`); + emit(` if (k < 0) return false;`); + emit(` const t = tokN > 0 ? tkT[tokN - 1] : pvT;`); emit(` if (LX_PFXV[t] !== 0) return lastBangWasPostfix;`); - emit(` if (tkK[i] === ${kIdent} && LX_EXPRKW[t] !== 0) return false;`); + emit(` if (k === ${kIdent} && LX_EXPRKW[t] !== 0) return false;`); emit(` if (t === ${tRParen} && lastCloseWasParenHead) return false;`); - emit(` return LX_DIVK[tkK[i]] !== 0 || LX_DIVT[t] !== 0;`); + emit(` return LX_DIVK[k] !== 0 || LX_DIVT[t] !== 0;`); emit(` }`); emit(` while (pos < n) {`); + emit(` if (wndHit >= 0) { tokN--; return wndHit; }`); emit(` const cc = source.charCodeAt(pos);`); emit(` // whitespace: ASCII \\s run by char loop; a non-ASCII candidate falls back to the regex`); emit(` if (cc === 32 || (cc >= 9 && cc <= 13)) {`); @@ -317,7 +370,7 @@ export function emitLexer(grammar: CstGrammar, st: LexerSymtab): string | null { emit(`${ind} if (m !== null) {`); if (m.identLike) { const plen = (identPrefixByName.get(m.name) ?? 
'').length; - emit(`${ind} if (!lexIdentValid(m[0], ${plen})) throw new Error("Invalid identifier escape at offset " + pos + ": '" + m[0] + "'");`); + emit(`${ind} if (!lexIdentValid(m[0], ${plen})) { if (lexWindowMore) throw LEX_RETRY; throw new Error("Invalid identifier escape at offset " + pos + ": '" + m[0] + "'"); }`); } if (m.skip) { emit(`${ind} if (m[0].includes('\\n')) pendingNl = true;`); @@ -334,7 +387,9 @@ export function emitLexer(grammar: CstGrammar, st: LexerSymtab): string | null { // Chars 1..len-1 already known to match when this leaf is reached via the chain below. if (lit === '(') { emit(`${ind}{ const isMemberName = tokN >= 2 && LX_MEMBER[tkT[tokN - 2]] !== 0;`); - emit(`${ind} parenHeadStack.push(!isMemberName && tokN >= 1 && tkK[tokN - 1] === ${kIdent} && LX_PARENKW[tkT[tokN - 1]] !== 0); }`); + emit(`${ind} const _ph = !isMemberName && tokN >= 1 && tkK[tokN - 1] === ${kIdent} && LX_PARENKW[tkT[tokN - 1]] !== 0;`); + emit(`${ind} parenHeadStack.push(_ph);`); + emit(`${ind} extraFl = _ph ? 8 : 0; }`); } else if (lit === ')') { emit(`${ind}lastCloseWasParenHead = parenHeadStack.pop() ?? 
false;`); } @@ -425,13 +480,13 @@ export function emitLexer(grammar: CstGrammar, st: LexerSymtab): string | null { emit(` const _li = tokN - 1;`); const likeKs = [...identLike].map(kOf); const likeCond = likeKs.map(k => `tkK[_li] === ${k}`).join(' || '); - emit(` if ((${likeCond}) && tkEnd[_li] === pos) {`); + emit(` if ((${likeCond}) && tkEnd[_li] === pos + srcBase) {`); emit(` LX_UNI_CONT.lastIndex = pos;`); emit(` const cont = LX_UNI_CONT.exec(source);`); emit(` if (cont !== null) {`); emit(` pos += cont[0].length;`); - emit(` tkEnd[_li] = pos;`); - emit(` tkT[_li] = lexKwT(source, tkOff[_li], pos);`); + emit(` tkEnd[_li] = pos + srcBase;`); + emit(` tkT[_li] = lexKwT(source, tkOff[_li] - srcBase, pos);`); emit(` continue;`); emit(` }`); emit(` }`); @@ -459,9 +514,70 @@ export function emitLexer(grammar: CstGrammar, st: LexerSymtab): string | null { emit(` }`); emit(` }`); } + emit(` if (lexWindowMore) throw LEX_RETRY;`); emit(` throw new Error("Unexpected character at offset " + pos + ": '" + source[pos] + "'");`); emit(` }`); - emit(` return tokN;`); + emit(` if (wndHit >= 0) { tokN--; return wndHit; }`); + emit(` return hasMore ? -2 : -1;`); + emit(`}`); + emit(`// Windowed-relex restart anchor: the last token B ending at/before the damage`); + emit(`// whose recorded stack depths are zero and whose shape leaves no cross-token`); + emit(`// lexer flag live (a control-head ')' or a postfix-ambiguous operator would`); + emit(`// make the next token's regex-context depend on unrecoverable state). -1 = file`); + emit(`// head (always sound, degrades to a full re-lex).`); + emit(`function findRestart(cs) {`); + emit(` let lo = 0, hi = tokN;`); + // STRICTLY before the damage: a token ENDING exactly at cs can be EXTENDED by + // the edit under maximal munch ('b' + inserted 'x' = 'bx'; '=' + '=' = '=='; + // deleting the gap glues neighbours) and the anchor itself is never re-lexed — + // with < the abutting token falls inside the window and the merge is re-derived. 
+ emit(` while (lo < hi) { const mid = (lo + hi) >> 1; if (tend(mid) < cs) lo = mid + 1; else hi = mid; }`); + emit(` for (let b = lo - 1; b >= 0; b--) {`); + emit(` // template depth must be zero (interp brace counters are not reconstructable),`); + emit(` // and the anchor token must leave no cross-token lexer flag live: not a`); + emit(` // control-head ')', not a postfix-ambiguous op, and not a control KEYWORD`); + emit(` // (a '(' lexed first in the window would mis-derive its head-ness from a`); + emit(` // missing predecessor). Paren depth may be anything — the live stack is`); + emit(` // reconstructed from the recorded depths and the '(' head bits.`); + emit(` if (tkDp[b] === 0 && LX_PFXV[tkT[b]] === 0 && LX_PARENKW[tkT[b]] === 0 && !(tkK[b] === 1 && tkT[b] === ${tRParen})) return b;`); + emit(` }`); + emit(` return -1;`); + emit(`}`); + emit(`// Rebuild the live paren-head stack enclosing token b: scanning backward, the`); + emit(`// first '(' recording exactly depth d is the live opener of level d (closed`); + emit(`// openers at that depth are re-opened later, and the re-opener comes first`); + emit(`// backward). The '(' records its depth INCLUDING itself, and carries its`); + emit(`// control-head-ness as tkFl bit 8.`); + emit(`function reconstructParens(b) {`); + emit(` let need = b >= 0 ? tkPd[b] : 0;`); + emit(` const out = new Array(need);`); + emit(` for (let i = b; i >= 0 && need > 0; i--) {`); + emit(` if (tkK[i] === 1 && tkT[i] === ${tOf('(')} && tkPd[i] === need) { out[need - 1] = (tkFl[i] & 8) !== 0; need--; }`); + emit(` }`); + emit(` return out;`); + emit(`}`); + emit(`// Session cache for the live paren stack: the previous edit's anchor stack rolled`); + emit(`// FORWARD over the tokens between the two anchors (push on '(', pop on ')') — the`); + emit(`// backward scan is O(distance to the outermost live opener), which a deep`); + emit(`// stationary session would pay per keystroke. 
Tokens at/before the cached anchor`); + emit(`// are splice-stable (every splice begins past its own anchor), so the baseline`); + emit(`// stays exact; a backward jump (b < cached) falls back to the full scan.`); + emit(`let parenCachePos = -1;`); + emit(`let parenCacheStack = [];`); + emit(`function reconstructParensCached(b) {`); + emit(` let stack;`); + emit(` if (b < 0) stack = [];`); + emit(` else if (parenCachePos >= 0 && parenCachePos <= b) {`); + emit(` stack = parenCacheStack;`); + emit(` for (let i = parenCachePos + 1; i <= b; i++) {`); + emit(` if (tkK[i] === 1) {`); + emit(` if (tkT[i] === ${tOf('(')}) stack.push((tkFl[i] & 8) !== 0);`); + emit(` else if (tkT[i] === ${tRParen}) { if (stack.length > 0) stack.pop(); }`); + emit(` }`); + emit(` }`); + emit(` } else stack = reconstructParens(b);`); + emit(` parenCachePos = b; parenCacheStack = stack;`); + emit(` return stack.slice();`); emit(`}`); return out.join('\n'); } diff --git a/src/emit-parser.ts b/src/emit-parser.ts index 8d8f521..4498f64 100644 --- a/src/emit-parser.ts +++ b/src/emit-parser.ts @@ -689,6 +689,49 @@ class Emitter { // Reference to a rule's parse function (token refs are inlined where used). private ruleFn(name: string) { return `R_${sanitize(name)}`; } + // SPINE rules — the entry rule's repetition units (the rules its body references + // directly): memoized through parseRuleEntry and therefore the adoption/run- + // extension granularity. Shared by emitRuleFns (memoized emission) and the + // quantifier run-extension hook. Grammar-shape-derived — no language names. 
+  private spine: Set<string> | null = null;
+  // Lazily computes the spine set and caches it in this.spine. The entry rule is the
+  // LAST rule of the grammar; only refs appearing in its own body are collected
+  // (a ref is recorded, never followed into the referenced rule).
+  spineSet(): Set<string> {
+    if (this.spine !== null) return this.spine;
+    const a = this.a;
+    const spine = new Set<string>();
+    const entryRule = a.grammar.rules[a.grammar.rules.length - 1];
+    // Descend through seq/alt items, quantifier/group bodies and sep elements;
+    // every other node type contributes nothing.
+    const walk = (x: RuleExpr): void => {
+      switch (x.type) {
+        case 'ref': if (a.ruleByName.has(x.name)) spine.add(x.name); return;
+        case 'seq': case 'alt': x.items.forEach(walk); return;
+        case 'quantifier': case 'group': walk(x.body); return;
+        case 'sep': walk(x.element); return;
+        default: return;
+      }
+    };
+    walk(entryRule.body);
+    // a self-reference of the entry rule is not one of its own repetition units
+    spine.delete(entryRule.name);
+    return (this.spine = spine);
+  }
+  // The run-extension target of a repetition: when the body unwraps to a plain ref of
+  // a rule that routes through parseRuleEntry (pratt / left-rec / spine), its rule id;
+  // else -1 (the loop gets no extension hook — adoption stays element-by-element).
+  // The returned id is the rule's index in a.grammar.rules.
+  quantRunRuleId(body: RuleExpr): number {
+    const a = this.a;
+    let expr = body;
+    // Unwrap: peel groups that carry no suppress list, and seqs that hold exactly one
+    // item once op / prefix / postfix items are set aside.
+    while (true) {
+      if (expr.type === 'group' && !(expr.suppress && expr.suppress.length)) { expr = expr.body; continue; }
+      if (expr.type === 'seq') {
+        const real = expr.items.filter(it => it.type !== 'op' && it.type !== 'prefix' && it.type !== 'postfix');
+        if (real.length === 1) { expr = real[0]; continue; }
+      }
+      break;
+    }
+    if (expr.type !== 'ref' || !a.ruleByName.has(expr.name)) return -1;
+    const name = expr.name;
+    if (!(a.prattRules.has(name) || a.leftRecSet.has(name) || this.spineSet().has(name))) return -1;
+    return a.grammar.rules.findIndex(r => r.name === name);
+  }
+
   /**
    * Emit (once) a helper fn for a compound `expr` and return its name. The helper
    * has the matchExpr contract: returns the matched children array or null, with pos
@@ -853,13 +896,20 @@ class Emitter {
       // Try once; on failure the helper restored pos/scn itself.
return `${fn}();`; } + // Run-extension: after an iteration whose element was ADOPTED from the old tree, + // bulk-adopt its following old siblings (runExtend) instead of re-entering the + // rule machinery once per element. Only loops over a parseRuleEntry-routed rule + // get the hook, and runExtend re-checks rid + generation, so an inner rule's + // adoption can never feed elements into an outer loop. + const runId = this.quantRunRuleId(body); + const ext = runId >= 0 ? `\n if (adoptRunPos === pos) runExtend(${runId});` : ''; if (kind === '*') { const before = this.id(), bsn = this.id(); return [ `while (true) {`, ` const ${before} = pos; const ${bsn} = scn;`, ` if (!${fn}()) break;`, - ` if (pos === ${before} && scn === ${bsn}) break;`, + ` if (pos === ${before} && scn === ${bsn}) break;` + ext, `}`, ].join('\n'); } @@ -870,7 +920,7 @@ class Emitter { `while (true) {`, ` const ${before} = pos; const ${bsn} = scn;`, ` if (!${fn}()) break;`, - ` if (pos === ${before} && scn === ${bsn}) break;`, + ` if (pos === ${before} && scn === ${bsn}) break;` + ext, `}`, ].join('\n'); } @@ -1329,9 +1379,104 @@ let tkT = new ${T_ARR}(4096); let tkOff = new Int32Array(4096); let tkEnd = new Int32Array(4096); let tkFl = new Uint8Array(4096); +// lexer-state depth records per token (windowed relex restart/resync safety): +// tkDp = template-interp stack depth, tkPd = paren-head stack depth, both AS RECORDED +// at the token's push (the convention per token kind is fixed by the lexer's code +// path; determinism is what the predicates rely on, depth-0 is the safe state). 
+let tkDp = new Uint8Array(4096);
+let tkPd = new Uint16Array(4096);
 let tkCap = 4096;
 let tokN = 0;
-let src = '';
+// ── The DOCUMENT text layer ──
+// The text lives as PIECES (flat string fragments): applying a change splits the
+// covering pieces (O(1) SlicedString views — never a flatten) and splices the new
+// text in, so a keystroke costs O(pieces), not the O(n) cons-flatten a slice+concat
+// per edit forces in V8 (measured: ~1.2ms per edit on 9MB). docFlat caches the
+// joined form for the cold paths that need one (errors, debug views); batch parses
+// set it directly. Reads route through docChar/docText: flat fast path, piece
+// lookup (cursor-cached) otherwise.
+let docPieces = null;
+let docPieceOff = null;
+let docLen = 0;
+let docFlat = null;
+let docCur = 0;
+// Index of the piece covering document offset i: the last k with
+// docPieceOff[k] <= i. The cached cursor docCur is tried first; on a miss a
+// binary search over docPieceOff finds k and re-seats the cursor.
+function docLocate(i) {
+  let k = docCur;
+  const po = docPieceOff;
+  const n = po.length;
+  if (k >= n || po[k] > i || (k + 1 < n && po[k + 1] <= i)) {
+    let lo = 0, hi = n;
+    while (lo < hi) { const m = (lo + hi) >> 1; if (po[m] <= i) lo = m + 1; else hi = m; }
+    k = lo - 1;
+    docCur = k;
+  }
+  return k;
+}
+// Char code at document offset i (flat cache when present, else the covering piece).
+function docChar(i) {
+  if (docFlat !== null) return docFlat.charCodeAt(i);
+  const k = docLocate(i);
+  return docPieces[k].charCodeAt(i - docPieceOff[k]);
+}
+// Text of the half-open range [a, b): a plain slice of the flat cache when present,
+// otherwise assembled from the pieces the range covers ('' when b <= a).
+function docText(a, b) {
+  if (docFlat !== null) return docFlat.slice(a, b);
+  if (b <= a) return '';
+  let k = docLocate(a);
+  const first = docPieces[k];
+  const lo = a - docPieceOff[k];
+  // fast exit: the whole range sits inside the first piece
+  if (b - docPieceOff[k] <= first.length) return first.slice(lo, b - docPieceOff[k]);
+  let out = first.slice(lo);
+  k++;
+  while (k < docPieces.length && docPieceOff[k] < b) {
+    const piece = docPieces[k];
+    const need = b - docPieceOff[k];
+    out += need >= piece.length ? piece : piece.slice(0, need);
+    k++;
+  }
+  return out;
+}
+// Joined document text; computed on first request and cached in docFlat.
+function flattenDoc() {
+  if (docFlat === null) docFlat = docPieces.join('');
+  return docFlat;
+}
+// Replace the document range [start, end) with text (end === start is a pure
+// insert, empty text a pure delete). The covering pieces are split into a kept
+// head and tail, the new text is spliced between them, docPieceOff is rebuilt
+// from the splice point, and the flat cache and the lookup cursor are reset.
+// No bounds check is made: offsets are taken to lie within the current document.
+function applyChange(start, end, text) {
+  const ks = docLocate(start);
+  const ke = docLocate(end > start ? end - 1 : start);
+  const head = docPieces[ks].slice(0, start - docPieceOff[ks]);
+  const tailPiece = end > start ? docPieces[ke] : docPieces[ks];
+  const tailOff = end - docPieceOff[end > start ? ke : ks];
+  const tail = tailPiece.slice(tailOff);
+  const repl = [];
+  if (head.length > 0) repl.push(head);
+  if (text.length > 0) repl.push(text);
+  if (tail.length > 0) repl.push(tail);
+  docPieces.splice(ks, (end > start ? ke : ks) - ks + 1, ...repl);
+  // Deleting the whole document with an empty replacement would leave NO piece:
+  // docLocate would then answer -1 and every later read or edit would index
+  // docPieces[-1]. Keep one empty piece so the layer stays addressable.
+  if (docPieces.length === 0) docPieces.push('');
+  // consolidate when fragmenting (amortized: a join every ≥256 edits)
+  if (docPieces.length > 256) {
+    docPieces = [docPieces.join('')];
+  }
+  docLen += text.length - (end - start);
+  // rebuild offsets from the splice point (suffix offsets shifted anyway)
+  if (docPieceOff.length !== docPieces.length) docPieceOff.length = docPieces.length;
+  let off = ks > 0 && ks - 1 < docPieces.length ? docPieceOff[ks - 1] + docPieces[ks - 1].length : 0;
+  for (let k2 = ks > 0 ? ks : 0; k2 < docPieces.length; k2++) {
+    docPieceOff[k2] = off;
+    off += docPieces[k2].length;
+  }
+  // after a consolidation the single piece always starts at 0, whatever ks was
+  if (docPieces.length === 1) docPieceOff[0] = 0;
+  docCur = 0;
+  docFlat = null;
+}
+// ── EOF-relative spans (incremental sessions) ──
+// A token's tkOff/tkEnd may be stored EOF-RELATIVE (value − (srcLen + 1), strictly
+// negative): the decode adds the CURRENT length back, so a pure suffix never needs
+// the O(suffix) add-loop a char delta would otherwise force — updating srcLenP1 IS
+// the shift. Values self-describe by sign, so mixed zones stay readable; negFrom
+// only bounds where negatives may exist (the flip-band maintenance range). Batch
+// parses are all-positive and the decode branch never fires.
+let srcLenP1 = 1;
+let negFrom = 0x7fffffff;
+// Decoded (absolute) start / end offset of token i.
+function toff(i) { const v = tkOff[i]; return v < 0 ? v + srcLenP1 : v; }
+function tend(i) { const v = tkEnd[i]; return v < 0 ? v + srcLenP1 : v; }
 ${e.soa ?
'' : 'let tkText = []; // fallback-lexer text column (synthetic tokens are not source spans)'} function growTok() { tkCap *= 2; @@ -1340,6 +1485,8 @@ function growTok() { const o = new Int32Array(tkCap); o.set(tkOff); tkOff = o; const e2 = new Int32Array(tkCap); e2.set(tkEnd); tkEnd = e2; const f = new Uint8Array(tkCap); f.set(tkFl); tkFl = f; + const d = new Uint8Array(tkCap); d.set(tkDp); tkDp = d; + const q = new Uint16Array(tkCap); q.set(tkPd); tkPd = q; } // ── CST arena: nodes are rows in parallel columns; leaves are TOKEN REFERENCES ── @@ -1352,13 +1499,50 @@ function growTok() { // Rows store ABSOLUTE offsets in this phase (the green {rel,len} re-base is the // incremental round's move; flipping the stored form regenerates matchers only). let rowRule = new Uint16Array(8192); // rule id (index into RULE_NAMES) -let rowOff = new Int32Array(8192); // absolute start offset let rowLen = new Int32Array(8192); +let rowTokLen = new Int32Array(8192); // subtree token count let rowStart = new Int32Array(8192); // first index into kids let rowCount = new Int32Array(8192); +// lookahead GAP: how far past its own first token the node's parse may have READ +// (ext − start, a length — position-independent like everything green). Adoption +// validity across edits compares q + rowExt + slack against the damage start. +let rowExt = new Int32Array(8192); +// adoption eligibility: set ONLY where the old parse MEMOIZED the node — a row built +// under a suppress (no-'in') or parseLimit-capped context is a context-dependent +// parse and must never be adopted into a normal entry (the memo carry never stored +// those; adoption must not widen the contract). +let rowOK = new Uint8Array(8192); +// kid-containment bit (lazy): 0 unknown, 1 = every kid's probe watermark stays +// at/below the next kid's start (so a prefix-keep check of the LAST kept kid +// transitively bounds all earlier ones), 2 = violated somewhere. 
Computed on +// first surgical use of a row, maintained across in-place splices. +let rowKC = new Uint8Array(8192); +// END-RELATIVE kid rels (incremental sessions): a ROW kid's kidTokRel/kidRel may be +// stored relative to the parent's END (value − (parentLen + 1), strictly negative); +// the decode adds the parent's CURRENT length back. A surgical splice then shifts +// the whole suffix by updating the parent's lengths — no per-kid add-loop — and the +// values stay correct as long as the parent row is unedited (only surgery changes a +// row's lengths, and it maintains its own band). Leaf kids pack their rel inside the +// kids value and always stay start-relative (the trailing-leaf walk shifts them +// eagerly). rowNF = first kid index (absolute, like rowStart) that may hold an +// end-relative value; batch parses never flip, so the decode branch never fires. +let rowNF = new Int32Array(8192).fill(0x7fffffff); +function ktr(p, k) { const v = kidTokRel[k]; return v < 0 ? v + rowTokLen[p] + 1 : v; } +function kcr(p, k) { const v = kidRel[k]; return v < 0 ? v + rowLen[p] + 1 : v; } +// transient BUILD coordinates (absolute), valid for rows completed in the current +// parse and REFRESHED at memo-hit time for reused roots — parents read them at +// finishNode to write the children's relative fields; never part of the green tree. +let absChar = new Int32Array(8192); +let absTok = new Int32Array(8192); let rowCap = 8192; let nodeN = 0; let kids = new Int32Array(16384); +// A node child's RELATIVE coordinates live in the PARENT's kids stream (parallel to +// kids), not on the child row: a memo-reused subtree can be a child of several +// longest-match CANDIDATES, and a losing candidate completing after the winner would +// clobber child-side rel fields. The parent owns its edges; rows own only lengths. 
+let kidRel = new Int32Array(16384); +let kidTokRel = new Int32Array(16384); let kidCap = 16384; let kidN = 0; // Scratch: completed-but-unattached children of in-progress arms. Every @@ -1369,21 +1553,31 @@ let scn = 0; function growRows() { rowCap *= 2; const r = new Uint16Array(rowCap); r.set(rowRule); rowRule = r; - const o = new Int32Array(rowCap); o.set(rowOff); rowOff = o; const l = new Int32Array(rowCap); l.set(rowLen); rowLen = l; + const tl = new Int32Array(rowCap); tl.set(rowTokLen); rowTokLen = tl; const s = new Int32Array(rowCap); s.set(rowStart); rowStart = s; const c = new Int32Array(rowCap); c.set(rowCount); rowCount = c; + const x = new Int32Array(rowCap); x.set(rowExt); rowExt = x; + const ok = new Uint8Array(rowCap); ok.set(rowOK); rowOK = ok; + const kc = new Uint8Array(rowCap); kc.set(rowKC); rowKC = kc; + const nf = new Int32Array(rowCap).fill(0x7fffffff); nf.set(rowNF.subarray(0, nodeN)); rowNF = nf; + const ac = new Int32Array(rowCap); ac.set(absChar); absChar = ac; + const at = new Int32Array(rowCap); at.set(absTok); absTok = at; } function growKids(n) { while (kidN + n > kidCap) kidCap *= 2; const k = new Int32Array(kidCap); k.set(kids.subarray(0, kidN)); kids = k; + const r = new Int32Array(kidCap); r.set(kidRel.subarray(0, kidN)); kidRel = r; + const t = new Int32Array(kidCap); t.set(kidTokRel.subarray(0, kidN)); kidTokRel = t; } function scPush(e) { if (scn === scCap) { scCap *= 2; const s = new Int32Array(scCap); s.set(sc); sc = s; } sc[scn++] = e; } -function entryOff(e) { return e >= 0 ? rowOff[e] : tkOff[(~e) >>> 2]; } -function entryEnd(e) { return e >= 0 ? rowOff[e] + rowLen[e] : tkEnd[(~e) >>> 2]; } +function entryOff(e) { return e >= 0 ? absChar[e] : toff((~e) >>> 2); } +function entryEnd(e) { return e >= 0 ? absChar[e] + rowLen[e] : tend((~e) >>> 2); } +function entryTok(e) { return e >= 0 ? absTok[e] : (~e) >>> 2; } +function entryTokEnd(e) { return e >= 0 ? 
absTok[e] + rowTokLen[e] : ((~e) >>> 2) + 1; } // Complete a node whose children are scratch[mark..scn): copy them into kids, write // the row, truncate scratch, return the id. Empty children = a zero-width node // at the current token (the old offset() rule). @@ -1391,20 +1585,41 @@ function finishNode(rid, mark) { const n = scn - mark; if (nodeN === rowCap) growRows(); const id = nodeN++; - let myOff, myEnd; + let myOff, myEnd, myTok, myTokEnd; if (n > 0) { if (kidN + n > kidCap) growKids(n); const ks = kidN; - for (let i = 0; i < n; i++) kids[ks + i] = sc[mark + i]; - kidN += n; - rowStart[id] = ks; myOff = entryOff(sc[mark]); myEnd = entryEnd(sc[scn - 1]); + myTok = entryTok(sc[mark]); + myTokEnd = entryTokEnd(sc[scn - 1]); + // GREEN conversion: scratch entries carry ABSOLUTE coordinates; the kids span is + // written position-independent — a leaf becomes node-relative-token-encoded, a + // child node gets its rel fields written here (its own row knows only lengths). + for (let i = 0; i < n; i++) { + const e = sc[mark + i]; + if (e < 0) { + kids[ks + i] = ~(((((~e) >>> 2) - myTok) << 2) | ((~e) & 3)); + } else { + kids[ks + i] = e; + kidRel[ks + i] = absChar[e] - myOff; + kidTokRel[ks + i] = absTok[e] - myTok; + } + } + kidN += n; + rowStart[id] = ks; } else { rowStart[id] = kidN; myOff = offset(); myEnd = myOff; - } - rowRule[id] = rid; rowOff[id] = myOff; rowLen[id] = myEnd - myOff; rowCount[id] = n; + myTok = pos; myTokEnd = pos; + } + rowRule[id] = rid; rowLen[id] = myEnd - myOff; rowCount[id] = n; + rowTokLen[id] = myTokEnd - myTok; + rowExt[id] = maxPos - myTok; + rowOK[id] = 0; + rowKC[id] = 0; + rowNF[id] = 0x7fffffff; + absChar[id] = myOff; absTok[id] = myTok; scn = mark; return id; } @@ -1415,13 +1630,32 @@ function finishWrap(rid, lhsId, mark) { const id = nodeN++; if (kidN + n + 1 > kidCap) growKids(n + 1); const ks = kidN; + const myOff = absChar[lhsId]; + const myTok = absTok[lhsId]; + const myEnd = n > 0 ? 
entryEnd(sc[scn - 1]) : myOff + rowLen[lhsId]; + const myTokEnd = n > 0 ? entryTokEnd(sc[scn - 1]) : myTok + rowTokLen[lhsId]; kids[ks] = lhsId; - for (let i = 0; i < n; i++) kids[ks + 1 + i] = sc[mark + i]; + kidRel[ks] = 0; + kidTokRel[ks] = 0; + for (let i = 0; i < n; i++) { + const e = sc[mark + i]; + if (e < 0) { + kids[ks + 1 + i] = ~(((((~e) >>> 2) - myTok) << 2) | ((~e) & 3)); + } else { + kids[ks + 1 + i] = e; + kidRel[ks + 1 + i] = absChar[e] - myOff; + kidTokRel[ks + 1 + i] = absTok[e] - myTok; + } + } kidN += n + 1; - const myOff = rowOff[lhsId]; - const myEnd = n > 0 ? entryEnd(sc[scn - 1]) : rowOff[lhsId] + rowLen[lhsId]; - rowRule[id] = rid; rowOff[id] = myOff; rowLen[id] = myEnd - myOff; + rowRule[id] = rid; rowLen[id] = myEnd - myOff; rowStart[id] = ks; rowCount[id] = n + 1; + rowTokLen[id] = myTokEnd - myTok; + rowExt[id] = maxPos - myTok; + rowOK[id] = 0; + rowKC[id] = 0; + rowNF[id] = 0x7fffffff; + absChar[id] = myOff; absTok[id] = myTok; scn = mark; return id; } @@ -1431,6 +1665,13 @@ let pos = 0; let maxPos = 0; let memoNode = []; let memoEnd = []; +let memoExt = []; // per-entry lookahead extent (see parseRuleEntry) +// GENERATION-STAMPED memo: the per-rule arrays persist across parses (allocating +// fresh multi-million-slot arrays per edit cost ~30% of a large-file edit in GC +// alone); an entry is live iff its stamp equals the current generation — bumping +// memoGenCur IS the whole reset. +let memoGen = []; +let memoGenCur = 0; let parseLimit = -1; // cap = the exclusive lookahead bound: min(parseLimit-or-∞, tokN), maintained at the // parseLimit set/restore sites and the one token-stream mutation (the '>' splice). @@ -1440,8 +1681,8 @@ let suppressNext = null; let suppressCur = null; function offset() { - if (pos < cap) return tkOff[pos]; - return tokN > 0 ? tkEnd[tokN - 1] : 0; + if (pos < cap) return toff(pos); + return tokN > 0 ? 
tend(tokN - 1) : 0; } // ── Lever 1: integer-kind matchers ── @@ -1471,7 +1712,7 @@ function matchPuLit(pu) { } function matchPuLitGT(pu) { if (pos >= cap) return false; - const off = tkOff[pos]; + const off = toff(pos); if (tkT[pos] === pu) { scPush(~(pos << 2)); if (++pos > maxPos) maxPos = pos; @@ -1480,36 +1721,38 @@ function matchPuLitGT(pu) { // Split multi-'>' tokens: '>>', '>>>', '>>=', '>>>=' can yield a single '>': shift the // columns up one slot and write the '>' + rest pair in place (both born flag-less, // matching the old mkPunct pair). - if (tkK[pos] === K_PUNCT && tkEnd[pos] - off > 1 && ${e.soa ? 'src.charCodeAt(off) === 62' : "tkText[pos].charCodeAt(0) === 62"}) { - const end0 = tkEnd[pos]; + if (tkK[pos] === K_PUNCT && tend(pos) - off > 1 && ${e.soa ? 'docChar(off) === 62' : "tkText[pos].charCodeAt(0) === 62"}) { + const end0 = tend(pos); ${e.soa ? '' : 'const restText = tkText[pos].slice(1);'} if (tokN === tkCap) growTok(); + parenCachePos = -1; tkK.copyWithin(pos + 1, pos, tokN); tkT.copyWithin(pos + 1, pos, tokN); tkOff.copyWithin(pos + 1, pos, tokN); tkEnd.copyWithin(pos + 1, pos, tokN); + tkDp.copyWithin(pos + 1, pos, tokN); + tkPd.copyWithin(pos + 1, pos, tokN); tkFl.copyWithin(pos + 1, pos, tokN); ${e.soa ? '' : "tkText.splice(pos, 1, '>', restText);"} - tkT[pos] = pu; tkEnd[pos] = off + 1; tkFl[pos] = 0; - tkOff[pos + 1] = off + 1; tkFl[pos + 1] = 0; - tkT[pos + 1] = ${e.soa ? 'LIT_PU.get(src.slice(off + 1, end0)) ?? 0' : 'LIT_PU.get(restText) ?? 0'}; + // Keep the EOF-relative zone invariant: a split at/past negFrom writes the new + // pair EOF-relative (a positive value there would not ride later srcLenP1 + // shifts); below it, the boundary index moves up one slot with the suffix. 
+ if (pos < negFrom) { + negFrom++; + tkT[pos] = pu; tkEnd[pos] = off + 1; tkFl[pos] = 0; + tkOff[pos + 1] = off + 1; tkFl[pos + 1] = 0; + } else { + tkT[pos] = pu; tkEnd[pos] = off + 1 - srcLenP1; tkFl[pos] = 0; + tkOff[pos + 1] = off + 1 - srcLenP1; tkFl[pos + 1] = 0; + } + tkT[pos + 1] = ${e.soa ? 'LIT_PU.get(docText(off + 1, end0)) ?? 0' : 'LIT_PU.get(restText) ?? 0'}; tokN++; if (parseLimit < 0) cap = tokN; // Token indices shifted: drop the per-rule memo arrays (recreated lazily at the new size). - memoNode.fill(undefined); - memoEnd.fill(undefined); - // Leaf entries reference tokens BY INDEX, so the splice's +1 shift must be applied - // to every committed/scratch entry past the split point. (Object trees were immune — - // leaves copied their spans; the arena trades that copy for this rare O(kidN) pass. - // Entries AT pos can't exist: that token is being consumed right now.) - for (let i = 0; i < kidN; i++) { - const ke = kids[i]; - if (ke < 0 && ((~ke) >>> 2) > pos) kids[i] = ke - 4; - } - for (let i = 0; i < scn; i++) { - const se = sc[i]; - if (se < 0 && ((~se) >>> 2) > pos) sc[i] = se - 4; - } + memoGenCur++; // positions shifted mid-parse: every stamped entry is stale + // GREEN tree: no kids/scratch fixup — every completed row and scratch entry lies + // wholly BEFORE the splice point (token pos is being consumed right now), and the + // carried memo was just cleared, so nothing reachable references shifted indices. scPush(~(pos << 2)); if (++pos > maxPos) maxPos = pos; return true; @@ -1578,26 +1821,71 @@ function parseTemplateExpr() { // Emit the per-rule parse functions + the RULES dispatch table. function emitRuleFns(e: Emitter, a: ReturnType) { const ruleFn = (name: string) => `R_${sanitize(name)}`; + // SPINE rules — the entry rule's repetition units (the rules its body references + // directly): the natural reuse granularity for incremental re-parsing, so they get + // memoized through parseRuleEntry like pratt/left-rec rules. 
Without this only + // expression/type subtrees reuse and every statement re-walks on each edit. + // Derived from the grammar shape — no language names. + const spine = e.spineSet(); for (const rule of a.grammar.rules) { if (a.prattRules.has(rule.name)) emitPrattRule(e, a, rule); else if (a.leftRecSet.has(rule.name)) emitLeftRecRule(e, a, rule); - else emitNonRecRule(e, a, rule); + else emitNonRecRule(e, a, rule, spine.has(rule.name) && !a.prattRules.has(rule.name) && !a.leftRecSet.has(rule.name)); } // Dispatch table (string rule name → fn), for parseTemplateExpr's dynamic interp rule. e.emit(`const RULES = {`); for (const rule of a.grammar.rules) e.emit(` ${J(rule.name)}: ${ruleFn(rule.name)},`); e.emit(`};`); + + // Surgical-container table: rule id → its repetition element's rule id, for rules + // whose body is a PURE seq/group of literals/refs around exactly one '*'/'+' rep + // of a parseRuleEntry-routed rule. No alt/sep/opt/not anywhere in the body: a + // longest-match arm (or lookahead) at the container's OWN level may probe into + // the rep zone without any kid row owning the read, which would break the + // prefix-keep watermark argument node surgery relies on. 
+ const surg: number[] = a.grammar.rules.map(() => -1); + a.grammar.rules.forEach((rule, ri) => { + if (a.prattRules.has(rule.name) || a.leftRecSet.has(rule.name)) return; + let reps = 0; let bad = false; let elem = -1; + const walk = (x: RuleExpr): void => { + if (bad) return; + switch (x.type) { + case 'seq': x.items.forEach(walk); return; + case 'group': + if (x.suppress && x.suppress.length) { bad = true; return; } + walk(x.body); return; + case 'literal': case 'ref': case 'op': case 'prefix': case 'postfix': return; + case 'quantifier': + if (x.kind === '?') { bad = true; return; } + reps++; elem = e.quantRunRuleId(x.body); + return; + default: bad = true; return; + } + }; + walk(rule.body); + if (!bad && reps === 1 && elem >= 0) surg[ri] = elem; + }); + e.emit(`const SURG_ELEM = new Int32Array([${surg.join(',')}]);`); + e.emit(`const RULE_FN_BY_ID = [${a.grammar.rules.map(r => ruleFn(r.name)).join(', ')}];`); } // Non-recursive rule: longest-match over alts (mirrors parseNonRec). A better arm is // committed to the arena IMMEDIATELY (finishNode also truncates scratch back to mark); // a not-better arm's children are dropped by the next arm's scn reset (a beaten // committed candidate stays as an arena hole — the measured 3-5% discard class). -function emitNonRecRule(e: Emitter, a: ReturnType, rule: RuleDecl) { +function emitNonRecRule(e: Emitter, a: ReturnType, rule: RuleDecl, memoized = false) { const ruleFn = `R_${sanitize(rule.name)}`; const rid = a.grammar.rules.indexOf(rule); const alts = rule.body.type === 'alt' ? rule.body.items : [rule.body]; - e.emit(`function ${ruleFn}() {`); + // A memoized (spine) rule splits into the public wrapper (parseRuleEntry owns the + // push+boolean contract and the memo) and an id-returning core, exactly like the + // pratt/left-rec rules. 
+ if (memoized) { + e.emit(`function ${ruleFn}() { return parseRuleEntry(${e.memoIndex(rule.name)}, ${rid}, ${J(rule.name)}, ${ruleFn}_core); }`); + e.emit(`function ${ruleFn}_core(_minBp) {`); + } else { + e.emit(`function ${ruleFn}() {`); + } e.emit(` const saved = pos; const mark = scn;`); e.emit(` let bestId = -1; let bestPos = saved;`); const dispatch = e.altMaskDispatch(alts, '_am'); @@ -1612,8 +1900,13 @@ function emitNonRecRule(e: Emitter, a: ReturnType, rule: RuleDec e.emit(` }`); e.emit(` }`); }); - e.emit(` if (bestId >= 0) { pos = bestPos; scn = mark; scPush(bestId); return true; }`); - e.emit(` pos = saved; scn = mark; return false;`); + if (memoized) { + e.emit(` if (bestId >= 0) { pos = bestPos; scn = mark; return bestId; }`); + e.emit(` pos = saved; scn = mark; return -1;`); + } else { + e.emit(` if (bestId >= 0) { pos = bestPos; scn = mark; scPush(bestId); return true; }`); + e.emit(` pos = saved; scn = mark; return false;`); + } e.emit(`}`); // Arm matchers. alts.forEach((alt, i) => emitArm(e, a, rule.name, i, alt)); @@ -1628,8 +1921,8 @@ function emitLeftRecRule(e: Emitter, a: ReturnType, rule: RuleDe // suppress wrapper in the interpreter — so currentPrattContext is set to this rule // (the template-interpolation rule resolution depends on it: a `${…}` hole inside a // template-literal TYPE must parse as Type, not the default expression rule). - e.emit(`function ${ruleFn}() { return parseRuleEntry(${e.memoIndex(rule.name)}, ${J(rule.name)}, ${ruleFn}_lr); }`); const rid = a.grammar.rules.indexOf(rule); + e.emit(`function ${ruleFn}() { return parseRuleEntry(${e.memoIndex(rule.name)}, ${rid}, ${J(rule.name)}, ${ruleFn}_lr); }`); e.emit(`function ${ruleFn}_lr(_minBp) {`); e.emit(` const saved = pos; const mark = scn;`); e.emit(` let node = -1; let bestAtomPos = saved;`); @@ -1682,7 +1975,7 @@ function emitPrattRule(e: Emitter, a: ReturnType, rule: RuleDecl // R_() wraps parseRule's memo/context handling, then calls the bp-taking core. 
const rid = a.grammar.rules.indexOf(rule); - e.emit(`function ${ruleFn}() { return parseRuleEntry(${e.memoIndex(rule.name)}, ${J(rule.name)}, ${ruleFn}_pratt); }`); + e.emit(`function ${ruleFn}() { return parseRuleEntry(${e.memoIndex(rule.name)}, ${rid}, ${J(rule.name)}, ${ruleFn}_pratt); }`); e.emit(`function ${ruleFn}_pratt(minBp) {`); e.emit(` const saved = pos; const mark = scn;`); e.emit(` let lhs = -1; let bestNudPos = saved;`); @@ -1776,8 +2069,8 @@ function emitPrattRule(e: Emitter, a: ReturnType, rule: RuleDecl e.emit(` if (NOUNARY_T[tkT[pos]] !== 0 && rowCount[lhs] > 0) {`); e.emit(` const _h = kids[rowStart[lhs]];`); e.emit(` if (_h < 0 && ((~_h) & 3) === 2) {`); - e.emit(` const _ht = (~_h) >>> 2;`); - e.emit(` const _htext = ${e.soa ? 'src.slice(tkOff[_ht], tkEnd[_ht])' : 'tkText[_ht]'};`); + e.emit(` const _ht = absTok[lhs] + ((~_h) >>> 2);`); + e.emit(` const _htext = ${e.soa ? 'docText(toff(_ht), tend(_ht))' : 'tkText[_ht]'};`); e.emit(` if (prefixOps.has(_htext) && !postfixOpValues.has(_htext)) { return -1; }`); e.emit(` }`); e.emit(` }`); @@ -1890,29 +2183,92 @@ function emitMixfixLed(e: Emitter, a: ReturnType, fnName: string function emitDriver(e: Emitter, a: ReturnType, entry: string) { e.emit(String.raw` // parseRule for a pratt/left-rec rule: memo + context + suppress, then the core. -// The memo is a pair of per-rule arrays indexed by start pos (lazily sized to the token -// count, undefined-holed): a lookup is two array loads, a store allocates nothing — no -// Map hashing and no {node, end} wrapper per store. The core returns a node ID (or -1); +// The memo is per-rule arrays indexed by start pos (lazily sized to the token count, +// undefined-holed): a lookup is two array loads, a store allocates nothing — no Map +// hashing and no {node, end} wrapper per store. The core returns a node ID (or -1); // this wrapper owns the public arena contract (push the id, return a boolean). 
-function parseRuleEntry(idx, name, core) { +// +// memoExt records each entry's LOOKAHEAD EXTENT — the farthest token index the parse +// may have READ (not merely consumed) — which is what incremental invalidation must +// intersect with an edit's damage window: a PEG parse probes beyond its end (failed +// longer arms, not() lookaheads, SECOND-token dispatch). The extent comes for free +// from the global advance watermark: maxPos at frame exit, +2 covering the stop-token +// and SECOND-token reads past it. Left-to-right parsing keeps the watermark near the +// current frontier, so the value is tight on the dominant flow and only OVER- +// invalidates (soundly) near big-backtrack clusters. +function parseRuleEntry(idx, rid, name, core) { const mySup = suppressNext; suppressNext = null; const capped = parseLimit >= 0; const start = pos; - // Capture the pair together: a '>'-splice inside core() detaches both via fill(undefined), - // and the store below must then write into the DETACHED pair (i.e. be discarded), exactly - // like the old per-rule Map did. + // Capture the arrays together: a '>'-splice inside core() detaches them via + // fill(undefined), and the store below must then write into the DETACHED arrays + // (i.e. be discarded), exactly like the old per-rule Map did. let me = memoEnd[idx]; let mn = memoNode[idx]; - if (!mySup && !capped && me !== undefined) { + let mx = memoExt[idx]; + let mg = memoGen[idx]; + if (!mySup && !capped && me !== undefined && mg[start] === memoGenCur) { const e = me[start]; if (e !== undefined) { pos = e; + // The jump SEMANTICALLY reads everything the stored parse read: keep the advance + // watermark ≥ the entry's watermark, or an ENCLOSING rule that completes right + // after a reused subtree stores a watermark smaller than what its result depends + // on (including the child's own over-probing failed arms), and a later edit in + // the gap keeps the stale entry alive. 
A guaranteed batch no-op: the watermark is + // monotone and was already ≥ this value when the entry was stored. + const ex = mx[start]; + if (ex > maxPos) maxPos = ex; const id = mn[start]; - if (id >= 0) { scPush(id); return true; } + if (id >= 0) { + // refresh the reused root's transient BUILD coordinates to the current stream + // (its green internals are position-independent; only the attachment point — + // what the enclosing finishNode reads — must be current). + absTok[id] = start; + absChar[id] = toff(start); + scPush(id); + return true; + } return false; } } + if (!mySup && !capped && adoptRoot >= 0) { + // map the new position into OLD token coordinates; inside the damage = no mapping + const q = start < adoptDmgStart ? start + : start >= adoptDmgOldEnd + adoptDelta ? start - adoptDelta : -1; + if (q >= 0) { + const aid = adoptSeek(q, rid); + if (aid >= 0) { + pos = start + rowTokLen[aid]; + const ext = start + rowExt[aid]; + if (ext > maxPos) maxPos = ext; + absTok[aid] = start; + absChar[aid] = toff(start); + if (adoptHitP >= 0) { + adoptRunPos = pos; adoptRunRid = rid; adoptRunGen = memoGenCur; + adoptRunP = adoptHitP; adoptRunKid = adoptHitKid + 1; + adoptRunOq = q + rowTokLen[aid]; adoptRunBase = adoptHitBase; + } + if (me === undefined || me.length < tokN + 1) { + me = new Array(tokN + 1); + mn = new Array(tokN + 1); + mx = new Array(tokN + 1); + mg = new Int32Array(tokN + 1); + memoEnd[idx] = me; + memoNode[idx] = mn; + memoExt[idx] = mx; + memoGen[idx] = mg; + } + me[start] = pos; + mn[start] = aid; + mx[start] = maxPos; + mg[start] = memoGenCur; + scPush(aid); + return true; + } + } + } const prevContext = currentPrattContext; currentPrattContext = name; const prevSup = suppressCur; @@ -1925,14 +2281,23 @@ function parseRuleEntry(idx, name, core) { suppressCur = prevSup; } if (!mySup && !capped) { - if (me === undefined) { + if (me === undefined || me.length < tokN + 1) { me = new Array(tokN + 1); mn = new Array(tokN + 1); + mx = new Array(tokN 
+ 1); + mg = new Int32Array(tokN + 1); memoEnd[idx] = me; memoNode[idx] = mn; + memoExt[idx] = mx; + memoGen[idx] = mg; } me[start] = pos; mn[start] = result; + mx[start] = maxPos; + mg[start] = memoGenCur; // the TRUE probe watermark — the +2 read slack (stop token, + // SECOND-token dispatch) is applied at INVALIDATION time + if (result >= 0) rowOK[result] = 1; + } if (result >= 0) { scPush(result); return true; } return false; @@ -1940,7 +2305,7 @@ function parseRuleEntry(idx, name, core) { // Token text at an arbitrary index (cold paths: errors, the tokenAt debug view). function tokTextAt(i) { - return ${e.soa ? 'src.slice(tkOff[i], tkEnd[i])' : 'tkText[i]'}; + return ${e.soa ? 'docText(toff(i), tend(i))' : 'tkText[i]'}; } // The k → type-name inverse, for reconstructing a token object (tokenAt). const K_NAMES = []; @@ -1950,7 +2315,7 @@ export function tokenAt(i) { return { type: K_NAMES[tkK[i]] ?? '', text: tokTextAt(i), - offset: tkOff[i], + offset: toff(i), k: tkK[i], t: tkT[i], newlineBefore: (tkFl[i] & 1) !== 0, @@ -1960,16 +2325,12 @@ export function tokenAt(i) { } // The CST is span-only: a node's text is derived from the source it was parsed from. -export function getText(node, source) { - return source.slice(node.offset, node.end); -} - // ── Arena tree access ── // The arena IS the tree: parse() returns the root node id and consumers traverse // via visit()/the accessors — nothing is materialized on the parse path. All views // are valid until the NEXT parse (the columns are reused). -function leafTokenType(entry) { - const tok = (~entry) >>> 2; +function leafTokenType(entry, tokBase) { + const tok = tokBase + ((~entry) >>> 2); const kind = (~entry) & 3; return kind === 1 ? '$keyword' : kind === 2 ? '$operator' @@ -1978,11 +2339,21 @@ function leafTokenType(entry) { } // Raw arena accessors. An ENTRY is a node id (>= 0) or a leaf (< 0, token-encoded); // offsetOf/endOf/textOf accept either. 
+// GREEN accessors: positions are RELATIVE — a node knows (rel, len) against its +// parent and (tokRel, tokLen) in tokens; consumers descend with (charBase, tokBase) +// — the node's own absolute start coordinates. Leaf spans come from the token +// columns at tokBase + the entry's node-relative token index. export const tree = { ruleNameOf: (id) => RULE_NAMES[rowRule[id]], ruleIdOf: (id) => rowRule[id], - offsetOf: (entry) => entry >= 0 ? rowOff[entry] : tkOff[(~entry) >>> 2], - endOf: (entry) => entry >= 0 ? rowOff[entry] + rowLen[entry] : tkEnd[(~entry) >>> 2], + lenOf: (id) => rowLen[id], + tokLenOf: (id) => rowTokLen[id], + // a node CHILD's relative coordinates live on the parent edge (kids-parallel) + childRelAt: (id, i) => kcr(id, rowStart[id] + i), + childTokRelAt: (id, i) => ktr(id, rowStart[id] + i), + // base-threaded spans: nodes from their bases, leaves from the token columns + offsetOf: (entry, charBase, tokBase) => entry >= 0 ? charBase : toff(tokBase + ((~entry) >>> 2)), + endOf: (entry, charBase, tokBase) => entry >= 0 ? charBase + rowLen[entry] : tend(tokBase + ((~entry) >>> 2)), childCount: (id) => rowCount[id], childAt: (id, i) => kids[rowStart[id] + i], // Bulk child load into a caller-owned array; returns the count. One call per node @@ -1994,45 +2365,42 @@ export const tree = { return n2; }, isLeaf: (entry) => entry < 0, - leafToken: (entry) => (~entry) >>> 2, + leafToken: (entry, tokBase) => tokBase + ((~entry) >>> 2), leafTokenType, // Int-world leaf accessors (the match-path encoding): kind bits — 0 type-derived, // 1 '$keyword', 2 '$operator' — and the token's TYPE kind int (1 = punctuation). leafKindOf: (entry) => (~entry) & 3, - leafTokKindOf: (entry) => tkK[(~entry) >>> 2], - textOf: (entry, source) => entry >= 0 - ? 
source.slice(rowOff[entry], rowOff[entry] + rowLen[entry]) - : source.slice(tkOff[(~entry) >>> 2], tkEnd[(~entry) >>> 2]), + leafTokKindOf: (entry, tokBase) => tkK[tokBase + ((~entry) >>> 2)], + leafOffsetOf: (entry, tokBase) => toff(tokBase + ((~entry) >>> 2)), + leafEndOf: (entry, tokBase) => tend(tokBase + ((~entry) >>> 2)), + textOf: (entry, source, charBase, tokBase) => entry >= 0 + ? source.slice(charBase, charBase + rowLen[entry]) + : source.slice(toff(tokBase + ((~entry) >>> 2)), tend(tokBase + ((~entry) >>> 2))), }; // Depth-first traversal from a node id or leaf entry: // enter(id) — each NODE before its children; return false to skip its subtree // leave(id) — each node after its children // leaf(entry, tok) — each leaf (tok = its token index) -export function visit(entry, fns) { - if (entry < 0) { if (fns.leaf) fns.leaf(entry, (~entry) >>> 2); return; } - if (fns.enter && fns.enter(entry) === false) return; +// Depth-first traversal threading the RED coordinates: enter/leave receive the +// node's absolute (charBase, tokBase); leaf receives its absolute token index. +// Call with the root only — the bases default from the root's rel fields. +function visitCore(entry, fns, charBase, tokBase) { + if (charBase === undefined) { charBase = rootCharBase; tokBase = rootTokBase; } + if (entry < 0) { if (fns.leaf) fns.leaf(entry, tokBase + ((~entry) >>> 2)); return; } + if (fns.enter && fns.enter(entry, charBase, tokBase) === false) return; const n = rowCount[entry]; const cs = rowStart[entry]; - for (let i = 0; i < n; i++) visit(kids[cs + i], fns); - if (fns.leave) fns.leave(entry); -} -// Materialize the classic object CST from a node id — a BRIDGE for tests/debugging -// (the byte-identical gate against the interpreter), not a parse-path product. -export function toObject(id) { - const n = rowCount[id]; - const cs = rowStart[id]; - const children = new Array(n); for (let i = 0; i < n; i++) { - const entry = kids[cs + i]; - children[i] = entry >= 0 ? 
toObject(entry) - : { tokenType: leafTokenType(entry), offset: tkOff[(~entry) >>> 2], end: tkEnd[(~entry) >>> 2] }; + const e = kids[cs + i]; + if (e < 0) { if (fns.leaf) fns.leaf(e, tokBase + ((~e) >>> 2)); } + else visitCore(e, fns, charBase + kcr(entry, cs + i), tokBase + ktr(entry, cs + i)); } - return { rule: RULE_NAMES[rowRule[id]], children, offset: rowOff[id], end: rowOff[id] + rowLen[id] }; + if (fns.leave) fns.leave(entry, charBase, tokBase); } // Parse to the ARENA: returns the root node id. -export function parse(source, entryRule) { -${e.soa ? ` tokenize(source);` : String.raw` src = source; +function lexInto(source) { +${e.soa ? ` tokenize(source);` : String.raw` docPieces = [source]; docPieceOff = [0]; docLen = source.length; docFlat = source; docCur = 0; const _toks = tokenize(source); const _n = _toks.length; while (tkCap < _n + 1) growTok(); @@ -2041,43 +2409,904 @@ ${e.soa ? ` tokenize(source);` : String.raw` src = source; const _t = _toks[_i]; tkK[_i] = _t.k; tkT[_i] = _t.t; tkOff[_i] = _t.offset; tkEnd[_i] = _t.offset + _t.text.length; tkFl[_i] = (_t.newlineBefore ? 1 : 0) | (_t.commentBefore ? 2 : 0) | (_t.multilineFlowBefore ? 4 : 0); + tkDp[_i] = 0; tkPd[_i] = 0; tkText[_i] = _t.text; } tokN = _n;`} +} + +function farthest(errPos) { + if (maxPos <= errPos || maxPos >= tokN) return ''; + return ' [farthest: offset ' + toff(maxPos) + " near '" + tokTextAt(maxPos).slice(0, 20) + "']"; +} + +// Run the entry rule over the CURRENT token stream (shared by parse / parseEdited — +// everything per-parse EXCEPT the memo and the arena cursor, which parseEdited carries). +function runParse(entryRule) { pos = 0; maxPos = 0; - memoNode = new Array(MEMO_RULES); - memoEnd = new Array(MEMO_RULES); parseLimit = -1; cap = tokN; currentPrattContext = null; suppressNext = null; suppressCur = null; - nodeN = 0; - kidN = 0; scn = 0; - const entry = entryRule ?? ENTRY; if (tokN === 0) { const rid = RULE_NAMES.indexOf(entry); - return finishNode(rid < 0 ? 
0 : rid, scn); + const er = finishNode(rid < 0 ? 0 : rid, scn); + rootCharBase = absChar[er]; rootTokBase = absTok[er]; + return er; } if (!RULES[entry]()) { const hasTok = pos < cap; - throw new Error('Parse error at offset ' + (hasTok ? tkOff[pos] : 0) + ': unexpected ' + (hasTok ? "'" + tokTextAt(pos) + "'" : 'end of input') + farthest(pos)); + throw new Error('Parse error at offset ' + (hasTok ? toff(pos) : 0) + ': unexpected ' + (hasTok ? "'" + tokTextAt(pos) + "'" : 'end of input') + farthest(pos)); } if (pos < tokN) { - throw new Error('Parse error at offset ' + tkOff[pos] + ": unexpected '" + tokTextAt(pos) + "' after successful parse" + farthest(pos)); + throw new Error('Parse error at offset ' + toff(pos) + ": unexpected '" + tokTextAt(pos) + "' after successful parse" + farthest(pos)); + } + const rootId = sc[--scn]; + rootCharBase = absChar[rootId]; rootTokBase = absTok[rootId]; + return rootId; +} + +// Source of the last COMPLETED parse — the token columns, arena and memo describe it. +// null whenever the module state is not a coherent snapshot (no parse yet, or the last +// attempt threw), so parseEdited falls back to a full parse. +// Coherent-edit-base flag: false after a rejected attempt (the next edit falls +// back to a full re-parse of the document text). +let lastOk = false; +// Pieces snapshot of the LIVE tree's text (survives a rejected edit): the reject +// path re-lexes it so the handle keeps reading the previous tree. The document +// pieces above advance on EVERY edit, accepted or rejected — the editor's buffer +// applied the change regardless, and later coordinates are against it. 
+let treePieces = null; +// the LAST parse root's absolute coordinates (the descent origin — see visit/toObject) +let rootCharBase = 0; +let rootTokBase = 0; + +// ── M4: old-tree ADOPTION (cursor reuse) ── +// During an incremental re-parse, a rule entry first asks the PREVIOUS tree: is there +// an old node of this rule starting at the corresponding old position whose lookahead +// stayed clear of the damage? Adoption is STATELESS — nothing is consumed, so PEG +// backtracking needs no cursor rollback, and a node refused under one candidate arm +// can be adopted by the next. The memo stays purely intra-parse. +let lastRoot = -1; // previous parse's root id + its absolute first token +let lastRootTok = 0; +let adoptRoot = -1; // previous root id (-1 = no adoption) +let adoptRootTok = 0; // its absolute first token (old coords) +let adoptDmgStart = 0; // damage window in OLD token coords: [adoptDmgStart, adoptDmgOldEnd) +let adoptDmgOldEnd = 0; +let adoptDelta = 0; // new-minus-old token delta past the damage +// cached descent path (top-down): ids + their absolute old token bases +let adoptPath = []; +let adoptBase = []; +// run-extension state: where the last single adoption sat in the old tree (its +// parent row / kid index / parent token base), published by adoptSeek, plus the +// (pos, rid, generation) signature a repetition must present to consume it. 
+let adoptHitP = -1, adoptHitKid = 0, adoptHitBase = 0; +let adoptRunPos = -1, adoptRunRid = -1, adoptRunGen = -1; +let adoptRunP = -1, adoptRunKid = 0, adoptRunOq = 0, adoptRunBase = 0; +function adoptSeek(q, rid) { + // reuse the cached path while it still CONTAINS q (strictly inside, not at start) + let depth = 0; + while (depth < adoptPath.length) { + const id = adoptPath[depth]; + const b = adoptBase[depth]; + if (b < q && q < b + rowTokLen[id]) depth++; + else break; + } + adoptPath.length = depth; + adoptBase.length = depth; + let id, base; + if (depth === 0) { + if (q < adoptRootTok || q >= adoptRootTok + rowTokLen[adoptRoot]) return -1; + id = adoptRoot; base = adoptRootTok; + if (base === q) { /* root itself starts at q — fall through to the chain walk */ } + adoptPath.push(id); adoptBase.push(base); + } else { + id = adoptPath[depth - 1]; base = adoptBase[depth - 1]; + } + // descend: containment steps are committed to the cache; the exploratory chain of + // nodes starting EXACTLY at q is walked in locals (a later seek with another rule + // must see the same chain). + for (;;) { + // binary search the first child whose END exceeds q + const cs = rowStart[id]; + const n = rowCount[id]; + let lo = 0, hi = n; + while (lo < hi) { + const mid = (lo + hi) >> 1; + const e = kids[cs + mid]; + const end = e < 0 ? 
base + ((~e) >>> 2) + 1 : base + ktr(id, cs + mid) + rowTokLen[e]; + if (end <= q) lo = mid + 1; else hi = mid; + } + if (lo >= n) return -1; + const e = kids[cs + lo]; + if (e < 0) return -1; // the position is a leaf here + const cb = base + ktr(id, cs + lo); + if (cb > q) return -1; // a gap — nothing starts at q + if (cb === q) { + // the exploratory chain: every node from here down whose start is exactly q + adoptHitP = id; adoptHitKid = cs + lo; adoptHitBase = base; + let xid = e, xb = cb; + for (;;) { + if (rowOK[xid] !== 0 && rowRule[xid] === rid + && (q + rowExt[xid] + 2 <= adoptDmgStart || q >= adoptDmgOldEnd)) { + return xid; + } + const xcs = rowStart[xid]; + if (rowCount[xid] === 0) return -1; + const fe = kids[xcs]; + if (fe < 0 || ktr(xid, xcs) !== 0) return -1; + adoptHitP = -1; + xid = fe; xb = xb; + } + } + // containment: commit and descend + id = e; base = cb; + adoptPath.push(id); adoptBase.push(base); } - return sc[--scn]; +} +// Run-extension: a repetition whose element was just ADOPTED bulk-adopts the +// following OLD SIBLINGS in one tight loop — whole-statement reuse without +// re-entering parseRuleEntry/adoptSeek once per element. Soundness: each member +// re-passes exactly the single-adoption eligibility (same-rule row, memoized +// [rowOK], contiguous, lookahead clear of the damage), a member's existence +// proves the loop's FIRST-set guard true at its position (its first token starts +// the rule), and the loop's own continuation checks run again after the run +// breaks. Members get no memo entries — a backtracking re-probe just re-adopts. 
+function runExtend(rid) { + if (rid !== adoptRunRid || memoGenCur !== adoptRunGen) { adoptRunPos = -1; return; } + adoptRunPos = -1; + const P = adoptRunP; + const csEnd = rowStart[P] + rowCount[P]; + const pb = adoptRunBase; + let i = adoptRunKid; + let oq = adoptRunOq; + let nq = pos; + const sfx = oq >= adoptDmgOldEnd; // past the damage: monotone, no per-member ext check + let mp = maxPos; + while (i < csEnd) { + const e = kids[i]; + if (e < 0) break; + if (pb + ktr(P, i) !== oq) break; + if (rowRule[e] !== rid || rowOK[e] === 0) break; + const tl = rowTokLen[e]; + if (tl === 0) break; + const ex = rowExt[e]; + if (!sfx && oq + ex + 2 > adoptDmgStart) break; + absTok[e] = nq; absChar[e] = toff(nq); + scPush(e); + const w = nq + ex; + if (w > mp) mp = w; + nq += tl; oq += tl; + i++; + } + if (mp > maxPos) maxPos = mp; + pos = nq; +} + +// ── Node SURGERY: patch the damage path in place ── +// Even with run-adoption, a keystroke inside one statement of a large list rebuilds +// every node on the damage path — the list parent re-collects ALL its kids through +// scratch (and the arena grows by that much per edit). Surgery keeps those rows: +// descend the old tree to the deepest PURE container (SURG_ELEM), re-parse only the +// affected elements with the real rule fn (adoption reuses their undamaged +// subtrees), and when the fresh elements REJOIN an old kid start exactly, splice the +// container's kid range and shift the suffix rels by the edit deltas. Every check +// happens BEFORE any row is mutated; any failure falls back to the full adoption +// re-parse. Prefix kids are kept under the same watermark rule single adoption +// uses, made transitive by rowKC: each kid's probe watermark stays at/below the +// next kid's start, so checking the LAST kept kid bounds them all. 
+let surgX = [], surgBase = [], surgA = [], surgB = []; +// composed change envelope handed from the text-application step to the window relex +let editDmgS = 0, editDmgE = 0; +function rowKCof(id) { + const c = rowKC[id]; + if (c !== 0) return c; + const cs = rowStart[id], n = rowCount[id]; + let ok = 1, prevW = -1; + for (let k = 0; k < n; k++) { + const e = kids[cs + k]; + const st = e < 0 ? (~e) >>> 2 : ktr(id, cs + k); + if (prevW > st) { ok = 2; break; } + prevW = e < 0 ? st + 1 : st + rowExt[e]; + } + rowKC[id] = ok; + return ok; +} +function trySurgery(dmgA, dmgB, tokD, chrD) { + if (adoptRoot < 0) return -1; + // the whole-file token math must close, or the shape changed beyond a splice + if (adoptRootTok + rowTokLen[adoptRoot] + tokD !== tokN) return -1; + // 1. descend along single-affected-row kids, recording the path + surgX.length = 0; surgBase.length = 0; surgA.length = 0; surgB.length = 0; + let X = adoptRoot, base = adoptRootTok; + for (;;) { + const cs = rowStart[X], n = rowCount[X]; + let lo = 0, hi = n; + while (lo < hi) { + const m = (lo + hi) >> 1; + const e = kids[cs + m]; + const st = base + (e < 0 ? (~e) >>> 2 : ktr(X, cs + m)); + if (st < dmgB) lo = m + 1; else hi = m; + } + const b = lo; + let a = b; + while (a > 0) { + const e = kids[cs + a - 1]; + const st = base + (e < 0 ? (~e) >>> 2 : ktr(X, cs + a - 1)); + if (e < 0 ? st < dmgA : st + rowExt[e] + 2 <= dmgA) break; + a--; + } + surgX.push(X); surgBase.push(base); surgA.push(a); surgB.push(b); + if (b - a !== 1) break; + const e = kids[cs + a]; + if (e < 0 || rowCount[e] === 0) break; + base = base + ktr(X, cs + a); + X = e; + } + // 2. 
choose D: the deepest surgical level whose affected kids are all rep rows + let L = -1; + for (let i = surgX.length - 1; i >= 0; i--) { + const Xi = surgX[i]; + const elem = SURG_ELEM[rowRule[Xi]]; + if (elem < 0) continue; + const cs = rowStart[Xi]; + const ai = surgA[i], bi = surgB[i]; + let okR = true; + for (let k = ai; k < bi; k++) { + const e = kids[cs + k]; + if (e < 0 || rowRule[e] !== elem) { okR = false; break; } + } + if (!okR) continue; + if (bi === ai) { + // pure insertion at a kid boundary: it must sit INSIDE the rep zone — at + // least one neighbour is an element row. Otherwise the insertion belongs to + // an enclosing list (e.g. right after this container's closing brace, where + // an element-loop alignment would stitch the new element into a CLOSED node). + const pe = ai > 0 ? kids[cs + ai - 1] : -1; + const ne = ai < rowCount[Xi] ? kids[cs + ai] : -1; + const prevOk = pe >= 0 && rowRule[pe] === elem; + const nextOk = ne >= 0 && rowRule[ne] === elem; + if (!prevOk && !nextOk) continue; + } + if (ai > 0 && rowKCof(Xi) !== 1) continue; + L = i; + break; + } + if (L < 0) return -1; + const D = surgX[L], Dbase = surgBase[L], Da = surgA[L]; + const Db = surgB[L]; + const elem = SURG_ELEM[rowRule[D]]; + const csD = rowStart[D], nD = rowCount[D]; + const DendNew = Dbase + rowTokLen[D] + tokD; + // 3. re-parse the affected span with the real rule (adoption live); the first + // affected kid starts at/before the damage, so old == new coordinates there + pos = Da < Db + ? Dbase + (kids[csD + Da] < 0 ? (~kids[csD + Da]) >>> 2 : ktr(D, csD + Da)) + : dmgA; + maxPos = pos; scn = 0; parseLimit = -1; cap = tokN; + currentPrattContext = null; suppressNext = null; suppressCur = null; + const genAt = memoGenCur; + const fn = RULE_FN_BY_ID[elem]; + let j = Db, guard = 0; + for (;;) { + let target; + if (j < nD) { + const e = kids[csD + j]; + target = Dbase + (e < 0 ? 
(~e) >>> 2 : ktr(D, csD + j)) + tokD; + } else target = DendNew; + if (pos === target) break; + if (pos > target) { + // the fresh parse consumed past old kid j: only a rep row may be subsumed + if (j >= nD) return -1; + const e = kids[csD + j]; + if (e < 0 || rowRule[e] !== elem) return -1; + j++; + continue; + } + if (++guard > 65536) return -1; + const pp = pos; + if (!fn()) return -1; + if (memoGenCur !== genAt || pos === pp) return -1; + } + // 4. POINT OF NO RETURN — splice D's kid range, shift suffix rels, patch the path + const f = scn; + const removed = j - Da; + const DcharBase = toff(Dbase); + let csD2 = csD; + if (f === removed) { + for (let k = 0; k < f; k++) { + const id = sc[k]; + kids[csD + Da + k] = id; + kidTokRel[csD + Da + k] = absTok[id] - Dbase; + kidRel[csD + Da + k] = absChar[id] - DcharBase; + } + } else { + const n2k = nD - removed + f; + if (kidN + n2k > kidCap) growKids(n2k); + const ks = kidN; + for (let k = 0; k < Da; k++) { + kids[ks + k] = kids[csD + k]; + kidRel[ks + k] = kidRel[csD + k]; + kidTokRel[ks + k] = kidTokRel[csD + k]; + } + for (let k = 0; k < f; k++) { + const id = sc[k]; + kids[ks + Da + k] = id; + kidTokRel[ks + Da + k] = absTok[id] - Dbase; + kidRel[ks + Da + k] = absChar[id] - DcharBase; + } + for (let k = j; k < nD; k++) { + kids[ks + Da + f + (k - j)] = kids[csD + k]; + kidRel[ks + Da + f + (k - j)] = kidRel[csD + k]; + kidTokRel[ks + Da + f + (k - j)] = kidTokRel[csD + k]; + } + kidN = ks + n2k; + rowStart[D] = ks; + rowCount[D] = n2k; + // remap the end-relative boundary into the relocated range (suffix kids kept + // their sign-encoded values; indices shifted by the move + the count change). + // Three cases keep it Int32-safe: no negatives among the copied kids (the + // sentinel maps to itself, NOT through the index arithmetic), all possibly + // negative, or a boundary inside the copied range. + const nfOld = rowNF[D]; + rowNF[D] = nfOld >= csD + nD ? 0x7fffffff + : nfOld <= csD + j ? 
ks + Da + f + : (nfOld - csD - j) + ks + Da + f; + csD2 = ks; + } + const n2 = rowCount[D]; + // End-relative band maintenance (old lengths — the bias cancels against the new + // ones exactly like the token-level flip): rows entering the suffix flip to + // end-relative; rows leaving it flip back to absolute rels. Rows already beyond + // the old boundary auto-shift via the length update below. Leaf kids cannot be + // sign-encoded (packed): inside the flip-up band they are re-packed eagerly, and + // the trailing run (a pure container's only leaves past the rep) gets the same + // eager shift by the backward walk. + const bnd = csD2 + Da + f; + const nf = rowNF[D]; + const kidsEnd = csD2 + n2; + if (nf < bnd) { + for (let k = nf; k < bnd; k++) { + const v = kidTokRel[k]; + if (v < 0) { kidTokRel[k] = v + rowTokLen[D] + 1; kidRel[k] += rowLen[D] + 1; } + } + } else if (nf > bnd) { + const hi = nf < kidsEnd ? nf : kidsEnd; + for (let k = bnd; k < hi; k++) { + const e = kids[k]; + if (e < 0) { if (tokD !== 0) kids[k] = ~(((((~e) >>> 2) + tokD) << 2) | ((~e) & 3)); } + else { + const v = kidTokRel[k]; + if (v >= 0) { kidTokRel[k] = v - rowTokLen[D] - 1; kidRel[k] -= rowLen[D] + 1; } + } + } + } + if (tokD !== 0) { + const tlFrom = nf > bnd ? (nf < kidsEnd ? nf : kidsEnd) : bnd; + for (let k = kidsEnd - 1; k >= tlFrom; k--) { + const e = kids[k]; + if (e >= 0) break; + kids[k] = ~(((((~e) >>> 2) + tokD) << 2) | ((~e) & 3)); + } + } + rowNF[D] = bnd; + rowTokLen[D] += tokD; + // Derive the char length from the token columns rather than adding chrD: a pure- + // trivia edit can sit at a node's token BOUNDARY (between its last token and the + // next sibling's first), token-inside but char-outside — the gap belongs to no + // node. tend/toff give the exact new span; when suffix tokens exist inside the + // node the delta equals chrD (so the suffix-kid rel adds and the end-relative + // bias-cancel stay consistent), and when they don't there are no suffix kids. 
+ if (rowTokLen[D] > 0) rowLen[D] = tend(Dbase + rowTokLen[D] - 1) - toff(Dbase); + { + let x = rowExt[D] + (tokD > 0 ? tokD : 0); + const fw = maxPos - Dbase; + if (fw > x) x = fw; + rowExt[D] = x; + } + // containment bit: only the pairs around the splice changed + if (rowKC[D] === 1) { + let okB = 1; + const from = Da > 0 ? Da - 1 : 0; + for (let k = from; k < Da + f && k + 1 < n2; k++) { + const e = kids[csD2 + k]; + const w = e < 0 ? ((~e) >>> 2) + 1 : ktr(D, csD2 + k) + rowExt[e]; + const e2 = kids[csD2 + k + 1]; + const st2 = e2 < 0 ? (~e2) >>> 2 : ktr(D, csD2 + k + 1); + if (w > st2) { okB = 2; break; } + } + rowKC[D] = okB; + } + // 5. ancestors bottom-up: lengths, suffix rels, ext, containment boundary pair + for (let i = L - 1; i >= 0; i--) { + const Ai = surgX[i]; + const csA = rowStart[Ai], nA = rowCount[Ai]; + const ki = surgA[i]; + // kids at/before the path kid are NOT suffix for this edit (the damage sits + // inside the path kid): any end-relative rel there must flip back to absolute + // with the OLD lengths, or the length update below would shift it + const nfA = rowNF[Ai]; + if (nfA <= csA + ki) { + for (let k = nfA; k <= csA + ki; k++) { + const v = kidTokRel[k]; + if (v < 0) { kidTokRel[k] = v + rowTokLen[Ai] + 1; kidRel[k] += rowLen[Ai] + 1; } + } + rowNF[Ai] = csA + ki + 1; + } + // Suffix kids: a PURE-container ancestor (interior = element rows only, leaves + // only as a trailing run) gets the same end-relative band as D — without it, a + // deep edit under a giant flat list pays an O(suffix) eager walk per keystroke + // (measured: 0.6ms median on the 9MB body as ancestor). Mixed-content ancestors + // (interleaved leaves can't sign-encode inside the packed entry) keep the eager + // walk; their kid counts are the grammar's non-list shapes. + if (SURG_ELEM[rowRule[Ai]] >= 0) { + const bndA = csA + ki + 1; + const nfA2 = rowNF[Ai]; + const kidsEndA = csA + nA; + if (nfA2 > bndA) { + const hi = nfA2 < kidsEndA ? 
nfA2 : kidsEndA; + for (let k = bndA; k < hi; k++) { + const e = kids[k]; + if (e < 0) { if (tokD !== 0) kids[k] = ~(((((~e) >>> 2) + tokD) << 2) | ((~e) & 3)); } + else { + const v = kidTokRel[k]; + if (v >= 0) { kidTokRel[k] = v - rowTokLen[Ai] - 1; kidRel[k] -= rowLen[Ai] + 1; } + } + } + } + if (tokD !== 0) { + const tlFrom = nfA2 > bndA ? (nfA2 < kidsEndA ? nfA2 : kidsEndA) : bndA; + for (let k = kidsEndA - 1; k >= tlFrom; k--) { + const e = kids[k]; + if (e >= 0) break; + kids[k] = ~(((((~e) >>> 2) + tokD) << 2) | ((~e) & 3)); + } + } + rowNF[Ai] = bndA; + } else { + for (let k = ki + 1; k < nA; k++) { + const e = kids[csA + k]; + if (e < 0) kids[csA + k] = ~(((((~e) >>> 2) + tokD) << 2) | ((~e) & 3)); + else if (kidTokRel[csA + k] >= 0) { kidTokRel[csA + k] += tokD; kidRel[csA + k] += chrD; } + // (end-relative kids past the boundary auto-shift via the length update below) + } + } + rowTokLen[Ai] += tokD; + if (rowTokLen[Ai] > 0) rowLen[Ai] = tend(surgBase[i] + rowTokLen[Ai] - 1) - toff(surgBase[i]); + { + let x = rowExt[Ai] + (tokD > 0 ? tokD : 0); + const cw = ktr(Ai, csA + ki) + rowExt[surgX[i + 1]]; + if (cw > x) x = cw; + rowExt[Ai] = x; + } + if (rowKC[Ai] === 1 && ki + 1 < nA) { + const e2 = kids[csA + ki + 1]; + const st2 = e2 < 0 ? (~e2) >>> 2 : ktr(Ai, csA + ki + 1); + if (ktr(Ai, csA + ki) + rowExt[surgX[i + 1]] > st2) rowKC[Ai] = 2; + } + } + return adoptRoot; +} - function farthest(errPos) { - if (maxPos <= errPos || maxPos >= tokN) return ''; - return ' [farthest: offset ' + tkOff[maxPos] + " near '" + tokTextAt(maxPos).slice(0, 20) + "']"; +// The spare token-column buffer set (parseEdited ping-pongs between the live set and +// this one, so steady-state edits never allocate columns). 
+let altK = null, altT = null, altOff = null, altEnd = null, altFl = null, altDp = null, altPd = null; +let altCap = 0; +let altN = 0; // old-stream token count while a window lex runs (lexCore's resync bound) + +// ── Documents: the per-document state set behind the handle API ── +// The module-level variables above are the ACTIVE REGISTER SET — the hot paths +// never indirect through an object. A document object stores the same field set; +// activate() lazily swaps: the active doc's object may be stale while the module +// variables are the truth, and is written back only when another doc activates. +// Per-PARSE transients (pos/maxPos/scratch/adopt*/surg*) reset on every entry and +// are shared safely. NOTE(review): the fallback lexer's tkText/altText are not saved per document — confirm that is intended. +function makeDoc() { + return { + tkK: new tkK.constructor(4096), tkT: new tkT.constructor(4096), + tkOff: new Int32Array(4096), tkEnd: new Int32Array(4096), tkFl: new Uint8Array(4096), + tkDp: new Uint8Array(4096), tkPd: new Uint16Array(4096), + tkCap: 4096, tokN: 0, srcLenP1: 1, negFrom: 0x7fffffff, + rowRule: new Uint16Array(8192), rowLen: new Int32Array(8192), rowTokLen: new Int32Array(8192), + rowStart: new Int32Array(8192), rowCount: new Int32Array(8192), rowExt: new Int32Array(8192), + rowOK: new Uint8Array(8192), rowKC: new Uint8Array(8192), + rowNF: new Int32Array(8192).fill(0x7fffffff), + absChar: new Int32Array(8192), absTok: new Int32Array(8192), + rowCap: 8192, nodeN: 0, + kids: new Int32Array(16384), kidRel: new Int32Array(16384), kidTokRel: new Int32Array(16384), + kidCap: 16384, kidN: 0, + memoNode: [], memoEnd: [], memoExt: [], memoGen: [], memoGenCur: 0, + lastOk: false, treePieces: null, + docPieces: null, docPieceOff: null, docLen: 0, docFlat: null, docCur: 0, + rootCharBase: 0, rootTokBase: 0, lastRoot: -1, lastRootTok: 0, +${e.soa ? 
' parenCachePos: -1, parenCacheStack: [],' : ''} + altK: null, altT: null, altOff: null, altEnd: null, altFl: null, altDp: null, altPd: null, + altCap: 0, altN: 0, + }; +} +function saveDoc(d) { + d.tkK = tkK; d.tkT = tkT; d.tkOff = tkOff; d.tkEnd = tkEnd; d.tkFl = tkFl; + d.tkDp = tkDp; d.tkPd = tkPd; d.tkCap = tkCap; d.tokN = tokN; + d.srcLenP1 = srcLenP1; d.negFrom = negFrom; + d.rowRule = rowRule; d.rowLen = rowLen; d.rowTokLen = rowTokLen; d.rowStart = rowStart; + d.rowCount = rowCount; d.rowExt = rowExt; d.rowOK = rowOK; d.rowKC = rowKC; d.rowNF = rowNF; + d.absChar = absChar; d.absTok = absTok; d.rowCap = rowCap; d.nodeN = nodeN; + d.kids = kids; d.kidRel = kidRel; d.kidTokRel = kidTokRel; d.kidCap = kidCap; d.kidN = kidN; + d.memoNode = memoNode; d.memoEnd = memoEnd; d.memoExt = memoExt; d.memoGen = memoGen; + d.memoGenCur = memoGenCur; + d.lastOk = lastOk; d.treePieces = treePieces; + d.docPieces = docPieces; d.docPieceOff = docPieceOff; d.docLen = docLen; d.docFlat = docFlat; d.docCur = docCur; + d.rootCharBase = rootCharBase; d.rootTokBase = rootTokBase; + d.lastRoot = lastRoot; d.lastRootTok = lastRootTok; +${e.soa ? 
' d.parenCachePos = parenCachePos; d.parenCacheStack = parenCacheStack;' : ''} + d.altK = altK; d.altT = altT; d.altOff = altOff; d.altEnd = altEnd; d.altFl = altFl; + d.altDp = altDp; d.altPd = altPd; d.altCap = altCap; d.altN = altN; +} +function loadDoc(d) { + tkK = d.tkK; tkT = d.tkT; tkOff = d.tkOff; tkEnd = d.tkEnd; tkFl = d.tkFl; + tkDp = d.tkDp; tkPd = d.tkPd; tkCap = d.tkCap; tokN = d.tokN; + srcLenP1 = d.srcLenP1; negFrom = d.negFrom; + rowRule = d.rowRule; rowLen = d.rowLen; rowTokLen = d.rowTokLen; rowStart = d.rowStart; + rowCount = d.rowCount; rowExt = d.rowExt; rowOK = d.rowOK; rowKC = d.rowKC; rowNF = d.rowNF; + absChar = d.absChar; absTok = d.absTok; rowCap = d.rowCap; nodeN = d.nodeN; + kids = d.kids; kidRel = d.kidRel; kidTokRel = d.kidTokRel; kidCap = d.kidCap; kidN = d.kidN; + memoNode = d.memoNode; memoEnd = d.memoEnd; memoExt = d.memoExt; memoGen = d.memoGen; + memoGenCur = d.memoGenCur; + lastOk = d.lastOk; treePieces = d.treePieces; + docPieces = d.docPieces; docPieceOff = d.docPieceOff; docLen = d.docLen; docFlat = d.docFlat; docCur = d.docCur; + rootCharBase = d.rootCharBase; rootTokBase = d.rootTokBase; + lastRoot = d.lastRoot; lastRootTok = d.lastRootTok; +${e.soa ? ' parenCachePos = d.parenCachePos; parenCacheStack = d.parenCacheStack;' : ''} + altK = d.altK; altT = d.altT; altOff = d.altOff; altEnd = d.altEnd; altFl = d.altFl; + altDp = d.altDp; altPd = d.altPd; altCap = d.altCap; altN = d.altN; +} +const docDefault = makeDoc(); +let curDoc = docDefault; +loadDoc(docDefault); +function activate(d) { + if (d === curDoc) return; + saveDoc(curDoc); + loadDoc(d); + curDoc = d; +} +function swapBuffers() { + let x; + x = tkK; tkK = altK; altK = x; + x = tkT; tkT = altT; altT = x; + x = tkOff; tkOff = altOff; altOff = x; + x = tkEnd; tkEnd = altEnd; altEnd = x; + x = tkFl; tkFl = altFl; altFl = x; + x = tkDp; tkDp = altDp; altDp = x; + x = tkPd; tkPd = altPd; altPd = x; + x = tkCap; tkCap = altCap; altCap = x; +} +${e.soa ? 
'' : 'let altText = [];'} + +function parseCore(source, entryRule) { + lastOk = false; + adoptRoot = -1; + adoptRunPos = -1; + lexInto(source); + if (memoEnd.length !== MEMO_RULES) { + memoNode = new Array(MEMO_RULES); + memoEnd = new Array(MEMO_RULES); + memoExt = new Array(MEMO_RULES); + memoGen = new Array(MEMO_RULES); + } + memoGenCur++; + nodeN = 0; + kidN = 0; + const root = runParse(entryRule); + lastRoot = root; + lastRootTok = rootTokBase; + lastOk = true; + treePieces = docPieces.slice(); + return root; +} + +// ── Incremental re-parse ── +// Edit protocol: the caller hands the CHANGES ([{ start, end, text }], each in the +// coordinates of the document after the preceding ones); the engine builds the new +// text itself and composes the damage envelope from them. Token damage is localized: +// - emitted-lexer grammars re-lex only a WINDOW from a safe restart anchor until the +// stream resyncs with the old tokens, then splice it into the token columns; +// - fallback-lexer grammars re-lex the full file and diff the old and new token +// columns (longest identical prefix; longest suffix modulo the character delta). +// No memo is carried: reuse flows through node surgery, else old-tree adoption. +// The old arena is re-based in place (rows starting at/after the suffix shift by the +// char delta; reused leaf entries by the token delta; rows STARTING inside the damage +// are unreachable garbage — their values no longer matter), and new rows append after +// the old ones. A full parse() compacts (resets the arena); long edit sessions grow +// until then. A rejected edit clears lastOk, so the next edit re-parses the whole +// document in append mode (the live tree's rows stay readable meanwhile). 
+function editCore(entryRule, edits) { + try { + return editCoreRun(entryRule, edits); + } catch (e) { + // REJECTED edit: the splice (and any '>' splits of the failed attempt) already + // rewrote the token columns to the rejected text, and the append-mode fallback + // may have grown the arena — but the live tree's ROWS are untouched. Re-lexing + // the live tree's source restores every read path (leaf spans, visit, next + // edit's restart anchors); O(n) on the reject path only. + if (treePieces !== null) { + // restore the token columns to the LIVE TREE's text — but the DOCUMENT text + // must stay on the rejected content (lexInto/tokenize resets the doc layer + // as a side effect, so save it around the re-lex) + const kP = docPieces, kO = docPieceOff, kL = docLen, kF = docFlat; + lexInto(treePieces.join('')); + docPieces = kP; docPieceOff = kO; docLen = kL; docFlat = kF; docCur = 0; + lastOk = false; + } + throw e; } } +function editCoreRun(entryRule, edits) { + if (edits === undefined || edits.length === 0) { + throw new Error('edit() requires the changes: [{ start, end, text }] (LSP-style - each edit in the coordinates of the document AFTER the preceding edits in the array)'); + } + // The engine owns the document text: the new source is BUILT from the changes, + // so "the ranges do not match the text" is unrepresentable. Each edit is applied + // sequentially (LSP incremental-sync semantics); the damage envelope is composed + // alongside: dS in prefix coordinates (identical old/new), dE in FINAL + // coordinates, the old end recovered through the total delta. V8 cons strings + // make the slice+concat construction cheap; the flat-string cost, where a read + // path needs one, is the same the caller would have paid building the text. 
+ if (docPieces === null) throw new Error('edit() before parse(): no document'); + const oldLen = docLen; + { + let dS = 0x7fffffff; + let dE = -1; + for (let i = 0; i < edits.length; i++) { + const ed = edits[i]; + const start = ed.start, end = ed.end, text = ed.text; + if (!(start >= 0 && start <= end && end <= docLen) || typeof text !== 'string') { + throw new Error('edit() change #' + i + ' out of range: [' + start + ', ' + end + ') of ' + docLen); + } + applyChange(start, end, text); + const newEnd = start + text.length; + const delta = newEnd - end; + if (dE > start) dE = dE >= end ? dE + delta : newEnd; + if (newEnd > dE) dE = newEnd; + if (start < dS) dS = start; + } + editDmgS = dS; + editDmgE = dE; + } + if (!lastOk) { + // No coherent edit base (a previous attempt rejected): full re-parse in APPEND + // mode — parseCore would reset the arena and destroy the live tree the handle + // still exposes if THIS parse rejects too. parse() is the only compaction point. + const whole = flattenDoc(); + lexInto(whole); + if (memoEnd.length !== MEMO_RULES) { + memoNode = new Array(MEMO_RULES); + memoEnd = new Array(MEMO_RULES); + memoExt = new Array(MEMO_RULES); + memoGen = new Array(MEMO_RULES); + } + memoGenCur++; + adoptRoot = -1; + adoptRunPos = -1; + const root = runParse(entryRule); + lastRoot = root; + lastRootTok = rootTokBase; + lastOk = true; + treePieces = docPieces.slice(); + return root; + } + lastOk = false; +${e.soa ? String.raw` // ── M1: WINDOWED re-lex ── + // Damage envelope from the composed changes: prefix coordinates are shared, the + // old end comes back through the total delta. + const newLen = docLen; + const cs = editDmgS < newLen ? editDmgS : newLen; + const ceNew = editDmgE < cs ? 
cs : editDmgE; + const ceOld = ceNew - (newLen - oldLen); + const charDelta = newLen - oldLen; + // Restart anchor: the last token B ending at/before the damage whose recorded + // depths are zero and whose shape carries no cross-token lexer flag (')' control- + // head, postfix-ambiguous op). B = -1 restarts at the file head — always sound. + const B = findRestart(cs); + const initParens = reconstructParensCached(B); + const oN = tokN; + // first old token at/after the damage end — the resync search floor + let r0 = oN; + { let lo = 0, hi = oN; + while (lo < hi) { const mid = (lo + hi) >> 1; if (toff(mid) < ceOld) lo = mid + 1; else hi = mid; } + r0 = lo; } + // Lex the window into the spare buffers (the old stream stays live for resync). + if (altK === null || altCap < tkCap) { + altK = new tkK.constructor(tkCap); altT = new tkT.constructor(tkCap); + altOff = new Int32Array(tkCap); altEnd = new Int32Array(tkCap); altFl = new Uint8Array(tkCap); + altDp = new Uint8Array(tkCap); altPd = new Uint16Array(tkCap); + altCap = tkCap; + } + altN = oN; + swapBuffers(); // live = scratch, alt = OLD stream + tokN = 0; + const startOff = B >= 0 ? (altEnd[B] < 0 ? altEnd[B] + srcLenP1 : altEnd[B]) : 0; + // Window-materialized relex: lexCore reads a SMALL flat slice of the pieces with + // an absolute bias; -2 = ran off the window end before resyncing — re-materialize + // a larger window and retry (the common case fits the first one). + let R0; + { + let wHi = ceNew + 4096; + for (;;) { + if (wHi > docLen) wHi = docLen; + const windowStr = docText(startOff, wHi); + tokN = 0; + try { + R0 = lexCore(windowStr, 0, B >= 0 ? altK[B] : -1, B >= 0 ? altT[B] : 0, r0, ceNew, charDelta, cs, initParens.slice(), startOff, wHi < docLen); + } catch (e2) { + if (e2 !== LEX_RETRY) throw e2; + R0 = -2; + } + if (R0 !== -2) break; + wHi = wHi >= docLen ? docLen : (wHi - startOff) * 4 + startOff; + } + } + const W = tokN; + const R = R0 >= 0 ? 
R0 : oN; + swapBuffers(); // live = OLD stream again; window sits in the alt buffers + tokN = oN; + // EOF-relative maintenance: move the negative-zone boundary to THIS edit's suffix + // start R. Tokens dropping out of the suffix ([negFrom, R)) flip back to absolute + // (they sit at/before the damage now — EOF-unstable); tokens entering it + // ([R, negFrom)) flip to EOF-relative, encoded against the OLD length (their new + // absolute is oldValue + charDelta, and newLen = oldLen + charDelta, so the bias + // cancels). Both bands are cursor-locality sized; the suffix itself is never + // walked again — updating srcLenP1 after the splice IS the char-delta shift the + // old O(suffix) add-loop used to apply. + if (negFrom < R) { + for (let i = negFrom, e2 = R < oN ? R : oN; i < e2; i++) { + const o = tkOff[i]; if (o < 0) tkOff[i] = o + srcLenP1; + const en = tkEnd[i]; if (en < 0) tkEnd[i] = en + srcLenP1; + } + } else if (negFrom > R) { + for (let i = R, e2 = negFrom < oN ? negFrom : oN; i < e2; i++) { + const o = tkOff[i]; if (o >= 0) tkOff[i] = o - srcLenP1; + const en = tkEnd[i]; if (en >= 0) tkEnd[i] = en - srcLenP1; + } + } + // TRUE token prefix p: the window re-derives [B+1 .. p) byte-identically; only past + // p is real damage (compared BEFORE the splice clobbers the old slots). 
+ let p = B + 1; + { let i = 0; + while (i < W && p < R && altK[i] === tkK[p] && altT[i] === tkT[p] && altOff[i] === tkOff[p] + && altEnd[i] === tkEnd[p] && altFl[i] === tkFl[p]) { i++; p++; } + } + const dOldEnd = R; + const tokenDelta = (B + 1 + W) - R; + // ── splice: old[0..B] + window[0..W) + old[R..oN), then shift the suffix spans ── + const nN = B + 1 + W + (oN - R); + while (tkCap < nN + 1) growTok(); + if (R !== B + 1 + W) { + tkK.copyWithin(B + 1 + W, R, oN); tkT.copyWithin(B + 1 + W, R, oN); + tkOff.copyWithin(B + 1 + W, R, oN); tkEnd.copyWithin(B + 1 + W, R, oN); + tkFl.copyWithin(B + 1 + W, R, oN); tkDp.copyWithin(B + 1 + W, R, oN); tkPd.copyWithin(B + 1 + W, R, oN); + } + if (W > 0) { + tkK.set(altK.subarray(0, W), B + 1); tkT.set(altT.subarray(0, W), B + 1); + tkOff.set(altOff.subarray(0, W), B + 1); tkEnd.set(altEnd.subarray(0, W), B + 1); + tkFl.set(altFl.subarray(0, W), B + 1); tkDp.set(altDp.subarray(0, W), B + 1); tkPd.set(altPd.subarray(0, W), B + 1); + } + negFrom = B + 1 + W; + srcLenP1 = newLen + 1; + tokN = nN; + const nN2 = nN;` : String.raw` // (fallback-lexer grammars keep the full-relex + token-diff path) + const oK = tkK, oT = tkT, oOff = tkOff, oEnd = tkEnd, oFl = tkFl, oN = tokN; + const oText = tkText; + if (altK === null || altK.length !== tkCap) { + altK = new tkK.constructor(tkCap); altT = new tkT.constructor(tkCap); + altOff = new Int32Array(tkCap); altEnd = new Int32Array(tkCap); altFl = new Uint8Array(tkCap); + altDp = new Uint8Array(tkCap); altPd = new Uint16Array(tkCap); + } + tkK = altK; tkT = altT; tkOff = altOff; tkEnd = altEnd; tkFl = altFl; + { const _d = tkDp; tkDp = altDp; altDp = _d; const _q = tkPd; tkPd = altPd; altPd = _q; } + tkText = altText; tkText.length = 0; + altK = oK; altT = oT; altOff = oOff; altEnd = oEnd; altFl = oFl; + altText = oText; + lexInto(flattenDoc()); + const nN = tokN; + const charDelta = docLen - oldLen; + const minN = oN < nN ? 
oN : nN; + let p = 0; + while (p < minN && oK[p] === tkK[p] && oT[p] === tkT[p] && oFl[p] === tkFl[p] + && oOff[p] === tkOff[p] && oEnd[p] === tkEnd[p] && oText[p] === tkText[p]) p++; + let s = 0; + while (s < minN - p) { + const i = oN - 1 - s, j = nN - 1 - s; + if (oK[i] === tkK[j] && oT[i] === tkT[j] && oFl[i] === tkFl[j] + && oOff[i] + charDelta === tkOff[j] && oEnd[i] + charDelta === tkEnd[j] && oText[i] === tkText[j]) s++; + else break; + } + const dOldEnd = oN - s; + const tokenDelta = nN - oN; + const nN2 = nN;`} + // M4: NO memo carry — the memo is intra-parse; reuse flows through old-tree + // adoption (parseRuleEntry consults the previous root via adoptSeek), so the whole + // O(rules × n) carry/invalidate machinery is gone. + if (memoEnd.length !== MEMO_RULES) { + memoNode = new Array(MEMO_RULES); + memoEnd = new Array(MEMO_RULES); + memoExt = new Array(MEMO_RULES); + memoGen = new Array(MEMO_RULES); + } + memoGenCur++; + adoptRoot = lastRoot; + adoptRootTok = lastRootTok; + adoptDmgStart = p; + adoptDmgOldEnd = dOldEnd; + adoptDelta = tokenDelta; + adoptPath.length = 0; + adoptBase.length = 0; + adoptRunPos = -1; + const sroot = trySurgery(p, dOldEnd, tokenDelta, charDelta); + if (sroot >= 0) { + adoptRoot = -1; + rootCharBase = toff(adoptRootTok); + rootTokBase = adoptRootTok; + lastRoot = sroot; + lastRootTok = adoptRootTok; + lastOk = true; + treePieces = docPieces.slice(); + return sroot; + } + const root = runParse(entryRule); + adoptRoot = -1; + lastRoot = root; + lastRootTok = rootTokBase; + lastOk = true; + treePieces = docPieces.slice(); + return root; +} export { tokenize }; -export function createParser() { return { parse, tree, visit, toObject, tokenize }; } +// ── Module-level API: the DEFAULT document (one shared session; tokenize and the +// raw tree/tokenAt views read the ACTIVE doc — they are gate/debug surfaces) ── +export function parse(source, entryRule) { activate(docDefault); return parseCore(source, entryRule); } +export function 
parseEdited(entryRule, edits) { activate(docDefault); return editCore(entryRule, edits); } +export function visit(entry, fns, charBase, tokBase) { activate(docDefault); return visitCore(entry, fns, charBase, tokBase); } +// ── Handle API: explicit trees over per-instance documents ── +// const p = createParser(); const cst = p.parse(text); p.edit(cst, [{ start, end, text }]); +// The handle is the STABLE IDENTITY of this document's tree: edit() mutates it in +// place (node surgery) and returns nothing — a return value would read as a clone, +// and there is none. A REJECTED edit (parse error) throws and leaves the handle on +// the previous tree; the next edit falls back to a full re-parse internally. Only +// parse() re-opening the document invalidates old handles (they throw). +export function createParser() { + const d = makeDoc(); + let gen = 0; + let entryUsed; + const chk = (cst) => { + if (cst === null || cst === undefined || cst.d !== d) throw new Error('foreign tree handle: it belongs to another parser instance'); + if (cst.gen !== gen) throw new Error('stale tree handle: parse() re-opened this document - use the handle from the latest parse()'); + }; + const view = {}; + for (const k of Object.keys(tree)) { + const f = tree[k]; + view[k] = (a, b) => { activate(d); return f(a, b); }; + } + return { + parse(source, entryRule) { + activate(d); + entryUsed = entryRule; + gen++; // re-opening resets the arena: old handles die even if THIS parse rejects + const root = parseCore(source, entryRule); + return { d, gen, root }; + }, + edit(cst, edits) { + chk(cst); + activate(d); + cst.root = editCore(entryUsed, edits); + }, + visit(cst, fns) { chk(cst); activate(d); return visitCore(cst.root, fns); }, + tree: view, + }; +} `); } diff --git a/src/gen-cst-match.ts b/src/gen-cst-match.ts index 4e8fa91..daa50ff 100644 --- a/src/gen-cst-match.ts +++ b/src/gen-cst-match.ts @@ -321,7 +321,7 @@ export function generateCstMatch(grammar: CstGrammar, importFrom: string): strin 
return c.field === c.name ? c.name : `${c.field}: ${c.name}`; }); w(` return { arm: ${J(plan.name)}${fields.length ? ', ' + fields.join(', ') : ''} };`); - emit(`function ${fn}(t: TreeAccess, n: number, cc: number, src: string): ${matchTypeName(rule.name)} | null {`); + emit(`function ${fn}(t: TreeAccess, n: number, cc: number, tb: number, src: string): ${matchTypeName(rule.name)} | null {`); for (const line of body) emit(line); emit(`}`); return fn; @@ -333,7 +333,7 @@ export function generateCstMatch(grammar: CstGrammar, importFrom: string): strin } function litCond(text: string, tt: string): string { - return `__lit(t, cc, i, src, ${J(text)}, ${tt === '$keyword' ? 1 : 0})`; + return `__lit(t, cc, tb, i, src, ${J(text)}, ${tt === '$keyword' ? 1 : 0})`; } function renderStep(st: Step, w: (s: string) => void, ind: string, fail: () => string): void { @@ -346,7 +346,7 @@ export function generateCstMatch(grammar: CstGrammar, importFrom: string): strin case 'litAlt': { const conds = st.texts.map((t, k) => litCond(t, st.tt[k])); w(`${ind}if (!(${conds.join(' || ')})) ${fail()}`); - if (st.cap) assign(st.cap, `src.slice(t.offsetOf(__SC[i]), t.endOf(__SC[i])) as ${st.cap.tsType}`, w, ind); + if (st.cap) assign(st.cap, `src.slice(t.leafOffsetOf(__SC[i], tb), t.leafEndOf(__SC[i], tb)) as ${st.cap.tsType}`, w, ind); w(`${ind}i++;`); return; } @@ -354,8 +354,8 @@ export function generateCstMatch(grammar: CstGrammar, importFrom: string): strin const cond = st.name === '$operator' ? `__opTok(t, cc, i)` : st.template - ? `__tok(t, cc, i, ${typeKind.get(st.name)}) || __nodeOf(t, cc, i, ${ruleId.get('$template')})` - : `__tok(t, cc, i, ${typeKind.get(st.name)})`; + ? 
`__tok(t, cc, tb, i, ${typeKind.get(st.name)}) || __nodeOf(t, cc, i, ${ruleId.get('$template')})` + : `__tok(t, cc, tb, i, ${typeKind.get(st.name)})`; w(`${ind}if (!(${cond})) ${fail()}`); if (st.cap) assign(st.cap, `__SC[i] as ${st.cap.tsType}`, w, ind); w(`${ind}i++;`); @@ -564,7 +564,7 @@ export function generateCstMatch(grammar: CstGrammar, importFrom: string): strin // ("always") arms appear in every bucket at their declaration position; the buckets // are superset filters (each arm fn re-checks exactly). const admits = plans.map(p => firstAdmit(p.steps)); - const tryLine = (k: number) => ` { const m = ${fns[k]}(t, n, cc, src); if (m !== null) return m; }`; + const tryLine = (k: number) => ` { const m = ${fns[k]}(t, n, cc, tb, src); if (m !== null) return m; }`; const bucketLines = (pred: (keys: Set) => boolean): string[] => plans.map((_, k) => (admits[k].keys.size === 0 || pred(admits[k].keys) ? tryLine(k) : '')) .filter(Boolean); @@ -618,8 +618,8 @@ export function generateCstMatch(grammar: CstGrammar, importFrom: string): strin lines.push(`${pad} break;`); lines.push(`${pad} }`); lines.push(`${pad} }`); - lines.push(`${pad}} else if ((_k1 = t.leafKindOf(e1)) === 1 || (_k1 === 0 && t.leafTokKindOf(e1) === 1)) {`); - lines.push(`${pad} switch (src.charCodeAt(t.offsetOf(e1))) {`); + lines.push(`${pad}} else if ((_k1 = t.leafKindOf(e1)) === 1 || (_k1 === 0 && t.leafTokKindOf(e1, tb) === 1)) {`); + lines.push(`${pad} switch (src.charCodeAt(t.leafOffsetOf(e1, tb))) {`); for (const cc of [...cset].sort((a, b) => a - b)) { lines.push(`${pad} case ${cc}: {`); lines.push(...subTry(i => restAdmit[i]!.keys.has('c:' + cc)).map(l => ' ' + l)); @@ -634,7 +634,7 @@ export function generateCstMatch(grammar: CstGrammar, importFrom: string): strin lines.push(`${pad}} else if (_k1 === 2) {`); lines.push(...subTry(i => restAdmit[i]!.keys.has('t:$operator'))); lines.push(`${pad}} else {`); - lines.push(`${pad} switch (t.leafTokKindOf(e1)) {`); + lines.push(`${pad} switch 
(t.leafTokKindOf(e1, tb)) {`); for (const t of [...tset].sort()) { if (t === '$operator') continue; // handled by the kind-2 branch above lines.push(`${pad} case ${typeKind.get(t)}: { // ${t}`); @@ -652,7 +652,7 @@ export function generateCstMatch(grammar: CstGrammar, importFrom: string): strin }; const disp: string[] = []; - disp.push(`export function match${sanitizeIdent(rule.name)}(t: TreeAccess, n: NodeEntry<${J(rule.name)}>, src: string): ${tName} {`); + disp.push(`export function match${sanitizeIdent(rule.name)}(t: TreeAccess, n: NodeEntry<${J(rule.name)}>, tb: number, src: string): ${tName} {`); disp.push(` const cc = __load(t, n);`); disp.push(` let e1 = 0; let _k1 = 0;`); disp.push(` if (cc === 0) {`); @@ -681,8 +681,8 @@ export function generateCstMatch(grammar: CstGrammar, importFrom: string): strin } disp.push(` }`); disp.push(` } else { const _k0 = t.leafKindOf(e0);`); - disp.push(` if (_k0 === 1 || (_k0 === 0 && t.leafTokKindOf(e0) === 1)) {`); - disp.push(` switch (src.charCodeAt(t.offsetOf(e0))) {`); + disp.push(` if (_k0 === 1 || (_k0 === 0 && t.leafTokKindOf(e0, tb) === 1)) {`); + disp.push(` switch (src.charCodeAt(t.leafOffsetOf(e0, tb))) {`); for (const cc of [...charCodes].sort((a, b) => a - b)) { disp.push(` case ${cc}: {`); for (const l of bucketLines(keys => keys.has('c:' + cc))) disp.push(' ' + l); @@ -699,7 +699,7 @@ export function generateCstMatch(grammar: CstGrammar, importFrom: string): strin disp.push(` } else if (_k0 === 2) {`); for (const l of bucketLines(keys => keys.has('t:$operator'))) disp.push(l); disp.push(` } else {`); - disp.push(` switch (t.leafTokKindOf(e0)) {`); + disp.push(` switch (t.leafTokKindOf(e0, tb)) {`); for (const t of [...tokNames].sort()) { if (t === '$operator') continue; // handled by the kind-2 branch above disp.push(` case ${typeKind.get(t)}: { // ${t}`); @@ -715,7 +715,7 @@ export function generateCstMatch(grammar: CstGrammar, importFrom: string): strin } disp.push(` }`); disp.push(` } } }`); - 
disp.push(` throw new Error(${J(`match${sanitizeIdent(rule.name)}: no arm matches`)} + ' @' + t.offsetOf(n));`); + disp.push(` throw new Error(${J(`match${sanitizeIdent(rule.name)}: no arm matches`)} + ' @tok' + tb);`); disp.push(`}`); bodyParts.push(disp.join('\n')); matcherMapEntries.push(` ${J(rule.name)}: match${sanitizeIdent(rule.name)},`); @@ -732,11 +732,10 @@ export function generateCstMatch(grammar: CstGrammar, importFrom: string): strin header.push(` childCount(id: number): number;`); header.push(` childAt(id: number, i: number): number;`); header.push(` childrenInto(id: number, out: number[]): number;`); - header.push(` leafTokenType(entry: number): string;`); header.push(` leafKindOf(entry: number): number;`); - header.push(` leafTokKindOf(entry: number): number;`); - header.push(` offsetOf(entry: number): number;`); - header.push(` endOf(entry: number): number;`); + header.push(` leafTokKindOf(entry: number, tokBase: number): number;`); + header.push(` leafOffsetOf(entry: number, tokBase: number): number;`); + header.push(` leafEndOf(entry: number, tokBase: number): number;`); header.push(`}`); header.push(`// Branded entry aliases — compile-time discrimination over plain numbers.`); header.push(`export type NodeEntry = number & { readonly __node?: R };`); @@ -747,17 +746,17 @@ export function generateCstMatch(grammar: CstGrammar, importFrom: string): strin header.push(`const __SC: number[] = [];`); header.push(`const __load = (t: TreeAccess, n: number): number => t.childrenInto(n, __SC);`); header.push(`// kind: 1 = '$keyword' (leaf kind bit), 0 = '$punct' (type-derived + tok-kind 1).`); - header.push(`const __lit = (t: TreeAccess, cc: number, i: number, src: string, text: string, kind: number): boolean => {`); + header.push(`const __lit = (t: TreeAccess, cc: number, tb: number, i: number, src: string, text: string, kind: number): boolean => {`); header.push(` if (i >= cc) return false;`); header.push(` const e = __SC[i];`); - header.push(` if (e >= 0 
|| t.leafKindOf(e) !== kind || (kind === 0 && t.leafTokKindOf(e) !== 1)) return false;`); - header.push(` const off = t.offsetOf(e);`); - header.push(` return t.endOf(e) - off === text.length && src.startsWith(text, off);`); + header.push(` if (e >= 0 || t.leafKindOf(e) !== kind || (kind === 0 && t.leafTokKindOf(e, tb) !== 1)) return false;`); + header.push(` const off = t.leafOffsetOf(e, tb);`); + header.push(` return t.leafEndOf(e, tb) - off === text.length && src.startsWith(text, off);`); header.push(`};`); - header.push(`const __tok = (t: TreeAccess, cc: number, i: number, k: number): boolean => {`); + header.push(`const __tok = (t: TreeAccess, cc: number, tb: number, i: number, k: number): boolean => {`); header.push(` if (i >= cc) return false;`); header.push(` const e = __SC[i];`); - header.push(` return e < 0 && t.leafKindOf(e) === 0 && t.leafTokKindOf(e) === k;`); + header.push(` return e < 0 && t.leafKindOf(e) === 0 && t.leafTokKindOf(e, tb) === k;`); header.push(`};`); header.push(`const __opTok = (t: TreeAccess, cc: number, i: number): boolean => {`); header.push(` if (i >= cc) return false;`); @@ -774,11 +773,11 @@ export function generateCstMatch(grammar: CstGrammar, importFrom: string): strin const footer = [ ``, `/** rule name → its matcher (generic walking; the totality gate uses this). */`, - `export const MATCHERS: Record { arm: string }> = {`, + `export const MATCHERS: Record { arm: string }> = {`, ...matcherMapEntries, `};`, `/** rule ID → matcher (the emitted parser's rowRule ids — declaration order). 
*/`, - `export const MATCHERS_BY_ID: ((t: TreeAccess, n: never, src: string) => { arm: string })[] = [`, + `export const MATCHERS_BY_ID: ((t: TreeAccess, n: never, tb: number, src: string) => { arm: string })[] = [`, ...grammar.rules.map(r => ` match${sanitizeIdent(r.name)},`), `];`, ]; diff --git a/src/gen-parser.ts b/src/gen-parser.ts index f56f405..66b09c2 100644 --- a/src/gen-parser.ts +++ b/src/gen-parser.ts @@ -1482,7 +1482,15 @@ export function createParser(grammar: CstGrammar) { } } - return { parse, tokenize, profCounts }; + // API parity with the emitted engine's handle surface: edit() re-parses and + // updates the SAME tree object in place (the handle is the document's tree — + // edit returns nothing, exactly like the emitted engine; no reuse here). + const edit = (cst: { rule: string; children: unknown[]; offset: number; end: number }, source: string): void => { + const next = parse(source) as typeof cst; + cst.rule = next.rule; cst.children = next.children; + cst.offset = next.offset; cst.end = next.end; + }; + return { parse, edit, tokenize, profCounts }; } // ── Helpers ── diff --git a/src/token-dfa.ts b/src/token-dfa.ts new file mode 100644 index 0000000..12b83ca --- /dev/null +++ b/src/token-dfa.ts @@ -0,0 +1,417 @@ +// ───────────────────────────────────────────────────────────────────────────── +// token-dfa.ts — derive a char-code DFA matcher from a token's structured pattern IR +// (src/token-pattern.ts), as the forward path to a scanner that dispatches on char +// codes instead of executing a regex per token (issue #5). +// +// The lexer matches one token at a time, anchored at `pos`, taking that token's +// greedy/longest match (sticky `re.lastIndex = pos; re.exec(s)`). 
This compiles the +// REGULAR subset of the IR — literal · charClass · anyChar · seq · alt · greedy +// repeat · never, plus a single TRAILING lookahead over a char class (the `(?!…)` +// guard the numeric tokens end with) — to an NFA (Thompson), then a DFA (subset +// construction), and runs it over `charCodeAt` code units. `match(s, pos)` returns +// the same match length the token's sticky regex would, or -1. +// +// Anything outside that subset (mid-pattern look-around, lookbehind, anchors, a +// non-greedy quantifier) → `compileTokenDfa` returns null and the caller keeps using +// the regex. So the scanner is byte-identical by construction: a DFA where the IR is +// regular, the proven regex elsewhere. Char classes are matched over UTF-16 code +// units (0..0xFFFF) exactly like the non-`/u` regexes the lexer emits today. +// ───────────────────────────────────────────────────────────────────────────── + +import type { TokenPattern, TokenCharClassItem } from './types.ts'; + +// UTF-16 code-unit alphabet. Negated classes complement within [0, MAX_CODE]. +const MAX_CODE = 0xffff; + +// A half-open is avoided: ranges are inclusive [lo, hi] of code units. +export interface Range { lo: number; hi: number } + +// ── Char-class → sorted, merged, inclusive ranges ── +function classRanges(items: TokenCharClassItem[], negate: boolean): Range[] { + const raw: Range[] = []; + for (const item of items) { + if (item.type === 'char') { + const c = item.value.charCodeAt(0); + raw.push({ lo: c, hi: c }); + } else { + const a = item.from.charCodeAt(0), b = item.to.charCodeAt(0); + raw.push({ lo: Math.min(a, b), hi: Math.max(a, b) }); + } + } + const merged = mergeRanges(raw); + return negate ? 
complementRanges(merged) : merged; +} + +function mergeRanges(ranges: Range[]): Range[] { + if (ranges.length === 0) return []; + const sorted = [...ranges].sort((a, b) => a.lo - b.lo || a.hi - b.hi); + const out: Range[] = [{ ...sorted[0] }]; + for (let i = 1; i < sorted.length; i++) { + const last = out[out.length - 1], r = sorted[i]; + if (r.lo <= last.hi + 1) last.hi = Math.max(last.hi, r.hi); + else out.push({ ...r }); + } + return out; +} + +function complementRanges(ranges: Range[]): Range[] { + // ranges are sorted+merged; complement within [0, MAX_CODE]. + const out: Range[] = []; + let next = 0; + for (const r of ranges) { + if (r.lo > next) out.push({ lo: next, hi: r.lo - 1 }); + next = r.hi + 1; + } + if (next <= MAX_CODE) out.push({ lo: next, hi: MAX_CODE }); + return out; +} + +// ── NFA (Thompson) ── +// A transition is either an epsilon move or a move on any code unit inside `ranges`. +interface NfaState { eps: number[]; trans: { ranges: Range[]; to: number }[] } + +class UnsupportedPattern extends Error {} + +class Nfa { + states: NfaState[] = []; + newState(): number { this.states.push({ eps: [], trans: [] }); return this.states.length - 1; } + eps(a: number, b: number): void { this.states[a].eps.push(b); } + move(a: number, ranges: Range[], b: number): void { this.states[a].trans.push({ ranges, to: b }); } +} + +// Build an NFA fragment for `pattern`; returns [start, accept]. Throws UnsupportedPattern +// for any non-regular construct so the caller can fall back to the regex. 
+function build(nfa: Nfa, pattern: TokenPattern): [number, number] { + if (typeof pattern === 'string') return buildLiteral(nfa, pattern); + switch (pattern.type) { + case 'anyChar': { + const s = nfa.newState(), a = nfa.newState(); + nfa.move(s, [{ lo: 0, hi: MAX_CODE }], a); + return [s, a]; + } + case 'charClass': { + const ranges = classRanges(pattern.items, pattern.negate); + const s = nfa.newState(), a = nfa.newState(); + if (ranges.length) nfa.move(s, ranges, a); // empty class → no edge → never matches + return [s, a]; + } + case 'seq': { + if (pattern.items.length === 0) { const s = nfa.newState(); return [s, s]; } + let [start, acc] = build(nfa, pattern.items[0]); + for (let i = 1; i < pattern.items.length; i++) { + const [s2, a2] = build(nfa, pattern.items[i]); + nfa.eps(acc, s2); + acc = a2; + } + return [start, acc]; + } + case 'alt': { + const s = nfa.newState(), a = nfa.newState(); + for (const item of pattern.items) { + const [s2, a2] = build(nfa, item); + nfa.eps(s, s2); + nfa.eps(a2, a); + } + return [s, a]; + } + case 'repeat': { + if (!pattern.greedy) throw new UnsupportedPattern('non-greedy repeat'); + // min mandatory copies, then either an unbounded star or (max-min) optional copies. + const s = nfa.newState(); + let acc = s; + for (let i = 0; i < pattern.min; i++) { + const [s2, a2] = build(nfa, pattern.body); + nfa.eps(acc, s2); + acc = a2; + } + if (pattern.max === undefined) { + // star: acc --eps--> bodyStart, bodyAccept --eps--> acc (loop) and onward. 
+ const [s2, a2] = build(nfa, pattern.body); + const a = nfa.newState(); + nfa.eps(acc, s2); + nfa.eps(a2, s2); // loop + nfa.eps(acc, a); // skip (zero more) + nfa.eps(a2, a); // exit after >=1 + return [s, a]; + } else { + const a = nfa.newState(); + let cur = acc; + for (let i = pattern.min; i < pattern.max; i++) { + const [s2, a2] = build(nfa, pattern.body); + nfa.eps(cur, s2); + nfa.eps(cur, a); // optional: skip the rest + cur = a2; + } + nfa.eps(cur, a); + return [s, a]; + } + } + case 'never': { + const s = nfa.newState(), a = nfa.newState(); // no edge s→a → never accepts + return [s, a]; + } + // Non-regular: the caller must fall back to the regex. + case 'lookahead': + case 'lookbehind': + case 'anchor': + throw new UnsupportedPattern(pattern.type); + } +} + +function buildLiteral(nfa: Nfa, literal: string): [number, number] { + const start = nfa.newState(); + let cur = start; + for (let i = 0; i < literal.length; i++) { + const c = literal.charCodeAt(i); + const next = nfa.newState(); + nfa.move(cur, [{ lo: c, hi: c }], next); + cur = next; + } + return [start, cur]; +} + +// ── Subset construction → DFA ── +interface DfaState { accept: boolean; edges: { ranges: Range[]; to: number }[] } + +function epsilonClosure(nfa: Nfa, set: Set): Set { + const stack = [...set], out = new Set(set); + while (stack.length) { + const s = stack.pop()!; + for (const t of nfa.states[s].eps) if (!out.has(t)) { out.add(t); stack.push(t); } + } + return out; +} + +function setKey(set: Set): string { + return [...set].sort((a, b) => a - b).join(','); +} + +// Partition boundaries: every code unit where some transition's membership flips. We +// build a sorted list of "cut points" so the alphabet splits into intervals on which +// every NFA transition is constant — the classic DFA alphabet partition. 
+function buildDfa(nfa: Nfa, start: number, accept: number): DfaState[] { + const startSet = epsilonClosure(nfa, new Set([start])); + const dfa: DfaState[] = []; + const index = new Map(); + const queue: Set[] = []; + + const intern = (set: Set): number => { + const key = setKey(set); + let id = index.get(key); + if (id === undefined) { + id = dfa.length; + index.set(key, id); + dfa.push({ accept: set.has(accept), edges: [] }); + queue.push(set); + } + return id; + }; + + intern(startSet); + while (queue.length) { + const set = queue.shift()!; + const id = index.get(setKey(set))!; + // Collect this state's outgoing transitions, then split into disjoint intervals. + const trans: { ranges: Range[]; to: number }[] = []; + for (const ns of set) for (const tr of nfa.states[ns].trans) trans.push(tr); + if (trans.length === 0) continue; + // Cut points: for every range [lo,hi] add boundaries at lo and hi+1. + const cuts = new Set(); + for (const tr of trans) for (const r of tr.ranges) { cuts.add(r.lo); cuts.add(r.hi + 1); } + const points = [...cuts].sort((a, b) => a - b); + // For each elementary interval [points[i], points[i+1]-1], gather NFA targets. + const edges: { ranges: Range[]; to: number }[] = []; + for (let i = 0; i < points.length - 1; i++) { + const lo = points[i], hi = points[i + 1] - 1; + if (hi < lo) continue; + const targets = new Set(); + for (const tr of trans) { + for (const r of tr.ranges) if (r.lo <= lo && hi <= r.hi) { targets.add(tr.to); break; } + } + if (targets.size === 0) continue; + const toId = intern(epsilonClosure(nfa, targets)); + edges.push({ ranges: [{ lo, hi }], to: toId }); + } + // Merge adjacent intervals that go to the same DFA state (compacts the table). 
+ edges.sort((a, b) => a.ranges[0].lo - b.ranges[0].lo); + const merged: { ranges: Range[]; to: number }[] = []; + for (const e of edges) { + const last = merged[merged.length - 1]; + if (last && last.to === e.to && last.ranges[last.ranges.length - 1].hi + 1 === e.ranges[0].lo) { + last.ranges[last.ranges.length - 1].hi = e.ranges[0].hi; + } else merged.push({ ranges: [{ ...e.ranges[0] }], to: e.to }); + } + dfa[id].edges = merged; + } + return dfa; +} + +function dfaNext(state: DfaState, code: number): number { + for (const e of state.edges) { + for (const r of e.ranges) { + if (code < r.lo) break; // ranges are sorted ascending + if (code <= r.hi) return e.to; + } + } + return -1; +} + +// Run the DFA from `pos`, recording every accepting length. Returns the lengths in +// DESCENDING order (longest first) — what a greedy regex would prefer, and what the +// trailing-lookahead retry needs. +function runAcceptLengths(dfa: DfaState[], s: string, pos: number): number[] { + const accepts: number[] = []; + let state = 0, i = pos; + if (dfa[0].accept) accepts.push(0); + while (state >= 0 && i < s.length) { + const next = dfaNext(dfa[state], s.charCodeAt(i)); + if (next < 0) break; + state = next; + i++; + if (dfa[state].accept) accepts.push(i - pos); + } + return accepts.reverse(); +} + +// ── Public compile ── +export interface TokenDfa { + /** Match length at `pos`, or -1 — byte-identical to the token's sticky regex exec. */ + match(s: string, pos: number): number; +} + +// The compiled DFA + any trailing char-class assertion, exposed so a code emitter can +// turn it into specialized straight-line JS (a generic interpreter over this structure +// is SLOWER than V8's regex — the win is in emitting tight char-code branches). 
+export type { DfaState }; +export interface CompiledTokenDfa { states: DfaState[]; trailing: { ranges: Range[]; negate: boolean } | null } + +export function buildTokenDfaRaw(pattern: TokenPattern): CompiledTokenDfa | null { + try { + const look = trailingLookahead(pattern); + const nfa = new Nfa(); + const [start, accept] = build(nfa, look ? look.body : pattern); + const states = buildDfa(nfa, start, accept); + return { states, trailing: look ? { ranges: look.ranges, negate: look.negate } : null }; + } catch (e) { + if (e instanceof UnsupportedPattern) return null; + throw e; + } +} + +// ── DFA → specialized straight-line JS ── +// A GENERIC interpreter over the DFA is slower than V8's JIT-compiled regex; the win is +// in emitting tight char-code branches (measured ~1.3–1.6× over the sticky regex on the +// common tokens). Above this many DFA states the emitted switch stops paying off (a large +// escape-heavy token like a string literal lands ~even with the regex), so we decline and +// the caller keeps the regex — correctness is identical either way. +const MAX_SCANNER_STATES = 64; + +function rangesCond(ranges: Range[], v: string): string { + return ranges.map(r => r.lo === r.hi ? `${v}===${r.lo}` : `${v}>=${r.lo}&&${v}<=${r.hi}`).join('||'); +} + +/** + * Emit a token scanner as a JS function BODY with parameters `(s, pos, re)`: returns the + * match length at `pos` (byte-identical to the token's sticky regex), or -1. `re` is the + * token's own regex, used only on the rare trailing-lookahead retry. Returns null when the + * pattern is outside the supported subset or its DFA is too large (caller keeps the regex). 
+ */ +export function emitTokenScannerBody(pattern: TokenPattern): string | null { + const compiled = buildTokenDfaRaw(pattern); + if (!compiled) return null; + const { states, trailing } = compiled; + if (states.length > MAX_SCANNER_STATES) return null; + const accept = states.map(s => s.accept); + const L: string[] = []; + L.push(`const n=s.length;let i=pos,st=0,acc=${accept[0] ? 0 : -1};`); + L.push(`for(;;){if(i>=n)break;const c=s.charCodeAt(i);switch(st){`); + states.forEach((state, si) => { + if (state.edges.length === 0) { L.push(`case ${si}:break;`); return; } + let body = `case ${si}:{`; + for (const e of state.edges) { + const cond = rangesCond(e.ranges, 'c'); + body += `if(${e.ranges.length > 1 ? `(${cond})` : cond}){st=${e.to};i++;${accept[e.to] ? 'acc=i-pos;' : ''}continue;}`; + } + L.push(body + 'break;}'); + }); + L.push('}break;}'); + if (trailing) { + // longest accept = acc; a trailing `(?!class)`/`(?=class)` may force a shorter match — + // rare (well-formed input ends the token at a boundary), so defer that to the regex. + L.push('if(acc<0)return -1;const at=pos+acc;const cc=at number) | null { + const body = emitTokenScannerBody(pattern); + if (body === null) return null; + const fn = new Function('s', 'pos', 're', body) as (s: string, pos: number, re: RegExp) => number; + return (s, pos) => fn(s, pos, regex); +} + +// A trailing `(?!class)` / `(?=class)` over a single char class is the only look-around +// the numeric tokens use; supported by retrying shorter body matches until the assertion +// at the body's end holds. Detected structurally on the IR. 
+function trailingLookahead(pattern: TokenPattern): { body: TokenPattern; ranges: Range[]; negate: boolean } | null { + if (typeof pattern === 'string' || pattern.type !== 'seq') return null; + const last = pattern.items[pattern.items.length - 1]; + if (typeof last === 'string' || last.type !== 'lookahead') return null; + const inner = last.body; + if (typeof inner === 'string' || inner.type !== 'charClass') return null; // only a char-class assertion + const body: TokenPattern = pattern.items.length === 2 + ? pattern.items[0] + : { type: 'seq', items: pattern.items.slice(0, -1) }; + return { body, ranges: classRanges(inner.items, inner.negate), negate: last.negate }; +} + +function inRanges(ranges: Range[], code: number): boolean { + for (const r of ranges) if (code >= r.lo && code <= r.hi) return true; + return false; +} + +/** + * Compile a token's pattern to a char-code DFA matcher, or return null if the pattern + * uses a construct outside the supported regular subset (caller falls back to regex). + */ +export function compileTokenDfa(pattern: TokenPattern): TokenDfa | null { + try { + const look = trailingLookahead(pattern); + if (look) { + const nfa = new Nfa(); + const [start, accept] = build(nfa, look.body); + const dfa = buildDfa(nfa, start, accept); + const { ranges, negate } = look; + return { + match(s, pos) { + const lens = runAcceptLengths(dfa, s, pos); // longest first + for (const len of lens) { + const at = pos + len; + const has = at < s.length && inRanges(ranges, s.charCodeAt(at)); + // negative lookahead succeeds when the char is absent (incl. EOF); positive needs it present. + if (negate ? !has : has) return len; + } + return -1; + }, + }; + } + const nfa = new Nfa(); + const [start, accept] = build(nfa, pattern); + const dfa = buildDfa(nfa, start, accept); + return { + match(s, pos) { + const lens = runAcceptLengths(dfa, s, pos); + return lens.length ? 
lens[0] : -1; + }, + }; + } catch (e) { + if (e instanceof UnsupportedPattern) return null; + throw e; + } +} diff --git a/test/check.ts b/test/check.ts index 8b2c81e..8754566 100644 --- a/test/check.ts +++ b/test/check.ts @@ -21,6 +21,8 @@ const GATES: Gate[] = [ { group: 'core', name: 'cst-text-invariant', args: ['test/cst-text-invariant.ts'] }, { group: 'conformance', name: 'ts-ast-structure', args: ['test/ts-ast-verify.ts'] }, { group: 'core', name: 'cst-match-totality', args: ['test/cst-match-totality.ts'] }, + { group: 'core', name: 'incremental-verify', args: ['test/incremental-verify.ts'] }, + { group: 'core', name: 'multi-doc', args: ['test/multi-doc.ts'] }, { group: 'core', name: 'issue-cases', args: ['test/test-issues.ts'] }, { group: 'conformance', name: 'js', args: ['test/js-conformance.ts'] }, { group: 'conformance', name: 'tsx', args: ['test/tsx-conformance.ts'] }, diff --git a/test/cst-match-totality.ts b/test/cst-match-totality.ts index fd8a6be..f688cda 100644 --- a/test/cst-match-totality.ts +++ b/test/cst-match-totality.ts @@ -24,23 +24,23 @@ const samples: string[] = []; type Emitted = { parse(src: string, entry?: string): number; - visit(entry: number, fns: { enter?(id: number): boolean | void; leaf?(e: number, tok: number): void }): void; - tree: { ruleNameOf(id: number): string; childCount(id: number): number; childAt(id: number, i: number): number; leafTokenType(e: number): string; offsetOf(e: number): number; endOf(e: number): number }; + visit(entry: number, fns: { enter?(id: number, charBase: number, tokBase: number): boolean | void; leaf?(e: number, tok: number): void }): void; + tree: { ruleNameOf(id: number): string; lenOf(id: number): number }; }; -function checkTree(em: Emitted, root: number, src: string, matchers: Record { arm: string }>, tag: string): void { +function checkTree(em: Emitted, root: number, src: string, matchers: Record { arm: string }>, tag: string): void { em.visit(root, { - enter(id) { + enter(id, charBase, 
tokBase) { const m = matchers[em.tree.ruleNameOf(id)]; if (m !== undefined) { nodes++; try { - m(em.tree as never, id as never, src); + m(em.tree as never, id as never, tokBase, src); } catch (e) { misses++; if (samples.length < 10) { - const off = em.tree.offsetOf(id); - samples.push(`${tag} ${em.tree.ruleNameOf(id)} @${off}..${em.tree.endOf(id)} «${src.slice(off, Math.min(em.tree.endOf(id), off + 50)).replace(/\n/g, '\\n')}» — ${(e as Error).message.slice(0, 60)}`); + const end = charBase + em.tree.lenOf(id); + samples.push(`${tag} ${em.tree.ruleNameOf(id)} @${charBase}..${end} «${src.slice(charBase, Math.min(end, charBase + 50)).replace(/\n/g, '\\n')}» — ${(e as Error).message.slice(0, 60)}`); } } } diff --git a/test/emit-parser-verify.ts b/test/emit-parser-verify.ts index 2269874..c7c2732 100644 --- a/test/emit-parser-verify.ts +++ b/test/emit-parser-verify.ts @@ -9,6 +9,7 @@ // node test/emit-parser-verify.ts # 4 bench files + ~400-file corpus sample // node test/emit-parser-verify.ts # sample stride N (default ~ to hit ~400) // node test/emit-parser-verify.ts all # every .ts file under conformance +import { objectify } from './emitted-obj.ts'; import { createParser } from '../src/gen-parser.ts'; import { emitParser } from '../src/emit-parser.ts'; import { readdir } from 'fs/promises'; @@ -41,7 +42,7 @@ function compare(code: string): { verdict: string; detail?: string } { const o = run(oracle.parse, code); // The emitted parser returns an arena node id; materialize the object view for the // byte-identical comparison against the interpreter's object tree. 
- const e = run((s: string) => emitted.toObject(emitted.parse(s)), code); + const e = run((s: string) => { const r = emitted.parse(s); return objectify(emitted.tree, (fns) => emitted.visit(r, fns)); }, code); if (!o.ok && o.err.includes('Maximum call stack')) { // The interpreter recursed out of stack — a CAPACITY limit, not a parse verdict; // the emitted parser's flatter frames can legitimately survive deeper inputs diff --git a/test/emitted-obj.ts b/test/emitted-obj.ts new file mode 100644 index 0000000..cc4c123 --- /dev/null +++ b/test/emitted-obj.ts @@ -0,0 +1,39 @@ +// Materialize an emitted-engine tree as a plain object — TEST-SIDE ONLY. The engine +// deliberately exposes a single consumption surface (visit + tree accessors); full +// materialization is a consumer choice, and the only consumer that needs it is the +// gate layer's byte-identical JSON comparison (incremental ≡ fresh, emit ≡ interp). +// The shape (and KEY ORDER — JSON.stringify equality depends on it) mirrors the +// interpreter's native object trees: nodes { rule, children, offset, end }, leaves +// { tokenType, offset, end }. 
+export interface TreeView { + ruleNameOf(id: number): string; + lenOf(id: number): number; + leafTokenType(entry: number, tokBase: number): string; + leafOffsetOf(entry: number, tokBase: number): number; + leafEndOf(entry: number, tokBase: number): number; +} +type VisitFns = { + enter?(id: number, charBase: number, tokBase: number): boolean | void; + leave?(id: number, charBase: number, tokBase: number): void; + leaf?(entry: number, tok: number): void; +}; +export type ObjNode = { rule: string; children: (ObjNode | ObjLeaf)[]; offset: number; end: number }; +export type ObjLeaf = { tokenType: string; offset: number; end: number }; + +export function objectify(tree: TreeView, runVisit: (fns: VisitFns) => void): ObjNode { + const rootHolder: { children: (ObjNode | ObjLeaf)[] } = { children: [] }; + const stack: { children: (ObjNode | ObjLeaf)[] }[] = [rootHolder]; + runVisit({ + enter(id, charBase) { + const node: ObjNode = { rule: tree.ruleNameOf(id), children: [], offset: charBase, end: charBase + tree.lenOf(id) }; + stack[stack.length - 1].children.push(node); + stack.push(node); + }, + leave() { stack.pop(); }, + leaf(entry, tok) { + const tb = tok - ((~entry) >>> 2); + stack[stack.length - 1].children.push({ tokenType: tree.leafTokenType(entry, tb), offset: tree.leafOffsetOf(entry, tb), end: tree.leafEndOf(entry, tb) }); + }, + }); + return rootHolder.children[0] as ObjNode; +} diff --git a/test/incremental-verify.ts b/test/incremental-verify.ts new file mode 100644 index 0000000..0178d84 --- /dev/null +++ b/test/incremental-verify.ts @@ -0,0 +1,193 @@ +// Gate: INCREMENTAL ≡ FRESH. parseEdited(newSource) must produce a tree byte-identical +// (via toObject) to a from-scratch parse of the same text, across scripted edit +// sessions over real files — inserts, deletions, replacements, statement insertions, +// edits inside strings/comments, and syntax-breaking edits (both sides must reject; +// the session self-heals on the next good text). 
Also reports the incremental speedup +// and the arena growth, so reuse is MEASURED, not assumed. +// +// node test/incremental-verify.ts +import { objectify } from './emitted-obj.ts'; +import { existsSync, readFileSync, writeFileSync } from 'node:fs'; +import { emitParser } from '../src/emit-parser.ts'; + +const grammar = (await import('../typescript.ts')).default; +const emPath = '/tmp/emitted-incremental.mjs'; +writeFileSync(emPath, emitParser(grammar)); +type Edit = { start: number; end: number; text: string }; +type Cst = { root: number }; +type Parser = { + parse(s: string): Cst; + edit(cst: Cst, edits: Edit[]): void; + visit(cst: Cst, fns: object): void; + tree: import('./emitted-obj.ts').TreeView; +}; +type Em = { + parse(s: string): number; + visit(entry: number, fns: object): void; + tree: import('./emitted-obj.ts').TreeView; + createParser(): Parser; +}; +const session = ((await import(emPath + '?session=' + process.pid)) as Em).createParser(); +const fresh = (await import(emPath + '?fresh=' + process.pid)) as Em; + +// Deterministic LCG so failures replay. +let seedState = 0x2F6E2B1; +const rand = () => ((seedState = (seedState * 48271) % 0x7fffffff) / 0x7fffffff); +const randInt = (n: number) => Math.floor(rand() * n); + +const INSERTS = ['x', '_v', '42', ' + y', '.m', '()', ' /*c*/ ', '"s"', 'await ', '!', '?']; +const STMTS = ['const q9 = 1;\n', 'function g9(a) { return a; }\n', 'if (x9) { y9(); }\n', '// note\n', 'type T9 = string | number;\n']; + +// Mutations return the edit RANGE too, so half the steps can exercise the edits +// PROTOCOL path (the editor-facing API) while the other half exercises the +// char-diff fallback envelope. 
+function mutate(text: string): { next: string; edit: Edit } { + switch (randInt(5)) { + case 0: { // insert a small fragment at a random position + const at = randInt(text.length); + const ins = INSERTS[randInt(INSERTS.length)]; + return { next: text.slice(0, at) + ins + text.slice(at), edit: { start: at, end: at, text: ins } }; + } + case 1: { // delete a small span + const at = randInt(Math.max(1, text.length - 8)); + const n = 1 + randInt(6); + return { next: text.slice(0, at) + text.slice(at + n), edit: { start: at, end: at + n, text: '' } }; + } + case 2: { // replace a character + const at = randInt(Math.max(1, text.length - 1)); + return { next: text.slice(0, at) + 'z' + text.slice(at + 1), edit: { start: at, end: at + 1, text: 'z' } }; + } + case 3: { // insert a whole statement at a line boundary + const lines = text.split('\n'); + const at = randInt(lines.length); + const stmt = STMTS[randInt(STMTS.length)].trimEnd(); + lines.splice(at, 0, stmt); + const start = at === 0 ? 0 : lines.slice(0, at).join('\n').length + 1; + return { next: lines.join('\n'), edit: { start, end: start, text: stmt + '\n' } }; + } + default: { // append at the end (the pure-prefix reuse case) + const stmt = '\n' + STMTS[randInt(STMTS.length)]; + return { next: text + stmt, edit: { start: text.length, end: text.length, text: stmt } }; + } + } +} + +const FILES = [ + '/tmp/ts-repo/tests/cases/conformance/parser/ecmascript5/RealWorld/parserharness.ts', + '/tmp/ts-repo/tests/cases/conformance/fixSignatureCaching.ts', + '/tmp/ts-repo/tests/cases/conformance/parser/ecmascript5/parserRealSource7.ts', + '/tmp/ts-repo/tests/cases/conformance/parser/ecmascript5/RealWorld/parserindenter.ts', +].filter(existsSync); +const STEPS = 30; + +// ── Adversarial boundary edits (deterministic) ── +// The fixed-seed random sessions MISSED the restart-anchor abutment hole (a token +// ending exactly at the damage start can be EXTENDED under maximal munch — 'b'+'x' +// = 'bx', '='+'=' = '==', deleting a 
gap glues neighbours). These cases pin the +// strict-< restart anchor; every one must match fresh (tree or reject) exactly. +// Test-side range derivation for constructed pairs (the ENGINE requires explicit +// ranges — a caller without them passes the whole-file range for a full re-parse). +function diffChange(a: string, b: string): Edit { + const minL = Math.min(a.length, b.length); + let s = 0; + while (s < minL && a.charCodeAt(s) === b.charCodeAt(s)) s++; + let e = 0; + while (e < minL - s && a.charCodeAt(a.length - 1 - e) === b.charCodeAt(b.length - 1 - e)) e++; + return { start: s, end: a.length - e, text: b.slice(s, b.length - e) }; +} + +const GLUE: Array<[string, string]> = [ + ['const a = 1;\nconst b = 2;\n', 'const a = 1;\nconst bx = 2;\n'], + ['let a = b; let c = 1;\n', 'let a = b1; let c = 1;\n'], + ['if (a = b) { f(); }\n', 'if (a == b) { f(); }\n'], + ['const x = a b;\n', 'const x = ab;\n'], + ['const q = w / 2;\n', 'const q = w /= 2;\n'], + ['const t = a + b;\n', 'const t = a ++ b;\n'], + ['const u = x(z);\n', 'const u = x>(z);\n'], + ['f(a, b);\ng(c);\n', 'f(a, bc);\ng(c);\n'], +]; + +let steps = 0, equal = 0, bothReject = 0, mismatch = 0; +let tInc = 0, tFresh = 0; +const failures: string[] = []; + +for (const [base, edited] of GLUE) { + steps++; + const c0 = session.parse(base); + let fe: string | null = null, ie: string | null = null; + let fr = -1; + try { fr = fresh.parse(edited); } catch (e) { fe = (e as Error).message; } + try { session.edit(c0, [diffChange(base, edited)]); } catch (e) { ie = (e as Error).message; } + if (fe !== null || ie !== null) { + if ((fe === null) !== (ie === null)) { mismatch++; if (failures.length < 5) failures.push(`glue «${edited.slice(0, 30)}»: fresh ${fe ? 'reject' : 'accept'} / incremental ${ie ? 
'reject' : 'accept'}`); } + else bothReject++; + continue; + } + const a = JSON.stringify(objectify(fresh.tree, (fns) => fresh.visit(fr, fns))); + const b = JSON.stringify(objectify(session.tree, (fns) => session.visit(c0, fns))); + if (a === b) equal++; + else { mismatch++; if (failures.length < 5) failures.push(`glue «${edited.slice(0, 30)}»: tree diverges`); } +} + +for (const f of FILES) { + let text = readFileSync(f, 'utf-8'); + let cst = session.parse(text); // open the session + for (let k = 0; k < STEPS; k++) { + const { next, edit } = mutate(text); + steps++; + let freshRoot = -1, freshErr: string | null = null; + const tf0 = performance.now(); + try { freshRoot = fresh.parse(next); } catch (e) { freshErr = (e as Error).message; } + const tf1 = performance.now(); + let incErr: string | null = null; + const ti0 = performance.now(); + try { session.edit(cst, [edit]); } catch (e) { incErr = (e as Error).message; } + const ti1 = performance.now(); + if (freshErr !== null || incErr !== null) { + if ((freshErr === null) !== (incErr === null)) { + mismatch++; + if (failures.length < 5) failures.push(`${f.split('/').pop()} step ${k}: fresh ${freshErr ? 'reject' : 'accept'} / incremental ${incErr ? 'reject' : 'accept'}\n fresh: ${freshErr ?? '-'}\n inc: ${incErr ?? '-'}`); + } else bothReject++; + // REJECTED text: the handle stays on the previous tree, but the DOCUMENT + // advances (editor-buffer model — the buffer applied the change regardless, + // and the engine's docSrc tracks it). Model the editor's UNDO: revert via a + // diff edit in the rejected text's coordinates; it must be accepted and + // byte-identical to a fresh parse of the restored text. 
+ try { + session.edit(cst, [diffChange(next, text)]); + const rfr = fresh.parse(text); + const ra = JSON.stringify(objectify(fresh.tree, (fns) => fresh.visit(rfr, fns))); + const rb = JSON.stringify(objectify(session.tree, (fns) => session.visit(cst, fns))); + if (ra !== rb) { + mismatch++; + if (failures.length < 5) failures.push(`${f.split('/').pop()} step ${k}: REVERT tree diverges`); + } + } catch (e2) { + mismatch++; + if (failures.length < 5) failures.push(`${f.split('/').pop()} step ${k}: revert rejected: ${(e2 as Error).message.slice(0, 50)}`); + } + continue; + } + tFresh += tf1 - tf0; tInc += ti1 - ti0; + const a = JSON.stringify(objectify(fresh.tree, (fns) => fresh.visit(freshRoot, fns))); + const b = JSON.stringify(objectify(session.tree, (fns) => session.visit(cst, fns))); + if (a === b) equal++; + else { + mismatch++; + if (failures.length < 5) { + let i = 0; while (i < a.length && i < b.length && a[i] === b[i]) i++; + failures.push(`${f.split('/').pop()} step ${k}: tree diverges @${i}\n fresh: …${a.slice(Math.max(0, i - 50), i + 50)}…\n inc: …${b.slice(Math.max(0, i - 50), i + 50)}…`); + } + } + text = next; + } +} + +console.log(`incremental ≡ fresh: ${equal} equal · ${bothReject} both-reject · ${mismatch} MISMATCH (${steps} steps over ${FILES.length} files)`); +if (tInc > 0) console.log(`time: incremental ${tInc.toFixed(1)}ms vs fresh ${tFresh.toFixed(1)}ms → ${(tFresh / tInc).toFixed(2)}× faster on accepted edits`); +for (const s of failures) console.log(' ✗ ' + s); +if (mismatch > 0) { + console.error('✗ incremental parse diverges from a fresh parse'); + process.exit(1); +} +console.log('✓ every edited re-parse is byte-identical to a fresh parse'); diff --git a/test/multi-doc.ts b/test/multi-doc.ts new file mode 100644 index 0000000..d980cbb --- /dev/null +++ b/test/multi-doc.ts @@ -0,0 +1,167 @@ +// Gate: DOCUMENTS ARE ISOLATED. 
The handle API (createParser → parse/edit with +// explicit tree handles) keeps one document's state per parser instance behind a +// lazily-swapped register set — a missed swap field shows up as cross-document +// corruption. Two instances edit two different sources interleaved (plus the +// module-level default-doc API mixed in between); every edited tree must be +// byte-identical (toObject) to a fresh parse of the same text. Also pins the +// handle contract: stale and foreign handles throw instead of silently reading +// an in-place-mutated tree, and a REJECTED edit leaves the old handle valid. +// +// node test/multi-doc.ts +import { objectify } from './emitted-obj.ts'; +import { writeFileSync } from 'node:fs'; +import { emitParser } from '../src/emit-parser.ts'; + +const grammar = (await import('../typescript.ts')).default; +const emPath = '/tmp/emitted-multidoc.mjs'; +writeFileSync(emPath, emitParser(grammar)); +type Edit = { start: number; end: number; text: string }; +type Cst = { root: number }; +type Parser = { parse(s: string): Cst; edit(cst: Cst, edits: Edit[]): void; visit(cst: Cst, fns: object): void; tree: import('./emitted-obj.ts').TreeView }; +type Em = { parse(s: string): number; createParser(): Parser }; +const em = (await import(emPath + '?v=' + process.pid)) as Em; + +// Two synthetic documents (no corpus dependency — the gate always exercises). 
+const mk = (tag: string, n: number) => { + let s = ''; + for (let i = 0; i < n; i++) s += `function ${tag}_${i}(a) { if (a > ${i}) { return a * ${i}; } const v_${i} = { x: ${i} }; return v_${i}.x; }\n`; + return s; +}; +let textA = mk('alpha', 400); +let textB = `(function () {\n${mk('beta', 300)}})();\n`; + +let seed = 0x51C0FFEE; +const rand = () => ((seed = (seed * 48271) % 0x7fffffff) / 0x7fffffff); +const randInt = (n: number) => Math.floor(rand() * n); +const INS = ['x', '1', ' + q', '.m', '(/*c*/)', '"s"']; +function mutate(text: string): { next: string; edit: Edit } { + switch (randInt(3)) { + case 0: { + const at = randInt(text.length); + const ins = INS[randInt(INS.length)]; + return { next: text.slice(0, at) + ins + text.slice(at), edit: { start: at, end: at, text: ins } }; + } + case 1: { + const at = randInt(Math.max(1, text.length - 6)); + const n = 1 + randInt(4); + return { next: text.slice(0, at) + text.slice(at + n), edit: { start: at, end: at + n, text: '' } }; + } + default: { + const at = randInt(Math.max(1, text.length - 1)); + return { next: text.slice(0, at) + 'z' + text.slice(at + 1), edit: { start: at, end: at + 1, text: 'z' } }; + } + } +} + +function diffChange(a: string, b: string): Edit { + const minL = Math.min(a.length, b.length); + let s = 0; + while (s < minL && a.charCodeAt(s) === b.charCodeAt(s)) s++; + let e = 0; + while (e < minL - s && a.charCodeAt(a.length - 1 - e) === b.charCodeAt(b.length - 1 - e)) e++; + return { start: s, end: a.length - e, text: b.slice(s, b.length - e) }; +} + +const p1 = em.createParser(); +const p2 = em.createParser(); +const f = em.createParser(); +let cstA = p1.parse(textA); +let cstB = p2.parse(textB); + +let steps = 0, equal = 0, bothReject = 0, mismatch = 0, reverts = 0; +const failures: string[] = []; +for (let k = 0; k < 60; k++) { + const onA = (k & 1) === 0; + const text = onA ? 
textA : textB; + const { next, edit } = mutate(text); + steps++; + let fe: string | null = null, ie: string | null = null; + let fc: Cst | null = null; + try { fc = f.parse(next); } catch (e) { fe = (e as Error).message; } + try { (onA ? p1 : p2).edit(onA ? cstA : cstB, [edit]); } catch (e) { ie = (e as Error).message; } + if (fe !== null || ie !== null) { + if ((fe === null) !== (ie === null)) { mismatch++; if (failures.length < 5) failures.push(`step ${k} (${onA ? 'A' : 'B'}): fresh ${fe ? 'reject' : 'accept'} / edit ${ie ? 'reject' : 'accept'}`); } + else bothReject++; + // the DOCUMENT advances on reject (editor-buffer model): later coordinates + // are against the rejected text. Model the editor's UNDO: revert to the last + // good text via a diff edit in the rejected text's coordinates — it must be + // ACCEPTED and byte-identical to a fresh parse (the post-reject recovery path + // gets exercised every time a mutation breaks the document). + const good = onA ? textA : textB; + const rv = diffChange(next, good); + try { + (onA ? p1 : p2).edit(onA ? cstA : cstB, [rv]); + const fb = f.parse(good); + const ra = JSON.stringify(objectify(f.tree, (fns) => f.visit(fb, fns))); + const qq = onA ? p1 : p2; + const rb = JSON.stringify(objectify(qq.tree, (fns) => qq.visit(onA ? cstA : cstB, fns))); + if (ra === rb) reverts++; + else { mismatch++; if (failures.length < 5) failures.push(`step ${k} (${onA ? 'A' : 'B'}): REVERT tree diverges`); } + } catch (e2) { + mismatch++; + if (failures.length < 5) failures.push(`step ${k} (${onA ? 'A' : 'B'}): revert rejected: ${(e2 as Error).message.slice(0, 50)}`); + } + continue; + } + // mix the module-level default doc in between: it must not disturb either instance + if (k % 5 === 0) em.parse('const mix = ' + k + ';'); + const a = JSON.stringify(objectify(f.tree, (fns) => f.visit(fc!, fns))); + const q = onA ? p1 : p2; + const b = JSON.stringify(objectify(q.tree, (fns) => q.visit(onA ? 
cstA : cstB, fns))); + if (a === b) equal++; + else { + mismatch++; + if (failures.length < 5) { + let i = 0; while (i < a.length && a[i] === b[i]) i++; + failures.push(`step ${k} (${onA ? 'A' : 'B'}): tree diverges @${i}`); + } + } + if (onA) textA = next; else textB = next; +} + +// handle contract: edit mutates the handle IN PLACE (no return — no clone illusion); +// only parse() re-opening the document invalidates old handles; rejects keep the tree. +let contract = 0; +{ + const p = em.createParser(); + const c1 = p.parse('const a = 1;'); + const obj = (h: Cst) => JSON.stringify(objectify(p.tree, (fns) => p.visit(h, fns))); + const before = obj(c1); + p.edit(c1, [{ start: 7, end: 7, text: 'b' }]); // 'const a = 1;' -> 'const ab = 1;' + const after = obj(c1); + if (after !== before && after.includes('"end":8')) contract++; // same handle, new tree + else failures.push('in-place edit did not update the handle'); + try { p2.edit(c1, [{ start: 0, end: 1, text: 'q' }]); failures.push('foreign handle did not throw'); } catch { contract++; } + let rejected = false; + try { p.edit(c1, [{ start: 6, end: 8, text: ']' }]); } catch { rejected = true; } // 'const ab…' -> 'const ] = 1;' + if (rejected && obj(c1) === after) contract++; // reject keeps the tree + else failures.push('reject-then-read flow broke'); + // coordinates after a REJECT are against the editor's buffer (the rejected text): + // fixing the same spot in those coordinates must recover the session + let recovered = false; + try { p.edit(c1, [{ start: 6, end: 7, text: 'ab' }]); recovered = true; } catch { /* must not throw */ } + if (recovered && obj(c1).includes('"end":13')) contract++; // 'const ] = 1;' -> 'const ab = 1;' + else failures.push('post-reject coordinates did not track the document text'); + const c2 = p.parse('let q = 1;'); + try { obj(c1); failures.push('re-opened document: old handle did not throw'); } catch { contract++; } + // missing ranges: ONE usage only — edit() without ranges must 
throw, not + // silently fall back to O(file) diff scans + let needsRanges = false; + try { (p as unknown as { edit(c: Cst): void }).edit(c2); } catch { needsRanges = true; } + if (needsRanges) contract++; + else failures.push('edit() without changes did not throw'); + // a REJECTING parse() resets the arena too — it must invalidate prior handles + try { p.parse('const ] = ;'); } catch { /* expected reject */ } + let dead = false; + try { obj(c2); } catch { dead = true; } + if (dead) contract++; + else failures.push('rejecting parse() left the old handle readable over a reset arena'); +} + +console.log(`multi-doc: ${equal} equal · ${bothReject} both-reject (${reverts} reverts verified) · ${mismatch} MISMATCH (${steps} interleaved steps) · contract ${contract}/7`); +for (const s of failures) console.log(' ✗ ' + s); +if (mismatch > 0 || contract !== 7 || failures.length > 0) { + console.error('✗ document isolation / handle contract violated'); + process.exit(1); +} +console.log('✓ documents are isolated; handles enforce the in-place-edit contract'); diff --git a/test/obj-tree.ts b/test/obj-tree.ts new file mode 100644 index 0000000..fee0a70 --- /dev/null +++ b/test/obj-tree.ts @@ -0,0 +1,78 @@ +// A TreeAccess adapter over an INTERPRETER object CST — absolute coordinates, ids +// assigned by one post-order walk. It lets matcher consumers (the ts-ast lowering) +// run against the interp oracle without caring that the EMITTED tree went green +// (relative coordinates): the adapter ignores every tokBase it is handed. +// +// leafTokKindOf is only ever consulted on kind-0 leaves (the generated probes test +// the kind bit first), where the object leaf's tokenType IS the token name (or +// '$punct') — so the name→type-kind map (same derivation as the engine: punct 1, +// template spans 2-4, named tokens from 5 in declaration order) is complete. 
+import type { CstGrammar } from '../src/types.ts'; + +type Leafish = { tokenType: string; offset: number; end: number }; +type Nodeish = { rule: string; children: (Leafish | Nodeish)[]; offset: number; end: number }; + +export interface ObjTree { + rootId: number; + // matcher-facing (TreeAccess-compatible; tokBase params ignored) + ruleNameOf(id: number): string; + ruleIdOf(id: number): number; + childCount(id: number): number; + childAt(id: number, i: number): number; + childrenInto(id: number, out: number[]): number; + leafKindOf(entry: number): number; + leafTokKindOf(entry: number, tokBase?: number): number; + leafOffsetOf(entry: number, tokBase?: number): number; + leafEndOf(entry: number, tokBase?: number): number; + // stateless absolute conveniences (the lowering's toolkit) + offsetOf(entry: number): number; + endOf(entry: number): number; + leafTokenType(entry: number): string; +} + +export function objTree(root: Nodeish, grammar: CstGrammar): ObjTree { + const typeKind = new Map([['', 1], ['$punct', 1], ['$templateHead', 2], ['$templateMiddle', 3], ['$templateTail', 4]]); + { let next = 5; for (const t of grammar.tokens) if (!typeKind.has(t.name)) typeKind.set(t.name, next++); } + const ruleIdM = new Map(grammar.rules.map((r, i) => [r.name, i])); + ruleIdM.set('$template', grammar.rules.length); + + const nodes: Nodeish[] = []; + const leaves: Leafish[] = []; + const kidsOf: number[][] = []; + const walk = (n: Nodeish): number => { + const ks: number[] = []; + for (const c of n.children) { + if ((c as Leafish).tokenType !== undefined) { + const lf = c as Leafish; + const li = leaves.length; + leaves.push(lf); + const kind = lf.tokenType === '$keyword' ? 1 : lf.tokenType === '$operator' ? 
2 : 0; + ks.push(~((li << 2) | kind)); + } else { + ks.push(walk(c as Nodeish)); + } + } + const id = nodes.length; + nodes.push(n); + kidsOf.push(ks); + return id; + }; + const rootId = walk(root); + const leafOf = (e: number) => leaves[(~e) >>> 2]; + + return { + rootId, + ruleNameOf: (id) => nodes[id].rule, + ruleIdOf: (id) => ruleIdM.get(nodes[id].rule) ?? -1, + childCount: (id) => kidsOf[id].length, + childAt: (id, i) => kidsOf[id][i], + childrenInto: (id, out) => { const ks = kidsOf[id]; for (let i = 0; i < ks.length; i++) out[i] = ks[i]; return ks.length; }, + leafKindOf: (e) => (~e) & 3, + leafTokKindOf: (e) => typeKind.get(leafOf(e).tokenType) ?? 0, + leafOffsetOf: (e) => leafOf(e).offset, + leafEndOf: (e) => leafOf(e).end, + offsetOf: (e) => e >= 0 ? nodes[e].offset : leafOf(e).offset, + endOf: (e) => e >= 0 ? nodes[e].end : leafOf(e).end, + leafTokenType: (e) => leafOf(e).tokenType, + }; +} diff --git a/test/token-dfa-verify.ts b/test/token-dfa-verify.ts new file mode 100644 index 0000000..a86f6c8 --- /dev/null +++ b/test/token-dfa-verify.ts @@ -0,0 +1,74 @@ +// Correctness + speed gate for token-dfa.ts: for every TS token whose pattern compiles +// to a DFA, the DFA's match length must equal the token's sticky-regex match length at +// EVERY position of the corpus (byte-identical), and we measure the per-token speedup. 
+// +// node test/token-dfa-verify.ts +import { compileTokenDfa } from '../src/token-dfa.ts'; +import { tokenPatternSource } from '../src/token-pattern.ts'; +import { readFileSync, readdirSync } from 'fs'; +import { join } from 'path'; + +const grammar = (await import('../typescript.ts')).default; + +const base = '/tmp/ts-repo/tests/cases/conformance'; +function walk(d: string): string[] { + const o: string[] = []; + for (const e of readdirSync(d, { withFileTypes: true })) { + const f = join(d, e.name); + if (e.isDirectory()) o.push(...walk(f)); + else if (e.name.endsWith('.ts') && !e.name.endsWith('.d.ts')) o.push(f); + } + return o; +} +const files = walk(base).sort().filter((_, i) => i % 11 === 0); // ~stride sample +const sources = files.map(f => { try { return readFileSync(f, 'utf-8'); } catch { return ''; } }).filter(Boolean); +const totalChars = sources.reduce((a, s) => a + s.length, 0); + +// Tokens the per-position lexer loop actually runs through a regex (skip template). +const tokens = grammar.tokens.filter(t => !t.template); + +console.log(`tokens: ${tokens.length} · corpus sample: ${sources.length} files, ${(totalChars / 1024).toFixed(0)} KB\n`); +console.log('token DFA? positions mism regex ms dfa ms speedup'); +console.log('-'.repeat(78)); + +let totalMism = 0, compiled = 0, fellBack = 0; +for (const t of tokens) { + let src: string; + try { src = tokenPatternSource(t); } catch { src = ''; } + const dfa = compileTokenDfa(t.pattern); + if (!dfa) { + fellBack++; + console.log(`${t.name.padEnd(16)} regex ${'—'.padStart(10)} ${'—'.padStart(4)} (unsupported → falls back to regex)`); + continue; + } + compiled++; + const re = new RegExp(`(?:${src})`, 'y'); + + // Correctness: at every position, DFA length === regex length. + let mism = 0, positions = 0; + for (const s of sources) { + for (let pos = 0; pos < s.length; pos++) { + re.lastIndex = pos; + const m = re.exec(s); + const reLen = m ? 
m[0].length : -1; + const dfaLen = dfa.match(s, pos); + positions++; + if (reLen !== dfaLen) { + if (mism < 3) console.log(` MISMATCH @${pos} re=${reLen} dfa=${dfaLen} ctx=${JSON.stringify(s.slice(pos, pos + 24))}`); + mism++; + } + } + } + totalMism += mism; + + // Speed: scan each source once via regex vs DFA (best-of-5). + const timeRe = () => { let acc = 0; for (const s of sources) for (let p = 0; p < s.length; p++) { re.lastIndex = p; const m = re.exec(s); acc += m ? m[0].length : 0; } return acc; }; + const timeDfa = () => { let acc = 0; for (const s of sources) for (let p = 0; p < s.length; p++) { const l = dfa.match(s, p); acc += l > 0 ? l : 0; } return acc; }; + const best = (fn: () => number) => { for (let w = 0; w < 2; w++) fn(); let b = Infinity; for (let r = 0; r < 5; r++) { const t0 = process.hrtime.bigint(); fn(); const dt = Number(process.hrtime.bigint() - t0) / 1e6; if (dt < b) b = dt; } return b; }; + const reMs = best(timeRe), dfaMs = best(timeDfa); + console.log(`${t.name.padEnd(16)} dfa ${String(positions).padStart(10)} ${String(mism).padStart(4)} ${reMs.toFixed(1).padStart(8)} ${dfaMs.toFixed(1).padStart(6)} ${(reMs / dfaMs).toFixed(2)}×`); +} + +console.log('-'.repeat(78)); +console.log(`compiled to DFA: ${compiled} · fell back to regex: ${fellBack} · TOTAL mismatches: ${totalMism}`); +process.exit(totalMism === 0 ? 0 : 1); diff --git a/test/ts-ast-lowering.ts b/test/ts-ast-lowering.ts index 06edb48..a5268c7 100644 --- a/test/ts-ast-lowering.ts +++ b/test/ts-ast-lowering.ts @@ -6,7 +6,8 @@ // // Deliberately NOT complete: unlowered constructs throw Unlowered (the verify driver // counts them) — the goal is an honest pain inventory, not a shipped frontend. 
-import { matchStmt, type TreeAccess } from '../typescript.cst-match.ts'; +import { matchStmt } from '../typescript.cst-match.ts'; +import type { ObjTree } from './obj-tree.ts'; export type Ast = { kind: string; pos: number; end: number; children: Ast[] }; const ast = (kind: string, pos: number, end: number, children: Ast[] = []): Ast => ({ kind, pos, end, children }); @@ -27,7 +28,7 @@ export class Unlowered extends Error { // against undefined, never truthiness. type E = number; let SRC = ''; -let T!: TreeAccess; +let T!: ObjTree; const isLeaf = (n: E | undefined): boolean => n !== undefined && n < 0; const isNode = (n: E | undefined): boolean => n !== undefined && n >= 0; const off = (n: E): number => T.offsetOf(n); @@ -510,7 +511,7 @@ function lowerBindingElement(n: E): Ast { // A few arms still reach into kidsOf(n) for the positions of uncaptured structural // keywords ('catch', the switch '{') — a noted destructurer gap. function lowerStmt(n: E): Ast { - const m = matchStmt(T, n as never, SRC); + const m = matchStmt(T as never, n as never, 0, SRC); const c = kidsOf(n); switch (m.arm) { case 'block': return lowerBlock(m.block); @@ -924,7 +925,7 @@ function lowerExport(n: E, c: E[], i: number, mods: Ast[]): Ast { } // ── Entry ── -export function lowerProgram(t: TreeAccess, root: E, source: string): Ast { +export function lowerProgram(t: ObjTree, root: E, source: string): Ast { T = t; SRC = source; const stmts: Ast[] = []; diff --git a/test/ts-ast-verify.ts b/test/ts-ast-verify.ts index 9e630fa..ff33f22 100644 --- a/test/ts-ast-verify.ts +++ b/test/ts-ast-verify.ts @@ -10,19 +10,15 @@ // node test/ts-ast-verify.ts [...] 
# real files import { existsSync, readFileSync } from 'node:fs'; import ts from 'typescript'; -import { writeFileSync } from 'node:fs'; -import { emitParser } from '../src/emit-parser.ts'; +import { createParser } from '../src/gen-parser.ts'; import { lowerProgram, Unlowered, type Ast } from './ts-ast-lowering.ts'; +import { objTree } from './obj-tree.ts'; -// The lowering consumes the ARENA through TreeAccess, so parse with the emitted -// parser (the product representation) — built fresh from the current grammar. +// The lowering runs against the INTERPRETER oracle through the object-tree adapter +// (absolute coordinates) — the grammar↔tsc structure contract is engine-independent, +// and the emitted tree's green (relative) coordinates stay the emitted gates' concern. const grammar = (await import('../typescript.ts')).default; -const emPath = '/tmp/emitted-tsast.mjs'; -writeFileSync(emPath, emitParser(grammar)); -const parser = (await import(emPath + '?v=' + process.pid)) as { - parse(src: string, entry?: string): number; - tree: import('../typescript.cst-match.ts').TreeAccess; -}; +const parser = createParser(grammar); const kindNum = (name: string): number => { const v = (ts.SyntaxKind as unknown as Record)[name]; @@ -72,11 +68,12 @@ function run(name: string, code: string): { ok: boolean; skipped?: boolean; line return { ok: true, skipped: true, line: `${name}: SKIPPED (tsc reports ${probe.parseDiagnostics.length} parse error(s) — recovery shapes are out of contract)`, samples: [] }; } } - let root: number; - try { root = parser.parse(code); } + let rootObj; + try { rootObj = parser.parse(code); } catch (e) { return { ok: false, line: `${name}: MONOGRAM REJECT ${(e as Error).message.slice(0, 60)}`, samples: [] }; } + const adapter = objTree(rootObj as never, grammar); let mine: Ast; - try { mine = lowerProgram(parser.tree, root, code); } + try { mine = lowerProgram(adapter, adapter.rootId, code); } catch (e) { if (e instanceof Unlowered) return { ok: false, 
line: `${name}: UNLOWERED ${e.what} @${e.at}`, samples: [] }; return { ok: false, line: `${name}: LOWER THROW ${(e as Error).message.slice(0, 80)}`, samples: [] };