From be5a3af9104ef573a5a56c6e901a42c05e79b4ef Mon Sep 17 00:00:00 2001 From: Mikhail Kotelnikov Date: Tue, 28 Apr 2026 15:48:08 +0200 Subject: [PATCH 1/3] feat(uri-graph): add @statewalker/uri-graph kernel + dual repo backends Introduce a persistent URI dependency graph kernel with workers as async generators and a single-writer orchestrator driving a fixpoint loop over two interchangeable repository backends: - MemoryGraphStore: in-memory state with abstract persistence (lock/load/store/unlock); ships FilesApi-JSON snapshot adapter and an in-process variant for tests. - SqlGraphStore: libSQL/Turso (Node + browser/OPFS) via @statewalker/db-api. Both backends pass one shared contract test suite (defineGraphStoreContract), guaranteeing isofunctional behaviour. End-to-end pipeline test runs the full file -> text -> chunk -> embedding -> index flow against both stores. Co-Authored-By: Claude Opus 4.7 (1M context) --- packages/uri-graph/.gitignore | 3 + packages/uri-graph/README.md | 63 ++++ packages/uri-graph/package.json | 53 +++ packages/uri-graph/src/browser.ts | 2 + .../uri-graph/src/graph/selector-helpers.ts | 79 +++++ packages/uri-graph/src/index.ts | 81 +++++ packages/uri-graph/src/node.ts | 2 + packages/uri-graph/src/orchestrator/drain.ts | 164 +++++++++ .../src/orchestrator/orchestrator.ts | 167 +++++++++ packages/uri-graph/src/store/contract.ts | 323 ++++++++++++++++++ .../src/store/memory/files-persistence.ts | 87 +++++ .../uri-graph/src/store/memory/persistence.ts | 22 ++ .../uri-graph/src/store/memory/snapshot.ts | 167 +++++++++ packages/uri-graph/src/store/memory/state.ts | 94 +++++ packages/uri-graph/src/store/memory/store.ts | 224 ++++++++++++ .../uri-graph/src/store/memory/transaction.ts | 133 ++++++++ packages/uri-graph/src/store/sql/schema.ts | 103 ++++++ packages/uri-graph/src/store/sql/store.ts | 264 ++++++++++++++ .../uri-graph/src/store/sql/transaction.ts | 153 +++++++++ .../uri-graph/src/store/sql/uri-intern.ts | 28 ++ 
packages/uri-graph/src/store/types.ts | 75 ++++ packages/uri-graph/src/types/update.ts | 19 ++ packages/uri-graph/src/types/worker.ts | 29 ++ packages/uri-graph/src/util/hash.ts | 19 ++ packages/uri-graph/src/workers/chunker.ts | 138 ++++++++ packages/uri-graph/src/workers/embedder.ts | 121 +++++++ .../uri-graph/src/workers/extractors/base.ts | 120 +++++++ .../src/workers/extractors/html-extractor.ts | 27 ++ .../workers/extractors/markdown-extractor.ts | 19 ++ .../extractors/plain-text-extractor.ts | 14 + .../uri-graph/src/workers/file-watcher.ts | 110 ++++++ .../src/workers/index-backends/memory-fts.ts | 75 ++++ .../workers/index-backends/memory-vector.ts | 46 +++ packages/uri-graph/src/workers/indexer.ts | 255 ++++++++++++++ packages/uri-graph/tests/e2e/pipeline.test.ts | 162 +++++++++ .../tests/graph/selector-helpers.test.ts | 170 +++++++++ packages/uri-graph/tests/helpers.ts | 12 + .../tests/orchestrator/drain.test.ts | 131 +++++++ .../tests/orchestrator/orchestrator.test.ts | 134 ++++++++ .../tests/orchestrator/status.test.ts | 43 +++ .../tests/store/memory-snapshot.test.ts | 151 ++++++++ packages/uri-graph/tests/store/memory.test.ts | 18 + packages/uri-graph/tests/store/sql.test.ts | 49 +++ packages/uri-graph/tests/types/update.test.ts | 50 +++ packages/uri-graph/tests/types/worker.test.ts | 69 ++++ .../uri-graph/tests/workers/chunker.test.ts | 97 ++++++ .../uri-graph/tests/workers/embedder.test.ts | 92 +++++ .../tests/workers/extractors.test.ts | 127 +++++++ .../tests/workers/file-watcher.test.ts | 130 +++++++ .../tests/workers/index-backends.test.ts | 54 +++ .../uri-graph/tests/workers/indexer.test.ts | 117 +++++++ packages/uri-graph/tsconfig.json | 17 + 52 files changed, 4902 insertions(+) create mode 100644 packages/uri-graph/.gitignore create mode 100644 packages/uri-graph/README.md create mode 100644 packages/uri-graph/package.json create mode 100644 packages/uri-graph/src/browser.ts create mode 100644 packages/uri-graph/src/graph/selector-helpers.ts 
create mode 100644 packages/uri-graph/src/index.ts create mode 100644 packages/uri-graph/src/node.ts create mode 100644 packages/uri-graph/src/orchestrator/drain.ts create mode 100644 packages/uri-graph/src/orchestrator/orchestrator.ts create mode 100644 packages/uri-graph/src/store/contract.ts create mode 100644 packages/uri-graph/src/store/memory/files-persistence.ts create mode 100644 packages/uri-graph/src/store/memory/persistence.ts create mode 100644 packages/uri-graph/src/store/memory/snapshot.ts create mode 100644 packages/uri-graph/src/store/memory/state.ts create mode 100644 packages/uri-graph/src/store/memory/store.ts create mode 100644 packages/uri-graph/src/store/memory/transaction.ts create mode 100644 packages/uri-graph/src/store/sql/schema.ts create mode 100644 packages/uri-graph/src/store/sql/store.ts create mode 100644 packages/uri-graph/src/store/sql/transaction.ts create mode 100644 packages/uri-graph/src/store/sql/uri-intern.ts create mode 100644 packages/uri-graph/src/store/types.ts create mode 100644 packages/uri-graph/src/types/update.ts create mode 100644 packages/uri-graph/src/types/worker.ts create mode 100644 packages/uri-graph/src/util/hash.ts create mode 100644 packages/uri-graph/src/workers/chunker.ts create mode 100644 packages/uri-graph/src/workers/embedder.ts create mode 100644 packages/uri-graph/src/workers/extractors/base.ts create mode 100644 packages/uri-graph/src/workers/extractors/html-extractor.ts create mode 100644 packages/uri-graph/src/workers/extractors/markdown-extractor.ts create mode 100644 packages/uri-graph/src/workers/extractors/plain-text-extractor.ts create mode 100644 packages/uri-graph/src/workers/file-watcher.ts create mode 100644 packages/uri-graph/src/workers/index-backends/memory-fts.ts create mode 100644 packages/uri-graph/src/workers/index-backends/memory-vector.ts create mode 100644 packages/uri-graph/src/workers/indexer.ts create mode 100644 packages/uri-graph/tests/e2e/pipeline.test.ts create mode 
100644 packages/uri-graph/tests/graph/selector-helpers.test.ts create mode 100644 packages/uri-graph/tests/helpers.ts create mode 100644 packages/uri-graph/tests/orchestrator/drain.test.ts create mode 100644 packages/uri-graph/tests/orchestrator/orchestrator.test.ts create mode 100644 packages/uri-graph/tests/orchestrator/status.test.ts create mode 100644 packages/uri-graph/tests/store/memory-snapshot.test.ts create mode 100644 packages/uri-graph/tests/store/memory.test.ts create mode 100644 packages/uri-graph/tests/store/sql.test.ts create mode 100644 packages/uri-graph/tests/types/update.test.ts create mode 100644 packages/uri-graph/tests/types/worker.test.ts create mode 100644 packages/uri-graph/tests/workers/chunker.test.ts create mode 100644 packages/uri-graph/tests/workers/embedder.test.ts create mode 100644 packages/uri-graph/tests/workers/extractors.test.ts create mode 100644 packages/uri-graph/tests/workers/file-watcher.test.ts create mode 100644 packages/uri-graph/tests/workers/index-backends.test.ts create mode 100644 packages/uri-graph/tests/workers/indexer.test.ts create mode 100644 packages/uri-graph/tsconfig.json diff --git a/packages/uri-graph/.gitignore b/packages/uri-graph/.gitignore new file mode 100644 index 0000000..06e6038 --- /dev/null +++ b/packages/uri-graph/.gitignore @@ -0,0 +1,3 @@ +node_modules +dist +*.tsbuildinfo diff --git a/packages/uri-graph/README.md b/packages/uri-graph/README.md new file mode 100644 index 0000000..3b5df79 --- /dev/null +++ b/packages/uri-graph/README.md @@ -0,0 +1,63 @@ +# @statewalker/uri-graph + +Persistent URI dependency graph kernel. Workers are async generators that consume `Update` streams and yield `Update` streams; a single-writer orchestrator drives them to a fixpoint over a persistent graph state. + +Two interchangeable repository backends implement one `GraphStore` interface: + +- `MemoryGraphStore` — in-memory state with an abstract persistence interface (`lock` / `load` / `store` / `unlock`). 
It ships with a JSON-snapshot adapter over `FilesApi` for filesystem persistence, or use the in-process variant for tests.
orch.registerWorker(createEmbedder({ graph: store, embed: yourEmbedFn })); +await orch.registerWorker(createIndexer({ graph: store, fts, vector })); + +const ac = new AbortController(); +process.on("SIGINT", () => ac.abort()); +await orch.start(ac.signal); +await db.close(); +``` + +## Browser / OPFS + +Same kernel, same workers. Swap `newNodeTursoDb` for `newBrowserTursoDb` (OPFS path) and `MemFilesApi` for an OPFS-backed `FilesApi`. `MemoryGraphStore` works in the browser too with the in-process persistence helper. + +## See also + +- Proposal: [openspec/changes/uri-dependency-graph-kernel/proposal.md](../../../../openspec/changes/uri-dependency-graph-kernel/proposal.md) +- Design: [openspec/changes/uri-dependency-graph-kernel/design.md](../../../../openspec/changes/uri-dependency-graph-kernel/design.md) +- Specs: [openspec/changes/uri-dependency-graph-kernel/specs/](../../../../openspec/changes/uri-dependency-graph-kernel/specs/) diff --git a/packages/uri-graph/package.json b/packages/uri-graph/package.json new file mode 100644 index 0000000..929f20f --- /dev/null +++ b/packages/uri-graph/package.json @@ -0,0 +1,53 @@ +{ + "name": "@statewalker/uri-graph", + "version": "0.1.0", + "private": false, + "type": "module", + "description": "Persistent URI dependency graph kernel: workers as async generators, single-writer orchestrator, two interchangeable repository backends (in-memory+JSON snapshot, SQL).", + "homepage": "https://github.com/statewalker/statewalker-content", + "author": { + "name": "Mikhail Kotelnikov", + "email": "mikhail.kotelnikov@gmail.com" + }, + "license": "MIT", + "repository": { + "type": "git", + "url": "git+ssh://git@github.com/statewalker/statewalker-content.git" + }, + "exports": { + ".": "./src/index.ts", + "./node": "./src/node.ts", + "./browser": "./src/browser.ts" + }, + "files": [ + "dist", + "src" + ], + "scripts": { + "build": "tsdown", + "dev": "tsdown --watch", + "test": "vitest run", + "test:watch": "vitest", + "typecheck": "tsc 
--noEmit", + "clean": "rimraf dist", + "lint": "biome check --write .", + "format": "biome format --write ." + }, + "dependencies": { + "@statewalker/db-api": "workspace:*", + "@statewalker/webrun-files": "catalog:" + }, + "devDependencies": { + "@statewalker/db-turso-node": "workspace:*", + "@statewalker/webrun-files-mem": "catalog:", + "@types/node": "catalog:", + "rimraf": "catalog:", + "tsdown": "catalog:", + "typescript": "catalog:", + "vitest": "catalog:" + }, + "sideEffects": false, + "publishConfig": { + "access": "public" + } +} diff --git a/packages/uri-graph/src/browser.ts b/packages/uri-graph/src/browser.ts new file mode 100644 index 0000000..f14757c --- /dev/null +++ b/packages/uri-graph/src/browser.ts @@ -0,0 +1,2 @@ +// Browser bootstrap helpers. Filled in once the orchestrator + workers land. +export {}; diff --git a/packages/uri-graph/src/graph/selector-helpers.ts b/packages/uri-graph/src/graph/selector-helpers.ts new file mode 100644 index 0000000..2206f8a --- /dev/null +++ b/packages/uri-graph/src/graph/selector-helpers.ts @@ -0,0 +1,79 @@ +import type { GraphReader, GraphStore } from "../store/types.js"; +import type { Update } from "../types/update.js"; + +export interface FindDirtyOptions { + forWorker: string; + forVersion: string; + uriLike: string; + limit: number; +} + +/** + * Yields `Update`s for URIs matching `uriLike` that the worker has NOT processed + * at its current version. Stops at `limit` URIs. + * + * Synthesizes one `Update` per matching URI from the URI's committed state. + * Sets `scope = uri` and `role = undefined`; multi-input workers should compose + * multiple `findDirty` calls via `joinInputs`. 
+ */ +export async function* findDirty( + graph: GraphStore, + opts: FindDirtyOptions, +): AsyncIterableIterator { + let yielded = 0; + for await (const view of graph.find(opts.uriLike)) { + if (yielded >= opts.limit) break; + const processed = await graph.isInputProcessed(opts.forWorker, opts.forVersion, view.uri); + if (processed) continue; + yielded += 1; + yield { + uri: view.uri, + stamp: view.stamp, + status: view.status, + hash: view.hash, + scope: view.uri, + attributes: view.attributes, + }; + } +} + +/** + * Merges multiple `Update` streams, yielding all updates ordered by `(scope, role, uri)`. + * Inputs SHOULD already be ordered by scope so the merge is k-way; otherwise the + * helper buffers and sorts which may use more memory. + */ +export async function* joinInputs( + ...streams: Array> +): AsyncIterableIterator { + const all: Update[] = []; + await Promise.all( + streams.map(async (s) => { + for await (const u of s) all.push(u); + }), + ); + all.sort((a, b) => { + const sa = a.scope ?? ""; + const sb = b.scope ?? ""; + if (sa !== sb) return sa < sb ? -1 : 1; + const ra = a.role ?? ""; + const rb = b.role ?? ""; + if (ra !== rb) return ra < rb ? -1 : 1; + return a.uri < b.uri ? -1 : a.uri > b.uri ? 1 : 0; + }); + yield* all; +} + +/** + * A trivially-empty selector. Useful for source workers that need a non-empty + * tick selector to be polled by the orchestrator. Yields a single sentinel update. + */ +export async function* singleTickSelector(workerName: string): AsyncIterableIterator { + yield { + uri: `tick://${workerName}`, + stamp: 0, + status: "updated", + }; +} + +// Re-export GraphReader so consumers writing custom selectors can type their context. 
+export type { GraphReader }; diff --git a/packages/uri-graph/src/index.ts b/packages/uri-graph/src/index.ts new file mode 100644 index 0000000..bc3041d --- /dev/null +++ b/packages/uri-graph/src/index.ts @@ -0,0 +1,81 @@ +// Core types + +// Selector helpers +export { + type FindDirtyOptions, + findDirty, + joinInputs, + singleTickSelector, +} from "./graph/selector-helpers.js"; +export { type DrainOptions, type DrainResult, drain } from "./orchestrator/drain.js"; +// Orchestrator +export { + createOrchestrator, + type Orchestrator, + type OrchestratorOptions, + type OrchestratorStatusReport, +} from "./orchestrator/orchestrator.js"; +// Store interfaces and contract +export { + defineGraphStoreContract, + type GraphStoreHarness, + type GraphStoreHarnessFactory, +} from "./store/contract.js"; + +// Memory store +export { + createFilesPersistence, + createInMemoryPersistence, +} from "./store/memory/files-persistence.js"; +export type { Dump, LockId, MemoryPersistence } from "./store/memory/persistence.js"; +export { + MemoryGraphStore, + type MemoryGraphStoreOptions, +} from "./store/memory/store.js"; + +// SQL store +export { SqlGraphStore, type SqlGraphStoreOptions } from "./store/sql/store.js"; +export { + type BeginTransactionOpts, + type GraphReader, + type GraphStore, + type GraphTransaction, + openGraphStore, + type RecoverOrphansResult, + type RegisterWorkerInput, + type RegisterWorkerResult, +} from "./store/types.js"; +export type { + ReadOnlyView, + Status, + Update, +} from "./types/update.js"; +export type { + Selector, + SelectorContext, + WorkerDefinition, + WorkerParams, +} from "./types/worker.js"; +// Utilities +export { sha256Hex } from "./util/hash.js"; +// Workers +export { type ChunkerOptions, createChunker } from "./workers/chunker.js"; +export { createEmbedder, type EmbedderOptions } from "./workers/embedder.js"; +export { createHtmlExtractor } from "./workers/extractors/html-extractor.js"; +export { createMarkdownExtractor } from 
"./workers/extractors/markdown-extractor.js"; +export { createPlainTextExtractor } from "./workers/extractors/plain-text-extractor.js"; +export { + createFileWatcher, + type FileWatcherOptions, +} from "./workers/file-watcher.js"; +export { + createMemoryFtsBackend, + type FtsBackend, + type FtsHit, +} from "./workers/index-backends/memory-fts.js"; +export { + createMemoryVectorBackend, + type VectorBackend, + type VectorHit, +} from "./workers/index-backends/memory-vector.js"; +export { createIndexer, type IndexerOptions } from "./workers/indexer.js"; diff --git a/packages/uri-graph/src/node.ts b/packages/uri-graph/src/node.ts new file mode 100644 index 0000000..e46a3f3 --- /dev/null +++ b/packages/uri-graph/src/node.ts @@ -0,0 +1,2 @@ +// Node bootstrap helpers. Filled in once the orchestrator + workers land. +export {}; diff --git a/packages/uri-graph/src/orchestrator/drain.ts b/packages/uri-graph/src/orchestrator/drain.ts new file mode 100644 index 0000000..9247009 --- /dev/null +++ b/packages/uri-graph/src/orchestrator/drain.ts @@ -0,0 +1,164 @@ +import type { GraphStore, GraphTransaction } from "../store/types.js"; +import type { Update } from "../types/update.js"; +import type { WorkerDefinition, WorkerParams } from "../types/worker.js"; + +export interface DrainOptions { + /** Yield to the event loop after every N committed updates. */ + yieldEveryN?: number; + /** Warn if a logical transaction stays open longer than this many ms. */ + txnWarnMs?: number; + /** Hook for warnings (used in tests). */ + onWarn?: (msg: string) => void; + /** AbortSignal forwarded to the worker's run. */ + signal?: AbortSignal; +} + +export interface DrainResult { + /** Number of commits that produced at least one non-noop write. */ + committedWithChanges: number; + /** Number of commits regardless of changes. 
*/ + commits: number; +} + +/** + * Drives a `WorkerDefinition.run` to completion against an `input` stream: + * - opens a logical transaction at the first yield of a new stamp, + * - applies every same-stamp update under that transaction, + * - commits at the stamp boundary and opens the next, + * - rolls back on generator throw, + * - asserts stamp monotonicity per generator invocation, + * - records every consumed input into the current run. + * + * Returns when the generator exhausts (success) or throws (error rethrown). + */ +export async function drain( + worker: WorkerDefinition, + input: AsyncIterable, + graph: GraphStore, + opts: DrainOptions = {}, +): Promise { + const yieldEveryN = opts.yieldEveryN ?? 100; + const txnWarnMs = opts.txnWarnMs ?? 200; + const warn = opts.onWarn ?? ((m) => console.warn(m)); + const signal = opts.signal ?? new AbortController().signal; + + // Tee input so we record every consumed update against the current run. + const consumed: Update[] = []; + async function* teeInput(): AsyncGenerator { + for await (const u of input) { + consumed.push(u); + yield u; + } + } + + const params: WorkerParams = { + stamp: () => graph.mintStamp(), + read: (uri) => graph.getState(uri), + find: (pattern) => graph.find(pattern), + priorOutputs: (uri) => graph.priorOutputs(worker.name, uri), + recordRead: (uri, role) => { + consumed.push({ + uri, + stamp: 0, + status: "updated", + ...(role !== undefined ? { role } : {}), + }); + }, + signal, + }; + + let txn: GraphTransaction | undefined; + let currentStamp: number | undefined; + let txnOpenedAt = 0; + let yieldedCount = 0; + + // Inputs consumed since last commit; flushed atomically with the commit. 
+ const consumedAtCommit: Update[] = []; + + async function openTransaction(forStamp: number): Promise { + txn = await graph.beginTransaction({ + worker: worker.name, + version: worker.version, + scope: null, + initialStamp: forStamp, + }); + txnOpenedAt = performance.now(); + } + + async function commitCurrent(): Promise { + if (!txn) return; + if (consumedAtCommit.length > 0) { + await txn.recordInputs( + consumedAtCommit.map((u) => ({ + uri: u.uri, + observedStamp: u.stamp, + ...(u.role !== undefined ? { role: u.role } : {}), + })), + ); + consumedAtCommit.length = 0; + } + const elapsed = performance.now() - txnOpenedAt; + if (elapsed > txnWarnMs) { + warn(`${worker.name} stamp ${currentStamp} held logical txn ${elapsed.toFixed(0)}ms`); + } + await txn.commit(); + txn = undefined; + } + + async function rollbackCurrent(): Promise { + if (!txn) return; + try { + await txn.rollback(); + } finally { + txn = undefined; + } + } + + const gen = worker.run(params, teeInput()); + let commits = 0; + try { + while (true) { + const next = await gen.next(); + if (next.done) break; + const u = next.value; + + // Stamp regression guard. + if (currentStamp !== undefined && u.stamp < currentStamp) { + throw new Error(`stamp regression in ${worker.name}: ${currentStamp} → ${u.stamp}`); + } + + // Stamp boundary: close prior batch, open new one. + if (currentStamp !== undefined && u.stamp !== currentStamp) { + await commitCurrent(); + commits += 1; + } + if (!txn) { + await openTransaction(u.stamp); + } + // Always move newly-consumed inputs into the active batch. 
+ consumedAtCommit.push(...consumed); + consumed.length = 0; + + currentStamp = u.stamp; + if (!txn) throw new Error("internal: txn missing"); + await txn.applyUpdate(u); + + yieldedCount += 1; + if (yieldedCount % yieldEveryN === 0) { + await new Promise((r) => setImmediate(r)); + } + } + + if (txn) { + await commitCurrent(); + commits += 1; + } + } catch (err) { + await rollbackCurrent(); + throw err; + } + return { + commits, + committedWithChanges: yieldedCount > 0 ? commits : 0, + }; +} diff --git a/packages/uri-graph/src/orchestrator/orchestrator.ts b/packages/uri-graph/src/orchestrator/orchestrator.ts new file mode 100644 index 0000000..b85c160 --- /dev/null +++ b/packages/uri-graph/src/orchestrator/orchestrator.ts @@ -0,0 +1,167 @@ +import type { GraphStore } from "../store/types.js"; +import type { Update } from "../types/update.js"; +import type { WorkerDefinition } from "../types/worker.js"; +import { type DrainOptions, drain } from "./drain.js"; + +export interface OrchestratorOptions { + graph: GraphStore; + pollMs?: number; + selectorBatchSize?: number; + txnWarnMs?: number; + yieldEveryN?: number; + onWarn?: (msg: string) => void; + /** Optional logger for run failures. Defaults to console.error. */ + onRunError?: (workerName: string, err: unknown) => void; +} + +export interface OrchestratorStatusReport { + running: boolean; + workers: Array<{ + name: string; + version: string; + }>; +} + +export interface Orchestrator { + registerWorker(def: WorkerDefinition): Promise; + start(signal?: AbortSignal): Promise; + stop(): Promise; + status(): Promise; +} + +export function createOrchestrator(opts: OrchestratorOptions): Orchestrator { + const pollMs = opts.pollMs ?? 200; + const selectorBatchSize = opts.selectorBatchSize ?? 100; + const drainOpts: DrainOptions = { + yieldEveryN: opts.yieldEveryN, + txnWarnMs: opts.txnWarnMs, + onWarn: opts.onWarn, + }; + const onRunError = opts.onRunError ?? 
((name, e) => console.error(`worker ${name} failed:`, e)); + + const workers: WorkerDefinition[] = []; + let running = false; + let internalSignal: AbortController | undefined; + + async function pollOnce(signal: AbortSignal): Promise { + let advanced = false; + for (const w of workers) { + if (signal.aborted) return advanced; + const cursor = w.selector({ + workerName: w.name, + workerVersion: w.version, + limit: selectorBatchSize, + }); + const stream = await drainIfNonEmpty(cursor); + if (!stream) continue; + try { + const result = await drain(w, stream, opts.graph, { + ...drainOpts, + signal, + }); + // Only treat as progress when the worker actually committed real outputs. + // A worker whose run() consumes a sentinel tick and yields nothing must + // not loop the orchestrator forever. + if (result.committedWithChanges > 0) advanced = true; + } catch (err) { + onRunError(w.name, err); + } + } + return advanced; + } + + return { + async registerWorker(def: WorkerDefinition): Promise { + await opts.graph.registerWorker({ + name: def.name, + version: def.version, + description: def.description, + inputPattern: def.inputPattern, + outputPattern: def.outputPattern, + scopeExpr: def.scopeExpr, + }); + workers.push(def); + }, + async start(signal?: AbortSignal): Promise { + if (running) return; + running = true; + internalSignal = new AbortController(); + const composedSignal = mergeSignals(signal, internalSignal.signal); + + try { + while (!composedSignal.aborted) { + const advanced = await pollOnce(composedSignal); + if (composedSignal.aborted) break; + if (!advanced) { + await sleep(pollMs, composedSignal); + } + } + } finally { + running = false; + } + }, + async stop(): Promise { + internalSignal?.abort(); + }, + async status(): Promise { + return { + running, + workers: workers.map((w) => ({ name: w.name, version: w.version })), + }; + }, + }; +} + +async function drainIfNonEmpty( + it: AsyncIterableIterator, +): Promise | null> { + const first = await 
it.next(); + if (first.done) { + if (it.return) await it.return(undefined); + return null; + } + async function* prepended(): AsyncIterableIterator { + yield first.value; + while (true) { + const n = await it.next(); + if (n.done) return; + yield n.value; + } + } + return prepended(); +} + +function sleep(ms: number, signal: AbortSignal): Promise { + return new Promise((resolve) => { + if (signal.aborted) { + resolve(); + return; + } + const t = setTimeout(() => { + signal.removeEventListener("abort", onAbort); + resolve(); + }, ms); + const onAbort = (): void => { + clearTimeout(t); + signal.removeEventListener("abort", onAbort); + resolve(); + }; + signal.addEventListener("abort", onAbort, { once: true }); + }); +} + +function mergeSignals(...signals: Array): AbortSignal { + const ctrl = new AbortController(); + for (const s of signals) { + if (!s) continue; + if (s.aborted) { + ctrl.abort(); + return ctrl.signal; + } + s.addEventListener("abort", () => ctrl.abort(), { once: true }); + } + return ctrl.signal; +} + +// Re-export Update for consumers writing tests against the orchestrator. +export type { Update }; diff --git a/packages/uri-graph/src/store/contract.ts b/packages/uri-graph/src/store/contract.ts new file mode 100644 index 0000000..a80d3d1 --- /dev/null +++ b/packages/uri-graph/src/store/contract.ts @@ -0,0 +1,323 @@ +import { beforeEach, describe, expect, it } from "vitest"; +import type { Update } from "../types/update.js"; +import type { GraphStore } from "./types.js"; + +/** + * Shape returned by a backend's contract harness factory. Each call to `open()` opens + * a store against the same backing (so persistence-boundary tests can re-open the + * same data). `abandon(store)` simulates a crash by closing the store without committing + * any in-flight transaction. 
+ */ +export interface GraphStoreHarness { + open(): Promise; + close(store: GraphStore): Promise; +} + +export type GraphStoreHarnessFactory = () => GraphStoreHarness; + +/** + * Drains an `AsyncIterable` into an array. Helper for `find`-style scenarios. + */ +async function drain(iterable: AsyncIterable): Promise { + const out: T[] = []; + for await (const x of iterable) out.push(x); + return out; +} + +export function defineGraphStoreContract(name: string, factory: GraphStoreHarnessFactory): void { + describe(`GraphStore contract: ${name}`, () => { + let harness: GraphStoreHarness; + let store: GraphStore; + + beforeEach(async () => { + harness = factory(); + store = await harness.open(); + }); + + describe("reads", () => { + it("getState returns null for unknown URI", async () => { + expect(await store.getState("file:///unknown")).toBeNull(); + }); + + it("find yields matching URIs", async () => { + await store.registerWorker({ name: "seed", version: "v1" }); + const stamp1 = await store.mintStamp(); + const txn = await store.beginTransaction({ + worker: "seed", + version: "v1", + scope: null, + initialStamp: stamp1, + }); + for (const path of ["a.md", "b.md", "c.txt"]) { + await txn.applyUpdate({ + uri: `file:///${path}`, + stamp: stamp1, + status: "added", + hash: `h:${path}`, + }); + } + await txn.commit(); + const matches = await drain(store.find("file:///%.md")); + expect(matches.map((m) => m.uri).sort()).toEqual(["file:///a.md", "file:///b.md"]); + }); + + it("priorOutputs returns last successful run outputs for an input", async () => { + await store.registerWorker({ name: "ext", version: "v1" }); + const inputUri = "file:///x.md"; + const inputStamp = await store.mintStamp(); + const seed = await store.beginTransaction({ + worker: "ext", + version: "v1", + scope: null, + initialStamp: inputStamp, + }); + await seed.applyUpdate({ + uri: inputUri, + stamp: inputStamp, + status: "added", + hash: "ih", + }); + await seed.commit(); + + const outStamp = await 
store.mintStamp(); + const txn = await store.beginTransaction({ + worker: "ext", + version: "v1", + scope: inputUri, + initialStamp: outStamp, + }); + await txn.recordInputs([{ uri: inputUri, observedStamp: inputStamp }]); + await txn.applyUpdate({ + uri: "text:///x.md", + stamp: outStamp, + status: "added", + hash: "th", + }); + await txn.commit(); + + const prior = await store.priorOutputs("ext", inputUri); + expect(prior.map((p) => p.uri)).toEqual(["text:///x.md"]); + }); + }); + + describe("logical transaction lifecycle", () => { + it("commit promotes staged updates and clears staging", async () => { + await store.registerWorker({ name: "w", version: "v1" }); + const s = await store.mintStamp(); + const txn = await store.beginTransaction({ + worker: "w", + version: "v1", + scope: null, + initialStamp: s, + }); + await txn.applyUpdate({ uri: "u://a", stamp: s, status: "added", hash: "1" }); + await txn.applyUpdate({ uri: "u://b", stamp: s, status: "added", hash: "2" }); + await txn.commit(); + expect((await store.getState("u://a"))?.stamp).toBe(s); + expect((await store.getState("u://b"))?.stamp).toBe(s); + }); + + it("rollback discards staged updates", async () => { + await store.registerWorker({ name: "w", version: "v1" }); + const s = await store.mintStamp(); + const txn = await store.beginTransaction({ + worker: "w", + version: "v1", + scope: null, + initialStamp: s, + }); + await txn.applyUpdate({ uri: "u://a", stamp: s, status: "added", hash: "1" }); + await txn.applyUpdate({ uri: "u://b", stamp: s, status: "added", hash: "2" }); + await txn.rollback(); + expect(await store.getState("u://a")).toBeNull(); + expect(await store.getState("u://b")).toBeNull(); + }); + + it("reuse after commit throws", async () => { + await store.registerWorker({ name: "w", version: "v1" }); + const s = await store.mintStamp(); + const txn = await store.beginTransaction({ + worker: "w", + version: "v1", + scope: null, + initialStamp: s, + }); + await txn.commit(); + await 
expect( + txn.applyUpdate({ uri: "u://a", stamp: s, status: "added" }), + ).rejects.toThrow(); + }); + + it("reuse after rollback throws", async () => { + await store.registerWorker({ name: "w", version: "v1" }); + const s = await store.mintStamp(); + const txn = await store.beginTransaction({ + worker: "w", + version: "v1", + scope: null, + initialStamp: s, + }); + await txn.rollback(); + await expect(txn.commit()).rejects.toThrow(); + }); + + it("commit twice throws", async () => { + await store.registerWorker({ name: "w", version: "v1" }); + const s = await store.mintStamp(); + const txn = await store.beginTransaction({ + worker: "w", + version: "v1", + scope: null, + initialStamp: s, + }); + await txn.commit(); + await expect(txn.commit()).rejects.toThrow(); + }); + }); + + describe("no-op rule", () => { + it("identical content does not bump the stamp", async () => { + await store.registerWorker({ name: "w", version: "v1" }); + const s1 = await store.mintStamp(); + const txn1 = await store.beginTransaction({ + worker: "w", + version: "v1", + scope: null, + initialStamp: s1, + }); + await txn1.applyUpdate({ + uri: "u://x", + stamp: s1, + status: "added", + hash: "h", + }); + await txn1.commit(); + + const s2 = await store.mintStamp(); + const txn2 = await store.beginTransaction({ + worker: "w", + version: "v1", + scope: null, + initialStamp: s2, + }); + await txn2.applyUpdate({ + uri: "u://x", + stamp: s2, + status: "added", + hash: "h", + }); + await txn2.commit(); + + expect((await store.getState("u://x"))?.stamp).toBe(s1); + }); + + it("changed content bumps the stamp", async () => { + await store.registerWorker({ name: "w", version: "v1" }); + const s1 = await store.mintStamp(); + const txn1 = await store.beginTransaction({ + worker: "w", + version: "v1", + scope: null, + initialStamp: s1, + }); + await txn1.applyUpdate({ + uri: "u://x", + stamp: s1, + status: "added", + hash: "h1", + }); + await txn1.commit(); + + const s2 = await store.mintStamp(); + const 
txn2 = await store.beginTransaction({ + worker: "w", + version: "v1", + scope: null, + initialStamp: s2, + }); + await txn2.applyUpdate({ + uri: "u://x", + stamp: s2, + status: "updated", + hash: "h2", + }); + await txn2.commit(); + + expect((await store.getState("u://x"))?.stamp).toBe(s2); + }); + }); + + describe("stamps", () => { + it("two consecutive stamps differ", async () => { + const a = await store.mintStamp(); + const b = await store.mintStamp(); + expect(b).toBeGreaterThan(a); + }); + + it("ten concurrent stamps are distinct", async () => { + const stamps = await Promise.all(Array.from({ length: 10 }, () => store.mintStamp())); + const set = new Set(stamps); + expect(set.size).toBe(10); + }); + }); + + describe("worker registry", () => { + it("register same name and version is idempotent", async () => { + const r1 = await store.registerWorker({ name: "w", version: "v1" }); + const r2 = await store.registerWorker({ name: "w", version: "v1" }); + expect(r1.versionChanged).toBe(true); // first register is technically a change + expect(r2.versionChanged).toBe(false); + }); + + it("register with bumped version reports versionChanged", async () => { + await store.registerWorker({ name: "w", version: "v1" }); + const r = await store.registerWorker({ name: "w", version: "v2" }); + expect(r.versionChanged).toBe(true); + }); + }); + + describe("recovery", () => { + it("recoverOrphans is a no-op when no running runs exist", async () => { + const result = await store.recoverOrphans(); + expect(result.cancelled).toBe(0); + expect(result.pendingRowsDropped).toBe(0); + }); + + it("a crashed run leaves no committed state on next open", async () => { + await store.registerWorker({ name: "w", version: "v1" }); + const s = await store.mintStamp(); + const txn = await store.beginTransaction({ + worker: "w", + version: "v1", + scope: null, + initialStamp: s, + }); + await txn.applyUpdate({ + uri: "u://x", + stamp: s, + status: "added", + hash: "h", + }); + // Don't commit. 
Force-close. + await harness.close(store); + + // Reopen — `openGraphStore` runs recoverOrphans automatically. + store = await harness.open(); + expect(await store.getState("u://x")).toBeNull(); + // A subsequent recoverOrphans call is a no-op (idempotent). + const second = await store.recoverOrphans(); + expect(second.cancelled).toBe(0); + }); + }); + }); +} + +/** Minimal fixture for tests that just need to seed updates. */ +export function buildSeedUpdates(uris: string[], stamp: number): Update[] { + return uris.map((uri) => ({ + uri, + stamp, + status: "added" as const, + hash: `h:${uri}`, + })); +} diff --git a/packages/uri-graph/src/store/memory/files-persistence.ts b/packages/uri-graph/src/store/memory/files-persistence.ts new file mode 100644 index 0000000..424eb7a --- /dev/null +++ b/packages/uri-graph/src/store/memory/files-persistence.ts @@ -0,0 +1,87 @@ +import type { FilesApi } from "@statewalker/webrun-files"; +import { readText, writeText } from "@statewalker/webrun-files"; +import type { Dump, LockId, MemoryPersistence } from "./persistence.js"; + +/** + * Process-local lock map: rejects a second `lock(key)` against the same + * `(files, key)` while a prior LockId is still outstanding. + */ +const locks = new WeakMap>(); + +function acquireLock(files: FilesApi, key: string): LockId { + let map = locks.get(files); + if (!map) { + map = new Map(); + locks.set(files, map); + } + if (map.has(key)) { + throw new Error(`already open at ${key}`); + } + const id: LockId = `${key}@${Math.random().toString(36).slice(2, 10)}`; + map.set(key, id); + return id; +} + +function releaseLock(files: FilesApi, key: string, id: LockId): void { + const map = locks.get(files); + if (!map) return; + if (map.get(key) === id) map.delete(key); +} + +/** + * Produce a `MemoryPersistence` that stores the dump as a JSON file inside the + * given `FilesApi` at `path`. Suitable for Node + browser (OPFS) wiring. 
 *
 * Publish sequence: write the dump to `<path>.tmp`, remove the prior
 * target, then move the temp file into place. NOTE(review): this is not
 * fully atomic — if the process crashes between `remove` and `move` there
 * is no file at `path` at all; the fresh dump survives only at
 * `<path>.tmp`. If `FilesApi.move` can overwrite its destination, dropping
 * the `remove` step would close that window — TODO confirm move semantics.
 */
export function createFilesPersistence(files: FilesApi, path: string): MemoryPersistence {
  return {
    key: path,
    async lock(key) {
      // Process-local single-writer guard; throws if `key` is already held.
      return acquireLock(files, key);
    },
    async load() {
      // A missing or blank file means "no snapshot yet".
      if (!(await files.exists(path))) return null;
      const text = await readText(files, path);
      if (!text.trim()) return null;
      return JSON.parse(text) as Dump;
    },
    async store(_id, dump) {
      const tmp = `${path}.tmp`;
      await writeText(files, tmp, JSON.stringify(dump));
      // See the publish-sequence note above: remove-then-move leaves a
      // window in which neither file exists at `path`.
      if (await files.exists(path)) await files.remove(path);
      await files.move(tmp, path);
    },
    async unlock(id) {
      releaseLock(files, path, id);
    },
  };
}

/**
 * Produce a `MemoryPersistence` that keeps the dump in process memory only.
 * Useful for tests that don't need durability across restarts.
 */
export function createInMemoryPersistence(key = "graph"): MemoryPersistence {
  let dump: Dump | null = null;
  let activeLock: LockId | null = null;
  return {
    key,
    async lock(k) {
      // Single in-process lock; a second open while held is rejected.
      if (activeLock) throw new Error(`already open at ${k}`);
      activeLock = `${k}@local`;
      return activeLock;
    },
    async load() {
      return dump;
    },
    async store(_id, value) {
      dump = value;
    },
    async unlock(_id) {
      activeLock = null;
    },
  };
}
diff --git a/packages/uri-graph/src/store/memory/persistence.ts b/packages/uri-graph/src/store/memory/persistence.ts
new file mode 100644
index 0000000..5ec2e54
--- /dev/null
+++ b/packages/uri-graph/src/store/memory/persistence.ts
@@ -0,0 +1,22 @@
/**
 * Persistence interface for `MemoryGraphStore`. Decouples the in-memory store
 * from any specific filesystem or storage layer.
 *
 * The `key` field identifies this store within the persistence layer's
 * namespace.
The store calls `lock(key)` once at open time to acquire a + * `LockId`, then uses that id for every subsequent `load` / `store` / + * `unlock` call. + * + * A second `lock(key)` call against the same key while a prior LockId is still + * outstanding SHOULD reject (single-writer guarantee). + */ +export type Dump = unknown; +export type LockId = string; + +export interface MemoryPersistence { + key: string; + lock: (key: string) => Promise; + load: (id: LockId) => Promise; + store: (id: LockId, dump: Dump) => Promise; + unlock: (id: LockId) => Promise; +} diff --git a/packages/uri-graph/src/store/memory/snapshot.ts b/packages/uri-graph/src/store/memory/snapshot.ts new file mode 100644 index 0000000..ae6e153 --- /dev/null +++ b/packages/uri-graph/src/store/memory/snapshot.ts @@ -0,0 +1,167 @@ +import type { Dump } from "./persistence.js"; +import { + createEmptyState, + type RunInputRow, + type RunOutputRow, + type RunRow, + type State, + type UriStateEntry, + type WorkerRegistryEntry, +} from "./state.js"; + +interface SerializedUriState { + uri: string; + status: UriStateEntry["status"]; + stamp: number; + hash?: string; + attributes?: Record; +} + +interface SerializedRun extends Omit { + id: number; + inputs: Array<{ uri: string; role: string | null; observedStamp: number }>; + outputs: Array<{ uri: string; writtenStamp: number; wasNoop: boolean }>; +} + +export interface Snapshot { + schemaVersion: 1; + uris: Array<{ id: number; text: string }>; + state: SerializedUriState[]; + runs: SerializedRun[]; + workers: WorkerRegistryEntry[]; + stampSeq: number; + nextUriId: number; + nextRunId: number; +} + +export function serialize(state: State): Snapshot { + const liveUriIds = new Set(); + for (const uriId of state.uriState.keys()) liveUriIds.add(uriId); + for (const run of state.runs.values()) { + for (const ri of state.runInput.get(run.id) ?? []) liveUriIds.add(ri.uriId); + for (const ro of state.runOutput.get(run.id) ?? 
[]) liveUriIds.add(ro.uriId); + } + + const stateRows: SerializedUriState[] = []; + for (const [uriId, entry] of state.uriState) { + const text = state.uriById.get(uriId); + if (text === undefined) continue; + stateRows.push({ + uri: text, + status: entry.status, + stamp: entry.stamp, + hash: entry.hash, + attributes: entry.attributes, + }); + } + stateRows.sort((a, b) => a.uri.localeCompare(b.uri)); + + const runs: SerializedRun[] = []; + for (const run of state.runs.values()) { + const inputs: SerializedRun["inputs"] = []; + for (const ri of state.runInput.get(run.id) ?? []) { + const text = state.uriById.get(ri.uriId); + if (text === undefined) continue; + inputs.push({ uri: text, role: ri.role, observedStamp: ri.observedStamp }); + } + const outputs: SerializedRun["outputs"] = []; + for (const ro of state.runOutput.get(run.id) ?? []) { + const text = state.uriById.get(ro.uriId); + if (text === undefined) continue; + outputs.push({ + uri: text, + writtenStamp: ro.writtenStamp, + wasNoop: ro.wasNoop, + }); + } + runs.push({ ...run, inputs, outputs }); + } + runs.sort((a, b) => a.id - b.id); + + const uris: Snapshot["uris"] = []; + for (const [id, text] of state.uriById) { + if (!liveUriIds.has(id)) continue; + uris.push({ id, text }); + } + uris.sort((a, b) => a.id - b.id); + + const workers: WorkerRegistryEntry[] = []; + for (const w of state.workers.values()) workers.push(w); + workers.sort((a, b) => a.name.localeCompare(b.name)); + + return { + schemaVersion: 1, + uris, + state: stateRows, + runs, + workers, + stampSeq: state.stampSeq, + nextUriId: state.nextUriId, + nextRunId: state.nextRunId, + }; +} + +export function deserialize(snapshot: Snapshot): State { + if (snapshot.schemaVersion !== 1) { + throw new Error(`Unknown snapshot schemaVersion: ${snapshot.schemaVersion}`); + } + const state = createEmptyState(); + state.stampSeq = snapshot.stampSeq; + state.nextUriId = snapshot.nextUriId; + state.nextRunId = snapshot.nextRunId; + + for (const u of 
snapshot.uris) { + state.uriById.set(u.id, u.text); + state.uriIdByText.set(u.text, u.id); + } + for (const row of snapshot.state) { + const id = state.uriIdByText.get(row.uri); + if (id === undefined) continue; + state.uriState.set(id, { + status: row.status, + stamp: row.stamp, + hash: row.hash, + attributes: row.attributes, + }); + } + for (const run of snapshot.runs) { + const { inputs, outputs, ...rest } = run; + state.runs.set(run.id, rest); + const inputRows: RunInputRow[] = []; + for (const i of inputs) { + const id = state.uriIdByText.get(i.uri); + if (id === undefined) continue; + inputRows.push({ + uriId: id, + role: i.role, + observedStamp: i.observedStamp, + }); + } + state.runInput.set(run.id, inputRows); + const outputRows: RunOutputRow[] = []; + for (const o of outputs) { + const id = state.uriIdByText.get(o.uri); + if (id === undefined) continue; + outputRows.push({ + uriId: id, + writtenStamp: o.writtenStamp, + wasNoop: o.wasNoop, + }); + } + state.runOutput.set(run.id, outputRows); + } + for (const w of snapshot.workers) { + state.workers.set(w.name, w); + } + return state; +} + +export function dumpFromState(state: State): Dump { + return serialize(state) as unknown as Dump; +} + +export function stateFromDump(dump: Dump | null): State { + if (dump === null || dump === undefined) return createEmptyState(); + // Validate shape minimally; deserialize will throw on schema mismatch. 
+ return deserialize(dump as Snapshot); +} diff --git a/packages/uri-graph/src/store/memory/state.ts b/packages/uri-graph/src/store/memory/state.ts new file mode 100644 index 0000000..b5cdad8 --- /dev/null +++ b/packages/uri-graph/src/store/memory/state.ts @@ -0,0 +1,94 @@ +import type { Status } from "../../types/update.js"; + +export interface UriStateEntry { + status: Status; + stamp: number; + hash?: string; + attributes?: Record; +} + +export interface PendingEntry extends UriStateEntry { + uriId: number; +} + +export type RunOutcome = "running" | "success" | "cancelled" | "error"; + +export interface RunRow { + id: number; + action: string; + actionVersion: string; + scope: string | null; + stamp: number; + startedAt: number; + finishedAt: number | null; + outcome: RunOutcome; +} + +export interface RunInputRow { + uriId: number; + role: string | null; + observedStamp: number; +} + +export interface RunOutputRow { + uriId: number; + writtenStamp: number; + wasNoop: boolean; +} + +export interface WorkerRegistryEntry { + name: string; + version: string; + description: string | null; + inputPattern: string | null; + outputPattern: string | null; + scopeExpr: string | null; + registeredAt: number; + lastRunAt: number | null; +} + +/** Internal state kept in memory by `MemoryGraphStore`. 
*/ +export interface State { + schemaVersion: 1; + uriById: Map; + uriIdByText: Map; + nextUriId: number; + uriState: Map; + pending: Map>; // runId → uriId → entry + runs: Map; + nextRunId: number; + runInput: Map; + runOutput: Map; + workers: Map; + stampSeq: number; +} + +export function createEmptyState(): State { + return { + schemaVersion: 1, + uriById: new Map(), + uriIdByText: new Map(), + nextUriId: 1, + uriState: new Map(), + pending: new Map(), + runs: new Map(), + nextRunId: 1, + runInput: new Map(), + runOutput: new Map(), + workers: new Map(), + stampSeq: 0, + }; +} + +export function internUri(state: State, text: string): number { + const existing = state.uriIdByText.get(text); + if (existing !== undefined) return existing; + const id = state.nextUriId++; + state.uriById.set(id, text); + state.uriIdByText.set(text, id); + return id; +} + +export function getUriId(state: State, text: string): number | undefined { + return state.uriIdByText.get(text); +} diff --git a/packages/uri-graph/src/store/memory/store.ts b/packages/uri-graph/src/store/memory/store.ts new file mode 100644 index 0000000..8a52e7a --- /dev/null +++ b/packages/uri-graph/src/store/memory/store.ts @@ -0,0 +1,224 @@ +import type { ReadOnlyView } from "../../types/update.js"; +import type { + BeginTransactionOpts, + GraphStore, + GraphTransaction, + RecoverOrphansResult, + RegisterWorkerInput, + RegisterWorkerResult, +} from "../types.js"; +import type { LockId, MemoryPersistence } from "./persistence.js"; +import { dumpFromState, stateFromDump } from "./snapshot.js"; +import { createEmptyState, internUri, type State } from "./state.js"; +import { MemoryTransaction } from "./transaction.js"; + +export type MemoryGraphStoreOptions = MemoryPersistence; + +function uriMatchesLikePattern(text: string, pattern: string): boolean { + // Translate SQL LIKE pattern (% any, _ one) to a RegExp. 
+ let re = "^"; + for (const ch of pattern) { + if (ch === "%") re += ".*"; + else if (ch === "_") re += "."; + else re += ch.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"); + } + re += "$"; + return new RegExp(re).test(text); +} + +export class MemoryGraphStore implements GraphStore { + private persistence: MemoryPersistence; + private state: State = createEmptyState(); + private lockId: LockId | null = null; + private initialized = false; + private closed = false; + + constructor(options: MemoryGraphStoreOptions) { + this.persistence = options; + } + + async initialize(): Promise { + if (this.initialized) return; + this.lockId = await this.persistence.lock(this.persistence.key); + const dump = await this.persistence.load(this.lockId); + this.state = stateFromDump(dump); + this.initialized = true; + } + + async close(): Promise { + if (this.closed) return; + this.closed = true; + if (this.lockId !== null) { + await this.persistence.unlock(this.lockId); + this.lockId = null; + } + } + + private async flush(): Promise { + if (this.lockId === null) return; + await this.persistence.store(this.lockId, dumpFromState(this.state)); + } + + async getState(uri: string): Promise { + const id = this.state.uriIdByText.get(uri); + if (id === undefined) return null; + const entry = this.state.uriState.get(id); + if (!entry) return null; + return { + uri, + stamp: entry.stamp, + status: entry.status, + hash: entry.hash, + attributes: entry.attributes, + }; + } + + async *find(pattern: string): AsyncIterable { + for (const [uriId, entry] of this.state.uriState) { + const text = this.state.uriById.get(uriId); + if (text === undefined) continue; + if (!uriMatchesLikePattern(text, pattern)) continue; + yield { + uri: text, + stamp: entry.stamp, + status: entry.status, + hash: entry.hash, + attributes: entry.attributes, + }; + } + } + + async priorOutputs(workerName: string, inputUri: string): Promise { + const inputId = this.state.uriIdByText.get(inputUri); + if (inputId === undefined) 
return []; + + let bestRunId: number | undefined; + for (const run of this.state.runs.values()) { + if (run.action !== workerName) continue; + if (run.outcome !== "success") continue; + const inputs = this.state.runInput.get(run.id) ?? []; + const observed = inputs.some((i) => i.uriId === inputId); + if (!observed) continue; + if (bestRunId === undefined || run.id > bestRunId) { + bestRunId = run.id; + } + } + if (bestRunId === undefined) return []; + const outputs = this.state.runOutput.get(bestRunId) ?? []; + const result: ReadOnlyView[] = []; + for (const o of outputs) { + const uri = this.state.uriById.get(o.uriId); + if (uri === undefined) continue; + const entry = this.state.uriState.get(o.uriId); + if (!entry) continue; + result.push({ + uri, + stamp: entry.stamp, + status: entry.status, + hash: entry.hash, + attributes: entry.attributes, + }); + } + return result; + } + + async beginTransaction(opts: BeginTransactionOpts): Promise { + const runId = this.state.nextRunId++; + this.state.runs.set(runId, { + id: runId, + action: opts.worker, + actionVersion: opts.version, + scope: opts.scope, + stamp: opts.initialStamp, + startedAt: Date.now(), + finishedAt: null, + outcome: "running", + }); + await this.flush(); + return new MemoryTransaction(runId, { + state: this.state, + flush: () => this.flush(), + }); + } + + async mintStamp(): Promise { + this.state.stampSeq += 1; + return this.state.stampSeq; + } + + async recoverOrphans(): Promise { + let cancelled = 0; + let pendingRowsDropped = 0; + for (const run of this.state.runs.values()) { + if (run.outcome !== "running") continue; + run.outcome = "cancelled"; + run.finishedAt = Date.now(); + cancelled += 1; + const pendingForRun = this.state.pending.get(run.id); + if (pendingForRun) { + pendingRowsDropped += pendingForRun.size; + this.state.pending.delete(run.id); + } + this.state.runInput.delete(run.id); + this.state.runOutput.delete(run.id); + } + if (cancelled > 0) await this.flush(); + return { cancelled, 
pendingRowsDropped }; + } + + async registerWorker(def: RegisterWorkerInput): Promise { + const existing = this.state.workers.get(def.name); + const versionChanged = !existing || existing.version !== def.version; + this.state.workers.set(def.name, { + name: def.name, + version: def.version, + description: def.description ?? null, + inputPattern: def.inputPattern ?? null, + outputPattern: def.outputPattern ?? null, + scopeExpr: def.scopeExpr ?? null, + registeredAt: existing ? existing.registeredAt : Date.now(), + lastRunAt: existing ? existing.lastRunAt : null, + }); + if (versionChanged) await this.flush(); + return { versionChanged }; + } + + async lastSuccessfulRunVersion(workerName: string, inputUri: string): Promise { + const inputId = this.state.uriIdByText.get(inputUri); + if (inputId === undefined) return null; + let bestRun: { id: number; version: string } | undefined; + for (const run of this.state.runs.values()) { + if (run.action !== workerName || run.outcome !== "success") continue; + const inputs = this.state.runInput.get(run.id) ?? []; + if (!inputs.some((i) => i.uriId === inputId)) continue; + if (!bestRun || run.id > bestRun.id) { + bestRun = { id: run.id, version: run.actionVersion }; + } + } + return bestRun ? bestRun.version : null; + } + + async isInputProcessed( + workerName: string, + workerVersion: string, + inputUri: string, + ): Promise { + const inputId = this.state.uriIdByText.get(inputUri); + if (inputId === undefined) return false; + const currentStamp = this.state.uriState.get(inputId)?.stamp ?? 0; + for (const run of this.state.runs.values()) { + if (run.action !== workerName || run.outcome !== "success") continue; + if (run.actionVersion !== workerVersion) continue; + const inputs = this.state.runInput.get(run.id) ?? 
[]; + const obs = inputs.find((i) => i.uriId === inputId); + if (!obs) continue; + if (obs.observedStamp >= currentStamp) return true; + } + return false; + } + + /** Internal helper used in tests when interning a URI on read paths. */ + internUri(text: string): number { + return internUri(this.state, text); + } +} diff --git a/packages/uri-graph/src/store/memory/transaction.ts b/packages/uri-graph/src/store/memory/transaction.ts new file mode 100644 index 0000000..c6c2e71 --- /dev/null +++ b/packages/uri-graph/src/store/memory/transaction.ts @@ -0,0 +1,133 @@ +import type { Update } from "../../types/update.js"; +import type { GraphTransaction } from "../types.js"; +import { internUri, type State } from "./state.js"; + +export interface MemoryTransactionDeps { + state: State; + /** Called after commit/rollback to persist the snapshot. */ + flush: () => Promise; +} + +type TerminalState = "open" | "committed" | "rolledback"; + +export class MemoryTransaction implements GraphTransaction { + readonly runId: number; + private state: State; + private flush: () => Promise; + private status: TerminalState = "open"; + + constructor(runId: number, deps: MemoryTransactionDeps) { + this.runId = runId; + this.state = deps.state; + this.flush = deps.flush; + } + + private ensureOpen(op: string): void { + if (this.status !== "open") { + throw new Error(`transaction ${this.runId} is closed (${this.status}); cannot ${op}`); + } + } + + async applyUpdate(u: Update): Promise { + this.ensureOpen("applyUpdate"); + const uriId = internUri(this.state, u.uri); + + // No-op rule: skip staging if committed (status, hash) match. 
+ const committed = this.state.uriState.get(uriId); + const isNoop = + committed !== undefined && committed.status === u.status && committed.hash === u.hash; + + let pendingForRun = this.state.pending.get(this.runId); + if (!pendingForRun) { + pendingForRun = new Map(); + this.state.pending.set(this.runId, pendingForRun); + } + + if (isNoop && committed) { + // Record an output marker with prior stamp + wasNoop flag (deferred until commit). + const outputs = this.state.runOutput.get(this.runId) ?? []; + outputs.push({ + uriId, + writtenStamp: committed.stamp, + wasNoop: true, + }); + this.state.runOutput.set(this.runId, outputs); + return; + } + + pendingForRun.set(uriId, { + uriId, + status: u.status, + stamp: u.stamp, + hash: u.hash, + attributes: u.attributes, + }); + } + + async recordInputs( + inputs: ReadonlyArray<{ + uri: string; + observedStamp: number; + role?: string; + }>, + ): Promise { + this.ensureOpen("recordInputs"); + const rows = this.state.runInput.get(this.runId) ?? []; + for (const i of inputs) { + const id = internUri(this.state, i.uri); + rows.push({ + uriId: id, + role: i.role ?? null, + observedStamp: i.observedStamp, + }); + } + this.state.runInput.set(this.runId, rows); + } + + async commit(): Promise { + this.ensureOpen("commit"); + this.status = "committed"; + + // Promote pending → committed; record run_output for non-noop entries. + const pendingForRun = this.state.pending.get(this.runId); + const outputs = this.state.runOutput.get(this.runId) ?? 
[]; + if (pendingForRun) { + for (const [uriId, entry] of pendingForRun) { + this.state.uriState.set(uriId, { + status: entry.status, + stamp: entry.stamp, + hash: entry.hash, + attributes: entry.attributes, + }); + outputs.push({ + uriId, + writtenStamp: entry.stamp, + wasNoop: false, + }); + } + } + this.state.runOutput.set(this.runId, outputs); + this.state.pending.delete(this.runId); + + const run = this.state.runs.get(this.runId); + if (run) { + run.outcome = "success"; + run.finishedAt = Date.now(); + } + await this.flush(); + } + + async rollback(): Promise { + this.ensureOpen("rollback"); + this.status = "rolledback"; + this.state.pending.delete(this.runId); + this.state.runInput.delete(this.runId); + this.state.runOutput.delete(this.runId); + const run = this.state.runs.get(this.runId); + if (run) { + run.outcome = "cancelled"; + run.finishedAt = Date.now(); + } + await this.flush(); + } +} diff --git a/packages/uri-graph/src/store/sql/schema.ts b/packages/uri-graph/src/store/sql/schema.ts new file mode 100644 index 0000000..0bdda96 --- /dev/null +++ b/packages/uri-graph/src/store/sql/schema.ts @@ -0,0 +1,103 @@ +import type { Db } from "@statewalker/db-api"; + +const STATEMENTS: string[] = [ + // 1. URI interning. INTEGER PRIMARY KEY is an alias for rowid; SQLite + // auto-generates the id on INSERT. + `CREATE TABLE IF NOT EXISTS uri ( + id INTEGER PRIMARY KEY, + text TEXT NOT NULL UNIQUE + )`, + `CREATE INDEX IF NOT EXISTS uri_text ON uri(text)`, + + // 2. Stamp source + `CREATE TABLE IF NOT EXISTS stamp_seq ( + id INTEGER PRIMARY KEY CHECK (id = 1), + next INTEGER NOT NULL + )`, + `INSERT OR IGNORE INTO stamp_seq (id, next) VALUES (1, 1)`, + + // 3. 
Worker registry + `CREATE TABLE IF NOT EXISTS worker_registry ( + name TEXT PRIMARY KEY, + version TEXT NOT NULL, + description TEXT, + input_pattern TEXT, + output_pattern TEXT, + scope_expr TEXT, + selector_kind TEXT NOT NULL DEFAULT 'code', + registered_at INTEGER NOT NULL, + last_run_at INTEGER + )`, + + // 4. Committed state + `CREATE TABLE IF NOT EXISTS uri_state ( + uri_id INTEGER PRIMARY KEY REFERENCES uri(id), + status TEXT NOT NULL, + stamp INTEGER NOT NULL, + hash TEXT, + attributes TEXT + )`, + `CREATE INDEX IF NOT EXISTS uri_state_stamp ON uri_state(stamp)`, + + // 5. Pending (staging) + `CREATE TABLE IF NOT EXISTS uri_state_pending ( + run_id INTEGER NOT NULL, + uri_id INTEGER NOT NULL REFERENCES uri(id), + status TEXT NOT NULL, + stamp INTEGER NOT NULL, + hash TEXT, + attributes TEXT, + PRIMARY KEY (run_id, uri_id) + )`, + `CREATE INDEX IF NOT EXISTS uri_state_pending_uri ON uri_state_pending(uri_id)`, + + // 6. Run history + `CREATE TABLE IF NOT EXISTS run ( + id INTEGER PRIMARY KEY, + action TEXT NOT NULL, + action_version TEXT NOT NULL, + scope TEXT, + stamp INTEGER NOT NULL, + started_at INTEGER NOT NULL, + finished_at INTEGER, + outcome TEXT NOT NULL + )`, + `CREATE INDEX IF NOT EXISTS run_action_scope ON run(action, scope, id)`, + `CREATE INDEX IF NOT EXISTS run_outcome_started ON run(outcome, started_at)`, + + // 7. 
Run inputs / outputs + `CREATE TABLE IF NOT EXISTS run_input ( + run_id INTEGER NOT NULL REFERENCES run(id) ON DELETE CASCADE, + uri_id INTEGER NOT NULL REFERENCES uri(id), + role TEXT, + observed_stamp INTEGER NOT NULL, + PRIMARY KEY (run_id, uri_id) + )`, + `CREATE TABLE IF NOT EXISTS run_output ( + run_id INTEGER NOT NULL REFERENCES run(id) ON DELETE CASCADE, + uri_id INTEGER NOT NULL REFERENCES uri(id), + written_stamp INTEGER NOT NULL, + was_noop INTEGER NOT NULL DEFAULT 0, + PRIMARY KEY (run_id, uri_id) + )`, + `CREATE INDEX IF NOT EXISTS run_output_uri ON run_output(uri_id, run_id)`, +]; + +const PRAGMAS: string[] = [ + "PRAGMA journal_mode = WAL", + "PRAGMA synchronous = NORMAL", + "PRAGMA foreign_keys = ON", +]; + +export async function applySchema(db: Db): Promise { + for (const pragma of PRAGMAS) { + try { + await db.exec(pragma); + } catch { + // libSQL may no-op some PRAGMAs; tolerate. + } + } + for (const stmt of STATEMENTS) { + await db.exec(stmt); + } +} diff --git a/packages/uri-graph/src/store/sql/store.ts b/packages/uri-graph/src/store/sql/store.ts new file mode 100644 index 0000000..15a9ec5 --- /dev/null +++ b/packages/uri-graph/src/store/sql/store.ts @@ -0,0 +1,264 @@ +import type { Db } from "@statewalker/db-api"; +import type { ReadOnlyView } from "../../types/update.js"; +import type { + BeginTransactionOpts, + GraphStore, + GraphTransaction, + RecoverOrphansResult, + RegisterWorkerInput, + RegisterWorkerResult, +} from "../types.js"; +import { applySchema } from "./schema.js"; +import { SqlTransaction } from "./transaction.js"; +import { getUriId } from "./uri-intern.js"; + +export interface SqlGraphStoreOptions { + db: Db; +} + +interface UriStateRow { + status: string; + stamp: number; + hash: string | null; + attributes: string | null; +} + +interface UriRow extends UriStateRow { + text: string; +} + +function rowToView(uri: string, row: UriStateRow): ReadOnlyView { + return { + uri, + stamp: row.stamp, + status: row.status as 
ReadOnlyView["status"], + hash: row.hash ?? undefined, + attributes: + row.attributes !== null && row.attributes !== undefined + ? (JSON.parse(row.attributes) as Record) + : undefined, + }; +} + +export class SqlGraphStore implements GraphStore { + private db: Db; + private initialized = false; + private closed = false; + + constructor(options: SqlGraphStoreOptions) { + this.db = options.db; + } + + async initialize(): Promise { + if (this.initialized) return; + await applySchema(this.db); + this.initialized = true; + } + + async close(): Promise { + if (this.closed) return; + this.closed = true; + // Caller owns Db lifecycle; we just mark closed. + } + + async getState(uri: string): Promise { + const id = await getUriId(this.db, uri); + if (id === null) return null; + const rows = await this.db.query( + "SELECT status, stamp, hash, attributes FROM uri_state WHERE uri_id = ?", + [id], + ); + if (rows.length === 0 || !rows[0]) return null; + return rowToView(uri, rows[0]); + } + + async *find(pattern: string): AsyncIterable { + const rows = await this.db.query( + `SELECT u.text AS text, s.status, s.stamp, s.hash, s.attributes + FROM uri_state s + JOIN uri u ON u.id = s.uri_id + WHERE u.text LIKE ?`, + [pattern], + ); + for (const r of rows) { + yield rowToView(r.text, r); + } + } + + async priorOutputs(workerName: string, inputUri: string): Promise { + const inputId = await getUriId(this.db, inputUri); + if (inputId === null) return []; + + const latest = await this.db.query<{ run_id: number }>( + `SELECT r.id AS run_id + FROM run r + JOIN run_input ri ON ri.run_id = r.id + WHERE r.action = ? + AND r.outcome = 'success' + AND ri.uri_id = ? 
+ ORDER BY r.id DESC + LIMIT 1`, + [workerName, inputId], + ); + if (latest.length === 0 || !latest[0]) return []; + const runId = latest[0].run_id; + + const rows = await this.db.query( + `SELECT u.text AS text, s.status, s.stamp, s.hash, s.attributes + FROM run_output ro + JOIN uri u ON u.id = ro.uri_id + JOIN uri_state s ON s.uri_id = ro.uri_id + WHERE ro.run_id = ? AND ro.was_noop = 0`, + [runId], + ); + return rows.map((r) => rowToView(r.text, r)); + } + + async beginTransaction(opts: BeginTransactionOpts): Promise { + const result = await this.db.query<{ id: number }>( + `INSERT INTO run (action, action_version, scope, stamp, started_at, outcome) + VALUES (?, ?, ?, ?, ?, 'running') + RETURNING id`, + [opts.worker, opts.version, opts.scope, opts.initialStamp, Date.now()], + ); + if (result.length === 0 || !result[0]) { + throw new Error("failed to allocate run id"); + } + return new SqlTransaction(this.db, result[0].id); + } + + async mintStamp(): Promise { + // Atomic increment-and-fetch. + const rows = await this.db.query<{ next: number }>( + "UPDATE stamp_seq SET next = next + 1 WHERE id = 1 RETURNING next - 1 AS next", + ); + if (rows.length === 0 || !rows[0]) { + throw new Error("stamp_seq row missing"); + } + return rows[0].next; + } + + async recoverOrphans(): Promise { + const before = await this.db.query<{ count: number }>( + "SELECT COUNT(*) AS count FROM run WHERE outcome = 'running'", + ); + const cancelled = before.length > 0 && before[0] ? before[0].count : 0; + if (cancelled === 0) return { cancelled: 0, pendingRowsDropped: 0 }; + + const pendingBefore = await this.db.query<{ count: number }>( + `SELECT COUNT(*) AS count FROM uri_state_pending + WHERE run_id IN (SELECT id FROM run WHERE outcome = 'running')`, + ); + const pendingRowsDropped = + pendingBefore.length > 0 && pendingBefore[0] ? 
pendingBefore[0].count : 0; + + await this.db.exec("BEGIN IMMEDIATE"); + try { + await this.db.query( + `DELETE FROM uri_state_pending + WHERE run_id IN (SELECT id FROM run WHERE outcome = 'running')`, + ); + await this.db.query( + `DELETE FROM run_input + WHERE run_id IN (SELECT id FROM run WHERE outcome = 'running')`, + ); + await this.db.query( + `DELETE FROM run_output + WHERE run_id IN (SELECT id FROM run WHERE outcome = 'running')`, + ); + await this.db.query( + "UPDATE run SET outcome = 'cancelled', finished_at = ? WHERE outcome = 'running'", + [Date.now()], + ); + await this.db.exec("COMMIT"); + } catch (err) { + try { + await this.db.exec("ROLLBACK"); + } catch { + // ignore + } + throw err; + } + return { cancelled, pendingRowsDropped }; + } + + async registerWorker(def: RegisterWorkerInput): Promise { + const existing = await this.db.query<{ version: string }>( + "SELECT version FROM worker_registry WHERE name = ?", + [def.name], + ); + const versionChanged = + existing.length === 0 || !existing[0] || existing[0].version !== def.version; + const now = Date.now(); + if (existing.length === 0) { + await this.db.query( + `INSERT INTO worker_registry + (name, version, description, input_pattern, output_pattern, scope_expr, registered_at) + VALUES (?, ?, ?, ?, ?, ?, ?)`, + [ + def.name, + def.version, + def.description ?? null, + def.inputPattern ?? null, + def.outputPattern ?? null, + def.scopeExpr ?? null, + now, + ], + ); + } else if (versionChanged) { + await this.db.query( + `UPDATE worker_registry + SET version = ?, description = ?, input_pattern = ?, output_pattern = ?, scope_expr = ? + WHERE name = ?`, + [ + def.version, + def.description ?? null, + def.inputPattern ?? null, + def.outputPattern ?? null, + def.scopeExpr ?? 
null, + def.name, + ], + ); + } + return { versionChanged }; + } + + async lastSuccessfulRunVersion(workerName: string, inputUri: string): Promise { + const inputId = await getUriId(this.db, inputUri); + if (inputId === null) return null; + const rows = await this.db.query<{ action_version: string }>( + `SELECT r.action_version + FROM run r + JOIN run_input ri ON ri.run_id = r.id + WHERE r.action = ? + AND r.outcome = 'success' + AND ri.uri_id = ? + ORDER BY r.id DESC + LIMIT 1`, + [workerName, inputId], + ); + return rows.length > 0 && rows[0] ? rows[0].action_version : null; + } + + async isInputProcessed( + workerName: string, + workerVersion: string, + inputUri: string, + ): Promise { + const inputId = await getUriId(this.db, inputUri); + if (inputId === null) return false; + const rows = await this.db.query<{ count: number }>( + `SELECT COUNT(*) AS count + FROM run r + JOIN run_input ri ON ri.run_id = r.id + LEFT JOIN uri_state s ON s.uri_id = ri.uri_id + WHERE r.action = ? + AND r.action_version = ? + AND r.outcome = 'success' + AND ri.uri_id = ? 
+ AND ri.observed_stamp >= COALESCE(s.stamp, 0)`, + [workerName, workerVersion, inputId], + ); + return rows.length > 0 && rows[0] !== undefined && rows[0].count > 0; + } +} diff --git a/packages/uri-graph/src/store/sql/transaction.ts b/packages/uri-graph/src/store/sql/transaction.ts new file mode 100644 index 0000000..1d4e9fc --- /dev/null +++ b/packages/uri-graph/src/store/sql/transaction.ts @@ -0,0 +1,153 @@ +import type { Db } from "@statewalker/db-api"; +import type { Update } from "../../types/update.js"; +import type { GraphTransaction } from "../types.js"; +import { internUri } from "./uri-intern.js"; + +type TxnStatus = "open" | "committed" | "rolledback"; + +export class SqlTransaction implements GraphTransaction { + readonly runId: number; + private db: Db; + private status: TxnStatus = "open"; + + constructor(db: Db, runId: number) { + this.db = db; + this.runId = runId; + } + + private ensureOpen(op: string): void { + if (this.status !== "open") { + throw new Error(`transaction ${this.runId} is closed (${this.status}); cannot ${op}`); + } + } + + async applyUpdate(u: Update): Promise { + this.ensureOpen("applyUpdate"); + const uriId = await internUri(this.db, u.uri); + + // No-op check against committed state. + const committed = await this.db.query<{ status: string; hash: string | null; stamp: number }>( + "SELECT status, hash, stamp FROM uri_state WHERE uri_id = ?", + [uriId], + ); + const isNoop = + committed.length > 0 && + committed[0] !== undefined && + committed[0].status === u.status && + (committed[0].hash ?? null) === (u.hash ?? null); + + if (isNoop && committed[0]) { + // Record the no-op output marker now (durable, since each applyUpdate is its own physical txn). + await this.db.query( + `INSERT OR REPLACE INTO run_output (run_id, uri_id, written_stamp, was_noop) + VALUES (?, ?, ?, 1)`, + [this.runId, uriId, committed[0].stamp], + ); + return; + } + + const attrJson = u.attributes !== undefined ? 
JSON.stringify(u.attributes) : null; + await this.db.query( + `INSERT INTO uri_state_pending (run_id, uri_id, status, stamp, hash, attributes) + VALUES (?, ?, ?, ?, ?, ?) + ON CONFLICT(run_id, uri_id) DO UPDATE SET + status = excluded.status, + stamp = excluded.stamp, + hash = excluded.hash, + attributes = excluded.attributes`, + [this.runId, uriId, u.status, u.stamp, u.hash ?? null, attrJson], + ); + } + + async recordInputs( + inputs: ReadonlyArray<{ + uri: string; + observedStamp: number; + role?: string; + }>, + ): Promise { + this.ensureOpen("recordInputs"); + for (const i of inputs) { + const uriId = await internUri(this.db, i.uri); + await this.db.query( + `INSERT INTO run_input (run_id, uri_id, role, observed_stamp) + VALUES (?, ?, ?, ?) + ON CONFLICT(run_id, uri_id) DO UPDATE SET + role = excluded.role, + observed_stamp = excluded.observed_stamp`, + [this.runId, uriId, i.role ?? null, i.observedStamp], + ); + } + } + + async commit(): Promise { + this.ensureOpen("commit"); + this.status = "committed"; + + await this.db.exec("BEGIN IMMEDIATE"); + try { + // Promote pending → committed. + await this.db.query( + `INSERT INTO uri_state (uri_id, status, stamp, hash, attributes) + SELECT uri_id, status, stamp, hash, attributes + FROM uri_state_pending + WHERE run_id = ? + ON CONFLICT(uri_id) DO UPDATE SET + status = excluded.status, + stamp = excluded.stamp, + hash = excluded.hash, + attributes = excluded.attributes`, + [this.runId], + ); + + // Record run_output for promoted entries (was_noop = 0). + await this.db.query( + `INSERT OR REPLACE INTO run_output (run_id, uri_id, written_stamp, was_noop) + SELECT run_id, uri_id, stamp, 0 FROM uri_state_pending WHERE run_id = ?`, + [this.runId], + ); + + // Mark run success. + await this.db.query("UPDATE run SET outcome = 'success', finished_at = ? WHERE id = ?", [ + Date.now(), + this.runId, + ]); + + // Drop staging. 
+ await this.db.query("DELETE FROM uri_state_pending WHERE run_id = ?", [this.runId]); + await this.db.exec("COMMIT"); + } catch (err) { + try { + await this.db.exec("ROLLBACK"); + } catch { + // ignore + } + throw err; + } + } + + async rollback(): Promise { + this.ensureOpen("rollback"); + this.status = "rolledback"; + + await this.db.exec("BEGIN IMMEDIATE"); + try { + await this.db.query("DELETE FROM uri_state_pending WHERE run_id = ?", [this.runId]); + await this.db.query("UPDATE run SET outcome = 'cancelled', finished_at = ? WHERE id = ?", [ + Date.now(), + this.runId, + ]); + // Drop run_input/run_output for the cancelled run so they don't pollute history. + await this.db.query("DELETE FROM run_input WHERE run_id = ?", [this.runId]); + await this.db.query("DELETE FROM run_output WHERE run_id = ?", [this.runId]); + await this.db.exec("COMMIT"); + } catch (err) { + try { + await this.db.exec("ROLLBACK"); + } catch { + // ignore + } + throw err; + } + } +} diff --git a/packages/uri-graph/src/store/sql/uri-intern.ts b/packages/uri-graph/src/store/sql/uri-intern.ts new file mode 100644 index 0000000..011eda5 --- /dev/null +++ b/packages/uri-graph/src/store/sql/uri-intern.ts @@ -0,0 +1,28 @@ +import type { Db } from "@statewalker/db-api"; + +/** + * Intern a URI text, returning its integer id. Idempotent. + * Single-writer assumption: no concurrent inserts of the same text from peers. + */ +export async function internUri(db: Db, text: string): Promise { + // First try fast lookup. + const existing = await db.query<{ id: number }>("SELECT id FROM uri WHERE text = ?", [text]); + if (existing.length > 0 && existing[0]) return existing[0].id; + // Insert; on race (won't happen under single-writer) fall back to lookup. 
+ await db.query("INSERT OR IGNORE INTO uri (text) VALUES (?)", [text]); + const fresh = await db.query<{ id: number }>("SELECT id FROM uri WHERE text = ?", [text]); + if (!fresh.length || !fresh[0]) { + throw new Error(`failed to intern URI: ${text}`); + } + return fresh[0].id; +} + +export async function getUriId(db: Db, text: string): Promise { + const rows = await db.query<{ id: number }>("SELECT id FROM uri WHERE text = ?", [text]); + return rows.length > 0 && rows[0] ? rows[0].id : null; +} + +export async function getUriText(db: Db, id: number): Promise { + const rows = await db.query<{ text: string }>("SELECT text FROM uri WHERE id = ?", [id]); + return rows.length > 0 && rows[0] ? rows[0].text : null; +} diff --git a/packages/uri-graph/src/store/types.ts b/packages/uri-graph/src/store/types.ts new file mode 100644 index 0000000..61fc298 --- /dev/null +++ b/packages/uri-graph/src/store/types.ts @@ -0,0 +1,75 @@ +import type { ReadOnlyView, Update } from "../types/update.js"; + +export interface GraphReader { + getState(uri: string): Promise; + find(pattern: string): AsyncIterable; + priorOutputs(workerName: string, inputUri: string): Promise; +} + +export interface BeginTransactionOpts { + worker: string; + version: string; + scope: string | null; + initialStamp: number; +} + +export interface RegisterWorkerInput { + name: string; + version: string; + description?: string; + inputPattern?: string; + outputPattern?: string; + scopeExpr?: string; +} + +export interface RegisterWorkerResult { + versionChanged: boolean; +} + +export interface RecoverOrphansResult { + cancelled: number; + pendingRowsDropped: number; +} + +export interface GraphStore extends GraphReader { + beginTransaction(opts: BeginTransactionOpts): Promise; + mintStamp(): Promise; + recoverOrphans(): Promise; + registerWorker(def: RegisterWorkerInput): Promise; + /** + * Returns the latest successful run's action_version for the given worker against the given URI, + * or null if no successful 
run exists. Used by selectors to detect version-bump invalidation. + */ + lastSuccessfulRunVersion(workerName: string, inputUri: string): Promise<string | null>; + /** + * Returns true if the given worker has a successful run that observed inputUri at a stamp + * greater than or equal to the URI's current committed stamp AND at the worker's current version. + * Used by `findDirty`-style selectors. + */ + isInputProcessed(workerName: string, workerVersion: string, inputUri: string): Promise<boolean>; +} + +export interface GraphTransaction { + readonly runId: number; + applyUpdate(u: Update): Promise<void>; + recordInputs( + inputs: ReadonlyArray<{ uri: string; observedStamp: number; role?: string }>, + ): Promise<void>; + commit(): Promise<void>; + rollback(): Promise<void>; +} + +/** + * Lifecycle helper. `openGraphStore(store)` runs schema setup + recovery and returns the store. + * Backend-specific factories (`new MemoryGraphStore(...)`, `new SqlGraphStore(...)`) build the + * raw store; `openGraphStore` makes it ready for use. + */ +export async function openGraphStore<T extends GraphStore & { initialize?: () => Promise<void> }>( + store: T, +): Promise<T> { + if (store.initialize) { + await store.initialize(); + } + await store.recoverOrphans(); + return store; +} diff --git a/packages/uri-graph/src/types/update.ts b/packages/uri-graph/src/types/update.ts new file mode 100644 index 0000000..633beb9 --- /dev/null +++ b/packages/uri-graph/src/types/update.ts @@ -0,0 +1,19 @@ +export type Status = "added" | "updated" | "removed"; + +export interface Update { + uri: string; + stamp: number; + status: Status; + hash?: string; + scope?: string; + role?: string; + attributes?: Record<string, unknown>; +} + +export interface ReadOnlyView { + uri: string; + stamp: number; + status: Status; + hash?: string; + attributes?: Record<string, unknown>; +} diff --git a/packages/uri-graph/src/types/worker.ts b/packages/uri-graph/src/types/worker.ts new file mode 100644 index 0000000..55c987a --- /dev/null +++ b/packages/uri-graph/src/types/worker.ts @@ -0,0 +1,29 @@ +import type { ReadOnlyView, Update } from
"./update.js"; + +export interface WorkerParams { + stamp: () => Promise<number>; + read: (uri: string) => Promise<ReadOnlyView | null>; + find: (pattern: string) => AsyncIterable<ReadOnlyView>; + priorOutputs: (inputUri: string) => Promise<ReadOnlyView[]>; + recordRead: (uri: string, role?: string) => void; + signal: AbortSignal; +} + +export interface SelectorContext { + workerName: string; + workerVersion: string; + limit: number; +} + +export type Selector = (ctx: SelectorContext) => AsyncIterableIterator<Update>; + +export interface WorkerDefinition { + name: string; + version: string; + description?: string; + inputPattern?: string; + outputPattern?: string; + scopeExpr?: string; + selector: Selector; + run: (params: WorkerParams, input: AsyncIterable<Update>) => AsyncGenerator<Update>; +} diff --git a/packages/uri-graph/src/util/hash.ts b/packages/uri-graph/src/util/hash.ts new file mode 100644 index 0000000..15e4eb2 --- /dev/null +++ b/packages/uri-graph/src/util/hash.ts @@ -0,0 +1,19 @@ +/** + * Compute a hex SHA-256 hash of the given string. Uses Web Crypto when available + * (Node 19+, all modern browsers). + */ +export async function sha256Hex(text: string): Promise<string> { + const subtle = (globalThis as { crypto?: { subtle?: SubtleCrypto } }).crypto?.subtle; + if (!subtle) { + throw new Error("Web Crypto SubtleCrypto is not available"); + } + const buf = new TextEncoder().encode(text); + const digest = await subtle.digest("SHA-256", buf); + const bytes = new Uint8Array(digest); + let out = ""; + for (let i = 0; i < bytes.length; i++) { + const b = bytes[i] ??
0; + out += b.toString(16).padStart(2, "0"); + } + return out; +} diff --git a/packages/uri-graph/src/workers/chunker.ts b/packages/uri-graph/src/workers/chunker.ts new file mode 100644 index 0000000..b9514ea --- /dev/null +++ b/packages/uri-graph/src/workers/chunker.ts @@ -0,0 +1,138 @@ +import { findDirty } from "../graph/selector-helpers.js"; +import type { GraphStore } from "../store/types.js"; +import type { Update } from "../types/update.js"; +import type { WorkerDefinition, WorkerParams } from "../types/worker.js"; +import { sha256Hex } from "../util/hash.js"; + +export interface ChunkerOptions { + /** Maximum characters per chunk. Default 1000. */ + chunkSize?: number; + /** Optional graph; if provided, the selector uses `findDirty` to yield pending text:// URIs. */ + graph?: GraphStore; + name?: string; + version?: string; +} + +function chunkUri(textUri: string, index: number): string { + // chunk:///path#i — simple URI scheme. + return `chunk:${textUri.slice("text:".length)}#${index}`; +} + +function chunkIndex(uri: string): number { + const m = /#(\d+)$/.exec(uri); + return m && m[1] !== undefined ? Number(m[1]) : -1; +} + +function makeChunkerSelector(graph: GraphStore | undefined): WorkerDefinition["selector"] { + if (!graph) { + return async function* () { + // Driven externally (test harness). + }; + } + return (ctx) => + findDirty(graph, { + forWorker: ctx.workerName, + forVersion: ctx.workerVersion, + uriLike: "text:///%", + limit: ctx.limit, + }); +} + +function splitText(text: string, size: number): string[] { + if (text.length === 0) return []; + const chunks: string[] = []; + for (let i = 0; i < text.length; i += size) { + chunks.push(text.slice(i, i + size)); + } + return chunks; +} + +/** + * Format-agnostic chunker: consumes `text://` URIs, splits each document into + * fixed-size character chunks, and yields `chunk://...` URIs under one stamp + * shared by all chunks of one document. 
Removed inputs cascade to their prior + * chunks via `priorOutputs`. + */ +export function createChunker(opts: ChunkerOptions = {}): WorkerDefinition { + const chunkSize = opts.chunkSize ?? 1000; + const name = opts.name ?? "chunker"; + const version = opts.version ?? "v1"; + + return { + name, + version, + description: "Splits text:// URIs into fixed-size chunks; format-agnostic.", + inputPattern: "text://**", + outputPattern: "chunk://**", + scopeExpr: "uri", + selector: makeChunkerSelector(opts.graph), + run: async function* ( + params: WorkerParams, + input: AsyncIterable, + ): AsyncGenerator { + for await (const doc of input) { + // findDirty doesn't carry the document text in attributes by default for + // text URIs that were committed by the extractor. Re-read the latest state. + if ( + doc.uri.startsWith("text:") && + (doc.attributes === undefined || + (doc.attributes as Record).text === undefined) + ) { + const live = await params.read(doc.uri); + if (live?.attributes) { + doc.attributes = live.attributes as Record; + } + } + if (params.signal.aborted) return; + const prior = await params.priorOutputs(doc.uri); + if (doc.status === "removed") { + const stamp = await params.stamp(); + for (const old of prior) { + yield { + uri: old.uri, + stamp, + status: "removed", + scope: doc.uri, + role: "chunk", + }; + } + continue; + } + + const text = ((doc.attributes as Record)?.text as string) ?? ""; + const chunks = splitText(text, chunkSize); + const stamp = await params.stamp(); + + const priorChunkUris = new Set(prior.map((p) => p.uri)); + for (const [i, chunk] of chunks.entries()) { + const uri = chunkUri(doc.uri, i); + const hash = await sha256Hex(chunk); + yield { + uri, + stamp, + status: priorChunkUris.has(uri) ? "updated" : "added", + hash, + scope: doc.uri, + role: "chunk", + attributes: { text: chunk, index: i }, + }; + } + + // Cascade removals for indices beyond current count. 
+ const currentMax = chunks.length; + for (const old of prior) { + const idx = chunkIndex(old.uri); + if (idx >= currentMax) { + yield { + uri: old.uri, + stamp, + status: "removed", + scope: doc.uri, + role: "chunk", + }; + } + } + } + }, + }; +} diff --git a/packages/uri-graph/src/workers/embedder.ts b/packages/uri-graph/src/workers/embedder.ts new file mode 100644 index 0000000..6f704f1 --- /dev/null +++ b/packages/uri-graph/src/workers/embedder.ts @@ -0,0 +1,121 @@ +import { findDirty } from "../graph/selector-helpers.js"; +import type { GraphStore } from "../store/types.js"; +import type { Update } from "../types/update.js"; +import type { WorkerDefinition, WorkerParams } from "../types/worker.js"; +import { sha256Hex } from "../util/hash.js"; + +export interface EmbedderOptions { + /** Embedding function — must run before the stamp is minted to keep txns short. */ + embed: (text: string, opts: { signal: AbortSignal }) => Promise; + /** Optional metadata attached to each emitted embedding update. */ + model?: string; + /** Optional graph; when provided, the selector finds pending chunk:// URIs. */ + graph?: GraphStore; + name?: string; + version?: string; +} + +function embeddingUri(chunkUri: string): string { + return `embedding://${chunkUri}`; +} + +function makeEmbedderSelector(graph: GraphStore | undefined): WorkerDefinition["selector"] { + if (!graph) { + return async function* () { + // Driven externally (test harness). + }; + } + return (ctx) => + findDirty(graph, { + forWorker: ctx.workerName, + forVersion: ctx.workerVersion, + uriLike: "chunk:///%", + limit: ctx.limit, + }); +} + +function vectorHashSync(vec: Float32Array): string { + // Cheap fingerprint; sha256 over the bytes. + // Returns a string usable for the no-op rule. + const buf = new Uint8Array(vec.buffer, vec.byteOffset, vec.byteLength); + let acc = 0n; + for (let i = 0; i < buf.length; i++) { + acc = ((acc << 7n) ^ BigInt(buf[i] ?? 
0)) & 0xffffffffffffffffn; + } + return acc.toString(16); +} + +/** + * Consumes `chunk://**` updates and emits `embedding://...` updates. Mints one + * stamp per item AFTER the embedding call to keep the logical transaction + * boundary tight (the slow API call happens outside the txn). + */ +export function createEmbedder(opts: EmbedderOptions): WorkerDefinition { + const name = opts.name ?? "embedder"; + const version = opts.version ?? "v1"; + const model = opts.model ?? "unknown"; + + return { + name, + version, + description: "Embeds chunk:// updates into embedding:// vectors.", + inputPattern: "chunk://**", + outputPattern: "embedding://**", + scopeExpr: "uri", + selector: makeEmbedderSelector(opts.graph), + run: async function* ( + params: WorkerParams, + input: AsyncIterable, + ): AsyncGenerator { + for await (const chunk of input) { + if (params.signal.aborted) return; + + // If the selector handed us a chunk URI without text in attributes, + // re-read the live state to get the chunk body. + if ( + chunk.uri.startsWith("chunk:") && + (chunk.attributes === undefined || + (chunk.attributes as Record).text === undefined) + ) { + const live = await params.read(chunk.uri); + if (live?.attributes) { + chunk.attributes = live.attributes as Record; + } + } + + if (chunk.status === "removed") { + const stamp = await params.stamp(); + yield { + uri: embeddingUri(chunk.uri), + stamp, + status: "removed", + scope: chunk.uri, + role: "embedding", + }; + continue; + } + + const text = ((chunk.attributes as Record)?.text as string) ?? ""; + // Slow work outside the logical transaction. 
+ const vector = await opts.embed(text, { signal: params.signal }); + if (params.signal.aborted) return; + + const hash = vectorHashSync(vector) || (await sha256Hex(text)); + const stamp = await params.stamp(); + yield { + uri: embeddingUri(chunk.uri), + stamp, + status: "updated", + hash, + scope: chunk.uri, + role: "embedding", + attributes: { + vector: Array.from(vector), + model, + sourceChunkUri: chunk.uri, + }, + }; + } + }, + }; +} diff --git a/packages/uri-graph/src/workers/extractors/base.ts b/packages/uri-graph/src/workers/extractors/base.ts new file mode 100644 index 0000000..6b483f4 --- /dev/null +++ b/packages/uri-graph/src/workers/extractors/base.ts @@ -0,0 +1,120 @@ +import type { FilesApi } from "@statewalker/webrun-files"; +import { readText } from "@statewalker/webrun-files"; +import { findDirty } from "../../graph/selector-helpers.js"; +import type { GraphStore } from "../../store/types.js"; +import type { Update } from "../../types/update.js"; +import type { WorkerDefinition, WorkerParams } from "../../types/worker.js"; +import { sha256Hex } from "../../util/hash.js"; + +export interface ExtractorOptions { + files: FilesApi; + /** + * Optional graph; if provided, the extractor's selector uses `findDirty` to + * yield pending file:// URIs matching its pattern. Without it, the selector is + * empty and the extractor must be fed via an external input stream (useful for + * unit tests). + */ + graph?: GraphStore; + /** Optional override of worker name/version. */ + name?: string; + version?: string; +} + +export interface ExtractorSpec extends ExtractorOptions { + /** Worker name (defaults to spec.defaultName). */ + defaultName: string; + /** Worker version. */ + defaultVersion: string; + /** SQL-LIKE pattern matching the file URIs this extractor handles. */ + uriLike: string; + /** RegExp tested against the path; only matches are extracted. */ + pathPattern: RegExp; + /** Mime declared on the produced text:// update. 
*/ + mime: string; + /** + * Transform raw file text into the extracted body that gets indexed. + * For markdown / plain text this can be the identity; for html, strip tags. + */ + transform(raw: string): string; +} + +function fileToTextUri(uri: string): string { + return uri.replace(/^file:/, "text:"); +} + +function makeSelector(spec: ExtractorSpec): WorkerDefinition["selector"] { + const graph = spec.graph; + if (!graph) { + return async function* () { + // No graph wired; selector is empty (driven externally for unit tests). + }; + } + return (ctx) => + findDirty(graph, { + forWorker: ctx.workerName, + forVersion: ctx.workerVersion, + uriLike: spec.uriLike, + limit: ctx.limit, + }); +} + +/** + * Build a content extractor that consumes `file://**` URIs matching the spec's + * `pathPattern`, reads bytes via `FilesApi`, and emits a `text://...` URI carrying + * the extracted text and a real content hash. + */ +export function createExtractor(spec: ExtractorSpec): WorkerDefinition { + const name = spec.name ?? spec.defaultName; + const version = spec.version ?? spec.defaultVersion; + const files = spec.files; + + return { + name, + version, + description: `Extracts plain text from files matching ${spec.uriLike}.`, + inputPattern: spec.uriLike, + outputPattern: "text://**", + scopeExpr: "uri", + selector: makeSelector(spec), + run: async function* ( + params: WorkerParams, + input: AsyncIterable, + ): AsyncGenerator { + for await (const file of input) { + if (params.signal.aborted) return; + const path = file.uri.replace(/^file:\/\//, ""); + if (!spec.pathPattern.test(path)) continue; + + if (file.status === "removed") { + const stamp = await params.stamp(); + yield { + uri: fileToTextUri(file.uri), + stamp, + status: "removed", + scope: file.uri, + }; + continue; + } + + // Slow work BEFORE minting stamp. 
+ const raw = await readText(files, path); + const body = spec.transform(raw); + const hash = await sha256Hex(body); + + const stamp = await params.stamp(); + yield { + uri: fileToTextUri(file.uri), + stamp, + status: file.status === "added" ? "added" : "updated", + hash, + scope: file.uri, + attributes: { + text: body, + mime: spec.mime, + sourceUri: file.uri, + }, + }; + } + }, + }; +} diff --git a/packages/uri-graph/src/workers/extractors/html-extractor.ts b/packages/uri-graph/src/workers/extractors/html-extractor.ts new file mode 100644 index 0000000..3f8e1ff --- /dev/null +++ b/packages/uri-graph/src/workers/extractors/html-extractor.ts @@ -0,0 +1,27 @@ +import type { ExtractorOptions } from "./base.js"; +import { createExtractor } from "./base.js"; + +/** + * Strip every HTML tag and collapse whitespace. Sufficient for indexing plain + * text content; not a structural HTML parser. + */ +function stripTags(html: string): string { + return html + .replace(/<script\b[^<]*(?:(?!<\/script>)<[^<]*)*<\/script>/gi, " ") + .replace(/<style\b[^<]*(?:(?!<\/style>)<[^<]*)*<\/style>/gi, " ") + .replace(/<[^>]+>/g, " ") + .replace(/\s+/g, " ") + .trim(); +} + +export function createHtmlExtractor(opts: ExtractorOptions) { + return createExtractor({ + ...opts, + defaultName: "extract-html", + defaultVersion: "v1", + uriLike: "file:///%.html", + pathPattern: /\.html?$/i, + mime: "text/html", + transform: stripTags, + }); +} diff --git a/packages/uri-graph/src/workers/extractors/markdown-extractor.ts b/packages/uri-graph/src/workers/extractors/markdown-extractor.ts new file mode 100644 index 0000000..b8fad9e --- /dev/null +++ b/packages/uri-graph/src/workers/extractors/markdown-extractor.ts @@ -0,0 +1,19 @@ +import type { ExtractorOptions } from "./base.js"; +import { createExtractor } from "./base.js"; + +/** + * Markdown extractor: matches `*.md` files. Currently passes raw markdown through + * unchanged (the indexer treats it as text). Replace `transform` if a structured + * markdown → plain conversion is needed downstream.
+ */ +export function createMarkdownExtractor(opts: ExtractorOptions) { + return createExtractor({ + ...opts, + defaultName: "extract-markdown", + defaultVersion: "v1", + uriLike: "file:///%.md", + pathPattern: /\.md$/i, + mime: "text/markdown", + transform: (raw) => raw, + }); +} diff --git a/packages/uri-graph/src/workers/extractors/plain-text-extractor.ts b/packages/uri-graph/src/workers/extractors/plain-text-extractor.ts new file mode 100644 index 0000000..bf63c6e --- /dev/null +++ b/packages/uri-graph/src/workers/extractors/plain-text-extractor.ts @@ -0,0 +1,14 @@ +import type { ExtractorOptions } from "./base.js"; +import { createExtractor } from "./base.js"; + +export function createPlainTextExtractor(opts: ExtractorOptions) { + return createExtractor({ + ...opts, + defaultName: "extract-plain-text", + defaultVersion: "v1", + uriLike: "file:///%.txt", + pathPattern: /\.txt$/i, + mime: "text/plain", + transform: (raw) => raw, + }); +} diff --git a/packages/uri-graph/src/workers/file-watcher.ts b/packages/uri-graph/src/workers/file-watcher.ts new file mode 100644 index 0000000..71dcf30 --- /dev/null +++ b/packages/uri-graph/src/workers/file-watcher.ts @@ -0,0 +1,110 @@ +import type { FilesApi } from "@statewalker/webrun-files"; +import type { Update } from "../types/update.js"; +import type { WorkerDefinition, WorkerParams } from "../types/worker.js"; + +export interface FileWatcherOptions { + files: FilesApi; + rootPath: string; + /** Worker name; defaults to `file-watcher`. */ + name?: string; + /** Worker version; defaults to `v1`. */ + version?: string; +} + +interface FileFingerprint { + size: number; + mtime: number; + path: string; +} + +function fileUri(path: string): string { + // Maps virtual path '/a/b.md' → 'file:///a/b.md' (three slashes per RFC 8089). 
+ return `file://${path}`; +} + +function fingerprint(size: number, mtime: number): string { + return `${size}:${mtime}`; +} + +/** + * Source worker that scans all files under `rootPath` via `FilesApi`. Emits + * `file://` URIs with status added/updated/removed based on a + * `(size, mtime)` fingerprint diffed against committed state. + * + * The watcher does NOT read file bytes; downstream extractors do that work. + */ +export function createFileWatcher(opts: FileWatcherOptions): WorkerDefinition { + const { files, rootPath } = opts; + const name = opts.name ?? "file-watcher"; + const version = opts.version ?? "v1"; + + return { + name, + version, + description: `Polls FilesApi at ${rootPath}; emits file:// URIs on change.`, + outputPattern: "file://**", + selector: async function* () { + yield { uri: `tick://${name}`, stamp: 0, status: "updated" }; + }, + run: async function* ( + params: WorkerParams, + input: AsyncIterable, + ): AsyncGenerator { + for await (const _tick of input) { + if (params.signal.aborted) return; + // Snapshot the FS. + const found = new Map(); + for await (const info of files.list(rootPath, { recursive: true })) { + if (info.kind !== "file") continue; + found.set(info.path, { + path: info.path, + size: info.size ?? 0, + mtime: info.lastModified ?? 0, + }); + } + + // Snapshot prior known files from the graph. + const known = new Map(); + for await (const view of params.find("file:///%")) { + const path = view.uri.replace(/^file:\/\//, ""); + const attrs = (view.attributes ?? {}) as Partial; + known.set(path, { + path, + size: attrs.size ?? 0, + mtime: attrs.mtime ?? 0, + }); + } + + const stamp = await params.stamp(); + + // Emit added / updated. + for (const [path, info] of found) { + const prev = known.get(path); + const changed = !prev || prev.size !== info.size || prev.mtime !== info.mtime; + if (!changed) continue; + yield { + uri: fileUri(path), + stamp, + status: prev ? 
"updated" : "added", + hash: fingerprint(info.size, info.mtime), + attributes: { + path: info.path, + size: info.size, + mtime: info.mtime, + }, + }; + } + + // Emit removed. + for (const path of known.keys()) { + if (found.has(path)) continue; + yield { + uri: fileUri(path), + stamp, + status: "removed", + }; + } + } + }, + }; +} diff --git a/packages/uri-graph/src/workers/index-backends/memory-fts.ts b/packages/uri-graph/src/workers/index-backends/memory-fts.ts new file mode 100644 index 0000000..752b87d --- /dev/null +++ b/packages/uri-graph/src/workers/index-backends/memory-fts.ts @@ -0,0 +1,75 @@ +export interface FtsHit { + scope: string; + score: number; +} + +export interface FtsBackend { + upsert(scope: string, docs: string[]): void; + remove(scope: string): void; + query(text: string): FtsHit[]; +} + +function tokenize(text: string): string[] { + return text + .toLowerCase() + .split(/[^a-z0-9]+/) + .filter(Boolean); +} + +/** + * Tiny inverted index keyed by scope. Score = number of distinct query terms + * that appear in the scope's documents. + */ +export function createMemoryFtsBackend(): FtsBackend { + // term → set of scopes that contain it. + const termIndex = new Map>(); + // scope → set of terms it contributed (so we can remove cleanly). 
+ const scopeTerms = new Map>(); + + function dropScope(scope: string): void { + const terms = scopeTerms.get(scope); + if (!terms) return; + for (const t of terms) { + const set = termIndex.get(t); + if (!set) continue; + set.delete(scope); + if (set.size === 0) termIndex.delete(t); + } + scopeTerms.delete(scope); + } + + return { + upsert(scope, docs) { + dropScope(scope); + const terms = new Set(); + for (const d of docs) for (const t of tokenize(d)) terms.add(t); + for (const t of terms) { + let set = termIndex.get(t); + if (!set) { + set = new Set(); + termIndex.set(t, set); + } + set.add(scope); + } + scopeTerms.set(scope, terms); + }, + remove(scope) { + dropScope(scope); + }, + query(text) { + const queryTerms = tokenize(text); + const score = new Map(); + for (const t of queryTerms) { + const set = termIndex.get(t); + if (!set) continue; + for (const scope of set) { + score.set(scope, (score.get(scope) ?? 0) + 1); + } + } + const hits: FtsHit[] = []; + for (const [scope, s] of score) hits.push({ scope, score: s }); + hits.sort((a, b) => b.score - a.score || a.scope.localeCompare(b.scope)); + return hits; + }, + }; +} diff --git a/packages/uri-graph/src/workers/index-backends/memory-vector.ts b/packages/uri-graph/src/workers/index-backends/memory-vector.ts new file mode 100644 index 0000000..37f77db --- /dev/null +++ b/packages/uri-graph/src/workers/index-backends/memory-vector.ts @@ -0,0 +1,46 @@ +export interface VectorHit { + id: string; + score: number; +} + +export interface VectorBackend { + upsert(id: string, vec: Float32Array): void; + remove(id: string): void; + search(query: Float32Array, k: number): VectorHit[]; +} + +function cosineSim(a: Float32Array, b: Float32Array): number { + const len = Math.min(a.length, b.length); + let dot = 0; + let na = 0; + let nb = 0; + for (let i = 0; i < len; i++) { + const av = a[i] ?? 0; + const bv = b[i] ?? 
0; + dot += av * bv; + na += av * av; + nb += bv * bv; + } + if (na === 0 || nb === 0) return 0; + return dot / (Math.sqrt(na) * Math.sqrt(nb)); +} + +export function createMemoryVectorBackend(): VectorBackend { + const vectors = new Map(); + return { + upsert(id, vec) { + vectors.set(id, vec); + }, + remove(id) { + vectors.delete(id); + }, + search(query, k) { + const hits: VectorHit[] = []; + for (const [id, v] of vectors) { + hits.push({ id, score: cosineSim(query, v) }); + } + hits.sort((a, b) => b.score - a.score); + return hits.slice(0, k); + }, + }; +} diff --git a/packages/uri-graph/src/workers/indexer.ts b/packages/uri-graph/src/workers/indexer.ts new file mode 100644 index 0000000..6f7ef36 --- /dev/null +++ b/packages/uri-graph/src/workers/indexer.ts @@ -0,0 +1,255 @@ +import type { GraphStore } from "../store/types.js"; +import type { Update } from "../types/update.js"; +import type { WorkerDefinition, WorkerParams } from "../types/worker.js"; +import { sha256Hex } from "../util/hash.js"; +import type { FtsBackend } from "./index-backends/memory-fts.js"; +import type { VectorBackend } from "./index-backends/memory-vector.js"; + +export interface IndexerOptions { + fts: FtsBackend; + vector: VectorBackend; + /** Optional graph; when provided, the selector emits ready scopes. */ + graph?: GraphStore; + name?: string; + version?: string; +} + +interface ScopeBag { + text?: Update; + chunks: Update[]; + embeddings: Update[]; +} + +function ftsUri(scope: string): string { + return `index://fts/${scope}`; +} + +function vectorUri(scope: string): string { + return `index://vector/${scope}`; +} + +/** + * Multi-input indexer scoped by `text://` URI. Consumes interleaved updates + * grouped by `scope` and `role` (text | chunk | embedding) and writes per-scope + * entries to the configured FTS and vector backends. + * + * The indexer expects its input to be ordered by `scope` so that all rows for + * one scope arrive contiguously. 
Use `joinInputs` to merge multiple selector + * streams into a single ordered input. + */ +export function createIndexer(opts: IndexerOptions): WorkerDefinition { + const name = opts.name ?? "indexer"; + const version = opts.version ?? "v1"; + + return { + name, + version, + description: "Builds FTS + vector indexes from text/chunk/embedding inputs.", + inputPattern: "text:// + chunk:// + embedding://", + outputPattern: "index://**", + scopeExpr: "text_uri", + selector: makeIndexerSelector(opts.graph), + run: async function* ( + params: WorkerParams, + input: AsyncIterable, + ): AsyncGenerator { + let currentScope: string | undefined; + let bag: ScopeBag = { chunks: [], embeddings: [] }; + + async function* flush(): AsyncGenerator { + if (currentScope === undefined) return; + yield* indexOne(opts, params, currentScope, bag); + currentScope = undefined; + bag = { chunks: [], embeddings: [] }; + } + + for await (const u of input) { + if (params.signal.aborted) return; + const scope = u.scope ?? u.uri; + if (currentScope !== undefined && scope !== currentScope) { + yield* flush(); + } + currentScope = scope; + if (u.role === "text") bag.text = u; + else if (u.role === "chunk") bag.chunks.push(u); + else if (u.role === "embedding") bag.embeddings.push(u); + else if (u.uri.startsWith("text:")) bag.text = u; + else if (u.uri.startsWith("chunk:")) bag.chunks.push(u); + else if (u.uri.startsWith("embedding:")) bag.embeddings.push(u); + } + yield* flush(); + }, + }; +} + +function makeIndexerSelector(graph: GraphStore | undefined): WorkerDefinition["selector"] { + if (!graph) { + return async function* () { + // Driven externally (test harness). + }; + } + return (ctx) => indexerSelector(graph, ctx.workerName, ctx.workerVersion, ctx.limit); +} + +/** + * Selector for the indexer: emit one stream of `(text, chunk*, embedding*)` + * updates for each `text://` URI where every chunk has a matching embedding AND + * the indexer has not run for that scope at its current version. 
+ * + * Streams are ordered by scope so the run can group via `(currentScope, bag)`. + */ +async function* indexerSelector( + graph: GraphStore, + workerName: string, + workerVersion: string, + limit: number, +): AsyncIterableIterator { + const scopes: string[] = []; + for await (const v of graph.find("text:///%")) { + if (v.status === "removed") { + // Removed text — emit a single sentinel so the run can cascade. + const processed = await graph.isInputProcessed(workerName, workerVersion, v.uri); + if (!processed) scopes.push(v.uri); + continue; + } + const processed = await graph.isInputProcessed(workerName, workerVersion, v.uri); + if (processed) continue; + // Verify all chunks have embeddings. + const chunkPattern = `chunk:${v.uri.slice("text:".length)}#%`; + let allEmbedded = true; + let chunkCount = 0; + for await (const c of graph.find(chunkPattern)) { + if (c.status === "removed") continue; + chunkCount += 1; + const emb = await graph.getState(`embedding://${c.uri}`); + if (!emb || emb.status === "removed") { + allEmbedded = false; + break; + } + } + if (chunkCount === 0 || !allEmbedded) continue; + scopes.push(v.uri); + } + + for (const scope of scopes.slice(0, limit)) { + const text = await graph.getState(scope); + if (!text) continue; + yield { + uri: scope, + stamp: text.stamp, + status: text.status, + hash: text.hash, + scope, + role: "text", + attributes: text.attributes, + }; + if (text.status === "removed") continue; + + const chunkPattern = `chunk:${scope.slice("text:".length)}#%`; + const chunks: Array<{ + uri: string; + stamp: number; + status: Update["status"]; + hash?: string; + attributes?: Record; + }> = []; + for await (const c of graph.find(chunkPattern)) { + chunks.push({ + uri: c.uri, + stamp: c.stamp, + status: c.status, + hash: c.hash, + attributes: c.attributes, + }); + } + chunks.sort((a, b) => a.uri.localeCompare(b.uri)); + for (const c of chunks) { + yield { + uri: c.uri, + stamp: c.stamp, + status: c.status, + hash: c.hash, + 
scope, + role: "chunk", + attributes: c.attributes, + }; + const emb = await graph.getState(`embedding://${c.uri}`); + if (!emb) continue; + yield { + uri: `embedding://${c.uri}`, + stamp: emb.stamp, + status: emb.status, + hash: emb.hash, + scope, + role: "embedding", + attributes: emb.attributes, + }; + } + } +} + +async function* indexOne( + opts: IndexerOptions, + params: WorkerParams, + scope: string, + bag: ScopeBag, +): AsyncGenerator { + // Removal: any "removed" text/chunk/embedding for this scope drops the indexes. + if (bag.text && bag.text.status === "removed") { + opts.fts.remove(scope); + for (const c of bag.chunks) opts.vector.remove(c.uri); + // Also drop any prior chunk-based vectors that may live under the scope's + // chunks even if the chunks themselves weren't passed in. + const stamp = await params.stamp(); + yield { + uri: ftsUri(scope), + stamp, + status: "removed", + scope, + role: "fts-index", + }; + yield { + uri: vectorUri(scope), + stamp, + status: "removed", + scope, + role: "vector-index", + }; + return; + } + + // Build / update entries. + const chunkTexts: string[] = []; + for (const c of bag.chunks) { + const t = (c.attributes as Record)?.text as string; + if (typeof t === "string") chunkTexts.push(t); + } + opts.fts.upsert(scope, chunkTexts); + + for (const e of bag.embeddings) { + const v = (e.attributes as Record)?.vector as number[] | undefined; + if (!Array.isArray(v)) continue; + const arr = new Float32Array(v); + opts.vector.upsert(e.uri, arr); + } + + const stamp = await params.stamp(); + const ftsHash = await sha256Hex(chunkTexts.join("\n")); + const vecHash = await sha256Hex(bag.embeddings.map((e) => e.hash ?? 
"").join("|")); + yield { + uri: ftsUri(scope), + stamp, + status: "updated", + hash: ftsHash, + scope, + role: "fts-index", + }; + yield { + uri: vectorUri(scope), + stamp, + status: "updated", + hash: vecHash, + scope, + role: "vector-index", + }; +} diff --git a/packages/uri-graph/tests/e2e/pipeline.test.ts b/packages/uri-graph/tests/e2e/pipeline.test.ts new file mode 100644 index 0000000..eede2e3 --- /dev/null +++ b/packages/uri-graph/tests/e2e/pipeline.test.ts @@ -0,0 +1,162 @@ +import { mkdtempSync, rmSync } from "node:fs"; +import { tmpdir } from "node:os"; +import { join } from "node:path"; +import { newNodeTursoDb } from "@statewalker/db-turso-node"; +import { writeText } from "@statewalker/webrun-files"; +import { MemFilesApi } from "@statewalker/webrun-files-mem"; +import { afterEach, describe, expect, it } from "vitest"; +import { createOrchestrator } from "../../src/orchestrator/orchestrator.js"; +import { createInMemoryPersistence } from "../../src/store/memory/files-persistence.js"; +import { MemoryGraphStore } from "../../src/store/memory/store.js"; +import { SqlGraphStore } from "../../src/store/sql/store.js"; +import type { GraphStore } from "../../src/store/types.js"; +import { openGraphStore } from "../../src/store/types.js"; +import { createChunker } from "../../src/workers/chunker.js"; +import { createEmbedder } from "../../src/workers/embedder.js"; +import { createMarkdownExtractor } from "../../src/workers/extractors/markdown-extractor.js"; +import { createFileWatcher } from "../../src/workers/file-watcher.js"; +import { createMemoryFtsBackend } from "../../src/workers/index-backends/memory-fts.js"; +import { createMemoryVectorBackend } from "../../src/workers/index-backends/memory-vector.js"; +import { createIndexer } from "../../src/workers/indexer.js"; + +const tmpDirs: string[] = []; +afterEach(() => { + while (tmpDirs.length) { + const d = tmpDirs.pop(); + if (d) { + try { + rmSync(d, { recursive: true, force: true }); + } catch { + // 
ignore + } + } + } +}); + +interface E2EHarness { + store: GraphStore; + files: MemFilesApi; + fts: ReturnType; + vector: ReturnType; + cleanup: () => Promise; +} + +async function makeMemoryHarness(): Promise { + const files = new MemFilesApi(); + const store = await openGraphStore(new MemoryGraphStore(createInMemoryPersistence("graph"))); + return { + store, + files, + fts: createMemoryFtsBackend(), + vector: createMemoryVectorBackend(), + async cleanup() { + await (store as unknown as { close(): Promise }).close(); + }, + }; +} + +async function makeSqlHarness(): Promise { + const dir = mkdtempSync(join(tmpdir(), "uri-graph-e2e-")); + tmpDirs.push(dir); + const db = await newNodeTursoDb({ path: join(dir, "graph.db") }); + const store = await openGraphStore(new SqlGraphStore({ db })); + return { + store, + files: new MemFilesApi(), + fts: createMemoryFtsBackend(), + vector: createMemoryVectorBackend(), + async cleanup() { + await db.close(); + }, + }; +} + +async function runToFixpoint( + store: GraphStore, + files: MemFilesApi, + fts: ReturnType, + vector: ReturnType, + maxRounds = 30, +): Promise { + const orch = createOrchestrator({ graph: store, pollMs: 5 }); + await orch.registerWorker(createFileWatcher({ files, rootPath: "/" })); + await orch.registerWorker(createMarkdownExtractor({ files, graph: store })); + await orch.registerWorker(createChunker({ chunkSize: 5, graph: store })); + await orch.registerWorker( + createEmbedder({ + graph: store, + embed: async (text: string) => new Float32Array([text.length, text.charCodeAt(0) || 0]), + }), + ); + await orch.registerWorker(createIndexer({ graph: store, fts, vector })); + + const ac = new AbortController(); + const startPromise = orch.start(ac.signal); + + // Poll until any indexer outputs exist or timeout. 
+ for (let i = 0; i < maxRounds * 10; i++) { + await new Promise((r) => setTimeout(r, 20)); + const seen: string[] = []; + for await (const v of store.find("index://%")) seen.push(v.uri); + if (seen.length >= 2) break; + } + ac.abort(); + await startPromise; +} + +async function indexUris(store: GraphStore): Promise { + const out: string[] = []; + for await (const v of store.find("index://%")) { + if (v.status !== "removed") out.push(v.uri); + } + return out.sort(); +} + +describe("E2E pipeline (memory store)", () => { + it("two markdown files → fts + vector indexes built", async () => { + const h = await makeMemoryHarness(); + try { + await writeText(h.files, "/a.md", "hello world"); + await writeText(h.files, "/b.md", "another doc"); + await runToFixpoint(h.store, h.files, h.fts, h.vector); + const indexes = await indexUris(h.store); + expect(indexes).toContain("index://fts/text:///a.md"); + expect(indexes).toContain("index://fts/text:///b.md"); + expect(indexes).toContain("index://vector/text:///a.md"); + expect(indexes).toContain("index://vector/text:///b.md"); + expect(h.fts.query("hello").map((x) => x.scope)).toContain("text:///a.md"); + } finally { + await h.cleanup(); + } + }, 30000); + + it("non-markdown files do not produce indexes", async () => { + const h = await makeMemoryHarness(); + try { + await writeText(h.files, "/c.png", "binary-ish"); + await runToFixpoint(h.store, h.files, h.fts, h.vector, 5); + const indexes = await indexUris(h.store); + expect(indexes).toEqual([]); + } finally { + await h.cleanup(); + } + }, 15000); +}); + +describe("E2E pipeline (sql store)", () => { + it("two markdown files → fts + vector indexes built", async () => { + const h = await makeSqlHarness(); + try { + await writeText(h.files, "/a.md", "hello world"); + await writeText(h.files, "/b.md", "another doc"); + await runToFixpoint(h.store, h.files, h.fts, h.vector); + const indexes = await indexUris(h.store); + expect(indexes).toContain("index://fts/text:///a.md"); + 
expect(indexes).toContain("index://fts/text:///b.md"); + expect(indexes).toContain("index://vector/text:///a.md"); + expect(indexes).toContain("index://vector/text:///b.md"); + } finally { + await h.cleanup(); + } + }, 30000); +}); diff --git a/packages/uri-graph/tests/graph/selector-helpers.test.ts b/packages/uri-graph/tests/graph/selector-helpers.test.ts new file mode 100644 index 0000000..13ad54f --- /dev/null +++ b/packages/uri-graph/tests/graph/selector-helpers.test.ts @@ -0,0 +1,170 @@ +import { beforeEach, describe, expect, it } from "vitest"; +import { findDirty, joinInputs } from "../../src/graph/selector-helpers.js"; +import type { GraphStore } from "../../src/store/types.js"; +import type { Update } from "../../src/types/update.js"; +import { openTempMemoryStore } from "../helpers.js"; + +describe("findDirty", () => { + let store: GraphStore; + + beforeEach(async () => { + store = await openTempMemoryStore(); + // seed three URIs + await store.registerWorker({ name: "seed", version: "v1" }); + const s = await store.mintStamp(); + const txn = await store.beginTransaction({ + worker: "seed", + version: "v1", + scope: null, + initialStamp: s, + }); + for (const u of ["file:///a.md", "file:///b.md", "file:///c.txt"]) { + await txn.applyUpdate({ uri: u, stamp: s, status: "added", hash: u }); + } + await txn.commit(); + }); + + async function collect(it: AsyncIterable): Promise { + const out: T[] = []; + for await (const x of it) out.push(x); + return out; + } + + it("yields URIs matching the pattern that the worker has not processed", async () => { + const it = findDirty(store, { + forWorker: "ext", + forVersion: "v1", + uriLike: "file:///%.md", + limit: 10, + }); + const results = await collect(it); + expect(results.map((r) => r.uri).sort()).toEqual(["file:///a.md", "file:///b.md"]); + }); + + it("excludes URIs the worker has already processed at this version", async () => { + // simulate ext worker processed a.md at v1 + await store.registerWorker({ name: 
"ext", version: "v1" }); + const aState = await store.getState("file:///a.md"); + expect(aState).not.toBeNull(); + const s = await store.mintStamp(); + const txn = await store.beginTransaction({ + worker: "ext", + version: "v1", + scope: "file:///a.md", + initialStamp: s, + }); + await txn.recordInputs([{ uri: "file:///a.md", observedStamp: aState?.stamp ?? 0 }]); + await txn.applyUpdate({ + uri: "text:///a.md", + stamp: s, + status: "added", + hash: "h", + }); + await txn.commit(); + + const it = findDirty(store, { + forWorker: "ext", + forVersion: "v1", + uriLike: "file:///%.md", + limit: 10, + }); + const results = await collect(it); + expect(results.map((r) => r.uri)).toEqual(["file:///b.md"]); + }); + + it("re-yields a URI when the worker version is bumped", async () => { + await store.registerWorker({ name: "ext", version: "v1" }); + const aState = await store.getState("file:///a.md"); + const s = await store.mintStamp(); + const txn = await store.beginTransaction({ + worker: "ext", + version: "v1", + scope: "file:///a.md", + initialStamp: s, + }); + await txn.recordInputs([{ uri: "file:///a.md", observedStamp: aState?.stamp ?? 
0 }]); + await txn.applyUpdate({ + uri: "text:///a.md", + stamp: s, + status: "added", + hash: "h", + }); + await txn.commit(); + + const it = findDirty(store, { + forWorker: "ext", + forVersion: "v2", + uriLike: "file:///%.md", + limit: 10, + }); + const results = await collect(it); + expect(results.map((r) => r.uri).sort()).toEqual(["file:///a.md", "file:///b.md"]); + }); + + it("respects limit", async () => { + const it = findDirty(store, { + forWorker: "ext", + forVersion: "v1", + uriLike: "file:///%", + limit: 1, + }); + const results: Array<{ uri: string }> = []; + for await (const x of it) results.push(x); + expect(results.length).toBe(1); + }); +}); + +describe("joinInputs", () => { + async function* asyncIter(items: T[]): AsyncIterableIterator { + for (const x of items) yield x; + } + + async function collect(it: AsyncIterable): Promise { + const out: T[] = []; + for await (const x of it) out.push(x); + return out; + } + + it("merges streams ordered by scope, then role", async () => { + const a: Update[] = [ + { + uri: "text://x", + stamp: 1, + status: "added", + scope: "x", + role: "text", + }, + { + uri: "text://y", + stamp: 1, + status: "added", + scope: "y", + role: "text", + }, + ]; + const b: Update[] = [ + { + uri: "chunk://x#0", + stamp: 1, + status: "added", + scope: "x", + role: "chunk", + }, + { + uri: "chunk://y#0", + stamp: 1, + status: "added", + scope: "y", + role: "chunk", + }, + ]; + const merged = joinInputs(asyncIter(a), asyncIter(b)); + const result = await collect(merged); + expect(result.map((u) => `${u.scope}:${u.role}`)).toEqual([ + "x:chunk", + "x:text", + "y:chunk", + "y:text", + ]); + }); +}); diff --git a/packages/uri-graph/tests/helpers.ts b/packages/uri-graph/tests/helpers.ts new file mode 100644 index 0000000..725f1f8 --- /dev/null +++ b/packages/uri-graph/tests/helpers.ts @@ -0,0 +1,12 @@ +import { createInMemoryPersistence } from "../src/store/memory/files-persistence.js"; +import { MemoryGraphStore } from 
"../src/store/memory/store.js"; +import type { GraphStore } from "../src/store/types.js"; +import { openGraphStore } from "../src/store/types.js"; + +/** + * Open an in-memory `MemoryGraphStore` with throwaway persistence. Each call + * gets an isolated store; no filesystem involved. + */ +export async function openTempMemoryStore(key = "graph"): Promise { + return openGraphStore(new MemoryGraphStore(createInMemoryPersistence(key))); +} diff --git a/packages/uri-graph/tests/orchestrator/drain.test.ts b/packages/uri-graph/tests/orchestrator/drain.test.ts new file mode 100644 index 0000000..eb278ac --- /dev/null +++ b/packages/uri-graph/tests/orchestrator/drain.test.ts @@ -0,0 +1,131 @@ +import { beforeEach, describe, expect, it } from "vitest"; +import { drain } from "../../src/orchestrator/drain.js"; +import type { GraphStore } from "../../src/store/types.js"; +import type { Update } from "../../src/types/update.js"; +import type { WorkerDefinition } from "../../src/types/worker.js"; +import { openTempMemoryStore } from "../helpers.js"; + +describe("drain", () => { + let store: GraphStore; + + beforeEach(async () => { + store = await openTempMemoryStore(); + await store.registerWorker({ name: "w", version: "v1" }); + }); + + function makeWorker( + runFn: ( + input: AsyncIterable, + params: { stamp(): Promise }, + ) => AsyncGenerator, + ): WorkerDefinition { + return { + name: "w", + version: "v1", + selector: async function* () { + // empty + }, + run: async function* (params, input) { + yield* runFn(input, { stamp: params.stamp }); + }, + }; + } + + async function* asyncIter(items: T[]): AsyncIterableIterator { + for (const x of items) yield x; + } + + it("multiple updates with same stamp commit together", async () => { + const worker = makeWorker(async function* (_input, p) { + const s = await p.stamp(); + yield { uri: "u://a", stamp: s, status: "added", hash: "ha" }; + yield { uri: "u://b", stamp: s, status: "added", hash: "hb" }; + yield { uri: "u://c", stamp: 
s, status: "added", hash: "hc" }; + }); + await drain(worker, asyncIter([]), store); + const a = await store.getState("u://a"); + const b = await store.getState("u://b"); + const c = await store.getState("u://c"); + expect(a?.stamp).toBe(b?.stamp); + expect(b?.stamp).toBe(c?.stamp); + }); + + it("multiple stamps produce multiple commits", async () => { + const worker = makeWorker(async function* (_input, p) { + const s1 = await p.stamp(); + yield { uri: "u://a", stamp: s1, status: "added", hash: "ha" }; + const s2 = await p.stamp(); + yield { uri: "u://b", stamp: s2, status: "added", hash: "hb" }; + }); + await drain(worker, asyncIter([]), store); + const a = await store.getState("u://a"); + const b = await store.getState("u://b"); + expect(a).not.toBeNull(); + expect(b).not.toBeNull(); + expect(a?.stamp).not.toBe(b?.stamp); + if (a && b) expect(b.stamp).toBeGreaterThan(a.stamp); + }); + + it("generator throw rolls back current batch", async () => { + const worker = makeWorker(async function* (_input, p) { + const s = await p.stamp(); + yield { uri: "u://x", stamp: s, status: "added", hash: "h" }; + throw new Error("boom"); + }); + await expect(drain(worker, asyncIter([]), store)).rejects.toThrow(/boom/); + expect(await store.getState("u://x")).toBeNull(); + }); + + it("stamp regression aborts run", async () => { + const worker = makeWorker(async function* () { + yield { uri: "u://a", stamp: 100, status: "added", hash: "h1" }; + yield { uri: "u://b", stamp: 99, status: "added", hash: "h2" }; + }); + await expect(drain(worker, asyncIter([]), store)).rejects.toThrow(/stamp/i); + expect(await store.getState("u://a")).toBeNull(); + expect(await store.getState("u://b")).toBeNull(); + }); + + it("consumed inputs are recorded", async () => { + // Seed two committed input URIs. 
+ const seedStamp = await store.mintStamp(); + const seed = await store.beginTransaction({ + worker: "w", + version: "v1", + scope: null, + initialStamp: seedStamp, + }); + await seed.applyUpdate({ + uri: "in://1", + stamp: seedStamp, + status: "added", + hash: "1", + }); + await seed.applyUpdate({ + uri: "in://2", + stamp: seedStamp, + status: "added", + hash: "2", + }); + await seed.commit(); + + const inputs: Update[] = [ + { uri: "in://1", stamp: seedStamp, status: "added", hash: "1" }, + { uri: "in://2", stamp: seedStamp, status: "added", hash: "2" }, + ]; + const worker = makeWorker(async function* (input, p) { + for await (const u of input) { + const s = await p.stamp(); + yield { uri: `out://${u.uri}`, stamp: s, status: "added", hash: `h:${u.uri}` }; + } + }); + await drain(worker, asyncIter(inputs), store); + + // Both outputs should exist. + expect(await store.getState("out://in://1")).not.toBeNull(); + expect(await store.getState("out://in://2")).not.toBeNull(); + // priorOutputs should map input URIs back to their outputs. 
+ const prior1 = await store.priorOutputs("w", "in://1"); + expect(prior1.map((p) => p.uri)).toEqual(["out://in://1"]); + }); +}); diff --git a/packages/uri-graph/tests/orchestrator/orchestrator.test.ts b/packages/uri-graph/tests/orchestrator/orchestrator.test.ts new file mode 100644 index 0000000..785526e --- /dev/null +++ b/packages/uri-graph/tests/orchestrator/orchestrator.test.ts @@ -0,0 +1,134 @@ +import { beforeEach, describe, expect, it } from "vitest"; +import { createOrchestrator } from "../../src/orchestrator/orchestrator.js"; +import type { GraphStore } from "../../src/store/types.js"; +import type { WorkerDefinition } from "../../src/types/worker.js"; +import { openTempMemoryStore } from "../helpers.js"; + +describe("Orchestrator", () => { + let store: GraphStore; + + beforeEach(async () => { + store = await openTempMemoryStore(); + }); + + function makeOneShotWorker( + name: string, + version: string, + behavior: { selectorYields: number; runYields: number }, + ): { def: WorkerDefinition; selectorCalls: number; runCalls: number } { + const stats = { selectorCalls: 0, runCalls: 0 }; + let selectorEmitted = false; + const def: WorkerDefinition = { + name, + version, + selector: async function* () { + stats.selectorCalls += 1; + if (selectorEmitted) return; + for (let i = 0; i < behavior.selectorYields; i++) { + yield { + uri: `tick://${name}#${i}`, + stamp: 0, + status: "updated", + }; + } + selectorEmitted = true; + }, + run: async function* (params, input) { + stats.runCalls += 1; + // consume input fully + for await (const _ of input) { + // just drain + } + for (let i = 0; i < behavior.runYields; i++) { + const s = await params.stamp(); + yield { + uri: `out://${name}#${i}`, + stamp: s, + status: "added", + hash: `h:${i}`, + }; + } + }, + }; + return { + def, + get selectorCalls() { + return stats.selectorCalls; + }, + get runCalls() { + return stats.runCalls; + }, + }; + } + + it("invokes a worker when its selector has work", async () => { + const w 
= makeOneShotWorker("a", "v1", { selectorYields: 1, runYields: 2 }); + const orch = createOrchestrator({ graph: store, pollMs: 10 }); + await orch.registerWorker(w.def); + const ac = new AbortController(); + const startPromise = orch.start(ac.signal); + // Wait until run completes. + while (w.runCalls === 0) await new Promise((r) => setTimeout(r, 10)); + ac.abort(); + await startPromise; + expect(w.runCalls).toBe(1); + expect(await store.getState("out://a#0")).not.toBeNull(); + expect(await store.getState("out://a#1")).not.toBeNull(); + }); + + it("sleeps when no work is pending and stops on abort", async () => { + const orch = createOrchestrator({ graph: store, pollMs: 5 }); + const w = makeOneShotWorker("idle", "v1", { + selectorYields: 0, + runYields: 0, + }); + await orch.registerWorker(w.def); + const ac = new AbortController(); + const startPromise = orch.start(ac.signal); + await new Promise((r) => setTimeout(r, 50)); + ac.abort(); + await startPromise; + expect(w.runCalls).toBe(0); + expect(w.selectorCalls).toBeGreaterThan(0); + }); + + it("workers are addressable by name in run records", async () => { + const w = makeOneShotWorker("named", "v3", { + selectorYields: 1, + runYields: 1, + }); + const orch = createOrchestrator({ graph: store, pollMs: 5 }); + await orch.registerWorker(w.def); + const ac = new AbortController(); + const startPromise = orch.start(ac.signal); + while (w.runCalls === 0) await new Promise((r) => setTimeout(r, 5)); + ac.abort(); + await startPromise; + // priorOutputs by name should yield the run's outputs. 
+ const prior = await store.priorOutputs("named", "tick://named#0"); + expect(prior.map((p) => p.uri)).toEqual(["out://named#0"]); + }); + + it("bumping a worker version triggers reprocessing", async () => { + const w1 = makeOneShotWorker("ver", "v1", { + selectorYields: 1, + runYields: 1, + }); + const orch1 = createOrchestrator({ graph: store, pollMs: 5 }); + await orch1.registerWorker(w1.def); + const ac1 = new AbortController(); + const p1 = orch1.start(ac1.signal); + while (w1.runCalls === 0) await new Promise((r) => setTimeout(r, 5)); + ac1.abort(); + await p1; + + // Sanity: v1 ran once. + expect(w1.runCalls).toBe(1); + + // Now register v2 with same name; isInputProcessed should be false against new version. + const v1Done = await store.isInputProcessed("ver", "v1", "tick://ver#0"); + const v2Done = await store.isInputProcessed("ver", "v2", "tick://ver#0"); + expect(v1Done).toBe(true); + expect(v2Done).toBe(false); + }); +}); diff --git a/packages/uri-graph/tests/orchestrator/status.test.ts b/packages/uri-graph/tests/orchestrator/status.test.ts new file mode 100644 index 0000000..db5015b --- /dev/null +++ b/packages/uri-graph/tests/orchestrator/status.test.ts @@ -0,0 +1,43 @@ +import { describe, expect, it } from "vitest"; +import { createOrchestrator } from "../../src/orchestrator/orchestrator.js"; +import { openTempMemoryStore } from "../helpers.js"; + +describe("Orchestrator.status()", () => { + it("reports running flag and registered workers", async () => { + const store = await openTempMemoryStore(); + const orch = createOrchestrator({ graph: store, pollMs: 5 }); + await orch.registerWorker({ + name: "a", + version: "v1", + selector: async function* () { + // empty + }, + run: async function* () { + // empty + }, + }); + await orch.registerWorker({ + name: "b", + version: "v2", + selector: async function* () { + // empty + }, + run: async function* () { + // empty + }, + }); + const before = await orch.status(); + expect(before.running).toBe(false); + 
expect(before.workers.map((w) => `${w.name}:${w.version}`).sort()).toEqual(["a:v1", "b:v2"]); + + const ac = new AbortController(); + const startPromise = orch.start(ac.signal); + await new Promise((r) => setTimeout(r, 20)); + const during = await orch.status(); + expect(during.running).toBe(true); + ac.abort(); + await startPromise; + const after = await orch.status(); + expect(after.running).toBe(false); + }); +}); diff --git a/packages/uri-graph/tests/store/memory-snapshot.test.ts b/packages/uri-graph/tests/store/memory-snapshot.test.ts new file mode 100644 index 0000000..1ab2e4e --- /dev/null +++ b/packages/uri-graph/tests/store/memory-snapshot.test.ts @@ -0,0 +1,151 @@ +import { MemFilesApi } from "@statewalker/webrun-files-mem"; +import { describe, expect, it } from "vitest"; +import { createFilesPersistence } from "../../src/store/memory/files-persistence.js"; +import { MemoryGraphStore } from "../../src/store/memory/store.js"; +import { openGraphStore } from "../../src/store/types.js"; + +describe("MemoryGraphStore — files-backed persistence", () => { + it("opens with no existing snapshot and starts empty", async () => { + const files = new MemFilesApi(); + const store = await openGraphStore( + new MemoryGraphStore(createFilesPersistence(files, "/g.json")), + ); + expect(await store.getState("u://x")).toBeNull(); + expect(await store.mintStamp()).toBe(1); + }); + + it("opens with existing snapshot and restores state", async () => { + const files = new MemFilesApi(); + { + const store = await openGraphStore( + new MemoryGraphStore(createFilesPersistence(files, "/g.json")), + ); + await store.registerWorker({ name: "w", version: "v1" }); + const s = await store.mintStamp(); + const txn = await store.beginTransaction({ + worker: "w", + version: "v1", + scope: null, + initialStamp: s, + }); + await txn.applyUpdate({ + uri: "u://x", + stamp: s, + status: "added", + hash: "h", + }); + await txn.commit(); + await (store as unknown as { close(): Promise }).close(); 
+ } + const store2 = await openGraphStore( + new MemoryGraphStore(createFilesPersistence(files, "/g.json")), + ); + expect((await store2.getState("u://x"))?.hash).toBe("h"); + const next = await store2.mintStamp(); + expect(next).toBeGreaterThan(1); + }); + + it("snapshot omits pending data", async () => { + const files = new MemFilesApi(); + const store = await openGraphStore( + new MemoryGraphStore(createFilesPersistence(files, "/g.json")), + ); + await store.registerWorker({ name: "w", version: "v1" }); + const s = await store.mintStamp(); + const txn = await store.beginTransaction({ + worker: "w", + version: "v1", + scope: null, + initialStamp: s, + }); + await txn.applyUpdate({ uri: "u://x", stamp: s, status: "added", hash: "h" }); + // Without commit, snapshot should not contain u://x. Force one via rollback. + await txn.rollback(); + const decoder = new TextDecoder(); + const chunks: Uint8Array[] = []; + for await (const c of files.read("/g.json")) chunks.push(c); + const total = chunks.reduce((a, c) => a + c.length, 0); + const buf = new Uint8Array(total); + let off = 0; + for (const c of chunks) { + buf.set(c, off); + off += c.length; + } + const json = JSON.parse(decoder.decode(buf)); + expect(JSON.stringify(json)).not.toContain("u://x"); + }); + + it("commit triggers snapshot write", async () => { + const files = new MemFilesApi(); + const store = await openGraphStore( + new MemoryGraphStore(createFilesPersistence(files, "/g.json")), + ); + await store.registerWorker({ name: "w", version: "v1" }); + const s = await store.mintStamp(); + const txn = await store.beginTransaction({ + worker: "w", + version: "v1", + scope: null, + initialStamp: s, + }); + await txn.applyUpdate({ uri: "u://x", stamp: s, status: "added", hash: "h" }); + await txn.commit(); + expect(await files.exists("/g.json")).toBe(true); + }); + + it("second open against same path errors", async () => { + const files = new MemFilesApi(); + const persistence = createFilesPersistence(files, 
"/g.json"); + await openGraphStore(new MemoryGraphStore(persistence)); + await expect( + openGraphStore(new MemoryGraphStore(createFilesPersistence(files, "/g.json"))), + ).rejects.toThrow(/already open/); + }); + + it("applyUpdate stages without affecting reads", async () => { + const files = new MemFilesApi(); + const store = await openGraphStore( + new MemoryGraphStore(createFilesPersistence(files, "/g.json")), + ); + await store.registerWorker({ name: "w", version: "v1" }); + const s = await store.mintStamp(); + const txn = await store.beginTransaction({ + worker: "w", + version: "v1", + scope: null, + initialStamp: s, + }); + await txn.applyUpdate({ uri: "u://x", stamp: s, status: "added", hash: "h" }); + expect(await store.getState("u://x")).toBeNull(); + await txn.commit(); + expect(await store.getState("u://x")).not.toBeNull(); + }); + + it("commit promotes staging atomically", async () => { + const files = new MemFilesApi(); + const store = await openGraphStore( + new MemoryGraphStore(createFilesPersistence(files, "/g.json")), + ); + await store.registerWorker({ name: "w", version: "v1" }); + const s = await store.mintStamp(); + const txn = await store.beginTransaction({ + worker: "w", + version: "v1", + scope: null, + initialStamp: s, + }); + for (let i = 0; i < 10; i++) { + await txn.applyUpdate({ + uri: `u://${i}`, + stamp: s, + status: "added", + hash: `h${i}`, + }); + } + expect(await store.getState("u://0")).toBeNull(); + await txn.commit(); + for (let i = 0; i < 10; i++) { + expect(await store.getState(`u://${i}`)).not.toBeNull(); + } + }); +}); diff --git a/packages/uri-graph/tests/store/memory.test.ts b/packages/uri-graph/tests/store/memory.test.ts new file mode 100644 index 0000000..32dffde --- /dev/null +++ b/packages/uri-graph/tests/store/memory.test.ts @@ -0,0 +1,18 @@ +import { defineGraphStoreContract } from "../../src/store/contract.js"; +import { createInMemoryPersistence } from "../../src/store/memory/files-persistence.js"; +import { 
MemoryGraphStore } from "../../src/store/memory/store.js"; +import { openGraphStore } from "../../src/store/types.js"; + +defineGraphStoreContract("MemoryGraphStore", () => { + const persistence = createInMemoryPersistence("graph.json"); + return { + async open() { + const raw = new MemoryGraphStore(persistence); + return openGraphStore(raw); + }, + async close(store) { + const closable = store as { close?: () => Promise }; + if (closable.close) await closable.close(); + }, + }; +}); diff --git a/packages/uri-graph/tests/store/sql.test.ts b/packages/uri-graph/tests/store/sql.test.ts new file mode 100644 index 0000000..13e9190 --- /dev/null +++ b/packages/uri-graph/tests/store/sql.test.ts @@ -0,0 +1,49 @@ +import { mkdtempSync, rmSync } from "node:fs"; +import { tmpdir } from "node:os"; +import { join } from "node:path"; +import type { Db } from "@statewalker/db-api"; +import { newNodeTursoDb } from "@statewalker/db-turso-node"; +import { afterEach } from "vitest"; +import { defineGraphStoreContract } from "../../src/store/contract.js"; +import { SqlGraphStore } from "../../src/store/sql/store.js"; +import { openGraphStore } from "../../src/store/types.js"; + +const tmpDirs: string[] = []; + +afterEach(() => { + while (tmpDirs.length) { + const dir = tmpDirs.pop(); + if (dir) { + try { + rmSync(dir, { recursive: true, force: true }); + } catch { + // ignore + } + } + } +}); + +defineGraphStoreContract("SqlGraphStore", () => { + const dir = mkdtempSync(join(tmpdir(), "uri-graph-sql-")); + tmpDirs.push(dir); + const dbPath = join(dir, "graph.db"); + const opened: { db: Db; store: SqlGraphStore }[] = []; + return { + async open() { + const db = await newNodeTursoDb({ path: dbPath }); + const store = new SqlGraphStore({ db }); + opened.push({ db, store }); + return openGraphStore(store); + }, + async close(store) { + const idx = opened.findIndex((o) => o.store === (store as SqlGraphStore)); + if (idx >= 0) { + const entry = opened.splice(idx, 1)[0]; + if (entry) { + 
await entry.store.close(); + await entry.db.close(); + } + } + }, + }; +}); diff --git a/packages/uri-graph/tests/types/update.test.ts b/packages/uri-graph/tests/types/update.test.ts new file mode 100644 index 0000000..17750b8 --- /dev/null +++ b/packages/uri-graph/tests/types/update.test.ts @@ -0,0 +1,50 @@ +import { describe, expectTypeOf, it } from "vitest"; +import type { ReadOnlyView, Status, Update } from "../../src/types/update.js"; + +describe("Update", () => { + it("requires uri, stamp, status", () => { + expectTypeOf().toHaveProperty("uri").toEqualTypeOf(); + expectTypeOf().toHaveProperty("stamp").toEqualTypeOf(); + expectTypeOf().toHaveProperty("status").toEqualTypeOf(); + }); + + it("Status is the three-value union", () => { + expectTypeOf().toEqualTypeOf<"added" | "updated" | "removed">(); + }); + + it("hash, scope, role, attributes are optional", () => { + const minimal: Update = { uri: "u", stamp: 1, status: "added" }; + expectTypeOf(minimal).toMatchTypeOf(); + const full: Update = { + uri: "u", + stamp: 1, + status: "updated", + hash: "h", + scope: "s", + role: "r", + attributes: { k: 1 }, + }; + expectTypeOf(full).toMatchTypeOf(); + }); +}); + +describe("ReadOnlyView", () => { + it("has uri, stamp, status, optional hash and attributes", () => { + const view: ReadOnlyView = { uri: "u", stamp: 1, status: "added" }; + expectTypeOf(view).toMatchTypeOf(); + const view2: ReadOnlyView = { + uri: "u", + stamp: 1, + status: "removed", + hash: "h", + attributes: { x: true }, + }; + expectTypeOf(view2).toMatchTypeOf(); + }); + + it("does not carry scope or role", () => { + type ROVKeys = keyof ReadOnlyView; + expectTypeOf<"scope" extends ROVKeys ? true : false>().toEqualTypeOf(); + expectTypeOf<"role" extends ROVKeys ? 
true : false>().toEqualTypeOf(); + }); +}); diff --git a/packages/uri-graph/tests/types/worker.test.ts b/packages/uri-graph/tests/types/worker.test.ts new file mode 100644 index 0000000..e4e11c3 --- /dev/null +++ b/packages/uri-graph/tests/types/worker.test.ts @@ -0,0 +1,69 @@ +import { describe, expectTypeOf, it } from "vitest"; +import type { ReadOnlyView, Update } from "../../src/types/update.js"; +import type { + Selector, + SelectorContext, + WorkerDefinition, + WorkerParams, +} from "../../src/types/worker.js"; + +describe("WorkerParams", () => { + it("exposes stamp, read, find, priorOutputs, recordRead, signal", () => { + expectTypeOf().toHaveProperty("stamp").toEqualTypeOf<() => Promise>(); + expectTypeOf() + .toHaveProperty("read") + .toEqualTypeOf<(uri: string) => Promise>(); + expectTypeOf() + .toHaveProperty("find") + .toEqualTypeOf<(pattern: string) => AsyncIterable>(); + expectTypeOf() + .toHaveProperty("priorOutputs") + .toEqualTypeOf<(inputUri: string) => Promise>(); + expectTypeOf() + .toHaveProperty("recordRead") + .toEqualTypeOf<(uri: string, role?: string) => void>(); + expectTypeOf().toHaveProperty("signal").toEqualTypeOf(); + }); +}); + +describe("Selector", () => { + it("is a function from SelectorContext to AsyncIterableIterator", () => { + expectTypeOf().toEqualTypeOf< + (ctx: SelectorContext) => AsyncIterableIterator + >(); + }); +}); + +describe("WorkerDefinition", () => { + it("requires name, version, selector, run; rest optional", () => { + const def: WorkerDefinition = { + name: "w", + version: "v1", + selector: async function* () { + // empty + }, + run: async function* () { + // empty + }, + }; + expectTypeOf(def).toMatchTypeOf(); + }); + + it("accepts inputPattern, outputPattern, scopeExpr, description", () => { + const def: WorkerDefinition = { + name: "w", + version: "v1", + description: "d", + inputPattern: "file://**", + outputPattern: "text://**", + scopeExpr: "uri", + selector: async function* () { + // empty + }, + run: async 
function* () { + // empty + }, + }; + expectTypeOf(def).toMatchTypeOf(); + }); +}); diff --git a/packages/uri-graph/tests/workers/chunker.test.ts b/packages/uri-graph/tests/workers/chunker.test.ts new file mode 100644 index 0000000..8f59b2b --- /dev/null +++ b/packages/uri-graph/tests/workers/chunker.test.ts @@ -0,0 +1,97 @@ +import { beforeEach, describe, expect, it } from "vitest"; +import { drain } from "../../src/orchestrator/drain.js"; +import type { GraphStore } from "../../src/store/types.js"; +import type { Update } from "../../src/types/update.js"; +import { createChunker } from "../../src/workers/chunker.js"; +import { openTempMemoryStore } from "../helpers.js"; + +describe("chunker", () => { + let store: GraphStore; + + beforeEach(async () => { + store = await openTempMemoryStore(); + }); + + async function feedTextUpdates(updates: Update[]): Promise { + const chunker = createChunker({ chunkSize: 5 }); + await store.registerWorker({ + name: chunker.name, + version: chunker.version, + }); + async function* feed(): AsyncIterableIterator { + for (const u of updates) yield u; + } + await drain(chunker, feed(), store); + } + + function textUpdate(uri: string, text: string, hash: string): Update { + return { + uri, + stamp: 0, + status: "added", + hash, + attributes: { text }, + }; + } + + async function chunkUris(): Promise { + const out: string[] = []; + for await (const v of store.find("chunk:///%")) out.push(v.uri); + return out.sort(); + } + + it("splits text into chunks under one shared stamp", async () => { + const text = "abcde" + "fghij" + "klmno"; // 15 chars, chunkSize 5 → 3 chunks + await feedTextUpdates([textUpdate("text:///a.md", text, "h")]); + const uris = await chunkUris(); + expect(uris).toEqual(["chunk:///a.md#0", "chunk:///a.md#1", "chunk:///a.md#2"]); + const stamps = await Promise.all(uris.map(async (u) => (await store.getState(u))?.stamp)); + expect(new Set(stamps).size).toBe(1); + }); + + it("stable chunk URIs across re-runs", async () 
=> { + const text = "abcdefghij"; // 10 chars → 2 chunks + await feedTextUpdates([textUpdate("text:///x.md", text, "h1")]); + const first = await chunkUris(); + await feedTextUpdates([textUpdate("text:///x.md", text, "h2")]); + const second = await chunkUris(); + expect(second).toEqual(first); + }); + + it("shrinking output cleans surplus chunks", async () => { + const long = "12345" + "67890" + "abcde" + "fghij" + "klmno"; // 25 chars → 5 chunks + await feedTextUpdates([textUpdate("text:///s.md", long, "h1")]); + const before = await chunkUris(); + expect(before.length).toBe(5); + + const short = "12345" + "67890" + "abcde"; // 15 chars → 3 chunks + await feedTextUpdates([textUpdate("text:///s.md", short, "h2")]); + + // The first 3 chunks should still exist; chunks #3 and #4 should be removed. + const c3 = await store.getState("chunk:///s.md#3"); + const c4 = await store.getState("chunk:///s.md#4"); + expect(c3?.status).toBe("removed"); + expect(c4?.status).toBe("removed"); + }); + + it("two text docs get distinct stamps", async () => { + await feedTextUpdates([ + textUpdate("text:///a.md", "hello world!", "ha"), + textUpdate("text:///b.md", "another doc", "hb"), + ]); + const a0 = await store.getState("chunk:///a.md#0"); + const b0 = await store.getState("chunk:///b.md#0"); + expect(a0?.stamp).not.toBe(b0?.stamp); + }); + + it("upstream removal cascades to chunks", async () => { + await feedTextUpdates([textUpdate("text:///a.md", "abcdefghij", "h")]); + expect((await store.getState("chunk:///a.md#0"))?.status).toBe("added"); + + await feedTextUpdates([{ uri: "text:///a.md", stamp: 0, status: "removed" }]); + const c0 = await store.getState("chunk:///a.md#0"); + const c1 = await store.getState("chunk:///a.md#1"); + expect(c0?.status).toBe("removed"); + expect(c1?.status).toBe("removed"); + }); +}); diff --git a/packages/uri-graph/tests/workers/embedder.test.ts b/packages/uri-graph/tests/workers/embedder.test.ts new file mode 100644 index 0000000..7daa3e8 --- 
/dev/null +++ b/packages/uri-graph/tests/workers/embedder.test.ts @@ -0,0 +1,92 @@ +import { beforeEach, describe, expect, it } from "vitest"; +import { drain } from "../../src/orchestrator/drain.js"; +import type { GraphStore } from "../../src/store/types.js"; +import type { Update } from "../../src/types/update.js"; +import { createEmbedder } from "../../src/workers/embedder.js"; +import { openTempMemoryStore } from "../helpers.js"; + +describe("embedder", () => { + let store: GraphStore; + + beforeEach(async () => { + store = await openTempMemoryStore(); + }); + + function chunkUpdate(uri: string, text: string): Update { + return { + uri, + stamp: 0, + status: "added", + hash: `h:${uri}`, + attributes: { text }, + }; + } + + it("yields one stamp per chunk", async () => { + const embedder = createEmbedder({ + embed: async (text: string) => new Float32Array([text.length, 0, 0]), + }); + await store.registerWorker({ + name: embedder.name, + version: embedder.version, + }); + const inputs = [ + chunkUpdate("chunk:///a.md#0", "hello"), + chunkUpdate("chunk:///a.md#1", "world"), + chunkUpdate("chunk:///a.md#2", "!"), + ]; + async function* feed(): AsyncIterableIterator { + for (const u of inputs) yield u; + } + await drain(embedder, feed(), store); + const e0 = await store.getState("embedding://chunk:///a.md#0"); + const e1 = await store.getState("embedding://chunk:///a.md#1"); + const e2 = await store.getState("embedding://chunk:///a.md#2"); + expect(e0?.stamp).not.toBe(e1?.stamp); + expect(e1?.stamp).not.toBe(e2?.stamp); + expect(e0?.stamp).not.toBe(e2?.stamp); + }); + + it("calls the embedding API once per chunk", async () => { + let calls = 0; + const embedder = createEmbedder({ + embed: async (_text: string) => { + calls += 1; + return new Float32Array([1, 2, 3]); + }, + }); + await store.registerWorker({ + name: embedder.name, + version: embedder.version, + }); + async function* feed(): AsyncIterableIterator { + yield chunkUpdate("chunk:///x#0", "a"); + yield 
chunkUpdate("chunk:///x#1", "b"); + } + await drain(embedder, feed(), store); + expect(calls).toBe(2); + }); + + it("respects abort: stops mid-stream", async () => { + const ac = new AbortController(); + const embedder = createEmbedder({ + embed: async (text: string) => { + if (text === "abort-here") ac.abort(); + return new Float32Array([1]); + }, + }); + await store.registerWorker({ + name: embedder.name, + version: embedder.version, + }); + async function* feed(): AsyncIterableIterator { + yield chunkUpdate("chunk:///a", "ok"); + yield chunkUpdate("chunk:///b", "abort-here"); + yield chunkUpdate("chunk:///c", "should-not-process"); + } + await drain(embedder, feed(), store, { signal: ac.signal }); + // a was processed; b might or might not have committed (its embed returned + // before abort took effect). c should NOT have been processed. + expect(await store.getState("embedding://chunk:///c")).toBeNull(); + }); +}); diff --git a/packages/uri-graph/tests/workers/extractors.test.ts b/packages/uri-graph/tests/workers/extractors.test.ts new file mode 100644 index 0000000..4569dea --- /dev/null +++ b/packages/uri-graph/tests/workers/extractors.test.ts @@ -0,0 +1,127 @@ +import { writeText } from "@statewalker/webrun-files"; +import { MemFilesApi } from "@statewalker/webrun-files-mem"; +import { beforeEach, describe, expect, it } from "vitest"; +import { drain } from "../../src/orchestrator/drain.js"; +import type { GraphStore } from "../../src/store/types.js"; +import type { Update } from "../../src/types/update.js"; +import { createHtmlExtractor } from "../../src/workers/extractors/html-extractor.js"; +import { createMarkdownExtractor } from "../../src/workers/extractors/markdown-extractor.js"; +import { createPlainTextExtractor } from "../../src/workers/extractors/plain-text-extractor.js"; +import { createFileWatcher } from "../../src/workers/file-watcher.js"; +import { openTempMemoryStore } from "../helpers.js"; + +describe("extractors", () => { + let store: 
GraphStore; + let files: MemFilesApi; + + beforeEach(async () => { + files = new MemFilesApi(); + store = await openTempMemoryStore(); + }); + + async function runWatcher(): Promise { + const watcher = createFileWatcher({ files, rootPath: "/" }); + await store.registerWorker({ + name: watcher.name, + version: watcher.version, + }); + async function* tick(): AsyncIterableIterator { + yield { uri: `tick://${watcher.name}`, stamp: 0, status: "updated" }; + } + await drain(watcher, tick(), store); + } + + async function runExtractor( + extractor: ReturnType, + ): Promise { + await store.registerWorker({ + name: extractor.name, + version: extractor.version, + }); + // Synthesize input from committed file:// URIs. + const inputs: Update[] = []; + for await (const v of store.find("file:///%")) { + inputs.push({ + uri: v.uri, + stamp: v.stamp, + status: v.status, + hash: v.hash, + attributes: v.attributes, + }); + } + async function* feed(): AsyncIterableIterator { + for (const u of inputs) yield u; + } + await drain(extractor, feed(), store); + } + + describe("markdown", () => { + it("matches only .md files and emits text://", async () => { + await writeText(files, "/a.md", "# Heading\n\nbody"); + await writeText(files, "/b.txt", "text only"); + await writeText(files, "/c.png", "binary"); + await runWatcher(); + await runExtractor(createMarkdownExtractor({ files })); + + const textViews: string[] = []; + for await (const v of store.find("text:///%")) textViews.push(v.uri); + expect(textViews.sort()).toEqual(["text:///a.md"]); + }); + + it("two files with identical content yield identical hashes", async () => { + await writeText(files, "/x.md", "same body"); + await writeText(files, "/y.md", "same body"); + await runWatcher(); + await runExtractor(createMarkdownExtractor({ files })); + const x = await store.getState("text:///x.md"); + const y = await store.getState("text:///y.md"); + expect(x?.hash).toBe(y?.hash); + expect(x?.hash).toBeTruthy(); + }); + + it("removed 
source cascades to text URI", async () => { + await writeText(files, "/a.md", "body"); + await runWatcher(); + await runExtractor(createMarkdownExtractor({ files })); + expect((await store.getState("text:///a.md"))?.status).toBe("added"); + + await files.remove("/a.md"); + await runWatcher(); + await runExtractor(createMarkdownExtractor({ files })); + expect((await store.getState("text:///a.md"))?.status).toBe("removed"); + }); + + it("ignores non-md files in the input stream", async () => { + await writeText(files, "/c.png", "binary"); + await runWatcher(); + await runExtractor(createMarkdownExtractor({ files })); + expect(await store.getState("text:///c.png")).toBeNull(); + }); + }); + + describe("plain text", () => { + it("matches only .txt files", async () => { + await writeText(files, "/a.md", "md"); + await writeText(files, "/b.txt", "plain"); + await runWatcher(); + await runExtractor(createPlainTextExtractor({ files })); + const textViews: string[] = []; + for await (const v of store.find("text:///%")) textViews.push(v.uri); + expect(textViews).toEqual(["text:///b.txt"]); + }); + }); + + describe("html", () => { + it("strips tags and emits plain text", async () => { + await writeText(files, "/a.html", "
<html><body><h1>Hi</h1><p>world</p></body></html>
"); + await runWatcher(); + await runExtractor(createHtmlExtractor({ files })); + const view = await store.getState("text:///a.html"); + expect(view).not.toBeNull(); + const text = (view?.attributes as Record)?.text as string; + expect(text).not.toContain("<"); + expect(text).toContain("Hi"); + expect(text).toContain("world"); + }); + }); +}); diff --git a/packages/uri-graph/tests/workers/file-watcher.test.ts b/packages/uri-graph/tests/workers/file-watcher.test.ts new file mode 100644 index 0000000..9b81dec --- /dev/null +++ b/packages/uri-graph/tests/workers/file-watcher.test.ts @@ -0,0 +1,130 @@ +import { writeText } from "@statewalker/webrun-files"; +import { MemFilesApi } from "@statewalker/webrun-files-mem"; +import { beforeEach, describe, expect, it } from "vitest"; +import { drain } from "../../src/orchestrator/drain.js"; +import type { GraphStore } from "../../src/store/types.js"; +import type { Update } from "../../src/types/update.js"; +import { createFileWatcher } from "../../src/workers/file-watcher.js"; +import { openTempMemoryStore } from "../helpers.js"; + +describe("file watcher", () => { + let store: GraphStore; + let files: MemFilesApi; + + beforeEach(async () => { + files = new MemFilesApi(); + store = await openTempMemoryStore(); + }); + + async function* singleTick(): AsyncIterableIterator { + yield { uri: "tick://file-watcher", stamp: 0, status: "updated" }; + } + + async function runOnce(rootPath: string): Promise { + const watcher = createFileWatcher({ files, rootPath }); + await store.registerWorker({ + name: watcher.name, + version: watcher.version, + }); + await drain(watcher, singleTick(), store); + } + + async function fileUris(): Promise { + const out: string[] = []; + for await (const v of store.find("file:///%")) out.push(v.uri); + return out.sort(); + } + + it("scans all files, not just one extension", async () => { + await writeText(files, "/a.md", "hello"); + await writeText(files, "/b.txt", "world"); + await writeText(files, 
"/c.png", "binary-ish"); + await writeText(files, "/d.pdf", "blob"); + await runOnce("/"); + expect(await fileUris()).toEqual([ + "file:///a.md", + "file:///b.txt", + "file:///c.png", + "file:///d.pdf", + ]); + }); + + it("emits added on first sighting and nothing on second sighting if unchanged", async () => { + await writeText(files, "/a.md", "hello"); + await runOnce("/"); + const firstStamp = (await store.getState("file:///a.md"))?.stamp; + await runOnce("/"); + const secondStamp = (await store.getState("file:///a.md"))?.stamp; + expect(secondStamp).toBe(firstStamp); + }); + + it("emits update when file mtime/size changes", async () => { + await writeText(files, "/a.md", "hello"); + await runOnce("/"); + const before = await store.getState("file:///a.md"); + // re-write with different content (different size) + await new Promise((r) => setTimeout(r, 5)); + await writeText(files, "/a.md", "hello world"); + await runOnce("/"); + const after = await store.getState("file:///a.md"); + expect(before).not.toBeNull(); + expect(after).not.toBeNull(); + if (before && after) { + expect(after.stamp).toBeGreaterThan(before.stamp); + } + }); + + it("emits removed when file is deleted", async () => { + await writeText(files, "/a.md", "hello"); + await runOnce("/"); + expect((await store.getState("file:///a.md"))?.status).toBe("added"); + await files.remove("/a.md"); + await runOnce("/"); + expect((await store.getState("file:///a.md"))?.status).toBe("removed"); + }); + + it("does not read file bytes", async () => { + await writeText(files, "/a.md", "hello"); + let readCalled = false; + const wrappedFiles = new Proxy(files, { + get(target, prop, receiver) { + if (prop === "read") { + return (...args: unknown[]) => { + readCalled = true; + // biome-ignore lint/suspicious/noExplicitAny: proxy passthrough + return (target as any).read(...args); + }; + } + return Reflect.get(target, prop, receiver); + }, + }); + const watcher = createFileWatcher({ files: wrappedFiles, rootPath: "/" 
}); + await store.registerWorker({ + name: watcher.name, + version: watcher.version, + }); + await drain(watcher, singleTick(), store); + expect(readCalled).toBe(false); + }); + + it("hash format is size:mtime", async () => { + await writeText(files, "/a.md", "hello"); + await runOnce("/"); + const view = await store.getState("file:///a.md"); + expect(view?.hash).toMatch(/^\d+:\d+$/); + }); + + it("empty FS yields nothing", async () => { + await runOnce("/"); + expect(await fileUris()).toEqual([]); + }); + + it("skips directories", async () => { + await writeText(files, "/sub/a.md", "hello"); + await runOnce("/"); + const uris = await fileUris(); + expect(uris).toEqual(["file:///sub/a.md"]); + // No URI for the directory itself. + expect(uris.some((u) => u.endsWith("/sub"))).toBe(false); + }); +}); diff --git a/packages/uri-graph/tests/workers/index-backends.test.ts b/packages/uri-graph/tests/workers/index-backends.test.ts new file mode 100644 index 0000000..43fe326 --- /dev/null +++ b/packages/uri-graph/tests/workers/index-backends.test.ts @@ -0,0 +1,54 @@ +import { describe, expect, it } from "vitest"; +import { createMemoryFtsBackend } from "../../src/workers/index-backends/memory-fts.js"; +import { createMemoryVectorBackend } from "../../src/workers/index-backends/memory-vector.js"; + +describe("memory FTS backend", () => { + it("indexes documents per scope and finds matches", () => { + const fts = createMemoryFtsBackend(); + fts.upsert("doc:a", ["hello world", "foo bar"]); + fts.upsert("doc:b", ["hello there"]); + const hits = fts + .query("hello") + .map((h) => h.scope) + .sort(); + expect(hits).toEqual(["doc:a", "doc:b"]); + }); + + it("delete removes a scope's entries", () => { + const fts = createMemoryFtsBackend(); + fts.upsert("doc:a", ["hello"]); + fts.upsert("doc:b", ["hello"]); + fts.remove("doc:a"); + const hits = fts.query("hello").map((h) => h.scope); + expect(hits).toEqual(["doc:b"]); + }); + + it("upsert replaces existing entries for a scope", () 
=> { + const fts = createMemoryFtsBackend(); + fts.upsert("doc:a", ["banana"]); + fts.upsert("doc:a", ["apple"]); + expect(fts.query("banana").length).toBe(0); + expect(fts.query("apple").length).toBe(1); + }); +}); + +describe("memory vector backend", () => { + it("stores vectors and finds nearest", () => { + const vec = createMemoryVectorBackend(); + vec.upsert("v:a", new Float32Array([1, 0])); + vec.upsert("v:b", new Float32Array([0, 1])); + vec.upsert("v:c", new Float32Array([0.9, 0.1])); + const top = vec.search(new Float32Array([1, 0]), 2); + expect(top[0]?.id).toBe("v:a"); + expect(top[1]?.id).toBe("v:c"); + }); + + it("delete removes the vector", () => { + const vec = createMemoryVectorBackend(); + vec.upsert("v:a", new Float32Array([1])); + vec.upsert("v:b", new Float32Array([1])); + vec.remove("v:a"); + const top = vec.search(new Float32Array([1]), 5); + expect(top.map((t) => t.id)).toEqual(["v:b"]); + }); +}); diff --git a/packages/uri-graph/tests/workers/indexer.test.ts b/packages/uri-graph/tests/workers/indexer.test.ts new file mode 100644 index 0000000..413f81f --- /dev/null +++ b/packages/uri-graph/tests/workers/indexer.test.ts @@ -0,0 +1,117 @@ +import { beforeEach, describe, expect, it } from "vitest"; +import { drain } from "../../src/orchestrator/drain.js"; +import type { GraphStore } from "../../src/store/types.js"; +import type { Update } from "../../src/types/update.js"; +import { createMemoryFtsBackend } from "../../src/workers/index-backends/memory-fts.js"; +import { createMemoryVectorBackend } from "../../src/workers/index-backends/memory-vector.js"; +import { createIndexer } from "../../src/workers/indexer.js"; +import { openTempMemoryStore } from "../helpers.js"; + +describe("indexer", () => { + let store: GraphStore; + + beforeEach(async () => { + store = await openTempMemoryStore(); + }); + + function indexerInputs(scope: string): Update[] { + return [ + { + uri: scope, + stamp: 0, + status: "added", + scope, + role: "text", + 
attributes: { text: "hello world" }, + }, + { + uri: `chunk:${scope.slice(5)}#0`, + stamp: 0, + status: "added", + scope, + role: "chunk", + attributes: { text: "hello world" }, + }, + { + uri: `embedding://chunk:${scope.slice(5)}#0`, + stamp: 0, + status: "added", + scope, + role: "embedding", + attributes: { vector: [1, 0, 0] }, + }, + ]; + } + + it("emits one fts and one vector index URI per ready scope", async () => { + const fts = createMemoryFtsBackend(); + const vec = createMemoryVectorBackend(); + const indexer = createIndexer({ fts, vector: vec }); + await store.registerWorker({ + name: indexer.name, + version: indexer.version, + }); + + const inputs = indexerInputs("text:///x.md"); + async function* feed(): AsyncIterableIterator { + for (const u of inputs) yield u; + } + await drain(indexer, feed(), store); + + expect(await store.getState("index://fts/text:///x.md")).not.toBeNull(); + expect(await store.getState("index://vector/text:///x.md")).not.toBeNull(); + // The FTS backend now indexes the scope. + expect(fts.query("hello").map((h) => h.scope)).toContain("text:///x.md"); + }); + + it("groups inputs by scope when they interleave (sorted by joinInputs upstream)", async () => { + const fts = createMemoryFtsBackend(); + const vec = createMemoryVectorBackend(); + const indexer = createIndexer({ fts, vector: vec }); + await store.registerWorker({ + name: indexer.name, + version: indexer.version, + }); + + const inputs = [...indexerInputs("text:///a.md"), ...indexerInputs("text:///b.md")].sort( + (x, y) => (x.scope ?? "").localeCompare(y.scope ?? 
""), + ); + async function* feed(): AsyncIterableIterator { + for (const u of inputs) yield u; + } + await drain(indexer, feed(), store); + expect(await store.getState("index://fts/text:///a.md")).not.toBeNull(); + expect(await store.getState("index://fts/text:///b.md")).not.toBeNull(); + }); + + it("removed text cascades indexes", async () => { + const fts = createMemoryFtsBackend(); + const vec = createMemoryVectorBackend(); + const indexer = createIndexer({ fts, vector: vec }); + await store.registerWorker({ + name: indexer.name, + version: indexer.version, + }); + + // First index it, then send a removed text update. + async function* feed1(): AsyncIterableIterator { + for (const u of indexerInputs("text:///r.md")) yield u; + } + await drain(indexer, feed1(), store); + expect(fts.query("hello").length).toBeGreaterThan(0); + + async function* feed2(): AsyncIterableIterator { + yield { + uri: "text:///r.md", + stamp: 0, + status: "removed", + scope: "text:///r.md", + role: "text", + }; + } + await drain(indexer, feed2(), store); + expect((await store.getState("index://fts/text:///r.md"))?.status).toBe("removed"); + expect((await store.getState("index://vector/text:///r.md"))?.status).toBe("removed"); + expect(fts.query("hello").length).toBe(0); + }); +}); diff --git a/packages/uri-graph/tsconfig.json b/packages/uri-graph/tsconfig.json new file mode 100644 index 0000000..c342f5b --- /dev/null +++ b/packages/uri-graph/tsconfig.json @@ -0,0 +1,17 @@ +{ + "extends": "../../tsconfig.base.json", + "compilerOptions": { + "lib": ["ES2022", "DOM"], + "types": ["node"], + "verbatimModuleSyntax": true, + "isolatedModules": true, + "noUnusedLocals": true, + "noUnusedParameters": true, + "noImplicitReturns": true, + "noFallthroughCasesInSwitch": true, + "noUncheckedIndexedAccess": true, + "noEmit": true + }, + "include": ["./src", "./tests"], + "exclude": ["node_modules", "dist"] +} From ef7e8a4a73debd0443e2011e0514bf1bbb61f7e5 Mon Sep 17 00:00:00 2001 From: Mikhail 
Kotelnikov Date: Tue, 28 Apr 2026 16:58:31 +0200 Subject: [PATCH 2/3] docs(uri-graph): rewrite README to follow standard structure; remove repo-internal links MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The previous README was minimal. Rewritten to follow the project's README rules (What it is / Why it exists / How to use / Examples / Internals / License) with self-contained content — no relative links to other packages or notes. Co-Authored-By: Claude Opus 4.7 (1M context) --- packages/uri-graph/README.md | 236 ++++++++++++++++++++++++++++++----- 1 file changed, 208 insertions(+), 28 deletions(-) diff --git a/packages/uri-graph/README.md b/packages/uri-graph/README.md index 3b5df79..3e81792 100644 --- a/packages/uri-graph/README.md +++ b/packages/uri-graph/README.md @@ -1,63 +1,243 @@ # @statewalker/uri-graph -Persistent URI dependency graph kernel. Workers are async generators that consume `Update` streams and yield `Update` streams; a single-writer orchestrator drives them to a fixpoint over a persistent graph state. +## What it is -Two interchangeable repository backends implement one `GraphStore` interface: +A persistent URI dependency graph kernel. Every observable thing — files, extracted texts, chunks, embeddings, indexes, diagnostics — is a URI with an `Update` (status, monotonic stamp, optional hash and attributes). Workers are async generators that consume `Update` streams and yield `Update` streams; a single-writer orchestrator drives them to a fixpoint over a persistent graph state. -- `MemoryGraphStore` — in-memory state with an abstract persistence interface (`lock` / `load` / `store` / `unlock`). Ship with a JSON-snapshot adapter over `FilesApi` for filesystem persistence, or use the in-process variant for tests. 
+The package ships two interchangeable storage backends behind one `GraphStore` interface: + +- `MemoryGraphStore` — in-memory state with an abstract persistence interface (`{key, lock, load, store, unlock}`); ships a JSON-snapshot adapter over `FilesApi` and an in-process variant for tests. - `SqlGraphStore` — libSQL/Turso (Node and browser/OPFS) via `@statewalker/db-api`'s `Db`. -Both pass the same shared contract test suite (`defineGraphStoreContract`), so worker code is identical across backends. +A bundled worker library (file watcher, markdown/text/html extractors, chunker, embedder, indexer with in-memory FTS + vector backends) wires the canonical pipeline `file:// → text:// → chunk:// → embedding:// → index://`. + +## Why it exists + +Earlier scanner-style pipelines (extract / split / embed / index) each carried their own ad-hoc state and re-did work after restart. This package replaces them with a uniform URI-shaped graph that: + +- Survives restarts. A crashed worker leaves no inconsistent state — `recoverOrphans` runs on every open. +- Avoids redundant work. The no-op rule (stamp bumps only on real content change) makes re-runs idempotent and stops downstream cascades when nothing changed. +- Composes cleanly. New file formats are added by writing one extractor; the watcher, chunker, embedder, and indexer are unchanged. +- Works in Node and browser. The same kernel runs over libSQL/Turso (Node + OPFS) or in-memory state with JSON persistence. -## When to use which store +The two-backend split exists because lighter scenarios (tests, CLI scripts, browsers without OPFS) do not need a SQL engine, and because the `MemoryGraphStore` boots faster — but worker code must not branch on backend. -Use `MemoryGraphStore` when state fits in process memory and persistence is one JSON file (or none): tests, scripted ETL jobs, browser-without-OPFS, embedded scenarios. 
The persistence layer is abstract — you provide `lock` / `load` / `store` / `unlock` callbacks; the package ships an `FilesApi` helper and an in-process helper. +## How to use -Use `SqlGraphStore` when graphs are large or need durable, query-friendly storage: long-lived daemons, multi-million-URI workloads, OPFS-backed browser deployments where the same libSQL database serves both the graph and the FTS5/vector index. Single-writer orchestration applies in both cases. +```sh +pnpm add @statewalker/uri-graph +``` -## Minimal Node bootstrap +Pick a backend, register workers, run the orchestrator to fixpoint: ```ts -import { newNodeTursoDb } from "@statewalker/db-turso-node"; -import { MemFilesApi } from "@statewalker/webrun-files-mem"; import { createOrchestrator, openGraphStore, - SqlGraphStore, - createChunker, - createEmbedder, + MemoryGraphStore, + createInMemoryPersistence, createFileWatcher, createMarkdownExtractor, + createChunker, + createEmbedder, createIndexer, createMemoryFtsBackend, createMemoryVectorBackend, } from "@statewalker/uri-graph"; -const db = await newNodeTursoDb({ path: "./graph.db" }); -const store = await openGraphStore(new SqlGraphStore({ db })); -const files = new MemFilesApi(); // or any FilesApi -const fts = createMemoryFtsBackend(); -const vector = createMemoryVectorBackend(); - +const store = await openGraphStore( + new MemoryGraphStore(createInMemoryPersistence()), +); const orch = createOrchestrator({ graph: store }); await orch.registerWorker(createFileWatcher({ files, rootPath: "/" })); await orch.registerWorker(createMarkdownExtractor({ files, graph: store })); await orch.registerWorker(createChunker({ chunkSize: 1000, graph: store })); await orch.registerWorker(createEmbedder({ graph: store, embed: yourEmbedFn })); -await orch.registerWorker(createIndexer({ graph: store, fts, vector })); +await orch.registerWorker( + createIndexer({ + graph: store, + fts: createMemoryFtsBackend(), + vector: createMemoryVectorBackend(), + }), +); 
const ac = new AbortController(); -process.on("SIGINT", () => ac.abort()); await orch.start(ac.signal); -await db.close(); ``` -## Browser / OPFS +### Choosing a backend + +| | `MemoryGraphStore` | `SqlGraphStore` | +|---|---|---| +| State | JS maps + JSON snapshot | libSQL tables | +| Persistence | `lock`/`load`/`store`/`unlock` callbacks | `Db` from `@statewalker/db-api` | +| Best for | tests, scripts, browser-without-OPFS | daemons, large graphs, OPFS browser | +| Same FTS + vector index store as graph | external | possible (FTS5 + `F32_BLOB` in same `Db`) | + +## Examples + +### Custom persistence for the memory store + +The memory store does not depend on `FilesApi`. Pass any implementation of the persistence contract: + +```ts +import { + MemoryGraphStore, + openGraphStore, + type MemoryPersistence, +} from "@statewalker/uri-graph"; + +const persistence: MemoryPersistence = { + key: "graph", + async lock(key) { + /* acquire — return a LockId */ + }, + async load(_id) { + /* return prior dump (JSON) or null */ + }, + async store(_id, dump) { + /* persist dump */ + }, + async unlock(_id) { + /* release */ + }, +}; +const store = await openGraphStore(new MemoryGraphStore(persistence)); +``` + +Two adapters ship: `createFilesPersistence(files, path)` (FilesApi/JSON, atomic via temp+move) and `createInMemoryPersistence(key?)` (process-local, no durability). + +### SQL backend + +```ts +import { newNodeTursoDb } from "@statewalker/db-turso-node"; +import { SqlGraphStore, openGraphStore } from "@statewalker/uri-graph"; + +const db = await newNodeTursoDb({ path: "./graph.db" }); +const store = await openGraphStore(new SqlGraphStore({ db })); +// ... use store ... 
+await db.close(); // caller owns Db lifecycle +``` + +### Writing a custom worker + +```ts +import type { WorkerDefinition } from "@statewalker/uri-graph"; +import { findDirty } from "@statewalker/uri-graph"; + +export function createUppercase(opts: { graph: GraphStore }): WorkerDefinition { + return { + name: "uppercase", + version: "v1", + inputPattern: "text:///%", + outputPattern: "upper://**", + selector: (ctx) => + findDirty(opts.graph, { + forWorker: ctx.workerName, + forVersion: ctx.workerVersion, + uriLike: "text:///%", + limit: ctx.limit, + }), + async *run(params, input) { + for await (const doc of input) { + const view = await params.read(doc.uri); + const text = (view?.attributes?.text as string) ?? ""; + const stamp = await params.stamp(); + yield { + uri: `upper://${doc.uri.slice("text:".length)}`, + stamp, + status: "updated", + hash: text.toUpperCase().length.toString(), + attributes: { text: text.toUpperCase() }, + }; + } + }, + }; +} +``` + +### Verifying both backends with one suite + +The contract test suite is exported. New `GraphStore` implementations ride on the same tests: + +```ts +import { defineGraphStoreContract } from "@statewalker/uri-graph"; + +defineGraphStoreContract("MyCustomStore", () => { + return { + async open() { + /* return GraphStore */ + }, + async close(store) { + /* tear down */ + }, + }; +}); +``` + +## Internals + +### Architectural decisions + +- **`GraphStore` is the abstraction, not raw SQL.** Lifting above `Db` lets `MemoryGraphStore` exist without a SQL-over-maps shim. Both backends pass the same `defineGraphStoreContract` suite — isofunctional by construction. +- **Logical transaction ≠ physical transaction.** A worker's stamp boundary opens a logical transaction; each `applyUpdate` runs as its own small physical transaction that stages writes into `uri_state_pending` (SQL) or a per-run pending map (memory). Promotion happens atomically on `commit`. Long worker calls between yields never hold writer locks. 
+- **Stamps bump only on real change (no-op rule).** The orchestrator compares `(status, hash)` against committed state. If unchanged, no stamp bump and no downstream cascade. This makes cycles terminate, replays safe, and "save without changes" cheap. +- **`advanced` requires committed yields.** Sentinel-tick selectors (file watcher) consume but produce no yields when nothing changed; the orchestrator treats those rounds as no-progress and sleeps. Without this guard, the fixpoint loop would spin. +- **Workers don't branch on backend.** The same `WorkerDefinition` runs against `MemoryGraphStore` and `SqlGraphStore`. The worker-library uses `findDirty(graph, ...)` selectors and `params.read(uri)` for live re-reads. +- **Memory persistence is abstract.** The store calls `lock(key)` once at open, then `load`/`store`/`unlock` against the resulting `LockId`. This decouples the store from any specific filesystem; FilesApi is one adapter, not a hard dependency. + +### Schema (SQL backend) + +Seven tables with the indexes shown below: + +``` +uri (id, text) ← URI interning +stamp_seq (id=1, next) ← strictly monotonic stamps +worker_registry (name, version, …) ← worker metadata +uri_state (uri_id, status, stamp, hash, attributes) ← committed truth +uri_state_pending (run_id, uri_id, …) ← staging +run (id, action, action_version, scope, stamp, outcome, …) ← run history +run_input (run_id, uri_id, role, observed_stamp) ← what each run consumed +run_output (run_id, uri_id, written_stamp, was_noop) ← what each run produced +``` + +### Crash recovery + +``` +beginTransaction → flush running run row +applyUpdate × N → no-op check; stage if changed (each = small physical txn) +commit → promote staging → committed (single physical txn) +rollback → drop staging, mark cancelled + +crash anywhere → on next openGraphStore, recoverOrphans: + UPDATE running runs → cancelled + DELETE pending rows for those runs + uri_state untouched +``` + +### Constraints + +- Single-writer orchestrator. 
Multi-process is out of scope; OPFS already enforces single-writer in browser. +- `MemoryGraphStore` is one process per `key`. A second open against the same key throws "already open". +- `MemoryGraphStore` keeps full state in memory; not intended for 5M-URI workloads. Use `SqlGraphStore` at scale. +- Workers must produce deterministic output URIs (function of input, never of time/randomness). Re-run = same URIs, no orphans. +- Stamps within one generator invocation must be non-decreasing; the orchestrator throws on regression. + +### Dependencies + +- `@statewalker/db-api` — abstract `Db` interface (only used by SQL backend; declared as a regular dependency so consumers wiring `MemoryGraphStore` still get the type). +- `@statewalker/webrun-files` — `FilesApi` interface used by the file watcher / extractors and by the optional `createFilesPersistence` helper. +- Dev: `@statewalker/db-turso-node`, `@statewalker/webrun-files-mem`, `@types/node`, vitest, biome, tsdown, rimraf, typescript. + +The kernel is environment-agnostic. Bootstrap helpers in `./node` and `./browser` are thin and pull in env-specific factories (`newNodeTursoDb` / `newBrowserTursoDb`, `NodeFilesApi` / `getOPFSFilesApi`). + +## Related -Same kernel, same workers. Swap `newNodeTursoDb` for `newBrowserTursoDb` (OPFS path) and `MemFilesApi` for an OPFS-backed `FilesApi`. `MemoryGraphStore` works in the browser too with the in-process persistence helper. +- `@statewalker/db-api` — abstract DB interface used by `SqlGraphStore`. +- `@statewalker/db-turso-node` / `@statewalker/db-turso-browser` — libSQL adapters. +- `@statewalker/webrun-files` (and `-mem` / `-node` / `-browser`) — FilesApi interface and implementations. +- `@statewalker/content-pipeline` — earlier scanner-based pipeline this package is positioned to replace. 
-## See also +## License -- Proposal: [openspec/changes/uri-dependency-graph-kernel/proposal.md](../../../../openspec/changes/uri-dependency-graph-kernel/proposal.md) -- Design: [openspec/changes/uri-dependency-graph-kernel/design.md](../../../../openspec/changes/uri-dependency-graph-kernel/design.md) -- Specs: [openspec/changes/uri-dependency-graph-kernel/specs/](../../../../openspec/changes/uri-dependency-graph-kernel/specs/) +MIT. From 5790299bb17e08a83432825338aea660f8bad1da Mon Sep 17 00:00:00 2001 From: Mikhail Kotelnikov Date: Fri, 1 May 2026 16:32:59 +0200 Subject: [PATCH 3/3] =?UTF-8?q?refactor(uri-graph):=20rewrite=20as=20minim?= =?UTF-8?q?al=20kernel=20=E2=80=94=20Engine=20+=20Store=20+=20watermarks?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replaces the prior package (~3200 LOC, 7 SQL tables, staging/run-history, content-hash no-op rule, scope/role machinery) with a minimal core: - Engine.runWorker / Engine.stabilize fixpoint over a Store interface. - Workers are async generators with selects/emits scheme prefixes; the engine streams resources whose latest event has stamp > worker watermark. - Per-worker watermark is the completion stamp minted after the worker generator finishes; by construction it is greater than every input or output stamp the run touched, so filter workers (consume input, produce nothing) advance their watermark and never spin. - Store has two interchangeable backends: MemoryStore (in-process, kept as production-grade for in-memory cell evaluation) and SqlStore (libSQL via @statewalker/db-api). Same WorkerFn runs against either. - Three append-only tables: resources(uri,stamp,status,meta), completions(worker,stamp,finished_at), workers(name,selects,emits) plus a single-row stamp_seq counter. No staging, no run history, no hashes. - Operator-driven: store.invalidate(prefix) emits 'removed' events to cascade re-execution; purgeResources / purgeCompletions for retention. 
- topoLayers is introspection-only; the engine is data-driven by stamps. Source: 521 LOC (engine 69, sql 224, memory 113, types/store/topo/index). Tests: 624 LOC including a shared store contract suite exercising both backends and an e2e pipeline test parametrised over MemoryStore + SqlStore. All 41 tests pass; tsc clean; biome clean. Design exploration: notes/2026-04/2026-04-25/10.dependencies-exporation.md Implementation plan: notes/2026-04/2026-04-25/11.dependencies-implementation-plan.md Minimal redesign: notes/2026-05/2026-05-01/notes.md --- packages/uri-graph/README.md | 267 +++++---------- packages/uri-graph/package.json | 12 +- packages/uri-graph/src/browser.ts | 2 - packages/uri-graph/src/engine.ts | 69 ++++ .../uri-graph/src/graph/selector-helpers.ts | 79 ----- packages/uri-graph/src/index.ts | 90 +---- packages/uri-graph/src/node.ts | 2 - packages/uri-graph/src/orchestrator/drain.ts | 164 --------- .../src/orchestrator/orchestrator.ts | 167 --------- packages/uri-graph/src/store/contract.ts | 323 ------------------ packages/uri-graph/src/store/memory.ts | 113 ++++++ .../src/store/memory/files-persistence.ts | 87 ----- .../uri-graph/src/store/memory/persistence.ts | 22 -- .../uri-graph/src/store/memory/snapshot.ts | 167 --------- packages/uri-graph/src/store/memory/state.ts | 94 ----- packages/uri-graph/src/store/memory/store.ts | 224 ------------ .../uri-graph/src/store/memory/transaction.ts | 133 -------- packages/uri-graph/src/store/sql.ts | 224 ++++++++++++ packages/uri-graph/src/store/sql/schema.ts | 103 ------ packages/uri-graph/src/store/sql/store.ts | 264 -------------- .../uri-graph/src/store/sql/transaction.ts | 153 --------- .../uri-graph/src/store/sql/uri-intern.ts | 28 -- packages/uri-graph/src/store/store.ts | 35 ++ packages/uri-graph/src/store/types.ts | 75 ---- packages/uri-graph/src/topo-layers.ts | 45 +++ packages/uri-graph/src/types.ts | 24 ++ packages/uri-graph/src/types/update.ts | 19 -- packages/uri-graph/src/types/worker.ts | 29 -- 
packages/uri-graph/src/util/hash.ts | 19 -- packages/uri-graph/src/workers/chunker.ts | 138 -------- packages/uri-graph/src/workers/embedder.ts | 121 ------- .../uri-graph/src/workers/extractors/base.ts | 120 ------- .../src/workers/extractors/html-extractor.ts | 27 -- .../workers/extractors/markdown-extractor.ts | 19 -- .../extractors/plain-text-extractor.ts | 14 - .../uri-graph/src/workers/file-watcher.ts | 110 ------ .../src/workers/index-backends/memory-fts.ts | 75 ---- .../workers/index-backends/memory-vector.ts | 46 --- packages/uri-graph/src/workers/indexer.ts | 255 -------------- packages/uri-graph/tests/e2e/pipeline.test.ts | 256 ++++++-------- packages/uri-graph/tests/engine.test.ts | 248 ++++++++++++++ .../tests/graph/selector-helpers.test.ts | 170 --------- packages/uri-graph/tests/helpers.ts | 12 - .../tests/orchestrator/drain.test.ts | 131 ------- .../tests/orchestrator/orchestrator.test.ts | 134 -------- .../tests/orchestrator/status.test.ts | 43 --- packages/uri-graph/tests/store/contract.ts | 200 +++++++++++ .../tests/store/memory-snapshot.test.ts | 151 -------- packages/uri-graph/tests/store/memory.test.ts | 21 +- packages/uri-graph/tests/store/sql.test.ts | 51 +-- packages/uri-graph/tests/topo-layers.test.ts | 45 +++ packages/uri-graph/tests/types/update.test.ts | 50 --- packages/uri-graph/tests/types/worker.test.ts | 69 ---- .../uri-graph/tests/workers/chunker.test.ts | 97 ------ .../uri-graph/tests/workers/embedder.test.ts | 92 ----- .../tests/workers/extractors.test.ts | 127 ------- .../tests/workers/file-watcher.test.ts | 130 ------- .../tests/workers/index-backends.test.ts | 54 --- .../uri-graph/tests/workers/indexer.test.ts | 117 ------- 59 files changed, 1218 insertions(+), 4938 deletions(-) delete mode 100644 packages/uri-graph/src/browser.ts create mode 100644 packages/uri-graph/src/engine.ts delete mode 100644 packages/uri-graph/src/graph/selector-helpers.ts delete mode 100644 packages/uri-graph/src/node.ts delete mode 100644 
packages/uri-graph/src/orchestrator/drain.ts delete mode 100644 packages/uri-graph/src/orchestrator/orchestrator.ts delete mode 100644 packages/uri-graph/src/store/contract.ts create mode 100644 packages/uri-graph/src/store/memory.ts delete mode 100644 packages/uri-graph/src/store/memory/files-persistence.ts delete mode 100644 packages/uri-graph/src/store/memory/persistence.ts delete mode 100644 packages/uri-graph/src/store/memory/snapshot.ts delete mode 100644 packages/uri-graph/src/store/memory/state.ts delete mode 100644 packages/uri-graph/src/store/memory/store.ts delete mode 100644 packages/uri-graph/src/store/memory/transaction.ts create mode 100644 packages/uri-graph/src/store/sql.ts delete mode 100644 packages/uri-graph/src/store/sql/schema.ts delete mode 100644 packages/uri-graph/src/store/sql/store.ts delete mode 100644 packages/uri-graph/src/store/sql/transaction.ts delete mode 100644 packages/uri-graph/src/store/sql/uri-intern.ts create mode 100644 packages/uri-graph/src/store/store.ts delete mode 100644 packages/uri-graph/src/store/types.ts create mode 100644 packages/uri-graph/src/topo-layers.ts create mode 100644 packages/uri-graph/src/types.ts delete mode 100644 packages/uri-graph/src/types/update.ts delete mode 100644 packages/uri-graph/src/types/worker.ts delete mode 100644 packages/uri-graph/src/util/hash.ts delete mode 100644 packages/uri-graph/src/workers/chunker.ts delete mode 100644 packages/uri-graph/src/workers/embedder.ts delete mode 100644 packages/uri-graph/src/workers/extractors/base.ts delete mode 100644 packages/uri-graph/src/workers/extractors/html-extractor.ts delete mode 100644 packages/uri-graph/src/workers/extractors/markdown-extractor.ts delete mode 100644 packages/uri-graph/src/workers/extractors/plain-text-extractor.ts delete mode 100644 packages/uri-graph/src/workers/file-watcher.ts delete mode 100644 packages/uri-graph/src/workers/index-backends/memory-fts.ts delete mode 100644 
packages/uri-graph/src/workers/index-backends/memory-vector.ts delete mode 100644 packages/uri-graph/src/workers/indexer.ts create mode 100644 packages/uri-graph/tests/engine.test.ts delete mode 100644 packages/uri-graph/tests/graph/selector-helpers.test.ts delete mode 100644 packages/uri-graph/tests/helpers.ts delete mode 100644 packages/uri-graph/tests/orchestrator/drain.test.ts delete mode 100644 packages/uri-graph/tests/orchestrator/orchestrator.test.ts delete mode 100644 packages/uri-graph/tests/orchestrator/status.test.ts create mode 100644 packages/uri-graph/tests/store/contract.ts delete mode 100644 packages/uri-graph/tests/store/memory-snapshot.test.ts create mode 100644 packages/uri-graph/tests/topo-layers.test.ts delete mode 100644 packages/uri-graph/tests/types/update.test.ts delete mode 100644 packages/uri-graph/tests/types/worker.test.ts delete mode 100644 packages/uri-graph/tests/workers/chunker.test.ts delete mode 100644 packages/uri-graph/tests/workers/embedder.test.ts delete mode 100644 packages/uri-graph/tests/workers/extractors.test.ts delete mode 100644 packages/uri-graph/tests/workers/file-watcher.test.ts delete mode 100644 packages/uri-graph/tests/workers/index-backends.test.ts delete mode 100644 packages/uri-graph/tests/workers/indexer.test.ts diff --git a/packages/uri-graph/README.md b/packages/uri-graph/README.md index 3e81792..cee606e 100644 --- a/packages/uri-graph/README.md +++ b/packages/uri-graph/README.md @@ -2,25 +2,29 @@ ## What it is -A persistent URI dependency graph kernel. Every observable thing — files, extracted texts, chunks, embeddings, indexes, diagnostics — is a URI with an `Update` (status, monotonic stamp, optional hash and attributes). Workers are async generators that consume `Update` streams and yield `Update` streams; a single-writer orchestrator drives them to a fixpoint over a persistent graph state. +A minimal persistent dependency graph for URI-shaped work. 
Every observable thing — a file, an extracted text, a chunk, an index, a transformed cell — is a URI. Each URI has a `Resource` (status, monotonic stamp, optional meta). Workers are async generators that consume an input stream of resources and yield an output stream. A fixpoint engine drives them to convergence. -The package ships two interchangeable storage backends behind one `GraphStore` interface: +The package is small on purpose: one `Engine`, one `Store` interface with two interchangeable backends (`MemoryStore`, `SqlStore`), and a `topoLayers` helper for introspection. There is no run history, no staging table, no hash-based no-op rule, no scope/role machinery — those concerns either belong to the worker or are handled implicitly by the watermark. -- `MemoryGraphStore` — in-memory state with an abstract persistence interface (`{key, lock, load, store, unlock}`); ships a JSON-snapshot adapter over `FilesApi` and an in-process variant for tests. -- `SqlGraphStore` — libSQL/Turso (Node and browser/OPFS) via `@statewalker/db-api`'s `Db`. +## Why it exists -A bundled worker library (file watcher, markdown/text/html extractors, chunker, embedder, indexer with in-memory FTS + vector backends) wires the canonical pipeline `file:// → text:// → chunk:// → embedding:// → index://`. +To replace heavier scanner-style pipelines with a single algorithm that works for: -## Why it exists +- file transformations (scan → extract → index) +- code transformations (TS/TSX → JS) +- code execution (ObservableHQ-style cells) +- in-process transient pipelines and durable ones, with the same code -Earlier scanner-style pipelines (extract / split / embed / index) each carried their own ad-hoc state and re-did work after restart. This package replaces them with a uniform URI-shaped graph that: +The algorithm: -- Survives restarts. A crashed worker leaves no inconsistent state — `recoverOrphans` runs on every open. -- Avoids redundant work. 
The no-op rule (stamp bumps only on real content change) makes re-runs idempotent and stops downstream cascades when nothing changed. -- Composes cleanly. New file formats are added by writing one extractor; the watcher, chunker, embedder, and indexer are unchanged. -- Works in Node and browser. The same kernel runs over libSQL/Turso (Node + OPFS) or in-memory state with JSON persistence. +1. Each worker declares a single input scheme (`selects`) and a single output scheme (`emits`). +2. The engine reads the worker's last completion stamp. +3. The engine streams every resource whose latest event has `stamp > watermark` and whose URI begins with `selects` into the worker. +4. The worker yields output resources, each carrying a fresh stamp it minted via `ctx.newStamp()`. Each yield is persisted immediately by the engine. +5. On clean completion, the engine mints another stamp and writes it to `completions(worker, stamp)`. By construction this stamp is larger than every input or output stamp the run touched. +6. `stabilize()` repeats this round until no worker progressed. -The two-backend split exists because lighter scenarios (tests, CLI scripts, browsers without OPFS) do not need a SQL engine, and because the `MemoryGraphStore` boots faster — but worker code must not branch on backend. +Crash safety falls out of the design: outputs are URI-keyed (idempotent on retry); the completion row is the last write of a run. A crashed run leaves no completion; the next round re-executes the same inputs and overwrites the same output URIs. 
## How to use @@ -28,215 +32,116 @@ The two-backend split exists because lighter scenarios (tests, CLI scripts, brow pnpm add @statewalker/uri-graph ``` -Pick a backend, register workers, run the orchestrator to fixpoint: - ```ts -import { - createOrchestrator, - openGraphStore, - MemoryGraphStore, - createInMemoryPersistence, - createFileWatcher, - createMarkdownExtractor, - createChunker, - createEmbedder, - createIndexer, - createMemoryFtsBackend, - createMemoryVectorBackend, -} from "@statewalker/uri-graph"; - -const store = await openGraphStore( - new MemoryGraphStore(createInMemoryPersistence()), -); -const orch = createOrchestrator({ graph: store }); -await orch.registerWorker(createFileWatcher({ files, rootPath: "/" })); -await orch.registerWorker(createMarkdownExtractor({ files, graph: store })); -await orch.registerWorker(createChunker({ chunkSize: 1000, graph: store })); -await orch.registerWorker(createEmbedder({ graph: store, embed: yourEmbedFn })); -await orch.registerWorker( - createIndexer({ - graph: store, - fts: createMemoryFtsBackend(), - vector: createMemoryVectorBackend(), - }), -); - -const ac = new AbortController(); -await orch.start(ac.signal); -``` +import { Engine, MemoryStore, type WorkerFn } from "@statewalker/uri-graph"; -### Choosing a backend +const store = new MemoryStore(); +const engine = new Engine(store); -| | `MemoryGraphStore` | `SqlGraphStore` | -|---|---|---| -| State | JS maps + JSON snapshot | libSQL tables | -| Persistence | `lock`/`load`/`store`/`unlock` callbacks | `Db` from `@statewalker/db-api` | -| Best for | tests, scripts, browser-without-OPFS | daemons, large graphs, OPFS browser | -| Same FTS + vector index store as graph | external | possible (FTS5 + `F32_BLOB` in same `Db`) | +const scanner: WorkerFn = async function* (input, ctx) { + for await (const _tick of input) { + const stamp = await ctx.newStamp(); + yield { uri: "file://a.md", stamp, status: "added" }; + yield { uri: "file://b.md", stamp, status: 
"added" }; + } +}; -## Examples +const extractor: WorkerFn = async function* (input, ctx) { + for await (const r of input) { + if (!r.uri.endsWith(".md")) continue; + const stamp = await ctx.newStamp(); + yield { uri: `text://${r.uri.slice("file://".length)}`, stamp, status: r.status }; + } +}; -### Custom persistence for the memory store +await engine.register({ name: "scanner", selects: "tick://", emits: "file://" }, scanner); +await engine.register({ name: "extractor", selects: "file://", emits: "text://" }, extractor); -The memory store does not depend on `FilesApi`. Pass any implementation of the persistence contract: +// publish a tick to wake up the source worker +await store.put({ uri: "tick://run", stamp: await store.newStamp(), status: "updated" }); -```ts -import { - MemoryGraphStore, - openGraphStore, - type MemoryPersistence, -} from "@statewalker/uri-graph"; - -const persistence: MemoryPersistence = { - key: "graph", - async lock(key) { - /* acquire — return a LockId */ - }, - async load(_id) { - /* return prior dump (JSON) or null */ - }, - async store(_id, dump) { - /* persist dump */ - }, - async unlock(_id) { - /* release */ - }, -}; -const store = await openGraphStore(new MemoryGraphStore(persistence)); +for await (const r of engine.stabilize()) { + console.log(r.uri, r.stamp, r.status); +} ``` -Two adapters ship: `createFilesPersistence(files, path)` (FilesApi/JSON, atomic via temp+move) and `createInMemoryPersistence(key?)` (process-local, no durability). 
+### Choosing a backend -### SQL backend +| | `MemoryStore` | `SqlStore` | +|---|---|---| +| State | JS maps | libSQL/SQLite tables | +| Persistence | none (in-process) | the underlying `Db` | +| Best for | unit tests, in-memory cell evaluation, scratch pipelines | daemons, durable indexes, cross-restart work | ```ts import { newNodeTursoDb } from "@statewalker/db-turso-node"; -import { SqlGraphStore, openGraphStore } from "@statewalker/uri-graph"; +import { Engine, SqlStore } from "@statewalker/uri-graph"; const db = await newNodeTursoDb({ path: "./graph.db" }); -const store = await openGraphStore(new SqlGraphStore({ db })); -// ... use store ... -await db.close(); // caller owns Db lifecycle +const store = new SqlStore(db); +const engine = new Engine(store); +// ... register, stabilize ... +await db.close(); // caller owns the Db ``` -### Writing a custom worker +Workers do not branch on backend. The same `WorkerFn` runs against either store. -```ts -import type { WorkerDefinition } from "@statewalker/uri-graph"; -import { findDirty } from "@statewalker/uri-graph"; - -export function createUppercase(opts: { graph: GraphStore }): WorkerDefinition { - return { - name: "uppercase", - version: "v1", - inputPattern: "text:///%", - outputPattern: "upper://**", - selector: (ctx) => - findDirty(opts.graph, { - forWorker: ctx.workerName, - forVersion: ctx.workerVersion, - uriLike: "text:///%", - limit: ctx.limit, - }), - async *run(params, input) { - for await (const doc of input) { - const view = await params.read(doc.uri); - const text = (view?.attributes?.text as string) ?? 
""; - const stamp = await params.stamp(); - yield { - uri: `upper://${doc.uri.slice("text:".length)}`, - stamp, - status: "updated", - hash: text.toUpperCase().length.toString(), - attributes: { text: text.toUpperCase() }, - }; - } - }, - }; -} -``` +### Operator actions -### Verifying both backends with one suite +- `store.invalidate(prefix)` — appends `'removed'` events for every live URI under the prefix. Downstream workers see these on the next `stabilize()` and cascade. +- `store.purgeResources({ keepLatestPerUri: true })` — collapses the event log to one row per URI. +- `store.purgeCompletions({ keepLatestPerWorker: N })` — trims completion history. +- `engine.unregister(name)` — removes the worker and its completion rows; resources stay. -The contract test suite is exported. New `GraphStore` implementations ride on the same tests: +### Introspection ```ts -import { defineGraphStoreContract } from "@statewalker/uri-graph"; - -defineGraphStoreContract("MyCustomStore", () => { - return { - async open() { - /* return GraphStore */ - }, - async close(store) { - /* tear down */ - }, - }; -}); -``` +import { topoLayers } from "@statewalker/uri-graph"; -## Internals +const layers = topoLayers(workers); +// [[w1, w2], [w3], [w4, w5]] — workers in the same layer have no dependency between them +``` -### Architectural decisions +`topoLayers` is for visualization and parallel scheduling decisions. The engine itself does not use it: the fixpoint loop is data-driven by stamps. -- **`GraphStore` is the abstraction, not raw SQL.** Lifting above `Db` lets `MemoryGraphStore` exist without a SQL-over-maps shim. Both backends pass the same `defineGraphStoreContract` suite — isofunctional by construction. -- **Logical transaction ≠ physical transaction.** A worker's stamp boundary opens a logical transaction; each `applyUpdate` runs as its own small physical transaction that stages writes into `uri_state_pending` (SQL) or a per-run pending map (memory). 
Promotion happens atomically on `commit`. Long worker calls between yields never hold writer locks. -- **Stamps bump only on real change (no-op rule).** The orchestrator compares `(status, hash)` against committed state. If unchanged, no stamp bump and no downstream cascade. This makes cycles terminate, replays safe, and "save without changes" cheap. -- **`advanced` requires committed yields.** Sentinel-tick selectors (file watcher) consume but produce no yields when nothing changed; the orchestrator treats those rounds as no-progress and sleeps. Without this guard, the fixpoint loop would spin. -- **Workers don't branch on backend.** The same `WorkerDefinition` runs against `MemoryGraphStore` and `SqlGraphStore`. The worker-library uses `findDirty(graph, ...)` selectors and `params.read(uri)` for live re-reads. -- **Memory persistence is abstract.** The store calls `lock(key)` once at open, then `load`/`store`/`unlock` against the resulting `LockId`. This decouples the store from any specific filesystem; FilesApi is one adapter, not a hard dependency. 
+## Internals -### Schema (SQL backend) +### Storage shape -Seven tables with the indexes shown below: +Two append-only tables plus a worker registry and a stamp counter: ``` -uri (id, text) ← URI interning -stamp_seq (id=1, next) ← strictly monotonic stamps -worker_registry (name, version, …) ← worker metadata -uri_state (uri_id, status, stamp, hash, attributes) ← committed truth -uri_state_pending (run_id, uri_id, …) ← staging -run (id, action, action_version, scope, stamp, outcome, …) ← run history -run_input (run_id, uri_id, role, observed_stamp) ← what each run consumed -run_output (run_id, uri_id, written_stamp, was_noop) ← what each run produced +stamp_seq (id=1, next) — single-row counter, only UPDATE in the system +resources (uri, stamp, status, meta) PK (uri, stamp) +workers (name, selects, emits) CRUD +completions (worker, stamp, finished_at) PK (worker, stamp) ``` -### Crash recovery +`get(uri)` reads `MAX(stamp)` per URI. `list({ prefix, afterStamp })` joins each URI's max-stamp row and filters. `allWatermarks()` is `SELECT worker, MAX(stamp) FROM completions GROUP BY worker`. The same shape backs `MemoryStore`. -``` -beginTransaction → flush running run row -applyUpdate × N → no-op check; stage if changed (each = small physical txn) -commit → promote staging → committed (single physical txn) -rollback → drop staging, mark cancelled - -crash anywhere → on next openGraphStore, recoverOrphans: - UPDATE running runs → cancelled - DELETE pending rows for those runs - uri_state untouched -``` +### Watermark semantics -### Constraints +A row in `completions(worker, stamp)` means: *this worker has fully processed every resource that existed when that stamp was minted.* Because the completion stamp is minted after the run finishes, it is strictly greater than every output stamp produced by the run, which is in turn greater than every input stamp consumed.
+ +A worker that consumes input but produces nothing (filter case) still advances its watermark — the engine writes a completion row whenever the worker's input stream yielded at least one resource. A worker that finds no input writes no completion row and re-runs cheaply on the next round. -- Single-writer orchestrator. Multi-process is out of scope; OPFS already enforces single-writer in browser. -- `MemoryGraphStore` is one process per `key`. A second open against the same key throws "already open". -- `MemoryGraphStore` keeps full state in memory; not intended for 5M-URI workloads. Use `SqlGraphStore` at scale. -- Workers must produce deterministic output URIs (function of input, never of time/randomness). Re-run = same URIs, no orphans. -- Stamps within one generator invocation must be non-decreasing; the orchestrator throws on regression. +### Crash safety -### Dependencies +- `put(resource)` is `INSERT OR REPLACE` keyed by `(uri, stamp)`; safe to re-emit the same URI/stamp pair. +- `markCompleted` runs only on clean generator completion; a crash leaves the watermark unchanged. +- On restart, the engine sees the unchanged watermark and re-runs the worker with the same inputs. Workers are required to be deterministic on URI keys; re-emitted outputs overwrite previous ones. -- `@statewalker/db-api` — abstract `Db` interface (only used by SQL backend; declared as a regular dependency so consumers wiring `MemoryGraphStore` still get the type). -- `@statewalker/webrun-files` — `FilesApi` interface used by the file watcher / extractors and by the optional `createFilesPersistence` helper. -- Dev: `@statewalker/db-turso-node`, `@statewalker/webrun-files-mem`, `@types/node`, vitest, biome, tsdown, rimraf, typescript. +### Constraints -The kernel is environment-agnostic. Bootstrap helpers in `./node` and `./browser` are thin and pull in env-specific factories (`newNodeTursoDb` / `newBrowserTursoDb`, `NodeFilesApi` / `getOPFSFilesApi`). +- Single-writer engine. 
Multi-process is out of scope; if you need it, run a single engine and feed work in. +- `selects` and `emits` are scheme prefixes, not glob patterns. If you need finer filtering, do it inside the worker. +- Workers must produce deterministic output URIs (a function of input, never of time/randomness). Re-running yields the same URIs and overwrites prior values. +- `MemoryStore` keeps full state in memory; not intended for huge graphs. ## Related -- `@statewalker/db-api` — abstract DB interface used by `SqlGraphStore`. -- `@statewalker/db-turso-node` / `@statewalker/db-turso-browser` — libSQL adapters. -- `@statewalker/webrun-files` (and `-mem` / `-node` / `-browser`) — FilesApi interface and implementations. -- `@statewalker/content-pipeline` — earlier scanner-based pipeline this package is positioned to replace. +- `@statewalker/db-api` — abstract `Db` interface used by `SqlStore`. +- `@statewalker/db-turso-node`, `@statewalker/db-turso-browser` — libSQL adapters that implement `Db`. ## License diff --git a/packages/uri-graph/package.json b/packages/uri-graph/package.json index 929f20f..9b52252 100644 --- a/packages/uri-graph/package.json +++ b/packages/uri-graph/package.json @@ -1,9 +1,9 @@ { "name": "@statewalker/uri-graph", - "version": "0.1.0", + "version": "0.2.0", "private": false, "type": "module", - "description": "Persistent URI dependency graph kernel: workers as async generators, single-writer orchestrator, two interchangeable repository backends (in-memory+JSON snapshot, SQL).", + "description": "Minimal persistent URI dependency graph: workers as async generators, stamp watermarks, fixpoint stabilizer over an in-memory or SQL store.", "homepage": "https://github.com/statewalker/statewalker-content", "author": { "name": "Mikhail Kotelnikov", @@ -15,9 +15,7 @@ "url": "git+ssh://git@github.com/statewalker/statewalker-content.git" }, "exports": { - ".": "./src/index.ts", - "./node": "./src/node.ts", - "./browser": "./src/browser.ts" + ".": 
"./src/index.ts" }, "files": [ "dist", @@ -34,12 +32,10 @@ "format": "biome format --write ." }, "dependencies": { - "@statewalker/db-api": "workspace:*", - "@statewalker/webrun-files": "catalog:" + "@statewalker/db-api": "workspace:*" }, "devDependencies": { "@statewalker/db-turso-node": "workspace:*", - "@statewalker/webrun-files-mem": "catalog:", "@types/node": "catalog:", "rimraf": "catalog:", "tsdown": "catalog:", diff --git a/packages/uri-graph/src/browser.ts b/packages/uri-graph/src/browser.ts deleted file mode 100644 index f14757c..0000000 --- a/packages/uri-graph/src/browser.ts +++ /dev/null @@ -1,2 +0,0 @@ -// Browser bootstrap helpers. Filled in once the orchestrator + workers land. -export {}; diff --git a/packages/uri-graph/src/engine.ts b/packages/uri-graph/src/engine.ts new file mode 100644 index 0000000..83a860d --- /dev/null +++ b/packages/uri-graph/src/engine.ts @@ -0,0 +1,69 @@ +import type { Store } from "./store/store.js"; +import type { Resource, Worker, WorkerContext, WorkerFn } from "./types.js"; + +export class Engine { + private fns = new Map(); + + constructor(private store: Store) {} + + async register(worker: Worker, fn: WorkerFn): Promise { + await this.store.saveWorker(worker); + this.fns.set(worker.name, fn); + } + + async unregister(name: string): Promise { + await this.store.deleteWorker(name); + this.fns.delete(name); + } + + async *runWorker(name: string): AsyncIterable { + const worker = await this.store.getWorker(name); + if (!worker) return; + const watermarks = await this.store.allWatermarks(); + yield* this.runOne(worker, watermarks.get(name) ?? 0); + } + + async *stabilize(): AsyncIterable { + for (;;) { + const watermarks = await this.store.allWatermarks(); + let progressed = false; + for await (const worker of this.store.listWorkers()) { + const watermark = watermarks.get(worker.name) ?? 
0; + for await (const r of this.runOne(worker, watermark)) { + progressed = true; + yield r; + } + } + if (!progressed) break; + } + } + + private async *runOne(worker: Worker, watermark: number): AsyncIterable { + const fn = this.fns.get(worker.name); + if (!fn) return; + + const store = this.store; + let consumed = false; + const input = (async function* () { + for await (const r of store.list({ prefix: worker.selects, afterStamp: watermark })) { + consumed = true; + yield r; + } + })(); + + const ctx: WorkerContext = { + newStamp: () => store.newStamp(), + read: (uri) => store.get(uri), + }; + + for await (const out of fn(input, ctx)) { + await store.put(out); + yield out; + } + + if (consumed) { + const completionStamp = await store.newStamp(); + await store.markCompleted(worker.name, completionStamp); + } + } +} diff --git a/packages/uri-graph/src/graph/selector-helpers.ts b/packages/uri-graph/src/graph/selector-helpers.ts deleted file mode 100644 index 2206f8a..0000000 --- a/packages/uri-graph/src/graph/selector-helpers.ts +++ /dev/null @@ -1,79 +0,0 @@ -import type { GraphReader, GraphStore } from "../store/types.js"; -import type { Update } from "../types/update.js"; - -export interface FindDirtyOptions { - forWorker: string; - forVersion: string; - uriLike: string; - limit: number; -} - -/** - * Yields `Update`s for URIs matching `uriLike` that the worker has NOT processed - * at its current version. Stops at `limit` URIs. - * - * Synthesizes one `Update` per matching URI from the URI's committed state. - * Sets `scope = uri` and `role = undefined`; multi-input workers should compose - * multiple `findDirty` calls via `joinInputs`. 
- */ -export async function* findDirty( - graph: GraphStore, - opts: FindDirtyOptions, -): AsyncIterableIterator { - let yielded = 0; - for await (const view of graph.find(opts.uriLike)) { - if (yielded >= opts.limit) break; - const processed = await graph.isInputProcessed(opts.forWorker, opts.forVersion, view.uri); - if (processed) continue; - yielded += 1; - yield { - uri: view.uri, - stamp: view.stamp, - status: view.status, - hash: view.hash, - scope: view.uri, - attributes: view.attributes, - }; - } -} - -/** - * Merges multiple `Update` streams, yielding all updates ordered by `(scope, role, uri)`. - * Inputs SHOULD already be ordered by scope so the merge is k-way; otherwise the - * helper buffers and sorts which may use more memory. - */ -export async function* joinInputs( - ...streams: Array> -): AsyncIterableIterator { - const all: Update[] = []; - await Promise.all( - streams.map(async (s) => { - for await (const u of s) all.push(u); - }), - ); - all.sort((a, b) => { - const sa = a.scope ?? ""; - const sb = b.scope ?? ""; - if (sa !== sb) return sa < sb ? -1 : 1; - const ra = a.role ?? ""; - const rb = b.role ?? ""; - if (ra !== rb) return ra < rb ? -1 : 1; - return a.uri < b.uri ? -1 : a.uri > b.uri ? 1 : 0; - }); - yield* all; -} - -/** - * A trivially-empty selector. Useful for source workers that need a non-empty - * tick selector to be polled by the orchestrator. Yields a single sentinel update. - */ -export async function* singleTickSelector(workerName: string): AsyncIterableIterator { - yield { - uri: `tick://${workerName}`, - stamp: 0, - status: "updated", - }; -} - -// Re-export GraphReader so consumers writing custom selectors can type their context. 
-export type { GraphReader }; diff --git a/packages/uri-graph/src/index.ts b/packages/uri-graph/src/index.ts index bc3041d..98a7793 100644 --- a/packages/uri-graph/src/index.ts +++ b/packages/uri-graph/src/index.ts @@ -1,81 +1,11 @@ -// Core types - -// Selector helpers -export { - type FindDirtyOptions, - findDirty, - joinInputs, - singleTickSelector, -} from "./graph/selector-helpers.js"; -export { type DrainOptions, type DrainResult, drain } from "./orchestrator/drain.js"; -// Orchestrator -export { - createOrchestrator, - type Orchestrator, - type OrchestratorOptions, - type OrchestratorStatusReport, -} from "./orchestrator/orchestrator.js"; -// Store interfaces and contract -export { - defineGraphStoreContract, - type GraphStoreHarness, - type GraphStoreHarnessFactory, -} from "./store/contract.js"; - -// Memory store -export { - createFilesPersistence, - createInMemoryPersistence, -} from "./store/memory/files-persistence.js"; -export type { Dump, LockId, MemoryPersistence } from "./store/memory/persistence.js"; -export { - MemoryGraphStore, - type MemoryGraphStoreOptions, -} from "./store/memory/store.js"; - -// SQL store -export { SqlGraphStore, type SqlGraphStoreOptions } from "./store/sql/store.js"; -export { - type BeginTransactionOpts, - type GraphReader, - type GraphStore, - type GraphTransaction, - openGraphStore, - type RecoverOrphansResult, - type RegisterWorkerInput, - type RegisterWorkerResult, -} from "./store/types.js"; +export { Engine } from "./engine.js"; +export { MemoryStore } from "./store/memory.js"; +export { SqlStore } from "./store/sql.js"; export type { - ReadOnlyView, - Status, - Update, -} from "./types/update.js"; -export type { - Selector, - SelectorContext, - WorkerDefinition, - WorkerParams, -} from "./types/worker.js"; -// Utilities -export { sha256Hex } from "./util/hash.js"; -// Workers -export { type ChunkerOptions, createChunker } from "./workers/chunker.js"; -export { createEmbedder, type EmbedderOptions } from 
"./workers/embedder.js"; -export { createHtmlExtractor } from "./workers/extractors/html-extractor.js"; -export { createMarkdownExtractor } from "./workers/extractors/markdown-extractor.js"; -export { createPlainTextExtractor } from "./workers/extractors/plain-text-extractor.js"; -export { - createFileWatcher, - type FileWatcherOptions, -} from "./workers/file-watcher.js"; -export { - createMemoryFtsBackend, - type FtsBackend, - type FtsHit, -} from "./workers/index-backends/memory-fts.js"; -export { - createMemoryVectorBackend, - type VectorBackend, - type VectorHit, -} from "./workers/index-backends/memory-vector.js"; -export { createIndexer, type IndexerOptions } from "./workers/indexer.js"; + ListOptions, + PurgeCompletionsOptions, + PurgeResourcesOptions, + Store, +} from "./store/store.js"; +export { topoLayers } from "./topo-layers.js"; +export type { Resource, Status, Worker, WorkerContext, WorkerFn } from "./types.js"; diff --git a/packages/uri-graph/src/node.ts b/packages/uri-graph/src/node.ts deleted file mode 100644 index e46a3f3..0000000 --- a/packages/uri-graph/src/node.ts +++ /dev/null @@ -1,2 +0,0 @@ -// Node bootstrap helpers. Filled in once the orchestrator + workers land. -export {}; diff --git a/packages/uri-graph/src/orchestrator/drain.ts b/packages/uri-graph/src/orchestrator/drain.ts deleted file mode 100644 index 9247009..0000000 --- a/packages/uri-graph/src/orchestrator/drain.ts +++ /dev/null @@ -1,164 +0,0 @@ -import type { GraphStore, GraphTransaction } from "../store/types.js"; -import type { Update } from "../types/update.js"; -import type { WorkerDefinition, WorkerParams } from "../types/worker.js"; - -export interface DrainOptions { - /** Yield to the event loop after every N committed updates. */ - yieldEveryN?: number; - /** Warn if a logical transaction stays open longer than this many ms. */ - txnWarnMs?: number; - /** Hook for warnings (used in tests). 
*/ - onWarn?: (msg: string) => void; - /** AbortSignal forwarded to the worker's run. */ - signal?: AbortSignal; -} - -export interface DrainResult { - /** Number of commits that produced at least one non-noop write. */ - committedWithChanges: number; - /** Number of commits regardless of changes. */ - commits: number; -} - -/** - * Drives a `WorkerDefinition.run` to completion against an `input` stream: - * - opens a logical transaction at the first yield of a new stamp, - * - applies every same-stamp update under that transaction, - * - commits at the stamp boundary and opens the next, - * - rolls back on generator throw, - * - asserts stamp monotonicity per generator invocation, - * - records every consumed input into the current run. - * - * Returns when the generator exhausts (success) or throws (error rethrown). - */ -export async function drain( - worker: WorkerDefinition, - input: AsyncIterable, - graph: GraphStore, - opts: DrainOptions = {}, -): Promise { - const yieldEveryN = opts.yieldEveryN ?? 100; - const txnWarnMs = opts.txnWarnMs ?? 200; - const warn = opts.onWarn ?? ((m) => console.warn(m)); - const signal = opts.signal ?? new AbortController().signal; - - // Tee input so we record every consumed update against the current run. - const consumed: Update[] = []; - async function* teeInput(): AsyncGenerator { - for await (const u of input) { - consumed.push(u); - yield u; - } - } - - const params: WorkerParams = { - stamp: () => graph.mintStamp(), - read: (uri) => graph.getState(uri), - find: (pattern) => graph.find(pattern), - priorOutputs: (uri) => graph.priorOutputs(worker.name, uri), - recordRead: (uri, role) => { - consumed.push({ - uri, - stamp: 0, - status: "updated", - ...(role !== undefined ? { role } : {}), - }); - }, - signal, - }; - - let txn: GraphTransaction | undefined; - let currentStamp: number | undefined; - let txnOpenedAt = 0; - let yieldedCount = 0; - - // Inputs consumed since last commit; flushed atomically with the commit. 
- const consumedAtCommit: Update[] = []; - - async function openTransaction(forStamp: number): Promise { - txn = await graph.beginTransaction({ - worker: worker.name, - version: worker.version, - scope: null, - initialStamp: forStamp, - }); - txnOpenedAt = performance.now(); - } - - async function commitCurrent(): Promise { - if (!txn) return; - if (consumedAtCommit.length > 0) { - await txn.recordInputs( - consumedAtCommit.map((u) => ({ - uri: u.uri, - observedStamp: u.stamp, - ...(u.role !== undefined ? { role: u.role } : {}), - })), - ); - consumedAtCommit.length = 0; - } - const elapsed = performance.now() - txnOpenedAt; - if (elapsed > txnWarnMs) { - warn(`${worker.name} stamp ${currentStamp} held logical txn ${elapsed.toFixed(0)}ms`); - } - await txn.commit(); - txn = undefined; - } - - async function rollbackCurrent(): Promise { - if (!txn) return; - try { - await txn.rollback(); - } finally { - txn = undefined; - } - } - - const gen = worker.run(params, teeInput()); - let commits = 0; - try { - while (true) { - const next = await gen.next(); - if (next.done) break; - const u = next.value; - - // Stamp regression guard. - if (currentStamp !== undefined && u.stamp < currentStamp) { - throw new Error(`stamp regression in ${worker.name}: ${currentStamp} → ${u.stamp}`); - } - - // Stamp boundary: close prior batch, open new one. - if (currentStamp !== undefined && u.stamp !== currentStamp) { - await commitCurrent(); - commits += 1; - } - if (!txn) { - await openTransaction(u.stamp); - } - // Always move newly-consumed inputs into the active batch. 
- consumedAtCommit.push(...consumed); - consumed.length = 0; - - currentStamp = u.stamp; - if (!txn) throw new Error("internal: txn missing"); - await txn.applyUpdate(u); - - yieldedCount += 1; - if (yieldedCount % yieldEveryN === 0) { - await new Promise((r) => setImmediate(r)); - } - } - - if (txn) { - await commitCurrent(); - commits += 1; - } - } catch (err) { - await rollbackCurrent(); - throw err; - } - return { - commits, - committedWithChanges: yieldedCount > 0 ? commits : 0, - }; -} diff --git a/packages/uri-graph/src/orchestrator/orchestrator.ts b/packages/uri-graph/src/orchestrator/orchestrator.ts deleted file mode 100644 index b85c160..0000000 --- a/packages/uri-graph/src/orchestrator/orchestrator.ts +++ /dev/null @@ -1,167 +0,0 @@ -import type { GraphStore } from "../store/types.js"; -import type { Update } from "../types/update.js"; -import type { WorkerDefinition } from "../types/worker.js"; -import { type DrainOptions, drain } from "./drain.js"; - -export interface OrchestratorOptions { - graph: GraphStore; - pollMs?: number; - selectorBatchSize?: number; - txnWarnMs?: number; - yieldEveryN?: number; - onWarn?: (msg: string) => void; - /** Optional logger for run failures. Defaults to console.error. */ - onRunError?: (workerName: string, err: unknown) => void; -} - -export interface OrchestratorStatusReport { - running: boolean; - workers: Array<{ - name: string; - version: string; - }>; -} - -export interface Orchestrator { - registerWorker(def: WorkerDefinition): Promise; - start(signal?: AbortSignal): Promise; - stop(): Promise; - status(): Promise; -} - -export function createOrchestrator(opts: OrchestratorOptions): Orchestrator { - const pollMs = opts.pollMs ?? 200; - const selectorBatchSize = opts.selectorBatchSize ?? 100; - const drainOpts: DrainOptions = { - yieldEveryN: opts.yieldEveryN, - txnWarnMs: opts.txnWarnMs, - onWarn: opts.onWarn, - }; - const onRunError = opts.onRunError ?? 
((name, e) => console.error(`worker ${name} failed:`, e)); - - const workers: WorkerDefinition[] = []; - let running = false; - let internalSignal: AbortController | undefined; - - async function pollOnce(signal: AbortSignal): Promise { - let advanced = false; - for (const w of workers) { - if (signal.aborted) return advanced; - const cursor = w.selector({ - workerName: w.name, - workerVersion: w.version, - limit: selectorBatchSize, - }); - const stream = await drainIfNonEmpty(cursor); - if (!stream) continue; - try { - const result = await drain(w, stream, opts.graph, { - ...drainOpts, - signal, - }); - // Only treat as progress when the worker actually committed real outputs. - // A worker whose run() consumes a sentinel tick and yields nothing must - // not loop the orchestrator forever. - if (result.committedWithChanges > 0) advanced = true; - } catch (err) { - onRunError(w.name, err); - } - } - return advanced; - } - - return { - async registerWorker(def: WorkerDefinition): Promise { - await opts.graph.registerWorker({ - name: def.name, - version: def.version, - description: def.description, - inputPattern: def.inputPattern, - outputPattern: def.outputPattern, - scopeExpr: def.scopeExpr, - }); - workers.push(def); - }, - async start(signal?: AbortSignal): Promise { - if (running) return; - running = true; - internalSignal = new AbortController(); - const composedSignal = mergeSignals(signal, internalSignal.signal); - - try { - while (!composedSignal.aborted) { - const advanced = await pollOnce(composedSignal); - if (composedSignal.aborted) break; - if (!advanced) { - await sleep(pollMs, composedSignal); - } - } - } finally { - running = false; - } - }, - async stop(): Promise { - internalSignal?.abort(); - }, - async status(): Promise { - return { - running, - workers: workers.map((w) => ({ name: w.name, version: w.version })), - }; - }, - }; -} - -async function drainIfNonEmpty( - it: AsyncIterableIterator, -): Promise | null> { - const first = await 
it.next(); - if (first.done) { - if (it.return) await it.return(undefined); - return null; - } - async function* prepended(): AsyncIterableIterator { - yield first.value; - while (true) { - const n = await it.next(); - if (n.done) return; - yield n.value; - } - } - return prepended(); -} - -function sleep(ms: number, signal: AbortSignal): Promise { - return new Promise((resolve) => { - if (signal.aborted) { - resolve(); - return; - } - const t = setTimeout(() => { - signal.removeEventListener("abort", onAbort); - resolve(); - }, ms); - const onAbort = (): void => { - clearTimeout(t); - signal.removeEventListener("abort", onAbort); - resolve(); - }; - signal.addEventListener("abort", onAbort, { once: true }); - }); -} - -function mergeSignals(...signals: Array): AbortSignal { - const ctrl = new AbortController(); - for (const s of signals) { - if (!s) continue; - if (s.aborted) { - ctrl.abort(); - return ctrl.signal; - } - s.addEventListener("abort", () => ctrl.abort(), { once: true }); - } - return ctrl.signal; -} - -// Re-export Update for consumers writing tests against the orchestrator. -export type { Update }; diff --git a/packages/uri-graph/src/store/contract.ts b/packages/uri-graph/src/store/contract.ts deleted file mode 100644 index a80d3d1..0000000 --- a/packages/uri-graph/src/store/contract.ts +++ /dev/null @@ -1,323 +0,0 @@ -import { beforeEach, describe, expect, it } from "vitest"; -import type { Update } from "../types/update.js"; -import type { GraphStore } from "./types.js"; - -/** - * Shape returned by a backend's contract harness factory. Each call to `open()` opens - * a store against the same backing (so persistence-boundary tests can re-open the - * same data). `abandon(store)` simulates a crash by closing the store without committing - * any in-flight transaction. 
- */ -export interface GraphStoreHarness { - open(): Promise; - close(store: GraphStore): Promise; -} - -export type GraphStoreHarnessFactory = () => GraphStoreHarness; - -/** - * Drains an `AsyncIterable` into an array. Helper for `find`-style scenarios. - */ -async function drain(iterable: AsyncIterable): Promise { - const out: T[] = []; - for await (const x of iterable) out.push(x); - return out; -} - -export function defineGraphStoreContract(name: string, factory: GraphStoreHarnessFactory): void { - describe(`GraphStore contract: ${name}`, () => { - let harness: GraphStoreHarness; - let store: GraphStore; - - beforeEach(async () => { - harness = factory(); - store = await harness.open(); - }); - - describe("reads", () => { - it("getState returns null for unknown URI", async () => { - expect(await store.getState("file:///unknown")).toBeNull(); - }); - - it("find yields matching URIs", async () => { - await store.registerWorker({ name: "seed", version: "v1" }); - const stamp1 = await store.mintStamp(); - const txn = await store.beginTransaction({ - worker: "seed", - version: "v1", - scope: null, - initialStamp: stamp1, - }); - for (const path of ["a.md", "b.md", "c.txt"]) { - await txn.applyUpdate({ - uri: `file:///${path}`, - stamp: stamp1, - status: "added", - hash: `h:${path}`, - }); - } - await txn.commit(); - const matches = await drain(store.find("file:///%.md")); - expect(matches.map((m) => m.uri).sort()).toEqual(["file:///a.md", "file:///b.md"]); - }); - - it("priorOutputs returns last successful run outputs for an input", async () => { - await store.registerWorker({ name: "ext", version: "v1" }); - const inputUri = "file:///x.md"; - const inputStamp = await store.mintStamp(); - const seed = await store.beginTransaction({ - worker: "ext", - version: "v1", - scope: null, - initialStamp: inputStamp, - }); - await seed.applyUpdate({ - uri: inputUri, - stamp: inputStamp, - status: "added", - hash: "ih", - }); - await seed.commit(); - - const outStamp = await 
store.mintStamp(); - const txn = await store.beginTransaction({ - worker: "ext", - version: "v1", - scope: inputUri, - initialStamp: outStamp, - }); - await txn.recordInputs([{ uri: inputUri, observedStamp: inputStamp }]); - await txn.applyUpdate({ - uri: "text:///x.md", - stamp: outStamp, - status: "added", - hash: "th", - }); - await txn.commit(); - - const prior = await store.priorOutputs("ext", inputUri); - expect(prior.map((p) => p.uri)).toEqual(["text:///x.md"]); - }); - }); - - describe("logical transaction lifecycle", () => { - it("commit promotes staged updates and clears staging", async () => { - await store.registerWorker({ name: "w", version: "v1" }); - const s = await store.mintStamp(); - const txn = await store.beginTransaction({ - worker: "w", - version: "v1", - scope: null, - initialStamp: s, - }); - await txn.applyUpdate({ uri: "u://a", stamp: s, status: "added", hash: "1" }); - await txn.applyUpdate({ uri: "u://b", stamp: s, status: "added", hash: "2" }); - await txn.commit(); - expect((await store.getState("u://a"))?.stamp).toBe(s); - expect((await store.getState("u://b"))?.stamp).toBe(s); - }); - - it("rollback discards staged updates", async () => { - await store.registerWorker({ name: "w", version: "v1" }); - const s = await store.mintStamp(); - const txn = await store.beginTransaction({ - worker: "w", - version: "v1", - scope: null, - initialStamp: s, - }); - await txn.applyUpdate({ uri: "u://a", stamp: s, status: "added", hash: "1" }); - await txn.applyUpdate({ uri: "u://b", stamp: s, status: "added", hash: "2" }); - await txn.rollback(); - expect(await store.getState("u://a")).toBeNull(); - expect(await store.getState("u://b")).toBeNull(); - }); - - it("reuse after commit throws", async () => { - await store.registerWorker({ name: "w", version: "v1" }); - const s = await store.mintStamp(); - const txn = await store.beginTransaction({ - worker: "w", - version: "v1", - scope: null, - initialStamp: s, - }); - await txn.commit(); - await 
expect( - txn.applyUpdate({ uri: "u://a", stamp: s, status: "added" }), - ).rejects.toThrow(); - }); - - it("reuse after rollback throws", async () => { - await store.registerWorker({ name: "w", version: "v1" }); - const s = await store.mintStamp(); - const txn = await store.beginTransaction({ - worker: "w", - version: "v1", - scope: null, - initialStamp: s, - }); - await txn.rollback(); - await expect(txn.commit()).rejects.toThrow(); - }); - - it("commit twice throws", async () => { - await store.registerWorker({ name: "w", version: "v1" }); - const s = await store.mintStamp(); - const txn = await store.beginTransaction({ - worker: "w", - version: "v1", - scope: null, - initialStamp: s, - }); - await txn.commit(); - await expect(txn.commit()).rejects.toThrow(); - }); - }); - - describe("no-op rule", () => { - it("identical content does not bump the stamp", async () => { - await store.registerWorker({ name: "w", version: "v1" }); - const s1 = await store.mintStamp(); - const txn1 = await store.beginTransaction({ - worker: "w", - version: "v1", - scope: null, - initialStamp: s1, - }); - await txn1.applyUpdate({ - uri: "u://x", - stamp: s1, - status: "added", - hash: "h", - }); - await txn1.commit(); - - const s2 = await store.mintStamp(); - const txn2 = await store.beginTransaction({ - worker: "w", - version: "v1", - scope: null, - initialStamp: s2, - }); - await txn2.applyUpdate({ - uri: "u://x", - stamp: s2, - status: "added", - hash: "h", - }); - await txn2.commit(); - - expect((await store.getState("u://x"))?.stamp).toBe(s1); - }); - - it("changed content bumps the stamp", async () => { - await store.registerWorker({ name: "w", version: "v1" }); - const s1 = await store.mintStamp(); - const txn1 = await store.beginTransaction({ - worker: "w", - version: "v1", - scope: null, - initialStamp: s1, - }); - await txn1.applyUpdate({ - uri: "u://x", - stamp: s1, - status: "added", - hash: "h1", - }); - await txn1.commit(); - - const s2 = await store.mintStamp(); - const 
txn2 = await store.beginTransaction({ - worker: "w", - version: "v1", - scope: null, - initialStamp: s2, - }); - await txn2.applyUpdate({ - uri: "u://x", - stamp: s2, - status: "updated", - hash: "h2", - }); - await txn2.commit(); - - expect((await store.getState("u://x"))?.stamp).toBe(s2); - }); - }); - - describe("stamps", () => { - it("two consecutive stamps differ", async () => { - const a = await store.mintStamp(); - const b = await store.mintStamp(); - expect(b).toBeGreaterThan(a); - }); - - it("ten concurrent stamps are distinct", async () => { - const stamps = await Promise.all(Array.from({ length: 10 }, () => store.mintStamp())); - const set = new Set(stamps); - expect(set.size).toBe(10); - }); - }); - - describe("worker registry", () => { - it("register same name and version is idempotent", async () => { - const r1 = await store.registerWorker({ name: "w", version: "v1" }); - const r2 = await store.registerWorker({ name: "w", version: "v1" }); - expect(r1.versionChanged).toBe(true); // first register is technically a change - expect(r2.versionChanged).toBe(false); - }); - - it("register with bumped version reports versionChanged", async () => { - await store.registerWorker({ name: "w", version: "v1" }); - const r = await store.registerWorker({ name: "w", version: "v2" }); - expect(r.versionChanged).toBe(true); - }); - }); - - describe("recovery", () => { - it("recoverOrphans is a no-op when no running runs exist", async () => { - const result = await store.recoverOrphans(); - expect(result.cancelled).toBe(0); - expect(result.pendingRowsDropped).toBe(0); - }); - - it("a crashed run leaves no committed state on next open", async () => { - await store.registerWorker({ name: "w", version: "v1" }); - const s = await store.mintStamp(); - const txn = await store.beginTransaction({ - worker: "w", - version: "v1", - scope: null, - initialStamp: s, - }); - await txn.applyUpdate({ - uri: "u://x", - stamp: s, - status: "added", - hash: "h", - }); - // Don't commit. 
Force-close. - await harness.close(store); - - // Reopen — `openGraphStore` runs recoverOrphans automatically. - store = await harness.open(); - expect(await store.getState("u://x")).toBeNull(); - // A subsequent recoverOrphans call is a no-op (idempotent). - const second = await store.recoverOrphans(); - expect(second.cancelled).toBe(0); - }); - }); - }); -} - -/** Minimal fixture for tests that just need to seed updates. */ -export function buildSeedUpdates(uris: string[], stamp: number): Update[] { - return uris.map((uri) => ({ - uri, - stamp, - status: "added" as const, - hash: `h:${uri}`, - })); -} diff --git a/packages/uri-graph/src/store/memory.ts b/packages/uri-graph/src/store/memory.ts new file mode 100644 index 0000000..578e273 --- /dev/null +++ b/packages/uri-graph/src/store/memory.ts @@ -0,0 +1,113 @@ +import type { Resource, Worker } from "../types.js"; +import type { + ListOptions, + PurgeCompletionsOptions, + PurgeResourcesOptions, + Store, +} from "./store.js"; + +export class MemoryStore implements Store { + private nextStampValue = 1; + private resourcesByUri = new Map(); + private workers = new Map(); + private completionsByWorker = new Map(); + + async newStamp(): Promise { + return this.nextStampValue++; + } + + async put(resource: Resource): Promise { + if (resource.stamp >= this.nextStampValue) { + this.nextStampValue = resource.stamp + 1; + } + const arr = this.resourcesByUri.get(resource.uri); + if (arr) { + arr.push(resource); + } else { + this.resourcesByUri.set(resource.uri, [resource]); + } + } + + async get(uri: string): Promise { + const arr = this.resourcesByUri.get(uri); + if (!arr || arr.length === 0) return undefined; + return arr[arr.length - 1]; + } + + async *list(options: ListOptions): AsyncIterable { + const after = options.afterStamp ?? 
0; + const matches: Resource[] = []; + for (const [uri, arr] of this.resourcesByUri) { + if (!uri.startsWith(options.prefix)) continue; + const latest = arr[arr.length - 1]; + if (latest && latest.stamp > after) matches.push(latest); + } + matches.sort((a, b) => a.stamp - b.stamp); + for (const r of matches) yield r; + } + + async saveWorker(worker: Worker): Promise { + this.workers.set(worker.name, { ...worker }); + } + + async deleteWorker(name: string): Promise { + this.workers.delete(name); + this.completionsByWorker.delete(name); + } + + async getWorker(name: string): Promise { + const w = this.workers.get(name); + return w ? { ...w } : undefined; + } + + async *listWorkers(): AsyncIterable { + for (const w of this.workers.values()) yield { ...w }; + } + + async markCompleted(worker: string, stamp: number): Promise { + const arr = this.completionsByWorker.get(worker); + if (arr) arr.push(stamp); + else this.completionsByWorker.set(worker, [stamp]); + } + + async allWatermarks(): Promise> { + const result = new Map(); + for (const [worker, stamps] of this.completionsByWorker) { + let max = 0; + for (const s of stamps) if (s > max) max = s; + result.set(worker, max); + } + return result; + } + + async invalidate(prefix: string): Promise { + const stamp = await this.newStamp(); + for (const [uri, arr] of this.resourcesByUri) { + if (!uri.startsWith(prefix)) continue; + const latest = arr[arr.length - 1]; + if (!latest || latest.status === "removed") continue; + arr.push({ uri, stamp, status: "removed" }); + } + } + + async purgeResources(options?: PurgeResourcesOptions): Promise { + if (options?.keepLatestPerUri !== true) return; + for (const [uri, arr] of this.resourcesByUri) { + if (arr.length > 1) { + const latest = arr[arr.length - 1]; + if (latest) this.resourcesByUri.set(uri, [latest]); + } + } + } + + async purgeCompletions(options?: PurgeCompletionsOptions): Promise { + const keep = options?.keepLatestPerWorker; + if (keep === undefined || keep < 1) 
return; + for (const [worker, stamps] of this.completionsByWorker) { + if (stamps.length > keep) { + stamps.sort((a, b) => a - b); + this.completionsByWorker.set(worker, stamps.slice(stamps.length - keep)); + } + } + } +} diff --git a/packages/uri-graph/src/store/memory/files-persistence.ts b/packages/uri-graph/src/store/memory/files-persistence.ts deleted file mode 100644 index 424eb7a..0000000 --- a/packages/uri-graph/src/store/memory/files-persistence.ts +++ /dev/null @@ -1,87 +0,0 @@ -import type { FilesApi } from "@statewalker/webrun-files"; -import { readText, writeText } from "@statewalker/webrun-files"; -import type { Dump, LockId, MemoryPersistence } from "./persistence.js"; - -/** - * Process-local lock map: rejects a second `lock(key)` against the same - * `(files, key)` while a prior LockId is still outstanding. - */ -const locks = new WeakMap>(); - -function acquireLock(files: FilesApi, key: string): LockId { - let map = locks.get(files); - if (!map) { - map = new Map(); - locks.set(files, map); - } - if (map.has(key)) { - throw new Error(`already open at ${key}`); - } - const id: LockId = `${key}@${Math.random().toString(36).slice(2, 10)}`; - map.set(key, id); - return id; -} - -function releaseLock(files: FilesApi, key: string, id: LockId): void { - const map = locks.get(files); - if (!map) return; - if (map.get(key) === id) map.delete(key); -} - -/** - * Produce a `MemoryPersistence` that stores the dump as a JSON file inside the - * given `FilesApi` at `path`. Suitable for Node + browser (OPFS) wiring. - * - * Atomic publish: writes to `.tmp`, removes the prior target, then moves - * the temp file. If the process crashes between `writeText` and `move`, the - * prior committed snapshot remains intact. 
- */ -export function createFilesPersistence(files: FilesApi, path: string): MemoryPersistence { - return { - key: path, - async lock(key) { - return acquireLock(files, key); - }, - async load() { - if (!(await files.exists(path))) return null; - const text = await readText(files, path); - if (!text.trim()) return null; - return JSON.parse(text) as Dump; - }, - async store(_id, dump) { - const tmp = `${path}.tmp`; - await writeText(files, tmp, JSON.stringify(dump)); - if (await files.exists(path)) await files.remove(path); - await files.move(tmp, path); - }, - async unlock(id) { - releaseLock(files, path, id); - }, - }; -} - -/** - * Produce a `MemoryPersistence` that keeps the dump in process memory only. - * Useful for tests that don't need durability across restarts. - */ -export function createInMemoryPersistence(key = "graph"): MemoryPersistence { - let dump: Dump | null = null; - let activeLock: LockId | null = null; - return { - key, - async lock(k) { - if (activeLock) throw new Error(`already open at ${k}`); - activeLock = `${k}@local`; - return activeLock; - }, - async load() { - return dump; - }, - async store(_id, value) { - dump = value; - }, - async unlock(_id) { - activeLock = null; - }, - }; -} diff --git a/packages/uri-graph/src/store/memory/persistence.ts b/packages/uri-graph/src/store/memory/persistence.ts deleted file mode 100644 index 5ec2e54..0000000 --- a/packages/uri-graph/src/store/memory/persistence.ts +++ /dev/null @@ -1,22 +0,0 @@ -/** - * Persistence interface for `MemoryGraphStore`. Decouples the in-memory store - * from any specific filesystem or storage layer. - * - * The `key` field identifies this store within the persistence layer's - * namespace. The store calls `lock(key)` once at open time to acquire a - * `LockId`, then uses that id for every subsequent `load` / `store` / - * `unlock` call. 
- * - * A second `lock(key)` call against the same key while a prior LockId is still - * outstanding SHOULD reject (single-writer guarantee). - */ -export type Dump = unknown; -export type LockId = string; - -export interface MemoryPersistence { - key: string; - lock: (key: string) => Promise; - load: (id: LockId) => Promise; - store: (id: LockId, dump: Dump) => Promise; - unlock: (id: LockId) => Promise; -} diff --git a/packages/uri-graph/src/store/memory/snapshot.ts b/packages/uri-graph/src/store/memory/snapshot.ts deleted file mode 100644 index ae6e153..0000000 --- a/packages/uri-graph/src/store/memory/snapshot.ts +++ /dev/null @@ -1,167 +0,0 @@ -import type { Dump } from "./persistence.js"; -import { - createEmptyState, - type RunInputRow, - type RunOutputRow, - type RunRow, - type State, - type UriStateEntry, - type WorkerRegistryEntry, -} from "./state.js"; - -interface SerializedUriState { - uri: string; - status: UriStateEntry["status"]; - stamp: number; - hash?: string; - attributes?: Record; -} - -interface SerializedRun extends Omit { - id: number; - inputs: Array<{ uri: string; role: string | null; observedStamp: number }>; - outputs: Array<{ uri: string; writtenStamp: number; wasNoop: boolean }>; -} - -export interface Snapshot { - schemaVersion: 1; - uris: Array<{ id: number; text: string }>; - state: SerializedUriState[]; - runs: SerializedRun[]; - workers: WorkerRegistryEntry[]; - stampSeq: number; - nextUriId: number; - nextRunId: number; -} - -export function serialize(state: State): Snapshot { - const liveUriIds = new Set(); - for (const uriId of state.uriState.keys()) liveUriIds.add(uriId); - for (const run of state.runs.values()) { - for (const ri of state.runInput.get(run.id) ?? []) liveUriIds.add(ri.uriId); - for (const ro of state.runOutput.get(run.id) ?? 
[]) liveUriIds.add(ro.uriId); - } - - const stateRows: SerializedUriState[] = []; - for (const [uriId, entry] of state.uriState) { - const text = state.uriById.get(uriId); - if (text === undefined) continue; - stateRows.push({ - uri: text, - status: entry.status, - stamp: entry.stamp, - hash: entry.hash, - attributes: entry.attributes, - }); - } - stateRows.sort((a, b) => a.uri.localeCompare(b.uri)); - - const runs: SerializedRun[] = []; - for (const run of state.runs.values()) { - const inputs: SerializedRun["inputs"] = []; - for (const ri of state.runInput.get(run.id) ?? []) { - const text = state.uriById.get(ri.uriId); - if (text === undefined) continue; - inputs.push({ uri: text, role: ri.role, observedStamp: ri.observedStamp }); - } - const outputs: SerializedRun["outputs"] = []; - for (const ro of state.runOutput.get(run.id) ?? []) { - const text = state.uriById.get(ro.uriId); - if (text === undefined) continue; - outputs.push({ - uri: text, - writtenStamp: ro.writtenStamp, - wasNoop: ro.wasNoop, - }); - } - runs.push({ ...run, inputs, outputs }); - } - runs.sort((a, b) => a.id - b.id); - - const uris: Snapshot["uris"] = []; - for (const [id, text] of state.uriById) { - if (!liveUriIds.has(id)) continue; - uris.push({ id, text }); - } - uris.sort((a, b) => a.id - b.id); - - const workers: WorkerRegistryEntry[] = []; - for (const w of state.workers.values()) workers.push(w); - workers.sort((a, b) => a.name.localeCompare(b.name)); - - return { - schemaVersion: 1, - uris, - state: stateRows, - runs, - workers, - stampSeq: state.stampSeq, - nextUriId: state.nextUriId, - nextRunId: state.nextRunId, - }; -} - -export function deserialize(snapshot: Snapshot): State { - if (snapshot.schemaVersion !== 1) { - throw new Error(`Unknown snapshot schemaVersion: ${snapshot.schemaVersion}`); - } - const state = createEmptyState(); - state.stampSeq = snapshot.stampSeq; - state.nextUriId = snapshot.nextUriId; - state.nextRunId = snapshot.nextRunId; - - for (const u of 
snapshot.uris) { - state.uriById.set(u.id, u.text); - state.uriIdByText.set(u.text, u.id); - } - for (const row of snapshot.state) { - const id = state.uriIdByText.get(row.uri); - if (id === undefined) continue; - state.uriState.set(id, { - status: row.status, - stamp: row.stamp, - hash: row.hash, - attributes: row.attributes, - }); - } - for (const run of snapshot.runs) { - const { inputs, outputs, ...rest } = run; - state.runs.set(run.id, rest); - const inputRows: RunInputRow[] = []; - for (const i of inputs) { - const id = state.uriIdByText.get(i.uri); - if (id === undefined) continue; - inputRows.push({ - uriId: id, - role: i.role, - observedStamp: i.observedStamp, - }); - } - state.runInput.set(run.id, inputRows); - const outputRows: RunOutputRow[] = []; - for (const o of outputs) { - const id = state.uriIdByText.get(o.uri); - if (id === undefined) continue; - outputRows.push({ - uriId: id, - writtenStamp: o.writtenStamp, - wasNoop: o.wasNoop, - }); - } - state.runOutput.set(run.id, outputRows); - } - for (const w of snapshot.workers) { - state.workers.set(w.name, w); - } - return state; -} - -export function dumpFromState(state: State): Dump { - return serialize(state) as unknown as Dump; -} - -export function stateFromDump(dump: Dump | null): State { - if (dump === null || dump === undefined) return createEmptyState(); - // Validate shape minimally; deserialize will throw on schema mismatch. 
- return deserialize(dump as Snapshot); -} diff --git a/packages/uri-graph/src/store/memory/state.ts b/packages/uri-graph/src/store/memory/state.ts deleted file mode 100644 index b5cdad8..0000000 --- a/packages/uri-graph/src/store/memory/state.ts +++ /dev/null @@ -1,94 +0,0 @@ -import type { Status } from "../../types/update.js"; - -export interface UriStateEntry { - status: Status; - stamp: number; - hash?: string; - attributes?: Record; -} - -export interface PendingEntry extends UriStateEntry { - uriId: number; -} - -export type RunOutcome = "running" | "success" | "cancelled" | "error"; - -export interface RunRow { - id: number; - action: string; - actionVersion: string; - scope: string | null; - stamp: number; - startedAt: number; - finishedAt: number | null; - outcome: RunOutcome; -} - -export interface RunInputRow { - uriId: number; - role: string | null; - observedStamp: number; -} - -export interface RunOutputRow { - uriId: number; - writtenStamp: number; - wasNoop: boolean; -} - -export interface WorkerRegistryEntry { - name: string; - version: string; - description: string | null; - inputPattern: string | null; - outputPattern: string | null; - scopeExpr: string | null; - registeredAt: number; - lastRunAt: number | null; -} - -/** Internal state kept in memory by `MemoryGraphStore`. 
*/ -export interface State { - schemaVersion: 1; - uriById: Map; - uriIdByText: Map; - nextUriId: number; - uriState: Map; - pending: Map>; // runId → uriId → entry - runs: Map; - nextRunId: number; - runInput: Map; - runOutput: Map; - workers: Map; - stampSeq: number; -} - -export function createEmptyState(): State { - return { - schemaVersion: 1, - uriById: new Map(), - uriIdByText: new Map(), - nextUriId: 1, - uriState: new Map(), - pending: new Map(), - runs: new Map(), - nextRunId: 1, - runInput: new Map(), - runOutput: new Map(), - workers: new Map(), - stampSeq: 0, - }; -} - -export function internUri(state: State, text: string): number { - const existing = state.uriIdByText.get(text); - if (existing !== undefined) return existing; - const id = state.nextUriId++; - state.uriById.set(id, text); - state.uriIdByText.set(text, id); - return id; -} - -export function getUriId(state: State, text: string): number | undefined { - return state.uriIdByText.get(text); -} diff --git a/packages/uri-graph/src/store/memory/store.ts b/packages/uri-graph/src/store/memory/store.ts deleted file mode 100644 index 8a52e7a..0000000 --- a/packages/uri-graph/src/store/memory/store.ts +++ /dev/null @@ -1,224 +0,0 @@ -import type { ReadOnlyView } from "../../types/update.js"; -import type { - BeginTransactionOpts, - GraphStore, - GraphTransaction, - RecoverOrphansResult, - RegisterWorkerInput, - RegisterWorkerResult, -} from "../types.js"; -import type { LockId, MemoryPersistence } from "./persistence.js"; -import { dumpFromState, stateFromDump } from "./snapshot.js"; -import { createEmptyState, internUri, type State } from "./state.js"; -import { MemoryTransaction } from "./transaction.js"; - -export type MemoryGraphStoreOptions = MemoryPersistence; - -function uriMatchesLikePattern(text: string, pattern: string): boolean { - // Translate SQL LIKE pattern (% any, _ one) to a RegExp. 
- let re = "^"; - for (const ch of pattern) { - if (ch === "%") re += ".*"; - else if (ch === "_") re += "."; - else re += ch.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"); - } - re += "$"; - return new RegExp(re).test(text); -} - -export class MemoryGraphStore implements GraphStore { - private persistence: MemoryPersistence; - private state: State = createEmptyState(); - private lockId: LockId | null = null; - private initialized = false; - private closed = false; - - constructor(options: MemoryGraphStoreOptions) { - this.persistence = options; - } - - async initialize(): Promise { - if (this.initialized) return; - this.lockId = await this.persistence.lock(this.persistence.key); - const dump = await this.persistence.load(this.lockId); - this.state = stateFromDump(dump); - this.initialized = true; - } - - async close(): Promise { - if (this.closed) return; - this.closed = true; - if (this.lockId !== null) { - await this.persistence.unlock(this.lockId); - this.lockId = null; - } - } - - private async flush(): Promise { - if (this.lockId === null) return; - await this.persistence.store(this.lockId, dumpFromState(this.state)); - } - - async getState(uri: string): Promise { - const id = this.state.uriIdByText.get(uri); - if (id === undefined) return null; - const entry = this.state.uriState.get(id); - if (!entry) return null; - return { - uri, - stamp: entry.stamp, - status: entry.status, - hash: entry.hash, - attributes: entry.attributes, - }; - } - - async *find(pattern: string): AsyncIterable { - for (const [uriId, entry] of this.state.uriState) { - const text = this.state.uriById.get(uriId); - if (text === undefined) continue; - if (!uriMatchesLikePattern(text, pattern)) continue; - yield { - uri: text, - stamp: entry.stamp, - status: entry.status, - hash: entry.hash, - attributes: entry.attributes, - }; - } - } - - async priorOutputs(workerName: string, inputUri: string): Promise { - const inputId = this.state.uriIdByText.get(inputUri); - if (inputId === undefined) 
return []; - - let bestRunId: number | undefined; - for (const run of this.state.runs.values()) { - if (run.action !== workerName) continue; - if (run.outcome !== "success") continue; - const inputs = this.state.runInput.get(run.id) ?? []; - const observed = inputs.some((i) => i.uriId === inputId); - if (!observed) continue; - if (bestRunId === undefined || run.id > bestRunId) { - bestRunId = run.id; - } - } - if (bestRunId === undefined) return []; - const outputs = this.state.runOutput.get(bestRunId) ?? []; - const result: ReadOnlyView[] = []; - for (const o of outputs) { - const uri = this.state.uriById.get(o.uriId); - if (uri === undefined) continue; - const entry = this.state.uriState.get(o.uriId); - if (!entry) continue; - result.push({ - uri, - stamp: entry.stamp, - status: entry.status, - hash: entry.hash, - attributes: entry.attributes, - }); - } - return result; - } - - async beginTransaction(opts: BeginTransactionOpts): Promise { - const runId = this.state.nextRunId++; - this.state.runs.set(runId, { - id: runId, - action: opts.worker, - actionVersion: opts.version, - scope: opts.scope, - stamp: opts.initialStamp, - startedAt: Date.now(), - finishedAt: null, - outcome: "running", - }); - await this.flush(); - return new MemoryTransaction(runId, { - state: this.state, - flush: () => this.flush(), - }); - } - - async mintStamp(): Promise { - this.state.stampSeq += 1; - return this.state.stampSeq; - } - - async recoverOrphans(): Promise { - let cancelled = 0; - let pendingRowsDropped = 0; - for (const run of this.state.runs.values()) { - if (run.outcome !== "running") continue; - run.outcome = "cancelled"; - run.finishedAt = Date.now(); - cancelled += 1; - const pendingForRun = this.state.pending.get(run.id); - if (pendingForRun) { - pendingRowsDropped += pendingForRun.size; - this.state.pending.delete(run.id); - } - this.state.runInput.delete(run.id); - this.state.runOutput.delete(run.id); - } - if (cancelled > 0) await this.flush(); - return { cancelled, 
pendingRowsDropped }; - } - - async registerWorker(def: RegisterWorkerInput): Promise { - const existing = this.state.workers.get(def.name); - const versionChanged = !existing || existing.version !== def.version; - this.state.workers.set(def.name, { - name: def.name, - version: def.version, - description: def.description ?? null, - inputPattern: def.inputPattern ?? null, - outputPattern: def.outputPattern ?? null, - scopeExpr: def.scopeExpr ?? null, - registeredAt: existing ? existing.registeredAt : Date.now(), - lastRunAt: existing ? existing.lastRunAt : null, - }); - if (versionChanged) await this.flush(); - return { versionChanged }; - } - - async lastSuccessfulRunVersion(workerName: string, inputUri: string): Promise { - const inputId = this.state.uriIdByText.get(inputUri); - if (inputId === undefined) return null; - let bestRun: { id: number; version: string } | undefined; - for (const run of this.state.runs.values()) { - if (run.action !== workerName || run.outcome !== "success") continue; - const inputs = this.state.runInput.get(run.id) ?? []; - if (!inputs.some((i) => i.uriId === inputId)) continue; - if (!bestRun || run.id > bestRun.id) { - bestRun = { id: run.id, version: run.actionVersion }; - } - } - return bestRun ? bestRun.version : null; - } - - async isInputProcessed( - workerName: string, - workerVersion: string, - inputUri: string, - ): Promise { - const inputId = this.state.uriIdByText.get(inputUri); - if (inputId === undefined) return false; - const currentStamp = this.state.uriState.get(inputId)?.stamp ?? 0; - for (const run of this.state.runs.values()) { - if (run.action !== workerName || run.outcome !== "success") continue; - if (run.actionVersion !== workerVersion) continue; - const inputs = this.state.runInput.get(run.id) ?? 
[]; - const obs = inputs.find((i) => i.uriId === inputId); - if (!obs) continue; - if (obs.observedStamp >= currentStamp) return true; - } - return false; - } - - /** Internal helper used in tests when interning a URI on read paths. */ - internUri(text: string): number { - return internUri(this.state, text); - } -} diff --git a/packages/uri-graph/src/store/memory/transaction.ts b/packages/uri-graph/src/store/memory/transaction.ts deleted file mode 100644 index c6c2e71..0000000 --- a/packages/uri-graph/src/store/memory/transaction.ts +++ /dev/null @@ -1,133 +0,0 @@ -import type { Update } from "../../types/update.js"; -import type { GraphTransaction } from "../types.js"; -import { internUri, type State } from "./state.js"; - -export interface MemoryTransactionDeps { - state: State; - /** Called after commit/rollback to persist the snapshot. */ - flush: () => Promise; -} - -type TerminalState = "open" | "committed" | "rolledback"; - -export class MemoryTransaction implements GraphTransaction { - readonly runId: number; - private state: State; - private flush: () => Promise; - private status: TerminalState = "open"; - - constructor(runId: number, deps: MemoryTransactionDeps) { - this.runId = runId; - this.state = deps.state; - this.flush = deps.flush; - } - - private ensureOpen(op: string): void { - if (this.status !== "open") { - throw new Error(`transaction ${this.runId} is closed (${this.status}); cannot ${op}`); - } - } - - async applyUpdate(u: Update): Promise { - this.ensureOpen("applyUpdate"); - const uriId = internUri(this.state, u.uri); - - // No-op rule: skip staging if committed (status, hash) match. 
- const committed = this.state.uriState.get(uriId); - const isNoop = - committed !== undefined && committed.status === u.status && committed.hash === u.hash; - - let pendingForRun = this.state.pending.get(this.runId); - if (!pendingForRun) { - pendingForRun = new Map(); - this.state.pending.set(this.runId, pendingForRun); - } - - if (isNoop && committed) { - // Record an output marker with prior stamp + wasNoop flag (deferred until commit). - const outputs = this.state.runOutput.get(this.runId) ?? []; - outputs.push({ - uriId, - writtenStamp: committed.stamp, - wasNoop: true, - }); - this.state.runOutput.set(this.runId, outputs); - return; - } - - pendingForRun.set(uriId, { - uriId, - status: u.status, - stamp: u.stamp, - hash: u.hash, - attributes: u.attributes, - }); - } - - async recordInputs( - inputs: ReadonlyArray<{ - uri: string; - observedStamp: number; - role?: string; - }>, - ): Promise { - this.ensureOpen("recordInputs"); - const rows = this.state.runInput.get(this.runId) ?? []; - for (const i of inputs) { - const id = internUri(this.state, i.uri); - rows.push({ - uriId: id, - role: i.role ?? null, - observedStamp: i.observedStamp, - }); - } - this.state.runInput.set(this.runId, rows); - } - - async commit(): Promise { - this.ensureOpen("commit"); - this.status = "committed"; - - // Promote pending → committed; record run_output for non-noop entries. - const pendingForRun = this.state.pending.get(this.runId); - const outputs = this.state.runOutput.get(this.runId) ?? 
[]; - if (pendingForRun) { - for (const [uriId, entry] of pendingForRun) { - this.state.uriState.set(uriId, { - status: entry.status, - stamp: entry.stamp, - hash: entry.hash, - attributes: entry.attributes, - }); - outputs.push({ - uriId, - writtenStamp: entry.stamp, - wasNoop: false, - }); - } - } - this.state.runOutput.set(this.runId, outputs); - this.state.pending.delete(this.runId); - - const run = this.state.runs.get(this.runId); - if (run) { - run.outcome = "success"; - run.finishedAt = Date.now(); - } - await this.flush(); - } - - async rollback(): Promise { - this.ensureOpen("rollback"); - this.status = "rolledback"; - this.state.pending.delete(this.runId); - this.state.runInput.delete(this.runId); - this.state.runOutput.delete(this.runId); - const run = this.state.runs.get(this.runId); - if (run) { - run.outcome = "cancelled"; - run.finishedAt = Date.now(); - } - await this.flush(); - } -} diff --git a/packages/uri-graph/src/store/sql.ts b/packages/uri-graph/src/store/sql.ts new file mode 100644 index 0000000..ebedc09 --- /dev/null +++ b/packages/uri-graph/src/store/sql.ts @@ -0,0 +1,224 @@ +import type { Db } from "@statewalker/db-api"; +import type { Resource, Status, Worker } from "../types.js"; +import type { + ListOptions, + PurgeCompletionsOptions, + PurgeResourcesOptions, + Store, +} from "./store.js"; + +const SCHEMA = ` +CREATE TABLE IF NOT EXISTS stamp_seq ( + id INTEGER PRIMARY KEY CHECK (id = 1), + next INTEGER NOT NULL +); +INSERT OR IGNORE INTO stamp_seq (id, next) VALUES (1, 1); + +CREATE TABLE IF NOT EXISTS resources ( + uri TEXT NOT NULL, + stamp INTEGER NOT NULL, + status TEXT NOT NULL, + meta TEXT, + PRIMARY KEY (uri, stamp) +); +CREATE INDEX IF NOT EXISTS resources_stamp ON resources(stamp); + +CREATE TABLE IF NOT EXISTS workers ( + name TEXT PRIMARY KEY, + selects TEXT NOT NULL, + emits TEXT NOT NULL +); + +CREATE TABLE IF NOT EXISTS completions ( + worker TEXT NOT NULL, + stamp INTEGER NOT NULL, + finished_at INTEGER NOT NULL, + 
PRIMARY KEY (worker, stamp) +); +CREATE INDEX IF NOT EXISTS completions_worker_stamp ON completions(worker, stamp DESC); +`; + +type ResourceRow = { + uri: string; + stamp: number; + status: string; + meta: string | null; +}; + +type WorkerRow = { + name: string; + selects: string; + emits: string; +}; + +function rowToResource(row: ResourceRow): Resource { + const r: Resource = { + uri: row.uri, + stamp: row.stamp, + status: row.status as Status, + }; + if (row.meta !== null) r.meta = JSON.parse(row.meta); + return r; +} + +export class SqlStore implements Store { + private initialized = false; + + constructor(private db: Db) {} + + private async ensureInit(): Promise { + if (this.initialized) return; + for (const stmt of SCHEMA.split(";")) { + const trimmed = stmt.trim(); + if (trimmed) await this.db.exec(trimmed); + } + this.initialized = true; + } + + async newStamp(): Promise { + await this.ensureInit(); + const rows = await this.db.query<{ next: number }>( + "UPDATE stamp_seq SET next = next + 1 WHERE id = 1 RETURNING next - 1 AS next", + ); + const row = rows[0]; + if (!row) throw new Error("stamp_seq is missing"); + return row.next; + } + + async put(resource: Resource): Promise { + await this.ensureInit(); + const meta = resource.meta === undefined ? null : JSON.stringify(resource.meta); + await this.db.query( + "INSERT OR REPLACE INTO resources (uri, stamp, status, meta) VALUES (?, ?, ?, ?)", + [resource.uri, resource.stamp, resource.status, meta], + ); + await this.db.query("UPDATE stamp_seq SET next = MAX(next, ? + 1) WHERE id = 1", [ + resource.stamp, + ]); + } + + async get(uri: string): Promise { + await this.ensureInit(); + const rows = await this.db.query( + "SELECT uri, stamp, status, meta FROM resources WHERE uri = ? ORDER BY stamp DESC LIMIT 1", + [uri], + ); + const row = rows[0]; + return row ? 
rowToResource(row) : undefined; + } + + async *list(options: ListOptions): AsyncIterable { + await this.ensureInit(); + const after = options.afterStamp ?? 0; + const rows = await this.db.query( + `WITH latest AS ( + SELECT uri, MAX(stamp) AS stamp FROM resources + WHERE uri LIKE ? || '%' + GROUP BY uri + ) + SELECT r.uri, r.stamp, r.status, r.meta + FROM latest l + JOIN resources r ON r.uri = l.uri AND r.stamp = l.stamp + WHERE r.stamp > ? + ORDER BY r.stamp ASC, r.uri ASC`, + [options.prefix, after], + ); + for (const row of rows) yield rowToResource(row); + } + + async saveWorker(worker: Worker): Promise { + await this.ensureInit(); + await this.db.query( + `INSERT INTO workers (name, selects, emits) VALUES (?, ?, ?) + ON CONFLICT(name) DO UPDATE SET selects = excluded.selects, emits = excluded.emits`, + [worker.name, worker.selects, worker.emits], + ); + } + + async deleteWorker(name: string): Promise { + await this.ensureInit(); + await this.db.query("DELETE FROM workers WHERE name = ?", [name]); + await this.db.query("DELETE FROM completions WHERE worker = ?", [name]); + } + + async getWorker(name: string): Promise { + await this.ensureInit(); + const rows = await this.db.query( + "SELECT name, selects, emits FROM workers WHERE name = ?", + [name], + ); + const row = rows[0]; + return row ? 
{ name: row.name, selects: row.selects, emits: row.emits } : undefined; + } + + async *listWorkers(): AsyncIterable { + await this.ensureInit(); + const rows = await this.db.query( + "SELECT name, selects, emits FROM workers ORDER BY name ASC", + ); + for (const row of rows) yield { name: row.name, selects: row.selects, emits: row.emits }; + } + + async markCompleted(worker: string, stamp: number): Promise { + await this.ensureInit(); + await this.db.query( + "INSERT OR REPLACE INTO completions (worker, stamp, finished_at) VALUES (?, ?, ?)", + [worker, stamp, Date.now()], + ); + } + + async allWatermarks(): Promise> { + await this.ensureInit(); + const rows = await this.db.query<{ worker: string; stamp: number }>( + "SELECT worker, MAX(stamp) AS stamp FROM completions GROUP BY worker", + ); + const result = new Map(); + for (const row of rows) result.set(row.worker, row.stamp); + return result; + } + + async invalidate(prefix: string): Promise { + await this.ensureInit(); + const stamp = await this.newStamp(); + await this.db.query( + `INSERT OR REPLACE INTO resources (uri, stamp, status, meta) + SELECT r.uri, ?, 'removed', NULL + FROM ( + SELECT uri, MAX(stamp) AS stamp FROM resources + WHERE uri LIKE ? 
|| '%' + GROUP BY uri + ) l + JOIN resources r ON r.uri = l.uri AND r.stamp = l.stamp + WHERE r.status != 'removed'`, + [stamp, prefix], + ); + } + + async purgeResources(options?: PurgeResourcesOptions): Promise { + await this.ensureInit(); + if (options?.keepLatestPerUri !== true) return; + await this.db.exec( + `DELETE FROM resources + WHERE (uri, stamp) NOT IN ( + SELECT uri, MAX(stamp) FROM resources GROUP BY uri + )`, + ); + } + + async purgeCompletions(options?: PurgeCompletionsOptions): Promise { + await this.ensureInit(); + const keep = options?.keepLatestPerWorker; + if (keep === undefined || keep < 1) return; + await this.db.query( + `DELETE FROM completions + WHERE rowid NOT IN ( + SELECT rowid FROM ( + SELECT rowid, + ROW_NUMBER() OVER (PARTITION BY worker ORDER BY stamp DESC) AS rn + FROM completions + ) WHERE rn <= ? + )`, + [keep], + ); + } +} diff --git a/packages/uri-graph/src/store/sql/schema.ts b/packages/uri-graph/src/store/sql/schema.ts deleted file mode 100644 index 0bdda96..0000000 --- a/packages/uri-graph/src/store/sql/schema.ts +++ /dev/null @@ -1,103 +0,0 @@ -import type { Db } from "@statewalker/db-api"; - -const STATEMENTS: string[] = [ - // 1. URI interning. INTEGER PRIMARY KEY is an alias for rowid; SQLite - // auto-generates the id on INSERT. - `CREATE TABLE IF NOT EXISTS uri ( - id INTEGER PRIMARY KEY, - text TEXT NOT NULL UNIQUE - )`, - `CREATE INDEX IF NOT EXISTS uri_text ON uri(text)`, - - // 2. Stamp source - `CREATE TABLE IF NOT EXISTS stamp_seq ( - id INTEGER PRIMARY KEY CHECK (id = 1), - next INTEGER NOT NULL - )`, - `INSERT OR IGNORE INTO stamp_seq (id, next) VALUES (1, 1)`, - - // 3. Worker registry - `CREATE TABLE IF NOT EXISTS worker_registry ( - name TEXT PRIMARY KEY, - version TEXT NOT NULL, - description TEXT, - input_pattern TEXT, - output_pattern TEXT, - scope_expr TEXT, - selector_kind TEXT NOT NULL DEFAULT 'code', - registered_at INTEGER NOT NULL, - last_run_at INTEGER - )`, - - // 4. 
// Connection-level tuning applied before the DDL. WAL + synchronous=NORMAL is
// the common durable-but-fast SQLite profile; foreign_keys enforces the
// REFERENCES constraints declared in the STATEMENTS above.
const PRAGMAS: string[] = [
  "PRAGMA journal_mode = WAL",
  "PRAGMA synchronous = NORMAL",
  "PRAGMA foreign_keys = ON",
];

/**
 * Apply connection PRAGMAs and then create every table/index in STATEMENTS.
 * Idempotent: all DDL uses IF NOT EXISTS, so this is safe to call on every
 * store open.
 *
 * @param db - database handle; only `exec` is used here.
 */
export async function applySchema(db: Db): Promise<void> {
  for (const pragma of PRAGMAS) {
    try {
      await db.exec(pragma);
    } catch {
      // libSQL may no-op some PRAGMAs; tolerate.
    }
  }
  // DDL failures are real errors — let them propagate.
  for (const stmt of STATEMENTS) {
    await db.exec(stmt);
  }
}
  /** Latest committed view for `uri`, or null when the URI was never interned or committed. */
  async getState(uri: string): Promise<ReadOnlyView | null> {
    const id = await getUriId(this.db, uri);
    if (id === null) return null;
    const rows = await this.db.query<UriStateRow>(
      "SELECT status, stamp, hash, attributes FROM uri_state WHERE uri_id = ?",
      [id],
    );
    if (rows.length === 0 || !rows[0]) return null;
    return rowToView(uri, rows[0]);
  }

  /** Stream committed views whose URI matches the SQL LIKE `pattern` (caller supplies wildcards). */
  async *find(pattern: string): AsyncIterable<ReadOnlyView> {
    const rows = await this.db.query<UriRow>(
      `SELECT u.text AS text, s.status, s.stamp, s.hash, s.attributes
       FROM uri_state s
       JOIN uri u ON u.id = s.uri_id
       WHERE u.text LIKE ?`,
      [pattern],
    );
    for (const r of rows) {
      yield rowToView(r.text, r);
    }
  }

  /**
   * Current committed views of the outputs written by the worker's most recent
   * successful run that observed `inputUri`. No-op outputs (was_noop = 1) are
   * excluded. Empty when the URI is unknown or no successful run exists.
   */
  async priorOutputs(workerName: string, inputUri: string): Promise<ReadOnlyView[]> {
    const inputId = await getUriId(this.db, inputUri);
    if (inputId === null) return [];

    // Latest successful run for this (worker, input) pair.
    const latest = await this.db.query<{ run_id: number }>(
      `SELECT r.id AS run_id
       FROM run r
       JOIN run_input ri ON ri.run_id = r.id
       WHERE r.action = ?
         AND r.outcome = 'success'
         AND ri.uri_id = ?
       ORDER BY r.id DESC
       LIMIT 1`,
      [workerName, inputId],
    );
    if (latest.length === 0 || !latest[0]) return [];
    const runId = latest[0].run_id;

    // Join against uri_state so callers see the CURRENT state of each output,
    // not the state at the time the run wrote it.
    const rows = await this.db.query<UriRow>(
      `SELECT u.text AS text, s.status, s.stamp, s.hash, s.attributes
       FROM run_output ro
       JOIN uri u ON u.id = ro.uri_id
       JOIN uri_state s ON s.uri_id = ro.uri_id
       WHERE ro.run_id = ? AND ro.was_noop = 0`,
      [runId],
    );
    return rows.map((r) => rowToView(r.text, r));
  }

  /** Open a run row in 'running' state and return its staging transaction. */
  async beginTransaction(opts: BeginTransactionOpts): Promise<GraphTransaction> {
    const result = await this.db.query<{ id: number }>(
      `INSERT INTO run (action, action_version, scope, stamp, started_at, outcome)
       VALUES (?, ?, ?, ?, ?, 'running')
       RETURNING id`,
      [opts.worker, opts.version, opts.scope, opts.initialStamp, Date.now()],
    );
    if (result.length === 0 || !result[0]) {
      throw new Error("failed to allocate run id");
    }
    return new SqlTransaction(this.db, result[0].id);
  }

  async mintStamp(): Promise<number> {
    // Atomic increment-and-fetch.
    const rows = await this.db.query<{ next: number }>(
      "UPDATE stamp_seq SET next = next + 1 WHERE id = 1 RETURNING next - 1 AS next",
    );
    if (rows.length === 0 || !rows[0]) {
      throw new Error("stamp_seq row missing");
    }
    return rows[0].next;
  }

  /**
   * Cancel runs left in 'running' state by a crash: drop their staging rows and
   * run_input/run_output bookkeeping, then mark them 'cancelled'.
   * NOTE(review): the counts are read BEFORE the delete transaction — safe only
   * under the package's single-writer assumption.
   */
  async recoverOrphans(): Promise<RecoverOrphansResult> {
    const before = await this.db.query<{ count: number }>(
      "SELECT COUNT(*) AS count FROM run WHERE outcome = 'running'",
    );
    const cancelled = before.length > 0 && before[0] ? before[0].count : 0;
    if (cancelled === 0) return { cancelled: 0, pendingRowsDropped: 0 };

    const pendingBefore = await this.db.query<{ count: number }>(
      `SELECT COUNT(*) AS count FROM uri_state_pending
       WHERE run_id IN (SELECT id FROM run WHERE outcome = 'running')`,
    );
    const pendingRowsDropped =
      pendingBefore.length > 0 && pendingBefore[0] ? pendingBefore[0].count : 0;

    await this.db.exec("BEGIN IMMEDIATE");
    try {
      await this.db.query(
        `DELETE FROM uri_state_pending
         WHERE run_id IN (SELECT id FROM run WHERE outcome = 'running')`,
      );
      await this.db.query(
        `DELETE FROM run_input
         WHERE run_id IN (SELECT id FROM run WHERE outcome = 'running')`,
      );
      await this.db.query(
        `DELETE FROM run_output
         WHERE run_id IN (SELECT id FROM run WHERE outcome = 'running')`,
      );
      await this.db.query(
        "UPDATE run SET outcome = 'cancelled', finished_at = ? WHERE outcome = 'running'",
        [Date.now()],
      );
      await this.db.exec("COMMIT");
    } catch (err) {
      try {
        await this.db.exec("ROLLBACK");
      } catch {
        // ignore
      }
      throw err;
    }
    return { cancelled, pendingRowsDropped };
  }

  /**
   * Insert a worker registration, or update it when the version changed.
   * Unchanged versions leave the row untouched. Returns whether the version
   * differs from what was stored (new workers count as changed).
   */
  async registerWorker(def: RegisterWorkerInput): Promise<RegisterWorkerResult> {
    const existing = await this.db.query<{ version: string }>(
      "SELECT version FROM worker_registry WHERE name = ?",
      [def.name],
    );
    const versionChanged =
      existing.length === 0 || !existing[0] || existing[0].version !== def.version;
    const now = Date.now();
    if (existing.length === 0) {
      await this.db.query(
        `INSERT INTO worker_registry
         (name, version, description, input_pattern, output_pattern, scope_expr, registered_at)
         VALUES (?, ?, ?, ?, ?, ?, ?)`,
        [
          def.name,
          def.version,
          def.description ?? null,
          def.inputPattern ?? null,
          def.outputPattern ?? null,
          def.scopeExpr ?? null,
          now,
        ],
      );
    } else if (versionChanged) {
      // registered_at / selector_kind / last_run_at are deliberately left as-is.
      await this.db.query(
        `UPDATE worker_registry
         SET version = ?, description = ?, input_pattern = ?, output_pattern = ?, scope_expr = ?
         WHERE name = ?`,
        [
          def.version,
          def.description ?? null,
          def.inputPattern ?? null,
          def.outputPattern ?? null,
          def.scopeExpr ?? null,
          def.name,
        ],
      );
    }
    return { versionChanged };
  }

  /** action_version of the worker's latest successful run that observed `inputUri`; null when none. */
  async lastSuccessfulRunVersion(workerName: string, inputUri: string): Promise<string | null> {
    const inputId = await getUriId(this.db, inputUri);
    if (inputId === null) return null;
    const rows = await this.db.query<{ action_version: string }>(
      `SELECT r.action_version
       FROM run r
       JOIN run_input ri ON ri.run_id = r.id
       WHERE r.action = ?
         AND r.outcome = 'success'
         AND ri.uri_id = ?
       ORDER BY r.id DESC
       LIMIT 1`,
      [workerName, inputId],
    );
    return rows.length > 0 && rows[0] ? rows[0].action_version : null;
  }

  /**
   * True when `workerName`@`workerVersion` has a successful run whose observed
   * stamp for `inputUri` is >= the URI's current committed stamp (COALESCE 0
   * covers URIs with no committed state). Drives findDirty-style selectors.
   */
  async isInputProcessed(
    workerName: string,
    workerVersion: string,
    inputUri: string,
  ): Promise<boolean> {
    const inputId = await getUriId(this.db, inputUri);
    if (inputId === null) return false;
    const rows = await this.db.query<{ count: number }>(
      `SELECT COUNT(*) AS count
       FROM run r
       JOIN run_input ri ON ri.run_id = r.id
       LEFT JOIN uri_state s ON s.uri_id = ri.uri_id
       WHERE r.action = ?
         AND r.action_version = ?
         AND r.outcome = 'success'
         AND ri.uri_id = ?
         AND ri.observed_stamp >= COALESCE(s.stamp, 0)`,
      [workerName, workerVersion, inputId],
    );
    return rows.length > 0 && rows[0] !== undefined && rows[0].count > 0;
  }
  /**
   * Record the inputs this run observed (URI + stamp at observation time).
   * Upserts per (run_id, uri_id) so repeated reads of the same URI keep the
   * latest role/stamp.
   */
  async recordInputs(
    inputs: ReadonlyArray<{
      uri: string;
      observedStamp: number;
      role?: string;
    }>,
  ): Promise<void> {
    this.ensureOpen("recordInputs");
    for (const i of inputs) {
      const uriId = await internUri(this.db, i.uri);
      await this.db.query(
        `INSERT INTO run_input (run_id, uri_id, role, observed_stamp)
         VALUES (?, ?, ?, ?)
         ON CONFLICT(run_id, uri_id) DO UPDATE SET
           role = excluded.role,
           observed_stamp = excluded.observed_stamp`,
        [this.runId, uriId, i.role ?? null, i.observedStamp],
      );
    }
  }

  /**
   * Atomically promote this run's staged rows into committed state, record
   * them as outputs, mark the run successful, and drop the staging rows —
   * all inside one BEGIN IMMEDIATE physical transaction. The logical
   * transaction closes even if the physical commit later throws (status is
   * set before the SQL runs), so a failed commit cannot be retried on the
   * same handle.
   */
  async commit(): Promise<void> {
    this.ensureOpen("commit");
    this.status = "committed";

    await this.db.exec("BEGIN IMMEDIATE");
    try {
      // Promote pending → committed.
      await this.db.query(
        `INSERT INTO uri_state (uri_id, status, stamp, hash, attributes)
         SELECT uri_id, status, stamp, hash, attributes
         FROM uri_state_pending
         WHERE run_id = ?
         ON CONFLICT(uri_id) DO UPDATE SET
           status = excluded.status,
           stamp = excluded.stamp,
           hash = excluded.hash,
           attributes = excluded.attributes`,
        [this.runId],
      );

      // Record run_output for promoted entries (was_noop = 0).
      await this.db.query(
        `INSERT OR REPLACE INTO run_output (run_id, uri_id, written_stamp, was_noop)
         SELECT run_id, uri_id, stamp, 0 FROM uri_state_pending WHERE run_id = ?`,
        [this.runId],
      );

      // Mark run success.
      await this.db.query("UPDATE run SET outcome = 'success', finished_at = ? WHERE id = ?", [
        Date.now(),
        this.runId,
      ]);

      // Drop staging.
      await this.db.query("DELETE FROM uri_state_pending WHERE run_id = ?", [this.runId]);
      await this.db.exec("COMMIT");
    } catch (err) {
      try {
        await this.db.exec("ROLLBACK");
      } catch {
        // ignore
      }
      throw err;
    }
  }
- await this.db.query("DELETE FROM run_input WHERE run_id = ?", [this.runId]); - await this.db.query("DELETE FROM run_output WHERE run_id = ?", [this.runId]); - await this.db.exec("COMMIT"); - } catch (err) { - try { - await this.db.exec("ROLLBACK"); - } catch { - // ignore - } - throw err; - } - } -} diff --git a/packages/uri-graph/src/store/sql/uri-intern.ts b/packages/uri-graph/src/store/sql/uri-intern.ts deleted file mode 100644 index 011eda5..0000000 --- a/packages/uri-graph/src/store/sql/uri-intern.ts +++ /dev/null @@ -1,28 +0,0 @@ -import type { Db } from "@statewalker/db-api"; - -/** - * Intern a URI text, returning its integer id. Idempotent. - * Single-writer assumption: no concurrent inserts of the same text from peers. - */ -export async function internUri(db: Db, text: string): Promise { - // First try fast lookup. - const existing = await db.query<{ id: number }>("SELECT id FROM uri WHERE text = ?", [text]); - if (existing.length > 0 && existing[0]) return existing[0].id; - // Insert; on race (won't happen under single-writer) fall back to lookup. - await db.query("INSERT OR IGNORE INTO uri (text) VALUES (?)", [text]); - const fresh = await db.query<{ id: number }>("SELECT id FROM uri WHERE text = ?", [text]); - if (!fresh.length || !fresh[0]) { - throw new Error(`failed to intern URI: ${text}`); - } - return fresh[0].id; -} - -export async function getUriId(db: Db, text: string): Promise { - const rows = await db.query<{ id: number }>("SELECT id FROM uri WHERE text = ?", [text]); - return rows.length > 0 && rows[0] ? rows[0].id : null; -} - -export async function getUriText(db: Db, id: number): Promise { - const rows = await db.query<{ text: string }>("SELECT text FROM uri WHERE id = ?", [id]); - return rows.length > 0 && rows[0] ? 
import type { Resource, Worker } from "../types.js";

/** Options for `Store.list`. */
export type ListOptions = {
  /** Only URIs starting with this prefix are considered. */
  prefix: string;
  /** When set, only resources with stamp strictly greater than this value are yielded. */
  afterStamp?: number;
};

/** Options for `Store.purgeResources`. */
export type PurgeResourcesOptions = {
  /** When true, drop historical versions, keeping only the latest row per URI. */
  keepLatestPerUri?: boolean;
};

/** Options for `Store.purgeCompletions`. */
export type PurgeCompletionsOptions = {
  /** Keep only this many of the newest completion records per worker. */
  keepLatestPerWorker?: number;
};

/**
 * Persistence contract shared by all backends (see the SqlStore implementation
 * in this package for the reference semantics).
 */
export interface Store {
  /** Mint the next monotonic stamp. */
  newStamp(): Promise<number>;

  /** Write one resource version, keyed by (uri, stamp). */
  put(resource: Resource): Promise<void>;
  /** Latest version of `uri`, or undefined when unknown. */
  get(uri: string): Promise<Resource | undefined>;
  /** Stream the latest version of each URI under a prefix (see ListOptions). */
  list(options: ListOptions): AsyncIterable<Resource>;

  /** Upsert a worker registration. */
  saveWorker(worker: Worker): Promise<void>;
  /** Remove a worker; backends also drop its completion records. */
  deleteWorker(name: string): Promise<void>;
  getWorker(name: string): Promise<Worker | undefined>;
  listWorkers(): AsyncIterable<Worker>;

  /** Record that `worker` processed everything up to `stamp`. */
  markCompleted(worker: string, stamp: number): Promise<void>;
  /** Highest completed stamp per worker. */
  allWatermarks(): Promise<Map<string, number>>;

  /** Mark every live URI under `prefix` as removed under one fresh stamp. */
  invalidate(prefix: string): Promise<void>;

  /** Optional history compaction; no-op unless options request it. */
  purgeResources(options?: PurgeResourcesOptions): Promise<void>;
  purgeCompletions(options?: PurgeCompletionsOptions): Promise<void>;
}
/** Write-capable graph store: GraphReader plus runs, stamps, and the worker registry. */
export interface GraphStore extends GraphReader {
  /** Open a logical run; the returned transaction stages all of its writes. */
  beginTransaction(opts: BeginTransactionOpts): Promise<GraphTransaction>;
  /** Mint the next monotonic stamp. */
  mintStamp(): Promise<number>;
  /** Cancel runs left in 'running' state by a crash and drop their staged rows. */
  recoverOrphans(): Promise<RecoverOrphansResult>;
  /** Upsert a worker definition; reports whether its version changed. */
  registerWorker(def: RegisterWorkerInput): Promise<RegisterWorkerResult>;
  /**
   * Returns the latest successful run's action_version for the given worker against the given URI,
   * or null if no successful run exists. Used by selectors to detect version-bump invalidation.
   */
  lastSuccessfulRunVersion(workerName: string, inputUri: string): Promise<string | null>;
  /**
   * Returns true if the given worker has a successful run that observed inputUri at a stamp
   * greater than or equal to the URI's current committed stamp AND at the worker's current version.
   * Used by `findDirty`-style selectors.
   */
  isInputProcessed(workerName: string, workerVersion: string, inputUri: string): Promise<boolean>;
}

/** Staging handle for one run; writes become visible to readers only on commit(). */
export interface GraphTransaction {
  /** Identifier of the run row this transaction belongs to. */
  readonly runId: number;
  /** Stage one update (backends may short-circuit no-ops against committed state). */
  applyUpdate(u: Update): Promise<void>;
  /** Record which URIs (and at which stamps) this run observed as inputs. */
  recordInputs(
    inputs: ReadonlyArray<{ uri: string; observedStamp: number; role?: string }>,
  ): Promise<void>;
  /** Promote staged writes to committed state and mark the run successful. */
  commit(): Promise<void>;
  /** Discard staged writes and mark the run cancelled. */
  rollback(): Promise<void>;
}
- */ -export async function openGraphStore Promise }>( - store: T, -): Promise { - if (store.initialize) { - await store.initialize(); - } - await store.recoverOrphans(); - return store; -} diff --git a/packages/uri-graph/src/topo-layers.ts b/packages/uri-graph/src/topo-layers.ts new file mode 100644 index 0000000..bcd790a --- /dev/null +++ b/packages/uri-graph/src/topo-layers.ts @@ -0,0 +1,45 @@ +import type { Worker } from "./types.js"; + +export function topoLayers(workers: Worker[]): Worker[][] { + const upstreamsOf = new Map>(); + for (const w of workers) { + const upstreams = new Set(); + if (w.selects !== "") { + for (const u of workers) { + if (u.name === w.name) continue; + if (u.emits === "") continue; + if (u.emits.startsWith(w.selects) || w.selects.startsWith(u.emits)) { + upstreams.add(u.name); + } + } + } + upstreamsOf.set(w.name, upstreams); + } + + const layers: Worker[][] = []; + const placed = new Set(); + while (placed.size < workers.length) { + const layer: Worker[] = []; + for (const w of workers) { + if (placed.has(w.name)) continue; + const upstreams = upstreamsOf.get(w.name); + if (!upstreams) continue; + let ready = true; + for (const u of upstreams) { + if (!placed.has(u)) { + ready = false; + break; + } + } + if (ready) layer.push(w); + } + if (layer.length === 0) { + const remaining = workers.filter((w) => !placed.has(w.name)).map((w) => w.name); + throw new Error(`cycle detected among workers: ${remaining.join(", ")}`); + } + layer.sort((a, b) => a.name.localeCompare(b.name)); + for (const w of layer) placed.add(w.name); + layers.push(layer); + } + return layers; +} diff --git a/packages/uri-graph/src/types.ts b/packages/uri-graph/src/types.ts new file mode 100644 index 0000000..1c4bd9a --- /dev/null +++ b/packages/uri-graph/src/types.ts @@ -0,0 +1,24 @@ +export type Status = "added" | "updated" | "removed"; + +export type Resource = { + uri: string; + stamp: number; + status: Status; + meta?: unknown; +}; + +export type Worker = { + 
/** Lifecycle status of a URI version. */
export type Status = "added" | "updated" | "removed";

/** One state change emitted by a worker; written through a GraphTransaction. */
export interface Update {
  uri: string;
  /** Monotonic stamp minted by the store; orders versions of the same URI. */
  stamp: number;
  status: Status;
  /** Content fingerprint; equal status+hash against committed state makes the write a no-op. */
  hash?: string;
  /** Grouping key — typically the input URI that produced this update. */
  scope?: string;
  /** Relationship label, e.g. "chunk" or "embedding". */
  role?: string;
  attributes?: Record<string, unknown>;
}

/** Committed, read-only view of a URI's latest state. */
export interface ReadOnlyView {
  uri: string;
  stamp: number;
  status: Status;
  hash?: string;
  attributes?: Record<string, unknown>;
}

/** Capabilities the orchestrator hands to a worker's `run`. */
export interface WorkerParams {
  /** Mint a fresh stamp; workers call this AFTER slow work to keep transactions short. */
  stamp: () => Promise<number>;
  /** Read the latest committed state of one URI. */
  read: (uri: string) => Promise<ReadOnlyView | null>;
  /** Stream committed views matching a SQL-LIKE pattern. */
  find: (pattern: string) => AsyncIterable<ReadOnlyView>;
  /** Outputs of this worker's last successful run for the given input URI. */
  priorOutputs: (inputUri: string) => Promise<ReadOnlyView[]>;
  /** Record a dependency read — NOTE(review): consumers not visible in this chunk. */
  recordRead: (uri: string, role?: string) => void;
  /** Cooperative cancellation; workers should return promptly once aborted. */
  signal: AbortSignal;
}

/** Context the orchestrator passes to a worker's selector. */
export interface SelectorContext {
  workerName: string;
  workerVersion: string;
  /** Upper bound on items the selector should yield per round. */
  limit: number;
}

/** Yields the pending inputs a worker should process next. */
export type Selector = (ctx: SelectorContext) => AsyncIterableIterator<ReadOnlyView>;

/** Static description plus behaviour of one worker. */
export interface WorkerDefinition {
  name: string;
  /** Bumping the version invalidates prior runs (see GraphStore.isInputProcessed). */
  version: string;
  description?: string;
  inputPattern?: string;
  outputPattern?: string;
  scopeExpr?: string;
  /** Picks which inputs are dirty for this worker. */
  selector: Selector;
  /** Transforms selected inputs into updates, as an async generator. */
  run: (params: WorkerParams, input: AsyncIterable<ReadOnlyView>) => AsyncGenerator<Update>;
}
-/** - * Compute a hex SHA-256 hash of the given string. Uses Web Crypto when available - * (Node 19+, all modern browsers). - */ -export async function sha256Hex(text: string): Promise { - const subtle = (globalThis as { crypto?: { subtle?: SubtleCrypto } }).crypto?.subtle; - if (!subtle) { - throw new Error("Web Crypto SubtleCrypto is not available"); - } - const buf = new TextEncoder().encode(text); - const digest = await subtle.digest("SHA-256", buf); - const bytes = new Uint8Array(digest); - let out = ""; - for (let i = 0; i < bytes.length; i++) { - const b = bytes[i] ?? 0; - out += b.toString(16).padStart(2, "0"); - } - return out; -} diff --git a/packages/uri-graph/src/workers/chunker.ts b/packages/uri-graph/src/workers/chunker.ts deleted file mode 100644 index b9514ea..0000000 --- a/packages/uri-graph/src/workers/chunker.ts +++ /dev/null @@ -1,138 +0,0 @@ -import { findDirty } from "../graph/selector-helpers.js"; -import type { GraphStore } from "../store/types.js"; -import type { Update } from "../types/update.js"; -import type { WorkerDefinition, WorkerParams } from "../types/worker.js"; -import { sha256Hex } from "../util/hash.js"; - -export interface ChunkerOptions { - /** Maximum characters per chunk. Default 1000. */ - chunkSize?: number; - /** Optional graph; if provided, the selector uses `findDirty` to yield pending text:// URIs. */ - graph?: GraphStore; - name?: string; - version?: string; -} - -function chunkUri(textUri: string, index: number): string { - // chunk:///path#i — simple URI scheme. - return `chunk:${textUri.slice("text:".length)}#${index}`; -} - -function chunkIndex(uri: string): number { - const m = /#(\d+)$/.exec(uri); - return m && m[1] !== undefined ? Number(m[1]) : -1; -} - -function makeChunkerSelector(graph: GraphStore | undefined): WorkerDefinition["selector"] { - if (!graph) { - return async function* () { - // Driven externally (test harness). 
- }; - } - return (ctx) => - findDirty(graph, { - forWorker: ctx.workerName, - forVersion: ctx.workerVersion, - uriLike: "text:///%", - limit: ctx.limit, - }); -} - -function splitText(text: string, size: number): string[] { - if (text.length === 0) return []; - const chunks: string[] = []; - for (let i = 0; i < text.length; i += size) { - chunks.push(text.slice(i, i + size)); - } - return chunks; -} - -/** - * Format-agnostic chunker: consumes `text://` URIs, splits each document into - * fixed-size character chunks, and yields `chunk://...` URIs under one stamp - * shared by all chunks of one document. Removed inputs cascade to their prior - * chunks via `priorOutputs`. - */ -export function createChunker(opts: ChunkerOptions = {}): WorkerDefinition { - const chunkSize = opts.chunkSize ?? 1000; - const name = opts.name ?? "chunker"; - const version = opts.version ?? "v1"; - - return { - name, - version, - description: "Splits text:// URIs into fixed-size chunks; format-agnostic.", - inputPattern: "text://**", - outputPattern: "chunk://**", - scopeExpr: "uri", - selector: makeChunkerSelector(opts.graph), - run: async function* ( - params: WorkerParams, - input: AsyncIterable, - ): AsyncGenerator { - for await (const doc of input) { - // findDirty doesn't carry the document text in attributes by default for - // text URIs that were committed by the extractor. Re-read the latest state. 
        // Selector-provided views may lack the document body; re-read the
        // committed state to recover `attributes.text` before chunking.
        if (
          doc.uri.startsWith("text:") &&
          (doc.attributes === undefined ||
            (doc.attributes as Record<string, unknown>).text === undefined)
        ) {
          const live = await params.read(doc.uri);
          if (live?.attributes) {
            doc.attributes = live.attributes as Record<string, unknown>;
          }
        }
        if (params.signal.aborted) return;
        const prior = await params.priorOutputs(doc.uri);
        // A removed document cascades: every chunk from the previous
        // successful run is tombstoned under one fresh stamp.
        if (doc.status === "removed") {
          const stamp = await params.stamp();
          for (const old of prior) {
            yield {
              uri: old.uri,
              stamp,
              status: "removed",
              scope: doc.uri,
              role: "chunk",
            };
          }
          continue;
        }

        const text = ((doc.attributes as Record<string, unknown>)?.text as string) ?? "";
        const chunks = splitText(text, chunkSize);
        // One stamp shared by all chunks of this document.
        const stamp = await params.stamp();

        // A chunk URI that already existed in the prior run is 'updated';
        // a brand-new index is 'added'.
        const priorChunkUris = new Set(prior.map((p) => p.uri));
        for (const [i, chunk] of chunks.entries()) {
          const uri = chunkUri(doc.uri, i);
          const hash = await sha256Hex(chunk);
          yield {
            uri,
            stamp,
            status: priorChunkUris.has(uri) ? "updated" : "added",
            hash,
            scope: doc.uri,
            role: "chunk",
          };
        }

        // Cascade removals for indices beyond current count.
- const currentMax = chunks.length; - for (const old of prior) { - const idx = chunkIndex(old.uri); - if (idx >= currentMax) { - yield { - uri: old.uri, - stamp, - status: "removed", - scope: doc.uri, - role: "chunk", - }; - } - } - } - }, - }; -} diff --git a/packages/uri-graph/src/workers/embedder.ts b/packages/uri-graph/src/workers/embedder.ts deleted file mode 100644 index 6f704f1..0000000 --- a/packages/uri-graph/src/workers/embedder.ts +++ /dev/null @@ -1,121 +0,0 @@ -import { findDirty } from "../graph/selector-helpers.js"; -import type { GraphStore } from "../store/types.js"; -import type { Update } from "../types/update.js"; -import type { WorkerDefinition, WorkerParams } from "../types/worker.js"; -import { sha256Hex } from "../util/hash.js"; - -export interface EmbedderOptions { - /** Embedding function — must run before the stamp is minted to keep txns short. */ - embed: (text: string, opts: { signal: AbortSignal }) => Promise; - /** Optional metadata attached to each emitted embedding update. */ - model?: string; - /** Optional graph; when provided, the selector finds pending chunk:// URIs. */ - graph?: GraphStore; - name?: string; - version?: string; -} - -function embeddingUri(chunkUri: string): string { - return `embedding://${chunkUri}`; -} - -function makeEmbedderSelector(graph: GraphStore | undefined): WorkerDefinition["selector"] { - if (!graph) { - return async function* () { - // Driven externally (test harness). - }; - } - return (ctx) => - findDirty(graph, { - forWorker: ctx.workerName, - forVersion: ctx.workerVersion, - uriLike: "chunk:///%", - limit: ctx.limit, - }); -} - -function vectorHashSync(vec: Float32Array): string { - // Cheap fingerprint; sha256 over the bytes. - // Returns a string usable for the no-op rule. - const buf = new Uint8Array(vec.buffer, vec.byteOffset, vec.byteLength); - let acc = 0n; - for (let i = 0; i < buf.length; i++) { - acc = ((acc << 7n) ^ BigInt(buf[i] ?? 
0)) & 0xffffffffffffffffn; - } - return acc.toString(16); -} - -/** - * Consumes `chunk://**` updates and emits `embedding://...` updates. Mints one - * stamp per item AFTER the embedding call to keep the logical transaction - * boundary tight (the slow API call happens outside the txn). - */ -export function createEmbedder(opts: EmbedderOptions): WorkerDefinition { - const name = opts.name ?? "embedder"; - const version = opts.version ?? "v1"; - const model = opts.model ?? "unknown"; - - return { - name, - version, - description: "Embeds chunk:// updates into embedding:// vectors.", - inputPattern: "chunk://**", - outputPattern: "embedding://**", - scopeExpr: "uri", - selector: makeEmbedderSelector(opts.graph), - run: async function* ( - params: WorkerParams, - input: AsyncIterable, - ): AsyncGenerator { - for await (const chunk of input) { - if (params.signal.aborted) return; - - // If the selector handed us a chunk URI without text in attributes, - // re-read the live state to get the chunk body. - if ( - chunk.uri.startsWith("chunk:") && - (chunk.attributes === undefined || - (chunk.attributes as Record).text === undefined) - ) { - const live = await params.read(chunk.uri); - if (live?.attributes) { - chunk.attributes = live.attributes as Record; - } - } - - if (chunk.status === "removed") { - const stamp = await params.stamp(); - yield { - uri: embeddingUri(chunk.uri), - stamp, - status: "removed", - scope: chunk.uri, - role: "embedding", - }; - continue; - } - - const text = ((chunk.attributes as Record)?.text as string) ?? ""; - // Slow work outside the logical transaction. 
- const vector = await opts.embed(text, { signal: params.signal }); - if (params.signal.aborted) return; - - const hash = vectorHashSync(vector) || (await sha256Hex(text)); - const stamp = await params.stamp(); - yield { - uri: embeddingUri(chunk.uri), - stamp, - status: "updated", - hash, - scope: chunk.uri, - role: "embedding", - attributes: { - vector: Array.from(vector), - model, - sourceChunkUri: chunk.uri, - }, - }; - } - }, - }; -} diff --git a/packages/uri-graph/src/workers/extractors/base.ts b/packages/uri-graph/src/workers/extractors/base.ts deleted file mode 100644 index 6b483f4..0000000 --- a/packages/uri-graph/src/workers/extractors/base.ts +++ /dev/null @@ -1,120 +0,0 @@ -import type { FilesApi } from "@statewalker/webrun-files"; -import { readText } from "@statewalker/webrun-files"; -import { findDirty } from "../../graph/selector-helpers.js"; -import type { GraphStore } from "../../store/types.js"; -import type { Update } from "../../types/update.js"; -import type { WorkerDefinition, WorkerParams } from "../../types/worker.js"; -import { sha256Hex } from "../../util/hash.js"; - -export interface ExtractorOptions { - files: FilesApi; - /** - * Optional graph; if provided, the extractor's selector uses `findDirty` to - * yield pending file:// URIs matching its pattern. Without it, the selector is - * empty and the extractor must be fed via an external input stream (useful for - * unit tests). - */ - graph?: GraphStore; - /** Optional override of worker name/version. */ - name?: string; - version?: string; -} - -export interface ExtractorSpec extends ExtractorOptions { - /** Worker name (defaults to spec.defaultName). */ - defaultName: string; - /** Worker version. */ - defaultVersion: string; - /** SQL-LIKE pattern matching the file URIs this extractor handles. */ - uriLike: string; - /** RegExp tested against the path; only matches are extracted. */ - pathPattern: RegExp; - /** Mime declared on the produced text:// update. 
*/ - mime: string; - /** - * Transform raw file text into the extracted body that gets indexed. - * For markdown / plain text this can be the identity; for html, strip tags. - */ - transform(raw: string): string; -} - -function fileToTextUri(uri: string): string { - return uri.replace(/^file:/, "text:"); -} - -function makeSelector(spec: ExtractorSpec): WorkerDefinition["selector"] { - const graph = spec.graph; - if (!graph) { - return async function* () { - // No graph wired; selector is empty (driven externally for unit tests). - }; - } - return (ctx) => - findDirty(graph, { - forWorker: ctx.workerName, - forVersion: ctx.workerVersion, - uriLike: spec.uriLike, - limit: ctx.limit, - }); -} - -/** - * Build a content extractor that consumes `file://**` URIs matching the spec's - * `pathPattern`, reads bytes via `FilesApi`, and emits a `text://...` URI carrying - * the extracted text and a real content hash. - */ -export function createExtractor(spec: ExtractorSpec): WorkerDefinition { - const name = spec.name ?? spec.defaultName; - const version = spec.version ?? spec.defaultVersion; - const files = spec.files; - - return { - name, - version, - description: `Extracts plain text from files matching ${spec.uriLike}.`, - inputPattern: spec.uriLike, - outputPattern: "text://**", - scopeExpr: "uri", - selector: makeSelector(spec), - run: async function* ( - params: WorkerParams, - input: AsyncIterable, - ): AsyncGenerator { - for await (const file of input) { - if (params.signal.aborted) return; - const path = file.uri.replace(/^file:\/\//, ""); - if (!spec.pathPattern.test(path)) continue; - - if (file.status === "removed") { - const stamp = await params.stamp(); - yield { - uri: fileToTextUri(file.uri), - stamp, - status: "removed", - scope: file.uri, - }; - continue; - } - - // Slow work BEFORE minting stamp. 
- const raw = await readText(files, path); - const body = spec.transform(raw); - const hash = await sha256Hex(body); - - const stamp = await params.stamp(); - yield { - uri: fileToTextUri(file.uri), - stamp, - status: file.status === "added" ? "added" : "updated", - hash, - scope: file.uri, - attributes: { - text: body, - mime: spec.mime, - sourceUri: file.uri, - }, - }; - } - }, - }; -} diff --git a/packages/uri-graph/src/workers/extractors/html-extractor.ts b/packages/uri-graph/src/workers/extractors/html-extractor.ts deleted file mode 100644 index 3f8e1ff..0000000 --- a/packages/uri-graph/src/workers/extractors/html-extractor.ts +++ /dev/null @@ -1,27 +0,0 @@ -import type { ExtractorOptions } from "./base.js"; -import { createExtractor } from "./base.js"; - -/** - * Strip every HTML tag and collapse whitespace. Sufficient for indexing plain - * text content; not a structural HTML parser. - */ -function stripTags(html: string): string { - return html - .replace(/<script\b[^<]*(?:(?!<\/script>)<[^<]*)*<\/script>/gi, " ") - .replace(/<style\b[^<]*(?:(?!<\/style>)<[^<]*)*<\/style>/gi, " ") - .replace(/<[^>]+>/g, " ") - .replace(/\s+/g, " ") - .trim(); -} - -export function createHtmlExtractor(opts: ExtractorOptions) { - return createExtractor({ - ...opts, - defaultName: "extract-html", - defaultVersion: "v1", - uriLike: "file:///%.html", - pathPattern: /\.html?$/i, - mime: "text/html", - transform: stripTags, - }); -} diff --git a/packages/uri-graph/src/workers/extractors/markdown-extractor.ts b/packages/uri-graph/src/workers/extractors/markdown-extractor.ts deleted file mode 100644 index b8fad9e..0000000 --- a/packages/uri-graph/src/workers/extractors/markdown-extractor.ts +++ /dev/null @@ -1,19 +0,0 @@ -import type { ExtractorOptions } from "./base.js"; -import { createExtractor } from "./base.js"; - -/** - * Markdown extractor: matches `*.md` files. Currently passes raw markdown through - * unchanged (the indexer treats it as text).
Replace `transform` if a structured - * markdown → plain conversion is needed downstream. - */ -export function createMarkdownExtractor(opts: ExtractorOptions) { - return createExtractor({ - ...opts, - defaultName: "extract-markdown", - defaultVersion: "v1", - uriLike: "file:///%.md", - pathPattern: /\.md$/i, - mime: "text/markdown", - transform: (raw) => raw, - }); -} diff --git a/packages/uri-graph/src/workers/extractors/plain-text-extractor.ts b/packages/uri-graph/src/workers/extractors/plain-text-extractor.ts deleted file mode 100644 index bf63c6e..0000000 --- a/packages/uri-graph/src/workers/extractors/plain-text-extractor.ts +++ /dev/null @@ -1,14 +0,0 @@ -import type { ExtractorOptions } from "./base.js"; -import { createExtractor } from "./base.js"; - -export function createPlainTextExtractor(opts: ExtractorOptions) { - return createExtractor({ - ...opts, - defaultName: "extract-plain-text", - defaultVersion: "v1", - uriLike: "file:///%.txt", - pathPattern: /\.txt$/i, - mime: "text/plain", - transform: (raw) => raw, - }); -} diff --git a/packages/uri-graph/src/workers/file-watcher.ts b/packages/uri-graph/src/workers/file-watcher.ts deleted file mode 100644 index 71dcf30..0000000 --- a/packages/uri-graph/src/workers/file-watcher.ts +++ /dev/null @@ -1,110 +0,0 @@ -import type { FilesApi } from "@statewalker/webrun-files"; -import type { Update } from "../types/update.js"; -import type { WorkerDefinition, WorkerParams } from "../types/worker.js"; - -export interface FileWatcherOptions { - files: FilesApi; - rootPath: string; - /** Worker name; defaults to `file-watcher`. */ - name?: string; - /** Worker version; defaults to `v1`. */ - version?: string; -} - -interface FileFingerprint { - size: number; - mtime: number; - path: string; -} - -function fileUri(path: string): string { - // Maps virtual path '/a/b.md' → 'file:///a/b.md' (three slashes per RFC 8089). 
- return `file://${path}`; -} - -function fingerprint(size: number, mtime: number): string { - return `${size}:${mtime}`; -} - -/** - * Source worker that scans all files under `rootPath` via `FilesApi`. Emits - * `file://` URIs with status added/updated/removed based on a - * `(size, mtime)` fingerprint diffed against committed state. - * - * The watcher does NOT read file bytes; downstream extractors do that work. - */ -export function createFileWatcher(opts: FileWatcherOptions): WorkerDefinition { - const { files, rootPath } = opts; - const name = opts.name ?? "file-watcher"; - const version = opts.version ?? "v1"; - - return { - name, - version, - description: `Polls FilesApi at ${rootPath}; emits file:// URIs on change.`, - outputPattern: "file://**", - selector: async function* () { - yield { uri: `tick://${name}`, stamp: 0, status: "updated" }; - }, - run: async function* ( - params: WorkerParams, - input: AsyncIterable, - ): AsyncGenerator { - for await (const _tick of input) { - if (params.signal.aborted) return; - // Snapshot the FS. - const found = new Map(); - for await (const info of files.list(rootPath, { recursive: true })) { - if (info.kind !== "file") continue; - found.set(info.path, { - path: info.path, - size: info.size ?? 0, - mtime: info.lastModified ?? 0, - }); - } - - // Snapshot prior known files from the graph. - const known = new Map(); - for await (const view of params.find("file:///%")) { - const path = view.uri.replace(/^file:\/\//, ""); - const attrs = (view.attributes ?? {}) as Partial; - known.set(path, { - path, - size: attrs.size ?? 0, - mtime: attrs.mtime ?? 0, - }); - } - - const stamp = await params.stamp(); - - // Emit added / updated. - for (const [path, info] of found) { - const prev = known.get(path); - const changed = !prev || prev.size !== info.size || prev.mtime !== info.mtime; - if (!changed) continue; - yield { - uri: fileUri(path), - stamp, - status: prev ? 
"updated" : "added", - hash: fingerprint(info.size, info.mtime), - attributes: { - path: info.path, - size: info.size, - mtime: info.mtime, - }, - }; - } - - // Emit removed. - for (const path of known.keys()) { - if (found.has(path)) continue; - yield { - uri: fileUri(path), - stamp, - status: "removed", - }; - } - } - }, - }; -} diff --git a/packages/uri-graph/src/workers/index-backends/memory-fts.ts b/packages/uri-graph/src/workers/index-backends/memory-fts.ts deleted file mode 100644 index 752b87d..0000000 --- a/packages/uri-graph/src/workers/index-backends/memory-fts.ts +++ /dev/null @@ -1,75 +0,0 @@ -export interface FtsHit { - scope: string; - score: number; -} - -export interface FtsBackend { - upsert(scope: string, docs: string[]): void; - remove(scope: string): void; - query(text: string): FtsHit[]; -} - -function tokenize(text: string): string[] { - return text - .toLowerCase() - .split(/[^a-z0-9]+/) - .filter(Boolean); -} - -/** - * Tiny inverted index keyed by scope. Score = number of distinct query terms - * that appear in the scope's documents. - */ -export function createMemoryFtsBackend(): FtsBackend { - // term → set of scopes that contain it. - const termIndex = new Map>(); - // scope → set of terms it contributed (so we can remove cleanly). 
- const scopeTerms = new Map<string, Set<string>>(); - - function dropScope(scope: string): void { - const terms = scopeTerms.get(scope); - if (!terms) return; - for (const t of terms) { - const set = termIndex.get(t); - if (!set) continue; - set.delete(scope); - if (set.size === 0) termIndex.delete(t); - } - scopeTerms.delete(scope); - } - - return { - upsert(scope, docs) { - dropScope(scope); - const terms = new Set(); - for (const d of docs) for (const t of tokenize(d)) terms.add(t); - for (const t of terms) { - let set = termIndex.get(t); - if (!set) { - set = new Set(); - termIndex.set(t, set); - } - set.add(scope); - } - scopeTerms.set(scope, terms); - }, - remove(scope) { - dropScope(scope); - }, - query(text) { - const queryTerms = tokenize(text); - const score = new Map(); - for (const t of queryTerms) { - const set = termIndex.get(t); - if (!set) continue; - for (const scope of set) { - score.set(scope, (score.get(scope) ?? 0) + 1); - } - } - const hits: FtsHit[] = []; - for (const [scope, s] of score) hits.push({ scope, score: s }); - hits.sort((a, b) => b.score - a.score || a.scope.localeCompare(b.scope)); - return hits; - }, - }; -} diff --git a/packages/uri-graph/src/workers/index-backends/memory-vector.ts b/packages/uri-graph/src/workers/index-backends/memory-vector.ts deleted file mode 100644 index 37f77db..0000000 --- a/packages/uri-graph/src/workers/index-backends/memory-vector.ts +++ /dev/null @@ -1,46 +0,0 @@ -export interface VectorHit { - id: string; - score: number; -} - -export interface VectorBackend { - upsert(id: string, vec: Float32Array): void; - remove(id: string): void; - search(query: Float32Array, k: number): VectorHit[]; -} - -function cosineSim(a: Float32Array, b: Float32Array): number { - const len = Math.min(a.length, b.length); - let dot = 0; - let na = 0; - let nb = 0; - for (let i = 0; i < len; i++) { - const av = a[i] ?? 0; - const bv = b[i] ??
0; - dot += av * bv; - na += av * av; - nb += bv * bv; - } - if (na === 0 || nb === 0) return 0; - return dot / (Math.sqrt(na) * Math.sqrt(nb)); -} - -export function createMemoryVectorBackend(): VectorBackend { - const vectors = new Map(); - return { - upsert(id, vec) { - vectors.set(id, vec); - }, - remove(id) { - vectors.delete(id); - }, - search(query, k) { - const hits: VectorHit[] = []; - for (const [id, v] of vectors) { - hits.push({ id, score: cosineSim(query, v) }); - } - hits.sort((a, b) => b.score - a.score); - return hits.slice(0, k); - }, - }; -} diff --git a/packages/uri-graph/src/workers/indexer.ts b/packages/uri-graph/src/workers/indexer.ts deleted file mode 100644 index 6f7ef36..0000000 --- a/packages/uri-graph/src/workers/indexer.ts +++ /dev/null @@ -1,255 +0,0 @@ -import type { GraphStore } from "../store/types.js"; -import type { Update } from "../types/update.js"; -import type { WorkerDefinition, WorkerParams } from "../types/worker.js"; -import { sha256Hex } from "../util/hash.js"; -import type { FtsBackend } from "./index-backends/memory-fts.js"; -import type { VectorBackend } from "./index-backends/memory-vector.js"; - -export interface IndexerOptions { - fts: FtsBackend; - vector: VectorBackend; - /** Optional graph; when provided, the selector emits ready scopes. */ - graph?: GraphStore; - name?: string; - version?: string; -} - -interface ScopeBag { - text?: Update; - chunks: Update[]; - embeddings: Update[]; -} - -function ftsUri(scope: string): string { - return `index://fts/${scope}`; -} - -function vectorUri(scope: string): string { - return `index://vector/${scope}`; -} - -/** - * Multi-input indexer scoped by `text://` URI. Consumes interleaved updates - * grouped by `scope` and `role` (text | chunk | embedding) and writes per-scope - * entries to the configured FTS and vector backends. - * - * The indexer expects its input to be ordered by `scope` so that all rows for - * one scope arrive contiguously. 
Use `joinInputs` to merge multiple selector - * streams into a single ordered input. - */ -export function createIndexer(opts: IndexerOptions): WorkerDefinition { - const name = opts.name ?? "indexer"; - const version = opts.version ?? "v1"; - - return { - name, - version, - description: "Builds FTS + vector indexes from text/chunk/embedding inputs.", - inputPattern: "text:// + chunk:// + embedding://", - outputPattern: "index://**", - scopeExpr: "text_uri", - selector: makeIndexerSelector(opts.graph), - run: async function* ( - params: WorkerParams, - input: AsyncIterable, - ): AsyncGenerator { - let currentScope: string | undefined; - let bag: ScopeBag = { chunks: [], embeddings: [] }; - - async function* flush(): AsyncGenerator { - if (currentScope === undefined) return; - yield* indexOne(opts, params, currentScope, bag); - currentScope = undefined; - bag = { chunks: [], embeddings: [] }; - } - - for await (const u of input) { - if (params.signal.aborted) return; - const scope = u.scope ?? u.uri; - if (currentScope !== undefined && scope !== currentScope) { - yield* flush(); - } - currentScope = scope; - if (u.role === "text") bag.text = u; - else if (u.role === "chunk") bag.chunks.push(u); - else if (u.role === "embedding") bag.embeddings.push(u); - else if (u.uri.startsWith("text:")) bag.text = u; - else if (u.uri.startsWith("chunk:")) bag.chunks.push(u); - else if (u.uri.startsWith("embedding:")) bag.embeddings.push(u); - } - yield* flush(); - }, - }; -} - -function makeIndexerSelector(graph: GraphStore | undefined): WorkerDefinition["selector"] { - if (!graph) { - return async function* () { - // Driven externally (test harness). - }; - } - return (ctx) => indexerSelector(graph, ctx.workerName, ctx.workerVersion, ctx.limit); -} - -/** - * Selector for the indexer: emit one stream of `(text, chunk*, embedding*)` - * updates for each `text://` URI where every chunk has a matching embedding AND - * the indexer has not run for that scope at its current version. 
- * - * Streams are ordered by scope so the run can group via `(currentScope, bag)`. - */ -async function* indexerSelector( - graph: GraphStore, - workerName: string, - workerVersion: string, - limit: number, -): AsyncIterableIterator { - const scopes: string[] = []; - for await (const v of graph.find("text:///%")) { - if (v.status === "removed") { - // Removed text — emit a single sentinel so the run can cascade. - const processed = await graph.isInputProcessed(workerName, workerVersion, v.uri); - if (!processed) scopes.push(v.uri); - continue; - } - const processed = await graph.isInputProcessed(workerName, workerVersion, v.uri); - if (processed) continue; - // Verify all chunks have embeddings. - const chunkPattern = `chunk:${v.uri.slice("text:".length)}#%`; - let allEmbedded = true; - let chunkCount = 0; - for await (const c of graph.find(chunkPattern)) { - if (c.status === "removed") continue; - chunkCount += 1; - const emb = await graph.getState(`embedding://${c.uri}`); - if (!emb || emb.status === "removed") { - allEmbedded = false; - break; - } - } - if (chunkCount === 0 || !allEmbedded) continue; - scopes.push(v.uri); - } - - for (const scope of scopes.slice(0, limit)) { - const text = await graph.getState(scope); - if (!text) continue; - yield { - uri: scope, - stamp: text.stamp, - status: text.status, - hash: text.hash, - scope, - role: "text", - attributes: text.attributes, - }; - if (text.status === "removed") continue; - - const chunkPattern = `chunk:${scope.slice("text:".length)}#%`; - const chunks: Array<{ - uri: string; - stamp: number; - status: Update["status"]; - hash?: string; - attributes?: Record; - }> = []; - for await (const c of graph.find(chunkPattern)) { - chunks.push({ - uri: c.uri, - stamp: c.stamp, - status: c.status, - hash: c.hash, - attributes: c.attributes, - }); - } - chunks.sort((a, b) => a.uri.localeCompare(b.uri)); - for (const c of chunks) { - yield { - uri: c.uri, - stamp: c.stamp, - status: c.status, - hash: c.hash, - 
scope, - role: "chunk", - attributes: c.attributes, - }; - const emb = await graph.getState(`embedding://${c.uri}`); - if (!emb) continue; - yield { - uri: `embedding://${c.uri}`, - stamp: emb.stamp, - status: emb.status, - hash: emb.hash, - scope, - role: "embedding", - attributes: emb.attributes, - }; - } - } -} - -async function* indexOne( - opts: IndexerOptions, - params: WorkerParams, - scope: string, - bag: ScopeBag, -): AsyncGenerator { - // Removal: any "removed" text/chunk/embedding for this scope drops the indexes. - if (bag.text && bag.text.status === "removed") { - opts.fts.remove(scope); - for (const c of bag.chunks) opts.vector.remove(c.uri); - // Also drop any prior chunk-based vectors that may live under the scope's - // chunks even if the chunks themselves weren't passed in. - const stamp = await params.stamp(); - yield { - uri: ftsUri(scope), - stamp, - status: "removed", - scope, - role: "fts-index", - }; - yield { - uri: vectorUri(scope), - stamp, - status: "removed", - scope, - role: "vector-index", - }; - return; - } - - // Build / update entries. - const chunkTexts: string[] = []; - for (const c of bag.chunks) { - const t = (c.attributes as Record)?.text as string; - if (typeof t === "string") chunkTexts.push(t); - } - opts.fts.upsert(scope, chunkTexts); - - for (const e of bag.embeddings) { - const v = (e.attributes as Record)?.vector as number[] | undefined; - if (!Array.isArray(v)) continue; - const arr = new Float32Array(v); - opts.vector.upsert(e.uri, arr); - } - - const stamp = await params.stamp(); - const ftsHash = await sha256Hex(chunkTexts.join("\n")); - const vecHash = await sha256Hex(bag.embeddings.map((e) => e.hash ?? 
"").join("|")); - yield { - uri: ftsUri(scope), - stamp, - status: "updated", - hash: ftsHash, - scope, - role: "fts-index", - }; - yield { - uri: vectorUri(scope), - stamp, - status: "updated", - hash: vecHash, - scope, - role: "vector-index", - }; -} diff --git a/packages/uri-graph/tests/e2e/pipeline.test.ts b/packages/uri-graph/tests/e2e/pipeline.test.ts index eede2e3..c9bd354 100644 --- a/packages/uri-graph/tests/e2e/pipeline.test.ts +++ b/packages/uri-graph/tests/e2e/pipeline.test.ts @@ -1,162 +1,110 @@ -import { mkdtempSync, rmSync } from "node:fs"; -import { tmpdir } from "node:os"; -import { join } from "node:path"; import { newNodeTursoDb } from "@statewalker/db-turso-node"; -import { writeText } from "@statewalker/webrun-files"; -import { MemFilesApi } from "@statewalker/webrun-files-mem"; -import { afterEach, describe, expect, it } from "vitest"; -import { createOrchestrator } from "../../src/orchestrator/orchestrator.js"; -import { createInMemoryPersistence } from "../../src/store/memory/files-persistence.js"; -import { MemoryGraphStore } from "../../src/store/memory/store.js"; -import { SqlGraphStore } from "../../src/store/sql/store.js"; -import type { GraphStore } from "../../src/store/types.js"; -import { openGraphStore } from "../../src/store/types.js"; -import { createChunker } from "../../src/workers/chunker.js"; -import { createEmbedder } from "../../src/workers/embedder.js"; -import { createMarkdownExtractor } from "../../src/workers/extractors/markdown-extractor.js"; -import { createFileWatcher } from "../../src/workers/file-watcher.js"; -import { createMemoryFtsBackend } from "../../src/workers/index-backends/memory-fts.js"; -import { createMemoryVectorBackend } from "../../src/workers/index-backends/memory-vector.js"; -import { createIndexer } from "../../src/workers/indexer.js"; - -const tmpDirs: string[] = []; -afterEach(() => { - while (tmpDirs.length) { - const d = tmpDirs.pop(); - if (d) { - try { - rmSync(d, { recursive: true, force: 
true }); - } catch { - // ignore - } - } - } -}); - -interface E2EHarness { - store: GraphStore; - files: MemFilesApi; - fts: ReturnType; - vector: ReturnType; - cleanup: () => Promise; -} +import { describe, expect, it } from "vitest"; +import { Engine, MemoryStore, SqlStore, type Store, type WorkerFn } from "../../src/index.js"; -async function makeMemoryHarness(): Promise { - const files = new MemFilesApi(); - const store = await openGraphStore(new MemoryGraphStore(createInMemoryPersistence("graph"))); - return { - store, - files, - fts: createMemoryFtsBackend(), - vector: createMemoryVectorBackend(), - async cleanup() { - await (store as unknown as { close(): Promise }).close(); - }, - }; -} +type StoreFactory = () => Promise<{ store: Store; close: () => Promise }>; -async function makeSqlHarness(): Promise { - const dir = mkdtempSync(join(tmpdir(), "uri-graph-e2e-")); - tmpDirs.push(dir); - const db = await newNodeTursoDb({ path: join(dir, "graph.db") }); - const store = await openGraphStore(new SqlGraphStore({ db })); - return { - store, - files: new MemFilesApi(), - fts: createMemoryFtsBackend(), - vector: createMemoryVectorBackend(), - async cleanup() { - await db.close(); +const factories: Array<{ name: string; make: StoreFactory }> = [ + { + name: "MemoryStore", + make: async () => ({ store: new MemoryStore(), close: async () => {} }), + }, + { + name: "SqlStore", + make: async () => { + const db = await newNodeTursoDb(); + return { store: new SqlStore(db), close: async () => db.close() }; }, - }; -} + }, +]; -async function runToFixpoint( - store: GraphStore, - files: MemFilesApi, - fts: ReturnType, - vector: ReturnType, - maxRounds = 30, -): Promise { - const orch = createOrchestrator({ graph: store, pollMs: 5 }); - await orch.registerWorker(createFileWatcher({ files, rootPath: "/" })); - await orch.registerWorker(createMarkdownExtractor({ files, graph: store })); - await orch.registerWorker(createChunker({ chunkSize: 5, graph: store })); - await 
orch.registerWorker( - createEmbedder({ - graph: store, - embed: async (text: string) => new Float32Array([text.length, text.charCodeAt(0) || 0]), - }), - ); - await orch.registerWorker(createIndexer({ graph: store, fts, vector })); - - const ac = new AbortController(); - const startPromise = orch.start(ac.signal); - - // Poll until any indexer outputs exist or timeout. - for (let i = 0; i < maxRounds * 10; i++) { - await new Promise((r) => setTimeout(r, 20)); - const seen: string[] = []; - for await (const v of store.find("index://%")) seen.push(v.uri); - if (seen.length >= 2) break; - } - ac.abort(); - await startPromise; +async function collect(it: AsyncIterable): Promise { + const out: T[] = []; + for await (const x of it) out.push(x); + return out; } -async function indexUris(store: GraphStore): Promise { - const out: string[] = []; - for await (const v of store.find("index://%")) { - if (v.status !== "removed") out.push(v.uri); - } - return out.sort(); -} +for (const { name, make } of factories) { + describe(`Pipeline e2e — ${name}`, () => { + it("scanner → extractor (md only) → indexer", async () => { + const { store, close } = await make(); + try { + const engine = new Engine(store); + + const scanner: WorkerFn = async function* (input, ctx) { + for await (const _tick of input) { + const stamp = await ctx.newStamp(); + yield { uri: "file://a.md", stamp, status: "added" }; + yield { uri: "file://b.png", stamp, status: "added" }; + yield { uri: "file://c.md", stamp, status: "added" }; + } + }; + + const extractor: WorkerFn = async function* (input, ctx) { + for await (const r of input) { + if (!r.uri.endsWith(".md")) continue; + const stamp = await ctx.newStamp(); + yield { uri: `text://${r.uri.slice("file://".length)}`, stamp, status: "added" }; + } + }; + + const indexer: WorkerFn = async function* (input, ctx) { + for await (const r of input) { + const stamp = await ctx.newStamp(); + yield { uri: `db://${r.uri.slice("text://".length)}`, stamp, status: 
"added" }; + } + }; + + await store.put({ + uri: "tick://run", + stamp: await store.newStamp(), + status: "updated", + }); -describe("E2E pipeline (memory store)", () => { - it("two markdown files → fts + vector indexes built", async () => { - const h = await makeMemoryHarness(); - try { - await writeText(h.files, "/a.md", "hello world"); - await writeText(h.files, "/b.md", "another doc"); - await runToFixpoint(h.store, h.files, h.fts, h.vector); - const indexes = await indexUris(h.store); - expect(indexes).toContain("index://fts/text:///a.md"); - expect(indexes).toContain("index://fts/text:///b.md"); - expect(indexes).toContain("index://vector/text:///a.md"); - expect(indexes).toContain("index://vector/text:///b.md"); - expect(h.fts.query("hello").map((x) => x.scope)).toContain("text:///a.md"); - } finally { - await h.cleanup(); - } - }, 30000); - - it("non-markdown files do not produce indexes", async () => { - const h = await makeMemoryHarness(); - try { - await writeText(h.files, "/c.png", "binary-ish"); - await runToFixpoint(h.store, h.files, h.fts, h.vector, 5); - const indexes = await indexUris(h.store); - expect(indexes).toEqual([]); - } finally { - await h.cleanup(); - } - }, 15000); -}); - -describe("E2E pipeline (sql store)", () => { - it("two markdown files → fts + vector indexes built", async () => { - const h = await makeSqlHarness(); - try { - await writeText(h.files, "/a.md", "hello world"); - await writeText(h.files, "/b.md", "another doc"); - await runToFixpoint(h.store, h.files, h.fts, h.vector); - const indexes = await indexUris(h.store); - expect(indexes).toContain("index://fts/text:///a.md"); - expect(indexes).toContain("index://fts/text:///b.md"); - expect(indexes).toContain("index://vector/text:///a.md"); - expect(indexes).toContain("index://vector/text:///b.md"); - } finally { - await h.cleanup(); - } - }, 30000); -}); + await engine.register({ name: "scanner", selects: "tick://", emits: "file://" }, scanner); + await engine.register( + { 
name: "extractor", selects: "file://", emits: "text://" }, + extractor, + ); + await engine.register({ name: "indexer", selects: "text://", emits: "db://" }, indexer); + + await collect(engine.stabilize()); + + const dbRows = await collect(store.list({ prefix: "db://" })); + expect(dbRows.map((r) => r.uri).sort()).toEqual(["db://a.md", "db://c.md"]); + + const png = await store.get("text://b.png"); + expect(png).toBeUndefined(); + } finally { + await close(); + } + }); + + it("invalidate triggers downstream re-execution", async () => { + const { store, close } = await make(); + try { + const engine = new Engine(store); + + const stamp1 = await store.newStamp(); + await store.put({ uri: "file://x", stamp: stamp1, status: "added" }); + + const echo: WorkerFn = async function* (input, ctx) { + for await (const r of input) { + const stamp = await ctx.newStamp(); + yield { uri: `text://${r.uri.slice("file://".length)}`, stamp, status: r.status }; + } + }; + + await engine.register({ name: "echo", selects: "file://", emits: "text://" }, echo); + await collect(engine.stabilize()); + expect((await store.get("text://x"))?.status).toBe("added"); + + await store.invalidate("file://"); + await collect(engine.stabilize()); + expect((await store.get("text://x"))?.status).toBe("removed"); + } finally { + await close(); + } + }); + }); +} diff --git a/packages/uri-graph/tests/engine.test.ts b/packages/uri-graph/tests/engine.test.ts new file mode 100644 index 0000000..57cce84 --- /dev/null +++ b/packages/uri-graph/tests/engine.test.ts @@ -0,0 +1,248 @@ +import { describe, expect, it } from "vitest"; +import { Engine, MemoryStore, type Resource, type WorkerFn } from "../src/index.js"; + +async function collect(it: AsyncIterable): Promise { + const out: T[] = []; + for await (const x of it) out.push(x); + return out; +} + +describe("Engine.runWorker", () => { + it("runs a worker with empty input and writes no completion", async () => { + const store = new MemoryStore(); + const 
engine = new Engine(store); + + const fn: WorkerFn = async function* (input, ctx) { + for await (const r of input) { + yield { uri: r.uri, stamp: await ctx.newStamp(), status: "added" }; + } + }; + + await engine.register({ name: "w", selects: "x://", emits: "y://" }, fn); + + const out = await collect(engine.runWorker("w")); + expect(out).toEqual([]); + + const wm = await store.allWatermarks(); + expect(wm.get("w")).toBeUndefined(); + }); + + it("runs a worker, persists outputs, and writes a completion stamp", async () => { + const store = new MemoryStore(); + const engine = new Engine(store); + + await store.put({ uri: "file://a", stamp: 1, status: "added" }); + await store.put({ uri: "file://b", stamp: 2, status: "added" }); + + const fn: WorkerFn = async function* (input, ctx) { + const stamp = await ctx.newStamp(); + for await (const r of input) { + yield { + uri: `text://${r.uri.slice("file://".length)}`, + stamp, + status: "updated", + }; + } + }; + + await engine.register({ name: "extractor", selects: "file://", emits: "text://" }, fn); + + const out: Resource[] = await collect(engine.runWorker("extractor")); + expect(out.map((r) => r.uri).sort()).toEqual(["text://a", "text://b"]); + + const wm = await store.allWatermarks(); + const completion = wm.get("extractor"); + expect(completion).toBeDefined(); + if (!completion) return; + + for (const r of out) expect(r.stamp).toBeLessThan(completion); + }); + + it("filter worker (consumes input, produces nothing) still bumps watermark", async () => { + const store = new MemoryStore(); + const engine = new Engine(store); + + await store.put({ uri: "file://a.png", stamp: 1, status: "added" }); + await store.put({ uri: "file://b.png", stamp: 2, status: "added" }); + + const fn: WorkerFn = async function* (input, ctx) { + for await (const r of input) { + if (r.uri.endsWith(".md")) { + yield { uri: `text://${r.uri}`, stamp: await ctx.newStamp(), status: "added" }; + } + } + }; + + await engine.register({ name: "filter", 
selects: "file://", emits: "text://" }, fn); + + await collect(engine.runWorker("filter")); + + const wm = await store.allWatermarks(); + const first = wm.get("filter"); + expect(first).toBeDefined(); + if (!first) return; + expect(first).toBeGreaterThanOrEqual(2); + + await collect(engine.runWorker("filter")); + + const wm2 = await store.allWatermarks(); + expect(wm2.get("filter")).toBe(first); + }); + + it("does not advance watermark when worker generator throws", async () => { + const store = new MemoryStore(); + const engine = new Engine(store); + + await store.put({ uri: "file://a", stamp: 1, status: "added" }); + + const fn: WorkerFn = async function* (input, ctx) { + for await (const r of input) { + if (r.uri) throw new Error("boom"); + yield { uri: r.uri, stamp: await ctx.newStamp(), status: "added" }; + } + }; + + await engine.register({ name: "w", selects: "file://", emits: "x://" }, fn); + + await expect(collect(engine.runWorker("w"))).rejects.toThrow("boom"); + + const wm = await store.allWatermarks(); + expect(wm.get("w")).toBeUndefined(); + }); + + it("re-running with no new inputs is a no-op", async () => { + const store = new MemoryStore(); + const engine = new Engine(store); + + await store.put({ uri: "file://a", stamp: 1, status: "added" }); + + let runs = 0; + const fn: WorkerFn = async function* (input, ctx) { + runs++; + const stamp = await ctx.newStamp(); + for await (const r of input) { + yield { uri: `text://${r.uri}`, stamp, status: "updated" }; + } + }; + + await engine.register({ name: "x", selects: "file://", emits: "text://" }, fn); + + await collect(engine.runWorker("x")); + expect(runs).toBe(1); + + await collect(engine.runWorker("x")); + expect(runs).toBe(2); + const got = await collect(store.list({ prefix: "text://" })); + expect(got.length).toBe(1); + }); +}); + +describe("Engine.stabilize", () => { + it("cascades through a 3-stage pipeline", async () => { + const store = new MemoryStore(); + const engine = new Engine(store); + + 
const scanner: WorkerFn = async function* (input, ctx) { + for await (const _tick of input) { + const stamp = await ctx.newStamp(); + yield { uri: "file://a", stamp, status: "added" }; + yield { uri: "file://b", stamp, status: "added" }; + } + }; + + const extractor: WorkerFn = async function* (input, ctx) { + const stamp = await ctx.newStamp(); + for await (const r of input) { + yield { uri: `text://${r.uri.slice("file://".length)}`, stamp, status: "updated" }; + } + }; + + const indexer: WorkerFn = async function* (input, ctx) { + const stamp = await ctx.newStamp(); + for await (const r of input) { + yield { uri: `db://${r.uri.slice("text://".length)}`, stamp, status: "updated" }; + } + }; + + await store.put({ uri: "tick://run", stamp: await store.newStamp(), status: "updated" }); + + await engine.register({ name: "scanner", selects: "tick://", emits: "file://" }, scanner); + await engine.register({ name: "extractor", selects: "file://", emits: "text://" }, extractor); + await engine.register({ name: "indexer", selects: "text://", emits: "db://" }, indexer); + + await collect(engine.stabilize()); + + const dbRows = await collect(store.list({ prefix: "db://" })); + expect(dbRows.map((r) => r.uri).sort()).toEqual(["db://a", "db://b"]); + }); + + it("stabilize converges (terminates) when no worker has new input", async () => { + const store = new MemoryStore(); + const engine = new Engine(store); + + const fn: WorkerFn = async function* (input, ctx) { + const stamp = await ctx.newStamp(); + for await (const r of input) { + yield { uri: `text://${r.uri}`, stamp, status: "updated" }; + } + }; + + await store.put({ uri: "file://a", stamp: await store.newStamp(), status: "added" }); + await engine.register({ name: "w", selects: "file://", emits: "text://" }, fn); + + const out1 = await collect(engine.stabilize()); + expect(out1.length).toBeGreaterThan(0); + + const out2 = await collect(engine.stabilize()); + expect(out2).toEqual([]); + }); + + it("invalidate triggers 
re-execution of downstream workers", async () => { + const store = new MemoryStore(); + const engine = new Engine(store); + + const extractor: WorkerFn = async function* (input, ctx) { + const stamp = await ctx.newStamp(); + for await (const r of input) { + if (r.status === "removed") { + yield { uri: `text://${r.uri}`, stamp, status: "removed" }; + } else { + yield { uri: `text://${r.uri}`, stamp, status: "updated" }; + } + } + }; + + await store.put({ uri: "file://a", stamp: await store.newStamp(), status: "added" }); + await engine.register({ name: "extractor", selects: "file://", emits: "text://" }, extractor); + + await collect(engine.stabilize()); + expect((await store.get("text://file://a"))?.status).toBe("updated"); + + await store.invalidate("file://"); + await collect(engine.stabilize()); + expect((await store.get("text://file://a"))?.status).toBe("removed"); + }); +}); + +describe("Engine.unregister", () => { + it("removes worker and clears completions", async () => { + const store = new MemoryStore(); + const engine = new Engine(store); + + await store.put({ uri: "file://a", stamp: await store.newStamp(), status: "added" }); + + const fn: WorkerFn = async function* (input, ctx) { + const stamp = await ctx.newStamp(); + for await (const r of input) { + yield { uri: `text://${r.uri}`, stamp, status: "updated" }; + } + }; + await engine.register({ name: "w", selects: "file://", emits: "text://" }, fn); + await collect(engine.stabilize()); + + expect((await store.allWatermarks()).get("w")).toBeDefined(); + await engine.unregister("w"); + expect((await store.allWatermarks()).get("w")).toBeUndefined(); + expect(await store.getWorker("w")).toBeUndefined(); + }); +}); diff --git a/packages/uri-graph/tests/graph/selector-helpers.test.ts b/packages/uri-graph/tests/graph/selector-helpers.test.ts deleted file mode 100644 index 13ad54f..0000000 --- a/packages/uri-graph/tests/graph/selector-helpers.test.ts +++ /dev/null @@ -1,170 +0,0 @@ -import { beforeEach, 
describe, expect, it } from "vitest"; -import { findDirty, joinInputs } from "../../src/graph/selector-helpers.js"; -import type { GraphStore } from "../../src/store/types.js"; -import type { Update } from "../../src/types/update.js"; -import { openTempMemoryStore } from "../helpers.js"; - -describe("findDirty", () => { - let store: GraphStore; - - beforeEach(async () => { - store = await openTempMemoryStore(); - // seed three URIs - await store.registerWorker({ name: "seed", version: "v1" }); - const s = await store.mintStamp(); - const txn = await store.beginTransaction({ - worker: "seed", - version: "v1", - scope: null, - initialStamp: s, - }); - for (const u of ["file:///a.md", "file:///b.md", "file:///c.txt"]) { - await txn.applyUpdate({ uri: u, stamp: s, status: "added", hash: u }); - } - await txn.commit(); - }); - - async function collect(it: AsyncIterable): Promise { - const out: T[] = []; - for await (const x of it) out.push(x); - return out; - } - - it("yields URIs matching the pattern that the worker has not processed", async () => { - const it = findDirty(store, { - forWorker: "ext", - forVersion: "v1", - uriLike: "file:///%.md", - limit: 10, - }); - const results = await collect(it); - expect(results.map((r) => r.uri).sort()).toEqual(["file:///a.md", "file:///b.md"]); - }); - - it("excludes URIs the worker has already processed at this version", async () => { - // simulate ext worker processed a.md at v1 - await store.registerWorker({ name: "ext", version: "v1" }); - const aState = await store.getState("file:///a.md"); - expect(aState).not.toBeNull(); - const s = await store.mintStamp(); - const txn = await store.beginTransaction({ - worker: "ext", - version: "v1", - scope: "file:///a.md", - initialStamp: s, - }); - await txn.recordInputs([{ uri: "file:///a.md", observedStamp: aState?.stamp ?? 
0 }]); - await txn.applyUpdate({ - uri: "text:///a.md", - stamp: s, - status: "added", - hash: "h", - }); - await txn.commit(); - - const it = findDirty(store, { - forWorker: "ext", - forVersion: "v1", - uriLike: "file:///%.md", - limit: 10, - }); - const results = await collect(it); - expect(results.map((r) => r.uri)).toEqual(["file:///b.md"]); - }); - - it("re-yields a URI when the worker version is bumped", async () => { - await store.registerWorker({ name: "ext", version: "v1" }); - const aState = await store.getState("file:///a.md"); - const s = await store.mintStamp(); - const txn = await store.beginTransaction({ - worker: "ext", - version: "v1", - scope: "file:///a.md", - initialStamp: s, - }); - await txn.recordInputs([{ uri: "file:///a.md", observedStamp: aState?.stamp ?? 0 }]); - await txn.applyUpdate({ - uri: "text:///a.md", - stamp: s, - status: "added", - hash: "h", - }); - await txn.commit(); - - const it = findDirty(store, { - forWorker: "ext", - forVersion: "v2", - uriLike: "file:///%.md", - limit: 10, - }); - const results = await collect(it); - expect(results.map((r) => r.uri).sort()).toEqual(["file:///a.md", "file:///b.md"]); - }); - - it("respects limit", async () => { - const it = findDirty(store, { - forWorker: "ext", - forVersion: "v1", - uriLike: "file:///%", - limit: 1, - }); - const results: Array<{ uri: string }> = []; - for await (const x of it) results.push(x); - expect(results.length).toBe(1); - }); -}); - -describe("joinInputs", () => { - async function* asyncIter(items: T[]): AsyncIterableIterator { - for (const x of items) yield x; - } - - async function collect(it: AsyncIterable): Promise { - const out: T[] = []; - for await (const x of it) out.push(x); - return out; - } - - it("merges streams ordered by scope, then role", async () => { - const a: Update[] = [ - { - uri: "text://x", - stamp: 1, - status: "added", - scope: "x", - role: "text", - }, - { - uri: "text://y", - stamp: 1, - status: "added", - scope: "y", - role: "text", - 
}, - ]; - const b: Update[] = [ - { - uri: "chunk://x#0", - stamp: 1, - status: "added", - scope: "x", - role: "chunk", - }, - { - uri: "chunk://y#0", - stamp: 1, - status: "added", - scope: "y", - role: "chunk", - }, - ]; - const merged = joinInputs(asyncIter(a), asyncIter(b)); - const result = await collect(merged); - expect(result.map((u) => `${u.scope}:${u.role}`)).toEqual([ - "x:chunk", - "x:text", - "y:chunk", - "y:text", - ]); - }); -}); diff --git a/packages/uri-graph/tests/helpers.ts b/packages/uri-graph/tests/helpers.ts deleted file mode 100644 index 725f1f8..0000000 --- a/packages/uri-graph/tests/helpers.ts +++ /dev/null @@ -1,12 +0,0 @@ -import { createInMemoryPersistence } from "../src/store/memory/files-persistence.js"; -import { MemoryGraphStore } from "../src/store/memory/store.js"; -import type { GraphStore } from "../src/store/types.js"; -import { openGraphStore } from "../src/store/types.js"; - -/** - * Open an in-memory `MemoryGraphStore` with throwaway persistence. Each call - * gets an isolated store; no filesystem involved. 
- */ -export async function openTempMemoryStore(key = "graph"): Promise { - return openGraphStore(new MemoryGraphStore(createInMemoryPersistence(key))); -} diff --git a/packages/uri-graph/tests/orchestrator/drain.test.ts b/packages/uri-graph/tests/orchestrator/drain.test.ts deleted file mode 100644 index eb278ac..0000000 --- a/packages/uri-graph/tests/orchestrator/drain.test.ts +++ /dev/null @@ -1,131 +0,0 @@ -import { beforeEach, describe, expect, it } from "vitest"; -import { drain } from "../../src/orchestrator/drain.js"; -import type { GraphStore } from "../../src/store/types.js"; -import type { Update } from "../../src/types/update.js"; -import type { WorkerDefinition } from "../../src/types/worker.js"; -import { openTempMemoryStore } from "../helpers.js"; - -describe("drain", () => { - let store: GraphStore; - - beforeEach(async () => { - store = await openTempMemoryStore(); - await store.registerWorker({ name: "w", version: "v1" }); - }); - - function makeWorker( - runFn: ( - input: AsyncIterable, - params: { stamp(): Promise }, - ) => AsyncGenerator, - ): WorkerDefinition { - return { - name: "w", - version: "v1", - selector: async function* () { - // empty - }, - run: async function* (params, input) { - yield* runFn(input, { stamp: params.stamp }); - }, - }; - } - - async function* asyncIter(items: T[]): AsyncIterableIterator { - for (const x of items) yield x; - } - - it("multiple updates with same stamp commit together", async () => { - const worker = makeWorker(async function* (_input, p) { - const s = await p.stamp(); - yield { uri: "u://a", stamp: s, status: "added", hash: "ha" }; - yield { uri: "u://b", stamp: s, status: "added", hash: "hb" }; - yield { uri: "u://c", stamp: s, status: "added", hash: "hc" }; - }); - await drain(worker, asyncIter([]), store); - const a = await store.getState("u://a"); - const b = await store.getState("u://b"); - const c = await store.getState("u://c"); - expect(a?.stamp).toBe(b?.stamp); - 
expect(b?.stamp).toBe(c?.stamp); - }); - - it("multiple stamps produce multiple commits", async () => { - const worker = makeWorker(async function* (_input, p) { - const s1 = await p.stamp(); - yield { uri: "u://a", stamp: s1, status: "added", hash: "ha" }; - const s2 = await p.stamp(); - yield { uri: "u://b", stamp: s2, status: "added", hash: "hb" }; - }); - await drain(worker, asyncIter([]), store); - const a = await store.getState("u://a"); - const b = await store.getState("u://b"); - expect(a).not.toBeNull(); - expect(b).not.toBeNull(); - expect(a?.stamp).not.toBe(b?.stamp); - if (a && b) expect(b.stamp).toBeGreaterThan(a.stamp); - }); - - it("generator throw rolls back current batch", async () => { - const worker = makeWorker(async function* (_input, p) { - const s = await p.stamp(); - yield { uri: "u://x", stamp: s, status: "added", hash: "h" }; - throw new Error("boom"); - }); - await expect(drain(worker, asyncIter([]), store)).rejects.toThrow(/boom/); - expect(await store.getState("u://x")).toBeNull(); - }); - - it("stamp regression aborts run", async () => { - const worker = makeWorker(async function* () { - yield { uri: "u://a", stamp: 100, status: "added", hash: "h1" }; - yield { uri: "u://b", stamp: 99, status: "added", hash: "h2" }; - }); - await expect(drain(worker, asyncIter([]), store)).rejects.toThrow(/stamp/i); - expect(await store.getState("u://a")).toBeNull(); - expect(await store.getState("u://b")).toBeNull(); - }); - - it("consumed inputs are recorded", async () => { - // Seed two committed input URIs. 
- const seedStamp = await store.mintStamp(); - const seed = await store.beginTransaction({ - worker: "w", - version: "v1", - scope: null, - initialStamp: seedStamp, - }); - await seed.applyUpdate({ - uri: "in://1", - stamp: seedStamp, - status: "added", - hash: "1", - }); - await seed.applyUpdate({ - uri: "in://2", - stamp: seedStamp, - status: "added", - hash: "2", - }); - await seed.commit(); - - const inputs: Update[] = [ - { uri: "in://1", stamp: seedStamp, status: "added", hash: "1" }, - { uri: "in://2", stamp: seedStamp, status: "added", hash: "2" }, - ]; - const worker = makeWorker(async function* (input, p) { - for await (const u of input) { - const s = await p.stamp(); - yield { uri: `out://${u.uri}`, stamp: s, status: "added", hash: `h:${u.uri}` }; - } - }); - await drain(worker, asyncIter(inputs), store); - - // Both outputs should exist. - expect(await store.getState("out://in://1")).not.toBeNull(); - expect(await store.getState("out://in://2")).not.toBeNull(); - // priorOutputs should map input URIs back to their outputs. 
- const prior1 = await store.priorOutputs("w", "in://1"); - expect(prior1.map((p) => p.uri)).toEqual(["out://in://1"]); - }); -}); diff --git a/packages/uri-graph/tests/orchestrator/orchestrator.test.ts b/packages/uri-graph/tests/orchestrator/orchestrator.test.ts deleted file mode 100644 index 785526e..0000000 --- a/packages/uri-graph/tests/orchestrator/orchestrator.test.ts +++ /dev/null @@ -1,134 +0,0 @@ -import { beforeEach, describe, expect, it } from "vitest"; -import { createOrchestrator } from "../../src/orchestrator/orchestrator.js"; -import type { GraphStore } from "../../src/store/types.js"; -import type { WorkerDefinition } from "../../src/types/worker.js"; -import { openTempMemoryStore } from "../helpers.js"; - -describe("Orchestrator", () => { - let store: GraphStore; - - beforeEach(async () => { - store = await openTempMemoryStore(); - }); - - function makeOneShotWorker( - name: string, - version: string, - behavior: { selectorYields: number; runYields: number }, - ): { def: WorkerDefinition; selectorCalls: number; runCalls: number } { - const stats = { selectorCalls: 0, runCalls: 0 }; - let selectorEmitted = false; - const def: WorkerDefinition = { - name, - version, - selector: async function* () { - stats.selectorCalls += 1; - if (selectorEmitted) return; - for (let i = 0; i < behavior.selectorYields; i++) { - yield { - uri: `tick://${name}#${i}`, - stamp: 0, - status: "updated", - }; - } - selectorEmitted = true; - }, - run: async function* (params, input) { - stats.runCalls += 1; - // consume input fully - for await (const _ of input) { - // just drain - } - for (let i = 0; i < behavior.runYields; i++) { - const s = await params.stamp(); - yield { - uri: `out://${name}#${i}`, - stamp: s, - status: "added", - hash: `h:${i}`, - }; - } - }, - }; - return { - def, - get selectorCalls() { - return stats.selectorCalls; - }, - get runCalls() { - return stats.runCalls; - }, - }; - } - - it("invokes a worker when its selector has work", async () => { - 
const w = makeOneShotWorker("a", "v1", { selectorYields: 1, runYields: 2 }); - const orch = createOrchestrator({ graph: store, pollMs: 10 }); - await orch.registerWorker(w.def); - const ac = new AbortController(); - const startPromise = orch.start(ac.signal); - // Wait until run completes. - while (w.runCalls === 0) await new Promise((r) => setTimeout(r, 10)); - ac.abort(); - await startPromise; - expect(w.runCalls).toBe(1); - expect(await store.getState("out://a#0")).not.toBeNull(); - expect(await store.getState("out://a#1")).not.toBeNull(); - }); - - it("sleeps when no work is pending and stops on abort", async () => { - const orch = createOrchestrator({ graph: store, pollMs: 5 }); - const w = makeOneShotWorker("idle", "v1", { - selectorYields: 0, - runYields: 0, - }); - await orch.registerWorker(w.def); - const ac = new AbortController(); - const startPromise = orch.start(ac.signal); - await new Promise((r) => setTimeout(r, 50)); - ac.abort(); - await startPromise; - expect(w.runCalls).toBe(0); - expect(w.selectorCalls).toBeGreaterThan(0); - }); - - it("workers are addressable by name in run records", async () => { - const w = makeOneShotWorker("named", "v3", { - selectorYields: 1, - runYields: 1, - }); - const orch = createOrchestrator({ graph: store, pollMs: 5 }); - await orch.registerWorker(w.def); - const ac = new AbortController(); - const startPromise = orch.start(ac.signal); - while (w.runCalls === 0) await new Promise((r) => setTimeout(r, 5)); - ac.abort(); - await startPromise; - // priorOutputs by name should yield the run's outputs. 
- const prior = await store.priorOutputs("named", "tick://named#0"); - expect(prior.map((p) => p.uri)).toEqual(["out://named#0"]); - }); - - it("bumping a worker version triggers reprocessing", async () => { - const w1 = makeOneShotWorker("ver", "v1", { - selectorYields: 1, - runYields: 1, - }); - const orch1 = createOrchestrator({ graph: store, pollMs: 5 }); - await orch1.registerWorker(w1.def); - const ac1 = new AbortController(); - const p1 = orch1.start(ac1.signal); - while (w1.runCalls === 0) await new Promise((r) => setTimeout(r, 5)); - ac1.abort(); - await p1; - - // Sanity: v1 ran once. - expect(w1.runCalls).toBe(1); - - // Now register v2 with same name; isInputProcessed should be false against new version. - const v1Done = await store.isInputProcessed("ver", "v1", "tick://ver#0"); - const v2Done = await store.isInputProcessed("ver", "v2", "tick://ver#0"); - expect(v1Done).toBe(true); - expect(v2Done).toBe(false); - }); -}); diff --git a/packages/uri-graph/tests/orchestrator/status.test.ts b/packages/uri-graph/tests/orchestrator/status.test.ts deleted file mode 100644 index db5015b..0000000 --- a/packages/uri-graph/tests/orchestrator/status.test.ts +++ /dev/null @@ -1,43 +0,0 @@ -import { describe, expect, it } from "vitest"; -import { createOrchestrator } from "../../src/orchestrator/orchestrator.js"; -import { openTempMemoryStore } from "../helpers.js"; - -describe("Orchestrator.status()", () => { - it("reports running flag and registered workers", async () => { - const store = await openTempMemoryStore(); - const orch = createOrchestrator({ graph: store, pollMs: 5 }); - await orch.registerWorker({ - name: "a", - version: "v1", - selector: async function* () { - // empty - }, - run: async function* () { - // empty - }, - }); - await orch.registerWorker({ - name: "b", - version: "v2", - selector: async function* () { - // empty - }, - run: async function* () { - // empty - }, - }); - const before = await orch.status(); - 
expect(before.running).toBe(false); - expect(before.workers.map((w) => `${w.name}:${w.version}`).sort()).toEqual(["a:v1", "b:v2"]); - - const ac = new AbortController(); - const startPromise = orch.start(ac.signal); - await new Promise((r) => setTimeout(r, 20)); - const during = await orch.status(); - expect(during.running).toBe(true); - ac.abort(); - await startPromise; - const after = await orch.status(); - expect(after.running).toBe(false); - }); -}); diff --git a/packages/uri-graph/tests/store/contract.ts b/packages/uri-graph/tests/store/contract.ts new file mode 100644 index 0000000..cd7b566 --- /dev/null +++ b/packages/uri-graph/tests/store/contract.ts @@ -0,0 +1,200 @@ +import { describe, expect, it } from "vitest"; +import type { Resource } from "../../src/index.js"; +import type { Store } from "../../src/store/store.js"; + +export type StoreFactory = () => Promise<{ store: Store; close: () => Promise }>; + +async function collect(it: AsyncIterable): Promise { + const out: T[] = []; + for await (const x of it) out.push(x); + return out; +} + +export function defineStoreContract(name: string, factory: StoreFactory): void { + describe(`${name} — Store contract`, () => { + it("mints monotonically increasing stamps", async () => { + const { store, close } = await factory(); + try { + const a = await store.newStamp(); + const b = await store.newStamp(); + const c = await store.newStamp(); + expect(a < b).toBe(true); + expect(b < c).toBe(true); + } finally { + await close(); + } + }); + + it("put + get returns the latest event for a uri", async () => { + const { store, close } = await factory(); + try { + await store.put({ uri: "file://a", stamp: 1, status: "added" }); + await store.put({ uri: "file://a", stamp: 5, status: "updated", meta: { size: 10 } }); + const r = await store.get("file://a"); + expect(r?.stamp).toBe(5); + expect(r?.status).toBe("updated"); + expect((r?.meta as { size: number }).size).toBe(10); + } finally { + await close(); + } + }); + + 
it("get returns undefined for unknown uri", async () => { + const { store, close } = await factory(); + try { + expect(await store.get("file://missing")).toBeUndefined(); + } finally { + await close(); + } + }); + + it("list filters by prefix and returns latest event per uri sorted by stamp", async () => { + const { store, close } = await factory(); + try { + await store.put({ uri: "file://a", stamp: 1, status: "added" }); + await store.put({ uri: "file://b", stamp: 2, status: "added" }); + await store.put({ uri: "text://a", stamp: 3, status: "added" }); + await store.put({ uri: "file://a", stamp: 4, status: "updated" }); + + const files = await collect(store.list({ prefix: "file://" })); + expect(files.map((r) => r.uri)).toEqual(["file://b", "file://a"]); + expect(files.find((r) => r.uri === "file://a")?.stamp).toBe(4); + + const texts = await collect(store.list({ prefix: "text://" })); + expect(texts.map((r) => r.uri)).toEqual(["text://a"]); + } finally { + await close(); + } + }); + + it("list filters by afterStamp", async () => { + const { store, close } = await factory(); + try { + await store.put({ uri: "file://a", stamp: 1, status: "added" }); + await store.put({ uri: "file://b", stamp: 5, status: "added" }); + await store.put({ uri: "file://a", stamp: 10, status: "updated" }); + + const after3 = await collect(store.list({ prefix: "file://", afterStamp: 3 })); + expect(after3.map((r) => r.uri).sort()).toEqual(["file://a", "file://b"]); + + const after7 = await collect(store.list({ prefix: "file://", afterStamp: 7 })); + expect(after7.map((r) => r.uri)).toEqual(["file://a"]); + } finally { + await close(); + } + }); + + it("saves, gets, lists, deletes workers", async () => { + const { store, close } = await factory(); + try { + await store.saveWorker({ name: "scanner", selects: "", emits: "file://" }); + await store.saveWorker({ name: "extractor", selects: "file://", emits: "text://" }); + + expect((await store.getWorker("scanner"))?.emits).toBe("file://"); + 
+ const all = await collect(store.listWorkers()); + expect(all.map((w) => w.name).sort()).toEqual(["extractor", "scanner"]); + + await store.deleteWorker("scanner"); + expect(await store.getWorker("scanner")).toBeUndefined(); + } finally { + await close(); + } + }); + + it("saveWorker upserts on conflict", async () => { + const { store, close } = await factory(); + try { + await store.saveWorker({ name: "w", selects: "a://", emits: "b://" }); + await store.saveWorker({ name: "w", selects: "x://", emits: "y://" }); + const w = await store.getWorker("w"); + expect(w?.selects).toBe("x://"); + expect(w?.emits).toBe("y://"); + } finally { + await close(); + } + }); + + it("allWatermarks returns max stamp per worker", async () => { + const { store, close } = await factory(); + try { + await store.markCompleted("a", 5); + await store.markCompleted("a", 10); + await store.markCompleted("a", 7); + await store.markCompleted("b", 3); + + const wm = await store.allWatermarks(); + expect(wm.get("a")).toBe(10); + expect(wm.get("b")).toBe(3); + expect(wm.get("c")).toBeUndefined(); + } finally { + await close(); + } + }); + + it("invalidate emits 'removed' events for matching uris", async () => { + const { store, close } = await factory(); + try { + await store.put({ uri: "text://a", stamp: 1, status: "added" }); + await store.put({ uri: "text://b", stamp: 2, status: "updated" }); + await store.put({ uri: "file://x", stamp: 3, status: "added" }); + + await store.invalidate("text://"); + + const a = await store.get("text://a"); + const b = await store.get("text://b"); + const x = await store.get("file://x"); + expect(a?.status).toBe("removed"); + expect(b?.status).toBe("removed"); + expect(x?.status).toBe("added"); + } finally { + await close(); + } + }); + + it("invalidate skips uris that are already removed", async () => { + const { store, close } = await factory(); + try { + await store.put({ uri: "text://a", stamp: 1, status: "removed" }); + + const wmBefore = await 
store.newStamp(); + await store.invalidate("text://"); + const r = await store.get("text://a"); + expect(r?.stamp).toBeLessThanOrEqual(wmBefore); + } finally { + await close(); + } + }); + + it("purgeResources({ keepLatestPerUri: true }) collapses to one event per uri", async () => { + const { store, close } = await factory(); + try { + await store.put({ uri: "file://a", stamp: 1, status: "added" }); + await store.put({ uri: "file://a", stamp: 2, status: "updated" }); + await store.put({ uri: "file://a", stamp: 3, status: "updated" }); + + await store.purgeResources({ keepLatestPerUri: true }); + + const r = await store.get("file://a"); + expect(r?.stamp).toBe(3); + + const all: Resource[] = await collect(store.list({ prefix: "file://" })); + expect(all.length).toBe(1); + } finally { + await close(); + } + }); + + it("purgeCompletions({ keepLatestPerWorker }) keeps only the N newest", async () => { + const { store, close } = await factory(); + try { + for (let s = 1; s <= 5; s++) await store.markCompleted("w", s); + await store.purgeCompletions({ keepLatestPerWorker: 2 }); + const wm = await store.allWatermarks(); + expect(wm.get("w")).toBe(5); + } finally { + await close(); + } + }); + }); +} diff --git a/packages/uri-graph/tests/store/memory-snapshot.test.ts b/packages/uri-graph/tests/store/memory-snapshot.test.ts deleted file mode 100644 index 1ab2e4e..0000000 --- a/packages/uri-graph/tests/store/memory-snapshot.test.ts +++ /dev/null @@ -1,151 +0,0 @@ -import { MemFilesApi } from "@statewalker/webrun-files-mem"; -import { describe, expect, it } from "vitest"; -import { createFilesPersistence } from "../../src/store/memory/files-persistence.js"; -import { MemoryGraphStore } from "../../src/store/memory/store.js"; -import { openGraphStore } from "../../src/store/types.js"; - -describe("MemoryGraphStore — files-backed persistence", () => { - it("opens with no existing snapshot and starts empty", async () => { - const files = new MemFilesApi(); - const store = await 
openGraphStore( - new MemoryGraphStore(createFilesPersistence(files, "/g.json")), - ); - expect(await store.getState("u://x")).toBeNull(); - expect(await store.mintStamp()).toBe(1); - }); - - it("opens with existing snapshot and restores state", async () => { - const files = new MemFilesApi(); - { - const store = await openGraphStore( - new MemoryGraphStore(createFilesPersistence(files, "/g.json")), - ); - await store.registerWorker({ name: "w", version: "v1" }); - const s = await store.mintStamp(); - const txn = await store.beginTransaction({ - worker: "w", - version: "v1", - scope: null, - initialStamp: s, - }); - await txn.applyUpdate({ - uri: "u://x", - stamp: s, - status: "added", - hash: "h", - }); - await txn.commit(); - await (store as unknown as { close(): Promise }).close(); - } - const store2 = await openGraphStore( - new MemoryGraphStore(createFilesPersistence(files, "/g.json")), - ); - expect((await store2.getState("u://x"))?.hash).toBe("h"); - const next = await store2.mintStamp(); - expect(next).toBeGreaterThan(1); - }); - - it("snapshot omits pending data", async () => { - const files = new MemFilesApi(); - const store = await openGraphStore( - new MemoryGraphStore(createFilesPersistence(files, "/g.json")), - ); - await store.registerWorker({ name: "w", version: "v1" }); - const s = await store.mintStamp(); - const txn = await store.beginTransaction({ - worker: "w", - version: "v1", - scope: null, - initialStamp: s, - }); - await txn.applyUpdate({ uri: "u://x", stamp: s, status: "added", hash: "h" }); - // Without commit, snapshot should not contain u://x. Force one via rollback. 
- await txn.rollback(); - const decoder = new TextDecoder(); - const chunks: Uint8Array[] = []; - for await (const c of files.read("/g.json")) chunks.push(c); - const total = chunks.reduce((a, c) => a + c.length, 0); - const buf = new Uint8Array(total); - let off = 0; - for (const c of chunks) { - buf.set(c, off); - off += c.length; - } - const json = JSON.parse(decoder.decode(buf)); - expect(JSON.stringify(json)).not.toContain("u://x"); - }); - - it("commit triggers snapshot write", async () => { - const files = new MemFilesApi(); - const store = await openGraphStore( - new MemoryGraphStore(createFilesPersistence(files, "/g.json")), - ); - await store.registerWorker({ name: "w", version: "v1" }); - const s = await store.mintStamp(); - const txn = await store.beginTransaction({ - worker: "w", - version: "v1", - scope: null, - initialStamp: s, - }); - await txn.applyUpdate({ uri: "u://x", stamp: s, status: "added", hash: "h" }); - await txn.commit(); - expect(await files.exists("/g.json")).toBe(true); - }); - - it("second open against same path errors", async () => { - const files = new MemFilesApi(); - const persistence = createFilesPersistence(files, "/g.json"); - await openGraphStore(new MemoryGraphStore(persistence)); - await expect( - openGraphStore(new MemoryGraphStore(createFilesPersistence(files, "/g.json"))), - ).rejects.toThrow(/already open/); - }); - - it("applyUpdate stages without affecting reads", async () => { - const files = new MemFilesApi(); - const store = await openGraphStore( - new MemoryGraphStore(createFilesPersistence(files, "/g.json")), - ); - await store.registerWorker({ name: "w", version: "v1" }); - const s = await store.mintStamp(); - const txn = await store.beginTransaction({ - worker: "w", - version: "v1", - scope: null, - initialStamp: s, - }); - await txn.applyUpdate({ uri: "u://x", stamp: s, status: "added", hash: "h" }); - expect(await store.getState("u://x")).toBeNull(); - await txn.commit(); - expect(await 
store.getState("u://x")).not.toBeNull(); - }); - - it("commit promotes staging atomically", async () => { - const files = new MemFilesApi(); - const store = await openGraphStore( - new MemoryGraphStore(createFilesPersistence(files, "/g.json")), - ); - await store.registerWorker({ name: "w", version: "v1" }); - const s = await store.mintStamp(); - const txn = await store.beginTransaction({ - worker: "w", - version: "v1", - scope: null, - initialStamp: s, - }); - for (let i = 0; i < 10; i++) { - await txn.applyUpdate({ - uri: `u://${i}`, - stamp: s, - status: "added", - hash: `h${i}`, - }); - } - expect(await store.getState("u://0")).toBeNull(); - await txn.commit(); - for (let i = 0; i < 10; i++) { - expect(await store.getState(`u://${i}`)).not.toBeNull(); - } - }); -}); diff --git a/packages/uri-graph/tests/store/memory.test.ts b/packages/uri-graph/tests/store/memory.test.ts index 32dffde..569c6c0 100644 --- a/packages/uri-graph/tests/store/memory.test.ts +++ b/packages/uri-graph/tests/store/memory.test.ts @@ -1,18 +1,7 @@ -import { defineGraphStoreContract } from "../../src/store/contract.js"; -import { createInMemoryPersistence } from "../../src/store/memory/files-persistence.js"; -import { MemoryGraphStore } from "../../src/store/memory/store.js"; -import { openGraphStore } from "../../src/store/types.js"; +import { MemoryStore } from "../../src/store/memory.js"; +import { defineStoreContract } from "./contract.js"; -defineGraphStoreContract("MemoryGraphStore", () => { - const persistence = createInMemoryPersistence("graph.json"); - return { - async open() { - const raw = new MemoryGraphStore(persistence); - return openGraphStore(raw); - }, - async close(store) { - const closable = store as { close?: () => Promise }; - if (closable.close) await closable.close(); - }, - }; +defineStoreContract("MemoryStore", async () => { + const store = new MemoryStore(); + return { store, close: async () => {} }; }); diff --git a/packages/uri-graph/tests/store/sql.test.ts 
b/packages/uri-graph/tests/store/sql.test.ts index 13e9190..ad28414 100644 --- a/packages/uri-graph/tests/store/sql.test.ts +++ b/packages/uri-graph/tests/store/sql.test.ts @@ -1,49 +1,14 @@ -import { mkdtempSync, rmSync } from "node:fs"; -import { tmpdir } from "node:os"; -import { join } from "node:path"; -import type { Db } from "@statewalker/db-api"; import { newNodeTursoDb } from "@statewalker/db-turso-node"; -import { afterEach } from "vitest"; -import { defineGraphStoreContract } from "../../src/store/contract.js"; -import { SqlGraphStore } from "../../src/store/sql/store.js"; -import { openGraphStore } from "../../src/store/types.js"; +import { SqlStore } from "../../src/store/sql.js"; +import { defineStoreContract } from "./contract.js"; -const tmpDirs: string[] = []; - -afterEach(() => { - while (tmpDirs.length) { - const dir = tmpDirs.pop(); - if (dir) { - try { - rmSync(dir, { recursive: true, force: true }); - } catch { - // ignore - } - } - } -}); - -defineGraphStoreContract("SqlGraphStore", () => { - const dir = mkdtempSync(join(tmpdir(), "uri-graph-sql-")); - tmpDirs.push(dir); - const dbPath = join(dir, "graph.db"); - const opened: { db: Db; store: SqlGraphStore }[] = []; +defineStoreContract("SqlStore", async () => { + const db = await newNodeTursoDb(); + const store = new SqlStore(db); return { - async open() { - const db = await newNodeTursoDb({ path: dbPath }); - const store = new SqlGraphStore({ db }); - opened.push({ db, store }); - return openGraphStore(store); - }, - async close(store) { - const idx = opened.findIndex((o) => o.store === (store as SqlGraphStore)); - if (idx >= 0) { - const entry = opened.splice(idx, 1)[0]; - if (entry) { - await entry.store.close(); - await entry.db.close(); - } - } + store, + close: async () => { + await db.close(); }, }; }); diff --git a/packages/uri-graph/tests/topo-layers.test.ts b/packages/uri-graph/tests/topo-layers.test.ts new file mode 100644 index 0000000..de96fd4 --- /dev/null +++ 
b/packages/uri-graph/tests/topo-layers.test.ts @@ -0,0 +1,45 @@ +import { describe, expect, it } from "vitest"; +import { topoLayers } from "../src/index.js"; + +describe("topoLayers", () => { + it("groups independent workers into the same layer", async () => { + const layers = topoLayers([ + { name: "a", selects: "x://", emits: "y://" }, + { name: "b", selects: "p://", emits: "q://" }, + ]); + expect(layers.length).toBe(1); + expect(layers[0]?.map((w) => w.name).sort()).toEqual(["a", "b"]); + }); + + it("orders dependents after their producers", async () => { + const layers = topoLayers([ + { name: "scanner", selects: "", emits: "file://" }, + { name: "extractor", selects: "file://", emits: "text://" }, + { name: "indexer", selects: "text://", emits: "db://" }, + ]); + expect(layers.map((l) => l.map((w) => w.name))).toEqual([ + ["scanner"], + ["extractor"], + ["indexer"], + ]); + }); + + it("places fan-in dependents in a layer after all upstreams", async () => { + const layers = topoLayers([ + { name: "a", selects: "", emits: "alpha://" }, + { name: "b", selects: "", emits: "beta://" }, + { name: "join", selects: "alpha://", emits: "gamma://" }, + ]); + expect(layers[0]?.map((w) => w.name).sort()).toEqual(["a", "b"]); + expect(layers[1]?.map((w) => w.name)).toEqual(["join"]); + }); + + it("throws on cycles", async () => { + expect(() => + topoLayers([ + { name: "a", selects: "x://", emits: "y://" }, + { name: "b", selects: "y://", emits: "x://" }, + ]), + ).toThrow(/cycle/); + }); +}); diff --git a/packages/uri-graph/tests/types/update.test.ts b/packages/uri-graph/tests/types/update.test.ts deleted file mode 100644 index 17750b8..0000000 --- a/packages/uri-graph/tests/types/update.test.ts +++ /dev/null @@ -1,50 +0,0 @@ -import { describe, expectTypeOf, it } from "vitest"; -import type { ReadOnlyView, Status, Update } from "../../src/types/update.js"; - -describe("Update", () => { - it("requires uri, stamp, status", () => { - 
expectTypeOf().toHaveProperty("uri").toEqualTypeOf(); - expectTypeOf().toHaveProperty("stamp").toEqualTypeOf(); - expectTypeOf().toHaveProperty("status").toEqualTypeOf(); - }); - - it("Status is the three-value union", () => { - expectTypeOf().toEqualTypeOf<"added" | "updated" | "removed">(); - }); - - it("hash, scope, role, attributes are optional", () => { - const minimal: Update = { uri: "u", stamp: 1, status: "added" }; - expectTypeOf(minimal).toMatchTypeOf(); - const full: Update = { - uri: "u", - stamp: 1, - status: "updated", - hash: "h", - scope: "s", - role: "r", - attributes: { k: 1 }, - }; - expectTypeOf(full).toMatchTypeOf(); - }); -}); - -describe("ReadOnlyView", () => { - it("has uri, stamp, status, optional hash and attributes", () => { - const view: ReadOnlyView = { uri: "u", stamp: 1, status: "added" }; - expectTypeOf(view).toMatchTypeOf(); - const view2: ReadOnlyView = { - uri: "u", - stamp: 1, - status: "removed", - hash: "h", - attributes: { x: true }, - }; - expectTypeOf(view2).toMatchTypeOf(); - }); - - it("does not carry scope or role", () => { - type ROVKeys = keyof ReadOnlyView; - expectTypeOf<"scope" extends ROVKeys ? true : false>().toEqualTypeOf(); - expectTypeOf<"role" extends ROVKeys ? 
true : false>().toEqualTypeOf(); - }); -}); diff --git a/packages/uri-graph/tests/types/worker.test.ts b/packages/uri-graph/tests/types/worker.test.ts deleted file mode 100644 index e4e11c3..0000000 --- a/packages/uri-graph/tests/types/worker.test.ts +++ /dev/null @@ -1,69 +0,0 @@ -import { describe, expectTypeOf, it } from "vitest"; -import type { ReadOnlyView, Update } from "../../src/types/update.js"; -import type { - Selector, - SelectorContext, - WorkerDefinition, - WorkerParams, -} from "../../src/types/worker.js"; - -describe("WorkerParams", () => { - it("exposes stamp, read, find, priorOutputs, recordRead, signal", () => { - expectTypeOf().toHaveProperty("stamp").toEqualTypeOf<() => Promise>(); - expectTypeOf() - .toHaveProperty("read") - .toEqualTypeOf<(uri: string) => Promise>(); - expectTypeOf() - .toHaveProperty("find") - .toEqualTypeOf<(pattern: string) => AsyncIterable>(); - expectTypeOf() - .toHaveProperty("priorOutputs") - .toEqualTypeOf<(inputUri: string) => Promise>(); - expectTypeOf() - .toHaveProperty("recordRead") - .toEqualTypeOf<(uri: string, role?: string) => void>(); - expectTypeOf().toHaveProperty("signal").toEqualTypeOf(); - }); -}); - -describe("Selector", () => { - it("is a function from SelectorContext to AsyncIterableIterator", () => { - expectTypeOf().toEqualTypeOf< - (ctx: SelectorContext) => AsyncIterableIterator - >(); - }); -}); - -describe("WorkerDefinition", () => { - it("requires name, version, selector, run; rest optional", () => { - const def: WorkerDefinition = { - name: "w", - version: "v1", - selector: async function* () { - // empty - }, - run: async function* () { - // empty - }, - }; - expectTypeOf(def).toMatchTypeOf(); - }); - - it("accepts inputPattern, outputPattern, scopeExpr, description", () => { - const def: WorkerDefinition = { - name: "w", - version: "v1", - description: "d", - inputPattern: "file://**", - outputPattern: "text://**", - scopeExpr: "uri", - selector: async function* () { - // empty - }, - run: 
async function* () { - // empty - }, - }; - expectTypeOf(def).toMatchTypeOf(); - }); -}); diff --git a/packages/uri-graph/tests/workers/chunker.test.ts b/packages/uri-graph/tests/workers/chunker.test.ts deleted file mode 100644 index 8f59b2b..0000000 --- a/packages/uri-graph/tests/workers/chunker.test.ts +++ /dev/null @@ -1,97 +0,0 @@ -import { beforeEach, describe, expect, it } from "vitest"; -import { drain } from "../../src/orchestrator/drain.js"; -import type { GraphStore } from "../../src/store/types.js"; -import type { Update } from "../../src/types/update.js"; -import { createChunker } from "../../src/workers/chunker.js"; -import { openTempMemoryStore } from "../helpers.js"; - -describe("chunker", () => { - let store: GraphStore; - - beforeEach(async () => { - store = await openTempMemoryStore(); - }); - - async function feedTextUpdates(updates: Update[]): Promise { - const chunker = createChunker({ chunkSize: 5 }); - await store.registerWorker({ - name: chunker.name, - version: chunker.version, - }); - async function* feed(): AsyncIterableIterator { - for (const u of updates) yield u; - } - await drain(chunker, feed(), store); - } - - function textUpdate(uri: string, text: string, hash: string): Update { - return { - uri, - stamp: 0, - status: "added", - hash, - attributes: { text }, - }; - } - - async function chunkUris(): Promise { - const out: string[] = []; - for await (const v of store.find("chunk:///%")) out.push(v.uri); - return out.sort(); - } - - it("splits text into chunks under one shared stamp", async () => { - const text = "abcde" + "fghij" + "klmno"; // 15 chars, chunkSize 5 → 3 chunks - await feedTextUpdates([textUpdate("text:///a.md", text, "h")]); - const uris = await chunkUris(); - expect(uris).toEqual(["chunk:///a.md#0", "chunk:///a.md#1", "chunk:///a.md#2"]); - const stamps = await Promise.all(uris.map(async (u) => (await store.getState(u))?.stamp)); - expect(new Set(stamps).size).toBe(1); - }); - - it("stable chunk URIs across re-runs", 
async () => { - const text = "abcdefghij"; // 10 chars → 2 chunks - await feedTextUpdates([textUpdate("text:///x.md", text, "h1")]); - const first = await chunkUris(); - await feedTextUpdates([textUpdate("text:///x.md", text, "h2")]); - const second = await chunkUris(); - expect(second).toEqual(first); - }); - - it("shrinking output cleans surplus chunks", async () => { - const long = "12345" + "67890" + "abcde" + "fghij" + "klmno"; // 25 chars → 5 chunks - await feedTextUpdates([textUpdate("text:///s.md", long, "h1")]); - const before = await chunkUris(); - expect(before.length).toBe(5); - - const short = "12345" + "67890" + "abcde"; // 15 chars → 3 chunks - await feedTextUpdates([textUpdate("text:///s.md", short, "h2")]); - - // The first 3 chunks should still exist; chunks #3 and #4 should be removed. - const c3 = await store.getState("chunk:///s.md#3"); - const c4 = await store.getState("chunk:///s.md#4"); - expect(c3?.status).toBe("removed"); - expect(c4?.status).toBe("removed"); - }); - - it("two text docs get distinct stamps", async () => { - await feedTextUpdates([ - textUpdate("text:///a.md", "hello world!", "ha"), - textUpdate("text:///b.md", "another doc", "hb"), - ]); - const a0 = await store.getState("chunk:///a.md#0"); - const b0 = await store.getState("chunk:///b.md#0"); - expect(a0?.stamp).not.toBe(b0?.stamp); - }); - - it("upstream removal cascades to chunks", async () => { - await feedTextUpdates([textUpdate("text:///a.md", "abcdefghij", "h")]); - expect((await store.getState("chunk:///a.md#0"))?.status).toBe("added"); - - await feedTextUpdates([{ uri: "text:///a.md", stamp: 0, status: "removed" }]); - const c0 = await store.getState("chunk:///a.md#0"); - const c1 = await store.getState("chunk:///a.md#1"); - expect(c0?.status).toBe("removed"); - expect(c1?.status).toBe("removed"); - }); -}); diff --git a/packages/uri-graph/tests/workers/embedder.test.ts b/packages/uri-graph/tests/workers/embedder.test.ts deleted file mode 100644 index 
7daa3e8..0000000 --- a/packages/uri-graph/tests/workers/embedder.test.ts +++ /dev/null @@ -1,92 +0,0 @@ -import { beforeEach, describe, expect, it } from "vitest"; -import { drain } from "../../src/orchestrator/drain.js"; -import type { GraphStore } from "../../src/store/types.js"; -import type { Update } from "../../src/types/update.js"; -import { createEmbedder } from "../../src/workers/embedder.js"; -import { openTempMemoryStore } from "../helpers.js"; - -describe("embedder", () => { - let store: GraphStore; - - beforeEach(async () => { - store = await openTempMemoryStore(); - }); - - function chunkUpdate(uri: string, text: string): Update { - return { - uri, - stamp: 0, - status: "added", - hash: `h:${uri}`, - attributes: { text }, - }; - } - - it("yields one stamp per chunk", async () => { - const embedder = createEmbedder({ - embed: async (text: string) => new Float32Array([text.length, 0, 0]), - }); - await store.registerWorker({ - name: embedder.name, - version: embedder.version, - }); - const inputs = [ - chunkUpdate("chunk:///a.md#0", "hello"), - chunkUpdate("chunk:///a.md#1", "world"), - chunkUpdate("chunk:///a.md#2", "!"), - ]; - async function* feed(): AsyncIterableIterator { - for (const u of inputs) yield u; - } - await drain(embedder, feed(), store); - const e0 = await store.getState("embedding://chunk:///a.md#0"); - const e1 = await store.getState("embedding://chunk:///a.md#1"); - const e2 = await store.getState("embedding://chunk:///a.md#2"); - expect(e0?.stamp).not.toBe(e1?.stamp); - expect(e1?.stamp).not.toBe(e2?.stamp); - expect(e0?.stamp).not.toBe(e2?.stamp); - }); - - it("calls the embedding API once per chunk", async () => { - let calls = 0; - const embedder = createEmbedder({ - embed: async (_text: string) => { - calls += 1; - return new Float32Array([1, 2, 3]); - }, - }); - await store.registerWorker({ - name: embedder.name, - version: embedder.version, - }); - async function* feed(): AsyncIterableIterator { - yield 
chunkUpdate("chunk:///x#0", "a"); - yield chunkUpdate("chunk:///x#1", "b"); - } - await drain(embedder, feed(), store); - expect(calls).toBe(2); - }); - - it("respects abort: stops mid-stream", async () => { - const ac = new AbortController(); - const embedder = createEmbedder({ - embed: async (text: string) => { - if (text === "abort-here") ac.abort(); - return new Float32Array([1]); - }, - }); - await store.registerWorker({ - name: embedder.name, - version: embedder.version, - }); - async function* feed(): AsyncIterableIterator { - yield chunkUpdate("chunk:///a", "ok"); - yield chunkUpdate("chunk:///b", "abort-here"); - yield chunkUpdate("chunk:///c", "should-not-process"); - } - await drain(embedder, feed(), store, { signal: ac.signal }); - // a was processed; b might or might not have committed (its embed returned - // before abort took effect). c should NOT have been processed. - expect(await store.getState("embedding://chunk:///c")).toBeNull(); - }); -}); diff --git a/packages/uri-graph/tests/workers/extractors.test.ts b/packages/uri-graph/tests/workers/extractors.test.ts deleted file mode 100644 index 4569dea..0000000 --- a/packages/uri-graph/tests/workers/extractors.test.ts +++ /dev/null @@ -1,127 +0,0 @@ -import { writeText } from "@statewalker/webrun-files"; -import { MemFilesApi } from "@statewalker/webrun-files-mem"; -import { beforeEach, describe, expect, it } from "vitest"; -import { drain } from "../../src/orchestrator/drain.js"; -import type { GraphStore } from "../../src/store/types.js"; -import type { Update } from "../../src/types/update.js"; -import { createHtmlExtractor } from "../../src/workers/extractors/html-extractor.js"; -import { createMarkdownExtractor } from "../../src/workers/extractors/markdown-extractor.js"; -import { createPlainTextExtractor } from "../../src/workers/extractors/plain-text-extractor.js"; -import { createFileWatcher } from "../../src/workers/file-watcher.js"; -import { openTempMemoryStore } from "../helpers.js"; - 
-describe("extractors", () => { - let store: GraphStore; - let files: MemFilesApi; - - beforeEach(async () => { - files = new MemFilesApi(); - store = await openTempMemoryStore(); - }); - - async function runWatcher(): Promise { - const watcher = createFileWatcher({ files, rootPath: "/" }); - await store.registerWorker({ - name: watcher.name, - version: watcher.version, - }); - async function* tick(): AsyncIterableIterator { - yield { uri: `tick://${watcher.name}`, stamp: 0, status: "updated" }; - } - await drain(watcher, tick(), store); - } - - async function runExtractor( - extractor: ReturnType, - ): Promise { - await store.registerWorker({ - name: extractor.name, - version: extractor.version, - }); - // Synthesize input from committed file:// URIs. - const inputs: Update[] = []; - for await (const v of store.find("file:///%")) { - inputs.push({ - uri: v.uri, - stamp: v.stamp, - status: v.status, - hash: v.hash, - attributes: v.attributes, - }); - } - async function* feed(): AsyncIterableIterator { - for (const u of inputs) yield u; - } - await drain(extractor, feed(), store); - } - - describe("markdown", () => { - it("matches only .md files and emits text://", async () => { - await writeText(files, "/a.md", "# Heading\n\nbody"); - await writeText(files, "/b.txt", "text only"); - await writeText(files, "/c.png", "binary"); - await runWatcher(); - await runExtractor(createMarkdownExtractor({ files })); - - const textViews: string[] = []; - for await (const v of store.find("text:///%")) textViews.push(v.uri); - expect(textViews.sort()).toEqual(["text:///a.md"]); - }); - - it("two files with identical content yield identical hashes", async () => { - await writeText(files, "/x.md", "same body"); - await writeText(files, "/y.md", "same body"); - await runWatcher(); - await runExtractor(createMarkdownExtractor({ files })); - const x = await store.getState("text:///x.md"); - const y = await store.getState("text:///y.md"); - expect(x?.hash).toBe(y?.hash); - 
expect(x?.hash).toBeTruthy(); - }); - - it("removed source cascades to text URI", async () => { - await writeText(files, "/a.md", "body"); - await runWatcher(); - await runExtractor(createMarkdownExtractor({ files })); - expect((await store.getState("text:///a.md"))?.status).toBe("added"); - - await files.remove("/a.md"); - await runWatcher(); - await runExtractor(createMarkdownExtractor({ files })); - expect((await store.getState("text:///a.md"))?.status).toBe("removed"); - }); - - it("ignores non-md files in the input stream", async () => { - await writeText(files, "/c.png", "binary"); - await runWatcher(); - await runExtractor(createMarkdownExtractor({ files })); - expect(await store.getState("text:///c.png")).toBeNull(); - }); - }); - - describe("plain text", () => { - it("matches only .txt files", async () => { - await writeText(files, "/a.md", "md"); - await writeText(files, "/b.txt", "plain"); - await runWatcher(); - await runExtractor(createPlainTextExtractor({ files })); - const textViews: string[] = []; - for await (const v of store.find("text:///%")) textViews.push(v.uri); - expect(textViews).toEqual(["text:///b.txt"]); - }); - }); - - describe("html", () => { - it("strips tags and emits plain text", async () => { - await writeText(files, "/a.html", "

Hi

world

"); - await runWatcher(); - await runExtractor(createHtmlExtractor({ files })); - const view = await store.getState("text:///a.html"); - expect(view).not.toBeNull(); - const text = (view?.attributes as Record)?.text as string; - expect(text).not.toContain("<"); - expect(text).toContain("Hi"); - expect(text).toContain("world"); - }); - }); -}); diff --git a/packages/uri-graph/tests/workers/file-watcher.test.ts b/packages/uri-graph/tests/workers/file-watcher.test.ts deleted file mode 100644 index 9b81dec..0000000 --- a/packages/uri-graph/tests/workers/file-watcher.test.ts +++ /dev/null @@ -1,130 +0,0 @@ -import { writeText } from "@statewalker/webrun-files"; -import { MemFilesApi } from "@statewalker/webrun-files-mem"; -import { beforeEach, describe, expect, it } from "vitest"; -import { drain } from "../../src/orchestrator/drain.js"; -import type { GraphStore } from "../../src/store/types.js"; -import type { Update } from "../../src/types/update.js"; -import { createFileWatcher } from "../../src/workers/file-watcher.js"; -import { openTempMemoryStore } from "../helpers.js"; - -describe("file watcher", () => { - let store: GraphStore; - let files: MemFilesApi; - - beforeEach(async () => { - files = new MemFilesApi(); - store = await openTempMemoryStore(); - }); - - async function* singleTick(): AsyncIterableIterator { - yield { uri: "tick://file-watcher", stamp: 0, status: "updated" }; - } - - async function runOnce(rootPath: string): Promise { - const watcher = createFileWatcher({ files, rootPath }); - await store.registerWorker({ - name: watcher.name, - version: watcher.version, - }); - await drain(watcher, singleTick(), store); - } - - async function fileUris(): Promise { - const out: string[] = []; - for await (const v of store.find("file:///%")) out.push(v.uri); - return out.sort(); - } - - it("scans all files, not just one extension", async () => { - await writeText(files, "/a.md", "hello"); - await writeText(files, "/b.txt", "world"); - await writeText(files, 
"/c.png", "binary-ish"); - await writeText(files, "/d.pdf", "blob"); - await runOnce("/"); - expect(await fileUris()).toEqual([ - "file:///a.md", - "file:///b.txt", - "file:///c.png", - "file:///d.pdf", - ]); - }); - - it("emits added on first sighting and nothing on second sighting if unchanged", async () => { - await writeText(files, "/a.md", "hello"); - await runOnce("/"); - const firstStamp = (await store.getState("file:///a.md"))?.stamp; - await runOnce("/"); - const secondStamp = (await store.getState("file:///a.md"))?.stamp; - expect(secondStamp).toBe(firstStamp); - }); - - it("emits update when file mtime/size changes", async () => { - await writeText(files, "/a.md", "hello"); - await runOnce("/"); - const before = await store.getState("file:///a.md"); - // re-write with different content (different size) - await new Promise((r) => setTimeout(r, 5)); - await writeText(files, "/a.md", "hello world"); - await runOnce("/"); - const after = await store.getState("file:///a.md"); - expect(before).not.toBeNull(); - expect(after).not.toBeNull(); - if (before && after) { - expect(after.stamp).toBeGreaterThan(before.stamp); - } - }); - - it("emits removed when file is deleted", async () => { - await writeText(files, "/a.md", "hello"); - await runOnce("/"); - expect((await store.getState("file:///a.md"))?.status).toBe("added"); - await files.remove("/a.md"); - await runOnce("/"); - expect((await store.getState("file:///a.md"))?.status).toBe("removed"); - }); - - it("does not read file bytes", async () => { - await writeText(files, "/a.md", "hello"); - let readCalled = false; - const wrappedFiles = new Proxy(files, { - get(target, prop, receiver) { - if (prop === "read") { - return (...args: unknown[]) => { - readCalled = true; - // biome-ignore lint/suspicious/noExplicitAny: proxy passthrough - return (target as any).read(...args); - }; - } - return Reflect.get(target, prop, receiver); - }, - }); - const watcher = createFileWatcher({ files: wrappedFiles, rootPath: "/" 
}); - await store.registerWorker({ - name: watcher.name, - version: watcher.version, - }); - await drain(watcher, singleTick(), store); - expect(readCalled).toBe(false); - }); - - it("hash format is size:mtime", async () => { - await writeText(files, "/a.md", "hello"); - await runOnce("/"); - const view = await store.getState("file:///a.md"); - expect(view?.hash).toMatch(/^\d+:\d+$/); - }); - - it("empty FS yields nothing", async () => { - await runOnce("/"); - expect(await fileUris()).toEqual([]); - }); - - it("skips directories", async () => { - await writeText(files, "/sub/a.md", "hello"); - await runOnce("/"); - const uris = await fileUris(); - expect(uris).toEqual(["file:///sub/a.md"]); - // No URI for the directory itself. - expect(uris.some((u) => u.endsWith("/sub"))).toBe(false); - }); -}); diff --git a/packages/uri-graph/tests/workers/index-backends.test.ts b/packages/uri-graph/tests/workers/index-backends.test.ts deleted file mode 100644 index 43fe326..0000000 --- a/packages/uri-graph/tests/workers/index-backends.test.ts +++ /dev/null @@ -1,54 +0,0 @@ -import { describe, expect, it } from "vitest"; -import { createMemoryFtsBackend } from "../../src/workers/index-backends/memory-fts.js"; -import { createMemoryVectorBackend } from "../../src/workers/index-backends/memory-vector.js"; - -describe("memory FTS backend", () => { - it("indexes documents per scope and finds matches", () => { - const fts = createMemoryFtsBackend(); - fts.upsert("doc:a", ["hello world", "foo bar"]); - fts.upsert("doc:b", ["hello there"]); - const hits = fts - .query("hello") - .map((h) => h.scope) - .sort(); - expect(hits).toEqual(["doc:a", "doc:b"]); - }); - - it("delete removes a scope's entries", () => { - const fts = createMemoryFtsBackend(); - fts.upsert("doc:a", ["hello"]); - fts.upsert("doc:b", ["hello"]); - fts.remove("doc:a"); - const hits = fts.query("hello").map((h) => h.scope); - expect(hits).toEqual(["doc:b"]); - }); - - it("upsert replaces existing entries for a 
scope", () => { - const fts = createMemoryFtsBackend(); - fts.upsert("doc:a", ["banana"]); - fts.upsert("doc:a", ["apple"]); - expect(fts.query("banana").length).toBe(0); - expect(fts.query("apple").length).toBe(1); - }); -}); - -describe("memory vector backend", () => { - it("stores vectors and finds nearest", () => { - const vec = createMemoryVectorBackend(); - vec.upsert("v:a", new Float32Array([1, 0])); - vec.upsert("v:b", new Float32Array([0, 1])); - vec.upsert("v:c", new Float32Array([0.9, 0.1])); - const top = vec.search(new Float32Array([1, 0]), 2); - expect(top[0]?.id).toBe("v:a"); - expect(top[1]?.id).toBe("v:c"); - }); - - it("delete removes the vector", () => { - const vec = createMemoryVectorBackend(); - vec.upsert("v:a", new Float32Array([1])); - vec.upsert("v:b", new Float32Array([1])); - vec.remove("v:a"); - const top = vec.search(new Float32Array([1]), 5); - expect(top.map((t) => t.id)).toEqual(["v:b"]); - }); -}); diff --git a/packages/uri-graph/tests/workers/indexer.test.ts b/packages/uri-graph/tests/workers/indexer.test.ts deleted file mode 100644 index 413f81f..0000000 --- a/packages/uri-graph/tests/workers/indexer.test.ts +++ /dev/null @@ -1,117 +0,0 @@ -import { beforeEach, describe, expect, it } from "vitest"; -import { drain } from "../../src/orchestrator/drain.js"; -import type { GraphStore } from "../../src/store/types.js"; -import type { Update } from "../../src/types/update.js"; -import { createMemoryFtsBackend } from "../../src/workers/index-backends/memory-fts.js"; -import { createMemoryVectorBackend } from "../../src/workers/index-backends/memory-vector.js"; -import { createIndexer } from "../../src/workers/indexer.js"; -import { openTempMemoryStore } from "../helpers.js"; - -describe("indexer", () => { - let store: GraphStore; - - beforeEach(async () => { - store = await openTempMemoryStore(); - }); - - function indexerInputs(scope: string): Update[] { - return [ - { - uri: scope, - stamp: 0, - status: "added", - scope, - role: 
"text", - attributes: { text: "hello world" }, - }, - { - uri: `chunk:${scope.slice(5)}#0`, - stamp: 0, - status: "added", - scope, - role: "chunk", - attributes: { text: "hello world" }, - }, - { - uri: `embedding://chunk:${scope.slice(5)}#0`, - stamp: 0, - status: "added", - scope, - role: "embedding", - attributes: { vector: [1, 0, 0] }, - }, - ]; - } - - it("emits one fts and one vector index URI per ready scope", async () => { - const fts = createMemoryFtsBackend(); - const vec = createMemoryVectorBackend(); - const indexer = createIndexer({ fts, vector: vec }); - await store.registerWorker({ - name: indexer.name, - version: indexer.version, - }); - - const inputs = indexerInputs("text:///x.md"); - async function* feed(): AsyncIterableIterator { - for (const u of inputs) yield u; - } - await drain(indexer, feed(), store); - - expect(await store.getState("index://fts/text:///x.md")).not.toBeNull(); - expect(await store.getState("index://vector/text:///x.md")).not.toBeNull(); - // The FTS backend now indexes the scope. - expect(fts.query("hello").map((h) => h.scope)).toContain("text:///x.md"); - }); - - it("groups inputs by scope when they interleave (sorted by joinInputs upstream)", async () => { - const fts = createMemoryFtsBackend(); - const vec = createMemoryVectorBackend(); - const indexer = createIndexer({ fts, vector: vec }); - await store.registerWorker({ - name: indexer.name, - version: indexer.version, - }); - - const inputs = [...indexerInputs("text:///a.md"), ...indexerInputs("text:///b.md")].sort( - (x, y) => (x.scope ?? "").localeCompare(y.scope ?? 
""), - ); - async function* feed(): AsyncIterableIterator { - for (const u of inputs) yield u; - } - await drain(indexer, feed(), store); - expect(await store.getState("index://fts/text:///a.md")).not.toBeNull(); - expect(await store.getState("index://fts/text:///b.md")).not.toBeNull(); - }); - - it("removed text cascades indexes", async () => { - const fts = createMemoryFtsBackend(); - const vec = createMemoryVectorBackend(); - const indexer = createIndexer({ fts, vector: vec }); - await store.registerWorker({ - name: indexer.name, - version: indexer.version, - }); - - // First index it, then send a removed text update. - async function* feed1(): AsyncIterableIterator { - for (const u of indexerInputs("text:///r.md")) yield u; - } - await drain(indexer, feed1(), store); - expect(fts.query("hello").length).toBeGreaterThan(0); - - async function* feed2(): AsyncIterableIterator { - yield { - uri: "text:///r.md", - stamp: 0, - status: "removed", - scope: "text:///r.md", - role: "text", - }; - } - await drain(indexer, feed2(), store); - expect((await store.getState("index://fts/text:///r.md"))?.status).toBe("removed"); - expect((await store.getState("index://vector/text:///r.md"))?.status).toBe("removed"); - expect(fts.query("hello").length).toBe(0); - }); -});