diff --git a/.changeset/content-pipeline-redesign.md b/.changeset/content-pipeline-redesign.md
new file mode 100644
index 0000000..e8a5c93
--- /dev/null
+++ b/.changeset/content-pipeline-redesign.md
@@ -0,0 +1,15 @@
+---
+"@statewalker/content-pipeline": minor
+"@statewalker/content-cli": patch
+---
+
+**BREAKING:** removed `@statewalker/content-scanner` and `@statewalker/content-manager`, replacing them with a single new package, `@statewalker/content-pipeline`.
+
+The new package implements the same cascade (files → extract → split → embed → fts/vec index) as a set of layered Trackers — each with a persisted cursor, monotonic integer stamps, batched pacing, runtime cascade via `onStampUpdate`, and tombstone propagation — trimming ~2450 LOC of scanner + manager infrastructure down to ~585 LOC. Three interchangeable `Store` backends (`JsonManifestStore`, `BlobStore` with pluggable codecs including a raw Float32 fast-path for embeddings, and an optional day-2 `SqlStore`) let each layer pick the right persistence for its payload profile.
+
+**Migration:**
+- Replace `@statewalker/content-manager` / `@statewalker/content-scanner` imports with `@statewalker/content-pipeline`.
+- `createContentManager` options change: drop `registry: FilesScanRegistry`, add `statePrefix: string` (the directory under which the pipeline stores its state). Everything else (`indexer`, `files`, `extractors`, `chunkOptions`, `embed`, `root`, `filter`) is unchanged. The `sync` / `search` / `status` / `clear` / `close` public surface is preserved.
+- The first run after upgrading rebuilds state from scratch; the on-disk store layout is not compatible with the old one.
+
+See `openspec/changes/content-pipeline-redesign/` in the umbrella for the full proposal, design notes, and spec deltas.
diff --git a/README.md b/README.md
index 7266909..a72c329 100644
--- a/README.md
+++ b/README.md
@@ -10,8 +10,7 @@ Content pipeline: blocks, extractors, scanners, managers, plus the content-cli.
 | --- | --- |
 | [@statewalker/content-blocks](packages/content-blocks) | Block types shared across the content pipeline. |
 | [@statewalker/content-extractors](packages/content-extractors) | PDF/DOCX/XLSX/Markdown/HTML extractors. |
-| [@statewalker/content-scanner](packages/content-scanner) | Scans a file tree and streams blocks into indexers. |
-| [@statewalker/content-manager](packages/content-manager) | High-level scan + index orchestration. |
+| [@statewalker/content-pipeline](packages/content-pipeline) | Layered trackers that cascade file-system changes through extract, split, embed, and index stages.
| ## Apps diff --git a/apps/content-cli/package.json b/apps/content-cli/package.json index e84696e..0ad2b0e 100644 --- a/apps/content-cli/package.json +++ b/apps/content-cli/package.json @@ -32,14 +32,14 @@ }, "dependencies": { "@statewalker/content-extractors": "workspace:*", - "@statewalker/content-manager": "workspace:*", - "@statewalker/content-scanner": "workspace:*", + "@statewalker/content-pipeline": "workspace:*", "@statewalker/indexer-api": "catalog:", "@statewalker/indexer-mem-flexsearch": "catalog:", "@statewalker/webrun-files": "catalog:", "@statewalker/webrun-files-node": "catalog:" }, "devDependencies": { + "@types/node": "catalog:", "rimraf": "catalog:", "tsdown": "catalog:", "tsx": "catalog:", diff --git a/apps/content-cli/src/cli.ts b/apps/content-cli/src/cli.ts index 75a61c1..23b6c1f 100644 --- a/apps/content-cli/src/cli.ts +++ b/apps/content-cli/src/cli.ts @@ -2,8 +2,7 @@ import { resolve } from "node:path"; import { createDefaultRegistry } from "@statewalker/content-extractors/extractors"; -import { createContentManager } from "@statewalker/content-manager"; -import { FilesScanRegistry } from "@statewalker/content-scanner"; +import { createContentManager } from "@statewalker/content-pipeline"; import type { IndexerPersistence, PersistenceEntry } from "@statewalker/indexer-api"; import { createFlexSearchIndexer } from "@statewalker/indexer-mem-flexsearch"; import type { FilesApi } from "@statewalker/webrun-files"; @@ -112,18 +111,17 @@ async function main() { const rootDir = resolve(rootPath); const files = new NodeFilesApi({ rootDir }); const indexDir = `/${systemFolder}/indexer`; - const scanDir = `/${systemFolder}/scan`; + const statePrefix = `/${systemFolder}/content`; const persistence = createFilePersistence(files, indexDir); const indexer = createFlexSearchIndexer({ persistence }); - const registry = new FilesScanRegistry({ files, prefix: scanDir }); const extractors = createDefaultRegistry(); const manager = createContentManager({ - registry, indexer, files, extractors, + statePrefix, root: "/", filter: (path: string) => !path.startsWith(`/${systemFolder}/`), }); diff --git a/apps/content-cli/tsconfig.json b/apps/content-cli/tsconfig.json index e1bad3a..a8c7c61 100644 --- a/apps/content-cli/tsconfig.json +++ b/apps/content-cli/tsconfig.json @@ -3,7 +3,8 @@ "target": "ES2022", "module": "Preserve", "moduleResolution": "Bundler", - "lib": ["ESNext"], + "lib": ["ESNext", "DOM"], + "types": ["node"], "strict": true, "skipLibCheck": true, "verbatimModuleSyntax": true, diff --git a/packages/content-blocks/README.md b/packages/content-blocks/README.md index f6e2e63..2899a5e 100644 --- a/packages/content-blocks/README.md +++ b/packages/content-blocks/README.md @@ -23,4 +23,4 @@ For stable block ID generation use `@statewalker/shared-ids` directly. ## Related -- `@statewalker/content-extractors`, `@statewalker/content-scanner`, `@statewalker/content-manager`. +- `@statewalker/content-extractors`, `@statewalker/content-pipeline`. diff --git a/packages/content-manager/README.md b/packages/content-manager/README.md deleted file mode 100644 index bfd1062..0000000 --- a/packages/content-manager/README.md +++ /dev/null @@ -1,26 +0,0 @@ -# @statewalker/content-manager - -Content manager: orchestrates scanning, extraction, chunking, and indexing over a `@statewalker/webrun-files` tree. 
- -## Installation - -```sh -pnpm add @statewalker/content-manager -``` - -## Usage - -```ts -import { createContentManager } from "@statewalker/content-manager"; - -const mgr = createContentManager({ fs, indexer, chunker }); -await mgr.sync(rootPath); -``` - -## API - -- `createContentManager(options)` — high-level scan + index driver. - -## Related - -- `@statewalker/content-scanner`, `@statewalker/content-extractors`, `@statewalker/indexer-api`. diff --git a/packages/content-manager/package.json b/packages/content-manager/package.json deleted file mode 100644 index 81d96e0..0000000 --- a/packages/content-manager/package.json +++ /dev/null @@ -1,53 +0,0 @@ -{ - "name": "@statewalker/content-manager", - "version": "0.1.0", - "private": false, - "type": "module", - "description": "Content manager: orchestrates scanning, extraction, and indexing of a @statewalker/webrun-files tree.", - "homepage": "https://github.com/statewalker/statewalker-content", - "author": { - "name": "Mikhail Kotelnikov", - "email": "mikhail.kotelnikov@gmail.com" - }, - "license": "MIT", - "repository": { - "type": "git", - "url": "git+ssh://git@github.com/statewalker/statewalker-content.git" - }, - "exports": { - ".": "./src/index.ts" - }, - "files": [ - "dist", - "src" - ], - "scripts": { - "build": "tsdown", - "dev": "tsdown --watch", - "test": "vitest run", - "test:watch": "vitest", - "typecheck": "tsc --noEmit", - "clean": "rimraf dist", - "lint": "biome check --write .", - "format": "biome format --write ." - }, - "dependencies": { - "@statewalker/content-extractors": "workspace:*", - "@statewalker/content-scanner": "workspace:*", - "@statewalker/indexer-api": "catalog:", - "@statewalker/indexer-chunker": "catalog:", - "@statewalker/webrun-files": "catalog:" - }, - "devDependencies": { - "@statewalker/indexer-mem-flexsearch": "catalog:", - "@statewalker/webrun-files-mem": "catalog:", - "rimraf": "catalog:", - "tsdown": "catalog:", - "typescript": "catalog:", - "vitest": "catalog:" - }, - "sideEffects": false, - "publishConfig": { - "access": "public" - } -} diff --git a/packages/content-manager/src/content-manager.ts b/packages/content-manager/src/content-manager.ts deleted file mode 100644 index abff1eb..0000000 --- a/packages/content-manager/src/content-manager.ts +++ /dev/null @@ -1,221 +0,0 @@ -import type { - ScanRegistry, - ScanStore, - Stamp, - Update, - UpdateSource, -} from "@statewalker/content-scanner"; -import { - ContentExtractorScanner, - ContentFtsIndexerScanner, - ContentSplitterScanner, - FilesScanner, -} from "@statewalker/content-scanner"; -import type { - CreateIndexParams, - DocumentPath, - HybridSearchResult, - Index, - Indexer, -} from "@statewalker/indexer-api"; -import type { ChunkOptions } from "@statewalker/indexer-chunker"; -import type { - ContentManager, - ContentManagerOptions, - ContentSearchParams, - ContentStatus, - SearchHit, - SyncEvent, -} from "./types.js"; - -const DEFAULT_CHUNK_OPTIONS: ChunkOptions = { - targetChars: 1500, -}; - -async function getOrCreateStore(registry: ScanRegistry, name: string): Promise { - const existing = await registry.getStore(name); - if (existing) return existing; - return registry.createStore(name); -} - -async function getOrCreateIndex(indexer: Indexer): Promise { - const existing = await indexer.getIndex("content"); - if (existing) return existing; - const params: CreateIndexParams = { - name: "content", - fulltext: { language: "en" }, - }; - return indexer.createIndex(params); -} - -/** - * Create a cascade source that yields only entries 
changed or removed - * since the given timestamp. This enables incremental downstream processing. - */ -function cascadeSource(store: ScanStore, since: Stamp | null): UpdateSource { - return async function* (): AsyncGenerator { - for await (const entry of store.list()) { - if (!since) { - yield entry; - continue; - } - // Include entries with newer stamp (changed) - if (entry.stamp > since) { - yield entry; - continue; - } - // Include entries removed after downstream's last scan - if (entry.removed && entry.removed > since) { - yield entry; - } - } - }; -} - -export function createContentManager(options: ContentManagerOptions): ContentManager { - const { - registry, - indexer, - files, - extractors, - chunkOptions = DEFAULT_CHUNK_OPTIONS, - root = "/", - filter, - } = options; - - let ftsIndex: Index | null = null; - - async function ensureIndex(): Promise { - if (!ftsIndex) { - ftsIndex = await getOrCreateIndex(indexer); - } - return ftsIndex; - } - - const manager: ContentManager = { - async *sync(): AsyncGenerator { - yield { type: "sync-started" }; - - const filesStore = await getOrCreateStore(registry, "files"); - const contentStore = await getOrCreateStore(registry, "content"); - const chunksStore = await getOrCreateStore(registry, "chunks"); - const ftsStore = await getOrCreateStore(registry, "fts-index"); - const index = await ensureIndex(); - - const stats = { scanned: 0, indexed: 0, removed: 0, errors: 0 }; - - // 1. Scan files - const filesScanner = new FilesScanner(filesStore, { - files, - root, - filter, - skipHash: false, - }); - for await (const event of filesScanner.scan()) { - if (event.type === "entry-processed") stats.scanned++; - if (event.type === "entry-error") stats.errors++; - } - - // 2. Extract content from changed files - const contentLast = await contentStore.getLastScan(); - const extractor = new ContentExtractorScanner(contentStore, { - files, - extractors, - }); - for await (const _event of extractor.scan(cascadeSource(filesStore, contentLast))) { - // consumed — intermediate stage - } - - // 3. Split content into chunks - const chunksLast = await chunksStore.getLastScan(); - const splitter = new ContentSplitterScanner(chunksStore, { - chunkOptions, - }); - for await (const _event of splitter.scan(cascadeSource(contentStore, chunksLast))) { - // consumed — intermediate stage - } - - // 4. Index chunks in FTS - const ftsLast = await ftsStore.getLastScan(); - const ftsIndexer = new ContentFtsIndexerScanner(ftsStore, { index }); - for await (const event of ftsIndexer.scan(cascadeSource(chunksStore, ftsLast))) { - if (event.type === "entry-processed") { - stats.indexed++; - yield { type: "file-indexed", uri: event.uri }; - } else if (event.type === "entry-removed") { - stats.removed++; - yield { type: "file-removed", uri: event.uri }; - } else if (event.type === "entry-error") { - stats.errors++; - yield { type: "file-error", uri: event.uri, error: event.error }; - } - } - - yield { type: "sync-done", stats }; - }, - - async search(params: ContentSearchParams): Promise { - const index = await ensureIndex(); - const { queries, semanticQueries: _semanticQueries, topK = 10, paths, weights } = params; - - const results: HybridSearchResult[] = []; - for await (const r of index.search({ - queries, - topK, - paths: paths as DocumentPath[] | undefined, - weights, - })) { - results.push(r); - } - - return results.map((r) => ({ - blockId: r.blockId, - uri: String(r.path), - content: r.fts?.snippet ?? 
"", - score: r.score, - })); - }, - - async status(): Promise { - let fileCount = 0; - let indexedCount = 0; - - const filesStore = await registry.getStore("files"); - if (filesStore) { - for await (const entry of filesStore.list()) { - if (!entry.removed) fileCount++; - } - } - - const ftsStore = await registry.getStore("fts-index"); - if (ftsStore) { - for await (const entry of ftsStore.list()) { - if (!entry.removed) indexedCount++; - } - } - - return { files: fileCount, indexed: indexedCount }; - }, - - async clear(): Promise { - const names = await registry.getStoreNames(); - for (const name of names) { - await registry.deleteStore(name); - } - if (await indexer.hasIndex("content")) { - await indexer.deleteIndex("content"); - } - ftsIndex = null; - }, - - async close(): Promise { - if (ftsIndex) { - await ftsIndex.close(); - ftsIndex = null; - } - await registry.close(); - }, - }; - - return manager; -} diff --git a/packages/content-manager/src/index.ts b/packages/content-manager/src/index.ts deleted file mode 100644 index 7789fe1..0000000 --- a/packages/content-manager/src/index.ts +++ /dev/null @@ -1,9 +0,0 @@ -export { createContentManager } from "./content-manager.js"; -export type { - ContentManager, - ContentManagerOptions, - ContentSearchParams, - ContentStatus, - SearchHit, - SyncEvent, -} from "./types.js"; diff --git a/packages/content-manager/src/types.ts b/packages/content-manager/src/types.ts deleted file mode 100644 index 48e420e..0000000 --- a/packages/content-manager/src/types.ts +++ /dev/null @@ -1,60 +0,0 @@ -import type { ExtractorRegistry } from "@statewalker/content-extractors"; -import type { ScanRegistry } from "@statewalker/content-scanner"; -import type { EmbedFn, Indexer } from "@statewalker/indexer-api"; -import type { ChunkOptions } from "@statewalker/indexer-chunker"; -import type { FilesApi } from "@statewalker/webrun-files"; - -export interface SearchHit { - blockId: string; - uri: string; - content: string; - score: number; -} - -export interface ContentSearchParams { - queries: string[]; - semanticQueries?: string[]; - topK?: number; - paths?: string[]; - weights?: { fts: number; embedding: number }; -} - -export interface ContentStatus { - files: number; - indexed: number; -} - -export type SyncEvent = - | { type: "sync-started" } - | { type: "file-indexed"; uri: string } - | { type: "file-removed"; uri: string } - | { type: "file-error"; uri: string; error: string } - | { - type: "sync-done"; - stats: { - scanned: number; - indexed: number; - removed: number; - errors: number; - }; - }; - -export interface ContentManagerOptions { - registry: ScanRegistry; - indexer: Indexer; - files: FilesApi; - extractors: ExtractorRegistry; - chunkOptions?: ChunkOptions; - embed?: EmbedFn; - embeddingDimensions?: number; - root?: string; - filter?: (path: string) => boolean; -} - -export interface ContentManager { - sync(): AsyncGenerator; - search(params: ContentSearchParams): Promise; - status(): Promise; - clear(): Promise; - close(): Promise; -} diff --git a/packages/content-manager/src/vendor.d.ts b/packages/content-manager/src/vendor.d.ts deleted file mode 100644 index 9d197b1..0000000 --- a/packages/content-manager/src/vendor.d.ts +++ /dev/null @@ -1,5 +0,0 @@ -declare module "@joplin/turndown-plugin-gfm" { - // biome-ignore lint: minimal type shim for transitive dependency - const gfm: any; - export { gfm }; -} diff --git a/packages/content-pipeline/README.md b/packages/content-pipeline/README.md new file mode 100644 index 0000000..a6a42d0 --- /dev/null +++ 
b/packages/content-pipeline/README.md
@@ -0,0 +1,38 @@
+# @statewalker/content-pipeline
+
+Layered trackers that cascade file-system changes through extract, split, embed, and index stages. Each layer owns a `Store`, pulls entries newer than a per-listener cursor from its upstream store, processes them in batched, paced drains, and notifies downstream listeners when the batch completes.
+
+## Installation
+
+```sh
+pnpm add @statewalker/content-pipeline
+```
+
+## Usage
+
+```ts
+import { createDefaultStores, createPipeline } from "@statewalker/content-pipeline";
+
+const stores = createDefaultStores({ files, prefix: "/.settings/content" });
+const pipeline = createPipeline({
+  files, root, extractors, chunkOptions,
+  ftsIndex, // optional
+  embed, vecIndex, // optional — both or neither
+  stores,
+});
+
+await pipeline.scanFiles();
+await pipeline.catchUpAll();
+```
+
+## API
+
+- `Entry` / `Store` / `Transform` — core types.
+- `runTracker(upstream, own, transform, opts)` — driver.
+- `JsonManifestStore`, `BlobStore` (msgpack + Float32 codecs) — store backends; an optional day-2 `SqlStore` is planned.
+- `scanFiles` — file-tree walker; `extract`, `split`, `embed`, `ftsIndex`, `vecIndex` — concrete transforms.
+- `createPipeline`, `ContentManager` — wiring + public surface.
+
+## Related
+
+- `@statewalker/content-blocks`, `@statewalker/content-extractors`, `@statewalker/indexer-api`, `@statewalker/indexer-chunker`.
diff --git a/packages/content-scanner/package.json b/packages/content-pipeline/package.json
similarity index 74%
rename from packages/content-scanner/package.json
rename to packages/content-pipeline/package.json
index 6b3da66..4bbe6a2 100644
--- a/packages/content-scanner/package.json
+++ b/packages/content-pipeline/package.json
@@ -1,9 +1,9 @@
 {
-  "name": "@statewalker/content-scanner",
+  "name": "@statewalker/content-pipeline",
   "version": "0.1.0",
   "private": false,
   "type": "module",
-  "description": "Content scanner: walks a @statewalker/webrun-files tree and streams @statewalker/content-blocks through extractors into indexers.",
+  "description": "Content pipeline: layered trackers that cascade file-system changes through extract, split, embed, and index stages.",
   "homepage": "https://github.com/statewalker/statewalker-content",
   "author": {
     "name": "Mikhail Kotelnikov",
@@ -38,10 +38,18 @@
     "@statewalker/indexer-chunker": "catalog:",
     "@statewalker/shared-ids": "catalog:",
     "@statewalker/webrun-files": "catalog:",
-    "@statewalker/webrun-msgpack": "workspace:*",
-    "@statewalker/webrun-streams": "workspace:*"
+    "@statewalker/webrun-msgpack": "workspace:*"
+  },
+  "peerDependencies": {
+    "@statewalker/db-api": "workspace:*"
+  },
+  "peerDependenciesMeta": {
+    "@statewalker/db-api": {
+      "optional": true
+    }
   },
   "devDependencies": {
+    "@statewalker/indexer-mem-flexsearch": "catalog:",
     "@statewalker/webrun-files-mem": "catalog:",
     "rimraf": "catalog:",
     "tsdown": "catalog:",
diff --git a/packages/content-pipeline/src/content-manager.ts b/packages/content-pipeline/src/content-manager.ts
new file mode 100644
index 0000000..517d58a
--- /dev/null
+++ b/packages/content-pipeline/src/content-manager.ts
@@ -0,0 +1,223 @@
+import type { ExtractorRegistry } from "@statewalker/content-extractors";
+import type {
+  DocumentPath,
+  HybridSearchResult,
+  Index,
+  Indexer,
+  EmbedFn as IndexerEmbedFn,
+} from "@statewalker/indexer-api";
+import type { ChunkOptions } from "@statewalker/indexer-chunker";
+import type { FilesApi } from "@statewalker/webrun-files";
+import type { Pipeline, PipelineStores } from "./pipeline.js";
+import { createDefaultStores, createPipeline } from "./pipeline.js";
+import type { EmbedFn } from "./transforms/embed.js";
+
+export type SearchHit = {
+  blockId: string;
+  uri: string;
+  content: string;
+  score: number;
+};
+
+export type ContentSearchParams = {
+  queries: string[];
+  semanticQueries?: string[];
+  topK?: number;
+  paths?: string[];
+  weights?: { fts: number; embedding: number };
+};
+
+export type ContentStatus = {
+  files: number;
+  indexed: number;
+};
+
+export type SyncEvent =
+  | { type: "sync-started" }
+  | { type: "file-indexed"; uri: string }
+  | { type: "file-removed"; uri: string }
+  | { type: "file-error"; uri: string; error: string }
+  | {
+      type: "sync-done";
+      stats: { scanned: number; indexed: number; removed: number; errors: number };
+    };
+
+export type ContentManagerOptions = {
+  files: FilesApi;
+  /** Directory prefix for all store state, e.g. `/.settings/content`. */
+  statePrefix: string;
+  extractors: ExtractorRegistry;
+  chunkOptions?: ChunkOptions;
+  indexer: Indexer;
+  embed?: EmbedFn | IndexerEmbedFn;
+  /** Optional precomputed `stores` — overrides the default wiring derived from `statePrefix`. */
+  stores?: PipelineStores;
+  root?: string;
+  filter?: (path: string) => boolean;
+  batchSize?: number;
+  pauseMs?: number;
+};
+
+export type ContentManager = {
+  sync(): AsyncGenerator<SyncEvent>;
+  search(params: ContentSearchParams): Promise<SearchHit[]>;
+  status(): Promise<ContentStatus>;
+  clear(): Promise<void>;
+  close(): Promise<void>;
+};
+
+const DEFAULT_CHUNK_OPTIONS: ChunkOptions = { targetChars: 1500 };
+const INDEX_NAME = "content";
+
+async function getOrCreateIndex(indexer: Indexer): Promise<Index> {
+  const existing = await indexer.getIndex(INDEX_NAME);
+  if (existing) return existing;
+  return indexer.createIndex({ name: INDEX_NAME, fulltext: { language: "en" } });
+}
+
+/**
+ * Build a ContentManager over the new pipeline. Preserves the public surface of
+ * the old `@statewalker/content-manager` so `content-cli` can swap packages
+ * with minimal change.
+ */
+export function createContentManager(options: ContentManagerOptions): ContentManager {
+  const embed = options.embed as EmbedFn | undefined;
+  let index: Index | null = null;
+  let pipeline: Pipeline | null = null;
+
+  async function ensureIndex(): Promise<Index> {
+    if (!index) index = await getOrCreateIndex(options.indexer);
+    return index;
+  }
+
+  async function ensurePipeline(): Promise<Pipeline> {
+    if (pipeline) return pipeline;
+    const idx = await ensureIndex();
+    const stores =
+      options.stores ??
+      createDefaultStores({
+        files: options.files,
+        prefix: options.statePrefix,
+        withFtsIndex: true,
+        withEmbeddings: embed !== undefined,
+        withVecIndex: embed !== undefined,
+      });
+    pipeline = createPipeline({
+      files: options.files,
+      root: options.root ?? "/",
+      filter: options.filter,
+      extractors: options.extractors,
+      chunkOptions: options.chunkOptions ?? DEFAULT_CHUNK_OPTIONS,
+      embed,
+      ftsIndex: idx,
+      vecIndex: embed ? idx : undefined,
+      stores,
+      batchSize: options.batchSize,
+      pauseMs: options.pauseMs,
+    });
+    return pipeline;
+  }
+
+  return {
+    async *sync(): AsyncGenerator<SyncEvent> {
+      yield { type: "sync-started" };
+      const p = await ensurePipeline();
+      const stats = { scanned: 0, indexed: 0, removed: 0, errors: 0 };
+
+      // Count "scanned" = files-store writes produced by this sync.
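+      // ("_mgr_scan" is a listener name private to this manager: store cursors
+      // are just named high-water marks, so the manager can keep its own cursor
+      // on the files store without disturbing the trackers' cursors.)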
+      const scanCursor = await p.stores.files.cursor("_mgr_scan");
+      await p.scanFiles();
+      let newScanCursor = scanCursor;
+      for await (const e of p.stores.files.since(scanCursor, Number.POSITIVE_INFINITY)) {
+        stats.scanned += 1;
+        if (e.stamp > newScanCursor) newScanCursor = e.stamp;
+      }
+      if (newScanCursor !== scanCursor) {
+        await p.stores.files.advance("_mgr_scan", newScanCursor);
+      }
+
+      // Translate fts-receipt deltas to SyncEvents.
+      const fts = p.stores.fts;
+      const syncCursor = fts ? await fts.cursor("_mgr_sync") : 0;
+      await p.catchUpAll();
+      if (fts) {
+        let newSyncCursor = syncCursor;
+        for await (const e of fts.since(syncCursor, Number.POSITIVE_INFINITY)) {
+          if (e.tombstone) {
+            stats.removed += 1;
+            yield { type: "file-removed", uri: e.uri };
+          } else if (e.meta && "error" in e.meta) {
+            stats.errors += 1;
+            yield {
+              type: "file-error",
+              uri: e.uri,
+              error: String((e.meta as { error?: unknown }).error),
+            };
+          } else {
+            stats.indexed += 1;
+            yield { type: "file-indexed", uri: e.uri };
+          }
+          if (e.stamp > newSyncCursor) newSyncCursor = e.stamp;
+        }
+        if (newSyncCursor !== syncCursor) {
+          await fts.advance("_mgr_sync", newSyncCursor);
+        }
+      }
+
+      yield { type: "sync-done", stats };
+    },
+
+    async search(params: ContentSearchParams): Promise<SearchHit[]> {
+      const idx = await ensureIndex();
+      const results: HybridSearchResult[] = [];
+      for await (const r of idx.search({
+        queries: params.queries,
+        topK: params.topK ?? 10,
+        paths: params.paths as DocumentPath[] | undefined,
+        weights: params.weights,
+      })) {
+        results.push(r);
+      }
+      return results.map((r) => ({
+        blockId: r.blockId,
+        uri: String(r.path),
+        content: r.fts?.snippet ?? "",
+        score: r.score,
+      }));
+    },
+
+    async status(): Promise<ContentStatus> {
+      const p = await ensurePipeline();
+      let files = 0;
+      for await (const e of p.stores.files.since(0, Number.POSITIVE_INFINITY)) {
+        if (!e.tombstone) files += 1;
+      }
+      let indexed = 0;
+      if (p.stores.fts) {
+        for await (const e of p.stores.fts.since(0, Number.POSITIVE_INFINITY)) {
+          if (!e.tombstone) indexed += 1;
+        }
+      }
+      return { files, indexed };
+    },
+
+    async clear(): Promise<void> {
+      if (pipeline) await pipeline.close();
+      if (await options.indexer.hasIndex(INDEX_NAME)) {
+        await options.indexer.deleteIndex(INDEX_NAME);
+      }
+      // Remove all store state under the prefix.
+      if (await options.files.exists(options.statePrefix)) {
+        await options.files.remove(options.statePrefix);
+      }
+      pipeline = null;
+      index = null;
+    },
+
+    async close(): Promise<void> {
+      if (pipeline) await pipeline.close();
+      pipeline = null;
+      index = null;
+    },
+  };
+}
diff --git a/packages/content-pipeline/src/files-tracker.ts b/packages/content-pipeline/src/files-tracker.ts
new file mode 100644
index 0000000..019a85a
--- /dev/null
+++ b/packages/content-pipeline/src/files-tracker.ts
@@ -0,0 +1,63 @@
+import { sha1Bytes } from "@statewalker/shared-ids";
+import type { FilesApi } from "@statewalker/webrun-files";
+import { readFile } from "@statewalker/webrun-files";
+import type { Store } from "./store.js";
+import type { FileEntry } from "./types.js";
+
+export type ScanFilesOptions = {
+  /** Exclude paths before any I/O by returning `false`. */
+  filter?: (path: string) => boolean;
+};
+
+/**
+ * Walk `root` on `files`, comparing each file against the store's last recorded
+ * entry. Writes changed files, tombstones disappeared files. No-op for unchanged
+ * files (matching size + mtime + hash). Callers trigger this; periodic scheduling
+ * lives in the caller, not here.
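+ *
+ * A minimal one-shot sketch (assumes a files-layer store built elsewhere, e.g.
+ * by `createDefaultStores`, and any `FilesApi` implementation):
+ *
+ * ```ts
+ * const stores = createDefaultStores({ files, prefix: "/.settings/content" });
+ * await scanFiles(files, "/", stores.files, {
+ *   filter: (path) => !path.startsWith("/.settings/"),
+ * });
+ * // changed files are now FileEntry rows in stores.files; trackers subscribed
+ * // via onStampUpdate pick them up from here
+ * ```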
+ */
+export async function scanFiles(
+  files: FilesApi,
+  root: string,
+  own: Store<FileEntry>,
+  options?: ScanFilesOptions,
+): Promise<void> {
+  const filter = options?.filter;
+  const seen = new Set<string>();
+  const writes: { uri: string; meta: { size: number; mtime: number; hash: string } }[] = [];
+
+  for await (const info of files.list(root, { recursive: true })) {
+    if (info.kind !== "file") continue;
+    if (filter && !filter(info.path)) continue;
+    seen.add(info.path);
+
+    const size = info.size ?? 0;
+    const mtime = info.lastModified ?? 0;
+    const prev = await own.get(info.path);
+
+    if (
+      prev &&
+      !prev.tombstone &&
+      prev.meta &&
+      prev.meta.size === size &&
+      prev.meta.mtime === mtime
+    ) {
+      continue;
+    }
+
+    const hash = await sha1Bytes((await readFile(files, info.path)) as Uint8Array);
+    if (prev && !prev.tombstone && prev.meta && prev.meta.hash === hash) continue;
+
+    writes.push({ uri: info.path, meta: { size, mtime, hash } });
+  }
+
+  // Tombstone entries that exist in the store but were not seen this pass.
+  const tombstones: { uri: string; tombstone: true }[] = [];
+  for await (const entry of own.since(0, Number.POSITIVE_INFINITY)) {
+    if (entry.tombstone) continue;
+    if (!seen.has(entry.uri)) tombstones.push({ uri: entry.uri, tombstone: true });
+  }
+
+  if (writes.length > 0 || tombstones.length > 0) {
+    await own.put([...writes, ...tombstones]);
+  }
+}
diff --git a/packages/content-pipeline/src/index.ts b/packages/content-pipeline/src/index.ts
new file mode 100644
index 0000000..d271b21
--- /dev/null
+++ b/packages/content-pipeline/src/index.ts
@@ -0,0 +1,17 @@
+export * from "./content-manager.js";
+export * from "./files-tracker.js";
+export * from "./pipeline.js";
+export * from "./store.js";
+export { BlobStore, type BlobStoreOptions } from "./stores/blob.js";
+export type { BlobCodec } from "./stores/codec.js";
+export { float32Codec } from "./stores/codec-float32.js";
+export { msgpackCodec } from "./stores/codec-msgpack.js";
+export { JsonManifestStore, type JsonManifestStoreOptions } from "./stores/json-manifest.js";
+export { createStampAllocator, type StampAllocator } from "./stores/stamp.js";
+export * from "./tracker.js";
+export { type EmbedFn, embed } from "./transforms/embed.js";
+export { extract } from "./transforms/extract.js";
+export { ftsIndex, ftsIndexRemove } from "./transforms/fts-index.js";
+export { split } from "./transforms/split.js";
+export { vecIndex, vecIndexRemove } from "./transforms/vec-index.js";
+export * from "./types.js";
diff --git a/packages/content-pipeline/src/pipeline.ts b/packages/content-pipeline/src/pipeline.ts
new file mode 100644
index 0000000..e44b977
--- /dev/null
+++ b/packages/content-pipeline/src/pipeline.ts
@@ -0,0 +1,202 @@
+import type { ExtractorRegistry } from "@statewalker/content-extractors";
+import type { Index } from "@statewalker/indexer-api";
+import type { ChunkOptions } from "@statewalker/indexer-chunker";
+import type { FilesApi } from "@statewalker/webrun-files";
+import type { ScanFilesOptions } from "./files-tracker.js";
+import { scanFiles } from "./files-tracker.js";
+import type { Store } from "./store.js";
+import { BlobStore } from "./stores/blob.js";
+import { float32Codec } from "./stores/codec-float32.js";
+import { msgpackCodec } from "./stores/codec-msgpack.js";
+import { JsonManifestStore } from "./stores/json-manifest.js";
+import type { Tracker } from "./tracker.js";
+import { runTracker } from "./tracker.js";
+import { type EmbedFn, embed as embedT } from "./transforms/embed.js";
+import { extract as extractT } from "./transforms/extract.js";
+import { ftsIndexRemove, ftsIndex as ftsIndexT } from "./transforms/fts-index.js";
+import { split as splitT } from "./transforms/split.js";
+import { vecIndexRemove, vecIndex as vecIndexT } from "./transforms/vec-index.js";
+import type { ChunksEntry, ContentEntry, FileEntry, ReceiptEntry, VecsEntry } from "./types.js";
+
+export type PipelineStores = {
+  files: Store<FileEntry>;
+  content: Store<ContentEntry>;
+  chunks: Store<ChunksEntry>;
+  embeddings?: Store<VecsEntry>;
+  fts?: Store<ReceiptEntry>;
+  vec?: Store<ReceiptEntry>;
+};
+
+export type CreatePipelineOptions = {
+  files: FilesApi;
+  root: string;
+  filter?: (path: string) => boolean;
+  extractors: ExtractorRegistry;
+  chunkOptions: ChunkOptions;
+  /** Enables the embed tracker and (with `vecIndex`) the vec tracker. */
+  embed?: EmbedFn;
+  ftsIndex?: Index;
+  vecIndex?: Index;
+  stores: PipelineStores;
+  batchSize?: number;
+  pauseMs?: number;
+  signal?: AbortSignal;
+};
+
+export type Pipeline = {
+  stores: PipelineStores;
+  scanFiles(scanOpts?: ScanFilesOptions): Promise<void>;
+  catchUpAll(): Promise<void>;
+  close(): Promise<void>;
+};
+
+/**
+ * Wire concrete trackers — each a `runTracker` over a transform — from an
+ * upstream store into its downstream store. Trackers subscribe to their upstream
+ * store's `onStampUpdate`, so a `scanFiles()` write kicks the full cascade.
+ */
+export function createPipeline(opts: CreatePipelineOptions): Pipeline {
+  const batchSize = opts.batchSize ?? 50;
+  const pauseMs = opts.pauseMs ?? 10;
+  const signal = opts.signal;
+
+  const trackers: Tracker[] = [];
+
+  const extractTracker = runTracker(
+    opts.stores.files,
+    opts.stores.content,
+    extractT(opts.files, opts.extractors),
+    { name: "extract", batchSize, pauseMs, signal },
+  );
+  trackers.push(extractTracker);
+
+  const splitTracker = runTracker(
+    opts.stores.content,
+    opts.stores.chunks,
+    splitT(opts.chunkOptions),
+    { name: "split", batchSize, pauseMs, signal },
+  );
+  trackers.push(splitTracker);
+
+  const ftsIndex = opts.ftsIndex;
+  const ftsStore = opts.stores.fts;
+  const ftsTracker =
+    ftsIndex && ftsStore
+      ? runTracker(opts.stores.chunks, ftsStore, ftsIndexT(ftsIndex), {
+          name: "fts",
+          batchSize,
+          pauseMs,
+          signal,
+          onRemove: (uri) => ftsIndexRemove(ftsIndex, uri),
+        })
+      : null;
+  if (ftsTracker) trackers.push(ftsTracker);
+
+  const embedFn = opts.embed;
+  const embedStore = opts.stores.embeddings;
+  const embedTracker =
+    embedFn && embedStore
+      ? runTracker(opts.stores.chunks, embedStore, embedT(embedFn), {
+          name: "embed",
+          batchSize,
+          pauseMs,
+          signal,
+        })
+      : null;
+  if (embedTracker) trackers.push(embedTracker);
+
+  const vecIndex = opts.vecIndex;
+  const vecStore = opts.stores.vec;
+  const vecTracker =
+    vecIndex && vecStore && embedStore
+      ? runTracker(embedStore, vecStore, vecIndexT(vecIndex), {
+          name: "vec",
+          batchSize,
+          pauseMs,
+          signal,
+          onRemove: (uri) => vecIndexRemove(vecIndex, uri),
+        })
+      : null;
+  if (vecTracker) trackers.push(vecTracker);
+
+  // Order matches dependency chain so that a cold-start catch-up fills the
+  // intermediate stores before the downstream trackers run.
+  const ordered: Tracker[] = [
+    extractTracker,
+    splitTracker,
+    ...(ftsTracker ? [ftsTracker] : []),
+    ...(embedTracker ? [embedTracker] : []),
+    ...(vecTracker ? [vecTracker] : []),
+  ];
+
+  return {
+    stores: opts.stores,
+    async scanFiles(scanOpts) {
+      await scanFiles(opts.files, opts.root, opts.stores.files, scanOpts ?? { filter: opts.filter });
+    },
+    async catchUpAll() {
+      for (const t of ordered) await t.catchUp();
+    },
+    async close() {
+      for (const t of trackers) await t.close();
+    },
+  };
+}
+
+/**
+ * Default store wiring: JSON manifest for `files`, `fts`, `vec`;
+ * BlobStore for `content`, `chunks` (msgpack codec), `embeddings` (float32 codec).
+ */
+export function createDefaultStores(params: {
+  files: FilesApi;
+  prefix: string;
+  withEmbeddings?: boolean;
+  withVecIndex?: boolean;
+  withFtsIndex?: boolean;
+}): PipelineStores {
+  const { files, prefix } = params;
+  const filesStore = new JsonManifestStore<FileEntry>({
+    files,
+    prefix: `${prefix}/files`,
+  });
+  const contentStore = new BlobStore<ContentEntry>({
+    files,
+    prefix: `${prefix}/content`,
+    codec: msgpackCodec(),
+  });
+  const chunksStore = new BlobStore<ChunksEntry>({
+    files,
+    prefix: `${prefix}/chunks`,
+    codec: msgpackCodec(),
+  });
+  const stores: PipelineStores = {
+    files: filesStore,
+    content: contentStore,
+    chunks: chunksStore,
+  };
+  if (params.withFtsIndex ?? true) {
+    stores.fts = new JsonManifestStore<ReceiptEntry>({
+      files,
+      prefix: `${prefix}/fts`,
+    });
+  }
+  if (params.withEmbeddings) {
+    stores.embeddings = new BlobStore<VecsEntry>({
+      files,
+      prefix: `${prefix}/embeddings`,
+      codec: float32Codec(),
+    });
+  }
+  if (params.withVecIndex) {
+    stores.vec = new JsonManifestStore<ReceiptEntry>({
+      files,
+      prefix: `${prefix}/vec`,
+    });
+  }
+  return stores;
+}
diff --git a/packages/content-pipeline/src/store.ts b/packages/content-pipeline/src/store.ts
new file mode 100644
index 0000000..41bbca1
--- /dev/null
+++ b/packages/content-pipeline/src/store.ts
@@ -0,0 +1,40 @@
+import type { Entry } from "./types.js";
+
+/**
+ * Payload written to a store. The stamp is allocated by the store, not the caller.
+ * Tombstones and live entries share this shape.
+ */
+export type StoreWrite<E extends Entry> = Omit<E, "stamp">;
+
+export type StampListener = (stamp: number) => void;
+export type Unsubscribe = () => void;
+
+/** A per-URI manifest of entries with monotonic stamps and per-listener cursors. */
+export interface Store<E extends Entry = Entry> {
+  /** Lookup by URI. Returns the latest entry (live or tombstone) or undefined. */
+  get(uri: string): Promise<E | undefined>;
+
+  /**
+   * Write entries. Each entry receives a fresh monotonic stamp, even within a single batch.
+   * Listeners are notified once with the new highest stamp after the batch is committed.
+   */
+  put(entries: StoreWrite<E>[]): Promise<void>;
+
+  /** Yield entries with `stamp > cursor`, ordered by stamp ascending, up to `limit` items. */
+  since(cursor: number, limit: number): AsyncGenerator<E>;
+
+  /** Get the persisted cursor for the given listener name. Defaults to 0 if unknown. */
+  cursor(name: string): Promise<number>;
+
+  /** Persist a new cursor for the named listener. */
+  advance(name: string, stamp: number): Promise<void>;
+
+  /** Subscribe to stamp-advancement notifications. Returns an unsubscribe function. */
+  onStampUpdate(listener: StampListener): Unsubscribe;
+
+  /** Release any resources held by the store. Idempotent. */
+  close(): Promise<void>;
+}
+
+export type { StampAllocator } from "./stores/stamp.js";
+export { createStampAllocator } from "./stores/stamp.js";
diff --git a/packages/content-pipeline/src/stores/blob.ts b/packages/content-pipeline/src/stores/blob.ts
new file mode 100644
index 0000000..54205c1
--- /dev/null
+++ b/packages/content-pipeline/src/stores/blob.ts
@@ -0,0 +1,172 @@
+import { sha1Uuid } from "@statewalker/shared-ids";
+import type { FilesApi } from "@statewalker/webrun-files";
+import { readFile, readText, writeText } from "@statewalker/webrun-files";
+import type { StampListener, Store, StoreWrite, Unsubscribe } from "../store.js";
+import type { Entry } from "../types.js";
+import type { BlobCodec } from "./codec.js";
+import { createStampAllocator, type StampAllocator } from "./stamp.js";
+
+type ManifestEntry = { stamp: number; tombstone?: true };
+
+type Manifest = {
+  counter: number;
+  entries: Record<string, ManifestEntry>;
+  cursors: Record<string, number>;
+};
+
+export type BlobStoreOptions<M> = {
+  /** FilesApi used for both the manifest and per-URI blobs. */
+  files: FilesApi;
+  /** Directory prefix — manifest at `{prefix}/manifest.json`, blobs at `{prefix}/{dd}/{hash}.bin`. */
+  prefix: string;
+  /** Codec for the meta payload. */
+  codec: BlobCodec<M>;
+};
+
+/**
+ * Meta-in-blob store. Manifest holds only `{uri → {stamp, tombstone?}}` (keeping
+ * listing cheap); each URI's meta is persisted as one blob file per URI via
+ * `BlobCodec`. Suitable for payload-heavy layers (extract, chunks, embeddings).
+ */
+export class BlobStore<E extends Entry> implements Store<E> {
+  private readonly files: FilesApi;
+  private readonly prefix: string;
+  private readonly codec: BlobCodec<E extends Entry<infer M> ? M : never>;
+  private readonly listeners = new Set<StampListener>();
+  private readonly stamps: StampAllocator = createStampAllocator();
+  private manifest: Manifest | null = null;
+  private loadPromise: Promise<Manifest> | null = null;
+
+  constructor(options: BlobStoreOptions<E extends Entry<infer M> ? M : never>) {
+    this.files = options.files;
+    this.prefix = options.prefix;
+    this.codec = options.codec;
+  }
+
+  private get manifestPath(): string {
+    return `${this.prefix}/manifest.json`;
+  }
+
+  private async blobPath(uri: string): Promise<string> {
+    const hash = await sha1Uuid(uri);
+    const dd = hash.slice(0, 2);
+    return `${this.prefix}/${dd}/${hash}.bin`;
+  }
+
+  private async load(): Promise<Manifest> {
+    if (this.manifest) return this.manifest;
+    if (this.loadPromise) return this.loadPromise;
+    this.loadPromise = (async () => {
+      if (await this.files.exists(this.manifestPath)) {
+        const text = await readText(this.files, this.manifestPath);
+        if (text) {
+          const parsed = JSON.parse(text) as Manifest;
+          this.stamps.seed(parsed.counter);
+          this.manifest = parsed;
+          return parsed;
+        }
+      }
+      const empty: Manifest = { counter: 0, entries: {}, cursors: {} };
+      this.manifest = empty;
+      return empty;
+    })();
+    try {
+      return await this.loadPromise;
+    } finally {
+      this.loadPromise = null;
+    }
+  }
+
+  private async save(): Promise<void> {
+    if (!this.manifest) return;
+    this.manifest.counter = this.stamps.current();
+    await writeText(this.files, this.manifestPath, JSON.stringify(this.manifest));
+  }
+
+  async get(uri: string): Promise<E | undefined> {
+    const m = await this.load();
+    const persisted = m.entries[uri];
+    if (!persisted) return undefined;
+    return this.materialise(uri, persisted);
+  }
+
+  async put(writes: StoreWrite<E>[]): Promise<void> {
+    if (writes.length === 0) return;
+    const m = await this.load();
+    let top = 0;
+    for (const w of writes) {
+      const stamp = this.stamps.next();
+      top = stamp;
+      const blobP = await this.blobPath(w.uri);
+      if (w.tombstone) {
+        m.entries[w.uri] = { stamp, tombstone: true };
+        if (await this.files.exists(blobP)) await this.files.remove(blobP);
+      } else if (w.meta !== undefined) {
+        const payload = await this.codec.encode(w.meta as E extends Entry<infer M> ? M : never);
+        await writeBytes(this.files, blobP, payload);
+        m.entries[w.uri] = { stamp };
+      } else {
+        // Live entry with no meta — rare but legal. Write the manifest row; no blob.
+        m.entries[w.uri] = { stamp };
+        if (await this.files.exists(blobP)) await this.files.remove(blobP);
+      }
+    }
+    await this.save();
+    for (const listener of this.listeners) listener(top);
+  }
+
+  async *since(cursor: number, limit: number): AsyncGenerator<E> {
+    const m = await this.load();
+    const sorted = Object.entries(m.entries)
+      .filter(([, e]) => e.stamp > cursor)
+      .sort((a, b) => a[1].stamp - b[1].stamp)
+      .slice(0, limit);
+    for (const [uri, persisted] of sorted) {
+      yield await this.materialise(uri, persisted);
+    }
+  }
+
+  async cursor(name: string): Promise<number> {
+    const m = await this.load();
+    return m.cursors[name] ?? 0;
+  }
+
+  async advance(name: string, stamp: number): Promise<void> {
+    const m = await this.load();
+    m.cursors[name] = stamp;
+    await this.save();
+  }
+
+  onStampUpdate(listener: StampListener): Unsubscribe {
+    this.listeners.add(listener);
+    return () => this.listeners.delete(listener);
+  }
+
+  async close(): Promise<void> {
+    this.listeners.clear();
+  }
+
+  private async materialise(uri: string, persisted: ManifestEntry): Promise<E> {
+    const entry: Entry = { uri, stamp: persisted.stamp };
+    if (persisted.tombstone) {
+      (entry as { tombstone?: true }).tombstone = true;
+      return entry as E;
+    }
+    const blobP = await this.blobPath(uri);
+    if (await this.files.exists(blobP)) {
+      const bytes = await readFile(this.files, blobP);
+      const meta = await this.codec.decode(bytes);
+      entry.meta = meta as Record<string, unknown>;
+    }
+    return entry as E;
+  }
+}
+
+async function writeBytes(files: FilesApi, path: string, bytes: Uint8Array): Promise<void> {
+  await files.write(
+    path,
+    (async function* () {
+      yield bytes;
+    })(),
+  );
+}
diff --git a/packages/content-pipeline/src/stores/codec-float32.ts b/packages/content-pipeline/src/stores/codec-float32.ts
new file mode 100644
index 0000000..e6ee0f5
--- /dev/null
+++ b/packages/content-pipeline/src/stores/codec-float32.ts
@@ -0,0 +1,65 @@
+import type { VecsMeta } from "../types.js";
+import type { BlobCodec } from "./codec.js";
+
+/**
+ * Raw Float32 codec for embeddings meta. Format: `[count: u32 LE][dim: u32 LE][flat Float32 bytes]`.
+ * All vectors must share one `dim`. Decoding wraps the buffer in a single Float32Array and
+ * slices into N views — zero per-array framing, zero element copies.
+ */
+export function float32Codec(): BlobCodec<VecsMeta> {
+  return {
+    encode(meta: VecsMeta): Uint8Array {
+      const vecs = meta.vecs;
+      if (vecs.length === 0) {
+        const empty = new ArrayBuffer(8);
+        new DataView(empty).setUint32(0, 0, true);
+        new DataView(empty).setUint32(4, 0, true);
+        return new Uint8Array(empty);
+      }
+      const dim = vecs[0]?.length ?? 0;
+      for (let i = 1; i < vecs.length; i++) {
+        if (vecs[i]?.length !== dim) {
+          throw new Error(
+            `float32 codec: vectors must share dimension, got ${vecs[i]?.length} vs ${dim} at index ${i}`,
+          );
+        }
+      }
+      const header = 8;
+      const bodyBytes = vecs.length * dim * 4;
+      const out = new Uint8Array(header + bodyBytes);
+      const view = new DataView(out.buffer);
+      view.setUint32(0, vecs.length, true);
+      view.setUint32(4, dim, true);
+      let off = header;
+      for (const v of vecs) {
+        out.set(new Uint8Array(v.buffer, v.byteOffset, v.byteLength), off);
+        off += dim * 4;
+      }
+      return out;
+    },
+    decode(bytes: Uint8Array): VecsMeta {
+      if (bytes.length < 8) {
+        throw new Error("float32 codec: blob is shorter than the 8-byte header");
+      }
+      const view = new DataView(bytes.buffer, bytes.byteOffset, bytes.byteLength);
+      const count = view.getUint32(0, true);
+      const dim = view.getUint32(4, true);
+      const expected = 8 + count * dim * 4;
+      if (bytes.length !== expected) {
+        throw new Error(
+          `float32 codec: blob size ${bytes.length} does not match header (count=${count}, dim=${dim}, expected=${expected})`,
+        );
+      }
+      const vecs: Float32Array[] = [];
+      // Copy the body into its own ArrayBuffer so Float32Array alignment is guaranteed.
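+      // (A zero-copy Float32Array view over `bytes.buffer` would throw whenever
+      // `byteOffset + 8` is not a multiple of 4, so one bulk copy is the safer default.)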
+      const body = new Uint8Array(
+        bytes.buffer.slice(bytes.byteOffset + 8, bytes.byteOffset + expected),
+      );
+      const base = new Float32Array(body.buffer);
+      for (let i = 0; i < count; i++) {
+        vecs.push(base.subarray(i * dim, (i + 1) * dim));
+      }
+      return { vecs };
+    },
+  };
+}
diff --git a/packages/content-pipeline/src/stores/codec-msgpack.ts b/packages/content-pipeline/src/stores/codec-msgpack.ts
new file mode 100644
index 0000000..4fd9654
--- /dev/null
+++ b/packages/content-pipeline/src/stores/codec-msgpack.ts
@@ -0,0 +1,40 @@
+import { decodeMsgpack, encodeMsgpack } from "@statewalker/webrun-msgpack";
+import type { BlobCodec } from "./codec.js";
+
+async function* singleton<T>(value: T): AsyncGenerator<T> {
+  yield value;
+}
+
+async function concat(chunks: Uint8Array[]): Promise<Uint8Array> {
+  const total = chunks.reduce((s, c) => s + c.length, 0);
+  const out = new Uint8Array(total);
+  let off = 0;
+  for (const c of chunks) {
+    out.set(c, off);
+    off += c.length;
+  }
+  return out;
+}
+
+async function* emitOnce(bytes: Uint8Array): AsyncGenerator<Uint8Array> {
+  yield bytes;
+}
+
+/**
+ * Default blob codec. Writes the meta as a single length-prefixed msgpack frame
+ * and decodes it back by reading one frame. Handles any JSON-serialisable shape
+ * (text, chunk lists, arbitrary Record).
+ */
+export function msgpackCodec<M>(): BlobCodec<M> {
+  return {
+    async encode(meta: M): Promise<Uint8Array> {
+      const frames: Uint8Array[] = [];
+      for await (const f of encodeMsgpack(singleton(meta))) frames.push(f);
+      return concat(frames);
+    },
+    async decode(bytes: Uint8Array): Promise<M> {
+      for await (const v of decodeMsgpack(emitOnce(bytes))) return v as M;
+      throw new Error("msgpack codec: blob is empty");
+    },
+  };
+}
diff --git a/packages/content-pipeline/src/stores/codec.ts b/packages/content-pipeline/src/stores/codec.ts
new file mode 100644
index 0000000..8e74aa4
--- /dev/null
+++ b/packages/content-pipeline/src/stores/codec.ts
@@ -0,0 +1,9 @@
+/**
+ * Binary codec for a layer's meta payload. Sync or async — BlobStore always awaits.
+ * Separate from Store so one layer can swap codecs (msgpack → raw Float32 for embeddings)
+ * without reimplementing persistence.
+ */
+export interface BlobCodec<M> {
+  encode(meta: M): Uint8Array | Promise<Uint8Array>;
+  decode(bytes: Uint8Array): M | Promise<M>;
+}
diff --git a/packages/content-pipeline/src/stores/json-manifest.ts b/packages/content-pipeline/src/stores/json-manifest.ts
new file mode 100644
index 0000000..a3681e8
--- /dev/null
+++ b/packages/content-pipeline/src/stores/json-manifest.ts
@@ -0,0 +1,135 @@
+import type { FilesApi } from "@statewalker/webrun-files";
+import { readText, writeText } from "@statewalker/webrun-files";
+import type { StampListener, Store, StoreWrite, Unsubscribe } from "../store.js";
+import type { Entry } from "../types.js";
+import { createStampAllocator, type StampAllocator } from "./stamp.js";
+
+type PersistedEntry<E extends Entry> = {
+  uri: string;
+  stamp: number;
+  tombstone?: true;
+  meta?: E extends Entry<infer M> ? M : never;
+};
+
+type Manifest<E extends Entry> = {
+  counter: number;
+  entries: Record<string, PersistedEntry<E>>;
+  cursors: Record<string, number>;
+};
+
+export type JsonManifestStoreOptions = {
+  /** FilesApi the manifest is persisted on. */
+  files: FilesApi;
+  /** Directory prefix — manifest lives at `{prefix}/manifest.json`. */
+  prefix: string;
+};
+
+/**
+ * Single-file JSON store. Fits small-meta layers (files, receipts); not suitable
+ * for heavy payloads — every write rewrites the whole manifest.
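+ *
+ * Rough cost sketch (assumption: ~100 bytes of JSON per entry): a manifest
+ * tracking 10k URIs is ~1 MB, rewritten on every `put` and `advance` — fine for
+ * the files and receipt layers, and exactly why payload-heavy layers get `BlobStore`.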
+ */
+export class JsonManifestStore<E extends Entry> implements Store<E> {
+  private readonly files: FilesApi;
+  private readonly prefix: string;
+  private readonly listeners = new Set<StampListener>();
+  private readonly stamps: StampAllocator = createStampAllocator();
+  private manifest: Manifest<E> | null = null;
+  private loadPromise: Promise<Manifest<E>> | null = null;
+
+  constructor(options: JsonManifestStoreOptions) {
+    this.files = options.files;
+    this.prefix = options.prefix;
+  }
+
+  private get manifestPath(): string {
+    return `${this.prefix}/manifest.json`;
+  }
+
+  private async load(): Promise<Manifest<E>> {
+    if (this.manifest) return this.manifest;
+    if (this.loadPromise) return this.loadPromise;
+    this.loadPromise = (async () => {
+      if (await this.files.exists(this.manifestPath)) {
+        const text = await readText(this.files, this.manifestPath);
+        if (text) {
+          const parsed = JSON.parse(text) as Manifest<E>;
+          this.stamps.seed(parsed.counter);
+          this.manifest = parsed;
+          return parsed;
+        }
+      }
+      const empty: Manifest<E> = { counter: 0, entries: {}, cursors: {} };
+      this.manifest = empty;
+      return empty;
+    })();
+    try {
+      return await this.loadPromise;
+    } finally {
+      this.loadPromise = null;
+    }
+  }
+
+  private async save(): Promise<void> {
+    if (!this.manifest) return;
+    this.manifest.counter = this.stamps.current();
+    await writeText(this.files, this.manifestPath, JSON.stringify(this.manifest));
+  }
+
+  async get(uri: string): Promise<E | undefined> {
+    const m = await this.load();
+    const persisted = m.entries[uri];
+    return persisted ? toEntry(persisted) : undefined;
+  }
+
+  async put(writes: StoreWrite<E>[]): Promise<void> {
+    if (writes.length === 0) return;
+    const m = await this.load();
+    let top = 0;
+    for (const w of writes) {
+      const stamp = this.stamps.next();
+      top = stamp;
+      const persisted: PersistedEntry<E> = { uri: w.uri, stamp };
+      if (w.tombstone) persisted.tombstone = true;
+      if (w.meta !== undefined) persisted.meta = w.meta as PersistedEntry<E>["meta"];
+      m.entries[w.uri] = persisted;
+    }
+    await this.save();
+    for (const listener of this.listeners) listener(top);
+  }
+
+  async *since(cursor: number, limit: number): AsyncGenerator<E> {
+    const m = await this.load();
+    const sorted = Object.values(m.entries)
+      .filter((e) => e.stamp > cursor)
+      .sort((a, b) => a.stamp - b.stamp)
+      .slice(0, limit);
+    for (const e of sorted) yield toEntry(e);
+  }
+
+  async cursor(name: string): Promise<number> {
+    const m = await this.load();
+    return m.cursors[name] ?? 0;
+  }
+
+  async advance(name: string, stamp: number): Promise<void> {
+    const m = await this.load();
+    m.cursors[name] = stamp;
+    await this.save();
+  }
+
+  onStampUpdate(listener: StampListener): Unsubscribe {
+    this.listeners.add(listener);
+    return () => this.listeners.delete(listener);
+  }
+
+  async close(): Promise<void> {
+    this.listeners.clear();
+  }
+}
+
+function toEntry<E extends Entry>(p: PersistedEntry<E>): E {
+  const entry: Entry = { uri: p.uri, stamp: p.stamp };
+  if (p.tombstone) (entry as { tombstone?: true }).tombstone = true;
+  if (p.meta !== undefined) entry.meta = p.meta;
+  return entry as E;
+}
diff --git a/packages/content-pipeline/src/stores/stamp.ts b/packages/content-pipeline/src/stores/stamp.ts
new file mode 100644
index 0000000..d5c0117
--- /dev/null
+++ b/packages/content-pipeline/src/stores/stamp.ts
@@ -0,0 +1,32 @@
+/**
+ * Monotonic integer stamp allocator. Seeded from `Date.now()` on first use; restart-safe
+ * when the store persists `current()` and re-seeds via `seed()` on reload.
+ * Every call to `next()` returns a strictly larger integer than the previous call.
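+ *
+ * Usage sketch (nothing assumed beyond this file):
+ *
+ * ```ts
+ * const stamps = createStampAllocator();
+ * const a = stamps.next();
+ * const b = stamps.next(); // b > a, even within the same millisecond
+ * // persist stamps.current() alongside the data; after reload:
+ * // stamps.seed(persistedCounter);
+ * ```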
+ */
+export interface StampAllocator {
+  /** Returns the next stamp (strictly > previously returned values). */
+  next(): number;
+  /** Returns the most recently returned stamp without advancing. */
+  current(): number;
+  /**
+   * Re-seed after loading a persisted counter. Future `next()` calls return at least
+   * `max(value + 1, Date.now())`, so a stale clock never regresses the counter.
+   */
+  seed(value: number): void;
+}
+
+/** Default `StampAllocator` — pure, no I/O; caller persists `current()`. */
+export function createStampAllocator(initial = 0): StampAllocator {
+  let counter = Math.max(initial, Date.now());
+  return {
+    next: () => {
+      const now = Date.now();
+      counter = Math.max(counter + 1, now);
+      return counter;
+    },
+    current: () => counter,
+    seed: (value: number) => {
+      counter = Math.max(value, Date.now() - 1);
+    },
+  };
+}
diff --git a/packages/content-pipeline/src/tracker.ts b/packages/content-pipeline/src/tracker.ts
new file mode 100644
index 0000000..3ac9967
--- /dev/null
+++ b/packages/content-pipeline/src/tracker.ts
@@ -0,0 +1,137 @@
+import type { Store, StoreWrite } from "./store.js";
+import type { Entry, Transform } from "./types.js";
+
+export type RunTrackerOptions = {
+  /** Listener name — identifies this tracker's cursor on the upstream store. */
+  name: string;
+  /** Entries per batch; a sleep is inserted between batches. Default: 50. */
+  batchSize?: number;
+  /** Milliseconds to sleep between batches, yielding to the event loop. Default: 10. */
+  pauseMs?: number;
+  /** Aborts the drain at the next batch boundary without committing a partial batch. */
+  signal?: AbortSignal;
+  /**
+   * Side-effect hook invoked once per upstream tombstone, before the tombstone is
+   * written downstream. Indexer trackers use this to delete documents from the
+   * search index. The transform function is still NOT called for tombstones.
+   */
+  onRemove?: (uri: string) => Promise<void> | void;
+};
+
+export type Tracker = {
+  /** Drain all upstream entries newer than the persisted cursor. */
+  catchUp(): Promise<number>;
+  /** Request a drain; coalesces with any in-progress one. */
+  kick(): void;
+  /** Unsubscribe from upstream notifications. Does not close stores. */
+  close(): Promise<void>;
+};
+
+const sleep = (ms: number): Promise<void> => new Promise((resolve) => setTimeout(resolve, ms));
+
+/**
+ * Drive a single layer. Subscribes to `upstream.onStampUpdate` and drains
+ * upstream entries into `own` via `transform` in batches with `pauseMs`
+ * between batches. Per-URI errors are caught and recorded as `meta.error`
+ * so one bad entry doesn't stall the cursor.
+ */
+export function runTracker<U extends Entry, D extends Entry>(
+  upstream: Store<U>,
+  own: Store<D>,
+  transform: Transform<U, D>,
+  opts: RunTrackerOptions,
+): Tracker {
+  const batchSize = opts.batchSize ?? 50;
+  const pauseMs = opts.pauseMs ?? 10;
+  const signal = opts.signal;
+  const onRemove = opts.onRemove;
+
+  let pending = false;
+  let inFlight: Promise<number> | null = null;
+
+  async function drainBatch(): Promise<number> {
+    const cursor = await upstream.cursor(opts.name);
+    const buf: StoreWrite<D>[] = [];
+    let lastStamp = cursor;
+    let count = 0;
+
+    for await (const up of upstream.since(cursor, batchSize)) {
+      if (signal?.aborted) return 0;
+
+      if (up.tombstone) {
+        if (onRemove) {
+          try {
+            await onRemove(up.uri);
+          } catch {
+            // Swallow onRemove errors; the tombstone still propagates downstream.
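+            // (If index deletion must be durable, the onRemove hook should
+            // retry internally — the cursor advances past this entry either way.)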
+          }
+        }
+        buf.push({ uri: up.uri, tombstone: true } as StoreWrite<D>);
+      } else {
+        try {
+          const out = await transform(up);
+          if (out) buf.push(out);
+        } catch (err) {
+          const message = err instanceof Error ? err.message : String(err);
+          buf.push({ uri: up.uri, meta: { error: message } } as unknown as StoreWrite<D>);
+        }
+      }
+
+      lastStamp = Math.max(lastStamp, up.stamp);
+      count += 1;
+    }
+
+    if (count === 0) return 0;
+    await own.put(buf);
+    await upstream.advance(opts.name, lastStamp);
+    return count;
+  }
+
+  async function drain(): Promise<number> {
+    let total = 0;
+    while (!signal?.aborted) {
+      const n = await drainBatch();
+      if (n === 0) break;
+      total += n;
+      if (pauseMs > 0) await sleep(pauseMs);
+    }
+    return total;
+  }
+
+  async function loop(): Promise<number> {
+    if (inFlight) {
+      pending = true;
+      // Wait for the running drain to finish; its do-while will re-run
+      // because `pending` is now set, so any writes made before this call
+      // will be observed by the time we return.
+      return inFlight;
+    }
+    inFlight = (async (): Promise<number> => {
+      let total = 0;
+      try {
+        do {
+          pending = false;
+          total += await drain();
+        } while (pending && !signal?.aborted);
+      } finally {
+        inFlight = null;
+      }
+      return total;
+    })();
+    return inFlight;
+  }
+
+  const unsubscribe = upstream.onStampUpdate(() => {
+    void loop();
+  });
+
+  return {
+    catchUp: loop,
+    kick: () => {
+      void loop();
+    },
+    close: async () => {
+      unsubscribe();
+    },
+  };
+}
diff --git a/packages/content-pipeline/src/transforms/embed.ts b/packages/content-pipeline/src/transforms/embed.ts
new file mode 100644
index 0000000..aa27083
--- /dev/null
+++ b/packages/content-pipeline/src/transforms/embed.ts
@@ -0,0 +1,17 @@
+import type { ChunksEntry, Transform, VecsEntry } from "../types.js";
+
+export type EmbedFn = (text: string) => Promise<Float32Array>;
+
+/**
+ * Produce one embedding per chunk by sequentially awaiting `embedFn` — preserves
+ * chunk ordering in the resulting `vecs` array. Returns null if the upstream
+ * entry carries no chunks.
+ */
+export function embed(embedFn: EmbedFn): Transform<ChunksEntry, VecsEntry> {
+  return async (up) => {
+    if (!up.meta || up.meta.chunks.length === 0) return null;
+    const vecs: Float32Array[] = [];
+    for (const c of up.meta.chunks) vecs.push(await embedFn(c.text));
+    return { uri: up.uri, meta: { vecs } };
+  };
+}
diff --git a/packages/content-pipeline/src/transforms/extract.ts b/packages/content-pipeline/src/transforms/extract.ts
new file mode 100644
index 0000000..16bc33e
--- /dev/null
+++ b/packages/content-pipeline/src/transforms/extract.ts
@@ -0,0 +1,39 @@
+import type { ExtractorRegistry } from "@statewalker/content-extractors";
+import type { FilesApi } from "@statewalker/webrun-files";
+import { extname } from "@statewalker/webrun-files";
+import type { ContentEntry, FileEntry, Transform } from "../types.js";
+
+const EXT_FORMAT: Record<string, string> = {
+  ".md": "markdown",
+  ".txt": "text",
+  ".pdf": "pdf",
+  ".docx": "docx",
+  ".xlsx": "xlsx",
+  ".html": "html",
+  ".htm": "html",
+};
+
+/** Map a URI to its content format using path extension (registry-agnostic). */
+function detectFormat(uri: string): string {
+  const ext = extname(uri).toLowerCase();
+  return EXT_FORMAT[ext] ?? "unknown";
+}
+
+/**
+ * Read the file via `FilesApi`, pick an extractor from the registry by URI,
+ * and produce `{text, format}`. Returns null if no extractor matches — the
+ * driver advances the cursor past unknown formats without writing a downstream entry.
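+ *
+ * Wiring sketch (assumes stores from `createDefaultStores` and a registry from
+ * `@statewalker/content-extractors`):
+ *
+ * ```ts
+ * const tracker = runTracker(stores.files, stores.content, extract(files, extractors), {
+ *   name: "extract",
+ * });
+ * await tracker.catchUp(); // pulls changed FileEntries, writes ContentEntries
+ * ```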
+ */
+export function extract(
+  files: FilesApi,
+  extractors: ExtractorRegistry,
+): Transform<FileEntry, ContentEntry> {
+  return async (up) => {
+    const ex = extractors.get(up.uri);
+    if (!ex) return null;
+    const bytes = files.read(up.uri);
+    const result = await ex(bytes);
+    const text = typeof result === "string" ? result : String(result);
+    return { uri: up.uri, meta: { text, format: detectFormat(up.uri) } };
+  };
+}
diff --git a/packages/content-pipeline/src/transforms/fts-index.ts b/packages/content-pipeline/src/transforms/fts-index.ts
new file mode 100644
index 0000000..6a8033a
--- /dev/null
+++ b/packages/content-pipeline/src/transforms/fts-index.ts
@@ -0,0 +1,31 @@
+import type { DocumentPath, Index, IndexedBlock } from "@statewalker/indexer-api";
+import type { ChunksEntry, ReceiptEntry, Transform } from "../types.js";
+import { uriToDocPath } from "./util.js";
+
+/**
+ * Full-replace indexer: deletes the document then adds one block per chunk.
+ * Block IDs follow `{uri}:{i}` so FTS and vector sub-indexes stay correlated.
+ * Receipt carries no meta — it exists purely so downstream listeners can subscribe.
+ */
+export function ftsIndex(index: Index): Transform<ChunksEntry, ReceiptEntry> {
+  return async (up) => {
+    const path = uriToDocPath(up.uri);
+    await index.deleteDocuments([{ path }]);
+    const chunks = up.meta?.chunks ?? [];
+    if (chunks.length > 0) {
+      const blocks: IndexedBlock[] = chunks.map((c) => ({
+        path,
+        blockId: `${path}:${c.i}`,
+        content: c.text,
+      }));
+      await index.addDocument(blocks);
+    }
+    return { uri: up.uri, meta: {} as Record<string, never> };
+  };
+}
+
+/** Cascade-remove the document from the index when a chunks-layer tombstone arrives. */
+export async function ftsIndexRemove(index: Index, uri: string): Promise<void> {
+  const path: DocumentPath = uriToDocPath(uri);
+  await index.deleteDocuments([{ path }]);
+}
diff --git a/packages/content-pipeline/src/transforms/split.ts b/packages/content-pipeline/src/transforms/split.ts
new file mode 100644
index 0000000..568950b
--- /dev/null
+++ b/packages/content-pipeline/src/transforms/split.ts
@@ -0,0 +1,19 @@
+import type { ChunkOptions } from "@statewalker/indexer-chunker";
+import { chunkMarkdown } from "@statewalker/indexer-chunker";
+import type { ChunksEntry, ContentEntry, Transform } from "../types.js";
+
+/**
+ * Split extracted text into markdown chunks. Returns null for missing or empty
+ * text so the driver skips the URI without writing a downstream entry.
+ */
+export function split(opts: ChunkOptions): Transform<ContentEntry, ChunksEntry> {
+  return async (up) => {
+    if (!up.meta?.text) return null;
+    const chunks = chunkMarkdown(up.meta.text, opts).map((c) => ({
+      i: c.index,
+      text: c.content,
+    }));
+    if (chunks.length === 0) return null;
+    return { uri: up.uri, meta: { chunks } };
+  };
+}
diff --git a/packages/content-pipeline/src/transforms/util.ts b/packages/content-pipeline/src/transforms/util.ts
new file mode 100644
index 0000000..1dd9cea
--- /dev/null
+++ b/packages/content-pipeline/src/transforms/util.ts
@@ -0,0 +1,6 @@
+import type { DocumentPath } from "@statewalker/indexer-api";
+
+/** Coerce a URI into the `/…` DocumentPath shape required by indexer-api. */
+export function uriToDocPath(uri: string): DocumentPath {
+  return (uri.startsWith("/") ? uri : `/${uri}`) as DocumentPath;
+}
diff --git a/packages/content-pipeline/src/transforms/vec-index.ts b/packages/content-pipeline/src/transforms/vec-index.ts
new file mode 100644
index 0000000..b004157
--- /dev/null
+++ b/packages/content-pipeline/src/transforms/vec-index.ts
@@ -0,0 +1,29 @@
+import type { DocumentPath, Index, IndexedBlock } from "@statewalker/indexer-api";
+import type { ReceiptEntry, Transform, VecsEntry } from "../types.js";
+import { uriToDocPath } from "./util.js";
+
+/**
+ * Full-replace vector indexer. Block IDs match `ftsIndex` so hybrid search can
+ * correlate FTS and vector hits for the same chunk.
+ */
+export function vecIndex(index: Index): Transform<VecsEntry, ReceiptEntry> {
+  return async (up) => {
+    const path = uriToDocPath(up.uri);
+    await index.deleteDocuments([{ path }]);
+    const vecs = up.meta?.vecs ?? [];
+    if (vecs.length > 0) {
+      const blocks: IndexedBlock[] = vecs.map((embedding, i) => ({
+        path,
+        blockId: `${path}:${i}`,
+        embedding,
+      }));
+      await index.addDocument(blocks);
+    }
+    return { uri: up.uri, meta: {} as Record<string, never> };
+  };
+}
+
+export async function vecIndexRemove(index: Index, uri: string): Promise<void> {
+  const path: DocumentPath = uriToDocPath(uri);
+  await index.deleteDocuments([{ path }]);
+}
diff --git a/packages/content-pipeline/src/types.ts b/packages/content-pipeline/src/types.ts
new file mode 100644
index 0000000..7889bf7
--- /dev/null
+++ b/packages/content-pipeline/src/types.ts
@@ -0,0 +1,53 @@
+/**
+ * A tracked item in a layer's store. Every layer parameterises the meta type;
+ * tombstones carry no meta.
+ */
+export type Entry<M extends Record<string, unknown> = Record<string, unknown>> = {
+  uri: string;
+  stamp: number;
+  tombstone?: true;
+  meta?: M;
+};
+
+/** A transform pulls upstream entries and returns downstream entries sans stamp. */
+export type Transform<U extends Entry, D extends Entry> = (
+  upstream: U,
+) => Promise<Omit<D, "stamp"> | null>;
+
+/** Files-tracker meta: what `scanFiles` records per URI. */
+export type FileMeta = {
+  size: number;
+  mtime: number;
+  hash: string;
+};
+
+/** Extract-tracker meta: text payload + detected format. */
+export type ContentMeta = {
+  text: string;
+  format: string;
+};
+
+/** A single chunk produced by `split`. */
+export type Chunk = {
+  i: number;
+  text: string;
+};
+
+/** Split-tracker meta: an ordered list of chunks. */
+export type ChunksMeta = {
+  chunks: Chunk[];
+};
+
+/** Embed-tracker meta: one embedding per chunk, same order as the chunks list. */
+export type VecsMeta = {
+  vecs: Float32Array[];
+};
+
+/**
+ * FTS/vec indexer receipt meta: intentionally empty.
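+ * A receipt's only payload is its stamp: downstream listeners subscribe to the
+ * receipt store to learn that a URI has been (re)indexed, nothing more.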
+ */
+export type Receipt = Record<string, never>;
+
+export type FileEntry = Entry<FileMeta>;
+export type ContentEntry = Entry<ContentMeta>;
+export type ChunksEntry = Entry<ChunksMeta>;
+export type VecsEntry = Entry<VecsMeta>;
+export type ReceiptEntry = Entry<Receipt>;
diff --git a/packages/content-pipeline/tests/blob-store.test.ts b/packages/content-pipeline/tests/blob-store.test.ts
new file mode 100644
index 0000000..b3b818c
--- /dev/null
+++ b/packages/content-pipeline/tests/blob-store.test.ts
@@ -0,0 +1,110 @@
+import { sha1Uuid } from "@statewalker/shared-ids";
+import { MemFilesApi } from "@statewalker/webrun-files-mem";
+import { describe, expect, it } from "vitest";
+import { BlobStore } from "../src/stores/blob.js";
+import { float32Codec } from "../src/stores/codec-float32.js";
+import { msgpackCodec } from "../src/stores/codec-msgpack.js";
+import type { ContentEntry, VecsEntry } from "../src/types.js";
+
+describe("BlobStore", () => {
+  it("writes one blob per URI at {prefix}/{dd}/{hash}.bin", async () => {
+    const files = new MemFilesApi();
+    const store = new BlobStore<ContentEntry>({
+      files,
+      prefix: "/s",
+      codec: msgpackCodec(),
+    });
+    await store.put([{ uri: "doc.md", meta: { text: "hello", format: "markdown" } }]);
+    const hash = await sha1Uuid("doc.md");
+    const expectedPath = `/s/${hash.slice(0, 2)}/${hash}.bin`;
+    expect(await files.exists(expectedPath)).toBe(true);
+  });
+
+  it("round-trips meta via the msgpack codec", async () => {
+    const store = new BlobStore<ContentEntry>({
+      files: new MemFilesApi(),
+      prefix: "/s",
+      codec: msgpackCodec(),
+    });
+    await store.put([{ uri: "a", meta: { text: "body", format: "markdown" } }]);
+    const e = await store.get("a");
+    expect(e?.meta).toEqual({ text: "body", format: "markdown" });
+  });
+
+  it("tombstones remove the blob and skip codec encoding", async () => {
+    const files = new MemFilesApi();
+    const store = new BlobStore<ContentEntry>({
+      files,
+      prefix: "/s",
+      codec: msgpackCodec(),
+    });
+    await store.put([{ uri: "a", meta: { text: "body", format: "markdown" } }]);
+    const hash = await sha1Uuid("a");
+    const blobP = `/s/${hash.slice(0, 2)}/${hash}.bin`;
+    expect(await files.exists(blobP)).toBe(true);
+
+    await store.put([{ uri: "a", tombstone: true }]);
+    expect(await files.exists(blobP)).toBe(false);
+    const e = await store.get("a");
+    expect(e?.tombstone).toBe(true);
+    expect(e?.meta).toBeUndefined();
+  });
+
+  it("since yields entries in stamp order, reading blobs lazily on each yield", async () => {
+    const store = new BlobStore<ContentEntry>({
+      files: new MemFilesApi(),
+      prefix: "/s",
+      codec: msgpackCodec(),
+    });
+    await store.put([
+      { uri: "a", meta: { text: "A", format: "markdown" } },
+      { uri: "b", meta: { text: "B", format: "markdown" } },
+      { uri: "c", meta: { text: "C", format: "markdown" } },
+    ]);
+    const texts: string[] = [];
+    for await (const e of store.since(0, 10)) {
+      if (e.meta) texts.push(e.meta.text);
+    }
+    expect(texts).toEqual(["A", "B", "C"]);
+  });
+
+  it("works with the float32 codec for embeddings", async () => {
+    const store = new BlobStore<VecsEntry>({
+      files: new MemFilesApi(),
+      prefix: "/v",
+      codec: float32Codec(),
+    });
+    await store.put([
+      {
+        uri: "doc",
+        meta: { vecs: [Float32Array.of(0.1, 0.2, 0.3), Float32Array.of(0.4, 0.5, 0.6)] },
+      },
+    ]);
+    const e = await store.get("doc");
+    expect(e?.meta?.vecs.length).toBe(2);
+    expect(Array.from(e?.meta?.vecs[0] ?? [])).toEqual([
+      0.10000000149011612, 0.20000000298023224, 0.30000001192092896,
+    ]);
+  });
+
+  it("persists across a simulated restart", async () => {
+    const files = new MemFilesApi();
+    const first = new BlobStore<ContentEntry>({
+      files,
+      prefix: "/s",
+      codec: msgpackCodec(),
+    });
+    await first.put([{ uri: "a", meta: { text: "body", format: "markdown" } }]);
+    const topFirst = (await first.get("a"))?.stamp as number;
+    await first.advance("t", topFirst);
+
+    const second = new BlobStore<ContentEntry>({
+      files,
+      prefix: "/s",
+      codec: msgpackCodec(),
+    });
+    const reopened = await second.get("a");
+    expect(reopened?.meta?.text).toBe("body");
+    expect(await second.cursor("t")).toBe(topFirst);
+  });
+});
diff --git a/packages/content-pipeline/tests/codecs.test.ts b/packages/content-pipeline/tests/codecs.test.ts
new file mode 100644
index 0000000..93f1942
--- /dev/null
+++ b/packages/content-pipeline/tests/codecs.test.ts
@@ -0,0 +1,66 @@
+import { describe, expect, it } from "vitest";
+import { float32Codec } from "../src/stores/codec-float32.js";
+import { msgpackCodec } from "../src/stores/codec-msgpack.js";
+import type { ChunksMeta } from "../src/types.js";
+
+describe("msgpackCodec", () => {
+  it("round-trips a chunk list preserving order and types", async () => {
+    const codec = msgpackCodec<ChunksMeta>();
+    const meta: ChunksMeta = {
+      chunks: [
+        { i: 0, text: "alpha" },
+        { i: 1, text: "beta" },
+        { i: 2, text: "gamma" },
+      ],
+    };
+    const bytes = await codec.encode(meta);
+    const decoded = await codec.decode(bytes);
+    expect(decoded).toEqual(meta);
+  });
+
+  it("round-trips generic meta shapes", async () => {
+    const codec = msgpackCodec<Record<string, unknown>>();
+    const meta = { size: 42, mtime: 1_700_000_000, nested: { a: [1, 2, 3] } };
+    const bytes = await codec.encode(meta);
+    expect(await codec.decode(bytes)).toEqual(meta);
+  });
+});
+
+describe("float32Codec", () => {
+  it("round-trips embeddings preserving values and order", async () => {
+    const codec = float32Codec();
+    const meta = {
+      vecs: [Float32Array.of(1, 2, 3), Float32Array.of(4, 5, 6), Float32Array.of(7, 8, 9)],
+    };
+    const bytes = await codec.encode(meta);
+    const decoded = await codec.decode(bytes);
+    expect(decoded.vecs.length).toBe(3);
+    expect(Array.from(decoded.vecs[0] as Float32Array)).toEqual([1, 2, 3]);
+    expect(Array.from(decoded.vecs[1] as Float32Array)).toEqual([4, 5, 6]);
+    expect(Array.from(decoded.vecs[2] as Float32Array)).toEqual([7, 8, 9]);
+  });
+
+  it("encodes empty vec list as a valid header-only blob", async () => {
+    const codec = float32Codec();
+    const bytes = await codec.encode({ vecs: [] });
+    expect(bytes.length).toBe(8);
+    const decoded = await codec.decode(bytes);
+    expect(decoded.vecs).toEqual([]);
+  });
+
+  it("throws on mismatched dimensions at encode time", () => {
+    const codec = float32Codec();
+    expect(() => codec.encode({ vecs: [Float32Array.of(1, 2, 3), Float32Array.of(4, 5)] })).toThrow(
+      /share dimension/,
+    );
+  });
+
+  it("throws on corrupted header at decode time", () => {
+    const codec = float32Codec();
+    // Header claims 10 vecs of dim 3 but body is empty.
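+    // (Header layout per the tests above: two little-endian uint32s — vec count,
+    // then dimension — so an empty encode is exactly 8 bytes.)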
+ const buf = new Uint8Array(8); + new DataView(buf.buffer).setUint32(0, 10, true); + new DataView(buf.buffer).setUint32(4, 3, true); + expect(() => codec.decode(buf)).toThrow(/size/); + }); +}); diff --git a/packages/content-manager/tests/content-manager.test.ts b/packages/content-pipeline/tests/content-manager.test.ts similarity index 77% rename from packages/content-manager/tests/content-manager.test.ts rename to packages/content-pipeline/tests/content-manager.test.ts index b78d9bb..7ff73b9 100644 --- a/packages/content-manager/tests/content-manager.test.ts +++ b/packages/content-pipeline/tests/content-manager.test.ts @@ -1,26 +1,24 @@ import { createDefaultRegistry } from "@statewalker/content-extractors/extractors"; -import { FilesScanRegistry } from "@statewalker/content-scanner"; import { createFlexSearchIndexer } from "@statewalker/indexer-mem-flexsearch"; import { writeText } from "@statewalker/webrun-files"; import { MemFilesApi } from "@statewalker/webrun-files-mem"; import { describe, expect, it } from "vitest"; -import { createContentManager } from "../src/content-manager.js"; -import type { SyncEvent } from "../src/types.js"; +import { createContentManager, type SyncEvent } from "../src/content-manager.js"; function setup() { const files = new MemFilesApi(); - const registry = new FilesScanRegistry({ files, prefix: "/.index/scan" }); const indexer = createFlexSearchIndexer(); const extractors = createDefaultRegistry(); const manager = createContentManager({ - registry, - indexer, files, + statePrefix: "/.state/content", extractors, + indexer, root: "/", - filter: (path: string) => !path.startsWith("/.index/"), + filter: (p) => !p.startsWith("/.state/"), + pauseMs: 0, }); - return { files, registry, indexer, manager }; + return { files, indexer, manager }; } async function collectEvents(gen: AsyncGenerator): Promise { @@ -29,10 +27,9 @@ async function collectEvents(gen: AsyncGenerator): Promise { +describe("content-manager orchestrator (ported from legacy integration test)", () => { it("syncs files through the full pipeline", async () => { const { files, manager } = setup(); - await writeText(files, "/readme.md", "# Hello\n\nThis is a readme file."); await writeText( files, @@ -42,23 +39,17 @@ describe("content-manager orchestrator", () => { const events = await collectEvents(manager.sync()); - const started = events.find((e) => e.type === "sync-started"); - expect(started).toBeDefined(); - + expect(events.find((e) => e.type === "sync-started")).toBeDefined(); const done = events.find((e) => e.type === "sync-done"); - expect(done).toBeDefined(); expect(done?.type === "sync-done" && done.stats.indexed).toBe(2); expect(done?.type === "sync-done" && done.stats.errors).toBe(0); - - const indexed = events.filter((e) => e.type === "file-indexed"); - expect(indexed).toHaveLength(2); + expect(events.filter((e) => e.type === "file-indexed")).toHaveLength(2); await manager.close(); }); it("search finds indexed content", async () => { const { files, manager } = setup(); - await writeText( files, "/animals.md", @@ -71,7 +62,6 @@ describe("content-manager orchestrator", () => { ); await collectEvents(manager.sync()); - const hits = await manager.search({ queries: ["cats furry"] }); expect(hits.length).toBeGreaterThan(0); expect(hits[0]?.uri).toContain("animals.md"); @@ -81,7 +71,6 @@ describe("content-manager orchestrator", () => { it("status reports file and index counts", async () => { const { files, manager } = setup(); - await writeText(files, "/a.md", "# A\n\nContent A."); await 
writeText(files, "/b.md", "# B\n\nContent B."); await writeText(files, "/c.txt", "Plain text content."); @@ -89,9 +78,7 @@ describe("content-manager orchestrator", () => { await collectEvents(manager.sync()); const status = await manager.status(); - // All 3 files detected expect(status.files).toBe(3); - // Only .md and .txt have extractors expect(status.indexed).toBeGreaterThanOrEqual(2); await manager.close(); @@ -99,13 +86,11 @@ describe("content-manager orchestrator", () => { it("incremental sync skips unchanged files", async () => { const { files, manager } = setup(); - await writeText(files, "/doc.md", "# Doc\n\nOriginal content."); const events1 = await collectEvents(manager.sync()); const done1 = events1.find((e) => e.type === "sync-done"); expect(done1?.type === "sync-done" && done1.stats.indexed).toBe(1); - // Second sync without changes — no new indexing const events2 = await collectEvents(manager.sync()); const done2 = events2.find((e) => e.type === "sync-done"); expect(done2?.type === "sync-done" && done2.stats.indexed).toBe(0); @@ -113,47 +98,36 @@ describe("content-manager orchestrator", () => { await manager.close(); }); - it("clear removes all stores and index", async () => { - const { files, manager, registry } = setup(); - + it("clear removes all stores and the index", async () => { + const { files, manager } = setup(); await writeText(files, "/doc.md", "# Doc\n\nContent."); await collectEvents(manager.sync()); - const statusBefore = await manager.status(); expect(statusBefore.files).toBe(1); await manager.clear(); - const names = await registry.getStoreNames(); - expect(names).toHaveLength(0); - + expect(await files.exists("/.state/content")).toBe(false); await manager.close(); }); it("returns empty results for search with no content", async () => { const { manager } = setup(); - const hits = await manager.search({ queries: ["anything"] }); expect(hits).toEqual([]); - await manager.close(); }); it("handles file removal", async () => { const { files, manager } = setup(); - await writeText(files, "/temp.md", "# Temporary\n\nWill be removed."); await collectEvents(manager.sync()); - const status1 = await manager.status(); expect(status1.indexed).toBe(1); - // Remove the file and re-sync await files.remove("/temp.md"); const events = await collectEvents(manager.sync()); - - const removed = events.filter((e) => e.type === "file-removed"); - expect(removed).toHaveLength(1); + expect(events.filter((e) => e.type === "file-removed")).toHaveLength(1); await manager.close(); }); diff --git a/packages/content-pipeline/tests/json-manifest.test.ts b/packages/content-pipeline/tests/json-manifest.test.ts new file mode 100644 index 0000000..fdd9be8 --- /dev/null +++ b/packages/content-pipeline/tests/json-manifest.test.ts @@ -0,0 +1,88 @@ +import { MemFilesApi } from "@statewalker/webrun-files-mem"; +import { describe, expect, it } from "vitest"; +import { JsonManifestStore } from "../src/stores/json-manifest.js"; +import type { FileEntry } from "../src/types.js"; + +const mk = (prefix = "/store"): JsonManifestStore => + new JsonManifestStore({ files: new MemFilesApi(), prefix }); + +describe("JsonManifestStore", () => { + it("round-trips live entries and tombstones", async () => { + const store = mk(); + await store.put([ + { uri: "a", meta: { size: 10, mtime: 1, hash: "aa" } }, + { uri: "b", meta: { size: 20, mtime: 2, hash: "bb" } }, + ]); + const a = await store.get("a"); + expect(a?.meta?.hash).toBe("aa"); + expect(a?.tombstone).toBeUndefined(); + + await store.put([{ uri: "a", 
tombstone: true }]);
+    const aGone = await store.get("a");
+    expect(aGone?.tombstone).toBe(true);
+    expect(aGone?.meta).toBeUndefined();
+  });
+
+  it("since yields only entries with stamp > cursor, in ascending order", async () => {
+    const store = mk();
+    await store.put([
+      { uri: "a", meta: { size: 1, mtime: 1, hash: "a" } },
+      { uri: "b", meta: { size: 2, mtime: 2, hash: "b" } },
+    ]);
+    const all = [];
+    for await (const e of store.since(0, 100)) all.push(e);
+    expect(all.map((e) => e.uri)).toEqual(["a", "b"]);
+    for (let i = 1; i < all.length; i++) {
+      expect((all[i] as FileEntry).stamp).toBeGreaterThan((all[i - 1] as FileEntry).stamp);
+    }
+    const midCursor = (all[0] as FileEntry).stamp;
+    const after = [];
+    for await (const e of store.since(midCursor, 100)) after.push(e);
+    expect(after.map((e) => e.uri)).toEqual(["b"]);
+  });
+
+  it("keeps per-listener cursors independent", async () => {
+    const store = mk();
+    await store.advance("A", 100);
+    await store.advance("B", 200);
+    expect(await store.cursor("A")).toBe(100);
+    expect(await store.cursor("B")).toBe(200);
+    expect(await store.cursor("unknown")).toBe(0);
+    await store.advance("A", 150);
+    expect(await store.cursor("A")).toBe(150);
+    expect(await store.cursor("B")).toBe(200);
+  });
+
+  it("persists across a simulated restart on the same FilesApi", async () => {
+    const files = new MemFilesApi();
+    const first = new JsonManifestStore<FileEntry>({ files, prefix: "/s" });
+    await first.put([{ uri: "a", meta: { size: 1, mtime: 1, hash: "a" } }]);
+    const topFirst = (await first.get("a"))?.stamp as number;
+    await first.advance("t", topFirst);
+
+    const second = new JsonManifestStore<FileEntry>({ files, prefix: "/s" });
+    const reopened = await second.get("a");
+    expect(reopened?.meta?.hash).toBe("a");
+    expect(await second.cursor("t")).toBe(topFirst);
+
+    await second.put([{ uri: "b", meta: { size: 2, mtime: 2, hash: "b" } }]);
+    const topSecond = (await second.get("b"))?.stamp as number;
+    expect(topSecond).toBeGreaterThan(topFirst);
+  });
+
+  it("fires onStampUpdate once per put batch with the batch's top stamp", async () => {
+    const store = mk();
+    const notifications: number[] = [];
+    const unsub = store.onStampUpdate((s) => notifications.push(s));
+    await store.put([
+      { uri: "a", meta: { size: 1, mtime: 1, hash: "a" } },
+      { uri: "b", meta: { size: 2, mtime: 2, hash: "b" } },
+    ]);
+    expect(notifications.length).toBe(1);
+    const top = (await store.get("b"))?.stamp;
+    expect(notifications[0]).toBe(top);
+    unsub();
+    await store.put([{ uri: "c", meta: { size: 3, mtime: 3, hash: "c" } }]);
+    expect(notifications.length).toBe(1);
+  });
+});
diff --git a/packages/content-pipeline/tests/pipeline-cascade.test.ts b/packages/content-pipeline/tests/pipeline-cascade.test.ts
new file mode 100644
index 0000000..566a269
--- /dev/null
+++ b/packages/content-pipeline/tests/pipeline-cascade.test.ts
@@ -0,0 +1,254 @@
+import { createDefaultRegistry } from "@statewalker/content-extractors/extractors";
+import { createFlexSearchIndexer } from "@statewalker/indexer-mem-flexsearch";
+import { writeText } from "@statewalker/webrun-files";
+import { MemFilesApi } from "@statewalker/webrun-files-mem";
+import { describe, expect, it } from "vitest";
+import { createDefaultStores, createPipeline } from "../src/pipeline.js";
+
+const waitFor = async (
+  predicate: () => boolean | Promise<boolean>,
+  { timeoutMs = 2000, intervalMs = 10 } = {},
+): Promise<void> => {
+  const start = Date.now();
+  while (!(await predicate())) {
+    if (Date.now() - start > timeoutMs) throw new
Error("waitFor timed out"); + await new Promise((r) => setTimeout(r, intervalMs)); + } +}; + +describe("pipeline cascade", () => { + it("propagates a new file through content, chunks, fts-receipt without an orchestrator", async () => { + const files = new MemFilesApi(); + const indexer = createFlexSearchIndexer(); + const index = await indexer.createIndex({ name: "content", fulltext: { language: "en" } }); + const extractors = createDefaultRegistry(); + const stores = createDefaultStores({ + files, + prefix: "/.state/content", + withFtsIndex: true, + }); + const pipeline = createPipeline({ + files, + root: "/", + filter: (p) => !p.startsWith("/.state/"), + extractors, + chunkOptions: { targetChars: 200 }, + ftsIndex: index, + stores, + pauseMs: 0, + }); + + await writeText(files, "/doc.md", "# Doc\n\nsome body text."); + await pipeline.scanFiles(); + + // scanFiles wrote to the files store; trackers are subscribed — wait for the + // fts-receipt store to show the entry without us ever calling catchUpAll. + await waitFor(async () => { + const receipt = await stores.fts?.get("/doc.md"); + return !!receipt && !receipt.tombstone; + }); + + const content = await stores.content.get("/doc.md"); + expect(content?.meta?.text).toContain("Doc"); + const chunks = await stores.chunks.get("/doc.md"); + expect(chunks?.meta?.chunks.length ?? 0).toBeGreaterThan(0); + + await pipeline.close(); + }); + + it("cascades tombstones through the pipeline and deletes docs from the FTS index", async () => { + const files = new MemFilesApi(); + const indexer = createFlexSearchIndexer(); + const index = await indexer.createIndex({ name: "content", fulltext: { language: "en" } }); + const extractors = createDefaultRegistry(); + const stores = createDefaultStores({ + files, + prefix: "/.state/content", + withFtsIndex: true, + }); + const pipeline = createPipeline({ + files, + root: "/", + filter: (p) => !p.startsWith("/.state/"), + extractors, + chunkOptions: { targetChars: 200 }, + ftsIndex: index, + stores, + pauseMs: 0, + }); + + await writeText(files, "/gone.md", "# Gone\n\nContent to be removed."); + await pipeline.scanFiles(); + await pipeline.catchUpAll(); + + const hitsBefore: unknown[] = []; + for await (const r of index.search({ queries: ["Gone"], topK: 5 })) hitsBefore.push(r); + expect(hitsBefore.length).toBeGreaterThan(0); + + await files.remove("/gone.md"); + await pipeline.scanFiles(); + await pipeline.catchUpAll(); + + expect((await stores.content.get("/gone.md"))?.tombstone).toBe(true); + expect((await stores.chunks.get("/gone.md"))?.tombstone).toBe(true); + expect((await stores.fts?.get("/gone.md"))?.tombstone).toBe(true); + + const hitsAfter: unknown[] = []; + for await (const r of index.search({ queries: ["Gone"], topK: 5 })) hitsAfter.push(r); + expect(hitsAfter).toHaveLength(0); + + await pipeline.close(); + }); + + it("rebuild-from-scratch: resetting the first tracker's cursor reprocesses every upstream entry", async () => { + const files = new MemFilesApi(); + const indexer = createFlexSearchIndexer(); + const index = await indexer.createIndex({ name: "content", fulltext: { language: "en" } }); + const extractors = createDefaultRegistry(); + const stores = createDefaultStores({ + files, + prefix: "/.state/content", + withFtsIndex: true, + }); + const pipeline = createPipeline({ + files, + root: "/", + filter: (p) => !p.startsWith("/.state/"), + extractors, + chunkOptions: { targetChars: 200 }, + ftsIndex: index, + stores, + pauseMs: 0, + }); + + await writeText(files, "/a.md", "# A\n\nbody A."); 
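+    // Two files, so the cursor reset below replays more than one upstream entry.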
+    await writeText(files, "/b.md", "# B\n\nbody B.");
+    await pipeline.scanFiles();
+    await pipeline.catchUpAll();
+
+    const extractCursorA = await stores.files.cursor("extract");
+    expect(extractCursorA).toBeGreaterThan(0);
+
+    // Reset every layer's cursor; catchUpAll should replay everything.
+    await stores.files.advance("extract", 0);
+    await stores.content.advance("split", 0);
+    await stores.chunks.advance("fts", 0);
+    await pipeline.catchUpAll();
+
+    const extractCursorB = await stores.files.cursor("extract");
+    expect(extractCursorB).toBe(extractCursorA);
+    expect(await stores.content.get("/a.md")).toBeDefined();
+    expect(await stores.content.get("/b.md")).toBeDefined();
+
+    await pipeline.close();
+  });
+
+  it("embeddings round-trip through BlobStore(float32) and vecIndex with matching block IDs", async () => {
+    const files = new MemFilesApi();
+    const indexer = createFlexSearchIndexer();
+    const index = await indexer.createIndex({
+      name: "content",
+      fulltext: { language: "en" },
+      // The flexsearch impl may or may not support an embedding sub-index;
+      // vecIndex only runs if we pass it as `vecIndex` below, so we keep both pointers.
+    });
+    const extractors = createDefaultRegistry();
+    const stores = createDefaultStores({
+      files,
+      prefix: "/.state/content",
+      withFtsIndex: true,
+      withEmbeddings: true,
+      withVecIndex: true,
+    });
+
+    const fakeEmbed = async (text: string): Promise<Float32Array> => {
+      // Deterministic 4-dim hash so we can assert presence/order without cosine math.
+      const vec = new Float32Array(4);
+      for (let i = 0; i < text.length; i++) {
+        const slot = i % 4;
+        vec[slot] = (vec[slot] ?? 0) + text.charCodeAt(i);
+      }
+      return vec;
+    };
+
+    const pipeline = createPipeline({
+      files,
+      root: "/",
+      filter: (p) => !p.startsWith("/.state/"),
+      extractors,
+      chunkOptions: { targetChars: 60 },
+      ftsIndex: index,
+      vecIndex: index,
+      embed: fakeEmbed,
+      stores,
+      pauseMs: 0,
+    });
+
+    await writeText(files, "/doc.md", "alpha beta gamma delta epsilon zeta eta theta");
+    await pipeline.scanFiles();
+    await pipeline.catchUpAll();
+
+    const vecsEntry = await stores.embeddings?.get("/doc.md");
+    expect(vecsEntry?.meta?.vecs.length).toBeGreaterThan(0);
+    expect(vecsEntry?.meta?.vecs[0]?.length).toBe(4);
+
+    // Sanity: the vec-receipt store reflects the indexing step.
+    const vecReceipt = await stores.vec?.get("/doc.md");
+    expect(vecReceipt).toBeDefined();
+    expect(vecReceipt?.tombstone).toBeUndefined();
+
+    // Block-ID invariant (embed-transform produces the same count as split chunks).
+    const chunks = await stores.chunks.get("/doc.md");
+    const vecs = await stores.embeddings?.get("/doc.md");
+    expect(vecs?.meta?.vecs.length).toBe(chunks?.meta?.chunks.length);
+
+    await pipeline.close();
+  });
+
+  it("pacing: pauseMs yields to the event loop between batches", async () => {
+    // Build a full pipeline with small batches and a non-zero pauseMs, then use
+    // an interval timer as an external side effect to verify that the event
+    // loop keeps running between batches while the drain is in progress.
+ let externalTicks = 0; + const interval = setInterval(() => { + externalTicks += 1; + }, 5); + try { + const files = new MemFilesApi(); + const indexer = createFlexSearchIndexer(); + const index = await indexer.createIndex({ name: "content", fulltext: { language: "en" } }); + const extractors = createDefaultRegistry(); + const stores = createDefaultStores({ + files, + prefix: "/.state/content", + withFtsIndex: true, + }); + const pipeline = createPipeline({ + files, + root: "/", + filter: (p) => !p.startsWith("/.state/"), + extractors, + chunkOptions: { targetChars: 200 }, + ftsIndex: index, + stores, + batchSize: 2, + pauseMs: 20, + }); + + for (let i = 0; i < 6; i++) { + await writeText(files, `/file-${i}.md`, `# File ${i}\n\nbody ${i}.`); + } + await pipeline.scanFiles(); + await pipeline.catchUpAll(); + + // We wrote 6 files through two layers with batchSize=2 and pauseMs=20ms. + // That's at least (6/2 - 1) = 2 pauses per layer × 3 heavy layers = 6×20ms = 120ms + // of sleep — the 5ms-interval timer should have fired during that window. + expect(externalTicks).toBeGreaterThan(2); + + await pipeline.close(); + } finally { + clearInterval(interval); + } + }); +}); diff --git a/packages/content-pipeline/tests/stamp.test.ts b/packages/content-pipeline/tests/stamp.test.ts new file mode 100644 index 0000000..967e263 --- /dev/null +++ b/packages/content-pipeline/tests/stamp.test.ts @@ -0,0 +1,38 @@ +import { describe, expect, it } from "vitest"; +import { createStampAllocator } from "../src/stores/stamp.js"; + +describe("StampAllocator", () => { + it("produces strictly increasing stamps within a single batch", () => { + const alloc = createStampAllocator(); + const stamps = Array.from({ length: 1000 }, () => alloc.next()); + for (let i = 1; i < stamps.length; i++) { + const prev = stamps[i - 1] as number; + const cur = stamps[i] as number; + expect(cur).toBeGreaterThan(prev); + } + }); + + it("current() reflects the last allocated stamp without advancing", () => { + const alloc = createStampAllocator(); + const a = alloc.next(); + expect(alloc.current()).toBe(a); + expect(alloc.current()).toBe(a); + const b = alloc.next(); + expect(b).toBeGreaterThan(a); + expect(alloc.current()).toBe(b); + }); + + it("seed+next produces a stamp strictly greater than the seeded value", () => { + const alloc = createStampAllocator(); + alloc.seed(42); + expect(alloc.next()).toBeGreaterThan(42); + }); + + it("simulates restart: new allocator seeded with prior current() produces a greater stamp", () => { + const first = createStampAllocator(); + const top = first.next(); + const second = createStampAllocator(); + second.seed(top); + expect(second.next()).toBeGreaterThan(top); + }); +}); diff --git a/packages/content-pipeline/tests/tracker.test.ts b/packages/content-pipeline/tests/tracker.test.ts new file mode 100644 index 0000000..6195bb7 --- /dev/null +++ b/packages/content-pipeline/tests/tracker.test.ts @@ -0,0 +1,306 @@ +import { describe, expect, it } from "vitest"; +import { + createStampAllocator, + type StampListener, + type Store, + type StoreWrite, + type Unsubscribe, +} from "../src/store.js"; +import { runTracker } from "../src/tracker.js"; +import type { Entry, Transform } from "../src/types.js"; + +/** Minimal in-memory Store used by these tests. Orders entries by insertion stamp. 
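+ * Stamps come from a private allocator, `since` sorts by stamp, and
+ * `onStampUpdate` fires once per put() with the batch's top stamp.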
+ */
+class MemStore<E extends Entry> implements Store<E> {
+  private readonly entries = new Map<string, E>();
+  private readonly cursors = new Map<string, number>();
+  private readonly listeners = new Set<StampListener>();
+  private readonly stamps = createStampAllocator();
+
+  async get(uri: string): Promise<E | undefined> {
+    return this.entries.get(uri);
+  }
+
+  async put(writes: StoreWrite<E>[]): Promise<void> {
+    if (writes.length === 0) return;
+    let top = 0;
+    for (const w of writes) {
+      const stamp = this.stamps.next();
+      const entry = { ...w, stamp } as E;
+      this.entries.set(entry.uri, entry);
+      top = stamp;
+    }
+    for (const listener of this.listeners) listener(top);
+  }
+
+  async *since(cursor: number, limit: number): AsyncGenerator<E> {
+    const sorted = [...this.entries.values()]
+      .filter((e) => e.stamp > cursor)
+      .sort((a, b) => a.stamp - b.stamp);
+    for (let i = 0; i < Math.min(limit, sorted.length); i++) {
+      yield sorted[i] as E;
+    }
+  }
+
+  async cursor(name: string): Promise<number> {
+    return this.cursors.get(name) ?? 0;
+  }
+
+  async advance(name: string, stamp: number): Promise<void> {
+    this.cursors.set(name, stamp);
+  }
+
+  onStampUpdate(listener: StampListener): Unsubscribe {
+    this.listeners.add(listener);
+    return () => this.listeners.delete(listener);
+  }
+
+  async close(): Promise<void> {
+    this.listeners.clear();
+  }
+
+  /** Test helper: snapshot live + tombstone entries. */
+  all(): E[] {
+    return [...this.entries.values()].sort((a, b) => a.stamp - b.stamp);
+  }
+}
+
+type TxtMeta = { text: string };
+type TxtEntry = Entry<TxtMeta>;
+
+const identityUpper: Transform<TxtEntry, TxtEntry> = async (up) => {
+  if (!up.meta) return null;
+  return { uri: up.uri, meta: { text: up.meta.text.toUpperCase() } };
+};
+
+const waitFor = async (
+  predicate: () => boolean,
+  { timeoutMs = 1000, intervalMs = 5 } = {},
+): Promise<void> => {
+  const start = Date.now();
+  while (!predicate()) {
+    if (Date.now() - start > timeoutMs) throw new Error("waitFor timed out");
+    await new Promise((r) => setTimeout(r, intervalMs));
+  }
+};
+
+describe("runTracker", () => {
+  it("catches up from cursor N, processing entries in batches of batchSize", async () => {
+    const upstream = new MemStore<TxtEntry>();
+    const own = new MemStore<TxtEntry>();
+    await upstream.put(
+      Array.from({ length: 10 }, (_, i) => ({ uri: `f${i}`, meta: { text: `x${i}` } })),
+    );
+
+    const tracker = runTracker(upstream, own, identityUpper, {
+      name: "t",
+      batchSize: 3,
+      pauseMs: 0,
+    });
+    try {
+      const processed = await tracker.catchUp();
+      expect(processed).toBe(10);
+      expect(own.all().map((e) => e.meta?.text)).toEqual([
+        "X0",
+        "X1",
+        "X2",
+        "X3",
+        "X4",
+        "X5",
+        "X6",
+        "X7",
+        "X8",
+        "X9",
+      ]);
+      const topUpstream = Math.max(...upstream.all().map((e) => e.stamp));
+      expect(await upstream.cursor("t")).toBe(topUpstream);
+    } finally {
+      await tracker.close();
+    }
+  });
+
+  it("rebuilds from scratch when cursor is reset to 0", async () => {
+    const upstream = new MemStore<TxtEntry>();
+    const own = new MemStore<TxtEntry>();
+    await upstream.put([
+      { uri: "a", meta: { text: "a" } },
+      { uri: "b", meta: { text: "b" } },
+    ]);
+
+    const tracker = runTracker(upstream, own, identityUpper, { name: "t", pauseMs: 0 });
+    try {
+      await tracker.catchUp();
+      expect(own.all().length).toBe(2);
+
+      await upstream.advance("t", 0);
+      const processed = await tracker.catchUp();
+      expect(processed).toBe(2);
+      // Own store now contains two generations (original + rebuilt) keyed by same URI;
+      // last write wins per URI.
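+      // (MemStore keys entries by URI, so the replayed writes overwrite in place.)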
+      expect(own.all().length).toBe(2);
+    } finally {
+      await tracker.close();
+    }
+  });
+
+  it("cascades via runtime notifications (no manual catchUp call)", async () => {
+    const upstream = new MemStore<TxtEntry>();
+    const own = new MemStore<TxtEntry>();
+    const tracker = runTracker(upstream, own, identityUpper, { name: "t", pauseMs: 0 });
+    try {
+      await upstream.put([{ uri: "a", meta: { text: "a" } }]);
+      await waitFor(() => own.all().length === 1);
+      expect(own.all()[0]?.meta?.text).toBe("A");
+
+      await upstream.put([{ uri: "b", meta: { text: "b" } }]);
+      await waitFor(() => own.all().length === 2);
+    } finally {
+      await tracker.close();
+    }
+  });
+
+  it("propagates tombstones without invoking the transform", async () => {
+    const upstream = new MemStore<TxtEntry>();
+    const own = new MemStore<TxtEntry>();
+    let transformCalls = 0;
+    const spy: Transform<TxtEntry, TxtEntry> = async (up) => {
+      transformCalls += 1;
+      if (!up.meta) return null;
+      return { uri: up.uri, meta: { text: up.meta.text } };
+    };
+
+    // Live entry for "a" is processed; later the tombstone replaces it upstream.
+    await upstream.put([{ uri: "a", meta: { text: "a" } }]);
+    const tracker = runTracker(upstream, own, spy, { name: "t", pauseMs: 0 });
+    try {
+      await tracker.catchUp();
+      expect(transformCalls).toBe(1);
+      expect(own.all().at(-1)?.meta?.text).toBe("a");
+
+      // Second put replaces "a" with a tombstone in the manifest-style upstream.
+      await upstream.put([{ uri: "a", tombstone: true }]);
+      const before = transformCalls;
+      await tracker.catchUp();
+      // Transform NOT invoked for the tombstone — no additional calls.
+      expect(transformCalls).toBe(before);
+      expect(own.all().at(-1)?.tombstone).toBe(true);
+      expect(own.all().at(-1)?.meta).toBeUndefined();
+    } finally {
+      await tracker.close();
+    }
+  });
+
+  it("accepts transform-emitted tombstones", async () => {
+    const upstream = new MemStore<TxtEntry>();
+    const own = new MemStore<TxtEntry>();
+    const maybeTombstone: Transform<TxtEntry, TxtEntry> = async (up) => {
+      if (up.meta?.text === "drop") return { uri: up.uri, tombstone: true };
+      return { uri: up.uri, meta: { text: up.meta?.text ?? "" } };
+    };
+
+    await upstream.put([
+      { uri: "a", meta: { text: "a" } },
+      { uri: "b", meta: { text: "drop" } },
+    ]);
+
+    const tracker = runTracker(upstream, own, maybeTombstone, { name: "t", pauseMs: 0 });
+    try {
+      await tracker.catchUp();
+      const byUri = new Map(own.all().map((e) => [e.uri, e]));
+      expect(byUri.get("a")?.meta?.text).toBe("a");
+      expect(byUri.get("b")?.tombstone).toBe(true);
+      expect(byUri.get("b")?.meta).toBeUndefined();
+    } finally {
+      await tracker.close();
+    }
+  });
+
+  it("coalesces notifications that arrive during a drain", async () => {
+    const upstream = new MemStore<TxtEntry>();
+    const own = new MemStore<TxtEntry>();
+    let drainStarts = 0;
+    const slow: Transform<TxtEntry, TxtEntry> = async (up) => {
+      drainStarts += 1;
+      await new Promise((r) => setTimeout(r, 5));
+      return { uri: up.uri, meta: { text: up.meta?.text ?? "" } };
+    };
+
+    const tracker = runTracker(upstream, own, slow, {
+      name: "t",
+      batchSize: 1,
+      pauseMs: 0,
+    });
+    try {
+      await upstream.put([{ uri: "a", meta: { text: "a" } }]);
+      // Fire more writes while the first drain is still running.
+      await upstream.put([{ uri: "b", meta: { text: "b" } }]);
+      await upstream.put([{ uri: "c", meta: { text: "c" } }]);
+      await waitFor(() => own.all().length === 3);
+      // drainStarts equals the number of entries processed (one transform call each),
+      // not the number of notifications fired — coalescing means we don't spawn one
+      // drain per notification.
+      expect(drainStarts).toBe(3);
+    } finally {
+      await tracker.close();
+    }
+  });
+
+  it("aborts mid-drain at the next entry without committing the partial batch", async () => {
+    const upstream = new MemStore<TxtEntry>();
+    const own = new MemStore<TxtEntry>();
+    const ctrl = new AbortController();
+    const slow: Transform<TxtEntry, TxtEntry> = async (up) => {
+      if (up.uri === "b") ctrl.abort();
+      return { uri: up.uri, meta: { text: up.meta?.text ?? "" } };
+    };
+
+    await upstream.put([
+      { uri: "a", meta: { text: "a" } },
+      { uri: "b", meta: { text: "b" } },
+      { uri: "c", meta: { text: "c" } },
+    ]);
+
+    const tracker = runTracker(upstream, own, slow, {
+      name: "t",
+      batchSize: 10,
+      pauseMs: 0,
+      signal: ctrl.signal,
+    });
+    try {
+      await tracker.catchUp();
+      // Abort fired mid-batch — no entries from the aborted batch were committed.
+      expect(own.all().length).toBe(0);
+      // Cursor did not advance.
+      expect(await upstream.cursor("t")).toBe(0);
+    } finally {
+      await tracker.close();
+    }
+  });
+
+  it("isolates per-URI errors as meta.error, advancing the cursor past them", async () => {
+    const upstream = new MemStore<TxtEntry>();
+    const own = new MemStore<Entry<{ text?: string; error?: string }>>();
+    const flaky: Transform<TxtEntry, Entry<{ text?: string; error?: string }>> = async (up) => {
+      if (up.uri === "bad") throw new Error("boom");
+      return { uri: up.uri, meta: { text: up.meta?.text ?? "" } };
+    };
+
+    await upstream.put([
+      { uri: "a", meta: { text: "a" } },
+      { uri: "bad", meta: { text: "x" } },
+      { uri: "c", meta: { text: "c" } },
+    ]);
+
+    const tracker = runTracker(upstream, own, flaky, { name: "t", pauseMs: 0 });
+    try {
+      await tracker.catchUp();
+      const byUri = new Map(own.all().map((e) => [e.uri, e]));
+      expect(byUri.get("a")?.meta?.text).toBe("a");
+      expect(byUri.get("c")?.meta?.text).toBe("c");
+      expect(byUri.get("bad")?.meta?.error).toBe("boom");
+      const topUpstream = Math.max(...upstream.all().map((e) => e.stamp));
+      expect(await upstream.cursor("t")).toBe(topUpstream);
+    } finally {
+      await tracker.close();
+    }
+  });
+});
diff --git a/packages/content-manager/tsconfig.json b/packages/content-pipeline/tsconfig.json
similarity index 95%
rename from packages/content-manager/tsconfig.json
rename to packages/content-pipeline/tsconfig.json
index 6dbcc68..9dabea1 100644
--- a/packages/content-manager/tsconfig.json
+++ b/packages/content-pipeline/tsconfig.json
@@ -3,7 +3,7 @@
     "target": "ES2022",
     "module": "Preserve",
     "moduleResolution": "Bundler",
-    "lib": ["ESNext"],
+    "lib": ["ESNext", "DOM"],
     "strict": true,
     "skipLibCheck": true,
     "verbatimModuleSyntax": true,
diff --git a/packages/content-scanner/README.md b/packages/content-scanner/README.md
deleted file mode 100644
index acb3480..0000000
--- a/packages/content-scanner/README.md
+++ /dev/null
@@ -1,27 +0,0 @@
-# @statewalker/content-scanner
-
-Content scanner: walks a `@statewalker/webrun-files` tree and streams `@statewalker/content-blocks` through extractors into an indexer.
-
-## Installation
-
-```sh
-pnpm add @statewalker/content-scanner
-```
-
-## Usage
-
-```ts
-import { scanFileTree } from "@statewalker/content-scanner";
-
-for await (const block of scanFileTree(root, { extractors, chunker })) {
-  await indexer.add(block);
-}
-```
-
-## API
-
-- `scanFileTree(root, options)` — async-iterable of blocks.
-
-## Related
-
-- `@statewalker/content-extractors`, `@statewalker/content-manager`, `@statewalker/indexer-api`.
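The legacy scanner classes removed below are replaced by composing `runTracker` with the transforms introduced above. A minimal sketch of that wiring, assuming the store options shown in the tests (import paths and state prefixes are illustrative; `createPipeline()` in `src/pipeline.ts` performs this composition for real, including the embed/vec layers):

```ts
// Sketch only — two of the pipeline's layers, wired by hand.
import { createDefaultRegistry } from "@statewalker/content-extractors/extractors";
import { MemFilesApi } from "@statewalker/webrun-files-mem";
import { BlobStore } from "../src/stores/blob.js";
import { msgpackCodec } from "../src/stores/codec-msgpack.js";
import { JsonManifestStore } from "../src/stores/json-manifest.js";
import { runTracker } from "../src/tracker.js";
import { extract } from "../src/transforms/extract.js";
import { split } from "../src/transforms/split.js";
import type { ChunksEntry, ContentEntry, FileEntry } from "../src/types.js";

const files = new MemFilesApi();
const extractors = createDefaultRegistry();

// One store per layer; each tracker keeps its own named cursor on its upstream.
const filesStore = new JsonManifestStore<FileEntry>({ files, prefix: "/.state/files" });
const contentStore = new BlobStore<ContentEntry>({ files, prefix: "/.state/text", codec: msgpackCodec() });
const chunksStore = new BlobStore<ChunksEntry>({ files, prefix: "/.state/chunks", codec: msgpackCodec() });

// (A scanFiles step would populate filesStore from the FilesApi tree first.)
const toContent = runTracker(filesStore, contentStore, extract(files, extractors), { name: "extract" });
const toChunks = runTracker(contentStore, chunksStore, split({ targetChars: 200 }), { name: "split" });

// Drain whatever is newer than each persisted cursor; afterwards, new writes
// cascade automatically via onStampUpdate until the trackers are closed.
await toContent.catchUp();
await toChunks.catchUp();
await toChunks.close();
await toContent.close();
```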
diff --git a/packages/content-scanner/src/content-embedder-scanner.ts b/packages/content-scanner/src/content-embedder-scanner.ts deleted file mode 100644 index c231eae..0000000 --- a/packages/content-scanner/src/content-embedder-scanner.ts +++ /dev/null @@ -1,69 +0,0 @@ -import { decodeMsgpack, encodeFloat32Arrays } from "@statewalker/webrun-msgpack"; -import { collect } from "@statewalker/webrun-streams"; -import type { ChunkData } from "./content-splitter-scanner.js"; -import type { ScanStore, Update } from "./scan-store.js"; -import type { ScannerOptions } from "./scanner.js"; -import { Scanner } from "./scanner.js"; - -export type EmbedFn = (text: string) => Promise; - -export type ContentEmbedderOptions = ScannerOptions & { - /** Function to generate embeddings for a text chunk. */ - embed: EmbedFn; - /** Model name for metadata. */ - model?: string; - /** Embedding dimensionality for metadata. */ - dimensions?: number; -}; - -/** - * Scanner that generates embeddings for content chunks. - * - * Reads chunks from the upstream "chunks" store via msgpack stream, - * generates embeddings for each chunk, and stores them as Float32Array - * stream in the "embeddings" store. - */ -export class ContentEmbedderScanner extends Scanner { - private readonly embed: EmbedFn; - private readonly model: string; - private readonly dimensions: number; - - constructor(store: ScanStore, options: ContentEmbedderOptions) { - super(store, options); - this.embed = options.embed; - this.model = options.model ?? "unknown"; - this.dimensions = options.dimensions ?? 0; - } - - async processEntry(upstream: Update): Promise { - if (!upstream.content) return null; - - // Stream-decode chunks from upstream msgpack frames - const chunks = await collect(decodeMsgpack(upstream.content())); - if (chunks.length === 0) return null; - - // Generate embeddings for each chunk - const embeddings: Float32Array[] = []; - for (const chunk of chunks) { - const embedding = await this.embed(chunk.content); - embeddings.push(embedding); - } - - return { - uri: upstream.uri, - stamp: upstream.stamp, - meta: { - model: this.model, - dimensions: this.dimensions || (embeddings[0]?.length ?? 0), - chunkCount: embeddings.length, - }, - content: () => encodeFloat32Arrays(toAsync(embeddings)), - }; - } - - async removeEntry(_uri: string): Promise {} -} - -async function* toAsync(items: T[]): AsyncGenerator { - for (const item of items) yield item; -} diff --git a/packages/content-scanner/src/content-extractor-scanner.ts b/packages/content-scanner/src/content-extractor-scanner.ts deleted file mode 100644 index 5d7762f..0000000 --- a/packages/content-scanner/src/content-extractor-scanner.ts +++ /dev/null @@ -1,62 +0,0 @@ -import type { ExtractorRegistry } from "@statewalker/content-extractors"; -import type { FilesApi } from "@statewalker/webrun-files"; -import type { ScanStore, Update } from "./scan-store.js"; -import type { ScannerOptions } from "./scanner.js"; -import { Scanner } from "./scanner.js"; - -export type ContentExtractorScannerOptions = ScannerOptions & { - /** FilesApi to read raw file bytes from. */ - files: FilesApi; - /** Registry of content extractors (PDF, docx, markdown, txt, etc.). */ - extractors: ExtractorRegistry; -}; - -/** - * Scanner that extracts text content from files. - * - * Reads file URIs from the upstream "files" store, reads file bytes - * from `FilesApi`, extracts text using `ExtractorRegistry`, and stores - * the extracted text as binary content in the "content" store. 
- */ -export class ContentExtractorScanner extends Scanner { - private readonly files: FilesApi; - private readonly extractors: ExtractorRegistry; - - constructor(store: ScanStore, options: ContentExtractorScannerOptions) { - super(store, options); - this.files = options.files; - this.extractors = options.extractors; - } - - async processEntry(upstream: Update): Promise { - const uri = upstream.uri; - const extractor = this.extractors.get(uri); - if (!extractor) return null; // no extractor for this file type - - const bytes = this.files.read(uri); - const result = await extractor(bytes); - const text = typeof result === "string" ? result : String(result); - const encoded = new TextEncoder().encode(text); - - return { - uri, - stamp: upstream.stamp, - meta: { format: detectFormat(uri) }, - async *content() { - yield encoded; - }, - }; - } - - async removeEntry(_uri: string): Promise { - // Store soft-delete handles cleanup - } -} - -function detectFormat(path: string): string { - if (path.endsWith(".md")) return "markdown"; - if (path.endsWith(".txt")) return "text"; - if (path.endsWith(".pdf")) return "pdf"; - if (path.endsWith(".docx")) return "docx"; - return "unknown"; -} diff --git a/packages/content-scanner/src/content-fts-indexer-scanner.ts b/packages/content-scanner/src/content-fts-indexer-scanner.ts deleted file mode 100644 index cad4827..0000000 --- a/packages/content-scanner/src/content-fts-indexer-scanner.ts +++ /dev/null @@ -1,59 +0,0 @@ -import type { DocumentPath, Index, IndexedBlock } from "@statewalker/indexer-api"; -import { decodeMsgpack } from "@statewalker/webrun-msgpack"; -import { collect } from "@statewalker/webrun-streams"; -import type { ChunkData } from "./content-splitter-scanner.js"; -import type { ScanStore, Update } from "./scan-store.js"; -import type { ScannerOptions } from "./scanner.js"; -import { Scanner } from "./scanner.js"; - -export type ContentFtsIndexerOptions = ScannerOptions & { - /** The indexer-api Index to add/remove documents from. */ - index: Index; -}; - -/** - * Scanner that maintains a full-text search index from content chunks. - * - * Reads chunks from the upstream "chunks" store via msgpack stream, - * creates FullTextBlock entries, and delegates to the indexer-api Index. - * Tracks indexed URIs in its own "fts-index" store (metadata only). - */ -export class ContentFtsIndexerScanner extends Scanner { - private readonly index: Index; - - constructor(store: ScanStore, options: ContentFtsIndexerOptions) { - super(store, options); - this.index = options.index; - } - - async processEntry(upstream: Update): Promise { - if (!upstream.content) return null; - - const chunks = await collect(decodeMsgpack(upstream.content())); - if (chunks.length === 0) return null; - - const path = uriToDocPath(upstream.uri); - const blocks: IndexedBlock[] = chunks.map((c) => ({ - path, - blockId: `${path}:${c.index}`, - content: c.content, - })); - - await this.index.deleteDocuments([{ path }]); - await this.index.addDocument(blocks); - - return { - uri: upstream.uri, - stamp: upstream.stamp, - }; - } - - async removeEntry(uri: string): Promise { - const path = uriToDocPath(uri); - await this.index.deleteDocuments([{ path }]); - } -} - -function uriToDocPath(uri: string): DocumentPath { - return (uri.startsWith("/") ? 
uri : `/${uri}`) as DocumentPath; -} diff --git a/packages/content-scanner/src/content-scanner.ts b/packages/content-scanner/src/content-scanner.ts deleted file mode 100644 index bf9b37d..0000000 --- a/packages/content-scanner/src/content-scanner.ts +++ /dev/null @@ -1,238 +0,0 @@ -import type { FilesApi } from "@statewalker/webrun-files"; -import { readFile } from "@statewalker/webrun-files"; -import { encodeUri } from "./file-uri.js"; -import { createScanEvent } from "./scan-events.js"; -import { computeSha1 } from "./sha1.js"; -import { TrackingStore } from "./tracking-store.js"; -import type { CollectionConfig, FileMetadata, ScanMessage, ScanOptions } from "./types.js"; - -/** - * Separates tracking storage from content storage so the scanner's metadata - * can live on a different FilesApi instance (e.g., a local cache) than the - * content being scanned. - */ -export type ContentScannerOptions = { - /** Dedicated FilesApi for tracking data -- kept separate from scanned content so metadata I/O never interferes with the content file system. */ - trackingFiles: FilesApi; - /** Namespaces all tracking files on disk so multiple scanner instances (or versions) can coexist under the same FilesApi. Defaults to `"cs"`. */ - prefix?: string; -}; - -/** - * Central facade that hides the mechanics of file-tree walking, hashing, - * and metadata persistence behind a stream-oriented API. Callers register - * collections and consume AsyncGenerators of ContentSection events -- - * they never touch the TrackingStore or file-system details directly. - * This keeps scanning concerns isolated from the rest of the application. - */ -export class ContentScanner { - private readonly store: TrackingStore; - private readonly collections = new Map(); - - constructor(options: ContentScannerOptions) { - this.store = new TrackingStore(options.trackingFiles, options.prefix ?? "cs"); - } - - // -- Collection management ------------------------------------------- - - /** - * Registers a collection so subsequent `scan()` calls can track its files. - * Collections are scoped by ID -- the same scanner instance can manage - * multiple independent file trees without interference. - */ - addCollection(params: { config: CollectionConfig }): void { - this.collections.set(params.config.collectionId, params.config); - } - - /** - * Tears down a collection completely -- both the in-memory registration and - * all persisted tracking records. Without this, stale metadata would - * accumulate and `getChanges` would keep reporting phantom files. - */ - async removeCollection(params: { collectionId: string }): Promise { - this.collections.delete(params.collectionId); - await this.store.deleteByCollection({ - collectionId: params.collectionId, - }); - } - - /** Returns a snapshot so callers can inspect registrations without mutating the internal map. */ - getCollections(): CollectionConfig[] { - return [...this.collections.values()]; - } - - // -- Scanning -------------------------------------------------------- - - /** - * Walks a single collection's file tree and yields events for every detected - * change. This is the core operation -- it compares live file-system state - * against persisted metadata to detect additions, modifications, and removals - * in a single pass. Uses an AsyncGenerator so callers can process events - * incrementally without buffering the entire result set in memory. - * - * @throws Error if the collection ID is not registered. 
- */ - async *scan(params: { - collectionId: string; - options?: ScanOptions; - }): AsyncGenerator { - const config = this.collections.get(params.collectionId); - if (!config) { - throw new Error(`Collection not found: ${params.collectionId}`); - } - - const options = params.options; - const batchSize = options?.batchSize ?? 50; - const sleepMs = options?.sleepMs ?? 0; - const filter = options?.filter; - const skipHash = options?.skipHash ?? false; - - const scanTime = new Date().toISOString(); - const seenUris = new Set(); - let scannedCount = 0; - - yield createScanEvent({ - type: "scan-started", - collectionId: params.collectionId, - }); - - for await (const info of config.files.list(config.root, { - recursive: true, - })) { - if (info.kind !== "file") continue; - - const filePath = info.path; - if (filter && !filter(filePath)) continue; - - const uri = encodeUri(config.collectionId, filePath); - seenUris.add(uri); - - const size = info.size ?? 0; - const lastModified = info.lastModified ?? 0; - - const existing = await this.store.get({ uri }); - - let changed = false; - let hash = ""; - - if (!existing || existing.removalTime !== null) { - // New file or previously removed file reappearing - if (!skipHash) { - const data = await readFile(config.files, filePath); - hash = await computeSha1(data); - } - changed = true; - } else if (existing.size !== size || existing.lastModified !== lastModified) { - // Metadata changed — check content - if (!skipHash) { - const data = await readFile(config.files, filePath); - hash = await computeSha1(data); - changed = hash !== existing.hash; - } else { - changed = true; - } - } - - // Always update scan time - const metadata: FileMetadata = { - uri, - collectionId: config.collectionId, - path: filePath, - hash: changed ? hash : (existing?.hash ?? hash), - size, - lastModified, - scanTime, - removalTime: null, - }; - await this.store.set({ metadata }); - - if (changed) { - yield createScanEvent({ - type: "content-changed", - uri, - collectionId: config.collectionId, - }); - } - - scannedCount++; - - if (sleepMs > 0 && scannedCount % batchSize === 0) { - await sleep(sleepMs); - } - } - - // Mark unseen active files in this collection as removed - for await (const meta of this.store.listByCollection({ - collectionId: params.collectionId, - })) { - if (meta.removalTime === null && !seenUris.has(meta.uri)) { - const updated: FileMetadata = { - ...meta, - scanTime, - removalTime: scanTime, - }; - await this.store.set({ metadata: updated }); - - yield createScanEvent({ - type: "content-removed", - uri: meta.uri, - collectionId: params.collectionId, - }); - } - } - - yield createScanEvent({ - type: "scan-done", - collectionId: params.collectionId, - }); - } - - /** - * Convenience wrapper when callers don't need per-collection control -- - * scans everything in registration order and merges the event streams. - */ - async *scanAll(params?: { options?: ScanOptions }): AsyncGenerator { - for (const config of this.collections.values()) { - yield* this.scan({ - collectionId: config.collectionId, - options: params?.options, - }); - } - } - - /** - * Replays changes since a caller-provided checkpoint, enabling poll-based - * consumers that weren't listening during the original scan. The caller - * stores the last-seen timestamp and passes it here to get only newer events. 
- */ - async *getChanges(params: { collectionId: string; since: string }): AsyncGenerator { - const sinceTime = new Date(params.since).getTime(); - for await (const meta of this.store.listByCollection({ - collectionId: params.collectionId, - })) { - const metaScanTime = new Date(meta.scanTime).getTime(); - if (metaScanTime > sinceTime) { - const type = meta.removalTime !== null ? "content-removed" : "content-changed"; - yield createScanEvent({ - type, - uri: meta.uri, - collectionId: params.collectionId, - }); - } - } - } - - /** - * Garbage-collects stale removal records. Without periodic cleanup, - * the tracking store grows unboundedly as files are deleted over time. - * The `before` threshold lets callers keep recent removals visible to - * `getChanges` consumers while purging older ones. - */ - async cleanupRemoved(params: { before: string }): Promise { - return this.store.deleteRemovedBefore({ before: params.before }); - } -} - -function sleep(ms: number): Promise { - return new Promise((resolve) => setTimeout(resolve, ms)); -} diff --git a/packages/content-scanner/src/content-splitter-scanner.ts b/packages/content-scanner/src/content-splitter-scanner.ts deleted file mode 100644 index be39ff4..0000000 --- a/packages/content-scanner/src/content-splitter-scanner.ts +++ /dev/null @@ -1,59 +0,0 @@ -import type { ChunkOptions } from "@statewalker/indexer-chunker"; -import { chunkMarkdown } from "@statewalker/indexer-chunker"; -import { encodeMsgpack } from "@statewalker/webrun-msgpack"; -import { collectString, decodeText } from "@statewalker/webrun-streams"; -import type { ScanStore, Update } from "./scan-store.js"; -import type { ScannerOptions } from "./scanner.js"; -import { Scanner } from "./scanner.js"; - -export type ContentSplitterOptions = ScannerOptions & { - /** Chunking configuration. */ - chunkOptions: ChunkOptions; -}; - -/** Chunk data shape stored via msgpack. */ -export type ChunkData = { index: number; content: string }; - -/** - * Scanner that splits extracted content into chunks. - * - * Reads content from the upstream "content" store, splits it using - * `chunkMarkdown()` from `@repo/indexer-chunker`, and stores serialized - * chunks as msgpack frames in the "chunks" store. 
-
-/**
- * Scanner that splits extracted content into chunks.
- *
- * Reads content from the upstream "content" store, splits it using
- * `chunkMarkdown()` from `@statewalker/indexer-chunker`, and stores serialized
- * chunks as msgpack frames in the "chunks" store.
- */
-export class ContentSplitterScanner extends Scanner {
- private readonly chunkOptions: ChunkOptions;
-
- constructor(store: ScanStore, options: ContentSplitterOptions) {
- super(store, options);
- this.chunkOptions = options.chunkOptions;
- }
-
- async processEntry(upstream: Update): Promise<Update | null> {
- if (!upstream.content) return null;
- const text = await collectString(decodeText(upstream.content()));
- if (!text) return null;
-
- const chunks = chunkMarkdown(text, this.chunkOptions);
- const payload: ChunkData[] = chunks.map((c) => ({
- index: c.index,
- content: c.content,
- }));
-
- return {
- uri: upstream.uri,
- stamp: upstream.stamp,
- meta: {
- chunkCount: chunks.length,
- targetChars: this.chunkOptions.targetChars,
- },
- content: () => encodeMsgpack(toAsync(payload)),
- };
- }
-
- async removeEntry(_uri: string): Promise<void> {}
-}
-
-async function* toAsync<T>(items: T[]): AsyncGenerator<T> {
- for (const item of items) yield item;
-}
diff --git a/packages/content-scanner/src/content-vector-indexer-scanner.ts b/packages/content-scanner/src/content-vector-indexer-scanner.ts
deleted file mode 100644
index 3003bd1..0000000
--- a/packages/content-scanner/src/content-vector-indexer-scanner.ts
+++ /dev/null
@@ -1,58 +0,0 @@
-import type { DocumentPath, Index, IndexedBlock } from "@statewalker/indexer-api";
-import { decodeFloat32Arrays } from "@statewalker/webrun-msgpack";
-import { collect } from "@statewalker/webrun-streams";
-import type { ScanStore, Update } from "./scan-store.js";
-import type { ScannerOptions } from "./scanner.js";
-import { Scanner } from "./scanner.js";
-
-export type ContentVectorIndexerOptions = ScannerOptions & {
- /** The indexer-api Index to add/remove documents from. */
- index: Index;
-};
-
-/**
- * Scanner that maintains a vector search index from embeddings.
- *
- * Reads embeddings from the upstream "embeddings" store via Float32Array
- * stream, creates EmbeddingBlock entries, and delegates to the indexer-api
- * Index. Tracks indexed URIs in its own "vec-index" store (metadata only).
- */
-export class ContentVectorIndexerScanner extends Scanner {
- private readonly index: Index;
-
- constructor(store: ScanStore, options: ContentVectorIndexerOptions) {
- super(store, options);
- this.index = options.index;
- }
-
- async processEntry(upstream: Update): Promise<Update | null> {
- if (!upstream.content) return null;
-
- const embeddings = await collect(decodeFloat32Arrays(upstream.content()));
- if (embeddings.length === 0) return null;
-
- const path = uriToDocPath(upstream.uri);
- const blocks: IndexedBlock[] = embeddings.map((emb, i) => ({
- path,
- blockId: `${path}:${i}`,
- embedding: emb,
- }));
-
- await this.index.deleteDocuments([{ path }]);
- await this.index.addDocument(blocks);
-
- return {
- uri: upstream.uri,
- stamp: upstream.stamp,
- };
- }
-
- async removeEntry(uri: string): Promise<void> {
- const path = uriToDocPath(uri);
- await this.index.deleteDocuments([{ path }]);
- }
-}
-
-function uriToDocPath(uri: string): DocumentPath {
- return (uri.startsWith("/") ? uri : `/${uri}`) as DocumentPath;
-}
diff --git a/packages/content-scanner/src/file-uri.ts b/packages/content-scanner/src/file-uri.ts
deleted file mode 100644
index 2a99b36..0000000
--- a/packages/content-scanner/src/file-uri.ts
+++ /dev/null
@@ -1,22 +0,0 @@
-/**
- * Encode a collection ID and path into a URI string.
- * Format: "{collectionId}:{path}" - */ -export function encodeUri(collectionId: string, path: string): string { - return `${collectionId}:${path}`; -} - -/** - * Parse a URI string into its collection ID and path components. - * Splits on the first colon only, so paths may contain colons. - */ -export function parseUri(uri: string): { collectionId: string; path: string } { - const idx = uri.indexOf(":"); - if (idx === -1) { - throw new Error(`Invalid file URI (no colon): ${uri}`); - } - return { - collectionId: uri.slice(0, idx), - path: uri.slice(idx + 1), - }; -} diff --git a/packages/content-scanner/src/files-scan-registry.ts b/packages/content-scanner/src/files-scan-registry.ts deleted file mode 100644 index 87857df..0000000 --- a/packages/content-scanner/src/files-scan-registry.ts +++ /dev/null @@ -1,118 +0,0 @@ -import type { FilesApi } from "@statewalker/webrun-files"; -import { readText, writeText } from "@statewalker/webrun-files"; -import { FilesScanStore } from "./files-scan-store.js"; -import type { ScanRegistry, ScanRegistryOptions, ScanStore } from "./scan-store.js"; - -const REGISTRY_FILE = "_registry.json"; - -type RegistryJson = { - stores: string[]; -}; - -export class FilesScanRegistry implements ScanRegistry { - private readonly files: FilesApi; - private readonly prefix: string; - private readonly openStores = new Map(); - private storeNames: string[] | null = null; - - constructor(options: ScanRegistryOptions) { - this.files = options.files; - this.prefix = options.prefix ?? "scan"; - } - - private get registryPath(): string { - return `${this.prefix}/${REGISTRY_FILE}`; - } - - private async loadNames(): Promise { - if (this.storeNames) return this.storeNames; - if (await this.files.exists(this.registryPath)) { - const text = await readText(this.files, this.registryPath); - if (text) { - const data = JSON.parse(text) as RegistryJson; - this.storeNames = data.stores; - return this.storeNames; - } - } - this.storeNames = []; - return this.storeNames; - } - - private async saveNames(): Promise { - if (!this.storeNames) return; - await writeText( - this.files, - this.registryPath, - JSON.stringify({ stores: this.storeNames } satisfies RegistryJson), - ); - } - - async createStore(name: string): Promise { - const names = await this.loadNames(); - if (names.includes(name)) { - throw new Error(`Store already exists: ${name}`); - } - names.push(name); - await this.saveNames(); - - const store = new FilesScanStore(name, this.files, `${this.prefix}/${name}`); - this.openStores.set(name, store); - return store; - } - - async getStore(name: string): Promise { - const existing = this.openStores.get(name); - if (existing) return existing; - - const names = await this.loadNames(); - if (!names.includes(name)) return null; - - const store = new FilesScanStore(name, this.files, `${this.prefix}/${name}`); - this.openStores.set(name, store); - return store; - } - - async hasStore(name: string): Promise { - const names = await this.loadNames(); - return names.includes(name); - } - - async getStoreNames(): Promise { - return [...(await this.loadNames())]; - } - - async deleteStore(name: string): Promise { - const names = await this.loadNames(); - const idx = names.indexOf(name); - if (idx === -1) { - throw new Error(`Store not found: ${name}`); - } - names.splice(idx, 1); - this.openStores.delete(name); - await this.saveNames(); - - // Remove store directory recursively - const storeDir = `${this.prefix}/${name}`; - if (await this.files.exists(storeDir)) { - // Collect all files first, 
then delete - const paths: string[] = []; - for await (const info of this.files.list(storeDir, { recursive: true })) { - if (info.kind === "file") { - paths.push(info.path); - } - } - for (const p of paths) { - await this.files.remove(p); - } - } - } - - async flush(): Promise { - await this.saveNames(); - } - - async close(): Promise { - this.openStores.clear(); - this.storeNames = null; - } -} diff --git a/packages/content-scanner/src/files-scan-store.ts b/packages/content-scanner/src/files-scan-store.ts deleted file mode 100644 index c359ee5..0000000 --- a/packages/content-scanner/src/files-scan-store.ts +++ /dev/null @@ -1,275 +0,0 @@ -import { sha1Uuid } from "@statewalker/shared-ids"; -import type { FilesApi } from "@statewalker/webrun-files"; -import { readText, writeText } from "@statewalker/webrun-files"; -import type { ListParams, ScanStore, Stamp, Update } from "./scan-store.js"; - -/** Shape of the per-entry JSON file on disk. */ -type EntryJson = { - uri: string; - stamp: string; // ISO - removed?: string; // ISO - meta?: Record; -}; - -/** Shape of the _index.json file. */ -type IndexJson = { - lastScan: string | null; // ISO - entries: Record; -}; - -const INDEX_FILE = "_index.json"; -const BATCH_SIZE = 50; - -/** Derive a deterministic storage path from a URI. */ -async function pathFor(prefix: string, uri: string): Promise { - const hash = await sha1Uuid(uri); - const dd = hash.slice(0, 2); - return `${prefix}/${dd}/${hash}`; -} - -function stampMatches( - stamp: Stamp, - include?: Stamp | [Stamp, Stamp], - exclude?: Stamp | [Stamp, Stamp], -): boolean { - const t = stamp.getTime(); - if (include !== undefined) { - if (Array.isArray(include)) { - if (t < include[0].getTime() || t > include[1].getTime()) return false; - } else { - if (t !== include.getTime()) return false; - } - } - if (exclude !== undefined) { - if (Array.isArray(exclude)) { - if (t >= exclude[0].getTime() && t <= exclude[1].getTime()) return false; - } else { - if (t === exclude.getTime()) return false; - } - } - return true; -} - -function uriMatches(uri: string, pattern?: string): boolean { - if (!pattern) return true; - if (pattern.endsWith("*")) { - return uri.startsWith(pattern.slice(0, -1)); - } - return uri === pattern; -} - -function entryToUpdate(entry: EntryJson, files: FilesApi, basePath: string): Update { - const update: Update = { - uri: entry.uri, - stamp: new Date(entry.stamp), - }; - if (entry.removed) { - update.removed = new Date(entry.removed); - } - if (entry.meta) { - update.meta = entry.meta; - } - const binPath = `${basePath}.bin`; - update.content = async function* () { - if (await files.exists(binPath)) { - yield* files.read(binPath); - } - }; - return update; -} - -export class FilesScanStore implements ScanStore { - readonly name: string; - private readonly files: FilesApi; - private readonly prefix: string; - private index: IndexJson | null = null; - - constructor(name: string, files: FilesApi, prefix: string) { - this.name = name; - this.files = files; - this.prefix = prefix; - } - - private get indexPath(): string { - return `${this.prefix}/${INDEX_FILE}`; - } - - private async loadIndex(): Promise { - if (this.index) return this.index; - if (await this.files.exists(this.indexPath)) { - const text = await readText(this.files, this.indexPath); - if (text) { - this.index = JSON.parse(text) as IndexJson; - return this.index; - } - } - this.index = { lastScan: null, entries: {} }; - return this.index; - } - - private async saveIndex(): Promise { - if (!this.index) return; - await 
writeText(this.files, this.indexPath, JSON.stringify(this.index)); - } - - async *store(updates: Iterable | AsyncIterable): AsyncGenerator { - const idx = await this.loadIndex(); - let count = 0; - for await (const update of updates) { - const basePath = await pathFor(this.prefix, update.uri); - const jsonPath = `${basePath}.json`; - - const entry: EntryJson = { - uri: update.uri, - stamp: update.stamp.toISOString(), - }; - if (update.removed) { - entry.removed = update.removed.toISOString(); - } - if (update.meta) { - entry.meta = update.meta; - } - await writeText(this.files, jsonPath, JSON.stringify(entry)); - - if (update.content) { - const binPath = `${basePath}.bin`; - await this.files.write(binPath, update.content()); - } - - idx.entries[update.uri] = { - stamp: entry.stamp, - ...(entry.removed ? { removed: entry.removed } : {}), - }; - - count++; - if (count % BATCH_SIZE === 0) { - await this.saveIndex(); - } - - yield entryToUpdate(entry, this.files, basePath); - } - await this.saveIndex(); - } - - async *list(params?: ListParams): AsyncGenerator { - const idx = await this.loadIndex(); - for (const [uri, info] of Object.entries(idx.entries)) { - if (!uriMatches(uri, params?.uri)) continue; - const stamp = new Date(info.stamp); - if (!stampMatches(stamp, params?.include, params?.exclude)) continue; - - const basePath = await pathFor(this.prefix, uri); - const jsonPath = `${basePath}.json`; - if (!(await this.files.exists(jsonPath))) continue; - - const text = await readText(this.files, jsonPath); - if (!text) continue; - const entry = JSON.parse(text) as EntryJson; - yield entryToUpdate(entry, this.files, basePath); - } - } - - async *remove(params?: ListParams): AsyncGenerator { - const idx = await this.loadIndex(); - const now = new Date().toISOString(); - const toRemove: string[] = []; - - for (const [uri, info] of Object.entries(idx.entries)) { - if (info.removed) continue; // already removed - if (!uriMatches(uri, params?.uri)) continue; - const stamp = new Date(info.stamp); - if (!stampMatches(stamp, params?.include, params?.exclude)) continue; - toRemove.push(uri); - } - - for (const uri of toRemove) { - const basePath = await pathFor(this.prefix, uri); - const jsonPath = `${basePath}.json`; - - if (await this.files.exists(jsonPath)) { - const text = await readText(this.files, jsonPath); - if (text) { - const entry = JSON.parse(text) as EntryJson; - entry.removed = now; - await writeText(this.files, jsonPath, JSON.stringify(entry)); - - idx.entries[uri] = { - stamp: entry.stamp, - removed: now, - }; - - yield entryToUpdate(entry, this.files, basePath); - } - } - } - await this.saveIndex(); - } - - async getLastScan(): Promise { - const idx = await this.loadIndex(); - return idx.lastScan ? 
new Date(idx.lastScan) : null; - } - - async setLastScan(stamp: Stamp): Promise { - const idx = await this.loadIndex(); - idx.lastScan = stamp.toISOString(); - await this.saveIndex(); - } - - async prune(before: Stamp): Promise { - const idx = await this.loadIndex(); - const beforeTime = before.getTime(); - let count = 0; - const toDelete: string[] = []; - - for (const [uri, info] of Object.entries(idx.entries)) { - if (info.removed && new Date(info.removed).getTime() < beforeTime) { - toDelete.push(uri); - } - } - - for (const uri of toDelete) { - const basePath = await pathFor(this.prefix, uri); - await this.files.remove(`${basePath}.json`); - await this.files.remove(`${basePath}.bin`); - delete idx.entries[uri]; - count++; - } - - if (count > 0) { - await this.saveIndex(); - } - return count; - } - - async rebuildIndex(): Promise { - const oldLastScan = this.index?.lastScan ?? null; - const newIndex: IndexJson = { lastScan: oldLastScan, entries: {} }; - - const trackingDir = this.prefix; - if (!(await this.files.exists(trackingDir))) { - this.index = newIndex; - await this.saveIndex(); - return; - } - - for await (const info of this.files.list(trackingDir, { - recursive: true, - })) { - if (info.kind !== "file" || !info.path.endsWith(".json")) continue; - if (info.path.endsWith(`/${INDEX_FILE}`)) continue; - - const text = await readText(this.files, info.path); - if (!text) continue; - - const entry = JSON.parse(text) as EntryJson; - newIndex.entries[entry.uri] = { - stamp: entry.stamp, - ...(entry.removed ? { removed: entry.removed } : {}), - }; - } - - this.index = newIndex; - await this.saveIndex(); - } -} diff --git a/packages/content-scanner/src/files-scanner.ts b/packages/content-scanner/src/files-scanner.ts deleted file mode 100644 index c6e46cd..0000000 --- a/packages/content-scanner/src/files-scanner.ts +++ /dev/null @@ -1,188 +0,0 @@ -import type { FilesApi } from "@statewalker/webrun-files"; -import { readFile } from "@statewalker/webrun-files"; -import type { ListParams, ScanStore, Update } from "./scan-store.js"; -import type { ScannerEvent, ScannerOptions, UpdateSource } from "./scanner.js"; -import { Scanner } from "./scanner.js"; -import { computeSha1 } from "./sha1.js"; - -export type FilesScannerOptions = ScannerOptions & { - /** FilesApi to read files from. */ - files: FilesApi; - /** Root directory to scan. */ - root: string; - /** Filter function — return `false` to skip a path. */ - filter?: (path: string) => boolean; - /** Skip content hashing (detect changes by size + mtime only). */ - skipHash?: boolean; - /** Periodic scan interval in ms (0 = no periodic scan). */ - intervalMs?: number; -}; - -/** - * Creates an `UpdateSource` that walks the file system and yields - * `Update` entries for each file found. - */ -export function createFsWalker( - files: FilesApi, - root: string, - filter?: (path: string) => boolean, -): UpdateSource { - return async function* (_params?: ListParams) { - for await (const info of files.list(root, { recursive: true })) { - if (info.kind !== "file") continue; - if (filter && !filter(info.path)) continue; - const update: Update = { - uri: info.path, - stamp: new Date(), // will be overwritten by scanner - meta: { - size: info.size ?? 0, - lastModified: info.lastModified ?? 0, - }, - }; - yield update; - } - }; -} - -/** - * Root scanner that detects file-system changes. - * - * It walks the file system, compares against its store, and detects - * added/modified/removed files. Stores metadata only (no binary content). 
- */
-export class FilesScanner extends Scanner {
- private readonly files: FilesApi;
- private readonly root: string;
- private readonly filter?: (path: string) => boolean;
- private readonly skipHash: boolean;
- private readonly intervalMs: number;
- private timer: ReturnType<typeof setTimeout> | null = null;
- private running = false;
- private stopped = false;
-
- constructor(store: ScanStore, options: FilesScannerOptions) {
- super(store, options);
- this.files = options.files;
- this.root = options.root;
- this.filter = options.filter;
- this.skipHash = options.skipHash ?? false;
- this.intervalMs = options.intervalMs ?? 0;
- }
-
- async processEntry(upstream: Update): Promise<Update | null> {
- const uri = upstream.uri;
- const size = (upstream.meta?.size as number) ?? 0;
- const lastModified = (upstream.meta?.lastModified as number) ?? 0;
-
- // Check existing entry in our store
- const existing: Update | undefined = await firstOrUndefined(this.store.list({ uri }));
-
- let changed = false;
- let hash = "";
-
- if (!existing || existing.removed) {
- // New file or previously removed
- if (!this.skipHash) {
- const data = await readFile(this.files, uri);
- hash = await computeSha1(data);
- }
- changed = true;
- } else {
- const existingSize = (existing.meta?.size as number) ?? 0;
- const existingMtime = (existing.meta?.lastModified as number) ?? 0;
- if (existingSize !== size || existingMtime !== lastModified) {
- if (!this.skipHash) {
- const data = await readFile(this.files, uri);
- hash = await computeSha1(data);
- const existingHash = (existing.meta?.hash as string) ?? "";
- changed = hash !== existingHash;
- } else {
- changed = true;
- }
- }
- }
-
- if (!changed && existing && !existing.removed) {
- return null; // unchanged — skip re-stamping
- }
-
- return {
- uri,
- stamp: new Date(),
- meta: { size, lastModified, hash },
- };
- }
-
- async removeEntry(_uri: string): Promise<void> {
- // No extra cleanup needed — the store handles soft delete
- }
-
- /**
- * Override scan to also detect removed files.
- * After processing all files from the walker, any entries in our store
- * that were not visited are marked as removed.
- */
- async *scan(source?: UpdateSource, params?: ListParams): AsyncGenerator<ScannerEvent> {
- const walker = source ?? createFsWalker(this.files, this.root, this.filter);
- const scanTime = new Date();
- const seenUris = new Set<string>();
-
- // Wrap the source to track seen URIs
- const trackingSource: UpdateSource = async function* (p) {
- for await (const update of walker(p)) {
- seenUris.add(update.uri);
- yield update;
- }
- };
-
- // Run the normal scan
- yield* super.scan(trackingSource, params);
-
- // Detect removed files — entries in store not seen during this scan
- for await (const existing of this.store.list()) {
- if (existing.removed) continue;
- if (seenUris.has(existing.uri)) continue;
- // Not seen — mark as removed
- for await (const _ of this.store.remove({ uri: existing.uri })) {
- // consumed
- }
- }
-
- await this.store.setLastScan(scanTime);
- }
-
- /** Start periodic scanning. */
- start(): void {
- this.stop();
- this.stopped = false;
- const doScan = async () => {
- if (this.running || this.stopped) return;
- this.running = true;
- try {
- for await (const _ of this.scan()) {
- if (this.stopped) break;
- }
- } finally {
- this.running = false;
- }
- if (!this.stopped && this.intervalMs > 0) {
- this.timer = setTimeout(doScan, this.intervalMs);
- }
- };
- void doScan();
- }
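// Typical lifecycle, as a sketch (option values invented):
//
//   const watcher = new FilesScanner(store, { files, root: "/", intervalMs: 30_000 });
//   watcher.start();                            // scans now, then every 30 s
//   process.on("SIGINT", () => watcher.stop()); // Node-specific shutdown hook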
-
- /** Stop periodic scanning. */
- stop(): void {
- this.stopped = true;
- if (this.timer) {
- clearTimeout(this.timer);
- this.timer = null;
- }
- }
-}
-
-async function firstOrUndefined<T>(gen: AsyncIterable<T>): Promise<T | undefined> {
- for await (const item of gen) return item;
- return undefined;
-}
diff --git a/packages/content-scanner/src/index.ts b/packages/content-scanner/src/index.ts
deleted file mode 100644
index 92a030b..0000000
--- a/packages/content-scanner/src/index.ts
+++ /dev/null
@@ -1,19 +0,0 @@
-// Legacy exports (existing API)
-
-export * from "./content-embedder-scanner.js";
-export * from "./content-extractor-scanner.js";
-export * from "./content-fts-indexer-scanner.js";
-export * from "./content-scanner.js";
-export * from "./content-splitter-scanner.js";
-export * from "./content-vector-indexer-scanner.js";
-export * from "./file-uri.js";
-export * from "./files-scan-registry.js";
-export * from "./files-scan-store.js";
-export * from "./files-scanner.js";
-export * from "./scan-events.js";
-// New ScanRegistry / ScanStore API
-export * from "./scan-store.js";
-export * from "./scanner.js";
-export * from "./sha1.js";
-export * from "./tracking-store.js";
-export * from "./types.js";
diff --git a/packages/content-scanner/src/scan-events.ts b/packages/content-scanner/src/scan-events.ts
deleted file mode 100644
index 82a9502..0000000
--- a/packages/content-scanner/src/scan-events.ts
+++ /dev/null
@@ -1,32 +0,0 @@
-import { SnowflakeId } from "@statewalker/shared-ids";
-import type { ScanEventType, ScanMessage, ScanMessageProps } from "./types.js";
-
-const snowflake = new SnowflakeId();
-
-/**
- * Wraps raw scan data into a `ScanMessage` so scan events are
- * wire-compatible with the rest of the content-blocks pipeline.
- * Assigns a snowflake ID and timestamp to each event, making them
- * orderable and dedupable by downstream consumers without extra coordination.
- */
-export function createScanEvent(params: {
- type: ScanEventType;
- uri?: string;
- collectionId: string;
- extra?: Record<string, unknown>;
-}): ScanMessage {
- const props: ScanMessageProps = {
- id: snowflake.generate(),
- role: "tool:content-scanner",
- stage: "scanning",
- time: new Date().toISOString(),
- type: params.type,
- collection: params.collectionId,
- ...(params.uri !== undefined ? { uri: params.uri } : {}),
- ...(params.extra ?? {}),
- };
- return {
- props,
- blocks: [],
- };
-}
diff --git a/packages/content-scanner/src/scan-store.ts b/packages/content-scanner/src/scan-store.ts
deleted file mode 100644
index 3db34d4..0000000
--- a/packages/content-scanner/src/scan-store.ts
+++ /dev/null
@@ -1,84 +0,0 @@
-import type { FilesApi } from "@statewalker/webrun-files";
-
-/** Timestamp type for scan tracking. Serialized as ISO string in JSON on disk. */
-export type Stamp = Date;
-
-/** A tracked entry in a scan store. */
-export type Update = {
- /** Primary key — identifies the tracked resource. */
- uri: string;
- /** When this entry was last touched by a scan. */
- stamp: Stamp;
- /** Soft-delete timestamp. Set by `remove()`, observed by downstream scanners. */
- removed?: Stamp;
- /** Lightweight JSON-serializable metadata. */
- meta?: Record<string, unknown>;
- /** Lazy accessor for heavyweight binary content. Reads from disk on each call. */
- content?: () => AsyncGenerator<Uint8Array>;
-};
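For orientation, a content-bearing entry might be written like this (a sketch with invented values):

const update: Update = {
  uri: "/docs/readme.md",
  stamp: new Date(),
  meta: { format: "markdown" },
  // Lazy by design: nothing is read until a consumer invokes content().
  content: async function* () {
    yield new TextEncoder().encode("# Hello");
  },
};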
-
-/** Filter parameters for `list()` and `remove()`. */
-export type ListParams = {
- /** URI mask. If it ends with `*`, used as a prefix match. Otherwise exact match. */
- uri?: string;
- /** Return entries whose stamp matches this value or falls within this range (inclusive). */
- include?: Stamp | [Stamp, Stamp];
- /** Exclude entries whose stamp matches this value or falls within this range (inclusive). */
- exclude?: Stamp | [Stamp, Stamp];
-};
-
-/** A named store that tracks per-URI updates with metadata and optional binary content. */
-export interface ScanStore {
- readonly name: string;
-
- /** Stream-write entries. Index is updated per batch and at the end. Yields stored entries. */
- store(updates: Iterable<Update> | AsyncIterable<Update>): AsyncGenerator<Update>;
-
- /** List entries matching the given filters. Includes soft-deleted entries. Content is lazy. */
- list(params?: ListParams): AsyncGenerator<Update>;
-
- /** Soft-delete entries matching the given filters. Yields the removed entries. */
- remove(params?: ListParams): AsyncGenerator<Update>;
-
- /** Get the timestamp of the last completed scan, or `null` if never scanned. */
- getLastScan(): Promise<Stamp | null>;
-
- /** Set the timestamp of the last completed scan. */
- setLastScan(stamp: Stamp): Promise<void>;
-
- /** Physically delete entries soft-removed before the given date. Returns count deleted. */
- prune(before: Stamp): Promise<number>;
-
- /** Reconstruct _index.json from individual entry files. */
- rebuildIndex(): Promise<void>;
-}
-
-/** Registry that manages named ScanStore instances. Modeled after the Indexer interface. */
-export interface ScanRegistry {
- /** Create a new named store. Throws if a store with this name already exists. */
- createStore(name: string): Promise<ScanStore>;
-
- /** Get an existing store by name, or `null` if it doesn't exist. */
- getStore(name: string): Promise<ScanStore | null>;
-
- /** Check whether a store with the given name exists. */
- hasStore(name: string): Promise<boolean>;
-
- /** List the names of all existing stores. */
- getStoreNames(): Promise<string[]>;
-
- /** Delete a store and all its data. */
- deleteStore(name: string): Promise<void>;
-
- /** Persist any pending state. */
- flush(): Promise<void>;
-
- /** Close all stores and release resources. */
- close(): Promise<void>;
-}
-
-/** Options for creating a FilesApi-backed ScanRegistry. */
-export type ScanRegistryOptions = {
- files: FilesApi;
- prefix?: string;
-};
diff --git a/packages/content-scanner/src/scanner.ts b/packages/content-scanner/src/scanner.ts
deleted file mode 100644
index bb4c8cb..0000000
--- a/packages/content-scanner/src/scanner.ts
+++ /dev/null
@@ -1,109 +0,0 @@
-import type { ListParams, ScanStore, Stamp, Update } from "./scan-store.js";
-
-/** The upstream data source — typically an upstream store's `list` method. */
-export type UpdateSource = (params?: ListParams) => Generator<Update> | AsyncGenerator<Update>;
-
-/** Statistics for a completed scan. */
-export type ScanStats = {
- processed: number;
- removed: number;
- errors: number;
-};
-
-/** Lifecycle events emitted by a scanner during `scan()`. */
-export type ScannerEvent =
- | { type: "scan-started"; stamp: Stamp }
- | { type: "batch-done"; processed: number; stamp: Stamp }
- | { type: "entry-processed"; uri: string }
- | { type: "entry-removed"; uri: string }
- | { type: "entry-error"; uri: string; error: string }
- | { type: "scan-done"; stats: ScanStats };
-
-/** Options for configuring the scanner's batching behavior. */
-export type ScannerOptions = {
- /** Yield `batch-done` every N entries. Default: 50. */
- batchSize?: number;
-};
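To preview the contract declared just below, a minimal subclass might look like this (the uppercasing transform is invented for illustration):

class UppercaseScanner extends Scanner {
  async processEntry(upstream: Update): Promise<Update | null> {
    if (!upstream.content) return null; // metadata-only entries are skipped
    const decoder = new TextDecoder();
    let text = "";
    for await (const chunk of upstream.content()) {
      text += decoder.decode(chunk, { stream: true });
    }
    text += decoder.decode();
    const result = text.toUpperCase();
    return {
      uri: upstream.uri,
      stamp: upstream.stamp, // scan() re-stamps stored entries with its own scanTime
      content: async function* () {
        yield new TextEncoder().encode(result);
      },
    };
  }

  async removeEntry(_uri: string): Promise<void> {
    // Nothing to clean up beyond the store's own soft delete.
  }
}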
-
-/**
- * Abstract base class for pull-based scanners.
- *
- * Each scanner owns a `ScanStore`, pulls upstream data via an `UpdateSource`,
- * processes entries through subclass-defined methods, and yields lifecycle events.
- * Scanners are wired together by an orchestrator — they have no knowledge of
- * downstream consumers.
- */
-export abstract class Scanner {
- readonly store: ScanStore;
- private readonly batchSize: number;
-
- constructor(store: ScanStore, options?: ScannerOptions) {
- this.store = store;
- this.batchSize = options?.batchSize ?? 50;
- }
-
- /**
- * Process an upstream entry and return the update to store,
- * or `null` to skip this entry.
- */
- abstract processEntry(upstream: Update): Promise<Update | null>;
-
- /** Clean up data for a removed URI. */
- abstract removeEntry(uri: string): Promise<void>;
-
- /**
- * Pull entries from the upstream source, process each one, store results,
- * and yield lifecycle events. On completion, updates this store's `lastScan`.
- */
- async *scan(source: UpdateSource, params?: ListParams): AsyncGenerator<ScannerEvent> {
- const scanTime = new Date();
- yield { type: "scan-started", stamp: scanTime };
-
- const stats: ScanStats = { processed: 0, removed: 0, errors: 0 };
- let batchCount = 0;
-
- for await (const upstream of source(params)) {
- const uri = upstream.uri;
-
- if (upstream.removed) {
- // Upstream entry was soft-deleted — cascade removal
- try {
- await this.removeEntry(uri);
- for await (const _ of this.store.remove({ uri })) {
- // consumed
- }
- stats.removed++;
- yield { type: "entry-removed", uri };
- } catch (err) {
- stats.errors++;
- yield { type: "entry-error", uri, error: String(err) };
- }
- } else {
- // Process the entry
- try {
- const result = await this.processEntry(upstream);
- if (result) {
- const toStore = { ...result, stamp: scanTime } satisfies Update;
- // Consume the store generator to persist the entry
- for await (const _ of this.store.store([toStore])) {
- // consumed
- }
- stats.processed++;
- yield { type: "entry-processed", uri };
- }
- } catch (err) {
- stats.errors++;
- yield { type: "entry-error", uri, error: String(err) };
- }
- }
-
- batchCount++;
- if (batchCount % this.batchSize === 0) {
- yield { type: "batch-done", processed: batchCount, stamp: scanTime };
- }
- }
-
- await this.store.setLastScan(scanTime);
- yield { type: "scan-done", stats };
- }
-}
diff --git a/packages/content-scanner/src/sha1.ts b/packages/content-scanner/src/sha1.ts
deleted file mode 100644
index 31bd092..0000000
--- a/packages/content-scanner/src/sha1.ts
+++ /dev/null
@@ -1,8 +0,0 @@
-/** Compute SHA-1 hash of binary data using the Web Crypto API (browser-compatible). */
-export async function computeSha1(data: Uint8Array): Promise<string> {
- const hashBuffer = await crypto.subtle.digest("SHA-1", data as Uint8Array);
- const hashArray = new Uint8Array(hashBuffer);
- return Array.from(hashArray)
- .map((b) => b.toString(16).padStart(2, "0"))
- .join("");
-}
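Wiring scanners into a cascade is the orchestrator's job; a sketch, with hypothetical handles (`registry` a ScanRegistry, `extractorScanner` a downstream scanner), where each downstream `UpdateSource` is simply the upstream store's `list`:

// files -> content: the extractor pulls whatever the file scanner recorded.
const filesStore = await registry.getStore("files");
if (!filesStore) throw new Error("files store missing");

for await (const event of extractorScanner.scan((params) => filesStore.list(params))) {
  if (event.type === "entry-error") console.warn(event.uri, event.error);
}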
diff --git a/packages/content-scanner/src/tracking-store.ts b/packages/content-scanner/src/tracking-store.ts
deleted file mode 100644
index 041cb66..0000000
--- a/packages/content-scanner/src/tracking-store.ts
+++ /dev/null
@@ -1,104 +0,0 @@
-import { sha1Uuid } from "@statewalker/shared-ids";
-import type { FilesApi } from "@statewalker/webrun-files";
-import { readText, writeText } from "@statewalker/webrun-files";
-import type { FileMetadata } from "./types.js";
-
-/**
- * Abstracts metadata persistence behind a simple CRUD interface so the
- * ContentScanner doesn't need to know how or where tracking data is stored.
- * Uses a two-level hash directory (`{dd}/{hash}.json`) to avoid
- * file-system performance degradation from too many files in one folder.
- * Backed by FilesApi, so it works identically across browser, Node, and
- * remote storage backends.
- */
-export class TrackingStore {
- constructor(
- private readonly files: FilesApi,
- private readonly prefix: string,
- ) {}
-
- /** Derives a deterministic, collision-free file path from a URI so each metadata record has a stable storage location regardless of special characters in the original path. */
- async pathFor(uri: string): Promise<string> {
- const hash = await sha1Uuid(uri);
- const dd = hash.slice(0, 2);
- return `${this.prefix}/tracking/${dd}/${hash}.json`;
- }
-
- /** Loads a single record by URI so the scanner can compare current file-system state against the last-known snapshot and decide whether the file changed. */
- async get(params: { uri: string }): Promise<FileMetadata | undefined> {
- const path = await this.pathFor(params.uri);
- if (!(await this.files.exists(path))) return undefined;
- const text = await readText(this.files, path);
- if (!text) return undefined;
- return JSON.parse(text) as FileMetadata;
- }
-
- /** Persists a metadata snapshot so future scans have a baseline to diff against. Called after every file is processed, whether changed or not, to keep `scanTime` current. */
- async set(params: { metadata: FileMetadata }): Promise<void> {
- const path = await this.pathFor(params.metadata.uri);
- await writeText(this.files, path, JSON.stringify(params.metadata));
- }
-
- /** Removes a single tracking record when its file is purged during cleanup or collection removal. Returns true so callers can count successful deletions. */
- async delete(params: { uri: string }): Promise<boolean> {
- const path = await this.pathFor(params.uri);
- return this.files.remove(path);
- }
-
- /** Streams all records without loading them into memory at once -- essential for large collections where the full set wouldn't fit comfortably in RAM. */
- async *listAll(): AsyncGenerator<FileMetadata> {
- const trackingDir = `${this.prefix}/tracking`;
- if (!(await this.files.exists(trackingDir))) return;
-
- for await (const info of this.files.list(trackingDir, {
- recursive: true,
- })) {
- if (info.kind !== "file" || !info.path.endsWith(".json")) continue;
- const text = await readText(this.files, info.path);
- if (!text) continue;
- yield JSON.parse(text) as FileMetadata;
- }
- }
-
- /** Filters to a single collection so the scanner can detect removals (files present in the store but missing from the latest file-system listing). */
- async *listByCollection(params: { collectionId: string }): AsyncGenerator<FileMetadata> {
- for await (const meta of this.listAll()) {
- if (meta.collectionId === params.collectionId) {
- yield meta;
- }
- }
- }
-
- /** Wipes all tracking state for a collection -- called when a collection is unregistered so orphaned records don't pollute future scans or change queries. */
- async deleteByCollection(params: { collectionId: string }): Promise<number> {
- let count = 0;
- const toDelete: string[] = [];
- for await (const meta of this.listAll()) {
- if (meta.collectionId === params.collectionId) {
- toDelete.push(meta.uri);
- }
- }
- for (const uri of toDelete) {
- const deleted = await this.delete({ uri });
- if (deleted) count++;
- }
- return count;
- }
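// For illustration (hash value invented): sha1Uuid("docs:/root/a.txt") might
// yield "3f29ab...", so its record lands at `{prefix}/tracking/3f/3f29ab....json`;
// the two-hex-char first segment caps top-level fan-out at 256 directories.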
-
- /** Purges old removal records that are past the retention window, preventing unbounded store growth while still giving `getChanges` consumers time to observe deletions. */
- async deleteRemovedBefore(params: { before: string }): Promise<number> {
- let count = 0;
- const beforeTime = new Date(params.before).getTime();
- const toDelete: string[] = [];
- for await (const meta of this.listAll()) {
- if (meta.removalTime !== null && new Date(meta.removalTime).getTime() < beforeTime) {
- toDelete.push(meta.uri);
- }
- }
- for (const uri of toDelete) {
- const deleted = await this.delete({ uri });
- if (deleted) count++;
- }
- return count;
- }
-}
diff --git a/packages/content-scanner/src/types.ts b/packages/content-scanner/src/types.ts
deleted file mode 100644
index 67b13ed..0000000
--- a/packages/content-scanner/src/types.ts
+++ /dev/null
@@ -1,95 +0,0 @@
-import type { ContentMessage, ContentMessageProps } from "@statewalker/content-blocks";
-import type { FilesApi } from "@statewalker/webrun-files";
-
-/**
- * Groups all the information the scanner needs to walk a single directory tree.
- * Extracted as a type so collections can be registered, stored, and passed around
- * independently of the scanner instance that processes them.
- */
-export type CollectionConfig = {
- /** Namespaces tracking records so multiple independent file trees can coexist in one scanner without URI collisions. */
- collectionId: string;
- /** Decouples the scanner from any specific file-system backend -- the same scanning logic works for browser, Node, or remote storage. */
- files: FilesApi;
- /** Limits the scan scope within a potentially larger FilesApi -- only files under this subtree are tracked. */
- root: string;
-};
-
-/**
- * Captures everything the scanner needs to decide whether a file has changed
- * between scans. Persisted as JSON so change detection survives process restarts
- * and works across async scan intervals.
- */
-export type FileMetadata = {
- /** Combines `collectionId` and `filePath` into a single key -- used as the primary identity for change tracking. Format: `"{collectionId}:{path}"`. */
- uri: string;
- /** Links this record back to its collection so bulk operations (delete-all, list-by-collection) can filter without parsing the URI. */
- collectionId: string;
- /** Preserved separately from URI so callers can resolve the file against its collection's FilesApi without reverse-engineering the URI format. */
- path: string;
- /** Enables content-level change detection -- two files with the same size/mtime but different hashes are still flagged as changed. Empty when hashing is skipped. */
- hash: string;
- /** Used as a cheap first-pass change signal before the more expensive SHA-1 hash; even if the size has changed, the file may still be unmodified, so the hash makes the final call. */
- size: number;
- /** Combined with `size` for a fast "dirty check" -- if neither changed, the scanner skips hashing entirely to save I/O. */
- lastModified: number;
- /** Records when this file was last observed so `getChanges(since)` can return only entries newer than a caller's checkpoint. */
- scanTime: string;
- /** Distinguishes "deleted" from "active" files -- non-null means the file disappeared and the record is kept only so downstream consumers can learn about the removal before `cleanupRemoved` purges it. */
- removalTime: string | null;
-};
-
-/**
- * Lets callers tune the scan without forking the scanning logic -- all knobs
- * are optional so the defaults work for most cases.
- */
-export type ScanOptions = {
- /** Controls back-pressure: the scanner pauses every N files so large trees don't starve the event loop. Default: 50.
*/ - batchSize?: number; - /** Cooperates with `batchSize` to throttle I/O -- useful when scanning over a network or shared file system where burst reads cause contention. Default: 0. */ - sleepMs?: number; - /** Lets the caller exclude paths (e.g., `node_modules`, dotfiles) before any I/O happens, avoiding wasted reads and hash computations. */ - filter?: (path: string) => boolean; - /** Trades accuracy for speed: when true, changes are detected by size + mtime only, skipping the SHA-1 read. Useful for quick "something changed" checks. Default: false. */ - skipHash?: boolean; -}; - -/** - * Defines the lifecycle stages of a scan as a closed set so consumers can - * exhaustively switch on event type without guessing. The four values map - * to the scan's natural phases: start, per-file change, per-file removal, finish. - */ -export type ScanEventType = "scan-started" | "content-changed" | "content-removed" | "scan-done"; - -// -------------------------------------------------------------- -// Typed scan messages — so consumers get compile-time guarantees -// on the shape of every event the scanner emits. -// -------------------------------------------------------------- - -/** - * Narrows `ContentMessageProps` to the exact fields every scan event carries. - * Consumers can destructure `props` without casting or null-checking `type`/`stage`, - * and TypeScript will catch mismatches if the scanner's output format changes. - */ -export interface ScanMessageProps extends ContentMessageProps { - /** Always `"tool:content-scanner"` — identifies this message source in a mixed-message stream. */ - role: "tool:content-scanner"; - /** Always `"scanning"` — all scan events belong to the same processing phase. */ - stage: "scanning"; - /** Discriminates the four lifecycle phases; consumers can `switch` exhaustively on this. */ - type: ScanEventType; - /** Which collection this event belongs to — present on every event so consumers never need to track scan context. */ - collection: string; - /** The `{collectionId}:{filePath}` of the affected file. Present on `content-changed` and `content-removed`, absent on `scan-started` and `scan-done`. */ - uri?: string; -} - -/** - * The concrete message type yielded by `ContentScanner.scan()` and related methods. - * Extends `ContentMessage` with a narrowed `props` so callers get full type safety - * on the event shape without downcasting. Scanner events always have empty `blocks` - * since they carry metadata, not content. - */ -export interface ScanMessage extends ContentMessage { - props: ScanMessageProps; -} diff --git a/packages/content-scanner/tests/content-embedder-scanner.test.ts b/packages/content-scanner/tests/content-embedder-scanner.test.ts deleted file mode 100644 index 2741706..0000000 --- a/packages/content-scanner/tests/content-embedder-scanner.test.ts +++ /dev/null @@ -1,102 +0,0 @@ -import { MemFilesApi } from "@statewalker/webrun-files-mem"; -import { decodeFloat32Arrays } from "@statewalker/webrun-msgpack"; -import { collect as collectStream } from "@statewalker/webrun-streams"; -import { beforeEach, describe, expect, it } from "vitest"; -import { ContentEmbedderScanner } from "../src/content-embedder-scanner.js"; -import { FilesScanRegistry } from "../src/files-scan-registry.js"; -import type { ScanStore } from "../src/scan-store.js"; -import type { UpdateSource } from "../src/scanner.js"; -import { collect, makeChunksSource } from "./test-helpers.js"; - -/** Mock embed function that returns a fixed-size vector. 
*/ -async function mockEmbed(text: string): Promise { - const dim = 4; - const arr = new Float32Array(dim); - for (let i = 0; i < dim; i++) { - arr[i] = (text.charCodeAt(i % text.length) || 0) / 256; - } - return arr; -} - -describe("ContentEmbedderScanner", () => { - let storeFiles: MemFilesApi; - let registry: FilesScanRegistry; - let store: ScanStore; - - beforeEach(async () => { - storeFiles = new MemFilesApi(); - registry = new FilesScanRegistry({ files: storeFiles, prefix: "scan" }); - store = await registry.createStore("embeddings"); - }); - - it("generates embeddings for chunks", async () => { - const scanner = new ContentEmbedderScanner(store, { - embed: mockEmbed, - model: "test-model", - dimensions: 4, - }); - - const source = makeChunksSource([ - { - uri: "/doc.md", - chunks: [ - { index: 0, content: "Hello world" }, - { index: 1, content: "Goodbye world" }, - ], - }, - ]); - - await collect(scanner.scan(source)); - - const stored = await collect(store.list()); - expect(stored).toHaveLength(1); - expect(stored[0]?.meta?.model).toBe("test-model"); - expect(stored[0]?.meta?.dimensions).toBe(4); - expect(stored[0]?.meta?.chunkCount).toBe(2); - - // Decode embeddings from stored content via Float32Array stream - const entry = stored[0]; - if (!entry?.content) throw new Error("expected content"); - const embeddings = await collectStream(decodeFloat32Arrays(entry.content())); - expect(embeddings).toHaveLength(2); - expect(embeddings[0]?.length).toBe(4); - }); - - it("handles removal cascade", async () => { - const scanner = new ContentEmbedderScanner(store, { - embed: mockEmbed, - }); - - await collect( - scanner.scan( - makeChunksSource([{ uri: "/doc.md", chunks: [{ index: 0, content: "Hello" }] }]), - ), - ); - expect(await collect(store.list())).toHaveLength(1); - - const removeSource: UpdateSource = async function* () { - yield { - uri: "/doc.md", - stamp: new Date("2026-04-02T00:00:00Z"), - removed: new Date("2026-04-02T00:00:00Z"), - }; - }; - await collect(scanner.scan(removeSource)); - - const stored = await collect(store.list()); - expect(stored.filter((s) => s.removed)).toHaveLength(1); - }); - - it("skips entries without content", async () => { - const scanner = new ContentEmbedderScanner(store, { - embed: mockEmbed, - }); - - const source: UpdateSource = async function* () { - yield { uri: "/no-content.md", stamp: new Date() }; - }; - await collect(scanner.scan(source)); - - expect(await collect(store.list())).toHaveLength(0); - }); -}); diff --git a/packages/content-scanner/tests/content-extractor-scanner.test.ts b/packages/content-scanner/tests/content-extractor-scanner.test.ts deleted file mode 100644 index 9c2b07f..0000000 --- a/packages/content-scanner/tests/content-extractor-scanner.test.ts +++ /dev/null @@ -1,113 +0,0 @@ -import { ExtractorRegistry } from "@statewalker/content-extractors"; -import { writeText } from "@statewalker/webrun-files"; -import { MemFilesApi } from "@statewalker/webrun-files-mem"; -import { beforeEach, describe, expect, it } from "vitest"; -import { ContentExtractorScanner } from "../src/content-extractor-scanner.js"; -import { FilesScanRegistry } from "../src/files-scan-registry.js"; -import type { ScanStore } from "../src/scan-store.js"; -import { collect, makeSource } from "./test-helpers.js"; - -describe("ContentExtractorScanner", () => { - let contentFiles: MemFilesApi; - let storeFiles: MemFilesApi; - let registry: FilesScanRegistry; - let store: ScanStore; - let extractors: ExtractorRegistry; - - beforeEach(async () => { - 
contentFiles = new MemFilesApi(); - storeFiles = new MemFilesApi(); - registry = new FilesScanRegistry({ files: storeFiles, prefix: "scan" }); - store = await registry.createStore("content"); - - extractors = new ExtractorRegistry(); - extractors.registerByPattern("*.md", async (content) => { - const chunks: Uint8Array[] = []; - for await (const chunk of content) chunks.push(chunk); - return new TextDecoder().decode(Buffer.concat(chunks)); - }); - extractors.registerByPattern("*.txt", async (content) => { - const chunks: Uint8Array[] = []; - for await (const chunk of content) chunks.push(chunk); - return new TextDecoder().decode(Buffer.concat(chunks)); - }); - }); - - it("extracts content from changed files", async () => { - await writeText(contentFiles, "/docs/readme.md", "# Hello World"); - - const scanner = new ContentExtractorScanner(store, { - files: contentFiles, - extractors, - }); - - const source = makeSource([ - { uri: "/docs/readme.md", stamp: new Date("2026-04-01T00:00:00Z") }, - ]); - - const events = await collect(scanner.scan(source)); - const processed = events.filter((e) => e.type === "entry-processed"); - expect(processed).toHaveLength(1); - - const stored = await collect(store.list()); - expect(stored).toHaveLength(1); - expect(stored[0]?.meta?.format).toBe("markdown"); - - // Read extracted content - const entry = stored[0]; - if (!entry?.content) throw new Error("expected content"); - const chunks = await collect(entry.content()); - const text = new TextDecoder().decode(chunks[0]); - expect(text).toBe("# Hello World"); - }); - - it("skips files without extractor", async () => { - await writeText(contentFiles, "/data/file.bin", "binary data"); - - const scanner = new ContentExtractorScanner(store, { - files: contentFiles, - extractors, - }); - - const source = makeSource([{ uri: "/data/file.bin", stamp: new Date("2026-04-01T00:00:00Z") }]); - - await collect(scanner.scan(source)); - - const stored = await collect(store.list()); - expect(stored).toHaveLength(0); - }); - - it("handles removal cascade", async () => { - await writeText(contentFiles, "/docs/readme.md", "# Hello"); - - const scanner = new ContentExtractorScanner(store, { - files: contentFiles, - extractors, - }); - - // First extract content - await collect( - scanner.scan( - makeSource([{ uri: "/docs/readme.md", stamp: new Date("2026-04-01T00:00:00Z") }]), - ), - ); - expect(await collect(store.list())).toHaveLength(1); - - // Then remove - await collect( - scanner.scan( - makeSource([ - { - uri: "/docs/readme.md", - stamp: new Date("2026-04-02T00:00:00Z"), - removed: new Date("2026-04-02T00:00:00Z"), - }, - ]), - ), - ); - - const stored = await collect(store.list()); - const removed = stored.filter((s) => s.removed); - expect(removed).toHaveLength(1); - }); -}); diff --git a/packages/content-scanner/tests/content-fts-indexer-scanner.test.ts b/packages/content-scanner/tests/content-fts-indexer-scanner.test.ts deleted file mode 100644 index a0be1fb..0000000 --- a/packages/content-scanner/tests/content-fts-indexer-scanner.test.ts +++ /dev/null @@ -1,119 +0,0 @@ -import type { DocumentPath, Index, IndexedBlock, PathSelector } from "@statewalker/indexer-api"; -import { MemFilesApi } from "@statewalker/webrun-files-mem"; -import { beforeEach, describe, expect, it } from "vitest"; -import { ContentFtsIndexerScanner } from "../src/content-fts-indexer-scanner.js"; -import { FilesScanRegistry } from "../src/files-scan-registry.js"; -import type { ScanStore } from "../src/scan-store.js"; -import type { UpdateSource } 
from "../src/scanner.js"; -import { collect, makeChunksSource } from "./test-helpers.js"; - -/** Minimal mock Index that records addDocument/deleteDocuments calls. */ -function createMockIndex() { - const documents = new Map(); - const index: Partial = { - async addDocument(blocks: IndexedBlock[]) { - if (blocks.length === 0) return; - const path = blocks[0]?.path; - if (path) documents.set(path, blocks); - }, - async deleteDocuments(selectors: PathSelector[] | AsyncIterable) { - const sels = Array.isArray(selectors) ? selectors : await collect(selectors); - for (const sel of sels) { - documents.delete(sel.path); - } - }, - }; - return { index: index as Index, documents }; -} - -describe("ContentFtsIndexerScanner", () => { - let storeFiles: MemFilesApi; - let registry: FilesScanRegistry; - let store: ScanStore; - - beforeEach(async () => { - storeFiles = new MemFilesApi(); - registry = new FilesScanRegistry({ files: storeFiles, prefix: "scan" }); - store = await registry.createStore("fts-index"); - }); - - it("indexes chunks into the Index", async () => { - const { index, documents } = createMockIndex(); - const scanner = new ContentFtsIndexerScanner(store, { index }); - - const source = makeChunksSource([ - { - uri: "/docs/readme.md", - chunks: [ - { index: 0, content: "Hello" }, - { index: 1, content: "World" }, - ], - }, - ]); - - await collect(scanner.scan(source)); - - expect(documents.size).toBe(1); - const blocks = documents.get("/docs/readme.md" as DocumentPath); - expect(blocks).toHaveLength(2); - expect(blocks?.[0]?.blockId).toBe("/docs/readme.md:0"); - expect(blocks?.[1]?.blockId).toBe("/docs/readme.md:1"); - expect(blocks?.[0]?.content).toBe("Hello"); - - // Store tracks the indexed URI - const stored = await collect(store.list()); - expect(stored).toHaveLength(1); - }); - - it("removes documents from Index on cascade removal", async () => { - const { index, documents } = createMockIndex(); - const scanner = new ContentFtsIndexerScanner(store, { index }); - - // First index - await collect( - scanner.scan( - makeChunksSource([{ uri: "/docs/readme.md", chunks: [{ index: 0, content: "Hello" }] }]), - ), - ); - expect(documents.size).toBe(1); - - // Then remove - const removeSource: UpdateSource = async function* () { - yield { - uri: "/docs/readme.md", - stamp: new Date("2026-04-02T00:00:00Z"), - removed: new Date("2026-04-02T00:00:00Z"), - }; - }; - await collect(scanner.scan(removeSource)); - - expect(documents.size).toBe(0); - }); - - it("generates consistent block IDs", async () => { - const { index, documents } = createMockIndex(); - const scanner = new ContentFtsIndexerScanner(store, { index }); - - await collect( - scanner.scan( - makeChunksSource([ - { - uri: "/docs/readme.md", - chunks: [ - { index: 0, content: "A" }, - { index: 1, content: "B" }, - { index: 2, content: "C" }, - ], - }, - ]), - ), - ); - - const blocks = documents.get("/docs/readme.md" as DocumentPath); - expect(blocks?.map((b) => b.blockId)).toEqual([ - "/docs/readme.md:0", - "/docs/readme.md:1", - "/docs/readme.md:2", - ]); - }); -}); diff --git a/packages/content-scanner/tests/content-scanner.test.ts b/packages/content-scanner/tests/content-scanner.test.ts deleted file mode 100644 index 27112a8..0000000 --- a/packages/content-scanner/tests/content-scanner.test.ts +++ /dev/null @@ -1,376 +0,0 @@ -import type { ContentSection } from "@statewalker/content-blocks"; -import { writeText } from "@statewalker/webrun-files"; -import { MemFilesApi } from "@statewalker/webrun-files-mem"; -import { beforeEach, 
describe, expect, it } from "vitest"; -import { ContentScanner } from "../src/content-scanner.js"; - -/** Collect all events from an async generator. */ -async function collectEvents(gen: AsyncGenerator): Promise { - const events: ContentSection[] = []; - for await (const event of gen) { - events.push(event); - } - return events; -} - -/** Extract events of a specific type. */ -function ofType(events: ContentSection[], type: string): ContentSection[] { - return events.filter((e) => e.props?.type === type); -} - -describe("ContentScanner", () => { - let trackingFiles: MemFilesApi; - let scanner: ContentScanner; - let files: MemFilesApi; - - beforeEach(() => { - trackingFiles = new MemFilesApi(); - scanner = new ContentScanner({ trackingFiles }); - files = new MemFilesApi(); - }); - - describe("scan — detect added files", () => { - it("detects all files as content-changed on first scan", async () => { - await writeText(files, "/root/a.txt", "hello"); - await writeText(files, "/root/b.txt", "world"); - - scanner.addCollection({ - config: { collectionId: "docs", files, root: "/root" }, - }); - - const events = await collectEvents(scanner.scan({ collectionId: "docs" })); - - const changed = ofType(events, "content-changed"); - expect(changed).toHaveLength(2); - for (const e of changed) { - expect(e.props?.collection).toBe("docs"); - expect(e.props?.uri).toBeTruthy(); - } - }); - }); - - describe("scan — no changes", () => { - it("reports only scan-started and scan-done on re-scan", async () => { - await writeText(files, "/root/a.txt", "hello"); - - scanner.addCollection({ - config: { collectionId: "docs", files, root: "/root" }, - }); - - // First scan - await collectEvents(scanner.scan({ collectionId: "docs" })); - - // Second scan — nothing changed - const events = await collectEvents(scanner.scan({ collectionId: "docs" })); - const changed = ofType(events, "content-changed"); - expect(changed).toHaveLength(0); - - expect(ofType(events, "scan-started")).toHaveLength(1); - expect(ofType(events, "scan-done")).toHaveLength(1); - }); - }); - - describe("scan — detect updates", () => { - it("detects modified file as content-changed", async () => { - await writeText(files, "/root/a.txt", "short"); - - scanner.addCollection({ - config: { collectionId: "docs", files, root: "/root" }, - }); - - await collectEvents(scanner.scan({ collectionId: "docs" })); - - // Modify the file - await writeText(files, "/root/a.txt", "this is a much longer version of the content"); - - const events = await collectEvents(scanner.scan({ collectionId: "docs" })); - const changed = ofType(events, "content-changed"); - expect(changed).toHaveLength(1); - }); - }); - - describe("scan — detect removals", () => { - it("detects removed files as content-removed", async () => { - await writeText(files, "/root/a.txt", "hello"); - await writeText(files, "/root/b.txt", "world"); - - scanner.addCollection({ - config: { collectionId: "docs", files, root: "/root" }, - }); - - await collectEvents(scanner.scan({ collectionId: "docs" })); - - // Remove one file - await files.remove("/root/b.txt"); - - const events = await collectEvents(scanner.scan({ collectionId: "docs" })); - const removed = ofType(events, "content-removed"); - expect(removed).toHaveLength(1); - }); - }); - - describe("event order", () => { - it("scan-started is first and scan-done is last", async () => { - await writeText(files, "/root/a.txt", "hello"); - - scanner.addCollection({ - config: { collectionId: "docs", files, root: "/root" }, - }); - - const events = await 
collectEvents(scanner.scan({ collectionId: "docs" })); - - expect(events.length).toBeGreaterThanOrEqual(2); - expect(events[0]?.props?.type).toBe("scan-started"); - expect(events[events.length - 1]?.props?.type).toBe("scan-done"); - }); - }); - - describe("multiple collections", () => { - it("scopes events to collection", async () => { - const files2 = new MemFilesApi(); - - await writeText(files, "/root/a.txt", "hello"); - await writeText(files2, "/data/x.csv", "1,2,3"); - - scanner.addCollection({ - config: { collectionId: "docs", files, root: "/root" }, - }); - scanner.addCollection({ - config: { collectionId: "data", files: files2, root: "/data" }, - }); - - const docEvents = await collectEvents(scanner.scan({ collectionId: "docs" })); - const docChanged = ofType(docEvents, "content-changed"); - expect(docChanged).toHaveLength(1); - expect(docChanged[0]?.props?.collection).toBe("docs"); - - const dataEvents = await collectEvents(scanner.scan({ collectionId: "data" })); - const dataChanged = ofType(dataEvents, "content-changed"); - expect(dataChanged).toHaveLength(1); - expect(dataChanged[0]?.props?.collection).toBe("data"); - - // Removing a file in docs should not affect data - await files.remove("/root/a.txt"); - const docEvents2 = await collectEvents(scanner.scan({ collectionId: "docs" })); - expect(ofType(docEvents2, "content-removed")).toHaveLength(1); - - const dataEvents2 = await collectEvents(scanner.scan({ collectionId: "data" })); - expect(ofType(dataEvents2, "content-changed")).toHaveLength(0); - expect(ofType(dataEvents2, "content-removed")).toHaveLength(0); - }); - }); - - describe("removeCollection", () => { - it("clears tracking data", async () => { - await writeText(files, "/root/a.txt", "hello"); - - scanner.addCollection({ - config: { collectionId: "docs", files, root: "/root" }, - }); - - await collectEvents(scanner.scan({ collectionId: "docs" })); - - await scanner.removeCollection({ collectionId: "docs" }); - - // Re-add and scan — should see fresh content-changed - scanner.addCollection({ - config: { collectionId: "docs", files, root: "/root" }, - }); - - const events = await collectEvents(scanner.scan({ collectionId: "docs" })); - const changed = ofType(events, "content-changed"); - expect(changed).toHaveLength(1); - }); - }); - - describe("cleanupRemoved", () => { - it("purges old removal records", async () => { - await writeText(files, "/root/a.txt", "hello"); - - scanner.addCollection({ - config: { collectionId: "docs", files, root: "/root" }, - }); - - await collectEvents(scanner.scan({ collectionId: "docs" })); - - await files.remove("/root/a.txt"); - await collectEvents(scanner.scan({ collectionId: "docs" })); - - // Purge removals older than far future - const purged = await scanner.cleanupRemoved({ - before: new Date(Date.now() + 100000).toISOString(), - }); - expect(purged).toBe(1); - }); - - it("does not purge recent removals", async () => { - await writeText(files, "/root/a.txt", "hello"); - - scanner.addCollection({ - config: { collectionId: "docs", files, root: "/root" }, - }); - - await collectEvents(scanner.scan({ collectionId: "docs" })); - - await files.remove("/root/a.txt"); - await collectEvents(scanner.scan({ collectionId: "docs" })); - - // Purge with old threshold — should not remove - const purged = await scanner.cleanupRemoved({ - before: "1970-01-01T00:00:00.000Z", - }); - expect(purged).toBe(0); - }); - }); - - describe("scanAll", () => { - it("scans all registered collections", async () => { - const files2 = new MemFilesApi(); - - await 
writeText(files, "/root/a.txt", "hello"); - await writeText(files2, "/data/x.csv", "1,2,3"); - - scanner.addCollection({ - config: { collectionId: "docs", files, root: "/root" }, - }); - scanner.addCollection({ - config: { collectionId: "data", files: files2, root: "/data" }, - }); - - const events = await collectEvents(scanner.scanAll()); - const changed = ofType(events, "content-changed"); - expect(changed).toHaveLength(2); - - const collections = new Set(changed.map((e) => e.props?.collection)); - expect(collections.size).toBe(2); - }); - }); - - describe("filter option", () => { - it("skips filtered paths", async () => { - await writeText(files, "/root/.project/config.json", "{}"); - await writeText(files, "/root/readme.md", "hello"); - - scanner.addCollection({ - config: { collectionId: "docs", files, root: "/root" }, - }); - - const events = await collectEvents( - scanner.scan({ - collectionId: "docs", - options: { filter: (path) => !path.includes(".project") }, - }), - ); - - const changed = ofType(events, "content-changed"); - expect(changed).toHaveLength(1); - }); - }); - - describe("skipHash option", () => { - it("detects added files without hashing", async () => { - await writeText(files, "/root/a.txt", "hello"); - - scanner.addCollection({ - config: { collectionId: "docs", files, root: "/root" }, - }); - - const events = await collectEvents( - scanner.scan({ - collectionId: "docs", - options: { skipHash: true }, - }), - ); - - const changed = ofType(events, "content-changed"); - expect(changed).toHaveLength(1); - }); - - it("detects updates by size/lastModified without hashing", async () => { - await writeText(files, "/root/a.txt", "short"); - - scanner.addCollection({ - config: { collectionId: "docs", files, root: "/root" }, - }); - - await collectEvents( - scanner.scan({ - collectionId: "docs", - options: { skipHash: true }, - }), - ); - - await writeText(files, "/root/a.txt", "much longer content now"); - - const events = await collectEvents( - scanner.scan({ - collectionId: "docs", - options: { skipHash: true }, - }), - ); - const changed = ofType(events, "content-changed"); - expect(changed).toHaveLength(1); - }); - - it("reports no changes on re-scan of unchanged files", async () => { - await writeText(files, "/root/a.txt", "hello"); - - scanner.addCollection({ - config: { collectionId: "docs", files, root: "/root" }, - }); - - await collectEvents( - scanner.scan({ - collectionId: "docs", - options: { skipHash: true }, - }), - ); - - const events = await collectEvents( - scanner.scan({ - collectionId: "docs", - options: { skipHash: true }, - }), - ); - const changed = ofType(events, "content-changed"); - expect(changed).toHaveLength(0); - }); - }); - - describe("getChanges since timestamp", () => { - it("returns changes since a given time", async () => { - const before = new Date(Date.now() - 1000).toISOString(); - - await writeText(files, "/root/a.txt", "hello"); - - scanner.addCollection({ - config: { collectionId: "docs", files, root: "/root" }, - }); - - await collectEvents(scanner.scan({ collectionId: "docs" })); - - const events = await collectEvents( - scanner.getChanges({ collectionId: "docs", since: before }), - ); - expect(events.length).toBeGreaterThan(0); - }); - - it("returns empty for future timestamp", async () => { - await writeText(files, "/root/a.txt", "hello"); - - scanner.addCollection({ - config: { collectionId: "docs", files, root: "/root" }, - }); - - await collectEvents(scanner.scan({ collectionId: "docs" })); - - const events = await 
collectEvents( - scanner.getChanges({ - collectionId: "docs", - since: new Date(Date.now() + 100000).toISOString(), - }), - ); - expect(events).toHaveLength(0); - }); - }); -}); diff --git a/packages/content-scanner/tests/content-splitter-scanner.test.ts b/packages/content-scanner/tests/content-splitter-scanner.test.ts deleted file mode 100644 index 5af2aa4..0000000 --- a/packages/content-scanner/tests/content-splitter-scanner.test.ts +++ /dev/null @@ -1,97 +0,0 @@ -import { MemFilesApi } from "@statewalker/webrun-files-mem"; -import { decodeMsgpack } from "@statewalker/webrun-msgpack"; -import { collect as collectStream } from "@statewalker/webrun-streams"; -import { beforeEach, describe, expect, it } from "vitest"; -import { ContentSplitterScanner } from "../src/content-splitter-scanner.js"; -import { FilesScanRegistry } from "../src/files-scan-registry.js"; -import type { ScanStore } from "../src/scan-store.js"; -import type { UpdateSource } from "../src/scanner.js"; -import { collect, makeContentSource } from "./test-helpers.js"; - -describe("ContentSplitterScanner", () => { - let storeFiles: MemFilesApi; - let registry: FilesScanRegistry; - let store: ScanStore; - - beforeEach(async () => { - storeFiles = new MemFilesApi(); - registry = new FilesScanRegistry({ files: storeFiles, prefix: "scan" }); - store = await registry.createStore("chunks"); - }); - - it("splits content into chunks", async () => { - const longText = `# Title\n\n${"Some content paragraph. ".repeat(200)}`; - const scanner = new ContentSplitterScanner(store, { - chunkOptions: { targetChars: 500 }, - }); - - const source = makeContentSource([{ uri: "/doc.md", text: longText }]); - await collect(scanner.scan(source)); - - const stored = await collect(store.list()); - expect(stored).toHaveLength(1); - const entry = stored[0]; - expect(entry?.meta?.chunkCount).toBeGreaterThan(1); - expect(entry?.meta?.targetChars).toBe(500); - - // Verify chunks are serialized as msgpack stream - if (!entry?.content) throw new Error("expected content"); - const chunks = await collectStream( - decodeMsgpack<{ index: number; content: string }>(entry.content()), - ); - expect(chunks.length).toBeGreaterThan(1); - expect(chunks[0]?.index).toBe(0); - expect(typeof chunks[0]?.content).toBe("string"); - }); - - it("respects configurable chunk options", async () => { - const text = "Word ".repeat(1000); - const scanner = new ContentSplitterScanner(store, { - chunkOptions: { targetChars: 200 }, - }); - - const source = makeContentSource([{ uri: "/doc.md", text }]); - await collect(scanner.scan(source)); - - const stored = await collect(store.list()); - expect(stored[0]?.meta?.targetChars).toBe(200); - }); - - it("handles removal cascade", async () => { - const scanner = new ContentSplitterScanner(store, { - chunkOptions: { targetChars: 500 }, - }); - - // First split content - const text = `# Hello\n\n${"Content. 
".repeat(100)}`; - await collect(scanner.scan(makeContentSource([{ uri: "/doc.md", text }]))); - expect(await collect(store.list())).toHaveLength(1); - - // Then remove - const removeSource: UpdateSource = async function* () { - yield { - uri: "/doc.md", - stamp: new Date("2026-04-02T00:00:00Z"), - removed: new Date("2026-04-02T00:00:00Z"), - }; - }; - await collect(scanner.scan(removeSource)); - - const stored = await collect(store.list()); - const removed = stored.filter((s) => s.removed); - expect(removed).toHaveLength(1); - }); - - it("skips entries without content", async () => { - const scanner = new ContentSplitterScanner(store, { - chunkOptions: { targetChars: 500 }, - }); - - const source: UpdateSource = async function* () { - yield { uri: "/no-content.md", stamp: new Date() }; - }; - await collect(scanner.scan(source)); - - expect(await collect(store.list())).toHaveLength(0); - }); -}); diff --git a/packages/content-scanner/tests/content-vector-indexer-scanner.test.ts b/packages/content-scanner/tests/content-vector-indexer-scanner.test.ts deleted file mode 100644 index 4641d7e..0000000 --- a/packages/content-scanner/tests/content-vector-indexer-scanner.test.ts +++ /dev/null @@ -1,99 +0,0 @@ -import type { DocumentPath, Index, IndexedBlock, PathSelector } from "@statewalker/indexer-api"; -import { MemFilesApi } from "@statewalker/webrun-files-mem"; -import { beforeEach, describe, expect, it } from "vitest"; -import { ContentVectorIndexerScanner } from "../src/content-vector-indexer-scanner.js"; -import { FilesScanRegistry } from "../src/files-scan-registry.js"; -import type { ScanStore } from "../src/scan-store.js"; -import type { UpdateSource } from "../src/scanner.js"; -import { collect, makeEmbeddingsSource } from "./test-helpers.js"; - -function createMockIndex() { - const documents = new Map(); - const index: Partial = { - async addDocument(blocks: IndexedBlock[]) { - if (blocks.length === 0) return; - const path = blocks[0]?.path; - if (path) documents.set(path, blocks); - }, - async deleteDocuments(selectors: PathSelector[] | AsyncIterable) { - const sels = Array.isArray(selectors) ? 
selectors : await collect(selectors); - for (const sel of sels) { - documents.delete(sel.path); - } - }, - }; - return { index: index as Index, documents }; -} - -describe("ContentVectorIndexerScanner", () => { - let storeFiles: MemFilesApi; - let registry: FilesScanRegistry; - let store: ScanStore; - - beforeEach(async () => { - storeFiles = new MemFilesApi(); - registry = new FilesScanRegistry({ files: storeFiles, prefix: "scan" }); - store = await registry.createStore("vec-index"); - }); - - it("indexes embeddings into the Index", async () => { - const { index, documents } = createMockIndex(); - const scanner = new ContentVectorIndexerScanner(store, { index }); - - const emb1 = new Float32Array([0.1, 0.2, 0.3]); - const emb2 = new Float32Array([0.4, 0.5, 0.6]); - - await collect( - scanner.scan(makeEmbeddingsSource([{ uri: "/docs/readme.md", embeddings: [emb1, emb2] }])), - ); - - expect(documents.size).toBe(1); - const blocks = documents.get("/docs/readme.md" as DocumentPath); - expect(blocks).toHaveLength(2); - expect(blocks?.[0]?.embedding).toBeDefined(); - expect(blocks?.[0]?.embedding?.length).toBe(3); - }); - - it("removes documents from Index on cascade removal", async () => { - const { index, documents } = createMockIndex(); - const scanner = new ContentVectorIndexerScanner(store, { index }); - - const emb = new Float32Array([0.1, 0.2]); - await collect( - scanner.scan(makeEmbeddingsSource([{ uri: "/docs/readme.md", embeddings: [emb] }])), - ); - expect(documents.size).toBe(1); - - const removeSource: UpdateSource = async function* () { - yield { - uri: "/docs/readme.md", - stamp: new Date("2026-04-02T00:00:00Z"), - removed: new Date("2026-04-02T00:00:00Z"), - }; - }; - await collect(scanner.scan(removeSource)); - expect(documents.size).toBe(0); - }); - - it("uses same block ID pattern as FTS indexer", async () => { - const { index, documents } = createMockIndex(); - const scanner = new ContentVectorIndexerScanner(store, { index }); - - const emb1 = new Float32Array([0.1]); - const emb2 = new Float32Array([0.2]); - const emb3 = new Float32Array([0.3]); - - await collect( - scanner.scan( - makeEmbeddingsSource([{ uri: "/docs/readme.md", embeddings: [emb1, emb2, emb3] }]), - ), - ); - - const blocks = documents.get("/docs/readme.md" as DocumentPath); - expect(blocks?.map((b) => b.blockId)).toEqual([ - "/docs/readme.md:0", - "/docs/readme.md:1", - "/docs/readme.md:2", - ]); - }); -}); diff --git a/packages/content-scanner/tests/file-uri.test.ts b/packages/content-scanner/tests/file-uri.test.ts deleted file mode 100644 index 9935e56..0000000 --- a/packages/content-scanner/tests/file-uri.test.ts +++ /dev/null @@ -1,34 +0,0 @@ -import { describe, expect, it } from "vitest"; -import { encodeUri, parseUri } from "../src/file-uri.js"; - -describe("file-uri", () => { - describe("encodeUri", () => { - it("produces collectionId:path format", () => { - expect(encodeUri("docs", "/readme.md")).toBe("docs:/readme.md"); - }); - - it("handles empty path", () => { - expect(encodeUri("col", "")).toBe("col:"); - }); - }); - - describe("parseUri", () => { - it("round-trips with encodeUri", () => { - const uri = encodeUri("myCol", "/some/file.txt"); - const parsed = parseUri(uri); - expect(parsed.collectionId).toBe("myCol"); - expect(parsed.path).toBe("/some/file.txt"); - }); - - it("handles paths containing colons", () => { - const uri = encodeUri("col", "/path/to:file:with:colons.txt"); - const parsed = parseUri(uri); - expect(parsed.collectionId).toBe("col"); - 
expect(parsed.path).toBe("/path/to:file:with:colons.txt"); - }); - - it("throws on URI without colon", () => { - expect(() => parseUri("nocolon")).toThrow("Invalid file URI"); - }); - }); -}); diff --git a/packages/content-scanner/tests/files-scanner.test.ts b/packages/content-scanner/tests/files-scanner.test.ts deleted file mode 100644 index 16d9eef..0000000 --- a/packages/content-scanner/tests/files-scanner.test.ts +++ /dev/null @@ -1,137 +0,0 @@ -import { writeText } from "@statewalker/webrun-files"; -import { MemFilesApi } from "@statewalker/webrun-files-mem"; -import { beforeEach, describe, expect, it } from "vitest"; -import { FilesScanRegistry } from "../src/files-scan-registry.js"; -import { FilesScanner } from "../src/files-scanner.js"; -import type { ScanStore } from "../src/scan-store.js"; -import type { ScannerEvent } from "../src/scanner.js"; -import { collect } from "./test-helpers.js"; - -describe("FilesScanner", () => { - let contentFiles: MemFilesApi; - let storeFiles: MemFilesApi; - let registry: FilesScanRegistry; - let store: ScanStore; - - beforeEach(async () => { - contentFiles = new MemFilesApi(); - storeFiles = new MemFilesApi(); - registry = new FilesScanRegistry({ files: storeFiles, prefix: "scan" }); - store = await registry.createStore("files"); - - // Create some test files - await writeText(contentFiles, "/project/readme.md", "# Hello"); - await writeText(contentFiles, "/project/src/index.ts", "export {}"); - await writeText(contentFiles, "/project/.git/config", "gitconfig"); - }); - - it("detects new files on first scan", async () => { - const scanner = new FilesScanner(store, { - files: contentFiles, - root: "/project", - filter: (p) => !p.includes("/.git/"), - }); - - const events = await collect(scanner.scan()); - const processed = events.filter( - (e): e is Extract<ScannerEvent, { type: "entry-processed" }> => e.type === "entry-processed", - ); - expect(processed).toHaveLength(2); - - const stored = await collect(store.list()); - expect(stored).toHaveLength(2); - const uris = stored.map((s) => s.uri).sort(); - expect(uris.some((u) => u.includes("readme.md"))).toBe(true); - expect(uris.some((u) => u.includes("index.ts"))).toBe(true); - }); - - it("detects file modifications on re-scan", async () => { - const scanner = new FilesScanner(store, { - files: contentFiles, - root: "/project", - filter: (p) => !p.includes("/.git/"), - }); - - // First scan - await collect(scanner.scan()); - - // Modify a file - await writeText(contentFiles, "/project/readme.md", "# Updated"); - - // Second scan - await collect(scanner.scan()); - - const stored = await collect(store.list()); - expect(stored).toHaveLength(2); - }); - - it("detects file removal", async () => { - const scanner = new FilesScanner(store, { - files: contentFiles, - root: "/project", - filter: (p) => !p.includes("/.git/"), - }); - - // First scan - await collect(scanner.scan()); - expect(await collect(store.list())).toHaveLength(2); - - // Remove a file - await contentFiles.remove("/project/readme.md"); - - // Second scan - await collect(scanner.scan()); - - const stored = await collect(store.list()); - const removed = stored.filter((s) => s.removed); - expect(removed).toHaveLength(1); - expect(removed.some((r) => r.uri.includes("readme.md"))).toBe(true); - }); - - it("respects filter function", async () => { - const scanner = new FilesScanner(store, { - files: contentFiles, - root: "/project", - filter: (p) => p.endsWith(".ts"), - }); - - await collect(scanner.scan()); - - const stored = await collect(store.list()); -
expect(stored).toHaveLength(1); - expect(stored.some((s) => s.uri.includes("index.ts"))).toBe(true); - }); - - it("stores file metadata (size, lastModified, hash)", async () => { - const scanner = new FilesScanner(store, { - files: contentFiles, - root: "/project", - filter: (p) => p.includes("readme.md"), - }); - - await collect(scanner.scan()); - - const stored = await collect(store.list()); - expect(stored).toHaveLength(1); - const meta = stored[0]?.meta; - expect(meta).toBeDefined(); - expect(typeof meta?.size).toBe("number"); - expect(typeof meta?.hash).toBe("string"); - expect((meta?.hash as string).length).toBeGreaterThan(0); - }); - - it("skipHash mode detects changes by size/mtime only", async () => { - const scanner = new FilesScanner(store, { - files: contentFiles, - root: "/project", - filter: (p) => p.includes("readme.md"), - skipHash: true, - }); - - await collect(scanner.scan()); - - const stored = await collect(store.list()); - expect(stored).toHaveLength(1); - expect(stored[0]?.meta?.hash).toBe(""); - }); -}); diff --git a/packages/content-scanner/tests/scan-events.test.ts b/packages/content-scanner/tests/scan-events.test.ts deleted file mode 100644 index 790d736..0000000 --- a/packages/content-scanner/tests/scan-events.test.ts +++ /dev/null @@ -1,71 +0,0 @@ -import { describe, expect, it } from "vitest"; -import { createScanEvent } from "../src/scan-events.js"; - -describe("createScanEvent", () => { - it("returns a ContentSection with correct props", () => { - const event = createScanEvent({ - type: "scan-started", - collectionId: "docs", - }); - - expect(event.props).toBeDefined(); - expect(event.props?.role).toBe("tool:content-scanner"); - expect(event.props?.stage).toBe("scanning"); - expect(event.props?.type).toBe("scan-started"); - expect(event.props?.collection).toBe("docs"); - expect(event.blocks).toEqual([]); - }); - - it("generates an id that is a valid hex string", () => { - const event = createScanEvent({ - type: "content-changed", - collectionId: "docs", - }); - - const id = event.props?.id; - expect(id).toBeDefined(); - expect(id).toMatch(/^[0-9A-HJKMNP-TV-Z]{13}$/); // Crockford base32 - }); - - it("generates a valid ISO 8601 time", () => { - const event = createScanEvent({ - type: "scan-done", - collectionId: "docs", - }); - - const time = event.props?.time; - expect(time).toBeDefined(); - // Verify it parses as a valid date - const parsed = new Date(time!); - expect(parsed.toISOString()).toBe(time); - }); - - it("type matches input", () => { - for (const type of [ - "scan-started", - "content-changed", - "content-removed", - "scan-done", - ] as const) { - const event = createScanEvent({ type, collectionId: "c1" }); - expect(event.props?.type).toBe(type); - } - }); - - it("includes uri when provided", () => { - const event = createScanEvent({ - type: "content-changed", - collectionId: "docs", - uri: "docs:/root/file.txt", - }); - expect(event.props?.uri).toBe("docs:/root/file.txt"); - }); - - it("omits uri when not provided", () => { - const event = createScanEvent({ - type: "scan-started", - collectionId: "docs", - }); - expect(event.props?.uri).toBeUndefined(); - }); -}); diff --git a/packages/content-scanner/tests/scan-registry.test.ts b/packages/content-scanner/tests/scan-registry.test.ts deleted file mode 100644 index f31f24c..0000000 --- a/packages/content-scanner/tests/scan-registry.test.ts +++ /dev/null @@ -1,115 +0,0 @@ -import { MemFilesApi } from "@statewalker/webrun-files-mem"; -import { beforeEach, describe, expect, it } from "vitest"; -import 
{ FilesScanRegistry } from "../src/files-scan-registry.js"; -import type { Update } from "../src/scan-store.js"; -import { collect } from "./test-helpers.js"; - -describe("FilesScanRegistry", () => { - let files: MemFilesApi; - let registry: FilesScanRegistry; - - beforeEach(() => { - files = new MemFilesApi(); - registry = new FilesScanRegistry({ files, prefix: "scan" }); - }); - - describe("createStore", () => { - it("creates a new store", async () => { - const store = await registry.createStore("files"); - expect(store.name).toBe("files"); - }); - - it("throws on duplicate name", async () => { - await registry.createStore("files"); - await expect(registry.createStore("files")).rejects.toThrow("Store already exists"); - }); - }); - - describe("getStore", () => { - it("returns existing store", async () => { - await registry.createStore("files"); - const store = await registry.getStore("files"); - expect(store).not.toBeNull(); - expect(store?.name).toBe("files"); - }); - - it("returns null for non-existent store", async () => { - const store = await registry.getStore("unknown"); - expect(store).toBeNull(); - }); - }); - - describe("hasStore", () => { - it("returns true for existing store", async () => { - await registry.createStore("files"); - expect(await registry.hasStore("files")).toBe(true); - }); - - it("returns false for non-existent store", async () => { - expect(await registry.hasStore("unknown")).toBe(false); - }); - }); - - describe("getStoreNames", () => { - it("returns empty array initially", async () => { - expect(await registry.getStoreNames()).toEqual([]); - }); - - it("returns all store names", async () => { - await registry.createStore("files"); - await registry.createStore("content"); - await registry.createStore("chunks"); - const names = await registry.getStoreNames(); - expect(names.sort()).toEqual(["chunks", "content", "files"]); - }); - }); - - describe("deleteStore", () => { - it("removes the store and its data", async () => { - const store = await registry.createStore("files"); - const update: Update = { - uri: "/a.txt", - stamp: new Date("2026-04-01T00:00:00Z"), - }; - await collect(store.store([update])); - - await registry.deleteStore("files"); - expect(await registry.hasStore("files")).toBe(false); - expect(await registry.getStore("files")).toBeNull(); - }); - - it("throws for non-existent store", async () => { - await expect(registry.deleteStore("unknown")).rejects.toThrow("Store not found"); - }); - }); - - describe("persistence", () => { - it("stores survive registry re-creation", async () => { - await registry.createStore("files"); - await registry.createStore("content"); - - const registry2 = new FilesScanRegistry({ files, prefix: "scan" }); - const names = await registry2.getStoreNames(); - expect(names.sort()).toEqual(["content", "files"]); - }); - - it("store data survives registry re-creation", async () => { - const store = await registry.createStore("files"); - const update: Update = { - uri: "/a.txt", - stamp: new Date("2026-04-01T00:00:00Z"), - meta: { size: 100 }, - }; - await collect(store.store([update])); - - const registry2 = new FilesScanRegistry({ files, prefix: "scan" }); - const store2 = await registry2.getStore("files"); - expect(store2).not.toBeNull(); - if (!store2) throw new Error("expected store"); - const results = await collect(store2.list()); - expect(results).toHaveLength(1); - expect(results[0]?.uri).toBe("/a.txt"); - expect(results[0]?.meta).toEqual({ size: 100 }); - }); - }); -}); diff --git 
a/packages/content-scanner/tests/scan-store.test.ts b/packages/content-scanner/tests/scan-store.test.ts deleted file mode 100644 index 0d59aef..0000000 --- a/packages/content-scanner/tests/scan-store.test.ts +++ /dev/null @@ -1,307 +0,0 @@ -import { MemFilesApi } from "@statewalker/webrun-files-mem"; -import { beforeEach, describe, expect, it } from "vitest"; -import { FilesScanStore } from "../src/files-scan-store.js"; -import type { Update } from "../src/scan-store.js"; -import { at, collect, contentOf, makeUpdate } from "./test-helpers.js"; - -describe("FilesScanStore", () => { - let files: MemFilesApi; - let store: FilesScanStore; - - beforeEach(() => { - files = new MemFilesApi(); - store = new FilesScanStore("test", files, "scan/test"); - }); - - describe("store + list round-trip", () => { - it("stores and retrieves entries", async () => { - const input = makeUpdate({ uri: "/a.txt" }); - await collect(store.store([input])); - - const results = await collect(store.list()); - expect(results).toHaveLength(1); - const entry = at(results, 0); - expect(entry.uri).toBe("/a.txt"); - expect(entry.stamp.getTime()).toBe(input.stamp.getTime()); - }); - - it("stores multiple entries", async () => { - const inputs = [ - makeUpdate({ uri: "/a.txt" }), - makeUpdate({ uri: "/b.txt" }), - makeUpdate({ uri: "/c.txt" }), - ]; - await collect(store.store(inputs)); - - const results = await collect(store.list()); - expect(results).toHaveLength(3); - }); - - it("preserves metadata", async () => { - const input = makeUpdate({ - uri: "/a.txt", - meta: { size: 100, format: "markdown" }, - }); - await collect(store.store([input])); - - const results = await collect(store.list()); - expect(at(results, 0).meta).toEqual({ size: 100, format: "markdown" }); - }); - - it("yields stored entries back", async () => { - const input = makeUpdate({ uri: "/a.txt" }); - const yielded = await collect(store.store([input])); - expect(yielded).toHaveLength(1); - expect(at(yielded, 0).uri).toBe("/a.txt"); - }); - }); - - describe("binary content", () => { - it("stores and lazily reads binary content", async () => { - const bytes = new TextEncoder().encode("hello world"); - const input: Update = { - uri: "/a.txt", - stamp: new Date("2026-04-01T00:00:00Z"), - async *content() { - yield bytes; - }, - }; - await collect(store.store([input])); - - const entry = at(await collect(store.list()), 0); - expect(entry.content).toBeDefined(); - const chunks = await collect(contentOf(entry)); - const text = new TextDecoder().decode(at(chunks, 0)); - expect(text).toBe("hello world"); - }); - - it("content() is callable multiple times", async () => { - const bytes = new TextEncoder().encode("data"); - const input: Update = { - uri: "/a.txt", - stamp: new Date("2026-04-01T00:00:00Z"), - async *content() { - yield bytes; - }, - }; - await collect(store.store([input])); - - const entry = at(await collect(store.list()), 0); - const firstRead = await collect(contentOf(entry)); - const secondRead = await collect(contentOf(entry)); - expect(firstRead).toHaveLength(1); - expect(secondRead).toHaveLength(1); - }); - - it("entries without content have empty content generator", async () => { - await collect(store.store([makeUpdate({ uri: "/a.txt" })])); - const entry = at(await collect(store.list()), 0); - const chunks = await collect(contentOf(entry)); - expect(chunks).toHaveLength(0); - }); - }); - - describe("Date serialization round-trip", () => { - it("stamp is deserialized as Date", async () => { - const stamp = new Date("2026-04-01T12:30:00.000Z"); 
- await collect(store.store([makeUpdate({ uri: "/a.txt", stamp })])); - - const entry = at(await collect(store.list()), 0); - expect(entry.stamp).toBeInstanceOf(Date); - expect(entry.stamp.getTime()).toBe(stamp.getTime()); - }); - - it("removed timestamp is deserialized as Date", async () => { - const removed = new Date("2026-04-02T00:00:00Z"); - await collect(store.store([makeUpdate({ uri: "/a.txt", removed })])); - - const entry = at(await collect(store.list()), 0); - expect(entry.removed).toBeInstanceOf(Date); - expect(entry.removed?.getTime()).toBe(removed.getTime()); - }); - }); - - describe("list filtering", () => { - beforeEach(async () => { - const entries = [ - makeUpdate({ - uri: "/docs/readme.md", - stamp: new Date("2026-04-01T00:00:00Z"), - }), - makeUpdate({ - uri: "/docs/guide.md", - stamp: new Date("2026-04-02T00:00:00Z"), - }), - makeUpdate({ - uri: "/src/index.ts", - stamp: new Date("2026-04-01T00:00:00Z"), - }), - makeUpdate({ - uri: "/src/utils.ts", - stamp: new Date("2026-04-03T00:00:00Z"), - }), - ]; - await collect(store.store(entries)); - }); - - it("filters by URI prefix", async () => { - const results = await collect(store.list({ uri: "/docs/*" })); - expect(results).toHaveLength(2); - expect(results.every((r) => r.uri.startsWith("/docs/"))).toBe(true); - }); - - it("filters by exact URI", async () => { - const results = await collect(store.list({ uri: "/docs/readme.md" })); - expect(results).toHaveLength(1); - expect(at(results, 0).uri).toBe("/docs/readme.md"); - }); - - it("filters by include exact stamp", async () => { - const results = await collect(store.list({ include: new Date("2026-04-01T00:00:00Z") })); - expect(results).toHaveLength(2); - }); - - it("filters by exclude exact stamp", async () => { - const results = await collect(store.list({ exclude: new Date("2026-04-01T00:00:00Z") })); - expect(results).toHaveLength(2); - const t = new Date("2026-04-01T00:00:00Z").getTime(); - expect(results.every((r) => r.stamp.getTime() !== t)).toBe(true); - }); - - it("filters by include stamp range", async () => { - const results = await collect( - store.list({ - include: [new Date("2026-04-01T00:00:00Z"), new Date("2026-04-02T00:00:00Z")], - }), - ); - expect(results).toHaveLength(3); - }); - - it("filters by exclude stamp range", async () => { - const results = await collect( - store.list({ - exclude: [new Date("2026-04-01T00:00:00Z"), new Date("2026-04-02T00:00:00Z")], - }), - ); - expect(results).toHaveLength(1); - expect(at(results, 0).uri).toBe("/src/utils.ts"); - }); - - it("combines URI and stamp filters", async () => { - const results = await collect( - store.list({ - uri: "/docs/*", - include: new Date("2026-04-01T00:00:00Z"), - }), - ); - expect(results).toHaveLength(1); - expect(at(results, 0).uri).toBe("/docs/readme.md"); - }); - }); - - describe("soft delete", () => { - it("marks entries as removed", async () => { - await collect(store.store([makeUpdate({ uri: "/a.txt" })])); - const removed = await collect(store.remove({ uri: "/a.txt" })); - expect(removed).toHaveLength(1); - expect(at(removed, 0).removed).toBeInstanceOf(Date); - }); - - it("soft-deleted entries are visible in list()", async () => { - await collect(store.store([makeUpdate({ uri: "/a.txt" })])); - await collect(store.remove({ uri: "/a.txt" })); - - const results = await collect(store.list()); - expect(results).toHaveLength(1); - expect(at(results, 0).removed).toBeDefined(); - }); - - it("does not re-remove already removed entries", async () => { - await collect(store.store([makeUpdate({ 
uri: "/a.txt" })])); - await collect(store.remove({ uri: "/a.txt" })); - const secondRemove = await collect(store.remove({ uri: "/a.txt" })); - expect(secondRemove).toHaveLength(0); - }); - - it("removes by URI prefix", async () => { - await collect( - store.store([ - makeUpdate({ uri: "/docs/a.md" }), - makeUpdate({ uri: "/docs/b.md" }), - makeUpdate({ uri: "/src/c.ts" }), - ]), - ); - const removed = await collect(store.remove({ uri: "/docs/*" })); - expect(removed).toHaveLength(2); - - const results = await collect(store.list()); - const active = results.filter((r) => !r.removed); - expect(active).toHaveLength(1); - expect(at(active, 0).uri).toBe("/src/c.ts"); - }); - }); - - describe("prune", () => { - it("physically deletes old soft-removed entries", async () => { - await collect( - store.store([ - makeUpdate({ - uri: "/old.txt", - removed: new Date("2025-01-01T00:00:00Z"), - }), - makeUpdate({ - uri: "/recent.txt", - removed: new Date("2026-06-01T00:00:00Z"), - }), - makeUpdate({ uri: "/active.txt" }), - ]), - ); - - const count = await store.prune(new Date("2026-01-01T00:00:00Z")); - expect(count).toBe(1); - - const results = await collect(store.list()); - expect(results).toHaveLength(2); - expect(results.map((r) => r.uri).sort()).toEqual(["/active.txt", "/recent.txt"]); - }); - }); - - describe("lastScan", () => { - it("returns null initially", async () => { - expect(await store.getLastScan()).toBeNull(); - }); - - it("persists and retrieves lastScan", async () => { - const stamp = new Date("2026-04-04T12:00:00Z"); - await store.setLastScan(stamp); - const result = await store.getLastScan(); - expect(result).toBeInstanceOf(Date); - expect(result?.getTime()).toBe(stamp.getTime()); - }); - - it("survives re-creation with same files", async () => { - const stamp = new Date("2026-04-04T12:00:00Z"); - await store.setLastScan(stamp); - - const store2 = new FilesScanStore("test", files, "scan/test"); - const result = await store2.getLastScan(); - expect(result?.getTime()).toBe(stamp.getTime()); - }); - }); - - describe("rebuildIndex", () => { - it("reconstructs index from entry files", async () => { - await collect(store.store([makeUpdate({ uri: "/a.txt" }), makeUpdate({ uri: "/b.txt" })])); - - // Corrupt the index by removing it - await files.remove("scan/test/_index.json"); - - // Rebuild - const store2 = new FilesScanStore("test", files, "scan/test"); - await store2.rebuildIndex(); - - const results = await collect(store2.list()); - expect(results).toHaveLength(2); - }); - }); -}); diff --git a/packages/content-scanner/tests/scanner.test.ts b/packages/content-scanner/tests/scanner.test.ts deleted file mode 100644 index 213a5d4..0000000 --- a/packages/content-scanner/tests/scanner.test.ts +++ /dev/null @@ -1,223 +0,0 @@ -import { MemFilesApi } from "@statewalker/webrun-files-mem"; -import { beforeEach, describe, expect, it } from "vitest"; -import { FilesScanRegistry } from "../src/files-scan-registry.js"; -import type { ScanStore, Update } from "../src/scan-store.js"; -import { Scanner } from "../src/scanner.js"; -import { collect, makeSource } from "./test-helpers.js"; - -/** Concrete test scanner that uppercases content metadata. 
*/ -class TestScanner extends Scanner { - removedUris: string[] = []; - - async processEntry(upstream: Update): Promise<Update | null> { - const text = upstream.meta?.text; - if (typeof text !== "string") return null; - return { - uri: upstream.uri, - stamp: upstream.stamp, - meta: { text: text.toUpperCase() }, - }; - } - - async removeEntry(uri: string): Promise<void> { - this.removedUris.push(uri); - } -} - -/** Concrete test scanner that throws on specific URIs. */ -class FailingScanner extends Scanner { - async processEntry(upstream: Update): Promise<Update | null> { - if (upstream.uri === "/fail.txt") { - throw new Error("processing failed"); - } - return { uri: upstream.uri, stamp: upstream.stamp }; - } - - async removeEntry(_uri: string): Promise<void> {} -} - -describe("Scanner", () => { - let files: MemFilesApi; - let registry: FilesScanRegistry; - let store: ScanStore; - - beforeEach(async () => { - files = new MemFilesApi(); - registry = new FilesScanRegistry({ files, prefix: "scan" }); - store = await registry.createStore("test"); - }); - - describe("scan processes upstream entries", () => { - it("calls processEntry for each upstream entry", async () => { - const scanner = new TestScanner(store); - const source = makeSource([ - { - uri: "/a.txt", - stamp: new Date("2026-04-01T00:00:00Z"), - meta: { text: "hello" }, - }, - { - uri: "/b.txt", - stamp: new Date("2026-04-01T00:00:00Z"), - meta: { text: "world" }, - }, - ]); - - await collect(scanner.scan(source)); - - const stored = await collect(store.list()); - expect(stored).toHaveLength(2); - const uris = stored.map((s) => s.uri).sort(); - expect(uris).toEqual(["/a.txt", "/b.txt"]); - - const a = stored.find((s) => s.uri === "/a.txt"); - expect(a?.meta?.text).toBe("HELLO"); - }); - - it("skips entries when processEntry returns null", async () => { - const scanner = new TestScanner(store); - const source = makeSource([ - { - uri: "/a.txt", - stamp: new Date("2026-04-01T00:00:00Z"), - meta: { text: "hello" }, - }, - { - uri: "/b.txt", - stamp: new Date("2026-04-01T00:00:00Z"), - meta: { noText: true }, - }, - ]); - - await collect(scanner.scan(source)); - - const stored = await collect(store.list()); - expect(stored).toHaveLength(1); - expect(stored.find((s) => s.uri === "/a.txt")).toBeDefined(); - }); - }); - - describe("scan handles removals", () => { - it("calls removeEntry for soft-deleted upstream entries", async () => { - const scanner = new TestScanner(store); - - // First store something - const source1 = makeSource([ - { - uri: "/a.txt", - stamp: new Date("2026-04-01T00:00:00Z"), - meta: { text: "hello" }, - }, - ]); - await collect(scanner.scan(source1)); - - // Then send removal - const source2 = makeSource([ - { - uri: "/a.txt", - stamp: new Date("2026-04-02T00:00:00Z"), - removed: new Date("2026-04-02T00:00:00Z"), - }, - ]); - await collect(scanner.scan(source2)); - - expect(scanner.removedUris).toEqual(["/a.txt"]); - }); - }); - - describe("scan yields events", () => { - it("yields scan-started and scan-done", async () => { - const scanner = new TestScanner(store); - const events = await collect(scanner.scan(makeSource([]))); - - expect(events).toHaveLength(2); - expect(events[0]?.type).toBe("scan-started"); - expect(events[1]?.type).toBe("scan-done"); - }); - - it("yields entry-processed for each processed entry", async () => { - const scanner = new TestScanner(store); - const source = makeSource([ - { - uri: "/a.txt", - stamp: new Date("2026-04-01T00:00:00Z"), - meta: { text: "hi" }, - }, - ]); - - const events = await collect(scanner.scan(source)); -
const processed = events.filter((e) => e.type === "entry-processed"); - expect(processed).toHaveLength(1); - expect((processed[0] as { uri: string }).uri).toBe("/a.txt"); - }); - - it("yields entry-removed for soft-deleted entries", async () => { - const scanner = new TestScanner(store); - const source = makeSource([{ uri: "/a.txt", stamp: new Date(), removed: new Date() }]); - - const events = await collect(scanner.scan(source)); - const removed = events.filter((e) => e.type === "entry-removed"); - expect(removed).toHaveLength(1); - }); - - it("yields batch-done after batchSize entries", async () => { - const scanner = new TestScanner(store, { batchSize: 2 }); - const entries: Update[] = []; - for (let i = 0; i < 5; i++) { - entries.push({ - uri: `/file-${i}.txt`, - stamp: new Date("2026-04-01T00:00:00Z"), - meta: { text: `content-${i}` }, - }); - } - - const events = await collect(scanner.scan(makeSource(entries))); - const batches = events.filter((e) => e.type === "batch-done"); - expect(batches).toHaveLength(2); // at 2 and 4 - }); - }); - - describe("error handling", () => { - it("continues processing after entry error", async () => { - const scanner = new FailingScanner(store); - const source = makeSource([ - { uri: "/ok.txt", stamp: new Date("2026-04-01T00:00:00Z") }, - { uri: "/fail.txt", stamp: new Date("2026-04-01T00:00:00Z") }, - { uri: "/ok2.txt", stamp: new Date("2026-04-01T00:00:00Z") }, - ]); - - const events = await collect(scanner.scan(source)); - const errors = events.filter((e) => e.type === "entry-error"); - const processed = events.filter((e) => e.type === "entry-processed"); - - expect(errors).toHaveLength(1); - expect(processed).toHaveLength(2); - }); - - it("includes error count in scan-done stats", async () => { - const scanner = new FailingScanner(store); - const source = makeSource([ - { uri: "/fail.txt", stamp: new Date("2026-04-01T00:00:00Z") }, - { uri: "/ok.txt", stamp: new Date("2026-04-01T00:00:00Z") }, - ]); - - const events = await collect(scanner.scan(source)); - const done = events.find((e) => e.type === "scan-done"); - expect(done?.type).toBe("scan-done"); - if (done?.type === "scan-done") { - expect(done.stats.errors).toBe(1); - expect(done.stats.processed).toBe(1); - } - }); - }); - - describe("lastScan update", () => { - it("updates store lastScan after scan completes", async () => { - const scanner = new TestScanner(store); - await collect(scanner.scan(makeSource([]))); - - const lastScan = await store.getLastScan(); - expect(lastScan).toBeInstanceOf(Date); - }); - }); -}); diff --git a/packages/content-scanner/tests/test-helpers.ts b/packages/content-scanner/tests/test-helpers.ts deleted file mode 100644 index 48252bd..0000000 --- a/packages/content-scanner/tests/test-helpers.ts +++ /dev/null @@ -1,94 +0,0 @@ -import { encodeFloat32Arrays, encodeMsgpack } from "@statewalker/webrun-msgpack"; -import type { Update } from "../src/scan-store.js"; -import type { UpdateSource } from "../src/scanner.js"; - -/** Collect all items from an async iterable into an array. */ -export async function collect<T>(gen: AsyncIterable<T>): Promise<T[]> { - const items: T[] = []; - for await (const item of gen) items.push(item); - return items; -} - -/** Get an element by index, throwing if undefined. */ -export function at<T>(arr: T[], index: number): T { - const item = arr[index]; - if (item === undefined) throw new Error(`no element at index ${index}`); - return item; -} - -/** Invoke content() on an Update, throwing if content is absent.
*/ -export function contentOf(update: Update): AsyncGenerator<Uint8Array> { - if (!update.content) throw new Error("expected content"); - return update.content(); -} - -/** Create a default Update with optional overrides. */ -export function makeUpdate(overrides: Partial<Update> = {}): Update { - return { - uri: "/docs/file.txt", - stamp: new Date("2026-04-01T00:00:00Z"), - ...overrides, - }; -} - -/** Create an UpdateSource from a plain array of Updates. */ -export function makeSource(entries: Update[]): UpdateSource { - return function* () { - yield* entries; - }; -} - -const DEFAULT_STAMP = new Date("2026-04-01T00:00:00Z"); - -/** Create an UpdateSource that yields entries with text as binary content. */ -export function makeContentSource(entries: Array<{ uri: string; text: string }>): UpdateSource { - return async function* () { - for (const e of entries) { - const encoded = new TextEncoder().encode(e.text); - yield { - uri: e.uri, - stamp: DEFAULT_STAMP, - async *content() { - yield encoded; - }, - }; - } - }; -} - -/** Create an UpdateSource that yields entries with chunks as msgpack stream. */ -export function makeChunksSource( - entries: Array<{ - uri: string; - chunks: Array<{ index: number; content: string }>; - }>, -): UpdateSource { - return async function* () { - for (const e of entries) { - yield { - uri: e.uri, - stamp: DEFAULT_STAMP, - content: () => encodeMsgpack(toAsync(e.chunks)), - }; - } - }; -} - -/** Create an UpdateSource that yields entries with embeddings as Float32Array stream. */ -export function makeEmbeddingsSource( - entries: Array<{ uri: string; embeddings: Float32Array[] }>, -): UpdateSource { - return async function* () { - for (const e of entries) { - yield { - uri: e.uri, - stamp: DEFAULT_STAMP, - content: () => encodeFloat32Arrays(toAsync(e.embeddings)), - }; - } - }; -} - -async function* toAsync<T>(items: T[]): AsyncGenerator<T> { - for (const item of items) yield item; -} diff --git a/packages/content-scanner/tests/tracking-store.test.ts b/packages/content-scanner/tests/tracking-store.test.ts deleted file mode 100644 index b845c1b..0000000 --- a/packages/content-scanner/tests/tracking-store.test.ts +++ /dev/null @@ -1,180 +0,0 @@ -import { MemFilesApi } from "@statewalker/webrun-files-mem"; -import { beforeEach, describe, expect, it } from "vitest"; -import { TrackingStore } from "../src/tracking-store.js"; -import type { FileMetadata } from "../src/types.js"; - -function makeMeta(overrides: Partial<FileMetadata> = {}): FileMetadata { - return { - uri: "col1:/root/file.txt", - collectionId: "col1", - path: "/root/file.txt", - hash: "abc123", - size: 100, - lastModified: 1000, - scanTime: "2026-01-01T00:00:00.000Z", - removalTime: null, - ...overrides, - }; -} - -describe("TrackingStore", () => { - let files: MemFilesApi; - let store: TrackingStore; - - beforeEach(() => { - files = new MemFilesApi(); - store = new TrackingStore(files, "cs"); - }); - - describe("pathFor", () => { - it("produces correct format with two-char prefix directory", async () => { - const path = await store.pathFor("col1:/root/file.txt"); - expect(path).toMatch(/^cs\/tracking\/[0-9a-f]{2}\/[0-9a-f]+\.json$/); - }); - }); - - describe("get", () => { - it("returns undefined for non-existent URI", async () => { - const result = await store.get({ uri: "col1:/no-such-file" }); - expect(result).toBeUndefined(); - }); - }); - - describe("set + get round-trip", () => { - it("stores and retrieves metadata", async () => { - const meta = makeMeta(); - await store.set({ metadata: meta }); - const result = await store.get({ uri:
meta.uri }); - expect(result).toEqual(meta); - }); - }); - - describe("delete", () => { - it("removes an entry", async () => { - const meta = makeMeta(); - await store.set({ metadata: meta }); - const deleted = await store.delete({ uri: meta.uri }); - expect(deleted).toBe(true); - const result = await store.get({ uri: meta.uri }); - expect(result).toBeUndefined(); - }); - - it("returns false for non-existent URI", async () => { - const deleted = await store.delete({ uri: "col1:/nothing" }); - expect(deleted).toBe(false); - }); - }); - - describe("listAll", () => { - it("iterates all entries", async () => { - await store.set({ - metadata: makeMeta({ uri: "col1:/a.txt", path: "/a.txt" }), - }); - await store.set({ - metadata: makeMeta({ uri: "col1:/b.txt", path: "/b.txt" }), - }); - await store.set({ - metadata: makeMeta({ - uri: "col2:/c.txt", - path: "/c.txt", - collectionId: "col2", - }), - }); - - const all: FileMetadata[] = []; - for await (const meta of store.listAll()) { - all.push(meta); - } - expect(all).toHaveLength(3); - }); - }); - - describe("listByCollection", () => { - it("filters by collection ID", async () => { - await store.set({ - metadata: makeMeta({ uri: "col1:/a.txt", path: "/a.txt" }), - }); - await store.set({ - metadata: makeMeta({ - uri: "col2:/b.txt", - path: "/b.txt", - collectionId: "col2", - }), - }); - - const col1: FileMetadata[] = []; - for await (const meta of store.listByCollection({ - collectionId: "col1", - })) { - col1.push(meta); - } - expect(col1).toHaveLength(1); - expect(col1[0]?.collectionId).toBe("col1"); - }); - }); - - describe("deleteByCollection", () => { - it("removes all entries for a collection", async () => { - await store.set({ - metadata: makeMeta({ uri: "col1:/a.txt", path: "/a.txt" }), - }); - await store.set({ - metadata: makeMeta({ uri: "col1:/b.txt", path: "/b.txt" }), - }); - await store.set({ - metadata: makeMeta({ - uri: "col2:/c.txt", - path: "/c.txt", - collectionId: "col2", - }), - }); - - const count = await store.deleteByCollection({ collectionId: "col1" }); - expect(count).toBe(2); - - const remaining: FileMetadata[] = []; - for await (const meta of store.listAll()) { - remaining.push(meta); - } - expect(remaining).toHaveLength(1); - expect(remaining[0]?.collectionId).toBe("col2"); - }); - }); - - describe("deleteRemovedBefore", () => { - it("only removes entries marked as removed before the threshold", async () => { - await store.set({ - metadata: makeMeta({ - uri: "col1:/old.txt", - path: "/old.txt", - removalTime: "2025-01-01T00:00:00.000Z", - }), - }); - await store.set({ - metadata: makeMeta({ - uri: "col1:/recent.txt", - path: "/recent.txt", - removalTime: "2026-06-01T00:00:00.000Z", - }), - }); - await store.set({ - metadata: makeMeta({ - uri: "col1:/active.txt", - path: "/active.txt", - removalTime: null, - }), - }); - - const count = await store.deleteRemovedBefore({ - before: "2026-01-01T00:00:00.000Z", - }); - expect(count).toBe(1); - - const remaining: FileMetadata[] = []; - for await (const meta of store.listAll()) { - remaining.push(meta); - } - expect(remaining).toHaveLength(2); - }); - }); -}); diff --git a/packages/content-scanner/tsconfig.json b/packages/content-scanner/tsconfig.json deleted file mode 100644 index 6dbcc68..0000000 --- a/packages/content-scanner/tsconfig.json +++ /dev/null @@ -1,27 +0,0 @@ -{ - "compilerOptions": { - "target": "ES2022", - "module": "Preserve", - "moduleResolution": "Bundler", - "lib": ["ESNext"], - "strict": true, - "skipLibCheck": true, - "verbatimModuleSyntax": true, - 
"resolveJsonModule": true, - "esModuleInterop": true, - "forceConsistentCasingInFileNames": true, - "isolatedModules": true, - "noUnusedLocals": true, - "noUnusedParameters": true, - "noImplicitReturns": true, - "noFallthroughCasesInSwitch": true, - "noUncheckedIndexedAccess": true, - "resolvePackageJsonExports": true, - "declaration": true, - "declarationMap": true, - "sourceMap": true, - "noEmit": true - }, - "include": ["./src", "./tests"], - "exclude": ["node_modules", "dist"] -}