diff --git a/.changeset/linear-cache-and-cwd.md b/.changeset/linear-cache-and-cwd.md
new file mode 100644
index 0000000..49e20ef
--- /dev/null
+++ b/.changeset/linear-cache-and-cwd.md
@@ -0,0 +1,36 @@
+---
+"@maastrich/hashup": minor
+---
+
+Linear-memory cache + `--cwd` CLI flag.
+
+**Linear-memory cache.** `HashupCache.hashes` now stores each file's own
+sha256 content hash (one 64-char string) instead of the flattened
+transitive hash list. The transitive contribution is reconstructed at
+combine time by walking `cache.deps`. Memory drops from
+O(files × avg closure) to O(unique files) — on a real-world run that
+previously needed 9 GB of heap, peak RSS is now ~125 MB and wall time
+drops from minutes to ~1 s.
+
+**Hash output changes.** The final digest is now
+`sha256(concat of each reachable file's content hash, sorted by path)`.
+Each unique file contributes exactly once regardless of how many import
+paths reach it. Any stored 0.6.x hashes must be re-baselined. As a
+welcome side effect: cycles now hash the same regardless of which
+member was the entry point.
+
+**`--cwd <dir>` CLI flag.** Run `hashup` as if invoked from the given
+directory. Changes where `hashup.json` is discovered, where relative
+entry/extras paths resolve, and where `--out` writes. Defaults to
+`process.cwd()`.
+
+```bash
+hashup --cwd ./packages/app
+hashup --cwd ./packages/app src/index.ts -o ../dist/app.hash
+```
+
+**Targeted break for direct `hashFile` callers.** Return type is now
+`Promise<string | null>` (the file's own hash, or `null` on failure)
+instead of `Promise<string[]>`. Callers should use `collectReachable`
+to enumerate the transitive set and read each file's hash from
+`cache.hashes` at combine time. `hashup()` itself is unchanged.
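+
+A minimal migration sketch for such callers (the entry path and variable
+names are illustrative, not part of the API):
+
+```ts
+import { collectReachable, createHashupCache, createResolver, hashFile } from "@maastrich/hashup";
+
+const cache = createHashupCache();
+const resolver = createResolver();
+const entry = "./src/index.ts";
+
+// 0.6.x: `await hashFile(entry, cache, resolver)` returned string[].
+// Now it returns the file's own hash (or null) and fills the cache;
+// enumerate the closure yourself and read the self hashes back out.
+await hashFile(entry, cache, resolver);
+const files = collectReachable([entry], cache).sort();
+const hashes = files.flatMap((f) => cache.hashes.get(f) ?? []);
+```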
diff --git a/docs/api/hashup.md b/docs/api/hashup.md
index 8461ee7..ef39c1a 100644
--- a/docs/api/hashup.md
+++ b/docs/api/hashup.md
@@ -6,6 +6,8 @@ function hashup(entryFile: string, options?: HashupOptions): Promise<HashupResult>;
+  hashes: Map<string, string>;
   deps: Map<string, string[]>;
 }
@@ -55,12 +55,14 @@ function collectReachable(roots: readonly string[], cache: HashupCache): string[];
 ```
 
 An in-memory cache scoped to one consumer's lifetime — not persisted,
-not shared across processes. `hashes` stores each file's flattened
-hash list; `deps` stores each file's direct resolved dependency paths.
-Pass the same `HashupCache` to multiple `hashup()` or `hashFile()` calls
-to dedupe work. `collectReachable` walks `deps` iteratively to rebuild
-a per-call file list (used internally by `hashup()` to produce
-`result.files`).
+not shared across processes. `hashes` stores each file's own content
+hash (one 64-char sha256 string per file); `deps` stores each file's
+direct resolved dependency paths. Memory is linear in the number of
+unique files. Pass the same `HashupCache` to multiple `hashup()` or
+`hashFile()` calls to dedupe work. `collectReachable` walks `deps`
+iteratively (no recursion) to enumerate the transitive closure — used
+internally by `hashup()` to produce `result.files` and to fold each
+file's content hash into the final digest.
 
 ## hashFile
 
@@ -70,15 +72,18 @@
 function hashFile(
   file: string,
   cache: HashupCache,
   resolver: Resolver,
   logger?: Logger,
-): Promise<string[]>;
+): Promise<string | null>;
 ```
 
-Hashes a file and all its transitive static imports. Results are memoized in
-`cache` — pass the same `HashupCache` across multiple calls to dedupe work.
-On error (file read or parse failure) the failure is sent through
-`logger.warn` and an empty array is returned. `logger` defaults to a silent
-logger; build one with [`createLogger`](#createlogger) when you want
-diagnostics on stderr.
+Hashes a file and recursively populates `cache.hashes` and `cache.deps`
+for every non-`node_modules` transitive import. Returns the file's own
+content hash on success, or `null` if the file could not be read or
+parsed. The transitive contribution is reconstructed at combine time by
+walking `cache.deps` — `hashFile` never returns the flattened list.
+Results are memoized in `cache` — pass the same `HashupCache` across
+multiple calls to dedupe work. `logger` defaults to a silent logger;
+build one with [`createLogger`](#createlogger) when you want diagnostics
+on stderr.
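+
+A sketch of the new contract, assuming `cache`, `resolver`, and `logger`
+are set up as in the sections above (the path is illustrative):
+
+```ts
+const selfHash = await hashFile("./src/index.ts", cache, resolver, logger);
+if (selfHash === null) {
+  // Read/parse failure: already reported via logger.warn, so skip this file.
+} else {
+  // 64-char sha256 of the file's own bytes; the transitive closure now
+  // lives in cache.deps, so walk it with collectReachable when combining.
+  console.log(selfHash);
+}
+```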
 
 Imports that resolve into `node_modules` are treated as opaque: the resolved
 path is skipped, its files are never read, and the dependency's own imports
@@ -132,17 +137,23 @@
 and hashing the result. Order-sensitive — pass hashes in a stable order.
 
 ## Composing Your Own Pipeline
 
 ```ts
-import { combineHashes, createHashupCache, createResolver, hashFile } from "@maastrich/hashup";
+import {
+  collectReachable,
+  combineHashes,
+  createHashupCache,
+  createResolver,
+  hashFile,
+} from "@maastrich/hashup";
 
 const resolver = createResolver();
 const cache = createHashupCache();
 const entries = ["./src/a.ts", "./src/b.ts"];
 
-const allHashes: string[] = [];
-
 for (const entry of entries) {
-  allHashes.push(...(await hashFile(entry, cache, resolver)));
+  await hashFile(entry, cache, resolver);
 }
 
-const combined = combineHashes(allHashes);
+const files = collectReachable(entries, cache).sort();
+const selfHashes = files
+  .map((f) => cache.hashes.get(f))
+  .filter((h): h is string => h !== undefined);
+const combined = combineHashes(selfHashes);
 ```
diff --git a/docs/guide/cli.md b/docs/guide/cli.md
index fdebc6f..ac4a2c1 100644
--- a/docs/guide/cli.md
+++ b/docs/guide/cli.md
@@ -24,6 +24,9 @@ hashup src/index.ts
 
 Prints the hash of `src/index.ts` and its transitive import graph. Flags:
 
 - `-e, --extra <file>` — include an additional file in the hash (repeatable)
+- `--cwd <dir>` — run as if invoked from this directory. Changes where
+  `hashup.json` is discovered and where relative paths resolve. Defaults
+  to `process.cwd()` (see the example after this list).
 - `-b, --base-dir <dir>` — base directory for resolution (default: cwd)
 - `--json` — emit `{ "hash": "…" }` instead of plain text
 - `--files` — include the resolved file list in the JSON output
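+
+For example, hashing a workspace package from the repository root:
+
+```bash
+hashup --cwd ./packages/app src/index.ts --json
+```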
diff --git a/docs/guide/how-it-works.md b/docs/guide/how-it-works.md
index b355e99..ed379be 100644
--- a/docs/guide/how-it-works.md
+++ b/docs/guide/how-it-works.md
@@ -11,8 +11,10 @@ conditional exports, and extension resolution.
 3. **Hash each file's content** (SHA-256). Results are cached per absolute
    path so a file reachable through multiple paths is hashed once.
-4. **Combine all hashes** — the entry's graph plus any `extras` — into a single
-   deterministic SHA-256 digest.
+4. **Combine the unique file hashes**, in sorted-path order, into a single
+   SHA-256 digest. Every file in the transitive closure contributes exactly
+   once, regardless of how many import paths reach it — memory stays linear
+   in the number of unique files, independent of graph width or diamond
+   count (see the sketch below).
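+
+A sketch of step 4 in isolation (`node:crypto` stands in for the
+internals; the exact concatenation details are illustrative):
+
+```ts
+import { createHash } from "node:crypto";
+
+// Illustrative inputs: path → sha256(content) for each unique reachable file.
+const hashes = new Map<string, string>([
+  ["/src/b.ts", "bb…"],
+  ["/src/a.ts", "aa…"],
+]);
+
+// Sorted-path order, one contribution per file, one sha256 over the lot.
+const selfHashes = [...hashes.keys()].sort().map((f) => hashes.get(f)!);
+const finalHash = createHash("sha256").update(selfHashes.join("")).digest("hex");
+```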
 
 ## Determinism
 
@@ -60,8 +62,7 @@ top-level `"logLevel"` field. The CLI flag wins when both are set.
 
 ## Caveats
 
-- **Circular imports** terminate deterministically, but the exact hash of a
-  cycle depends on which member was the entry point — the cache is seeded
-  with the entry's content hash first, so cycle re-visits return that
-  placeholder. Entering the same cycle from a different file produces a
-  different (still deterministic) hash.
+- **Circular imports** terminate deterministically. The cache is seeded with
+  the file's own content hash before recursing, and each unique file
+  contributes exactly once to the final digest, so entering the same
+  cycle from any of its members produces the same hash.
diff --git a/src/cli/main.ts b/src/cli/main.ts
index 0aaf605..9ae6f67 100644
--- a/src/cli/main.ts
+++ b/src/cli/main.ts
@@ -1,3 +1,4 @@
+import { resolve } from "node:path";
 import { configJsonSchema } from "../config/json-schema.js";
 import { die } from "./die.js";
 import { parseCliArgs } from "./parse-args.js";
@@ -19,8 +20,14 @@ export async function main(argv: string[]): Promise<void> {
     return;
   }
 
+  // --cwd is resolved against the real process.cwd() so that relative
+  // values on the command line behave predictably. Everything else
+  // (config path, baseDir, output path) resolves against this effective
+  // cwd, letting a single `--cwd ./packages/app` move the whole run.
+  const cwd = args.cwd !== undefined ? resolve(process.cwd(), args.cwd) : process.cwd();
+
   if (args.printSchema) {
-    await writeOutput(process.cwd(), args.out, `${JSON.stringify(configJsonSchema, null, 2)}\n`);
+    await writeOutput(cwd, args.out, `${JSON.stringify(configJsonSchema, null, 2)}\n`);
     return;
   }
 
@@ -30,7 +37,7 @@ export async function main(argv: string[]): Promise<void> {
 
   if (args.positionals.length === 1) {
     const output = await runSingleFileMode({
-      cwd: process.cwd(),
+      cwd,
       file: args.positionals[0]!,
       extras: args.extras,
       baseDirOverride: args.baseDir,
@@ -38,12 +45,12 @@ export async function main(argv: string[]): Promise<void> {
       files: args.files,
       logLevel: args.logLevel,
     });
-    await writeOutput(process.cwd(), args.out, output);
+    await writeOutput(cwd, args.out, output);
     return;
   }
 
   const result = await runConfigMode({
-    cwd: process.cwd(),
+    cwd,
     configPath: args.config,
     baseDirOverride: args.baseDir,
     json: args.json,
@@ -53,5 +60,5 @@ export async function main(argv: string[]): Promise<void> {
   if (!result.ok) {
     die(result.error);
   }
-  await writeOutput(process.cwd(), args.out, result.output);
+  await writeOutput(cwd, args.out, result.output);
 }
diff --git a/src/cli/parse-args.ts b/src/cli/parse-args.ts
index 35133dc..8754af8 100644
--- a/src/cli/parse-args.ts
+++ b/src/cli/parse-args.ts
@@ -5,6 +5,7 @@ export interface CliArgs {
   config: string | undefined;
   extras: string[];
   baseDir: string | undefined;
+  cwd: string | undefined;
   json: boolean;
   files: boolean;
   help: boolean;
@@ -28,6 +29,7 @@ export function parseCliArgs(argv: string[]): CliArgs {
       "print-schema": { type: "boolean", default: false },
       out: { type: "string", short: "o" },
       "log-level": { type: "string", short: "l" },
+      cwd: { type: "string" },
     },
   });
 
@@ -42,6 +44,7 @@ export function parseCliArgs(argv: string[]): CliArgs {
     config: values.config as string | undefined,
     extras: (values.extra as string[] | undefined) ?? [],
     baseDir: values["base-dir"] as string | undefined,
+    cwd: values.cwd as string | undefined,
    json: values.json === true,
     files: values.files === true,
     help: values.help === true,
diff --git a/src/cli/usage.ts b/src/cli/usage.ts
index 7643c4e..3f90c37 100644
--- a/src/cli/usage.ts
+++ b/src/cli/usage.ts
@@ -5,6 +5,7 @@ export const USAGE = `Usage:
 
 Options:
   -c, --config <path>     Path to config file (default: hashup.json)
   -e, --extra <file>      Extra file to include (repeatable, single-file mode)
+  --cwd <dir>             Run as if from this directory (default: process.cwd())
   -b, --base-dir <dir>    Base directory for resolution (default: cwd)
   --json                  Output JSON instead of plain text
   --files                 Include resolved file list in JSON output
diff --git a/src/lib/cache.ts b/src/lib/cache.ts
index 98d71c0..5bf2b54 100644
--- a/src/lib/cache.ts
+++ b/src/lib/cache.ts
@@ -7,14 +7,16 @@
  * computation (the file's content hash is recomputed at most once).
  *
  * Two parallel maps keyed by absolute file path:
- * - `hashes`: the flattened hash list (self + transitive deps).
- *   Returned directly to callers and combined into the final digest.
- * - `deps`: the file's direct resolved dependency paths. Used by
- *   `collectReachable` to rebuild the per-call file list without
- *   re-walking the graph.
+ * - `hashes`: the file's own content hash (sha256 of its bytes).
+ *   One 64-char string per file — not a flattened transitive list,
+ *   because that was O(files × avg closure) and blew out the heap
+ *   on large monorepos. See `hashup()` for how the transitive
+ *   contribution is reconstructed at combine time.
+ * - `deps`: the file's direct resolved dependency paths. Walked by
+ *   `collectReachable` to enumerate the transitive closure.
  */
 export interface HashupCache {
-  hashes: Map<string, string[]>;
+  hashes: Map<string, string>;
   deps: Map<string, string[]>;
 }
diff --git a/src/lib/hash-file.ts b/src/lib/hash-file.ts
index 71b0de0..efc1ad1 100644
--- a/src/lib/hash-file.ts
+++ b/src/lib/hash-file.ts
@@ -4,54 +4,58 @@ import { createContentHash } from "./create-content-hash.js";
 import { extractImports } from "./extract-imports.js";
 import { isInNodeModules } from "./is-in-node-modules.js";
 import { createLogger, type Logger } from "./logger.js";
-import { pushAll } from "./push-all.js";
 import { readFileContent } from "./read-file-content.js";
 import { resolveImport } from "./resolve-import.js";
 
+/**
+ * Ensure `file` and every file reachable from it are present in the
+ * cache. Returns the file's own content hash (sha256 hex) on success,
+ * or `null` if the file could not be read or parsed — in which case
+ * callers should skip it. The transitive contribution is reconstructed
+ * at combine time by walking `cache.deps`.
+ *
+ * Terminates deterministically on circular imports: the cache entry is
+ * seeded with the self hash before recursing, so a cycle A → B → A
+ * short-circuits on the revisit.
+ */
 export async function hashFile(
   file: string,
   cache: HashupCache,
   resolver: Resolver,
   logger: Logger = createLogger("silent"),
-): Promise<string[]> {
+): Promise<string | null> {
   const cached = cache.hashes.get(file);
-  if (cached) {
+  if (cached !== undefined) {
     return cached;
   }
 
   try {
     const content = await readFileContent(file);
-    const hashes = [createContentHash(content)];
+    const selfHash = createContentHash(content);
     const deps: string[] = [];
 
-    // Seed both caches before recursing so circular imports terminate:
-    // on a cycle A → B → A, the revisit of A hits `cache.hashes` and
-    // returns the placeholder instead of walking forever.
-    cache.hashes.set(file, hashes);
+    cache.hashes.set(file, selfHash);
     cache.deps.set(file, deps);
 
     const imports = await extractImports(file, content);
-    const dependencyHashes = await hashDependencies(imports, file, cache, resolver, logger, deps);
-    pushAll(hashes, dependencyHashes);
+    await walkDependencies(imports, file, cache, resolver, logger, deps);
 
-    return hashes;
+    return selfHash;
   } catch (error) {
     logger.warn(`Failed to hash file ${file}:`, error);
     cache.hashes.delete(file);
     cache.deps.delete(file);
-    return [];
+    return null;
   }
 }
 
-async function hashDependencies(
+async function walkDependencies(
   imports: string[],
   sourceFile: string,
   cache: HashupCache,
   resolver: Resolver,
   logger: Logger,
   deps: string[],
-): Promise<string[]> {
-  const hashes: string[] = [];
-
+): Promise<void> {
   for (const imported of imports) {
     const resolved = await resolveImport(resolver, sourceFile, imported);
     if (!resolved) continue;
@@ -65,9 +69,6 @@
       continue;
     }
     deps.push(resolved);
-    const resolvedHashes = await hashFile(resolved, cache, resolver, logger);
-    pushAll(hashes, resolvedHashes);
+    await hashFile(resolved, cache, resolver, logger);
   }
-
-  return hashes;
 }
diff --git a/src/lib/hashup.ts b/src/lib/hashup.ts
index 47f03ff..05b7aa0 100644
--- a/src/lib/hashup.ts
+++ b/src/lib/hashup.ts
@@ -5,7 +5,6 @@ import { combineHashes } from "./combine-hashes.js";
 import { createResolver } from "./create-resolver.js";
 import { hashFile } from "./hash-file.js";
 import { createLogger, type LogLevel } from "./logger.js";
-import { pushAll } from "./push-all.js";
 
 export interface HashupOptions {
   /**
@@ -71,6 +70,11 @@ export interface HashupResult {
  * treated as opaque and skipped — add a lockfile to `extras` if you
  * want install-tree changes reflected in the hash.
  *
+ * The hash is `sha256` over the concatenation of each reachable file's
+ * own content hash, in sorted-path order. Each file contributes exactly
+ * once regardless of how many import paths reach it, which keeps memory
+ * usage linear in the number of unique files.
+ *
 * @param entryFile - The entry file to hash
 * @param options - Optional configuration
 * @returns The deterministic hash and list of included files
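+ * @example
+ * ```ts
+ * // Paths illustrative; `extras` folds files such as a lockfile into the digest.
+ * const { hash, files } = await hashup("./src/index.ts", {
+ *   extras: ["./package-lock.json"],
+ * });
+ * ```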
@@ -115,26 +119,25 @@ export async function hashup(
   const logger = createLogger(logLevel);
   const resolvedEntry = resolve(baseDir, entryFile);
 
-  const entryHashes = await hashFile(resolvedEntry, cache, resolver, logger);
+  await hashFile(resolvedEntry, cache, resolver, logger);
 
-  const extraHashes: string[] = [];
   const resolvedExtras: string[] = [];
   for (const extraFile of extras) {
     const resolvedExtra = resolve(baseDir, extraFile);
     resolvedExtras.push(resolvedExtra);
-    const hashes = await hashFile(resolvedExtra, cache, resolver, logger);
-    pushAll(extraHashes, hashes);
+    await hashFile(resolvedExtra, cache, resolver, logger);
   }
 
-  const combined: string[] = [];
-  pushAll(combined, entryHashes);
-  pushAll(combined, extraHashes);
-  const finalHash = combineHashes(combined);
+  // Reconstruct the transitive contribution by walking `cache.deps`
+  // from this call's roots. Each file contributes exactly once; sort
+  // by path so the combined hash is independent of traversal order.
+  const files = collectReachable([resolvedEntry, ...resolvedExtras], cache).sort();
 
-  // `files` is the transitive closure of this call's roots — entry +
-  // extras — regardless of whether individual files were already in
-  // the shared cache. Walks the `deps` map, which is cheap.
-  const files = collectReachable([resolvedEntry, ...resolvedExtras], cache);
+  const selfHashes: string[] = [];
+  for (let i = 0; i < files.length; i++) {
+    const h = cache.hashes.get(files[i] as string);
+    if (h !== undefined) selfHashes.push(h);
+  }
 
-  return { hash: finalHash, files };
+  return { hash: combineHashes(selfHashes), files };
 }
diff --git a/tests/circular.test.ts b/tests/circular.test.ts
index 580d265..35a48a7 100644
--- a/tests/circular.test.ts
+++ b/tests/circular.test.ts
@@ -18,11 +18,10 @@ describe("hashup with circular imports", () => {
     expect(r1.files).toEqual(r2.files);
   });
 
-  test("should produce the same hash regardless of which cycle member is the entry", async () => {
+  test("produces the same hash from either cycle member", async () => {
     const fromA = await hashup("./tests/fixtures/circular/a.ts");
     const fromB = await hashup("./tests/fixtures/circular/b.ts");
 
-    expect(fromA.hash).toMatch(/^[a-f0-9]{64}$/);
-    expect(fromB.hash).toMatch(/^[a-f0-9]{64}$/);
+    expect(fromA.hash).toBe(fromB.hash);
   });
 });
diff --git a/tests/cli/parse-args.test.ts b/tests/cli/parse-args.test.ts
index 24baae0..3200048 100644
--- a/tests/cli/parse-args.test.ts
+++ b/tests/cli/parse-args.test.ts
@@ -72,4 +72,9 @@ describe("parseCliArgs", () => {
   test("rejects invalid --log-level", () => {
     expect(() => parseCliArgs(["--log-level", "trace"])).toThrow(/Invalid --log-level/);
   });
+
+  test("parses --cwd", () => {
+    expect(parseCliArgs([]).cwd).toBeUndefined();
+    expect(parseCliArgs(["--cwd", "./packages/app"]).cwd).toBe("./packages/app");
+  });
 });
diff --git a/tests/examples.test.ts b/tests/examples.test.ts
index b2df23a..7c7b464 100644
--- a/tests/examples.test.ts
+++ b/tests/examples.test.ts
@@ -105,7 +105,7 @@ describe("hashup with example files", () => {
     const result = await hashup("./examples/src/index.ts");
 
     expect(result.hash).toMatchInlineSnapshot(
-      `"48adf62a70c2645d0fc15ee3060973245af5dc30a542372791a7e1f05eaeacf6"`,
+      `"ed1c4758b6b759306f2b44feee0bbc2d06291ae490d97367043ab188ce670770"`,
     );
   });
 });
diff --git a/tests/shared-cache.test.ts b/tests/shared-cache.test.ts
index a1211cd..fd3b6d1 100644
--- a/tests/shared-cache.test.ts
+++ b/tests/shared-cache.test.ts
@@ -85,7 +85,7 @@ describe("collectReachable", () => {
     const cache = createHashupCache();
     const N = 50_000;
     for (let i = 0; i < N; i++) {
-      cache.hashes.set(`/f${i}`, ["x"]);
+      cache.hashes.set(`/f${i}`, "x");
       cache.deps.set(`/f${i}`, i + 1 < N ? [`/f${i + 1}`] : []);
     }
     const files = collectReachable(["/f0"], cache);