From 66c46df9a5dc60ce2141ce54e47708c55d0d2288 Mon Sep 17 00:00:00 2001 From: Lebing Xie Date: Fri, 12 Jun 2026 10:27:42 +0800 Subject: [PATCH] add eval artifact cleanup --- doc/summary.md | 7 +- package.json | 1 + src/eval/README.md | 33 +- src/eval/cleanup-cli.ts | 127 ++++++ src/eval/core/runner.ts | 14 +- src/eval/core/temp-cleanup.test.ts | 143 +++++++ src/eval/core/temp-cleanup.ts | 364 ++++++++++++++++++ .../learn-claude-code/full-tool-runtime.ts | 12 + .../learn-claude-code/in-process-driver.ts | 1 + .../drivers/learn-claude-code/team-driver.ts | 19 +- src/eval/runner.test.ts | 10 +- 11 files changed, 724 insertions(+), 7 deletions(-) create mode 100644 src/eval/cleanup-cli.ts create mode 100644 src/eval/core/temp-cleanup.test.ts create mode 100644 src/eval/core/temp-cleanup.ts diff --git a/doc/summary.md b/doc/summary.md index 02c2b70..13e2641 100644 --- a/doc/summary.md +++ b/doc/summary.md @@ -10,11 +10,12 @@ GitHub: https://github.com/pingp76/swoopcode ## 当前状态 -**已完成阶段**: 基础 REPL + LLM 对话 + bash 工具调用 + 文件操作工具 + 消息标准化 + TODO 任务管理 + 子智能体(SubAgent)+ Skill(技能)系统 + LLM 通信日志 + 上下文压缩 + 权限管理 + Hook 机制 + Memory(长期记忆)+ **Prompt Cache 友好的请求布局** + LLM 错误恢复 + ProjectContext + Session/Transcript 原始事件流 + 持久化 Task 任务系统 + Async Run 非阻塞运行实例 + **Schedule 定时运行系统** + OutputStore 输出句柄 + 安全精确编辑 + 时间语义收口 + Runtime Hardening Round A(原子写与日志轮转)+ 教学注释增强(实现路径注释补齐)+ **PDD-16:模型适配与 Agent Runtime Policy 抽象层**(Provider Profile + Foundation Model Profile + Runtime Policy + LLM Adapter + Context Budget + Stable Context Manager + ContextRanker + RepoClassifier + TaskIntentClassifier)+ **PDD-17:Eval Harness 基础框架**(Eval Core、deterministic suite、real core tools、CLI driver)+ **PDD-18:Replay/Live/Judge/Full-tools Eval**(replay、live smoke、judge/report、live regression、full-tools live E2E)+ **PDD-19:MCP 与 Agent Team Eval Harness Prototype**(prototype suites 默认 skipped,避免误读为生产能力)+ **网页版教程雏形**(`tutorial/` 静态站点 + 第 00/01 章 + `web/temp/2/` 风格三栏阅读布局)+ **公开版 PDD 整理**(`doc/pdd-01-*.md` 到 `doc/pdd-19-*.md`,保留原始 PDD 深度,旧 refactor 工作记录已合并回对应 PDD) +**已完成阶段**: 基础 REPL + LLM 对话 + bash 工具调用 + 文件操作工具 + 消息标准化 + TODO 任务管理 + 子智能体(SubAgent)+ Skill(技能)系统 + LLM 通信日志 + 上下文压缩 + 权限管理 + Hook 机制 + Memory(长期记忆)+ **Prompt Cache 友好的请求布局** + LLM 错误恢复 + ProjectContext + Session/Transcript 原始事件流 + 持久化 Task 任务系统 + Async Run 非阻塞运行实例 + **Schedule 定时运行系统** + OutputStore 输出句柄 + 安全精确编辑 + 时间语义收口 + Runtime Hardening Round A(原子写与日志轮转)+ 教学注释增强(实现路径注释补齐)+ **PDD-16:模型适配与 Agent Runtime Policy 抽象层**(Provider Profile + Foundation Model Profile + Runtime Policy + LLM Adapter + Context Budget + Stable Context Manager + ContextRanker + RepoClassifier + TaskIntentClassifier)+ **PDD-17:Eval Harness 基础框架**(Eval Core、deterministic suite、real core tools、CLI driver)+ **PDD-18:Replay/Live/Judge/Full-tools Eval**(replay、live smoke、judge/report、live regression、full-tools live E2E)+ **PDD-19:MCP 与 Agent Team Eval Harness Prototype**(prototype suites 默认 skipped,避免误读为生产能力)+ **Eval 临时产物 TTL 清理**(失败保留 manifest + 白名单 GC CLI + trace 文件清理)+ **网页版教程雏形**(`tutorial/` 静态站点 + 第 00/01 章 + `web/temp/2/` 风格三栏阅读布局)+ **公开版 PDD 整理**(`doc/pdd-01-*.md` 到 `doc/pdd-19-*.md`,保留原始 PDD 深度,旧 refactor 工作记录已合并回对应 PDD) - **PDD-17 Eval Harness 基础能力**:Eval Core + Deterministic Suite + Real Core Tools + CLI Driver。 - **PDD-18 Eval 回归能力**:Replay + Live Smoke + Judge/Report;Live Regression — Core Tools;Full-tools Live E2E。 - **PDD-19 Eval Prototype 边界**:MCP fixture server + MCP runtime adapter + MCP trace/assertions;顺序 supervisor Team driver + Team trace/assertions;由于项目尚未实现生产级 MCP runtime / 真实 Agent Team runtime,相关 MCP/Team 测试当前全部 `describe.skip`。 +- **Eval 临时产物清理**:默认通过 case 结束即删除 workspace;`keepOnFailure` 保留失败 workspace / agentHome 时写入 `.eval-artifact.json`;`npm run eval:cleanup` 按白名单前缀和 TTL 清理 OS tmpdir 中的 eval 残留,并清理过期 `*.trace.json`。 ## 网页版教程站点雏形 @@ -150,7 +151,8 @@ src/ │ │ ├── trace.ts # TraceRecorder、RuntimeEvent │ │ ├── assertions.ts # portable + instrumented assertion 执行器 │ │ ├── runner.ts # runEvalCase/runEvalSuite 核心 runner -│ │ └── trace-writer.ts # JSON trace 输出 +│ │ ├── trace-writer.ts # JSON trace 输出 +│ │ └── temp-cleanup.ts # Eval 临时产物 TTL manifest 与白名单清理 │ ├── drivers/ │ │ ├── learn-claude-code/ │ │ │ ├── in-process-driver.ts # 当前项目 createAgent() driver @@ -195,6 +197,7 @@ src/ │ ├── replay/ │ │ └── replay-llm.ts # Replay LLM client │ ├── runner.test.ts # core + in-process driver 集成测试 +│ ├── cleanup-cli.ts # Eval 临时产物清理 CLI │ └── README.md # Eval 系统使用文档 skills/ ├── code-review/ diff --git a/package.json b/package.json index b8c4733..99d5ad3 100644 --- a/package.json +++ b/package.json @@ -41,6 +41,7 @@ "test:eval:live:team": "vitest run src/eval/live/live-team-suite.test.ts", "test:eval:live:team:mcp": "vitest run src/eval/live/live-team-suite.test.ts", "test:eval:judge": "vitest run src/eval/judge/judge-suite.test.ts", + "eval:cleanup": "tsx src/eval/cleanup-cli.ts", "typecheck": "tsc --noEmit", "lint": "eslint src/", "format": "prettier --write \"src/**/*.ts\"", diff --git a/src/eval/README.md b/src/eval/README.md index 6641968..ad51fb3 100644 --- a/src/eval/README.md +++ b/src/eval/README.md @@ -10,6 +10,12 @@ npm run test:eval # 运行所有 eval 相关测试(含 runner 集成测试) npx vitest run src/eval/ + +# 清理过期 eval 临时产物(默认 7 天) +npm run eval:cleanup + +# 先预览会删除什么 +npm run eval:cleanup -- --dry-run ``` ## 设计原则 @@ -17,7 +23,7 @@ npx vitest run src/eval/ - **确定性**:所有 case 使用 scripted LLM,不依赖真实模型,确保任何环境都能稳定通过 - **可移植**:Eval Core 不直接依赖当前项目内部模块(agent.ts、llm.ts 等),只认识 `CodingAgentDriver` 接口 - **可观测**:通过 instrumented assertions 验证工具调用、权限确认等内部行为 -- **隔离性**:每个 case 在独立临时 workspace 中运行,自动清理 +- **隔离性**:每个 case 在独立临时 workspace 中运行,默认自动清理;失败调试产物通过 TTL manifest 和 `npm run eval:cleanup` 定期回收 ## Case 结构 @@ -200,6 +206,31 @@ EVAL_TRACE_DIR=./eval-traces npm run test:eval Trace 文件包含:case 信息、步骤痕迹、runtime events、断言结果。 +## 临时产物清理 + +Eval 会创建三类临时产物: + +1. **workspace**:每个 case 的隔离工作目录,默认通过后立即删除 +2. **agentHome**:full-tools / team eval 的临时持久化根目录,保存 Memory、Skill、Task、Schedule、Output 等状态 +3. **trace**:开启 `trace.enabled` 或 `EVAL_TRACE_DIR` 后写出的 `*.trace.json` + +当 case 设置 `workspace.keepOnFailure: true` 且运行失败时,runner 会保留 workspace;full-tools / team driver 也会同步保留自己的临时 `agentHome`。这些目录会写入 `.eval-artifact.json`,记录 `caseId`、`createdAt` 和 `expiresAt`,便于后续清理。 + +定期清理命令: + +```bash +# 删除默认 OS tmpdir 下超过 7 天的 eval 产物 +npm run eval:cleanup + +# CI 中常用:删除超过 24 小时的残留 +npm run eval:cleanup -- --older-than 24h + +# 本地接入前先预览 +npm run eval:cleanup -- --dry-run +``` + +清理器只扫描白名单前缀(如 `eval-`、`learn-claude-eval-home-`、`learn-claude-team-home-` 等),不会递归扫描任意临时目录。`eval-traces` 目录中只删除过期的 `*.trace.json` 文件,避免误删手工放入的其他说明文件。 + ## 编写 Core Tool Case 的注意事项 1. **Scripted LLM Responses**:每个 tool call 需要至少 2 个 responses diff --git a/src/eval/cleanup-cli.ts b/src/eval/cleanup-cli.ts new file mode 100644 index 0000000..a1e9f62 --- /dev/null +++ b/src/eval/cleanup-cli.ts @@ -0,0 +1,127 @@ +/** + * cleanup-cli.ts — Eval 临时产物清理命令 + * + * 职责:把 temp-cleanup.ts 暴露成 npm 脚本入口,便于本地和 CI 定期执行。 + * + * 用法: + * - npm run eval:cleanup + * - npm run eval:cleanup -- --older-than 24h + * - npm run eval:cleanup -- --dry-run + */ + +import { fileURLToPath } from "node:url"; +import { tmpdir } from "node:os"; +import { + cleanupEvalArtifacts, + DEFAULT_EVAL_ARTIFACT_TTL_MS, + parseEvalCleanupDuration, +} from "./core/temp-cleanup.js"; + +interface CleanupCliOptions { + rootDir: string; + olderThanMs: number; + dryRun: boolean; +} + +async function main(argv: string[]): Promise { + const options = parseArgs(argv); + const result = await cleanupEvalArtifacts(options); + + console.log( + [ + `Eval cleanup root: ${result.rootDir}`, + `Mode: ${result.dryRun ? "dry-run" : "delete"}`, + `Scanned: ${result.scanned}`, + `Deleted: ${result.deleted.length}`, + `Kept: ${result.kept.length}`, + `Errors: ${result.errors.length}`, + ].join("\n"), + ); + + for (const entry of result.deleted) { + console.log(`[deleted] ${entry.path} (${entry.reason})`); + } + for (const error of result.errors) { + console.error(`[error] ${error.path}: ${error.message}`); + } + + if (result.errors.length > 0) { + process.exitCode = 1; + } +} + +function parseArgs(argv: string[]): CleanupCliOptions { + const options: CleanupCliOptions = { + rootDir: process.env["EVAL_TEMP_ROOT"] ?? tmpdir(), + olderThanMs: DEFAULT_EVAL_ARTIFACT_TTL_MS, + dryRun: false, + }; + + for (let i = 0; i < argv.length; i++) { + const arg = argv[i]; + if (arg === undefined) { + continue; + } + if (arg === "--dry-run") { + options.dryRun = true; + continue; + } + if (arg === "--root") { + options.rootDir = readNextArg(argv, i, "--root"); + i++; + continue; + } + if (arg.startsWith("--root=")) { + options.rootDir = arg.slice("--root=".length); + continue; + } + if (arg === "--older-than") { + options.olderThanMs = parseEvalCleanupDuration( + readNextArg(argv, i, "--older-than"), + ); + i++; + continue; + } + if (arg.startsWith("--older-than=")) { + options.olderThanMs = parseEvalCleanupDuration( + arg.slice("--older-than=".length), + ); + continue; + } + if (arg === "--help" || arg === "-h") { + printHelp(); + process.exit(0); + } + throw new Error(`Unknown argument: ${arg}`); + } + + return options; +} + +function readNextArg(argv: string[], index: number, flag: string): string { + const value = argv[index + 1]; + if (value === undefined || value.startsWith("--")) { + throw new Error(`${flag} requires a value.`); + } + return value; +} + +function printHelp(): void { + console.log(`Usage: npm run eval:cleanup -- [options] + +Options: + --older-than Delete artifacts older than this duration. Default: 7d. + Supported units: ms, s, m, h, d. + --dry-run Print what would be deleted without deleting. + --root Scan this directory instead of EVAL_TEMP_ROOT or OS tmpdir. + -h, --help Show this help. +`); +} + +const currentFile = fileURLToPath(import.meta.url); +if (process.argv[1] === currentFile) { + main(process.argv.slice(2)).catch((err: unknown) => { + console.error(err instanceof Error ? err.message : String(err)); + process.exitCode = 1; + }); +} diff --git a/src/eval/core/runner.ts b/src/eval/core/runner.ts index 623f3b5..b6f9fc6 100644 --- a/src/eval/core/runner.ts +++ b/src/eval/core/runner.ts @@ -40,6 +40,7 @@ import { createTraceRecorder } from "./trace.js"; import { runAssertions } from "./assertions.js"; import { writeEvalTrace } from "./trace-writer.js"; import { runJudge } from "../judge/judge.js"; +import { writeEvalArtifactManifest } from "./temp-cleanup.js"; /** * runEvalCase — 执行单个 eval case @@ -269,7 +270,18 @@ export async function runEvalCase( // 12. 清理 workspace(如果 case 失败且设置了 keepOnFailure,则保留) const shouldKeep = evalCase.workspace?.keepOnFailure === true && status !== "passed"; - if (!shouldKeep) { + if (shouldKeep) { + try { + // 失败调试时保留 workspace 很有用,但必须给跨运行 GC 留下过期信息, + // 否则长期打开 keepOnFailure 后,系统临时目录会无限膨胀。 + await writeEvalArtifactManifest(workspace.root, { + caseId: evalCase.id, + kind: "workspace", + }); + } catch { + // manifest 写入失败不改变 eval 结果;后续 cleanup 仍可用 mtime 兜底。 + } + } else { try { await workspace.cleanup(); } catch { diff --git a/src/eval/core/temp-cleanup.test.ts b/src/eval/core/temp-cleanup.test.ts new file mode 100644 index 0000000..df22e7b --- /dev/null +++ b/src/eval/core/temp-cleanup.test.ts @@ -0,0 +1,143 @@ +/** + * temp-cleanup.test.ts — Eval 临时产物清理器测试 + * + * 这些测试的重点不是“rm 能不能工作”,而是验证清理边界: + * - 只删除白名单 eval 目录 + * - manifest 优先于 mtime + * - trace 只删除旧 trace JSON + * - dry-run 不产生真实删除 + */ + +import { existsSync } from "node:fs"; +import { mkdir, mkdtemp, rm, utimes, writeFile } from "node:fs/promises"; +import { tmpdir } from "node:os"; +import { join } from "node:path"; +import { afterEach, describe, expect, it } from "vitest"; +import { + cleanupEvalArtifacts, + parseEvalCleanupDuration, + writeEvalArtifactManifest, +} from "./temp-cleanup.js"; + +describe("cleanupEvalArtifacts", () => { + const roots: string[] = []; + const baseNow = new Date("2026-06-12T00:00:00.000Z"); + const sevenDays = parseEvalCleanupDuration("7d"); + + afterEach(async () => { + const pending = roots.splice(0); + await Promise.all( + pending.map((root) => rm(root, { recursive: true, force: true })), + ); + }); + + it("removes only old directories with eval whitelist prefixes", async () => { + const root = await createRoot(); + const oldEval = join(root, "eval-old-case"); + const youngEval = join(root, "eval-young-case"); + const unrelated = join(root, "project-cache-old"); + await mkdir(oldEval); + await mkdir(youngEval); + await mkdir(unrelated); + await touchMtime(oldEval, new Date(baseNow.getTime() - sevenDays - 1000)); + await touchMtime(youngEval, baseNow); + await touchMtime(unrelated, new Date(baseNow.getTime() - sevenDays - 1000)); + + const result = await cleanupEvalArtifacts({ + rootDir: root, + olderThanMs: sevenDays, + now: baseNow, + }); + + expect(result.errors).toHaveLength(0); + expect(result.deleted.map((entry) => entry.path)).toContain(oldEval); + expect(existsSync(oldEval)).toBe(false); + expect(existsSync(youngEval)).toBe(true); + expect(existsSync(unrelated)).toBe(true); + }); + + it("uses manifest expiresAt before falling back to mtime", async () => { + const root = await createRoot(); + const expiredHome = join(root, "learn-claude-eval-home-expired"); + const activeHome = join(root, "learn-claude-eval-home-active"); + await mkdir(expiredHome); + await mkdir(activeHome); + await writeEvalArtifactManifest(expiredHome, { + caseId: "expired-case", + kind: "agentHome", + now: new Date(baseNow.getTime() - 2 * sevenDays), + ttlMs: sevenDays, + }); + await writeEvalArtifactManifest(activeHome, { + caseId: "active-case", + kind: "agentHome", + now: baseNow, + ttlMs: sevenDays, + }); + + const result = await cleanupEvalArtifacts({ + rootDir: root, + olderThanMs: sevenDays, + now: baseNow, + }); + + expect(result.errors).toHaveLength(0); + expect(existsSync(expiredHome)).toBe(false); + expect(existsSync(activeHome)).toBe(true); + }); + + it("removes old trace JSON files without touching unrelated trace files", async () => { + const root = await createRoot(); + const traceDir = join(root, "eval-traces"); + const oldTrace = join(traceDir, "case-a.trace.json"); + const youngTrace = join(traceDir, "case-b.trace.json"); + const note = join(traceDir, "notes.txt"); + await mkdir(traceDir); + await writeFile(oldTrace, "{}", "utf-8"); + await writeFile(youngTrace, "{}", "utf-8"); + await writeFile(note, "keep", "utf-8"); + await touchMtime(oldTrace, new Date(baseNow.getTime() - sevenDays - 1000)); + await touchMtime(youngTrace, baseNow); + await touchMtime(note, new Date(baseNow.getTime() - sevenDays - 1000)); + + const result = await cleanupEvalArtifacts({ + rootDir: root, + olderThanMs: sevenDays, + now: baseNow, + }); + + expect(result.errors).toHaveLength(0); + expect(result.deleted.map((entry) => entry.path)).toContain(oldTrace); + expect(existsSync(oldTrace)).toBe(false); + expect(existsSync(youngTrace)).toBe(true); + expect(existsSync(note)).toBe(true); + }); + + it("reports deletions in dry-run mode without removing files", async () => { + const root = await createRoot(); + const oldEval = join(root, "eval-dry-run-case"); + await mkdir(oldEval); + await touchMtime(oldEval, new Date(baseNow.getTime() - sevenDays - 1000)); + + const result = await cleanupEvalArtifacts({ + rootDir: root, + olderThanMs: sevenDays, + now: baseNow, + dryRun: true, + }); + + expect(result.dryRun).toBe(true); + expect(result.deleted.map((entry) => entry.path)).toContain(oldEval); + expect(existsSync(oldEval)).toBe(true); + }); + + async function createRoot(): Promise { + const root = await mkdtemp(join(tmpdir(), "eval-cleanup-test-")); + roots.push(root); + return root; + } + + async function touchMtime(path: string, time: Date): Promise { + await utimes(path, time, time); + } +}); diff --git a/src/eval/core/temp-cleanup.ts b/src/eval/core/temp-cleanup.ts new file mode 100644 index 0000000..333c9d0 --- /dev/null +++ b/src/eval/core/temp-cleanup.ts @@ -0,0 +1,364 @@ +/** + * temp-cleanup.ts — Eval 临时产物清理器 + * + * 职责:清理 eval harness 在系统临时目录中遗留的 workspace、agentHome 和 trace 文件。 + * + * 设计原则: + * - 只清理固定白名单前缀,避免误删用户自己的临时目录 + * - 优先读取 manifest 中的 expiresAt;老版本产物没有 manifest 时再按 mtime 兜底 + * - 支持 dry-run,让本地调试和 CI 接入前可以先观察会删除什么 + * - trace 目录特殊处理:只删除旧的 *.trace.json 文件,不清空未知内容 + */ + +import { + lstat, + mkdir, + readdir, + readFile, + rm, + stat, + writeFile, +} from "node:fs/promises"; +import { tmpdir } from "node:os"; +import { basename, join, resolve } from "node:path"; + +/** Eval 临时产物目录中的 manifest 文件名。 */ +export const EVAL_ARTIFACT_MANIFEST = ".eval-artifact.json"; + +/** 本地默认保留 7 天:足够排查失败,又不会无限膨胀。 */ +export const DEFAULT_EVAL_ARTIFACT_TTL_MS = 7 * 24 * 60 * 60 * 1000; + +/** 允许清理的目录名前缀。 */ +const EVAL_TEMP_DIR_PREFIXES = [ + "eval-", + "learn-claude-eval-home-", + "learn-claude-team-home-", + "eval-full-ws-", + "eval-full-home-", + "eval-replay-test-", +] as const; + +/** Trace 默认目录名。 */ +const EVAL_TRACE_DIR_NAME = "eval-traces"; + +/** 可写入 manifest 的临时产物类型。 */ +export type EvalArtifactKind = "workspace" | "agentHome"; + +/** manifest 记录为什么保留该目录,以及它何时可以被清理。 */ +export interface EvalArtifactManifest { + caseId: string; + kind: EvalArtifactKind; + createdAt: string; + expiresAt: string; + reason: "keepOnFailure"; +} + +export interface WriteEvalArtifactManifestOptions { + caseId: string; + kind: EvalArtifactKind; + now?: Date; + ttlMs?: number; +} + +export interface CleanupEvalArtifactsOptions { + /** 扫描根目录,默认是 OS tmpdir。测试可传入独立目录。 */ + rootDir?: string; + /** 没有 manifest 的旧产物按 mtime 判断,默认 7 天。 */ + olderThanMs?: number; + /** 当前时间注入,便于测试。 */ + now?: Date; + /** 只报告不删除。 */ + dryRun?: boolean; +} + +export interface CleanupEvalArtifactEntry { + path: string; + kind: "directory" | "traceFile"; + reason: string; +} + +export interface CleanupEvalArtifactsResult { + rootDir: string; + dryRun: boolean; + scanned: number; + deleted: CleanupEvalArtifactEntry[]; + kept: CleanupEvalArtifactEntry[]; + errors: Array<{ path: string; message: string }>; +} + +/** + * writeEvalArtifactManifest — 给保留的失败产物写入过期信息。 + * + * 运行内 cleanup 已经能删除正常通过的 case;manifest 只服务于 keepOnFailure + * 或进程异常退出后的跨运行 GC。它刻意使用简单 JSON,方便人手打开查看。 + */ +export async function writeEvalArtifactManifest( + artifactRoot: string, + options: WriteEvalArtifactManifestOptions, +): Promise { + const now = options.now ?? new Date(); + const ttlMs = options.ttlMs ?? DEFAULT_EVAL_ARTIFACT_TTL_MS; + const manifest: EvalArtifactManifest = { + caseId: options.caseId, + kind: options.kind, + createdAt: now.toISOString(), + expiresAt: new Date(now.getTime() + ttlMs).toISOString(), + reason: "keepOnFailure", + }; + + await mkdir(artifactRoot, { recursive: true }); + await writeFile( + join(artifactRoot, EVAL_ARTIFACT_MANIFEST), + JSON.stringify(manifest, null, 2), + "utf-8", + ); +} + +/** + * cleanupEvalArtifacts — 扫描并清理过期 eval 临时产物。 + * + * 安全边界: + * - 不递归扫描任意目录树,只看 rootDir 的直接子项 + * - 目录必须匹配 eval 白名单前缀 + * - symlink 不删除,避免链接逃逸 + * - eval-traces 中只删除旧的 *.trace.json 文件 + */ +export async function cleanupEvalArtifacts( + options: CleanupEvalArtifactsOptions = {}, +): Promise { + const rootDir = resolve(options.rootDir ?? tmpdir()); + const now = options.now ?? new Date(); + const olderThanMs = options.olderThanMs ?? DEFAULT_EVAL_ARTIFACT_TTL_MS; + const dryRun = options.dryRun === true; + const result: CleanupEvalArtifactsResult = { + rootDir, + dryRun, + scanned: 0, + deleted: [], + kept: [], + errors: [], + }; + + let entries; + try { + entries = await readdir(rootDir, { withFileTypes: true }); + } catch (err) { + result.errors.push({ + path: rootDir, + message: err instanceof Error ? err.message : String(err), + }); + return result; + } + + for (const entry of entries) { + const entryPath = join(rootDir, entry.name); + + if (entry.name === EVAL_TRACE_DIR_NAME && entry.isDirectory()) { + await cleanupTraceFiles(entryPath, now, olderThanMs, dryRun, result); + continue; + } + + if (!entry.isDirectory() || !isEvalTempDirectoryName(entry.name)) { + continue; + } + + result.scanned++; + try { + const entryStat = await lstat(entryPath); + if (!entryStat.isDirectory() || entryStat.isSymbolicLink()) { + result.kept.push({ + path: entryPath, + kind: "directory", + reason: "not a real directory", + }); + continue; + } + + const decision = await shouldDeleteDirectory( + entryPath, + entryStat.mtimeMs, + now, + olderThanMs, + ); + if (!decision.delete) { + result.kept.push({ + path: entryPath, + kind: "directory", + reason: decision.reason, + }); + continue; + } + + result.deleted.push({ + path: entryPath, + kind: "directory", + reason: decision.reason, + }); + if (!dryRun) { + await rm(entryPath, { recursive: true, force: true }); + } + } catch (err) { + result.errors.push({ + path: entryPath, + message: err instanceof Error ? err.message : String(err), + }); + } + } + + return result; +} + +/** + * parseEvalCleanupDuration — 解析 CLI 中的 24h / 7d / 30m 等保留时间。 + * + * 只支持少量明确单位,避免把复杂自然语言解析带进教学项目。 + */ +export function parseEvalCleanupDuration(input: string): number { + const match = /^(\d+)(ms|s|m|h|d)$/.exec(input.trim()); + if (!match) { + throw new Error( + `Invalid duration "${input}". Use formats like 24h, 7d, 30m, or 1000ms.`, + ); + } + + const amountText = match[1]; + const unit = match[2]; + if (amountText === undefined || unit === undefined) { + throw new Error(`Invalid duration "${input}".`); + } + + const amount = Number.parseInt(amountText, 10); + const multipliers: Record = { + ms: 1, + s: 1000, + m: 60 * 1000, + h: 60 * 60 * 1000, + d: 24 * 60 * 60 * 1000, + }; + return amount * multipliers[unit]!; +} + +function isEvalTempDirectoryName(name: string): boolean { + return EVAL_TEMP_DIR_PREFIXES.some((prefix) => name.startsWith(prefix)); +} + +async function shouldDeleteDirectory( + directoryPath: string, + mtimeMs: number, + now: Date, + olderThanMs: number, +): Promise<{ delete: boolean; reason: string }> { + const manifest = await readManifest(directoryPath); + if (manifest !== undefined) { + const expiresAtMs = Date.parse(manifest.expiresAt); + if (!Number.isNaN(expiresAtMs)) { + return expiresAtMs <= now.getTime() + ? { delete: true, reason: `manifest expired at ${manifest.expiresAt}` } + : { + delete: false, + reason: `manifest expires at ${manifest.expiresAt}`, + }; + } + } + + const ageMs = now.getTime() - mtimeMs; + return ageMs >= olderThanMs + ? { delete: true, reason: `mtime older than ${olderThanMs}ms` } + : { delete: false, reason: `mtime newer than ${olderThanMs}ms` }; +} + +async function readManifest( + directoryPath: string, +): Promise { + try { + const raw = await readFile( + join(directoryPath, EVAL_ARTIFACT_MANIFEST), + "utf-8", + ); + return JSON.parse(raw) as EvalArtifactManifest; + } catch { + return undefined; + } +} + +async function cleanupTraceFiles( + traceDir: string, + now: Date, + olderThanMs: number, + dryRun: boolean, + result: CleanupEvalArtifactsResult, +): Promise { + let entries; + try { + entries = await readdir(traceDir, { withFileTypes: true }); + } catch (err) { + result.errors.push({ + path: traceDir, + message: err instanceof Error ? err.message : String(err), + }); + return; + } + + for (const entry of entries) { + const filePath = join(traceDir, entry.name); + if (!entry.isFile() || !entry.name.endsWith(".trace.json")) { + continue; + } + + result.scanned++; + try { + const fileStat = await stat(filePath); + const ageMs = now.getTime() - fileStat.mtimeMs; + if (ageMs < olderThanMs) { + result.kept.push({ + path: filePath, + kind: "traceFile", + reason: `mtime newer than ${olderThanMs}ms`, + }); + continue; + } + + result.deleted.push({ + path: filePath, + kind: "traceFile", + reason: `mtime older than ${olderThanMs}ms`, + }); + if (!dryRun) { + await rm(filePath, { force: true }); + } + } catch (err) { + result.errors.push({ + path: filePath, + message: err instanceof Error ? err.message : String(err), + }); + } + } + + await removeEmptyTraceDir(traceDir, dryRun, result); +} + +async function removeEmptyTraceDir( + traceDir: string, + dryRun: boolean, + result: CleanupEvalArtifactsResult, +): Promise { + try { + const remaining = await readdir(traceDir); + if (remaining.length > 0) { + return; + } + result.deleted.push({ + path: traceDir, + kind: "directory", + reason: `${basename(traceDir)} is empty after trace cleanup`, + }); + if (!dryRun) { + await rm(traceDir, { recursive: true, force: true }); + } + } catch (err) { + result.errors.push({ + path: traceDir, + message: err instanceof Error ? err.message : String(err), + }); + } +} diff --git a/src/eval/drivers/learn-claude-code/full-tool-runtime.ts b/src/eval/drivers/learn-claude-code/full-tool-runtime.ts index f87f8d1..02fe157 100644 --- a/src/eval/drivers/learn-claude-code/full-tool-runtime.ts +++ b/src/eval/drivers/learn-claude-code/full-tool-runtime.ts @@ -74,9 +74,11 @@ import { createEvalMcpRuntime, type EvalMcpRuntime, } from "./mcp-runtime.js"; +import { writeEvalArtifactManifest } from "../../core/temp-cleanup.js"; /** Full runtime 创建参数。 */ export interface CreateFullEvalRuntimeOptions { + caseId?: string; workspaceRoot: string; agentHome: string; llm: LLMClient; @@ -415,6 +417,16 @@ export async function createFullEvalRuntime( ); } if (cleanupOptions?.keepAgentHome === true) { + try { + // full-tools 的 agentHome 会保存 Memory、Task、Schedule、Output 等真实状态。 + // keepOnFailure 保留它用于调试,但写入 manifest 后,跨运行 GC 可以按 TTL 收掉。 + await writeEvalArtifactManifest(projectContext.agentHome, { + caseId: options.caseId ?? "unknown", + kind: "agentHome", + }); + } catch { + // manifest 只是 GC 提示;写入失败时仍保留目录,后续清理器按 mtime 兜底。 + } return; } await rm(projectContext.agentHome, { recursive: true, force: true }); diff --git a/src/eval/drivers/learn-claude-code/in-process-driver.ts b/src/eval/drivers/learn-claude-code/in-process-driver.ts index 6cf9f29..325f79a 100644 --- a/src/eval/drivers/learn-claude-code/in-process-driver.ts +++ b/src/eval/drivers/learn-claude-code/in-process-driver.ts @@ -199,6 +199,7 @@ export async function createLearnClaudeCodeInProcessDriver( join(tmpdir(), "learn-claude-eval-home-"), ); const runtimeOptions: Parameters[0] = { + caseId: context.caseId, workspaceRoot: context.workspaceRoot, agentHome, llm, diff --git a/src/eval/drivers/learn-claude-code/team-driver.ts b/src/eval/drivers/learn-claude-code/team-driver.ts index 6af3492..f49a131 100644 --- a/src/eval/drivers/learn-claude-code/team-driver.ts +++ b/src/eval/drivers/learn-claude-code/team-driver.ts @@ -50,6 +50,7 @@ import { type EvalMcpRuntime, } from "./mcp-runtime.js"; import { emitTeamEvent, previewTeamText } from "../../team/team-trace.js"; +import { writeEvalArtifactManifest } from "../../core/temp-cleanup.js"; interface MemberRunReport { agentId: string; @@ -194,9 +195,23 @@ export async function createLearnClaudeCodeTeamDriver( async close(options): Promise { terminal.close(); await mcpRuntime?.cleanup(); - if (agentHome && options?.keepArtifacts !== true) { - await rm(agentHome, { recursive: true, force: true }); + if (!agentHome) { + return; } + if (options?.keepArtifacts === true) { + try { + // Team eval 的成员共享一个临时 agentHome。失败时保留可帮助排查成员状态, + // 但同样写入过期 manifest,避免长期 live eval 后积累过多目录。 + await writeEvalArtifactManifest(agentHome, { + caseId: context?.caseId ?? "unknown", + kind: "agentHome", + }); + } catch { + // manifest 写入失败不影响失败现场保留;GC 仍可按 mtime 兜底。 + } + return; + } + await rm(agentHome, { recursive: true, force: true }); }, }; diff --git a/src/eval/runner.test.ts b/src/eval/runner.test.ts index 2d821d8..0e4d05b 100644 --- a/src/eval/runner.test.ts +++ b/src/eval/runner.test.ts @@ -12,7 +12,8 @@ import { describe, it, expect } from "vitest"; import { existsSync } from "node:fs"; -import { rm } from "node:fs/promises"; +import { readFile, rm } from "node:fs/promises"; +import { join } from "node:path"; import type { LLMClient } from "../llm.js"; import type { CodingAgentDriver } from "./core/driver.js"; import type { @@ -22,6 +23,7 @@ import type { } from "./core/case-schema.js"; import { runEvalCase } from "./core/runner.js"; import { createLearnClaudeCodeInProcessDriver } from "./drivers/learn-claude-code/in-process-driver.js"; +import { EVAL_ARTIFACT_MANIFEST } from "./core/temp-cleanup.js"; // --------------------------------------------------------------------------- // Fake Driver(用于测试 core runner 本身,不依赖当前项目 Agent) @@ -424,6 +426,12 @@ describe("Eval Runner", () => { expect(workspaceRoot).toBeTruthy(); expect(existsSync(agentHome!)).toBe(true); expect(existsSync(workspaceRoot!)).toBe(true); + await expect( + readFile(join(agentHome!, EVAL_ARTIFACT_MANIFEST), "utf-8"), + ).resolves.toContain("full-tools-keep-agent-home-on-failure"); + await expect( + readFile(join(workspaceRoot!, EVAL_ARTIFACT_MANIFEST), "utf-8"), + ).resolves.toContain("full-tools-keep-agent-home-on-failure"); } finally { if (agentHome) { await rm(agentHome, { recursive: true, force: true });