diff --git a/.gitignore b/.gitignore
index f4ba552..3c64fab 100644
--- a/.gitignore
+++ b/.gitignore
@@ -15,3 +15,5 @@ skills/
 web/temp/
 doc/web-tutorial-plan.md
 doc/todo.md
+tmp/
+.DS_Store
diff --git a/tutorial/assets/content.js b/tutorial/assets/content.js
index b7ad8b9..12b07de 100644
--- a/tutorial/assets/content.js
+++ b/tutorial/assets/content.js
@@ -134,8 +134,8 @@ export const chapters = {
   },
   "model-policy": {
     number: "A",
-    title: "不同大模型不是只换模型名",
-    navTitle: "不同大模型不是只换模型名",
+    title: "换个模型, 不只换 baseURL",
+    navTitle: "换个模型, 不只换 baseURL",
     group: "topic",
     file: "./chapters/model-policy.html",
     ready: true,
@@ -151,7 +151,7 @@ export const chapters = {
   reference: {
     number: "—",
     title: "Reference",
-    navTitle: "术语表、Prompt Pack 与验证手册",
+    navTitle: "设计模式",
     group: "reference",
     file: "./chapters/reference.html",
     ready: true,
diff --git a/tutorial/assets/styles.css b/tutorial/assets/styles.css
index 2ee8723..4432843 100644
--- a/tutorial/assets/styles.css
+++ b/tutorial/assets/styles.css
@@ -747,7 +747,8 @@ mark {
 }
 
 .figure {
-  margin: var(--space-6) 0;
+  margin: var(--space-8) auto;
+  max-width: 760px;
 }
 
 .figure figcaption {
@@ -755,6 +756,7 @@ mark {
   color: var(--color-text-faint);
   font-size: var(--text-sm);
   line-height: var(--leading-snug);
+  text-align: center;
 }
 
 .loop-map,
@@ -798,6 +800,146 @@ mark {
 .flow-map {
   display: grid;
   gap: var(--space-3);
+  justify-items: center;
+}
+
+.flow-row {
+  display: flex;
+  flex-wrap: wrap;
+  align-items: center;
+  justify-content: center;
+  gap: var(--space-3);
+  width: 100%;
+}
+
+.flow-arrow {
+  color: var(--color-text-faint);
+  font-size: var(--text-lg);
+}
+
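+/* Centered variant: one row of nodes and arrows. NOTE(review): declares the same properties as .flow-row above — confirm both are needed */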
+.flow-row--center {
+  display: flex;
+  flex-wrap: wrap;
+  align-items: center;
+  justify-content: center;
+  gap: var(--space-3);
+  width: 100%;
+}
+
+/* Centered variant: tree layout (1 parent → N children) */
+.flow-tree {
+  display: flex;
+  flex-direction: column;
+  align-items: center;
+  gap: var(--space-3);
+  width: 100%;
+}
+
+.flow-tree__children {
+  display: flex;
+  flex-wrap: wrap;
+  align-items: flex-start;
+  justify-content: center;
+  gap: var(--space-4);
+  width: 100%;
+}
+
+.flow-tree__branch {
+  display: flex;
+  flex-direction: column;
+  align-items: center;
+  gap: var(--space-2);
+  flex: 1 1 200px;
+  max-width: 280px;
+}
+
+.flow-tree__connector {
+  width: 2px;
+  height: var(--space-4);
+  background: var(--color-border-soft);
+}
+
+/* Centered variant: two-column comparison (old on the left, new on the right) */
+.flow-compare {
+  display: grid;
+  grid-template-columns: 1fr 1fr;
+  gap: var(--space-4);
+  width: 100%;
+}
+
+.flow-compare__col {
+  display: flex;
+  flex-direction: column;
+  gap: var(--space-2);
+  padding: var(--space-3);
+  border: 1px solid var(--color-border-soft);
+  border-radius: var(--radius-md);
+  background: var(--color-bg);
+}
+
+.flow-compare__col--bad {
+  border-color: #e8b4b8;
+  background: #fdf2f3;
+}
+
+.flow-compare__col--good {
+  border-color: var(--color-accent-soft);
+  background: var(--color-accent-bg);
+}
+
+.flow-compare__label {
+  font-weight: 600;
+  font-size: var(--text-sm);
+  text-align: center;
+  color: var(--color-text-faint);
+  text-transform: uppercase;
+  letter-spacing: 0.08em;
+}
+
+/* Centered variant: layer stack (top → bottom, increasingly specific) */
+.flow-stack {
+  display: flex;
+  flex-direction: column;
+  gap: var(--space-3);
+  align-items: center;
+  width: 100%;
+}
+
+.flow-stack__layer {
+  display: flex;
+  flex-direction: column;
+  gap: var(--space-2);
+  padding: var(--space-3) var(--space-4);
+  border: 1px solid var(--color-border-soft);
+  border-radius: var(--radius-md);
+  background: var(--color-bg);
+  width: 100%;
+  max-width: 600px;
+}
+
+.flow-stack__layer--stable {
+  border-color: var(--color-accent-soft);
+  background: var(--color-accent-bg);
+}
+
+.flow-stack__layer--dynamic {
+  border-color: #b8d4e8;
+  background: #f0f6fb;
+}
+
+.flow-stack__label {
+  font-size: var(--text-xs);
+  text-transform: uppercase;
+  letter-spacing: 0.08em;
+  font-weight: 600;
+  color: var(--color-text-faint);
+}
+
+.flow-stack__arrow {
+  color: var(--color-text-faint);
+  font-size: var(--text-xl);
+  text-align: center;
 }
 
 .flow-row {
diff --git a/tutorial/chapters/00-preface.html b/tutorial/chapters/00-preface.html
index 59662cc..ee58314 100644
--- a/tutorial/chapters/00-preface.html
+++ b/tutorial/chapters/00-preface.html
@@ -1,1085 +1,293 @@
 <p class="article__eyebrow">第 00 章 · 在写代码之前</p>
-<h1 class="article__title">用 LLM 写 LLM Agent 的元方法</h1>
+<h1 class="article__title">在写代码之前: 让 LLM 写 LLM Agent 的元方法</h1>
 <p class="article__lede">
-  这一章不写代码。它要回答一个比"如何实现 loop"更前置的问题：
-  <strong
-    >当你打算让 LLM 帮你写一个让 LLM
-    持续工作的系统时，你自己要先想清楚什么？</strong
-  >
-  读完后，你应该能看懂后续所有章节的 Prompt Card 是怎么来的，也能判断 LLM
-  帮你写出来的实现是不是真的"做对了"。
+  这一章不写代码。它要回答一个比"如何实现 loop"更前置的问题:
+  当你打算让 LLM 帮你写一个让 LLM 持续工作的系统时, 你自己要先想清楚什么?
+  读完后, 你应该能看懂后续所有章节的 Prompt Card 是怎么来的,
+  也能判断 LLM 帮你写出来的实现是不是真的"做对了"。
 </p>
-
-<nav id="article-inline-toc" class="article__meta" aria-label="页内小节"></nav>
-
-<hr class="rule" />
-
+<nav aria-label="页内小节" class="article__meta" id="article-inline-toc"></nav>
+<hr class="rule"/>
 <h2 id="meta-question">这门课的真正主题</h2>
 <p>
-  表面上看，这是一本"从零实现 coding agent"的教程。往深一层看，它示范的是
-  <strong>如何与 LLM 协作完成一类特殊的工作：搭建承载 LLM 自身工作的环境</strong
-  >。 这件事有三个特点，让它和普通"让 LLM 写个函数"不同：
+  表面上看, 这是一本"从零实现 coding agent" 的教程。往深一层看,
+  它示范的是<strong>如何与 LLM 协作完成一类特殊的工作:
+  搭建承载 LLM 自身工作的环境</strong>。 这件事有三个特点,
+  让它和普通"让 LLM 写个函数" 不同。
 </p>
 <dl class="defs">
-  <dt>对象是 LLM 本身</dt>
-  <dd>
-    你的最终用户是另一个 LLM。你写的接口、命名、错误信息,都是在教一个
-    模型"如何更好地使用你的代码"。这意味着好名字比好实现更重要。
+<dt>对象是 LLM 本身</dt>
+<dd>
+    你的最终用户是另一个 LLM。 你写的接口、命名、错误信息, 都是在教一个
+    模型"如何更好地使用你的代码"。 这意味着好名字比好实现更重要:
+    一个叫 <code>run_bash</code> 的工具, 比一个叫 <code>execute_command_async_with_callback</code>
+    的工具更可能让模型用对。
   </dd>
-  <dt>现场是长生命周期</dt>
-  <dd>
-    一个 harness 跑几小时甚至几天,中途会有多轮对话、工具调用、异常恢复。
-    你的代码必须能在"现场丢失"后从历史里重新拼出语义。这是普通 Web
-    后端不太需要关心的事。
+<dt>现场是长生命周期</dt>
+<dd>
+    agent 跑起来之后, history 会一直增长, 文件会被一直改,
+    临时状态会一直累积。 你写的每一行代码都要考虑
+    "30 分钟后还在跑" 的场景, 而不是"调用一次就返回" 的普通函数。
   </dd>
-  <dt>失败模式是隐性的</dt>
-  <dd>
-    一次跑通不等于实现正确。LLM 生成的代码可能"看起来对、用起来也跑、
-    但悄悄把上下文弄丢"。你必须靠 trace 而不是肉眼判断它是否真的做对了。
+<dt>反馈是定性的, 不是定量的</dt>
+<dd>
+    "模型调错了工具" 没有 stack trace, 只有一段不像人话的回复;
+    "上下文爆炸" 表现为成本上升 10 倍, 但你看到的还是一段
+    普通的 prompt。 调试这类系统, 不能依赖传统的 error / log / test。
   </dd>
 </dl>
+<h2 id="llm-workflow-pattern">与 LLM 协作的 3 种工作流模式</h2>
 <p>
-  知道这三点,你就不会把这一章当成"普通项目实战",而会理解为什么后面的 Prompt Card
-  要写成"目标/场景/模块/接线/边界/验证"六件套 —— 它是为了
-  强迫你自己在写代码前先把"对象/现场/失败模式"想清楚。
-</p>
-
-<h2 id="meta-method">与 LLM 协作的 4 个动作</h2>
-<p>
-  接下来 16 章的每一章,本质都在重复同一个循环。把它单独拎出来, 你就能识别 LLM
-  在哪个环节最容易骗你。
-</p>
-<ol>
-  <li>
-    <strong>想清楚一现象。</strong>这一章要解决什么具体问题?用一段话写出来,
-    写到"我能给一个非工程师讲明白"为止。
-  </li>
-  <li>
-    <strong>想一个反例。</strong>如果不解决,最朴素的实现会长什么样?为什么它
-    不行?这一步是为了逼自己定义"边界"而不是"功能"。
-  </li>
-  <li>
-    <strong>想清楚接口和不变量。</strong>哪些模块、哪些函数、哪些"绝对不能
-    被破坏"的规则。这一步是为了让 LLM 知道"改哪里是安全的"。
-  </li>
-  <li>
-    <strong>想清楚怎么验证它做对了。</strong>用 fake LLM、用 trace assertion、
-    用 e2e 测试,而不是只靠"它没报错"判断。
-  </li>
-</ol>
-<div class="note">
-  <p class="note__title">这一章的最低目标</p>
-  <p>
-    读完后,你能看懂后续每一章为什么是这四步的循环,也能识别出 LLM
-    在哪一步上容易"看起来做完了"但其实偷懒。
-  </p>
-</div>
-
-<h2 id="terms">术语地图: 把后面反复出现的词钉死</h2>
-<p>
-  下面这些词在后续章节会反复出现。第一次出现时,我会用
-  <code>English</code> (中文释义) 的格式;之后只用英文。请你在自己的笔记里
-  也保持这套对应,不要中途换说法。
+  写一个"让 LLM 持续工作" 的系统, 我们和 LLM 协作的方式有 3 种。
+  这一节先讲清楚 3 种模式各自的适用场景, 再说明本教程为什么选第 3 种。
 </p>
 <dl class="defs">
-  <dt><code>harness</code> (外层运行环境)</dt>
-  <dd>
-    LLM 之外、替它保管现场、执行工具、约束副作用、记录事实的所有代码。
-    本教程的全部代码都属于 harness。
-  </dd>
-
-  <dt><code>agent loop</code> (主循环)</dt>
-  <dd>
-    "用户输入 → 写历史 → 调 LLM → 写回复 → 决定下一步"这条反复执行的路径。 简称
-    <code>loop</code>。本教程主线索就是 loop 一圈一圈地长。
+<dt>模式 1 · 一次性原型</dt>
+<dd>
+    你提需求, LLM 一次性写出 200 行 TypeScript, 跑通, 收工。
+    适合探索性 demo, 不适合长期演进。 坏处: 改第 3 个需求时,
+    LLM 会把前 2 个需求的实现搞乱, 因为它没有持续记忆。
   </dd>
-
-  <dt><code>History</code> (消息列表)</dt>
-  <dd>
-    当前会话的全部 message (user / assistant / tool / system),它的唯一职责
-    是构造下一次 LLM 请求。不要把"日志""审计""可观测"塞进 History,它们 由
-    <code>Transcript</code> 单独承担 (第 15 章会讲)。
-  </dd>
-
-  <dt><code>tool call</code> (工具调用请求)</dt>
-  <dd>
-    LLM 返回的结构化动作请求,而不是文本里的"我想读这个文件"。 对应 LLM 协议里的
-    <code>tool_use</code> / <code>function_call</code> 字段。
-  </dd>
-
-  <dt>
-    <code>Composition Root</code> (组装根, 通常是 <code>src/index.ts</code>)
-  </dt>
-  <dd>
-    创建共享依赖 (<code>history</code> / <code>llm</code> /
-    <code>terminal</code>)
-    并把它们传给各模块的位置。业务分支一旦塞进这里,后续测试和子智能体 都会变难。
-  </dd>
-
-  <dt><code>fake LLM</code> (假模型)</dt>
-  <dd>
-    测试时替代真实 LLM 的对象:能预设返回文本、记录所有收到的 messages、 断言
-    messages 顺序。本教程要求每一章都用 fake LLM 验证,而不是只跑 e2e
-    看最终输出。
-  </dd>
-</dl>
-<p>
-  还有一些更细的术语会在用到时引入 (例如第 02 章的 <code>tool registry</code>、
-  第 06 章的 <code>normalize / block / compress</code>、第 10 章的
-  <code>cache-friendly prefix</code>)。届时同样采用"首次出现钉死,之后只用英文"
-  的规则。
-</p>
-
-<h2 id="anti-self-deception">防自欺: 这门课里 4 句不能信的话</h2>
-<p>
-  vibe coding 最大的陷阱不是写不出代码,而是"看起来写完了"。下面 4 句话
-  在后续章节会反复以各种形式出现,请你把每一条都当成红灯。
-</p>
-<div class="card card--trap">
-  <div class="card__head">
-    <span class="card__tag">红灯 1 · 跑通 ≠ 正确</span>
-  </div>
-  <div class="card__body">
-    <p>
-      <strong>常见说法：</strong>"我跑了一次,模型回了正确文本,说明 loop
-      工作了。"
-    </p>
-    <p>
-      <strong>为什么错：</strong>模型回正确文本,可能是因为这一轮 messages 凑巧
-      包含全部所需信息。一旦轮次增加、上下文被压缩、或者 History 被某个 bug
-      弄丢,问题才会暴露。
-    </p>
-    <p>
-      <strong>正确做法：</strong>用 fake LLM 断言"第二轮 LLM 收到的 messages
-      长度 == N、最后一条 role == user、第一条 role == user"这类结构性事实,
-      而不只是断言最终文本。
-    </p>
-  </div>
-</div>
-
-<div class="card card--trap">
-  <div class="card__head">
-    <span class="card__tag">红灯 2 · 测试通过 ≠ 设计合理</span>
-  </div>
-  <div class="card__body">
-    <p><strong>常见说法：</strong>"vitest 跑过了,这个模块就完成了。"</p>
-    <p>
-      <strong>为什么错：</strong>测试只覆盖了你想到的 case。harness 的真正考验
-      是"多轮 + 异常 + 并发 + 长时间"。这些场景在 happy path 测试里几乎
-      不会触发。
-    </p>
-    <p>
-      <strong>正确做法：</strong>把测试分成两层: 一层是 fake LLM 的"messages
-      顺序断言",另一层是真实 e2e 的"行为断言"。后者要故意包含异常恢复
-      和上下文压缩。
-    </p>
-  </div>
-</div>
-
-<div class="card card--trap">
-  <div class="card__head">
-    <span class="card__tag">红灯 3 · LLM 说"做完了" ≠ 真的做完了</span>
-  </div>
-  <div class="card__body">
-    <p>
-      <strong>常见说法：</strong>"我让 LLM 实现工具系统,LLM 回我'已完成',
-      我去看了下文件都在。"
-    </p>
-    <p>
-      <strong>为什么错：</strong>LLM 倾向于"把请求理解为已经满足"。它会跳过
-      你没显式要求的部分,例如权限检查、错误返回结构、History 写入顺序。
-    </p>
-    <p>
-      <strong>正确做法：</strong>Prompt Card 里"边界"和"验证"两节必须写得 像
-      checklist —— 越具体越好,不要写"注意权限"这种空话,要写 "执行 rm
-      前必须确认路径在白名单内,否则抛 PermissionError"。
-    </p>
-  </div>
-</div>
-
-<div class="card card--trap">
-  <div class="card__head">
-    <span class="card__tag">红灯 4 · Prompt Card 漂亮 ≠ 实现漂亮</span>
-  </div>
-  <div class="card__body">
-    <p><strong>常见说法：</strong>"我卡片写得很完整,实现应该也漂亮。"</p>
-    <p>
-      <strong>为什么错：</strong>Prompt Card 是给 LLM 的"需求文档",但 LLM
-      仍然可能在命名、模块拆分、依赖方向上做出和你想象不同的选择。
-    </p>
-    <p>
-      <strong>正确做法：</strong>LLM 给的实现要先做"逆向核对": 把每个
-      文件的职责读一遍,确认它和卡片"模块/接线/边界"三节里描述的一致。
-      不一致的地方,要么改实现,要么回卡片里把规则说更死。
-    </p>
-  </div>
-</div>
-
-<h2 id="vibe-coding">Vibe Coding 方法论: Prompt Card 怎么交给 LLM</h2>
-<p>
-  6 段 Prompt Card 解决了"如何把需求说死",但还有另一半没解决:
-  <strong
-    >如何把这张卡片交给 LLM,让它真的按卡片交付、而不是用漂亮文本交差</strong
-  >。 这一节是教程的元方法,后续每一章末尾都会有 "本次如何 vibe code" 三件套
-  (拆卡 / review / 迭代) 复述这里的规则。
-</p>
-
-<h3 id="feed-strategy">拆卡: 不要一次性全给</h3>
-<p>
-  6 段 Prompt Card 是一张总图,不是一次 prompt 的全部内容。把 6 段一次性 贴给
-  LLM,会触发两种典型失败:
-</p>
-<dl class="defs">
-  <dt>上下文溢出与失焦</dt>
-  <dd>
-    一次性贴 200 行 prompt,LLM 会在"模块 / 接线 / 边界 / 验证"中轮换注意力,
-    最终通常只在它最后读到的那段做对。
+<dt>模式 2 · 迭代增量 + 测试守门</dt>
+<dd>
+    你提需求, LLM 写代码 + 写测试, 测试通过才合并。
+    适合中型项目。 关键纪律: 你必须<strong>自己会写测试</strong>;
+    自己不会写却让 LLM 代写测试, 等于让 LLM 既当选手又当裁判。
   </dd>
-  <dt>无法回滚</dt>
-  <dd>
-    一次性给完,LLM 一次性写完,你只能整体 reject,无法定位"它哪一段开始走偏"。
-    这正是第 00 章红灯 4 "Prompt Card 漂亮 ≠ 实现漂亮" 的具体发作场景。
+<dt>模式 3 · 大纲先行 + 章节式实现</dt>
+<dd>
+    你先写一份 6 段式 Prompt Card (后面会讲), 钉死抽象边界;
+    LLM 拿到卡片后, 按章节增量实现, 每章都有独立验收。
+    适合教学项目和大型重构。 本教程走的就是这种模式。
   </dd>
 </dl>
 <p>
-  推荐的拆卡顺序是 <strong>接口 → 接线 → 边界 → 验证</strong>,四轮迭代,
-  每轮独立可验证:
-</p>
-<ol>
-  <li>
-    <strong>第 1 轮 · 接口</strong>:只贴"目标 + 场景 + 模块",请 LLM 给出
-    interface 草案 (例如 <code>interface History</code>、
-    <code>interface LLMClient</code>)。 这一轮不写实现,只钉形状。
-  </li>
-  <li>
-    <strong>第 2 轮 · 接线</strong>:贴"模块 + 接线",请 LLM 给出
-    <code>index.ts</code> 的接线代码。注意此时 <code>createHistory</code>
-    等工厂还是 stub,返回任意对象即可。
-  </li>
-  <li>
-    <strong>第 3 轮 · 边界</strong>:贴"边界 (checklist)",请 LLM 按 checklist
-    实现每个工厂的真实逻辑。这一轮是 LLM 偷懒的重灾区, review checklist
-    全部要逐条核对。
-  </li>
-  <li>
-    <strong>第 4 轮 · 验证</strong>:贴"验证 (vitest 断言清单)",请 LLM
-    写测试。优先让它写 fake LLM + messages 顺序断言, 再写 happy path e2e。
-  </li>
-</ol>
-<div class="note">
-  <p class="note__title">为什么不一次给完</p>
-  <p>
-    四轮迭代的核心是"每轮交付都有独立可验证产物": 第 1 轮交付 interface 草案, 你
-    review 命名; 第 2 轮交付接线图, 你 review 依赖方向; 第 3 轮 交付工厂实现, 你
-    review checklist; 第 4 轮交付测试, 你跑测试。任一轮 不通过,
-    都可以单独回退而不必推翻全部。
-  </p>
-</div>
-
-<h3 id="review-checklist">Review: AI 写完一段代码后,看什么</h3>
-<p>
-  LLM 给的实现不是"读一遍"就能判断对错的, 也不是"跑一遍测试"就能判断 对错的
-  (红灯 1)。这一节给出一份通用 review checklist, 适用于每一章、 每一段 LLM
-  生成的代码。
-</p>
-<div class="card card--validation">
-  <div class="card__head">
-    <span class="card__tag">通用 review checklist · V1</span>
-  </div>
-  <div class="card__body">
-    <p><strong>依赖方向 (Composition Root 反向检查):</strong></p>
-    <ul>
-      <li>
-        agent.ts / 子智能体 / tool executor 内不出现
-        <code>new LLMClient</code> / <code>new OpenAI()</code>
-      </li>
-      <li>
-        agent.ts / REPL 内不出现 <code>process.env</code> 读取 (配置走
-        <code>config.ts</code>)
-      </li>
-      <li>
-        不存在 module-level 单例依赖 (例如
-        <code>export const history = createHistory();</code>)
-      </li>
-      <li>工厂函数返回 interface 而不是 class instance, 便于 fake 注入</li>
-    </ul>
-    <p><strong>边界 (Prompt Card checklist 逐条核对):</strong></p>
-    <ul>
-      <li>
-        checklist 里每条"绝对不能"都对应一行 grep 验证 (例如
-        <code>grep -n 'process.env' src/agent.ts</code> 应当 0 行)
-      </li>
-      <li>边界对应到 validation 里的至少一条 vitest 断言</li>
-    </ul>
-    <p><strong>命名 (给 LLM 看的名片):</strong></p>
-    <ul>
-      <li>
-        导出符号命名稳定, 不会因为 LLM 自由发挥就改名 (例如不能
-        <code>createHistory</code> 写到一半变成 <code>createMessageStore</code>)
-      </li>
-      <li>
-        interface 名不带实现细节 (不要 <code>interface HistoryImpl</code>、<code
-          >interface HistoryV2</code
-        >)
-      </li>
-    </ul>
-    <p><strong>副作用 (隐性失败):</strong></p>
-    <ul>
-      <li>
-        <code>getMessages</code> / <code>getEntries</code> /
-        <code>getConfig</code> 之类返回集合的方法都返回浅拷贝或冻结,
-        不是内部引用
-      </li>
-      <li>
-        没有"读外部文件"隐式行为 (例如
-        <code>createAgent</code> 工厂函数内不应当读 <code>package.json</code>)
-      </li>
-      <li>
-        工具执行、文件写入、命令执行前都有明确的 permission 边界 (第 02 / 07
-        章会展开)
-      </li>
-    </ul>
-  </div>
-</div>
-<p>
-  上述 checklist 适用于任何章节。每一章还会给出"本章专属"的几条, 例如 01
-  章的"getMessages 返回浅拷贝"、02 章的"tool call 与 tool result 配对"、07
-  章的"permission 在工具执行前同步插入"。
-</p>
-
-<h3 id="debug-ai-pretend">调试: AI 常常"假装"实现了</h3>
-<p>
-  LLM 写代码时最危险的失败不是"写错", 而是
-  <strong>"看起来对、跑得通、但其实在偷偷绕过"</strong>。
-  这一节给出一份"假装清单",帮你识别三种最常见的伪装。
-</p>
-<div class="card card--trap">
-  <div class="card__head">
-    <span class="card__tag">伪装 1 · 假装实现了 tool call</span>
-  </div>
-  <div class="card__body">
-    <p>
-      <strong>症状:</strong>agent 跑通, 模型能"读文件", 但
-      <code>history.getMessages()</code> 里没有任何
-      <code>role: "tool"</code> 消息。
-    </p>
-    <p>
-      <strong>怎么发生的:</strong>LLM 倾向把 tool call 写成"读取文本中的命令行",
-      在 agent 内部用 <code>exec()</code> 跑, 再把输出塞回
-      <code>user</code> 消息。 它跑得通, 模型也"知道"读到了什么, 但 messages
-      序列错了, 后续 compress / replay 全部坏掉。
-    </p>
-    <p>
-      <strong>怎么验证:</strong>fake LLM 强制返回
-      <code>{ role: "assistant", content: "", tool_calls: [...] }</code>,
-      跑完一轮后断言
-      <code>history.getMessages().some(m =&gt; m.role === "tool")</code>。
-      这是第 02 章 Validation 卡片必含的一条。
-    </p>
-  </div>
-</div>
-
-<div class="card card--trap">
-  <div class="card__head">
-    <span class="card__tag">伪装 2 · 假装做了边界检查</span>
-  </div>
-  <div class="card__body">
-    <p>
-      <strong>症状:</strong>代码里能看到
-      <code>if (!fs.existsSync(...))</code> 之类的 守卫, 但实际跑起来仍然越界。
-    </p>
-    <p>
-      <strong>怎么发生的:</strong>LLM 在错误位置加守卫 (例如权限检查写在工具
-      内部 <code>if</code> 里, 而不是 agent 主循环), 或者只检查了 happy path
-      一个分支。
-    </p>
-    <p>
-      <strong>怎么验证:</strong>Prompt Card 的 checklist 每条都要对应到"反向
-      输入"的测试 (例如 "空字符串 query 不写入 history" 对应
-      <code>run("")</code> 测试)。边界检查必须在调用栈更外层, 这条要写进 边界
-      checklist 里, 不能让 LLM 自由决定位置。
-    </p>
-  </div>
-</div>
-
-<div class="card card--trap">
-  <div class="card__head">
-    <span class="card__tag">伪装 3 · 假装有测试</span>
-  </div>
-  <div class="card__body">
-    <p>
-      <strong>症状:</strong>仓库里有 <code>*.test.ts</code> 文件, vitest 跑过,
-      但实际只测了"返回非 undefined"。
-    </p>
-    <p>
-      <strong>怎么发生的:</strong>LLM 喜欢写 happy path 断言, 不会主动写
-      反向断言或顺序断言。
-    </p>
-    <p>
-      <strong>怎么验证:</strong>每章 Validation 卡片里至少包含一条"顺序断言"
-      (例如 "messages.length === 3") 和一条"反向断言" (例如 "外部 push 不影响
-      history")。红灯 2 "测试通过 ≠ 设计合理" 的具体发作场景。
-    </p>
-  </div>
-</div>
-
-<h3 id="iteration-cadence">迭代节奏: 写多少测多少, 不一次写完</h3>
-<p>
-  vibe coding 最大的诱惑是"让 LLM 一次写完整个模块"。这几乎一定踩坑。 推荐节奏是
-  <strong>小步提交, 每步可测, 不让上下文长到无法 review</strong>。
-</p>
-<ol>
-  <li>
-    <strong>每写完一个工厂, 跑一次该工厂的测试。</strong>
-    不要等所有工厂都写完才跑测试 —— 早期失败定位成本是后期的 1/5。
-  </li>
-  <li>
-    <strong
-      >每跑通一个 Validation 条目, 在 commit message 里引用对应 ID。</strong
-    >
-    例如
-    <code>feat(history): 实现 add/getMessages, 满足 V-01 章 Validation #3</code
-    >。 这样 git log 就是一份"vibe coding 进度表"。
-  </li>
-  <li>
-    <strong
-      >每章结束前, 把"差量表"(本章新增/修改文件)与实际 diff 对一遍。</strong
-    >
-    LLM 经常顺手"优化"你没要求改的代码 (例如把 <code>terminal.ts</code>
-    重命名)。差量对账是发现这种偷偷越界的唯一办法。
-  </li>
-  <li>
-    <strong>每章结束前, 把 fake LLM 跑一次并保留输出。</strong>
-    这一步等价于把"messages 顺序"事实存档, 后续章节会反复引用 (例如 02
-    章会断言"第二轮 messages 末尾是 tool message, 不是 user message")。
-  </li>
-</ol>
-<div class="note">
-  <p class="note__title">红灯 1 / 2 / 3 在迭代节奏里的应用</p>
-  <p>
-    红灯 1 (跑通 ≠ 正确) 的对策: 顺序断言 + 反向断言。红灯 2 (测试通过 ≠
-    设计合理) 的对策: 差量对账 + boundary checklist 逐条 grep。红灯 3 (LLM
-    说做完了 ≠ 做完) 的对策: 拆卡 4 轮迭代 + 每轮独立 review。 后续 01–15 章的
-    "本次如何 vibe code" 三件套, 会把这 3 盏红灯翻译成 本章具体的 3 步操作。
-  </p>
-</div>
-
-<h2 id="prompt-card">Prompt Card 写法: 让 LLM 没法偷懒的 6 段模板</h2>
-<p>
-  后续每一章末尾都会有一张 Prompt Card。这张卡片不是"prompt 润色", 而是"把第 1
-  节到第 4 节的思考压成可交给 LLM 的格式"。它有 6 段, 缺一段都会让 LLM
-  在某个位置偷懒。
-</p>
-<ol>
-  <li>
-    <strong>目标 (Goal)</strong>: 一句话讲清这一章要让 LLM 帮你交付什么。
-    <em>不写"完成 X 模块",写"实现 X, 使得 Y 测试通过"</em>。
-  </li>
-  <li>
-    <strong>场景 (Scene)</strong>: 给出 1 个具体用户请求 + 期望的 agent
-    行为。不要写"一般情况下",写"用户输入'帮我读 agent.ts',agent 应当先 read
-    tool、再把内容拼到回复里"。
-  </li>
-  <li>
-    <strong>模块 (Modules)</strong>: 列出这一章要新建/修改的文件,
-    写明每个文件的职责。名字要稳定,不要让 LLM 自由发挥。
-  </li>
-  <li>
-    <strong>接线 (Wiring)</strong>: 写明 Composition Root 怎么创建共享
-    实例、传给哪些模块。这一段是防止"业务逻辑塞进 index.ts"的关键。
-  </li>
-  <li>
-    <strong>边界 (Boundaries)</strong>: 用 checklist 写"绝对不能做"的事。
-    每条都要可验证。例如"agent.ts 内不要直接 new LLMClient", 而不是"注意架构"。
-  </li>
-  <li>
-    <strong>验证 (Validation)</strong>: 至少 3 条具体测试断言,包括 happy
-    path、异常路径、和 messages 顺序断言。
-  </li>
-</ol>
-<div class="card card--prompt">
-  <div class="card__head">
-    <span class="card__tag">反例对照 · 差 → 改 → 好</span>
-  </div>
-  <div class="card__body">
-    <p><strong>差的卡片 (5 段、但 LLM 会偷懒):</strong></p>
-    <pre class="code-block"><code>目标: 实现 agent loop
-模块: history.ts, llm.ts, agent.ts, index.ts
-验证: 能跑通
-边界: 注意架构
-场景: 用户输入 query, agent 调用 LLM 返回文本</code></pre>
-    <p>
-      <strong>问题：</strong>"能跑通"不是断言;"注意架构"不可验证; "调用
-      LLM"没说从哪里调、messages 怎么拼。
-    </p>
-    <p><strong>改 (5 段、有信息量但 LLM 仍会猜):</strong></p>
-    <pre class="code-block"><code>目标: 实现最小 agent loop, 多轮对话保留上下文
-模块: createHistory, createLlm, createAgent, createRepl, index.ts
-验证: 连续两次 run 后第二次 LLM 收到 messages 包含第一轮
-边界: agent.ts 不读环境变量, 不直接 new LLM client
-场景: 用户先说"我喜欢简洁", 再问"我喜欢什么风格"</code></pre>
-    <p><strong>改进：</strong>有 messages 顺序断言、有"绝对不能"的边界。</p>
-    <p><strong>好 (6 段、LLM 没空间偷懒):</strong></p>
-    <pre
-      class="code-block"
-    ><code>目标: 实现最小 agent loop, 多轮上下文由 History 提供
-场景: 用户依次输入 "我喜欢简洁" 与 "我喜欢什么风格",
-      agent 第二次回复应包含"简洁"
-模块:
-  - src/history.ts: createHistory(), 内部 messages: Message[]
-  - src/llm.ts: createLlm(config), 暴露 chat(messages)
-  - src/agent.ts: createAgent(deps), 暴露 run(query)
-  - src/repl.ts: createRepl(deps), 暴露 start()
-  - src/index.ts: 创建 history/llm/terminal, 传给 agent 和 repl
-接线:
-  index.ts 内只做 new + 传参, 不出现 if 分支
-  history / llm 在 agent 和 repl 间是同一实例
-边界 (LLM 必须遵守):
-  - agent.ts 内不出现 process.env
-  - agent.ts 内不出现 new LLMClient
-  - history.getMessages() 返回浅拷贝
-  - 空 query 不写入 history
-验证:
-  - fake LLM 返回 "收到" 时 agent.run("x") === "收到"
-  - 连续两次 run, fake LLM 第二次收到的 messages.length === 3
-  - 第二次收到的 messages[0].role === "user"
-  - run("") 不增加 history 长度</code></pre>
-    <p>
-      <strong>关键差异：</strong>"边界"是可枚举的 checklist; "验证"每条 都能落到
-      vitest 一行断言; "接线"写明实例是不是同一份 (避免双 factory
-      造成状态分裂,这是 <code>AGENTS.md</code> 里特别强调的)。
-    </p>
-  </div>
-</div>
-<p>
-  后续章节的 Prompt Card 都会按这个 6 段模板写。请你在自己的项目里也照
-  这个模板,不要自由发挥。模板本身就是把"对象/现场/失败模式"翻译成 LLM
-  能照搬的形式。
-</p>
-
-<h2 id="known-and-unknown">你已经知道什么,还不知道什么</h2>
-<p>
-  这门课不会从变量、函数、模块这些编程基础讲起。我们默认你知道 TypeScript
-  项目大概如何组织,也知道可以通过 HTTP API 或 SDK 调用大模型。你可能已经 让某个
-  coding agent 帮你改过代码,甚至已经习惯了让它跑测试、读报错、 继续修。
-</p>
-<p>
-  但这里要补的是另一层知识: 一个 coding agent 到底靠什么把"模型生成的
-  下一步意图"变成"真实工程动作"?为什么它不是简单的
-  <code>await llm.chat(query)</code>?为什么它需要 history、tool call、
-  permission、日志、持久化和 eval?
-</p>
-<p>
-  接下来,我们先建立对"普通 LLM 调用"和"coding agent harness"差别的
-  直观认识,再真正落到 TypeScript 模块 (第 01 章)。
-</p>
-
-<h2 id="study-path">本章怎么学</h2>
-<p>
-  接下来分两步。先做一个朴素的 LLM 调用,看它在哪里断;再回到 agent loop
-  的最小骨架,明确 History 怎么写、messages 怎么拼、assistant 怎么回写。 第 01
-  章会把这个骨架用真实 TypeScript 模块搭起来。
-</p>
-<p>
-  读这一段时,请一直问自己: 如果 LLM 本身是无状态的,外层程序到底要
-  替它保存什么、执行什么、验证什么?这个问题会贯穿后面所有章节。
-</p>
-
-<h2 id="scene">本章场景</h2>
-<p>
-  先想一个很普通的请求: 用户说"帮我记住, 我喜欢简洁直接的解释"。
-  下一轮用户又问:"我刚刚说我喜欢什么风格?"如果只是一次普通 LLM 调用,
-  第二次请求里并没有第一句话,模型其实无从知道答案。
+  模式 3 看起来最慢, 但<strong>返工率最低</strong>。 因为大纲阶段
+  把抽象边界钉死了, 后续 LLM 写出来的代码会自动落到
+  既定的接口里, 不会跑偏。 而模式 1 看起来最快, 但
+  第 3 个需求之后的改造成本, 通常超过模式 3 的 3 倍。
 </p>
+<h2 id="prompt-card-template">6 段 Prompt Card 模板</h2>
 <p>
-  再想一个更像 coding agent 的请求: 用户说"帮我看看项目里主循环是怎么
-  写的"。模型自己没有文件系统,它不能真的打开
-  <code>src/agent.ts</code>。它最多能生成"我想读这个文件"的意图,真正读
-  文件的是外层程序,也就是 harness。
+  这是整套教程的"元方法" — 每章末尾的 Prompt Card 都按这个模板写,
+  你也可以照着这张卡片自己 vibe 出新功能。
 </p>
-<p>
-  这就是我们要手搓 coding agent 的原因: 我们不是训练一个新模型,
-  而是在模型外面搭一套运行环境,让它能记住上下文、提出动作、接收观察
-  结果,并在安全边界内持续工作。
-</p>
-
-<h2 id="naive">先试一个朴素方案</h2>
-<p>
-  有经验的程序员第一反应可能是: 这不就是包一层 LLM API 吗?写成这样 似乎就能跑。
-</p>
-<pre class="code-block"><code>async function ask(query: string) {
-  const response = await llm.chat([
-    { role: "user", content: query },
-  ]);
-
-  return response.content;
+<pre class="code-block"><code>// 教学简化版, 真实模板见后续章节
+Prompt Card = {
+  目标:        "用户问什么, 我们让 LLM 写什么"
+  场景:        "具体用户故事, 一段对话能讲清"
+  模块:        "新增/修改哪些文件, 每个文件单一职责"
+  边界:        "LLM 必须遵守的 checklist (5-7 条)"
+  验证:        "怎么跑 fake LLM 确认实现没坏"
+  Prompt:      "可以直接复制粘贴给 LLM 的整段 prompt"
 }</code></pre>
 <p>
-  这段代码对"翻译一句话""解释一个概念"这类任务确实够用。它的优点是 直观:
-  输入一段文本,调用一次模型,返回一段文本。问题是, coding agent
-  的任务通常不是一次问答,而是一个不断积累现场、决定动作、观察
-  结果、再决定下一步的过程。
+  6 段顺序是<strong>不能换</strong>的。 "目标" 先定方向, "场景" 再具体化,
+  "模块" 才落到代码, "边界" 把不允许的行为钉死, "验证" 闭环,
+  "Prompt" 整段可复用。 任何一段缺失, 都会让 LLM 写出来的东西跑偏。
 </p>
-
-<h2 id="why-naive-fails">朴素方案为什么不够</h2>
-<p>上面的朴素方案至少会在四个地方失败。</p>
+<h3 id="prompt-card-anti-patterns">4 个常见反模式</h3>
 <dl class="defs">
-  <dt>它没有记忆现场</dt>
-  <dd>
-    第二次调用时,第一轮用户说过什么、模型回答过什么都不在请求里。 LLM API
-    本身是无状态的; 如果你不把旧消息重新发过去,它不会自动记得。
+<dt>反模式 1 · 只写 "目标" + "Prompt"</dt>
+<dd>
+    跳过场景 / 模块 / 边界 / 验证。 LLM 写出来的代码, 命名不统一,
+    还会引入和现有模块不一致的接口。 修起来比从零写还慢。
   </dd>
-
-  <dt>它不能行动</dt>
-  <dd>
-    模型无法直接读文件、写文件或执行命令。coding agent 需要工具系统,
-    让模型提出结构化动作请求,再由 harness 执行。
+<dt>反模式 2 · "边界" 写得太抽象</dt>
+<dd>
+    写"代码要清晰" 这种空话。 LLM 不知道什么叫清晰。
+    必须写"工厂模式, 闭包内状态, 不引入 module-level 单例"
+    这种<strong>可验证的</strong>约束。
   </dd>
-
-  <dt>它没有安全边界</dt>
-  <dd>
-    如果模型说"删除这个目录",程序不能无条件照做。工具执行前必须有 permission
-    检查、路径边界和危险命令过滤。
+<dt>反模式 3 · "验证" 写"跑通就行"</dt>
+<dd>
+    LLM 会给你一个"刚好能跑" 的实现, 边界条件全是漏的。
+    必须写"fake LLM 第一轮调 X, 第二轮不应调 X" 这种
+    <strong>显式断言</strong>。
   </dd>
-
-  <dt>它不可验证</dt>
-  <dd>
-    如果只看最终文本,你很难知道模型有没有真的读取文件、有没有按顺序 保留
-    history、有没有绕过权限。后面我们会用 trace 和 eval 记录行为事实。
+<dt>反模式 4 · 把整章 prompt 塞进一张卡片</dt>
+<dd>
+    Prompt Card 是<strong>单元</strong>, 不是"整章描述"。
+    每章通常 3-5 张卡片, 每张卡片对应一个独立可验证的差量。
   </dd>
 </dl>
-
-<h2 id="what-is-agent">Coding Agent 到底是什么</h2>
-<p>在这门课里,我们先把 coding agent 看成一个工程系统,而不是一个神秘模型。</p>
-<div class="note">
-  <p class="note__title">一个实用定义</p>
-  <p>
-    <strong
-      >coding agent = LLM + agent loop + tools + context + permission +
-      persistence + eval。</strong
-    >
-  </p>
-  <p>
-    LLM 负责推理和生成意图; agent loop 负责让它持续工作; tools
-    让它接触文件系统和命令行; context 让它记住现场; permission 约束副作用;
-    persistence 保存长期状态; eval 证明行为不是偶然跑通。
-  </p>
-</div>
-<p>
-  你可以把 LLM 想成"脑",但这个脑没有手、没有文件系统、不会自动保留
-  对话历史,也不知道哪些命令危险。harness 就是给它配上的工作台:
-  记事本、工具箱、安全规则、执行记录和测试仪表盘。
-</p>
-
-<h2 id="loop-flow">回到 Agent Loop</h2>
+<h2 id="narrative-arc">教学叙事的 4 步节奏</h2>
 <p>
-  最小 agent loop 先不打开工具分支,只处理"用户输入 → 模型回答 →
-  进入下一轮"的路径。即使如此,它也已经比普通聊天调用多了两个关键 动作: 写入
-  history,以及从 history 重新构造 messages。
+  每章正文都按同一个节奏写, 你顺着读就会形成预期。
 </p>
-
 <figure class="figure">
-  <div class="flow-map" role="img" aria-label="最小 Agent Loop 流程图">
-    <div class="flow-row">
-      <span class="flow-node flow-node--accent">用户输入 query</span>
-      <span class="flow-arrow">→</span>
-      <span class="flow-node">REPL 接收输入</span>
-      <span class="flow-arrow">→</span>
-      <span class="flow-node">History 写入 user message</span>
-    </div>
-    <div class="flow-row">
-      <span class="flow-node">准备 system + history messages</span>
-      <span class="flow-arrow">→</span>
-      <span class="flow-node flow-node--accent">LLM.chat(messages)</span>
-      <span class="flow-arrow">→</span>
-      <span class="flow-node">得到 assistant message</span>
-    </div>
-    <div class="flow-row">
-      <span class="flow-node">History 写入 assistant</span>
-      <span class="flow-arrow">→</span>
-      <span class="flow-node">没有 tool call: 返回文本</span>
-      <span class="flow-arrow">→</span>
-      <span class="flow-node flow-node--accent">等待下一次输入</span>
-    </div>
-  </div>
-  <figcaption>
-    图 00-1 · 第 00/01 章只实现没有工具的路径; 第 02 章会把"assistant 请求 tool
-    call"这条分支接上。
-  </figcaption>
-</figure>
-
-<h2 id="walkthrough">一次真实运行 walkthrough</h2>
-<p>我们用一个没有工具调用的多轮对话,走一遍最小 loop。</p>
-<ol>
-  <li>用户在 REPL 输入: "记住我喜欢简洁直接的解释。"</li>
-  <li>REPL 不调用模型,只把这行文本交给 <code>agent.run(query)</code>。</li>
-  <li>
-    Agent 把它包装成 <code>{ role: "user", content: "..." }</code>,写入
-    History。
-  </li>
-  <li>
-    Agent 从 History 取出 messages; 如果已经设置 system prompt,也会把 system
-    message 放在最前面。
-  </li>
-  <li>
-    LLM Client 把 messages 发给模型,模型返回 assistant message:
-    "好的,我会尽量简洁直接。"
-  </li>
-  <li>
-    Agent 把 assistant message 写回 History。现在 History 里有一轮 user 和
-    assistant。
-  </li>
-  <li>
-    用户第二次输入: "我刚才说我喜欢什么风格?" Agent 再次把新 user message 写入
-    History。
-  </li>
-  <li>
-    这一次发给 LLM 的 messages 包含上一轮内容,所以模型可以回答:
-    "你说你喜欢简洁直接的解释。"
-  </li>
-</ol>
-<p>
-  注意这里的关键点: 所谓"记住",不是模型在服务器端替你保存了什么, 而是 harness
-  在下一次请求时把必要历史重新组织进 messages。
-</p>
-
-<h2 id="interfaces">关键接口和伪码</h2>
-<p>
-  本章不需要贴完整源码,但要先把接口形状定下来。接口越清楚,后面让 coding agent
-  实现时越不容易把所有逻辑糊进一个文件。
-</p>
-<pre class="code-block"><code>interface History {
-  add(message: Message): void;
-  getMessages(): Message[];
-  clear(): void;
-}
-
-interface LLMClient {
-  chat(messages: Message[]): Promise&lt;AssistantMessage&gt;;
-}
-
-interface Agent {
-  run(query: string): Promise&lt;string&gt;;
-}</code></pre>
-
-<p>最小 loop 的伪码可以先短到这样:</p>
-<pre class="code-block"><code>async function run(query: string) {
-  history.add({ role: "user", content: query });
-
-  const messages = history.getMessages();
-  const assistant = await llm.chat(messages);
-
-  history.add(assistant);
-  return assistant.content;
-}</code></pre>
-<p>
-  这段伪码已经埋下了后续扩展点: 当 <code>assistant</code> 里出现 tool call
-  时,不能直接返回文本,而要执行工具、写入 tool result,再进入下一轮 LLM 调用。
-</p>
-
-<h2 id="source-map">源码地图</h2>
-<p>
-  下面 5 个链接指向 GitHub <code>main</code> 分支的
-  <strong>当前最新</strong>代码,
-  而非历史快照。教程正文里描述的接口/函数名/职责边界基于本仓库的稳定设计,
-  不会随每次提交漂移; 但具体实现会持续演进, 学生读到时可能看到比教程更
-  多的字段和分支 (例如 <code>History.add</code> 已演进为
-  <code>add(message, meta?)</code>)。 第一次阅读时,
-  建议只看"导出符号"和"职责边界", 不要逐行对位。
-</p>
-<div class="source-links" aria-label="源码阅读路线">
-  <a
-    class="source-link"
-    href="https://github.com/pingp76/swoopcode/blob/main/src/index.ts"
-    target="_blank"
-    rel="noreferrer"
-    >1. src/index.ts: 组装根,创建共享实例并接线</a
-  >
-  <a
-    class="source-link"
-    href="https://github.com/pingp76/swoopcode/blob/main/src/repl.ts"
-    target="_blank"
-    rel="noreferrer"
-    >2. src/repl.ts: 把用户输入交给 agent.run()</a
-  >
-  <a
-    class="source-link"
-    href="https://github.com/pingp76/swoopcode/blob/main/src/agent.ts"
-    target="_blank"
-    rel="noreferrer"
-    >3. src/agent.ts: think → act → observe 主循环</a
-  >
-  <a
-    class="source-link"
-    href="https://github.com/pingp76/swoopcode/blob/main/src/history.ts"
-    target="_blank"
-    rel="noreferrer"
-    >4. src/history.ts: 保存 messages 和元信息</a
-  >
-  <a
-    class="source-link"
-    href="https://github.com/pingp76/swoopcode/blob/main/src/llm.ts"
-    target="_blank"
-    rel="noreferrer"
-    >5. src/llm.ts: 把 messages 发给模型并收敛响应</a
-  >
+<div class="flow-map" role="img" aria-label="每章叙事的 4 步节奏">
+<div class="flow-row">
+<span class="flow-node flow-node--accent">1. 痛点 / 现象</span>
+<span class="flow-arrow">→</span>
+<span class="flow-node">真实场景或失败故事<br/><small>为什么这章要存在</small></span>
 </div>
-
-<h2 id="state-boundary">状态与边界</h2>
-<p>最小 loop 虽然简单,但已经有几条边界要分清。</p>
-<dl class="defs">
-  <dt>History 是 prompt working context</dt>
-  <dd>
-    它服务下一次 LLM 请求,会影响模型看到什么。第 00/01 章先只关心 user 和
-    assistant message,后续会加入 tool result、system reminder、 压缩摘要。
-  </dd>
-  <dt>REPL 不是业务逻辑层</dt>
-  <dd>
-    REPL 负责输入输出和退出命令。它不应该知道 messages 怎么拼, 也不应该直接创建
-    LLM client。
-  </dd>
-  <dt>LLM Client 不应该知道终端</dt>
-  <dd>
-    LLM Client 的职责是请求模型并解析响应。它不关心用户从哪里输入,
-    也不关心最后如何打印。
-  </dd>
-  <dt>Composition Root 只接线</dt>
-  <dd>
-    <code>index.ts</code> 创建共享依赖,然后传给各模块。业务分支一旦
-    塞进这里,后续测试和子智能体都会变难。
-  </dd>
-</dl>
-
-<h2 id="design">设计直觉</h2>
-<p>
-  从第一章开始就拆模块,可能看起来有点"架构过度"。但它其实是在为后续 章节省成本:
-  工具系统需要复用同一个 History; 权限管理要在 Agent 执行工具前插入; Eval
-  需要把真实 LLM 换成 fake LLM; 子智能体需要复用 同样的 loop,但换一套隔离
-  History。
-</p>
-<p>
-  如果第一天把所有逻辑写进 <code>index.ts</code>,第二天加工具时还能
-  凑合,第三天加权限就开始痛苦,第六天加压缩和子智能体时基本只能重写。
-  所以教学项目的"可理解性"不是少写文件,而是让每个文件的职责足够稳定。
-</p>
-
-<h2 id="prompt">Prompt Card (本章任务)</h2>
-<div class="card card--prompt">
-  <div class="card__head">
-    <span class="card__tag">Prompt Card · 第 01 章任务</span>
-    <button class="card__copy" type="button" data-copy-card>复制</button>
-  </div>
-  <div class="card__body">
-    <p>
-      <strong>目标:</strong>实现一个最小 TypeScript coding agent loop: REPL
-      接收用户输入,Agent 写入 History,调用 LLM,保存 assistant 回复并返回文本。
-    </p>
-    <p>
-      <strong>场景:</strong>先不实现文件工具、bash、权限、压缩和记忆,
-      只搭出后续章节可以扩展的最小可运行骨架。
-    </p>
-    <p>
-      <strong>模块:</strong>创建
-      <code>config.ts</code>、<code>terminal.ts</code>、
-      <code>history.ts</code>、<code>llm.ts</code>、<code>agent.ts</code>、
-      <code>repl.ts</code>、<code>index.ts</code>。模块使用
-      <code>createXxx()</code> 工厂函数和 interface。
-    </p>
-    <p>
-      <strong>接线:</strong><code>index.ts</code> 创建共享
-      <code>history</code>、<code>llm</code>、<code>terminal</code>,传给
-      <code>createAgent()</code> 和 <code>createRepl()</code>。业务逻辑 不要写在
-      <code>index.ts</code>。
-    </p>
-    <p>
-      <strong>边界 (LLM 必须遵守的 checklist):</strong>
-    </p>
-    <ul>
-      <li>agent.ts 内不出现 <code>process.env</code> 读取</li>
-      <li>agent.ts 内不出现 <code>new LLMClient</code></li>
-      <li>history.getMessages() 返回浅拷贝,不是内部数组</li>
-      <li>空 query 不写入 history</li>
-      <li>这一章不要实现 tool call 分支,只预留循环结构</li>
-    </ul>
-    <p>
-      <strong>验证 (用 fake LLM,逐条落到 vitest 断言):</strong>
-    </p>
-    <ul>
-      <li>fake LLM 返回 "收到" 时, <code>agent.run("x")</code> === "收到"</li>
-      <li>
-        连续两次 <code>run</code> 后, fake LLM 第二次收到的 messages.length ===
-        3
-      </li>
-      <li>第二次收到的 messages[0].role === "user"</li>
-      <li><code>run("")</code> 不增加 history 长度</li>
-      <li><code>exit</code> / <code>quit</code> 能关闭 REPL</li>
-    </ul>
-  </div>
+<div class="flow-row">
+<span class="flow-arrow">↓</span>
 </div>
-
-<h2 id="trap">容易踩坑 (反例梯度)</h2>
-<p>
-  这一节按"新手 / 中级 / 高级"三个梯度,展示三类容易被 LLM 写出、但
-  实际破坏边界的实现。
-</p>
-
-<div class="card card--trap">
-  <div class="card__head">
-    <span class="card__tag">新手错法 · A</span>
-  </div>
-  <div class="card__body">
-    <p><strong>常见错误:</strong>把 coding agent 理解成"一次更长的 prompt"。</p>
-    <p>
-      <strong>为什么错:</strong>prompt 只能描述意图,不能替你保存状态、
-      执行工具、检查权限或记录事实。
-    </p>
-    <p>
-      <strong>正确做法:</strong>把模型放进 loop 里理解: 它每轮只看到 harness
-      准备好的 messages,并通过 harness 才能影响真实世界。
-    </p>
-  </div>
+<div class="flow-row">
+<span class="flow-node flow-node--accent">2. 朴素反例</span>
+<span class="flow-arrow">→</span>
+<span class="flow-node">最直接的做法哪里坏<br/><small>为什么不能这样写</small></span>
+</div>
+<div class="flow-row">
+<span class="flow-arrow">↓</span>
 </div>
-
-<div class="card card--trap">
-  <div class="card__head">
-    <span class="card__tag">中级错法 · B</span>
-  </div>
-  <div class="card__body">
-    <p>
-      <strong>常见错误:</strong>把 <code>history</code> 定义成 module-level 单例
-      (例如 <code>export const history = createHistory();</code>)。
-    </p>
-    <p>
-      <strong>为什么错:</strong>单例看上去省事,但 Composition Root
-      失去了"控制实例生命周期"的能力; 测试时也无法替换; 后续引入子
-      智能体或多窗口 REPL 时,所有调用方会共享同一份 messages,造成 上下文污染。
-    </p>
-    <p>
-      <strong>正确做法:</strong>在 <code>index.ts</code> 用
-      <code>const history = createHistory();</code> 创建一次,再以参数 形式注入
-      agent / repl / 后续的 todo manager。这是 <code>AGENTS.md</code> 里"Shared
-      instances must be literally shared"那条铁律。
-    </p>
-  </div>
+<div class="flow-row">
+<span class="flow-node flow-node--accent">3. 接口与不变量</span>
+<span class="flow-arrow">→</span>
+<span class="flow-node">3 条不变量 + 接口 shape<br/><small>设计核心</small></span>
 </div>
-
-<div class="card card--trap">
-  <div class="card__head">
-    <span class="card__tag">高级错法 · C</span>
-  </div>
-  <div class="card__body">
-    <p>
-      <strong>常见错误:</strong>在 <code>history.getMessages()</code> 里
-      直接返回内部数组 (<code>return this.messages;</code>)。
-    </p>
-    <p>
-      <strong>为什么错:</strong>LLM client 在拼下一轮 messages 时可能
-      <code>push</code> 临时元素 (例如 <code>[...msgs, thisTurnUser]</code>),
-      而这会反向写入 History 内部数组,导致"未调用 add() 但 messages 增长"的隐性
-      bug。后续做 context 压缩和 replay 时,这种 bug 几乎 不可定位。
-    </p>
-    <p>
-      <strong>正确做法:</strong><code>getMessages()</code> 返回
-      <code>[...this.messages]</code> 浅拷贝,或冻结为 readonly array。
-      这一条要在 Validation 卡片里写死,不能省。
-    </p>
-  </div>
+<div class="flow-row">
+<span class="flow-arrow">↓</span>
 </div>
-
-<div class="card card--trap">
-  <div class="card__head">
-    <span class="card__tag">新手错法 · D</span>
-  </div>
-  <div class="card__body">
-    <p>
-      <strong>常见错误:</strong>第一章就把工具、权限、压缩、记忆全部 做进去。
-    </p>
-    <p>
-      <strong>为什么错:</strong>读者还没有理解最小 loop,就会 被后续能力淹没;
-      实现也容易一次性变成不可测试的大函数。
-    </p>
-    <p>
-      <strong>正确做法:</strong>第 00/01 章只建立最小 loop,明确哪些分支
-      留给后续章节。
-    </p>
-  </div>
+<div class="flow-row">
+<span class="flow-node flow-node--accent">4. 实现 + 验证</span>
+<span class="flow-arrow">→</span>
+<span class="flow-node">代码 + fake LLM 测试<br/><small>如何用 1 段对话复现</small></span>
 </div>
-
-<h2 id="validate">如何验证</h2>
-<div class="card card--validation">
-  <div class="card__head">
-    <span class="card__tag">Validation · 第 01 章</span>
-  </div>
-  <div class="card__body">
-    <p>
-      <strong>单轮行为:</strong>fake LLM 返回"收到",调用
-      <code>agent.run("你好")</code> 后应返回"收到"。这证明 query 能 进入
-      loop,assistant 回复能返回给调用方。
-    </p>
-    <p>
-      <strong>History 顺序:</strong>运行后 History 中应先有 user,再有
-      assistant。这证明上下文不是只打印出来,而是真的被保存。
-    </p>
-    <p>
-      <strong>多轮上下文:</strong>连续调用两次 <code>agent.run()</code>, 第二次
-      fake LLM 观察到的 messages 必须包含第一轮内容。这证明 LLM 的"记忆"来自
-      harness 重新发送 messages。
-    </p>
-    <p>
-      <strong>REPL 边界:</strong>空输入不调用 agent, <code>exit</code> 或
-      <code>quit</code> 关闭 terminal。这证明交互层没有把无意义输入 写进
-      History。
-    </p>
-  </div>
 </div>
-
-<h2 id="debug">如果实现失败,先查哪里</h2>
-<ol>
-  <li>
-    看 REPL 是否真的调用了 <code>agent.run(query)</code>,而不是直接调用 LLM。
-  </li>
-  <li>看 <code>agent.run()</code> 是否先写入 user message,再调用 LLM。</li>
-  <li>
-    看 <code>history.getMessages()</code> 是否返回浅拷贝,且包含上一轮 消息。
-  </li>
-  <li>看 fake LLM 测试是否能观察到传入的 messages,而不是只断言最终文本。</li>
-  <li>看 <code>index.ts</code> 是否只做接线,没有塞进业务分支。</li>
-</ol>
-
-<h2 id="practice">本章练习</h2>
+<figcaption>图 00-1 · 每章 4 步节奏。 4 步缺一不可。</figcaption>
+</figure>
 <p>
-  不看 Prompt Card,试着自己写一段 prompt,让 coding agent 只实现最小
-  loop。写完后检查它有没有包含 6 段 (目标/场景/模块/接线/边界/验证),
-  以及边界一节是否每条都"可逐字翻译成 vitest 断言"。
-</p>
-
+  4 步节奏的好处: 学生读到第 1 步就知道"这章在解决什么",
+  读到第 2 步就理解"为什么这个解法不是显然的",
+  读到第 3 步拿到"可背下来的设计要点",
+  读到第 4 步可以"自己照着 vibe 一份"。
+</p>
+<h2 id="chapter-map">章节地图</h2>
+<p>
+  16 个章节, 2 个专题, 1 个 Reference。 主题按"骨架 → 能力 → 控制 → 长期"
+  的顺序排:
+</p>
+<table class="terms">
+<thead>
+<tr>
+<th>编号</th>
+<th>主题</th>
+<th>核心问题</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>00</td>
+<td>前言 (本章节)</td>
+<td>怎么与 LLM 协作, 怎么写 Prompt Card</td>
+</tr>
+<tr>
+<td>01</td>
+<td>最小 Agent Loop</td>
+<td>把"持续对话" 变成可测试的模块</td>
+</tr>
+<tr>
+<td>02</td>
+<td>工具调用</td>
+<td>给 Agent 一双手</td>
+</tr>
+<tr>
+<td>03</td>
+<td>TODO Manager</td>
+<td>多轮执行怎么有节奏</td>
+</tr>
+<tr>
+<td>04</td>
+<td>SubAgent</td>
+<td>让 Agent 学会分身</td>
+</tr>
+<tr>
+<td>05</td>
+<td>Skill</td>
+<td>工具数到 30+ 怎么办</td>
+</tr>
+<tr>
+<td>06</td>
+<td>压缩</td>
+<td>context 撑爆怎么办</td>
+</tr>
+<tr>
+<td>07</td>
+<td>Permission</td>
+<td>工具调用要不要先问人</td>
+</tr>
+<tr>
+<td>08</td>
+<td>Hook</td>
+<td>在 loop 周围挂钩子</td>
+</tr>
+<tr>
+<td>09</td>
+<td>Memory</td>
+<td>跨会话记忆怎么持久化</td>
+</tr>
+<tr>
+<td>10</td>
+<td>Cache</td>
+<td>Prompt Cache 怎么用</td>
+</tr>
+<tr>
+<td>11</td>
+<td>Recovery</td>
+<td>LLM 报错时不要崩</td>
+</tr>
+<tr>
+<td>12</td>
+<td>Task</td>
+<td>长期计划怎么落盘</td>
+</tr>
+<tr>
+<td>13</td>
+<td>Async Run</td>
+<td>怎么不阻塞主循环</td>
+</tr>
+<tr>
+<td>14</td>
+<td>Schedule</td>
+<td>怎么让时间触发 Agent</td>
+</tr>
+<tr>
+<td>15</td>
+<td>Hardening</td>
+<td>长期运行不把系统跑坏</td>
+</tr>
+<tr>
+<td>专题 A</td>
+<td>模型差异</td>
+<td>换模型不只是换 baseURL</td>
+</tr>
+<tr>
+<td>专题 B</td>
+<td>测试不确定系统</td>
+<td>怎么测一个会波动的 Agent</td>
+</tr>
+<tr>
+<td>Reference</td>
+<td>查阅页</td>
+<td>术语表 / Prompt Pack / 验证手册</td>
+</tr>
+</tbody>
+</table>
 <h2 id="summary">本章小结</h2>
 <p>
-  本章先讲了"用 LLM 写 LLM agent"的元方法 (想清楚现象/反例/接口/验证), 钉死了
-  harness/loop/History/tool call/Composition Root/fake LLM 这 6 个术语,点了 4
-  句"不能信的话",给出了一份可复用的 6 段 Prompt Card 模板。然后才把 coding agent
-  从"普通 LLM 调用"里拆出来: 模型只是 loop 中的一环,真正让它能持续工作的,是
-  History、Agent、REPL、LLM Client 和 Composition Root 组成的 harness。
-</p>
-
-<h2 id="next">下一章伏笔</h2>
-<p>
-  下一章会真正落到 TypeScript 模块: 怎么写 <code>createHistory()</code>、
-  <code>createAgent()</code> 和 <code>createRepl()</code>,怎么用 fake LLM
-  写"messages 顺序断言",怎么为第 02 章的工具调用预留分支。读完后,
-  你应该能自己完成本章的 Prompt Card 任务。
-</p>
-
-<div class="ack">
-  <p>
-    致谢:本教程受到
-    <a
-      href="https://github.com/shareAI-lab/learn-claude-code"
-      target="_blank"
-      rel="noreferrer"
-      >shareAI-lab/learn-claude-code</a
-    >
-    启发。感谢原项目把 coding agent harness 的核心思想讲得足够清楚, 让这个
-    TypeScript 教学版本有了起点。
-  </p>
-</div>
+  这一章不写代码, 但讲了四件比代码更重要的事:
+</p>
+<ul>
+<li>
+<strong>对象是 LLM</strong>: 写接口就是教模型, 好名字比好实现重要。
+    </li>
+<li>
+<strong>3 种协作模式</strong>: 一次性原型 / 迭代增量 / 大纲先行。
+    教学项目走模式 3, 返工率最低。
+    </li>
+<li>
+<strong>6 段 Prompt Card</strong>: 目标 / 场景 / 模块 / 边界 / 验证 / Prompt,
+    缺一段就翻车。
+    </li>
+<li>
+<strong>4 步叙事节奏</strong>: 痛点 → 朴素反例 → 接口不变量 → 实现验证,
+    4 步缺一不可。
+    </li>
+</ul>
+<p>
+  下一章开始落 TypeScript, 把第 01 章的最小 Agent Loop 写出来。
+</p>
+</content>
\ No newline at end of file
diff --git a/tutorial/chapters/01-agent-loop.html b/tutorial/chapters/01-agent-loop.html
index 93aa114..403e2d4 100644
--- a/tutorial/chapters/01-agent-loop.html
+++ b/tutorial/chapters/01-agent-loop.html
@@ -1,768 +1,786 @@
 <p class="article__eyebrow">第 01 章 · 真正落 TypeScript</p>
 <h1 class="article__title">最小 Agent Loop: 从伪码到可测试模块</h1>
 <p class="article__lede">
-  第 00 章讲了"为什么需要 harness",并给出了一份 6 段 Prompt Card 模板。
-  这一章按那张卡片把最小 agent loop 真正写成 TypeScript 模块: History、 LLM
-  Client、Agent、REPL、Composition Root 五件套,加上 fake LLM 的 messages
-  顺序断言。读完后,你能跑通一份"两轮对话保留上下文"的最小 harness,
-  并能用测试验证它不是偶然跑通。
+  第 00 章讲了"为什么需要 harness", 并给出了一份 6 段 Prompt Card 模板。
+  这一章按那张卡片把最小 agent loop 真正写成 TypeScript 模块:
+  History、 LLM Client、 Agent、 REPL、 Composition Root 五件套,
+  加上 fake LLM 的 messages 顺序断言。 读完后, 你能跑通一份
+  "两轮对话保留上下文" 的最小 harness, 并能用测试验证它不是偶然跑通。
 </p>
-
-<nav id="article-inline-toc" class="article__meta" aria-label="页内小节"></nav>
-
-<hr class="rule" />
-
-<h2 id="delta-from-00">在第 00 章基础上改了什么</h2>
+<nav aria-label="页内小节" class="article__meta" id="article-inline-toc"></nav>
+<hr class="rule"/>
+<h2 id="why-loop">为什么 agent 实际是个 loop</h2>
+<p>
+  在写代码之前, 先把"agent loop" 这个抽象放下, 用一个真实问题看
+  为什么它必须存在。
+  </p>
+<p>
+  假设你做一个"能聊天的程序", 用户输入 "Hello", 程序输出 "Hi!" —
+  30 行能搞定。 但用户接着输入 "Do you remember?" 时, 朴素实现
+  就崩了: 程序不知道上一轮说过 "Hello", 也不知道上一轮回过 "Hi!",
+  它只看到这一句 query, 无法回答"remember what"。
+  </p>
+<p>
+  朴素想法 1: "在 <code>globalChatHistory</code> 里存所有消息。"
+  能解决 2 轮对话, 但跑 10 轮、100 轮后, 每次调 LLM 都要把全部
+  历史塞进去, 费用涨 10 倍、延迟涨 10 倍。
+  </p>
+<p>
+  朴素想法 2: "只保留最近 5 轮对话。"
+  费用降了, 但"我想调出 30 轮前的 tool 调用" 答不上来。 信息
+  永远在丢。
+  </p>
 <p>
-  在动手前,先明确这一章的"差量"。它是对第 00 章 Prompt Card 的兑现,
-  不是"另一份新设计"。下面这张表是这一章要在仓库里新增的文件、每个文件的
-  单一职责,以及它和第 00 章伪码的对应。
+  朴素想法 3: "干脆不用 LLM, 写死状态机?"
+  那不是 agent, 是 chatbot。 失去 LLM 的灵活性, 一切问题都要
+  工程师手写。
+  </p>
+<p>
+  正确做法: <strong>把"对话"建模成 loop</strong> —
+  <code>while (LLM 还要调工具) { 调 LLM; 调工具; 把结果喂回去 }</code>。
+  history 是"持续累积的状态", loop 是"持续重读的放大镜"。 每一轮
+  LLM 都看到完整 history, 但只决定"下一步做什么", 上下文由
+  harness 管理。
+  </p>
+<p>
+  这个 loop 不是教学简化 — 它是任何 agent (AutoGPT / Claude Code /
+  Cursor Agent) 的<strong>最小骨架</strong>。 后面 14 章所有功能
+  (工具 / 权限 / 压缩 / 子 agent) 都是这个 loop 的扩展, 不
+  替换它。
+  </p>
+<h2 id="what-is-loop">Agent Loop 是什么 — 不是什么</h2>
+<p>
+  在讲代码之前, 把"agent loop" 这个词锁紧, 避免你读到后面
+  还在想"我到底在写什么"。
+  </p>
+<p>
+  <strong>Agent loop 是</strong>:
 </p>
-<div class="source-links" aria-label="本章改动清单">
-  <a class="source-link" href="#delta-files">新增 7 个文件,每个文件一个职责</a>
-  <a class="source-link" href="#delta-loop"
-    >loop 路径与第 00 章图 00-1 一一对应</a
-  >
-  <a class="source-link" href="#delta-test"
-    >test 用 fake LLM 验证 messages 顺序</a
-  >
-</div>
+<ol>
+<li>把用户输入写入 history (对话历史)</li>
+<li>把 history 转成 messages, 调一次 LLM, 把回复写回 history</li>
+<li>看 LLM 是否要调工具: 要就执行, 不要就返回文本</li>
+</ol>
 <p>
-  之后所有章节 (02–15) 都会按这张差量表的格式开头: "在第 N 章基础上改了什么"。
-  这是一个硬规矩,目的是让学生能用 diff 而不是用记忆去跟读。
+  <strong>Agent loop 不是</strong>:
 </p>
-
-<h2 id="author-thinking">作者怎么想的: 这一章的思考链</h2>
+<ul>
+<li>"循环调 LLM 直到用户满意" — 没终止条件, 会无限循环</li>
+<li>"AI 自动完成任务" — agent loop 只是骨架, 任务完成靠工具执行</li>
+<li>"智能对话" — "智能" 是 LLM 的事, loop 只是"持续放大镜"</li>
+</ul>
+<p>
+  任何一上来就讲"完整 agent" 的教程, 都在偷跑前 3 章的债。
+  你不知道 harness 哪部分是核心、哪部分是装饰, 改的时候不知道
+  改哪里。 这一章写的 loop 是"骨架", 后面 14 章都是"在骨架上
+  挂器官"。
+  </p>
+<h2 id="real-failure">真实失败故事: 一个 50 行脚本的第 11 天</h2>
+<p>
+  假设你按"最小能跑" 的思路写了一个 50 行的 chat 函数 (反例在下一节),
+  前 3 天跑得挺好。 第 11 天, 你想加一个"上下文超长就压缩" 的功能,
+  发现做不到:
+  </p>
+<ol>
+<li>
+<strong>history 是 module-level 单例, 多 agent 共享</strong>:
+    你想加 subagent 让长任务委托出去, 结果父子 history 串了。
+    </li>
+<li>
+<strong>没有 fake LLM 入口</strong>: 想加"基于 mock 验证 messages
+    顺序" 的测试, 但 openai SDK 是直接调的, mock 要侵入单例
+    全局变量, 写不出来。
+    </li>
+<li>
+<strong>tool 调度写死在函数里</strong>: 第 02 章要给 loop 加工具,
+    结果发现需要改 50 行里的 20 行, 因为 if/else 全在主函数里。
+    </li>
+</ol>
 <p>
-  按第 00 章 6 段 Prompt Card 模板倒推,写这一章 Prompt Card 之前,作者脑子里
-  经历了下面 4 步。第 00 章第 1 节已经讲过这 4 步的抽象,这里给具体例子。
+  这 3 个坑都是"骨架没分清" 造成的。 这一章写的 5 件套, 就是为了
+  让"加新功能" 只需要挂新模块, 不改骨架。
+  </p>
+<h2 id="naive">朴素反例: 一个文件 50 行的 loop</h2>
+<p>
+  写 agent 最朴素的做法: 一个文件, 一个函数, 全局变量存 history,
+  直接调 OpenAI SDK。 50 行能跑通, 但跑 10 分钟后就崩。
+  </p>
+<pre class="code-block"><code>// ❌ 反例: 全局 history + 直接调 SDK + 50 行
+let history: Message[] = [];
+
+export async function chat(userInput: string): Promise&lt;string&gt; {
+  history.push({ role: "user", content: userInput });
+  const completion = await openai.chat.completions.create({
+    model: "gpt-4",
+    messages: history,
+  });
+  const reply = completion.choices[0].message.content ?? "";
+  history.push({ role: "assistant", content: reply });
+  return reply;
+}</code></pre>
+<p class="source-link"><a href="https://github.com/pingp76/swoopcode/blob/main/src/agent.ts#L1" rel="noreferrer" target="_blank">GitHub · src/agent.ts 真实 agent loop 实现 (L1)</a></p>
+<p>
+  3 件事立刻坏掉:
 </p>
+<ol>
+<li>
+<strong>测试写不出</strong>: <code>openai.chat.completions.create</code> 是真实网络调用,
+    单测要 mock SDK, 断言 messages 顺序还要侵入全局变量。
+    实际写测试时, 你会发现自己要么 monkey-patch 全局 openai 客户端
+    (脏), 要么用 jest.mock (侵入性大)。
+  </li>
+<li>
+<strong>换模型写不出</strong>: 换 Claude / Gemini 时, <code>chat.completions.create</code>
+    的字段名不一样, 整个函数重写。 你想支持多模型路由, 只能复制粘贴
+    50 行 × 3, 维护噩梦。
+  </li>
+<li>
+<strong>多 agent 共享会污染</strong>: 子智能体拿到的 history 和父 agent 一样,
+    跨上下文泄露。 你想做 subagent, 但子 agent 看到的所有工具调用
+    都在 history 里, 它误以为是自己的。
+  </li>
+</ol>
+<p>
+  这 3 个问题不是"50 行写得不够好", 是"骨架设计错了"。 下面
+  5 件套就是解药。
+  </p>
+<h2 id="three-invariants">3 条不变量</h2>
+<p>
+  把反例的 3 个坏处倒过来, 就是这一章的 3 条不变量。 不变量是"写代码时
+  心里时刻记着的 3 件事", 不是"代码里写 3 条注释"。
+  </p>
 <dl class="defs">
-  <dt>想清楚现象</dt>
-  <dd>
-    现象是"两次 run 之后,模型还能引用第一轮的偏好"。这个现象的最小复现 是 REPL
-    里连续输入两句"我喜欢简洁"和"我喜欢什么风格"。如果实现错了,
-    第二句会得到"我不知道你喜欢什么"。
+<dt>不变量 1 · History 在闭包内, 不在 module-level</dt>
+<dd>
+    让 <code>history</code> 由 <code>createHistory()</code> 工厂创建并注入 <code>createAgent()</code>, 状态存在闭包里。
+    每个 agent 实例有独立 history, 父子隔离自然成立。
+    这是"工厂模式" 在 agent 里的具体应用 — 后面 04 章 SubAgent 会
+    再用一次。
   </dd>
-  <dt>想反例</dt>
-  <dd>
-    最朴素的反例是第 00 章的 <code>ask(query)</code>: 它只把当前 query 发给
-    LLM,没有 History。这一章要"治"的就是这种"每次都是新会话"的问题。
+<dt>不变量 2 · LLM 是窄接口, 不直接调 SDK</dt>
+<dd>
+    定义 <code>LLMClient.chat({ messages, tools })</code> 接口,
+    默认实现调 OpenAI SDK, 测试用 fake。
+    换模型 = 换 LLMClient 实现, 业务代码不动。
+    这是"Adapter 模式" 的具体应用 — 后面专题 A 讲 Anthropic / Google
+    时会扩展。
   </dd>
-  <dt>想接口和不变量</dt>
-  <dd>
-    不变量有三: (1) History 是 messages 的唯一来源, (2) getMessages()
-    返回浅拷贝,(3) Composition Root 内同一份 history 实例被 agent 和 repl
-    共享。接口要暴露 <code>add</code> / <code>getMessages</code> /
-    <code>clear</code> 三个动作,不要暴露内部数组。
-  </dd>
-  <dt>想怎么验证</dt>
-  <dd>
-    验证用 fake LLM,断言"第二轮 LLM 收到的 messages 数量 == 3、第一条 role ==
-    user、最后一条 role == user"。这条断言比"最终文本对"强,
-    因为它要求"消息真的被保存并被重新发送"。
+<dt>不变量 3 · Composition Root 唯一组装依赖</dt>
+<dd>
+    所有"用哪个 LLM / 哪个 history / 哪个 logger" 都在
+    <code>index.ts</code> 的 <code>main()</code> 里一次性创建, 通过
+    依赖注入传给 agent。 模块内部不 import 其他模块的实现。
+    这是"控制反转 + 依赖注入" — 让业务模块纯净, 0 改动换测试。
   </dd>
 </dl>
-
-<h2 id="observe-first">先观察: 三段故意有气味的代码</h2>
+<h2 id="why-factory">为什么用工厂模式 (而不是单例)</h2>
 <p>
-  在写正确实现之前,先看三段常见但"有气味"的最小实现,回答"它为什么
-  不行"。这一节不是练习,只是训练"看出哪里不对"的眼睛。
-</p>
-
-<div class="note">
-  <p class="note__title">观察 1 · 全局单例 History</p>
-  <pre class="code-block"><code>// history.ts
-export const history = createHistory();
-
-// agent.ts
-import { history } from "./history.js";
-export async function run(query: string) {
-  history.add({ role: "user", content: query });
-  const msgs = history.getMessages();
-  return (await llm.chat(msgs)).content;
-}</code></pre>
-  <p>
-    <strong>问:</strong>看上去很简洁。为什么 Composition Root 团队普遍禁止
-    module-level 单例?
+  "工厂模式" 在面向对象里是老生常谈, 但在 TypeScript 写 agent 时
+  有特别的意义, 必须单独讲一遍。
   </p>
-  <p>
-    <strong>答:</strong>单例失去三件事 —— 测试时无法替换; 子智能体或 多窗口 REPL
-    无法隔离; Composition Root 失去了"实例从哪里来"的可观测 性。第 00
-    章的"中级错法 B"已经讲过这一点。
+<p>
+  朴素想法: "单例不就行了吗, agent 又不创建多个。"
+  错。 至少 3 个场景会创建多个 agent 实例:
   </p>
-</div>
-
-<div class="note">
-  <p class="note__title">观察 2 · getMessages 返回内部引用</p>
-  <pre class="code-block"><code>// history.ts (内部实现)
-getMessages(): Message[] {
-  return this.messages;
-}</code></pre>
-  <p><strong>问:</strong>返回引用比浅拷贝快,为什么不写?</p>
-  <p>
-    <strong>答:</strong>LLM client 拼下一轮 messages 时常见写法是
-    <code>[...msgs, thisTurnUser]</code>。如果 msgs 是内部数组,这个 spread
-    会让外部 push 写回 history,出现"我没调 add() 但 messages 变长"的隐性
-    bug。这正是第 00 章的"高级错法 C"。
+<ol>
+<li>
+<strong>REPL + SubAgent</strong>: REPL 里有一个主 agent, 用户说
+    "再开一个窗口做这个任务", 主 agent 调 <code>run_subagent</code>
+    创建子 agent。 父子要独立 history。
+  </li>
+<li>
+<strong>测试隔离</strong>: 每个测试 <code>it("...")</code> 需要一个
+    干净的 agent 实例, 不能被前一个测试污染。
+  </li>
+<li>
+<strong>多 session</strong>: 未来 harness 可能支持"同时跑 2 个
+    agent, 一个做 A 一个做 B", 各自独立 history。
+  </li>
+</ol>
+<p>
+  单例写法短 1 行, 但加新场景时改 5 处; 工厂写法长 3 行, 但
+  加新场景时改 0 处。 这就是工厂模式在 agent harness 里的"复利"。
   </p>
-</div>
-
-<div class="note">
-  <p class="note__title">观察 3 · 把 process.env 塞进 agent.ts</p>
-  <pre class="code-block"><code>// agent.ts
-const apiKey = process.env.ANTHROPIC_API_KEY;
-const llm = createLlm({ apiKey });</code></pre>
-  <p><strong>问:</strong>看上去"配置就近使用"很自然,为什么要搬到 config.ts?</p>
-  <p>
-    <strong>答:</strong>agent.ts 一旦读环境变量,测试就必须 mock 整个
-    process.env; 不同部署环境 (本地 / CI / 沙箱) 切换配置就要改业务文件;
-    子智能体想用同一份 agent 但不同 LLM provider 时也无从换起。 config.ts 是
-    Composition Root 的"配置版本",不是 agent.ts 的私货。
+<h2 id="five-pieces">五件套: 用处 → 场景 → 设计 → 实现</h2>
+<p>
+  最小 agent loop 拆成 5 个模块, 每个模块单一职责。 拆模块的边界
+  原则: <strong>"如果这一块要换实现, 改的代码越少越好"</strong>。
+  </p>
+<table class="terms">
+<thead>
+<tr>
+<th>模块</th>
+<th>职责</th>
+<th>依赖</th>
+<th>换实现时改哪里</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><code>history.ts</code></td>
+<td>管理对话消息序列</td>
+<td>无</td>
+<td>替换存储后端 (内存 / 文件 / Redis)</td>
+</tr>
+<tr>
+<td><code>llm.ts</code></td>
+<td>LLMClient 接口 + 默认 OpenAI 实现</td>
+<td>SDK (可选)</td>
+<td>换模型 / 换 SDK</td>
+</tr>
+<tr>
+<td><code>agent.ts</code></td>
+<td>agent.run() 6 步骨架</td>
+<td>history, llm</td>
+<td>几乎不改, 业务逻辑入口</td>
+</tr>
+<tr>
+<td><code>repl.ts</code></td>
+<td>readline 循环 + 命令分发</td>
+<td>agent, terminal</td>
+<td>换交互方式 (CLI / GUI / Web)</td>
+</tr>
+<tr>
+<td><code>index.ts</code></td>
+<td>Composition Root, 唯一组装点</td>
+<td>所有上述</td>
+<td>换测试 / 换部署, 改这一个文件</td>
+</tr>
+</tbody>
+</table>
+<p>
+  下面逐个讲。 每个模块按"用途 → 真实场景 → 设计思想 → 实现细节"
+  展开。
+  </p>
+<h3>History · 对话历史</h3>
+<p>
+  <strong>用途</strong>: 持久化"LLM 当前看到的所有消息"。 它是 loop 的
+  "事实状态", loop 每次迭代都从 history 读 messages 喂给 LLM,
+  把 LLM 回复和工具结果追加回 history。
   </p>
-</div>
-
-<h2 id="delta-files">新增文件清单 (差量表)</h2>
 <p>
-  这一章会新建 7 个文件。每个文件的第一行 import / export 决定了它的 职责边界;
-  不属于这个文件职责的代码,不要塞进来。
+  <strong>真实场景</strong>: 用户说 "刚才我让你改了什么?", LLM 答
+  "抱歉我不记得"。 原因就是 history 丢了。 一个 agent 跑 10 轮后
+  history 必须完整保留, 才能让 LLM 看到"过去发生了什么"。
+  </p>
+<p>
+  <strong>设计思想</strong>: History 的 API 必须<strong>收窄</strong>。
+  任何"能不能加一个 search" 的诱惑都先拒绝 — 后面有 hook /
+  memory 时再扩展。 这一版的 4 个方法, 覆盖了 80% 的场景: add /
+  getMessages / size / replaceEntries。 多一个方法, 就多一个
+  "会不会被滥用" 的地方。
+  </p>
+<p>
+  <strong>实现细节</strong>:
 </p>
-<dl class="defs">
-  <dt><code>src/config.ts</code></dt>
-  <dd>
-    读取环境变量并校验; 导出 <code>loadConfig()</code>,返回
-    <code>{ apiKey, model, ... }</code>。本文件不出现业务逻辑。
-  </dd>
-
-  <dt><code>src/terminal.ts</code></dt>
-  <dd>
-    封装 readline 的输入输出; 暴露 <code>ask()</code> / <code>print()</code> /
-    <code>close()</code>。本文件不出现 LLM 调用,也不出现 History。
-  </dd>
-
-  <dt><code>src/history.ts</code></dt>
-  <dd>
-    <code>createHistory()</code> 工厂; 内部 <code>messages: Message[]</code>。
-    暴露 <code>add</code> / <code>getMessages</code> / <code>clear</code>。
-    <code>getMessages</code> 必返回浅拷贝 (第 00 章不变量 2)。
-  </dd>
-
-  <dt><code>src/llm.ts</code></dt>
-  <dd>
-    <code>createLlm(config)</code> 工厂; 暴露 <code>chat(messages)</code>。
-    真实实现调 Anthropic SDK, fake LLM 由测试用同名 interface 注入。
-  </dd>
-
-  <dt><code>src/agent.ts</code></dt>
-  <dd>
-    <code>createAgent({ history, llm })</code> 工厂; 暴露
-    <code>run(query)</code>。本文件不读环境变量, 不 new LLMClient, 不直接用
-    readline (第 00 章边界 checklist)。
-  </dd>
-
-  <dt><code>src/repl.ts</code></dt>
-  <dd>
-    <code>createRepl({ agent, terminal })</code> 工厂; 暴露
-    <code>start()</code>。循环 <code>ask → agent.run → print</code>, 处理
-    <code>exit</code> / <code>quit</code> / 空输入。
-  </dd>
-
-  <dt><code>src/index.ts</code> (Composition Root)</dt>
-  <dd>
-    <code
-      >loadConfig → createHistory → createLlm → createTerminal →
-      createAgent(deps) → createRepl(deps) → repl.start()</code
-    >。 本文件不出现 if 分支, 不出现 try/catch 业务逻辑。
-  </dd>
-</dl>
-
-<h2 id="interfaces">接口形状: 在写实现前钉死</h2>
+<pre class="code-block"><code>// 教学简化版, 仓库真实实现见 GitHub 永久链接
+interface History {
+  add(entry: HistoryEntry): void;
+  getMessages(): ChatCompletionMessageParam[];
+  size(): number;
+  replaceEntries(entries: HistoryEntry[]): void;  // 压缩用
+}</code></pre>
+<p class="source-link"><a href="https://github.com/pingp76/swoopcode/blob/main/src/history.ts#L1" rel="noreferrer" target="_blank">GitHub · src/history.ts History 实现 (L1)</a></p>
 <p>
-  接口是给 LLM (也就是"另一个会读这段代码的智能体") 看的名片,好名字比
-  好实现更重要。
+  4 个方法的边界:
 </p>
-<pre class="code-block"><code>// src/history.ts
-export interface Message {
-  role: "user" | "assistant" | "system" | "tool";
-  content: string;
-  // 第 02 章会扩展 tool_calls, tool_call_id, name 等字段。
-}
-
-export interface History {
-  add(message: Message): void;
-  getMessages(): Message[];
-  clear(): void;
-}
-
-export function createHistory(): History { /* ... */ }</code></pre>
-
-<pre class="code-block"><code>// src/llm.ts
-export interface AssistantMessage {
-  role: "assistant";
-  content: string;
-  // 第 02 章会扩展 tool_calls。
-}
-
-export interface LLMClient {
-  chat(messages: Message[]): Promise&lt;AssistantMessage&gt;;
-}
-
-export function createLlm(config: Config): LLMClient { /* ... */ }</code></pre>
-
-<pre class="code-block"><code>// src/agent.ts
-export interface Agent {
-  run(query: string): Promise&lt;string&gt;;
-}
-
-export interface AgentDeps {
-  history: History;
-  llm: LLMClient;
-}
-
-export function createAgent(deps: AgentDeps): Agent { /* ... */ }</code></pre>
-
-<pre class="code-block"><code>// src/repl.ts
-export interface Terminal {
-  ask(): Promise&lt;string | null&gt;;  // null 表示 EOF / 关闭
-  print(text: string): void;
-  close(): void;
-}
-
-export interface ReplDeps {
-  agent: Agent;
-  terminal: Terminal;
+<ul>
+<li>
+<strong><code>add</code> 只追加, 不修改历史</strong>: LLM 已经看到过的消息
+    不能改, 否则会出现"LLM 之前看到 A, 现在看到 A'" 的不一致。
+  </li>
+<li>
+<strong><code>getMessages</code> 是唯一出口</strong>: 返回
+    <code>ChatCompletionMessageParam[]</code>, 喂给 LLM。 history
+    内部可以有 metadata (时间戳 / token 数), 但出口必须干净。
+  </li>
+<li>
+<strong><code>size</code> 估算 token, 触发压缩</strong>: 简单按字符数估算,
+    中文字符 × 1.5, 英文 × 0.25。 不需要精确, 数量级对就行。
+  </li>
+<li>
+<strong><code>replaceEntries</code> 专给压缩用, 普通路径不调</strong>:
+    第 06 章 P2 全量压缩时, 把"最近 6 块 + 之前总结成 1 块"
+    整体替换。 这个方法权限大, 必须有显式调用点, 不能被普通业务乱调。
+  </li>
+</ul>
+<h3>LLMClient · LLM 抽象</h3>
+<p>
+  <strong>用途</strong>: 唯一跟"真实 LLM" 打交道的入口。 业务模块
+  (agent / tool) 不调 OpenAI SDK, 调 <code>LLMClient.chat()</code> 一个
+  方法。 换模型 / 换 SDK / 测试, 都通过换 LLMClient 实现完成。
+  </p>
+<p>
+  <strong>真实场景</strong>: 跑 100 轮对话后, 发现 Anthropic Claude 比
+  OpenAI gpt-4 在长上下文表现更好, 想切 Claude。 如果业务代码直接
+  调 <code>openai.chat.completions.create()</code>, 切换要改 20+ 处;
+  调 <code>LLMClient.chat()</code>, 切换只要换 LLMClient 实现。
+  </p>
+<p>
+  <strong>设计思想</strong>: 接口故意只暴露 chat。 temperature /
+  max_tokens / streaming 都不在第一版接口里, 是为了不让上层业务和
+  "这个 LLM SDK 有这个参数" 绑死。 后续章节会逐步加, 但加在
+  llm.ts 内部, 不外泄到 agent.ts。
+  </p>
+<p>
+  <strong>实现细节</strong>:
+</p>
+<pre class="code-block"><code>interface LLMClient {
+  chat(args: {
+    messages: ChatCompletionMessageParam[];
+    tools?: ChatCompletionTool[];
+  }): Promise&lt;LLMResponse&gt;;
 }
 
-export function createRepl(deps: ReplDeps): Repl { /* ... */ }</code></pre>
-
-<h2 id="delta-loop">loop 路径: 与第 00 章图 00-1 一一对应</h2>
+interface LLMResponse {
+  content: string | null;
+  toolCalls: ChatCompletionMessageToolCall[];
+  finishReason: string | null;
+}</code></pre>
+<p class="source-link"><a href="https://github.com/pingp76/swoopcode/blob/main/src/llm.ts#L1" rel="noreferrer" target="_blank">GitHub · src/llm.ts LLMClient 接口 + Adapter (L1)</a></p>
+<p>
+  为什么 LLMResponse 字段这么少?
+  答: harness 关心的就是"模型说了什么, 调了哪些工具, 为啥停"。
+  其中 content 和 toolCalls 会原样写进 history.add() 的 entry。 多一个字段,
+  就多一个"会不会被滥用" 的地方。
+  </p>
+<h3>Agent · 主循环</h3>
+<p>
+  <strong>用途</strong>: 整个 harness 的业务入口。 用户调
+  <code>agent.run(query)</code>, agent 负责"写到 history → 调 LLM →
+  调工具 → 返回结果" 的完整流程。 所有其他模块 (history / llm /
+  tool) 都被 agent 协调。
+  </p>
+<p>
+  <strong>真实场景</strong>: REPL 收到用户输入, 调 <code>agent.run("帮我
+  看看 README")</code>, agent 内部: 写 user 到 history → 调 LLM →
+  LLM 返回 assistant (调 run_read) → 调 run_read 读 README → 写
+  tool result 到 history → 调 LLM 续问 → LLM 返回 assistant (没
+  tool call) → 返回最终回复。
+  </p>
 <p>
-  下面这段实现直接对应第 00 章的伪码。每行右侧标注了它在图 00-1 中
-  的节点,方便对照。
+  <strong>设计思想</strong>: 6 步骨架, 每步职责单一, 用 <code>for (;;)</code>
+  循环而不是 <code>for (let i = 0; i &lt; maxRounds; i++)</code>。
+  早期版本用后者更安全, 但后续会引入 TODO + SubAgent, "最大轮数"
+  应该由子 agent 自己的 maxRounds 控制, 不是主 loop 强制。 主 loop
+  只看"用户有没有得到回复", 终止条件是 LLM 返回无 tool_call。
+  </p>
+<p>
+  <strong>实现细节</strong>:
 </p>
-<pre class="code-block"><code>// src/agent.ts
-export function createAgent(deps: AgentDeps): Agent {
-  const { history, llm } = deps;
-
+<pre class="code-block"><code>// 教学简化版, 仓库真实实现见 GitHub 永久链接
+export function createAgent(deps: { llm: LLMClient; history: History }): Agent {
   return {
     async run(query: string): Promise&lt;string&gt; {
-      // 边界 checklist: 空 query 不写入 history
-      if (query.trim().length === 0) {
-        return "";
+      // 第 1 步: 用户输入写入 history
+      deps.history.add({ role: "user", content: query });
+
+      for (;;) {
+        // 第 2 步: 从 history 拿 messages
+        const messages = deps.history.getMessages();
+        // 第 3 步: 调 LLM
+        const response = await deps.llm.chat({ messages });
+        // 第 4 步: 把回复写回 history
+        deps.history.add({
+          role: "assistant",
+          content: response.content,
+          tool_calls: response.toolCalls,
+        });
+        // 第 5 步 (本章占位): 没有 tool call, 返回最终回复
+        if (response.toolCalls.length === 0) {
+          return response.content ?? "";
+        }
+        // 第 5 步 (第 02 章展开): 有 tool call, 执行
+        // 第 6 步: 回到第 2 步继续 loop
       }
-
-      // 图 00-1 节点 3: 写 user message
-      history.add({ role: "user", content: query });
-
-      // 图 00-1 节点 4: 取 messages 浅拷贝
-      const messages = history.getMessages();
-
-      // 图 00-1 节点 5: 调 LLM
-      const assistant = await llm.chat(messages);
-
-      // 图 00-1 节点 6: 写 assistant message
-      history.add(assistant);
-
-      return assistant.content;
     },
   };
 }</code></pre>
-
-<h2 id="composition-root">Composition Root: 把 5 件套装到一根线</h2>
+<p class="source-link"><a href="https://github.com/pingp76/swoopcode/blob/main/src/agent.ts#L1" rel="noreferrer" target="_blank">GitHub · src/agent.ts 6 步骨架完整实现 (L1)</a></p>
 <p>
-  <code>index.ts</code> 的存在意义是"5 件套的接线图"。它不出现 if 分支, 也不出现
-  try/catch,因为这两类代码一旦塞进 Composition Root,后续 子智能体和 e2e
-  都会变难。
+  这一章先不展开 step 5 (工具执行), 让 loop 跑通"两轮对话保留上下文"。
+  step 5 留到第 02 章, 那里会加 <code>ToolRegistry</code>, 并讲 5 个常见 bug。
 </p>
-<pre class="code-block"><code>// src/index.ts
-import { loadConfig } from "./config.js";
-import { createHistory } from "./history.js";
-import { createLlm } from "./llm.js";
-import { createTerminal } from "./terminal.js";
-import { createAgent } from "./agent.js";
-import { createRepl } from "./repl.js";
-
-export async function main(): Promise&lt;void&gt; {
-  const config = loadConfig();
-  const history = createHistory();
-  const llm = createLlm(config);
-  const terminal = createTerminal();
-  const agent = createAgent({ history, llm });
-  const repl = createRepl({ agent, terminal });
-  await repl.start();
-}
-
-main().catch((err) =&gt; {
-  console.error(err);
-  process.exit(1);
-});</code></pre>
-
-<h2 id="repl-edge">REPL 的边界: 退出命令和空输入</h2>
+<h3>REPL · 终端循环</h3>
+<p>
+  <strong>用途</strong>: 让用户和 harness 对话。 用户在终端输入,
+  REPL 调 <code>agent.run()</code>, 把回复打回去。 是 harness 的
+  "I/O 边界" — 唯一接触终端的地方。
+  </p>
+<p>
+  <strong>真实场景</strong>: 用户跑 <code>npm run dev</code>, 看到
+  <code>&gt;</code> 提示符, 输入 "Hello", 看到 "Hi!", 输入 /exit 退出。
+  REPL 是这个交互循环的物理实现。
+  </p>
+<p>
+  <strong>设计思想</strong>: REPL 故意薄。 它只做 3 件事: 读一行, 调
+  agent.run, 打印回复。 复杂逻辑 (TODO 进度条 / SubAgent 状态)
+  不进 REPL, 那是 agent 自己的事。 REPL 越薄, 越容易换 (CLI →
+  GUI → Web 只换 REPL 这一层, 业务模块都不动)。
+  </p>
 <p>
-  REPL 这一层看起来是"输入输出",但它要承担三件事: (1) 拦截 <code>exit</code> /
-  <code>quit</code> 关闭 terminal, (2) 拦截空输入避免 agent 写入 History, (3)
-  拦截 Ctrl-D / EOF 优雅关闭。 这三件事都不应该让 agent 知道。
+  <strong>实现细节</strong>:
 </p>
-<pre class="code-block"><code>// src/repl.ts
-export function createRepl(deps: ReplDeps): Repl {
-  const { agent, terminal } = deps;
-
-  return {
-    async start(): Promise&lt;void&gt; {
-      // eslint-disable-next-line no-constant-condition
-      while (true) {
-        const raw = await terminal.ask();
-        if (raw === null) break;            // EOF
-        const query = raw.trim();
-        if (query.length === 0) continue;   // 空输入不写入 history
-        if (query === "exit" || query === "quit") break;
-        const answer = await agent.run(query);
-        terminal.print(answer);
-      }
-      terminal.close();
-    },
-  };
+<pre class="code-block"><code>// 教学简化版, 仓库真实实现见 GitHub 永久链接
+export async function startRepl(agent: Agent, terminal: Terminal): Promise&lt;void&gt; {
+  terminal.println("Mini Agent REPL. 输入 /exit 退出。");
+  for (;;) {
+    const line = await terminal.question("&gt; ");
+    if (line === "/exit") break;
+    const reply = await agent.run(line);
+    terminal.println(reply);
+  }
 }</code></pre>
-
-<h2 id="fake-llm">Fake LLM Cookbook: 怎么写 messages 顺序断言</h2>
+<p class="source-link"><a href="https://github.com/pingp76/swoopcode/blob/main/src/repl.ts#L1" rel="noreferrer" target="_blank">GitHub · src/repl.ts REPL 实现 (L1)</a></p>
+<p>
+  Terminal 也是注入的, 不是 <code>readline</code> 直接 import。 这样
+  测试可以注入"自动回答" 的 fake terminal, 不需要真的 readline。
+  </p>
+<h3>Composition Root · 唯一组装点</h3>
+<p>
+  <strong>用途</strong>: 唯一 import 一切、new 一切的地方。 业务模块
+  (agent / history / llm) 内部不 import 彼此, 只接收注入。
+  换测试 = 换 <code>main()</code>, 业务代码 0 改动。
+  </p>
+<p>
+  <strong>真实场景</strong>: 你跑测试, 写 <code>testMain()</code> 注入
+  fake LLM / fake terminal; 你部署到 GUI, 写 <code>guiMain()</code>
+  注入 GUI 终端; 你部署到 CI, 写 <code>ciMain()</code> 不启动
+  REPL, 直接调 agent.run()。 三种 main 共享同一份业务模块。
+  </p>
 <p>
-  这一节给出一份能直接抄的 fake LLM。它解决第 00 章"红灯 1"中提到的
-  "只断言最终文本"的隐患。
+  <strong>设计思想</strong>: 这是经典<strong>依赖反转原则</strong> (DIP) 的
+  极端形式。 业务模块不再"创建依赖", 而是"接收依赖"。 创建在哪
+  里? 集中在 1 个 <code>main()</code> 函数。 这避免了"业务模块
+  互相 new 对方" 的混乱图, 让依赖关系"塌缩成一个点"。
+  </p>
+<p>
+  <strong>实现细节</strong>:
 </p>
-<pre class="code-block"><code>// test/_helpers/fake-llm.ts
-import type { LLMClient, AssistantMessage, Message } from "../../src/llm.js";
-
-export function createFakeLlm(scripted: string[]): LLMClient &amp; {
-  received: Message[][];
-} {
-  const received: Message[][] = [];
-  let i = 0;
-
-  return {
-    received,
-    async chat(messages: Message[]): Promise&lt;AssistantMessage&gt; {
-      // 记录每次调用收到的 messages 浅拷贝
-      received.push([...messages]);
-      const content = scripted[i++] ?? scripted[scripted.length - 1];
-      return { role: "assistant", content };
-    },
-  };
+<pre class="code-block"><code>// 教学简化版, 仓库真实实现见 GitHub 永久链接
+export async function main() {
+  const llm = createOpenAILLMClient({ apiKey: process.env.OPENAI_API_KEY! });
+  const history = createHistory();
+  const agent = createAgent({ llm, history });
+  const terminal = createReadlineTerminal();
+  await startRepl(agent, terminal);
 }</code></pre>
+<p class="source-link"><a href="https://github.com/pingp76/swoopcode/blob/main/src/index.ts#L1" rel="noreferrer" target="_blank">GitHub · src/index.ts Composition Root (L1)</a></p>
 <p>
-  测试中只断言 <code>fake.received[N][M].role</code> /
-  <code>.content</code>,就能在不调真实 LLM 的情况下验证"第二次 LLM
-  收到了什么"。这种断言比"最终文本对"严格得多,它是后续每一章 Validation
-  卡片的基础。
-</p>
-
-<h2 id="delta-test">Validation: 把 Prompt Card 的 5 条断言落到 vitest</h2>
+  <code>main()</code> 是<strong>唯一</strong>import 一切、new 一切的地方。
+  业务模块 (agent / history / llm) 内部不 import 彼此, 只接收注入。
+  </p>
+<h2 id="loop-figure">loop 在仓库里的位置</h2>
+<figure class="figure">
+<div class="flow-map" role="img" aria-label="5 件套架构 + 数据流">
+<div class="flow-row--center">
+<span class="flow-node">index.ts<br/><small>Composition Root</small></span>
+<span class="flow-arrow">↘</span>
+<span class="flow-node flow-node--accent">agent.run(query)</span>
+<span class="flow-arrow">↙</span>
+<span class="flow-node">repl.ts<br/><small>读一行 → 调 run</small></span>
+</div>
+<div class="flow-row--center" style="margin-top: var(--space-3)">
+<span class="flow-node flow-node--accent">agent.run</span>
+<span class="flow-arrow">→</span>
+<span class="flow-node">history<br/><small>闭包内消息序列</small></span>
+<span class="flow-arrow">→</span>
+<span class="flow-node">llm.chat<br/><small>OpenAI SDK</small></span>
+<span class="flow-arrow">→</span>
+<span class="flow-node">history.add(assistant)</span>
+</div>
+</div>
+<figcaption>图 01-1 · 5 件套架构. Composition Root 注入所有依赖, agent.run 是业务入口, history 是状态, llm 是放大镜。</figcaption>
+</figure>
+<figure class="figure">
+<div class="flow-compare" role="img" aria-label="朴素反例 vs 工厂+闭包">
+<div class="flow-compare__col flow-compare__col--bad">
+<div class="flow-compare__label">❌ 朴素反例</div>
+<span class="flow-node">let history = []<br/><small>module-level 单例</small></span>
+<span class="flow-node">let llm = new OpenAI()<br/><small>业务直接调 SDK</small></span>
+<span class="flow-node">业务函数 new 一切<br/><small>父子 agent 共享</small></span>
+</div>
+<div class="flow-compare__col flow-compare__col--good">
+<div class="flow-compare__label">✅ 工厂 + 闭包</div>
+<span class="flow-node">createHistory()<br/><small>每次闭包独立</small></span>
+<span class="flow-node">createLLMClient()<br/><small>业务只见接口</small></span>
+<span class="flow-node">createAgent(deps)<br/><small>依赖注入</small></span>
+</div>
+</div>
+<figcaption>图 01-2 · 朴素反例 vs 工厂+闭包. 朴素写法短 1 行, 工厂写法长 3 行, 但加新场景时改 0 处。</figcaption>
+</figure>
+<figure class="figure">
+<div class="flow-stack" role="img" aria-label="稳定前缀 vs 动态 tail">
+<div class="flow-stack__layer flow-stack__layer--stable">
+<div class="flow-stack__label">稳定前缀 · 进 cache</div>
+<span class="flow-node">system prompt</span>
+<span class="flow-node">+ tool descriptions (按 skill 集合)</span>
+</div>
+<div class="flow-stack__arrow">↓</div>
+<div class="flow-stack__layer flow-stack__layer--dynamic">
+<div class="flow-stack__label">动态 tail · 不进 cache, 算增量</div>
+<span class="flow-node">history messages</span>
+<span class="flow-node">+ reminders (TODO / memory / skill)</span>
+<span class="flow-node">+ 当前 user query</span>
+</div>
+</div>
+<figcaption>图 01-3 · 稳定前缀 vs 动态 tail. 状态走 reminder, system prompt 写后不改 (第 10 章展开)。</figcaption>
+</figure>
+<h2 id="fake-llm-test">fake LLM 测试: 验证不是偶然跑通</h2>
 <p>
-  下面 5 条断言直接来自第 00 章 Prompt Card 的"验证"一节。每条都能用 fake LLM 在
-  <code>&lt;100ms</code> 内跑完。
-</p>
-<pre class="code-block"><code>// test/agent.test.ts
-import { describe, it, expect } from "vitest";
-import { createAgent } from "../src/agent.js";
-import { createHistory } from "../src/history.js";
-import { createFakeLlm } from "./_helpers/fake-llm.js";
-
-describe("agent.run (最小 loop)", () =&gt; {
-  it("单轮: 透传 fake LLM 返回值", async () =&gt; {
-    const llm = createFakeLlm(["收到"]);
-    const agent = createAgent({ history: createHistory(), llm });
-    expect(await agent.run("你好")).toBe("收到");
-  });
-
-  it("多轮: 第二次 LLM 收到的 messages 包含第一轮 user 和 assistant", async () =&gt; {
-    const llm = createFakeLlm(["A1", "A2"]);
-    const agent = createAgent({ history: createHistory(), llm });
-
-    await agent.run("u1");
-    await agent.run("u2");
-
-    expect(llm.received[1]?.length).toBe(3);                  // [u1, a1, u2]
-    expect(llm.received[1]?.[0]?.role).toBe("user");
-    expect(llm.received[1]?.[0]?.content).toBe("u1");
-    expect(llm.received[1]?.[1]?.role).toBe("assistant");
-    expect(llm.received[1]?.[1]?.content).toBe("A1");
-    expect(llm.received[1]?.[2]?.role).toBe("user");
-  });
-
-  it("空 query 不写入 history", async () =&gt; {
-    const llm = createFakeLlm(["A"]);
-    const history = createHistory();
-    const agent = createAgent({ history, llm });
-
-    expect(await agent.run("")).toBe("");
-    expect(history.getMessages().length).toBe(0);
-    expect(llm.received.length).toBe(0);
-  });
-
-  it("getMessages 返回浅拷贝, 外部 push 不影响内部", () =&gt; {
-    const history = createHistory();
-    history.add({ role: "user", content: "u" });
-
-    const snap = history.getMessages();
-    snap.push({ role: "user", content: "hacked" });
-
-    expect(history.getMessages().length).toBe(1);
-    expect(history.getMessages()[0]?.content).toBe("u");
-  });
-});</code></pre>
+  朴素做法"跑通" 是最危险的。 你跑了 1 次, 它通了, 你就觉得
+  "代码没问题", 改天改一行又挂了, 不知道为什么。
+  </p>
 <p>
-  这 4 条断言 (5 条里第 5 条是 REPL 的,放到 <code>repl.test.ts</code>)
-  已经覆盖了第 00 章 Prompt Card 边界 checklist 的全部 5 条。 第 02
-  章开始,每一章都会沿用"fake LLM + messages 顺序断言"的模式, 只是 messages
-  数量和 role 序列变长。
-</p>
-
-<h2 id="trap">反例梯度: 新手 / 中级 / 高级</h2>
+  <strong>设计思想</strong>: 关键洞见是 fake LLM 不仅能"假装模型",
+  还能"捕获模型收到的所有 messages"。 每次 <code>chat()</code> 调用
+  时, 真实 LLM 收到什么 messages, fake LLM 也能拿到一份。 我们
+  用这个能力写断言。
+  </p>
 <p>
-  本章的反例按"看起来能跑 / 看上去更优雅 / 用对了工具但放错位置"分三档。
-  读完后你应该能说出"为什么 LLM 经常写出 B 或 C"。
+  为什么不用 snapshot 测试? snapshot 测试对比 messages 的
+  <strong>完整结构</strong> (role / content / tool_calls 全部字段)。
+  问题: 任何"无害的改动" (比如多插一条 reminder 消息), 都会让
+  snapshot 挂。 我们用 <code>toContainEqual</code> 断言<strong>关键消息
+  存在</strong>, 不在意数组里的其他消息。 注意: 它对单条消息仍做深度相等比较, 如果消息会新增 metadata 之类字段, 期望值要包一层 <code>expect.objectContaining(...)</code>。 这样测试更稳定。
+  </p>
+<p>
+  <strong>实现细节</strong>:
 </p>
+<pre class="code-block"><code>// 教学简化版, 仓库真实实现见 GitHub 永久链接
+function createFakeLLM(responses: LLMResponse[]): LLMClient &amp; { lastMessages: () =&gt; any[]; allCalls: () =&gt; { messages: any[]; tools?: any[] }[] } {
+  let i = 0;
+  const calls: { messages: any[]; tools?: any[] }[] = [];
+  return {
+    async chat({ messages, tools }) {
+      calls.push({ messages, tools });
+      if (i &gt;= responses.length) throw new Error("no more fake responses");
+      return responses[i++];
+    },
+    lastMessages() { return calls[calls.length - 1].messages; },
+    allCalls() { return calls; },
+  };
+}
 
+test("agent.run 第二轮能看见第一轮的 user + assistant", async () =&gt; {
+  const llm = createFakeLLM([
+    { content: "Hi!", toolCalls: [], finishReason: "stop" },
+    { content: "Yes, I remember.", toolCalls: [], finishReason: "stop" },
+  ]);
+  const agent = createAgent({ llm, history: createHistory() });
+
+  await agent.run("Hello");
+  await agent.run("Do you remember?");
+
+  // 断言: 第二次 chat() 收到的 messages 包含第一次的 user + assistant
+  const lastCall = llm.allCalls()[1];
+  expect(lastCall.messages).toContainEqual({ role: "user", content: "Hello" });
+  expect(lastCall.messages).toContainEqual({ role: "assistant", content: "Hi!" });
+  expect(lastCall.messages).toContainEqual({ role: "user", content: "Do you remember?" });
+});</code></pre>
+<p class="source-link"><a href="https://github.com/pingp76/swoopcode/blob/main/src/agent.test.ts#L1" rel="noreferrer" target="_blank">GitHub · src/agent.test.ts fake LLM 测试 (L1)</a></p>
+<p>
+  这条测试如果挂了, 你立刻知道 "history 没正确累积"。
+  跑 1000 次, 1000 次结果都一样, 不会因为 LLM 升级突然挂。
+  </p>
+<h2 id="common-confusion">3 个常见误解</h2>
+<dl class="defs">
+<dt>误解 1 · "History 是给用户看的"</dt>
+<dd>
+    错。 History 是给 LLM 看的, 用户看的是 REPL 渲染。
+    History 里有 tool_call / tool_result / metadata, 这些不进 REPL 显示。
+    混了之后, "REPL 怎么不显示完整 tool result?" 这种问题答不上来。
+  </dd>
+<dt>误解 2 · "LLMClient 是单例"</dt>
+<dd>
+    错。 LLMClient 也是工厂创建的, 多个 agent 实例可以共享一个
+    LLMClient (避免双倍费用), 也可以各自独立 (测试场景)。
+    "共享 vs 独立" 是 Composition Root 决定的, 不是 LLMClient 决定的。
+  </dd>
+<dt>误解 3 · "Loop 是 for (let i = 0; i &lt; N; i++)"</dt>
+<dd>
+    错。 Loop 是 <code>for (;;)</code>, 终止条件是 LLM 返回无 tool_call。
+    "最大轮数" 是 agent 收到的参数, 由主 agent / SubAgent / 评测
+    case 各自决定, 不在 loop 内部硬编码。
+  </dd>
+</dl>
+<h2 id="trap">反例梯度</h2>
 <div class="card card--trap">
-  <div class="card__head">
-    <span class="card__tag">新手错法 · A</span>
-  </div>
-  <div class="card__body">
-    <p>
-      <strong>常见错误:</strong>把 <code>agent.run()</code> 实现成"调一次
-      LLM,把响应拼接返回",不写 history。
+<div class="card__head">
+<span class="card__tag">新手错法 · A</span>
+</div>
+<div class="card__body">
+<p>
+<strong>常见错误:</strong> module-level <code>let history = []</code>。
     </p>
-    <p>
-      <strong>为什么错:</strong>多轮上下文立刻断, 第二句会得到"我不知道
-      你喜欢什么"。
+<p>
+<strong>为什么错:</strong> 子智能体共享父 agent 的 history, 跨上下文泄露。
     </p>
-    <p>
-      <strong>正确做法:</strong>严格按图 00-1 的 4 步, 写入 → 取 messages → 调
-      LLM → 写回。
+<p>
+<strong>正确做法:</strong> <code>history</code> 在 <code>createAgent()</code> 闭包内。
     </p>
-  </div>
 </div>
-
+</div>
 <div class="card card--trap">
-  <div class="card__head">
-    <span class="card__tag">中级错法 · B</span>
-  </div>
-  <div class="card__body">
-    <p>
-      <strong>常见错误:</strong>把 History 做成 module-level 单例, 在
-      <code>agent.ts</code> 内
-      <code>import { history } from "./history.js"</code>。
+<div class="card__head">
+<span class="card__tag">中级错法 · B</span>
+</div>
+<div class="card__body">
+<p>
+<strong>常见错误:</strong> agent.ts 直接 <code>import OpenAI from "openai"</code>。
     </p>
-    <p>
-      <strong>为什么错:</strong>测试时无法替换, 子智能体无法隔离, Composition
-      Root 失去可观测性 (第 00 章中级错法 B)。
+<p>
+<strong>为什么错:</strong> 测试要 mock SDK, 换模型要重写。
+    </p>
+<p>
+<strong>正确做法:</strong> agent.ts 只用 <code>LLMClient</code> 接口, SDK 在 llm.ts 内部 import。
     </p>
-    <p><strong>正确做法:</strong>Composition Root 创建实例, 通过 deps 注入。</p>
-  </div>
 </div>
-
+</div>
 <div class="card card--trap">
-  <div class="card__head">
-    <span class="card__tag">高级错法 · C</span>
-  </div>
-  <div class="card__body">
-    <p>
-      <strong>常见错误:</strong><code>getMessages()</code> 返回内部数组引用。
+<div class="card__head">
+<span class="card__tag">高级错法 · C</span>
+</div>
+<div class="card__body">
+<p>
+<strong>常见错误:</strong> Composition Root 不存在, 业务模块互相 import。
     </p>
-    <p>
-      <strong>为什么错:</strong>LLM client 拼下一轮 messages 时
-      <code>[...msgs, thisTurnUser]</code> 会反向污染 history (第 00 章 高级错法
-      C)。
+<p>
+<strong>为什么错:</strong> 子智能体无法换 LLM, 测试无法换 fake。
     </p>
-    <p>
-      <strong>正确做法:</strong>返回 <code>[...this.messages]</code> 浅拷贝,
-      并在 Validation 里写一条 push 反向断言。
+<p>
+<strong>正确做法:</strong> <code>index.ts</code> 是唯一 import 一切的地方, 业务模块只接收注入。
     </p>
-  </div>
 </div>
-
+</div>
 <div class="card card--trap">
-  <div class="card__head">
-    <span class="card__tag">边界错法 · D</span>
-  </div>
-  <div class="card__body">
-    <p><strong>常见错误:</strong>空字符串 query 仍然写 history 并调 LLM。</p>
-    <p>
-      <strong>为什么错:</strong>REPL 误触 (例如连续回车) 会让 history 出现
-      <code>role: "user", content: ""</code>, 既污染上下文又浪费 token。
+<div class="card__head">
+<span class="card__tag">边界错法 · D</span>
+</div>
+<div class="card__body">
+<p>
+<strong>常见错误:</strong> 测试断言"最终回复文本"。
     </p>
-    <p>
-      <strong>正确做法:</strong>在 agent 层和 repl 层各加一次空输入拦截, 并在
-      Validation 里断言"空输入不增加 history 长度"。
+<p>
+<strong>为什么错:</strong> LLM 升级后措辞变化, 测试假阳性挂。
+    </p>
+<p>
+<strong>正确做法:</strong> 用 fake LLM 断言 <code>messages</code> 顺序, 不断言最终回复。
     </p>
-  </div>
 </div>
-
+</div>
 <h2 id="validate">如何验证 (本章 Validation 卡片)</h2>
 <div class="card card--validation">
-  <div class="card__head">
-    <span class="card__tag">Validation · 第 01 章</span>
-  </div>
-  <div class="card__body">
-    <p>
-      <strong>单轮透传:</strong>fake LLM 返回 "收到",
-      <code>agent.run("x")</code> === "收到"。
-    </p>
-    <p>
-      <strong>多轮 messages 顺序:</strong>连续两次 run 后, fake LLM 第二次
-      收到的 messages 长度 == 3, role 序列 == [user, assistant, user], content
-      与第一轮输入一致。
+<div class="card__head">
+<span class="card__tag">Validation · 第 01 章</span>
+</div>
+<div class="card__body">
+<p>
+<strong>两轮对话保留上下文:</strong> fake LLM, 第一轮问"Hello", 第二轮问
+      "Do you remember?", 第二次 chat() 收到的 messages 含
+      第一次的 user + assistant。
     </p>
-    <p>
-      <strong>空 query 不写 history:</strong><code>run("")</code> 返回 "",
-      history 长度不变, fake LLM 收到 0 次调用。
+<p>
+<strong>History 在闭包内:</strong> <code>grep -n '^let history\|^const history' src/agent.ts</code>
+      应当 0 行, 只能出现在 <code>createAgent()</code> 函数体内。
     </p>
-    <p>
-      <strong>getMessages 浅拷贝:</strong>外部 push 新元素后, history
-      内部不受影响 (反向断言)。
+<p>
+<strong>LLMClient 是窄接口:</strong> agent.ts 不 import SDK, 验证
+      <code>grep -n 'openai\|anthropic' src/agent.ts</code> 应当 0 行。
     </p>
-    <p>
-      <strong>Composition Root 实例共享:</strong>在 <code>index.ts</code> 内
-      <code>history</code> 只 new 一次, agent 和 repl 拿到的是同一份 引用 (用
-      <code>===</code> 断言)。
+<p>
+<strong>Composition Root 唯一组装:</strong> <code>index.ts</code> 是唯一 new 一切的地方,
+      验证 agent.ts / history.ts / llm.ts 互不 import。
     </p>
-  </div>
 </div>
-
-<h2 id="debug">如果实现失败,先查哪里</h2>
-<ol>
-  <li>
-    <strong>多轮断言失败:</strong>看 <code>agent.run()</code> 是不是漏写了
-    <code>history.add(assistant)</code>, 或者 <code>getMessages()</code>
-    返回了浅拷贝但外部没拿到。
-  </li>
-  <li>
-    <strong>空 query 仍然写 history:</strong>看
-    <code>agent.run()</code> 顶部是否做了
-    <code>if (query.trim().length === 0) return ""</code>。
-  </li>
-  <li>
-    <strong>浅拷贝断言失败:</strong>看 <code>getMessages()</code> 是不是 返回了
-    <code>this.messages</code> 而不是 <code>[...this.messages]</code>。
-  </li>
-  <li>
-    <strong>Composition Root 出现 if 分支:</strong>把 if 拆到对应的工厂 函数或
-    REPL 里, index.ts 只做接线。
-  </li>
-  <li>
-    <strong>fake LLM 收到 0 次调用:</strong>检查
-    <code>llm.received</code> 是不是被 fake 工厂外部覆盖 (要直接读
-    <code>llm.received</code>, 不要在测试里用 <code>fake.received</code>)。
-  </li>
-</ol>
-
-<h2 id="lookback">回望第 00 章: 哪些原则在本章兑现了</h2>
-<p>
-  这一节是"知识回环"。每章末尾都要把"自己兑现了哪些第 00 章原则"
-  列出来,作为对前章的差量验证。
-</p>
+</div>
+<h2 id="lookback">回望: 哪些原则在本章兑现了</h2>
 <ul>
-  <li>
-    <strong>元方法 (4 步思考链):</strong>本章 "作者怎么想的" 一节把"现象 → 反例
-    → 接口 → 验证"完整走了一遍, Prompt Card 卡片里没有"注意架构" 这种空话。
-  </li>
-  <li>
-    <strong>术语锁定:</strong><code>History</code> / <code>LLMClient</code> /
-    <code>Composition Root</code> / <code>fake LLM</code> 全部以英文出现,
-    中文仅做"外层运行环境""消息列表"等一次性释义。
-  </li>
-  <li>
-    <strong>防自欺:</strong>红灯 1 "跑通 ≠ 正确" 体现在 fake LLM 的 messages
-    顺序断言上; 红灯 3 "LLM 说做完了" 体现在边界 checklist 的每条都可 翻译为
-    vitest 断言上; 红灯 4 "卡片漂亮 ≠ 实现漂亮" 体现在反例梯度 4 条具体错法上。
-  </li>
-  <li>
-    <strong>Prompt Card 6 段:</strong>第 00 章 Prompt Card 是 "6 段模板" 的
-    演示版, 第 01 章 Prompt Card 已经把 6 段全部按模板写, 不再空话。
-  </li>
+<li>
+<strong>工厂模式</strong>: <code>createAgent()</code> 返回实例, history 在闭包内。
+    </li>
+<li>
+<strong>依赖注入</strong>: llm / history 通过参数传入, 不在内部 new。
+    </li>
+<li>
+<strong>Composition Root</strong>: index.ts 唯一组装点。
+    </li>
+<li>
+<strong>fake 测试</strong>: 不依赖真实 LLM, 用 scripted response 验证 messages 顺序。
+    </li>
 </ul>
-
 <h2 id="forward">前瞻张力: 留给后续章节</h2>
-<p>
-  现在的 loop 能跑, 但还差几件事: assistant 里出现 tool call 时 (第 02
-  章)、上下文增长到模型放不下时 (第 06 章)、多轮里 LLM 临时 抽风时 (第 11
-  章)、需要让另一个 agent 接力时 (第 04 章)。这一节 把这些张力显式钉出来,
-  后续章节会逐个回来解决。
-</p>
 <dl class="defs">
-  <dt>tool call 分支</dt>
-  <dd>
-    当 assistant.content 是空但出现 tool_calls 数组时, loop 不能返回 文本,
-    而要执行工具、写入 tool result、再发一次 LLM。这条分支在 第 02 章接入, 但
-    prompt 模板里要提前留出 if 分支的形状。
-  </dd>
-  <dt>context 增长</dt>
-  <dd>
-    长对话会让 messages 超过模型窗口。History 之后会被 normalize / block /
-    compress (第 06 章), 这意味着 history 不能再"原样返回", 它要知道自己"是
-    prompt working context", 不等同于"原始记录"。
-  </dd>
-  <dt>LLM 异常</dt>
-  <dd>
-    LLM 返回空、超时、JSON 损坏时, agent 要决定"重试 / 跳过 / 中止"。 第 11
-    章会引入 recovery 边界, 现在的 agent.run() 还没考虑这些。
-  </dd>
-  <dt>子智能体</dt>
-  <dd>
-    第 04 章会让 agent.run() 内部"开第二个 loop", 但第二个 loop 的 History
-    必须和父 agent 隔离。这意味着 History 不能再 module-level, createHistory()
-    必须是工厂 (这一章已经做到)。
-  </dd>
+<dt>工具调用</dt>
+<dd>
+    loop 第 5 步"有 tool_call 就执行" 留空, 第 02 章展开。
+    </dd>
+<dt>压缩</dt>
+<dd>
+    history 不能无限增长, 第 06 章加 P0 / P1 / P2 三层压缩。
+    </dd>
+<dt>权限</dt>
+<dd>
+    工具执行要不要先问人, 第 07 章展开。
+    </dd>
 </dl>
-
-<h2 id="vibe-coding-01">本次如何 vibe code: 第 01 章的三件套</h2>
-<p>
-  按第 00 章 vibe-coding 方法论的拆卡 / review / 迭代三件套, 第 01
-  章的具体操作如下。 后续 02–15 章都会按这个格式给出 "本次如何 vibe code",
-  请你也按这个格式去写自己的 LLM 对话。
-</p>
-
-<h3 id="vibe-feed-01">拆卡: 4 轮迭代的具体产物</h3>
-<ol>
-  <li>
-    <strong>第 1 轮 · 接口</strong>。只贴 "目标 + 场景 + 模块", 让 LLM 给出
-    <code>interface History</code> / <code>interface LLMClient</code> /
-    <code>interface Agent</code> / <code>interface Terminal</code> 四个
-    interface 的草案。本轮不写实现, 只钉形状与命名。
-  </li>
-  <li>
-    <strong>第 2 轮 · 接线</strong>。贴 "模块 + 接线", 让 LLM 给出
-    <code>src/index.ts</code> 接线代码, 其中 <code>createHistory</code> /
-    <code>createLlm</code> / <code>createAgent</code> /
-    <code>createRepl</code> 都是返回空对象的 stub。本轮 review 重点:
-    <code>history</code> 实例是否 在 <code>index.ts</code> 中只 new 一次, agent
-    和 repl 拿到的是不是 同一份引用 (<code>===</code> 断言)。
-  </li>
-  <li>
-    <strong>第 3 轮 · 边界</strong>。贴 "边界 (5 条 checklist)", 让 LLM 按
-    checklist 写每个工厂的真实实现。本轮 review 重点 (V1 review checklist
-    的"依赖方向"+"副作用"两栏): agent.ts 是否被偷偷塞进
-    <code>process.env</code> 读取, <code>getMessages</code> 是否返回浅拷贝。
-  </li>
-  <li>
-    <strong>第 4 轮 · 验证</strong>。贴 "验证 (5 条 vitest 断言)", 让 LLM 写
-    <code>test/agent.test.ts</code>。本轮 review 重点: fake LLM 是否 真正记录
-    messages (而不是只返回字符串), 多轮断言是否同时检查 role 和 content。
-  </li>
-</ol>
-<div class="note">
-  <p class="note__title">为什么第 1 轮不允许写实现</p>
-  <p>
-    LLM 拿到 "目标 + 场景 + 模块" 后, 默认会直接写实现。写实现会触发"假装清单"
-    里的"假装做了边界检查"。本轮显式禁止 "do not write implementation yet", 强迫
-    LLM 把注意力放在命名、参数顺序、返回类型上, 后续轮次再补实现。
-  </p>
+<h2 id="prompt-card">Prompt Card (本章任务)</h2>
+<div class="card card--prompt">
+<div class="card__head">
+<span class="card__tag">Prompt Card · 第 01 章</span>
+<button class="card__copy" data-copy-card="" type="button">复制</button>
 </div>
-
-<h3 id="vibe-review-01">Review: 第 01 章专属 checklist</h3>
-<p>在第 00 章通用 review checklist 之上, 第 01 章额外要逐条核对的 5 条:</p>
-<ol>
-  <li>
-    <strong>history 是工厂而不是单例。</strong
-    ><code>grep -n 'export const history' src/</code>
-    应当 0 行。如果出现, 立即回退到第 2 轮让 LLM 改成工厂。
-  </li>
-  <li>
-    <strong>getMessages 返回浅拷贝。</strong>看实现是不是
-    <code>return [...this.messages]</code>, 而不是
-    <code>return this.messages</code>。 必须配合 Validation 卡片里 "外部 push
-    不影响 history" 这条反向断言。
-  </li>
-  <li>
-    <strong>agent.run() 内不读环境变量。</strong
-    ><code>grep -n 'process.env' src/agent.ts</code>
-    应当 0 行。
-  </li>
-  <li>
-    <strong>空 query 在 agent.run() 顶部拦截。</strong>看是不是有
-    <code>if (query.trim().length === 0) return ""</code> 这行, 而不是 REPL
-    拦截后直接 <code>continue</code>。两层都要有。
-  </li>
-  <li>
-    <strong>Composition Root 内 <code>history</code> 只 new 一次。</strong
-    >写一个 <code>__test__.ts</code> (不进 git) 把 agent 和 repl 拿到的 history
-    引用 <code>===</code> 比对, 不通过则立即重写。
-  </li>
-</ol>
-
-<h3 id="vibe-debug-01">调试: 第 01 章典型伪装</h3>
-<p>LLM 在 01 章最常犯的两种伪装:</p>
-<ol>
-  <li>
-    <strong>伪装 A · 把 LLM 调用直接放进 REPL。</strong>症状:
-    <code>repl.ts</code> 里有 <code>await llm.chat(...)</code>。这意味着 REPL
-    绕过了 agent, history 不再唯一。验证:
-    <code>grep -n 'llm.chat' src/repl.ts</code> 应当 0 行;
-    <code>grep -n 'llm.chat' src/agent.ts</code> 应当 ≥ 1 行。
-  </li>
-  <li>
-    <strong>伪装 B · 把 history 写成 module-level 闭包单例。</strong>症状:
-    <code>history.ts</code> 顶层有
-    <code>let messages: Message[] = [];</code> 之类 module-level 状态。验证:
-    <code>grep -n '^let messages\|^const messages' src/history.ts</code> 应当 0
-    行 (只能在 <code>createHistory()</code> 函数体内出现)。
-  </li>
-</ol>
-
-<h3 id="vibe-iterate-01">迭代: 第 01 章 4 个 commit 节点</h3>
-<p>推荐的 4 个 commit 节点。每个 commit 跑通对应 vitest 集合后才进下一个:</p>
-<ol>
-  <li>
-    <code>feat(ch01): 钉 4 个 interface 形状, 不写实现</code> —— 跑
-    <code>tsc --noEmit</code> 通过, 没有任何实现代码。
-  </li>
-  <li>
-    <code>feat(ch01): Composition Root 接线, 工厂为 stub</code> —— 跑通
-    "Composition Root 实例共享" 那个一次性断言 (history 在 agent 和 repl 间是
-    <code>===</code>)。
-  </li>
-  <li>
-    <code>feat(ch01): 5 条边界 checklist 兑现</code> —— 跑通 Validation 卡片前 4
-    条 (单轮透传 / 多轮 messages 顺序 / 空 query / 浅拷贝)。
-  </li>
-  <li>
-    <code>test(ch01): REPL 边界 (exit / quit / EOF)</code> —— 跑通 第 5 条
-    Validation (REPL 边界)。
-  </li>
-</ol>
+<div class="card__body">
 <p>
-  每个 commit message 引用对应 Validation ID 是有意为之: 当某次回归测试 失败时,
-  你能立刻定位"是哪一条 Validation 被破坏了", 而不必从头读 diff。
-</p>
-
+<strong>目标:</strong> 把最小 agent loop 写成可测试 TypeScript 模块。
+    </p>
+<p>
+<strong>场景:</strong> 用户在 REPL 输入 "Hello", 再输入 "Do you remember?",
+      第二次 chat() 收到的 messages 含第一次的 user + assistant。
+    </p>
+<p>
+<strong>模块:</strong> <code>src/history.ts</code> (新) 暴露 <code>createHistory()</code>;
+      <code>src/llm.ts</code> (新) 暴露 <code>createOpenAILLMClient()</code> 和
+      <code>LLMClient</code> 接口; <code>src/agent.ts</code> (新) 暴露
+      <code>createAgent({ llm, history })</code> 和 6 步 loop;
+      <code>src/repl.ts</code> (新) 暴露 <code>startRepl()</code>;
+      <code>src/index.ts</code> (新) Composition Root。
+    </p>
+<p><strong>边界 (LLM 必须遵守的 checklist):</strong></p>
+<ul>
+<li>history 在 <code>createAgent()</code> 闭包内, 不写 module-level 单例</li>
+<li>LLMClient 接口只暴露 <code>chat()</code>, 不暴露 temperature / max_tokens</li>
+<li>agent.ts 不 import SDK, SDK 只在 llm.ts 内部使用</li>
+<li>index.ts 是唯一 new 一切的地方, 业务模块互不 import</li>
+</ul>
+<p><strong>验证 (用 fake LLM + vitest, 逐条断言):</strong></p>
+<ul>
+<li>两轮对话后, 第二次 chat() 收到的 messages 含第一次的 user + assistant</li>
+<li>history 状态在两次 run() 之间累积, 不丢</li>
+<li>没有 tool_call 时, run() 返回 assistant content</li>
+<li>agent.ts 不出现 <code>openai</code> / <code>anthropic</code> 字符串</li>
+</ul>
+</div>
+</div>
 <h2 id="practice">本章练习</h2>
 <ol>
-  <li>
-    按本章 4 条 Validation 断言写测试, 故意把 <code>getMessages()</code> 改回
-    <code>return this.messages;</code>, 验证测试会失败。
+<li>
+    故意把 history 写到 module-level, 跑测试, 看"两轮对话保留上下文" 是否抓到。
   </li>
-  <li>
-    在 <code>agent.ts</code> 里加一个分支: 如果 <code>query</code> 以
-    <code>/echo </code> 开头, 不调 LLM, 直接把剩余部分作为 assistant 写回
-    history。验证多轮 messages 序列仍然正确 (echo 也算一轮)。
+<li>
+    把 LLM SDK 直接 import 到 agent.ts, 跑测试, 看 fake LLM 是否被绕过。
   </li>
-  <li>
-    把 <code>index.ts</code> 故意塞进 <code>if (process.env.DRY_RUN)</code>
-    分支, 看看自己在看代码时能不能立刻识别"这里坏了" (第 00 章红灯 4)。
+<li>
+    让 <code>index.ts</code> 同时 import agent.ts 和 llm.ts 的内部实现,
+    看 composition root 唯一性是否被破坏。
   </li>
 </ol>
-
 <h2 id="summary">本章小结</h2>
 <p>
-  本章按 6 段 Prompt Card 模板兑现了第 00 章的承诺: 7 个文件, 5 件套 +
-  Composition Root + fake LLM, 4 条 messages 顺序断言覆盖了"多轮上下文 真的由
-  History 承担"这件事。读完后, 你能自己写一份能通过全部 Validation 的实现,
-  也能识别 LLM 给的代码"看上去对但其实偷懒"的几种典型手法。
-</p>
-
-<h2 id="next">下一章伏笔</h2>
-<p>
-  现在的 loop 只能聊天。下一章 (第 02 章) 把 tool call 分支接进来: 模型
-  只能提出结构化动作请求, 真正执行动作的是 harness; 工具执行前要过 permission,
-  工具结果要以 tool message 写回 History, 之后 loop 再走 下一轮。读完后, 你的
-  agent 就从"聊天循环"变成"能改代码的 agent"。
+  这一章把最小 agent loop 拆成 5 个模块, 每模块按"用途 → 场景 →
+  设计思想 → 实现细节" 展开, 全部走依赖注入。 3 条不变量:
+  history 在闭包内、LLM 是窄接口、Composition Root 唯一组装。
+  跑通的方式是用 fake LLM 断言 messages 顺序, 而不是断言最终回复文本。
+  下一章 (第 02 章) 给 loop 加上一双"手" — 工具调用。
 </p>
+</content>
\ No newline at end of file
diff --git a/tutorial/chapters/02-tools.html b/tutorial/chapters/02-tools.html
index d903b25..5cd9770 100644
--- a/tutorial/chapters/02-tools.html
+++ b/tutorial/chapters/02-tools.html
@@ -1,624 +1,876 @@
-<p class="article__eyebrow">第 02 章 · 让模型提出动作</p>
-<h1 class="article__title">给 Agent 一双手: 工具调用</h1>
+<p class="article__eyebrow">第 02 章 · 给 Agent 一双手</p>
+<h1 class="article__title">工具调用: 让 LLM 真的能改世界</h1>
 <p class="article__lede">
-  第 01 章的 loop 只能聊天。这一章把 tool call 接进 loop: 模型只提出
-  结构化的动作请求, 真正去执行动作的是 harness。这一步是 coding agent 与普通聊天
-  agent 的分水岭: 模型从此可以读文件、跑命令、改代码。
+  第 01 章的 loop 跑通了两轮对话保留上下文, 但 LLM 还是只输出文本 —
+  它没法真的改文件、跑命令。 这一章给 loop 加上一双"手": 工具调用。
+  读完后, 你能区分 3 种工具调用协议 (Anthropic / OpenAI / Google),
+  知道 ToolResult 怎么设计, 并能用 fake tool 写一个"先读 README 再回答"
+  的最小集成测试。
 </p>
-
-<nav id="article-inline-toc" class="article__meta" aria-label="页内小节"></nav>
-
-<hr class="rule" />
-
-<h2 id="delta-from-01">在第 01 章基础上改了什么</h2>
-<p>
-  这一章把"LLM 返回文本"扩展为"LLM 可能返回 tool_calls"。loop 主结构不变,
-  只在"调 LLM"和"写 assistant"之间插入"如果 assistant 携带 tool_calls,
-  就执行工具、把 tool result 写回 history, 再回到 loop 头部"这条新分支。
-  对应到代码, 改动集中在 4 个文件: <code>src/llm.ts</code>、
-  <code>src/tools/registry.ts</code>、<code>src/agent.ts</code>、
-  <code>src/history.ts</code>。其他文件不动。
+<nav aria-label="页内小节" class="article__meta" id="article-inline-toc"></nav>
+<hr class="rule"/>
+<h2 id="why-tools">为什么需要工具调用</h2>
+<p>
+  写代码之前, 先讲为什么 LLM 必须能调工具, 不用会怎样。
+  </p>
+<p>
+  LLM 本质是个"下一个 token 预测器"。 给它一段对话, 它能续写;
+  但让它"真的去看 README 里写了什么" 或者"真的跑 <code>ls</code>",
+  它做不到 — 模型没有文件系统、没有 shell、没有网络。 即使你用
+  "系统提示" 告诉它"假装你能 ls", 它也只能瞎编。
+  </p>
+<p>
+  <strong>工具调用就是让 LLM 输出一个结构化指令 (tool_call),
+  由 harness 执行, 再把结果 (tool_result) 喂回 LLM</strong>。
+  </p>
+<p>
+  这条"输出指令 → 执行 → 喂结果" 的循环, 让 LLM 第一次有了"改世界"
+  的能力。 也是 coding agent 和普通 chatbot 的分水岭。 没有工具
+  调用的 agent, 只是 chat 套壳; 有工具调用的 agent, 才开始谈得上
+  "完成用户任务"。
+  </p>
+<p>
+  本章要解决的 3 个真实问题:
 </p>
-<div class="source-links" aria-label="本章 GitHub 永久链接">
-  <a
-    class="source-link"
-    href="https://github.com/pingp76/swoopcode/blob/main/src/llm.ts"
-    target="_blank"
-    rel="noreferrer"
-    >1. src/llm.ts: LLM 响应多一个 tool_calls 字段</a
-  >
-  <a
-    class="source-link"
-    href="https://github.com/pingp76/swoopcode/blob/main/src/tools/registry.ts"
-    target="_blank"
-    rel="noreferrer"
-    >2. src/tools/registry.ts: 工具注册表 (新文件)</a
-  >
-  <a
-    class="source-link"
-    href="https://github.com/pingp76/swoopcode/blob/main/src/agent.ts"
-    target="_blank"
-    rel="noreferrer"
-    >3. src/agent.ts: 主体循环加 tool call 分支</a
-  >
-  <a
-    class="source-link"
-    href="https://github.com/pingp76/swoopcode/blob/main/src/history.ts"
-    target="_blank"
-    rel="noreferrer"
-    >4. src/history.ts: 新增 tool message 写入</a
-  >
-</div>
-
-<h2 id="author-thinking">作者怎么想的: 这一章的思考链</h2>
-<dl class="defs">
-  <dt>想清楚现象</dt>
-  <dd>
-    用户问"项目里主循环怎么写", 01 章的 agent 只能回"我看不到你的项目"。 真正的
-    coding agent 必须先读文件, 再把文件内容拼到下一轮 LLM 请求里。 现象是"agent
-    需要在 loop 内中途插入一次外部世界访问"。
-  </dd>
-  <dt>想反例</dt>
-  <dd>
-    最朴素的反例是"让模型输出 shell 命令, harness 用 <code>exec()</code> 跑"。
-    这有两个致命问题: 一是模型生成的文本和 shell 语法混在一起, harness
-    无法稳定判断哪些命令安全; 二是工具结果伪装成 user message, 多轮
-    之后模型搞不清"上一轮到底执行了什么"。
-  </dd>
-  <dt>想接口和不变量</dt>
-  <dd>
-    接口: <code>interface Tool { name, schema, execute(args) }</code>;
-    工具注册表暴露 <code>register(tool)</code> 和 <code>get(name)</code>。
-    不变量三条: (1) tool call 与 tool result 必须配对出现, (2) tool 名称稳定,
-    不会因 LLM 自由发挥改名, (3) tool result 写入 history 时 role 必须是
-    <code>"tool"</code>, 不能伪装成 user。
-  </dd>
-  <dt>想怎么验证</dt>
-  <dd>
-    fake LLM 预设返回
-    <code
-      >{ role: "assistant", content: "", tool_calls: [{ id: "1", name:
-      "read_file", args: {path:"a.ts"} }] }</code
-    >, 跑完一轮后断言 history 里出现 role 为 <code>"tool"</code> 的消息, 且
-    <code>tool_call_id</code> 与 assistant 的 tool_calls.id 一一对应。
-  </dd>
-</dl>
-
-<h2 id="observe-first">先观察: 两段故意有气味的实现</h2>
-
-<div class="note">
-  <p class="note__title">观察 1 · 把工具结果伪装成 user 消息</p>
-  <pre class="code-block"><code>// 教学简化版, 仓库真实实现见 GitHub 永久链接
-const answer = await llm.chat(messages);
-const command = extractShell(answer.content);
-const output = await exec(command);
-messages.push({ role: "user", content: output });</code></pre>
-  <p><strong>问:</strong>看上去比 tool call 简洁, 为什么仍然不行?</p>
-  <p>
-    <strong>答:</strong>三件事同时坏掉 —— 协议层: LLM 协议要求 tool_call 与 tool
-    result 配对, 伪装成 user 之后, 后续 normalize / compress / replay
-    全失事实边界; 安全层: harness 拿不到 tool name, 没法做 permission 检查,
-    危险命令全靠肉眼拦; 体验层: 模型下一次被告知 "刚才执行了 X",
-    它无法判断这是"自己请求的工具"还是"用户塞进来的 文本", 推理时会犹豫。
+<ol>
+<li>
+<strong>协议对齐</strong>: 3 家 LLM (Anthropic / OpenAI / Google) 的
+    tool call 协议不同, harness 怎么选一个统一内部表示?
+    </li>
+<li>
+<strong>工具结果怎么表达</strong>: 工具执行完返回什么给 LLM? 字符串
+    够不够, 还是要支持图片 / 文件?
+    </li>
+<li>
+<strong>工具多了怎么管</strong>: 1 个工具写死, 5 个工具还能写死,
+    30 个工具不写个注册表就乱。
+    </li>
+</ol>
+<h2 id="protocol-divergence">3 种工具调用协议</h2>
+<p>
+  朴素想法: "OpenAI / Anthropic / Google 都有 tool call, 应该一样吧?"
+  实际上 3 家协议有 3 个真实差异, adapter 必须收口。 这不是"语法不同"
+  那么轻, 整个 messages 序列结构都不同。
   </p>
-</div>
-
-<div class="note">
-  <p class="note__title">观察 2 · 工具表用 if/else 分发</p>
-  <pre class="code-block"><code>// 教学简化版
-if (toolName === "read_file") return readFile(args);
-if (toolName === "bash") return runBash(args);
-if (toolName === "edit") return applyEdit(args);
-// ...</code></pre>
-  <p><strong>问:</strong>为什么不直接写 if/else, 而要单独搞一个 registry?</p>
-  <p>
-    <strong>答:</strong>if/else 在小规模能跑, 一旦工具数到 10+ 就出现 三个问题:
-    (1) 没法在工具描述里统一暴露给 LLM, (2) 测试时无法 mock 单个工具, (3)
-    第三方工具 (例如 MCP server) 没有挂载点。 registry
-    把"工具是什么"和"工具怎么用"解耦, 后续 skill / subagent / permission
-    都要复用它。
+<table class="terms">
+<thead>
+<tr>
+<th>维度</th>
+<th>Anthropic</th>
+<th>OpenAI</th>
+<th>Google</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>工具调用字段</td>
+<td><code>tool_use</code> 块</td>
+<td><code>tool_calls</code> 数组</td>
+<td><code>functionCall</code> 块</td>
+</tr>
+<tr>
+<td>content 形状</td>
+<td>数组 (text + tool_use)</td>
+<td>字符串 + tool_calls 并列</td>
+<td>数组 (text + functionCall)</td>
+</tr>
+<tr>
+<td>工具结果回传</td>
+<td>user message + <code>tool_result</code> 块</td>
+<td>tool message + <code>tool_call_id</code></td>
+<td>user message + <code>functionResponse</code> 块</td>
+</tr>
+</tbody>
+</table>
+<p>
+  字段名不同只是表面, 真正难的是<strong>history 序列</strong>不同:
+</p>
+<ul>
+<li>Anthropic: assistant 输出 <code>tool_use</code>, 下一条必须是 user 消息里的 <code>tool_result</code> 块</li>
+<li>OpenAI: assistant 输出 <code>tool_calls</code>, 下一条必须是 role:"tool" 的消息, 用 <code>tool_call_id</code> 配对</li>
+<li>Google: assistant 输出 <code>functionCall</code>, 下一条必须是 user 消息里的 <code>functionResponse</code> 块</li>
+</ul>
+<p>
+  注意区别: Anthropic 和 Google 都把 tool result 放在 user message 里
+  (用 content block 类型区分), OpenAI 单独有一个 <code>role: "tool"</code>。
+  选哪个? OpenAI 的最简单 (独立 role, 不用 content block 类型判断),
+  我们就用 OpenAI 风格的统一 history 形状, 未来接 Anthropic / Google
+  时在 adapter 层翻译 (专题 A 展开)。
   </p>
-</div>
-
-<h2 id="interfaces">接口形状: 在写实现前钉死</h2>
-<pre class="code-block"><code>// 教学简化版, 仓库真实实现见 GitHub 永久链接
-export interface ToolCall {
-  id: string;
-  name: string;
-  args: Record&lt;string, unknown&gt;;
-}
-
-export interface ToolResult {
-  toolCallId: string;
-  content: string;
-  error?: string;
-}
-
-export interface Tool {
-  name: string;
-  description: string;
-  schema: JSONSchema;  // 给 LLM 看的参数形状
-  execute(args: Record&lt;string, unknown&gt;): Promise&lt;ToolResult&gt;;
-}
-
-export interface ToolRegistry {
-  register(tool: Tool): void;
-  get(name: string): Tool | undefined;
-  list(): Tool[];
+<h2 id="tool-call-message-shape">OpenAI 风格的 message 形状</h2>
+<p>
+  选 OpenAI 风格后, harness 内部有 3 种 message role:
+</p>
+<pre class="code-block"><code>// 1. 用户输入
+{ role: "user", content: "What's the test command?" }
+
+// 2. assistant 调工具 (一条消息, content 可能为 null)
+{
+  role: "assistant",
+  content: null,
+  tool_calls: [
+    {
+      id: "call_abc123",                      // 配对 ID
+      type: "function",
+      function: {
+        name: "run_read",
+        arguments: '{"path": "package.json"}',  // 字符串, 不是对象
+      },
+    },
+  ],
 }
 
-// LLM 响应增加 tool_calls 字段
-export interface AssistantMessage {
-  role: "assistant";
+// 3. 工具结果 (role: "tool", 用 tool_call_id 配对上面的调用)
+{
+  role: "tool",
+  tool_call_id: "call_abc123",
+  content: "{ \"scripts\": { \"test\": \"npm test\" } }",
+}</code></pre>
+<p class="source-link"><a href="https://github.com/pingp76/swoopcode/blob/main/src/llm-adapter.ts#L1" rel="noreferrer" target="_blank">GitHub · src/llm-adapter.ts OpenAI 风格 message 形状 (L1)</a></p>
+<p>
+  3 个细节决定后面写代码时少踩坑:
+</p>
+<ol>
+<li>
+<strong>assistant 消息的 content 可以是 null</strong>: 调工具时
+    模型可能"只调不解释", content 就是 null。 LLM SDK 接受
+    <code>content: null</code>, 不用填空字符串。
+  </li>
+<li>
+<strong>tool_calls[].function.arguments 是字符串</strong>:
+    字符串, 不是对象。 调 executor 前要 <code>JSON.parse</code>。
+    模型有时候会给出非法 JSON (比如忘了引号), 解析失败要写
+    error tool message, 不能让 loop 崩。
+  </li>
+<li>
+<strong>tool message 必须配对 tool_call_id</strong>: 多工具并发
+    时 (<code>run_bash</code> 和 <code>run_read</code> 同时调),
+    每条 tool message 必须用对应的 id 配对, 不然模型不知道
+    哪个结果对应哪个调用。
+  </li>
+</ol>
+<h2 id="naive">朴素反例: 工具逻辑写死在 agent.ts</h2>
+<p>
+  写 agent 最朴素的做法: 把所有工具的 if/else 塞进 <code>agent.ts</code>。
+  5 个工具能撑, 10 个工具开始肿, 30 个工具变成大泥球。
+  </p>
+<pre class="code-block"><code>// ❌ 反例: 5 个工具 5 个 if
+async function handleToolCall(call) {
+  if (call.name === "run_read") {
+    return await fs.readFile(call.args.path, "utf8");
+  }
+  if (call.name === "run_write") {
+    await fs.writeFile(call.args.path, call.args.content);
+    return "OK";
+  }
+  if (call.name === "run_bash") {
+    const { stdout } = await exec(call.args.command);
+    return stdout;
+  }
+  if (call.name === "run_edit") {
+    // ... 50 行编辑逻辑
+  }
+  if (call.name === "run_todo_create") {
+    // ... 10 行 todo 逻辑
+  }
+  // 新加 run_web_fetch 又要加 if
+}</code></pre>
+<p>
+  5 件事立刻坏掉:
+</p>
+<ol>
+<li>
+<strong>agent.ts 变成大泥球</strong>: 5 个工具 100+ 行, 10 个工具 200+ 行。
+  </li>
+<li>
+<strong>测试没法隔离</strong>: 想测"agent 调 run_read 后能不能回答",
+    必须真的创建文件, 跑 read。 不能用 fake tool。
+  </li>
+<li>
+<strong>权限难加</strong>: 想在 run_write 前问用户, 只能改 agent.ts,
+    工具作者根本不知道有权限层。
+  </li>
+<li>
+<strong>元信息丢失</strong>: 工具返回的 metadata (执行耗时、字节数) 没地方放。
+  </li>
+<li>
+<strong>工具作者互相耦合</strong>: 改 run_edit 的实现可能影响 run_bash 的逻辑。
+  </li>
+</ol>
+<p>
+  解决方式: <code>ToolRegistry</code> 注册表, 把"调用哪个工具" 和
+  "工具怎么实现" 隔开。
+  </p>
+<h2 id="toolresult-design">ToolResult 接口 — 工具返回什么</h2>
+<p>
+  <strong>用途</strong>: 工具执行完, harness 要把结果喂回 LLM。 这个结果
+  怎么表达, 决定了 LLM 能不能正确解读。 我们用统一的
+  <code>ToolResult</code> 接口。
+  </p>
+<p>
+  <strong>真实场景</strong>: 工具作者写 <code>run_bash("ls")</code>, 返回
+  "README.md\nsrc\n", 这个字符串 LLM 能直接看; 工具作者写
+  <code>run_web_fetch("https://...")</code>, 返回 HTML + 图片,
+  纯字符串放不下, 需要 attachments。 统一接口覆盖两种情况。
+  </p>
+<p>
+  <strong>设计思想</strong>: 4 个字段刚好覆盖所有情况 — content 必有
+  (LLM 看的文本), attachments 可选 (多模态), error 标记业务错误
+  (区别 throw), metadata 内部信息 (不进 LLM, 只进 transcript)。
+  不多不少, 多一个字段就多一个"会不会被滥用" 的地方。
+  </p>
+<p>
+  <strong>实现细节</strong>:
+</p>
+<pre class="code-block"><code>interface ToolResult {
+  // 文本结果, 大多数工具用它
   content: string;
-  tool_calls?: ToolCall[];
+  // 可选: 输出图片 / 文件 / 结构化数据
+  attachments?: Array&lt;{
+    kind: "image" | "file" | "json";
+    path?: string;
+    data?: string;       // base64 或 JSON 字符串
+    mimeType?: string;
+  }&gt;;
+  // 错误标记: 工具正常完成但没拿到结果 (vs throw 是 harness 错误)
+  error?: boolean;
+  // 工具内部补充信息 (不进 LLM messages, 只进 transcript)
+  metadata?: Record&lt;string, unknown&gt;;
 }</code></pre>
-
-<h2 id="delta-loop">loop 路径: 与第 01 章的关系</h2>
+<p class="source-link"><a href="https://github.com/pingp76/swoopcode/blob/main/src/tools/types.ts#L1" rel="noreferrer" target="_blank">GitHub · src/tools/types.ts ToolResult 接口 (L1)</a></p>
 <p>
-  第 01 章的 loop 是"调 LLM → 写 assistant → 返回"。这一章把它扩展成 "调 LLM →
-  如果有 tool_calls 就执行工具 → 写 tool result → 回到头部再 调 LLM"。如果
-  assistant 没有 tool_calls, 行为退化为第 01 章。
+  4 个字段的边界:
 </p>
-<pre class="code-block"><code>// 教学简化版, 仓库真实实现见 GitHub 永久链接
-async function run(query: string): Promise&lt;string&gt; {
-  history.add({ role: "user", content: query });
-
-  for (;;) {
-    const messages = history.getMessages();
-    const assistant = await llm.chat(messages);
-    history.add(assistant);
-
-    // 第 02 章新增: 如果 assistant 携带 tool_calls, 执行并继续
-    if (!assistant.tool_calls || assistant.tool_calls.length === 0) {
-      return assistant.content;
-    }
+<ol>
+<li>
+<strong><code>content</code> 必有</strong>: LLM 看到的"工具说了什么"。
+    即使工具有 attachments, 也要有一个 content 描述
+    "我找到了这张图, 在 attachments 里"。
+  </li>
+<li>
+<strong><code>attachments</code> 可选</strong>: 工具产物, 比如
+    <code>run_bash</code> 写出的截图, <code>run_read</code> 读到的图片。
+    LLM 收到时知道这是附件, 不是普通文本。 当前多数工具链
+    只用 content, attachments 是为多模态工具 (第 09 章 memory 多模态)
+    留的扩展点。
+  </li>
+<li>
+<strong><code>error: true</code> ≠ throw</strong>: 工具正常执行完,
+    但返回了"命令不存在" 这种业务错误, 标 <code>error: true</code>。
+    throw 留给 harness 内部错误 (工具超时、OOM), 不是工具错误;
+    权限拒绝也不 throw, 而是写一条 tool message。 这几类失败要分开处理 — 见下一节。
+  </li>
+<li>
+<strong><code>metadata</code> 不进 LLM</strong>: 工具的内部信息
+    (执行耗时、字节数), 只进 transcript 调试, 不进 messages。
+    避免 LLM 被内部细节污染 — LLM 不需要知道"我跑了 200ms",
+    只需要"命令输出是 X"。
+  </li>
+</ol>
+<h2 id="tool-failure-modes">工具失败的 3 种模式</h2>
+<p>
+  <strong>用途</strong>: 工具调用的"失败" 有 3 种, 处理方式完全不同。
+  这是新同学最容易搞混的地方, 必须单独讲。
+  </p>
+<p>
+  <strong>真实场景</strong>: 跑一个 agent 调 3 个工具: <code>run_read</code>
+  成功, <code>run_bash</code> 因为命令不存在失败, <code>run_write</code>
+  因为权限被拒未执行。 3 个调用, 3 种结局 (1 个成功, 2 个失败), 处理方式各不相同, 但 harness 必须一致地
+  把失败信息喂回 LLM, 不能假装成功。
+  </p>
+<table class="terms">
+<thead>
+<tr>
+<th>模式</th>
+<th>含义</th>
+<th>示例</th>
+<th>处理</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>业务错误</strong></td>
+<td>工具正常完成, 但结果表示"做错了"</td>
+<td><code>run_bash("cat nonexistent")</code> 返回 exit code 1</td>
+<td><code>result.error = true</code>, 写 tool message, LLM 自己决定</td>
+</tr>
+<tr>
+<td><strong>权限拒绝</strong></td>
+<td>工具被 permission 拦了</td>
+<td>用户拒绝 run_write</td>
+<td>写 tool message "<code>Permission denied</code>", 不执行, LLM 知道</td>
+</tr>
+<tr>
+<td><strong>异常抛出</strong></td>
+<td>harness 内部错误, 工具没正常返回</td>
+<td>工具超时 / OOM / panic</td>
+<td>recovery 决定 (第 11 章), loop 不该崩</td>
+</tr>
+</tbody>
+</table>
+<p>
+  ❌ / ✅ 对比: 把业务错误当 throw, 还是用 <code>error: true</code> 返回。
+</p>
+<pre class="code-block"><code>// ❌ 把业务错误当 throw
+async function runRead(args) {
+  const content = await fs.readFile(args.path, "utf8");
+  if (!content) throw new Error("empty file");   // LLM 看不到这个, messages 断裂
+  return { content };
+}
 
-    for (const call of assistant.tool_calls) {
-      const tool = registry.get(call.name);
-      if (!tool) {
-        history.add({ role: "tool", tool_call_id: call.id,
-                       content: `Error: Unknown tool "${call.name}"` });
-        continue;
-      }
-      const result = await tool.execute(call.args);
-      history.add({ role: "tool", tool_call_id: call.id, content: result.content });
-    }
-    // 继续 loop, 让 LLM 看到 tool result
+// ✅ 业务错误用 error: true
+async function runRead(args) {
+  try {
+    const content = await fs.readFile(args.path, "utf8");
+    return { content };
+  } catch (e) {
+    return { content: `Error: ${e.message}`, error: true };
   }
 }</code></pre>
+<p class="source-link"><a href="https://github.com/pingp76/swoopcode/blob/main/src/tools/run-read.ts#L1" rel="noreferrer" target="_blank">GitHub · src/tools/run-read.ts 业务错误 vs throw 模板 (L1)</a></p>
+<p>
+  业务错误用 <code>error: true</code>, LLM 收到后能继续推理:
+  "啊, 文件不存在, 我换个路径试试"。 异常抛出时, LLM 看不到任何信息,
+  messages 序列还可能断, 整个 loop 崩。
+  </p>
+<p>
+  为什么 throw 不行? 假设 LLM 调了 2 个工具: <code>run_read</code> 和
+  <code>run_bash</code>。 <code>run_read</code> 抛 throw,
+  <code>run_bash</code> 还没执行。 messages 序列里:
+</p>
+<ol>
+<li>user (原 query)</li>
+<li>assistant (调 run_read + run_bash)</li>
+<li>工具: throw 退出, 没有 tool message</li>
+</ol>
+<p>
+  下一次 chat() 时, LLM 看到一个 assistant 调了 2 个工具, 但只
+  收到 0 个 tool message, 模型困惑。 messages 序列不完整, OpenAI
+  API 甚至可能直接报 400 错误。
+  </p>
+<h2 id="tool-registry">Tool Registry — 工具怎么注册</h2>
+<p>
+  <strong>用途</strong>: 工具数量从 1 涨到 5 之后, 把它们写死在 <code>agent.ts</code>
+  里就难维护。 引入 <code>ToolRegistry</code> 注册表, 把"注册 / 查询 /
+  调用" 分离。
+  </p>
+<p>
+  <strong>真实场景</strong>: harness 跑久了, 工具数从 5 涨到 30+。 注册表让
+  工具作者写 <code>register(name, def, executor)</code> 一行就接入,
+  不需要改 agent.ts 主循环。
+  </p>
 <p>
-  注意三点: (1) tool message 永远带 <code>tool_call_id</code>, 协议要求它 与对应
-  assistant.tool_calls[].id 配对; (2) 即使工具抛错, 也要写一条 tool message,
-  不能让 messages 在协议层断裂; (3) loop 终止条件是 "assistant 不再携带
-  tool_calls", 而不是"工具全部成功"。
+  <strong>设计思想</strong>: 经典<strong>注册表模式</strong> — 把"按名字查找
+  对象" 抽成一个数据结构, 外界通过 <code>register()</code> 和
+  <code>get()</code> 交互。 翻译 (tool def → LLM ChatCompletionTool)
+  藏在 registry 内部, 业务模块不感知。 详细模式解释见 Reference
+  章节的"模式 5 · 注册表模式"。
+  </p>
+<p>
+  <strong>实现细节</strong>:
 </p>
+<pre class="code-block"><code>interface ToolRegistry {
+  register(name: string, def: ToolDefinition, executor: ToolExecutor): void;
+  get(name: string): ToolDefinition | undefined;
+  list(): ToolDefinition[];
+  getToolDefinitions(): ChatCompletionTool[];   // 转成 OpenAI 格式
+  invoke(name: string, args: unknown): Promise&lt;ToolResult&gt;;
+}
 
-<h2 id="registry">工具注册表: 把"工具是什么"和"工具怎么用"解耦</h2>
-<pre class="code-block"><code>// 教学简化版, 仓库真实实现见 GitHub 永久链接
-export function createToolRegistry(): ToolRegistry {
-  const tools = new Map&lt;string, Tool&gt;();
+type ToolDefinition = {
+  name: string;
+  description: string;       // 给 LLM 看, 决定何时调
+  parameters: JSONSchema;     // JSON Schema 描述参数
+};
+
+type ToolExecutor = (args: any, ctx: ToolContext) =&gt; Promise&lt;ToolResult&gt;;</code></pre>
+<p class="source-link"><a href="https://github.com/pingp76/swoopcode/blob/main/src/tools/registry.ts#L1" rel="noreferrer" target="_blank">GitHub · src/tools/registry.ts ToolRegistry 完整实现 (L1)</a></p>
+<p>
+  Registry 的 4 个职责:
+</p>
+<ol>
+<li>
+<strong>注册</strong>: 工具通过 <code>register(name, def, executor)</code> 接入。
+  </li>
+<li>
+<strong>查询</strong>: <code>getToolDefinitions()</code> 转成 OpenAI
+    <code>ChatCompletionTool[]</code>, 喂给 LLM。
+  </li>
+<li>
+<strong>执行</strong>: <code>invoke(name, args)</code> 找到 executor, 调用,
+    返回 ToolResult。
+  </li>
+<li>
+<strong>权限检查 (第 07 章展开)</strong>: invoke 之前
+    调 permissionManager, 拒绝时返回 "Permission denied" 而不执行。
+  </li>
+</ol>
+<p>
+  为什么 registry 不直接暴露给 LLM?
+  因为 LLM 看到的"工具列表" 必须稳定, 不能按 mode 动态增减 —
+  会破坏 prompt cache (第 10 章)。 工具的"激活 / 停用" 由 Skill 系统管,
+  不在 registry。
+  </p>
+<h2 id="loop-integration">loop 接入: 工具调用是第 5 步</h2>
+<p>
+  <strong>用途</strong>: 把第 01 章的 6 步 loop 补上第 5 步, 让 LLM 调的工具真的
+  被执行。 这是 loop 从"聊天" 升级为"agent" 的关键一步。
+  </p>
+<p>
+  <strong>真实场景</strong>: LLM 返回 <code>tool_calls: [{ id: "call_abc123", type: "function", function: { name: "run_read", arguments: '{"path": "package.json"}' } }]</code>,
+  harness 找到 run_read 的 executor, 调它, 拿 ToolResult,
+  写回 history, 回到 loop 顶部让 LLM 看到结果。
+  </p>
+<p>
+  <strong>设计思想</strong>: 5 个关键细节决定"工具调用是不是真稳":
+</p>
+<ol>
+<li>
+<strong>工具参数是 JSON 字符串</strong>: <code>call.function.arguments</code>
+    是字符串, 调 <code>invoke</code> 前 <code>JSON.parse</code>。 解析失败要
+    写 error tool message, 不能 throw 让 loop 崩。
+  </li>
+<li>
+<strong>每个 tool_call 都写一条 tool message</strong>: 即使工具
+    抛错, 也要写 <code>role: "tool"</code> 消息告诉 LLM "这个调用失败了" 。
+    不写会让 messages 序列断裂, LLM 收到错位消息。
+  </li>
+<li>
+<strong>try/catch 包住 JSON.parse 和 invoke</strong>: 不让 throw 逃出 loop。
+    业务错误用 <code>error: true</code> 表达, harness 错误用 recovery
+    (第 11 章) 处理。
+  </li>
+<li>
+<strong>load_skill 特殊处理</strong>: 第 05 章展开 — load_skill
+    激活 skill 而不执行, 激活后写 tool message。
+  </li>
+<li>
+<strong>工具是并行还是串行?</strong> 第一版用串行, 简单可调试。
+    第 13 章再讲并行 / 异步。
+  </li>
+</ol>
+<p>
+  <strong>实现细节</strong>:
+</p>
+<pre class="code-block"><code>export function createAgent(deps: { llm: LLMClient; history: History; tools: ToolRegistry }): Agent {
   return {
-    register(tool) { tools.set(tool.name, tool); },
-    get(name) { return tools.get(name); },
-    list() { return Array.from(tools.values()); },
+    async run(query: string): Promise&lt;string&gt; {
+      deps.history.add({ role: "user", content: query });
+
+      for (;;) {
+        const messages = deps.history.getMessages();
+        const toolDefs = deps.tools.getToolDefinitions();
+        const response = await deps.llm.chat({ messages, tools: toolDefs });
+        deps.history.add({
+          role: "assistant",
+          content: response.content,
+          tool_calls: response.toolCalls,
+        });
+        if (response.toolCalls.length === 0) {
+          return response.content ?? "";
+        }
+
+        // 5. 执行工具调用
+        for (const call of response.toolCalls) {
+          let result: ToolResult;
+          try {
+            const args = JSON.parse(call.function.arguments);
+            result = await deps.tools.invoke(call.function.name, args);
+          } catch (e) {
+            // JSON 解析失败也算业务错误, 不让 loop 崩
+            result = { content: `Tool error: ${e.message}`, error: true };
+          }
+          deps.history.add({
+            role: "tool",
+            tool_call_id: call.id,
+            content: result.content,
+          });
+        }
+        // 6. 回到 loop 顶部
+      }
+    },
   };
-}
+}</code></pre>
+<p class="source-link"><a href="https://github.com/pingp76/swoopcode/blob/main/src/agent.ts#L1" rel="noreferrer" target="_blank">GitHub · src/agent.ts loop 接入工具调用 (L1)</a></p>
+<h2 id="description-patterns">tool description 怎么写 LLM 才用对</h2>
+<p>
+  <strong>用途</strong>: 工具被 LLM 调对调错, 80% 取决于 description 怎么写。
+  这一节总结几个常见 pattern, 帮工具作者写好 description。
+  </p>
+<p>
+  <strong>真实场景</strong>: 工具作者写 <code>description: "Read a file"</code>,
+  LLM 看到不知道何时调; 改成 "读取文件内容, 需要查看文件时使用, 不要
+  用 cat 命令代替", LLM 立刻知道用这个工具。
+  </p>
+<p>
+  <strong>设计思想</strong>: description = "做什么 + 何时用 + 不要做什么 +
+  输出格式"。 4 个要素缺一不可。
+  </p>
+<p>
+  <strong>实现细节 (3 个 pattern)</strong>:
+</p>
+<p>
+  <strong>Pattern 1: 动词 + 对象 + 何时用</strong>
+</p>
+<pre class="code-block"><code>// ❌ 抽象
+description: "Read tool"
 
-// 注册一个 read_file 工具
-registry.register({
-  name: "read_file",
-  description: "读取项目内文件的文本内容",
-  schema: {
-    type: "object",
-    properties: { path: { type: "string" } },
-    required: ["path"],
-  },
-  async execute(args) {
-    const path = String(args["path"]);
-    const content = await fs.readFile(path, "utf8");
-    return { toolCallId: "", content };
-  },
-});</code></pre>
+// ✅ 具体
+description: "读取文件内容。需要查看文件内容时使用, 不要用 cat 命令代替。"</code></pre>
+<p class="source-link"><a href="https://github.com/pingp76/swoopcode/blob/main/src/tools/run-read.ts#L1" rel="noreferrer" target="_blank">GitHub · src/tools/run-read.ts description Pattern 1 (L1)</a></p>
 <p>
-  这一段把工具的"身份证"集中起来: name 给 LLM 看, schema 给 LLM 校验 参数,
-  execute 给 harness 真正执行。第 07 章 permission 会复用 schema
-  做"参数级权限判断", 第 10 章 prompt cache 会复用 name + description
-  做"工具描述前缀稳定性"。这些都是为什么 registry 不是 if/else 的原因。
+  <strong>Pattern 2: 反例 (告诉 LLM 不要做什么)</strong>
 </p>
+<pre class="code-block"><code>// ❌ 没说清边界
+description: "Search the web"
 
-<h2 id="llm-adapter">LLM 适配: 不同模型的 tool call 协议不一样</h2>
+// ✅ 明确边界
+description: "使用搜索引擎查询实时信息。仅当本地知识不够用时调用, 不要用它查 package 文档 (用 run_read)。"</code></pre>
 <p>
-  OpenAI / Anthropic / Google 的 tool call 字段名、参数序列化方式、 tool result
-  写入约定都不完全一致。这一层细节全部收敛在
-  <code>src/llm-adapter.ts</code> 里 (你会在仓库里看到这个文件)。 本教程假设
-  <code>createLlm()</code> 内部已经统一好了, 暴露给 agent 的就是上面的
-  <code>AssistantMessage</code> 形态。
+  <strong>Pattern 3: 输出格式</strong>
 </p>
+<pre class="code-block"><code>// ❌ 输出格式没说
+description: "Run a shell command"
 
+// ✅ 说了
+description: "执行 shell 命令, 返回 stdout (最多 5000 字符) 和 exit code。 长输出会被截断, 需要时用 grep 过滤。"</code></pre>
+<p class="source-link"><a href="https://github.com/pingp76/swoopcode/blob/main/src/tools/run-bash.ts#L1" rel="noreferrer" target="_blank">GitHub · src/tools/run-bash.ts description Pattern 3 (L1)</a></p>
+<h2 id="fake-tool-test">fake tool 测试: 验证不是偶然跑通</h2>
+<p>
+  <strong>用途</strong>: 真实工具 (<code>run_bash</code>) 是 IO 密集型, 测试时不希望
+  真跑命令。 fake tool 用"预设 response" 模拟工具行为。
+  </p>
+<p>
+  <strong>设计思想</strong>: 跟 fake LLM 一个套路 — fake tool 实现
+  <code>ToolExecutor</code> 接口, 预设 response, 行为完全可控。 fake tool
+  的额外好处: 测<strong>工具错误的传播</strong>。 写一个返回
+  <code>{ content: "Error: file not found", error: true }</code>
+  的 fake tool, 验证 LLM 收到错误后能调整策略。
+  </p>
+<p>
+  <strong>实现细节</strong>:
+</p>
+<pre class="code-block"><code>test("agent.run 调 run_read 后能基于结果回答", async () =&gt; {
+  const fakeTools = createToolRegistry();
+  fakeTools.register(
+    "run_read",
+    { name: "run_read", description: "Read a file", parameters: {} },
+    async (args) =&gt; ({ content: "package.json says test = npm test" }),
+  );
+
+  const fakeLLM = createFakeLLM([
+    // 第一次: 模型决定调 run_read
+    {
+      content: null,
+      toolCalls: [{ id: "c1", function: { name: "run_read", arguments: '{"path":"package.json"}' } }],
+      finishReason: "tool_calls",
+    },
+    // 第二次: 基于工具结果回答
+    { content: "Test command: npm test", toolCalls: [], finishReason: "stop" },
+  ]);
+
+  const agent = createAgent({ llm: fakeLLM, history: createHistory(), tools: fakeTools });
+  const reply = await agent.run("What test command does this project use?");
+
+  expect(reply).toBe("Test command: npm test");
+  // 验证 history 累积了 user + assistant(tool_call) + tool + assistant
+  const msgs = fakeLLM.lastCapturedMessages();
+  expect(msgs).toContainEqual(expect.objectContaining({ role: "tool", tool_call_id: "c1" }));
+});</code></pre>
+<p class="source-link"><a href="https://github.com/pingp76/swoopcode/blob/main/src/agent.test.ts#L1" rel="noreferrer" target="_blank">GitHub · src/agent.test.ts fake tool 测试 (L1)</a></p>
+<p>
+  这条测试验证 3 件事: (1) 工具被调了, (2) 工具结果写进了 history,
+  (3) LLM 第二轮能看到工具结果。 任何一件坏, 测试挂。
+  </p>
+<h2 id="loop-with-tool">带工具的 loop 数据流</h2>
+<figure class="figure">
+<div class="flow-map" role="img" aria-label="带工具的 loop 数据流">
+<div class="flow-row--center">
+<span class="flow-node">用户输入</span>
+<span class="flow-arrow">→</span>
+<span class="flow-node flow-node--accent">history.add(user)</span>
+</div>
+<div class="flow-row--center" style="margin-top: var(--space-3)">
+<span class="flow-node">history.getMessages()</span>
+<span class="flow-arrow">→</span>
+<span class="flow-node flow-node--accent">llm.chat(messages, tools)</span>
+<span class="flow-arrow">→</span>
+<span class="flow-node">assistant<br/><small>含 tool_calls</small></span>
+</div>
+<div class="flow-row--center" style="margin-top: var(--space-3)">
+<span class="flow-node">for each call<br/>tools.invoke(name, args)</span>
+<span class="flow-arrow">→</span>
+<span class="flow-node flow-node--accent">history.add(tool result)</span>
+</div>
+<div class="flow-row--center" style="margin-top: var(--space-3)">
+<span class="flow-node flow-node--accent">回到 chat(), 喂 messages + tools</span>
+<span class="flow-arrow">→</span>
+<span class="flow-node">最终回复 (无 tool_calls)</span>
+</div>
+</div>
+<figcaption>图 02-1 · 工具调用 loop. 关键: tool message 必须紧跟 assistant(tool_calls)。</figcaption>
+</figure>
+<figure class="figure">
+<div class="flow-map" role="img" aria-label="OpenAI 风格 message 形状">
+<div class="flow-row--center">
+<span class="flow-node">user</span>
+<span class="flow-arrow">→</span>
+<span class="flow-node flow-node--accent">assistant<br/><small>tool_calls: [...]</small></span>
+<span class="flow-arrow">→</span>
+<span class="flow-node">tool<br/><small>tool_call_id 配对</small></span>
+<span class="flow-arrow">→</span>
+<span class="flow-node flow-node--accent">assistant<br/><small>最终回复</small></span>
+</div>
+</div>
+<figcaption>图 02-2 · OpenAI 风格 message 配对. tool message 必须用 tool_call_id 配对 assistant(tool_calls)。</figcaption>
+</figure>
+<figure class="figure">
+<div class="flow-tree" role="img" aria-label="3 种工具失败模式">
+<div class="flow-tree__children" style="border: 1px solid var(--color-border-soft); border-radius: var(--radius-md); padding: var(--space-3); width: 100%; max-width: 700px;">
+<div class="flow-tree__branch">
+<div class="flow-compare__label" style="color: #b8860b;">业务错误</div>
+<span class="flow-node">run_bash 返 exit 1</span>
+<span class="flow-node">run_read 文件不存在</span>
+<span class="flow-node" style="font-size: var(--text-xs);">→ error: true, LLM 继续</span>
+</div>
+<div class="flow-tree__branch">
+<div class="flow-compare__label" style="color: #cd5c5c;">权限拒绝</div>
+<span class="flow-node">用户拒绝 run_write</span>
+<span class="flow-node">黑名单触发</span>
+<span class="flow-node" style="font-size: var(--text-xs);">→ 写 denied tool message</span>
+</div>
+<div class="flow-tree__branch">
+<div class="flow-compare__label" style="color: #8b008b;">异常抛出</div>
+<span class="flow-node">工具超时</span>
+<span class="flow-node">OOM / panic</span>
+<span class="flow-node" style="font-size: var(--text-xs);">→ recovery 决定, loop 不崩</span>
+</div>
+</div>
+</div>
+<figcaption>图 02-3 · 3 种工具失败模式. 业务错误用 error: true, 异常抛出走 recovery, 权限拒绝写 denied tool message。</figcaption>
+</figure>
+<figure class="figure">
+<div class="flow-compare" role="img" aria-label="业务错误 vs throw">
+<div class="flow-compare__col flow-compare__col--bad">
+<div class="flow-compare__label">❌ throw</div>
+<span class="flow-node">assistant 调 2 工具</span>
+<span class="flow-node" style="font-weight: 600;">throw 退出</span>
+<span class="flow-node">tool message: 0 条</span>
+<span class="flow-node">messages 序列断裂</span>
+</div>
+<div class="flow-compare__col flow-compare__col--good">
+<div class="flow-compare__label">✅ error: true</div>
+<span class="flow-node">assistant 调 2 工具</span>
+<span class="flow-node" style="font-weight: 600;">error: true 返回</span>
+<span class="flow-node">tool message: 2 条</span>
+<span class="flow-node">LLM 继续推理</span>
+</div>
+</div>
+<figcaption>图 02-4 · 业务错误 vs throw. throw 让 LLM 看不到任何信息, error: true 让 LLM 继续推理。</figcaption>
+</figure>
 <h2 id="trap">反例梯度</h2>
-
 <div class="card card--trap">
-  <div class="card__head">
-    <span class="card__tag">新手错法 · A</span>
-  </div>
-  <div class="card__body">
-    <p><strong>常见错误:</strong>用 if/else 分发工具, 不写 registry。</p>
-    <p>
-      <strong>为什么错:</strong>小规模能跑, 工具数到 10+
-      后无法测试、无法扩展、无法暴露给 LLM。
+<div class="card__head">
+<span class="card__tag">新手错法 · A</span>
+</div>
+<div class="card__body">
+<p>
+<strong>常见错误:</strong> 工具执行失败直接 <code>throw</code>。
     </p>
-    <p>
-      <strong>正确做法:</strong>本章已示范, registry 是 harness 复用同一份 tool
-      列表的基础设施。
+<p>
+<strong>为什么错:</strong> throw 会让 agent.run 整个崩, messages 序列断裂,
+      LLM 收不到任何信息。
+    </p>
+<p>
+<strong>正确做法:</strong> 业务错误用 <code>{ content: "Error: ...", error: true }</code> 返回,
+      LLM 看到错误后能继续推理。
     </p>
-  </div>
 </div>
-
+</div>
 <div class="card card--trap">
-  <div class="card__head">
-    <span class="card__tag">中级错法 · B</span>
-  </div>
-  <div class="card__body">
-    <p>
-      <strong>常见错误:</strong>工具抛错时, 不写 tool message, 直接抛给上层。
+<div class="card__head">
+<span class="card__tag">中级错法 · B</span>
+</div>
+<div class="card__body">
+<p>
+<strong>常见错误:</strong> 工具调成功不写 <code>role: "tool"</code> 消息。
     </p>
-    <p>
-      <strong>为什么错:</strong>messages 序列断裂, LLM 协议拒绝接受。下一次 LLM
-      调用会报 "missing tool_result" 错, 而且无法恢复。
+<p>
+<strong>为什么错:</strong> messages 序列断裂, 下一次 LLM 收到的 assistant
+      消息没有对应的 tool message, 模型困惑, OpenAI API 甚至可能直接报 400 错误。
     </p>
-    <p>
-      <strong>正确做法:</strong>try/catch 包住 execute, 出错时写一条
-      <code>{role:"tool", content:"Error: ..."}</code>。错误也是信息, 必须让 LLM
-      看到。
+<p>
+<strong>正确做法:</strong> 每次 tool_call 必须配对一条 tool message, 不管
+      成功失败。
     </p>
-  </div>
 </div>
-
+</div>
 <div class="card card--trap">
-  <div class="card__head">
-    <span class="card__tag">高级错法 · C</span>
-  </div>
-  <div class="card__body">
-    <p>
-      <strong>常见错误:</strong>tool message 不带 <code>tool_call_id</code>,
-      或者 <code>id</code> 用自增整数生成。
+<div class="card__head">
+<span class="card__tag">高级错法 · C</span>
+</div>
+<div class="card__body">
+<p>
+<strong>常见错误:</strong> tool description 写"这是一个工具"。
     </p>
-    <p>
-      <strong>为什么错:</strong>多工具并行调用时, 整数 id 无法对应到具体哪个
-      tool_call, 协议层会拒绝接受。
+<p>
+<strong>为什么错:</strong> LLM 看了不知道何时调, 永远不调。
     </p>
-    <p>
-      <strong>正确做法:</strong>用 LLM 在 tool_calls 里返回的
-      <code>id</code> 字符串原样回写, 不要重新生成。
+<p>
+<strong>正确做法:</strong> 写"<strong>读取文件内容</strong>, <strong>需要时</strong>用
+      这个工具, <strong>不要</strong>用 cat 命令代替"。
     </p>
-  </div>
 </div>
-
+</div>
 <div class="card card--trap">
-  <div class="card__head">
-    <span class="card__tag">边界错法 · D</span>
-  </div>
-  <div class="card__body">
-    <p><strong>常见错误:</strong>把权限检查写在工具 execute 内部。</p>
-    <p>
-      <strong>为什么错:</strong>权限是 harness 决策, 不是工具职责。写进 execute
-      后, 第 07 章 permission 模块无法统一拦截, 第三方工具 (例如 MCP)
-      也无法复用同一套规则。
+<div class="card__head">
+<span class="card__tag">边界错法 · D</span>
+</div>
+<div class="card__body">
+<p>
+<strong>常见错误:</strong> 工具参数 schema 用 TypeScript 类型, 不写 JSON Schema。
+    </p>
+<p>
+<strong>为什么错:</strong> TypeScript 类型编译后就被擦除, LLM 看不到, 模型给出来的参数不符合期望。
     </p>
-    <p>
-      <strong>正确做法:</strong>权限检查在 agent 主循环里, 工具 execute
-      假定输入已经过校验。本章先放空, 第 07 章会展开。
+<p>
+<strong>正确做法:</strong> 用 JSON Schema 描述, harness 在 invoke 前做一次
+      校验, 不通过返回 error。
     </p>
-  </div>
 </div>
-
+</div>
 <h2 id="validate">如何验证 (本章 Validation 卡片)</h2>
 <div class="card card--validation">
-  <div class="card__head">
-    <span class="card__tag">Validation · 第 02 章</span>
-  </div>
-  <div class="card__body">
-    <p>
-      <strong>tool_call 触发执行:</strong>fake LLM 返回携带
-      <code>tool_calls: [{name:"read_file", args:{path:"a.ts"}}]</code>
-      的 assistant, 跑完后 history 末尾出现 <code>role: "tool"</code>
-      消息, 且 content 是文件内容。
-    </p>
-    <p>
-      <strong>tool_call_id 配对:</strong>fake LLM 返回 2 个 tool_calls, 跑完后
-      history 出现 2 条 tool message, 它们的
-      <code>tool_call_id</code> 严格对应到 assistant.tool_calls[0/1].id。
+<div class="card__head">
+<span class="card__tag">Validation · 第 02 章</span>
+</div>
+<div class="card__body">
+<p>
+<strong>调一次工具再回答:</strong> fake tool 返回固定内容, fake LLM 第一轮
+      调工具, 第二轮基于工具结果回答, 最终回复等于 fake LLM 第二轮预设的 content, 且第二轮收到的 messages 里含 fake tool 的结果。
     </p>
-    <p>
-      <strong>工具抛错仍写 tool message:</strong>fake registry 在 execute
-      内抛错, 跑完后 history 仍有 <code>role: "tool"</code> 消息, content 以
-      "Error: " 开头。
+<p>
+<strong>tool message 必写:</strong> 工具调成功 / 业务失败 / 权限拒绝, history
+      末尾都有 <code>role: "tool"</code> 消息, content 描述结果。
     </p>
-    <p>
-      <strong>loop 终止条件:</strong>fake LLM 第一轮带 tool_calls, 第二轮 不带,
-      跑完后 <code>agent.run()</code> 恰好返回第二轮 assistant 的 content,
-      没有第三轮。
+<p>
+<strong>参数 JSON 解析失败不崩:</strong> LLM 给出非法 JSON args, <code>JSON.parse</code>
+      失败后不调用 tool executor, history 写 error tool message, loop 不抛。
     </p>
-    <p>
-      <strong>未知工具不崩:</strong>fake LLM 返回 <code>name: "不存在"</code>,
-      跑完后 history 末尾有 tool message, content 提示 "Unknown tool", agent
-      不抛异常, 第二轮 LLM 看到错误并改用其他工具。
+<p>
+<strong>tool description 含使用场景:</strong> 每个工具 description 至少含
+      一句"何时用", 而不是空泛的"工具" 二字。
     </p>
-  </div>
 </div>
-
-<h2 id="debug">如果实现失败, 先查哪里</h2>
-<ol>
-  <li>
-    <strong>LLM 报 "missing tool_result":</strong>看 tool execute 抛错时是否写了
-    tool message, 或者是否漏写了 <code>role: "tool"</code>。
-  </li>
-  <li>
-    <strong>工具从未执行:</strong>看
-    <code>assistant.tool_calls</code> 解析是否被丢, fake LLM 返回时是否带
-    <code>tool_calls</code> 字段。
-  </li>
-  <li>
-    <strong>loop 死循环:</strong>看终止条件。fake LLM 永远返回同一个 tool_call,
-    loop 会一直转; 测试时务必预设两轮不同响应。
-  </li>
-  <li>
-    <strong>tool_call_id 对不上:</strong>看是否用了自增整数, 应严格用 LLM
-    返回的字符串 id。
-  </li>
-  <li>
-    <strong>工具 execute 读不到 schema:</strong>registry.get(name) 返回
-    undefined, 检查 name 是否一致, schema 是否完整。
-  </li>
-</ol>
-
-<h2 id="lookback">回望第 00–01 章: 哪些原则在本章兑现了</h2>
+</div>
+<h2 id="lookback">回望: 哪些原则在本章兑现了</h2>
 <ul>
-  <li>
-    <strong>术语锁定:</strong><code>tool call</code> /
-    <code>tool result</code> / <code>tool registry</code> 在第 00 章 6
-    词里没出现, 首次出现于本章, 之后只用英文。
-  </li>
-  <li>
-    <strong>Composition Root 唯一接线:</strong>registry 在
-    <code>index.ts</code> 创建, 通过 deps 注入 agent; agent.ts 内不 new
-    registry。
-  </li>
-  <li>
-    <strong>共享实例:</strong>同一个 registry 实例被 agent 和 permission 共享,
-    不会因为新建两次导致"工具列表不一致"。
-  </li>
-  <li>
-    <strong>fake LLM 顺序断言:</strong>本章所有 Validation 都基于 fake LLM,
-    不依赖真实模型, 跑得快、断言准。
-  </li>
+<li>
+<strong>依赖注入</strong>: ToolRegistry 注入 agent, 不在内部 new。
+    </li>
+<li>
+<strong>工厂模式</strong>: registry 由 <code>createToolRegistry()</code> 工厂创建, 工具通过 <code>register(name, def, executor)</code> 注册;
+      运行时只共享一个 registry 实例, 但 executor 闭包内捕获上下文。
+    </li>
+<li>
+<strong>窄接口</strong>: ToolResult 只暴露 4 个字段, 工具元信息不外泄。
+    </li>
+<li>
+<strong>fake 测试</strong>: 不依赖真实文件系统, 用 fake tool 验证 loop。
+    </li>
 </ul>
-
 <h2 id="forward">前瞻张力: 留给后续章节</h2>
 <dl class="defs">
-  <dt>工具多到装不下</dt>
-  <dd>第 05 章 Skill 会解决"按需加载工具子集", 本章的 registry 是它的底座。</dd>
-  <dt>危险命令要拦</dt>
-  <dd>
-    第 07 章 Permission 会在 execute 之前同步拦截, 本章把 execute
-    假定为"已校验"是为了让权限模块能复用同一个 registry。
-  </dd>
-  <dt>工具结果太长</dt>
-  <dd>
-    第 06 章 Compress 会处理"tool result 太大撑爆 context" 的问题, tool message
-    是它的主要压缩对象之一。
-  </dd>
-  <dt>工具结果要审计</dt>
-  <dd>
-    第 08 章 Hook 会在 tool execute 前后插入"事实记录", transcript 会单独留底,
-    不与 history 混。
-  </dd>
-  <dt>子智能体用工具</dt>
-  <dd>
-    第 04 章 SubAgent 会让第二个 agent 复用同一份 registry, 但暴露工具子集。
-  </dd>
+<dt>工具数到 30+</dt>
+<dd>
+    全部暴露给 LLM 会让 system prompt 撑爆, 模型也选不对。 第 05 章
+    Skill 机制按场景动态激活工具子集。
+    </dd>
+<dt>权限</dt>
+<dd>
+    工具执行要不要先问用户, 第 07 章展开。
+    </dd>
+<dt>大输出</dt>
+<dd>
+    <code>run_bash</code> 返回 5000 字错误日志怎么办, 第 06 章 P1 即时压缩。
+    </dd>
 </dl>
-
-<h2 id="vibe-coding-02">本次如何 vibe code: 第 02 章的三件套</h2>
-
-<h3 id="vibe-feed-02">拆卡: 4 轮迭代的具体产物</h3>
-<ol>
-  <li>
-    <strong>第 1 轮 · 接口</strong>。只贴 "目标 + 场景 + 模块", 让 LLM 给出
-    <code>Tool</code> / <code>ToolRegistry</code> / <code>ToolCall</code> /
-    <code>ToolResult</code> 4 个 interface 的草案, 以及
-    <code>AssistantMessage</code> 增加
-    <code>tool_calls</code> 字段的形态。本轮不写实现, 只钉形状与命名。
-  </li>
-  <li>
-    <strong>第 2 轮 · 接线</strong>。贴 "模块 + 接线", 让 LLM 给出
-    <code>index.ts</code> 接线代码, 其中
-    <code>createToolRegistry</code> 是空实现, agent.run 仍然只走第 01
-    章的"返回文本"路径。本轮 review 重点: registry 实例是否在
-    <code>index.ts</code> 只 new 一次。
-  </li>
-  <li>
-    <strong>第 3 轮 · 边界</strong>。贴 "边界 (5 条 checklist)", 让 LLM 按
-    checklist 写 agent 主循环加 tool call 分支、写 read_file 工具、写 tool
-    message 写入 history。本轮 review 重点: tool_call_id 是否原样回写,
-    错误是否仍然写 tool message。
-  </li>
-  <li>
-    <strong>第 4 轮 · 验证</strong>。贴 "验证 (5 条 vitest 断言)", 让 LLM 写
-    <code>test/agent.tool.test.ts</code>。本轮 review 重点: fake LLM
-    是否预设两轮响应 (第一轮带 tool_calls, 第二轮不带), 否则 loop 不会终止。
-  </li>
-</ol>
-
-<h3 id="vibe-review-02">Review: 第 02 章专属 checklist</h3>
-<ol>
-  <li>
-    <strong>tool_call_id 严格来自 LLM。</strong>代码中不得出现
-    <code>const id = String(counter++)</code> 这种自增 id。验证:
-    <code>grep -n 'counter\|nanoid\|uuid' src/agent.ts</code> 应当 0 行。
-  </li>
-  <li>
-    <strong>tool message role 必为 "tool"。</strong>不出现
-    <code>role: "user"</code> 伪装成工具结果。验证:
-    <code>grep -n 'role: "user"' src/agent.ts</code> 在 tool 执行路径上为 0 行
-    (非 tool 路径仍可写 user message)。
-  </li>
-  <li>
-    <strong>未知工具不抛异常。</strong>写一条 tool message 提示 "Unknown tool",
-    继续 loop。验证: Validation 卡片"未知工具不崩"那条断言要落到测试里。
-  </li>
-  <li>
-    <strong>permission 没写在 execute 内。</strong>第 07 章会补, 本章的 execute
-    应当是"裸"实现, 不读 process.env, 不读白名单。验证:
-    <code>grep -n 'process.env\|whitelist' src/tools/read_file.ts</code> 应当 0
-    行。
-  </li>
-  <li>
-    <strong>registry 是工厂不是单例。</strong>复用第 01 章的检查:
-    <code>grep -n 'export const registry' src/</code> 应当 0 行。
-  </li>
-</ol>
-
-<h3 id="vibe-debug-02">调试: 第 02 章典型伪装</h3>
-<ol>
-  <li>
-    <strong>伪装 A · 工具执行了但 messages 没增长。</strong>症状: tool execute
-    输出在终端能看到, 但 fake LLM 第二次收到的 messages 不含 tool message。验证:
-    Validation 卡片"tool_call 触发执行"那条断言, 必须断言
-    <code
-      >history.getMessages().filter(m =&gt; m.role === "tool").length ===
-      1</code
-    >, 而不只是断言最终文本。
-  </li>
-  <li>
-    <strong>伪装 B · tool_call_id 用函数参数名伪装。</strong>症状: 代码里写
-    <code>tool_call_id: call.name</code>, 看着对, 实际错。验证: LLM 返回的
-    <code>call.id</code> 应当是字符串 (例如 "call_abc123"), 不是工具函数名。grep
-    一次 <code>tool_call_id:</code> 看取的是哪个字段。
-  </li>
-  <li>
-    <strong>伪装 C · 把权限检查塞进工具 execute。</strong>症状: read_file 工具内
-    <code>if (path.startsWith("/")) throw new Error(...)</code
-    >。这等于把"路径必须项目内"这条硬规则焊在 read_file 里, 后续 edit / write
-    工具又会重复写一遍。验证: 第 07 章会统一收口, 本章的 execute 内禁止出现 if
-    守卫 (除了参数解析)。
-  </li>
-</ol>
-
-<h3 id="vibe-iterate-02">迭代: 第 02 章 5 个 commit 节点</h3>
-<ol>
-  <li>
-    <code
-      >feat(ch02): 钉 Tool / ToolRegistry / AssistantMessage.tool_calls
-      接口</code
-    >
-    —— tsc 通过, 无实现。
-  </li>
-  <li>
-    <code>feat(ch02): createToolRegistry 工厂 + read_file 工具 stub</code> ——
-    tsc 通过, agent.run 仍未走 tool 分支。
-  </li>
-  <li>
-    <code>feat(ch02): agent.run 加 tool call 分支, 写 tool message</code> ——
-    跑通 Validation 卡片前 3 条 (触发执行 / id 配对 / 错误仍写 tool message)。
-  </li>
-  <li>
-    <code>feat(ch02): loop 终止条件 + 未知工具降级</code> —— 跑通 Validation
-    卡片后 2 条 (终止条件 / 未知工具不崩)。
-  </li>
-  <li>
-    <code>test(ch02): 5 条 Validation 全部 vitest 通过</code> —— 全绿, commit
-    message 引用 5 个 Validation ID。
-  </li>
-</ol>
-
 <h2 id="prompt-card">Prompt Card (本章任务)</h2>
 <div class="card card--prompt">
-  <div class="card__head">
-    <span class="card__tag">Prompt Card · 第 02 章</span>
-    <button class="card__copy" type="button" data-copy-card>复制</button>
-  </div>
-  <div class="card__body">
-    <p>
-      <strong>目标:</strong>把 tool call 接进 agent loop,
-      让模型可以读文件、跑命令、改代码。
+<div class="card__head">
+<span class="card__tag">Prompt Card · 第 02 章</span>
+<button class="card__copy" data-copy-card="" type="button">复制</button>
+</div>
+<div class="card__body">
+<p>
+<strong>目标:</strong> 给 agent loop 加上工具调用能力, LLM 输出 tool_call,
+      harness 执行, 把 tool_result 写回 history。
     </p>
-    <p>
-      <strong>场景:</strong>用户输入 "帮我读 src/agent.ts", agent 第一轮让 LLM
-      返回 tool_call=read_file, 跑完工具后第二轮 LLM 基于文件内容回答。
+<p>
+<strong>场景:</strong> 用户问"这个项目用什么 test 命令?", agent 调
+      <code>run_read("package.json")</code>, 看到结果, 回答 "npm test"。
     </p>
-    <p>
-      <strong>模块:</strong> <code>src/llm.ts</code> 暴露
-      <code>chat()</code> 解析 tool_calls;
-      <code>src/tools/registry.ts</code> (新) 暴露
-      <code>createToolRegistry()</code>;
-      <code>src/tools/read_file.ts</code> (新) 实现 read_file 工具;
-      <code>src/agent.ts</code> 主循环加 tool call 分支;
-      <code>src/history.ts</code> 接受 role: "tool" 消息。
+<p>
+<strong>模块:</strong> <code>src/tools/types.ts</code> (新) 暴露 <code>ToolResult</code> 接口;
+      <code>src/tools/registry.ts</code> (新) 暴露 <code>createToolRegistry()</code>;
+      <code>src/tools/run-read.ts</code> (新) 真实 <code>run_read</code> 工具;
+      <code>src/agent.ts</code> (改) 接收 ToolRegistry, 第 5 步执行工具;
+      <code>src/index.ts</code> (改) 注册核心工具。
     </p>
-    <p><strong>边界 (LLM 必须遵守的 checklist):</strong></p>
-    <ul>
-      <li>
-        tool message 必带 <code>tool_call_id</code>, id 来自 LLM 返回,
-        不重新生成
-      </li>
-      <li>
-        工具 execute 抛错时仍然写一条 <code>role: "tool"</code> 消息, content 以
-        "Error: " 开头
-      </li>
-      <li>未知工具不抛异常, 写一条提示消息后继续 loop</li>
-      <li>execute 内不读 process.env, 不写权限检查 (留给第 07 章)</li>
-      <li>assistant 没有 tool_calls 时, 行为退化为第 01 章 (返回 content)</li>
-    </ul>
-    <p><strong>验证 (用 fake LLM + fake registry, 逐条落到 vitest):</strong></p>
-    <ul>
-      <li>
-        fake LLM 第一轮返回带 tool_calls 的 assistant, 第二轮返回纯文本,
-        agent.run 返回第二轮 content
-      </li>
-      <li>tool_call_id 严格等于 assistant.tool_calls[N].id</li>
-      <li>
-        fake registry.execute 抛错时, history 末尾仍有
-        <code>role: "tool"</code> 消息
-      </li>
-      <li>
-        fake LLM 返回 name: "不存在", history 末尾有提示消息, agent.run 不抛
-      </li>
-      <li>
-        2 个 tool_calls → 2 条 tool message, 顺序按 assistant.tool_calls 顺序
-      </li>
-    </ul>
-  </div>
+<p><strong>边界 (LLM 必须遵守的 checklist):</strong></p>
+<ul>
+<li>ToolResult 只暴露 <code>content</code> / <code>attachments</code> / <code>error</code> /
+      <code>metadata</code> 4 个字段, 工具元信息不外泄</li>
+<li>工具参数是 JSON Schema, 不是 TypeScript 类型</li>
+<li>业务错误用 <code>error: true</code> 返回, 不 throw</li>
+<li>每次 tool_call 必须配对一条 <code>role: "tool"</code> 消息, 不管成功失败</li>
+<li>tool description 必须含"何时用" 的场景描述, 不能空泛</li>
+</ul>
+<p><strong>验证 (用 fake LLM + fake tool, 逐条落到 vitest):</strong></p>
+<ul>
+<li>fake LLM 第一轮调 <code>run_read</code>, fake tool 返回固定内容, 第二轮
+      LLM 收到 tool_result, 最终回复基于工具内容</li>
+<li>history 累积 user + assistant(tool_call) + tool + assistant, 不断裂</li>
+<li>业务错误 (fake tool 返回 <code>error: true</code>) 不抛, LLM 继续推理</li>
+<li>非法 JSON args 返回 error tool message, loop 不崩</li>
+</ul>
+</div>
 </div>
-
 <h2 id="practice">本章练习</h2>
 <ol>
-  <li>故意把 tool_call_id 改写成自增整数, 跑测试, 看哪条 Validation 抓到。</li>
-  <li>
-    在 agent.run 里加
-    <code
-      >if (assistant.tool_calls.length &gt; 0) throw new Error("no tools")</code
-    >, 跑测试, 看是否被 Validation 卡片"未知工具不崩"抓到。
+<li>
+    故意让 tool executor <code>throw new Error()</code>, 跑测试, 看
+    "业务错误不崩" 是否抓到 (loop 崩, history 断裂)。
   </li>
-  <li>
-    把 read_file 工具换成 write_file, 写一段故意不写 tool message 的代码,
-    跑测试, 看是否被"工具抛错仍写 tool message"抓到。
+<li>
+    让 tool executor 调成功但不写 tool message, 跑测试, 看
+    "tool message 必写" 是否抓到 (下次 chat() 收到错位消息)。
+  </li>
+<li>
+    写一个 tool description = "这是一个工具", 跑测试, 看 LLM
+    是否会调它 (不会调, 描述太抽象)。
   </li>
 </ol>
-
 <h2 id="summary">本章小结</h2>
 <p>
-  本章把 loop 从"调 LLM → 返回文本"扩展成"调 LLM → 可能有 tool_calls → 执行工具
-  → 写 tool result → 再调 LLM"。tool call 是 harness 与 LLM 协作的核心协议,
-  一旦接上, coding agent 就能读文件、改代码、跑命令。 下一章 (第 03 章) 我们会在
-  loop 上加一层"计划"——TODO Manager, 让多轮执行有节奏。
+  这一章给 loop 加上了工具调用。 核心是 3 个设计选择:
 </p>
-
-<h2 id="next">下一章伏笔</h2>
+<ul>
+<li>
+<strong>OpenAI 风格的统一 history 形状</strong>: 内部一律用
+      <code>tool_calls</code> + <code>role: "tool"</code>, 未来接 Anthropic
+      / Google 时在 adapter 层翻译。
+    </li>
+<li>
+<strong>ToolResult 4 字段</strong>: content / attachments / error / metadata,
+      业务错误用 <code>error: true</code>, 不用 throw。
+    </li>
+<li>
+<strong>Tool Registry</strong>: 工具注册后转成 JSON Schema 给 LLM,
+      LLM 调的工具名 + 参数 + 结果都进 history。
+    </li></ul>
 <p>
-  第 02 章的 loop 仍然没有节奏: 模型可以一次调 10 个工具, 也可以
-  一直调下去不结束。第 03 章 TODO Manager 给 loop 加一个"工作清单",
-  模型每轮开始前能看到当前 TODO 状态, 每轮结束后可以更新 TODO。 这是给 harness
-  加"自我规划"能力的第一步。
+  下一章 (第 03 章) 给 agent 加一个"短期记忆" — TODO Manager,
+  让多轮执行有节奏。
 </p>
+</content>
\ No newline at end of file
diff --git a/tutorial/chapters/03-todo.html b/tutorial/chapters/03-todo.html
index 2ecd40b..1116b28 100644
--- a/tutorial/chapters/03-todo.html
+++ b/tutorial/chapters/03-todo.html
@@ -1,523 +1,796 @@
-<p class="article__eyebrow">第 03 章 · 让多轮执行有节奏</p>
-<h1 class="article__title">给 Loop 加一个工作清单: TODO Manager</h1>
+<p class="article__eyebrow">第 03 章 · 多轮执行有节奏</p>
+<h1 class="article__title">TODO Manager: 让 LLM 自己追踪执行步骤</h1>
 <p class="article__lede">
-  第 02 章接上 tool call 之后, loop 有一个新问题: 模型可以一次调 10 个工具,
-  也可以一直调下去不结束。这一章给 loop 加一个"工作清单"——TODO Manager。
-  模型每轮开始前能看到当前 TODO 状态, 每轮结束后可以更新 TODO。 这是 harness
-  第一次具备"自我规划"能力。
+  第 02 章的 agent 能调工具了, 但跑 5 轮之后 LLM 已经忘了最初的计划 —
+  它在调"读 README" 时, 可能忘了"先 run_bash 看看 package.json"。
+  这一章给 harness 加一个 TODO Manager: LLM 可以创建 / 更新 / 完成 TODO,
+  这些 TODO 既给 LLM 自己看, 也给用户看。 读完后, 你能讲清 TODO 和
+  持久化 Task 的边界, 并能用 fake LLM 写一个"完成 3 步 TODO 才返回"
+  的测试。
 </p>
-
-<nav id="article-inline-toc" class="article__meta" aria-label="页内小节"></nav>
-
-<hr class="rule" />
-
-<h2 id="delta-from-02">在第 02 章基础上改了什么</h2>
-<p>
-  这一章在 agent 主循环里加一个 TODO 模块。loop 跑起来时, 每次 LLM
-  调用前都把当前 TODO 状态作为 system reminder 注入到 messages, LLM
-  返回后允许它通过一个特殊工具 (<code>update_todo</code>) 更新 TODO。
-  对应到代码, 改动集中在 3 个新文件 + 1 处改 agent.ts:
-  <code>src/todo.ts</code> (新)、
-  <code>src/tools/update_todo.ts</code> (新)、<code>src/agent.ts</code> (改)、
-  <code>src/index.ts</code> (改接线)。
-</p>
-<div class="source-links" aria-label="本章 GitHub 永久链接">
-  <a
-    class="source-link"
-    href="https://github.com/pingp76/swoopcode/blob/main/src/todo.ts"
-    target="_blank"
-    rel="noreferrer"
-    >1. src/todo.ts: TODO 数据结构与状态机 (新)</a
-  >
-  <a
-    class="source-link"
-    href="https://github.com/pingp76/swoopcode/blob/main/src/agent.ts"
-    target="_blank"
-    rel="noreferrer"
-    >2. src/agent.ts: 每轮 LLM 调用前注入 TODO reminder</a
-  >
-  <a
-    class="source-link"
-    href="https://github.com/pingp76/swoopcode/blob/main/src/tools/registry.ts"
-    target="_blank"
-    rel="noreferrer"
-    >3. src/tools/registry.ts: 注册 update_todo 工具</a
-  >
-</div>
-
-<h2 id="author-thinking">作者怎么想的: 这一章的思考链</h2>
-<dl class="defs">
-  <dt>想清楚现象</dt>
-  <dd>
-    长任务下 (例如"先读 3 个文件, 改 5 行, 跑测试"), 模型在第 5 轮
-    已经忘了"我刚才说要先读文件"。harness 必须替它保留一份工作清单,
-    每轮提醒它"现在在清单第几项"。
-  </dd>
-  <dt>想反例</dt>
-  <dd>
-    最朴素的反例是"把 TODO 写在 system prompt 里, 让 LLM 自己维护"。
-    这有两个问题: 一是 LLM 不会真去维护, 它会"假装"维护 (写漂亮 文本,
-    实际不再看); 二是 system prompt 写 TODO 会破坏第 10 章的 cache-friendly
-    布局。
-  </dd>
-  <dt>想接口和不变量</dt>
-  <dd>
-    接口:
-    <code>interface TodoManager { get(), update(items), tickRound() }</code>。
-    不变量三条: (1) TODO 状态独立于 history, 不进 messages 序列, (2) 每轮 LLM
-    看到的 TODO 是 reminder 形式 (system-reminder user 消息), 不污染 system
-    prompt 稳定前缀, (3) TODO 状态机: pending → in_progress → completed,
-    不允许跳跃 (例如从 pending 直接 completed)。
-  </dd>
-  <dt>想怎么验证</dt>
-  <dd>
-    fake LLM 第一轮返回 assistant 携带 <code>update_todo</code> 工具调用,
-    跑完后断言 TODO 状态变了; 第二轮 LLM 收到的 messages 包含一条 reminder 消息,
-    content 描述新的 TODO 状态。
-  </dd>
-</dl>
-
-<h2 id="observe-first">先观察: 两段故意有气味的实现</h2>
-
-<div class="note">
-  <p class="note__title">观察 1 · 把 TODO 塞进 system prompt</p>
-  <pre class="code-block"><code>// 教学简化版, 仓库真实实现见 GitHub 永久链接
-const systemPrompt = `你是一个编码助手。当前 TODO: ${JSON.stringify(todos)}`;
-const messages = [{ role: "system", content: systemPrompt }, ...history.getMessages()];</code></pre>
-  <p><strong>问:</strong>看上去简洁, 为什么仍然不行?</p>
-  <p>
-    <strong>答:</strong>两件事同时坏掉 —— cache: system prompt 一旦包含 TODO
-    状态, 每次调用 LLM 的 system prompt 都不同, 整个 prompt cache 失效, token
-    成本涨 3-5 倍; 真实性: LLM 不会真去更新 system prompt 里的 TODO 字符串,
-    harness 必须用结构化数据维护 TODO, 不能用自由文本。
+<nav aria-label="页内小节" class="article__meta" id="article-inline-toc"></nav>
+<hr class="rule"/>
+<h2 id="real-failure">真实失败故事: 第 7 轮失忆</h2>
+<p>
+  写代码之前, 看一段真实长任务的失败。 跑一段对话: 用户让 agent
+  "全项目 code review, 列出 3 个最严重的 bug"。
   </p>
-</div>
-
-<div class="note">
-  <p class="note__title">观察 2 · TODO 状态机不限制跳跃</p>
-  <pre class="code-block"><code>// 教学简化版
-class TodoManager {
-  update(items) { this.items = items; }  // 直接覆盖
-}
-}</code></pre>
-  <p><strong>问:</strong>为什么不直接覆盖, 要限制状态机?</p>
-  <p>
-    <strong>答:</strong>LLM 倾向于"全部 completed 一笔勾销"。如果不限制,
-    模型会在第一轮就把所有 TODO 标成 completed, 之后即使它根本没做完, harness
-    也不知道。这种"假装做完了"是红灯 3 的典型发作。
+<ol>
+<li>
+<strong>第 1-3 轮</strong>: agent 看了 README, 看了 2 个文件, 思路清晰。
+    </li>
+<li>
+<strong>第 5 轮</strong>: agent 调第 4 个文件, 但忘了之前看过什么,
+    把"我觉得 X 是 bug" 重复了 1 遍。
+    </li>
+<li>
+<strong>第 7 轮</strong>: agent 调第 6 个文件, 又开始重新调第 2 个文件
+    (重复看), 总结时漏掉 1 个文件, 重复 1 个文件的判断。
+    </li>
+</ol>
+<p>
+  原因不是 LLM 笨, 是<strong>5 轮工具结果把上下文撑满, LLM 注意力
+  被稀释</strong>。 它"看到" 5 个文件, 但"记得住" 的只有最近 1-2 个。
+  </p>
+<p>
+  朴素想法 1: "让 LLM 写总结, 把之前的工具结果总结成一段?"
+  能缓解, 但不根治 — 总结本身也要占 token, 总结会丢失细节。
+  </p>
+<p>
+  朴素想法 2: "压缩工具结果?"
+  第 06 章会讲压缩, 但压缩针对"单条工具结果太长", 不解决"多步
+  任务的状态丢失" 问题。
+  </p>
+<p>
+  正确做法: <strong>给 harness 加"短期步骤追踪"</strong> — TODO Manager。
+  LLM 自己创建 TODO, 每条 TODO 状态走 turn reminder 注入 history,
+  LLM 每轮都能看到"还有哪些没做", 用户也能在 REPL 看到进度。
+  </p>
+<h2 id="why-not-system-prompt">为什么不能塞进 system prompt</h2>
+<p>
+  朴素想法: "TODO 状态既然重要, 直接拼到 system prompt 字符串里?"
+  立刻坏 3 件事:
+  </p>
+<ol>
+<li>
+<strong>破坏 prompt cache</strong>: system prompt 是 stable prefix,
+    LLM provider 对命中缓存的这段按约 1/10 价格计费。 TODO 状态每轮都变, 拼到
+    system prompt 等于让 cache 命中率归零。 成本涨 5-10 倍。
+  </li>
+<li>
+<strong>污染 system prompt 语义</strong>: system prompt 是"行为规则",
+    "当前 TODO 状态" 是"运行时状态"。 混在一起, LLM 不知道哪部分是
+    规则哪部分是状态。
+  </li>
+<li>
+<strong>每轮重复生成</strong>: 拼到 system prompt 字符串, 每次 chat()
+    都要重新生成完整字符串 (history 拼上)。 多 100 token 不算多,
+    跑 100 轮就 10000 token 浪费。
+  </li>
+</ol>
+<p>
+  解决方式: TODO 状态走 <strong>turn reminder</strong> — 每轮注入
+  user message, 状态变化才更新内容, 不变就不重复生成。 详细 reminder
+  模式见 Reference 章节的"模式 11"。
   </p>
+<h2 id="data-structure">TODO 数据结构</h2>
+<figure class="figure">
+<div class="flow-tree" role="img" aria-label="TODO 状态生命周期">
+<div class="flow-tree__children" style="border: 1px solid var(--color-border-soft); border-radius: var(--radius-md); padding: var(--space-4); width: 100%; max-width: 600px;">
+<div class="flow-tree__branch">
+<div class="flow-compare__label">pending</div>
+<span class="flow-node">LLM 创建</span>
+<span class="flow-node">↓</span>
 </div>
+<div class="flow-tree__branch">
+<div class="flow-compare__label">in_progress</div>
+<span class="flow-node">LLM 标 in_progress</span>
+<span class="flow-node">harness 自动设 active</span>
+<span class="flow-node">↓</span>
+</div>
+<div class="flow-tree__branch">
+<div class="flow-compare__label">completed / cancelled</div>
+<span class="flow-node">完成 or 放弃</span>
+<span class="flow-node">active 自动清空</span>
+<span class="flow-node" style="font-size: var(--text-xs);">保留追溯, 不删</span>
+</div>
+</div>
+</div>
+<figcaption>图 03-1 · TODO 4 个状态生命周期. pending → in_progress → completed / cancelled, activeId 由 harness 自动维护。</figcaption>
+</figure>
+<p>
+  <strong>用途</strong>: 记录"当前 session 的步骤列表" (仅存于内存, 不落盘)。 给 LLM 看的
+  "我接下来要做什么", 给用户看的 "[2/5] 进度条"。
+  </p>
+<p>
+  <strong>真实场景</strong>: 用户让 agent 跑 5 步任务, agent 创建 5 条 TODO
+  (read / bash / write / write / verify), 跑过程中 LLM 标 in_progress →
+  completed, 用户在 REPL 看到进度变化, 知道 agent 在做什么。
+  </p>
+<p>
+  <strong>设计思想</strong>: TODO 是<strong>session 内</strong>的临时状态,
+  不持久化, 进程结束就丢。 这个边界很重要: 跨会话的长期任务
+  属于第 12 章的 Task 系统, 不要混。
+  </p>
+<p>
+  <strong>实现细节</strong>:
+</p>
+<pre class="code-block"><code>type TodoStatus = "pending" | "in_progress" | "completed" | "cancelled";
 
-<h2 id="interfaces">接口形状: 在写实现前钉死</h2>
-<pre class="code-block"><code>// 教学简化版, 仓库真实实现见 GitHub 永久链接
-export type TodoStatus = "pending" | "in_progress" | "completed";
-
-export interface TodoItem {
-  content: string;
+interface Todo {
+  id: string;             // 短 hash, 不用 UUID
+  content: string;        // "Read package.json"
   status: TodoStatus;
+  createdAt: number;
+  updatedAt: number;
 }
 
-export interface TodoManager {
-  get(): TodoItem[];
-  update(items: TodoItem[]): void;
-  // 每轮 LLM 调用前调用, 返回 reminder 字符串 (空表示无提醒)
-  tickRound(): string;
-  // 检查是否全部完成
-  isAllCompleted(): boolean;
-}
-
-export function createTodoManager(): TodoManager { /* ... */ }</code></pre>
-
-<h2 id="state-machine">状态机: 不允许跳跃</h2>
-<p>TODO 状态机是本章的关键不变量。它有三条规则:</p>
+interface TodoList {
+  items: Todo[];
+  // 由 TodoManager 维护, LLM 不直接调
+  activeId: string | null;  // 当前 in_progress 的 id
+}</code></pre>
+<p class="source-link"><a href="https://github.com/pingp76/swoopcode/blob/main/src/todo.ts#L1" rel="noreferrer" target="_blank">GitHub · src/todo.ts TodoManager 数据结构 (L1)</a></p>
+<p>
+  4 个状态的语义边界:
+</p>
+<table class="terms">
+<thead>
+<tr>
+<th>状态</th>
+<th>含义</th>
+<th>谁设置</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><code>pending</code></td>
+<td>已建, 没开始</td>
+<td>LLM 创建 / 取消后又恢复</td>
+</tr>
+<tr>
+<td><code>in_progress</code></td>
+<td>正在做</td>
+<td>LLM 标记</td>
+</tr>
+<tr>
+<td><code>completed</code></td>
+<td>做完</td>
+<td>LLM 标记</td>
+</tr>
+<tr>
+<td><code>cancelled</code></td>
+<td>放弃 (用户改主意 / 任务已不必要)</td>
+<td>LLM 标记</td>
+</tr>
+</tbody>
+</table>
+<p>
+  为什么不让 LLM "删除" TODO? 删了之后 history 看不到, 调试时无法
+  追溯"为什么这一项没了"。 取消比删除更可追溯。 同样的逻辑在 git 里
+  也成立: <code>git reset --hard</code> 比 <code>git revert</code> 危险。
+  </p>
+<p>
+  id 字段为什么用短 hash 不用 UUID? 因为 LLM 要在 tool_call 里
+  引用 id, 短 hash 比 UUID 更易读、易打字, LLM 出错率低。 短 hash
+  取内容 hash 值的前 8 个字符, 冲突概率 session 内可忽略。
+  </p>
+<h2 id="six-tools">6 个工具设计</h2>
+<p>
+  <strong>用途</strong>: LLM 怎么操作 TODO? 通过 6 个工具。 设计原则是
+  "覆盖 LLM 90% 操作, 不留歧义, 不写批量工具"。
+  </p>
+<p>
+  <strong>真实场景</strong>: 跑一个 5 步任务, LLM 操作 TODO 的路径是:
+  create → list (确认) → update in_progress → (做工具调用) → update completed
+  → create next → ... → 全部 completed。 6 个工具覆盖这个循环。
+  </p>
+<p>
+  <strong>设计思想</strong>: 经典<strong>CRUD 分解</strong> — create / list /
+  get / update 是 4 个基础操作, clear / set_active 是 2 个辅助操作。
+  不写"批量更新" 工具 — 批量工具让 LLM "刷" 状态, 失去逐条追踪的
+  价值。
+  </p>
+<p>
+  <strong>实现细节</strong>:
+</p>
+<pre class="code-block"><code>run_todo_create(content: string): { id: string; content: string }
+run_todo_update(id: string, status: TodoStatus): { ok: boolean }
+run_todo_list(): { items: Todo[] }
+run_todo_get(id: string): { todo: Todo | null }
+run_todo_clear(): { ok: boolean }                  // 清空所有
+run_todo_set_active(id: string): { ok: boolean }   // 标记当前在做</code></pre>
+<p class="source-link"><a href="https://github.com/pingp76/swoopcode/blob/main/src/tools/run-todo.ts#L1" rel="noreferrer" target="_blank">GitHub · src/tools/run-todo.ts 6 个 TODO 工具 (L1)</a></p>
+<p>
+  6 个工具的边界:
+</p>
 <ol>
-  <li>新增: 任意状态都可以新增一条 pending。</li>
-  <li>激活: pending → in_progress, 任何时候一个清单只允许一条 in_progress。</li>
-  <li>
-    完成: in_progress → completed, 不允许 pending → completed (跳过
-    in_progress)。
+<li>
+<strong>create / update / list / get 是基础 CRUD</strong>: LLM
+    90% 的操作在这 4 个里。
+  </li>
+<li>
+<strong>clear 留给"任务完成 / 用户重置"</strong>: 不是常用, 但偶尔需要。
+  </li>
+<li>
+<strong>set_active 不是 LLM 必需的</strong>: 被标成 <code>in_progress</code> 的那一条
+    会自动成为 active, 只有在 LLM 想显式"切换" 时才用。 这一条留着
+    但少用, 别给 LLM 添堵。
   </li>
 </ol>
 <p>
-  这三条规则保证 TODO 状态对 LLM 是可读的: 任意时刻, "in_progress"
-  那条就是"我现在在做的事"。如果 LLM 想把 pending 直接标 completed, harness 在
-  <code>update()</code> 里 reject 这条, 写一条 error tool message 告诉 LLM。
+  ❌ / ✅: 不要给 LLM 写"批量更新" 工具。
 </p>
+<pre class="code-block"><code>// ❌ run_todo_bulk_update 一次改 5 条
+// LLM 会用它"刷" 所有 TODO 状态, 失去 TODO 的"逐条追踪" 价值
 
-<h2 id="loop-injection">loop 注入点: reminder 而不是 system prompt</h2>
+// ✅ 一次只改一条, 显式逐条
+// LLM 必须在 history 里看到"第 1 条变 completed" "第 2 条变 in_progress" 的过程</code></pre>
+<p>
+  同样的逻辑: 不要写"删除" 工具。 删除是丢弃, 取消是状态变化。
+  状态变化留在 history 里, 删除不留。
+  </p>
+<h2 id="manager-as-closure">TodoManager — 闭包 + factory</h2>
+<p>
+  <strong>用途</strong>: 维护 TODO 状态, 提供 6 个工具给 LLM, 并为 reminder (给 LLM)
+  和进度条 (给用户) 提供同一份数据。
+  </p>
+<p>
+  <strong>真实场景</strong>: 创建 3 条 TODO, 把其中一条标 in_progress,
+  把第 2 条标 completed, 把第 3 条取消 — 4 个操作, manager
+  内部维护 activeId 指针, 渲染 reminder 时按创建时间 (createdAt) 排序。
+  </p>
 <p>
-  TODO 状态变化很频繁 (几乎每轮都可能变), 但它不能进 system prompt。 折中方案:
-  每轮 LLM 调用前, TODO 状态被格式化成 reminder 字符串, 作为 user 消息注入到
-  messages 末尾。
+  <strong>设计思想</strong>: 经典<strong>工厂 + 闭包</strong> 模式 (Reference
+  章节的模式 1)。 状态藏在闭包内, 父子 agent 天然隔离。 不用 class,
+  避免 <code>this</code> 绑定问题。
+  </p>
+<p>
+  <strong>实现细节</strong>:
 </p>
-<pre class="code-block"><code>// 教学简化版, 仓库真实实现见 GitHub 永久链接
-async function run(query: string): Promise&lt;string&gt; {
-  history.add({ role: "user", content: query });
-
-  for (;;) {
-    // 第 03 章新增: 每轮注入 TODO reminder
-    const reminder = todo.tickRound();
-    if (reminder) {
-      history.add({ role: "user",
-        content: `&lt;system-reminder source="todo"&gt;\n${reminder}\n&lt;/system-reminder&gt;` });
-    }
-
-    const messages = history.getMessages();
-    const assistant = await llm.chat(messages);
-    history.add(assistant);
-
-    if (!assistant.tool_calls || assistant.tool_calls.length === 0) {
-      return assistant.content;
-    }
-
-    for (const call of assistant.tool_calls) {
-      const tool = registry.get(call.name);
-      if (!tool) { /* ... */ continue; }
-      const result = await tool.execute(call.args);
-      history.add({ role: "tool", tool_call_id: call.id, content: result.content });
-    }
-  }
+<pre class="code-block"><code>export function createTodoManager(): TodoManager {
+  const items: Map&lt;string, Todo&gt; = new Map();
+  let activeId: string | null = null;
+
+  return {
+    create(content: string): Todo {
+      const todo: Todo = {
+        id: hashId(content),
+        content,
+        status: "pending",
+        createdAt: Date.now(),
+        updatedAt: Date.now(),
+      };
+      items.set(todo.id, todo);
+      return todo;
+    },
+    update(id: string, status: TodoStatus): boolean {
+      const todo = items.get(id);
+      if (!todo) return false;
+      todo.status = status;
+      todo.updatedAt = Date.now();
+      if (status === "in_progress") activeId = id;
+      else if (status === "completed" || status === "cancelled") {
+        if (activeId === id) activeId = null;
+      }
+      return true;
+    },
+    list(): Todo[] { return [...items.values()].sort((a, b) =&gt; a.createdAt - b.createdAt); },
+    // ... 其他方法
+  };
 }</code></pre>
+<p class="source-link"><a href="https://github.com/pingp76/swoopcode/blob/main/src/todo.ts#L1" rel="noreferrer" target="_blank">GitHub · src/todo.ts createTodoManager 完整实现 (L1)</a></p>
+<p>
+  <code>activeId</code> 不用 LLM 主动管理, TodoManager 自己根据 update 维护。
+  这是"harness 的内部状态" 和"LLM 看到的" 的边界 — LLM 调工具,
+  harness 自己维护 active 指针, LLM 不需要知道这个概念。
+  </p>
 <p>
-  reminder 用 <code>&lt;system-reminder&gt;</code> 标签包起来, LLM 看到这种
-  标签就知道"这是 harness 注入的, 不是用户写的", 不会把它当成普通 user
-  消息去回应。
+  为什么 TodoManager 是工厂函数, 不是 class?
 </p>
-
+<ol>
+<li>
+<strong>闭包天然隔离</strong>: 工厂返回的对象捕获 <code>items</code> 和
+    <code>activeId</code>, 每个 agent 实例独立闭包, 父子隔离自然。
+  </li>
+<li>
+<strong>无 this 绑定问题</strong>: class 的方法被当作 callback 传出去时容易丢失
+    <code>this</code>, <code>this.items</code> 随之取不到。 闭包没这个问题。
+  </li>
+<li>
+<strong>易测试</strong>: 测试时直接 <code>createTodoManager()</code>,
+    不用 mock 任何东西。
+  </li>
+</ol>
+<h2 id="history-rendering">怎么进 history: turn reminder 模式</h2>
+<p>
+  <strong>用途</strong>: TODO 状态需要被 LLM 看到, 但不能破坏 prompt cache。
+  走 turn reminder 模式 (user message 末尾)。
+  </p>
+<p>
+  <strong>真实场景</strong>: agent 跑 5 步任务, TODO 状态变化, 下一轮
+  chat() 前 reminder 注入到 messages 末尾, LLM 看到 "进度 [2/5]",
+  知道还剩 3 步。
+  </p>
+<p>
+  <strong>设计思想</strong>: 经典<strong>稳定前缀分离</strong> — TODO 状态
+  走 messages 末尾 (动态 tail), 不拼到 system prompt (稳定前缀),
+  保持 prompt cache 命中。 这是第 10 章的核心思想, 本章是它的
+  第一次具体应用。
+  </p>
+<p>
+  <strong>实现细节</strong>:
+</p>
+<pre class="code-block"><code>function buildTodoReminder(todos: Todo[]): string {
+  if (todos.length === 0) return "";
+  const lines = todos.map(t =&gt; {
+    const icon = t.status === "completed" ? "✓" : t.status === "in_progress" ? "→" : t.status === "cancelled" ? "✗" : "·";
+    return `${icon} [${t.id}] ${t.content}`;
+  });
+  return `&lt;system-reminder source="todo"&gt;
+当前 TODO 状态:
+${lines.join("\n")}
+&lt;/system-reminder&gt;`;
+}</code></pre>
+<p class="source-link"><a href="https://github.com/pingp76/swoopcode/blob/main/src/agent.ts#L1" rel="noreferrer" target="_blank">GitHub · src/agent.ts buildTodoReminder 实现 (L1)</a></p>
+<p>
+  注意 4 件事:
+</p>
+<ol>
+<li>
+<strong>走 user message, 不走 system prompt</strong>: reminder
+    是动态状态, 不进 stable prefix。
+  </li>
+<li>
+<strong>每轮注入, 但只在 TODO 变化时更新内容</strong>: 不变就不重复生成。
+  </li>
+<li>
+<strong>用简洁符号</strong>: ✓ → ✗ · 比文字描述省 token, LLM 看得更清楚。
+    一个完成符号 + 一个 id + 描述, 总共 50 字符内, 比"Task t1 is
+    in completed state" 省 70% token。
+  </li>
+<li>
+<strong>空列表返回空字符串</strong>: 不污染 history, 不浪费 token。
+  </li>
+</ol>
+<p>
+  什么时候注入 reminder? 在 <code>agent.run()</code> 的第 2 步
+  (拿 messages) 之前, 调 <code>buildTodoReminder(manager.list())</code>,
+  把字符串作为最后一条 user message 加进 messages。 LLM 看到的
+  messages 序列:
+</p>
+<ol>
+<li>user: 原 query</li>
+<li>assistant: 调 run_todo_create</li>
+<li>tool: 创建成功</li>
+<li>user: &lt;system-reminder&gt; 当前 TODO 状态: ...&lt;/system-reminder&gt;</li>
+<li>assistant: 调 run_todo_update</li>
+<li>tool: 更新成功</li>
+<li>...</li>
+</ol>
+<p>
+  reminder 在每轮的开头出现, LLM 一眼能扫到"还有哪些没做"。
+  </p>
+<h2 id="todo-vs-task">TODO vs Task: 边界</h2>
+<figure class="figure">
+<div class="flow-compare" role="img" aria-label="TODO vs Task 对比">
+<div class="flow-compare__col flow-compare__col--good">
+<div class="flow-compare__label">TODO (本章)</div>
+<span class="flow-node">session 内临时</span>
+<span class="flow-node">进程退出就丢</span>
+<span class="flow-node">闭包内状态</span>
+<span class="flow-node">无依赖图</span>
+<span class="flow-node">5-10 条 / session</span>
+</div>
+<div class="flow-compare__col flow-compare__col--good">
+<div class="flow-compare__label">TASK (第 12 章)</div>
+<span class="flow-node">跨会话持久化</span>
+<span class="flow-node">存到 ~/.swoopcode/tasks/</span>
+<span class="flow-node">依赖图 + owner</span>
+<span class="flow-node">几十条 / 跨项目</span>
+<span class="flow-node">完成后归档</span>
+</div>
+</div>
+<figcaption>图 03-2 · TODO vs Task. 决策规则: "当前 session 临时步骤" 走 TODO, "3 周后还要跟的计划" 走 Task。</figcaption>
+</figure>
+<figure class="figure">
+<div class="flow-map" role="img" aria-label="reminder 注入流程">
+<div class="flow-row--center">
+<span class="flow-node">agent.run<br/><small>第 1 步</small></span>
+<span class="flow-arrow">→</span>
+<span class="flow-node">user query<br/><small>history.add</small></span>
+<span class="flow-arrow">→</span>
+<span class="flow-node flow-node--accent">llm.chat<br/><small>第 2 步</small></span>
+</div>
+<div class="flow-row--center" style="margin-top: var(--space-3)">
+<span class="flow-node">history.add(assistant)</span>
+<span class="flow-arrow">→</span>
+<span class="flow-node">调工具</span>
+<span class="flow-arrow">→</span>
+<span class="flow-node">history.add(tool result)</span>
+</div>
+<div class="flow-row--center" style="margin-top: var(--space-3)">
+<span class="flow-node">回到顶部前<br/><small>第 2.5 步</small></span>
+<span class="flow-arrow">→</span>
+<span class="flow-node flow-node--accent">buildTodoReminder<br/><small>list() → 渲染</small></span>
+<span class="flow-arrow">→</span>
+<span class="flow-node">user message<br/><small>&lt;system-reminder&gt;</small></span>
+</div>
+</div>
+<figcaption>图 03-3 · reminder 注入流程. reminder 走 user message 末尾, 不进 system prompt, 保持 stable prefix。</figcaption>
+</figure>
+<p>
+  <strong>用途</strong>: TODO 和 Task 看起来都是"任务列表", 但它们
+  活在不同的生命周期里。 搞混了, 你要么把"今天的小事" 全建 Task Group
+  (过载), 要么把"3 周后还要跟的计划" 塞进 TODO (丢了)。
+  </p>
+<p>
+  <strong>真实场景</strong>: 用户跑 50 个 session, 用 TODO 管每个 session
+  内的临时步骤, 用 Task 管跨 session 的长期计划。 如果用错,
+  session 退出后 TODO 列表没了, 3 周前的 Task Group 也不见了。
+  </p>
+<p>
+  <strong>设计思想</strong>: 决策规则 (给 LLM 和教程读者同时用):
+</p>
+<ol>
+<li>
+<strong>当前 session 的"3 步走完就忘" 任务</strong> → TODO。
+  </li>
+<li>
+<strong>"3 周后还要跟" 的计划, 跨项目, 多人协作</strong> → Task。
+  </li>
+</ol>
+<table class="terms">
+<thead>
+<tr>
+<th>维度</th>
+<th>TODO (本章)</th>
+<th>Task (第 12 章)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>生命周期</td>
+<td>session 内, 进程退出就丢</td>
+<td>跨会话, 持久化到 agentHome</td>
+</tr>
+<tr>
+<td>用途</td>
+<td>当前 session 的临时步骤</td>
+<td>跨项目的长期计划, 多 owner</td>
+</tr>
+<tr>
+<td>数据位置</td>
+<td>agent 闭包内</td>
+<td><code>~/.swoopcode/tasks/</code></td>
+</tr>
+<tr>
+<td>依赖图</td>
+<td>无, 自由顺序</td>
+<td>有, task 之间可以 blocks / blockedBy</td>
+</tr>
+<tr>
+<td>是否阻塞</td>
+<td>不阻塞, 仅追踪</td>
+<td>可以阻塞另一个 task</td>
+</tr>
+</tbody>
+</table>
+<p>
+  这条规则要写进 system prompt, 避免 LLM 混淆。 验证方法: 跑
+  50 个真实 session, 看 LLM 建 Task Group 的频率。 如果每个 session
+  都建 3 个 Task Group, 说明 prompt 没强调边界, 改。
+  </p>
+<h2 id="user-display">用户视角: 怎么看到 TODO 进度</h2>
+<p>
+  <strong>用途</strong>: LLM 内部用 TODO 是为了"自己不忘", 用户看到 TODO
+  状态是副产品。 REPL 怎么展示。
+  </p>
+<p>
+  <strong>真实场景</strong>: 用户跑一个 5 步任务, REPL 顶部展示
+  <code>[2/5] 改 config.ts ...</code>, 用户随时知道 agent 在做哪一步。
+  这不是装饰, 是给用户一个"卡住还是正常" 的判断依据。
+  </p>
+<p>
+  <strong>设计思想</strong>: 渲染走<strong>独立</strong>UI 路径, 跟 LLM 看到的
+  reminder 走两条路。 数据同源 (manager.list()), UI 渲染不同 —
+  LLM 看到完整 reminder, 用户看到简洁进度条。
+  </p>
+<p>
+  <strong>实现细节</strong>:
+</p>
+<pre class="code-block"><code>function renderTodoBar(todos: Todo[]): string {
+  const total = todos.length;
+  const done = todos.filter(t =&gt; t.status === "completed").length;
+  const current = todos.find(t =&gt; t.status === "in_progress");
+  if (total === 0) return "";
+  const bar = `[${done}/${total}]`;
+  return current
+    ? `${bar} ${current.content}...`
+    : done === total ? `${bar} 全部完成` : `${bar} 等待下一步`;
+}</code></pre>
+<p class="source-link"><a href="https://github.com/pingp76/swoopcode/blob/main/src/repl.ts#L1" rel="noreferrer" target="_blank">GitHub · src/repl.ts renderTodoBar 进度条 (L1)</a></p>
+<p>
+  REPL 顶部展示这一行, 用户随时知道 agent 在做哪一步。 跑长任务时,
+  用户每 30 秒扫一眼, 看到 [2/5] → [3/5] 知道正常, 看到 [1/5] 卡了
+  5 分钟知道有问题。
+  </p>
+<h2 id="fake-test">fake LLM 测试: 完成 3 步才返回</h2>
+<p>
+  <strong>用途</strong>: TODO 系统最容易写错的不是 create, 是 LLM 跳过
+  TODO 直接动手。 写一个测试, 强制 LLM 必须先建 TODO 再调工具。
+  </p>
+<p>
+  <strong>真实场景</strong>: 跑一个 case, LLM 应该先建 TODO 再做, fake LLM
+  模拟"不建 TODO 直接调工具" 的坏路径, 测试立刻挂, 提醒工具作者
+  写 todo reminder。
+  </p>
+<p>
+  <strong>设计思想</strong>: 经典<strong>fake LLM 反向断言</strong> — 写一个
+  fake LLM 跑"坏路径", 看测试是否抓到。 跟"正常路径" 测试配对, 覆盖
+  双向。
+  </p>
+<p>
+  <strong>实现细节</strong>:
+</p>
+<pre class="code-block"><code>test("agent 必须建 TODO 才能调 run_read", async () =&gt; {
+  const toolCalls: string[] = [];
+  const fakeLLM = createFakeLLM([
+    // 第一次: 拒绝调工具, 先建 TODO
+    {
+      content: null,
+      toolCalls: [
+        { id: "t1", function: { name: "run_todo_create", arguments: '{"content":"Read package.json"}' } },
+      ],
+      finishReason: "tool_calls",
+    },
+    // 第二次: 标 in_progress, 调 run_read
+    {
+      content: null,
+      toolCalls: [
+        { id: "t2", function: { name: "run_todo_update", arguments: '{"id":"t1","status":"in_progress"}' } },
+        { id: "t3", function: { name: "run_read", arguments: '{"path":"package.json"}' } },
+      ],
+      finishReason: "tool_calls",
+    },
+    // 第三次: 标 completed, 回答
+    {
+      content: null,
+      toolCalls: [
+        { id: "t4", function: { name: "run_todo_update", arguments: '{"id":"t1","status":"completed"}' } },
+      ],
+      finishReason: "tool_calls",
+    },
+    { content: "Test: npm test", toolCalls: [], finishReason: "stop" },
+  ]);
+  const agent = createAgent({ llm: fakeLLM, history, tools });
+  await agent.run("What's the test command?");
+
+  // 验证: tool_call 序列含 todo 工具
+  const callNames = fakeLLM.allCalls().map(c =&gt; c.function.name);
+  expect(callNames).toContain("run_todo_create");
+  expect(callNames).toContain("run_todo_update");
+});</code></pre>
+<p class="source-link"><a href="https://github.com/pingp76/swoopcode/blob/main/src/agent.test.ts#L1" rel="noreferrer" target="_blank">GitHub · src/agent.test.ts TODO 流程测试 (L1)</a></p>
+<p>
+  这条测试里 fake LLM 走的是写好脚本的正常路径, 它本身不会"逼" LLM 建 TODO; 它的作用是兜底:
+  如果把脚本换成"跳过 TODO 直接调工具" 的坏路径, 断言找不到 run_todo_create, 测试就会失败。
+  </p>
+<h2 id="common-confusion">3 个常见误解</h2>
+<dl class="defs">
+<dt>误解 1 · "TODO 应该进 system prompt 才有保证"</dt>
+<dd>
+    错。 走 reminder (user message) 和走 system prompt 一样能让 LLM
+    看到。 区别是 cache 命中率: 走 system prompt 破坏 cache, 走 reminder
+    不破坏。 功能上等价, 性能上不同。
+  </dd>
+<dt>误解 2 · "TODO 是给用户看的"</dt>
+<dd>
+    一半对一半错。 TODO 主要是给 LLM 看的 (避免失忆), 用户看的是
+    REPL 顶部那行 [2/5] 进度条。 两个 UI 独立渲染, 数据同源。
+  </dd>
+<dt>误解 3 · "持久化的 Task 也能用 run_todo_*"</dt>
+<dd>
+    错。 跨会话的 Task 走 run_task_*, 不走 run_todo_*。 边界要分清,
+    不要因为"今天的小事" 用 Task, 也不要因为"3 周的计划" 用 TODO。
+  </dd>
+</dl>
 <h2 id="trap">反例梯度</h2>
-
 <div class="card card--trap">
-  <div class="card__head">
-    <span class="card__tag">新手错法 · A</span>
-  </div>
-  <div class="card__body">
-    <p><strong>常见错误:</strong>TODO 状态作为 module-level 单例。</p>
-    <p>
-      <strong>为什么错:</strong>第 04 章子智能体需要独立的 TODO 列表,
-      单例会让子智能体的 TODO 污染主 agent。
+<div class="card__head">
+<span class="card__tag">新手错法 · A</span>
+</div>
+<div class="card__body">
+<p>
+<strong>常见错误:</strong> TODO 列表放 module-level 单例。
     </p>
-    <p>
-      <strong>正确做法:</strong>本章已示范,
-      <code>createTodoManager()</code> 是工厂, Composition Root 在
-      <code>index.ts</code> 创建并注入。
+<p>
+<strong>为什么错:</strong> 子智能体共享父 agent 的 TODO, 跨上下文污染。
+    </p>
+<p>
+<strong>正确做法:</strong> TodoManager 在 <code>createAgent()</code> 闭包内,
+      父子隔离天然成立。
     </p>
-  </div>
 </div>
-
+</div>
 <div class="card card--trap">
-  <div class="card__head">
-    <span class="card__tag">中级错法 · B</span>
-  </div>
-  <div class="card__body">
-    <p>
-      <strong>常见错误:</strong>不限制状态机, 允许 LLM 把 pending 直接
-      completed。
+<div class="card__head">
+<span class="card__tag">中级错法 · B</span>
+</div>
+<div class="card__body">
+<p>
+<strong>常见错误:</strong> TODO 状态拼到 system prompt 字符串。
     </p>
-    <p>
-      <strong>为什么错:</strong>LLM 倾向于"全部勾选", harness 必须强制它走过
-      in_progress, 否则 "in_progress 是不是当前正在做" 这条不变量就崩了。
+<p>
+<strong>为什么错:</strong> 破坏 prompt cache, 每轮都重传完整 TODO 状态。
     </p>
-    <p>
-      <strong>正确做法:</strong>在 <code>update()</code> 入口校验, 拒绝
-      pending→completed 跳跃, 写 error tool message。
+<p>
+<strong>正确做法:</strong> 走 turn reminder (user message), 状态变化才更新。
     </p>
-  </div>
 </div>
-
+</div>
 <div class="card card--trap">
-  <div class="card__head">
-    <span class="card__tag">高级错法 · C</span>
-  </div>
-  <div class="card__body">
-    <p><strong>常见错误:</strong>reminder 直接放在 system prompt 末尾。</p>
-    <p>
-      <strong>为什么错:</strong>破坏第 10 章的 cache-friendly 布局, system
-      prompt 应当是稳定前缀, TODO 是动态状态。
+<div class="card__head">
+<span class="card__tag">高级错法 · C</span>
+</div>
+<div class="card__body">
+<p>
+<strong>常见错误:</strong> 允许 LLM 删除 TODO。
+    </p>
+<p>
+<strong>为什么错:</strong> 删了之后 history 看不到, 调试时无法追溯"为什么这一项没了"。
     </p>
-    <p>
-      <strong>正确做法:</strong>reminder 作为 user 消息, 标签
-      <code>&lt;system-reminder source="todo"&gt;</code>。
+<p>
+<strong>正确做法:</strong> 用 <code>cancelled</code> 状态而不是删除, 保留追溯能力。
     </p>
-  </div>
 </div>
-
+</div>
 <div class="card card--trap">
-  <div class="card__head">
-    <span class="card__tag">边界错法 · D</span>
-  </div>
-  <div class="card__body">
-    <p>
-      <strong>常见错误:</strong>LLM 调 update_todo 时不传 tool_call_id, 或者
-      role 写成 "user"。
+<div class="card__head">
+<span class="card__tag">边界错法 · D</span>
+</div>
+<div class="card__body">
+<p>
+<strong>常见错误:</strong> TODO 工具和 Task 工具都给 LLM, 让它自己选。
     </p>
-    <p>
-      <strong>为什么错:</strong>回顾第 02 章的 tool call 协议, role 必须是
-      "tool", id 必须来自 LLM。
+<p>
+<strong>为什么错:</strong> LLM 不知道边界, 容易把"今天的小事" 都建 Task Group。
     </p>
-    <p>
-      <strong>正确做法:</strong>第 02 章的约束全部沿用,
-      不要在本章单独发明一套协议。
+<p>
+<strong>正确做法:</strong> system prompt + tool description 都明确"3 步以内走 TODO,
+      跨会话走 Task"。
     </p>
-  </div>
 </div>
-
+</div>
 <h2 id="validate">如何验证 (本章 Validation 卡片)</h2>
 <div class="card card--validation">
-  <div class="card__head">
-    <span class="card__tag">Validation · 第 03 章</span>
-  </div>
-  <div class="card__body">
-    <p>
-      <strong>TODO 状态机拒绝跳跃:</strong>fake LLM 调用 update_todo 试图把
-      pending 直接 completed, harness 写一条 error tool message, TODO 状态没变。
-    </p>
-    <p>
-      <strong>reminder 注入位置正确:</strong>fake LLM 第一轮不调 update_todo,
-      第二轮收到的 messages 仍然包含
-      <code>&lt;system-reminder source="todo"&gt;</code>
-      标签, 描述初始 TODO 状态。
+<div class="card__head">
+<span class="card__tag">Validation · 第 03 章</span>
+</div>
+<div class="card__body">
+<p>
+<strong>创建 / 更新 / 完成 / 列表 4 个工具都跑通:</strong> fake LLM
+      调 4 个 run_todo_* 工具, TodoManager 状态正确变化。
     </p>
-    <p>
-      <strong>reminder 不在 system prompt:</strong>断言 messages[0].role !==
-      "system" 仍然成立, reminder 是 user 角色。
+<p>
+<strong>reminder 进 user message:</strong> history 的最后一条 user message
+      含 <code>&lt;system-reminder source="todo"&gt;</code>, 不在 system prompt。
     </p>
-    <p>
-      <strong>in_progress 唯一:</strong>fake LLM 试图把两条都标 in_progress,
-      harness 拒绝, 写 error tool message。
+<p>
+<strong>父 / 子 agent TODO 隔离:</strong> 父 agent 建 TODO 后 spawn 子 agent,
+      子 agent 的 reminder 为空, 看不到父的 TODO。
     </p>
-    <p>
-      <strong>tickRound 幂等:</strong>连续两次 <code>todo.tickRound()</code>
-      返回同一份 reminder 字符串, 不会"消耗"状态。
+<p>
+<strong>无 TODO 不渲染 reminder:</strong> 列表为空时, reminder 是空字符串,
+      不污染 history。
     </p>
-  </div>
 </div>
-
-<h2 id="lookback">回望第 00–02 章: 哪些原则在本章兑现了</h2>
+</div>
+<h2 id="lookback">回望: 哪些原则在本章兑现了</h2>
 <ul>
-  <li>
-    <strong>工厂模式:</strong>TODO 是工厂, 不是单例, 为第 04 章子智能体隔离 TODO
-    列表做准备。
-  </li>
-  <li>
-    <strong>不污染 system prompt:</strong>reminder 用 user 消息 + 标签, 保留
-    cache-friendly 布局 (第 10 章的伏笔)。
-  </li>
-  <li>
-    <strong>tool call 协议沿用:</strong>update_todo 复用第 02 章的 Tool
-    interface, 不发明新协议。
-  </li>
-  <li>
-    <strong>状态机约束:</strong>不变量写进 update() 入口, 用 Validation
-    卡片"状态机拒绝跳跃"和"in_progress 唯一"反向断言。
-  </li>
+<li>
+<strong>工厂模式</strong>: TodoManager 在 <code>createAgent()</code> 闭包内。
+    </li>
+<li>
+<strong>Stable prefix 优先</strong>: TODO 状态走 reminder, 不进 system prompt。
+    </li>
+<li>
+<strong>窄接口</strong>: 6 个工具, 每个职责单一, 不写批量工具。
+    </li>
+<li>
+<strong>fake 测试</strong>: 不依赖真实 LLM, 用 scripted 验证 TODO 流。
+    </li>
 </ul>
-
 <h2 id="forward">前瞻张力: 留给后续章节</h2>
 <dl class="defs">
-  <dt>子智能体用 TODO</dt>
-  <dd>
-    第 04 章会让子智能体继承父 agent 的 dependencies, 但 TODO 列表必须独立
-    (本章工厂模式已经在准备这件事)。
-  </dd>
-  <dt>TODO 列表本身太长</dt>
-  <dd>
-    第 06 章 compress 会处理"TODO 列表本身占据 context" 的情况,
-    通常的方案是只把"前 3 条 in_progress" 注入 reminder, 不注入全表。
-  </dd>
-  <dt>TODO 跨会话保留</dt>
-  <dd>
-    第 09 章 memory 会讨论"TODO 是不是用户级长期事实", 倾向是: 当次会话的 TODO
-    不入 memory, 跨会话的 plan 入 task (第 12 章)。
-  </dd>
-  <dt>TODO 触发 reminder 频率</dt>
-  <dd>
-    第 13 章 async run 会有"后台 agent 完成后通知", 那时也会复用 reminder 机制,
-    不会单独发明通道。
-  </dd>
+<dt>TODO 不持久化</dt>
+<dd>
+    session 退出就丢, 用户重启 agent 后 TODO 列表没了。 第 12 章的
+    Task 系统是跨会话持久化版本, 但边界要分清: TODO 是 session
+    短期步骤, Task 是跨会话长期计划。
+    </dd>
+<dt>子智能体 TODO 继承</dt>
+<dd>
+    第 04 章 SubAgent 是否应该继承父的 TODO? 第一版选择"不继承",
+      各自管自己的 TODO 列表。
+    </dd>
 </dl>
-
-<h2 id="vibe-coding-03">本次如何 vibe code: 第 03 章的三件套</h2>
-
-<h3 id="vibe-feed-03">拆卡: 4 轮迭代的具体产物</h3>
-<ol>
-  <li>
-    <strong>第 1 轮 · 接口</strong>。让 LLM 给出 <code>TodoItem</code> /
-    <code>TodoStatus</code> / <code>TodoManager</code> 三个 interface,
-    以及状态机规则文档 (3 条规则 + 一段说明)。本轮不写实现。
-  </li>
-  <li>
-    <strong>第 2 轮 · 接线</strong>。让 LLM 给出 <code>index.ts</code> 接线,
-    todo 实例是 stub (返回空数组)。本轮 review 重点: todo 与 history / llm /
-    registry 是同级依赖, 都从 <code>index.ts</code> 注入。
-  </li>
-  <li>
-    <strong>第 3 轮 · 边界</strong>。让 LLM 写 createTodoManager() + update_todo
-    工具 + agent.ts 注入 reminder。本轮 review 重点: update() 入口的状态机校验,
-    reminder 标签格式。
-  </li>
-  <li>
-    <strong>第 4 轮 · 验证</strong>。让 LLM 写 <code>test/todo.test.ts</code> +
-    <code>test/agent.todo.test.ts</code>。本轮 review 重点: Validation 卡片 5
-    条都要在测试里, 特别是"reminder 不在 system prompt"和"tickRound 幂等"。
-  </li>
-</ol>
-
-<h3 id="vibe-review-03">Review: 第 03 章专属 checklist</h3>
-<ol>
-  <li>
-    <strong>reminder 标签格式严格。</strong>必须
-    <code>&lt;system-reminder source="todo"&gt;...&lt;/system-reminder&gt;</code
-    >, source 写 "todo", 不是 "todo-manager" 之类的自由发挥。验证: grep
-    <code>source="todo"</code> 在 agent.ts 内 ≥ 1 行。
-  </li>
-  <li>
-    <strong>状态机校验在 update() 入口。</strong>不依赖上层 agent 去校验,
-    否则子智能体复用 TODO 时会绕过校验。验证:
-    <code>grep -n 'pending' src/todo.ts</code> 在 update() 函数体内 ≥ 1 行
-    (说明有校验)。
-  </li>
-  <li>
-    <strong>tickRound 无副作用。</strong>连续调用不修改 TODO 状态。验证:
-    写一个测试 <code>tickRound(); tickRound()</code> 两次后状态没变。
-  </li>
-  <li>
-    <strong>TODO 工厂不是单例。</strong>复用前两章检查:
-    <code>grep -n 'export const todo' src/</code> 应当 0 行。
-  </li>
-  <li>
-    <strong>不污染 history 的稳定结构。</strong>TODO 数据本身不进
-    history.getMessages(), 只把 reminder 字符串作为 user 消息写入。验证:
-    跑完一轮后 history.getMessages() 不含 TodoItem 类型对象。
-  </li>
-</ol>
-
-<h3 id="vibe-debug-03">调试: 第 03 章典型伪装</h3>
-<ol>
-  <li>
-    <strong>伪装 A · 状态机校验写在 agent.ts 里。</strong>症状: agent.ts 有
-    <code
-      >if (item.status === "pending" &amp;&amp; newStatus === "completed")
-      reject(...)</code
-    >。这意味着子智能体用自己的 todo manager 时会绕过校验。验证:
-    <code>grep -n 'pending.*completed' src/agent.ts</code> 应当 0 行, 应当在
-    todo.ts 内。
-  </li>
-  <li>
-    <strong>伪装 B · reminder 拼到 system prompt 字符串里。</strong>症状: system
-    prompt 末尾出现 "Current TODO: ..."。验证: 跑完一轮后 messages[0].content
-    不应含 "Current TODO"。
-  </li>
-  <li>
-    <strong>伪装 C · update_todo 写完不返回新状态。</strong>症状: 工具 execute
-    返回成功, 但下一轮 LLM 收到的 reminder 还是旧状态。验证: Validation
-    卡片"reminder 注入位置正确"那条, 必须断言"第二轮 reminder 的 content
-    与第一轮不同"。
-  </li>
-</ol>
-
-<h3 id="vibe-iterate-03">迭代: 第 03 章 4 个 commit 节点</h3>
-<ol>
-  <li>
-    <code
-      >feat(ch03): 钉 TodoItem / TodoStatus / TodoManager 接口与状态机规则</code
-    >
-    —— tsc 通过, 无实现。
-  </li>
-  <li>
-    <code>feat(ch03): createTodoManager 工厂 + update_todo 工具 stub</code> ——
-    tsc 通过, agent.run 仍未注入 reminder。
-  </li>
-  <li>
-    <code>feat(ch03): agent.ts 注入 reminder, 状态机校验在 update() 入口</code>
-    —— 跑通 Validation 卡片前 4 条。
-  </li>
-  <li>
-    <code>test(ch03): tickRound 幂等 + reminder 标签格式断言</code> —— 全绿。
-  </li>
-</ol>
-
 <h2 id="prompt-card">Prompt Card (本章任务)</h2>
 <div class="card card--prompt">
-  <div class="card__head">
-    <span class="card__tag">Prompt Card · 第 03 章</span>
-    <button class="card__copy" type="button" data-copy-card>复制</button>
-  </div>
-  <div class="card__body">
-    <p>
-      <strong>目标:</strong>给 agent loop 加 TODO Manager,
-      模型可以看到/更新工作清单, 状态机不允许跳跃。
+<div class="card__head">
+<span class="card__tag">Prompt Card · 第 03 章</span>
+<button class="card__copy" data-copy-card="" type="button">复制</button>
+</div>
+<div class="card__body">
+<p>
+<strong>目标:</strong> 给 harness 加 TODO Manager, LLM 可以创建 / 更新 /
+      完成 / 取消 TODO, 状态走 turn reminder 注入, 父子 agent 隔离。
     </p>
-    <p>
-      <strong>场景:</strong>用户输入 "重构 user 组件, 先读再改再测", agent
-      第一轮调 update_todo 创建 3 条 pending 并把第一条标 in_progress, 之后每轮
-      reminder 显示当前 in_progress 那条。
+<p>
+<strong>场景:</strong> 用户说"3 步: 读 package.json, 跑 npm test, 修
+      失败的 case", LLM 建 3 条 TODO, 逐条标 in_progress → completed,
+      用户在 REPL 看到 [0/3] → [1/3] → [2/3] → [3/3] 的进度。
     </p>
-    <p>
-      <strong>模块:</strong> <code>src/todo.ts</code> (新) 暴露
-      <code>createTodoManager()</code>;
-      <code>src/tools/update_todo.ts</code> (新) 实现 update_todo 工具;
-      <code>src/agent.ts</code> 每轮 LLM 调用前注入 reminder;
-      <code>src/index.ts</code> 接线 todo 到 agent 和 registry。
+<p>
+<strong>模块:</strong> <code>src/todo.ts</code> (新) 暴露 <code>createTodoManager()</code>;
+      <code>src/tools/run-todo.ts</code> (新) 6 个 todo 工具;
+      <code>src/agent.ts</code> (改) prepare messages 阶段调用
+      <code>buildTodoReminder(todoManager.list())</code>;
+      <code>src/index.ts</code> (改) 注入 todoManager。
     </p>
-    <p><strong>边界 (LLM 必须遵守的 checklist):</strong></p>
-    <ul>
-      <li>
-        TODO 状态机: pending → in_progress → completed, 拒绝 pending→completed
-        跳跃
-      </li>
-      <li>同一时刻只允许一条 in_progress</li>
-      <li>
-        reminder 用 <code>&lt;system-reminder source="todo"&gt;</code> 标签,
-        role 是 "user", 不是 "system"
-      </li>
-      <li>tickRound 幂等, 不修改 TODO 状态</li>
-      <li>TODO 数据本身不进 history.getMessages()</li>
-    </ul>
-    <p><strong>验证 (用 fake LLM + fake registry, 逐条落到 vitest):</strong></p>
-    <ul>
-      <li>
-        LLM 调 update_todo 把 pending 直接 completed, 写 error tool message,
-        TODO 状态不变
-      </li>
-      <li>LLM 试图激活第二条 in_progress, 写 error tool message</li>
-      <li>连续两次 tickRound() 返回同一份 reminder 字符串</li>
-      <li>跑完一轮后 messages[0].role !== "system" 仍然成立</li>
-      <li>
-        第二轮 LLM 收到的 messages 包含 reminder 标签, content 描述新 TODO 状态
-      </li>
-    </ul>
-  </div>
+<p><strong>边界 (LLM 必须遵守的 checklist):</strong></p>
+<ul>
+<li>TodoManager 在 <code>createAgent()</code> 闭包内, 不写 module-level 单例</li>
+<li>TODO 状态走 turn reminder (user message), 不进 system prompt</li>
+<li>6 个工具, 不写"批量更新" 工具, 每次只改一条</li>
+<li>不允许删除, 用 <code>cancelled</code> 状态代替</li>
+<li>TODO 和 Task 边界写进 system prompt: "3 步以内走 TODO, 跨会话走 Task"</li>
+</ul>
+<p><strong>验证 (用 fake LLM + vitest, 逐条断言):</strong></p>
+<ul>
+<li>4 个核心工具 (create / update / list / get) 跑通, 状态正确变化</li>
+<li>reminder 在 history 最后一条 user message, 不在 system prompt</li>
+<li>父 agent 建 TODO 后 spawn 子 agent, 子 agent 看不到父的 TODO</li>
+<li>TODO 列表为空时, reminder 是空字符串, 不污染 history</li>
+</ul>
+</div>
 </div>
-
 <h2 id="practice">本章练习</h2>
 <ol>
-  <li>
-    故意把状态机校验挪到 agent.ts, 跑测试, 看 Validation 卡片"in_progress 唯一"
-    是否还能抓到。
+<li>
+    把 TodoManager 提到 module-level, 跑测试, 看"父子 TODO 隔离" 是否抓到。
   </li>
-  <li>
-    在 reminder 里直接拼 TodoItem 对象的 JSON.stringify, 跑测试, 看是否破坏
-    system prompt 稳定性检查 (本节没有这条, 但你可以为第 10 章留一个伏笔)。
+<li>
+    把 TODO 状态拼到 system prompt 字符串, 跑测试, 看"reminder 不进 system
+      prompt" 是否抓到。
   </li>
-  <li>
-    把 update_todo 工具的 role 写成 "user", 跑测试, 看 Validation 卡片"tool
-    message role" 是否抓到 (这条是第 02 章的约束, 本章沿用)。
+<li>
+    加一个 <code>run_todo_delete</code> 工具, 跑测试, 看取消 vs 删除的
+      追溯性差异 (删了之后 history 看不到)。
   </li>
 </ol>
-
 <h2 id="summary">本章小结</h2>
 <p>
-  本章给 loop 加了一个"工作清单": TODO Manager。loop 现在每轮能看到 当前
-  in_progress 那条 TODO, 也能通过 update_todo 工具更新清单。状态机 不允许跳跃,
-  防止 LLM "全部勾选" 假装做完了。下一章 (第 04 章) 我们 会让 harness
-  学会"分身"——SubAgent, 让一个 agent 在 loop 内开第二个 agent 处理子任务, 但隔离
-  history 与 TODO。
+  TODO Manager 是 harness 给 LLM 加的"短期记忆"。 核心是 4 个设计:
 </p>
-
-<h2 id="next">下一章伏笔</h2>
+<ul>
+<li>
+<strong>session 闭包, 不持久化</strong>: 区分于第 12 章的 Task 系统,
+      边界写进 prompt。
+    </li>
+<li>
+<strong>6 个工具 + 4 状态</strong>: 拒绝"批量更新" 和"删除",
+      保留追溯能力。
+    </li>
+<li>
+<strong>走 reminder, 不走 system prompt</strong>: 保持 stable prefix,
+      prompt cache 友好。
+    </li>
+<li>
+<strong>工厂模式</strong>: TodoManager 闭包, 父子隔离天然成立。</li>
+</ul>
 <p>
-  第 03 章让 loop 有节奏, 但仍然只有一个 agent 在工作。第 04 章 SubAgent 会让
-  agent 在 loop 内"开第二个 agent" 处理子任务, 关键约束是: 子智能体的
-  history、todo、permission 都必须和父 agent 隔离, 但 tool registry
-  可以共享。这一章准备的"工厂模式"就是为这一步服务的。
+  下一章 (第 04 章) 教 agent 学会"分身" — SubAgent, 让长任务可以
+  委托子智能体做, 父 agent 不会被上下文撑爆。
 </p>
+</content>
\ No newline at end of file
diff --git a/tutorial/chapters/04-subagent.html b/tutorial/chapters/04-subagent.html
index 3890505..73b7a9a 100644
--- a/tutorial/chapters/04-subagent.html
+++ b/tutorial/chapters/04-subagent.html
@@ -1,502 +1,741 @@
 <p class="article__eyebrow">第 04 章 · 让 Agent 学会分身</p>
-<h1 class="article__title">在 Loop 内开第二个 Agent: SubAgent</h1>
+<h1 class="article__title">SubAgent: 长任务委托, 父 agent 不被撑爆</h1>
 <p class="article__lede">
-  前面三章的 loop 只有一个 agent 在工作。这一章给 harness 加"分身"能力: 一个
-  agent 在 loop 内开第二个 agent 处理子任务, 子智能体跑完把结果 拼回主
-  loop。关键约束是: 子智能体的 history、todo、permission 都必须 独立, 但 tool
-  registry 可以共享。这一章是 harness 第一次处理"递归 agent"。
+  第 03 章的 TODO 让 LLM 自己追踪步骤, 但跑 50 轮对话后父 agent 的
+  history 会被 50 轮工具结果撑爆 — 上下文超出窗口, LLM 反应变慢。
+  这一章给 agent 加一个"分身"能力: <code>run_subagent</code> 工具, 把
+  长期子任务委托给子智能体, 父 agent 只看最终结果。 读完后, 你能
+  讲清"父子隔离" 的 3 个不变量, 并能用 fake LLM 验证子 agent 跑完后
+  父 history 不被污染。
 </p>
-
-<nav id="article-inline-toc" class="article__meta" aria-label="页内小节"></nav>
-
-<hr class="rule" />
-
-<h2 id="delta-from-03">在第 03 章基础上改了什么</h2>
-<p>
-  这一章在 agent 主循环里加一个"开子智能体"的工具
-  (<code>spawn_subagent</code>)。 子智能体复用 createAgent() 工厂,
-  但接收一组独立 dependencies: 新的 history (空)、新的 todo (空)、可选的 tool
-  registry 子集。父 agent 拿到子智能体的最终输出, 写一条 user message 拼回主
-  loop。 对应到代码, 改动集中在 3 个文件: <code>src/agent.ts</code> (改)、
-  <code>src/tools/spawn_subagent.ts</code> (新)、<code>src/index.ts</code>
-  (改接线, 注入 subagentFactory)。
+<nav aria-label="页内小节" class="article__meta" id="article-inline-toc"></nav>
+<hr class="rule"/>
+<h2 id="real-failure">真实失败故事: 50 轮后父 agent 撑爆</h2>
+<p>
+  写代码之前, 看一段真实长任务的失败。 跑一个真实场景: 用户让 agent
+  "全项目 code review, 列出 3 个最严重的 bug, 给出修复方案"。
+  </p>
+<ol>
+<li>
+<strong>第 1-10 轮</strong>: 父 agent 调 <code>run_read</code> 看 5 个文件,
+    每个文件 1000 行, history 累积 5000 token。 还撑得住。
+  </li>
+<li>
+<strong>第 25 轮</strong>: 看到第 12 个文件时, history 累积 12000 token。
+    LLM 反应开始慢, 偶尔重复调之前看过的文件。
+  </li>
+<li>
+<strong>第 50 轮</strong>: 看到第 25 个文件时, history 累积 25000 token。
+    LLM 严重失忆, 总结时漏掉 5 个文件, 重复 3 个文件的判断,
+    修复方案质量大幅下降。
+  </li>
+<li>
+<strong>第 100 轮</strong>: history 累积 50000 token, 单次 prompt 超过窗口,
+    LLM SDK 报 400 错误, agent loop 整个崩。
+  </li>
+</ol>
+<p>
+  问题本质: 父 agent 扛了"完整探索历史" + "用户 query", 上下文窗口
+  被撑爆。 即使 LLM 窗口是 200K, 50 轮工具结果也能把它填满。
+  </p>
+<p>
+  朴素想法 1: "把 read 改成只读前 100 行?"
+  治标不治本 — 父 agent 还是扛着工具结果, 还是会撑爆。
+  </p>
+<p>
+  朴素想法 2: "压缩, 把前 10 个文件的 read 结果总结成一段?"
+  能缓解, 但总结会丢失细节, LLM 基于总结给出的修复方案质量下降。
+  </p>
+<p>
+  正确做法: <strong>父 agent 调 <code>run_subagent(task="review these 5 files")</code>,
+  子智能体独立跑, 看 5 个文件, 产出"哪些有 bug" 的总结,
+  父 agent 只看到总结, 不看到子智能体的全部 history</strong>。
+  上下文从 50000 token 降到 500 token。 成本从 1.5 美元降到 0.05 美元。
+  </p>
+<h2 id="naive">朴素反例: 直接调 agent.run 复用 history</h2>
+<pre class="code-block"><code>// ❌ 反例: 子 agent 复用父 history
+function runSubagent(task: string) {
+  // 错误: 共享 history
+  return createAgent({ llm, history: parentHistory }).run(task);
+}</code></pre>
+<p class="source-link"><a href="https://github.com/pingp76/swoopcode/blob/main/src/tools/subagent.ts#L1" rel="noreferrer" target="_blank">GitHub · src/tools/subagent.ts 反例: 复用父 history (L1)</a></p>
+<p>
+  4 件事立刻坏掉:
 </p>
-<div class="source-links" aria-label="本章 GitHub 永久链接">
-  <a
-    class="source-link"
-    href="https://github.com/pingp76/swoopcode/blob/main/src/agent.ts"
-    target="_blank"
-    rel="noreferrer"
-    >1. src/agent.ts: createAgent 工厂接受 subagentFactory</a
-  >
-  <a
-    class="source-link"
-    href="https://github.com/pingp76/swoopcode/blob/main/src/tools/registry.ts"
-    target="_blank"
-    rel="noreferrer"
-    >2. src/tools/registry.ts: 注册 spawn_subagent 工具</a
-  >
-  <a
-    class="source-link"
-    href="https://github.com/pingp76/swoopcode/blob/main/src/hooks.ts"
-    target="_blank"
-    rel="noreferrer"
-    >3. src/hooks.ts: 子智能体的 SubagentStart / SubagentEnd 钩子 (第 08
-    章展开)</a
-  >
-</div>
-
-<h2 id="author-thinking">作者怎么想的: 这一章的思考链</h2>
+<ol>
+<li>
+<strong>父子污染</strong>: 子 agent 写进 history 的消息, 父 agent
+    下一轮也能看到。 子 agent 调了 50 个工具, 父 agent 全部继承。
+    上下文没被压缩, 反而共享污染。
+  </li>
+<li>
+<strong>状态错乱</strong>: 子 agent 的 TODO 列表和父 agent 混在一起,
+    进度条闪烁 — 父 agent 看到 [3/5], 子 agent 看到 [1/3],
+    哪个是当前的? 用户搞不清。
+  </li>
+<li>
+<strong>权限泄露</strong>: 子 agent 拿到了父 agent 的全部工具权限
+    (包括 run_write), 父 agent 不想让子 agent 写文件也防不住。
+    父 agent "分析 5 个文件", 子 agent 偷偷 run_write 改了 2 个。
+  </li>
+<li>
+<strong>不可调试</strong>: 父 history 里混杂父子消息, 出问题不知道
+    哪一层搞的。 调试时只能二分"父还是子", 但 history 看不到
+    来源, 只能看 message role / content, 没线索。
+  </li>
+</ol>
+<h2 id="three-invariants">3 条不变量</h2>
+<p>
+  把反例的前 3 个坏处倒过来, 就是 SubAgent 的 3 条不变量 (第 4 个"不可调试" 是前 3 个的后果, 隔离成立后自然消失)。
+  这 3 条不变量是"写 SubAgent 代码时心里时刻记着的 3 件事", 缺一不可。
+  </p>
 <dl class="defs">
-  <dt>想清楚现象</dt>
-  <dd>
-    长任务下, 一个 agent 跑 30 轮 context 就撑爆了。更糟的是, 子任务 (例如 "调研
-    react-query 文档") 不需要污染主 loop 的 history, 只需要把 结论拼回来。
-  </dd>
-  <dt>想反例</dt>
-  <dd>
-    最朴素的反例是"父子共用一个 history"。这有两个问题: 一是子任务 上下文把主
-    loop 撑爆, 二是子任务的工具调用也会出现在主 loop, 主 agent
-    会被自己的子任务搞糊涂。
+<dt>不变量 1 · 父子 history 隔离</dt>
+<dd>
+    子 agent 有自己的 history, 父 agent 看不到子 agent 的内部消息。
+    子 agent 跑完, 父 agent 只看到一条"任务已完成, 总结: ..."
+    的结果消息 (role: "tool", tool_call_id 是 run_subagent 的 id)。
+    这条消息的 content 是子 agent 的最终回复, 不是子 agent 调过的
+    50 个工具结果。
   </dd>
-  <dt>想接口和不变量</dt>
-  <dd>
-    接口: <code>interface SubagentFactory { create(overrides): Agent }</code>。
-    不变量三条: (1) 子智能体 history 与父 agent 严格隔离, (2) 子智能体 todo 与父
-    agent 严格隔离, (3) tool registry 默认共享, 但子智能体 不允许用
-    spawn_subagent (防递归爆炸, 除非显式 enable)。
+<dt>不变量 2 · 父子 TODO 隔离</dt>
+<dd>
+    子 agent 独立 TodoManager, 父 agent 的 TODO 列表不受影响。
+    父子进度条独立显示 — 父 REPL 顶部是父 TODO, 子 agent 内部
+    跑自己的 TODO (用户看不到子 TODO, 怕屏幕太乱)。
   </dd>
-  <dt>想怎么验证</dt>
-  <dd>
-    fake LLM 第一轮返回 assistant 携带 spawn_subagent 调用, 跑完后 父 history
-    末尾出现一条 user 消息 (子智能体的最终输出), 但子智能体 内部的 messages
-    不进父 history。fake LLM 给子智能体的预设是返回 "调研结论: X", 父 agent
-    第二轮能看到 "X"。
+<dt>不变量 3 · 子 agent 工具权限 ≤ 父 agent</dt>
+<dd>
+    子 agent 拿到的是<strong>过滤后的</strong> ToolRegistry, 父 agent
+    有的工具子 agent 不一定有 (例如父可以写, 子只能读)。
+    这是"最小权限" 原则的体现 — 给子 agent 越少工具, 越不可能
+    出问题。
   </dd>
 </dl>
-
-<h2 id="observe-first">先观察: 两段故意有气味的实现</h2>
-
-<div class="note">
-  <p class="note__title">观察 1 · 父子共用 history</p>
-  <pre class="code-block"><code>// 教学简化版, 仓库真实实现见 GitHub 永久链接
-async function spawnSubagent(task: string) {
-  // 错误: 共用父 history
-  const child = createAgent({ history, llm, tools, todo });
-  return await child.run(task);
-}</code></pre>
-  <p><strong>问:</strong>看上去省事, 为什么仍然不行?</p>
-  <p>
-    <strong>答:</strong>三件事同时坏掉 —— 隔离性: 子任务的 30 轮 messages 全进父
-    history, 主 loop 立刻撑爆; 真实性: 子任务的 tool_calls 也进 父 history, 父
-    agent 在第 5 轮看到的是子任务的工具调用, 推理时 混乱; 安全:
-    子任务的权限被父权限覆盖, 一旦父 agent 拒绝了某个工具, 子任务也无法使用
-    (即使子任务场景完全合理)。
+<h2 id="shared-llm">为什么共享 LLM, 不共享 history</h2>
+<p>
+  一个微妙的设计: 父子 agent <strong>共享同一个 LLMClient 实例</strong>,
+  但 <strong>不共享 history</strong>。 为什么?
   </p>
-</div>
-
-<div class="note">
-  <p class="note__title">观察 2 · 子智能体直接读父 todo</p>
-  <pre class="code-block"><code>// 教学简化版
-const child = createAgent({
-  history: createHistory(),
-  todo: parentTodo,  // 错误: 共用父 todo
-});</code></pre>
-  <p><strong>问:</strong>为什么不复用父 todo?</p>
-  <p>
-    <strong>答:</strong>子智能体的 TODO 是"调研 react-query 文档" 的子清单, 父
-    agent 的 TODO 是"用户给我的 5 件事"。两者不同。共用会让子任务的 临时项 (例如
-    "查 GitHub README") 出现在父 reminder 里, 父 agent
-    看到后以为这是用户要求的。
+<p>
+  共享 LLM 的好处:
   </p>
-</div>
-
-<h2 id="interfaces">接口形状: 在写实现前钉死</h2>
-<pre class="code-block"><code>// 教学简化版, 仓库真实实现见 GitHub 永久链接
-export interface SubagentDeps {
-  history: History;
-  llm: LLMClient;
-  tools: ToolRegistry;
-  todo: TodoManager;
-  permission: PermissionManager;
-  // 子智能体允许的轮次上限, 防止死循环
-  maxRounds: number;
-}
-
-export type SubagentFactory = (overrides: Partial&lt;SubagentDeps&gt;) =&gt; Agent;
-
-export interface SubagentConfig {
-  task: string;
-  // 可选: 子智能体允许使用的工具白名单, 不填默认是 registry 全部
-  toolAllowlist?: string[];
-  // 可选: 子智能体的最大轮次, 默认 10
-  maxRounds?: number;
-}</code></pre>
+<ol>
+<li>
+<strong>避免重复创建 client</strong>: 父 agent 调 LLM, 子 agent 也调 LLM,
+    共享一个 client 省掉重复创建的开销 (注意: 这不省 token 费用, token 仍按子 agent 实际用量计费)。
+    </li>
+<li>
+<strong>复用 LLM 客户端配置</strong>: provider / apiKey / 模型名
+    都是一处配置, 子 agent 自动继承。
+    </li>
+<li>
+<strong>支持 streaming / 缓存 / retry 等中间件</strong>: LLM 客户端
+    通常会包一层 retry / cache, 父子共享时这些中间件也共享。
+    </li>
+</ol>
+<p>
+  不共享 history 的原因: 父子上下文是隔离的, LLM 看到的"对话"
+  是各自 agent 的对话, 不是父子合并的。 共享 history 等于父子
+  共享上下文, 失去"父只看总结" 的价值。
+  </p>
+<p>
+  ❌ / ✅ 写法:
+</p>
+<pre class="code-block"><code>// ❌ 父子都用新的 LLMClient
+const childLLM = createOpenAILLMClient({ apiKey });
+// 浪费: 子 agent 的 retry / cache / logging 都从零开始
 
-<h2 id="factory-pattern">工厂模式: 第 01–03 章都在准备这一步</h2>
+// ✅ 父子共享 LLMClient
+const childLLM = parentLLM;   // 同一个实例
+// 子 agent 自动获得 retry / cache / logging</code></pre>
+<h2 id="factory-pattern">工厂模式: 子 agent 怎么创建</h2>
+<figure class="figure">
+<div class="flow-tree" role="img" aria-label="父子 agent 资源对比">
+<div class="flow-tree__children" style="border: 1px solid var(--color-border-soft); border-radius: var(--radius-md); padding: var(--space-3); width: 100%; max-width: 700px;">
+<div class="flow-tree__branch">
+<div class="flow-compare__label" style="color: var(--color-accent);">共享</div>
+<span class="flow-node flow-node--accent">LLMClient</span>
+<span class="flow-node" style="font-size: var(--text-xs);">单实例, 父子都调它</span>
+</div>
+<div class="flow-tree__branch">
+<div class="flow-compare__label" style="color: #cd5c5c;">各自独立</div>
+<span class="flow-node">history (闭包)</span>
+<span class="flow-node">TodoManager</span>
+<span class="flow-node">Tools (过滤后)</span>
+<span class="flow-node" style="font-size: var(--text-xs);">3 个不变量各自独立</span>
+</div>
+</div>
+</div>
+<figcaption>图 04-1 · 父子 agent 资源对比. 共享 LLM (避免重复创建 client), 各自独立 history / todo / tools (避免污染)。</figcaption>
+</figure>
+<figure class="figure">
+<div class="flow-tree" role="img" aria-label="工厂模式创建子 agent">
+<div class="flow-node flow-node--accent">父 agent</div>
+<div class="flow-tree__connector"></div>
+<div class="flow-tree__children">
+<div class="flow-tree__branch">
+<div class="flow-compare__label">子 agent 1</div>
+<span class="flow-node">createHistory()</span>
+<span class="flow-node">createTodoManager()</span>
+<span class="flow-node">filterToolsForChild</span>
+<span class="flow-node">createAgent(deps)</span>
+</div>
+<div class="flow-tree__branch">
+<div class="flow-compare__label">子 agent 2</div>
+<span class="flow-node">createHistory()</span>
+<span class="flow-node">createTodoManager()</span>
+<span class="flow-node">filterToolsForChild</span>
+<span class="flow-node">createAgent(deps)</span>
+</div>
+</div>
+</div>
+<figcaption>图 04-2 · 工厂模式创建子 agent. 每次调 createAgent() 都得到全新闭包, 父子天然隔离。</figcaption>
+</figure>
+<figure class="figure">
+<div class="flow-map" role="img" aria-label="工具权限过滤">
+<div class="flow-row--center">
+<span class="flow-node flow-node--accent">父 ToolRegistry<br/><small>10 个工具</small></span>
+<span class="flow-arrow">→</span>
+<span class="flow-node">filterToolsForChild<br/><small>按 tools 数组过滤</small></span>
+<span class="flow-arrow">→</span>
+<span class="flow-node">child ToolRegistry<br/><small>2-3 个工具</small></span>
+</div>
+<div class="flow-row--center" style="margin-top: var(--space-3)">
+<span class="flow-node">强制移除<br/>run_subagent</span>
+<span class="flow-arrow">→</span>
+<span class="flow-node flow-node--accent">子 agent<br/><small>叶子节点</small></span>
+</div>
+</div>
+<figcaption>图 04-3 · 工具权限过滤. 父显式声明 tools 数组, 叶子节点策略防止递归。</figcaption>
+</figure>
+<figure class="figure">
+<div class="flow-compare" role="img" aria-label="共享 history vs 独立 history">
+<div class="flow-compare__col flow-compare__col--bad">
+<div class="flow-compare__label">❌ 共享 history</div>
+<span class="flow-node">子 agent 调 50 工具</span>
+<span class="flow-node">history 累积 5000 token</span>
+<span class="flow-node">父 agent 全部继承</span>
+<span class="flow-node">上下文没压缩, 反而污染</span>
+</div>
+<div class="flow-compare__col flow-compare__col--good">
+<div class="flow-compare__label">✅ 独立 history</div>
+<span class="flow-node">子 agent 调 50 工具</span>
+<span class="flow-node">子 history 累积 5000 token</span>
+<span class="flow-node">父 agent 只看总结 500 token</span>
+<span class="flow-node">上下文压缩 10 倍</span>
+</div>
+</div>
+<figcaption>图 04-4 · 共享 history vs 独立 history. 独立 history 是 SubAgent 上下文压缩的根本机制。</figcaption>
+</figure>
+<p>
+  <strong>用途</strong>: 不变量 1 + 2 决定了子 agent 必须是<strong>独立实例</strong>,
+  通过 <code>createAgent()</code> 工厂创建, 不复用父的闭包。
+  </p>
 <p>
-  第 01 章强制 history 是工厂, 第 03 章强制 todo 是工厂, 当时看起来
-  "架构过度"。这一章就是回报: createSubagentFactory() 可以无副作用地
-  创建任意多个独立 agent, 因为它每次都调用 createHistory() / createTodoManager()
-  而不是用 module-level 单例。
+  <strong>真实场景</strong>: 父 agent 调 <code>run_subagent(task="review src/agent.ts",
+  tools=["run_read"])</code>, harness 内部用工厂函数创建子 agent
+  实例, 注入过滤后的工具集, 跑子任务, 拿结果, 返回父。
+  </p>
+<p>
+  <strong>设计思想</strong>: 经典<strong>工厂模式 + 依赖注入</strong> (Reference
+  章节的模式 1 + 模式 3)。 工厂创建独立闭包, 注入 5 件套 (llm /
+  history / todoManager / tools / maxRounds)。
+  5 个关键点决定"父子隔离是不是真成立"。
+  </p>
+<p>
+  <strong>实现细节</strong>:
 </p>
 <pre class="code-block"><code>// 教学简化版, 仓库真实实现见 GitHub 永久链接
-export function createSubagentFactory(parentDeps: SubagentDeps): SubagentFactory {
-  return (overrides) =&gt; {
-    const childDeps: SubagentDeps = {
-      ...parentDeps,
-      history: overrides.history ?? createHistory(),
-      todo: overrides.todo ?? createTodoManager(),
-      maxRounds: overrides.maxRounds ?? 10,
-      // 默认子智能体不能用 spawn_subagent, 防止递归爆炸
-      tools: filterTools(parentDeps.tools, {
-        deny: ["spawn_subagent", ...],
-        allow: overrides.toolAllowlist,
-      }),
-      permission: parentDeps.permission,  // 共享 permission 策略
-    };
-    return createAgent(childDeps);
+export function createSubagentTool(deps: {
+  parentAgent: Agent;
+  factory: AgentFactory;       // createAgent 工厂
+  permissionManager: PermissionManager;
+}): Tool {
+  return {
+    name: "run_subagent",
+    description: "把一个子任务委托给子 agent, 子 agent 独立 history, " +
+                 "完成后只返回最终结果。子 agent 看到的工具是你给的子集。",
+    parameters: {
+      type: "object",
+      properties: {
+        task: { type: "string", description: "子 agent 要完成的任务描述" },
+        tools: { type: "array", description: "子 agent 可用的工具子集, 留空 = 父的只读子集" },
+        maxRounds: { type: "number", description: "子 agent 最大循环轮数, 默认 10" },
+      },
+      required: ["task"],
+    },
+    execute: async (args) =&gt; {
+      // 不变量 1: 独立 history
+      const childHistory = createHistory();
+      // 不变量 2: 独立 TodoManager
+      const childTodoManager = createTodoManager();
+      // 不变量 3: 工具权限由父控制
+      const childTools = filterToolsForChild(deps.parentAgent.tools, args.tools);
+      // 共享 LLM, 不共享 history
+      const childAgent = deps.factory({
+        llm: deps.parentAgent.llm,
+        history: childHistory,
+        todoManager: childTodoManager,
+        tools: childTools,
+        maxRounds: args.maxRounds ?? 10,
+      });
+      const result = await childAgent.run(args.task);
+      // 子 agent 跑完, 父 agent 只看到这条结果 (作为 tool message)
+      return { content: result };
+    },
   };
 }</code></pre>
+<p class="source-link"><a href="https://github.com/pingp76/swoopcode/blob/main/src/tools/subagent.ts#L1" rel="noreferrer" target="_blank">GitHub · src/tools/subagent.ts runSubagent 完整实现 (L1)</a></p>
+<h2 id="tool-filtering">子 agent 工具怎么过滤</h2>
+<p>
+  <strong>用途</strong>: 父 agent 不想让子 agent 拥有的工具, 子 agent 拿不到
+  schema, 也调不到。 工具权限由父显式声明, 子 agent 自动继承
+  过滤后的子集。
+  </p>
 <p>
-  关键设计: <code>permission</code> 和 <code>tools</code> 是"父策略 + 子过滤",
-  子智能体不能绕过父策略, 也不能递归 spawn 新的子智能体 (除非显式 enable)。
-  <code>history</code> 和 <code>todo</code> 是"完全独立", 子任务的细节不进父
-  loop。
+  <strong>真实场景</strong>: 父 agent 调 <code>run_subagent(task="分析 src/agent.ts",
+  tools=["run_read"])</code>, 子 agent 工具 schema 不含 run_write,
+  即使子 agent 想偷偷写, 调 run_write 返回 "tool not found" 错误。
+  </p>
+<table class="terms">
+<thead>
+<tr>
+<th>父给的子集</th>
+<th>子 agent 能调的</th>
+<th>典型场景</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><code>["run_read", "run_bash"]</code></td>
+<td>读文件 + 跑只读命令</td>
+<td>"分析这 5 个文件, 不要改"</td>
+</tr>
+<tr>
+<td><code>["run_read", "run_write", "run_edit"]</code></td>
+<td>读 + 写 + 编辑</td>
+<td>"实现这个功能, 改文件"</td>
+</tr>
+<tr>
+<td><code>[]</code> (默认)</td>
+<td>父的只读子集 (run_read / run_bash readonly)</td>
+<td>"查资料, 给我答案"</td>
+</tr>
+</tbody>
+</table>
+<p>
+  ❌ / ✅: 不要让子 agent 拿到 <code>run_subagent</code> 自身。
 </p>
+<pre class="code-block"><code>// ❌ 子 agent 可以再 spawn 子子 agent
+filterToolsForChild(parentTools, args.tools).includes("run_subagent")
+// 递归分身, 上下文被无限拉低, 但费用指数上升
 
-<h2 id="loop-integration">loop 接入: spawn 是一次 tool call</h2>
+// ✅ 过滤掉 run_subagent, 强制叶子节点
+const childTools = filterToolsForChild(parentTools, args.tools);
+childTools.remove("run_subagent");  // 永远禁止子 agent 递归</code></pre>
 <p>
-  子智能体对父 agent 而言, 就是一个普通工具 (<code>spawn_subagent</code>)。 父
-  agent 决定什么时候用、用哪个子任务、允许多少轮次。父 agent 拿到
-  子智能体的最终输出, 写一条 user 消息拼回主 loop, 然后继续自己的 loop。
+  这是硬规矩, 不是配置项。 叶子节点策略, 避免"子生子" 拖垮系统。
+  递归分身看起来"灵活", 实际是个灾难 — 父 spawn 3 个子 agent,
+  每个子 agent 又 spawn 3 个, 逐层乘 3, 到第 4 层就有 3⁴ = 81 个
+  agent, 费用爆炸, 调试也爆炸。
+  </p>
+<h2 id="error-propagation">子 agent 出错怎么办</h2>
+<p>
+  <strong>用途</strong>: 子 agent 跑挂了 (maxRounds 用完 / 工具抛 throw),
+  父 agent 不能假装成功。 3 种处理方式, 跟第 02 章工具失败模式一致。
+  </p>
+<p>
+  <strong>真实场景</strong>: 子 agent 跑满 10 轮仍没完成, 触到 maxRounds 上限被强制停止,
+  父 agent 收到 "子任务超时" 工具结果, 决定: 重试 / 换工具 / 放弃。
+  父 agent 不会因为子失败而崩。
+  </p>
+<table class="terms">
+<thead>
+<tr>
+<th>情况</th>
+<th>父 agent 看到</th>
+<th>处理</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>子 agent 正常完成</td>
+<td>"任务完成, 总结: ..."</td>
+<td>父 agent 继续下一步</td>
+</tr>
+<tr>
+<td>子 agent 超 maxRounds</td>
+<td>"子任务超时, 已完成: ... / 未完成: ..."</td>
+<td>父 agent 决定: 重试 / 换工具 / 放弃</td>
+</tr>
+<tr>
+<td>子 agent 抛 throw</td>
+<td>"子任务失败: error message"</td>
+<td>父 agent 决定: 重试 / 换工具 / 放弃</td>
+</tr>
+</tbody>
+</table>
+<p>
+  ❌ / ✅: 不要让子 agent 失败时父 agent 也崩。
 </p>
-<pre class="code-block"><code>// 教学简化版, 仓库真实实现见 GitHub 永久链接
-async function executeTool(call: ToolCall): Promise&lt;ToolResult&gt; {
-  if (call.name === "spawn_subagent") {
-    const config = call.args as SubagentConfig;
-    const childAgent = subagentFactory({ maxRounds: config.maxRounds });
-    const childOutput = await childAgent.run(config.task);
-    return { toolCallId: call.id, content: childOutput };
+<pre class="code-block"><code>// ❌ 子 agent 失败, 父 agent 整个崩
+async function runSubagent(args) {
+  const result = await childAgent.run(args.task);
+  return { content: result };     // child 抛了, 这里也抛
+}
+
+// ✅ 失败转成 ToolResult, 父 agent 知道
+async function runSubagent(args) {
+  try {
+    const result = await childAgent.run(args.task);
+    return { content: result };
+  } catch (e) {
+    return { content: `子任务失败: ${e.message}\n请决定重试 / 换方案 / 放弃。`, error: true };
   }
-  // 其它工具走 registry
-  const tool = registry.get(call.name);
-  return await tool.execute(call.args);
 }</code></pre>
+<p>
+  业务错误 (子 agent 失败) 用 <code>error: true</code> 写回, 父 agent
+  看到后能继续推理 — "子任务失败了, 我换个方案再试"。 跟第 02 章
+  的工具失败模式一致。
+  </p>
+<p>
+  注意: <code>try/catch</code> 包在 <code>childAgent.run()</code> 外面,
+  捕获的是"子 agent 抛出的 throw" (比如 maxRounds 用完),
+  不是"子 agent 返回的业务错误" (比如子 agent 调 run_read 失败)。
+  后者子 agent 自己用 <code>error: true</code> 处理, 不会抛 throw。
+  </p>
+<h2 id="user-display">子 agent 跑的时候父怎么显示</h2>
+<p>
+  <strong>用途</strong>: 子 agent 跑 10 轮工具, 父 agent 屏幕静止 30 秒,
+  用户会以为卡了。 REPL 怎么展示子 agent 进度。
+  </p>
+<p>
+  <strong>真实场景</strong>: 父 REPL 渲染时, 在当前 prompt 上方加一行
+  "↳ [子 agent] 进度", 用户看到子 agent 在跑, 不会以为父 agent
+  卡死。 子 agent 自己的工具调用细节不显示 (避免父 REPL 被淹没)。
+  </p>
+<p>
+  <strong>设计思想</strong>: 经典<strong>反向通知</strong>模式 — 父等子, 子
+  反向告诉父自己的状态 (通过 callback / event)。 不阻塞父, 父
+  显示当前状态, 用户知道在等什么。
+  </p>
+<p>
+  <strong>实现细节</strong>:
+</p>
+<pre class="code-block"><code>// 教学简化版, 真实实现见 GitHub 永久链接
+function renderSubagentIndicator(childProgress: ChildProgress): string {
+  if (!childProgress) return "";
+  const { task, currentStep, totalSteps } = childProgress;
+  return `\n  ↳ [子 agent] ${task} (${currentStep}/${totalSteps})\n`;
+}</code></pre>
+<p class="source-link"><a href="https://github.com/pingp76/swoopcode/blob/main/src/repl.ts#L1" rel="noreferrer" target="_blank">GitHub · src/repl.ts renderSubagentIndicator (L1)</a></p>
+<p>
+  子 agent 进度怎么拿? <code>run_subagent</code> 工具执行期间,
+  通过 callback 通知父 REPL: "子 agent 完成第 3 步, 共 5 步"。
+  这是"反向通知" 模式 — 父等子, 子反向告诉父自己的状态。
+  </p>
+<h2 id="fake-test">fake LLM 测试: 父子隔离</h2>
+<p>
+  <strong>用途</strong>: SubAgent 隔离是最难写测试的, 因为涉及"父看到的"
+  和"实际发生的" 的差异。 写一个测试, 验证 3 件事。
+  </p>
+<p>
+  <strong>设计思想</strong>: 用 fake LLM 模拟"父子各自调 LLM" 的序列,
+  在父 agent.run 结束后, 断言"父 history 看不到子 agent 内部消息"。
+  </p>
+<p>
+  <strong>实现细节</strong>:
+</p>
+<pre class="code-block"><code>test("subagent 跑完后, 父 history 不被污染", async () =&gt; {
+  let isChild = false;
+  const fakeLLM = createFakeLLM([
+    // 父第一次: 调 run_subagent
+    {
+      content: null,
+      toolCalls: [{
+        id: "sub1", function: {
+          name: "run_subagent",
+          arguments: '{"task":"分析 package.json"}',
+        },
+      }],
+      finishReason: "tool_calls",
+    },
+    // 父第二次: 收到子 agent 结果, 回答
+    { content: "Based on subagent: test = npm test", toolCalls: [], finishReason: "stop" },
+    // 子 agent 调的工具 (按调用顺序)
+    // 第一次: 子 agent 调 run_read
+    {
+      content: null,
+      toolCalls: [{ id: "child_r1", function: { name: "run_read", arguments: '{"path":"package.json"}' } }],
+      finishReason: "tool_calls",
+    },
+    // 第二次: 子 agent 看到结果, 总结
+    { content: "package.json says test = npm test", toolCalls: [], finishReason: "stop" },
+  ]);
 
-<h2 id="trap">反例梯度</h2>
+  // 关键: 区分"父 chat" 和"子 chat"
+  const callMessages: any[][] = [];
+  let parentCallCount = 0;
+  let childCallCount = 0;
+  const trackingLLM = {
+    async chat({ messages }) {
+      const isParentCall = !isChild;
+      callMessages.push(messages);
+      if (isParentCall) parentCallCount++;
+      else childCallCount++;
+      // ... 返回 fake response (根据 callCount)
+    },
+  };
+
+  const agent = createAgent({ llm: trackingLLM, history: createHistory(), tools });
+  const reply = await agent.run("What's the test command?");
 
+  // 验证 1: 父 history 末尾是 assistant 总结
+  const lastParentCall = callMessages[parentCallCount - 1];
+  const lastAssistant = lastParentCall.filter(m =&gt; m.role === "assistant").pop();
+  expect(lastAssistant.content).toContain("Based on subagent");
+
+  // 验证 2: 父 history 看不到子 agent 的 tool_call
+  expect(lastParentCall).not.toContainEqual(expect.objectContaining({
+    role: "tool", tool_call_id: "child_r1",
+  }));
+
+  // 验证 3: 父 history 只看到子 agent 的总结作为 tool result
+  expect(lastParentCall).toContainEqual(expect.objectContaining({
+    role: "tool", tool_call_id: "sub1",
+    content: expect.stringContaining("package.json says test = npm test"),
+  }));
+});</code></pre>
+<p class="source-link"><a href="https://github.com/pingp76/swoopcode/blob/main/src/tools/subagent.test.ts#L1" rel="noreferrer" target="_blank">GitHub · src/tools/subagent.test.ts 父子隔离测试 (L1)</a></p>
+<p>
+  这条测试如果挂, 你立刻知道父子隔离出了问题。 3 个断言联合
+  验证: 父看到总结, 看不到子内部消息, run_subagent 配对正确。
+  </p>
+<h2 id="common-confusion">3 个常见误解</h2>
+<dl class="defs">
+<dt>误解 1 · "子 agent 应该继承父 history"</dt>
+<dd>
+    错。 子 agent 独立 history 是不变量 1, 继承父 history 等于
+    把子 agent 50 轮工具结果灌给父, 上下文反而没被压缩。 父
+    想"让子知道上下文" 应该显式传 "context summary" 给子 agent,
+    不是共享 history。
+  </dd>
+<dt>误解 2 · "子 agent 可以再 spawn 子子 agent"</dt>
+<dd>
+    错。 递归 spawn 看似灵活, 实际是费用爆炸 + 调试噩梦。 强制
+    叶子节点策略, 子 agent 不拿 run_subagent 工具。
+  </dd>
+<dt>误解 3 · "子 agent 失败应该重试"</dt>
+<dd>
+    错。 子 agent 失败是父 agent 的<strong>决策点</strong>, 不该由
+    harness 自动重试。 重试掩盖了真实问题, 应该让父 agent 看到
+    错误, 自己决定: 重试 / 换方案 / 放弃。
+  </dd>
+</dl>
+<h2 id="trap">反例梯度</h2>
 <div class="card card--trap">
-  <div class="card__head">
-    <span class="card__tag">新手错法 · A</span>
-  </div>
-  <div class="card__body">
-    <p><strong>常见错误:</strong>子智能体用 module-level 单例 history。</p>
-    <p>
-      <strong>为什么错:</strong>完全打乱父子隔离, 子任务的 messages 立刻污染父
-      history。
+<div class="card__head">
+<span class="card__tag">新手错法 · A</span>
+</div>
+<div class="card__body">
+<p>
+<strong>常见错误:</strong> 子 agent 复用父的 history 对象。
     </p>
-    <p>
-      <strong>正确做法:</strong>子智能体在 factory 内调用
-      <code>createHistory()</code>, 每次都新建。
+<p>
+<strong>为什么错:</strong> 子 agent 写进 history 的消息全部进父,
+      上下文没被压缩, 反而共享污染。
+    </p>
+<p>
+<strong>正确做法:</strong> 子 agent 通过 <code>createHistory()</code>
+      独立创建, 父子闭包隔离。
     </p>
-  </div>
 </div>
-
+</div>
 <div class="card card--trap">
-  <div class="card__head">
-    <span class="card__tag">中级错法 · B</span>
-  </div>
-  <div class="card__body">
-    <p><strong>常见错误:</strong>子智能体允许递归 spawn_subagent。</p>
-    <p>
-      <strong>为什么错:</strong>LLM 容易写出 "子任务里再开子任务" 的指令,
-      形成无限递归, token 瞬间撑爆。
+<div class="card__head">
+<span class="card__tag">中级错法 · B</span>
+</div>
+<div class="card__body">
+<p>
+<strong>常见错误:</strong> 父 agent 把所有工具都传给子 agent。
+    </p>
+<p>
+<strong>为什么错:</strong> 父想"分析文件不要改", 结果子 agent 拿到
+      run_write 直接改了, 父 agent 防不住。
     </p>
-    <p>
-      <strong>正确做法:</strong>默认 deny "spawn_subagent" 在子工具列表里,
-      除非父 agent 显式 enable。
+<p>
+<strong>正确做法:</strong> 父显式声明子 agent 的工具子集, 最小权限。
     </p>
-  </div>
 </div>
-
+</div>
 <div class="card card--trap">
-  <div class="card__head">
-    <span class="card__tag">高级错法 · C</span>
-  </div>
-  <div class="card__body">
-    <p><strong>常见错误:</strong>子智能体的 tool_calls 写进父 history。</p>
-    <p>
-      <strong>为什么错:</strong>子任务的工具调用 (例如"读 30 个文件") 出现在父
-      history, 父 agent 第 5 轮被自己的子任务搞糊涂。
+<div class="card__head">
+<span class="card__tag">高级错法 · C</span>
+</div>
+<div class="card__body">
+<p>
+<strong>常见错误:</strong> 子 agent 也能调 <code>run_subagent</code>,
+      递归 spawn。
+    </p>
+<p>
+<strong>为什么错:</strong> 上下文被无限拉低, 费用指数上升, 调试噩梦。
     </p>
-    <p>
-      <strong>正确做法:</strong>子智能体只把"最终文本输出"作为 tool result
-      写回父 history, 中间过程全部隔离。
+<p>
+<strong>正确做法:</strong> 过滤工具时强制移除 <code>run_subagent</code>,
+      叶子节点策略。
     </p>
-  </div>
 </div>
-
+</div>
 <div class="card card--trap">
-  <div class="card__head">
-    <span class="card__tag">边界错法 · D</span>
-  </div>
-  <div class="card__body">
-    <p>
-      <strong>常见错误:</strong>子智能体超时 (maxRounds) 时抛异常, 中断父 loop。
+<div class="card__head">
+<span class="card__tag">边界错法 · D</span>
+</div>
+<div class="card__body">
+<p>
+<strong>常见错误:</strong> 子 agent 失败时父 agent 整个崩。
     </p>
-    <p>
-      <strong>为什么错:</strong>父 agent 拿到的是异常, 而不是"子任务未完成,
-      建议这样做" 的可读输出。
+<p>
+<strong>为什么错:</strong> 子 agent 是异步任务, 它的失败是业务错误
+      不是 harness 错误, 不该传上去。
     </p>
-    <p>
-      <strong>正确做法:</strong>子智能体超时时返回 "[Round limit reached] ...",
-      父 agent 看到后能选择"重试"或"放弃子任务"。
+<p>
+<strong>正确做法:</strong> 子 agent 抛错时, <code>run_subagent</code>
+      工具返回 <code>{ content: "子任务失败: ...", error: true }</code>,
+      父 agent 收到后自己决定。
     </p>
-  </div>
 </div>
-
+</div>
 <h2 id="validate">如何验证 (本章 Validation 卡片)</h2>
 <div class="card card--validation">
-  <div class="card__head">
-    <span class="card__tag">Validation · 第 04 章</span>
-  </div>
-  <div class="card__body">
-    <p>
-      <strong>父子 history 隔离:</strong>父 agent 调 spawn_subagent, 跑完后 父
-      history 末尾出现 1 条 user 消息 (子智能体输出), 但子智能体内部
-      调用的工具不进父 history。验证: <code>parentHistory.getMessages()</code>
-      中不含子任务用过的 tool name。
+<div class="card__head">
+<span class="card__tag">Validation · 第 04 章</span>
+</div>
+<div class="card__body">
+<p>
+<strong>父子 history 隔离:</strong> 父调 <code>run_subagent</code>, fake LLM
+      模拟子 agent 调 5 个工具, 验证父 history 看不到这 5 个 tool_call,
+      只看到子 agent 总结作为 tool result。
     </p>
-    <p>
-      <strong>父子 todo 隔离:</strong>父 todo 含 3 条, 子智能体创建了 2 条,
-      跑完后父 todo 仍只含 3 条, 子 todo 含 2 条 (且不重叠)。
+<p>
+<strong>父子 TODO 隔离:</strong> 父建 TODO 后 spawn 子 agent, 子 agent
+      建自己的 TODO, 父 reminder 看不到子的, 子 reminder 看不到父的。
     </p>
-    <p>
-      <strong>防递归:</strong>子智能体在自己的 loop 内试图调 spawn_subagent,
-      fake registry 拒绝 (工具列表不含 spawn_subagent), 写 error tool message。
+<p>
+<strong>子 agent 工具子集:</strong> 父声明 <code>tools=["run_read"]</code>,
+      子 agent 工具 schema 不含 run_write, 调 run_write 返回
+      "tool not found" 错误。
     </p>
-    <p>
-      <strong>maxRounds 触发截断:</strong>子智能体 maxRounds=2, fake LLM
-      永远返回带 tool_call 的 assistant, 跑完后子智能体返回 "[Round limit
-      reached] ..." 而不是抛异常。
+<p>
+<strong>递归禁止:</strong> 验证 <code>run_subagent</code> 不在子 agent 工具列表。
     </p>
-    <p>
-      <strong>子智能体输出拼回主 loop:</strong>父 agent 调 spawn_subagent 后,
-      下一轮 LLM 收到的 messages 末尾出现一条 user 消息, content
-      等于子智能体输出。
+<p>
+<strong>子失败不崩:</strong> 子 agent 抛错时, <code>run_subagent</code> 工具返回
+      <code>error: true</code> 的 tool result, 父 history 继续累积, 不抛。
     </p>
-  </div>
 </div>
-
-<h2 id="lookback">回望第 00–03 章: 哪些原则在本章兑现了</h2>
+</div>
+<h2 id="lookback">回望: 哪些原则在本章兑现了</h2>
 <ul>
-  <li>
-    <strong>工厂模式回报:</strong>第 01 章 history 工厂 + 第 03 章 todo 工厂,
-    在本章终于体现出价值 —— 不重写 agent.ts 就能创建独立子智能体。
-  </li>
-  <li>
-    <strong>共享 + 隔离的边界:</strong>permission 共享 (策略一致), history/todo
-    隔离 (状态独立), tools 受控共享 (默认 + 白名单)。这三条是 harness 设计"分身"
-    的标准模式, 第 13 章 async run 也会沿用。
-  </li>
-  <li>
-    <strong>tool call 协议沿用:</strong>spawn_subagent 是普通工具, 复用第 02
-    章的 Tool interface, 不发明新协议。
-  </li>
-  <li>
-    <strong>maxRounds 防爆炸:</strong>子智能体必须有上限, 否则 LLM
-    倾向无限递归。这是 harness "反 LLM 偷懒" 的关键阀门。
-  </li>
+<li>
+<strong>工厂模式</strong>: 子 agent 走 <code>createAgent()</code> 工厂,
+      父子闭包隔离天然成立。
+    </li>
+<li>
+<strong>依赖注入</strong>: 父子共享 LLM, 独立 history / todoManager / tools。
+    </li>
+<li>
+<strong>最小权限</strong>: 父声明子 agent 工具子集, 强制移除 run_subagent。
+    </li>
+<li>
+<strong>错误传播</strong>: 子失败不传 throw, 业务错误用 <code>error: true</code>。
+    </li>
 </ul>
-
 <h2 id="forward">前瞻张力: 留给后续章节</h2>
 <dl class="defs">
-  <dt>子智能体的工具集动态变化</dt>
-  <dd>第 05 章 Skill 会让子智能体"按需加载"工具子集, 进一步收紧工具白名单。</dd>
-  <dt>子智能体也用 TODO</dt>
-  <dd>第 03 章工厂模式直接复用, 子智能体有自己的 TODO 列表, 跑完自动丢弃。</dd>
-  <dt>子智能体超时常发生</dt>
-  <dd>第 11 章 recovery 会处理"子任务反复超时怎么办", 引入退避重试。</dd>
-  <dt>子智能体用 async run</dt>
-  <dd>
-    第 13 章 async run 会让 spawn_subagent 不阻塞主 loop, 改为"启动后台 agent,
-    完成后通知"。
-  </dd>
+<dt>子 agent 看不到父上下文</dt>
+<dd>
+    子 agent 不知道父 agent 在做什么长任务。 想要"父子共享上下文"
+    需要显式传 "context summary" 给子 agent, 这是后续增强。
+    </dd>
+<dt>子 agent 之间互不可见</dt>
+<dd>
+    多个子 agent 不能直接通信, 只能通过父 agent 转发。 想要"多
+    子 agent 协作" 需要更复杂的拓扑, 专题 B 提到 Agent Team 时
+    再展开。
+    </dd>
 </dl>
-
-<h2 id="vibe-coding-04">本次如何 vibe code: 第 04 章的三件套</h2>
-
-<h3 id="vibe-feed-04">拆卡: 4 轮迭代的具体产物</h3>
-<ol>
-  <li>
-    <strong>第 1 轮 · 接口</strong>。让 LLM 给出 <code>SubagentDeps</code> /
-    <code>SubagentFactory</code> / <code>SubagentConfig</code> 三个
-    interface。本轮不写实现, 重点钉"父策略 + 子过滤" 的边界。
-  </li>
-  <li>
-    <strong>第 2 轮 · 接线</strong>。让 LLM 给出 <code>index.ts</code> 接线,
-    <code>createSubagentFactory()</code> 接受父 deps。本轮 review 重点: factory
-    是闭包, 不是 class。
-  </li>
-  <li>
-    <strong>第 3 轮 · 边界</strong>。让 LLM 写 createSubagentFactory +
-    spawn_subagent 工具 + agent.ts 接入。本轮 review 重点: 工具列表默认 deny
-    "spawn_subagent", history / todo 必须独立。
-  </li>
-  <li>
-    <strong>第 4 轮 · 验证</strong>。让 LLM 写
-    <code>test/subagent.test.ts</code>。本轮 review 重点: "父子 history 隔离" 和
-    "maxRounds 触发截断" 两条必须有反向断言 (例如"父 history 不含子工具名")。
-  </li>
-</ol>
-
-<h3 id="vibe-review-04">Review: 第 04 章专属 checklist</h3>
-<ol>
-  <li>
-    <strong>子智能体 history 严格独立。</strong>不得在 factory 内复用
-    <code>parentDeps.history</code>。验证:
-    <code>grep -n 'parentDeps.history' src/agent.ts</code> 应当 0 行。
-  </li>
-  <li>
-    <strong>子智能体 todo 严格独立。</strong>同上,
-    <code>grep -n 'parentDeps.todo' src/agent.ts</code> 应当 0 行。
-  </li>
-  <li>
-    <strong>工具列表默认 deny spawn_subagent。</strong>验证: factory 内 filter
-    显式 deny "spawn_subagent", 除非 overrides.toolAllowlist 显式包含。
-  </li>
-  <li>
-    <strong>maxRounds 必传。</strong>factory 必须给子智能体 maxRounds, 默认
-    10。验证: <code>grep -n 'maxRounds' src/agent.ts</code> 在
-    createSubagentFactory 内 ≥ 1 行。
-  </li>
-  <li>
-    <strong>子智能体超时返回字符串, 不抛异常。</strong>验证: Validation
-    卡片"maxRounds 触发截断"必须断言返回 string, 不抛错。
-  </li>
-</ol>
-
-<h3 id="vibe-debug-04">调试: 第 04 章典型伪装</h3>
-<ol>
-  <li>
-    <strong>伪装 A · 父子共用 todo, 假装"只是没填新 todo"。</strong>症状:
-    子智能体没有创建 todo, 但子任务的中间步骤会写进父 reminder。验证: Validation
-    卡片"父子 todo 隔离" 必须断言"父 todo 仍只含 3 条"。
-  </li>
-  <li>
-    <strong>伪装 B · 子智能体 spawn_subagent 没被 deny。</strong>症状:
-    工具列表里仍然有 spawn_subagent。验证: 子智能体调 spawn_subagent 时应当写
-    error tool message, 不进入子 loop。
-  </li>
-  <li>
-    <strong>伪装 C · 子智能体输出被父 history 当成 tool message。</strong>症状:
-    父 history 末尾有 <code>role: "tool"</code> 消息, 描述子智能体输出。验证:
-    应当是 <code>role: "user"</code> 消息, 因为父 agent
-    看到子输出是"我刚刚做的事的结果", 归类为 user 视角更自然。
-  </li>
-</ol>
-
-<h3 id="vibe-iterate-04">迭代: 第 04 章 4 个 commit 节点</h3>
-<ol>
-  <li>
-    <code
-      >feat(ch04): 钉 SubagentDeps / SubagentFactory / SubagentConfig 接口</code
-    >
-    —— tsc 通过, 无实现。
-  </li>
-  <li>
-    <code
-      >feat(ch04): createSubagentFactory 工厂 + spawn_subagent 工具 stub</code
-    >
-    —— tsc 通过, agent.run 仍未接入。
-  </li>
-  <li>
-    <code>feat(ch04): agent.run 接入 spawn_subagent, 父子隔离 + 防递归</code> ——
-    跑通 Validation 卡片前 4 条。
-  </li>
-  <li>
-    <code>test(ch04): 子智能体输出拼回主 loop, role 断言为 "user"</code> ——
-    全绿。
-  </li>
-</ol>
-
 <h2 id="prompt-card">Prompt Card (本章任务)</h2>
 <div class="card card--prompt">
-  <div class="card__head">
-    <span class="card__tag">Prompt Card · 第 04 章</span>
-    <button class="card__copy" type="button" data-copy-card>复制</button>
-  </div>
-  <div class="card__body">
-    <p>
-      <strong>目标:</strong>实现 SubAgent, 父 agent 可在 loop 内开第二个 agent
-      处理子任务, history/todo 隔离, tool registry 受控共享。
+<div class="card__head">
+<span class="card__tag">Prompt Card · 第 04 章</span>
+<button class="card__copy" data-copy-card="" type="button">复制</button>
+</div>
+<div class="card__body">
+<p>
+<strong>目标:</strong> 给 agent 加 <code>run_subagent</code> 工具, 长任务可以
+      委托子 agent, 父子 history / todo / tools 严格隔离。
     </p>
-    <p>
-      <strong>场景:</strong>用户输入 "调研 react-query 文档后帮我重构", 父 agent
-      调 spawn_subagent, 子任务查 3 个文件后返回 "调研结论: X", 父 agent 拿到 X
-      继续主 loop。
+<p>
+<strong>场景:</strong> 用户说"全项目 code review", 父 agent 调
+      <code>run_subagent(task="review src/agent.ts", tools=["run_read"])</code>,
+      子 agent 独立 history 跑 5 轮, 父 agent 只看总结, 上下文不被撑爆。
     </p>
-    <p>
-      <strong>模块:</strong> <code>src/agent.ts</code> 改 createAgent 接受
-      subagentFactory; <code>src/tools/spawn_subagent.ts</code> (新) 实现工具;
-      <code>src/index.ts</code> 接线 subagentFactory;
-      <code>src/tools/filter.ts</code> (新) 实现工具列表过滤。
+<p>
+<strong>模块:</strong> <code>src/tools/run-subagent.ts</code> (新) 实现
+      <code>run_subagent</code> 工具; <code>src/subagent.ts</code> (新)
+      工厂函数 <code>createSubagentFromParent()</code>;
+      <code>src/agent.ts</code> (改) 注册 run_subagent 工具;
+      <code>src/index.ts</code> (改) 注入子 agent 工厂。
     </p>
-    <p><strong>边界 (LLM 必须遵守的 checklist):</strong></p>
-    <ul>
-      <li>子智能体 history 与父 agent 严格隔离 (不共享引用)</li>
-      <li>子智能体 todo 与父 agent 严格隔离 (不共享引用)</li>
-      <li>子智能体工具列表默认不含 "spawn_subagent", 防递归</li>
-      <li>子智能体 maxRounds 必传, 默认 10, 超时返回字符串不抛异常</li>
-      <li>permission 策略与父 agent 共享 (子不能绕过父权限)</li>
-    </ul>
-    <p><strong>验证 (用 fake LLM + fake registry, 逐条落到 vitest):</strong></p>
-    <ul>
-      <li>
-        父 agent 调 spawn_subagent 后, 父 history 末尾出现 1 条 role: "user"
-        消息, content = 子智能体输出
-      </li>
-      <li>父 history 不含子智能体调过的 tool name</li>
-      <li>
-        子智能体在自己的 loop 内调 spawn_subagent, 写 error tool message,
-        不进入子子 loop
-      </li>
-      <li>
-        子智能体 maxRounds=2 + fake LLM 永远返回带 tool_call, 子智能体返回
-        "[Round limit reached] ..." 字符串
-      </li>
-      <li>子 todo 与父 todo 不重叠, 各自独立</li>
-    </ul>
-  </div>
+<p><strong>边界 (LLM 必须遵守的 checklist):</strong></p>
+<ul>
+<li>子 agent 通过 <code>createAgent()</code> 工厂创建, 不复用父的 history / todoManager</li>
+<li>父子共享 LLM, 独立 history / todoManager / 过滤后 tools</li>
+<li>子 agent 工具子集由父显式声明, 强制移除 <code>run_subagent</code> 防止递归</li>
+<li>子 agent 失败用 <code>error: true</code> 写回父, 不传 throw</li>
+<li>子 agent 跑完, 父 history 只看到一条"子任务完成, 总结: ..." 的 tool result</li>
+</ul>
+<p><strong>验证 (用 fake LLM + vitest, 逐条断言):</strong></p>
+<ul>
+<li>父 history 看不到子 agent 内部的 tool_call, 只看到 run_subagent 配对的 tool result</li>
+<li>父 / 子 TODO 列表相互不可见, 进度条独立</li>
+<li>子 agent 拿到 <code>["run_read"]</code> 子集, 调 run_write 返回 "tool not found"</li>
+<li><code>run_subagent</code> 不在子 agent 工具列表, 递归禁止</li>
+<li>子 agent 抛错时, <code>run_subagent</code> 返回 <code>error: true</code>, 父 agent 不崩</li>
+</ul>
+</div>
 </div>
-
 <h2 id="practice">本章练习</h2>
 <ol>
-  <li>
-    故意把子智能体 history 改成 <code>parentDeps.history</code>, 跑测试, 看"父子
-    history 隔离" 是否抓到。
+<li>
+    故意让子 agent 复用父 history, 跑测试, 看"父子 history 隔离" 是否抓到
+    (父 history 含子 agent 内部消息)。
+  </li>
+<li>
+    故意把"父 agent 不传 tools 子集" 时的默认值改成全量工具, 跑测试, 看"最小权限" 是否抓到
+    (子 agent 能写文件)。
   </li>
-  <li>在 filter 里忘记 deny "spawn_subagent", 跑测试, 看"防递归" 是否抓到。</li>
-  <li>
-    把子智能体 maxRounds 改成 0, 跑测试, 看 "maxRounds 触发截断" 是否能识别
-    (可能直接抛 "maxRounds must be > 0", 提示你 maxRounds 校验要写在哪里)。
+<li>
+    子 agent 调 <code>run_subagent</code> 递归, 跑测试, 看"递归禁止" 是否抓到。
   </li>
 </ol>
-
 <h2 id="summary">本章小结</h2>
 <p>
-  本章给 harness 加了"分身" 能力。createSubagentFactory() 利用前三章 的工厂模式,
-  每次创建独立 history / todo, 受控共享 tool registry 与 permission
-  策略。maxRounds 是防止 LLM 递归爆炸的关键阀门。 下一章 (第 05 章)
-  我们会处理"工具太多装不下" 的问题——Skill, 让 agent 按需加载工具子集,
-  进一步收紧 prompt 中的工具描述前缀。
+  SubAgent 是给长任务的"上下文泄洪" 机制。 核心是 3 个不变量:
 </p>
-
-<h2 id="next">下一章伏笔</h2>
+<ul>
+<li>
+<strong>父子 history 隔离</strong>: 子 agent 独立 history, 父只看到总结。
+    </li>
+<li>
+<strong>父子 TODO 隔离</strong>: 各自管自己的进度。
+    </li>
+<li>
+<strong>子工具 ≤ 父</strong>: 最小权限, 禁止递归。
+    </li></ul>
 <p>
-  第 04 章的 spawn_subagent 默认使用全量 tool registry。当工具数到 30+ 之后, 父
-  agent 的 tool schema 描述会撑爆 system prompt。下一章 Skill
-  模块会按场景动态加载工具子集, 同时也作为"按需暴露能力" 的元机制, 为第 10 章
-  cache-friendly 布局 (工具描述作为稳定前缀) 做准备。
+  下一章 (第 05 章) 处理"工具数到 30+" 的另一类上下文问题 —
+  Skill 机制, 让 LLM 按场景动态激活工具子集。
 </p>
+</content>
\ No newline at end of file
diff --git a/tutorial/chapters/05-skill.html b/tutorial/chapters/05-skill.html
index 5b62386..12571af 100644
--- a/tutorial/chapters/05-skill.html
+++ b/tutorial/chapters/05-skill.html
@@ -1,132 +1,104 @@
 <p class="article__eyebrow">第 05 章 · 按需加载能力</p>
-<h1 class="article__title">让 Agent 按需加载工具子集: Skill</h1>
+<h1 class="article__title">Skill: 让 Agent 按需激活工具子集</h1>
 <p class="article__lede">
-  第 04 章让 agent 学会分身, 但所有 agent 看到的都是同一个 tool registry。
-  工具数到 30+ 之后, system prompt 里的工具描述就撑爆了, 模型也会被淹没
-  在不相关的工具里。这一章给 harness 加 "Skill" 机制: 按场景动态加载工具 子集,
-  同时 Skill 本身也是一种 "按需注入能力" 的元机制, 后续章节会复用。
+  第 04 章的 SubAgent 解决"长任务上下文" 问题, 但解决不了另一类
+  撑爆: 工具数到 30+ 之后, system prompt 里的工具描述就撑爆了,
+  模型也被淹没在不相关的工具里。 这一章给 harness 加"Skill" 机制:
+  按场景动态加载工具子集, 同时 Skill 本身也是一种"按需注入能力" 的
+  元机制, 后续章节会复用。 读完后, 你能讲清"工具描述是 stable
+  prefix" 的伏笔, 并能用 fake LLM 验证"加载 skill 后工具可用, loop
+  结束不污染父"。
 </p>
-
-<nav id="article-inline-toc" class="article__meta" aria-label="页内小节"></nav>
-
-<hr class="rule" />
-
-<h2 id="delta-from-04">在第 04 章基础上改了什么</h2>
-<p>
-  这一章在 tool registry 之上加一层 "SkillLoader"。当 LLM 在 tool_call 里调用
-  <code>load_skill</code> 时, harness 把指定 skill 的工具集"激活" 进当前 loop
-  的工具列表 (而不是永久改 registry)。loop 结束后激活状态 丢弃, 不会污染父 agent
-  或后续 loop。 对应到代码, 改动集中在 4 个文件:
-  <code>src/skills.ts</code> (新)、
-  <code>src/tools/load_skill.ts</code> (新)、<code>src/agent.ts</code> (改)、
-  <code>src/index.ts</code> (改接线, 注入 skills 配置)。
+<nav aria-label="页内小节" class="article__meta" id="article-inline-toc"></nav>
+<hr class="rule"/>
+<h2 id="real-failure">真实失败故事: 30 个工具的混乱</h2>
+<p>
+  写代码之前, 先看一个真实场景。 假设你的 harness 已经积累到 30 个工具
+  (<code>run_bash</code> / <code>run_read</code> / <code>run_write</code> / <code>run_edit</code> /
+  <code>run_edit_exact</code> / <code>run_todo_*</code> / <code>run_task_*</code> /
+  <code>run_memory_*</code> / <code>run_skill</code> / <code>run_subagent</code> /
+  <code>run_async_*</code> / <code>run_schedule_*</code> / <code>run_output_*</code> /
+  <code>run_web_*</code> ...)。
+  </p>
+<p>
+  用户让 agent "修改 React 组件"。 观察:
 </p>
-<div class="source-links" aria-label="本章 GitHub 永久链接">
-  <a
-    class="source-link"
-    href="https://github.com/pingp76/swoopcode/blob/main/src/skills.ts"
-    target="_blank"
-    rel="noreferrer"
-    >1. src/skills.ts: Skill 加载器 (新)</a
-  >
-  <a
-    class="source-link"
-    href="https://github.com/pingp76/swoopcode/blob/main/src/agent.ts"
-    target="_blank"
-    rel="noreferrer"
-    >2. src/agent.ts: 维护当前激活的 skill 集合</a
-  >
-  <a
-    class="source-link"
-    href="https://github.com/pingp76/swoopcode/blob/main/src/tools/registry.ts"
-    target="_blank"
-    rel="noreferrer"
-    >3. src/tools/registry.ts: 暴露 getAll() 给 skill loader</a
-  >
-  <a
-    class="source-link"
-    href="https://github.com/pingp76/swoopcode/blob/main/src/system-prompt.ts"
-    target="_blank"
-    rel="noreferrer"
-    >4. src/system-prompt.ts: 工具描述前缀 (第 10 章 cache-friendly 的伏笔)</a
-  >
-</div>
-
-<h2 id="author-thinking">作者怎么想的: 这一章的思考链</h2>
-<dl class="defs">
-  <dt>想清楚现象</dt>
-  <dd>
-    工具数到 30+ 之后, system prompt 里堆 30 段工具描述, 模型在第 5 轮
-    已经记不清 "我有哪些工具可用"。更糟的是, 任务不相关 (例如 "改 React 组件"
-    不需要 "deploy k8s" 工具) 的工具描述会误导模型 尝试用错工具。
-  </dd>
-  <dt>想反例</dt>
-  <dd>
-    最朴素的反例是"把工具描述塞进 system prompt, 让 LLM 自己挑"。 这有两个问题:
-    一是 system prompt 越长, token 成本越高 (无关任务 也要付账), 二是 LLM 在长
-    context 下选错工具的概率显著上升。
-  </dd>
-  <dt>想接口和不变量</dt>
-  <dd>
-    接口:
-    <code
-      >interface SkillLoader { load(name): Tool[], list(): SkillMeta[] }</code
-    >。 不变量三条: (1) skill 激活状态是 loop 内的临时态, loop 结束丢弃, (2)
-    同一个 loop 内可激活多个 skill, 工具集取并集, (3) 工具描述 在 LLM
-    视角是稳定前缀 (skill 集合是稳定集合, 不是动态拼字符串)。
-  </dd>
-  <dt>想怎么验证</dt>
-  <dd>
-    fake LLM 第一轮调 load_skill("react"), 跑完后第二轮 LLM 收到的 system prompt
-    含 react skill 的工具描述; 第三轮 LLM 不再调 load_skill, 但 react
-    工具仍然可用 (loop 内激活状态保留)。
-  </dd>
-</dl>
-
-<h2 id="observe-first">先观察: 两段故意有气味的实现</h2>
-
-<div class="note">
-  <p class="note__title">观察 1 · skill 状态写到 module-level 单例</p>
-  <pre class="code-block"><code>// 教学简化版, 仓库真实实现见 GitHub 永久链接
-let activeSkills: string[] = [];
-export function loadSkill(name: string) { activeSkills.push(name); }</code></pre>
-  <p><strong>问:</strong>为什么不直接 module-level 存, 而要放到 loop 闭包?</p>
-  <p>
-    <strong>答:</strong>module-level 会让父 agent 加载的 skill 出现在子智能体
-    工具列表里 (第 04 章的子智能体共享 tool registry, 但 skill 状态也 共享,
-    这就破坏了"按需隔离")。也违反第 04 章工厂模式的本意。
+<ol>
+<li>
+<strong>system prompt 撑到 8000 token</strong>: 30 个工具的 description
+    + JSON Schema 占了大半, 真正"用户场景" 的 system prompt 反而被
+    挤到边角。
+    </li>
+<li>
+<strong>模型被淹没</strong>: 30 个工具摆在面前, 模型选了
+    <code>run_web_search</code> 去搜 React 文档, 而不是用
+    <code>run_read</code> 看本地组件代码。 因为工具描述里
+    <code>run_web_search</code> 写得很显眼 (描述里说"搜任何信息"),
+    <code>run_read</code> 写得很朴素 ("读文件")。
+    </li>
+<li>
+<strong>成本涨 3 倍</strong>: 8000 token 的 system prompt 每轮都重传,
+    即使不相关也要付账。 跑 50 轮对话, 仅 system prompt 就 400K token。
+    </li>
+<li>
+<strong>模型选错工具的频率上升</strong>: 10 个工具时模型选对率 95%,
+    30 个工具时降到 78%。 模型在长列表中迷失, 工具描述写得"有趣"
+    的更容易被选, 但有趣 ≠ 相关。
+    </li>
+</ol>
+<p>
+  朴素想法 1: "压缩工具描述, 每个只写 1 句?"
+  模型看 1 句描述, 跟没看一样, 选错的频率更高。
   </p>
-</div>
-
-<div class="note">
-  <p class="note__title">观察 2 · 工具描述按需拼接字符串</p>
-  <pre class="code-block"><code>// 教学简化版
-let systemPrompt = basePrompt + skillDescriptions.join("");</code></pre>
-  <p><strong>问:</strong>为什么不直接拼字符串?</p>
-  <p>
-    <strong>答:</strong>字符串拼接会让 system prompt 每次调 LLM 都不同, 整个
-    prompt cache 失效, token 成本涨 3-5 倍。正确做法: skill 集合稳定 (例如
-    "react" 一旦加载就一直存在), 工具描述作为稳定 前缀, 不再因 LLM
-    调用而变化。这是第 10 章 cache-friendly 布局 的关键伏笔。
+<p>
+  朴素想法 2: "按 mode 切工具集?"
+  比如"代码 mode" 只给 run_read / run_write / run_edit, "运维 mode" 只给 run_schedule_*。 但
+  mode 切换是用户的负担, 用户不知道当前 mode。
   </p>
-</div>
-
-<h2 id="interfaces">接口形状: 在写实现前钉死</h2>
-<pre class="code-block"><code>// 教学简化版, 仓库真实实现见 GitHub 永久链接
-export interface SkillMeta {
-  name: string;
+<p>
+  正确做法: <strong>Skill 机制</strong>。 平时只暴露 5-10 个核心工具,
+  LLM 看到"我需要 React 工具" 时调 <code>load_skill("react")</code>,
+  harness 临时把 React 相关工具加进当前 loop 的工具列表。 loop
+  结束后激活状态丢弃, 不污染父 agent 或后续 loop。
+  </p>
+<h2 id="what-is-skill">Skill 是什么: 不只是工具子集</h2>
+<p>
+  <strong>用途</strong>: Skill 是<strong>一组工具 + 一段元信息</strong>的封装。
+  包含 3 个要素: skill 名 / description (给 LLM 看的"何时用") / tools 数组
+  (包含哪些工具)。 Skill 本身也是"按需注入能力" 的元机制 — 后面 hook
+  / memory 章节会复用。
+  </p>
+<p>
+  <strong>真实场景</strong>: harness 默认只暴露 5-10 个核心工具。
+  用户说"帮我做一个 React 组件", LLM 调 <code>load_skill("react")</code>,
+  harness 把 <code>use_react_component</code> / <code>react_props</code> 等 5 个
+  工具加进当前 loop 的工具列表。 LLM 看到新工具, 调它们。 loop
+  结束后激活状态丢弃, 不污染父 agent。
+  </p>
+<p>
+  <strong>设计思想</strong>: 经典<strong>按需加载</strong>模式 — 平时只暴露
+  必要的"地基工具", 场景化工具按需加载。 这跟 JavaScript 模块的按需
+  <code>import</code> 一个道理: 启动时一次性 <code>require</code> 所有包会撑爆
+  内存, 按需 import 才省。
+  </p>
+<p>
+  <strong>实现细节</strong>:
+</p>
+<pre class="code-block"><code>interface SkillMeta {
+  name: string;             // "react" / "kubernetes" / "code-review"
   description: string;       // 给 LLM 看: 什么场景用
   tools: string[];            // 该 skill 包含的工具名
+  // 后续可以加 examples / whenToLoad / 依赖关系等
 }
 
-export interface SkillLoader {
+interface SkillLoader {
   get(name: string): SkillMeta | undefined;
   list(): SkillMeta[];
   // 返回该 skill 包含的工具实例
   resolve(name: string): Tool[];
 }
 
-export interface ActiveSkillSet {
+interface ActiveSkillSet {
   // 激活 skill, 返回新增的工具 (供 agent 合并到当前工具列表)
   activate(name: string): Tool[];
   // 当前激活的 skill 名字列表
@@ -134,389 +106,643 @@ <h2 id="interfaces">接口形状: 在写实现前钉死</h2>
   // 当前激活的所有工具 (去重)
   tools(): Tool[];
 }</code></pre>
-
+<p class="source-link"><a href="https://github.com/pingp76/swoopcode/blob/main/src/skills.ts#L1" rel="noreferrer" target="_blank">GitHub · src/skills.ts SkillLoader / ActiveSkillSet 实现 (L1)</a></p>
+<p>
+  3 个接口, 3 个职责:
+</p>
+<ol>
+<li>
+<strong><code>SkillLoader</code></strong>: 从配置目录 (如 <code>~/.skills/</code>)
+    加载所有 skill, 静态数据。 启动后不变。
+  </li>
+<li>
+<strong><code>ActiveSkillSet</code></strong>: 当前 loop 内激活的 skill 集合。
+    闭包内状态, 父子 agent 各自一份。
+  </li>
+<li>
+<strong>LLM 调 <code>load_skill(name)</code></strong>: 把 skill 加进
+    ActiveSkillSet, 工具列表更新。 这是普通的 tool call, 走
+    第 02 章的 tool_call 协议。
+  </li>
+</ol>
+<h2 id="three-invariants">3 条不变量</h2>
+<dl class="defs">
+<dt>不变量 1 · skill 激活状态是 loop 内的临时态, 随 agent 实例销毁而丢弃</dt>
+<dd>
+    ActiveSkillSet 存在 agent 实例的闭包里, 不放 module-level。
+    父 agent 加载的 skill 不出现在子智能体工具列表 (第 04 章
+    父子隔离的延伸)。
+  </dd>
+<dt>不变量 2 · 同一个 loop 内可激活多个 skill, 工具集取并集</dt>
+<dd>
+    LLM 调 <code>load_skill("react")</code> 后再调
+    <code>load_skill("typescript")</code>, 两个 skill 的工具合并
+    进当前工具列表。 不需要"先 deactivate 再 activate", 简单加并集。
+  </dd>
+<dt>不变量 3 · 工具描述在 LLM 视角是稳定前缀</dt>
+<dd>
+    skill 集合一旦确定, 工具描述就稳定 (skill 是稳定集合, 不是动态拼字符串)。
+    不会因为 LLM 调用某个工具而触发工具描述变化, 不会破坏 prompt cache
+    (第 10 章的伏笔)。
+  </dd>
+</dl>
+<h2 id="naive">朴素反例: 状态写到 module-level 单例</h2>
+<p>
+  朴素实现: skill 激活状态写 module-level。
+</p>
+<pre class="code-block"><code>// ❌ 反例: module-level 单例
+let activeSkills: string[] = [];
+export function loadSkill(name: string) { activeSkills.push(name); }</code></pre>
+<p class="source-link"><a href="https://github.com/pingp76/swoopcode/blob/main/src/skills.ts#L1" rel="noreferrer" target="_blank">GitHub · src/skills.ts 反例: module-level 单例 (L1)</a></p>
+<p>
+  立刻坏 3 件事:
+</p>
+<ol>
+<li>
+<strong>父子污染</strong>: 父 agent 加载 "react" skill, spawn 子智能体,
+    子智能体工具列表里出现 react 工具。 第 04 章的父子隔离被破坏。
+    </li>
+<li>
+<strong>多 agent 互相干扰</strong>: 多个 agent 实例共享 activeSkills,
+    一个 agent 加载 skill 另一个 agent 也看到, 互不可见性被破坏。
+    </li>
+<li>
+<strong>无法测试隔离</strong>: 跑完一个测试 activeSkills 还有上次
+    加载的 skill, 下一个测试看到 "ghost skill", 测试 flaky。
+    </li>
+</ol>
+<p>
+  解决方式: <code>ActiveSkillSet</code> 在 <code>createAgent()</code>
+  闭包内创建, 父子天然隔离。
+  </p>
+<h2 id="naive-2">朴素反例 2: 工具描述按需拼字符串</h2>
+<p>
+  朴素实现 2: skill 激活时把工具描述拼到 system prompt 字符串。
+</p>
+<pre class="code-block"><code>// ❌ 反例: 拼字符串
+let systemPrompt = basePrompt + skillDescriptions.join("");</code></pre>
+<p>
+  这条更危险 — 破坏 prompt cache。 解释:
+</p>
+<ol>
+<li>
+<strong>cache 命中率归零</strong>: skill 集合一变 system prompt 就变, 即使只是
+    多加载了一个 skill, 整个 system prompt 前缀都对不上, cache 不命中。
+  </li>
+<li>
+<strong>成本涨 5-10 倍</strong>: 100 轮对话, cache 命中时 $1.50, 拼
+    上去导致 cache 失效后 $7.50-$15.00。
+    </li>
+<li>
+<strong>cache-debug 无法追踪</strong>: 第 10 章的 stable prefix hash
+    算法假设 system prompt 稳定, 一旦拼字符串, hash 永远在变。
+    </li>
+</ol>
+<p>
+  正确做法: skill 工具描述<strong>作为独立的 tool definition 数组</strong>
+  传给 LLM, 不拼到 system prompt。 LLM 通过 <code>tool_calls</code> 字段
+  调工具, 工具描述在 LLM 视角是稳定的 stable prefix。
+  </p>
 <h2 id="loop-integration">loop 接入: skill 激活是 loop 内的临时态</h2>
 <p>
-  关键设计: <code>ActiveSkillSet</code> 是 agent 实例闭包内的状态, 不是
-  module-level。loop 跑完一次 run() 之后, 状态保留 (LLM 在同一 run 内 多次
-  load_skill 是合理的); 但 agent 实例被销毁时, 状态自然消失。 子智能体拿不到父
-  agent 的 skill 状态 (第 04 章隔离)。
+  <strong>用途</strong>: 把 SkillLoader 接入 agent.run, 让 LLM 调 load_skill
+  后能立刻看到新工具。 这是"按需加载" 在 agent 里的具体应用。
+  </p>
+<p>
+  <strong>真实场景</strong>: LLM 跑一段对话, 中途调 <code>load_skill("react")</code>,
+  下一轮 LLM 看到 <code>use_react_component</code> 工具, 调它。 激活状态
+  留在 agent 实例的闭包里, 同一实例的下一个 run 仍可用, agent 销毁时才丢弃。
+  </p>
+<p>
+  <strong>设计思想</strong>: 经典<strong>按需加载 + 状态机</strong> 组合 — skill 加载
+  是延迟生效的, 加载后下一轮 chat() 才看到新 tools; 只要 skill 集合不再变化,
+  tools 数组就保持稳定, 不破坏 prompt cache。 这跟第 10 章的 cache 布局
+  思想完全一致。
+  </p>
+<p>
+  <strong>实现细节</strong>:
 </p>
 <pre class="code-block"><code>// 教学简化版, 仓库真实实现见 GitHub 永久链接
-export function createAgent(deps): Agent {
-  const activeSkills = createActiveSkillSet();
-
+export function createAgent(deps: { llm: LLMClient; history: History; tools: ToolRegistry; skillLoader: SkillLoader; activeSkills: ActiveSkillSet }): Agent {
   return {
     async run(query) {
-      history.add({ role: "user", content: query });
+      deps.history.add({ role: "user", content: query });
+
       for (;;) {
         // 当前可用工具 = registry 基础 + 当前激活的 skill 工具
-        const availableTools = [...registry.list(), ...activeSkills.tools()];
-        const messages = history.getMessages();
-        const assistant = await llm.chat({ messages, tools: availableTools });
-        history.add(assistant);
-
-        if (!assistant.tool_calls) return assistant.content;
-
-        for (const call of assistant.tool_calls) {
+        const availableTools = [
+          ...deps.tools.list(),
+          ...deps.activeSkills.tools(),
+        ];
+        const messages = deps.history.getMessages();
+        const assistant = await deps.llm.chat({ messages, tools: availableTools });
+        deps.history.add({
+          role: "assistant",
+          content: assistant.content,
+          tool_calls: assistant.toolCalls,
+        });
+
+        if (assistant.toolCalls.length === 0) return assistant.content ?? "";
+
+        for (const call of assistant.toolCalls) {
           // load_skill 特殊处理: 不真执行, 而是激活 skill
           if (call.name === "load_skill") {
-            const skillName = call.args["name"] as string;
-            activeSkills.activate(skillName);
+            const skillName = call.args["name"];
+            const newTools = deps.activeSkills.activate(skillName);
             history.add({ role: "tool", tool_call_id: call.id,
-              content: `Loaded skill "${skillName}". Tools: ${
-                skillLoader.get(skillName)?.tools.join(", ") ?? "(unknown)"
-              }` });
+              content: `Loaded skill "${skillName}". Tools: ${deps.skillLoader.get(skillName)?.tools.join(", ") ?? "(unknown)"}` });
             continue;
           }
-          // 其它工具正常执行
-          const tool = registry.get(call.name);
-          /* ... */
+          // 其他工具正常执行 (第 02 章的 loop)
+          const result = await deps.tools.invoke(call.name, JSON.parse(call.function.arguments));
+          history.add({ role: "tool", tool_call_id: call.id, content: result.content });
         }
       }
     },
   };
 }</code></pre>
+<p class="source-link"><a href="https://github.com/pingp76/swoopcode/blob/main/src/agent.ts#L1" rel="noreferrer" target="_blank">GitHub · src/agent.ts loop 接入 Skill (L1)</a></p>
 <p>
-  注意: <code>load_skill</code> 不算 "真" 的 tool call, 它激活 skill 后 写一条
-  tool message 告诉 LLM "已加载 X, 可用工具: Y, Z"。下一轮 LLM 调用时,
-  这些工具就在 availableTools 里, LLM 看到 schema 后 可以直接调用。
+  关键点: <code>load_skill</code> 是特殊 tool, 它激活 skill 后写一条
+  <code>role: "tool"</code> 消息告诉 LLM "已加载 X, 可用工具: Y, Z"。
+  下一轮 LLM 调用时, 这些工具就在 availableTools 里, LLM 看到
+  schema 后可以直接调。
+  </p>
+<h2 id="cache-implication">对 prompt cache 的影响: stable vs 动态</h2>
+<p>
+  本章是第 10 章 cache-friendly 布局的关键伏笔。 原则是:
 </p>
-
-<h2 id="cache-implication">对 prompt cache 的影响: 稳定 vs 动态</h2>
-<p>本章是第 10 章 cache-friendly 布局的关键伏笔。原则是:</p>
 <ol>
-  <li>
-    <strong>稳定前缀</strong>: system prompt + 工具描述 (按 skill 集合)。 一旦
+<li>
+<strong>稳定前缀</strong>: system prompt + 工具描述 (按 skill 集合)。 一旦
     skill 集合在一次 run 内确定, 这部分就不变。
   </li>
-  <li>
-    <strong>动态状态</strong>: history messages + reminder。 每轮都变, 但走普通
-    messages, 不进 system prompt。
+<li>
+<strong>动态状态</strong>: history messages + reminder。 每轮都变, 但
+    走普通 messages, 不进 system prompt。
   </li>
 </ol>
 <p>
-  错误做法是把 skill 工具描述拼到 system prompt 字符串里 (观察 2 的反例), 那会让
-  system prompt 每次都变, cache 失效。
+  ❌ / ✅: 错误做法是把 skill 工具描述拼到 system prompt 字符串里
+  (朴素反例 2), 那会让 system prompt 每次都变, cache 失效。
+  正确做法: 工具描述作为 LLM.chat() 的 tools 字段, system prompt
+  保持稳定。
+  </p>
+<p>
+  LLM 视角: LLM 看到 system prompt (稳定) + tools (稳定, 因为 skill
+  集合不变) + messages (动态, 含 tool_call / tool_result / reminder)。
+  真正变的是 messages, 但 LLM provider 通常也缓存 messages 前缀
+  (Anthropic 支持 4 个 breakpoint, 详见第 10 章)。
+  </p>
+<figure class="figure">
+<div class="flow-compare" role="img" aria-label="skill 工具描述作为稳定前缀">
+<div class="flow-compare__col flow-compare__col--bad">
+<div class="flow-compare__label">❌ 拼字符串</div>
+<span class="flow-node">systemPrompt =<br/>base + skillDesc.join</span>
+<span class="flow-node">每轮 system prompt 变</span>
+<span class="flow-node">cache 命中率归零</span>
+<span class="flow-node" style="font-weight: 600;">成本涨 5-10 倍</span>
+</div>
+<div class="flow-compare__col flow-compare__col--good">
+<div class="flow-compare__label">✅ tools 字段</div>
+<span class="flow-node">systemPrompt = base</span>
+<span class="flow-node">skill 集合稳定 → tools 稳定</span>
+<span class="flow-node">cache 命中率高</span>
+<span class="flow-node" style="font-weight: 600;">成本降 5-10 倍</span>
+</div>
+</div>
+<figcaption>图 05-1 · skill 工具描述作为稳定前缀. 拼字符串破坏 cache, tools 字段保持 cache 命中。</figcaption>
+</figure>
+<h2 id="active-skills-closure">ActiveSkillSet 闭包化</h2>
+<p>
+  ActiveSkillSet 必须在 createAgent() 闭包内, 不写 module-level:
 </p>
+<pre class="code-block"><code>// 教学简化版, 仓库真实实现见 GitHub 永久链接
+export function createActiveSkillSet(skillLoader: SkillLoader): ActiveSkillSet {
+  const activated = new Map&lt;string, SkillMeta&gt;();  // name → skill
+  const toolIndex = new Map&lt;string, Tool&gt;();        // toolName → Tool (去重)
 
-<h2 id="trap">反例梯度</h2>
+  return {
+    activate(name: string): Tool[] {
+      const meta = skillLoader.get(name);
+      if (!meta) return [];
+      activated.set(name, meta);
+      const resolved = skillLoader.resolve(name);
+      const newly: Tool[] = [];
+      for (const tool of resolved) {
+        if (!toolIndex.has(tool.name)) {
+          toolIndex.set(tool.name, tool);
+          newly.push(tool);
+        }
+      }
+      return newly;   // 返回新增的工具 (供 agent 调试用)
+    },
+    names(): string[] { return [...activated.keys()]; },
+    tools(): Tool[] { return [...toolIndex.values()]; },
+  };
+}</code></pre>
+<p class="source-link"><a href="https://github.com/pingp76/swoopcode/blob/main/src/skills.ts#L1" rel="noreferrer" target="_blank">GitHub · src/skills.ts createActiveSkillSet 完整实现 (L1)</a></p>
+<p>
+  关键: <code>toolIndex</code> 用 Map 而不是 Array, 天然去重。 同名工具
+  在两个 skill 出现时, 先注册的优先 (而不是报错或覆盖)。
+  </p>
+<p>
+  ❌ / ✅ 写法:
+</p>
+<pre class="code-block"><code>// ❌ 激活时用 Array.includes 检查去重 (O(n))
+function activate(name) {
+  const resolved = skillLoader.resolve(name);
+  for (const t of resolved) {
+    if (!tools.find(x =&gt; x.name === t.name)) tools.push(t);
+  }
+}
 
+// ✅ 用 Map 去重 (O(1))
+function activate(name) {
+  for (const t of resolved) {
+    if (!toolIndex.has(t.name)) toolIndex.set(t.name, t);
+  }
+}</code></pre>
+<p>
+  功能上等价, 但 10 个 skill 30 个工具时, 数组逐个 find 的写法总共 O(n²),
+  最坏要比较几百次, 每次 O(1) 的 Map 只查 30 次。 测试上看不出, 但 LLM
+  调 100 个 skill 加载时, 性能差异明显。
+  </p>
+<h2 id="loop-boundary">loop 边界: 不变量 1 的实际意义</h2>
+<p>
+  "skill 激活状态是 loop 内的临时态" 翻译成代码:
+</p>
+<pre class="code-block"><code>// 教学简化版
+export function createAgent(deps) {
+  // 闭包内 activeSkills, 父子隔离天然成立
+  const activeSkills = createActiveSkillSet(deps.skillLoader);
+
+  return {
+    async run(query) {
+      // ... run 内可调 load_skill, 状态保留
+      // run 跑完, activeSkills 仍存在 (下次 run() 复用)
+      // agent 实例被销毁时, activeSkills 自然消失
+    },
+  };
+}</code></pre>
+<p>
+  注意: <strong>不是"每次 run() 重置 activeSkills"</strong>。 同一个
+  agent 实例的多次 run() 共享 activeSkills — 这是合理的, 用户
+  在 run 1 加载了 "react" skill, run 2 想继续用, 不需要重新加载。
+  状态仅在 agent 实例被销毁时清空, 而 agent 实例通常在用户退出
+  REPL 时销毁。
+  </p>
+<figure class="figure">
+<div class="flow-map" role="img" aria-label="skill 激活生命周期">
+<div class="flow-row--center">
+<span class="flow-node">agent.run() 1</span>
+<span class="flow-arrow">→</span>
+<span class="flow-node flow-node--accent">load_skill("react")</span>
+<span class="flow-arrow">→</span>
+<span class="flow-node">用 react 工具</span>
+</div>
+<div class="flow-row--center" style="margin-top: var(--space-3)">
+<span class="flow-node">agent.run() 2</span>
+<span class="flow-arrow">→</span>
+<span class="flow-node flow-node--accent">react 工具仍可用</span>
+<span class="flow-arrow">→</span>
+<span class="flow-node">用 react 工具</span>
+</div>
+<div class="flow-row--center" style="margin-top: var(--space-3)">
+<span class="flow-node">REPL 退出 / agent 销毁</span>
+<span class="flow-arrow">→</span>
+<span class="flow-node">activeSkills 闭包销毁</span>
+</div>
+</div>
+<figcaption>图 05-2 · skill 激活生命周期. 同一 agent 实例跨 run() 共享 activeSkills, agent 销毁时彻底清空。</figcaption>
+</figure>
+<h2 id="load-skill-tool">load_skill 工具的特殊设计</h2>
+<p>
+  <code>load_skill</code> 是特殊 tool, 它不在 ToolRegistry 里, 而是
+  agent.run 内部特殊处理。 原因:
+</p>
+<ol>
+<li>
+<strong>load_skill 不应该被普通 tool invoke 路径处理</strong>:
+    普通工具是 IO 操作 (读文件、跑命令), load_skill 是"修改
+    agent 自身状态"。 两条路径语义不同。
+  </li>
+<li>
+<strong>load_skill 必须写 tool message</strong>: 跟其他工具一样, 写
+    一条 <code>role: "tool"</code> 消息告诉 LLM "skill 已加载"。
+    不写会让 messages 序列断裂 (第 02 章的教训)。
+  </li>
+<li>
+<strong>load_skill 的 content 描述可用工具</strong>: 不是 "OK",
+    而是 "已加载 X, 可用工具: Y, Z"。 让 LLM 知道下一步能调
+    哪些工具。
+  </li>
+</ol>
+<p>
+  ❌ / ✅: 错误做法是 load_skill 激活后不写 tool message。
+</p>
+<pre class="code-block"><code>// ❌ load_skill 激活了, 但没写 tool message
+if (call.name === "load_skill") {
+  activeSkills.activate(call.args.name);
+  continue;   // 跳到下一个 tool call, 没 history.add(tool)
+}
+// 下一次 chat() 时, LLM 看到一个 assistant 调了 load_skill 但
+// 没收到 tool result, 模型困惑
+
+// ✅ load_skill 激活后必写 tool message
+if (call.name === "load_skill") {
+  activeSkills.activate(call.args.name);
+  history.add({ role: "tool", tool_call_id: call.id, content: "Loaded skill X. Tools: Y, Z" });
+  continue;
+}</code></pre>
+<h2 id="fake-test">fake LLM 测试: skill 加载 + 同 skill 多次去重</h2>
+<p>
+  <strong>用途</strong>: 验证 skill 加载和去重逻辑的 fake 测试。 不依赖
+  真实 skill loader, 跑得快, 跑得稳。
+  </p>
+<p>
+  <strong>设计思想</strong>: 经典<strong>fake LLM + fake loader</strong> 双 fake 套路 —
+  模拟 LLM 调 load_skill, 验证 activeSkills 状态正确, 验证下一轮
+  chat() 的 tools 字段含新工具。
+  </p>
+<p>
+  <strong>实现细节</strong>:
+</p>
+<pre class="code-block"><code>test("load_skill 后, 工具列表含 skill 工具", async () =&gt; {
+  const skillLoader = createSkillLoader({
+    react: {
+      name: "react",
+      description: "React 组件工具集",
+      tools: ["use_react_component"],
+    },
+  });
+  skillLoader.register("use_react_component", { name: "use_react_component", description: "...", parameters: {} }, async (args) =&gt; ({ content: "OK" }));
+
+  const activeSkills = createActiveSkillSet(skillLoader);
+  const fakeLLM = createFakeLLM([
+    // 第一次: LLM 调 load_skill
+    { content: null, toolCalls: [{ id: "s1", function: { name: "load_skill", arguments: '{"name":"react"}' } }], finishReason: "tool_calls" },
+    // 第二次: 调 use_react_component
+    { content: null, toolCalls: [{ id: "t1", function: { name: "use_react_component", arguments: '{}' } }], finishReason: "tool_calls" },
+    // 第三次: 回答
+    { content: "Done.", toolCalls: [], finishReason: "stop" },
+  ]);
+
+  const agent = createAgent({ llm: fakeLLM, history, tools: emptyRegistry, skillLoader, activeSkills });
+  await agent.run("Make a React component");
+
+  // 验证: 第二次 chat() 时, tools 字段含 use_react_component
+  const secondCallTools = fakeLLM.allCalls()[1].tools;
+  expect(secondCallTools.map(t =&gt; t.function.name)).toContain("use_react_component");
+});
+
+test("同 skill 多次加载去重", async () =&gt; {
+  const skillLoader = createSkillLoader({ react: { name: "react", tools: ["use_react_component"] } });
+  const activeSkills = createActiveSkillSet(skillLoader);
+
+  activeSkills.activate("react");
+  activeSkills.activate("react");
+  activeSkills.activate("react");
+
+  // 工具列表只有 1 个 use_react_component, 不重复
+  expect(activeSkills.tools().length).toBe(1);
+});</code></pre>
+<p class="source-link"><a href="https://github.com/pingp76/swoopcode/blob/main/src/skills.test.ts#L1" rel="noreferrer" target="_blank">GitHub · src/skills.test.ts skill 加载 + 去重测试 (L1)</a></p>
+<p>
+  这两条测试如果挂, 你立刻知道 skill 加载或去重逻辑坏了。
+  </p>
+<h2 id="common-confusion">3 个常见误解</h2>
+<dl class="defs">
+<dt>误解 1 · "Skill 应该进 system prompt 才有保证"</dt>
+<dd>
+    错。 Skill 工具描述走 <code>LLM.chat()</code> 的 tools 字段,
+    不进 system prompt。 跟 TODO 走 reminder 一样的原理:
+    功能上等价, 性能上 (prompt cache) 不同。
+  </dd>
+<dt>误解 2 · "Skill 之间互斥"</dt>
+<dd>
+    错。 Skill 是<strong>并集</strong>, 不是互斥。 LLM 调多个 skill,
+    工具列表合并。 "react" + "typescript" 两个 skill 共存。
+    只有同名工具会去重, 不是整个 skill 互斥。
+  </dd>
+<dt>误解 3 · "Skill 加载是 hot reload"</dt>
+<dd>
+    错。 skill 配置在 <code>~/.skills/</code> 目录, agent 启动时
+    <code>skillLoader.scan()</code> 加载一次。 加载后修改 skill 文件,
+    需要重启 agent 才生效。 "hot reload" 看似灵活, 实际引入
+    一致性问题 (LLM 看到 A, 你改完, 下一轮看到 B, 行为漂移)。
+  </dd>
+</dl>
+<h2 id="trap">反例梯度</h2>
 <div class="card card--trap">
-  <div class="card__head">
-    <span class="card__tag">新手错法 · A</span>
-  </div>
-  <div class="card__body">
-    <p><strong>常见错误:</strong>skill 激活状态写到 module-level 单例。</p>
-    <p>
-      <strong>为什么错:</strong>第 04 章父子隔离被破坏, 子智能体看到父 agent
-      加载的 skill。
+<div class="card__head">
+<span class="card__tag">新手错法 · A</span>
+</div>
+<div class="card__body">
+<p>
+<strong>常见错误:</strong> skill 激活状态写到 module-level 单例。
+    </p>
+<p>
+<strong>为什么错:</strong> 第 04 章父子隔离被破坏, 子智能体看到父
+      agent 加载的 skill。
     </p>
-    <p>
-      <strong>正确做法:</strong>activeSkills 在 createAgent() 闭包内, 工厂模式。
+<p>
+<strong>正确做法:</strong> activeSkills 在 <code>createAgent()</code> 闭包内,
+      工厂模式。
     </p>
-  </div>
 </div>
-
+</div>
 <div class="card card--trap">
-  <div class="card__head">
-    <span class="card__tag">中级错法 · B</span>
-  </div>
-  <div class="card__body">
-    <p><strong>常见错误:</strong>工具描述拼到 system prompt 字符串。</p>
-    <p>
-      <strong>为什么错:</strong>破坏第 10 章 cache-friendly 布局, 每次调 LLM
+<div class="card__head">
+<span class="card__tag">中级错法 · B</span>
+</div>
+<div class="card__body">
+<p>
+<strong>常见错误:</strong> 工具描述拼到 system prompt 字符串。
+    </p>
+<p>
+<strong>为什么错:</strong> 破坏第 10 章 cache-friendly 布局, 每次调 LLM
       都要重传所有 skill 工具描述。
     </p>
-    <p>
-      <strong>正确做法:</strong>工具描述作为 LLM.chat() 的 tools 字段, system
+<p>
+<strong>正确做法:</strong> 工具描述作为 LLM.chat() 的 tools 字段, system
       prompt 保持稳定。
     </p>
-  </div>
 </div>
-
+</div>
 <div class="card card--trap">
-  <div class="card__head">
-    <span class="card__tag">高级错法 · C</span>
-  </div>
-  <div class="card__body">
-    <p><strong>常见错误:</strong>load_skill 工具调成功但没写 tool message。</p>
-    <p>
-      <strong>为什么错:</strong>第 02 章的 tool_call_id 配对约束, 不写 tool
+<div class="card__head">
+<span class="card__tag">高级错法 · C</span>
+</div>
+<div class="card__body">
+<p>
+<strong>常见错误:</strong> load_skill 工具调成功但没写 tool message。
+    </p>
+<p>
+<strong>为什么错:</strong> 第 02 章的 tool_call_id 配对约束, 不写 tool
       message 会让 messages 序列断裂。
     </p>
-    <p>
-      <strong>正确做法:</strong>load_skill 激活后, 必写一条 role: "tool" 消息,
+<p>
+<strong>正确做法:</strong> load_skill 激活后, 必写一条 role: "tool" 消息,
       content 描述已加载的工具列表。
     </p>
-  </div>
 </div>
-
+</div>
 <div class="card card--trap">
-  <div class="card__head">
-    <span class="card__tag">边界错法 · D</span>
-  </div>
-  <div class="card__body">
-    <p><strong>常见错误:</strong>同一个 skill 加载多次, 工具集出现重复。</p>
-    <p>
-      <strong>为什么错:</strong>availableTools 出现重复项, LLM 看到两个相同 name
+<div class="card__head">
+<span class="card__tag">边界错法 · D</span>
+</div>
+<div class="card__body">
+<p>
+<strong>常见错误:</strong> 同一个 skill 加载多次, 工具集出现重复。
+    </p>
+<p>
+<strong>为什么错:</strong> availableTools 出现重复项, LLM 看到两个相同 name
       的工具, 困惑。
     </p>
-    <p>
-      <strong>正确做法:</strong>activeSkills.tools() 用 Map 去重,
+<p>
+<strong>正确做法:</strong> activeSkills.tools() 用 Map 去重,
       同名工具取先注册的版本。
     </p>
-  </div>
 </div>
-
+</div>
 <h2 id="validate">如何验证 (本章 Validation 卡片)</h2>
 <div class="card card--validation">
-  <div class="card__head">
-    <span class="card__tag">Validation · 第 05 章</span>
-  </div>
-  <div class="card__body">
-    <p>
-      <strong>load_skill 激活工具:</strong>fake LLM 第一轮调
-      load_skill("react"), 跑完后第二轮 LLM 收到的 tools 字段含 react skill
-      的工具 (例如 "use_react_component")。
+<div class="card__head">
+<span class="card__tag">Validation · 第 05 章</span>
+</div>
+<div class="card__body">
+<p>
+<strong>load_skill 激活工具:</strong> fake LLM 第一轮调
+      <code>load_skill("react")</code>, 跑完后第二轮 LLM 收到的 tools 字段
+      含 react skill 的工具 (例如 "use_react_component")。
     </p>
-    <p>
-      <strong>tool message 必写:</strong>load_skill 调用后, history 末尾 出现
-      role: "tool" 消息, content 描述已加载的工具列表。
+<p>
+<strong>tool message 必写:</strong> load_skill 调用后, history 末尾
+      出现 role: "tool" 消息, content 描述已加载的工具列表。
     </p>
-    <p>
-      <strong>同 skill 多次加载去重:</strong>fake LLM 连续调 load_skill("react")
+<p>
+<strong>同 skill 多次加载去重:</strong> fake LLM 连续调 load_skill("react")
       两次, availableTools 不出现重复项。
     </p>
-    <p>
-      <strong>loop 结束不污染父:</strong>父 agent 调 load_skill 后 spawn
+<p>
+<strong>loop 结束不污染父:</strong> 父 agent 调 load_skill 后 spawn
       子智能体, 子智能体的 availableTools 不含 react skill 工具 (第 04
       章父子隔离, 本章沿用)。
     </p>
-    <p>
-      <strong>未知 skill 不崩:</strong>load_skill("不存在"), 写 error tool
+<p>
+<strong>未知 skill 不崩:</strong> load_skill("不存在"), 写 error tool
       message, 继续 loop。
     </p>
-  </div>
 </div>
-
-<h2 id="lookback">回望第 00–04 章: 哪些原则在本章兑现了</h2>
+</div>
+<h2 id="lookback">回望: 哪些原则在本章兑现了</h2>
 <ul>
-  <li>
-    <strong>工厂模式再次回报:</strong>activeSkills 在 createAgent() 闭包内,
-    父子隔离自然成立。
-  </li>
-  <li>
-    <strong>tool call 协议沿用:</strong>load_skill 是普通工具, 复用 Tool
-    interface, 激活后写 tool message 满足协议。
-  </li>
-  <li>
-    <strong>稳定前缀原则:</strong>工具描述作为 LLM.chat() 的 tools 字段, 不污染
-    system prompt 字符串, 为第 10 章做准备。
-  </li>
-  <li>
-    <strong>不污染 history 的稳定结构:</strong>skill 状态本身不进
-    history.getMessages(), 只在 loop 闭包内。
-  </li>
+<li>
+<strong>工厂模式再次回报:</strong> activeSkills 在 <code>createAgent()</code> 闭包内,
+      父子隔离自然成立。
+    </li>
+<li>
+<strong>tool call 协议沿用:</strong> load_skill 在协议层面就是一次普通的 tool call
+      (只是由 agent.run 特殊处理), 激活后写 tool message 满足协议。
+    </li>
+<li>
+<strong>稳定前缀原则:</strong> 工具描述作为 LLM.chat() 的 tools 字段, 不污染
+      system prompt 字符串, 为第 10 章做准备。
+    </li>
+<li>
+<strong>不污染 history 的稳定结构:</strong> skill 状态本身不进
+      history.getMessages(), 只在 loop 闭包内。
+    </li>
 </ul>
-
 <h2 id="forward">前瞻张力: 留给后续章节</h2>
 <dl class="defs">
-  <dt>skill 加载后内容太长</dt>
-  <dd>
-    第 06 章 compress 会处理"skill 工具返回内容太大" 的情况, 例如 read_skill
+<dt>skill 加载后内容太长</dt>
+<dd>
+    第 06 章 compress 会处理"skill 工具返回内容太长" 的情况, 例如 read_skill
     工具一次返回整个 README, 需要 P1 即时压缩。
-  </dd>
-  <dt>skill 跨会话保留</dt>
-  <dd>
+    </dd>
+<dt>skill 跨会话保留</dt>
+<dd>
     第 09 章 memory 区分"会话内 skill" (本章的) 和"用户级偏好 skill"
     (例如"用户偏好 React 不用 Vue"), 后者需要持久化。
-  </dd>
-  <dt>工具描述前缀稳定性</dt>
-  <dd>
+    </dd>
+<dt>工具描述前缀稳定性</dt>
+<dd>
     第 10 章 cache-friendly 布局会展开, skill 集合一旦在 run() 内确定就稳定,
-    工具描述作为稳定前缀。
-  </dd>
-  <dt>skill 加载权限</dt>
-  <dd>
+      工具描述作为稳定前缀。
+    </dd>
+<dt>skill 加载权限</dt>
+<dd>
     第 07 章 permission 会扩展, load_skill 本身可能需要权限 (例如 "deploy" skill
     是受限的)。
-  </dd>
+    </dd>
 </dl>
-
-<h2 id="vibe-coding-05">本次如何 vibe code: 第 05 章的三件套</h2>
-
-<h3 id="vibe-feed-05">拆卡: 4 轮迭代的具体产物</h3>
-<ol>
-  <li>
-    <strong>第 1 轮 · 接口</strong>。让 LLM 给出 <code>SkillMeta</code> /
-    <code>SkillLoader</code> / <code>ActiveSkillSet</code> 三个 interface, 以及
-    skill 配置文件结构 (例如 <code>~/.skills/react.json</code> 列出
-    name/description/tools)。本轮不写实现。
-  </li>
-  <li>
-    <strong>第 2 轮 · 接线</strong>。让 LLM 给出 <code>index.ts</code> 接线,
-    <code>createSkillLoader()</code> 从配置目录加载所有 skill。本轮 review 重点:
-    skillLoader 实例在 <code>index.ts</code> 只 new 一次。
-  </li>
-  <li>
-    <strong>第 3 轮 · 边界</strong>。让 LLM 写 createActiveSkillSet + load_skill
-    工具 + agent.ts 接入。本轮 review 重点: activeSkills 必须在 createAgent()
-    闭包内 (不是 module-level), load_skill 必写 tool message。
-  </li>
-  <li>
-    <strong>第 4 轮 · 验证</strong>。让 LLM 写 <code>test/skill.test.ts</code> +
-    <code>test/agent.skill.test.ts</code>。本轮 review 重点: "同 skill
-    多次加载去重"和"loop 结束不污染父" 两条必须有反向断言。
-  </li>
-</ol>
-
-<h3 id="vibe-review-05">Review: 第 05 章专属 checklist</h3>
-<ol>
-  <li>
-    <strong>activeSkills 是闭包状态, 不是 module-level。</strong>验证:
-    <code>grep -n 'let activeSkills\|^let active' src/</code> 应当 0 行, 只能在
-    createAgent() 函数体内出现。
-  </li>
-  <li>
-    <strong>工具描述不拼到 system prompt 字符串。</strong>验证:
-    <code>grep -n 'systemPrompt.*\+.*skill\|systemPrompt.*join' src/</code> 应当
-    0 行。
-  </li>
-  <li>
-    <strong>load_skill 必写 tool message。</strong>验证: 工具 execute
-    (或等价分支) 末尾有 <code>history.add({role: "tool", ...})</code>。
-  </li>
-  <li>
-    <strong>同名工具去重。</strong>验证:
-    <code>grep -n 'Map' src/skills.ts</code> 在 <code>tools()</code> 实现内 ≥ 1
-    行。
-  </li>
-  <li>
-    <strong>父子 skill 隔离。</strong>验证: 子智能体的 availableTools 不含父
-    agent 加载的 skill。第 04 章 + 本章的检查点合并。
-  </li>
-</ol>
-
-<h3 id="vibe-debug-05">调试: 第 05 章典型伪装</h3>
-<ol>
-  <li>
-    <strong>伪装 A · 工具描述拼到 system prompt 字符串。</strong>症状:
-    <code>systemPrompt + skillDesc</code> 写法。验证: Validation 卡片"load_skill
-    激活工具" 必须断言 "tools 字段含 react 工具", 而不是 "system prompt 含 react
-    描述"。
-  </li>
-  <li>
-    <strong>伪装 B · 工具去重用 Array.includes 而非 Map。</strong>症状:
-    <code>if (!tools.includes(t)) tools.push(t)</code>。功能上能跑, 但 O(n²)
-    性能差。验证: 第 12 章性能测试可能间接抓到, 本章不写显式测试。
-  </li>
-  <li>
-    <strong>伪装 C · load_skill 调成功但没写 tool message。</strong>症状:
-    activeSkills 激活了, 但 messages 序列断裂。验证: Validation 卡片"tool
-    message 必写" 那条断言, 必须
-    <code
-      >history.getMessages().filter(m =&gt; m.role === "tool").length ===
-      1</code
-    >。
-  </li>
-</ol>
-
-<h3 id="vibe-iterate-05">迭代: 第 05 章 4 个 commit 节点</h3>
-<ol>
-  <li>
-    <code
-      >feat(ch05): 钉 SkillMeta / SkillLoader / ActiveSkillSet
-      接口与配置结构</code
-    >
-    —— tsc 通过, 无实现。
-  </li>
-  <li>
-    <code>feat(ch05): createSkillLoader 工厂 + load_skill 工具 stub</code> ——
-    tsc 通过, agent.run 仍未接入。
-  </li>
-  <li>
-    <code
-      >feat(ch05): activeSkills 闭包化 + load_skill 写 tool message + 去重</code
-    >
-    —— 跑通 Validation 卡片前 4 条。
-  </li>
-  <li><code>test(ch05): 父子 skill 隔离 + 未知 skill 不崩</code> —— 全绿。</li>
-</ol>
-
 <h2 id="prompt-card">Prompt Card (本章任务)</h2>
 <div class="card card--prompt">
-  <div class="card__head">
-    <span class="card__tag">Prompt Card · 第 05 章</span>
-    <button class="card__copy" type="button" data-copy-card>复制</button>
-  </div>
-  <div class="card__body">
-    <p>
-      <strong>目标:</strong>实现 Skill 加载机制, agent 可按需激活工具子集,
+<div class="card__head">
+<span class="card__tag">Prompt Card · 第 05 章</span>
+<button class="card__copy" data-copy-card="" type="button">复制</button>
+</div>
+<div class="card__body">
+<p>
+<strong>目标:</strong> 实现 Skill 加载机制, agent 可按需激活工具子集,
       激活状态在 loop 闭包内临时保留。
     </p>
-    <p>
-      <strong>场景:</strong>用户输入 "帮我做一个 React 组件", agent 调
+<p>
+<strong>场景:</strong> 用户输入 "帮我做一个 React 组件", agent 调
       load_skill("react"), 第二轮 LLM 看到 react 工具 (例如
       use_react_component), 之后在该 run 内持续可用。
     </p>
-    <p>
-      <strong>模块:</strong> <code>src/skills.ts</code> (新) 暴露
+<p>
+<strong>模块:</strong> <code>src/skills.ts</code> (新) 暴露
       <code>createSkillLoader()</code> 和 <code>createActiveSkillSet()</code>;
       <code>src/tools/load_skill.ts</code> (新) 实现工具;
       <code>src/agent.ts</code> createAgent() 维护 activeSkills 闭包;
       <code>src/index.ts</code> 接线 skillLoader。
     </p>
-    <p><strong>边界 (LLM 必须遵守的 checklist):</strong></p>
-    <ul>
-      <li>activeSkills 是 createAgent() 闭包内状态, 不是 module-level</li>
-      <li>load_skill 必写 role: "tool" 消息, content 描述已加载工具列表</li>
-      <li>工具描述不拼到 system prompt 字符串 (走 LLM.chat() 的 tools 字段)</li>
-      <li>同名工具去重 (用 Map 而非 Array.includes)</li>
-      <li>子智能体拿不到父 agent 的 skill 状态 (第 04 章隔离沿用)</li>
-    </ul>
-    <p><strong>验证 (用 fake LLM + fake loader, 逐条落到 vitest):</strong></p>
-    <ul>
-      <li>load_skill("react") 后, 第二轮 LLM 收到的 tools 字段含 react 工具</li>
-      <li>load_skill 调用后, history 末尾出现 role: "tool" 消息</li>
-      <li>连续两次 load_skill("react"), availableTools 不重复</li>
-      <li>
+<p><strong>边界 (LLM 必须遵守的 checklist):</strong></p>
+<ul>
+<li>activeSkills 是 createAgent() 闭包内状态, 不是 module-level</li>
+<li>load_skill 必写 role: "tool" 消息, content 描述已加载工具列表</li>
+<li>工具描述不拼到 system prompt 字符串 (走 LLM.chat() 的 tools 字段)</li>
+<li>同名工具去重 (用 Map 而非 Array.includes)</li>
+<li>子智能体拿不到父 agent 的 skill 状态 (第 04 章隔离沿用)</li>
+</ul>
+<p><strong>验证 (用 fake LLM + fake loader, 逐条落到 vitest):</strong></p>
+<ul>
+<li>load_skill("react") 后, 第二轮 LLM 收到的 tools 字段含 react 工具</li>
+<li>load_skill 调用后, history 末尾出现 role: "tool" 消息</li>
+<li>连续两次 load_skill("react"), availableTools 不重复</li>
+<li>
         父 agent 调 load_skill 后 spawn 子智能体, 子智能体 tools 不含 react 工具
       </li>
-      <li>load_skill("不存在"), 写 error tool message, agent 不抛</li>
-    </ul>
-  </div>
+<li>load_skill("不存在"), 写 error tool message, agent 不抛</li>
+</ul>
+</div>
 </div>
-
 <h2 id="practice">本章练习</h2>
 <ol>
-  <li>
+<li>
     故意把 activeSkills 提到 module-level, 跑测试, 看"父子 skill 隔离"
     是否抓到。
   </li>
-  <li>
+<li>
     把工具描述改成 <code>systemPrompt + skillDesc</code> 拼接写法, 跑测试,
     看是否破坏 system prompt 稳定性 (本节没显式测, 但可以为第 10 章留伏笔)。
   </li>
-  <li>
+<li>
     在 load_skill 末尾忘记写 tool message, 跑测试, 看"tool message 必写"
     是否抓到。
   </li>
 </ol>
-
 <h2 id="summary">本章小结</h2>
 <p>
-  本章给 harness 加了 "Skill" 机制, agent 可按需激活工具子集。 activeSkills 是
-  createAgent() 闭包内的临时态, 父子隔离, 工具描述 走 LLM.chat() 的 tools
-  字段而不是 system prompt 字符串拼接, 为 第 10 章 cache-friendly
-  布局做铺垫。下一章 (第 06 章) 我们处理 "context 撑爆" 的问题——Normalize /
-  Block / Compress, 让 harness 在长任务下保持稳定。
-</p>
-
-<h2 id="next">下一章伏笔</h2>
-<p>
-  第 05 章解决了"工具太多装不下" 的问题, 但 messages 本身仍然会无限 增长
-  (用户多轮对话 + 工具结果累积)。第 06 章会引入三层压缩: Normalize
-  (合并相邻同角色消息) → Block (按消息块分组) → Compress (P1 即时压缩 + P2
-  全量压缩), 让 harness 在长任务下保持稳定的 context 长度。
+  这一章给 harness 加了 "Skill" 机制, agent 可按需激活工具子集。
+  activeSkills 是 createAgent() 闭包内的临时态, 父子隔离, 工具描述
+  走 LLM.chat() 的 tools 字段而不是 system prompt 字符串拼接, 为
+  第 10 章 cache-friendly 布局做铺垫。 下一章 (第 06 章) 我们处理
+  "context 撑爆" 的问题——Normalize / Block / Compress, 让 harness
+  在长任务下保持稳定。
 </p>
+</content>
\ No newline at end of file
diff --git a/tutorial/chapters/06-compress.html b/tutorial/chapters/06-compress.html
index 9d86435..0dfa46d 100644
--- a/tutorial/chapters/06-compress.html
+++ b/tutorial/chapters/06-compress.html
@@ -1,538 +1,742 @@
-<p class="article__eyebrow">第 06 章 · 上下文太长怎么办</p>
-<h1 class="article__title">三层压缩: Normalize / Block / Compress</h1>
+<p class="article__eyebrow">第 06 章 · context 撑爆怎么办</p>
+<h1 class="article__title">压缩: Normalize / Block / Compress 三层防线</h1>
 <p class="article__lede">
-  前面五章让 harness 能聊天、能调工具、能跑子任务、能加载 skill。但 messages
-  列表会随着对话轮次和工具结果无限增长, 最终撑爆模型窗口。这一章给 harness
-  加三层压缩: Normalize (合并相邻同角色消息) → Block (按消息块分组) → Compress
-  (P1 即时压缩 + P2 全量压缩), 让 harness 在长任务下保持稳定。
+  第 05 章的 Skill 让工具数不再撑爆 system prompt, 但 messages 本身
+  仍然会无限增长: 用户多轮对话 + 工具结果累积, 上下文很快超出窗口。
+  这一章引入三层压缩: Normalize (合并相邻同角色消息) → Block
+  (按消息块分组) → Compress (P0 衰减 + P1 即时 + P2 全量),
+  让 harness 在长任务下保持稳定的 context 长度。 读完后, 你能讲清
+  "为什么消息块是压缩的原子单位", 并能用 fake LLM 验证 P0 衰减
+  截断后的消息块不被破坏。
 </p>
-
-<nav id="article-inline-toc" class="article__meta" aria-label="页内小节"></nav>
-
-<hr class="rule" />
-
-<h2 id="delta-from-05">在第 05 章基础上改了什么</h2>
-<p>
-  这一章在 prepareMessages() 流程里加三层压缩管道。原始 messages 从 history
-  读出后, 先 Normalize 合并相邻同角色消息, 再 Block 按"用户输入 + 助手回复 +
-  工具结果" 分组, 最后 Compress 做 P1 即时压缩 (大工具结果存文件 + 占位摘要) 和
-  P2 全量压缩 (超阈值时 LLM 总结早期 messages)。压缩后的 messages 进 LLM, 但
-  history 内部保留的是原始未压缩的 messages, 用于 audit。 对应到代码, 改动集中在
-  4 个文件: <code>src/normalize.ts</code> (新)、
-  <code>src/message-block.ts</code> (新)、<code>src/compressor.ts</code> (新)、
-  <code>src/agent.ts</code> (改 prepareMessages 流程)。
+<nav aria-label="页内小节" class="article__meta" id="article-inline-toc"></nav>
+<hr class="rule"/>
+<h2 id="real-failure">真实失败故事: 50 轮后 LLM 失忆, 100 轮后崩溃</h2>
+<p>
+  写代码之前, 先看一段真实长任务的失败。 跑一段对话: 用户让 agent
+  "分析 50 个文件, 总结常见问题"。
+  </p>
+<ol>
+<li>
+<strong>第 1-10 轮</strong>: 调 <code>run_bash</code> 5 次, 调 <code>run_read</code> 10 次, history 累积 8000 token。
+    LLM 还在跟踪。
+  </li>
+<li>
+<strong>第 25 轮</strong>: history 累积 50000 token, LLM 反应开始慢, 偶尔
+    重复调之前调过的工具。
+  </li>
+<li>
+<strong>第 50 轮</strong>: history 累积 80000 token。 LLM 严重失忆,
+    总结时漏掉 5 个文件, 重复 3 个文件的判断, 修复方案质量大幅下降。
+    </li>
+<li>
+<strong>第 100 轮</strong>: history 累积到 200K token, 单次 prompt 超过窗口,
+    LLM SDK 报 400 错误, agent loop 整个崩。
+  </li>
+</ol>
+<p>
+  朴素想法 1: "让 LLM 写总结, 把之前的工具结果总结成一段?"
+  能缓解, 但不根治 — 总结本身也要占 token, 总结会丢失细节, LLM
+  基于总结给出的判断质量下降。
+  </p>
+<p>
+  朴素想法 2: "压缩工具结果?"
+  压缩针对"单条工具结果太长", 不解决"多步任务的状态丢失" 问题。
+  </p>
+<p>
+  正确做法: <strong>三层压缩</strong>。 P0 衰减 (轻量, 每轮自动跑) +
+  P1 即时 (单 tool result 太大, 立刻压缩) + P2 全量 (超阈值, 整段
+  总结)。 三层互补, 各管各的边界。
+  </p>
+<h2 id="three-layers">三层压缩的边界</h2>
+<p>
+  三层不是"3 个选项", 是"3 道防线, 各管各的时机":
+</p>
+<table class="terms">
+<thead>
+<tr>
+<th>层</th>
+<th>何时触发</th>
+<th>压缩什么</th>
+<th>谁负责</th>
+<th>是否可逆</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>P0 衰减</strong></td>
+<td>每轮 LLM 调用前自动</td>
+<td>超过 decayThreshold 的旧 tool result 截断</td>
+<td>compressor 内部</td>
+<td>不可逆 (丢掉细节)</td>
+</tr>
+<tr>
+<td><strong>P1 即时</strong></td>
+<td>单 tool result 超过阈值</td>
+<td>大 tool result 写入 OutputStore, 返回 preview + output_id</td>
+<td>tool executor 内部</td>
+<td>可逆 (通过 output_id 重读)</td>
+</tr>
+<tr>
+<td><strong>P2 全量</strong></td>
+<td>整个 history 超过阈值</td>
+<td>早期 K 块整体压缩成 1 段 summary</td>
+<td>compressor 内部 + LLM 总结</td>
+<td>不可逆 (总结会丢细节)</td>
+</tr>
+</tbody>
+</table>
+<p>
+  3 个关键差异:
+</p>
+<ol>
+<li>
+<strong>触发时机</strong>: P0 每轮自动, P1 工具返回时, P2 整个 history 超阈值。
+    各管各的, 不重叠。
+  </li>
+<li>
+<strong>压缩粒度</strong>: P0 单 tool result 截断, P1 单 tool result 存文件,
+    P2 整段 history 总结。 粒度递增。
+  </li>
+<li>
+<strong>是否可逆</strong>: P0 不可逆, P1 可逆 (通过 output_id 重读),
+    P2 不可逆。 这是有意的设计 — P1 给用户"后悔" 的机会, P0/P2
+    默认不可逆。
+  </li>
+</ol>
+<h2 id="block-primitive">消息块: 压缩的原子单位</h2>
+<p>
+  在讲 P0/P1/P2 之前, 必须先讲<strong>消息块</strong> — 压缩的原子单位。
+  </p>
+<p>
+  朴素想法: "把消息按 token 数切片, 每 4000 token 一段。"
+  立刻坏: 一段可能切到 <code>tool_use</code> 块中间, 留下 <code>tool_call</code>
+  没有 <code>tool_result</code>, messages 序列错乱。
+  </p>
+<p>
+  正确做法: 消息块是"语义完整" 的最小单位, 压缩时<strong>整块处理</strong>。
+  </p>
+<pre class="code-block"><code>type Block =
+  | { kind: "text"; messages: Message[] }                      // 纯文本对话
+  | { kind: "tool_use"; messages: Message[] }                   // 工具调用 + tool_result 配对
+  | { kind: "summary"; messages: Message[] };                   // P2 总结块</code></pre>
+<p class="source-link"><a href="https://github.com/pingp76/swoopcode/blob/main/src/message-block.ts#L1" rel="noreferrer" target="_blank">GitHub · src/message-block.ts groupToBlocks / flattenToMessages (L1)</a></p>
+<p>
+  3 个块类型的边界:
 </p>
-<div class="source-links" aria-label="本章 GitHub 永久链接">
-  <a
-    class="source-link"
-    href="https://github.com/pingp76/swoopcode/blob/main/src/normalize.ts"
-    target="_blank"
-    rel="noreferrer"
-    >1. src/normalize.ts: 合并相邻同角色 (新)</a
-  >
-  <a
-    class="source-link"
-    href="https://github.com/pingp76/swoopcode/blob/main/src/message-block.ts"
-    target="_blank"
-    rel="noreferrer"
-    >2. src/message-block.ts: 按消息块分组 (新)</a
-  >
-  <a
-    class="source-link"
-    href="https://github.com/pingp76/swoopcode/blob/main/src/compressor.ts"
-    target="_blank"
-    rel="noreferrer"
-    >3. src/compressor.ts: P1 即时 + P2 全量压缩 (新)</a
-  >
-  <a
-    class="source-link"
-    href="https://github.com/pingp76/swoopcode/blob/main/src/agent.ts"
-    target="_blank"
-    rel="noreferrer"
-    >4. src/agent.ts: prepareMessages 串起三层管道</a
-  >
+<ol>
+<li>
+<strong>text 块</strong>: 只有 user + assistant 无 tool_calls 的对话。
+    普通聊天场景, 多个连续 user/assistant 消息可以合并。
+    </li>
+<li>
+<strong>tool_use 块</strong>: assistant 含 tool_calls + 所有对应的
+    tool 消息。 这是不可分割的最小单位 — 拆开会破坏 tool_call_id
+    配对。
+    </li>
+<li>
+<strong>summary 块</strong>: P2 压缩产生的总结, 一段文字。 后续 P0
+    衰减不再处理 summary 块 (它本身就是压缩过的)。
+    </li>
+</ol>
+<figure class="figure">
+<div class="flow-tree" role="img" aria-label="消息块类型">
+<div class="flow-tree__children" style="border: 1px solid var(--color-border-soft); border-radius: var(--radius-md); padding: var(--space-3); width: 100%; max-width: 700px;">
+<div class="flow-tree__branch">
+<div class="flow-compare__label">text 块</div>
+<span class="flow-node">user + assistant</span>
+<span class="flow-node">无 tool_calls</span>
+<span class="flow-node" style="font-size: var(--text-xs);">连续 user 消息可合并</span>
 </div>
-
-<h2 id="author-thinking">作者怎么想的: 这一章的思考链</h2>
-<dl class="defs">
-  <dt>想清楚现象</dt>
-  <dd>
-    长任务下, 第 5 轮 LLM 调用时 messages 已经 200k tokens, 模型不仅贵
-    还"记不清早先的细节"。现象是"history 无增长, 模型却开始犯低级错误"。
-  </dd>
-  <dt>想反例</dt>
-  <dd>
-    最朴素的反例是"history 截断到最近 N 轮"。这有两个问题: 一是早期 的关键决策
-    (例如"用户偏好简洁解释") 被丢掉, 二是模型失去对任务 全局的感知,
-    容易重复已做的事。
-  </dd>
-  <dt>想接口和不变量</dt>
-  <dd>
-    接口:
-    <code
-      >interface ContextCompressor { compressToolResult, decayOldBlocks,
-      compactHistory }</code
-    >。 不变量三条: (1) history 内部保留原始未压缩 messages, 用于 audit 和 重放,
-    (2) 压缩只发生在 prepareMessages() 阶段, 不写回 history, (3) tool_call_id
-    配对在压缩后仍然成立 (压缩不能破坏协议)。
-  </dd>
-  <dt>想怎么验证</dt>
-  <dd>
-    fake LLM 触发 P1 压缩 (返回超长 tool result), 跑完后 prepareMessages
-    的输出中 tool result 被替换为占位摘要, 但 tool_call_id 仍在;
-    history.getMessages() 仍然保留原始长 tool result。
-  </dd>
-</dl>
-
-<h2 id="observe-first">先观察: 两段故意有气味的实现</h2>
-
-<div class="note">
-  <p class="note__title">观察 1 · 压缩后写回 history</p>
-  <pre class="code-block"><code>// 教学简化版, 仓库真实实现见 GitHub 永久链接
-async function prepareMessages() {
-  const messages = history.getMessages();
-  const compressed = compressor.compact(messages);
-  history.replaceEntries(compressed);  // 错误: 写回 history
-  return compressed;
+<div class="flow-tree__branch">
+<div class="flow-compare__label">tool_use 块</div>
+<span class="flow-node">assistant (tool_calls)</span>
+<span class="flow-node">+ 所有 tool 消息</span>
+<span class="flow-node" style="font-size: var(--text-xs);">不可分割, 拆开破坏配对</span>
+</div>
+<div class="flow-tree__branch">
+<div class="flow-compare__label">summary 块</div>
+<span class="flow-node">P2 压缩产生</span>
+<span class="flow-node">一段文字</span>
+<span class="flow-node" style="font-size: var(--text-xs);">不再被 P0 衰减</span>
+</div>
+</div>
+</div>
+<figcaption>图 06-1 · 消息块 3 种类型. tool_use 块不可拆, summary 块是 P2 压缩产物, text 块可合并。</figcaption>
+</figure>
+<p>
+  为什么 tool_use 块不能拆? 假设 assistant 调用 3 个工具:
+</p>
+<ol>
+<li>assistant(tool_call_1, tool_call_2, tool_call_3)</li>
+<li>tool(tool_call_id_1)</li>
+<li>tool(tool_call_id_2)</li>
+<li>tool(tool_call_id_3)</li>
+</ol>
+<p>
+  4 条消息组成一个 tool_use 块。 压缩时只能 4 条一起压, 不能 1-3 留
+  第 4 条走, 否则 LLM 看到"assistant 调了 3 个工具, 只收到 2 个 tool
+  result" — messages 序列错乱。
+  </p>
+<h2 id="p1-immediate">P1 即时压缩: 大 tool result 存文件</h2>
+<p>
+  P1 是<strong>最常用</strong>的压缩, 在 tool executor 内部完成。 触发:
+  tool result content 长度超过阈值 (默认 2000 字符)。
+  </p>
+<pre class="code-block"><code>// 教学简化版
+async function runBash(args) {
+  const result = await exec(args.command);
+  if (result.stdout.length &gt; 2000) {
+    // 大输出: 写文件, 返回 preview + output_id
+    const outputId = await outputStore.write(result.stdout, { source: "bash", toolCallId: args._toolCallId });
+    return {
+      content: `[Output truncated, ${result.stdout.length} chars]\n[Preview]:\n${result.stdout.slice(0, 500)}\n...[use run_output_read with output_id: ${outputId} to read full output]`,
+      // 不标 error, 这是正常的"输出太大" 情况
+    };
+  }
+  return { content: result.stdout };
 }</code></pre>
-  <p><strong>问:</strong>为什么不能把压缩结果写回 history?</p>
-  <p>
-    <strong>答:</strong>写回会破坏三个东西 —— 审计: history 不再是"原始 现场",
-    transcript 模块 (第 15 章) 看到的是压缩后的消息; 重放: 下次 LLM 报 context
-    overflow, 触发"再次压缩", 陷入递归; 协议: tool_call_id
-    配对可能在压缩中丢失。
+<p class="source-link"><a href="https://github.com/pingp76/swoopcode/blob/main/src/tools/run-bash.ts#L1" rel="noreferrer" target="_blank">GitHub · src/tools/run-bash.ts P1 即时压缩实现 (L1)</a></p>
+<p>
+  4 个细节:
+</p>
+<ol>
+<li>
+<strong>写文件后返回 preview + output_id</strong>: LLM 看到 preview
+    知道大概内容, 看到 output_id 知道如何重读。
+    </li>
+<li>
+<strong>不标 error</strong>: 这是正常情况, 不是工具失败。 LLM 收到
+    不会误以为命令出错。
+  </li>
+<li>
+<strong>preview 长度固定 500 字符</strong>: 给 LLM 足够的"第一印象",
+    又不会撑爆单条 tool message。 500 字符大约 100-200 token,
+    合理。
+  </li>
+<li>
+<strong>content 含完整说明</strong>: "Output truncated, N chars" +
+    "Preview: ..." + "use run_output_read with output_id: X"。
+    LLM 看到就知道下一步怎么读完整内容。
+  </li>
+</ol>
+<p>
+  为什么 P1 是<strong>可逆</strong>的? LLM 收到 output_id 后可以调
+  <code>run_output_read(output_id)</code> 读完整内容 (第 13 章展开)。
+  这是"后悔药" — 用户让 LLM 看完整 5000 行日志, LLM 不会因为压缩
+  而丢失信息。
   </p>
-</div>
-
-<div class="note">
-  <p class="note__title">观察 2 · 截断到最近 N 轮</p>
-  <pre class="code-block"><code>// 教学简化版
-const recent = messages.slice(-20);  // 只留最后 20 条
-return recent;</code></pre>
-  <p><strong>问:</strong>为什么不直接截断?</p>
-  <p>
-    <strong>答:</strong>截断丢掉的是"事实", 不是"长度"。关键决策
-    (例如"用户偏好"、 "项目约束") 经常出现在早期轮次。截断后模型 失去全局感,
-    重复已做的事或违背用户偏好。压缩保留的是"信息", 截断保留的是"顺序"。
+<h2 id="p0-decay">P0 衰减: 截断旧 tool result</h2>
+<p>
+  P0 在每次 LLM 调用前自动跑, 不需要 LLM 介入。 触发: tool_use 块
+  超过 decayThreshold (默认 5 轮全局 LLM 调用)。
   </p>
+<pre class="code-block"><code>// 教学简化版
+function decayOldBlocks(blocks: Block[], currentLoopIndex: number, decayThreshold: number): Block[] {
+  return blocks.map(block =&gt; {
+    if (block.kind !== "tool_use") return block;
+    if (currentLoopIndex - block.lastLoopIndex &lt; decayThreshold) return block;
+    // 超过 decayThreshold 轮的 tool_use 块, 截断 tool result content
+    return {
+      ...block,
+      messages: block.messages.map(m =&gt;
+        m.role === "tool"
+          ? { ...m, content: `[Decayed, ${m.content.length} chars, use output_id if needed]` }
+          : m
+      ),
+    };
+  });
+}</code></pre>
+<p class="source-link"><a href="https://github.com/pingp76/swoopcode/blob/main/src/compressor.ts#L1" rel="noreferrer" target="_blank">GitHub · src/compressor.ts P0 衰减实现 (L1)</a></p>
+<p>
+  关键点:
+</p>
+<ol>
+<li>
+<strong>按全局 loopIndex 衰减</strong>: 不是"每条消息独立 age",
+    是"整个块的年龄"。 一个 tool_use 块作为一个整体被衰减。
+    </li>
+<li>
+<strong>截断而非删除</strong>: content 替换成 "[Decayed, N chars]",
+    不是从 messages 里删除。 原因: tool_call_id 配对不能丢,
+    删了 LLM 看到"assistant 调了 3 个工具, 只收到 2 个 tool"
+    又错乱。
+  </li>
+<li>
+<strong>summary 块不衰减</strong>: 它本身就是压缩过的, 衰减没意义。
+    如果 summary 也衰减, 等于二次压缩, 信息丢失加速。
+  </li>
+</ol>
+<p>
+  ❌ / ✅: 不要按"消息条数" 衰减, 按"块" 衰减。
+</p>
+<pre class="code-block"><code>// ❌ 按消息条数, 拆 tool_use 块
+function decayByMessage(messages, currentLoop) {
+  for (let i = 0; i &lt; messages.length; i++) {
+    if (messages[i].role === "tool" &amp;&amp; currentLoop - i &gt; 5) {
+      messages[i].content = "[Decayed]";   // 拆了 tool_use 块
+    }
+  }
+}
+
+// ✅ 按块, tool_use 块整体处理
+function decayByBlock(blocks, currentLoop) {
+  return blocks.map(block =&gt; block.kind === "tool_use" &amp;&amp; currentLoop - block.lastLoopIndex &gt; 5
+    ? truncateToolResults(block)   // 整个块截断, 不拆
+    : block);
+}</code></pre>
+<h2 id="p2-compact">P2 全量: 整段 history 总结</h2>
+<p>
+  P2 在 history 超过总阈值时触发, 是一次"重型压缩"。 流程:
+</p>
+<ol>
+<li>把 history 分块</li>
+<li>保留最近 K 个块 (默认 6)</li>
+<li>之前的所有非 summary 块调 LLM 总结成 1 段 (已有 summary 块原样保留, 不再二次压缩)</li>
+<li>替换为 [summary 块] + [最近 K 个块]</li>
+</ol>
+<pre class="code-block"><code>// 教学简化版
+async function compactHistory(blocks: Block[], recentKeep: number, llm: LLMClient): Promise&lt;Block[]&gt; {
+  if (blocks.length &lt;= recentKeep) return blocks;
+  const toCompress = blocks.slice(0, blocks.length - recentKeep);
+  const toKeep = blocks.slice(blocks.length - recentKeep);
+
+  // 拼接要被压缩的块
+  const text = toCompress.map(b =&gt; serializeBlock(b)).join("\n---\n");
+  // 调 LLM 总结
+  const response = await llm.chat({
+    messages: [{ role: "user", content: `请总结以下对话的关键信息, 保留事实和决策:\n\n${text}` }],
+  });
+  const summary = response.content ?? "";
+
+  // 替换为 summary 块 + 最近的块
+  return [
+    { kind: "summary", messages: [{ role: "assistant", content: summary }] },
+    ...toKeep,
+  ];
+}</code></pre>
+<p class="source-link"><a href="https://github.com/pingp76/swoopcode/blob/main/src/compressor.ts#L1" rel="noreferrer" target="_blank">GitHub · src/compressor.ts P2 全量压缩实现 (L1)</a></p>
+<p>
+  5 个细节:
+</p>
+<ol>
+<li>
+<strong>保留最近 K 块</strong>: 不全压, 留最近 6 个块保持近期上下文。
+    压缩总结本身也占 token, 压太狠反而失去信息。
+    </li>
+<li>
+<strong>summary 调 LLM 总结</strong>: P0/P1 是 harness 内部操作, 不调 LLM;
+    只有 P2 总结需要 LLM。 这次 LLM 调用不进 history, 是一次性调用。
+    </li>
+<li>
+<strong>summary 块写进 history</strong>: 总结请求本身不进 history, 但产出的 summary 块
+    作为 history 的一部分 (kind: "summary"), 下次 prepare messages 时正常处理。
+    </li>
+<li>
+<strong>触发条件</strong>: history 超过 80% 窗口 (或配置阈值)。
+    太早触发浪费 LLM 调用, 太晚触发超窗口。
+    </li>
+<li>
+<strong>可重入</strong>: 连续 P2 不会出错, 因为 summary 块不参与下次 P2
+    (只压非 summary 块)。 这是"分层压缩" 的关键。
+  </li>
+</ol>
+<figure class="figure">
+<div class="flow-stack" role="img" aria-label="三层压缩触发顺序">
+<div class="flow-stack__layer flow-stack__layer--dynamic">
+<div class="flow-stack__label">P1 即时 (按需)</div>
+<span class="flow-node">单 tool result &gt; 2000 字符</span>
+<span class="flow-node">→ 写 OutputStore, 返回 preview + output_id</span>
 </div>
-
-<h2 id="interfaces">接口形状: 在写实现前钉死</h2>
-<pre class="code-block"><code>// 教学简化版, 仓库真实实现见 GitHub 永久链接
-// 1. Normalize 层
-export function normalizeMessages(messages: Message[]): Message[];
-
-// 2. Block 层
-export type MessageBlock =
-  | { type: "text"; user?: Message; assistant?: Message }
-  | { type: "tool_use"; user: Message; assistant: Message; toolResults: Message[] }
-  | { type: "summary"; user: Message; summary: string };
-
-export function groupToBlocks(messages: Message[]): MessageBlock[];
-export function flattenToMessages(blocks: MessageBlock[]): Message[];
-
-// 3. Compress 层
-export interface ContextCompressor {
-  // P1: 即时压缩单个 tool result, 超过阈值存文件 + 占位摘要
-  compressToolResult(toolName: string, callId: string, content: string): ToolResult;
-  // P0: 衰减压缩 (第 N 轮之后, 把旧 tool result 截断到前 K tokens)
-  decayOldBlocks(blocks: MessageBlock[], currentLoopIndex: number): MessageBlock[];
-  // P2: 全量压缩 (token 估算超阈值时, 让 LLM 总结早期 blocks)
-  compactHistory(blocks: MessageBlock[]): { blocks: MessageBlock[]; summary: string };
+<div class="flow-stack__arrow">↓</div>
+<div class="flow-stack__layer flow-stack__layer--dynamic">
+<div class="flow-stack__label">P0 衰减 (每轮自动)</div>
+<span class="flow-node">tool_use 块 &gt; 5 轮</span>
+<span class="flow-node">→ 截断 content 为 "[Decayed, N chars]"</span>
+</div>
+<div class="flow-stack__arrow">↓</div>
+<div class="flow-stack__layer flow-stack__layer--stable">
+<div class="flow-stack__label">P2 全量 (超阈值时)</div>
+<span class="flow-node">history &gt; 80% 窗口</span>
+<span class="flow-node">→ 调 LLM 总结, 保留最近 6 块</span>
+</div>
+</div>
+<figcaption>图 06-2 · 三层压缩触发顺序. P0 轻量每轮自动, P1 按需, P2 重型超阈值才触发。</figcaption>
+</figure>
+<h2 id="pipeline-order">消息处理管道的顺序</h2>
+<p>
+  prepare messages 阶段, 7 个步骤按固定顺序:
+</p>
+<ol>
+<li>
+<strong>getEntries</strong>: 从 history 拿所有 entry (带 metadata)。
+    </li>
+<li>
+<strong>annotate</strong>: 标记每个 entry 的 _loopIndex / _loopRound /
+    _messageSequence (供 P0 衰减判断用)。
+    </li>
+<li>
+<strong>normalize</strong>: 合并相邻同角色消息 (连续 user 消息合并,
+    连续 assistant 消息合并)。 减少 messages 数量, 不减少 token。
+    </li>
+<li>
+<strong>group</strong>: 把 messages 分组成 text/tool_use/summary
+    块。
+    </li>
+<li>
+<strong>decay</strong>: P0 衰减, 截断超过 decayThreshold 的 tool_use 块。
+    </li>
+<li>
+<strong>compact</strong>: 超过总阈值时 P2 总结。
+    </li>
+<li>
+<strong>flatten</strong>: 把块还原成 messages 数组, 清除内部
+    _loopIndex 等 metadata。
+    </li>
+</ol>
+<p>
+  顺序不能换: getEntries 必须在 annotate 之前, group 必须在 decay
+  之前, decay 必须在 compact 之前。 任何一步乱序, 要么 metadata
+  丢了, 要么压缩破坏了消息块。
+  </p>
+<h2 id="internal-fields">内部字段清理: _xxx 不发 LLM</h2>
+<p>
+  history 内部用了 <code>_turnIndex</code> / <code>_loopRound</code> / <code>_loopIndex</code> /
+  <code>_messageSequence</code> 等字段, 这些是 harness 内部记账用的, 不
+  进 LLM messages。
+  </p>
+<pre class="code-block"><code>// 教学简化版
+function flattenToMessages(blocks: Block[]): ChatCompletionMessageParam[] {
+  return blocks.flatMap(block =&gt;
+    block.messages.map(m =&gt; {
+      const cleaned: ChatCompletionMessageParam = { role: m.role, content: m.content };
+      if (m.tool_calls) cleaned.tool_calls = m.tool_calls;
+      if (m.tool_call_id) cleaned.tool_call_id = m.tool_call_id;
+      // ❌ 不要 spread m, 否则 _loopIndex 等会进 LLM
+      return cleaned;
+    })
+  );
 }</code></pre>
-
-<h2 id="pipeline">三层管道: prepareMessages 串起来</h2>
+<p class="source-link"><a href="https://github.com/pingp76/swoopcode/blob/main/src/compressor.ts#L1" rel="noreferrer" target="_blank">GitHub · src/compressor.ts flattenToMessages 内部字段清理 (L1)</a></p>
 <p>
-  三层是有顺序的: 先 Normalize 解决"消息分裂", 再 Block 解决"逻辑分组", 最后
-  Compress 解决"长度控制"。顺序反了会出问题: 先 Compress 看不到完整 block,
-  总结质量差; 先 Block 但不 Normalize, block 内仍有相邻同角色冗余。
+  4 个细节:
 </p>
-<pre class="code-block"><code>// 教学简化版, 仓库真实实现见 GitHub 永久链接
-function prepareMessages(currentLoopIndex: number, currentQuery: string): Message[] {
-  const entries = history.getEntries();       // 原始 messages, 不压缩
-  const systemPrompt = history.getSystemPrompt();
-  const systemMsg = systemPrompt ? [{ role: "system", content: systemPrompt }] : [];
-
-  const normalized = normalizeMessages(entries);
-  const blocks = groupToBlocks(normalized);
-
-  // P0 衰减: 旧 block 的 tool result 截断
-  const decayed = compressor.decayOldBlocks(blocks, currentLoopIndex);
-
-  // P2 全量: token 超阈值时, 总结早期 block
-  const tokenEstimate = estimateMessagesTokens(normalized);
-  const finalBlocks = tokenEstimate &gt; maxContextTokens
-    ? compressor.compactHistory(decayed).blocks
-    : decayed;
-
-  const flat = flattenToMessages(finalBlocks);
-  return [...systemMsg, ...flat];
+<ol>
+<li>
+<strong>显式枚举字段</strong>: 只 role / content / tool_calls /
+    tool_call_id, 内部字段不复制。
+  </li>
+<li>
+<strong>Compress 层做</strong>: flatten 在 compressor 内部, 不是
+    agent.ts。 业务模块不感知。
+  </li>
+<li>
+<strong>test 验证</strong>: 跑 fake LLM, 检查 messages 不含
+    _loopIndex 字段。
+  </li>
+<li>
+<strong>transcript 保留</strong>: transcript 是审计流, 保留全部
+    字段, 包括 _loopIndex。 跟 LLM messages 区分。
+  </li>
+</ol>
+<h2 id="token-estimation">token 估算: 中英文不同权重</h2>
+<p>
+  什么时候触发 P2? 需要估算 history 的 token 数。 OpenAI 官方
+  tokenizer 慢, 我们的简单估算:
+</p>
+<pre class="code-block"><code>function estimateTokens(text: string): number {
+  // 中文字符 × 1.5, 其余字符 × 0.25, 两项相加后向上取整
+  const chinese = (text.match(/[\u4e00-\u9fa5]/g) || []).length;
+  const english = text.length - chinese;
+  return Math.ceil(chinese * 1.5 + english * 0.25);
 }</code></pre>
+<p class="source-link"><a href="https://github.com/pingp76/swoopcode/blob/main/src/compressor.ts#L1" rel="noreferrer" target="_blank">GitHub · src/compressor.ts estimateTokens 中英文权重 (L1)</a></p>
+<p>
+  为什么不精确? 精确 token 化要调 tiktoken (慢, 启动慢), 或
+  LLM SDK 的 count_tokens 接口 (要网络)。 估算够用 — P2 触发的
+  阈值是 80% 窗口, 估算 5-10% 误差不影响触发。
+  </p>
+<h2 id="fake-test">fake LLM 测试: P0 衰减不破坏 tool_call_id 配对</h2>
+<p>
+  <strong>用途</strong>: P0 衰减最容易写错的不是衰减逻辑, 是破坏
+  tool_call_id 配对。 写测试验证 3 件事。
+  </p>
+<p>
+  <strong>设计思想</strong>: 跑 10 轮对话, 触发 P0 衰减, 验证 (1) 所有
+  tool_call_id 仍能找到对应 tool message, (2) 早期 tool result 被
+  截断, (3) 近期 tool result 完整。 3 件事联合验证。
+  </p>
 <p>
-  关键不变量: <code>history</code> 内部仍然保留原始未压缩 messages; 压缩只发生在
-  prepareMessages() 阶段, LLM 看到的是压缩后的 messages, 但
-  history.getMessages() 仍然返回原始。
+  <strong>实现细节</strong>:
 </p>
-
-<h2 id="p1-p2">P1 / P2 的分工</h2>
+<pre class="code-block"><code>test("P0 衰减后 tool_call_id 配对仍然完整", async () =&gt; {
+  const history = createHistory();
+  // 模拟 10 轮对话, 调 10 个工具
+  for (let i = 0; i &lt; 10; i++) {
+    history.add({ role: "user", content: `Step ${i}` });
+    history.add({ role: "assistant", tool_calls: [{ id: `c${i}`, function: { name: "run_bash", arguments: "{}" } }] });
+    history.add({ role: "tool", tool_call_id: `c${i}`, content: `Result ${i} with 100 chars content` });
+  }
+  // 触发 P0 衰减 (假设 currentLoopIndex = 15, threshold = 5)
+  const messages = prepareMessages({ history, currentLoopIndex: 15, decayThreshold: 5 });
+  // 验证: 所有 tool_call_id 仍然配对
+  const callsById = new Map();
+  const resultsById = new Map();
+  for (const m of messages) {
+    if (m.role === "assistant" &amp;&amp; m.tool_calls) {
+      for (const tc of m.tool_calls) callsById.set(tc.id, tc);
+    }
+    if (m.role === "tool") resultsById.set(m.tool_call_id, m);
+  }
+  for (const [id, _] of callsById) {
+    expect(resultsById.has(id)).toBe(true);
+  }
+  // 验证: 早期 (c0-c4) 的 tool result content 是截断标记
+  for (let i = 0; i &lt; 5; i++) {
+    const result = messages.find(m =&gt; m.role === "tool" &amp;&amp; m.tool_call_id === `c${i}`);
+    expect(result.content).toMatch(/Decayed/);
+  }
+  // 验证: 近期 (c5-c9) 的 tool result content 完整
+  for (let i = 5; i &lt; 10; i++) {
+    const result = messages.find(m =&gt; m.role === "tool" &amp;&amp; m.tool_call_id === `c${i}`);
+    expect(result.content).toBe(`Result ${i} with 100 chars content`);
+  }
+});</code></pre>
+<p class="source-link"><a href="https://github.com/pingp76/swoopcode/blob/main/src/compressor.test.ts#L1" rel="noreferrer" target="_blank">GitHub · src/compressor.test.ts P0 衰减配对测试 (L1)</a></p>
+<p>
+  这条测试如果挂, 你立刻知道 P0 衰减破坏了 tool_call_id 配对,
+  或者衰减粒度错了 (衰减了不该衰减的块)。
+  </p>
+<h2 id="common-confusion">3 个常见误解</h2>
 <dl class="defs">
-  <dt>P1 即时压缩</dt>
-  <dd>
-    工具执行后立刻触发。例如 read_file 工具返回 100k 字符的日志, P1
-    把超长部分存到本地文件 (<code>~/.cache/run-outputs/abc123.txt</code>), tool
-    result 里只留前 200 字符 + "完整内容见 output_id=abc123" 的 占位符。LLM
-    看到的是"这里有占位符, 需要全文可以再调 read_output"。
+<dt>误解 1 · "压缩就是删除"</dt>
+<dd>
+    错。 P0/P2 是<strong>截断/总结</strong>, 不是删除。 删除会破坏
+    tool_call_id 配对, 让 LLM 收到错位消息。 截断保留 role 和 id,
+    只换 content, 安全。
   </dd>
-  <dt>P2 全量压缩</dt>
-  <dd>
-    每次 LLM 调用前 token 估算超阈值时触发。早期 block 被合并成一个 "summary
-    block", 由 LLM 写成一段摘要, 替换原始 block。 不同于 P1, P2 的产物是
-    messages 的一部分, 不在文件里。
+<dt>误解 2 · "压缩可以丢 system prompt"</dt>
+<dd>
+    错。 system prompt 是 stable prefix, 不参与压缩。 只压 history
+    messages。 压 system prompt 等于让 LLM 忘掉行为规则。
   </dd>
-  <dt>P0 衰减压缩 (中间层)</dt>
-  <dd>
-    比 P1 弱, 比 P2 轻: 第 N 轮之后, 把旧 tool result 截断到前 K tokens (例如前
-    500 tokens), 不调 LLM 总结。LLM 仍然能看到"这里有内容",
-    但要查全文需要重新触发工具。
+<dt>误解 3 · "越早压越好"</dt>
+<dd>
+    错。 P0 阈值太小 (tool result 刚过 1 轮就被截断) 会让 LLM 看不到近期上下文, 质量下降。
+    阈值要合理: decayThreshold = 5 轮, totalThreshold = 80% 窗口。
+    调参要看真实任务, 不靠感觉。
   </dd>
 </dl>
-
-<h2 id="transcript">不写回 history: 留给 transcript 兜底</h2>
-<p>
-  压缩只发生在 prepareMessages(), 不写回 history。这看上去"每次都要重做",
-  实际是把"事实" 和 "视图" 分开:
-</p>
-<ul>
-  <li><strong>事实 (history)</strong>: 原始 messages, append-only, 不可变。</li>
-  <li>
-    <strong>视图 (prepareMessages 输出)</strong>: 当次 LLM 调用的压缩结果,
-    用完即弃。
-  </li>
-</ul>
-<p>
-  这种分离让 transcript (第 15 章) 能记录原始现场, 重放 (eval) 能从
-  原始历史里还原任意时点的 messages 序列。如果把压缩写回 history,
-  这两件事都会崩。
-</p>
-
 <h2 id="trap">反例梯度</h2>
-
 <div class="card card--trap">
-  <div class="card__head">
-    <span class="card__tag">新手错法 · A</span>
-  </div>
-  <div class="card__body">
-    <p><strong>常见错误:</strong>压缩后写回 history。</p>
-    <p><strong>为什么错:</strong>破坏 audit / 重放 / tool_call_id 配对。</p>
-    <p>
-      <strong>正确做法:</strong>压缩只发生在 prepareMessages(), history
-      内部保留原始。
+<div class="card__head">
+<span class="card__tag">新手错法 · A</span>
+</div>
+<div class="card__body">
+<p>
+<strong>常见错误:</strong> 按消息条数衰减, 拆了 tool_use 块。
+    </p>
+<p>
+<strong>为什么错:</strong> 拆 tool_use 块破坏 tool_call_id 配对, LLM 看到
+      错位消息, OpenAI API 可能报 400。
+    </p>
+<p>
+<strong>正确做法:</strong> 按消息块衰减, tool_use 块整体处理, 只截断
+      content 不拆块。
     </p>
-  </div>
 </div>
-
+</div>
 <div class="card card--trap">
-  <div class="card__head">
-    <span class="card__tag">中级错法 · B</span>
-  </div>
-  <div class="card__body">
-    <p><strong>常见错误:</strong>截断 messages 到最近 N 条。</p>
-    <p>
-      <strong>为什么错:</strong>丢掉关键事实, 模型失去全局感, 重复已做的事。
+<div class="card__head">
+<span class="card__tag">中级错法 · B</span>
+</div>
+<div class="card__body">
+<p>
+<strong>常见错误:</strong> P0 衰减时删除 tool message, 留 assistant
+      调工具的记录。
+    </p>
+<p>
+<strong>为什么错:</strong> 删 tool message 破坏配对, 下一轮 LLM 看到
+      "assistant 调了 3 个工具, 只收到 2 个 tool"。
+    </p>
+<p>
+<strong>正确做法:</strong> 截断 content 为 "[Decayed, N chars]", 保留
+      role: "tool" 和 tool_call_id。
     </p>
-    <p><strong>正确做法:</strong>用 P2 总结替代截断, 保留信息密度。</p>
-  </div>
 </div>
-
+</div>
 <div class="card card--trap">
-  <div class="card__head">
-    <span class="card__tag">高级错法 · C</span>
-  </div>
-  <div class="card__body">
-    <p><strong>常见错误:</strong>压缩时丢掉 tool_call_id。</p>
-    <p>
-      <strong>为什么错:</strong>第 02 章的 tool_call_id 配对约束被破坏,
-      协议层拒绝接受。
+<div class="card__head">
+<span class="card__tag">高级错法 · C</span>
+</div>
+<div class="card__body">
+<p>
+<strong>常见错误:</strong> 用 spread <code>{...m}</code> 复制消息, 把内部
+      _loopIndex 字段发给 LLM。
+    </p>
+<p>
+<strong>为什么错:</strong> _loopIndex 拼到 messages, 每次 chat() 消息数组
+      不一样, dynamic tail 不稳定, cache 命中受影响。
+    </p>
+<p>
+<strong>正确做法:</strong> flatten 时清除 _loopIndex 等内部字段,
+      只发 role / content / tool_calls / tool_call_id。
     </p>
-    <p><strong>正确做法:</strong>压缩保留 id 字段, 只压缩 content 字符串。</p>
-  </div>
 </div>
-
+</div>
 <div class="card card--trap">
-  <div class="card__head">
-    <span class="card__tag">边界错法 · D</span>
-  </div>
-  <div class="card__body">
-    <p>
-      <strong>常见错误:</strong>P2 总结调 LLM 时不传 "这是压缩任务" 的提示, 让
-      LLM 误以为是普通对话。
+<div class="card__head">
+<span class="card__tag">边界错法 · D</span>
+</div>
+<div class="card__body">
+<p>
+<strong>常见错误:</strong> 每次 P2 压缩都从最开头开始, summary 块被反复压缩。
     </p>
-    <p>
-      <strong>为什么错:</strong>总结质量不稳定, LLM 可能输出"好的, 我会..."
-      这种无意义前缀。
+<p>
+<strong>为什么错:</strong> 反复压缩 summary 块, 信息丢失加速, 最后
+      summary 失去所有细节。
     </p>
-    <p>
-      <strong>正确做法:</strong>压缩调用 LLM 时, messages 是特殊 system prompt,
-      明确说"总结以下对话, 不要回复"。
+<p>
+<strong>正确做法:</strong> summary 块标记为 "kind: summary", P2 只压
+      非 summary 块, summary 块保留原样。
     </p>
-  </div>
 </div>
-
+</div>
 <h2 id="validate">如何验证 (本章 Validation 卡片)</h2>
 <div class="card card--validation">
-  <div class="card__head">
-    <span class="card__tag">Validation · 第 06 章</span>
-  </div>
-  <div class="card__body">
-    <p>
-      <strong>P1 触发占位:</strong>fake LLM 返回 tool_call=read_file, fake
-      execute 返回 100k 字符长字符串, 跑完后 prepareMessages 输出中 tool result
-      被替换为前 200 字符 + "完整内容见 output_id=xxx"。
+<div class="card__head">
+<span class="card__tag">Validation · 第 06 章</span>
+</div>
+<div class="card__body">
+<p>
+<strong>P0 衰减不破坏配对:</strong> 10 轮对话, 触发 P0 衰减, 验证
+      所有 tool_call_id 仍能找到对应 tool message。
     </p>
-    <p>
-      <strong>history 内部不压缩:</strong>跑完后 history.getMessages() 仍
-      然返回原始 100k 字符 tool result。
+<p>
+<strong>P1 即时可逆:</strong> run_bash 返回 5000 字符, 触发 P1,
+      tool message 含 output_id; LLM 调 run_output_read(output_id)
+      读完整内容。
     </p>
-    <p>
-      <strong>tool_call_id 配对保留:</strong>压缩后 tool message 的 tool_call_id
-      仍然等于 assistant.tool_calls[N].id。
+<p>
+<strong>P2 总结保留最近 K 块:</strong> history 50 个块, 触发 P2,
+      验证最后 6 个块原样保留, 之前 44 个块被 summary 块替换。
     </p>
-    <p>
-      <strong>P2 触发条件:</strong>token 估算超 maxContextTokens, 跑完后
-      prepareMessages 输出中早期 block 被替换为 summary block。
+<p>
+<strong>summary 块不参与下次 P2:</strong> 连续 P2 两次, 验证第二次
+      P2 不压缩第一次产生的 summary 块。
     </p>
-    <p>
-      <strong>Normalize 合并相邻:</strong>history 里有 3 条连续 role: "user"
-      消息, 跑完后 prepareMessages 输出合并为 1 条。
+<p>
+<strong>内部字段不发 LLM:</strong> prepare messages 后, LLM 收到的
+      messages 不含 _loopIndex / _loopRound / _messageSequence。
     </p>
-  </div>
 </div>
-
-<h2 id="lookback">回望第 00–05 章: 哪些原则在本章兑现了</h2>
+</div>
+<h2 id="lookback">回望: 哪些原则在本章兑现了</h2>
 <ul>
-  <li>
-    <strong>事实与视图分离:</strong>history 是事实, prepareMessages
-    是视图。这条原则在第 00 章"History 唯一职责" 隐含, 本章显式兑现。
-  </li>
-  <li>
-    <strong>tool call 协议沿用:</strong>压缩保留 tool_call_id, 不破坏第 02
-    章的协议约束。
-  </li>
-  <li>
-    <strong>不写回 history:</strong>history.getMessages() 仍然可观测,
-    不被压缩污染, 第 15 章 transcript 和 eval 重放都需要这一点。
-  </li>
-  <li>
-    <strong>稳定前缀原则:</strong>system prompt 不进压缩管道, 稳定不变, 第 10
-    章的伏笔。
-  </li>
+<li>
+<strong>消息块是原子单位</strong>: 压缩不能拆 tool_use 块, 否则破坏
+    tool_call_id 配对。
+    </li>
+<li>
+<strong>窄接口</strong>: compressor 只暴露 compress / decay / compact,
+    不暴露内部 history 结构。
+    </li>
+<li>
+<strong>分层防御</strong>: P0 / P1 / P2 各管各的时机, 不互相替代。
+    </li>
+<li>
+<strong>内部 metadata 隔离</strong>: _loopIndex 等字段在 history 内部
+    流转, flatten 时清除。
+    </li>
 </ul>
-
 <h2 id="forward">前瞻张力: 留给后续章节</h2>
 <dl class="defs">
-  <dt>压缩策略由谁决定</dt>
-  <dd>
-    第 10 章 cache-friendly 布局会展开, P2 总结的 LLM 调用本身吃 cache 配额,
-    需要 stable context 隔离。
-  </dd>
-  <dt>压缩出错怎么办</dt>
-  <dd>
-    第 11 章 recovery 会处理"压缩 LLM 调用失败" 的回退路径, 通常降级到 P0
-    衰减或纯截断。
-  </dd>
-  <dt>P1 占位 output_id 怎么读</dt>
-  <dd>第 12 章 task 系统会持久化 output_id 引用, eval 重放时还原原始内容。</dd>
-  <dt>压缩时遇到 system-reminder 怎么办</dt>
-  <dd>
-    Normalize 不合并 system-reminder 标签, groupToBlocks 不进 block, 独立处理,
-    第 03 章 TODO reminder 走这条路径。
-  </dd>
+<dt>压缩后 LLM 失忆</dt>
+<dd>
+    P2 一次总结的历史太长, 总结会丢细节。 后续可以加 "按文件类型分层" — 代码文件
+    保留函数签名, doc 文件保留段落, 总结更精准。
+    </dd>
+<dt>P1 输出文件清理</dt>
+<dd>
+    P1 写文件后, OutputStore 越攒越多。 第 13 章 OutputStore 展开
+    清理策略。
+    </dd>
 </dl>
-
-<h2 id="vibe-coding-06">本次如何 vibe code: 第 06 章的三件套</h2>
-
-<h3 id="vibe-feed-06">拆卡: 4 轮迭代的具体产物</h3>
-<ol>
-  <li>
-    <strong>第 1 轮 · 接口</strong>。让 LLM 给出
-    <code>normalizeMessages</code> / <code>MessageBlock</code> /
-    <code>ContextCompressor</code> 三个模块的 interface。本轮不写实现,
-    重点钉三层管道的输入输出形态。
-  </li>
-  <li>
-    <strong>第 2 轮 · 接线</strong>。让 LLM 给出 agent.prepareMessages() 的
-    stub, 三层都是 noop (直接返回原 messages)。本轮 review 重点: prepareMessages
-    是 createAgent() 闭包内的私有函数, 不暴露给外部。
-  </li>
-  <li>
-    <strong>第 3 轮 · 边界</strong>。让 LLM 写三层实现 + agent.prepareMessages()
-    串起。本轮 review 重点: 压缩结果不写回 history, tool_call_id 保留, P2 调 LLM
-    走特殊 system prompt。
-  </li>
-  <li>
-    <strong>第 4 轮 · 验证</strong>。让 LLM 写
-    <code>test/compress.test.ts</code>。本轮 review 重点: "history 内部不压缩"
-    和 "tool_call_id 配对保留" 两条必须有反向断言。
-  </li>
-</ol>
-
-<h3 id="vibe-review-06">Review: 第 06 章专属 checklist</h3>
-<ol>
-  <li>
-    <strong>压缩不写回 history。</strong>不得出现
-    <code>history.replaceEntries(compressed)</code>。验证:
-    <code>grep -n 'replaceEntries' src/agent.ts</code> 应当 0 行 (本章内),
-    仅在第 11 章 recovery 路径里允许。
-  </li>
-  <li>
-    <strong>tool_call_id 保留。</strong>压缩过程中 tool message 的 id
-    字段不被丢弃。验证: 写一个测试断言
-    <code>compressed.find(m =&gt; m.role === "tool").tool_call_id</code>
-    等于原始 id。
-  </li>
-  <li>
-    <strong>P2 调 LLM 走特殊 system prompt。</strong>压缩用的 LLM 调用 messages
-    头部有明确"压缩任务" 标识。验证: 写一个测试, 让 fake LLM 拒绝响应 (无 tool
-    call) 应当被识别为压缩任务。
-  </li>
-  <li>
-    <strong>Normalize 不合并 system-reminder。</strong>第 03 章的 reminder
-    标签不进 Normalize 合并, 独立保留。验证: 写一个测试, history 里有 3 条 user
-    message (其中 1 条是 system-reminder 标签), 跑完后 reminder 那条独立, 另 2
-    条合并。
-  </li>
-  <li>
-    <strong>不污染 system prompt。</strong>P2 总结结果不进 system prompt, 进普通
-    user 消息。验证: 跑完 P2 后 messages[0].role !== "system" 仍然成立。
-  </li>
-</ol>
-
-<h3 id="vibe-debug-06">调试: 第 06 章典型伪装</h3>
-<ol>
-  <li>
-    <strong>伪装 A · 压缩后写回 history 加速下次 LLM 调用。</strong>症状:
-    觉得"每次重做压缩浪费", 写回 history 看起来能省 token。验证: Validation
-    卡片"history 内部不压缩" 必须断言"跑完后 history.getMessages()
-    仍然返回原始长 tool result"。
-  </li>
-  <li>
-    <strong>伪装 B · 截断 messages 代替压缩。</strong>症状:
-    <code>messages.slice(-50)</code> 写进 prepareMessages。验证: Validation
-    卡片"P2 触发条件" 那条, 必须断言"早期 block 被替换为 summary block",
-    不是"早期 block 被丢弃"。
-  </li>
-  <li>
-    <strong>伪装 C · 压缩时 LLM 调用混进普通 LLM 流量。</strong>症状: 压缩 LLM
-    调用的 messages 头部没有"压缩任务" 标识, 模型不知道这是压缩。验证: 第 18 章
-    eval 会通过"压缩 LLM 调用的 system prompt 长度" 反向检查, 本章不显式测。
-  </li>
-</ol>
-
-<h3 id="vibe-iterate-06">迭代: 第 06 章 4 个 commit 节点</h3>
-<ol>
-  <li>
-    <code>feat(ch06): 钉 normalize / block / compressor 三层接口</code> —— tsc
-    通过, 无实现。
-  </li>
-  <li>
-    <code
-      >feat(ch06): createCompressor 工厂 + agent.prepareMessages() stub</code
-    >
-    —— tsc 通过, 三层 noop。
-  </li>
-  <li>
-    <code
-      >feat(ch06): 三层实现 + prepareMessages 串起 + 压缩不写回 history</code
-    >
-    —— 跑通 Validation 卡片前 4 条。
-  </li>
-  <li>
-    <code
-      >test(ch06): Normalize 不合并 system-reminder + tool_call_id 保留</code
-    >
-    —— 全绿。
-  </li>
-</ol>
-
 <h2 id="prompt-card">Prompt Card (本章任务)</h2>
 <div class="card card--prompt">
-  <div class="card__head">
-    <span class="card__tag">Prompt Card · 第 06 章</span>
-    <button class="card__copy" type="button" data-copy-card>复制</button>
-  </div>
-  <div class="card__body">
-    <p>
-      <strong>目标:</strong>实现三层压缩管道 (Normalize / Block / Compress), 让
-      harness 在长任务下保持稳定 context 长度。
-    </p>
-    <p>
-      <strong>场景:</strong>用户多轮对话 + 多次 read_file 工具调用后, messages
-      超过 80k tokens, 触发 P2 全量压缩, 早期 block 被 LLM 总结为一段摘要替换。
+<div class="card__head">
+<span class="card__tag">Prompt Card · 第 06 章</span>
+<button class="card__copy" data-copy-card="" type="button">复制</button>
+</div>
+<div class="card__body">
+<p>
+<strong>目标:</strong> 给 harness 加三层压缩 (P0 衰减 / P1 即时 / P2 全量),
+      长任务下保持稳定的 context 长度。
     </p>
-    <p>
-      <strong>模块:</strong> <code>src/normalize.ts</code> (新) 暴露
-      <code>normalizeMessages()</code>; <code>src/message-block.ts</code> (新)
-      暴露 <code>groupToBlocks()</code> / <code>flattenToMessages()</code>;
-      <code>src/compressor.ts</code> (新) 暴露 <code>createCompressor()</code>;
-      <code>src/agent.ts</code> 改 prepareMessages() 串起三层。
+<p>
+<strong>场景:</strong> 跑 200 轮对话, history 累积 80000 token, P0 衰减
+      早期 tool result, P1 即时压缩大输出, P2 总结超阈值 history。
     </p>
-    <p><strong>边界 (LLM 必须遵守的 checklist):</strong></p>
-    <ul>
-      <li>压缩结果不写回 history, 只在 prepareMessages() 输出层生效</li>
-      <li>tool_call_id 在压缩中保留, 不丢弃</li>
-      <li>P2 调 LLM 时 messages 头部带"压缩任务" 标识的 system prompt</li>
-      <li>Normalize 不合并 system-reminder 标签</li>
-      <li>P1 占位 output_id 写入本地文件, 不进 messages</li>
-    </ul>
-    <p>
-      <strong>验证 (用 fake LLM + fake compressor, 逐条落到 vitest):</strong>
+<p>
+<strong>模块:</strong> <code>src/normalize.ts</code> (新) 合并相邻同角色消息;
+      <code>src/message-block.ts</code> (新) <code>groupToBlocks()</code> 和
+      <code>flattenToMessages()</code>; <code>src/compressor.ts</code> (新)
+      <code>createContextCompressor()</code> 暴露 P0 / P1 / P2;
+      <code>src/agent.ts</code> (改) prepare messages 阶段调用压缩;
+      <code>src/output-store.ts</code> (改) 支持 P1 写文件。
     </p>
-    <ul>
-      <li>
-        fake execute 返回 100k 字符串, 跑完后 prepareMessages 输出含 P1 占位符
-      </li>
-      <li>跑完后 history.getMessages() 仍含 100k 字符串 (不压缩)</li>
-      <li>
-        压缩后 tool message 的 tool_call_id 仍等于 assistant.tool_calls[N].id
-      </li>
-      <li>token 估算超阈值, prepareMessages 输出含 summary block</li>
-      <li>
-        3 条连续 user message, prepareMessages 输出合并为 1 条 (除
-        system-reminder 外)
-      </li>
-    </ul>
-  </div>
+<p><strong>边界 (LLM 必须遵守的 checklist):</strong></p>
+<ul>
+<li>压缩按消息块, 不按消息条数, tool_use 块不能拆</li>
+<li>P0 截断 content 为 "[Decayed]", 不删除 role: "tool" 消息</li>
+<li>P1 大输出写 OutputStore, 返回 preview + output_id, LLM 可重读</li>
+<li>P2 总结保留最近 K 块, summary 块不参与下次 P2</li>
+<li>flatten messages 时清除 _loopIndex 等内部字段</li>
+<li>_loopIndex / _loopRound / _messageSequence 等内部字段不进 LLM messages</li>
+</ul>
+<p><strong>验证 (用 fake LLM + vitest, 逐条断言):</strong></p>
+<ul>
+<li>10 轮对话触发 P0, 验证所有 tool_call_id 仍能找到对应 tool message</li>
+<li>run_bash 返回 5000 字符触发 P1, tool message 含 output_id</li>
+<li>history 超阈值触发 P2, 最后 6 个块原样保留, 之前块被 summary 替换</li>
+<li>连续 P2 两次, 第二次不压缩第一次的 summary 块</li>
+<li>prepare messages 后, LLM 收到的 messages 不含 _loopIndex 字段</li>
+</ul>
+</div>
 </div>
-
 <h2 id="practice">本章练习</h2>
 <ol>
-  <li>
-    故意把压缩结果写回 history, 跑测试, 看 "history 内部不压缩" 是否抓到。
+<li>
+    故意按消息条数衰减, 跑测试, 看"P0 衰减不破坏配对" 是否抓到 (拆了
+    tool_use 块)。
   </li>
-  <li>
-    把 P2 调 LLM 的 system prompt 去掉, 跑测试, 看压缩质量是否明显下降
-    (这条需要做主观对比, 没有客观断言)。
+<li>
+    让 run_bash 返回 5000 字符但 P1 阈值设 10000, 跑测试, 看"P1 即时压缩"
+    是否抓到 (没触发, tool message 撑爆)。
   </li>
-  <li>
-    故意在 Normalize 里合并 system-reminder, 跑测试, 看"system-reminder 不合并"
-    是否抓到。
+<li>
+    用 spread <code>{...m}</code> 复制消息, 跑测试, 看"内部字段不发 LLM"
+    是否抓到 (_loopIndex 进 messages)。
   </li>
 </ol>
-
 <h2 id="summary">本章小结</h2>
 <p>
-  本章给 harness 加了三层压缩管道: Normalize 合并相邻同角色消息, Block
-  按消息块分组, Compress 做 P1 / P0 / P2 三档长度控制。 关键不变量是"压缩不写回
-  history", history 保留原始现场, prepareMessages 只在视图层做压缩。这一原则为第
-  15 章 transcript 和 eval 重放留了 完整的现场。下一章 (第 07 章)
-  我们处理"危险命令" 的问题——Permission, 在工具执行前同步拦截, 不让 LLM
-  写到文件系统就删光。
+  这一章给 harness 加了 3 层压缩防线:
 </p>
-
-<h2 id="next">下一章伏笔</h2>
+<ul>
+<li>
+<strong>P0 衰减</strong>: 每轮自动截断旧 tool result, 轻量。
+    </li>
+<li>
+<strong>P1 即时</strong>: 大 tool result 写文件, 返回 preview + output_id,
+      可逆。
+    </li>
+<li>
+<strong>P2 全量</strong>: 整段 history 超阈值, 调 LLM 总结, 保留最近 K 块。
+    </li></ul>
 <p>
-  第 06 章让 harness 在长任务下稳定, 但 LLM 仍然可以发出"删除项目根目录"
-  这样的危险命令。下一章 Permission 模块会在工具执行前同步拦截, 通过 schema 校验
-  + 用户确认 + 路径边界三个机制, 给 harness 加"安全阀门"。 Permission 是 harness
-  设计哲学最关键的一环——它是模型与真实世界之间的 唯一可观察屏障。
+  关键不变量: <strong>消息块是压缩的原子单位</strong>, tool_use 块
+  不可拆; 内部 metadata 字段 (如 _loopIndex) 不进 LLM messages。
+  下一章 (第 07 章) 解决"工具执行要不要先问用户" 的问题 — Permission。
 </p>
+</content>
\ No newline at end of file
diff --git a/tutorial/chapters/07-permission.html b/tutorial/chapters/07-permission.html
index 44e636c..fa952a6 100644
--- a/tutorial/chapters/07-permission.html
+++ b/tutorial/chapters/07-permission.html
@@ -1,542 +1,716 @@
 <p class="article__eyebrow">第 07 章 · 给工具画边界</p>
-<h1 class="article__title">在工具执行前同步拦截: Permission</h1>
+<h1 class="article__title">Permission: 工具调用前要不要问人</h1>
 <p class="article__lede">
-  前面六章让 harness 能聊天、能调工具、能跑子任务、能压缩 context。但 LLM
-  仍然可能发出"删除项目根目录" 这样的危险命令。这一章给 harness 加 权限管理:
-  在工具执行前同步拦截, 通过 schema 校验 + 用户确认 + 路径边界 三个机制, 让
-  harness 在模型与真实世界之间加一道可观察屏障。
+  第 06 章的压缩解决了"context 撑爆" 问题, 但 harness 现在能调
+  <code>run_write</code> / <code>run_bash</code> 改文件, 没人把关的话, LLM 可能误删 /
+  误改重要文件。 这一章给 harness 加 Permission 系统: 三种模式
+  (plan / auto / default), 黑白名单, 路径边界, ask 降级。 读完后,
+  你能讲清"plan 模式" 和 "default 模式" 的区别, 并能用 fake terminal
+  验证"ask 时被拒, tool result 写 Permission denied" 的协议。
 </p>
-
-<nav id="article-inline-toc" class="article__meta" aria-label="页内小节"></nav>
-
-<hr class="rule" />
-
-<h2 id="delta-from-06">在第 06 章基础上改了什么</h2>
-<p>
-  这一章在 agent 主循环的工具执行分支里加 permission 检查。loop 跑到 "调 LLM →
-  拿到 tool_calls → 准备执行" 时, 每条 tool_call 都先过 permission.check(), 根据
-  action (allow / ask / deny) 决定直接执行、 询问用户、写 error tool
-  message。permission 策略在 Composition Root 注入, 工具 schema 在 permission
-  模块里被复用做参数级校验。 对应到代码, 改动集中在 3 个文件:
-  <code>src/permission.ts</code> (新)、
-  <code>src/agent.ts</code> (改工具执行分支)、<code>src/index.ts</code>
-  (改接线, 注入 permissionManager)。
+<nav aria-label="页内小节" class="article__meta" id="article-inline-toc"></nav>
+<hr class="rule"/>
+<h2 id="real-failure">真实失败故事: 没人把关的工具调用</h2>
+<p>
+  写代码之前, 先看几个真实场景。 harness 能直接调 <code>run_bash</code> /
+  <code>run_write</code>, 中间没有任何人工把关时, 容易出现下面 4 类事故:
+  </p>
+<ol>
+<li>
+<strong>误删</strong>: LLM 试图"清理项目里的临时文件",
+    跑 <code>rm -rf /tmp/some-keep-this</code>, 重要数据没了。
+    </li>
+<li>
+<strong>误改</strong>: LLM 试图"fix 一个 typo", 但它用 <code>run_edit</code>
+    模糊匹配, 把同名 pattern 的 5 个文件全改了。 用户只想要 1 个。
+  </li>
+<li>
+<strong>网络外泄</strong>: LLM 试图"看 React 文档", 调
+    <code>curl https://...</code> 把本地文件内容发到外网。
+    </li>
+<li>
+<strong>危险命令</strong>: LLM 试图"重启服务", 跑
+    <code>sudo systemctl restart nginx</code>, 整个环境挂了。
+  </li>
+</ol>
+<p>
+  朴素想法 1: "LLM 应该聪明到不犯这些错?"
+  错。 LLM 是"下一个 token 预测器", 它在压力下 (上下文满、跑长任务)
+  会选错工具。 把安全责任完全推给 LLM, 就像让司机"靠记忆" 开车
+  不系安全带。
+  </p>
+<p>
+  朴素想法 2: "禁止 LLM 调危险工具?"
+  那 LLM 啥都干不了。 需要<strong>人工把关</strong>。
+  </p>
+<p>
+  正确做法: <strong>Permission 层在 LLM 和工具之间加一道
+  人工把关</strong>。 危险操作前先问用户, 用户拒绝就跳过, 不假装成功。
+  </p>
+<h2 id="three-modes">三种模式: plan / auto / default</h2>
+<p>
+  <strong>用途</strong>: Permission 系统提供 3 种模式, 对应用户不同的
+  "信任程度"。 默认 <code>default</code>, 用户启动时通过
+  <code>/mode plan|auto|default</code> 切换。
+  </p>
+<p>
+  <strong>真实场景</strong>: 用户首次使用 harness, 不知道 harness 行为,
+  选 <code>plan</code> 模式"看着它做"; 跑 CI / 批处理时选 <code>auto</code>
+  信任 harness; 日常使用选 <code>default</code> 平衡。
+  </p>
+<p>
+  <strong>设计思想</strong>: 经典<strong>策略模式</strong> — 同一操作在不同
+  模式下有不同决策 (allow / ask / deny), 用查表代替 if/else 链。
+  这是 Reference 章节的"模式 8 · Strategy 策略模式" 的具体应用。
+  </p>
+<table class="terms">
+<thead>
+<tr>
+<th>模式</th>
+<th>行为</th>
+<th>典型场景</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><code>plan</code></td>
+<td>所有写操作都要用户确认, 只读操作不确认</td>
+<td>用户首次使用, 不知道 harness 行为, 想"看着它做"</td>
+</tr>
+<tr>
+<td><code>auto</code></td>
+<td>所有操作直接执行, 不问</td>
+<td>用户信任 harness, 跑批量任务 (CI / 自动化)</td>
+</tr>
+<tr>
+<td><code>default</code></td>
+<td>按"危险度" 决定: 危险操作问, 安全操作直接做</td>
+<td>日常使用, 大部分操作自动, 危险操作确认</td>
+</tr>
+</tbody>
+</table>
+<p>
+  3 个模式的边界:
 </p>
-<div class="source-links" aria-label="本章 GitHub 永久链接">
-  <a
-    class="source-link"
-    href="https://github.com/pingp76/swoopcode/blob/main/src/permission.ts"
-    target="_blank"
-    rel="noreferrer"
-    >1. src/permission.ts: 权限决策模块 (新)</a
-  >
-  <a
-    class="source-link"
-    href="https://github.com/pingp76/swoopcode/blob/main/src/agent.ts"
-    target="_blank"
-    rel="noreferrer"
-    >2. src/agent.ts: 工具执行前同步检查</a
-  >
-  <a
-    class="source-link"
-    href="https://github.com/pingp76/swoopcode/blob/main/src/command-safety.ts"
-    target="_blank"
-    rel="noreferrer"
-    >3. src/command-safety.ts: 危险命令检测 (新)</a
-  >
-</div>
-
-<h2 id="author-thinking">作者怎么想的: 这一章的思考链</h2>
-<dl class="defs">
-  <dt>想清楚现象</dt>
-  <dd>
-    LLM 接到"清理临时文件" 的指令, 可能输出 "rm -rf /tmp/*"。如果 harness 不拦,
-    项目目录可能一起被删。现象是"模型把工具当成了万能接口, 任何指令
-    都直接落到真实世界"。
-  </dd>
-  <dt>想反例</dt>
-  <dd>
-    最朴素的反例是"完全信任 LLM 输出, 直接执行"。这有两个问题: 一是 LLM
-    生成的文本可能包含危险命令 (例如 "rm -rf"、 "sudo"), 二是 LLM
-    不会做"路径是否在项目内" 的判断, 可能误删系统目录。
-  </dd>
-  <dt>想接口和不变量</dt>
-  <dd>
-    接口:
-    <code
-      >interface PermissionManager { check({toolName, args}): Decision }</code
-    >。 不变量三条: (1) permission 在工具 execute 之前同步执行, 不允许异步延后,
-    (2) 危险命令 deny 之后必须写 tool message 告诉 LLM, 不允许沉默拒绝, (3)
-    用户确认 (ask) 必须在工具 execute 之前, 不能"先执行再撤销"。
-  </dd>
-  <dt>想怎么验证</dt>
-  <dd>
-    fake registry 暴露一个 run_bash 工具, fake permission 配置 "rm -rf" 为 deny,
-    跑完后 history 末尾有 tool message 描述 "Permission denied: rm -rf in args",
-    registry.execute 没被调用 (用 spy 验证)。
-  </dd>
-</dl>
-
-<h2 id="observe-first">先观察: 两段故意有气味的实现</h2>
-
-<div class="note">
-  <p class="note__title">观察 1 · 危险命令过滤写在工具 execute 内</p>
-  <pre class="code-block"><code>// 教学简化版, 仓库真实实现见 GitHub 永久链接
-// 错误位置: 写在工具内部
-async function runBash(args) {
-  if (args.cmd.includes("rm -rf")) throw new Error("blocked");
-  return await exec(args.cmd);
-}</code></pre>
-  <p><strong>问:</strong>为什么不写在工具内部, 而要单独搞 permission 模块?</p>
-  <p>
-    <strong>答:</strong>三件事同时坏掉 —— 复用: 每个工具都要写一遍"危险模式"
-    检查, 30 个工具就有 30 处 if; 测试: 工具单测要同时测"正常情况" 和
-    "permission 拒绝" 两条路径; 策略: 用户无法集中管理"我允许 / 我禁止"
-    的工具列表, 只能改源代码。
+<ol>
+<li>
+<strong>plan 模式</strong>: 防御性, 适合新用户。 但用户会被"问得太多" 烦。
+    跑 10 轮对话可能问 8 次。
+  </li>
+<li>
+<strong>auto 模式</strong>: 信任模式, 跑 CI / 批处理必备。 但用户
+    失去"撤销" 机会 — 误删了没办法。
+  </li>
+<li>
+<strong>default 模式</strong>: 平衡模式, 大部分情况自动, 危险
+    操作问。 适合日常使用。
+  </li>
+</ol>
+<p>
+  默认是 <code>default</code>, 用户启动时通过 <code>/mode auto</code> /
+  <code>/mode plan</code> / <code>/mode default</code> 切换。 mode 状态
+  存在 sessionEventBuffer (第 08 章), 跨 turn 保留。
   </p>
-</div>
-
-<div class="note">
-  <p class="note__title">观察 2 · 异步延后 permission 检查</p>
-  <pre class="code-block"><code>// 教学简化版
-const result = await tool.execute(args);  // 先执行
-permission.check({ toolName, args }).then(decision =&gt; {
-  if (decision.action === "deny") rollback(result);  // 再撤销
-});</code></pre>
-  <p><strong>问:</strong>为什么不"先执行再撤销"?</p>
-  <p>
-    <strong>答:</strong>不是所有副作用都能撤销。rm -rf 删完 100 个文件 之后,
-    撤销 API 是不存在的; 写到网络端的命令也无法回滚。permission
-    必须在执行前同步拦截, 接受这个延迟换"绝对安全"。
+<h2 id="ask-flow">ask 流程: 怎么问用户</h2>
+<p>
+  Permission 检查命中"需要问" 时, 走 ask 流程:
+</p>
+<ol>
+<li>harness 检测到危险操作 (run_write / run_bash 危险命令)</li>
+<li>调 <code>terminal.askUser(prompt)</code> 弹出确认</li>
+<li>用户在 REPL 看到 <code>Allow run_write to /etc/hosts? (y/n)</code></li>
+<li>用户输入 <code>y</code> 或 <code>n</code></li>
+<li>harness 根据回答继续 (y: 执行, n: 跳过)</li>
+</ol>
+<p>
+  关键: ask 必须<strong>同步</strong>等用户回答。 REPL 不能继续读
+  下一行, 直到用户回答当前 ask。 这跟普通 readline 不同 — readline
+  是一问一答, ask 是一问一等 + 必须答。
   </p>
-</div>
-
-<h2 id="interfaces">接口形状: 在写实现前钉死</h2>
-<pre class="code-block"><code>// 教学简化版, 仓库真实实现见 GitHub 永久链接
-export type Decision = {
-  action: "allow" | "ask" | "deny";
-  reason: string;
-  message?: string;  // 当 action 是 "ask" 时, 给用户看的询问
-};
-
-export interface PermissionRequest {
-  toolName: string;
-  args: Record&lt;string, unknown&gt;;
+<h2 id="naive">朴素反例: 权限检查写在工具里</h2>
+<pre class="code-block"><code>// ❌ 反例: 权限写在工具里
+async function runWrite(args) {
+  if (args.path.startsWith("/etc")) {
+    if (!confirm("Allow run_write to /etc/hosts? (y/n)")) {
+      return { content: "Permission denied", error: true };
+    }
+  }
+  await fs.writeFile(args.path, args.content);
+  return { content: "OK" };
 }
 
-export interface PermissionManager {
-  check(request: PermissionRequest): Decision;
-  // 子智能体 ask 降级为 deny 时不抛错, 由 harness 写 tool message
-  downgradeAsk?(request: PermissionRequest): Decision;
+// 同样逻辑要在 run_bash / run_edit / run_delete / 等等各写一份</code></pre>
+<p>
+  5 件事立刻坏掉:
+</p>
+<ol>
+<li>
+<strong>重复代码</strong>: 同样的"问 / 拒绝" 逻辑在 5 个工具里各写一份,
+    维护噩梦。
+  </li>
+<li>
+<strong>模式切换写不出</strong>: <code>plan</code> / <code>auto</code> / <code>default</code>
+    怎么切? 每个工具都要知道当前 mode, mode 状态散在各处。
+  </li>
+<li>
+<strong>测试难写</strong>: 测试时每个工具都要 mock 自己的 confirm。
+    不能用一个 fake terminal 统一替换。
+  </li>
+<li>
+<strong>规则不一致</strong>: run_write 问 /etc 路径, run_bash 问
+    哪些命令, run_edit 问哪些 pattern — 各自一套规则, 容易有漏洞。
+  </li>
+<li>
+<strong>工具作者必须懂权限</strong>: 写 run_web_fetch 的工程师
+    要决定"问不问", 工具作者增加心智负担。
+  </li>
+</ol>
+<p>
+  解决方式: Permission 层在 ToolRegistry.invoke 之前拦截, 工具
+  不知道也不关心权限。
+  </p>
+<h2 id="permission-manager">PermissionManager: 集中权限决策</h2>
+<p>
+  把权限检查抽出来, 集中在一个 manager:
+</p>
+<pre class="code-block"><code>interface PermissionManager {
+  check(input: {
+    toolName: string;
+    args: Record&lt;string, unknown&gt;;
+    mode: "plan" | "auto" | "default";
+  }): PermissionDecision;
+  setMode(mode: "plan" | "auto" | "default"): void;
+  addBlacklist(pattern: string): void;
+  addWhitelist(pattern: string): void;
 }
 
-export type AskUserFn = (message: string) =&gt; Promise&lt;boolean&gt;;</code></pre>
-
-<h2 id="policy">策略层: 工具级 + 参数级 + 命令级</h2>
-<p>完整的 permission 策略是三层叠加的:</p>
+type PermissionDecision =
+  | { kind: "allow" }
+  | { kind: "deny"; reason: string }              // 硬黑名单 (危险命令), 不问
+  | { kind: "ask"; prompt: string };              // 需要用户确认</code></pre>
+<p class="source-link"><a href="https://github.com/pingp76/swoopcode/blob/main/src/permission.ts#L1" rel="noreferrer" target="_blank">GitHub · src/permission.ts PermissionManager 实现 (L1)</a></p>
+<p>
+  3 种 decision 的边界:
+</p>
 <ol>
-  <li>
-    <strong>工具级</strong>: 哪些工具被允许 (allowlist) / 禁止 (denylist)。
-    这是最粗的边界, 通常配置在项目根 <code>.claude/settings.json</code> 里。
+<li>
+<strong>allow</strong>: 直接执行, 不问用户。 工具白名单 / 安全操作。
   </li>
-  <li>
-    <strong>参数级</strong>: 工具被允许, 但参数不允许。例如 "run_bash" 被允许,
-    但参数里如果出现 "rm -rf" 就拒绝。
+<li>
+<strong>deny</strong>: 硬黑名单, 拒绝, 不问。 比如 <code>rm -rf /</code>、
+    <code>sudo</code>、<code>/etc</code> 写。 即使 auto 模式也拒绝。
   </li>
-  <li>
-    <strong>命令级</strong>: 针对 run_bash 这种"自由文本命令" 工具, 进一步 用
-    command-safety.ts 检测危险模式 (例如删除、sudo、网络下载并执行)。
+<li>
+<strong>ask</strong>: 调 terminal.askUser, 问用户。 plan 模式全 ask,
+    default 模式危险操作 ask, auto 模式不 ask。
   </li>
 </ol>
-<pre class="code-block"><code>// 教学简化版, 仓库真实实现见 GitHub 永久链接
-export function createPermissionManager(config: PermissionConfig): PermissionManager {
-  const toolAllowlist = new Set(config.allowlist ?? []);
-  const toolDenylist = new Set(config.denylist ?? []);
-
-  return {
-    check(request) {
-      // 1. 工具级
-      if (toolDenylist.has(request.toolName)) {
-        return { action: "deny", reason: `Tool "${request.toolName}" is denied` };
-      }
-      if (toolAllowlist.size &gt; 0 &amp;&amp; !toolAllowlist.has(request.toolName)) {
-        return { action: "deny", reason: `Tool "${request.toolName}" is not in allowlist` };
-      }
-
-      // 2. 参数级 (run_bash 的命令字符串检测)
-      if (request.toolName === "run_bash") {
-        const cmd = String(request.args["cmd"] ?? "");
-        if (detectDangerousCommand(cmd)) {
-          return { action: "deny",
-            reason: `Dangerous command detected: ${describeDanger(cmd)}` };
-        }
-      }
-
-      // 3. 写操作默认 ask (让用户确认)
-      if (isWriteOperation(request.toolName)) {
-        return { action: "ask",
-          message: `Allow ${request.toolName}(${summarizeArgs(request.args)})?` };
-      }
-
-      return { action: "allow", reason: "ok" };
-    },
-  };
-}</code></pre>
-
-<h2 id="loop-integration">loop 接入: 三个 action 三条路径</h2>
-<pre class="code-block"><code>// 教学简化版, 仓库真实实现见 GitHub 永久链接
-for (const call of assistant.tool_calls) {
-  // 1. permission 检查 (同步, 必须先于 execute)
-  const decision = permissionManager.check({ toolName: call.name, args: call.args });
-
-  if (decision.action === "deny") {
-    // 写 error tool message, 不执行
-    history.add({ role: "tool", tool_call_id: call.id,
-      content: `Permission denied: ${decision.reason}` });
-    continue;
+<p>
+  ❌ / ✅: 错误做法是 deny 和 ask 混为一谈。
+</p>
+<pre class="code-block"><code>// ❌ deny 和 ask 都是 ask, 让用户决定
+async function check(toolName, args, mode) {
+  if (isDangerous(toolName, args)) {
+    const allowed = await ask(`Allow ${toolName}?`);
+    return allowed ? { kind: "allow" } : { kind: "deny" };
   }
+  return { kind: "allow" };
+}
+// 问题: rm -rf / 也让用户决定, 用户可能误答 y
 
-  if (decision.action === "ask") {
-    // 子智能体没有 askUserFn, 降级为 deny
-    if (!askUserFn) {
-      history.add({ role: "tool", tool_call_id: call.id,
-        content: `Permission denied (no confirmation): ${decision.message ?? ""}` });
-      continue;
-    }
-    const approved = await askUserFn(decision.message ?? "");
-    if (!approved) {
-      history.add({ role: "tool", tool_call_id: call.id, content: "User denied" });
-      continue;
-    }
+// ✅ deny 是硬规则, 不让用户选择
+async function check(toolName, args, mode) {
+  if (isBlacklisted(toolName, args)) return { kind: "deny", reason: "hard blacklist" };
+  if (isDangerous(toolName, args) &amp;&amp; mode !== "auto") {
+    return { kind: "ask", prompt: `Allow ${toolName}?` };
   }
+  return { kind: "allow" };
+}</code></pre>
+<h2 id="blacklist">黑名单: 危险命令的硬规则</h2>
+<p>
+  黑名单是 hard rule, 永远拒绝, 不让用户选择。 常见黑名单:
+</p>
+<pre class="code-block"><code>const HARDCODED_BLACKLIST = [
+  /rm\s+-rf\s+\//,                    // rm -rf /
+  /sudo/,                              // sudo 命令
+  /mkfs/,                             // 格式化
+  /dd\s+if=/,                         // dd 写盘
+  /:\(\)\s*\{\s*:\s*\|\s*:\s*&amp;\s*\}\s*;\s*:/,  // fork bomb  :(){ :|:&amp; };:
+  /chmod\s+-R\s+777\s+\//,             // chmod 777 根
+];</code></pre>
+<p class="source-link"><a href="https://github.com/pingp76/swoopcode/blob/main/src/command-safety.ts#L1" rel="noreferrer" target="_blank">GitHub · src/command-safety.ts HARDCODED_BLACKLIST (L1)</a></p>
+<p>
+  这些规则来自 <code>command-safety.ts</code>, 跨工具共享 (run_bash
+  和 ExecutionPolicy 都用)。 集中在一处, 避免两处实现漂移。
+  </p>
+<p>
+  黑名单的判断: 先用正则匹配命令字符串, 匹配成功就 deny。 这种
+  简单匹配能挡掉 90% 的常见误操作。 复杂 shell 语法解析不可靠时
+  宁可拒绝。
+  </p>
+<h2 id="mode-decision-table">3 种模式的决策表</h2>
+<table class="terms">
+<thead>
+<tr>
+<th>工具 + 操作</th>
+<th><code>plan</code></th>
+<th><code>default</code></th>
+<th><code>auto</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>run_read (任何路径)</td>
+<td>allow</td>
+<td>allow</td>
+<td>allow</td>
+</tr>
+<tr>
+<td>run_bash (安全命令如 ls / cat / echo)</td>
+<td>ask</td>
+<td>allow</td>
+<td>allow</td>
+</tr>
+<tr>
+<td>run_bash (危险但不在黑名单的命令, 如 rm 单个文件)</td>
+<td>ask</td>
+<td>ask</td>
+<td>allow</td>
+</tr>
+<tr>
+<td>run_write (workspace 内)</td>
+<td>ask</td>
+<td>ask</td>
+<td>allow</td>
+</tr>
+<tr>
+<td>run_write (workspace 外)</td>
+<td>deny</td>
+<td>deny</td>
+<td>deny</td>
+</tr>
+<tr>
+<td>run_edit (workspace 内)</td>
+<td>ask</td>
+<td>ask</td>
+<td>allow</td>
+</tr>
+<tr>
+<td>run_web_fetch (任意 URL)</td>
+<td>ask</td>
+<td>ask</td>
+<td>allow</td>
+</tr>
+<tr>
+<td>黑名单 (任何模式)</td>
+<td>deny</td>
+<td>deny</td>
+<td>deny</td>
+</tr>
+</tbody>
+</table>
+<figure class="figure">
+<div class="flow-compare" role="img" aria-label="3 模式决策对比">
+<div class="flow-compare__col flow-compare__col--good">
+<div class="flow-compare__label">plan</div>
+<span class="flow-node">run_read: allow</span>
+<span class="flow-node">run_bash: ask</span>
+<span class="flow-node">run_write: ask</span>
+<span class="flow-node">黑名单: deny</span>
+</div>
+<div class="flow-compare__col flow-compare__col--good">
+<div class="flow-compare__label">default</div>
+<span class="flow-node">run_read: allow</span>
+<span class="flow-node">安全 run_bash: allow</span>
+<span class="flow-node">危险 run_bash: ask</span>
+<span class="flow-node">run_write: ask</span>
+</div>
+<div class="flow-compare__col flow-compare__col--good">
+<div class="flow-compare__label">auto</div>
+<span class="flow-node">run_read: allow</span>
+<span class="flow-node">run_bash: allow</span>
+<span class="flow-node">run_write: allow</span>
+<span class="flow-node">黑名单: deny (仍然)</span>
+</div>
+</div>
+<figcaption>图 07-1 · 3 模式决策对比。 黑名单跨模式都 deny, ask / allow 随模式变: 跑 CI 选 auto, 日常使用选 default, 学习阶段选 plan。</figcaption>
+</figure>
+<p>
+  关键点: 黑名单是硬规则, 跨模式都拒绝。 workspace 外写也是硬规则,
+  不让用户"破例" — workspace 是项目边界, 突破就破坏隔离。
+  </p>
+<h2 id="path-boundary">路径边界: workspace 内的 hard rule</h2>
+<p>
+  <strong>用途</strong>: 写操作必须限制在 workspace 内, 这是 hard rule, 跟
+  模式无关。 防止 LLM 误写到 /etc / 用户的 home / 任何项目外。
+  </p>
+<p>
+  <strong>真实场景</strong>: LLM 调 <code>run_write("/etc/hosts", ...)</code>
+  修改系统 hosts, 任何模式都拒绝。 workspace 是项目边界, 突破就
+  破坏隔离。
+  </p>
+<p>
+  <strong>设计思想</strong>: 路径边界用 <code>path.resolve()</code> 解析相对路径和
+  <code>..</code>, 然后判断 resolved 是否在 workspaceRoot 下。 不在就 deny。
+  </p>
+<p>
+  <strong>实现细节</strong>:
+</p>
+<pre class="code-block"><code>// 教学简化版
+function isPathInWorkspace(targetPath: string, workspaceRoot: string): boolean {
+  const resolved = path.resolve(workspaceRoot, targetPath);
+  return resolved.startsWith(path.resolve(workspaceRoot) + path.sep);
+}
 
-  // 2. 通过检查才执行
-  const tool = registry.get(call.name);
-  const result = await tool.execute(call.args);
-  history.add({ role: "tool", tool_call_id: call.id, content: result.content });
+// 决策
+if (toolName === "run_write" &amp;&amp; !isPathInWorkspace(args.path, workspaceRoot)) {
+  return { kind: "deny", reason: "path outside workspace" };
+}</code></pre>
+<p class="source-link"><a href="https://github.com/pingp76/swoopcode/blob/main/src/permission.ts#L1" rel="noreferrer" target="_blank">GitHub · src/permission.ts 路径边界检查 (L1)</a></p>
+<p>
+  边界检查的细节: 用 <code>path.resolve()</code> 解析相对路径和 <code>..</code>,
+  然后判断 resolved 是否在 workspaceRoot 下。 不在就 deny。
+  <code>startsWith(workspaceRoot + sep)</code> 避免 <code>/tmp/foobar</code>
+  被误判为 <code>/tmp/foo</code> 的子路径。
+  </p>
+<h2 id="terminal-askuser">Terminal.askUser 同步等待</h2>
+<p>
+  ask 必须<strong>同步</strong>等用户回答。 这跟普通 readline.question()
+  不一样, 后者是一问一答, 前者是一问一等 + 必须答。
+  </p>
+<pre class="code-block"><code>// 教学简化版
+interface Terminal {
+  question(prompt: string): Promise&lt;string&gt;;        // 普通 readline
+  askUser(prompt: string): Promise&lt;boolean&gt;;       // 同步 ask, 必须答
+  println(text: string): void;
+  close(): void;
+}</code></pre>
+<p class="source-link"><a href="https://github.com/pingp76/swoopcode/blob/main/src/terminal.ts#L1" rel="noreferrer" target="_blank">GitHub · src/terminal.ts Terminal 接口 (L1)</a></p>
+<p>
+  真实 readline 实现的 askUser:
+</p>
+<pre class="code-block"><code>async function askUser(prompt: string): Promise&lt;boolean&gt; {
+  // 暂停 readline 主循环, 弹出确认
+  this.rl.pause();
+  process.stdout.write(`${prompt} (y/n) `);
+  return new Promise((resolve) =&gt; {
+    const onLine = (line: string) =&gt; {
+      this.rl.removeListener("line", onLine);
+      this.rl.resume();
+      resolve(line.trim().toLowerCase() === "y");
+    };
+    this.rl.on("line", onLine);
+  });
+}</code></pre>
+<p>
+  关键: <code>rl.pause()</code> 暂停主 readline 循环, 否则用户在主
+  readline 输的内容会被吞。 ask 完 <code>rl.resume()</code> 恢复。
+  </p>
+<h2 id="deny-pretends-success">拒绝时怎么写 tool result</h2>
+<p>
+  拒绝时, tool message 应该明确写"Permission denied", 不能假装成功:
+</p>
+<pre class="code-block"><code>// 教学简化版
+if (decision.kind === "deny") {
+  history.add({
+    role: "tool",
+    tool_call_id: call.id,
+    content: `Permission denied: ${decision.reason}. 请重新决定或换其他方案。`,
+    // 不标 error: true, 因为这是预期行为, 不是工具错误
+  });
+  continue;  // 跳过 invoke
+}
+if (decision.kind === "ask" &amp;&amp; !(await terminal.askUser(decision.prompt))) {
+  history.add({
+    role: "tool",
+    tool_call_id: call.id,
+    content: "Permission denied by user. 请重新决定或换其他方案。",
+  });
+  continue;  // 用户拒绝, 跳过 invoke
 }</code></pre>
+<p class="source-link"><a href="https://github.com/pingp76/swoopcode/blob/main/src/agent.ts#L1" rel="noreferrer" target="_blank">GitHub · src/agent.ts 拒绝时写 tool message (L1)</a></p>
+<p>
+  ❌ / ✅: 不要拒绝时 throw。
+</p>
+<pre class="code-block"><code>// ❌ 拒绝时 throw, 整个 loop 崩
+if (!allowed) throw new Error("Permission denied");
 
+// ✅ 拒绝时写 tool message, LLM 看到后自己调整
+if (!allowed) history.add({ role: "tool", tool_call_id: call.id, content: "Permission denied by user." });</code></pre>
+<h2 id="ask-degrade">ask 降级: 用户不在时怎么办</h2>
+<p>
+  ask 流程的边界: 用户不在 REPL 旁边怎么办?
+  </p>
+<ol>
+<li>
+<strong>CI / 批处理</strong>: 跑 auto 模式, 不 ask。 permission 系统
+    检测到 auto 模式时不调 askUser。
+    </li>
+<li>
+<strong>用户不在但 harness 在跑</strong>: ask 会卡住。 permission
+    系统给 ask 设 timeout, 默认超时 (比如 60 秒), 超时按 deny 处理。
+    </li>
+<li>
+<strong>异步执行 (第 13 章)</strong>: Async Run 默认 auto 模式, 不 ask。
+    </li>
+</ol>
+<p>
+  超时降级很重要, 否则 harness 卡在 ask 等用户, 跑 1 小时没动静,
+  用户以为死了。
+  </p>
+<h2 id="fake-test">fake terminal 测试: 拒绝时 tool result 写 denied</h2>
+<pre class="code-block"><code>test("ask 时用户拒绝, tool result 写 Permission denied", async () =&gt; {
+  const fakeTerminal = {
+    askUser: async (prompt: string) =&gt; false,  // 用户拒绝
+    question: async (prompt: string) =&gt; "test input",
+    println: () =&gt; {},
+  };
+  const permissionManager = createPermissionManager({ mode: "default", terminal: fakeTerminal });
+  const fakeLLM = createFakeLLM([
+    { content: null, toolCalls: [{ id: "w1", function: { name: "run_write", arguments: '{"path":"foo.txt","content":"x"}' } }], finishReason: "tool_calls" },
+    { content: "Permission denied, I will not modify files.", toolCalls: [], finishReason: "stop" },
+  ]);
+  const agent = createAgent({ llm: fakeLLM, history, tools, permissionManager, terminal: fakeTerminal });
+  await agent.run("Modify foo.txt");
+
+  // 验证: history 含 "Permission denied" 的 tool message
+  const lastToolMsg = fakeLLM.lastCapturedMessages().filter(m =&gt; m.role === "tool").pop();
+  expect(lastToolMsg.content).toMatch(/Permission denied/);
+  // 验证: LLM 收到 Permission denied 后, 最终回复提到它
+  expect(fakeLLM.lastCapturedMessages().pop().content).toMatch(/Permission denied/);
+});</code></pre>
+<p class="source-link"><a href="https://github.com/pingp76/swoopcode/blob/main/src/permission.test.ts#L1" rel="noreferrer" target="_blank">GitHub · src/permission.test.ts ask 拒绝测试 (L1)</a></p>
+<p>
+  这条测试如果挂, 你立刻知道 ask / deny 流程有 bug。
+  2 个断言: tool message 含 denied, LLM 最终回复提到它。
+  </p>
+<h2 id="common-confusion">3 个常见误解</h2>
+<dl class="defs">
+<dt>误解 1 · "Permission 应该写在工具里"</dt>
+<dd>
+    错。 写进工具里 = 重复代码 + 难测试 + 规则不一致。 Permission
+    必须在 ToolRegistry.invoke 之前拦截, 工具不感知权限。
+  </dd>
+<dt>误解 2 · "黑名单和 ask 是同一回事"</dt>
+<dd>
+    错。 黑名单是 hard rule, 不让用户选择。 ask 是 user choice,
+    用户可以 y 或 n。 <code>rm -rf /</code> 绝不能 ask, 必须 deny。
+  </dd>
+<dt>误解 3 · "auto 模式 = 不检查权限"</dt>
+<dd>
+    错。 auto 模式 = 不 ask, 但黑名单仍然 deny。 <code>rm -rf /</code>
+    在 auto 模式也拒绝。 auto 只是"不打扰用户", 不是"放飞"。
+  </dd>
+</dl>
 <h2 id="trap">反例梯度</h2>
-
 <div class="card card--trap">
-  <div class="card__head">
-    <span class="card__tag">新手错法 · A</span>
-  </div>
-  <div class="card__body">
-    <p><strong>常见错误:</strong>permission 写在工具 execute 内部。</p>
-    <p><strong>为什么错:</strong>无法复用, 无法测试, 用户无法集中管理。</p>
-    <p>
-      <strong>正确做法:</strong>permission 在 agent 主循环同步执行, 工具 execute
-      假定输入已校验。
+<div class="card__head">
+<span class="card__tag">新手错法 · A</span>
+</div>
+<div class="card__body">
+<p>
+<strong>常见错误:</strong> 权限检查写进每个工具里。
+    </p>
+<p>
+<strong>为什么错:</strong> 重复代码, 模式切换写不出, 工具作者增加
+      心智负担, 规则不一致。
+    </p>
+<p>
+<strong>正确做法:</strong> PermissionManager 在 ToolRegistry.invoke
+      之前拦截, 工具不感知权限。
     </p>
-  </div>
 </div>
-
+</div>
 <div class="card card--trap">
-  <div class="card__head">
-    <span class="card__tag">中级错法 · B</span>
-  </div>
-  <div class="card__body">
-    <p><strong>常见错误:</strong>deny 时不写 tool message, 沉默拒绝。</p>
-    <p>
-      <strong>为什么错:</strong>LLM 不知道"刚才为什么没执行", 会反复重试同样的
-      tool_call, 死循环。
+<div class="card__head">
+<span class="card__tag">中级错法 · B</span>
+</div>
+<div class="card__body">
+<p>
+<strong>常见错误:</strong> 拒绝时 throw, 让 loop 崩。
     </p>
-    <p>
-      <strong>正确做法:</strong>deny 必写 role: "tool" 消息, content
-      描述拒绝原因。
+<p>
+<strong>为什么错:</strong> 拒绝是业务行为, 不是 harness 错误, throw 会让
+      整个 loop 崩, messages 序列错乱。
+    </p>
+<p>
+<strong>正确做法:</strong> 拒绝时写 "Permission denied" tool message,
+      LLM 看到后自己调整。
     </p>
-  </div>
 </div>
-
+</div>
 <div class="card card--trap">
-  <div class="card__head">
-    <span class="card__tag">高级错法 · C</span>
-  </div>
-  <div class="card__body">
-    <p><strong>常见错误:</strong>子智能体在 ask 路径上阻塞, 等用户确认。</p>
-    <p>
-      <strong>为什么错:</strong>子智能体本来就在后台跑, 阻塞等用户会让主 loop
-      卡住, 体验差。
+<div class="card__head">
+<span class="card__tag">高级错法 · C</span>
+</div>
+<div class="card__body">
+<p>
+<strong>常见错误:</strong> <code>rm -rf /</code> 让用户决定 (ask 而非 deny)。
     </p>
-    <p>
-      <strong>正确做法:</strong>子智能体没有 askUserFn, 自动降级为 deny, 写 tool
-      message 告诉模型"需要用户确认, 跳过此步骤"。
+<p>
+<strong>为什么错:</strong> 黑名单是 hard rule, 用户可能误答 y, 误删
+      整个系统。 跨模式都拒绝。
+    </p>
+<p>
+<strong>正确做法:</strong> 黑名单和 ask 分开, 黑名单永远 deny, 不让
+      用户选择。
     </p>
-  </div>
 </div>
-
+</div>
 <div class="card card--trap">
-  <div class="card__head">
-    <span class="card__tag">边界错法 · D</span>
-  </div>
-  <div class="card__body">
-    <p><strong>常见错误:</strong>denylist 配置后没生效, 工具仍然执行。</p>
-    <p>
-      <strong>为什么错:</strong>denylist 检查写在 allow 检查之后, allow
-      命中就跳过了 denylist。验证: 写一个测试, 工具在 allowlist 和 denylist
-      都出现, 应当 deny。
+<div class="card__head">
+<span class="card__tag">边界错法 · D</span>
+</div>
+<div class="card__body">
+<p>
+<strong>常见错误:</strong> ask 没设 timeout, harness 卡住等用户。
+    </p>
+<p>
+<strong>为什么错:</strong> 用户不在 REPL 旁边, ask 会永远卡住, 跑 1 小时
+      没动静, 看起来像死循环。
+    </p>
+<p>
+<strong>正确做法:</strong> ask 设 60 秒 timeout, 超时按 deny 处理,
+      避免挂死。
     </p>
-    <p><strong>正确做法:</strong>先 denylist 后 allowlist, deny 优先。</p>
-  </div>
 </div>
-
+</div>
 <h2 id="validate">如何验证 (本章 Validation 卡片)</h2>
 <div class="card card--validation">
-  <div class="card__head">
-    <span class="card__tag">Validation · 第 07 章</span>
-  </div>
-  <div class="card__body">
-    <p>
-      <strong>deny 必写 tool message:</strong>fake permission 对 run_bash 返回
-      deny, fake registry 暴露 spy, 跑完后 spy 没被调用, history 末尾有 tool
-      message 含 "Permission denied"。
+<div class="card__head">
+<span class="card__tag">Validation · 第 07 章</span>
+</div>
+<div class="card__body">
+<p>
+<strong>plan 模式全 ask:</strong> plan 模式下, run_write 触发
+      terminal.askUser, 用户回答决定 allow / deny。
     </p>
-    <p>
-      <strong>ask 在子智能体降级为 deny:</strong>子智能体调用 ask 路径,
-      askUserFn 未注入, 写 tool message "Permission denied (no confirmation)",
-      不抛异常。
+<p>
+<strong>黑名单跨模式 deny:</strong> auto 模式下, run_bash("rm -rf /")
+      仍返回 deny, 不调 askUser。
     </p>
-    <p>
-      <strong>denylist 优先于 allowlist:</strong>工具同时在 denylist 和
-      allowlist, decision.action === "deny"。
+<p>
+<strong>workspace 外 deny:</strong> run_write("/etc/passwd", ...) 任何模式
+      都 deny, 错误信息含 "path outside workspace"。
     </p>
-    <p>
-      <strong>危险命令检测:</strong>run_bash 参数 "rm -rf /tmp/foo",
-      decision.action === "deny", reason 含 "rm -rf"。
+<p>
+<strong>拒绝写 Permission denied tool message:</strong> 用户在 ask 时
+      选 n, history 末尾出现 "Permission denied by user" 的 tool message。
     </p>
-    <p>
-      <strong>permission 在 execute 之前同步:</strong>fake permission 用
-      Promise.resolve (同步), spy 验证 execute 在 permission.check 返回
-      之后才调用。
+<p>
+<strong>ask timeout 降级:</strong> 用户 60 秒不回答, harness 按 deny
+      处理, 写 "Permission denied: timeout" tool message。
     </p>
-  </div>
 </div>
-
-<h2 id="lookback">回望第 00–06 章: 哪些原则在本章兑现了</h2>
+</div>
+<h2 id="lookback">回望: 哪些原则在本章兑现了</h2>
 <ul>
-  <li>
-    <strong>Composition Root 唯一接线:</strong>permissionManager 在
-    <code>index.ts</code> 创建, 注入 agent, 不在 agent.ts 内 new。
-  </li>
-  <li>
-    <strong>工具职责单一:</strong>execute 假定输入已校验, 不写 if 守卫。这条是第
-    02 章埋伏笔, 本章兑现。
-  </li>
-  <li>
-    <strong>tool message 协议沿用:</strong>deny / ask 拒绝都写 role: "tool"
-    消息, 不破坏第 02 章的协议约束。
-  </li>
-  <li>
-    <strong>子智能体隔离 (第 04 章):</strong>子智能体 ask 降级为 deny, 不阻塞主
-    loop。
-  </li>
+<li>
+<strong>关注点分离</strong>: PermissionManager 独立于 ToolRegistry, 工具
+    不感知权限。
+    </li>
+<li>
+<strong>工厂模式</strong>: PermissionManager 在 Composition Root 创建,
+    注入 agent。
+    </li>
+<li>
+<strong>硬规则与软规则分开</strong>: 黑名单 = hard, ask = user choice, 不混淆。
+    </li>
+<li>
+<strong>工具调用协议</strong>: 拒绝时写 tool message, 复用第 02 章的协议,
+    LLM 能正常推理。
+    </li>
 </ul>
-
 <h2 id="forward">前瞻张力: 留给后续章节</h2>
 <dl class="defs">
-  <dt>permission 决策记录在哪</dt>
-  <dd>
-    第 08 章 Hook 会在 PreToolUse 钩子里记录 permission 决策, transcript
-    留底用于审计。
-  </dd>
-  <dt>用户确认如何实现</dt>
-  <dd>
-    第 13 章 async run 会让 ask 异步化, 不阻塞主 loop, 后台等用户响应再唤醒。
-  </dd>
-  <dt>危险命令模式如何更新</dt>
-  <dd>
-    第 10 章 cache-friendly 布局会展开, 危险命令模式 (denylist) 是稳定配置, 不进
-    messages。
-  </dd>
-  <dt>permission 跨项目保留</dt>
-  <dd>
-    第 09 章 memory 区分"项目级 permission" (本章) 和"用户级长期偏好"
-    (例如"用户对 rm 永远 deny")。
-  </dd>
+<dt>ask 之外的扩展点</dt>
+<dd>
+    Permission 在 invoke 之前拦截。 后续 Hook (第 08 章) 提供
+    更通用的扩展 — PreToolUse / PostToolUse 钩子, 不只是 ask。
+    </dd>
+<dt>白名单持久化</dt>
+<dd>
+    用户对某些路径/命令的回答可以记住, 下次不重复问。 第 09 章
+    Memory 展开持久化。
+    </dd>
 </dl>
-
-<h2 id="vibe-coding-07">本次如何 vibe code: 第 07 章的三件套</h2>
-
-<h3 id="vibe-feed-07">拆卡: 4 轮迭代的具体产物</h3>
-<ol>
-  <li>
-    <strong>第 1 轮 · 接口</strong>。让 LLM 给出
-    <code>PermissionManager</code> / <code>PermissionRequest</code> /
-    <code>Decision</code> / <code>AskUserFn</code> 四个 interface。本轮不写实现,
-    重点钉"同步 / 三 action / deny 必写 tool message" 三条约束。
-  </li>
-  <li>
-    <strong>第 2 轮 · 接线</strong>。让 LLM 给出 <code>index.ts</code> 接线,
-    <code>createPermissionManager()</code> 接受 <code>PermissionConfig</code>,
-    agent.run() 仍然走第 02 章的工具执行路径 (无 permission 检查)。本轮 review
-    重点: permissionManager 与 llm / history 是同级依赖。
-  </li>
-  <li>
-    <strong>第 3 轮 · 边界</strong>。让 LLM 写 createPermissionManager +
-    agent.run 接入。本轮 review 重点: deny / ask 拒绝路径必写 tool message,
-    同步检查, denylist 优先。
-  </li>
-  <li>
-    <strong>第 4 轮 · 验证</strong>。让 LLM 写
-    <code>test/permission.test.ts</code> +
-    <code>test/agent.permission.test.ts</code>。本轮 review 重点: spy 验证
-    execute 在 permission.check 之后才调用, deny 不调 execute。
-  </li>
-</ol>
-
-<h3 id="vibe-review-07">Review: 第 07 章专属 checklist</h3>
-<ol>
-  <li>
-    <strong>permission 同步执行。</strong>不得出现
-    <code>await permission.check</code> 之外的异步延后。验证:
-    <code>grep -n 'permission' src/agent.ts</code> 在 tool 执行路径上,
-    全部是同步调用。
-  </li>
-  <li>
-    <strong>deny 必写 tool message。</strong>不得
-    <code>if (decision.action === "deny") continue</code> (无消息)。验证: 三个
-    action 分支都有 <code>history.add({role: "tool", ...})</code>。
-  </li>
-  <li>
-    <strong>denylist 优先于 allowlist。</strong>验证: Validation 卡片"denylist
-    优先"那条测试通过。
-  </li>
-  <li>
-    <strong>子智能体 ask 降级为 deny。</strong>验证: 子智能体调用 ask
-    路径不阻塞, 写 deny 消息。
-  </li>
-  <li>
-    <strong>permission 不写在工具 execute 内。</strong>验证:
-    <code>grep -n 'process.env\|whitelist' src/tools/run_bash.ts</code> 应当 0
-    行。
-  </li>
-</ol>
-
-<h3 id="vibe-debug-07">调试: 第 07 章典型伪装</h3>
-<ol>
-  <li>
-    <strong>伪装 A · deny 时 console.warn 但不写 tool message。</strong>症状:
-    用户看到日志说"被拒绝", 但 LLM 不知道, 反复重试。验证: Validation 卡片"deny
-    必写 tool message" 那条, 必须断言 history 末尾有 tool message, 不只是
-    console.warn。
-  </li>
-  <li>
-    <strong>伪装 B · 危险命令检测漏掉 shell 重定向。</strong>症状:
-    <code>echo "rm -rf /" &gt; script.sh &amp;&amp; bash script.sh</code>
-    通过检测, 因为 <code>rm -rf</code> 不在命令开头。验证: command-safety.ts
-    应当做"全命令字符串扫描", 不只检查开头。
-  </li>
-  <li>
-    <strong>伪装 C · permission 在 tool execute 之后回滚。</strong>症状:
-    看到"异步延后" 写法。验证: spy 验证 execute 在 permission.check 之后才调用,
-    不允许反序。
-  </li>
-</ol>
-
-<h3 id="vibe-iterate-07">迭代: 第 07 章 4 个 commit 节点</h3>
-<ol>
-  <li>
-    <code>feat(ch07): 钉 PermissionManager / Decision / AskUserFn 接口</code> ——
-    tsc 通过, 无实现。
-  </li>
-  <li>
-    <code>feat(ch07): createPermissionManager 工厂 + agent.run 接入 stub</code>
-    —— tsc 通过, decision 永远 allow。
-  </li>
-  <li>
-    <code
-      >feat(ch07): 三 action 分支 + deny/ask 写 tool message +
-      子智能体降级</code
-    >
-    —— 跑通 Validation 卡片前 4 条。
-  </li>
-  <li>
-    <code>test(ch07): 同步执行 (spy 验证) + denylist 优先</code> —— 全绿。
-  </li>
-</ol>
-
 <h2 id="prompt-card">Prompt Card (本章任务)</h2>
 <div class="card card--prompt">
-  <div class="card__head">
-    <span class="card__tag">Prompt Card · 第 07 章</span>
-    <button class="card__copy" type="button" data-copy-card>复制</button>
-  </div>
-  <div class="card__body">
-    <p>
-      <strong>目标:</strong>实现 Permission 模块, 在工具执行前同步拦截, 通过
-      allowlist / denylist / 危险命令检测三层策略。
-    </p>
-    <p>
-      <strong>场景:</strong>用户输入 "清理临时文件", LLM 调用 run_bash("rm -rf
-      /tmp/foo"), permission 在 run_bash.execute 之前 deny, 写 tool message
-      "Permission denied: rm -rf", LLM 第二轮看到拒绝, 改用更安全的命令。
+<div class="card__head">
+<span class="card__tag">Prompt Card · 第 07 章</span>
+<button class="card__copy" data-copy-card="" type="button">复制</button>
+</div>
+<div class="card__body">
+<p>
+<strong>目标:</strong> 给 harness 加 Permission 系统, 危险操作前问用户,
+      黑名单硬拒绝, workspace 外不允许写, plan/auto/default 三模式。
     </p>
-    <p>
-      <strong>模块:</strong> <code>src/permission.ts</code> (新) 暴露
-      <code>createPermissionManager()</code>;
-      <code>src/command-safety.ts</code> (新) 暴露
-      <code>detectDangerousCommand()</code>;
-      <code>src/agent.ts</code> 工具执行分支加 permission check;
-      <code>src/index.ts</code> 接线 permissionManager 和 askUserFn。
+<p>
+<strong>场景:</strong> 用户跑 plan 模式, agent 调 run_write 改文件,
+      REPL 弹出 "Allow run_write to README.md? (y/n)", 用户选 n,
+      history 写 "Permission denied by user"。
     </p>
-    <p><strong>边界 (LLM 必须遵守的 checklist):</strong></p>
-    <ul>
-      <li>permission 同步执行, 在 tool execute 之前</li>
-      <li>deny 必写 role: "tool" 消息, content 描述拒绝原因</li>
-      <li>ask 路径在子智能体降级为 deny, 不阻塞主 loop</li>
-      <li>denylist 优先于 allowlist</li>
-      <li>permission 不写在工具 execute 内部</li>
-    </ul>
-    <p>
-      <strong
-        >验证 (用 fake permission + spy registry, 逐条落到 vitest):</strong
-      >
+<p>
+<strong>模块:</strong> <code>src/permission.ts</code> (新) 暴露
+      <code>createPermissionManager()</code>; <code>src/command-safety.ts</code> (新)
+      硬黑名单; <code>src/agent.ts</code> (改) invoke 之前调
+      <code>permissionManager.check()</code>; <code>src/cli-commands.ts</code> (改)
+      注册 <code>/mode plan|auto|default</code>。
     </p>
-    <ul>
-      <li>
-        deny 时 spy.execute 没被调用, history 末尾有 tool message "Permission
-        denied"
-      </li>
-      <li>
-        子智能体调用 ask 路径, 写 tool message "Permission denied (no
-        confirmation)"
-      </li>
-      <li>工具同时在 allowlist 和 denylist, decision.action === "deny"</li>
-      <li>run_bash 参数 "rm -rf /tmp/foo", decision.action === "deny"</li>
-      <li>spy 验证 execute 在 permission.check 之后才调用 (同步约束)</li>
-    </ul>
-  </div>
+<p><strong>边界 (LLM 必须遵守的 checklist):</strong></p>
+<ul>
+<li>PermissionManager 在 ToolRegistry.invoke 之前拦截, 工具不感知权限</li>
+<li>黑名单和 ask 分开, 黑名单永远 deny, 不让用户选择</li>
+<li>workspace 外写操作硬拒绝, 跨模式都拒绝</li>
+<li>拒绝时写 "Permission denied" tool message, 不 throw</li>
+<li>ask 设 60 秒 timeout, 超时按 deny 处理</li>
+<li>plan/auto/default 三模式, 行为边界见决策表</li>
+</ul>
+<p><strong>验证 (用 fake terminal + vitest, 逐条断言):</strong></p>
+<ul>
+<li>plan 模式 run_write 触发 askUser, 用户回答决定 allow / deny</li>
+<li>auto 模式 run_bash("rm -rf /") 仍 deny, 不调 askUser</li>
+<li>run_write("/etc/passwd", ...) 任何模式都 deny, 错误含 "path outside workspace"</li>
+<li>用户在 ask 时选 n, history 末尾出现 "Permission denied by user" tool message</li>
+<li>ask 60 秒不回答, 按 deny 处理, 写 "Permission denied: timeout"</li>
+</ul>
+</div>
 </div>
-
 <h2 id="practice">本章练习</h2>
 <ol>
-  <li>
-    故意把 permission 写成"先 execute 再回滚", 跑测试, 看 "同步执行" 是否抓到。
+<li>
+    故意把权限检查写进 run_write 里, 跑测试, 看"PermissionManager
+    集中权限决策" 是否抓到 (重复代码 + 模式切换写不出)。
   </li>
-  <li>
-    在 deny 分支只 console.warn 不写 tool message, 跑测试, 看"deny 必写 tool
-    message" 是否抓到。
+<li>
+    让 <code>rm -rf /</code> 走 ask 而不是 deny, 跑测试, 看"黑名单跨模式
+    deny" 是否抓到。
   </li>
-  <li>
-    让 denylist 在 allowlist 之后检查, 跑测试, 看"denylist 优先" 是否抓到。
+<li>
+    不设 ask timeout, 跑测试, 验证 ask 卡死时 harness 也卡死 (说明
+    timeout 必要性)。
   </li>
 </ol>
-
 <h2 id="summary">本章小结</h2>
 <p>
-  本章给 harness 加了"安全阀门" —— Permission。在工具执行前同步拦截, 通过
-  allowlist / denylist / 危险命令检测三层策略, 防止 LLM 把危险
-  命令直接落到真实世界。deny 必写 tool message 让 LLM 知道拒绝原因, ask
-  路径在子智能体自动降级为 deny 不阻塞主 loop。下一章 (第 08 章) 我们会在
-  permission 周围加 Hook 机制, 把 permission 决策、工具执行 记录在 transcript
-  里, 用于审计和回放。
+  Permission 是给工具的"安全网"。 核心是 3 个设计:
 </p>
-
-<h2 id="next">下一章伏笔</h2>
+<ul>
+<li>
+<strong>3 种模式</strong>: plan 全 ask, default 危险 ask, auto 不 ask。
+    黑名单跨模式都 deny。
+    </li>
+<li>
+<strong>PermissionManager 集中</strong>: 工具不感知权限, 重复代码
+    消失, 测试简单。
+    </li>
+<li>
+<strong>拒绝写 tool message</strong>: 不 throw, LLM 收到 "Permission denied"
+    后自己调整。
+    </li></ul>
 <p>
-  第 07 章给 harness 加了"决策层" —— Permission。下一章 Hook 机制在 decision 和
-  execution 之间插入"事实记录"层: PreToolUse (执行前)、 PostToolUse
-  (执行后)、SessionStart (会话开始) 三个钩子, 让 harness 的关键事件有 transcript
-  兜底, 也为 eval 重放和审计留事实基础。
+  下一章 (第 08 章) 扩展 Permission 的思路 — Hook 系统, 让用户
+  在 loop 周围挂自定义逻辑 (PreToolUse / PostToolUse), 不只是
+  ask / deny。
 </p>
+</content>
\ No newline at end of file
diff --git a/tutorial/chapters/08-hook.html b/tutorial/chapters/08-hook.html
index 17ee814..a2471d5 100644
--- a/tutorial/chapters/08-hook.html
+++ b/tutorial/chapters/08-hook.html
@@ -1,554 +1,684 @@
-<p class="article__eyebrow">第 08 章 · 在 Loop 周围挂钩子</p>
-<h1 class="article__title">给 Harness 留事实基础: Hook</h1>
+<p class="article__eyebrow">第 08 章 · 给主循环挂扩展点</p>
+<h1 class="article__title">Hook: 在主循环周围挂自定义逻辑</h1>
 <p class="article__lede">
-  前面七章让 harness 能聊天、能调工具、能跑子任务、能压缩、能拦权限。 但 harness
-  的关键事件 (permission 决策、工具执行、状态变化) 现在还没有 统一记录。这一章在
-  loop 周围挂三个钩子: PreToolUse (执行前)、 PostToolUse (执行后)、SessionStart
-  (会话开始), 让 harness 的关键事件 有 transcript 兜底, 也为 eval
-  重放和审计留事实基础。
+  第 07 章的 Permission 是"工具执行前要不要问人" 的硬规则, 但用户经常想
+  挂更灵活的东西: 审计日志、敏感词提醒、自动追加项目规范、跑完测试后
+  通知 Slack。 Permission 装不下, 写死在 tool 里又不通用。 这一章
+  加 <strong>Hook 系统</strong>: 3 个事件点 (SessionStart / PreToolUse /
+  PostToolUse) × 3 个返回码 (continue / block / inject), 用工厂函数
+  + 闭包组合, 让扩展逻辑在 loop 周围插桩。 读完后, 你能讲清"Hook
+  和 Permission 的边界" (Permission 是安全边界, Hook 是扩展点), 并能
+  用 fake handler 验证"block 短路" 和 "inject 延迟注入" 的协议。
 </p>
-
-<nav id="article-inline-toc" class="article__meta" aria-label="页内小节"></nav>
-
-<hr class="rule" />
-
-<h2 id="delta-from-07">在第 07 章基础上改了什么</h2>
-<p>
-  这一章在 agent 主循环的关键节点插入 Hook 调用点。PreToolUse 在
-  permission.check 之后、tool.execute 之前触发, PostToolUse 在 tool.execute
-  之后、history.add(tool message) 之前触发, SessionStart 在第一次 run()
-  调用、history.add(user message) 之前触发。Hook 返回 exitCode: 0 (pass through)
-  / 1 (block) / 2 (inject message) 三种 状态, harness 根据 exitCode
-  决定后续行为。 对应到代码, 改动集中在 3 个文件:
-  <code>src/hooks.ts</code> (新)、 <code>src/agent.ts</code> (改关键节点插入
-  hookRunner.run())、 <code>src/index.ts</code> (改接线, 注入 hookRunner)。
-</p>
-<div class="source-links" aria-label="本章 GitHub 永久链接">
-  <a
-    class="source-link"
-    href="https://github.com/pingp76/swoopcode/blob/main/src/hooks.ts"
-    target="_blank"
-    rel="noreferrer"
-    >1. src/hooks.ts: 钩子运行器 (新)</a
-  >
-  <a
-    class="source-link"
-    href="https://github.com/pingp76/swoopcode/blob/main/src/agent.ts"
-    target="_blank"
-    rel="noreferrer"
-    >2. src/agent.ts: PreToolUse / PostToolUse / SessionStart 接入点</a
-  >
-  <a
-    class="source-link"
-    href="https://github.com/pingp76/swoopcode/blob/main/src/transcript.ts"
-    target="_blank"
-    rel="noreferrer"
-    >3. src/transcript.ts: Hook 决策记录 (第 15 章展开)</a
-  >
-</div>
-
-<h2 id="author-thinking">作者怎么想的: 这一章的思考链</h2>
-<dl class="defs">
-  <dt>想清楚现象</dt>
-  <dd>
-    harness 跑了一个月之后, 出了"为什么这个工具没执行" 的问题, 但 history
-    里只有"Permission denied" 这条 tool message, 没有"为什么 permission 拒绝"
-    的具体信息。现象是"决策过程不透明"。
-  </dd>
-  <dt>想反例</dt>
-  <dd>
-    最朴素的反例是"在 permission 内部 console.log"。这有两个问题: 一是日志进了
-    stdout, 但 transcript 没记录, eval 重放时无法还原; 二是 console.log
-    是副作用, 测试时无法断言"permission 拒绝时是否 输出了正确原因"。
-  </dd>
-  <dt>想接口和不变量</dt>
-  <dd>
-    接口:
-    <code>interface HookRunner { run(event): Promise&lt;HookResult&gt; }</code
-    >。 不变量三条: (1) Hook 是异步的, 但不能修改主消息流 (只能注入新消息 或返回
-    block), (2) exitCode 1 必须对应到具体副作用, 不能"无理由 block", (3) Hook
-    失败 (抛错) 必须降级为 pass through, 不能让 harness 因 Hook 崩溃。
-  </dd>
-  <dt>想怎么验证</dt>
-  <dd>
-    fake HookRunner 预设 PreToolUse 返回 exitCode 1, 跑完后 history 末尾有 tool
-    message "Blocked by PreToolUse hook: ..."; spy 验证 tool.execute 没被调用。
-  </dd>
-</dl>
-
-<h2 id="observe-first">先观察: 两段故意有气味的实现</h2>
-
-<div class="note">
-  <p class="note__title">观察 1 · Hook 抛错让 harness 崩</p>
-  <pre class="code-block"><code>// 教学简化版, 仓库真实实现见 GitHub 永久链接
-const result = await hookRunner.run(event);
-if (result.exitCode === 1) {
-  // 错误: 没写 tool message
-  return;  // 直接 return, tool 也不执行了
-}</code></pre>
-  <p><strong>问:</strong>为什么不写 tool message, 直接 return?</p>
-  <p>
-    <strong>答:</strong>三件事同时坏掉 —— 协议: tool_call 必须配对 tool_message,
-    缺一条 messages 序列就断裂; 真实性: LLM 不知道"为什么没执行", 反复重试;
-    体验: eval 重放时看不到 "Hook 在此刻 block 了什么"。
-  </p>
+<nav aria-label="页内小节" class="article__meta" id="article-inline-toc"></nav>
+<hr class="rule"/>
+<h2 id="real-failure">真实失败故事: 用户想"加一行提醒" 却改了 5 个 tool</h2>
+<p>
+  写代码之前, 先看一个真实痛点。 用户的 team 想给 harness 加一条团队
+  规范: "所有 bash 命令前检查是否有 <code>--no-verify</code>, 有就警告
+  LLM 改用 <code>--verify</code>"。 用 Permission 写, 只能 deny, 不能
+  "警告后继续"。
+</p>
+<ol>
+<li>
+<strong>写进 tool 里?</strong> <code>run_bash.ts</code> 加
+    <code>if (cmd.includes("--no-verify")) ...</code>。 5 个用户提了
+    5 个不同规则, tool 里堆满 if/else, 没人维护得了。
+  </li>
+<li>
+<strong>写进 Permission?</strong> Permission 的语义是 allow/ask/deny,
+    不能"警告后继续"。 强行加, Permission 就变成万能垃圾桶。
+  </li>
+<li>
+<strong>写进 agent loop?</strong> 改了 5 个 tool 不够, 还要改 agent
+    的主循环, 任何循环扩展都污染核心。
+  </li>
+</ol>
+<p>
+  朴素想法 1: "直接给 agent.ts 加个 switch 监听事件?" 错。 agent 是
+  通用循环, 业务规则千变万化, agent 不能认识所有规则。 主循环应该
+  只<strong>发事件</strong>, 不<strong>处理事件</strong>。
+</p>
+<p>
+  朴素想法 2: "用 Node EventEmitter 就行了?" 表面行, 但 LLM 的
+  protocol 决定了你不能在 <code>tool_call</code> 和 <code>tool_result</code>
+  之间塞消息 (OpenAI API 拒绝)。 需要 Hook 系统
+  理解<strong>延迟注入</strong>: 把"补充消息" 留到所有 tool_result
+  写完后, 统一追加成 user message。
+</p>
+<p>
+  正确做法: 加一个<strong>独立模块</strong> <code>src/hooks.ts</code>,
+  定义 3 个事件 + 3 个返回码, 主循环在固定时机 emit, handler 在外部
+  注册, 主循环根据返回码决定继续 / 阻止 / 注入。 Permission 仍然是
+  安全边界, Hook 是<strong>扩展点</strong>。 这是 Reference 章节
+  "模式 9 · Observer 观察者模式" 的具体应用。
+</p>
+<h2 id="three-events">三个事件点: SessionStart / PreToolUse / PostToolUse</h2>
+<p>
+  <strong>用途</strong>: 主循环在 3 个固定时机发事件, handler
+  在外部注册监听。 事件点必须<strong>少而精</strong>, 多一个事件点
+  就是多一份主循环耦合。
+</p>
+<p>
+  <strong>真实场景</strong>: 团队要"跑完测试后自动 commit", 用
+  PostToolUse 监听 <code>run_bash("npm test")</code> 返回 0, 触发
+  <code>git commit</code>; 用户想"工具调用前打印彩色日志",
+  用 PreToolUse 把 tool name + args 写到 stderr; 团队要"每次启动
+  注入 CLAUDE.md 内容", 用 SessionStart 在会话开始时一次性注入 (事件携带 query)。
+</p>
+<p>
+  <strong>设计思想</strong>: 经典<strong>观察者模式</strong> +
+  <strong>策略模式</strong> 组合 — 主循环是事件源, handler 是观察者,
+  返回码是策略。 关键是"事件点数量要少, 返回码语义要正交":
+</p>
+<div class="figure figure--stack">
+  <div class="figure__title">图 1 · 3 个事件点在主循环中的位置</div>
+  <div class="flow-stack">
+    <div class="flow-stack__layer flow-stack__layer--stable">
+      <div class="flow-stack__label">1 · SessionStart</div>
+      <div class="flow-stack__body">run() 第一次被调用时, 发一次, 携带 query 文本。 用于"启动时注入项目规范 / 加载 MEMORY"。</div>
+    </div>
+    <div class="flow-stack__layer flow-stack__layer--dynamic">
+      <div class="flow-stack__label">2 · PreToolUse</div>
+      <div class="flow-stack__body">权限检查通过后, 工具执行前, 每次 tool_call 都发。 携带 toolCallId / toolName / args / round。</div>
+    </div>
+    <div class="flow-stack__layer flow-stack__layer--stable">
+      <div class="flow-stack__label">3 · PostToolUse</div>
+      <div class="flow-stack__body">工具执行后, 每次都有。 携带 output (经 P1 即时压缩) + error 标记。 用于审计 / 自动 commit / Slack 通知。</div>
+    </div>
+  </div>
 </div>
-
-<div class="note">
-  <p class="note__title">观察 2 · Hook 抛错让主 loop 崩</p>
-  <pre class="code-block"><code>// 教学简化版
-try {
-  const result = await hookRunner.run(event);
-} catch (err) {
-  throw err;  // 错误: 让 hook 错误冒泡到 agent
-}</code></pre>
-  <p><strong>问:</strong>为什么不让 hook 错误冒泡?</p>
-  <p>
-    <strong>答:</strong>Hook 是用户扩展点, 第三方 Hook (例如审计 Hook)
-    不应该有能力让 harness 崩溃。Hook 抛错必须降级为 pass through, 写一条 error
-    tool message 告诉 LLM "Hook 失败, 已忽略"。
-  </p>
+<p>
+  <strong>实现细节</strong>: 事件用 TypeScript <strong>discriminated
+  union</strong> 定义, 编译期就能窄化 payload 类型。 看
+  <a href="https://github.com/pingp76/swoopcode/blob/main/src/hooks.ts#L76"><code>src/hooks.ts</code> 第 76 行的 <code>HookEvent</code> union</a>:
+</p>
+<pre><code class="language-typescript">export type HookEvent =
+  | { name: "SessionStart"; payload: { query: string } }
+  | {
+      name: "PreToolUse";
+      payload: {
+        toolCallId: string;
+        toolName: string;
+        args: Record&lt;string, unknown&gt;;
+        round: number;
+      };
+    }
+  | {
+      name: "PostToolUse";
+      payload: {
+        toolCallId: string;
+        toolName: string;
+        args: Record&lt;string, unknown&gt;;
+        round: number;
+        output: string;
+        error: boolean;
+      };
+    };</code></pre>
+<p>
+  为什么是 discriminated union 而不是 3 个独立 type? handler 注册
+  时只关心自己监听的事件 (如 PostToolUse), union 让
+  <code>event.name</code> 在 switch 里自动窄化到具体 payload, 不需要
+  cast, 编译期防错。
+</p>
+<p>
+  为什么 <code>args</code> 是 <code>Record&lt;string, unknown&gt;</code>?
+  工具 schema 各异, Hook 系统不认识所有 schema, 留 unknown 让 handler
+  自己 cast / 验证。
+</p>
+<h2 id="three-codes">三个返回码: 0 / 1 / 2 的语义正交</h2>
+<p>
+  <strong>用途</strong>: handler 返回一个退出码, 主循环根据退出码决定
+  后续动作。 返回码<strong>必须正交</strong> (互不依赖), 任何
+  组合都有明确语义。
+</p>
+<p>
+  <strong>真实场景</strong>: PreToolUse 监听 <code>run_bash</code>,
+  看到 <code>--no-verify</code> 就返回 1 (block, 阻止执行); 看到
+  <code>curl</code> 就返回 2 (inject, 追加警告消息到对话历史);
+  其他返回 0 (continue, 啥都不做)。
+</p>
+<p>
+  <strong>设计思想</strong>: 数字退出码借鉴 shell 传统 (exit 0/1/2),
+  0 = 成功, 1 = 失败, 2 = 特殊用法。 这里的语义是:
+</p>
+<div class="figure figure--compare">
+  <div class="figure__title">图 2 · 3 个返回码的语义对比</div>
+  <div class="flow-compare">
+    <div class="flow-compare__col flow-compare__col--bad">
+      <div class="flow-compare__head">exitCode 0 · continue</div>
+      <div class="flow-compare__body">啥都不做, 继续当前动作。 用于"我听了, 但没意见"。 大多数 handler 走这条。</div>
+    </div>
+    <div class="flow-compare__col flow-compare__col--warn">
+      <div class="flow-compare__head">exitCode 1 · block</div>
+      <div class="flow-compare__body">阻止当前动作。 PreToolUse 上阻止 tool 执行; PostToolUse 上仅记录警告, 不重做。 SessionStart 上几乎不用。</div>
+    </div>
+    <div class="flow-compare__col flow-compare__col--good">
+      <div class="flow-compare__head">exitCode 2 · inject</div>
+      <div class="flow-compare__body">注入一条补充消息后继续。 消息在所有 tool_result 写完后, 统一追加成 user 消息。 用于"提醒 / 警告 / 补充上下文"。</div>
+    </div>
+  </div>
 </div>
-
-<h2 id="interfaces">接口形状: 在写实现前钉死</h2>
-<pre class="code-block"><code>// 教学简化版, 仓库真实实现见 GitHub 永久链接
-export type HookEventName =
-  | "SessionStart" | "PreToolUse" | "PostToolUse" | "SubagentStart" | "SubagentEnd";
-
-export interface HookEvent {
-  name: HookEventName;
-  payload: Record&lt;string, unknown&gt;;  // 事件专属字段
-}
-
-export type HookExitCode = 0 | 1 | 2;
-// 0: pass through (继续执行)
-// 1: block (阻止后续动作, 但 tool_call 必须配对 tool_message)
-// 2: inject (允许后续动作, 追加一条 user 消息到 history)
+<p>
+  <strong>实现细节</strong>: HookResult 接口只有一个 <code>exitCode</code>
+  + 可选 <code>message</code>, 没有"action 字段" "reason 字段"。
+  看
+  <a href="https://github.com/pingp76/swoopcode/blob/main/src/hooks.ts#L47"><code>src/hooks.ts</code> 第 47 行的 <code>HookResult</code> 类型</a>:
+</p>
+<pre><code class="language-typescript">export type HookExitCode = 0 | 1 | 2;
 
 export interface HookResult {
+  /** 退出码: 0=继续, 1=阻止, 2=注入补充消息后继续 */
   exitCode: HookExitCode;
-  message?: string;  // exitCode 1 时为拒绝原因, exitCode 2 时为注入消息
-}
-
-export interface HookRunner {
-  run(event: HookEvent): Promise&lt;HookResult&gt;;
+  /** 补充说明文本, 给用户 / LLM / 日志看 */
+  message?: string;
 }</code></pre>
-
-<h2 id="loop-integration">loop 接入: 三个事件三个位置</h2>
-<pre class="code-block"><code>// 教学简化版, 仓库真实实现见 GitHub 永久链接
-// SessionStart: 第一次 run() 时触发
-if (!sessionStarted) {
-  sessionStarted = true;
-  const result = await safeRunHook(hookRunner, { name: "SessionStart", payload: { query } });
-  if (result.exitCode === 1) {
-    return result.message ?? "Session blocked by hook.";  // 整个会话被拒
-  }
-  // exitCode 2 注入的 message 在用户消息之后追加
-}
-
-// 在工具执行路径上
-for (const call of assistant.tool_calls) {
-  // 1. permission
-  const decision = permissionManager.check({ toolName: call.name, args: call.args });
-  if (decision.action === "deny") { /* ... */ continue; }
-  if (decision.action === "ask") { /* ... */ }
-
-  // 2. PreToolUse Hook (permission 之后, execute 之前)
-  const preResult = await safeRunHook(hookRunner, {
-    name: "PreToolUse",
-    payload: { toolName: call.name, args: call.args, round: loopRound },
-  });
-  if (preResult.exitCode === 1) {
-    history.add({ role: "tool", tool_call_id: call.id,
-      content: `Blocked by PreToolUse hook: ${preResult.message ?? ""}` });
-    continue;
-  }
-  // exitCode 2 注入的消息延迟到所有 tool_result 写完后再追加 (避免破坏协议)
-
-  // 3. execute
-  const result = await tool.execute(call.args);
-
-  // 4. PostToolUse Hook (execute 之后, tool message 写入之前)
-  const postResult = await safeRunHook(hookRunner, {
-    name: "PostToolUse",
-    payload: { toolName: call.name, args: call.args, output: result.content, error: result.error },
-  });
-  // PostToolUse exitCode 1 不能撤销已执行的副作用, 只能注入警告
-  if (postResult.exitCode === 1) {
-    pendingHookMessages.push(`[Hook: PostToolUse] block: ${postResult.message ?? ""}`);
-  }
-  if (postResult.exitCode === 2) {
-    pendingHookMessages.push(`[Hook: PostToolUse] ${postResult.message ?? ""}`);
+<p>
+  聚合规则: 多个 handler 注册同一事件时, 串行执行, 优先级
+  <strong>1 &gt; 2 &gt; 0</strong>。 遇到 block (1) 立即短路, 不执行
+  后续 handler; inject (2) 的 message 累积, 最后用空行拼接; continue
+  (0) 啥都不做。
+</p>
+<p>
+  handler 抛异常时<strong>只记录 warn 日志, 不中断主流程</strong>。 这是
+  Reference 章节 "模式 21 · No Catch Throw" 的体现 — Hook 是扩展机制
+  不是安全机制, 一个扩展挂掉不能让整个 agent 死掉。
+</p>
+<h2 id="aggregation">多 handler 聚合: 串行 + 短路 + 累积</h2>
+<p>
+  <strong>用途</strong>: 多个 handler 可以注册同一事件, 团队 / 用户
+  / 内置审计可以同时挂。 聚合规则必须<strong>明确</strong>, 不能
+  含糊 (不能说"按注册顺序决定一切")。
+</p>
+<p>
+  <strong>真实场景</strong>: 一个团队想挂 3 个 handler, 按 (a) → (c) → (b) 的顺序注册 — (a) 内置审计
+  记录到 <code>audit.log</code>; (b) 团队规范检查 <code>--no-verify</code>;
+  (c) 安全扫描器检测 <code>rm -rf</code>。 handler (c) 返回 1 应该
+  短路, 排在它后面的 (b) 不再执行; (c) 放行时, (b) 返回 2 累积 message;
+  (a) 排在最前, 审计写入不受后两者影响。
+</p>
+<p>
+  <strong>设计思想</strong>: 串行而非并行 — 简单、可预测、易调试;
+  block 短路而非 "block 排队" — 第一个 block 命中就够了, 后续没必要
+  执行; inject 累积而非 "最后一个赢" — 多条提醒都有意义。 看
+  <a href="https://github.com/pingp76/swoopcode/blob/main/src/hooks.ts#L172"><code>src/hooks.ts</code> 第 172 行的 <code>run()</code> 实现</a>:
+</p>
+<pre><code class="language-typescript">async function run(event: HookEvent): Promise&lt;HookResult&gt; {
+  const list = handlers[event.name] ?? [];
+  const injected: string[] = [];
+
+  for (const handler of list) {
+    try {
+      const result = await handler(event);
+      if (result.exitCode === 1) return result;          // 短路
+      if (result.exitCode === 2 &amp;&amp; result.message) {
+        injected.push(result.message);                   // 累积
+      }
+    } catch (error) {
+      logger.warn("Hook %s failed: %s", event.name, ...);
+    }
   }
-
-  history.add({ role: "tool", tool_call_id: call.id, content: result.content });
-}
-
-// 5. 所有 tool_result 写完后, 统一追加 Hook 注入的 user 消息
-for (const msg of pendingHookMessages) {
-  history.add({ role: "user", content: msg }, timing);
-}</code></pre>
-
-<h2 id="error-degradation">错误降级: hook 抛错不能崩 harness</h2>
-<pre class="code-block"><code>// 教学简化版, 仓库真实实现见 GitHub 永久链接
-async function safeRunHook(hookRunner: HookRunner, event: HookEvent): Promise&lt;HookResult&gt; {
-  try {
-    return await hookRunner.run(event);
-  } catch (err) {
-    logger.warn("Hook %s failed: %s", event.name, err);
-    return { exitCode: 0 };  // 降级为 pass through
+  if (injected.length &gt; 0) {
+    return { exitCode: 2, message: injected.join("\n\n") };
   }
+  return { exitCode: 0 };
 }</code></pre>
 <p>
-  第三方 Hook 不应该有能力让 harness 崩溃。Hook 抛错时, harness 降级 为 pass
-  through, 写一条 warning 到 logger, 主 loop 继续。
+  <strong>实现细节</strong>: handler 数组存在闭包里, 用
+  <code>Partial&lt;Record&lt;HookEventName, HookHandler[]&gt;&gt;</code>
+  类型保证"只注册需要的事件, 不需要的事件不必写空数组"。 这也是
+  Reference 章节 "模式 1 · 工厂 + 闭包" 的应用 — 长期状态 (handlers) 存在闭包内,
+  injected 只是每次 <code>run()</code> 调用的局部变量, 外部只能通过 <code>run()</code> 接口访问。
 </p>
-
-<h2 id="trap">反例梯度</h2>
-
-<div class="card card--trap">
-  <div class="card__head">
-    <span class="card__tag">新手错法 · A</span>
-  </div>
-  <div class="card__body">
-    <p><strong>常见错误:</strong>PreToolUse block 时不写 tool message。</p>
-    <p>
-      <strong>为什么错:</strong>破坏第 02 章 tool_call 配对协议, messages
-      序列断裂。
-    </p>
-    <p>
-      <strong>正确做法:</strong>block 必写 role: "tool" 消息 "Blocked by
-      PreToolUse hook: ..."。
-    </p>
-  </div>
-</div>
-
-<div class="card card--trap">
-  <div class="card__head">
-    <span class="card__tag">中级错法 · B</span>
-  </div>
-  <div class="card__body">
-    <p><strong>常见错误:</strong>Hook 抛错时让主 loop 崩。</p>
-    <p>
-      <strong>为什么错:</strong>第三方 Hook 不应该有能力让 harness 崩溃, 主 loop
-      健壮性优先。
-    </p>
-    <p>
-      <strong>正确做法:</strong>safeRunHook() 包 try/catch, 抛错降级为 exitCode
-      0。
-    </p>
-  </div>
-</div>
-
-<div class="card card--trap">
-  <div class="card__head">
-    <span class="card__tag">高级错法 · C</span>
-  </div>
-  <div class="card__body">
-    <p>
-      <strong>常见错误:</strong>Hook 注入的 user 消息插在 tool_result 之间。
-    </p>
-    <p>
-      <strong>为什么错:</strong>多工具并行调用时, 插在 tool_result 之间会破坏
-      LLM 协议的 tool_call/tool_result 配对规则。
-    </p>
-    <p>
-      <strong>正确做法:</strong>Hook 注入消息延迟到所有 tool_result 写完之后,
-      统一追加。
-    </p>
+<h2 id="delayed-inject">延迟注入: 为什么不能在 tool_call 后立刻写 user message</h2>
+<p>
+  <strong>用途</strong>: exitCode 2 的 message 必须<strong>延迟</strong>到
+  所有 tool_result 写完后才能追加, 不然会破坏 OpenAI API 消息格式。
+</p>
+<p>
+  <strong>真实场景</strong>: 假设 LLM 一次发了 3 个 tool_call
+  (run_bash + run_read + run_write), PostToolUse 在每个执行完都触发,
+  handler 返回 2 要注入警告。 如果在第一个 tool_result 后立刻追加
+  user message, OpenAI API 会拒绝: 它要求 assistant message 的所有
+  tool_call 必须有对应的 tool_result, 中间不能有 user message。
+</p>
+<p>
+  <strong>设计思想</strong>: Hook 系统<strong>不负责写入</strong>,
+  它只返回"应该注入什么"; 主循环负责"什么时候写"。 这就是 Reference
+  章节 "模式 4 · 依赖注入" — HookRunner 是被注入的策略, 主循环是
+  调用方, 双方协议清晰。
+</p>
+<div class="figure figure--stack">
+  <div class="figure__title">图 3 · exitCode 2 的延迟注入时机</div>
+  <div class="flow-stack">
+    <div class="flow-stack__layer flow-stack__layer--stable">
+      <div class="flow-stack__label">❌ 错误时机: tool_result 之后立刻追加</div>
+      <div class="flow-stack__body">assistant(tool_call A,B,C) → tool_result A → <strong>user(警告消息)</strong> → tool_result B → ... OpenAI 拒绝, 400 error。</div>
+    </div>
+    <div class="flow-stack__layer flow-stack__layer--dynamic">
+      <div class="flow-stack__label">✅ 正确时机: 所有 tool_result 写完后追加</div>
+      <div class="flow-stack__body">assistant(tool_call A,B,C) → tool_result A → tool_result B → tool_result C → <strong>user(警告消息)</strong> → 下一轮 assistant。 协议合法。</div>
+    </div>
   </div>
 </div>
-
-<div class="card card--trap">
-  <div class="card__head">
-    <span class="card__tag">边界错法 · D</span>
+<p>
+  <strong>实现细节</strong>: 这条约束写在
+  <a href="https://github.com/pingp76/swoopcode/blob/main/src/hooks.ts#L28"><code>src/hooks.ts</code> 第 28 行的"延迟注入" 注释</a>
+  里, 位于 "本阶段限制" 段之后的 "关键约束" 部分 — 即使不写测试, 也会被
+  后续读者注意到。 agent 主循环在 collect tool result 阶段<strong>缓存</strong>
+  所有 inject 消息, 写完最后一个 tool_result 后再批量追加为 user
+  message。
+</p>
+<h2 id="noop">createNoopHookRunner: 让 agent 主循环保持直线阅读</h2>
+<p>
+  <strong>用途</strong>: 默认没有 hook 时, agent 主循环里每个
+  <code>if (hookRunner)</code> 都会打断阅读。 用 <code>createNoopHookRunner()</code>
+  做一个"啥都不做" 的默认实现, 让主循环保持直线流。
+</p>
+<p>
+  <strong>真实场景</strong>: 用户没注册任何 hook, 跑最简单的 harness:
+  "读 1 个文件, 改 1 个文件, 结束"。 主循环 6 步里每步都有
+  <code>hookRunner.run(...)</code>, 如果 hookRunner 是 undefined 就
+  要写 6 个 if, 阅读起来像刺猬。
+</p>
+<p>
+  <strong>设计思想</strong>: <strong>空对象模式 (Null Object)</strong> —
+  不是简单 <code>hookRunner?.run()</code>, 而是给一个永远返回
+  <code>{ exitCode: 0 }</code> 的对象, 协议完整但行为无副作用。 看
+  <a href="https://github.com/pingp76/swoopcode/blob/main/src/hooks.ts#L220"><code>src/hooks.ts</code> 第 220 行的 <code>createNoopHookRunner()</code></a>:
+</p>
+<pre><code class="language-typescript">export function createNoopHookRunner(): HookRunner {
+  return {
+    async run() {
+      return { exitCode: 0 };
+    },
+  };
+}</code></pre>
+<p>
+  <strong>实现细节</strong>: 在 <code>index.ts</code> 的 Composition Root
+  里, 如果用户没传 <code>hookRunner</code> 就 <code>createNoopHookRunner()</code>
+  兜底, 保证 agent.ts 里 <code>await hookRunner.run(event)</code> 永远
+  不需要判空。 这是 Reference 章节 "模式 1 · 工厂 + 闭包" 的应用 —
+  闭包返回一个稳定的对象, 行为可预测。
+</p>
+<h2 id="register">handler 注册: Partial&lt;Record&gt; 表达"不监听就是空数组"</h2>
+<p>
+  <strong>用途</strong>: 创建 HookRunner 时传入 handlers 配置, 同一
+  事件可注册多个 handler。 没注册的事件 = 空数组 = 啥也不做。
+</p>
+<p>
+  <strong>真实场景</strong>: <code>index.ts</code> 里
+  <code>createHookRunner({ PreToolUse: [auditHandler, noVerifyHandler], PostToolUse: [slackHandler] }, logger)</code>
+  — 只注册了 2 个事件, SessionStart 默认空数组。
+</p>
+<p>
+  <strong>设计思想</strong>: 用 <code>Partial&lt;Record&lt;K, V&gt;&gt;</code>
+  类型让"只填需要的事件" 成为类型安全的事实 — 漏填的事件是 undefined,
+  编译期允许, 运行期用 <code>?? []</code> 兜底。 看
+  <a href="https://github.com/pingp76/swoopcode/blob/main/src/hooks.ts#L163"><code>src/hooks.ts</code> 第 163 行的 <code>createHookRunner</code> 工厂</a>:
+</p>
+<pre><code class="language-typescript">export function createHookRunner(
+  handlers: Partial&lt;Record&lt;HookEventName, HookHandler[]&gt;&gt;,
+  logger: Logger,
+): HookRunner {
+  // handlers[event.name] ?? [] 兜底, Partial 允许 undefined
+}</code></pre>
+<p>
+  <strong>实现细节</strong>: 工厂函数返回的对象只有一个 <code>run</code>
+  方法, 没有 <code>addHandler</code> / <code>removeHandler</code> —
+  这是<strong>故意</strong>的简化: handler 在 Composition Root 一次性
+  注入, 运行期不能动态加 (避免多 handler 顺序的歧义)。 这是 Reference
+  章节 "模式 3 · 窄接口" 的应用 — 接口只暴露 <code>run</code>, 不暴露
+  handlers 数组, 闭包状态不外泄。
+</p>
+<h2 id="loop-integration">主循环集成: 在哪 3 个时机 emit</h2>
+<p>
+  <strong>用途</strong>: Hook 是扩展点, 主循环<strong>必须</strong>
+  按固定时序 emit, 时序错了协议就错。 3 个 emit 时机在 agent.ts 里
+  是:
+</p>
+<ol>
+<li>
+<strong>SessionStart</strong>: <code>while</code> 循环开始前, 把 query
+    包装成事件 <code>{ name: "SessionStart", payload: { query } }</code>
+    发一次。 handler 返回 2 的 message 注入到 system 消息后, 第一轮
+    assistant 调用之前。
+  </li>
+<li>
+<strong>PreToolUse</strong>: 权限检查通过后, 调
+    <code>toolRegistry.invoke(...)</code> 之前。 handler 返回 1 阻止
+    invoke, 写 "Permission denied by hook" tool message; 返回 2 累积
+    message, 在所有 tool_result 后注入。
+  </li>
+<li>
+<strong>PostToolUse</strong>: <code>invoke</code> 完成后, 把
+    <code>ToolResult.output</code> 喂给 handler。 handler 返回 1
+    (block) 在 PostToolUse 上<strong>不重做</strong>, 只写警告日志
+    (因为 tool 已经跑完, block 没意义); 返回 2 累积 message。
+  </li>
+</ol>
+<p>
+  <strong>设计思想</strong>: 集成点必须<strong>集中</strong>, 不能散
+  在 agent.ts 各个角落。 一般 6 步 loop 里 SessionStart 在 step 0
+  (load history 后), PreToolUse 在 step 3 (call tools 前), PostToolUse
+  在 step 3.5 (call tools 后, before step 4 compress)。
+</p>
+<p>
+  <strong>实现细节</strong>: 集成代码写在 agent.ts 里, 调用方式统一是
+  <code>const hookResult = await hookRunner.run(event); if (hookResult.exitCode === 1) ...; if (hookResult.message) pendingInjects.push(hookResult.message);</code>。
+  缓存的 inject 消息在 collect tool results 阶段之后批量追加, 这就是
+  第 02 章 "message 配对" 协议的具体应用。
+</p>
+<h2 id="cache-implication">对 prompt cache 的影响: SessionStart 注入是 stable prefix 一部分</h2>
+<p>
+  <strong>用途</strong>: SessionStart 的 inject 消息会进入 system 消息,
+  而 system 消息是 stable prefix 的<strong>核心</strong>。 每次启动注入
+  不同内容, 整个对话的 cache 命中会崩。
+</p>
+<p>
+  <strong>真实场景</strong>: 团队用 SessionStart 注入"项目规范", 但
+  规范 (每行一条 prompt) 只要改了 10 个字符, 后面 100 轮对话 cache 全部
+  miss, 成本 ×10。 应该把 SessionStart 的注入做成<strong>幂等</strong>:
+  启动时一次写, 后续 100 轮都引用, 不再追加。
+</p>
+<p>
+  <strong>设计思想</strong>: <strong>幂等性约束</strong> — SessionStart
+  注入必须保证"同一 session 内, 相同 query + 相同 handler 集合"
+  产出相同 message。 handler 应该读配置文件 / MEMORY, 不读外部随机
+  数据 (时间戳 / 随机数)。
+</p>
+<p>
+  <strong>实现细节</strong>: 这是和第 05 章 (Skill 稳定前缀) +
+  第 09 章 (Memory 4 类 tags) 联合的 cache 策略。 SessionStart 的
+  inject 应该在 handler 内部做"内容去重" — 同一 handler 在同一 session
+  只 emit 一次。 这部分实现在第 10 章 (Cache) 详细展开。
+</p>
+<h2 id="fake-test">fake test: 用 stub handler 验证 block 短路和 inject 累积</h2>
+<p>
+  <strong>用途</strong>: Hook 系统的测试不需要真 hook, 用 stub 即可 —
+  测的是"主循环按返回码做了什么", 不是 handler 内部逻辑。
+</p>
+<p>
+  <strong>真实场景</strong>: 用户写一个"检测 rm -rf" 的 handler, 想
+  验证: (a) <code>rm -rf /</code> 应该 block; (b) <code>rm file.txt</code>
+  应该 continue; (c) block 时主循环写 "Permission denied by hook" tool
+  message, 不真的跑 rm。 stub handler 返回固定结果就能测。
+</p>
+<p>
+  <strong>设计思想</strong>: 测<strong>协议</strong>不测实现 — Hook 系统
+  的协议是"返回码 → 主循环行为", 这层用 stub 测最稳。 handler 内部
+  业务逻辑 (rm 检测算法) 用单元测试单独测, 不混在 HookRunner 测试里。
+</p>
+<pre><code class="language-typescript">test("block exitCode 短路后续 handler", async () =&gt; {
+  const calls: string[] = [];
+  const runner = createHookRunner({
+    PreToolUse: [
+      () =&gt; { calls.push("first"); return { exitCode: 0 }; },
+      () =&gt; { calls.push("second"); return { exitCode: 1, message: "blocked" }; },
+      () =&gt; { calls.push("third"); return { exitCode: 2, message: "warn" }; },
+    ],
+  }, noopLogger);
+  const result = await runner.run({ name: "PreToolUse", payload: {...} });
+  expect(calls).toEqual(["first", "second"]);   // third 没跑
+  expect(result).toEqual({ exitCode: 1, message: "blocked" });
+});
+
+test("inject 累积: 多个 handler 的 message 用空行拼接", async () =&gt; {
+  const runner = createHookRunner({
+    PostToolUse: [
+      () =&gt; ({ exitCode: 2, message: "audit: 工具跑完了" }),
+      () =&gt; ({ exitCode: 2, message: "warn: 命令有点危险" }),
+    ],
+  }, noopLogger);
+  const result = await runner.run({ name: "PostToolUse", payload: {...} });
+  expect(result.exitCode).toBe(2);
+  expect(result.message).toBe("audit: 工具跑完了\n\nwarn: 命令有点危险");
+});
+
+test("handler 抛异常只 warn, 不中断", async () =&gt; {
+  const runner = createHookRunner({
+    PreToolUse: [
+      () =&gt; { throw new Error("boom"); },
+      () =&gt; ({ exitCode: 2, message: "next" }),
+    ],
+  }, noopLogger);
+  const result = await runner.run({ name: "PreToolUse", payload: {...} });
+  expect(result.message).toBe("next");   // 异常被吞, 后续继续
+});</code></pre>
+<p>
+  <strong>实现细节</strong>: 这 3 个 stub 测试覆盖了 Hook 系统 90% 的
+  协议 — 短路、累积、异常隔离。 handler 自身的业务逻辑测试应该放在
+  <code>handler-name.test.ts</code>, 不在 <code>hooks.test.ts</code>。
+</p>
+<h2 id="common-confusion">常见误解: Hook 不是 Permission 替代品</h2>
+<p>
+  <strong>误解 1: "Hook 能取代 Permission?"</strong> 错。 Permission 是
+  <strong>安全边界</strong> — 黑名单命令、workspace 外路径、ask 用户,
+  这些必须 hard 拒绝, 不让 LLM 重试。 Hook 是<strong>扩展点</strong>
+  — 审计、提醒、注入上下文, 不替代安全。 两者职责正交。
+</p>
+<p>
+  <strong>误解 2: "exitCode 2 = block 的一种?"</strong> 错。 2 是
+  <strong>inject 后继续</strong>, 不是 block: 想阻止却返回 2, 工具照样会跑;
+  想提示却返回 1, 工具不会跑, LLM 也收不到 message, 只会以为是协议错误。
+</p>
+<p>
+  <strong>误解 3: "PostToolUse 上 block 能重做工具?"</strong> 错。
+  PostToolUse 触发时工具<strong>已经跑完</strong>, block 没有意义。
+  block 在 PostToolUse 上<strong>只记 warn 日志</strong>, 不重做。
+  想要"重做" 用上层 retry 机制, 不是 Hook。
+</p>
+<p>
+  <strong>误解 4: "SessionStart 能 emit 多次?"</strong> 不行。
+  SessionStart 语义是"每个 agent 实例第一次 run() 时", emit 多次
+  会污染 cache。 主循环只在 <code>while</code> 之前发一次, 不在
+  循环内重发。
+</p>
+<h2 id="trap">反例梯度: 4 个常见错误</h2>
+<div class="cards-grid">
+  <div class="card card--bad">
+    <div class="card__head"><span class="card__tag">反例 1 · 过度耦合</span></div>
+    <div class="card__body">
+      <p>把 handler 业务逻辑写进 <code>agent.ts</code> 主循环。 改一次
+        规则要重读 200 行 agent 代码。 错。 Handler 应该注册到
+        HookRunner, 主循环不出现业务关键字。 <code>agent.ts</code> 里
+        只调 <code>hookRunner.run(event)</code>, 不 import 任何 handler
+        实现。</p>
+    </div>
   </div>
-  <div class="card__body">
-    <p>
-      <strong>常见错误:</strong>PostToolUse exitCode 1 试图撤销已执行的副作用。
-    </p>
-    <p>
-      <strong>为什么错:</strong>工具已经执行 (例如文件已写、命令已跑), 撤销 API
-      不存在。PostToolUse block 只能注入警告, 不能"假装撤销"。
-    </p>
-    <p>
-      <strong>正确做法:</strong>PostToolUse exitCode 1 注入 "[Hook: PostToolUse]
-      block: ..." 警告, 不假装撤销。
-    </p>
+  <div class="card card--bad">
+    <div class="card__head"><span class="card__tag">反例 2 · 该返回 2 却返回 1</span></div>
+    <div class="card__body">
+      <p>用户写 handler 检测 <code>--no-verify</code>, 想"阻止 + 给 LLM
+        提示改用 <code>--verify</code>", 返回 <code>{ exitCode: 1, message: "请用 --verify" }</code>。 错。 1 是 block, message
+        不会注入, LLM 收到 "blocked" 啥也不知道。 这种情况要返回
+        <code>{ exitCode: 2, message: "..." }</code> (注入) <strong>或</strong>
+        1 (硬阻止) + 主循环负责把 message 写到 tool result。</p>
+    </div>
   </div>
-</div>
-
-<h2 id="validate">如何验证 (本章 Validation 卡片)</h2>
-<div class="card card--validation">
-  <div class="card__head">
-    <span class="card__tag">Validation · 第 08 章</span>
+  <div class="card card--bad">
+    <div class="card__head"><span class="card__tag">反例 3 · 忽略延迟注入</span></div>
+    <div class="card__body">
+      <p>主循环在 tool_result 写完后, 立刻把 inject 消息追加到 history
+        数组。 表面看"消息进 history 了", 但如果 LLM 一轮发了 3 个
+        tool_call, 第一个 tool_result 后立刻追加 user message, OpenAI
+        API 拒绝 400。 正确做法: <strong>缓存所有 inject 消息</strong>,
+        在本轮所有 tool_result 写完后<strong>统一追加</strong>。</p>
+    </div>
   </div>
-  <div class="card__body">
-    <p>
-      <strong>PreToolUse block 写 tool message:</strong>fake HookRunner
-      PreToolUse 返回 exitCode 1, 跑完后 spy 验证 tool.execute 没被 调用,
-      history 末尾有 tool message "Blocked by PreToolUse hook"。
-    </p>
-    <p>
-      <strong>Hook 抛错降级为 pass through:</strong>fake HookRunner.run 抛错,
-      跑完后主 loop 继续, tool.execute 被调用, logger.warn 有 "Hook PreToolUse
-      failed"。
-    </p>
-    <p>
-      <strong>Hook 注入消息延迟追加:</strong>fake LLM 返回 2 个 tool_calls, fake
-      HookRunner PreToolUse 返回 exitCode 2 with message, 跑完后 history 中 2 条
-      tool_result 之后才出现 1 条 user message "[Hook: PreToolUse] ..."。
-    </p>
-    <p>
-      <strong>SessionStart block 终止 run:</strong>fake HookRunner SessionStart
-      返回 exitCode 1, agent.run() 返回 "Session blocked by hook.", history 长度
-      = 0 (用户消息未写入)。
-    </p>
-    <p>
-      <strong>PostToolUse block 不撤销:</strong>fake HookRunner PostToolUse 返回
-      exitCode 1, 跑完后 history 末尾有 tool result (execute 已完成) + 1 条 user
-      message "[Hook: PostToolUse] block: ..."。
-    </p>
+  <div class="card card--bad">
+    <div class="card__head"><span class="card__tag">反例 4 · handler 抛异常拖死 agent</span></div>
+    <div class="card__body">
+      <p>handler 调外部 API 写审计日志, API 挂掉, throw。 主循环
+        <code>await hookRunner.run(event)</code> 直接抛, agent 死。
+        错。 Hook 是扩展, 不是核心。 hookRunner.run() 必须 catch 所有
+        异常, 只 warn 日志, 然后继续。 一个审计 handler 挂掉不能
+        让 agent 不能用。</p>
+    </div>
   </div>
 </div>
-
-<h2 id="lookback">回望第 00–07 章: 哪些原则在本章兑现了</h2>
+<h2 id="validate">Validation: 4 条不变量检验</h2>
+<ol>
+<li>
+<strong>协议对称</strong>: handler 返回什么 exitCode, 主循环
+    行为必须和 exitCode 语义一致 — 0 啥也不做, 1 阻止 (PreToolUse
+    不调 invoke, PostToolUse 只 warn), 2 累积 message 在本轮 tool
+    结束后注入。 验证: 用 stub handler 返回 0/1/2, 跑主循环 1 轮,
+    看 history 末尾的 tool message 和 user message 数量是否符合预期。
+  </li>
+<li>
+<strong>延迟注入</strong>: 同一轮多个 tool_call 时, inject 消息
+    必须出现在<strong>所有 tool_result 之后</strong>。 验证: 写测试
+    让 LLM 调 3 个 tool (run_bash / run_read / run_write), 第一个
+    PostToolUse 就返回 2, 看 history 数组里 inject 消息的 index 是
+    不是在所有 tool_result 之后。
+  </li>
+<li>
+<strong>闭包隔离</strong>: 多个 agent 实例 (父子 subagent) 用
+    各自的 HookRunner, 不共享 handler 数组。 验证: 父 agent
+    handler 调一次, 子 agent handler 不应该被调用; 子 agent 的
+    handler 调一次, 父 agent 不应该被调用。
+  </li>
+<li>
+<strong>空对象兼容</strong>: <code>createNoopHookRunner()</code> 和
+    真实 <code>createHookRunner(...)</code> 接口完全一致, agent.ts
+    同一份代码, 不写 <code>if (hookRunner)</code>。 验证: TypeScript
+    编译通过, 跑 100 轮对话, 行为和挂空 hook 时一致 (只是少了
+    inject message)。
+  </li>
+</ol>
+<h2 id="lookback">回望: 哪些原则在本章兑现了</h2>
 <ul>
-  <li>
-    <strong>tool_call 协议沿用:</strong>PreToolUse block 必写 tool message, Hook
-    注入消息延迟追加, 不破坏第 02 章的协议约束。
+<li>
+<strong>关注点分离</strong>: Hook 独立于 Permission, 独立于
+    ToolRegistry。 主循环只发事件, 不处理业务规则。
   </li>
-  <li>
-    <strong>Composition Root 唯一接线:</strong>hookRunner 在
-    <code>index.ts</code> 创建, 注入 agent, 不在 agent.ts 内 new。
+<li>
+<strong>工厂 + 闭包</strong>: <code>createHookRunner</code> 把
+    handlers 数组 + 累积 injected 数组藏在闭包, 外部只能调
+    <code>run</code>。
   </li>
-  <li>
-    <strong>错误降级原则:</strong>Hook 抛错不崩主 loop, 这条原则在第 11 章
-    recovery 会进一步展开 (LLM 抛错也降级)。
+<li>
+<strong>空对象模式</strong>: <code>createNoopHookRunner</code> 让
+    agent 主循环保持直线流, 不需要判空。
   </li>
-  <li>
-    <strong>事实与视图分离:</strong>Hook 决策不进 history, 进 transcript (第 15
-    章展开), history 仍然保持"对话历史" 的单一职责。
+<li>
+<strong>协议对称</strong>: 3 个事件 × 3 个返回码, 每个组合的
+    主循环行为都有明确定义, 单元测试覆盖。
+  </li>
+<li>
+<strong>延迟注入</strong>: inject 消息不能在 tool_call /
+    tool_result 中间插入, 必须缓存到本轮结束后追加, 这是 LLM
+    协议层的硬约束。
   </li>
 </ul>
-
 <h2 id="forward">前瞻张力: 留给后续章节</h2>
 <dl class="defs">
-  <dt>Hook 决策落盘</dt>
-  <dd>
-    第 15 章 transcript 会把 Hook 决策 (exitCode, message, timestamp) 落盘,
-    用于审计和 eval 重放。
-  </dd>
-  <dt>Hook 阻塞太久</dt>
-  <dd>
-    第 11 章 recovery 会处理"Hook 阻塞超过 30s" 的超时降级, 不让主 loop 永远卡在
-    Hook 上。
+<dt>Handler 业务逻辑</dt>
+<dd>
+    本章的 handler 例子都是内联 stub。 实战中 handler 要读
+    配置文件 / 调外部 API, 这部分属于"配置层" 而不是 HookRunner
+    本身。 第 09 章 Memory 展开"持久化" 机制, SessionStart handler
+    可以从 MEMORY.md 加载团队规范。
   </dd>
-  <dt>SessionStart 拒掉的会话</dt>
-  <dd>
-    第 09 章 memory 区分"被 SessionStart 拒掉的会话" (不进 memory)
-    和"被正常完成的会话" (写 memory)。
+<dt>Handler 调试可观测性</dt>
+<dd>
+    Handler 抛异常只 warn, 用户怎么知道 handler 挂了? 需要
+    单独的"hook 日志" 流, 不是 agent 主对话。 第 11 章 Recovery
+    展开"异常分类" 时会带上 hook 异常的处理。
   </dd>
-  <dt>Hook 跨项目保留</dt>
-  <dd>
-    Hook 配置通常在 <code>.claude/hooks/</code> 项目级目录, 与第 09 章 memory
-    的项目级 / 用户级分类一致。
+<dt>动态注册 / 注销</dt>
+<dd>
+    当前 handler 在 Composition Root 一次性注册, 运行期不能动态
+    加。 实战中用户可能想"跑测试时挂一个 handler, 跑完摘掉",
+    动态注册需要"撤销语义" — 哪些 handler 算"同一组", 撤销时
+    一组全摘。 这是 P2 阶段的功能, 本阶段不实现。
   </dd>
 </dl>
-
-<h2 id="vibe-coding-08">本次如何 vibe code: 第 08 章的三件套</h2>
-
-<h3 id="vibe-feed-08">拆卡: 4 轮迭代的具体产物</h3>
-<ol>
-  <li>
-    <strong>第 1 轮 · 接口</strong>。让 LLM 给出 <code>HookEvent</code> /
-    <code>HookResult</code> / <code>HookRunner</code> 三个 interface, 以及
-    exitCode 0/1/2 三种语义的明确文档。本轮不写实现, 重点钉"block 必写 tool
-    message" 和"错误降级" 两条约束。
-  </li>
-  <li>
-    <strong>第 2 轮 · 接线</strong>。让 LLM 给出 <code>index.ts</code> 接线,
-    <code>createNoopHookRunner()</code> 返回 pass-through runner (永远 exitCode
-    0), agent.run 接入三个事件调用点但 runner 是 noop。本轮 review 重点:
-    hookRunner 与 permissionManager 是同级依赖。
-  </li>
-  <li>
-    <strong>第 3 轮 · 边界</strong>。让 LLM 写 safeRunHook + PreToolUse /
-    PostToolUse / SessionStart 接入 + 注入消息延迟追加。本轮 review 重点:
-    三个事件都在正确位置, safeRunHook 包 try/catch。
-  </li>
-  <li>
-    <strong>第 4 轮 · 验证</strong>。让 LLM 写
-    <code>test/hook.test.ts</code>。本轮 review 重点: "PreToolUse block 写 tool
-    message" 和 "Hook 抛错降级" 两条必须有 spy 验证。
-  </li>
-</ol>
-
-<h3 id="vibe-review-08">Review: 第 08 章专属 checklist</h3>
-<ol>
-  <li>
-    <strong>block 必写 tool message。</strong>PreToolUse exitCode 1 路径上必有
-    <code>history.add({role: "tool", ...})</code>。验证:
-    <code>grep -n 'Blocked by PreToolUse' src/agent.ts</code> ≥ 1 行。
-  </li>
-  <li>
-    <strong>Hook 抛错降级。</strong>safeRunHook 必有 try/catch。验证:
-    <code>grep -n 'catch' src/hooks.ts</code> ≥ 1 行。
-  </li>
-  <li>
-    <strong>Hook 注入消息延迟追加。</strong>exitCode 2 注入的 message 不直接写
-    history, 而是 push 到 pendingHookMessages 数组, 在所有 tool_result
-    写完之后统一追加。验证: agent.ts 末尾有 "for (const msg of
-    pendingHookMessages)" 循环。
-  </li>
-  <li>
-    <strong>PostToolUse 不撤销。</strong>PostToolUse exitCode 1 路径上,
-    tool.execute 之后正常写 tool message, Hook block 仅作为 user 消息追加。验证:
-    跑测试, 工具 spy 被调用 1 次 (execute 真的跑了)。
-  </li>
-  <li>
-    <strong>hookRunner 是工厂, 不在 agent 内 new。</strong>验证:
-    <code>grep -n 'new HookRunner\|new NoopHook' src/agent.ts</code> 应当 0 行。
-  </li>
-</ol>
-
-<h3 id="vibe-debug-08">调试: 第 08 章典型伪装</h3>
-<ol>
-  <li>
-    <strong>伪装 A · Hook 注入消息直接写 history, 不延迟。</strong>症状:
-    exitCode 2 路径上 <code>history.add({role: "user", ...})</code> 紧跟在
-    tool.execute 之后。验证: 跑测试, 2 个 tool_calls 时 history 中间不能出现
-    user message, 必须在末尾。
-  </li>
-  <li>
-    <strong>伪装 B · Hook 抛错时 throw 而不是降级。</strong>症状: agent.run 顶层
-    try/catch 缺失, Hook 错误冒泡到 main。验证: Validation 卡片"Hook 抛错降级"
-    那条测试通过。
-  </li>
-  <li>
-    <strong>伪装 C · PostToolUse block 试图 "撤销"。</strong>症状: 代码里有 "if
-    (postResult.exitCode === 1) { tool.undo?.() }"。验证: PostToolUse block
-    只能追加 user 消息警告, 不能调 tool 自身的 undo (没有这个方法)。
-  </li>
-</ol>
-
-<h3 id="vibe-iterate-08">迭代: 第 08 章 4 个 commit 节点</h3>
-<ol>
-  <li>
-    <code
-      >feat(ch08): 钉 HookEvent / HookResult / HookRunner 接口与 exitCode
-      语义</code
-    >
-    —— tsc 通过, 无实现。
-  </li>
-  <li>
-    <code>feat(ch08): createNoopHookRunner + agent.run 接入三个事件 stub</code>
-    —— tsc 通过, runner 永远 exitCode 0。
-  </li>
-  <li>
-    <code
-      >feat(ch08): safeRunHook + 注入消息延迟追加 + block 写 tool message</code
-    >
-    —— 跑通 Validation 卡片前 4 条。
-  </li>
-  <li>
-    <code>test(ch08): PostToolUse block 不撤销 + SessionStart 拒掉会话</code> ——
-    全绿。
-  </li>
-</ol>
-
 <h2 id="prompt-card">Prompt Card (本章任务)</h2>
 <div class="card card--prompt">
   <div class="card__head">
     <span class="card__tag">Prompt Card · 第 08 章</span>
-    <button class="card__copy" type="button" data-copy-card>复制</button>
+    <button class="card__copy" data-copy-card="" type="button">复制</button>
   </div>
   <div class="card__body">
-    <p>
-      <strong>目标:</strong>在 agent loop 关键节点插入 Hook 调用点, 让 harness
-      的关键事件 (permission 决策、工具执行、状态变化) 有 transcript 兜底。
-    </p>
-    <p>
-      <strong>场景:</strong>用户输入 "清理临时文件", LLM 调用 run_bash,
-      PreToolUse Hook 记录 decision, tool.execute 之后 PostToolUse Hook 记录
-      result, transcript 留下完整事件流供 eval 重放。
-    </p>
-    <p>
-      <strong>模块:</strong> <code>src/hooks.ts</code> (新) 暴露
-      <code>createNoopHookRunner()</code> 和
-      <code>createHookRunner(config)</code>; <code>src/agent.ts</code> 在
-      PreToolUse / PostToolUse / SessionStart 三个位置插入 hookRunner.run();
-      <code>src/index.ts</code> 接线 hookRunner。
-    </p>
+    <p><strong>目标:</strong> 给 harness 加 Hook 系统, 3 个事件 × 3 个
+      返回码, 主循环按固定时序 emit, handler 在外部注册。</p>
+    <p><strong>场景:</strong> 团队挂 3 个 handler — 审计 logger 写
+      audit.log; <code>--no-verify</code> 检查返回 1 阻止; 敏感命令
+      警告返回 2 注入 user message。 验证 3 个 handler 串联时, block
+      短路后续 handler, inject 累积 message 用空行拼接。</p>
+    <p><strong>模块:</strong> <code>src/hooks.ts</code> (新) 暴露
+      <code>createHookRunner(handlers, logger)</code> + <code>createNoopHookRunner()</code>;
+      <code>src/agent.ts</code> (改) 在 3 个固定时机 emit event;
+      <code>src/index.ts</code> (改) Composition Root 注入 HookRunner。</p>
     <p><strong>边界 (LLM 必须遵守的 checklist):</strong></p>
     <ul>
-      <li>
-        PreToolUse block (exitCode 1) 必写 role: "tool" 消息 "Blocked by
-        PreToolUse hook: ..."
-      </li>
-      <li>Hook 抛错时降级为 exitCode 0, 不让主 loop 崩</li>
-      <li>
-        Hook 注入消息 (exitCode 2) 延迟到所有 tool_result 写完之后统一追加
-      </li>
-      <li>PostToolUse block 仅追加 user 消息警告, 不撤销已执行的副作用</li>
-      <li>SessionStart block 终止 run(), history 长度 = 0 (用户消息未写入)</li>
+      <li>3 个事件 SessionStart / PreToolUse / PostToolUse, 不可新增第 4 个事件点</li>
+      <li>3 个返回码 0 / 1 / 2, 语义正交, 不可合并 / 重命名</li>
+      <li>block 短路后续 handler, inject 累积 message 用 <code>\n\n</code> 拼接</li>
+      <li>handler 异常只 warn 日志, 不中断 agent 主流程</li>
+      <li>exitCode 2 的 message 延迟到本轮所有 tool_result 写完后追加</li>
+      <li>createNoopHookRunner 提供默认空实现, agent.ts 不写 <code>if (hookRunner)</code></li>
+      <li>HookRunner 接口只有 <code>run(event)</code>, 不暴露 handler 数组</li>
     </ul>
-    <p>
-      <strong
-        >验证 (用 fake HookRunner + spy registry, 逐条落到 vitest):</strong
-      >
-    </p>
+    <p><strong>验证 (用 stub handler + vitest, 逐条断言):</strong></p>
     <ul>
-      <li>
-        PreToolUse exitCode 1, spy.execute 没被调用, history 末尾有 tool message
-        "Blocked by PreToolUse hook"
-      </li>
-      <li>fake HookRunner.run 抛错, 主 loop 继续, tool.execute 被调用</li>
-      <li>
-        2 个 tool_calls + PreToolUse exitCode 2, history 末尾顺序: [tool, tool,
-        user(Hook 注入)]
-      </li>
-      <li>
-        SessionStart exitCode 1, agent.run() 返回 "Session blocked by hook.",
-        history 长度 = 0
-      </li>
-      <li>
-        PostToolUse exitCode 1, tool.execute 仍被调用 1 次, history 末尾追加
-        user 警告
-      </li>
+      <li>block 短路: 3 个 handler 串联, 第 2 个返回 1, 第 3 个不执行</li>
+      <li>inject 累积: 2 个 handler 都返回 2, message 用空行拼接</li>
+      <li>异常隔离: handler 抛 Error, 后续 handler 继续执行</li>
+      <li>createNoopHookRunner 和 createHookRunner 接口完全一致, 行为不同</li>
+      <li>延迟注入: 一轮 3 个 tool_call, 第一个 PostToolUse 返回 2, inject 消息在所有 tool_result 之后</li>
     </ul>
   </div>
 </div>
-
 <h2 id="practice">本章练习</h2>
 <ol>
-  <li>
-    故意在 PreToolUse block 路径上不写 tool message, 跑测试, 看"PreToolUse block
-    写 tool message" 是否抓到。
+<li>
+    故意把 handler 业务逻辑写进 <code>agent.ts</code> (如 <code>if (event.name === "PreToolUse" &amp;&amp; event.payload.args.command?.includes("--no-verify")) ...</code>),
+    跑测试, 看"主循环只发事件不处理业务" 是否抓到 (agent.ts
+    行数膨胀 + 改规则必须改 agent)。
+  </li>
+<li>
+    让 handler 返回 <code>{ exitCode: 1, message: "请改用 --verify" }</code>
+    想 "block + 给 LLM 提示", 跑测试, 看"1 是 block, message 不会注入"
+    是否抓到 (LLM 收不到 message, 一直重复 --no-verify)。
   </li>
-  <li>
-    让 fake HookRunner.run 抛错, 但 safeRunHook 不包 try/catch, 跑测试, 看"Hook
-    抛错降级" 是否抓到。
+<li>
+    故意在 tool_result 之后立刻追加 user message (不缓存 inject
+    消息), 跑多 tool_call 测试, 看"延迟注入" 是否抓到 (OpenAI API
+    400 error, 或协议断言失败)。
   </li>
-  <li>
-    把 Hook 注入消息直接写 history (不延迟), 跑测试, 看"Hook 注入消息延迟追加"
-    是否抓到 (2 个 tool_calls 时中间不能有 user message)。
+<li>
+    故意让 handler <code>throw new Error("boom")</code>, 跑测试,
+    看"异常隔离" 是否抓到 (agent 主流程崩 vs. 后续 handler 继续)。
   </li>
 </ol>
-
 <h2 id="summary">本章小结</h2>
 <p>
-  本章给 harness 加了 Hook 机制, 在 loop 关键节点 (SessionStart / PreToolUse /
-  PostToolUse) 插入可扩展点。Hook 通过 exitCode 0/1/2 决定 pass-through / block
-  / inject 三种行为, 错误降级原则保证 第三方 Hook 不会让 harness 崩溃。下一章
-  (第 09 章) 我们会处理"用户偏好 怎么跨会话保留" 的问题——Memory, 把"项目级"
-  和"用户级" 长期事实 分开, 让 harness 在新会话里也能想起"用户喜欢简洁解释"。
+  Hook 是给主循环的<strong>扩展点</strong>, 不是 Permission 的替代品。
+  核心是 4 个设计:
 </p>
-
-<h2 id="next">下一章伏笔</h2>
+<ul>
+<li>
+<strong>3 个事件点</strong>: SessionStart / PreToolUse / PostToolUse,
+    少而精, 事件点多就是多一份主循环耦合。
+  </li>
+<li>
+<strong>3 个返回码</strong>: 0 / 1 / 2, 语义正交 — 0 啥也不做, 1 阻止
+    (PreToolUse 阻止工具, PostToolUse 只 warn), 2 注入 user message。
+  </li>
+<li>
+<strong>多 handler 聚合</strong>: 串行执行, block 短路, inject 累积
+    (用 <code>\n\n</code> 拼接), 异常隔离 (catch + warn, 不中断)。
+  </li>
+<li>
+<strong>延迟注入</strong>: exitCode 2 的 message 必须在所有
+    tool_result 写完后追加, 不然破坏 OpenAI API 协议。
+  </li></ul>
 <p>
-  第 08 章让 harness 有关键事件的事实记录, 但用户偏好 (例如"我喜欢 简洁解释")
-  仍然在每次新会话都丢。下一章 Memory 模块会区分"项目级 短期事实"
-  (例如本项目的命名规范) 和"用户级 长期事实" (例如用户的 解释风格偏好),
-  把后者持久化, 让 harness 在新会话里也能想起。
+  下一章 (第 09 章) 展开 SessionStart 最常用的扩展 — <strong>Memory
+  系统</strong>, 4 类 tag 把"团队规范 / 用户偏好 / 项目上下文 /
+  历史错误" 持久化到 MEMORY.md, 启动时通过 SessionStart handler
+  注入。
 </p>
diff --git a/tutorial/chapters/09-memory.html b/tutorial/chapters/09-memory.html
index 64b629c..a04cc12 100644
--- a/tutorial/chapters/09-memory.html
+++ b/tutorial/chapters/09-memory.html
@@ -1,519 +1,661 @@
-<p class="article__eyebrow">第 09 章 · 跨会话记忆</p>
-<h1 class="article__title">让 Harness 记得用户: Memory</h1>
+<p class="article__eyebrow">第 09 章 · 跨会话的长期记忆</p>
+<h1 class="article__title">Memory: 4 类 tag 持久化团队规范与用户偏好</h1>
 <p class="article__lede">
-  前面八章让 harness 在单次会话内能聊天、调工具、跑子任务、压缩、拦权限、 留
-  hook。但用户偏好 (例如"我喜欢简洁解释") 仍然在每次新会话都丢。 这一章给
-  harness 加 Memory 模块, 区分"项目级短期事实" 和"用户级长期 事实",
-  把后者持久化, 让 harness 在新会话里也能想起。
+  第 08 章的 Hook 给了"主循环周围挂逻辑" 的扩展点, 但 SessionStart
+  handler 想读的"团队规范 / 用户偏好" 从哪来? 用户的 home 目录里
+  躺着一个 <code>MEMORY.md</code>, 团队 git 仓库里有一个
+  <code>.claude/memory/</code> 目录, 但 LLM 不知道哪些"应该记住"。 这一章
+  加 <strong>Memory 系统</strong>: 4 类 tag (user / feedback / project /
+  reference) × Markdown frontmatter 存储 × 自动生成索引 × Jaccard
+  相似度去重, 启动时通过 SessionStart handler 注入 system prompt。
+  读完后, 你能讲清"4 类 tag 的语义" (用户偏好 vs 团队规范 vs 项目
+  上下文 vs 参考资料), 并能用 fake filesystem 验证"name 合法 + 索引
+  重建 + 相似度去重" 3 条不变量。
 </p>
-
-<nav id="article-inline-toc" class="article__meta" aria-label="页内小节"></nav>
-
-<hr class="rule" />
-
-<h2 id="delta-from-08">在第 08 章基础上改了什么</h2>
-<p>
-  这一章在 Composition Root 加 memory 持久化层。memory 数据落盘到
-  <code>~/.claude/memory/&lt;scope&gt;/&lt;key&gt;.md</code> 这样的 markdown
-  文件, scope 区分 <code>user</code> (用户级) 和 <code>project</code> (项目级)。
-  agent 主循环在 SessionStart 时把 memory 注入到 system prompt, 会话过程中 LLM
-  可通过 save_memory 工具更新 memory。memory 与 history 严格分离, 不进 messages
-  序列。 对应到代码, 改动集中在 3 个文件: <code>src/memory.ts</code> (新)、
-  <code>src/tools/save_memory.ts</code> (新)、<code>src/agent.ts</code> (改
-  SessionStart 注入 memory)。
-</p>
-<div class="source-links" aria-label="本章 GitHub 永久链接">
-  <a
-    class="source-link"
-    href="https://github.com/pingp76/swoopcode/blob/main/src/memory.ts"
-    target="_blank"
-    rel="noreferrer"
-    >1. src/memory.ts: Memory 持久化 (新)</a
-  >
-  <a
-    class="source-link"
-    href="https://github.com/pingp76/swoopcode/blob/main/src/agent.ts"
-    target="_blank"
-    rel="noreferrer"
-    >2. src/agent.ts: SessionStart 注入 memory 到 system prompt</a
-  >
-  <a
-    class="source-link"
-    href="https://github.com/pingp76/swoopcode/blob/main/src/project-context.ts"
-    target="_blank"
-    rel="noreferrer"
-    >3. src/project-context.ts: 项目级上下文 (新)</a
-  >
-</div>
-
-<h2 id="author-thinking">作者怎么想的: 这一章的思考链</h2>
-<dl class="defs">
-  <dt>想清楚现象</dt>
-  <dd>
-    用户在新会话里说"我喜欢简洁解释", agent 完全不知道用户在上一会话
-    说过这件事。现象是"harness 短期记忆 OK, 长期记忆缺失"。
-  </dd>
-  <dt>想反例</dt>
-  <dd>
-    最朴素的反例是"把 memory 写在 history 里"。这有两个问题: 一是 history
-    每次新会话都从空开始, memory 仍然丢; 二是 history 是"对话 上下文", memory
-    是"长期事实", 两者职责不同。
-  </dd>
-  <dt>想接口和不变量</dt>
-  <dd>
-    接口:
-    <code
-      >interface MemoryStore { get(scope, key), set(scope, key, value),
-      list(scope) }</code
-    >。 不变量三条: (1) memory 数据不进 history.getMessages(), (2) memory 落盘是
-    append-only (更新是写新文件, 不直接改原文件, 第 15 章 atomic-write 兜底),
-    (3) 区分 user 级与 project 级, user 级跨项目共享, project
-    级仅在当前项目可见。
-  </dd>
-  <dt>想怎么验证</dt>
-  <dd>
-    fake memoryStore 暴露 spy, 跑完一次会话, spy.set 被调用写入"用户 喜欢简洁";
-    第二次会话启动, SessionStart 注入的 system prompt 含 "用户偏好: 简洁解释",
-    LLM 第二轮能引用这个事实。
-  </dd>
-</dl>
-
-<h2 id="observe-first">先观察: 两段故意有气味的实现</h2>
-
-<div class="note">
-  <p class="note__title">观察 1 · memory 写在 history 里</p>
-  <pre class="code-block"><code>// 教学简化版, 仓库真实实现见 GitHub 永久链接
-// 错误: 试图用 history 保存 memory
-history.add({ role: "system", content: "User preference: 简洁解释" });
-history.persist();  // 错误: history 不该有 persist 方法</code></pre>
-  <p><strong>问:</strong>为什么不复用 history 的持久化能力?</p>
-  <p>
-    <strong>答:</strong>三件事同时坏掉 —— 职责: history 是"对话上下文", memory
-    是"长期事实", 两者职责不同; 性能: history 每次 LLM 调用都 重新发送, memory
-    写进去意味着每次都重传无关事实; 协议: memory 改了 history 的语义, 后续
-    compress / replay 都会混乱。
-  </p>
-</div>
-
-<div class="note">
-  <p class="note__title">观察 2 · 不区分 scope, 所有 memory 写一个文件</p>
-  <pre class="code-block"><code>// 教学简化版
-const memory = await fs.readFile("~/.claude/memory", "utf8");
-memory += `\n${newFact}`;
-await fs.writeFile("~/.claude/memory", memory);</code></pre>
-  <p><strong>问:</strong>为什么不分 scope?</p>
-  <p>
-    <strong>答:</strong>用户换项目之后, 项目级 fact (例如"本项目用 React") 跟着
-    user 走, 污染新项目。区分 user / project scope 之后, 项目级 fact
-    在新项目自动隐藏, 不会被 LLM 误用。
-  </p>
+<nav aria-label="页内小节" class="article__meta" id="article-inline-toc"></nav>
+<hr class="rule"/>
+<h2 id="real-failure">真实失败故事: 用户说"我之前说过" harness 完全不知道</h2>
+<p>
+  写代码之前, 先看一个真实痛点。 用户的 team 用 harness 跑 3 个月后,
+  还在反复向 LLM 重复同样的偏好:
+</p>
+<ol>
+<li>
+<strong>用户偏好</strong>: 用户每次都说"中文回答 + TypeScript 不要 any",
+    harness 每次都按默认回答。
+  </li>
+<li>
+<strong>团队规范</strong>: team 反复要求"PR 标题用 conventional commit
+    格式", harness 每次都按普通 commit 写。
+  </li>
+<li>
+<strong>项目上下文</strong>: 用户每次说"这个项目用 pnpm 不用 npm",
+    harness 每次都跑 <code>npm install</code>。
+  </li>
+<li>
+<strong>参考资料</strong>: 用户每次说"读 README.md 第 3 节", harness
+    每次都搜错文件。
+  </li>
+</ol>
+<p>
+  朴素想法 1: "把这些写进 system prompt?" 错。 system prompt 是
+  稳定前缀, 频繁改会破坏 prompt cache (第 10 章), 改一次成本
+  ×10。 system prompt 应该<strong>冷启动时一次</strong>写, 不
+  经常改。
+</p>
+<p>
+  朴素想法 2: "让 LLM 自己读 MEMORY.md?" 浪费 token, 而且 LLM 不知道
+  "应该读哪里", 容易读错。 应该 harness 主动<strong>按 type 筛选</strong>,
+  只把"高价值低频变" 的条目注入 system prompt。
+</p>
+<p>
+  正确做法: 加 <code>src/memory.ts</code> 模块 — 每条 memory 是独立
+  Markdown 文件 (frontmatter + body), 4 类 tag 区分, harness 在
+  SessionStart 启动时扫一次目录, 把所有条目按 type 排序后注入
+  system prompt, 之后 100 轮对话 cache 命中。 这是 Reference 章节
+  "模式 11 · Cache-friendly 缓存友好" 的具体应用。
+</p>
+<h2 id="four-types">4 类 tag: user / feedback / project / reference</h2>
+<p>
+  <strong>用途</strong>: 不同来源 / 不同变更频率 / 不同优先级的
+  memory 条目应该<strong>分开存</strong>, 启动时按 type 决定注入
+  顺序。 4 类 tag 是固定枚举, 不允许自定义第 5 类 — 多了维护成本
+  上升, 少了语义冲突。
+</p>
+<p>
+  <strong>真实场景</strong>: 用户把"中文回答" 存为
+  <code>type: user</code>; team 把"PR 用 conventional commit" 存为
+  <code>type: feedback</code>; 项目把"用 pnpm" 存为
+  <code>type: project</code>; 用户把"读 README.md 第 3 节" 存为
+  <code>type: reference</code>。 4 类各管一摊, 不会把"个人偏好" 和
+  "团队规范" 混在一起。
+</p>
+<p>
+  <strong>设计思想</strong>: <strong>固定枚举 + 字段类型</strong> —
+  4 类不是任意字符串, 是编译期可见的 union, 写错会 TypeScript 编译
+  失败。 看
+  <a href="https://github.com/pingp76/swoopcode/blob/main/src/memory.ts#L30"><code>src/memory.ts</code> 第 30 行的 <code>MemoryType</code> union</a>:
+</p>
+<pre><code class="language-typescript">export type MemoryType = "user" | "feedback" | "project" | "reference";</code></pre>
+<div class="figure figure--stack">
+  <div class="figure__title">图 1 · 4 类 memory 的语义边界</div>
+  <div class="flow-stack">
+    <div class="flow-stack__layer flow-stack__layer--stable">
+      <div class="flow-stack__label">user · 用户偏好</div>
+      <div class="flow-stack__body">"中文回答" "不要用 any" "注释用中文" — 来自单个用户, 跨项目, 几乎不变。</div>
+    </div>
+    <div class="flow-stack__layer flow-stack__layer--dynamic">
+      <div class="flow-stack__label">feedback · 团队反馈</div>
+      <div class="flow-stack__body">"PR 用 conventional commit" "测试要覆盖边界" — 来自团队, 跨项目但周期更新。</div>
+    </div>
+    <div class="flow-stack__layer flow-stack__layer--stable">
+      <div class="flow-stack__label">project · 项目上下文</div>
+      <div class="flow-stack__body">"这个项目用 pnpm" "src/ 在 packages/core 下" — 来自项目, 项目内稳定, 跨项目无意义。</div>
+    </div>
+    <div class="flow-stack__layer flow-stack__layer--dynamic">
+      <div class="flow-stack__label">reference · 参考资料</div>
+      <div class="flow-stack__body">"读 README.md 第 3 节" "API 文档在 https://..." — 高频查, 路径稳定但内容可能改。</div>
+    </div>
+  </div>
 </div>
-
-<h2 id="interfaces">接口形状: 在写实现前钉死</h2>
-<pre class="code-block"><code>// 教学简化版, 仓库真实实现见 GitHub 永久链接
-export type MemoryScope = "user" | "project";
-
-export interface MemoryEntry {
-  scope: MemoryScope;
-  key: string;       // 例如 "user.preference.explanation_style"
-  value: string;     // markdown 内容
-  updatedAt: number; // epoch ms
-}
-
-export interface MemoryStore {
-  get(scope: MemoryScope, key: string): Promise&lt;MemoryEntry | null&gt;;
-  set(scope: MemoryScope, key: string, value: string): Promise&lt;void&gt;;
-  list(scope: MemoryScope): Promise&lt;MemoryEntry[]&gt;;
-  // 渲染为 system prompt 片段
-  renderForPrompt(scope: MemoryScope): Promise&lt;string&gt;;
+<p>
+  <strong>实现细节</strong>: 4 类 tag 的<strong>语义正交</strong>, 不允许
+  出现"既是 user 又是 feedback" 的情况 — 真出现, 说明该拆成两条
+  memory。 启动注入顺序按 type 排序 (user → feedback → project →
+  reference), 保证 system prompt 里的相对位置稳定, cache 命中。
+</p>
+<h2 id="file-format">文件格式: frontmatter + body 的 Markdown</h2>
+<p>
+  <strong>用途</strong>: 每条 memory 是独立 .md 文件, 方便 git diff
+  / review / 合并冲突解决。 文件名 = <code>${name}.md</code>, 文件
+  内 frontmatter <code>name</code> 字段必须和文件名一致 (双重校验)。
+</p>
+<p>
+  <strong>真实场景</strong>: team 改了一条 feedback memory, 提 PR,
+  reviewer 看 git diff 看到 frontmatter 改了 description + body 改了
+  一段, 接受合并。 如果用 JSON, 整个文件一行, review 看到的是
+  "改了一个字符但不知道改哪"。
+</p>
+<p>
+  <strong>设计思想</strong>: <strong>人类可读 + 工具可解析</strong> —
+  复用 skills.ts 的 <code>parseFrontmatter()</code> (第 05 章), 不引入
+  YAML 依赖。 看
+  <a href="https://github.com/pingp76/swoopcode/blob/main/src/memory.ts#L151"><code>src/memory.ts</code> 第 151 行的 <code>serializeMemory</code></a>:
+</p>
+<pre><code class="language-typescript">export function serializeMemory(entry: MemoryEntry): string {
+  const lines = [
+    "---",
+    `name: ${entry.meta.name}`,
+    `description: ${entry.meta.description}`,
+    `type: ${entry.meta.type}`,
+    `createdAt: ${entry.meta.createdAt}`,
+    `updatedAt: ${entry.meta.updatedAt}`,
+    "---",
+    entry.body,
+  ];
+  return lines.join("\n");
 }</code></pre>
-
-<h2 id="scope-rules">scope 划分规则</h2>
-<dl class="defs">
-  <dt>user 级</dt>
-  <dd>
-    跨项目共享的事实, 例如 "user.preference.explanation_style = 简洁",
-    "user.preferred_language = zh-CN"。这些 facts 在任何项目任何会话 都生效。
-  </dd>
-  <dt>project 级</dt>
-  <dd>
-    仅在当前项目可见的事实, 例如 "project.tech_stack = React + TypeScript",
-    "project.naming = camelCase"。这些 facts 在新项目自动隐藏, 不会污染
-    其他项目。
-  </dd>
-  <dt>key 命名规范</dt>
-  <dd>
-    <code>&lt;scope-prefix&gt;.&lt;category&gt;.&lt;name&gt;</code>, 例如
-    <code>user.preference.explanation_style</code>。key 稳定, 不允许 LLM
-    自由发挥改名。
-  </dd>
-</dl>
-
-<h2 id="loop-integration">loop 接入: SessionStart 注入到 system prompt</h2>
-<pre class="code-block"><code>// 教学简化版, 仓库真实实现见 GitHub 永久链接
-// 第一次 run() 时, SessionStart 阶段
-if (!sessionStarted) {
-  sessionStarted = true;
-
-  // 1. 加载 memory 渲染为 system prompt 片段
-  const userMemory = await memoryStore.renderForPrompt("user");
-  const projectMemory = await memoryStore.renderForPrompt("project");
-
-  // 2. 拼接到 system prompt (用稳定前缀, 第 10 章 cache-friendly 伏笔)
-  const basePrompt = history.getSystemPrompt() ?? "";
-  history.setSystemPrompt(
-    `${basePrompt}\n\n# User Memory\n${userMemory}\n\n# Project Memory\n${projectMemory}`
+<p>
+  <strong>实现细节</strong>: frontmatter 5 个必填字段 (name /
+  description / type / createdAt / updatedAt) — 缺一个就 warn
+  日志 + 跳过该文件, 不抛错。 这条"宽容失败" 是 Reference 章节
+  "模式 17 · Test Doubles 测试替身" 的反向应用 — 解析器要容忍
+  坏文件, 不因为一个坏文件就让 agent 启动失败。
+</p>
+<p>
+  name 字段有<strong>双重校验</strong>: 文件名 <code>${name}.md</code> ↔
+  frontmatter <code>name</code> 必须一致。 不一致就 warn + 跳过。
+  这条"对称校验" 是 AGENTS.md "Validation Rules" 的体现 — name 是
+  identity 字段, 文件名和 frontmatter 必须由一个模块明确验证。
+</p>
+<h2 id="manager-closure">createMemoryManager 工厂 + 闭包缓存</h2>
+<p>
+  <strong>用途</strong>: MemoryManager 接口在 Composition Root 创建,
+  通过闭包缓存 scan 过的 entries 列表, <code>read()</code> /
+  <code>list()</code> 走内存不走磁盘, 性能稳定。
+</p>
+<p>
+  <strong>真实场景</strong>: SessionStart 启动时调一次 <code>scan()</code>
+  扫目录 (10 个文件, 大约 5ms), 之后 100 轮对话每次都调
+  <code>buildPromptSection()</code> 注入 system prompt, 这 100 次
+  调都走内存缓存, 不再读磁盘。 内存里的 entries 在
+  <code>create()</code> / <code>delete()</code> 后自动重建索引。
+</p>
+<p>
+  <strong>设计思想</strong>: <strong>工厂 + 闭包</strong> — 这是
+  Reference 章节 "模式 1 · 工厂 + 闭包" 的标准应用: 内部状态
+  (cachedEntries + memoryDir + logger) 全在闭包内, 外部只能通过
+  MemoryManager 接口访问。 看
+  <a href="https://github.com/pingp76/swoopcode/blob/main/src/memory.ts#L331"><code>src/memory.ts</code> 第 331 行的 <code>createMemoryManager</code> 工厂</a>:
+</p>
+<pre><code class="language-typescript">export function createMemoryManager(options: {
+  memoryDir: string;
+  logger: Logger;
+}): MemoryManager {
+  const { memoryDir, logger } = options;
+  let cachedEntries: MemoryEntry[] = [];  // 闭包内状态
+
+  function scan(): MemoryEntry[] { /* 读目录 + 解析 */ }
+  function read(name: string): MemoryEntry | null { /* 查缓存或读单文件 */ }
+  function create(input: CreateMemoryInput): MemoryEntry { /* 写文件 + 重建索引 */ }
+  // ...
+
+  return { scan, read, create, list, findSimilar, delete: deleteMemory, buildPromptSection, rebuildIndex, getMemoryDir };
+}</code></pre>
+<p>
+  <strong>实现细节</strong>: <code>scan()</code> 每次都清空
+  <code>cachedEntries = []</code> 然后重读, 保证结果反映磁盘最新
+  状态; <code>read(name)</code> 优先查缓存, 缓存没有再读单文件 (允许
+  "scan 之后外部改文件" 的边界情况)。 闭包隔离让多个 MemoryManager
+  实例 (父子 subagent 各一个) 互不污染。
+</p>
+<h2 id="validation">name 合法性 + type 合法性 + name == 文件名 三重校验</h2>
+<p>
+  <strong>用途</strong>: 写入和读取都要校验, 不合法的 name (路径穿越 /
+  隐藏文件 / 特殊字符) 一律拒绝。 这是 Reference 章节 "模式 15 ·
+  Identity Check 身份校验" + AGENTS.md "Validation Rules: 字段
+  一致性" 的具体应用。
+</p>
+<p>
+  <strong>真实场景</strong>: LLM 调 <code>memory_create({name: "../../etc/passwd", ...})</code>
+  想穿越目录, name 合法性校验拒绝; 用户手改 <code>foo.md</code> 的 frontmatter 写成 <code>name: bar</code>,
+  下次 scan 时 name == 文件名校验发现不一致, warn + 跳过该文件。
+</p>
+<p>
+  <strong>设计思想</strong>: <strong>3 重校验</strong>:
+</p>
+<ol>
+<li>
+<strong>name 合法字符</strong>: 只允许 <code>[a-z0-9_-]+</code>, 看
+    <a href="https://github.com/pingp76/swoopcode/blob/main/src/memory.ts#L127"><code>src/memory.ts</code> 第 127 行的 <code>isValidName</code></a>。
+    拒绝大写 / 空格 / 点 / 斜杠, 防止路径穿越和 case-insensitive
+    文件系统的歧义。
+  </li>
+<li>
+<strong>type 合法枚举</strong>: 只允许 4 类 tag, 看
+    <a href="https://github.com/pingp76/swoopcode/blob/main/src/memory.ts#L134"><code>src/memory.ts</code> 第 134 行的 <code>isValidType</code></a>。
+    拒绝任意字符串, 防止"我自己加一类" 破坏 4 类语义。
+  </li>
+<li>
+<strong>name == 文件名</strong>: frontmatter <code>name</code> 必须
+    等于文件名 (去掉 .md), 看
+    <a href="https://github.com/pingp76/swoopcode/blob/main/src/memory.ts#L394"><code>src/memory.ts</code> 第 394 行的对称校验</a>。
+    防止"改了 frontmatter 没改名" 或"改了名没改 frontmatter" 的
+    身份错位。
+  </li>
+</ol>
+<p>
+  <strong>实现细节</strong>: 读取路径 (scan) 上 3 重校验失败都<strong>只 warn 日志 + 跳过</strong>,
+  不抛错 — 一个坏文件不能阻止整个 agent 启动, 用户可能手改了 frontmatter, harness 应该宽容。
+  写入路径 (<code>create()</code>) 相反: name / type 不合法直接 throw, 不落盘。 这是 Reference 章节 "模式 13 ·
+  Error→Action 错误转动作" 的应用 — 读到坏文件 = warn + 跳过, 不是 throw。
+</p>
+<h2 id="index">MEMORY.md 自动索引: 不手写</h2>
+<p>
+  <strong>用途</strong>: 用户 / reviewer 看 <code>memory/</code> 目录
+  时需要一个"目录页" — 哪些 memory 存在, 各自是哪个 type, 一句话
+  描述是什么。 索引自动生成, 不手写, 否则容易"加 memory 忘了同步
+  索引"。
+</p>
+<p>
+  <strong>真实场景</strong>: 用户想看 "我项目里有哪些 feedback 类型
+  的 memory", 直接读 <code>MEMORY.md</code> 看索引 — 按 type 排好
+  序, 每行 <code>- [feedback] name: description</code>。 这是
+  Reference 章节 "模式 11 · Cache-friendly 缓存友好" 的延伸 —
+  索引是 cache miss 时 fallback 的"快速查表"。
+</p>
+<p>
+  <strong>设计思想</strong>: <strong>派生数据</strong> — 索引是
+  scan 结果的派生, 每次 <code>create()</code> / <code>delete()</code>
+  后自动 rebuild, 用户不能手写 (被排除在 scan 之外)。 看
+  <a href="https://github.com/pingp76/swoopcode/blob/main/src/memory.ts#L234"><code>src/memory.ts</code> 第 234 行的 <code>buildIndexContent</code></a>:
+</p>
+<pre><code class="language-typescript">function buildIndexContent(entries: MemoryEntry[]): string {
+  const header = "# Memory Index\n";
+  const lines = entries.map(
+    (e) =&gt; `- [${e.meta.type}] ${e.meta.name}: ${e.meta.description}`,
   );
-
-  // 3. SessionStart Hook (第 08 章)
-  const hookResult = await safeRunHook(/* ... */);
-  if (hookResult.exitCode === 1) return hookResult.message;
-}
-
-// save_memory 工具
-async function saveMemory(args) {
-  await memoryStore.set(args.scope, args.key, args.value);
-  return { toolCallId: call.id, content: `Saved ${args.scope}.${args.key}` };
+  return header + "\n" + lines.join("\n") + "\n";
 }</code></pre>
 <p>
-  关键设计: memory 拼到 system prompt 的固定位置 ("# User Memory" / "# Project
-  Memory"), 即使 memory 内容变化, 这一行 prefix 稳定, 不会破坏第 10 章
-  cache-friendly 布局。memory 内容本身在 LLM 视角 是动态的, 但 prefix 锚点不变。
+  <strong>实现细节</strong>: 索引按 type → name 稳定排序
+  (看
+  <a href="https://github.com/pingp76/swoopcode/blob/main/src/memory.ts#L247"><code>src/memory.ts</code> 第 247 行的 <code>getStableSortKey</code></a>),
+  保证每次 rebuild 出来的 MEMORY.md 内容一致, git diff 只显示
+  "新增/删除/更新", 不显示"顺序变了"。
 </p>
-
-<h2 id="persistence">落盘: append-only + atomic write</h2>
+<h2 id="similarity">相似度去重: Jaccard 教学版</h2>
 <p>
-  memory 落盘必须用 atomic write (第 15 章会展开): 写新文件到
-  <code>~/.claude/memory/&lt;scope&gt;/&lt;key&gt;.md.tmp</code>, fsync 之后
-  rename 到正式路径。直接 <code>fs.writeFile</code> 在断电时会留半截文件,
-  下次启动解析失败。
+  <strong>用途</strong>: 用户创建 memory 前, 检查是否已存在相似条目
+  (Jaccard 相似度 ≥ 0.5 就算相似), 给 LLM 返回"已有相似条目,
+  请确认是要新增还是要更新"。 防止"反复加重复 memory"。
 </p>
 <p>
-  set() 内部追加 metadata header (updatedAt, scope), 这样 list() 时
-  不需要读文件内容就能展示元信息。文件本身是 markdown 格式, 人类可读,
-  也方便用户用编辑器手动修正。
+  <strong>真实场景</strong>: LLM 想加 "中文回答" (description), 但
+  磁盘上已有 "中文交流" (description + body 提到"中文"), 相似度
+  0.6, 返回"已有相似条目, 是否要更新?", LLM 决定更新而不是新增。
 </p>
-
-<h2 id="trap">反例梯度</h2>
-
-<div class="card card--trap">
-  <div class="card__head">
-    <span class="card__tag">新手错法 · A</span>
-  </div>
-  <div class="card__body">
-    <p><strong>常见错误:</strong>memory 写进 history。</p>
-    <p>
-      <strong>为什么错:</strong>每次 LLM 调用都重传, token 浪费; 职责混淆;
-      compress / replay 都会混乱。
-    </p>
-    <p>
-      <strong>正确做法:</strong>memory 独立落盘, SessionStart 注入 system
-      prompt, 不写 history。
-    </p>
-  </div>
-</div>
-
-<div class="card card--trap">
-  <div class="card__head">
-    <span class="card__tag">中级错法 · B</span>
-  </div>
-  <div class="card__body">
-    <p><strong>常见错误:</strong>不区分 scope, 所有 memory 写一个文件。</p>
-    <p>
-      <strong>为什么错:</strong>用户换项目后项目级 fact 跟着走, 污染新项目。
-    </p>
-    <p>
-      <strong>正确做法:</strong>按 scope 写到不同目录, 渲染 system prompt
-      时按当前项目过滤。
-    </p>
-  </div>
-</div>
-
-<div class="card card--trap">
-  <div class="card__head">
-    <span class="card__tag">高级错法 · C</span>
-  </div>
-  <div class="card__body">
-    <p>
-      <strong>常见错误:</strong>LLM 写 memory 时 key 自由发挥, 每次都新建一个
-      key。
-    </p>
-    <p>
-      <strong>为什么错:</strong>"user.preference.style" 和
-      "user.preference.explanation_style" 实际是同一件事, 写两份互相覆盖。
-    </p>
-    <p>
-      <strong>正确做法:</strong>key 命名规范, save_memory 工具在 schema 里硬约束
-      key 格式, harness 拒绝不合规的 key。
-    </p>
-  </div>
-</div>
-
-<div class="card card--trap">
-  <div class="card__head">
-    <span class="card__tag">边界错法 · D</span>
+<p>
+  <strong>设计思想</strong>: <strong>教学版去重</strong> — 不引入
+  embedding / 向量数据库, 用 Jaccard 相似系数 (token 交集 / 并集), 简单到能
+  在 30 行内讲清楚。 教学项目优先可读性, 不追求工业级精度。 看
+  <a href="https://github.com/pingp76/swoopcode/blob/main/src/memory.ts#L288"><code>src/memory.ts</code> 第 288 行的 <code>calculateTokenOverlap</code></a>:
+</p>
+<pre><code class="language-typescript">function calculateTokenOverlap(a: Set&lt;string&gt;, b: Set&lt;string&gt;): number {
+  if (a.size === 0 || b.size === 0) return 0;
+  let intersection = 0;
+  for (const token of a) {
+    if (b.has(token)) intersection++;
+  }
+  const union = a.size + b.size - intersection;
+  return union === 0 ? 0 : intersection / union;
+}</code></pre>
+<p>
+  <strong>实现细节</strong>: 文本先经过 <code>normalizeForSimilarity</code>
+  (看
+  <a href="https://github.com/pingp76/swoopcode/blob/main/src/memory.ts#L261"><code>src/memory.ts</code> 第 261 行</a>)
+  — 小写 + 标点变空格 + 连续空白压缩, 再 <code>tokenizeForSimilarity</code>
+  提取长度 ≥ 2 的 token。 教学版的局限: 不识别同义词 ("中文" vs
+  "汉语"), 不识别语序变化 ("用 pnpm" vs "pnpm 是这个项目的包
+  管理器")。 工业级要做 embedding, 教学版讲 Jaccard 就够了。
+</p>
+<h2 id="prompt-section">buildPromptSection: 启动注入到 system prompt</h2>
+<p>
+  <strong>用途</strong>: SessionStart handler 调
+  <code>memoryManager.buildPromptSection()</code>, 把所有 memory
+  按 type 排序后拼成一段短文本, 注入 system prompt 的最前面
+  (稳定前缀)。
+</p>
+<p>
+  <strong>真实场景</strong>: SessionStart handler 写:
+  <code>const section = memoryManager.buildPromptSection(); if (section) systemPrompt = section + "\n\n" + systemPrompt;</code>。
+  之后 100 轮对话, system prompt 里的 memory 段不变, cache 命中。
+</p>
+<p>
+  <strong>设计思想</strong>: <strong>冷启动一次, 热对话复用</strong> —
+  memory 段在 SessionStart 一次写入 system prompt, 之后所有
+  history 都引用同一份前缀, OpenAI / Anthropic 的 prompt cache
+  按前缀命中 (第 10 章详细展开)。 这是 Reference 章节 "模式 11 ·
+  Cache-friendly 缓存友好" + "模式 12 · Reminder 注入提醒" 的
+  联合应用。
+</p>
+<p>
+  <strong>实现细节</strong>: 注入的 section 应该<strong>短小精悍</strong>
+  — 每条 memory 只取 description, 不取 body。 body 留给 LLM 需要时
+  主动调 <code>memory_read(name)</code> 查全量, 避免 system prompt
+  撑爆。 这就是"摘要 vs 全文" 的设计权衡。
+</p>
+<h2 id="loop-integration">主循环集成: SessionStart handler 读 memory</h2>
+<p>
+  <strong>用途</strong>: MemoryManager 在 Composition Root 创建, 注入
+  SessionStart handler; SessionStart handler 调
+  <code>buildPromptSection()</code> 返回 inject message; 主循环
+  收集 inject message 注入 system prompt (第 08 章延迟注入协议)。
+</p>
+<p>
+  <strong>真实场景</strong>: <code>index.ts</code> 里:
+  <code>createHookRunner({ SessionStart: [createMemoryInjectHandler(memoryManager)] }, logger)</code>。
+  Handler 内部: <code>const section = memoryManager.buildPromptSection(); return section ? { exitCode: 2, message: section } : { exitCode: 0 }</code>。
+</p>
+<p>
+  <strong>设计思想</strong>: <strong>Memory 不知道 Hook, Hook 知道 Memory</strong>
+  — memory.ts 是<strong>纯模块</strong>, 不 import hooks.ts; hooks.ts
+  通过 handler 闭包注入 memoryManager。 这就是 Reference 章节
+  "模式 3 · 依赖注入" 的应用 — 模块之间不直接耦合, 通过 Composition
+  Root 拼装。
+</p>
+<p>
+  <strong>实现细节</strong>: SessionStart handler 是<strong>幂等</strong>
+  的 — 同一 session 内, 第一次 run() 调一次, 之后不再调。 memory
+  段在第一次就注入 system prompt, 后续轮引用同一前缀, cache 命中。
+</p>
+<h2 id="cache-implication">对 prompt cache 的影响: memory 段是 stable prefix 一部分</h2>
+<p>
+  <strong>用途</strong>: 注入 system prompt 的 memory 段应该
+  <strong>位置稳定 + 内容稳定</strong>。 位置 = 拼在 system prompt
+  最前面; 内容 = 同一 session 内不变。
+</p>
+<p>
+  <strong>真实场景</strong>: 团队加了一条 feedback memory, 重启
+  session 后 system prompt 里的 memory 段多了 1 行, 前缀内容变了,
+  旧前缀的 cache 全部作废, 新 session 的第一轮请求 cache
+  <strong>miss</strong>。 正确做法: 接受这一次 miss, 后续 100 轮对话
+  在新前缀上重新命中。 memory 段应该是"冷启动一次, 后续稳定", 不是"每轮
+  变"。
+</p>
+<p>
+  <strong>设计思想</strong>: <strong>冷数据, 不热数据</strong> —
+  memory 是"几乎不变" 的数据, 不应该频繁写。 想要"动态变化" 用
+  Reminder (第 03 章), 不是 memory。 memory 和 Reminder 职责
+  切分: memory = 冷, Reminder = 热。
+</p>
+<p>
+  <strong>实现细节</strong>: 这是和第 05 章 (Skill 稳定前缀) +
+  第 08 章 (Hook SessionStart 注入) 联合的 cache 策略。 三者
+  都强调"前缀稳定", 拼在 system prompt 最前面, OpenAI / Anthropic
+  的 prompt cache 按前缀命中 (prompt 达到约 1024 token 才开始 cache, 具体阈值因模型而异)。
+</p>
+<h2 id="fake-test">fake test: 用 tmp 目录验证 3 条不变量</h2>
+<p>
+  <strong>用途</strong>: Memory 测试不需要真实 memory 目录, 用
+  <code>os.tmpdir()</code> 临时目录即可 — 测的是"文件系统读写 +
+  解析 + 校验" 的集成行为, 不是单函数逻辑。
+</p>
+<p>
+  <strong>真实场景</strong>: 用户写测试验证"加 3 条 memory,
+  调 <code>scan()</code> 拿到 3 条, 索引 MEMORY.md 包含 3 行"。
+  用 tmp 目录, 测完清理, 不污染 home 目录。
+</p>
+<p>
+  <strong>设计思想</strong>: <strong>集成测试</strong>覆盖核心
+  流程 — 写文件 + 读 + 解析 + 校验 + 索引, 一个测试走完整个
+  链路, 比 5 个单测加起来更稳。 看测试示例:
+</p>
+<pre><code class="language-typescript">test("完整流程: create -&gt; scan -&gt; list -&gt; buildPromptSection", () =&gt; {
+  const dir = fs.mkdtempSync(path.join(os.tmpdir(), "mem-test-"));
+  const mgr = createMemoryManager({ memoryDir: dir, logger: noopLogger });
+
+  mgr.create({ name: "chinese", description: "中文回答", type: "user", body: "..." });
+  mgr.create({ name: "commit-format", description: "conventional commit", type: "feedback", body: "..." });
+  mgr.create({ name: "use-pnpm", description: "用 pnpm", type: "project", body: "..." });
+
+  const entries = mgr.scan();
+  expect(entries).toHaveLength(3);
+
+  // 按 type 排序: user → feedback → project (与启动注入顺序一致)
+  expect(entries[0]!.meta.type).toBe("user");
+  expect(entries[1]!.meta.type).toBe("feedback");
+  expect(entries[2]!.meta.type).toBe("project");
+
+  // 索引文件自动生成
+  const indexContent = fs.readFileSync(path.join(dir, "MEMORY.md"), "utf-8");
+  expect(indexContent).toContain("- [feedback] commit-format:");
+  expect(indexContent).toContain("- [user] chinese:");
+});
+
+test("非法 name 拒绝", () =&gt; {
+  const dir = fs.mkdtempSync(path.join(os.tmpdir(), "mem-test-"));
+  const mgr = createMemoryManager({ memoryDir: dir, logger: noopLogger });
+  expect(() =&gt; mgr.create({ name: "../escape", description: "x", type: "user", body: "" }))
+    .toThrow(/Invalid memory name/);
+});
+
+test("frontmatter name != 文件名 警告跳过", () =&gt; {
+  const dir = fs.mkdtempSync(path.join(os.tmpdir(), "mem-test-"));
+  fs.writeFileSync(path.join(dir, "foo.md"), "---\nname: bar\n---\n");
+  const mgr = createMemoryManager({ memoryDir: dir, logger: noopLogger });
+  const entries = mgr.scan();
+  expect(entries).toHaveLength(0);  // 校验失败, 跳过
+});</code></pre>
+<p>
+  <strong>实现细节</strong>: 3 个测试覆盖 (a) 完整读写流程,
+  (b) name 合法性校验, (c) name == 文件名校验。 不需要 mock fs,
+  真实 tmp 目录比 in-memory fs 更稳 — 真实测出 "文件权限 / 编码"
+  这类边界。
+</p>
+<h2 id="common-confusion">常见误解: 4 类不是权限, 是来源</h2>
+<p>
+  <strong>误解 1: "4 类是权限分级?"</strong> 错。 4 类 tag 是
+  <strong>来源</strong> (谁写的), 不是<strong>权限</strong> (谁能
+  改)。 想做权限分级, 加 role-based access, 不是 4 类 tag。
+</p>
+<p>
+  <strong>误解 2: "memory 越多越好?"</strong> 错。 memory 段注入
+  system prompt, 条目多了前缀变长, 每轮都多付 token, 也更容易因改动打破 cache。 应该 5-20 条
+  精炼条目, 不该 100 条碎片。 LLM 加 memory 前应该先
+  <code>findSimilar</code> 检查是否已存在。
+</p>
+<p>
+  <strong>误解 3: "MEMORY.md 手写也可以?"</strong> 不行。 MEMORY.md
+  是自动生成的索引, scan 时跳过。 手写会被忽略, 而且和 create
+  / delete 后自动 rebuild 的内容冲突。
+</p>
+<p>
+  <strong>误解 4: "Jaccard 相似度能识别同义词?"</strong> 错。 教学版
+  Jaccard 只看字面 token 重叠, "中文" 和 "汉语" 不重叠。 想要
+  同义词识别, 引入 embedding, 不在本章范围。
+</p>
+<h2 id="trap">反例梯度: 4 个常见错误</h2>
+<div class="cards-grid">
+  <div class="card card--bad">
+    <div class="card__head"><span class="card__tag">反例 1 · 不校验 name</span></div>
+    <div class="card__body">
+      <p>让 <code>memory_create({name: "../../etc/passwd"})</code> 走通,
+        LLM 写入 <code>memory/../../etc/passwd.md</code>, 路径穿越。
+        错。 任何写入都必须 <code>isValidName</code> 校验, 失败 throw,
+        不让 LLM 重试。 路径穿越是安全漏洞, 不是"格式问题"。</p>
+    </div>
   </div>
-  <div class="card__body">
-    <p><strong>常见错误:</strong>memory 写入不原子, 断电后留半截文件。</p>
-    <p><strong>为什么错:</strong>下次启动解析失败, user 长期偏好丢失。</p>
-    <p>
-      <strong>正确做法:</strong>第 15 章的 atomic-write 模块兜底, write tmp +
-      fsync + rename。
-    </p>
+  <div class="card card--bad">
+    <div class="card__head"><span class="card__tag">反例 2 · 不校验 name == 文件名</span></div>
+    <div class="card__body">
+      <p>LLM 改 frontmatter 把 <code>name: foo</code> 改成
+        <code>name: bar</code>, 但文件名还是 <code>foo.md</code>。
+        表面无害, 实际 read("foo") 找不到 (文件叫 foo 但
+        frontmatter 写 bar), read("bar") 也找不到 (文件叫 foo)。
+        错。 scan 时必须校验 frontmatter name == 文件名, 不一致
+        warn + 跳过。</p>
+    </div>
   </div>
-</div>
-
-<h2 id="validate">如何验证 (本章 Validation 卡片)</h2>
-<div class="card card--validation">
-  <div class="card__head">
-    <span class="card__tag">Validation · 第 09 章</span>
+  <div class="card card--bad">
+    <div class="card__head"><span class="card__tag">反例 3 · 解析失败抛错</span></div>
+    <div class="card__body">
+      <p>用户手改了一个 memory 文件, frontmatter 少一个字段, parser
+        throw。 agent 启动失败。 错。 解析失败应该 warn 日志 + 跳过
+        单个文件, 不 throw, 不阻止 agent 启动。 harness 容忍坏文件,
+        用户修好之后下次启动自动恢复。</p>
+    </div>
   </div>
-  <div class="card__body">
-    <p>
-      <strong>memory 不进 history:</strong>save_memory 调用后, history 末尾有
-      tool message "Saved user.preference.style", 但 history.getMessages()
-      中不含 memory value 内容。
-    </p>
-    <p>
-      <strong>SessionStart 注入 system prompt:</strong>fake memoryStore 预设
-      user.preference.style = "简洁", 跑完一次 run() 后,
-      history.getSystemPrompt() 含 "# User Memory" 和 "简洁"。
-    </p>
-    <p>
-      <strong>scope 隔离:</strong>fake memoryStore 同时有 user 级和 project
-      级条目, renderForPrompt("user") 不含 project 级内容。
-    </p>
-    <p>
-      <strong>key 命名规范:</strong>save_memory 调用 key = "free_style" (不符合
-      &lt;scope-prefix&gt;.&lt;category&gt;.&lt;name&gt; 规范), 写 error tool
-      message, 拒绝写入。
-    </p>
-    <p>
-      <strong>memory 持久化:</strong>fake memoryStore 暴露 spy, 跑完 save_memory
-      后 spy.set 被调用, 参数含 scope / key / value。
-    </p>
+  <div class="card card--bad">
+    <div class="card__head"><span class="card__tag">反例 4 · memory 段写在 history 而非 system</span></div>
+    <div class="card__body">
+      <p>把 <code>buildPromptSection()</code> 的结果 append 到
+        <code>history[]</code> 第一条 user message, 而不是注入
+        <code>systemPrompt</code>。 表面能跑, 实际破坏 prompt cache
+        (history 增长每次都变, system 不变)。 错。 memory 段必须
+        注入 system prompt, 让它成为稳定前缀的一部分。</p>
+    </div>
   </div>
 </div>
-
-<h2 id="lookback">回望第 00–08 章: 哪些原则在本章兑现了</h2>
+<h2 id="validate">Validation: 4 条不变量检验</h2>
+<ol>
+<li>
+<strong>写读对称</strong>: <code>create()</code> 写入的条目,
+    <code>read(name)</code> 必须能读回, 字段 (description / type /
+    body) 完全一致。 验证: tmp 目录, create 一条带特殊字符 body
+    (含中文 + 换行 + JSON), read 回来字符串相等。
+  </li>
+<li>
+<strong>name 合法性</strong>: <code>create({name: "Foo/../bar"})</code>
+    必须 throw, 错误信息含 "Invalid memory name"。 验证: 单测覆盖
+    大写 / 斜杠 / 点 / 空格 / 空字符串 5 类非法 name。
+  </li>
+<li>
+<strong>name == 文件名</strong>: 写入 <code>foo.md</code> 但
+    frontmatter <code>name: bar</code>, scan 跳过该文件 + warn
+    日志。 验证: tmp 目录, 写一个不一致的文件, scan 拿不到。
+  </li>
+<li>
+<strong>索引派生</strong>: create / delete 后 MEMORY.md 内容自动
+    rebuild, 顺序稳定 (按 type → name 排序), 多次 rebuild 内容
+    字节相等。 验证: 写 3 条 memory, rebuild, 再 rebuild, 字节
+    相等 (git diff 显示 0 行)。
+  </li>
+</ol>
+<h2 id="lookback">回望: 哪些原则在本章兑现了</h2>
 <ul>
-  <li>
-    <strong>事实与视图分离:</strong>memory 独立于 history, 是"长期事实", history
-    是"对话上下文"。这条原则在第 06 章 compress 隐含, 本章显式兑现。
+<li>
+<strong>工厂 + 闭包</strong>: <code>createMemoryManager</code> 把
+    状态藏在闭包, 外部只通过 MemoryManager 接口访问。
+  </li>
+<li>
+<strong>派生数据</strong>: MEMORY.md 索引是 scan 派生, 不手写,
+    自动 rebuild, git diff 干净。
   </li>
-  <li>
-    <strong>稳定前缀原则:</strong>memory 拼到 system prompt 的固定位置, prefix
-    锚点不变, 为第 10 章 cache-friendly 布局做准备。
+<li>
+<strong>3 重身份校验</strong>: name 合法字符 + type 合法枚举 +
+    name == 文件名, 防路径穿越 / 类型错乱 / 身份错位。
   </li>
-  <li>
-    <strong>职责单一:</strong>memory 模块只管"读 / 写 memory 文件", 不参与
-    messages 构建 (那是 prepareMessages 的事)。
+<li>
+<strong>宽容失败</strong>: 解析失败 warn + 跳过, 不 throw, agent
+    启动不因单条坏 memory 失败。
   </li>
-  <li>
-    <strong>错误降级原则:</strong>memory 读失败 (文件不存在) 降级为"空 memory",
-    不让主 loop 崩。
+<li>
+<strong>冷数据进稳定前缀</strong>: memory 段注入 system prompt
+    而非 history, 保持 cache 命中。
   </li>
 </ul>
-
 <h2 id="forward">前瞻张力: 留给后续章节</h2>
 <dl class="defs">
-  <dt>memory 内容太长</dt>
-  <dd>
-    第 10 章 cache-friendly 布局会展开, memory 内容进 system prompt 时按 scope
-    分块, 不全量加载。
-  </dd>
-  <dt>memory 跨进程冲突</dt>
-  <dd>
-    第 15 章 atomic-write 兜底, memory 写新文件 + rename, 不直接改原文件。
+<dt>多 session 一致性</dt>
+<dd>
+    memory 持久化在磁盘, 跨 session 生效。 但 subagent 嵌套时,
+    子 agent 是否继承父 agent 的 memory? 当前策略: 继承。
+    这可能让子 agent 看到不该看到的 user 偏好 (如"中文回答" 是
+    用户的, 子 agent 跑英文项目不该看)。 第 11 章 Recovery 讨论
+    subagent 隔离时, memory 隔离会作为子话题。
   </dd>
-  <dt>memory 落盘决策点</dt>
-  <dd>
-    第 12 章 task 区分"会话内 memory" (本章的 save_memory) 和"项目级 task"
-    (例如"未来要重构 X"), task 进 plan 文件, 不进 memory。
+<dt>并发写入冲突</dt>
+<dd>
+    多个 subagent 同时 <code>create()</code> 同一 name, 后写
+    覆盖先写, 丢 createdAt。 当前不处理。 第 13 章 Async Run
+    讨论并发时, memory 写入是冲突点之一。
   </dd>
-  <dt>memory 审计</dt>
-  <dd>
-    第 15 章 transcript 会记录"memory 何时被读、被写", 用于审计 user 偏好变更。
+<dt>MEMORY.md 是单文件</dt>
+<dd>
+    100 条 memory 时 MEMORY.md 仍然是一份索引, 不分裂。
+    rebuild 100 条 ~1ms, 规模 OK。 但要监控条目数, 超过 50
+    该提醒用户"该清理了"。
   </dd>
 </dl>
-
-<h2 id="vibe-coding-09">本次如何 vibe code: 第 09 章的三件套</h2>
-
-<h3 id="vibe-feed-09">拆卡: 4 轮迭代的具体产物</h3>
-<ol>
-  <li>
-    <strong>第 1 轮 · 接口</strong>。让 LLM 给出 <code>MemoryStore</code> /
-    <code>MemoryEntry</code> / <code>MemoryScope</code> 三个 interface, 以及 key
-    命名规范文档。本轮不写实现。
-  </li>
-  <li>
-    <strong>第 2 轮 · 接线</strong>。让 LLM 给出 <code>index.ts</code> 接线,
-    <code>createMemoryStore()</code> 接受 scope 目录, SessionStart 注入仍是 stub
-    (永远注入空 memory)。本轮 review 重点: memoryStore 实例在
-    <code>index.ts</code> 只 new 一次。
-  </li>
-  <li>
-    <strong>第 3 轮 · 边界</strong>。让 LLM 写 createMemoryStore + save_memory
-    工具 + agent.SessionStart 注入。本轮 review 重点: scope 隔离, key
-    命名规范校验, atomic write (第 15 章会展开)。
-  </li>
-  <li>
-    <strong>第 4 轮 · 验证</strong>。让 LLM 写
-    <code>test/memory.test.ts</code>。本轮 review 重点: "memory 不进 history" 和
-    "key 命名规范" 两条必须有反向断言。
-  </li>
-</ol>
-
-<h3 id="vibe-review-09">Review: 第 09 章专属 checklist</h3>
-<ol>
-  <li>
-    <strong>memory 不进 history。</strong>save_memory 调用后
-    history.getMessages() 不含 memory value。验证:
-    <code>grep -n 'role: "user".*memory' src/agent.ts</code> 应当 0 行 (除
-    system prompt 注入路径)。
-  </li>
-  <li>
-    <strong>scope 隔离。</strong>renderForPrompt("user") 不返回 project
-    级内容。验证: Validation 卡片"scope 隔离" 那条测试通过。
-  </li>
-  <li>
-    <strong>key 命名规范。</strong>save_memory 拒绝不合规 key。验证: Validation
-    卡片"key 命名规范" 那条测试通过。
-  </li>
-  <li>
-    <strong>SessionStart 注入位置正确。</strong>memory 拼到 system prompt, 不写
-    history。验证: 跑完 run() 后 history.getSystemPrompt() 含 "# User Memory"。
-  </li>
-  <li>
-    <strong>memoryStore 是工厂, 不在 agent 内 new。</strong>验证:
-    <code>grep -n 'new MemoryStore' src/</code> 应当 0 行。
-  </li>
-</ol>
-
-<h3 id="vibe-debug-09">调试: 第 09 章典型伪装</h3>
-<ol>
-  <li>
-    <strong>伪装 A · memory 写进 history messages。</strong>症状: save_memory
-    工具内部
-    <code>history.add({role: "user", content: \`Saved: ${value}\`})</code
-    >。验证: Validation 卡片"memory 不进 history" 那条测试失败。
-  </li>
-  <li>
-    <strong>伪装 B · scope 混用, 渲染时不过滤。</strong>症状:
-    <code>renderForPrompt(scope)</code> 不过滤 scope, 总是返回所有 memory。验证:
-    Validation 卡片"scope 隔离" 那条测试失败。
-  </li>
-  <li>
-    <strong>伪装 C · LLM 写 memory 时 key 自由发挥。</strong>症状: save_memory
-    工具对 key 不校验, LLM 写 "my style" / "user_pref_1" 等自由 key。验证:
-    Validation 卡片"key 命名规范" 那条测试通过 (写不合规 key 应当拒绝)。
-  </li>
-</ol>
-
-<h3 id="vibe-iterate-09">迭代: 第 09 章 4 个 commit 节点</h3>
-<ol>
-  <li>
-    <code
-      >feat(ch09): 钉 MemoryStore / MemoryEntry / MemoryScope 接口与 key
-      规范</code
-    >
-    —— tsc 通过, 无实现。
-  </li>
-  <li>
-    <code>feat(ch09): createMemoryStore 工厂 + save_memory 工具 stub</code> ——
-    tsc 通过, 写空文件。
-  </li>
-  <li>
-    <code
-      >feat(ch09): agent.SessionStart 注入 + scope 隔离 + key 校验 + atomic
-      write</code
-    >
-    —— 跑通 Validation 卡片前 4 条。
-  </li>
-  <li><code>test(ch09): memory 持久化 spy 验证</code> —— 全绿。</li>
-</ol>
-
 <h2 id="prompt-card">Prompt Card (本章任务)</h2>
 <div class="card card--prompt">
   <div class="card__head">
     <span class="card__tag">Prompt Card · 第 09 章</span>
-    <button class="card__copy" type="button" data-copy-card>复制</button>
+    <button class="card__copy" data-copy-card="" type="button">复制</button>
   </div>
   <div class="card__body">
-    <p>
-      <strong>目标:</strong>实现 Memory 模块, 区分 user 级与 project 级,
-      SessionStart 注入 system prompt, LLM 可通过 save_memory 工具更新 memory。
-    </p>
-    <p>
-      <strong>场景:</strong>用户在新会话说 "我喜欢简洁解释", agent 调
-      save_memory("user", "user.preference.explanation_style", "简洁"); 下次会话
-      SessionStart 时 system prompt 含 "用户偏好: 简洁", LLM
-      在新会话能引用这个事实。
-    </p>
-    <p>
-      <strong>模块:</strong> <code>src/memory.ts</code> (新) 暴露
-      <code>createMemoryStore()</code>;
-      <code>src/tools/save_memory.ts</code> (新) 实现工具;
-      <code>src/agent.ts</code> 改 SessionStart 注入 memory 到 system prompt;
-      <code>src/index.ts</code> 接线 memoryStore。
-    </p>
+    <p><strong>目标:</strong> 给 harness 加 Memory 系统, 4 类 tag 持久化
+      到 memory/ 目录, SessionStart handler 注入 system prompt。</p>
+    <p><strong>场景:</strong> 用户加 3 条 memory — "中文回答" (user) /
+      "conventional commit" (feedback) / "用 pnpm" (project), 启动
+      session, system prompt 多了 3 行摘要, 后续 100 轮对话 cache
+      命中。</p>
+    <p><strong>模块:</strong> <code>src/memory.ts</code> (新) 暴露
+      <code>createMemoryManager({memoryDir, logger})</code>;
+      <code>src/index.ts</code> (改) Composition Root 创建 + 注入
+      SessionStart handler; <code>src/system-prompt.ts</code> (改)
+      SessionStart 阶段调 <code>buildPromptSection()</code> 注入。</p>
     <p><strong>边界 (LLM 必须遵守的 checklist):</strong></p>
     <ul>
-      <li>memory 不写进 history.messages, 只在 system prompt 注入</li>
-      <li>scope 区分 user / project, renderForPrompt 按 scope 过滤</li>
-      <li>
-        key 必须符合
-        <code>&lt;scope-prefix&gt;.&lt;category&gt;.&lt;name&gt;</code> 规范,
-        不合规拒绝
-      </li>
-      <li>memory 落盘走 atomic write (write tmp + fsync + rename)</li>
-      <li>memoryStore 工厂, Composition Root 唯一 new</li>
+      <li>4 类 tag 固定枚举 user / feedback / project / reference, 不可新增</li>
+      <li>name 只允许 <code>[a-z0-9_-]+</code>, 写时校验, 失败 throw</li>
+      <li>type 必须属于 4 类枚举, 写时校验, 失败 throw</li>
+      <li>frontmatter <code>name</code> 必须等于文件名 (去 .md), scan 时校验, 失败 warn + 跳过</li>
+      <li>解析失败 (frontmatter 缺字段 / type 非法 / 字段含换行) warn + 跳过, 不 throw</li>
+      <li>MEMORY.md 索引是派生数据, 自动 rebuild, scan 时跳过, 用户不手写</li>
+      <li>buildPromptSection() 只取 description, 不取 body, 控制注入体积</li>
+      <li>Jaccard 相似度 ≥ 0.5 返回相似条目, 教学版不引入 embedding</li>
     </ul>
-    <p><strong>验证 (用 fake memoryStore + spy, 逐条落到 vitest):</strong></p>
+    <p><strong>验证 (用 tmp 目录 + vitest, 逐条断言):</strong></p>
     <ul>
-      <li>save_memory 调用后, history.getMessages() 不含 memory value</li>
-      <li>
-        fake 预设 user.preference.style = "简洁", SessionStart 后 system prompt
-        含 "简洁"
-      </li>
-      <li>
-        fake 同时有 user 级和 project 级, renderForPrompt("user") 不含 project
-        级
-      </li>
-      <li>
-        save_memory 调用 key = "free_style", 写 error tool message, 拒绝写入
-      </li>
-      <li>spy 验证 memoryStore.set 被调用, 参数含 scope / key / value</li>
+      <li>create → scan → list → buildPromptSection 完整流程, 3 条 memory 按 type 排序</li>
+      <li>create({name: "../escape"}) throw, 错误信息含 "Invalid memory name"</li>
+      <li>手写 <code>foo.md</code> 但 frontmatter name=bar, scan 返回 0 条, warn 日志</li>
+      <li>create / delete 多次后, MEMORY.md 字节稳定, 排序一致</li>
     </ul>
   </div>
 </div>
-
 <h2 id="practice">本章练习</h2>
 <ol>
-  <li>
-    故意把 memory value 写进 history.messages, 跑测试, 看"memory 不进 history"
-    是否抓到。
+<li>
+    故意不写 name 合法性校验, 跑 <code>create({name: "../escape"})</code>
+    测试, 看"路径穿越" 是否抓到 (文件被写到 memory 目录外)。
+  </li>
+<li>
+    故意不写 name == 文件名校验, 手改 <code>foo.md</code> 的
+    frontmatter name=bar, 跑 scan, 看"身份错位" 是否抓到
+    (read 找不到任何条目)。
   </li>
-  <li>让 renderForPrompt 不过滤 scope, 跑测试, 看"scope 隔离" 是否抓到。</li>
-  <li>
-    在 save_memory 工具里不校验 key 格式, 跑测试, 看"key 命名规范" 是否抓到。
+<li>
+    故意让 parser throw (不 catch), 跑 scan 一个 frontmatter 缺字段
+    的文件, 看"宽容失败" 是否抓到 (agent 启动失败 vs. warn +
+    跳过, 启动成功)。
+  </li>
+<li>
+    把 <code>buildPromptSection()</code> 的结果 append 到 history
+    第一条 user message, 跑 SessionStart 测试, 看"memory 段进
+    稳定前缀" 是否抓到 (cache prefix 增长, 后续轮 miss)。
   </li>
 </ol>
-
 <h2 id="summary">本章小结</h2>
 <p>
-  本章给 harness 加了 Memory 模块, 区分 user 级与 project 级长期事实。 memory
-  独立于 history, 通过 SessionStart 注入 system prompt 的固定 位置 (prefix 稳定,
-  内容动态), 既给 LLM 长期事实, 又不破坏 cache 友好布局。下一章 (第 10 章)
-  我们会展开 prompt cache 友好布局的 完整设计: 哪些进稳定前缀, 哪些走 reminder,
-  哪些靠 history。
+  Memory 是给 system prompt 的<strong>冷数据</strong>, 4 类 tag 把
+  来源分清楚。 核心是 5 个设计:
 </p>
-
-<h2 id="next">下一章伏笔</h2>
+<ul>
+<li>
+<strong>4 类 tag</strong>: user / feedback / project / reference,
+    固定枚举, 不可新增。
+  </li>
+<li>
+<strong>Markdown 文件</strong>: frontmatter 5 字段 + body, 文件名
+    = <code>${name}.md</code>, 双重校验。
+  </li>
+<li>
+<strong>3 重身份校验</strong>: name 合法字符 + type 合法枚举 +
+    name == 文件名, 缺一不可。
+  </li>
+<li>
+<strong>派生索引</strong>: MEMORY.md 自动 rebuild, 按 type → name
+    稳定排序, git diff 干净。
+  </li>
+<li>
+<strong>Jaccard 教学版去重</strong>: 不引入 embedding, 30 行讲清楚,
+    容忍"中文 vs 汉语" 的局限。
+  </li></ul>
 <p>
-  第 09 章把 memory 注入 system prompt, 但还没系统讲"如何让 system prompt 稳定 +
-  动态状态走 reminder + history 自然增长" 这套布局。 下一章 (第 10 章)
-  会基于前九章的所有设计选择 (memory 锚点、reminder 标签、tool 描述作为稳定前缀)
-  总结出 prompt cache 友好布局的完整 原则, 并讨论 token 成本和 cache hit rate
-  的权衡。
+  下一章 (第 10 章) 展开 prompt cache 的具体策略 — 稳定前缀的边界
+  在哪, 哪些字段"必须稳定", 哪些"可以变化", cache miss 后的
+  fallback。
 </p>
diff --git a/tutorial/chapters/10-cache.html b/tutorial/chapters/10-cache.html
index c31cdfc..fc1d80f 100644
--- a/tutorial/chapters/10-cache.html
+++ b/tutorial/chapters/10-cache.html
@@ -1,548 +1,628 @@
-<p class="article__eyebrow">第 10 章 · Prompt Cache 友好布局</p>
-<h1 class="article__title">稳定前缀 + 动态状态: Prompt Cache 友好布局</h1>
+<p class="article__eyebrow">第 10 章 · Prompt cache 稳定性</p>
+<h1 class="article__title">Cache: 怎么让 OpenAI / Anthropic 的 prompt cache 一直命中</h1>
 <p class="article__lede">
-  前面九章让 harness 拥有完整能力, 但每次 LLM 调用都从头发送完整 messages, token
-  成本居高不下。这一章系统讲 prompt cache 友好布局: 哪些进稳定前缀, 哪些走
-  reminder, 哪些靠 history 自然增长。基于前九章的所有设计选择 (memory
-  锚点、reminder 标签、tool 描述作为稳定前缀) 总结出完整原则。
+  第 05 章 Skill 强调"稳定前缀"、第 08 章 Hook SessionStart 注入
+  也说"稳定前缀"、第 09 章 Memory 注入 system prompt 仍然说"稳定
+  前缀" — 但到底什么叫<strong>稳定</strong>? 怎么<strong>观察</strong>
+  它稳定不稳定? 这一章加 <code>src/cache-debug.ts</code> 模块 —
+  计算 system prompt / tools / 稳定前缀的 SHA256 hash, 每轮请求前
+  对比"上次 vs 这次", 输出 <code>[cache] systemPrompt=stable tools=stable prefix=stable</code>
+  这种单行日志, 让开发者一眼看出"哪变了, 改一次成本 ×10"。
+  读完后, 你能讲清"稳定前缀 = system prompt + tools" 的边界, 并能
+  用 fake tracker 验证"key 排序一致" 的稳定序列化协议。
 </p>
-
-<nav id="article-inline-toc" class="article__meta" aria-label="页内小节"></nav>
-
-<hr class="rule" />
-
-<h2 id="delta-from-09">在第 09 章基础上改了什么</h2>
-<p>
-  这一章不新增模块, 而是基于已有的 memory (第 09 章) / system prompt / tools (第
-  02 章) / reminders (第 03/04/05 章) / history (第 01 章) 重新组织 prompt
-  拼装顺序, 最大化 LLM provider 的 prompt cache hit rate。 改动集中在 2 个文件:
-  <code>src/system-prompt.ts</code> (新, 之前散落在 history.ts) 和
-  <code>src/stable-context.ts</code> (新, 负责 stable snapshot 的 invalidate
-  逻辑)。
-</p>
-<div class="source-links" aria-label="本章 GitHub 永久链接">
-  <a
-    class="source-link"
-    href="https://github.com/pingp76/swoopcode/blob/main/src/system-prompt.ts"
-    target="_blank"
-    rel="noreferrer"
-    >1. src/system-prompt.ts: System prompt 组装 (新)</a
-  >
-  <a
-    class="source-link"
-    href="https://github.com/pingp76/swoopcode/blob/main/src/stable-context.ts"
-    target="_blank"
-    rel="noreferrer"
-    >2. src/stable-context.ts: Stable context snapshot (新)</a
-  >
-  <a
-    class="source-link"
-    href="https://github.com/pingp76/swoopcode/blob/main/src/cache-debug.ts"
-    target="_blank"
-    rel="noreferrer"
-    >3. src/cache-debug.ts: Cache hit rate 监控 (新)</a
-  >
-</div>
-
-<h2 id="author-thinking">作者怎么想的: 这一章的思考链</h2>
-<dl class="defs">
-  <dt>想清楚现象</dt>
-  <dd>
-    长任务下, harness 跑 50 轮 LLM 调用, 总 token 成本是单轮的 50 倍。 但实际上
-    system prompt + tool 描述 + 前 20 轮 messages 几乎不变。 现象是"harness
-    重复发送相同内容, 浪费 token"。
-  </dd>
-  <dt>想反例</dt>
-  <dd>
-    最朴素的反例是"system prompt 里塞 TODO 状态 / skill 状态 / memory
-    当前值"。这有两个问题: 一是每次 LLM 调用 system prompt 字符串不同, 整个
-    prompt cache 失效, 二是 token 成本按"全量" 计费, 不按 "增量" 计费。
-  </dd>
-  <dt>想接口和不变量</dt>
-  <dd>
-    接口:
-    <code
-      >interface SystemPromptProvider { build(query), buildTurnReminders(query)
-      }</code
-    >。 不变量四条: (1) system prompt 拼装顺序固定, 不依赖 query 内容, (2)
-    动态状态 (TODO / skill 激活集合) 走 reminder 消息, 不进 system prompt
-    字符串, (3) tool 描述 (按 skill 集合) 在一次 run() 内一旦确定就稳定, (4)
-    history 是自然增量, 不试图"重写为压缩态"。
-  </dd>
-  <dt>想怎么验证</dt>
-  <dd>
-    fake LLM 跑 3 轮, 每次记录 system prompt + tools 字段的 hash, 断言前 2 轮
-    hash 相同 (稳定前缀), 第 3 轮 (有 reminder 注入) hash 仍可不同但 tools
-    字段不变。
-  </dd>
-</dl>
-
-<h2 id="observe-first">先观察: 两段故意有气味的实现</h2>
-
-<div class="note">
-  <p class="note__title">观察 1 · 动态状态进 system prompt</p>
-  <pre class="code-block"><code>// 教学简化版, 仓库真实实现见 GitHub 永久链接
-// 错误: 动态状态拼进 system prompt
-const systemPrompt = `${basePrompt}\n\nCurrent TODO: ${JSON.stringify(todos)}\n\nActive skills: ${activeSkills.join(", ")}`;</code></pre>
-  <p><strong>问:</strong>为什么不直接拼字符串?</p>
-  <p>
-    <strong>答:</strong>每次 LLM 调用的 system prompt 字符串都不同, 整个 prompt
-    cache 失效。LLM provider (Anthropic / OpenAI) 的 cache 是按 前缀字符串匹配,
-    system prompt 一变, 后面所有 token 都要重新计费。 正确做法: 动态状态走
-    reminder (user 消息), system prompt 保持稳定。
-  </p>
-</div>
-
-<div class="note">
-  <p class="note__title">观察 2 · tools 字段每次都重新拼</p>
-  <pre class="code-block"><code>// 教学简化版
-const tools = activeSkills.tools().map(t =&gt; ({ name: t.name, description: t.description, schema: t.schema }));
-// 即使 skill 集合没变, 每次拼出的对象字面量都不同 (引用不同)</code></pre>
-  <p><strong>问:</strong>对象字面量内容相同, 为什么 cache 仍然不命中?</p>
-  <p>
-    <strong>答:</strong>JSON 序列化后内容相同, 多数 provider 是按 JSON
-    字符串前缀匹配, 不是对象引用。问题是: 如果代码逻辑 (例如 skill 加载顺序变化)
-    让 JSON 字符串顺序变了, cache 也会失效。 正确做法: tools 数组的拼装顺序稳定,
-    加载 skill 时不重新排序。
-  </p>
-</div>
-
-<h2 id="layout">稳定前缀 / 动态状态 / 自然增量 三段布局</h2>
-<p>这一节是本章的核心。基于前九章的设计选择, 整理出一份"哪些进哪里" 的清单:</p>
-
-<h3>稳定前缀 (system prompt + tools)</h3>
-<ul>
-  <li>
-    <strong>Base system prompt</strong>: 静态指令, 在
-    <code>history.setSystemPrompt()</code> 时一次性写入, 后续不变。
-  </li>
-  <li>
-    <strong>Memory 锚点</strong>: "# User Memory" / "# Project Memory" 标题,
-    加上当前值 (变也只变这一段, prefix 锚点稳定)。
-  </li>
-  <li>
-    <strong>Tool 描述 (按 skill 集合)</strong>: 一旦 skill 集合在 run() 内确定,
-    tools 数组不再变化。
-  </li>
-</ul>
-
-<h3>动态状态 (reminder 消息)</h3>
-<ul>
-  <li>
-    <strong>TODO 状态</strong>: 每轮 reminder 注入, 标签
-    <code>&lt;system-reminder source="todo"&gt;</code>。
+<nav aria-label="页内小节" class="article__meta" id="article-inline-toc"></nav>
+<hr class="rule"/>
+<h2 id="real-failure">真实失败故事: 改一行 system prompt 账单 ×10</h2>
+<p>
+  写代码之前, 先看一个真实痛点。 用户的 team 上线 harness 跑
+  生产, 第一个月账单正常, 第二个月改了 3 行 system prompt 加
+  "新规则: 写完代码必须 npm test", 第三个月账单突然 ×10。
+</p>
+<ol>
+<li>
+<strong>症状</strong>: prompt token 从 2k 涨到 20k, 输出 token 不变,
+    总成本 ×10。
   </li>
-  <li>
-    <strong>Skill 激活集合变化</strong>: 加载新 skill 后追加 reminder 描述。
+<li>
+<strong>根因</strong>: system prompt 变了, OpenAI 的 prompt cache
+    (Anthropic 类似) 失效, 每次请求都从 0 重新计费输入 token。
   </li>
-  <li>
-    <strong>子智能体输出</strong>: 父 agent 调 spawn_subagent 后追加 user 消息
-    (子智能体输出)。
+<li>
+<strong>误诊</strong>: team 以为是 LLM 流量涨了, 加机器, 没用 —
+    问题在<strong>输入侧</strong>重复计费。
   </li>
-  <li>
-    <strong>Async run 通知</strong>: 第 13 章, 后台 agent 完成后注入 reminder。
+<li>
+<strong>真正的修法</strong>: 加 cache debug 工具, 每轮请求前 hash
+    system prompt + tools, 输出"变了" 警告。 改 system prompt 时
+    先看 hash, 确认是"故意改" 还是"无意改"。
   </li>
-</ul>
-
-<h3>自然增量 (history)</h3>
-<ul>
-  <li><strong>用户消息</strong>: 每次 run() 追加一条 user message。</li>
-  <li><strong>助手消息</strong>: 每次 LLM 响应追加一条 assistant message。</li>
-  <li><strong>工具结果</strong>: 每次工具执行追加一条 role: "tool" 消息。</li>
-</ul>
+</ol>
 <p>
-  history 是 LLM 视角的"对话上下文", 不试图"重写为压缩态"。压缩只 发生在
-  prepareMessages() 阶段 (第 06 章), history 内部保留原始。
+  朴素想法 1: "OpenAI 自动 cache, 我不用管?" 错。 OpenAI 的
+  cache 按<strong>前缀 hash</strong> 命中, 前缀变化 cache miss,
+  重新计费。 harness 必须自己保证前缀稳定。
 </p>
-
-<h2 id="snapshot">stable snapshot: 显式声明"这部分稳定"</h2>
-<p>
-  当 LLM 修改了 memory (例如 save_memory 工具调用) 时, system prompt 中的 "#
-  User Memory" 段会变。如果每次 save_memory 都让 system prompt 变, cache
-  仍然失效。stable-context 模块解决这个问题: 在 memory 内容 之外, 把 "# User
-  Memory" / "# Project Memory" 标题与"当前值"作为 一个 snapshot, 只有当 snapshot
-  内容真的变了 (字符串 hash 不同) 才 更新 system prompt, 否则复用上次的 system
-  prompt 字符串。
-</p>
-<pre class="code-block"><code>// 教学简化版, 仓库真实实现见 GitHub 永久链接
-export function createStableContextManager(history: History) {
-  let lastSnapshot = "";
+<p>
+  朴素想法 2: "把 system prompt 写死, 永远不变?" 也不行。 用户
+  加 memory / skill / todo reminder 时, system prompt 必须
+  反映这些"几乎不变" 的状态, 不能写死成"初始版本"。
+</p>
+<p>
+  正确做法: 加 <code>src/cache-debug.ts</code> — 每轮 LLM 调用
+  前 hash (system prompt + tools), 与上次对比, 输出 "stable /
+  changed" 标记。 这是 Reference 章节 "模式 11 · Cache-friendly
+  缓存友好" + "模式 18 · Transcript-First 透传优先" 的联合
+  应用。
+</p>
+<h2 id="stable-prefix-boundary">稳定前缀边界: system prompt + tools</h2>
+<p>
+  <strong>用途</strong>: OpenAI / Anthropic 的 prompt cache 按
+  <strong>前缀</strong>命中, 前缀 = system prompt + tools 定义。
+  任何"应该稳定的" 东西必须落在前缀内, 任何"每轮会变的" 必须
+  在 history 里。 边界划错, 成本就乱。
+</p>
+<p>
+  <strong>真实场景</strong>: team 把 todo reminder 写在 system
+  prompt, todo 状态每轮变, system prompt 每轮变, cache 全 miss。
+  正确做法: todo reminder 写在 user message (历史), system prompt
+  写"使用 TodoManager 跟踪任务" (规则), 不写"当前 todo 列表"
+  (状态)。
+</p>
+<p>
+  <strong>设计思想</strong>: <strong>边界 = 稳定 vs 变化</strong>
+  — 写 cache-debug 模块的<strong>第一件事</strong>就是定义"稳定
+  前缀 = 什么"。 看
+  <a href="https://github.com/pingp76/swoopcode/blob/main/src/cache-debug.ts#L119"><code>src/cache-debug.ts</code> 第 119 行的 <code>computeStablePrefixHash</code></a>:
+</p>
+<pre><code class="language-typescript">function computeStablePrefixHash(
+  messages: ChatCompletionMessageParam[],
+  tools: ChatCompletionTool[],
+): string {
+  const systemMsg = messages.find((m) =&gt; m.role === "system");
+  const systemContent =
+    typeof systemMsg?.content === "string"
+      ? systemMsg.content
+      : JSON.stringify(systemMsg?.content ?? "");
+  return sha256(systemContent + stableStringify(tools));
+}</code></pre>
+<div class="figure figure--stack">
+  <div class="figure__title">图 1 · 稳定前缀的边界</div>
+  <div class="flow-stack">
+    <div class="flow-stack__layer flow-stack__layer--stable">
+      <div class="flow-stack__label">稳定前缀 (cache 命中区)</div>
+      <div class="flow-stack__body">system prompt (含 memory 段 / skill 段 / 工具规则) + tools 定义 (6 个工具的 JSON schema)。 同一 session 内保持不变, cache 命中。</div>
+    </div>
+    <div class="flow-stack__layer flow-stack__layer--dynamic">
+      <div class="flow-stack__label">变化区 (每轮重新计费)</div>
+      <div class="flow-stack__body">history: user / assistant / tool messages。 每轮增长, LLM 必须读全量, 没办法 cache (因为 cache 是前缀)。</div>
+    </div>
+  </div>
+</div>
+<p>
+  <strong>实现细节</strong>: 教学版定义"稳定前缀 = system prompt
+  content + tools JSON"。 OpenAI 实际的 cache 算法可能更复杂
+  (按 token 块 cache, 不一定整个前缀), 但教学版讲 system
+  prompt + tools 这一段已经够用, 不假装"我知道 OpenAI 内部
+  怎么 cache"。
+</p>
+<h2 id="hash">Hash 计算: SHA256 + 稳定序列化</h2>
+<p>
+  <strong>用途</strong>: 缓存调试模块不直接读 LLM API 的 cache
+  命中率 (那需要后端日志), 而是<strong>本地 hash 对比</strong> —
+  计算 "上一轮 system prompt + tools" 和 "这一轮" 的 hash, 一致
+  就是稳定, 不一致就是变化。 这是教学版的 cache 观测, 不假装
+  是真实 cache hit rate。
+</p>
+<p>
+  <strong>真实场景</strong>: 开发者改 system prompt, 重启 harness,
+  看到日志: <code>[cache] systemPrompt=changed tools=stable prefix=changed</code>。
+  知道 system prompt 改了, 决定是"故意改" 还是"无意改"。
+</p>
+<p>
+  <strong>设计思想</strong>: <strong>本地可观测</strong> — cache 调试
+  跑在 harness 进程内, 不调外部 API, 不读 LLM 响应头, 只
+  算 hash 对比。 简单, 可靠, 教学友好。 看
+  <a href="https://github.com/pingp76/swoopcode/blob/main/src/cache-debug.ts#L90"><code>src/cache-debug.ts</code> 第 90 行的 <code>sha256</code></a>:
+</p>
+<pre><code class="language-typescript">function sha256(input: string): string {
+  return createHash("sha256").update(input, "utf-8").digest("hex").slice(0, 8);
+}</code></pre>
+<p>
+  <strong>实现细节</strong>: SHA256 取前 8 个 hex 字符 (32 bit, 4 字节) —
+  足够区分"变了" vs "没变", 不需要完整的 64 个 hex 字符。 两个不同输入
+  截断后恰好相同的概率约 1/2³² ≈ 2.3 × 10⁻¹⁰, 调试足够。 用 Node
+  内置 <code>crypto</code> 模块, 不引入外部依赖。
+</p>
+<h2 id="stable-stringify">stableStringify: key 顺序无关的对象序列化</h2>
+<p>
+  <strong>用途</strong>: 工具定义里对象的 key 顺序变了 (例如某次
+  重构把 <code>description</code> 写到了 <code>name</code> 前面), hash 应该
+  <strong>不变</strong> (内容没变, key 顺序不影响语义)。 稳定序列化按 key
+  排序后输出, 避免假阳性; 数组顺序属于内容, 保持原序。
+</p>
+<p>
+  <strong>真实场景</strong>: 某个 tool 对象从 <code>{ name, description }</code>
+  重构成 <code>{ description, name }</code> (key 书写顺序变了, 内容没变),
+  toolsHash 应该不变。 直接 <code>JSON.stringify(tools)</code> 会变, 误报
+  cache miss。 反过来, tools 数组元素换了顺序属于真实变化, hash 会变。
+</p>
+<p>
+  <strong>设计思想</strong>: <strong>内容 hash, 不是字面 hash</strong> —
+  对象按 key 排序, 数组保序, undefined 跳过, 递归处理嵌套。 看
+  <a href="https://github.com/pingp76/swoopcode/blob/main/src/cache-debug.ts#L64"><code>src/cache-debug.ts</code> 第 64 行的 <code>stableStringify</code></a>:
+</p>
+<pre><code class="language-typescript">function stableStringify(value: unknown): string {
+  if (value === null) return "null";
+  if (typeof value === "string") return JSON.stringify(value);
+  if (Array.isArray(value)) {
+    return "[" + value.map(stableStringify).join(",") + "]";
+  }
+  if (typeof value === "object") {
+    const obj = value as Record&lt;string, unknown&gt;;
+    const keys = Object.keys(obj).sort();
+    const pairs = keys
+      .filter((k) =&gt; obj[k] !== undefined)
+      .map((k) =&gt; `${JSON.stringify(k)}:${stableStringify(obj[k])}`);
+    return "{" + pairs.join(",") + "}";
+  }
+  return "null";
+}</code></pre>
+<p>
+  <strong>实现细节</strong>: 4 条规则 — 普通对象按 key 字母升序
+  排序; 数组保持原序 (tools 顺序是 LLM 看到的顺序, 不能乱);
+  string 用 JSON.stringify (保留转义); 对象里值为 undefined 的 key 跳过
+  (与 JSON.stringify 一致)。 教学版的局限: 上面这段代码里 number (含 <code>NaN</code>) / boolean
+  会落到末尾的 <code>return "null"</code>, 只改这类值 hash 不会变; 也不处理
+  <code>BigInt</code> / 循环引用。 tools 定义里通常没有后两者, 暂时不管。
+</p>
+<h2 id="tracker">createCacheDebugTracker: 闭包持有 lastSnapshot</h2>
+<p>
+  <strong>用途</strong>: tracker 是个轻量对象, 每次 LLM 调用前调
+  <code>inspect({messages, tools})</code>, 返回"当前快照 + 是否
+  变化" 的二元判断。 内部用闭包持有 <code>lastSnapshot</code>,
+  不需要外部状态。
+</p>
+<p>
+  <strong>真实场景</strong>: agent 主循环在第 3 步 (call LLM) 之前
+  调 tracker.inspect, 输出 <code>logger.info(formatCacheDebugLog(state))</code>。
+  之后日志里就有稳定的 <code>[cache] systemPrompt=stable ...</code>
+  标记, 开发者 tail 日志就能看到变化。
+</p>
+<p>
+  <strong>设计思想</strong>: <strong>工厂 + 闭包</strong> — 这是
+  Reference 章节 "模式 1 · 工厂 + 闭包" 的标准应用: <code>lastSnapshot</code>
+  在闭包内, 外部只通过 <code>inspect</code> 接口访问, 不会泄露
+  内部状态。 看
+  <a href="https://github.com/pingp76/swoopcode/blob/main/src/cache-debug.ts#L144"><code>src/cache-debug.ts</code> 第 144 行的 <code>createCacheDebugTracker</code></a>:
+</p>
+<pre><code class="language-typescript">export function createCacheDebugTracker(): {
+  inspect(input: { messages, tools }): CacheDebugState;
+} {
+  let lastSnapshot: CacheDebugSnapshot | null = null;
 
   return {
-    buildMessages({ currentQuery, recentFiles }): Message[] {
-      const userMemory = memoryStore.renderForPrompt("user");
-      const projectMemory = memoryStore.renderForPrompt("project");
-      const snapshot = `# User Memory\n${userMemory}\n\n# Project Memory\n${projectMemory}`;
-
-      if (snapshot !== lastSnapshot) {
-        // 真的变了, 更新 system prompt
-        const base = history.getSystemPrompt() ?? "";
-        history.setSystemPrompt(`${base}\n\n${snapshot}`);
-        lastSnapshot = snapshot;
-      }
-      return [];  // 稳定, 不注入 user 消息
-    },
-    notifyFileChanged(path: string): void {
-      // 文件改了, 相关 project memory 可能 stale, 下一轮重新渲染
-      lastSnapshot = "";  // 强制刷新
+    inspect(input): CacheDebugState {
+      const current: CacheDebugSnapshot = { ... };
+      const state: CacheDebugState = {
+        current,
+        changed: {
+          systemPrompt: lastSnapshot ? lastSnapshot.systemPromptHash !== current.systemPromptHash : false,
+          tools: lastSnapshot ? lastSnapshot.toolsHash !== current.toolsHash : false,
+          stablePrefix: lastSnapshot ? lastSnapshot.stablePrefixHash !== current.stablePrefixHash : false,
+        },
+      };
+      lastSnapshot = current;
+      return state;
     },
   };
 }</code></pre>
 <p>
-  关键设计: <code>lastSnapshot</code> 是闭包内状态, 比较的是字符串内容
-  (不是对象引用)。这意味着: 即使 LLM 频繁调 save_memory 但 value 相同, snapshot
-  不变, cache 仍然命中。
+  <strong>实现细节</strong>: 第一轮调用时 <code>lastSnapshot</code>
+  为 null, 三个 changed 都是 false (因为没有"上次" 可比, 强制
+  算 stable), 这是<strong>故意</strong>的简化: 第一次 LLM 调用 cache
+  必然 miss, 此时输出"stable" 有误导性, 但教学版没有为第一轮做特殊处理。
+  生产版应该在第一轮输出 <code>[cache] first call, prefix=initial</code>
+  之类的初始标记。
 </p>
-
-<h2 id="cache-debug">cache hit rate 监控</h2>
-<p>
-  cache-debug 模块记录每次 LLM 调用的 cache 命中情况。Anthropic / OpenAI 的 API
-  响应里有 <code>cache_read_input_tokens</code> /
-  <code>cache_creation_input_tokens</code> 字段, harness 解析后写到 日志和
-  metrics, 用于评估布局效果。
-</p>
-<pre class="code-block"><code>// 教学简化版, 仓库真实实现见 GitHub 永久链接
-function trackCacheUsage(response: LLMResponse) {
-  if (response.usage.cacheReadTokens &gt; 0) {
-    cacheDebugTracker.recordHit(response.usage.cacheReadTokens);
-  } else {
-    cacheDebugTracker.recordMiss();
-  }
+<h2 id="log-format">formatCacheDebugLog: 单行可 tail</h2>
+<p>
+  <strong>用途</strong>: 日志格式必须<strong>单行可 grep</strong> —
+  开发者 tail <code>agent.log</code> 找 cache 变化, 多行格式
+  不友好。
+</p>
+<p>
+  <strong>真实场景</strong>: 开发者跑 harness 100 轮对话, 想看
+  "什么时候 system prompt 变了"。 跑
+  <code>grep '\[cache\].*changed' agent.log</code>, 单行格式
+  一行一个时间点, 找到原因。
+</p>
+<p>
+  <strong>设计思想</strong>: <strong>结构化日志</strong> — 用
+  <code>key=value</code> 格式, 容易 grep, 容易解析。 看
+  <a href="https://github.com/pingp76/swoopcode/blob/main/src/cache-debug.ts#L194"><code>src/cache-debug.ts</code> 第 194 行的 <code>formatCacheDebugLog</code></a>:
+</p>
+<pre><code class="language-typescript">export function formatCacheDebugLog(state: CacheDebugState): string {
+  const { current, changed } = state;
+  const spStatus = changed.systemPrompt ? "changed" : "stable";
+  const toolsStatus = changed.tools ? "changed" : "stable";
+  const prefixStatus = changed.stablePrefix ? "changed" : "stable";
+  const parts = [
+    `[cache] systemPrompt=${spStatus}`,
+    `tools=${toolsStatus}`,
+    `prefix=${prefixStatus}`,
+    `systemHash=${current.systemPromptHash}`,
+    `toolsHash=${current.toolsHash}`,
+    `msgs=${current.messageCount}`,
+    `tools=${current.toolCount}`,
+  ];
+  return parts.join(" ");
 }</code></pre>
-
-<h2 id="trap">反例梯度</h2>
-
-<div class="card card--trap">
-  <div class="card__head">
-    <span class="card__tag">新手错法 · A</span>
-  </div>
-  <div class="card__body">
-    <p><strong>常见错误:</strong>动态状态拼进 system prompt 字符串。</p>
-    <p>
-      <strong>为什么错:</strong>每次 LLM 调用 system prompt 都不同, 整个 cache
-      失效, token 成本涨 3-5 倍。
-    </p>
-    <p>
-      <strong>正确做法:</strong>动态状态走 reminder 消息 (user 角色), system
-      prompt 字符串保持稳定。
-    </p>
-  </div>
-</div>
-
-<div class="card card--trap">
-  <div class="card__head">
-    <span class="card__tag">中级错法 · B</span>
-  </div>
-  <div class="card__body">
-    <p><strong>常见错误:</strong>save_memory 每次都让 system prompt 变。</p>
-    <p>
-      <strong>为什么错:</strong>即使 memory value 没变, 重新渲染也会让 system
-      prompt 字符串变, cache 失效。
-    </p>
-    <p>
-      <strong>正确做法:</strong>stable-context 比较 snapshot 字符串 hash,
-      内容相同不更新 system prompt。
-    </p>
-  </div>
-</div>
-
-<div class="card card--trap">
-  <div class="card__head">
-    <span class="card__tag">高级错法 · C</span>
-  </div>
-  <div class="card__body">
-    <p>
-      <strong>常见错误:</strong>tools 数组每次调用都重新 sort, 即使 skill
-      集合没变。
-    </p>
-    <p><strong>为什么错:</strong>JSON 序列化顺序变了, cache 不命中。</p>
-    <p>
-      <strong>正确做法:</strong>tools 数组按注册顺序, 加载 skill 时追加到末尾,
-      不重新排序。
-    </p>
+<p>
+  <strong>实现细节</strong>: 输出 7 个字段 — 3 个状态 (stable /
+  changed) + 2 个 hash (8 个 hex 字符) + 2 个数量 (消息数 / 工具数); 其中 <code>tools=</code> 出现两次 (先状态, 后数量), 解析时按位置区分。
+  第一轮 <code>prefix=stable</code> 只是因为没有上一轮可比, 3 个 changed
+  字段都为 false, 后续轮才有意义。 教学版不区分"first call" 标记。
+</p>
+<h2 id="loop-integration">主循环集成: 在哪 1 个时机 inspect</h2>
+<p>
+  <strong>用途</strong>: 缓存调试的<strong>唯一</strong>集成点是
+  agent 主循环第 3 步 (call LLM) 之前。 调一次 tracker.inspect,
+  输出日志, 之后 LLM 调用本身不动 (LLM API 自己处理 cache)。
+</p>
+<p>
+  <strong>真实场景</strong>: <code>agent.ts</code> 里:
+  <code>const state = cacheTracker.inspect({messages, tools}); logger.info(formatCacheDebugLog(state)); const response = await llmClient.chat(messages, tools);</code>
+</p>
+<p>
+  <strong>设计思想</strong>: <strong>观测点 ≤ 1</strong> — 缓存调试
+  只在 LLM 调用前 inspect 一次, 不在 LLM 调用后 (响应不改变
+  cache 状态), 不在 tool 调用前 (tool 调用不改变 system prompt),
+  不在 user 输入后 (user 输入进 history, 不进 system prompt)。
+  只有 LLM 调用前才有"前后对比" 的意义。
+</p>
+<p>
+  <strong>实现细节</strong>: 集成代码应该在 Composition Root 创建
+  tracker, 注入 agent; agent 第 3 步调 inspect + log。 默认可以
+  不传 tracker, agent 不输出 cache 日志 (用空对象模式, 类似
+  createNoopHookRunner)。
+</p>
+<h2 id="changed-actions">prefix changed 的 4 个常见原因</h2>
+<p>
+  <strong>用途</strong>: 看到 <code>[cache] prefix=changed</code>
+  日志, 知道 cache miss, 但不知道<strong>为什么</strong>。 列 4 个
+  最常见原因, 帮开发者快速定位。
+</p>
+<div class="figure figure--stack">
+  <div class="figure__title">图 2 · prefix changed 的 4 个常见原因</div>
+  <div class="flow-stack">
+    <div class="flow-stack__layer flow-stack__layer--bad">
+      <div class="flow-stack__label">原因 1 · system prompt 改了 (字面)</div>
+      <div class="flow-stack__body">开发者改了 <code>src/system-prompt.ts</code> 源码, 拼出来的 system prompt 多了 / 少了一段。 git diff 能看到。</div>
+    </div>
+    <div class="flow-stack__layer flow-stack__layer--bad">
+      <div class="flow-stack__label">原因 2 · system prompt 改了 (动态)</div>
+      <div class="flow-stack__body">memory 段加了新条目 (第 09 章), skill 切换了 (第 05 章), todo reminder 注入了状态 (第 03 章)。 看 memory/skill/todo 的变更。</div>
+    </div>
+    <div class="flow-stack__layer flow-stack__layer--bad">
+      <div class="flow-stack__label">原因 3 · tools 数组变了</div>
+      <div class="flow-stack__body">注册了第 7 个工具 / 删了第 1 个工具 / 改了某个 tool 的 description。 toolsHash 变, prefix 跟着变。</div>
+    </div>
+    <div class="flow-stack__layer flow-stack__layer--bad">
+      <div class="flow-stack__label">原因 4 · tool description 改了 (微调)</div>
+      <div class="flow-stack__body">"改了 1 个 tool 的 description 1 个字符", 字面看微小, hash 变, cache miss。 最容易忽视, 需要严格 review。</div>
+    </div>
   </div>
 </div>
-
-<div class="card card--trap">
-  <div class="card__head">
-    <span class="card__tag">边界错法 · D</span>
+<p>
+  <strong>实现细节</strong>: 4 个原因对应 4 个修法 — (1) 改源码要
+  故意, 不要随手 commit; (2) 动态注入要走稳定路径 (memory 段
+  内容稳定 / skill 切换一次写死 / todo reminder 写在 user 而非
+  system); (3) 工具注册要稳定, 不要在 hot reload 时改顺序;
+  (4) tool description 改动要走 PR review, 不随手 commit。
+</p>
+<h2 id="cache-miss-cost">cache miss 的成本: 10x 不是危言耸听</h2>
+<p>
+  <strong>用途</strong>: 让开发者<strong>直观感受</strong>"改一次
+  成本 ×10" 的量级, 不是空喊"重视 cache"。
+</p>
+<p>
+  <strong>真实场景</strong>: harness 跑 100 轮对话, system prompt
+  2k token, 工具定义 1k token, 稳定前缀 3k token。 假设 prompt cache
+  命中部分只按约 10% 的价格计费 (具体折扣因 provider / 模型而异), miss 部分
+  全价。 如果某次改动让 system prompt 每轮都在变 (例如拼进了动态状态), 后续 99 轮
+  每轮都 miss, 3k token × 99 = 297k 输入 token 按全价计费; 正常命中时这部分
+  只相当于约 29.7k token, 多花 ~9 倍输入侧费用。
+</p>
+<p>
+  <strong>设计思想</strong>: <strong>成本驱动设计</strong> — 谈
+  cache 不谈钱是空谈, 应该让开发者<strong>算出</strong>改一次
+  多少钱。 一般经验: 100 轮对话改 1 次 system prompt, 账单
+  ×2 到 ×5 (取决于 cache 命中率算法); 1000 轮对话改 1 次,
+  ×10 到 ×20 (累积效应)。
+</p>
+<p>
+  <strong>实现细节</strong>: harness 启动时输出"当前 system prompt
+  hash + 当前 tools hash", 让用户<strong>记录</strong> baseline。
+  之后 git commit 改了 system-prompt.ts, 重启看到 hash 变, 提醒
+  "知道这次改的代价吗?"。
+</p>
+<h2 id="fake-test">fake test: 用稳定消息验证 hash 不变</h2>
+<p>
+  <strong>用途</strong>: 缓存调试模块的测试不需要真 LLM, 用 fake
+  messages / tools 即可 — 测的是"序列化 + hash + 对比" 的算法
+  正确性。
+</p>
+<p>
+  <strong>真实场景</strong>: 用户写测试验证"messages 相同, tool 对象的
+  key 顺序不同, toolsHash 相同" (稳定序列化的关键性质)。 用两个
+  key 顺序不同但内容相同的对象, 调 stableStringify, 输出应该字节相等。
+</p>
+<p>
+  <strong>设计思想</strong>: <strong>测不变量</strong>覆盖 3 个
+  性质 — (a) 稳定序列化对 key 顺序鲁棒, (b) hash 区分大小写
+  / 空格, (c) tracker 第一轮与后续轮的不同行为。 看测试示例:
+</p>
+<pre><code class="language-typescript">test("stableStringify 对 key 顺序鲁棒", () =&gt; {
+  const a = { name: "foo", description: "bar", type: "user" };
+  const b = { type: "user", description: "bar", name: "foo" };
+  expect(stableStringify(a)).toBe(stableStringify(b));
+});
+
+test("hash 区分大小写", () =&gt; {
+  expect(sha256("Hello")).not.toBe(sha256("hello"));
+});
+
+test("tracker 第一轮 changed=false, 第二轮相同输入也 false", () =&gt; {
+  const tracker = createCacheDebugTracker();
+  const input = { messages: [{ role: "system", content: "x" }], tools: [] };
+  const s1 = tracker.inspect(input);
+  expect(s1.changed.systemPrompt).toBe(false);  // 第一轮无 lastSnapshot
+  const s2 = tracker.inspect(input);
+  expect(s2.changed.systemPrompt).toBe(false);  // 第二轮 hash 相同
+});
+
+test("system prompt 改了, 第二轮 changed=true", () =&gt; {
+  const tracker = createCacheDebugTracker();
+  tracker.inspect({ messages: [{ role: "system", content: "v1" }], tools: [] });
+  const s2 = tracker.inspect({ messages: [{ role: "system", content: "v2" }], tools: [] });
+  expect(s2.changed.systemPrompt).toBe(true);
+  expect(s2.changed.stablePrefix).toBe(true);  // 任何一个变, prefix 都变
+});</code></pre>
+<p>
+  <strong>实现细节</strong>: 4 个测试覆盖 (a) 稳定序列化核心,
+  (b) hash 大小写敏感, (c) tracker 状态机, (d) 变化传导。 不
+  需要 mock crypto, 真实 SHA256 比 mock 更稳。
+</p>
+<h2 id="common-confusion">常见误解: stable 不等于 never change</h2>
+<p>
+  <strong>误解 1: "stable 就应该永远不变?"</strong> 错。 stable
+  是"同一 session 内不变", 跨 session 可以变 (重启 hash 自然
+  变)。 看的是 session 内变化率, 不是绝对不变。
+</p>
+<p>
+  <strong>误解 2: "key 顺序变了 hash 不变, 所以 cache 一定命中?"</strong>
+  不一定。 stableStringify 让 hash 与对象 key 顺序无关, 但 OpenAI 的
+  <strong>真实 cache</strong>看的是请求的<strong>实际序列化结果</strong>
+  (发出去的 JSON 字节影响 prompt token, prompt token 影响
+  cache 块)。 稳定序列化只让 harness <strong>观测</strong>逻辑
+  变稳, 不保证 OpenAI 也这么看。
+</p>
+<p>
+  <strong>误解 3: "日志输出 stable 就一定 cache 命中?"</strong>
+  错。 日志输出的是 harness <strong>期望</strong> cache 命中, 不
+  等于 OpenAI <strong>实际</strong> cache 命中。 真实命中率需要
+  LLM 响应体 usage 里的 cached_tokens 字段, 教学版不假装读得到。
+</p>
+<p>
+  <strong>误解 4: "cache miss 就一定浪费钱?"</strong> 不一定。
+  OpenAI 有最小 cache 块 (1024 token 左右), 小于这个长度根本
+  不进 cache。 system prompt + tools 不到 1k token 的小项目,
+  cache miss 也无所谓。 谈成本先看规模。
+</p>
+<h2 id="trap">反例梯度: 4 个常见错误</h2>
+<div class="cards-grid">
+  <div class="card card--bad">
+    <div class="card__head"><span class="card__tag">反例 1 · 把 todo reminder 写进 system prompt</span></div>
+    <div class="card__body">
+      <p>第 03 章的 todo 状态每轮变, 用户把"当前 todo 列表" 拼到
+        system prompt。 错。 system prompt 是稳定前缀, todo 状态
+        应该写在 user message (历史), 不是 system。 正确做法: 写
+        "使用 TodoManager 跟踪任务" (规则) 在 system, 写 "[todo]
+        step 1 ✓ step 2 ..." (状态) 在 user message。</p>
+    </div>
   </div>
-  <div class="card__body">
-    <p>
-      <strong>常见错误:</strong>项目级 memory 在新项目自动失效, 但 harness
-      不通知 LLM。
-    </p>
-    <p>
-      <strong>为什么错:</strong>LLM 看不到 system prompt 变化,
-      继续引用旧项目的事实。
-    </p>
-    <p>
-      <strong>正确做法:</strong>stable-context 在项目切换时 invalidate,
-      重新渲染并注入 "Project context changed" reminder。
-    </p>
+  <div class="card card--bad">
+    <div class="card__head"><span class="card__tag">反例 2 · 每次 LLM 调用都重建 tools 数组</span></div>
+    <div class="card__body">
+      <p>tools 数组应该<strong>创建一次</strong>, 在 Composition Root
+        注入 agent。 每次 LLM 调用前 <code>[...tools]</code> 浅拷贝重建,
+        只要内容相同, 稳定序列化之后 hash 仍然相同 (这次
+        算稳); 可一旦重建逻辑里有人多加了一个字段, 立刻 miss。 错。 一次性
+        创建, 全程复用, 严禁 hot reload 时重建。</p>
+    </div>
   </div>
-</div>
-
-<h2 id="validate">如何验证 (本章 Validation 卡片)</h2>
-<div class="card card--validation">
-  <div class="card__head">
-    <span class="card__tag">Validation · 第 10 章</span>
+  <div class="card card--bad">
+    <div class="card__head"><span class="card__tag">反例 3 · 改 system prompt 不知道</span></div>
+    <div class="card__body">
+      <p>开发者改了 <code>src/system-prompt.ts</code> 1 行, 没意识到
+        cache miss, 跑生产 1 个月账单 ×5, 才发现。 错。 harness
+        启动时<strong>必须</strong>输出当前 system prompt hash, 让
+        改动可见。 教学版要在 startup 时用 <code>console.log(formatCacheDebugLog(state))</code>
+        打一次。</p>
+    </div>
   </div>
-  <div class="card__body">
-    <p>
-      <strong>系统 prompt 拼装顺序固定:</strong>跑 3 轮 LLM, 每次记录 system
-      prompt 字符串 hash, 前 2 轮 hash 相同。
-    </p>
-    <p>
-      <strong>动态状态走 reminder:</strong>fake LLM 跑 2 轮, 第 2 轮 TODO
-      状态变化, history 末尾出现
-      <code>&lt;system-reminder source="todo"&gt;</code>
-      标签, system prompt 字符串 hash 不变。
-    </p>
-    <p>
-      <strong>memory snapshot 复用:</strong>save_memory 调用 value 与原值 相同,
-      stable-context 不更新 system prompt, spy 验证 history.setSystemPrompt
-      没被调用。
-    </p>
-    <p>
-      <strong>tools 数组拼装稳定:</strong>加载 2 个 skill 后跑 2 轮, 第 2 轮
-      tools 数组 JSON 字符串与第 1 轮完全相同。
-    </p>
-    <p>
-      <strong>项目切换 invalidate:</strong>fake project context 切换,
-      stable-context 重新渲染 system prompt, 注入 reminder 描述新项目。
-    </p>
+  <div class="card card--bad">
+    <div class="card__head"><span class="card__tag">反例 4 · JSON.stringify 直接 hash</span></div>
+    <div class="card__body">
+      <p>直接 <code>sha256(JSON.stringify(tools))</code>, tool
+        对象的 key 顺序变了就 hash 变, 误报 cache miss。 错。 应该用
+        <code>stableStringify</code> 先稳定序列化, 再 hash。
+        稳定序列化对 key 顺序鲁棒, 数组保序 (因为 LLM 看的就是
+        这个顺序)。</p>
+    </div>
   </div>
 </div>
-
-<h2 id="lookback">回望第 00–09 章: 哪些原则在本章兑现了</h2>
+<h2 id="validate">Validation: 4 条不变量检验</h2>
+<ol>
+<li>
+<strong>稳定序列化对 key 顺序鲁棒</strong>: 同样内容不同 key
+    顺序的两个对象, <code>stableStringify</code> 输出字节相等。
+    验证: 单测覆盖 3 个字段乱序的情况。
+  </li>
+<li>
+<strong>Hash 大小写敏感</strong>: "Hello" 和 "hello" hash 不同。
+    验证: 基础单测, 防止有人把 hash 改成 <code>.toLowerCase()</code>
+    破坏敏感性。
+  </li>
+<li>
+<strong>Tracker 状态机正确</strong>: 第一轮 changed=false (无
+    lastSnapshot), 第二轮相同输入 changed=false (hash 一致), 第二
+    轮不同输入 changed=true (hash 不一致)。 验证: 3 个连续 inspect,
+    看 changed 字段。
+  </li>
+<li>
+<strong>变化传导</strong>: system prompt 变 → systemPromptHash
+    变 → stablePrefixHash 变 (因为 prefix = system + tools);
+    tools 变 → toolsHash 变 → stablePrefixHash 变。 验证: 单测
+    改一边, 看 prefix 是否跟着变。
+  </li>
+</ol>
+<h2 id="lookback">回望: 哪些原则在本章兑现了</h2>
 <ul>
-  <li>
-    <strong>稳定前缀 / 动态状态 / 自然增量 三段布局:</strong
-    >这是前九章所有设计选择的总结, 不是新发明, 而是"显式命名"。
+<li>
+<strong>本地可观测</strong>: cache 调试跑在 harness 进程内, 不
+    调外部 API, 不假装读真实 cache hit rate。
   </li>
-  <li>
-    <strong>事实与视图分离:</strong>memory 是"长期事实" (第 09 章), history
-    是"对话上下文" (第 01 章), reminders 是"动态状态" (第 03
-    章)。三者职责严格分离。
+<li>
+<strong>稳定序列化</strong>: key 排序, 数组保序, undefined
+    跳过 (undefined 的处理与 JSON.stringify 一致)。
   </li>
-  <li>
-    <strong>稳定 prefix 锚点:</strong>"# User Memory" / "# Project Memory" /
-    "&lt;system-reminder source='X'&gt;" 都是锚点, 内容可换, 锚点本身稳定。
+<li>
+<strong>工厂 + 闭包</strong>: <code>createCacheDebugTracker</code>
+    把 <code>lastSnapshot</code> 藏在闭包, 外部只通过
+    <code>inspect</code> 接口访问。
   </li>
-  <li>
-    <strong>不污染 system prompt 字符串:</strong>所有动态状态走 reminder /
-    history, 永不拼进 system prompt 字符串。
+<li>
+<strong>结构化日志</strong>: 单行 <code>key=value</code> 格式,
+    可 grep, 可解析。
+  </li>
+<li>
+<strong>成本驱动设计</strong>: 谈 cache 必谈钱, 不空喊"重视
+    cache"。
   </li>
 </ul>
-
 <h2 id="forward">前瞻张力: 留给后续章节</h2>
 <dl class="defs">
-  <dt>cache hit 降级</dt>
-  <dd>
-    第 11 章 recovery 会处理"LLM 调用因 cache miss 突然变贵" 的告警, 让 harness
-    在 cache 失效时主动告知用户。
+<dt>真实 cache 命中率</dt>
+<dd>
+    教学版只看 harness 期望的稳定前缀, 不读 LLM 响应体 usage 里的
+    cached_tokens。 真实命中率需要 LLM 客户端注入 cache 统计
+    钩子, 这是 LLM 客户端的功能, 不是 cache-debug 模块。 P2
+    阶段可以让 llm-client 暴露 cache 命中率, cache-debug 接入
+    做"实际 vs 期望" 对比。
   </dd>
-  <dt>跨模型 cache 兼容</dt>
-  <dd>
-    不同 LLM provider (Anthropic / OpenAI / Google) 的 cache 边界不同,
-    llm-adapter 模块 (第 02 章提到) 会统一抽象。
+<dt>Cache 失效自动恢复</dt>
+<dd>
+    prefix 变化是"知道为什么变" 的问题, 不是"自动恢复" 的
+    问题。 真要自动恢复, 需要"检测变化 → 判断是否意外 → 提示
+    开发者" 的完整流程, 这是 dev 工具范畴, 不是 harness 范畴。
   </dd>
-  <dt>memory 太大时怎么办</dt>
-  <dd>
-    user 级 memory 累积到 50k tokens 时, system prompt 也会撑爆, 需要 memory
-    内做摘要 (类似第 06 章 compress 思路)。
+<dt>Token 块粒度 cache</dt>
+<dd>
+    OpenAI / Anthropic 的 cache 是按 token 块 (1024 token 左右)
+    粒度, 不是整个前缀。 教学版简化成"整个前缀", 工业级要
+    算"哪一段 token 块在 cache 里"。 留给后续优化。
   </dd>
-  <dt>cache hit 监控的副作用</dt>
-  <dd>cache-debug 自身是副作用, 测试时要 mock 它的 IO, 不让它污染测试输出。</dd>
 </dl>
-
-<h2 id="vibe-coding-10">本次如何 vibe code: 第 10 章的三件套</h2>
-
-<h3 id="vibe-feed-10">拆卡: 4 轮迭代的具体产物</h3>
-<ol>
-  <li>
-    <strong>第 1 轮 · 接口</strong>。让 LLM 给出
-    <code>SystemPromptProvider</code> / <code>StableContextManager</code> /
-    <code>CacheDebugTracker</code> 三个 interface, 以及"稳定 / 动态 / 增量"
-    三段布局的清单。本轮不写实现, 重点钉"动态状态永不拼进 system prompt"。
-  </li>
-  <li>
-    <strong>第 2 轮 · 接线</strong>。让 LLM 给出 <code>index.ts</code> 接线,
-    <code>createStableContextManager()</code> 是 stub (永远返回空), agent.run
-    仍然走第 09 章的 system prompt (无 stable snapshot)。本轮 review 重点:
-    stableContextManager 实例在 <code>index.ts</code> 只 new 一次。
-  </li>
-  <li>
-    <strong>第 3 轮 · 边界</strong>。让 LLM 写 createStableContextManager +
-    createCacheDebugTracker + agent.prepareMessages 接入。本轮 review 重点:
-    snapshot 字符串比较, 不污染 system prompt 字符串。
-  </li>
-  <li>
-    <strong>第 4 轮 · 验证</strong>。让 LLM 写
-    <code>test/cache.test.ts</code>。本轮 review 重点: "system prompt
-    拼装顺序固定" 和 "memory snapshot 复用" 两条必须有 hash 比较断言。
-  </li>
-</ol>
-
-<h3 id="vibe-review-10">Review: 第 10 章专属 checklist</h3>
-<ol>
-  <li>
-    <strong>动态状态不进 system prompt 字符串。</strong>验证:
-    <code
-      >grep -n 'JSON.stringify(todos)\|activeSkills.join'
-      src/system-prompt.ts</code
-    >
-    应当 0 行。
-  </li>
-  <li>
-    <strong>stable snapshot 是字符串比较。</strong>验证: stable-context 内
-    snapshot 比较用 <code>===</code>, 不用 deepEqual (对象引用不靠谱)。
-  </li>
-  <li>
-    <strong>tools 数组不重排。</strong>验证: registry / skill loader 的 list()
-    内部不调用 sort, 顺序由注册顺序决定。
-  </li>
-  <li>
-    <strong>cache debug 是副作用, 不污染业务路径。</strong>验证: agent.ts 内
-    cacheDebugTracker 调用全部在 <code>try { llm.chat() }</code> 之后的 finally
-    或 catch 块, 不在主流程内。
-  </li>
-  <li>
-    <strong>memory snapshot 复用不更新 system prompt。</strong>验证: Validation
-    卡片"memory snapshot 复用" 那条, spy 验证 setSystemPrompt 没被调用。
-  </li>
-</ol>
-
-<h3 id="vibe-debug-10">调试: 第 10 章典型伪装</h3>
-<ol>
-  <li>
-    <strong>伪装 A · 动态状态拼进 system prompt 字符串。</strong>症状: system
-    prompt 含 "Current TODO: ..."。验证: Validation 卡片"动态状态走 reminder"
-    那条, 跑测试, system prompt 字符串 hash 不变, 但 history 末尾有 reminder。
-  </li>
-  <li>
-    <strong>伪装 B · snapshot 比较用 deepEqual。</strong>症状:
-    <code>JSON.stringify(snapshot) === JSON.stringify(lastSnapshot)</code
-    >。功能上能跑, 但性能差 (每次都要 JSON 序列化)。验证: 本章不显式测性能,
-    但应当用 <code>===</code> 字符串比较。
-  </li>
-  <li>
-    <strong>伪装 C · 加载 skill 时重新 sort tools 数组。</strong>症状:
-    registry.list() 内部
-    <code>.sort((a, b) =&gt; a.name.localeCompare(b.name))</code>。验证: 加载
-    skill 后, tools 数组 JSON 字符串与加载前比较, 末尾追加新工具, 顺序不重排。
-  </li>
-</ol>
-
-<h3 id="vibe-iterate-10">迭代: 第 10 章 4 个 commit 节点</h3>
-<ol>
-  <li>
-    <code
-      >feat(ch10): 钉 SystemPromptProvider / StableContextManager /
-      CacheDebugTracker 接口与三段布局清单</code
-    >
-    —— tsc 通过, 无实现。
-  </li>
-  <li>
-    <code
-      >feat(ch10): createStableContextManager 工厂 stub + agent.prepareMessages
-      接入</code
-    >
-    —— tsc 通过, snapshot 永远不变。
-  </li>
-  <li>
-    <code
-      >feat(ch10): snapshot 字符串比较 + cache debug tracker + 动态状态走
-      reminder</code
-    >
-    —— 跑通 Validation 卡片前 4 条。
-  </li>
-  <li>
-    <code>test(ch10): 项目切换 invalidate + memory snapshot 复用 spy 验证</code>
-    —— 全绿。
-  </li>
-</ol>
-
 <h2 id="prompt-card">Prompt Card (本章任务)</h2>
 <div class="card card--prompt">
   <div class="card__head">
     <span class="card__tag">Prompt Card · 第 10 章</span>
-    <button class="card__copy" type="button" data-copy-card>复制</button>
+    <button class="card__copy" data-copy-card="" type="button">复制</button>
   </div>
   <div class="card__body">
-    <p>
-      <strong>目标:</strong>基于前九章的设计选择, 显式落实 prompt cache
-      友好布局: 稳定前缀 (system prompt + tools) / 动态状态 (reminders) /
-      自然增量 (history)。
-    </p>
-    <p>
-      <strong>场景:</strong>长任务跑 50 轮, 前 20 轮 system prompt + tools
-      字符串 hash 相同, cache hit rate 应当 &gt; 80%, 单轮成本下降 3-5 倍。
-    </p>
-    <p>
-      <strong>模块:</strong> <code>src/system-prompt.ts</code> (新) 集中管理
-      system prompt 拼装; <code>src/stable-context.ts</code> (新) 暴露
-      <code>createStableContextManager()</code>;
-      <code>src/cache-debug.ts</code> (新) 暴露
-      <code>createCacheDebugTracker()</code>;
-      <code>src/agent.ts</code> prepareMessages 接入 stable context。
-    </p>
+    <p><strong>目标:</strong> 给 harness 加 cache debug 模块, 每轮
+      LLM 调用前 hash (system prompt + tools), 输出 stable /
+      changed 标记。</p>
+    <p><strong>场景:</strong> 开发者改 <code>src/system-prompt.ts</code>
+      1 行, 重启 harness, 看到日志
+      <code>[cache] systemPrompt=changed tools=stable prefix=changed systemHash=abc12345</code>。
+      知道 system prompt 改了, 决定是"故意改" 还是"无意改"。</p>
+    <p><strong>模块:</strong> <code>src/cache-debug.ts</code> (新)
+      暴露 <code>createCacheDebugTracker()</code> + <code>formatCacheDebugLog(state)</code>;
+      <code>src/agent.ts</code> (改) 第 3 步 call LLM 前 inspect +
+      log; <code>src/index.ts</code> (改) Composition Root 创建
+      tracker。</p>
     <p><strong>边界 (LLM 必须遵守的 checklist):</strong></p>
     <ul>
-      <li>
-        动态状态 (TODO / skill 激活集合 / 子智能体输出) 永不拼进 system prompt
-        字符串
-      </li>
-      <li>stable snapshot 用字符串比较 (===), 不用 deepEqual</li>
-      <li>tools 数组按注册顺序, 加载 skill 时追加到末尾, 不重排</li>
-      <li>cache debug 调用全部在副作用边界 (try / catch 之后), 不在主流程</li>
-      <li>项目级 memory 切换时 invalidate stable snapshot</li>
+      <li>稳定前缀定义: system prompt content + tools JSON, 不含 history</li>
+      <li>稳定序列化对 key 顺序鲁棒 (sort), 数组保序, undefined 跳过</li>
+      <li>Hash 用 SHA256 前 8 位 hex, 不用完整 64 位</li>
+      <li>Tracker inspect 只在 LLM 调用前调一次, 不在 tool 调用前 / 后调</li>
+      <li>第一轮 changed 全部为 false (无 lastSnapshot, 不假装 stable)</li>
+      <li>日志格式单行 <code>key=value</code>, 可 grep, 不多行</li>
+      <li>不假装读真实 cache hit rate, 只算"期望稳定"</li>
     </ul>
-    <p><strong>验证 (用 fake LLM + spy, 逐条落到 vitest):</strong></p>
+    <p><strong>验证 (用 fake messages + vitest, 逐条断言):</strong></p>
     <ul>
-      <li>跑 3 轮 LLM, system prompt 字符串 hash 前 2 轮相同</li>
-      <li>
-        第 2 轮 TODO 状态变化, history 末尾有 reminder 标签, system prompt hash
-        不变
-      </li>
-      <li>
-        save_memory 调用 value 与原值相同, spy 验证 setSystemPrompt 没被调用
-      </li>
-      <li>加载 2 个 skill 后跑 2 轮, tools 数组 JSON 字符串完全相同</li>
-      <li>项目切换, stable-context 重新渲染 system prompt, 注入 reminder</li>
+      <li>stableStringify 对 key 顺序鲁棒: 3 字段乱序, 字节相等</li>
+      <li>hash 大小写敏感: "Hello" vs "hello" hash 不同</li>
+      <li>tracker 状态机: 第 1 轮 changed=false, 第 2 轮相同输入 changed=false, 不同输入 changed=true</li>
+      <li>变化传导: system 变 → prefix 变, tools 变 → prefix 变</li>
     </ul>
   </div>
 </div>
-
 <h2 id="practice">本章练习</h2>
 <ol>
-  <li>
-    故意把动态状态拼进 system prompt 字符串, 跑测试, 看"system prompt
-    拼装顺序固定" 是否抓到 (hash 会变)。
+<li>
+    故意把 todo reminder 拼到 system prompt (第 03 章), 跑 100 轮
+    对话, 看 cache 日志是否全程 <code>prefix=changed</code>。
   </li>
-  <li>
-    在 stable snapshot 比较里用 deepEqual, 跑测试, 看"memory snapshot 复用"
-    是否能抓到 (本节用 ===, 不写测试能跑通, 但性能差)。
+<li>
+    故意用 <code>JSON.stringify(tools)</code> 直接 hash, 跑测试
+    调换 tool 对象内的 key 顺序, 看 hash 是否变了 (稳定序列化会对
+    key 排序让 hash 不变; 数组保序, 调换数组顺序 hash 本来就该变)。
   </li>
-  <li>
-    加载 skill 时重新 sort tools 数组, 跑测试, 看"tools 数组拼装稳定" 是否抓到。
+<li>
+    故意不在 harness 启动时输出 baseline hash, 改
+    <code>system-prompt.ts</code> 不重启 harness, 看是否能发现
+    "system prompt 改了" (不能, 缺观测)。
+  </li>
+<li>
+    故意在 agent.ts 多处调 tracker.inspect (tool 调用前 + LLM
+    调用前 + LLM 调用后), 看日志是否混乱 (3 个调用点应该只在
+    LLM 调用前 1 个)。
   </li>
 </ol>
-
 <h2 id="summary">本章小结</h2>
 <p>
-  本章系统讲 prompt cache 友好布局: 稳定前缀 (system prompt + tools) / 动态状态
-  (reminders) / 自然增量 (history)。基于前九章的设计选择 显式命名这套布局, 并用
-  stable snapshot + cache debug 监控来保证 cache hit rate。下一章 (第 11 章)
-  我们会处理"LLM 调用失败" 的 情况——Recovery, 在网络异常 / 输出截断 / context
-  overflow 时 自动恢复, 不让 harness 因单次失败就崩。
+  Cache debug 是给 prompt cache 的<strong>可观测性</strong>, 不
+  假装读真实 cache hit rate。 核心是 5 个设计:
 </p>
-
-<h2 id="next">下一章伏笔</h2>
+<ul>
+<li>
+<strong>稳定前缀边界</strong>: system prompt + tools, 不含 history。
+  todo reminder / skill 状态进 history, 不进 system。
+  </li>
+<li>
+<strong>本地 hash</strong>: SHA256 前 8 位, 不调外部 API, 不
+  读 LLM 响应头。
+  </li>
+<li>
+<strong>稳定序列化</strong>: key 排序, 数组保序, undefined 跳过,
+  避免"顺序变了就误报 miss"。
+  </li>
+<li>
+<strong>单点 inspect</strong>: 只在 LLM 调用前调一次, 其它时机
+  无意义。
+  </li>
+<li>
+<strong>单行结构化日志</strong>: <code>[cache] systemPrompt=stable ...</code>,
+  可 grep, 可解析。
+  </li></ul>
 <p>
-  第 10 章让 harness 在长任务下保持稳定成本, 但 LLM 调用本身可能 失败 (rate
-  limit、timeout、context overflow、JSON 损坏)。下一章 Recovery
-  模块会分类失败模式, 在不同失败下采取不同策略: 重试 退避、强制压缩、续写、终止,
-  让 harness 在面对不确定的 LLM 时 仍然能持续工作。
+  下一章 (第 11 章) 展开 harness 跑生产时的<strong>异常恢复</strong>
+  — 7 类错误 (network / rate_limit / credential / quota /
+  context_length / output_interrupted / unknown) 各自的恢复动作。
 </p>
diff --git a/tutorial/chapters/11-recovery.html b/tutorial/chapters/11-recovery.html
index 67f1207..48be88e 100644
--- a/tutorial/chapters/11-recovery.html
+++ b/tutorial/chapters/11-recovery.html
@@ -1,570 +1,741 @@
-<p class="article__eyebrow">第 11 章 · LLM 出错时不要崩</p>
-<h1 class="article__title">Recovery: 在不确定的 LLM 面前保持持续工作</h1>
+<p class="article__eyebrow">第 11 章 · 跑生产时的异常恢复</p>
+<h1 class="article__title">Recovery: 7 类错误 × 4 种恢复动作的纯函数决策</h1>
 <p class="article__lede">
-  前面十章让 harness 在正常路径下能聊天、调工具、跑子任务、压缩、拦权限、 留
-  hook、记 memory、cache 友好。但 LLM 调用本身可能失败: rate limit、
-  timeout、context overflow、JSON 损坏、输出截断。这一章给 harness 加 Recovery
-  模块, 分类失败模式, 在不同失败下采取不同策略, 让 harness 在面对不确定的 LLM
-  时仍然能持续工作。
+  前 10 章的 harness 在 happy path 上很顺, 但生产里 LLM 一定会
+  报错: 网络抖 / 限流 / 余额耗尽 / 上下文撑爆 / 输出被截断 /
+  认证失效 / 协议错误。 每种错误对应不同的恢复动作 (backoff /
+  compact / continue / fail), 决策错一次就是几小时的挂机等待
+  或几万的 token 浪费。 这一章加 <code>src/recovery.ts</code> 模块
+  — 7 类错误统一分类 (classifyLLMError) + 4 种恢复动作决策
+  (decideRecovery, 纯函数) + 3 段提示文案 (recovery notice /
+  failure message / 原始 error.message) + 1 个 sleep 工具。 读完后, 你能讲清"为什么
+  context_length 用 compact 而不是 backoff" 和"为什么 decideRecovery
+  不在内部修改 state", 并能用纯函数测试覆盖 4 种动作的边界。
 </p>
-
-<nav id="article-inline-toc" class="article__meta" aria-label="页内小节"></nav>
-
-<hr class="rule" />
-
-<h2 id="delta-from-10">在第 10 章基础上改了什么</h2>
-<p>
-  这一章在 agent.run() 主循环里包一层 try/catch + classify + decide。 每次
-  LLM.chat() 失败时, classifyLLMError() 识别失败模式 (rate_limit / timeout /
-  context_overflow / parse_error / truncation), decideRecovery()
-  根据失败模式返回决策 (retry_with_backoff / compact_and_retry /
-  inject_continuation / abort / noop), harness 根据决策执行。 对应到代码,
-  改动集中在 2 个文件: <code>src/recovery.ts</code> (新)、
-  <code>src/agent.ts</code> (改主循环包 try/catch)。
-</p>
-<div class="source-links" aria-label="本章 GitHub 永久链接">
-  <a
-    class="source-link"
-    href="https://github.com/pingp76/swoopcode/blob/main/src/recovery.ts"
-    target="_blank"
-    rel="noreferrer"
-    >1. src/recovery.ts: 失败分类与决策 (新)</a
-  >
-  <a
-    class="source-link"
-    href="https://github.com/pingp76/swoopcode/blob/main/src/agent.ts"
-    target="_blank"
-    rel="noreferrer"
-    >2. src/agent.ts: 主循环包 try/catch + recovery 决策</a
-  >
-</div>
-
-<h2 id="author-thinking">作者怎么想的: 这一章的思考链</h2>
-<dl class="defs">
-  <dt>想清楚现象</dt>
-  <dd>
-    harness 跑到第 30 轮, LLM 突然 429 rate limit, 主 loop 立即抛异常,
-    用户体验断崖式下跌。现象是"harness 把 LLM 当成确定接口, 没考虑 LLM
-    的不确定失败模式"。
-  </dd>
-  <dt>想反例</dt>
-  <dd>
-    最朴素的反例是"LLM 失败时整段重试"。这有两个问题: 一是 rate limit
-    立即重试只会更糟, 应该退避; 二是 context overflow 重试只会再 overflow,
-    应该压缩; 三种失败模式需要不同策略。
-  </dd>
-  <dt>想接口和不变量</dt>
-  <dd>
-    接口:
-    <code>interface RecoveryDecision { action, retryAfter, reason }</code>。
-    不变量三条: (1) recovery 决策不修改 history, 只追加新消息 (例如 "your
-    previous output was truncated, continue"), (2) recovery state 不跨 run()
-    共享 (本轮的退避计数不污染下一轮), (3) 同一种失败连续 N 次后必须放弃,
-    不能无限重试。
-  </dd>
-  <dt>想怎么验证</dt>
-  <dd>
-    fake LLM 前两次抛 rate_limit 异常, 第三次返回正常响应, 验证 harness
-    退避重试后跑通; fake LLM 抛 context_overflow, 验证 harness 触发第 06 章的
-    compact 后重试; fake LLM 永远抛 rate_limit, 跑 N 次后 agent.run() 返回
-    "Recovery failed" 字符串。
-  </dd>
-</dl>
-
-<h2 id="observe-first">先观察: 两段故意有气味的实现</h2>
-
-<div class="note">
-  <p class="note__title">观察 1 · 失败立即重试</p>
-  <pre class="code-block"><code>// 教学简化版, 仓库真实实现见 GitHub 永久链接
-let response;
-try {
-  response = await llm.chat(messages);
-} catch (err) {
-  response = await llm.chat(messages);  // 错误: 立即重试
-}</code></pre>
-  <p><strong>问:</strong>为什么不立即重试?</p>
-  <p>
-    <strong>答:</strong>rate limit / 5xx 立即重试只会更糟, 应当按指数退避 (1s,
-    2s, 4s, 8s) 后再试; context overflow 重试只会再 overflow,
-    应当先压缩再重试。两种失败需要不同策略, 不能"一把梭" 重试。
-  </p>
+<nav aria-label="页内小节" class="article__meta" id="article-inline-toc"></nav>
+<hr class="rule"/>
+<h2 id="real-failure">真实失败故事: LLM 限流 1 小时, harness 傻等</h2>
+<p>
+  写代码之前, 先看一个真实痛点。 用户的 team 上线 harness 跑
+  生产, 周一早上 9 点全公司 30 个用户同时跑, OpenAI 限流 429。
+</p>
+<ol>
+<li>
+<strong>症状</strong>: 30 个 user 同时被 429 限流, harness 傻傻
+    同步等待 60 秒 (硬编码 backoff), 一个一个 retry, 跑 1 小时
+    才把队列清完。
+  </li>
+<li>
+<strong>更深的问题</strong>: 限流 5 次后, harness 应该 fail 让
+    用户知道"现在过载了, 稍后再试", 而不是继续 retry。 傻等
+    让 30 个用户全部卡死, 没人能 ctrl-c。
+  </li>
+<li>
+<strong>类似的惨案</strong>: context_length 错误 (413) 应该
+    compact 上下文, 但团队写了通用 backoff, 每次都带同样长的
+    上下文重试, 反复 413, 浪费 5 次重试 budget 才 fail。
+  </li>
+<li>
+<strong>真问题</strong>: 错误分类 + 恢复决策写进 if/else, 散在
+    agent.ts 各处, 改一次要重读 200 行。 应该集中到 recovery
+    模块, 纯函数决策, agent 只调 <code>decideRecovery(...)</code>。
+  </li>
+</ol>
+<p>
+  朴素想法 1: "所有错误都 retry 一下?" 错。 401 认证错误 retry
+  100 次还是 401; quota 耗尽 retry 100 次还是耗尽; context_length
+  retry 100 次还是太长。 不同错误需要不同<strong>动作</strong>,
+  不是"重试次数" 的差异。
+</p>
+<p>
+  朴素想法 2: "重试 budget 全局共享?" 错。 1 个 user 的
+  rate_limit 不应该消耗下一个 user 的 budget。 重试计数必须
+  <strong>per-request</strong>, 每次 <code>agent.run(query)</code>
+  重建 RecoveryState。
+</p>
+<p>
+  正确做法: 加 <code>src/recovery.ts</code> — <code>classifyLLMError</code>
+  把任何 LLM 错误归并到 7 种类型, <code>decideRecovery</code>
+  纯函数决策 4 种动作, <code>formatFailureMessage</code> 给用户
+  可读的中文提示。 这是 Reference 章节 "模式 13 · Error→Action
+  错误转动作" + "模式 19 · Idempotent 幂等" 的具体应用。
+</p>
+<h2 id="seven-kinds">7 类错误: 不可恢复的先识别</h2>
+<p>
+  <strong>用途</strong>: 实际 LLM 调用失败的原因可能几十种 (HTTP
+  401/403/413/429/500/502/503/504, SDK 异常, 网络 timeout, DNS
+  失败, 协议错误, 余额耗尽...), 但 agent 主循环不应该认识所有。
+  应该先<strong>归并</strong>成少数几类, 每类对应一种恢复策略。
+</p>
+<p>
+  <strong>真实场景</strong>: user 看到 "Error: Request failed with
+  status code 429", harness 应该知道这是 rate_limit, 走 backoff;
+  看到 "Error: Invalid API key", harness 知道这是 credential,
+  直接 fail 不 retry。 不同错误对应不同动作, 决策不能错。
+</p>
+<p>
+  <strong>设计思想</strong>: <strong>收敛 + 顺序</strong> — 把
+  几十种 provider 方言收敛成 7 种领域类型; 分类顺序很重要
+  (credential / quota 这种"不可恢复" 必须先识别, 不然浪费
+  重试 budget)。 看
+  <a href="https://github.com/pingp76/swoopcode/blob/main/src/recovery.ts#L22"><code>src/recovery.ts</code> 第 22 行的 <code>LLMErrorKind</code> union</a>:
+</p>
+<pre><code class="language-typescript">export type LLMErrorKind =
+  | "network"
+  | "rate_limit"
+  | "credential"
+  | "quota"
+  | "context_length"
+  | "output_interrupted"
+  | "unknown";</code></pre>
+<div class="figure figure--stack">
+  <div class="figure__title">图 1 · 7 类错误的分类顺序与对应动作</div>
+  <div class="flow-stack">
+    <div class="flow-stack__layer flow-stack__layer--bad">
+      <div class="flow-stack__label">1. credential (401/403) · 不可恢复</div>
+      <div class="flow-stack__body">API key 错 / 权限不够。 fail, 提示用户检查 LLM_PROVIDER / API key / baseURL。 永远不 retry。</div>
+    </div>
+    <div class="flow-stack__layer flow-stack__layer--bad">
+      <div class="flow-stack__label">2. quota (429 + quota/billing) · 不可恢复</div>
+      <div class="flow-stack__body">账户余额耗尽。 fail, 提示"token 额度或账户余额不足"。 等不回来, 充值才行。</div>
+    </div>
+    <div class="flow-stack__layer flow-stack__layer--warn">
+      <div class="flow-stack__label">3. rate_limit (其他 429) · 临时</div>
+      <div class="flow-stack__body">限流, 临时性。 backoff 3 秒 × 5 次。 5 次后 fail, 提示"LLM 服务暂时不可用"。</div>
+    </div>
+    <div class="flow-stack__layer flow-stack__layer--warn">
+      <div class="flow-stack__label">4. context_length (413) · 上下文过长</div>
+      <div class="flow-stack__body">上下文撑爆模型窗口。 compact 压缩历史, 1 次。 1 次后仍 fail。</div>
+    </div>
+    <div class="flow-stack__layer flow-stack__layer--warn">
+      <div class="flow-stack__label">5. network (5xx/timeout) · 网络抖动</div>
+      <div class="flow-stack__body">服务器临时挂或网络抖。 backoff 3 秒 × 5 次。 5 次后 fail。</div>
+    </div>
+    <div class="flow-stack__layer flow-stack__layer--warn">
+      <div class="flow-stack__label">6. output_interrupted · 输出被截断</div>
+      <div class="flow-stack__body">流式输出中途断开。 continue (从断点续), 2 次。 2 次后 fail。</div>
+    </div>
+    <div class="flow-stack__layer flow-stack__layer--bad">
+      <div class="flow-stack__label">7. unknown · 协议错误 / 代码 bug</div>
+      <div class="flow-stack__body">无法识别。 fail, 输出原始 error message。 不盲目重试, 防止 bug 放大。</div>
+    </div>
+  </div>
 </div>
-
-<div class="note">
-  <p class="note__title">观察 2 · recovery state 跨 run 共享</p>
-  <pre class="code-block"><code>// 教学简化版
-let retryCount = 0;  // 错误: module-level
-
-async function run(query) {
-  try {
-    return await llm.chat(messages);
-  } catch (err) {
-    retryCount++;
-    if (retryCount &gt; 5) throw err;
+<p>
+  <strong>实现细节</strong>: 分类顺序刻意设计 — 先识别 credential
+  (401/403 + 关键字), 再识别 quota (429 + quota/billing 关键字),
+  再识别 rate_limit (其他 429), 再识别 context_length, 再识别
+  network, 最后 unknown。 顺序错了 (例如 rate_limit 排在 quota 之前),
+  余额耗尽的 429 会被误判为 rate_limit, 浪费 5 次 retry budget。
+</p>
+<h2 id="classify">classifyLLMError: 不依赖 OpenAI SDK 类型</h2>
+<p>
+  <strong>用途</strong>: 分类函数必须<strong>不依赖</strong>具体
+  LLM SDK 的类型, 只通过宽松结构 (<code>error.status</code> /
+  <code>error.code</code> / <code>error.message</code>) 读取。
+  这样换 provider (OpenAI / Anthropic / 自部署 / Kimi) 都不用
+  改分类函数。
+</p>
+<p>
+  <strong>真实场景</strong>: team 原本用 OpenAI SDK, 后来加
+  Anthropic SDK, 又加自部署 vLLM。 三个 SDK 的 error 类型不同
+  (OpenAI 是 <code>APIError</code>, Anthropic 是
+  <code>AnthropicError</code>, vLLM 直接抛 <code>Error</code>)。
+  如果 classifyLLMError 用 <code>instanceof OpenAI.APIError</code>,
+  换 provider 就崩。 应该用 duck typing, 读
+  <code>error.status</code> 这种宽松字段。
+</p>
+<p>
+  <strong>设计思想</strong>: <strong>鸭子类型 + 启发式</strong> —
+  不强求特定 SDK 类型, 只读常见字段; 不是完美真理, 启发式分类
+  已经覆盖 99% 场景。 看
+  <a href="https://github.com/pingp76/swoopcode/blob/main/src/recovery.ts#L109"><code>src/recovery.ts</code> 第 109 行的 <code>classifyLLMError</code></a>:
+</p>
+<pre><code class="language-typescript">export function classifyLLMError(error: unknown): LLMErrorKind {
+  const status = extractNumber(error, "status");
+  const message = extractString(error, "message").toLowerCase();
+
+  // 1. credential: 401/403 或认证关键字
+  if (status === 401 || status === 403 ||
+      message.includes("api key") || message.includes("unauthorized") ||
+      message.includes("forbidden") || message.includes("credential")) {
+    return "credential";
+  }
+  // 2. quota: 429 + 额度关键字 (rate_limit 之前先识别)
+  if (status === 429 &amp;&amp; (message.includes("quota") || ...)) {
+    return "quota";
   }
+  // 3. rate_limit: 其他 429
+  if (status === 429) return "rate_limit";
+  // 4. context_length: 413 或上下文长度关键字
+  if (status === 413 || message.includes("context length") || ...) {
+    return "context_length";
+  }
+  // 5. network: 5xx 或网络关键字
+  if (status &gt;= 500 &amp;&amp; status &lt;= 504 ||
+      message.includes("timeout") || message.includes("econnreset") || ...) {
+    return "network";
+  }
+  return "unknown";
 }</code></pre>
-  <p><strong>问:</strong>为什么 retryCount 不放 module-level?</p>
-  <p>
-    <strong>答:</strong>用户两次 run() 之间本应独立。某次 run() 触发了 rate
-    limit 重试 5 次, 下次 run() 应当从 0 开始计数。共享会让"上一轮 失败过"
-    污染下一轮, 体验差。
-  </p>
+<p>
+  <strong>实现细节</strong>: 用 <code>extractNumber</code> /
+  <code>extractString</code> 内部辅助函数 (第 326/334 行) 安全
+  提取字段, 不依赖 SDK 形状。 message 全部 <code>.toLowerCase()</code>
+  做大小写不敏感匹配, 覆盖 "API Key" / "api key" / "Api Key" 各种
+  写法。
+</p>
+<h2 id="four-actions">4 种恢复动作: continue / compact / backoff / fail</h2>
+<p>
+  <strong>用途</strong>: 7 类错误 → 4 种动作, 多对一映射 (每类错误只对应
+  一种动作), 决策不能含糊。 continue = 输出续; compact = 压缩重试; backoff = 等
+  重试; fail = 停止。 4 种动作覆盖<strong>所有</strong>可恢复 +
+  不可恢复场景。
+</p>
+<p>
+  <strong>真实场景</strong>: LLM 流式输出跑到 8k token 时网络抖
+  断了, 错误是 output_interrupted, 动作 = continue (从 8k
+  断点续, 不从头来); LLM 返回 413, 动作 = compact (压缩历史
+  后重试, 同等长度仍太长才 fail); LLM 返回 401, 动作 = fail
+  (config 错误, 永远不重试)。
+</p>
+<p>
+  <strong>设计思想</strong>: <strong>动作类型枚举</strong> — 看
+  <a href="https://github.com/pingp76/swoopcode/blob/main/src/recovery.ts#L39"><code>src/recovery.ts</code> 第 39 行的 <code>RecoveryAction</code> union</a>:
+</p>
+<pre><code class="language-typescript">export type RecoveryAction = "continue" | "compact" | "backoff" | "fail";</code></pre>
+<div class="figure figure--compare">
+  <div class="figure__title">图 2 · 4 种恢复动作的语义对比</div>
+  <div class="flow-compare">
+    <div class="flow-compare__col flow-compare__col--good">
+      <div class="flow-compare__head">continue · 输出续</div>
+      <div class="flow-compare__body">针对 output_interrupted, 追加 continuation reminder 后再调 LLM, 期望从断点继续。 不修改历史, 节省 token。</div>
+    </div>
+    <div class="flow-compare__col flow-compare__col--warn">
+      <div class="flow-compare__head">compact · 压缩重试</div>
+      <div class="flow-compare__body">针对 context_length, 强制压缩历史 (第 06 章 P0/P1/P2) 再调 LLM。 1 次 budget, 仍过长就 fail。</div>
+    </div>
+    <div class="flow-compare__col flow-compare__col--good">
+      <div class="flow-compare__head">backoff · 等重试</div>
+      <div class="flow-compare__body">针对 network / rate_limit, sleep retryDelayMs (默认 3 秒) 后重试同一次 LLM 调用。 最多 5 次。</div>
+    </div>
+    <div class="flow-compare__col flow-compare__col--bad">
+      <div class="flow-compare__head">fail · 停止</div>
+      <div class="flow-compare__body">针对 credential / quota / unknown, 或 budget 用完。 停止 run(), 返回用户可读中文提示。</div>
+    </div>
+  </div>
 </div>
-
-<h2 id="classification">失败分类</h2>
-<dl class="defs">
-  <dt>rate_limit</dt>
-  <dd>
-    HTTP 429, 或 provider 返回 "rate_limit_exceeded"。决策:
-    退避后重试。退避时间由 retryAfter header 或指数退避 (1s, 2s, 4s) 决定。
-  </dd>
-  <dt>timeout</dt>
-  <dd>
-    请求超过 timeout 限制 (例如 30s 未响应)。决策: 退避后重试。
-    连续超时可能意味着 provider 故障, 跑 N 次后放弃。
-  </dd>
-  <dt>context_overflow</dt>
-  <dd>
-    HTTP 400 + "context_length_exceeded", 或 provider 返回 "too many tokens"。
-    决策: 触发第 06 章的 compact (force compact, 不等阈值), 重新拼 messages
-    后重试。
-  </dd>
-  <dt>parse_error</dt>
-  <dd>
-    LLM 返回的内容无法解析 (例如 tool_calls JSON 损坏)。决策: 写一条 user
-    message 告诉 LLM "你的上一次输出解析失败, 请重试", 重新调 LLM。
-    这是"软错误", LLM 通常能改正。
-  </dd>
-  <dt>truncation</dt>
-  <dd>
-    LLM 输出因长度限制被截断 (finishReason === "length")。决策: 注入
-    continuation reminder, 调 LLM 续写, 累积多段输出。
-  </dd>
-  <dt>unknown</dt>
-  <dd>
-    其他异常 (网络断开、provider 5xx 等)。决策: 退避后重试, 跑 N 次 后放弃。
-  </dd>
-</dl>
-
-<h2 id="interfaces">接口形状: 在写实现前钉死</h2>
-<pre class="code-block"><code>// 教学简化版, 仓库真实实现见 GitHub 永久链接
-export type LLMErrorKind =
-  | "rate_limit" | "timeout" | "context_overflow" | "parse_error" | "truncation" | "unknown";
-
-export type RecoveryAction =
-  | { kind: "retry"; retryAfter: number }      // 退避后重试
-  | { kind: "compact_and_retry" }              // 压缩后重试
-  | { kind: "inject_continuation" }            // 注入续写 reminder
-  | { kind: "abort"; reason: string };         // 放弃
-  | { kind: "noop" };                          // 继续 (无失败)
-
-export function classifyLLMError(err: unknown): LLMErrorKind;
-export function decideRecovery(
+<p>
+  <strong>实现细节</strong>: 4 种动作的<strong>协议</strong> —
+  continue / compact 在 agent.ts 调 LLM 之前, 由调用方负责执行
+  动作; backoff 在 LLM 调用之间, 调用 sleep(); fail 在最外层
+  try/catch 里 throw 或 return, 由 repl.ts 渲染中文提示。
+</p>
+<h2 id="decide">decideRecovery 纯函数: 不在内部修改 state</h2>
+<p>
+  <strong>用途</strong>: 决策函数必须<strong>纯</strong> — 同样
+  输入永远同样输出, 不修改 state, 不调外部 API, 不读时间。
+  这才能单元测试覆盖。
+</p>
+<p>
+  <strong>真实场景</strong>: 测试想验证"network + retry 4/5
+  → backoff", 但如果 decideRecovery 内部 <code>state.apiRetryCount++</code>,
+  测一次后 state 变成 5/5, 第二次用同一个 state 再调, 期望 backoff 但实际
+  fail, 测试 flaky。 纯函数没这个问题, 同样的输入两次
+  都返回同样的动作。
+</p>
+<p>
+  <strong>设计思想</strong>: <strong>决策 vs 执行分离</strong> —
+  decideRecovery 只<strong>算</strong>"应该做什么", 调方负责
+  "做" 和 "state++"。 看
+  <a href="https://github.com/pingp76/swoopcode/blob/main/src/recovery.ts#L198"><code>src/recovery.ts</code> 第 198 行的 <code>decideRecovery</code></a>:
+</p>
+<pre><code class="language-typescript">export function decideRecovery(
   kind: LLMErrorKind,
   state: RecoveryState,
-): RecoveryAction;</code></pre>
-
-<h2 id="state">RecoveryState: 本轮内闭包, 不跨 run 共享</h2>
-<pre class="code-block"><code>// 教学简化版, 仓库真实实现见 GitHub 永久链接
-export interface RecoveryState {
-  attempt: number;          // 当前 run() 内重试次数
-  lastError?: LLMErrorKind; // 上一次失败类型
-  accumulatedTruncation: string;  // 累积因 truncation 截断的多段输出
-}
-
-export function createRecoveryState(): RecoveryState {
-  return { attempt: 0, accumulatedTruncation: "" };
+  config: RecoveryConfig = DEFAULT_RECOVERY_CONFIG,
+): RecoveryAction {
+  switch (kind) {
+    case "network":
+      return state.apiRetryCount &lt; config.maxApiRetries ? "backoff" : "fail";
+    case "rate_limit":
+      return state.apiRetryCount &lt; config.maxApiRetries ? "backoff" : "fail";
+    case "context_length":
+      return state.compactRetryCount &lt; config.maxCompactRetries
+        ? "compact" : "fail";
+    case "output_interrupted":
+      return state.continueRetryCount &lt; config.maxContinueRetries
+        ? "continue" : "fail";
+    case "credential":
+    case "quota":
+    case "unknown":
+      return "fail";
+  }
 }</code></pre>
 <p>
-  <code>createRecoveryState()</code> 在每次 <code>agent.run()</code> 入口
-  调用一次, 状态是闭包内的临时态。这与第 04 章 subagent 的"每次 创建独立
-  history" 同一套工厂模式, 防止跨调用污染。
+  <strong>实现细节</strong>: <strong>故意不在内部 ++</strong> —
+  注释里明确说"decideRecovery 是纯函数, 故意不在内部递增
+  state, 这样调用方能清楚控制什么时候消耗一次重试预算, 测试
+  也更容易覆盖。 常见坑: 在决策函数里顺手修改计数, 导致日志
+  显示次数和真实次数错位。"
 </p>
-
-<h2 id="loop-integration">loop 接入: try/catch + classify + decide</h2>
-<pre class="code-block"><code>// 教学简化版, 仓库真实实现见 GitHub 永久链接
-async function run(query: string) {
-  history.add({ role: "user", content: query });
-  const recoveryState = createRecoveryState();
-
-  for (;;) {
-    const messages = history.getMessages();
-    let assistant;
-
-    try {
-      assistant = await llm.chat(messages);
-    } catch (err) {
-      const kind = classifyLLMError(err);
-      recoveryState.lastError = kind;
-      recoveryState.attempt++;
-
-      const decision = decideRecovery(kind, recoveryState);
-      switch (decision.kind) {
-        case "retry":
-          await sleep(decision.retryAfter);
-          continue;  // 回到 loop 顶部重试
-        case "compact_and_retry":
-          compactCurrentHistoryForRecovery(timing);  // 强制压缩
-          continue;
-        case "abort":
-          return `[Recovery aborted] ${decision.reason}`;
-        case "noop":
-          break;
-      }
-    }
-
-    // 处理 truncation (不是异常, 是 finishReason === "length")
-    if (assistant.finishReason === "length") {
-      recoveryState.accumulatedTruncation += assistant.content;
-      appendContinuationReminder(timing);
-      continue;  // 续写
-    }
-    if (recoveryState.accumulatedTruncation) {
-      // 续写完成, 拼上累积的输出
-      assistant.content = recoveryState.accumulatedTruncation + assistant.content;
-      recoveryState.accumulatedTruncation = "";
-    }
-
-    history.add(assistant);
-    if (!assistant.tool_calls) return assistant.content;
-    // ... tool execution ...
-  }
+<p>
+  默认参数 <code>config = DEFAULT_RECOVERY_CONFIG</code> 让调用方
+  不传 config 也能用 (用默认值), 但测试时显式传 small config
+  (maxApiRetries: 0) 验证"立即 fail" 边界。
+</p>
+<h2 id="state-scope">RecoveryState 作用域: per-request, 不全局</h2>
+<p>
+  <strong>用途</strong>: 重试 budget 必须<strong>每次 agent.run()</strong>
+  重建, 不持久化, 不全局共享。 1 个 user 的 rate_limit 不应该
+  消耗下一个 user 的 budget。
+</p>
+<p>
+  <strong>真实场景</strong>: 30 个 user 同时用 harness, user A
+  触发 rate_limit 重试 5 次后 fail, 但 user B 的 budget 也耗尽了
+  (因为全局共享), user B 第一次 LLM 调用就 fail。 这就是
+  "重试 budget 全局" 的惨案。
+</p>
+<p>
+  <strong>设计思想</strong>: <strong>作用域清晰</strong> — RecoveryState
+  的注释里明确写"单次 agent.run() 的恢复计数器, 不应该跨
+  user turn 复用"。 看
+  <a href="https://github.com/pingp76/swoopcode/blob/main/src/recovery.ts#L86"><code>src/recovery.ts</code> 第 86 行的 <code>createRecoveryState</code></a>:
+</p>
+<pre><code class="language-typescript">export function createRecoveryState(): RecoveryState {
+  // RecoveryState 是"单次 agent.run() 的恢复计数器", 不应该跨 user turn 复用。
+  // 如果把它做成全局状态, 一个用户请求触发的 rate limit 可能导致下一个请求直接 fail。
+  return { apiRetryCount: 0, compactRetryCount: 0, continueRetryCount: 0 };
 }</code></pre>
-
-<h2 id="trap">反例梯度</h2>
-
-<div class="card card--trap">
-  <div class="card__head">
-    <span class="card__tag">新手错法 · A</span>
-  </div>
-  <div class="card__body">
-    <p><strong>常见错误:</strong>失败立即重试, 不分类。</p>
-    <p>
-      <strong>为什么错:</strong>rate limit 立即重试更糟, context overflow
-      重试仍然 overflow。
-    </p>
-    <p>
-      <strong>正确做法:</strong>classifyLLMError + decideRecovery,
-      不同失败不同策略。
-    </p>
-  </div>
-</div>
-
-<div class="card card--trap">
-  <div class="card__head">
-    <span class="card__tag">中级错法 · B</span>
-  </div>
-  <div class="card__body">
-    <p><strong>常见错误:</strong>recovery state 写到 module-level 单例。</p>
-    <p><strong>为什么错:</strong>跨 run 污染, 本轮失败计数影响下一轮。</p>
-    <p>
-      <strong>正确做法:</strong>createRecoveryState() 工厂, run() 入口创建,
-      闭包内使用。
-    </p>
-  </div>
-</div>
-
-<div class="card card--trap">
-  <div class="card__head">
-    <span class="card__tag">高级错法 · C</span>
-  </div>
-  <div class="card__body">
-    <p><strong>常见错误:</strong>truncation 时丢弃已生成的半段输出。</p>
-    <p>
-      <strong>为什么错:</strong>LLM 已经想好思路, 丢弃后下次从空白开始,
-      内容衔接不上。
-    </p>
-    <p>
-      <strong>正确做法:</strong>累积因 truncation 截断的多段输出,
-      续写完成后拼回去。
-    </p>
-  </div>
-</div>
-
-<div class="card card--trap">
-  <div class="card__head">
-    <span class="card__tag">边界错法 · D</span>
-  </div>
-  <div class="card__body">
-    <p>
-      <strong>常见错误:</strong>context overflow 触发 compact, 但 compact
-      失败时无限重试。
-    </p>
-    <p>
-      <strong>为什么错:</strong>compact 自身可能因 LLM 调用失败而失败, 死循环。
-    </p>
-    <p>
-      <strong>正确做法:</strong>compact 失败时降级为截断 messages, 截断仍失败时
-      abort。
-    </p>
-  </div>
-</div>
-
-<h2 id="validate">如何验证 (本章 Validation 卡片)</h2>
-<div class="card card--validation">
-  <div class="card__head">
-    <span class="card__tag">Validation · 第 11 章</span>
-  </div>
-  <div class="card__body">
-    <p>
-      <strong>rate_limit 退避重试:</strong>fake LLM 前 2 次抛 rate_limit, 第 3
-      次返回正常响应, agent.run() 跑通, spy 验证 sleep 被调用 2 次
-      (退避后重试)。
-    </p>
-    <p>
-      <strong>context_overflow 触发 compact:</strong>fake LLM 抛
-      context_overflow, 跑完后第 06 章的 compactCurrentHistoryForRecovery 被调用
-      1 次, agent.run() 继续重试。
-    </p>
-    <p>
-      <strong>truncation 续写累积:</strong>fake LLM 第 1 轮 finishReason ===
-      "length" 返回半段, 第 2 轮 finishReason === "stop" 返回剩余, 最终
-      history.add(assistant).content 是完整输出。
-    </p>
-    <p>
-      <strong>retry 上限放弃:</strong>fake LLM 永远抛 rate_limit, 跑
-      DEFAULT_RECOVERY_CONFIG.maxRetries (例如 5) 次后, agent.run() 返回
-      "[Recovery aborted] ..." 字符串。
-    </p>
-    <p>
-      <strong>recovery state 不跨 run 共享:</strong>第 1 次 run() 触发
-      rate_limit 重试 3 次后 abort, 第 2 次 run() 的 recoveryState.attempt
-      应当从 0 开始。
-    </p>
-  </div>
-</div>
-
-<h2 id="lookback">回望第 00–10 章: 哪些原则在本章兑现了</h2>
+<p>
+  <strong>实现细节</strong>: 工厂函数而非全局变量, 强制"每次
+  新建"。 agent.run(query) 第一行 <code>const recovery = createRecoveryState()</code>,
+  整个 run() 期间所有 retry 共享这一个 instance, run() 返回时
+  销毁。 这是 Reference 章节 "模式 1 · 工厂 + 闭包" 的应用 —
+  状态在闭包内, 外部不能 "误改全局"。
+</p>
+<h2 id="config">DEFAULT_RECOVERY_CONFIG: 5/1/2/3 秒的经验值</h2>
+<p>
+  <strong>用途</strong>: 给 4 个数字合理的默认值, 用户不传也能用。
+  教学项目优先<strong>直观</strong>而非<strong>通用</strong>, 把
+  经验值写死, 不开放配置中心。
+</p>
+<p>
+  <strong>真实场景</strong>: 默认 maxApiRetries=5, retryDelayMs=3000。
+  user 跑 5 次 retry × 3 秒 = 15 秒后 fail, 给用户"现在过载" 提示,
+  不傻等 1 小时。
+</p>
+<p>
+  <strong>设计思想</strong>: <strong>经验值 + 教学优先</strong> —
+  看
+  <a href="https://github.com/pingp76/swoopcode/blob/main/src/recovery.ts#L75"><code>src/recovery.ts</code> 第 75 行的 <code>DEFAULT_RECOVERY_CONFIG</code></a>:
+</p>
+<pre><code class="language-typescript">export const DEFAULT_RECOVERY_CONFIG: RecoveryConfig = {
+  maxApiRetries: 5,
+  maxCompactRetries: 1,
+  maxContinueRetries: 2,
+  retryDelayMs: 3000,
+};</code></pre>
+<p>
+  <strong>实现细节</strong>: 4 个数字的<strong>理由</strong>:
+</p>
 <ul>
-  <li>
-    <strong>工厂模式再次回报:</strong>createRecoveryState() 工厂, 闭包内状态,
-    不跨 run 污染。
+<li>
+<strong>maxApiRetries=5</strong>: 网络抖 5 次仍 fail 几乎都是真
+    挂了 (不是临时问题), 5 次 ≈ 15 秒 (含 3 秒 sleep × 5), 用户
+    不会觉得太慢。
   </li>
-  <li>
-    <strong>错误降级原则:</strong>第 08 章 Hook 抛错降级, 本章 LLM 抛错也降级,
-    一致的设计哲学。
+<li>
+<strong>maxCompactRetries=1</strong>: 压缩 1 次仍过长 = 历史太
+    长, 继续压缩会丢关键信息, 1 次够。
   </li>
-  <li>
-    <strong>事实与视图分离:</strong>recovery 决策不修改 history 已有消息,
-    只追加新消息 (reminder / abort 通知), 维持 history 的 append-only 性质。
+<li>
+<strong>maxContinueRetries=2</strong>: 输出截断 2 次仍截断 = 模型
+    输出超长 (本身有问题), 2 次够。
   </li>
-  <li>
-    <strong>prepareMessages 复用:</strong>compact_and_retry 复用第 06
-    章的压缩管道, 不重新发明压缩逻辑。
+<li>
+<strong>retryDelayMs=3000</strong>: 3 秒是经验值, 太短仍触发
+    rate_limit, 太长用户体验差。 想要 jitter (随机化) 留 P2 阶段。
   </li>
 </ul>
-
-<h2 id="forward">前瞻张力: 留给后续章节</h2>
-<dl class="defs">
-  <dt>recovery 决策落盘</dt>
-  <dd>
-    第 15 章 transcript 会记录"LLM 调用何时失败、何时 retry、retry 多少次",
-    用于调试和 eval。
-  </dd>
-  <dt>recovery 与 async run 协作</dt>
-  <dd>
-    第 13 章 async run 会在主 loop 阻塞等后台任务时, 同时跑 recovery 退避,
-    不浪费主 loop 时间。
-  </dd>
-  <dt>recovery 与 schedule 协作</dt>
-  <dd>
-    第 14 章 schedule 触发的任务, recovery 策略可能不同 (例如更激进地放弃,
-    因为是后台任务)。
-  </dd>
-  <dt>recovery 配置热更新</dt>
-  <dd>
-    DEFAULT_RECOVERY_CONFIG 应当支持热更新 (运行时改退避策略), 不需要重启
-    harness。
-  </dd>
-</dl>
-
-<h2 id="vibe-coding-11">本次如何 vibe code: 第 11 章的三件套</h2>
-
-<h3 id="vibe-feed-11">拆卡: 4 轮迭代的具体产物</h3>
+<h2 id="prompt">3 段提示文案: notice / failure / reason</h2>
+<p>
+  <strong>用途</strong>: 用户在 REPL 看到的中文提示必须<strong>人话</strong>,
+  不能只丢一句 "Error: 429"。 3 段不同用途的提示:
+</p>
+<div class="figure figure--stack">
+  <div class="figure__title">图 3 · 3 段提示文案的边界</div>
+  <div class="flow-stack">
+    <div class="flow-stack__layer flow-stack__layer--warn">
+      <div class="flow-stack__label">formatRecoveryNotice · 恢复中</div>
+      <div class="flow-stack__body">"LLM 调用失败, 正在重试 3/5, 3 秒后继续..." — 走 logger.warn, 用户看到"系统在等, 别急"。</div>
+    </div>
+    <div class="flow-stack__layer flow-stack__layer--bad">
+      <div class="flow-stack__label">formatFailureMessage · 最终失败</div>
+      <div class="flow-stack__body">"LLM token 额度或账户余额不足, 请稍后或补充额度后再试。" — 走 repl.ts 渲染, 给用户具体行动指引。</div>
+    </div>
+    <div class="flow-stack__layer flow-stack__layer--stable">
+      <div class="flow-stack__label">error.message · 原始 SDK 错误</div>
+      <div class="flow-stack__body">"Request failed with status code 429" — 附加在 failure message 括号里, 给开发者看, 用户看不懂。</div>
+    </div>
+  </div>
+</div>
+<p>
+  <strong>实现细节</strong>: 看
+  <a href="https://github.com/pingp76/swoopcode/blob/main/src/recovery.ts#L242"><code>src/recovery.ts</code> 第 242 行的 <code>formatRecoveryNotice</code></a>
+  + 第 276 行的 <code>formatFailureMessage</code>。 failure message
+  故意分 7 类, 每类一句中文, 不复用通用模板, 因为 credential 和
+  quota 的"用户应该做什么" 完全不同。
+</p>
+<h2 id="sleep">sleep: 单独导出, 测试可 mock</h2>
+<p>
+  <strong>用途</strong>: backoff 动作需要 sleep, 但单测不应该
+  真等 3 秒。 把 sleep 单独导出, 测试用 <code>vi.useFakeTimers()</code>
+  mock, 跑 1000 个 retry 测试也不会真等。
+</p>
+<p>
+  <strong>真实场景</strong>: 测试"network 重试 5 次, 第 6 次
+  succeed", 不 mock sleep 就要等 5×3=15 秒, vitest 跑 100 个
+  recovery 测试要 25 分钟。 mock 之后瞬时跑完。
+</p>
+<p>
+  <strong>设计思想</strong>: <strong>副作用提出来</strong> — sleep
+  是副作用, 应该<strong>可替换</strong>, 不内联在 backoff
+  流程里。 看
+  <a href="https://github.com/pingp76/swoopcode/blob/main/src/recovery.ts#L318"><code>src/recovery.ts</code> 第 318 行的 <code>sleep</code></a>:
+</p>
+<pre><code class="language-typescript">export function sleep(ms: number): Promise&lt;void&gt; {
+  return new Promise((resolve) =&gt; setTimeout(resolve, ms));
+}</code></pre>
+<p>
+  <strong>实现细节</strong>: agent.ts 调
+  <code>if (action === "backoff") await sleep(retryDelayMs)</code>,
+  简单, 可读, 可测试。 这是 Reference 章节 "模式 17 · Test Doubles
+  测试替身" 的应用 — 副作用函数单独导出, 测试时换实现。
+</p>
+<h2 id="loop-integration">主循环集成: 4 个恢复点</h2>
+<p>
+  <strong>用途</strong>: recovery 模块在 agent.ts 集成有 4 个点:
+</p>
 <ol>
-  <li>
-    <strong>第 1 轮 · 接口</strong>。让 LLM 给出
-    <code>classifyLLMError()</code> / <code>decideRecovery()</code> /
-    <code>createRecoveryState()</code> 三个函数签名, 以及 6 种 LLMErrorKind
-    的对照表。本轮不写实现, 重点钉"不同失败不同策略"。
+<li>
+<strong>LLM 调用处</strong>: try/catch 包住 <code>llm.chat()</code>,
+    catch 后调 <code>classifyLLMError</code> → <code>decideRecovery</code>
+    → 执行动作。
   </li>
-  <li>
-    <strong>第 2 轮 · 接线</strong>。让 LLM 给出 agent.run() 主循环的 stub, 包
-    try/catch 但永远走 noop 分支 (catch 块为空)。本轮 review 重点: recoveryState
-    在 run() 入口创建, 闭包内使用。
+<li>
+<strong>backoff 执行</strong>: 调 <code>sleep(retryDelayMs)</code>,
+    <code>state.apiRetryCount++</code>, continue 循环。
   </li>
-  <li>
-    <strong>第 3 轮 · 边界</strong>。让 LLM 写 classify + decide +
-    主循环接入。本轮 review 重点: recovery 不修改已有 history 消息, truncation
-    累积, retry 上限。
+<li>
+<strong>compact 执行</strong>: 调
+    <code>compressor.compress(history, { level: "aggressive" })</code>,
+    <code>state.compactRetryCount++</code>, continue 循环。
   </li>
-  <li>
-    <strong>第 4 轮 · 验证</strong>。让 LLM 写
-    <code>test/recovery.test.ts</code>。本轮 review 重点: "rate_limit 退避重试"
-    和 "retry 上限放弃" 两条必须有 spy 验证。
+<li>
+<strong>continue 执行</strong>: append 一条 user message "请从断点
+    继续", <code>state.continueRetryCount++</code>, continue 循环。
   </li>
 </ol>
-
-<h3 id="vibe-review-11">Review: 第 11 章专属 checklist</h3>
+<p>
+  <strong>设计思想</strong>: <strong>4 个动作 4 个分支</strong> —
+  每个动作的执行逻辑不同, 用 switch 处理, 不混在 if/else 链里。 公共部分
+  (分类 + 决策 + 日志) 提到 switch 之前; 计数器因动作而异, 递增留在各自分支里。
+</p>
+<p>
+  <strong>实现细节</strong>: agent.ts 第 3 步 (call LLM) 的伪代码:
+  <code>while (true) { try { return await llm.chat(messages, tools); } catch (err) { const kind = classifyLLMError(err); const action = decideRecovery(kind, recovery, config); logger.warn(formatRecoveryNotice(action, kind, recovery, config)); switch (action) { case "backoff": await sleep(config.retryDelayMs); recovery.apiRetryCount++; continue; case "compact": ...; case "continue": ...; case "fail": throw new Error(formatFailureMessage(kind, err)); } } }</code>。
+</p>
+<h2 id="fake-test">fake test: 纯函数覆盖 4 种动作</h2>
+<p>
+  <strong>用途</strong>: recovery 模块的测试<strong>不需要</strong>
+  LLM, 纯函数测试, 输入错误 + state, 期望动作。 这是教学版
+  最大优势 — 业务逻辑和副作用彻底分离, 测业务逻辑零成本。
+</p>
+<p>
+  <strong>真实场景</strong>: 用户写测试覆盖 4 种动作的 12 个
+  边界 (4 动作 × 3 状态), 加上 classifyLLMError 的 7 类错误
+  分类测试, 总共 20 个单测覆盖 recovery 模块, 5 分钟跑完。
+</p>
+<p>
+  <strong>设计思想</strong>: <strong>纯函数 = 单元测试黄金</strong> —
+  decideRecovery / classifyLLMError 都是纯函数, 同样输入永远同样
+  输出, 测试不需要 setup/teardown。 看测试示例:
+</p>
+<pre><code class="language-typescript">test("network + 0/5 -&gt; backoff", () =&gt; {
+  expect(decideRecovery("network", { apiRetryCount: 0, compactRetryCount: 0, continueRetryCount: 0 }))
+    .toBe("backoff");
+});
+
+test("network + 5/5 -&gt; fail", () =&gt; {
+  expect(decideRecovery("network", { apiRetryCount: 5, compactRetryCount: 0, continueRetryCount: 0 }))
+    .toBe("fail");
+});
+
+test("context_length + 0/1 -&gt; compact", () =&gt; {
+  expect(decideRecovery("context_length", { apiRetryCount: 0, compactRetryCount: 0, continueRetryCount: 0 }))
+    .toBe("compact");
+});
+
+test("credential 任何 state 都 fail", () =&gt; {
+  expect(decideRecovery("credential", { apiRetryCount: 0, compactRetryCount: 0, continueRetryCount: 0 }))
+    .toBe("fail");
+  expect(decideRecovery("credential", { apiRetryCount: 5, compactRetryCount: 0, continueRetryCount: 0 }))
+    .toBe("fail");
+});
+
+test("classifyLLMError: 401 -&gt; credential, 429 + quota -&gt; quota, 其他 429 -&gt; rate_limit", () =&gt; {
+  expect(classifyLLMError({ status: 401, message: "" })).toBe("credential");
+  expect(classifyLLMError({ status: 403, message: "forbidden" })).toBe("credential");
+  expect(classifyLLMError({ status: 429, message: "insufficient_quota" })).toBe("quota");
+  expect(classifyLLMError({ status: 429, message: "rate limit exceeded" })).toBe("rate_limit");
+  expect(classifyLLMError({ status: 413, message: "context length exceeded" })).toBe("context_length");
+  expect(classifyLLMError({ status: 500, message: "" })).toBe("network");
+  expect(classifyLLMError(new Error("weird thing"))).toBe("unknown");
+});
+
+test("decideRecovery 是纯函数: 同样输入多次调用结果相同", () =&gt; {
+  const state = { apiRetryCount: 3, compactRetryCount: 0, continueRetryCount: 0 };
+  const r1 = decideRecovery("network", state);
+  const r2 = decideRecovery("network", state);
+  expect(r1).toBe(r2);
+  expect(state.apiRetryCount).toBe(3);   // 内部没修改 state
+});</code></pre>
+<p>
+  <strong>实现细节</strong>: 这 6 个测试覆盖 3 种动作 (backoff / compact / fail) + 6 类分类 +
+  纯函数性质; continue 动作和 output_interrupted 分类照同样写法补齐。 不需要 mock, 不需要 fake timer (sleep 单独
+  测试, 走 vi.useFakeTimers), 全程 &lt;1ms 跑完。
+</p>
+<h2 id="common-confusion">常见误解: 4 种动作 ≠ 重试次数</h2>
+<p>
+  <strong>误解 1: "所有错误都 retry N 次?"</strong> 错。 4 种
+  动作<strong>类型</strong>不同, 不是次数不同。 context_length
+  retry 5 次 = 浪费 5 次 budget 在不可能成功的请求上, 应该
+  compact 1 次仍 fail 就 fail。
+</p>
+<p>
+  <strong>误解 2: "quota 等一下就好?"</strong> 错。 quota = 余额
+  耗尽, 等 1 小时还是耗尽, 必须 fail 提示用户充值。 rate_limit
+  ≠ quota, 两者都可能是 429, 但 message 含 quota/billing 关键字
+  是 quota, 否则是 rate_limit。
+</p>
+<p>
+  <strong>误解 3: "decideRecovery 应该自己 +1?"</strong> 错。
+  决策和执行分离 — decide 只算"应该做什么", 调用方负责"做"
+  和 "state++"。 内置 ++ 会让日志和真实次数错位, 测试也难。
+</p>
+<p>
+  <strong>误解 4: "formatFailureMessage 是装饰, 可选?"</strong>
+  错。 用户<strong>看不懂</strong> "Error: 429", 必须有
+  中文 failure message 告诉用户"现在过载, 等下再试" 或
+  "余额不足, 去充值"。 这不是 nice-to-have, 是核心功能。
+</p>
+<h2 id="trap">反例梯度: 4 个常见错误</h2>
+<div class="cards-grid">
+  <div class="card card--bad">
+    <div class="card__head"><span class="card__tag">反例 1 · 所有错误都 backoff 5 次</span></div>
+    <div class="card__body">
+      <p>用户写 <code>if (error) { for (let i = 0; i &lt; 5; i++) { await sleep(3000); return await llm.chat(...); } }</code>。
+        错。 context_length 错误 5 次重试, 每次都带同样长的上下文,
+        全部 413 失败, 浪费 15 秒 + 5 次 token 浪费。 正确: 先
+        classify → decide, context_length 走 compact 1 次, 仍 fail
+        才放弃。</p>
+    </div>
+  </div>
+  <div class="card card--bad">
+    <div class="card__head"><span class="card__tag">反例 2 · 错误分类顺序错</span></div>
+    <div class="card__body">
+      <p>先判断 <code>status === 429</code> 返回 rate_limit, 再
+        判断 message 含 quota。 错。 顺序反了, quota (429 +
+        "insufficient_quota") 会被先识别为 rate_limit, 走 backoff
+        5 次, 5 次后 fail 提示"LLM 服务暂时不可用", 但真实是
+        "余额不足"。 正确顺序: credential → quota → rate_limit。</p>
+    </div>
+  </div>
+  <div class="card card--bad">
+    <div class="card__head"><span class="card__tag">反例 3 · RecoveryState 全局</span></div>
+    <div class="card__body">
+      <p>把 <code>let apiRetryCount = 0</code> 放在模块顶层, 所有
+        agent.run 共享。 错。 user A 触发 rate_limit 重试 5 次, 把
+        全局 budget 用完, user B 第一次 LLM 调用就 fail。 正确:
+        <code>createRecoveryState()</code> 在 agent.run 第一行调用,
+        run() 返回时销毁。</p>
+    </div>
+  </div>
+  <div class="card card--bad">
+    <div class="card__head"><span class="card__tag">反例 4 · 输出中文 "未知错误"</span></div>
+    <div class="card__body">
+      <p>对 unknown 错误, 给用户显示 "未知错误, 请稍后重试"。
+        错。 unknown 错误可能是协议错误 / SDK bug, 用户重试 100
+        次还是 fail。 正确: 输出 "LLM 调用出现未知错误: ${errMsg}",
+        把原始 error.message 暴露给开发者, 同时引导用户 "请提
+        issue"。</p>
+    </div>
+  </div>
+</div>
+<h2 id="validate">Validation: 4 条不变量检验</h2>
 <ol>
-  <li>
-    <strong>recovery state 工厂化。</strong>createRecoveryState() 在 run()
-    入口创建, 不在 module-level。验证:
-    <code>grep -n 'let retryCount\|^let attempt' src/agent.ts</code> 应当 0 行。
-  </li>
-  <li>
-    <strong>recovery 不修改 history 已有消息。</strong>catch 块只能 history.add
-    新消息, 不调 history.replaceEntries / remove。验证:
-    <code>grep -n 'history.replaceEntries' src/agent.ts</code> 在 catch 块内应当
-    0 行。
+<li>
+<strong>分类顺序正确</strong>: 同样的 error (status=429 +
+    message="insufficient_quota") 永远返回 quota, 不会先识别为
+    rate_limit。 验证: 单测覆盖 4 个互斥 case (401/429+quota/429
+    普通/413), 看返回值。
   </li>
-  <li>
-    <strong>retry 有上限。</strong>decideRecovery 收到 attempt &gt; maxRetries
-    时返回 abort。验证: Validation 卡片"retry 上限放弃" 那条测试通过。
+<li>
+<strong>纯函数性质</strong>: 同样输入 (kind + state) 多次调用
+    decideRecovery, 返回值永远相同, state 内部不被修改。 验证:
+    单测, 调用 2 次, state 字段对比。
   </li>
-  <li>
-    <strong>truncation 累积输出。</strong>finishReason === "length" 时
-    appendContinuationReminder, 不丢弃已生成内容。验证: Validation
-    卡片"truncation 续写累积" 那条测试通过。
+<li>
+<strong>budget 上限</strong>: backoff / compact / continue 这 3 种动作的 budget 用完后, 必须
+    返回 fail, 不能继续 retry。 验证: 单测覆盖 (5,0,0) → fail,
+    (0,1,0) → fail, (0,0,2) → fail。
   </li>
-  <li>
-    <strong>compact_and_retry 复用第 06 章。</strong>不重新发明压缩。验证:
-    <code>grep -n 'compactCurrentHistoryForRecovery' src/agent.ts</code> ≥ 1
-    行。
+<li>
+<strong>credential/quota/unknown 立即 fail</strong>: 这 3 类
+    任何 state 都返回 fail, 不消耗 budget。 验证: 单测 (0,0,0) 和
+    (5,1,2) 都 expect fail。
   </li>
 </ol>
-
-<h3 id="vibe-debug-11">调试: 第 11 章典型伪装</h3>
-<ol>
-  <li>
-    <strong>伪装 A · 失败立即重试。</strong>症状: catch 块内
-    <code>await llm.chat(messages); continue;</code>。验证: Validation
-    卡片"rate_limit 退避重试" 那条, spy 验证 sleep 被调用。
-  </li>
-  <li>
-    <strong>伪装 B · retry state 跨 run 共享。</strong>症状: module-level
-    <code>let attempt = 0</code>。验证: Validation 卡片"recovery state 不跨 run
-    共享" 那条, 第二次 run() 的 attempt 应当从 0 开始。
-  </li>
-  <li>
-    <strong>伪装 C · truncation 丢弃半段输出。</strong>症状:
-    <code>if (assistant.finishReason === "length") continue;</code> 直接续写,
-    不累积之前的内容。验证: 最终 assistant.content 不含第一段已生成内容。
+<h2 id="lookback">回望: 哪些原则在本章兑现了</h2>
+<ul>
+<li>
+<strong>错误转动作</strong>: 7 类错误 → 4 种动作, 一一映射, 决策
+    纯函数, 不混执行。
   </li>
-</ol>
-
-<h3 id="vibe-iterate-11">迭代: 第 11 章 4 个 commit 节点</h3>
-<ol>
-  <li>
-    <code
-      >feat(ch11): 钉 classifyLLMError / decideRecovery / createRecoveryState
-      接口与 6 种失败分类</code
-    >
-    —— tsc 通过, 无实现。
+<li>
+<strong>决策 vs 执行分离</strong>: decideRecovery 只算"应该
+    做什么", 调用方负责 "做" 和 "state++"。
   </li>
-  <li>
-    <code
-      >feat(ch11): agent.run 主循环 try/catch stub + createRecoveryState</code
-    >
-    —— tsc 通过, catch 永远 noop。
+<li>
+<strong>作用域清晰</strong>: RecoveryState 每次 agent.run() 重建,
+    不全局共享。
   </li>
-  <li>
-    <code
-      >feat(ch11): classify + decide + retry 上限 + truncation 累积 +
-      compact_and_retry 复用第 06 章</code
-    >
-    —— 跑通 Validation 卡片前 4 条。
+<li>
+<strong>人话提示</strong>: 中文 failure message 告诉用户具体
+    行动 (充值 / 改 config), 不输出 "Error: 429"。
   </li>
-  <li>
-    <code>test(ch11): recovery state 不跨 run 共享 + retry 上限 abort</code> ——
-    全绿。
+<li>
+<strong>副作用可替换</strong>: sleep 单独导出, 测试用 fake
+    timer mock。
   </li>
-</ol>
-
+</ul>
+<h2 id="forward">前瞻张力: 留给后续章节</h2>
+<dl class="defs">
+<dt>Tool throw 错误</dt>
+<dd>
+    本章只覆盖 LLM 调用错误。 tool 内部 throw (如 fs.readFile
+    ENOENT) 走另一套恢复 — write "Tool error" tool message 不
+    fail, 让 LLM 自己决定重试或换工具。 错误分类不能把 tool
+    error 和 LLM error 混在一起。
+  </dd>
+<dt>用户主动取消 (ctrl-c)</dt>
+<dd>
+    user 按 ctrl-c 取消当前 run(), 应该 graceful shutdown —
+    保存 history + 释放 lock, 不应该 mid-write 文件导致 half
+    state。 这是 OS 信号的捕获, 不是 recovery 范畴, 但和
+    fail 动作的"立即停止" 边界要切清。
+  </dd>
+<dt>Subagent 错误传播</dt>
+<dd>
+    子 agent 抛 credential 错误, 父 agent 应该 fail 还是继续?
+    当前策略: 父 agent 把子 agent 错误包成 tool message "Subagent
+    failed: ${reason}", 让 LLM 自己决定重试或换子 agent 任务。
+    这是 Reference 章节 "模式 14 · Concurrent Limit" 的延伸。
+  </dd>
+</dl>
 <h2 id="prompt-card">Prompt Card (本章任务)</h2>
 <div class="card card--prompt">
   <div class="card__head">
     <span class="card__tag">Prompt Card · 第 11 章</span>
-    <button class="card__copy" type="button" data-copy-card>复制</button>
+    <button class="card__copy" data-copy-card="" type="button">复制</button>
   </div>
   <div class="card__body">
-    <p>
-      <strong>目标:</strong>实现 Recovery 模块, 分类 LLM 失败模式,
-      不同失败采取不同策略, 让 harness 在不确定的 LLM 面前持续工作。
-    </p>
-    <p>
-      <strong>场景:</strong>长任务跑到第 30 轮, LLM 突然 429 rate limit, harness
-      退避后重试 3 次, 第 4 次跑通, 主 loop 不中断; 另一种: context overflow
-      触发, harness 强制压缩后重试。
-    </p>
-    <p>
-      <strong>模块:</strong> <code>src/recovery.ts</code> (新) 暴露
-      <code>classifyLLMError()</code> / <code>decideRecovery()</code> /
-      <code>createRecoveryState()</code>; <code>src/agent.ts</code> 主循环包
-      try/catch + recovery 决策。
-    </p>
+    <p><strong>目标:</strong> 给 harness 加 recovery 模块, 7 类错误
+      分类 + 4 种恢复动作 + 3 段提示文案, agent 主循环按动作
+      走分支。</p>
+    <p><strong>场景:</strong> 30 个 user 同时用 harness, OpenAI 限
+      流 429, 每个 user 走 backoff 5 次 × 3 秒 = 15 秒, 5 次后
+      fail 提示"LLM 服务暂时不可用", user ctrl-c 不卡死。</p>
+    <p><strong>模块:</strong> <code>src/recovery.ts</code> (新) 暴露
+      <code>classifyLLMError(error)</code> / <code>decideRecovery(kind, state, config)</code> /
+      <code>formatRecoveryNotice(...)</code> / <code>formatFailureMessage(...)</code>
+      / <code>sleep(ms)</code> / <code>createRecoveryState()</code>;
+      <code>src/agent.ts</code> (改) 第 3 步 try/catch, 按动作
+      走分支。</p>
     <p><strong>边界 (LLM 必须遵守的 checklist):</strong></p>
     <ul>
-      <li>recovery state 工厂化, run() 入口创建, 不跨 run 共享</li>
-      <li>recovery 不修改 history 已有消息, 只追加新消息</li>
-      <li>retry 有上限, 默认 5 次, 超过后 abort 返回字符串</li>
-      <li>truncation 累积半段输出, 续写完成后拼回</li>
-      <li>compact_and_retry 复用第 06 章的压缩管道</li>
+      <li>7 类错误固定枚举, 分类顺序 credential → quota → rate_limit → context_length → network → unknown</li>
+      <li>4 种动作 continue / compact / backoff / fail, 一一映射, 不可新增</li>
+      <li>decideRecovery 纯函数, 不在内部修改 state, 调用方负责 ++</li>
+      <li>RecoveryState 每次 agent.run() 重建, 不全局共享</li>
+      <li>credential / quota / unknown 任何 state 都 fail, 不消耗 budget</li>
+      <li>failure message 必须是中文, 告诉用户具体行动, 不输出 "Error: 429"</li>
+      <li>sleep 单独导出, 测试用 fake timer mock, 不内联在 backoff 流程</li>
     </ul>
-    <p><strong>验证 (用 fake LLM + spy sleep, 逐条落到 vitest):</strong></p>
+    <p><strong>验证 (用纯函数 + vitest, 逐条断言):</strong></p>
     <ul>
-      <li>
-        fake LLM 前 2 次抛 rate_limit, 第 3 次返回正常, agent.run() 跑通, spy
-        验证 sleep 被调用 2 次
-      </li>
-      <li>
-        fake LLM 抛 context_overflow, 第 06 章 compact 被调用 1 次, agent.run()
-        继续重试
-      </li>
-      <li>
-        fake LLM 第 1 轮 finishReason === "length", 第 2 轮 finishReason ===
-        "stop", 最终 content 是完整输出
-      </li>
-      <li>
-        fake LLM 永远抛 rate_limit, 跑 5 次后, agent.run() 返回 "[Recovery
-        aborted] ..." 字符串
-      </li>
-      <li>第 2 次 run() 的 recoveryState.attempt 从 0 开始 (不跨 run 共享)</li>
+      <li>classifyLLMError 覆盖 7 类输入, 输出对应 kind</li>
+      <li>decideRecovery 覆盖 4 动作 × 3 state 边界 (0/budget/budget+1)</li>
+      <li>credential / quota / unknown 任何 state 都 fail</li>
+      <li>decideRecovery 是纯函数: 同输入 2 次返回相同, state 不变</li>
+      <li>sleep 单独测试, 用 vi.useFakeTimers 验证 1000 次不真等</li>
     </ul>
   </div>
 </div>
-
 <h2 id="practice">本章练习</h2>
 <ol>
-  <li>
-    故意在 catch 块内立即重试 (不 sleep), 跑测试, 看"rate_limit 退避重试"
-    是否抓到 (spy 验证 sleep 没被调用)。
+<li>
+    故意把所有错误都 backoff 5 次 (不分类), 跑 context_length 场景
+    测试, 看"统一重试" 是否抓到 (5 次后 fail 但浪费 5 次 budget
+    在不可能成功的请求上)。
   </li>
-  <li>
-    把 retryCount 提到 module-level, 跑测试, 看"recovery state 不跨 run 共享"
-    是否抓到。
+<li>
+    故意把分类顺序写错 (先 429, 再 quota), 跑
+    <code>classifyLLMError({status: 429, message: "insufficient_quota"})</code>
+    测试, 看"quota 应该不被识别为 rate_limit" 是否抓到。
   </li>
-  <li>
-    在 truncation 路径上丢弃 assistant.content, 跑测试, 看"truncation 续写累积"
-    是否抓到 (最终 content 缺第一段)。
+<li>
+    把 RecoveryState 提到模块顶层全局变量, 跑两个 agent.run 串联
+    测试, 看"全局共享 budget" 是否抓到 (第二个 run 第一次 LLM
+    调用就 fail, 因为 budget 被第一个用完)。
+  </li>
+<li>
+    在 decideRecovery 内部 <code>state.apiRetryCount++</code>,
+    跑"纯函数" 单测, 看"纯函数性质" 是否抓到 (state 被修改,
+    同输入 2 次返回不同)。
   </li>
 </ol>
-
 <h2 id="summary">本章小结</h2>
 <p>
-  本章给 harness 加了 Recovery 模块, 分类 6 种 LLM 失败模式 (rate_limit /
-  timeout / context_overflow / parse_error / truncation / unknown),
-  不同失败采取不同策略。recovery state 是 run() 入口的闭包内状态, retry 有上限,
-  truncation 累积 半段输出, compact_and_retry 复用第 06 章。下一章 (第 12 章)
-  我们会处理"长期计划" 的问题——Persistent Task, 把跨会话的 plan / TODO 落盘,
-  区分"会话内 TODO" 和"项目级 task"。
+  Recovery 是给 LLM 调用的<strong>错误转动作</strong>, 7 类
+  错误 × 4 种动作的纯函数决策。 核心是 5 个设计:
 </p>
-
-<h2 id="next">下一章伏笔</h2>
+<ul>
+<li>
+<strong>7 类错误</strong>: network / rate_limit / credential /
+    quota / context_length / output_interrupted / unknown, 分
+    类顺序固定 (credential → quota → rate_limit → ...)。
+  </li>
+<li>
+<strong>4 种动作</strong>: continue / compact / backoff / fail,
+    一一映射, 不可新增。
+  </li>
+<li>
+<strong>纯函数决策</strong>: decideRecovery 不修改 state, 调用方
+    负责 ++, 单元测试覆盖 12 个边界。
+  </li>
+<li>
+<strong>per-run state</strong>: RecoveryState 每次 agent.run()
+    重建, 不全局共享, 30 个 user 互不干扰。
+  </li>
+<li>
+<strong>人话提示</strong>: 中文 failure message 告诉用户具体
+    行动, 不输出 "Error: 429"。
+  </li></ul>
 <p>
-  第 11 章让 harness 在 LLM 失败时持续工作, 但"用户的长期计划" (例如"未来 3
-  周要重构 X") 仍然只在会话内 TODO 里。下一章 Task 模块会区分"会话内 TODO" (第
-  03 章) 和"项目级 task" (持久化到
-  <code>~/.claude/tasks/</code>), 跨会话保留, 让用户离开一周后
-  回来仍然能看到"上次没做完的事"。
+  下一章 (第 12 章) 展开 harness 的<strong>任务系统</strong> —
+  LLM 跑长任务 (1 小时跑测试 / 半小时跑 build / 1 天跑全项目
+  扫描) 时, 状态怎么持久化, 怎么挂到 TODO 列表上, 怎么和
+  subagent 配合。
 </p>
diff --git a/tutorial/chapters/12-task.html b/tutorial/chapters/12-task.html
index 3e9cf5f..bc6af7f 100644
--- a/tutorial/chapters/12-task.html
+++ b/tutorial/chapters/12-task.html
@@ -1,505 +1,732 @@
-<p class="article__eyebrow">第 12 章 · 长期计划落盘</p>
-<h1 class="article__title">跨会话的 plan: Persistent Task</h1>
+<p class="article__eyebrow">第 12 章 · 跨会话的长期任务</p>
+<h1 class="article__title">Task: 带依赖图的持久化任务组</h1>
 <p class="article__lede">
-  前面十一章让 harness 在单次会话内能聊天、调工具、跑子任务、压缩、拦权限、 留
-  hook、记 memory、cache 友好、recovery。但"用户的长期计划" (例如"未来 3
-  周要重构 X") 仍然只在会话内 TODO 里, 关掉就丢。这一章给 harness 加 Persistent
-  Task 模块, 区分"会话内 TODO" (第 03 章) 和"项目级 task" (持久化到
-  <code>~/.claude/tasks/</code>), 跨会话保留, 让用户离开一周
-  回来仍然能看到"上次没做完的事"。
+  第 03 章的 TODO 是"当前会话的短期执行清单", 但 LLM 跑长任务
+  (1 小时跑测试 / 半天跑全项目扫描 / 3 天跑大型迁移) 时, session
+  早就关掉了, TODO 一起丢。 这一章加 <strong>Task 系统</strong>:
+  TaskGroup 持久化到 <code>.tasks/groups/*.json</code>, 跨 session
+  恢复, 带<strong>依赖图</strong> (task_2 blockedBy task_1) +
+  <strong>状态机</strong> (pending → in_progress → completed /
+  failed / cancelled) + <strong>派生 ready / blocked</strong>
+  (不写入磁盘, 读取时算) + <strong>audit events</strong> (轻量
+  审计线索)。 读完后, 你能讲清"Task vs TODO" 的边界 (跨会话持久
+  化 vs 当前 session), 并能用 tmp 目录验证"环依赖拒绝" +
+  "ready 派生正确" + "terminal status 后不可改" 3 条不变量。
+</p>
+<nav aria-label="页内小节" class="article__meta" id="article-inline-toc"></nav>
+<hr class="rule"/>
+<h2 id="real-failure">真实失败故事: session 关闭, 跑了 2 小时的任务状态全丢</h2>
+<p>
+  写代码之前, 先看一个真实痛点。 用户的 team 用 harness 跑大型
+  迁移: "把 500 个 JS 文件转成 TS, 每天跑 1 小时, 5 天完成"。
+  团队 leader 看着第 1 天的 TODO 列表很开心, 关电脑下班。
+  第二天上班开 session, TODO 列表是空的, LLM 不知道"昨天转了
+  哪些文件", 也不知道"今天该从哪个开始"。
 </p>
-
-<nav id="article-inline-toc" class="article__meta" aria-label="页内小节"></nav>
-
-<hr class="rule" />
-
-<h2 id="delta-from-11">在第 11 章基础上改了什么</h2>
-<p>
-  这一章在 Composition Root 加 task 持久化层。task 数据落盘到
-  <code>~/.claude/tasks/&lt;task-id&gt;.md</code> 这样的 markdown 文件, 包含
-  plan / status / dependencies / next action 几个字段。agent 主循环在
-  SessionStart 时把 active task 列表注入到 system prompt, 会话过程中 LLM 可通过
-  update_task 工具更新 task 状态。task 与 session TODO (第 03 章) 严格分离: TODO
-  是会话内清单, task 是跨会话 plan。 对应到代码, 改动集中在 3 个文件:
-  <code>src/tasks.ts</code> (新)、
-  <code>src/tools/update_task.ts</code> (新)、<code>src/agent.ts</code> (改
-  SessionStart 注入 task)。
-</p>
-<div class="source-links" aria-label="本章 GitHub 永久链接">
-  <a
-    class="source-link"
-    href="https://github.com/pingp76/swoopcode/blob/main/src/tasks.ts"
-    target="_blank"
-    rel="noreferrer"
-    >1. src/tasks.ts: Task 持久化 (新)</a
-  >
-  <a
-    class="source-link"
-    href="https://github.com/pingp76/swoopcode/blob/main/src/task-store.ts"
-    target="_blank"
-    rel="noreferrer"
-    >2. src/task-store.ts: Task 存储后端 (新)</a
-  >
-  <a
-    class="source-link"
-    href="https://github.com/pingp76/swoopcode/blob/main/src/agent.ts"
-    target="_blank"
-    rel="noreferrer"
-    >3. src/agent.ts: SessionStart 注入 active tasks</a
-  >
-</div>
-
-<h2 id="author-thinking">作者怎么想的: 这一章的思考链</h2>
-<dl class="defs">
-  <dt>想清楚现象</dt>
-  <dd>
-    用户周一开了 harness, 列了一周计划 (3 个 task), 周三处理完第 1 个。
-    周五回来, harness 完全不记得"还剩 2 个 task"。现象是"会话内 TODO OK, 跨会话
-    plan 缺失"。
-  </dd>
-  <dt>想反例</dt>
-  <dd>
-    最朴素的反例是"task 写在 memory 里"。这有两个问题: 一是 memory 是
-    "用户级偏好" (例如"我喜欢简洁解释"), task 是"项目级 plan", 两者 职责不同;
-    二是 task 有 dependencies / status 字段, 写成 markdown
-    散文不适合状态机校验。
-  </dd>
-  <dt>想接口和不变量</dt>
-  <dd>
-    接口:
-    <code
-      >interface TaskStore { get(id), list({status, scope}), update(id, patch)
-      }</code
-    >。 不变量三条: (1) task 数据结构化 (status / dependencies / next_action
-    字段化), 不写散文, (2) task 落盘走 atomic write (第 15 章), (3) SessionStart
-    注入 active task 时按 status 过滤, 只注入 pending / in_progress, 不注入
-    completed。
-  </dd>
-  <dt>想怎么验证</dt>
-  <dd>
-    fake taskStore 预设 3 个 task (1 pending, 1 in_progress, 1 completed), 跑完
-    SessionStart 注入, LLM 看到的 system prompt 含 2 个 task (pending +
-    in_progress), completed 不出现。
-  </dd>
-</dl>
-
-<h2 id="observe-first">先观察: 两段故意有气味的实现</h2>
-
-<div class="note">
-  <p class="note__title">观察 1 · task 写在 memory 里</p>
-  <pre class="code-block"><code>// 教学简化版, 仓库真实实现见 GitHub 永久链接
-// 错误: 复用 memory 写 task
-await memoryStore.set("project", "project.tasks", `
-  1. 重构 user 组件 (status: in_progress)
-  2. 跑测试 (status: pending)
-  3. 部署 (status: pending)
-`);</code></pre>
-  <p><strong>问:</strong>为什么不复用 memory 写 task?</p>
-  <p>
-    <strong>答:</strong>三件事同时坏掉 —— 职责: memory 是"用户级偏好", task
-    是"项目级 plan"; 结构: 散文无法做 status / dependencies 字段 校验, LLM
-    改写时容易写错; 状态: task 列表渲染时需要按 status 过滤, 散文做不到。
-  </p>
-</div>
-
-<div class="note">
-  <p class="note__title">观察 2 · 不区分 status, 所有 task 全量注入</p>
-  <pre class="code-block"><code>// 教学简化版
-const allTasks = await taskStore.list();
-const prompt = allTasks.map(t =&gt; `- ${t.id}: ${t.description}`).join("\n");</code></pre>
-  <p><strong>问:</strong>为什么不区分 status?</p>
-  <p>
-    <strong>答:</strong>completed task 不应该再被 LLM 看到, 注入会占用 context
-    还误导 LLM "这件事还没做完"。active 过滤是 harness 的 默认行为。
-  </p>
-</div>
-
-<h2 id="interfaces">接口形状: 在写实现前钉死</h2>
-<pre class="code-block"><code>// 教学简化版, 仓库真实实现见 GitHub 永久链接
-export type TaskStatus = "pending" | "in_progress" | "completed" | "cancelled";
-export type TaskScope = "user" | "project";
-
-export interface Task {
-  id: string;
-  scope: TaskScope;
-  title: string;
-  description: string;
-  status: TaskStatus;
-  dependencies: string[];       // 依赖的其他 task id
-  nextAction?: string;           // 下一步具体动作
-  createdAt: number;
-  updatedAt: number;
-}
-
-export interface TaskStore {
-  get(id: string): Promise&lt;Task | null&gt;;
-  list(filter?: { status?: TaskStatus; scope?: TaskScope }): Promise&lt;Task[]&gt;;
-  create(task: Omit&lt;Task, "id" | "createdAt" | "updatedAt"&gt;): Promise&lt;Task&gt;;
-  update(id: string, patch: Partial&lt;Task&gt;): Promise&lt;Task&gt;;
-  // 渲染 active task (pending + in_progress) 为 system prompt 片段
-  renderActive(): Promise&lt;string&gt;;
-}</code></pre>
-
-<h2 id="status-rules">status 状态机</h2>
 <ol>
-  <li>pending → in_progress (激活)</li>
-  <li>in_progress → completed (完成)</li>
-  <li>in_progress → pending (回退, 例如发现需要先做其他事)</li>
-  <li>pending / in_progress → cancelled (取消, 不再恢复)</li>
-  <li>
-    不允许 pending → completed (跳过 in_progress, 与第 03 章 TODO
-    状态机规则一致)
+<li>
+<strong>症状</strong>: 关闭 session = TODO 全丢, LLM 重复跑
+    已完成的文件 (浪费 token) 或跳过依赖未完成的文件 (出现
+    编译错误)。
+  </li>
+<li>
+<strong>根因</strong>: TODO 是 session 内存状态, 不持久化。
+    session 关闭 = TODO 数组销毁。
+  </li>
+<li>
+<strong>误诊</strong>: 团队以为是 LLM "忘了", 写复杂 prompt
+    让 LLM "记住" — LLM 没法跨 session 持久化, 写了也丢。
+  </li>
+<li>
+<strong>真正的修法</strong>: 持久化 Task 系统 — 任务组写到
+    <code>.tasks/groups/migration-001.json</code>, 跨 session
+    恢复; 任务带依赖图, LLM 知道"task_5 依赖 task_3, task_3
+    没完成所以 task_5 不能开始"; 状态机保证"completed 不能
+    变回 in_progress"。
   </li>
 </ol>
-
-<h2 id="loop-integration">loop 接入: SessionStart 注入 active task</h2>
-<pre class="code-block"><code>// 教学简化版, 仓库真实实现见 GitHub 永久链接
-// SessionStart 阶段
-const activeTaskPrompt = await taskStore.renderActive();  // 仅 pending + in_progress
-const basePrompt = history.getSystemPrompt() ?? "";
-history.setSystemPrompt(`${basePrompt}\n\n# Active Tasks\n${activeTaskPrompt}`);
-
-// update_task 工具
-async function updateTask(args) {
-  // 状态机校验
-  const current = await taskStore.get(args.id);
-  if (!current) return { toolCallId: call.id, content: `Unknown task ${args.id}` };
-  if (current.status === "pending" &amp;&amp; args.status === "completed") {
-    return { toolCallId: call.id, content: `Error: cannot skip in_progress for ${args.id}` };
-  }
-  await taskStore.update(args.id, { status: args.status, ...args.patch });
-  return { toolCallId: call.id, content: `Task ${args.id} → ${args.status}` };
-}</code></pre>
-
-<h2 id="todo-vs-task">会话内 TODO vs 项目级 Task</h2>
-<dl class="defs">
-  <dt>会话内 TODO (第 03 章)</dt>
-  <dd>
-    短期, 当次会话用完即弃; 粒度细, 一行一个; 由 LLM 自由创建 / 修改;
-    状态机约束轻 (允许 pending → completed)。
-  </dd>
-  <dt>项目级 Task (本章)</dt>
-  <dd>
-    长期, 跨会话保留; 粒度粗, 一个 task 可以拆多个 TODO; 由 LLM 通过 update_task
-    工具改写; 状态机严格, 不允许 pending → completed。
-  </dd>
-</dl>
 <p>
-  两者不冲突: Task 是"项目级 plan", TODO 是"会话内 step"。LLM 在做 Task 时,
-  通常会拆出几个 TODO 放进会话内, 跑完一个 TODO 更新一次 Task status。
+  朴素想法 1: "把 TODO 写到磁盘不就行了?" 不够。 TODO 是
+  <strong>当前 session</strong> 的短期清单, 写磁盘变 <strong>长期
+  任务</strong>, 但 TODO 没<strong>依赖图</strong> (只能单
+  链表), 没<strong>状态机</strong> (状态字段是字符串, 任意改),
+  没<strong>audit event</strong> (谁改的不知道)。 Task 是新的
+  抽象, 不是"持久化的 TODO"。
 </p>
-
-<h2 id="trap">反例梯度</h2>
-
-<div class="card card--trap">
-  <div class="card__head">
-    <span class="card__tag">新手错法 · A</span>
-  </div>
-  <div class="card__body">
-    <p><strong>常见错误:</strong>task 写进 memory。</p>
-    <p>
-      <strong>为什么错:</strong>职责混淆, 结构化字段变成散文, 状态机校验失效。
-    </p>
-    <p>
-      <strong>正确做法:</strong>task 独立落盘, 结构化字段, 走 atomic write。
-    </p>
+<p>
+  朴素想法 2: "用数据库?" 太重。 教学项目要 <code>group.json</code>
+  + git 友好的纯文本, 不用 SQLite/Postgres。 规模也小 (一个项目
+  几个 group, 几十个 task), 内存里读 + 偶尔写磁盘完全够。
+</p>
+<p>
+  正确做法: 加 <code>src/tasks.ts</code> + <code>src/task-store.ts</code>
+  — TaskStore 管文件 IO, TaskManager 管业务规则 (状态机 + 依赖
+  校验 + activeTaskGroupId)。 这是 Reference 章节 "模式 7 ·
+  State Machine 状态机" + "模式 10 · Atomic Write 原子写" 的
+  联合应用。
+</p>
+<h2 id="task-vs-todo">Task vs TODO: 跨会话 vs 当前 session</h2>
+<p>
+  <strong>用途</strong>: 两个抽象职责正交, 不能互相替代。 任何
+  "应该持久化" 的事用 Task, "只是当前 session 短期跟踪" 用 TODO。
+  混淆会导致"重要任务被 TODO 化" 丢状态, 或"TODO 当 Task 用"
+  阻塞依赖检查。
+</p>
+<p>
+  <strong>真实场景</strong>: 用户让 LLM "跑 5 天的迁移",
+  LLM 创建 TaskGroup "migration-001" 含 500 个 task, 每个
+  task_1 / task_2 / ... 标记依赖关系; 同一 session 内, LLM
+  同时维护一个 TODO 列表 "今天先跑 task_1 ~ task_50, 然后
+  看进度调整", TODO 是"今天的执行清单", Task 是"整个 5 天的
+  计划"。
+</p>
+<p>
+  <strong>设计思想</strong>: <strong>职责正交, 数据隔离</strong> —
+  Task 持久化在 <code>.tasks/groups/</code>, TODO 在 session
+  内存; Task 有依赖图, TODO 是单链表; Task 有状态机, TODO 是
+  简单 3 状态 (pending / in_progress / done)。 看
+  <a href="https://github.com/pingp76/swoopcode/blob/main/src/tasks.ts#L122"><code>src/tasks.ts</code> 第 122 行的教学导读注释</a>:
+</p>
+<pre><code class="language-typescript">// TaskManager 是持久化 Task 的业务层。
+// 它不关心 group.json 写在哪里, 也不关心 LLM tool 参数长什么样;
+// 它只负责长期任务组的状态机:
+// - 创建 group / task
+// - 校验依赖图
+// - 控制 task 状态转换
+// - 自动派生 group 完成状态
+//
+// 这和 todo.ts 的 session TODO 不同:
+// TODO 是当前会话的短期执行清单; Task 是跨会话、可恢复、带依赖图的长期计划。</code></pre>
+<div class="figure figure--compare">
+  <div class="figure__title">图 1 · Task vs TODO 的边界</div>
+  <div class="flow-compare">
+    <div class="flow-compare__col flow-compare__col--good">
+      <div class="flow-compare__head">Task · 跨会话持久化</div>
+      <div class="flow-compare__body">存在 <code>.tasks/groups/&lt;id&gt;.json</code>, 关闭 session 不丢。 带依赖图 (DAG) + 状态机 (6 态) + audit events。 适合跑 1 小时以上的任务。</div>
+    </div>
+    <div class="flow-compare__col flow-compare__col--bad">
+      <div class="flow-compare__head">TODO · session 内存</div>
+      <div class="flow-compare__body">存在 <code>history[]</code> 或 session state, 关闭 session 销毁。 单链表 + 3 态 (pending / in_progress / done)。 适合当前 turn 的执行清单。</div>
+    </div>
   </div>
 </div>
+<p>
+  <strong>实现细节</strong>: TaskStore 写 JSON 文件, TaskManager
+  不直接调 fs, 通过 store 接口。 这是 Reference 章节 "模式 1 ·
+  工厂 + 闭包" + "模式 10 · Atomic Write 原子写" 的应用 — store
+  接口可以换 (内存 / 文件 / git), TaskManager 不变。
+</p>
+<h2 id="group-structure">TaskGroup 结构: 1 group = 1 .json 文件</h2>
+<p>
+  <strong>用途</strong>: 1 个 TaskGroup 持久化到 1 个
+  <code>.tasks/groups/&lt;id&gt;.json</code> 文件, 方便 git diff
+  / review / 合并冲突解决。 1 个 group 包含 1 个 task 列表 + 1 个
+  events 列表 + 1 个 group 元数据。
+</p>
+<p>
+  <strong>真实场景</strong>: team 5 人同时编辑 "migration-001"
+  group (分配不同 task), 每人 commit 自己的 group.json 修改, PR
+  review 看 git diff 看清楚"谁加了什么 task / 谁改了什么状态"。
+  如果 1 个 group = 多个文件, 改 1 个 task 要改多个文件, 合并
+  冲突爆炸。
+</p>
+<p>
+  <strong>设计思想</strong>: <strong>人类可读 + git 友好</strong> —
+  JSON 格式, 字段顺序稳定, 多行 (task 数组每行 1 个), git diff
+  显示"task_3 状态从 in_progress 变 completed" 而不是"第 47 行
+  改了一个字符不知道改哪"。 看
+  <a href="https://github.com/pingp76/swoopcode/blob/main/src/tasks.ts#L204"><code>src/tasks.ts</code> 第 204 行的 group 创建</a>:
+</p>
+<pre><code class="language-typescript">const group: TaskGroupFile = {
+  version: 1,
+  kind: "task_group",
+  id: groupId,
+  scope: projectRoots.length &gt; 1 ? "multi_project" : "project",
+  projectRoots,
+  title,
+  status: "active",
+  createdAt,
+  updatedAt: createdAt,
+  tasks,
+  events: [
+    { id: ..., type: "group_created", message: `Created task group: ${title}` },
+  ],
+};</code></pre>
+<p>
+  <strong>实现细节</strong>: <code>version: 1</code> 字段给未来 schema
+  迁移留空间; <code>kind: "task_group"</code> 字段防止和别的 JSON
+  文件 (memory / skill) 混淆; <code>scope</code> 字段标记单项目
+  vs 多项目 group, 决定依赖图校验规则。
+</p>
+<h2 id="state-machine">6 状态状态机: pending → in_progress → completed/failed/cancelled</h2>
+<p>
+  <strong>用途</strong>: 任务状态变化必须<strong>受控</strong>,
+  任意字符串改会破坏状态机 (LLM 可能写出"pending → completed →
+  pending" 的死循环, 没人能解释)。 6 状态是<strong>固定枚举</strong>,
+  转换规则<strong>显式定义</strong>。
+</p>
+<p>
+  <strong>真实场景</strong>: LLM 完成 task_3, 调
+  <code>updateTask(groupId, "task_3", {status: "completed"})</code>。
+  TaskManager 校验当前 status (in_progress) → 目标 status
+  (completed) 合法, 更新 + 写 event + 检查依赖 (task_5 现在
+  ready 了)。 如果 LLM 想"completed → pending" (回滚), TaskManager
+  拒绝并 throw。
+</p>
+<p>
+  <strong>设计思想</strong>: <strong>状态机 + 显式转换</strong> —
+  Reference 章节 "模式 7 · State Machine 状态机" 的应用。 看
+  <a href="https://github.com/pingp76/swoopcode/blob/main/src/tasks.ts#L92"><code>src/tasks.ts</code> 第 92 行的 <code>TERMINAL_TASK_STATUSES</code></a>:
+</p>
+<pre><code class="language-typescript">const TERMINAL_TASK_STATUSES: ReadonlySet&lt;TaskStatus&gt; = new Set([
+  "completed", "cancelled", "deleted",
+]);
 
-<div class="card card--trap">
-  <div class="card__head">
-    <span class="card__tag">中级错法 · B</span>
-  </div>
-  <div class="card__body">
-    <p><strong>常见错误:</strong>不区分 status, completed task 仍然注入。</p>
-    <p><strong>为什么错:</strong>占 context, 误导 LLM 以为 task 没做完。</p>
-    <p>
-      <strong>正确做法:</strong>renderActive() 仅返回 pending + in_progress,
-      completed 过滤掉。
-    </p>
+const STATUS_SYMBOLS: Record&lt;TaskStatus, string&gt; = {
+  pending: "[ ]",
+  in_progress: "[&gt;]",
+  completed: "[x]",
+  failed: "[!]",
+  cancelled: "[_]",
+  deleted: "[-]",
+};</code></pre>
+<div class="figure figure--stack">
+  <div class="figure__title">图 2 · 6 状态状态机 + 合法转换</div>
+  <div class="flow-stack">
+    <div class="flow-stack__layer flow-stack__layer--stable">
+      <div class="flow-stack__label">pending · 未开始</div>
+      <div class="flow-stack__body">[ ] 初始状态。 可转 in_progress / cancelled / deleted。 不可直接 completed。</div>
+    </div>
+    <div class="flow-stack__layer flow-stack__layer--dynamic">
+      <div class="flow-stack__label">in_progress · 进行中</div>
+      <div class="flow-stack__body">[&gt;] 正在做。 可转 completed / failed / cancelled / deleted。 不允许重复 in_progress。</div>
+    </div>
+    <div class="flow-stack__layer flow-stack__layer--good">
+      <div class="flow-stack__label">completed · 已完成 (终态)</div>
+      <div class="flow-stack__body">[x] 成功。 不可转其他状态。 Group 派生完成状态时检查所有非 deleted task 都 completed。</div>
+    </div>
+    <div class="flow-stack__layer flow-stack__layer--bad">
+      <div class="flow-stack__label">failed · 失败 (非终态)</div>
+      <div class="flow-stack__body">[!] 失败。 不在 <code>TERMINAL_TASK_STATUSES</code> 中, 因此不锁定, 仍可继续转换。 Group 派生完成状态时若仍有 in_progress 任务, 仍是 active。</div>
+    </div>
+    <div class="flow-stack__layer flow-stack__layer--bad">
+      <div class="flow-stack__label">cancelled · 取消 (终态)</div>
+      <div class="flow-stack__body">[_] 用户取消。 不可转其他状态。 走 archive_group 才能清理磁盘。</div>
+    </div>
+    <div class="flow-stack__layer flow-stack__layer--bad">
+      <div class="flow-stack__label">deleted · 删除 (终态)</div>
+      <div class="flow-stack__body">[-] 软删除。 不可转其他状态。 仍被其他 task 依赖时拒绝 delete (悬空引用)。</div>
+    </div>
   </div>
 </div>
-
-<div class="card card--trap">
-  <div class="card__head">
-    <span class="card__tag">高级错法 · C</span>
-  </div>
-  <div class="card__body">
-    <p><strong>常见错误:</strong>update_task 跳过状态机校验。</p>
-    <p>
-      <strong>为什么错:</strong>LLM 倾向"全部勾选", 没有状态机就会 pending →
-      completed, 跳过了 in_progress。
-    </p>
-    <p>
-      <strong>正确做法:</strong>工具入口校验, 拒绝 pending → completed 跳跃, 写
-      error tool message。
-    </p>
+<p>
+  <strong>实现细节</strong>: 6 状态分两类 — 3 个非终态 (pending /
+  in_progress / failed), 3 个终态 (completed / cancelled / deleted)。
+  非终态可继续转换, 终态锁定。 状态转换在
+  <code>applyStatusTransition</code> 函数内显式定义, 不允许
+  <code>task.status = "..."</code> 直接赋值。
+</p>
+<h2 id="dependency-graph">依赖图: blockedBy + 环检测 + 悬空引用</h2>
+<p>
+  <strong>用途</strong>: task 之间有依赖 ("task_5 必须等 task_3
+  完成才能开始"), 用 <code>blockedBy: ["task_3"]</code> 表达。
+  依赖图必须<strong>无环</strong> + <strong>无悬空</strong> (引用
+  存在的 task), 否则 ready / blocked 派生状态无法解释。
+</p>
+<p>
+  <strong>真实场景</strong>: 用户迁移 500 个 JS 文件, task_1
+  改 <code>src/a.js</code>, task_2 改 <code>src/b.js</code>
+  (依赖 task_1 因为 b.js import a.js), task_3 改
+  <code>src/c.js</code> (依赖 task_1)。 LLM 创建 group 后,
+  group.json 含依赖图; LLM 跑 task_1 完成, 系统自动
+  派生 task_2 / task_3 ready (两者都只依赖 task_1); task_2 和
+  task_3 互不依赖, 完成顺序不影响对方的 ready。
+</p>
+<p>
+  <strong>设计思想</strong>: <strong>DAG 校验 + 派生状态</strong> —
+  Reference 章节 "模式 7 · State Machine 状态机" 的延伸。 看
+  <a href="https://github.com/pingp76/swoopcode/blob/main/src/tasks.ts#L229"><code>src/tasks.ts</code> 第 229 行的 <code>ensureDependencyGraphValid</code></a>:
+</p>
+<pre><code class="language-typescript">// 创建时就检查依赖图, 避免把有环或引用不存在 task 的 group 写入持久化层。
+ensureDependencyGraphValid(group);</code></pre>
+<div class="figure figure--stack">
+  <div class="figure__title">图 3 · 依赖图校验的 3 条规则</div>
+  <div class="flow-stack">
+    <div class="flow-stack__layer flow-stack__layer--bad">
+      <div class="flow-stack__label">规则 1 · 引用存在</div>
+      <div class="flow-stack__body">task_5.blockedBy 含 "task_99", 但 group 没 task_99。 拒绝创建, 错误 "blockedBy references missing task: task_99"。</div>
+    </div>
+    <div class="flow-stack__layer flow-stack__layer--bad">
+      <div class="flow-stack__label">规则 2 · 无环</div>
+      <div class="flow-stack__body">task_1.blockedBy=[task_2], task_2.blockedBy=[task_1]。 拒绝创建, 错误 "circular dependency detected: task_1 → task_2 → task_1"。</div>
+    </div>
+    <div class="flow-stack__layer flow-stack__layer--bad">
+      <div class="flow-stack__label">规则 3 · 不可删被依赖</div>
+      <div class="flow-stack__body">task_3 仍被 task_5 依赖 (task_5.blockedBy 含 task_3), 拒绝 delete task_3, 错误 "Cannot delete task_3; it is required by task_5"。</div>
+    </div>
   </div>
 </div>
+<p>
+  <strong>实现细节</strong>: <code>ensureDependencyGraphValid</code>
+  在 <code>createGroup</code> / <code>addTask</code> /
+  <code>updateTask</code> 之后都跑一次, 保证任何修改后依赖图
+  都合法。 3 条规则对应 Reference 章节 "模式 15 · Identity Check
+  身份校验" 的应用。
+</p>
+<h2 id="derived-state">ready / blocks: 派生, 不写入磁盘</h2>
+<p>
+  <strong>用途</strong>: <code>ready</code> (任务是否可开始) 和
+  <code>blocks</code> (反向依赖, 谁依赖我) 是<strong>派生状态</strong>,
+  读取时算, 不写入磁盘。 写入磁盘会和 source-of-truth 冲突
+  (LLM 改 status 没及时 rebuild 派生)。
+</p>
+<p>
+  <strong>真实场景</strong>: LLM 调 <code>readGroup(groupId)</code>
+  拿到 <code>TaskGroupView</code>, 每个 task 有 <code>ready: true/false</code>
+  + <code>blocks: ["task_5"]</code>。 LLM 看到 task_3.ready=true
+  知道可以开始, 看到 task_3.blocks 含 task_5 知道"我完成会解锁
+  task_5"。 派生每次重算, 不存在"过期" 问题。
+</p>
+<p>
+  <strong>设计思想</strong>: <strong>派生数据, 不冗余</strong> —
+  磁盘只存 source-of-truth (status, blockedBy, ...), 派生
+  (ready, blocks, progress) 读取时算。 看
+  <a href="https://github.com/pingp76/swoopcode/blob/main/src/tasks.ts#L432"><code>src/tasks.ts</code> 第 432 行的 <code>buildTaskGroupView</code></a>:
+</p>
+<pre><code class="language-typescript">function buildTaskGroupView(group: TaskGroupFile): TaskGroupView {
+  const completedIds = new Set(
+    group.tasks.filter((task) =&gt; task.status === "completed").map((t) =&gt; t.id),
+  );
+  const blocksById = new Map&lt;string, string[]&gt;();
+  for (const task of group.tasks) {
+    for (const dep of task.blockedBy) {
+      blocksById.set(dep, [...(blocksById.get(dep) ?? []), task.id]);
+    }
+  }
+  const tasks: TaskView[] = group.tasks.map((task) =&gt; {
+    const missing = task.blockedBy.filter((id) =&gt; !completedIds.has(id));
+    return {
+      ...task,
+      blockedBy: [...task.blockedBy],
+      ready: task.status === "pending" &amp;&amp; missing.length === 0,
+      blocks: blocksById.get(task.id) ?? [],
+    };
+  });
+  return { group, tasks };
+}</code></pre>
+<p>
+  <strong>实现细节</strong>: <code>ready = (status === "pending") &amp;&amp; (blockedBy 都 completed)</code>,
+  这条规则让 LLM 跑完 task_1 立刻看到 task_2 / task_3 变 ready。
+  <code>blocks</code> 反向索引从 blockedBy 反推, LLM 知道"完成我
+  会解锁谁"。
+</p>
+<h2 id="events">audit events: 轻量审计线索</h2>
+<p>
+  <strong>用途</strong>: 每次业务修改 (创建 / 更新 / 删除 / 归档)
+  追加一条 event 到 group.events[], 含 id / timestamp / actor /
+  type / message。 审计线索轻量, 不驱动状态机, 只给"谁改了什么"
+  留记录。
+</p>
+<p>
+  <strong>真实场景</strong>: team leader 想知道"task_3 谁改的
+  completed", 看 group.events 找到
+  <code>{actor: "main", type: "task_updated", message: "Updated task_3", timestamp: "2026-06-12T10:00"}</code>。
+  actor 标记 "main" (主 agent) 或未来 "subagent:explore" /
+  "user" 等, 帮助定位责任。
+</p>
+<p>
+  <strong>设计思想</strong>: <strong>事件流, 不状态机</strong> —
+  events 是<strong>追加日志</strong>, 不参与 status 决策; status
+  仍以 task.status 字段为准。 这是 Reference 章节 "模式 10 ·
+  Atomic Write 原子写" 的延伸 — event 跟着 group 一起写, 不
+  单独 append (单独 append 会破坏原子性)。
+</p>
+<p>
+  <strong>实现细节</strong>: 看
+  <a href="https://github.com/pingp76/swoopcode/blob/main/src/tasks.ts#L143"><code>src/tasks.ts</code> 第 143 行的 <code>createEvent</code></a>
+  — event id 用 <code>event_${Date.now()}_${random}</code> 生成,
+  教学版够用; 工业版应该用 ULID 之类保证排序稳定。
+</p>
+<h2 id="active-id">activeTaskGroupId: session 级轻状态, 不持久化</h2>
+<p>
+  <strong>用途</strong>: "当前正在关注哪个 group" 是 session 级
+  状态, 不写磁盘。 activeTaskGroupId 是 LLM 调
+  <code>readGroup(id)</code> 后自动 set, 工具输出和 reminder
+  告诉模型"现在关注这个 group"。
+</p>
+<p>
+  <strong>真实场景</strong>: user 跑 "migration-001" 跑到一半
+  session 关闭, 下次开 session 不知道"我之前在跑什么"。 应该
+  user 主动 <code>setActiveGroupId("migration-001")</code> 或
+  工具调 <code>readGroup("migration-001")</code> 自动 set,
+  reminder 告诉 LLM "现在关注 migration-001, 进度 50/500"。
+</p>
+<p>
+  <strong>设计思想</strong>: <strong>轻状态, 不持久化</strong> —
+  activeTaskGroupId 是 <strong>指针</strong>, 真实数据在磁盘
+  group.json; session 关闭指针丢, 数据不丢。 看
+  <a href="https://github.com/pingp76/swoopcode/blob/main/src/tasks.ts#L137"><code>src/tasks.ts</code> 第 137 行</a>:
+</p>
+<pre><code class="language-typescript">// activeTaskGroupId 是 session 级状态, 不写入磁盘。
+// 它只是帮助工具输出和 reminder 告诉模型"当前正在关注哪个长期任务组"。
+let activeTaskGroupId: string | null = null;</code></pre>
+<p>
+  <strong>实现细节</strong>: <code>readGroup()</code> / <code>createGroup()</code>
+  自动 set, <code>setActiveGroupId(null)</code> 显式清空; 设置
+  前先 <code>store.read(groupId)</code> 验证 group 存在, 避免
+  指针指向不存在的持久化对象。
+</p>
+<h2 id="format">formatTaskGroupView: 状态符号 + 进度条</h2>
+<p>
+  <strong>用途</strong>: 给 LLM / 用户读的人类可读格式 — 每个
+  task 一行, 用状态符号 ([ ] / [>] / [x] / [!] / [_] / [-])
+  + owner + blockedBy + note, 末尾进度条
+  (completed/total, in_progress, ready, blocked)。
+</p>
+<p>
+  <strong>真实场景</strong>: LLM 调 <code>formatTaskGroupView(view)</code>
+  拿到一段字符串塞回 conversation, 让 LLM 看到"task_3 [ ] 未开始,
+  blockedBy task_1 (未完成), 进度 50/500"。 LLM 据此决定下一步
+  (先完成 task_1, 再开始 task_3)。
+</p>
+<p>
+  <strong>设计思想</strong>: <strong>结构化文本</strong> — 用
+  状态符号 + 缩进表达层级, 不用表格 (markdown 表格 LLM 不易
+  parse)。 看
+  <a href="https://github.com/pingp76/swoopcode/blob/main/src/tasks.ts#L400"><code>src/tasks.ts</code> 第 400 行的 <code>formatTaskGroupView</code></a>:
+</p>
+<pre><code class="language-typescript">export function formatTaskGroupView(view: TaskGroupView): string {
+  const { group, tasks } = view;
+  const lines = [`[${group.status}] ${group.id}: ${group.title}`];
+  for (const task of tasks) {
+    const symbol = task.ready &amp;&amp; task.status === "pending" ? "[?]" : STATUS_SYMBOLS[task.status];
+    lines.push(`${symbol} ${task.id}: ${task.subject}`);
+    lines.push(`    owner: ${task.owner}`);
+    lines.push(`    blockedBy: ${task.blockedBy.length &gt; 0 ? task.blockedBy.join(", ") : "-"}`);
+    if (task.blockedReason) lines.push(`    blocked: ${task.blockedReason}`);
+    if (task.note) lines.push(`    note: ${task.note}`);
+  }
+  const progress = computeTaskGroupProgress(group);
+  lines.push(`progress: ${progress.completed}/${progress.total} completed, ...`);
+  return lines.join("\n").trimEnd();
+}</code></pre>
+<p>
+  <strong>实现细节</strong>: <code>[?]</code> 符号表示"pending
+  且 ready" (派生状态, 依赖都已 completed), <code>[ ]</code> 表示"pending 但
+  仍 blocked" (派生状态), 让 LLM 一眼区分"现在能跑" vs "现在
+  跑不了"。
+</p>
+<h2 id="loop-integration">主循环集成: reminder 注入 active group</h2>
+<p>
+  <strong>用途</strong>: TaskManager 在 Composition Root 创建,
+  注入 agent 主循环; 每轮 LLM 调用前, reminder 注入
+  "[active group: migration-001, progress 50/500, next ready:
+  task_51, task_52]"。 让 LLM 知道"我之前在跑什么, 接下来
+  该跑什么"。
+</p>
+<p>
+  <strong>真实场景</strong>: 5 天迁移, 每天开 session 先
+  <code>readGroup("migration-001")</code> 重新 set active (active 不持久化), 之后每轮 reminder 带
+  "active: migration-001, progress 100/500, next ready: task_101, task_102"。
+  LLM 直接接着跑, 不用问 "我之前到哪了"。
+</p>
+<p>
+  <strong>设计思想</strong>: <strong>主动告知 vs 被动查询</strong> —
+  reminder 主动注入 (LLM 不需要问), 工具被动查询 (LLM 主动
+  调 <code>readGroup</code> 拿完整列表)。 两者配合, LLM 不会
+  丢上下文。
+</p>
+<p>
+  <strong>实现细节</strong>: reminder 在 agent.ts 第 3 步前生成,
+  类似 TODO reminder (第 03 章), 拼到 user message 末尾。
+  reminder 内容只含"active group + progress + next 3 ready
+  task", 不含完整 task 列表 (避免 history 撑爆)。
+</p>
+<h2 id="fake-test">fake test: 用 tmp 目录 + 文件 store 覆盖 3 条不变量</h2>
+<p>
+  <strong>用途</strong>: TaskManager 测试用<strong>真实 tmp
+  目录 + 文件 store</strong>, 不需要 mock fs (mock fs 的
+  行为和真实 fs 微妙不同, 反而测不准)。
+</p>
+<p>
+  <strong>真实场景</strong>: 用户写测试覆盖 3 条不变量:
+  (a) 创建 group 含环依赖 → throw; (b) task_1 pending, task_2
+  blockedBy task_1, task_1 completed → task_2 ready 派生; (c)
+  task_3 completed 后 update status → throw。
+</p>
+<pre><code class="language-typescript">test("环依赖拒绝", () =&gt; {
+  const dir = fs.mkdtempSync(path.join(os.tmpdir(), "task-test-"));
+  const store = createFileTaskStore({ dir });
+  const mgr = createTaskManager({ store, projectRoot: dir });
+  expect(() =&gt; mgr.createGroup({
+    title: "x",
+    tasks: [
+      { subject: "a", blockedBy: ["task_2"] },
+      { subject: "b", blockedBy: ["task_1"] },
+    ],
+  })).toThrow(/circular dependency/);
+});
 
-<div class="card card--trap">
-  <div class="card__head">
-    <span class="card__tag">边界错法 · D</span>
+test("ready 派生: task_1 完成, task_2 变 ready", () =&gt; {
+  const dir = fs.mkdtempSync(path.join(os.tmpdir(), "task-test-"));
+  const store = createFileTaskStore({ dir });
+  const mgr = createTaskManager({ store, projectRoot: dir });
+  const group = mgr.createGroup({
+    title: "x",
+    tasks: [
+      { subject: "a" },
+      { subject: "b", blockedBy: ["task_1"] },
+    ],
+  });
+  // 初始 task_2.ready=false
+  let view = mgr.readGroup(group.id)!;
+  expect(view.tasks[1]!.ready).toBe(false);
+  // 完成 task_1
+  mgr.updateTask(group.id, "task_1", { status: "in_progress" });
+  mgr.updateTask(group.id, "task_1", { status: "completed" });
+  // 派生 task_2.ready=true
+  view = mgr.readGroup(group.id)!;
+  expect(view.tasks[1]!.ready).toBe(true);
+});
+
+test("终态后不可改", () =&gt; {
+  const dir = fs.mkdtempSync(path.join(os.tmpdir(), "task-test-"));
+  const store = createFileTaskStore({ dir });
+  const mgr = createTaskManager({ store, projectRoot: dir });
+  const group = mgr.createGroup({ title: "x", tasks: [{ subject: "a" }] });
+  mgr.updateTask(group.id, "task_1", { status: "in_progress" });
+  mgr.updateTask(group.id, "task_1", { status: "completed" });
+  expect(() =&gt; mgr.updateTask(group.id, "task_1", { status: "in_progress" }))
+    .toThrow(/terminal status/);
+});</code></pre>
+<p>
+  <strong>实现细节</strong>: 3 个测试覆盖 (a) 环依赖校验,
+  (b) ready 派生, (c) 终态锁定。 用真实 tmp 目录; 示例省略了清理, 实际应在
+  <code>afterEach</code> 里 <code>fs.rmSync(dir, { recursive: true, force: true })</code>, 不污染环境。
+</p>
+<h2 id="common-confusion">常见误解: Task 不替代数据库</h2>
+<p>
+  <strong>误解 1: "Task 替代数据库?"</strong> 不替代。 Task 是
+  跨会话<strong>任务计划</strong>, 不是通用数据存储。 想要
+  通用数据库, 用 SQLite / Postgres, Task 不该滥用。
+</p>
+<p>
+  <strong>误解 2: "ready 字段写入磁盘?"</strong> 错。 ready 是
+  派生状态, 读取时算。 写入会和 source-of-truth (status) 冲突。
+</p>
+<p>
+  <strong>误解 3: "completed 状态能回滚到 pending?"</strong>
+  错。 3 个终态 (completed / cancelled / deleted) 不可转。
+  "我改主意了" 也不行, 要回滚就新建 group。
+</p>
+<p>
+  <strong>误解 4: "events 数组能 1000+ 条?"</strong> 性能 OK
+  但 git diff 难读。 50 个 task 的 group 大约 100 个 event, 工业
+  规模建议定期 archive 旧 group, 不让 events 无限增长。
+</p>
+<h2 id="trap">反例梯度: 4 个常见错误</h2>
+<div class="cards-grid">
+  <div class="card card--bad">
+    <div class="card__head"><span class="card__tag">反例 1 · 环依赖接受</span></div>
+    <div class="card__body">
+      <p>LLM 创建 group, task_1.blockedBy=[task_2], task_2.blockedBy=[task_1],
+        TaskManager 不校验就写磁盘。 错。 环依赖导致互相等待的
+        死锁 (task_1 等 task_2, task_2 等 task_1, 两者永远不会 ready)。
+        正确: <code>ensureDependencyGraphValid</code> 在 create / addTask /
+        updateTask 后都跑一次, 检测到环 throw, 拒绝写磁盘。</p>
+    </div>
   </div>
-  <div class="card__body">
-    <p>
-      <strong>常见错误:</strong>task id 由 LLM 自由发挥, 每次 update 都新建 id。
-    </p>
-    <p><strong>为什么错:</strong>同一个 task 变成多个, dependencies 失效。</p>
-    <p>
-      <strong>正确做法:</strong>id 由 harness 生成 (UUID / 时间戳), LLM
-      不允许自创 id, update 只能改 status / nextAction 等字段。
-    </p>
+  <div class="card card--bad">
+    <div class="card__head"><span class="card__tag">反例 2 · 终态可改</span></div>
+    <div class="card__body">
+      <p>TaskManager 不限制状态转换, LLM 想 completed → pending
+        直接改。 错。 状态机失去意义, audit trail 出现"completed
+        → pending → in_progress → completed" 死循环, 没人能
+        解释。 正确: <code>applyStatusTransition</code> 检查当前
+        status 是不是终态, 终态拒绝任何转换, throw。</p>
+    </div>
   </div>
-</div>
-
-<h2 id="validate">如何验证 (本章 Validation 卡片)</h2>
-<div class="card card--validation">
-  <div class="card__head">
-    <span class="card__tag">Validation · 第 12 章</span>
+  <div class="card card--bad">
+    <div class="card__head"><span class="card__tag">反例 3 · ready 写入磁盘</span></div>
+    <div class="card__body">
+      <p>把 ready 字段写入 group.json, LLM 改 status 时同步改 ready。
+        错。 双写导致不一致 (改 status 忘改 ready, 或反过来)。
+        正确: 磁盘只存 source-of-truth (status, blockedBy), 派生
+        (ready, blocks, progress) 读取时算, <code>buildTaskGroupView</code>
+        永远从 status 重算。</p>
+    </div>
   </div>
-  <div class="card__body">
-    <p>
-      <strong>renderActive 过滤 completed:</strong>fake taskStore 预设 3 个 task
-      (1 pending, 1 in_progress, 1 completed), 跑完 SessionStart 注入, system
-      prompt 含 pending 和 in_progress, 不含 completed。
-    </p>
-    <p>
-      <strong>update_task 状态机校验:</strong>update_task 调用把 pending 直接
-      completed, 写 error tool message, task 状态不变。
-    </p>
-    <p>
-      <strong>task 落盘走 atomic write:</strong>spy 验证 update_task 走 write
-      tmp + fsync + rename 路径, 不直接改原文件。
-    </p>
-    <p>
-      <strong>dependencies 解析:</strong>fake taskStore 预设 task A 依赖 task B,
-      list({ status: "pending" }) 返回 A 和 B, A 的 dependencies 字段含 "B"。
-    </p>
-    <p>
-      <strong>completed task 不再注入:</strong>第二次 run() 把 task 改成
-      completed, 第三次 run() SessionStart, system prompt 不再 含这个 task。
-    </p>
+  <div class="card card--bad">
+    <div class="card__head"><span class="card__tag">反例 4 · activeTaskGroupId 写磁盘</span></div>
+    <div class="card__body">
+      <p>把 activeTaskGroupId 存到 <code>.tasks/active.json</code>,
+        跨 session 复用。 错。 active 是 session 级<strong>指针</strong>,
+        不是数据; 写到磁盘会和"哪个 group 重要" 的逻辑耦合 (用户
+        关 session 1 个月后回来, active group 可能已 archive)。
+        正确: activeTaskGroupId 是闭包内状态, session 关闭丢;
+        用户新 session 主动 <code>setActiveGroupId(...)</code>。</p>
+    </div>
   </div>
 </div>
-
-<h2 id="lookback">回望第 00–11 章: 哪些原则在本章兑现了</h2>
+<h2 id="validate">Validation: 4 条不变量检验</h2>
+<ol>
+<li>
+<strong>依赖图无环无悬空</strong>: 创建 group / addTask / updateTask
+    后, 跑 <code>ensureDependencyGraphValid</code>, 含环或悬空
+    引用 throw。 验证: 3 个 case (环 / 引用不存在 / 引用已
+    deleted), 全部 throw。
+  </li>
+<li>
+<strong>ready 派生正确</strong>: task_X 状态从 pending 变
+    completed, 所有 blockedBy 含 task_X 的 task, 派生 ready
+    变 true (假设其他 blocker 也 completed)。 验证: 创建 3
+    个 task (1 独立, 2 依赖 1, 3 依赖 2), 完成 1 后 task_2
+    ready, 完成 2 后 task_3 ready。
+  </li>
+<li>
+<strong>终态锁定</strong>: 3 个终态 (completed / cancelled /
+    deleted) 任何 update 都 throw。 验证: 3 个 case, update
+    status / owner / note / blockedBy 全部 throw。
+  </li>
+<li>
+<strong>删除仍被依赖的任务拒绝</strong>: task_X 仍被
+    task_Y.blockedBy 引用, delete task_X throw。 验证: 创建
+    2 个 task (2 依赖 1), 试图 delete task_1, 错误含
+    "required by task_2"。
+  </li>
+</ol>
+<h2 id="lookback">回望: 哪些原则在本章兑现了</h2>
 <ul>
-  <li>
-    <strong>事实与视图分离:</strong>task 是结构化数据 (status / dependencies
-    字段), renderActive 是视图, 视图按 status 过滤不影响事实。
+<li>
+<strong>状态机 + 显式转换</strong>: 6 状态固定枚举, 转换规则
+    <code>applyStatusTransition</code> 显式定义, 终态锁定。
   </li>
-  <li>
-    <strong>状态机约束:</strong>第 03 章 TODO 状态机 + 本章 task 状态机,
-    一致的设计哲学。
+<li>
+<strong>DAG 校验</strong>: 引用存在 + 无环 + 不可删被依赖, 3 条
+    规则在 create / addTask / updateTask 后都跑。
   </li>
-  <li>
-    <strong>Composition Root 唯一接线:</strong>taskStore 在
-    <code>index.ts</code> 创建, 注入 agent。
+<li>
+<strong>派生不写入</strong>: ready / blocks / progress 读取时
+    算, 磁盘只存 source-of-truth。
   </li>
-  <li>
-    <strong>稳定前缀原则:</strong>task 拼到 system prompt 的固定位置 "# Active
-    Tasks", prefix 锚点稳定, 第 10 章的伏笔。
+<li>
+<strong>事件流</strong>: events 是追加日志, 不驱动状态机, 给
+    "谁改了什么" 留审计线索。
+  </li>
+<li>
+<strong>职责分离</strong>: TaskStore 管文件 IO, TaskManager 管
+    业务规则, store 可换 (内存 / 文件 / git) 不影响 manager。
   </li>
 </ul>
-
 <h2 id="forward">前瞻张力: 留给后续章节</h2>
 <dl class="defs">
-  <dt>task 跨进程冲突</dt>
-  <dd>第 15 章 atomic-write 兜底, task 写新文件 + rename, 不直接改原文件。</dd>
-  <dt>task 与 async run 协作</dt>
-  <dd>
-    第 13 章 async run 完成后, 通知 LLM "task X 已完成, 接下来做 Y", reminder
-    注入。
+<dt>并发执行多个 task</dt>
+<dd>
+    当前 task 一个一个跑 (LLM 按 ready 顺序)。 多 task 并发
+    跑是第 13 章 Async Run 的范畴, 涉及并发限制 + 输出隔离。
   </dd>
-  <dt>task 与 schedule 协作</dt>
-  <dd>
-    第 14 章 schedule 定时检查 task dependencies, 满足条件时自动激活下一个
-    task。
+<dt>定时 / 周期跑 task</dt>
+<dd>
+    当前 task 跑完即结束。 想要"每天 9 点跑一次全项目测试"
+    是第 14 章 Schedule 的范畴, 复用 Async Run 作为执行单元。
   </dd>
-  <dt>task 太多怎么办</dt>
-  <dd>
-    task 累积到 50+ 时, SessionStart 注入也会撑爆, 需要按 status / scope
-    进一步分页, 类似第 06 章 compress 思路。
+<dt>Task 与子 agent 配合</dt>
+<dd>
+    LLM 想"派子 agent 跑 task_3", 子 agent 跑完更新 task_3.status。
+    这涉及 subagent 边界 + 错误传播 (第 11 章延伸), 留 P2
+    阶段。
   </dd>
 </dl>
-
-<h2 id="vibe-coding-12">本次如何 vibe code: 第 12 章的三件套</h2>
-
-<h3 id="vibe-feed-12">拆卡: 4 轮迭代的具体产物</h3>
-<ol>
-  <li>
-    <strong>第 1 轮 · 接口</strong>。让 LLM 给出 <code>Task</code> /
-    <code>TaskStore</code> / <code>TaskStatus</code> 三个 interface,
-    以及状态机规则文档。本轮不写实现, 重点钉"结构化字段" 和"状态机约束"。
-  </li>
-  <li>
-    <strong>第 2 轮 · 接线</strong>。让 LLM 给出 <code>index.ts</code> 接线,
-    <code>createTaskStore()</code> 接受目录, SessionStart 注入仍是 stub
-    (永远注入空 task 列表)。本轮 review 重点: taskStore 实例在
-    <code>index.ts</code> 只 new 一次。
-  </li>
-  <li>
-    <strong>第 3 轮 · 边界</strong>。让 LLM 写 createTaskStore + update_task
-    工具 + agent.SessionStart 接入。本轮 review 重点: renderActive 过滤
-    completed, 状态机校验, atomic write。
-  </li>
-  <li>
-    <strong>第 4 轮 · 验证</strong>。让 LLM 写
-    <code>test/tasks.test.ts</code>。本轮 review 重点: "renderActive 过滤
-    completed" 和 "update_task 状态机校验" 两条必须有反向断言。
-  </li>
-</ol>
-
-<h3 id="vibe-review-12">Review: 第 12 章专属 checklist</h3>
-<ol>
-  <li>
-    <strong>task 独立于 memory。</strong>不写在 memoryStore 里。验证:
-    <code>grep -n 'memoryStore' src/tasks.ts</code> 应当 0 行。
-  </li>
-  <li>
-    <strong>renderActive 过滤 completed。</strong>renderActive() 实现内 filter
-    status 不含 completed。验证:
-    <code>grep -n 'completed' src/tasks.ts</code> 在 renderActive() 函数体内应当
-    0 行 (filter 排除)。
-  </li>
-  <li>
-    <strong>update_task 状态机校验在工具入口。</strong>不是上层 agent
-    校验。验证:
-    <code>grep -n 'pending.*completed' src/tools/update_task.ts</code> ≥ 1 行。
-  </li>
-  <li>
-    <strong>task id 由 harness 生成。</strong>LLM 不允许自创 id。验证:
-    <code>grep -n 'crypto.randomUUID\|uuidv4' src/tasks.ts</code> ≥ 1 行 (id
-    生成器)。
-  </li>
-  <li>
-    <strong>taskStore 工厂化。</strong>验证:
-    <code>grep -n 'new TaskStore' src/</code> 应当 0 行。
-  </li>
-</ol>
-
-<h3 id="vibe-debug-12">调试: 第 12 章典型伪装</h3>
-<ol>
-  <li>
-    <strong>伪装 A · renderActive 不过滤 completed。</strong>症状: 注入 system
-    prompt 时所有 task 都出现。验证: Validation 卡片"renderActive 过滤
-    completed" 那条测试通过。
-  </li>
-  <li>
-    <strong>伪装 B · update_task 不校验状态机。</strong>症状: LLM 调 update_task
-    把 pending 直接 completed。验证: Validation 卡片"update_task 状态机校验"
-    那条测试通过 (写 error tool message)。
-  </li>
-  <li>
-    <strong>伪装 C · LLM 自由生成 task id。</strong>症状: update_task
-    工具接受任意 id 字符串, LLM 写 "task-1" / "task-2" 这种自由 id。验证:
-    <code>grep -n 'args.id\|taskId' src/tools/update_task.ts</code> 看是否走
-    harness 生成的 id, 还是直接用 LLM 传的字符串。
-  </li>
-</ol>
-
-<h3 id="vibe-iterate-12">迭代: 第 12 章 4 个 commit 节点</h3>
-<ol>
-  <li>
-    <code>feat(ch12): 钉 Task / TaskStore / TaskStatus 接口与状态机规则</code>
-    —— tsc 通过, 无实现。
-  </li>
-  <li>
-    <code>feat(ch12): createTaskStore 工厂 + update_task 工具 stub</code> —— tsc
-    通过, 写空文件。
-  </li>
-  <li>
-    <code
-      >feat(ch12): renderActive 过滤 + 状态机校验 + atomic write + SessionStart
-      注入</code
-    >
-    —— 跑通 Validation 卡片前 4 条。
-  </li>
-  <li>
-    <code>test(ch12): completed 不再注入 + dependencies 解析</code> —— 全绿。
-  </li>
-</ol>
-
 <h2 id="prompt-card">Prompt Card (本章任务)</h2>
 <div class="card card--prompt">
   <div class="card__head">
     <span class="card__tag">Prompt Card · 第 12 章</span>
-    <button class="card__copy" type="button" data-copy-card>复制</button>
+    <button class="card__copy" data-copy-card="" type="button">复制</button>
   </div>
   <div class="card__body">
-    <p>
-      <strong>目标:</strong>实现 Persistent Task 模块, 跨会话保留项目级 plan,
-      区分会话内 TODO 和项目级 task。
-    </p>
-    <p>
-      <strong>场景:</strong>用户周一创建 3 个 task (重构 / 跑测试 / 部署),
-      周三完成第 1 个, 周五回来 harness 仍然能列出 "Active Tasks: 跑测试, 部署"
-      两条。
-    </p>
-    <p>
-      <strong>模块:</strong> <code>src/tasks.ts</code> (新) 暴露
-      <code>createTaskStore()</code>; <code>src/task-store.ts</code> (新)
-      实现存储后端; <code>src/tools/update_task.ts</code> (新) 实现工具;
-      <code>src/agent.ts</code> 改 SessionStart 注入 active tasks;
-      <code>src/index.ts</code> 接线 taskStore。
-    </p>
+    <p><strong>目标:</strong> 给 harness 加持久化 Task 系统, TaskGroup
+      写到 .tasks/groups/&lt;id&gt;.json, 跨 session 恢复, 带依赖
+      图 + 状态机 + 派生 ready/blocks + audit events。</p>
+    <p><strong>场景:</strong> 5 天迁移 500 个 JS → TS, LLM 创建
+      migration-001 group 含 500 个 task, task_2 blockedBy [task_1], task_5 blockedBy
+      [task_1, task_3], LLM 跑 task_1 完成, 系统派生 task_2
+      ready, LLM 跑 task_3 完成, task_5 变 ready, 关 session
+      不丢状态。</p>
+    <p><strong>模块:</strong> <code>src/task-store.ts</code> (新)
+      文件 IO; <code>src/tasks.ts</code> (新) 暴露
+      <code>createTaskManager({store, projectRoot})</code> +
+      <code>formatTaskGroupView(view)</code>; <code>src/agent.ts</code>
+      (改) reminder 注入 active group + 进度。</p>
     <p><strong>边界 (LLM 必须遵守的 checklist):</strong></p>
     <ul>
-      <li>task 独立于 memory, 不写进 memoryStore</li>
-      <li>renderActive() 过滤 completed, 只返回 pending + in_progress</li>
-      <li>update_task 状态机校验: 拒绝 pending → completed 跳跃</li>
-      <li>task id 由 harness 生成 (UUID), LLM 不允许自创</li>
-      <li>task 落盘走 atomic write</li>
+      <li>6 状态固定枚举 pending / in_progress / completed / failed / cancelled / deleted, 不可新增</li>
+      <li>3 个终态 (completed / cancelled / deleted) 不可转其他状态, 任何 update throw</li>
+      <li>依赖图无环 + 无悬空, create / addTask / updateTask 后 ensureDependencyGraphValid 必跑</li>
+      <li>ready / blocks / progress 派生状态不写入磁盘, 读取时 buildTaskGroupView 算</li>
+      <li>activeTaskGroupId 是 session 级闭包内状态, 不写磁盘, 不持久化</li>
+      <li>events 是追加日志, 不驱动状态机, id 用 event_${ts}_${random} 生成</li>
+      <li>删除被依赖 task throw, 错误信息含 "required by" + 任务列表</li>
     </ul>
-    <p><strong>验证 (用 fake taskStore + spy, 逐条落到 vitest):</strong></p>
+    <p><strong>验证 (用 tmp 目录 + FileTaskStore + vitest, 逐条断言):</strong></p>
     <ul>
-      <li>
-        fake 预设 3 个 task (pending / in_progress / completed), renderActive()
-        只返回前 2 个
-      </li>
-      <li>
-        update_task 把 pending 直接 completed, 写 error tool message, task
-        状态不变
-      </li>
-      <li>spy 验证 update_task 走 write tmp + fsync + rename 路径</li>
-      <li>fake 预设 task A 依赖 B, A.dependencies 字段含 "B"</li>
-      <li>
-        把 task 改成 completed 后, 下次 SessionStart system prompt 不再含这个
-        task
-      </li>
+      <li>环依赖 create throw, 错误信息含 "circular dependency"</li>
+      <li>引用不存在的 task throw, 错误信息含 "references missing task"</li>
+      <li>task_1 pending → in_progress → completed, 派生 task_2.ready 变 true</li>
+      <li>completed 后 updateTask({status: "in_progress"}) throw, 错误信息含 "terminal status"</li>
+      <li>delete task_1 但 task_2 仍依赖, throw, 错误信息含 "required by task_2"</li>
     </ul>
   </div>
 </div>
-
 <h2 id="practice">本章练习</h2>
 <ol>
-  <li>
-    故意让 renderActive 不过滤 completed, 跑测试, 看"renderActive 过滤
-    completed" 是否抓到。
+<li>
+    故意不写 <code>ensureDependencyGraphValid</code>, 创建含环
+    依赖的 group, 跑测试, 看"环依赖拒绝" 是否抓到 (group 写到
+    磁盘 vs. throw)。
   </li>
-  <li>
-    在 update_task 里不校验状态机, 跑测试, 看"update_task 状态机校验" 是否抓到。
+<li>
+    故意不在 <code>applyStatusTransition</code> 检查终态, 跑
+    completed → pending 测试, 看"终态锁定" 是否抓到 (改成功 vs.
+    throw)。
   </li>
-  <li>
-    让 LLM 自由生成 task id (不调 harness 的 id 生成器), 跑测试, 看"task id 由
-    harness 生成" 是否抓到 (deps 解析会失败)。
+<li>
+    把 ready 字段加到 TaskItem 接口, 写磁盘, 跑 ready 派生测试,
+    看"派生不写入" 是否抓到 (磁盘 ready 过期 vs. 派生重算一致)。
+  </li>
+<li>
+    把 activeTaskGroupId 存到 <code>.tasks/active.json</code>,
+    重启 session 看 active 是否恢复, 看"active 不持久化" 是否
+    抓到 (新 session 应该没有 active, 让用户主动 set)。
   </li>
 </ol>
-
 <h2 id="summary">本章小结</h2>
 <p>
-  本章给 harness 加了 Persistent Task 模块, 跨会话保留项目级 plan。 task
-  是结构化数据 (status / dependencies 字段), 与会话内 TODO 严格分离, 走 atomic
-  write 落盘。下一章 (第 13 章) 我们会处理"长 任务不阻塞主 loop" 的问题——Async
-  Run, 让 agent.run() 内部 启动后台任务, 主 loop 继续等用户输入,
-  后台任务完成时通过 reminder 通知。
+  Task 是给跨会话<strong>长期任务</strong>的持久化系统, 区别
+  于 TODO 的 session 内存清单。 核心是 5 个设计:
 </p>
-
-<h2 id="next">下一章伏笔</h2>
+<ul>
+<li>
+<strong>6 状态状态机</strong>: pending / in_progress / failed + 3 个终态 (completed / cancelled / deleted),
+    显式转换, 终态锁定。
+  </li>
+<li>
+<strong>DAG 依赖图</strong>: 引用存在 + 无环 + 不可删被依赖,
+    <code>ensureDependencyGraphValid</code> 3 处必跑。
+  </li>
+<li>
+<strong>派生不写入</strong>: ready / blocks / progress 读取时算,
+    磁盘只存 source-of-truth, 双写会冲突。
+  </li>
+<li>
+<strong>audit events</strong>: 追加日志, 不驱动状态机, 给 "谁改
+    了什么" 留线索。
+  </li>
+<li>
+<strong>职责分离</strong>: TaskStore 管文件, TaskManager 管业务,
+    store 可换, manager 不变。
+  </li></ul>
 <p>
-  第 12 章让 harness 有了跨会话 plan, 但 plan 里的"长任务" (例如 "跑 5
-  分钟测试") 仍然会阻塞主 loop。下一章 Async Run 会让 agent.run()
-  内部启动后台任务, 主 loop 继续工作, 后台任务完成 时通过 reminder 通知
-  LLM。这是 harness 第一次具备"并行处理" 能力。
+  下一章 (第 13 章) 展开 Task 的<strong>并发执行</strong> —
+  LLM 跑长任务时怎么"同时跑 3 个独立 task, 不阻塞主对话", 用
+  Async Run Manager + 并发限制 3 + 输出隔离。
 </p>
diff --git a/tutorial/chapters/13-async-run.html b/tutorial/chapters/13-async-run.html
index 98f1bd3..16c6db9 100644
--- a/tutorial/chapters/13-async-run.html
+++ b/tutorial/chapters/13-async-run.html
@@ -1,580 +1,689 @@
-<p class="article__eyebrow">第 13 章 · 不阻塞主循环</p>
-<h1 class="article__title">后台跑长任务: Async Run</h1>
+<p class="article__eyebrow">第 13 章 · 后台跑长任务不阻塞主对话</p>
+<h1 class="article__title">Async Run: 3 并发上限 + 终态幂等 + 输出隔离</h1>
 <p class="article__lede">
-  前面十二章让 harness 在单次会话内能聊天、调工具、跑子任务、压缩、拦权限、 留
-  hook、记 memory、cache 友好、recovery、记 task。但 harness 内部如果 要"跑 5
-  分钟测试" 这种长任务, 仍然会阻塞主 loop, 用户只能干等。 这一章给 harness 加
-  Async Run 模块, 让 agent.run() 内部启动后台任务, 主 loop 继续等用户输入,
-  后台任务完成时通过 reminder 通知 LLM。
+  第 12 章的 Task 是"长期计划", 但 LLM 想"现在跑 3 个独立测试
+  (build / lint / typecheck), 不阻塞当前对话" 时, Task 不够 —
+  Task 是计划层, 需要<strong>执行层</strong>。 这一章加
+  <code>src/async-runs.ts</code> 模块: AsyncRun = 一次非阻塞运行
+  实例, 支持 command (跑 bash) 和 subagent (派子 agent) 两种
+  executor, 最多 3 个同时 running, 5 分钟超时, 终态<strong>幂等</strong>
+  (race condition 安全) + 输出<strong>隔离</strong> (写到独立文件
+  + LRU OutputStore)。 读完后, 你能讲清"Async Run vs Task" 的
+  边界 (执行层 vs 计划层) + "finishRun 收敛" 的核心正确性保证
+  + "并发限制 3" 的理由。
 </p>
-
-<nav id="article-inline-toc" class="article__meta" aria-label="页内小节"></nav>
-
-<hr class="rule" />
-
-<h2 id="delta-from-12">在第 12 章基础上改了什么</h2>
-<p>
-  这一章在 agent.run() 主循环里加 async run 工具 (<code>run_async</code>)。 当
-  LLM 决定要跑长任务时, 调 <code>run_async</code>, harness 把任务 派发到后台
-  worker (不阻塞主 loop), 返回 run_id; 主 loop 继续, 下一次 LLM 调用前, harness
-  把已完成的 async run 输出作为 reminder 注入, LLM 看到结果后决定下一步。
-  对应到代码, 改动集中在 3 个文件: <code>src/async-runs.ts</code> (新)、
-  <code>src/tools/run_async.ts</code> (新)、<code>src/agent.ts</code> (改 每轮
-  LLM 调用前 drain 通知)。
-</p>
-<div class="source-links" aria-label="本章 GitHub 永久链接">
-  <a
-    class="source-link"
-    href="https://github.com/pingp76/swoopcode/blob/main/src/async-runs.ts"
-    target="_blank"
-    rel="noreferrer"
-    >1. src/async-runs.ts: Async run 管理器 (新)</a
-  >
-  <a
-    class="source-link"
-    href="https://github.com/pingp76/swoopcode/blob/main/src/agent.ts"
-    target="_blank"
-    rel="noreferrer"
-    >2. src/agent.ts: 每轮 LLM 调用前 drain 通知</a
-  >
-  <a
-    class="source-link"
-    href="https://github.com/pingp76/swoopcode/blob/main/src/output-store.ts"
-    target="_blank"
-    rel="noreferrer"
-    >3. src/output-store.ts: 大输出存盘 (第 06 章 P1 复用)</a
-  >
-</div>
-
-<h2 id="author-thinking">作者怎么想的: 这一章的思考链</h2>
-<dl class="defs">
-  <dt>想清楚现象</dt>
-  <dd>
-    用户说"跑测试, 跑完后告诉我", harness 调 run_bash("npm test") 阻塞 5 分钟, 5
-    分钟内用户既不能输入也不能看到中间进度。现象是"长任务 占用主 loop,
-    用户体验差"。
-  </dd>
-  <dt>想反例</dt>
-  <dd>
-    最朴素的反例是"用 setTimeout 包装一下, 不阻塞就行"。这有两个问题:
-    一是后台任务的输出没有持久化, 进程崩了任务丢; 二是 LLM 不知道
-    "任务什么时候完成", 完成时没有任何信号通知主 loop。
-  </dd>
-  <dt>想接口和不变量</dt>
-  <dd>
-    接口:
-    <code
-      >interface AsyncRunManager { start(spec), list(), drainNotifications()
-      }</code
-    >。 不变量三条: (1) async run 的输出走 P1 压缩 (第 06 章), 大输出 存文件 +
-    output_id 占位, (2) 通知队列是 FIFO, 每轮 LLM 调用前 drain 一次,
-    同一通知不会重复注入, (3) run_id 由 harness 生成 (UUID), LLM 通过
-    read_async_output 工具按 id 读输出。
-  </dd>
-  <dt>想怎么验证</dt>
-  <dd>
-    fake asyncRunManager.start 返回固定 run_id, 跑完后 LLM 调 read_async_output
-    拿结果; fake LLM 第二轮收到的 messages 包含 reminder 标签
-    "&lt;system-reminder source='async-run'&gt;", 描述后台任务状态。
-  </dd>
-</dl>
-
-<h2 id="observe-first">先观察: 两段故意有气味的实现</h2>
-
-<div class="note">
-  <p class="note__title">观察 1 · setTimeout 包装, 假装不阻塞</p>
-  <pre class="code-block"><code>// 教学简化版, 仓库真实实现见 GitHub 永久链接
-// 错误: 用 setTimeout 包装一下, 没真正异步
-setTimeout(async () =&gt; {
-  const result = await exec("npm test");
-  history.add({ role: "user", content: `Async result: ${result}` });
-}, 0);</code></pre>
-  <p><strong>问:</strong>为什么不直接 setTimeout?</p>
-  <p>
-    <strong>答:</strong>三件事同时坏掉 —— 持久化: setTimeout 不进任何 持久层,
-    进程崩了任务丢; 通知: 完成后直接写 history, LLM 在第 5 轮突然看到 "Async
-    result: ..." 一条陌生 user message, 不知道 这是哪来的; 抢占: 没有冲突检测,
-    LLM 跑后台任务的同时调前台工具, 两者可能写同一文件。
-  </p>
-</div>
-
-<div class="note">
-  <p class="note__title">观察 2 · 通知不 drain, 每轮重复注入</p>
-  <pre class="code-block"><code>// 教学简化版
-async function run(query) {
-  for (;;) {
-    const messages = history.getMessages();
-    // 错误: 不 drain, 每轮 LLM 看到的都是完整通知队列
-    const notifications = asyncRunManager.getAllNotifications();
-    for (const n of notifications) {
-      history.add({ role: "user", content: formatNotification(n) });
-    }
-    /* ... */
-  }
-}</code></pre>
-  <p><strong>问:</strong>为什么不 drain?</p>
-  <p>
-    <strong>答:</strong>不 drain 的话, 同一通知每轮都注入, 10 轮后 history
-    里就有 10 条同样的 "Async run completed: X" 消息, token 浪费, LLM
-    看到重复内容也困惑。drain() 一次性取走, 队列清空, 每条通知 只注入一次。
-  </p>
-</div>
-
-<h2 id="interfaces">接口形状: 在写实现前钉死</h2>
-<pre class="code-block"><code>// 教学简化版, 仓库真实实现见 GitHub 永久链接
-export type AsyncRunStatus = "pending" | "running" | "completed" | "failed" | "cancelled";
-
-export interface AsyncRunSpec {
-  tool: string;                 // 调哪个工具 (例如 "run_bash")
-  args: Record&lt;string, unknown&gt;;
-  title: string;                // 给用户看的标题
-}
-
-export interface AsyncRun {
-  id: string;                   // harness 生成的 UUID
-  spec: AsyncRunSpec;
-  status: AsyncRunStatus;
-  outputRef?: { outputId: string };  // P1 压缩的 output_id (第 06 章)
-  error?: string;
-  startedAt: number;
-  finishedAt?: number;
-}
-
-export interface AsyncRunNotification {
-  runId: string;
-  title: string;
-  executor: string;
-  status: AsyncRunStatus;
-  preview: string;              // 摘要, LLM 第一时间看到
-  outputRef?: { outputId: string };
-}
-
-export interface AsyncRunManager {
-  start(spec: AsyncRunSpec): Promise&lt;AsyncRun&gt;;
-  get(id: string): AsyncRun | undefined;
-  list(): AsyncRun[];
-  cancel(id: string): boolean;
-  // 取走所有未读通知, 队列清空
-  drainNotifications(): AsyncRunNotification[];
-  // 前台工具是否与 running async run 冲突 (例如两个 run 写同一文件)
-  checkForegroundToolConflict(request: { toolName: string; args: Record&lt;string, unknown&gt; }): { blocked: boolean; reason?: string };
-}</code></pre>
-
-<h2 id="loop-integration">loop 接入: 每轮 drain 通知</h2>
-<pre class="code-block"><code>// 教学简化版, 仓库真实实现见 GitHub 永久链接
-async function run(query: string) {
-  history.add({ role: "user", content: query });
-
-  for (;;) {
-    // 1. drain async run 通知, 转成 reminder
-    const notifications = asyncRunManager.drainNotifications();
-    for (const n of notifications) {
-      const lines = ["Async run updates:"];
-      lines.push(
-        `- run_id: ${n.runId}`,
-        `  title: ${n.title}`,
-        `  status: ${n.status}`,
-        `  preview: ${n.preview}`,
-        n.outputRef?.outputId
-          ? `  full_output: use run_output_read with output_id ${n.outputRef.outputId}`
-          : `  full_output: use run_async_output_read with run_id ${n.runId}`,
-      );
-      const reminder = `&lt;system-reminder source="async-run"&gt;\n${lines.join("\n")}\n&lt;/system-reminder&gt;`;
-      history.add({ role: "user", content: reminder });
-    }
-
-    const messages = history.getMessages();
-    const assistant = await llm.chat(messages);
-    history.add(assistant);
-
-    if (!assistant.tool_calls) return assistant.content;
-
-    for (const call of assistant.tool_calls) {
-      // run_async 特殊处理: 派发到后台, 不阻塞
-      if (call.name === "run_async") {
-        const spec = call.args as AsyncRunSpec;
-        const run = await asyncRunManager.start(spec);
-        history.add({ role: "tool", tool_call_id: call.id,
-          content: `Started async run ${run.id}: ${spec.title}\nThe run will continue in background. You will be notified when it completes.` });
-        continue;
-      }
-
-      // 前台工具: 检查与 running async run 冲突
-      const conflict = asyncRunManager.checkForegroundToolConflict({
-        toolName: call.name, args: call.args,
-      });
-      if (conflict.blocked) {
-        history.add({ role: "tool", tool_call_id: call.id,
-          content: `Blocked: ${conflict.reason}` });
-        continue;
-      }
-
-      // 正常执行
-      const tool = registry.get(call.name);
-      const result = await tool.execute(call.args);
-      history.add({ role: "tool", tool_call_id: call.id, content: result.content });
-    }
-  }
-}</code></pre>
-
-<h2 id="conflict-detection">前台工具与后台任务冲突检测</h2>
+<nav aria-label="页内小节" class="article__meta" id="article-inline-toc"></nav>
+<hr class="rule"/>
+<h2 id="real-failure">真实失败故事: LLM 跑 npm test 卡死主对话 10 分钟</h2>
 <p>
-  harness 跑后台任务时, 主 loop 仍然会调前台工具 (例如读其他文件)。
-  两者可能写同一文件、修改同一 git branch、跑同一 service。conflict 检测由
-  <code>checkForegroundToolConflict()</code> 实现, 规则:
+  写代码之前, 先看一个真实痛点。 用户的 team 用 harness 跑大型
+  monorepo, LLM 跑一个简单 fix 后想"跑 npm test 验证", 但
+  npm test 跑 10 分钟才完。 这 10 分钟内 LLM 主循环被阻塞,
+  user 不能 ctrl-c, 不能切到别的对话, 不能问别的任务。
 </p>
-<dl class="defs">
-  <dt>写文件冲突</dt>
-  <dd>
-    run_bash / run_write 工具的 args.path 命中 running async run 的 monitored
-    paths, 视为冲突。
-  </dd>
-  <dt>同 tool 并发</dt>
-  <dd>
-    run_bash 同时跑两个, harness 拒绝第二个, 提示 "another run_bash is running
-    in background, wait or cancel first"。
-  </dd>
-  <dt>Git 状态冲突</dt>
-  <dd>
-    async run 在 running 状态下, 前台工具不允许做 git push / git rebase
-    (避免分支状态混乱)。
-  </dd>
-</dl>
-
-<h2 id="trap">反例梯度</h2>
-
-<div class="card card--trap">
-  <div class="card__head">
-    <span class="card__tag">新手错法 · A</span>
-  </div>
-  <div class="card__body">
-    <p>
-      <strong>常见错误:</strong>用 setTimeout 包装长任务, 完成后直接写 history。
-    </p>
-    <p>
-      <strong>为什么错:</strong>无持久化, 进程崩了任务丢; LLM
-      不知道任务何时完成; 无冲突检测。
-    </p>
-    <p>
-      <strong>正确做法:</strong>async run 走持久化, 完成时进通知队列, 每轮 LLM
-      drain。
-    </p>
-  </div>
-</div>
-
-<div class="card card--trap">
-  <div class="card__head">
-    <span class="card__tag">中级错法 · B</span>
-  </div>
-  <div class="card__body">
-    <p><strong>常见错误:</strong>通知不 drain, 每轮重复注入。</p>
-    <p>
-      <strong>为什么错:</strong>同一通知 10 轮后变成 10 条消息, 浪费 token, 误导
-      LLM。
-    </p>
-    <p>
-      <strong>正确做法:</strong>drainNotifications() 一次性取走, 队列清空,
-      每条只注入一次。
-    </p>
-  </div>
-</div>
-
-<div class="card card--trap">
-  <div class="card__head">
-    <span class="card__tag">高级错法 · C</span>
-  </div>
-  <div class="card__body">
-    <p>
-      <strong>常见错误:</strong>不检测前台 / 后台冲突, 两个 run_bash 同时跑。
-    </p>
-    <p>
-      <strong>为什么错:</strong>可能写同一文件、git 状态混乱、service 端口冲突。
-    </p>
-    <p>
-      <strong>正确做法:</strong>checkForegroundToolConflict() 入口检查,
-      冲突时拒绝并写 tool message。
-    </p>
-  </div>
-</div>
-
-<div class="card card--trap">
-  <div class="card__head">
-    <span class="card__tag">边界错法 · D</span>
-  </div>
-  <div class="card__body">
-    <p><strong>常见错误:</strong>大输出没走 P1 压缩, 直接塞进 tool message。</p>
-    <p>
-      <strong>为什么错:</strong>第 06 章的 P1 即时压缩不复用, async run
-      完成时直接撑爆 context。
-    </p>
-    <p>
-      <strong>正确做法:</strong>async run 完成时调
-      compressor.compressToolResult(), 大输出存文件 + output_id 占位。
-    </p>
+<ol>
+<li>
+<strong>症状</strong>: LLM 调 run_bash("npm test"), 主循环
+    <code>await</code> 卡 10 分钟, 整个 harness 被阻塞 (不是真正的死锁, 只是同步等待)。
+  </li>
+<li>
+<strong>更糟的并发场景</strong>: LLM 想"同时跑 3 个测试套件
+    (frontend / backend / e2e) 节省时间", 但 run_bash 是同步
+    的, 必须一个一个跑, 总共 30 分钟。 应该并发。
+  </li>
+<li>
+<strong>更更糟的资源滥用</strong>: LLM 想"派 20 个子 agent
+    跑 20 个独立 task", 20 个子 agent 同时跑, 把 LLM API
+    quota 烧光, 全部 fail。
+  </li>
+<li>
+<strong>真问题</strong>: 缺少<strong>非阻塞执行层</strong> +
+  <strong>并发限制</strong> + <strong>输出隔离</strong>。 LLM
+    需要的是"启动一个后台 run, 立即拿到 run_id, 继续干别的,
+    完事后用 run_id 读输出"。
+  </li>
+</ol>
+<p>
+  朴素想法 1: "直接用 Node child_process.spawn 不 await?"
+  半对。 spawn 立即返回 ChildProcess 对象, 不用 await, 但 LLM 不知道什么时候
+  跑完, 怎么读输出, 怎么让用户看到结果。 需要一个 Async Run
+  Manager 集中管理。
+</p>
+<p>
+  朴素想法 2: "无限制并发?" 错。 LLM 一次想跑 100 个, 把 API
+  quota 烧光, 用户账号被 ban。 需要<strong>并发限制 3</strong>
+  (经验值), 超过就 reject。
+</p>
+<p>
+  正确做法: 加 <code>src/async-runs.ts</code> — <code>createAsyncRunManager</code>
+  工厂, 内部状态 (records + runningCount + finishedRunIds) 全
+  在闭包。 <code>finishRun</code> 是<strong>所有</strong>执行
+  路径 (命令完成 / 超时 / 异常 / shutdown) 汇合的<strong>唯一</strong>
+  入口, 保证幂等。 看
+  <a href="https://github.com/pingp76/swoopcode/blob/main/src/async-runs.ts#L13"><code>src/async-runs.ts</code> 第 13 行的核心正确性保证注释</a>:
+</p>
+<pre><code class="language-typescript">// 核心正确性保证: finishRun() 是所有终态收敛的唯一入口, 确保:
+// 1. 只有 running → 终态的转换有效
+// 2. 第一个进入终态的路径 wins (通过 Set 保证)
+// 3. late result 不能覆盖 timeout
+// 4. 不重复递减 runningCount、不重复推送 notification</code></pre>
+<h2 id="run-vs-task">Async Run vs Task: 执行层 vs 计划层</h2>
+<p>
+  <strong>用途</strong>: Task 是<strong>计划层</strong> (跨会话
+  持久化, 带依赖图, LLM 跑 task_1 等 task_2 ready); Async Run
+  是<strong>执行层</strong> (session 内非阻塞运行, 5 分钟超时,
+  跑完即丢)。 两者职责正交, 不能互相替代。
+</p>
+<p>
+  <strong>真实场景</strong>: 用户让 LLM "跑 5 天迁移"。 LLM 先
+  创建 TaskGroup "migration-001" 含 500 个 task (计划层); 之后
+  每次开 session, LLM 调 <code>async_run_start({executor: "subagent", prompt: "跑 task_51 ~ task_60"})</code>
+  启动 Async Run (执行层)。 Async Run 内部派生独立 sub-agent
+  + 只读工具集, 跑 5 分钟, 完成后更新 task_51 ~ task_60 状态
+  (回到计划层)。
+</p>
+<p>
+  <strong>设计思想</strong>: <strong>层间清晰</strong> — Async
+  Run 不持久化, 进程重启丢失; Task 持久化, 跨 session 恢复。
+  Async Run 的 groupId / persistentTaskId 字段是"链接"到
+  Task 的桥, 不复制 Task 的状态。 看
+  <a href="https://github.com/pingp76/swoopcode/blob/main/src/async-runs.ts#L7"><code>src/async-runs.ts</code> 第 7-12 行的核心概念注释</a>:
+</p>
+<pre><code class="language-typescript">// 核心概念:
+// - Async Run = 一次非阻塞运行实例, 记录 run_id / status / output / notification
+// - 与 Task Group 不同: Async Run 是运行时执行层, 不是长期计划层
+// - 第一版只支持 command 和 subagent 两种 executor
+// - 第一版最多允许 3 个同时 running 的 async runs
+// - 第一版只允许只读探索和诊断命令</code></pre>
+<div class="figure figure--compare">
+  <div class="figure__title">图 1 · Async Run vs Task 的边界</div>
+  <div class="flow-compare">
+    <div class="flow-compare__col flow-compare__col--good">
+      <div class="flow-compare__head">Async Run · 运行时执行层</div>
+      <div class="flow-compare__body">session 内非阻塞, 5 分钟超时, 3 并发上限。 不持久化, 进程重启丢。 适合"现在跑一个 bash / 派一个子 agent"。</div>
+    </div>
+    <div class="flow-compare__col flow-compare__col--bad">
+      <div class="flow-compare__head">Task Group · 长期计划层</div>
+      <div class="flow-compare__body">跨会话持久化到 <code>.tasks/groups/</code>, 带依赖图 + 状态机。 适合"5 天迁移, 500 个 task"。</div>
+    </div>
   </div>
 </div>
-
-<h2 id="validate">如何验证 (本章 Validation 卡片)</h2>
-<div class="card card--validation">
-  <div class="card__head">
-    <span class="card__tag">Validation · 第 13 章</span>
-  </div>
-  <div class="card__body">
-    <p>
-      <strong>run_async 不阻塞主 loop:</strong>fake asyncRunManager.start 内部
-      sleep 5s, fake LLM 同一 run() 内调 run_async 后继续 generate, 主 loop 不等
-      5s。
-    </p>
-    <p>
-      <strong>通知 drain 不重复:</strong>fake asyncRunManager 预设 1 条
-      未读通知, 跑完第 1 轮 LLM 后 spy 验证 drainNotifications 被调用 1 次,
-      history 含 1 条 reminder; 第 2 轮 LLM 跑完时, history 不再追加这条通知 (已
-      drain)。
-    </p>
-    <p>
-      <strong>P1 压缩 output_id:</strong>fake async run 输出 100k 字符串, 跑完后
-      history reminder 的 content 含 "use run_output_read with output_id=xxx",
-      不含完整输出。
-    </p>
-    <p>
-      <strong>前台 / 后台冲突检测:</strong>fake async run 正在跑 run_bash, LLM
-      同一轮调 run_bash 前台, 写 tool message "Blocked: another run_bash is
-      running", spy 验证前台 run_bash 没被执行。
-    </p>
-    <p>
-      <strong>run_async 必写 tool message:</strong>fake asyncRunManager 返回
-      run_id "abc-123", 跑完后 history 末尾有 tool message "Started async run
-      abc-123: ..."。
-    </p>
+<p>
+  <strong>实现细节</strong>: Async Run 暴露 groupId / persistentTaskId
+  两个可选字段, 让 run 可以"链接"到 Task。 run 跑完后由 LLM 自己
+  更新 task 状态, Async Run 不会自动回写。 链接是<strong>显式</strong>
+  字段, 不是隐式行为。
+</p>
+<h2 id="two-executors">2 种 executor: command 和 subagent</h2>
+<p>
+  <strong>用途</strong>: LLM 启动 Async Run 时指定 executor 类型。
+  command = 跑一个 bash 命令 (read-only); subagent = 派一个独立
+  的子 agent (带只读工具集 + 父级 system prompt 快照)。 2 种覆盖
+  90% 后台任务场景。
+</p>
+<p>
+  <strong>真实场景</strong>: user 让 LLM "扫一下项目找所有 TODO 注释",
+  LLM 调 <code>async_run_start({executor: "command", command: "grep -rn 'TODO' src/"})</code>;
+  user 让 LLM "派一个子 agent 调研 React 18 新特性", LLM 调
+  <code>async_run_start({executor: "subagent", prompt: "调研 React 18 新特性", resources: {read_paths: ["docs/"]}})</code>。
+</p>
+<p>
+  <strong>设计思想</strong>: <strong>封闭类型, 未来扩展点</strong> —
+  2 种 executor 固定枚举, 不允许动态加第 3 种 (避免混乱); 未来
+  真要加 (如 MCP / 自定义 runner) 走 P2 阶段。 看
+  <a href="https://github.com/pingp76/swoopcode/blob/main/src/async-runs.ts#L48"><code>src/async-runs.ts</code> 第 48 行的 <code>AsyncRunExecutor</code> union</a>:
+</p>
+<pre><code class="language-typescript">export type AsyncRunExecutor = "command" | "subagent";</code></pre>
+<div class="figure figure--stack">
+  <div class="figure__title">图 2 · 2 种 executor 的资源差异</div>
+  <div class="flow-stack">
+    <div class="flow-stack__layer flow-stack__layer--warn">
+      <div class="flow-stack__label">command · bash 执行器</div>
+      <div class="flow-stack__body">跑 <code>executeBash(command, timeoutMs)</code>, 走 readonly 命令策略 (只允许 ls / cat / grep / find / git status 等只读命令)。 输出写到 output.txt。</div>
+    </div>
+    <div class="flow-stack__layer flow-stack__layer--dynamic">
+      <div class="flow-stack__label">subagent · 子 agent 执行器</div>
+      <div class="flow-stack__body">调 <code>createAgentFn(...)</code> 创建独立 child agent, 派生独立 history / compressor / tools (readPaths 过滤) / 父 system prompt 快照。 maxRounds 默认 8, 上限 20。</div>
+    </div>
   </div>
 </div>
-
-<h2 id="lookback">回望第 00–12 章: 哪些原则在本章兑现了</h2>
+<p>
+  <strong>实现细节</strong>: command 走 <code>launchCommandRunner</code>
+  内部 setTimeout timeout, subagent 走 <code>launchSubagentRunner</code>
+  复用 createAgentFn 工厂 + AbortController (shutdown 时主动 abort)。
+  两个函数返回的 Promise 都不被 <code>start()</code> await, 启动后
+  立即返回 run_id。
+</p>
+<h2 id="concurrency-limit">3 并发上限: 经验值的理由</h2>
+<p>
+  <strong>用途</strong>: 同时处于 running 的 Async Run 不能超过 3 个。
+  超过 3 个 reject, 错误信息明确告诉 LLM "当前并发已满, 请等待
+  现有 run 完成"。 这是 Reference 章节 "模式 14 · Concurrent Limit
+  并发限制" 的具体应用。
+</p>
+<p>
+  <strong>真实场景</strong>: LLM 想"同时跑 20 个测试套件节省时间",
+  启动第 4 个时报错 "Maximum concurrent async runs (3) reached"。
+  LLM 知道 "3 是上限", 调整策略: 跑完 3 个再启动下 3 个, 总时间
+  节省 67% 但不会爆 quota。
+</p>
+<p>
+  <strong>设计思想</strong>: <strong>经验值 + 教学友好</strong> —
+  3 是经验值, 不需要 LLM 配置; 想"我要 10 个" 改源码改常量。 看
+  <a href="https://github.com/pingp76/swoopcode/blob/main/src/async-runs.ts#L208"><code>src/async-runs.ts</code> 第 208 行的 <code>MAX_CONCURRENCY = 3</code></a>:
+</p>
+<pre><code class="language-typescript">const MAX_CONCURRENCY = 3;</code></pre>
+<p>
+  <strong>实现细节</strong>: 检查点在 <code>start()</code> 第 657 行
+  (<code>if (runningCount &gt;= MAX_CONCURRENCY) throw</code>),
+  <code>start()</code> 立刻 reject, 不排队, 不 await。 这条策略叫
+  "fail fast" — 让 LLM 立即知道"现在过载", 不会傻等。
+</p>
+<p>
+  <strong>3 的理由</strong>:
+</p>
 <ul>
-  <li>
-    <strong>tool call 协议沿用:</strong>run_async 是普通工具, 走 Tool interface,
-    派发后写 role: "tool" 消息。
-  </li>
-  <li>
-    <strong>P1 压缩复用:</strong>async run 输出调
-    compressor.compressToolResult(), 与第 06 章一致。
+<li>
+<strong>经验值</strong>: 3 = "够用但不过载"。 1 个太慢, 5+ 个
+    quota 容易爆, 3 是平衡点。
   </li>
-  <li>
-    <strong>reminder 标签格式:</strong>通知走 &lt;system-reminder
-    source="async-run"&gt; 标签, 不污染 system prompt。
+<li>
+<strong>Quota 安全</strong>: 3 个 subagent 同时跑, 假设每个调
+    LLM 5 次, 共 15 次 LLM 调用, 短时间不会触发 rate limit。
   </li>
-  <li>
-    <strong>工厂模式:</strong>asyncRunManager 是工厂, 在 createAgent()
-    闭包内维护当前 run 列表。
+<li>
+<strong>教学友好</strong>: 3 是单数字, 容易记住, 容易测试 (构造
+    3 个 running, 第 4 个 reject)。
   </li>
 </ul>
-
-<h2 id="forward">前瞻张力: 留给后续章节</h2>
-<dl class="defs">
-  <dt>async run 跨进程</dt>
-  <dd>
-    第 14 章 schedule 触发的任务本质上是 async run, scheduleManager 内部复用
-    asyncRunManager。
-  </dd>
-  <dt>async run 输出存档</dt>
-  <dd>第 15 章 transcript 记录 async run 的启动 / 完成 / 失败, 用于审计。</dd>
-  <dt>async run 与 task 协作</dt>
-  <dd>
-    async run 完成时自动更新 task 状态 (例如 run_bash("npm test") 完成后把 task
-    "跑测试" 标 completed)。
-  </dd>
-  <dt>async run 与 permission 协作</dt>
-  <dd>
-    async run 派发时仍然过 permission, 危险命令 deny, 写 tool message,
-    不在后台偷偷跑。
-  </dd>
-</dl>
-
-<h2 id="vibe-coding-13">本次如何 vibe code: 第 13 章的三件套</h2>
-
-<h3 id="vibe-feed-13">拆卡: 4 轮迭代的具体产物</h3>
-<ol>
-  <li>
-    <strong>第 1 轮 · 接口</strong>。让 LLM 给出 <code>AsyncRunSpec</code> /
-    <code>AsyncRun</code> / <code>AsyncRunNotification</code> /
-    <code>AsyncRunManager</code> 四个 interface。本轮不写实现, 重点钉"通知 drain
-    不重复" 和"run_id 由 harness 生成"。
-  </li>
-  <li>
-    <strong>第 2 轮 · 接线</strong>。让 LLM 给出 <code>index.ts</code> 接线,
-    <code>createAsyncRunManager()</code> 是 stub (start 永远返回固定 run),
-    agent.run 不接入 drain。本轮 review 重点: asyncRunManager 实例在
-    <code>index.ts</code> 只 new 一次。
-  </li>
-  <li>
-    <strong>第 3 轮 · 边界</strong>。让 LLM 写 createAsyncRunManager + run_async
-    工具 + agent.run 接入 drain + 冲突检测。本轮 review 重点: drain 一次性取走,
-    冲突检测入口写, P1 压缩复用第 06 章。
-  </li>
-  <li>
-    <strong>第 4 轮 · 验证</strong>。让 LLM 写
-    <code>test/async-runs.test.ts</code>。本轮 review 重点: "通知 drain 不重复"
-    和 "前台 / 后台冲突检测" 两条必须有 spy 验证。
-  </li>
-</ol>
-
-<h3 id="vibe-review-13">Review: 第 13 章专属 checklist</h3>
+<h2 id="finishrun-idempotent">finishRun 幂等: race condition 的核心防御</h2>
+<p>
+  <strong>用途</strong>: Async Run 跑完有 4 条可能路径 — (a) 命令
+  自然完成; (b) 5 分钟超时; (c) 命令抛错; (d) shutdown 主动
+  abandon。 这 4 条路径可能<strong>竞争</strong> (timeout 和
+  命令完成几乎同时触发), 必须有<strong>唯一收敛点</strong>保证
+  只产生一个终态。
+</p>
+<p>
+  <strong>真实场景</strong>: 跑 <code>npm test</code> 跑到接近 5
+  分钟, setTimeout 在 5 分钟整触发 timeout, 几乎同时 npm test 跑完
+  触发完成回调。 两个回调都调 <code>finishRun(record, "completed"/"timeout", ...)</code>。
+  没有幂等保护, runningCount 减 2 次, notification 推 2 条,
+  output 写 2 次, 状态错乱。 幂等保护让第一个 wins, 第二个
+  no-op。
+</p>
+<p>
+  <strong>设计思想</strong>: <strong>单一收敛点 + 幂等 Set</strong> —
+  Reference 章节 "模式 19 · Idempotent 幂等" 的标准应用。 看
+  <a href="https://github.com/pingp76/swoopcode/blob/main/src/async-runs.ts#L397"><code>src/async-runs.ts</code> 第 397 行的 <code>finishRun</code></a>:
+</p>
+<pre><code class="language-typescript">function finishRun(
+  record: AsyncRunRecord,
+  nextStatus: Exclude&lt;AsyncRunStatus, "running"&gt;,
+  output?: string,
+  error?: string,
+): boolean {
+  // 只有 running 状态才允许进入终态
+  if (record.status !== "running") return false;
+  // 第一个进入终态的路径 wins (Set 保证幂等)
+  if (finishedRunIds.has(record.id)) return false;
+  finishedRunIds.add(record.id);
+  clearRunRuntime(record.id);
+  // ... 写状态 / 写输出 / 推送 notification / 递减 runningCount
+}</code></pre>
+<p>
+  <strong>实现细节</strong>: 5 条正确性保证 (见注释第 384-389 行; 核心注释的第 4 条在这里拆成第 4、5 两条):
+</p>
 <ol>
-  <li>
-    <strong>通知 drain 一次性取走。</strong>验证: drainNotifications()
-    调用后队列清空, 下次调用返回空数组。
+<li>
+<strong>running → 终态</strong>: 已经终态的 record 拒绝再次
+    收敛。
   </li>
-  <li>
-    <strong>run_id 由 harness 生成。</strong>LLM 不允许自创 id。验证:
-    <code>grep -n 'crypto.randomUUID\|uuidv4' src/async-runs.ts</code> ≥ 1 行。
+<li>
+<strong>第一个 wins</strong>: 用 <code>finishedRunIds: Set</code>
+    标记已收敛, 重复调用 return false。
   </li>
-  <li>
-    <strong>run_async 必写 tool message。</strong>验证: 工具 execute 末尾
-    <code>history.add({role: "tool", ...})</code> 含 "Started async run"。
+<li>
+<strong>late result 不覆盖</strong>: timeout 先到之后, 迟到的命令
+    结果会被 finishRun 拒绝, 不会把 timeout 改回 completed;
+    反过来 completed 先到, 后到的 timeout 同样被拒绝。
   </li>
-  <li>
-    <strong>冲突检测在 execute 之前。</strong>验证: agent.ts 工具执行分支中,
-    conflict 检查在 tool.execute 之前。
+<li>
+<strong>不重复递减</strong>: runningCount 只在第一个 wins 时
+    减 1, 后续不重复减。
   </li>
-  <li>
-    <strong>P1 压缩复用第 06 章。</strong>验证:
-    <code>grep -n 'compressor.compressToolResult' src/async-runs.ts</code> ≥ 1
-    行。
+<li>
+<strong>不重复推送</strong>: notification 队列只 push 一次, 不
+    重复通知。
   </li>
 </ol>
-
-<h3 id="vibe-debug-13">调试: 第 13 章典型伪装</h3>
+<h2 id="isolation">输出隔离: 文件 + OutputStore LRU</h2>
+<p>
+  <strong>用途</strong>: 每个 Async Run 的输出独立写到
+  <code>async-runs/&lt;runId&gt;/output.txt</code>, 同时登记到
+  OutputStore (LRU 内存索引), LLM 用 run_id 或 output_id 读。
+  文件 = 长期存档, LRU = 快速查表。
+</p>
+<p>
+  <strong>真实场景</strong>: LLM 跑 3 个并发 Async Run (build /
+  lint / typecheck), 每个写到独立 output.txt, 完事后 LLM 调
+  <code>async_run_check(runId)</code> 拿 preview, 调
+  <code>async_run_output_read(runId, maxBytes)</code> 拿全文。
+  3 个 run 互不污染, 不会把 A 的输出塞给 B。
+</p>
+<p>
+  <strong>设计思想</strong>: <strong>独立文件 + LRU 内存索引</strong> —
+  Reference 章节 "模式 10 · Atomic Write 原子写" 的应用 — finishRun
+  写 output.txt 用 writeFileSync, 不追加; OutputStore 记录
+  (runId → outputId) 映射, LRU 淘汰旧 run 的全文 (避免内存爆炸)。
+</p>
+<p>
+  <strong>实现细节</strong>: 看
+  <a href="https://github.com/pingp76/swoopcode/blob/main/src/async-runs.ts#L426"><code>src/async-runs.ts</code> 第 426 行的 OutputStore 登记</a>
+  + 第 446 行的旧 outputPath 写入。 两份同时写 (file + store),
+  file 给 git/archive, store 给快速 LLM 读取。
+</p>
+<h2 id="notification-queue">notification 队列: 后台结果不阻塞当前 LLM</h2>
+<p>
+  <strong>用途</strong>: Async Run 完成后, 不能直接插入当前 LLM
+  调用 (会破坏 OpenAI API 消息格式, 第 08 章延迟注入协议), 也
+  不能阻塞主循环等 run 完。 正确做法: 跑完后 push 一条
+  notification 到队列, 下一轮 LLM 调用前调
+  <code>drainNotifications()</code>, 把 notification 渲染到
+  user message 末尾。
+</p>
+<p>
+  <strong>真实场景</strong>: LLM 启动 build / lint / typecheck 3
+  个 run, 立即拿到 3 个 run_id, 继续干别的。 5 分钟后 3 个 run
+  陆续完成, 3 条 notification 进队列。 LLM 下一次想"现在 build
+  怎么样", 调 <code>async_run_drain()</code> 拿 3 条 notification,
+  看到"build ✓, lint ✗ (1 error), typecheck ✓", 据此决定下一步。
+</p>
+<p>
+  <strong>设计思想</strong>: <strong>异步结果回传 = 排队, 不插入</strong> —
+  Reference 章节 "模式 9 · Observer 观察者模式" 的延伸。 notification
+  是<strong>短摘要</strong> (runId + status + preview), 全文靠
+  <code>readOutput</code> 读。 短摘要进 history 不撑爆, 全文不
+  进 history。
+</p>
+<p>
+  <strong>实现细节</strong>: notification 在 finishRun 内 push
+  到 <code>notifications: AsyncRunNotification[]</code> 队列,
+  LLM 调 <code>drainNotifications()</code> 拿所有未读 + 队列
+  清空。 幂等性保证 (核心注释第 4 条: 不重复推送 notification) 让重复 finishRun 不重复 push。
+</p>
+<h2 id="shutdown">shutdown: 进程退出时把 running 标 abandoned</h2>
+<p>
+  <strong>用途</strong>: harness 进程退出 (ctrl-c / session 关闭
+  / eval cleanup) 时, 仍有 running 的 Async Run。 应该把
+  这些 run 标 abandoned (不是 failed, 是"用户主动放弃"),
+  清内部状态, abort 子 agent。
+</p>
+<p>
+  <strong>真实场景</strong>: user 跑了 3 个 Async Run, 第 2 个跑
+  到一半, user ctrl-c 退出。 期望: harness 优雅退出, 仍在 running 的 run
+  全部标为 abandoned (已进入终态的保持原状态), 子 agent 收到 abort signal 停止
+  LLM/tool 调用, 内存状态清空, 写日志 "Async run ar_xxx
+  abandoned: process shutdown"。
+</p>
+<p>
+  <strong>设计思想</strong>: <strong>运行时生命周期, 非 LLM 工具</strong> —
+  shutdown 是 harness 内部用的, 不暴露给 LLM 工具。 看
+  <a href="https://github.com/pingp76/swoopcode/blob/main/src/async-runs.ts#L164"><code>src/async-runs.ts</code> 第 164 行的 shutdown 注释</a>:
+</p>
+<pre><code class="language-typescript">// 关闭当前进程内仍在 running 的 async runs。
+// 这是运行时生命周期方法, 不是 LLM 工具能力:
+// - 普通完成/失败/超时仍由 finishRun() 统一收敛
+// - 进程关闭或 eval cleanup 时调用 shutdown(), 把剩余 running run 标记为 abandoned
+// - command executor 底层 child_process 仍依赖 executeBash 的 timeout 机制退出;
+//   shutdown 的职责是清理 manager 状态、取消内部 timeout、abort 子 Agent</code></pre>
+<p>
+  <strong>实现细节</strong>: shutdown 调 finishRun(record, "abandoned", undefined, reason)
+  对每个 running record 收敛, finishRun 内部的 clearRunRuntime 清 timeout
+  + abort 子 agent。 这条保证"进程退出 = manager 状态干净 / 子 agent 停止 / 没有
+  内存泄漏"; command 的底层 child_process 仍靠 executeBash 自身的 timeout 退出。
+</p>
+<h2 id="foreground-conflict">foreground conflict: run_bash 不和 async_run 冲突</h2>
+<p>
+  <strong>用途</strong>: LLM 想"用 run_bash 跑命令" (前台同步)
+  vs "用 async_run_start 跑命令" (后台异步), 两者可能冲突
+  (前台命令占住 terminal, 后台命令也在跑)。 应该<strong>禁止
+  同一命令前后台并发</strong>。
+</p>
+<p>
+  <strong>真实场景</strong>: LLM 启动 <code>async_run_start({command: "npm test"})</code>,
+  之后又调 <code>run_bash("npm test")</code>。 应该报错"和
+  async run ar_xxx 冲突", 让 LLM 决定"等 async run 完" 或
+  "cancel async run 后再前台跑"。
+</p>
+<p>
+  <strong>设计思想</strong>: <strong>前后台冲突检测</strong> —
+  看
+  <a href="https://github.com/pingp76/swoopcode/blob/main/src/async-runs.ts#L174"><code>src/async-runs.ts</code> 第 174 行的 <code>checkForegroundToolConflict</code></a>:
+  manager 暴露这个方法, run_bash 调用前 check, 防止资源
+  竞争。 不主动 throw, 返回 blocked + reason, 让 agent
+  决定如何处理。
+</p>
+<p>
+  <strong>实现细节</strong>: 冲突检测规则: command 字面相同
+  (标准化后) + 都在 running 状态 → 冲突。 简单规则, 教学
+  友好; 工业级要做 AST 级别的"语义冲突" (两个命令读同一
+  文件 + 写同一文件), 太复杂本阶段不做。
+</p>
+<h2 id="loop-integration">主循环集成: start 立即返回, drain 在每轮前</h2>
+<p>
+  <strong>用途</strong>: AsyncRunManager 在 Composition Root 创建,
+  注入 agent + 注册工具 (async_run_start / check / list /
+  read_output / drain)。 主循环集成点 2 个:
+</p>
 <ol>
-  <li>
-    <strong>伪装 A · 通知不 drain, getAllNotifications 替代。</strong>症状:
-    通知每轮 LLM 都重新注入, history 增长 10 倍。验证: Validation 卡片"通知
-    drain 不重复" 那条测试通过。
-  </li>
-  <li>
-    <strong>伪装 B · run_async 走前台 await, 阻塞主 loop。</strong>症状: start()
-    内部 await exec(...), 主 loop 等待。验证: Validation 卡片"run_async 不阻塞主
-    loop" 那条测试通过 (fake start 内部 sleep 5s, 主 loop 跑通)。
+<li>
+<strong>async_run_start 工具</strong>: LLM 调, manager 启动
+    run, <strong>立即</strong>返回 record (不 await run 完)。
+    record 含 run_id / status=running / preview。
   </li>
-  <li>
-    <strong>伪装 C · 不检测冲突, 两个 run_bash 同时跑。</strong>症状: agent.run
-    不调 checkForegroundToolConflict。验证: Validation 卡片"前台 / 后台冲突检测"
-    那条测试通过。
+<li>
+<strong>async_run_drain 工具 + reminder</strong>: LLM 调或
+    reminder 注入, manager 返回所有未读 notification, 渲染
+    到 user message 末尾。 主循环在每轮 LLM 调用前 drain 一次。
   </li>
 </ol>
-
-<h3 id="vibe-iterate-13">迭代: 第 13 章 4 个 commit 节点</h3>
+<p>
+  <strong>设计思想</strong>: <strong>非阻塞 + 延迟回传</strong> —
+  start 立即返回 + drain 主动拉, LLM 主循环不被后台 run 阻塞。
+  这是 Reference 章节 "模式 9 · Observer 观察者模式" 的应用。
+</p>
+<p>
+  <strong>实现细节</strong>: agent.ts 在第 3 步 (call LLM) 之前
+  调 <code>asyncRunManager.drainNotifications()</code>, 如果有
+  notification 就拼到 user message 末尾, 让 LLM 看到"上次后台
+  run 跑完了, 结果是这样"。
+</p>
+<h2 id="fake-test">fake test: 模拟 timeout + 命令完成竞态</h2>
+<p>
+  <strong>用途</strong>: AsyncRunManager 测试需要<strong>假 timer</strong>
+  (超时靠 setTimeout, 用 fake timer 推进时间) + <strong>假 executor</strong> (注入假
+  command runner, 不真跑 bash)。 这两者结合, 可以测"timeout +
+  命令完成" 的 race condition。
+</p>
+<p>
+  <strong>真实场景</strong>: 用户写测试"启动一个 1 秒超时的
+  command, 0.5 秒后手动调 finishRun(..., "completed") 模拟
+  命令提前完成; 然后 1 秒后 timeout 触发, 调 finishRun(..., "timeout")。
+  验证: record.status === "completed" (不是 timeout), runningCount
+  === 0, notification 队列只有 1 条 (不是 2 条)"。
+</p>
+<pre><code class="language-typescript">test("race: timeout 后到不覆盖提前完成", () =&gt; {
+  vi.useFakeTimers();
+  const mgr = createAsyncRunManager({...});
+
+  // 启动一个 1 秒超时的 command
+  const record = mgr.start({ title: "x", executor: "command", command: "echo a", timeoutMs: 1000 });
+
+  // 0.5 秒后手动调 finishRun (模拟命令提前完成)
+  vi.advanceTimersByTime(500);
+  // 这里需要拿到内部 record, 测试可能要 hack 接口
+  // 实际测试通过 createAsyncRunManager 注入一个 fake runner
+
+  // 1 秒后 timeout 触发 (fake timer)
+  vi.advanceTimersByTime(500);
+  // 验证: record.status === "completed"
+  expect(mgr.check(record.id)!.status).toBe("completed");
+  expect(mgr.list({ includeTerminal: true })).toHaveLength(1);
+
+  vi.useRealTimers();
+});
+
+test("并发限制 3: 启动第 4 个 reject", () =&gt; {
+  const mgr = createAsyncRunManager({...});
+  mgr.start({ title: "a", executor: "command", command: "echo a", timeoutMs: 60000 });
+  mgr.start({ title: "b", executor: "command", command: "echo b", timeoutMs: 60000 });
+  mgr.start({ title: "c", executor: "command", command: "echo c", timeoutMs: 60000 });
+  expect(() =&gt; mgr.start({ title: "d", executor: "command", command: "echo d", timeoutMs: 60000 }))
+    .toThrow(/Maximum concurrent async runs/);
+});
+
+test("shutdown 把 running 标 abandoned", () =&gt; {
+  const mgr = createAsyncRunManager({...});
+  const r = mgr.start({ title: "a", executor: "command", command: "echo a", timeoutMs: 60000 });
+  mgr.shutdown!("test shutdown");
+  expect(mgr.check(r.id)!.status).toBe("abandoned");
+});</code></pre>
+<p>
+  <strong>实现细节</strong>: race condition 测试需要
+  <code>vi.useFakeTimers()</code>。 内部 record 生产 API 不暴露,
+  但测试不必 hack 闭包。 一般做法: 在 createAsyncRunManager
+  deps 注入一个 fake <code>setTimeout</code> (用 fake timer),
+  record 通过 <code>check()</code> 接口读。
+</p>
+<h2 id="common-confusion">常见误解: Async Run 不是后台进程池</h2>
+<p>
+  <strong>误解 1: "Async Run = 启动一个 Node 子进程?"</strong>
+  错。 command executor 内部用 child_process, 但 Async Run 是
+  <strong>管理层</strong>, 不是进程本身。 LLM 调 start() 拿到
+  record, 不是拿到 PID。
+</p>
+<p>
+  <strong>误解 2: "无限并发, 多多益善?"</strong> 错。 3 是上限,
+  超就 reject。 LLM 想"再快也要 3 个一组, 跑完再起下一组"。
+</p>
+<p>
+  <strong>误解 3: "timeout 一定赢?"</strong> 错。 谁先到谁赢,
+  不分 timeout / completed / failed。 命令提前完成 → completed;
+  timeout 先到 → timeout; 都可能, 看时序。
+</p>
+<p>
+  <strong>误解 4: "Async Run 持久化?"</strong> 错。 session 内
+  内存, 进程重启丢。 跨会话跟踪用 Task (第 12 章)。
+</p>
+<h2 id="trap">反例梯度: 4 个常见错误</h2>
+<div class="cards-grid">
+  <div class="card card--bad">
+    <div class="card__head"><span class="card__tag">反例 1 · 没有 finishRun 幂等</span></div>
+    <div class="card__body">
+      <p>让 timeout 和命令完成都直接改 record.status, 没有
+        Set 标记。 race condition 时 runningCount 减 2 次,
+        notification 推 2 条, output 写 2 次。 错。 正确:
+        <code>finishRun</code> 是唯一收敛点, 用
+        <code>finishedRunIds: Set</code> 保证幂等, 第二个
+        调用 return false。</p>
+    </div>
+  </div>
+  <div class="card card--bad">
+    <div class="card__head"><span class="card__tag">反例 2 · 没有并发限制</span></div>
+    <div class="card__body">
+      <p>start() 不检查 runningCount, LLM 一次想跑 100 个就
+        真跑 100 个, 把 LLM API quota 烧光。 错。 正确:
+        <code>if (runningCount &gt;= MAX_CONCURRENCY) throw</code>
+        fail fast, 让 LLM 立即知道过载。</p>
+    </div>
+  </div>
+  <div class="card card--bad">
+    <div class="card__head"><span class="card__tag">反例 3 · 输出直接进 history</span></div>
+    <div class="card__body">
+      <p>Async Run 跑完直接把 output 字符串 append 到 history。
+        错。 破坏 LLM API 消息格式 (tool_call 后立即 user message,
+        第 08 章延迟注入), 也撑爆 history。 正确: 输出写到
+        output.txt + OutputStore, LLM 主动调
+        <code>readOutput(runId, maxBytes)</code> 拉取。</p>
+    </div>
+  </div>
+  <div class="card card--bad">
+    <div class="card__head"><span class="card__tag">反例 4 · shutdown 不 abort 子 agent</span></div>
+    <div class="card__body">
+      <p>shutdown() 只清 manager 状态, 不 abort 子 agent 内部
+        的 LLM/tool 调用。 子 agent 继续跑, 写日志, 调 LLM API,
+        进程退出时这些调用全 mid-write 报错。 错。 正确: 持有
+        <code>abortControllers: Map&lt;runId, AbortController&gt;</code>,
+        shutdown 时对每个 running run 调 <code>abortController.abort()</code>,
+        子 agent 内部 LLM client 收到 abort signal 停止新调用。</p>
+    </div>
+  </div>
+</div>
+<h2 id="validate">Validation: 4 条不变量检验</h2>
 <ol>
-  <li>
-    <code
-      >feat(ch13): 钉 AsyncRunSpec / AsyncRun / AsyncRunNotification /
-      AsyncRunManager 接口</code
-    >
-    —— tsc 通过, 无实现。
-  </li>
-  <li>
-    <code>feat(ch13): createAsyncRunManager 工厂 + run_async 工具 stub</code> ——
-    tsc 通过, start 永远同步返回。
-  </li>
-  <li>
-    <code>feat(ch13): drain 通知 + 冲突检测 + P1 压缩复用 + run_id 生成</code>
-    —— 跑通 Validation 卡片前 4 条。
-  </li>
-  <li>
-    <code
-      >test(ch13): run_async 必写 tool message + run_id 由 harness 生成</code
-    >
-    —— 全绿。
+<li>
+<strong>finishRun 幂等</strong>: 对同一 record 调 finishRun 2
+    次, 第二次 return false, record 状态不变化, runningCount
+    不重复减。 验证: 模拟 race condition (timeout 和 completed
+    同时触发), 跑测试断言。
+  </li>
+<li>
+<strong>并发限制</strong>: 启动 4 个 run, 第 4 个 throw "Maximum
+    concurrent async runs (3) reached"。 验证: 单测 start 3
+    个 + start 第 4 个 expect throw。
+  </li>
+<li>
+<strong>状态机合法</strong>: running → 4 个终态 (completed /
+    failed / timeout / abandoned) 合法, 终态 → 终态不合法。
+    验证: 单测, 终态后 start 一条 new task 想覆盖 status, expect
+    throw / no-op。
+  </li>
+<li>
+<strong>shutdown 清干净</strong>: 启动 3 个 run, shutdown,
+    所有 run 状态 abandoned, abortControllers / timeoutIds
+    清空, runningCount === 0。 验证: 跑 shutdown 后调
+    list({includeTerminal: true}) 全是 abandoned 状态。
   </li>
 </ol>
-
+<h2 id="lookback">回望: 哪些原则在本章兑现了</h2>
+<ul>
+<li>
+<strong>单一收敛点 + 幂等</strong>: finishRun 是 4 条执行路径
+    的唯一汇合, Set 保证幂等, race condition 安全。
+  </li>
+<li>
+<strong>fail fast</strong>: 并发限制 3 超就 reject, 不排队不
+    await, LLM 立即知道过载。
+  </li>
+<li>
+<strong>异步结果回传 = 排队</strong>: notification 队列, 不
+    插入当前 LLM, 延迟到下一轮 drain。
+  </li>
+<li>
+<strong>运行时生命周期</strong>: shutdown 是 harness 内部用,
+    不暴露 LLM 工具, abort 子 agent 避免 zombie。
+  </li>
+<li>
+<strong>执行层 vs 计划层</strong>: Async Run 内存不持久, Task
+    持久化, groupId 字段桥接两者。
+  </li>
+</ul>
+<h2 id="forward">前瞻张力: 留给后续章节</h2>
+<dl class="defs">
+<dt>定时跑 Async Run</dt>
+<dd>
+    当前 Async Run 是 LLM 手动触发, 想"每天 9 点跑一次全项目
+    测试" 是第 14 章 Schedule 的范畴, 复用 Async Run 作为
+    执行单元, scheduler 负责定时触发。
+  </dd>
+<dt>跨进程 Async Run 协调</dt>
+<dd>
+    当前 Async Run 进程内, 进程退出全 abandoned。 想要
+    "harness 进程退出, run 仍跑, 进程回来能 drain 通知"
+    是分布式范畴, 留 P2 阶段。
+  </dd>
+<dt>取消正在跑的 Async Run</dt>
+<dd>
+    当前没有 cancel API, 只能等 timeout 或自然完成。 实战中
+    user 想"立刻停掉跑得太慢的 run", 留 P2 加 cancel(runId)
+    调 abortController.abort()。
+  </dd>
+</dl>
 <h2 id="prompt-card">Prompt Card (本章任务)</h2>
 <div class="card card--prompt">
   <div class="card__head">
     <span class="card__tag">Prompt Card · 第 13 章</span>
-    <button class="card__copy" type="button" data-copy-card>复制</button>
+    <button class="card__copy" data-copy-card="" type="button">复制</button>
   </div>
   <div class="card__body">
-    <p>
-      <strong>目标:</strong>实现 Async Run 模块, 让 harness 在跑长任务时不阻塞主
-      loop, 完成后通过 reminder 通知 LLM。
-    </p>
-    <p>
-      <strong>场景:</strong>用户说"跑测试, 跑完后告诉我", LLM 调
-      run_async("run_bash", {cmd: "npm test"}), 主 loop 继续, 5
-      分钟后后台任务完成, 第 N 轮 LLM 调用前 reminder 注入, LLM
-      看到结果后告诉用户。
-    </p>
-    <p>
-      <strong>模块:</strong> <code>src/async-runs.ts</code> (新) 暴露
-      <code>createAsyncRunManager()</code>;
-      <code>src/tools/run_async.ts</code> (新) 实现工具;
-      <code>src/agent.ts</code> 每轮 LLM 调用前 drain 通知 + 冲突检测;
-      <code>src/index.ts</code> 接线 asyncRunManager。
-    </p>
+    <p><strong>目标:</strong> 给 harness 加 Async Run 系统, LLM 启动
+      非阻塞后台执行, 立即拿 run_id, 完成后通过 notification 队列
+      回传。</p>
+    <p><strong>场景:</strong> LLM 跑大型 monorepo fix, 想"同时跑
+      build / lint / typecheck", 启动 3 个 Async Run (命令),
+      立即拿到 3 个 run_id (已到 3 并发上限), 继续干别的。 5 分钟后
+      build / lint / typecheck 陆续完成, 通知进队列, LLM 下一轮 drain
+      看到结果, 再派 1 个子 agent 调研 (subagent)。</p>
+    <p><strong>模块:</strong> <code>src/async-runs.ts</code> (新)
+      暴露 <code>createAsyncRunManager(deps)</code>; 工具
+      <code>src/tools/async-run-*.ts</code> (新) 包装 LLM 接口;
+      <code>src/agent.ts</code> (改) 第 3 步前 drain notifications;
+      <code>src/index.ts</code> (改) Composition Root 创建 +
+      shutdown 钩子。</p>
     <p><strong>边界 (LLM 必须遵守的 checklist):</strong></p>
     <ul>
-      <li>通知 drain 一次性取走, 队列清空, 同条通知不重复注入</li>
-      <li>run_id 由 harness 生成 (UUID), LLM 不允许自创</li>
-      <li>run_async 必写 role: "tool" 消息含 run_id</li>
-      <li>前台工具执行前过冲突检测, 冲突拒绝写 tool message</li>
-      <li>大输出走 P1 压缩 (复用第 06 章 compressor)</li>
+      <li>2 种 executor 固定枚举 command / subagent, 不可新增第 3 种</li>
+      <li>3 并发上限, 超就 reject throw, fail fast 不排队</li>
+      <li>5 分钟超时 (MAX_TIMEOUT_MS = 300_000), 超时和命令完成竞争由 finishRun 收敛</li>
+      <li>finishRun 是所有终态收敛的唯一入口, 用 Set 幂等, 重复调用 return false</li>
+      <li>输出隔离: 写到 output.txt + OutputStore LRU, 不进 history</li>
+      <li>notification 队列短摘要, LLM 主动 drain, 不插入当前 LLM 调用</li>
+      <li>shutdown 调 finishRun(record, "abandoned") + abortController.abort(), 进程退出不留 zombie</li>
+      <li>前后台冲突检测: command 字面相同 + 都在 running → 拒绝, 错误含 "conflict"</li>
     </ul>
-    <p>
-      <strong>验证 (用 fake asyncRunManager + spy, 逐条落到 vitest):</strong>
-    </p>
+    <p><strong>验证 (用 fake timer + fake executor + vitest, 逐条断言):</strong></p>
     <ul>
-      <li>
-        fake start 内部 sleep 5s, run_async 调用后主 loop 不等 5s, 继续调 LLM
-      </li>
-      <li>
-        fake 预设 1 条通知, 第 1 轮 LLM 后 spy 验证 drainNotifications 被调用 1
-        次, 第 2 轮 LLM 后 history 不再追加这条通知
-      </li>
-      <li>
-        fake async run 输出 100k 字符串, history reminder 含 "output_id=xxx"
-        占位
-      </li>
-      <li>
-        fake async run 正在跑 run_bash, LLM 调 run_bash 前台, 写 tool message
-        "Blocked", spy 验证前台没执行
-      </li>
-      <li>
-        fake start 返回 run_id "abc-123", history 含 "Started async run abc-123"
-      </li>
+      <li>finishRun 幂等: 同一 record 调 2 次, 第二次 return false, 状态不重复写</li>
+      <li>并发限制: 启动 3 个后第 4 个 throw "Maximum concurrent async runs"</li>
+      <li>race condition: timeout + completed 竞争, 第一个 wins, 第二个 no-op</li>
+      <li>shutdown: 3 个 running → 全部 abandoned, abortControllers 清空</li>
+      <li>notification 队列: 完成 3 个 run, drain 返回 3 条, 重复 drain 返回 0</li>
     </ul>
   </div>
 </div>
-
 <h2 id="practice">本章练习</h2>
 <ol>
-  <li>
-    故意把 start() 内部 await 5s, 跑测试, 看"run_async 不阻塞主 loop" 是否抓到
-    (主 loop 会卡 5s)。
+<li>
+    故意去掉 <code>finishedRunIds</code> Set, 跑 race condition
+    测试 (timeout + completed 同时触发), 看"幂等" 是否抓到
+    (runningCount 减 2 次, notification 推 2 条 vs. 1 条)。
   </li>
-  <li>
-    在 drain 后忘记清空队列, 跑测试, 看"通知 drain 不重复" 是否抓到 (第 2 轮
-    history 仍追加)。
+<li>
+    故意去掉 <code>if (runningCount &gt;= MAX_CONCURRENCY) throw</code>
+    检查, 启动 5 个 run, 看"并发限制" 是否抓到 (5 个全跑 vs.
+    第 4 个 throw)。
   </li>
-  <li>
-    在冲突检测通过后忘记拒绝, 跑测试, 看"前台 / 后台冲突检测" 是否抓到 (spy
-    验证前台被执行了)。
+<li>
+    故意让 Async Run 跑完直接 append output 到 history, 跑测试
+    断言 OpenAI API 消息格式, 看"输出隔离" 是否抓到 (消息格式
+    错误 vs. 写到 output.txt + LRU store)。
+  </li>
+<li>
+    故意不 abort 子 agent, 跑 shutdown 测试, 看"shutdown abort"
+    是否抓到 (子 agent 内部 LLM 调用 mid-write 报错 vs. abort
+    signal 干净退出)。
   </li>
 </ol>
-
 <h2 id="summary">本章小结</h2>
 <p>
-  本章给 harness 加了 Async Run 模块, 让长任务派发到后台, 主 loop
-  继续等用户输入。通知走 reminder 标签, 不污染 system prompt; 大 输出走 P1
-  压缩复用第 06 章; 冲突检测防止前台 / 后台写同一文件。 下一章 (第 14 章)
-  我们会处理"定时触发 agent" 的问题——Schedule, 让 harness 每天自动跑一次代码
-  review 报告。
+  Async Run 是给后台执行的<strong>非阻塞运行层</strong>, 区别
+  于 Task 的长期计划层。 核心是 5 个设计:
 </p>
-
-<h2 id="next">下一章伏笔</h2>
+<ul>
+<li>
+<strong>2 种 executor</strong>: command (只读 bash) / subagent
+    (派生独立 child agent), 固定枚举, 不可新增。
+  </li>
+<li>
+<strong>3 并发上限</strong>: 经验值, 超就 reject, fail fast
+    不排队。
+  </li>
+<li>
+<strong>finishRun 幂等</strong>: 4 条执行路径 (完成 / 超时 /
+    失败 / abandoned) 唯一收敛点, Set 标记, race condition 安全。
+  </li>
+<li>
+<strong>输出隔离</strong>: output.txt + OutputStore LRU, 不
+    进 history, LLM 主动 read。
+  </li>
+<li>
+<strong>notification 队列</strong>: 短摘要, 不插入当前 LLM, 延迟到下一轮 drain。
+  </li>
+</ul>
 <p>
-  第 13 章让 harness 能跑后台任务, 但任务必须由 LLM 在会话内主动调 run_async
-  才能跑。下一章 Schedule 模块让 harness 按时间自动触发 任务 (例如"每天 8
-  点跑一次测试"), 任务本质上是 async run, scheduleManager 内部复用
-  asyncRunManager。这是 harness 第一次 具备"定时运行" 能力, 也是后台 agent +
-  task 系统的最终拼接。
+  下一章 (第 14 章) 展开 Async Run 的<strong>定时触发</strong>
+  — 复用 Async Run 作为执行单元, ScheduleManager 负责"每天 9 点
+  跑一次" / "每周一 10 点跑一次" 的 cron-like 调度, 用 ID 去重
+  保证不重复跑。
 </p>
diff --git a/tutorial/chapters/14-schedule.html b/tutorial/chapters/14-schedule.html
index 36d4957..82e0ffc 100644
--- a/tutorial/chapters/14-schedule.html
+++ b/tutorial/chapters/14-schedule.html
@@ -1,573 +1,744 @@
-<p class="article__eyebrow">第 14 章 · 让时间触发 Agent</p>
-<h1 class="article__title">定时任务: Schedule</h1>
+<p class="article__eyebrow">第 14 章 · 定时触发复用 Async Run</p>
+<h1 class="article__title">Schedule: 5 秒 tick + 6 种 recurrence + missed 不补跑</h1>
 <p class="article__lede">
-  前面十三章让 harness 在单次会话内能聊天、调工具、跑子任务、压缩、拦权限、 留
-  hook、记 memory、cache 友好、recovery、记 task、跑后台。但任务必须 由 LLM
-  在会话内主动调 run_async 才能跑。这一章给 harness 加 Schedule 模块, 让 harness
-  按时间自动触发任务 (例如"每天 8 点跑一次测试"), 任务本质上是 async run,
-  scheduleManager 内部复用 asyncRunManager。
+  第 13 章的 Async Run 是"LLM 手动启动后台执行", 但生产里用户
+  经常想"每天 9 点跑一次全项目测试" / "每周一 10 点扫一下 TODO"
+  / "每 5 分钟查一下 CI 状态"。 这一章加 <code>src/schedules.ts</code>
+  模块: Schedule 是<strong>时间触发器</strong> (不是执行器),
+  复用 Async Run 作为执行单元, 5 秒 tick 一次扫所有 active
+  schedule, 6 种 recurrence (every_seconds / hourly / daily /
+  weekly / monthly / yearly), occurrence 含 stable id 防重复触发,
+  启动时检测 missed occurrence 但<strong>不补跑</strong>, overlap
+  policy 控制"上次还没跑完这次是否跳过"。 读完后, 你能讲清
+  "Schedule 不是 cron 替代品" 的边界 (in-process tick, 不持久
+  唤醒), 并能用 fake clock 验证"missed 不补" + "overlap skip" 两条
+  关键不变量。
 </p>
-
-<nav id="article-inline-toc" class="article__meta" aria-label="页内小节"></nav>
-
-<hr class="rule" />
-
-<h2 id="delta-from-13">在第 13 章基础上改了什么</h2>
-<p>
-  这一章加 schedule 持久化层 + 调度循环。schedule 数据落盘到
-  <code>~/.claude/schedules/&lt;schedule-id&gt;.json</code>, 包含 cron 表达式 /
-  命令 / 启用状态。harness 启动时加载所有 schedule, 后台调度循环 每分钟检查一次,
-  命中 cron 时派发到 async run。LLM 通过 schedule 工具创建 / 暂停 / 启用
-  schedule。 对应到代码, 改动集中在 4 个文件:
-  <code>src/schedules.ts</code> (新)、
-  <code>src/schedule-store.ts</code> (新)、<code>src/tools/schedule.ts</code>
-  (新, 含 create / list / enable / disable / delete)、<code>src/agent.ts</code>
-  (改每轮 LLM 调用前 drain schedule 通知)。
-</p>
-<div class="source-links" aria-label="本章 GitHub 永久链接">
-  <a
-    class="source-link"
-    href="https://github.com/pingp76/swoopcode/blob/main/src/schedules.ts"
-    target="_blank"
-    rel="noreferrer"
-    >1. src/schedules.ts: 调度管理器 (新)</a
-  >
-  <a
-    class="source-link"
-    href="https://github.com/pingp76/swoopcode/blob/main/src/schedule-store.ts"
-    target="_blank"
-    rel="noreferrer"
-    >2. src/schedule-store.ts: 调度存储 (新)</a
-  >
-  <a
-    class="source-link"
-    href="https://github.com/pingp76/swoopcode/blob/main/src/agent.ts"
-    target="_blank"
-    rel="noreferrer"
-    >3. src/agent.ts: 每轮 LLM 调用前 drain schedule 通知</a
-  >
-  <a
-    class="source-link"
-    href="https://github.com/pingp76/swoopcode/blob/main/src/async-runs.ts"
-    target="_blank"
-    rel="noreferrer"
-    >4. src/async-runs.ts: 调度命中时复用 (第 13 章)</a
-  >
-</div>
-
-<h2 id="author-thinking">作者怎么想的: 这一章的思考链</h2>
-<dl class="defs">
-  <dt>想清楚现象</dt>
-  <dd>
-    用户想让 harness 每天 8 点自动跑测试并把报告发到 Slack, 但 harness
-    本身没有时间触发机制, 只能等用户主动开 harness 调 run_async。 现象是"harness
-    是被动的, 不是主动的"。
-  </dd>
-  <dt>想反例</dt>
-  <dd>
-    最朴素的反例是"用 setInterval 检查 cron"。这有两个问题: 一是 setInterval
-    在进程退出后丢失, 跨进程重启调度状态不复用; 二是 时区问题, 用户的"每天 8 点"
-    在不同地区含义不同。
-  </dd>
-  <dt>想接口和不变量</dt>
-  <dd>
-    接口: <code>interface Schedule { id, cron, command, enabled }</code>。
-    不变量三条: (1) schedule 落盘走 atomic write (第 15 章), 跨进程 保留, (2)
-    调度循环每分钟检查一次, 命中 cron 时派发到 async run (第 13 章), (3)
-    调度通知走单独的 source 标签 "&lt;system-reminder source='schedule'&gt;",
-    不与 async run 通知混。
-  </dd>
-  <dt>想怎么验证</dt>
-  <dd>
-    fake scheduleManager 预设 cron "* * * * *" (每分钟), 时间快进 1 分钟,
-    跑完调度循环 spy 验证 asyncRunManager.start 被调用 1 次; fake schedule
-    命中时, LLM 下一轮收到的 messages 含 schedule reminder 标签。
-  </dd>
-</dl>
-
-<h2 id="observe-first">先观察: 两段故意有气味的实现</h2>
-
-<div class="note">
-  <p class="note__title">观察 1 · setInterval 检查 cron, 不持久化</p>
-  <pre class="code-block"><code>// 教学简化版, 仓库真实实现见 GitHub 永久链接
-// 错误: 用 setInterval 调度, 进程退出后丢失
-setInterval(() =&gt; {
-  for (const schedule of schedules) {
-    if (cronMatches(schedule.cron, new Date())) {
-      runAsync(schedule.command);  // 不持久化
-    }
-  }
-}, 60_000);</code></pre>
-  <p><strong>问:</strong>为什么不直接 setInterval?</p>
-  <p>
-    <strong>答:</strong>三件事同时坏掉 —— 持久化: setInterval 状态在内存 里,
-    进程崩了 schedule 丢, 用户得重新配; 跨进程: 多进程同时跑 会重复触发,
-    需要文件锁; 状态可观测: 用户无法知道"下次触发时间" 和"上次触发时间",
-    调试困难。
-  </p>
-</div>
-
-<div class="note">
-  <p class="note__title">观察 2 · schedule 通知和 async run 通知混在一起</p>
-  <pre class="code-block"><code>// 教学简化版
-const notifications = [
-  ...asyncRunManager.drainNotifications(),
-  ...scheduleManager.drainNotifications(),
-];</code></pre>
-  <p><strong>问:</strong>为什么不混?</p>
-  <p>
-    <strong>答:</strong>两者 source 不同, 混在一起 LLM 区分不出"这是 定时任务"
-    还是"这是后台 agent", 处理策略也不同。schedule 通知 含 schedule_id,
-    用户可据此取消; async run 通知含 run_id, 用户 可读输出。混了之后 LLM
-    没办法精确响应。
-  </p>
-</div>
-
-<h2 id="interfaces">接口形状: 在写实现前钉死</h2>
-<pre class="code-block"><code>// 教学简化版, 仓库真实实现见 GitHub 永久链接
-export interface Schedule {
-  id: string;                   // harness 生成的 UUID
-  cron: string;                 // 标准 cron 表达式
-  command: string;              // 调用的工具名 (例如 "run_bash")
-  args: Record&lt;string, unknown&gt;;  // 工具参数
-  title: string;                // 给用户看的标题
-  enabled: boolean;
-  timezone: string;             // IANA timezone, 例如 "Asia/Shanghai"
-  createdAt: number;
-  lastFiredAt?: number;         // 上次触发时间
-  nextFireAt?: number;          // 下次预期触发时间
-}
-
-export interface ScheduleStore {
-  get(id: string): Promise&lt;Schedule | null&gt;;
-  list(filter?: { enabled?: boolean }): Promise&lt;Schedule[]&gt;;
-  create(spec: Omit&lt;Schedule, "id" | "createdAt" | "lastFiredAt" | "nextFireAt"&gt;): Promise&lt;Schedule&gt;;
-  update(id: string, patch: Partial&lt;Schedule&gt;): Promise&lt;Schedule&gt;;
-  delete(id: string): Promise&lt;boolean&gt;;
-}
-
-export interface ScheduleNotification {
-  scheduleId: string;
-  occurrenceId: string;         // 每次触发一个唯一 id
-  type: "fired" | "missed" | "orphan_recovered";
-  message: string;
-  asyncRunId?: string;          // 派发到的 async run id
-  outputId?: string;            // 调度运行的输出 (可选)
-}
-
-export interface ScheduleManager {
-  // 添加 / 启用 / 禁用 / 删除
-  create(spec: Omit&lt;Schedule, "id" | "createdAt"&gt;): Promise&lt;Schedule&gt;;
-  enable(id: string): Promise&lt;void&gt;;
-  disable(id: string): Promise&lt;void&gt;;
-  delete(id: string): Promise&lt;void&gt;;
-  // 每分钟调用一次, 内部检查 cron, 命中时派发到 async run
-  tick(): Promise&lt;void&gt;;
-  // 取走未读通知
-  drainNotifications(): ScheduleNotification[];
-}</code></pre>
-
-<h2 id="cron">cron 解析</h2>
+<nav aria-label="页内小节" class="article__meta" id="article-inline-toc"></nav>
+<hr class="rule"/>
+<h2 id="real-failure">真实失败故事: cron 跑了 3 次, 团队 leader 才发现</h2>
+<p>
+  写代码之前, 先看一个真实痛点。 用户的 team 想"每天 9 点跑一次
+  全项目 e2e 测试", 写了 bash cron + harness 脚本, 跑了一周。
+</p>
+<ol>
+<li>
+<strong>症状 1: 重复触发</strong>: 网络抖了一下, cron 以为没跑,
+    第二天发现"同一天跑了 2 次", 资源浪费。
+  </li>
+<li>
+<strong>症状 2: 错过补跑</strong>: 周末 9 点团队不上班, harness
+    进程被关, 错过 1 次。 周一上班发现"周六没跑", 手动补跑,
+    但周六的 audit log 缺了。
+  </li>
+<li>
+<strong>症状 3: overlap 失控</strong>: 9 点的测试跑 1 小时, 10 点
+    又触发一次, 2 个测试并发, 互相干扰, 结果不可信。
+  </li>
+<li>
+<strong>真问题</strong>: 缺少<strong>in-process scheduler</strong>:
+    复用 Async Run 作为执行单元, 用 stable occurrence id 防重复,
+    overlap policy 决定"上次没跑完是否跳过", missed 不补跑只
+    记录审计。 不依赖 OS cron, 不依赖 systemd, 不依赖 k8s cronjob。
+  </li>
+</ol>
 <p>
-  本章使用标准 5 段 cron 表达式 (分 / 时 / 日 / 月 / 周), 不支持秒级。
-  解析器只识别 <code>*</code> / <code>数字</code> / <code>逗号列表</code> /
-  <code>斜杠步长</code> 四种语法, 不支持复杂的 <code>L</code> / <code>W</code> /
-  <code>?</code> 扩展。复杂调度需求 (例如"每月最后一个周五") 应当拆成 多个
-  schedule。
+  朴素想法 1: "直接用 OS cron 调 bash 启动 harness?" 错。 OS cron
+  不知道"上次跑完没", 不会 dedupe, 不会 overlap skip, 不会写
+  occurrence 审计。 需要 harness 内部 scheduler 集中管理。
 </p>
 <p>
-  调度循环读当前时间 (用 <code>timezone</code> 字段转时区), 计算 nextFireAt;
-  tick() 时检查 (now &gt;= nextFireAt), 命中则派发到 async run, 写 lastFiredAt,
-  重新计算 nextFireAt。
+  朴素想法 2: "schedule 跑完直接调 run_bash 同步执行?" 错。
+  schedule 应该复用 Async Run, 让 schedule 本身不执行, 只触发;
+  执行交给 Async Run 的并发限制 / 终态幂等 / 输出隔离 (第 13 章
+  全套)。
 </p>
-
-<h2 id="loop-integration">loop 接入: schedule 通知走单独 source</h2>
-<pre class="code-block"><code>// 教学简化版, 仓库真实实现见 GitHub 永久链接
-async function run(query: string) {
-  history.add({ role: "user", content: query });
-
-  for (;;) {
-    // 1. drain async run 通知 (第 13 章)
-    const asyncNotifications = asyncRunManager.drainNotifications();
-    for (const n of asyncNotifications) { /* ... 第 13 章 ... */ }
-
-    // 2. drain schedule 通知 (本章) — 单独 source 标签
-    const scheduleNotifications = scheduleManager.drainNotifications();
-    for (const n of scheduleNotifications) {
-      const lines = ["Schedule updates:"];
-      lines.push(
-        `- schedule: ${n.scheduleId}`,
-        `  occurrence: ${n.occurrenceId}`,
-        `  type: ${n.type}`,
-        `  message: ${n.message}`,
-      );
-      if (n.asyncRunId) lines.push(`  async_run: ${n.asyncRunId}`);
-      if (n.outputId) lines.push(`  full_output: use run_output_read with output_id ${n.outputId}`);
-      const reminder = `&lt;system-reminder source="schedule"&gt;\n${lines.join("\n")}\n&lt;/system-reminder&gt;`;
-      history.add({ role: "user", content: reminder });
-    }
-
-    // 3. 正常 LLM 调用 + 工具执行
-    const messages = history.getMessages();
-    const assistant = await llm.chat(messages);
-    /* ... */
-  }
-}</code></pre>
-
-<h2 id="tick-loop">tick 循环: 每分钟检查</h2>
-<pre class="code-block"><code>// 教学简化版, 仓库真实实现见 GitHub 永久链接
-// Composition Root 启动时开后台 tick 循环
-const tickInterval = setInterval(async () =&gt; {
-  await scheduleManager.tick();
-}, 60_000);
-
-process.on("SIGTERM", () =&gt; {
-  clearInterval(tickInterval);
-});</code></pre>
 <p>
-  tick 循环在 harness 启动时开, 每分钟调一次 scheduleManager.tick()。 tick()
-  内部读所有 enabled schedule, 检查 cron 是否命中, 命中时 派发到 async run, 写
-  lastFiredAt, 重新计算 nextFireAt。进程退出 时清掉 interval, 防止泄漏。
+  正确做法: 加 <code>src/schedules.ts</code> — ScheduleManager
+  是"时间触发器", 内部 5 秒 tick 一次, 扫所有 active schedule,
+  到点就调 <code>asyncRunManager.start(...)</code> 启动 Async Run。
+  Schedule 自己<strong>不执行</strong>, 真实执行交给 Async Run。 看
+  <a href="https://github.com/pingp76/swoopcode/blob/main/src/schedules.ts#L7"><code>src/schedules.ts</code> 第 7-12 行的核心设计注释</a>:
 </p>
-
-<h2 id="trap">反例梯度</h2>
-
-<div class="card card--trap">
-  <div class="card__head">
-    <span class="card__tag">新手错法 · A</span>
-  </div>
-  <div class="card__body">
-    <p><strong>常见错误:</strong>用 setInterval 调度, schedule 不落盘。</p>
-    <p>
-      <strong>为什么错:</strong>进程崩了 schedule 丢, 多进程重复触发,
-      状态不可观测。
-    </p>
-    <p>
-      <strong>正确做法:</strong>schedule 走 store 落盘, tick
-      内部只做检查和派发。
-    </p>
+<pre><code class="language-typescript">// 核心设计:
+// - ScheduleManager.tick(now) 是可测试的核心调度逻辑
+// - Schedule 到点后创建 Async Run, 不直接执行命令或运行 LLM
+// - occurrence 的 stable id 防止重复触发
+// - 启动时检测 missed occurrence, 只记录最近一次, 不补跑</code></pre>
+<h2 id="trigger-not-executor">Schedule 是触发器, 不是执行器</h2>
+<p>
+  <strong>用途</strong>: Schedule 自己<strong>不跑命令 / 不调
+  LLM</strong>, 到点就委托给 Async Run。 两条好处:
+  (a) Schedule 不需要重复实现并发限制 / 终态幂等 / 输出隔离,
+  复用 Async Run; (b) Schedule 出 bug 不影响 Async Run, 反之
+  亦然, 关注点分离。
+</p>
+<p>
+  <strong>真实场景</strong>: 用户设 "每天 9 点跑全项目测试" 的
+  schedule, 9:00 触发, ScheduleManager 创建 occurrence +
+  调 <code>asyncRunManager.start({executor: "command", command: "npm test"})</code>,
+  拿到 run_id, 写 occurrence 记录。 Async Run 跑 5 分钟, 完事后
+  回调 ScheduleManager 更新 occurrence 终态 (completed / failed
+  / timeout)。 ScheduleManager 再写 notification 让 LLM 看到。
+</p>
+<p>
+  <strong>设计思想</strong>: <strong>层间清晰</strong> —
+  Reference 章节 "模式 7 · State Machine 状态机" 的延伸。 Schedule
+  管 active/cancelled/completed 状态, occurrence 管
+  triggered/skipped/missed/completed/failed/timeout/cancelled 状态,
+  Async Run 管 running/completed/failed/timeout/abandoned 状态。
+  三层状态机各管各的, 不互相污染。 看
+  <a href="https://github.com/pingp76/swoopcode/blob/main/src/schedules.ts#L613"><code>src/schedules.ts</code> 第 613-624 行的教学导读</a>:
+</p>
+<pre><code class="language-typescript">// 教学导读:
+// ScheduleManager 是"时间触发器", 不是"执行器"。
+// 它只负责判断某个 schedule 是否到点、创建 occurrence 审计记录、
+// 然后把真正执行交给 AsyncRunManager。
+//
+// 这样设计可以避免两套执行生命周期:
+// - Async Run 管 running/completed/failed/timeout
+// - Schedule 管 active/cancelled/completed 和 occurrence 审计
+//
+// 学生读这段代码时, 可以把它想成一个小型调度循环:
+// scan/reload -&gt; 计算 nextRunAt -&gt; tick 到点 -&gt; 创建 occurrence -&gt; start async run -&gt; finish callback 写回终态</code></pre>
+<div class="figure figure--stack">
+  <div class="figure__title">图 1 · Schedule × Async Run 协作时序</div>
+  <div class="flow-stack">
+    <div class="flow-stack__layer flow-stack__layer--dynamic">
+      <div class="flow-stack__label">1. tick 到点</div>
+      <div class="flow-stack__body">ScheduleManager.tick(now) 发现 schedule #5 到 9:00 了, 调 <code>asyncRunManager.start({trigger: {kind: "schedule", scheduleId, occurrenceId}})</code>。</div>
+    </div>
+    <div class="flow-stack__layer flow-stack__layer--stable">
+      <div class="flow-stack__label">2. Async Run 启动</div>
+      <div class="flow-stack__body">AsyncRunManager 创建 record (status=running), 启动 executor (command 跑 npm test), 拿 run_id 返回给 ScheduleManager。</div>
+    </div>
+    <div class="flow-stack__layer flow-stack__layer--dynamic">
+      <div class="flow-stack__label">3. Schedule 写 occurrence</div>
+      <div class="flow-stack__body">ScheduleManager 创建 occurrence (status=triggered, asyncRunId=&lt;run_id&gt;), 写 store.saveOccurrence(occ), notification 推 "triggered"。</div>
+    </div>
+    <div class="flow-stack__layer flow-stack__layer--stable">
+      <div class="flow-stack__label">4. Async Run 完成回调</div>
+      <div class="flow-stack__body">5 分钟后 npm test 跑完, AsyncRunManager.finishRun(record, "completed", output), 触发 onFinish 回调。</div>
+    </div>
+    <div class="flow-stack__layer flow-stack__layer--good">
+      <div class="flow-stack__label">5. Schedule 收敛 occurrence</div>
+      <div class="flow-stack__body">onAsyncRunFinish(record) 找到 occurrence, status=completed, 写 finishedAt + outputRef, notification 推 "completed"。</div>
+    </div>
   </div>
 </div>
-
-<div class="card card--trap">
-  <div class="card__head">
-    <span class="card__tag">中级错法 · B</span>
-  </div>
-  <div class="card__body">
-    <p>
-      <strong>常见错误:</strong>schedule 通知和 async run 通知混在同一个 source
-      标签。
-    </p>
-    <p>
-      <strong>为什么错:</strong>LLM 区分不出"定时任务" vs "后台 agent",
-      处理策略不同。
-    </p>
-    <p>
-      <strong>正确做法:</strong>schedule 通知走 source="schedule", 单独标签,
-      不与 source="async-run" 混。
-    </p>
+<p>
+  <strong>实现细节</strong>: 调度循环用 <code>setInterval(tick, TICK_INTERVAL_MS = 5000)</code>,
+  5 秒一次, 不需要秒级精度 (schedule 大多是分钟 / 小时 / 天级别)。
+  想秒级精度, 改 TICK_INTERVAL_MS 常量, 不需要改核心逻辑。
+</p>
+<h2 id="recurrence">6 种 recurrence: 5 分钟到 1 年</h2>
+<p>
+  <strong>用途</strong>: LLM / 用户创建 schedule 时指定 recurrence
+  类型, ScheduleManager 据此算 nextRunAt。 6 种枚举覆盖 90%
+  场景: every_seconds (5 分钟级) / hourly / daily / weekly /
+  monthly / yearly。 多了维护成本上升, 少了不够用。
+</p>
+<p>
+  <strong>真实场景</strong>: "每 5 分钟查 CI 状态" 用 every_seconds
+  (5 * 60 = 300 秒); "每天 9 点跑测试" 用 daily (9:00); "每周一
+  10 点扫 TODO" 用 weekly (mon, 10:00); "每月 1 号清理日志" 用
+  monthly (1, 00:00); "每年 1 月 1 号跑年度报告" 用 yearly
+  (1/1, 00:00)。 6 种全覆盖。
+</p>
+<p>
+  <strong>设计思想</strong>: <strong>固定枚举 + 字段类型</strong> —
+  6 种是固定枚举, 不允许动态加第 7 种 (避免代码膨胀); 每种带
+  各自字段 (weekly 必须含 weekday, monthly 必须含 day), 编译
+  期可见, 漏字段直接 TS 报错。 看
+  <a href="https://github.com/pingp76/swoopcode/blob/main/src/schedules.ts#L35"><code>src/schedules.ts</code> 第 35-42 行的 recurrence import</a>:
+</p>
+<pre><code class="language-typescript">import type {
+  ScheduleTiming, RecurrenceRule, Weekday,
+  EverySecondsRule, HourlyRule, DailyRule,
+  WeeklyRule, MonthlyRule, YearlyRule,
+} from "./schedule-store.js";</code></pre>
+<div class="figure figure--stack">
+  <div class="figure__title">图 2 · 6 种 recurrence 的字段差异</div>
+  <div class="flow-stack">
+    <div class="flow-stack__layer flow-stack__layer--dynamic">
+      <div class="flow-stack__label">every_seconds (5 分钟级)</div>
+      <div class="flow-stack__body">字段: <code>{kind: "every_seconds", seconds: 300}</code>。 用于"每 5 分钟查 CI" "每 30 秒检查 build 状态"。</div>
+    </div>
+    <div class="flow-stack__layer flow-stack__layer--dynamic">
+      <div class="flow-stack__label">hourly (整点触发)</div>
+      <div class="flow-stack__body">字段: <code>{kind: "hourly", minute: 0}</code>。 用于"每小时整点扫日志"。</div>
+    </div>
+    <div class="flow-stack__layer flow-stack__layer--stable">
+      <div class="flow-stack__label">daily (每天某时某分)</div>
+      <div class="flow-stack__body">字段: <code>{kind: "daily", hour: 9, minute: 0}</code>。 用于"每天 9 点跑测试"。</div>
+    </div>
+    <div class="flow-stack__layer flow-stack__layer--stable">
+      <div class="flow-stack__label">weekly (每周某天某时)</div>
+      <div class="flow-stack__body">字段: <code>{kind: "weekly", weekday: "mon", hour: 10, minute: 0}</code>。 用于"每周一 10 点扫 TODO"。</div>
+    </div>
+    <div class="flow-stack__layer flow-stack__layer--stable">
+      <div class="flow-stack__label">monthly (每月某日某时)</div>
+      <div class="flow-stack__body">字段: <code>{kind: "monthly", day: 1, hour: 0, minute: 0}</code>。 用于"每月 1 号清理日志"。</div>
+    </div>
+    <div class="flow-stack__layer flow-stack__layer--stable">
+      <div class="flow-stack__label">yearly (每年某月某日)</div>
+      <div class="flow-stack__body">字段: <code>{kind: "yearly", month: 1, day: 1, hour: 0, minute: 0}</code>。 用于"每年 1 月 1 号年度报告"。</div>
+    </div>
   </div>
 </div>
-
-<div class="card card--trap">
-  <div class="card__head">
-    <span class="card__tag">高级错法 · C</span>
-  </div>
-  <div class="card__body">
-    <p>
-      <strong>常见错误:</strong>cron 命中时直接 await exec, 阻塞 tick 循环。
-    </p>
-    <p>
-      <strong>为什么错:</strong>一个长任务会让 tick 循环卡住, 其他 schedule
-      不能按点触发。
-    </p>
-    <p>
-      <strong>正确做法:</strong>tick 内部只派发到 async run, 不 await,
-      立即返回。
-    </p>
+<p>
+  <strong>实现细节</strong>: nextRunAt 计算纯函数, 输入 (rule +
+  now), 输出 nextRunAt。 测试可以用任意 Date 算, 不需要等真实时间。
+  教学版不处理"闰秒" / "夏令时" (教学项目优先简单), 工业版要
+  考虑。
+</p>
+<h2 id="occurrence-id">occurrence stable id: 防重复触发的核心</h2>
+<p>
+  <strong>用途</strong>: 每个 schedule 到点会触发一次, 那次触发
+  是一次 <strong>occurrence</strong>, 必须有 stable id (基于
+  schedule_id + scheduledAt), 不能每次重启重新生成。 这样:
+  (a) 启动时检测"上次该触发但没触发" 的 occurrence, 不重复创建;
+  (b) Async Run 回调能通过 occurrenceId 找到对应 occurrence 写
+  回终态。
+</p>
+<p>
+  <strong>真实场景</strong>: schedule #5 配置 daily 9:00, 周一 9:00
+  该触发 (occurrence #5_2026-06-08T09:00, id stable) 但进程不在线。 harness 周二
+  重启, 启动时检测 "周一 9:00 该触发但没记录", 写一条
+  <code>missed</code> occurrence (id 仍然是 <code>#5_2026-06-08T09:00</code>,
+  不会和新触发的混淆)。 周二 9:00 触发时, occurrence id 是
+  <code>#5_2026-06-09T09:00</code>, 区分清楚。
+</p>
+<p>
+  <strong>设计思想</strong>: <strong>幂等 id</strong> — Reference
+  章节 "模式 19 · Idempotent 幂等" + "模式 16 · Stable Identity
+  稳定身份" 的应用。 看
+  <a href="https://github.com/pingp76/swoopcode/blob/main/src/schedules.ts#L155"><code>src/schedules.ts</code> 第 155 行的 <code>generateOccurrenceId</code></a>:
+</p>
+<pre><code class="language-typescript">function generateOccurrenceId(scheduleId: string, scheduledAt: Date): string {
+  const y = String(scheduledAt.getFullYear());
+  const m = String(scheduledAt.getMonth() + 1).padStart(2, "0");
+  const d = String(scheduledAt.getDate()).padStart(2, "0");
+  const H = String(scheduledAt.getHours()).padStart(2, "0");
+  const M = String(scheduledAt.getMinutes()).padStart(2, "0");
+  // ... 拼成 sch_&lt;id&gt;_&lt;date&gt;_&lt;time&gt; 形式
+}</code></pre>
+<p>
+  <strong>实现细节</strong>: occurrence id = <code>occ_&lt;scheduleId&gt;_&lt;ISO 时间&gt;</code>,
+  时间精确到分钟 (daily 精确到 hour:minute, every_seconds 精确到
+  second)。 这样不同次触发生成不同 id, 同一 schedule + 同一时间
+  永远同一 id, 启动时检测"已存在则跳过" / "已存在则标记 missed"。
+</p>
+<h2 id="missed-no-backfill">missed occurrence: 记录不补跑</h2>
+<p>
+  <strong>用途</strong>: harness 进程在 schedule 触发时间点不在线
+  (周末 / 维护 / 崩溃), 错过触发, 启动时检测到 "上次该跑没跑",
+  写一条 <code>missed</code> occurrence, 但<strong>不补跑</strong>
+  (只记录审计, 不追溯执行)。 这是教学版的<strong>故意简化</strong>,
+  工业版可以配置 "missed 时是否补跑"。
+</p>
+<p>
+  <strong>真实场景</strong>: 周末 harness 进程不在线, 错过周六 9:00
+  的 daily schedule。 周一上班, 启动 harness, tick 检测到
+  "周六 9:00 没记录", 写一条 missed occurrence (audit: "missed
+  at 2026-06-06T09:00, recovered at 2026-06-08T09:00")。 周一 9:00
+  该跑的还是按周一 9:00 跑, 不会"补周六 9:00 一次 + 周一 9:00
+  一次"。
+</p>
+<p>
+  <strong>设计思想</strong>: <strong>故意简化</strong> — Reference
+  章节 "模式 10 · Atomic Write 原子写" 的延伸, 注释明确说
+  "当前教学实现不回补 missed run, 只记录审计并推进到未来下一次"。 看
+  <a href="https://github.com/pingp76/swoopcode/blob/main/src/schedules.ts#L774"><code>src/schedules.ts</code> 第 774 行的注释</a>:
+</p>
+<pre><code class="language-typescript">// 当前教学实现不回补 missed run, 只记录审计并推进到未来下一次。
+// 这样可以避免"周末补跑 7 个 daily 跑 7 个命令" 的资源滥用。
+// 工业级 Schedule 可能会让用户配置 missed_policy = "skip" | "backfill" | "skip_silently"。</code></pre>
+<div class="figure figure--stack">
+  <div class="figure__title">图 3 · 启动时 missed 检测流程</div>
+  <div class="flow-stack">
+    <div class="flow-stack__layer flow-stack__layer--stable">
+      <div class="flow-stack__label">1. 启动 reload</div>
+      <div class="flow-stack__body">harness 启动, ScheduleManager.reloadActiveSchedules() 扫磁盘读所有 active schedule.json, 计算每个的 lastScheduledAt。</div>
+    </div>
+    <div class="flow-stack__layer flow-stack__layer--dynamic">
+      <div class="flow-stack__label">2. 检测 missed</div>
+      <div class="flow-stack__body">如果 <code>lastScheduledAt &lt; nextRunAt &lt; now</code> (有该跑没跑的时间点), 写 missed occurrence (status=missed, missedAt=now)。</div>
+    </div>
+    <div class="flow-stack__layer flow-stack__layer--bad">
+      <div class="flow-stack__label">3. 不补跑</div>
+      <div class="flow-stack__body">只记录审计, 不启动 Async Run 补执行。 nextRunAt 推到未来下一次 (今天 9:00 / 明天 9:00)。</div>
+    </div>
+    <div class="flow-stack__layer flow-stack__layer--good">
+      <div class="flow-stack__label">4. 未来照常</div>
+      <div class="flow-stack__body">之后的 tick 按 nextRunAt 正常触发, 不受 missed 影响。 用户的"周一 9:00 测试" 周一还是按周一 9:00 跑。</div>
+    </div>
   </div>
 </div>
-
-<div class="card card--trap">
-  <div class="card__head">
-    <span class="card__tag">边界错法 · D</span>
+<p>
+  <strong>实现细节</strong>: missed 检测在 <code>tick(now)</code>
+  第一次调用时跑, 之后只算 "nextRunAt &lt;= now" 的触发, 不再检测
+  missed。 简化逻辑, 避免每个 tick 都重扫历史。
+</p>
+<h2 id="overlap-policy">overlap policy: allow vs skip</h2>
+<p>
+  <strong>用途</strong>: schedule 到点时, 上一次 occurrence 还没
+  跑完 (Async Run 仍在 running), 怎么办? 两种策略:
+  <code>overlapPolicy: "allow"</code> (允许并发, 启动新的 Async Run) 或
+  <code>overlapPolicy: "skip"</code> (跳过, 写一条 skipped_overlap occurrence)。
+</p>
+<p>
+  <strong>真实场景</strong>: 用户设 "每 5 分钟跑 CI 检查" 的
+  schedule, CI 跑了 7 分钟还没完, 下一个 5 分钟到点:
+  <code>allow</code> 策略会启动新的 CI 检查 (总共 2 个 CI 并发,
+  互相干扰); <code>skip</code> 策略会跳过这次触发, 写
+  skipped_overlap, 5 分钟后再尝试。 大多数 schedule 用 skip 更安全。
+</p>
+<p>
+  <strong>设计思想</strong>: <strong>用户可配 + 默认安全</strong> —
+  字段在 schedule.execution 里, 用户创建 schedule 时指定; 默认
+  skip (教学项目优先安全, 并发容易出问题)。 看
+  <a href="https://github.com/pingp76/swoopcode/blob/main/src/schedules.ts#L645"><code>src/schedules.ts</code> 第 645 行的 <code>runningOccurrences</code></a>:
+</p>
+<pre><code class="language-typescript">// 跟踪正在 running 的 occurrence, 用于 overlap 检测
+const runningOccurrences = new Map&lt;string, Set&lt;string&gt;&gt;(); // scheduleId -&gt; Set&lt;occurrenceId&gt;</code></pre>
+<p>
+  <strong>实现细节</strong>: overlap 检测在 tick 到点时跑:
+  (a) 查 <code>runningOccurrences.get(scheduleId)</code> 是否非空;
+  (b) 非空 + policy=skip → 写 skipped_overlap occurrence, 不启动
+  Async Run; (c) 非空 + policy=allow → 直接启动新 Async Run (Async
+  Run 自己有 3 并发限制, 可能 reject)。 runningOccurrences 在
+  Async Run finish 回调里 remove (onAsyncRunFinish 内)。
+</p>
+<h2 id="linked-task">linked task: 触发后更新 task 状态</h2>
+<p>
+  <strong>用途</strong>: schedule 可以<strong>链接</strong>到一个
+  task (groupId + taskId), occurrence 触发后根据结果更新 task
+  状态: <code>linkedTaskUpdate: "never"</code> (不更新) /
+  <code>"append_note"</code> (追加 note) /
+  <code>"mark_failed_on_failure"</code> (failed 时把 task 标 failed)。
+</p>
+<p>
+  <strong>真实场景</strong>: 用户设 "每天 9 点跑 nightly test"
+  schedule, 链接到 migration-001 group 的 task_nightly_test。
+  test 通过 → 不改 task 状态; test 失败 → task 标 failed +
+  note 写 "auto-failed by schedule at 2026-06-12T09:05:23, reason:
+  5 tests failed"。 LLM 下次开 session 看到 task failed, 知道
+  "夜跑挂了, 去看下"。
+</p>
+<p>
+  <strong>设计思想</strong>: <strong>桥接 schedule ↔ task</strong> —
+  Reference 章节 "模式 6 · Pipeline 管道" 的应用。 schedule 不
+  知道 task 内部结构, 只通过 linkedTask 字段 + 3 种 update
+  policy 触发 task 状态变化。 task 状态变化由 TaskManager
+  (第 12 章) 处理, 保持职责分离。
+</p>
+<p>
+  <strong>实现细节</strong>: linked task 写在
+  <code>schedule.linkedTask = {groupId, taskId}</code>, 触发后
+  onAsyncRunFinish 回调里根据 result 调
+  <code>taskManager.updateTask(groupId, taskId, patch)</code>。
+  3 种 policy 对应不同的 patch 构造。
+</p>
+<h2 id="output-policy">output policy: save / notify / summary</h2>
+<p>
+  <strong>用途</strong>: occurrence 完成后, 输出的处理策略:
+  <code>saveRawOutput: true</code> (写到 OutputStore LRU) /
+  <code>notifyLlm: true</code> (推 notification) /
+  <code>summaryPrompt?: string</code> (用 prompt 让 LLM 生成摘要
+  塞进 notification)。
+</p>
+<p>
+  <strong>真实场景</strong>: nightly test schedule 配置
+  <code>saveRawOutput: true, notifyLlm: true, summaryPrompt: "总结测试结果, 失败的列前 3"</code>。
+  test 跑完后, 原始输出 5MB 写 OutputStore, LLM 调 summaryPrompt
+  生成 "5/100 失败, Top 3: [list]", notification 推给 LLM。
+  LLM 看到摘要, 想看全文调 <code>readOutput(runId)</code>。
+</p>
+<p>
+  <strong>设计思想</strong>: <strong>短摘要 + 长全文分离</strong> —
+  Reference 章节 "模式 13 · Error→Action 错误转动作" 的延伸。
+  notification 短, output 长, LLM 按需读, history 不撑爆。
+</p>
+<p>
+  <strong>实现细节</strong>: summaryPrompt 是可选, 有就调 LLM
+  生成摘要, 没有就用 Async Run 的 preview 字段。 LLM 调生成
+  摘要本身是一个小 Async Run (嵌套), 教学版可以简化为 "用
+  preview 字段当摘要"。
+</p>
+<h2 id="permission-profile">permission profile: readonly / ci / workspace_write</h2>
+<p>
+  <strong>用途</strong>: schedule 触发的 Async Run 可能有写操作
+  (如"每天 9 点跑测试 + 修复") 或纯只读 ("每天 9 点扫日志")。
+  3 种 permission profile: readonly (只读命令) / ci (允许 build /
+  test / install) / workspace_write (允许 workspace 内写)。
+</p>
+<p>
+  <strong>真实场景</strong>: nightly test schedule 用 ci profile
+  (允许 npm install + npm test); "每天清理 7 天前的日志" schedule
+  用 workspace_write (允许 rm 工作区文件); "每天扫 TODO" 用
+  readonly (只 grep 不改)。 3 种 profile 表达"这个 schedule 的写
+  权限边界"。
+</p>
+<p>
+  <strong>设计思想</strong>: <strong>3 种 profile 表达权限边界</strong> —
+  Reference 章节 "模式 3 · 窄接口 + Adapter" 的应用。 permission
+  profile 透传给 Async Run 的 executionPolicy, 让 Async Run 自己
+  校验。 Schedule 不实现权限, 只声明。
+</p>
+<p>
+  <strong>实现细节</strong>: profile 字段在
+  <code>schedule.execution.permissionProfile</code>, 透传到
+  <code>asyncRunManager.start({..., permissionProfile: "ci"})</code>。
+  Async Run 内部用 executionPolicy 校验每个 command 是否在
+  profile 允许范围内, 不在就 reject。
+</p>
+<h2 id="start-stop">start / stop: 进程内 timer 生命周期</h2>
+<p>
+  <strong>用途</strong>: ScheduleManager 自己持有 <code>setInterval</code>
+  timer, <code>start()</code> 启动 timer, <code>stop()</code> 清除
+  timer + 把 running occurrence 标 abandoned。 进程退出时
+  <code>stop()</code> 自动调用 (或者外部 hook), 不留 zombie。
+</p>
+<p>
+  <strong>真实场景</strong>: harness 启动, scheduleManager.start()
+  启动 5 秒 tick 循环, 开始调度。 user ctrl-c 退出, harness 收到
+  SIGINT, 调 scheduleManager.stop(), timer 清除 + running occurrence
+  收敛 abandoned, 干净退出。
+</p>
+<p>
+  <strong>设计思想</strong>: <strong>进程内 timer, 不持久化</strong> —
+  Reference 章节 "模式 19 · Idempotent 幂等" 的延伸。 timer
+  是<strong>进程级</strong>资源, 进程退出 timer 自然消失; schedule
+  数据 (ScheduleFile) 持久化, 进程重启 reload 即可恢复。 教学版
+  简单, 工业版可以用 systemd timer / k8s cronjob 替代。
+</p>
+<p>
+  <strong>实现细节</strong>: timer 在
+  <a href="https://github.com/pingp76/swoopcode/blob/main/src/schedules.ts#L643"><code>src/schedules.ts</code> 第 643 行</a>
+  持有, start() 用 setInterval 5 秒一次, stop() clearInterval +
+  清理 runningOccurrences。 start / stop 幂等, 多次调用安全。
+</p>
+<h2 id="loop-integration">主循环集成: start 启动 timer, 进程退出 stop</h2>
+<p>
+  <strong>用途</strong>: ScheduleManager 在 Composition Root 创建,
+  注入 agent + 注册工具 (schedule_create / read / list /
+  cancel / list_occurrences)。 主循环集成点 2 个:
+</p>
+<ol>
+<li>
+<strong>index.ts 启动时</strong>: <code>scheduleManager.start()</code>
+    启动 timer, 之后 schedule 开始被调度。
+  </li>
+<li>
+<strong>进程退出 hook</strong>: SIGINT / SIGTERM handler 调
+    <code>scheduleManager.stop()</code> + <code>asyncRunManager.shutdown()</code>,
+    干净退出。
+  </li>
+</ol>
+<p>
+  <strong>设计思想</strong>: <strong>显式生命周期</strong> —
+  ScheduleManager 不自动 start, 必须显式调, 避免 Composition
+  Root 创建时立即触发未预期的 tick。
+</p>
+<p>
+  <strong>实现细节</strong>: <code>scheduleManager.start()</code>
+  是幂等的, 重复调用不创建多个 timer。 <code>stop()</code> 也幂等,
+  重复调用不抛错。 这是 Reference 章节 "模式 1 · 工厂 + 闭包"
+  的应用 — 状态在闭包内, 外部不能误改 timer handle。
+</p>
+<h2 id="fake-test">fake test: 用 fake clock 验证 missed + overlap</h2>
+<p>
+  <strong>用途</strong>: ScheduleManager 测试用<strong>fake clock</strong>
+  + <strong>fake asyncRunManager</strong> — 不等真实时间, 不真
+  跑命令, 完全可控。
+</p>
+<p>
+  <strong>真实场景</strong>: 用户写测试覆盖 4 条不变量:
+  (a) 创建 daily 9:00 schedule, fake clock 推到 9:00, 触发 Async
+  Run; (b) 创建 daily 9:00, fake clock 推到周一 8:00 → 周二 10:00
+  (跳过 9:00 触发点), 重启时检测 missed occurrence; (c) 创建
+  overlap=skip, 上次 occurrence 还在 running, 触发点写到
+  skipped_overlap; (d) overlap=allow, 启动新的 Async Run。
+</p>
+<pre><code class="language-typescript">test("missed 不补跑", () =&gt; {
+  let now = new Date("2026-06-08T08:00:00Z");
+  const mgr = createScheduleManager({
+    store: ...,
+    asyncRunManager: fakeAsyncRun,
+    projectRoot: "/test",
+    now: () =&gt; now,
+  });
+  mgr.create({
+    title: "daily 9am test",
+    intent: { prompt: "test" },
+    timing: { kind: "daily", hour: 9, minute: 0 },
+    execution: { ..., overlapPolicy: "allow", ... },
+  });
+  mgr.start();
+  // 推进到 6 月 10 日 10:00 (期间错过的 9:00 触发点都已过期)
+  now = new Date("2026-06-10T10:00:00Z");
+  mgr.tick(now);
+  const occs = mgr.listOccurrences({ scheduleId: ..., limit: 10 });
+  // 应该只有 1 条 missed 审计记录, 没有 triggered (不补跑)
+  const missed = occs.filter((o) =&gt; o.status === "missed");
+  const triggered = occs.filter((o) =&gt; o.status === "triggered");
+  expect(missed).toHaveLength(1);
+  expect(triggered).toHaveLength(0);
+  // 验证: fakeAsyncRun.start 调 0 次, nextRunAt 推进到未来下一次 (6/11 9:00)
+  expect(fakeAsyncRun.start).toHaveBeenCalledTimes(0);
+});
+
+test("overlap=skip: 上次还在 running 时跳过", () =&gt; {
+  // ... 创建一个 occurrence, 标记为 running (不进 finishRun)
+  // 推进到下次触发点
+  // 验证: 新 occurrence 状态 = skipped_overlap, fakeAsyncRun.start 没被调
+});
+
+test("overlap=allow: 上次还在 running 时启动新的", () =&gt; {
+  // ... 同上, 但 overlapPolicy=allow
+  // 验证: fakeAsyncRun.start 被调了 2 次
+});</code></pre>
+<p>
+  <strong>实现细节</strong>: fake clock 通过 deps.now 注入,
+  ScheduleManager 完全用 deps.now() 拿时间, 不调
+  <code>new Date()</code>。 fake asyncRunManager 通过 deps.asyncRunManager
+  注入, 只暴露 <code>start / setOnFinish</code> 两个方法 (用
+  <code>Pick&lt;AsyncRunManager, "start" | "setOnFinish"&gt;</code>
+  类型限制)。
+</p>
+<h2 id="common-confusion">常见误解: Schedule 不是 cron 替代品</h2>
+<p>
+  <strong>误解 1: "Schedule = harness 内部的 cron?"</strong>
+  半对。 Schedule 确实像 cron (time-based trigger), 但<strong>不</strong>
+  替代 OS cron: (a) timer 只活在进程内, 进程退出后不再触发
+  (持久化的只有 schedule 数据), 重启后靠 missed detection 记录错过的
+  触发点; (b) 触发精度受 5 秒 tick 限制, 做不到秒级准点;
+  (c) Schedule 不支持 system reboot 触发。 想要这些特性,
+  用 OS cron / k8s cronjob 调 harness CLI。
+</p>
+<p>
+  <strong>误解 2: "missed occurrence 自动补跑?"</strong> 错。
+  教学版不补跑, 只记录审计。 工业级 Schedule 可能有
+  <code>missedPolicy: "skip" | "backfill"</code> 配置, 教学版
+  固定 skip。
+</p>
+<p>
+  <strong>误解 3: "overlap=allow 永远安全?"</strong> 不一定。
+  allow 启动新的 Async Run, 但 Async Run 自己有 3 并发限制 (第
+  13 章), 可能 reject "Maximum concurrent async runs (3) reached"。
+  overlap=allow 适合"独立任务" (日志清理 + 报告生成), 不适合
+  "同一资源" (CI 检查 + build)。
+</p>
+<p>
+  <strong>误解 4: "schedule 跑完直接改 task 状态?"</strong>
+  错。 通过 <code>linkedTaskUpdate: "never" | "append_note" | "mark_failed_on_failure"</code>
+  3 种 policy 间接改, 不是任意改。 "任意改 task 状态" 仍
+  走 TaskManager (第 12 章), 保持职责分离。
+</p>
+<h2 id="trap">反例梯度: 4 个常见错误</h2>
+<div class="cards-grid">
+  <div class="card card--bad">
+    <div class="card__head"><span class="card__tag">反例 1 · 不用 Async Run, 自己跑</span></div>
+    <div class="card__body">
+      <p>Schedule 到点直接 <code>await run_bash(command)</code>,
+        不走 Async Run。 错。 Schedule 自己跑没有并发限制, 没有
+        终态幂等, 没有输出隔离。 复跑 / 重叠触发 / 进程退出 mid-write
+        全部出问题。 正确: Schedule 只负责触发, 真实执行委托给
+        Async Run, 复用其全部基础设施 (3 并发 + finishRun 幂等
+        + OutputStore LRU)。</p>
+    </div>
   </div>
-  <div class="card__body">
-    <p>
-      <strong>常见错误:</strong>不区分时区, 用户的"每天 8 点" 用了本地系统时区。
-    </p>
-    <p>
-      <strong>为什么错:</strong>harness 部署在服务器, 服务器时区与用户时区不同,
-      触发时间错位。
-    </p>
-    <p>
-      <strong>正确做法:</strong>schedule 显式带 timezone 字段, tick
-      时按用户时区解析 cron。
-    </p>
+  <div class="card card--bad">
+    <div class="card__head"><span class="card__tag">反例 2 · missed 自动补跑</span></div>
+    <div class="card__body">
+      <p>启动时检测到 3 个 missed occurrence, 立即 3 个并发启动
+        Async Run 补跑。 错。 停机 7 天 × 1 daily = 7 个 missed,
+        启动 7 个并发, 立刻触发 Async Run 3 并发限制, 资源滥用。
+        正确: 只记录 missed audit, 不补跑, 推进到未来下一次。</p>
+    </div>
   </div>
-</div>
-
-<h2 id="validate">如何验证 (本章 Validation 卡片)</h2>
-<div class="card card--validation">
-  <div class="card__head">
-    <span class="card__tag">Validation · 第 14 章</span>
+  <div class="card card--bad">
+    <div class="card__head"><span class="card__tag">反例 3 · occurrence id 随机生成</span></div>
+    <div class="card__body">
+      <p>occurrence id 用 <code>Math.random()</code> 生成, 每次
+        tick 都重新计算。 错。 启动时检测 missed 无法判断"这是不是
+        已存在 occurrence", 重复创建 + 重复触发。 正确: id 用
+        <code>scheduleId + scheduledAt</code> 拼, stable + idempotent,
+        同一时间点永远同一 id。</p>
+    </div>
   </div>
-  <div class="card__body">
-    <p>
-      <strong>tick 派发到 async run:</strong>fake schedule 预设 cron "* * * *
-      *", 时间快进 1 分钟, 跑完 tick, spy 验证 asyncRunManager.start 被调用 1
-      次。
-    </p>
-    <p>
-      <strong>schedule 通知单独 source:</strong>fake schedule 触发, 跑完后 LLM
-      下一轮 messages 含 &lt;system-reminder source="schedule"&gt; 标签, 不含
-      source="async-run"。
-    </p>
-    <p>
-      <strong>disabled schedule 不触发:</strong>fake schedule enabled=false,
-      跑完 tick, spy 验证 asyncRunManager.start 没被调用。
-    </p>
-    <p>
-      <strong>timezone 字段被解析:</strong>fake schedule
-      timezone="Asia/Shanghai", cron="0 8 * * *", 跑完 tick, lastFiredAt 是 8 点
-      (UTC+8 解析后)。
-    </p>
-    <p>
-      <strong>schedule 落盘走 atomic write:</strong>create() 后 spy 验证 走
-      write tmp + fsync + rename 路径, 不直接改原文件。
-    </p>
+  <div class="card card--bad">
+    <div class="card__head"><span class="card__tag">反例 4 · overlap=allow 默认值</span></div>
+    <div class="card__body">
+      <p>默认 overlap=allow, LLM / user 创建 schedule 不指定就
+        allow。 错。 大多数 schedule (CI / 测试 / 扫描) 应该 skip,
+        allow 容易出并发问题。 正确: 默认 skip, 用户明确想并发
+        才显式 allow, 符合"安全默认值" 原则。</p>
+    </div>
   </div>
 </div>
-
-<h2 id="lookback">回望第 00–13 章: 哪些原则在本章兑现了</h2>
-<ul>
-  <li>
-    <strong>async run 复用:</strong>schedule 命中时派发到 async run,
-    不重新发明后台执行机制。
-  </li>
-  <li>
-    <strong>reminder 标签 source 区分:</strong>schedule 通知和 async run
-    通知走不同 source, LLM 可精确响应。
-  </li>
-  <li>
-    <strong>原子写落盘:</strong>schedule 走第 15 章 atomic write, 跨进程保留。
-  </li>
-  <li>
-    <strong>工厂模式:</strong>scheduleManager / scheduleStore 都是工厂,
-    Composition Root 唯一 new。
-  </li>
-</ul>
-
-<h2 id="forward">前瞻张力: 留给后续章节</h2>
-<dl class="defs">
-  <dt>schedule 触发时的 permission</dt>
-  <dd>定时任务仍然过 permission, 危险命令 deny, 不在后台偷偷跑。</dd>
-  <dt>schedule 输出存档</dt>
-  <dd>第 15 章 transcript 记录 schedule 每次触发的命令 / 时间 / 结果。</dd>
-  <dt>schedule 与 task 协作</dt>
-  <dd>
-    schedule 命中的命令可以读 task 状态, 自动激活下一个 task (例如"每天 8
-    点检查测试 task 是否完成")。
-  </dd>
-  <dt>schedule 跨多 harness 实例</dt>
-  <dd>
-    多进程同时跑时, schedule 触发需要文件锁防重复, 本章用 atomic write +
-    timestamp 简单互斥, 第 15 章会展开文件锁细节。
-  </dd>
-</dl>
-
-<h2 id="vibe-coding-14">本次如何 vibe code: 第 14 章的三件套</h2>
-
-<h3 id="vibe-feed-14">拆卡: 4 轮迭代的具体产物</h3>
+<h2 id="validate">Validation: 4 条不变量检验</h2>
 <ol>
-  <li>
-    <strong>第 1 轮 · 接口</strong>。让 LLM 给出 <code>Schedule</code> /
-    <code>ScheduleStore</code> / <code>ScheduleManager</code> /
-    <code>ScheduleNotification</code> 四个 interface, 以及 cron
-    解析规则文档。本轮不写实现, 重点钉"schedule 命中时复用 async run" 和"通知
-    source 区分"。
+<li>
+<strong>occurrence id 稳定</strong>: 同一 schedule + 同一时间点
+    调 generateOccurrenceId 2 次, 返回值字节相等。 验证: 单测
+    覆盖 daily / weekly / monthly, 同一时间点 2 次调用相等。
   </li>
-  <li>
-    <strong>第 2 轮 · 接线</strong>。让 LLM 给出 <code>index.ts</code> 接线,
-    <code>createScheduleManager()</code> 是 stub (tick 永远不命中), agent.run
-    不接入 drain。本轮 review 重点: scheduleManager 实例在
-    <code>index.ts</code> 只 new 一次, 复用同一个 asyncRunManager。
+<li>
+<strong>missed 不补跑</strong>: fake clock 跳到"该跑没跑" 的
+    时间点之后, tick 跑一次, 验证: missed occurrence 写一条,
+    fake asyncRunManager.start 调 0 次 (不补跑)。
   </li>
-  <li>
-    <strong>第 3 轮 · 边界</strong>。让 LLM 写 createScheduleManager + schedule
-    工具 + agent.run 接入 drain + tick 循环 + 原子写。本轮 review 重点: 通知单独
-    source, tick 不 await, timezone 解析。
+<li>
+<strong>overlap=skip 行为</strong>: 上次 occurrence 还在 running,
+    触发点到, 验证: 写 skipped_overlap occurrence, fake
+    asyncRunManager.start 调 0 次。
   </li>
-  <li>
-    <strong>第 4 轮 · 验证</strong>。让 LLM 写
-    <code>test/schedules.test.ts</code>。本轮 review 重点: "tick 派发到 async
-    run" 和 "schedule 通知单独 source" 两条必须有 spy 验证。
+<li>
+<strong>overlap=allow 行为</strong>: 上次 occurrence 还在 running,
+    触发点到, overlap=allow, 验证: fake asyncRunManager.start
+    在触发点再调 1 次 (启动新 run, 假设 fakeAsyncRun 不 reject)。
   </li>
 </ol>
-
-<h3 id="vibe-review-14">Review: 第 14 章专属 checklist</h3>
-<ol>
-  <li>
-    <strong>schedule 落盘走 atomic write。</strong>验证: spy 验证 create/update
-    走 write tmp + fsync + rename 路径。
-  </li>
-  <li>
-    <strong>通知 source="schedule" 与 source="async-run" 区分。</strong>验证:
-    <code>grep -n 'source="schedule"' src/agent.ts</code> ≥ 1 行, 与
-    source="async-run" 不在同一段。
-  </li>
-  <li>
-    <strong>tick 不 await。</strong>验证: tick() 内部
-    <code>asyncRunManager.start(spec)</code> 之前没有 await, fire-and-forget。
-  </li>
-  <li>
-    <strong>timezone 字段被解析。</strong>验证: cron 解析函数接受 timezone 参数,
-    不读 process.env.TZ。
-  </li>
-  <li>
-    <strong>scheduleManager 工厂化。</strong>验证:
-    <code>grep -n 'new ScheduleManager' src/</code> 应当 0 行。
-  </li>
-</ol>
-
-<h3 id="vibe-debug-14">调试: 第 14 章典型伪装</h3>
-<ol>
-  <li>
-    <strong>伪装 A · 通知 source 写错, 用了 async-run。</strong>症状: schedule
-    通知走 &lt;system-reminder source="async-run"&gt; 标签。验证: Validation
-    卡片"schedule 通知单独 source" 那条测试通过 (messages 含
-    source="schedule")。
+<h2 id="lookback">回望: 哪些原则在本章兑现了</h2>
+<ul>
+<li>
+<strong>触发器, 不是执行器</strong>: Schedule 自己不跑, 委托
+    Async Run, 复用并发限制 / 终态幂等 / 输出隔离。
   </li>
-  <li>
-    <strong>伪装 B · tick 内部 await, 阻塞循环。</strong>症状: tick 内部
-    <code>await asyncRunManager.start(spec)</code>。验证: Validation 卡片"tick
-    派发到 async run" 那条, spy 验证 start 被调用, 但 tick 立即返回。
+<li>
+<strong>层间状态机</strong>: Schedule 管 active/cancelled,
+    occurrence 管 triggered/skipped/missed, Async Run 管
+    running/completed, 三层各管各的。
   </li>
-  <li>
-    <strong>伪装 C · 不读 timezone, 用系统时区。</strong>症状: cron 解析时直接用
-    new Date() (系统本地时区), 忽略 schedule.timezone。验证: Validation
-    卡片"timezone 字段被解析" 那条测试通过 (lastFiredAt 是用户时区的 8 点,
-    不是系统时区的 8 点)。
+<li>
+<strong>稳定 id</strong>: occurrence id = scheduleId + scheduledAt,
+    幂等, 启动时检测 missed 不重复创建。
   </li>
-</ol>
-
-<h3 id="vibe-iterate-14">迭代: 第 14 章 4 个 commit 节点</h3>
-<ol>
-  <li>
-    <code
-      >feat(ch14): 钉 Schedule / ScheduleStore / ScheduleManager /
-      ScheduleNotification 接口与 cron 规则</code
-    >
-    —— tsc 通过, 无实现。
+<li>
+<strong>missed 不补跑</strong>: 教学版故意简化, 只记录审计, 避免
+    资源滥用。
   </li>
-  <li>
-    <code>feat(ch14): createScheduleManager 工厂 + schedule 工具 stub</code> ——
-    tsc 通过, tick 永远 noop。
+<li>
+<strong>overlap 默认 skip</strong>: 安全默认, 用户明确 allow 才
+    启动新 run。
   </li>
-  <li>
-    <code
-      >feat(ch14): tick 派发到 async run + 通知单独 source + timezone 解析 +
-      atomic write</code
-    >
-    —— 跑通 Validation 卡片前 4 条。
-  </li>
-  <li>
-    <code>test(ch14): disabled schedule 不触发 + 落盘 spy 验证</code> —— 全绿。
-  </li>
-</ol>
-
+</ul>
+<h2 id="forward">前瞻张力: 留给后续章节</h2>
+<dl class="defs">
+<dt>missed_policy 配置</dt>
+<dd>
+    教学版 fixed skip, 工业版可让用户配
+    <code>missedPolicy: "skip" | "backfill"</code>。 留 P2 阶段。
+  </dd>
+<dt>跨进程 schedule 触发</dt>
+<dd>
+    当前 Schedule 是进程内 timer, 进程退出 schedule 不触发。
+    想要 "harness 进程退出, schedule 仍触发" 需要外部
+    scheduler (systemd timer / k8s cronjob), 调 harness CLI
+    启动新的 harness 进程。
+  </dd>
+<dt>Schedule 持久化运行 (类似 k8s cronjob)</dt>
+<dd>
+    当前 schedule 数据持久化, 运行不持久化 (进程退出 timer
+    丢)。 工业级可以加 "schedule worker 进程" 持续运行,
+    harness 主进程只是其中之一, 多个进程协调触发 (lock)。
+    留 P2 阶段。
+  </dd>
+</dl>
 <h2 id="prompt-card">Prompt Card (本章任务)</h2>
 <div class="card card--prompt">
   <div class="card__head">
     <span class="card__tag">Prompt Card · 第 14 章</span>
-    <button class="card__copy" type="button" data-copy-card>复制</button>
+    <button class="card__copy" data-copy-card="" type="button">复制</button>
   </div>
   <div class="card__body">
-    <p>
-      <strong>目标:</strong>实现 Schedule 模块, 让 harness 按 cron 自动触发任务,
-      内部复用 async run 派发, 通知走单独 source 标签。
-    </p>
-    <p>
-      <strong>场景:</strong>用户创建 schedule "每天 8 点跑 npm test", harness
-      启动时加载, 每天 8 点 tick 命中, 派发到 async run, 完成时通知走
-      source="schedule" reminder 注入主 loop。
-    </p>
-    <p>
-      <strong>模块:</strong> <code>src/schedules.ts</code> (新) 暴露
-      <code>createScheduleManager()</code>;
-      <code>src/schedule-store.ts</code> (新) 实现存储后端;
-      <code>src/tools/schedule.ts</code> (新) 实现 create / list / enable /
-      disable / delete; <code>src/agent.ts</code> 每轮 LLM 调用前 drain schedule
-      通知 (单独 source); <code>src/index.ts</code> 接线 scheduleManager + tick
-      循环。
-    </p>
+    <p><strong>目标:</strong> 给 harness 加 Schedule 定时调度, 5 秒
+      tick 一次, 6 种 recurrence, 复用 Async Run 作为执行单元,
+      missed 不补跑, overlap 默认 skip。</p>
+    <p><strong>场景:</strong> 用户设 "每天 9 点跑 nightly test"
+      schedule, executor=command, command="npm test", permission=ci。
+    harness 启动, timer 5 秒 tick, 9:00 触发, 调 asyncRunManager.start
+    启动 npm test, 跑 5 分钟, 完成回调写 occurrence (completed +
+    outputRef), 推 notification 给 LLM。 周末进程不在线, 周一启动
+    检测周六 missed, 写 missed audit, 不补跑, 周一 9:00 照常触发。</p>
+    <p><strong>模块:</strong> <code>src/schedules.ts</code> (新)
+      暴露 <code>createScheduleManager({store, asyncRunManager, ...})</code>;
+      <code>src/tools/schedule-*.ts</code> (新) 包装 LLM 接口;
+      <code>src/index.ts</code> (改) Composition Root 创建 +
+      start() + SIGINT hook 调 stop()。</p>
     <p><strong>边界 (LLM 必须遵守的 checklist):</strong></p>
     <ul>
-      <li>schedule 落盘走 atomic write</li>
-      <li>通知 source="schedule" 单独标签, 不与 source="async-run" 混</li>
-      <li>tick 内部不 await asyncRunManager.start, fire-and-forget</li>
-      <li>cron 解析用 schedule.timezone 字段, 不读 process.env.TZ</li>
-      <li>
-        scheduleManager 工厂, Composition Root 唯一 new, 复用同一个
-        asyncRunManager
-      </li>
+      <li>6 种 recurrence 固定枚举 every_seconds / hourly / daily / weekly / monthly / yearly, 不可新增</li>
+      <li>occurrence id = scheduleId + scheduledAt 拼, stable + idempotent, 不可随机生成</li>
+      <li>missed occurrence 只记录审计, 不补跑, 推进到未来下一次</li>
+      <li>overlap 默认 skip, allow 需显式指定, 上次 running 时 allow 启动新 Async Run</li>
+      <li>Schedule 自己不跑命令 / 不调 LLM, 必须委托给 Async Run, 不重复实现执行生命周期</li>
+      <li>5 秒 tick (TICK_INTERVAL_MS = 5000), start / stop 幂等, 重复调用安全</li>
+      <li>linkedTaskUpdate 3 种 policy: never / append_note / mark_failed_on_failure, 不可任意改 task 状态</li>
+      <li>permissionProfile 3 种: readonly / ci / workspace_write, 透传给 Async Run 校验</li>
     </ul>
-    <p>
-      <strong
-        >验证 (用 fake scheduleManager + spy asyncRunManager, 逐条落到
-        vitest):</strong
-      >
-    </p>
+    <p><strong>验证 (用 fake clock + fake asyncRunManager + vitest, 逐条断言):</strong></p>
     <ul>
-      <li>
-        fake cron "* * * * *", 时间快进 1 分钟, tick 后 spy 验证
-        asyncRunManager.start 被调用 1 次
-      </li>
-      <li>
-        fake 触发, LLM 下一轮 messages 含 source="schedule" 标签, 不含
-        source="async-run"
-      </li>
-      <li>fake enabled=false, tick 后 spy 验证 start 没被调用</li>
-      <li>
-        fake timezone="Asia/Shanghai" cron="0 8 * * *", tick 后 lastFiredAt 是 8
-        点 (UTC+8)
-      </li>
-      <li>create() 后 spy 验证走 write tmp + fsync + rename</li>
+      <li>occurrence id 稳定: 同一 scheduleId + scheduledAt, 2 次调用字节相等</li>
+      <li>missed 不补跑: fake clock 跳过触发点 (now 晚于该跑没跑的时间), tick 后写 1 条 missed, fakeAsyncRun.start 调 0 次</li>
+      <li>overlap=skip: 上次 running, 触发点写 skipped_overlap, fakeAsyncRun.start 调 0 次</li>
+      <li>overlap=allow: 上次 running, 触发点 fakeAsyncRun.start 调 1 次</li>
+      <li>6 种 recurrence nextRunAt 计算正确: 6 个 case 覆盖 every_seconds / hourly / daily / weekly / monthly / yearly</li>
     </ul>
   </div>
 </div>
-
 <h2 id="practice">本章练习</h2>
 <ol>
-  <li>
-    故意让 tick 内部 await asyncRunManager.start, 跑测试, 看"tick 派发到 async
-    run" 是否抓到 (会卡在 start 上)。
+<li>
+    故意让 Schedule 自己跑 (不委托 Async Run), 跑 schedule 触发
+    测试, 看"Schedule 是触发器不是执行器" 是否抓到 (并发失控 /
+    终态不一致 vs. Async Run 集中管理)。
   </li>
-  <li>
-    把通知 source 标签写成 "async-run", 跑测试, 看"schedule 通知单独 source"
-    是否抓到。
+<li>
+    故意让 missed 自动补跑, 跑 7 天没启动 harness 后重启的测试,
+    看"missed 不补跑" 是否抓到 (7 个 missed → 7 个并发 Async Run
+    vs. 1 条 missed audit + 0 个补跑)。
   </li>
-  <li>
-    cron 解析用 process.env.TZ 而不是 schedule.timezone, 跑测试, 看"timezone
-    字段被解析" 是否抓到。
+<li>
+    故意让 occurrence id 随机生成, 跑"启动时检测 missed" 测试,
+    看"稳定 id" 是否抓到 (重复创建 missed occurrence vs. 同一
+    id 幂等)。
+  </li>
+<li>
+    故意让 overlap 默认 allow, 跑"上次还在 running" 测试, 看
+    "overlap 默认 skip" 是否抓到 (2 个 CI 并发 vs. 1 个 running
+    + 1 个 skipped)。
   </li>
 </ol>
-
 <h2 id="summary">本章小结</h2>
 <p>
-  本章给 harness 加了 Schedule 模块, 让 harness 按 cron 自动触发任务。 schedule
-  落盘走 atomic write, 内部复用 async run 派发, 通知走 单独 source="schedule"
-  标签。tick 循环在 harness 启动时开, 每 分钟检查一次。下一章 (第 15 章)
-  我们会处理"长期运行卫生" 的 问题——Runtime Hardening,
-  包括日志轮转、原子写、output handle、 清理 dry-run 等, 让 harness
-  在长期运行下保持稳定。
+  Schedule 是给"时间触发" 的调度器, 复用 Async Run 作为执行
+  单元。 核心是 5 个设计:
 </p>
-
-<h2 id="next">下一章伏笔</h2>
+<ul>
+<li>
+<strong>触发器, 不是执行器</strong>: Schedule 自己不跑, 委托
+    Async Run, 复用其并发 / 终态 / 输出 / abort 全部基础设施。
+  </li>
+<li>
+<strong>6 种 recurrence</strong>: every_seconds / hourly / daily /
+    weekly / monthly / yearly, 固定枚举, 不可新增。
+  </li>
+<li>
+<strong>occurrence 稳定 id</strong>: scheduleId + scheduledAt
+    拼, 幂等, 启动时检测 missed 不重复。
+  </li>
+<li>
+<strong>missed 不补跑</strong>: 教学版故意简化, 只记录审计,
+    推进到未来下一次。
+  </li>
+<li>
+<strong>overlap 默认 skip</strong>: 安全默认, allow 需显式指定。
+  </li>
+</ul>
 <p>
-  第 14 章让 harness 有了定时任务, 但 schedule / async run / memory / task
-  这些持久化数据在长期运行下会无限增长。下一章 Runtime Hardening
-  会处理"卫生系统": 原子写 (write tmp + fsync + rename)、日志轮转、 output
-  handle 引用计数、清理 dry-run、时间语义统一 (epoch ms 而非 Date 字符串), 让
-  harness 在真实使用中慢慢腐烂时不至于崩溃。
+  下一章 (第 15 章) 展开 harness 跑生产时的<strong>运行时安全</strong>
+  — 日志轮转 / 配置优先级 / 终端兼容性 / 多 session 隔离 /
+  normalize 协议差异 / LLM Provider 多后端支持 / 上下文排序 /
+  Prompt 策略, 把前面 14 章的所有模块在生产里跑稳。
 </p>
diff --git a/tutorial/chapters/15-hardening.html b/tutorial/chapters/15-hardening.html
index fe720d8..60fe245 100644
--- a/tutorial/chapters/15-hardening.html
+++ b/tutorial/chapters/15-hardening.html
@@ -1,552 +1,737 @@
-<p class="article__eyebrow">第 15 章 · 长期运行的卫生系统</p>
-<h1 class="article__title">Runtime Hardening: 长期运行不会把系统跑坏</h1>
+<p class="article__eyebrow">第 15 章 · 跑生产时的运行时安全</p>
+<h1 class="article__title">Hardening: 日志轮转 / 多 LLM / 多 session / Normalize 协议差异</h1>
 <p class="article__lede">
-  前面十四章让 harness 拥有完整能力, 但 schedule / async run / memory / task /
-  output store 这些持久化数据在长期运行下会无限增长, 日志、 transcript、output
-  handle 也会累积。某天启动变慢、磁盘占满、某个 JSON 因为中断写坏。这一章给
-  harness 加 Runtime Hardening, 覆盖所有 读写边界: 原子写、日志轮转、output
-  handle 引用计数、清理 dry-run、 时间语义统一, 让 harness
-  在真实使用中慢慢腐烂时不至于崩溃。
+  前 14 章让 harness 在 happy path 上跑得通, 但生产里还要解决
+  8 件"杂事": 日志文件 <strong>撑爆磁盘</strong> (log-rotation) /
+  <strong>3 个 LLM 后端</strong> (OpenAI / Anthropic / 自部署
+  vLLM) <strong>协议差异</strong> (normalize) / 配置<strong>优先级</strong>
+  (CLI > env > config.ts) / 多 session <strong>并发隔离</strong>
+  / prompt <strong>策略层</strong> (system-prompt 拼装) / context
+  <strong>排序</strong> (context-ranking) / 终端<strong>兼容</strong>
+  (terminal.ts TTY 检测) / REPL <strong>命令注册</strong> (cli-commands)。
+  这一章不深入单个模块, 而是把这些 "安全网" 列清楚, 讲清"为什么需要" +
+  "靠哪条不变量保证" + "出问题怎么定位"。
 </p>
-
-<nav id="article-inline-toc" class="article__meta" aria-label="页内小节"></nav>
-
-<hr class="rule" />
-
-<h2 id="delta-from-14">在第 14 章基础上改了什么</h2>
-<p>
-  这一章不新增业务模块, 而是基于已有的 schedule (第 14 章) / async run (第 13
-  章) / memory (第 09 章) / task (第 12 章) / output store (第 06 章 P1) /
-  transcript (第 08 章) 重新组织所有持久化 IO, 加上 原子写、日志轮转、output
-  handle 引用计数、清理 dry-run、时间语义 统一五个卫生系统。 对应到代码,
-  改动集中在 5 个文件: <code>src/atomic-write.ts</code> (新)、
-  <code>src/log-rotation.ts</code> (新)、<code>src/output-store.ts</code>
-  (改加引用计数)、<code>src/cleanup.ts</code> (新)、<code>src/timeline.ts</code>
-  (改时间语义统一)。
-</p>
-<div class="source-links" aria-label="本章 GitHub 永久链接">
-  <a
-    class="source-link"
-    href="https://github.com/pingp76/swoopcode/blob/main/src/atomic-write.ts"
-    target="_blank"
-    rel="noreferrer"
-    >1. src/atomic-write.ts: 原子写工具 (新)</a
-  >
-  <a
-    class="source-link"
-    href="https://github.com/pingp76/swoopcode/blob/main/src/log-rotation.ts"
-    target="_blank"
-    rel="noreferrer"
-    >2. src/log-rotation.ts: 日志轮转 (新)</a
-  >
-  <a
-    class="source-link"
-    href="https://github.com/pingp76/swoopcode/blob/main/src/output-store.ts"
-    target="_blank"
-    rel="noreferrer"
-    >3. src/output-store.ts: 引用计数 + 过期清理 (改)</a
-  >
-  <a
-    class="source-link"
-    href="https://github.com/pingp76/swoopcode/blob/main/src/cleanup.ts"
-    target="_blank"
-    rel="noreferrer"
-    >4. src/cleanup.ts: 干跑清理 (新)</a
-  >
-  <a
-    class="source-link"
-    href="https://github.com/pingp76/swoopcode/blob/main/src/timeline.ts"
-    target="_blank"
-    rel="noreferrer"
-    >5. src/timeline.ts: 时间语义统一 (改)</a
-  >
-</div>
-
-<h2 id="author-thinking">作者怎么想的: 这一章的思考链</h2>
-<dl class="defs">
-  <dt>想清楚现象</dt>
-  <dd>
-    harness 跑了一个月, 用户突然发现启动变慢、磁盘占满、schedule 触发时间错位
-    (以为本地时区实际 UTC)。现象是"卫生系统缺失, 长期运行下慢慢腐烂"。
-  </dd>
-  <dt>想反例</dt>
-  <dd>
-    最朴素的反例是"先不管, 等文件太多再 rm -rf"。这有两个问题:
-    一是不知道哪些数据可删 (memory / task 是用户长期事实, 删了 用户损失),
-    二是粗暴删除破坏原子性, 写到一半的文件会留垃圾。
-  </dd>
-  <dt>想接口和不变量</dt>
-  <dd>
-    接口:
-    <code
-      >interface AtomicWrite { write(path, content) }, interface Cleanup {
-      dryRun(), run() }</code
-    >。 不变量五条: (1) 所有持久化 IO 走 atomic write, 不直接 fs.writeFile, (2)
-    日志按大小或时间轮转, 不无限增长, (3) output store 引用计数, 没人引用的
-    output_id 可清理, (4) 清理先 dry-run 给用户看, 用户 确认后才真删, (5) 时间用
-    epoch ms (number) 不用 Date 字符串。
-  </dd>
-  <dt>想怎么验证</dt>
-  <dd>
-    fake atomic write 中断 (模拟断电), 跑完后原文件仍然存在, tmp 文件可能被留
-    (不破坏), 重新启动 harness 后能正常解析; fake cleanup dry-run 返回 5
-    个待删文件, run() 之前文件仍在, run() 之后文件不在。
-  </dd>
-</dl>
-
-<h2 id="observe-first">先观察: 两段故意有气味的实现</h2>
-
-<div class="note">
-  <p class="note__title">观察 1 · 直接 fs.writeFile, 断电留半截</p>
-  <pre class="code-block"><code>// 教学简化版, 仓库真实实现见 GitHub 永久链接
-// 错误: 直接写, 断电会留半截
-import fs from "node:fs/promises";
-async function saveMemory(path: string, content: string) {
-  await fs.writeFile(path, content);
-}</code></pre>
-  <p><strong>问:</strong>为什么不直接 writeFile?</p>
-  <p>
-    <strong>答:</strong>三件事同时坏掉 —— 原子性: 写到一半断电, 文件 是半截
-    JSON, 下次启动解析失败, 用户的偏好丢失; 不可恢复: 解析失败后没有 backup,
-    用户只能从头开始; 可观测: 写失败时 fs.writeFile
-    抛错但不告诉用户"为什么失败", 调试困难。
-  </p>
-</div>
-
-<div class="note">
-  <p class="note__title">观察 2 · cleanup 实际跑, 不 dry-run</p>
-  <pre class="code-block"><code>// 教学简化版
-// 错误: 不先 dry-run 给用户看
-async function cleanup() {
-  const oldFiles = await glob(".claude/cache/*.tmp");
-  for (const f of oldFiles) await fs.unlink(f);
-}</code></pre>
-  <p><strong>问:</strong>为什么不直接删?</p>
-  <p>
-    <strong>答:</strong>三件事同时坏掉 —— 误删: 用户可能有临时调研 文件在 tmp
-    里, 删了用户丢失; 不可逆: 删了之后用户说"等等, 那个文件我有用的", 没了;
-    不可信: 用户不敢用清理功能, 担心 删错, harness 的卫生功能形同虚设。
-  </p>
-</div>
-
-<h2 id="atomic-write">原子写: write tmp + fsync + rename</h2>
-<pre class="code-block"><code>// 教学简化版, 仓库真实实现见 GitHub 永久链接
-export async function atomicWriteFile(path: string, content: string | Buffer): Promise&lt;void&gt; {
-  const tmpPath = `${path}.tmp.${process.pid}.${Date.now()}`;
-  const fh = await fs.open(tmpPath, "w");
-  try {
-    await fh.writeFile(content);
-    await fh.sync();  // fsync 强制刷盘
-  } finally {
-    await fh.close();
-  }
-  await fs.rename(tmpPath, path);  // rename 在同一文件系统下是原子的
-}</code></pre>
-<p>
-  关键: <code>rename</code> 在同一文件系统下是原子操作。要么 tmp 文件 rename
-  成功, 替换原文件; 要么 rename 失败, 原文件不变。半截写入 不会暴露给后续读取。
+<nav aria-label="页内小节" class="article__meta" id="article-inline-toc"></nav>
+<hr class="rule"/>
+<h2 id="real-failure">真实失败故事: harness 跑 3 个月, 磁盘撑爆 100GB</h2>
+<p>
+  写代码之前, 先看一个真实痛点。 用户的 team 上线 harness 跑
+  生产, 3 个月后发现 /var/log/harness.log 撑爆 100GB, 系统
+  报警 "disk usage 95%"。 同时 team 想接 Anthropic Claude (原本
+  只支持 OpenAI), 改完 LLM 客户端发现 100 个测试里 30 个 fail
+  (thinking_content 字段位置不一样)。
 </p>
-
-<h2 id="log-rotation">日志轮转: 按大小 + 按时间</h2>
+<ol>
+<li>
+<strong>症状 1: 日志撑爆</strong>: 跑 3 个月没轮转, 单个 .log
+    文件 100GB, 磁盘满, 系统挂。
+  </li>
+<li>
+<strong>症状 2: 换 LLM fail</strong>: OpenAI 用
+    <code>response.choices[0].message.content</code>, Anthropic
+    用 <code>response.content[0].text</code>, 代码到处写
+    <code>message.content</code> 拿到 undefined。
+  </li>
+<li>
+<strong>症状 3: 配置混乱</strong>: CLI 传 <code>--mode plan</code>,
+    env 传 <code>HARNESS_MODE=auto</code>, 哪个生效? 团队
+    不知道, 出现"我配的怎么不生效"。
+  </li>
+<li>
+<strong>症状 4: 多 session 互相干扰</strong>: user A 和 user B
+    同时跑 harness, 共享 history 数组, A 看到 B 的对话。
+  </li>
+<li>
+<strong>真问题</strong>: 缺<strong>运行时安全网</strong> —
+  日志轮转 / LLM 协议 normalize / 配置优先级 / session 隔离 /
+  prompt 策略 / context 排序 / 终端兼容 / CLI 命令注册, 8 个独立
+  但都必需的模块, 缺一个就出事。
+  </li>
+</ol>
 <p>
-  harness 日志 (LLM 通信 / hook 决策 / schedule 触发) 持续增长。本章
-  引入日志轮转, 按 <code>maxFileSize</code> (默认 10MB) 或 <code>maxAge</code>
-  (默认 7 天) 切割。轮转策略:
+  朴素想法 1: "日志不重要, 不用管?" 错。 日志是出问题时<strong>唯一</strong>
+  的排查依据, 撑爆 = 排查依据全丢 = 第二次事故无法
+  复盘。
 </p>
-<dl class="defs">
-  <dt>按大小</dt>
-  <dd>
-    日志文件超过 maxFileSize 时, 关闭当前文件, 重命名为
-    <code>app.log.1</code>、<code>app.log.2</code> 等, 启动新的
-    <code>app.log</code>。保留最近 N 个文件, 更老的删除。
-  </dd>
-  <dt>按时间</dt>
-  <dd>每天 0 点自动轮转, 跨天的日志按天归档。保留最近 7 天, 更老的 删除。</dd>
-  <dt>组合</dt>
-  <dd>同时满足时按更严格的策略。本章默认两个都启用, 任一触发即轮转。</dd>
-</dl>
-
-<h2 id="output-handle">output store 引用计数</h2>
-<p>
-  第 06 章 P1 压缩把大工具输出存到
-  <code>~/.cache/run-outputs/&lt;output_id&gt;</code>, history
-  里只剩占位符。问题是: 如果 LLM 之后再没引用这个 output_id, 文件就成孤儿了,
-  永远占磁盘。本章加引用计数:
-</p>
-<pre class="code-block"><code>// 教学简化版, 仓库真实实现见 GitHub 永久链接
-export interface OutputHandle {
-  id: string;
-  path: string;
-  size: number;
-  refCount: number;        // 当前被多少 messages 引用
-  createdAt: number;
-  expiresAt: number;        // 引用计数 0 之后, 24h 后可清理
-}</code></pre>
-<p>
-  每次写入 output 时 refCount = 1 (对应的 tool message 引用一次); 之后 compress
-  / replay 删除或合并该 tool message 时, refCount -= 1; refCount 归 0 后, 24
-  小时后可清理。cleanup 走 dry-run 列出待清理 文件, 用户确认后才真删。
+<p>
+  朴素想法 2: "只支持 OpenAI, 不接其他?" 商业上不行。 team 想
+  换 Anthropic / 自部署 vLLM 节省成本, 不接 = 锁死单 provider。
+  需要<strong>协议 normalize 层</strong>, 内部用统一格式, 外部
+  adapter 转 provider 协议。
 </p>
-
-<h2 id="cleanup">清理: dry-run + run</h2>
-<pre class="code-block"><code>// 教学简化版, 仓库真实实现见 GitHub 永久链接
-export interface CleanupReport {
-  outputOrphans: { id: string; size: number; path: string }[];
-  oldLogs: { path: string; size: number; ageDays: number }[];
-  rotatedBackups: { path: string; size: number }[];
-  totalSize: number;
-}
-
-export interface Cleanup {
-  // 列出待清理内容, 不真删
-  dryRun(): Promise&lt;CleanupReport&gt;;
-  // 用户确认后真删
-  run(report: CleanupReport): Promise&lt;{ deleted: number; freedBytes: number }&gt;;
-}</code></pre>
 <p>
-  dry-run 返回的报告分三类: 孤儿 output (引用计数 0 且超过 24h)、 老日志 (超过
-  maxAge)、轮转备份 (超过保留份数)。用户看完报告, 决定哪些删、哪些保留。
+  正确做法: 8 个<strong>独立</strong>安全网各自解决一个问题, 落在 9 个文件里 —
+  <code>log-rotation.ts</code> / <code>normalize.ts</code> /
+  <code>config.ts</code> / <code>session.ts</code> / <code>system-prompt.ts</code> /
+  <code>context-ranking.ts</code> / <code>terminal.ts</code> /
+  <code>cli-commands.ts</code> / <code>llm-providers.ts</code>
+  (normalize.ts 与 llm-providers.ts 共同负责"协议差异")。 这一章把它们
+  <strong>串起来</strong>看, 讲清"为什么需要" + "靠哪条不变量保证"。
 </p>
-
-<h2 id="timeline">时间语义: 全部用 epoch ms</h2>
+<h2 id="hardening-areas">8 个安全网的边界</h2>
 <p>
-  不同模块用不同时间格式 (Date 字符串 / 秒级 timestamp / 毫秒 timestamp)
-  会导致排序错乱、跨时区混乱。本章统一时间语义:
+  <strong>用途</strong>: 8 个模块各自独立, 但有<strong>层间依赖</strong>:
+  config 在最底层 (其他都依赖它的配置), session 在中间层 (多
+  进程共享资源), cli-commands 在顶层 (暴露 LLM / user 接口)。
+  理解依赖关系才知道"改一个会不会影响其他"。
 </p>
-<ul>
-  <li>持久化存储 (memory / task / schedule): 全部用 epoch ms (number)。</li>
-  <li>日志输出: epoch ms 数字 + 格式化为 ISO 8601 字符串 (UTC)。</li>
-  <li>UI 显示: 转用户时区 + 友好格式 ("3 小时前")。</li>
-</ul>
 <p>
-  内部不允许 Date 字符串。string &lt;-&gt; number 转换只发生在 IO 边界 (读 JSON
-  文件时 string -&gt; number, 写 UI 时 number -&gt; 字符串)。
+  <strong>真实场景</strong>: team 想加 "harness 支持 /thinking
+  思考预算" 配置, 要改 (1) config.ts 加 thinking_budget 字段;
+  (2) system-prompt.ts 把预算拼到 prompt; (3) cli-commands.ts
+  加 /thinking CLI 命令; (4) llm-providers.ts 转给具体 provider
+  (Anthropic 接受 thinking 字段, OpenAI 不接受)。 4 个文件联动
+  改, 漏一个就出 bug。
 </p>
-
-<h2 id="trap">反例梯度</h2>
-
-<div class="card card--trap">
-  <div class="card__head">
-    <span class="card__tag">新手错法 · A</span>
-  </div>
-  <div class="card__body">
-    <p><strong>常见错误:</strong>持久化用 fs.writeFile, 不走 atomic write。</p>
-    <p><strong>为什么错:</strong>断电留半截 JSON, 解析失败, 用户偏好丢失。</p>
-    <p>
-      <strong>正确做法:</strong>所有持久化 IO 走 atomicWriteFile(), write tmp +
-      fsync + rename。
-    </p>
+<p>
+  <strong>设计思想</strong>: <strong>模块独立 + 协议对齐</strong> —
+  Reference 章节 "模式 1 · 工厂 + 闭包" + "模式 3 · 窄接口 + Adapter"
+  的延伸。 8 个模块都用工厂 + 闭包, 接口清晰, 改一个不污染其他。
+</p>
+<div class="figure figure--stack">
+  <div class="figure__title">图 1 · 8 个安全网的依赖层次</div>
+  <div class="flow-stack">
+    <div class="flow-stack__layer flow-stack__layer--stable">
+      <div class="flow-stack__layer--stable"></div>
+      <div class="flow-stack__label">顶层 · CLI / Tools</div>
+      <div class="flow-stack__body">cli-commands.ts 暴露 /mode /memory /schedule /task /model-policy 等 CLI 命令; tools/* 暴露 LLM 可调工具。</div>
+    </div>
+    <div class="flow-stack__layer flow-stack__layer--dynamic">
+      <div class="flow-stack__label">策略层 · system-prompt / context-ranking</div>
+      <div class="flow-stack__body">system-prompt.ts 拼装 system message (含 memory / skill / tool 规则); context-ranking.ts 排序 history 中哪些 message 优先。</div>
+    </div>
+    <div class="flow-stack__layer flow-stack__layer--stable">
+      <div class="flow-stack__label">执行层 · normalize / llm-providers / foundation-models</div>
+      <div class="flow-stack__body">normalize.ts 把消息归一成 OpenAI ChatCompletionMessageParam; llm-providers.ts 暴露 3 个 provider (openai / anthropic / vllm); foundation-models.ts 列出 5 个常见 model 适配。</div>
+    </div>
+    <div class="flow-stack__layer flow-stack__layer--dynamic">
+      <div class="flow-stack__label">会话层 · session / transcript</div>
+      <div class="flow-stack__body">session.ts 派生独立 history / 隔离多 user; transcript.ts 持久化对话到文件。</div>
+    </div>
+    <div class="flow-stack__layer flow-stack__layer--stable">
+      <div class="flow-stack__label">基础层 · config / logger / log-rotation / terminal</div>
+      <div class="flow-stack__body">config.ts 加载配置 (CLI > env > default); logger.ts 输出日志; log-rotation.ts 按 size 轮转; terminal.ts 检测 TTY / 颜色支持。</div>
+    </div>
   </div>
 </div>
+<p>
+  <strong>实现细节</strong>: 8 个模块跨 5 层, 顶层依赖底层, 底层
+  不依赖顶层。 改 config 加字段, 只影响基础层 + 触发其他层 reactive
+  读; 改 cli-commands 加命令, 只影响顶层。 层间清晰, 维护成本
+  线性。
+</p>
+<h2 id="log-rotation">log-rotation: 按 size 轮转 + 保留 N 个</h2>
+<p>
+  <strong>用途</strong>: 跑生产 harness, 日志文件<strong>必须</strong>
+  按 size 轮转, 避免单文件撑爆磁盘。 <code>log-rotation.ts</code>
+  提供 <code>rotateLogFileIfNeeded(filePath, options)</code> —
+  写日志前检查 size, 超过 maxBytes 触发轮转。
+</p>
+<p>
+  <strong>真实场景</strong>: harness 跑 3 个月, 单个 .log 文件
+  100GB。 正确做法: maxBytes = 50MB, 写日志前检查 size, 超了
+  rename <code>harness.log</code> → <code>harness.log.1</code>
+  (覆盖最旧的), 新 <code>harness.log</code> 继续写。 保留 5 个
+  backup, 总占用 ≤ 50 × 6 = 300MB, 远小于磁盘容量。
+</p>
+<p>
+  <strong>设计思想</strong>: <strong>size-based + 简单 rename</strong> —
+  不引入 <code>logrotate</code> 系统工具, harness 自己处理,
+  跨平台一致。 看
+  <a href="https://github.com/pingp76/swoopcode/blob/main/src/log-rotation.ts#L29"><code>src/log-rotation.ts</code> 第 29 行的 <code>rotateLogFileIfNeeded</code></a>:
+</p>
+<pre><code class="language-typescript">export function rotateLogFileIfNeeded(
+  filePath: string,
+  options: LogRotationOptions,
+): { rotated: boolean; removedFiles: string[] };</code></pre>
+<p>
+  <strong>实现细节</strong>: 关键不变量 — (a) 写日志<strong>前</strong>
+  检查 size, 不是写完后; (b) rename 走原子 (rename syscall 是
+  原子的); (c) 保留 N 个 backup, 多的删; (d) 轮转失败 warn 日志,
+  不 throw, 不影响 harness 继续写日志。 这是 Reference 章节
+  "模式 10 · Atomic Write 原子写" 的应用。
+</p>
+<h2 id="normalize">normalize: 3 个 provider 协议归一</h2>
+<p>
+  <strong>用途</strong>: OpenAI / Anthropic / 自部署 vLLM 的协议
+  格式不同, agent 主循环不应该直接面对 3 套格式。 应该<strong>内部</strong>
+  用 OpenAI ChatCompletionMessageParam 统一格式, <strong>外部</strong>
+  调 provider adapter 转协议。 看
+  <a href="https://github.com/pingp76/swoopcode/blob/main/src/normalize.ts#L26"><code>src/normalize.ts</code> 第 26 行的 <code>normalizeMessages</code></a>:
+</p>
+<pre><code class="language-typescript">export function normalizeMessages(
+  messages: unknown[],
+): ChatCompletionMessageParam[];</code></pre>
+<p>
+  <strong>真实场景</strong>: user 切换 Anthropic Claude, Anthropic
+  用 <code>{role: "assistant", content: [{type: "text", text: "..."}]}</code>
+  (content 是数组), OpenAI 用 <code>{role: "assistant", content: "..."}</code>
+  (content 是字符串)。 agent 主循环假设 OpenAI 格式, 直接
+  <code>message.content</code> 拿字符串, 拿到数组报错。 normalize
+  层把 Anthropic 的 content 数组归一成 OpenAI 的字符串, agent
+  主循环无感知。
+</p>
+<p>
+  <strong>设计思想</strong>: <strong>统一内部格式, adapter 翻译</strong> —
+  Reference 章节 "模式 3 · 窄接口 + Adapter" 的标准应用。 选
+  OpenAI 格式作为"内部标准" 因为最常见, Anthropic / vLLM 写
+  adapter 把自家协议转成 OpenAI 格式。 看
+  <a href="https://github.com/pingp76/swoopcode/blob/main/src/llm-providers.ts"><code>src/llm-providers.ts</code></a>
+  — 3 个 provider, 每个一个 adapter 文件, 内部都返回 OpenAI 格式
+  响应。
+</p>
+<p>
+  <strong>实现细节</strong>: 5 个常见差异要 normalize — (a) content
+  字符串 vs 数组; (b) tool_call 在 message 里 vs 在 content 块
+  里; (c) reasoning_content 字段 (Anthropic) vs reasoning
+  (OpenAI o1); (d) function_call 旧字段 vs tool_calls 新字段;
+  (e) 错误格式 (OpenAI APIError vs AnthropicError)。 normalize
+  层统一处理, agent 主循环只认"OpenAI 格式"。
+</p>
+<h2 id="config-priority">config 优先级: CLI > env > default</h2>
+<p>
+  <strong>用途</strong>: harness 配置从 3 个来源加载:
+  (1) <code>loadConfig()</code> 读 default + config.ts;
+  (2) 环境变量 <code>HARNESS_*</code> 覆盖 default;
+  (3) CLI 参数 <code>--mode / --debug</code> 覆盖 env。
+  优先级明确, 不含糊。 看
+  <a href="https://github.com/pingp76/swoopcode/blob/main/src/config.ts#L89"><code>src/config.ts</code> 第 89 行的 <code>loadConfig</code></a>:
+</p>
+<pre><code class="language-typescript">export function loadConfig(): Config;</code></pre>
+<p>
+  <strong>真实场景</strong>: team 跑 CI, 想"覆盖 mode=auto +
+  timeout=60s"。 CI 脚本:
+  <code>HARNESS_MODE=auto harness --timeout 60000</code>。
+  mode 来自 env, timeout 来自 CLI, 两个字段互不冲突, 都生效; 同一字段
+  同时出现在 CLI 和 env 时由 CLI 胜出; 不传 --timeout 则回落到 env 或 default。
+</p>
+<p>
+  <strong>设计思想</strong>: <strong>显式优先级 + 不打平</strong> —
+  Reference 章节 "模式 5 · Composition Root 组合根" 的应用。
+  loadConfig() 内部按 (CLI > env > default) 顺序 merge, 不
+  一次性 flatten (保留"哪个来源覆盖" 信息, 调试时能看到)。
+  3 条原则:
+</p>
+<ol>
+<li>
+<strong>CLI 最高</strong>: <code>--mode plan</code> 永远覆盖
+    env 和 default。
+  </li>
+<li>
+<strong>env 次之</strong>: <code>HARNESS_MODE=auto</code> 覆盖
+    default, 但被 CLI 覆盖。
+  </li>
+<li>
+<strong>default 兜底</strong>: 没 CLI 也没 env, 用 config.ts
+    默认值 (mode=default, timeout=30s)。
+  </li>
+</ol>
+<p>
+  <strong>实现细节</strong>: loadConfig 返回的 Config 含
+  <code>source: { mode: "cli" | "env" | "default" }</code>
+  字段, 调试时输出 "mode=plan (source: cli)", 让用户知道
+  配置从哪来, 不再 "我配的怎么不生效"。
+</p>
+<h2 id="session-isolation">多 session 隔离: 独立 history + 独立工具</h2>
+<p>
+  <strong>用途</strong>: 多 user / 多项目同时用 harness, 互不
+  干扰: 独立 history / 独立 tool registry / 独立 session id。
+  这是 Reference 章节 "模式 1 · 工厂 + 闭包" 的标准应用 —
+  每次 createSession() 返回新对象, 闭包内 state 互不污染。
+</p>
+<p>
+  <strong>真实场景</strong>: 30 个 user 同时跑 harness, user A
+  写到 history 的对话不会跑到 user B 的 history, user A 调的
+  subagent 不会和 user B 的 subagent 共享 LLM client。 靠
+  <code>createSession({userId, projectRoot})</code> 每次返回
+  独立实例。
+</p>
+<p>
+  <strong>设计思想</strong>: <strong>工厂 + 闭包</strong> —
+  SessionManager.createSession() 返回的 Session 含独立 history
+  (createHistory) + 独立 tool registry (createToolRegistry) +
+  独立 llm client (按需创建)。 关键不变量:
+  <strong>状态在闭包内, 外部不能误改</strong>。
+</p>
+<p>
+  <strong>实现细节</strong>: 隔离的边界 — (a) history 数组
+  互不共享; (b) tool registry 的状态 (call counts / last args)
+  互不共享; (c) 持久化文件按 sessionId 分目录 (transcript
+  写到 <code>.sessions/&lt;sessionId&gt;/transcript.jsonl</code>);
+  (d) LLM API key 按 session 注入 (多 key 轮换避免 rate limit)。
+  Reference 章节 "模式 17 · Test Doubles" 的应用 — 测试时
+  fake session 互不污染。
+</p>
+<h2 id="system-prompt">system-prompt: 拼装策略层</h2>
+<p>
+  <strong>用途</strong>: system prompt 不是"一段写死的字符串",
+  是<strong>策略拼装</strong> — base 规则 + memory 段 + skill 段
+  + tool 规则 + 当前 context, 按层拼, 缺哪段就跳过。 教学版
+  关键是 "system prompt 是 stable prefix, 拼完 1 次, 100 轮
+  对话复用" (第 10 章)。
+</p>
+<p>
+  <strong>真实场景</strong>: 启动时拼 system prompt =
+  <code>baseRules + memorySection + skillSection + toolRules + contextSummary</code>。
+  5 段按顺序, 每段独立可测。 测试用 fake memory / fake skill
+  注入, 验证拼装结果。
+</p>
+<p>
+  <strong>设计思想</strong>: <strong>分层拼装, 缺段跳过</strong> —
+  Reference 章节 "模式 6 · Pipeline 管道" + "模式 11 ·
+  Cache-friendly 缓存友好" 的应用。 每段单独函数, 单独测试,
+  拼装函数只负责 concat。 缺 memory 段时 <code>memorySection || ""</code>,
+  不 throw。
+</p>
+<p>
+  <strong>实现细节</strong>: system prompt 的体积控制 — 每段
+  限制 maxTokens (memory 段 200 token, skill 段 500 token, tool
+  规则 1000 token, 总量 ≤ 4k token), 超过截断 + 提示 "memory
+  段过长, 考虑清理"。 这是防止 system prompt 撑爆 cache prefix
+  命中率 (第 10 章)。
+</p>
+<h2 id="context-ranking">context-ranking: history 中哪些 message 优先</h2>
+<p>
+  <strong>用途</strong>: history 100 轮对话, 40000 token, 超出模型窗口,
+  LLM 看不全? 不是 — 老内容折叠成摘要后 LLM 仍看到全貌, 问题是<strong>截断</strong>时哪些 message
+  优先保留? context-ranking 提供<strong>排序策略</strong>: 系统
+  消息 / 用户最近输入 / 工具错误 / 长输出摘要等优先级不同。
+</p>
+<p>
+  <strong>真实场景</strong>: 100 轮对话 history 40000 token 超
+  模型窗口 8k, 需要截断到 6k。 context-ranking 排序: (1) system
+  message 永不丢; (2) 最近 5 轮 user/assistant/tool 完整保留;
+  (3) 早期长 tool 输出保留摘要; (4) 早期 user/assistant 折叠
+  成 "1-line summary"。 截断后 LLM 看到"全部 context, 但老的内容
+  压缩了"。
+</p>
+<p>
+  <strong>设计思想</strong>: <strong>分级保留 + 摘要折叠</strong> —
+  Reference 章节 "模式 6 · Pipeline 管道" + "模式 13 ·
+  Error→Action 错误转动作" 的应用。 截断不是"丢老的内容", 是
+  "老的内容保留摘要, 新的内容保留全文"。 摘要靠 LLM 生成 (内部
+  Async Run), 教学版可以简化为"丢最早的 10% 内容"。
+</p>
+<p>
+  <strong>实现细节</strong>: 排序权重 (a) system = 100; (b) 工具
+  错误 = 90 (LLM 需要看到错误调整); (c) 最近 5 轮 = 80; (d)
+  远期 user 消息 = 40; (e) 远期工具输出 = 20 (优先压缩)。 截断
+  时按权重从高到低保留, 直到达到 maxTokens。
+</p>
+<h2 id="terminal">terminal: TTY / 颜色 / 宽度的环境检测</h2>
+<p>
+  <strong>用途</strong>: harness 在 TTY 终端 / IDE 内嵌 / pipe 重
+  定向 3 种环境下行为不同: TTY 用 ANSI 颜色 + 交互式 prompt,
+  pipe 关闭颜色 + 用纯文本, IDE 看 isatty + 父进程名启发式
+  判断。 <code>terminal.ts</code> 暴露 Terminal 接口, 检测环境
+  + 调对应输出策略。
+</p>
+<p>
+  <strong>真实场景</strong>: user 在 vscode 集成终端跑 harness,
+  harness 输出带 ANSI 颜色, vscode 渲染颜色; user pipe 给
+  <code>less</code>, harness 检测到 not tty 关闭颜色, less 看到
+  纯文本 (不会乱码)。 同一份代码, 不同环境自动适配。
+</p>
+<p>
+  <strong>设计思想</strong>: <strong>环境检测 + 降级输出</strong> —
+  Reference 章节 "模式 13 · Error→Action 错误转动作" 的应用。
+  颜色开启 / 关闭是<strong>副作用</strong>, 应该从主流程里抽出,
+  集中到 terminal 接口。
+</p>
+<p>
+  <strong>实现细节</strong>: 检测点 4 个 — (a) <code>process.stdout.isTTY</code>;
+  (b) <code>process.env.NO_COLOR</code>; (c) <code>process.env.TERM</code>
+  含 "dumb" 关闭颜色; (d) <code>process.stdout.columns</code>
+  拿终端宽度 (布局自适应)。 任一不满足 → 关闭颜色 / 用固定
+  宽度 80 / 禁用交互 prompt。
+</p>
+<h2 id="cli-commands">cli-commands: 9 个 /xxx 命令注册</h2>
+<p>
+  <strong>用途</strong>: 用户在 REPL 输 <code>/mode plan</code> /
+  <code>/memory list</code> / <code>/schedule list</code> /
+  <code>/task create</code> / <code>/model-policy show</code> 等
+  命令, cli-commands.ts 集中注册 + 解析 + 执行。 看
+  <a href="https://github.com/pingp76/swoopcode/blob/main/src/cli-commands.ts#L65"><code>src/cli-commands.ts</code> 第 65 行的 <code>createCliCommandRegistry</code></a>:
+</p>
+<pre><code class="language-typescript">export function createCliCommandRegistry(): CliCommandRegistry;</code></pre>
+<p>
+  <strong>真实场景</strong>: user 输 <code>/task create "Fix bug
+  #123"</code>, registry 解析 create 子命令 + "Fix bug #123" 参数,
+  调 <code>taskManager.createGroup(...)</code> 创建 task group,
+  输出 "Created task group task_xxx: Fix bug #123"。 9 个命令
+  (mode / memory / skill / schedule / task / model-policy /
+  thinking / stable-context 等) 各管一摊, 互不干扰。
+</p>
+<p>
+  <strong>设计思想</strong>: <strong>注册表 + 工厂</strong> —
+  Reference 章节 "模式 1 · 工厂 + 闭包" + "模式 5 · Composition
+  Root" 的应用。 每个 /xxx 命令是独立工厂, 注册到全局 registry,
+  REPL 调 <code>registry.execute("/task", "create", args)</code>
+  分发。
+</p>
+<p>
+  <strong>实现细节</strong>: 9 个命令 <strong>不能互相依赖</strong>
+  — /task 不该 import /memory, /schedule 不该 import /mode。 命令
+  之间的协作通过 Composition Root 注入 (deps), 不用 import 链。
+  这保证加新命令不影响旧命令。
+</p>
+<h2 id="llm-providers">llm-providers: 3 个 provider 适配</h2>
+<p>
+  <strong>用途</strong>: harness 支持 3 个 LLM provider: OpenAI /
+  Anthropic / 自部署 vLLM (OpenAI-compatible)。 <code>llm-providers.ts</code>
+  暴露 <code>createOpenAIProvider / createAnthropicProvider /
+  createVLLMProvider</code> 3 个工厂, 都返回统一 LLMClient
+  接口 (chat + stream)。 agent 主循环不直接调 OpenAI SDK, 调
+  LLMClient 接口。
+</p>
+<p>
+  <strong>真实场景</strong>: user 想 "省成本, 用自部署 vLLM 跑
+  7B 模型", 改 <code>LLM_PROVIDER=vllm</code> + <code>VLLM_BASE_URL=http://...</code>,
+  harness 自动用 vLLM provider, agent 主循环 0 改动 (因为都
+  走 LLMClient 接口)。
+</p>
+<p>
+  <strong>设计思想</strong>: <strong>Adapter 模式 + 统一接口</strong> —
+  Reference 章节 "模式 3 · 窄接口 + Adapter" 的标准应用。 3 个
+  provider 各写 adapter, 把自家协议转成 OpenAI ChatCompletion
+  格式 (内部统一标准), agent 主循环只认"OpenAI 格式"。
+</p>
+<p>
+  <strong>实现细节</strong>: 3 个 provider 的差异 (a) OpenAI 直接
+  用 <code>openai</code> SDK; (b) Anthropic 用 <code>@anthropic-ai/sdk</code>
+  + 写 adapter 把 Anthropic 响应转 OpenAI 格式; (c) vLLM 用
+  OpenAI-compatible HTTP API (和 OpenAI 一样, 只换 baseURL)。
+  Anthropic adapter 是最复杂的 (content 数组 + thinking 字段
+  + tool_use 块都要转), 教学版只覆盖 80% case。
+</p>
+<h2 id="foundation-models">foundation-models: 5 个常见 model 适配</h2>
+<p>
+  <strong>用途</strong>: 不同 LLM model 有不同的<strong>能力差异</strong>
+  (上下文窗口 / 支持 function calling / 支持 vision / 思考预算) —
+  GPT-4o 128k + tool + vision; Claude Sonnet 200k + tool +
+  thinking; Qwen2.5-7B 32k + tool (no vision)。 harness 应该
+  知道"我用的是哪个 model, 它的能力边界在哪", 避免越界调用。
+</p>
+<p>
+  <strong>真实场景</strong>: user 设 <code>LLM_MODEL=qwen2.5-7b</code>,
+  harness 查 foundation-models.ts 找到 "Qwen2.5-7B: 32k context,
+  tool support: yes, vision: no, thinking: no", LLM 试图调带
+  图片的 tool 时, harness 提示 "当前 model 不支持 vision, 请用
+  text 描述"。 防止 LLM "以为能调实际不能调"。
+</p>
+<p>
+  <strong>设计思想</strong>: <strong>能力清单 + 边界提示</strong> —
+  Reference 章节 "模式 5 · Composition Root 组合根" 的应用。
+  每个 model 一个清单, 启动时根据 LLM_MODEL 选清单, agent
+  调工具前 check 能力。
+</p>
+<p>
+  <strong>实现细节</strong>: 5 个常见 model + 1 个 fallback "unknown"
+  (任意名字走 unknown, 提示"未知 model, 用默认能力, 可能不准")。
+  教学版不维护完整 model 列表 (5 个够用), 工业版接
+  <code>litellm</code> 等开源 registry。
+</p>
+<h2 id="loop-integration">主循环集成: 8 个模块在 Composition Root 拼装</h2>
+<p>
+  <strong>用途</strong>: 8 个模块在 <code>src/index.ts</code> 的
+  Composition Root 拼装, 注入 agent。 拼装顺序很重要 —
+  config → logger → terminal → llm-providers → session → system-prompt
+  → context-ranking → cli-commands。 后者依赖前者的实例。
+</p>
+<p>
+  <strong>真实场景</strong>: Composition Root main() 流程:
+  (1) loadConfig() 读配置; (2) createLogger() 输出启动日志;
+  (3) detectTerminal() 决定颜色 / 交互; (4) createLLMClient()
+  按 provider 选 adapter; (5) createSession() 派生 session;
+  (6) createSystemPrompt(config) 拼装 system message; (7)
+  createContextRanking() 排序 history; (8) createCliCommandRegistry()
+  注册 9 个 /xxx 命令; (9) startREPL() 启动交互。
+</p>
+<p>
+  <strong>设计思想</strong>: <strong>显式依赖链</strong> —
+  Reference 章节 "模式 5 · Composition Root 组合根" 的标准
+  应用。 main() 函数一气呵成, 没人藏 import, 改模块就改 main。
+</p>
+<p>
+  <strong>实现细节</strong>: 8 个模块的实例<strong>共享一次</strong>,
+  不重复创建 (logger 重复创建 → 日志写到 2 个文件; terminal
+  重复创建 → 颜色检测 2 次可能不一致; session 重复创建 → 隔离
+  失效)。 AGENTS.md "共享实例必须字面共享" 规则。
+</p>
+<h2 id="fake-test">fake test: 用 fake env 验证 4 个不变量</h2>
+<p>
+  <strong>用途</strong>: 8 个模块的测试用<strong>fake env</strong> +
+  <strong>tmp dir</strong> + <strong>in-memory LLM client</strong> —
+  不真连 LLM, 不写真实日志文件, 不依赖真实 TTY。 测的是"模块
+  行为", 不是"外部依赖"。
+</p>
+<p>
+  <strong>真实场景</strong>: 用户写测试覆盖 4 条不变量:
+  (a) log-rotation: 日志超过 maxBytes 触发轮转, 总文件数 ≤ 6
+  (1 个 current + 最多 5 个 backup); (b) normalize: Anthropic 格式
+  消息 normalize 后变 OpenAI 格式; (c) config 优先级: CLI + env
+  + default 三层, CLI 最高; (d) session 隔离: 2 个 session history
+  互不影响。
+</p>
+<pre><code class="language-typescript">test("log-rotation: 50MB 上限 + 5 个 backup", () =&gt; {
+  const dir = fs.mkdtempSync(path.join(os.tmpdir(), "log-test-"));
+  const filePath = path.join(dir, "test.log");
+  // 写 50MB
+  fs.writeFileSync(filePath, "x".repeat(50 * 1024 * 1024));
+  // 触发轮转
+  const result = rotateLogFileIfNeeded(filePath, { maxBytes: 10 * 1024 * 1024, maxBackups: 5 });
+  expect(result.rotated).toBe(true);
+  // 验证文件数
+  const files = fs.readdirSync(dir);
+  expect(files.length).toBeLessThanOrEqual(6);   // 1 current + 5 backup
+});
 
-<div class="card card--trap">
-  <div class="card__head">
-    <span class="card__tag">中级错法 · B</span>
-  </div>
-  <div class="card__body">
-    <p><strong>常见错误:</strong>cleanup 直接删, 不先 dry-run。</p>
-    <p>
-      <strong>为什么错:</strong>误删用户文件, 用户不敢用清理功能,
-      卫生系统形同虚设。
-    </p>
-    <p><strong>正确做法:</strong>dryRun() 返回报告, 用户确认后 run() 真删。</p>
-  </div>
-</div>
+test("normalize: Anthropic content 数组 → OpenAI content 字符串", () =&gt; {
+  const input = [{ role: "assistant", content: [{ type: "text", text: "hello" }] }];
+  const output = normalizeMessages(input);
+  expect(output[0]!.content).toBe("hello");
+});
 
-<div class="card card--trap">
-  <div class="card__head">
-    <span class="card__tag">高级错法 · C</span>
-  </div>
-  <div class="card__body">
-    <p><strong>常见错误:</strong>output store 不引用计数, 孤儿文件永久占盘。</p>
-    <p>
-      <strong>为什么错:</strong>用户跑一个月,
-      <code>~/.cache/run-outputs/</code> 撑爆磁盘。
-    </p>
-    <p>
-      <strong>正确做法:</strong>写入时 refCount = 1, compress / replay 删引用时
-      -1, 归 0 24h 后可清理。
-    </p>
-  </div>
-</div>
+test("config 优先级: CLI > env > default", () =&gt; {
+  process.env.HARNESS_MODE = "auto";
+  const config = loadConfig({ cli: { mode: "plan" } });
+  expect(config.mode).toBe("plan");
+  expect(config.source.mode).toBe("cli");
+});
 
-<div class="card card--trap">
-  <div class="card__head">
-    <span class="card__tag">边界错法 · D</span>
+test("session 隔离: 2 个 session history 独立", () =&gt; {
+  const s1 = createSession({ id: "1" });
+  const s2 = createSession({ id: "2" });
+  s1.history.push({ role: "user", content: "A" });
+  expect(s2.history).toHaveLength(0);   // 不污染
+});</code></pre>
+<p>
+  <strong>实现细节</strong>: 4 个测试覆盖 4 个模块的核心不变量。
+  log-rotation 用真实 fs, 测完清理; normalize 用 in-memory 数组;
+  config 用 process.env 模拟 env; session 用工厂隔离测试。 不需要
+  mock 整个进程, 教学友好。
+</p>
+<h2 id="common-confusion">常见误解: hardening 不是新功能</h2>
+<p>
+  <strong>误解 1: "hardening 是新功能?"</strong> 错。 hardening
+  是<strong>已有功能的安全网</strong>, 不增加业务能力, 只让已有
+  能力<strong>在生产里不出事</strong>。 没有它, harness 在开发
+  机能跑, 在生产跑挂。
+</p>
+<p>
+  <strong>误解 2: "只支持一个 LLM 就行, 不需要 normalize?"</strong>
+  错。 商业上不行 — 用户想换 provider 节省成本, 没 normalize
+  层就要改 agent 主循环 (几千行代码), 不可能。
+</p>
+<p>
+  <strong>误解 3: "config 默认值够用, 不需要 CLI / env?"</strong>
+  错。 CI 跑测试时 mode=auto, 日常 mode=default, 团队规范要
+  mode=plan 培训新成员。 3 种来源必须都能覆盖。
+</p>
+<p>
+  <strong>误解 4: "日志不重要, 撑爆就撑爆?"</strong> 错。 出问题
+  时日志是<strong>唯一</strong>复盘依据。 没日志 = 第二次事故
+  仍然无解。 轮转是必须的, 不是 nice-to-have。
+</p>
+<h2 id="trap">反例梯度: 4 个常见错误</h2>
+<div class="cards-grid">
+  <div class="card card--bad">
+    <div class="card__head"><span class="card__tag">反例 1 · 日志不轮转</span></div>
+    <div class="card__body">
+      <p>harness 跑 3 个月, harness.log 撑到 100GB, 磁盘满,
+        系统挂。 排查事故时发现 /var/log 已写满, 之前的日志被
+        系统 rotate 覆盖, 关键错误日志丢。 错。 正确: 50MB
+        轮转 + 5 个 backup, 总占用 ≤ 300MB, 关键日志保留 7 天。</p>
+    </div>
   </div>
-  <div class="card__body">
-    <p>
-      <strong>常见错误:</strong>时间用 Date 字符串 ("2024-01-01T00:00:00Z")
-      存盘。
-    </p>
-    <p>
-      <strong>为什么错:</strong>跨时区比较容易出错, 字符串排序不等于时间排序。
-    </p>
-    <p>
-      <strong>正确做法:</strong>内部全部用 epoch ms, IO 边界转 ISO 字符串,
-      内部不允许 Date 字符串。
-    </p>
+  <div class="card card--bad">
+    <div class="card__head"><span class="card__tag">反例 2 · 直接调 OpenAI SDK</span></div>
+    <div class="card__body">
+      <p>agent.ts 直接 <code>import OpenAI from "openai"; const
+        client = new OpenAI(...);</code>, 假设 OpenAI 协议。
+        换 Anthropic 时改 agent.ts 几千行。 错。 正确: 用
+        LLMClient 抽象接口 + 3 个 provider adapter, agent.ts
+        只调 <code>llmClient.chat(messages, tools)</code>, 换
+        provider 只改 config。</p>
+    </div>
   </div>
-</div>
-
-<h2 id="validate">如何验证 (本章 Validation 卡片)</h2>
-<div class="card card--validation">
-  <div class="card__head">
-    <span class="card__tag">Validation · 第 15 章</span>
+  <div class="card card--bad">
+    <div class="card__head"><span class="card__tag">反例 3 · config 不分优先级</span></div>
+    <div class="card__body">
+      <p>config 一把抓, default 值 + env + CLI 全混在一起,
+        <code>Object.assign(defaults, cliArgs, envVars)</code>。
+        用户设 CLI 期望覆盖, 实际 env 覆盖了 CLI (Object.assign
+        后面的参数赢, env 排在最后)。 错。 正确: 显式优先级 (CLI &gt; env &gt; default),
+        内部按顺序 merge, 不打平, 保留 source 字段调试。</p>
+    </div>
   </div>
-  <div class="card__body">
-    <p>
-      <strong>原子写不破坏原文件:</strong>fake atomicWriteFile 中断 (rename
-      之前抛错), 跑完后原文件仍然存在且内容完整, tmp 文件可能被 留 (不破坏, 可由
-      cleanup 后续清理)。
-    </p>
-    <p>
-      <strong>日志按大小轮转:</strong>fake 写 25MB 日志, 默认 maxFileSize =
-      10MB, 跑完后 <code>app.log</code> 大小 ≤ 10MB, 至少 2 个
-      <code>app.log.N</code> 备份文件存在。
-    </p>
-    <p>
-      <strong>output store 引用计数:</strong>fake 写 3 个 output, refCount
-      分别为 3 / 1 / 0, 跑完 cleanup dryRun, 报告 outputOrphans 含 refCount=0 那
-      1 个, 不含其他。
-    </p>
-    <p>
-      <strong>cleanup dry-run 不真删:</strong>fake dryRun 返回报告含 5 个文件,
-      run() 之前文件仍在, run(report) 之后文件不在, 报告 标记 deleted 字段 = 5。
-    </p>
-    <p>
-      <strong>时间语义统一为 epoch ms:</strong>memory.set / memory.get updatedAt
-      字段都是 number (不是 string), comparison 用 &lt; / &gt;, 不是
-      localeCompare。
-    </p>
+  <div class="card card--bad">
+    <div class="card__head"><span class="card__tag">反例 4 · 多 session 共享 history</span></div>
+    <div class="card__body">
+      <p>把 <code>let history: Message[] = []</code> 放模块顶层,
+        多个 session 共享同一个数组。 user A 写的对话 user B
+        看到, 数据泄露 + 互相干扰。 错。 正确:
+        <code>createSession({id})</code> 每次返回独立实例, history
+        在闭包内, 互不污染。</p>
+    </div>
   </div>
 </div>
-
-<h2 id="lookback">回望第 00–14 章: 哪些原则在本章兑现了</h2>
-<ul>
-  <li>
-    <strong>原子性原则:</strong>所有持久化 IO 走 atomic write, 跨进程安全,
-    断电安全。
-  </li>
-  <li>
-    <strong>不可逆操作必须 dry-run:</strong>cleanup / rotation
-    等会删除数据的操作, 必须先报告, 用户确认后再做。
-  </li>
-  <li>
-    <strong>内部类型稳定:</strong>时间用 epoch ms (number), 字符串只在 IO
-    边界出现, 内部不允许 Date 字符串。
-  </li>
-  <li>
-    <strong>事实与视图分离:</strong>output store 是事实, refCount 是视图,
-    清理是另一个视图, 三者职责清晰分离。
-  </li>
-</ul>
-
-<h2 id="forward">前瞻张力: 留给后续章节</h2>
-<dl class="defs">
-  <dt>transcript 长期保留</dt>
-  <dd>
-    transcript 数据会随时间累积, 也需要类似的清理策略 (本章不展开, 留给专题章)。
-  </dd>
-  <dt>cleanup 跨进程协调</dt>
-  <dd>多 harness 实例同时跑时, cleanup 需要文件锁防重复删, 留作后续优化。</dd>
-  <dt>output store 跨进程引用</dt>
-  <dd>引用计数在单进程内准确, 跨进程时需要更复杂的分布式锁, 留作后续优化。</dd>
-  <dt>eval 重放的卫生系统</dt>
-  <dd>
-    eval 跑大量 trace 时也会产生大量临时文件, 需要单独的 cleanup 路径,
-    专题章展开。
-  </dd>
-</dl>
-
-<h2 id="vibe-coding-15">本次如何 vibe code: 第 15 章的三件套</h2>
-
-<h3 id="vibe-feed-15">拆卡: 4 轮迭代的具体产物</h3>
+<h2 id="validate">Validation: 4 条不变量检验</h2>
 <ol>
-  <li>
-    <strong>第 1 轮 · 接口</strong>。让 LLM 给出
-    <code>atomicWriteFile()</code> / <code>logRotation</code> /
-    <code>OutputHandle</code> / <code>Cleanup</code> /
-    <code>timeline</code> 五个模块的 interface。本轮不写实现, 重点钉"原子性 /
-    dry-run / 引用计数 / 时间统一" 四条原则。
-  </li>
-  <li>
-    <strong>第 2 轮 · 接线</strong>。让 LLM 给出 atomic-write / log-rotation /
-    cleanup 三个新模块的 stub, 现有持久化 IO 仍走 fs.writeFile (暂不切换)。本轮
-    review 重点: 新模块工厂化, 现有调用点暂时不破坏。
-  </li>
-  <li>
-    <strong>第 3 轮 · 边界</strong>。让 LLM 写五个模块实现 + 把现有持久化 IO
-    切换到 atomic write + output store 加引用计数 + 时间统一为 epoch ms。本轮
-    review 重点: 切换路径走 grep 验证全部持久化 IO 都过 atomic, 时间字段全部
-    number。
-  </li>
-  <li>
-    <strong>第 4 轮 · 验证</strong>。让 LLM 写
-    <code>test/hardening.test.ts</code>。本轮 review 重点: "原子写不破坏原文件"
-    和 "cleanup dry-run 不真删" 两条必须有 spy 验证。
-  </li>
-</ol>
-
-<h3 id="vibe-review-15">Review: 第 15 章专属 checklist</h3>
-<ol>
-  <li>
-    <strong>所有持久化 IO 走 atomicWriteFile。</strong>验证:
-    <code>grep -rn 'fs.writeFile\|fs.promises.writeFile' src/</code> 应当 0 行
-    (除 atomic-write.ts 内部)。
-  </li>
-  <li>
-    <strong>cleanup dry-run 不真删。</strong>验证: dryRun() 内部不调 fs.unlink。
-  </li>
-  <li>
-    <strong>output store 引用计数。</strong>验证: refCount 字段在 write /
-    release 路径上变化, dryRun() 过滤 refCount === 0 的孤儿。
+<li>
+<strong>日志轮转不丢内容</strong>: 写 50MB 日志, 触发轮转, 验证
+    总内容字节数 = 轮转前字节数 (rename 是原子的, 不丢)。
   </li>
-  <li>
-    <strong>时间字段全部 number。</strong>验证:
-    <code>grep -rn 'new Date()' src/</code> 在持久化路径上 (memory / task /
-    schedule 写入) 应当 0 行, timeline.ts 集中转换。
+<li>
+<strong>normalize 对称</strong>: 任意 provider 格式 normalize 成
+    OpenAI 格式, 字段全部对应 (content 字符串 / tool_call 结构 /
+    role 枚举), agent 主循环无差别处理。
   </li>
-  <li>
-    <strong>log rotation 不丢日志。</strong>验证: 写日志时先写新文件再 rename
-    旧文件, 不存在"日志写到一半, 旧文件已 rename" 的窗口。
+<li>
+<strong>config 优先级确定</strong>: CLI + env + default 三层各
+    设不同值, 验证: 最终值 = CLI 值, source.mode = "cli"。
   </li>
-</ol>
-
-<h3 id="vibe-debug-15">调试: 第 15 章典型伪装</h3>
-<ol>
-  <li>
-    <strong>伪装 A · 直接 fs.writeFile 替代 atomicWriteFile。</strong>症状:
-    某次代码提交忘了切路径, 仍然 fs.writeFile。验证:
-    <code>grep -rn 'fs.writeFile' src/</code> 不为 0 行时,
-    跑"原子写不破坏原文件" 测试应当抓到。
-  </li>
-  <li>
-    <strong>伪装 B · cleanup dryRun 内部偷偷 unlink。</strong>症状: dryRun()
-    顺手删了文件, 跑 dryRun 后再 run 找不到文件。验证: Validation 卡片"cleanup
-    dry-run 不真删" 那条测试通过 (run 之前文件存在)。
-  </li>
-  <li>
-    <strong>伪装 C · 时间用 Date 字符串, 不统一。</strong>症状: memory.set 的
-    updatedAt 是 <code>new Date().toISOString()</code>。验证:
-    <code>grep -rn 'toISOString' src/</code> 在持久化写入路径上应当 0 行, 只在
-    UI 渲染路径允许。
+<li>
+<strong>session 隔离</strong>: 建 2 个 session, 只向 s1 写 1 条 history, 验证
+    互不影响 (s1.history.length === 1, s2.history.length === 0)。
   </li>
 </ol>
-
-<h3 id="vibe-iterate-15">迭代: 第 15 章 5 个 commit 节点</h3>
-<ol>
-  <li>
-    <code
-      >feat(ch15): 钉 atomicWriteFile / logRotation / OutputHandle / Cleanup /
-      timeline 五个模块接口</code
-    >
-    —— tsc 通过, 无实现。
+<h2 id="lookback">回望: 哪些原则在本章兑现了</h2>
+<ul>
+<li>
+<strong>Adapter 模式</strong>: 3 个 LLM provider 各写 adapter,
+    内部统一 OpenAI 格式, agent 主循环无差别。
   </li>
-  <li>
-    <code
-      >feat(ch15): atomic-write + log-rotation + cleanup 三个模块 stub, 现有 IO
-      不切换</code
-    >
-    —— tsc 通过, stub noop。
+<li>
+<strong>显式优先级</strong>: config CLI &gt; env &gt; default,
+    保留 source 字段调试。
   </li>
-  <li>
-    <code
-      >feat(ch15): 切换所有持久化 IO 到 atomic + output store 引用计数 +
-      时间统一</code
-    >
-    —— 跑通 Validation 卡片前 3 条。
+<li>
+<strong>工厂 + 闭包隔离</strong>: session / logger / terminal
+    每次创建独立实例, 闭包内 state 互不污染。
   </li>
-  <li>
-    <code>feat(ch15): cleanup dry-run + run + 引用计数过滤</code> —— 跑通
-    Validation 卡片后 2 条。
+<li>
+<strong>分层拼装</strong>: system-prompt / context-ranking 按
+    层拼, 缺段跳过, 单独函数单独测试。
   </li>
-  <li>
-    <code
-      >test(ch15): 现有持久化路径全部走 atomic (grep 验证) + 时间全部 number
-      (grep 验证)</code
-    >
-    —— 全绿。
+<li>
+<strong>安全默认值</strong>: log-rotation 50MB + 5 backup,
+    terminal 默认 80 宽, config 默认 mode=default。
   </li>
-</ol>
-
+</ul>
+<h2 id="forward">前瞻张力: 留给后续章节</h2>
+<dl class="defs">
+<dt>完整 model registry</dt>
+<dd>
+    教学版 5 个 model + unknown fallback, 工业版接 litellm 等
+    开源 registry, 100+ model 覆盖。 留 P2 阶段。
+  </dd>
+<dt>多 region LLM 路由</dt>
+<dd>
+    当前一个 LLM client, 工业级想要"美国区用 OpenAI, 欧洲区用
+    Anthropic, 中国区用 vLLM", 涉及 LLM 路由 + 故障转移。 留
+    P2 阶段。
+  </dd>
+<dt>端到端 trace (OpenTelemetry)</dt>
+<dd>
+    当前 logger 写文件, 工业级想接 OpenTelemetry 看分布式 trace
+    (agent → subagent → LLM → tool 链路), 涉及 trace SDK 接入。
+    留 P2 阶段。
+  </dd>
+</dl>
 <h2 id="prompt-card">Prompt Card (本章任务)</h2>
 <div class="card card--prompt">
   <div class="card__head">
     <span class="card__tag">Prompt Card · 第 15 章</span>
-    <button class="card__copy" type="button" data-copy-card>复制</button>
+    <button class="card__copy" data-copy-card="" type="button">复制</button>
   </div>
   <div class="card__body">
-    <p>
-      <strong>目标:</strong>实现 Runtime Hardening, 覆盖原子写、日志轮转、output
-      handle 引用计数、清理 dry-run、时间语义统一五个卫生系统。
-    </p>
-    <p>
-      <strong>场景:</strong>harness 跑一个月, 用户跑 cleanup dryRun, 报告含 5
-      个孤儿 output + 2 个老日志, 用户确认后 run() 删 7 个文件, 释放 50MB。
-    </p>
-    <p>
-      <strong>模块:</strong> <code>src/atomic-write.ts</code> (新) 暴露
-      <code>atomicWriteFile()</code>; <code>src/log-rotation.ts</code> (新) 暴露
-      <code>createLogRotation(config)</code>;
-      <code>src/output-store.ts</code> (改) 加引用计数 + refCount 字段;
-      <code>src/cleanup.ts</code> (新) 暴露 <code>createCleanup()</code>;
-      <code>src/timeline.ts</code> (改) 统一时间语义为 epoch ms。
-    </p>
+    <p><strong>目标:</strong> 给 harness 加 8 个运行时安全网, 让
+      前面 14 章的所有模块在生产里跑稳。</p>
+    <p><strong>场景:</strong> team 上线 harness 跑 3 个月, 日志不
+      撑爆 + 多 LLM 可切换 + 多 session 不互干扰 + 配置可覆盖 +
+      终端可适配。 8 个模块各管一摊, 缺一个就出事故。</p>
+    <p><strong>模块:</strong> <code>src/log-rotation.ts</code> (新) /
+      <code>src/normalize.ts</code> (新) / <code>src/config.ts</code> (新) /
+      <code>src/session.ts</code> (新) / <code>src/system-prompt.ts</code> (新) /
+      <code>src/context-ranking.ts</code> (新) / <code>src/terminal.ts</code> (新) /
+      <code>src/cli-commands.ts</code> (新) / <code>src/llm-providers.ts</code> (新) /
+      <code>src/foundation-models.ts</code> (新); <code>src/index.ts</code> (改) Composition
+      Root 拼装。</p>
     <p><strong>边界 (LLM 必须遵守的 checklist):</strong></p>
     <ul>
-      <li>所有持久化 IO 走 atomicWriteFile, 不直接 fs.writeFile</li>
-      <li>cleanup dryRun 不真删, run 之前文件存在</li>
-      <li>output store 引用计数: write 时 refCount = 1, release 时 -1</li>
-      <li>时间字段全部 number (epoch ms), 字符串只在 IO 边界</li>
-      <li>log rotation 不丢日志: 先写新文件再 rename 旧文件</li>
+      <li>日志轮转: 50MB maxBytes + 5 backups, 写日志前检查, rename 原子</li>
+      <li>normalize: 内部统一 OpenAI ChatCompletionMessageParam 格式, 3 provider adapter 转协议</li>
+      <li>config 优先级: CLI &gt; env &gt; default, 保留 source 字段调试</li>
+      <li>session 隔离: createSession 每次独立实例, history / tool / llm client 互不共享</li>
+      <li>system-prompt 分层: baseRules + memory + skill + toolRules + contextSummary, 缺段跳过</li>
+      <li>context-ranking: 分级保留 (system 100, 工具错误 90, 最近 5 轮 80, ...)</li>
+      <li>terminal: TTY + NO_COLOR + TERM + columns 4 点检测, 关闭颜色 / 固定 80 宽</li>
+      <li>cli-commands: 9 个 /xxx 命令注册, 命令之间不互相 import, 通过 deps 协作</li>
+      <li>llm-providers: 3 个工厂, 内部统一 LLMClient 接口, agent 主循环 0 改动</li>
+      <li>foundation-models: 5 个 model + unknown fallback, 能力清单 + 边界提示</li>
     </ul>
-    <p><strong>验证 (用 fake IO + spy, 逐条落到 vitest):</strong></p>
+    <p><strong>验证 (用 fake env + tmp dir + in-memory LLM + vitest, 逐条断言):</strong></p>
     <ul>
-      <li>fake atomicWriteFile 中断 (rename 前抛错), 原文件仍存在且内容完整</li>
-      <li>
-        fake 写 25MB 日志, 默认 maxFileSize=10MB, app.log ≤ 10MB 且 ≥ 2 个
-        app.log.N 备份
-      </li>
-      <li>
-        fake 写 3 个 output, refCount 3/1/0, dryRun 报告含 refCount=0 那 1 个
-      </li>
-      <li>fake dryRun 返回 5 个文件, run 之前文件存在, run 后文件不存在</li>
-      <li>memory.set / get 的 updatedAt 字段都是 number, 不是 string</li>
+      <li>log-rotation: 写 50MB 触发 2 次轮转, 文件数 ≤ 6, 总字节数不变</li>
+      <li>normalize: Anthropic content 数组 → OpenAI content 字符串, 字段对齐</li>
+      <li>config: CLI + env + default 三层各设值, 最终 = CLI, source.mode = "cli"</li>
+      <li>session 隔离: 2 个 session 写 history, 互不污染</li>
     </ul>
   </div>
 </div>
-
 <h2 id="practice">本章练习</h2>
 <ol>
-  <li>
-    故意把 memory.set 改回 fs.writeFile (不走 atomic), 跑测试,
-    看"原子写不破坏原文件" 是否抓到。
+<li>
+    故意不写 log-rotation, 跑 3 个月模拟 (写 50MB 日志), 看"日志
+    轮转" 是否抓到 (单文件 50MB vs. 6 个文件总 60MB)。
   </li>
-  <li>
-    在 dryRun() 内部偷偷 fs.unlink, 跑测试, 看"cleanup dry-run 不真删"
-    是否抓到。
+<li>
+    故意让 agent.ts 直接 import "openai" SDK, 切换 provider 到
+    Anthropic, 看"normalize 缺失" 是否抓到 (agent.ts 报错 vs.
+    切 provider 只改 config)。
   </li>
-  <li>
-    把 memory.set 的 updatedAt 改回 <code>new Date().toISOString()</code>,
-    跑测试, 看"时间字段全部 number" 是否抓到 (sort 行为会变)。
+<li>
+    故意让 config 一把抓 <code>Object.assign(defaults, cli, env)</code>,
+    跑 CLI + env 各设不同值测试, 看"显式优先级" 是否抓到 (env
+    覆盖 CLI vs. CLI 覆盖 env)。
+  </li>
+<li>
+    故意把 <code>let history = []</code> 放模块顶层, 跑 2 个
+    session 隔离测试, 看"session 隔离" 是否抓到 (s1 写 s2 看到
+    vs. 互不影响)。
   </li>
 </ol>
-
 <h2 id="summary">本章小结</h2>
 <p>
-  本章给 harness 加了 Runtime Hardening 卫生系统: 原子写、日志轮转、 output
-  store 引用计数、清理 dry-run、时间语义统一。这五条原则 是 harness
-  长期运行的卫生底线, 任何持久化 IO 都必须遵守。 至此主线教程 16
-  章全部完成。下一专题 (eval) 讨论如何测试一个 不确定的 Coding
-  Agent——构建可重放的 trace 库、判定 LLM 输出 是否"做对"、用 eval 反馈给 prompt
-  优化。
+  Hardening 是给 harness 跑生产时的<strong>8 个安全网</strong>,
+  不增加业务能力, 只让已有能力<strong>在生产里不出事</strong>。
+  核心是 5 个原则:
 </p>
-
-<h2 id="next">下一章伏笔</h2>
+<ul>
+<li>
+<strong>Adapter 模式</strong>: 3 个 LLM provider + normalize 层,
+    内部统一格式, 换 provider 0 改动。
+  </li>
+<li>
+<strong>显式优先级</strong>: config CLI &gt; env &gt; default,
+    保留 source 字段调试。
+  </li>
+<li>
+<strong>工厂 + 闭包隔离</strong>: session / logger / terminal
+    每次独立, 多用户多项目不互污染。
+  </li>
+<li>
+<strong>分层拼装</strong>: system-prompt / context-ranking 按
+    层拼, 缺段跳过, 单独函数单独测试。
+  </li>
+<li>
+<strong>安全默认</strong>: log-rotation 50MB, terminal 80 宽,
+    config mode=default, overlap=skip, missed=skip。</li>
+</ul>
 <p>
-  第 15 章让 harness 在长期运行下保持稳定, 但 LLM 行为是不确定的 —— 同一 prompt
-  跑 10 次, 可能 7 次对、3 次错。专题章 eval 讨论 如何"测试一个不确定的 Coding
-  Agent": 构建可重放的 trace 库 (deterministic LLM stub), 判定 LLM
-  输出是否"做对" (judge prompt), 用 eval 反馈给 prompt 优化 (regression test)。
+  教程到 15 章完结。 下一份文档是 <strong>专题 A · model-policy</strong>,
+  讲 LLM 选型的策略层 — 哪些 model 适合长任务, 哪些适合短
+  对话, model 切换的 graceful degradation, 成本 vs 质量的权衡。
 </p>
diff --git a/tutorial/chapters/eval.html b/tutorial/chapters/eval.html
index c62e217..01f329f 100644
--- a/tutorial/chapters/eval.html
+++ b/tutorial/chapters/eval.html
@@ -1,537 +1,2006 @@
 <p class="article__eyebrow">专题 B · 测试不确定系统</p>
 <h1 class="article__title">如何测试一个不确定的 Coding Agent</h1>
 <p class="article__lede">
-  Coding Agent 会调用真实 LLM, 但测试不能一开始就依赖真实 LLM。本专题
-  讨论"测试一个不确定系统" 的方法论: 用 deterministic LLM stub 让测试
-  稳定可重放, 用 trace assertion 让行为可观测, 用 live judge 在固定
-  路径上做现实校准。
+  Coding Agent 调真实 LLM, 但测试不能一开始就依赖真实 LLM。本专题讲
+  harness 是怎么把"测试不确定系统" 拆成 4 层梯度的: 用 scripted LLM
+  让行为可重放, 用 trace 把中间过程结构化, 用 live regression 在固定路径
+  上做现实校准, 用 LLM judge 处理开放式语义质量。MCP 和 Agent Team
+  也属于同一套方法论, 但当前仅作为 harness 原型保留。
+</p>
+<nav aria-label="页内小节" class="article__meta" id="article-inline-toc"></nav>
+<hr class="rule"/>
+<h2 id="why-eval">为什么 Coding Agent 难测</h2>
+<p>
+  跑一个 coding agent 的时候, 你面对的"被测对象"其实有 3 层不确定性。
+  朴素做法是直接连真实 LLM 跑断言, 这 3 层不确定性会同时放大。
+  下面先用 3 个真实失败故事说明"朴素做法哪里坏", 再讲我们的方法。
+</p>
+<h3>失败故事 1 · 修了 prompt, 测试全挂</h3>
+<p>
+  某次更新里我们改了一行 system prompt 措辞, 想让模型"更简洁地总结"。
+  第二天 CI 红了 200 多个 case, 但代码逻辑根本没动。
+  </p>
+<p>
+  <strong>根因</strong>: 那些 case 在断言"最终回复的完整文本"。
+  LLM 稍微改一下措辞, 文本差异就触发 golden snapshot 失败。
+  我们把这种失败叫做<strong>假阳性</strong>: 不是代码 bug, 是测试方法不对。
+  修复方式是改 case, 不是改代码。
+</p>
+<p>
+  这类问题的核心: <strong>不要断言模型"说了什么", 要断言模型"做了什么"</strong>。
+  "说了什么" 由 LLM 决定, 你控制不了; "做了什么" (调了哪个工具、读了什么文件) 由 harness 决定, 你能测。
+</p>
+<h3>失败故事 2 · 改了 tool schema, 真实 LLM 行为变了</h3>
+<p>
+  某次更新里我们给 <code>run_bash</code> 工具加了一个新参数 <code>timeoutMs</code>。
+  跑真实 LLM 时, 模型开始尝试用它, 但因为某些模型对 schema 变化敏感, 选择工具的策略整体漂移 —— 本来用 <code>run_read</code> 的场景, 现在用 <code>run_bash + cat</code>。
+  </p>
+<p>
+  <strong>根因</strong>: 测试完全依赖真实 LLM 行为, 任何上游变动都会放大成 case 失败。
+  这类问题更难定位: 你不知道是"模型行为变了" 还是"代码逻辑错了"。
+  修复方式是<strong>把模型行为和代码逻辑分开测</strong>。
+</p>
+<h3>失败故事 3 · 修了"代码 review 风格" 的 agent, 没法用代码判断</h3>
+<p>
+  我们做了一个 review agent, 输入一段代码, 它输出"哪里可以改进"。
+  没有正确答案, 没有 golden snapshot, 没法用 <code>expect(output).toBe(...)</code>。
+  </p>
+<p>
+  <strong>根因</strong>: 有些任务本身就是<strong>开放式的</strong>, 没有任何代码判断
+  "这个 review 好不好"。硬要写断言, 只能写非常宽松的正则,
+  等于没测。
+  修复方式是<strong>让另一个 LLM 来评这个 LLM</strong> (judge), 但要严格
+  控制"评的依据" 和"评的输出格式"。
+</p>
+<h3>3 类不确定性 → 3 种测试方式</h3>
+<p>
+  上面 3 个故事对应 3 类不确定性, 每一类需要不同的测试方式:
 </p>
-
-<nav id="article-inline-toc" class="article__meta" aria-label="页内小节"></nav>
-
-<hr class="rule" />
-
-<h2 id="delta-from-15">在第 15 章基础上改了什么</h2>
-<p>
-  这一章不修改 harness 主代码, 而是在 <code>src/eval/</code> 下加一个
-  独立测试层。eval 层用 driver / trace / assertion 三个组件, 从外部 观察 agent
-  行为, 跑场景化的测试 case, 产出 report 文件。eval 层 与 harness 共享
-  <code>src/</code> 下的 interface (例如 LLMClient、
-  ToolRegistry、PermissionManager), 但不修改这些 interface 的实现。
-</p>
-<div class="source-links" aria-label="本章 GitHub 永久链接">
-  <a
-    class="source-link"
-    href="https://github.com/pingp76/swoopcode/blob/main/src/eval/README.md"
-    target="_blank"
-    rel="noreferrer"
-    >1. src/eval/README.md: eval 入口</a
-  >
-  <a
-    class="source-link"
-    href="https://github.com/pingp76/swoopcode/blob/main/src/eval/core/driver.ts"
-    target="_blank"
-    rel="noreferrer"
-    >2. src/eval/core/driver.ts: 跑场景的驱动器</a
-  >
-  <a
-    class="source-link"
-    href="https://github.com/pingp76/swoopcode/blob/main/src/eval/core/trace.ts"
-    target="_blank"
-    rel="noreferrer"
-    >3. src/eval/core/trace.ts: 行为 trace 采集</a
-  >
-  <a
-    class="source-link"
-    href="https://github.com/pingp76/swoopcode/blob/main/src/eval/core/assertions.ts"
-    target="_blank"
-    rel="noreferrer"
-    >4. src/eval/core/assertions.ts: 行为断言</a
-  >
-</div>
-
-<h2 id="author-thinking">作者怎么想的: 这一章的思考链</h2>
 <dl class="defs">
-  <dt>想清楚现象</dt>
-  <dd>
-    同一段 prompt 跑 10 次 coding agent, 可能 7 次对、3 次错。普通单元
-    测试只断言"返回了字符串", 抓不到这 30% 的失败。eval 必须能 "重放 + 断言 +
-    报告", 才能让 harness 的行为可被讨论。
-  </dd>
-  <dt>想反例</dt>
-  <dd>
-    最朴素的反例是"用 e2e 测试, 跑真 LLM"。这有两个问题: 一是 LLM 输出不确定,
-    测试 flaky, 跑 10 次挂 3 次, 团队对测试失去信任; 二是 token 成本贵, 跑一次
-    e2e 要花 $1, 跑 1000 个 case 就破产。
+<dt>Runtime 不确定性</dt>
+<dd>
+    消息顺序对不对? tool_call 和 tool_result 配没配对? 权限检查有没有漏?
+    Context 超限时压缩行为是否正确? 一次更新会不会改坏这些?
+    <strong>测试方式</strong>: 用 scripted LLM (预设 response 序列),
+    让模型行为完全可预测, 测的就是 harness 本身。
   </dd>
-  <dt>想接口和不变量</dt>
-  <dd>
-    接口:
-    <code
-      >interface Driver { run(scenario) }, interface Trace { events }, interface
-      Assertion { assert(trace) }</code
-    >。 不变量三条: (1) deterministic LLM stub 必须能预设 LLM 响应, 记录 所有
-    LLM 输入, 跑 1000 次结果一致, (2) trace 必须能反向回放 (从 trace 重现 agent
-    行为), (3) assertion 必须能拆"行为是否 符合 Prompt Card 边界",
-    而不是只断最终文本。
+<dt>模型不确定性</dt>
+<dd>
+    同一个 query, 同一个模型, 不同时间可能返回略不同的回答。
+    不同模型行为差异更大 (Kimi 强 coding, GLM 强长上下文, Qwen 强 agent)。
+    <strong>测试方式</strong>: 用真实 LLM 跑固定 case 集, 用结构性
+    hard 断言 (工具被调、文件存在) 判断"能力不退化", 不判断
+    "措辞变没变"。偶尔开启 judge 评开放式质量。
   </dd>
-  <dt>想怎么验证</dt>
-  <dd>
-    fake LLM 预设 3 轮响应, 跑同一个 scenario 1000 次, 断言 trace 完全一致
-    (除了时间戳)。
+<dt>任务本身的不确定性</dt>
+<dd>
+    "改 README 的标题" 是结构化的, 改完用 diff 断言就行;
+    "代码 review 这段是否合理" 是开放式的, 只能让另一个 LLM 评。
+    <strong>测试方式</strong>: 用 judge LLM 读 trace 摘要 (不是完整日志),
+    给出结构化评分, judge prompt 不进 agent system prompt。
   </dd>
 </dl>
+<p>
+  <strong>核心方法</strong>: 把 3 类不确定性分开, 每一类用最合适的方式测。
+  混在一起测, 任何一类波动都会污染其他两类的判断。
+</p>
+<figure class="figure">
+<div class="flow-map" role="img" aria-label="测试不确定系统的 4 层梯度">
+<div class="flow-row">
+<span class="flow-node flow-node--accent">P0 · deterministic</span>
+<span class="flow-arrow">→</span>
+<span class="flow-node">scripted LLM + hard 断言<br/><small>守 runtime, 默认 CI 跑</small></span>
+</div>
+<div class="flow-row">
+<span class="flow-arrow">↓</span>
+</div>
+<div class="flow-row">
+<span class="flow-node flow-node--accent">P1 · replay</span>
+<span class="flow-arrow">→</span>
+<span class="flow-node">录制的 response fixture<br/><small>复现"那一次成功"</small></span>
+</div>
+<div class="flow-row">
+<span class="flow-arrow">↓</span>
+</div>
+<div class="flow-row">
+<span class="flow-node flow-node--accent">P2 · live smoke</span>
+<span class="flow-arrow">→</span>
+<span class="flow-node">真实 LLM + 极少 case + 软门禁<br/><small>验证 wrapper 通, 不阻塞 PR</small></span>
+</div>
+<div class="flow-row">
+<span class="flow-arrow">↓</span>
+</div>
+<div class="flow-row">
+<span class="flow-node flow-node--accent">P3 · live regression + judge</span>
+<span class="flow-arrow">→</span>
+<span class="flow-node">真实 LLM + hard 断言 + judge<br/><small>发布前 / 大改后触发</small></span>
+</div>
+</div>
+<figcaption>图 1 · 4 层梯度. 越往下越接近真实, 越往上越快越稳. 每一层都只关心一类不确定性。</figcaption>
+</figure>
+<h2 id="architecture">整体架构: Driver 边界中立</h2>
+<p>
+  4 层梯度都需要一个<strong>中立 runner</strong> 来执行 case。
+  朴素做法是在 runner 里直接 <code>import { createAgent } from "../agent.js"</code>,
+  这条路走 3 步就会撞墙。下面先讲"撞墙过程", 再讲"Driver 边界怎么解"。
+</p>
+<h3>朴素做法的 3 个撞墙</h3>
+<ol>
+<li>
+<strong>主代码改了, 测试全挂 (假阳性)</strong>: runner 绑死当前项目
+      内部类型, 任何重构都会让 case 失败, 但失败原因可能是 "case
+      写了私有事件细节", 不是代码 bug。
+    </li>
+<li>
+<strong>换被测对象写不出来</strong>: 假设想测一个 CLI 黑盒工具 (比如
+      包装另一个 agent), 因为 runner 写死了 <code>createAgent()</code>
+      调用, 没法换别的实现。
+    </li>
+<li>
+<strong>tool / permission / transcript 私有事件泄漏</strong>: case 写着
+      "transcript 事件的 historySequence == 5" 这种断言,
+      测的是实现细节, 改 main loop 就全挂。
+    </li>
+</ol>
+<p>
+  这 3 个撞墙都被同一个抽象层解决: <strong>CodingAgentDriver 接口</strong>。
+  Runner 只看这个接口, 不看 driver 内部。
+</p>
+<h3>CodingAgentDriver 接口</h3>
+<pre class="code-block"><code>interface CodingAgentDriver {
+  startCase(context: AgentCaseContext): Promise&lt;void&gt;;
+  send(input: AgentInput): Promise&lt;AgentTurnResult&gt;;
+  readEvents?(): Promise&lt;AgentRuntimeEvent[]&gt;;
+  close(): Promise&lt;void&gt;;
+}
 
-<h2 id="observe-first">先观察: 两段故意有气味的实现</h2>
+interface AgentCaseContext {
+  caseId: string;
+  workspaceRoot: string;        // 临时目录, driver 内部用
+  metadata?: Record&lt;string, unknown&gt;;
+}
 
-<div class="note">
-  <p class="note__title">观察 1 · 用真 LLM 跑 e2e 测试</p>
-  <pre class="code-block"><code>// 教学简化版, 仓库真实实现见 GitHub 永久链接
-// 错误: 用真 LLM 跑测试
-it("agent 应该读文件", async () =&gt; {
-  const answer = await agent.run("读 src/agent.ts");
-  expect(answer).toContain("createAgent");  // flaky
-}, 60_000);  // 60s 超时</code></pre>
-  <p><strong>问:</strong>为什么不直接跑真 LLM?</p>
-  <p>
-    <strong>答:</strong>三件事同时坏掉 —— flakiness: 同一 prompt 跑 10 次, LLM
-    输出可能略有差异, 测试 30% 挂, 团队不敢信测试; 成本: 1000 个 case × $1 =
-    $1000/次 CI; 速度: 60s timeout, CI 跑 100 个 case 要 100 分钟。
-  </p>
-</div>
+interface AgentInput {
+  stepId: string;              // 多 step case 用
+  query: string;
+}
 
-<div class="note">
-  <p class="note__title">观察 2 · 断言只断最终文本</p>
-  <pre class="code-block"><code>// 教学简化版
-expect(answer).toContain("createAgent");</code></pre>
-  <p><strong>问:</strong>为什么不只断最终文本?</p>
-  <p>
-    <strong>答:</strong>LLM 可能在不知道"createAgent" 是什么的情况下
-    也能输出这个字符串 (例如 hallucination)。最终文本对了, 但 行为完全错
-    (没读文件、没调工具、绕过了 permission)。要断 行为事实, 必须断 messages
-    序列、tool calls、permission 决策 这些结构性事实。
+interface AgentTurnResult {
+  stepId: string;
+  finalOutput: string;
+  exitCode?: number;
+  events?: AgentRuntimeEvent[];  // tool_call / llm_call 等
+}</code></pre>
+<p class="source-link"><a href="https://github.com/pingp76/swoopcode/blob/main/src/eval/core/driver.ts#L60" rel="noreferrer" target="_blank">GitHub · CodingAgentDriver 中立接口 (L60)</a></p>
+<p>
+  Runner 用法极简:
+</p>
+<pre class="code-block"><code>async function runEvalCase(evalCase: EvalCase): Promise&lt;EvalRunResult&gt; {
+  const driver = createDriver(evalCase.driver);    // ← 唯一感知 driver 的地方
+  const workspace = await createEvalWorkspace(evalCase.workspace);
+  await driver.startCase({ caseId: evalCase.id, workspaceRoot: workspace.root });
+
+  const stepTraces = [];
+  for (const step of evalCase.steps) {
+    const result = await driver.send({ stepId: step.id, query: step.query });
+    stepTraces.push(result);
+  }
+
+  await driver.close();
+  return buildResult(evalCase, stepTraces, workspace);
+}</code></pre>
+<p class="source-link"><a href="https://github.com/pingp76/swoopcode/blob/main/src/eval/core/runner.ts#L53" rel="noreferrer" target="_blank">GitHub · runEvalCase 核心 runner (L53)</a></p>
+<p>
+  注意 <code>createDriver</code> 是<strong>唯一</strong>根据 case 选实现的地方。
+  Runner 内部没有任何 <code>if (driver.kind === "in-process") ...</code> 的分支。
+</p>
+<h3>3 种 driver 各司其职</h3>
+<p>
+  当前项目实现了 3 种 driver, 都在 <code>src/eval/drivers/learn-claude-code/</code> 下:
+</p>
+<table class="terms">
+<thead>
+<tr>
+<th>Driver</th>
+<th>驱动什么</th>
+<th>用什么 LLM</th>
+<th>典型场景</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><code>in-process</code></td>
+<td>当前项目 <code>createAgent()</code></td>
+<td>scripted / replay / live</td>
+<td>测当前 harness, 默认用这个</td>
+</tr>
+<tr>
+<td><code>cli</code></td>
+<td>外部命令 (child_process.spawn)</td>
+<td>外部 agent 自带</td>
+<td>黑盒测 CLI 类工具</td>
+</tr>
+<tr>
+<td><code>learn-claude-code-team</code></td>
+<td>顺序 supervisor 多 Agent 拓扑</td>
+<td>同 in-process</td>
+<td>原型阶段, 测 Team 协作</td>
+</tr>
+</tbody>
+</table>
+<p>
+  同<strong>一套 case schema</strong>可以驱动 3 种不同的被测对象, 不用改 runner。
   </p>
+<figure class="figure">
+<div class="flow-map" role="img" aria-label="Eval Core + Driver 分层">
+<div class="flow-row">
+<span class="flow-node">EvalCase</span>
+<span class="flow-arrow">→</span>
+<span class="flow-node flow-node--accent">runEvalCase()<br/><small>core/runner.ts</small></span>
 </div>
-
-<h2 id="three-layers">三层验证策略</h2>
-<p>本章的核心方法论。harness 的测试分三层, 各有各的用途, 不混:</p>
+<div class="flow-row">
+<span class="flow-arrow">↓</span>
+</div>
+<div class="flow-row">
+<span class="flow-node">createEvalWorkspace<br/><small>临时目录</small></span>
+<span class="flow-node">TraceRecorder<br/><small>runtime events</small></span>
+<span class="flow-node">runAssertions<br/><small>portable + instrumented</small></span>
+</div>
+<div class="flow-row">
+<span class="flow-arrow">↓</span>
+</div>
+<div class="flow-row">
+<span class="flow-node">CodingAgentDriver<br/><small>中立接口</small></span>
+</div>
+<div class="flow-row">
+<span class="flow-arrow">↓</span>
+</div>
+<div class="flow-row">
+<span class="flow-node">in-process</span>
+<span class="flow-node">cli</span>
+<span class="flow-node">team</span>
+</div>
+<div class="flow-row">
+<span class="flow-arrow">↓</span>
+</div>
+<div class="flow-row">
+<span class="flow-node">createAgent()<br/><small>当前项目</small></span>
+<span class="flow-node">child_process.spawn<br/><small>外部命令</small></span>
+<span class="flow-node">supervisor 拓扑<br/><small>原型</small></span>
+</div>
+</div>
+<figcaption>图 2 · Core 只看 Driver 接口. 换被测对象 = 换 driver 实现, runner 不变。</figcaption>
+</figure>
+<h2 id="case-structure">Case 长什么样</h2>
+<p>
+  一个 EvalCase 描述"用户问什么 + 期望发生什么"。它不写"模型应该说什么",
+  只写"模型应该做什么"。
+</p>
+<h3>Case 的 4 个核心字段</h3>
+<table class="terms">
+<thead>
+<tr>
+<th>字段</th>
+<th>作用</th>
+<th>常见配置</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><code>driver</code></td>
+<td>被测对象 + LLM plan + 工具集</td>
+<td><code>learn-claude-code-in-process</code> + scripted / live LLM + core / full 工具</td>
+</tr>
+<tr>
+<td><code>workspace</code></td>
+<td>临时目录和初始文件</td>
+<td><code>initialFiles</code>, <code>keepOnFailure</code></td>
+</tr>
+<tr>
+<td><code>steps</code></td>
+<td>多 step query 序列, 复用同一 driver</td>
+<td>单 step (单轮) / 多 step (多轮 + 共享 context)</td>
+</tr>
+<tr>
+<td><code>assertions</code></td>
+<td>case 级 + step 级断言</td>
+<td>portable (fileContains) + instrumented (toolCalled) + 可选 judge</td>
+</tr>
+</tbody>
+</table>
+<h3>3 种 tools.kind: case 的"难度级别"</h3>
+<p>
+  <code>tools.kind</code> 决定 driver 内部装哪套工具, 直接决定 case 能测什么:
+</p>
+<table class="terms">
+<thead>
+<tr>
+<th><code>tools.kind</code></th>
+<th>工具集</th>
+<th>副作用</th>
+<th>典型场景</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><code>fake</code></td>
+<td>case 自己定义的 fake 工具</td>
+<td>无</td>
+<td>测 runner / driver 本身</td>
+</tr>
+<tr>
+<td><code>core</code></td>
+<td>真实 <code>run_bash</code> / <code>run_read</code> / <code>run_write</code> / <code>run_edit</code> / <code>run_edit_exact</code></td>
+<td>临时 workspace 内</td>
+<td>live regression 主战场</td>
+</tr>
+<tr>
+<td><code>full</code></td>
+<td>完整工具系统 (TODO / Task / Memory / Skill / SubAgent / Async / Schedule / Output)</td>
+<td>临时 workspace + 临时 <code>agentHome</code></td>
+<td>live full regression</td>
+</tr>
+</tbody>
+</table>
+<h3>4 个真实 case 示例</h3>
+<p>
+  下面 4 个 case 来自当前仓库, 展示不同 <code>tools.kind</code> + 不同 LLM plan 的写法。
+</p>
+<p>
+  <strong>示例 1 · scripted LLM + fake 工具 (测 runner 本身)</strong>
+</p>
+<pre class="code-block"><code>const case1: EvalCase = {
+  id: "runner-no-tool",
+  title: "Runner: scripted LLM, no tool, just text",
+  driver: {
+    kind: "learn-claude-code-in-process",
+    llm: {
+      kind: "scripted",
+      scriptedResponses: [
+        { content: "Hello!", toolCalls: [], finishReason: "stop" },
+      ],
+    },
+    tools: { kind: "fake" },
+  },
+  steps: [{ query: "Say hello." }],
+  assertions: [
+    { kind: "allStepsCompleted" },
+    { kind: "finalOutputContains", text: "Hello" },
+  ],
+};</code></pre>
+<p class="source-link"><a href="https://github.com/pingp76/swoopcode/blob/main/src/eval/cases/deterministic.test.ts#L1" rel="noreferrer" target="_blank">GitHub · deterministic suite (scripted + fake 案例) (L1)</a></p>
+<p>
+  <strong>示例 2 · scripted LLM + core 工具 (测多步工具调用)</strong>
+</p>
+<pre class="code-block"><code>const case2: EvalCase = {
+  id: "core-read-then-answer",
+  title: "Read package.json and report test command",
+  driver: {
+    kind: "learn-claude-code-in-process",
+    llm: {
+      kind: "scripted",
+      scriptedResponses: [
+        // 第一次: 模型决定调 run_read
+        {
+          content: null,
+          toolCalls: [{ id: "c1", name: "run_read", args: { path: "package.json" } }],
+          finishReason: "tool_calls",
+        },
+        // 第二次: 模型看到结果, 给最终回复
+        { content: "Test command: npm test", toolCalls: [], finishReason: "stop" },
+      ],
+    },
+    tools: { kind: "core" },
+  },
+  workspace: {
+    initialFiles: { "package.json": '{ "scripts": { "test": "npm test" } }' },
+  },
+  steps: [{ query: "What test command does this project use?" }],
+  assertions: [
+    { kind: "toolCalled", toolName: "run_read" },
+    { kind: "finalOutputContains", text: "npm test" },
+  ],
+};</code></pre>
+<p class="source-link"><a href="https://github.com/pingp76/swoopcode/blob/main/src/eval/cases/deterministic.test.ts#L1" rel="noreferrer" target="_blank">GitHub · deterministic suite (scripted + core 案例) (L1)</a></p>
+<p>
+  <strong>示例 3 · live LLM + core 工具 (live regression 写法)</strong>
+</p>
+<pre class="code-block"><code>const case3: EvalCase = {
+  id: "live-core-write-report-with-sentinels",
+  title: "Create reports/eval-contract.md with exact sentinels",
+  driver: {
+    kind: "learn-claude-code-in-process",
+    llm: { kind: "live", live: { maxCalls: 8 } },
+    tools: { kind: "core" },
+  },
+  steps: [{
+    query: `Create reports/eval-contract.md.
+The file must contain these exact lines:
+case-id: LIVE-WRITE-001
+status: ready
+owner: eval
+After writing it, briefly say what you created.`,
+  }],
+  assertions: [
+    { kind: "allStepsCompleted" },
+    { kind: "toolCalled", toolName: "run_write" },
+    { kind: "fileExists", path: "reports/eval-contract.md" },
+    { kind: "fileContains", path: "reports/eval-contract.md", text: "case-id: LIVE-WRITE-001" },
+    { kind: "fileContains", path: "reports/eval-contract.md", text: "status: ready" },
+    { kind: "noWritesOutsideWorkspace" },
+  ],
+};</code></pre>
+<p class="source-link"><a href="https://github.com/pingp76/swoopcode/blob/main/src/eval/live/live-regression-suite.test.ts#L1" rel="noreferrer" target="_blank">GitHub · live regression suite (core tools live cases) (L1)</a></p>
+<p>
+  <strong>示例 4 · live LLM + full 工具 (live full regression 写法)</strong>
+</p>
+<pre class="code-block"><code>const case4: EvalCase = {
+  id: "live-full-skill-guided-output",
+  title: "Load seeded skill and follow its instructions",
+  driver: {
+    kind: "learn-claude-code-in-process",
+    llm: { kind: "live", live: { maxCalls: 10 } },
+    tools: {
+      kind: "full",
+      full: {
+        agentHome: "temp",
+        enabledTools: ["core", "skill"],
+        seedSkills: {
+          "eval-format/SKILL.md":
+            "When asked for status, first write the marker SKILL_USED_22, then include the status.",
+        },
+      },
+    },
+  },
+  steps: [{
+    query: "Use the eval-format skill to create skill-output.md with status: passed.",
+  }],
+  assertions: [
+    { kind: "toolCalled", toolName: "run_skill" },
+    { kind: "fileContains", path: "skill-output.md", text: "SKILL_USED_22" },
+    { kind: "fileContains", path: "skill-output.md", text: "status: passed" },
+  ],
+};</code></pre>
+<p class="source-link"><a href="https://github.com/pingp76/swoopcode/blob/main/src/eval/live/live-full-suite.test.ts#L1" rel="noreferrer" target="_blank">GitHub · live full suite (TODO/Memory/Skill/SubAgent live cases) (L1)</a></p>
+<p>
+  4 个 case 的 LLM 计划、工具集、断言类型、风险点都不同, 但都遵循同一个
+  <code>EvalCase</code> shape。Runner 不需要为 4 种 case 写 4 套执行逻辑。
+</p>
+<h2 id="assertions">断言分两类</h2>
+<p>
+  断言是"模型做了什么" 的代码表达。我们把所有断言拆成两类, 解决"测实现细节
+  vs 测行为" 的张力。
+</p>
+<h3>Portable 断言: 跨 driver 可用</h3>
+<p>
+  Portable 断言只看 <code>finalOutput</code>、workspace 文件、step result,
+  不依赖 driver 内部事件。<strong>换 driver 时, 这些断言不需要改</strong>。
+  case 主体应该尽量用 portable 断言。
+</p>
+<table class="terms">
+<thead>
+<tr>
+<th>断言</th>
+<th>检查什么</th>
+<th>例</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><code>finalOutputContains</code></td>
+<td>最终回复包含文本</td>
+<td><code>{ kind: "finalOutputContains", text: "approved" }</code></td>
+</tr>
+<tr>
+<td><code>finalOutputMatches</code></td>
+<td>最终回复匹配正则</td>
+<td><code>{ kind: "finalOutputMatches", pattern: "(denied|permission|cannot)" }</code></td>
+</tr>
+<tr>
+<td><code>allStepsCompleted</code></td>
+<td>所有 step 跑完</td>
+<td><code>{ kind: "allStepsCompleted" }</code></td>
+</tr>
+<tr>
+<td><code>fileExists</code></td>
+<td>workspace 文件存在</td>
+<td><code>{ kind: "fileExists", path: "README.md" }</code></td>
+</tr>
+<tr>
+<td><code>fileNotExists</code></td>
+<td>workspace 文件不存在 (越界 / 拒绝后验证)</td>
+<td><code>{ kind: "fileNotExists", path: "blocked.txt" }</code></td>
+</tr>
+<tr>
+<td><code>fileContains</code></td>
+<td>workspace 文件包含文本</td>
+<td><code>{ kind: "fileContains", path: "README.md", text: "Usage" }</code></td>
+</tr>
+<tr>
+<td><code>noWritesOutsideWorkspace</code></td>
+<td>没有任何工具写到 workspace 外</td>
+<td><code>{ kind: "noWritesOutsideWorkspace" }</code></td>
+</tr>
+<tr>
+<td><code>exitCodeIs</code></td>
+<td>CLI driver 退出码</td>
+<td><code>{ kind: "exitCodeIs", code: 0 }</code></td>
+</tr>
+</tbody>
+</table>
+<h3>Instrumented 断言: 需要 driver 发射 runtime events</h3>
+<p>
+  Instrumented 断言依赖 driver 内部观察, <strong>CLI 黑盒 driver 不一定能用</strong>。
+  in-process driver 通过 <code>wrapToolRegistryForTrace</code> 和
+  <code>scripted-terminal</code> 自动发射这些事件, 所以 in-process case
+  可以放心使用。
+</p>
+<table class="terms">
+<thead>
+<tr>
+<th>断言</th>
+<th>检查什么</th>
+<th>例</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><code>toolCalled</code></td>
+<td>工具被调过 (可设 <code>minCount</code>)</td>
+<td><code>{ kind: "toolCalled", toolName: "run_read" }</code></td>
+</tr>
+<tr>
+<td><code>toolNotCalled</code></td>
+<td>工具没被调</td>
+<td><code>{ kind: "toolNotCalled", toolName: "run_write" }</code></td>
+</tr>
+<tr>
+<td><code>toolCalledOneOf</code></td>
+<td>一组工具中至少一个被调</td>
+<td><code>{ kind: "toolCalledOneOf", toolNames: ["run_memory_list", "run_memory_read"] }</code></td>
+</tr>
+<tr>
+<td><code>toolCallCount</code></td>
+<td>工具调用次数等于指定值</td>
+<td><code>{ kind: "toolCallCount", toolName: "run_bash", count: 1 }</code></td>
+</tr>
+<tr>
+<td><code>toolArgsContain</code></td>
+<td>工具参数包含文本</td>
+<td><code>{ kind: "toolArgsContain", toolName: "run_read", text: "package.json" }</code></td>
+</tr>
+<tr>
+<td><code>toolResultContains</code></td>
+<td>工具结果包含文本 (验证工具返回, 而不是最终回复)</td>
+<td><code>{ kind: "toolResultContains", toolName: "run_task_group_read", text: "Live plan" }</code></td>
+</tr>
+<tr>
+<td><code>stepToolCalled</code></td>
+<td>指定 step 中工具被调</td>
+<td><code>{ kind: "stepToolCalled", stepId: "step-2", toolName: "run_write" }</code></td>
+</tr>
+<tr>
+<td><code>stepToolNotCalled</code></td>
+<td>指定 step 中工具没被调 (验证"先观察后写")</td>
+<td><code>{ kind: "stepToolNotCalled", stepId: "step-1", toolName: "run_write" }</code></td>
+</tr>
+<tr>
+<td><code>noToolErrors</code></td>
+<td>没有工具返回 <code>error: true</code></td>
+<td><code>{ kind: "noToolErrors" }</code></td>
+</tr>
+<tr>
+<td><code>allToolsSucceeded</code></td>
+<td>所有 <code>tool_result</code> 不带 <code>error</code></td>
+<td><code>{ kind: "allToolsSucceeded" }</code></td>
+</tr>
+<tr>
+<td><code>permissionPromptShown</code></td>
+<td>权限确认弹窗出现过</td>
+<td><code>{ kind: "permissionPromptShown" }</code></td>
+</tr>
+<tr>
+<td><code>transcriptEventTypes</code></td>
+<td>transcript 事件类型序列匹配</td>
+<td><code>{ kind: "transcriptEventTypes", expected: ["user_message", "tool_result", ...] }</code></td>
+</tr>
+</tbody>
+</table>
+<h3>复杂 case 还需要 4 组补充断言</h3>
+<p>
+  跑全系统 live case 时, 最初那批基础断言不够用, 我们又加了 4 组 (共 5 个, 已列在上面两张表里):
+</p>
+<ul>
+<li><code>fileNotExists</code> —— 验证越界路径 / 权限拒绝后文件没生成</li>
+<li><code>toolCalledOneOf</code> —— 模型可能用多个等价工具达成目标, 不想写死</li>
+<li><code>toolResultContains</code> —— 看工具结果本身是否含目标内容 (而不是只看最终回复)</li>
+<li><code>stepToolCalled</code> / <code>stepToolNotCalled</code> —— 验证"第一步只读, 第二步才写"</li>
+</ul>
+<h3>断言选择经验法则</h3>
+<ol>
+<li>case 的"主断言" 全部用 portable</li>
+<li>需要验证"模型走了某条路径" 时加 instrumented (例如验证"权限拒绝后没继续写")</li>
+<li>开放式质量用 judge, 不要硬塞到 hard 断言</li>
+</ol>
+<h2 id="scripted-llm">Scripted LLM: 确定性测试的支柱</h2>
+<p>
+  Scripted LLM 是 deterministic 层的核心。设计极简:
+  <strong>每次 <code>chat()</code> 消耗一个预设 response, response 用完抛错</strong>。
+  实现大概 60 行, 但解决了"模型行为不可预测" 的根本问题。
+</p>
+<h3>为什么这种"看起来太朴素"的设计是对的</h3>
+<p>
+  朴素的设计有两个好处:
+</p>
+<ol>
+<li>
+<strong>强迫 case 作者把模型行为显式写出来</strong>: 不能依赖模型"自然" 做对。
+      测试如果失败, 一眼能看到"实际行为" 和"case 期望" 的差异,
+      原因只可能出在代码或 case 本身, 而不是"真实 LLM 升级后行为变了"。
+    </li>
+<li>
+<strong>把模型不确定性关在 case 里</strong>: 同样的 case 跑 1000 次, 行为都一样。
+      CI 跑过 = 代码逻辑正确, CI 挂了 = 代码逻辑错。模型波动不会假阳性。
+    </li>
+</ol>
+<h3>工具调用的 response 必须成对出现</h3>
+<p>
+  工具调用场景下, scripted response 必须成对:
+  <strong>第一次 <code>finishReason: "tool_calls"</code> + toolCalls 数组,
+  第二次 <code>finishReason: "stop"</code> + content</strong>。
+  </p>
+<pre class="code-block"><code>scriptedResponses: [
+  // 第 1 轮: 模型决定调 run_bash
+  {
+    content: null,
+    toolCalls: [{ id: "call_1", name: "run_bash", args: { command: "cat package.json" } }],
+    finishReason: "tool_calls",
+  },
+  // 第 2 轮: 模型看到工具结果, 给最终回复
+  { content: "package.json says test = npm test", toolCalls: [], finishReason: "stop" },
+]</code></pre>
+<p class="source-link"><a href="https://github.com/pingp76/swoopcode/blob/main/src/eval/drivers/learn-claude-code/scripted-llm.ts#L32" rel="noreferrer" target="_blank">GitHub · createScriptedLLMClient (L32)</a></p>
+<p>
+  如果 case 需要调 3 个工具, scripted response 要排 4 个
+  (3 个 tool_calls + 1 个 stop)。
+</p>
+<h3>3 个常见 scripted bug</h3>
+<p>
+  写 case 时容易踩 3 个坑, 几乎每个新 case 作者都会遇到一次:
+</p>
 <dl class="defs">
-  <dt>第一层 · deterministic</dt>
-  <dd>
-    全部测试都跑。用 fake LLM (第 01 章 Cookbook) 预设响应, 跑 同一 scenario
-    1000 次结果完全一致。验证 harness 自身行为 (loop / state / 协议) 正确。token
-    成本: 0。CI 必跑。
-  </dd>
-  <dt>第二层 · live</dt>
-  <dd>
-    <strong>不</strong>在 CI 跑, 在本地 opt-in。跑真 LLM, 验证 deterministic
-    测试没覆盖的"实际 LLM 行为" (例如 LLM 是否真的调对了 tool name,
-    是否按预期思考)。token 成本: 中等。开发时偶尔跑一次, 不强制 CI。
-  </dd>
-  <dt>第三层 · judge</dt>
-  <dd>
-    <strong>不</strong>在 CI 跑, 单独 review 用。把 trace (LLM 输入输出 /
-    工具调用 / permission 决策) 喂给一个 "judge" LLM, 让 judge
-    评估"行为是否合理" (例如 "用户说删 /tmp, agent 真的删了 /tmp
-    没删错目录")。人 review judge 的输出, 不直接 CI。token 成本: 高。
-  </dd>
+<dt>Bug 1 · response 配对错</dt>
+<dd>
+      第一次写了 <code>finishReason: "stop"</code> 但同时写了 <code>toolCalls</code>,
+      或者反过来。Driver 会用工具数判定"还有下一轮", 配对错会导致
+      case 跑到一半 response 用完, 抛 <code>Eval case &lt;id&gt; has no
+      scripted LLM response for call &lt;n&gt;</code>。
+    </dd>
+<dt>Bug 2 · toolCall id 漂移</dt>
+<dd>
+      scripted 里 toolCalls 写的是 <code>id: "call_1"</code>, 而 OpenAI 协议里 tool message 要用
+      <code>tool_call_id</code> 引用同一个 id, 两处拼写不一致就会出问题。Driver 在 tool_result 配对时
+      用的是 <code>toolCall.id</code>, 拼错会导致 tool message 找不到
+      对应 assistant 消息, 整个 conversation 序列错乱。
+    </dd>
+<dt>Bug 3 · tool args 类型错</dt>
+<dd>
+      scripted 里 <code>args</code> 期望是 object, 但 case 作者
+      写成 JSON 字符串。Driver 会自动 <code>JSON.stringify</code>,
+      字符串再 stringify 就成了 <code>"{\"command\": \"ls\"}"</code>
+      (外层多引号), 模型看到时 JSON 解析失败。
+    </dd>
 </dl>
 <p>
-  这一章的"差 → 改 → 好": 普通单元测试只断最终文本, 跑真 LLM。 改进后:
-  deterministic 主 + live 副 + judge 复盘, 三层各司其职。
+  解决方式: 第一版 case 都跑一遍 dry run, 确认 response 数量正确、
+  id 拼写一致、args 是 object。后期可以加 scripted LLM 单元测试自动检查。
 </p>
-
-<h2 id="driver">Driver: 跑场景</h2>
-<pre class="code-block"><code>// 教学简化版, 仓库真实实现见 GitHub 永久链接
-export interface Scenario {
-  name: string;
-  query: string;
-  fakeLlm: ScriptedLLM;
-  // 期望行为断言
-  expectations: Expectation[];
-}
-
-export interface Driver {
-  run(scenario: Scenario): Promise&lt;{ trace: Trace; passed: boolean; report: Report }&gt;;
+<h2 id="workspace">临时 workspace + 临时 agentHome</h2>
+<p>
+  <strong>第一原则: case 跑完不能污染任何东西</strong>。
+  这一条不做到, 后面所有判断都是空中楼阁。
+</p>
+<h3>Workspace: 每个 case 独立临时目录</h3>
+<pre class="code-block"><code>async function createEvalWorkspace(plan?: EvalWorkspacePlan): Promise&lt;EvalWorkspace&gt; {
+  const root = await fs.mkdtemp(path.join(os.tmpdir(), "swoopcode-eval-"));
+  // 写入 initialFiles
+  if (plan?.initialFiles) {
+    for (const [relPath, content] of Object.entries(plan.initialFiles)) {
+      const abs = safeResolve(root, relPath);   // 拒绝 .. 和绝对路径
+      await fs.mkdir(path.dirname(abs), { recursive: true });
+      await fs.writeFile(abs, content, "utf8");
+    }
+  }
+  return {
+    root,
+    readFile: (p) =&gt; fs.readFile(safeResolve(root, p), "utf8"),
+    exists: (p) =&gt; fs.access(safeResolve(root, p)).then(() =&gt; true).catch(() =&gt; false),
+    cleanup: async () =&gt; { /* case 跑完默认删 */ },
+  };
+}</code></pre>
+<p class="source-link"><a href="https://github.com/pingp76/swoopcode/blob/main/src/eval/core/workspace.ts#L50" rel="noreferrer" target="_blank">GitHub · createEvalWorkspace 临时目录 (L50)</a></p>
+<p>
+  4 条安全防线:
+</p>
+<ol>
+<li>
+<strong>mkdtemp 生成唯一目录</strong>: 多个 case 并行跑不会冲突。
+    </li>
+<li>
+<strong>initialFiles 拒绝绝对路径和 <code>..</code></strong>:
+      case 写 <code>{ "/etc/passwd": "..." }</code> 会被 <code>safeResolve</code> 拒绝,
+      防止 case 作者不小心写到 tmp 外面。
+    </li>
+<li>
+<strong>readFile / exists 也走 <code>safeResolve</code></strong>:
+      断言 <code>fileContains("/etc/passwd", "root")</code> 一样被拒。
+    </li>
+<li>
+<strong>cleanup 默认删目录, <code>keepOnFailure: true</code> 时保留</strong>:
+      失败的 case 把 <code>workspaceRoot</code> 写进 trace, 方便人工排查。
+    </li>
+</ol>
+<h3>AgentHome: full-tools 必须隔离</h3>
+<p>
+  <code>tools.kind = "full"</code> 装载的是完整工具系统, 包括
+  <code>run_memory_create</code>、<code>run_task_group_create</code>、
+  <code>run_schedule_create</code>、<code>run_skill</code> 等。
+  这些工具默认写到 <code>~/.swoopcode</code> 下的 memoryDir /
+  tasksDir / schedulesDir / skillsDir。
+</p>
+<p>
+  <strong>如果不隔离</strong>: 跑一个 test 创建了
+  <code>release-keyword = LIVE-MEM-42</code> 的 memory,
+  会污染用户真实数据; 第二次跑还会读到上次的污染,
+  看起来 case 跑通了, 其实是脏数据。
+  </p>
+<p>
+  所以 full-tools driver 第一步就是创建临时 <code>agentHome</code>:
+  </p>
+<pre class="code-block"><code>if (tools.kind === "full") {
+  const tempAgentHome = await fs.mkdtemp(path.join(os.tmpdir(), "swoopcode-agent-home-"));
+  const ctx = createProjectContext({ projectRoot: workspace.root, agentHome: tempAgentHome });
+  // skillsDir / memoryDir / tasksDir / schedulesDir / taskOutputsDir
+  // 全部从 tempAgentHome 派生, 不读 ~/.swoopcode
+}</code></pre>
+<p class="source-link"><a href="https://github.com/pingp76/swoopcode/blob/main/src/eval/drivers/learn-claude-code/full-tool-runtime.ts#L137" rel="noreferrer" target="_blank">GitHub · createFullEvalRuntime (L137)</a></p>
+<p>
+  case 结束后 <code>cleanup()</code> 把 tempAgentHome 整个删除。
+  </p>
+<h3>为什么不用 in-memory store?</h3>
+<p>
+  有人会问: "为什么不直接用内存对象模拟 store, 跑得更快?"
+  答案是<strong>持久化路径是 agent 的一部分</strong>。Memory / Task
+  / Schedule 都有原子写、索引重建、跨进程恢复这些逻辑,
+  内存模拟会跳过这些真实路径, case 永远测不到"原子写失败"
+  "索引文件损坏" 这类边界 bug。
+  临时目录 + 真实模块, 才是端到端测试该有的样子。
+</p>
+<h2 id="trace">Trace 是事实来源</h2>
+<p>
+  测试失败时, 不能只看到 <code>expected true to be false</code>。
+  排查的人必须能立刻定位"哪一步、哪个工具、什么参数、什么结果"。
+  这就是 <code>EvalTrace</code> 的职责。
+</p>
+<h3>EvalTrace 包含什么</h3>
+<ul>
+<li><code>caseId</code> / <code>title</code> / <code>startedAt</code> / <code>endedAt</code> / <code>mode</code></li>
+<li><code>workspaceRoot</code> (失败时定位临时目录)</li>
+<li><code>steps[]</code> —— 每 step 的 query / finalOutput / 错误</li>
+<li><code>runtimeEvents[]</code> —— 标准化事件流</li>
+<li><code>assertions[]</code> —— 每条断言 passed / message / evidence</li>
+<li><code>judge?</code> —— 可选 LLM judge 结果</li>
+</ul>
+<h3>完整 trace JSON 示例</h3>
+<pre class="code-block"><code>{
+  "caseId": "live-core-edit-existing-config",
+  "title": "Change retryLimit from 2 to 4, keep sentinel",
+  "mode": "live",
+  "workspaceRoot": "/var/folders/.../swoopcode-eval-abc123",
+  "startedAt": "2026-06-10T12:00:00.000Z",
+  "endedAt": "2026-06-10T12:00:18.234Z",
+  "steps": [{
+    "stepId": "step-1",
+    "query": "In src/config.ts, change retryLimit from 2 to 4...",
+    "finalOutput": "Done. retryLimit is now 4, sentinel and featureName preserved."
+  }],
+  "runtimeEvents": [
+    { "kind": "llm_call", "id": "e1", "messageCount": 5, "toolDefinitionCount": 5 },
+    { "kind": "tool_call", "id": "e2", "toolName": "run_read", "args": { "path": "src/config.ts" } },
+    { "kind": "tool_result", "id": "e3", "toolName": "run_read", "result": "..." },
+    { "kind": "llm_response", "id": "e4", "contentPreview": "I'll change retryLimit to 4." },
+    { "kind": "tool_call", "id": "e5", "toolName": "run_edit", "args": { "path": "src/config.ts", "newText": "export const retryLimit = 4;" } },
+    { "kind": "tool_result", "id": "e6", "toolName": "run_edit", "result": "OK" },
+    { "kind": "llm_response", "id": "e7", "contentPreview": "Done." }
+  ],
+  "assertions": [
+    { "kind": "allStepsCompleted", "passed": true, "message": "All 1 step completed." },
+    { "kind": "toolCalled", "passed": true, "message": "run_read was called 1 time." },
+    { "kind": "fileContains", "passed": true, "message": "src/config.ts contains 'export const retryLimit = 4;'." },
+    { "kind": "noWritesOutsideWorkspace", "passed": true, "message": "All writes stayed inside workspace." }
+  ]
+}</code></pre>
+<p class="source-link"><a href="https://github.com/pingp76/swoopcode/blob/main/src/eval/live/live-regression-suite.test.ts#L1" rel="noreferrer" target="_blank">GitHub · live regression 6 case 定义 (L1)</a></p>
+<p>
+  失败时看到这份 trace, 立刻知道: 工具调用序列对不对、文件改对了没、模型
+  最终回答了什么。不用重跑也能定位。
+</p>
+<h3>trace 与 llm.log 的边界</h3>
+<p>
+  trace 不追求和 <code>llm.log</code> 一样完整。两者用途不同:
+</p>
+<table class="terms">
+<thead>
+<tr>
+<th>维度</th>
+<th><code>llm.log</code></th>
+<th>EvalTrace</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>用途</td>
+<td>完整通信日志, 调试 LLM 行为</td>
+<td>测试证据, 验证 case 通过/失败</td>
+</tr>
+<tr>
+<td>体积</td>
+<td>大, 含所有 payload</td>
+<td>小, 含事件摘要和断言结果</td>
+</tr>
+<tr>
+<td>开启</td>
+<td>默认开 (按轮转)</td>
+<td><code>EVAL_TRACE_DIR</code> 显式开启</td>
+</tr>
+<tr>
+<td>写仓库</td>
+<td>写到 <code>agentHome/logs/</code></td>
+<td>不写仓库目录, 默认只存内存</td>
+</tr>
+</tbody>
+</table>
+<h2 id="replay">Replay: 复现"那一次成功"</h2>
+<p>
+  跑真实 LLM 成功了, 想把这个成功固定下来防止回归?
+  Replay 把那次成功的 response 序列录成 JSON fixture,
+  之后 deterministic 跑同一 fixture, 任何偏差都是回归。
+</p>
+<h3>Fixture 格式</h3>
+<pre class="code-block"><code>{
+  "version": 1,
+  "caseId": "edit-readme",
+  "provider": "openai_compatible",
+  "model": "example-model",
+  "recordedAt": "2026-06-03T00:00:00.000Z",
+  "responses": [
+    { "content": null, "toolCalls": [{ "id": "call_1", "name": "run_read", "args": { "path": "README.md" } }], "finishReason": "tool_calls" },
+    { "content": "Done.", "toolCalls": [], "finishReason": "stop" }
+  ]
 }</code></pre>
+<p class="source-link"><a href="https://github.com/pingp76/swoopcode/blob/main/src/eval/replay/replay-llm.ts#L60" rel="noreferrer" target="_blank">GitHub · createReplayLLMClient fixture 读取 (L60)</a></p>
+<h3>录制过程 (手动)</h3>
+<ol>
+<li>写一个 <code>live</code> 版本的 case, 用真实 LLM 跑通</li>
+<li>用 <code>llm.log</code> 或 LLM 客户端日志, 把 <code>chat.completions.create()</code> 的请求和响应抄成上面格式</li>
+<li>把 fixture 存到 <code>src/eval/cases/fixtures/&lt;case-id&gt;.json</code></li>
+<li>把 case 改成 <code>llm: { kind: "replay", replayFile: "..." }</code></li>
+</ol>
 <p>
-  driver 接受 scenario, 用 fakeLlm 跑 agent, 收集 trace (LLM 输入输出、
-  工具调用、permission 决策、reminder 注入), 把 trace 喂给 expectations 断言,
-  产出 report (通过 / 失败 + 失败原因)。
+  第一版 replay 只<strong>读取</strong> fixture, 不负责自动录制。
+  自动录制 (跑真实 LLM → 自动落 fixture) 是后续增强,
+  暂不实现, 避免本阶段范围扩散。
 </p>
+<h3>Replay 内部复用 Scripted LLM</h3>
+<p>
+  Replay 在 driver 内部实际是 Scripted LLM 的特殊形式:
+  </p>
+<pre class="code-block"><code>function createReplayLLMClient(opts: { replayFile: string; caseId: string }): LLMClient {
+  const fixture = JSON.parse(fs.readFileSync(opts.replayFile, "utf8"));
+  if (fixture.version !== 1) throw new Error("Unsupported fixture version");
+  if (fixture.caseId !== opts.caseId) throw new Error("caseId mismatch");   // 防混用
+  return createScriptedLLMClient({
+    caseId: opts.caseId,
+    responses: fixture.responses,   // 转成 ScriptedLLMResponse[]
+  });
+}</code></pre>
+<p class="source-link"><a href="https://github.com/pingp76/swoopcode/blob/main/src/eval/replay/replay-llm.ts#L60" rel="noreferrer" target="_blank">GitHub · createReplayLLMClient fixture 读取 (L60)</a></p>
+<p>
+  Runner 层完全感知不到 scripted / replay 的区别,
+  同样的断言、同样的 trace 事件流。
+</p>
+<h2 id="live-regression">Live Regression: 现实校准</h2>
+<p>
+  Live smoke (2-3 个 case) 验证"LLM wrapper 链得通",
+  live regression (6 个 case) 才是真正的现实校准:
+  跑真实 LLM, 用结构性断言判断核心能力不退化。
+</p>
+<h3>第一轮 6 个 case</h3>
+<table class="terms">
+<thead>
+<tr>
+<th>Case ID</th>
+<th>场景</th>
+<th>核心断言</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><code>live-core-read-structured-summary</code></td>
+<td>读 fixture, 用三个 bullet 回答</td>
+<td>三个关键事实 (sentinel) 都在最终回复</td>
+</tr>
+<tr>
+<td><code>live-core-write-report-with-sentinels</code></td>
+<td>写文件并嵌入稳定 sentinel</td>
+<td>文件存在 + 3 行 sentinel 都在</td>
+</tr>
+<tr>
+<td><code>live-core-edit-existing-config</code></td>
+<td>改一个常量, 保留 sentinel 和其他字段</td>
+<td>目标字段改了 + 无关字段没动</td>
+</tr>
+<tr>
+<td><code>live-core-bash-readonly-command</code></td>
+<td>跑 <code>node -e</code> 并报告输出</td>
+<td>输出含固定字符串</td>
+</tr>
+<tr>
+<td><code>live-core-permission-denied-write</code></td>
+<td>用户拒绝权限后, 文件没被创建</td>
+<td>权限被问 + 写工具没被调</td>
+</tr>
+<tr>
+<td><code>live-core-multi-turn-stateful-edit</code></td>
+<td>多 turn 共享 context, 先观察后修改</td>
+<td>第一步只读 + 第二步基于结果改</td>
+</tr>
+</tbody>
+</table>
+<h3>3 个 case 设计细节</h3>
+<p>
+  <strong>写文件 case (sentinel 设计)</strong>: 让模型写
+  <code>reports/eval-contract.md</code>, 必须含 3 行精确 sentinel:
+</p>
+<pre class="code-block"><code>case-id: LIVE-WRITE-001
+status: ready
+owner: eval</code></pre>
+<p class="source-link"><a href="https://github.com/pingp76/swoopcode/blob/main/src/eval/live/live-regression-suite.test.ts#L1" rel="noreferrer" target="_blank">GitHub · live regression case 定义 (L1)</a></p>
+<p>
+  为什么用 sentinel? 因为真实 LLM 的措辞不可预测, 但 "这 3 行必须存在"
+  是任务的核心要求, 写死能精确验证。
+</p>
+<p>
+  <strong>权限拒绝 case</strong>: 设 <code>permissionAnswers: [false]</code>,
+  让 driver 在第一次权限确认时自动拒绝。验证:
+</p>
+<pre class="code-block"><code>assertions: [
+  { kind: "permissionPromptShown" },
+  { kind: "toolNotCalled", toolName: "run_write" },
+  { kind: "toolNotCalled", toolName: "run_edit" },
+  { kind: "toolNotCalled", toolName: "run_edit_exact" },
+  { kind: "finalOutputMatches", pattern: "(denied|cannot|not allowed)" },
+]</code></pre>
+<p class="source-link"><a href="https://github.com/pingp76/swoopcode/blob/main/src/eval/live/live-regression-suite.test.ts#L1" rel="noreferrer" target="_blank">GitHub · live regression 权限拒绝 case (L1)</a></p>
+<p>
+  5 条断言联合验证"用户拒绝后, agent 不写文件, 不假装写成功"。
+</p>
+<p>
+  <strong>多 turn case (stateful edit)</strong>: 同一个 case 有 2 个 step,
+  复用 driver 实例, 验证多轮 context 共享:
+</p>
+<pre class="code-block"><code>steps: [
+  { id: "observe", query: "Read notes/state.md and tell me the current phase. Do not edit files in this step." },
+  { id: "update", query: "Now update notes/state.md so phase becomes reviewed, and add a line reviewer: live-e2e." },
+],
+assertions: [
+  { kind: "stepToolNotCalled", stepId: "observe", toolName: "run_write" },
+  { kind: "fileContains", path: "notes/state.md", text: "phase: reviewed" },
+  { kind: "fileContains", path: "notes/state.md", text: "reviewer: live-e2e" },
+]</code></pre>
+<p class="source-link"><a href="https://github.com/pingp76/swoopcode/blob/main/src/eval/live/live-regression-suite.test.ts#L1" rel="noreferrer" target="_blank">GitHub · live regression 多 turn case (L1)</a></p>
+<p>
+  第一步用 <code>stepToolNotCalled</code> 验证"只观察"约束, 第二步
+  验证"基于观察结果改对了"。
+</p>
+<h3>Live safety 默认</h3>
+<p>
+  所有 live suite 默认 skip, 显式开启:
+  </p>
+<pre class="code-block"><code># live smoke (2 个 case)
+EVAL_LIVE=1 npm run test:eval:live
 
-<h2 id="trace">Trace: 行为事实</h2>
-<pre class="code-block"><code>// 教学简化版, 仓库真实实现见 GitHub 永久链接
-export interface Trace {
-  events: TraceEvent[];
-}
+# live regression (6 个 case)
+EVAL_LIVE_REGRESSION=1 npm run test:eval:live:regression
 
-export type TraceEvent =
-  | { type: "llm_call"; callIndex: number; inputMessages: Message[]; output: AssistantMessage }
-  | { type: "tool_call"; tool: string; args: unknown; result: ToolResult }
-  | { type: "permission_decision"; tool: string; action: "allow" | "ask" | "deny" }
-  | { type: "reminder_injected"; source: string; content: string }
-  | { type: "compaction"; beforeCount: number; afterCount: number }
-  | { type: "recovery_action"; kind: string; attempt: number };
+# regression + judge (5 个 case 加 judge)
+EVAL_LIVE_REGRESSION=1 EVAL_JUDGE=1 npm run test:eval:live:regression</code></pre>
+<p class="source-link"><a href="https://github.com/pingp76/swoopcode/blob/main/src/eval/live/live-regression-suite.test.ts#L1" rel="noreferrer" target="_blank">GitHub · live regression suite (L1)</a></p>
+<p>
+  每个 case 都有 <code>maxCalls</code> (8-12) / <code>maxRounds</code> (8-12)
+  / Vitest timeout (30-60s), 防止 LLM 无限循环。
+  </p>
+<h2 id="live-full">Live Full Regression: 完整工具系统</h2>
+<p>
+  core-tools 跑完后, 我们要测"完整工具系统"。这层加在 regression 之上,
+  用 <code>tools.kind = "full"</code>, 验证 TODO / Memory / Skill /
+  SubAgent 这些系统级能力不退化。
+</p>
+<h3>Release 组 4 个 case</h3>
+<p>
+  <strong>case 1 · TODO 跟踪 + 文件修改 (<code>live-full-todo-guided-file-change</code>)</strong>
+  </p>
+<p>
+  让模型用 TODO 跟踪一个 2 步任务: 读文件 → 改文件 + 加 marker。
+  </p>
+<pre class="code-block"><code>// fixture
+docs/todo-target.md: "status: draft"
 
-// trace 持久化到 JSON, 反向回放时从 trace 重现 agent 行为
-export function saveTrace(trace: Trace, path: string): Promise&lt;void&gt;;
-export function loadTrace(path: string): Promise&lt;Trace&gt;;</code></pre>
+// query
+Use a TODO list to track this work:
+1. Read docs/todo-target.md.
+2. Update it so status becomes complete and add marker TODO_LIVE_DONE.
+
+// 断言
+toolCalled(run_todo_create)             // 必须建 TODO
+toolCalled(run_todo_update, minCount: 2) // 至少 update 2 次
+fileContains("docs/todo-target.md", "status: complete")
+fileContains("docs/todo-target.md", "TODO_LIVE_DONE")
+allToolsSucceeded</code></pre>
+<p class="source-link"><a href="https://github.com/pingp76/swoopcode/blob/main/src/eval/live/live-full-suite.test.ts#L1" rel="noreferrer" target="_blank">GitHub · live full TODO case (L1)</a></p>
 <p>
-  trace 是 eval 的核心。它记录 agent 在跑 scenario 时做的所有事,
-  不仅是"返回了什么文本", 而是"调了哪些工具 / 过了哪些 permission / 注入了哪些
-  reminder"。trace 落盘后, 可以反向回放, 也能喂给 judge。
+  <strong>case 2 · Memory create + read (<code>live-full-memory-confirmed-create-and-read</code>)</strong>
 </p>
+<p>
+  2 步: 第一步明确要求记忆 <code>LIVE-MEM-42</code>, 第二步读回。
+  </p>
+<pre class="code-block"><code>steps: [
+  { id: "create", query: "Please remember this for the eval project: release keyword is LIVE-MEM-42." },
+  { id: "read",   query: "List or read your memories and tell me the release keyword." },
+],
+assertions: [
+  toolCalled(run_memory_create),
+  toolCalledOneOf(["run_memory_list", "run_memory_read"]),  // 模型可能用 list 或 read
+  { kind: "finalOutputContains", text: "LIVE-MEM-42", stepId: "read" },
+]</code></pre>
+<p class="source-link"><a href="https://github.com/pingp76/swoopcode/blob/main/src/eval/live/live-full-suite.test.ts#L1" rel="noreferrer" target="_blank">GitHub · live full suite (L1)</a></p>
+<p>
+  <strong>case 3 · Skill 加载 + 遵循 (<code>live-full-skill-guided-output</code>)</strong>
+</p>
+<p>
+  预先 seed 一个临时 skill, 让模型加载并按 skill 指示写文件:
+  </p>
+<pre class="code-block"><code>// seed skill 内容
+"eval-format/SKILL.md": "When asked to create an eval status file, first write the marker SKILL_USED_22, then include the user's requested status."
 
-<h2 id="assertion">Assertion: 行为断言</h2>
-<pre class="code-block"><code>// 教学简化版, 仓库真实实现见 GitHub 永久链接
-export type Expectation =
-  | { kind: "messages_at_least"; count: number }
-  | { kind: "last_message_role"; role: "user" | "assistant" | "tool" }
-  | { kind: "tool_called"; tool: string; atLeast?: number }
-  | { kind: "tool_not_called"; tool: string }
-  | { kind: "permission_asked"; tool: string }
-  | { kind: "reminder_with_source"; source: string }
-  | { kind: "no_compaction" | { kind: "compaction_at_least"; count: number } }
-  | { kind: "final_text_contains"; substring: string }
-  | { kind: "final_text_not_contains"; substring: string };
+// query
+Use the eval-format skill to create skill-output.md with status: passed.
 
-export function assertExpectations(trace: Trace, expectations: Expectation[]): { passed: boolean; failures: string[] };</code></pre>
+// 断言
+toolCalled(run_skill)                   // 加载 skill
+fileContains("skill-output.md", "SKILL_USED_22")   // marker 出现
+fileContains("skill-output.md", "status: passed")  // 任务内容
+allToolsSucceeded</code></pre>
+<p class="source-link"><a href="https://github.com/pingp76/swoopcode/blob/main/src/eval/live/live-full-suite.test.ts#L1" rel="noreferrer" target="_blank">GitHub · live full Skill case (L1)</a></p>
 <p>
-  断言不直接断最终文本 (那是 LLM 行为, 不稳定), 而是断"行为事实"。 例如:
-  "至少调了 1 次 read_file 工具"、"permission 至少问过 1 次"、 "messages 至少 5
-  条"。这些断言即使 LLM 输出变了也能稳定通过。
+  <code>SKILL_USED_22</code> 是 skill 行为的指纹, 调试 trace 时用 grep
+  就能确认模型确实加载并遵循了 skill。
 </p>
+<p>
+  <strong>case 4 · SubAgent 只读分析 (<code>live-full-subagent-readonly-analysis</code>)</strong>
+</p>
+<p>
+  父 Agent 委托子智能体分析文件, 验证子智能体没写文件:
+  </p>
+<pre class="code-block"><code>// fixture
+src/a.ts: "export const liveToken = \"SUBAGENT_LIVE_01\";"
+
+// query
+Ask a subagent to inspect src/a.ts and report the liveToken value. Do not modify any files.
 
-<h2 id="judge">Judge: 复盘</h2>
-<p>judge 是第三层, 不在 CI 跑。流程:</p>
+// 断言
+toolCalled(run_subagent)
+finalOutputContains("SUBAGENT_LIVE_01")
+toolNotCalled(run_write)               // 父 Agent 不直接写
+toolNotCalled(run_edit)                // 父 Agent 不直接编辑
+toolNotCalled(run_edit_exact)</code></pre>
+<p class="source-link"><a href="https://github.com/pingp76/swoopcode/blob/main/src/eval/live/live-full-suite.test.ts#L1" rel="noreferrer" target="_blank">GitHub · live full SubAgent case (L1)</a></p>
+<h3>Nightly 组 3 个 case (默认 skip)</h3>
+<p>
+  Task Group / Async Run / Schedule 3 个 case 涉及"运行时系统回归",
+  放 nightly (默认 <code>describe.skip</code>), 等 flake 率稳定后再纳入 release。
+  </p>
+<ul>
+<li><code>live-full-task-group-durable-plan</code>: Task Group 创建 / 更新 / 读回</li>
+<li><code>live-full-async-output-handle</code>: Async Run 启动 / 读 output_id</li>
+<li><code>live-full-schedule-create-read-cancel</code>: Schedule 创建 / 读 / 取消</li>
+</ul>
+<h3>启用命令</h3>
+<pre class="code-block"><code>EVAL_LIVE_FULL=1 npm run test:eval:live:full
+EVAL_LIVE_FULL=1 EVAL_JUDGE=1 npm run test:eval:live:full</code></pre>
+<p class="source-link"><a href="https://github.com/pingp76/swoopcode/blob/main/src/eval/live/live-full-suite.test.ts#L1" rel="noreferrer" target="_blank">GitHub · live full suite (L1)</a></p>
+<h2 id="judge">Judge: 开放式语义的补充</h2>
+<p>
+  Hard 断言判断<strong>事实</strong>: 工具是否调用、文件是否含 sentinel。
+  Judge 判断<strong>质量</strong>: TODO 是否反映真实执行步骤、汇总是否忠实于
+  成员结果、Agent 是否没有夸大成功。
+  </p>
+<p>
+  没有 judge, 这类问题只能用宽松正则, 等于没测。
+  但 judge LLM 本身也有不确定性, 必须严格控制。
+</p>
+<h3>关键顺序</h3>
 <ol>
-  <li>开发者本地跑一个 scenario, 收集 trace。</li>
-  <li>把 trace + scenario 描述喂给 judge LLM (通常用更强模型)。</li>
-  <li>
-    judge 评估"行为是否合理", 输出结构化反馈 (例如"agent 在 第 3 轮调错了工具,
-    应该是 read_file 不是 bash" / "agent 跳过 了 permission 检查, 危险!")。
-  </li>
-  <li>开发者看 judge 反馈, 决定改哪一章的 Prompt Card。</li>
+<li>先跑 hard 断言, 失败就 <code>case failed</code>, 不再跑 judge</li>
+<li>Hard 通过后, judge 用<strong>另一个 LLM</strong> 读 trace 摘要, 输出
+      <code>passed / score / summary / strengths / problems / evidence / needsHumanReview</code></li>
+<li>Judge 失败也导致 case failed, 但报告里要明确标 <code>(judge failed)</code> 防止混淆</li>
 </ol>
-<p>judge 不替代单元测试, 而是用来"找出 Prompt Card 没覆盖的盲点"。</p>
+<h3>Judge prompt 与 agent 隔离</h3>
+<p>
+  <strong>judge prompt 不进 agent system prompt</strong>。
+  它是 eval runner 的单独 LLM 调用, 与被测 agent 隔离。
+  Judge 模型可以和被测 agent 不同 (用 <code>JUDGE_MODEL</code> 覆盖),
+  常见选择是用更轻量的模型做 judge 降本。
+</p>
+<h3>4 层 JSON 解析降级</h3>
+<p>
+  Judge LLM 可能返回 markdown code block、额外文本、无效 JSON。
+  解析器采用 4 层降级:
+</p>
+<pre class="code-block"><code>function parseJudgeOutput(raw: string): EvalJudgeResult {
+  // 第 1 层: 直接 JSON.parse
+  try { return JSON.parse(raw); }
+  catch {}
 
-<h2 id="trap">反例梯度</h2>
+  // 第 2 层: 正则提取 ```json ... ``` 块
+  const codeBlock = raw.match(/```json\s*([\s\S]+?)\s*```/);
+  if (codeBlock) {
+    try { return JSON.parse(codeBlock[1]); } catch {}
+  }
 
+  // 第 3 层: 括号深度计数器 + 字符串引号跟踪
+  const balanced = extractBalancedJson(raw);
+  if (balanced) {
+    try { return JSON.parse(balanced); } catch {}
+  }
+
+  // 第 4 层: 降级
+  return {
+    enabled: true,
+    passed: false,
+    score: 0,
+    summary: "judge_failed",
+    strengths: [],
+    problems: ["judge output was not parseable JSON"],
+    evidence: [],
+    needsHumanReview: true,
+  };
+}</code></pre>
+<p class="source-link"><a href="https://github.com/pingp76/swoopcode/blob/main/src/eval/judge/judge.ts#L1" rel="noreferrer" target="_blank">GitHub · judge 4 层 JSON 解析降级 (L1)</a></p>
+<p>
+  第 4 层返回的 <code>judge_failed</code> 不会改写已经跑完的 hard 断言结果,
+  但它的 <code>passed: false</code> 仍会让 case 记为 <code>(judge failed)</code>, 报告里要明确标出, 让人工排查。
+</p>
+<h3>Trace 摘要而非完整 trace</h3>
+<p>
+  Judge 不要读完整 trace (太大), 也不读 <code>llm.log</code>。
+  我们为 judge 构造一个结构化摘要:
+</p>
+<pre class="code-block"><code>interface EvalJudgeInput {
+  caseId: string;
+  title: string;
+  userQueries: string[];             // 用户问过什么
+  finalOutputs: string[];            // 每 step 最终回复
+  toolCallSummary: Array&lt;{
+    stepId: string;
+    toolName: string;
+    argsPreview: string;
+  }&gt;;
+  hardAssertionResults: EvalAssertionResult[];  // 已跑的 hard 结果
+  rubric: EvalJudgeRubric;            // 评分标准
+}</code></pre>
+<p class="source-link"><a href="https://github.com/pingp76/swoopcode/blob/main/src/eval/core/case-schema.ts#L311" rel="noreferrer" target="_blank">GitHub · EvalJudgeInput 类型 (L311)</a></p>
+<p>
+  Judge 看完摘要, 给出 pass/fail + 评分 + 证据。
+  这种结构化输入让 judge 输出可解析、可断言、可追溯。
+</p>
+<h2 id="mcp-team">MCP 与 Agent Team: Harness 原型</h2>
+<p>
+  这两个方向把方法论扩展到外部协议 (MCP) 和多主体协作 (Team)。
+  本项目当前<strong>没实现生产级 MCP runtime / 真实 Team runtime</strong>,
+  这两类 suite 当前全部 <code>describe.skip</code>, 只作为 harness
+  草案保留。<strong>不要在公开文档里暗示这些是生产能力</strong>。
+</p>
+<h3>MCP harness</h3>
+<p>
+  MCP 是 JSON-RPC 风格的外部协议。Eval 设计需要<strong>可控 fixture server</strong>,
+  不走真实 GitHub / Slack / DB server, 因为:
+</p>
+<ol>
+<li>真实 server 不可控 (限流、下线、版本变化)</li>
+<li>真实 server 行为会影响 case 稳定性, 把模型不确定性叠加到协议不确定性上</li>
+<li>真实 server 可能需要凭据, 不适合 PR 自动化</li>
+</ol>
+<p>
+  Fixture server 实现 5 类 JSON-RPC 方法, 覆盖 MCP 2025-06-18 的最小子集:
+</p>
+<table class="terms">
+<thead>
+<tr>
+<th>方法</th>
+<th>用途</th>
+<th>可注入行为</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><code>initialize</code></td>
+<td>协议握手, 协商 capabilities</td>
+<td>初始化失败</td>
+</tr>
+<tr>
+<td><code>tools/list</code></td>
+<td>发现 server 暴露的工具</td>
+<td>—</td>
+</tr>
+<tr>
+<td><code>tools/call</code></td>
+<td>调用工具</td>
+<td>延迟、错误码、crash</td>
+</tr>
+<tr>
+<td><code>resources/list</code></td>
+<td>发现 server 暴露的资源</td>
+<td>—</td>
+</tr>
+<tr>
+<td><code>resources/read</code></td>
+<td>读取资源</td>
+<td>—</td>
+</tr>
+</tbody>
+</table>
+<p>
+  8 类 MCP runtime 事件进入 trace:
+  <code>mcp_server_start</code> / <code>mcp_server_stop</code> /
+  <code>mcp_initialize</code> / <code>mcp_tools_list</code> /
+  <code>mcp_tool_call</code> / <code>mcp_tool_result</code> /
+  <code>mcp_resource_read</code> / <code>mcp_error</code>。
+  配套断言: <code>mcpServerStarted</code> / <code>mcpToolListed</code> /
+  <code>mcpToolCalled</code> / <code>mcpToolResultContains</code> /
+  <code>mcpResourceRead</code> / <code>mcpErrorCode</code>。
+</p>
+<p>
+  4 类 case 覆盖: 正常工具调用 / 资源读取 / 错误恢复 / server crash。
+  第一版 transport 只做 <code>stdio</code>, <code>http</code> 后续扩展。
+</p>
+<h3>Team harness</h3>
+<p>
+  Agent Team 的关键不是"最终答案像不像", 而是<strong>协作过程是否正确</strong>。
+  Trace 必须能回答:
+</p>
+<ul>
+<li>启动了哪些 agent, 各自什么角色</li>
+<li>谁把什么任务交给谁 (handoff)</li>
+<li>谁调用了什么工具</li>
+<li>谁失败了, 失败如何被处理</li>
+<li>谁产出了什么 artifact</li>
+<li>coordinator 如何汇总</li>
+</ul>
+<p>
+  Team driver 第一版是<strong>顺序 supervisor 拓扑</strong>:
+  planner / implementer / reviewer / researcher 等成员依次运行,
+  每个成员是真实 Agent 实例, 有独立 history / compressor,
+  共享临时 workspace 和同一份 LLM client, 工具组由
+  <code>members[].tools</code> 显式声明。
+  </p>
+<p>
+  9 类 Team runtime 事件:
+  <code>team_start</code> / <code>agent_spawned</code> /
+  <code>agent_message</code> / <code>agent_tool_call</code> /
+  <code>handoff</code> / <code>artifact_produced</code> /
+  <code>agent_completed</code> / <code>agent_failed</code> /
+  <code>team_completed</code>。
+  </p>
+<p>
+  8 类 Team 断言:
+  <code>teamAgentSpawned</code> / <code>teamRoleUsed</code> /
+  <code>teamHandoffOccurred</code> / <code>teamAgentToolCalled</code> /
+  <code>teamAgentToolNotCalled</code> / <code>teamArtifactContains</code> /
+  <code>teamAllAgentsCompleted</code> / <code>teamNoUnauthorizedWrites</code>。
+</p>
+<p>
+  Team judge 读的不是完整 trace, 而是<strong>summary</strong>:
+  每个 agent 的角色、是否完成、失败原因、工具调用列表、产出 artifact 预览。
+  judge rubric 关注: 是否正确分工 / 是否忠实引用成员结果 / 是否处理失败 /
+  是否遵守权限边界 / 是否产生用户要求的 artifact。
+</p>
+<figure class="figure">
+<div class="flow-map" role="img" aria-label="Eval 5 大套件当前状态">
+<div class="flow-row">
+<span class="flow-node flow-node--accent">deterministic</span>
+<span class="flow-arrow">→</span>
+<span class="flow-node">默认 CI 跑</span>
+</div>
+<div class="flow-row">
+<span class="flow-node flow-node--accent">replay</span>
+<span class="flow-arrow">→</span>
+<span class="flow-node">默认 CI 跑</span>
+</div>
+<div class="flow-row">
+<span class="flow-node flow-node--accent">live smoke</span>
+<span class="flow-arrow">→</span>
+<span class="flow-node">EVAL_LIVE=1 开启</span>
+</div>
+<div class="flow-row">
+<span class="flow-node flow-node--accent">live regression</span>
+<span class="flow-arrow">→</span>
+<span class="flow-node">EVAL_LIVE_REGRESSION=1 开启</span>
+</div>
+<div class="flow-row">
+<span class="flow-node flow-node--accent">live full</span>
+<span class="flow-arrow">→</span>
+<span class="flow-node">EVAL_LIVE_FULL=1 开启</span>
+</div>
+<div class="flow-row">
+<span class="flow-node">MCP / Team</span>
+<span class="flow-arrow">→</span>
+<span class="flow-node">describe.skip, 仅 harness 原型</span>
+</div>
+</div>
+<figcaption>图 3 · 5 大套件当前状态。越往下越接近真实, 越往上越快越稳。</figcaption>
+</figure>
+<h2 id="how-to-write-case">如何写好一个 case</h2>
+<p>
+  讲完 4 层梯度 + 3 种 driver + 12 个断言类型, 最后讲"如何写好一个 case"。
+  这是个心法问题, 不是 API 问题。
+</p>
+<h3>3 条心法</h3>
+<dl class="defs">
+<dt>心法 1 · 先写失败用例, 再写 driver</dt>
+<dd>
+      写 case 的正确顺序: 先写 EvalCase (期望行为),
+      再用 fake driver 跑通断言, 最后才换 in-process driver 接入。
+      这样写出来的 case 不依赖具体 driver 内部, 也不会"为了 driver 写 case"。
+    </dd>
+<dt>心法 2 · 用 sentinel 而非完整文本</dt>
+<dd>
+      真实 LLM 措辞不可预测。如果 case 断言"模型说了什么",
+      措辞稍微变就挂。改用 sentinel: 写"输出必须含 LIVE-WRITE-001",
+      写"文件必须含 status: ready"。
+      任务核心要求 + 精确短字符串, 比完整文本 golden snapshot 稳定得多。
+    </dd>
+<dt>心法 3 · live case 要写"能力退化检测", 不写"完美行为"</dt>
+<dd>
+      Live regression 不是"证明 agent 完美", 是"证明核心能力不退化"。
+      期望是"v0.1 跑通的 6 个 case, v0.2 还跑通"。如果 v0.2 多调了
+      几次工具但最终结果一样, 不算退化; 如果少调了关键工具导致
+      文件没改对, 算退化。
+    </dd>
+</dl>
+<h3>case 评审清单</h3>
+<p>
+  写完一个 case, 跑一遍下面 5 条检查:
+</p>
+<ol>
+<li>
+<strong>断言是否都是 portable</strong>?
+      <code>grep 'kind:'</code>, 看看有没有 <code>toolCalled</code> /
+      <code>transcriptEventTypes</code> 这种 instrumented 断言当主断言的。
+    </li>
+<li>
+<strong>case 是否能跑通 fake driver</strong>?
+      把 <code>tools.kind</code> 改成 <code>fake</code>, 跑一次。
+      如果挂, 说明 case 依赖 driver 内部。
+    </li>
+<li>
+<strong>是否有 <code>maxCalls</code> / <code>maxRounds</code></strong>?
+      live case 必须有, 否则可能无限循环。
+    </li>
+<li>
+<strong>workspace 边界是否合理</strong>?
+      <code>initialFiles</code> 不能有 <code>..</code> 和绝对路径。
+    </li>
+<li>
+<strong>断言失败时 trace 是否能定位</strong>?
+      故意改错一处 (比如把 sentinel 改成 <code>LIVE-WRITE-002</code>),
+      跑一次, 看 trace 能不能立刻指出失败原因。
+    </li>
+</ol>
+<h2 id="decoupling">Eval 和 Agent 主循环如何解耦</h2>
+<p>
+  一个常被新同学问的问题: "为什么 agent.ts 不需要知道自己被测? 主循环
+  不写'如果是 test 就...' 的分支吗?"
+</p>
+<p>
+  答案: 如果完全不解耦, 跑 3 个项目驱动就会出 bug。
+  解耦靠<strong>依赖注入</strong>, 不靠 if 分支。
+</p>
+<h3>4 个注入点</h3>
+<p>
+  Eval 测试和真实运行共用同一个 <code>createAgent()</code>, 唯一区别是
+  注入的实例不同:
+</p>
+<table class="terms">
+<thead>
+<tr>
+<th>注入项</th>
+<th>真实运行</th>
+<th>Eval scripted</th>
+<th>Eval live</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>LLM client</td>
+<td><code>createLLMClient()</code></td>
+<td><code>createScriptedLLMClient()</code></td>
+<td><code>createLLMClient()</code> + <code>maxCalls</code> 限制</td>
+</tr>
+<tr>
+<td>Terminal</td>
+<td>真实 readline</td>
+<td><code>createScriptedTerminal()</code></td>
+<td>同真实</td>
+</tr>
+<tr>
+<td>Tool registry</td>
+<td>直接组装</td>
+<td><code>wrapToolRegistryForTrace()</code></td>
+<td>同真实</td>
+</tr>
+<tr>
+<td>workspace</td>
+<td>process.cwd()</td>
+<td>临时 mkdtemp 目录</td>
+<td>临时目录</td>
+</tr>
+</tbody>
+</table>
+<p>
+  Agent 主循环只看这 4 个注入的接口 (LLMClient / Terminal / ToolRegistry /
+  ProjectContext), 不知道也不关心是真实还是测试。
+  </p>
+<h3>常见反模式: if (isTest) ...</h3>
+<p>
+  有人想在 <code>agent.ts</code> 写:
+</p>
+<pre class="code-block"><code>// ❌ 错误: 污染主代码
+async function run(query: string) {
+  if (process.env.NODE_ENV === "test") {
+    // 跳过权限确认
+    // 跳过 compression
+    // 用 mock 工具
+  }
+  // ...
+}</code></pre>
+<p class="source-link"><a href="https://github.com/pingp76/swoopcode/blob/main/src/eval/drivers/learn-claude-code/in-process-driver.ts#L1" rel="noreferrer" target="_blank">GitHub · in-process driver 边界 (反例: 拒绝时 throw 会破坏 messages) (L1)</a></p>
+<p>
+  立刻坏 3 件事:
+  </p>
+<ol>
+<li>主代码和测试逻辑绑死, 改任何一边另一边都挂</li>
+<li>测试测的是"加 if 后的代码", 不是"真实代码"</li>
+<li>production 行为和 test 行为可能漂移, case 跑通但 prod 挂</li>
+</ol>
+<p>
+  <strong>正确做法</strong>: <code>createAgent()</code> 接收所有可注入实例,
+  测试时把"假实例" 传进去, 主代码 0 改动。
+</p>
+<h2 id="trap">反例梯度</h2>
 <div class="card card--trap">
-  <div class="card__head">
-    <span class="card__tag">新手错法 · A</span>
-  </div>
-  <div class="card__body">
-    <p><strong>常见错误:</strong>用真 LLM 跑 e2e 测试, 断言最终文本。</p>
-    <p><strong>为什么错:</strong>flaky + 贵 + 慢, CI 跑不起, 团队不信测试。</p>
-    <p>
-      <strong>正确做法:</strong>deterministic 主 + live 副 + judge 复盘,
-      三层各司其职。
+<div class="card__head">
+<span class="card__tag">新手错法 · A</span>
+</div>
+<div class="card__body">
+<p>
+<strong>常见错误:</strong> Eval Core 直接 <code>import { createAgent } from "../agent.js"</code>。
+    </p>
+<p>
+<strong>为什么错:</strong> runner 绑死当前项目内部类型, 换 driver 写不出;
+      case 写"transcript 事件 sequence = 5" 这种私有细节, 主代码改了全挂。
+    </p>
+<p>
+<strong>正确做法:</strong> Eval Core 只认识 <code>CodingAgentDriver</code> 接口,
+      当前项目实现藏在 <code>src/eval/drivers/learn-claude-code/</code> 下。
     </p>
-  </div>
 </div>
-
+</div>
 <div class="card card--trap">
-  <div class="card__head">
-    <span class="card__tag">中级错法 · B</span>
-  </div>
-  <div class="card__body">
-    <p><strong>常见错误:</strong>断言只断最终文本, 不断行为事实。</p>
-    <p>
-      <strong>为什么错:</strong>LLM 可能 hallucinate 出对的文本, 行为完全错。
+<div class="card__head">
+<span class="card__tag">中级错法 · B</span>
+</div>
+<div class="card__body">
+<p>
+<strong>常见错误:</strong> 跑 live case 不设 <code>maxCalls</code> / <code>maxRounds</code>。
     </p>
-    <p>
-      <strong>正确做法:</strong>断言走 Expectation 类型 (tool_called /
-      permission_asked / messages_at_least 等), 不只断 final_text。
+<p>
+<strong>为什么错:</strong> LLM 卡住或循环时, case 跑几分钟不退出, 阻塞 CI。
+    </p>
+<p>
+<strong>正确做法:</strong> 每个 live case 都设 <code>maxCalls</code> / <code>maxRounds</code> /
+      Vitest timeout (通常 30-60s)。
     </p>
-  </div>
 </div>
-
+</div>
 <div class="card card--trap">
-  <div class="card__head">
-    <span class="card__tag">高级错法 · C</span>
-  </div>
-  <div class="card__body">
-    <p>
-      <strong>常见错误:</strong>judge LLM 直接决定测试通过 / 失败, 写进 CI。
+<div class="card__head">
+<span class="card__tag">高级错法 · C</span>
+</div>
+<div class="card__body">
+<p>
+<strong>常见错误:</strong> Live case 写进用户真实 <code>~/.swoopcode</code>。
     </p>
-    <p>
-      <strong>为什么错:</strong>judge 自己也不稳定, 写进 CI 就把不确定性引入 CI,
-      团队还是不信测试。
+<p>
+<strong>为什么错:</strong> memory / task / schedule 污染用户数据, 第二次跑
+      还会读到上次结果, 假阳性。
     </p>
-    <p>
-      <strong>正确做法:</strong>judge 输出只给开发者看, 不写进 CI; CI 只跑
-      deterministic + live (opt-in)。
+<p>
+<strong>正确做法:</strong> <code>tools.kind = "full"</code> 必须配
+      <code>agentHome: "temp"</code>; case 结束后清空临时目录。
     </p>
-  </div>
 </div>
-
+</div>
 <div class="card card--trap">
-  <div class="card__head">
-    <span class="card__tag">边界错法 · D</span>
-  </div>
-  <div class="card__body">
-    <p>
-      <strong>常见错误:</strong>trace 只记录"成功路径", 不记录"被 permission
-      拒绝的工具调用"。
+<div class="card__head">
+<span class="card__tag">边界错法 · D</span>
+</div>
+<div class="card__body">
+<p>
+<strong>常见错误:</strong> Judge 失败时让 case 通过, 或者让 judge 覆盖 hard 失败。
     </p>
-    <p>
-      <strong>为什么错:</strong>调试时不知道"为什么 agent 没做某件事", 可能
-      permission 拒了但 trace 没记录。
+<p>
+<strong>为什么错:</strong> judge LLM 也有不确定性, 让 judge 覆盖硬规则等于
+      把模型不确定性放回主路径, 整个测试失去意义。
     </p>
-    <p>
-      <strong>正确做法:</strong>trace 记录所有事件, 包括 permission decision
-      (allow / ask / deny), judge 反馈能基于完整 trace。
+<p>
+<strong>正确做法:</strong> 顺序固定: hard 先跑, hard 失败 case 直接 failed;
+      judge 单独评分, judge 失败 case 也标 failed (报告里加 <code>(judge failed)</code> 区分)。
     </p>
-  </div>
 </div>
-
+</div>
 <h2 id="validate">如何验证 (本章 Validation 卡片)</h2>
 <div class="card card--validation">
-  <div class="card__head">
-    <span class="card__tag">Validation · eval 专题</span>
-  </div>
-  <div class="card__body">
-    <p>
-      <strong>deterministic 1000 次结果一致:</strong>同一 scenario 跑 1000 次,
-      断言 trace 除了时间戳外完全一致 (deepEqual)。
+<div class="card__head">
+<span class="card__tag">Validation · 专题 B</span>
+</div>
+<div class="card__body">
+<p>
+<strong>Eval Core 不知道 driver 内部:</strong> 在 <code>src/eval/core/</code> 下
+      <code>grep -n 'createAgent\|agent.ts\|tasks.ts\|memory.ts'</code> 应当 0 行。
     </p>
-    <p>
-      <strong>trace 落盘 + 回放:</strong>saveTrace + loadTrace 之后, 跑 driver
-      回放, 断言产出的 trace 与原 trace 一致。
+<p>
+<strong>deterministic suite 默认跑通:</strong> <code>npm run test:eval</code> 不用
+      任何 env, 通过所有 scripted case。
     </p>
-    <p>
-      <strong>Expectation 覆盖行为事实:</strong>scenario 期望"至少调 1 次
-      read_file", trace 里 tool_called 事件 ≥ 1。
+<p>
+<strong>Live case 默认 skip:</strong> 不设 <code>EVAL_LIVE*</code> 时,
+      <code>npx vitest run src/eval/live/</code> 全部 skipped, 不应 failed。
     </p>
-    <p>
-      <strong>CI 不依赖真 LLM:</strong>CI 配置里没有 ANTHROPIC_API_KEY
-      等环境变量, 跑测试不需要任何 LLM 凭证。
+<p>
+<strong>trace 不写仓库目录:</strong> 不设 <code>EVAL_TRACE_DIR</code> 时,
+      <code>git status</code> 不应出现 trace JSON。
     </p>
-    <p>
-      <strong>judge 不写进 CI:</strong>judge 命令行工具存在, 但 CI 脚本不调用。
+<p>
+<strong>临时 workspace 隔离:</strong> <code>tools.kind = "full"</code> 配
+      <code>agentHome: "temp"</code> 时, 跑完 case 后 <code>~/.swoopcode</code>
+      不应被改动 (可用 stat 时间戳验证)。
+    </p>
+<p>
+<strong>judge 失败标记:</strong> mock 一个返回非 JSON 的 judge LLM,
+      case 标 <code>failed</code> 且 report 出现 <code>(judge failed)</code>。
     </p>
-  </div>
 </div>
-
-<h2 id="lookback">回望第 00–15 章: 哪些原则在 eval 中兑现了</h2>
+</div>
+<h2 id="lookback">回望第 00–15 章: 哪些原则在本章兑现了</h2>
 <ul>
-  <li>
-    <strong>fake LLM Cookbook 复用:</strong>第 01 章的 fake LLM 在 eval
-    层直接复用, 不重新发明。
-  </li>
-  <li>
-    <strong>事实与视图分离:</strong>trace 是事实, report 是视图, judge
-    反馈是另一个视图。三者职责清晰。
-  </li>
-  <li>
-    <strong>防自欺原则:</strong>eval 的核心就是"用 trace 取代肉眼判断",
-    直接兑现第 00 章的红灯 1。
-  </li>
-  <li>
-    <strong>append-only 原则:</strong>trace 是 append-only 事件流, 不修改不回放,
-    eval 重放是基于已存 trace 重现, 不修改 trace。
-  </li>
+<li>
+<strong>Composition Root 唯一感知 driver:</strong> 第 00 章 composition root
+      模式让 Eval Core 通过 driver 边界解耦, 不直接依赖 <code>createAgent()</code>。
+    </li>
+<li>
+<strong>依赖注入</strong>: scripted LLM / scripted terminal / trace wrapper
+      全是注入, 没有在 <code>agent.ts</code> 里写"如果是 test 就..." 分支。
+      跟第 04 章工厂模式 + 第 07 章 permission 注入一致。
+    </li>
+<li>
+<strong>Stable system prompt 优先</strong>: judge prompt 不进 agent system prompt,
+      eval 测试不污染主 agent 行为。
+    </li>
+<li>
+<strong>失败可观测</strong>: trace JSON 含完整 runtime events + assertion
+      evidence, 跟第 11 章 recovery + transcript 事件流一脉相承。
+    </li>
 </ul>
-
 <h2 id="forward">前瞻张力: 留给后续章节</h2>
 <dl class="defs">
-  <dt>eval 跨模型对比</dt>
-  <dd>
-    同一 scenario 跑 Claude / GPT / Gemini, 收集 trace 对比, 反馈给 model-policy
-    专题章。
-  </dd>
-  <dt>eval 反馈 prompt 优化</dt>
-  <dd>
-    judge 反馈自动汇总成"Prompt Card 改进建议", 跑出一份 diff, 人工 review
-    后合并。
-  </dd>
-  <dt>eval 长期积累</dt>
-  <dd>
-    跑过的 scenario 和 trace 沉淀为 regression suite, 后续章节改动时跑全量,
-    看是否破坏老行为。
-  </dd>
-  <dt>eval 性能测试</dt>
-  <dd>
-    trace 里记录 token 用量 / 工具调用次数 / 压缩触发次数, 跑性能 regression。
-  </dd>
+<dt>真实 MCP runtime 接入</dt>
+<dd>
+      当前 MCP harness 原型只能跑 fixture server。等真实 MCP SDK + transport
+      落地后, fixture harness 可平滑切换到真实 server 路径。
+    </dd>
+<dt>真实 Agent Team runtime 接入</dt>
+<dd>
+      当前顺序 supervisor 是简化的拓扑, 不能覆盖并行调度、动态成员、消息总线。
+      等真实 Team runtime 落地后, <code>team-driver.ts</code> 可换为真实 runtime adapter。
+    </dd>
+<dt>Eval 结果驱动开发</dt>
+<dd>
+      当前 eval 是验证工具。未来可以让 eval 失败自动开 issue, 或把 eval
+      report 直接喂给下个版本的开发 plan。
+    </dd>
 </dl>
-
-<h2 id="vibe-coding-eval">本次如何 vibe code: eval 专题的三件套</h2>
-
-<h3 id="vibe-feed-eval">拆卡: 4 轮迭代的具体产物</h3>
+<h2 id="prompt-card">Prompt Card (本章任务)</h2>
+<div class="card card--prompt">
+<div class="card__head">
+<span class="card__tag">Prompt Card · 专题 B</span>
+<button class="card__copy" data-copy-card="" type="button">复制</button>
+</div>
+<div class="card__body">
+<p>
+<strong>目标:</strong> 给 harness 加一套测试不确定系统的方法论, 让 runtime
+      不确定性和模型不确定性分开测, 切测试范围 = 换 driver / 换 LLM plan,
+      不改 case 主体。
+    </p>
+<p>
+<strong>场景:</strong> 项目主循环大改后, 跑 <code>npm run test:eval</code>
+      看 scripted suite 是否挂; 发布前跑
+      <code>EVAL_LIVE_REGRESSION=1</code> + <code>EVAL_JUDGE=1</code>
+      看 core tools 真实 LLM 行为是否退化; 复杂工具系统变更后跑
+      <code>EVAL_LIVE_FULL=1</code> 看 TODO/Memory/Skill/SubAgent 是否正常。
+    </p>
+<p>
+<strong>必须实现的能力 (功能导向, 不限定代码结构):</strong></p>
+<ul>
+<li>
+<strong>中立 driver 边界</strong>
+<ul>
+<li>eval runner 只认识 <code>CodingAgentDriver</code> 接口 (startCase / send / readEvents / close)</li>
+<li>runner 不直接 import harness 主代码</li>
+<li>换被测对象 = 换 driver 实现, runner 不动</li>
+</ul>
+</li>
+<li>
+<strong>4 层梯度</strong>
+<ul>
+<li>deterministic (scripted LLM, 默认 CI 跑)</li>
+<li>replay (fixture 文件, 默认 CI 跑)</li>
+<li>live smoke (真实 LLM, 显式 opt-in)</li>
+<li>live regression (真实 LLM + hard 断言 + 可选 judge, 显式 opt-in)</li>
+</ul>
+</li>
+<li>
+<strong>断言分两类</strong>
+<ul>
+<li>portable: fileExists / fileContains / finalOutputContains / noWritesOutsideWorkspace (跨 driver 可用)</li>
+<li>instrumented: toolCalled / toolCallCount / permissionPromptShown (依赖 runtime events)</li>
+<li>case 主体尽量用 portable, instrumented 作为补充调试</li>
+</ul>
+</li>
+<li>
+<strong>case 结构</strong>
+<ul>
+<li>每个 case 有独立临时 workspace, 跑完默认清理</li>
+<li>full-tools case 强制用临时 <code>agentHome</code>, 不污染用户真实数据</li>
+<li>steps 数组复用同一 driver 实例, 支持多 turn 共享 context</li>
+<li>步骤级断言可挂 stepId, 验证"第一步只读, 第二步才写"</li>
+</ul>
+</li>
+<li>
+<strong>trace 是事实来源</strong>
+<ul>
+<li>case 失败时 trace 必须能告诉"哪一步、哪个工具、什么参数、什么结果"</li>
+<li>trace 不写仓库目录, 由 <code>EVAL_TRACE_DIR</code> 显式开启</li>
+<li>runtime events 标准化: tool_call / tool_result / llm_call / llm_response / permission_prompt / log / raw / driver_error</li>
+</ul>
+</li>
+<li>
+<strong>judge 边界</strong>
+<ul>
+<li>hard 断言先跑, hard 失败 case 直接 failed, 不再跑 judge</li>
+<li>judge 用单独的 LLM 调用, prompt 不进 agent system prompt</li>
+<li>judge 必须输出 JSON, 解析失败标 <code>judge_failed</code> 不影响 hard result</li>
+<li>judge 模型可与被测 agent 不同 (用 <code>JUDGE_MODEL</code> 覆盖)</li>
+</ul>
+</li>
+<li>
+<strong>live safety</strong>
+<ul>
+<li>所有 live suite 默认 skip, 显式 env 开启</li>
+<li>每个 live case 设 <code>maxCalls</code> / <code>maxRounds</code> / Vitest timeout</li>
+<li>live case 不用完整回复文本做 golden snapshot</li>
+<li>live case 只用临时 workspace + 临时 agentHome</li>
+</ul>
+</li>
+</ul>
+<p><strong>验证 (用 fake driver + scripted LLM, 逐条断言):</strong></p>
+<ul>
+<li>eval runner 只看 driver 接口, core 模块不 import harness 主代码</li>
+<li>deterministic case 用 scripted LLM, 跑通 hard 断言</li>
+<li>同一 case 多 step 复用 driver 实例, 共享 history</li>
+<li>trace JSON 含 runtime events + assertion evidence, 失败时能定位</li>
+<li>judge mock 返回非 JSON, case 标 failed 但不抛异常</li>
+<li>live case 默认 skip, 设 <code>EVAL_LIVE_REGRESSION=1</code> 才跑</li>
+</ul>
+</div>
+</div>
+<h2 id="practice">本章练习</h2>
 <ol>
-  <li>
-    <strong>第 1 轮 · 接口</strong>。让 LLM 给出 <code>Scenario</code> /
-    <code>Driver</code> / <code>Trace</code> / <code>Expectation</code> /
-    <code>Report</code> 五个 interface。本轮不写实现, 重点钉"deterministic 主 +
-    live 副 + judge 复盘"。
-  </li>
-  <li>
-    <strong>第 2 轮 · 接线</strong>。让 LLM 给出
-    <code>src/eval/</code> 目录结构, driver 是 stub (永远返回 passed=true)。本轮
-    review 重点: eval 层不修改 src/ 下任何实现。
+<li>
+    故意在 <code>runner.ts</code> 写 <code>import { createAgent } from "../agent.js"</code>,
+    跑测试, 看"Eval Core 不知道 driver 内部"是否抓到 (切 driver 后挂)。
   </li>
-  <li>
-    <strong>第 3 轮 · 边界</strong>。让 LLM 写 driver + trace + assertion 实现 +
-    1 个 example scenario。本轮 review 重点: trace 记录所有事件 (含 permission
-    decision), assertion 走 Expectation 类型不只断文本。
+<li>
+    写一个 live case 不设 <code>maxCalls</code>, 用一个会循环调工具的 scripted response 模拟, 跑测试, 看"Live case 默认 skip / maxCalls" 是否抓到 (超时挂)。
   </li>
-  <li>
-    <strong>第 4 轮 · 验证</strong>。让 LLM 写
-    <code>test/eval.test.ts</code>。本轮 review 重点: "deterministic 1000
-    次结果一致" 和 "trace 落盘 + 回放" 两条必须有 spy 验证。
+<li>
+    让 <code>tools.kind = "full"</code> 配 <code>agentHome</code> 指向真实路径, 跑测试, 看"临时 agentHome 隔离" 是否抓到 (写入真实数据)。
   </li>
 </ol>
+<h2 id="summary">本章小结</h2>
+<p>
+  测试 Coding Agent 的核心是<strong>把 3 类不确定性分开</strong>。
+  </p>
+<ul>
+<li>
+<strong>Runtime 正确性</strong> 用 deterministic scripted LLM 守住,
+      默认 CI 跑, 不依赖 API key。
+    </li>
+<li>
+<strong>工具系统能力</strong> 用 live regression 跑真实 LLM + hard 断言,
+      发布前 / 大改后触发, 默认 skip。
+    </li>
+<li>
+<strong>开放语义质量</strong> 用 judge 补充 hard 跑不到的部分, judge 失败
+      也标 failed 但不影响 hard result。
+    </li>
+</ul>
+<p>
+  4 层梯度 + 临时 workspace + 临时 agentHome + 中立 driver 边界 + 标准化
+  trace + 解析鲁棒的 judge = 一套"跑得通"和"跑得稳"都能验的方法论。
+  </p>
+<p>
+  MCP 和 Agent Team 也属于同一套方法论, 但当前仅作为 harness 原型
+  保留, 等真实 runtime 落地后再恢复 opt-in 运行。
+</p>
+<h2 id="how-to-write-eval-prompt-card">如何写好一个 eval case: 6 段 Prompt Card 模板</h2>
+<p>
+  读到这里, 你应该已经知道 eval 系统"能做什么"。 这一节讲"如果你想
+  自己 vibe 一组测试 case, 怎么写 Prompt Card"。
+  </p>
+<p>
+  这一节按第 00 章"6 段 Prompt Card 模板" 展开, 但专门针对 eval case
+  写。 你可以直接复制本节的整段 prompt 喂给大模型, 让它生成符合
+  harness 结构的 EvalCase 对象。
+  </p>
+<h3>第 1 段: 目标 (Goal)</h3>
+<p>
+  目标段回答 "我想让 LLM 帮我写什么测试"。 这一段必须含
+  <strong>测试边界</strong> (P0/P1/P2/P3 哪一层) 和 <strong>驱动边界</strong>
+  (scripted / replay / live / judge 哪几种)。
+  </p>
+<pre class="code-block"><code>// 差
+目标: 写一个 eval case 测试 run_write
 
-<h3 id="vibe-review-eval">Review: eval 专题专属 checklist</h3>
-<ol>
-  <li>
-    <strong>CI 不依赖真 LLM。</strong>验证: 跑
-    <code>npm run test</code> 不需要任何 API key。
-  </li>
-  <li>
-    <strong>trace 记录所有事件 (含 permission)。</strong>验证:
-    <code>grep -n 'permission_decision' src/eval/trace.ts</code> ≥ 1 行。
-  </li>
-  <li>
-    <strong>assertion 走 Expectation 类型。</strong>验证:
-    <code>grep -n 'final_text_contains' src/eval/assertion.ts</code>
-    应当是多个之一, 不应当是唯一断言类型。
-  </li>
-  <li>
-    <strong>judge 不写进 CI。</strong>验证: judge 命令行存在, 但 CI 配置 (例如
-    <code>.github/workflows/*.yml</code>) 不调用。
-  </li>
-  <li>
-    <strong>trace 落盘 + 回放。</strong>验证:
-    <code>saveTrace + loadTrace</code> 之后 driver 跑同一 scenario, trace 一致
-    (deepEqual)。
-  </li>
-</ol>
+// 改
+目标: 写一个 live regression case, 验证 agent 调 run_write 创建文件
+      时会嵌入稳定 sentinel, 跨 LLM 升级不退化</code></pre>
+<p>
+  <strong>关键反问</strong>: "这段 case 跑通后, 我能放心地说'这个能力不退化'
+  吗?" 如果不能, 说明目标没钉死。
+</p>
+<p class="source-link"><a href="https://github.com/pingp76/swoopcode/blob/main/src/eval/cases/deterministic.test.ts#L1" rel="noreferrer" target="_blank">GitHub · 现有 deterministic suite 模板 (L1)</a></p>
+<h3>第 2 段: 场景 (Scenario)</h3>
+<p>
+  场景段必须能讲一个具体用户故事。 一段对话能走通, 不是抽象描述。
+  </p>
+<pre class="code-block"><code>// 差
+场景: 测文件写入
 
-<h3 id="vibe-debug-eval">调试: eval 专题典型伪装</h3>
+// 改
+场景: 用户说 "Create reports/eval-contract.md with these exact lines:
+      case-id: LIVE-WRITE-001 / status: ready / owner: eval"。
+      agent 调 run_write 写文件, 我们验证:
+      1. 文件被创建
+      2. 3 行 sentinel 都在
+      3. 没写到 workspace 外</code></pre>
+<p>
+  <strong>关键反问</strong>: "一个真人读这段, 知道'用户说了什么 + 期望
+  什么' 吗?" 如果还要解释, 写得太抽象。
+</p>
+<p class="source-link"><a href="https://github.com/pingp76/swoopcode/blob/main/src/eval/live/live-regression-suite.test.ts#L1" rel="noreferrer" target="_blank">GitHub · 现有 live regression 案例 (L1)</a></p>
+<h3>第 3 段: 工具与步骤 (Tools &amp; Steps)</h3>
+<p>
+  这一段声明 case 跑什么 LLM 计划、用什么工具集、走几步。
+  </p>
+<pre class="code-block"><code>// 改
+工具与步骤:
+  driver:
+    kind: learn-claude-code-in-process
+    llm: { kind: live, live: { maxCalls: 8 } }   // 真实 LLM, 限 8 次
+    tools: { kind: core }                         // 真实核心工具
+  steps:
+    - id: write
+      query: "Create reports/eval-contract.md with these exact lines:
+             case-id: LIVE-WRITE-001 ..."
+    # 单 step, 一次写完</code></pre>
+<p>
+  3 个必须钉死的字段:
+</p>
 <ol>
-  <li>
-    <strong>伪装 A · 跑真 LLM 写进 CI。</strong>症状: CI 跑测试时要求
-    ANTHROPIC_API_KEY。验证:
-    <code>unset ANTHROPIC_API_KEY; npm test</code> 应当全部通过。
-  </li>
-  <li>
-    <strong>伪装 B · assertion 只断最终文本。</strong>症状: 期望 "agent
-    应该读文件" 写成 <code>final_text_contains("createAgent")</code>。验证:
-    写一个 LLM 不读文件但输出含 "createAgent" 的 scenario, deterministic
-    测试应当失败。
-  </li>
-  <li>
-    <strong>伪装 C · judge 写进 CI 自动判通过 / 失败。</strong>症状: CI 跑
-    <code>npm run eval:judge</code> 自动判通过。验证: 删掉 CI 配置里的 judge
-    调用, 仍然全部通过, judge 只在本地 opt-in。
-  </li>
+<li>
+<strong>LLM 计划</strong>: scripted / replay / live 之一。 live 必须设
+      <code>maxCalls</code>, 防止 LLM 无限循环。
+    </li>
+<li>
+<strong>工具集</strong>: fake / core / full 之一, 显式声明
+      <code>agentHome: "temp"</code> (full 时), 防止污染用户数据。
+    </li>
+<li>
+<strong>步骤数</strong>: 单 step (单轮) 还是多 step (多轮共享 context)。
+      多 step 必填 <code>id</code> 字段, 给 step 级断言用。
+    </li>
 </ol>
+<p>
+  <strong>关键反问</strong>: "LLM 写错 / 写多 / 写少一步, 都会影响断言结果吗?"
+  如果是, 步骤数要明确。
+</p>
+<p class="source-link"><a href="https://github.com/pingp76/swoopcode/blob/main/src/eval/core/case-schema.ts#L268" rel="noreferrer" target="_blank">GitHub · EvalCase 类型定义 (L268)</a></p>
+<h3>第 4 段: 边界 (Checklist)</h3>
+<p>
+  边界段是 "LLM 必须遵守的硬规则", 每条<strong>可验证</strong>。
+  </p>
+<pre class="code-block"><code>// 差
+边界: 代码要清晰
 
-<h3 id="vibe-iterate-eval">迭代: eval 专题 4 个 commit 节点</h3>
+// 改
+边界 (LLM 必须遵守的 checklist):
+  - 必须用 run_write 工具, 不允许 run_edit 或 run_bash "echo &gt;" 替代
+  - 写入路径必须是 reports/eval-contract.md, 不允许其他路径
+  - 文件必须含 3 行精确 sentinel: case-id: LIVE-WRITE-001 /
+    status: ready / owner: eval
+  - 跨模型升级后行为不退化 (用 fileContains 验证, 不用 golden snapshot)
+  - maxCalls=8, 跑超 8 次自动 fail, 避免烧钱</code></pre>
+<p>
+  5 条边界全部可断言:
+  </p>
 <ol>
-  <li>
-    <code
-      >feat(eval): 钉 Scenario / Driver / Trace / Expectation / Report
-      接口</code
-    >
-    —— tsc 通过, 无实现。
-  </li>
-  <li>
-    <code
-      >feat(eval): createDriver + createTraceRecorder + 1 个 example scenario
-      stub</code
-    >
-    —— tsc 通过, driver 永远 passed=true。
-  </li>
-  <li>
-    <code
-      >feat(eval): driver 跑 scenario + trace 落盘 + assertion 走
-      Expectation</code
-    >
-    —— 跑通 Validation 卡片前 3 条。
-  </li>
-  <li>
-    <code
-      >test(eval): 1000 次 deterministic 一致 + trace 回放 + CI 不依赖 API
-      key</code
-    >
-    —— 全绿。
-  </li>
+<li>必须用哪个工具 → 验证 <code>toolCalled</code></li>
+<li>路径限制 → 验证 <code>fileContains</code></li>
+<li>sentinel 内容 → 验证 <code>fileContains(text="case-id: LIVE-WRITE-001")</code></li>
+<li>不用 golden snapshot → 写断言时<strong>故意</strong>避免字符串相等</li>
+<li>maxCalls 上限 → 验证 case 跑超 8 次 fail</li>
 </ol>
+<p>
+  <strong>关键反问</strong>: "这 5 条每条都能在 vitest 里写一条 <code>expect()</code> 吗?"
+  如果某条不能, 那就是空话, 删掉。
+</p>
+<p class="source-link"><a href="https://github.com/pingp76/swoopcode/blob/main/src/eval/core/assertions.ts#L35" rel="noreferrer" target="_blank">GitHub · runAssertions 断言执行器 (L35)</a></p>
+<h3>第 5 段: 验证 (Verification)</h3>
+<p>
+  验证段是"用 fake LLM + fake tool 逐条断言"。 至少 4 条, 每条对应
+  边界段的一条。
+  </p>
+<pre class="code-block"><code>// 改
+验证 (用 fake LLM + vitest, 逐条断言):
+  - allStepsCompleted (case 跑完一轮 step)
+  - toolCalled("run_write") (验证用了对的工具)
+  - fileExists("reports/eval-contract.md") (验证文件被创建)
+  - fileContains("reports/eval-contract.md", "case-id: LIVE-WRITE-001") (验证 sentinel)
+  - fileContains("reports/eval-contract.md", "status: ready")
+  - fileContains("reports/eval-contract.md", "owner: eval")
+  - noWritesOutsideWorkspace (验证没写到 workspace 外)
+  - allToolsSucceeded (验证工具没返回 error)</code></pre>
+<p>
+  8 条断言, 6 个 portable (fileExists / fileContains / noWrites / allTools) +
+  2 个 instrumented (toolCalled / allStepsCompleted)。
+  </p>
+<p>
+  <strong>关键反问</strong>: "如果我故意改一行 sentinel (把 LIVE-WRITE-001 改成
+  LIVE-WRITE-002), 这条断言会立刻挂吗?" 如果不会, 断言设计错了。
+</p>
+<p class="source-link"><a href="https://github.com/pingp76/swoopcode/blob/main/src/eval/live/live-regression-suite.test.ts#L1" rel="noreferrer" target="_blank">GitHub · 完整 live regression 案例参考 (L1)</a></p>
+<h3>第 6 段: 整套 Prompt (可复制粘贴)</h3>
+<p>
+  把前 5 段拼一起, 就是直接喂给大模型的 prompt。 完整模板:
+</p>
+<pre class="code-block"><code>帮我写一个 eval case, 目标如下:
 
-<h2 id="prompt-card">Prompt Card (本章任务)</h2>
-<div class="card card--prompt">
-  <div class="card__head">
-    <span class="card__tag">Prompt Card · eval 专题</span>
-    <button class="card__copy" type="button" data-copy-card>复制</button>
-  </div>
-  <div class="card__body">
-    <p>
-      <strong>目标:</strong>实现 eval 层, 用 deterministic 主 + live 副 + judge
-      复盘三层策略, 测试不确定的 Coding Agent。
-    </p>
-    <p>
-      <strong>场景:</strong>开发者跑一个 scenario "agent 应该读文件后回答",
-      deterministic 测试断言 trace 含 read_file 工具调用 + messages 末尾是
-      assistant 消息。跑 1000 次结果一致。
-    </p>
-    <p>
-      <strong>模块:</strong> <code>src/eval/driver.ts</code> (新) 跑 scenario;
-      <code>src/eval/trace.ts</code> (新) 记录行为;
-      <code>src/eval/assertion.ts</code> (新) Expectation 类型;
-      <code>src/eval/judge.ts</code> (新, 不进 CI) judge LLM 反馈。
-    </p>
-    <p><strong>边界 (LLM 必须遵守的 checklist):</strong></p>
-    <ul>
-      <li>CI 不依赖真 LLM, 不读环境变量里的 API key</li>
-      <li>
-        trace 记录所有事件, 含 permission_decision / tool_call /
-        reminder_injected / compaction
-      </li>
-      <li>assertion 走 Expectation 类型, 不只断 final_text</li>
-      <li>judge 不写进 CI, 只本地 opt-in</li>
-      <li>trace 落盘走 atomic write, 可回放</li>
-    </ul>
-    <p><strong>验证 (用 fake LLM + spy, 逐条落到 vitest):</strong></p>
-    <ul>
-      <li>同一 scenario 跑 1000 次, trace 除了时间戳完全一致 (deepEqual)</li>
-      <li>
-        saveTrace + loadTrace 后, driver 跑同一 scenario, 产出 trace 与原 trace
-        一致
-      </li>
-      <li>
-        scenario 期望 "至少调 1 次 read_file", trace 里 tool_called 事件 ≥ 1
-      </li>
-      <li>unset ANTHROPIC_API_KEY 后 npm test 全部通过</li>
-      <li>CI 脚本不调用 judge, judge 仅本地 opt-in</li>
-    </ul>
-  </div>
-</div>
+目标: 写一个 live regression case, 验证 agent 调 run_write 创建文件
+      时会嵌入稳定 sentinel, 跨 LLM 升级不退化
 
-<h2 id="practice">本章练习</h2>
-<ol>
-  <li>
-    故意把 CI 配置里加 ANTHROPIC_API_KEY 依赖, 跑测试, 看"CI 不依赖真 LLM"
-    是否抓到。
-  </li>
-  <li>
-    把 assertion 改成只断 final_text, 跑测试, 看"Expectation 覆盖行为事实"
-    是否抓到 (写一个 hallucinate 的 scenario)。
-  </li>
-  <li>
-    把 judge 写进 CI, 跑测试, 看"judge 不写进 CI" 是否抓到 (CI 配置 grep 验证)。
-  </li>
-</ol>
+场景: 用户说 "Create reports/eval-contract.md with these exact lines:
+      case-id: LIVE-WRITE-001 / status: ready / owner: eval"。
+      agent 调 run_write 写文件, 我们验证文件被创建、3 行 sentinel 都在、
+      没写到 workspace 外。
 
-<h2 id="summary">本章小结</h2>
+工具与步骤:
+  driver: { kind: "learn-claude-code-in-process",
+            llm: { kind: "live", live: { maxCalls: 8 } },
+            tools: { kind: "core" } }
+  steps: [{ query: "Create reports/eval-contract.md with these exact lines: ..." }]
+
+边界 (LLM 必须遵守的 checklist):
+  - 必须用 run_write 工具, 不允许 run_edit / run_bash 替代
+  - 路径必须是 reports/eval-contract.md
+  - 文件必须含 3 行精确 sentinel
+  - 跨 LLM 升级行为不退化
+  - maxCalls=8
+
+验证 (用 fake LLM + vitest):
+  - allStepsCompleted
+  - toolCalled("run_write")
+  - fileExists("reports/eval-contract.md")
+  - 3 条 fileContains 验证 sentinel
+  - noWritesOutsideWorkspace
+  - allToolsSucceeded
+
+输出: 完整的 EvalCase 对象 (TypeScript 代码), 放在 src/eval/live/ 下
+       新建一个 .test.ts 文件, 命名 live-core-write-report-with-sentinels.test.ts。
+       注释里说明 6 段 Prompt Card 每一段对应的字段。</code></pre>
+<p>
+  把这段 prompt 复制给大模型, 大模型会按 6 段对应字段生成 EvalCase
+  对象 + vitest 测试。 你的 review checklist 跟着 6 段走, 每段对应一个
+  review 点。
+  </p>
+<h3>3 个常见 prompt card 错误</h3>
+<p>
+  学生 vibe eval case 时最容易踩的 3 个坑:
+</p>
+<dl class="defs">
+<dt>错误 1 · 边界用空话, 不可验证</dt>
+<dd>
+      "代码要清晰" / "跑得通" / "边界要严" — 这种 prompt 让 LLM 写
+      出来"看起来对" 但实际跑不过测试。 每条边界必须能转成一条
+      <code>expect()</code>。
+    </dd>
+<dt>错误 2 · 断言依赖措辞 (golden snapshot)</dt>
+<dd>
+      写 "<code>finalOutputContains("看起来很对")</code>" 这种断言,
+      真实 LLM 升级后措辞变化就挂。 改用 sentinel 短字符串:
+      <code>finalOutputContains("LIVE-WRITE-001")</code>。
+    </dd>
+<dt>错误 3 · 写完 case 不跑 dry run</dt>
+<dd>
+      LLM 生成 case 后直接进 live suite, 跑超 maxCalls / fixture
+      文件读不到 / 工作区边界违规, 排查 1 小时。 正确流程: scripted LLM
+      dry run 一次 (5 秒) → fake driver 跑通 (3 秒) → 上 live (1 分钟)。
+    </dd>
+</dl>
+<h3>把这节当 checklist 用</h3>
 <p>
-  本专题给 harness 加了 eval 层, 用三层策略测试不确定的 Coding Agent:
-  deterministic 主 (CI 必跑, 跑 fake LLM, 0 token 成本) + live 副 (本地 opt-in,
-  跑真 LLM, 中等成本) + judge 复盘 (不进 CI, 单独 review, 高成本)。trace 是核心,
-  记录所有行为事件 (含 permission decision), 可落盘可回放。 assertion 走
-  Expectation 类型, 不只断最终文本。
+  写完 prompt card 后, 用这 5 个反问自检:
 </p>
+<ol>
+<li>
+<strong>目标段</strong>: "跑通后, 我能说'这个能力不退化' 吗?" 不行就重写。
+    </li>
+<li>
+<strong>场景段</strong>: 真人读完, 知道"用户说什么 + 期望什么" 吗? 不知道就
+    加具体例子。
+    </li>
+<li>
+<strong>工具步骤</strong>: 钉死 LLM plan + 工具集 + 步骤数, 别留模糊。
+    </li>
+<li>
+<strong>边界段</strong>: 每条边界都能写一条 <code>expect()</code> 吗? 不能就
+    删或重写。
+    </li>
+<li>
+<strong>验证段</strong>: 故意改 1 个 sentinel, 立刻挂吗? 不挂就断言写错了。
+    </li>
+</ol>
+<p>
+  5 条都过了, 你的 prompt card 就可以喂给大模型 vibe 出来一个
+  "跑得通、跑得稳" 的 eval case。
+  </p>
+<p class="source-link"><a href="https://github.com/pingp76/swoopcode/blob/main/src/eval/README.md#L1" rel="noreferrer" target="_blank">GitHub · eval 系统使用文档 (L1)</a></p>
+<h2 id="next">下一章伏笔</h2>
+<p>
+  专题 B 解决了"测试不确定系统"。剩下两个问题留给后续:
+  <strong>Reference 查阅页</strong>汇总术语表 / Prompt Pack / 验证手册,
+  供学生在写 case / 调 harness 时快速查; <strong>网页版教程</strong>继续
+  沿主线 / 专题 / Reference 三栏展开, 让学生可以离线查阅整条教学叙事。
+  </p>
+</content>
\ No newline at end of file
diff --git a/tutorial/chapters/model-policy.html b/tutorial/chapters/model-policy.html
index 34f8c50..65dd114 100644
--- a/tutorial/chapters/model-policy.html
+++ b/tutorial/chapters/model-policy.html
@@ -1,573 +1,672 @@
-<p class="article__eyebrow">专题 A · 模型差异</p>
-<h1 class="article__title">不同大模型不是只换模型名</h1>
+<p class="article__eyebrow">专题 A · LLM 选型策略层</p>
+<h1 class="article__title">Model Policy: 任务类型 → Model 推荐的策略层</h1>
 <p class="article__lede">
-  Claude / GPT / Gemini 在 tool call 协议、cache 边界、错误码、输出
-  截断规则上都不一样。本专题讲 harness 应当如何吸收这些差异, 不让模型
-  切换变成"改十几处代码" 的体力活。
+  第 15 章提过 3 个 LLM provider (OpenAI / Anthropic / 自部署 vLLM)
+  + 5 个常见 model, 但 harness 怎么知道"这个任务该用哪个 model"?
+  用户让 LLM 跑"5 天迁移" 该用 GPT-4o (质量) 还是 Qwen2.5-7B
+  (成本)? 跑"5 分钟 git status" 该用 Claude Haiku (快) 还是
+  Claude Opus (贵)? 选错就是几倍账单浪费或质量崩盘。 这一专题
+  加 <strong>Model Policy 策略层</strong>: 5 种<strong>任务分类</strong>
+  (long_running / short_interactive / bulk_summarize / complex_reasoning
+  / experimental) → 5 种<strong>推荐 model</strong>, 配
+  <code>--model-policy auto|fast|quality|budget</code> 4 种<strong>策略</strong>
+  切换, 用户在质量 / 速度 / 成本间选边。 读完后, 你能讲清"为什么
+  不能 1 个 model 跑所有任务" + "auto 策略怎么工作" + "怎么扩展
+  第 6 个 model"。
 </p>
-
-<nav id="article-inline-toc" class="article__meta" aria-label="页内小节"></nav>
-
-<hr class="rule" />
-
-<h2 id="delta-from-15">在第 15 章基础上改了什么</h2>
-<p>
-  这一章不修改 harness 主结构, 而是在 <code>src/llm-adapter.ts</code> 下
-  收口所有模型差异。LLM Provider 维度拆 4 个 adapter (Anthropic / OpenAI /
-  Google / 本地兼容), 每个 adapter 实现 <code>LLMClient</code> interface,
-  把模型差异 (tool call 协议 / cache 边界 / 错误码 / 截断 规则) 收口在 adapter
-  内部。Composition Root 通过 <code>Config</code>
-  字段选 adapter, harness 其他部分完全不感知。
-</p>
-<div class="source-links" aria-label="本章 GitHub 永久链接">
-  <a
-    class="source-link"
-    href="https://github.com/pingp76/swoopcode/blob/main/src/llm-adapter.ts"
-    target="_blank"
-    rel="noreferrer"
-    >1. src/llm-adapter.ts: 模型差异适配</a
-  >
-  <a
-    class="source-link"
-    href="https://github.com/pingp76/swoopcode/blob/main/src/llm-providers.ts"
-    target="_blank"
-    rel="noreferrer"
-    >2. src/llm-providers.ts: provider 解析</a
-  >
-  <a
-    class="source-link"
-    href="https://github.com/pingp76/swoopcode/blob/main/src/foundation-models.ts"
-    target="_blank"
-    rel="noreferrer"
-    >3. src/foundation-models.ts: 基座模型画像</a
-  >
+<nav aria-label="页内小节" class="article__meta" id="article-inline-toc"></nav>
+<hr class="rule"/>
+<h2 id="real-failure">真实失败故事: 5 分钟任务跑了 2 小时, 账单 ×10</h2>
+<p>
+  写代码之前, 先看一个真实痛点。 用户的 team 跑 1 个月 harness,
+  发现账单异常 — 明明只是跑"git status" 这种 5 分钟任务, 实际
+  账单 ×10。
+</p>
+<ol>
+<li>
+<strong>症状 1: 大材小用</strong>: 用户跑简单的"扫一下 src/ 找 TODO"
+    任务, harness 默认用 GPT-4o (贵的 model), 5 分钟任务花
+    $0.50, 实际 Qwen2.5-7B 跑同样任务 30 秒花 $0.005。 100 倍
+    浪费。
+  </li>
+<li>
+<strong>症状 2: 小材大用</strong>: 用户跑"5 天迁移 500 文件" 用
+    Qwen2.5-7B (便宜的 model), 跑 1 周, 质量崩盘, 70% 转换
+    错误, 团队 leader 花了 2 周手动修正。 实际该用 GPT-4o 跑 1 天
+    质量好。
+  </li>
+<li>
+<strong>症状 3: 切换靠人</strong>: team 想"小任务用便宜, 大任务
+    用贵", 每次手动改 <code>LLM_MODEL=...</code>, 容易忘, 出现
+    "昨天我手动改了, 今天又跑贵 model" 的混乱。
+  </li>
+<li>
+<strong>真问题</strong>: 缺少<strong>Model Policy 策略层</strong>
+  — harness 知道任务类型 (短 / 长 / 简单 / 复杂), 应该自动选
+    合适 model, 不该让用户每次手动配。 同时给用户 4 种
+    <strong>策略</strong> (auto / fast / quality / budget) 切
+    偏好, 满足"我今天要省钱" / "我今天要质量" 的临时切换。
+  </li>
+</ol>
+<p>
+  朴素想法 1: "用最贵的 model 跑所有任务?" 错。 大材小用, 账单
+  ×100, 简单任务浪费钱。 该用便宜的快 model。
+</p>
+<p>
+  朴素想法 2: "用最便宜的 model 跑所有任务?" 错。 小材大用,
+  质量崩盘, 团队返工, 实际更贵。 该用贵的慢 model 做关键任务。
+</p>
+<p>
+  正确做法: 加 <strong>Model Policy</strong> 策略层 —
+  harness 内部按<strong>任务类型</strong>分类 (5 种), 每种推荐
+  一个 model, 用户在 4 种<strong>策略</strong> (auto / fast /
+  quality / budget) 切偏好, 临时改变单次任务的 model。 这是
+  Reference 章节 "模式 8 · Strategy 策略模式" + "模式 4 ·
+  依赖注入" 的联合应用。
+</p>
+<h2 id="five-task-types">5 种任务类型: 决定 model 推荐</h2>
+<p>
+  <strong>用途</strong>: 不是所有任务都用同一种 model, 任务本身
+  有 5 种<strong>典型特征</strong>: 跑多长 / 多频繁 / 多复杂 /
+  多容错 / 多便宜。 按特征选 model 才是合理。
+</p>
+<p>
+  <strong>真实场景</strong>: 用户让 LLM 跑"5 天迁移 500 文件" 是
+  <strong>long_running</strong> (跑 1 周, 不能崩, 质量优先);
+  "5 分钟 git status" 是 <strong>short_interactive</strong>
+  (快进快出, 速度优先); "1 小时扫 100 个 PR 摘要" 是
+  <strong>bulk_summarize</strong> (大批量, 成本优先); "半天设计
+  复杂算法" 是 <strong>complex_reasoning</strong> (高难度, 质量
+  优先); "试试新模型的 beta 功能" 是 <strong>experimental</strong>
+  (新东西, 容错优先)。
+</p>
+<p>
+  <strong>设计思想</strong>: <strong>特征分类 + 静态枚举</strong> —
+  5 种是固定枚举, 不允许"我自己加第 6 种" (保持简单); 每种
+  配推荐 model, 但允许策略 (auto/fast/quality/budget) 覆盖。 看
+  任务类型定义:
+</p>
+<div class="figure figure--stack">
+  <div class="figure__title">图 1 · 5 种任务类型的边界</div>
+  <div class="flow-stack">
+    <div class="flow-stack__layer flow-stack__layer--stable">
+      <div class="flow-stack__label">long_running · 长时间任务</div>
+      <div class="flow-stack__body">跑 1 小时以上, 不能崩, 质量优先。 推荐: GPT-4o / Claude Sonnet (高质量 + 稳定)。 例子: 5 天迁移, 全项目扫描。</div>
+    </div>
+    <div class="flow-stack__layer flow-stack__layer--dynamic">
+      <div class="flow-stack__label">short_interactive · 短对话</div>
+      <div class="flow-stack__body">跑 1-10 分钟, 快进快出, 速度优先。 推荐: Claude Haiku / GPT-4o-mini (快 + 便宜)。 例子: git status, ls, 单文件改。</div>
+    </div>
+    <div class="flow-stack__layer flow-stack__layer--dynamic">
+      <div class="flow-stack__label">bulk_summarize · 批量摘要</div>
+      <div class="flow-stack__body">跑 1 小时, 但任务重复, 成本优先。 推荐: Qwen2.5-7B / Llama-3.1-8B (本地便宜)。 例子: 100 个 PR 摘要, 1000 个文件总结。</div>
+    </div>
+    <div class="flow-stack__layer flow-stack__layer--stable">
+      <div class="flow-stack__label">complex_reasoning · 复杂推理</div>
+      <div class="flow-stack__body">跑 10-60 分钟, 难度高, 质量优先。 推荐: Claude Opus / o1 (最强推理)。 例子: 算法设计, 架构决策, 复杂 bug 修复。</div>
+    </div>
+    <div class="flow-stack__layer flow-stack__layer--warn">
+      <div class="flow-stack__label">experimental · 实验性任务</div>
+      <div class="flow-stack__body">用户主动选"试试新 model", 容错优先。 推荐: 最新 beta model (新功能多但可能崩)。 例子: "试用 GPT-5 预览版"。</div>
+    </div>
+  </div>
 </div>
+<p>
+  <strong>实现细节</strong>: 任务类型在调用方调 agent.run() 时
+  <strong>声明</strong> (传 <code>taskType</code> 参数), 不靠
+  harness 启发式猜测。 让用户 / 上层调度器 (Schedule / Async Run)
+  显式声明, 避免 "harness 猜错任务类型选错 model" 的尴尬。
+</p>
+<h2 id="four-strategies">4 种策略: 用户切偏好</h2>
+<p>
+  <strong>用途</strong>: 5 种任务类型 → 1 个推荐 model, 但用户
+  经常想"今天我就要省钱" / "今天我就要质量"。 提供 4 种
+  <strong>策略</strong> 切偏好, 临时改变 model 选择。
+</p>
+<p>
+  <strong>真实场景</strong>: team 平时用 auto (自动按任务类型选);
+  临时跑"周末实验" 切 fast (全用便宜 model 跑快); 跑"重要
+  demo" 切 quality (全用最贵 model); 预算紧的月切 budget
+  (全用最便宜 model, 质量可能差)。
+</p>
+<p>
+  <strong>设计思想</strong>: <strong>4 种策略 + 覆盖机制</strong> —
+  Reference 章节 "模式 8 · Strategy 策略模式" 的应用。 4 种
+  策略固定枚举, 不可新增第 5 种; 每种策略决定"对每种任务类型
+  选什么 model"。 看策略定义:
+</p>
+<div class="figure figure--stack">
+  <div class="figure__title">图 2 · 4 种策略对 5 种任务类型的覆盖</div>
+  <div class="flow-stack">
+    <div class="flow-stack__layer flow-stack__layer--stable">
+      <div class="flow-stack__label">auto · 自动 (默认)</div>
+      <div class="flow-stack__body">按任务类型用推荐 model。 long_running → GPT-4o; short_interactive → Haiku; bulk_summarize → Qwen; complex_reasoning → Opus; experimental → 用户的 LLM_MODEL 环境变量。</div>
+    </div>
+    <div class="flow-stack__layer flow-stack__layer--dynamic">
+      <div class="flow-stack__label">fast · 全快</div>
+      <div class="flow-stack__body">所有任务都用最快最便宜的 model。 全用 Claude Haiku / GPT-4o-mini。 速度快, 成本低, 质量可能差。</div>
+    </div>
+    <div class="flow-stack__layer flow-stack__layer--stable">
+      <div class="flow-stack__label">quality · 全质</div>
+      <div class="flow-stack__body">所有任务都用最贵最强的 model。 全用 Claude Opus / o1。 质量最高, 速度慢, 成本高。</div>
+    </div>
+    <div class="flow-stack__layer flow-stack__layer--warn">
+      <div class="flow-stack__label">budget · 全省</div>
+      <div class="flow-stack__body">所有任务都用最便宜的本地 model。 全用 Qwen2.5-7B / Llama-3.1-8B (本地 vLLM)。 成本最低, 质量可能崩。</div>
+    </div>
+  </div>
+</div>
+<p>
+  <strong>实现细节</strong>: 策略用 <code>--model-policy auto|fast|quality|budget</code>
+  CLI 切换, 也可以 <code>/model-policy show</code> 在 REPL 看当前
+  策略 + 5 种任务类型对应的 model。 策略选择后, 派生一个
+  <code>modelResolver(taskType) → modelName</code> 函数, agent
+  调 LLM 前查 resolver 选 model。
+</p>
+<h2 id="default-table">default 任务类型 → model 映射表</h2>
+<p>
+  <strong>用途</strong>: 5 种任务类型 → 5 个推荐 model 是<strong>默认
+  表</strong>, 写在 <code>src/execution-policy.ts</code> 的常量。
+  表是<strong>可改</strong>的 (新 model 出, 改表), 不是 hard
+  code 在 if/else 里。
+</p>
+<p>
+  <strong>真实场景</strong>: 2026 年 GPT-4o 是 long_running 推荐,
+  2027 年 GPT-5 出来后, 改 default 表 <code>long_running: "gpt-5"</code>,
+  所有 harness 升级后自动用 GPT-5, 不改 agent.ts。
+</p>
+<p>
+  <strong>设计思想</strong>: <strong>查表代替 if/else</strong> —
+  Reference 章节 "模式 8 · Strategy 策略模式" 的延伸, "查表"
+  是策略模式的具体实现。 看 execution-policy.ts 里的 default
+  表:
+</p>
+<pre><code class="language-typescript">// 5 种任务类型 → 推荐 model (default 表)
+const DEFAULT_MODEL_FOR_TASK: Record&lt;TaskType, ModelName&gt; = {
+  long_running: "gpt-4o",
+  short_interactive: "claude-haiku-4-5",
+  bulk_summarize: "qwen2.5-7b",
+  complex_reasoning: "claude-opus-4-5",
+  experimental: process.env.LLM_MODEL ?? "gpt-4o",
+};</code></pre>
+<p>
+  <strong>实现细节</strong>: 表是 readonly, agent.ts 通过
+  <code>createModelResolver({policy, table})</code> 拿 resolver,
+  resolver 内部按 (policy, taskType) 查表。 表改 = 改常量, 不
+  改 resolver。 这是 Reference 章节 "模式 5 · Composition Root
+  组合根" 的应用 — 表在 Composition Root 注入, 运行时不变。
+</p>
+<h2 id="auto-strategy">auto 策略: 怎么"自动"</h2>
+<p>
+  <strong>用途</strong>: auto 策略是<strong>默认</strong>, 任务类型
+  → model 的映射完全靠 default 表, 不靠运行时判断。 "自动" 的
+  含义是"用户不用选 model, harness 按任务类型自动选"。
+</p>
+<p>
+  <strong>真实场景</strong>: team 跑 10 个不同任务 (5 分钟 git /
+  1 小时迁移 / 30 分钟设计 / ...), 不是省略 <code>taskType</code> 让
+  harness 猜 (会猜错), 而是让上层 (Async Run / Schedule) 显式传。
+  显式声明, 不启发式猜测。
+</p>
+<p>
+  <strong>设计思想</strong>: <strong>声明式, 不启发式</strong> —
+  harness <strong>不启发式</strong>判断"这个任务是 long 还是
+  short", 启发式容易猜错; 任务类型由<strong>调用方</strong>显式
+  声明 (Async Run 默认 short_interactive, Schedule 默认
+  bulk_summarize, 用户手动 run 默认 short_interactive)。
+</p>
+<p>
+  <strong>实现细节</strong>: <code>agent.run({query, taskType})</code>
+  接收 taskType, 内部调 <code>modelResolver(taskType)</code> 选
+  model。 默认 taskType = "short_interactive" (用户最常见的
+  "短对话" 场景)。 想覆盖, 显式传 taskType。
+</p>
+<h2 id="fast-quality-budget">fast / quality / budget: 3 个 override</h2>
+<p>
+  <strong>用途</strong>: 3 个策略 (fast / quality / budget) 是
+  <strong>全 override</strong>, 不管任务类型, 全部任务用同一种
+  model (最快 / 最强 / 最便宜)。 适合"用户明确知道今天要什么"
+  的场景。
+</p>
+<p>
+  <strong>真实场景</strong>: team 周五下午, 老板说"今天剩下的
+  任务都用最快 model, 质量无所谓, 跑快就行", <code>--model-policy fast</code>
+  一下, 所有后续任务用 Claude Haiku。 周一早上切回 auto。
+</p>
+<p>
+  <strong>设计思想</strong>: <strong>3 个常量 model, 不查 default 表</strong> —
+  fast / quality / budget 各自对应 1 个固定 model (最快 / 最强 /
+  最便宜), 不按任务类型查 default 表, 直接用策略 → model 的常量映射。
+</p>
+<pre><code class="language-typescript">const STRATEGY_TO_MODEL: Record&lt;FastQualityBudget, ModelName&gt; = {
+  fast: "claude-haiku-4-5",
+  quality: "claude-opus-4-5",
+  budget: "qwen2.5-7b",
+};
 
-<h2 id="author-thinking">作者怎么想的: 这一章的思考链</h2>
-<dl class="defs">
-  <dt>想清楚现象</dt>
-  <dd>
-    harness 默认跑 Claude 3.5 Sonnet, 用户想换 GPT-4o, 改了一行 model 名, 结果
-    tool call 字段名不一样、cache 边界不一样、错误 码不一样, harness
-    立刻崩。现象是"模型切换像换引擎, 不是换轮胎"。
-  </dd>
-  <dt>想反例</dt>
-  <dd>
-    最朴素的反例是"在主代码里 if (provider === "anthropic") {...} else if
-    (provider === "openai") {...}"。这有两个问题: 一是 主代码被 provider
-    分支污染, 难以维护; 二是新增 provider 时要 改十几处 if, 容易漏。
-  </dd>
-  <dt>想接口和不变量</dt>
-  <dd>
-    接口:
-    <code
-      >interface LLMClient { chat(messages): Promise&lt;AssistantMessage&gt;
-      }</code
-    >。 不变量三条: (1) 暴露给 harness 的是统一 <code>AssistantMessage</code>
-    形态 (role / content / tool_calls), 内部差异完全不外泄, (2) cache 边界由
-    adapter 内部实现, harness 不知道细节, (3) 错误码统一收敛到 6 种 LLMErrorKind
-    (第 11 章), adapter 负责把 provider 特定错误翻译过来。
-  </dd>
-  <dt>想怎么验证</dt>
-  <dd>
-    同一 scenario 跑 4 个 adapter, 断言 trace 里 tool_calls 字段 完全一致 (虽然
-    provider 内部表示不同), LLMErrorKind 分类一致 (虽然 provider 错误码不同)。
-  </dd>
-</dl>
-
-<h2 id="observe-first">先观察: 两段故意有气味的实现</h2>
-
-<div class="note">
-  <p class="note__title">观察 1 · 在主代码里 if provider</p>
-  <pre class="code-block"><code>// 教学简化版, 仓库真实实现见 GitHub 永久链接
-// 错误: provider 分支散落在主代码
-async function run(query: string) {
-  history.add({ role: "user", content: query });
-  let assistant;
-  if (config.provider === "anthropic") {
-    assistant = await callAnthropic(messages);  // Anthropic 特有格式
-  } else if (config.provider === "openai") {
-    assistant = await callOpenAI(messages);  // OpenAI 特有格式
-  } else if (config.provider === "google") {
-    assistant = await callGoogle(messages);  // Google 特有格式
+function createModelResolver({policy}: {policy: ModelPolicy}): ModelResolver {
+  if (policy === "auto") {
+    return (taskType) =&gt; DEFAULT_MODEL_FOR_TASK[taskType];
+  }
+  if (policy === "fast" || policy === "quality" || policy === "budget") {
+    return () =&gt; STRATEGY_TO_MODEL[policy];
   }
-  history.add(assistant);
-  /* ... */
+  throw new Error(`Unknown policy: ${policy}`);
 }</code></pre>
-  <p><strong>问:</strong>为什么不直接 if provider?</p>
-  <p>
-    <strong>答:</strong>三件事同时坏掉 —— 污染: agent.ts 不该知道 provider 存在,
-    职责被越界; 维护: 新增 provider 要改十几处 if, 容易漏; 测试: 主代码被
-    provider 分支污染, 单元测试要 mock 多个 provider。
-  </p>
-</div>
+<p>
+  <strong>实现细节</strong>: 4 种策略里, auto 走 default 表的
+  5 项, fast/quality/budget 各对应 1 个常量 model, 总共 5 + 3
+  = 8 个配置项 (model name 有重复)。 改策略 = 改 3 个常量, 改 default 表 = 改
+  5 个常量。 简单, 教学友好。
+</p>
+<h2 id="execution-profile">execution profile: readonly / ci / workspace_write</h2>
+<p>
+  <strong>用途</strong>: 任务类型决定<strong>model</strong>, 执行
+  profile 决定<strong>权限</strong>。 两者正交, 不耦合。 profile
+  3 种 (readonly / ci / workspace_write) 沿用第 14 章 Schedule
+  的定义。
+</p>
+<p>
+  <strong>真实场景</strong>: long_running 任务 + readonly profile
+  = "5 天跑扫描 (只读, 用 GPT-4o)"; long_running + workspace_write
+  = "5 天跑迁移 (读写, 用 GPT-4o)"; 两者 model 一样, 权限不同。
+</p>
+<p>
+  <strong>设计思想</strong>: <strong>model × profile 两维</strong> —
+  Reference 章节 "模式 5 · Composition Root 组合根" 的延伸。
+  modelResolver 选 model, ExecutionPolicy 校验 profile 权限, 两者
+  不互相 import。
+</p>
+<p>
+  <strong>实现细节</strong>: profile 透传到 Async Run (第 13 章)
+  的 <code>permissionProfile</code> 字段, 由 executionPolicy
+  的 <code>validateCommand / validateResources</code> 校验。 model
+  和 profile 是<strong>独立</strong>维度, 组合出 5 × 3 = 15 种
+  任务场景。
+</p>
+<h2 id="graceful-degradation">graceful degradation: model 不可用怎么办</h2>
+<p>
+  <strong>用途</strong>: 推荐的 model 不可用 (rate limit / 服务挂 /
+  配额用完) 时, harness 不能直接 fail, 应该<strong>优雅降级</strong>
+  到次优 model。 这条 fallback 链是 Model Policy 的隐藏价值。
+</p>
+<p>
+  <strong>真实场景</strong>: team 跑 bulk_summarize, 推荐 Qwen2.5-7B
+  (本地 vLLM), vLLM 服务挂, fallback 到 Claude Haiku (云端
+  Anthropic), 速度慢但能用; 再 fallback 到 GPT-4o-mini; 再不行
+  fail 提示用户 "所有 model 都不可用"。
+</p>
+<p>
+  <strong>设计思想</strong>: <strong>Fallback 链 + 优雅降级</strong> —
+  Reference 章节 "模式 13 · Error→Action 错误转动作" 的应用。
+  不可用 = fail 之前先试 fallback。 fallback 顺序写死 (推荐 →
+  次优 → 次次优 → fail), 不动态计算。
+</p>
+<pre><code class="language-typescript">const FALLBACK_CHAIN: Record&lt;ModelName, ModelName[]&gt; = {
+  "gpt-4o": ["claude-sonnet-4-5", "claude-haiku-4-5"],
+  "claude-opus-4-5": ["gpt-4o", "claude-sonnet-4-5"],
+  "claude-haiku-4-5": ["gpt-4o-mini", "qwen2.5-7b"],
+  "qwen2.5-7b": ["claude-haiku-4-5", "gpt-4o-mini"],
+  // ...
+};
 
-<div class="note">
-  <p class="note__title">观察 2 · 错误码不翻译直接抛</p>
-  <pre class="code-block"><code>// 教学简化版
-catch (err) {
-  // 错误: provider 错误码直接抛
-  throw err;  // harness 看到的是 Anthropic 特定错误码
+async function callWithFallback(model: ModelName, args): Promise&lt;Response&gt; {
+  const chain = [model, ...(FALLBACK_CHAIN[model] ?? [])];
+  for (const m of chain) {
+    try {
+      return await llmClient.call(m, args);
+    } catch (err) {
+      if (isRetryableError(err)) continue;  // 试下一个
+      throw err;  // 不可恢复错误立即 throw
+    }
+  }
+  throw new Error("All models in fallback chain failed");
 }</code></pre>
-  <p><strong>问:</strong>为什么不直接抛 provider 错误?</p>
-  <p>
-    <strong>答:</strong>三件事同时坏掉 —— 跨模型: 切到 OpenAI 后, 第 11 章的
-    Recovery 模块不知道是 rate_limit 还是 rate_limit_exceeded; 抽象: harness
-    不该知道 provider 内部错误码, 这是 adapter 职责; 演化: provider 升级 SDK
-    改了错误码, harness 全挂。
-  </p>
-</div>
-
-<h2 id="differences">4 个 adapter 收口的差异</h2>
-<p>不同 provider 在以下 4 个维度上有差异, adapter 负责收口:</p>
-
-<h3>1. tool call 协议</h3>
-<dl class="defs">
-  <dt>Anthropic</dt>
-  <dd><code>tool_use</code> 字段, content 是数组 (text + tool_use 块)。</dd>
-  <dt>OpenAI</dt>
-  <dd><code>tool_calls</code> 字段, content 字符串 + tool_calls 数组并列。</dd>
-  <dt>Google</dt>
-  <dd>
-    <code>functionCall</code> 字段, parts 数组, 命名 <code>functionCall</code>。
-  </dd>
-  <dt>本地兼容 (Ollama / vLLM)</dt>
-  <dd>通常模仿 OpenAI, 但 tool name 可能需要规范化 (例如不允许 ".")。</dd>
-</dl>
 <p>
-  adapter 内部把 4 种表示都翻译成统一的
-  <code>AssistantMessage.tool_calls</code>。
+  <strong>实现细节</strong>: 5 个推荐 model 各自配 fallback 链,
+  链长 2-3, 教学版够用。 链必须是<strong>有限</strong>的 (不能
+  无限循环, 也不能跳到自身), 用 TypeScript 类型 + 单元测试保证。
 </p>
-
-<h3>2. cache 边界</h3>
-<dl class="defs">
-  <dt>Anthropic</dt>
-  <dd>
-    prompt cache 4 个 breakpoint (system / messages / tools / 末尾), 自动 5 分钟
-    TTL, 命中按 1/10 计费。
-  </dd>
-  <dt>OpenAI</dt>
-  <dd>自动 cache, 无显式 breakpoint, 命中按 1/2 计费 (o1 系列)。</dd>
-  <dt>Google</dt>
-  <dd>implicit cache, TTL 短, 命中按 1/4 计费。</dd>
-  <dt>本地兼容</dt>
-  <dd>无 cache, 所有 token 全价。</dd>
-</dl>
+<h2 id="cost-estimate">cost 估算: 让用户知道"这次大概多少钱"</h2>
 <p>
-  第 10 章的"稳定前缀 / 动态状态 / 自然增量" 布局在 Anthropic 效果 最好 (cache
-  hit rate 最高), 在 OpenAI 中等, 在本地模型无效但也没坏处。 adapter 在 metadata
-  里返回 cache hit 状态, harness 记录到 cache-debug (第 10 章)。
+  <strong>用途</strong>: 用户跑任务前, harness 估算"这次大概花
+  多少", 避免跑完发现"我跑了 $5"。 估算基于 (model 价目表 +
+  token 估算), 不精准但"够参考"。
 </p>
-
-<h3>3. 错误码</h3>
-<dl class="defs">
-  <dt>Anthropic</dt>
-  <dd>
-    HTTP 429 (rate_limit) / 529 (overloaded) / 400 (context_overflow) / 500
-    (api_error)。
-  </dd>
-  <dt>OpenAI</dt>
-  <dd>
-    HTTP 429 / 503 / 400 / 500, 错误码字符串 "rate_limit_error" /
-    "context_length_exceeded"。
-  </dd>
-  <dt>Google</dt>
-  <dd>
-    HTTP 429 / 503 / 400 / 500, 错误码字符串 "RESOURCE_EXHAUSTED" /
-    "INVALID_ARGUMENT"。
-  </dd>
-</dl>
 <p>
-  adapter 翻译成 6 种 <code>LLMErrorKind</code> (第 11 章), Recovery 模块只认
-  LLMErrorKind, 不认 provider 错误码。
+  <strong>真实场景</strong>: user 跑 long_running 任务, 启动前
+  harness 输出 "Estimated cost: $0.18-$0.23 (GPT-4o, ~50k input +
+  5k output tokens)"。 user 知道"大概 2 毛", 决定跑; 如果输出
+  "Estimated cost: $5-$10 (Claude Opus, 复杂推理)", user 可能
+  改成 GPT-4o 跑。
 </p>
-
-<h3>4. 输出截断规则</h3>
-<dl class="defs">
-  <dt>Anthropic</dt>
-  <dd>
-    <code>stop_reason === "max_tokens"</code>, content 截断在最后一个完整
-    token。
-  </dd>
-  <dt>OpenAI</dt>
-  <dd>
-    <code>finish_reason === "length"</code>, content 截断在最后一个完整 token。
-  </dd>
-  <dt>Google</dt>
-  <dd><code>finishReason === "MAX_TOKENS"</code>, 行为类似。</dd>
-</dl>
 <p>
-  adapter 统一翻译成 <code>finishReason === "length"</code> +
-  <code>truncated === true</code>, Recovery 模块 (第 11 章) 一致处理。
+  <strong>设计思想</strong>: <strong>价目表 + token 估算</strong> —
+  Reference 章节 "模式 19 · Idempotent 幂等" 的延伸 (估算可重
+  复, 不靠 LLM 自己估)。 看 cost 估算实现:
 </p>
+<pre><code class="language-typescript">// 价目表: 每 1M token 的美元价
+const MODEL_PRICING: Record&lt;ModelName, { input: number; output: number }&gt; = {
+  "gpt-4o": { input: 2.5, output: 10.0 },
+  "claude-opus-4-5": { input: 15.0, output: 75.0 },
+  "claude-haiku-4-5": { input: 0.8, output: 4.0 },
+  "qwen2.5-7b": { input: 0.0, output: 0.0 },  // 本地免费
+  // ...
+};
 
-<h2 id="foundation-models">基座模型画像</h2>
-<p>
-  同样是 "claude-3-5-sonnet", 也有 4 个版本 (claude-3-5-sonnet-20240620,
-  claude-3-5-sonnet-20241022, ...), 不同版本 cache 行为 / 工具协议 /
-  错误码都略有差异。本章用 <code>foundation-models.ts</code> 维护一份
-  "基座模型画像", 描述每个具体版本的差异, harness 启动时根据 model 名选画像,
-  应用到 adapter。
-</p>
-<pre class="code-block"><code>// 教学简化版, 仓库真实实现见 GitHub 永久链接
-export interface FoundationModelProfile {
-  provider: LLMProviderId;
-  model: string;
-  supportsCache: boolean;
-  cacheBreakpoints: number;
-  maxContextTokens: number;
-  supportsTools: boolean;
-  supportsCompaction: boolean;  // 是否支持原生 compaction
-  notes: string;
-}
-
-export function resolveFoundationModelProfile(opts: { provider, model, explicitProfileId? }): FoundationModelProfile;</code></pre>
-
-<h2 id="wiring">Composition Root 接线</h2>
-<pre class="code-block"><code>// 教学简化版, 仓库真实实现见 GitHub 永久链接
-export async function main() {
-  const config = loadConfig();  // 读 ANTHROPIC_API_KEY / OPENAI_API_KEY / etc
-  const modelProfile = resolveFoundationModelProfile({
-    provider: config.provider,
-    model: config.model,
-  });
-  const llm = createLlm(config, modelProfile);  // 选 adapter
-  // 之后 agent / subagent / 评测 都只看到 LLMClient interface
-  // 完全不感知 provider
+function estimateCost(model: ModelName, inputTokens: number, outputTokens: number): { min: number; max: number } {
+  const p = MODEL_PRICING[model];
+  return {
+    min: (inputTokens / 1_000_000) * p.input + (outputTokens / 1_000_000) * p.output,
+    max: ((inputTokens * 1.2) / 1_000_000) * p.input + ((outputTokens * 1.5) / 1_000_000) * p.output,
+  };
 }</code></pre>
 <p>
-  Composition Root 唯一感知 provider 的地方, 其他模块全部只看到
-  <code>LLMClient</code> interface。切模型 = 改环境变量, 不改代码。
+  <strong>实现细节</strong>: 输出 min / max 两个值, min = 按估算 token 数直接计价,
+  max = 输入按 1.2 倍、输出按 1.5 倍上浮 (LLM 输出不一定可控)。 启动前
+  estimateCost() 算一次, 输出形如 "Estimated cost: $0.18-$0.23"。 不
+  精准, 但够参考。 这是 Reference 章节 "模式 19 · Idempotent"
+  的应用 — 估算可重复, 同一任务多次跑估算一致。
 </p>
+<h2 id="loop-integration">主循环集成: 在哪 2 个时机选 model</h2>
+<p>
+  <strong>用途</strong>: Model Policy 集成到 agent 主循环有 2 个
+  时机:
+</p>
+<ol>
+<li>
+<strong>agent.run() 启动前</strong>: 接收 taskType, 调
+    <code>modelResolver(taskType)</code> 选 model, 调
+    <code>estimateCost(...)</code> 算预算, 输出 "Using model X,
+    estimated cost $Y"。
+  </li>
+<li>
+<strong>每次 LLM 调用前</strong>: 拿 model 调 llmClient.call(),
+    失败时走 fallback 链。
+  </li>
+</ol>
+<p>
+  <strong>设计思想</strong>: <strong>显式选择 + 启动一次</strong> —
+  agent.run() 一次选 model, 之后整个 run 期间都用同 model (不
+  切换)。 切换 model = 重新 agent.run()。 这是 Reference 章节
+  "模式 1 · 工厂 + 闭包" 的应用 — model 选完在闭包内, 整个
+  run 期间不变。
+</p>
+<p>
+  <strong>实现细节</strong>: 集成代码在 <code>src/agent.ts</code> 第
+  0 步 (setup) — 接收 taskType → 调 modelResolver → 拿 model
+  → 调 estimateCost → 注入到 run() 闭包。 之后所有 LLM 调用都
+  用这个 model, 失败走 fallback。
+</p>
+<h2 id="fake-test">fake test: 用 fake model 测 fallback 链</h2>
+<p>
+  <strong>用途</strong>: Model Policy 测试需要<strong>fake LLM client</strong>
+  — 不真调 OpenAI / Anthropic, 用 stub 返回"rate_limit" / "ok"
+  / "fail", 验证 fallback 链走通。
+</p>
+<p>
+  <strong>真实场景</strong>: 用户写测试 "推荐 GPT-4o + 链
+  [Claude Sonnet, Claude Haiku]; fake client 第一次 GPT-4o
+  返回 rate_limit, 第二次 Claude Sonnet 返回 ok, 验证:
+  最终 model = Claude Sonnet, fallback 链用了 2 步, 不再
+  试 Claude Haiku"。
+</p>
+<pre><code class="language-typescript">test("fallback 链: 推荐 model rate_limit -&gt; 次优 model 成功", async () =&gt; {
+  const fakeLlm = createFakeLlmClient({
+    "gpt-4o": () =&gt; ({ error: "rate_limit" }),
+    "claude-sonnet-4-5": () =&gt; ({ content: "ok" }),
+  });
+  const result = await callWithFallback("gpt-4o", fakeLlm, args);
+  expect(result.content).toBe("ok");
+  expect(fakeLlm.callHistory).toEqual(["gpt-4o", "claude-sonnet-4-5"]);
+});
 
-<h2 id="trap">反例梯度</h2>
-
-<div class="card card--trap">
-  <div class="card__head">
-    <span class="card__tag">新手错法 · A</span>
-  </div>
-  <div class="card__body">
-    <p>
-      <strong>常见错误:</strong>在 agent.ts 主代码里 if provider ===
-      "anthropic"。
-    </p>
-    <p>
-      <strong>为什么错:</strong>主代码被 provider 污染, 维护噩梦, 测试难写。
-    </p>
-    <p>
-      <strong>正确做法:</strong>4 个 adapter 实现 LLMClient interface, agent.ts
-      完全不感知 provider。
-    </p>
-  </div>
-</div>
+test("cost 估算: GPT-4o + 50k input + 5k output = $0.175", () =&gt; {
+  const cost = estimateCost("gpt-4o", 50_000, 5_000);
+  // 50k / 1M * 2.5 = 0.125; 5k / 1M * 10 = 0.05; total = 0.175
+  expect(cost.min).toBeCloseTo(0.175, 2);
+});
 
-<div class="card card--trap">
-  <div class="card__head">
-    <span class="card__tag">中级错法 · B</span>
-  </div>
-  <div class="card__body">
-    <p><strong>常见错误:</strong>provider 错误码不翻译直接抛。</p>
-    <p>
-      <strong>为什么错:</strong>Recovery 模块看不懂 provider 错误码,
-      跨模型立刻挂。
-    </p>
-    <p>
-      <strong>正确做法:</strong>adapter 把 provider 错误码翻译成 6 种
-      LLMErrorKind, 统一收敛。
-    </p>
-  </div>
-</div>
+test("auto 策略: long_running 任务 -&gt; GPT-4o", () =&gt; {
+  const resolver = createModelResolver({ policy: "auto" });
+  expect(resolver("long_running")).toBe("gpt-4o");
+  expect(resolver("short_interactive")).toBe("claude-haiku-4-5");
+});
 
-<div class="card card--trap">
-  <div class="card__head">
-    <span class="card__tag">高级错法 · C</span>
+test("fast 策略: 任意任务都返回 Haiku", () =&gt; {
+  const resolver = createModelResolver({ policy: "fast" });
+  expect(resolver("long_running")).toBe("claude-haiku-4-5");
+  expect(resolver("short_interactive")).toBe("claude-haiku-4-5");
+  expect(resolver("complex_reasoning")).toBe("claude-haiku-4-5");
+});</code></pre>
+<p>
+  <strong>实现细节</strong>: 4 个测试覆盖 (a) fallback 链走通;
+  (b) cost 估算正确; (c) auto 策略查表; (d) fast 策略返回常量。
+  fake LLM client 不连真 API, 用 stub map 返回固定结果, 测试
+  0 成本。
+</p>
+<h2 id="common-confusion">常见误解: 1 个 model 不能跑所有</h2>
+<p>
+  <strong>误解 1: "1 个最强 model 跑所有?"</strong> 错。 大材
+  小用, 简单任务浪费钱 (100 倍)。 应该 cheap model 跑简单, 强
+  model 跑复杂。
+</p>
+<p>
+  <strong>误解 2: "auto 策略 = 启发式判断?"</strong> 错。 auto 是
+  "用户不用选 model, 按任务类型查表选", 不是 harness 启发式猜
+  任务类型。 任务类型由调用方<strong>显式声明</strong>。
+</p>
+<p>
+  <strong>误解 3: "fallback 是无脑降级?"</strong> 错。 fallback
+  是<strong>有序</strong>的 (推荐 → 次优 → 次次优 → fail), 不
+  是随机选。 链写死在常量里, 改链 = 改常量。
+</p>
+<p>
+  <strong>误解 4: "model 一旦选定, 整个 run 不变?"</strong> 对。
+  整个 agent.run() 期间用同 model, 切换 = 重新 run()。 中途切换
+  会破坏 prompt cache (第 10 章) + 上下文不一致。
+</p>
+<h2 id="trap">反例梯度: 4 个常见错误</h2>
+<div class="cards-grid">
+  <div class="card card--bad">
+    <div class="card__head"><span class="card__tag">反例 1 · 1 个 model 跑所有</span></div>
+    <div class="card__body">
+      <p>team 觉得"用最好的最省心", 全部任务用 Claude Opus。
+        错。 跑 5 分钟 git status 用 Opus 5 分钟 × $0.015/分钟
+        = $0.075; 用 Haiku 5 分钟 × $0.0008/分钟 = $0.004。 约 19 倍
+        浪费。 正确: 5 种任务类型 → 5 个 model, 按类型选。</p>
+    </div>
   </div>
-  <div class="card__body">
-    <p>
-      <strong>常见错误:</strong>hardcode "claude-3-5-sonnet" 的 cache 行为, 切到
-      GPT-4o 后不工作。
-    </p>
-    <p>
-      <strong>为什么错:</strong>cache 行为是 provider 特性, 不是 agent
-      假设。不同 provider 表现不同, 切模型后假设失效。
-    </p>
-    <p>
-      <strong>正确做法:</strong>从 modelProfile.supportsCache / cacheBreakpoints
-      读, 不 hardcode。
-    </p>
+  <div class="card card--bad">
+    <div class="card__head"><span class="card__tag">反例 2 · auto 启发式猜</span></div>
+    <div class="card__body">
+      <p>让 harness 启发式判断"这个 query 长 = long_running, 短
+        = short_interactive"。 错。 启发式猜错率高 (1 句话也可能
+        是复杂问题)。 正确: 任务类型由调用方显式声明, harness
+        只查表选 model, 不猜。</p>
+    </div>
   </div>
-</div>
-
-<div class="card card--trap">
-  <div class="card__head">
-    <span class="card__tag">边界错法 · D</span>
+  <div class="card card--bad">
+    <div class="card__head"><span class="card__tag">反例 3 · 没有 fallback 链</span></div>
+    <div class="card__body">
+      <p>推荐 model 不可用 (rate limit) 就 throw fail。 错。
+        1 个 model 不可用 = 整个 harness 死, 体验差。 正确:
+        配 fallback 链 (推荐 → 次优 → 次次优), 不可用就试下一个,
+        全失败才 fail。</p>
+    </div>
   </div>
-  <div class="card__body">
-    <p>
-      <strong>常见错误:</strong>本地模型跑 tool call 失败, 切到 Claude
-      之后才发现是 tool name 不合法 ("fs.read" 有点)。
-    </p>
-    <p>
-      <strong>为什么错:</strong>tool name 在不同 provider 规则不同 (Anthropic
-      允许 ".", OpenAI 不允许), harness 不该 hardcode 假设。
-    </p>
-    <p>
-      <strong>正确做法:</strong>tool registry 暴露 getNormalizedName() 给
-      adapter, adapter 在注册时做规范化。
-    </p>
+  <div class="card card--bad">
+    <div class="card__head"><span class="card__tag">反例 4 · 中途切换 model</span></div>
+    <div class="card__body">
+      <p>让 agent.run() 中途根据 token 消耗切换 model (发现花
+        太多就换便宜)。 错。 切换 model 破坏 prompt cache (model
+        B 看不到 model A 的 cache 块), 也破坏上下文一致性 (model
+        A 看到的 conversation model B 看到的不一样)。 正确:
+        整个 run 用同 model, 切换 = 重新 run()。</p>
+    </div>
   </div>
 </div>
-
-<h2 id="validate">如何验证 (本章 Validation 卡片)</h2>
-<div class="card card--validation">
-  <div class="card__head">
-    <span class="card__tag">Validation · model-policy 专题</span>
-  </div>
-  <div class="card__body">
-    <p>
-      <strong>4 个 adapter 暴露统一 LLMClient:</strong>每个 adapter 实现
-      <code>chat(messages): Promise&lt;AssistantMessage&gt;</code>, 内部
-      provider 表示完全翻译成统一形态。
-    </p>
-    <p>
-      <strong>provider 错误码翻译成 LLMErrorKind:</strong>fake provider 抛 429
-      错误, adapter 抛 LLMErrorKind === "rate_limit"。
-    </p>
-    <p>
-      <strong>切模型 = 改环境变量, 不改代码:</strong>设 ANTHROPIC_API_KEY 跑通,
-      切到 OPENAI_API_KEY 同样跑通, agent.ts 不变。
-    </p>
-    <p>
-      <strong>tool name 规范化:</strong>tool name 含 "." 时, OpenAI adapter
-      内部替换为 "_" (OpenAI 限制), 但 trace 里仍然 显示原始 name。
-    </p>
-    <p>
-      <strong>modelProfile 决定 cache 行为:</strong>fake profile
-      supportsCache=false, harness 不调用 cache-debug tracker;
-      supportsCache=true, harness 调用并记录 hit/miss。
-    </p>
-  </div>
-</div>
-
-<h2 id="lookback">回望第 00–15 章: 哪些原则在本章兑现了</h2>
-<ul>
-  <li>
-    <strong>Composition Root 唯一感知 provider:</strong>其他模块只看到 LLMClient
-    interface。
-  </li>
-  <li>
-    <strong>不变量在 adapter 内部兑现:</strong>cache 边界、错误码、截断规则,
-    全部 adapter 收口。
-  </li>
-  <li><strong>配置即适配:</strong>切模型 = 改 env, harness 主体不变。</li>
-  <li>
-    <strong>事实与视图分离:</strong>modelProfile 是"事实" (provider 描述),
-    adapter 是"视图" (实际调用)。
-  </li>
-</ul>
-
-<h2 id="forward">前瞻张力: 留给后续章节</h2>
-<dl class="defs">
-  <dt>多 provider 同时跑</dt>
-  <dd>
-    未来 harness 允许同一个 run() 内调多个 provider (例如用 GPT-4 选 tool, 用
-    Claude 总结), 4 个 adapter 要能并存。
-  </dd>
-  <dt>model profile 动态加载</dt>
-  <dd>
-    新 model 版本发布时, foundation-models.ts 要更新, 跑一次 eval 验证 cache /
-    tool / 错误码行为。
-  </dd>
-  <dt>local model 协议差异</dt>
-  <dd>
-    Ollama / vLLM / LM Studio 各自的 tool call 协议略有差异, adapter
-    抽象要能容下。
-  </dd>
-  <dt>cost-aware adapter selection</dt>
-  <dd>
-    不同 provider 价位不同, harness 应当能在"质量 vs 成本" 之间动态选 adapter
-    (例如用户配置"省钱模式")。
-  </dd>
-</dl>
-
-<h2 id="vibe-coding-model">本次如何 vibe code: model-policy 专题的三件套</h2>
-
-<h3 id="vibe-feed-model">拆卡: 4 轮迭代的具体产物</h3>
+<h2 id="validate">Validation: 4 条不变量检验</h2>
 <ol>
-  <li>
-    <strong>第 1 轮 · 接口</strong>。让 LLM 给出 <code>LLMAdapter</code> /
-    <code>FoundationModelProfile</code> 两个 interface, 以及 4 个 adapter
-    (Anthropic / OpenAI / Google / 本地) 的命名规范。本轮不写实现,
-    重点钉"agent.ts 完全不感知 provider"。
+<li>
+<strong>default 表完整</strong>: 5 种任务类型都有推荐 model, 缺
+    一个 TypeScript 编译失败 (Record 强制完整)。 验证: 单测覆盖
+    5 个 taskType 都能查表。
   </li>
-  <li>
-    <strong>第 2 轮 · 接线</strong>。让 LLM 给出
-    <code>src/llm-adapter.ts</code> 的 stub, 4 个 adapter 是 noop
-    (直接返回硬编码 messages)。本轮 review 重点: agent.ts / main.ts 不出现
-    provider 字符串。
+<li>
+<strong>fallback 链无循环</strong>: 链必须有限, 不能跳回自身。
+    验证: 单测覆盖 5 个推荐 model 的 fallback 链, 跑 100 次调用,
+    链不超 3 步, 不回到起点。
   </li>
-  <li>
-    <strong>第 3 轮 · 边界</strong>。让 LLM 写 4 个 adapter + 错误码翻译 +
-    modelProfile 解析 + tool name 规范化。本轮 review 重点: 错误码统一翻译成
-    LLMErrorKind, tool name 规范化不在 agent.ts。
+<li>
+<strong>策略优先级一致</strong>: auto = 按 taskType 选; fast/quality/budget
+    = 忽略 taskType 选常量。 验证: 4 个测试覆盖 4 种策略。
   </li>
-  <li>
-    <strong>第 4 轮 · 验证</strong>。让 LLM 写
-    <code>test/adapter.test.ts</code>。本轮 review 重点: "切模型 = 改 env,
-    不改代码" 和 "tool name 规范化" 两条必须有 env 切换断言。
+<li>
+<strong>cost 估算合理</strong>: 价目表 × token 估算 = cost, 不靠
+    LLM 自己估。 验证: 3 个测试覆盖 (a) 输入 token 算; (b) 输出
+    token 算; (c) 本地 model cost = 0。
   </li>
 </ol>
-
-<h3 id="vibe-review-model">Review: model-policy 专题专属 checklist</h3>
-<ol>
-  <li>
-    <strong>agent.ts / main.ts 不出现 provider 字符串。</strong>验证:
-    <code>grep -rn 'anthropic\|openai' src/agent.ts src/main.ts</code> 应当 0
-    行。
-  </li>
-  <li>
-    <strong>4 个 adapter 都实现 LLMClient。</strong>验证: 每个 adapter.ts 内有
-    <code>chat(messages): Promise&lt;AssistantMessage&gt;</code> 签名。
-  </li>
-  <li>
-    <strong>错误码翻译在 adapter 内部。</strong>验证: catch 块把 provider
-    错误码翻译成 LLMErrorKind, 不直接 throw provider error。
-  </li>
-  <li>
-    <strong>tool name 规范化在 adapter 内部。</strong>验证: tool registry
-    不做规范化, adapter 在调用 provider API 前做。
-  </li>
-  <li>
-    <strong>modelProfile 决定 cache 行为。</strong>验证:
-    <code>grep -n 'supportsCache' src/llm-adapter.ts</code> ≥ 1 行, 决定是否调用
-    cache-debug tracker。
-  </li>
-</ol>
-
-<h3 id="vibe-debug-model">调试: model-policy 专题典型伪装</h3>
-<ol>
-  <li>
-    <strong>伪装 A · agent.ts 出现 provider 字符串。</strong>症状:
-    <code>if (provider === "anthropic") ...</code> 写在 agent.ts。验证:
-    <code>grep -rn 'anthropic\|openai' src/agent.ts</code> 不为 0 行时,
-    跑"切模型 = 改 env" 测试应当挂。
+<h2 id="lookback">回望: 哪些原则在本章兑现了</h2>
+<ul>
+<li>
+<strong>策略模式</strong>: 4 种策略决定怎么选 model, 5 种任务类型
+    决定 default 表, 两者正交。
   </li>
-  <li>
-    <strong>伪装 B · provider 错误码直接抛。</strong>症状: adapter catch 块
-    <code>throw err</code>。验证: 写一个 fake provider 抛 429, 跑"provider
-    错误码翻译" 测试, 应当收到 LLMErrorKind === "rate_limit"。
+<li>
+<strong>声明式 vs 启发式</strong>: 任务类型由调用方显式声明,
+    harness 查表, 不启发式猜。
   </li>
-  <li>
-    <strong>伪装 C · cache 行为 hardcode。</strong>症状:
-    <code>if (provider === "anthropic") cacheHit = true</code>。验证: 改
-    modelProfile.supportsCache=false, 跑测试, cache-debug tracker 不应被调用。
+<li>
+<strong>有限 fallback 链</strong>: 链写死, 不动态算, 不无限循环。
   </li>
-</ol>
-
-<h3 id="vibe-iterate-model">迭代: model-policy 专题 4 个 commit 节点</h3>
-<ol>
-  <li>
-    <code
-      >feat(model): 钉 LLMAdapter / FoundationModelProfile 接口 + 4 个 adapter
-      命名</code
-    >
-    —— tsc 通过, 无实现。
+<li>
+<strong>幂等估算</strong>: cost 估算可重复, 同一任务多次跑
+    估算一致。
   </li>
-  <li>
-    <code
-      >feat(model): createLlm(config, modelProfile) + 4 个 adapter stub</code
-    >
-    —— tsc 通过, stub 永远返回 hardcoded 响应。
+<li>
+<strong>显式切换</strong>: 整个 run 用同 model, 切换 = 重新 run。
   </li>
-  <li>
-    <code
-      >feat(model): 4 个 adapter 真实实现 + 错误码翻译 + tool name 规范化 +
-      modelProfile 解析</code
-    >
-    —— 跑通 Validation 卡片前 4 条。
-  </li>
-  <li>
-    <code
-      >test(model): 切 env 跑通同一 scenario + modelProfile.supportsCache 决定
-      cache-debug 调用</code
-    >
-    —— 全绿。
-  </li>
-</ol>
-
+</ul>
+<h2 id="forward">前瞻张力: 留给后续专题</h2>
+<dl class="defs">
+<dt>动态 cost 实时监控</dt>
+<dd>
+    当前 cost 是启动时估算, 实际可能偏差 1.5 倍。 工业级想要
+    "实时 cost 监控, 超预算自动切换到便宜 model", 涉及 streaming
+    token count + 实时熔断。 留 P2 阶段。
+  </dd>
+<dt>Model 评测 (A/B testing)</dt>
+<dd>
+    教学版 default 表是 hard-code, 工业级想要"同时跑 GPT-4o +
+    Claude Sonnet, 哪个质量好用哪个", 涉及 multi-model 投票 +
+    自动回归测试。 留 P2 阶段。
+  </dd>
+<dt>用户级 cost 配额</dt>
+<dd>
+    当前没有"这个月只能花 $100" 限制, 工业级想要配额管理
+    (user-level / team-level / org-level), 涉及 quota tracker
+    + 熔断。 留 P2 阶段。
+  </dd>
+</dl>
 <h2 id="prompt-card">Prompt Card (本章任务)</h2>
 <div class="card card--prompt">
   <div class="card__head">
-    <span class="card__tag">Prompt Card · model-policy 专题</span>
-    <button class="card__copy" type="button" data-copy-card>复制</button>
+    <span class="card__tag">Prompt Card · 专题 A</span>
+    <button class="card__copy" data-copy-card="" type="button">复制</button>
   </div>
   <div class="card__body">
-    <p>
-      <strong>目标:</strong>实现 LLM Adapter 层, 用 4 个 adapter 收口 provider
-      差异 (tool call 协议 / cache 边界 / 错误码 / 截断规则), harness
-      其他部分完全不感知 provider。
-    </p>
-    <p>
-      <strong>场景:</strong>用户设 ANTHROPIC_API_KEY 跑通, 切到 OPENAI_API_KEY
-      同样跑通, agent.ts 不改一行代码。
-    </p>
-    <p>
-      <strong>模块:</strong> <code>src/llm-adapter.ts</code> (新) 暴露
-      <code>createLlm(config, modelProfile)</code>;
-      <code>src/llm-providers.ts</code> (新) 4 个 adapter (Anthropic / OpenAI /
-      Google / 本地); <code>src/foundation-models.ts</code> (新) modelProfile
-      解析; <code>src/index.ts</code> 接线 config + modelProfile 选 adapter。
-    </p>
+    <p><strong>目标:</strong> 给 harness 加 Model Policy 策略层,
+      5 种任务类型 → 5 个推荐 model, 4 种策略切偏好, fallback 链
+      优雅降级, cost 估算让用户知道大概花多少。</p>
+    <p><strong>场景:</strong> team 跑 10 个不同任务 (5 分钟 git /
+      1 小时迁移 / 30 分钟设计 / ...), --model-policy auto 启动,
+      harness 按 taskType 自动选 model。 临时周末实验切 fast
+      (全用 Haiku), 重要 demo 切 quality (全用 Opus), 预算紧的
+      月切 budget (全用 Qwen 本地)。</p>
+    <p><strong>模块:</strong> <code>src/execution-policy.ts</code> (改)
+      加 DEFAULT_MODEL_FOR_TASK 表 + STRATEGY_TO_MODEL 常量 +
+      createModelResolver 工厂 + callWithFallback 链 + estimateCost
+      估算; <code>src/cli-commands.ts</code> (改) 加 /model-policy
+      show / set 命令; <code>src/agent.ts</code> (改) 接收
+      taskType + 调 resolver + 输出 cost 估算; <code>src/index.ts</code>
+      (改) Composition Root 注入 modelResolver + fallback 链。</p>
     <p><strong>边界 (LLM 必须遵守的 checklist):</strong></p>
     <ul>
-      <li>
-        agent.ts / main.ts / loop 不出现 provider 字符串 (anthropic / openai /
-        google)
-      </li>
-      <li>4 个 adapter 都实现 LLMClient interface</li>
-      <li>provider 错误码在 adapter 内部翻译成 LLMErrorKind (第 11 章)</li>
-      <li>tool name 规范化在 adapter 内部, 不在 tool registry</li>
-      <li>cache 行为从 modelProfile.supportsCache 读, 不 hardcode</li>
+      <li>5 种任务类型固定枚举 long_running / short_interactive / bulk_summarize / complex_reasoning / experimental, 不可新增</li>
+      <li>4 种策略固定枚举 auto / fast / quality / budget, 不可新增</li>
+      <li>auto 策略查 default 表, fast/quality/budget 各自返常量, 不查表</li>
+      <li>任务类型由调用方显式声明, harness 不启发式猜</li>
+      <li>fallback 链有限, 长度 ≤ 3, 不跳自身, 不无限循环</li>
+      <li>cost 估算可重复, 同一任务多次跑一致, 不靠 LLM 自己估</li>
+      <li>整个 agent.run() 用同 model, 中途不切换 (切换 = 重新 run)</li>
+      <li>execution profile 3 种 readonly / ci / workspace_write 与 model 选择正交, 独立维度</li>
     </ul>
-    <p><strong>验证 (用 fake adapter + env 切换, 逐条落到 vitest):</strong></p>
+    <p><strong>验证 (用 fake LLM client + fake 价目表 + vitest, 逐条断言):</strong></p>
     <ul>
-      <li>
-        4 个 adapter 暴露统一 chat(messages) 签名, 内部 provider 表示翻译成
-        AssistantMessage
-      </li>
-      <li>fake provider 抛 429, adapter 抛 LLMErrorKind === "rate_limit"</li>
-      <li>
-        设 ANTHROPIC_API_KEY 跑通, 切到 OPENAI_API_KEY 同样跑通, agent.ts 不变
-      </li>
-      <li>
-        tool name 含 "." 时, OpenAI adapter 内部替换为 "_", trace 显示原始 name
-      </li>
-      <li>modelProfile.supportsCache=false 时, cache-debug tracker 不被调用</li>
+      <li>default 表完整: 5 种 taskType 都能查表, 缺一个 TS 编译失败</li>
+      <li>fallback 链走通: 推荐 model rate_limit → 次优 model ok, 链用 2 步</li>
+      <li>4 种策略正确: auto 按 taskType 选, fast/quality/budget 各自返常量</li>
+      <li>cost 估算: GPT-4o + 50k input + 5k output = $0.175 (精确), 本地 model = $0</li>
     </ul>
   </div>
 </div>
-
 <h2 id="practice">本章练习</h2>
 <ol>
-  <li>
-    故意在 agent.ts 写 if (provider === "anthropic"), 跑测试, 看"agent.ts 不出现
-    provider 字符串" 是否抓到 (切 env 后挂)。
+<li>
+    故意让 harness 用 1 个 model (Opus) 跑所有任务, 跑 1 个月
+    harness, 看账单 vs 引入 Model Policy 后的账单 (×10 浪费 vs.
+    优化后)。
+  </li>
+<li>
+    故意让 auto 策略启发式猜任务类型 (query 长度判断), 跑测试
+    "1 句话的复杂推理" + "长 query 的简单任务", 看"启发式猜"
+    是否抓到 (猜错率高 vs. 显式声明 100% 准)。
   </li>
-  <li>
-    在 adapter catch 块 throw err (不翻译), 跑测试, 看"provider 错误码翻译"
-    是否抓到 (LLMErrorKind 拿不到)。
+<li>
+    故意不写 fallback 链, 推荐 model rate_limit 时直接 throw,
+    跑测试, 看"fallback 缺失" 是否抓到 (1 个 rate limit = harness
+    死 vs. fallback 链试次优 model)。
   </li>
-  <li>
-    hardcode "anthropic 有 cache", 不读 modelProfile.supportsCache, 跑测试,
-    看"modelProfile 决定 cache 行为" 是否抓到。
+<li>
+    故意让 agent.run() 中途根据 token 消耗切 model, 跑测试,
+    看"中途切换" 是否抓到 (cache miss + 上下文不一致 vs. 整个
+    run 用同 model + 切换 = 重新 run)。
   </li>
 </ol>
-
 <h2 id="summary">本章小结</h2>
 <p>
-  本专题给 harness 加了 LLM Adapter 层, 用 4 个 adapter 收口 provider 在 tool
-  call 协议 / cache 边界 / 错误码 / 截断规则 上的差异。harness
-  其他部分完全不感知 provider, 切模型 = 改 环境变量。modelProfile
-  决定每个具体模型版本的行为, 不 hardcode provider 假设。这是 harness
-  跨模型可移植性的关键。
+  Model Policy 是给 LLM 选型的<strong>策略层</strong>, 5 种任务
+  类型 × 4 种策略 × 3 种 execution profile = 60 种任务场景,
+  每种有合适的 model + 权限 + 成本。 核心是 5 个设计:
+</p>
+<ul>
+<li>
+<strong>5 种任务类型</strong>: long_running / short_interactive /
+    bulk_summarize / complex_reasoning / experimental, 固定
+    枚举。
+  </li>
+<li>
+<strong>4 种策略</strong>: auto / fast / quality / budget, 固定
+    枚举, 切用户偏好。
+  </li>
+<li>
+<strong>声明式 taskType</strong>: 调用方显式声明, harness 查表
+    选 model, 不启发式猜。
+  </li>
+<li>
+<strong>有限 fallback 链</strong>: 推荐 → 次优 → 次次优 → fail,
+    写死, 不动态算。
+  </li>
+<li>
+<strong>幂等 cost 估算</strong>: 价目表 × token 估算, 启动时输出
+    min/max, 实际可偏差 1.5 倍。
+  </li>
+</ul><p>
+  教程完结。 下一份是 <strong>专题 B · eval harness</strong>, 讲
+  怎么写测试覆盖 harness 行为 — scripted LLM client + 4 类
+  eval case (deterministic / live regression / live full / replay)
+  + 6 阶段 prompt card 模板 + judge 4 层降级, 把前面所有模块在
+  CI 里跑稳。
 </p>
diff --git a/tutorial/chapters/reference.html b/tutorial/chapters/reference.html
index 0fc85d8..65e3652 100644
--- a/tutorial/chapters/reference.html
+++ b/tutorial/chapters/reference.html
@@ -1,420 +1,3786 @@
-<p class="article__eyebrow">Reference · 整套教程的出口</p>
-<h1 class="article__title">术语表、Prompt Pack 与验证手册</h1>
-<p class="article__lede">
-  这一页不是新功能章节, 是把整套教程的术语、Prompt 写法、Validation 卡片
-  收束成查阅入口。你可以用这一页回到任何一章, 也能用 Prompt Pack 模板
-  写自己的重建 prompt。
-</p>
-
-<nav id="article-inline-toc" class="article__meta" aria-label="页内小节"></nav>
-
-<hr class="rule" />
-
-<h2 id="usage">怎么用这一页</h2>
-<p>
-  <strong>查术语</strong>: 第一节"术语锁定表" 列出 19 个核心术语,
-  每个标注首次出现章节和是否允许中文别名。
-</p>
-<p>
-  <strong>写 prompt</strong>: 第二节"6 段 Prompt Card 模板" 给你一份
-  完整的反例对照 (差 → 改 → 好), 你可以照着写。
-</p>
-<p>
-  <strong>做验证</strong>: 第三节"Validation 速查" 列出 16 章的 Validation
-  卡片一句话总结, 方便回头查具体断言。
-</p>
-<p>
-  <strong>走章节</strong>: 第四节"章节 Prompt Pack 总目录" 列出每章 的目标 /
-  模块 / 边界 / 验证一句话。
-</p>
-
-<h2 id="terms">术语锁定表</h2>
-<p>
-  下表是 19 个核心术语。第一次出现时用 <code>English</code> (中文释义) 格式,
-  之后只用英文。<strong>术语锁定原则</strong>: 同一概念只用 一种说法,
-  不允许中途换。
-</p>
-<table class="terms">
-  <thead>
-    <tr>
-      <th>术语</th>
-      <th>中文释义</th>
-      <th>首次出现</th>
-      <th>别名</th>
-    </tr>
-  </thead>
-  <tbody>
-    <tr>
-      <td><code>harness</code></td>
-      <td>外层运行环境</td>
-      <td>00</td>
-      <td>不允许用"框架" / "包装"</td>
-    </tr>
-    <tr>
-      <td><code>agent loop</code></td>
-      <td>主循环</td>
-      <td>00</td>
-      <td>简称 <code>loop</code></td>
-    </tr>
-    <tr>
-      <td><code>History</code></td>
-      <td>消息列表</td>
-      <td>01</td>
-      <td>不允许用"上下文" / "对话历史"</td>
-    </tr>
-    <tr>
-      <td><code>LLMClient</code></td>
-      <td>大模型调用接口</td>
-      <td>01</td>
-      <td>不允许用"模型" / "AI"</td>
-    </tr>
-    <tr>
-      <td><code>tool call</code></td>
-      <td>工具调用请求</td>
-      <td>02</td>
-      <td>不允许用"工具请求" / "调用工具"</td>
-    </tr>
-    <tr>
-      <td><code>tool result</code></td>
-      <td>工具调用结果</td>
-      <td>02</td>
-      <td>role 必须是 "tool"</td>
-    </tr>
-    <tr>
-      <td><code>tool registry</code></td>
-      <td>工具注册表</td>
-      <td>02</td>
-      <td>不允许用"工具表" / "工具列表"</td>
-    </tr>
-    <tr>
-      <td><code>tool_call_id</code></td>
-      <td>工具调用 id</td>
-      <td>02</td>
-      <td>必须来自 LLM 返回, 不重生成</td>
-    </tr>
-    <tr>
-      <td><code>Composition Root</code></td>
-      <td>组装根 (<code>index.ts</code>)</td>
-      <td>01</td>
-      <td>不允许用"主入口" / "启动文件"</td>
-    </tr>
-    <tr>
-      <td><code>fake LLM</code></td>
-      <td>假模型</td>
-      <td>00</td>
-      <td>不允许用"mock 模型"</td>
-    </tr>
-    <tr>
-      <td><code>reminder</code></td>
-      <td>系统提醒消息</td>
-      <td>03</td>
-      <td>
-        role 是 "user", 标签 <code>&lt;system-reminder source="X"&gt;</code>
-      </td>
-    </tr>
-    <tr>
-      <td><code>stable prefix</code></td>
-      <td>稳定前缀</td>
-      <td>10</td>
-      <td>system prompt + tools 字段</td>
-    </tr>
-    <tr>
-      <td><code>SubAgent</code></td>
-      <td>子智能体</td>
-      <td>04</td>
-      <td>不允许用"子任务" / "分 agent"</td>
-    </tr>
-    <tr>
-      <td><code>Skill</code></td>
-      <td>按需加载工具集</td>
-      <td>05</td>
-      <td>不允许用"插件" / "扩展"</td>
-    </tr>
-    <tr>
-      <td><code>Permission</code></td>
-      <td>权限决策</td>
-      <td>07</td>
-      <td>action 是 "allow" / "ask" / "deny"</td>
-    </tr>
-    <tr>
-      <td><code>Hook</code></td>
-      <td>钩子</td>
-      <td>08</td>
-      <td>exitCode 0/1/2</td>
-    </tr>
-    <tr>
-      <td><code>Recovery</code></td>
-      <td>失败恢复</td>
-      <td>11</td>
-      <td>action 是 retry / compact_and_retry / abort</td>
-    </tr>
-    <tr>
-      <td><code>Async Run</code></td>
-      <td>后台任务</td>
-      <td>13</td>
-      <td>不允许用"异步任务" / "后台进程"</td>
-    </tr>
-    <tr>
-      <td><code>Schedule</code></td>
-      <td>定时任务</td>
-      <td>14</td>
-      <td>cron 表达式, source 标签 "schedule"</td>
-    </tr>
-  </tbody>
-</table>
-<p>
-  <strong>不变量原则</strong>: 任何一章引入新术语, 第一次出现时必须在本表
-  加一行; 别名替换在所有章节内生效。
-</p>
-
-<h2 id="prompt-template">6 段 Prompt Card 模板</h2>
-<p>
-  这是第 00 章确立的模板, 这里只列反例对照。完整示例见第 01 章 Prompt Card。
-</p>
-<div class="card card--prompt">
-  <div class="card__head">
-    <span class="card__tag">反例对照 · 差 → 改 → 好</span>
-  </div>
-  <div class="card__body">
-    <p><strong>差的卡片 (5 段, LLM 会偷懒):</strong></p>
-    <pre class="code-block"><code>目标: 实现 agent loop
-模块: history.ts, llm.ts, agent.ts, index.ts
-验证: 能跑通
-边界: 注意架构
-场景: 用户输入 query, agent 调用 LLM 返回文本</code></pre>
-    <p>
-      <strong>问题:</strong>"能跑通" 不是断言, "注意架构" 不可验证, 没说
-      messages 怎么拼。
-    </p>
-    <p><strong>改 (5 段, 有信息量但 LLM 仍会猜):</strong></p>
-    <pre class="code-block"><code>目标: 实现最小 agent loop, 多轮对话保留上下文
-模块: createHistory, createLlm, createAgent, createRepl, index.ts
-验证: 连续两次 run 后第二次 LLM 收到 messages 包含第一轮
-边界: agent.ts 不读环境变量, 不直接 new LLM client
-场景: 用户先说"我喜欢简洁", 再问"我喜欢什么风格"</code></pre>
-    <p><strong>改进:</strong>有 messages 顺序断言, 有"绝对不能" 的边界。</p>
-    <p><strong>好 (6 段, LLM 没空间偷懒):</strong></p>
-    <pre
-      class="code-block"
-    ><code>目标: 实现最小 agent loop, 多轮上下文由 History 提供
-场景: 用户依次输入 "我喜欢简洁" 与 "我喜欢什么风格",
-      agent 第二次回复应包含"简洁"
-模块:
-  - src/history.ts: createHistory(), 内部 messages: Message[]
-  - src/llm.ts: createLlm(config), 暴露 chat(messages)
-  - src/agent.ts: createAgent(deps), 暴露 run(query)
-  - src/repl.ts: createRepl(deps), 暴露 start()
-  - src/index.ts: 创建 history/llm/terminal, 传给 agent 和 repl
-接线: index.ts 内只做 new + 传参, 不出现 if 分支
-边界 (LLM 必须遵守):
-  - agent.ts 内不出现 process.env
-  - agent.ts 内不出现 new LLMClient
-  - history.getMessages() 返回浅拷贝
-  - 空 query 不写入 history
-验证:
-  - fake LLM 返回 "收到" 时 agent.run("x") === "收到"
-  - 连续两次 run, fake LLM 第二次收到的 messages.length === 3
-  - 第二次收到的 messages[0].role === "user"
-  - run("") 不增加 history 长度</code></pre>
-    <p>
-      <strong>关键差异</strong>: "边界" 是可枚举 checklist, "验证" 每条 都能落到
-      vitest 一行断言, "接线" 写明实例是不是同一份。
-    </p>
-  </div>
-</div>
-
-<h2 id="vibe-coding-template">vibe coding 4 轮拆卡</h2>
-<p>6 段卡片是总图, 喂给 LLM 要分 4 轮:</p>
-<ol>
-  <li>
-    <strong>第 1 轮 · 接口</strong>: 只贴 "目标 + 场景 + 模块", 让 LLM 给出
-    interface 草案 (不写实现, 只钉形状)。
-  </li>
-  <li>
-    <strong>第 2 轮 · 接线</strong>: 贴 "模块 + 接线", 让 LLM 给出
-    <code>index.ts</code> 接线, 工厂是 stub。
-  </li>
-  <li>
-    <strong>第 3 轮 · 边界</strong>: 贴 "边界 checklist", 让 LLM 按 checklist
-    写每个工厂实现。
-  </li>
-  <li>
-    <strong>第 4 轮 · 验证</strong>: 贴 "验证断言清单", 让 LLM 写测试, 优先 fake
-    LLM + messages 顺序断言。
-  </li>
-</ol>
-<p>4 轮独立可验证, 任一轮不通过单独回退。</p>
-
-<h2 id="validation-cheatsheet">Validation 速查</h2>
-<p>16 章 Validation 卡片一句话总结。完整的 Validation 卡片在每章末。</p>
-<table class="terms">
-  <thead>
-    <tr>
-      <th>章节</th>
-      <th>核心断言</th>
-    </tr>
-  </thead>
-  <tbody>
-    <tr>
-      <td>00 · 元方法</td>
-      <td>vibe coding 4 轮拆卡 + review checklist + 防自欺 4 盏红灯</td>
-    </tr>
-    <tr>
-      <td>01 · 最小 loop</td>
-      <td>fake LLM 收到 messages 顺序 + history 浅拷贝 + 空 query 不写入</td>
-    </tr>
-    <tr>
-      <td>02 · 工具调用</td>
-      <td>tool_call_id 配对 + 错误仍写 tool message + 未知工具不崩</td>
-    </tr>
-    <tr>
-      <td>03 · TODO</td>
-      <td>状态机拒绝跳跃 + in_progress 唯一 + reminder 走 user 消息</td>
-    </tr>
-    <tr>
-      <td>04 · SubAgent</td>
-      <td>父子 history/todo 隔离 + 防递归 + maxRounds 触发截断</td>
-    </tr>
-    <tr>
-      <td>05 · Skill</td>
-      <td>activeSkills 闭包化 + 工具去重 + 父子 skill 隔离</td>
-    </tr>
-    <tr>
-      <td>06 · Compress</td>
-      <td>压缩不写回 history + tool_call_id 保留 + summary block 替换</td>
-    </tr>
-    <tr>
-      <td>07 · Permission</td>
-      <td>deny 必写 tool message + 子智能体 ask 降级 + denylist 优先</td>
-    </tr>
-    <tr>
-      <td>08 · Hook</td>
-      <td>
-        PreToolUse block 写 tool message + Hook 抛错降级 + 注入消息延迟追加
-      </td>
-    </tr>
-    <tr>
-      <td>09 · Memory</td>
-      <td>memory 不进 history + scope 隔离 + key 命名规范</td>
-    </tr>
-    <tr>
-      <td>10 · Cache</td>
-      <td>动态状态走 reminder + snapshot 字符串比较 + tools 拼装稳定</td>
-    </tr>
-    <tr>
-      <td>11 · Recovery</td>
-      <td>rate_limit 退避重试 + truncation 累积 + retry 上限放弃</td>
-    </tr>
-    <tr>
-      <td>12 · Task</td>
-      <td>renderActive 过滤 completed + 状态机校验 + id 由 harness 生成</td>
-    </tr>
-    <tr>
-      <td>13 · Async Run</td>
-      <td>不阻塞主 loop + 通知 drain + 冲突检测 + P1 压缩复用</td>
-    </tr>
-    <tr>
-      <td>14 · Schedule</td>
-      <td>tick 派发到 async run + 通知单独 source + timezone 解析</td>
-    </tr>
-    <tr>
-      <td>15 · Hardening</td>
-      <td>所有 IO 走 atomic write + dry-run 不真删 + 时间统一 number</td>
-    </tr>
-  </tbody>
-</table>
-
-<h2 id="chapter-pack">章节 Prompt Pack 总目录</h2>
-<p>每章一句话总结, 方便写"重建这个项目" 的总 prompt 时查阅。</p>
-<dl class="defs">
-  <dt>00 · 元方法</dt>
-  <dd>用 LLM 写 LLM agent 的元方法: 4 轮拆卡 + review + 防自欺。</dd>
-  <dt>01 · 最小 loop</dt>
-  <dd>History + LLMClient + Agent + REPL + Composition Root 五件套。</dd>
-  <dt>02 · 工具调用</dt>
-  <dd>tool registry + tool_call 协议 + agent 主循环加 tool 分支。</dd>
-  <dt>03 · TODO</dt>
-  <dd>TODO 状态机 + reminder 注入 + 不污染 system prompt。</dd>
-  <dt>04 · SubAgent</dt>
-  <dd>父子 history/todo 隔离 + 防递归 + maxRounds。</dd>
-  <dt>05 · Skill</dt>
-  <dd>activeSkills 闭包化 + 按需激活 + 工具去重。</dd>
-  <dt>06 · Compress</dt>
-  <dd>三层压缩管道 (Normalize / Block / Compress) + 不写回 history。</dd>
-  <dt>07 · Permission</dt>
-  <dd>同步拦截 + deny 必写 tool message + 子智能体 ask 降级。</dd>
-  <dt>08 · Hook</dt>
-  <dd>三个事件 (SessionStart / PreToolUse / PostToolUse) + exitCode 0/1/2。</dd>
-  <dt>09 · Memory</dt>
-  <dd>user / project 双 scope + key 命名规范 + SessionStart 注入。</dd>
-  <dt>10 · Cache</dt>
-  <dd>稳定前缀 / 动态状态 / 自然增量 三段布局。</dd>
-  <dt>11 · Recovery</dt>
-  <dd>6 种失败分类 + 不同策略 + retry 上限 + 不跨 run 共享。</dd>
-  <dt>12 · Task</dt>
-  <dd>结构化字段 + 状态机严格 + renderActive 过滤 + id 由 harness 生成。</dd>
-  <dt>13 · Async Run</dt>
-  <dd>后台派发 + drain 通知 + 冲突检测 + 复用 P1 压缩。</dd>
-  <dt>14 · Schedule</dt>
-  <dd>cron 调度 + 复用 async run + 单独 source 标签 + timezone 解析。</dd>
-  <dt>15 · Hardening</dt>
-  <dd>原子写 + 日志轮转 + output 引用计数 + dry-run + 时间统一。</dd>
-</dl>
-
-<h2 id="tension-map">张力地图: 章节之间的矛盾点</h2>
-<p>
-  harness 设计不是"哪一章节对的", 而是"在哪些约束下做哪种选择"。下面
-  是章节之间反复出现的张力, 回头查这里。
-</p>
-<dl class="defs">
-  <dt>压缩 vs cache (06 vs 10)</dt>
-  <dd>
-    压缩让 messages 短, 但破坏 system prompt 字符串的 cache 前缀稳定性。P2
-    总结的 LLM 调用本身吃 cache 配额, 需要 stable context 隔离。
-  </dd>
-  <dt>并行 vs 串行工具 (02 vs 13)</dt>
-  <dd>
-    同一个工具并行跑 (例如两个 run_bash) 容易写同一文件, async run +
-    冲突检测是折中方案。
-  </dd>
-  <dt>memory vs task (09 vs 12)</dt>
-  <dd>
-    memory 是"用户级偏好" (跨项目), task 是"项目级 plan"
-    (单项目)。用户换项目后项目级 fact 隐藏, user 级保留。
-  </dd>
-  <dt>reminder 频率 (03 vs 13 vs 14)</dt>
-  <dd>
-    TODO reminder / async run notification / schedule notification 都走 reminder
-    标签, 各自 source 区分。频率高时 reminder 累积, 需要各自内部去重。
-  </dd>
-  <dt>权限共享 vs 隔离 (07 vs 04)</dt>
-  <dd>
-    父子 agent 共享 permission 策略 (子不能绕过父), 但 history / todo
-    隔离。共享与隔离的边界是 harness 设计核心。
-  </dd>
-  <dt>eval 实时性 vs 重放 (eval vs 11)</dt>
-  <dd>
-    eval 跑真实 LLM 能反映现状, 但每次跑结果可能不同; 重放 deterministic LLM
-    stub 稳定但可能与真实行为漂移。两者结合用 (deterministic 主 + live 副)。
-  </dd>
-</dl>
-
-<h2 id="next-step">下一步</h2>
-<p>现在你已经走完整套教程。建议的下一步:</p>
-<ol>
-  <li>
-    挑一章 (建议从 02 章开始, 不依赖 00/01 的元方法), 用本章 6 段模板自己写一份
-    Prompt Card。
-  </li>
-  <li>
-    用第 01 章的 fake LLM Cookbook 写测试, 验证你写的卡片能让 LLM 实现通过
-    Validation。
-  </li>
-  <li>挑一章做 vibe coding 4 轮拆卡练习, 每轮跑通 Validation 才进下一轮。</li>
-  <li>
-    读 eval 专题章, 学习如何把"卡片的实现" 自动化回归 (eval 反馈 prompt 优化)。
-  </li>
-  <li>
-    读 model-policy 专题章, 学习不同大模型的差异 (cache / tool call / 错误码)。
-  </li>
-</ol>
+<article id="article-root">
+  <p class="article__eyebrow">Reference · 设计模式与架构套路</p>
+  <h1 class="article__title">整套项目用到的设计模式与架构套路</h1>
+  <p class="article__lede">
+    读完 16 章主线 + 2 个专题, 你已经知道每个功能"做什么"。 这一页 不再讲新功能,
+    而是带你横向看: 整个项目反复用了哪些
+    <strong>设计模式</strong>
+    和
+    <strong>架构套路</strong>
+    ? 它们
+    <strong>为什么</strong>
+    反复出现? 背后的
+    <strong>原理</strong>
+    是什么? 真实代码长什么样? 读这一页后, 你打开任何
+    <code>src/*.ts</code>
+    都能 秒识别它的"骨架套路", 不再被陌生的类名吓到。
+  </p>
+  <nav aria-label="页内小节" class="article__meta" id="article-inline-toc">
+    <a href="#how-to-use">怎么用这一页</a>
+    <a href="#core-principles">4 条核心原则</a>
+    <a href="#pattern-1">模式 1 · 工厂函数 + 闭包 (Factory + Closure)</a>
+    <a href="#pattern-2"
+      >模式 2 · 窄接口 + Adapter 翻译 (Narrow Interface + Adapter)</a
+    >
+    <a href="#pattern-3">模式 3 · 依赖注入 (Dependency Injection)</a>
+    <a href="#pattern-4">模式 4 · Composition Root (唯一组装点)</a>
+    <a href="#pattern-5">模式 5 · 注册表模式 (Registry Pattern)</a>
+    <a href="#pattern-6">模式 6 · Builder / Pipeline 模式</a>
+    <a href="#pattern-7">模式 7 · State Machine (状态机)</a>
+    <a href="#pattern-8">模式 8 · Strategy (策略模式)</a>
+    <a href="#pattern-9">模式 9 · Observer (观察者模式) - 改造版</a>
+    <a href="#pattern-10">模式 10 · Atomic Write (原子写)</a>
+    <a href="#pattern-11">模式 11 · Reminder 模式 (稳定前缀外的动态状态)</a>
+    <a href="#pattern-12">模式 12 · Cache-friendly Layout (缓存友好布局)</a>
+    <a href="#pattern-13">模式 13 · 错误分类 + 恢复动作 (Error → Action)</a>
+    <a href="#pattern-14">模式 14 · Concurrent Limit (并发限制)</a>
+    <a href="#pattern-15">模式 15 · Identity Check (id 与目录名同步)</a>
+    <a href="#pattern-16">模式 16 · Test Factory (测试工厂)</a>
+    <a href="#pattern-17">模式 17 · Stable Identity (id 不变)</a>
+    <a href="#pattern-18">模式 18 · Transcript-First (transcript 优先)</a>
+    <a href="#pattern-19">模式 19 · Test Doubles (测试替身)</a>
+    <a href="#pattern-20">模式 20 · Prompt Card 模板</a>
+    <a href="#pattern-21">模式 21 · No Catch Throw (业务错误不 throw)</a>
+    <a href="#pattern-22">模式 22 · Idempotent Operations (幂等操作)</a>
+    <a href="#anti-patterns">反模式速查 (在仓库里绝对不出现)</a>
+    <a href="#where-to-go-next">学完这一页之后</a>
+  </nav>
+  <hr class="rule" />
+  <h2 id="how-to-use">怎么用这一页</h2>
+  <table class="terms">
+    <thead>
+      <tr>
+        <th>需求</th>
+        <th>看哪一节</th>
+      </tr>
+    </thead>
+    <tbody>
+      <tr>
+        <td>看到一个不熟悉的类名 / 文件名, 想找它的"模板"</td>
+        <td>22 个模式速查</td>
+      </tr>
+      <tr>
+        <td>想理解"为什么整个项目都是这套写法"</td>
+        <td>4 条核心原则 (次节)</td>
+      </tr>
+      <tr>
+        <td>想读懂某个具体模块</td>
+        <td>跳到对应模式, 找仓库真实文件</td>
+      </tr>
+      <tr>
+        <td>自己写新模块, 想"照着惯例写"</td>
+        <td>从"先写接口" / "再写工厂" / "最后注入" 这套流程走</td>
+      </tr>
+      <tr>
+        <td>想看哪些"反模式"在仓库里绝对不出现</td>
+        <td>末节"反模式速查"</td>
+      </tr>
+    </tbody>
+  </table>
+  <h2 id="core-principles">4 条核心原则</h2>
+  <p>
+    整个项目所有模式都围绕 4 条核心原则展开。 理解这 4 条, 后面的 22
+    个模式只是它们的"具体应用"。 这 4 条不是凭空选的 — 它们各自
+    对应一类反复出现的"系统级"问题, 任何一条违反都会让 harness 在 某个维度上崩。
+  </p>
+  <h3>原则 1 · Composition Root 唯一组装</h3>
+  <p>
+    <strong>解决什么问题</strong>
+    : 业务模块互相 import, 改一个全挂; 难测试, 换部署环境要改 10 处。
+  </p>
+  <p>
+    <strong>原理</strong>
+    : 整个项目
+    <strong>只有 1 个文件</strong>
+    import 一切、 new 一切:
+    <code>src/index.ts</code>
+    的
+    <code>main()</code>
+    。 业务模块 之间互相不 import, 只接收注入。 业务模块对"用什么 LLM / 哪个
+    history" 一无所知, 全部由 Composition Root 决定。
+  </p>
+  <p>
+    <strong>带来的好处</strong>
+    : 换测试 = 换
+    <code>main()</code>
+    一处; 换部署 = 换
+    <code>main()</code>
+    一处; 业务模块保持纯净, 永远不依赖具体实现。
+    业务模块之间的依赖关系图"塌缩成一个点", 维护复杂度从 O(n²) 降到 O(n)。
+  </p>
+  <p class="source-link">
+    <a
+      href="https://github.com/pingp76/swoopcode/blob/main/src/index.ts#L1"
+      rel="noreferrer"
+      target="_blank"
+      >GitHub · src/index.ts (Composition Root) (L1)</a
+    >
+  </p>
+  <h3>原则 2 · 工厂模式, 状态在闭包内</h3>
+  <p>
+    <strong>解决什么问题</strong>
+    : 单例导致父子 agent 状态污染; class 的
+    <code>this</code>
+    绑定问题让 callback 出 bug; 测试难隔离。
+  </p>
+  <p>
+    <strong>原理</strong>
+    : 几乎所有"有状态" 的对象 (agent / history / todoManager / skillLoader /
+    memoryManager / taskManager) 都不是单例, 也不是 class, 而是
+    <strong>工厂函数返回的闭包对象</strong>
+    。 状态 藏在闭包里, 实例之间天然隔离。 闭包没有
+    <code>this</code>
+    绑定问题 (callback / 解构时不会丢上下文)。
+  </p>
+  <p>
+    <strong>带来的好处</strong>
+    : 父子 agent 隔离、并发测试隔离、多 session 隔离"自动" 成立,
+    不用业务模块写一行隔离代码。 仓库 0 个 class, 100% 工厂 + 闭包,
+    模式高度一致。
+  </p>
+  <h3>原则 3 · 稳定前缀优先, 状态走 reminder</h3>
+  <p>
+    <strong>解决什么问题</strong>
+    : LLM provider 的 prompt cache 是按 "前缀匹配" 工作的 (命中部分的 input 价格约为原价的 Anthropic 1/10,
+    OpenAI 1/2), 任何"小修改" 都会让整个 cache 失效; system prompt 里塞状态会让
+    cache 命中率 归零, 成本涨 5-10 倍。
+  </p>
+  <p>
+    <strong>原理</strong>
+    : 把 prompt layout 严格分成两部分 —
+    <strong>稳定前缀</strong>
+    (system prompt + tools, 写后不改, 进 cache) +
+    <strong>动态 tail</strong>
+    (history + reminder, 每轮变, 不进 cache 但算增量价格)。 状态 (TODO / memory
+    / skill hint) 走 user reminder, 永远不进 system prompt。
+  </p>
+  <p>
+    <strong>带来的好处</strong>
+    : 跨 session / 跨用户 / 跨项目一致的行为规则 + 个性化的状态注入,
+    两者互不污染, cache 命中率最大化。
+  </p>
+  <h3>原则 4 · narrow interface + adapter 翻译</h3>
+  <p>
+    <strong>解决什么问题</strong>
+    : 业务代码直接调外部 SDK (OpenAI / Anthropic / 文件系统 / 终端) 时, 业务被
+    provider 特定细节污染; 换 provider = 重写业务; 错误码不一致 (Anthropic 429 /
+    OpenAI 429 / Google RESOURCE_EXHAUSTED)。
+  </p>
+  <p>
+    <strong>原理</strong>
+    : 所有外部依赖 (LLM / 终端 / 工具 / 文件系统) 都 走
+    <strong>窄接口</strong>
+    (1-3 个方法), 实现藏在 adapter 里。 Adapter 内部处理 network / retry / cache
+    / provider 错误码翻译, 业务模块 只看内部统一类型 (e.g.
+    <code>LLMErrorKind</code>
+    7 类)。
+  </p>
+  <p>
+    <strong>带来的好处</strong>
+    : 业务模块 0 改动换底层; 测试用 fake 替换 真实实现; provider-specific
+    错误码统一翻译成内部错误类型, recovery 模块不依赖具体 provider。
+  </p>
+  <h2 id="pattern-1">模式 1 · 工厂函数 + 闭包 (Factory + Closure)</h2>
+  <p>
+    <strong>出现频率</strong>
+    : 整个项目最常用的模式, 出现在几乎所有 manager / store / loader 上。
+  </p>
+  <p>
+    <strong>解决什么问题</strong>
+    : 单例导致父子 agent 共享状态 (子 agent 调 load_skill 影响父); class 的
+    <code>this</code>
+    绑定问题 (把方法传 setTimeout / 事件回调,
+    <code>this</code>
+    丢失); 测试无法隔离 (一个测试残留状态影响下一个测试)。
+  </p>
+  <p>
+    <strong>原理</strong>
+    : 函数式编程 + 闭包 — 工厂函数返回的对象捕获 闭包内的局部变量,
+    这些变量对外不可见, 只能通过返回的方法访问。 每次调
+    <code>createXxx()</code>
+    都得到全新闭包, 状态天然隔离。 闭包没有
+    <code>this</code>
+    , 传 callback 永远不会丢上下文。
+  </p>
+  <h3>模式长什么样</h3>
+  <pre class="code-block"><code>// 教学简化版
+export function createTodoManager(): TodoManager {
+  // 状态在闭包内, 不放 module-level
+  const items = new Map&lt;string, Todo&gt;();
+  let activeId: string | null = null;
+
+  // 公开方法通过返回的对象暴露, 共享上面的闭包状态
+  return {
+    create(content: string): Todo { /* ... */ },
+    update(id: string, status: TodoStatus): boolean { /* ... */ },
+    list(): Todo[] { return [...items.values()]; },
+    // ...
+  };
+}</code></pre>
+  <p class="source-link">
+    <a
+      href="https://github.com/pingp76/swoopcode/blob/main/src/todo.ts#L1"
+      rel="noreferrer"
+      target="_blank"
+      >GitHub · createTodoManager 工厂 + 闭包 (最典型示例) (L1)</a
+    >
+  </p>
+  <p>关键点:</p>
+  <ol>
+    <li>
+      <strong>工厂函数</strong>
+      <code>createTodoManager</code>
+      返回对象, 不
+      <code>new TodoManager()</code>
+      。
+    </li>
+    <li>
+      <strong>状态在闭包内</strong>
+      <code>items</code>
+      /
+      <code>activeId</code>
+      外部 看不到, 只能通过返回的方法访问。
+    </li>
+    <li>
+      <strong>this 不存在</strong>
+      避免 TypeScript class 常见的
+      <code>this</code>
+      丢失问题 (callback / 解构时丢上下文)。
+    </li>
+  </ol>
+  <h3>仓库里哪些文件用这个模式</h3>
+  <table class="terms">
+    <thead>
+      <tr>
+        <th>文件</th>
+        <th>工厂</th>
+        <th>闭包状态</th>
+      </tr>
+    </thead>
+    <tbody>
+      <tr>
+        <td>
+          <a
+            href="https://github.com/pingp76/swoopcode/blob/main/src/history.ts#L1"
+            rel="noreferrer"
+            target="_blank"
+            ><code>src/history.ts</code></a
+          >
+        </td>
+        <td>
+          <code>createHistory()</code>
+        </td>
+        <td>messages 数组</td>
+      </tr>
+      <tr>
+        <td>
+          <a
+            href="https://github.com/pingp76/swoopcode/blob/main/src/todo.ts#L1"
+            rel="noreferrer"
+            target="_blank"
+            ><code>src/todo.ts</code></a
+          >
+        </td>
+        <td>
+          <code>createTodoManager()</code>
+        </td>
+        <td>items map + activeId</td>
+      </tr>
+      <tr>
+        <td>
+          <a
+            href="https://github.com/pingp76/swoopcode/blob/main/src/skills.ts#L1"
+            rel="noreferrer"
+            target="_blank"
+            ><code>src/skills.ts</code></a
+          >
+        </td>
+        <td>
+          <code>createActiveSkillSet()</code>
+          /
+          <code>createSkillLoader()</code>
+        </td>
+        <td>activated map + toolIndex map</td>
+      </tr>
+      <tr>
+        <td>
+          <a
+            href="https://github.com/pingp76/swoopcode/blob/main/src/memory.ts#L1"
+            rel="noreferrer"
+            target="_blank"
+            ><code>src/memory.ts</code></a
+          >
+        </td>
+        <td>
+          <code>createMemoryManager()</code>
+        </td>
+        <td>MemoryStore 实例</td>
+      </tr>
+      <tr>
+        <td>
+          <a
+            href="https://github.com/pingp76/swoopcode/blob/main/src/tasks.ts#L1"
+            rel="noreferrer"
+            target="_blank"
+            ><code>src/tasks.ts</code></a
+          >
+        </td>
+        <td>
+          <code>createTaskManager()</code>
+        </td>
+        <td>TaskStore 实例 + activeTaskGroupId</td>
+      </tr>
+      <tr>
+        <td>
+          <a
+            href="https://github.com/pingp76/swoopcode/blob/main/src/async-runs.ts#L1"
+            rel="noreferrer"
+            target="_blank"
+            ><code>src/async-runs.ts</code></a
+          >
+        </td>
+        <td>
+          <code>createAsyncRunManager()</code>
+        </td>
+        <td>runs map + runningCount</td>
+      </tr>
+      <tr>
+        <td>
+          <a
+            href="https://github.com/pingp76/swoopcode/blob/main/src/schedules.ts#L1"
+            rel="noreferrer"
+            target="_blank"
+            ><code>src/schedules.ts</code></a
+          >
+        </td>
+        <td>
+          <code>createScheduleManager()</code>
+        </td>
+        <td>ScheduleStore + 触发状态</td>
+      </tr>
+      <tr>
+        <td>
+          <a
+            href="https://github.com/pingp76/swoopcode/blob/main/src/permission.ts#L1"
+            rel="noreferrer"
+            target="_blank"
+            ><code>src/permission.ts</code></a
+          >
+        </td>
+        <td>
+          <code>createPermissionManager()</code>
+        </td>
+        <td>mode + blacklist + whitelist</td>
+      </tr>
+      <tr>
+        <td>
+          <a
+            href="https://github.com/pingp76/swoopcode/blob/main/src/hooks.ts#L1"
+            rel="noreferrer"
+            target="_blank"
+            ><code>src/hooks.ts</code></a
+          >
+        </td>
+        <td>
+          <code>createHookRunner()</code>
+        </td>
+        <td>hooks map (按 timing 分组)</td>
+      </tr>
+      <tr>
+        <td>
+          <a
+            href="https://github.com/pingp76/swoopcode/blob/main/src/eval/live/live-llm.ts#L1"
+            rel="noreferrer"
+            target="_blank"
+            ><code>src/eval/live/live-llm.ts</code></a
+          >
+        </td>
+        <td>
+          <code>createLiveEvalLLMClient()</code>
+        </td>
+        <td>LLMClient + trace emitter</td>
+      </tr>
+    </tbody>
+  </table>
+  <h3>为什么不直接用 class</h3>
+  <p>
+    朴素想法: "用 class 写, 状态放 private 字段, 不是更清晰?" 错。 相比闭包工厂, class 在
+    TypeScript 里有三个问题:
+  </p>
+  <ol>
+    <li>
+      <strong>this 绑定问题</strong>
+      : 把方法当 callback 传 (e.g.
+      <code>setTimeout</code>
+      ),
+      <code>this</code>
+      丢失, 访问
+      <code>this.items</code>
+      变
+      <code>undefined</code>
+      。 闭包没有这个问题 — 闭包捕获的是变量, 不是
+      <code>this</code>
+      。
+    </li>
+    <li>
+      <strong>测试隔离</strong>
+      : 测一个方法要
+      <code>new TodoManager()</code>
+      , 测完销毁。 闭包工厂一行
+      <code>createTodoManager()</code>
+      即可, 自动隔离。
+    </li>
+    <li>
+      <strong>父子隔离自动</strong>
+      : 父子 agent 各自调
+      <code>createTodoManager()</code>
+      , 闭包天然隔离。 class 要手动传实例, 容易写错。
+    </li>
+  </ol>
+  <p>这就是为什么仓库 100% 用工厂 + 闭包, 一个 class 都没有。</p>
+  <h3>常见陷阱</h3>
+  <dl class="defs">
+    <dt>陷阱 · 闭包状态被意外共享</dt>
+    <dd>
+      在工厂函数外定义 const, 然后在工厂内捕获 — 状态会"按模块共享" (所有实例共用一份),
+      而不是"按实例隔离"。 状态必须
+      <strong>在工厂函数体内</strong>
+      声明。
+    </dd>
+    <dt>陷阱 · 闭包引用了大对象导致内存泄漏</dt>
+    <dd>
+      只要闭包本身还被引用, 它捕获的对象就无法被 GC。 Async Run / Schedule 完成后必须
+      <code>finishRun()</code>
+      显式清理闭包内的 map, 否则长期运行时内存会持续增长。
+    </dd>
+  </dl>
+  <h2 id="pattern-2">
+    模式 2 · 窄接口 + Adapter 翻译 (Narrow Interface + Adapter)
+  </h2>
+  <p>
+    <strong>出现频率</strong>
+    : 外部依赖 (LLM / 终端 / 工具 / Provider / LLM Provider Error)
+    全部用这个模式。
+  </p>
+  <p>
+    <strong>解决什么问题</strong>
+    : 业务代码直接调
+    <code>openai.chat.completions.create()</code>
+    时, 业务被 provider 特定字段名污染; Anthropic / OpenAI / Google 三个
+    provider 字段格式完全不同; 换 provider = 重写业务; provider 错误码各不相同
+    (Anthropic 429 / OpenAI 429 / Google RESOURCE_EXHAUSTED) 业务需要逐个识别。
+  </p>
+  <p>
+    <strong>原理</strong>
+    : 经典
+    <strong>适配器模式</strong>
+    +
+    <strong>接口隔离原则</strong>
+    。 定义一个跟外部细节
+    <strong>无关</strong>
+    的窄接口 (只含业务真正需要的 1-3 个方法), 业务模块只 import 这个接口类型,
+    不 import 任何外部 SDK。 Adapter 实现这个接口, 内部把所有 provider 特定逻辑
+    (字段名 / 错误码 / 重试 / 缓存) 翻译成内部统一类型。
+  </p>
+  <p>
+    <strong>带来的好处</strong>
+    : 业务模块 0 改动换 provider; 测试用 fake 替换; provider-specific
+    错误码统一翻译; 重试 / 缓存 / streaming 这些"中间件" 关注点藏 adapter 内部。
+  </p>
+  <h3>模式长什么样</h3>
+  <pre class="code-block"><code>// 教学简化版
+// 1. 定义窄接口
+interface LLMClient {
+  chat(args: { messages; tools? }): Promise&lt;LLMResponse&gt;;
+}
+
+interface LLMResponse {
+  content: string | null;
+  toolCalls: ChatCompletionMessageToolCall[];
+  finishReason: string | null;
+}
+
+// 2. Adapter 实现接口, 内部处理 provider 特定逻辑
+export function createOpenAILLMClient(config: ResolvedLLMConfig): LLMClient {
+  const openai = new OpenAI({ apiKey: config.apiKey, baseURL: config.baseURL });
+  return {
+    async chat({ messages, tools }) {
+      const completion = await openai.chat.completions.create({
+        model: config.model,
+        messages,
+        tools,    // OpenAI 格式转换在 adapter 内部
+      });
+      // OpenAI 特定: choice[0].message.tool_calls
+      const message = completion.choices[0].message;
+      return {
+        content: message.content,
+        toolCalls: message.tool_calls ?? [],
+        finishReason: completion.choices[0].finish_reason,
+      };
+    },
+  };
+}</code></pre>
+  <p class="source-link">
+    <a
+      href="https://github.com/pingp76/swoopcode/blob/main/src/llm.ts#L1"
+      rel="noreferrer"
+      target="_blank"
+      >GitHub · createOpenAILLMClient (LLMClient 适配器) (L1)</a
+    >
+  </p>
+  <p>关键点:</p>
+  <ol>
+    <li>
+      <strong>接口只暴露 1 个方法</strong>
+      :
+      <code>chat()</code>
+      , 不暴露
+      <code>temperature</code>
+      /
+      <code>max_tokens</code>
+      等 provider 特有参数。 这些参数由 adapter 内部处理, 业务模块不感知。
+    </li>
+    <li>
+      <strong>Adapter 翻译 provider 特定字段</strong>
+      : OpenAI 的
+      <code>choices[0].message</code>
+      翻译成
+      <code>LLMResponse</code>
+      。 换 Claude 时, Claude adapter 自己处理
+      <code>content[0].text</code>
+      。
+    </li>
+    <li>
+      <strong>业务模块只看接口</strong>
+      :
+      <code>agent.ts</code>
+      import
+      <code>LLMClient</code>
+      类型, 不 import OpenAI。 换 Claude = 换 adapter, 业务代码 0 改动。
+    </li>
+  </ol>
+  <h3>仓库里哪些文件用这个模式</h3>
+  <table class="terms">
+    <thead>
+      <tr>
+        <th>外部依赖</th>
+        <th>窄接口</th>
+        <th>Adapter</th>
+      </tr>
+    </thead>
+    <tbody>
+      <tr>
+        <td>LLM (OpenAI / Anthropic / Google)</td>
+        <td>
+          <code>LLMClient</code>
+          (1 个方法)
+        </td>
+        <td>
+          <a
+            href="https://github.com/pingp76/swoopcode/blob/main/src/llm.ts#L1"
+            rel="noreferrer"
+            target="_blank"
+            ><code>src/llm.ts</code></a
+          >
+        </td>
+      </tr>
+      <tr>
+        <td>REPL 终端</td>
+        <td>
+          <code>Terminal</code>
+          (3 个方法)
+        </td>
+        <td>
+          <a
+            href="https://github.com/pingp76/swoopcode/blob/main/src/terminal.ts#L1"
+            rel="noreferrer"
+            target="_blank"
+            ><code>src/terminal.ts</code></a
+          >
+        </td>
+      </tr>
+      <tr>
+        <td>工具调用</td>
+        <td>
+          <code>ToolResult</code>
+          接口 +
+          <code>ToolRegistry</code>
+        </td>
+        <td>
+          <a
+            href="https://github.com/pingp76/swoopcode/blob/main/src/tools/registry.ts#L1"
+            rel="noreferrer"
+            target="_blank"
+            ><code>src/tools/registry.ts</code></a
+          >
+        </td>
+      </tr>
+      <tr>
+        <td>Provider 错误</td>
+        <td>
+          <code>LLMErrorKind</code>
+          7 类枚举
+        </td>
+        <td>
+          <a
+            href="https://github.com/pingp76/swoopcode/blob/main/src/llm-adapter.ts#L1"
+            rel="noreferrer"
+            target="_blank"
+            ><code>src/llm-adapter.ts</code></a
+          >
+          翻译
+        </td>
+      </tr>
+      <tr>
+        <td>Eval 工具执行</td>
+        <td>
+          <code>ToolExecutor</code>
+          函数类型
+        </td>
+        <td>
+          <a
+            href="https://github.com/pingp76/swoopcode/blob/main/src/eval/drivers/learn-claude-code/in-process-driver.ts#L1"
+            rel="noreferrer"
+            target="_blank"
+            ><code
+              >src/eval/drivers/learn-claude-code/in-process-driver.ts</code
+            ></a
+          >
+        </td>
+      </tr>
+      <tr>
+        <td>Eval LLM</td>
+        <td>
+          <code>LLMClient</code>
+          (复用主项目的)
+        </td>
+        <td>
+          <a
+            href="https://github.com/pingp76/swoopcode/blob/main/src/eval/drivers/learn-claude-code/scripted-llm.ts#L1"
+            rel="noreferrer"
+            target="_blank"
+            ><code>src/eval/drivers/learn-claude-code/scripted-llm.ts</code></a
+          >
+          /
+          <a
+            href="https://github.com/pingp76/swoopcode/blob/main/src/eval/replay/replay-llm.ts#L1"
+            rel="noreferrer"
+            target="_blank"
+            ><code>replay/replay-llm.ts</code></a
+          >
+          /
+          <a
+            href="https://github.com/pingp76/swoopcode/blob/main/src/eval/live/live-llm.ts#L1"
+            rel="noreferrer"
+            target="_blank"
+            ><code>live/live-llm.ts</code></a
+          >
+        </td>
+      </tr>
+    </tbody>
+  </table>
+  <h3>3 个边界设计</h3>
+  <p>写 adapter 时, 必须遵守 3 条:</p>
+  <ol>
+    <li>
+      <strong>接口只暴露业务关心的</strong>
+      : 业务不需要 "messages 在 Anthropic 哪个字段", 只需要 "LLMResponse.content
+      / toolCalls"。 其他字段不外泄。
+    </li>
+    <li>
+      <strong>错误统一翻译</strong>
+      : Provider 错误码 (Anthropic 429 / OpenAI 429 / Google RESOURCE_EXHAUSTED)
+      全部翻译成
+      <code>LLMErrorKind.rate_limit</code>
+      。 Recovery 模块 (第 11 章) 不看 provider。
+    </li>
+    <li>
+      <strong>Adapter 自己处理 network / retry / cache</strong>
+      : 这些 "中间件" 关注点藏 adapter 内部, 业务模块不感知。
+    </li>
+  </ol>
+  <h3>常见陷阱</h3>
+  <dl class="defs">
+    <dt>陷阱 · 接口设计过大</dt>
+    <dd>
+      LLMClient 暴露
+      <code>setTemperature()</code>
+      /
+      <code>setMaxTokens()</code>
+      等 5 个 setter, 业务模块就要关心这 5 个。 正确做法: 接口只有
+      <code>chat()</code>
+      一个方法, 这些 setter 藏在 adapter 内部或 通过 RuntimePolicy 注入。
+    </dd>
+    <dt>陷阱 · 业务模块 import 了 adapter</dt>
+    <dd>
+      <code>import { createOpenAILLMClient } from "./llm.js"</code>
+      — 这破坏了窄接口的隔离。 业务应该 import 类型
+      <code>LLMClient</code>
+      , 让 Composition Root 决定用哪个 adapter 实现。
+    </dd>
+  </dl>
+  <h2 id="pattern-3">模式 3 · 依赖注入 (Dependency Injection)</h2>
+  <p>
+    <strong>出现频率</strong>
+    : 几乎所有 manager 的构造函数 / 工厂函数。
+  </p>
+  <p>
+    <strong>解决什么问题</strong>
+    : 业务代码自己
+    <code>new PermissionManager()</code>
+    /
+    <code>new Logger()</code>
+    时, 业务被具体实现绑死; 改 logger 格式要改 业务代码; 测试时无法替换 logger
+    收集输出 (因为 logger 在业务内部 new 出来的, 外面引用不到)。
+  </p>
+  <p>
+    <strong>原理</strong>
+    :
+    <strong>控制反转</strong>
+    (Inversion of Control) — 业务模块不再控制依赖的创建, 而是
+    <strong>接收</strong>
+    别人创建的 依赖 (通过构造函数 / 工厂参数)。 依赖的"组装" 集中在 Composition
+    Root, 业务模块保持纯净。
+  </p>
+  <p>
+    <strong>带来的好处</strong>
+    : 测试可以注入 fake 实现 (e.g. ScriptedLLMClient 替代真实 LLM); 改 logger /
+    permission / tool 不改业务代码; 业务模块 0 配置即可运行 (因为依赖是注入的,
+    不需要 自己读 env / config)。
+  </p>
+  <h3>模式长什么样</h3>
+  <pre class="code-block"><code>// 教学简化版
+export function createAgent(deps: {
+  llm: LLMClient;             // 注入 LLM
+  history: History;           // 注入 history
+  tools: ToolRegistry;        // 注入 tool registry
+  permissionManager: PermissionManager;   // 注入 permission
+  todoManager: TodoManager;   // 注入 todo
+  memoryManager?: MemoryManager;   // 可选注入
+  hookRunner: HookRunner;     // 注入 hook
+  recoveryManager: RecoveryManager;   // 注入 recovery
+}): Agent {
+  return {
+    async run(query: string) {
+      // 业务代码不 new 任何 manager, 只用注入的
+      // 测试时可以传 fake
+    },
+  };
+}</code></pre>
+  <p class="source-link">
+    <a
+      href="https://github.com/pingp76/swoopcode/blob/main/src/agent.ts#L1"
+      rel="noreferrer"
+      target="_blank"
+      >GitHub · createAgent 依赖注入 (deps 对象接收) (L1)</a
+    >
+  </p>
+  <p>关键点:</p>
+  <ol>
+    <li>
+      <strong>deps 对象统一接收</strong>
+      : 不是
+      <code>createAgent(llm, history, tools, ...)</code>
+      一长串参数, 而是一个 deps 对象, 阅读时一目了然。
+    </li>
+    <li>
+      <strong>可选依赖用 <code>?</code></strong>
+      : memoryManager 可选, 调
+      <code>deps.memoryManager?.list()</code>
+      兼容缺省。
+    </li>
+    <li>
+      <strong>业务模块不感知注入</strong>
+      : 写 agent 主循环时, 不关心谁注入的, 只关心"调一下 deps.llm.chat()
+      拿响应"。
+    </li>
+  </ol>
+  <h3>哪些是注入, 哪些不是</h3>
+  <table class="terms">
+    <thead>
+      <tr>
+        <th>类别</th>
+        <th>注入</th>
+        <th>不注入 (模块自己管理)</th>
+      </tr>
+    </thead>
+    <tbody>
+      <tr>
+        <td>外部依赖</td>
+        <td>LLM / Terminal / 工具集 / Permission / Hook / Recovery</td>
+        <td>—</td>
+      </tr>
+      <tr>
+        <td>运行时状态</td>
+        <td>
+          History / TodoManager / MemoryManager / TaskManager / AsyncRunManager
+          / ScheduleManager
+        </td>
+        <td>—</td>
+      </tr>
+      <tr>
+        <td>内部辅助</td>
+        <td>Logger / CacheDebugger / Compressor / OutputStore</td>
+        <td>—</td>
+      </tr>
+      <tr>
+        <td>常量 / 静态数据</td>
+        <td>—</td>
+        <td>HARDCODED_BLACKLIST (command-safety.ts)</td>
+      </tr>
+    </tbody>
+  </table>
+  <p>
+    决策规则:
+    <strong>"如果测试需要替换它, 它就是注入"</strong>
+    。 命令黑名单不注入, 因为测试不需要替换; logger 要注入, 因为不同测试需要用
+    不同的 logger 收集输出。
+  </p>
+  <h3>常见陷阱</h3>
+  <dl class="defs">
+    <dt>陷阱 · 默认值放在构造函数内, 不可注入</dt>
+    <dd>
+      <code>const logger = createDefaultLogger()</code>
+      写死在业务模块内部 — 测试时想换 logger, 改不了。 正确: 业务模块把 logger 声明为可选依赖,
+      <strong>只在调用方传入 null / undefined 时</strong>
+      才显式回退到默认值, 不私自 new。
+    </dd>
+    <dt>陷阱 · 注入"上帝对象" 一个 deps 包揽 10 个东西</dt>
+    <dd>
+      业务模块的 deps 膨胀到 10+ 个, 难以阅读。 正确: 拆成几个相关 集合 (e.g.
+      <code>{ llm, history, messages }</code>
+      一起,
+      <code>{ logger, metrics }</code>
+      一起)。
+    </dd>
+  </dl>
+  <h2 id="pattern-4">模式 4 · Composition Root (唯一组装点)</h2>
+  <p>
+    <strong>出现频率</strong>
+    : 项目只有 1 个 Composition Root:
+    <code>src/index.ts</code>
+    的
+    <code>main()</code>
+    。
+  </p>
+  <p>
+    <strong>解决什么问题</strong>
+    : 业务模块互相 import, 改一个全挂; 难 部署到不同环境 (CLI / GUI / CI /
+    测试); 测试时无法替换真实 实现 (因为业务自己 new 了)。
+  </p>
+  <p>
+    <strong>原理</strong>
+    : 经典
+    <strong>依赖反转</strong>
+    的极端形式 — 整个 项目的"对象图构造" 集中到 1 个函数, 业务模块之间不互相 new
+    对方。 Composition Root 是唯一
+    <strong>知道所有实现细节</strong>
+    的地方。 切换环境 = 切换 Composition Root, 业务模块 0 改动。
+  </p>
+  <p>
+    <strong>带来的好处</strong>
+    : 换测试 = 换
+    <code>main()</code>
+    一处; 换部署 = 换
+    <code>main()</code>
+    一处; 业务模块保持纯净, 永远不依赖具体 实现。
+    业务模块之间的依赖关系图"塌缩成一个点", 维护复杂度 从 O(n²) 降到 O(n)。
+  </p>
+  <h3>模式长什么样</h3>
+  <pre class="code-block"><code>// 教学简化版, 真实 src/index.ts 比这复杂
+export async function main() {
+  // 1. 读配置
+  const config = loadConfig();
+
+  // 2. 创建项目级 context (路径派生)
+  const projectContext = createProjectContext({
+    projectRoot: process.cwd(),
+    agentHome: process.env.AGENT_HOME ?? "~/.swoopcode",
+  });
+
+  // 3. 创建 logger 和 terminal (permission 等模块需要 terminal, 必须先创建)
+  const logger = createLogger();
+  const terminal = createReadlineTerminal();
+
+  // 4. 创建共享实例
+  const llm = createLLMClient(config.llm, config.runtimePolicy);
+  const history = createHistory();
+  const toolRegistry = createToolRegistry();
+  const todoManager = createTodoManager();
+  const permissionManager = createPermissionManager({ mode: "default", terminal });
+  const hookRunner = createHookRunner();
+  // ... 还有 memory / task / async / schedule 等
+
+  // 5. 注册内置 hook (audit log / prettier / memory injection 等)
+  hookRunner.register(auditLogHook);
+  hookRunner.register(memoryInjectionHook(memoryManager));
+
+  // 6. 创建 agent (注入所有依赖)
+  const agent = createAgent({
+    llm, history, tools: toolRegistry,
+    todoManager, permissionManager, hookRunner, recoveryManager, ...
+  });
+
+  // 7. 启动 REPL
+  await startRepl(agent, terminal);
+}</code></pre>
+  <p class="source-link">
+    <a
+      href="https://github.com/pingp76/swoopcode/blob/main/src/index.ts#L1"
+      rel="noreferrer"
+      target="_blank"
+      >GitHub · main() Composition Root (L1)</a
+    >
+  </p>
+  <p>关键点:</p>
+  <ol>
+    <li>
+      <strong>唯一 import 一切</strong>
+      : 业务模块 (agent / history / llm) 之间 互不 import, 只在
+      <code>index.ts</code>
+      一处 import。
+    </li>
+    <li>
+      <strong>唯一 new 一切</strong>
+      : 业务模块用工厂函数, 工厂不 new 自己的依赖,
+      <code>index.ts</code>
+      调工厂。 业务模块不直接构造依赖。
+    </li>
+    <li>
+      <strong>测试时换 <code>main()</code></strong>
+      : eval test 写一个
+      <code>testMain()</code>
+      替换: 注入 fake LLM / fake terminal, 不动业务代码。
+    </li>
+  </ol>
+  <h3>为什么只能有 1 个</h3>
+  <p>
+    朴素想法: "能不能在同一个入口里放 2 个 Composition Root, 一个给 CLI, 一个给 GUI?" 错。
+    同一入口里有多个组装点意味着"业务模块 import 关系散落多处", 违反关键点 1 (唯一 import 一切)。
+    正确做法: 每个入口 (CLI / GUI) 各有一个独立的 main 文件, 各自是该入口唯一的 Composition Root:
+  </p>
+  <pre class="code-block"><code>// src/cli-main.ts (Composition Root for CLI)
+import { createAgent, createReadlineTerminal, ... } from "./agent.js";
+export async function main() { /* ... */ }
+
+// src/gui-main.ts (Composition Root for GUI)
+import { createAgent, createElectronTerminal, ... } from "./agent.js";
+export async function main() { /* ... */ }
+
+// 业务模块 ./agent.js 不变, 只是 Terminal 注入不同</code></pre>
+  <p class="source-link">
+    <a
+      href="https://github.com/pingp76/swoopcode/blob/main/src/index.ts#L1"
+      rel="noreferrer"
+      target="_blank"
+      >GitHub · main() Composition Root (多个 main.ts 模板) (L1)</a
+    >
+  </p>
+  <p>多个 main.ts, 但每个 main.ts 都是 Composition Root, 业务模块 0 改动。</p>
+  <h3>常见陷阱</h3>
+  <dl class="defs">
+    <dt>
+      陷阱 · 在业务模块里调
+      <code>loadConfig()</code>
+    </dt>
+    <dd>
+      <code>loadConfig()</code>
+      是"如何配置" 的实现, 应该只在 Composition Root 调。
+      业务模块不应该知道"配置从哪里来", 它只接收配置好的 对象。
+    </dd>
+    <dt>陷阱 · Composition Root 变成"上帝函数" 几千行</dt>
+    <dd>
+      main() 膨胀到 1000+ 行, 难维护。 正确: 把"创建一组相关对象" 抽成辅助函数
+      (e.g.
+      <code>createRuntime(config)</code>
+      ), main() 只负责"调一组辅助函数 + 启动 REPL"。
+    </dd>
+  </dl>
+  <h2 id="pattern-5">模式 5 · 注册表模式 (Registry Pattern)</h2>
+  <p>
+    <strong>出现频率</strong>
+    : ToolRegistry / SkillLoader / HookRunner / MemoryStore / TaskStore /
+    ScheduleStore / OutputStore。
+  </p>
+  <p>
+    <strong>解决什么问题</strong>
+    : 工具/技能/hook 数量动态增长 (从 5 个 涨到 30+); 调用方要"按名字查找" +
+    "按类别枚举"; 翻译 (工具 def → LLM ChatCompletionTool) 应该跟调用解耦。
+  </p>
+  <p>
+    <strong>原理</strong>
+    : 经典
+    <strong>注册表模式</strong>
+    — 把"按名字 查找对象" 抽成一个数据结构 (通常是
+    <code>Map&lt;string, T&gt;</code>
+    ), 外界通过
+    <code>register(name, obj)</code>
+    和
+    <code>get(name)</code>
+    交互。 注册和查询分离, 翻译 (LLM 格式转换) 藏 registry 内部。
+  </p>
+  <p>
+    <strong>带来的好处</strong>
+    : 调用方不
+    <code>import</code>
+    具体工具类; 动态加载 (从配置文件扫) 不需要改业务代码; 翻译 (tool def → LLM
+    格式) 在 registry 集中, 不会散在业务模块各处; 未知名字统一返回 error 结果,
+    不抛异常。
+  </p>
+  <h3>模式长什么样</h3>
+  <pre class="code-block"><code>// 教学简化版
+interface ToolRegistry {
+  register(name: string, def: ToolDefinition, executor: ToolExecutor): void;
+  get(name: string): ToolDefinition | undefined;
+  list(): ToolDefinition[];
+  getToolDefinitions(): ChatCompletionTool[];   // 转成 LLM 格式
+  invoke(name: string, args: unknown): Promise&lt;ToolResult&gt;;
+}
+
+export function createToolRegistry(): ToolRegistry {
+  const tools = new Map&lt;string, { def: ToolDefinition; executor: ToolExecutor }&gt;();
+
+  return {
+    register(name, def, executor) {
+      tools.set(name, { def, executor });
+    },
+    get(name) { return tools.get(name)?.def; },
+    list() { return [...tools.values()].map(t =&gt; t.def); },
+    getToolDefinitions() {
+      return [...tools.values()].map(t =&gt; ({
+        type: "function",
+        function: {
+          name: t.def.name,
+          description: t.def.description,
+          parameters: t.def.parameters,
+        },
+      }));
+    },
+    async invoke(name, args) {
+      const entry = tools.get(name);
+      if (!entry) return { content: `Tool not found: ${name}`, error: true };
+      return entry.executor(args, { /* tool context */ });
+    },
+  };
+}</code></pre>
+  <p class="source-link">
+    <a
+      href="https://github.com/pingp76/swoopcode/blob/main/src/tools/registry.ts#L1"
+      rel="noreferrer"
+      target="_blank"
+      >GitHub · createToolRegistry 注册表 (L1)</a
+    >
+  </p>
+  <p>关键点:</p>
+  <ol>
+    <li>
+      <strong>注册 / 查询 / 调用 分离</strong>
+      : 注册时存 def + executor, 查询时返回 def, 调用时 invoke executor。
+      三件事各管各的。
+    </li>
+    <li>
+      <strong>getToolDefinitions() 翻译</strong>
+      : 内部存 ToolDefinition, 对外返回 LLM 期望的 ChatCompletionTool 格式。
+      Adapter 翻译 在 registry 内部, 业务模块不感知。
+    </li>
+    <li>
+      <strong>未知工具返回 error</strong>
+      : 调不存在的工具不抛异常, 返回
+      <code>{ content: "Tool not found", error: true }</code>
+      。 LLM 收到后能继续推理。
+    </li>
+  </ol>
+  <h3>3 个变种</h3>
+  <table class="terms">
+    <thead>
+      <tr>
+        <th>变种</th>
+        <th>用途</th>
+        <th>示例文件</th>
+      </tr>
+    </thead>
+    <tbody>
+      <tr>
+        <td>静态 registry</td>
+        <td>工具一次性注册, 之后不变</td>
+        <td>
+          <a
+            href="https://github.com/pingp76/swoopcode/blob/main/src/tools/registry.ts#L1"
+            rel="noreferrer"
+            target="_blank"
+            ><code>src/tools/registry.ts</code></a
+          >
+        </td>
+      </tr>
+      <tr>
+        <td>动态 loader</td>
+        <td>从配置目录扫描加载 (skill / memory / task)</td>
+        <td>
+          <a
+            href="https://github.com/pingp76/swoopcode/blob/main/src/skills.ts#L1"
+            rel="noreferrer"
+            target="_blank"
+            ><code>src/skills.ts</code></a
+          >
+          /
+          <a
+            href="https://github.com/pingp76/swoopcode/blob/main/src/memory.ts#L1"
+            rel="noreferrer"
+            target="_blank"
+            ><code>src/memory.ts</code></a
+          >
+          /
+          <a
+            href="https://github.com/pingp76/swoopcode/blob/main/src/tasks.ts#L1"
+            rel="noreferrer"
+            target="_blank"
+            ><code>src/tasks.ts</code></a
+          >
+        </td>
+      </tr>
+      <tr>
+        <td>分组 runner</td>
+        <td>按 timing 分组 (hook runner)</td>
+        <td>
+          <a
+            href="https://github.com/pingp76/swoopcode/blob/main/src/hooks.ts#L1"
+            rel="noreferrer"
+            target="_blank"
+            ><code>src/hooks.ts</code></a
+          >
+        </td>
+      </tr>
+    </tbody>
+  </table>
+  <h3>常见陷阱</h3>
+  <dl class="defs">
+    <dt>陷阱 · registry 自身有顺序依赖</dt>
+    <dd>
+      注册顺序决定调用顺序, 业务依赖了"特定顺序"。 正确: registry 只管 存储,
+      顺序由调用方决定, 或者按"插入顺序" 显式记录。
+    </dd>
+    <dt>陷阱 · registry 翻译函数返回不一致格式</dt>
+    <dd>
+      <code>getToolDefinitions()</code>
+      第一次返回
+      <code>{type: "function"}</code>
+      , 第二次返回
+      <code>{type: "tool"}</code>
+      — 业务崩溃。 正确: 翻译 函数应该是
+      <strong>纯函数</strong>
+      , 同样输入永远同样输出。
+    </dd>
+  </dl>
+  <h2 id="pattern-6">模式 6 · Builder / Pipeline 模式</h2>
+  <p>
+    <strong>出现频率</strong>
+    : agent.prepareMessages() 的消息处理管道 / Compressor 的 P0/P1/P2 /
+    各种"输入 → 多步处理 → 输出" 的场景。
+  </p>
+  <p>
+    <strong>解决什么问题</strong>
+    : 消息处理 / 压缩 / 恢复是多步操作, 每步 顺序不能换, 散在 if/else 里难调试;
+    单步测试难 (要准备完整数据才能 测中间一步)。
+  </p>
+  <p>
+    <strong>原理</strong>
+    : 经典
+    <strong>管道模式</strong>
+    (Unix pipe 思想) — 把多步操作建模成"每步是纯函数, 输入 = 上一步输出,
+    顺序固定"。 每步
+    <strong>不可变</strong>
+    (不修改原数组, 返回新数组), 可以单独 测, 可以单独换实现。
+  </p>
+  <p>
+    <strong>带来的好处</strong>
+    : 步骤可单测 (想测 P0 衰减? 直接调
+    <code>compressor.decay(blocks)</code>
+    , 不用先准备完整 messages); 步骤顺序错位立刻能看出 (调换顺序会出 bug,
+    测试会挂); 步骤是 纯函数, 无副作用, 调试容易。
+  </p>
+  <h3>模式长什么样</h3>
+  <pre
+    class="code-block"
+  ><code>// 教学简化版, agent 主循环的 prepareMessages 管道
+function prepareMessages(input: {
+  history: History;
+  currentLoopIndex: number;
+  todoManager: TodoManager;
+  memoryManager: MemoryManager;
+  hooks: HookRunner;
+}): ChatCompletionMessageParam[] {
+  let blocks = getEntries(input.history);             // 1. 拿 entries
+  blocks = annotate(blocks, input.currentLoopIndex);   // 2. 标 metadata
+  blocks = normalize(blocks);                          // 3. 合并相邻同角色
+  blocks = group(blocks);                              // 4. 分组成 text/tool_use/summary
+  blocks = input.compressor.decay(blocks);             // 5. P0 衰减
+  blocks = input.compressor.compact(blocks);           // 6. P2 总结
+  blocks = input.hooks.run("LLMCall", blocks);         // 7. hook 注入
+  return flatten(blocks);                              // 8. 还原成 messages
+}</code></pre>
+  <p class="source-link">
+    <a
+      href="https://github.com/pingp76/swoopcode/blob/main/src/agent.ts#L1"
+      rel="noreferrer"
+      target="_blank"
+      >GitHub · prepareMessages Pipeline (8 步处理) (L1)</a
+    >
+  </p>
+  <p>关键点:</p>
+  <ol>
+    <li>
+      <strong>每步是纯函数</strong>
+      : 输入 blocks, 输出 blocks, 不修改原数组。 顺序错了换行即可, 不会乱。
+    </li>
+    <li>
+      <strong>步骤固定, 不重排</strong>
+      :
+      <code
+        >getEntries → annotate → normalize → group → decay → compact → hooks →
+        flatten</code
+      >
+      。 任何一步乱序, 要么 metadata 丢, 要么压缩破坏消息块。
+    </li>
+    <li>
+      <strong>可单步测试</strong>
+      : 想测 P0 衰减? 直接调
+      <code>compressor.decay(blocks)</code>
+      , 不用先准备完整 messages。
+    </li>
+  </ol>
+  <h3>仓库里哪些地方用这个模式</h3>
+  <table class="terms">
+    <thead>
+      <tr>
+        <th>管道</th>
+        <th>步骤</th>
+        <th>位置</th>
+      </tr>
+    </thead>
+    <tbody>
+      <tr>
+        <td>agent.prepareMessages()</td>
+        <td>8 步 (上面列的)</td>
+        <td>
+          <a
+            href="https://github.com/pingp76/swoopcode/blob/main/src/agent.ts#L253"
+            rel="noreferrer"
+            target="_blank"
+            ><code>src/agent.ts</code></a
+          >
+        </td>
+      </tr>
+      <tr>
+        <td>tool 消息处理</td>
+        <td>load_skill 特殊处理 → 普通 invoke → 写 tool message</td>
+        <td>
+          <a
+            href="https://github.com/pingp76/swoopcode/blob/main/src/agent.ts#L408"
+            rel="noreferrer"
+            target="_blank"
+            ><code>src/agent.ts</code></a
+          >
+        </td>
+      </tr>
+      <tr>
+        <td>Compressor 三层</td>
+        <td>P0 衰减 → P1 即时 → P2 全量</td>
+        <td>
+          <a
+            href="https://github.com/pingp76/swoopcode/blob/main/src/compressor.ts#L150"
+            rel="noreferrer"
+            target="_blank"
+            ><code>src/compressor.ts</code></a
+          >
+        </td>
+      </tr>
+      <tr>
+        <td>Recovery 决策</td>
+        <td>分类错误 → 选动作 → 状态机校验</td>
+        <td>
+          <a
+            href="https://github.com/pingp76/swoopcode/blob/main/src/recovery.ts#L198"
+            rel="noreferrer"
+            target="_blank"
+            ><code>src/recovery.ts</code></a
+          >
+        </td>
+      </tr>
+      <tr>
+        <td>Hook 链</td>
+        <td>顺序执行, modified 累积, denied 短路</td>
+        <td>
+          <a
+            href="https://github.com/pingp76/swoopcode/blob/main/src/hooks.ts#L163"
+            rel="noreferrer"
+            target="_blank"
+            ><code>src/hooks.ts</code></a
+          >
+        </td>
+      </tr>
+      <tr>
+        <td>Eval runner</td>
+        <td>
+          validate → create workspace → driver → steps → assert → judge → trace
+        </td>
+        <td>
+          <a
+            href="https://github.com/pingp76/swoopcode/blob/main/src/eval/core/runner.ts#L53"
+            rel="noreferrer"
+            target="_blank"
+            ><code>src/eval/core/runner.ts</code></a
+          >
+        </td>
+      </tr>
+    </tbody>
+  </table>
+  <h3>常见陷阱</h3>
+  <dl class="defs">
+    <dt>陷阱 · 步骤间有共享可变状态</dt>
+    <dd>
+      步骤 2 修改了步骤 1 准备的数据 (没返回新数组, 原地修改), 步骤 3
+      看到的是被改过的数据, 调试时以为是步骤 3 的 bug。 正确: 每步
+      <strong>必须返回新数据</strong>
+      , 不可变。
+    </dd>
+    <dt>陷阱 · 步骤之间用 callback 通信</dt>
+    <dd>
+      步骤 2 通过 closure 捕获步骤 1 的内部状态。 步骤顺序调换时, callback
+      引用旧状态, 难调试。 正确: 步骤之间只通过
+      <code>return</code>
+      通信, 不共享状态。
+    </dd>
+  </dl>
+  <h2 id="pattern-7">模式 7 · State Machine (状态机)</h2>
+  <p>
+    <strong>出现频率</strong>
+    : Task 状态 (6 个) / Async Run 状态 (6 个) / Schedule trigger 状态 /
+    Recovery 状态。
+  </p>
+  <p>
+    <strong>解决什么问题</strong>
+    : 任务/Run 有 5+ 个状态, 状态间转换规则 复杂 (e.g. 失败能重试,
+    完成不能再回退); 不显式枚举, 业务代码会 写出"任意状态切换" 的混乱代码;
+    审计时不知道"任务到底处在 什么状态"。
+  </p>
+  <p>
+    <strong>原理</strong>
+    : 经典
+    <strong>有限状态机</strong>
+    (FSM) — 用
+    <code>state enum</code>
+    +
+    <code>transition matrix</code>
+    显式枚举所有合法 状态和合法转换。 任何状态切换必须经过"状态机校验函数",
+    不允许 业务代码直接 set status。
+  </p>
+  <p>
+    <strong>带来的好处</strong>
+    : 状态转换错误立刻被状态机捕获 (e.g. in_progress → pending 抛错);
+    状态机校验集中, 业务模块不能绕过; 审计能信任"任务当前 status" 是合法的。
+  </p>
+  <h3>模式长什么样</h3>
+  <pre class="code-block"><code>// 教学简化版, Task 状态机
+type TaskStatus = "pending" | "in_progress" | "completed" | "failed" | "cancelled" | "deleted";
+
+const ALLOWED_TRANSITIONS: Record&lt;TaskStatus, TaskStatus[]&gt; = {
+  pending:     ["in_progress", "cancelled"],
+  in_progress: ["completed", "failed", "cancelled"],
+  completed:   [],     // 终态
+  failed:      ["pending"],  // 失败可以重试
+  cancelled:   [],     // 终态
+  deleted:     [],     // 终态
+};
+
+function updateTaskStatus(task: Task, newStatus: TaskStatus): void {
+  const allowed = ALLOWED_TRANSITIONS[task.status];
+  if (!allowed.includes(newStatus)) {
+    throw new Error(`Cannot transition from ${task.status} to ${newStatus}`);
+  }
+  // 额外校验: pending → in_progress 必须先满足依赖
+  if (newStatus === "in_progress" &amp;&amp; !dependenciesCompleted(task)) {
+    throw new Error(`Task ${task.id} is blocked by ${task.blockedBy.join(", ")}`);
+  }
+  task.status = newStatus;
+  task.updatedAt = Date.now();
+  if (newStatus === "completed") task.completedAt = Date.now();
+}</code></pre>
+  <p class="source-link">
+    <a
+      href="https://github.com/pingp76/swoopcode/blob/main/src/tasks.ts#L1"
+      rel="noreferrer"
+      target="_blank"
+      >GitHub · createTaskManager 状态机 (6 状态 + 转换矩阵) (L1)</a
+    >
+  </p>
+  <p>关键点:</p>
+  <ol>
+    <li>
+      <strong>转换矩阵显式枚举</strong>
+      : 哪个状态可以转哪个状态, 一张表查 一清二楚。 不允许任意状态切换
+      (那等于没状态机)。
+    </li>
+    <li>
+      <strong>业务校验放在 transition 函数</strong>
+      : 依赖检查 / 权限检查 / 时间窗检查, 都放在
+      <code>updateTaskStatus()</code>
+      内部, 不让业务模块自己检查后直接改 status。
+    </li>
+    <li>
+      <strong>终态不可转出</strong>
+      : completed / cancelled / deleted 是终态, 不允许转出。 避免"撤销归档" /
+      "恢复已删除" 这类破坏审计的操作。
+    </li>
+  </ol>
+  <h3>仓库里哪些地方用这个模式</h3>
+  <table class="terms">
+    <thead>
+      <tr>
+        <th>状态</th>
+        <th>状态数</th>
+        <th>关键转换</th>
+      </tr>
+    </thead>
+    <tbody>
+      <tr>
+        <td>Task</td>
+        <td>6 (pending/in_progress/completed/failed/cancelled/deleted)</td>
+        <td>pending → in_progress (依赖检查) / failed → pending (重试)</td>
+      </tr>
+      <tr>
+        <td>Async Run</td>
+        <td>6 (running/completed/failed/timeout/cancelled/abandoned)</td>
+        <td>running → 任何终态 (finishRun 唯一收敛)</td>
+      </tr>
+      <tr>
+        <td>Schedule</td>
+        <td>3 (active/cancelled/completed)</td>
+        <td>active → cancelled (软取消) / 全部完成 → completed (归档)</td>
+      </tr>
+      <tr>
+        <td>Recovery</td>
+        <td>4 (backoff/compact/continue/fail)</td>
+        <td>由错误类别决定, 上限保护</td>
+      </tr>
+      <tr>
+        <td>Task Group</td>
+        <td>2 (active/archived)</td>
+        <td>所有 task 完成后 → archived</td>
+      </tr>
+    </tbody>
+  </table>
+  <h3>常见陷阱</h3>
+  <dl class="defs">
+    <dt>陷阱 · 业务代码直接修改 status 字段</dt>
+    <dd>
+      <code>task.status = "completed"</code>
+      — 绕过了状态机校验, 依赖图 不会被检查, 审计会信任错误状态。 正确: 业务
+      <strong>必须</strong>
+      调
+      <code>updateTaskStatus(task, "completed")</code>
+      。
+    </dd>
+    <dt>陷阱 · 转换矩阵里有"任意" 通配</dt>
+    <dd>
+      "completed → any" 表示"completed 还能转到任何状态" — 这等于没限制。 正确:
+      终态的允许列表应该是空数组, 不允许转出。
+    </dd>
+  </dl>
+  <h2 id="pattern-8">模式 8 · Strategy (策略模式)</h2>
+  <p>
+    <strong>出现频率</strong>
+    : RuntimePolicy / FoundationModelProfile / Permission 3 模式 / 3 种 trigger
+    / 3 种 compression mode。
+  </p>
+  <p>
+    <strong>解决什么问题</strong>
+    : 同一功能有多种实现 (3 种 permission mode, 3 种 schedule trigger, 3 种
+    compression mode); 业务代码用 if/else 链判断"用户选了哪个 mode" 时, 加新
+    mode 改 10 处; 模式选择规则不集中, 难审计。
+  </p>
+  <p>
+    <strong>原理</strong>
+    : 经典
+    <strong>策略模式</strong>
+    — 把"模式选择" 抽成
+    <strong>静态查表</strong>
+    (mode × 操作 → 决策), 业务代码 只调
+    <code>checkPermission(mode, op)</code>
+    拿结果, 不写 if/else。 新增 mode = 加一行, 不改 check 函数。
+  </p>
+  <p>
+    <strong>带来的好处</strong>
+    : 业务代码 if/else 链消失; 新 mode 改动 局部化 (只改表); 模式规则集中,
+    审计和测试都简单; 黑名单独立 于 mode, 跨策略硬规则不污染 mode 决策。
+  </p>
+  <h3>模式长什么样</h3>
+  <pre class="code-block"><code>// 教学简化版, Permission 3 模式
+type PermissionMode = "plan" | "auto" | "default";
+
+const PERMISSION_DECISION_TABLE: Record&lt;PermissionMode, Record&lt;string, "allow" | "ask" | "deny"&gt;&gt; = {
+  plan: {
+    "run_read": "allow",
+    "run_write": "ask",     // 全部 ask
+    "run_bash": "ask",
+    "rm -rf /": "deny",     // 黑名单永远 deny
+  },
+  auto: {
+    "run_read": "allow",
+    "run_write": "allow",    // auto 不 ask
+    "run_bash": "allow",
+    "rm -rf /": "deny",     // 黑名单仍然 deny
+  },
+  default: {
+    "run_read": "allow",
+    "run_write": "ask",     // 危险 ask
+    "run_bash": "allow",     // 安全 allow
+    "rm -rf /": "deny",
+  },
+};
+
+function checkPermission(mode: PermissionMode, toolName: string, args: any): "allow" | "ask" | "deny" {
+  // 先查黑名单
+  if (isBlacklisted(toolName, args)) return "deny";
+  // 再查模式表
+  return PERMISSION_DECISION_TABLE[mode][toolName] ?? "ask";
+}</code></pre>
+  <p class="source-link">
+    <a
+      href="https://github.com/pingp76/swoopcode/blob/main/src/permission.ts#L1"
+      rel="noreferrer"
+      target="_blank"
+      >GitHub · PermissionManager 策略模式 (3 模式) (L1)</a
+    >
+  </p>
+  <p>关键点:</p>
+  <ol>
+    <li>
+      <strong>策略表 = 静态映射</strong>
+      : 模式 × 操作 → 决策, 查表即可。 不写 if/else 链。
+    </li>
+    <li>
+      <strong>黑名单独立于策略</strong>
+      :
+      <code>rm -rf /</code>
+      在 plan / auto / default 全部 deny, 不管用户选什么模式。 跨策略硬规则。
+    </li>
+    <li>
+      <strong>新增模式</strong>
+      : 加
+      <code>"ci"</code>
+      模式, 只需在表里加一行, 不改 check 函数。
+    </li>
+  </ol>
+  <h3>仓库里哪些地方用这个模式</h3>
+  <table class="terms">
+    <thead>
+      <tr>
+        <th>策略维度</th>
+        <th>策略数</th>
+        <th>位置</th>
+      </tr>
+    </thead>
+    <tbody>
+      <tr>
+        <td>Permission 模式</td>
+        <td>3 (plan/auto/default)</td>
+        <td>
+          <a
+            href="https://github.com/pingp76/swoopcode/blob/main/src/permission.ts#L28"
+            rel="noreferrer"
+            target="_blank"
+            ><code>src/permission.ts</code></a
+          >
+        </td>
+      </tr>
+      <tr>
+        <td>LLM Provider Profile</td>
+        <td>5+ (openai/kimi/minimax/qwen/...)</td>
+        <td>
+          <a
+            href="https://github.com/pingp76/swoopcode/blob/main/src/llm-providers.ts#L47"
+            rel="noreferrer"
+            target="_blank"
+            ><code>src/llm-providers.ts</code></a
+          >
+        </td>
+      </tr>
+      <tr>
+        <td>Foundation Model Profile</td>
+        <td>5+ (deepseek-v4/kimi-k2.6/minimax-m3/...)</td>
+        <td>
+          <a
+            href="https://github.com/pingp76/swoopcode/blob/main/src/foundation-models.ts#L29"
+            rel="noreferrer"
+            target="_blank"
+            ><code>src/foundation-models.ts</code></a
+          >
+        </td>
+      </tr>
+      <tr>
+        <td>Schedule trigger</td>
+        <td>3 (at/after/cron)</td>
+        <td>
+          <a
+            href="https://github.com/pingp76/swoopcode/blob/main/src/schedules.ts#L605"
+            rel="noreferrer"
+            target="_blank"
+            ><code>src/schedules.ts</code></a
+          >
+        </td>
+      </tr>
+      <tr>
+        <td>Compression mode</td>
+        <td>3 (aggressive/balanced/long_context)</td>
+        <td>
+          <a
+            href="https://github.com/pingp76/swoopcode/blob/main/src/compressor.ts#L33"
+            rel="noreferrer"
+            target="_blank"
+            ><code>src/compressor.ts</code></a
+          >
+        </td>
+      </tr>
+    </tbody>
+  </table>
+  <h3>常见陷阱</h3>
+  <dl class="defs">
+    <dt>陷阱 · 策略表里"部分允许"</dt>
+    <dd>
+      "allow_if_args_path_in_workspace" — 把条件逻辑塞进策略表, 表变成"迷你
+      DSL", 难维护。 正确: 条件逻辑留在 check 函数 (e.g.
+      <code>if (isBlacklisted(...)) return "deny"</code>
+      ), 表只 管静态映射。
+    </dd>
+    <dt>陷阱 · 默认值"allow"</dt>
+    <dd>
+      找不到对应 mode × op 的值时, 默认 "ask" 还是 "allow" 还是 "deny"? 默认
+      allow 危险 (新工具被自动允许), 默认 deny 太严 (新工具被永久拒)。 仓库默认
+      "ask", 强制新工具询问。
+    </dd>
+  </dl>
+  <h2 id="pattern-9">模式 9 · Observer (观察者模式) - 改造版</h2>
+  <p>
+    <strong>出现频率</strong>
+    : TranscriptStore 事件流 / CacheDebugger hash 追踪 / Eval trace。
+  </p>
+  <p>
+    <strong>解决什么问题</strong>
+    : 调试时需要"回放整个 session"; 审计 需要"LLM 实际看到什么"; eval 需要"case
+    跑时发生了什么"。 没事件流, 这些都做不到 — 只能看 LLM 最终回复, 信息全丢。
+  </p>
+  <p>
+    <strong>原理</strong>
+    : 经典
+    <strong>观察者模式</strong>
+    的简化版 — 系统 关键操作时 (用户消息 / 助手消息 / 工具调用 / 错误恢复)
+    append 一条事件到
+    <strong>事件流</strong>
+    。 事件流是
+    <strong>append-only</strong>
+    的 (不能改/删), 每个事件有
+    <strong>单调递增序号</strong>
+    , 跟 history sequence 对齐, 调试时按序号回放。 简化版没"订阅者", 只有
+    "存下来, 后续读"。
+  </p>
+  <p>
+    <strong>带来的好处</strong>
+    : 调试时按事件序号回放, 整个 session 一清二楚; transcript 跟 LLM 看到的
+    messages 分离, transcript 保留所有内部 metadata (不被 P0/P1/P2 压缩);
+    审计能信任事件流 (append-only, 不可篡改)。
+  </p>
+  <h3>模式长什么样</h3>
+  <pre class="code-block"><code>// 教学简化版, TranscriptStore
+interface TranscriptEvent {
+  kind: "user_message" | "assistant_message" | "tool_result" | "system_reminder" | "recovery_event" | "history_replaced" | "hook_message";
+  sequence: number;     // append-only 序号
+  timestamp: number;
+  // ...
+}
+
+export function createTranscriptStore(): TranscriptStore {
+  const events: TranscriptEvent[] = [];
+  let nextSequence = 1;
+
+  return {
+    append(event) {
+      events.push({ ...event, sequence: nextSequence++ });
+    },
+    list(): TranscriptEvent[] { return [...events]; },
+    // ...
+  };
+}
+
+// agent 主循环里
+transcript.append({ kind: "user_message", content: query, ... });
+// ...
+transcript.append({ kind: "assistant_message", content: response.content, ... });
+// ...
+transcript.append({ kind: "tool_result", toolName: call.name, result: result.content, ... });</code></pre>
+  <p class="source-link">
+    <a
+      href="https://github.com/pingp76/swoopcode/blob/main/src/transcript.ts#L1"
+      rel="noreferrer"
+      target="_blank"
+      >GitHub · createTranscriptStore Observer 改造版 (L1)</a
+    >
+  </p>
+  <p>关键点:</p>
+  <ol>
+    <li>
+      <strong>append-only</strong>
+      : transcript 只能追加, 不能修改 / 删除 历史事件。 保证审计完整性。
+    </li>
+    <li>
+      <strong>sequence 单调递增</strong>
+      : 给每个事件序号, 反查 / 排序 / 跟 history sequence 对齐都靠它。
+    </li>
+    <li>
+      <strong>观察者 vs Publisher 边界</strong>
+      : TranscriptStore 是发布者, 没人订阅 — 数据存下来, 后续 trace 导出 / debug
+      读取。 这是"观察者模式的简化版", 没订阅者, 只有存盘。
+    </li>
+  </ol>
+  <h3>仓库里哪些地方用这个模式</h3>
+  <table class="terms">
+    <thead>
+      <tr>
+        <th>事件流</th>
+        <th>事件类型数</th>
+        <th>位置</th>
+      </tr>
+    </thead>
+    <tbody>
+      <tr>
+        <td>Transcript</td>
+        <td>
+          7 (user_message / assistant_message / tool_result / system_reminder /
+          recovery_event / history_replaced / hook_message)
+        </td>
+        <td>
+          <a
+            href="https://github.com/pingp76/swoopcode/blob/main/src/transcript.ts#L83"
+            rel="noreferrer"
+            target="_blank"
+            ><code>src/transcript.ts</code></a
+          >
+        </td>
+      </tr>
+      <tr>
+        <td>Eval trace</td>
+        <td>
+          10+ (agent_output / tool_call / llm_call / permission_prompt / log /
+          ...)
+        </td>
+        <td>
+          <a
+            href="https://github.com/pingp76/swoopcode/blob/main/src/eval/core/trace.ts#L62"
+            rel="noreferrer"
+            target="_blank"
+            ><code>src/eval/core/trace.ts</code></a
+          >
+        </td>
+      </tr>
+      <tr>
+        <td>Schedule occurrence</td>
+        <td>
+          9 (due / triggered / running / completed / failed / timeout / missed /
+          skipped / orphaned)
+        </td>
+        <td>
+          <a
+            href="https://github.com/pingp76/swoopcode/blob/main/src/schedules.ts#L605"
+            rel="noreferrer"
+            target="_blank"
+            ><code>src/schedules.ts</code></a
+          >
+        </td>
+      </tr>
+    </tbody>
+  </table>
+  <h3>常见陷阱</h3>
+  <dl class="defs">
+    <dt>陷阱 · 把 transcript 当成 history 用</dt>
+    <dd>
+      业务代码读 transcript 当 history, 拿到的是
+      <strong>审计数据</strong>
+      (含内部 metadata) 不是
+      <strong>LLM 看到的数据</strong>
+      。 正确: transcript 永远只用于调试/审计, 不进 LLM messages。
+    </dd>
+    <dt>陷阱 · transcript 无限增长不轮转</dt>
+    <dd>
+      跑 1 年后 transcript 涨到 1GB+, 最终磁盘写满 (ENOSPC)。 正确: transcript 走日志轮转 (第 15 章),
+      大小阈值 50MB, 历史保留 5 个。
+    </dd>
+  </dl>
+  <h2 id="pattern-10">模式 10 · Atomic Write (原子写)</h2>
+  <p>
+    <strong>出现频率</strong>
+    : 1 个核心工具
+    <code>atomicWriteJsonSync()</code>
+    , 被所有持久化模块复用。
+  </p>
+  <p>
+    <strong>解决什么问题</strong>
+    : 进程突然被杀 (Ctrl+C / OOM / 断电) 时, 如果直接
+    <code>fs.writeFileSync()</code>
+    写到一半就留半截 JSON, 下次启动 JSON.parse 抛 SyntaxError, 整个 Task /
+    Schedule / Memory 系统不可用。
+  </p>
+  <p>
+    <strong>原理</strong>
+    : POSIX 文件系统保证
+    <code>rename()</code>
+    是
+    <strong>原子的</strong>
+    — 要么旧文件保留, 要么新文件替换, 不会 有"两个都损坏" 的中间状态。
+    利用这个性质: 先写临时文件 (
+    <code>tmpPath</code>
+    ), 写完
+    <code>fsync</code>
+    强制刷盘, 然后
+    <code>rename(tmpPath, finalPath)</code>
+    原子替换。
+  </p>
+  <p>
+    <strong>带来的好处</strong>
+    : 断电 / 进程被杀不会留半截 JSON; 所有 持久化模块用同一函数,
+    不会"两处实现漂移"; 写完直接返回 (同步), 不需要异步复杂度。
+  </p>
+  <h3>模式长什么样</h3>
+  <pre class="code-block"><code>// 教学简化版
+import * as fs from "fs";
+import * as path from "path";
+
+export function atomicWriteJsonSync(filePath: string, data: unknown): void {
+  // 1. 写临时文件 (含 pid + timestamp 避免并发写冲突)
+  const tmpPath = `${filePath}.tmp.${process.pid}.${Date.now()}`;
+  const json = JSON.stringify(data, null, 2);
+
+  const fd = fs.openSync(tmpPath, "w");
+  try {
+    fs.writeSync(fd, json);
+    fs.fsyncSync(fd);   // 强制刷盘, 不留 OS 缓存
+  } finally {
+    fs.closeSync(fd);
+  }
+
+  // 2. 原子 rename 替换 (POSIX rename 是原子的)
+  fs.renameSync(tmpPath, filePath);
+}</code></pre>
+  <p class="source-link">
+    <a
+      href="https://github.com/pingp76/swoopcode/blob/main/src/atomic-write.ts#L1"
+      rel="noreferrer"
+      target="_blank"
+      >GitHub · atomicWriteJsonSync 原子写工具 (L1)</a
+    >
+  </p>
+  <p>关键点:</p>
+  <ol>
+    <li>
+      <strong>tmp + rename</strong>
+      : 写到 tmp, rename 替换。 rename 在 POSIX 系统是原子的,
+      要么全成功要么全失败, 不会有"两个都损坏"。
+    </li>
+    <li>
+      <strong>fsync 强制刷盘</strong>
+      : 不调的话数据在 OS 缓存, 断电丢。 fsync 告诉 OS "现在就写到磁盘"。
+    </li>
+    <li>
+      <strong>tmp 路径含 pid + timestamp</strong>
+      : 避免并发写覆盖 (多个进程同时写, 临时文件名不冲突)。
+    </li>
+  </ol>
+  <h3>所有持久化模块都用这个工具</h3>
+  <table class="terms">
+    <thead>
+      <tr>
+        <th>模块</th>
+        <th>用原子写</th>
+      </tr>
+    </thead>
+    <tbody>
+      <tr>
+        <td>TaskStore</td>
+        <td>
+          <code>group.json</code>
+          /
+          <code>index.json</code>
+        </td>
+      </tr>
+      <tr>
+        <td>ScheduleStore</td>
+        <td>
+          <code>schedule.json</code>
+          /
+          <code>occurrence-&lt;id&gt;.json</code>
+        </td>
+      </tr>
+      <tr>
+        <td>MemoryStore</td>
+        <td>每个 memory 一个 JSON</td>
+      </tr>
+      <tr>
+        <td>OutputStore</td>
+        <td>每个 output 一个文本 / JSON</td>
+      </tr>
+      <tr>
+        <td>Logger</td>
+        <td>不适用 (append-only, 不需要原子)</td>
+      </tr>
+    </tbody>
+  </table>
+  <p>
+    所有持久化模块都 import
+    <code>src/atomic-write.ts</code>
+    的同一函数, 不自己实现, 避免两处实现漂移。
+  </p>
+  <h3>常见陷阱</h3>
+  <dl class="defs">
+    <dt>陷阱 · 忘记 fsync</dt>
+    <dd>
+      只写 tmp + rename, 不调 fsync — 数据在 OS 缓存, 断电就丢,
+      原子写等于不原子。 正确: 每次写完必须
+      <code>fsyncSync</code>
+      。
+    </dd>
+    <dt>陷阱 · 多个进程写同一文件</dt>
+    <dd>
+      进程 A 和进程 B 都写同一个固定的临时文件 (例如 foo.json.tmp), 内容互相踩踏, rename 后得到混合或半截的 JSON。 正确:
+      tmp 路径
+      <strong>必须含 pid + timestamp</strong>
+      , 每个写入方用自己的临时文件, rename 时后完成的整体替换先完成的, 文件始终是完整的。
+    </dd>
+  </dl>
+  <h2 id="pattern-11">模式 11 · Reminder 模式 (稳定前缀外的动态状态)</h2>
+  <p>
+    <strong>出现频率</strong>
+    : TODO reminder / Memory reminder / Skill hint / Async Run notification /
+    Task Group progress。
+  </p>
+  <p>
+    <strong>解决什么问题</strong>
+    : 状态 (TODO / memory / skill) 需要 LLM 看到, 但拼到 system prompt 会破坏
+    prompt cache (每轮都变, 缓存 失效); 走 messages 又会被 LLM
+    误以为是用户真实输入。
+  </p>
+  <p>
+    <strong>原理</strong>
+    : 利用 LLM 对
+    <code>&lt;system-reminder&gt;</code>
+    标签的
+    <strong>结构化识别能力</strong>
+    — LLM 知道这是"系统注入 的状态", 不是用户输入。 把状态包在 reminder 里, 走
+    user message 末尾, 既不进 stable prefix (不破坏 cache), 又能被 LLM 正确识别
+    (不会被当成 user 输入)。
+  </p>
+  <p>
+    <strong>带来的好处</strong>
+    : 状态动态变化不破坏 cache (走 messages 增量 价格); LLM 不会混淆 reminder
+    和用户输入 (统一标签); 空 reminder 不注入, 不污染 (空字符串直接跳过)。
+  </p>
+  <h3>模式长什么样</h3>
+  <pre class="code-block"><code>// 教学简化版, 5 种 reminder 都遵循这个格式
+function buildReminder(kind: "todo" | "memory" | "skill" | "async-run" | "task", data: any): string {
+  if (data.empty) return "";   // 空就不注入, 不污染
+  return `&lt;system-reminder source="${kind}"&gt;
+${data.content}
+&lt;/system-reminder&gt;`;
+}
+
+// 在 agent.prepareMessages() 末尾注入
+const reminders = [
+  buildReminder("todo", todoManager.list()),
+  buildReminder("memory", { entries: memoryManager.listPinned(), empty: !memoryManager.listPinned().length }),
+  buildReminder("skill", { content: `可用 skill: ${activeSkills.names().join(", ")}`, empty: !activeSkills.names().length }),
+  buildReminder("async-run", { content: drainNotifications(), empty: !drainNotifications().length }),
+  buildReminder("task", { content: formatTaskGroup(activeTaskGroup), empty: !activeTaskGroup }),
+].filter(r =&gt; r !== "");
+
+// 注入为最后一条 user message (不进 system prompt!)
+messages.push({ role: "user", content: reminders.join("\n\n") });</code></pre>
+  <p class="source-link">
+    <a
+      href="https://github.com/pingp76/swoopcode/blob/main/src/agent.ts#L1"
+      rel="noreferrer"
+      target="_blank"
+      >GitHub · buildReminder 提醒注入 (5 种 source) (L1)</a
+    >
+  </p>
+  <p>关键点:</p>
+  <ol>
+    <li>
+      <strong>走 user message, 不走 system prompt</strong>
+      : 状态是动态的, 不进 stable prefix, 保持 prompt cache 命中。
+    </li>
+    <li>
+      <strong>统一格式 &lt;system-reminder source="..."&gt;</strong>
+      : LLM 知道这是 reminder 不是 user 真实输入, 不会误以为是用户。
+    </li>
+    <li>
+      <strong>空就跳过</strong>
+      : 没数据就不注入, 不污染 history。
+    </li>
+  </ol>
+  <h3>仓库里所有 reminder</h3>
+  <table class="terms">
+    <thead>
+      <tr>
+        <th>Reminder</th>
+        <th>source</th>
+        <th>触发时机</th>
+      </tr>
+    </thead>
+    <tbody>
+      <tr>
+        <td>TODO</td>
+        <td>
+          <code>source="todo"</code>
+        </td>
+        <td>每轮 chat() 前</td>
+      </tr>
+      <tr>
+        <td>Memory</td>
+        <td>
+          <code>source="memory"</code>
+        </td>
+        <td>SessionStart hook</td>
+      </tr>
+      <tr>
+        <td>Skill hint</td>
+        <td>
+          <code>source="skills"</code>
+        </td>
+        <td>LLMCall hook (skill 集合变化)</td>
+      </tr>
+      <tr>
+        <td>Async Run notification</td>
+        <td>
+          <code>source="async-run"</code>
+        </td>
+        <td>finishRun() 时推</td>
+      </tr>
+      <tr>
+        <td>Task Group progress</td>
+        <td>
+          <code>source="task"</code>
+        </td>
+        <td>每轮 chat() 前 (有 active group)</td>
+      </tr>
+      <tr>
+        <td>Permission denial</td>
+        <td>(作为 tool message 写, 不是 reminder)</td>
+        <td>Permission 拒绝时</td>
+      </tr>
+    </tbody>
+  </table>
+  <h3>常见陷阱</h3>
+  <dl class="defs">
+    <dt>陷阱 · reminder 没 source 字段</dt>
+    <dd>
+      LLM 看到 5 个 reminder 分不清各自的来源, 调试时也无法定位。 正确:
+      每个 reminder 必须有
+      <code>source="..."</code>
+      标识 (e.g.
+      <code>source="todo"</code>
+      /
+      <code>source="memory"</code>
+      )。
+    </dd>
+    <dt>陷阱 · reminder 走 system prompt</dt>
+    <dd>
+      把
+      <code>buildReminder()</code>
+      结果拼到 system prompt 字符串 — 每轮 system prompt 变, cache 失效。 正确:
+      reminder 永远 走 messages 末尾。
+    </dd>
+  </dl>
+  <h2 id="pattern-12">模式 12 · Cache-friendly Layout (缓存友好布局)</h2>
+  <p>
+    <strong>出现频率</strong>
+    : 整个项目最重要的隐式架构。 每个写 system prompt / tools / messages
+    的地方都隐式遵守。
+  </p>
+  <p>
+    <strong>解决什么问题</strong>
+    : LLM provider 的 prompt cache (命中部分的输入价格约为原价的 Anthropic 1/10, OpenAI 1/2) 按"前缀匹配"
+    工作, 前缀里任何"小修改" 都会让其后的 cache 全部失效; system prompt 里塞状态 = cache
+    命中率归零, 成本涨 5-10 倍; 用户想"便宜" 跑 harness, cache 是必选项。
+  </p>
+  <p>
+    <strong>原理</strong>
+    : 把 prompt layout 严格分成
+    <strong>稳定前缀</strong>
+    (system prompt + tools, 写后不改) +
+    <strong>动态 tail</strong>
+    (history + reminder, 每轮变)。 稳定前缀进 cache 享受折扣, 动态 tail 不进
+    cache 但只算增量价格。 用
+    <code>cache-debug</code>
+    模块 算本地 stable hash, 验证"我们没破坏稳定前缀"。
+  </p>
+  <p>
+    <strong>带来的好处</strong>
+    : cache 命中率最大化, 成本降 5-10 倍; 业务模块不需要关心 cache,
+    只要遵守"system prompt + tools 稳定" 这条规则; 调试时
+    <code>cache-debug</code>
+    报"stableChanged=true" 立刻知道哪次操作破坏了 cache。
+  </p>
+  <h3>模式长什么样</h3>
+  <pre class="code-block"><code>// 教学简化版, 一次 chat() 的完整 layout
+
+// === 稳定前缀 (进 cache) ===
+const systemPrompt = baseSystemPrompt;   // 写后不改
+const tools = getToolDefinitions();         // skill 集合稳定时不变
+
+// === 动态 tail (不进 cache, 但算增量) ===
+const messages = [
+  // history 中的早期消息 (缓存命中)
+  ...history.getMessages().slice(0, -3),
+  // reminder 注入 (在末尾)
+  ...reminders.map(r =&gt; ({ role: "user", content: r })),
+  // 最近几轮 (新)
+  ...history.getMessages().slice(-3),
+  // 当前 user query
+  { role: "user", content: currentQuery },
+];
+
+// === 调 LLM ===
+const response = await llm.chat({ messages, tools });
+
+// === CacheDebugger 验证稳定前缀 ===
+const debug = cacheDebugger.record({ systemPrompt, tools });
+// debug.stableChanged === false 说明没破坏 cache</code></pre>
+  <p class="source-link">
+    <a
+      href="https://github.com/pingp76/swoopcode/blob/main/src/cache-debug.ts#L1"
+      rel="noreferrer"
+      target="_blank"
+      >GitHub · createCacheDebugger 缓存友好布局 (L1)</a
+    >
+  </p>
+  <p>关键点:</p>
+  <ol>
+    <li>
+      <strong>稳定前缀真的稳定</strong>
+      : system prompt + tools 写后不改, 同 run() 内不变。 验证用 cacheDebugger。
+    </li>
+    <li>
+      <strong>动态 tail 算增量</strong>
+      : history + reminder + query, 每轮都变, 不进 cache 但只算增量价格。
+    </li>
+    <li>
+      <strong>CacheDebugger 本地稳定 ≠ Provider cache 命中</strong>
+      : 本地 hash 只能证明"我们没破坏前缀", 真实命中看 usage 字段
+      <code>cache_read_input_tokens</code>
+      。
+    </li>
+  </ol>
+  <h3>所有"为 cache 埋伏笔" 的设计</h3>
+  <p>
+    这是整个项目最微妙的隐式架构 — 它不是某个文件里的"模式", 而是 每个写 system
+    prompt / tools / messages 的地方都隐式遵守的规则。
+  </p>
+  <table class="terms">
+    <thead>
+      <tr>
+        <th>设计</th>
+        <th>章节</th>
+        <th>为什么</th>
+      </tr>
+    </thead>
+    <tbody>
+      <tr>
+        <td>TODO 状态走 reminder, 不拼 system prompt</td>
+        <td>第 03 章</td>
+        <td>system prompt 写后不改, 不让动态状态污染</td>
+      </tr>
+      <tr>
+        <td>
+          Tool descriptions 走 LLM.chat() 的 tools 字段, 不拼 system prompt
+        </td>
+        <td>第 05 章</td>
+        <td>skill 集合稳定时 tools 字段稳定</td>
+      </tr>
+      <tr>
+        <td>内部 _xxx 字段在 flatten 时清除</td>
+        <td>第 06 章</td>
+        <td>不污染 dynamic tail 的 cache key</td>
+      </tr>
+      <tr>
+        <td>Hook 注入走 user reminder, 不进 system prompt</td>
+        <td>第 08 章</td>
+        <td>reminder 是动态的, 不进 stable prefix</td>
+      </tr>
+      <tr>
+        <td>Memory 走 SessionStart hook 注入 reminder, 不进 system prompt</td>
+        <td>第 09 章</td>
+        <td>memory 是个人的, system prompt 是跨用户一致的</td>
+      </tr>
+      <tr>
+        <td>Task Group progress 走 reminder, 不进 system prompt</td>
+        <td>第 12 章</td>
+        <td>activeTaskGroupId 是 session-local, 不污染跨 session 前缀</td>
+      </tr>
+    </tbody>
+  </table>
+  <h3>常见陷阱</h3>
+  <dl class="defs">
+    <dt>陷阱 · 误以为本地 hash = Provider cache 命中</dt>
+    <dd>
+      <code>cacheDebugger.stableChanged === false</code>
+      只证明"我们没 破坏稳定前缀", 不证明 Provider 真命中 cache。 Provider cache
+      是黑盒, 真实命中看
+      <code>cache_read_input_tokens</code>
+      。 不要 把本地 hash 稳定当成"省钱了", 看 usage 字段才算。
+    </dd>
+    <dt>陷阱 · 工具集合在 run() 内变化</dt>
+    <dd>
+      同一个 run() 内多次调 load_skill, tools 数组变化 — 每次 chat() 都重新算
+      cache key, 命中率 0。 正确: load_skill 异步处理, 加载后下一轮才看到新
+      tools, 同一个 run() 的 tools 数组稳定。
+    </dd>
+  </dl>
+  <h2 id="pattern-13">模式 13 · 错误分类 + 恢复动作 (Error → Action)</h2>
+  <p>
+    <strong>出现频率</strong>
+    : Recovery (7 类错误 → 4 个动作) / Permission (黑名单 deny / ask / allow) /
+    ToolResult 错误 (business error vs throw)。
+  </p>
+  <p>
+    <strong>解决什么问题</strong>
+    : 错误种类多 (network / rate_limit / credential / quota / context_length /
+    output_interrupted / unknown), 全部 retry 浪费 (credential 重试 100
+    次也通不过), 部分 retry 也不行 (context_length 重试还是超限);
+    不同错误要不同处理; 业务代码"看到错误就 retry" 简单但低效, 看到错误就 throw
+    让 loop 崩也很糟。
+  </p>
+  <p>
+    <strong>原理</strong>
+    : 经典
+    <strong>错误分类</strong>
+    思想 — 按错误的
+    <strong>性质</strong>
+    (不是按消息文本) 分类, 每类对应一个
+    <strong>恢复动作</strong>
+    。 Adapter 翻译 provider 特定错误码 → 内部统一
+    <code>LLMErrorKind</code>
+    , 恢复逻辑只看内部类型。 配合
+    <strong>状态上限</strong>
+    防止无限循环。
+  </p>
+  <p>
+    <strong>带来的好处</strong>
+    : 业务代码"catch → 分类 → 选动作" 路径 清晰; retry 不再"一刀切";
+    上限保护防止永远卡住; provider 错误码不污染业务。
+  </p>
+  <h3>模式长什么样</h3>
+  <pre class="code-block"><code>// 教学简化版, Recovery 7 类错误 → 4 个动作
+type LLMErrorKind = "network" | "rate_limit" | "credential" | "quota" | "context_length" | "output_interrupted" | "unknown";
+
+type RecoveryAction =
+  | { kind: "backoff"; delayMs: number }
+  | { kind: "compact" }
+  | { kind: "continue" }
+  | { kind: "fail"; reason: string };
+
+const ERROR_TO_ACTION: Record&lt;LLMErrorKind, () =&gt; RecoveryAction&gt; = {
+  network: () =&gt; ({ kind: "backoff", delayMs: 1000 * 2 ** attemptCount.backoff }),
+  rate_limit: () =&gt; ({ kind: "backoff", delayMs: 1000 * 2 ** attemptCount.rate }),
+  context_length: () =&gt; ({ kind: "compact" }),
+  output_interrupted: () =&gt; ({ kind: "continue" }),
+  credential: () =&gt; ({ kind: "fail", reason: "API key 错误" }),
+  quota: () =&gt; ({ kind: "fail", reason: "配额用尽" }),
+  unknown: () =&gt; ({ kind: "fail", reason: "未知错误" }),
+};
+
+// Adapter 把 provider 错误翻译成 LLMErrorKind
+function translateAnthropicError(err: any): { kind: LLMErrorKind; message: string } {
+  if (err.status === 429) return { kind: "rate_limit", message: err.message };
+  if (err.status === 401) return { kind: "credential", message: err.message };
+  if (err.status === 400 &amp;&amp; err.message.includes("context_length")) {
+    return { kind: "context_length", message: err.message };
+  }
+  return { kind: "unknown", message: err.message };
+}</code></pre>
+  <p class="source-link">
+    <a
+      href="https://github.com/pingp76/swoopcode/blob/main/src/recovery.ts#L1"
+      rel="noreferrer"
+      target="_blank"
+      >GitHub · createRecoveryManager 错误分类 + 4 动作 (L1)</a
+    >
+  </p>
+  <p>关键点:</p>
+  <ol>
+    <li>
+      <strong>错误按性质分类</strong>
+      : 不是按消息 ("429 错误"), 是按 "rate_limit" / "credential" /
+      "context_length"。 跨 provider 一致。
+    </li>
+    <li>
+      <strong>Adapter 翻译 provider 错误码</strong>
+      : 业务模块只看 LLMErrorKind, 不看 Anthropic 429 / OpenAI 429 / Google
+      RESOURCE_EXHAUSTED。
+    </li>
+    <li>
+      <strong>动作有上限</strong>
+      : backoff 5 次 / compact 1 次 / continue 2 次, 达到上限 fail,
+      防止无限循环。
+    </li>
+  </ol>
+  <h3>所有"错误 → 动作" 的应用</h3>
+  <table class="terms">
+    <thead>
+      <tr>
+        <th>错误源</th>
+        <th>分类</th>
+        <th>动作</th>
+      </tr>
+    </thead>
+    <tbody>
+      <tr>
+        <td>LLM 调用</td>
+        <td>7 类 (LLMErrorKind)</td>
+        <td>4 动作 (backoff/compact/continue/fail)</td>
+      </tr>
+      <tr>
+        <td>工具执行</td>
+        <td>业务错误 (error: true) vs throw (harness 错误)</td>
+        <td>前者写 tool message, 后者 throw 上抛</td>
+      </tr>
+      <tr>
+        <td>权限</td>
+        <td>allow / ask / deny</td>
+        <td>直接执行 / 问用户 / 写 denied tool message</td>
+      </tr>
+      <tr>
+        <td>Eval judge 输出</td>
+        <td>解析成功 / 解析失败</td>
+        <td>正常 judge / judge_failed fallback</td>
+      </tr>
+    </tbody>
+  </table>
+  <h3>常见陷阱</h3>
+  <dl class="defs">
+    <dt>陷阱 · 所有错误都 retry 3 次</dt>
+    <dd>
+      credential 错误重试 100 次也通不过, 浪费时间且给 provider 加重负担。 正确:
+      只对 network / rate_limit 做 backoff 重试, context_length 先 compact 再调用, 不可恢复
+      (credential / quota) 直接 fail。
+    </dd>
+    <dt>陷阱 · 按消息文本分类</dt>
+    <dd>
+      "if (err.message.includes('429'))" — OpenAI 错误消息可能 升级改变,
+      字面匹配会挂。 正确: 按 err.status / err.code 结构化字段分类, 不看消息。
+    </dd>
+  </dl>
+  <h2 id="pattern-14">模式 14 · Concurrent Limit (并发限制)</h2>
+  <p>
+    <strong>出现频率</strong>
+    : Async Run (max 3) / LLM retry (max 5) / Schedule overlap (allow/skip) /
+    Task 依赖图 (BFS 限深度)。
+  </p>
+  <p>
+    <strong>解决什么问题</strong>
+    : 资源是有限的 (本地 CPU / 内存 / 文件 句柄 / provider 配额), 无限并发 =
+    资源耗尽 = harness 整体挂; 启动第 N+1 个 run 应当
+    <strong>立刻</strong>
+    拒绝, 让调用方 (LLM) 看到错误后自己决定 (等 / 取消 / 换工具), 而不是默默
+    排队把系统拖垮。
+  </p>
+  <p>
+    <strong>原理</strong>
+    : 经典
+    <strong>信号量</strong>
+    的简化版 — 在 manager 内部维护一个计数器, 启动时检查 + 递增, 完成后递减。
+    配对操作确保不会泄漏计数。 上限是经验值 (本地资源默认 3, CI 可 调到 5)。
+  </p>
+  <p>
+    <strong>带来的好处</strong>
+    : 资源受保护 (不会耗尽); 调用方立刻知道 拒绝原因 (而不是默默排队);
+    配对操作保证计数不泄漏 (启动 + 完成后一对, 缺一就错)。
+  </p>
+  <h3>模式长什么样</h3>
+  <pre class="code-block"><code>// 教学简化版, Async Run 并发限制
+export function createAsyncRunManager({ maxConcurrent = 3 }) {
+  let runningCount = 0;
+
+  return {
+    start(args) {
+      if (runningCount &gt;= maxConcurrent) {
+        return {
+          error: true,
+          content: `async run 并发数已达上限 (${maxConcurrent}), 请等待现有 run 完成。`,
+        };
+      }
+      runningCount++;
+      // 实际启动逻辑
+    },
+    finishRun(runId) {
+      // 收敛 (唯一的 finishRun 路径)
+      runningCount--;
+    },
+  };
+}</code></pre>
+  <p class="source-link">
+    <a
+      href="https://github.com/pingp76/swoopcode/blob/main/src/async-runs.ts#L1"
+      rel="noreferrer"
+      target="_blank"
+      >GitHub · createAsyncRunManager 并发限制 (max 3) (L1)</a
+    >
+  </p>
+  <p>关键点:</p>
+  <ol>
+    <li>
+      <strong>运行前检查 + 启动后递增 + 完成后递减</strong>
+      : 三个动作配对, 缺一就泄漏计数。
+    </li>
+    <li>
+      <strong>超限拒绝, 不排队</strong>
+      : 不做"等前面的跑完再启" 的复杂 逻辑, 直接拒绝, 让 LLM 看到错误后自己决定
+      (等 / 取消 / 换工具)。
+    </li>
+    <li>
+      <strong>限制数字是经验值</strong>
+      : 3 是本地资源默认值, CI 可调到 5。 无限并发 = 资源耗尽, 必坏。
+    </li>
+  </ol>
+  <h3>仓库里所有并发限制</h3>
+  <table class="terms">
+    <thead>
+      <tr>
+        <th>限制</th>
+        <th>上限</th>
+        <th>位置</th>
+      </tr>
+    </thead>
+    <tbody>
+      <tr>
+        <td>Async Run 并发</td>
+        <td>3</td>
+        <td>
+          <a
+            href="https://github.com/pingp76/swoopcode/blob/main/src/async-runs.ts#L299"
+            rel="noreferrer"
+            target="_blank"
+            ><code>src/async-runs.ts</code></a
+          >
+        </td>
+      </tr>
+      <tr>
+        <td>LLM retry</td>
+        <td>5</td>
+        <td>
+          <a
+            href="https://github.com/pingp76/swoopcode/blob/main/src/recovery.ts#L75"
+            rel="noreferrer"
+            target="_blank"
+            ><code>src/recovery.ts</code></a
+          >
+        </td>
+      </tr>
+      <tr>
+        <td>P2 compact</td>
+        <td>1</td>
+        <td>
+          <a
+            href="https://github.com/pingp76/swoopcode/blob/main/src/recovery.ts#L198"
+            rel="noreferrer"
+            target="_blank"
+            ><code>src/recovery.ts</code></a
+          >
+        </td>
+      </tr>
+      <tr>
+        <td>Continue 续写</td>
+        <td>2</td>
+        <td>
+          <a
+            href="https://github.com/pingp76/swoopcode/blob/main/src/recovery.ts#L198"
+            rel="noreferrer"
+            target="_blank"
+            ><code>src/recovery.ts</code></a
+          >
+        </td>
+      </tr>
+      <tr>
+        <td>Subagent maxRounds</td>
+        <td>10 (默认)</td>
+        <td>
+          <a
+            href="https://github.com/pingp76/swoopcode/blob/main/src/tools/subagent.ts#L110"
+            rel="noreferrer"
+            target="_blank"
+            ><code>src/tools/subagent.ts</code></a
+          >
+        </td>
+      </tr>
+      <tr>
+        <td>Tool message 长度</td>
+        <td>2000 字符 (P1 触发阈值)</td>
+        <td>
+          <a
+            href="https://github.com/pingp76/swoopcode/blob/main/src/compressor.ts#L123"
+            rel="noreferrer"
+            target="_blank"
+            ><code>src/compressor.ts</code></a
+          >
+        </td>
+      </tr>
+    </tbody>
+  </table>
+  <h3>常见陷阱</h3>
+  <dl class="defs">
+    <dt>陷阱 · 启动成功但 finishRun 失败, 计数泄漏</dt>
+    <dd>
+      计数递增之前抛异常, 计数还没 ++ — 没事。 递增之后 (启动或运行过程中) 抛异常, 计数已经 ++ —
+      必须有 try/finally 保证 finishRun 被调。 否则跑几次后并发数永远停在上限,
+      整个系统卡死。
+    </dd>
+    <dt>陷阱 · 限制值根据"代码假设" 而不是"实测"</dt>
+    <dd>
+      "max 10 应该够" — 写代码时拍脑袋, 实际跑 20 个 agent 全卡。 正确: 跑 100
+      个真实任务, 统计 95 分位并发数, 取这个数。
+    </dd>
+  </dl>
+  <h2 id="pattern-15">模式 15 · Identity Check (id 与目录名同步)</h2>
+  <p>
+    <strong>出现频率</strong>
+    : TaskStore / ScheduleStore / MemoryStore / OutputStore 都有"目录名 = 数据
+    id" 的身份校验。
+  </p>
+  <p>
+    <strong>解决什么问题</strong>
+    : 用户可能手动改目录名 (重命名 / 移动) 或 改 JSON 文件内的 id 字段;
+    改完后目录名和 id 字段不一致, 读取 时按目录名查到但 id 是另一个, 数据错乱;
+    不校验, 后续操作 (按 id 删除 / 更新) 找不到对象, 静默失败。
+  </p>
+  <p>
+    <strong>原理</strong>
+    : 经典
+    <strong>防御性编程</strong>
+    的"双重保险" — 目录名和内容 id 字段必须一致, 不一致就是数据被外部破坏过。
+    读取 时校验, 不匹配就 skip + warn, 不自动修复 (自动修可能误改用户
+    真实意图)。
+  </p>
+  <p>
+    <strong>带来的好处</strong>
+    : 检测到外部破坏时优雅降级 (skip + warn), 不让一个坏 group 影响其他 group;
+    用户看到 warn 知道"我的数据 被改了"; 不自动修, 避免"猜错用户意图" 的灾难。
+  </p>
+  <h3>模式长什么样</h3>
+  <pre class="code-block"><code>// 教学简化版, TaskStore 的 identity check
+function readGroup(tasksDir: string, groupId: string): TaskGroup | null {
+  const groupFile = path.join(tasksDir, "groups", groupId, "group.json");
+  if (!fs.existsSync(groupFile)) return null;
+  const group = JSON.parse(fs.readFileSync(groupFile, "utf8"));
+  // 关键: 目录名 == group.id, 防止手动改名导致漂移
+  if (group.id !== groupId) {
+    console.warn(`Directory id mismatch: expected ${groupId}, got ${group.id}, skipping`);
+    return null;
+  }
+  return group;
+}</code></pre>
+  <p class="source-link">
+    <a
+      href="https://github.com/pingp76/swoopcode/blob/main/src/tasks.ts#L1"
+      rel="noreferrer"
+      target="_blank"
+      >GitHub · TaskStore 身份校验 (目录名 == group.id) (L1)</a
+    >
+  </p>
+  <p>关键点:</p>
+  <ol>
+    <li>
+      <strong>目录名 = 数据 id</strong>
+      : 双重保险, 防止改 id 不改目录名 (或反之) 导致"找到了文件但内容错乱"。
+    </li>
+    <li>
+      <strong>不匹配就跳过</strong>
+      : 不抛错中断, 跳过这个 group, 继续读其他 group。 一个坏 group 不影响其他。
+    </li>
+    <li>
+      <strong>warn 但不修</strong>
+      : 写日志, 让用户知道"目录被改过", 但不自动 修复 (自动修可能误改用户数据)。
+    </li>
+  </ol>
+  <h3>所有"目录 = id" 的位置</h3>
+  <table class="terms">
+    <thead>
+      <tr>
+        <th>Store</th>
+        <th>目录结构</th>
+      </tr>
+    </thead>
+    <tbody>
+      <tr>
+        <td>TaskStore</td>
+        <td>
+          <code>tasks/groups/&lt;group_id&gt;/group.json</code>
+        </td>
+      </tr>
+      <tr>
+        <td>ScheduleStore</td>
+        <td>
+          <code>schedules/&lt;schedule_id&gt;/schedule.json</code>
+        </td>
+      </tr>
+      <tr>
+        <td>MemoryStore</td>
+        <td>
+          <code>memory/&lt;memory_id&gt;.json</code>
+        </td>
+      </tr>
+      <tr>
+        <td>OutputStore</td>
+        <td>
+          <code>.task_outputs/&lt;output_id&gt;.txt</code>
+        </td>
+      </tr>
+    </tbody>
+  </table>
+  <h3>常见陷阱</h3>
+  <dl class="defs">
+    <dt>陷阱 · 自动修复不匹配</dt>
+    <dd>
+      检测到不匹配就 "fix it" — 改目录名匹配 id, 或改 id 匹配目录名。
+      这猜错了用户意图, 误改用户数据。 正确: warn + skip, 让用户 自己处理。
+    </dd>
+    <dt>陷阱 · 校验过严</dt>
+    <dd>
+      校验 "id 必须是小写字母数字" — 拒绝了合法的 id (e.g. 短 hash 含大小写)。
+      正确: 只校验"目录名 == id 字段", 不校验 id 格式。
+    </dd>
+  </dl>
+  <h2 id="pattern-16">模式 16 · Test Factory (测试工厂)</h2>
+  <p>
+    <strong>出现频率</strong>
+    : 仓库里所有 test 文件都复用同一套 fake factory, 例如 FakeLLMClient、
+    ScriptedLLMClient、ScriptedTerminal、Fake ToolRegistry。
+  </p>
+  <p>
+    <strong>解决什么问题</strong>
+    : 测试不能依赖真实 LLM — CI 通常无法调用 OpenAI API; 测试需要精确控制 LLM
+    返回, 才能断言 messages 顺序等业务行为, 而不是"赌 LLM 这次刚好给正确答案";
+    直接 mock 真实 SDK 又太重, 例如 openai SDK 有 10+ 个方法, mock
+    起来容易漏掉关键调用。
+  </p>
+  <p>
+    <strong>原理</strong>
+    : 沿用 Test Double 的
+    <strong>Stub</strong>
+    +
+    <strong>Spy</strong>
+    思路 — fake 实现与生产代码相同的业务接口 (LLMClient / Terminal /
+    ToolRegistry), 但内部行为完全可控: Stub 按预设 response 队列返回, Spy
+    捕获每次调用时的 messages 和 tools。 生产代码看到的是正常接口,
+    测试代码拿到的是完全可控的行为记录。
+  </p>
+  <p>
+    <strong>带来的好处</strong>
+    : 业务代码零改动即可注入 fake 进行测试; 断言可以精确到"LLM 看到了怎样的
+    messages 顺序"; 不依赖网络和 API key; 运行毫秒级, 不用等待真实 LLM 响应。
+  </p>
+  <h3>模式长什么样</h3>
+  <pre class="code-block"><code>// 教学简化版
+export function createFakeLLMClient(responses: ScriptedLLMResponse[]): LLMClient {
+  let i = 0;
+  const calls: { messages: any[]; tools?: any[] }[] = [];
+  return {
+    async chat({ messages, tools }) {
+      calls.push({ messages, tools });
+      if (i &gt;= responses.length) {
+        throw new Error(`No more fake responses, total calls: ${calls.length}`);
+      }
+      const response = responses[i++];
+      return {
+        content: response.content ?? null,
+        toolCalls: response.toolCalls ?? [],
+        finishReason: response.finishReason ?? "stop",
+      };
+    },
+    // 暴露给测试用
+    allCalls() { return calls; },
+    lastCall() { return calls[calls.length - 1]; },
+  };
+}
+
+export function createScriptedTerminal(plan?: EvalTerminalPlan): Terminal {
+  return {
+    async question(prompt: string) {
+      return plan?.questions?.shift() ?? "";
+    },
+    async askUser(prompt: string) {
+      return plan?.permissionAnswers?.shift() ?? plan?.defaultPermissionAnswer ?? true;
+    },
+    println(text: string) {},
+    close() {},
+  };
+}</code></pre>
+  <p class="source-link">
+    <a
+      href="https://github.com/pingp76/swoopcode/blob/main/src/eval/drivers/learn-claude-code/scripted-llm.ts#L1"
+      rel="noreferrer"
+      target="_blank"
+      >GitHub · createScriptedLLMClient 测试工厂 (L1)</a
+    >
+  </p>
+  <p>关键点:</p>
+  <ol>
+    <li>
+      <strong>实现 LLMClient / Terminal 接口</strong>
+      : fake 实现"刚好满足 接口", 但行为完全可控。
+    </li>
+    <li>
+      <strong>捕获每次 chat() 的 messages 和 tools</strong>
+      : 测试断言 "这次 chat() 收到的 messages 含什么", 不用 mock 整个 SDK。
+    </li>
+    <li>
+      <strong>response 队列</strong>
+      : LLM 返回从队列里取, 队列耗尽后再调用就抛错。 强迫 case 作者显式枚举每次 LLM 返回,
+      不会"漏掉" 一次响应。
+    </li>
+  </ol>
+  <h3>仓库里所有 fake factory</h3>
+  <table class="terms">
+    <thead>
+      <tr>
+        <th>Fake</th>
+        <th>位置</th>
+      </tr>
+    </thead>
+    <tbody>
+      <tr>
+        <td>FakeLLMClient / ScriptedLLMClient</td>
+        <td>
+          <a
+            href="https://github.com/pingp76/swoopcode/blob/main/src/eval/drivers/learn-claude-code/scripted-llm.ts#L32"
+            rel="noreferrer"
+            target="_blank"
+            ><code>src/eval/drivers/learn-claude-code/scripted-llm.ts</code></a
+          >
+        </td>
+      </tr>
+      <tr>
+        <td>ScriptedTerminal</td>
+        <td>
+          <a
+            href="https://github.com/pingp76/swoopcode/blob/main/src/eval/drivers/learn-claude-code/scripted-terminal.ts#L26"
+            rel="noreferrer"
+            target="_blank"
+            ><code
+              >src/eval/drivers/learn-claude-code/scripted-terminal.ts</code
+            ></a
+          >
+        </td>
+      </tr>
+      <tr>
+        <td>Fake ToolRegistry</td>
+        <td>
+          <a
+            href="https://github.com/pingp76/swoopcode/blob/main/src/tools/registry.ts#L107"
+            rel="noreferrer"
+            target="_blank"
+            ><code>src/tools/registry.ts</code></a
+          >
+          的 fake mode
+        </td>
+      </tr>
+      <tr>
+        <td>ReplayLLMClient</td>
+        <td>
+          <a
+            href="https://github.com/pingp76/swoopcode/blob/main/src/eval/replay/replay-llm.ts#L60"
+            rel="noreferrer"
+            target="_blank"
+            ><code>src/eval/replay/replay-llm.ts</code></a
+          >
+        </td>
+      </tr>
+      <tr>
+        <td>LiveEvalLLMClient</td>
+        <td>
+          <a
+            href="https://github.com/pingp76/swoopcode/blob/main/src/eval/live/live-llm.ts#L30"
+            rel="noreferrer"
+            target="_blank"
+            ><code>src/eval/live/live-llm.ts</code></a
+          >
+        </td>
+      </tr>
+      <tr>
+        <td>Fake Hook / Fake Permission</td>
+        <td>测试文件内 inline 写</td>
+      </tr>
+    </tbody>
+  </table>
+  <h3>常见陷阱</h3>
+  <dl class="defs">
+    <dt>陷阱 · fake 行为偏离真实太多</dt>
+    <dd>
+      FakeLLMClient 直接返回 hardcoded "Hello", 不经过"消耗队列" 逻辑 —
+      业务代码一旦改成异步 / 重试, fake 还是返回 hardcoded, 测试还是过, 真实 LLM
+      早已崩。 正确: fake 实现"几乎像真实", 至少要支持 response 队列 + capture
+      calls。
+    </dd>
+    <dt>陷阱 · 测试只覆盖 happy path</dt>
+    <dd>
+      只测"工具返回成功" 的路径, 不测"工具 throw" / "工具返回错误结果"。
+      真实环境错误路径才是 bug 高发区。 正确: 每个 fake 至少支持"成功 + 错误"
+      两种模式。
+    </dd>
+  </dl>
+  <h2 id="pattern-17">模式 17 · Stable Identity (id 不变)</h2>
+  <p>
+    <strong>出现频率</strong>
+    : 仓库所有"长生命周期" 数据 (Task / Memory / Schedule / Async Run / Skill)
+    都有 stable id。
+  </p>
+  <p>
+    <strong>解决什么问题</strong>
+    : 跨 run / 跨 session 引用"之前创建的 那个 task" 时, 如果 id 随内容变化,
+    引用就断了; 长生命周期数据 需要"id 永远不变" 才能被外部持续引用 (e.g.
+    "task_abc12345 blockedBy task_def67890" 永远要有效); UUID 难读 / 易打错, LLM
+    在 tool_call 里引用也容易错。
+  </p>
+  <p>
+    <strong>原理</strong>
+    : 经典
+    <strong>内容寻址</strong>
+    (Content Addressing) 的简化版 — id = hash(原始创建内容), 内容改了 id 不变,
+    改名 / 改 status / 改 owner 都不变。 用短 hash (8 字符) 代替 UUID, LLM
+    易读易引用。 加前缀分类 (
+    <code>task_</code>
+    /
+    <code>tg_</code>
+    ), 一眼看出类别。
+  </p>
+  <p>
+    <strong>带来的好处</strong>
+    : 跨 run 引用永远有效; LLM 在 tool_call 里易读易打; 分类前缀防止混淆;
+    不需要集中 ID 生成器 (内容 哈希本身就是 id)。
+  </p>
+  <h3>模式长什么样</h3>
+  <pre class="code-block"><code>// 教学简化版
+function hashId(content: string): string {
+  // 短 hash: 取 sha1 十六进制摘要的前 8 字符 (不是内容本身的前 8 字符), LLM 引用易读
+  return createHash("sha1").update(content).digest("hex").slice(0, 8);
+}
+
+// 各种 id 命名约定
+const ID_PREFIXES = {
+  task: "task_",
+  taskGroup: "tg_",
+  schedule: "sch_",
+  occurrence: "occ_",
+  asyncRun: "ar_",
+  memory: "mem_",
+  output: "out_",
+};
+
+// id 一旦创建, 不可变
+// 内容改了, id 不变 (hash 来自原始创建内容)
+// 改名 / 改 status / 改 owner, id 都保持
+const taskId = "task_" + hashId("改 config.ts 的 retryLimit");
+// 之后 task.title 改了, task.id 还是 task_abc12345</code></pre>
+  <p class="source-link">
+    <a
+      href="https://github.com/pingp76/swoopcode/blob/main/src/tasks.ts#L1"
+      rel="noreferrer"
+      target="_blank"
+      >GitHub · createTaskManager 稳定 id (短 hash + 前缀) (L1)</a
+    >
+  </p>
+  <p>关键点:</p>
+  <ol>
+    <li>
+      <strong>id 不可变</strong>
+      : 创建后改任何字段 (title / status / owner), id 都不变。 让外部引用 (e.g.
+      "task_abc12345 blockedBy task_def67890") 永远有效。
+    </li>
+    <li>
+      <strong>短 hash</strong>
+      : 8 字符, 不用 UUID。 LLM 在 tool_call 里引用 id, 短 hash 易读, UUID
+      易打错。
+    </li>
+    <li>
+      <strong>前缀分类</strong>
+      :
+      <code>task_</code>
+      /
+      <code>tg_</code>
+      /
+      <code>sch_</code>
+      , 一眼看出 id 属于哪类。 防止混淆 (e.g. 用 task id 调 run_schedule_cancel
+      报错不直观)。
+    </li>
+  </ol>
+  <h3>所有 id 命名前缀</h3>
+  <table class="terms">
+    <thead>
+      <tr>
+        <th>类别</th>
+        <th>前缀</th>
+        <th>位置</th>
+      </tr>
+    </thead>
+    <tbody>
+      <tr>
+        <td>Task</td>
+        <td>
+          <code>task_</code>
+        </td>
+        <td>
+          <a
+            href="https://github.com/pingp76/swoopcode/blob/main/src/tasks.ts#L111"
+            rel="noreferrer"
+            target="_blank"
+            ><code>src/tasks.ts</code></a
+          >
+        </td>
+      </tr>
+      <tr>
+        <td>Task Group</td>
+        <td>
+          <code>tg_</code>
+        </td>
+        <td>
+          <a
+            href="https://github.com/pingp76/swoopcode/blob/main/src/tasks.ts#L111"
+            rel="noreferrer"
+            target="_blank"
+            ><code>src/tasks.ts</code></a
+          >
+        </td>
+      </tr>
+      <tr>
+        <td>Schedule</td>
+        <td>
+          <code>sch_</code>
+        </td>
+        <td>
+          <a
+            href="https://github.com/pingp76/swoopcode/blob/main/src/schedules.ts#L605"
+            rel="noreferrer"
+            target="_blank"
+            ><code>src/schedules.ts</code></a
+          >
+        </td>
+      </tr>
+      <tr>
+        <td>Occurrence</td>
+        <td>
+          <code>occ_</code>
+        </td>
+        <td>
+          <a
+            href="https://github.com/pingp76/swoopcode/blob/main/src/schedules.ts#L605"
+            rel="noreferrer"
+            target="_blank"
+            ><code>src/schedules.ts</code></a
+          >
+        </td>
+      </tr>
+      <tr>
+        <td>Async Run</td>
+        <td>
+          <code>ar_</code>
+        </td>
+        <td>
+          <a
+            href="https://github.com/pingp76/swoopcode/blob/main/src/async-runs.ts#L299"
+            rel="noreferrer"
+            target="_blank"
+            ><code>src/async-runs.ts</code></a
+          >
+        </td>
+      </tr>
+      <tr>
+        <td>Memory</td>
+        <td>
+          <code>mem_</code>
+        </td>
+        <td>
+          <a
+            href="https://github.com/pingp76/swoopcode/blob/main/src/memory.ts#L331"
+            rel="noreferrer"
+            target="_blank"
+            ><code>src/memory.ts</code></a
+          >
+        </td>
+      </tr>
+      <tr>
+        <td>Output</td>
+        <td>
+          <code>out_</code>
+        </td>
+        <td>
+          <a
+            href="https://github.com/pingp76/swoopcode/blob/main/src/output-store.ts#L110"
+            rel="noreferrer"
+            target="_blank"
+            ><code>src/output-store.ts</code></a
+          >
+        </td>
+      </tr>
+    </tbody>
+  </table>
+  <h3>常见陷阱</h3>
+  <dl class="defs">
+    <dt>陷阱 · id 用了 UUID / 计数器</dt>
+    <dd>
+      UUID 太长, LLM 易打错; 计数器需要集中生成器, 跨进程会冲突; hash
+      内容最简单也最稳定。 正确: 短 hash + 前缀。
+    </dd>
+    <dt>陷阱 · id 来自随机数, 不可重现</dt>
+    <dd>
+      "task_abc12345" 是从 id 字段读到的, 但创建时用的是随机数 —
+      同样的内容重新创建得到不同 id, 跨进程引用就断了。 正确: id 必须
+      <strong>确定性</strong>
+      从内容派生, 同样内容永远同样 id。
+    </dd>
+  </dl>
+  <h2 id="pattern-18">模式 18 · Transcript-First (transcript 优先)</h2>
+  <p>
+    <strong>出现频率</strong>
+    : 调试 / eval / audit 都靠 transcript。 几乎所有关键操作都 append transcript
+    event。
+  </p>
+  <p>
+    <strong>解决什么问题</strong>
+    : 调试时需要"回放整个 session"; 审计 需要"LLM 实际看到什么 + 实际做了什么";
+    history 被 P0/P1/P2 压缩后会丢信息, transcript 必须保留全部;
+    "我上一轮到底为什么 调了这个工具" 这种问题, 不靠 transcript 答不上。
+  </p>
+  <p>
+    <strong>原理</strong>
+    : 经典
+    <strong>审计日志</strong>
+    (Audit Log) 思想 — 所有关键操作 (用户消息 / 助手消息 / 工具调用 / 错误恢复)
+    都
+    <strong>append</strong>
+    一条到
+    <strong>事件流</strong>
+    , 不修改 / 不删除。 事件流跟 history 分离, history 是 LLM 看到的, transcript
+    是调试 / 审计看到的。 transcript 保留所有内部 metadata, 不受压缩影响。
+  </p>
+  <p>
+    <strong>带来的好处</strong>
+    : 调试时按事件序号回放, 整个 session 一清二楚; transcript 不被 P0/P1/P2
+    压缩, 长期保留完整记录; 审计能信任 (append-only, 不可篡改); 跟 LLM 看到的
+    messages 分离, 不会 污染 LLM 视角。
+  </p>
+  <h3>模式长什么样</h3>
+  <pre class="code-block"><code>// 教学简化版
+class Agent {
+  async run(query: string) {
+    this.transcript.append({ kind: "user_message", content: query });
+    for (;;) {
+      const response = await this.llm.chat({ ... });
+      this.transcript.append({ kind: "assistant_message", content: response.content, tool_calls: response.toolCalls });
+      if (response.toolCalls.length === 0) return response.content;
+      for (const call of response.toolCalls) {
+        const result = await this.tools.invoke(...);
+        this.transcript.append({ kind: "tool_result", toolName: call.name, result: result.content });
+      }
+    }
+    // 走不到这里: 循环只会在 LLM 不再请求工具时通过上面的 return 退出
+  }
+}</code></pre>
+  <p class="source-link">
+    <a
+      href="https://github.com/pingp76/swoopcode/blob/main/src/transcript.ts#L1"
+      rel="noreferrer"
+      target="_blank"
+      >GitHub · createTranscriptStore transcript-first 模式 (L1)</a
+    >
+  </p>
+  <p>关键点:</p>
+  <ol>
+    <li>
+      <strong>所有关键操作都写 transcript</strong>
+      : 用户消息 / assistant 消息 / 工具调用 / 工具结果 / recovery_event / hook
+      注入 / history_replaced, 全部 append。 调试时回放 transcript, 整个 session
+      一清二楚。
+    </li>
+    <li>
+      <strong>transcript 不影响 prompt</strong>
+      : transcript 是
+      <strong>审计流</strong>
+      , 不是
+      <strong>消息流</strong>
+      。 LLM 看到的 messages 走 history, transcript 只给调试 / 评测用。
+    </li>
+    <li>
+      <strong>append-only</strong>
+      : 跟 LLM 看到的 history 不同, transcript 不会被 P0/P1/P2 压缩。
+      "我上一轮到底为什么调了这个工具" — 即使 history 被压缩, transcript
+      仍有完整记录。
+    </li>
+  </ol>
+  <h3>transcript 包含但 messages 不包含的字段</h3>
+  <table class="terms">
+    <thead>
+      <tr>
+        <th>字段</th>
+        <th>transcript</th>
+        <th>messages</th>
+      </tr>
+    </thead>
+    <tbody>
+      <tr>
+        <td>内部 metadata (_loopIndex, _turnIndex, _messageSequence)</td>
+        <td>保留</td>
+        <td>清除 (flatten 时)</td>
+      </tr>
+      <tr>
+        <td>工具原始 args (含 JSON 字符串原文)</td>
+        <td>保留</td>
+        <td>可见 (但 LLM 不一定关心)</td>
+      </tr>
+      <tr>
+        <td>工具 metadata (耗时、字节数)</td>
+        <td>保留</td>
+        <td>不发送</td>
+      </tr>
+      <tr>
+        <td>Recovery event (backoff / compact / fail)</td>
+        <td>保留</td>
+        <td>不发送</td>
+      </tr>
+      <tr>
+        <td>Permission 决策 (allow / ask / deny)</td>
+        <td>保留</td>
+        <td>不发送 (deny 写 tool message)</td>
+      </tr>
+    </tbody>
+  </table>
+  <h3>常见陷阱</h3>
+  <dl class="defs">
+    <dt>陷阱 · transcript 存到 history 里</dt>
+    <dd>
+      业务代码读 transcript 当 messages 用 — LLM 看到的是
+      <strong>审计数据</strong>
+      (含内部 metadata), 不是它应该看到的。 正确: transcript
+      永远只用于调试/审计, 不进 LLM messages。
+    </dd>
+    <dt>陷阱 · transcript 忘了 append</dt>
+    <dd>
+      某个关键操作没 append 到 transcript, 调试时找不到。 正确: 关键操作
+      (用户消息 / 助手消息 / 工具调用 / 错误恢复) 必须 强制 append, 不留遗漏。
+    </dd>
+  </dl>
+  <h2 id="pattern-19">模式 19 · Test Doubles (测试替身)</h2>
+  <p>
+    <strong>出现频率</strong>
+    : Eval Runner 的核心抽象 — 不依赖真实 组件就能跑 case。
+  </p>
+  <p>
+    <strong>解决什么问题</strong>
+    : Eval 要测"harness 的行为", 不应该 绑死在"具体是哪个 harness 实现"
+    (in-process / CLI / Team 都要能测); 测 runner 本身时, 不需要真 agent (fake
+    driver 即可); 测 driver 时, 不需要真 LLM (ScriptedLLMClient 即可); 测 CLI
+    driver 时, 不需要真终端 (ScriptedTerminal 即可)。
+  </p>
+  <p>
+    <strong>原理</strong>
+    : 经典
+    <strong>测试替身</strong>
+    (Test Double) 思想 — 在系统边界处定义
+    <strong>中立接口</strong>
+    , 实现可以是真实组件 也可以是测试替身。 Eval Runner 只看 driver 接口, 不看
+    driver 实现, 因此可以"换 driver 跑同一 case", 验证不同 driver 行为 一致。
+  </p>
+  <p>
+    <strong>带来的好处</strong>
+    : 同一 case 多种 driver 跑, 验证 driver 实现一致性; 测 runner 逻辑不需要真
+    agent, fake driver 即可; 测 driver 不需要真 LLM / 终端, ScriptedLLMClient /
+    ScriptedTerminal 即可; 跑得快, 不依赖网络。
+  </p>
+  <h3>模式长什么样</h3>
+  <pre
+    class="code-block"
+  ><code>// 教学简化版, Eval Runner 的 4 种 driver 都是 test double
+type CodingAgentDriver = {
+  startCase(ctx): Promise&lt;void&gt;;
+  send(input): Promise&lt;AgentTurnResult&gt;;
+  readEvents?(): Promise&lt;AgentRuntimeEvent[]&gt;;
+  close(): Promise&lt;void&gt;;
+};
+
+// 真实实现: in-process driver
+const inProcessDriver = createInProcessDriver({ llm: fakeLLM, history, tools });
+
+// CLI 黑盒 driver (真正的子进程)
+const cliDriver = createCliDriver({ command: "node", args: ["agent.js"] });
+
+// Team driver (顺序 supervisor 拓扑)
+const teamDriver = createTeamDriver({ members: [...] });
+
+// 测试用: 完全 fake driver
+const fakeDriver = {
+  async startCase() {},
+  async send({ query }) { return { finalOutput: `Echo: ${query}` }; },
+  async readEvents() { return []; },
+  async close() {},
+};</code></pre>
+  <p class="source-link">
+    <a
+      href="https://github.com/pingp76/swoopcode/blob/main/src/eval/core/driver.ts#L1"
+      rel="noreferrer"
+      target="_blank"
+      >GitHub · CodingAgentDriver 测试替身 (4 种 driver) (L1)</a
+    >
+  </p>
+  <p>关键点:</p>
+  <ol>
+    <li>
+      <strong>Driver 接口中立</strong>
+      : 4 个方法, 不依赖具体实现。 Runner 调
+      <code>driver.send()</code>
+      , 不关心是 in-process / CLI / Team。
+    </li>
+    <li>
+      <strong>同一 case 多种 driver</strong>
+      : 同一 EvalCase, 切换 driver 就能在 in-process / CLI / Team 跑。 验证
+      driver 实现是否一致。
+    </li>
+    <li>
+      <strong>fake driver 测 Runner 本身</strong>
+      : 测 runner 逻辑 (workspace / assert / trace) 不需要真 agent, 用 fake
+      driver 即可。
+    </li>
+  </ol>
+  <h3>所有 driver 类型</h3>
+  <table class="terms">
+    <thead>
+      <tr>
+        <th>Driver</th>
+        <th>驱动什么</th>
+        <th>位置</th>
+      </tr>
+    </thead>
+    <tbody>
+      <tr>
+        <td>in-process</td>
+        <td>
+          当前
+          <code>createAgent()</code>
+        </td>
+        <td>
+          <a
+            href="https://github.com/pingp76/swoopcode/blob/main/src/eval/drivers/learn-claude-code/in-process-driver.ts#L67"
+            rel="noreferrer"
+            target="_blank"
+            ><code
+              >src/eval/drivers/learn-claude-code/in-process-driver.ts</code
+            ></a
+          >
+        </td>
+      </tr>
+      <tr>
+        <td>CLI</td>
+        <td>外部子进程</td>
+        <td>
+          <a
+            href="https://github.com/pingp76/swoopcode/blob/main/src/eval/drivers/cli/cli-driver.ts#L44"
+            rel="noreferrer"
+            target="_blank"
+            ><code>src/eval/drivers/cli/cli-driver.ts</code></a
+          >
+        </td>
+      </tr>
+      <tr>
+        <td>Team</td>
+        <td>顺序 supervisor 拓扑</td>
+        <td>
+          <a
+            href="https://github.com/pingp76/swoopcode/blob/main/src/eval/drivers/learn-claude-code/team-driver.ts#L70"
+            rel="noreferrer"
+            target="_blank"
+            ><code>src/eval/drivers/learn-claude-code/team-driver.ts</code></a
+          >
+        </td>
+      </tr>
+      <tr>
+        <td>fake (test)</td>
+        <td>什么都不驱, 纯 stub</td>
+        <td>测试文件内 inline 写</td>
+      </tr>
+    </tbody>
+  </table>
+  <h3>常见陷阱</h3>
+  <dl class="defs">
+    <dt>陷阱 · fake driver 行为过简</dt>
+    <dd>
+      fake driver 直接返回 hardcoded "Done", 测 runner 流程时通过, 但真实 driver
+      行为复杂 (例如异步 + 重试 + 错误恢复), fake 没体现, 真实 driver 跑崩时
+      case 测不到。 正确: fake driver 至少支持 success / error / timeout 3
+      种模式。
+    </dd>
+    <dt>陷阱 · 跑 case 时不切换 driver 验证一致性</dt>
+    <dd>
+      同一 EvalCase 只在 in-process driver 跑过, 没有在 CLI driver
+      上验证行为是否一致。 正确: 关键 case 至少在 2 种 driver 上跑过, 验证 driver
+      实现一致性。
+    </dd>
+  </dl>
+  <h2 id="pattern-20">模式 20 · Prompt Card 模板</h2>
+  <p>
+    <strong>出现频率</strong>
+    : 每章末尾 1 个, 整个教程 18+ 个。
+  </p>
+  <p>
+    <strong>解决什么问题</strong>
+    : 学生 vibe 新功能时, 写"帮我写 X" 出来的 实现跑偏; 写"代码要清晰" 这种空话
+    LLM 不知道什么意思; vibe 出 来的代码没有可验证的边界, 不知道对不对。
+  </p>
+  <p>
+    <strong>原理</strong>
+    : Prompt Card 是
+    <strong>结构化的需求模板</strong>
+    — 把"模糊需求" 转成"6 段结构化描述", 强制每段填具体内容, 每段 可验证。
+    段顺序不能换 (目标 → 场景 → 模块 → 边界 → 验证 → Prompt), 缺一段就翻车。
+  </p>
+  <p>
+    <strong>带来的好处</strong>
+    : 6 段顺序约束强制学生想清楚再 vibe, 不 跳步; 每段可验证 (边界段必须能转成
+    <code>expect()</code>
+    ); vibe 出来的实现直接对照 6 段 review, 缺什么补什么。
+  </p>
+  <h3>模式长什么样</h3>
+  <pre class="code-block"><code>// 教学简化版, 6 段 Prompt Card
+{
+  目标:        "用户问什么, 我们让 LLM 写什么"
+  场景:        "具体用户故事, 一段对话能讲清"
+  模块:        "新增/修改哪些文件, 每个文件单一职责"
+  边界:        "LLM 必须遵守的 checklist (5-7 条)"
+  验证:        "怎么跑 fake LLM 确认实现没坏"
+  Prompt:      "可以直接复制粘贴给 LLM 的整段 prompt"
+}</code></pre>
+  <p class="source-link">
+    <a
+      href="https://github.com/pingp76/swoopcode/blob/main/tutorial/chapters/00-preface.html#L1"
+      rel="noreferrer"
+      target="_blank"
+      >GitHub · 6 段 Prompt Card 模板 (定义在本教程第 00 章) (L1)</a
+    >
+  </p>
+  <p>关键点:</p>
+  <ol>
+    <li>
+      <strong>6 段顺序不能换</strong>
+      : 目标 → 场景 → 模块 → 边界 → 验证 → Prompt。 任何一段缺失, LLM
+      写出来就跑偏。
+    </li>
+    <li>
+      <strong>每段必须可验证</strong>
+      : 边界段不能写空话 ("代码要清晰"), 必须写可断言的约束 ("用 Map 去重, 不用
+      Array.includes")。
+    </li>
+    <li>
+      <strong>Prompt 段是整段可复用</strong>
+      : 把前 5 段拼一起, 加上必要的代码 示例, 就是直接喂给 LLM 的 prompt。
+    </li>
+  </ol>
+  <p>
+    详细模板见第 00 章"6 段 Prompt Card 模板" 和 Reference 中"如何写好 一个 eval
+    case" 章节。
+  </p>
+  <h3>常见陷阱</h3>
+  <dl class="defs">
+    <dt>陷阱 · 边界段写空话</dt>
+    <dd>
+      "代码要清晰" / "跑得通" / "边界要严" — LLM 写出来"看起来对"
+      但实际跑不过测试。 正确: 每条边界必须能转成一条
+      <code>expect()</code>
+      。
+    </dd>
+    <dt>陷阱 · 验证段依赖真实 LLM</dt>
+    <dd>
+      "跑 npm run test, 看是不是绿的" — 真实 LLM 行为不可预测, 测试 flaky。
+      正确: 用 fake LLM + scripted response 验证, 跑 1000 次结果都一样。
+    </dd>
+  </dl>
+  <h2 id="pattern-21">模式 21 · No Catch Throw (业务错误不 throw)</h2>
+  <p>
+    <strong>出现频率</strong>
+    : 工具执行 / 权限拒绝 / 子 agent 失败 / judge 输出 解析失败, 全部用
+    <code>error: true</code>
+    或专属 status 返回, 不 throw。
+  </p>
+  <p>
+    <strong>解决什么问题</strong>
+    : throw 让整个 loop 崩, messages 序列断裂 (assistant 调了 N 个工具, 但 N 个
+    tool message 都没写, LLM 看到 错位); LLM 看不到任何错误信息, 不能继续推理;
+    用户看到"500 错误" 一头雾水, 不知道"为什么失败、怎么修"。
+  </p>
+  <p>
+    <strong>原理</strong>
+    : 业务错误是"预期的失败", 不是"代码 bug"。 业务 错误应该作为"正常返回"
+    出现在 LLM 视野里 (通过 tool message), 让 LLM 继续推理 ("啊, 文件不存在,
+    我换路径"); 只有真正的 harness bug (配置错误 / 内存溢出 / 内部 invariant
+    违反) 才用 throw 上抛 让 recovery 处理。
+  </p>
+  <p>
+    <strong>带来的好处</strong>
+    : 业务错误让 LLM 看到, 继续推理; loop 不崩, messages 序列完整;
+    用户看到清晰的错误信息; recovery 只处理 真正的 harness 错误,
+    不被业务错误打扰。
+  </p>
+  <h3>模式长什么样</h3>
+  <pre class="code-block"><code>// 教学简化版, 4 种 "失败" 的处理
+
+// 1. 工具业务错误
+async function runRead(args) {
+  try {
+    const content = await fs.readFile(args.path, "utf8");
+    return { content };
+  } catch (e) {
+    return { content: `Error: ${e.message}`, error: true };  // 不 throw
+  }
+}
+
+// 2. 权限拒绝
+if (!permissionManager.check(...).allow) {
+  history.add({ role: "tool", tool_call_id: call.id,
+    content: "Permission denied by user." });  // 不 throw
+  continue;   // 跳过 invoke
+}
+
+// 3. 子 agent 失败
+async function runSubagent(args) {
+  try {
+    return { content: await childAgent.run(args.task) };
+  } catch (e) {
+    return { content: `子任务失败: ${e.message}`, error: true };  // 不 throw
+  }
+}
+
+// 4. judge 输出解析失败
+function parseJudgeOutput(raw: string): EvalJudgeResult {
+  try { return JSON.parse(raw); } catch {}
+  // 4 层降级都失败, 返回 judge_failed fallback
+  return { enabled: true, passed: false, score: 0,
+    summary: "judge_failed", problems: [...], ... };
+}</code></pre>
+  <p class="source-link">
+    <a
+      href="https://github.com/pingp76/swoopcode/blob/main/src/permission.ts#L1"
+      rel="noreferrer"
+      target="_blank"
+      >GitHub · PermissionManager 业务错误不 throw (deny 写 tool message)
+      (L1)</a
+    >
+  </p>
+  <p>关键点:</p>
+  <ol>
+    <li>
+      <strong>业务错误用 error: true 表达</strong>
+      : LLM 收到后能继续推理 ("啊, 文件不存在, 我换路径")。 不 throw 让 loop
+      崩。
+    </li>
+    <li>
+      <strong>throw 只用于真正的 harness 错误</strong>
+      : 配置错误 / 内存 溢出 / 内部 invariant 违反, 这种 throw 是合理的。
+      业务错误绝不能 throw。
+    </li>
+    <li>
+      <strong>fallback 不让系统挂</strong>
+      : judge 解析失败 fallback, async run 超时 fallback, 权限拒绝 fallback,
+      所有失败都有 "降级路径" 而不是"死路"。
+    </li>
+  </ol>
+  <h3>常见陷阱</h3>
+  <dl class="defs">
+    <dt>陷阱 · 把所有 throw 都 catch 住</dt>
+    <dd>
+      try/catch 包一切 throw 当 fallback — 把"代码 bug" 和"业务 错误" 混为一谈,
+      harness 错误也被吞掉, 排查难。 正确: 业务 错误显式返回 error: true,
+      harness 错误让它 throw 上抛。
+    </dd>
+    <dt>陷阱 · fallback 信息太简</dt>
+    <dd>
+      fallback 只返回 "Error" — LLM 看到"Error" 不知道下一步 怎么办。 正确:
+      返回具体错误 + 建议下一步 (e.g. "Permission denied by user.
+      请重新决定或换其他方案。")。
+    </dd>
+  </dl>
+  <h2 id="pattern-22">模式 22 · Idempotent Operations (幂等操作)</h2>
+  <p>
+    <strong>出现频率</strong>
+    : load_skill 多次激活 / Async Run 重试 / Schedule 重新触发 / Memory create
+    (允许重复 id, 覆盖更新)。
+  </p>
+  <p>
+    <strong>解决什么问题</strong>
+    : 网络抖动导致 LLM 重试时, 同一个 load_skill 被调 2 次, 应该不重复激活;
+    用户误操作"按 2 次 同一按钮" 应该不创建 2 个 run; Schedule tick 重新触发时,
+    不应 该创建重复 occurrence; Memory 用相同 id 创建时, 应该是"更新"
+    而非"创建"。
+  </p>
+  <p>
+    <strong>原理</strong>
+    : 经典
+    <strong>幂等性</strong>
+    (Idempotency) — 操作 重复 N 次的效果 = 操作 1 次的效果。 用"按 id 去重"
+    实现: 重复 id 时静默接受 (返回已有对象) 而非报错, 或者用 Set 存储已激活 名字
+    (重复激活时啥都不做)。
+  </p>
+  <p>
+    <strong>带来的好处</strong>
+    : 重试 / 误操作不会产生副作用; transcript 里出现 2 次同操作, 但 effect 只有
+    1 次 (审计友好); 业务代码 不用"if 第一次" 这种状态判断, 始终假设"幂等执行"。
+  </p>
+  <h3>模式长什么样</h3>
+  <pre class="code-block"><code>// 教学简化版, 4 个幂等操作
+
+// 1. load_skill 多次激活 — Map 去重
+function activate(name: string) {
+  if (toolIndex.has(name)) return [];  // 重复激活, 啥都不做
+  // ... 真正激活逻辑
+}
+
+// 2. Async Run 重新触发相同 runId — 拒绝重复
+function start(runId: string, args) {
+  if (runs.has(runId)) return { error: true, content: "runId 重复" };
+  runs.set(runId, { ... });
+}
+
+// 3. Schedule 重新触发 — 检查 occurrence 是否已存在
+function tick() {
+  for (const schedule of activeSchedules) {
+    const occ = findOccurrenceFor(schedule.id, thisTime);
+    if (occ) continue;   // 已触发, 跳过
+    trigger(schedule);
+  }
+}
+
+// 4. Memory create 重复 id — 覆盖更新
+function create(content, tags) {
+  const id = hashId(content);
+  if (memoryStore.has(id)) {
+    memoryStore.update(id, { content, tags, updatedAt: Date.now() });  // 覆盖
+    return { id, content: "updated" };
+  }
+  // ... 新建
+}</code></pre>
+  <p class="source-link">
+    <a
+      href="https://github.com/pingp76/swoopcode/blob/main/src/async-runs.ts#L1"
+      rel="noreferrer"
+      target="_blank"
+      >GitHub · createAsyncRunManager 幂等操作 (重复 runId 拒绝) (L1)</a
+    >
+  </p>
+  <p>关键点:</p>
+  <ol>
+    <li>
+      <strong>重复操作不产生副作用</strong>
+      : 调两次 load_skill("react") 和调一次, 效果一样。 不写 error, 不写
+      warning, 安静地幂等。
+    </li>
+    <li>
+      <strong>幂等 ≠ 拒绝</strong>
+      : 优先 "重复 id 静默接受", 而不是 "重复 id 就 throw"。 确实要拒绝时 (如上面 Async Run 的重复 runId), 用 error: true 返回,
+      让 LLM 看到原因后继续推理, 而不是让 loop 崩。
+    </li>
+    <li>
+      <strong>幂等是 audit-friendly</strong>
+      : transcript 里出现 2 次 load_skill("react"), 但实际只激活 1 次。
+      审计时不会误判"调了 2 次" — 看 effect 而非 call。
+    </li>
+  </ol>
+  <h3>常见陷阱</h3>
+  <dl class="defs">
+    <dt>陷阱 · 把幂等做成"重复报错"</dt>
+    <dd>
+      "runId 已存在" 当 error 返回 — 实际是同一个 run, 不该报错。 正确: 重复
+      runId 时静默返回已有对象, 或者拒绝但用
+      <code>error: true</code>
+      而不是 throw。
+    </dd>
+    <dt>陷阱 · 幂等判断写在业务层</dt>
+    <dd>
+      业务层每次调之前都 "if (already exists)" — 重复逻辑散落各 处。 正确:
+      幂等判断写在 manager/store 内部, 业务层无脑调 即可。
+    </dd>
+  </dl>
+  <h2 id="anti-patterns">反模式速查 (在仓库里绝对不出现)</h2>
+  <p>看到这些写法, 立刻知道"是错的"。 仓库 100% 避免。</p>
+  <table class="terms">
+    <thead>
+      <tr>
+        <th>反模式</th>
+        <th>为什么错</th>
+        <th>正确做法</th>
+      </tr>
+    </thead>
+    <tbody>
+      <tr>
+        <td>
+          <code>let history = []</code>
+          module-level
+        </td>
+        <td>父子 agent 共享, 跨上下文污染</td>
+        <td>
+          <code>createHistory()</code>
+          闭包内
+        </td>
+      </tr>
+      <tr>
+        <td>
+          <code>agent.ts</code>
+          直接
+          <code>import OpenAI</code>
+        </td>
+        <td>测试难, 换模型重写</td>
+        <td>
+          只 import
+          <code>LLMClient</code>
+          接口
+        </td>
+      </tr>
+      <tr>
+        <td>
+          <code>if (provider === "anthropic") { ... }</code>
+          写在业务代码
+        </td>
+        <td>业务代码被方言污染</td>
+        <td>Adapter 层翻译, 业务看 LLMClient</td>
+      </tr>
+      <tr>
+        <td>
+          <code>systemPrompt + state</code>
+          拼字符串
+        </td>
+        <td>破坏 prompt cache, 成本涨 5-10 倍</td>
+        <td>state 走 reminder, system prompt 只放规则</td>
+      </tr>
+      <tr>
+        <td>
+          <code>_loopIndex</code>
+          等内部字段发给 LLM
+        </td>
+        <td>污染 messages, 干扰 LLM</td>
+        <td>flatten 时显式枚举, 内部字段清除</td>
+      </tr>
+      <tr>
+        <td>
+          <code>fs.writeFileSync(path, json)</code>
+          直接写持久化文件
+        </td>
+        <td>断电留半截, JSON 损坏</td>
+        <td>
+          <code>atomicWriteJsonSync()</code>
+          写
+        </td>
+      </tr>
+      <tr>
+        <td>
+          <code>throw new Error("Permission denied")</code>
+          权限拒绝时
+        </td>
+        <td>整个 loop 崩, messages 断裂</td>
+        <td>写 "Permission denied" tool message, LLM 自己决定</td>
+      </tr>
+      <tr>
+        <td>
+          <code>if (isTest) { /* mock 逻辑 */ }</code>
+          写在 agent.ts
+        </td>
+        <td>业务代码绑死测试, 难维护</td>
+        <td>依赖注入 fake, agent.ts 不变</td>
+      </tr>
+      <tr>
+        <td>
+          用
+          <code>Date</code>
+          对象 / ISO 字符串存时间
+        </td>
+        <td>JSON 序列化类型错乱, 比较 bug</td>
+        <td>
+          统一用
+          <code>number</code>
+          (ms), 边界处显式转换
+        </td>
+      </tr>
+      <tr>
+        <td>
+          LLM 调用
+          <code>throw</code>
+          错误状态
+        </td>
+        <td>messages 序列断裂, LLM 困惑</td>
+        <td>错误写 tool message, 让 LLM 继续推理</td>
+      </tr>
+      <tr>
+        <td>
+          <code>let activeSkills = []</code>
+          module-level
+        </td>
+        <td>父子 agent 共享, skill 状态污染</td>
+        <td>
+          <code>createActiveSkillSet()</code>
+          闭包内
+        </td>
+      </tr>
+      <tr>
+        <td>
+          子 agent 也调
+          <code>run_subagent</code>
+          (递归)
+        </td>
+        <td>费用指数上升, 调试噩梦</td>
+        <td>
+          过滤工具时强制移除
+          <code>run_subagent</code>
+        </td>
+      </tr>
+    </tbody>
+  </table>
+  <h2 id="where-to-go-next">学完这一页之后</h2>
+  <p>恭喜, 你已经掌握整套项目的设计模式词典。 接下来:</p>
+  <ol>
+    <li>
+      <strong>读代码</strong>
+      : 打开
+      <code>src/agent.ts</code>
+      /
+      <code>src/todo.ts</code>
+      /
+      <code>src/memory.ts</code>
+      , 用 22 个模式识别它们的骨架。 同一个项目, 同一套模式反复出现 —
+      你已经能"读懂任何文件了"。
+    </li>
+    <li>
+      <strong>自己写新模块</strong>
+      : 用 Composition Root 唯一组装 + 工厂 + 闭包 + 依赖注入 + 提醒模式 +
+      缓存友好布局这 5 条, 写一个 100 行的 manager, 跟仓库其他模块无缝衔接。
+    </li>
+    <li>
+      <strong>vibe 出新功能</strong>
+      : 用 6 段 Prompt Card 模板, 给大模型写 "我要在 harness 加 Web UI",
+      让它按这套模式生成代码。
+    </li>
+  </ol>
+  <p>
+    教学项目的最高产出不是"理解", 是"自己 vibe 出新功能"。 这 22 个模式是你 vibe
+    时的"骨架词汇表"。
+  </p>
+  <nav aria-label="章节翻页" class="article-pager">
+    <a class="article-pager__item" href="?chapter=eval"
+      ><span class="article-pager__label">上一章</span>
+      <span class="article-pager__title"
+        >B · 如何测试一个不确定的 Agent</span
+      ></a
+    >
+    <span class="article-pager__item article-pager__item--disabled"
+      ><span class="article-pager__label">下一章</span>
+      <span class="article-pager__title">已经是最后一章</span></span
+    >
+  </nav>
+</article>