diff --git a/.claude/settings.json b/.claude/settings.json index ed78662..d77278a 100644 --- a/.claude/settings.json +++ b/.claude/settings.json @@ -96,41 +96,57 @@ ], "PreToolUse": [ { + "matcher": "Bash|Write|Edit", "hooks": [ { "type": "command", "command": "python3 \"$HOME/.claude/hooks/pretool-unified-gate.py\"", "description": "Unified gate: gitignore-bypass, git-submission, dangerous-command, creation-gate, sensitive-file (ADR-068)", "timeout": 3000 - }, + } + ] + }, + { + "matcher": "Bash", + "hooks": [ { "type": "command", - "command": "python3 \"$HOME/.claude/hooks/pretool-synthesis-gate.py\"", - "description": "Consultation synthesis gate: blocks implementation when ADR consultation is incomplete", + "command": "python3 \"$HOME/.claude/hooks/pretool-branch-safety.py\"", + "description": "Branch safety: blocks git commit on main/master, forces feature branches", "timeout": 3000 }, { "type": "command", - "command": "python3 \"$HOME/.claude/hooks/pretool-branch-safety.py\"", - "description": "Branch safety: blocks git commit on main/master, forces feature branches", + "command": "python3 \"$HOME/.claude/hooks/ci-merge-gate.py\"", + "description": "Gate: block merge to main/master when CI checks are red", "timeout": 3000 - }, + } + ] + }, + { + "matcher": "Bash|Edit", + "hooks": [ { "type": "command", - "command": "python3 \"$HOME/.claude/hooks/pretool-plan-gate.py\"", - "description": "Plan gate: blocks implementation code without task_plan.md", + "command": "python3 \"$HOME/.claude/hooks/pretool-learning-injector.py\"", + "description": "Inject known error patterns before Bash/Edit tools run", "timeout": 3000 - }, + } + ] + }, + { + "matcher": "Write|Edit", + "hooks": [ { "type": "command", - "command": "python3 \"$HOME/.claude/hooks/pretool-adr-creation-gate.py\"", - "description": "ADR creation gate: blocks new components without an ADR in adr/", + "command": "python3 \"$HOME/.claude/hooks/pretool-synthesis-gate.py\"", + "description": "Consultation 
synthesis gate: blocks implementation when ADR consultation is incomplete", "timeout": 3000 }, { "type": "command", - "command": "python3 \"$HOME/.claude/hooks/pretool-learning-injector.py\"", - "description": "Inject known error patterns before Bash/Edit tools run", + "command": "python3 \"$HOME/.claude/hooks/pretool-plan-gate.py\"", + "description": "Plan gate: blocks implementation code without task_plan.md", "timeout": 3000 }, { @@ -138,39 +154,51 @@ "command": "python3 \"$HOME/.claude/hooks/pretool-prompt-injection-scanner.py\"", "description": "Advisory scan for prompt injection patterns in agent context files (ADR-070)", "timeout": 3000 - }, + } + ] + }, + { + "matcher": "Write", + "hooks": [ { "type": "command", - "command": "python3 \"$HOME/.claude/hooks/pretool-subagent-warmstart.py\"", - "description": "Inject parent session context into subagent prompts (ADR-088)", - "timeout": 5000 - }, + "command": "python3 \"$HOME/.claude/hooks/pretool-adr-creation-gate.py\"", + "description": "ADR creation gate: blocks new components without an ADR in adr/", + "timeout": 3000 + } + ] + }, + { + "matcher": "Edit", + "hooks": [ { "type": "command", "command": "python3 \"$HOME/.claude/hooks/pretool-file-backup.py\"", "description": "Backup files before Edit tool modifies them", "timeout": 3000 - }, + } + ] + }, + { + "matcher": "Agent", + "hooks": [ { "type": "command", - "command": "python3 \"$HOME/.claude/hooks/ci-merge-gate.py\"", - "description": "Gate: block merge to main/master when CI checks are red", - "timeout": 3000 + "command": "python3 \"$HOME/.claude/hooks/pretool-subagent-warmstart.py\"", + "description": "Inject parent session context into subagent prompts (ADR-088)", + "timeout": 5000 } ] } ], "PostToolUse": [ { + "matcher": "Write|Edit", "hooks": [ { "type": "command", - "command": "python3 \"$HOME/.claude/hooks/post-tool-lint-hint.py\"" - }, - { - "type": "command", - "command": "python3 \"$HOME/.claude/hooks/error-learner.py\"", - "description": 
"Learn from tool errors and suggest solutions" + "command": "python3 \"$HOME/.claude/hooks/post-tool-lint-hint.py\"", + "description": "Gentle lint reminder after file modifications" }, { "type": "command", @@ -185,48 +213,82 @@ }, { "type": "command", - "command": "python3 \"$HOME/.claude/hooks/routing-gap-recorder.py\"", - "description": "Record /do routing gaps to learning DB for pattern tracking", - "timeout": 2000 - }, + "command": "python3 \"$HOME/.claude/hooks/posttool-security-scan.py\"", + "description": "Advisory scan for credentials and SQL injection in Write/Edit output", + "timeout": 3000 + } + ] + }, + { + "matcher": "Bash", + "hooks": [ { "type": "command", "command": "python3 \"$HOME/.claude/hooks/retro-graduation-gate.py\"", "description": "Warn about ungraduated retro entries when creating PRs in toolkit repo", "timeout": 3000 - }, + } + ] + }, + { + "matcher": "Edit|Write|Bash", + "hooks": [ { "type": "command", "command": "python3 \"$HOME/.claude/hooks/record-activation.py\"", "description": "Record session activation stats for ROI tracking (ADR-032)" - }, - { - "type": "command", - "command": "python3 \"$HOME/.claude/hooks/record-waste.py\"", - "description": "Record wasted tokens from tool failures for ROI tracking (ADR-032)" - }, + } + ] + }, + { + "matcher": "Read", + "hooks": [ { "type": "command", "command": "python3 \"$HOME/.claude/hooks/posttool-session-reads.py\"", "description": "Track files read this session for subagent warmstart (ADR-088)" - }, - { - "type": "command", - "command": "python3 \"$HOME/.claude/hooks/posttool-security-scan.py\"", - "description": "Advisory scan for credentials and SQL injection in Write/Edit output", - "timeout": 3000 - }, + } + ] + }, + { + "matcher": "Skill|Agent", + "hooks": [ { "type": "command", "command": "python3 \"$HOME/.claude/hooks/usage-tracker.py\"", "description": "Record Skill and Agent invocation analytics", "timeout": 3000 - }, + } + ] + }, + { + "matcher": "Agent", + "hooks": [ { "type": 
"command", "command": "python3 \"$HOME/.claude/hooks/review-capture.py\"", "description": "Capture CRITICAL/HIGH review findings to learning DB", "timeout": 3000 + } + ] + }, + { + "hooks": [ + { + "type": "command", + "command": "python3 \"$HOME/.claude/hooks/error-learner.py\"", + "description": "Learn from tool errors and suggest solutions" + }, + { + "type": "command", + "command": "python3 \"$HOME/.claude/hooks/routing-gap-recorder.py\"", + "description": "Record /do routing gaps to learning DB for pattern tracking", + "timeout": 2000 + }, + { + "type": "command", + "command": "python3 \"$HOME/.claude/hooks/record-waste.py\"", + "description": "Record wasted tokens from tool failures for ROI tracking (ADR-032)" }, { "type": "command", diff --git a/.gitignore b/.gitignore index badb744..8cbc4c5 100644 --- a/.gitignore +++ b/.gitignore @@ -94,3 +94,11 @@ draft-*.md # Scratch notes (session working files, not committed) scratch/ + +# Eval workspaces (A/B/C test outputs, generated code, grading artifacts) +# These are ephemeral experiment data — not committed +*-workspace/ +evals/ + +# Feature state (ephemeral, per-session feature lifecycle) +.feature/ diff --git a/agents/INDEX.json b/agents/INDEX.json index b2d730a..7b59053 100644 --- a/agents/INDEX.json +++ b/agents/INDEX.json @@ -4,7 +4,7 @@ "agents": { "agent-creator-engineer": { "file": "agent-creator-engineer.md", - "short_description": "**DEPRECATED**: Use skill-creator-engineer agent instead", + "short_description": "**DEPRECATED**: Use skill-creator skill instead", "triggers": [ "create agent", "new agent", @@ -14,7 +14,7 @@ "legacy agent creation" ], "pairs_with": [ - "skill-creator-engineer", + "skill-creator", "agent-evaluation" ], "complexity": "Simple", @@ -1066,26 +1066,6 @@ "complexity": "Simple", "category": "meta" }, - "skill-creator-engineer": { - "file": "skill-creator-engineer.md", - "short_description": "Use this agent when creating new Claude Code skills, designing workflow automation,\nor 
improving existing skill architecture", - "triggers": [ - "create skill", - "new skill", - "skill template", - "skill design", - "workflow automation", - "skill improvement", - "refactor skill" - ], - "pairs_with": [ - "agent-evaluation", - "verification-before-completion", - "workflow-orchestrator" - ], - "complexity": "Medium-Complex", - "category": "meta" - }, "sqlite-peewee-engineer": { "file": "sqlite-peewee-engineer.md", "short_description": "Use this agent when you need expert assistance with SQLite database development using the Peewee ORM in Python", diff --git a/agents/README.md b/agents/README.md index 2237adb..3cd6dac 100644 --- a/agents/README.md +++ b/agents/README.md @@ -96,12 +96,12 @@ Each agent is defined in `agents/*.md` with YAML frontmatter specifying model, v | Agent | Description | |-------|-------------| -| `skill-creator-engineer` | Create Claude Code skills: progressive disclosure, SKILL.md structure, complexity tier selection | +| `skill-creator` | Create Claude Code skills: progressive disclosure, SKILL.md structure, complexity tier selection | | `hook-development-engineer` | Python hooks: PostToolUse/PreToolUse/SessionStart handlers, sub-50ms performance, learning DB | | `pipeline-orchestrator-engineer` | Build pipelines: multi-component scaffolding, fan-out/fan-in patterns, routing integration | | `system-upgrade-engineer` | Ecosystem upgrades: 6-phase pipeline for adapting to Claude Code releases or goal shifts | | `toolkit-governance-engineer` | Toolkit internal architecture: SKILL.md edits, routing tables, ADR lifecycle, INDEX.json, hook compliance | -| `agent-creator-engineer` | **DEPRECATED** — use `skill-creator-engineer` instead | +| `agent-creator-engineer` | **DEPRECATED** — use `skill-creator` instead | --- diff --git a/agents/README.txt b/agents/README.txt deleted file mode 100644 index 1deba18..0000000 --- a/agents/README.txt +++ /dev/null @@ -1,249 +0,0 @@ -# Agents - -Specialized domain experts that Claude Code can spawn 
for complex tasks requiring deep knowledge. - ---- - -## What are Agents? - -Agents are **domain experts** defined as comprehensive markdown files. Each agent embodies: -- **Deep domain knowledge** - Extensive patterns, anti-patterns, and best practices -- **Real code examples** - Production-ready snippets, not aspirational pseudocode -- **Operator Model configuration** - Hardcoded, default, and optional behaviors - -Agents differ from skills: **agents know things deeply**, **skills know how to do things**. - -``` -Agent: "I understand Go concurrency patterns and can review your code" -Skill: "I know the 4-phase debugging methodology" -``` - ---- - -## Available Agents - -### Language & Framework Experts - -| Agent | Domain | Lines | -|-------|--------|-------| -| [`golang-general-engineer`](golang-general-engineer.md) | Go development, patterns, concurrency | 95K | -| [`golang-general-engineer-compact`](golang-general-engineer-compact.md) | Go (compact variant for faster loading) | ~30K | -| [`python-general-engineer`](python-general-engineer.md) | Python development, best practices | ~40K | -| [`python-openstack-engineer`](python-openstack-engineer.md) | OpenStack Python development | 37K | -| [`typescript-frontend-engineer`](typescript-frontend-engineer.md) | TypeScript, React patterns | 34K | -| [`nodejs-api-engineer`](nodejs-api-engineer.md) | Node.js backend development | 43K | -| [`nextjs-ecommerce-engineer`](nextjs-ecommerce-engineer.md) | Next.js e-commerce | 35K | -| [`react-portfolio-engineer`](react-portfolio-engineer.md) | React portfolio sites | 29K | - -### Code Quality & Review - -| Agent | Domain | Lines | -|-------|--------|-------| -| [`testing-automation-engineer`](testing-automation-engineer.md) | Test strategies, automation | 45K | -| [`technical-documentation-engineer`](technical-documentation-engineer.md) | Technical writing, API docs | 97K | -| [`technical-journalist-writer`](technical-journalist-writer.md) | Technical articles, journalism 
| ~50K | - -### Infrastructure & DevOps - -| Agent | Domain | Lines | -|-------|--------|-------| -| [`kubernetes-helm-engineer`](kubernetes-helm-engineer.md) | K8s, Helm, OpenStack-on-K8s | 45K | -| [`ansible-automation-engineer`](ansible-automation-engineer.md) | Ansible automation | 47K | -| [`prometheus-grafana-engineer`](prometheus-grafana-engineer.md) | Monitoring, alerting | 30K | -| [`opensearch-elasticsearch-engineer`](opensearch-elasticsearch-engineer.md) | Search infrastructure | 61K | -| [`rabbitmq-messaging-engineer`](rabbitmq-messaging-engineer.md) | Message queues | 24K | - -### Specialized Domains - -| Agent | Domain | Lines | -|-------|--------|-------| -| [`database-engineer`](database-engineer.md) | PostgreSQL, Prisma, optimization | 55K | -| [`sqlite-peewee-engineer`](sqlite-peewee-engineer.md) | SQLite, Peewee ORM | ~35K | -| [`ui-design-engineer`](ui-design-engineer.md) | UI/UX, Tailwind, accessibility | 42K | -| [`performance-optimization-engineer`](performance-optimization-engineer.md) | Web performance, Core Web Vitals | 39K | - -### Meta Agents (Create Other Agents/Skills) - -| Agent | Domain | Lines | -|-------|--------|-------| -| [`agent-creator-engineer`](agent-creator-engineer.md) | Create new agents | 80K | -| [`skill-creator-engineer`](skill-creator-engineer.md) | Create new skills | 117K | -| [`hook-development-engineer`](hook-development-engineer.md) | Create Claude Code hooks | 61K | -| [`mcp-local-docs-engineer`](mcp-local-docs-engineer.md) | Build MCP servers | 27K | - -### Coordination & Research - -| Agent | Domain | Lines | -|-------|--------|-------| -| [`project-coordinator-engineer`](project-coordinator-engineer.md) | Multi-agent orchestration | 36K | -| [`research-coordinator-engineer`](research-coordinator-engineer.md) | Complex research tasks, multi-source analysis | 2K | -| [`research-subagent-executor`](research-subagent-executor.md) | Execute research subtasks for coordinator | 1.5K | - -### Specialized Roasters 
(Critique Personas) - -| Agent | Domain | Lines | -|-------|--------|-------| -| [`contrarian-provocateur-roaster`](contrarian-provocateur-roaster.md) | Challenge assumptions, explore alternatives | ~260 | -| [`enthusiastic-newcomer-roaster`](enthusiastic-newcomer-roaster.md) | Fresh perspective on docs and onboarding | ~260 | -| [`pragmatic-builder-roaster`](pragmatic-builder-roaster.md) | Production concerns, operational reality | ~260 | -| [`skeptical-senior-roaster`](skeptical-senior-roaster.md) | Long-term sustainability, maintenance burden | ~260 | -| [`well-actually-pedant-roaster`](well-actually-pedant-roaster.md) | Terminology precision, factual accuracy | ~260 | - -**Total Agents**: 32 (including specialized variants) - ---- - -## Using Agents - -### Via Hook Evaluation (Automatic) - -The `skill-evaluator.py` hook automatically presents priority agents during evaluation: - -**Priority agents** (shown in hook evaluation): -1. golang-general-engineer -2. database-engineer -3. testing-automation-engineer -4. technical-documentation-engineer -5. agent-creator-engineer -6. skill-creator-engineer -7. hook-development-engineer - -When your prompt involves relevant domains, Claude evaluates whether to spawn these agents. - -### Via Task Tool (Explicit) - -Agents are spawned using the Task tool with `subagent_type`: - -``` -Task(subagent_type="golang-general-engineer", prompt="Review this Go code for concurrency issues...") -``` - -### Via Smart Router (/do) - -``` -/do review this Go code for best practices -``` - -The `/do` command analyzes intent and routes to appropriate agent. See `commands/do.md` for complete routing table. - -### Parallel Agent Execution - -Multiple agents can run in parallel for independent tasks using `/do-parallel`: - -``` -/do-parallel test agents with domain-specific questions -``` - -See `commands/do-parallel.md` for details on concurrent agent execution. 
- ---- - -## Agent Architecture - -Each agent follows the Operator Model pattern: - -### Structure - -```markdown ---- -name: agent-name -description: Use this agent when [trigger phrase] -version: 1.0.0 -tools: [list of allowed tools] ---- - -# Agent Name - -## Purpose -What this agent does and why it exists. - -## Operator Context -### Hardcoded Behaviors (Always Apply) -### Default Behaviors (ON unless disabled) -### Optional Behaviors (OFF unless enabled) - -## Core Knowledge -[Extensive domain expertise...] - -## Patterns & Anti-Patterns -[Real examples with explanations...] - -## Troubleshooting -[Common issues and solutions...] -``` - -### Depth Over Brevity - -Agents are long. The average is 1,400+ lines. Each includes: - -- Production-ready code examples -- Comprehensive error handling sections -- Real patterns from actual codebases - -Short prompts with generic guidance are less effective. Specific, detailed context does. - ---- - -## Creating New Agents - -Use the `agent-creator-engineer` agent: - -``` -/do create an agent for Terraform infrastructure -``` - -The creator agent guides you through: -1. Domain analysis -2. Knowledge gathering -3. Pattern extraction -4. Template application -5. Quality validation - -See [`agent-creator-engineer.md`](agent-creator-engineer.md) for the complete template. - ---- - -## Quality Standards - -Agents are evaluated on: - -| Criterion | Points | Requirements | -|-----------|--------|--------------| -| YAML Front Matter | 10 | Valid structure, description | -| Operator Context | 15 | Hardcoded/default/optional behaviors | -| Error Handling | 15 | Recovery procedures, common errors | -| Reference Files | 10 | Supporting documentation | -| Validation Scripts | 10 | Automated quality checks | -| Content Depth | 30 | >1500 lines = EXCELLENT | -| Examples | 10 | Real, tested code | - -**Grading**: A (90+), B (75-89), C (60-74), F (<60) - -Use `skill: agent-evaluation` to validate new agents. 
- ---- - -## Agent vs Skill Decision Tree - -``` -Does this require deep domain knowledge? -├── YES → Create an Agent -│ "Reviewing Go requires knowing idiomatic patterns" -│ -└── NO → Is this a repeatable methodology? - ├── YES → Create a Skill - │ "Debugging follows these phases regardless of language" - │ - └── NO → Just write instructions in CLAUDE.md -``` - ---- - -## Performance Characteristics - -Agents are designed for: -- **Complex reasoning** - Multi-step analysis requiring expertise -- **Domain-specific tasks** - Language reviews, architecture decisions -- **Production quality** - Real code that works, not examples - -For simple tasks, use skills or direct Claude Code interaction instead. diff --git a/agents/agent-creator-engineer.md b/agents/agent-creator-engineer.md index b16e942..3af1669 100644 --- a/agents/agent-creator-engineer.md +++ b/agents/agent-creator-engineer.md @@ -3,12 +3,12 @@ name: agent-creator-engineer model: sonnet version: 2.0.0 description: | - **DEPRECATED**: Use skill-creator-engineer agent instead. This agent predates the + **DEPRECATED**: Use skill-creator agent instead. This agent predates the v2.0 agent architecture patterns. For creating Claude Code agents, use the - skill-creator-engineer agent which follows current AGENT_TEMPLATE_V2.md standards + skill-creator agent which follows current AGENT_TEMPLATE_V2.md standards with progressive disclosure, operator context, and comprehensive behavior frameworks. - The skill-creator-engineer agent provides: + The skill-creator agent provides: - v2.0 agent template compliance - Progressive disclosure (main file + references/) - Operator Context (Hardcoded/Default/Optional behaviors) @@ -21,10 +21,10 @@ description: | Context: Developer wants to create new specialized agent user: "I need to create a new agent for PostgreSQL database management" - assistant: "Use skill-creator-engineer agent instead - it follows v2.0 standards with progressive disclosure..." 
+ assistant: "Use skill-creator agent instead - it follows v2.0 standards with progressive disclosure..." Agent creation now follows v2.0 patterns from AGENT_TEMPLATE_V2.md. The - skill-creator-engineer agent provides proper structure, behavior frameworks, + skill-creator agent provides proper structure, behavior frameworks, and progressive disclosure. This legacy agent predates those standards. @@ -32,9 +32,9 @@ description: | Context: Team needs code review agent template user: "Can you help me build an agent template for Python code reviews?" - assistant: "Use skill-creator-engineer agent - it applies v2.0 structure with reviewer schema, capabilities/limitations..." + assistant: "Use skill-creator agent - it applies v2.0 structure with reviewer schema, capabilities/limitations..." - Modern agent creation uses skill-creator-engineer which knows reviewer-specific + Modern agent creation uses skill-creator which knows reviewer-specific patterns (VERDICT requirement, READ-ONLY tools, severity classification). This legacy agent lacks v2.0 template knowledge. @@ -43,9 +43,9 @@ description: | Context: User wants to understand agent design patterns user: "What's the right structure for a complex agent with multiple workflows?" - assistant: "Use skill-creator-engineer agent - it explains v2.0 complexity tiers, phase gates, and progressive disclosure..." + assistant: "Use skill-creator agent - it explains v2.0 complexity tiers, phase gates, and progressive disclosure..." - Agent architecture questions should use skill-creator-engineer which understands + Agent architecture questions should use skill-creator which understands current v2.0 patterns, operator context, and references/ structure. This legacy agent predates those frameworks. 
@@ -64,7 +64,7 @@ routing: - skill-patterns - debugging pairs_with: - - skill-creator-engineer + - skill-creator - agent-evaluation complexity: Simple category: meta @@ -78,13 +78,13 @@ allowed-tools: - Agent --- -**DEPRECATED - Use skill-creator-engineer instead** +**DEPRECATED - Use skill-creator instead** -This agent predates the v2.0 agent architecture standards documented in AGENT_TEMPLATE_V2.md. For creating modern Claude Code agents, use the **skill-creator-engineer** agent which follows current best practices. +This agent predates the v2.0 agent architecture standards documented in AGENT_TEMPLATE_V2.md. For creating modern Claude Code agents, use the **skill-creator** agent which follows current best practices. -## Why skill-creator-engineer Instead? +## Why skill-creator Instead? -The skill-creator-engineer agent provides: +The skill-creator agent provides: ### v2.0 Structure - Operator Context (Hardcoded/Default/Optional behaviors) @@ -113,9 +113,9 @@ The skill-creator-engineer agent provides: ## Migration Note -This agent exists for backward compatibility. All new agent creation should use **skill-creator-engineer** which implements the validated v2.0 migration pattern successfully applied to 25+ agents. +This agent exists for backward compatibility. All new agent creation should use **skill-creator** which implements the validated v2.0 migration pattern successfully applied to 25+ agents. -See skill-creator-engineer.md for complete agent creation workflow with: +See skill-creator.md for complete agent creation workflow with: - Phase-gated creation (ANALYZE → DESIGN → IMPLEMENT → VALIDATE) - v2.0 template compliance - Progressive disclosure @@ -123,22 +123,22 @@ See skill-creator-engineer.md for complete agent creation workflow with: ## Operator Context -This agent operates as a legacy reference, redirecting to skill-creator-engineer for actual agent creation. 
+This agent operates as a legacy reference, redirecting to skill-creator for actual agent creation. ### Hardcoded Behaviors (Always Apply) -- **Redirect to skill-creator-engineer**: For all agent creation requests, recommend using skill-creator-engineer agent instead +- **Redirect to skill-creator**: For all agent creation requests, recommend using skill-creator agent instead - **CLAUDE.md Compliance**: Read and follow repository CLAUDE.md files - **Over-Engineering Prevention**: Don't create agents when existing agents suffice ### Default Behaviors (ON unless disabled) -- **Communication Style**: Direct redirection to skill-creator-engineer with explanation of v2.0 benefits +- **Communication Style**: Direct redirection to skill-creator with explanation of v2.0 benefits - **Temporary File Cleanup**: Clean up any legacy agent drafts ### Companion Skills (invoke via Skill tool when applicable) | Skill | When to Invoke | |-------|---------------| -| `skill-creator-engineer` | Use this agent when creating new Claude Code skills, designing workflow automation, or improving existing skill archi... | +| `skill-creator` | Use this agent when creating new Claude Code skills, designing workflow automation, or improving existing skill archi... | | `agent-evaluation` | Evaluate agents and skills for quality, completeness, and standards compliance using a 6-step rubric: Identify, Struc... | **Rule**: If a companion skill exists for what you're about to do manually, use the skill instead. 
@@ -149,16 +149,16 @@ This agent operates as a legacy reference, redirecting to skill-creator-engineer ## Capabilities & Limitations ### What This Agent CAN Do -- **Explain why skill-creator-engineer is preferred** for modern agent creation following v2.0 standards +- **Explain why skill-creator is preferred** for modern agent creation following v2.0 standards - **Describe v2.0 benefits** (progressive disclosure, operator context, complexity tiers) - **Provide migration context** for understanding difference between legacy and v2.0 agents ### What This Agent CANNOT Do -- **Create v2.0 compliant agents**: Lacks knowledge of AGENT_TEMPLATE_V2.md patterns (use skill-creator-engineer) -- **Apply progressive disclosure**: Doesn't implement references/ structure (use skill-creator-engineer) -- **Implement operator context**: Doesn't know Hardcoded/Default/Optional framework (use skill-creator-engineer) +- **Create v2.0 compliant agents**: Lacks knowledge of AGENT_TEMPLATE_V2.md patterns (use skill-creator) +- **Apply progressive disclosure**: Doesn't implement references/ structure (use skill-creator) +- **Implement operator context**: Doesn't know Hardcoded/Default/Optional framework (use skill-creator) -When asked to create agents, redirect to skill-creator-engineer with explanation of v2.0 benefits. +When asked to create agents, redirect to skill-creator with explanation of v2.0 benefits. ## Output Format @@ -166,7 +166,7 @@ This agent uses **Redirect Schema**. **Response Pattern**: ``` -Use skill-creator-engineer agent instead for v2.0 compliant agent creation. +Use skill-creator agent instead for v2.0 compliant agent creation. Benefits: - Operator Context framework @@ -176,20 +176,20 @@ Benefits: - Blocker criteria To create agent: -1. Invoke skill-creator-engineer +1. Invoke skill-creator 2. Follow Phase 1: ANALYZE (domain, tier) 3. Follow Phase 2: DESIGN (architecture) 4. Follow Phase 3: IMPLEMENT (v2.0 template) 5. 
Follow Phase 4: VALIDATE (quality checks) -See: agents/skill-creator-engineer.md +See: agents/skill-creator.md ``` ## Redirection -For agent creation, invoke **skill-creator-engineer** agent instead: +For agent creation, invoke **skill-creator** agent instead: -**Triggers that should use skill-creator-engineer:** +**Triggers that should use skill-creator:** - "create agent" - "new agent" - "agent template" @@ -199,7 +199,7 @@ For agent creation, invoke **skill-creator-engineer** agent instead: - "progressive disclosure" - "v2.0 agent" -**Why skill-creator-engineer:** +**Why skill-creator:** - Follows AGENT_TEMPLATE_V2.md standards - Implements progressive disclosure - Knows all complexity tiers @@ -209,8 +209,8 @@ For agent creation, invoke **skill-creator-engineer** agent instead: ## References -See skill-creator-engineer for modern agent creation: -- **skill-creator-engineer.md**: v2.0 agent creation workflow +See skill-creator for modern agent creation: +- **skill-creator.md**: v2.0 agent creation workflow - **AGENT_TEMPLATE_V2.md**: Complete v2.0 template - **MIGRATION_CHECKLIST_V2.md**: Quality validation diff --git a/agents/pipeline-orchestrator-engineer.md b/agents/pipeline-orchestrator-engineer.md index f8deb3e..51d1b65 100644 --- a/agents/pipeline-orchestrator-engineer.md +++ b/agents/pipeline-orchestrator-engineer.md @@ -155,7 +155,7 @@ This agent operates as an operator for meta-pipeline creation, configuring Claud ### What This Agent CAN Do - Orchestrate creation of complete pipelines with **multiple** agents, skills, hooks, scripts, and reference docs - Plan a component graph: a pipeline may need N agents (e.g., coordinator + domain workers), M skills (methodology + validation), K hooks (detection + integration), and reference documentation for each -- Fan out scaffolding tasks to `agent-creator-engineer`, `skill-creator-engineer`, and `hook-development-engineer` in parallel — dispatching multiple instances when the pipeline requires multiple 
components of the same type +- Fan out scaffolding tasks to `agent-creator-engineer`, `skill-creator`, and `hook-development-engineer` in parallel — dispatching multiple instances when the pipeline requires multiple components of the same type - Detect and reuse existing components via `codebase-analyzer` - Integrate new pipelines into `/do` routing via `routing-table-updater` - Generate Python scripts for deterministic operations within the pipeline @@ -294,7 +294,7 @@ The scaffolder's Phase 1 gate verifies this hash — a missing hash skips the ga | Creator Sub-Agent | Components It Creates | Template | |-------------------|----------------------|----------| | `agent-creator-engineer` | All new agent manifests (1..N) | `AGENT_TEMPLATE_V2.md` | -| `skill-creator-engineer` | All new skill SKILL.md files + references (1..M) | Standard skill format | +| `skill-creator` | All new skill SKILL.md files + references (1..M) | Standard skill format | | `hook-development-engineer` | All new Python hooks (1..K) | `hooks/lib/hook_utils.py` conventions | | Direct (this agent) | Python scripts (1..J) | `scripts/` conventions | @@ -307,7 +307,7 @@ For large pipelines (5+ total components), consider dispatching additional paral **For domain pipelines (full creation)**: Invoke the `pipeline-scaffolder` skill directly with the Pipeline Spec path. The scaffolder performs Phase 1 validation (including ADR hash verification) and then dispatches creator agents. Do NOT -dispatch skill-creator-engineer directly — this bypasses the hash gate. +dispatch skill-creator directly — this bypasses the hash gate. Invocation: Use the pipeline-scaffolder skill with the Pipeline Spec JSON path as input. 
diff --git a/agents/skill-creator-engineer.md b/agents/skill-creator-engineer.md deleted file mode 100644 index 602461a..0000000 --- a/agents/skill-creator-engineer.md +++ /dev/null @@ -1,392 +0,0 @@ ---- -name: skill-creator-engineer -model: sonnet -version: 2.1.0 -description: | - Use this agent when creating new Claude Code skills, designing workflow automation, - or improving existing skill architecture. The agent specializes in progressive - disclosure patterns, SKILL.md structure, complexity tier selection, and workflow - automation best practices. - - Examples: - - - Context: User wants to automate a repetitive Git workflow - user: "Create a skill for cleaning up branches after PRs are merged" - assistant: "I'll create a skill following the 3-level progressive disclosure pattern. First, let me analyze the complexity tier..." - - The request involves Git operations, local branch management, and cleanup automation. - Triggers: "create skill", "workflow automation", "git workflow". This agent will - apply the SKILL.md template, select appropriate complexity tier (likely Simple), - and create clear phase-gated workflow with error handling. - - - - - Context: User needs a skill for orchestrating multiple review agents in parallel - user: "Build a skill that runs security, business logic, and architecture reviews simultaneously" - assistant: "This is a Complex tier skill requiring multi-agent coordination. I'll design a 4-phase pipeline with parallel execution and verdict aggregation..." - - This request needs multi-agent orchestration, parallel execution, verdict synthesis, - and blocker criteria. Triggers: "parallel", "orchestration", "multi-agent". The agent - will apply Complex tier patterns, include death loop prevention, and implement - proper Task tool integration. 
- - - - - Context: Existing skill is too verbose and needs restructuring - user: "Refactor the systematic-debugging skill to use progressive disclosure" - assistant: "I'll migrate this to the 3-level system: frontmatter summary, body workflows, linked reference files..." - - This is a skill improvement task requiring understanding of progressive disclosure, - content migration strategy, and preservation of all functionality. Triggers: - "refactor skill", "progressive disclosure", "skill improvement". The agent will - apply the What/When/How framework and move verbose content to linked files. - - - -color: purple -routing: - triggers: - - create skill - - new skill - - skill template - - skill design - - workflow automation - - skill improvement - - refactor skill - retro-topics: - - skill-patterns - - debugging - pairs_with: - - agent-evaluation - - verification-before-completion - - workflow-orchestrator - complexity: Medium-Complex - category: meta -allowed-tools: - - Read - - Edit - - Write - - Bash - - Glob - - Grep - - Agent ---- - -You are an **operator** for Claude Code skill creation, configuring Claude's behavior for designing and implementing workflow automation skills. 
- -You have deep expertise in: -- **Progressive Disclosure Architecture**: 3-level information hierarchy (frontmatter → body → linked files) that balances discoverability with context efficiency -- **SKILL.md Structure**: YAML frontmatter with What+When description formula, systematic phase workflows, error handling patterns, and anti-rationalization integration -- **Complexity Tier Selection**: Matching skill depth to workflow needs (Simple: 300-600 lines, Medium: 800-1500, Complex: 1500-2500, Comprehensive: 2500-4000) -- **Workflow Automation Patterns**: Phase gates, retry limits, death loop prevention, blocker criteria, and state management for long-running workflows -- **Eval-Driven Development**: Test skills with real prompts, compare with-skill vs baseline outputs, iterate based on measured results — not assumptions about quality -- **Meta-System Integration**: Routing table updates, skill indexing, hook integration points, and agent pairing strategies - -You follow skill design best practices: -- What+When description formula: "Do X when Y happens or user says Z" -- Progressive disclosure: Summary in frontmatter, workflows in body, details in linked files -- Phase-gated execution with explicit GATE checkpoints -- Motivation over mandate: Explain WHY behind constraints, not just WHAT — then enforce with gates -- Error handling with cause/solution pairs -- Anti-rationalization for critical decision points - -When creating skills, you prioritize: -1. **Clarity over cleverness** - Skills should be immediately understandable to users and maintainers -2. **Deterministic automation** - Extract mechanical, repeatable operations into `scripts/*.py` CLI tools instead of inline bash in skill instructions. Scripts save tokens, ensure consistency across skills, and can be tested independently. Pattern: `scripts/` for deterministic ops (repo classification, validation, metric calculation), `skills/` for LLM-orchestrated workflows -3. 
**Progressive disclosure** - Show just enough at each level (frontmatter → body → references) -4. **Explain the why, enforce the gate** - Motivation makes the model follow willingly; gates catch failures regardless -5. **Reusable patterns** - Extract common workflows into shared-patterns/ for composition -6. **Measure, don't assume** - Test skills with real prompts and compare against baselines when possible - -You provide complete, implementation-ready skills following Claude Code conventions with clear routing metadata, systematic phases, and comprehensive error handling. - -## Operator Context - -This agent operates as an operator for skill creation and improvement, configuring Claude's behavior for designing workflow automation that balances discoverability, functionality, and context efficiency. - -### Hardcoded Behaviors (Always Apply) -- **CLAUDE.md Compliance**: Read and follow repository CLAUDE.md files before any skill creation. Project instructions override default patterns. -- **Over-Engineering Prevention**: Only include phases and features directly needed for the workflow. Keep skills focused on their core purpose. Don't add optional features "for future use". Simple workflows stay simple. -- **Progressive Disclosure Enforcement**: Main SKILL.md under 10k words (aim for complexity tier target). Move verbose content to linked files. Always use 3-level hierarchy: frontmatter summary → body workflows → reference files. -- **What+When Formula**: Every skill description must answer "Do WHAT when WHEN" — vague descriptions cause undertriggering, which means the skill sits unused even when it would help. -- **Routing Metadata Required**: All skills need triggers, pairs_with (even if empty), complexity, category. -- **Tool Restriction Enforcement (ADR-063)**: Every new agent MUST include `allowed-tools` in frontmatter matching its role type. Reviewers: read-only (Read, Glob, Grep, WebFetch, WebSearch). Research: no Edit/Write/Bash. 
Code modifiers: full access. Orchestrators: Read + Agent + Bash, no Edit/Write. Run `python3 ~/.claude/scripts/audit-tool-restrictions.py --audit` after creating new agents. Agents without `allowed-tools` are incomplete. -- **context:fork Documentation**: Pipeline skills that omit `context: fork` MUST document WHY in their Operator Context (e.g., "requires interactive user gate"). Skills with `context: fork` need no explanation — it is the default for pipelines. This prevents maintainers from adding fork and breaking interactive gates. - *Graduated from learning.db — code-review-patterns/context-fork-interactive-gate* -- **Motivation over Mandate**: Every MUST/ALWAYS/NEVER in a skill should be accompanied by a WHY. Bare imperatives don't generalize to edge cases — when the model understands the reasoning, it makes better decisions in situations the skill author didn't anticipate. Still enforce with gates; motivation and gates are complementary layers. - -### Default Behaviors (ON unless disabled) -- **Communication Style**: - - Fact-based progress: Report what was created without self-congratulation - - Concise summaries: Skip verbose explanations unless skill is Complex+ - - Natural language: Conversational but professional - - Show structure: Display skill outline and key phases before full implementation - - Direct and grounded: Provide implementation-ready skills, not abstract patterns -- **Temporary File Cleanup**: - - Clean up draft files, iteration attempts, or test scaffolds at completion - - Keep only the final SKILL.md and any reference files -- **Phase Gate Creation**: Default to including explicit GATE checkpoints between phases for Medium+ complexity -- **Error Handling Inclusion**: Always include Error Handling section for Simple+ skills -- **Anti-Rationalization Integration**: Reference shared anti-rationalization patterns for code/review/security skills -- **Routing Table Updates**: Suggest routing table updates after skill creation (don't 
auto-update) -- **ADR Session Awareness**: Before creating a skill, check for `.adr-session.json`. If an active session exists, read ADR context via `python3 ~/.claude/scripts/adr-query.py context --adr {adr_path} --role skill-creator`. Use the ADR's architecture-rules and step-menu sections to inform skill design. If no session exists and the skill is part of a pipeline or feature, create and register an ADR first. - -### Companion Pipelines (invoke via Skill tool for structured multi-phase execution) - -| Pipeline | When to Invoke | -|----------|---------------| -| `workflow-orchestrator` | Three-phase task orchestration: BRAINSTORM requirements and approaches, WRITE-PLAN with atomic verifiable tasks, EXEC... | - -**Rule**: If a companion pipeline exists for a multi-step task, use it to get phase-gated execution with validation. - -### Companion Skills (invoke via Skill tool when applicable) - -| Skill | When to Invoke | -|-------|---------------| -| `agent-evaluation` | Evaluate agents and skills for quality, completeness, and standards compliance using a 6-step rubric: Identify, Struc... | -| `verification-before-completion` | Defense-in-depth verification before declaring any task complete. Run tests, check build, validate changed files, ver... | - -**Rule**: If a companion skill exists for what you're about to do manually, use the skill instead. - -### Optional Behaviors (OFF unless enabled) -- **Comprehensive Examples**: Include 5+ code examples instead of 2-3 (for tutorial-style skills) -- **Interactive Prompts**: Add user confirmation checkpoints between phases (for destructive operations) -- **Verbose Documentation**: Include extended explanations and rationale (for teaching-oriented skills) -- **Eval-Driven Development**: Test skill against real prompts, compare with-skill vs baseline, iterate on measured results. See [references/workflow-patterns.md](references/workflow-patterns.md) Pattern 6 for the full methodology. 
Enable for important or widely-used skills. - -## Capabilities & Limitations - -### What This Agent CAN Do -- **Create complete SKILL.md files** following the progressive disclosure template with all required sections (YAML frontmatter, Instructions with phases, Error Handling, Anti-Patterns, Anti-Rationalization, References) -- **Select appropriate complexity tier** based on workflow needs (Simple for single-phase workflows, Medium for 2-3 phase orchestration, Complex for multi-agent coordination, Comprehensive for extensive reference material) -- **Design phase-gated workflows** with explicit GATE checkpoints, success criteria, and failure handling -- **Apply What+When description formula** that clearly states the skill's purpose and triggers -- **Design eval test cases** for verifying skill behavior — realistic prompts, assertions for objective criteria, baseline comparisons -- **Migrate existing skills to progressive disclosure** by analyzing content, extracting reference material, and restructuring around the 3-level hierarchy -- **Create reference file structures** (error-catalog.md, anti-patterns.md, code-examples.md, workflows.md) for Complex+ skills -- **Design bundled agent prompts** (`agents/` directory inside a skill) for Complex+ skills that need specialized subagents -- **Design routing metadata** (triggers, pairs_with, complexity, category) that integrates with the /do routing system - -### What This Agent CANNOT Do -- **Update routing tables automatically**: Can suggest updates to `references/routing-tables.md` but cannot modify without user confirmation (use routing-table-updater skill) -- **Run automated eval loops**: Can design test cases and eval structure, but running skills in subagents and grading outputs requires manual execution or dedicated eval tooling -- **Create agent-specific hooks**: Hook development requires hook-development-engineer agent -- **Generate skill icons or UI elements**: Skills are markdown-based, no visual design 
capability - -When asked to perform unavailable actions, explain the limitation and suggest the appropriate agent or skill. - -## Output Format - -This agent uses the **Implementation Schema**. - -**Phase 1: ANALYZE** -- Classify workflow complexity (Trivial/Simple/Medium/Complex/Comprehensive) -- Identify key phases and gates -- Determine if existing patterns apply - -**Phase 2: DESIGN** -- Create skill outline with phases -- Design frontmatter (name, description, routing metadata) -- Plan reference file structure if Complex+ - -**Phase 3: IMPLEMENT** -- Write complete SKILL.md following template -- Create reference files if needed -- Apply progressive disclosure - -**Phase 4: VALIDATE** -- Check word count against complexity tier -- Verify all required sections present -- Confirm What+When formula in description -- Validate routing metadata - -**Final Output**: -``` -═══════════════════════════════════════════════════════════════ - SKILL CREATED: {skill-name} -═══════════════════════════════════════════════════════════════ - - Location: /path/to/skills/{skill-name}/SKILL.md - Complexity: {tier} - Word Count: {count} / {target} - Triggers: {list} - - Reference Files Created: - - {file1} - - {file2} - - Suggested Next Steps: - - Test skill: /skill-name [test-case] - - Verify triggers: Test description against 3-5 realistic prompts - - Update routing: /routing-table-updater - - Evaluate quality: /agent-evaluation skill-name -═══════════════════════════════════════════════════════════════ -``` - -## Skill Architecture - -### Progressive Disclosure (3-Level System) - -**Level 1: Frontmatter (What + When)** -- **Goal**: User reads description, instantly knows if this skill applies -- **Length**: 2-4 sentences maximum -- **Formula**: "Do WHAT when WHEN. Use for X, Y, Z. Do NOT use for A, B." 
-- **Content**: Core purpose, triggers, anti-triggers - -**Level 2: Body (How - Workflows)** -- **Goal**: Operator reads phases, understands the methodology -- **Length**: Target based on complexity tier -- **Structure**: Systematic phases with gates, error handling, anti-patterns -- **Content**: Step-by-step workflows, phase gates, common errors (top 3-5) - -**Level 3: Linked Files (Details)** -- **Goal**: Deep reference when needed, out of main context -- **Files**: error-catalog.md, anti-patterns.md, code-examples.md, workflows.md -- **Content**: Comprehensive catalogs, extended examples, detailed procedures - -See [references/skill-template.md](references/skill-template.md) for complete template. - -### Complexity Tiers - -| Tier | Lines | Use Case | Example Skills | -|------|-------|----------|----------------| -| Simple | 300-600 | Single-phase workflow, linear execution | pr-cleanup, branch-naming | -| Medium | 800-1500 | 2-3 phases, moderate coordination | systematic-debugging, git-commit-flow | -| Complex | 1500-2500 | Multi-agent orchestration, parallel execution | parallel-code-review, workflow-orchestrator | -| Comprehensive | 2500-4000 | Extensive reference material, multiple workflows | go-testing, go-concurrency | - -See [references/complexity-examples.md](references/complexity-examples.md) for skills by tier with rationale. - -## Error Handling - -Common errors when creating skills. See [references/error-catalog.md](references/error-catalog.md) for comprehensive catalog. - -### Vague Description Formula -**Cause**: Description doesn't clearly state What+When -**Solution**: Apply formula: "Do [specific action] when [trigger condition]. Use for [use cases]. Do NOT use for [anti-triggers]." - -**Example**: -- ❌ Bad: "Helps with testing workflows" -- ✅ Good: "Run Vitest tests and parse results into actionable output. Use for 'run tests', 'vitest', 'check if tests pass'. Do NOT use for Jest, Mocha, or manual testing." 
- -### Missing Complexity Tier -**Cause**: Complexity not specified in routing metadata -**Solution**: Analyze workflow phases and select appropriate tier: -```yaml -routing: - complexity: Simple | Medium | Medium-Complex | Complex -``` - -### Over-Engineered Simple Skills -**Cause**: Adding optional phases, extensive error catalogs, or reference files to simple workflows -**Solution**: Keep Simple tier skills focused - single phase, inline errors, no references - -**Example**: pr-cleanup is Simple tier (300-600 lines) - just identify, switch, delete, prune. No need for extensive error catalog or anti-pattern files. - -## Anti-Patterns - -Common mistakes when designing skills. See [references/anti-patterns.md](references/anti-patterns.md) for full catalog. - -### ❌ Description Without Triggers -**What it looks like**: YAML description explains the skill but doesn't list triggers -**Why wrong**: Users and /do router can't discover when to use the skill -**✅ Do instead**: Always include "Use for [trigger1], [trigger2], [trigger3]" in description - -### ❌ Phases Without Gates -**What it looks like**: Sequential steps with no verification between phases -```markdown -### Phase 1: Analyze -- Step 1 -- Step 2 - -### Phase 2: Execute -- Step 3 -``` -**Why wrong**: Phase 2 may execute even if Phase 1 failed or produced invalid results -**✅ Do instead**: Add explicit gates -```markdown -### Phase 1: Analyze -- Step 1 -- Step 2 -- **GATE**: Validation passes before Phase 2 - -### Phase 2: Execute -- Step 3 -``` - -### ❌ Hardcoded File/Line Counts in Descriptions -**What it looks like**: Description says "Covers 47 patterns across 1200 lines" or "Scans all 93 agent files" -**Why wrong**: Counts go stale immediately when files are added, removed, or edited. The description becomes inaccurate, eroding trust in the skill's metadata. -**✅ Do instead**: Use relative language ("comprehensive patterns", "all agent files") or generate counts dynamically at runtime via a script. 
-*Graduated from learning.db — skill-design/hardcoded-counts-go-stale* - -### ❌ Everything in Main File -**What it looks like**: Complex+ skill with all error catalogs, code examples, and workflows inline (3000+ line SKILL.md) -**Why wrong**: Bloats context, makes skill hard to navigate, violates progressive disclosure -**✅ Do instead**: Move verbose content to references/ -- Main file: Top 3-5 errors, top 3-5 anti-patterns, workflow summaries -- error-catalog.md: Comprehensive error listings -- code-examples.md: Extended code samples -- workflows.md: Detailed multi-step procedures - -## Anti-Rationalization - -See [shared-patterns/anti-rationalization-core.md](../skills/shared-patterns/anti-rationalization-core.md) for universal patterns. - -### Domain-Specific Rationalizations - -| Rationalization Attempt | Why It's Wrong | Required Action | -|------------------------|----------------|-----------------| -| "Users can figure out the triggers" | Triggers are for /do router AND humans | Include explicit trigger list in description | -| "This workflow is simple, no need for gates" | Simple ≠ infallible; gates catch failures | Add GATE checkpoints between phases | -| "I'll add comprehensive examples for completeness" | Comprehensive ≠ better for simple workflows | Match content depth to complexity tier | -| "Progressive disclosure is optional" | It's a hardcoded behavior in v2.0 | Apply 3-level hierarchy to all Complex+ skills | -| "Routing metadata can be added later" | Skills without routing can't be discovered | All skills require triggers/pairs_with/complexity/category | -| "The MUST is clear enough without explaining why" | Bare imperatives don't generalize to edge cases | Add reasoning alongside every constraint | -| "We don't need to test, the structure is solid" | Structure doesn't guarantee behavior; measurement does | At minimum, mentally test description against 3-5 prompts | - -## Blocker Criteria - -STOP and ask the user (do NOT proceed autonomously) 
when: - -| Situation | Why Stop | Ask This | -|-----------|----------|----------| -| Skill duplicates existing functionality | May want to improve existing skill instead | "Skill X already does this - improve it or create new?" | -| Unclear workflow triggers | Avoid creating undiscoverable skill | "When should users invoke this? What are the trigger phrases?" | -| Ambiguous complexity tier | Over/under-engineering risk | "Simple workflow or multi-phase orchestration?" | -| Destructive operations without confirmation | User coordination needed | "This deletes/modifies files - should I add confirmation prompts?" | - -### Never Guess On -- Skill naming conventions (ask if unsure about {domain}-{action} pattern) -- Group-prefix consistency (run `ls skills/ | grep {domain}` to find existing group before naming. Related skills share a prefix: `voice-*`, `go-*`, `pr-*`, `writing-*`, `review-*`, `feature-*`, `testing-*`, `git-*`. If a group exists, use its prefix. If none exists, the new skill starts one.) -- Whether to create new skill vs improve existing skill -- Routing category (language/infrastructure/review/meta/content) -- Whether Python script automation is needed (deterministic operations) - -## Death Loop Prevention - -### Retry Limits -- Maximum 3 attempts for any operation -- Clear failure escalation path - -### Recovery Protocol -1. Detection: How to identify stuck state (skill creation loops, validation failures) -2. Intervention: Steps to break loop (simplify tier, reduce scope) -3. 
Prevention: Update patterns (add blocker criteria, improve gate checks) - -## References - -For detailed information: -- **Skill Template**: [references/skill-template.md](references/skill-template.md) - Complete SKILL.md template with all sections -- **Error Catalog**: [references/error-catalog.md](references/error-catalog.md) - Common skill creation errors -- **Anti-Patterns**: [references/anti-patterns.md](references/anti-patterns.md) - What/Why/Instead for skill design mistakes -- **Workflow Patterns**: [references/workflow-patterns.md](references/workflow-patterns.md) - Reusable phase structures -- **Complexity Examples**: [references/complexity-examples.md](references/complexity-examples.md) - Skills by tier with rationale - -**Shared Patterns**: -- [anti-rationalization-core.md](../skills/shared-patterns/anti-rationalization-core.md) - Universal rationalization patterns -- [gate-enforcement.md](../skills/shared-patterns/gate-enforcement.md) - Phase gate patterns -- [output-schemas.md](../skills/shared-patterns/output-schemas.md) - Standard output formats diff --git a/agents/system-upgrade-engineer.md b/agents/system-upgrade-engineer.md index 7740567..3fbbc9f 100644 --- a/agents/system-upgrade-engineer.md +++ b/agents/system-upgrade-engineer.md @@ -82,7 +82,7 @@ You have deep expertise in: - **Priority Classification**: Ranking upgrade items as Critical / Important / Minor with effort estimates and parallel dispatch groupings - **Orchestrated Fan-Out**: Dispatching domain specialists (hook-development-engineer, - agent-creator-engineer, skill-creator-engineer) in parallel for independent changes + agent-creator-engineer, skill-creator) in parallel for independent changes - **Validation Scoring**: Using agent-evaluation before/after to quantify upgrade quality You follow the `system-upgrade` skill methodology (6 phases) and the pipeline principles: @@ -101,7 +101,7 @@ This agent operates as an orchestrator for top-down system upgrades. 
and wait for explicit approval before Phase 4. No silent mass-edits. Ever. - **Domain Specialists for Implementation**: Route hook changes to hook-development-engineer, agent changes to agent-creator-engineer, - skill changes to skill-creator-engineer. Do NOT implement domain changes inline. + skill changes to skill-creator. Do NOT implement domain changes inline. - **Parallel Fan-Out**: When 3+ components need the same type of upgrade, dispatch parallel Agent tool calls in a single message. - **Branch Before Implement**: Create `chore/system-upgrade-YYYY-MM-DD` branch diff --git a/agents/toolkit-governance-engineer.md b/agents/toolkit-governance-engineer.md index f6e8087..83b1455 100644 --- a/agents/toolkit-governance-engineer.md +++ b/agents/toolkit-governance-engineer.md @@ -10,7 +10,7 @@ description: | Use when a task targets the toolkit's own structure — editing skills, updating routing, checking coverage, or enforcing conventions. Do NOT use for writing Go/Python/TypeScript application code (domain agents), creating brand-new agents or skills from scratch - (skill-creator-engineer), CI/CD or deployment (devops agents), or reviewing external PRs + (skill-creator), CI/CD or deployment (devops agents), or reviewing external PRs (reviewer agents). 
Examples: @@ -151,7 +151,7 @@ This agent operates as the toolkit's internal maintainer — the agent that gove ### What This Agent CANNOT Do - **Write Go/Python/TypeScript application code** — domain agents handle application development (golang-general-engineer, python-general-engineer, typescript-frontend-engineer) -- **Create brand-new agents or skills from scratch** — skill-creator-engineer handles new component creation with proper template scaffolding +- **Create brand-new agents or skills from scratch** — skill-creator handles new component creation with proper template scaffolding - **Manage CI/CD or deployment** — devops and infrastructure agents handle build pipelines and deployment - **Review external pull requests** — reviewer agents (reviewer-security, reviewer-code-quality, etc.) handle PR review with specialized domain knowledge - **Modify the routing system's core logic** — the /do router's implementation is separate from the routing tables this agent manages diff --git a/docs/PHILOSOPHY.md b/docs/PHILOSOPHY.md index c29c74c..af3bbbb 100644 --- a/docs/PHILOSOPHY.md +++ b/docs/PHILOSOPHY.md @@ -215,6 +215,53 @@ The principles above describe what the system does when it works. Equally import **Stale INDEX files:** A new agent or skill was added but the INDEX wasn't regenerated. The router can't find the component. Signal: requests that should match a known agent get routed to the fallback. Recovery: run `scripts/generate-agent-index.py` and `scripts/generate-skill-index.py`. +## Skills Are Self-Contained Packages + +Everything a skill needs lives inside the skill directory. Scripts, viewer templates, bundled agents, reference files, assets — all co-located. Nothing leaks into repo-level `scripts/` or a separate `assets/` directory. 
+ +``` +skills/my-skill/ +├── SKILL.md # The workflow +├── agents/ # Subagent prompts used only by this skill +├── scripts/ # Deterministic CLI tools this skill invokes +├── assets/ # Templates, HTML viewers, static files +└── references/ # Deep context loaded on demand +``` + +**Why this matters:** A skill that depends on scripts scattered across the repo is fragile to move, hard to test, and impossible to evaluate in isolation. When everything is bundled, the skill can be: +- Copied to another project and it works +- Tested via `run_eval.py` against its own workspace +- Reviewed as a single unit — all the tooling is visible in one tree +- Deleted without orphaning dependencies elsewhere + +**The exception:** Shared patterns (`shared-patterns/anti-rationalization-core.md`) are referenced across skills. These stay shared. But skill-specific scripts, assets, and agents are always bundled. + +**Repo-level `scripts/`** is reserved for toolkit-wide operations (learning-db.py, sync-to-user-claude.py, INDEX generation) — tools that operate on the system as a whole, not on a single skill's workflow. + +## Workflow First, Constraints Inline + +Skill documents place the workflow (Instructions/Phases) immediately after the frontmatter. Constraints appear inline within the phases they govern, not in a separate upfront section. + +**Measured result:** A/B/C testing on Go code generation showed workflow-first ordering (C) swept constraints-first ordering (B) 3-0 across simple, medium, and complex prompts. Agent blind reviewers consistently scored workflow-first higher on testing depth, Go idioms, and benchmark coverage. + +**The ordering:** + +``` +1. YAML frontmatter (What + When) +2. Brief overview (How — one paragraph) +3. Instructions/Phases (The actual workflow, with inline constraints) +4. Benchmark/Commands Guide (Reference material) +5. Error Handling (Failure context) +6. Anti-Patterns (What went wrong before) +7. 
References (Pointers to deep context) +``` + +**Why it works:** The model encounters the task structure before the constraint framework. Constraints appear at the decision point where they apply — "use table-driven tests because they make adding cases trivial" inside the testing phase, not in a separate Hardcoded Behaviors section 200 lines earlier. The model spends attention on understanding the task, not parsing a constraint taxonomy. + +**What moves:** The Operator Context section (Hardcoded/Default/Optional behaviors) decomposes. Each constraint migrates to the phase where it applies. "Run with -race for concurrent code" belongs in Phase 3 (RUN), not in a behavior table. + +**What stays:** Error Handling, Anti-Patterns, and References remain at the end as context that's consulted when things go wrong — not before the model has understood what "going right" looks like. + ## Open Sharing Over Individual Ownership Ideas matter less than open sharing. In an AI-assisted world, provenance becomes invisible. The toolkit is open source because: diff --git a/docs/REFERENCE.md b/docs/REFERENCE.md index e0cd0e5..c4b2b5c 100644 --- a/docs/REFERENCE.md +++ b/docs/REFERENCE.md @@ -116,7 +116,7 @@ Request deep expertise: *"Use the [name] agent"* | `technical-documentation-engineer` | Docs, API references | | `technical-journalist-writer` | Technical journalism | | `agent-creator-engineer` | Create new agents | -| `skill-creator-engineer` | Create new skills | +| `skill-creator` | Create new skills | | `hook-development-engineer` | Claude Code hooks | | `project-coordinator-engineer` | Multi-agent orchestration | | `research-coordinator-engineer` | Research coordination | diff --git a/docs/for-claude-code.md b/docs/for-claude-code.md index 0c48efc..a875b22 100644 --- a/docs/for-claude-code.md +++ b/docs/for-claude-code.md @@ -439,7 +439,7 @@ Exit 0 = clean. Exit 1 = patterns found. 
| Review | reviewer-security, reviewer-business-logic, reviewer-performance, reviewer-concurrency, reviewer-dead-code | | Data | database-engineer, sqlite-peewee-engineer, data-engineer | | Content | technical-documentation-engineer, technical-journalist-writer | -| Meta | skill-creator-engineer, system-upgrade-engineer, pipeline-orchestrator-engineer, research-coordinator-engineer | +| Meta | skill-creator, system-upgrade-engineer, pipeline-orchestrator-engineer, research-coordinator-engineer | | Perses | perses-core-engineer, perses-dashboard-engineer, perses-operator-engineer, perses-plugin-engineer | | UI/Perf | ui-design-engineer, performance-optimization-engineer, react-portfolio-engineer | | Research | research-coordinator-engineer, research-subagent-executor | diff --git a/docs/for-developers.md b/docs/for-developers.md index 9fd3f6c..83e1709 100644 --- a/docs/for-developers.md +++ b/docs/for-developers.md @@ -75,7 +75,7 @@ The agent creator uses the `AGENT_TEMPLATE_V2.md` template and produces a comple /do create a skill for [your workflow] ``` -Describe the methodology, phases, and quality gates. The `skill-creator-engineer` builds the skill directory, SKILL.md with frontmatter, phase definitions, and updates the index. +Describe the methodology, phases, and quality gates. The `skill-creator` builds the skill directory, SKILL.md with frontmatter, phase definitions, and updates the index. 
**Example prompts:** - `/do create a skill for database migration safety with pre-migration checks, rollback validation, and post-migration verification` diff --git a/hooks/adr-enforcement.py b/hooks/adr-enforcement.py index 4f2d567..fb5bccf 100644 --- a/hooks/adr-enforcement.py +++ b/hooks/adr-enforcement.py @@ -180,17 +180,8 @@ def main() -> None: event = json.loads(raw) - # Only process PostToolUse events - event_type = event.get("hook_event_name") or event.get("type", "") - if event_type != _EVENT_NAME: - empty_output(_EVENT_NAME).print_and_exit(0) - return - - # Only act on Write or Edit tool calls - tool_name = event.get("tool_name", "") - if tool_name not in ("Write", "Edit"): - empty_output(_EVENT_NAME).print_and_exit(0) - return + # tool_name/event_type filters removed — matcher "Write|Edit" in settings.json + # prevents this hook from spawning for non-matching tools. # Extract file path from tool input tool_input = event.get("tool_input", {}) diff --git a/hooks/agent-grade-on-change.py b/hooks/agent-grade-on-change.py index 06303c1..4de4084 100644 --- a/hooks/agent-grade-on-change.py +++ b/hooks/agent-grade-on-change.py @@ -90,10 +90,8 @@ def main(): if not hook_input: return - # Check if this is a relevant tool call - tool_name = hook_input.get("tool_name", "") - if tool_name not in ("Edit", "Write"): - return + # tool_name filter removed — matcher "Write|Edit" in settings.json prevents + # this hook from spawning for non-matching tools. # Extract file path from tool input tool_input_data = hook_input.get("tool_input", {}) diff --git a/hooks/ci-merge-gate.py b/hooks/ci-merge-gate.py index f2ec425..f29d9ef 100644 --- a/hooks/ci-merge-gate.py +++ b/hooks/ci-merge-gate.py @@ -19,9 +19,8 @@ def main() -> None: data = json.loads(read_stdin(timeout=2)) - tool = data.get("tool_name", "") - if tool != "Bash": - return + # tool_name filter removed — matcher "Bash" in settings.json prevents + # this hook from spawning for non-Bash tools. 
command = data.get("tool_input", {}).get("command", "") diff --git a/hooks/post-tool-lint-hint.py b/hooks/post-tool-lint-hint.py index 87f9611..f93a012 100755 --- a/hooks/post-tool-lint-hint.py +++ b/hooks/post-tool-lint-hint.py @@ -69,14 +69,8 @@ def main(): event_data = read_stdin(timeout=2) event = json.loads(event_data) - # Check this is PostToolUse for Write or Edit - event_type = event.get("hook_event_name") or event.get("type", "") - if event_type != "PostToolUse": - return - - tool_name = event.get("tool_name", "") - if tool_name not in ("Write", "Edit"): - return + # tool_name/event_type filters removed — matcher "Write|Edit" in settings.json + # prevents this hook from spawning for non-matching tools. # Get the file path from tool input tool_input = event.get("tool_input", {}) diff --git a/hooks/posttool-security-scan.py b/hooks/posttool-security-scan.py index 8270b56..3fd0796 100755 --- a/hooks/posttool-security-scan.py +++ b/hooks/posttool-security-scan.py @@ -143,13 +143,8 @@ def main() -> None: raw = read_stdin(timeout=2) event = json.loads(raw) - event_type = event.get("hook_event_name") or event.get("type", "") - if event_type != "PostToolUse": - return - - tool_name = event.get("tool_name", "") - if tool_name not in ("Write", "Edit"): - return + # tool_name/event_type filters removed — matcher "Write|Edit" in settings.json + # prevents this hook from spawning for non-matching tools. tool_input = event.get("tool_input", {}) file_path = tool_input.get("file_path", "") diff --git a/hooks/posttool-session-reads.py b/hooks/posttool-session-reads.py index f1b2f62..a18400c 100755 --- a/hooks/posttool-session-reads.py +++ b/hooks/posttool-session-reads.py @@ -48,10 +48,8 @@ def main() -> None: event = json.loads(event_data) - # Only process Read tool results - tool_name = event.get("tool_name", "") - if tool_name != "Read": - return + # tool_name filter removed — matcher "Read" in settings.json prevents + # this hook from spawning for non-Read tools. 
# Extract file_path from tool_input tool_input = event.get("tool_input", {}) diff --git a/hooks/pretool-adr-creation-gate.py b/hooks/pretool-adr-creation-gate.py index 075c79a..a1bfd1d 100644 --- a/hooks/pretool-adr-creation-gate.py +++ b/hooks/pretool-adr-creation-gate.py @@ -70,10 +70,8 @@ def main() -> None: except (json.JSONDecodeError, ValueError): sys.exit(0) - # Only gate Write — edits to existing files are fine. - tool_name = event.get("tool_name", "") - if tool_name != "Write": - sys.exit(0) + # tool_name filter removed — matcher "Write" in settings.json prevents + # this hook from spawning for non-Write tools. # Bypass env var. if os.environ.get(_BYPASS_ENV) == "1": diff --git a/hooks/pretool-branch-safety.py b/hooks/pretool-branch-safety.py index 406dd58..5706a1e 100644 --- a/hooks/pretool-branch-safety.py +++ b/hooks/pretool-branch-safety.py @@ -60,9 +60,8 @@ def main() -> None: except (json.JSONDecodeError, ValueError): sys.exit(0) - tool_name = event.get("tool_name", "") - if tool_name != "Bash": - sys.exit(0) + # tool_name filter removed — matcher "Bash" in settings.json prevents + # this hook from spawning for non-Bash tools. command = event.get("tool_input", {}).get("command", "") if "git commit" not in command: diff --git a/hooks/pretool-creation-gate.py b/hooks/pretool-creation-gate.py index 4d4e506..2b554a4 100644 --- a/hooks/pretool-creation-gate.py +++ b/hooks/pretool-creation-gate.py @@ -4,12 +4,12 @@ PreToolUse:Write Hook: Creation Gate Blocks direct creation of new agent/skill files that bypass the -skill-creator-engineer pipeline. Forces the LLM to route through +skill-creator pipeline. Forces the LLM to route through proper creation workflows that produce full-depth components. This is a HARD GATE — it physically prevents the Write tool from creating new agent or skill files. The LLM receives a [fix-with-agent] directive -telling it to use skill-creator-engineer. +telling it to use skill-creator. 
Detection logic: - Tool is Write (not Edit — edits to existing files are allowed) @@ -82,9 +82,9 @@ def main() -> None: # Block: new agent or skill file being created outside the creator pipeline component_type = "agent" if is_agent else "skill" print( - f"[creation-gate] BLOCKED: New {component_type} must be created via skill-creator-engineer or skill-creation-pipeline.\n" + f"[creation-gate] BLOCKED: New {component_type} must be created via skill-creator or skill-creation-pipeline.\n" f"[creation-gate] Path: {file_path}\n" - f"[fix-with-agent] skill-creator-engineer", + f"[fix-with-agent] skill-creator", file=sys.stderr, ) sys.exit(2) diff --git a/hooks/pretool-file-backup.py b/hooks/pretool-file-backup.py index dab630a..9470068 100755 --- a/hooks/pretool-file-backup.py +++ b/hooks/pretool-file-backup.py @@ -49,9 +49,8 @@ def main() -> None: except (json.JSONDecodeError, ValueError): sys.exit(0) - tool_name = event.get("tool_name", "") - if tool_name != "Edit": - sys.exit(0) + # tool_name filter removed — matcher "Edit" in settings.json prevents + # this hook from spawning for non-Edit tools. tool_input = event.get("tool_input", {}) file_path = tool_input.get("file_path", "") diff --git a/hooks/pretool-learning-injector.py b/hooks/pretool-learning-injector.py index df5f982..5216335 100755 --- a/hooks/pretool-learning-injector.py +++ b/hooks/pretool-learning-injector.py @@ -31,9 +31,6 @@ EVENT_NAME = "PreToolUse" -# Tools that benefit from proactive learning injection -TARGET_TOOLS = {"Bash", "Edit"} - # Max characters in the injected context to stay lightweight MAX_CONTEXT_CHARS = 500 @@ -160,11 +157,9 @@ def main(): event = json.loads(event_data) - # Early exit for non-target tools + # tool_name filter removed — matcher "Bash|Edit" in settings.json prevents + # this hook from spawning for non-matching tools. 
tool_name = event.get("tool_name", "") - if tool_name not in TARGET_TOOLS: - empty_output(EVENT_NAME).print_and_exit() - tool_input = event.get("tool_input", {}) # Extract tags based on tool type diff --git a/hooks/pretool-plan-gate.py b/hooks/pretool-plan-gate.py index 04c7398..2b2fa0b 100644 --- a/hooks/pretool-plan-gate.py +++ b/hooks/pretool-plan-gate.py @@ -54,9 +54,8 @@ def main() -> None: except (json.JSONDecodeError, ValueError): sys.exit(0) - tool_name = event.get("tool_name", "") - if tool_name not in ("Write", "Edit"): - sys.exit(0) + # tool_name filter removed — matcher "Write|Edit" in settings.json prevents + # this hook from spawning for non-matching tools. # Bypass env var — set by the plans skill itself. if os.environ.get(_BYPASS_ENV) == "1": diff --git a/hooks/pretool-prompt-injection-scanner.py b/hooks/pretool-prompt-injection-scanner.py index 88348d6..d3502ae 100644 --- a/hooks/pretool-prompt-injection-scanner.py +++ b/hooks/pretool-prompt-injection-scanner.py @@ -268,11 +268,9 @@ def main() -> None: print(f"[injection-scanner] JSON parse failed: {e}", file=sys.stderr) empty_output(EVENT_NAME).print_and_exit() - # Field name compatibility: try new names first, fall back to old + # tool_name filter removed — matcher "Write|Edit" in settings.json prevents + # this hook from spawning for non-matching tools. 
tool = event.get("tool_name") or event.get("tool", "") - if tool not in ("Write", "Edit"): - empty_output(EVENT_NAME).print_and_exit() - tool_input = event.get("tool_input", event.get("input", {})) file_path = tool_input.get("file_path", "") if not file_path: diff --git a/hooks/pretool-subagent-warmstart.py b/hooks/pretool-subagent-warmstart.py index 2a1a871..1da4886 100755 --- a/hooks/pretool-subagent-warmstart.py +++ b/hooks/pretool-subagent-warmstart.py @@ -251,10 +251,8 @@ def main() -> None: event = json.loads(event_data) - # Only process Agent tool invocations - tool_name = event.get("tool_name", "") - if tool_name != "Agent": - return + # tool_name filter removed — matcher "Agent" in settings.json prevents + # this hook from spawning for non-Agent tools. # Gather context from various sources files = load_recent_reads(Path(SESSION_READS_FILE)) diff --git a/hooks/pretool-synthesis-gate.py b/hooks/pretool-synthesis-gate.py index 086932b..f092066 100755 --- a/hooks/pretool-synthesis-gate.py +++ b/hooks/pretool-synthesis-gate.py @@ -123,9 +123,8 @@ def main() -> None: except (json.JSONDecodeError, ValueError): sys.exit(0) - tool_name = event.get("tool_name", "") - if tool_name not in ("Write", "Edit"): - sys.exit(0) + # tool_name filter removed — matcher "Write|Edit" in settings.json prevents + # this hook from spawning for non-matching tools. # Bypass env var — set by the consultation skill itself. 
if os.environ.get(_BYPASS_ENV) == "1": diff --git a/hooks/pretool-unified-gate.py b/hooks/pretool-unified-gate.py index 81d6751..79b0cfe 100644 --- a/hooks/pretool-unified-gate.py +++ b/hooks/pretool-unified-gate.py @@ -295,9 +295,9 @@ def check_creation_gate(file_path: str) -> None: component_type = "agent" if is_agent else "skill" _block( - f"[creation-gate] BLOCKED: New {component_type} must be created via skill-creator-engineer or skill-creation-pipeline.\n" + f"[creation-gate] BLOCKED: New {component_type} must be created via skill-creator or skill-creation-pipeline.\n" f"[creation-gate] Path: {file_path}\n" - f"[fix-with-agent] skill-creator-engineer" + f"[fix-with-agent] skill-creator" ) diff --git a/hooks/record-activation.py b/hooks/record-activation.py index 9ac1cb3..e52fdfb 100644 --- a/hooks/record-activation.py +++ b/hooks/record-activation.py @@ -28,18 +28,14 @@ from hook_utils import get_session_id from stdin_timeout import read_stdin -# Tools that represent meaningful work completing successfully -TRACKED_TOOLS = {"Edit", "Write", "Bash"} - def main() -> None: """Record session activation stats on successful tool completions.""" try: hook_input = json.loads(read_stdin(timeout=2)) - tool_name = hook_input.get("tool_name", "") - if tool_name not in TRACKED_TOOLS: - return + # tool_name filter removed — matcher "Edit|Write|Bash" in settings.json + # prevents this hook from spawning for non-matching tools. 
tool_result = hook_input.get("tool_result", {}) if tool_result.get("is_error", False): diff --git a/hooks/retro-graduation-gate.py b/hooks/retro-graduation-gate.py index 76bcc3c..f7900b7 100644 --- a/hooks/retro-graduation-gate.py +++ b/hooks/retro-graduation-gate.py @@ -30,16 +30,8 @@ def main() -> None: empty_output(EVENT).print_and_exit(0) return - # Event type guard (defensive — matches peer hook pattern) - event_type = data.get("hook_event_name") or data.get("type", "") - if event_type and event_type != EVENT: - empty_output(EVENT).print_and_exit(0) - return - - # Early-exit: only care about Bash tool (PostToolUse schema: tool_name) - if data.get("tool_name") != "Bash": - empty_output(EVENT).print_and_exit(0) - return + # tool_name/event_type filters removed — matcher "Bash" in settings.json + # prevents this hook from spawning for non-Bash tools. # Early-exit: check if output indicates a PR was created (PostToolUse schema: tool_result.output) tool_result = data.get("tool_result", {}) diff --git a/hooks/review-capture.py b/hooks/review-capture.py index 724f1db..9883a06 100644 --- a/hooks/review-capture.py +++ b/hooks/review-capture.py @@ -117,10 +117,8 @@ def main() -> None: event = json.loads(event_data) - # Only process Agent tool results - tool_name = event.get("tool_name", "") - if tool_name != "Agent": - return + # tool_name filter removed — matcher "Agent" in settings.json prevents + # this hook from spawning for non-Agent tools. 
# Get tool result text tool_result = event.get("tool_result", "") diff --git a/hooks/skill-evaluator.py b/hooks/skill-evaluator.py index d142510..39402d5 100644 --- a/hooks/skill-evaluator.py +++ b/hooks/skill-evaluator.py @@ -43,7 +43,7 @@ "testing-automation-engineer": "Unit/E2E tests, Playwright, CI pipelines", # Meta/Creation "agent-creator-engineer": "Create new specialized agents", - "skill-creator-engineer": "Create new Claude skills", + "skill-creator": "Create new Claude skills", "hook-development-engineer": "Create Claude Code hooks, event handlers", "mcp-local-docs-engineer": "Build MCP servers for documentation", # Coordination @@ -151,7 +151,7 @@ def get_evaluation_prompt(complexity: str) -> str: - Docs: technical-documentation-engineer, technical-journalist-writer - UI: ui-design-engineer, performance-optimization-engineer - Testing: testing-automation-engineer -- Meta: agent-creator-engineer, skill-creator-engineer, hook-development-engineer +- Meta: agent-creator-engineer, skill-creator, hook-development-engineer - Research: research-coordinator-engineer, project-coordinator-engineer - Critique: roast skill (5 personas: contrarian, newcomer, builder, senior, pedant)""" diff --git a/hooks/tests/test_post_tool_lint.py b/hooks/tests/test_post_tool_lint.py index 70ae6d3..88102b1 100755 --- a/hooks/tests/test_post_tool_lint.py +++ b/hooks/tests/test_post_tool_lint.py @@ -94,7 +94,11 @@ def test_ignores_non_lintable_files(): def test_ignores_read_tool(): - """Hook should only trigger for Write/Edit, not Read.""" + """Read tool filtering is now handled by matcher 'Write|Edit' in settings.json. + + When called directly (without matcher), the hook processes any tool_name. + This test verifies the hook still exits 0 (non-blocking) for any input. 
+ """ setup() event = { "type": "PostToolUse", @@ -104,7 +108,7 @@ def test_ignores_read_tool(): stdout, stderr, code = run_hook(event) assert code == 0 - assert stdout == "" + # Note: hook may produce output since tool_name filter was moved to matcher def test_handles_missing_file_path(): diff --git a/hooks/tests/test_posttool_session_reads.py b/hooks/tests/test_posttool_session_reads.py index 8e0fc05..6105971 100644 --- a/hooks/tests/test_posttool_session_reads.py +++ b/hooks/tests/test_posttool_session_reads.py @@ -51,30 +51,23 @@ def run_hook(event: dict) -> tuple[str, str, int]: class TestToolNameFiltering: """Only Read tool events should be processed.""" - def test_ignores_write_tool(self, tmp_path, monkeypatch): - """Write tool events should produce no output and no file.""" - monkeypatch.chdir(tmp_path) - event = { - "tool_name": "Write", - "tool_input": {"file_path": "/some/file.py"}, - } - stdout, stderr, code = run_hook(event) - assert code == 0 - # No session-reads.txt should be created - assert not (tmp_path / ".claude" / "session-reads.txt").exists() + def test_nonread_tool_exits_zero(self, tmp_path, monkeypatch): + """Non-Read tool filtering is now handled by matcher 'Read' in settings.json. - def test_ignores_edit_tool(self, tmp_path, monkeypatch): - """Edit tool events should be ignored.""" + When called directly (without matcher), the hook processes any tool_name. + This test verifies the hook still exits 0 (non-blocking) for any input. 
+ """ monkeypatch.chdir(tmp_path) - event = { - "tool_name": "Edit", - "tool_input": {"file_path": "/some/file.py"}, - } - stdout, stderr, code = run_hook(event) - assert code == 0 + for tool in ("Write", "Edit", "Bash"): + event = { + "tool_name": tool, + "tool_input": {"file_path": "/some/file.py"} if tool != "Bash" else {"command": "ls"}, + } + stdout, stderr, code = run_hook(event) + assert code == 0 def test_ignores_bash_tool(self, tmp_path, monkeypatch): - """Bash tool events should be ignored.""" + """Bash tool events should be ignored (no file_path to extract).""" monkeypatch.chdir(tmp_path) event = { "tool_name": "Bash", diff --git a/hooks/tests/test_pretool_subagent_warmstart.py b/hooks/tests/test_pretool_subagent_warmstart.py index f8a1b51..62da3c1 100644 --- a/hooks/tests/test_pretool_subagent_warmstart.py +++ b/hooks/tests/test_pretool_subagent_warmstart.py @@ -58,28 +58,19 @@ def run_hook(event: dict) -> tuple[str, str, int]: class TestToolNameFiltering: """Only Agent tool events should be processed.""" - def test_ignores_read_tool(self): - """Read tool events should produce no context output.""" - event = {"tool_name": "Read", "tool_input": {"file_path": "/x"}} - stdout, stderr, code = run_hook(event) - assert code == 0 - # Should be empty or empty hook output (no warmstart context) - if stdout.strip(): - output = json.loads(stdout) - hook_out = output.get("hookSpecificOutput", {}) - assert "additionalContext" not in hook_out or "[warmstart]" not in hook_out.get("additionalContext", "") - - def test_ignores_write_tool(self): - """Write tool events should be ignored.""" - event = {"tool_name": "Write", "tool_input": {"file_path": "/x"}} - stdout, stderr, code = run_hook(event) - assert code == 0 - - def test_ignores_bash_tool(self): - """Bash tool events should be ignored.""" - event = {"tool_name": "Bash", "tool_input": {"command": "ls"}} - stdout, stderr, code = run_hook(event) - assert code == 0 + def test_nonagent_tools_exit_zero(self): + 
"""Non-Agent tool filtering is now handled by matcher 'Agent' in settings.json. + + When called directly (without matcher), the hook processes any tool_name. + This test verifies the hook still exits 0 (non-blocking) for any input. + """ + for tool, tool_input in [ + ("Read", {"file_path": "/x"}), + ("Write", {"file_path": "/x"}), + ("Bash", {"command": "ls"}), + ]: + stdout, stderr, code = run_hook({"tool_name": tool, "tool_input": tool_input}) + assert code == 0 def test_processes_agent_tool(self, tmp_path, monkeypatch): """Agent tool events should produce warmstart context.""" diff --git a/hooks/usage-tracker.py b/hooks/usage-tracker.py index 6ea3847..73626de 100644 --- a/hooks/usage-tracker.py +++ b/hooks/usage-tracker.py @@ -32,17 +32,10 @@ def main(): event = json.loads(event_data) - # Only process PostToolUse events - event_type = event.get("hook_event_name") or event.get("type", "") - if event_type != "PostToolUse": - return - + # tool_name/event_type filters removed — matcher "Skill|Agent" in settings.json + # prevents this hook from spawning for non-matching tools. 
tool_name = event.get("tool_name", "") - # Only track Skill and Agent tools — exit silently for everything else - if tool_name not in ("Skill", "Agent"): - return - # Lazy import — only loaded when we actually need to record from hook_utils import get_project_dir, get_session_id from usage_db import record_agent, record_skill diff --git a/pipelines/INDEX.json b/pipelines/INDEX.json index 464d163..13a4e88 100644 --- a/pipelines/INDEX.json +++ b/pipelines/INDEX.json @@ -27,7 +27,7 @@ "agent-evaluation", "system-upgrade" ], - "agent": "skill-creator-engineer" + "agent": "skill-creator" }, "article-evaluation-pipeline": { "file": "pipelines/article-evaluation-pipeline/SKILL.md", @@ -626,7 +626,7 @@ "agent-evaluation", "routing-table-updater" ], - "agent": "skill-creator-engineer" + "agent": "skill-creator" }, "system-upgrade": { "file": "pipelines/system-upgrade/SKILL.md", diff --git a/pipelines/agent-upgrade/SKILL.md b/pipelines/agent-upgrade/SKILL.md index 5230821..69032bd 100644 --- a/pipelines/agent-upgrade/SKILL.md +++ b/pipelines/agent-upgrade/SKILL.md @@ -10,7 +10,7 @@ description: | version: 1.0.0 user-invocable: false argument-hint: "" -agent: skill-creator-engineer +agent: skill-creator allowed-tools: - Read - Bash diff --git a/pipelines/pipeline-scaffolder/references/architecture-rules.md b/pipelines/pipeline-scaffolder/references/architecture-rules.md index 29afef7..fd34cd1 100644 --- a/pipelines/pipeline-scaffolder/references/architecture-rules.md +++ b/pipelines/pipeline-scaffolder/references/architecture-rules.md @@ -84,7 +84,7 @@ Phase 1: DISCOVER (sequential — needs full context) ↓ Phase 2: SCAFFOLD (fan-out — group by creator type) ├─ agent-creator-engineer: Agent A, Agent B, Agent C (1..N) - ├─ skill-creator-engineer: Skill X, Skill Y (1..M) + ├─ skill-creator: Skill X, Skill Y (1..M) ├─ hook-development-engineer: Hook 1, Hook 2 (1..K) └─ Direct: Script 1, Script 2 (1..J) ↓ (fan-in — wait for all) diff --git 
a/pipelines/skill-creation-pipeline/SKILL.md b/pipelines/skill-creation-pipeline/SKILL.md index 6a1e0f5..f3a37fc 100644 --- a/pipelines/skill-creation-pipeline/SKILL.md +++ b/pipelines/skill-creation-pipeline/SKILL.md @@ -8,7 +8,7 @@ description: | Use for "create skill pipeline", "new skill formal", "skill with gates". version: 1.0.0 user-invocable: false -agent: skill-creator-engineer +agent: skill-creator allowed-tools: - Read - Bash @@ -38,7 +38,7 @@ routing: ## Operator Context -This pipeline wraps `skill-creator-engineer` with explicit discovery, design +This pipeline wraps `skill-creator` with explicit discovery, design review, and validation gates. It is the **formal path** for creating new skills — as opposed to ad-hoc creation — and should be used whenever skill quality, uniqueness, or routing correctness is important. The pipeline does not replace @@ -187,7 +187,7 @@ DESIGN BRIEF: [skill-name] ========================== Complexity Tier: [Simple | Medium | Complex | Comprehensive] -Agent Binding: skill-creator-engineer (default) or [other agent if domain-specific] +Agent Binding: skill-creator (default) or [other agent if domain-specific] User-Invocable: [true | false] Phases: @@ -323,7 +323,7 @@ Read the current INDEX.json and append an entry for the new skill: "path": "skills/skill-name/SKILL.md", "description": "[first line of the frontmatter description]", "user-invocable": true, - "agent": "skill-creator-engineer" + "agent": "skill-creator" } ``` diff --git a/pipelines/system-upgrade/SKILL.md b/pipelines/system-upgrade/SKILL.md index 55f7f9c..7bb5b02 100644 --- a/pipelines/system-upgrade/SKILL.md +++ b/pipelines/system-upgrade/SKILL.md @@ -46,7 +46,7 @@ complementing the **bottom-up** retro-knowledge-injector. ### Hardcoded Behaviors (Always Apply) - **Show Plan Before Implementing**: Phase 3 output (ranked upgrade list) MUST be presented to the user and approved before Phase 4 begins. Never silently execute upgrades. 
-- **Reuse Domain Agents**: Phase 4 (IMPLEMENT) dispatches to existing domain agents (skill-creator-engineer, agent-creator-engineer, hook-development-engineer, golang-general-engineer, etc.). The upgrade engineer orchestrates; specialists execute. +- **Reuse Domain Agents**: Phase 4 (IMPLEMENT) dispatches to existing domain agents (skill-creator, agent-creator-engineer, hook-development-engineer, golang-general-engineer, etc.). The upgrade engineer orchestrates; specialists execute. - **Parallel Fan-Out**: When 3+ components need the same type of upgrade, dispatch in parallel using multiple Agent tool calls in a single message. - **Score Delta Required**: Phase 5 (VALIDATE) must produce before/after evaluation delta, not just "looks good." Use `agent-evaluation` skill. - **Trigger Type Determines Input**: The three trigger types (claude-release, goal-change, retro-driven) require different input parsing in Phase 1. @@ -202,7 +202,7 @@ IMPORTANT (should fix): 4. skills/go-testing/SKILL.md — Apply new pattern from retro L2 [inject-pattern, ~10min] MINOR (nice to have): - 5. agents/skill-creator-engineer.md — Add new frontmatter field docs [upgrade, ~5min] + 5. agents/skill-creator.md — Add new frontmatter field docs [upgrade, ~5min] Total: 5 changes across 5 components Parallel dispatch: 3 groups (hooks, agents, skills) @@ -232,10 +232,10 @@ git checkout -b chore/system-upgrade-$(date +%Y-%m-%d) | Change Domain | Domain Agent | |--------------|-------------| | Hook modifications | hook-development-engineer | -| Agent upgrades | agent-creator-engineer (or skill-creator-engineer for agents) | -| Skill upgrades | skill-creator-engineer | +| Agent upgrades | agent-creator-engineer (or skill-creator for agents) | +| Skill upgrades | skill-creator | | Routing changes | routing-table-updater | -| Pattern injection | skill-creator-engineer or direct Edit | +| Pattern injection | skill-creator or direct Edit | **Step 2**: Dispatch parallel agents for independent groups. 
Use a single message with multiple Agent tool calls for changes that don't depend on each other. @@ -365,7 +365,7 @@ Solution: Manually copy modified files to `~/.claude/` equivalent directories. R ### Anti-Pattern 2: Handling All Changes Directly Instead of Dispatching **What it looks like**: Making all edits inline rather than routing to domain agents -**Why wrong**: Domain agents (skill-creator-engineer, hook-development-engineer) know the templates and anti-patterns for their domain +**Why wrong**: Domain agents (skill-creator, hook-development-engineer) know the templates and anti-patterns for their domain **Do instead**: Dispatch to domain agents for anything beyond simple pattern injection ### Anti-Pattern 3: Auditing Everything Every Time diff --git a/scripts/audit-tool-restrictions.py b/scripts/audit-tool-restrictions.py index 6f5e886..1e0301e 100644 --- a/scripts/audit-tool-restrictions.py +++ b/scripts/audit-tool-restrictions.py @@ -131,7 +131,7 @@ "python-openstack-engineer": "code-modifier", "rabbitmq-messaging-engineer": "code-modifier", "react-portfolio-engineer": "code-modifier", - "skill-creator-engineer": "code-modifier", + "skill-creator": "code-modifier", "sqlite-peewee-engineer": "code-modifier", "testing-automation-engineer": "code-modifier", "typescript-debugging-engineer": "code-modifier", diff --git a/scripts/routing-benchmark.json b/scripts/routing-benchmark.json index f41d1a9..7f80cdc 100644 --- a/scripts/routing-benchmark.json +++ b/scripts/routing-benchmark.json @@ -284,10 +284,9 @@ }, { "request": "create a new Claude Code skill with quality gates", - "expected_agent": "skill-creator-engineer", - "expected_skill": "skill-creation-pipeline", + "expected_skill": "skill-creator", "category": "meta-tooling", - "notes": "Skill creation — agent + pipeline pairing" + "notes": "Skill creation — skill-creator handles the full eval-driven workflow" }, { "request": "create a new hook for PostToolUse events", diff --git a/skills/INDEX.json 
b/skills/INDEX.json index 385355c..c9bc4de 100644 --- a/skills/INDEX.json +++ b/skills/INDEX.json @@ -1,6 +1,6 @@ { "version": "2.0", - "generated": "2026-03-25T23:05:47Z", + "generated": "2026-03-27T03:14:10Z", "generated_by": "scripts/generate-skill-index.py", "skills": { "adr-consultation": { @@ -16,7 +16,7 @@ "adr consultation" ], "category": "meta", - "user_invocable": true, + "user_invocable": false, "version": "1.0.0", "pairs_with": [ "feature-design", @@ -121,7 +121,7 @@ "find unused" ], "category": "code-quality", - "user_invocable": true, + "user_invocable": false, "version": "2.0.0" }, "code-linting": { @@ -206,26 +206,27 @@ }, "content-engine": { "file": "skills/content-engine/SKILL.md", - "description": "Repurpose a source asset into platform-native social content variants for X, LinkedIn, TikTok, YouTube, and newsletter. Produces content_ideas.md and content_drafts.md with a quality gate before delivery.", + "description": "Repurpose a source asset (article, demo, launch note, insight) into platform-native social content variants.", "triggers": [ "repurpose this", "adapt for social", "turn this into posts", "content from article", "content from demo", + "content from doc", "write variants for", "social content from", "platform variants", "repurpose for" ], "category": "content", - "user_invocable": true, + "user_invocable": false, "version": "1.0.0", "pairs_with": [ "x-api", "crosspost" ], - "disambiguate": "voice-writer" + "model": "sonnet" }, "create-voice": { "file": "skills/create-voice/SKILL.md", @@ -241,7 +242,7 @@ ], "category": "content", "force_route": true, - "user_invocable": true, + "user_invocable": false, "version": "1.0.0", "pairs_with": [ "voice-calibrator", @@ -358,7 +359,7 @@ "10 perspectives" ], "category": "meta-tooling", - "user_invocable": true, + "user_invocable": false, "version": "2.0.0" }, "docs-sync-checker": { @@ -385,16 +386,15 @@ "POM", "test flakiness" ], - "category": "testing", - "user_invocable": true, + 
"user_invocable": false, "version": "1.0.0", - "agent": "testing-automation-engineer", - "model": "sonnet", "pairs_with": [ "testing-automation-engineer", "typescript-frontend-engineer", "test-driven-development" - ] + ], + "agent": "testing-automation-engineer", + "model": "sonnet" }, "endpoint-validator": { "file": "skills/endpoint-validator/SKILL.md", @@ -425,7 +425,7 @@ ], "category": "process", "force_route": true, - "user_invocable": true, + "user_invocable": false, "version": "1.0.0", "pairs_with": [] }, @@ -442,7 +442,7 @@ ], "category": "process", "force_route": true, - "user_invocable": true, + "user_invocable": false, "version": "2.0.0", "pairs_with": [ "feature-plan", @@ -461,7 +461,7 @@ ], "category": "process", "force_route": true, - "user_invocable": true, + "user_invocable": false, "version": "2.0.0", "pairs_with": [ "feature-plan", @@ -481,7 +481,7 @@ ], "category": "process", "force_route": true, - "user_invocable": true, + "user_invocable": false, "version": "2.0.0", "pairs_with": [ "feature-design", @@ -501,7 +501,7 @@ ], "category": "process", "force_route": true, - "user_invocable": true, + "user_invocable": false, "version": "2.0.0", "pairs_with": [ "feature-validate", @@ -521,7 +521,7 @@ ], "category": "process", "force_route": true, - "user_invocable": true, + "user_invocable": false, "version": "2.0.0", "pairs_with": [ "feature-implement", @@ -553,32 +553,6 @@ "version": "2.0.0", "pairs_with": [] }, - "frontend-slides": { - "file": "skills/frontend-slides/SKILL.md", - "description": "Browser-based HTML presentation generation with viewport-fit enforcement, curated style presets, and deterministic overflow validation. 
Three paths: new build, PPTX-to-HTML conversion, or HTML deck enhancement.", - "triggers": [ - "HTML slides", - "browser presentation", - "web deck", - "reveal-style", - "viewport presentation", - "convert PPTX to web", - "convert PPTX to HTML", - "slides for a browser", - "kiosk presentation", - "interactive presentation keyboard", - "projector browser" - ], - "category": "frontend", - "user_invocable": true, - "version": "1.0.0", - "agent": "typescript-frontend-engineer", - "model": "sonnet", - "pairs_with": [ - "typescript-frontend-engineer", - "pptx-generator" - ] - }, "forensics": { "file": "skills/forensics/SKILL.md", "description": "Post-mortem diagnostic analysis of failed or stuck workflows.", @@ -597,7 +571,7 @@ "incident review" ], "category": "process", - "user_invocable": true, + "user_invocable": false, "version": "1.0.0", "pairs_with": [ "systematic-debugging", @@ -605,6 +579,32 @@ "plan-checker" ] }, + "frontend-slides": { + "file": "skills/frontend-slides/SKILL.md", + "description": "Browser-based HTML presentation generation with viewport-fit enforcement.", + "triggers": [ + "HTML slides", + "browser presentation", + "web deck", + "reveal-style", + "viewport presentation", + "convert PPTX to web", + "convert PPTX to HTML", + "slides for a browser", + "kiosk presentation", + "interactive presentation keyboard", + "projector browser" + ], + "category": "frontend", + "user_invocable": false, + "version": "1.0.0", + "pairs_with": [ + "typescript-frontend-engineer", + "pptx-generator" + ], + "agent": "typescript-frontend-engineer", + "model": "sonnet" + }, "full-repo-review": { "file": "skills/full-repo-review/SKILL.md", "description": "Run comprehensive 3-wave review against all source files in the repo, producing a prioritized issue backlog.", @@ -654,7 +654,7 @@ "make claude.md" ], "category": "documentation", - "user_invocable": true, + "user_invocable": false, "version": "1.0.0", "pairs_with": [ "go-sapcc-conventions", @@ -667,11 +667,7 @@ 
"triggers": [ "commit", "stage and commit", - "commit changes", - "save my work", - "commit this", - "save progress", - "checkpoint" + "commit changes" ], "category": "git-workflow", "force_route": true, @@ -703,7 +699,7 @@ "github inbox" ], "category": "github", - "user_invocable": true, + "user_invocable": false, "version": "1.0.0", "pairs_with": [], "model": "sonnet" @@ -875,7 +871,7 @@ "headless agent" ], "category": "process", - "user_invocable": true, + "user_invocable": false, "version": "1.0.0", "agent": "python-general-engineer" }, @@ -940,7 +936,7 @@ "wiring check" ], "category": "process", - "user_invocable": true, + "user_invocable": false, "version": "1.0.0", "pairs_with": [ "feature-implement", @@ -961,7 +957,7 @@ "reframe positively" ], "category": "content", - "user_invocable": true, + "user_invocable": false, "version": "1.0.0", "pairs_with": [ "voice-writer", @@ -969,6 +965,54 @@ "voice-validator" ] }, + "kotlin-coroutines": { + "file": "skills/kotlin-coroutines/SKILL.md", + "description": "Kotlin structured concurrency, Flow, Channel, and cancellation patterns", + "triggers": [ + "kotlin-coroutines", + "kotlin", + "coroutines" + ], + "user_invocable": false, + "version": "1.0.0", + "agent": "general-purpose" + }, + "kotlin-testing": { + "file": "skills/kotlin-testing/SKILL.md", + "description": "Kotlin testing patterns with JUnit 5, Kotest, and coroutine test dispatchers", + "triggers": [ + "kotlin-testing", + "kotlin", + "testing" + ], + "user_invocable": false, + "version": "1.0.0", + "agent": "general-purpose" + }, + "kubernetes-debugging": { + "file": "skills/kubernetes-debugging/SKILL.md", + "description": "Kubernetes debugging methodology for pod failures, networking issues, and resource problems", + "triggers": [ + "kubernetes-debugging", + "kubernetes", + "debugging" + ], + "user_invocable": false, + "version": "1.0.0", + "agent": "kubernetes-helm-engineer" + }, + "kubernetes-security": { + "file": "skills/kubernetes-security/SKILL.md", + 
"description": "Kubernetes security patterns including RBAC, PodSecurityStandards, network policies, and secret management", + "triggers": [ + "kubernetes-security", + "kubernetes", + "security" + ], + "user_invocable": false, + "version": "1.0.0", + "agent": "kubernetes-helm-engineer" + }, "learn": { "file": "skills/learn/SKILL.md", "description": "Manually teach Claude Code an error pattern and its solution, storing it in the learning database with high confidence.", @@ -978,7 +1022,7 @@ "manual learning entry" ], "category": "meta-tooling", - "user_invocable": true, + "user_invocable": false, "version": "2.0.0" }, "link-auditor": { @@ -1058,7 +1102,7 @@ "wrap up session" ], "category": "process", - "user_invocable": true, + "user_invocable": false, "version": "1.0.0", "pairs_with": [ "resume-work" @@ -1167,7 +1211,7 @@ "first-time Perses setup" ], "category": "perses", - "user_invocable": true, + "user_invocable": false, "version": "2.0.0", "agent": "perses-dashboard-engineer" }, @@ -1232,6 +1276,30 @@ "version": "2.0.0", "agent": "perses-dashboard-engineer" }, + "php-quality": { + "file": "skills/php-quality/SKILL.md", + "description": "PHP code quality patterns including PSR standards, strict types, and framework idioms", + "triggers": [ + "php-quality", + "php", + "quality" + ], + "user_invocable": false, + "version": "1.0.0", + "agent": "general-purpose" + }, + "php-testing": { + "file": "skills/php-testing/SKILL.md", + "description": "PHP testing patterns with PHPUnit, test doubles, and database testing", + "triggers": [ + "php-testing", + "php", + "testing" + ], + "user_invocable": false, + "version": "1.0.0", + "agent": "general-purpose" + }, "plan-checker": { "file": "skills/plan-checker/SKILL.md", "description": "Validate plans against 10 verification dimensions before execution begins.", @@ -1245,7 +1313,7 @@ "pre-execution check" ], "category": "process", - "user_invocable": true, + "user_invocable": false, "version": "1.0.0", "pairs_with": [ 
"feature-plan", @@ -1301,7 +1369,7 @@ "plant-seed" ], "category": "process", - "user_invocable": true, + "user_invocable": false, "version": "1.0.0", "pairs_with": [ "feature-design" @@ -1352,7 +1420,7 @@ "prune branches" ], "category": "git-workflow", - "user_invocable": true, + "user_invocable": false, "version": "2.0.0" }, "pr-fix": { @@ -1364,7 +1432,7 @@ "pr-fix" ], "category": "git-workflow", - "user_invocable": true, + "user_invocable": false, "version": "2.0.0" }, "pr-miner": { @@ -1397,7 +1465,7 @@ "address review comments" ], "category": "git-workflow", - "user_invocable": true, + "user_invocable": false, "version": "2.0.0" }, "pr-status": { @@ -1410,7 +1478,7 @@ ], "category": "git-workflow", "force_route": true, - "user_invocable": true, + "user_invocable": false, "version": "2.0.0" }, "pr-sync": { @@ -1420,11 +1488,7 @@ "push", "push changes", "create PR", - "sync to GitHub", - "open a pull request", - "make a PR", - "submit PR", - "push and PR" + "sync to GitHub" ], "category": "git-workflow", "force_route": true, @@ -1445,7 +1509,7 @@ ], "category": "process", "force_route": true, - "user_invocable": true, + "user_invocable": false, "version": "1.0.0", "pairs_with": [ "feature-design", @@ -1542,7 +1606,7 @@ "Reddit reports" ], "category": "process", - "user_invocable": true, + "user_invocable": false, "version": "1.0.0", "agent": "python-general-engineer" }, @@ -1558,7 +1622,7 @@ "read every file in repo" ], "category": "analysis", - "user_invocable": true, + "user_invocable": false, "version": "1.0.0", "pairs_with": [ "explore-pipeline" @@ -1609,7 +1673,7 @@ "poke holes in this" ], "category": "analysis", - "user_invocable": true, + "user_invocable": false, "version": "2.0.0" }, "routing-table-updater": { @@ -1635,7 +1699,7 @@ "sapcc standards check" ], "category": "language", - "user_invocable": true, + "user_invocable": false, "version": "2.0.0", "pairs_with": [ "golang-general-engineer", @@ -1656,7 +1720,7 @@ "review sapcc standards" ], 
"category": "language", - "user_invocable": true, + "user_invocable": false, "version": "1.0.0", "pairs_with": [ "golang-general-engineer", @@ -1666,6 +1730,29 @@ "agent": "golang-general-engineer", "model": "opus" }, + "security-threat-model": { + "file": "skills/security-threat-model/SKILL.md", + "description": "Phase-gated security threat model skill.", + "triggers": [ + "threat model", + "security audit", + "supply chain scan", + "deny list", + "learning db sanitize", + "security posture", + "injection scan", + "surface scan", + "audit hooks", + "audit skills" + ], + "category": "security", + "user_invocable": false, + "version": "1.0.0", + "pairs_with": [ + "python-general-engineer" + ], + "model": "opus" + }, "seo-optimizer": { "file": "skills/seo-optimizer/SKILL.md", "description": "Analyze and optimize blog post SEO: keywords, titles, meta descriptions, headers, and internal linking.", @@ -1716,6 +1803,27 @@ "user_invocable": false, "version": "2.0.0" }, + "skill-creator": { + "file": "skills/skill-creator/SKILL.md", + "description": "Create new skills and iteratively improve them through eval-driven validation.", + "triggers": [ + "create skill", + "new skill", + "skill template", + "skill design", + "test skill", + "improve skill", + "optimize description", + "skill eval" + ], + "category": "meta", + "user_invocable": false, + "version": "2.0.0", + "pairs_with": [ + "agent-evaluation", + "verification-before-completion" + ] + }, "skill-eval": { "file": "skills/skill-eval/SKILL.md", "description": "Evaluate and improve skills through measured testing.", @@ -1729,13 +1837,13 @@ "skill quality" ], "category": "meta", - "user_invocable": true, + "user_invocable": false, "version": "1.0.0", "pairs_with": [ "agent-evaluation", "verification-before-completion" ], - "agent": "skill-creator-engineer" + "agent": "skill-creator" }, "socratic-debugging": { "file": "skills/socratic-debugging/SKILL.md", @@ -1788,6 +1896,30 @@ "user_invocable": false, "version": "2.0.0" 
}, + "swift-concurrency": { + "file": "skills/swift-concurrency/SKILL.md", + "description": "Swift structured concurrency with async/await, Actor, Task, and Sendable patterns", + "triggers": [ + "swift-concurrency", + "swift", + "concurrency" + ], + "user_invocable": false, + "version": "1.0.0", + "agent": "general-purpose" + }, + "swift-testing": { + "file": "skills/swift-testing/SKILL.md", + "description": "Swift testing patterns with XCTest, Swift Testing framework, and async test patterns", + "triggers": [ + "swift-testing", + "swift", + "testing" + ], + "user_invocable": false, + "version": "1.0.0", + "agent": "general-purpose" + }, "systematic-code-review": { "file": "skills/systematic-code-review/SKILL.md", "description": "4-phase code review methodology: UNDERSTAND changes, VERIFY claims against code, ASSESS security/performance/architecture risks, DOCUMENT findings with severity classification.", @@ -1984,12 +2116,12 @@ "assemble clips", "video editing" ], - "category": "media", - "user_invocable": true, + "user_invocable": false, "version": "1.0.0", "pairs_with": [ "typescript-frontend-engineer" ], + "agent": "python-general-engineer", "model": "sonnet" }, "vitest-runner": { @@ -2037,7 +2169,7 @@ "strict verification" ], "category": "process", - "user_invocable": true, + "user_invocable": false, "version": "2.0.0" }, "wordpress-live-validation": { @@ -2099,7 +2231,7 @@ }, "x-api": { "file": "skills/x-api/SKILL.md", - "description": "Post tweets, build threads, upload media, and read timelines via the X API with OAuth 1.0a/2.0 and a mandatory confirm gate before any write.", + "description": "Post tweets, build threads, upload media, and read timelines via the X API.", "triggers": [ "post to X", "post tweet", @@ -2116,12 +2248,14 @@ "publish to twitter" ], "category": "content-publishing", - "user_invocable": true, + "user_invocable": false, "version": "1.0.0", "pairs_with": [ "content-engine", "crosspost" - ] + ], + "agent": "python-general-engineer", + 
"model": "sonnet" } } } diff --git a/skills/agent-evaluation/SKILL.md b/skills/agent-evaluation/SKILL.md index 761fde1..2534f9e 100644 --- a/skills/agent-evaluation/SKILL.md +++ b/skills/agent-evaluation/SKILL.md @@ -60,7 +60,7 @@ This skill operates as an operator for agent/skill quality assurance, configurin - Batch-evaluate entire collections with summary statistics ## What This Skill CANNOT Do -- Modify or fix agents/skills (use skill-creator-engineer instead) +- Modify or fix agents/skills (use skill-creator instead) - Evaluate external repositories or non-agent/skill files - Replace human judgment on content accuracy or domain correctness - Skip rubric categories — all must be scored diff --git a/skills/do/references/routing-tables.md b/skills/do/references/routing-tables.md index 83866ae..9063415 100644 --- a/skills/do/references/routing-tables.md +++ b/skills/do/references/routing-tables.md @@ -33,7 +33,7 @@ Route to these agents based on the user's task domain. Each entry describes what | **project-coordinator-engineer** | User needs multi-agent coordination for a large project: spawning parallel agents, tracking cross-cutting tasks, or orchestrating a multi-phase effort. | | **pipeline-orchestrator-engineer** | User wants to create a new pipeline, scaffold a new structured workflow, or compose pipeline phases. | | **hook-development-engineer** | User wants to create or modify Python hooks for Claude Code's event-driven system (SessionStart, PostToolUse, etc.). | -| **skill-creator-engineer** | User wants to create or improve a Claude Code skill, workflow automation, or agent configuration. | +| **skill-creator** | User wants to create or improve a Claude Code skill, workflow automation, or agent configuration. | | **system-upgrade-engineer** | User wants to upgrade the agent/skill/hook ecosystem after a Claude model update or system-wide change. 
| | **technical-documentation-engineer** | User needs technical documentation created, maintained, or validated — API docs, READMEs, architecture guides. | | **technical-journalist-writer** | User needs professional technical writing in a journalism style — articles, posts, or content with a specific authored voice. | @@ -46,7 +46,7 @@ Route to these agents based on the user's task domain. Each entry describes what | **github-profile-rules-engineer** | User wants to extract coding conventions, programming rules, or style guidelines from a GitHub profile's repositories. | | **react-portfolio-engineer** | User is building a React portfolio or gallery website, typically for creative professionals. | | **nextjs-ecommerce-engineer** | User is building an e-commerce site with Next.js: product pages, cart, checkout flows. | -| **toolkit-governance-engineer** | User wants to maintain or modify the toolkit's own internal structure: editing skill/agent files, updating routing tables, managing ADRs, regenerating INDEX.json, or enforcing frontmatter compliance. NOT: creating brand-new agents (use skill-creator-engineer), writing application code (domain agents), or reviewing external PRs (reviewer agents). | +| **toolkit-governance-engineer** | User wants to maintain or modify the toolkit's own internal structure: editing skill/agent files, updating routing tables, managing ADRs, regenerating INDEX.json, or enforcing frontmatter compliance. NOT: creating brand-new agents (use skill-creator), writing application code (domain agents), or reviewing external PRs (reviewer agents). | --- @@ -229,10 +229,10 @@ All pipelines live in the `pipelines/` directory (synced to `~/.claude/skills/` |----------|--------------------|--------| | **pipeline-scaffolder** (pipeline-orchestrator-engineer) | User wants to create a new pipeline, scaffold a new structured workflow from a spec. 
| LOAD → SCAFFOLD → INTEGRATE → REPORT | | **system-upgrade** (system-upgrade-engineer) | User wants to upgrade the Claude Code toolkit after a model update, apply system-wide changes, or roll out agent improvements. NOT: upgrading a specific library dependency in user code. | CHANGELOG → AUDIT → PLAN → IMPLEMENT → VALIDATE → DEPLOY | -| **skill-creation-pipeline** (skill-creator-engineer) | User wants to create a new skill with formal quality gates, phase structure, and integration. | DISCOVER → DESIGN → SCAFFOLD → VALIDATE → INTEGRATE | +| **skill-creation-pipeline** (skill-creator) | User wants to create a new skill with formal quality gates, phase structure, and integration. | DISCOVER → DESIGN → SCAFFOLD → VALIDATE → INTEGRATE | | **hook-development-pipeline** (hook-development-engineer) | User wants to create a new hook with formal spec, performance testing, and registration. | SPEC → IMPLEMENT → TEST → REGISTER → DOCUMENT | | **research-pipeline** (research-coordinator-engineer) | User wants formal research with saved artifacts, multiple sources, and a synthesized deliverable. NOT: a quick lookup or single-source check. | SCOPE → GATHER → SYNTHESIZE → VALIDATE → DELIVER | -| **agent-upgrade** (skill-creator-engineer) | User wants to audit and improve a specific agent to bring it up to current template standards. | AUDIT → DIFF → PLAN → IMPLEMENT → RE-EVALUATE | +| **agent-upgrade** (skill-creator) | User wants to audit and improve a specific agent to bring it up to current template standards. | AUDIT → DIFF → PLAN → IMPLEMENT → RE-EVALUATE | | **research-to-article** | User wants to research a topic and turn the findings into a written article. | RESEARCH → COMPILE → GROUND → GENERATE → VALIDATE → REFINE → OUTPUT | | **doc-pipeline** | User wants to generate documentation for a codebase, create a README, or write technical docs from scratch. 
| RESEARCH → OUTLINE → GENERATE → VERIFY → OUTPUT | | **pr-pipeline** | User wants the full structured PR workflow with review gates. | CLASSIFY → STAGE → REVIEW → COMMIT → PUSH → CREATE → VERIFY → CLEANUP | @@ -376,10 +376,10 @@ Invoked via the roast skill or directly: | "research then write article" | research-to-article pipeline | Research-backed content creation | | "create a pipeline for X" | pipeline-orchestrator-engineer + pipeline-scaffolder | Pipeline creation | | "upgrade system for new Claude version" | system-upgrade-engineer + system-upgrade | System-wide upgrade | -| "create skill with quality gates" | skill-creator-engineer + skill-creation-pipeline | Formal skill creation | +| "create skill with quality gates" | skill-creator + skill-creation-pipeline | Formal skill creation | | "create hook (formal, with perf test)" | hook-development-engineer + hook-development-pipeline | Formal hook creation | | "research with saved artifacts" | research-coordinator-engineer + research-pipeline | Formal research pipeline | -| "upgrade this specific agent" | skill-creator-engineer + agent-upgrade | Single agent improvement | +| "upgrade this specific agent" | skill-creator + agent-upgrade | Single agent improvement | | "create a 3D scene" | typescript-frontend-engineer + threejs-builder | Frontend domain, 3D task | | "generate image with Python" | python-general-engineer + gemini-image-generator | Python domain, image generation | | "extract coding rules from github user X" | github-profile-rules-engineer + github-profile-rules | Profile analysis | diff --git a/skills/routing-table-updater/SKILL.md b/skills/routing-table-updater/SKILL.md index f55973e..0a51a76 100644 --- a/skills/routing-table-updater/SKILL.md +++ b/skills/routing-table-updater/SKILL.md @@ -3,7 +3,7 @@ name: routing-table-updater description: | Maintain /do routing tables and command references when skills or agents are added, modified, or removed. 
Use when skill/agent metadata changes, - after skill-creator-engineer or agent-creator-engineer runs, or when + after skill-creator or agent-creator-engineer runs, or when routing tables need synchronization. Use for "update routes", "sync routing", "routing table", or "refresh /do". Do NOT use for creating new skills/agents, modifying skill logic, or manual /do table edits. @@ -262,7 +262,7 @@ If gate fails: ### Example 1: New Skill Created -User creates `skills/api-integration-helper/SKILL.md` via skill-creator-engineer: +User creates `skills/api-integration-helper/SKILL.md` via skill-creator: ```yaml --- @@ -375,7 +375,7 @@ The scaffolder provides a component list (from the Pipeline Spec): | Scan | All skills/* and agents/* | Only listed components | | Conflict check | Against existing entries | Against existing AND within batch | | OUTPUT | One entry at a time | N entries in one pass | -| Invoked by | skill-creator-engineer, agent-creator-engineer | pipeline-scaffolder Phase 4 | +| Invoked by | skill-creator, agent-creator-engineer | pipeline-scaffolder Phase 4 | --- @@ -383,7 +383,7 @@ The scaffolder provides a component list (from the Pipeline Spec): This skill is typically invoked after other creation skills complete: -- **After skill-creator-engineer**: New skill created, routing tables need updated entry +- **After skill-creator**: New skill created, routing tables need updated entry - **After agent-creator-engineer**: New agent created, domain routing needs expansion - **After skill/agent modification**: Description or trigger changes require routing refresh - **During repository maintenance**: Periodic sync to catch manual drift diff --git a/skills/shared-patterns/pipeline-architecture.md b/skills/shared-patterns/pipeline-architecture.md index f182873..2f71830 100644 --- a/skills/shared-patterns/pipeline-architecture.md +++ b/skills/shared-patterns/pipeline-architecture.md @@ -215,7 +215,7 @@ Define Requirements Add to Routing ``` -**Skill**: 
`agent-creator-engineer` or `skill-creator-engineer` +**Skill**: `agent-creator-engineer` or `skill-creator` --- diff --git a/skills/skill-creator/SKILL.md b/skills/skill-creator/SKILL.md new file mode 100644 index 0000000..7bab347 --- /dev/null +++ b/skills/skill-creator/SKILL.md @@ -0,0 +1,390 @@ +--- +name: skill-creator +description: | + Create new skills and iteratively improve them through eval-driven validation. + Draft a skill, test it against real prompts, review the outputs, improve based + on measured results, repeat. Use when creating new skills, improving existing + skills, testing skill quality, or optimizing descriptions for triggering accuracy. + Use for "create skill", "new skill", "test skill", "improve skill", "optimize + description", "skill eval", "turn this into a skill". Do NOT use for agent + creation (use agent-creator-engineer) or hook development (use + hook-development-engineer). +version: 2.0.0 +routing: + triggers: + - create skill + - new skill + - skill template + - skill design + - test skill + - improve skill + - optimize description + - skill eval + pairs_with: + - agent-evaluation + - verification-before-completion + complexity: Complex + category: meta +allowed-tools: + - Read + - Edit + - Write + - Bash + - Glob + - Grep + - Agent +--- + +# Skill Creator + +Create skills and iteratively improve them through measurement. + +The process: + +- Decide what the skill should do and how it should work +- Write a draft of the skill +- Create test prompts and run claude-with-the-skill on them +- Evaluate the results — both with agent reviewers and optionally human review +- Improve the skill based on what the evaluation reveals +- Repeat until the skill demonstrably helps + +Figure out where the user is in this process and help them progress. If they say +"I want to make a skill for X", help narrow scope, write a draft, write test cases, +and run the eval loop. If they already have a draft, go straight to testing. 
+ +--- + +## Creating a skill + +### Capture intent + +Start by understanding what the user wants. The current conversation might already +contain a workflow worth capturing ("turn this into a skill"). If so, extract: + +1. What should this skill enable Claude to do? +2. When should this skill trigger? (what user phrases, what contexts) +3. What is the expected output? +4. Are the outputs objectively verifiable (code, data transforms, structured files) + or subjective (writing quality, design aesthetics)? Objectively verifiable outputs + benefit from test cases. Subjective outputs are better evaluated by human review. + +### Research + +Check for existing skills that overlap — run `grep -r "trigger-keyword" skills/*/SKILL.md` +to avoid duplicating what already exists. If a similar skill exists, offer to improve +it rather than create a new one. + +Read the repository CLAUDE.md before writing anything. Project conventions override +default patterns. + +### Write the SKILL.md + +Based on the user interview, create the skill directory and write the SKILL.md. + +**Skill structure:** + +``` +skill-name/ +├── SKILL.md # Required — the workflow +├── scripts/ # Deterministic CLI tools the skill invokes +├── agents/ # Subagent prompts used only by this skill +├── references/ # Deep context loaded on demand +└── assets/ # Templates, viewers, static files +``` + +**Frontmatter** — name, description, routing metadata: + +```yaml +--- +name: skill-slug-name +description: | + [What it does — 1-2 sentences]. Use when [trigger conditions]. + Use for "[phrase 1]", "[phrase 2]". Do NOT use for [exclusions]. +version: 1.0.0 +routing: + triggers: + - keyword1 + - keyword2 + pairs_with: + - related-skill + complexity: Simple | Medium | Complex + category: language | infrastructure | review | meta | content +allowed-tools: + - Read + - Write + - Bash +--- +``` + +The description is the primary triggering mechanism. 
Claude tends to undertrigger +skills — not activating them when they would help. Combat this by being explicit +about trigger contexts. Include "Use for" with concrete phrases users would say. + +**Body** — workflow first, then context: + +1. Brief overview (2-3 sentences: what this does and how) +2. Instructions / workflow phases (the actual methodology) +3. Reference material (commands, guides, schemas) +4. Error handling (cause/solution pairs for common failures) +5. References to bundled files + +Constraints belong inline within the workflow step where they apply, not in a +separate section. If a constraint matters during Phase 2, put it in Phase 2 — +not in a preamble the model reads 200 lines before it reaches Phase 2. + +Explain the reasoning behind constraints rather than issuing bare imperatives. +"Run with `-race` because race conditions are silent until production" is more +effective than "ALWAYS run with -race" because the model can generalize the +reasoning to situations the skill author didn't anticipate. + +**Progressive disclosure** — keep SKILL.md navigable: +- Summary in frontmatter, workflow in body, deep reference in `references/` +- If SKILL.md exceeds ~500 lines, move detailed catalogs to reference files +- Reference files clearly linked from SKILL.md with guidance on when to read them + +### Bundled scripts + +Extract deterministic, repeatable operations into `scripts/*.py` CLI tools with +argparse interfaces. Scripts save tokens (the model doesn't reinvent the wheel +each invocation), ensure consistency across runs, and can be tested independently. + +Pattern: `scripts/` for deterministic ops, SKILL.md for LLM-orchestrated workflow. + +### Bundled agents + +For skills that spawn subagents with specialized roles, bundle agent prompts in +`agents/`. These are not registered in the routing system — they are internal to +the skill's workflow. 
+ +| Scenario | Approach | +|----------|----------| +| Agent used only by this skill | Bundle in `agents/` | +| Agent shared across skills | Keep in repo `agents/` directory | +| Agent needs routing metadata | Keep in repo `agents/` directory | + +--- + +## Testing the skill + +This is the core of the eval loop. Do not stop after writing — test the skill +against real prompts and measure whether it actually helps. + +### Create test prompts + +Write 2-3 realistic test prompts — the kind of thing a real user would say. Rich, +detailed, specific. Not abstract one-liners. + +Bad: `"Format this data"` +Good: `"I have a CSV in ~/downloads/q4-sales.csv with revenue in column C and costs +in column D. Add a profit margin percentage column and highlight rows where margin +is below 10%."` + +Share prompts with the user for review before running them. + +Save test cases to `evals/evals.json` in the workspace (not in the skill directory — +eval data is ephemeral): + +```json +{ + "skill_name": "example-skill", + "evals": [ + { + "id": 1, + "name": "descriptive-name", + "prompt": "The realistic user prompt", + "assertions": [] + } + ] +} +``` + +### Run test prompts + +For each test case, spawn two subagents in the same turn — one with the skill +loaded, one without (baseline). Launch everything at once so it finishes together. + +**With-skill run:** Tell the subagent to read the skill's SKILL.md first, then +execute the task. Save outputs to the workspace. + +**Baseline run:** Same prompt, no skill loaded. Save to a separate directory. + +Organize results by iteration: + +``` +skill-workspace/ +├── evals/evals.json +├── iteration-1/ +│ ├── eval-descriptive-name/ +│ │ ├── with_skill/outputs/ +│ │ ├── without_skill/outputs/ +│ │ └── grading.json +│ └── benchmark.json +└── iteration-2/ + └── ... +``` + +### Evaluate results + +Evaluation has three tiers, applied in order: + +**Tier 1: Deterministic checks** — run automatically where applicable: +- Does the code compile? 
(`go build`, `tsc --noEmit`, `python -m py_compile`) +- Do tests pass? (`go test -race`, `pytest`, `vitest`) +- Does the linter pass? (`go vet`, `ruff`, `biome`) + +**Tier 2: Agent blind review** — dispatch using `agents/comparator.md`: +- Comparator receives both outputs labeled "Output 1" / "Output 2" +- It does NOT know which is the skill version +- Scores on relevant dimensions, picks a winner with reasoning +- Save results to `blind_comparison.json` + +**Tier 3: Human review (optional)** — generate the comparison viewer: +```bash +python3 scripts/eval_compare.py path/to/workspace +open path/to/workspace/compare_report.html +``` + +The viewer shows outputs side by side with blind labels, agent review panels, +deterministic check results, winner picker, feedback textarea, and a +skip-to-results option. Human reviews are optional — agent reviews are sufficient +for iteration. + +### Draft assertions + +While test runs are in progress, draft quantitative assertions for objective +criteria. Good assertions are discriminating — they fail when the skill doesn't +help and pass when it does. Non-discriminating assertions ("file exists") provide +false confidence. + +Run the grader (`agents/grader.md`) to evaluate assertions against outputs: +- PASS requires genuine substance, not surface compliance +- The grader also critiques the assertions themselves — flagging ones that would + pass regardless of skill quality + +Aggregate results with `scripts/aggregate_benchmark.py` to get pass rates, +timing, and token usage with mean/stddev across runs. + +--- + +## Improving the skill + +This is the iterative heart of the process. + +**Generalize from feedback.** Skills will be used across many prompts, not just +test cases. If a fix only helps the test case but wouldn't generalize, it's +overfitting. Try different approaches rather than fiddly adjustments. + +**Keep instructions lean.** Read the execution transcripts, not just the final +outputs. 
If the skill causes the model to waste time on unproductive work, remove +those instructions. Instructions that don't pull their weight hurt more than they +help — they consume attention budget without producing value. + +**Explain the reasoning.** Motivation-based instructions generalize better than +rigid imperatives. "Prefer table-driven tests because they make adding cases +trivial and the input-output relationship explicit" works better than "MUST use +table-driven tests" because the model understands when the pattern applies and +when it doesn't. + +**Extract repeated work.** Read the transcripts from test runs. If all subagents +independently wrote similar helper scripts or took the same multi-step approach, +bundle that script in `scripts/`. One shared implementation beats N independent +reinventions. + +### The iteration loop + +1. Apply improvements to the skill +2. Rerun all test cases into `iteration-/`, including baselines +3. Generate the comparison viewer with `--previous-workspace` pointing at the + prior iteration +4. Review — agent or human +5. Repeat until results plateau or the user is satisfied + +Stop iterating when: +- Feedback is empty (outputs look good) +- Pass rates aren't improving between iterations +- The user says they're satisfied + +--- + +## Description optimization + +The description field determines whether Claude activates the skill. After the +skill is working well, optimize the description for triggering accuracy. + +Generate 20 eval queries — 10 that should trigger, 10 that should not. The +should-not queries are the most important: they should be near-misses from +adjacent domains, not obviously irrelevant queries. 
+ +Run the optimization loop: +```bash +python3 scripts/optimize_description.py \ + --skill-path path/to/skill \ + --eval-set evals/trigger-eval.json \ + --max-iterations 5 +``` + +This splits queries 60/40 train/test, evaluates the current description (3 runs +per query for reliability), proposes improvements based on failures, and selects +the best description by test-set score to avoid overfitting. + +--- + +## Bundled agents + +The `agents/` directory contains prompts for specialized subagents used by this +skill. Read them when you need to spawn the relevant subagent. + +- `agents/grader.md` — Evaluate assertions against outputs with cited evidence +- `agents/comparator.md` — Blind A/B comparison of two outputs +- `agents/analyzer.md` — Post-hoc analysis of why one version beat another + +--- + +## Bundled scripts + +- `scripts/run_eval.py` — Execute a skill against a test prompt via `claude -p` +- `scripts/aggregate_benchmark.py` — Compute pass rate statistics across runs +- `scripts/optimize_description.py` — Train/test description optimization loop +- `scripts/package_results.py` — Consolidate iteration artifacts into a report +- `scripts/eval_compare.py` — Generate blind comparison HTML viewer + +--- + +## Reference files + +- `references/artifact-schemas.md` — JSON schemas for eval artifacts (evals.json, + grading.json, benchmark.json, comparison.json, timing.json, metrics.json) +- `references/skill-template.md` — Complete SKILL.md template with all sections +- `references/complexity-tiers.md` — Skill examples by complexity tier +- `references/workflow-patterns.md` — Reusable phase structures and gate patterns +- `references/error-catalog.md` — Common skill creation errors with solutions + +--- + +## Error handling + +### Skill doesn't trigger when it should +Cause: Description is too vague or missing trigger phrases +Solution: Add explicit "Use for" phrases matching what users actually say. +Test with `scripts/optimize_description.py`. 
+ +### Test run produces empty output +Cause: The `claude -p` subprocess didn't load the skill, or the skill path is wrong +Solution: Verify the skill directory contains SKILL.md (exact case). Check +the `--skill-path` argument points to the directory, not the file. + +### Grading results show all-pass regardless of skill +Cause: Assertions are non-discriminating (e.g., "file exists") +Solution: Write assertions that test behavior, not structure. The grader's +eval critique section flags these — read it. + +### Iteration loop doesn't converge +Cause: Changes are overfitting to test cases rather than improving the skill +Solution: Expand the test set with more diverse prompts. Focus improvements +on understanding WHY outputs differ, not on patching specific failures. + +### Description optimization overfits to train set +Cause: Test set is too small or train/test queries are too similar +Solution: Ensure should-trigger and should-not-trigger queries are realistic +near-misses, not obviously different. The 60/40 split guards against this, +but only if the queries are well-designed. diff --git a/skills/skill-creator/agents/analyzer.md b/skills/skill-creator/agents/analyzer.md new file mode 100644 index 0000000..e4665e2 --- /dev/null +++ b/skills/skill-creator/agents/analyzer.md @@ -0,0 +1,109 @@ +# Analyzer Agent + +You are a post-hoc analysis agent for eval pipelines. You operate after unblinding — +you know which output was produced with the skill and which without. Your role is to +produce actionable improvement suggestions based on the full picture of evidence. + +## Modes + +You operate in one of two modes, specified in the input: + +### Mode: comparison + +**When to use**: After a single eval's blind comparison has been completed and unblinded. 
+ +**Inputs**: +- `comparison_json`: Path to comparison.json from the comparator agent +- `skill_a_path` or `skill_b_path`: Which label (A or B) corresponds to with_skill +- `with_skill_transcript`: Path to with_skill/transcript.md +- `without_skill_transcript`: Path to without_skill/transcript.md +- `with_skill_outputs_dir`: Path to with_skill/outputs/ +- `without_skill_outputs_dir`: Path to without_skill/outputs/ + +**Analysis tasks**: +1. Identify WHY the winner won (specific criterion advantages) +2. Identify WHERE the loser can improve (specific, actionable suggestions) +3. If the skill won: identify what instructions produced the winning behavior so they + can be strengthened +4. If the skill lost: identify which instructions caused harm or were simply ineffective +5. Check if the skill caused unnecessary work in the transcript (unproductive loops, + redundant steps, ignored instructions) + +### Mode: benchmark + +**When to use**: After an iteration's full benchmark has been computed. + +**Inputs**: +- `benchmark_json`: Path to iteration's benchmark.json +- `all_grading_jsons`: List of paths to all grading.json files in the iteration +- `all_comparison_jsons`: List of paths to all comparison.json files in the iteration + +**Analysis tasks**: +1. Identify patterns across all evals (which assertion types consistently fail?) +2. Flag non-discriminating assertions that appeared in multiple evals +3. Identify high-variance evals (comparator score spreads, grading inconsistencies) +4. Surface metric outliers (evals with unusually high token cost or duration) +5. 
Produce 3-5 prioritized improvement suggestions for the skill + +## Output + +Produce a JSON file named `analysis.json` with exactly this structure: + +```json +{ + "mode": "comparison | benchmark", + "timestamp": "ISO 8601 timestamp", + "skill_won": "boolean — true if with_skill won (comparison mode) or pass_rate delta > 0 (benchmark mode)", + "findings": [ + { + "category": "winner_factors | loser_improvements | instruction_analysis | transcript_waste | assertion_quality | metric_outliers | variance", + "priority": "high | medium | low", + "finding": "specific observation with cited evidence", + "actionable_suggestion": "concrete change to make to the skill or eval" + } + ], + "improvements_for_skill": [ + { + "target": "which section/instruction to change", + "current_behavior": "what the skill currently does", + "desired_behavior": "what it should do instead", + "rationale": "why this change would improve results", + "generalization_risk": "low | medium | high — risk of overfitting this change to test cases" + } + ], + "improvements_for_evals": [ + { + "assertion": "the assertion to improve or replace", + "problem": "why this assertion is weak or non-discriminating", + "replacement": "suggested replacement assertion text" + } + ], + "benchmark_summary": { + "with_skill_pass_rate_mean": "float — benchmark mode only", + "without_skill_pass_rate_mean": "float — benchmark mode only", + "delta": "float — with_skill minus without_skill", + "comparator_win_rate": "float — fraction of evals where skill won", + "top_failure_categories": ["list of assertion categories that frequently fail"] + }, + "analyzer_notes": "optional string — observations that do not fit the structured fields" +} +``` + +The schema is a contract. Field names, types, and nesting must match exactly. The +`package_results.py` script reads `findings`, `improvements_for_skill`, and +`benchmark_summary` by field name. + +## Behavior Rules + +- Every finding must cite specific evidence. 
"The skill seems to help" is not a finding. + "The skill produced a YAML frontmatter with 7 required fields; without-skill produced + 3" is a finding. +- `generalization_risk` is mandatory for every improvement_for_skill entry. High risk + means the change would only help on the specific test case and would likely confuse + the model on unseen prompts. +- In benchmark mode, if `delta` is near zero (within 0.05), investigate whether the + assertions are non-discriminating before concluding the skill is ineffective. +- Prioritize `improvements_for_skill` by expected impact. High priority means the change + would plausibly improve pass rate by more than 10 percentage points. +- Do not suggest adding more instructions as a default. If the skill is not helping, + removing instructions (reducing noise) is often more effective than adding them. diff --git a/skills/skill-creator/agents/comparator.md b/skills/skill-creator/agents/comparator.md new file mode 100644 index 0000000..9ff7361 --- /dev/null +++ b/skills/skill-creator/agents/comparator.md @@ -0,0 +1,118 @@ +# Comparator Agent + +You are a blind A/B comparison agent for eval pipelines. You receive two sets of execution +outputs labeled A and B. You do not know which skill produced which output. Your role is +to produce a scored comparison without knowing the answer — this prevents confirmation bias +from affecting the verdict. + +## Inputs + +You will receive: +- `output_a_dir`: Path to the first execution's outputs directory +- `output_b_dir`: Path to the second execution's outputs directory +- `transcript_a`: Path to the first execution's transcript.md +- `transcript_b`: Path to the second execution's transcript.md +- `assertions` (optional): Assertion list from evals.json, as a secondary signal + +## Process + +### Step 1: Read all artifacts without bias + +Read all output files and transcripts for both A and B. Do not attempt to determine which +is "with skill" and which is "without skill." 
Treat them as two independent submissions
+competing on quality.
+
+### Step 2: Generate a rubric
+
+Before scoring, write a rubric with 4-6 evaluation criteria. Criteria must be grounded in
+the actual content — do not use generic criteria like "quality" without defining what
+quality means for this specific type of output.
+
+Example criteria for a SKILL.md creation eval:
+- Frontmatter completeness (required fields present and populated)
+- Phase structure quality (phases have clear inputs, outputs, and gate conditions)
+- Instruction specificity (steps are actionable, not aspirational)
+- Error handling coverage (top errors covered with cause/solution pairs)
+- Anti-rationalization presence and quality
+
+### Step 3: Score both outputs
+
+For each criterion, assign a score from 1 to 5:
+- 5: Excellent — exceeds expectations with specific, substantive content
+- 4: Good — meets expectations consistently
+- 3: Adequate — meets minimum requirements with some gaps
+- 2: Weak — below expectations, significant gaps
+- 1: Poor — fails to meet basic requirements
+
+Score A and B independently for each criterion. Do not adjust one score based on the
+other — each score must stand alone against the rubric.
+
+### Step 4: Check assertions (secondary signal)
+
+If assertions were provided, evaluate each output against them. This is a secondary
+signal to the rubric scores, not a replacement. A high assertion pass rate with low
+rubric scores indicates weak assertions.
+
+### Step 5: Determine winner
+
+Compute total rubric scores for A and B. The higher total is the winner. If scores are
+tied within 2 points, classify as "tie." Include the overall scores (1-10 scale, where
+10 means a perfect weighted score — rubric weights sum to 1.0 — on every criterion).
+ +## Output + +Produce a JSON file named `comparison.json` with exactly this structure: + +```json +{ + "eval_id": "string — the eval name/identifier", + "timestamp": "ISO 8601 timestamp", + "rubric": [ + { + "criterion": "criterion name", + "description": "what this criterion measures", + "weight": "float — relative importance, all weights sum to 1.0" + } + ], + "scores": { + "A": { + "criteria_scores": [ + { + "criterion": "criterion name", + "score": "integer 1-5", + "rationale": "specific evidence for this score" + } + ], + "total_score": "float — weighted sum of criteria scores normalized to 1-10", + "assertion_pass_rate": "float 0.0–1.0 — if assertions provided, else null" + }, + "B": { + "criteria_scores": [], + "total_score": "float", + "assertion_pass_rate": "float or null" + } + }, + "winner": "A | B | tie", + "winner_margin": "float — difference in total scores", + "reasoning": "string — 2-4 sentences explaining the decision, referencing specific criterion differences", + "confidence": "high | medium | low", + "comparator_notes": "optional — observations about the comparison that don't fit the rubric" +} +``` + +The schema is a contract. Field names, types, and nesting must match exactly. The +`analyzer.md` agent reads `winner`, `total_score`, and `reasoning` by field name. + +## Behavior Rules + +- Never attempt to determine which output is "with skill" or "without skill." You will + be unblinded by the analyzer agent after this step. +- Never use "quality" or "better" as criterion names without defining what they mean for + this specific content type. +- Each `rationale` must cite specific content from the output, not general impressions. + "A's error handling section covers 5 specific errors with cause/solution pairs" is + acceptable. "A's error handling seems more thorough" is not. +- If both outputs are identical or near-identical, set `winner` to "tie" and note this + in `comparator_notes`. 
+- If one output is clearly empty or failed, score all criteria 1 and set winner to + the non-empty output. Note the failure in `comparator_notes`. diff --git a/skills/skill-creator/agents/grader.md b/skills/skill-creator/agents/grader.md new file mode 100644 index 0000000..9665022 --- /dev/null +++ b/skills/skill-creator/agents/grader.md @@ -0,0 +1,105 @@ +# Grader Agent + +You are a grading agent for eval pipelines. Your role is to evaluate whether execution +outputs satisfy a set of assertions, producing cited evidence for every verdict. + +## Inputs + +You will receive: +- `expectations`: A list of assertion strings from `evals.json` +- `transcript_path`: Path to `transcript.md` from the execution run +- `outputs_dir`: Path to the `outputs/` directory from the execution run + +## Process + +### Step 1: Read all artifacts + +Read `transcript.md` in full. Read all files in `outputs/`. Build a complete picture of +what the execution produced before evaluating any assertion. + +### Step 2: Evaluate each assertion + +For each assertion in `expectations`: + +1. Determine whether it is PASS or FAIL based on the artifacts. +2. Cite specific evidence: quote the relevant section of transcript.md or the relevant + content from an output file. Do not assert PASS without pointing to the specific + content that satisfies the assertion. +3. If the assertion is ambiguous (could be interpreted in multiple ways), apply the + stricter interpretation and note the ambiguity. + +**Key rule**: PASS requires genuine substance, not surface compliance. Examples: +- Correct filename with wrong content → FAIL +- Correct structure with placeholder values → FAIL +- Required field present but empty → FAIL +- Required section heading present but no content under it → FAIL + +### Step 3: Extract and verify implicit claims + +After evaluating explicit assertions, scan the outputs for implicit claims — statements +or artifacts that appear to assert something specific. 
Verify 2-3 of the most significant +implicit claims. These are not scored against the pass rate but are included in the report +for the analyzer agent. + +### Step 4: Critique eval quality + +Identify non-discriminating assertions: assertions that would PASS regardless of whether +the skill was loaded. Flag these clearly because they inflate pass rates without measuring +skill-specific behavior. + +Examples of non-discriminating assertions: +- "Output is in English" +- "No error messages present" +- "Response is non-empty" +- "File exists" (if any execution would produce a file) + +## Output + +Produce a JSON file named `grading.json` with exactly this structure: + +```json +{ + "eval_id": "string — the eval name/identifier", + "configuration": "with_skill | without_skill", + "timestamp": "ISO 8601 timestamp", + "assertions": [ + { + "assertion": "the assertion text", + "verdict": "PASS | FAIL", + "evidence": "quoted excerpt or file reference supporting the verdict", + "confidence": "high | medium | low" + } + ], + "pass_count": "integer — number of PASS verdicts", + "fail_count": "integer — number of FAIL verdicts", + "pass_rate": "float 0.0–1.0", + "implicit_claims": [ + { + "claim": "the implicit claim identified", + "verdict": "VERIFIED | UNVERIFIED | CONTRADICTED", + "evidence": "supporting or contradicting evidence" + } + ], + "eval_critique": { + "non_discriminating_assertions": ["list of assertion texts flagged as non-discriminating"], + "recommendation": "string — suggested assertion improvements" + }, + "grader_notes": "optional string — any observations about unusual execution patterns" +} +``` + +The schema is a contract. Field names, types, and nesting must match exactly. The +`aggregate_benchmark.py` script parses `pass_rate`, `pass_count`, and `fail_count` +by name. + +## Behavior Rules + +- Never infer PASS from ambiguous evidence. When in doubt, FAIL with a note explaining + what evidence would be needed for PASS. +- Never skip an assertion. 
Every assertion in `expectations` must appear in `assertions`. +- The `evidence` field must contain a direct quote or file path reference. "Looks correct" + is not evidence. +- If `outputs/` is empty, all file-existence assertions are FAIL. Note this prominently + in `grader_notes`. +- If `transcript.md` contains error messages from the execution, note them in + `grader_notes` even if no assertion directly tests for errors. diff --git a/skills/skill-creator/assets/eval_viewer.html b/skills/skill-creator/assets/eval_viewer.html new file mode 100644 index 0000000..636532b --- /dev/null +++ b/skills/skill-creator/assets/eval_viewer.html @@ -0,0 +1,1189 @@ + + + + + +Blind A/B Code Review + + + + +
+
+

Blind A/B Code Review

+ Blind Mode +
+
+ + + +
+
+
+
+
+ + + + diff --git a/agents/skill-creator-engineer/references/anti-patterns.md b/skills/skill-creator/references/anti-patterns.md similarity index 100% rename from agents/skill-creator-engineer/references/anti-patterns.md rename to skills/skill-creator/references/anti-patterns.md diff --git a/skills/skill-creator/references/artifact-schemas.md b/skills/skill-creator/references/artifact-schemas.md new file mode 100644 index 0000000..98eac8b --- /dev/null +++ b/skills/skill-creator/references/artifact-schemas.md @@ -0,0 +1,302 @@ +# Artifact Schemas + +JSON contracts for all eval pipeline artifacts. Field names, types, and nesting are +contracts between producers and consumers. Downstream scripts parse by field name — +do not rename fields without updating all consumers. + +## Producer/Consumer Map + +| Schema | Producer | Consumer(s) | +|--------|----------|-------------| +| `evals.json` | Skill creator (human) | `run_eval.py`, grader agent | +| `grading.json` | grader agent | `aggregate_benchmark.py`, analyzer agent | +| `benchmark.json` | `aggregate_benchmark.py` | analyzer agent, `package_results.py` | +| `comparison.json` | comparator agent | analyzer agent | +| `analysis.json` | analyzer agent | `package_results.py`, skill creator | +| `timing.json` | `run_eval.py` | `aggregate_benchmark.py` | +| `metrics.json` | `run_eval.py` | grader agent | +| `eval_metadata.json` | `run_eval.py` | grader agent, comparator agent | +| `trigger-eval.json` | Skill creator (human) | `optimize_description.py` | + +--- + +## evals.json + +Location: `skill-workspace/evals/evals.json` + +```json +[ + { + "eval_id": "string — unique identifier for this eval, used as directory name", + "prompt": "string — the test prompt text passed to claude -p", + "assertions": [ + "string — one assertion per entry, binary and evidence-checkable" + ], + "metadata": { + "description": "string — optional human-readable description of what this eval tests", + "tags": ["optional array of tags for 
filtering"] + } + } +] +``` + +**Rules**: +- `eval_id` must be a valid directory name (kebab-case recommended) +- Each assertion must be binary: it either passes or fails, with evidence +- Assertions should test skill-specific behavior, not generic output properties + +--- + +## grading.json + +Location: `skill-workspace/iteration-N/{eval-id}/grading.json` + +```json +{ + "eval_id": "string — matches the eval_id from evals.json", + "configuration": "string — 'with_skill' or 'without_skill'", + "timestamp": "string — ISO 8601 timestamp", + "assertions": [ + { + "assertion": "string — the assertion text from evals.json", + "verdict": "string — 'PASS' or 'FAIL'", + "evidence": "string — quoted excerpt or file reference", + "confidence": "string — 'high', 'medium', or 'low'" + } + ], + "pass_count": "integer", + "fail_count": "integer", + "pass_rate": "float — range 0.0 to 1.0", + "implicit_claims": [ + { + "claim": "string", + "verdict": "string — 'VERIFIED', 'UNVERIFIED', or 'CONTRADICTED'", + "evidence": "string" + } + ], + "eval_critique": { + "non_discriminating_assertions": ["array of assertion text strings"], + "recommendation": "string" + }, + "grader_notes": "string or null" +} +``` + +**Required fields for `aggregate_benchmark.py`**: `pass_rate`, `pass_count`, `fail_count` + +--- + +## benchmark.json + +Location: `skill-workspace/iteration-N/benchmark.json` + +```json +{ + "skill_name": "string", + "workspace": "string — absolute path", + "timestamp": "string — ISO 8601", + "eval_count": "integer", + "with_skill": { + "pass_rate": { + "mean": "float", + "stddev": "float", + "min": "float", + "max": "float" + }, + "tokens": { + "mean": "float", + "stddev": "float" + }, + "time_seconds": { + "mean": "float", + "stddev": "float" + } + }, + "without_skill": { + "pass_rate": { "mean": "float", "stddev": "float", "min": "float", "max": "float" }, + "tokens": { "mean": "float", "stddev": "float" }, + "time_seconds": { "mean": "float", "stddev": "float" } + }, + 
"delta": { + "pass_rate": "float or null — with_skill minus without_skill", + "description": "string — human-readable interpretation" + }, + "eval_results": [ + { + "eval_id": "string", + "configuration": "string", + "pass_rate": "float", + "pass_count": "integer", + "fail_count": "integer", + "without_skill_pass_rate": "float or null", + "with_skill_tokens": "integer", + "with_skill_duration": "float", + "without_skill_tokens": "integer", + "without_skill_duration": "float" + } + ] +} +``` + +**Required fields for analyzer agent**: `with_skill.pass_rate.mean`, +`without_skill.pass_rate.mean`, `delta.pass_rate` + +--- + +## comparison.json + +Location: `skill-workspace/iteration-N/{eval-id}/comparison.json` + +```json +{ + "eval_id": "string", + "timestamp": "string — ISO 8601", + "rubric": [ + { + "criterion": "string", + "description": "string", + "weight": "float — all weights sum to 1.0" + } + ], + "scores": { + "A": { + "criteria_scores": [ + { + "criterion": "string — must match rubric criterion name", + "score": "integer — 1 to 5", + "rationale": "string — specific evidence" + } + ], + "total_score": "float — weighted sum normalized to 1-10 scale", + "assertion_pass_rate": "float or null" + }, + "B": { + "criteria_scores": [], + "total_score": "float", + "assertion_pass_rate": "float or null" + } + }, + "winner": "string — 'A', 'B', or 'tie'", + "winner_margin": "float — absolute difference in total_score", + "reasoning": "string — 2-4 sentences with specific criterion references", + "confidence": "string — 'high', 'medium', or 'low'", + "comparator_notes": "string or null" +} +``` + +**Required fields for analyzer agent**: `winner`, `scores.A.total_score`, +`scores.B.total_score`, `reasoning` + +--- + +## analysis.json + +Location: `skill-workspace/iteration-N/analysis.json` + +```json +{ + "mode": "string — 'comparison' or 'benchmark'", + "timestamp": "string — ISO 8601", + "skill_won": "boolean", + "findings": [ + { + "category": "string — one of: 
winner_factors, loser_improvements, instruction_analysis, transcript_waste, assertion_quality, metric_outliers, variance", + "priority": "string — 'high', 'medium', or 'low'", + "finding": "string — specific observation with evidence", + "actionable_suggestion": "string — concrete change" + } + ], + "improvements_for_skill": [ + { + "target": "string — which section/instruction", + "current_behavior": "string", + "desired_behavior": "string", + "rationale": "string", + "generalization_risk": "string — 'low', 'medium', or 'high'" + } + ], + "improvements_for_evals": [ + { + "assertion": "string", + "problem": "string", + "replacement": "string" + } + ], + "benchmark_summary": { + "with_skill_pass_rate_mean": "float or null", + "without_skill_pass_rate_mean": "float or null", + "delta": "float or null", + "comparator_win_rate": "float or null", + "top_failure_categories": ["array of strings"] + }, + "analyzer_notes": "string or null" +} +``` + +**Required fields for `package_results.py`**: `findings`, `improvements_for_skill`, +`benchmark_summary.delta` + +--- + +## timing.json + +Location: `skill-workspace/iteration-N/{eval-id}/{configuration}/timing.json` + +```json +{ + "duration_seconds": "float — wall-clock seconds for the claude -p run", + "tokens_total": "integer — sum of input_tokens and output_tokens", + "timed_out": "boolean — true if the run hit the timeout limit" +} +``` + +Produced by: `run_eval.py` +Consumed by: `aggregate_benchmark.py` + +--- + +## metrics.json + +Location: `skill-workspace/iteration-N/{eval-id}/{configuration}/metrics.json` + +```json +{ + "tool_usage": { + "Read": "integer — number of Read tool calls", + "Write": "integer", + "Edit": "integer", + "Bash": "integer", + "Grep": "integer", + "Glob": "integer", + "Agent": "integer" + }, + "total_tool_calls": "integer — sum of all tool_usage values" +} +``` + +Produced by: `run_eval.py` +Consumed by: grader agent (for context about execution behavior) + +--- + +## trigger-eval.json + 
+Location: `skill-workspace/evals/trigger-eval.json` + +```json +[ + { + "query": "string — user prompt to test triggering", + "should_trigger": "boolean — true if the skill should activate for this query" + } +] +``` + +**Conventions**: +- Include 10 should_trigger: true entries (vary directness and phrasing) +- Include 10 should_trigger: false entries (near-miss adjacent domains) +- Use realistic prompts with context, not abstract one-liners +- Test edge cases where the skill competes with adjacent skills + +Produced by: Skill creator (human) +Consumed by: `optimize_description.py` diff --git a/agents/skill-creator-engineer/references/complexity-examples.md b/skills/skill-creator/references/complexity-tiers.md similarity index 100% rename from agents/skill-creator-engineer/references/complexity-examples.md rename to skills/skill-creator/references/complexity-tiers.md diff --git a/agents/skill-creator-engineer/references/error-catalog.md b/skills/skill-creator/references/error-catalog.md similarity index 100% rename from agents/skill-creator-engineer/references/error-catalog.md rename to skills/skill-creator/references/error-catalog.md diff --git a/agents/skill-creator-engineer/references/skill-template.md b/skills/skill-creator/references/skill-template.md similarity index 100% rename from agents/skill-creator-engineer/references/skill-template.md rename to skills/skill-creator/references/skill-template.md diff --git a/agents/skill-creator-engineer/references/workflow-patterns.md b/skills/skill-creator/references/workflow-patterns.md similarity index 100% rename from agents/skill-creator-engineer/references/workflow-patterns.md rename to skills/skill-creator/references/workflow-patterns.md diff --git a/skills/skill-creator/scripts/aggregate_benchmark.py b/skills/skill-creator/scripts/aggregate_benchmark.py new file mode 100644 index 0000000..e4795e9 --- /dev/null +++ b/skills/skill-creator/scripts/aggregate_benchmark.py @@ -0,0 +1,259 @@ +#!/usr/bin/env python3 
+""" +aggregate_benchmark.py — Compute statistics across eval runs in an iteration workspace. + +Reads grading.json from each eval directory. Computes mean, standard deviation, and +delta (with_skill minus without_skill) for pass_rate, time_seconds, and tokens. + +Produces: + {workspace}/benchmark.json Machine-readable statistics + {workspace}/benchmark.md Human-readable summary +""" + +import argparse +import json +import math +import sys +from datetime import datetime, timezone +from pathlib import Path + + +def build_parser() -> argparse.ArgumentParser: + p = argparse.ArgumentParser( + description="Aggregate benchmark statistics from eval grading results", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=__doc__, + ) + p.add_argument("workspace", help="Path to iteration workspace directory (e.g. skill-workspace/iteration-1)") + p.add_argument("--skill-name", required=True, help="Name of the skill being benchmarked") + return p + + +def find_eval_dirs(workspace: Path) -> list[Path]: + """Find all eval directories that contain grading.json.""" + eval_dirs = [] + for child in sorted(workspace.iterdir()): + if child.is_dir() and (child / "grading.json").exists(): + eval_dirs.append(child) + return eval_dirs + + +def load_grading(eval_dir: Path) -> dict | None: + """Load grading.json from an eval directory.""" + grading_path = eval_dir / "grading.json" + try: + return json.loads(grading_path.read_text()) + except (json.JSONDecodeError, OSError) as e: + print(f"WARNING: Could not load {grading_path}: {e}", file=sys.stderr) + return None + + +def load_timing(eval_dir: Path, configuration: str) -> dict: + """Load timing.json for a given configuration (with_skill or without_skill).""" + timing_path = eval_dir / configuration / "timing.json" + try: + return json.loads(timing_path.read_text()) + except (json.JSONDecodeError, OSError): + return {"duration_seconds": 0.0, "tokens_total": 0} + + +def mean(values: list[float]) -> float: + if not values: + return 
0.0 + return sum(values) / len(values) + + +def stddev(values: list[float]) -> float: + if len(values) < 2: + return 0.0 + m = mean(values) + variance = sum((v - m) ** 2 for v in values) / (len(values) - 1) + return math.sqrt(variance) + + +def aggregate(workspace: Path, skill_name: str) -> dict: + eval_dirs = find_eval_dirs(workspace) + if not eval_dirs: + print(f"ERROR: No eval directories with grading.json found in {workspace}", file=sys.stderr) + sys.exit(1) + + with_skill_pass_rates = [] + without_skill_pass_rates = [] + with_skill_tokens = [] + without_skill_tokens = [] + with_skill_durations = [] + without_skill_durations = [] + + eval_results = [] + + for eval_dir in eval_dirs: + grading = load_grading(eval_dir) + if grading is None: + continue + + config = grading.get("configuration") + if config not in ("with_skill", "without_skill"): + print(f"WARNING: {eval_dir.name}/grading.json missing 'configuration' field, skipping", file=sys.stderr) + continue + pass_rate = float(grading.get("pass_rate", 0.0)) + + with_timing = load_timing(eval_dir, "with_skill") + without_timing = load_timing(eval_dir, "without_skill") + + if config == "with_skill": + with_skill_pass_rates.append(pass_rate) + with_skill_tokens.append(float(with_timing.get("tokens_total", 0))) + with_skill_durations.append(float(with_timing.get("duration_seconds", 0))) + else: + without_skill_pass_rates.append(pass_rate) + without_skill_tokens.append(float(without_timing.get("tokens_total", 0))) + without_skill_durations.append(float(without_timing.get("duration_seconds", 0))) + + # Try to load the paired configuration if this is with_skill grading + # (eval dirs may contain only one grading.json; paired data comes from timing files) + without_pass_rate = None + paired_grading_path = eval_dir / "grading_without.json" + if paired_grading_path.exists(): + try: + paired = json.loads(paired_grading_path.read_text()) + without_pass_rate = float(paired.get("pass_rate", 0.0)) + except 
(json.JSONDecodeError, OSError): + pass + + eval_results.append( + { + "eval_id": eval_dir.name, + "configuration": config, + "pass_rate": pass_rate, + "pass_count": grading.get("pass_count", 0), + "fail_count": grading.get("fail_count", 0), + "without_skill_pass_rate": without_pass_rate, + "with_skill_tokens": with_timing.get("tokens_total", 0), + "with_skill_duration": with_timing.get("duration_seconds", 0), + "without_skill_tokens": without_timing.get("tokens_total", 0), + "without_skill_duration": without_timing.get("duration_seconds", 0), + } + ) + + # Compute aggregates + ws_mean = mean(with_skill_pass_rates) + wos_mean = mean(without_skill_pass_rates) + delta = ws_mean - wos_mean if with_skill_pass_rates and without_skill_pass_rates else None + + benchmark = { + "skill_name": skill_name, + "workspace": str(workspace), + "timestamp": datetime.now(timezone.utc).isoformat(), + "eval_count": len(eval_results), + "with_skill": { + "pass_rate": { + "mean": round(ws_mean, 4), + "stddev": round(stddev(with_skill_pass_rates), 4), + "min": round(min(with_skill_pass_rates), 4) if with_skill_pass_rates else 0.0, + "max": round(max(with_skill_pass_rates), 4) if with_skill_pass_rates else 0.0, + }, + "tokens": { + "mean": round(mean(with_skill_tokens), 1), + "stddev": round(stddev(with_skill_tokens), 1), + }, + "time_seconds": { + "mean": round(mean(with_skill_durations), 2), + "stddev": round(stddev(with_skill_durations), 2), + }, + }, + "without_skill": { + "pass_rate": { + "mean": round(wos_mean, 4), + "stddev": round(stddev(without_skill_pass_rates), 4), + "min": round(min(without_skill_pass_rates), 4) if without_skill_pass_rates else 0.0, + "max": round(max(without_skill_pass_rates), 4) if without_skill_pass_rates else 0.0, + }, + "tokens": { + "mean": round(mean(without_skill_tokens), 1), + "stddev": round(stddev(without_skill_tokens), 1), + }, + "time_seconds": { + "mean": round(mean(without_skill_durations), 2), + "stddev": round(stddev(without_skill_durations), 
2), + }, + }, + "delta": { + "pass_rate": round(delta, 4) if delta is not None else None, + "description": "with_skill minus without_skill; positive means skill helps", + }, + "eval_results": eval_results, + } + + return benchmark + + +def render_markdown(benchmark: dict) -> str: + ws = benchmark["with_skill"] + wos = benchmark["without_skill"] + delta = benchmark["delta"]["pass_rate"] + delta_str = f"+{delta:.1%}" if delta is not None and delta > 0 else (f"{delta:.1%}" if delta is not None else "N/A") + + lines = [ + f"# Benchmark: {benchmark['skill_name']}\n", + f"**Generated**: {benchmark['timestamp']} \n", + f"**Evals**: {benchmark['eval_count']}\n\n", + "## Pass Rate\n\n", + "| Configuration | Mean | StdDev | Min | Max |\n", + "|--------------|------|--------|-----|-----|\n", + f"| with_skill | {ws['pass_rate']['mean']:.1%} | {ws['pass_rate']['stddev']:.1%} | {ws['pass_rate']['min']:.1%} | {ws['pass_rate']['max']:.1%} |\n", + f"| without_skill | {wos['pass_rate']['mean']:.1%} | {wos['pass_rate']['stddev']:.1%} | {wos['pass_rate']['min']:.1%} | {wos['pass_rate']['max']:.1%} |\n", + f"| **delta** | **{delta_str}** | — | — | — |\n\n", + "## Token Usage\n\n", + "| Configuration | Mean Tokens | StdDev |\n", + "|--------------|-------------|--------|\n", + f"| with_skill | {ws['tokens']['mean']:.0f} | {ws['tokens']['stddev']:.0f} |\n", + f"| without_skill | {wos['tokens']['mean']:.0f} | {wos['tokens']['stddev']:.0f} |\n\n", + "## Duration (seconds)\n\n", + "| Configuration | Mean | StdDev |\n", + "|--------------|------|--------|\n", + f"| with_skill | {ws['time_seconds']['mean']:.1f}s | {ws['time_seconds']['stddev']:.1f}s |\n", + f"| without_skill | {wos['time_seconds']['mean']:.1f}s | {wos['time_seconds']['stddev']:.1f}s |\n\n", + "## Per-Eval Results\n\n", + "| Eval | Config | Pass Rate | Pass | Fail |\n", + "|------|--------|-----------|------|------|\n", + ] + + for er in benchmark["eval_results"]: + lines.append( + f"| {er['eval_id']} | {er['configuration']} | 
{er['pass_rate']:.1%} | {er['pass_count']} | {er['fail_count']} |\n" + ) + + return "".join(lines) + + +def main() -> int: + parser = build_parser() + args = parser.parse_args() + workspace = Path(args.workspace).resolve() + + if not workspace.exists(): + print(f"ERROR: Workspace directory does not exist: {workspace}", file=sys.stderr) + return 1 + + benchmark = aggregate(workspace, args.skill_name) + + benchmark_json = workspace / "benchmark.json" + benchmark_json.write_text(json.dumps(benchmark, indent=2)) + print(f"Written: {benchmark_json}", file=sys.stderr) + + benchmark_md = workspace / "benchmark.md" + benchmark_md.write_text(render_markdown(benchmark)) + print(f"Written: {benchmark_md}", file=sys.stderr) + + delta = benchmark["delta"]["pass_rate"] + if delta is not None: + sign = "+" if delta > 0 else "" + print(f"Pass rate delta: {sign}{delta:.1%} (with_skill vs without_skill)") + else: + print("Pass rate delta: N/A (missing one or both configurations)") + + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/skills/skill-creator/scripts/eval_compare.py b/skills/skill-creator/scripts/eval_compare.py new file mode 100644 index 0000000..58f1849 --- /dev/null +++ b/skills/skill-creator/scripts/eval_compare.py @@ -0,0 +1,274 @@ +#!/usr/bin/env python3 +"""Generate blind A/B comparison HTML from eval workspace data. + +Scans workspace, collects output files, runs deterministic checks +(go build, go vet, go test -race where applicable), loads grading +and blind comparison data, injects into compare.html template. +Outputs compare_report.html. 
+ +Usage: + python3 eval_compare.py + python3 eval_compare.py --help +""" + +import argparse +import json +import os +import subprocess +import sys +from pathlib import Path + + +def build_parser() -> argparse.ArgumentParser: + p = argparse.ArgumentParser( + description="Generate blind A/B comparison HTML from eval workspace data.", + epilog="Workspace must contain compare.html template and iteration-*/ directories.", + ) + p.add_argument("workspace", type=Path, help="Path to the eval workspace directory") + p.add_argument( + "--output", type=Path, default=None, help="Output HTML path (default: /compare_report.html)" + ) + return p + + +def load_json_safe(path: Path) -> dict | None: + """Load JSON from a file, returning None on any error.""" + try: + return json.loads(path.read_text(encoding="utf-8")) + except (json.JSONDecodeError, OSError, UnicodeDecodeError) as e: + print(f"WARNING: Could not load {path}: {e}", file=sys.stderr) + return None + + +def read_text_safe(path: Path) -> str: + """Read text file with encoding fallback.""" + try: + return path.read_text(encoding="utf-8", errors="replace") + except OSError: + return "" + + +def find_files(outputs_dir: Path) -> list[str]: + """List all files relative to outputs dir.""" + files = [] + for root, _, filenames in os.walk(outputs_dir): + for f in filenames: + rel = os.path.relpath(Path(root, f), outputs_dir) + files.append(rel) + return sorted(files) + + +def count_go_lines(outputs_dir: Path) -> int: + """Count total lines across all .go files.""" + total = 0 + for root, _, filenames in os.walk(outputs_dir): + for f in filenames: + if f.endswith(".go"): + content = read_text_safe(Path(root, f)) + total += len(content.splitlines()) + return total + + +def get_code_preview(outputs_dir: Path, max_lines: int = 60) -> str: + """Get preview of main .go file content.""" + for root, _, filenames in os.walk(outputs_dir): + for f in sorted(filenames): + if f.endswith(".go") and not f.endswith("_test.go"): + content = 
read_text_safe(Path(root, f)) + lines = content.splitlines() + if len(lines) > max_lines: + return "\n".join(lines[:max_lines]) + f"\n... ({len(lines) - max_lines} more lines)" + return content + return "" + + +def run_go_check(outputs_dir: Path, cmd: list[str], timeout: int = 30) -> str: + """Run a go command in the outputs directory, return 'yes'/'no'/'clean'/'issues'.""" + # Find the go module root (prefer directory with go.mod) + mod_root = None + go_dirs = [] + for root, _, files in os.walk(outputs_dir): + if "go.mod" in files: + mod_root = root + break + if any(f.endswith(".go") for f in files): + go_dirs.append(root) + + target = mod_root or (go_dirs[0] if go_dirs else None) + if target is None: + return "no_go_files" + + try: + result = subprocess.run(cmd, cwd=target, capture_output=True, text=True, timeout=timeout) + if result.returncode == 0: + return "yes" if "build" in cmd or "test" in cmd else "clean" + return "no" if "build" in cmd or "test" in cmd else "issues" + except (subprocess.TimeoutExpired, FileNotFoundError): + return "skip" + + +def load_grading(variant_dir: Path) -> dict | None: + """Load and normalize grading.json.""" + path = variant_dir / "grading.json" + if not path.exists(): + return None + raw = load_json_safe(path) + if raw is None: + return None + exps = raw.get("expectations", raw.get("assertions", [])) + normalized = [] + for e in exps: + text = e.get("text", e.get("assertion", "?")) + is_pass = e.get("passed") is True or e.get("verdict", "") == "PASS" + evidence = e.get("evidence", "") + normalized.append({"text": text, "passed": is_pass, "evidence": evidence}) + passed = sum(1 for n in normalized if n["passed"]) + tl = raw.get("pass_count") + if tl is not None: + passed = tl + total = len(normalized) + return { + "expectations": normalized, + "summary": { + "passed": passed, + "failed": total - passed, + "total": total, + "pass_rate": round(passed / total, 3) if total > 0 else 0, + }, + } + + +def build_variant_data(variant_dir: 
Path) -> dict: + """Build data dict for one variant.""" + outputs = variant_dir / "outputs" + if not outputs.exists(): + return {} + files = find_files(outputs) + return { + "lines": count_go_lines(outputs), + "files": files, + "fileCount": len(files), + "code_preview": get_code_preview(outputs), + "compiles": run_go_check(outputs, ["go", "build", "./..."]), + "tests_pass": run_go_check(outputs, ["go", "test", "-race", "-count=1", "./..."]), + "govet": run_go_check(outputs, ["go", "vet", "./..."]), + "grading": load_grading(variant_dir), + } + + +def find_iteration_dirs(workspace: Path) -> list[Path]: + """Find all iteration-N directories, sorted by number.""" + dirs = sorted(workspace.glob("iteration-*")) + return [d for d in dirs if d.is_dir()] + + +def build_data(workspace: Path) -> dict: + """Build full comparison data.""" + evals_path = workspace / "evals" / "evals.json" + evals_meta = {} + evals_raw = None + if evals_path.exists(): + evals_raw = load_json_safe(evals_path) + if evals_raw: + for ev in evals_raw.get("evals", []): + evals_meta[ev.get("name", ev.get("id", ""))] = ev + + evals_data = [] + benchmark = [] + + # Use the latest iteration directory (or iteration-1 as fallback) + iterations = find_iteration_dirs(workspace) + if not iterations: + return { + "evals": [], + "benchmark": [], + "variantAName": "Variant A", + "variantBName": "Variant B", + "variantCName": "Variant C", + } + + iteration = iterations[-1] # Latest iteration + + for eval_dir in sorted(iteration.iterdir()): + if not eval_dir.is_dir(): + continue + name = eval_dir.name + a_data = build_variant_data(eval_dir / "variant-A") + b_data = build_variant_data(eval_dir / "variant-B") + c_data = build_variant_data(eval_dir / "variant-C") + + prompt = evals_meta.get(name, {}).get("prompt", "") + + # Load blind comparisons if available + blind = ( + load_json_safe(eval_dir / "blind_comparison.json") + if (eval_dir / "blind_comparison.json").exists() + else None + ) + blind_bc = ( + 
load_json_safe(eval_dir / "blind_comparison_bc.json") + if (eval_dir / "blind_comparison_bc.json").exists() + else None + ) + + eval_entry = { + "name": name, + "prompt": prompt, + "variantA": a_data, + "variantB": b_data, + "blind_comparison": blind, + "blind_comparison_bc": blind_bc, + } + if c_data: + eval_entry["variantC"] = c_data + evals_data.append(eval_entry) + + a_rate = a_data.get("grading", {}).get("summary", {}).get("pass_rate", 0) if a_data.get("grading") else 0 + b_rate = b_data.get("grading", {}).get("summary", {}).get("pass_rate", 0) if b_data.get("grading") else 0 + c_rate = c_data.get("grading", {}).get("summary", {}).get("pass_rate", 0) if c_data.get("grading") else 0 + bm = {"name": name, "aRate": a_rate, "bRate": b_rate} + if c_data: + bm["cRate"] = c_rate + benchmark.append(bm) + + variants = evals_raw.get("variants", {}) if evals_raw else {} + + return { + "evals": evals_data, + "benchmark": benchmark, + "variantAName": variants.get("A", {}).get("name", "Variant A"), + "variantBName": variants.get("B", {}).get("name", "Variant B"), + "variantCName": variants.get("C", {}).get("name", "Variant C"), + } + + +def main() -> int: + parser = build_parser() + args = parser.parse_args() + + workspace = args.workspace.resolve() + template = workspace / "compare.html" + output = (args.output or workspace / "compare_report.html").resolve() + + if not template.exists(): + print(f"Error: {template} not found", file=sys.stderr) + return 1 + + data = build_data(workspace) + html = read_text_safe(template).replace("__DATA_PLACEHOLDER__", json.dumps(data, indent=2)) + output.write_text(html, encoding="utf-8") + + print(f"Report: {output}") + print(f"Evals: {len(data['evals'])}") + for ev in data["evals"]: + a = ev.get("variantA", {}) + b = ev.get("variantB", {}) + print( + f" {ev['name']}: A={a.get('lines', 0)}L/{a.get('compiles', '?')} B={b.get('lines', 0)}L/{b.get('compiles', '?')}" + ) + + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff 
--git a/skills/skill-creator/scripts/optimize_description.py b/skills/skill-creator/scripts/optimize_description.py new file mode 100644 index 0000000..ae36723 --- /dev/null +++ b/skills/skill-creator/scripts/optimize_description.py @@ -0,0 +1,339 @@ +#!/usr/bin/env python3 +""" +optimize_description.py — Train/test description optimization for skill triggering accuracy. + +Splits eval queries 60/40 train/test. Evaluates the current description (3 runs per query +for variance reduction). Proposes improvements based on train set failures. Re-evaluates +on both sets. Selects best description by test score to prevent overfitting. + +Eval set format (trigger-eval.json): + [ + {"query": "user prompt text", "should_trigger": true}, + {"query": "adjacent domain prompt", "should_trigger": false} + ] +""" + +import argparse +import json +import math +import random +import shutil +import subprocess +import sys +import tempfile +from datetime import datetime, timezone +from pathlib import Path + +RUNS_PER_QUERY = 3 # Runs per query for variance reduction + + +def build_parser() -> argparse.ArgumentParser: + p = argparse.ArgumentParser( + description="Optimize skill description for triggering accuracy", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=__doc__, + ) + p.add_argument("--skill-path", required=True, help="Path to the skill directory (contains SKILL.md)") + p.add_argument("--eval-set", required=True, help="Path to trigger-eval.json") + p.add_argument("--model", default="claude-sonnet-4-6", help="Claude model to use (default: claude-sonnet-4-6)") + p.add_argument("--max-iterations", type=int, default=5, help="Maximum optimization iterations (default: 5)") + p.add_argument("--seed", type=int, default=42, help="Random seed for train/test split (default: 42)") + p.add_argument("--dry-run", action="store_true", help="Show split and current accuracy without optimizing") + return p + + +def check_claude_available() -> None: + if shutil.which("claude") is 
None: + print( + "ERROR: 'claude' CLI not found in PATH.\nInstall with: npm install -g @anthropic-ai/claude-code", + file=sys.stderr, + ) + sys.exit(1) + + +def load_eval_set(eval_path: Path) -> list[dict]: + try: + data = json.loads(eval_path.read_text()) + except (json.JSONDecodeError, OSError) as e: + print(f"ERROR: Could not load eval set {eval_path}: {e}", file=sys.stderr) + sys.exit(1) + + if not isinstance(data, list) or not data: + print("ERROR: eval set must be a non-empty JSON array", file=sys.stderr) + sys.exit(1) + + for entry in data: + if "query" not in entry or "should_trigger" not in entry: + print( + f"ERROR: each eval entry must have 'query' and 'should_trigger' fields. Got: {entry}", + file=sys.stderr, + ) + sys.exit(1) + + return data + + +def split_eval_set(eval_set: list[dict], seed: int) -> tuple[list[dict], list[dict]]: + """60/40 train/test split, stratified by should_trigger.""" + rng = random.Random(seed) + should_trigger = [e for e in eval_set if e["should_trigger"]] + should_not = [e for e in eval_set if not e["should_trigger"]] + + def split(items: list) -> tuple[list, list]: + shuffled = items[:] + rng.shuffle(shuffled) + split_point = math.ceil(len(shuffled) * 0.6) + return shuffled[:split_point], shuffled[split_point:] + + train_trigger, test_trigger = split(should_trigger) + train_no, test_no = split(should_not) + return train_trigger + train_no, test_trigger + test_no + + +def test_trigger(query: str, description: str, model: str) -> bool: + """ + Ask claude whether it would use the skill given this description and query. + Returns True if the skill should trigger, False otherwise. + """ + prompt = ( + f"You are a routing system. A skill has this description:\n\n" + f"---\n{description}\n---\n\n" + f'A user says: "{query}"\n\n' + f"Answer with exactly one word: YES if you would use this skill for this request, " + f"NO if you would not. Do not explain." 
+ ) + + try: + result = subprocess.run( + ["claude", "-p", prompt, "--model", model], + capture_output=True, + text=True, + timeout=30, + ) + if result.returncode != 0: + print( + f"WARNING: claude exited {result.returncode}: {result.stderr[:200]}", + file=sys.stderr, + ) + return False + answer = result.stdout.strip().upper() + return answer.startswith("YES") + except subprocess.TimeoutExpired: + return False + + +def evaluate_description(description: str, eval_queries: list[dict], model: str, runs: int = RUNS_PER_QUERY) -> float: + """Evaluate a description against a set of queries. Returns accuracy (0.0-1.0).""" + if not eval_queries: + return 0.0 + + correct = 0 + total = 0 + + for entry in eval_queries: + query = entry["query"] + should_trigger = entry["should_trigger"] + + # Run multiple times for variance reduction; take majority vote + votes = [test_trigger(query, description, model) for _ in range(runs)] + majority_triggered = votes.count(True) > runs / 2 + + if majority_triggered == should_trigger: + correct += 1 + total += 1 + + return correct / total if total > 0 else 0.0 + + +def propose_improvement( + description: str, + train_queries: list[dict], + failures: list[dict], + model: str, +) -> str: + """ + Ask claude to propose a better description based on train set failures. + Returns the proposed description text. + """ + failure_examples = "\n".join( + f'- Query: "{f["query"]}" | Expected: {"TRIGGER" if f["should_trigger"] else "NO TRIGGER"} | Got: {"TRIGGER" if f["triggered"] else "NO TRIGGER"}' + for f in failures[:10] # Cap at 10 examples to avoid prompt bloat + ) + + prompt = ( + f"You are improving a Claude skill's description to optimize triggering accuracy.\n\n" + f"Current description:\n---\n{description}\n---\n\n" + f"Failures on training set:\n{failure_examples}\n\n" + f"Requirements:\n" + f"1. Keep the description under 1024 characters\n" + f"2. No XML angle brackets (< or >)\n" + f"3. Maintain the What+When formula: 'Do X when Y. 
Use for [triggers]. Do NOT use for [anti-triggers].'\n" + f"4. Do not overfit to the failure examples — improve the description generally\n" + f"5. Return ONLY the new description text, no explanation\n\n" + f"New description:" + ) + + try: + result = subprocess.run( + ["claude", "-p", prompt, "--model", model], + capture_output=True, + text=True, + timeout=60, + ) + if result.returncode != 0: + print( + f"WARNING: claude exited {result.returncode} proposing improvement: {result.stderr[:200]}", + file=sys.stderr, + ) + return description + proposed = result.stdout.strip() + if not proposed: + print("WARNING: claude returned empty improvement. Keeping current.", file=sys.stderr) + return description + if len(proposed) > 1024: + print(f"WARNING: Proposed description exceeds 1024 chars ({len(proposed)}). Truncating.", file=sys.stderr) + proposed = proposed[:1020] + "..." + return proposed + except subprocess.TimeoutExpired: + print("WARNING: Timeout proposing description improvement. Keeping current.", file=sys.stderr) + return description + + +def identify_failures(description: str, queries: list[dict], model: str) -> list[dict]: + """Return list of queries where the description produced incorrect routing.""" + failures = [] + for entry in queries: + query = entry["query"] + should_trigger = entry["should_trigger"] + votes = [test_trigger(query, description, model) for _ in range(RUNS_PER_QUERY)] + triggered = votes.count(True) > RUNS_PER_QUERY / 2 + if triggered != should_trigger: + failures.append({**entry, "triggered": triggered}) + return failures + + +def optimize(args: argparse.Namespace) -> int: + check_claude_available() + + skill_path = Path(args.skill_path).resolve() + skill_md = skill_path / "SKILL.md" + eval_path = Path(args.eval_set).resolve() + + if not skill_md.exists(): + print(f"ERROR: SKILL.md not found at {skill_md}", file=sys.stderr) + return 1 + + eval_set = load_eval_set(eval_path) + train_set, test_set = split_eval_set(eval_set, seed=args.seed) 
+ + print(f"Eval set: {len(eval_set)} queries ({len(train_set)} train, {len(test_set)} test)", file=sys.stderr) + + # Extract current description from SKILL.md frontmatter + skill_text = skill_md.read_text() + description_start = skill_text.find("description: |") + if description_start == -1: + print("ERROR: Could not find 'description: |' in SKILL.md frontmatter", file=sys.stderr) + return 1 + + # Extract description block (lines until next YAML key) + lines = skill_text.split("\n") + desc_lines = [] + in_desc = False + for line in lines: + if line.strip().startswith("description: |"): + in_desc = True + continue + if in_desc: + if line and not line[0].isspace() and ":" in line: + break + desc_lines.append(line.lstrip()) + + current_description = "\n".join(desc_lines).strip() + print(f"Current description ({len(current_description)} chars)", file=sys.stderr) + + if args.dry_run: + train_acc = evaluate_description(current_description, train_set, args.model) + test_acc = evaluate_description(current_description, test_set, args.model) + print(f"Train accuracy: {train_acc:.1%}") + print(f"Test accuracy: {test_acc:.1%}") + return 0 + + # Evaluate initial accuracy + print("Evaluating initial description...", file=sys.stderr) + best_description = current_description + best_test_acc = evaluate_description(current_description, test_set, args.model) + print(f"Initial test accuracy: {best_test_acc:.1%}", file=sys.stderr) + + history = [{"iteration": 0, "description": current_description, "test_accuracy": best_test_acc}] + + for iteration in range(1, args.max_iterations + 1): + print(f"\nIteration {iteration}/{args.max_iterations}", file=sys.stderr) + + failures = identify_failures(best_description, train_set, args.model) + train_acc = 1.0 - (len(failures) / len(train_set)) if train_set else 0.0 + print(f"Train accuracy: {train_acc:.1%} ({len(failures)} failures)", file=sys.stderr) + + if not failures: + print("No failures on train set. 
Optimization complete.", file=sys.stderr) + break + + proposed = propose_improvement(best_description, train_set, failures, args.model) + proposed_test_acc = evaluate_description(proposed, test_set, args.model) + print(f"Proposed test accuracy: {proposed_test_acc:.1%}", file=sys.stderr) + + history.append( + { + "iteration": iteration, + "description": proposed, + "train_accuracy": train_acc, + "test_accuracy": proposed_test_acc, + } + ) + + if proposed_test_acc >= best_test_acc: + best_description = proposed + best_test_acc = proposed_test_acc + print(f"Accepted (test accuracy improved or held: {best_test_acc:.1%})", file=sys.stderr) + else: + print(f"Rejected (test accuracy decreased: {proposed_test_acc:.1%} < {best_test_acc:.1%})", file=sys.stderr) + + # Report results + print(f"\n=== Optimization Complete ===") + print(f"Best test accuracy: {best_test_acc:.1%}") + print(f"Iterations run: {len(history) - 1}") + + if best_description != current_description: + print(f"\nBest description ({len(best_description)} chars):\n") + print(best_description) + else: + print("\nNo improvement found. 
Current description is already optimal.") + + # Write history to optimization_history.json alongside the eval set + history_path = eval_path.parent / "optimization_history.json" + history_path.write_text( + json.dumps( + { + "skill_path": str(skill_path), + "eval_set": str(eval_path), + "model": args.model, + "timestamp": datetime.now(timezone.utc).isoformat(), + "best_test_accuracy": best_test_acc, + "best_description": best_description, + "history": history, + }, + indent=2, + ) + ) + print(f"\nHistory written: {history_path}", file=sys.stderr) + + return 0 + + +def main() -> int: + parser = build_parser() + args = parser.parse_args() + return optimize(args) + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/skills/skill-creator/scripts/package_results.py b/skills/skill-creator/scripts/package_results.py new file mode 100644 index 0000000..07ce725 --- /dev/null +++ b/skills/skill-creator/scripts/package_results.py @@ -0,0 +1,218 @@ +#!/usr/bin/env python3 +""" +package_results.py — Consolidate all iteration artifacts into a summary report. + +Reads grading.json, benchmark.json, analysis.json, and changes.md from each iteration +directory in the workspace. Produces a single summary report. 
+ +Usage: + python3 package_results.py workspace/ --format markdown + python3 package_results.py workspace/ --format json +""" + +import argparse +import json +import sys +from datetime import datetime, timezone +from pathlib import Path + + +def build_parser() -> argparse.ArgumentParser: + p = argparse.ArgumentParser( + description="Consolidate eval iteration artifacts into a summary report", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=__doc__, + ) + p.add_argument("workspace", help="Path to skill-workspace/ root directory") + p.add_argument( + "--format", choices=["markdown", "json"], default="markdown", help="Output format (default: markdown)" + ) + p.add_argument("--output", help="Output file path (default: workspace/summary.md or summary.json)") + return p + + +def find_iteration_dirs(workspace: Path) -> list[Path]: + """Find all iteration-N directories in the workspace.""" + iterations = [] + for child in sorted(workspace.iterdir()): + if child.is_dir() and child.name.startswith("iteration-"): + try: + int(child.name.split("-")[1]) + iterations.append(child) + except (IndexError, ValueError): + pass + return sorted(iterations, key=lambda p: int(p.name.split("-")[1])) + + +def load_json_safe(path: Path) -> dict | list | None: + if not path.exists(): + return None + try: + return json.loads(path.read_text()) + except (json.JSONDecodeError, OSError): + return None + + +def load_text_safe(path: Path) -> str | None: + if not path.exists(): + return None + try: + return path.read_text() + except OSError: + return None + + +def collect_iteration_data(iteration_dir: Path) -> dict: + """Collect all artifacts from a single iteration directory.""" + data = { + "iteration": iteration_dir.name, + "benchmark": load_json_safe(iteration_dir / "benchmark.json"), + "analysis": load_json_safe(iteration_dir / "analysis.json"), + "changes": load_text_safe(iteration_dir / "changes.md"), + "evals": [], + } + + # Collect per-eval data + for child in 
sorted(iteration_dir.iterdir()): + if child.is_dir(): + grading = load_json_safe(child / "grading.json") + if grading: + data["evals"].append( + { + "eval_id": child.name, + "grading": grading, + } + ) + + return data + + +def render_markdown(workspace: Path, iterations: list[dict]) -> str: + lines = [ + "# Skill Eval Summary\n", + f"**Workspace**: `{workspace}` \n", + f"**Generated**: {datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M UTC')} \n", + f"**Iterations**: {len(iterations)}\n\n", + ] + + # Progress table across iterations + if any(it["benchmark"] for it in iterations): + lines.append("## Pass Rate Progression\n\n") + lines.append("| Iteration | With Skill | Without Skill | Delta |\n") + lines.append("|-----------|-----------|---------------|-------|\n") + + for it in iterations: + b = it["benchmark"] + if b: + ws = b.get("with_skill", {}).get("pass_rate", {}).get("mean", 0) + wos = b.get("without_skill", {}).get("pass_rate", {}).get("mean", 0) + delta = b.get("delta", {}).get("pass_rate") + delta_str = ( + f"+{delta:.1%}" + if delta is not None and delta > 0 + else (f"{delta:.1%}" if delta is not None else "N/A") + ) + lines.append(f"| {it['iteration']} | {ws:.1%} | {wos:.1%} | {delta_str} |\n") + else: + lines.append(f"| {it['iteration']} | — | — | — |\n") + + lines.append("\n") + + # Per-iteration sections + for it in iterations: + lines.append(f"## {it['iteration'].replace('-', ' ').title()}\n\n") + + # Changes summary + if it["changes"]: + lines.append("### Changes Made\n\n") + # Include first 50 lines of changes.md + change_lines = it["changes"].split("\n")[:50] + lines.append("\n".join(change_lines)) + if len(it["changes"].split("\n")) > 50: + lines.append("\n_(truncated — see changes.md for full content)_") + lines.append("\n\n") + + # Eval results + if it["evals"]: + lines.append("### Eval Results\n\n") + lines.append("| Eval | Pass Rate | Pass | Fail |\n") + lines.append("|------|-----------|------|------|\n") + for ev in it["evals"]: + g = 
ev["grading"] + lines.append( + f"| {ev['eval_id']} | {g.get('pass_rate', 0):.1%} | {g.get('pass_count', 0)} | {g.get('fail_count', 0)} |\n" + ) + lines.append("\n") + + # Top findings from analysis + if it["analysis"]: + findings = it["analysis"].get("findings", []) + high_priority = [f for f in findings if f.get("priority") == "high"] + if high_priority: + lines.append("### High-Priority Findings\n\n") + for f in high_priority[:5]: + lines.append(f"- **{f.get('category', 'finding')}**: {f.get('finding', '')}\n") + if f.get("actionable_suggestion"): + lines.append(f" - Suggestion: {f['actionable_suggestion']}\n") + lines.append("\n") + + # Final recommendation + if iterations: + last = iterations[-1] + b = last.get("benchmark") + if b: + delta = b.get("delta", {}).get("pass_rate") + if delta is not None: + lines.append("## Final Assessment\n\n") + if delta > 0.05: + lines.append(f"The skill demonstrates measurable improvement: pass rate delta = +{delta:.1%}\n") + elif delta < -0.05: + lines.append(f"The skill performs below baseline: pass rate delta = {delta:.1%}\n") + lines.append( + "Consider reviewing skill instructions — they may be adding noise rather than signal.\n" + ) + else: + lines.append(f"The skill shows marginal impact: pass rate delta = {delta:.1%}\n") + lines.append("Check whether eval assertions are discriminating (test skill-specific behavior).\n") + + return "".join(lines) + + +def main() -> int: + parser = build_parser() + args = parser.parse_args() + workspace = Path(args.workspace).resolve() + + if not workspace.exists(): + print(f"ERROR: Workspace does not exist: {workspace}", file=sys.stderr) + return 1 + + iteration_dirs = find_iteration_dirs(workspace) + if not iteration_dirs: + print(f"WARNING: No iteration directories found in {workspace}", file=sys.stderr) + + iterations = [collect_iteration_data(d) for d in iteration_dirs] + + if args.format == "markdown": + content = render_markdown(workspace, iterations) + default_name = "summary.md" 
+ else: + content = json.dumps( + { + "workspace": str(workspace), + "generated": datetime.now(timezone.utc).isoformat(), + "iterations": iterations, + }, + indent=2, + ) + default_name = "summary.json" + + output_path = Path(args.output).resolve() if args.output else (workspace / default_name) + output_path.write_text(content) + print(f"Written: {output_path}") + + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/skills/skill-creator/scripts/run_eval.py b/skills/skill-creator/scripts/run_eval.py new file mode 100644 index 0000000..d83ce2a --- /dev/null +++ b/skills/skill-creator/scripts/run_eval.py @@ -0,0 +1,206 @@ +#!/usr/bin/env python3 +""" +run_eval.py — Execute a skill against a test prompt via claude -p subprocess. + +Produces in --output-dir: + outputs/ All files written during the run + transcript.md Full execution log + timing.json Token count and wall-clock duration + metrics.json Tool usage counts +""" + +import argparse +import json +import os +import shutil +import subprocess +import sys +import tempfile +import time +from pathlib import Path + + +def build_parser() -> argparse.ArgumentParser: + p = argparse.ArgumentParser( + description="Execute a skill against a test prompt via claude -p", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=__doc__, + ) + p.add_argument("--skill-path", required=True, help="Path to the skill directory (contains SKILL.md)") + p.add_argument("--prompt", required=True, help="Test prompt text to run") + p.add_argument("--output-dir", required=True, help="Directory to write outputs, transcript, timing, metrics") + p.add_argument("--model", default="claude-sonnet-4-6", help="Claude model to use (default: claude-sonnet-4-6)") + p.add_argument("--no-skill", action="store_true", help="Run without loading the skill (baseline run)") + p.add_argument("--timeout", type=int, default=300, help="Max seconds to wait for claude -p (default: 300)") + return p + + +def check_claude_available() -> 
None: + """Verify claude CLI is in PATH. Exit 1 with actionable message if not.""" + if shutil.which("claude") is None: + print( + "ERROR: 'claude' CLI not found in PATH.\n" + "Install with: npm install -g @anthropic-ai/claude-code\n" + "Verify with: which claude && claude --version", + file=sys.stderr, + ) + sys.exit(1) + + +def prepare_output_dir(output_dir: Path) -> Path: + """Create output directory structure. Returns outputs/ subdirectory.""" + output_dir.mkdir(parents=True, exist_ok=True) + outputs = output_dir / "outputs" + outputs.mkdir(exist_ok=True) + return outputs + + +def build_claude_command( + skill_path: Path, + prompt: str, + outputs_dir: Path, + model: str, + no_skill: bool, +) -> list[str]: + """Construct the claude -p command with appropriate flags.""" + cmd = [ + "claude", + "-p", + prompt, + "--model", + model, + "--output-format", + "json", + ] + + if not no_skill: + skill_md = skill_path / "SKILL.md" + if not skill_md.exists(): + print(f"ERROR: SKILL.md not found at {skill_md}", file=sys.stderr) + sys.exit(1) + cmd.extend(["--system-prompt-file", str(skill_md)]) + + # Ask claude to write outputs to the outputs directory + cmd.extend( + [ + "--working-dir", + str(outputs_dir), + ] + ) + + return cmd + + +def count_tools(transcript_text: str) -> dict: + """Count tool invocations by type from transcript text.""" + import re + + tool_pattern = re.compile(r'"tool":\s*"([^"]+)"') + counts: dict[str, int] = {} + for match in tool_pattern.finditer(transcript_text): + tool = match.group(1) + counts[tool] = counts.get(tool, 0) + 1 + return counts + + +def run_eval(args: argparse.Namespace) -> int: + check_claude_available() + + skill_path = Path(args.skill_path).resolve() + output_dir = Path(args.output_dir).resolve() + outputs_dir = prepare_output_dir(output_dir) + + cmd = build_claude_command( + skill_path=skill_path, + prompt=args.prompt, + outputs_dir=outputs_dir, + model=args.model, + no_skill=args.no_skill, + ) + + print(f"Running: {' 
'.join(cmd[:4])} ...", file=sys.stderr) + start_time = time.monotonic() + + try: + result = subprocess.run( + cmd, + capture_output=True, + text=True, + timeout=args.timeout, + cwd=str(outputs_dir), + ) + except subprocess.TimeoutExpired: + print(f"ERROR: claude -p timed out after {args.timeout}s", file=sys.stderr) + (output_dir / "transcript.md").write_text( + f"# Execution Timeout\n\nRun timed out after {args.timeout} seconds.\n" + ) + _write_timing(output_dir, duration=float(args.timeout), tokens=0, timed_out=True) + _write_metrics(output_dir, tool_counts={}) + return 1 + + duration = time.monotonic() - start_time + + # Write transcript + transcript_lines = [ + "# Execution Transcript\n", + f"**Model**: {args.model}\n", + f"**Skill loaded**: {not args.no_skill}\n", + f"**Duration**: {duration:.2f}s\n", + f"**Exit code**: {result.returncode}\n\n", + "## stdout\n\n```\n", + result.stdout or "(empty)", + "\n```\n\n## stderr\n\n```\n", + result.stderr or "(empty)", + "\n```\n", + ] + transcript_text = "".join(transcript_lines) + (output_dir / "transcript.md").write_text(transcript_text) + + # Parse token counts from JSON output if available + tokens = 0 + try: + response = json.loads(result.stdout) + usage = response.get("usage", {}) + tokens = usage.get("input_tokens", 0) + usage.get("output_tokens", 0) + except (json.JSONDecodeError, AttributeError): + pass + + _write_timing(output_dir, duration=duration, tokens=tokens, timed_out=False) + _write_metrics(output_dir, tool_counts=count_tools(result.stdout + result.stderr)) + + if result.returncode != 0: + print( + f"WARNING: claude -p exited with code {result.returncode}. Check transcript.md for details.", + file=sys.stderr, + ) + return result.returncode + + print(f"Eval complete. 
Outputs: {output_dir}", file=sys.stderr) + return 0 + + +def _write_timing(output_dir: Path, duration: float, tokens: int, timed_out: bool) -> None: + timing = { + "duration_seconds": round(duration, 3), + "tokens_total": tokens, + "timed_out": timed_out, + } + (output_dir / "timing.json").write_text(json.dumps(timing, indent=2)) + + +def _write_metrics(output_dir: Path, tool_counts: dict) -> None: + metrics = { + "tool_usage": tool_counts, + "total_tool_calls": sum(tool_counts.values()), + } + (output_dir / "metrics.json").write_text(json.dumps(metrics, indent=2)) + + +def main() -> int: + parser = build_parser() + args = parser.parse_args() + return run_eval(args) + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/skills/skill-eval/SKILL.md b/skills/skill-eval/SKILL.md index 8bd7ac4..26cbfcf 100644 --- a/skills/skill-eval/SKILL.md +++ b/skills/skill-eval/SKILL.md @@ -7,11 +7,11 @@ description: | with A/B comparisons, and validate skill structure. Use when user says "improve skill", "test skill triggers", "optimize description", "benchmark skill", "eval skill", or "skill quality". Do NOT use for creating new skills - (use skill-creator-engineer). + (use skill-creator). 
version: 1.0.0 user-invocable: false argument-hint: "" -agent: skill-creator-engineer +agent: skill-creator allowed-tools: - Read - Write @@ -66,7 +66,7 @@ This skill operates as the eval-driven improvement pipeline for Claude Code skil - Generate HTML reports for visual review ## What This Skill CANNOT Do -- Create new skills from scratch (use skill-creator-engineer) +- Create new skills from scratch (use skill-creator) - Modify skill instructions automatically (human reviews changes) - Test skills that require specific MCP servers or external services - Run evals without the `claude` CLI available diff --git a/skills/workflow-help/SKILL.md b/skills/workflow-help/SKILL.md index 9c6d00a..50729a5 100644 --- a/skills/workflow-help/SKILL.md +++ b/skills/workflow-help/SKILL.md @@ -62,7 +62,7 @@ This skill operates as an operator for workflow education and guidance, configur ## What This Skill CANNOT Do - Execute workflows (use workflow-orchestrator) - Debug code (use systematic-debugging) -- Create or modify skills (use skill-creator-engineer) +- Create or modify skills (use skill-creator) - Run tests or validate code (use verification-before-completion) - Make decisions about which approach to take for the user's actual task